source: icGREP/icgrep-devel/icgrep/re/to_utf8.cpp

Last change on this file was 6160, checked in by cameron, 7 months ago

Generic RE_Transformer

File size: 2.4 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include <re/to_utf8.h>
8#include <UCD/unicode_set.h>
9#include <UCD/UTF.h>
10#include <cc/alphabet.h>
11#include <re/re_name.h>
12#include <re/re_start.h>
13#include <re/re_end.h>
14#include <re/re_cc.h>
15#include <re/re_seq.h>
16#include <re/re_alt.h>
17#include <re/re_rep.h>
18#include <re/re_diff.h>
19#include <re/re_intersect.h>
20#include <re/re_assertion.h>
21#include <llvm/Support/Casting.h>
22#include <llvm/Support/ErrorHandling.h>
23
24using namespace llvm;
25
26namespace re {
27   
28static RE * rangeCodeUnits(codepoint_t lo, codepoint_t hi, unsigned index, const unsigned lgth){
29    const codepoint_t hunit = UTF<8>::nthCodeUnit(hi, index);
30    const codepoint_t lunit = UTF<8>::nthCodeUnit(lo, index);
31    if (index == lgth) {
32        return makeByte(lunit, hunit);
33    }
34    else if (hunit == lunit) {
35        return makeSeq({makeByte(hunit), rangeCodeUnits(lo, hi, index + 1, lgth)});
36    }
37    else {
38        const unsigned suffix_mask = (static_cast<unsigned>(1) << ((lgth - index) * 6)) - 1;
39        if ((hi & suffix_mask) != suffix_mask) {
40            const unsigned hi_floor = (~suffix_mask) & hi;
41            return makeAlt({rangeCodeUnits(hi_floor, hi, index, lgth), rangeCodeUnits(lo, hi_floor - 1, index, lgth)});
42        }
43        else if ((lo & suffix_mask) != 0) {
44            const unsigned low_ceil = lo | suffix_mask;
45            return makeAlt({rangeCodeUnits(low_ceil + 1, hi, index, lgth), rangeCodeUnits(lo, low_ceil, index, lgth)});
46        }
47        else {
48            return makeSeq({makeByte(lunit, hunit), rangeCodeUnits(lo, hi, index + 1, lgth)});
49        }
50    }
51}
52
53static RE * rangeToUTF8(codepoint_t lo, codepoint_t hi) {
54    const auto min_lgth = UTF<8>::encoded_length(lo);
55    const auto max_lgth = UTF<8>::encoded_length(hi);
56    if (min_lgth < max_lgth) {
57        const auto m = UTF<8>::max_codepoint_of_length(min_lgth);
58        return makeAlt({rangeToUTF8(lo, m), rangeToUTF8(m + 1, hi)});
59    }
60    else {
61        return rangeCodeUnits(lo, hi, 1, max_lgth);
62    }
63}
64
65RE * UTF8_Transformer::transformCC(CC * cc) {
66    if (cc->getAlphabet() != &cc::Unicode) return cc;
67    std::vector<RE *> alt;
68    for (const interval_t & i : *cc) {
69        alt.push_back(rangeToUTF8(lo_codepoint(i), hi_codepoint(i)));
70    }
71    return makeAlt(alt.begin(), alt.end());
72}
73
74}
75
Note: See TracBrowser for help on using the repository browser.