1 | /* |
---|
2 | * Copyright (c) 2017 International Characters. |
---|
3 | * This software is licensed to the public under the Open Software License 3.0. |
---|
4 | * icgrep is a trademark of International Characters. |
---|
5 | */ |
---|
6 | |
---|
7 | #include <re/to_utf8.h> |
---|
8 | #include <UCD/unicode_set.h> |
---|
9 | #include <UCD/UTF.h> |
---|
10 | #include <re/re_name.h> |
---|
11 | #include <re/re_start.h> |
---|
12 | #include <re/re_end.h> |
---|
13 | #include <re/re_cc.h> |
---|
14 | #include <re/re_seq.h> |
---|
15 | #include <re/re_alt.h> |
---|
16 | #include <re/re_rep.h> |
---|
17 | #include <re/re_diff.h> |
---|
18 | #include <re/re_intersect.h> |
---|
19 | #include <re/re_assertion.h> |
---|
20 | #include <llvm/Support/Casting.h> |
---|
21 | #include <llvm/Support/ErrorHandling.h> |
---|
22 | |
---|
23 | using namespace llvm; |
---|
24 | |
---|
25 | namespace re { |
---|
26 | |
---|
27 | static RE * rangeCodeUnits(codepoint_t lo, codepoint_t hi, unsigned index, const unsigned lgth){ |
---|
28 | const codepoint_t hunit = UTF<8>::nthCodeUnit(hi, index); |
---|
29 | const codepoint_t lunit = UTF<8>::nthCodeUnit(lo, index); |
---|
30 | if (index == lgth) { |
---|
31 | return makeCC(lunit, hunit); |
---|
32 | } |
---|
33 | else if (hunit == lunit) { |
---|
34 | return makeSeq({makeCC(hunit), rangeCodeUnits(lo, hi, index + 1, lgth)}); |
---|
35 | } |
---|
36 | else { |
---|
37 | const unsigned suffix_mask = (static_cast<unsigned>(1) << ((lgth - index) * 6)) - 1; |
---|
38 | if ((hi & suffix_mask) != suffix_mask) { |
---|
39 | const unsigned hi_floor = (~suffix_mask) & hi; |
---|
40 | return makeAlt({rangeCodeUnits(hi_floor, hi, index, lgth), rangeCodeUnits(lo, hi_floor - 1, index, lgth)}); |
---|
41 | } |
---|
42 | else if ((lo & suffix_mask) != 0) { |
---|
43 | const unsigned low_ceil = lo | suffix_mask; |
---|
44 | return makeAlt({rangeCodeUnits(low_ceil + 1, hi, index, lgth), rangeCodeUnits(lo, low_ceil, index, lgth)}); |
---|
45 | } |
---|
46 | else { |
---|
47 | return makeSeq({makeCC(lunit, hunit), rangeCodeUnits(lo, hi, index + 1, lgth)}); |
---|
48 | } |
---|
49 | } |
---|
50 | } |
---|
51 | |
---|
52 | static RE * rangeToUTF8(codepoint_t lo, codepoint_t hi) { |
---|
53 | const auto min_lgth = UTF<8>::encoded_length(lo); |
---|
54 | const auto max_lgth = UTF<8>::encoded_length(hi); |
---|
55 | if (min_lgth < max_lgth) { |
---|
56 | const auto m = UTF<8>::max_codepoint_of_length(lo); |
---|
57 | return makeAlt({rangeToUTF8(lo, m), rangeToUTF8(m + 1, hi)}); |
---|
58 | } |
---|
59 | else { |
---|
60 | return rangeCodeUnits(lo, hi, 1, max_lgth); |
---|
61 | } |
---|
62 | } |
---|
63 | |
---|
64 | RE * toUTF8(RE * r) { |
---|
65 | if (isa<Name>(r) || isa<Start>(r) || isa<End>(r)) { |
---|
66 | return r; |
---|
67 | } else if (const CC * cc = dyn_cast<CC>(r)) { |
---|
68 | std::vector<RE *> alt; |
---|
69 | for (const interval_t & i : *cc) { |
---|
70 | alt.push_back(rangeToUTF8(lo_codepoint(i), hi_codepoint(i))); |
---|
71 | } |
---|
72 | return makeAlt(alt.begin(), alt.end()); |
---|
73 | } else if (Alt * alt = dyn_cast<Alt>(r)) { |
---|
74 | std::vector<RE *> list; |
---|
75 | list.reserve(alt->size()); |
---|
76 | for (RE * a : *alt) { |
---|
77 | list.push_back(toUTF8(a)); |
---|
78 | } |
---|
79 | return makeAlt(list.begin(), list.end()); |
---|
80 | } else if (Seq * seq = dyn_cast<Seq>(r)) { |
---|
81 | std::vector<RE *> list; |
---|
82 | list.reserve(seq->size()); |
---|
83 | for (RE * s : *seq) { |
---|
84 | list.push_back(toUTF8(s)); |
---|
85 | } |
---|
86 | return makeSeq(list.begin(), list.end()); |
---|
87 | } else if (Assertion * a = dyn_cast<Assertion>(r)) { |
---|
88 | return makeAssertion(toUTF8(a->getAsserted()), a->getKind(), a->getSense()); |
---|
89 | } else if (Rep * rep = dyn_cast<Rep>(r)) { |
---|
90 | RE * expr = toUTF8(rep->getRE()); |
---|
91 | return makeRep(expr, rep->getLB(), rep->getUB()); |
---|
92 | } else if (Diff * diff = dyn_cast<Diff>(r)) { |
---|
93 | return makeDiff(toUTF8(diff->getLH()), toUTF8(diff->getRH())); |
---|
94 | } else if (Intersect * e = dyn_cast<Intersect>(r)) { |
---|
95 | return makeIntersect(toUTF8(e->getLH()), toUTF8(e->getRH())); |
---|
96 | } |
---|
97 | llvm_unreachable("unexpected RE type given to toUTF8"); |
---|
98 | return nullptr; |
---|
99 | } |
---|
100 | |
---|
101 | } |
---|
102 | |
---|