source: icGREP/icgrep-devel/icgrep/re/grapheme_clusters.cpp @ 6184

Last change on this file since 6184 was 6177, checked in by cameron, 9 months ago

GraphemeModeTransformer?, PrintREs

File size: 7.6 KB
Line 
1#include "grapheme_clusters.h"
2#include <re/re_cc.h>
3#include <re/re_name.h>
4#include <re/re_alt.h>             // for Alt, makeAlt
5#include <re/re_any.h>             // for makeAny, Any
6#include <re/re_assertion.h>       // for Assertion, Assertion::Sense, Asser...
7#include <re/re_diff.h>            // for Diff, makeDiff
8#include <re/re_group.h>
9#include <re/re_intersect.h>       // for Intersect
10#include <re/re_name.h>            // for Name
11#include <re/re_rep.h>             // for Rep, makeRep
12#include <re/re_seq.h>             // for Seq, makeSeq
13#include <re/re_start.h>
14#include <re/re_end.h>
15#include <re/re_range.h>
16#include <re/printer_re.h>
17#include <re/re_name_resolve.h>
18#include <re/re_toolchain.h>
19
20#include <vector>                  // for vector, allocator
21#include <llvm/Support/Casting.h>  // for dyn_cast, isa
22#include <llvm/Support/ErrorHandling.h>
23#include <llvm/Support/raw_ostream.h>
24
25
26/*
 Unicode Technical Standard #18 defines grapheme cluster mode, signified by the (?g) switch.
 In this mode, a grapheme cluster boundary assertion \b{g} is required
 after every atomic literal.
30 
31 resolveGraphemeMode transforms a regular expression to perform the required insertion of
32 grapheme cluster boundaries, and the elimination of grapheme cluster mode groups.
33
34*/
35
36using namespace llvm;
37
38namespace re {
39bool hasGraphemeClusterBoundary(const RE * re) {
40    if (isa<CC>(re) || isa<Range>(re)) {
41        return false;
42    } else if (const Name * n = dyn_cast<Name>(re)) {
43        if (n->getType() == Name::Type::ZeroWidth) {
44            const std::string nameString = n->getName();
45            return nameString == "\\b{g}";
46        }
47        return false;
48    } else if (const Alt * alt = dyn_cast<Alt>(re)) {
49        for (const RE * re : *alt) {
50            if (hasGraphemeClusterBoundary(re)) return true;
51        }
52        return false;
53    } else if (const Seq * seq = dyn_cast<Seq>(re)) {
54        for (const RE * re : *seq) {
55            if (hasGraphemeClusterBoundary(re)) return true;
56        }
57        return false;
58    } else if (const Rep * rep = dyn_cast<Rep>(re)) {
59        return hasGraphemeClusterBoundary(rep->getRE());
60    } else if (const Diff * diff = dyn_cast<Diff>(re)) {
61        return hasGraphemeClusterBoundary(diff->getLH()) || hasGraphemeClusterBoundary(diff->getRH());
62    } else if (const Intersect * e = dyn_cast<Intersect>(re)) {
63        return hasGraphemeClusterBoundary(e->getLH()) || hasGraphemeClusterBoundary(e->getRH());
64    } else if (isa<Start>(re) || isa<End>(re)) {
65        return false;
66    } else if (const Assertion * a = dyn_cast<Assertion>(re)) {
67        return hasGraphemeClusterBoundary(a->getAsserted());
68    } else if (const Group * g = dyn_cast<Group>(re)) {
69        if ((g->getMode() == Group::Mode::GraphemeMode) && (g->getSense() == Group::Sense::On)) return true;
70        else return hasGraphemeClusterBoundary(g->getRE());
71    }
72    else llvm_unreachable("Unknown RE type");
73}
74
75class GraphemeModeTransformer : public RE_Transformer {
76public:
77    GraphemeModeTransformer(bool inGraphemeMode = true) : RE_Transformer("ResolveGraphemeMode"), mGraphemeMode(inGraphemeMode) {}
78   
79    RE * transformName(Name * n) override {
80        if (mGraphemeMode && (n->getName() == ".")) {
81            RE * GCB = makeZeroWidth("\\b{g}");
82            RE * nonGCB = makeDiff(makeSeq({}), GCB);
83            return makeSeq({makeAny(), makeRep(makeSeq({nonGCB, makeAny()}), 0, Rep::UNBOUNDED_REP), GCB});
84        }
85        return n;
86    }
87   
88    RE * transformCC(CC * cc) override {
89        if (mGraphemeMode) return makeSeq({cc, makeZeroWidth("\\b{g}")});
90        return cc;
91    }
92   
93    RE * transformRange(Range * rg) override {
94        if (mGraphemeMode) return makeSeq({rg, makeZeroWidth("\\b{g}")});
95        return rg;
96    }
97   
98    RE * transformGroup(Group * g) override {
99        if (g->getMode() == Group::Mode::GraphemeMode) {
100            RE * r = g->getRE();
101            bool modeSave = mGraphemeMode;
102            mGraphemeMode = g->getSense() == Group::Sense::On;
103            RE * t = transform(r);
104            mGraphemeMode = modeSave;
105            return t;
106        } else {
107            return RE_Transformer::transformGroup(g);
108        }
109    }
110   
111    RE * transformSeq(Seq * seq) override {
112        std::vector<RE*> list;
113        bool afterSingleChar = false;
114        bool changed = false;
115        for (auto i = seq->begin(); i != seq->end(); ++i) {
116            bool atSingleChar = isa<CC>(*i) && (cast<CC>(*i)->count() == 1);
117            if (afterSingleChar && mGraphemeMode && !atSingleChar) {
118                list.push_back(makeZeroWidth("\\b{g}"));
119                changed = true;
120            }
121            if (isa<CC>(*i)) {
122                list.push_back(*i);
123            } else {
124                RE * t = transform(*i);
125                if (*i != t) changed = true;
126                list.push_back(t);
127            }
128            afterSingleChar = atSingleChar;
129        }
130        if (afterSingleChar && mGraphemeMode) {
131            list.push_back(makeZeroWidth("\\b{g}"));
132            changed = true;
133        }
134        if (!changed) return seq;
135        return makeSeq(list.begin(), list.end());
136    }
137
138private:
139    bool mGraphemeMode;
140};
141
142RE * resolveGraphemeMode(RE * re, bool inGraphemeMode) {
143    return GraphemeModeTransformer(inGraphemeMode).transformRE(re);
144}
145
146#define Behind(x) makeLookBehindAssertion(x)
147#define Ahead(x) makeLookAheadAssertion(x)
148
149RE * generateGraphemeClusterBoundaryRule() {
150    // 3.1.1 Grapheme Cluster Boundary Rules
151   
152    //    RE * GCB_Control = makeName("gcb", "cn", Name::Type::UnicodeProperty);
153    RE * GCB_CR = makeName("gcb", "cr", Name::Type::UnicodeProperty);
154    RE * GCB_LF = makeName("gcb", "lf", Name::Type::UnicodeProperty);
155    RE * GCB_Control_CR_LF = makeAlt({GCB_CR, GCB_LF});
156   
157    // Break at the start and end of text.
158    RE * GCB_1 = makeSOT();
159    RE * GCB_2 = makeEOT();
160    // Do not break between a CR and LF.
161    RE * GCB_3 = makeSeq({Behind(GCB_CR), Ahead(GCB_LF)});
162    // Otherwise, break before and after controls.
163    RE * GCB_4 = Behind(GCB_Control_CR_LF);
164    RE * GCB_5 = Ahead(GCB_Control_CR_LF);
165    RE * GCB_1_5 = makeAlt({GCB_1, GCB_2, makeDiff(makeAlt({GCB_4, GCB_5}), GCB_3)});
166   
167    RE * GCB_L = makeName("gcb", "l", Name::Type::UnicodeProperty);
168    RE * GCB_V = makeName("gcb", "v", Name::Type::UnicodeProperty);
169    RE * GCB_LV = makeName("gcb", "lv", Name::Type::UnicodeProperty);
170    RE * GCB_LVT = makeName("gcb", "lvt", Name::Type::UnicodeProperty);
171    RE * GCB_T = makeName("gcb", "t", Name::Type::UnicodeProperty);
172    RE * GCB_RI = makeName("gcb", "ri", Name::Type::UnicodeProperty);
173    // Do not break Hangul syllable sequences.
174    RE * GCB_6 = makeSeq({Behind(GCB_L), Ahead(makeAlt({GCB_L, GCB_V, GCB_LV, GCB_LVT}))});
175    RE * GCB_7 = makeSeq({Behind(makeAlt({GCB_LV, GCB_V})), Ahead(makeAlt({GCB_V, GCB_T}))});
176    RE * GCB_8 = makeSeq({Behind(makeAlt({GCB_LVT, GCB_T})), Ahead(GCB_T)});
177    // Do not break between regional indicator symbols.
178    RE * GCB_8a = makeSeq({Behind(GCB_RI), Ahead(GCB_RI)});
179    // Do not break before extending characters.
180    RE * GCB_9 = Ahead(makeName("gcb", "ex", Name::Type::UnicodeProperty));
181    // Do not break before SpacingMarks, or after Prepend characters.
182    RE * GCB_9a = Ahead(makeName("gcb", "sm", Name::Type::UnicodeProperty));
183    RE * GCB_9b = Behind(makeName("gcb", "pp", Name::Type::UnicodeProperty));
184    RE * GCB_6_9b = makeAlt({GCB_6, GCB_7, GCB_8, GCB_8a, GCB_9, GCB_9a, GCB_9b});
185    // Otherwise, break everywhere.
186    RE * GCB_10 = makeSeq({Behind(makeAny()), Ahead(makeAny())});
187   
188    //Name * gcb = makeName("gcb", Name::Type::UnicodeProperty);
189    RE * gcb = makeAlt({GCB_1_5, makeDiff(GCB_10, GCB_6_9b)});
190    return gcb;
191}
192
193}
Note: See TracBrowser for help on using the repository browser.