source: icGREP/icgrep-devel/icgrep/re/grapheme_clusters.cpp

Last change on this file was 6245, checked in by cameron, 4 months ago

Fixes for grapheme cluster breaking

File size: 9.3 KB
Line 
1#include "grapheme_clusters.h"
2#include <re/re_cc.h>
3#include <re/re_name.h>
4#include <re/re_alt.h>             // for Alt, makeAlt
5#include <re/re_any.h>             // for makeAny, Any
6#include <re/re_assertion.h>       // for Assertion, Assertion::Sense, Asser...
7#include <re/re_diff.h>            // for Diff, makeDiff
8#include <re/re_group.h>
9#include <re/re_intersect.h>       // for Intersect
10#include <re/re_name.h>            // for Name
11#include <re/re_rep.h>             // for Rep, makeRep
12#include <re/re_seq.h>             // for Seq, makeSeq
13#include <re/re_start.h>
14#include <re/re_end.h>
15#include <re/re_range.h>
16#include <re/printer_re.h>
17#include <re/re_name_resolve.h>
18#include <re/re_toolchain.h>
19
20#include <vector>                  // for vector, allocator
21#include <llvm/Support/Casting.h>  // for dyn_cast, isa
22#include <llvm/Support/ErrorHandling.h>
23#include <llvm/Support/raw_ostream.h>
24
25
/*
 Unicode Technical Standard #18 defines grapheme cluster mode, signified by the (?g) switch.
 In this mode, a grapheme cluster boundary assertion \b{g} is implicitly asserted after
 every atomic literal.

 resolveGraphemeMode transforms a regular expression by inserting the required grapheme
 cluster boundary assertions and by eliminating the grapheme cluster mode groups.
*/
35
36using namespace llvm;
37
38namespace re {
39bool hasGraphemeClusterBoundary(const RE * re) {
40    if (isa<CC>(re) || isa<Range>(re)) {
41        return false;
42    } else if (const Name * n = dyn_cast<Name>(re)) {
43        if (n->getType() == Name::Type::ZeroWidth) {
44            const std::string nameString = n->getName();
45            return nameString == "\\b{g}";
46        }
47        return false;
48    } else if (const Alt * alt = dyn_cast<Alt>(re)) {
49        for (const RE * re : *alt) {
50            if (hasGraphemeClusterBoundary(re)) return true;
51        }
52        return false;
53    } else if (const Seq * seq = dyn_cast<Seq>(re)) {
54        for (const RE * re : *seq) {
55            if (hasGraphemeClusterBoundary(re)) return true;
56        }
57        return false;
58    } else if (const Rep * rep = dyn_cast<Rep>(re)) {
59        return hasGraphemeClusterBoundary(rep->getRE());
60    } else if (const Diff * diff = dyn_cast<Diff>(re)) {
61        return hasGraphemeClusterBoundary(diff->getLH()) || hasGraphemeClusterBoundary(diff->getRH());
62    } else if (const Intersect * e = dyn_cast<Intersect>(re)) {
63        return hasGraphemeClusterBoundary(e->getLH()) || hasGraphemeClusterBoundary(e->getRH());
64    } else if (isa<Start>(re) || isa<End>(re)) {
65        return false;
66    } else if (const Assertion * a = dyn_cast<Assertion>(re)) {
67        return hasGraphemeClusterBoundary(a->getAsserted());
68    } else if (const Group * g = dyn_cast<Group>(re)) {
69        if ((g->getMode() == Group::Mode::GraphemeMode) && (g->getSense() == Group::Sense::On)) return true;
70        else return hasGraphemeClusterBoundary(g->getRE());
71    }
72    else llvm_unreachable("Unknown RE type");
73}
74
75class GraphemeModeTransformer : public RE_Transformer {
76public:
77    GraphemeModeTransformer(bool inGraphemeMode = true) : RE_Transformer("ResolveGraphemeMode"), mGraphemeMode(inGraphemeMode) {}
78   
79    RE * transformName(Name * n) override {
80        if (mGraphemeMode && (n->getName() == ".")) {
81            RE * GCB = makeZeroWidth("\\b{g}");
82            RE * nonGCB = makeDiff(makeSeq({}), GCB);
83            return makeSeq({makeAny(), makeRep(makeSeq({nonGCB, makeAny()}), 0, Rep::UNBOUNDED_REP), GCB});
84        }
85        return n;
86    }
87   
88    RE * transformCC(CC * cc) override {
89        if (mGraphemeMode) return makeSeq({cc, makeZeroWidth("\\b{g}")});
90        return cc;
91    }
92   
93    RE * transformRange(Range * rg) override {
94        if (mGraphemeMode) return makeSeq({rg, makeZeroWidth("\\b{g}")});
95        return rg;
96    }
97   
98    RE * transformGroup(Group * g) override {
99        if (g->getMode() == Group::Mode::GraphemeMode) {
100            RE * r = g->getRE();
101            bool modeSave = mGraphemeMode;
102            mGraphemeMode = g->getSense() == Group::Sense::On;
103            RE * t = transform(r);
104            mGraphemeMode = modeSave;
105            return t;
106        } else {
107            return RE_Transformer::transformGroup(g);
108        }
109    }
110   
111    RE * transformSeq(Seq * seq) override {
112        std::vector<RE*> list;
113        bool afterSingleChar = false;
114        bool changed = false;
115        for (auto i = seq->begin(); i != seq->end(); ++i) {
116            bool atSingleChar = isa<CC>(*i) && (cast<CC>(*i)->count() == 1);
117            if (afterSingleChar && mGraphemeMode && !atSingleChar) {
118                list.push_back(makeZeroWidth("\\b{g}"));
119                changed = true;
120            }
121            if (isa<CC>(*i)) {
122                list.push_back(*i);
123            } else {
124                RE * t = transform(*i);
125                if (*i != t) changed = true;
126                list.push_back(t);
127            }
128            afterSingleChar = atSingleChar;
129        }
130        if (afterSingleChar && mGraphemeMode) {
131            list.push_back(makeZeroWidth("\\b{g}"));
132            changed = true;
133        }
134        if (!changed) return seq;
135        return makeSeq(list.begin(), list.end());
136    }
137
138private:
139    bool mGraphemeMode;
140};
141
142RE * resolveGraphemeMode(RE * re, bool inGraphemeMode) {
143    return GraphemeModeTransformer(inGraphemeMode).transformRE(re);
144}
145
// Short names for the four lookaround assertion constructors, used to keep
// the grapheme cluster boundary rules below readable.
#define Behind(expr) makeLookBehindAssertion(expr)
#define notBehind(expr) makeNegativeLookBehindAssertion(expr)
#define Ahead(expr) makeLookAheadAssertion(expr)
#define notAhead(expr) makeNegativeLookAheadAssertion(expr)
150
151RE * generateGraphemeClusterBoundaryRule(bool extendedGraphemeClusters) {
152    // 3.1.1 Grapheme Cluster Boundary Rules
153    // Grapheme cluster boundary rules define a number of contexts where
154    // breaks are not permitted.  In the following definitions, we identify
155    // the points at which breaks are not permitted are identified by the
156    // definitions marked GCX.
157   
158    // Rules GB1, GB2, GB4 and GB5 define rules where breaks occur overriding
159    // later rules (specifically GB9, GB9a, GB9b).
160    // Rules GB9 and GB9a are overridden by GB1 and GB4, to allow breaks
161    // at start of text or after any control|CR|LF.  This is equivalent
162    // to stating that the lookbehind context for GB9 and GB9b is any
163    // non-control character (any actual character not in control|CR|LF).
164    // Similarly, the overriding of GB9b simplifies to a lookahead assertion
165    // on a noncontrol.
166    //
167    RE * GCB_CR = makeName("gcb", "cr", Name::Type::UnicodeProperty);
168    RE * GCB_LF = makeName("gcb", "lf", Name::Type::UnicodeProperty);
169    RE * GCB_Control = makeName("gcb", "control", Name::Type::UnicodeProperty);
170    RE * GCB_Control_CR_LF = makeAlt({GCB_Control, GCB_CR, GCB_LF});
171   
172    // Break at the start and end of text.
173    RE * GCB_1 = makeSOT();
174    RE * GCB_2 = makeEOT();
175    // Do not break between a CR and LF.
176    RE * GCB_3 = makeSeq({Behind(GCB_CR), Ahead(GCB_LF)});
177    // Otherwise, break before and after controls.
178    RE * GCB_4 = Behind(GCB_Control_CR_LF);
179    RE * GCB_5 = Ahead(GCB_Control_CR_LF);
180    RE * GCB_1_5 = makeAlt({GCB_1, GCB_2, makeDiff(makeAlt({GCB_4, GCB_5}), GCB_3)});
181   
182   
183    // Do not break Hangul syllable sequences.
184    RE * GCB_L = makeName("gcb", "l", Name::Type::UnicodeProperty);
185    RE * GCB_V = makeName("gcb", "v", Name::Type::UnicodeProperty);
186    RE * GCB_LV = makeName("gcb", "lv", Name::Type::UnicodeProperty);
187    RE * GCB_LVT = makeName("gcb", "lvt", Name::Type::UnicodeProperty);
188    RE * GCB_T = makeName("gcb", "t", Name::Type::UnicodeProperty);
189    RE * GCX_6 = makeSeq({Behind(GCB_L), Ahead(makeAlt({GCB_L, GCB_V, GCB_LV, GCB_LVT}))});
190    RE * GCX_7 = makeSeq({Behind(makeAlt({GCB_LV, GCB_V})), Ahead(makeAlt({GCB_V, GCB_T}))});
191    RE * GCX_8 = makeSeq({Behind(makeAlt({GCB_LVT, GCB_T})), Ahead(GCB_T)});
192   
193    // Do not break before extendiers or zero-width joiners.
194    RE * GCB_EX = makeName("gcb", "ex", Name::Type::UnicodeProperty);
195    RE * GCB_ZWJ = makeName("gcb", "zwj", Name::Type::UnicodeProperty);
196    RE * GCX_9 = makeSeq({notBehind(GCB_Control_CR_LF), Ahead(makeAlt({GCB_EX, GCB_ZWJ}))});
197
198    if (extendedGraphemeClusters) {
199        RE * GCB_SpacingMark = makeName("gcb", "sm", Name::Type::UnicodeProperty);
200        RE * GCB_Prepend = makeName("gcb", "pp", Name::Type::UnicodeProperty);
201        RE * GCX_9a = makeSeq({notBehind(GCB_Control_CR_LF), Ahead(GCB_SpacingMark)});
202        RE * GCX_9b = makeSeq({Behind(GCB_Prepend), notAhead(GCB_Control_CR_LF)});
203        GCX_9 = makeAlt({GCX_9, GCX_9a, GCX_9b});
204    }
205
206    RE * ExtendedPictographic = makeName("Extended_Pictographic", Name::Type::UnicodeProperty);
207    RE * EmojiSeq = makeSeq({ExtendedPictographic, makeRep(GCB_EX, 0, Rep::UNBOUNDED_REP), GCB_ZWJ});
208    RE * GCX_11 = makeSeq({Behind(EmojiSeq), Ahead(ExtendedPictographic)});
209   
210    RE * GCB_RI = makeName("gcb", "ri", Name::Type::UnicodeProperty);
211    // Note: notBehind(RI) == sot | [^RI]
212    RE * odd_RI_seq = makeSeq({notBehind(GCB_RI), makeRep(makeSeq({GCB_RI, GCB_RI}), 0, Rep::UNBOUNDED_REP), GCB_RI});
213    RE * GCX_12_13 = makeSeq({Behind(odd_RI_seq), Ahead(GCB_RI)});
214   
215    //Name * gcb = makeName("gcb", Name::Type::UnicodeProperty);
216    RE * GCX = makeAlt({GCX_6, GCX_7, GCX_8, GCX_9, GCX_11, GCX_12_13});
217   
218    // Otherwise, break everywhere.
219    RE * GCB_999 = makeSeq({Behind(makeAny()), Ahead(makeAny())});
220   
221    //Name * gcb = makeName("gcb", Name::Type::UnicodeProperty);
222    RE * gcb = makeAlt({GCB_1_5, makeDiff(GCB_999, GCX)});
223    return gcb;
224}
225
226
227}
Note: See TracBrowser for help on using the repository browser.