source: icGREP/icgrep-devel/icgrep/re/grapheme_clusters.cpp @ 6141

Last change on this file since 6141 was 5896, checked in by cameron, 18 months ago

Start-of-text and End-of-text boundary assertions

File size: 7.9 KB
Line 
1#include "grapheme_clusters.h"
2#include <re/re_cc.h>
3#include <re/re_name.h>
4#include <re/re_alt.h>             // for Alt, makeAlt
5#include <re/re_any.h>             // for makeAny, Any
6#include <re/re_assertion.h>       // for Assertion, Assertion::Sense, Asser...
7#include <re/re_diff.h>            // for Diff, makeDiff
8#include <re/re_group.h>
9#include <re/re_intersect.h>       // for Intersect
10#include <re/re_name.h>            // for Name
11#include <re/re_rep.h>             // for Rep, makeRep
12#include <re/re_seq.h>             // for Seq, makeSeq
13#include <re/re_start.h>
14#include <re/re_end.h>
15#include <re/re_range.h>
16#include <re/printer_re.h>
17#include <re/re_name_resolve.h>
18#include <vector>                  // for vector, allocator
19#include <llvm/Support/Casting.h>  // for dyn_cast, isa
20#include <llvm/Support/ErrorHandling.h>
21#include <llvm/Support/raw_ostream.h>
22
/*
 Unicode Technical Standard #18 defines grapheme cluster mode, signified by the (?g) switch.
 The mode is defined in terms of inserting grapheme cluster boundary assertions \b{g}
 after every atomic literal.

 resolveGraphemeMode transforms a regular expression to perform the required insertion of
 grapheme cluster boundaries, and the elimination of grapheme cluster mode groups.

*/
32
33using namespace llvm;
34
35namespace re {
36bool hasGraphemeClusterBoundary(const RE * re) {
37    if (isa<CC>(re)) {
38        return false;
39    } else if (const Name * n = dyn_cast<Name>(re)) {
40        if (n->getType() == Name::Type::ZeroWidth) {
41            const std::string nameString = n->getName();
42            return nameString == "\\b{g}";
43        }
44        return false;
45    } else if (const Alt * alt = dyn_cast<Alt>(re)) {
46        for (const RE * re : *alt) {
47            if (hasGraphemeClusterBoundary(re)) return true;
48        }
49        return false;
50    } else if (const Seq * seq = dyn_cast<Seq>(re)) {
51        for (const RE * re : *seq) {
52            if (hasGraphemeClusterBoundary(re)) return true;
53        }
54        return false;
55    } else if (const Rep * rep = dyn_cast<Rep>(re)) {
56        return hasGraphemeClusterBoundary(rep->getRE());
57    } else if (const Diff * diff = dyn_cast<Diff>(re)) {
58        return hasGraphemeClusterBoundary(diff->getLH()) || hasGraphemeClusterBoundary(diff->getRH());
59    } else if (const Intersect * e = dyn_cast<Intersect>(re)) {
60        return hasGraphemeClusterBoundary(e->getLH()) || hasGraphemeClusterBoundary(e->getRH());
61    } else if (isa<Start>(re) || isa<End>(re)) {
62        return false;
63    } else if (const Assertion * a = dyn_cast<Assertion>(re)) {
64        return hasGraphemeClusterBoundary(a->getAsserted());
65    } else if (const Group * g = dyn_cast<Group>(re)) {
66        if ((g->getMode() == Group::Mode::GraphemeMode) && (g->getSense() == Group::Sense::On)) return true;
67        else return hasGraphemeClusterBoundary(g->getRE());
68    }
69    else llvm_unreachable("Unknown RE type");
70}
71   
72RE * resolveGraphemeMode(RE * re, bool inGraphemeMode) {
73    if (isa<Name>(re)) {
74        if (inGraphemeMode && (cast<Name>(re)->getName() == ".")) {
75            RE * GCB = makeZeroWidth("\\b{g}");
76            RE * nonGCB = makeDiff(makeSeq({}), GCB);
77            return makeSeq({makeAny(), makeRep(makeSeq({nonGCB, makeAny()}), 0, Rep::UNBOUNDED_REP), GCB});
78        }
79        else return re;
80    }
81    else if (isa<CC>(re) || isa<Range>(re)) {
82        if (inGraphemeMode) return makeSeq({re, makeZeroWidth("\\b{g}")});
83        else return re;
84    }
85    else if (Seq * seq = dyn_cast<Seq>(re)) {
86        std::vector<RE*> list;
87        bool afterSingleChar = false;
88        for (auto i = seq->begin(); i != seq->end(); ++i) {
89            bool atSingleChar = isa<CC>(re) && (cast<CC>(re)->count() == 1);
90            if (afterSingleChar && inGraphemeMode && !atSingleChar)
91                list.push_back(makeZeroWidth("\\b{g}"));
92            if (isa<CC>(re)) list.push_back(*i);
93            else {
94                list.push_back(resolveGraphemeMode(*i, inGraphemeMode));
95            }
96            afterSingleChar = atSingleChar;
97        }
98        if (afterSingleChar && inGraphemeMode) list.push_back(makeZeroWidth("\\b{g}"));
99        return makeSeq(list.begin(), list.end());
100    } else if (Group * g = dyn_cast<Group>(re)) {
101        if (g->getMode() == Group::Mode::GraphemeMode) {
102            return resolveGraphemeMode(g->getRE(), g->getSense() == Group::Sense::On);
103        }
104        else {
105            return makeGroup(g->getMode(), resolveGraphemeMode(g->getRE(), inGraphemeMode), g->getSense());
106        }
107    } else if (Alt * alt = dyn_cast<Alt>(re)) {
108        std::vector<RE*> list;
109        for (auto i = alt->begin(); i != alt->end(); ++i) {
110            list.push_back(resolveGraphemeMode(*i, inGraphemeMode));
111        }
112        return makeAlt(list.begin(), list.end());
113    } else if (Rep * rep = dyn_cast<Rep>(re)) {
114        return makeRep(resolveGraphemeMode(rep->getRE(), inGraphemeMode), rep->getLB(), rep->getUB());
115    } else if (const Diff * diff = dyn_cast<const Diff>(re)) {
116        return makeDiff(resolveGraphemeMode(diff->getLH(), inGraphemeMode),
117                        resolveGraphemeMode(diff->getRH(), inGraphemeMode));
118    } else if (const Intersect * e = dyn_cast<const Intersect>(re)) {
119        return makeIntersect(resolveGraphemeMode(e->getLH(), inGraphemeMode),
120                             resolveGraphemeMode(e->getRH(), inGraphemeMode));
121    } else if (const Assertion * a = dyn_cast<Assertion>(re)) {
122        return makeAssertion(resolveGraphemeMode(a->getAsserted(), inGraphemeMode), a->getKind(), a->getSense());
123    } else if (isa<Start>(re) || isa<End>(re)) {
124        return re;
125    } else llvm_unreachable("Unknown RE type");
126}
127
128
// Shorthand used by generateGraphemeClusterBoundaryRule below:
// Behind(x) builds a lookbehind assertion on x, Ahead(x) a lookahead assertion on x.
#define Behind(x) makeLookBehindAssertion(x)
#define Ahead(x) makeLookAheadAssertion(x)
131
132RE * generateGraphemeClusterBoundaryRule() {
133    // 3.1.1 Grapheme Cluster Boundary Rules
134   
135    //    RE * GCB_Control = makeName("gcb", "cn", Name::Type::UnicodeProperty);
136    RE * GCB_CR = makeName("gcb", "cr", Name::Type::UnicodeProperty);
137    RE * GCB_LF = makeName("gcb", "lf", Name::Type::UnicodeProperty);
138    RE * GCB_Control_CR_LF = makeAlt({GCB_CR, GCB_LF});
139   
140    // Break at the start and end of text.
141    RE * GCB_1 = makeSOT();
142    RE * GCB_2 = makeEOT();
143    // Do not break between a CR and LF.
144    RE * GCB_3 = makeSeq({Behind(GCB_CR), Ahead(GCB_LF)});
145    // Otherwise, break before and after controls.
146    RE * GCB_4 = Behind(GCB_Control_CR_LF);
147    RE * GCB_5 = Ahead(GCB_Control_CR_LF);
148    RE * GCB_1_5 = makeAlt({GCB_1, GCB_2, makeDiff(makeAlt({GCB_4, GCB_5}), GCB_3)});
149   
150    RE * GCB_L = makeName("gcb", "l", Name::Type::UnicodeProperty);
151    RE * GCB_V = makeName("gcb", "v", Name::Type::UnicodeProperty);
152    RE * GCB_LV = makeName("gcb", "lv", Name::Type::UnicodeProperty);
153    RE * GCB_LVT = makeName("gcb", "lvt", Name::Type::UnicodeProperty);
154    RE * GCB_T = makeName("gcb", "t", Name::Type::UnicodeProperty);
155    RE * GCB_RI = makeName("gcb", "ri", Name::Type::UnicodeProperty);
156    // Do not break Hangul syllable sequences.
157    RE * GCB_6 = makeSeq({Behind(GCB_L), Ahead(makeAlt({GCB_L, GCB_V, GCB_LV, GCB_LVT}))});
158    RE * GCB_7 = makeSeq({Behind(makeAlt({GCB_LV, GCB_V})), Ahead(makeAlt({GCB_V, GCB_T}))});
159    RE * GCB_8 = makeSeq({Behind(makeAlt({GCB_LVT, GCB_T})), Ahead(GCB_T)});
160    // Do not break between regional indicator symbols.
161    RE * GCB_8a = makeSeq({Behind(GCB_RI), Ahead(GCB_RI)});
162    // Do not break before extending characters.
163    RE * GCB_9 = Ahead(makeName("gcb", "ex", Name::Type::UnicodeProperty));
164    // Do not break before SpacingMarks, or after Prepend characters.
165    RE * GCB_9a = Ahead(makeName("gcb", "sm", Name::Type::UnicodeProperty));
166    RE * GCB_9b = Behind(makeName("gcb", "pp", Name::Type::UnicodeProperty));
167    RE * GCB_6_9b = makeAlt({GCB_6, GCB_7, GCB_8, GCB_8a, GCB_9, GCB_9a, GCB_9b});
168    // Otherwise, break everywhere.
169    RE * GCB_10 = makeSeq({Behind(makeAny()), Ahead(makeAny())});
170   
171    //Name * gcb = makeName("gcb", Name::Type::UnicodeProperty);
172    RE * gcb = makeAlt({GCB_1_5, makeDiff(GCB_10, GCB_6_9b)});
173    return gcb;
174}
175
176}
Note: See TracBrowser for help on using the repository browser.