source: icGREP/icgrep-devel/icgrep/re/grapheme_clusters.cpp @ 6173

Last change on this file since 6173 was 6173, checked in by nmedfort, 11 months ago

Added RE_Inspector.

Migrated RE passes to RE_Transformer.

Incorporated Memoizer functionality into RE_Transformer/Inspector.

Removed Memoizer.

Bug fix for unicode_set.

File size: 8.0 KB
Line 
1#include "grapheme_clusters.h"
2#include <re/re_cc.h>
3#include <re/re_name.h>
4#include <re/re_alt.h>             // for Alt, makeAlt
5#include <re/re_any.h>             // for makeAny, Any
6#include <re/re_assertion.h>       // for Assertion, Assertion::Sense, Asser...
7#include <re/re_diff.h>            // for Diff, makeDiff
8#include <re/re_group.h>
9#include <re/re_intersect.h>       // for Intersect
10#include <re/re_name.h>            // for Name
11#include <re/re_rep.h>             // for Rep, makeRep
12#include <re/re_seq.h>             // for Seq, makeSeq
13#include <re/re_start.h>
14#include <re/re_end.h>
15#include <re/re_range.h>
16#include <re/printer_re.h>
17#include <re/re_name_resolve.h>
18
19#include <vector>                  // for vector, allocator
20#include <llvm/Support/Casting.h>  // for dyn_cast, isa
21#include <llvm/Support/ErrorHandling.h>
22#include <llvm/Support/raw_ostream.h>
23
24
25/*
26 Unicode Technical Standard #18 defines grapheme cluster mode, signified by the (?g) switch.
27 The mode is defined by asserting a grapheme cluster boundary, \b{g},
28 after every atomic literal.
29 
30 resolveGraphemeMode transforms a regular expression to perform the required insertion of
31 grapheme cluster boundaries, and the elimination of grapheme cluster mode groups.
32
33*/
34
35using namespace llvm;
36
37namespace re {
38bool hasGraphemeClusterBoundary(const RE * re) {
39    if (isa<CC>(re) || isa<Range>(re)) {
40        return false;
41    } else if (const Name * n = dyn_cast<Name>(re)) {
42        if (n->getType() == Name::Type::ZeroWidth) {
43            const std::string nameString = n->getName();
44            return nameString == "\\b{g}";
45        }
46        return false;
47    } else if (const Alt * alt = dyn_cast<Alt>(re)) {
48        for (const RE * re : *alt) {
49            if (hasGraphemeClusterBoundary(re)) return true;
50        }
51        return false;
52    } else if (const Seq * seq = dyn_cast<Seq>(re)) {
53        for (const RE * re : *seq) {
54            if (hasGraphemeClusterBoundary(re)) return true;
55        }
56        return false;
57    } else if (const Rep * rep = dyn_cast<Rep>(re)) {
58        return hasGraphemeClusterBoundary(rep->getRE());
59    } else if (const Diff * diff = dyn_cast<Diff>(re)) {
60        return hasGraphemeClusterBoundary(diff->getLH()) || hasGraphemeClusterBoundary(diff->getRH());
61    } else if (const Intersect * e = dyn_cast<Intersect>(re)) {
62        return hasGraphemeClusterBoundary(e->getLH()) || hasGraphemeClusterBoundary(e->getRH());
63    } else if (isa<Start>(re) || isa<End>(re)) {
64        return false;
65    } else if (const Assertion * a = dyn_cast<Assertion>(re)) {
66        return hasGraphemeClusterBoundary(a->getAsserted());
67    } else if (const Group * g = dyn_cast<Group>(re)) {
68        if ((g->getMode() == Group::Mode::GraphemeMode) && (g->getSense() == Group::Sense::On)) return true;
69        else return hasGraphemeClusterBoundary(g->getRE());
70    }
71    else llvm_unreachable("Unknown RE type");
72}
73
74RE * resolveGraphemeMode(RE * re, bool inGraphemeMode) {
75    if (isa<Name>(re)) {
76        if (inGraphemeMode && (cast<Name>(re)->getName() == ".")) {
77            RE * GCB = makeZeroWidth("\\b{g}");
78            RE * nonGCB = makeDiff(makeSeq({}), GCB);
79            return makeSeq({makeAny(), makeRep(makeSeq({nonGCB, makeAny()}), 0, Rep::UNBOUNDED_REP), GCB});
80        }
81        else return re;
82    }
83    else if (isa<CC>(re) || isa<Range>(re)) {
84        if (inGraphemeMode) return makeSeq({re, makeZeroWidth("\\b{g}")});
85        else return re;
86    }
87    else if (Seq * seq = dyn_cast<Seq>(re)) {
88        std::vector<RE*> list;
89        bool afterSingleChar = false;
90        for (auto i = seq->begin(); i != seq->end(); ++i) {
91            bool atSingleChar = isa<CC>(re) && (cast<CC>(re)->count() == 1);
92            if (afterSingleChar && inGraphemeMode && !atSingleChar)
93                list.push_back(makeZeroWidth("\\b{g}"));
94            if (isa<CC>(re)) list.push_back(*i);
95            else {
96                list.push_back(resolveGraphemeMode(*i, inGraphemeMode));
97            }
98            afterSingleChar = atSingleChar;
99        }
100        if (afterSingleChar && inGraphemeMode) list.push_back(makeZeroWidth("\\b{g}"));
101        return makeSeq(list.begin(), list.end());
102    } else if (Group * g = dyn_cast<Group>(re)) {
103        if (g->getMode() == Group::Mode::GraphemeMode) {
104            return resolveGraphemeMode(g->getRE(), g->getSense() == Group::Sense::On);
105        }
106        else {
107            return makeGroup(g->getMode(), resolveGraphemeMode(g->getRE(), inGraphemeMode), g->getSense());
108        }
109    } else if (Alt * alt = dyn_cast<Alt>(re)) {
110        std::vector<RE*> list;
111        for (auto i = alt->begin(); i != alt->end(); ++i) {
112            list.push_back(resolveGraphemeMode(*i, inGraphemeMode));
113        }
114        return makeAlt(list.begin(), list.end());
115    } else if (Rep * rep = dyn_cast<Rep>(re)) {
116        return makeRep(resolveGraphemeMode(rep->getRE(), inGraphemeMode), rep->getLB(), rep->getUB());
117    } else if (const Diff * diff = dyn_cast<const Diff>(re)) {
118        return makeDiff(resolveGraphemeMode(diff->getLH(), inGraphemeMode),
119                        resolveGraphemeMode(diff->getRH(), inGraphemeMode));
120    } else if (const Intersect * e = dyn_cast<const Intersect>(re)) {
121        return makeIntersect(resolveGraphemeMode(e->getLH(), inGraphemeMode),
122                             resolveGraphemeMode(e->getRH(), inGraphemeMode));
123    } else if (const Assertion * a = dyn_cast<Assertion>(re)) {
124        return makeAssertion(resolveGraphemeMode(a->getAsserted(), inGraphemeMode), a->getKind(), a->getSense());
125    } else if (isa<Start>(re) || isa<End>(re)) {
126        return re;
127    } else llvm_unreachable("Unknown RE type");
128}
129
130
131#define Behind(x) makeLookBehindAssertion(x)
132#define Ahead(x) makeLookAheadAssertion(x)
133
134RE * generateGraphemeClusterBoundaryRule() {
135    // 3.1.1 Grapheme Cluster Boundary Rules
136   
137    //    RE * GCB_Control = makeName("gcb", "cn", Name::Type::UnicodeProperty);
138    RE * GCB_CR = makeName("gcb", "cr", Name::Type::UnicodeProperty);
139    RE * GCB_LF = makeName("gcb", "lf", Name::Type::UnicodeProperty);
140    RE * GCB_Control_CR_LF = makeAlt({GCB_CR, GCB_LF});
141   
142    // Break at the start and end of text.
143    RE * GCB_1 = makeSOT();
144    RE * GCB_2 = makeEOT();
145    // Do not break between a CR and LF.
146    RE * GCB_3 = makeSeq({Behind(GCB_CR), Ahead(GCB_LF)});
147    // Otherwise, break before and after controls.
148    RE * GCB_4 = Behind(GCB_Control_CR_LF);
149    RE * GCB_5 = Ahead(GCB_Control_CR_LF);
150    RE * GCB_1_5 = makeAlt({GCB_1, GCB_2, makeDiff(makeAlt({GCB_4, GCB_5}), GCB_3)});
151   
152    RE * GCB_L = makeName("gcb", "l", Name::Type::UnicodeProperty);
153    RE * GCB_V = makeName("gcb", "v", Name::Type::UnicodeProperty);
154    RE * GCB_LV = makeName("gcb", "lv", Name::Type::UnicodeProperty);
155    RE * GCB_LVT = makeName("gcb", "lvt", Name::Type::UnicodeProperty);
156    RE * GCB_T = makeName("gcb", "t", Name::Type::UnicodeProperty);
157    RE * GCB_RI = makeName("gcb", "ri", Name::Type::UnicodeProperty);
158    // Do not break Hangul syllable sequences.
159    RE * GCB_6 = makeSeq({Behind(GCB_L), Ahead(makeAlt({GCB_L, GCB_V, GCB_LV, GCB_LVT}))});
160    RE * GCB_7 = makeSeq({Behind(makeAlt({GCB_LV, GCB_V})), Ahead(makeAlt({GCB_V, GCB_T}))});
161    RE * GCB_8 = makeSeq({Behind(makeAlt({GCB_LVT, GCB_T})), Ahead(GCB_T)});
162    // Do not break between regional indicator symbols.
163    RE * GCB_8a = makeSeq({Behind(GCB_RI), Ahead(GCB_RI)});
164    // Do not break before extending characters.
165    RE * GCB_9 = Ahead(makeName("gcb", "ex", Name::Type::UnicodeProperty));
166    // Do not break before SpacingMarks, or after Prepend characters.
167    RE * GCB_9a = Ahead(makeName("gcb", "sm", Name::Type::UnicodeProperty));
168    RE * GCB_9b = Behind(makeName("gcb", "pp", Name::Type::UnicodeProperty));
169    RE * GCB_6_9b = makeAlt({GCB_6, GCB_7, GCB_8, GCB_8a, GCB_9, GCB_9a, GCB_9b});
170    // Otherwise, break everywhere.
171    RE * GCB_10 = makeSeq({Behind(makeAny()), Ahead(makeAny())});
172   
173    //Name * gcb = makeName("gcb", Name::Type::UnicodeProperty);
174    RE * gcb = makeAlt({GCB_1_5, makeDiff(GCB_10, GCB_6_9b)});
175    return gcb;
176}
177
178}
Note: See TracBrowser for help on using the repository browser.