source: icGREP/icgrep-devel/icgrep/re/re_toolchain.cpp @ 6171

Last change on this file since 6171 was 6171, checked in by cameron, 9 months ago

NullablePrefix/Suffix? removal using RE_Transformer; clean out some setters

File size: 8.7 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include <toolchain/toolchain.h>
8#include <grep_interface.h>
9#include <re/re_toolchain.h>
10#include <re/re_any.h>
11#include <re/re_name.h>
12#include <re/re_cc.h>
13#include <re/re_start.h>
14#include <re/re_end.h>
15#include <re/re_alt.h>
16#include <re/re_seq.h>
17#include <re/re_diff.h>
18#include <re/re_intersect.h>
19#include <re/re_group.h>
20#include <re/re_range.h>
21#include <re/re_assertion.h>
22#include <cc/cc_compiler.h>            // for CC_Compiler
23#include <llvm/Support/CommandLine.h>  // for clEnumVal, clEnumValEnd, Optio...
24#include <re/re_compiler.h>            // for RE_Compiler
25#include <re/re_nullable.h>            // for RE_Nullable
26#include <re/re_star_normal.h>         // for RE_Star_Normal
27#include <re/re_simplifier.h>          // for RE_Simplifier
28#include <re/re_minimizer.h>
29#include <re/re_local.h>
30#include <re/printer_re.h>
31#include <re/re_analysis.h>
32#include <re/re_cc.h>
33#include <re/casing.h>
34#include <re/exclude_CC.h>
35#include <re/re_name_resolve.h>
36#include <re/grapheme_clusters.h>
37#include <llvm/Support/raw_ostream.h>
38#include <llvm/Support/ErrorHandling.h>
39#include <toolchain/toolchain.h>
40
41using namespace pablo;
42using namespace llvm;
43
44namespace re {
45
46static cl::OptionCategory RegexOptions("Regex Toolchain Options",
47                                              "These options control the regular expression transformation and compilation.");
48const cl::OptionCategory * LLVM_READONLY re_toolchain_flags() {
49    return &RegexOptions;
50}
51
52static cl::bits<RE_PrintFlags> 
53    PrintOptions(cl::values(clEnumVal(ShowREs, "Print parsed or generated regular expressions"),
54                            clEnumVal(ShowAllREs, "Print all regular expression passes"),
55                            clEnumVal(ShowStrippedREs, "Print REs with nullable prefixes/suffixes removed"),
56                            clEnumVal(ShowSimplifiedREs, "Print final simplified REs")
57                            CL_ENUM_VAL_SENTINEL), cl::cat(RegexOptions));
58
59static cl::bits<RE_AlgorithmFlags>
60    AlgorithmOptions(cl::values(clEnumVal(DisableLog2BoundedRepetition, "disable log2 optimizations for bounded repetition of bytes"),
61                              clEnumVal(DisableIfHierarchy, "disable nested if hierarchy for generated Unicode classes (not recommended)"), 
62                              clEnumVal(DisableMatchStar, "disable MatchStar optimization"), 
63                              clEnumVal(DisableUnicodeMatchStar, "disable Unicode MatchStar optimization"),
64                              clEnumVal(DisableUnicodeLineBreak, "disable Unicode line breaks - use LF only")
65                              CL_ENUM_VAL_SENTINEL), cl::cat(RegexOptions));
66
67bool LLVM_READONLY AlgorithmOptionIsSet(RE_AlgorithmFlags flag) {
68    return AlgorithmOptions.isSet(flag);
69}
70
71int IfInsertionGap;
72static cl::opt<int, true> 
73    IfInsertionGapOption("if-insertion-gap",  cl::location(IfInsertionGap), cl::init(3),
74                         cl::desc("minimum number of nonempty elements between inserted if short-circuit tests"), 
75                         cl::cat(RegexOptions));
76
77RE * resolveModesAndExternalSymbols(RE * r, bool globallyCaseInsensitive) {
78    if (PrintOptions.isSet(ShowAllREs) || PrintOptions.isSet(ShowREs)) {
79        errs() << "Parser:\n" << Printer_RE::PrintRE(r) << '\n';
80    }
81    r = resolveGraphemeMode(r, false /* not in grapheme mode at top level*/);
82    if (PrintOptions.isSet(ShowAllREs)) {
83        errs() << "resolveGraphemeMode:\n" << Printer_RE::PrintRE(r) << '\n';
84    }
85    r = re::resolveUnicodeNames(r);
86    r = resolveCaseInsensitiveMode(r, globallyCaseInsensitive);
87    return r;
88}
89
90RE * excludeUnicodeLineBreak(RE * r) {
91    r = exclude_CC(r, re::makeCC(re::makeCC(0x0A, 0x0D), re::makeCC(re::makeCC(0x85), re::makeCC(0x2028, 0x2029))));
92    if (PrintOptions.isSet(ShowAllREs)) {
93        errs() << "excludeUnicodeLineBreak:\n" << Printer_RE::PrintRE(r) << '\n';
94    }
95    return r;
96}
97
98RE * regular_expression_passes(RE * r) {
99
100    //Optimization passes to simplify the AST.
101    r = removeNullablePrefix(r);
102    r = removeNullableSuffix(r);
103    r = RE_Star_Normal().transformRE(r);
104    if (codegen::OptLevel > 1) {
105        r = RE_Minimizer::minimize(r);
106    } else {
107        r = RE_Simplifier::simplify(r);
108    }
109    if (PrintOptions.isSet(ShowAllREs) || PrintOptions.isSet(ShowSimplifiedREs)) {
110        //Print to the terminal the AST that was generated by the simplifier.
111        errs() << "Simplifier:\n" << Printer_RE::PrintRE(r) << '\n';
112    }
113
114    if (!DefiniteLengthBackReferencesOnly(r)) {
115        llvm::report_fatal_error("Future back reference support: references must be within a fixed distance from a fixed-length capture.");
116    }
117    return r;
118}
119RE * RE_Transformer::transformRE(RE * re) {
120    RE * initialRE = re;
121    RE * finalRE = transform(re);
122    if (PrintOptions.isSet(ShowAllREs) && (initialRE != finalRE) && (mTransformationName != "")) {
123        errs() << mTransformationName << ":\n" << Printer_RE::PrintRE(finalRE) << '\n';
124    }
125    return finalRE;
126}
127
128RE * RE_Transformer::transform(RE * re) {
129    if (llvm::isa<CC>(re)) return transformCC(llvm::cast<CC>(re));
130    else if (llvm::isa<Start>(re)) return transformStart(llvm::cast<Start>(re));
131    else if (llvm::isa<End>(re)) return transformEnd(llvm::cast<End>(re));
132    else if (llvm::isa<Name>(re)) return transformName(llvm::cast<Name>(re));
133    else if (llvm::isa<Seq>(re)) return transformSeq(llvm::cast<Seq>(re));
134    else if (llvm::isa<Alt>(re)) return transformAlt(llvm::cast<Alt>(re));
135    else if (llvm::isa<Rep>(re)) return transformRep(llvm::cast<Rep>(re));
136    else if (llvm::isa<Intersect>(re)) return transformIntersect(llvm::cast<Intersect>(re));
137    else if (llvm::isa<Diff>(re)) return transformDiff(llvm::cast<Diff>(re));
138    else if (llvm::isa<Range>(re)) return transformRange(llvm::cast<Range>(re));
139    else if (llvm::isa<Group>(re)) return transformGroup(llvm::cast<Group>(re));
140    else if (llvm::isa<Assertion>(re)) return transformAssertion(llvm::cast<Assertion>(re));
141    else {
142        llvm_unreachable("Unknown RE type");
143        return nullptr;
144    }
145}
146
147RE * RE_Transformer::transformName(Name * nm) {
148    if (mNameTransform == NameTransformationMode::None) return nm;
149    RE * d = nm->getDefinition();
150    if (d) return transform(d);
151    UndefinedNameError(nm);
152    return nullptr;
153}
154
155RE * RE_Transformer::transformCC(CC * cc) {
156    return cc;
157}
158
159RE * RE_Transformer::transformStart(Start * s) {
160    return s;
161}
162
163RE * RE_Transformer::transformEnd(End * e) {
164    return e;
165}
166
167RE * RE_Transformer::transformSeq(Seq * seq) {
168    std::vector<RE *> elems;
169    elems.reserve(seq->size());
170    bool any_changed = false;
171    for (RE * e : *seq) {
172        RE * e1 = transform(e);
173        if (e1 != e) any_changed = true;
174        elems.push_back(e1);
175    }
176    if (!any_changed) return seq;
177    return makeSeq(elems.begin(), elems.end());
178}
179
180RE * RE_Transformer::transformAlt(Alt * alt) {
181    std::vector<RE *> elems;
182    elems.reserve(alt->size());
183    bool any_changed = false;
184    for (RE * e : *alt) {
185        RE * e1 = transform(e);
186        if (e1 != e) any_changed = true;
187        elems.push_back(e1);
188    }
189    if (!any_changed) return alt;
190    return makeAlt(elems.begin(), elems.end());
191}
192
193RE * RE_Transformer::transformRep(Rep * r) {
194    RE * x0 = r->getRE();
195    RE * x = transform(x0);
196    if (x == x0) {
197        return r;
198    } else {
199        return makeRep(x, r->getLB(), r->getUB());
200    }
201}
202
203RE * RE_Transformer::transformIntersect(Intersect * ix) {
204    RE * x0 = ix->getLH();
205    RE * y0 = ix->getRH();
206    RE * x = transform(x0);
207    RE * y = transform(y0);
208    if ((x == x0) && (y == y0)) {
209        return ix;
210    } else {
211        return makeIntersect(x, y);
212    }
213}
214
215RE * RE_Transformer::transformDiff(Diff * d) {
216    RE * x0 = d->getLH();
217    RE * y0 = d->getRH();
218    RE * x = transform(x0);
219    RE * y = transform(y0);
220    if ((x == x0) && (y == y0)) {
221        return d;
222    } else {
223        return makeDiff(x, y);
224    }
225}
226
227RE * RE_Transformer::transformRange(Range * rg) {
228    RE * x0 = rg->getLo();
229    RE * y0 = rg->getHi();
230    RE * x = transform(x0);
231    RE * y = transform(y0);
232    if ((x == x0) && (y == y0)) {
233        return rg;
234    } else {
235        return makeRange(x, y);
236    }
237}
238
239RE * RE_Transformer::transformGroup(Group * g) {
240    RE * x0 = g->getRE();
241    RE * x = transform(x0);
242    if (x == x0) {
243        return g;
244    } else {
245        return makeGroup(g->getMode(), x, g->getSense());
246    }
247}
248
249RE * RE_Transformer::transformAssertion(Assertion * a) {
250    RE * x0 = a->getAsserted();
251    RE * x = transform(x0);
252    if (x == x0) {
253        return a;
254    } else {
255        return makeAssertion(x, a->getKind(), a->getSense());
256    }
257}
258
259}
Note: See TracBrowser for help on using the repository browser.