source: icGREP/icgrep-devel/icgrep/re/re_toolchain.cpp @ 5792

Last change on this file since 5792 was 5792, checked in by cameron, 13 months ago

\N{...} expressions now anchored; name expressions in ranges functional

File size: 8.0 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include <toolchain/toolchain.h>
8#include <grep_interface.h>
9#include <re/re_toolchain.h>
10#include <cc/cc_compiler.h>            // for CC_Compiler
11#include <llvm/Support/CommandLine.h>  // for clEnumVal, clEnumValEnd, Optio...
12#include <re/re_compiler.h>            // for RE_Compiler
13#include <re/re_nullable.h>            // for RE_Nullable
14#include <re/re_star_normal.h>         // for RE_Star_Normal
15#include <re/re_simplifier.h>          // for RE_Simplifier
16#include <re/re_minimizer.h>
17#include <re/re_local.h>
18#include <re/printer_re.h>
19#include <re/re_analysis.h>
20#include <re/re_cc.h>
21#include <re/casing.h>
22#include <re/exclude_CC.h>
23#include <re/re_name_resolve.h>
24#include <re/re_collect_unicodesets.h>
25#include <re/re_multiplex.h>
26#include <re/grapheme_clusters.h>
27#include <cc/multiplex_CCs.h>
28#include <llvm/Support/raw_ostream.h>
29
30using namespace pablo;
31using namespace llvm;
32
33namespace re {
34
35static cl::OptionCategory RegexOptions("Regex Toolchain Options",
36                                              "These options control the regular expression transformation and compilation.");
37const cl::OptionCategory * re_toolchain_flags() {
38    return &RegexOptions;
39}
40
41static cl::bits<RE_PrintFlags> 
42    PrintOptions(cl::values(clEnumVal(ShowREs, "Print parsed or generated regular expressions"),
43                            clEnumVal(ShowAllREs, "Print all regular expression passes"),
44                            clEnumVal(ShowStrippedREs, "Print REs with nullable prefixes/suffixes removed"),
45                            clEnumVal(ShowSimplifiedREs, "Print final simplified REs")
46                            CL_ENUM_VAL_SENTINEL), cl::cat(RegexOptions));
47
48static cl::bits<RE_AlgorithmFlags>
49    AlgorithmOptions(cl::values(clEnumVal(DisableLog2BoundedRepetition, "disable log2 optimizations for bounded repetition of bytes"),
50                              clEnumVal(DisableIfHierarchy, "disable nested if hierarchy for generated Unicode classes (not recommended)"), 
51                              clEnumVal(DisableMatchStar, "disable MatchStar optimization"), 
52                              clEnumVal(DisableUnicodeMatchStar, "disable Unicode MatchStar optimization"),
53                              clEnumVal(DisableUnicodeLineBreak, "disable Unicode line breaks - use LF only")
54                              CL_ENUM_VAL_SENTINEL), cl::cat(RegexOptions));
55
56bool AlgorithmOptionIsSet(RE_AlgorithmFlags flag) {
57    return AlgorithmOptions.isSet(flag);
58}
59
60int IfInsertionGap;
61static cl::opt<int, true> 
62    IfInsertionGapOption("if-insertion-gap",  cl::location(IfInsertionGap), cl::init(3),
63                         cl::desc("minimum number of nonempty elements between inserted if short-circuit tests"), 
64                         cl::cat(RegexOptions));
65
66
67std::pair<RE *, std::vector<re::CC *>> multiplexing_passes(RE * r) {
68    std::vector<re::CC *> charclasses;
69    if (PrintOptions.isSet(ShowAllREs) || PrintOptions.isSet(ShowREs)) {
70        errs() << "Parser:\n" << Printer_RE::PrintRE(r) << '\n';
71    }
72    //Optimization passes to simplify the AST.
73    r = RE_Nullable::removeNullablePrefix(r);
74    if (PrintOptions.isSet(ShowAllREs) || PrintOptions.isSet(ShowStrippedREs)) {
75        errs() << "RemoveNullablePrefix:\n" << Printer_RE::PrintRE(r) << '\n';
76    }
77    r = RE_Nullable::removeNullableSuffix(r);
78    if (PrintOptions.isSet(ShowAllREs) || PrintOptions.isSet(ShowStrippedREs)) {
79        errs() << "RemoveNullableSuffix:\n" << Printer_RE::PrintRE(r) << '\n';
80    }
81    r = RE_Nullable::removeNullableAssertion(r);
82    if (PrintOptions.isSet(ShowAllREs) || PrintOptions.isSet(ShowStrippedREs)) {
83        errs() << "RemoveNullableAssertion:\n" << Printer_RE::PrintRE(r) << '\n';
84    }
85    r = RE_Star_Normal::star_normal(r);
86
87    r = resolveGraphemeMode(r, false /* not in grapheme mode at top level*/);
88    if (PrintOptions.isSet(ShowAllREs)) {
89        errs() << "resolveGraphemeMode:\n" << Printer_RE::PrintRE(r) << '\n';
90    }
91    r = re::resolveUnicodeProperties(r);
92    if (PrintOptions.isSet(ShowAllREs) || PrintOptions.isSet(ShowStrippedREs)) {
93        errs() << "resolveUnicodeProperties:\n" << Printer_RE::PrintRE(r) << '\n';
94    }
95
96    r = RE_Simplifier::simplify(r);
97   
98    if (PrintOptions.isSet(ShowAllREs) || PrintOptions.isSet(ShowSimplifiedREs)) {
99        //Print to the terminal the AST that was generated by the simplifier.
100        errs() << "Simplifier:\n" << Printer_RE::PrintRE(r) << '\n';
101    }
102    if (PrintOptions.isSet(ShowAllREs) || PrintOptions.isSet(ShowSimplifiedREs)) {
103        //Print to the terminal the AST that was transformed to the star normal form.
104        errs() << "Star_Normal_Form:\n" << Printer_RE::PrintRE(r) << '\n';
105    }
106    r = resolveCaseInsensitiveMode(r, grep::IgnoreCaseFlag);
107    if (PrintOptions.isSet(ShowAllREs)) {
108        errs() << "resolveCaseInsensitiveMode:\n" << Printer_RE::PrintRE(r) << '\n';
109    }
110    r = re::resolveNames(r);
111    if (PrintOptions.isSet(ShowAllREs)) {
112        errs() << "resolveNames:\n" << Printer_RE::PrintRE(r) << '\n';
113    }
114    r = exclude_CC(r, re::makeCC(re::makeCC(0x0A, 0x0D), re::makeCC(re::makeCC(0x85), re::makeCC(0x2028, 0x2029))));
115    if (PrintOptions.isSet(ShowAllREs)) {
116        errs() << "exclude_CC:\n" << Printer_RE::PrintRE(r) << '\n';
117    }
118    const auto UnicodeSets = re::collectUnicodeSets(r);
119    std::vector<std::vector<unsigned>> exclusiveSetIDs;
120    doMultiplexCCs(UnicodeSets, exclusiveSetIDs, charclasses);
121    r = multiplex(r, UnicodeSets, exclusiveSetIDs);
122    if (PrintOptions.isSet(ShowAllREs)) {
123        errs() << "multiplex:\n" << Printer_RE::PrintRE(r) << '\n';
124    }
125    return std::pair<RE *, std::vector<re::CC *>>(r, charclasses);
126}
127
128RE * regular_expression_passes(RE * r)  {
129
130    if (PrintOptions.isSet(ShowAllREs) || PrintOptions.isSet(ShowREs)) {
131        errs() << "Parser:\n" << Printer_RE::PrintRE(r) << '\n';
132    }
133
134    //Optimization passes to simplify the AST.
135    r = RE_Nullable::removeNullablePrefix(r);
136    if (PrintOptions.isSet(ShowAllREs) || PrintOptions.isSet(ShowStrippedREs)) {
137        errs() << "RemoveNullablePrefix:\n" << Printer_RE::PrintRE(r) << '\n';
138    }
139    r = RE_Nullable::removeNullableSuffix(r);
140    if (PrintOptions.isSet(ShowAllREs) || PrintOptions.isSet(ShowStrippedREs)) {
141        errs() << "RemoveNullableSuffix:\n" << Printer_RE::PrintRE(r) << '\n';
142    }
143    r = RE_Nullable::removeNullableAssertion(r);
144    if (PrintOptions.isSet(ShowAllREs) || PrintOptions.isSet(ShowStrippedREs)) {
145        errs() << "RemoveNullableAssertion:\n" << Printer_RE::PrintRE(r) << '\n';
146    }
147    //r = RE_Nullable::removeNullableAfterAssertion(r);
148    //if (PrintOptions.isSet(ShowAllREs) || PrintOptions.isSet(ShowStrippedREs)) {
149    //    errs() << "RemoveNullableAfterAssertion\n" << Printer_RE::PrintRE(r) << '\n';
150    //}
151
152    r = RE_Simplifier::simplify(r);
153
154    if (PrintOptions.isSet(ShowAllREs) || PrintOptions.isSet(ShowSimplifiedREs)) {
155        //Print to the terminal the AST that was generated by the simplifier.
156        errs() << "Simplifier:\n" << Printer_RE::PrintRE(r) << '\n';
157    }
158   
159//    r = RE_Minimizer::minimize(r);
160
161//    if (PrintOptions.isSet(ShowAllREs) || PrintOptions.isSet(ShowSimplifiedREs)) {
162//        //Print to the terminal the AST that was generated by the simplifier.
163//        errs() << "Minimizer:\n" << Printer_RE::PrintRE(r) << '\n';
164//    }
165
166    r = RE_Star_Normal::star_normal(r);
167
168    if (PrintOptions.isSet(ShowAllREs) || PrintOptions.isSet(ShowSimplifiedREs)) {
169        //Print to the terminal the AST that was transformed to the star normal form.
170        errs() << "Star_Normal_Form:\n" << Printer_RE::PrintRE(r) << '\n';
171    }
172
173    return r;
174}
175   
176PabloAST * re2pablo_compiler(PabloKernel * kernel, RE * re_ast) {
177    Var * const basis = kernel->getInputStreamVar("basis");
178    cc::CC_Compiler cc_compiler(kernel, basis);
179    // compile Unicode names
180    RE_Compiler re_compiler(kernel, cc_compiler);
181    return re_compiler.compile(re_ast);
182}
183
184}
Note: See TracBrowser for help on using the repository browser.