source: icGREP/icgrep-devel/icgrep/re/re_compiler.cpp @ 4214

Last change on this file since 4214 was 4214, checked in by nmedfort, 5 years ago

Partial implementation of 'PabloBuilder?'.

File size: 9.7 KB
Line 
1/*
2 *  Copyright (c) 2014 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "re_compiler.h"
8//Regular Expressions
9#include <re/re_name.h>
10#include <re/re_start.h>
11#include <re/re_end.h>
12#include <re/re_alt.h>
13#include <re/re_cc.h>
14#include <re/re_seq.h>
15#include <re/re_rep.h>
16
17
18//Pablo Expressions
19#include <pablo/codegenstate.h>
20#include <pablo/pe_advance.h>
21#include <pablo/pe_all.h>
22#include <pablo/pe_and.h>
23#include <pablo/pe_call.h>
24#include <pablo/pe_charclass.h>
25#include <pablo/pe_matchstar.h>
26#include <pablo/pe_not.h>
27#include <pablo/pe_or.h>
28#include <pablo/pe_pabloe.h>
29#include <pablo/pe_scanthru.h>
30#include <pablo/pe_sel.h>
31#include <pablo/pe_var.h>
32#include <pablo/pe_xor.h>
33#include <pablo/ps_assign.h>
34#include <pablo/ps_if.h>
35#include <pablo/ps_while.h>
36
37#include <assert.h>
38#include <stdexcept>
39
40using namespace pablo;
41
42namespace re {
43
44RE_Compiler::RE_Compiler(CodeGenState & baseCG, std::map<std::string, std::string> name_map)
45: mCG(baseCG)
46, m_name_map(name_map)
47{
48
49}
50
51void RE_Compiler::compile(RE * re, CodeGenState & cg) {
52
53    if (hasUnicode(re)) {
54        //Set the 'internal.initial' bit stream for the utf-8 multi-byte encoding.
55        std::string gs_initial = cg.ssa("initial");
56        m_name_map.insert(make_pair("initial", gs_initial));
57        PabloE * u8single = cg.createVar(m_name_map.find("UTF8-SingleByte")->second);
58        PabloE * u8pfx2 = cg.createVar(m_name_map.find("UTF8-Prefix2")->second);
59        PabloE * u8pfx3 = cg.createVar(m_name_map.find("UTF8-Prefix3")->second);
60        PabloE * u8pfx4 = cg.createVar(m_name_map.find("UTF8-Prefix4")->second);
61        PabloE * u8pfx = cg.createOr(cg.createOr(u8pfx2, u8pfx3), u8pfx4);
62        cg.push_back(cg.createAssign(gs_initial, cg.createOr(u8pfx, u8single)));
63
64        //Set the 'internal.nonfinal' bit stream for the utf-8 multi-byte encoding.
65        std::string gs_nonfinal = cg.ssa("nonfinal");
66        m_name_map.insert(make_pair("nonfinal", gs_nonfinal));
67        //#define USE_IF_FOR_NONFINAL
68        #ifdef USE_IF_FOR_NONFINAL
69        cg.push_back(make_assign(gs_nonfinal, make_all(0)));
70        #endif
71        PabloE * u8scope32 = cg.createAdvance(u8pfx3);
72        PabloE * u8scope42 = cg.createAdvance(u8pfx4);
73        PabloE * u8scope43 = cg.createAdvance(u8scope42);
74        PabloE * assign_non_final = cg.createAssign(gs_nonfinal, cg.createOr(cg.createOr(u8pfx, u8scope32), cg.createOr(u8scope42, u8scope43)));
75        #ifdef USE_IF_FOR_NONFINAL
76        std::list<PabloE *> * if_body = new std::list<PabloE *> ();
77        if_body->push_back(assign_non_final);
78        mCG.push_back(makeIf(u8pfx, *if_body));
79        #else
80        cg.push_back(assign_non_final);
81        #endif
82    }
83
84    PabloE * start_marker = cg.createAll(1);
85    cg.push_back(start_marker);
86    PabloE * result = process(re, start_marker, cg);
87
88    //These three lines are specifically for grep.
89    Assign * final = cg.createAssign(cg.ssa("marker"), cg.createAnd(cg.createMatchStar(cg.createVarIfAssign(result),
90        cg.createNot(cg.createVar(m_name_map.find("LineFeed")->second))), cg.createVar(m_name_map.find("LineFeed")->second)));
91    cg.push_back(final);
92}
93
94
95PabloE * RE_Compiler::process(RE * re, PabloE * target, CodeGenState & cg) {
96    if (Name * name = dyn_cast<Name>(re)) {
97        target = process(name, target, cg);
98    }
99    else if (Seq* seq = dyn_cast<Seq>(re)) {
100        target = process(seq, target, cg);
101    }
102    else if (Alt * alt = dyn_cast<Alt>(re)) {
103        target = process(alt, target, cg);
104    }
105    else if (Rep * rep = dyn_cast<Rep>(re)) {
106        target = process(rep, target, cg);
107    }
108    else if (isa<Start>(re)) {
109        target = cg.createAnd(cg.createVarIfAssign(target), cg.createNot(cg.createAdvance(cg.createNot(cg.createCharClass(m_name_map.find("LineFeed")->second)))));
110        cg.push_back(target);
111    }
112    else if (isa<End>(re)) {
113        target = cg.createAnd(cg.createVarIfAssign(target), cg.createCharClass(m_name_map.find("LineFeed")->second));
114        cg.push_back(target);
115    }
116
117    return target;
118}
119
120inline PabloE * RE_Compiler::process(Name * name, PabloE * target, CodeGenState & cg) {
121    PabloE * markerExpr = cg.createVarIfAssign(target);
122    if (name->getType() != Name::Type::FixedLength) {
123        // Move the markers forward through any nonfinal UTF-8 bytes to the final position of each character.
124        markerExpr = cg.createAnd(markerExpr, cg.createCharClass(m_name_map.find("initial")->second));
125        markerExpr = cg.createScanThru(markerExpr, cg.createCharClass(m_name_map.find("nonfinal")->second));
126    }
127    PabloE * cc = nullptr;
128    if (name->getType() == Name::Type::UnicodeCategory) {
129        cc = cg.createCall(name->getName());
130    }
131    else {
132        cc = cg.createCharClass(name->getName());
133    }
134    if (name->isNegated()) {
135        cc = cg.createNot(cg.createOr(cg.createOr(cc, cg.createCharClass(m_name_map.find("LineFeed")->second)),
136                                cg.createCharClass(m_name_map.find("nonfinal")->second)));
137    }
138    target = cg.createAssign(cg.ssa("marker"), cg.createAdvance(cg.createAnd(cc, markerExpr)));
139    cg.push_back(target);
140    return target;
141}
142
143inline PabloE * RE_Compiler::process(Seq * seq, PabloE * target, CodeGenState & cg) {
144    for (RE * re : *seq) {
145        target = process(re, target, cg);
146    }
147    cg.push_back(target);
148    return target;
149}
150
151inline PabloE * RE_Compiler::process(Alt * alt, PabloE * target, CodeGenState & cg) {
152    if (alt->empty()) {
153        target = cg.createAll(0); // always fail (note: I'm not sure this ever occurs. How do I create a 0-element alternation?)
154    }
155    else {
156        auto i = alt->begin();
157        PabloE * const base = target;
158        target = cg.createVarIfAssign(process(*i, target, cg));
159        while (++i != alt->end()) {
160            PabloE * other = cg.createVarIfAssign(process(*i, base, cg));
161            target = cg.createOr(target, other);
162        }
163        cg.push_back(target);
164    }   
165    return target;
166}
167
168inline PabloE * RE_Compiler::process(Rep * rep, PabloE * target, CodeGenState & cg) {
169    if (rep->getUB() == Rep::UNBOUNDED_REP) {
170        target = processUnboundedRep(rep->getRE(), rep->getLB(), target, cg);
171    }
172    else { // if (rep->getUB() != Rep::UNBOUNDED_REP)
173        target = processBoundedRep(rep->getRE(), rep->getLB(), rep->getUB(), target, cg);
174    }   
175    return target;
176}
177
178inline PabloE * RE_Compiler::processUnboundedRep(RE * repeated, int lb, PabloE * target, CodeGenState & cg) {
179    while (lb-- != 0) {
180        target = process(repeated, target, cg);
181    }
182
183    target = cg.createVarIfAssign(target);
184
185    if (isa<Name>(repeated)) {
186        Name * rep_name = dyn_cast<Name>(repeated);
187
188        PabloE * ccExpr;
189        if (rep_name->getType() == Name::Type::UnicodeCategory) {
190            ccExpr = cg.createCall(rep_name->getName());
191        }
192        else {
193            ccExpr = cg.createCharClass(rep_name->getName());
194        }
195
196        if (rep_name->isNegated()) {
197            ccExpr = cg.createNot(cg.createOr(cg.createOr(ccExpr, cg.createCharClass(m_name_map.find("LineFeed")->second)), cg.createCharClass(m_name_map.find("nonfinal")->second)));
198        }
199
200        if (rep_name->getType() == Name::Type::FixedLength) {
201            target = cg.createMatchStar(target, ccExpr);
202        }
203        else { // Name::Unicode and Name::UnicodeCategory
204            target = cg.createAnd(cg.createMatchStar(target, cg.createOr(cg.createCharClass(m_name_map.find("nonfinal")->second), ccExpr)), cg.createCharClass(m_name_map.find("initial")->second));
205        }
206    }
207    else {
208
209        Assign * while_test = cg.createAssign(cg.ssa("while_test"), target);
210        Assign * while_accum = cg.createAssign(cg.ssa("while_accum"), target);
211
212        CodeGenState wt(cg);
213
214        PabloE * accum = cg.createVarIfAssign(process(repeated, while_test, wt));
215
216        cg.push_back(while_test);
217        cg.push_back(while_accum);
218
219        Var * var_while_test = cg.createVar(while_accum);
220
221        wt.push_back(cg.createAssign(while_test->getName(), cg.createAnd(accum, cg.createNot(var_while_test))));
222
223        target = cg.createAssign(while_accum->getName(), cg.createOr(var_while_test, accum));
224
225        wt.push_back(target);
226        cg.push_back(makeWhile(cg.createVar(while_test), wt.expressions()));
227    }   
228    return target;
229}
230
231inline PabloE * RE_Compiler::processBoundedRep(RE * repeated, int lb, int ub, PabloE * target, CodeGenState & cg) {
232    ub -= lb;
233    while(lb-- != 0) {
234        target = process(repeated, target, cg);
235    }
236    while (ub-- != 0) {
237        PabloE * alt = process(repeated, target, cg);
238        target = cg.createOr(cg.createVarIfAssign(target), cg.createVarIfAssign(alt));
239    }
240    cg.push_back(target);
241    return target;
242}
243
244
245bool RE_Compiler::hasUnicode(const RE * re) {
246    bool found = false;
247    if (re == nullptr) {
248        throw std::runtime_error("Unexpected Null Value passed to RE Compiler!");
249    }
250    else if (const Name * name = dyn_cast<const Name>(re)) {
251        if ((name->getType() == Name::Type::UnicodeCategory) || (name->getType() == Name::Type::Unicode)) {
252            found = true;
253        }
254    }
255    else if (const Seq * re_seq = dyn_cast<const Seq>(re)) {
256        for (auto i = re_seq->cbegin(); i != re_seq->cend(); ++i) {
257            if (hasUnicode(*i)) {
258                found = true;
259                break;
260            }
261        }
262    }
263    else if (const Alt * re_alt = dyn_cast<const Alt>(re)) {
264        for (auto i = re_alt->cbegin(); i != re_alt->cend(); ++i) {
265            if (hasUnicode(*i)) {
266                found = true;
267                break;
268            }
269        }
270    }
271    else if (const Rep * rep = dyn_cast<const Rep>(re)) {
272        found = hasUnicode(rep->getRE());
273    }
274    return found;
275}
276
277} // end of namespace re
Note: See TracBrowser for help on using the repository browser.