source: icGREP/icgrep-devel/icgrep/re/re_compiler.cpp @ 4197

Last change on this file since 4197 was 4197, checked in by nmedfort, 5 years ago

More refactoring of the RE system; moved the original re/RE_Compiler to compiler.cpp and the PBIX_Compiler to the re/RE_Compiler.

File size: 11.5 KB
Line 
1/*
2 *  Copyright (c) 2014 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
#include "re_compiler.h"
//Regular Expressions
#include "re_name.h"
#include "re_start.h"
#include "re_end.h"
#include "re_seq.h"
#include "re_alt.h"
#include "re_rep.h"

//Pablo Expressions
#include "../pe_pabloe.h"
#include "../pe_sel.h"
#include "../pe_advance.h"
#include "../pe_all.h"
#include "../pe_and.h"
#include "../pe_charclass.h"
#include "../pe_call.h"
#include "../pe_matchstar.h"
#include "../pe_scanthru.h"
#include "../pe_not.h"
#include "../pe_or.h"
#include "../pe_var.h"
#include "../pe_xor.h"

//Pablo Statements
#include "../ps_pablos.h"
#include "../ps_assign.h"
#include "../ps_if.h"
#include "../ps_while.h"

#include <assert.h>
#include <stdexcept>
#include <utility>
39
40namespace re {
41
42RE_Compiler::RE_Compiler(std::map<std::string, std::string> name_map)
43: m_name_map(name_map)
44, symgen()
45{
46
47}
48
49CodeGenState RE_Compiler::compile_subexpressions(const std::map<std::string, RE*>& re_map)
50{
51    CodeGenState cg_state;
52    for (auto i =  re_map.rbegin(); i != re_map.rend(); ++i) {
53        //This is specifically for the utf8 multibyte character classes.
54        if (Seq * seq = dyn_cast<Seq>(i->second)) {
55            if (seq->getType() == Seq::Type::Byte) {
56                std::string gs_retVal = symgen.get("start_marker");
57                cg_state.stmtsl.push_back(new Assign(gs_retVal, new All(1)));
58                for (auto j = seq->begin();; ) {
59                    Name * name = dyn_cast<Name>(*j);
60                    assert (name);
61                    And * cc_mask = new And(new Var(gs_retVal), new CharClass(name->getName()));
62                    if (++j != seq->end()) {
63                        gs_retVal = symgen.get("marker");
64                        cg_state.stmtsl.push_back(new Assign(gs_retVal, new Advance(cc_mask)));
65                    }
66                    else {
67                        cg_state.stmtsl.push_back(new Assign(seq->getName(), cc_mask));
68                        break;
69                    }
70                }
71                cg_state.newsym = gs_retVal;
72            }
73        }
74    }
75    return cg_state;
76}
77
78CodeGenState RE_Compiler::compile(RE * re)
79{
80    CodeGenState cg_state;
81
82    std::string gs_m0 = symgen.get("start_marker");
83    cg_state.stmtsl.push_back(new Assign(gs_m0, new All(1)));
84
85    if (hasUnicode(re)) {
86        cg_state.newsym = gs_m0;
87        //Set the 'internal.initial' bit stream for the utf-8 multi-byte encoding.
88        std::string gs_initial = symgen.get("internal.initial");
89        m_name_map.insert(make_pair("internal.initial", gs_initial));
90        PabloE * u8single = new Var(m_name_map.find("UTF8-SingleByte")->second);
91        PabloE * u8pfx2 = new Var(m_name_map.find("UTF8-Prefix2")->second);
92        PabloE * u8pfx3 = new Var(m_name_map.find("UTF8-Prefix3")->second);
93        PabloE * u8pfx4 = new Var(m_name_map.find("UTF8-Prefix4")->second);
94        PabloE * u8pfx = new Or(new Or(u8pfx2, u8pfx3), u8pfx4);
95        cg_state.stmtsl.push_back(new Assign(gs_initial, new Or(u8pfx, u8single)));
96        cg_state.newsym = gs_initial;
97
98        //Set the 'internal.nonfinal' bit stream for the utf-8 multi-byte encoding.
99        cg_state.newsym = gs_m0;
100        std::string gs_nonfinal = symgen.get("internal.nonfinal");
101        m_name_map.insert(make_pair("internal.nonfinal", gs_nonfinal));
102        //#define USE_IF_FOR_NONFINAL
103        #ifdef USE_IF_FOR_NONFINAL
104        cg_state.stmtsl.push_back(new Assign(gs_nonfinal, new All(0)));
105        #endif
106        PabloE * u8scope32 = new Advance(u8pfx3);
107        PabloE * u8scope42 = new Advance(u8pfx4);
108        PabloE * u8scope43 = new Advance(u8scope42);
109        PabloS * assign_non_final = new Assign(gs_nonfinal, new Or(new Or(u8pfx, u8scope32), new Or(u8scope42, u8scope43)));
110        #ifdef USE_IF_FOR_NONFINAL
111        std::list<PabloS *> * if_body = new std::list<PabloS *> ();
112        if_body->push_back(assign_non_final);
113        cg_state.stmtsl.push_back(new If(u8pfx, *if_body));
114        #else
115        cg_state.stmtsl.push_back(assign_non_final);
116        #endif
117        cg_state.newsym = gs_nonfinal;
118    }
119
120    cg_state.newsym = gs_m0;
121    compile(re, cg_state);
122
123    //These three lines are specifically for grep.
124    std::string gs_retVal = symgen.get("marker");
125    cg_state.stmtsl.push_back(new Assign(gs_retVal, new And(new MatchStar(new Var(cg_state.newsym),
126        new Not(new Var(m_name_map.find("LineFeed")->second))), new Var(m_name_map.find("LineFeed")->second))));
127    cg_state.newsym = gs_retVal;
128
129    return cg_state;
130}
131
132void RE_Compiler::compile(RE * re, CodeGenState & cg_state) {
133    if (Name * name = dyn_cast<Name>(re)) {
134        compile(name, cg_state);
135    }
136    else if (Seq* seq = dyn_cast<Seq>(re)) {
137        compile(seq, cg_state);
138    }
139    else if (Alt * alt = dyn_cast<Alt>(re)) {
140        compile(alt, cg_state);
141    }
142    else if (Rep * rep = dyn_cast<Rep>(re)) {
143        compile(rep, cg_state);
144    }
145    else if (isa<Start>(re)) {
146        std::string gs_retVal = symgen.get("sol");
147        cg_state.stmtsl.push_back(new Assign(gs_retVal, new And(new Var(cg_state.newsym), new Not(new Advance(new Not(new CharClass(m_name_map.find("LineFeed")->second)))))));
148        cg_state.newsym = gs_retVal;
149    }
150    else if (isa<End>(re)) {
151        std::string gs_retVal = symgen.get("eol");
152        cg_state.stmtsl.push_back(new Assign(gs_retVal, new And(new Var(cg_state.newsym), new CharClass(m_name_map.find("LineFeed")->second))));
153        cg_state.newsym = gs_retVal;
154    }
155}
156
157inline void RE_Compiler::compile(Name * name, CodeGenState & cg_state) {
158    std::string gs_retVal = symgen.get("marker");
159    PabloE * markerExpr = new Var(cg_state.newsym);
160    if (name->getType() != Name::Type::FixedLength) {
161        // Move the markers forward through any nonfinal UTF-8 bytes to the final position of each character.
162        markerExpr = new And(markerExpr, new CharClass(m_name_map.find("internal.initial")->second));
163        markerExpr = new ScanThru(markerExpr, new CharClass(m_name_map.find("internal.nonfinal")->second));
164    }
165    PabloE * ccExpr;
166    if (name->getType() == Name::Type::UnicodeCategory) {
167        ccExpr = new Call(name->getName());
168    }
169    else {
170        ccExpr = new CharClass(name->getName());
171    }
172    if (name->isNegated()) {
173        ccExpr = new Not(new Or(new Or(ccExpr, new CharClass(m_name_map.find("LineFeed")->second)),
174                                new CharClass(m_name_map.find("internal.nonfinal")->second)));
175    }
176    cg_state.stmtsl.push_back(new Assign(gs_retVal, new Advance(new And(ccExpr, markerExpr))));
177    cg_state.newsym = gs_retVal;
178}
179
180inline void RE_Compiler::compile(Seq * seq, CodeGenState & cg_state) {
181    for (RE * re : *seq) {
182        compile(re, cg_state);
183    }
184}
185
186inline void RE_Compiler::compile(Alt * alt, CodeGenState & cg_state) {
187    if (alt->empty()) {
188        std::string gs_retVal = symgen.get("always_fail_marker");
189        cg_state.stmtsl.push_back(new Assign(gs_retVal, new All(0)));
190        cg_state.newsym = gs_retVal;
191    }
192    else {
193        auto i = alt->begin();
194        const std::string startsym = cg_state.newsym;
195        compile(*i, cg_state);
196        while (++i != alt->end()) {
197            std::string oldsym = cg_state.newsym;
198            cg_state.newsym = startsym;
199            compile(*i, cg_state);
200            std::string altsym = symgen.get("alt");
201            cg_state.stmtsl.push_back(new Assign(altsym, new Or(new Var(oldsym), new Var(cg_state.newsym))));
202            cg_state.newsym = altsym;
203        }
204    }
205}
206
207inline void RE_Compiler::compile(Rep * rep, CodeGenState & cg_state) {
208    if (isa<Name>(rep->getRE()) && (rep->getLB() == 0) && (rep->getUB()== Rep::UNBOUNDED_REP)) {
209        Name * rep_name = dyn_cast<Name>(rep->getRE());
210        std::string gs_retVal = symgen.get("marker");
211
212        PabloE* ccExpr;
213        if (rep_name->getType() == Name::Type::UnicodeCategory) {
214            ccExpr = new Call(rep_name->getName());
215        }
216        else {
217            ccExpr = new CharClass(rep_name->getName());
218        }
219
220        if (rep_name->isNegated()) {
221            ccExpr = new Not(new Or(new Or(ccExpr, new CharClass(m_name_map.find("LineFeed")->second)), new CharClass(m_name_map.find("internal.nonfinal")->second)));
222        }
223        if (rep_name->getType() == Name::Type::FixedLength) {
224            cg_state.stmtsl.push_back(new Assign(gs_retVal, new MatchStar(new Var(cg_state.newsym), ccExpr)));
225        }
226        else { // Name::Unicode and Name::UnicodeCategory
227            cg_state.stmtsl.push_back(new Assign(gs_retVal,
228                new And(new MatchStar(new Var(cg_state.newsym),
229                        new Or(new CharClass(m_name_map.find("internal.nonfinal")->second), ccExpr)),
230                               new CharClass(m_name_map.find("internal.initial")->second))));
231        }
232        cg_state.newsym = gs_retVal;
233    }
234    else if (rep->getUB() == Rep::UNBOUNDED_REP) {
235        compileUnboundedRep(rep->getRE(), rep->getLB(), cg_state);
236    }
237    else { // if (rep->getUB() != Rep::UNBOUNDED_REP)
238        compileBoundedRep(rep->getRE(), rep->getLB(), rep->getUB(), cg_state);
239    }
240}
241
242inline void RE_Compiler::compileUnboundedRep(RE * repeated, int lb, CodeGenState & cg_state) {
243    for (; lb; --lb) {
244        compile(repeated, cg_state);
245    }
246    std::string while_test = symgen.get("while_test");
247    std::string while_accum = symgen.get("while_accum");
248    CodeGenState while_test_state;
249    while_test_state.newsym = while_test;
250    compile(repeated, while_test_state);
251    cg_state.stmtsl.push_back(new Assign(while_test, new Var(cg_state.newsym)));
252    cg_state.stmtsl.push_back(new Assign(while_accum, new Var(cg_state.newsym)));
253    while_test_state.stmtsl.push_back(new Assign(while_test, new And(new Var(while_test_state.newsym), new Not(new Var(while_accum)))));
254    while_test_state.stmtsl.push_back(new Assign(while_accum, new Or(new Var(while_accum), new Var(while_test_state.newsym))));
255    cg_state.stmtsl.push_back(new While(new Var(while_test), while_test_state.stmtsl));
256    cg_state.newsym = while_accum;
257}
258
259inline void RE_Compiler::compileBoundedRep(RE * repeated, int lb, int ub, CodeGenState & cg_state) {
260    ub -= lb;
261    for (; lb; --lb) {
262        compile(repeated, cg_state);
263    }
264    if (ub > 0) {
265         std::string oldsym = cg_state.newsym;
266         compile(repeated, cg_state);
267         compileBoundedRep(repeated, 0, ub - 1, cg_state);
268         std::string altsym = symgen.get("alt");
269         cg_state.stmtsl.push_back(new Assign(altsym, new Or(new Var(oldsym), new Var(cg_state.newsym))));
270         cg_state.newsym = altsym;
271    }
272}
273
274
275bool RE_Compiler::hasUnicode(const RE * re) {
276    bool found = false;
277    if (re == nullptr) {
278        throw std::runtime_error("Unexpected Null Value passed to RE Compiler!");
279    }
280    else if (const Name * name = dyn_cast<const Name>(re)) {
281        if ((name->getType() == Name::Type::UnicodeCategory) || (name->getType() == Name::Type::Unicode)) {
282            found = true;
283        }
284    }
285    else if (const Seq * re_seq = dyn_cast<const Seq>(re)) {
286        for (auto i = re_seq->cbegin(); i != re_seq->cend(); ++i) {
287            if (hasUnicode(*i)) {
288                found = true;
289                break;
290            }
291        }
292    }
293    else if (const Alt * re_alt = dyn_cast<const Alt>(re)) {
294        for (auto i = re_alt->cbegin(); i != re_alt->cend(); ++i) {
295            if (hasUnicode(*i)) {
296                found = true;
297                break;
298            }
299        }
300    }
301    else if (const Rep * rep = dyn_cast<const Rep>(re)) {
302        found = hasUnicode(rep->getRE());
303    }
304    return found;
305}
306
307} // end of namespace re
Note: See TracBrowser for help on using the repository browser.