source: icGREP/icgrep-devel/icgrep/pbix_compiler.cpp @ 4195

Last change on this file since 4195 was 4195, checked in by nmedfort, 5 years ago

Missing stdexcept import; oddly only reported with g++ and not clang++

File size: 13.0 KB
Line 
1/*
2 *  Copyright (c) 2014 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "pbix_compiler.h"
8//Regular Expressions
9#include "re/re_name.h"
10#include "re/re_start.h"
11#include "re/re_end.h"
12#include "re/re_seq.h"
13#include "re/re_alt.h"
14#include "re/re_rep.h"
15
16//Pablo Expressions
17#include "pe_pabloe.h"
18#include "pe_sel.h"
19#include "pe_advance.h"
20#include "pe_all.h"
21#include "pe_and.h"
22#include "pe_charclass.h"
23#include "pe_call.h"
24#include "pe_matchstar.h"
25#include "pe_scanthru.h"
26#include "pe_not.h"
27#include "pe_or.h"
28#include "pe_var.h"
29#include "pe_xor.h"
30
31//Pablo Statements
32#include "ps_pablos.h"
33#include "ps_assign.h"
34#include "ps_if.h"
35#include "ps_while.h"
36
37#include <assert.h>
38#include <stdexcept>
39
40using namespace re;
41
42Pbix_Compiler::Pbix_Compiler(std::map<std::string, std::string> name_map)
43{
44    m_name_map = name_map;
45    symgen = SymbolGenerator();
46}
47
48CodeGenState Pbix_Compiler::compile_subexpressions(const std::map<std::string, RE*>& re_map)
49{
50    CodeGenState cg_state;
51    for (auto i =  re_map.rbegin(); i != re_map.rend(); ++i) {
52        //This is specifically for the utf8 multibyte character classes.
53        if (Seq * seq = dyn_cast<Seq>(i->second)) {
54            if (seq->getType() == Seq::Type::Byte) {
55                std::string gs_retVal = symgen.gensym("start_marker");
56                cg_state.stmtsl.push_back(new Assign(gs_retVal, new All(1)));               
57                for (auto j = seq->begin();; ) {
58                    Name * name = dyn_cast<Name>(*j);
59                    assert (name);
60                    And * cc_mask = new And(new Var(gs_retVal), new CharClass(name->getName()));
61                    if (++j != seq->end()) {
62                        gs_retVal = symgen.gensym("marker");
63                        cg_state.stmtsl.push_back(new Assign(gs_retVal, new Advance(cc_mask)));
64                    }
65                    else {
66                        cg_state.stmtsl.push_back(new Assign(seq->getName(), cc_mask));
67                        break;
68                    }
69                }
70                cg_state.newsym = gs_retVal;
71            }
72        }
73    }
74    return cg_state;
75}
76
77CodeGenState Pbix_Compiler::compile(RE *re)
78{   
79    CodeGenState cg_state;
80
81    std::string gs_m0 = symgen.gensym("start_marker");
82    cg_state.stmtsl.push_back(new Assign(gs_m0, new All(1)));
83
84    if (hasUnicode(re))
85    {
86        cg_state.newsym = gs_m0;
87        //Set the 'internal.initial' bit stream for the utf-8 multi-byte encoding.
88        std::string gs_initial = symgen.gensym("internal.initial");
89        m_name_map.insert(make_pair("internal.initial", gs_initial));
90        PabloE * u8single = new Var(m_name_map.find("UTF8-SingleByte")->second);
91        PabloE * u8pfx2 = new Var(m_name_map.find("UTF8-Prefix2")->second);
92        PabloE * u8pfx3 = new Var(m_name_map.find("UTF8-Prefix3")->second);
93        PabloE * u8pfx4 = new Var(m_name_map.find("UTF8-Prefix4")->second);
94        PabloE * u8pfx = new Or(new Or(u8pfx2, u8pfx3), u8pfx4);
95        cg_state.stmtsl.push_back(new Assign(gs_initial, new Or(u8pfx, u8single)));
96        cg_state.newsym = gs_initial;
97
98        //Set the 'internal.nonfinal' bit stream for the utf-8 multi-byte encoding.
99        cg_state.newsym = gs_m0;
100        std::string gs_nonfinal = symgen.gensym("internal.nonfinal");
101        m_name_map.insert(make_pair("internal.nonfinal", gs_nonfinal));
102        //#define USE_IF_FOR_NONFINAL
103        #ifdef USE_IF_FOR_NONFINAL
104        cg_state.stmtsl.push_back(new Assign(gs_nonfinal, new All(0)));
105        #endif
106        PabloE * u8scope32 = new Advance(u8pfx3);
107        PabloE * u8scope42 = new Advance(u8pfx4);
108        PabloE * u8scope43 = new Advance(u8scope42);
109        PabloS * assign_non_final = new Assign(gs_nonfinal, new Or(new Or(u8pfx, u8scope32), new Or(u8scope42, u8scope43)));
110        #ifdef USE_IF_FOR_NONFINAL
111        std::list<PabloS *> * if_body = new std::list<PabloS *> ();
112        if_body->push_back(assign_non_final);
113        cg_state.stmtsl.push_back(new If(u8pfx, *if_body));
114        #else
115        cg_state.stmtsl.push_back(assign_non_final);
116        #endif
117        cg_state.newsym = gs_nonfinal;
118    }
119
120    cg_state.newsym = gs_m0;
121    cg_state = re2pablo_helper(re, cg_state);
122
123    //These three lines are specifically for grep.
124    std::string gs_retVal = symgen.gensym("marker");
125    cg_state.stmtsl.push_back(new Assign(gs_retVal, new And(new MatchStar(new Var(cg_state.newsym),
126        new Not(new Var(m_name_map.find("LineFeed")->second))), new Var(m_name_map.find("LineFeed")->second))));
127    cg_state.newsym = gs_retVal;
128
129    return cg_state;
130}
131
132CodeGenState Pbix_Compiler::re2pablo_helper(RE *re, CodeGenState cg_state)
133{
134    if (Name* name = dyn_cast<Name>(re))
135    {
136        std::string gs_retVal = symgen.gensym("marker");
137        PabloE* markerExpr = new Var(cg_state.newsym);
138        if (name->getType() != Name::Type::FixedLength) {
139            // Move the markers forward through any nonfinal UTF-8 bytes to the final position of each character.
140            markerExpr = new And(markerExpr, new CharClass(m_name_map.find("internal.initial")->second));
141            markerExpr = new ScanThru(markerExpr, new CharClass(m_name_map.find("internal.nonfinal")->second));
142        }       
143        PabloE* ccExpr;
144        if (name->getType() == Name::Type::UnicodeCategory)
145        {
146            ccExpr = new Call(name->getName());
147        }
148        else 
149        {
150            ccExpr = new CharClass(name->getName());
151        }
152        if (name->isNegated()) {
153            ccExpr = new Not(new Or(new Or(ccExpr, new CharClass(m_name_map.find("LineFeed")->second)),
154                                    new CharClass(m_name_map.find("internal.nonfinal")->second)));
155        }
156        cg_state.stmtsl.push_back(new Assign(gs_retVal, new Advance(new And(ccExpr, markerExpr))));
157        cg_state.newsym = gs_retVal;
158    }
159    else if (isa<Start>(re))
160    {
161        std::string gs_retVal = symgen.gensym("start_of_line_marker");
162        cg_state.stmtsl.push_back(new Assign(gs_retVal, new And(new Var(cg_state.newsym), new Not(new Advance(new Not(new CharClass(m_name_map.find("LineFeed")->second)))))));
163        cg_state.newsym = gs_retVal;
164    }
165    else if (isa<End>(re))
166    {
167        std::string gs_retVal = symgen.gensym("end_of_line_marker");
168        cg_state.stmtsl.push_back(new Assign(gs_retVal, new And(new Var(cg_state.newsym), new CharClass(m_name_map.find("LineFeed")->second))));
169        cg_state.newsym = gs_retVal;
170    }
171    else if (Seq* seq = dyn_cast<Seq>(re))
172    {
173        if (!seq->empty())
174        {
175            cg_state = Seq_helper(seq, seq->begin(), cg_state);
176        }
177    }
178    else if (Alt* alt = dyn_cast<Alt>(re))
179    {
180        if (alt->empty())
181        {
182            std::string gs_retVal = symgen.gensym("always_fail_marker");
183            cg_state.stmtsl.push_back(new Assign(gs_retVal, new All(0)));
184            cg_state.newsym = gs_retVal;
185        }
186        else
187        {
188            if (alt->size() == 1)
189            {
190                cg_state = re2pablo_helper(alt->back(), cg_state);
191            }
192            else
193            {
194                cg_state = Alt_helper(alt, alt->begin(), cg_state);
195            }
196        }
197
198    }
199    else if (Rep* rep = dyn_cast<Rep>(re))
200    {
201        if (isa<Name>(rep->getRE()) && (rep->getLB() == 0) && (rep->getUB()== Rep::UNBOUNDED_REP))
202        {
203            Name* rep_name = dyn_cast<Name>(rep->getRE());
204            std::string gs_retVal = symgen.gensym("marker");
205
206            PabloE* ccExpr;
207            if (rep_name->getType() == Name::Type::UnicodeCategory)
208            {
209                ccExpr = new Call(rep_name->getName());
210            }
211            else 
212            {
213                ccExpr = new CharClass(rep_name->getName());
214            }
215
216            if (rep_name->isNegated()) {
217                ccExpr = new Not(new Or(new Or(ccExpr, new CharClass(m_name_map.find("LineFeed")->second)),
218                                        new CharClass(m_name_map.find("internal.nonfinal")->second)));
219            }
220            if (rep_name->getType() == Name::Type::FixedLength)
221            {
222                cg_state.stmtsl.push_back(new Assign(gs_retVal, new MatchStar(new Var(cg_state.newsym), ccExpr)));
223            }
224            else //Name::Unicode and Name::UnicodeCategory
225            {
226                cg_state.stmtsl.push_back(new Assign(gs_retVal,
227                    new And(new MatchStar(new Var(cg_state.newsym), new Or(new CharClass(m_name_map.find("internal.nonfinal")->second),
228                    ccExpr)), new CharClass(m_name_map.find("internal.initial")->second))));
229            }
230
231            cg_state.newsym = gs_retVal;
232        }
233        else if (rep->getUB() == Rep::UNBOUNDED_REP)
234        {
235            cg_state = UnboundedRep_helper(rep->getRE(), rep->getLB(), cg_state);
236        }
237        else if (rep->getUB() != Rep::UNBOUNDED_REP)
238        {
239            cg_state = BoundedRep_helper(rep->getRE(), rep->getLB(), rep->getUB(), cg_state);
240        }
241    }
242
243    return cg_state;
244}
245
246
247CodeGenState Pbix_Compiler::Seq_helper(Vector *lst, const_iterator it, CodeGenState cg_state)
248{
249    if (it != lst->end())
250    {
251        cg_state = re2pablo_helper(*it, cg_state);
252        cg_state = Seq_helper(lst, ++it, cg_state);
253    }
254
255    return cg_state;
256}
257
258CodeGenState Pbix_Compiler::Alt_helper(Vector* lst, const_iterator it, CodeGenState cg_state)
259{
260    CodeGenState t1_cg_state = re2pablo_helper(*it, cg_state);
261    cg_state.stmtsl = t1_cg_state.stmtsl;
262    ++it;
263    if (it != lst->end())
264    {
265        CodeGenState t2_cg_state = Alt_helper(lst, it, cg_state);
266        cg_state.stmtsl = t2_cg_state.stmtsl;
267        std::string gs_retVal = symgen.gensym("alt_marker");
268        cg_state.stmtsl.push_back(new Assign(gs_retVal, new Or(new Var(t1_cg_state.newsym), new Var(t2_cg_state.newsym))));
269        cg_state.newsym = gs_retVal;
270    }
271    else
272    {
273        cg_state.newsym = t1_cg_state.newsym;
274    }
275
276    return cg_state;
277}
278
279CodeGenState Pbix_Compiler::UnboundedRep_helper(RE* repeated, int lb, CodeGenState cg_state) {
280    if (lb == 0)
281    {
282         std::string while_test_gs_retVal = symgen.gensym("while_test");
283         std::string while_accum_gs_retVal = symgen.gensym("while_accum");
284         CodeGenState while_test_state;
285         while_test_state.newsym = while_test_gs_retVal;
286         CodeGenState t1_cg_state = re2pablo_helper(repeated, while_test_state);
287         cg_state.stmtsl.push_back(new Assign(while_test_gs_retVal, new Var(cg_state.newsym)));
288         cg_state.stmtsl.push_back(new Assign(while_accum_gs_retVal, new Var(cg_state.newsym)));
289         std::list<PabloS*> stmtList;
290         stmtList = t1_cg_state.stmtsl;
291         stmtList.push_back(new Assign(while_test_gs_retVal, new And(new Var(t1_cg_state.newsym), new Not(new Var(while_accum_gs_retVal)))));
292         stmtList.push_back(new Assign(while_accum_gs_retVal, new Or(new Var(while_accum_gs_retVal), new Var(t1_cg_state.newsym))));
293         cg_state.stmtsl.push_back( new While(new Var(while_test_gs_retVal), stmtList));
294         cg_state.newsym = while_accum_gs_retVal;
295    }
296    else //if (lb > 0)
297    {
298         CodeGenState t1_cg_state = re2pablo_helper(repeated, cg_state);
299         cg_state = UnboundedRep_helper(repeated, lb -1, t1_cg_state);
300    }
301    return cg_state;
302}
303
304
305CodeGenState Pbix_Compiler::BoundedRep_helper(RE* repeated, int lb, int ub, CodeGenState cg_state) {
306    if ((lb == 0) && (ub == 0))
307    {
308    //Just fall through...do nothing.
309    }
310    else if ((lb == 0) && (ub > 0))
311    {
312         CodeGenState t1_cg_state = re2pablo_helper(repeated, cg_state);
313         CodeGenState t2_cg_state = BoundedRep_helper(repeated, 0, ub-1, t1_cg_state);
314         std::string gs_retVal = symgen.gensym("alt_marker");
315         cg_state.stmtsl = t2_cg_state.stmtsl;
316         cg_state.stmtsl.push_back(new Assign(gs_retVal, new Or(new Var(cg_state.newsym), new Var(t2_cg_state.newsym))));
317         cg_state.newsym = gs_retVal;
318    }
319    else //if ((lb > 0) && (ub > 0))
320    {
321         CodeGenState t1_cg_state = re2pablo_helper(repeated, cg_state);
322         cg_state = BoundedRep_helper(repeated, lb-1, ub-1, t1_cg_state);
323    }
324    return cg_state;
325}
326
327
328bool Pbix_Compiler::hasUnicode(const RE * re) {
329    bool found = false;
330    if (re == nullptr) {
331        throw std::runtime_error("Unexpected Null Value passed to RE Compiler!");
332    }
333    else if (const Name * name = dyn_cast<const Name>(re)) {
334        if ((name->getType() == Name::Type::UnicodeCategory) || (name->getType() == Name::Type::Unicode)) {
335            found = true;
336        }
337    }
338    else if (const Seq * re_seq = dyn_cast<const Seq>(re)) {
339        for (auto i = re_seq->cbegin(); i != re_seq->cend(); ++i) {
340            if (hasUnicode(*i)) {
341                found = true;
342                break;
343            }
344        }
345    }
346    else if (const Alt * re_alt = dyn_cast<const Alt>(re)) {
347        for (auto i = re_alt->cbegin(); i != re_alt->cend(); ++i) {
348            if (hasUnicode(*i)) {
349                found = true;
350                break;
351            }
352        }
353    }
354    else if (const Rep * rep = dyn_cast<const Rep>(re)) {
355        found = hasUnicode(rep->getRE());
356    }
357    return found;
358}
Note: See TracBrowser for help on using the repository browser.