source: icGREP/icgrep-devel/icgrep/pbix_compiler.cpp @ 4133

Last change on this file since 4133 was 4129, checked in by cameron, 5 years ago

Revert CR introduction; eliminate unused code for 'predefined' syms

File size: 13.5 KB
Line 
1/*
2 *  Copyright (c) 2014 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "pbix_compiler.h"
8#include "printer_pablos.h"
9
10Pbix_Compiler::Pbix_Compiler(std::map<std::string, std::string> name_map)
11{
12    m_name_map = name_map;
13    symgen = SymbolGenerator();
14}
15
16CodeGenState Pbix_Compiler::compile_subexpressions(const std::map<std::string, RE*>& re_map)
17{
18    CodeGenState cg_state;
19
20    for (auto it =  re_map.rbegin(); it != re_map.rend(); ++it)
21    {
22        //This is specifically for the utf8 multibyte character classes.
23        if (Seq* seq = dynamic_cast<Seq*>(it->second))
24        {
25            if (seq->getType() == Seq::Byte)
26            {
27                std::string gs_retVal = symgen.gensym("start_marker");
28                cg_state.stmtsl.push_back(new Assign(gs_retVal, new All(1)));
29                cg_state.newsym = gs_retVal;
30
31                std::list<RE*>::iterator endit;
32                endit = seq->GetREList()->end();
33                --endit;
34                std::list<RE*>::iterator it;
35                for (it = seq->GetREList()->begin(); it != seq->GetREList()->end(); ++it)
36                {
37                    Name* name = dynamic_cast<Name*>(*it);
38                    if (it != endit)
39                    {
40                        gs_retVal = symgen.gensym("marker");
41                        cg_state.stmtsl.push_back(new Assign(gs_retVal, new Advance(new And(new Var(cg_state.newsym), new CharClass(name->getName())))));
42                        cg_state.newsym = gs_retVal;
43                    }
44                    else
45                    {
46                        cg_state.stmtsl.push_back(new Assign(seq->getName(), new And(new Var(cg_state.newsym), new CharClass(name->getName()))));
47                    }
48                }
49            }
50        }
51    }
52
53    return cg_state;
54}
55
56CodeGenState Pbix_Compiler::compile(RE *re)
57{   
58    CodeGenState cg_state;
59
60    std::string gs_m0 = symgen.gensym("start_marker");
61    cg_state.stmtsl.push_back(new Assign(gs_m0, new All(1)));
62
63    if (unicode_re(re))
64    {
65        cg_state.newsym = gs_m0;
66        //Set the 'internal.initial' bit stream for the utf-8 multi-byte encoding.
67        std::string gs_initial = symgen.gensym("internal.initial");
68        m_name_map.insert(make_pair("internal.initial", gs_initial));
69        PabloE * u8single = new Var(m_name_map.find("UTF8-SingleByte")->second);
70        PabloE * u8pfx2 = new Var(m_name_map.find("UTF8-Prefix2")->second);
71        PabloE * u8pfx3 = new Var(m_name_map.find("UTF8-Prefix3")->second);
72        PabloE * u8pfx4 = new Var(m_name_map.find("UTF8-Prefix4")->second);
73        PabloE * u8pfx = new Or(new Or(u8pfx2, u8pfx3), u8pfx4);
74        cg_state.stmtsl.push_back(new Assign(gs_initial, new Or(u8pfx, u8single)));
75        cg_state.newsym = gs_initial;
76
77        //Set the 'internal.nonfinal' bit stream for the utf-8 multi-byte encoding.
78        cg_state.newsym = gs_m0;
79        std::string gs_nonfinal = symgen.gensym("internal.nonfinal");
80        m_name_map.insert(make_pair("internal.nonfinal", gs_nonfinal));
81//#define USE_IF_FOR_NONFINAL
82#ifdef USE_IF_FOR_NONFINAL
83        cg_state.stmtsl.push_back(new Assign(gs_nonfinal, new All(0)));
84#endif
85        PabloE * u8scope32 = new Advance(u8pfx3);
86        PabloE * u8scope42 = new Advance(u8pfx4);
87        PabloE * u8scope43 = new Advance(u8scope42);
88        PabloS * assign_non_final = new Assign(gs_nonfinal, new Or(new Or(u8pfx, u8scope32), new Or(u8scope42, u8scope43)));
89#ifdef USE_IF_FOR_NONFINAL
90        std::list<PabloS *> * if_body = new std::list<PabloS *> ();
91        if_body->push_back(assign_non_final);
92        cg_state.stmtsl.push_back(new If(u8pfx, *if_body));
93#endif
94#ifndef USE_IF_FOR_NONFINAL
95        cg_state.stmtsl.push_back(assign_non_final);
96#endif
97        cg_state.newsym = gs_nonfinal;
98    }
99
100    cg_state.newsym = gs_m0;
101    cg_state = re2pablo_helper(re, cg_state);
102
103    //These three lines are specifically for grep.
104    std::string gs_retVal = symgen.gensym("marker");
105    cg_state.stmtsl.push_back(new Assign(gs_retVal, new And(new MatchStar(new Var(cg_state.newsym),
106        new Not(new Var(m_name_map.find("LineFeed")->second))), new Var(m_name_map.find("LineFeed")->second))));
107    cg_state.newsym = gs_retVal;
108
109    return cg_state;
110}
111
112CodeGenState Pbix_Compiler::re2pablo_helper(RE *re, CodeGenState cg_state)
113{
114    if (Name* name = dynamic_cast<Name*>(re))
115    {
116        std::string gs_retVal = symgen.gensym("marker");
117        PabloE* markerExpr = new Var(cg_state.newsym);
118        if (name->getType() != Name::FixedLength) {
119            // Move the markers forward through any nonfinal UTF-8 bytes to the final position of each character.
120            markerExpr = new And(markerExpr, new CharClass(m_name_map.find("internal.initial")->second));
121            markerExpr = new ScanThru(markerExpr, new CharClass(m_name_map.find("internal.nonfinal")->second));
122        }       
123        PabloE* ccExpr;
124        if (name->getType() == Name::UnicodeCategory)
125        {
126            ccExpr = new Call(name->getName());
127        }
128        else 
129        {
130            ccExpr = new CharClass(name->getName());
131        }
132        if (name->isNegated()) {
133            ccExpr = new Not(new Or(new Or(ccExpr, new CharClass(m_name_map.find("LineFeed")->second)),
134                                    new CharClass(m_name_map.find("internal.nonfinal")->second)));
135        }
136        cg_state.stmtsl.push_back(new Assign(gs_retVal, new Advance(new And(ccExpr, markerExpr))));
137        cg_state.newsym = gs_retVal;
138
139        //std::cout << "\n" << "(" << StatementPrinter::PrintStmts(cg_state) << ")" << "\n" << std::endl;
140    }
141    else if (Start* start = dynamic_cast<Start*>(re))
142    {
143        std::string gs_retVal = symgen.gensym("start_of_line_marker");
144        cg_state.stmtsl.push_back(new Assign(gs_retVal, new And(new Var(cg_state.newsym), new Not(new Advance(new Not(new CharClass(m_name_map.find("LineFeed")->second)))))));
145        cg_state.newsym = gs_retVal;
146    }
147    else if (End* end = dynamic_cast<End*>(re))
148    {
149        std::string gs_retVal = symgen.gensym("end_of_line_marker");
150        cg_state.stmtsl.push_back(new Assign(gs_retVal, new And(new Var(cg_state.newsym), new CharClass(m_name_map.find("LineFeed")->second))));
151        cg_state.newsym = gs_retVal;
152    }
153    else if (Seq* seq = dynamic_cast<Seq*>(re))
154    {
155        std::list<RE*>::iterator it = seq->GetREList()->begin();
156        if (it != seq->GetREList()->end())
157        {
158            cg_state = Seq_helper(seq->GetREList(), it, cg_state);
159        }
160    //cout << "\n" << "Seq => (" << StatementPrinter::PrintStmts(cg_state) << ")" << "\n" << endl;
161    }
162    else if (Alt* alt = dynamic_cast<Alt*>(re))
163    {
164        if (alt->GetREList() == 0)
165        {
166
167            std::string gs_retVal = symgen.gensym("always_fail_marker");
168            cg_state.stmtsl.push_back(new Assign(gs_retVal, new All(0)));
169            cg_state.newsym = gs_retVal;
170        }
171        else
172        {
173            if (alt->GetREList()->size() == 1)
174            {
175                cg_state = re2pablo_helper(alt->GetREList()->front(), cg_state);
176            }
177            else
178            {
179                std::list<RE*>::iterator it = alt->GetREList()->begin();
180                cg_state = Alt_helper(alt->GetREList(), it, cg_state);
181            }
182        }
183    //cout << "\n" << "Alt => (" << StatementPrinter::PrintStmts(cg_state) << ")" << "\n" << endl;
184    }
185    else if (Rep* rep = dynamic_cast<Rep*>(re))
186    {
187        if ((dynamic_cast<Name*>(rep->getRE()) != 0) && (rep->getLB() == 0) && (rep->getUB()== unboundedRep))
188        {
189            Name* rep_name = dynamic_cast<Name*>(rep->getRE());
190            std::string gs_retVal = symgen.gensym("marker");
191
192            PabloE* ccExpr;
193            if (rep_name->getType() == Name::UnicodeCategory)
194            {
195                ccExpr = new Call(rep_name->getName());
196            }
197            else 
198            {
199                ccExpr = new CharClass(rep_name->getName());
200            }
201
202            if (rep_name->isNegated()) {
203                ccExpr = new Not(new Or(new Or(ccExpr, new CharClass(m_name_map.find("LineFeed")->second)),
204                                        new CharClass(m_name_map.find("internal.nonfinal")->second)));
205            }
206            if (rep_name->getType() == Name::FixedLength)
207            {
208                cg_state.stmtsl.push_back(new Assign(gs_retVal, new MatchStar(new Var(cg_state.newsym), ccExpr)));
209            }
210            else //Name::Unicode and Name::UnicodeCategory
211            {
212                cg_state.stmtsl.push_back(new Assign(gs_retVal,
213                    new And(new MatchStar(new Var(cg_state.newsym), new Or(new CharClass(m_name_map.find("internal.nonfinal")->second),
214                    ccExpr)), new CharClass(m_name_map.find("internal.initial")->second))));
215            }
216
217            cg_state.newsym = gs_retVal;
218        }
219        else if (rep->getUB() == unboundedRep)
220        {
221            cg_state = UnboundedRep_helper(rep->getRE(), rep->getLB(), cg_state);
222        }
223        else if (rep->getUB() != unboundedRep)
224        {
225            cg_state = BoundedRep_helper(rep->getRE(), rep->getLB(), rep->getUB(), cg_state);
226        }
227    }
228
229    return cg_state;
230}
231
232
233CodeGenState Pbix_Compiler::Seq_helper(std::list<RE*>* lst, std::list<RE*>::const_iterator it, CodeGenState cg_state)
234{
235    if (it != lst->end())
236    {
237        cg_state = re2pablo_helper(*it, cg_state);
238        cg_state = Seq_helper(lst, ++it, cg_state);
239    }
240
241    return cg_state;
242}
243
244CodeGenState Pbix_Compiler::Alt_helper(std::list<RE*>* lst, std::list<RE*>::const_iterator it, CodeGenState cg_state)
245{
246    CodeGenState t1_cg_state = re2pablo_helper(*it, cg_state);
247    cg_state.stmtsl = t1_cg_state.stmtsl;
248    ++it;
249    if (it != lst->end())
250    {
251        CodeGenState t2_cg_state = Alt_helper(lst, it, cg_state);
252        cg_state.stmtsl = t2_cg_state.stmtsl;
253        std::string gs_retVal = symgen.gensym("alt_marker");
254        cg_state.stmtsl.push_back(new Assign(gs_retVal, new Or(new Var(t1_cg_state.newsym), new Var(t2_cg_state.newsym))));
255        cg_state.newsym = gs_retVal;
256    }
257    else
258    {
259        cg_state.newsym = t1_cg_state.newsym;
260    }
261
262    return cg_state;
263}
264
265CodeGenState Pbix_Compiler::UnboundedRep_helper(RE* repeated, int lb, CodeGenState cg_state) {
266    if (lb == 0)
267    {
268         //std::cout << "While, no lb." << std::endl;
269
270         std::string while_test_gs_retVal = symgen.gensym("while_test");
271         std::string while_accum_gs_retVal = symgen.gensym("while_accum");
272         CodeGenState while_test_state;
273         while_test_state.newsym = while_test_gs_retVal;
274         CodeGenState t1_cg_state = re2pablo_helper(repeated, while_test_state);
275         cg_state.stmtsl.push_back(new Assign(while_test_gs_retVal, new Var(cg_state.newsym)));
276         cg_state.stmtsl.push_back(new Assign(while_accum_gs_retVal, new Var(cg_state.newsym)));
277         std::list<PabloS*> stmtList;
278         stmtList = t1_cg_state.stmtsl;
279         stmtList.push_back(new Assign(while_test_gs_retVal, new And(new Var(t1_cg_state.newsym), new Not(new Var(while_accum_gs_retVal)))));
280         stmtList.push_back(new Assign(while_accum_gs_retVal, new Or(new Var(while_accum_gs_retVal), new Var(t1_cg_state.newsym))));
281         cg_state.stmtsl.push_back( new While(new Var(while_test_gs_retVal), stmtList));
282         cg_state.newsym = while_accum_gs_retVal;
283    }
284    else //if (lb > 0)
285    {
286         CodeGenState t1_cg_state = re2pablo_helper(repeated, cg_state);
287         cg_state = UnboundedRep_helper(repeated, lb -1, t1_cg_state);
288    }
289    return cg_state;
290}
291
292
293CodeGenState Pbix_Compiler::BoundedRep_helper(RE* repeated, int lb, int ub, CodeGenState cg_state) {
294    if ((lb == 0) && (ub == 0))
295    {
296    //Just fall through...do nothing.
297    }
298    else if ((lb == 0) && (ub > 0))
299    {
300         CodeGenState t1_cg_state = re2pablo_helper(repeated, cg_state);
301         CodeGenState t2_cg_state = BoundedRep_helper(repeated, 0, ub-1, t1_cg_state);
302         std::string gs_retVal = symgen.gensym("alt_marker");
303         cg_state.stmtsl = t2_cg_state.stmtsl;
304         cg_state.stmtsl.push_back(new Assign(gs_retVal, new Or(new Var(cg_state.newsym), new Var(t2_cg_state.newsym))));
305         cg_state.newsym = gs_retVal;
306    }
307    else //if ((lb > 0) && (ub > 0))
308    {
309         CodeGenState t1_cg_state = re2pablo_helper(repeated, cg_state);
310         cg_state = BoundedRep_helper(repeated, lb-1, ub-1, t1_cg_state);
311    }
312    return cg_state;
313}
314
315
316bool Pbix_Compiler::unicode_re(RE *re)
317{
318    bool found = false;
319
320    return unicode_re_helper(re, found);
321}
322
323bool Pbix_Compiler::unicode_re_helper(RE *re, bool found)
324{
325    if (!found)
326    {
327        if (Name* name = dynamic_cast<Name*>(re))
328        {
329            if ((name->getType() == Name::UnicodeCategory) || (name->getType() == Name::Unicode))
330            {
331                found = true;
332            }
333        }
334        else if (Seq* re_seq = dynamic_cast<Seq*>(re))
335        {
336            std::list<RE*>::iterator it;
337            for (it = re_seq->GetREList()->begin(); it != re_seq->GetREList()->end(); ++it)
338            {
339                found = unicode_re_helper(*it, found);
340                if (found) break;
341            }
342        }
343        else if (Alt* re_alt = dynamic_cast<Alt*>(re))
344        {
345            std::list<RE*>::iterator it;
346            for (it = re_alt->GetREList()->begin(); it != re_alt->GetREList()->end(); ++it)
347            {
348                found = unicode_re_helper(*it, found);
349                if (found) break;
350            }
351        }
352        else if (Rep* rep = dynamic_cast<Rep*>(re))
353        {
354            found = unicode_re_helper(rep->getRE(), found);
355        }
356    }
357
358    return found;
359}
Note: See TracBrowser for help on using the repository browser.