source: icGREP/icgrep-devel/icgrep/pbix_compiler.cpp @ 3955

Last change on this file since 3955 was 3955, checked in by daled, 5 years ago

icGREP now uses scanthru for multibyte unicode character classes.

File size: 13.0 KB
Line 
1/*
2 *  Copyright (c) 2014 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "pbix_compiler.h"
8
9Pbix_Compiler::Pbix_Compiler(std::map<std::string, std::string> name_map)
10{
11    m_name_map = name_map;
12    symgen = SymbolGenerator();
13}
14
15CodeGenState Pbix_Compiler::compile_subexpressions(const std::map<std::string, RE*>& re_map)
16{
17    CodeGenState cg_state;
18
19    for (auto it =  re_map.rbegin(); it != re_map.rend(); ++it)
20    {
21        if (Seq* seq = dynamic_cast<Seq*>(it->second))
22        {
23            if (seq->getType() == Seq::Byte)
24            {
25                std::string gs_retVal = symgen.gensym("start_marker");
26                cg_state.stmtsl.push_back(new Assign(gs_retVal, new All(1)));
27                cg_state.newsym = gs_retVal;
28
29                std::list<RE*>::iterator endit;
30                endit = seq->GetREList()->end();
31                --endit;
32                std::list<RE*>::iterator it;
33                for (it = seq->GetREList()->begin(); it != seq->GetREList()->end(); ++it)
34                {
35                    Name* name = dynamic_cast<Name*>(*it);
36                    if (it != endit)
37                    {
38                        gs_retVal = symgen.gensym("marker");
39                        cg_state.stmtsl.push_back(new Assign(gs_retVal, new Advance(new And(new Var(cg_state.newsym), new CharClass(name->getName())))));
40                        cg_state.newsym = gs_retVal;
41                    }
42                    else
43                    {
44                        cg_state.stmtsl.push_back(new Assign(seq->getName(), new And(new Var(cg_state.newsym), new CharClass(name->getName()))));
45                    }
46                }
47            }
48        }
49    }
50
51    return cg_state;
52}
53
54CodeGenState Pbix_Compiler::compile(RE *re)
55{   
56    std::string gs_retVal;
57    CodeGenState cg_state;
58
59    //Set the 'internal.initial' bit stream for the utf-8 multi-byte encoding.
60    gs_retVal = symgen.gensym("start_marker");
61    cg_state.stmtsl.push_back(new Assign(gs_retVal, new All(1)));
62    cg_state.newsym = gs_retVal;
63
64    std::string gs_retVal_m1 = symgen.gensym("marker");
65    cg_state.stmtsl.push_back(new Assign(gs_retVal_m1, new And(new Var(m_name_map.find("UTF8-SingleByte")->second), new Var(cg_state.newsym))));
66
67    std::string gs_retVal_m2 = symgen.gensym("marker");
68    cg_state.stmtsl.push_back(new Assign(gs_retVal_m2, new And(new Var(m_name_map.find("UTF8-Prefix2")->second), new Var(cg_state.newsym))));
69
70    std::string gs_retVal_m3 = symgen.gensym("marker");
71    cg_state.stmtsl.push_back(new Assign(gs_retVal_m3, new And(new Var(m_name_map.find("UTF8-Prefix3")->second), new Var(cg_state.newsym))));
72
73    std::string gs_retVal_m4 = symgen.gensym("marker");
74    cg_state.stmtsl.push_back(new Assign(gs_retVal_m4, new And(new Var(m_name_map.find("UTF8-Prefix4")->second), new Var(cg_state.newsym))));
75
76    std::string gs_retVal_m5 = symgen.gensym("marker");
77    cg_state.stmtsl.push_back(new Assign(gs_retVal_m5, new Or(new Var(gs_retVal_m2), new Var(gs_retVal_m1))));
78
79    std::string gs_retVal_m6 = symgen.gensym("marker");
80    cg_state.stmtsl.push_back(new Assign(gs_retVal_m6, new Or(new Var(gs_retVal_m5), new Var(gs_retVal_m3))));
81
82    gs_retVal = symgen.gensym("internal.initial");
83    m_name_map.insert(make_pair("internal.initial", gs_retVal));
84    cg_state.stmtsl.push_back(new Assign(gs_retVal, new Or(new Var(gs_retVal_m6), new Var(gs_retVal_m4))));
85    cg_state.newsym = gs_retVal;
86
87    //Set the 'internal.nonfinal' bit stream for the utf-8 multi-byte encoding.
88    gs_retVal = symgen.gensym("start_marker");
89    cg_state.stmtsl.push_back(new Assign(gs_retVal, new All(1)));
90    cg_state.newsym = gs_retVal;
91
92    gs_retVal_m1 = symgen.gensym("marker");
93    cg_state.stmtsl.push_back(new Assign(gs_retVal_m1, new And(new Var(m_name_map.find("UTF8-Prefix2")->second), new Var(cg_state.newsym))));
94
95    gs_retVal_m2 = symgen.gensym("marker");
96    cg_state.stmtsl.push_back(new Assign(gs_retVal_m2, new And(new Var(m_name_map.find("UTF8-Prefix3")->second), new Var(cg_state.newsym))));
97
98    gs_retVal_m3 = symgen.gensym("marker");
99    cg_state.stmtsl.push_back(new Assign(gs_retVal_m3, new Advance(new Var(gs_retVal_m2))));
100
101    gs_retVal_m4 = symgen.gensym("marker");
102    cg_state.stmtsl.push_back(new Assign(gs_retVal_m4, new And(new Var(m_name_map.find("UTF8-Prefix4")->second), new Var(cg_state.newsym))));
103
104    gs_retVal_m5 = symgen.gensym("marker");
105    cg_state.stmtsl.push_back(new Assign(gs_retVal_m5, new Advance(new Var(gs_retVal_m4))));
106
107    gs_retVal_m6 = symgen.gensym("marker");
108    cg_state.stmtsl.push_back(new Assign(gs_retVal_m6, new Advance(new Var(gs_retVal_m5))));
109
110    std::string gs_retVal_m7 = symgen.gensym("marker");
111    cg_state.stmtsl.push_back(new Assign(gs_retVal_m7, new Or(new Var(gs_retVal_m2), new Var(gs_retVal_m1))));
112
113    std::string gs_retVal_m8 = symgen.gensym("marker");
114    cg_state.stmtsl.push_back(new Assign(gs_retVal_m8, new Or(new Var(gs_retVal_m7), new Var(gs_retVal_m3))));
115
116    std::string gs_retVal_m9 = symgen.gensym("marker");
117    cg_state.stmtsl.push_back(new Assign(gs_retVal_m9, new Or(new Var(gs_retVal_m8), new Var(gs_retVal_m4))));
118
119    std::string gs_retVal_m10 = symgen.gensym("marker");
120    cg_state.stmtsl.push_back(new Assign(gs_retVal_m10, new Or(new Var(gs_retVal_m9), new Var(gs_retVal_m5))));
121
122    gs_retVal = symgen.gensym("internal.nonfinal");
123    m_name_map.insert(make_pair("internal.nonfinal", gs_retVal));
124    cg_state.stmtsl.push_back(new Assign(gs_retVal, new Or(new Var(gs_retVal_m10), new Var(gs_retVal_m6))));
125    cg_state.newsym = gs_retVal;
126
127
128    gs_retVal = symgen.gensym("start_marker");
129    cg_state.stmtsl.push_back(new Assign(gs_retVal, new All(1)));
130    cg_state.newsym = gs_retVal;
131    cg_state = re2pablo_helper(re, cg_state);
132
133    //These three lines are specifically for grep.
134    gs_retVal = symgen.gensym("marker");
135    cg_state.stmtsl.push_back(new Assign(gs_retVal, new And(new MatchStar(new Var(cg_state.newsym),
136                                new Not(new Var(m_name_map.find("LineFeed")->second))), new Var(m_name_map.find("LineFeed")->second))));
137    cg_state.newsym = gs_retVal;
138
139    return cg_state;
140}
141
142CodeGenState Pbix_Compiler::re2pablo_helper(RE *re, CodeGenState cg_state)
143{
144    if (Name* name = dynamic_cast<Name*>(re))
145    {
146        std::string gs_retVal = symgen.gensym("marker");
147
148        if (name->getType() == Name::FixedLength)
149        {
150            cg_state.stmtsl.push_back(new Assign(gs_retVal, new Advance(new And(new Var(cg_state.newsym), new CharClass(name->getName())))));
151        }
152        else if (name->getType() == Name::UnicodeCategory)
153        {
154            cg_state.stmtsl.push_back(new Assign(gs_retVal, new Advance(new And(new Var(cg_state.newsym), new Call(name->getName())))));
155        }
156        else //Name::Unicode
157        {
158            cg_state.stmtsl.push_back(new Assign(gs_retVal, new Advance(new And(new CharClass(name->getName()), new ScanThru(new Var(cg_state.newsym), new Var(m_name_map.find("internal.nonfinal")->second))))));
159        }
160        cg_state.newsym = gs_retVal;
161
162        //cout << "\n" << "(" << StatementPrinter::PrintStmts(cg_state) << ")" << "\n" << endl;
163    }
164    else if (Start* start = dynamic_cast<Start*>(re))
165    {
166        std::string gs_retVal = symgen.gensym("start_of_line_marker");
167        cg_state.stmtsl.push_back(new Assign(gs_retVal, new And(new Var(cg_state.newsym), new Not(new Advance(new Not(new CharClass(m_name_map.find("LineFeed")->second)))))));
168        cg_state.newsym = gs_retVal;
169    }
170    else if (End* end = dynamic_cast<End*>(re))
171    {
172        std::string gs_retVal = symgen.gensym("end_of_line_marker");
173        cg_state.stmtsl.push_back(new Assign(gs_retVal, new And(new Var(cg_state.newsym), new CharClass(m_name_map.find("LineFeed")->second))));
174        cg_state.newsym = gs_retVal;
175    }
176    else if (Seq* seq = dynamic_cast<Seq*>(re))
177    {
178        std::list<RE*>::iterator it = seq->GetREList()->begin();
179        if (it != seq->GetREList()->end())
180        {
181            cg_state = Seq_helper(seq->GetREList(), it, cg_state);
182        }
183    //cout << "\n" << "Seq => (" << StatementPrinter::PrintStmts(cg_state) << ")" << "\n" << endl;
184    }
185    else if (Alt* alt = dynamic_cast<Alt*>(re))
186    {
187        if (alt->GetREList() == 0)
188        {
189
190            std::string gs_retVal = symgen.gensym("always_fail_marker");
191            cg_state.stmtsl.push_back(new Assign(gs_retVal, new All(0)));
192            cg_state.newsym = gs_retVal;
193        }
194        else
195        {
196            if (alt->GetREList()->size() == 1)
197            {
198                cg_state = re2pablo_helper(alt->GetREList()->front(), cg_state);
199            }
200            else
201            {
202                std::list<RE*>::iterator it = alt->GetREList()->begin();
203                cg_state = Alt_helper(alt->GetREList(), it, cg_state);
204            }
205        }
206    //cout << "\n" << "Alt => (" << StatementPrinter::PrintStmts(cg_state) << ")" << "\n" << endl;
207    }
208    else if (Rep* rep = dynamic_cast<Rep*>(re))
209    {
210        if ((dynamic_cast<Name*>(rep->getRE()) != 0) && (rep->getLB() == 0) && (rep->getUB()== unboundedRep))
211        {
212            //std::cout << "Matchstar!" << std::endl;
213
214            Name* rep_name = dynamic_cast<Name*>(rep->getRE());
215            std::string gs_retVal = symgen.gensym("marker");
216            cg_state.stmtsl.push_back(new Assign(gs_retVal, new MatchStar(new Var(cg_state.newsym), new CharClass(rep_name->getName()))));
217            cg_state.newsym = gs_retVal;
218        }
219        else if (rep->getUB() == unboundedRep)
220        {
221            if (rep->getLB() == 0)
222            {
223                //std::cout << "While, no lb." << std::endl;
224
225                std::string while_test_gs_retVal = symgen.gensym("while_test");
226                std::string while_accum_gs_retVal = symgen.gensym("while_accum");
227                CodeGenState while_test_state;
228                while_test_state.newsym = while_test_gs_retVal;
229                CodeGenState t1_cg_state = re2pablo_helper(rep->getRE(), while_test_state);
230                cg_state.stmtsl.push_back(new Assign(while_test_gs_retVal, new Var(cg_state.newsym)));
231                cg_state.stmtsl.push_back(new Assign(while_accum_gs_retVal, new Var(cg_state.newsym)));
232                std::list<PabloS*> stmtList;
233                stmtList = t1_cg_state.stmtsl;
234                stmtList.push_back(new Assign(while_test_gs_retVal, new And(new Var(t1_cg_state.newsym), new Not(new Var(while_accum_gs_retVal)))));
235                stmtList.push_back(new Assign(while_accum_gs_retVal, new Or(new Var(while_accum_gs_retVal), new Var(t1_cg_state.newsym))));
236                cg_state.stmtsl.push_back( new While(new Var(while_test_gs_retVal), stmtList));
237                cg_state.newsym = while_accum_gs_retVal;
238            }
239            else //if (rep->getLB() > 1)
240            {
241                CodeGenState t1_cg_state = re2pablo_helper(rep->getRE(), cg_state);
242                rep->setLB(rep->getLB() - 1);
243                cg_state = re2pablo_helper(rep, t1_cg_state);
244            }
245        }
246        else if (rep->getUB() != unboundedRep)
247        {
248            if ((rep->getLB() == 0) && (rep->getUB() == 0))
249            {
250                //Just fall through...do nothing.
251            }
252            else if ((rep->getLB() == 0) && (rep->getUB() > 0))
253            {
254                CodeGenState t1_cg_state = re2pablo_helper(rep->getRE(), cg_state);
255                rep->setUB(rep->getUB() - 1);
256                CodeGenState t2_cg_state = re2pablo_helper(re, t1_cg_state);
257                std::string gs_retVal = symgen.gensym("alt_marker");
258                cg_state.stmtsl = t2_cg_state.stmtsl;
259                cg_state.stmtsl.push_back(new Assign(gs_retVal, new Or(new Var(cg_state.newsym), new Var(t2_cg_state.newsym))));
260                cg_state.newsym = gs_retVal;
261            }
262            else //if ((rep->getLB() > 0) && (rep->getUB() > 0))
263            {
264                CodeGenState t1_cg_state = re2pablo_helper(rep->getRE(), cg_state);
265                rep->setLB(rep->getLB() - 1);
266                rep->setUB(rep->getUB() - 1);
267                cg_state = re2pablo_helper(rep, t1_cg_state);
268            }
269        }
270    }
271
272    return cg_state;
273}
274
275CodeGenState Pbix_Compiler::Seq_helper(std::list<RE*>* lst, std::list<RE*>::const_iterator it, CodeGenState cg_state)
276{
277    if (it != lst->end())
278    {
279        cg_state = re2pablo_helper(*it, cg_state);
280        cg_state = Seq_helper(lst, ++it, cg_state);
281    }
282
283    return cg_state;
284}
285
286CodeGenState Pbix_Compiler::Alt_helper(std::list<RE*>* lst, std::list<RE*>::const_iterator it, CodeGenState cg_state)
287{
288    CodeGenState t1_cg_state = re2pablo_helper(*it, cg_state);
289    cg_state.stmtsl = t1_cg_state.stmtsl;
290    ++it;
291    if (it != lst->end())
292    {
293        CodeGenState t2_cg_state = Alt_helper(lst, it, cg_state);
294        cg_state.stmtsl = t2_cg_state.stmtsl;
295        std::string gs_retVal = symgen.gensym("alt_marker");
296        cg_state.stmtsl.push_back(new Assign(gs_retVal, new Or(new Var(t1_cg_state.newsym), new Var(t2_cg_state.newsym))));
297        cg_state.newsym = gs_retVal;
298    }
299    else
300    {
301        cg_state.newsym = t1_cg_state.newsym;
302    }
303
304    return cg_state;
305}
306
Note: See TracBrowser for help on using the repository browser.