source: icGREP/icgrep-devel/icgrep/pbix_compiler.cpp @ 3961

Last change on this file since 3961 was 3961, checked in by daled, 5 years ago

Multibyte character code classes parsed from hex notation are now using named byte sequences.

File size: 13.7 KB
Line 
1/*
2 *  Copyright (c) 2014 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "pbix_compiler.h"
8
9Pbix_Compiler::Pbix_Compiler(std::map<std::string, std::string> name_map)
10{
11    m_name_map = name_map;
12    symgen = SymbolGenerator();
13}
14
15CodeGenState Pbix_Compiler::compile_subexpressions(const std::map<std::string, RE*>& re_map)
16{
17    CodeGenState cg_state;
18
19    for (auto it =  re_map.rbegin(); it != re_map.rend(); ++it)
20    {
21        //This is specifically for the utf8 multibyte character classes.
22        if (Seq* seq = dynamic_cast<Seq*>(it->second))
23        {
24            if (seq->getType() == Seq::Byte)
25            {
26                std::string gs_retVal = symgen.gensym("start_marker");
27                cg_state.stmtsl.push_back(new Assign(gs_retVal, new All(1)));
28                cg_state.newsym = gs_retVal;
29
30                std::list<RE*>::iterator endit;
31                endit = seq->GetREList()->end();
32                --endit;
33                std::list<RE*>::iterator it;
34                for (it = seq->GetREList()->begin(); it != seq->GetREList()->end(); ++it)
35                {
36                    Name* name = dynamic_cast<Name*>(*it);
37                    if (it != endit)
38                    {
39                        gs_retVal = symgen.gensym("marker");
40                        cg_state.stmtsl.push_back(new Assign(gs_retVal, new Advance(new And(new Var(cg_state.newsym), new CharClass(name->getName())))));
41                        cg_state.newsym = gs_retVal;
42                    }
43                    else
44                    {
45                        cg_state.stmtsl.push_back(new Assign(seq->getName(), new And(new Var(cg_state.newsym), new CharClass(name->getName()))));
46                    }
47                }
48            }
49        }
50    }
51
52    return cg_state;
53}
54
55CodeGenState Pbix_Compiler::compile(RE *re)
56{   
57    std::string gs_retVal;
58    CodeGenState cg_state;
59
60    //Set the 'internal.initial' bit stream for the utf-8 multi-byte encoding.
61    gs_retVal = symgen.gensym("start_marker");
62    cg_state.stmtsl.push_back(new Assign(gs_retVal, new All(1)));
63    cg_state.newsym = gs_retVal;
64
65    std::string gs_retVal_m1 = symgen.gensym("marker");
66    cg_state.stmtsl.push_back(new Assign(gs_retVal_m1, new And(new Var(m_name_map.find("UTF8-SingleByte")->second), new Var(cg_state.newsym))));
67
68    std::string gs_retVal_m2 = symgen.gensym("marker");
69    cg_state.stmtsl.push_back(new Assign(gs_retVal_m2, new And(new Var(m_name_map.find("UTF8-Prefix2")->second), new Var(cg_state.newsym))));
70
71    std::string gs_retVal_m3 = symgen.gensym("marker");
72    cg_state.stmtsl.push_back(new Assign(gs_retVal_m3, new And(new Var(m_name_map.find("UTF8-Prefix3")->second), new Var(cg_state.newsym))));
73
74    std::string gs_retVal_m4 = symgen.gensym("marker");
75    cg_state.stmtsl.push_back(new Assign(gs_retVal_m4, new And(new Var(m_name_map.find("UTF8-Prefix4")->second), new Var(cg_state.newsym))));
76
77    std::string gs_retVal_m5 = symgen.gensym("marker");
78    cg_state.stmtsl.push_back(new Assign(gs_retVal_m5, new Or(new Var(gs_retVal_m2), new Var(gs_retVal_m1))));
79
80    std::string gs_retVal_m6 = symgen.gensym("marker");
81    cg_state.stmtsl.push_back(new Assign(gs_retVal_m6, new Or(new Var(gs_retVal_m5), new Var(gs_retVal_m3))));
82
83    gs_retVal = symgen.gensym("internal.initial");
84    m_name_map.insert(make_pair("internal.initial", gs_retVal));
85    cg_state.stmtsl.push_back(new Assign(gs_retVal, new Or(new Var(gs_retVal_m6), new Var(gs_retVal_m4))));
86    cg_state.newsym = gs_retVal;
87
88    //Set the 'internal.nonfinal' bit stream for the utf-8 multi-byte encoding.
89    gs_retVal = symgen.gensym("start_marker");
90    cg_state.stmtsl.push_back(new Assign(gs_retVal, new All(1)));
91    cg_state.newsym = gs_retVal;
92
93    gs_retVal_m1 = symgen.gensym("marker");
94    cg_state.stmtsl.push_back(new Assign(gs_retVal_m1, new And(new Var(m_name_map.find("UTF8-Prefix2")->second), new Var(cg_state.newsym))));
95
96    gs_retVal_m2 = symgen.gensym("marker");
97    cg_state.stmtsl.push_back(new Assign(gs_retVal_m2, new And(new Var(m_name_map.find("UTF8-Prefix3")->second), new Var(cg_state.newsym))));
98
99    gs_retVal_m3 = symgen.gensym("marker");
100    cg_state.stmtsl.push_back(new Assign(gs_retVal_m3, new Advance(new Var(gs_retVal_m2))));
101
102    gs_retVal_m4 = symgen.gensym("marker");
103    cg_state.stmtsl.push_back(new Assign(gs_retVal_m4, new And(new Var(m_name_map.find("UTF8-Prefix4")->second), new Var(cg_state.newsym))));
104
105    gs_retVal_m5 = symgen.gensym("marker");
106    cg_state.stmtsl.push_back(new Assign(gs_retVal_m5, new Advance(new Var(gs_retVal_m4))));
107
108    gs_retVal_m6 = symgen.gensym("marker");
109    cg_state.stmtsl.push_back(new Assign(gs_retVal_m6, new Advance(new Var(gs_retVal_m5))));
110
111    std::string gs_retVal_m7 = symgen.gensym("marker");
112    cg_state.stmtsl.push_back(new Assign(gs_retVal_m7, new Or(new Var(gs_retVal_m2), new Var(gs_retVal_m1))));
113
114    std::string gs_retVal_m8 = symgen.gensym("marker");
115    cg_state.stmtsl.push_back(new Assign(gs_retVal_m8, new Or(new Var(gs_retVal_m7), new Var(gs_retVal_m3))));
116
117    std::string gs_retVal_m9 = symgen.gensym("marker");
118    cg_state.stmtsl.push_back(new Assign(gs_retVal_m9, new Or(new Var(gs_retVal_m8), new Var(gs_retVal_m4))));
119
120    std::string gs_retVal_m10 = symgen.gensym("marker");
121    cg_state.stmtsl.push_back(new Assign(gs_retVal_m10, new Or(new Var(gs_retVal_m9), new Var(gs_retVal_m5))));
122
123    gs_retVal = symgen.gensym("internal.nonfinal");
124    m_name_map.insert(make_pair("internal.nonfinal", gs_retVal));
125    cg_state.stmtsl.push_back(new Assign(gs_retVal, new Or(new Var(gs_retVal_m10), new Var(gs_retVal_m6))));
126    cg_state.newsym = gs_retVal;
127
128
129    gs_retVal = symgen.gensym("start_marker");
130    cg_state.stmtsl.push_back(new Assign(gs_retVal, new All(1)));
131    cg_state.newsym = gs_retVal;
132    cg_state = re2pablo_helper(re, cg_state);
133
134    //These three lines are specifically for grep.
135    gs_retVal = symgen.gensym("marker");
136    cg_state.stmtsl.push_back(new Assign(gs_retVal, new And(new MatchStar(new Var(cg_state.newsym),
137                                new Not(new Var(m_name_map.find("LineFeed")->second))), new Var(m_name_map.find("LineFeed")->second))));
138    cg_state.newsym = gs_retVal;
139
140    return cg_state;
141}
142
143CodeGenState Pbix_Compiler::re2pablo_helper(RE *re, CodeGenState cg_state)
144{
145    if (Name* name = dynamic_cast<Name*>(re))
146    {
147        std::string gs_retVal = symgen.gensym("marker");
148
149        if (name->getType() == Name::FixedLength)
150        {
151            cg_state.stmtsl.push_back(new Assign(gs_retVal, new Advance(new And(new Var(cg_state.newsym), new CharClass(name->getName())))));
152        }
153        else if (name->getType() == Name::UnicodeCategory)
154        {
155            cg_state.stmtsl.push_back(new Assign(gs_retVal, new Advance(new And(new Var(cg_state.newsym), new Call(name->getName())))));
156        }
157        else //Name::Unicode
158        {
159            cg_state.stmtsl.push_back(new Assign(gs_retVal, new Advance(new And(new CharClass(name->getName()), new ScanThru(new Var(cg_state.newsym), new CharClass(m_name_map.find("internal.nonfinal")->second))))));
160        }
161        cg_state.newsym = gs_retVal;
162
163        //cout << "\n" << "(" << StatementPrinter::PrintStmts(cg_state) << ")" << "\n" << endl;
164    }
165    else if (Start* start = dynamic_cast<Start*>(re))
166    {
167        std::string gs_retVal = symgen.gensym("start_of_line_marker");
168        cg_state.stmtsl.push_back(new Assign(gs_retVal, new And(new Var(cg_state.newsym), new Not(new Advance(new Not(new CharClass(m_name_map.find("LineFeed")->second)))))));
169        cg_state.newsym = gs_retVal;
170    }
171    else if (End* end = dynamic_cast<End*>(re))
172    {
173        std::string gs_retVal = symgen.gensym("end_of_line_marker");
174        cg_state.stmtsl.push_back(new Assign(gs_retVal, new And(new Var(cg_state.newsym), new CharClass(m_name_map.find("LineFeed")->second))));
175        cg_state.newsym = gs_retVal;
176    }
177    else if (Seq* seq = dynamic_cast<Seq*>(re))
178    {
179        std::list<RE*>::iterator it = seq->GetREList()->begin();
180        if (it != seq->GetREList()->end())
181        {
182            cg_state = Seq_helper(seq->GetREList(), it, cg_state);
183        }
184    //cout << "\n" << "Seq => (" << StatementPrinter::PrintStmts(cg_state) << ")" << "\n" << endl;
185    }
186    else if (Alt* alt = dynamic_cast<Alt*>(re))
187    {
188        if (alt->GetREList() == 0)
189        {
190
191            std::string gs_retVal = symgen.gensym("always_fail_marker");
192            cg_state.stmtsl.push_back(new Assign(gs_retVal, new All(0)));
193            cg_state.newsym = gs_retVal;
194        }
195        else
196        {
197            if (alt->GetREList()->size() == 1)
198            {
199                cg_state = re2pablo_helper(alt->GetREList()->front(), cg_state);
200            }
201            else
202            {
203                std::list<RE*>::iterator it = alt->GetREList()->begin();
204                cg_state = Alt_helper(alt->GetREList(), it, cg_state);
205            }
206        }
207    //cout << "\n" << "Alt => (" << StatementPrinter::PrintStmts(cg_state) << ")" << "\n" << endl;
208    }
209    else if (Rep* rep = dynamic_cast<Rep*>(re))
210    {
211        if ((dynamic_cast<Name*>(rep->getRE()) != 0) && (rep->getLB() == 0) && (rep->getUB()== unboundedRep))
212        {
213            Name* rep_name = dynamic_cast<Name*>(rep->getRE());
214            std::string gs_retVal = symgen.gensym("marker");
215
216            if (rep_name->getType() == Name::FixedLength)
217            {
218                cg_state.stmtsl.push_back(new Assign(gs_retVal, new MatchStar(new Var(cg_state.newsym), new CharClass(rep_name->getName()))));
219            }
220            else if (rep_name->getType() == Name::UnicodeCategory)
221            {
222                // TODO:  ?? not too sure....
223            }
224            else //Name::unicode
225            {
226                cg_state.stmtsl.push_back(new Assign(gs_retVal,
227                    new And(new MatchStar(new Var(cg_state.newsym), new Or(new CharClass(m_name_map.find("internal.nonfinal")->second),
228                                                                           new CharClass(rep_name->getName()))), new CharClass(m_name_map.find("internal.initial")->second))));
229            }
230
231            cg_state.newsym = gs_retVal;
232        }
233        else if (rep->getUB() == unboundedRep)
234        {
235            if (rep->getLB() == 0)
236            {
237                //std::cout << "While, no lb." << std::endl;
238
239                std::string while_test_gs_retVal = symgen.gensym("while_test");
240                std::string while_accum_gs_retVal = symgen.gensym("while_accum");
241                CodeGenState while_test_state;
242                while_test_state.newsym = while_test_gs_retVal;
243                CodeGenState t1_cg_state = re2pablo_helper(rep->getRE(), while_test_state);
244                cg_state.stmtsl.push_back(new Assign(while_test_gs_retVal, new Var(cg_state.newsym)));
245                cg_state.stmtsl.push_back(new Assign(while_accum_gs_retVal, new Var(cg_state.newsym)));
246                std::list<PabloS*> stmtList;
247                stmtList = t1_cg_state.stmtsl;
248                stmtList.push_back(new Assign(while_test_gs_retVal, new And(new Var(t1_cg_state.newsym), new Not(new Var(while_accum_gs_retVal)))));
249                stmtList.push_back(new Assign(while_accum_gs_retVal, new Or(new Var(while_accum_gs_retVal), new Var(t1_cg_state.newsym))));
250                cg_state.stmtsl.push_back( new While(new Var(while_test_gs_retVal), stmtList));
251                cg_state.newsym = while_accum_gs_retVal;
252            }
253            else //if (rep->getLB() > 1)
254            {
255                CodeGenState t1_cg_state = re2pablo_helper(rep->getRE(), cg_state);
256                rep->setLB(rep->getLB() - 1);
257                cg_state = re2pablo_helper(rep, t1_cg_state);
258            }
259        }
260        else if (rep->getUB() != unboundedRep)
261        {
262            if ((rep->getLB() == 0) && (rep->getUB() == 0))
263            {
264                //Just fall through...do nothing.
265            }
266            else if ((rep->getLB() == 0) && (rep->getUB() > 0))
267            {
268                CodeGenState t1_cg_state = re2pablo_helper(rep->getRE(), cg_state);
269                rep->setUB(rep->getUB() - 1);
270                CodeGenState t2_cg_state = re2pablo_helper(re, t1_cg_state);
271                std::string gs_retVal = symgen.gensym("alt_marker");
272                cg_state.stmtsl = t2_cg_state.stmtsl;
273                cg_state.stmtsl.push_back(new Assign(gs_retVal, new Or(new Var(cg_state.newsym), new Var(t2_cg_state.newsym))));
274                cg_state.newsym = gs_retVal;
275            }
276            else //if ((rep->getLB() > 0) && (rep->getUB() > 0))
277            {
278                CodeGenState t1_cg_state = re2pablo_helper(rep->getRE(), cg_state);
279                rep->setLB(rep->getLB() - 1);
280                rep->setUB(rep->getUB() - 1);
281                cg_state = re2pablo_helper(rep, t1_cg_state);
282            }
283        }
284    }
285
286    return cg_state;
287}
288
289CodeGenState Pbix_Compiler::Seq_helper(std::list<RE*>* lst, std::list<RE*>::const_iterator it, CodeGenState cg_state)
290{
291    if (it != lst->end())
292    {
293        cg_state = re2pablo_helper(*it, cg_state);
294        cg_state = Seq_helper(lst, ++it, cg_state);
295    }
296
297    return cg_state;
298}
299
300CodeGenState Pbix_Compiler::Alt_helper(std::list<RE*>* lst, std::list<RE*>::const_iterator it, CodeGenState cg_state)
301{
302    CodeGenState t1_cg_state = re2pablo_helper(*it, cg_state);
303    cg_state.stmtsl = t1_cg_state.stmtsl;
304    ++it;
305    if (it != lst->end())
306    {
307        CodeGenState t2_cg_state = Alt_helper(lst, it, cg_state);
308        cg_state.stmtsl = t2_cg_state.stmtsl;
309        std::string gs_retVal = symgen.gensym("alt_marker");
310        cg_state.stmtsl.push_back(new Assign(gs_retVal, new Or(new Var(t1_cg_state.newsym), new Var(t2_cg_state.newsym))));
311        cg_state.newsym = gs_retVal;
312    }
313    else
314    {
315        cg_state.newsym = t1_cg_state.newsym;
316    }
317
318    return cg_state;
319}
320
Note: See TracBrowser for help on using the repository browser.