source: icGREP/icgrep-devel/icgrep/pbix_compiler.cpp @ 3956

Last change on this file since 3956 was 3956, checked in by daled, 5 years ago

Matchstar for utf-8 character classes works. Needs refactoring.

File size: 14.3 KB
RevLine 
[3850]1/*
2 *  Copyright (c) 2014 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "pbix_compiler.h"
8
[3955]9Pbix_Compiler::Pbix_Compiler(std::map<std::string, std::string> name_map)
[3914]10{
[3955]11    m_name_map = name_map;
[3914]12    symgen = SymbolGenerator();
[3850]13}
14
[3955]15CodeGenState Pbix_Compiler::compile_subexpressions(const std::map<std::string, RE*>& re_map)
16{
17    CodeGenState cg_state;
18
19    for (auto it =  re_map.rbegin(); it != re_map.rend(); ++it)
20    {
[3956]21        //This is specifically for the utf8 multibyte character classes.
[3955]22        if (Seq* seq = dynamic_cast<Seq*>(it->second))
23        {
24            if (seq->getType() == Seq::Byte)
25            {
26                std::string gs_retVal = symgen.gensym("start_marker");
27                cg_state.stmtsl.push_back(new Assign(gs_retVal, new All(1)));
28                cg_state.newsym = gs_retVal;
29
30                std::list<RE*>::iterator endit;
31                endit = seq->GetREList()->end();
32                --endit;
33                std::list<RE*>::iterator it;
34                for (it = seq->GetREList()->begin(); it != seq->GetREList()->end(); ++it)
35                {
36                    Name* name = dynamic_cast<Name*>(*it);
37                    if (it != endit)
38                    {
39                        gs_retVal = symgen.gensym("marker");
40                        cg_state.stmtsl.push_back(new Assign(gs_retVal, new Advance(new And(new Var(cg_state.newsym), new CharClass(name->getName())))));
41                        cg_state.newsym = gs_retVal;
42                    }
43                    else
44                    {
45                        cg_state.stmtsl.push_back(new Assign(seq->getName(), new And(new Var(cg_state.newsym), new CharClass(name->getName()))));
46                    }
47                }
48            }
49        }
50    }
51
52    return cg_state;
53}
54
[3850]55CodeGenState Pbix_Compiler::compile(RE *re)
[3914]56{   
[3850]57    std::string gs_retVal;
[3955]58    CodeGenState cg_state;
59
60    //Set the 'internal.initial' bit stream for the utf-8 multi-byte encoding.
[3850]61    gs_retVal = symgen.gensym("start_marker");
[3955]62    cg_state.stmtsl.push_back(new Assign(gs_retVal, new All(1)));
63    cg_state.newsym = gs_retVal;
[3850]64
[3955]65    std::string gs_retVal_m1 = symgen.gensym("marker");
66    cg_state.stmtsl.push_back(new Assign(gs_retVal_m1, new And(new Var(m_name_map.find("UTF8-SingleByte")->second), new Var(cg_state.newsym))));
67
68    std::string gs_retVal_m2 = symgen.gensym("marker");
69    cg_state.stmtsl.push_back(new Assign(gs_retVal_m2, new And(new Var(m_name_map.find("UTF8-Prefix2")->second), new Var(cg_state.newsym))));
70
71    std::string gs_retVal_m3 = symgen.gensym("marker");
72    cg_state.stmtsl.push_back(new Assign(gs_retVal_m3, new And(new Var(m_name_map.find("UTF8-Prefix3")->second), new Var(cg_state.newsym))));
73
74    std::string gs_retVal_m4 = symgen.gensym("marker");
75    cg_state.stmtsl.push_back(new Assign(gs_retVal_m4, new And(new Var(m_name_map.find("UTF8-Prefix4")->second), new Var(cg_state.newsym))));
76
77    std::string gs_retVal_m5 = symgen.gensym("marker");
78    cg_state.stmtsl.push_back(new Assign(gs_retVal_m5, new Or(new Var(gs_retVal_m2), new Var(gs_retVal_m1))));
79
80    std::string gs_retVal_m6 = symgen.gensym("marker");
81    cg_state.stmtsl.push_back(new Assign(gs_retVal_m6, new Or(new Var(gs_retVal_m5), new Var(gs_retVal_m3))));
82
83    gs_retVal = symgen.gensym("internal.initial");
84    m_name_map.insert(make_pair("internal.initial", gs_retVal));
85    cg_state.stmtsl.push_back(new Assign(gs_retVal, new Or(new Var(gs_retVal_m6), new Var(gs_retVal_m4))));
86    cg_state.newsym = gs_retVal;
87
88    //Set the 'internal.nonfinal' bit stream for the utf-8 multi-byte encoding.
89    gs_retVal = symgen.gensym("start_marker");
[3850]90    cg_state.stmtsl.push_back(new Assign(gs_retVal, new All(1)));
91    cg_state.newsym = gs_retVal;
92
[3955]93    gs_retVal_m1 = symgen.gensym("marker");
94    cg_state.stmtsl.push_back(new Assign(gs_retVal_m1, new And(new Var(m_name_map.find("UTF8-Prefix2")->second), new Var(cg_state.newsym))));
95
96    gs_retVal_m2 = symgen.gensym("marker");
97    cg_state.stmtsl.push_back(new Assign(gs_retVal_m2, new And(new Var(m_name_map.find("UTF8-Prefix3")->second), new Var(cg_state.newsym))));
98
99    gs_retVal_m3 = symgen.gensym("marker");
100    cg_state.stmtsl.push_back(new Assign(gs_retVal_m3, new Advance(new Var(gs_retVal_m2))));
101
102    gs_retVal_m4 = symgen.gensym("marker");
103    cg_state.stmtsl.push_back(new Assign(gs_retVal_m4, new And(new Var(m_name_map.find("UTF8-Prefix4")->second), new Var(cg_state.newsym))));
104
105    gs_retVal_m5 = symgen.gensym("marker");
106    cg_state.stmtsl.push_back(new Assign(gs_retVal_m5, new Advance(new Var(gs_retVal_m4))));
107
108    gs_retVal_m6 = symgen.gensym("marker");
109    cg_state.stmtsl.push_back(new Assign(gs_retVal_m6, new Advance(new Var(gs_retVal_m5))));
110
111    std::string gs_retVal_m7 = symgen.gensym("marker");
112    cg_state.stmtsl.push_back(new Assign(gs_retVal_m7, new Or(new Var(gs_retVal_m2), new Var(gs_retVal_m1))));
113
114    std::string gs_retVal_m8 = symgen.gensym("marker");
115    cg_state.stmtsl.push_back(new Assign(gs_retVal_m8, new Or(new Var(gs_retVal_m7), new Var(gs_retVal_m3))));
116
117    std::string gs_retVal_m9 = symgen.gensym("marker");
118    cg_state.stmtsl.push_back(new Assign(gs_retVal_m9, new Or(new Var(gs_retVal_m8), new Var(gs_retVal_m4))));
119
120    std::string gs_retVal_m10 = symgen.gensym("marker");
121    cg_state.stmtsl.push_back(new Assign(gs_retVal_m10, new Or(new Var(gs_retVal_m9), new Var(gs_retVal_m5))));
122
123    gs_retVal = symgen.gensym("internal.nonfinal");
124    m_name_map.insert(make_pair("internal.nonfinal", gs_retVal));
125    cg_state.stmtsl.push_back(new Assign(gs_retVal, new Or(new Var(gs_retVal_m10), new Var(gs_retVal_m6))));
126    cg_state.newsym = gs_retVal;
127
128
129    gs_retVal = symgen.gensym("start_marker");
130    cg_state.stmtsl.push_back(new Assign(gs_retVal, new All(1)));
131    cg_state.newsym = gs_retVal;
[3850]132    cg_state = re2pablo_helper(re, cg_state);
133
134    //These three lines are specifically for grep.
135    gs_retVal = symgen.gensym("marker");
[3955]136    cg_state.stmtsl.push_back(new Assign(gs_retVal, new And(new MatchStar(new Var(cg_state.newsym),
137                                new Not(new Var(m_name_map.find("LineFeed")->second))), new Var(m_name_map.find("LineFeed")->second))));
[3850]138    cg_state.newsym = gs_retVal;
139
140    return cg_state;
141}
142
143CodeGenState Pbix_Compiler::re2pablo_helper(RE *re, CodeGenState cg_state)
144{
[3914]145    if (Name* name = dynamic_cast<Name*>(re))
[3850]146    {
147        std::string gs_retVal = symgen.gensym("marker");
[3940]148
[3955]149        if (name->getType() == Name::FixedLength)
150        {
151            cg_state.stmtsl.push_back(new Assign(gs_retVal, new Advance(new And(new Var(cg_state.newsym), new CharClass(name->getName())))));
152        }
153        else if (name->getType() == Name::UnicodeCategory)
154        {
155            cg_state.stmtsl.push_back(new Assign(gs_retVal, new Advance(new And(new Var(cg_state.newsym), new Call(name->getName())))));
156        }
157        else //Name::Unicode
158        {
[3956]159            cg_state.stmtsl.push_back(new Assign(gs_retVal, new Advance(new And(new CharClass(name->getName()), new ScanThru(new Var(cg_state.newsym), new CharClass(m_name_map.find("internal.nonfinal")->second))))));
[3955]160        }
[3850]161        cg_state.newsym = gs_retVal;
162
163        //cout << "\n" << "(" << StatementPrinter::PrintStmts(cg_state) << ")" << "\n" << endl;
164    }
165    else if (Start* start = dynamic_cast<Start*>(re))
166    {
167        std::string gs_retVal = symgen.gensym("start_of_line_marker");
[3955]168        cg_state.stmtsl.push_back(new Assign(gs_retVal, new And(new Var(cg_state.newsym), new Not(new Advance(new Not(new CharClass(m_name_map.find("LineFeed")->second)))))));
[3850]169        cg_state.newsym = gs_retVal;
170    }
171    else if (End* end = dynamic_cast<End*>(re))
172    {
173        std::string gs_retVal = symgen.gensym("end_of_line_marker");
[3955]174        cg_state.stmtsl.push_back(new Assign(gs_retVal, new And(new Var(cg_state.newsym), new CharClass(m_name_map.find("LineFeed")->second))));
[3850]175        cg_state.newsym = gs_retVal;
176    }
177    else if (Seq* seq = dynamic_cast<Seq*>(re))
178    {
179        std::list<RE*>::iterator it = seq->GetREList()->begin();
180        if (it != seq->GetREList()->end())
181        {
182            cg_state = Seq_helper(seq->GetREList(), it, cg_state);
183        }
184    //cout << "\n" << "Seq => (" << StatementPrinter::PrintStmts(cg_state) << ")" << "\n" << endl;
185    }
186    else if (Alt* alt = dynamic_cast<Alt*>(re))
187    {
188        if (alt->GetREList() == 0)
189        {
190
191            std::string gs_retVal = symgen.gensym("always_fail_marker");
192            cg_state.stmtsl.push_back(new Assign(gs_retVal, new All(0)));
193            cg_state.newsym = gs_retVal;
194        }
195        else
196        {
197            if (alt->GetREList()->size() == 1)
198            {
199                cg_state = re2pablo_helper(alt->GetREList()->front(), cg_state);
200            }
201            else
202            {
203                std::list<RE*>::iterator it = alt->GetREList()->begin();
204                cg_state = Alt_helper(alt->GetREList(), it, cg_state);
205            }
206        }
207    //cout << "\n" << "Alt => (" << StatementPrinter::PrintStmts(cg_state) << ")" << "\n" << endl;
208    }
209    else if (Rep* rep = dynamic_cast<Rep*>(re))
210    {
[3914]211        if ((dynamic_cast<Name*>(rep->getRE()) != 0) && (rep->getLB() == 0) && (rep->getUB()== unboundedRep))
[3850]212        {
[3956]213            //std::cout << "Matchstar Name!" << std::endl;
[3914]214
215            Name* rep_name = dynamic_cast<Name*>(rep->getRE());
[3850]216            std::string gs_retVal = symgen.gensym("marker");
[3956]217
218            if (rep_name->getType() == Name::FixedLength)
219            {
220                cg_state.stmtsl.push_back(new Assign(gs_retVal, new MatchStar(new Var(cg_state.newsym), new CharClass(rep_name->getName()))));
221            }
222            else if (rep_name->getType() == Name::UnicodeCategory)
223            {
224                // TODO:  ?? not too sure....
225            }
226            else //Name::unicode
227            {
228                std::string t_retVal = symgen.gensym("t");
229                std::string u_retVal = symgen.gensym("u");
230                std::string v_retVal = symgen.gensym("v");
231                std::string new_cur_retVal = symgen.gensym("new_cur");
232
233                cg_state.stmtsl.push_back(new Assign(t_retVal, new Or(new CharClass(m_name_map.find("internal.nonfinal")->second), new CharClass(rep_name->getName()))));
234                cg_state.stmtsl.push_back(new Assign(u_retVal, new MatchStar(new Var(cg_state.newsym), new Var(t_retVal))));
235                cg_state.stmtsl.push_back(new Assign(v_retVal, new And(new Var(u_retVal), new CharClass(m_name_map.find("internal.initial")->second))));
236                cg_state.stmtsl.push_back(new Assign(new_cur_retVal, new And(new Var(u_retVal), new Not(new Var(t_retVal)))));
237
238                cg_state.stmtsl.push_back(new Assign(gs_retVal, new Or(new Var(v_retVal), new Var(new_cur_retVal))));
239
240            }
241
[3850]242            cg_state.newsym = gs_retVal;
243        }
[3914]244        else if (rep->getUB() == unboundedRep)
[3850]245        {
246            if (rep->getLB() == 0)
247            {
[3914]248                //std::cout << "While, no lb." << std::endl;
[3850]249
250                std::string while_test_gs_retVal = symgen.gensym("while_test");
251                std::string while_accum_gs_retVal = symgen.gensym("while_accum");
252                CodeGenState while_test_state;
253                while_test_state.newsym = while_test_gs_retVal;
254                CodeGenState t1_cg_state = re2pablo_helper(rep->getRE(), while_test_state);
255                cg_state.stmtsl.push_back(new Assign(while_test_gs_retVal, new Var(cg_state.newsym)));
256                cg_state.stmtsl.push_back(new Assign(while_accum_gs_retVal, new Var(cg_state.newsym)));
257                std::list<PabloS*> stmtList;
258                stmtList = t1_cg_state.stmtsl;
259                stmtList.push_back(new Assign(while_test_gs_retVal, new And(new Var(t1_cg_state.newsym), new Not(new Var(while_accum_gs_retVal)))));
260                stmtList.push_back(new Assign(while_accum_gs_retVal, new Or(new Var(while_accum_gs_retVal), new Var(t1_cg_state.newsym))));
261                cg_state.stmtsl.push_back( new While(new Var(while_test_gs_retVal), stmtList));
262                cg_state.newsym = while_accum_gs_retVal;
263            }
264            else //if (rep->getLB() > 1)
265            {
266                CodeGenState t1_cg_state = re2pablo_helper(rep->getRE(), cg_state);
267                rep->setLB(rep->getLB() - 1);
268                cg_state = re2pablo_helper(rep, t1_cg_state);
269            }
270        }
[3914]271        else if (rep->getUB() != unboundedRep)
[3850]272        {
[3914]273            if ((rep->getLB() == 0) && (rep->getUB() == 0))
[3850]274            {
275                //Just fall through...do nothing.
276            }
[3914]277            else if ((rep->getLB() == 0) && (rep->getUB() > 0))
[3850]278            {
279                CodeGenState t1_cg_state = re2pablo_helper(rep->getRE(), cg_state);
[3914]280                rep->setUB(rep->getUB() - 1);
[3850]281                CodeGenState t2_cg_state = re2pablo_helper(re, t1_cg_state);
282                std::string gs_retVal = symgen.gensym("alt_marker");
283                cg_state.stmtsl = t2_cg_state.stmtsl;
284                cg_state.stmtsl.push_back(new Assign(gs_retVal, new Or(new Var(cg_state.newsym), new Var(t2_cg_state.newsym))));
285                cg_state.newsym = gs_retVal;
286            }
[3914]287            else //if ((rep->getLB() > 0) && (rep->getUB() > 0))
[3850]288            {
289                CodeGenState t1_cg_state = re2pablo_helper(rep->getRE(), cg_state);
290                rep->setLB(rep->getLB() - 1);
[3914]291                rep->setUB(rep->getUB() - 1);
[3850]292                cg_state = re2pablo_helper(rep, t1_cg_state);
293            }
294        }
295    }
296
297    return cg_state;
298}
299
300CodeGenState Pbix_Compiler::Seq_helper(std::list<RE*>* lst, std::list<RE*>::const_iterator it, CodeGenState cg_state)
301{
302    if (it != lst->end())
303    {
304        cg_state = re2pablo_helper(*it, cg_state);
305        cg_state = Seq_helper(lst, ++it, cg_state);
306    }
307
308    return cg_state;
309}
310
311CodeGenState Pbix_Compiler::Alt_helper(std::list<RE*>* lst, std::list<RE*>::const_iterator it, CodeGenState cg_state)
312{
313    CodeGenState t1_cg_state = re2pablo_helper(*it, cg_state);
314    cg_state.stmtsl = t1_cg_state.stmtsl;
315    ++it;
316    if (it != lst->end())
317    {
318        CodeGenState t2_cg_state = Alt_helper(lst, it, cg_state);
319        cg_state.stmtsl = t2_cg_state.stmtsl;
320        std::string gs_retVal = symgen.gensym("alt_marker");
321        cg_state.stmtsl.push_back(new Assign(gs_retVal, new Or(new Var(t1_cg_state.newsym), new Var(t2_cg_state.newsym))));
322        cg_state.newsym = gs_retVal;
323    }
324    else
325    {
326        cg_state.newsym = t1_cg_state.newsym;
327    }
328
329    return cg_state;
330}
331
Note: See TracBrowser for help on using the repository browser.