Ignore:
Timestamp:
Sep 29, 2014, 2:11:34 PM (5 years ago)
Author:
nmedfort
Message:

More refactoring of the RE system; moved the original re/RE_Compiler to compiler.cpp and the PBIX_Compiler to the re/RE_Compiler.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/pbix_compiler.cpp

    r4195 r4197  
    66
    77#include "pbix_compiler.h"
    8 //Regular Expressions
    9 #include "re/re_name.h"
    10 #include "re/re_start.h"
    11 #include "re/re_end.h"
    12 #include "re/re_seq.h"
    13 #include "re/re_alt.h"
    14 #include "re/re_rep.h"
    15 
    16 //Pablo Expressions
    17 #include "pe_pabloe.h"
    18 #include "pe_sel.h"
    19 #include "pe_advance.h"
    20 #include "pe_all.h"
    21 #include "pe_and.h"
    22 #include "pe_charclass.h"
    23 #include "pe_call.h"
    24 #include "pe_matchstar.h"
    25 #include "pe_scanthru.h"
    26 #include "pe_not.h"
    27 #include "pe_or.h"
    28 #include "pe_var.h"
    29 #include "pe_xor.h"
    30 
    31 //Pablo Statements
    32 #include "ps_pablos.h"
    33 #include "ps_assign.h"
    34 #include "ps_if.h"
    35 #include "ps_while.h"
    36 
    37 #include <assert.h>
    38 #include <stdexcept>
    39 
    40 using namespace re;
    41 
    42 Pbix_Compiler::Pbix_Compiler(std::map<std::string, std::string> name_map)
    43 {
    44     m_name_map = name_map;
    45     symgen = SymbolGenerator();
    46 }
    47 
    48 CodeGenState Pbix_Compiler::compile_subexpressions(const std::map<std::string, RE*>& re_map)
    49 {
    50     CodeGenState cg_state;
    51     for (auto i =  re_map.rbegin(); i != re_map.rend(); ++i) {
    52         //This is specifically for the utf8 multibyte character classes.
    53         if (Seq * seq = dyn_cast<Seq>(i->second)) {
    54             if (seq->getType() == Seq::Type::Byte) {
    55                 std::string gs_retVal = symgen.gensym("start_marker");
    56                 cg_state.stmtsl.push_back(new Assign(gs_retVal, new All(1)));               
    57                 for (auto j = seq->begin();; ) {
    58                     Name * name = dyn_cast<Name>(*j);
    59                     assert (name);
    60                     And * cc_mask = new And(new Var(gs_retVal), new CharClass(name->getName()));
    61                     if (++j != seq->end()) {
    62                         gs_retVal = symgen.gensym("marker");
    63                         cg_state.stmtsl.push_back(new Assign(gs_retVal, new Advance(cc_mask)));
    64                     }
    65                     else {
    66                         cg_state.stmtsl.push_back(new Assign(seq->getName(), cc_mask));
    67                         break;
    68                     }
    69                 }
    70                 cg_state.newsym = gs_retVal;
    71             }
    72         }
    73     }
    74     return cg_state;
    75 }
    76 
    77 CodeGenState Pbix_Compiler::compile(RE *re)
    78 {   
    79     CodeGenState cg_state;
    80 
    81     std::string gs_m0 = symgen.gensym("start_marker");
    82     cg_state.stmtsl.push_back(new Assign(gs_m0, new All(1)));
    83 
    84     if (hasUnicode(re))
    85     {
    86         cg_state.newsym = gs_m0;
    87         //Set the 'internal.initial' bit stream for the utf-8 multi-byte encoding.
    88         std::string gs_initial = symgen.gensym("internal.initial");
    89         m_name_map.insert(make_pair("internal.initial", gs_initial));
    90         PabloE * u8single = new Var(m_name_map.find("UTF8-SingleByte")->second);
    91         PabloE * u8pfx2 = new Var(m_name_map.find("UTF8-Prefix2")->second);
    92         PabloE * u8pfx3 = new Var(m_name_map.find("UTF8-Prefix3")->second);
    93         PabloE * u8pfx4 = new Var(m_name_map.find("UTF8-Prefix4")->second);
    94         PabloE * u8pfx = new Or(new Or(u8pfx2, u8pfx3), u8pfx4);
    95         cg_state.stmtsl.push_back(new Assign(gs_initial, new Or(u8pfx, u8single)));
    96         cg_state.newsym = gs_initial;
    97 
    98         //Set the 'internal.nonfinal' bit stream for the utf-8 multi-byte encoding.
    99         cg_state.newsym = gs_m0;
    100         std::string gs_nonfinal = symgen.gensym("internal.nonfinal");
    101         m_name_map.insert(make_pair("internal.nonfinal", gs_nonfinal));
    102         //#define USE_IF_FOR_NONFINAL
    103         #ifdef USE_IF_FOR_NONFINAL
    104         cg_state.stmtsl.push_back(new Assign(gs_nonfinal, new All(0)));
    105         #endif
    106         PabloE * u8scope32 = new Advance(u8pfx3);
    107         PabloE * u8scope42 = new Advance(u8pfx4);
    108         PabloE * u8scope43 = new Advance(u8scope42);
    109         PabloS * assign_non_final = new Assign(gs_nonfinal, new Or(new Or(u8pfx, u8scope32), new Or(u8scope42, u8scope43)));
    110         #ifdef USE_IF_FOR_NONFINAL
    111         std::list<PabloS *> * if_body = new std::list<PabloS *> ();
    112         if_body->push_back(assign_non_final);
    113         cg_state.stmtsl.push_back(new If(u8pfx, *if_body));
    114         #else
    115         cg_state.stmtsl.push_back(assign_non_final);
    116         #endif
    117         cg_state.newsym = gs_nonfinal;
    118     }
    119 
    120     cg_state.newsym = gs_m0;
    121     cg_state = re2pablo_helper(re, cg_state);
    122 
    123     //These three lines are specifically for grep.
    124     std::string gs_retVal = symgen.gensym("marker");
    125     cg_state.stmtsl.push_back(new Assign(gs_retVal, new And(new MatchStar(new Var(cg_state.newsym),
    126         new Not(new Var(m_name_map.find("LineFeed")->second))), new Var(m_name_map.find("LineFeed")->second))));
    127     cg_state.newsym = gs_retVal;
    128 
    129     return cg_state;
    130 }
    131 
    132 CodeGenState Pbix_Compiler::re2pablo_helper(RE *re, CodeGenState cg_state)
    133 {
    134     if (Name* name = dyn_cast<Name>(re))
    135     {
    136         std::string gs_retVal = symgen.gensym("marker");
    137         PabloE* markerExpr = new Var(cg_state.newsym);
    138         if (name->getType() != Name::Type::FixedLength) {
    139             // Move the markers forward through any nonfinal UTF-8 bytes to the final position of each character.
    140             markerExpr = new And(markerExpr, new CharClass(m_name_map.find("internal.initial")->second));
    141             markerExpr = new ScanThru(markerExpr, new CharClass(m_name_map.find("internal.nonfinal")->second));
    142         }       
    143         PabloE* ccExpr;
    144         if (name->getType() == Name::Type::UnicodeCategory)
    145         {
    146             ccExpr = new Call(name->getName());
    147         }
    148         else
    149         {
    150             ccExpr = new CharClass(name->getName());
    151         }
    152         if (name->isNegated()) {
    153             ccExpr = new Not(new Or(new Or(ccExpr, new CharClass(m_name_map.find("LineFeed")->second)),
    154                                     new CharClass(m_name_map.find("internal.nonfinal")->second)));
    155         }
    156         cg_state.stmtsl.push_back(new Assign(gs_retVal, new Advance(new And(ccExpr, markerExpr))));
    157         cg_state.newsym = gs_retVal;
    158     }
    159     else if (isa<Start>(re))
    160     {
    161         std::string gs_retVal = symgen.gensym("start_of_line_marker");
    162         cg_state.stmtsl.push_back(new Assign(gs_retVal, new And(new Var(cg_state.newsym), new Not(new Advance(new Not(new CharClass(m_name_map.find("LineFeed")->second)))))));
    163         cg_state.newsym = gs_retVal;
    164     }
    165     else if (isa<End>(re))
    166     {
    167         std::string gs_retVal = symgen.gensym("end_of_line_marker");
    168         cg_state.stmtsl.push_back(new Assign(gs_retVal, new And(new Var(cg_state.newsym), new CharClass(m_name_map.find("LineFeed")->second))));
    169         cg_state.newsym = gs_retVal;
    170     }
    171     else if (Seq* seq = dyn_cast<Seq>(re))
    172     {
    173         if (!seq->empty())
    174         {
    175             cg_state = Seq_helper(seq, seq->begin(), cg_state);
    176         }
    177     }
    178     else if (Alt* alt = dyn_cast<Alt>(re))
    179     {
    180         if (alt->empty())
    181         {
    182             std::string gs_retVal = symgen.gensym("always_fail_marker");
    183             cg_state.stmtsl.push_back(new Assign(gs_retVal, new All(0)));
    184             cg_state.newsym = gs_retVal;
    185         }
    186         else
    187         {
    188             if (alt->size() == 1)
    189             {
    190                 cg_state = re2pablo_helper(alt->back(), cg_state);
    191             }
    192             else
    193             {
    194                 cg_state = Alt_helper(alt, alt->begin(), cg_state);
    195             }
    196         }
    197 
    198     }
    199     else if (Rep* rep = dyn_cast<Rep>(re))
    200     {
    201         if (isa<Name>(rep->getRE()) && (rep->getLB() == 0) && (rep->getUB()== Rep::UNBOUNDED_REP))
    202         {
    203             Name* rep_name = dyn_cast<Name>(rep->getRE());
    204             std::string gs_retVal = symgen.gensym("marker");
    205 
    206             PabloE* ccExpr;
    207             if (rep_name->getType() == Name::Type::UnicodeCategory)
    208             {
    209                 ccExpr = new Call(rep_name->getName());
    210             }
    211             else
    212             {
    213                 ccExpr = new CharClass(rep_name->getName());
    214             }
    215 
    216             if (rep_name->isNegated()) {
    217                 ccExpr = new Not(new Or(new Or(ccExpr, new CharClass(m_name_map.find("LineFeed")->second)),
    218                                         new CharClass(m_name_map.find("internal.nonfinal")->second)));
    219             }
    220             if (rep_name->getType() == Name::Type::FixedLength)
    221             {
    222                 cg_state.stmtsl.push_back(new Assign(gs_retVal, new MatchStar(new Var(cg_state.newsym), ccExpr)));
    223             }
    224             else //Name::Unicode and Name::UnicodeCategory
    225             {
    226                 cg_state.stmtsl.push_back(new Assign(gs_retVal,
    227                     new And(new MatchStar(new Var(cg_state.newsym), new Or(new CharClass(m_name_map.find("internal.nonfinal")->second),
    228                     ccExpr)), new CharClass(m_name_map.find("internal.initial")->second))));
    229             }
    230 
    231             cg_state.newsym = gs_retVal;
    232         }
    233         else if (rep->getUB() == Rep::UNBOUNDED_REP)
    234         {
    235             cg_state = UnboundedRep_helper(rep->getRE(), rep->getLB(), cg_state);
    236         }
    237         else if (rep->getUB() != Rep::UNBOUNDED_REP)
    238         {
    239             cg_state = BoundedRep_helper(rep->getRE(), rep->getLB(), rep->getUB(), cg_state);
    240         }
    241     }
    242 
    243     return cg_state;
    244 }
    245 
    246 
    247 CodeGenState Pbix_Compiler::Seq_helper(Vector *lst, const_iterator it, CodeGenState cg_state)
    248 {
    249     if (it != lst->end())
    250     {
    251         cg_state = re2pablo_helper(*it, cg_state);
    252         cg_state = Seq_helper(lst, ++it, cg_state);
    253     }
    254 
    255     return cg_state;
    256 }
    257 
    258 CodeGenState Pbix_Compiler::Alt_helper(Vector* lst, const_iterator it, CodeGenState cg_state)
    259 {
    260     CodeGenState t1_cg_state = re2pablo_helper(*it, cg_state);
    261     cg_state.stmtsl = t1_cg_state.stmtsl;
    262     ++it;
    263     if (it != lst->end())
    264     {
    265         CodeGenState t2_cg_state = Alt_helper(lst, it, cg_state);
    266         cg_state.stmtsl = t2_cg_state.stmtsl;
    267         std::string gs_retVal = symgen.gensym("alt_marker");
    268         cg_state.stmtsl.push_back(new Assign(gs_retVal, new Or(new Var(t1_cg_state.newsym), new Var(t2_cg_state.newsym))));
    269         cg_state.newsym = gs_retVal;
    270     }
    271     else
    272     {
    273         cg_state.newsym = t1_cg_state.newsym;
    274     }
    275 
    276     return cg_state;
    277 }
    278 
    279 CodeGenState Pbix_Compiler::UnboundedRep_helper(RE* repeated, int lb, CodeGenState cg_state) {
    280     if (lb == 0)
    281     {
    282          std::string while_test_gs_retVal = symgen.gensym("while_test");
    283          std::string while_accum_gs_retVal = symgen.gensym("while_accum");
    284          CodeGenState while_test_state;
    285          while_test_state.newsym = while_test_gs_retVal;
    286          CodeGenState t1_cg_state = re2pablo_helper(repeated, while_test_state);
    287          cg_state.stmtsl.push_back(new Assign(while_test_gs_retVal, new Var(cg_state.newsym)));
    288          cg_state.stmtsl.push_back(new Assign(while_accum_gs_retVal, new Var(cg_state.newsym)));
    289          std::list<PabloS*> stmtList;
    290          stmtList = t1_cg_state.stmtsl;
    291          stmtList.push_back(new Assign(while_test_gs_retVal, new And(new Var(t1_cg_state.newsym), new Not(new Var(while_accum_gs_retVal)))));
    292          stmtList.push_back(new Assign(while_accum_gs_retVal, new Or(new Var(while_accum_gs_retVal), new Var(t1_cg_state.newsym))));
    293          cg_state.stmtsl.push_back( new While(new Var(while_test_gs_retVal), stmtList));
    294          cg_state.newsym = while_accum_gs_retVal;
    295     }
    296     else //if (lb > 0)
    297     {
    298          CodeGenState t1_cg_state = re2pablo_helper(repeated, cg_state);
    299          cg_state = UnboundedRep_helper(repeated, lb -1, t1_cg_state);
    300     }
    301     return cg_state;
    302 }
    303 
    304 
    305 CodeGenState Pbix_Compiler::BoundedRep_helper(RE* repeated, int lb, int ub, CodeGenState cg_state) {
    306     if ((lb == 0) && (ub == 0))
    307     {
    308     //Just fall through...do nothing.
    309     }
    310     else if ((lb == 0) && (ub > 0))
    311     {
    312          CodeGenState t1_cg_state = re2pablo_helper(repeated, cg_state);
    313          CodeGenState t2_cg_state = BoundedRep_helper(repeated, 0, ub-1, t1_cg_state);
    314          std::string gs_retVal = symgen.gensym("alt_marker");
    315          cg_state.stmtsl = t2_cg_state.stmtsl;
    316          cg_state.stmtsl.push_back(new Assign(gs_retVal, new Or(new Var(cg_state.newsym), new Var(t2_cg_state.newsym))));
    317          cg_state.newsym = gs_retVal;
    318     }
    319     else //if ((lb > 0) && (ub > 0))
    320     {
    321          CodeGenState t1_cg_state = re2pablo_helper(repeated, cg_state);
    322          cg_state = BoundedRep_helper(repeated, lb-1, ub-1, t1_cg_state);
    323     }
    324     return cg_state;
    325 }
    326 
    327 
    328 bool Pbix_Compiler::hasUnicode(const RE * re) {
    329     bool found = false;
    330     if (re == nullptr) {
    331         throw std::runtime_error("Unexpected Null Value passed to RE Compiler!");
    332     }
    333     else if (const Name * name = dyn_cast<const Name>(re)) {
    334         if ((name->getType() == Name::Type::UnicodeCategory) || (name->getType() == Name::Type::Unicode)) {
    335             found = true;
    336         }
    337     }
    338     else if (const Seq * re_seq = dyn_cast<const Seq>(re)) {
    339         for (auto i = re_seq->cbegin(); i != re_seq->cend(); ++i) {
    340             if (hasUnicode(*i)) {
    341                 found = true;
    342                 break;
    343             }
    344         }
    345     }
    346     else if (const Alt * re_alt = dyn_cast<const Alt>(re)) {
    347         for (auto i = re_alt->cbegin(); i != re_alt->cend(); ++i) {
    348             if (hasUnicode(*i)) {
    349                 found = true;
    350                 break;
    351             }
    352         }
    353     }
    354     else if (const Rep * rep = dyn_cast<const Rep>(re)) {
    355         found = hasUnicode(rep->getRE());
    356     }
    357     return found;
    358 }
Note: See TracChangeset for help on using the changeset viewer.