Ignore:
Timestamp:
Aug 4, 2014, 9:46:49 AM (5 years ago)
Author:
daled
Message:

icGREP now uses ScanThru for multibyte Unicode character classes.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/pbix_compiler.cpp

    r3940 r3955  
    77#include "pbix_compiler.h"
    88
    9 Pbix_Compiler::Pbix_Compiler(std::string lf_ccname)
    10 {
    11     m_lf_ccname = lf_ccname;
     9Pbix_Compiler::Pbix_Compiler(std::map<std::string, std::string> name_map)
     10{
     11    m_name_map = name_map;
    1212    symgen = SymbolGenerator();
     13}
     14
     15CodeGenState Pbix_Compiler::compile_subexpressions(const std::map<std::string, RE*>& re_map)
     16{
     17    CodeGenState cg_state;
     18
     19    for (auto it =  re_map.rbegin(); it != re_map.rend(); ++it)
     20    {
     21        if (Seq* seq = dynamic_cast<Seq*>(it->second))
     22        {
     23            if (seq->getType() == Seq::Byte)
     24            {
     25                std::string gs_retVal = symgen.gensym("start_marker");
     26                cg_state.stmtsl.push_back(new Assign(gs_retVal, new All(1)));
     27                cg_state.newsym = gs_retVal;
     28
     29                std::list<RE*>::iterator endit;
     30                endit = seq->GetREList()->end();
     31                --endit;
     32                std::list<RE*>::iterator it;
     33                for (it = seq->GetREList()->begin(); it != seq->GetREList()->end(); ++it)
     34                {
     35                    Name* name = dynamic_cast<Name*>(*it);
     36                    if (it != endit)
     37                    {
     38                        gs_retVal = symgen.gensym("marker");
     39                        cg_state.stmtsl.push_back(new Assign(gs_retVal, new Advance(new And(new Var(cg_state.newsym), new CharClass(name->getName())))));
     40                        cg_state.newsym = gs_retVal;
     41                    }
     42                    else
     43                    {
     44                        cg_state.stmtsl.push_back(new Assign(seq->getName(), new And(new Var(cg_state.newsym), new CharClass(name->getName()))));
     45                    }
     46                }
     47            }
     48        }
     49    }
     50
     51    return cg_state;
    1352}
    1453
     
    1655{   
    1756    std::string gs_retVal;
     57    CodeGenState cg_state;
     58
     59    //Set the 'internal.initial' bit stream for the utf-8 multi-byte encoding.
    1860    gs_retVal = symgen.gensym("start_marker");
    19 
    20     CodeGenState cg_state;
    2161    cg_state.stmtsl.push_back(new Assign(gs_retVal, new All(1)));
    2262    cg_state.newsym = gs_retVal;
    2363
     64    std::string gs_retVal_m1 = symgen.gensym("marker");
     65    cg_state.stmtsl.push_back(new Assign(gs_retVal_m1, new And(new Var(m_name_map.find("UTF8-SingleByte")->second), new Var(cg_state.newsym))));
     66
     67    std::string gs_retVal_m2 = symgen.gensym("marker");
     68    cg_state.stmtsl.push_back(new Assign(gs_retVal_m2, new And(new Var(m_name_map.find("UTF8-Prefix2")->second), new Var(cg_state.newsym))));
     69
     70    std::string gs_retVal_m3 = symgen.gensym("marker");
     71    cg_state.stmtsl.push_back(new Assign(gs_retVal_m3, new And(new Var(m_name_map.find("UTF8-Prefix3")->second), new Var(cg_state.newsym))));
     72
     73    std::string gs_retVal_m4 = symgen.gensym("marker");
     74    cg_state.stmtsl.push_back(new Assign(gs_retVal_m4, new And(new Var(m_name_map.find("UTF8-Prefix4")->second), new Var(cg_state.newsym))));
     75
     76    std::string gs_retVal_m5 = symgen.gensym("marker");
     77    cg_state.stmtsl.push_back(new Assign(gs_retVal_m5, new Or(new Var(gs_retVal_m2), new Var(gs_retVal_m1))));
     78
     79    std::string gs_retVal_m6 = symgen.gensym("marker");
     80    cg_state.stmtsl.push_back(new Assign(gs_retVal_m6, new Or(new Var(gs_retVal_m5), new Var(gs_retVal_m3))));
     81
     82    gs_retVal = symgen.gensym("internal.initial");
     83    m_name_map.insert(make_pair("internal.initial", gs_retVal));
     84    cg_state.stmtsl.push_back(new Assign(gs_retVal, new Or(new Var(gs_retVal_m6), new Var(gs_retVal_m4))));
     85    cg_state.newsym = gs_retVal;
     86
     87    //Set the 'internal.nonfinal' bit stream for the utf-8 multi-byte encoding.
     88    gs_retVal = symgen.gensym("start_marker");
     89    cg_state.stmtsl.push_back(new Assign(gs_retVal, new All(1)));
     90    cg_state.newsym = gs_retVal;
     91
     92    gs_retVal_m1 = symgen.gensym("marker");
     93    cg_state.stmtsl.push_back(new Assign(gs_retVal_m1, new And(new Var(m_name_map.find("UTF8-Prefix2")->second), new Var(cg_state.newsym))));
     94
     95    gs_retVal_m2 = symgen.gensym("marker");
     96    cg_state.stmtsl.push_back(new Assign(gs_retVal_m2, new And(new Var(m_name_map.find("UTF8-Prefix3")->second), new Var(cg_state.newsym))));
     97
     98    gs_retVal_m3 = symgen.gensym("marker");
     99    cg_state.stmtsl.push_back(new Assign(gs_retVal_m3, new Advance(new Var(gs_retVal_m2))));
     100
     101    gs_retVal_m4 = symgen.gensym("marker");
     102    cg_state.stmtsl.push_back(new Assign(gs_retVal_m4, new And(new Var(m_name_map.find("UTF8-Prefix4")->second), new Var(cg_state.newsym))));
     103
     104    gs_retVal_m5 = symgen.gensym("marker");
     105    cg_state.stmtsl.push_back(new Assign(gs_retVal_m5, new Advance(new Var(gs_retVal_m4))));
     106
     107    gs_retVal_m6 = symgen.gensym("marker");
     108    cg_state.stmtsl.push_back(new Assign(gs_retVal_m6, new Advance(new Var(gs_retVal_m5))));
     109
     110    std::string gs_retVal_m7 = symgen.gensym("marker");
     111    cg_state.stmtsl.push_back(new Assign(gs_retVal_m7, new Or(new Var(gs_retVal_m2), new Var(gs_retVal_m1))));
     112
     113    std::string gs_retVal_m8 = symgen.gensym("marker");
     114    cg_state.stmtsl.push_back(new Assign(gs_retVal_m8, new Or(new Var(gs_retVal_m7), new Var(gs_retVal_m3))));
     115
     116    std::string gs_retVal_m9 = symgen.gensym("marker");
     117    cg_state.stmtsl.push_back(new Assign(gs_retVal_m9, new Or(new Var(gs_retVal_m8), new Var(gs_retVal_m4))));
     118
     119    std::string gs_retVal_m10 = symgen.gensym("marker");
     120    cg_state.stmtsl.push_back(new Assign(gs_retVal_m10, new Or(new Var(gs_retVal_m9), new Var(gs_retVal_m5))));
     121
     122    gs_retVal = symgen.gensym("internal.nonfinal");
     123    m_name_map.insert(make_pair("internal.nonfinal", gs_retVal));
     124    cg_state.stmtsl.push_back(new Assign(gs_retVal, new Or(new Var(gs_retVal_m10), new Var(gs_retVal_m6))));
     125    cg_state.newsym = gs_retVal;
     126
     127
     128    gs_retVal = symgen.gensym("start_marker");
     129    cg_state.stmtsl.push_back(new Assign(gs_retVal, new All(1)));
     130    cg_state.newsym = gs_retVal;
    24131    cg_state = re2pablo_helper(re, cg_state);
    25132
    26133    //These three lines are specifically for grep.
    27134    gs_retVal = symgen.gensym("marker");
    28     cg_state.stmtsl.push_back(new Assign(gs_retVal, new And(new MatchStar(new Var(cg_state.newsym), new Not(new Var(m_lf_ccname))), new Var(m_lf_ccname))));
     135    cg_state.stmtsl.push_back(new Assign(gs_retVal, new And(new MatchStar(new Var(cg_state.newsym),
     136                                new Not(new Var(m_name_map.find("LineFeed")->second))), new Var(m_name_map.find("LineFeed")->second))));
    29137    cg_state.newsym = gs_retVal;
    30138
     
    38146        std::string gs_retVal = symgen.gensym("marker");
    39147
    40         PabloE* expr;
    41         if (name->getType() == Name::UnicodeCategory)
    42             expr = new Call(name->getName());
    43         else
    44             expr =  new CharClass(name->getName());
    45 
    46         cg_state.stmtsl.push_back(new Assign(gs_retVal, new Advance(new And(new Var(cg_state.newsym), expr))));
     148        if (name->getType() == Name::FixedLength)
     149        {
     150            cg_state.stmtsl.push_back(new Assign(gs_retVal, new Advance(new And(new Var(cg_state.newsym), new CharClass(name->getName())))));
     151        }
     152        else if (name->getType() == Name::UnicodeCategory)
     153        {
     154            cg_state.stmtsl.push_back(new Assign(gs_retVal, new Advance(new And(new Var(cg_state.newsym), new Call(name->getName())))));
     155        }
     156        else //Name::Unicode
     157        {
     158            cg_state.stmtsl.push_back(new Assign(gs_retVal, new Advance(new And(new CharClass(name->getName()), new ScanThru(new Var(cg_state.newsym), new Var(m_name_map.find("internal.nonfinal")->second))))));
     159        }
    47160        cg_state.newsym = gs_retVal;
    48161
     
    52165    {
    53166        std::string gs_retVal = symgen.gensym("start_of_line_marker");
    54         cg_state.stmtsl.push_back(new Assign(gs_retVal, new And(new Var(cg_state.newsym), new Not(new Advance(new Not(new CharClass(m_lf_ccname)))))));
     167        cg_state.stmtsl.push_back(new Assign(gs_retVal, new And(new Var(cg_state.newsym), new Not(new Advance(new Not(new CharClass(m_name_map.find("LineFeed")->second)))))));
    55168        cg_state.newsym = gs_retVal;
    56169    }
     
    58171    {
    59172        std::string gs_retVal = symgen.gensym("end_of_line_marker");
    60         cg_state.stmtsl.push_back(new Assign(gs_retVal, new And(new Var(cg_state.newsym), new CharClass(m_lf_ccname))));
     173        cg_state.stmtsl.push_back(new Assign(gs_retVal, new And(new Var(cg_state.newsym), new CharClass(m_name_map.find("LineFeed")->second))));
    61174        cg_state.newsym = gs_retVal;
    62175    }
Note: See TracChangeset for help on using the changeset viewer.