Changeset 4123


Ignore:
Timestamp:
Sep 1, 2014, 9:39:38 PM (4 years ago)
Author:
cameron
Message:

Add CRs as line breaks: treat CR (0x0D), in addition to LF, as a line-break character.

Location:
icGREP/icgrep-devel/icgrep
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/llvm_gen.cpp

    r4118 r4123  
    424424    Generate_PabloStatements(cg_state.stmtsl);
    425425    SetReturnMarker(cg_state.newsym, 0);
    426     SetReturnMarker(m_name_map.find("LineFeed")->second, 1);
     426    SetReturnMarker(m_name_map.find("LF")->second, 1);
    427427
    428428    //Terminate the block
  • icGREP/icgrep-devel/icgrep/pbix_compiler.cpp

    r4122 r4123  
    1212    m_name_map = name_map;
    1313    symgen = SymbolGenerator();
     14}
     15
     16std::string Pbix_Compiler::lookupCC(std::string ccname) {
     17    return m_name_map.find(ccname)->second;
    1418}
    1519
     
    6064    std::string gs_m0 = symgen.gensym("start_marker");
    6165    cg_state.stmtsl.push_back(new Assign(gs_m0, new All(1)));
     66   
     67    std::string gs_linebreak = symgen.gensym("LineBreak");
     68    m_name_map.insert(make_pair("LineBreak", gs_linebreak));
     69        cg_state.stmtsl.push_back(new Assign(gs_linebreak, new Or(new Var(lookupCC("LF")), new Var(lookupCC("CR")))));
     70   
    6271
    6372    if (unicode_re(re))
    6473    {
    6574        cg_state.newsym = gs_m0;
    66         //Set the 'internal.initial' bit stream for the utf-8 multi-byte encoding.
    67         std::string gs_initial = symgen.gensym("internal.initial");
    68         m_name_map.insert(make_pair("internal.initial", gs_initial));
    69         PabloE * u8single = new Var(m_name_map.find("UTF8-SingleByte")->second);
    70         PabloE * u8pfx2 = new Var(m_name_map.find("UTF8-Prefix2")->second);
    71         PabloE * u8pfx3 = new Var(m_name_map.find("UTF8-Prefix3")->second);
    72         PabloE * u8pfx4 = new Var(m_name_map.find("UTF8-Prefix4")->second);
     75        //Set the 'utf8.initial' bit stream for the utf-8 multi-byte encoding.
     76        std::string gs_initial = symgen.gensym("utf8.initial");
     77        m_name_map.insert(make_pair("utf8.initial", gs_initial));
     78        PabloE * u8single = new Var(lookupCC("UTF8-SingleByte"));
     79        PabloE * u8pfx2 = new Var(lookupCC("UTF8-Prefix2"));
     80        PabloE * u8pfx3 = new Var(lookupCC("UTF8-Prefix3"));
     81        PabloE * u8pfx4 = new Var(lookupCC("UTF8-Prefix4"));
    7382        PabloE * u8pfx = new Or(new Or(u8pfx2, u8pfx3), u8pfx4);
    74         cg_state.stmtsl.push_back(new Assign(gs_initial, new Or(u8pfx, u8single)));
     83            cg_state.stmtsl.push_back(new Assign(gs_initial, new Or(u8pfx, u8single)));
    7584        cg_state.newsym = gs_initial;
    7685
    77         //Set the 'internal.nonfinal' bit stream for the utf-8 multi-byte encoding.
     86        //Set the 'utf8.nonfinal' bit stream for the utf-8 multi-byte encoding.
    7887        cg_state.newsym = gs_m0;
    79         std::string gs_nonfinal = symgen.gensym("internal.nonfinal");
    80         m_name_map.insert(make_pair("internal.nonfinal", gs_nonfinal));
     88        std::string gs_nonfinal = symgen.gensym("utf8.nonfinal");
     89        m_name_map.insert(make_pair("utf8.nonfinal", gs_nonfinal));
    8190//#define USE_IF_FOR_NONFINAL
    8291#ifdef USE_IF_FOR_NONFINAL
     
    104113    std::string gs_retVal = symgen.gensym("marker");
    105114    cg_state.stmtsl.push_back(new Assign(gs_retVal, new And(new MatchStar(new Var(cg_state.newsym),
    106         new Not(new Var(m_name_map.find("LineFeed")->second))), new Var(m_name_map.find("LineFeed")->second))));
     115        new Not(new Var(lookupCC("LineBreak")))), new Var(lookupCC("LineBreak")))));
    107116    cg_state.newsym = gs_retVal;
    108117
     
    118127        if (name->getType() != Name::FixedLength) {
    119128            // Move the markers forward through any nonfinal UTF-8 bytes to the final position of each character.
    120             markerExpr = new And(markerExpr, new CharClass(m_name_map.find("internal.initial")->second));
    121             markerExpr = new ScanThru(markerExpr, new CharClass(m_name_map.find("internal.nonfinal")->second));
     129            markerExpr = new And(markerExpr, new CharClass(lookupCC("utf8.initial")));
     130            markerExpr = new ScanThru(markerExpr, new CharClass(lookupCC("utf8.nonfinal")));
    122131        }       
    123132        PabloE* ccExpr;
     
    131140        }
    132141        if (name->isNegated()) {
    133             ccExpr = new Not(new Or(new Or(ccExpr, new CharClass(m_name_map.find("LineFeed")->second)),
    134                                     new CharClass(m_name_map.find("internal.nonfinal")->second)));
     142            ccExpr = new Not(new Or(new Or(ccExpr, new CharClass(lookupCC("LineBreak"))),
     143                                    new CharClass(lookupCC("utf8.nonfinal"))));
    135144        }
    136145        cg_state.stmtsl.push_back(new Assign(gs_retVal, new Advance(new And(ccExpr, markerExpr))));
     
    142151    {
    143152        std::string gs_retVal = symgen.gensym("start_of_line_marker");
    144         cg_state.stmtsl.push_back(new Assign(gs_retVal, new And(new Var(cg_state.newsym), new Not(new Advance(new Not(new CharClass(m_name_map.find("LineFeed")->second)))))));
     153        PabloE * CR_start = new And(new Advance(new Var(lookupCC("CR"))), new Not(new Var(lookupCC("LF"))));
     154        PabloE * LF_start = new Not(new Advance(new Not(new Var(lookupCC("LF")))));
     155        cg_state.stmtsl.push_back(new Assign(gs_retVal, new And(new Var(cg_state.newsym), new Or(CR_start, LF_start))));
    145156        cg_state.newsym = gs_retVal;
    146157    }
     
    148159    {
    149160        std::string gs_retVal = symgen.gensym("end_of_line_marker");
    150         cg_state.stmtsl.push_back(new Assign(gs_retVal, new And(new Var(cg_state.newsym), new CharClass(m_name_map.find("LineFeed")->second))));
     161        cg_state.stmtsl.push_back(new Assign(gs_retVal, new And(new Var(cg_state.newsym), new CharClass(lookupCC("LineBreak")))));
    151162        cg_state.newsym = gs_retVal;
    152163    }
     
    201212
    202213            if (rep_name->isNegated()) {
    203                 ccExpr = new Not(new Or(new Or(ccExpr, new CharClass(m_name_map.find("LineFeed")->second)),
    204                                         new CharClass(m_name_map.find("internal.nonfinal")->second)));
     214                ccExpr = new Not(new Or(new Or(ccExpr, new CharClass(lookupCC("LineBreak"))),
     215                                        new CharClass(lookupCC("utf8.nonfinal"))));
    205216            }
    206217            if (rep_name->getType() == Name::FixedLength)
     
    211222            {
    212223                cg_state.stmtsl.push_back(new Assign(gs_retVal,
    213                     new And(new MatchStar(new Var(cg_state.newsym), new Or(new CharClass(m_name_map.find("internal.nonfinal")->second),
    214                     ccExpr)), new CharClass(m_name_map.find("internal.initial")->second))));
     224                    new And(new MatchStar(new Var(cg_state.newsym), new Or(new CharClass(lookupCC("utf8.nonfinal")),
     225                    ccExpr)), new CharClass(lookupCC("utf8.initial")))));
    215226            }
    216227
  • icGREP/icgrep-devel/icgrep/pbix_compiler.h

    r4122 r4123  
    5858public:
    5959    Pbix_Compiler(std::map<std::string, std::string> name_map);
     60    std::string lookupCC(std::string ccname);
    6061    CodeGenState compile(RE *re);
    6162    CodeGenState compile_subexpressions(const std::map<std::string, RE*>& re_map);
  • icGREP/icgrep-devel/icgrep/re_compiler.cpp

    r3976 r4123  
    7676    cc_name = cc_lf->getName();
    7777    re_map.insert(make_pair(cc_name, cc_lf));
    78     name_map.insert(make_pair("LineFeed", cc_name));
    79 
     78    name_map.insert(make_pair("LF", cc_name));
     79   
     80    CC* cc_cr = new CC(0x0D);
     81    cc_name = cc_cr->getName();
     82    re_map.insert(make_pair(cc_name, cc_cr));
     83    name_map.insert(make_pair("CR", cc_name));
     84   
    8085    CC* cc_utf8_single_byte = new CC(0x00, 0x7F);
    8186    cc_name = cc_utf8_single_byte->getName();
     
    97102    re_map.insert(make_pair(cc_name, cc_utf8_prefix4));
    98103    name_map.insert(make_pair("UTF8-Prefix4", cc_name));
     104   
     105   
    99106
    100107    CC_Compiler cc_compiler(encoding);
Note: See TracChangeset for help on using the changeset viewer.