Changeset 4249


Ignore:
Timestamp:
Oct 18, 2014, 5:43:45 PM (4 years ago)
Author:
nmedfort
Message:

Big update to use CC_NameMap; removed CharClass and RE_Reducer.

Location:
icGREP/icgrep-devel/icgrep
Files:
4 deleted
24 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/CMakeLists.txt

    r4247 r4249  
    5353#find_package(Boost 1.21 COMPONENTS system)
    5454
    55 add_library(PabloADT pablo/pe_advance.cpp pablo/pe_and.cpp pablo/pe_call.cpp pablo/pe_charclass.cpp  pablo/pe_matchstar.cpp pablo/pe_scanthru.cpp pablo/pe_not.cpp  pablo/pe_or.cpp  pablo/pabloAST.cpp  pablo/pe_sel.cpp  pablo/pe_var.cpp  pablo/pe_xor.cpp pablo/ps_assign.cpp  pablo/ps_if.cpp  pablo/codegenstate.cpp  pablo/symbol_generator.cpp pablo/ps_while.cpp pablo/printer_pablos.cpp pablo/pablo_compiler.cpp)
    56 add_library(RegExpADT re/re_alt.cpp re/re_cc.cpp re/re_end.cpp re/re_name.cpp re/re_parser.cpp re/re_re.cpp re/re_rep.cpp re/re_seq.cpp re/re_start.cpp re/parsefailure.cpp re/re_reducer.cpp re/re_nullable.cpp re/re_simplifier.cpp re/re_compiler.cpp re/printer_re.cpp)
     55add_library(PabloADT pablo/pe_advance.cpp pablo/pe_and.cpp pablo/pe_call.cpp pablo/pe_matchstar.cpp pablo/pe_scanthru.cpp pablo/pe_not.cpp  pablo/pe_or.cpp  pablo/pabloAST.cpp  pablo/pe_sel.cpp  pablo/pe_var.cpp  pablo/pe_xor.cpp pablo/ps_assign.cpp  pablo/ps_if.cpp  pablo/codegenstate.cpp  pablo/symbol_generator.cpp pablo/ps_while.cpp pablo/printer_pablos.cpp pablo/pablo_compiler.cpp)
     56add_library(RegExpADT re/re_alt.cpp re/re_cc.cpp re/re_end.cpp re/re_name.cpp re/re_parser.cpp re/re_re.cpp re/re_rep.cpp re/re_seq.cpp re/re_start.cpp re/parsefailure.cpp re/re_nullable.cpp re/re_simplifier.cpp re/re_compiler.cpp re/printer_re.cpp)
    5757add_library(CCADT cc/cc_namemap.cpp cc/cc_compiler.cpp utf_encoding.cpp utf8_encoder.cpp unicode_categories.h)
    5858
     
    6868add_executable(icgrep icgrep.cpp compiler.cpp)
    6969
    70 target_link_libraries (CCADT PabloADT)
     70target_link_libraries (CCADT PabloADT RegExpADT)
     71target_link_libraries (PabloADT ${REQ_LLVM_LIBRARIES})
    7172target_link_libraries (icgrep PabloADT RegExpADT CCADT ${REQ_LLVM_LIBRARIES})
    7273# If Boost is on the system, include the headers and libraries
  • icGREP/icgrep-devel/icgrep/cc/cc_compiler.cpp

    r4247 r4249  
    1515#include <re/re_rep.h>
    1616#include <re/re_name.h>
     17#include <re/printer_re.h>
     18#include <cc/cc_namemap.hpp>
    1719
    1820#include <utility>
     
    4143}
    4244
    43 void CC_Compiler::compile(const RENameMap & re_map) {
    44     for (auto i =  re_map.cbegin(); i != re_map.cend(); ++i) {
    45         if (const CC * cc = dyn_cast<CC>(i->second)) {
    46             // If we haven't already computed this CC, map it to the (pablo) charset statements.
    47             if (mComputedSet.insert(cc->getName()).second) {
    48                 mCG.createAssign(cc->getName(), charset_expr(cc));
    49             }
    50         }
    51         else if (const Seq* seq = dyn_cast<Seq>(i->second)) {
    52             //This is specifically for the utf8 multibyte character classes.
    53             assert (seq->getType() == Seq::Type::Byte);
    54             Assign * assignment = nullptr;
    55             auto j = seq->begin();
    56             while (true) {
    57                 Name * name = dyn_cast<Name>(*j);
    58                 assert (name);
    59                 CharClass * cc = mCG.createCharClass(name->getName());
    60                 PabloAST * sym = assignment ? mCG.createAnd(mCG.createVar(assignment->getName()), cc) : cc;
    61                 if (++j != seq->end()) {
    62                     assignment = mCG.createAssign(mCG.ssa("marker"), mCG.createAdvance(sym));
    63                     continue;
    64                 }
    65                 mCG.createAssign(seq->getName(), sym);
    66                 break;
    67             }
    68         }
    69     }
    70 }
    71 
    72 inline PabloAST * CC_Compiler::charset_expr(const CC * cc) {
     45void CC_Compiler::compile(const CC_NameMap & nameMap) {
     46    for (Name * name : nameMap) {
     47        compile_re(name);
     48    }
     49}
     50
     51PabloAST * CC_Compiler::compile_re(RE * re) {
     52    if (isa<Name>(re)) {
     53        return compile_re(cast<Name>(re));
     54    }
     55    else if (isa<Alt>(re)) {
     56        return compile_re(cast<Alt>(re));
     57    }
     58    else if (isa<Seq>(re)) {
     59        return compile_re(cast<Seq>(re));
     60    }
     61    else if (isa<CC>(re)) {
     62
     63    }
     64    throw std::runtime_error("Unexpected RE node given to CC_Compiler: " + Printer_RE::PrintRE(re));
     65}
     66
     67PabloAST * CC_Compiler::compile_re(Name * name) {
     68    assert(name);
     69    Var * var = name->getVar();
     70    if (var == nullptr) {       
     71        if (name->getType() == Name::Type::FixedLength) {
     72            RE * cc = name->getCC();
     73            assert (cc);
     74            PabloAST * value = nullptr;
     75            if (isa<CC>(cc)) {
     76                value = charset_expr(cast<CC>(cc));
     77            }
     78            else if (isa<Seq>(cc)) {
     79                value = compile_re(cast<Seq>(cc));
     80            }
     81            else if (isa<Alt>(cc)) {
     82                value = compile_re(cast<Alt>(cc));
     83            }
     84            if (value == nullptr) {
     85                throw std::runtime_error("Unexpected CC node given to CC_Compiler: " + Printer_RE::PrintRE(name) + " : " + Printer_RE::PrintRE(cc));
     86            }
     87            mCG.createAssign(name->getName(), value);
     88        }
     89        var = mCG.createVar(name->getName());
     90        name->setVar(var);
     91    }
     92    return var;
     93}
     94
     95PabloAST * CC_Compiler::compile_re(const Seq * seq) {
     96    Assign * assignment = nullptr;
     97    PabloAST * result = nullptr;
     98    auto i = seq->begin();
     99    while (true) {
     100        PabloAST * cc = compile_re(*i);
     101        result = assignment ? mCG.createAnd(mCG.createVar(assignment), cc) : cc;
     102        if (++i == seq->end()) {
     103            break;
     104        }
     105        assignment = mCG.createAssign(mCG.ssa("seq"), mCG.createAdvance(result));
     106    }
     107    return result;
     108}
     109
     110PabloAST * CC_Compiler::compile_re(const Alt *alt) {
     111    Assign * assignment = nullptr;
     112    PabloAST * result = nullptr;
     113    auto i = alt->begin();
     114    while (true) {
     115        PabloAST * cc = compile_re(*i);
     116        result = assignment ? mCG.createOr(mCG.createVar(assignment), cc) : cc;
     117        if (++i == alt->end()) {
     118            break;
     119        }
     120        assignment = mCG.createAssign(mCG.ssa("alt"), result);
     121    }
     122    return result;
     123}
     124
     125
     126PabloAST * CC_Compiler::charset_expr(const CC * cc) {
    73127    if (cc->empty()) {
    74128        return mCG.createZeroes();
     
    103157                    bit0 = mCG.createNot(bit0);
    104158                }
    105                 return mCG.createAnd(expr, bit0);
     159                return tempify(mCG.createAnd(expr, bit0));
    106160            }
    107161        }
     
    110164    for (const CharSetItem & item : *cc) {
    111165        PabloAST * temp = char_or_range_expr(item.lo_codepoint, item.hi_codepoint);
    112         expr = (expr == nullptr) ? temp : mCG.createOr(expr, temp);
     166        expr = (expr == nullptr) ? temp : tempify(mCG.createOr(expr, temp));
    113167    }
    114168    return expr;
     
    150204    {
    151205        std::vector<PabloAST*> new_terms;
    152         for (unsigned long i = 0; i < (bit_terms.size()/2); i++)
    153         {
    154             new_terms.push_back(mCG.createAnd(bit_terms[(2 * i) + 1], bit_terms[2 * i]));
     206        for (auto i = 0; i < (bit_terms.size()/2); i++)
     207        {
     208            new_terms.push_back(tempify(mCG.createAnd(bit_terms[(2 * i) + 1], bit_terms[2 * i])));
    155209        }
    156210        if (bit_terms.size() % 2 == 1)
     
    158212            new_terms.push_back(bit_terms[bit_terms.size() -1]);
    159213        }
    160         bit_terms.assign(new_terms.begin(), new_terms.end());
     214        bit_terms.swap(new_terms);
    161215    }
    162216    return bit_terms[0];
    163217}
    164218
    165 PabloAST * CC_Compiler::char_test_expr(const CodePointType ch)
     219inline PabloAST * CC_Compiler::char_test_expr(const CodePointType ch)
    166220{
    167221    return bit_pattern_expr(ch, mEncoding.getMask());
     
    190244    PabloAST* hi_test = LE_Range(diff_count - 1, n2 & mask1);
    191245
    192     return mCG.createAnd(common, mCG.createSel(getBasisVar(diff_count - 1), hi_test, lo_test));
     246    return tempify(mCG.createAnd(common, mCG.createSel(getBasisVar(diff_count - 1), hi_test, lo_test)));
    193247}
    194248
    195249PabloAST * CC_Compiler::GE_Range(const unsigned N, const unsigned n) {
    196     if (N == 0)
    197     {
     250    if (N == 0) {
    198251        return mCG.createOnes(); //Return a true literal.
    199252    }
    200     else if (((N % 2) == 0) && ((n >> (N - 2)) == 0))
    201     {
    202         return mCG.createOr(mCG.createOr(getBasisVar(N - 1), getBasisVar(N - 2)), GE_Range(N - 2, n));
    203     }
    204     else if (((N % 2) == 0) && ((n >> (N - 2)) == 3))
    205     {
    206         return mCG.createAnd(mCG.createAnd(getBasisVar(N - 1), getBasisVar(N - 2)), GE_Range(N - 2, n - (3 << (N - 2))));
     253    else if (((N % 2) == 0) && ((n >> (N - 2)) == 0)) {
     254        return tempify(mCG.createOr(tempify(mCG.createOr(getBasisVar(N - 1), getBasisVar(N - 2))), GE_Range(N - 2, n)));
     255    }
     256    else if (((N % 2) == 0) && ((n >> (N - 2)) == 3)) {
     257        return tempify(mCG.createAnd(tempify(mCG.createAnd(getBasisVar(N - 1), getBasisVar(N - 2))), GE_Range(N - 2, n - (3 << (N - 2)))));
    207258    }
    208259    else if (N >= 1)
     
    218269              the value of GE_range(N-1), lo_range) is required.
    219270            */
    220             return mCG.createOr(getBasisVar(N - 1), lo_range);
     271            return tempify(mCG.createOr(getBasisVar(N - 1), lo_range));
    221272        }
    222273        else
     
    226277              in the target for >= and GE_range(N-1, lo_bits) must also be true.
    227278            */
    228             return mCG.createAnd(getBasisVar(N - 1), lo_range);
     279            return tempify(mCG.createAnd(getBasisVar(N - 1), lo_range));
    229280        }
    230281    }
     
    242293    }
    243294    else {
    244         return mCG.createNot(GE_Range(N, n + 1));
     295        return tempify(mCG.createNot(GE_Range(N, n + 1)));
    245296    }
    246297}
     
    260311}
    261312
     313inline PabloAST * CC_Compiler::tempify(PabloAST * value) {
     314//    if (isa<Var>(value)) {
     315//        return cast<Var>(value);
     316//    }
     317//    return mCG.createVar(mCG.createAssign(mCG.ssa("t"), value));
     318    return value;
     319}
     320
    262321} // end of namespace cc
  • icGREP/icgrep-devel/icgrep/cc/cc_compiler.h

    r4244 r4249  
    1414#include <unordered_map>
    1515#include <string>
    16 #include <set>
    17 #include <re/re_reducer.h>
    1816
    1917namespace cc {
    2018
     19class CC_NameMap;
     20
    2121class CC_Compiler{
    22     typedef std::set<std::string>           ComputedSet;
    2322public:
    2423
     
    2726    CC_Compiler(pablo::PabloBlock & cg, const Encoding encoding, const std::string basis_pattern = "basis");
    2827
    29     void compile(const re::RENameMap & re_map);
     28    void compile(const CC_NameMap & nameMap);
    3029
    3130    const BasisBitVars & getBasisBitVars() const {
     
    3433
    3534private:
     35
     36    pablo::PabloAST * compile_re(re::RE * re);
     37    pablo::PabloAST * compile_re(re::Name * name);
     38    pablo::PabloAST * compile_re(const re::Alt * alt);
     39    pablo::PabloAST * compile_re(const re::Seq *seq);
     40
    3641    pablo::Var * getBasisVar(const int n) const;
    3742    pablo::PabloAST * bit_pattern_expr(const unsigned pattern, unsigned selected_bits);
     
    4247    pablo::PabloAST * char_or_range_expr(const re::CodePointType lo, const re::CodePointType hi);
    4348    pablo::PabloAST * charset_expr(const re::CC *cc);
    44 
     49    pablo::PabloAST * tempify(pablo::PabloAST * value);
    4550    pablo::PabloBlock &         mCG;
    4651    BasisBitVars                mBasisBit;
    4752    const Encoding              mEncoding;
    48     ComputedSet                 mComputedSet;
    4953};
    5054
  • icGREP/icgrep-devel/icgrep/cc/cc_namemap.cpp

    r4246 r4249  
    66#include <re/re_rep.h>
    77
     8#include <re/printer_re.h>
     9#include <iostream>
     10
    811using namespace re;
    912
    1013namespace cc {
    1114
    12 void CC_NameMap::addPredefined(const std::string friendlyName, const re::CC * cc) {
    13     Name * name = makeName(friendlyName, cc);
    14     mNameMap.insert(std::make_pair(friendlyName, name));
    15     insert(std::move(cc->getName()), name);
     15void CC_NameMap::addPredefined(const std::string friendlyName, re::CC * cc) {
     16    assert (cc);
     17    std::string classname = cc->getName();
     18    Name * name = makeName(classname, cc);
     19    assert (name->getCC() == cc);
     20    mNameMap.insert(std::make_pair(friendlyName, name));   
     21    insert(std::move(classname), name);
     22    assert (name->getCC() == cc);
     23}
     24
     25void CC_NameMap::clear() {
     26    mNameMap.clear();
     27    mNameVector.clear();
    1628}
    1729
     
    3143    }
    3244    else if (Name * name = dyn_cast<Name>(re)) {
    33         const std::string classname = name->getName();
     45        RE * cc = name->getCC();
     46        if (cc && !isa<CC>(cc)) {
     47            name->setCC(process(cc));
     48        }
     49        std::string classname = name->getName();
    3450        auto f = mNameMap.find(classname);
    3551        if (f == mNameMap.end()) {
     
    3955    }
    4056    else if (CC * cc = dyn_cast<CC>(re)) {
    41         const std::string classname = cc->getName();
     57        std::string classname = cc->getName();
    4258        auto f = mNameMap.find(classname);
    4359        if (f == mNameMap.end()) {
  • icGREP/icgrep-devel/icgrep/cc/cc_namemap.hpp

    r4246 r4249  
    2323    CC_NameMap() {}
    2424
     25    void clear();
     26
    2527    re::RE * process(re::RE * re);
    2628
    27     void addPredefined(const std::string friendlyName, const re::CC * cc);
     29    void addPredefined(const std::string friendlyName, re::CC * cc);
    2830
    29     re::Name * operator[](const std::string & name) const {
     31    inline const re::Name * operator[](const std::string & name) const {
    3032        auto f = mNameMap.find(name);
    3133        if (f == mNameMap.end()) {
     
    4547private:
    4648
    47     inline re::Name * insert(const std::string && name, re::Name * re) {
     49    inline re::Name * insert(std::string && name, re::Name * re) {
    4850        mNameMap.insert(std::make_pair(std::move(name), re));
    4951        mNameVector.push_back(re);
  • icGREP/icgrep-devel/icgrep/compiler.cpp

    r4246 r4249  
    1414#include <re/re_nullable.h>
    1515#include <re/re_simplifier.h>
    16 #include <re/re_reducer.h>
    1716#include <re/parsefailure.h>
    1817#include <re/re_parser.h>
     
    2120#include <utf8_encoder.h>
    2221#include <cc/cc_compiler.h>
     22#include <cc/cc_namemap.hpp>
    2323#include <pablo/pablo_compiler.h>
    24 
    25 //FOR TESTING AND AND ANALYSIS
    26 //#include "pbix_counter.h"
    2724
    2825//#define DEBUG_PRINT_RE_AST
     
    7168    #endif
    7269
     70    CC_NameMap nameMap;
     71    re_ast = nameMap.process(re_ast);
     72
     73    #ifdef DEBUG_PRINT_RE_AST
     74    std::cerr << "Namer:" << std::endl << Printer_RE::PrintRE(re_ast) << std::endl;
     75    #endif
     76
    7377    //Add the UTF encoding.
    74     if (encoding.getType() == Encoding::Type::UTF_8)
    75     {
    76         re_ast = UTF8_Encoder::toUTF8(re_ast);
     78    if (encoding.getType() == Encoding::Type::UTF_8) {
     79        re_ast = UTF8_Encoder::toUTF8(nameMap, re_ast);
     80        #ifdef DEBUG_PRINT_RE_AST
     81        //Print to the terminal the AST that was generated by the utf8 encoder.
     82        std::cerr << "UTF8-encoder:" << std::endl << Printer_RE::PrintRE(re_ast) << std::endl;
     83        #endif
    7784    }
    7885
    79     #ifdef DEBUG_PRINT_RE_AST
    80     //Print to the terminal the AST that was generated by the utf8 encoder.
    81     std::cerr << "UTF8-encoder:" << std::endl << Printer_RE::PrintRE(re_ast) << std::endl;
    82     #endif
     86    // note: system is cumbersome at the moment; this needs to be done AFTER toUTF8.
     87    nameMap.addPredefined("LineFeed", makeCC(0x0A));
    8388
    8489    re_ast = RE_Simplifier::simplify(re_ast);
     
    8893    #endif
    8994
    90     //Map all of the unique character classes in order to reduce redundancy.
    91     RENameMap re_map;
    92     re_ast = RE_Reducer::reduce(re_ast, re_map);
    93 
    94     #ifdef DEBUG_PRINT_RE_AST
    95     //Print to the terminal the AST with the reduced REs.
    96     std::cerr << "Reducer:" << std::endl << Printer_RE::PrintRE(re_ast) << std::endl;
    97     #endif
    98 
    99     //Build our list of predefined characters.
    100     std::string cc_name;
    101     std::map<std::string,std::string> name_map;
    102 
    103     CC* cc_lf = makeCC(0x0A);
    104     cc_name = cc_lf->getName();
    105     re_map.insert(make_pair(cc_name, cc_lf));
    106     name_map.insert(make_pair("LineFeed", cc_name));
    107 
    108     CC* cc_utf8_single_byte = makeCC(0x00, 0x7F);
    109     cc_name = cc_utf8_single_byte->getName();
    110     re_map.insert(make_pair(cc_name, cc_utf8_single_byte));
    111     name_map.insert(make_pair("UTF8-SingleByte", cc_name));
    112 
    113     CC* cc_utf8_prefix2 = makeCC(0xC2, 0xDF);
    114     cc_name = cc_utf8_prefix2->getName();
    115     re_map.insert(make_pair(cc_name, cc_utf8_prefix2));
    116     name_map.insert(make_pair("UTF8-Prefix2", cc_name));
    117 
    118     CC* cc_utf8_prefix3 = makeCC(0xE0, 0xEF);
    119     cc_name = cc_utf8_prefix3->getName();
    120     re_map.insert(make_pair(cc_name, cc_utf8_prefix3));
    121     name_map.insert(make_pair("UTF8-Prefix3", cc_name));
    122 
    123     CC* cc_utf8_prefix4 = makeCC(0xF0, 0xF4);
    124     cc_name = cc_utf8_prefix4->getName();
    125     re_map.insert(make_pair(cc_name, cc_utf8_prefix4));
    126     name_map.insert(make_pair("UTF8-Prefix4", cc_name));
    127 
    128 
    12995    SymbolGenerator symgen;
    13096    PabloBlock main(symgen);
    13197
    13298    CC_Compiler cc_compiler(main, encoding);
    133     cc_compiler.compile(re_map);
     99    cc_compiler.compile(nameMap);
    134100    #ifdef DEBUG_PRINT_PBIX_AST
    135101    //Print to the terminal the AST that was generated by the character class compiler.
     
    137103    #endif
    138104
    139     RE_Compiler re_compiler(main, name_map);
     105    RE_Compiler re_compiler(main, nameMap);
    140106    re_compiler.compile(re_ast);
    141107    #ifdef DEBUG_PRINT_PBIX_AST
     
    144110    #endif
    145111
    146     PabloCompiler pablo_compiler(name_map, cc_compiler.getBasisBitVars(), encoding.getBits());
     112    PabloCompiler pablo_compiler(nameMap, cc_compiler.getBasisBitVars(), encoding.getBits());
    147113    unsigned long long cycles = 0;
    148114    double timer = 0;
  • icGREP/icgrep-devel/icgrep/icgrep-devel.files

    r4244 r4249  
    185185pablo/printer_pablos.h
    186186CMakeLists.txt
     187cc/cc_namemap.cpp
     188cc/cc_namemap.hpp
  • icGREP/icgrep-devel/icgrep/pablo/codegenstate.cpp

    r4244 r4249  
    1717Call * PabloBlock::createCall(const std::string name) {
    1818    return mUnary.findOrMake<Call>(PabloAST::ClassTypeId::Call, mSymbolGenerator[name]);
    19 }
    20 
    21 CharClass * PabloBlock::createCharClass(const std::string name) {
    22     return mUnary.findOrMake<CharClass>(PabloAST::ClassTypeId::CharClass, mSymbolGenerator[name]);
    2319}
    2420
     
    4541
    4642Assign * PabloBlock::createAssign(const std::string name, PabloAST * expr) {
    47 //    auto key = std::make_tuple(PabloAST::ClassTypeId::Assign, expr);
    48 //    Assign * assign = cast<Assign>(mUnary.find(key));
    49 //    if (assign == nullptr) {
    50 //        assign = new Assign(mSymbolGenerator[name], expr);
    51 //        mUnary.insert(std::move(key), assign);
    52 //    }
    53 //    else {
    54 //        assign = new Assign(mSymbolGenerator[name], createVar(assign));
    55 //    }
    56     Assign * assign = mBinary.findOrMake<Assign>(PabloAST::ClassTypeId::Assign, mSymbolGenerator[name], expr);
     43    auto key = std::make_tuple(PabloAST::ClassTypeId::Assign, expr);
     44    Assign * assign = cast_or_null<Assign>(mUnary.find(key));
     45    if (assign == nullptr) {
     46        assign = new Assign(mSymbolGenerator[name], expr);
     47        mUnary.insert(std::move(key), assign);
     48    }
     49    else {
     50        assign = new Assign(mSymbolGenerator[name], createVar(assign));
     51    }
     52//    Assign * assign = mBinary.findOrMake<Assign>(PabloAST::ClassTypeId::Assign, mSymbolGenerator[name], expr);
    5753    mStatements.push_back(assign);
    5854    return assign;
  • icGREP/icgrep-devel/icgrep/pablo/codegenstate.h

    r4247 r4249  
    1313#include <pablo/pe_and.h>
    1414#include <pablo/pe_call.h>
    15 #include <pablo/pe_charclass.h>
    1615#include <pablo/pe_matchstar.h>
    1716#include <pablo/pe_not.h>
     
    8281        return isa<Assign>(input) ? createVar(cast<Assign>(input)) : input;
    8382    }
    84 
    85     CharClass * createCharClass(const std::string name);
    8683
    8784    PabloAST * createAnd(PabloAST * expr1, PabloAST * expr2);
  • icGREP/icgrep-devel/icgrep/pablo/pabloAST.cpp

    r4247 r4249  
    99#include "pe_and.h"
    1010#include "pe_call.h"
    11 #include "pe_charclass.h"
    1211#include "pe_matchstar.h"
    1312#include "pe_not.h"
  • icGREP/icgrep-devel/icgrep/pablo/pablo_compiler.cpp

    r4247 r4249  
    1414#include <pablo/codegenstate.h>
    1515#include <pablo/printer_pablos.h>
     16#include <cc/cc_namemap.hpp>
     17#include <re/re_name.h>
    1618#include <stdexcept>
    1719#include <include/simd-lib/bitblock.hpp>
    1820
    19 // #define DUMP_GENERATED_IR
    20 // #define DUMP_OPTIMIZED_IR
     21//#define DUMP_GENERATED_IR
     22//#define DUMP_OPTIMIZED_IR
    2123
    2224extern "C" {
     
    7274namespace pablo {
    7375
    74 PabloCompiler::PabloCompiler(std::map<std::string, std::string> name_map, const BasisBitVars & basisBitVars, int bits)
     76PabloCompiler::PabloCompiler(const cc::CC_NameMap & nameMap, const BasisBitVars & basisBitVars, int bits)
    7577: mBits(bits)
    76 , m_name_map(name_map)
    7778, mBasisBitVars(basisBitVars)
    7879, mMod(new Module("icgrep", getGlobalContext()))
     
    9394, mPtr_carry_q_addr(nullptr)
    9495, mPtr_output_addr(nullptr)
     96, mNameMap(nameMap)
    9597{
    9698    //Create the jit execution engine.
     
    178180    //Generate the IR instructions for the function.
    179181    SetReturnMarker(compileStatements(cg_state.expressions()), 0); // matches
    180     SetReturnMarker(GetMarker(m_name_map.find("LineFeed")->second), 1); // line feeds
     182    SetReturnMarker(GetMarker(mNameMap["LineFeed"]->getName()), 1); // line feeds
    181183
    182184    assert (mCarryQueueIdx == mCarryQueueSize);
     
    681683        retVal = b.CreateXor(expr_value, mAllOneInitializer, "not");
    682684    }
    683     else if (const CharClass * cc = dyn_cast<CharClass>(expr))
    684     {
    685         retVal = b.CreateLoad(GetMarker(cc->getCharClass()));
    686     }
    687685    else if (const Advance * adv = dyn_cast<Advance>(expr))
    688686    {
     
    701699        Value* marker_expr = compileExpression(sthru->getScanFrom());
    702700        Value* cc_expr = compileExpression(sthru->getScanThru());
    703         retVal = b.CreateAnd(genAddWithCarry(marker_expr, cc_expr), genNot(cc_expr), "scanthru_rslt");
     701        retVal = b.CreateAnd(genAddWithCarry(marker_expr, cc_expr), genNot(cc_expr), "scanthru");
    704702    }
    705703    return retVal;
     
    820818    Constant* const_packed_1_elems [] = {b.getInt32(0), b.getInt32(2)};
    821819    Constant* const_packed_1 = ConstantVector::get(const_packed_1_elems);
    822     packed_shuffle = b.CreateShuffleVector(carryq_value, srli_1_value, const_packed_1, "packed_shuffle nw");
     820    packed_shuffle = b.CreateShuffleVector(carryq_value, srli_1_value, const_packed_1);
    823821
    824822    Constant* const_packed_2_elems[] = {b.getInt64(1), b.getInt64(1)};
    825823    Constant* const_packed_2 = ConstantVector::get(const_packed_2_elems);
    826824
    827     Value* shl_value = b.CreateShl(strm_value, const_packed_2, "shl_value");
    828     Value* result_value = b.CreateOr(shl_value, packed_shuffle, "or.result_value");
     825    Value* shl_value = b.CreateShl(strm_value, const_packed_2);
     826    Value* result_value = b.CreateOr(shl_value, packed_shuffle, "advance");
    829827
    830828    Value* carry_out = genShiftHighbitToLow(strm_value, "carry_out");
    831829    //CarryQ - carry out:
    832     Value* void_1 = genCarryOutStore(carry_out, mptr_carry_q, this_carry_idx);
     830    genCarryOutStore(carry_out, mptr_carry_q, this_carry_idx);
    833831
    834832    return result_value;
  • icGREP/icgrep-devel/icgrep/pablo/pablo_compiler.h

    r4244 r4249  
    8585public:
    8686    typedef cc::CC_Compiler::BasisBitVars BasisBitVars;
    87     PabloCompiler(std::map<std::string, std::string> name_map, const BasisBitVars & basisBitVars, int bits);
     87    PabloCompiler(const cc::CC_NameMap & nameMap, const BasisBitVars & basisBitVars, int bits);
    8888    ~PabloCompiler();
    8989    LLVM_Gen_RetVal compile(const PabloBlock & cg_state);
     
    108108    Value* genAdvanceWithCarry(Value* e1);
    109109    Value* genBitBlockAny(Value* e);
    110     Value* genShiftHighbitToLow(Value* e, const Twine &namehint = "");
    111     Value* genShiftLeft64(Value* e, const Twine &namehint = "") ;
    112     Value* genNot(Value* e, const Twine &namehint = "");
     110    Value* genShiftHighbitToLow(Value* e, const Twine & namehint = "");
     111    Value* genShiftLeft64(Value* e, const Twine & namehint = "") ;
     112    Value* genNot(Value* e, const Twine & namehint = "");
    113113
    114114    #ifdef USE_UADD_OVERFLOW
     
    120120
    121121
    122     int                                 mBits;
    123     std::map<std::string, std::string>  m_name_map;
     122    int                                 mBits;   
    124123    const BasisBitVars &                mBasisBitVars;
    125124
     
    148147    AllocaInst*                         mPtr_carry_q_addr;
    149148    AllocaInst*                         mPtr_output_addr;
     149
     150    const cc::CC_NameMap &              mNameMap;
    150151};
    151152
  • icGREP/icgrep-devel/icgrep/pablo/printer_pablos.cpp

    r4247 r4249  
    2121#include <pablo/pe_and.h>
    2222#include <pablo/pe_call.h>
    23 #include <pablo/pe_charclass.h>
    2423#include <pablo/pe_matchstar.h>
    2524#include <pablo/pe_not.h>
     
    126125        retVal = "Not (" + ShowPabloAST(pablo_not->getExpr()) + ")";
    127126    }
    128     else if (const CharClass * cc = dyn_cast<const CharClass>(expr))
    129     {
    130         retVal = "CharClass '" + cc->getCharClass() + "'";
    131     }
    132127    else if (const Advance * adv = dyn_cast<const Advance>(expr))
    133128    {
  • icGREP/icgrep-devel/icgrep/re/re_alt.h

    r4206 r4249  
    3535private:
    3636    template<typename iterator>
    37     void construct(iterator begin, iterator end, std::queue<CC*> & ccQ) {
     37    void flatten(iterator begin, iterator end, std::queue<CC*> & ccQ) {
    3838        for (auto i = begin; i != end; ++i) {
    3939            if (Alt * alt = dyn_cast<Alt>(*i)) {
    40                 construct(alt->begin(), alt->end(), ccQ);
     40                flatten(alt->begin(), alt->end(), ccQ);
    4141                continue;
    4242            }
     
    6868    Alt * alt = makeAlt();
    6969    std::queue<CC*> ccQ;
    70     alt->construct(begin, end, ccQ);
     70    alt->flatten(begin, end, ccQ);
    7171    if (!ccQ.empty()) {
    7272        while (ccQ.size() > 1) {
  • icGREP/icgrep-devel/icgrep/re/re_compiler.cpp

    r4247 r4249  
    1515#include <re/re_seq.h>
    1616#include <re/re_rep.h>
    17 
    18 
    19 //Pablo Expressions
     17#include <cc/cc_namemap.hpp>
    2018#include <pablo/codegenstate.h>
    21 #include <pablo/pe_advance.h>
    22 #include <pablo/pe_zeroes.h>
    23 #include <pablo/pe_ones.h>
    24 #include <pablo/pe_and.h>
    25 #include <pablo/pe_call.h>
    26 #include <pablo/pe_charclass.h>
    27 #include <pablo/pe_matchstar.h>
    28 #include <pablo/pe_not.h>
    29 #include <pablo/pe_or.h>
    30 #include <pablo/pabloAST.h>
    31 #include <pablo/pe_scanthru.h>
    32 #include <pablo/pe_sel.h>
    33 #include <pablo/pe_var.h>
    34 #include <pablo/pe_xor.h>
    35 #include <pablo/ps_assign.h>
    36 #include <pablo/ps_if.h>
    37 #include <pablo/ps_while.h>
    3819
    3920#include <assert.h>
     
    4728namespace re {
    4829
    49 RE_Compiler::RE_Compiler(PabloBlock & baseCG, std::map<std::string, std::string> name_map)
     30RE_Compiler::RE_Compiler(PabloBlock & baseCG, const cc::CC_NameMap & nameMap)
    5031: mCG(baseCG)
    5132, mLineFeed(nullptr)
    5233, mInitial(nullptr)
    5334, mNonFinal(nullptr)
    54 , m_name_map(name_map)
     35, mNameMap(nameMap)
    5536{
    5637
     
    5940void RE_Compiler::compile(RE * re, PabloBlock & cg) {
    6041
    61     mLineFeed = cg.createVar(m_name_map.find("LineFeed")->second);
     42    mLineFeed = mNameMap["LineFeed"]->getVar();
    6243
    6344    const std::string initial = "initial";
     
    6647    if (hasUnicode(re)) {
    6748        //Set the 'internal.initial' bit stream for the utf-8 multi-byte encoding.       
    68         PabloAST * u8single = cg.createVar(m_name_map.find("UTF8-SingleByte")->second);
    69         PabloAST * u8pfx2 = cg.createVar(m_name_map.find("UTF8-Prefix2")->second);
    70         PabloAST * u8pfx3 = cg.createVar(m_name_map.find("UTF8-Prefix3")->second);
    71         PabloAST * u8pfx4 = cg.createVar(m_name_map.find("UTF8-Prefix4")->second);
     49        PabloAST * u8single = mNameMap["UTF8-SingleByte"]->getVar();
     50        PabloAST * u8pfx2 = mNameMap["UTF8-Prefix2"]->getVar();
     51        PabloAST * u8pfx3 = mNameMap["UTF8-Prefix3"]->getVar();
     52        PabloAST * u8pfx4 = mNameMap["UTF8-Prefix4"]->getVar();
    7253        PabloAST * u8pfx = cg.createOr(cg.createOr(u8pfx2, u8pfx3), u8pfx4);
    7354        mInitial = cg.createVar(cg.createAssign(initial, cg.createOr(u8pfx, u8single)));
     
    9576
    9677    //These three lines are specifically for grep.
    97     cg.createAssign(cg.ssa("marker"), cg.createAnd(cg.createMatchStar(cg.createVarIfAssign(result), cg.createNot(mLineFeed)), mLineFeed));
     78    cg.createAssign(cg.ssa("matches"), cg.createAnd(cg.createMatchStar(cg.createVarIfAssign(result), cg.createNot(mLineFeed)), mLineFeed));
    9879}
    9980
     
    144125    }
    145126    else {
    146         cc = cg.createCharClass(name->getName());
     127        cc = cg.createVar(name->getName());
    147128    }
    148129    if (name->isNegated()) {
     
    198179        }
    199180        else {
    200             cc = cg.createCharClass(rep_name->getName());
     181            cc = cg.createVar(rep_name->getName());
    201182        }
    202183
  • icGREP/icgrep-devel/icgrep/re/re_compiler.h

    r4246 r4249  
    1414#include <map>
    1515
     16namespace cc {
     17class CC_NameMap;
     18}
     19
    1620namespace pablo {
    17 
    1821class PabloBlock;
    1922class PabloAST;
    2023class Assign;
    2124class Var;
    22 
    2325}
    2426
     
    2729public:
    2830
    29     RE_Compiler(pablo::PabloBlock & baseCG, std::map<std::string, std::string> name_map);
     31    RE_Compiler(pablo::PabloBlock & baseCG, const cc::CC_NameMap & nameMap);
    3032
    3133    inline void compile(RE * re) {
     
    4951
    5052    pablo::PabloBlock &                             mCG;
     53    const cc::CC_NameMap &                          mNameMap;
    5154    pablo::Var *                                    mLineFeed;
    5255    pablo::PabloAST *                               mInitial;
    53     pablo::PabloAST *                               mNonFinal;
    54     std::map<std::string, std::string>              m_name_map;
     56    pablo::PabloAST *                               mNonFinal;   
    5557};
    5658
  • icGREP/icgrep-devel/icgrep/re/re_name.h

    r4246 r4249  
    44#include <re/re_re.h>
    55#include <string>
     6#include <iostream>
     7#include <re/printer_re.h>
    68
    79namespace pablo {
     
    4042    virtual ~Name() {}
    4143protected:
    42     friend Name * makeName();
    4344    friend Name * makeName(const std::string, RE *);
    4445    friend Name * makeName(const std::string, const bool, const Type);
    45     Name();
    46     Name(const std::string && name, const bool negated, const Type type);
    47     Name(const std::string && name, RE * cc);
     46
     47    Name(const std::string && name, const bool negated, const Type type, RE * cc)
     48    : RE(ClassTypeId::Name)
     49    , mName(std::move(name))
     50    , mNegated(negated)
     51    , mType(type)
     52    , mCC(cc)
     53    , mVar(nullptr)
     54    {
     55
     56    }
     57
    4858private:
    4959    const std::string   mName;
     
    5363    pablo::Var *        mVar;
    5464};
    55 
    56 inline Name::Name()
    57 : RE(ClassTypeId::Name)
    58 , mName()
    59 , mNegated(false)
    60 , mType(Type::FixedLength)
    61 , mCC(nullptr)
    62 , mVar(nullptr)
    63 {
    64 
    65 }
    66 
    67 inline Name::Name(const std::string && name, const bool negated, const Type type)
    68 : RE(ClassTypeId::Name)
    69 , mName(std::move(name))
    70 , mNegated(negated)
    71 , mType(type)
    72 , mCC(nullptr)
    73 , mVar(nullptr)
    74 {
    75 
    76 }
    77 
    78 inline Name::Name(const std::string && name, RE * cc)
    79 : RE(ClassTypeId::Name)
    80 , mName(std::move(name))
    81 , mNegated(false)
    82 , mType(Type::FixedLength)
    83 , mCC(cc)
    84 , mVar(nullptr)
    85 {
    86 
    87 }
    8865
    8966inline const std::string & Name::getName() const {
     
    10784}
    10885
    109 inline Name * makeName() {
    110     return new Name();
    111 }
    112 
    11386inline Name * makeName(const std::string name, const bool negated = false, const Name::Type type = Name::Type::FixedLength) {
    114     return new Name(std::move(name), negated, type);
     87    return new Name(std::move(name), negated, type, nullptr);
    11588}
    11689
    11790inline Name * makeName(const std::string name, RE * cc) {
    118     return new Name(std::move(name), cc);
     91    if (isa<Name>(cc)) {
     92        return cast<Name>(cc);
     93    }
     94    return new Name(std::move(name), false, Name::Type::FixedLength, cc);
    11995}
    12096
  • icGREP/icgrep-devel/icgrep/re/re_nullable.cpp

    r4203 r4249  
    2828            }
    2929        }
    30         re = makeSeq(seq->getType(), list.begin(), list.end());
     30        re = makeSeq(list.begin(), list.end());
    3131    }
    3232    else if (Alt * alt = dyn_cast<Alt>(re)) {
     
    6161            }
    6262        }
    63         re = makeSeq(seq->getType(), list.begin(), list.end());
     63        re = makeSeq(list.begin(), list.end());
    6464    }
    6565    else if (Alt* alt = dyn_cast<Alt>(re)) {
  • icGREP/icgrep-devel/icgrep/re/re_parser.cpp

    r4245 r4249  
    7171        throw NoRegularExpressionFound();
    7272    }
    73     return makeSeq(Seq::Type::Normal, seq.begin(), seq.end());
     73    return makeSeq(seq.begin(), seq.end());
    7474}
    7575
  • icGREP/icgrep-devel/icgrep/re/re_seq.cpp

    r4203 r4249  
    66
    77#include "re_seq.h"
    8 #include "re_cc.h"
    9 #include "re_name.h"
    10 
    11 namespace re {
    12 
    13 std::string Seq::getName() const {
    14     if (mType == Seq::Type::Byte) {
    15         std::string name = "Seq";
    16         for (const RE * re : *this) {
    17             if (const CC* seq_cc = dyn_cast<const CC>(re)) {
    18                 name += seq_cc->getName();
    19             }
    20             else if (const Name* seq_name = dyn_cast<const Name>(re)) {
    21                 name += seq_name->getName();
    22             }
    23             else {
    24                 return "Bad Byte Sequence!";
    25             }
    26         }
    27         return name;
    28     }
    29     else {
    30         return "Unnamed Sequence";
    31     }
    32 }
    33 
    34 
    35 
    36 }
  • icGREP/icgrep-devel/icgrep/re/re_seq.h

    r4206 r4249  
    2222        return false;
    2323    }
    24     enum class Type {
    25         Normal
    26         , Byte
    27     };
    28     std::string getName() const;
    29     inline Type getType() const {
    30         return mType;
    31     }
    32     inline void setType(const Type type) {
    33         mType = type;
    34     }   
    3524    virtual ~Seq() {}
    3625protected:
    37     friend Seq * makeSeq(const Seq::Type);
    38     template<typename iterator> friend RE * makeSeq(const Seq::Type, iterator, iterator);
    39     Seq(const Type type)
    40     : Vector(ClassTypeId::Seq)
    41     , mType(type) {
     26    friend Seq * makeSeq();
     27    template<typename iterator> friend RE * makeSeq(iterator, iterator);
     28    Seq()
     29    : Vector(ClassTypeId::Seq) {
    4230
    4331    }
    44     Seq(const Type type, iterator begin, iterator end)
    45     : Vector(ClassTypeId::Seq, begin, end)
    46     , mType(type)
    47     {
     32    Seq(iterator begin, iterator end)
     33    : Vector(ClassTypeId::Seq, begin, end) {
    4834
    4935    }
    50     template<typename itr> void construct(itr begin, itr end);
    51 private:
    52     Type    mType;
     36    template<typename itr> void flatten(itr begin, itr end);
    5337};
    5438
    55 inline Seq * makeSeq(const Seq::Type type = Seq::Type::Normal) {
    56     return new Seq(type);
     39inline Seq * makeSeq() {
     40    return new Seq();
    5741}
    5842
    5943template<typename itr>
    60 void Seq::construct(itr begin, itr end) {
     44void Seq::flatten(itr begin, itr end) {
    6145    for (auto i = begin; i != end; ++i) {
    6246        if (Seq * seq = dyn_cast<Seq>(*i)) {
    63             construct<Seq::iterator>(seq->begin(), seq->end());
     47            flatten<Seq::iterator>(seq->begin(), seq->end());
    6448            continue;
    6549        }
     
    6953
    7054template<typename itr>
    71 inline RE * makeSeq(const Seq::Type type, itr begin, itr end) {
    72     Seq * seq = makeSeq(type);
    73     seq->construct(begin, end);
     55inline RE * makeSeq(itr begin, itr end) {
     56    Seq * seq = makeSeq();
     57    seq->flatten(begin, end);
    7458    if (seq->size() == 1) {
    7559        return seq->back();
     
    7963
    8064inline RE * makeSeq(RE::InitializerList list) {
    81     return makeSeq(Seq::Type::Normal, list.begin(), list.end());
     65    return makeSeq(list.begin(), list.end());
    8266}
    8367
  • icGREP/icgrep-devel/icgrep/re/re_simplifier.cpp

    r4203 r4249  
    2828            list.push_back(simplify(re));
    2929        }
    30         re = makeSeq(seq->getType(), list.begin(), list.end());
     30        re = makeSeq(list.begin(), list.end());
    3131    }
    3232    else if (Rep * rep = dyn_cast<Rep>(re)) {
  • icGREP/icgrep-devel/icgrep/utf8_encoder.cpp

    r4242 r4249  
    55 */
    66
    7 #include "utf8_encoder.h"
    8 
    9 #include "re/re_name.h"
    10 #include "re/re_start.h"
    11 #include "re/re_end.h"
    12 #include "re/re_seq.h"
    13 #include "re/re_alt.h"
    14 #include "re/re_rep.h"
    15 #include "re/re_simplifier.h"
    16 
     7#include <utf8_encoder.h>
     8#include <re/re_name.h>
     9#include <re/re_seq.h>
     10#include <re/re_alt.h>
     11#include <re/re_rep.h>
     12#include <cc/cc_namemap.hpp>
    1713#include <assert.h>
    1814#include <algorithm>
     
    2117using namespace re;
    2218
    23 RE * UTF8_Encoder::toUTF8(RE* re) {
    24     if (Alt * alt = dyn_cast<Alt>(re)) {
    25         for (auto i = alt->begin(); i != alt->end(); ++i) {
    26             *i = toUTF8(*i);
     19namespace cc {
     20
     21RE * UTF8_Encoder::toUTF8(CC_NameMap & nameMap, RE * ast) {
     22    for (Name * name : nameMap) {
     23        if (const CC * cc = dyn_cast_or_null<CC>(name->getCC())) {
     24            if (cc->size() == 1) {
     25                name->setCC(rangeToUTF8(cc->front()));
     26            }
     27            else if (cc->size() > 1) {
     28                std::vector<RE *> alt;
     29                for (const CharSetItem & item : *cc) {
     30                    alt.push_back(rangeToUTF8(item));
     31                }
     32                name->setCC(makeAlt(alt.begin(), alt.end()));
     33            }
    2734        }
    2835    }
    29     else if (Seq * seq = dyn_cast<Seq>(re)) {
    30         //If this is a previously encoded Unicode byte sequence.
    31         if (seq->getType() == Seq::Type::Byte) {
    32             throw std::runtime_error("Unexpected UTF Byte Sequence given to UTF8 Encoder.");
    33         }
    34         for (auto i = seq->begin(); i != seq->end(); ++i) {
    35             *i = toUTF8(*i);
    36         }
    37     }
    38     else if (CC * cc = dyn_cast<CC>(re)) {
    39         if (cc->size() == 1) {
    40             re = rangeToUTF8(cc->front());
    41         }
    42         else if (cc->size() > 1) {
    43             std::vector<RE *> alt;
    44             for (const CharSetItem & item : *cc) {
    45                 alt.push_back(rangeToUTF8(item));
    46             }
    47             re = makeAlt(alt.begin(), alt.end());
    48         }
    49     }
    50     else if (Rep * rep = dyn_cast<Rep>(re)) {
    51         rep->setRE(toUTF8(rep->getRE()));
    52     }
    53     return re;
     36    ast = nameMap.process(ast);
     37    // Build our list of predefined characters.
     38    nameMap.addPredefined("UTF8-SingleByte", makeCC(0x00, 0x7F));
     39    nameMap.addPredefined("UTF8-Prefix2", makeCC(0xC2, 0xDF));
     40    nameMap.addPredefined("UTF8-Prefix3", makeCC(0xE0, 0xEF));
     41    nameMap.addPredefined("UTF8-Prefix4", makeCC(0xF0, 0xF4));
     42    return ast;
    5443}
    5544
    5645RE * UTF8_Encoder::rangeToUTF8(const CharSetItem & item) {
    57     int u8len_lo = lenUTF8(item.lo_codepoint);
    58     int u8len_hi = lenUTF8(item.hi_codepoint);
    59     if (u8len_lo < u8len_hi) {
    60         int m = maxUTF8Len(u8len_lo);
     46    const auto min = lenUTF8(item.lo_codepoint);
     47    const auto max = lenUTF8(item.hi_codepoint);
     48    if (min < max) {
     49        const auto m = maxCodePoint(min);
    6150        return makeAlt({rangeToUTF8(CharSetItem(item.lo_codepoint, m)), rangeToUTF8(CharSetItem(m + 1, item.hi_codepoint))});
    6251    }
    6352    else {
    64         return rangeToUTF8_helper(item.lo_codepoint, item.hi_codepoint, 1, u8len_hi);
     53        return rangeToUTF8(item.lo_codepoint, item.hi_codepoint, 1, max);
    6554    }
    6655}
    6756
    68 RE* UTF8_Encoder::rangeToUTF8_helper(int lo, int hi, int n, int hlen)
     57RE * UTF8_Encoder::rangeToUTF8(const CodePointType lo, const CodePointType hi, const unsigned index, const unsigned max)
    6958{
    70     int hbyte = u8byte(hi, n);
    71     int lbyte = u8byte(lo, n);
    72 
    73     if (n == hlen)
    74     {
     59    const CodePointType hbyte = u8byte(hi, index);
     60    const CodePointType lbyte = u8byte(lo, index);
     61    if (index == max) {
    7562        return makeByteRange(lbyte, hbyte);
    7663    }
    77     else if (hbyte == lbyte)
    78     {
    79         Seq* seq = makeSeq(isUTF8Prefix(hbyte) ? Seq::Type::Byte : Seq::Type::Normal);
    80         seq->push_back(makeByteClass(hbyte));
    81         seq->push_back(rangeToUTF8_helper(lo, hi, n+1, hlen));
    82         return seq;
     64    else if (hbyte == lbyte) {
     65        return makeSeq({makeByteClass(hbyte), rangeToUTF8(lo, hi, index + 1, max)});
    8366    }
    84     else
    85     {
    86         int suffix_mask = (1 << ((hlen - n) * 6)) - 1;
    87 
    88         if ((hi & suffix_mask) != suffix_mask)
    89         {
    90             int hi_floor = (~suffix_mask) & hi;
    91             return makeAlt({rangeToUTF8_helper(hi_floor, hi, n, hlen), rangeToUTF8_helper(lo, hi_floor - 1, n, hlen)});
     67    else {
     68        const unsigned suffix_mask = (static_cast<unsigned>(1) << ((max - index) * 6)) - 1;
     69        if ((hi & suffix_mask) != suffix_mask) {
     70            const unsigned hi_floor = (~suffix_mask) & hi;
     71            return makeAlt({rangeToUTF8(hi_floor, hi, index, max), rangeToUTF8(lo, hi_floor - 1, index, max)});
    9272        }
    93         else if ((lo & suffix_mask) != 0)
    94         {
    95             int low_ceil = lo | suffix_mask;
    96 
    97             Alt* alt = makeAlt();
    98             alt->push_back(rangeToUTF8_helper(low_ceil + 1, hi, n, hlen));
    99             alt->push_back(rangeToUTF8_helper(lo, low_ceil, n, hlen));
    100             return alt;
     73        else if ((lo & suffix_mask) != 0) {
     74            const unsigned low_ceil = lo | suffix_mask;
     75            return makeAlt({rangeToUTF8(low_ceil + 1, hi, index, max), rangeToUTF8(lo, low_ceil, index, max)});
    10176        }
    102         else
    103         {
    104             Seq* seq = makeSeq();
    105             seq->setType((isUTF8Prefix(hbyte) ? Seq::Type::Byte : Seq::Type::Normal));
    106             seq->push_back(makeByteRange(lbyte, hbyte));
    107             seq->push_back(rangeToUTF8_helper(lo, hi, n + 1, hlen));
    108             return seq;
     77        else {
     78            return makeSeq({makeByteRange(lbyte, hbyte), rangeToUTF8(lo, hi, index + 1, max)});
    10979        }
    11080    }
    11181}
    11282
    113 inline bool UTF8_Encoder::isUTF8Prefix(const int cp) {
    114     return ((cp >= 0xC2) && (cp <= 0xF4));
     83inline bool UTF8_Encoder::isUTF8Prefix(const unsigned cp) {
     84    return (cp >= 0xC2) && (cp <= 0xF4);
    11585}
    11686
    117 CC* UTF8_Encoder::makeByteRange(int lo, int hi)
     87inline CodePointType UTF8_Encoder::u8byte(const CodePointType codepoint, const unsigned n)
    11888{
    119     return makeCC(lo, hi);
    120 }
     89    CodePointType retVal = 0;
    12190
    122 CC* UTF8_Encoder::makeByteClass(int byteval)
    123 {
    124     return makeCC(byteval, byteval);
    125 }
     91    const unsigned len = lenUTF8(codepoint);
    12692
    127 inline int UTF8_Encoder::u8byte(int codepoint, int n)
    128 {
    129     int retVal = 0;
    130 
    131     int len = lenUTF8(codepoint);
    132 
    133     if (n == 1)
    134     {
    135         if (len == 1)
    136         {
    137             retVal = codepoint;
    138         }
    139         else if (len == 2)
    140         {
    141             retVal = 0xC0 | (codepoint >> 6);
    142         }
    143         else if (len == 3)
    144         {
    145             retVal = 0xE0 | (codepoint >> 12);
    146         }
    147         else
    148         {
    149             retVal = 0xF0 | (codepoint >> 18);
     93    if (n == 1) {
     94        switch (len) {
     95            case 1: retVal = codepoint; break;
     96            case 2: retVal = 0xC0 | (codepoint >> 6); break;
     97            case 3: retVal = 0xE0 | (codepoint >> 12); break;
     98            case 4: retVal = 0xF0 | (codepoint >> 18); break;
    15099        }
    151100    }
    152     else
    153     {
     101    else {
    154102        retVal = 0x80 | ((codepoint >> (6 * (len - n))) & 0x3F);
    155103    }
     
    158106}
    159107
    160 inline int UTF8_Encoder::lenUTF8(const int cp)
    161 {
    162     if (cp <= 0x7F)
    163     {
     108inline unsigned UTF8_Encoder::lenUTF8(const unsigned cp) {
     109    if (cp <= 0x7F) {
    164110        return 1;
    165111    }
    166     else if (cp <= 0x7FF)
    167     {
     112    else if (cp <= 0x7FF) {
    168113        return 2;
    169114    }
    170     else if (cp <= 0xFFFF)
    171     {
     115    else if (cp <= 0xFFFF) {
    172116        return 3;
    173117    }
    174     else
    175     {
     118    else {
    176119        return 4;
    177120    }
    178121}
    179122
    180 inline int UTF8_Encoder::maxUTF8Len(int lgth)
    181 {
    182     if (lgth == 1)
    183     {
     123inline unsigned UTF8_Encoder::maxCodePoint(const unsigned length) {
     124    if (length == 1) {
    184125        return 0x7F;
    185126    }
    186     else if (lgth == 2)
    187     {
     127    else if (length == 2) {
    188128        return 0x7FF;
    189129    }
    190     else if (lgth == 3)
    191     {
     130    else if (length == 3) {
    192131        return 0xFFFF;
    193132    }
    194     else if (lgth == 4)
    195     {
     133    else if (length == 4) {
    196134        return 0x10FFFF;
    197135    }
    198     else
    199     {
    200         return -1;
    201     }
     136    throw std::runtime_error("Unexpected UTF8 Length: " + std::to_string(length));
    202137}
    203138
     139inline CC * UTF8_Encoder::makeByteRange(const CodePointType lo, const CodePointType hi) {
     140    return makeCC(lo, hi);
     141}
     142
     143inline CC * UTF8_Encoder::makeByteClass(const CodePointType cp) {
     144    return makeCC(cp, cp);
     145}
     146
     147}
  • icGREP/icgrep-devel/icgrep/utf8_encoder.h

    r4242 r4249  
    99
    1010//Regular Expressions
    11 #include "re/re_re.h"
    12 #include "re/re_cc.h"
     11#include <re/re_cc.h>
     12#include <cc/cc_namemap.hpp>
    1313
     14namespace cc {
     15
     16class CC_NameMap;
    1417
    1518class UTF8_Encoder
    1619{
    1720public:
    18     static re::RE* toUTF8(re::RE * re);
     21    static re::RE * toUTF8(CC_NameMap & nameMap, re::RE * ast);
    1922private:
    20     static re::RE* rangeToUTF8(const re::CharSetItem &item);
    21     static re::RE* rangeToUTF8_helper(int lo, int hi, int n, int hlen);
    22     static re::CC* makeByteClass(int byteval);
    23     static re::CC* makeByteRange(int lo, int hi);
    24 
    25     static bool isUTF8Prefix(const int cp);
    26     static int lenUTF8(const int cp);
    27     static int maxUTF8Len(int lgth);
    28     static int u8byte(int codepoint, int n);
     23    static re::RE * rangeToUTF8(const re::CharSetItem & item);
     24    static re::RE * rangeToUTF8(const re::CodePointType lo, const re::CodePointType hi, const unsigned index, const unsigned max);
     25    static re::CC * makeByteClass(const re::CodePointType cp);
     26    static re::CC * makeByteRange(const re::CodePointType lo, const re::CodePointType hi);
     27    static bool isUTF8Prefix(const unsigned cp);
     28    static unsigned lenUTF8(const unsigned cp);
     29    static unsigned maxCodePoint(const unsigned length);
     30    static re::CodePointType u8byte(const re::CodePointType codepoint, const unsigned n);
    2931};
    3032
     33}
     34
    3135#endif // UTF8_ENCODER_H
Note: See TracChangeset for help on using the changeset viewer.