Changeset 5045


Ignore:
Timestamp:
Jun 9, 2016, 3:34:07 PM (3 years ago)
Author:
xuedongx
Message:

Support regular expressions over the UTF-16 representation of Unicode

Location:
icGREP/icgrep-devel
Files:
2 added
18 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/QA/greptest.py

    r5013 r5045  
    4343                        exit(-1)
    4444                outfpath = os.path.join(options.datafile_dir, filename)
    45                 outf = codecs.open(outfpath, encoding='utf-8', mode='w')
     45                if options.utf16: outf = codecs.open(outfpath, encoding='utf-16BE', mode='w')
     46                else: outf = codecs.open(outfpath, encoding='utf-8', mode='w')
    4647                in_datafile = True
    4748
     
    126127                          dest = 'verbose', action='store_true', default=False,
    127128                          help = 'verbose output: show successful tests')
     129        option_parser.add_option('-U', '--UTF-16',
     130                          dest = 'utf16', action='store_true', default=False,
     131                          help = 'test UTF-16 processing')
    128132        options, args = option_parser.parse_args(sys.argv[1:])
    129133        if len(args) != 1:
  • icGREP/icgrep-devel/icgrep/CMakeLists.txt

    r5037 r5045  
    7070add_library(RegExpADT re/re_re.cpp re/re_cc.cpp re/re_rep.cpp re/re_diff.cpp re/re_intersect.cpp re/printer_re.cpp)
    7171add_library(RegExpCompiler re/re_parser.cpp re/parsefailure.cpp re/re_nullable.cpp re/re_simplifier.cpp re/re_compiler.cpp re/re_analysis.cpp re/re_toolchain.cpp)
    72 add_library(CCADT cc/cc_compiler.cpp utf8_encoder.cpp UCD/CaseFolding_txt.cpp)
     72add_library(CCADT cc/cc_compiler.cpp utf8_encoder.cpp utf16_encoder.cpp UCD/CaseFolding_txt.cpp)
    7373add_library(UCDlib UCD/unicode_set.cpp UCD/ucd_compiler.cpp UCD/PropertyObjects.cpp UCD/resolve_properties.cpp UCD/UnicodeNameData.cpp)
    7474
  • icGREP/icgrep-devel/icgrep/UCD/ucd_compiler.cpp

    r4860 r5045  
    44#include <re/re_name.h>
    55#include <utf8_encoder.h>
     6#include <utf16_encoder.h>
     7#include <iostream>
    68
    79using namespace cc;
     
    1012
    1113namespace UCD {
     14
     15/** ------------------------------------------------------------------------------------------------------------- *
     16 * @brief Encoder dispatch helpers: forward each call to UTF16_Encoder when UTF_16 is true, otherwise to UTF8_Encoder
     17 ** ------------------------------------------------------------------------------------------------------------- */
     18inline codepoint_t encodingByte(const codepoint_t cp, const unsigned n, bool UTF_16) {
     19        return UTF_16 ? UTF16_Encoder::encodingByte(cp, n) : UTF8_Encoder::encodingByte(cp, n);
     20}
     21
     22inline unsigned length(const codepoint_t cp, bool UTF_16) {
     23        return UTF_16 ? UTF16_Encoder::length(cp) : UTF8_Encoder::length(cp);
     24}
     25
     26inline codepoint_t maxCodePoint(const unsigned length, bool UTF_16) {
     27        return UTF_16 ?  UTF16_Encoder::maxCodePoint(length) : UTF8_Encoder::maxCodePoint(length);
     28}
     29
     30inline bool isLowCodePointAfterByte(const codepoint_t cp, const unsigned n, bool UTF_16) {
     31        return UTF_16 ? UTF16_Encoder::isLowCodePointAfterByte(cp, n) : UTF8_Encoder::isLowCodePointAfterByte(cp, n);
     32}
     33inline bool isHighCodePointAfterByte(const codepoint_t cp, const unsigned n, bool UTF_16) {
     34        return UTF_16 ? UTF16_Encoder::isHighCodePointAfterByte(cp, n) : UTF8_Encoder::isHighCodePointAfterByte(cp, n);
     35}
     36inline codepoint_t minCodePointWithCommonBytes(const re::codepoint_t cp, const unsigned n, bool UTF_16) {
     37        return UTF_16 ? UTF16_Encoder::minCodePointWithCommonBytes(cp, n) : UTF8_Encoder::minCodePointWithCommonBytes(cp, n);
     38}
     39inline codepoint_t maxCodePointWithCommonBytes(const re::codepoint_t cp, const unsigned n, bool UTF_16) {
     40        return UTF_16 ? UTF16_Encoder::maxCodePointWithCommonBytes(cp, n) : UTF8_Encoder::maxCodePointWithCommonBytes(cp, n);
     41}
    1242
    1343const UCDCompiler::RangeList UCDCompiler::defaultIfHierachy = {
     
    213243 ** ------------------------------------------------------------------------------------------------------------- */
    214244PabloAST * UCDCompiler::sequenceGenerator(const RangeList && ranges, const unsigned byte_no, PabloBuilder & builder, PabloAST * target, PabloAST * prefix) {
     245        bool isUTF_16 = mCharacterClassCompiler.isUTF_16();
    215246
    216247    if (LLVM_LIKELY(ranges.size() > 0)) {
     
    219250        std::tie(lo, hi) = ranges[0];
    220251
    221         const auto min = UTF8_Encoder::length(lo_codepoint(ranges.front()));
    222         const auto max = UTF8_Encoder::length(hi_codepoint(ranges.back()));
     252        const auto min = length(lo_codepoint(ranges.front()), isUTF_16);
     253        const auto max = length(hi_codepoint(ranges.back()), isUTF_16);
    223254
    224255        if (min != max) {
    225             const auto mid = UTF8_Encoder::maxCodePoint(min);
     256            const auto mid = maxCodePoint(min, isUTF_16);
    226257            target = sequenceGenerator(std::move(rangeIntersect(ranges, lo, mid)), byte_no, builder, target, prefix);
    227258            target = sequenceGenerator(std::move(rangeIntersect(ranges, mid + 1, hi)), byte_no, builder, target, prefix);
     
    229260            // We have a single byte remaining to match for all code points in this CC.
    230261            // Use the byte class compiler to generate matches for these codepoints.
    231             PabloAST * var = mCharacterClassCompiler.compileCC(makeCC(byteDefinitions(ranges, byte_no)), builder);
     262            PabloAST * var = mCharacterClassCompiler.compileCC(makeCC(byteDefinitions(ranges, byte_no, isUTF_16)), builder);
    232263            if (byte_no > 1) {
    233264                var = builder.createAnd(var, builder.createAdvance(makePrefix(lo, byte_no, builder, prefix), 1));
     
    238269                codepoint_t lo, hi;
    239270                std::tie(lo, hi) = rg;
    240                 const auto lo_byte = UTF8_Encoder::encodingByte(lo, byte_no);
    241                 const auto hi_byte = UTF8_Encoder::encodingByte(hi, byte_no);
    242                 if (lo_byte != hi_byte) {
    243                     if (!UTF8_Encoder::isLowCodePointAfterByte(lo, byte_no)) {
    244                         const codepoint_t mid = lo | ((1 << (6 * (min - byte_no))) - 1);
     271                const auto lo_byte = encodingByte(lo, byte_no, isUTF_16);
     272                const auto hi_byte = encodingByte(hi, byte_no, isUTF_16);
     273                //std::cout << "lo_byte: " << std::hex << lo_byte << " hi_byte: " << std::hex << hi_byte << std::endl;
     274                                if (lo_byte != hi_byte) {
     275                                        unsigned num = isUTF_16 ? 10 : 6;
     276                    if (!isLowCodePointAfterByte(lo, byte_no, isUTF_16)) {
     277                        const codepoint_t mid = lo | ((1 << (num * (min - byte_no))) - 1);
    245278                        target = sequenceGenerator(lo, mid, byte_no, builder, target, prefix);
    246279                        target = sequenceGenerator(mid + 1, hi, byte_no, builder, target, prefix);
    247                     } else if (!UTF8_Encoder::isHighCodePointAfterByte(hi, byte_no)) {
    248                         const codepoint_t mid = hi & ~((1 << (6 * (min - byte_no))) - 1);
     280                    } else if (!isHighCodePointAfterByte(hi, byte_no, isUTF_16)) {
     281                        const codepoint_t mid = hi & ~((1 << (num * (min - byte_no))) - 1);
    249282                        target = sequenceGenerator(lo, mid - 1, byte_no, builder, target, prefix);
    250283                        target = sequenceGenerator(mid, hi, byte_no, builder, target, prefix);
     
    254287                            var = builder.createAnd(builder.createAdvance(prefix, 1), var);
    255288                        }
    256                         for (unsigned i = byte_no; i != UTF8_Encoder::length(lo); ++i) {
     289                        for (unsigned i = byte_no; i != length(lo, isUTF_16); ++i) {
    257290                            var = builder.createAnd(mSuffixVar, builder.createAdvance(var, 1));
    258291                        }
     
    265298                        var = builder.createAnd(builder.createAdvance(prefix ? prefix : var, 1), var);
    266299                    }
    267                     if (byte_no < UTF8_Encoder::length(lo)) {
     300                    if (byte_no < length(lo, isUTF_16)) {
    268301                        target = sequenceGenerator(lo, hi, byte_no + 1, builder, target, var);
    269302                    }
     
    294327PabloAST * UCDCompiler::ifTestCompiler(const codepoint_t lo, const codepoint_t hi, const unsigned byte_no, PabloBuilder & builder, PabloAST * target) {
    295328
    296     codepoint_t lo_byte = UTF8_Encoder::encodingByte(lo, byte_no);
    297     codepoint_t hi_byte = UTF8_Encoder::encodingByte(hi, byte_no);
    298     const bool at_lo_boundary = (lo == 0 || UTF8_Encoder::encodingByte(lo - 1, byte_no) != lo_byte);
    299     const bool at_hi_boundary = (hi == 0x10FFFF || UTF8_Encoder::encodingByte(hi + 1, byte_no) != hi_byte);
     329        bool isUTF_16 = mCharacterClassCompiler.isUTF_16();
     330    codepoint_t lo_byte = encodingByte(lo, byte_no, isUTF_16);
     331    codepoint_t hi_byte = encodingByte(hi, byte_no, isUTF_16);
     332    const bool at_lo_boundary = (lo == 0 || encodingByte(lo - 1, byte_no, isUTF_16) != lo_byte);
     333    const bool at_hi_boundary = (hi == 0x10FFFF || encodingByte(hi + 1, byte_no, isUTF_16) != hi_byte);
    300334
    301335    if (at_lo_boundary && at_hi_boundary) {
    302         if (lo_byte != hi_byte) {
    303             if (lo == 0x80) lo_byte = 0xC0;
    304             if (hi == 0x10FFFF) hi_byte = 0xFF;
    305         }
     336                if (!isUTF_16) {
     337                        if (lo_byte != hi_byte) {
     338                                if (lo == 0x80) lo_byte = 0xC0;
     339                                if (hi == 0x10FFFF) hi_byte = 0xFF;
     340                        }
     341                }
    306342        PabloAST * cc = mCharacterClassCompiler.compileCC(makeCC(lo_byte, hi_byte), builder);
    307343        target = builder.createAnd(cc, target);
     
    312348        target = ifTestCompiler(lo, hi, byte_no + 1, builder, target);
    313349    } else if (!at_hi_boundary) {
    314         const auto mid = UTF8_Encoder::minCodePointWithCommonBytes(hi, byte_no);
     350        const auto mid = minCodePointWithCommonBytes(hi, byte_no, isUTF_16);
    315351        PabloAST * e1 = ifTestCompiler(lo, mid - 1, byte_no, builder, target);
    316352        PabloAST * e2 = ifTestCompiler(mid, hi, byte_no, builder, target);
    317353        target = builder.createOr(e1, e2);
    318354    } else {
    319         const auto mid = UTF8_Encoder::maxCodePointWithCommonBytes(lo, byte_no);
     355        const auto mid = maxCodePointWithCommonBytes(lo, byte_no, isUTF_16);
    320356        PabloAST * e1 = ifTestCompiler(lo, mid, byte_no, builder, target);
    321357        PabloAST * e2 = ifTestCompiler(mid + 1, hi, byte_no, builder, target);
     
    335371    assert (byte_no >= 1 && byte_no <= 4);
    336372    assert (byte_no == 1 || prefix != nullptr);
     373        bool isUTF_16 = mCharacterClassCompiler.isUTF_16();
    337374    for (unsigned i = 1; i != byte_no; ++i) {
    338         const CC * const cc = makeCC(UTF8_Encoder::encodingByte(cp, i));
     375        const CC * const cc = makeCC(encodingByte(cp, i, isUTF_16));
    339376        PabloAST * var = mCharacterClassCompiler.compileCC(cc, builder);
    340377        if (i > 1) {
     
    353390 * Ensure the sequence of preceding bytes is defined, up to, but not including the given byte_no
    354391 ** ------------------------------------------------------------------------------------------------------------- */
    355 UCDCompiler::RangeList UCDCompiler::byteDefinitions(const RangeList & list, const unsigned byte_no) {
     392UCDCompiler::RangeList UCDCompiler::byteDefinitions(const RangeList & list, const unsigned byte_no, bool isUTF_16) {
    356393    RangeList result;
    357394    result.reserve(list.size());
    358395    for (const auto & i : list) {
    359         result.emplace_back(UTF8_Encoder::encodingByte(lo_codepoint(i), byte_no), UTF8_Encoder::encodingByte(hi_codepoint(i), byte_no));
     396        result.emplace_back(encodingByte(lo_codepoint(i), byte_no, isUTF_16), encodingByte(hi_codepoint(i), byte_no, isUTF_16));
    360397    }
    361398    return result;
  • icGREP/icgrep-devel/icgrep/UCD/ucd_compiler.hpp

    r4991 r5045  
    6969    PabloAST * makePrefix(const codepoint_t cp, const unsigned byte_no, PabloBuilder & builder, PabloAST * prefix);
    7070
    71     static RangeList byteDefinitions(const RangeList & list, const unsigned byte_no);
     71    static RangeList byteDefinitions(const RangeList & list, const unsigned byte_no, bool isUTF_16);
    7272
    7373    template <typename RangeListOrUnicodeSet>
  • icGREP/icgrep-devel/icgrep/cc/cc_compiler.h

    r5037 r5045  
    4545    }
    4646
     47        bool isUTF_16() {
     48                return mEncoding.getBits() == 16;
     49        }
     50
    4751private:
    4852    pablo::Var * getBasisVar(const unsigned n) const;
  • icGREP/icgrep-devel/icgrep/grep_engine.cpp

    r5037 r5045  
    6666
    6767
    68 
    69 
    70 bool GrepEngine::finalLineIsUnterminated(const char * const fileBuffer, const size_t fileSize) {
     68bool isUTF_16 = false;
     69
     70bool GrepEngine::finalLineIsUnterminated(const char * const fileBuffer, const size_t fileSize, bool UTF_16) {
    7171    if (fileSize == 0) return false;
    7272    unsigned char end_byte = static_cast<unsigned char>(fileBuffer[fileSize-1]);
     
    7777    // NEL
    7878    unsigned char penult_byte = static_cast<unsigned char>(fileBuffer[fileSize-2]);
    79     if ((end_byte == 0x85) && (penult_byte == 0xC2)) return false;
     79    if ((end_byte == 0x85) && (penult_byte == (UTF_16 ? 0x00 : 0xC2))) return false;
    8080    if (fileSize == 2) return true;
    8181    // LS and PS
    8282    if ((end_byte < 0xA8) || (end_byte > 0xA9)) return true;
    83     return (static_cast<unsigned char>(fileBuffer[fileSize-3]) != 0xE2) || (penult_byte != 0x80);
    84 }
    85 
    86 void GrepEngine::doGrep(const std::string & fileName, const int fileIdx, bool CountOnly, std::vector<uint64_t> & total_CountOnly) {
     83        if (!UTF_16) {
     84        return (static_cast<unsigned char>(fileBuffer[fileSize-3]) != 0xE2) || (penult_byte != 0x80);
     85        }
     86        else {// UTF_16
     87                return (penult_byte != 0x20);
     88        }
     89}
     90
     91void GrepEngine::doGrep(const std::string & fileName, const int fileIdx, bool CountOnly, std::vector<uint64_t> & total_CountOnly, bool UTF_16) {
    8792    path file(fileName);
    8893    if (exists(file)) {
     
    101106            char * fileBuffer = const_cast<char *>(source.data());
    102107            if (CountOnly) {
    103                 total_CountOnly[fileIdx] = mGrepFunction_CountOnly(fileBuffer, fileSize, fileIdx, finalLineIsUnterminated(fileBuffer, fileSize));
     108                total_CountOnly[fileIdx] = mGrepFunction_CountOnly(fileBuffer, fileSize, fileIdx, finalLineIsUnterminated(fileBuffer, fileSize, UTF_16));
    104109            } else {
    105                 mGrepFunction(fileBuffer, fileSize, fileIdx, finalLineIsUnterminated(fileBuffer, fileSize));
     110                mGrepFunction(fileBuffer, fileSize, fileIdx, finalLineIsUnterminated(fileBuffer, fileSize, UTF_16));
    106111            }
    107112            source.close();
     
    119124
    120125
    121 void GrepEngine::grepCodeGen(std::string moduleName, re::RE * re_ast, bool CountOnly, bool isNameExpression) {
    122     Module * M = new Module(moduleName, getGlobalContext());
     126void GrepEngine::grepCodeGen(std::string moduleName, re::RE * re_ast, bool CountOnly, bool UTF_16, bool isNameExpression) {
     127    isUTF_16 = UTF_16;
     128        Module * M = new Module(moduleName, getGlobalContext());
    123129   
    124130    IDISA::IDISA_Builder * idb = IDISA::GetIDISA_Builder(M);
     
    126132    kernel::PipelineBuilder pipelineBuilder(M, idb);
    127133
    128     Encoding encoding(Encoding::Type::UTF_8, 8);
    129     mIsNameExpression = isNameExpression;
     134        Encoding::Type type;
     135        type = UTF_16 ? Encoding::Type::UTF_16 : Encoding::Type::UTF_8;
     136        unsigned bits;
     137        bits = UTF_16 ? 16 : 8;
     138
     139        Encoding encoding(type, bits);
     140
     141        mIsNameExpression = isNameExpression;
    130142    re_ast = re::regular_expression_passes(encoding, re_ast);   
    131143    pablo::PabloFunction * function = re::re2pablo_compiler(encoding, re_ast);
    132144   
    133145
    134     pipelineBuilder.CreateKernels(function, isNameExpression);
    135 
    136     llvm::Function * grepIR = pipelineBuilder.ExecuteKernels(CountOnly);
     146    pipelineBuilder.CreateKernels(function, UTF_16, isNameExpression);
     147
     148    llvm::Function * grepIR = pipelineBuilder.ExecuteKernels(CountOnly, UTF_16);
    137149
    138150    mEngine = JIT_to_ExecutionEngine(M);
     
    165177
    166178    uint64_t finalLineUnterminated = 0;
    167     if(finalLineIsUnterminated(mFileBuffer, mFileSize))
     179    if(finalLineIsUnterminated(mFileBuffer, mFileSize, isUTF_16))
    168180        finalLineUnterminated = 1;   
    169181    mGrepFunction(mFileBuffer, mFileSize, 0, finalLineUnterminated);
     
    197209extern "C" {
    198210    void wrapped_report_match(uint64_t lineNum, uint64_t line_start, uint64_t line_end, const char * buffer, uint64_t filesize, int fileIdx) {
    199        
    200         int idx = fileIdx;
     211                int index = isUTF_16 ? 2 : 1;
     212                int idx = fileIdx;
    201213       
    202214        if (ShowFileNames) {
     
    207219        }
    208220       
    209         if ((buffer[line_start] == 0xA) && (line_start != line_end)) {
     221        if ((!isUTF_16 && buffer[line_start] == 0xA) && (line_start != line_end)) {
    210222            // The line "starts" on the LF of a CRLF.  Really the end of the last line.
    211223            line_start++;
    212224        }
     225        if (((isUTF_16 && buffer[line_start] == 0x0) && buffer[line_start + 1] == 0xA) && (line_start != line_end)) {
     226            // The line "starts" on the LF of a CRLF.  Really the end of the last line.
     227            line_start += 2;
     228        }
    213229        if (line_end == filesize) {
    214230            // The match position is at end-of-file.   We have a final unterminated line.
    215             resultStrs[idx].write(&buffer[line_start], line_end - line_start);
     231            resultStrs[idx].write(&buffer[line_start * index], (line_end - line_start) * index);
    216232            if (NormalizeLineBreaks) {
    217233                resultStrs[idx] << '\n';  // terminate it
     
    220236        }
    221237        unsigned char end_byte = (unsigned char)buffer[line_end];
     238                unsigned char penult_byte = (unsigned char)(buffer[line_end - 1]);
    222239        if (NormalizeLineBreaks) {
    223240            if (end_byte == 0x85) {
     
    226243            } else if (end_byte > 0xD) {
    227244                // Line terminated with PS or LS, on the third byte.  Back up 2.
    228                 line_end -= 2;
    229             }
    230             resultStrs[idx].write(&buffer[line_start], line_end - line_start);
     245                isUTF_16 ? line_end-- : line_end -= 2;
     246            }
     247            resultStrs[idx].write(&buffer[line_start * index], (line_end - line_start) * index);
    231248            resultStrs[idx] << '\n';
    232249        }
    233250        else{   
    234             if (end_byte == 0x0D) {
     251            if ((!isUTF_16 && end_byte == 0x0D) || (isUTF_16 && (end_byte == 0x0D && penult_byte == 0x0))) {
    235252                // Check for line_end on first byte of CRLF;  note that we don't
    236253                // want to access past the end of buffer.
    237                 if ((line_end + 1 < filesize) && (buffer[line_end + 1] == 0x0A)) {
     254                                if (line_end + 1 < filesize) {
     255                                        if (!isUTF_16 && buffer[line_end + 1] == 0x0A) {
    238256                    // Found CRLF; preserve both bytes.
    239                     line_end++;
    240                 }
    241             }
    242             resultStrs[idx].write(&buffer[line_start], line_end - line_start + 1);
     257                                                line_end++;;
     258                                        }
     259                                        if (isUTF_16 && buffer[line_end + 1] == 0x0 && buffer[line_end + 2] == 0x0A) {
     260                    // Found CRLF; preserve both bytes.
     261                                                line_end += 2;
     262                                        }
     263                                }
     264            }
     265            resultStrs[idx].write(&buffer[line_start * index], (line_end - line_start + 1) * index);
    243266        }
    244267    }
  • icGREP/icgrep-devel/icgrep/grep_engine.h

    r5037 r5045  
    2626    ~GrepEngine();
    2727 
    28     void grepCodeGen(std::string moduleName, re::RE * re_ast, bool CountOnly, bool isNameExpression = false);
     28    void grepCodeGen(std::string moduleName, re::RE * re_ast, bool CountOnly, bool UTF_16 = false, bool isNameExpression = false);
    2929   
    30     void doGrep(const std::string & fileName, const int fileIdx, bool CountOnly, std::vector<uint64_t> &total_CountOnly);
     30    void doGrep(const std::string & fileName, const int fileIdx, bool CountOnly, std::vector<uint64_t> &total_CountOnly, bool UTF_16);
    3131   
    3232    re::CC *  grepCodepoints();
     
    3434private:
    3535   
    36     static bool finalLineIsUnterminated(const char * const fileBuffer, const size_t fileSize);
     36    static bool finalLineIsUnterminated(const char * const fileBuffer, const size_t fileSize, bool UTF_16);
    3737
    3838    GrepFunctionType mGrepFunction;
  • icGREP/icgrep-devel/icgrep/icgrep.cpp

    r5037 r5045  
    2323static cl::OptionCategory LegacyGrepOptions("A. Standard Grep Options",
    2424                                       "These are standard grep options intended for compatibility with typical grep usage.");
     25static cl::opt<bool> UTF_16("UTF-16", cl::desc("Regular expressions over the UTF-16 representation of Unicode."), cl::cat(LegacyGrepOptions));
    2526static cl::OptionCategory EnhancedGrepOptions("B. Enhanced Grep Options",
    2627                                       "These are additional options for icgrep functionality and performance.");
     
    110111
    111112    while (fileIdx < inputFiles.size()){
    112         grepEngine->doGrep(inputFiles[fileIdx], fileIdx, CountOnly, total_CountOnly);
     113        grepEngine->doGrep(inputFiles[fileIdx], fileIdx, CountOnly, total_CountOnly, UTF_16);
    113114       
    114115        count_mutex.lock();
     
    212213   
    213214    GrepEngine grepEngine;
    214     grepEngine.grepCodeGen(module_name, re_ast, CountOnly);
     215    grepEngine.grepCodeGen(module_name, re_ast, CountOnly, UTF_16);
    215216   
    216217    initResult(inputFiles);
     
    221222    if (Threads <= 1) {
    222223        for (unsigned i = 0; i != inputFiles.size(); ++i) {
    223             grepEngine.doGrep(inputFiles[i], i, CountOnly, total_CountOnly);
     224            grepEngine.doGrep(inputFiles[i], i, CountOnly, total_CountOnly, UTF_16);
    224225        }       
    225226    } else if (Threads > 1) {
  • icGREP/icgrep-devel/icgrep/kernels/pipeline.cpp

    r5042 r5045  
    3636}
    3737
    38 void PipelineBuilder::CreateKernels(PabloFunction * function, bool isNameExpression){
     38void PipelineBuilder::CreateKernels(PabloFunction * function, bool UTF_16, bool isNameExpression){
    3939    mS2PKernel = new KernelBuilder(iBuilder, "s2p", codegen::SegmentSize);
    4040    mICgrepKernel = new KernelBuilder(iBuilder, "icgrep", codegen::SegmentSize);
    4141    mScanMatchKernel = new KernelBuilder(iBuilder, "scanMatch", codegen::SegmentSize);
    42     generateS2PKernel(mMod, iBuilder, mS2PKernel);
     42        if (UTF_16) {
     43                generateS2P_16Kernel(mMod, iBuilder, mS2PKernel);
     44        }
     45        else {
     46                generateS2PKernel(mMod, iBuilder, mS2PKernel);
     47        }
    4348    generateScanMatch(mMod, iBuilder, 64, mScanMatchKernel, isNameExpression);
    4449    pablo_function_passes(function);
     
    6873}
    6974
    70 Function * PipelineBuilder::ExecuteKernels(bool CountOnly) {
     75Function * PipelineBuilder::ExecuteKernels(bool CountOnly, bool UTF_16) {
    7176    Type * const int64ty = iBuilder->getInt64Ty();
    7277    Type * const int8PtrTy = iBuilder->getInt8PtrTy();
    73     Type * const inputType = PointerType::get(ArrayType::get(StructType::get(mMod->getContext(), std::vector<Type *>({ArrayType::get(mBitBlockType, 8)})), 1), 0);
     78    Type * const inputType = PointerType::get(ArrayType::get(StructType::get(mMod->getContext(), std::vector<Type *>({ArrayType::get(mBitBlockType, (UTF_16 ? 16 : 8))})), 1), 0);
    7479    Type * const resultTy = CountOnly ? int64ty : iBuilder->getVoidTy();
    7580    Function * const main = cast<Function>(mMod->getOrInsertFunction("Main", resultTy, inputType, int64ty, int64ty, iBuilder->getInt1Ty(), nullptr));
     
    128133        PHINode * remainingBytes = iBuilder->CreatePHI(int64ty, 2, "remainingBytes");
    129134        remainingBytes->addIncoming(bufferSize, entryBlock);
    130         Constant * const step = ConstantInt::get(int64ty, mBlockSize * segmentSize);
     135        Constant * const step = ConstantInt::get(int64ty, mBlockSize * segmentSize * (UTF_16 ? 2 : 1));
    131136        Value * segmentCondTest = iBuilder->CreateICmpULT(remainingBytes, step);
    132137        iBuilder->CreateCondBr(segmentCondTest, fullCondBlock, segmentBodyBlock);
     
    145150                Value * prev_count = iBuilder->CreateBitCast(temp_count, iBuilder->getIntNTy(mBlockSize));
    146151                Value * add_for = iBuilder->CreateAdd(prev_count, popcount_for);
    147                 iBuilder->CreateStore(add_for, count);
     152                Value * add = iBuilder->CreateBitCast(add_for, mBitBlockType);
     153                iBuilder->CreateStore(add, count);
    148154            }
    149155        }
     
    167173    remainingBytes->addIncoming(initialBufferSize, initialBlock);
    168174
    169     Constant * const step = ConstantInt::get(int64ty, mBlockSize);
     175    Constant * const step = ConstantInt::get(int64ty, mBlockSize * (UTF_16 ? 2 : 1));
    170176    Value * fullCondTest = iBuilder->CreateICmpULT(remainingBytes, step);
    171177    iBuilder->CreateCondBr(fullCondTest, finalBlock, fullBodyBlock);
     
    201207    iBuilder->SetInsertPoint(exitBlock);
    202208
    203     Value * remaining = iBuilder->CreateZExt(remainingBytes, iBuilder->getIntNTy(mBlockSize));
    204     Value * EOFmark = iBuilder->CreateShl(ConstantInt::get(iBuilder->getIntNTy(mBlockSize), 1), remaining);
     209    Value * remainingByte = iBuilder->CreateZExt(remainingBytes, iBuilder->getIntNTy(mBlockSize));
     210        Value * remainingUnit = iBuilder->CreateLShr(remainingByte, ConstantInt::get(iBuilder->getIntNTy(mBlockSize), 1));
     211    Value * EOFmark = iBuilder->CreateShl(ConstantInt::get(iBuilder->getIntNTy(mBlockSize), 1), UTF_16 ? remainingUnit : remainingByte);
    205212        icGrepInstance->setInternalState("EOFmark", iBuilder->CreateBitCast(EOFmark, mBitBlockType));
    206213
  • icGREP/icgrep-devel/icgrep/kernels/pipeline.h

    r5025 r5045  
    3030        ~PipelineBuilder();
    3131
    32         void CreateKernels(pablo::PabloFunction * function, bool isNameExpression);
    33     llvm::Function * ExecuteKernels(bool CountOnly);
     32        void CreateKernels(pablo::PabloFunction * function, bool UTF_16, bool isNameExpression);
     33    llvm::Function * ExecuteKernels(bool CountOnly, bool UTF_16);
    3434
    3535private:
  • icGREP/icgrep-devel/icgrep/kernels/s2p_kernel.cpp

    r4986 r5045  
    2828}
    2929
    30 inline void s2p(IDISA::IDISA_Builder * iBuilder, Value * input, Value * output[]) {
     30void s2p(IDISA::IDISA_Builder * iBuilder, Value * input[], Value * output[]) {
    3131    Value * bit00224466[4];
    3232    Value * bit11335577[4];
    3333
    3434    for (unsigned i = 0; i < 4; i++) {
    35         Value * s0 = iBuilder->CreateBlockAlignedLoad(input, {iBuilder->getInt32(0), iBuilder->getInt32(2 * i)});
    36         Value * s1 = iBuilder->CreateBlockAlignedLoad(input, {iBuilder->getInt32(0), iBuilder->getInt32(2 * i + 1)});
     35        Value * s0 = input[2 * i];
     36        Value * s1 = input[2 * i + 1];
    3737        s2p_step(iBuilder, s0, s1, iBuilder->simd_himask(2), 1, bit00224466[i], bit11335577[i]);
    3838    }
     
    5151    s2p_step(iBuilder, bit22226666[0], bit22226666[1], iBuilder->simd_himask(8), 4, output[2], output[6]);
    5252    s2p_step(iBuilder, bit33337777[0], bit33337777[1], iBuilder->simd_himask(8), 4, output[3], output[7]);
     53}
     54
     55void s2p(IDISA::IDISA_Builder * iBuilder, Value * input, Value * output[]) {
     56    Value * bit[8];
     57    for (unsigned i = 0; i < 8; i++) {
     58        bit[i] = iBuilder->CreateBlockAlignedLoad(input, {iBuilder->getInt32(0), iBuilder->getInt32(i)});
     59    }
     60    s2p(iBuilder, bit, output);
    5361}
    5462
     
    7179}
    7280
     81void generateS2P_16Kernel(Module *, IDISA::IDISA_Builder * iBuilder, KernelBuilder * kBuilder) {
     82        kBuilder->addInputStream(16, "unit_pack");
     83        for(unsigned i = 0; i < 16; i++) {
     84                kBuilder->addOutputStream(1);
     85        }
     86        kBuilder->prepareFunction();
     87
     88    Value * ptr = kBuilder->getInputStream(0);
     89
     90    Value * lo[8];
     91    Value * hi[8];
     92    for (unsigned i = 0; i < 8; i++) {
     93        Value * s0 = iBuilder->CreateBlockAlignedLoad(ptr, {iBuilder->getInt32(0), iBuilder->getInt32(2 * i)});
     94        Value * s1 = iBuilder->CreateBlockAlignedLoad(ptr, {iBuilder->getInt32(0), iBuilder->getInt32(2 * i + 1)});
     95        lo[i] = iBuilder->hsimd_packl(16, s0, s1);
     96        hi[i] = iBuilder->hsimd_packh(16, s0, s1);
     97    }
     98
     99    Value * output[16];
     100    s2p(iBuilder, lo, output);
     101    s2p(iBuilder, hi, output + 8);
     102    for (unsigned j = 0; j < 16; j++) {
     103        iBuilder->CreateBlockAlignedStore(output[j], kBuilder->getOutputStream(j));
     104    }
     105    kBuilder->finalize();
     106}
     107       
    73108void generateS2P_idealKernel(Module *, IDISA::IDISA_Builder * iBuilder, KernelBuilder * kBuilder) {
    74109    kBuilder->addInputStream(8, "byte_pack");
  • icGREP/icgrep-devel/icgrep/kernels/s2p_kernel.h

    r4976 r5045  
    1515
    1616    void generateS2PKernel(llvm::Module *, IDISA::IDISA_Builder * iBuilder, KernelBuilder * kBuilder);
     17        void generateS2P_16Kernel(llvm::Module *, IDISA::IDISA_Builder * iBuilder, KernelBuilder * kBuilder);
    1718    void generateS2P_idealKernel(llvm::Module *, IDISA::IDISA_Builder * iBuilder, KernelBuilder * kBuilder);
    1819
  • icGREP/icgrep-devel/icgrep/pablo/carry_manager.cpp

    r5037 r5045  
    2626    mCarryInfoVector.resize(mRootScope->enumerateScopes(0) + 1);
    2727    mCarryPackType = mBitBlockType;
    28     const unsigned totalCarryDataSize = std::max<unsigned>(enumerate(mRootScope, 0, 0), 1);
    29     mCarryPackPtr.resize(totalCarryDataSize, nullptr);
    30     mCarryInPack.resize(totalCarryDataSize, nullptr);
    31     mCarryOutPack.resize(totalCarryDataSize, nullptr);
     28    const unsigned totalCarryDataSize = enumerate(mRootScope, 0, 0);
     29
     30
     31    mCarryPackPtr.resize(totalCarryDataSize + 1, nullptr);
     32    mCarryInPack.resize(totalCarryDataSize + 1, nullptr);
     33    mCarryOutPack.resize(totalCarryDataSize + 1, nullptr);
     34
    3235    mTotalCarryDataBitBlocks = totalCarryDataSize;
    3336    ArrayType* cdArrayTy = ArrayType::get(mBitBlockType, mTotalCarryDataBitBlocks);
     
    5356    mCurrentScope = mRootScope;
    5457    mCurrentFrameIndex = 0;
     58    assert(mCarryInfoVector.size() > 0);
    5559    mCarryInfo = mCarryInfoVector[0];
     60    assert(summaryPack() < mCarryOutPack.size());
    5661    mCarryOutPack[summaryPack()] = Constant::getNullValue(mCarryPackType);
    5762    assert (mCarrySummary.empty());
     
    6267 ** ------------------------------------------------------------------------------------------------------------- */
    6368void CarryManager::enterScope(PabloBlock * const scope) {
     69    assert(summaryPack() < mCarryOutPack.size());
    6470    Value * summaryCarry = mCarryOutPack[summaryPack()];
    6571    mCarrySummary.push_back(summaryCarry);
     
    6773    mCarryInfo = mCarryInfoVector[scope->getScopeIndex()];
    6874    mCurrentFrameIndex += mCarryInfo->getFrameIndex();
     75    assert(summaryPack() < mCarryOutPack.size());
    6976    mCarryOutPack[summaryPack()] = Constant::getNullValue(mCarryPackType);
    7077}
     
    7481 ** ------------------------------------------------------------------------------------------------------------- */
    7582void CarryManager::leaveScope() {
     83    assert(summaryPack() < mCarryOutPack.size());
    7684    Value * summaryCarry = mCarryOutPack[summaryPack()];
    7785    assert (mCurrentScope != mRootScope);
     
    7987    mCurrentScope = mCurrentScope->getParent();
    8088    mCarryInfo = mCarryInfoVector[mCurrentScope->getScopeIndex()];
     89    assert(summaryPack() < mCarryOutPack.size());
    8190    mCarryOutPack[summaryPack()] = summaryCarry;
    8291    mCarrySummary.pop_back();
     
    229238        const unsigned carrySummaryIndex = summaryPack();
    230239        if (LLVM_UNLIKELY(mCarryInfo->hasLongAdvances())) { // Force if entry
     240            assert (carrySummaryIndex < mCarryOutPack.size());
    231241            mCarryOutPack[carrySummaryIndex] = Constant::getAllOnesValue(mCarryPackType);
    232242        }
     
    267277        const unsigned scopeCarryPacks = mCarryInfo->getScopeCarryPackCount();
    268278        for (unsigned i = scopeBaseOffset; i < scopeBaseOffset + scopeCarryPacks; ++i) {
     279            assert (i < mCarryOutPack.size());
    269280            Type * const type = mCarryOutPack[i]->getType();
    270281            PHINode * phi = iBuilder->CreatePHI(type, 2);
     
    276287    if (LLVM_LIKELY(mCarrySummary.size() > 0)) {
    277288        const unsigned summaryIndex = summaryPack();
     289        assert (summaryIndex < mCarryOutPack.size());
    278290        Value * carrySummary = mCarryOutPack[summaryIndex];
    279291        if (mCarrySummary.back() != carrySummary) {
     
    307319        PHINode * phi_out = iBuilder->CreatePHI(mCarryPackType, 2);
    308320        phi_out->addIncoming(Constant::getNullValue(mCarryPackType), end);
     321        assert (index < mCarryOutAccumPhis.size());
    309322        mCarryOutAccumPhis[index] = phi_out;
    310323    }
     
    321334        mCarryInPhis[index]->addIncoming(Constant::getNullValue(mCarryPackType), whileBodyFinalBlock);
    322335        #endif
     336        assert (index < mCarryOutAccumPhis.size());
    323337        PHINode * phi = mCarryOutAccumPhis[index];
    324338        Value * carryOut = iBuilder->CreateOr(phi, mCarryOutPack[currentScopeBase + index]);
     
    370384        carryOut = iBuilder->CreateBitCast(carry_bit, mBitBlockType);
    371385    }
     386    assert (index < mCarryOutPack.size());
    372387    mCarryOutPack[index] = carryOut;
    373388    if (LLVM_LIKELY(hasSummary())) {
     
    448463inline Value * CarryManager::addToSummary(Value * const value) {
    449464    const unsigned summaryIndex = summaryPack();
     465    assert (summaryIndex < mCarryInPack.size());
    450466    Value * summary = mCarryOutPack[summaryIndex];
    451467    assert (summary);
     
    479495 ** ------------------------------------------------------------------------------------------------------------- */
    480496Value * CarryManager::getCarryPack(const unsigned packIndex) {
     497    assert (packIndex < mCarryInPack.size());
    481498    if (mCarryInPack[packIndex] == nullptr) {
    482499        Value * const packPtr = iBuilder->CreateGEP(mCarryPackBasePtr, iBuilder->getInt64(packIndex));
     
    491508 ** ------------------------------------------------------------------------------------------------------------- */
    492509void CarryManager::storeCarryOut(const unsigned packIndex) {
     510    assert (packIndex < mCarryInPack.size());
    493511    assert (mCarryOutPack[packIndex]);
    494512    assert (mCarryPackPtr[packIndex]);
  • icGREP/icgrep-devel/icgrep/pablo/pablo_compiler.cpp

    r5043 r5045  
    369369    } else if (const AtEOF * e = dyn_cast<AtEOF>(stmt)) {
    370370        Value * EOFmark = iBuilder->CreateLoad(mKernelBuilder->getInternalState("EOFmark"));
    371         expr = iBuilder->simd_and(compileExpression(e->getExpr()), EOFmark);
     371                expr = iBuilder->simd_and(compileExpression(e->getExpr()), EOFmark);
    372372    } else if (const Count * c = dyn_cast<Count>(stmt)) {
    373373        Value * const to_count = compileExpression(c->getExpr());
  • icGREP/icgrep-devel/icgrep/re/re_compiler.cpp

    r5042 r5045  
    4343namespace re {
    4444
    45 void RE_Compiler::initializeRequiredStreams() {
    46 
     45void RE_Compiler::initializeRequiredStreams(Encoding encoding) {
     46        if (encoding.getType() == Encoding::Type::UTF_8) {
     47                RE_Compiler::initializeRequiredStreams_utf8();
     48        }
     49        else if (encoding.getType() == Encoding::Type::UTF_16) {
     50                RE_Compiler::initializeRequiredStreams_utf16();
     51        }
     52}
     53               
     54void RE_Compiler::initializeRequiredStreams_utf16() {
     55    Assign * LF = mPB.createAssign("LF", mCCCompiler.compileCC(makeCC(0x000A)));
     56    PabloAST * CR = mCCCompiler.compileCC(makeCC(0x000D));
     57    PabloAST * LF_VT_FF_CR = mCCCompiler.compileCC(makeCC(0x000A, 0x000D));
     58    Assign * NEL = mPB.createAssign("NEL", mCCCompiler.compileCC(makeCC(0x0085)));
     59    Assign * LS_PS = mPB.createAssign("LS_PS", mCCCompiler.compileCC(makeCC(0x2028, 0x2029)));
     60    Assign * NEL_LS_PS = mPB.createAssign("NEL_LS_PS", mPB.createOr(NEL, LS_PS));
     61
     62    PabloAST * cr1 = mPB.createAdvance(CR, 1, "cr1");
     63    Assign * acrlf = mPB.createAssign("crlf", mPB.createAnd(cr1, LF));
     64    mCRLF = acrlf;
     65
     66        PabloAST * hi_surrogate = mCCCompiler.compileCC(makeCC(0xD800, 0xDBFF));
     67        //PabloAST * lo_surrogate = mCCCompiler.compileCC(makeCC(0xDC00, 0xDFFF));
     68        PabloAST * u16hi_hi_surrogate = mCCCompiler.compileCC(makeCC(0xD800, 0xDB00));    //u16hi_hi_surrogate = [\xD8-\xDB]
     69        PabloAST * u16hi_lo_surrogate = mCCCompiler.compileCC(makeCC(0xDC00, 0xDF00));    //u16hi_lo_surrogate = [\xDC-\xDF]
     70
     71        PabloAST * invalidTemp = mPB.createAdvance(u16hi_hi_surrogate, 1, "InvalidTemp");
     72    Assign * u16invalid = mPB.createAssign("u16invalid", mPB.createXor(invalidTemp, u16hi_lo_surrogate));//errors.Unicode=pablo.Advance(u16hi_hi_surrogate) ^ u16hi_lo_surrogate
     73    Assign * u16valid = mPB.createAssign("u16valid", mPB.createNot(u16invalid));
     74
     75    PabloAST * u16single_temp = mPB.createOr(mCCCompiler.compileCC(makeCC(0x0000, 0xD7FF)), mCCCompiler.compileCC(makeCC(0xE000, 0xFFFF)));
     76        PabloAST * u16single = mPB.createAnd(u16single_temp, mPB.createNot(u16invalid));
     77   
     78    mNonFinal = mPB.createAssign("nonfinal", mPB.createAnd(hi_surrogate, u16valid));
     79    mFinal = mPB.createNot(mPB.createOr(mNonFinal, u16invalid), "final");
     80        mInitial = mPB.createOr(u16single, hi_surrogate, "initial");
     81   
     82    PabloAST * LB_chars = mPB.createOr(LF_VT_FF_CR, NEL_LS_PS);
     83    PabloAST * UnicodeLineBreak = mPB.createAnd(LB_chars, mPB.createNot(mCRLF));  // count the CR, but not CRLF
     84    PabloAST * lb = UNICODE_LINE_BREAK ? UnicodeLineBreak : LF;
     85    PabloAST * unterminatedLineAtEOF = mPB.createAtEOF(mPB.createAdvance(mPB.createNot(LB_chars), 1));
     86    mLineBreak = mPB.createOr(lb, unterminatedLineAtEOF);
     87    mAny = mPB.createNot(lb, "any");
     88    mFunction.setResult(1, mPB.createAssign("lf", mLineBreak));
     89        return;
     90}
     91void RE_Compiler::initializeRequiredStreams_utf8() {
    4792    Assign * LF = mPB.createAssign("LF", mCCCompiler.compileCC(makeCC(0x0A)));
    4893    PabloAST * CR = mCCCompiler.compileCC(makeCC(0x0D));
     
    282327        }
    283328    };
    284 
    285329    re = resolve(re);
    286330    gather(re);
     
    295339        for (auto t : nameMap) {
    296340            if (t.second) {
    297                 mCompiledName.insert(std::make_pair(t.first, makeMarker(MarkerPosition::FinalMatchByte, t.second)));
     341                mCompiledName.insert(std::make_pair(t.first, makeMarker(MarkerPosition::FinalMatchUnit, t.second)));
    298342            }
    299343        }
     
    320364    RE * GCB_CR = makeName("gcb", "cr", Name::Type::UnicodeProperty);
    321365    RE * GCB_LF = makeName("gcb", "lf", Name::Type::UnicodeProperty);
    322     RE * GCB_Control_CR_LF = makeAlt({GCB_CR, GCB_LF, GCB_Control});
     366    RE * GCB_Control_CR_LF = makeAlt({GCB_CR, GCB_LF});
    323367
    324368    // Break at the start and end of text.
     
    359403
    360404void RE_Compiler::finalizeMatchResult(MarkerType match_result, bool InvertMatches) {
    361     PabloAST * match_follow = mPB.createMatchStar(markerVar(match_result), mAny);
     405        PabloAST * match_follow = mPB.createMatchStar(markerVar(match_result), mAny);
    362406    if (InvertMatches) {
    363407        match_follow = mPB.createNot(match_follow);
     
    367411
    368412MarkerType RE_Compiler::compile(RE * re, PabloBuilder & pb) {
    369     return process(re, makeMarker(MarkerPosition::FinalPostPositionByte, pb.createOnes()), pb);
     413    return process(re, makeMarker(MarkerPosition::FinalPostPositionUnit, pb.createOnes()), pb);
    370414}
    371415
     
    398442
    399443inline MarkerType RE_Compiler::compileAny(const MarkerType m, PabloBuilder & pb) {
    400     PabloAST * nextFinalByte = markerVar(AdvanceMarker(m, MarkerPosition::FinalPostPositionByte, pb));
     444    PabloAST * nextFinalByte = markerVar(AdvanceMarker(m, MarkerPosition::FinalPostPositionUnit, pb));
    401445    PabloAST * lb = mLineBreak;
    402446    if (UNICODE_LINE_BREAK) {
    403447        lb = pb.createOr(mLineBreak, mCRLF);
    404448    }
    405     return makeMarker(MarkerPosition::FinalMatchByte, pb.createAnd(nextFinalByte, pb.createNot(lb), "dot"));
     449    return makeMarker(MarkerPosition::FinalMatchUnit, pb.createAnd(nextFinalByte, pb.createNot(lb), "dot"));
    406450}
    407451
     
    409453    MarkerType nameMarker = compileName(name, pb);
    410454    MarkerType nextPos;
    411     if (markerPos(marker) == MarkerPosition::FinalPostPositionByte) {
     455    if (markerPos(marker) == MarkerPosition::FinalPostPositionUnit) {
    412456        nextPos = marker;
    413457    } else if (name->getType() == Name::Type::Byte) {
    414         nextPos = AdvanceMarker(marker, MarkerPosition::InitialPostPositionByte, pb);
     458        nextPos = AdvanceMarker(marker, MarkerPosition::InitialPostPositionUnit, pb);
    415459    } else {
    416         nextPos = AdvanceMarker(marker, MarkerPosition::FinalPostPositionByte, pb);
     460        nextPos = AdvanceMarker(marker, MarkerPosition::FinalPostPositionUnit, pb);
    417461    }
    418462    nameMarker.stream = pb.createAnd(markerVar(nextPos), markerVar(nameMarker), name->getName());
     
    465509    // The following may be useful to force a common Advance rather than separate
    466510    // Advances in each alternative.
    467     // MarkerType const base = makeMarker(InitialPostPositionByte, postPositionVar(marker, pb), pb);
     511    // MarkerType const base = makeMarker(InitialPostPositionUnit, postPositionVar(marker, pb), pb);
    468512    for (RE * re : *alt) {
    469513        MarkerType rslt = process(re, base, pb);
     
    471515        accum[p] = pb.createOr(accum[p], markerVar(rslt), "alt");
    472516    }
    473     if (isa<Zeroes>(accum[MarkerPosition::InitialPostPositionByte]) && isa<Zeroes>(accum[MarkerPosition::FinalPostPositionByte])) {
    474         return makeMarker(MarkerPosition::FinalMatchByte, accum[MarkerPosition::FinalMatchByte]);
    475     }
    476     PabloAST * combine = pb.createOr(accum[InitialPostPositionByte], pb.createAdvance(accum[MarkerPosition::FinalMatchByte], 1), "alt");
    477     if (isa<Zeroes>(accum[FinalPostPositionByte])) {
    478         return makeMarker(InitialPostPositionByte, combine);
    479     }
    480     combine = pb.createOr(pb.createScanThru(pb.createAnd(mInitial, combine), mNonFinal), accum[MarkerPosition::FinalPostPositionByte], "alt");
    481     return makeMarker(MarkerPosition::FinalPostPositionByte, combine);
     517    if (isa<Zeroes>(accum[MarkerPosition::InitialPostPositionUnit]) && isa<Zeroes>(accum[MarkerPosition::FinalPostPositionUnit])) {
     518        return makeMarker(MarkerPosition::FinalMatchUnit, accum[MarkerPosition::FinalMatchUnit]);
     519    }
     520    PabloAST * combine = pb.createOr(accum[InitialPostPositionUnit], pb.createAdvance(accum[MarkerPosition::FinalMatchUnit], 1), "alt");
     521    if (isa<Zeroes>(accum[FinalPostPositionUnit])) {
     522        return makeMarker(InitialPostPositionUnit, combine);
     523    }
     524    combine = pb.createOr(pb.createScanThru(pb.createAnd(mInitial, combine), mNonFinal), accum[MarkerPosition::FinalPostPositionUnit], "alt");
     525    return makeMarker(MarkerPosition::FinalPostPositionUnit, combine);
    482526}
    483527
     
    494538    } else if (isUnicodeUnitLength(asserted)) {
    495539        MarkerType lookahead = compile(asserted, pb);
    496         if (LLVM_LIKELY(markerPos(lookahead) == MarkerPosition::FinalMatchByte)) {
     540        if (LLVM_LIKELY(markerPos(lookahead) == MarkerPosition::FinalMatchUnit)) {
    497541            PabloAST * la = markerVar(lookahead);
    498542            if (a->getSense() == Assertion::Sense::Negative) {
    499543                la = pb.createNot(la);
    500544            }
    501             MarkerType fbyte = AdvanceMarker(marker, MarkerPosition::FinalPostPositionByte, pb);
    502             return makeMarker(MarkerPosition::FinalPostPositionByte, pb.createAnd(markerVar(fbyte), la, "lookahead"));
     545            MarkerType fbyte = AdvanceMarker(marker, MarkerPosition::FinalPostPositionUnit, pb);
     546            return makeMarker(MarkerPosition::FinalPostPositionUnit, pb.createAnd(markerVar(fbyte), la, "lookahead"));
    503547        }
    504548    }
     
    597641        PabloAST * cc = markerVar(compile(repeated, pb));
    598642        PabloAST * cc_lb = consecutive_matches(cc, 1, lb, pb);
    599         PabloAST * marker_fwd = pb.createAdvance(markerVar(marker), markerPos(marker) == MarkerPosition::FinalMatchByte ? lb : lb - 1);
    600         return makeMarker(MarkerPosition::FinalMatchByte, pb.createAnd(marker_fwd, cc_lb, "lowerbound"));
     643        PabloAST * marker_fwd = pb.createAdvance(markerVar(marker), markerPos(marker) == MarkerPosition::FinalMatchUnit ? lb : lb - 1);
     644        return makeMarker(MarkerPosition::FinalMatchUnit, pb.createAnd(marker_fwd, cc_lb, "lowerbound"));
    601645    }
    602646    // Fall through to general case.
     
    604648        marker = process(repeated, marker, pb);
    605649        if (mGraphemeBoundaryRule) {
    606             marker = AdvanceMarker(marker, MarkerPosition::FinalPostPositionByte, pb);
     650            marker = AdvanceMarker(marker, MarkerPosition::FinalPostPositionUnit, pb);
    607651        }
    608652    }
     
    615659        // Create a mask of positions reachable within ub from current marker.
    616660        // Use matchstar, then apply filter.
    617         PabloAST * match = markerVar(AdvanceMarker(marker, MarkerPosition::InitialPostPositionByte, pb));
     661        PabloAST * match = markerVar(AdvanceMarker(marker, MarkerPosition::InitialPostPositionUnit, pb));
    618662        PabloAST * upperLimitMask = reachable(match, 1, ub, pb);
    619         PabloAST * cursor = markerVar(AdvanceMarker(marker, MarkerPosition::InitialPostPositionByte, pb));
     663        PabloAST * cursor = markerVar(AdvanceMarker(marker, MarkerPosition::InitialPostPositionUnit, pb));
    620664        PabloAST * rep_class_var = markerVar(compile(repeated, pb));
    621         return makeMarker(MarkerPosition::InitialPostPositionByte, pb.createAnd(pb.createMatchStar(cursor, rep_class_var), upperLimitMask, "bounded"));
     665        return makeMarker(MarkerPosition::InitialPostPositionUnit, pb.createAnd(pb.createMatchStar(cursor, rep_class_var), upperLimitMask, "bounded"));
    622666    }
    623667    // Fall through to general case.
     
    628672        marker = makeMarker(markerPos(a), pb.createOr(markerVar(a), markerVar(m), "upper" + std::to_string(i)));
    629673        if (mGraphemeBoundaryRule) {
    630             marker = AdvanceMarker(marker, MarkerPosition::FinalPostPositionByte, pb);
     674            marker = AdvanceMarker(marker, MarkerPosition::FinalPostPositionUnit, pb);
    631675        }
    632676    }
     
    636680MarkerType RE_Compiler::processUnboundedRep(RE * repeated, MarkerType marker, PabloBuilder & pb) {
    637681    // always use PostPosition markers for unbounded repetition.
    638     PabloAST * base = markerVar(AdvanceMarker(marker, MarkerPosition::InitialPostPositionByte, pb));
     682    PabloAST * base = markerVar(AdvanceMarker(marker, MarkerPosition::InitialPostPositionUnit, pb));
    639683    if (!mGraphemeBoundaryRule && isByteLength(repeated)  && !AlgorithmOptionIsSet(DisableMatchStar)) {
    640684        PabloAST * cc = markerVar(compile(repeated, pb));
    641685        PabloAST * mstar = nullptr;
    642686        mstar = pb.createMatchStar(base, cc, "unbounded");
    643         return makeMarker(MarkerPosition::InitialPostPositionByte, mstar);
     687        return makeMarker(MarkerPosition::InitialPostPositionUnit, mstar);
    644688    } else if (isUnicodeUnitLength(repeated) && !AlgorithmOptionIsSet(DisableMatchStar) && !AlgorithmOptionIsSet(DisableUnicodeMatchStar)) {
    645689        PabloAST * cc = markerVar(compile(repeated, pb));
     
    655699            final = mGraphemeBoundaryRule;
    656700        }
    657         return makeMarker(MarkerPosition::FinalPostPositionByte, pb.createAnd(mstar, final, "unbounded"));
     701        return makeMarker(MarkerPosition::FinalPostPositionUnit, pb.createAnd(mstar, final, "unbounded"));
    658702    } else if (mStarDepth > 0){
    659703        PabloBuilder * outerb = pb.getParent();
     
    663707        PabloAST * m1 = pb.createOr(base, starPending);
    664708        PabloAST * m2 = pb.createOr(base, starAccum);
    665         MarkerType result = process(repeated, makeMarker(MarkerPosition::InitialPostPositionByte, m1), pb);
    666         result = AdvanceMarker(result, MarkerPosition::InitialPostPositionByte, pb);
     709        MarkerType result = process(repeated, makeMarker(MarkerPosition::InitialPostPositionUnit, m1), pb);
     710        result = AdvanceMarker(result, MarkerPosition::InitialPostPositionUnit, pb);
    667711        PabloAST * loopComputation = markerVar(result);
    668712        Next * nextPending = pb.createNext(starPending, pb.createAnd(loopComputation, pb.createNot(m2)));
     
    680724        PabloBuilder wb = PabloBuilder::Create(pb);
    681725        mStarDepth++;
    682         MarkerType result = process(repeated, makeMarker(MarkerPosition::InitialPostPositionByte, whilePending), wb);
    683         result = AdvanceMarker(result, MarkerPosition::InitialPostPositionByte, wb);
     726        MarkerType result = process(repeated, makeMarker(MarkerPosition::InitialPostPositionUnit, whilePending), wb);
     727        result = AdvanceMarker(result, MarkerPosition::InitialPostPositionUnit, wb);
    684728        PabloAST * loopComputation = markerVar(result);
    685729        Next * nextWhilePending = wb.createNext(whilePending, wb.createAnd(loopComputation, wb.createNot(whileAccum)));
     
    697741
    698742inline MarkerType RE_Compiler::compileStart(const MarkerType marker, pablo::PabloBuilder & pb) {
    699     MarkerType m = AdvanceMarker(marker, MarkerPosition::InitialPostPositionByte, pb);
     743    MarkerType m = AdvanceMarker(marker, MarkerPosition::InitialPostPositionUnit, pb);
    700744    if (UNICODE_LINE_BREAK) {
    701745        PabloAST * line_end = mPB.createOr(mLineBreak, mCRLF);
    702746        PabloAST * sol = pb.createNot(pb.createOr(pb.createAdvance(pb.createNot(line_end), 1), mCRLF));
    703         return makeMarker(MarkerPosition::InitialPostPositionByte, pb.createAnd(markerVar(m), sol, "sol"));
     747        return makeMarker(MarkerPosition::InitialPostPositionUnit, pb.createAnd(markerVar(m), sol, "sol"));
    704748    } else {
    705749        PabloAST * sol = pb.createNot(pb.createAdvance(pb.createNot(mLineBreak), 1));
    706         return makeMarker(MarkerPosition::FinalPostPositionByte, pb.createAnd(markerVar(m), sol, "sol"));
     750        return makeMarker(MarkerPosition::FinalPostPositionUnit, pb.createAnd(markerVar(m), sol, "sol"));
    707751    }
    708752}
     
    710754inline MarkerType RE_Compiler::compileEnd(const MarkerType marker, pablo::PabloBuilder & pb) {
    711755    if (UNICODE_LINE_BREAK) {
    712         PabloAST * nextPos = markerVar(AdvanceMarker(marker, MarkerPosition::FinalPostPositionByte, pb));
    713         return makeMarker(MarkerPosition::FinalPostPositionByte, pb.createAnd(nextPos, mLineBreak, "eol"));
     756        PabloAST * nextPos = markerVar(AdvanceMarker(marker, MarkerPosition::FinalPostPositionUnit, pb));
     757        return makeMarker(MarkerPosition::FinalPostPositionUnit, pb.createAnd(nextPos, mLineBreak, "eol"));
    714758    } else {
    715         PabloAST * nextPos = markerVar(AdvanceMarker(marker, MarkerPosition::InitialPostPositionByte, pb));  // For LF match
    716         return makeMarker(MarkerPosition::FinalPostPositionByte, pb.createAnd(nextPos, mLineBreak, "eol"));
     759        PabloAST * nextPos = markerVar(AdvanceMarker(marker, MarkerPosition::InitialPostPositionUnit, pb));  // For LF match
     760        return makeMarker(MarkerPosition::FinalPostPositionUnit, pb.createAnd(nextPos, mLineBreak, "eol"));
    717761    }
    718762}
     
    725769        mGraphemeBoundaryRule = markerVar(f->second);
    726770        marker = process(gb->getExpression(), marker, pb);
    727         marker = AdvanceMarker(marker, MarkerPosition::FinalPostPositionByte, pb);
     771        marker = AdvanceMarker(marker, MarkerPosition::FinalPostPositionUnit, pb);
    728772        mGraphemeBoundaryRule = graphemeBoundaryRule;
    729773    } else {
    730         marker = AdvanceMarker(marker, MarkerPosition::FinalPostPositionByte, pb);
     774        marker = AdvanceMarker(marker, MarkerPosition::FinalPostPositionUnit, pb);
    731775        PabloAST * rule = markerVar(f->second);
    732776        if (gb->getSense() == GraphemeBoundary::Sense::Negative) {
    733777            rule = pb.createNot(rule);
    734778        }
    735         marker = makeMarker(MarkerPosition::FinalPostPositionByte, pb.createAnd(markerVar(marker), rule, "gb"));
     779        marker = makeMarker(MarkerPosition::FinalPostPositionUnit, pb.createAnd(markerVar(marker), rule, "gb"));
    736780    }
    737781    return marker;
     
    740784inline MarkerType RE_Compiler::AdvanceMarker(MarkerType marker, const MarkerPosition newpos, PabloBuilder & pb) {
    741785    if (marker.pos != newpos) {
    742         if (marker.pos == MarkerPosition::FinalMatchByte) {
     786        if (marker.pos == MarkerPosition::FinalMatchUnit) {
    743787            marker.stream = pb.createAdvance(marker.stream, 1, "ipp");
    744             marker.pos = MarkerPosition::InitialPostPositionByte;
    745         }
    746         if (newpos == MarkerPosition::FinalPostPositionByte) {
     788            marker.pos = MarkerPosition::InitialPostPositionUnit;
     789        }
     790        if (newpos == MarkerPosition::FinalPostPositionUnit) {
    747791            PabloAST * nonFinal = mNonFinal;
    748792            if (mGraphemeBoundaryRule) {
     
    750794            }
    751795            marker.stream = pb.createScanThru(pb.createAnd(mInitial, marker.stream), nonFinal, "fpp");
    752             marker.pos = MarkerPosition::FinalPostPositionByte;
     796            marker.pos = MarkerPosition::FinalPostPositionUnit;
    753797        }
    754798    }
  • icGREP/icgrep-devel/icgrep/re/re_compiler.h

    r5042 r5045  
    2323/*   Marker streams represent the results of matching steps.
    2424     Three types of marker streams are used internally.
    25      FinalMatchByte markers are used for character classes and
     25     FinalMatchUnit markers are used for character classes and
    2626     other strings identified by a one bit at their final position.
    27      InitialPostPositionByte markers are used to mark matches with
    28      a 1 bit immediately after a match.   InitialPostPositionByte markers
     27     InitialPostPositionUnit markers are used to mark matches with
     28     a 1 bit immediately after a match.   InitialPostPositionUnit markers
    2929     are generally required whenever a regular expression element
    3030     can match the empty string (e.g., * and ? repeated items).
    31      FinalPostPositionByte markers are used for single code unit
     31     FinalPostPositionUnit markers are used for single code unit
    3232     lookahead assertions. 
    3333*/
     
    3535namespace re {
    3636
    37 enum MarkerPosition {FinalMatchByte, InitialPostPositionByte, FinalPostPositionByte};
     37enum MarkerPosition {FinalMatchUnit, InitialPostPositionUnit, FinalPostPositionUnit};
    3838
    3939struct MarkerType {
     
    5353
    5454    RE_Compiler(pablo::PabloFunction & function, cc::CC_Compiler & ccCompiler);
    55     void initializeRequiredStreams();
     55    void initializeRequiredStreams(Encoding encoding);
    5656    void compileUnicodeNames(RE *& re);
    5757    void finalizeMatchResult(MarkerType match_result, bool InvertMatches = false);
     
    6262private:
    6363
     64    void initializeRequiredStreams_utf8();
     65    void initializeRequiredStreams_utf16();
    6466    MarkerType compile(RE * re, pablo::PabloBuilder & cg);
    6567
  • icGREP/icgrep-devel/icgrep/re/re_toolchain.cpp

    r5033 r5045  
    8787   
    8888PabloFunction * re2pablo_compiler(const Encoding encoding, RE * re_ast) {
    89     PabloFunction * function = PabloFunction::Create("process_block", 8, 2);
     89    PabloFunction * function = PabloFunction::Create("process_block", encoding.getBits(), 2);
    9090    cc::CC_Compiler cc_compiler(*function, encoding);
    9191    re::RE_Compiler re_compiler(*function, cc_compiler);
    92     re_compiler.initializeRequiredStreams();
     92    re_compiler.initializeRequiredStreams(encoding);
    9393    re_compiler.compileUnicodeNames(re_ast);
    9494    re_compiler.finalizeMatchResult(re_compiler.compile(re_ast), AlgorithmOptions.isSet(InvertMatches));
  • icGREP/icgrep-devel/icgrep/utf_encoding.h

    r4209 r5045  
    1818        ASCII
    1919        , UTF_8
     20                , UTF_16
    2021    };
    2122
Note: See TracChangeset for help on using the changeset viewer.