Ignore:
Timestamp:
Jun 9, 2016, 3:34:07 PM (3 years ago)
Author:
xuedongx
Message:

Support over UTF-16 representation of Unicode

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/re/re_compiler.cpp

    r5042 r5045  
    4343namespace re {
    4444
    45 void RE_Compiler::initializeRequiredStreams() {
    46 
     45void RE_Compiler::initializeRequiredStreams(Encoding encoding) {
     46        if (encoding.getType() == Encoding::Type::UTF_8) {
     47                RE_Compiler::initializeRequiredStreams_utf8();
     48        }
     49        else if (encoding.getType() == Encoding::Type::UTF_16) {
     50                RE_Compiler::initializeRequiredStreams_utf16();
     51        }
     52}
     53               
     54void RE_Compiler::initializeRequiredStreams_utf16() {
     55    Assign * LF = mPB.createAssign("LF", mCCCompiler.compileCC(makeCC(0x000A)));
     56    PabloAST * CR = mCCCompiler.compileCC(makeCC(0x000D));
     57    PabloAST * LF_VT_FF_CR = mCCCompiler.compileCC(makeCC(0x000A, 0x000D));
     58    Assign * NEL = mPB.createAssign("NEL", mCCCompiler.compileCC(makeCC(0x0085)));
     59    Assign * LS_PS = mPB.createAssign("LS_PS", mCCCompiler.compileCC(makeCC(0x2028, 0x2029)));
     60    Assign * NEL_LS_PS = mPB.createAssign("NEL_LS_PS", mPB.createOr(NEL, LS_PS));
     61
     62    PabloAST * cr1 = mPB.createAdvance(CR, 1, "cr1");
     63    Assign * acrlf = mPB.createAssign("crlf", mPB.createAnd(cr1, LF));
     64    mCRLF = acrlf;
     65
     66        PabloAST * hi_surrogate = mCCCompiler.compileCC(makeCC(0xD800, 0xDBFF));
     67        //PabloAST * lo_surrogate = mCCCompiler.compileCC(makeCC(0xDC00, 0xDFFF));
     68        PabloAST * u16hi_hi_surrogate = mCCCompiler.compileCC(makeCC(0xD800, 0xDB00));    //u16hi_hi_surrogate = [\xD8-\xDB]
     69        PabloAST * u16hi_lo_surrogate = mCCCompiler.compileCC(makeCC(0xDC00, 0xDF00));    //u16hi_lo_surrogate = [\xDC-\xDF]
     70
     71        PabloAST * invalidTemp = mPB.createAdvance(u16hi_hi_surrogate, 1, "InvalidTemp");
     72    Assign * u16invalid = mPB.createAssign("u16invalid", mPB.createXor(invalidTemp, u16hi_lo_surrogate));//errors.Unicode=pablo.Advance(u16hi_hi_surrogate) ^ u16hi_lo_surrogate
     73    Assign * u16valid = mPB.createAssign("u16valid", mPB.createNot(u16invalid));
     74
     75    PabloAST * u16single_temp = mPB.createOr(mCCCompiler.compileCC(makeCC(0x0000, 0xD7FF)), mCCCompiler.compileCC(makeCC(0xE000, 0xFFFF)));
     76        PabloAST * u16single = mPB.createAnd(u16single_temp, mPB.createNot(u16invalid));
     77   
     78    mNonFinal = mPB.createAssign("nonfinal", mPB.createAnd(hi_surrogate, u16valid));
     79    mFinal = mPB.createNot(mPB.createOr(mNonFinal, u16invalid), "final");
     80        mInitial = mPB.createOr(u16single, hi_surrogate, "initial");
     81   
     82    PabloAST * LB_chars = mPB.createOr(LF_VT_FF_CR, NEL_LS_PS);
     83    PabloAST * UnicodeLineBreak = mPB.createAnd(LB_chars, mPB.createNot(mCRLF));  // count the CR, but not CRLF
     84    PabloAST * lb = UNICODE_LINE_BREAK ? UnicodeLineBreak : LF;
     85    PabloAST * unterminatedLineAtEOF = mPB.createAtEOF(mPB.createAdvance(mPB.createNot(LB_chars), 1));
     86    mLineBreak = mPB.createOr(lb, unterminatedLineAtEOF);
     87    mAny = mPB.createNot(lb, "any");
     88    mFunction.setResult(1, mPB.createAssign("lf", mLineBreak));
     89        return;
     90}
     91void RE_Compiler::initializeRequiredStreams_utf8() {
    4792    Assign * LF = mPB.createAssign("LF", mCCCompiler.compileCC(makeCC(0x0A)));
    4893    PabloAST * CR = mCCCompiler.compileCC(makeCC(0x0D));
     
    282327        }
    283328    };
    284 
    285329    re = resolve(re);
    286330    gather(re);
     
    295339        for (auto t : nameMap) {
    296340            if (t.second) {
    297                 mCompiledName.insert(std::make_pair(t.first, makeMarker(MarkerPosition::FinalMatchByte, t.second)));
     341                mCompiledName.insert(std::make_pair(t.first, makeMarker(MarkerPosition::FinalMatchUnit, t.second)));
    298342            }
    299343        }
     
    320364    RE * GCB_CR = makeName("gcb", "cr", Name::Type::UnicodeProperty);
    321365    RE * GCB_LF = makeName("gcb", "lf", Name::Type::UnicodeProperty);
    322     RE * GCB_Control_CR_LF = makeAlt({GCB_CR, GCB_LF, GCB_Control});
     366    RE * GCB_Control_CR_LF = makeAlt({GCB_CR, GCB_LF});
    323367
    324368    // Break at the start and end of text.
     
    359403
    360404void RE_Compiler::finalizeMatchResult(MarkerType match_result, bool InvertMatches) {
    361     PabloAST * match_follow = mPB.createMatchStar(markerVar(match_result), mAny);
     405        PabloAST * match_follow = mPB.createMatchStar(markerVar(match_result), mAny);
    362406    if (InvertMatches) {
    363407        match_follow = mPB.createNot(match_follow);
     
    367411
    // Compiles `re` by calling process() with a start marker built over an
    // all-ones stream (pb.createOnes()) and returns the marker process() yields.
    // This changeset only renames the start position from FinalPostPositionByte
    // to FinalPostPositionUnit.
    368412MarkerType RE_Compiler::compile(RE * re, PabloBuilder & pb) {
    369     return process(re, makeMarker(MarkerPosition::FinalPostPositionByte, pb.createOnes()), pb);
     413    return process(re, makeMarker(MarkerPosition::FinalPostPositionUnit, pb.createOnes()), pb);
    370414}
    371415
     
    398442
    // Builds the "dot" stream: marker `m` is advanced to the final post position
    // and then restricted to positions that are not line breaks. The result is
    // returned as a FinalMatchUnit marker (FinalMatchByte before this changeset).
    399443inline MarkerType RE_Compiler::compileAny(const MarkerType m, PabloBuilder & pb) {
    400     PabloAST * nextFinalByte = markerVar(AdvanceMarker(m, MarkerPosition::FinalPostPositionByte, pb));
     444    PabloAST * nextFinalByte = markerVar(AdvanceMarker(m, MarkerPosition::FinalPostPositionUnit, pb));
    401445    PabloAST * lb = mLineBreak;
              // With UNICODE_LINE_BREAK the CRLF positions are excluded as well.
    402446    if (UNICODE_LINE_BREAK) {
    403447        lb = pb.createOr(mLineBreak, mCRLF);
    404448    }
    405     return makeMarker(MarkerPosition::FinalMatchByte, pb.createAnd(nextFinalByte, pb.createNot(lb), "dot"));
     449    return makeMarker(MarkerPosition::FinalMatchUnit, pb.createAnd(nextFinalByte, pb.createNot(lb), "dot"));
    406450}
    407451
     
    409453    MarkerType nameMarker = compileName(name, pb);
    410454    MarkerType nextPos;
    411     if (markerPos(marker) == MarkerPosition::FinalPostPositionByte) {
     455    if (markerPos(marker) == MarkerPosition::FinalPostPositionUnit) {
    412456        nextPos = marker;
    413457    } else if (name->getType() == Name::Type::Byte) {
    414         nextPos = AdvanceMarker(marker, MarkerPosition::InitialPostPositionByte, pb);
     458        nextPos = AdvanceMarker(marker, MarkerPosition::InitialPostPositionUnit, pb);
    415459    } else {
    416         nextPos = AdvanceMarker(marker, MarkerPosition::FinalPostPositionByte, pb);
     460        nextPos = AdvanceMarker(marker, MarkerPosition::FinalPostPositionUnit, pb);
    417461    }
    418462    nameMarker.stream = pb.createAnd(markerVar(nextPos), markerVar(nameMarker), name->getName());
     
    465509    // The following may be useful to force a common Advance rather than separate
    466510    // Advances in each alternative.
    467     // MarkerType const base = makeMarker(InitialPostPositionByte, postPositionVar(marker, pb), pb);
     511    // MarkerType const base = makeMarker(InitialPostPositionUnit, postPositionVar(marker, pb), pb);
    468512    for (RE * re : *alt) {
    469513        MarkerType rslt = process(re, base, pb);
     
    471515        accum[p] = pb.createOr(accum[p], markerVar(rslt), "alt");
    472516    }
    473     if (isa<Zeroes>(accum[MarkerPosition::InitialPostPositionByte]) && isa<Zeroes>(accum[MarkerPosition::FinalPostPositionByte])) {
    474         return makeMarker(MarkerPosition::FinalMatchByte, accum[MarkerPosition::FinalMatchByte]);
    475     }
    476     PabloAST * combine = pb.createOr(accum[InitialPostPositionByte], pb.createAdvance(accum[MarkerPosition::FinalMatchByte], 1), "alt");
    477     if (isa<Zeroes>(accum[FinalPostPositionByte])) {
    478         return makeMarker(InitialPostPositionByte, combine);
    479     }
    480     combine = pb.createOr(pb.createScanThru(pb.createAnd(mInitial, combine), mNonFinal), accum[MarkerPosition::FinalPostPositionByte], "alt");
    481     return makeMarker(MarkerPosition::FinalPostPositionByte, combine);
     517    if (isa<Zeroes>(accum[MarkerPosition::InitialPostPositionUnit]) && isa<Zeroes>(accum[MarkerPosition::FinalPostPositionUnit])) {
     518        return makeMarker(MarkerPosition::FinalMatchUnit, accum[MarkerPosition::FinalMatchUnit]);
     519    }
     520    PabloAST * combine = pb.createOr(accum[InitialPostPositionUnit], pb.createAdvance(accum[MarkerPosition::FinalMatchUnit], 1), "alt");
     521    if (isa<Zeroes>(accum[FinalPostPositionUnit])) {
     522        return makeMarker(InitialPostPositionUnit, combine);
     523    }
     524    combine = pb.createOr(pb.createScanThru(pb.createAnd(mInitial, combine), mNonFinal), accum[MarkerPosition::FinalPostPositionUnit], "alt");
     525    return makeMarker(MarkerPosition::FinalPostPositionUnit, combine);
    482526}
    483527
     
    494538    } else if (isUnicodeUnitLength(asserted)) {
    495539        MarkerType lookahead = compile(asserted, pb);
    496         if (LLVM_LIKELY(markerPos(lookahead) == MarkerPosition::FinalMatchByte)) {
     540        if (LLVM_LIKELY(markerPos(lookahead) == MarkerPosition::FinalMatchUnit)) {
    497541            PabloAST * la = markerVar(lookahead);
    498542            if (a->getSense() == Assertion::Sense::Negative) {
    499543                la = pb.createNot(la);
    500544            }
    501             MarkerType fbyte = AdvanceMarker(marker, MarkerPosition::FinalPostPositionByte, pb);
    502             return makeMarker(MarkerPosition::FinalPostPositionByte, pb.createAnd(markerVar(fbyte), la, "lookahead"));
     545            MarkerType fbyte = AdvanceMarker(marker, MarkerPosition::FinalPostPositionUnit, pb);
     546            return makeMarker(MarkerPosition::FinalPostPositionUnit, pb.createAnd(markerVar(fbyte), la, "lookahead"));
    503547        }
    504548    }
     
    597641        PabloAST * cc = markerVar(compile(repeated, pb));
    598642        PabloAST * cc_lb = consecutive_matches(cc, 1, lb, pb);
    599         PabloAST * marker_fwd = pb.createAdvance(markerVar(marker), markerPos(marker) == MarkerPosition::FinalMatchByte ? lb : lb - 1);
    600         return makeMarker(MarkerPosition::FinalMatchByte, pb.createAnd(marker_fwd, cc_lb, "lowerbound"));
     643        PabloAST * marker_fwd = pb.createAdvance(markerVar(marker), markerPos(marker) == MarkerPosition::FinalMatchUnit ? lb : lb - 1);
     644        return makeMarker(MarkerPosition::FinalMatchUnit, pb.createAnd(marker_fwd, cc_lb, "lowerbound"));
    601645    }
    602646    // Fall through to general case.
     
    604648        marker = process(repeated, marker, pb);
    605649        if (mGraphemeBoundaryRule) {
    606             marker = AdvanceMarker(marker, MarkerPosition::FinalPostPositionByte, pb);
     650            marker = AdvanceMarker(marker, MarkerPosition::FinalPostPositionUnit, pb);
    607651        }
    608652    }
     
    615659        // Create a mask of positions reachable within ub from current marker.
    616660        // Use matchstar, then apply filter.
    617         PabloAST * match = markerVar(AdvanceMarker(marker, MarkerPosition::InitialPostPositionByte, pb));
     661        PabloAST * match = markerVar(AdvanceMarker(marker, MarkerPosition::InitialPostPositionUnit, pb));
    618662        PabloAST * upperLimitMask = reachable(match, 1, ub, pb);
    619         PabloAST * cursor = markerVar(AdvanceMarker(marker, MarkerPosition::InitialPostPositionByte, pb));
     663        PabloAST * cursor = markerVar(AdvanceMarker(marker, MarkerPosition::InitialPostPositionUnit, pb));
    620664        PabloAST * rep_class_var = markerVar(compile(repeated, pb));
    621         return makeMarker(MarkerPosition::InitialPostPositionByte, pb.createAnd(pb.createMatchStar(cursor, rep_class_var), upperLimitMask, "bounded"));
     665        return makeMarker(MarkerPosition::InitialPostPositionUnit, pb.createAnd(pb.createMatchStar(cursor, rep_class_var), upperLimitMask, "bounded"));
    622666    }
    623667    // Fall through to general case.
     
    628672        marker = makeMarker(markerPos(a), pb.createOr(markerVar(a), markerVar(m), "upper" + std::to_string(i)));
    629673        if (mGraphemeBoundaryRule) {
    630             marker = AdvanceMarker(marker, MarkerPosition::FinalPostPositionByte, pb);
     674            marker = AdvanceMarker(marker, MarkerPosition::FinalPostPositionUnit, pb);
    631675        }
    632676    }
     
    636680MarkerType RE_Compiler::processUnboundedRep(RE * repeated, MarkerType marker, PabloBuilder & pb) {
    637681    // always use PostPosition markers for unbounded repetition.
    638     PabloAST * base = markerVar(AdvanceMarker(marker, MarkerPosition::InitialPostPositionByte, pb));
     682    PabloAST * base = markerVar(AdvanceMarker(marker, MarkerPosition::InitialPostPositionUnit, pb));
    639683    if (!mGraphemeBoundaryRule && isByteLength(repeated)  && !AlgorithmOptionIsSet(DisableMatchStar)) {
    640684        PabloAST * cc = markerVar(compile(repeated, pb));
    641685        PabloAST * mstar = nullptr;
    642686        mstar = pb.createMatchStar(base, cc, "unbounded");
    643         return makeMarker(MarkerPosition::InitialPostPositionByte, mstar);
     687        return makeMarker(MarkerPosition::InitialPostPositionUnit, mstar);
    644688    } else if (isUnicodeUnitLength(repeated) && !AlgorithmOptionIsSet(DisableMatchStar) && !AlgorithmOptionIsSet(DisableUnicodeMatchStar)) {
    645689        PabloAST * cc = markerVar(compile(repeated, pb));
     
    655699            final = mGraphemeBoundaryRule;
    656700        }
    657         return makeMarker(MarkerPosition::FinalPostPositionByte, pb.createAnd(mstar, final, "unbounded"));
     701        return makeMarker(MarkerPosition::FinalPostPositionUnit, pb.createAnd(mstar, final, "unbounded"));
    658702    } else if (mStarDepth > 0){
    659703        PabloBuilder * outerb = pb.getParent();
     
    663707        PabloAST * m1 = pb.createOr(base, starPending);
    664708        PabloAST * m2 = pb.createOr(base, starAccum);
    665         MarkerType result = process(repeated, makeMarker(MarkerPosition::InitialPostPositionByte, m1), pb);
    666         result = AdvanceMarker(result, MarkerPosition::InitialPostPositionByte, pb);
     709        MarkerType result = process(repeated, makeMarker(MarkerPosition::InitialPostPositionUnit, m1), pb);
     710        result = AdvanceMarker(result, MarkerPosition::InitialPostPositionUnit, pb);
    667711        PabloAST * loopComputation = markerVar(result);
    668712        Next * nextPending = pb.createNext(starPending, pb.createAnd(loopComputation, pb.createNot(m2)));
     
    680724        PabloBuilder wb = PabloBuilder::Create(pb);
    681725        mStarDepth++;
    682         MarkerType result = process(repeated, makeMarker(MarkerPosition::InitialPostPositionByte, whilePending), wb);
    683         result = AdvanceMarker(result, MarkerPosition::InitialPostPositionByte, wb);
     726        MarkerType result = process(repeated, makeMarker(MarkerPosition::InitialPostPositionUnit, whilePending), wb);
     727        result = AdvanceMarker(result, MarkerPosition::InitialPostPositionUnit, wb);
    684728        PabloAST * loopComputation = markerVar(result);
    685729        Next * nextWhilePending = wb.createNext(whilePending, wb.createAnd(loopComputation, wb.createNot(whileAccum)));
     
    697741
    // Start-of-line assertion: advances `marker` to the initial post position and
    // keeps only positions whose preceding position is a line end ("sol").
    698742inline MarkerType RE_Compiler::compileStart(const MarkerType marker, pablo::PabloBuilder & pb) {
    699     MarkerType m = AdvanceMarker(marker, MarkerPosition::InitialPostPositionByte, pb);
     743    MarkerType m = AdvanceMarker(marker, MarkerPosition::InitialPostPositionUnit, pb);
    700744    if (UNICODE_LINE_BREAK) {
                  // NOTE(review): line_end is created through mPB while everything else
                  // here uses the local builder pb - confirm this is intentional.
    701745        PabloAST * line_end = mPB.createOr(mLineBreak, mCRLF);
                  // A start of line follows a line end and is not itself a CRLF position.
    702746        PabloAST * sol = pb.createNot(pb.createOr(pb.createAdvance(pb.createNot(line_end), 1), mCRLF));
    703         return makeMarker(MarkerPosition::InitialPostPositionByte, pb.createAnd(markerVar(m), sol, "sol"));
     747        return makeMarker(MarkerPosition::InitialPostPositionUnit, pb.createAnd(markerVar(m), sol, "sol"));
    704748    } else {
    705749        PabloAST * sol = pb.createNot(pb.createAdvance(pb.createNot(mLineBreak), 1));
                  // NOTE(review): m was advanced to the Initial post position, but the result
                  // is labelled as a Final post position - presumably equivalent right after
                  // a line break; verify.
    706         return makeMarker(MarkerPosition::FinalPostPositionByte, pb.createAnd(markerVar(m), sol, "sol"));
     750        return makeMarker(MarkerPosition::FinalPostPositionUnit, pb.createAnd(markerVar(m), sol, "sol"));
    707751    }
    708752}
     
    // End-of-line assertion: advances `marker` to the next position and keeps only
    // positions that coincide with mLineBreak ("eol"). Both branches return a
    // marker labelled as a Final post position.
    710754inline MarkerType RE_Compiler::compileEnd(const MarkerType marker, pablo::PabloBuilder & pb) {
    711755    if (UNICODE_LINE_BREAK) {
    712         PabloAST * nextPos = markerVar(AdvanceMarker(marker, MarkerPosition::FinalPostPositionByte, pb));
    713         return makeMarker(MarkerPosition::FinalPostPositionByte, pb.createAnd(nextPos, mLineBreak, "eol"));
     756        PabloAST * nextPos = markerVar(AdvanceMarker(marker, MarkerPosition::FinalPostPositionUnit, pb));
     757        return makeMarker(MarkerPosition::FinalPostPositionUnit, pb.createAnd(nextPos, mLineBreak, "eol"));
    714758    } else {
                  // NOTE(review): the marker is advanced only to the Initial post position here
                  // yet returned as a Final post position - confirm this is intended.
    715         PabloAST * nextPos = markerVar(AdvanceMarker(marker, MarkerPosition::InitialPostPositionByte, pb));  // For LF match
    716         return makeMarker(MarkerPosition::FinalPostPositionByte, pb.createAnd(nextPos, mLineBreak, "eol"));
     759        PabloAST * nextPos = markerVar(AdvanceMarker(marker, MarkerPosition::InitialPostPositionUnit, pb));  // For LF match
     760        return makeMarker(MarkerPosition::FinalPostPositionUnit, pb.createAnd(nextPos, mLineBreak, "eol"));
    717761    }
    718762}
     
    725769        mGraphemeBoundaryRule = markerVar(f->second);
    726770        marker = process(gb->getExpression(), marker, pb);
    727         marker = AdvanceMarker(marker, MarkerPosition::FinalPostPositionByte, pb);
     771        marker = AdvanceMarker(marker, MarkerPosition::FinalPostPositionUnit, pb);
    728772        mGraphemeBoundaryRule = graphemeBoundaryRule;
    729773    } else {
    730         marker = AdvanceMarker(marker, MarkerPosition::FinalPostPositionByte, pb);
     774        marker = AdvanceMarker(marker, MarkerPosition::FinalPostPositionUnit, pb);
    731775        PabloAST * rule = markerVar(f->second);
    732776        if (gb->getSense() == GraphemeBoundary::Sense::Negative) {
    733777            rule = pb.createNot(rule);
    734778        }
    735         marker = makeMarker(MarkerPosition::FinalPostPositionByte, pb.createAnd(markerVar(marker), rule, "gb"));
     779        marker = makeMarker(MarkerPosition::FinalPostPositionUnit, pb.createAnd(markerVar(marker), rule, "gb"));
    736780    }
    737781    return marker;
     
    740784inline MarkerType RE_Compiler::AdvanceMarker(MarkerType marker, const MarkerPosition newpos, PabloBuilder & pb) {
    741785    if (marker.pos != newpos) {
    742         if (marker.pos == MarkerPosition::FinalMatchByte) {
     786        if (marker.pos == MarkerPosition::FinalMatchUnit) {
    743787            marker.stream = pb.createAdvance(marker.stream, 1, "ipp");
    744             marker.pos = MarkerPosition::InitialPostPositionByte;
    745         }
    746         if (newpos == MarkerPosition::FinalPostPositionByte) {
     788            marker.pos = MarkerPosition::InitialPostPositionUnit;
     789        }
     790        if (newpos == MarkerPosition::FinalPostPositionUnit) {
    747791            PabloAST * nonFinal = mNonFinal;
    748792            if (mGraphemeBoundaryRule) {
     
    750794            }
    751795            marker.stream = pb.createScanThru(pb.createAnd(mInitial, marker.stream), nonFinal, "fpp");
    752             marker.pos = MarkerPosition::FinalPostPositionByte;
     796            marker.pos = MarkerPosition::FinalPostPositionUnit;
    753797        }
    754798    }
Note: See TracChangeset for help on using the changeset viewer.