Ignore:
Timestamp:
Dec 28, 2017, 1:15:13 PM (15 months ago)
Author:
nmedfort
Message:

Bug fix for RE local + some clean up of RE local and the RE Compiler

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/re/re_compiler.cpp

    r5810 r5812  
    4242using namespace llvm;
    4343
     44using FollowMap = std::map<re::CC *, re::CC*>;
     45
    4446namespace re {
    4547
    46 PabloAST * RE_Compiler::compile(RE * re) {
    47     return markerVar(AdvanceMarker(compile(re, mPB), MarkerPosition::FinalPostPositionUnit, mPB));
    48 }
    49    
    50 MarkerType RE_Compiler::compile(RE * re, PabloBuilder & pb) {
    51     return process(re, makeMarker(MarkerPosition::FinalPostPositionUnit, pb.createOnes()), pb);
     48using MarkerType = RE_Compiler::MarkerType;
     49
     50PabloAST * RE_Compiler::compile(RE * re, PabloAST * const initialCursors) {
     51    const auto markers = initialCursors ? compile(re, initialCursors, mPB) : compile(re, mPB);
     52    return markerVar(AdvanceMarker(markers, FinalPostPositionUnit, mPB));
     53}
     54
     55inline MarkerType RE_Compiler::compile(RE * re, PabloAST * const cursors, PabloBuilder & pb) {
     56    return process(re, makeMarker(FinalMatchUnit, cursors), pb);
     57}
     58
     59inline MarkerType RE_Compiler::compile(RE * re, PabloBuilder & pb) {
     60    return process(re, makeMarker(FinalPostPositionUnit, pb.createOnes()), pb);
    5261}
    5362   
     
    8291
    8392inline MarkerType RE_Compiler::compileAny(const MarkerType m, PabloBuilder & pb) {
    84     PabloAST * const nextFinalByte = markerVar(AdvanceMarker(m, MarkerPosition::FinalPostPositionUnit, pb));
    85     return makeMarker(MarkerPosition::FinalMatchUnit, nextFinalByte);
     93    PabloAST * const nextFinalByte = markerVar(AdvanceMarker(m, FinalPostPositionUnit, pb));
     94    return makeMarker(FinalMatchUnit, nextFinalByte);
    8695}
    8796
     
    8998    // If Unicode CCs weren't pulled out earlier, we generate the equivalent
    9099    // byte sequence as an RE.
    91     if (cc->getAlphabet() == &cc::Unicode) return process(toUTF8(cc), marker, pb);
     100    if (cc->getAlphabet() == &cc::Unicode) {
     101        return process(toUTF8(cc), marker, pb);
     102    }
    92103    PabloAST * nextPos = markerVar(marker);
    93104    if (isByteLength(cc)) {
    94         if (marker.pos == MarkerPosition::FinalMatchUnit) nextPos = pb.createAdvance(nextPos, 1);
    95     }
    96     else {
    97         nextPos = markerVar(AdvanceMarker(marker, MarkerPosition::FinalPostPositionUnit, pb));
    98     }
    99     return makeMarker(MarkerPosition::FinalMatchUnit, pb.createAnd(nextPos, mCCCompiler.compileCC(cc, pb)));
     105        if (marker.pos == FinalMatchUnit) {
     106            nextPos = pb.createAdvance(nextPos, 1);
     107        }
     108    } else {
     109        nextPos = markerVar(AdvanceMarker(marker, FinalPostPositionUnit, pb));
     110    }
     111    return makeMarker(FinalMatchUnit, pb.createAnd(nextPos, mCCCompiler.compileCC(cc, pb)));
    100112}
    101113
     
    110122    } else if (isUnicodeUnitLength(name)) {
    111123        MarkerType nameMarker = compileName(name, pb);
    112         MarkerType nextPos = AdvanceMarker(marker, MarkerPosition::FinalPostPositionUnit, pb);
     124        MarkerType nextPos = AdvanceMarker(marker, FinalPostPositionUnit, pb);
    113125        nameMarker.stream = pb.createAnd(markerVar(nextPos), markerVar(nameMarker), name->getName());
    114126        return nameMarker;
     
    172184}
    173185
    174 MarkerType RE_Compiler::compileAlt(Alt * alt, MarkerType marker, PabloBuilder & pb) {
    175     std::vector<PabloAST *>  accum(3, pb.createZeroes());
    176     MarkerType const base = marker;
     186MarkerType RE_Compiler::compileAlt(Alt * alt, const MarkerType base, PabloBuilder & pb) {
     187    std::array<PabloAST *, 3> accum;
    177188    // The following may be useful to force a common Advance rather than separate
    178189    // Advances in each alternative.
     190    accum.fill(pb.createZeroes());
    179191    for (RE * re : *alt) {
    180192        MarkerType m = process(re, base, pb);
    181         MarkerPosition p = markerPos(m);
     193        const MarkerPosition p = markerPos(m);
    182194        accum[p] = pb.createOr(accum[p], markerVar(m), "alt");
    183195    }
    184     if (isa<Zeroes>(accum[MarkerPosition::InitialPostPositionUnit]) && isa<Zeroes>(accum[MarkerPosition::FinalPostPositionUnit])) {
    185         return makeMarker(MarkerPosition::FinalMatchUnit, accum[MarkerPosition::FinalMatchUnit]);
    186     }
    187     PabloAST * combine = pb.createOr(accum[InitialPostPositionUnit], pb.createAdvance(accum[MarkerPosition::FinalMatchUnit], 1), "alt");
     196    if (isa<Zeroes>(accum[InitialPostPositionUnit]) && isa<Zeroes>(accum[FinalPostPositionUnit])) {
     197        return makeMarker(FinalMatchUnit, accum[FinalMatchUnit]);
     198    }
     199
     200    PabloAST * combine = pb.createOr(accum[InitialPostPositionUnit], pb.createAdvance(accum[FinalMatchUnit], 1), "alt");
    188201    if (isa<Zeroes>(accum[FinalPostPositionUnit])) {
    189202        return makeMarker(InitialPostPositionUnit, combine);
    190203    }
    191     combine = pb.createOr(pb.createScanThru(pb.createAnd(mInitial, combine), mNonFinal), accum[MarkerPosition::FinalPostPositionUnit], "alt");
    192     return makeMarker(MarkerPosition::FinalPostPositionUnit, combine);
     204    combine = pb.createOr(pb.createScanThru(pb.createAnd(mInitial, combine), mNonFinal), accum[FinalPostPositionUnit], "alt");
     205    return makeMarker(FinalPostPositionUnit, combine);
    193206}
    194207
     
    205218    } else if (a->getKind() == Assertion::Kind::Boundary) {
    206219        MarkerType cond = compile(asserted, pb);
    207         if (LLVM_LIKELY(markerPos(cond) == MarkerPosition::FinalMatchUnit)) {
    208             MarkerType postCond = AdvanceMarker(cond, MarkerPosition::FinalPostPositionUnit, pb);
     220        if (LLVM_LIKELY(markerPos(cond) == FinalMatchUnit)) {
     221            MarkerType postCond = AdvanceMarker(cond, FinalPostPositionUnit, pb);
    209222            PabloAST * boundaryCond = pb.createXor(markerVar(cond), markerVar(postCond));
    210223            if (a->getSense() == Assertion::Sense::Negative) {
    211224                boundaryCond = pb.createNot(boundaryCond);
    212225            }
    213             MarkerType fbyte = AdvanceMarker(marker, MarkerPosition::FinalPostPositionUnit, pb);
    214             return makeMarker(MarkerPosition::FinalPostPositionUnit, pb.createAnd(markerVar(fbyte), boundaryCond, "boundary"));
     226            MarkerType fbyte = AdvanceMarker(marker, FinalPostPositionUnit, pb);
     227            return makeMarker(FinalPostPositionUnit, pb.createAnd(markerVar(fbyte), boundaryCond, "boundary"));
    215228        }
    216229        else UnsupportedRE("Unsupported boundary assertion");
    217230    } else if (isUnicodeUnitLength(asserted)) {
    218231        MarkerType lookahead = compile(asserted, pb);
    219         if (LLVM_LIKELY(markerPos(lookahead) == MarkerPosition::FinalMatchUnit)) {
     232        if (LLVM_LIKELY(markerPos(lookahead) == FinalMatchUnit)) {
    220233            PabloAST * la = markerVar(lookahead);
    221234            if (a->getSense() == Assertion::Sense::Negative) {
    222235                la = pb.createNot(la);
    223236            }
    224             MarkerType fbyte = AdvanceMarker(marker, MarkerPosition::FinalPostPositionUnit, pb);
    225             return makeMarker(MarkerPosition::FinalPostPositionUnit, pb.createAnd(markerVar(fbyte), la, "lookahead"));
     237            MarkerType fbyte = AdvanceMarker(marker, FinalPostPositionUnit, pb);
     238            return makeMarker(FinalPostPositionUnit, pb.createAnd(markerVar(fbyte), la, "lookahead"));
    226239        }
    227240    }
     
    335348
    336349MarkerType RE_Compiler::processLowerBound(RE * repeated, int lb, MarkerType marker, int ifGroupSize, PabloBuilder & pb) {
    337     if (LLVM_UNLIKELY(lb == 0)) return marker;
    338     else if (LLVM_UNLIKELY(lb == 1)) return process(repeated, marker, pb);
     350    if (LLVM_UNLIKELY(lb == 0)) {
     351        return marker;
     352    } else if (LLVM_UNLIKELY(lb == 1)) {
     353        return process(repeated, marker, pb);
     354    }
    339355    //
    340356    // A bounded repetition with an upper bound of at least 2.
     
    345361            PabloAST * cc = markerVar(compile(repeated, pb));
    346362            PabloAST * cc_lb = consecutive_matches(cc, 1, lb, nullptr, pb);
    347             const auto pos = markerPos(marker) == MarkerPosition::FinalMatchUnit ? lb : lb - 1;
     363            const auto pos = markerPos(marker) == FinalMatchUnit ? lb : lb - 1;
    348364            PabloAST * marker_fwd = pb.createAdvance(markerVar(marker), pos);
    349             return makeMarker(MarkerPosition::FinalMatchUnit, pb.createAnd(marker_fwd, cc_lb, "lowerbound"));
     365            return makeMarker(FinalMatchUnit, pb.createAnd(marker_fwd, cc_lb, "lowerbound"));
    350366        }
    351367        else if (isUnicodeUnitLength(repeated)) {
    352368            PabloAST * cc = markerVar(compile(repeated, pb));
    353369            PabloAST * cc_lb = consecutive_matches(cc, 1, lb, mFinal, pb);
    354             const auto pos = markerPos(marker) == MarkerPosition::FinalMatchUnit ? lb : lb - 1;
     370            const auto pos = markerPos(marker) == FinalMatchUnit ? lb : lb - 1;
    355371            PabloAST * marker_fwd = pb.createIndexedAdvance(markerVar(marker), mFinal, pos);
    356             return makeMarker(MarkerPosition::FinalMatchUnit, pb.createAnd(marker_fwd, cc_lb, "lowerbound"));
     372            return makeMarker(FinalMatchUnit, pb.createAnd(marker_fwd, cc_lb, "lowerbound"));
    357373        }
    358374        else if (isTypeForLocal(repeated)) {
    359             CC * firstSymSet = RE_Local::first(repeated);
    360             std::map<CC *, CC*> followMap;
    361             RE_Local::follow(repeated, followMap);
    362             bool firstSymSet_found_in_follow = false;
    363             for (auto & entry : followMap) {
    364                 if (entry.second->intersects(*firstSymSet)) {
    365                     firstSymSet_found_in_follow = true;
    366                 }
    367             }
    368             if (!firstSymSet_found_in_follow) {
    369                 // When the first symbol is unique, we can use it as an index stream.
    370                 PabloAST * firstCCstream = markerVar(compile(firstSymSet, pb));
     375            CC * const first = RE_Local::getFirstUniqueSymbol(repeated);
     376            if (first) { // if the first symbol is unique, we can use it as an index stream.
     377                PabloAST * firstCCstream = markerVar(compile(first, pb));
    371378                // Find all matches to the repeated regexp.
    372                 PabloAST * submatch = markerVar(AdvanceMarker(compile(repeated, pb), MarkerPosition::FinalPostPositionUnit, pb));
     379                PabloAST * submatch = markerVar(AdvanceMarker(compile(repeated, pb), FinalPostPositionUnit, pb));
    373380                // Consecutive submatches require that the symbol following the end of one submatch is the first symbol for
    374381                // the next submatch.   lb-1 such submatches are required.
    375382                PabloAST * consecutive_submatch = consecutive_matches(submatch, 1, lb-1, firstCCstream, pb);
    376383                // Find submatch positions which are lb-2 start symbols forward from the current marker position.
    377                 PabloAST * base = markerVar(AdvanceMarker(marker, MarkerPosition::FinalPostPositionUnit, pb));
     384                PabloAST * base = markerVar(AdvanceMarker(marker, FinalPostPositionUnit, pb));
    378385                PabloAST * marker_fwd = pb.createIndexedAdvance(base, firstCCstream, lb - 1);
    379386                PabloAST * consecutive_lb_1 = pb.createAnd(marker_fwd, consecutive_submatch);
    380387                // From any of these positions, any position reachable by one more occurrence of the
    381388                // regexp is a match.
    382                 return process(repeated, makeMarker(MarkerPosition::FinalPostPositionUnit, consecutive_lb_1), pb);
     389                return process(repeated, makeMarker(FinalPostPositionUnit, consecutive_lb_1), pb);
    383390            }
    384391        }
     
    404411   
    405412MarkerType RE_Compiler::processBoundedRep(RE * repeated, int ub, MarkerType marker, int ifGroupSize,  PabloBuilder & pb) {
    406     if (LLVM_UNLIKELY(ub == 0)) return marker;
     413    if (LLVM_UNLIKELY(ub == 0)) {
     414        return marker;
     415    }
    407416    //
    408417    // A bounded repetition with an upper bound of at least 2.
     
    414423            // Create a mask of positions reachable within ub from current marker.
    415424            // Use matchstar, then apply filter.
    416             PabloAST * cursor = markerVar(AdvanceMarker(marker, MarkerPosition::InitialPostPositionUnit, pb));
     425            PabloAST * cursor = markerVar(AdvanceMarker(marker, InitialPostPositionUnit, pb));
    417426            // If we've advanced the cursor to the post position unit, cursor begins on the first masked bit of the bounded mask.
    418427            // Extend the mask by ub - 1 byte positions to ensure the mask ends on the FinalMatchUnit of the repeated region.
     
    422431            // "multi-byte" character. Combine the masked range with any nonFinals.
    423432            PabloAST * bounded = pb.createMatchStar(cursor, pb.createOr(masked, mNonFinal), "bounded");
    424             return makeMarker(MarkerPosition::InitialPostPositionUnit, bounded);
     433            return makeMarker(InitialPostPositionUnit, bounded);
    425434        }
    426435        else if (isUnicodeUnitLength(repeated)) {
    427436            // For a regexp which represent a single Unicode codepoint, we can use the mFinal stream
    428437            // as an index stream for an indexed advance operation.
    429             PabloAST * cursor = markerVar(AdvanceMarker(marker, MarkerPosition::FinalPostPositionUnit, pb));
     438            PabloAST * cursor = markerVar(AdvanceMarker(marker, FinalPostPositionUnit, pb));
    430439            PabloAST * upperLimitMask = reachable(cursor, 1, ub - 1, mFinal, pb);
    431440            PabloAST * masked = pb.createAnd(markerVar(compile(repeated, pb)), upperLimitMask, "masked");
    432441            PabloAST * bounded = pb.createMatchStar(cursor, pb.createOr(masked, mNonFinal), "bounded");
    433             return makeMarker(MarkerPosition::FinalPostPositionUnit, bounded);
     442            return makeMarker(FinalPostPositionUnit, bounded);
    434443        }
    435444        else if (isTypeForLocal(repeated)) {
    436             CC * firstSymSet = RE_Local::first(repeated);
    437             std::map<CC *, CC*> followMap;
    438             RE_Local::follow(repeated, followMap);
    439             bool firstSymSet_found_in_follow = false;
    440             for (auto & entry : followMap) {
    441                 if (entry.second->intersects(*firstSymSet)) {
    442                     firstSymSet_found_in_follow = true;
    443                 }
    444             }
    445             if (!firstSymSet_found_in_follow) {
    446                 // When the first symbol is unique, we can use it as an index stream.
    447                 PabloAST * firstCCstream = markerVar(compile(firstSymSet, pb));
    448                 PabloAST * cursor = markerVar(AdvanceMarker(marker, MarkerPosition::FinalPostPositionUnit, pb));
     445            CC * const first = RE_Local::getFirstUniqueSymbol(repeated);
     446            if (first) { // if the first symbol is unique, we can use it as an index stream.
     447                PabloAST * firstCCstream = markerVar(compile(first, pb));
     448                PabloAST * cursor = markerVar(AdvanceMarker(marker, FinalPostPositionUnit, pb));
    449449                PabloAST * upperLimitMask = reachable(cursor, 1, ub - 1, firstCCstream, pb);
    450                 PabloAST * masked = pb.createAnd(markerVar(AdvanceMarker(compile(repeated, pb), MarkerPosition::FinalPostPositionUnit, pb)), upperLimitMask, "masked");
     450                PabloAST * masked = pb.createAnd(markerVar(AdvanceMarker(compile(repeated, pb), FinalPostPositionUnit, pb)), upperLimitMask, "masked");
    451451                PabloAST * bounded = pb.createMatchStar(cursor, pb.createOr(masked, mNonFinal), "bounded");
    452                 return makeMarker(MarkerPosition::FinalPostPositionUnit, bounded);
     452                return makeMarker(FinalPostPositionUnit, bounded);
    453453            }
    454454        }
     
    478478MarkerType RE_Compiler::processUnboundedRep(RE * repeated, MarkerType marker, PabloBuilder & pb) {
    479479    // always use PostPosition markers for unbounded repetition.
    480     PabloAST * base = markerVar(AdvanceMarker(marker, MarkerPosition::FinalPostPositionUnit, pb));
     480    PabloAST * base = markerVar(AdvanceMarker(marker, FinalPostPositionUnit, pb));
    481481    if (isByteLength(repeated)  && !AlgorithmOptionIsSet(DisableMatchStar)) {
    482482        PabloAST * mask = markerVar(compile(repeated, pb));
     
    484484        // The post position character may land on the initial byte of a multi-byte character. Combine them with the masked range.
    485485        PabloAST * unbounded = pb.createMatchStar(base, pb.createOr(mask, nonFinal), "unbounded");
    486         return makeMarker(MarkerPosition::FinalPostPositionUnit, unbounded);
     486        return makeMarker(FinalPostPositionUnit, unbounded);
    487487    } else if (isUnicodeUnitLength(repeated) && !AlgorithmOptionIsSet(DisableMatchStar) && !AlgorithmOptionIsSet(DisableUnicodeMatchStar)) {
    488488        PabloAST * cc = markerVar(compile(repeated, pb));
     
    492492        mstar = pb.createMatchStar(base, cc);
    493493        PabloAST * final = mFinal;
    494         return makeMarker(MarkerPosition::FinalPostPositionUnit, pb.createAnd(mstar, final, "unbounded"));
     494        return makeMarker(FinalPostPositionUnit, pb.createAnd(mstar, final, "unbounded"));
    495495    } else if (mStarDepth > 0){
    496496        PabloBuilder * const outer = pb.getParent();
     
    500500        PabloAST * m1 = pb.createOr(base, starPending);
    501501        PabloAST * m2 = pb.createOr(base, starAccum);
    502         MarkerType result = process(repeated, makeMarker(MarkerPosition::FinalPostPositionUnit, m1), pb);
    503         result = AdvanceMarker(result, MarkerPosition::FinalPostPositionUnit, pb);
     502        MarkerType result = process(repeated, makeMarker(FinalPostPositionUnit, m1), pb);
     503        result = AdvanceMarker(result, FinalPostPositionUnit, pb);
    504504        PabloAST * loopComputation = markerVar(result);
    505505        pb.createAssign(starPending, pb.createAnd(loopComputation, pb.createNot(m2)));
     
    517517        mCompiledName = &nestedMap;
    518518        mStarDepth++;
    519         MarkerType result = process(repeated, makeMarker(MarkerPosition::FinalPostPositionUnit, whilePending), wb);
    520         result = AdvanceMarker(result, MarkerPosition::FinalPostPositionUnit, wb);
     519        MarkerType result = process(repeated, makeMarker(FinalPostPositionUnit, whilePending), wb);
     520        result = AdvanceMarker(result, FinalPostPositionUnit, wb);
    521521        PabloAST * loopComputation = markerVar(result);
    522522        wb.createAssign(whilePending, wb.createAnd(loopComputation, wb.createNot(whileAccum)));
     
    532532inline MarkerType RE_Compiler::compileStart(MarkerType marker, pablo::PabloBuilder & pb) {
    533533    PabloAST * sol = pb.createNot(pb.createAdvance(pb.createNot(mLineBreak), 1));
    534     MarkerType m = AdvanceMarker(marker, MarkerPosition::InitialPostPositionUnit, pb);
    535     return makeMarker(MarkerPosition::InitialPostPositionUnit, pb.createAnd(markerVar(m), sol, "sol"));
     534    MarkerType m = AdvanceMarker(marker, InitialPostPositionUnit, pb);
     535    return makeMarker(InitialPostPositionUnit, pb.createAnd(markerVar(m), sol, "sol"));
    536536}
    537537
    538538inline MarkerType RE_Compiler::compileEnd(MarkerType marker, pablo::PabloBuilder & pb) {
    539     PabloAST * const nextPos = markerVar(AdvanceMarker(marker, MarkerPosition::FinalPostPositionUnit, pb));
    540     PabloAST * atEOL = pb.createOr(pb.createAnd(mLineBreak, nextPos), pb.createAdvance(pb.createAnd(nextPos, mCRLF), 1), "eol");
    541     return makeMarker(MarkerPosition::FinalPostPositionUnit, atEOL);
     539    PabloAST * const nextPos = markerVar(AdvanceMarker(marker, FinalPostPositionUnit, pb));
     540    PabloAST * const atEOL = pb.createOr(pb.createAnd(mLineBreak, nextPos), pb.createAdvance(pb.createAnd(nextPos, mCRLF), 1), "eol");
     541    return makeMarker(FinalPostPositionUnit, atEOL);
    542542}
    543543
    544544inline MarkerType RE_Compiler::AdvanceMarker(MarkerType marker, const MarkerPosition newpos, PabloBuilder & pb) {
    545545    if (marker.pos != newpos) {
    546         if (marker.pos == MarkerPosition::FinalMatchUnit) {
     546        if (marker.pos == FinalMatchUnit) {
    547547            marker.stream = pb.createAdvance(marker.stream, 1, "ipp");
    548             marker.pos = MarkerPosition::InitialPostPositionUnit;
    549         }
    550         if (newpos == MarkerPosition::FinalPostPositionUnit) {
    551             PabloAST * nonFinal = mNonFinal;
    552             marker.stream = pb.createScanThru(pb.createAnd(mInitial, marker.stream), nonFinal, "fpp");
    553             marker.pos = MarkerPosition::FinalPostPositionUnit;
     548            marker.pos = InitialPostPositionUnit;
     549        }
     550        if (newpos == FinalPostPositionUnit) {
     551            marker.stream = pb.createScanThru(pb.createAnd(mInitial, marker.stream), mNonFinal, "fpp");
     552            marker.pos = FinalPostPositionUnit;
    554553        }
    555554    }
     
    589588    mNonFinal = mPB.createExtract(required, mPB.getInteger(1));
    590589    mFinal = mPB.createExtract(required, mPB.getInteger(2));
    591 
    592590}
    593591
Note: See TracChangeset for help on using the changeset viewer.