Changeset 5720


Ignore:
Timestamp:
Nov 1, 2017, 2:26:02 PM (22 months ago)
Author:
cameron
Message:

Log2 bounded repetition: complex subexpressions and if-hierarchies

Location:
icGREP/icgrep-devel/icgrep/re
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/re/re_analysis.cpp

    r5682 r5720  
    313313//Cases that not include bounded repetition, assertion, start and end type can suit for local language compile pipeline.
    314314bool isTypeForLocal(const RE * re) {
    315     if (const Alt * alt = dyn_cast<Alt>(re)) {
     315    if (const Name * n = dyn_cast<Name>(re)) {
     316        return isTypeForLocal(n->getDefinition());
     317    } else if (const Alt * alt = dyn_cast<Alt>(re)) {
    316318        for (const RE * re : *alt) {
    317319            if (!isTypeForLocal(re)) {
  • icGREP/icgrep-devel/icgrep/re/re_compiler.cpp

    r5717 r5720  
    338338
    339339/*
    340    Given a stream |repeated| marking positions associated with matches to an item
    341    of length |repeated_lgth|, compute a stream marking |repeat_count| consecutive
    342    occurrences of such items.
     340   Given a stream |repeated_j| marking positions associated with |j| conecutive matches to an item
     341   compute a stream marking |repeat_count| consecutive occurrences of such items.
    343342*/
    344 
    345 inline PabloAST * RE_Compiler::consecutive_matches(PabloAST * repeated, int length, int repeat_count, PabloAST * indexStream, PabloBuilder & pb) {
    346     int i = length;
    347     int total = repeat_count * length;
    348     PabloAST * consecutive_i = repeated;
    349     while ((i * 2) < total) {
    350         PabloAST * v = consecutive_i;
    351         PabloAST * v2 = nullptr;
    352         if (indexStream == nullptr) v2 =  pb.createAdvance(v, i);
    353         else v2 =  pb.createIndexedAdvance(v, indexStream, i);
    354         i *= 2;
    355         consecutive_i = pb.createAnd(v, v2, "at" + std::to_string(i) + "of" + std::to_string(total));
    356     }
    357     if (LLVM_LIKELY(i < total)) {
    358         PabloAST * v = consecutive_i;
    359         PabloAST * v2 =  nullptr;
    360         if (indexStream == nullptr) v2 = pb.createAdvance(v, total - i);
    361         else v2 = pb.createIndexedAdvance(v, indexStream, total - i);
    362         consecutive_i = pb.createAnd(v, v2, "at" + std::to_string(total));
    363     }
    364     return consecutive_i;
    365 }
     343   
     344PabloAST * RE_Compiler::consecutive_matches(PabloAST * repeated_j, int j, int repeat_count, PabloAST * indexStream, PabloBuilder & pb) {
     345    if (j == repeat_count) return repeated_j;
     346    int i = std::min(j, repeat_count - j);
     347    int k = j + i;
     348    if (j > IfInsertionGap) {
     349        Var * repeated = pb.createVar("repeated", pb.createZeroes());
     350        PabloBuilder nested = PabloBuilder::Create(pb);
     351        NameMap nestedMap(mCompiledName);
     352        mCompiledName = &nestedMap;
     353       
     354        PabloAST * adv_i = nullptr;
     355        if (indexStream == nullptr) adv_i = nested.createAdvance(repeated_j, i);
     356        else adv_i = nested.createIndexedAdvance(repeated_j, indexStream, i);
     357        PabloAST * repeated_k = nested.createAnd(repeated_j, adv_i, "at" + std::to_string(k) + "of" + std::to_string(repeat_count));
     358        nested.createAssign(repeated, consecutive_matches(repeated_k, k, repeat_count, indexStream, nested));
     359        pb.createIf(repeated_j, nested);
     360        mCompiledName = nestedMap.getParent();
     361        return repeated;
     362    }
     363    else {
     364        PabloAST * adv_i = nullptr;
     365        if (indexStream == nullptr) adv_i = pb.createAdvance(repeated_j, i);
     366        else adv_i = pb.createIndexedAdvance(repeated_j, indexStream, i);
     367        PabloAST * repeated_k = pb.createAnd(repeated_j, adv_i, "at" + std::to_string(k) + "of" + std::to_string(repeat_count));
     368        return consecutive_matches(repeated_k, k, repeat_count, indexStream, pb);
     369    }
     370}
     371
    366372
    367373inline PabloAST * RE_Compiler::reachable(PabloAST * repeated, int length, int repeat_count, PabloAST * indexStream, PabloBuilder & pb) {
     
    393399
    394400MarkerType RE_Compiler::processLowerBound(RE * repeated, int lb, MarkerType marker, int ifGroupSize, PabloBuilder & pb) {
    395     if (lb == 0) {
    396         return marker;
    397     } else if (!mGraphemeBoundaryRule && isByteLength(repeated) && !AlgorithmOptionIsSet(DisableLog2BoundedRepetition)) {
    398         PabloAST * cc = markerVar(compile(repeated, pb));
    399         PabloAST * cc_lb = consecutive_matches(cc, 1, lb, nullptr, pb);
    400         const auto pos = markerPos(marker) == MarkerPosition::FinalMatchUnit ? lb : lb - 1;
    401         PabloAST * marker_fwd = pb.createAdvance(markerVar(marker), pos);
    402         return makeMarker(MarkerPosition::FinalMatchUnit, pb.createAnd(marker_fwd, cc_lb, "lowerbound"));
    403     } else if (!mGraphemeBoundaryRule && isUnicodeUnitLength(repeated) && !AlgorithmOptionIsSet(DisableLog2BoundedRepetition) && AVX2_available()) {
    404         PabloAST * cc = markerVar(compile(repeated, pb));
    405         PabloAST * cc_lb = consecutive_matches(cc, 1, lb, mFinal, pb);
    406         const auto pos = markerPos(marker) == MarkerPosition::FinalMatchUnit ? lb : lb - 1;
    407         PabloAST * marker_fwd = pb.createIndexedAdvance(markerVar(marker), mFinal, pos);
    408         return makeMarker(MarkerPosition::FinalMatchUnit, pb.createAnd(marker_fwd, cc_lb, "lowerbound"));
     401    if (LLVM_UNLIKELY(lb == 0)) return marker;
     402    else if (LLVM_UNLIKELY(lb == 1)) return process(repeated, marker, pb);
     403    //
     404    // A bounded repetition with an upper bound of at least 2.
     405    if (!mGraphemeBoundaryRule && !AlgorithmOptionIsSet(DisableLog2BoundedRepetition)) {
     406        // Check for a regular expression that satisfies on of the special conditions that
     407        // allow implementation using the log2 technique.
     408        if (isByteLength(repeated)) {
     409            PabloAST * cc = markerVar(compile(repeated, pb));
     410            PabloAST * cc_lb = consecutive_matches(cc, 1, lb, nullptr, pb);
     411            const auto pos = markerPos(marker) == MarkerPosition::FinalMatchUnit ? lb : lb - 1;
     412            PabloAST * marker_fwd = pb.createAdvance(markerVar(marker), pos);
     413            return makeMarker(MarkerPosition::FinalMatchUnit, pb.createAnd(marker_fwd, cc_lb, "lowerbound"));
     414        }
     415        else if (isUnicodeUnitLength(repeated) && AVX2_available()) {
     416            PabloAST * cc = markerVar(compile(repeated, pb));
     417            PabloAST * cc_lb = consecutive_matches(cc, 1, lb, mFinal, pb);
     418            const auto pos = markerPos(marker) == MarkerPosition::FinalMatchUnit ? lb : lb - 1;
     419            PabloAST * marker_fwd = pb.createIndexedAdvance(markerVar(marker), mFinal, pos);
     420            return makeMarker(MarkerPosition::FinalMatchUnit, pb.createAnd(marker_fwd, cc_lb, "lowerbound"));
     421        }
     422        else if (isTypeForLocal(repeated) && AVX2_available()) {
     423            CC * firstSymSet = RE_Local::first(repeated);
     424            std::map<CC *, CC*> followMap;
     425            RE_Local::follow(repeated, followMap);
     426            bool firstSymSet_found_in_follow = false;
     427            for (auto & entry : followMap) {
     428                if (entry.second->intersects(*firstSymSet)) {
     429                    firstSymSet_found_in_follow = true;
     430                }
     431            }
     432            if (!firstSymSet_found_in_follow) {
     433                // When the first symbol is unique, we can use it as an index stream.
     434                PabloAST * firstCCstream = markerVar(compile(firstSymSet, pb));
     435                // Find all matches to the repeated regexp.
     436                PabloAST * submatch = markerVar(AdvanceMarker(compile(repeated, pb), MarkerPosition::FinalPostPositionUnit, pb));
     437                // Consecutive submatches require that the symbol following the end of one submatch is the first symbol for
     438                // the next submatch.   lb-1 such submatches are required.
     439                PabloAST * consecutive_submatch = consecutive_matches(submatch, 1, lb-1, firstCCstream, pb);
     440                // Find submatch positions which are lb-2 start symbols forward from the current marker position.
     441                PabloAST * base = markerVar(AdvanceMarker(marker, MarkerPosition::FinalPostPositionUnit, pb));
     442                PabloAST * marker_fwd = pb.createIndexedAdvance(base, firstCCstream, lb - 1);
     443                PabloAST * consecutive_lb_1 = pb.createAnd(marker_fwd, consecutive_submatch);
     444                // From any of these positions, any position reachable by one more occurrence of the
     445                // regexp is a match.
     446                return process(repeated, makeMarker(MarkerPosition::FinalPostPositionUnit, consecutive_lb_1), pb);
     447            }
     448        }
    409449    }
    410450    // Fall through to general case.  Process the first item and insert the rest into an if-structure.
     
    431471   
    432472MarkerType RE_Compiler::processBoundedRep(RE * repeated, int ub, MarkerType marker, int ifGroupSize,  PabloBuilder & pb) {
    433     if (LLVM_UNLIKELY(ub == 0)) {
    434         return marker;
    435     } else if (!mGraphemeBoundaryRule && isByteLength(repeated) && ub > 1 && !AlgorithmOptionIsSet(DisableLog2BoundedRepetition)) {
    436         // log2 upper bound for fixed length (=1) class
    437         // Create a mask of positions reachable within ub from current marker.
    438         // Use matchstar, then apply filter.
    439         PabloAST * cursor = markerVar(AdvanceMarker(marker, MarkerPosition::FinalPostPositionUnit, pb));
    440         // If we've advanced the cursor to the post position unit, cursor begins on the first masked bit of the bounded mask.
    441         // Extend the mask by ub - 1 byte positions to ensure the mask ends on the FinalMatchUnit of the repeated region.
    442         PabloAST * upperLimitMask = reachable(cursor, 1, ub - 1, nullptr, pb);
    443         PabloAST * masked = pb.createAnd(markerVar(compile(repeated, pb)), upperLimitMask);
    444         PabloAST * nonFinal = mNonFinal;
    445         // MatchStar deposits any cursors on the post position. However those cursors may may land on the initial "byte" of a
    446         // "multi-byte" character. Combine the masked range with any nonFinals.
    447         PabloAST * bounded = pb.createMatchStar(cursor, pb.createOr(masked, nonFinal), "bounded");
    448         return makeMarker(MarkerPosition::FinalPostPositionUnit, bounded);
    449     } else if (!mGraphemeBoundaryRule && isUnicodeUnitLength(repeated) && ub > 1 && !AlgorithmOptionIsSet(DisableLog2BoundedRepetition) && AVX2_available()) {
    450         // log2 upper bound for fixed length (=1) class
    451         // Create a mask of positions reachable within ub from current marker.
    452         // Use matchstar, then apply filter.
    453         PabloAST * cursor = markerVar(AdvanceMarker(marker, MarkerPosition::FinalPostPositionUnit, pb));
    454         // If we've advanced the cursor to the post position unit, cursor begins on the first masked bit of the bounded mask.
    455         // Extend the mask by ub - 1 byte positions to ensure the mask ends on the FinalMatchUnit of the repeated region.
    456         PabloAST * upperLimitMask = reachable(cursor, 1, ub - 1, mFinal, pb);
    457         PabloAST * masked = pb.createAnd(markerVar(compile(repeated, pb)), upperLimitMask, "masked");
    458         PabloAST * nonFinal = mNonFinal;
    459         // MatchStar deposits any cursors on the post position. However those cursors may may land on the initial "byte" of a
    460         // "multi-byte" character. Combine the masked range with any nonFinals.
    461         PabloAST * bounded = pb.createMatchStar(cursor, pb.createOr(masked, nonFinal), "bounded");
    462         return makeMarker(MarkerPosition::FinalPostPositionUnit, bounded);
     473    if (LLVM_UNLIKELY(ub == 0)) return marker;
     474    //
     475    // A bounded repetition with an upper bound of at least 2.
     476    if (!mGraphemeBoundaryRule && !AlgorithmOptionIsSet(DisableLog2BoundedRepetition) && (ub > 1)) {
     477        // Check for a regular expression that satisfies on of the special conditions that
     478        // allow implementation using the log2 technique.
     479        if (isByteLength(repeated)) {
     480            // log2 upper bound for fixed length (=1) class
     481            // Create a mask of positions reachable within ub from current marker.
     482            // Use matchstar, then apply filter.
     483            PabloAST * cursor = markerVar(AdvanceMarker(marker, MarkerPosition::FinalPostPositionUnit, pb));
     484            // If we've advanced the cursor to the post position unit, cursor begins on the first masked bit of the bounded mask.
     485            // Extend the mask by ub - 1 byte positions to ensure the mask ends on the FinalMatchUnit of the repeated region.
     486            PabloAST * upperLimitMask = reachable(cursor, 1, ub - 1, nullptr, pb);
     487            PabloAST * masked = pb.createAnd(markerVar(compile(repeated, pb)), upperLimitMask);
     488            // MatchStar deposits any cursors on the post position. However those cursors may may land on the initial "byte" of a
     489            // "multi-byte" character. Combine the masked range with any nonFinals.
     490            PabloAST * bounded = pb.createMatchStar(cursor, pb.createOr(masked, mNonFinal), "bounded");
     491            return makeMarker(MarkerPosition::FinalPostPositionUnit, bounded);
     492        }
     493        else if (isUnicodeUnitLength(repeated) && AVX2_available()) {
     494            // For a regexp which represent a single Unicode codepoint, we can use the mFinal stream
     495            // as an index stream for an indexed advance operation.
     496            PabloAST * cursor = markerVar(AdvanceMarker(marker, MarkerPosition::FinalPostPositionUnit, pb));
     497            PabloAST * upperLimitMask = reachable(cursor, 1, ub - 1, mFinal, pb);
     498            PabloAST * masked = pb.createAnd(markerVar(compile(repeated, pb)), upperLimitMask, "masked");
     499            PabloAST * bounded = pb.createMatchStar(cursor, pb.createOr(masked, mNonFinal), "bounded");
     500            return makeMarker(MarkerPosition::FinalPostPositionUnit, bounded);
     501        }
     502        else if (isTypeForLocal(repeated) && AVX2_available()) {
     503            CC * firstSymSet = RE_Local::first(repeated);
     504            std::map<CC *, CC*> followMap;
     505            RE_Local::follow(repeated, followMap);
     506            bool firstSymSet_found_in_follow = false;
     507            for (auto & entry : followMap) {
     508                if (entry.second->intersects(*firstSymSet)) {
     509                    firstSymSet_found_in_follow = true;
     510                }
     511            }
     512            if (!firstSymSet_found_in_follow) {
     513                // When the first symbol is unique, we can use it as an index stream.
     514                PabloAST * firstCCstream = markerVar(compile(firstSymSet, pb));
     515                PabloAST * cursor = markerVar(AdvanceMarker(marker, MarkerPosition::FinalPostPositionUnit, pb));
     516                PabloAST * upperLimitMask = reachable(cursor, 1, ub - 1, firstCCstream, pb);
     517                PabloAST * masked = pb.createAnd(markerVar(AdvanceMarker(compile(repeated, pb), MarkerPosition::FinalPostPositionUnit, pb)), upperLimitMask, "masked");
     518                PabloAST * bounded = pb.createMatchStar(cursor, pb.createOr(masked, mNonFinal), "bounded");
     519                return makeMarker(MarkerPosition::FinalPostPositionUnit, bounded);
     520            }
     521        }
    463522    }
    464523    // Fall through to general case.  Process the first item and insert the rest into an if-structure.
  • icGREP/icgrep-devel/icgrep/re/re_compiler.h

    r5710 r5720  
    102102    MarkerType compileDiff(Diff * diff, MarkerType marker, pablo::PabloBuilder & cg);
    103103    MarkerType compileIntersect(Intersect * x, MarkerType marker, pablo::PabloBuilder & cg);
    104     pablo::PabloAST * consecutive_matches(pablo::PabloAST * repeated, int length, int repeat_count, pablo::PabloAST * indexStream, pablo::PabloBuilder & pb);
     104    pablo::PabloAST * consecutive_matches(pablo::PabloAST * repeated_j, int j, int repeat_count, pablo::PabloAST * indexStream, pablo::PabloBuilder & pb);
    105105    pablo::PabloAST * reachable(pablo::PabloAST * repeated, int length, int repeat_count, pablo::PabloAST * indexStream, pablo::PabloBuilder & pb);
    106106    static bool isFixedLength(RE * regexp);
Note: See TracChangeset for help on using the changeset viewer.