Changeset 4411


Ignore:
Timestamp:
Jan 7, 2015, 8:34:23 PM (4 years ago)
Author:
cameron
Message:

Support for single Unicode position lookahead assertions, \b

Location:
icGREP/icgrep-devel/icgrep/re
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/re/re_compiler.cpp

    r4410 r4411  
    2929namespace re {
    3030
    31 inline MarkerType makePostPositionMarker(std::string marker_name, PabloAST * s, PabloBlock & pb) {
    32     return MarkerType{PostPosition, pb.createAssign(marker_name, s)};
    33 }
    34 
    35 inline MarkerType wrapPostPositionMarker(Assign * s) {
    36     return MarkerType{PostPosition, s};
    37 }
    38 
    39 inline MarkerType makeFinalPositionMarker(std::string marker_name, PabloAST * s, PabloBlock & pb) {
    40     return MarkerType{FinalByte, pb.createAssign(marker_name, s)};
    41 }
    42 
    43 inline Assign * markerStream(MarkerType m, PabloBlock &) {
    44     return m.stream;
    45 }
    46 
    47 inline Assign * markerVar(MarkerType m, PabloBlock &) {
    48     return m.stream;
    49 }
    50 
    51 inline Assign * postPositionVar(MarkerType m, PabloBlock & pb) {
    52     if (isFinalPositionMarker(m)) {
    53         return pb.createAssign("f", pb.createAdvance(markerVar(m, pb), 1));
    54     }
    55     return markerVar(m, pb);
    56 }
    57 
    58 //Set the 'internal.nonfinal' bit stream for the utf-8 multi-byte encoding.
    59 //#define USE_IF_FOR_NONFINAL
    60 
    61 
    6231
    6332RE_Compiler::RE_Compiler(PabloBlock & baseCG, const cc::CC_NameMap & nameMap)
     
    6938
    7039}
     40   
     41   
     42MarkerType RE_Compiler::AdvanceMarker(MarkerType m, MarkerPosition newpos, PabloBlock & pb) {
     43    if (m.pos == newpos) return m;
     44    if (m.pos > newpos) throw std::runtime_error("icgrep internal error: attempt to AdvanceMarker backwards");
     45    Assign * a = m.stream;
     46    if (m.pos == FinalMatchByte) {
     47        // Must advance at least to InitialPostPositionByte
     48        a = pb.createAssign("adv", pb.createAdvance(a, 1));
     49    }
     50    // Now at InitialPostPositionByte; is a further advance needed?
     51    if (newpos == FinalPostPositionByte) {
     52        // Must advance through nonfinal bytes
     53        a = pb.createAssign("scanToFinal", pb.createScanThru(pb.createAnd(mInitial, a), mNonFinal));
     54    }
     55    return {newpos, a};
     56}
     57
     58void RE_Compiler::AlignMarkers(MarkerType & m1, MarkerType & m2, PabloBlock & pb) {
     59    if (m1.pos < m2.pos) {
     60        m1 = AdvanceMarker(m1, m2.pos, pb);
     61    }
     62    else if (m2.pos < m1.pos) {
     63        m2 = AdvanceMarker(m2, m1.pos, pb);
     64    }
     65}
     66   
     67   
    7168
    7269#define USE_IF_FOR_NONFINAL 1
     
    140137    //These three lines are specifically for grep.
    141138    PabloAST * lb = UNICODE_LINE_BREAK ? mUnicodeLineBreak : mLineFeed;
    142     Assign * v = markerVar(match_result, mCG);
     139    Assign * v = markerVar(match_result);
    143140    mCG.createAssign("matches", mCG.createAnd(mCG.createMatchStar(v, mCG.createNot(lb)), lb), 0);
    144141    mCG.createAssign("lf", mCG.createAnd(lb, mCG.createNot(mCRLF)), 1);
     
    146143
    147144MarkerType RE_Compiler::compile(RE * re, PabloBlock & pb) {
    148     return process(re, makePostPositionMarker("start", pb.createOnes(), pb), pb);
     145    return process(re, makeMarker(InitialPostPositionByte, pb.createAssign("start", pb.createOnes())), pb);
    149146}
    150147
     
    158155        if (def != nullptr) {
    159156            MarkerType m = compile(def, mCG);
    160             assert(isFinalPositionMarker(m));
    161             Assign * v = markerStream(m, mCG);
    162             name->setCompiled(markerStream(m, mCG));
     157            assert(markerPos(m) == FinalMatchByte);
     158            Assign * v = markerVar(m);
     159            name->setCompiled(markerVar(m));
    163160            return v;
    164161        }
     
    173170
    174171PabloAST * RE_Compiler::nextUnicodePosition(MarkerType m, PabloBlock & pb) {
    175     if (isPostPositionMarker(m)) {
    176         return pb.createScanThru(pb.createAnd(mInitial, markerVar(m, pb)), mNonFinal);
     172    if (markerPos(m) == FinalPostPositionByte) {
     173        return markerVar(m);
     174    }
     175    else if (markerPos(m) == InitialPostPositionByte) {
     176        return pb.createScanThru(pb.createAnd(mInitial, markerVar(m)), mNonFinal);
    177177    }
    178178    else {
    179179        //return pb.createAdvanceThenScanThru(pb.createVar(markerVar(m), pb), mNonFinal);
    180         return pb.createScanThru(pb.createAnd(mInitial, pb.createAdvance(markerVar(m, pb), 1)), mNonFinal);
     180        return pb.createScanThru(pb.createAnd(mInitial, pb.createAdvance(markerVar(m), 1)), mNonFinal);
    181181    }
    182182}
     
    201201        PabloAST * nextPos = nextUnicodePosition(marker, pb);
    202202        PabloAST * dot = pb.createNot(UNICODE_LINE_BREAK ? pb.createOr(mUnicodeLineBreak, mCRLF) : mLineFeed);
    203         return makeFinalPositionMarker("dot", pb.createAnd(nextPos, dot), pb);
     203        return makeMarker(FinalMatchByte, pb.createAssign("dot", pb.createAnd(nextPos, dot)));
    204204    }
    205205    else if (Diff * diff = dyn_cast<Diff>(re)) {
     
    210210    }
    211211    else if (isa<Start>(re)) {
     212        MarkerType m = AdvanceMarker(marker, InitialPostPositionByte, pb);
    212213        if (UNICODE_LINE_BREAK) {
    213214            PabloAST * line_end = mCG.createOr(mUnicodeLineBreak, mCRLF);
    214215            PabloAST * sol = pb.createNot(pb.createOr(pb.createAdvance(pb.createNot(line_end), 1), mCRLF));
    215             return makePostPositionMarker("sol", pb.createAnd(postPositionVar(marker, pb), sol), pb);
     216            return makeMarker(InitialPostPositionByte, pb.createAssign("sol", pb.createAnd(markerVar(m), sol)));
    216217        }
    217218        else {
    218219            PabloAST * sol = pb.createNot(pb.createAdvance(pb.createNot(mLineFeed), 1));
    219             return makePostPositionMarker("sol", pb.createAnd(postPositionVar(marker, pb), sol), pb);
     220            return makeMarker(InitialPostPositionByte, pb.createAssign("sol", pb.createAnd(markerVar(m), sol)));
    220221        }
    221222    }
     
    223224        if (UNICODE_LINE_BREAK) {
    224225            PabloAST * nextPos = nextUnicodePosition(marker, pb);
    225             return makeFinalPositionMarker("end", pb.createAnd(nextPos, mUnicodeLineBreak), pb);
    226         }
    227         PabloAST * nextPos = postPositionVar(marker, pb);  // For LF match
    228         return makePostPositionMarker("eol", pb.createAnd(nextPos, mLineFeed), pb);
     226            return makeMarker(FinalPostPositionByte, pb.createAssign("end", pb.createAnd(nextPos, mUnicodeLineBreak)));
     227        }
     228        PabloAST * nextPos = markerVar(AdvanceMarker(marker, InitialPostPositionByte, pb));  // For LF match
     229        return makeMarker(InitialPostPositionByte, pb.createAssign("eol", pb.createAnd(nextPos, mLineFeed)));
    229230    }
    230231    return marker;
     
    232233
    233234MarkerType RE_Compiler::process(Name * name, MarkerType marker, PabloBlock & pb) {
    234     PabloAST * nextPos = (name->getType() == Name::Type::Byte) ? postPositionVar(marker, pb): nextUnicodePosition(marker, pb);
    235     return makeFinalPositionMarker("m", pb.createAnd(nextPos, character_class_strm(name, pb)), pb);
     235    MarkerType nextPos;
     236    if (markerPos(marker) == FinalPostPositionByte) nextPos = marker;
     237    else if (name->getType() == Name::Type::Byte) {
     238        nextPos = AdvanceMarker(marker, InitialPostPositionByte, pb);
     239    }
     240    else {
     241        nextPos = AdvanceMarker(marker, FinalPostPositionByte, pb);
     242    }
     243    return makeMarker(FinalMatchByte, pb.createAssign("m", pb.createAnd(markerVar(nextPos), character_class_strm(name, pb))));
    236244}
    237245
     
    244252
    245253MarkerType RE_Compiler::process(Alt * alt, MarkerType marker, PabloBlock & pb) {
    246     PabloAST * atPosnAccumulator = nullptr;
    247     PabloAST * postPosnAccumulator = nullptr;
     254    std::vector<PabloAST *>  accum = {pb.createZeroes(), pb.createZeroes(), pb.createZeroes()};
    248255    MarkerType const base = marker;
    249256    // The following may be useful to force a common Advance rather than separate
    250257    // Advances in each alternative.
    251     // MarkerType const base = makePostPositionMarker(postPositionVar(marker, pb), pb);
     258    // MarkerType const base = makeMarker(InitialPostPositionByte, postPositionVar(marker, pb), pb);
    252259    for (RE * re : *alt) {
    253260        MarkerType rslt = process(re, base, pb);
    254         PabloAST * rsltStream = markerVar(rslt, pb);
    255         if (isFinalPositionMarker(rslt)) {
    256             atPosnAccumulator = (atPosnAccumulator == nullptr) ? rsltStream : pb.createOr(atPosnAccumulator, rsltStream);
    257         }
    258         else {
    259             postPosnAccumulator = (postPosnAccumulator == nullptr) ? rsltStream : pb.createOr(postPosnAccumulator, rsltStream);
    260         }
    261     }
    262     if (postPosnAccumulator == nullptr) {
    263         return makeFinalPositionMarker("alt", atPosnAccumulator == nullptr ? pb.createZeroes() : atPosnAccumulator, pb);
     261        MarkerPosition p = markerPos(rslt);
     262        accum[p] = pb.createOr(accum[p], markerVar(rslt));
     263    }
     264    if (isa<Zeroes>(accum[InitialPostPositionByte]) && isa<Zeroes>(accum[FinalPostPositionByte])) {
     265        return makeMarker(FinalMatchByte, pb.createAssign("alt", accum[FinalMatchByte]));
     266    }
     267    PabloAST * combine = pb.createOr(accum[InitialPostPositionByte], pb.createAdvance(accum[FinalMatchByte], 1));
     268    if (isa<Zeroes>(accum[FinalPostPositionByte])) {
     269        return makeMarker(InitialPostPositionByte, pb.createAssign("alt", combine));
     270    }
     271    combine = pb.createOr(pb.createScanThru(pb.createAnd(mInitial, combine), mNonFinal), accum[FinalPostPositionByte]);
     272    return makeMarker(FinalPostPositionByte, pb.createAssign("alt", combine));
     273}
     274
     275MarkerType RE_Compiler::process(Assertion * a, MarkerType marker, PabloBlock & pb) {
     276    RE * asserted = a->getAsserted();
     277    if (a->getKind() == Assertion::Kind::Lookbehind) {
     278        MarkerType m = marker;
     279        MarkerType lookback = compile(asserted, pb);
     280        AlignMarkers(m, lookback, pb);
     281        Assign * lb = markerVar(lookback);
     282        if (a->getSense() == Assertion::Sense::Negative) {
     283            lb = pb.createAssign("not", pb.createNot(lb));
     284        }
     285        return makeMarker(markerPos(m), pb.createAssign("lookback", pb.createAnd(markerVar(marker), lb)));
     286    }
     287    else if (isUnicodeUnitLength(asserted)) {
     288        MarkerType lookahead = compile(asserted, pb);
     289        assert(markerPos(lookahead) == FinalMatchByte);
     290        Assign * la = markerVar(lookahead);
     291        if (a->getSense() == Assertion::Sense::Negative) {
     292            la = pb.createAssign("not", pb.createNot(la));
     293        }
     294        MarkerType fbyte = AdvanceMarker(marker, FinalPostPositionByte, pb);
     295        return makeMarker(FinalPostPositionByte, pb.createAssign("lookahead", pb.createAnd(markerVar(fbyte), la)));
    264296    }
    265297    else {
    266         if (atPosnAccumulator != nullptr) {
    267             postPosnAccumulator = pb.createOr(postPosnAccumulator, pb.createAdvance(atPosnAccumulator, 1));
    268         }
    269         return makePostPositionMarker("alt", postPosnAccumulator, pb);
    270     }
    271 }
    272 
    273 MarkerType RE_Compiler::process(Assertion * a, MarkerType marker, PabloBlock & pb) {
    274     if (a->getKind() == Assertion::Kind::Lookbehind) {
    275         MarkerType lookback = compile(a->getAsserted(), pb);
    276         if (isFinalPositionMarker(marker) && isFinalPositionMarker(lookback)) {
    277             PabloAST * lb = markerVar(lookback, pb);
    278             if (a->getSense() == Assertion::Sense::Negative) lb = pb.createNot(lb);
    279             return makeFinalPositionMarker("lookback", pb.createAnd(markerVar(marker, pb), lb), pb);
    280         }
    281         else {
    282             Assign * m1 = postPositionVar(marker, pb);
    283             PabloAST * lb = postPositionVar(lookback, pb);
    284             if (a->getSense() == Assertion::Sense::Negative) lb = pb.createNot(lb);
    285             return makePostPositionMarker("lookback", pb.createAnd(m1, lb), pb);
    286         }
    287     }
    288     throw std::runtime_error("Lookahead assertions not implemented.");
     298        throw std::runtime_error("Unsupported lookahead assertion.");
     299    }
    289300}
    290301
     
    295306        MarkerType t1 = process(lh, marker, pb);
    296307        MarkerType t2 = process(rh, marker, pb);
    297         assert(isFinalPositionMarker(t1) && isFinalPositionMarker(t2));
    298         return makeFinalPositionMarker("diff", pb.createAnd(markerVar(t1, pb), pb.createNot(markerVar(t2, pb))), pb);
     308        AlignMarkers(t1, t2, pb);
     309        return makeMarker(markerPos(t1), pb.createAssign("diff", pb.createAnd(markerVar(t1), pb.createNot(markerVar(t2)))));
    299310    }
    300311    throw std::runtime_error("Unsupported Diff operands: " + Printer_RE::PrintRE(diff));
     
    307318        MarkerType t1 = process(lh, marker, pb);
    308319        MarkerType t2 = process(rh, marker, pb);
    309         assert(isFinalPositionMarker(t1) && isFinalPositionMarker(t2));
    310         return makeFinalPositionMarker("intersect", pb.createAnd(markerVar(t1, pb), markerVar(t2, pb)), pb);
     320        AlignMarkers(t1, t2, pb);
     321        return makeMarker(markerPos(t1), pb.createAssign("intersect", pb.createAnd(markerVar(t1), markerVar(t2))));
    311322    }
    312323    throw std::runtime_error("Unsupported Intersect operands: " + Printer_RE::PrintRE(x));
     
    351362MarkerType RE_Compiler::processLowerBound(RE * repeated, int lb, MarkerType marker, PabloBlock & pb) {
    352363    if (isByteLength(repeated)) {
    353         PabloAST * cc = markerVar(compile(repeated, pb), pb);
     364        PabloAST * cc = markerVar(compile(repeated, pb));
    354365        Assign * cc_lb = consecutive(pb.createAssign("repeated", pb.createAdvance(cc,1)), 1, lb, pb);
    355         PabloAST * marker_fwd = pb.createAdvance(markerVar(marker, pb), isFinalPositionMarker(marker) ? lb+ 1 : lb);
    356         return makePostPositionMarker("lowerbound", pb.createAnd(marker_fwd, cc_lb), pb);
     366        PabloAST * marker_fwd = pb.createAdvance(markerVar(marker), markerPos(marker) == FinalMatchByte ? lb + 1 : lb);
     367        return makeMarker(InitialPostPositionByte, pb.createAssign("lowerbound", pb.createAnd(marker_fwd, cc_lb)));
    357368    }
    358369    // Fall through to general case.
     
    368379        // Mask out any positions that are more than ub positions from a current match.
    369380        // Use matchstar, then apply filter.
    370         Assign * nonMatch = pb.createAssign("nonmatch", pb.createNot(postPositionVar(marker, pb)));
     381        Assign * nonMatch = pb.createAssign("nonmatch", pb.createNot(markerVar(AdvanceMarker(marker, InitialPostPositionByte, pb))));
    371382        PabloAST * upperLimitMask = pb.createNot(consecutive(nonMatch, 1, ub + 1, pb));
    372         PabloAST * rep_class_var = markerVar(compile(repeated, pb), pb);
    373         return makePostPositionMarker("bounded", pb.createAnd(pb.createMatchStar(postPositionVar(marker, pb), rep_class_var), upperLimitMask), pb);
     383        PabloAST * cursor = markerVar(AdvanceMarker(marker, InitialPostPositionByte, pb));
     384        PabloAST * rep_class_var = markerVar(compile(repeated, pb));
     385        return makeMarker(InitialPostPositionByte, pb.createAssign("bounded", pb.createAnd(pb.createMatchStar(cursor, rep_class_var), upperLimitMask)));
    374386    }
    375387    // Fall through to general case.
    376388    while (ub-- != 0) {
    377389        MarkerType a = process(repeated, marker, pb);
    378         if (isFinalPositionMarker(a) && isFinalPositionMarker(marker)) {
    379             marker = makeFinalPositionMarker("m", pb.createOr(markerVar(marker, pb), markerVar(a, pb)), pb);
    380         }
    381         else {
    382             marker = makePostPositionMarker("m", pb.createOr(postPositionVar(marker, pb), postPositionVar(a, pb)), pb);
    383         }
     390        MarkerType m = marker;
     391        AlignMarkers(a, m, pb);
     392        marker = makeMarker(markerPos(a), pb.createAssign("m", pb.createOr(markerVar(a), markerVar(m))));
    384393    }
    385394    return marker;
     
    388397MarkerType RE_Compiler::processUnboundedRep(RE * repeated, MarkerType marker, PabloBlock & pb) {
    389398    // always use PostPosition markers for unbounded repetition.
    390     PabloAST * base = postPositionVar(marker, pb);
     399    PabloAST * base = markerVar(AdvanceMarker(marker, InitialPostPositionByte, pb));
    391400   
    392401    if (isByteLength(repeated)) {
    393         PabloAST * cc = markerVar(compile(repeated, pb), pb);
    394         return makePostPositionMarker("unbounded", pb.createMatchStar(base, cc), pb);
     402        PabloAST * cc = markerVar(compile(repeated, pb)); 
     403        return makeMarker(InitialPostPositionByte, pb.createAssign("unbounded", pb.createMatchStar(base, cc)));
    395404    }
    396405    else if (isUnicodeUnitLength(repeated)) {
    397         PabloAST * cc = markerVar(compile(repeated, pb), pb);
    398         return makePostPositionMarker("unbounded", pb.createAnd(pb.createMatchStar(base, pb.createOr(mNonFinal, cc)), mInitial), pb);
     406        PabloAST * cc = markerVar(compile(repeated, pb));
     407        return makeMarker(InitialPostPositionByte, pb.createAssign("unbounded", pb.createAnd(pb.createMatchStar(base, pb.createOr(mNonFinal, cc)), mInitial)));
    399408    }
    400409    else {
     
    404413        PabloBlock & wb = PabloBlock::Create(pb);
    405414
    406         Assign * loopComputation = postPositionVar(process(repeated, wrapPostPositionMarker(whileTest), wb), wb);
     415        PabloAST * loopComputation = markerVar(AdvanceMarker(process(repeated, makeMarker(InitialPostPositionByte, whileTest), wb), InitialPostPositionByte, wb));
    407416        Next * nextWhileTest = wb.createNext(whileTest, wb.createAnd(loopComputation, wb.createNot(whileAccum)));
    408417        wb.createNext(whileAccum, wb.createOr(loopComputation, whileAccum));
     
    410419        pb.createWhile(nextWhileTest, wb);
    411420
    412         return makePostPositionMarker("unbounded", whileAccum, pb);
     421        return makeMarker(InitialPostPositionByte, pb.createAssign("unbounded", whileAccum));
    413422    }   
    414423} // end of namespace re
  • icGREP/icgrep-devel/icgrep/re/re_compiler.h

    r4410 r4411  
    2626}
    2727
    28 /*  Marker streams represent the results of matching steps.
    29     Two types of marker streams are used internally.
    30     FinalByte markers are used for character classes and
    31     other strings by a one bit at their final position.
    32     PostPosition markers are used to mark matches with
    33     a 1 bit immediately after a match.   PostPosition markers
    34     are generally required whenever a regular expression element
    35     can match the empty string (e.g., * and ? repeated items).
     28/*   Marker streams represent the results of matching steps.
     29     Three types of marker streams are used internally.
     30     FinalMatchByte markers are used for character classes and
     31     other strings identified by a one bit at their final position.
     32     InitialPostPositionByte markers are used to mark matches with
     33     a 1 bit immediately after a match.   InitialPostPositionByte markers
     34     are generally required whenever a regular expression element
     35     can match the empty string (e.g., * and ? repeated items).
     36     FinalPostPositionByte markers are used for single code unit
     37     lookahead assertions. 
    3638*/
    37    
     39
    3840namespace re {
    3941
    40 enum MarkerPosition {FinalByte, PostPosition};
     42enum MarkerPosition {FinalMatchByte, InitialPostPositionByte, FinalPostPositionByte};
    4143
    4244struct MarkerType {
     
    4547};
    4648
    47 inline bool isPostPositionMarker(MarkerType m) {
    48     return m.pos == PostPosition;
    49 }
     49inline MarkerPosition markerPos(MarkerType m) {return m.pos;}
    5050
    51 inline bool isFinalPositionMarker(MarkerType m) {
    52     return m.pos == FinalByte;
    53 }
     51inline pablo::Assign * markerVar(MarkerType m) {return m.stream;}
     52   
     53inline MarkerType makeMarker(MarkerPosition newpos, pablo::Assign * strm) {return {newpos, strm};}
    5454
    55 MarkerType makePostPositionMarker(std::string marker_name, pablo::PabloAST * s, pablo::PabloBlock & pb);
    56 
    57 MarkerType makeFinalPositionMarker(std::string marker_name, pablo::PabloAST * s, pablo::PabloBlock & pb);
    58 
    59 pablo::Assign * markerStream(MarkerType m, pablo::PabloBlock &);
    60 
    61 pablo::Assign * markerVar(MarkerType m, pablo::PabloBlock & pb);
    62 
    63 pablo::Assign * postPositionVar(MarkerType m, pablo::PabloBlock & pb);
    6455
    6556class RE_Compiler {
     
    7667
    7768    MarkerType compile(RE * re, pablo::PabloBlock & cg);
    78 
     69    MarkerType AdvanceMarker(MarkerType m, MarkerPosition newpos, pablo::PabloBlock & pb);
     70   
     71    void AlignMarkers(MarkerType & m1, MarkerType & m2, pablo::PabloBlock & pb);
     72   
    7973    pablo::PabloAST * character_class_strm(Name * name, pablo::PabloBlock & pb);
    8074    pablo::PabloAST * nextUnicodePosition(MarkerType m, pablo::PabloBlock & pb);
Note: See TracChangeset for help on using the changeset viewer.