Changeset 4831


Ignore:
Timestamp: Oct 13, 2015, 3:57:17 PM (2 years ago)
Author: nmedfort
Message: First attempt at adding grapheme cluster mode to icgrep.

Location: icGREP/icgrep-devel/icgrep
Files: 6 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/pablo/optimizers/pablo_simplifier.cpp

    r4829 r4831  
    285285                Advance * op = cast<Advance>(stmt->getOperand(0));
    286286                if (LLVM_UNLIKELY(op->getNumUses() == 1)) {
    287                     block.setInsertPoint(scanThru);
     287                    block.setInsertPoint(scanThru->getPrevNode());
    288288                    PabloAST * expr = block.createAdvance(op->getOperand(0), op->getAdvanceAmount() - 1);
    289289                    scanThru->setOperand(0, expr);
  • icGREP/icgrep-devel/icgrep/re/re_compiler.cpp

    r4829 r4831  
    3333#include "llvm/Support/CommandLine.h"
    3434#include <sstream>
     35#include <unordered_set>
    3536
    3637static cl::OptionCategory fREcompilationOptions("Regex Compilation Options", "These options control the compilation of regular expressions to Pablo.");
     
    258259
    259260    UCD::UCDCompiler::NameMap nameMap;
     261    std::unordered_set<Name *> visited;
    260262
    261263    std::function<void(RE*)> gather = [&](RE * re) {
    262264        if (Name * name = dyn_cast<Name>(re)) {
    263             if (name->getCompiled() == nullptr) {
     265            if (visited.insert(name).second) {
    264266                if (isa<CC>(name->getDefinition())) {
    265267                    nameMap.emplace(name, nullptr);
     
    300302        for (auto t : nameMap) {
    301303            if (t.second) {
    302                 t.first->setCompiled(mPB.createAnd(t.second, mAny));
     304                mCompiledName.insert(std::make_pair(t.first, makeMarker(MarkerPosition::FinalMatchByte, mPB.createAnd(t.second, mAny))));
    303305            }
    304306        }
     
    307309    // Now precompile any grapheme segmentation rules
    308310    if (gcbRule) {
    309         compileName(gcbRule, mPB);
     311        mCompiledName.insert(std::make_pair(gcbRule, compileName(gcbRule, mPB)));
    310312    }
    311313    return re;
     
    365367        return compileName(name, marker, pb);
    366368    } else if (Seq* seq = dyn_cast<Seq>(re)) {
    367         return process(seq, marker, pb);
     369        return compileSeq(seq, marker, pb);
    368370    } else if (Alt * alt = dyn_cast<Alt>(re)) {
    369         return process(alt, marker, pb);
     371        return compileAlt(alt, marker, pb);
    370372    } else if (Rep * rep = dyn_cast<Rep>(re)) {
    371         return process(rep, marker, pb);
     373        return compileRep(rep, marker, pb);
    372374    } else if (Assertion * a = dyn_cast<Assertion>(re)) {
    373         return process(a, marker, pb);
     375        return compileAssertion(a, marker, pb);
    374376    } else if (isa<Any>(re)) {
    375377        return compileAny(marker, pb);
    376378    } else if (Diff * diff = dyn_cast<Diff>(re)) {
    377         return process(diff, marker, pb);
     379        return compileDiff(diff, marker, pb);
    378380    } else if (Intersect * ix = dyn_cast<Intersect>(re)) {
    379         return process(ix, marker, pb);
     381        return compileIntersect(ix, marker, pb);
    380382    } else if (isa<Start>(re)) {
    381         MarkerType m = AdvanceMarker(marker, MarkerPosition::InitialPostPositionByte, pb);
    382         if (UNICODE_LINE_BREAK) {
    383             PabloAST * line_end = mPB.createOr(mUnicodeLineBreak, mCRLF);
    384             PabloAST * sol = pb.createNot(pb.createOr(pb.createAdvance(pb.createNot(line_end), 1), mCRLF));
    385             return makeMarker(MarkerPosition::InitialPostPositionByte, pb.createAnd(markerVar(m), sol, "sol"));
    386         } else {
    387             PabloAST * sol = pb.createNot(pb.createAdvance(pb.createNot(mLineFeed), 1));
    388             return makeMarker(MarkerPosition::FinalPostPositionByte, pb.createAnd(markerVar(m), sol, "sol"));
    389         }
     383        return compileStart(marker, pb);
    390384    } else if (isa<End>(re)) {
    391         if (UNICODE_LINE_BREAK) {
    392             PabloAST * nextPos = markerVar(AdvanceMarker(marker, MarkerPosition::FinalPostPositionByte, pb));
    393             return makeMarker(MarkerPosition::FinalPostPositionByte, pb.createAnd(nextPos, mUnicodeLineBreak, "end"));
    394         }
    395         PabloAST * nextPos = markerVar(AdvanceMarker(marker, MarkerPosition::InitialPostPositionByte, pb));  // For LF match
    396         return makeMarker(MarkerPosition::FinalPostPositionByte, pb.createAnd(nextPos, mLineFeed, "eol"));
     385        return compileEnd(marker, pb);
    397386    } else if (GraphemeBoundary * gb = dyn_cast<GraphemeBoundary>(re)) {
    398         const auto inGraphemeBoundaryRule = mGraphemeBoundaryRule;
    399         mGraphemeBoundaryRule = gb->getGraphemeBoundaryRule();
    400         assert (mGraphemeBoundaryRule);
    401         marker = process(gb->getExpression(), marker, pb);
    402         mGraphemeBoundaryRule = inGraphemeBoundaryRule;
    403     }
    404     return marker;
     387        return compileGraphemeBoundary(gb, marker, pb);
     388    }
     389    throw std::runtime_error("RE Compiler failed to process " + Printer_RE::PrintRE(re));
    405390}
    406391
     
    411396        lb = pb.createOr(mUnicodeLineBreak, mCRLF);
    412397    }
    413     PabloAST * dot = pb.createAnd(nextFinalByte, pb.createNot(lb), "dot");
    414     MarkerPosition pos = MarkerPosition::FinalMatchByte;
    415     if (mGraphemeBoundaryRule) {
    416         dot = pb.createScanThru(dot, pb.createOr(dot, mGraphemeBoundaryRule->getCompiled()), "dot_gext");
    417         pos = MarkerPosition::InitialPostPositionByte;
    418     }
    419     return makeMarker(pos, dot);
     398    return makeMarker(MarkerPosition::FinalMatchByte, pb.createAnd(nextFinalByte, pb.createNot(lb), "dot"));
    420399}
    421400
    422401inline MarkerType RE_Compiler::compileName(Name * name, MarkerType marker, PabloBuilder & pb) {
     402    MarkerType nameMarker = compileName(name, pb);
    423403    MarkerType nextPos;
    424404    if (markerPos(marker) == MarkerPosition::FinalPostPositionByte) {
     
    429409        nextPos = AdvanceMarker(marker, MarkerPosition::FinalPostPositionByte, pb);
    430410    }
    431     PabloAST * namePos = pb.createAnd(markerVar(nextPos), compileName(name, pb), name->getName());
    432     MarkerPosition pos = MarkerPosition::FinalMatchByte;
    433     if (mGraphemeBoundaryRule) {
    434         namePos = pb.createScanThru(namePos, pb.createOr(namePos, pb.createOr(mNonFinal, mGraphemeBoundaryRule->getCompiled())), name->getName() + "_gext");
    435         pos = MarkerPosition::FinalPostPositionByte;
    436     }
    437     return makeMarker(pos, namePos);
    438 }
    439 
    440 inline PabloAST * RE_Compiler::compileName(Name * name, PabloBuilder & pb) {
    441     PabloAST * var = name->getCompiled();
    442     if (LLVM_LIKELY(var != nullptr)) {
    443         return var;
     411    nameMarker.stream = pb.createAnd(markerVar(nextPos), markerVar(nameMarker), name->getName());
     412    return nameMarker;
     413}
     414
     415inline MarkerType RE_Compiler::compileName(Name * name, PabloBuilder & pb) {
     416    auto f = mCompiledName.find(name);
     417    if (LLVM_LIKELY(f != mCompiledName.end())) {
     418        return f->second;
    444419    } else if (LLVM_LIKELY(name->getDefinition() != nullptr)) {
    445420        MarkerType m = compile(name->getDefinition(), pb);
    446         assert(markerPos(m) == MarkerPosition::FinalMatchByte);
    447         var = pb.createAnd(markerVar(m), mAny);
    448         name->setCompiled(var);
    449         return var;
     421        mCompiledName.insert(std::make_pair(name, m));
     422        return m;
    450423    }
    451424    throw std::runtime_error("Unresolved name " + name->getName());
    452425}
    453426
    454 MarkerType RE_Compiler::process(Seq * seq, MarkerType marker, PabloBuilder & pb) {
     427MarkerType RE_Compiler::compileSeq(Seq * seq, MarkerType marker, PabloBuilder & pb) {
    455428    // if-hierarchies are not inserted within unbounded repetitions
    456429    if (mStarDepth > 0) {
     
    460433        return marker;
    461434    } else {
    462         return processSeqTail(seq->begin(), seq->end(), 0, marker, pb);
    463     }
    464 }
    465 
    466 MarkerType RE_Compiler::processSeqTail(Seq::iterator current, Seq::iterator end, int matchLenSoFar, MarkerType marker, PabloBuilder & pb) {
     435        return compileSeqTail(seq->begin(), seq->end(), 0, marker, pb);
     436    }
     437}
     438
     439MarkerType RE_Compiler::compileSeqTail(Seq::iterator current, Seq::iterator end, int matchLenSoFar, MarkerType marker, PabloBuilder & pb) {
    467440    if (current == end) return marker;
    468441    if (matchLenSoFar < IfInsertionGap) {
     
    470443        marker = process(r, marker, pb);
    471444        current++;
    472         return processSeqTail(current, end, matchLenSoFar + minMatchLength(r), marker, pb);
     445        return compileSeqTail(current, end, matchLenSoFar + minMatchLength(r), marker, pb);
    473446    } else {
    474447        PabloBuilder nested = PabloBuilder::Create(pb);
    475         MarkerType m1 = processSeqTail(current, end, 0, marker, nested);
     448        MarkerType m1 = compileSeqTail(current, end, 0, marker, nested);
    476449        Assign * m1a = nested.createAssign("m", markerVar(m1));
    477450        pb.createIf(markerVar(marker), {m1a}, nested);
     
    480453}
    481454
    482 MarkerType RE_Compiler::process(Alt * alt, MarkerType marker, PabloBuilder & pb) {
     455MarkerType RE_Compiler::compileAlt(Alt * alt, MarkerType marker, PabloBuilder & pb) {
    483456    std::vector<PabloAST *>  accum = {pb.createZeroes(), pb.createZeroes(), pb.createZeroes()};
    484457    MarkerType const base = marker;
     
    502475}
    503476
    504 MarkerType RE_Compiler::process(Assertion * a, MarkerType marker, PabloBuilder & pb) {
     477MarkerType RE_Compiler::compileAssertion(Assertion * a, MarkerType marker, PabloBuilder & pb) {
    505478    RE * asserted = a->getAsserted();
    506479    if (a->getKind() == Assertion::Kind::Lookbehind) {
     
    515488    } else if (isUnicodeUnitLength(asserted)) {
    516489        MarkerType lookahead = compile(asserted, pb);
    517         assert(markerPos(lookahead) == MarkerPosition::FinalMatchByte);
    518         PabloAST * la = markerVar(lookahead);
    519         if (a->getSense() == Assertion::Sense::Negative) {
    520             la = pb.createNot(la);
    521         }
    522         MarkerType fbyte = AdvanceMarker(marker, MarkerPosition::FinalPostPositionByte, pb);
    523         return makeMarker(MarkerPosition::FinalPostPositionByte, pb.createAnd(markerVar(fbyte), la, "lookahead"));
     490        if (LLVM_LIKELY(markerPos(lookahead) == MarkerPosition::FinalMatchByte)) {
     491            PabloAST * la = markerVar(lookahead);
     492            if (a->getSense() == Assertion::Sense::Negative) {
     493                la = pb.createNot(la);
     494            }
     495            MarkerType fbyte = AdvanceMarker(marker, MarkerPosition::FinalPostPositionByte, pb);
     496            return makeMarker(MarkerPosition::FinalPostPositionByte, pb.createAnd(markerVar(fbyte), la, "lookahead"));
     497        }
    524498    }
    525499    throw std::runtime_error("Unsupported lookahead assertion.");
     
    532506}
    533507
    534 MarkerType RE_Compiler::process(Diff * diff, MarkerType marker, PabloBuilder & pb) {
     508MarkerType RE_Compiler::compileDiff(Diff * diff, MarkerType marker, PabloBuilder & pb) {
    535509    RE * lh = diff->getLH();
    536510    RE * rh = diff->getRH();
     
    544518}
    545519
    546 MarkerType RE_Compiler::process(Intersect * x, MarkerType marker, PabloBuilder & pb) {
     520MarkerType RE_Compiler::compileIntersect(Intersect * x, MarkerType marker, PabloBuilder & pb) {
    547521    RE * lh = x->getLH();
    548522    RE * rh = x->getRH();
     
    556530}
    557531
    558 MarkerType RE_Compiler::process(Rep * rep, MarkerType marker, PabloBuilder & pb) {
     532MarkerType RE_Compiler::compileRep(Rep * rep, MarkerType marker, PabloBuilder & pb) {
    559533    int lb = rep->getLB();
    560534    int ub = rep->getUB();
     
    563537    }
    564538    if (ub == Rep::UNBOUNDED_REP) {
    565         return processUnboundedRep(rep->getRE(), marker, pb);
    566     }
    567     else if (ub == lb) { // if (rep->getUB() != Rep::UNBOUNDED_REP)
    568         return marker;
    569     }
    570     else { // if (rep->getUB() != Rep::UNBOUNDED_REP)
    571         return processBoundedRep(rep->getRE(), ub - lb, marker, pb);
    572     }
     539        marker = processUnboundedRep(rep->getRE(), marker, pb);
     540    } else if (lb < ub) {
     541        marker = processBoundedRep(rep->getRE(), ub - lb, marker, pb);
     542    }
     543    return marker;
    573544}
    574545
     
    579550*/
    580551
    581 inline PabloAST * RE_Compiler::consecutive1(PabloAST * repeated, int repeated_lgth, int repeat_count, PabloBuilder & pb) {
    582         int i = repeated_lgth;
    583         int total_lgth = repeat_count * repeated_lgth;
     552inline PabloAST * RE_Compiler::consecutive_matches(PabloAST * repeated, int length, int repeat_count, PabloBuilder & pb) {
     553        int i = length;
     554        int total = repeat_count * length;
    584555        PabloAST * consecutive_i = repeated;
    585         while (i * 2 <= total_lgth) {
     556        while (i * 2 < total) {
    586557            PabloAST * v = consecutive_i;
    587558            PabloAST * v2 =  pb.createAdvance(v, i);
    588559            i *= 2;
    589             consecutive_i = pb.createAnd(v, v2, "at" + std::to_string(i) + "inarow");
     560            consecutive_i = pb.createAnd(v, v2, "at" + std::to_string(i) + "of" + std::to_string(total));
    590561        }       
    591         if (i < total_lgth) {
     562        if (i < total) {
    592563            PabloAST * v = consecutive_i;
    593             consecutive_i = pb.createAnd(v, pb.createAdvance(v, total_lgth - i), "at" + std::to_string(total_lgth) + "inarow");
     564            consecutive_i = pb.createAnd(v, pb.createAdvance(v, total - i), "at" + std::to_string(total));
    594565        }
    595566        return consecutive_i;
     
    619590    if (isByteLength(repeated) && !DisableLog2BoundedRepetition) {
    620591        PabloAST * cc = markerVar(compile(repeated, pb));
    621         PabloAST * cc_lb = consecutive1(cc, 1, lb, pb);
     592        PabloAST * cc_lb = consecutive_matches(cc, 1, lb, pb);
    622593        PabloAST * marker_fwd = pb.createAdvance(markerVar(marker), markerPos(marker) == MarkerPosition::FinalMatchByte ? lb : lb - 1);
    623594        return makeMarker(MarkerPosition::FinalMatchByte, pb.createAnd(marker_fwd, cc_lb, "lowerbound"));
     
    653624MarkerType RE_Compiler::processUnboundedRep(RE * repeated, MarkerType marker, PabloBuilder & pb) {
    654625    // always use PostPosition markers for unbounded repetition.
    655     PabloAST * base = markerVar(AdvanceMarker(marker, MarkerPosition::InitialPostPositionByte, pb));
    656    
     626    PabloAST * base = markerVar(AdvanceMarker(marker, MarkerPosition::InitialPostPositionByte, pb));   
    657627    if (isByteLength(repeated)  && !DisableMatchStar) {
    658628        PabloAST * cc = markerVar(compile(repeated, pb));
     
    670640        PabloAST * nonFinal = mNonFinal;
    671641        if (mGraphemeBoundaryRule) {
    672             nonFinal = pb.createOr(nonFinal, pb.createNot(mGraphemeBoundaryRule->getCompiled()));
     642            nonFinal = pb.createOr(nonFinal, pb.createNot(mGraphemeBoundaryRule));
    673643        }
    674644        cc = pb.createOr(cc, nonFinal);
     
    680650        PabloAST * final = mFinal;
    681651        if (mGraphemeBoundaryRule) {
    682             final = pb.createOr(final, mGraphemeBoundaryRule->getCompiled());
     652            final = pb.createOr(final, mGraphemeBoundaryRule);
    683653        }
    684654        return makeMarker(MarkerPosition::FinalPostPositionByte, pb.createAnd(mstar, final, "unbounded"));
    685655    } else if (mStarDepth > 0){
    686 
    687         PabloBuilder * outerb = pb.getParent();
    688        
     656        PabloBuilder * outerb = pb.getParent();       
    689657        Assign * starPending = outerb->createAssign("pending", outerb->createZeroes());
    690         Assign * starAccum = outerb->createAssign("accum", outerb->createZeroes());
    691        
     658        Assign * starAccum = outerb->createAssign("accum", outerb->createZeroes());       
    692659        mStarDepth++;
    693660        PabloAST * m1 = pb.createOr(base, starPending);
     
    699666        mLoopVariants.push_back(nextPending);
    700667        mLoopVariants.push_back(nextStarAccum);
    701         mStarDepth--;
    702        
     668        mStarDepth--;       
    703669        return makeMarker(MarkerPosition::InitialPostPositionByte, pb.createAssign("unbounded", pb.createOr(base, nextStarAccum)));
    704670    } else {
     
    725691}
    726692
     693inline MarkerType RE_Compiler::compileStart(const MarkerType marker, pablo::PabloBuilder & pb) {
     694    MarkerType m = AdvanceMarker(marker, MarkerPosition::InitialPostPositionByte, pb);
     695    if (UNICODE_LINE_BREAK) {
     696        PabloAST * line_end = mPB.createOr(mUnicodeLineBreak, mCRLF);
     697        PabloAST * sol = pb.createNot(pb.createOr(pb.createAdvance(pb.createNot(line_end), 1), mCRLF));
     698        return makeMarker(MarkerPosition::InitialPostPositionByte, pb.createAnd(markerVar(m), sol, "sol"));
     699    } else {
     700        PabloAST * sol = pb.createNot(pb.createAdvance(pb.createNot(mLineFeed), 1));
     701        return makeMarker(MarkerPosition::FinalPostPositionByte, pb.createAnd(markerVar(m), sol, "sol"));
     702    }
     703}
     704
     705inline MarkerType RE_Compiler::compileEnd(const MarkerType marker, pablo::PabloBuilder & pb) {
     706    if (UNICODE_LINE_BREAK) {
     707        PabloAST * nextPos = markerVar(AdvanceMarker(marker, MarkerPosition::FinalPostPositionByte, pb));
     708        return makeMarker(MarkerPosition::FinalPostPositionByte, pb.createAnd(nextPos, mUnicodeLineBreak, "end"));
     709    } else {
     710        PabloAST * nextPos = markerVar(AdvanceMarker(marker, MarkerPosition::InitialPostPositionByte, pb));  // For LF match
     711        return makeMarker(MarkerPosition::FinalPostPositionByte, pb.createAnd(nextPos, mLineFeed, "eol"));
     712    }
     713}
     714
     715inline MarkerType RE_Compiler::compileGraphemeBoundary(GraphemeBoundary * gb, const MarkerType marker, pablo::PabloBuilder & pb) {
     716    const auto inGraphemeBoundaryRule = mGraphemeBoundaryRule;
     717    auto f = mCompiledName.find(gb->getGraphemeBoundaryRule());
     718    if (LLVM_UNLIKELY(f == mCompiledName.end())) {
     719        throw std::runtime_error("Internal error: failed to locate grapheme boundary rule!");
     720    }
     721    mGraphemeBoundaryRule = markerVar(f->second);
     722    assert (mGraphemeBoundaryRule);
     723    MarkerType result = process(gb->getExpression(), marker, pb);
     724    mGraphemeBoundaryRule = inGraphemeBoundaryRule;
     725    return result;
     726}
     727
    727728inline MarkerType RE_Compiler::AdvanceMarker(const MarkerType m, const MarkerPosition newpos, PabloBuilder & pb) {
    728729    if (m.pos == newpos) return m;
     
    730731    if (m.pos == MarkerPosition::FinalMatchByte) {
    731732        // Must advance the previous marker to the InitialPostPositionByte
    732         a = pb.createAdvance(a, 1, "initial");
     733        a = pb.createAdvance(a, 1, "ipp");
    733734    }
    734735    // Now at InitialPostPositionByte; is a further advance needed?
    735736    if (newpos == MarkerPosition::FinalPostPositionByte) {
    736737        // Must advance through nonfinal bytes
    737         a = pb.createScanThru(pb.createAnd(mInitial, a), mNonFinal, "final");
     738        PabloAST * nonFinal = mNonFinal;
     739        if (mGraphemeBoundaryRule) {
     740            nonFinal = pb.createOr(nonFinal, mGraphemeBoundaryRule, "gext");
     741        }
     742        a = pb.createScanThru(pb.createAnd(mInitial, a), nonFinal, "fpp");
    738743    }
    739744    return {newpos, a};
  • icGREP/icgrep-devel/icgrep/re/re_compiler.h

    r4829 r4831  
    6969    MarkerType process(RE * re, MarkerType marker, pablo::PabloBuilder & pb);
    7070    MarkerType compileName(Name * name, MarkerType marker, pablo::PabloBuilder & pb);
    71     MarkerType process(Seq * seq, MarkerType marker, pablo::PabloBuilder & pb);
    72     MarkerType processSeqTail(Seq::iterator current, Seq::iterator end, int matchLenSoFar, MarkerType marker, pablo::PabloBuilder & pb);
    73     MarkerType process(Alt * alt, MarkerType marker, pablo::PabloBuilder & pb);
    74     MarkerType process(Assertion * a, MarkerType marker, pablo::PabloBuilder & pb);
    75     MarkerType process(Rep * rep, MarkerType marker, pablo::PabloBuilder & pb);
    76     MarkerType process(Diff * diff, MarkerType marker, pablo::PabloBuilder & cg);
    77     MarkerType process(Intersect * x, MarkerType marker, pablo::PabloBuilder & cg);
    78     pablo::PabloAST *consecutive1(pablo::PabloAST *repeated,  int repeated_lgth, int repeat_count, pablo::PabloBuilder & pb);
     71    MarkerType compileSeq(Seq * seq, MarkerType marker, pablo::PabloBuilder & pb);
     72    MarkerType compileSeqTail(Seq::iterator current, Seq::iterator end, int matchLenSoFar, MarkerType marker, pablo::PabloBuilder & pb);
     73    MarkerType compileAlt(Alt * alt, MarkerType marker, pablo::PabloBuilder & pb);
     74    MarkerType compileAssertion(Assertion * a, MarkerType marker, pablo::PabloBuilder & pb);
     75    MarkerType compileRep(Rep * rep, MarkerType marker, pablo::PabloBuilder & pb);
     76    MarkerType compileDiff(Diff * diff, MarkerType marker, pablo::PabloBuilder & cg);
     77    MarkerType compileIntersect(Intersect * x, MarkerType marker, pablo::PabloBuilder & cg);
     78    pablo::PabloAST * consecutive_matches(pablo::PabloAST * repeated,  int length, int repeat_count, pablo::PabloBuilder & pb);
    7979    pablo::PabloAST * reachable(pablo::PabloAST * repeated,  int repeated_lgth, int repeat_count, pablo::PabloBuilder & pb);
    8080    static bool isFixedLength(RE * regexp);
     
    8585
    8686    Name * generateGraphemeClusterBoundaryRule();
    87     pablo::PabloAST * compileName(Name * name, pablo::PabloBuilder & pb);
     87    MarkerType compileName(Name * name, pablo::PabloBuilder & pb);
    8888    MarkerType compileAny(const MarkerType m, pablo::PabloBuilder & pb);
     89    MarkerType compileStart(const MarkerType marker, pablo::PabloBuilder & pb);
     90    MarkerType compileEnd(const MarkerType marker, pablo::PabloBuilder & pb);
     91    MarkerType compileGraphemeBoundary(GraphemeBoundary *gb, const MarkerType marker, pablo::PabloBuilder & pb);
    8992
    9093private:
     
    9598    pablo::PabloAST *                               mUnicodeLineBreak;
    9699    pablo::PabloAST *                               mAny;
    97     Name *                                          mGraphemeBoundaryRule;
     100    pablo::PabloAST *                               mGraphemeBoundaryRule;
    98101    pablo::PabloAST *                               mInitial;
    99     pablo::Assign *                                 mNonFinal;
     102    pablo::Assign *                                 mNonFinal;   
    100103    pablo::PabloAST *                               mFinal;
    101104    pablo::PabloAST *                               mWhileTest;
     
    103106    std::vector<pablo::Next *>                      mLoopVariants; // <- rethink name
    104107    pablo::PabloBuilder                             mPB;
     108    std::unordered_map<Name *, MarkerType>          mCompiledName;   
    105109    pablo::PabloFunction &                          mFunction;
    106110};
  • icGREP/icgrep-devel/icgrep/re/re_name.h

    r4823 r4831  
    55#include <re/re_cc.h>
    66#include <string>
    7 
    8 namespace pablo {
    9     class PabloAST;
    10 }
    117
    128namespace UCD {
     
    3632    Type getType() const;
    3733    RE * getDefinition() const;
    38     pablo::PabloAST * getCompiled() const {
    39         return mCompiled;
    40     }
    41     void setCompiled(pablo::PabloAST * var) {
    42         mCompiled = var;
    43     }
    4434    bool operator<(const Name & other) const;
    4535    bool operator<(const CC & other) const;
     
    6050    , mType(type)
    6151    , mDefinition(defn)
    62     , mCompiled(nullptr)
    6352    {
    6453
     
    8069    const Type          mType;
    8170    RE *                mDefinition;
    82     pablo::PabloAST *   mCompiled;
    8371};
    8472
  • icGREP/icgrep-devel/icgrep/re/re_parser.cpp

    r4830 r4831  
    4141    RE_Parser parser(regular_expression);
    4242    parser.fModeFlagSet = initialFlags;
    43 
    4443    parser.fNested = false;
    4544    RE * re = parser.parse_RE();
    4645    if (re == nullptr) {
    4746        throw ParseFailure("An unexpected parsing error occurred!");
     47    }
     48    if (parser.fModeFlagSet & ModeFlagType::GRAPHEME_CLUSTER_MODE) {
     49        re = makeGraphemeBoundary(re, GraphemeBoundary::Type::ClusterBoundary);
    4850    }
    4951    return re;
     
    203205                }
    204206                break;
    205             case '-': case 'd' : case 'i': case 'm': case 's': case 'x': {
     207            case '-': case 'd' : case 'i': case 'm': case 's': case 'x': case 'g': {
    206208                bool negateMode = false;
    207209                ModeFlagType modeBit;
    208                 while (mCursor.more() && *mCursor != ')' && *mCursor != ':') {
     210                while (*mCursor != ')' && *mCursor != ':') {
    209211                    if (*mCursor == '-') {
    210212                        negateMode = true;
     
    213215                    switch (*mCursor++) {
    214216                        case 'i': modeBit = CASE_INSENSITIVE_MODE_FLAG; break;
     217                        case 'g': modeBit = GRAPHEME_CLUSTER_MODE; break;
    215218                        //case 'm': modeBit = MULTILINE_MODE_FLAG; break;
    216219                        //case 's': modeBit = DOTALL_MODE_FLAG; break;
     
    282285        }
    283286        ++mCursor;
    284         if (mCursor.more()) {
    285             if (*mCursor == '?') { // Non-greedy qualifier
    286                 // Greedy vs. non-greedy makes no difference for icgrep.
    287                 ++mCursor;
    288             } else if (*mCursor == '+') {
    289                 ++mCursor;
    290                 throw ParseFailure("Possessive repetition is not supported in icgrep 1.0");
    291             }
     287        if (*mCursor == '?') { // Non-greedy qualifier
     288            // Greedy vs. non-greedy makes no difference for icgrep.
     289            ++mCursor;
     290        } else if (*mCursor == '+') {
     291            ++mCursor;
     292            throw ParseFailure("Possessive repetition is not supported in icgrep 1.0");
    292293        }
    293294        return makeRep(re, lb, ub);
     
    297298inline std::pair<int, int> RE_Parser::parse_range_bound() {
    298299    int lower_bound = 0, upper_bound = 0;
    299     ++mCursor;
    300     if (*mCursor == ',') {
     300    if (*++mCursor == ',') {
    301301        ++mCursor;
    302         lower_bound = 0;
    303302    } else {
    304303        lower_bound = parse_int();
     
    308307    } else if (*mCursor != ',') {
    309308        throw BadLowerBound();
    310     } else { // [^,}]
    311         ++mCursor;
    312         if (*mCursor == '}') {
    313             upper_bound = Rep::UNBOUNDED_REP;
    314         }
    315         else {
    316             upper_bound = parse_int();
    317             if (*mCursor != '}') {
    318                 throw BadUpperBound();
    319             }
     309    } else if (*++mCursor == '}') {
     310        upper_bound = Rep::UNBOUNDED_REP;
     311    } else {
     312        upper_bound = parse_int();
     313        if (*mCursor != '}') {
     314            throw BadUpperBound();
    320315        }
    321316    }
     
    325320unsigned RE_Parser::parse_int() {
    326321    unsigned value = 0;
    327     for (; mCursor.more(); ++mCursor) {
    328         if (!isdigit(*mCursor)) {
    329             break;
    330         }
     322    while (isdigit(*mCursor)) {
    331323        value *= 10;
    332         value += static_cast<int>(*mCursor) - 48;
     324        value += static_cast<int>(*mCursor++) - 48;
    333325    }
    334326    return value;
     
    364356        case 'B': complemented = true;
    365357        case 'b':
    366             ++mCursor;
    367             if (!mCursor.more() || *mCursor != '{') {
     358            if (*++mCursor != '{') {
    368359                return complemented ? makeWordNonBoundary() : makeWordBoundary();
    369             }
    370             else {
    371                 ++mCursor;
    372                 switch (*mCursor) {
     360            } else {
     361                switch (*++mCursor) {
    373362                    case 'g': re = makeGraphemeClusterBoundary();
    374363                    case 'w': throw ParseFailure("\\b{w} not yet supported.");
     
    377366                    default: throw ParseFailure("Unrecognized boundary assertion");
    378367                }
    379                 ++mCursor;
    380                 if (*mCursor != '}') {
     368                if (*++mCursor != '}') {
    381369                    throw ParseFailure("Malformed boundary assertion");
    382370                }
     
    432420            // to get to the next extended grapheme cluster boundary.
    433421            ++mCursor;
    434             return makeSeq({makeRep(makeAny(),1,Rep::UNBOUNDED_REP), makeGraphemeClusterBoundary()});
     422            return makeGraphemeBoundary(makeAny(), GraphemeBoundary::Type::ClusterBoundary);
    435423        case 'N':
    436424            if (*++mCursor != '{') {
     
    597585        case '&':
    598586            ++mCursor;
    599             if (mCursor.more() && *mCursor == '&') {
     587            if (*mCursor == '&') {
    600588                ++mCursor;
    601589                return intersectOp;
    602             }
    603             else if (mCursor.more() && *mCursor == '[') {
     590            } else if (*mCursor == '[') {
    604591                // Short-hand for intersectOp when a set follows
    605592                return intersectOp;
    606593            }
    607             else return ampChar;
     594            return ampChar;
    608595        case '-':
    609596            ++mCursor;
    610             if (mCursor.more() && *mCursor == '-') {
     597            if (*mCursor == '-') {
    611598                ++mCursor;
    612599                return setDiffOp;
    613             }
    614             else if (mCursor.more() && *mCursor == '[') {
     600            } else if (*mCursor == '[') {
    615601                return setDiffOp;
    616             }
    617             else if (mCursor.more() && *mCursor == ']') {
     602            } else if (*mCursor == ']') {
    618603                return hyphenChar;
    619604            }
    620             else return rangeHyphen;
     605            return rangeHyphen;
    621606        case '[':
    622607            ++mCursor;
    623             if (mCursor.more() && *mCursor == ':') {
     608            if (*mCursor == ':') {
    624609                ++mCursor;
    625610                return posixPropertyOpener;
    626611            }
    627             else return setOpener;
     612            return setOpener;
    628613        case ']':
    629614            ++mCursor;
     
    809794
    810795codepoint_t RE_Parser::parse_codepoint() {
    811     if (mCursor.more() && *mCursor == '\\') {
     796    if (*mCursor == '\\') {
    812797        mCursor++;
    813798        return parse_escaped_codepoint();
    814     }
    815     else {
     799    } else {
    816800        return parse_utf8_codepoint();
    817801    }
  • icGREP/icgrep-devel/icgrep/re/re_parser.h

    r4829 r4831  
    2424    {intersectOp, setDiffOp, ampChar, hyphenChar, rangeHyphen, posixPropertyOpener, setOpener, setCloser, backSlash, emptyOperator};
    2525
    26 enum ModeFlagType
    27     {CASE_INSENSITIVE_MODE_FLAG = 1,
    28      MULTILINE_MODE_FLAG = 2,      // not currently implemented
    29      DOTALL_MODE_FLAG = 4,         // not currently implemented
    30      IGNORE_SPACE_MODE_FLAG = 8,   // not currently implemented
    31      UNIX_LINES_MODE_FLAG = 16};   // not currently implemented
     26enum ModeFlagType : unsigned {
     27    NONE = 0,
     28    CASE_INSENSITIVE_MODE_FLAG = 1,
     29    MULTILINE_MODE_FLAG = 2,      // not currently implemented
     30    DOTALL_MODE_FLAG = 4,         // not currently implemented
     31    IGNORE_SPACE_MODE_FLAG = 8,   // not currently implemented
     32    UNIX_LINES_MODE_FLAG = 16,    // not currently implemented
     33    GRAPHEME_CLUSTER_MODE = 32
     34};
    3235
    3336const int MAX_REPETITION_LOWER_BOUND = 1024;
     
    7174        inline const char_t operator*() const {
    7275            if (LLVM_UNLIKELY(mCursor == mEnd)) {
    73                 throw IncompleteRegularExpression();
     76                return 0;
    7477            }
    7578            return *mCursor;
Note: See TracChangeset for help on using the changeset viewer.