Ignore:
Timestamp:
Oct 13, 2015, 3:57:17 PM (3 years ago)
Author:
nmedfort
Message:

First attempt at adding grapheme cluster mode to icgrep.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/re/re_compiler.cpp

    r4829 r4831  
    3333#include "llvm/Support/CommandLine.h"
    3434#include <sstream>
     35#include <unordered_set>
    3536
    3637static cl::OptionCategory fREcompilationOptions("Regex Compilation Options", "These options control the compilation of regular expressions to Pablo.");
     
    258259
    259260    UCD::UCDCompiler::NameMap nameMap;
     261    std::unordered_set<Name *> visited;
    260262
    261263    std::function<void(RE*)> gather = [&](RE * re) {
    262264        if (Name * name = dyn_cast<Name>(re)) {
    263             if (name->getCompiled() == nullptr) {
     265            if (visited.insert(name).second) {
    264266                if (isa<CC>(name->getDefinition())) {
    265267                    nameMap.emplace(name, nullptr);
     
    300302        for (auto t : nameMap) {
    301303            if (t.second) {
    302                 t.first->setCompiled(mPB.createAnd(t.second, mAny));
     304                mCompiledName.insert(std::make_pair(t.first, makeMarker(MarkerPosition::FinalMatchByte, mPB.createAnd(t.second, mAny))));
    303305            }
    304306        }
     
    307309    // Now precompile any grapheme segmentation rules
    308310    if (gcbRule) {
    309         compileName(gcbRule, mPB);
     311        mCompiledName.insert(std::make_pair(gcbRule, compileName(gcbRule, mPB)));
    310312    }
    311313    return re;
     
    365367        return compileName(name, marker, pb);
    366368    } else if (Seq* seq = dyn_cast<Seq>(re)) {
    367         return process(seq, marker, pb);
     369        return compileSeq(seq, marker, pb);
    368370    } else if (Alt * alt = dyn_cast<Alt>(re)) {
    369         return process(alt, marker, pb);
     371        return compileAlt(alt, marker, pb);
    370372    } else if (Rep * rep = dyn_cast<Rep>(re)) {
    371         return process(rep, marker, pb);
     373        return compileRep(rep, marker, pb);
    372374    } else if (Assertion * a = dyn_cast<Assertion>(re)) {
    373         return process(a, marker, pb);
     375        return compileAssertion(a, marker, pb);
    374376    } else if (isa<Any>(re)) {
    375377        return compileAny(marker, pb);
    376378    } else if (Diff * diff = dyn_cast<Diff>(re)) {
    377         return process(diff, marker, pb);
     379        return compileDiff(diff, marker, pb);
    378380    } else if (Intersect * ix = dyn_cast<Intersect>(re)) {
    379         return process(ix, marker, pb);
     381        return compileIntersect(ix, marker, pb);
    380382    } else if (isa<Start>(re)) {
    381         MarkerType m = AdvanceMarker(marker, MarkerPosition::InitialPostPositionByte, pb);
    382         if (UNICODE_LINE_BREAK) {
    383             PabloAST * line_end = mPB.createOr(mUnicodeLineBreak, mCRLF);
    384             PabloAST * sol = pb.createNot(pb.createOr(pb.createAdvance(pb.createNot(line_end), 1), mCRLF));
    385             return makeMarker(MarkerPosition::InitialPostPositionByte, pb.createAnd(markerVar(m), sol, "sol"));
    386         } else {
    387             PabloAST * sol = pb.createNot(pb.createAdvance(pb.createNot(mLineFeed), 1));
    388             return makeMarker(MarkerPosition::FinalPostPositionByte, pb.createAnd(markerVar(m), sol, "sol"));
    389         }
     383        return compileStart(marker, pb);
    390384    } else if (isa<End>(re)) {
    391         if (UNICODE_LINE_BREAK) {
    392             PabloAST * nextPos = markerVar(AdvanceMarker(marker, MarkerPosition::FinalPostPositionByte, pb));
    393             return makeMarker(MarkerPosition::FinalPostPositionByte, pb.createAnd(nextPos, mUnicodeLineBreak, "end"));
    394         }
    395         PabloAST * nextPos = markerVar(AdvanceMarker(marker, MarkerPosition::InitialPostPositionByte, pb));  // For LF match
    396         return makeMarker(MarkerPosition::FinalPostPositionByte, pb.createAnd(nextPos, mLineFeed, "eol"));
     385        return compileEnd(marker, pb);
    397386    } else if (GraphemeBoundary * gb = dyn_cast<GraphemeBoundary>(re)) {
    398         const auto inGraphemeBoundaryRule = mGraphemeBoundaryRule;
    399         mGraphemeBoundaryRule = gb->getGraphemeBoundaryRule();
    400         assert (mGraphemeBoundaryRule);
    401         marker = process(gb->getExpression(), marker, pb);
    402         mGraphemeBoundaryRule = inGraphemeBoundaryRule;
    403     }
    404     return marker;
     387        return compileGraphemeBoundary(gb, marker, pb);
     388    }
     389    throw std::runtime_error("RE Compiler failed to process " + Printer_RE::PrintRE(re));
    405390}
    406391
     
    411396        lb = pb.createOr(mUnicodeLineBreak, mCRLF);
    412397    }
    413     PabloAST * dot = pb.createAnd(nextFinalByte, pb.createNot(lb), "dot");
    414     MarkerPosition pos = MarkerPosition::FinalMatchByte;
    415     if (mGraphemeBoundaryRule) {
    416         dot = pb.createScanThru(dot, pb.createOr(dot, mGraphemeBoundaryRule->getCompiled()), "dot_gext");
    417         pos = MarkerPosition::InitialPostPositionByte;
    418     }
    419     return makeMarker(pos, dot);
     398    return makeMarker(MarkerPosition::FinalMatchByte, pb.createAnd(nextFinalByte, pb.createNot(lb), "dot"));
    420399}
    421400
    422401inline MarkerType RE_Compiler::compileName(Name * name, MarkerType marker, PabloBuilder & pb) {
     402    MarkerType nameMarker = compileName(name, pb);
    423403    MarkerType nextPos;
    424404    if (markerPos(marker) == MarkerPosition::FinalPostPositionByte) {
     
    429409        nextPos = AdvanceMarker(marker, MarkerPosition::FinalPostPositionByte, pb);
    430410    }
    431     PabloAST * namePos = pb.createAnd(markerVar(nextPos), compileName(name, pb), name->getName());
    432     MarkerPosition pos = MarkerPosition::FinalMatchByte;
    433     if (mGraphemeBoundaryRule) {
    434         namePos = pb.createScanThru(namePos, pb.createOr(namePos, pb.createOr(mNonFinal, mGraphemeBoundaryRule->getCompiled())), name->getName() + "_gext");
    435         pos = MarkerPosition::FinalPostPositionByte;
    436     }
    437     return makeMarker(pos, namePos);
    438 }
    439 
    440 inline PabloAST * RE_Compiler::compileName(Name * name, PabloBuilder & pb) {
    441     PabloAST * var = name->getCompiled();
    442     if (LLVM_LIKELY(var != nullptr)) {
    443         return var;
     411    nameMarker.stream = pb.createAnd(markerVar(nextPos), markerVar(nameMarker), name->getName());
     412    return nameMarker;
     413}
     414
     415inline MarkerType RE_Compiler::compileName(Name * name, PabloBuilder & pb) {
     416    auto f = mCompiledName.find(name);
     417    if (LLVM_LIKELY(f != mCompiledName.end())) {
     418        return f->second;
    444419    } else if (LLVM_LIKELY(name->getDefinition() != nullptr)) {
    445420        MarkerType m = compile(name->getDefinition(), pb);
    446         assert(markerPos(m) == MarkerPosition::FinalMatchByte);
    447         var = pb.createAnd(markerVar(m), mAny);
    448         name->setCompiled(var);
    449         return var;
     421        mCompiledName.insert(std::make_pair(name, m));
     422        return m;
    450423    }
    451424    throw std::runtime_error("Unresolved name " + name->getName());
    452425}
    453426
    454 MarkerType RE_Compiler::process(Seq * seq, MarkerType marker, PabloBuilder & pb) {
     427MarkerType RE_Compiler::compileSeq(Seq * seq, MarkerType marker, PabloBuilder & pb) {
    455428    // if-hierarchies are not inserted within unbounded repetitions
    456429    if (mStarDepth > 0) {
     
    460433        return marker;
    461434    } else {
    462         return processSeqTail(seq->begin(), seq->end(), 0, marker, pb);
    463     }
    464 }
    465 
    466 MarkerType RE_Compiler::processSeqTail(Seq::iterator current, Seq::iterator end, int matchLenSoFar, MarkerType marker, PabloBuilder & pb) {
     435        return compileSeqTail(seq->begin(), seq->end(), 0, marker, pb);
     436    }
     437}
     438
     439MarkerType RE_Compiler::compileSeqTail(Seq::iterator current, Seq::iterator end, int matchLenSoFar, MarkerType marker, PabloBuilder & pb) {
    467440    if (current == end) return marker;
    468441    if (matchLenSoFar < IfInsertionGap) {
     
    470443        marker = process(r, marker, pb);
    471444        current++;
    472         return processSeqTail(current, end, matchLenSoFar + minMatchLength(r), marker, pb);
     445        return compileSeqTail(current, end, matchLenSoFar + minMatchLength(r), marker, pb);
    473446    } else {
    474447        PabloBuilder nested = PabloBuilder::Create(pb);
    475         MarkerType m1 = processSeqTail(current, end, 0, marker, nested);
     448        MarkerType m1 = compileSeqTail(current, end, 0, marker, nested);
    476449        Assign * m1a = nested.createAssign("m", markerVar(m1));
    477450        pb.createIf(markerVar(marker), {m1a}, nested);
     
    480453}
    481454
    482 MarkerType RE_Compiler::process(Alt * alt, MarkerType marker, PabloBuilder & pb) {
     455MarkerType RE_Compiler::compileAlt(Alt * alt, MarkerType marker, PabloBuilder & pb) {
    483456    std::vector<PabloAST *>  accum = {pb.createZeroes(), pb.createZeroes(), pb.createZeroes()};
    484457    MarkerType const base = marker;
     
    502475}
    503476
    504 MarkerType RE_Compiler::process(Assertion * a, MarkerType marker, PabloBuilder & pb) {
     477MarkerType RE_Compiler::compileAssertion(Assertion * a, MarkerType marker, PabloBuilder & pb) {
    505478    RE * asserted = a->getAsserted();
    506479    if (a->getKind() == Assertion::Kind::Lookbehind) {
     
    515488    } else if (isUnicodeUnitLength(asserted)) {
    516489        MarkerType lookahead = compile(asserted, pb);
    517         assert(markerPos(lookahead) == MarkerPosition::FinalMatchByte);
    518         PabloAST * la = markerVar(lookahead);
    519         if (a->getSense() == Assertion::Sense::Negative) {
    520             la = pb.createNot(la);
    521         }
    522         MarkerType fbyte = AdvanceMarker(marker, MarkerPosition::FinalPostPositionByte, pb);
    523         return makeMarker(MarkerPosition::FinalPostPositionByte, pb.createAnd(markerVar(fbyte), la, "lookahead"));
     490        if (LLVM_LIKELY(markerPos(lookahead) == MarkerPosition::FinalMatchByte)) {
     491            PabloAST * la = markerVar(lookahead);
     492            if (a->getSense() == Assertion::Sense::Negative) {
     493                la = pb.createNot(la);
     494            }
     495            MarkerType fbyte = AdvanceMarker(marker, MarkerPosition::FinalPostPositionByte, pb);
     496            return makeMarker(MarkerPosition::FinalPostPositionByte, pb.createAnd(markerVar(fbyte), la, "lookahead"));
     497        }
    524498    }
    525499    throw std::runtime_error("Unsupported lookahead assertion.");
     
    532506}
    533507
    534 MarkerType RE_Compiler::process(Diff * diff, MarkerType marker, PabloBuilder & pb) {
     508MarkerType RE_Compiler::compileDiff(Diff * diff, MarkerType marker, PabloBuilder & pb) {
    535509    RE * lh = diff->getLH();
    536510    RE * rh = diff->getRH();
     
    544518}
    545519
    546 MarkerType RE_Compiler::process(Intersect * x, MarkerType marker, PabloBuilder & pb) {
     520MarkerType RE_Compiler::compileIntersect(Intersect * x, MarkerType marker, PabloBuilder & pb) {
    547521    RE * lh = x->getLH();
    548522    RE * rh = x->getRH();
     
    556530}
    557531
    558 MarkerType RE_Compiler::process(Rep * rep, MarkerType marker, PabloBuilder & pb) {
     532MarkerType RE_Compiler::compileRep(Rep * rep, MarkerType marker, PabloBuilder & pb) {
    559533    int lb = rep->getLB();
    560534    int ub = rep->getUB();
     
    563537    }
    564538    if (ub == Rep::UNBOUNDED_REP) {
    565         return processUnboundedRep(rep->getRE(), marker, pb);
    566     }
    567     else if (ub == lb) { // if (rep->getUB() != Rep::UNBOUNDED_REP)
    568         return marker;
    569     }
    570     else { // if (rep->getUB() != Rep::UNBOUNDED_REP)
    571         return processBoundedRep(rep->getRE(), ub - lb, marker, pb);
    572     }
     539        marker = processUnboundedRep(rep->getRE(), marker, pb);
     540    } else if (lb < ub) {
     541        marker = processBoundedRep(rep->getRE(), ub - lb, marker, pb);
     542    }
     543    return marker;
    573544}
    574545
     
    579550*/
    580551
    581 inline PabloAST * RE_Compiler::consecutive1(PabloAST * repeated, int repeated_lgth, int repeat_count, PabloBuilder & pb) {
    582         int i = repeated_lgth;
    583         int total_lgth = repeat_count * repeated_lgth;
     552inline PabloAST * RE_Compiler::consecutive_matches(PabloAST * repeated, int length, int repeat_count, PabloBuilder & pb) {
     553        int i = length;
     554        int total = repeat_count * length;
    584555        PabloAST * consecutive_i = repeated;
    585         while (i * 2 <= total_lgth) {
     556        while (i * 2 < total) {
    586557            PabloAST * v = consecutive_i;
    587558            PabloAST * v2 =  pb.createAdvance(v, i);
    588559            i *= 2;
    589             consecutive_i = pb.createAnd(v, v2, "at" + std::to_string(i) + "inarow");
     560            consecutive_i = pb.createAnd(v, v2, "at" + std::to_string(i) + "of" + std::to_string(total));
    590561        }       
    591         if (i < total_lgth) {
     562        if (i < total) {
    592563            PabloAST * v = consecutive_i;
    593             consecutive_i = pb.createAnd(v, pb.createAdvance(v, total_lgth - i), "at" + std::to_string(total_lgth) + "inarow");
     564            consecutive_i = pb.createAnd(v, pb.createAdvance(v, total - i), "at" + std::to_string(total));
    594565        }
    595566        return consecutive_i;
     
    619590    if (isByteLength(repeated) && !DisableLog2BoundedRepetition) {
    620591        PabloAST * cc = markerVar(compile(repeated, pb));
    621         PabloAST * cc_lb = consecutive1(cc, 1, lb, pb);
     592        PabloAST * cc_lb = consecutive_matches(cc, 1, lb, pb);
    622593        PabloAST * marker_fwd = pb.createAdvance(markerVar(marker), markerPos(marker) == MarkerPosition::FinalMatchByte ? lb : lb - 1);
    623594        return makeMarker(MarkerPosition::FinalMatchByte, pb.createAnd(marker_fwd, cc_lb, "lowerbound"));
     
    653624MarkerType RE_Compiler::processUnboundedRep(RE * repeated, MarkerType marker, PabloBuilder & pb) {
    654625    // always use PostPosition markers for unbounded repetition.
    655     PabloAST * base = markerVar(AdvanceMarker(marker, MarkerPosition::InitialPostPositionByte, pb));
    656    
     626    PabloAST * base = markerVar(AdvanceMarker(marker, MarkerPosition::InitialPostPositionByte, pb));   
    657627    if (isByteLength(repeated)  && !DisableMatchStar) {
    658628        PabloAST * cc = markerVar(compile(repeated, pb));
     
    670640        PabloAST * nonFinal = mNonFinal;
    671641        if (mGraphemeBoundaryRule) {
    672             nonFinal = pb.createOr(nonFinal, pb.createNot(mGraphemeBoundaryRule->getCompiled()));
     642            nonFinal = pb.createOr(nonFinal, pb.createNot(mGraphemeBoundaryRule));
    673643        }
    674644        cc = pb.createOr(cc, nonFinal);
     
    680650        PabloAST * final = mFinal;
    681651        if (mGraphemeBoundaryRule) {
    682             final = pb.createOr(final, mGraphemeBoundaryRule->getCompiled());
     652            final = pb.createOr(final, mGraphemeBoundaryRule);
    683653        }
    684654        return makeMarker(MarkerPosition::FinalPostPositionByte, pb.createAnd(mstar, final, "unbounded"));
    685655    } else if (mStarDepth > 0){
    686 
    687         PabloBuilder * outerb = pb.getParent();
    688        
     656        PabloBuilder * outerb = pb.getParent();       
    689657        Assign * starPending = outerb->createAssign("pending", outerb->createZeroes());
    690         Assign * starAccum = outerb->createAssign("accum", outerb->createZeroes());
    691        
     658        Assign * starAccum = outerb->createAssign("accum", outerb->createZeroes());       
    692659        mStarDepth++;
    693660        PabloAST * m1 = pb.createOr(base, starPending);
     
    699666        mLoopVariants.push_back(nextPending);
    700667        mLoopVariants.push_back(nextStarAccum);
    701         mStarDepth--;
    702        
     668        mStarDepth--;       
    703669        return makeMarker(MarkerPosition::InitialPostPositionByte, pb.createAssign("unbounded", pb.createOr(base, nextStarAccum)));
    704670    } else {
     
    725691}
    726692
     693inline MarkerType RE_Compiler::compileStart(const MarkerType marker, pablo::PabloBuilder & pb) {
     694    MarkerType m = AdvanceMarker(marker, MarkerPosition::InitialPostPositionByte, pb);
     695    if (UNICODE_LINE_BREAK) {
     696        PabloAST * line_end = mPB.createOr(mUnicodeLineBreak, mCRLF);
     697        PabloAST * sol = pb.createNot(pb.createOr(pb.createAdvance(pb.createNot(line_end), 1), mCRLF));
     698        return makeMarker(MarkerPosition::InitialPostPositionByte, pb.createAnd(markerVar(m), sol, "sol"));
     699    } else {
     700        PabloAST * sol = pb.createNot(pb.createAdvance(pb.createNot(mLineFeed), 1));
     701        return makeMarker(MarkerPosition::FinalPostPositionByte, pb.createAnd(markerVar(m), sol, "sol"));
     702    }
     703}
     704
     705inline MarkerType RE_Compiler::compileEnd(const MarkerType marker, pablo::PabloBuilder & pb) {
     706    if (UNICODE_LINE_BREAK) {
     707        PabloAST * nextPos = markerVar(AdvanceMarker(marker, MarkerPosition::FinalPostPositionByte, pb));
     708        return makeMarker(MarkerPosition::FinalPostPositionByte, pb.createAnd(nextPos, mUnicodeLineBreak, "end"));
     709    } else {
     710        PabloAST * nextPos = markerVar(AdvanceMarker(marker, MarkerPosition::InitialPostPositionByte, pb));  // For LF match
     711        return makeMarker(MarkerPosition::FinalPostPositionByte, pb.createAnd(nextPos, mLineFeed, "eol"));
     712    }
     713}
     714
     715inline MarkerType RE_Compiler::compileGraphemeBoundary(GraphemeBoundary * gb, const MarkerType marker, pablo::PabloBuilder & pb) {
     716    const auto inGraphemeBoundaryRule = mGraphemeBoundaryRule;
     717    auto f = mCompiledName.find(gb->getGraphemeBoundaryRule());
     718    if (LLVM_UNLIKELY(f == mCompiledName.end())) {
     719        throw std::runtime_error("Internal error: failed to locate grapheme boundary rule!");
     720    }
     721    mGraphemeBoundaryRule = markerVar(f->second);
     722    assert (mGraphemeBoundaryRule);
     723    MarkerType result = process(gb->getExpression(), marker, pb);
     724    mGraphemeBoundaryRule = inGraphemeBoundaryRule;
     725    return result;
     726}
     727
    727728inline MarkerType RE_Compiler::AdvanceMarker(const MarkerType m, const MarkerPosition newpos, PabloBuilder & pb) {
    728729    if (m.pos == newpos) return m;
     
    730731    if (m.pos == MarkerPosition::FinalMatchByte) {
    731732        // Must advance the previous marker to the InitialPostPositionByte
    732         a = pb.createAdvance(a, 1, "initial");
     733        a = pb.createAdvance(a, 1, "ipp");
    733734    }
    734735    // Now at InitialPostPositionByte; is a further advance needed?
    735736    if (newpos == MarkerPosition::FinalPostPositionByte) {
    736737        // Must advance through nonfinal bytes
    737         a = pb.createScanThru(pb.createAnd(mInitial, a), mNonFinal, "final");
     738        PabloAST * nonFinal = mNonFinal;
     739        if (mGraphemeBoundaryRule) {
     740            nonFinal = pb.createOr(nonFinal, mGraphemeBoundaryRule, "gext");
     741        }
     742        a = pb.createScanThru(pb.createAnd(mInitial, a), nonFinal, "fpp");
    738743    }
    739744    return {newpos, a};
Note: See TracChangeset for help on using the changeset viewer.