source: icGREP/icgrep-devel/icgrep/re/re_compiler.cpp @ 4638

Last change on this file since 4638 was 4638, checked in by nmedfort, 4 years ago

Minor modifications

File size: 23.8 KB
RevLine 
[3850]1/*
2 *  Copyright (c) 2014 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "re_compiler.h"
[4197]8//Regular Expressions
[4199]9#include <re/re_name.h>
[4245]10#include <re/re_any.h>
[4199]11#include <re/re_start.h>
12#include <re/re_end.h>
13#include <re/re_alt.h>
14#include <re/re_cc.h>
15#include <re/re_seq.h>
16#include <re/re_rep.h>
[4255]17#include <re/re_diff.h>
[4298]18#include <re/re_intersect.h>
[4405]19#include <re/re_assertion.h>
[4393]20#include <re/re_analysis.h>
[4249]21#include <cc/cc_namemap.hpp>
[4207]22#include <pablo/codegenstate.h>
[4626]23#include <resolve_properties.h>
[4197]24#include <assert.h>
25#include <stdexcept>
[4182]26
[4459]27#include "llvm/Support/CommandLine.h"
[4544]28static cl::OptionCategory fREcompilationOptions("Regex Compilation Options",
[4459]29                                      "These options control the compilation of regular expressions to Pablo.");
30
[4626]31static cl::opt<bool> DisableLog2BoundedRepetition("disable-log2-bounded-repetition", cl::init(false),
[4459]32                     cl::desc("disable log2 optimizations for bounded repetition of bytes"), cl::cat(fREcompilationOptions));
33static cl::opt<int> IfInsertionGap("if-insertion-gap", cl::init(3), cl::desc("minimum number of nonempty elements between inserted if short-circuit tests"), cl::cat(fREcompilationOptions));
[4626]34static cl::opt<bool> DisableMatchStar("disable-matchstar", cl::init(false),
[4459]35                     cl::desc("disable MatchStar optimization"), cl::cat(fREcompilationOptions));
[4623]36static cl::opt<bool> DisableUnicodeMatchStar("disable-unicode-matchstar", cl::init(false),
[4459]37                     cl::desc("disable Unicode MatchStar optimization"), cl::cat(fREcompilationOptions));
[4623]38static cl::opt<bool> DisableUnicodeLineBreak("disable-unicode-linebreak", cl::init(false),
[4508]39                     cl::desc("disable Unicode line breaks - use LF only"), cl::cat(fREcompilationOptions));
[4638]40static cl::opt<bool> DisablePregeneratedUnicode("disable-pregenerated-unicode", cl::init(true),
[4623]41                     cl::desc("disable use of pregenerated Unicode character class sets"), cl::cat(fREcompilationOptions));
[4459]42
[4340]43using namespace pablo;
44
45namespace re {
46
[4638]47bool UsePregeneratedUnicode() {
[4626]48    return !DisablePregeneratedUnicode;
49}
[4405]50
[4622]51RE_Compiler::RE_Compiler(cc::CC_Compiler & ccCompiler)
52: mCCCompiler(ccCompiler)
[4246]53, mLineFeed(nullptr)
[4623]54, mCRLF(nullptr)
55, mUnicodeLineBreak(nullptr)
[4246]56, mInitial(nullptr)
57, mNonFinal(nullptr)
[4520]58, mFinal(nullptr)
[4623]59, mWhileTest(nullptr)
[4442]60, mStarDepth(0)
[4623]61, mPB(ccCompiler.getBuilder().getPabloBlock(), ccCompiler.getBuilder())
62, mUCDCompiler(ccCompiler)
[3850]63{
64
[4197]65}
[4508]66
[4411]67   
[4622]68MarkerType RE_Compiler::AdvanceMarker(MarkerType m, MarkerPosition newpos, PabloBuilder & pb) {
[4411]69    if (m.pos == newpos) return m;
[4438]70    PabloAST * a = m.stream;
[4411]71    if (m.pos == FinalMatchByte) {
72        // Must advance at least to InitialPostPositionByte
[4439]73        a = pb.createAdvance(a, 1, "adv");
[4411]74    }
75    // Now at InitialPostPositionByte; is a further advance needed?
76    if (newpos == FinalPostPositionByte) {
77        // Must advance through nonfinal bytes
[4439]78        a = pb.createScanThru(pb.createAnd(mInitial, a), mNonFinal, "scanToFinal");
[4411]79    }
80    return {newpos, a};
81}
[4182]82
[4622]83void RE_Compiler::AlignMarkers(MarkerType & m1, MarkerType & m2, PabloBuilder & pb) {
[4411]84    if (m1.pos < m2.pos) {
85        m1 = AdvanceMarker(m1, m2.pos, pb); 
86    }
87    else if (m2.pos < m1.pos) {
88        m2 = AdvanceMarker(m2, m1.pos, pb); 
89    }
90}
91
[4508]92#define UNICODE_LINE_BREAK (!DisableUnicodeLineBreak)
[4319]93
[4405]94
[4622]95void RE_Compiler::initializeRequiredStreams() {
[3850]96
[4622]97    Assign * LF = mPB.createAssign("LF", mCCCompiler.compileCC(makeCC(0x0A)));
[4410]98    mLineFeed = LF;
[4622]99    PabloAST * CR = mCCCompiler.compileCC(makeCC(0x0D));
100    PabloAST * LF_VT_FF_CR = mCCCompiler.compileCC(makeCC(0x0A, 0x0D));
[4508]101
[4622]102    PabloBuilder crb = PabloBuilder::Create(mPB);
[4439]103    PabloAST * cr1 = crb.createAdvance(CR, 1, "cr1");
[4410]104    Assign * acrlf = crb.createAssign("crlf", crb.createAnd(cr1, LF));
[4510]105    mPB.createIf(CR, {acrlf}, crb);
[4410]106    mCRLF = acrlf;
[4405]107
[4622]108    PabloAST * u8pfx = mCCCompiler.compileCC(makeCC(0xC0, 0xFF));
109    PabloBuilder it = PabloBuilder::Create(mPB);
110    PabloAST * u8pfx2 = mCCCompiler.compileCC(makeCC(0xC2, 0xDF), it);
111    PabloAST * u8pfx3 = mCCCompiler.compileCC(makeCC(0xE0, 0xEF), it);
112    PabloAST * u8pfx4 = mCCCompiler.compileCC(makeCC(0xF0, 0xF4), it);
113    Assign * u8suffix = it.createAssign("u8suffix", mCCCompiler.compileCC(makeCC(0x80, 0xBF)));
[4509]114   
115    //
116    // Two-byte sequences
[4622]117    PabloBuilder it2 = PabloBuilder::Create(it);
[4509]118    Assign * u8scope22 = it2.createAssign("u8scope22", it2.createAdvance(u8pfx2, 1));
[4622]119    Assign * NEL = it2.createAssign("NEL", it2.createAnd(it2.createAdvance(mCCCompiler.compileCC(makeCC(0xC2), it2), 1), mCCCompiler.compileCC(makeCC(0x85), it2)));
[4510]120    it.createIf(u8pfx2, {u8scope22, NEL}, it2);
[4509]121   
122    //
123    // Three-byte sequences
[4622]124    PabloBuilder it3 = PabloBuilder::Create(it);
[4509]125    Assign * u8scope32 = it3.createAssign("u8scope32", it3.createAdvance(u8pfx3, 1));
126    PabloAST * u8scope33 = it3.createAdvance(u8pfx3, 2);
127    Assign * u8scope3X = it3.createAssign("u8scope3X", it3.createOr(u8scope32, u8scope33));
[4622]128    PabloAST * E2_80 = it3.createAnd(it3.createAdvance(mCCCompiler.compileCC(makeCC(0xE2), it3), 1), mCCCompiler.compileCC(makeCC(0x80), it3));
129    Assign * LS_PS = it3.createAssign("LS_PS", it3.createAnd(it3.createAdvance(E2_80, 1), mCCCompiler.compileCC(makeCC(0xA8,0xA9), it3)));
130    PabloAST * E0_invalid = it3.createAnd(it3.createAdvance(mCCCompiler.compileCC(makeCC(0xE0), it3), 1), mCCCompiler.compileCC(makeCC(0x80, 0x9F), it3));
131    PabloAST * ED_invalid = it3.createAnd(it3.createAdvance(mCCCompiler.compileCC(makeCC(0xED), it3), 1), mCCCompiler.compileCC(makeCC(0xA0, 0xBF), it3));
[4509]132    Assign * EX_invalid = it3.createAssign("EX_invalid", it3.createOr(E0_invalid, ED_invalid));
[4510]133    it.createIf(u8pfx3, {u8scope32, u8scope3X, LS_PS, EX_invalid}, it3);
[4509]134 
135    //
136    // Four-byte sequences
[4622]137    PabloBuilder it4 = PabloBuilder::Create(it);
[4509]138    PabloAST * u8scope42 = it4.createAdvance(u8pfx4, 1, "u8scope42");
139    PabloAST * u8scope43 = it4.createAdvance(u8scope42, 1, "u8scope43");
140    PabloAST * u8scope44 = it4.createAdvance(u8scope43, 1, "u8scope44");
141    Assign * u8scope4nonfinal = it4.createAssign("u8scope4nonfinal", it4.createOr(u8scope42, u8scope43));
142    Assign * u8scope4X = it4.createAssign("u8scope4X", it4.createOr(u8scope4nonfinal, u8scope44));
[4622]143    PabloAST * F0_invalid = it4.createAnd(it4.createAdvance(mCCCompiler.compileCC(makeCC(0xF0), it4), 1), mCCCompiler.compileCC(makeCC(0x80, 0x8F), it4));
144    PabloAST * F4_invalid = it4.createAnd(it4.createAdvance(mCCCompiler.compileCC(makeCC(0xF4), it4), 1), mCCCompiler.compileCC(makeCC(0x90, 0xBF), it4));
[4509]145    Assign * FX_invalid = it4.createAssign("FX_invalid", it4.createOr(F0_invalid, F4_invalid));
[4510]146    it.createIf(u8pfx4, {u8scope4nonfinal, u8scope4X, FX_invalid}, it4);
[4509]147
148    //
149    // Invalid cases
150    PabloAST * anyscope = it.createOr(u8scope22, it.createOr(u8scope3X, u8scope4X));
151    PabloAST * legalpfx = it.createOr(it.createOr(u8pfx2, u8pfx3), u8pfx4);
152    //  Any scope that does not have a suffix byte, and any suffix byte that is not in
153    //  a scope is a mismatch, i.e., invalid UTF-8.
154    PabloAST * mismatch = it.createXor(anyscope, u8suffix);
155    //
156    PabloAST * EF_invalid = it.createOr(EX_invalid, FX_invalid);
157    PabloAST * pfx_invalid = it.createXor(u8pfx, legalpfx);
158    Assign * u8invalid = it.createAssign("u8invalid", it.createOr(pfx_invalid, it.createOr(mismatch, EF_invalid)));
[4520]159    Assign * u8valid = it.createAssign("u8valid", it.createNot(u8invalid));
[4509]160    //
161    //
162   
[4520]163    Assign * valid_pfx = it.createAssign("valid_pfx", it.createAnd(u8pfx, u8valid));
164    mNonFinal = it.createAssign("nonfinal", it.createAnd(it.createOr(it.createOr(u8pfx, u8scope32), u8scope4nonfinal), u8valid));
[4509]165   
[4352]166    Assign * NEL_LS_PS = it.createAssign("NEL_LS_PS", it.createOr(NEL, LS_PS));
[4510]167    mPB.createIf(u8pfx, {u8invalid, valid_pfx, mNonFinal, NEL_LS_PS}, it);
[4509]168   
[4438]169    PabloAST * LB_chars = mPB.createOr(LF_VT_FF_CR, NEL_LS_PS);
[4622]170    PabloAST * u8single = mPB.createAnd(mCCCompiler.compileCC(makeCC(0x00, 0x7F)), mPB.createNot(u8invalid));
[4439]171    mInitial = mPB.createOr(u8single, valid_pfx, "initial");
[4520]172    mFinal = mPB.createNot(mPB.createOr(mNonFinal, u8invalid), "final");
[4438]173    mUnicodeLineBreak = mPB.createAnd(LB_chars, mPB.createNot(mCRLF));  // count the CR, but not CRLF
[4330]174}
[4182]175
[4340]176void RE_Compiler::finalizeMatchResult(MarkerType match_result) {
[4197]177    //These three lines are specifically for grep.
[4340]178    PabloAST * lb = UNICODE_LINE_BREAK ? mUnicodeLineBreak : mLineFeed;
[4438]179    PabloAST * v = markerVar(match_result);
180    mPB.createAssign("matches", mPB.createAnd(mPB.createMatchStar(v, mPB.createNot(lb)), lb), 0);
181    mPB.createAssign("lf", mPB.createAnd(lb, mPB.createNot(mCRLF)), 1);
[4197]182}
[4405]183
[4622]184MarkerType RE_Compiler::compile(RE * re, PabloBuilder & pb) {
[4520]185    return process(re, makeMarker(FinalPostPositionByte, pb.createOnes()), pb);
[4330]186}
[4405]187
[4622]188PabloAST * RE_Compiler::nextUnicodePosition(MarkerType m, PabloBuilder & pb) {
[4411]189    if (markerPos(m) == FinalPostPositionByte) {
190        return markerVar(m);
[4340]191    }
[4411]192    else if (markerPos(m) == InitialPostPositionByte) {
193        return pb.createScanThru(pb.createAnd(mInitial, markerVar(m)), mNonFinal);
194    }
[4340]195    else {
[4411]196        return pb.createScanThru(pb.createAnd(mInitial, pb.createAdvance(markerVar(m), 1)), mNonFinal);
[4340]197    }
198}
199
[4622]200MarkerType RE_Compiler::process(RE * re, MarkerType marker, PabloBuilder & pb) {
[4197]201    if (Name * name = dyn_cast<Name>(re)) {
[4340]202        return process(name, marker, pb);
[4197]203    }
204    else if (Seq* seq = dyn_cast<Seq>(re)) {
[4340]205        return process(seq, marker, pb);
[4197]206    }
207    else if (Alt * alt = dyn_cast<Alt>(re)) {
[4340]208        return process(alt, marker, pb);
[4197]209    }
210    else if (Rep * rep = dyn_cast<Rep>(re)) {
[4340]211        return process(rep, marker, pb);
[4197]212    }
[4405]213    else if (Assertion * a = dyn_cast<Assertion>(re)) {
214        return process(a, marker, pb);
215    }
[4245]216    else if (isa<Any>(re)) {
[4340]217        PabloAST * nextPos = nextUnicodePosition(marker, pb);
218        PabloAST * dot = pb.createNot(UNICODE_LINE_BREAK ? pb.createOr(mUnicodeLineBreak, mCRLF) : mLineFeed);
[4439]219        return makeMarker(FinalMatchByte, pb.createAnd(nextPos, dot, "dot"));
[4245]220    }
[4255]221    else if (Diff * diff = dyn_cast<Diff>(re)) {
[4340]222        return process(diff, marker, pb);
[4255]223    }
[4308]224    else if (Intersect * ix = dyn_cast<Intersect>(re)) {
[4340]225        return process(ix, marker, pb);
[4308]226    }
[4197]227    else if (isa<Start>(re)) {
[4411]228        MarkerType m = AdvanceMarker(marker, InitialPostPositionByte, pb);
[4340]229        if (UNICODE_LINE_BREAK) {
[4438]230            PabloAST * line_end = mPB.createOr(mUnicodeLineBreak, mCRLF);
[4340]231            PabloAST * sol = pb.createNot(pb.createOr(pb.createAdvance(pb.createNot(line_end), 1), mCRLF));
[4439]232            return makeMarker(InitialPostPositionByte, pb.createAnd(markerVar(m), sol, "sol"));
[4340]233        }
234        else {
235            PabloAST * sol = pb.createNot(pb.createAdvance(pb.createNot(mLineFeed), 1));
[4520]236            return makeMarker(FinalPostPositionByte, pb.createAnd(markerVar(m), sol, "sol"));
[4340]237        }
[4197]238    }
239    else if (isa<End>(re)) {
[4340]240        if (UNICODE_LINE_BREAK) {
[4520]241            PabloAST * nextPos = markerVar(AdvanceMarker(marker, FinalPostPositionByte, pb));
[4439]242            return makeMarker(FinalPostPositionByte, pb.createAnd(nextPos, mUnicodeLineBreak, "end"));
[4340]243        }
[4411]244        PabloAST * nextPos = markerVar(AdvanceMarker(marker, InitialPostPositionByte, pb));  // For LF match
[4520]245        return makeMarker(FinalPostPositionByte, pb.createAnd(nextPos, mLineFeed, "eol"));
[4197]246    }
[4266]247    return marker;
[4197]248}
[3850]249
[4622]250MarkerType RE_Compiler::process(Name * name, MarkerType marker, PabloBuilder & pb) {
[4411]251    MarkerType nextPos;
252    if (markerPos(marker) == FinalPostPositionByte) nextPos = marker;
253    else if (name->getType() == Name::Type::Byte) {
254        nextPos = AdvanceMarker(marker, InitialPostPositionByte, pb);
255    }
256    else {
257        nextPos = AdvanceMarker(marker, FinalPostPositionByte, pb);
258    }
[4622]259    return makeMarker(FinalMatchByte, pb.createAnd(markerVar(nextPos), getNamedCharacterClassStream(name, pb), "m"));
[4197]260}
[4129]261
[4622]262PabloAST * RE_Compiler::getNamedCharacterClassStream(Name * name, PabloBuilder & pb) {
[4486]263    PabloAST * var = name->getCompiled();
264    if (LLVM_LIKELY(var != nullptr)) {
265        return var;
266    }
267    else if (name->getDefinition() != nullptr) {
[4622]268        MarkerType m = compile(name->getDefinition(), pb);
[4486]269        assert(markerPos(m) == FinalMatchByte);
270        var = markerVar(m);
271    }
272    else if (name->getType() == Name::Type::UnicodeProperty) {
[4638]273        if (UsePregeneratedUnicode()) {
[4626]274            var = mPB.createCall(name->getName());
[4623]275        }
276        else {
[4626]277            var = mUCDCompiler.generateWithDefaultIfHierarchy(resolveUnicodeSet(name), pb);
[4623]278        }
[4486]279    }
280    else {
281        throw std::runtime_error("Unresolved name " + name->getName());
282    }
[4622]283    var = pb.createAnd(var, pb.createNot(UNICODE_LINE_BREAK ? mUnicodeLineBreak : mLineFeed));
[4486]284    name->setCompiled(var);
285    return var;
286}
287
288
[4622]289MarkerType RE_Compiler::process(Seq * seq, MarkerType marker, PabloBuilder & pb) {
[4442]290    // if-hierarchies are not inserted within unbounded repetitions
291    if (mStarDepth > 0) {
292        for (RE * re : *seq) {
293            marker = process(re, marker, pb);
294        }
295        return marker;
[4197]296    }
[4442]297    else {
298        return processSeqTail(seq->begin(), seq->end(), 0, marker, pb);
299    }
[4197]300}
[3955]301
[4622]302MarkerType RE_Compiler::processSeqTail(Seq::iterator current, Seq::iterator end, int matchLenSoFar, MarkerType marker, PabloBuilder & pb) {
[4442]303    if (current == end) return marker;
[4459]304    if (matchLenSoFar < IfInsertionGap) {
[4442]305        RE * r = *current;
306        marker = process(r, marker, pb);
307        current++;
308        return processSeqTail(current, end, matchLenSoFar + minMatchLength(r), marker, pb);
309    }
310    else {
[4622]311        PabloBuilder nested = PabloBuilder::Create(pb);
[4442]312        MarkerType m1 = processSeqTail(current, end, 0, marker, nested);
313        Assign * m1a = nested.createAssign("m", markerVar(m1));
[4510]314        pb.createIf(markerVar(marker), {m1a}, nested);
[4442]315        return makeMarker(m1.pos, m1a);
316    }
317}
318
[4622]319MarkerType RE_Compiler::process(Alt * alt, MarkerType marker, PabloBuilder & pb) {
[4411]320    std::vector<PabloAST *>  accum = {pb.createZeroes(), pb.createZeroes(), pb.createZeroes()};
[4340]321    MarkerType const base = marker;
322    // The following may be useful to force a common Advance rather than separate
323    // Advances in each alternative.
[4411]324    // MarkerType const base = makeMarker(InitialPostPositionByte, postPositionVar(marker, pb), pb);
[4340]325    for (RE * re : *alt) {
326        MarkerType rslt = process(re, base, pb);
[4411]327        MarkerPosition p = markerPos(rslt);
[4439]328        accum[p] = pb.createOr(accum[p], markerVar(rslt), "alt");
[4197]329    }
[4411]330    if (isa<Zeroes>(accum[InitialPostPositionByte]) && isa<Zeroes>(accum[FinalPostPositionByte])) {
[4439]331        return makeMarker(FinalMatchByte, accum[FinalMatchByte]);
[4340]332    }
[4439]333    PabloAST * combine = pb.createOr(accum[InitialPostPositionByte], pb.createAdvance(accum[FinalMatchByte], 1), "alt");
[4411]334    if (isa<Zeroes>(accum[FinalPostPositionByte])) {
[4439]335        return makeMarker(InitialPostPositionByte, combine);
[4340]336    }
[4439]337    combine = pb.createOr(pb.createScanThru(pb.createAnd(mInitial, combine), mNonFinal), accum[FinalPostPositionByte], "alt");
338    return makeMarker(FinalPostPositionByte, combine);
[4197]339}
[3955]340
[4622]341MarkerType RE_Compiler::process(Assertion * a, MarkerType marker, PabloBuilder & pb) {
[4411]342    RE * asserted = a->getAsserted();
[4408]343    if (a->getKind() == Assertion::Kind::Lookbehind) {
[4411]344        MarkerType m = marker;
345        MarkerType lookback = compile(asserted, pb);
346        AlignMarkers(m, lookback, pb);
[4438]347        PabloAST * lb = markerVar(lookback);
[4411]348        if (a->getSense() == Assertion::Sense::Negative) {
[4439]349            lb = pb.createNot(lb);
[4408]350        }
[4439]351        return makeMarker(markerPos(m), pb.createAnd(markerVar(marker), lb, "lookback"));
[4411]352    }
353    else if (isUnicodeUnitLength(asserted)) {
354        MarkerType lookahead = compile(asserted, pb);
355        assert(markerPos(lookahead) == FinalMatchByte);
[4438]356        PabloAST * la = markerVar(lookahead);
[4411]357        if (a->getSense() == Assertion::Sense::Negative) {
[4439]358            la = pb.createNot(la);
[4408]359        }
[4411]360        MarkerType fbyte = AdvanceMarker(marker, FinalPostPositionByte, pb);
[4439]361        return makeMarker(FinalPostPositionByte, pb.createAnd(markerVar(fbyte), la, "lookahead"));
[4408]362    }
[4411]363    else {
364        throw std::runtime_error("Unsupported lookahead assertion.");
365    }
[4405]366}
367
[4622]368MarkerType RE_Compiler::process(Diff * diff, MarkerType marker, PabloBuilder & pb) {
[4255]369    RE * lh = diff->getLH();
370    RE * rh = diff->getRH();
[4393]371    if (isUnicodeUnitLength(lh) && isUnicodeUnitLength(rh)) {
[4340]372        MarkerType t1 = process(lh, marker, pb);
373        MarkerType t2 = process(rh, marker, pb);
[4411]374        AlignMarkers(t1, t2, pb);
[4439]375        return makeMarker(markerPos(t1), pb.createAnd(markerVar(t1), pb.createNot(markerVar(t2)), "diff"));
[4255]376    }
377    throw std::runtime_error("Unsupported Diff operands: " + Printer_RE::PrintRE(diff));
378}
379
[4622]380MarkerType RE_Compiler::process(Intersect * x, MarkerType marker, PabloBuilder & pb) {
[4298]381    RE * lh = x->getLH();
382    RE * rh = x->getRH();
[4393]383    if (isUnicodeUnitLength(lh) && isUnicodeUnitLength(rh)) {
[4340]384        MarkerType t1 = process(lh, marker, pb);
385        MarkerType t2 = process(rh, marker, pb);
[4411]386        AlignMarkers(t1, t2, pb);
[4439]387        return makeMarker(markerPos(t1), pb.createAnd(markerVar(t1), markerVar(t2), "intersect"));
[4298]388    }
389    throw std::runtime_error("Unsupported Intersect operands: " + Printer_RE::PrintRE(x));
390}
391
[4622]392MarkerType RE_Compiler::process(Rep * rep, MarkerType marker, PabloBuilder & pb) {
[4261]393    int lb = rep->getLB();
394    int ub = rep->getUB();
395    if (lb > 0) {
[4266]396        marker = processLowerBound(rep->getRE(), lb, marker, pb);
[4208]397    }
[4261]398    if (ub == Rep::UNBOUNDED_REP) {
[4340]399        return processUnboundedRep(rep->getRE(), marker, pb);
[4261]400    }
[4420]401    else if (ub == lb) { // if (rep->getUB() != Rep::UNBOUNDED_REP)
402        return marker;
403    }
[4208]404    else { // if (rep->getUB() != Rep::UNBOUNDED_REP)
[4340]405        return processBoundedRep(rep->getRE(), ub - lb, marker, pb);
[4405]406    }
[4208]407}
408
[4261]409/*
[4420]410   Given a stream |repeated| marking positions associated with matches to an item
411   of length |repeated_lgth|, compute a stream marking |repeat_count| consecutive
412   occurrences of such items.
[4261]413*/
[4405]414
[4622]415inline PabloAST * RE_Compiler::consecutive1(PabloAST * repeated, int repeated_lgth, int repeat_count, PabloBuilder & pb) {
[4340]416        int i = repeated_lgth;
417        int total_lgth = repeat_count * repeated_lgth;
[4438]418        PabloAST * consecutive_i = repeated;
[4420]419        while (i * 2 <= total_lgth) {
[4410]420            PabloAST * v = consecutive_i;
[4420]421            PabloAST * v2 =  pb.createAdvance(v, i);
[4410]422            i *= 2;
[4439]423            consecutive_i = pb.createAnd(v, v2, "at" + std::to_string(i) + "inarow");
[4410]424        }       
[4340]425        if (i < total_lgth) {
[4410]426            PabloAST * v = consecutive_i;
[4439]427            consecutive_i = pb.createAnd(v, pb.createAdvance(v, total_lgth - i), "at" + std::to_string(total_lgth) + "inarow");
[4340]428        }
429        return consecutive_i;
[4261]430}
[4405]431
[4622]432inline PabloAST * RE_Compiler::reachable(PabloAST *repeated, int repeated_lgth, int repeat_count, PabloBuilder & pb) {
[4420]433        int i = repeated_lgth;
434        int total_lgth = repeat_count * repeated_lgth;
435        if (repeat_count == 0) {
436            return repeated;
437        }
[4439]438        PabloAST * reachable_i = pb.createOr(repeated, pb.createAdvance(repeated, 1), "within1");
[4420]439        while (i * 2 < total_lgth) {
440            PabloAST * v = reachable_i;
441            PabloAST * v2 =  pb.createAdvance(v, i);
442            i *= 2;
[4439]443            reachable_i = pb.createOr(v, v2, "within" + std::to_string(i));
[4420]444        }       
445        if (i < total_lgth) {
446            PabloAST * v = reachable_i;
[4439]447            reachable_i = pb.createOr(v, pb.createAdvance(v, total_lgth - i), "within" + std::to_string(total_lgth));
[4420]448        }
449        return reachable_i;
450}
451
[4622]452MarkerType RE_Compiler::processLowerBound(RE * repeated, int lb, MarkerType marker, PabloBuilder & pb) {
[4459]453    if (isByteLength(repeated) && !DisableLog2BoundedRepetition) {
[4438]454        PabloAST * cc = markerVar(compile(repeated, pb));
455        PabloAST * cc_lb = consecutive1(cc, 1, lb, pb);
[4420]456        PabloAST * marker_fwd = pb.createAdvance(markerVar(marker), markerPos(marker) == FinalMatchByte ? lb : lb-1);
[4439]457        return makeMarker(FinalMatchByte, pb.createAnd(marker_fwd, cc_lb, "lowerbound"));
[4340]458    }
459    // Fall through to general case.
460    while (lb-- != 0) {
461        marker = process(repeated, marker, pb);
462    }
463    return marker;
[4267]464}
[4213]465
[4622]466MarkerType RE_Compiler::processBoundedRep(RE * repeated, int ub, MarkerType marker, PabloBuilder & pb) {
[4459]467    if (isByteLength(repeated) && ub > 1 && !DisableLog2BoundedRepetition) {
[4340]468        // log2 upper bound for fixed length (=1) class
[4420]469        // Create a mask of positions reachable within ub from current marker.
[4340]470        // Use matchstar, then apply filter.
[4438]471        PabloAST * match = markerVar(AdvanceMarker(marker, InitialPostPositionByte, pb));
472        PabloAST * upperLimitMask = reachable(match, 1, ub, pb);
[4411]473        PabloAST * cursor = markerVar(AdvanceMarker(marker, InitialPostPositionByte, pb));
474        PabloAST * rep_class_var = markerVar(compile(repeated, pb));
[4439]475        return makeMarker(InitialPostPositionByte, pb.createAnd(pb.createMatchStar(cursor, rep_class_var), upperLimitMask, "bounded"));
[4340]476    }
477    // Fall through to general case.
478    while (ub-- != 0) {
479        MarkerType a = process(repeated, marker, pb);
[4411]480        MarkerType m = marker;
481        AlignMarkers(a, m, pb);
[4439]482        marker = makeMarker(markerPos(a), pb.createOr(markerVar(a), markerVar(m), "m"));
[4340]483    }
484    return marker;
[4261]485}
486
[4622]487MarkerType RE_Compiler::processUnboundedRep(RE * repeated, MarkerType marker, PabloBuilder & pb) {
[4340]488    // always use PostPosition markers for unbounded repetition.
[4411]489    PabloAST * base = markerVar(AdvanceMarker(marker, InitialPostPositionByte, pb));
[4340]490   
[4459]491    if (isByteLength(repeated)  && !DisableMatchStar) {
[4411]492        PabloAST * cc = markerVar(compile(repeated, pb)); 
[4439]493        return makeMarker(InitialPostPositionByte, pb.createMatchStar(base, cc, "unbounded"));
[4197]494    }
[4459]495    else if (isUnicodeUnitLength(repeated) && !DisableMatchStar && !DisableUnicodeMatchStar) {
[4411]496        PabloAST * cc = markerVar(compile(repeated, pb));
[4520]497        return makeMarker(FinalPostPositionByte, pb.createAnd(pb.createMatchStar(base, pb.createOr(mNonFinal, cc)), mFinal, "unbounded"));
[4283]498    }
[4609]499    else if (mStarDepth > 0){
500       
[4622]501        PabloBuilder * outerb = pb.getParent();
[4609]502       
503        Assign * starPending = outerb->createAssign("pending", outerb->createZeroes());
504        Assign * starAccum = outerb->createAssign("accum", outerb->createZeroes());
505       
506        mStarDepth++;
507        PabloAST * m1 = pb.createOr(base, starPending);
508        PabloAST * m2 = pb.createOr(base, starAccum);
509        PabloAST * loopComputation = markerVar(AdvanceMarker(process(repeated, makeMarker(InitialPostPositionByte, m1), pb), InitialPostPositionByte, pb));
510        Next * nextPending = pb.createNext(starPending, pb.createAnd(loopComputation, pb.createNot(m2)));
511        Next * nextStarAccum = pb.createNext(starAccum, pb.createOr(loopComputation, m2));
512        mWhileTest = pb.createOr(mWhileTest, nextPending);
513        mStarDepth--;
514       
515        return makeMarker(InitialPostPositionByte, pb.createAssign("unbounded", pb.createOr(base, nextStarAccum)));
516    }   
[4208]517    else {
[4340]518        Assign * whileTest = pb.createAssign("test", base);
[4609]519        Assign * whilePending = pb.createAssign("pending", base);
[4340]520        Assign * whileAccum = pb.createAssign("accum", base);
[4609]521        mWhileTest = pb.createZeroes();
[4235]522
[4622]523        PabloBuilder wb = PabloBuilder::Create(pb);
[4442]524        mStarDepth++;
[4252]525
[4609]526        PabloAST * loopComputation = markerVar(AdvanceMarker(process(repeated, makeMarker(InitialPostPositionByte, whilePending), wb), InitialPostPositionByte, wb));
527        Next * nextWhilePending = wb.createNext(whilePending, wb.createAnd(loopComputation, wb.createNot(whileAccum)));
[4416]528        Next * nextWhileAccum = wb.createNext(whileAccum, wb.createOr(loopComputation, whileAccum));
[4609]529        Next * nextWhileTest = wb.createNext(whileTest, wb.createOr(mWhileTest, nextWhilePending));
[4410]530        pb.createWhile(nextWhileTest, wb);
[4442]531        mStarDepth--;
[4252]532
[4416]533        return makeMarker(InitialPostPositionByte, pb.createAssign("unbounded", nextWhileAccum));
[4211]534    }   
[4340]535} // end of namespace re
[4197]536}
Note: See TracBrowser for help on using the repository browser.