source: icGREP/icgrep-devel/icgrep/re/re_compiler.cpp @ 4684

Last change on this file since 4684 was 4684, checked in by nmedfort, 4 years ago

First attempt to intergrate 'generate_predefined_ucd_functions' into build process.

File size: 24.4 KB
Line 
1/*
2 *  Copyright (c) 2014 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "re_compiler.h"
8//Regular Expressions
9#include <re/re_name.h>
10#include <re/re_any.h>
11#include <re/re_start.h>
12#include <re/re_end.h>
13#include <re/re_alt.h>
14#include <re/re_cc.h>
15#include <re/re_seq.h>
16#include <re/re_rep.h>
17#include <re/re_diff.h>
18#include <re/re_intersect.h>
19#include <re/re_assertion.h>
20#include <re/re_analysis.h>
21#include <cc/cc_namemap.hpp>
22#include <pablo/codegenstate.h>
23#include <UCD/ucd_compiler.hpp>
24#include <UCD/precompiled_properties.h>
25#include <UCD/resolve_properties.h>
26#include <assert.h>
27#include <stdexcept>
28
29#include "llvm/Support/CommandLine.h"
30static cl::OptionCategory fREcompilationOptions("Regex Compilation Options",
31                                      "These options control the compilation of regular expressions to Pablo.");
32
33static cl::opt<bool> DisableLog2BoundedRepetition("disable-log2-bounded-repetition", cl::init(false),
34                     cl::desc("disable log2 optimizations for bounded repetition of bytes"), cl::cat(fREcompilationOptions));
35static cl::opt<int> IfInsertionGap("if-insertion-gap", cl::init(3), cl::desc("minimum number of nonempty elements between inserted if short-circuit tests"), cl::cat(fREcompilationOptions));
36static cl::opt<bool> DisableMatchStar("disable-matchstar", cl::init(false),
37                     cl::desc("disable MatchStar optimization"), cl::cat(fREcompilationOptions));
38static cl::opt<bool> DisableUnicodeMatchStar("disable-unicode-matchstar", cl::init(false),
39                     cl::desc("disable Unicode MatchStar optimization"), cl::cat(fREcompilationOptions));
40static cl::opt<bool> DisableUnicodeLineBreak("disable-unicode-linebreak", cl::init(false),
41                     cl::desc("disable Unicode line breaks - use LF only"), cl::cat(fREcompilationOptions));
42static cl::opt<bool> DisablePregeneratedUnicode("disable-pregenerated-unicode", cl::init(false),
43                     cl::desc("disable use of pregenerated Unicode character class sets"), cl::cat(fREcompilationOptions));
44using namespace pablo;
45
46namespace re {
47
48RE_Compiler::RE_Compiler(pablo::PabloFunction & function, cc::CC_Compiler & ccCompiler)
49: mCCCompiler(ccCompiler)
50, mLineFeed(nullptr)
51, mCRLF(nullptr)
52, mUnicodeLineBreak(nullptr)
53, mInitial(nullptr)
54, mNonFinal(nullptr)
55, mFinal(nullptr)
56, mWhileTest(nullptr)
57, mStarDepth(0)
58, mLoopVariants()
59, mPB(*ccCompiler.getBuilder().getPabloBlock(), ccCompiler.getBuilder())
60, mFunction(function)
61{
62
63}
64
65   
66MarkerType RE_Compiler::AdvanceMarker(MarkerType m, MarkerPosition newpos, PabloBuilder & pb) {
67    if (m.pos == newpos) return m;
68    PabloAST * a = m.stream;
69    if (m.pos == FinalMatchByte) {
70        // Must advance at least to InitialPostPositionByte
71        a = pb.createAdvance(a, 1, "adv");
72    }
73    // Now at InitialPostPositionByte; is a further advance needed?
74    if (newpos == FinalPostPositionByte) {
75        // Must advance through nonfinal bytes
76        a = pb.createScanThru(pb.createAnd(mInitial, a), mNonFinal, "scanToFinal");
77    }
78    return {newpos, a};
79}
80
81void RE_Compiler::AlignMarkers(MarkerType & m1, MarkerType & m2, PabloBuilder & pb) {
82    if (m1.pos < m2.pos) {
83        m1 = AdvanceMarker(m1, m2.pos, pb); 
84    }
85    else if (m2.pos < m1.pos) {
86        m2 = AdvanceMarker(m2, m1.pos, pb); 
87    }
88}
89
90#define UNICODE_LINE_BREAK (!DisableUnicodeLineBreak)
91
92
93void RE_Compiler::initializeRequiredStreams() {
94
95    Assign * LF = mPB.createAssign("LF", mCCCompiler.compileCC(makeCC(0x0A)));
96    mLineFeed = LF;
97    PabloAST * CR = mCCCompiler.compileCC(makeCC(0x0D));
98    PabloAST * LF_VT_FF_CR = mCCCompiler.compileCC(makeCC(0x0A, 0x0D));
99
100    PabloBuilder crb = PabloBuilder::Create(mPB);
101    PabloAST * cr1 = crb.createAdvance(CR, 1, "cr1");
102    Assign * acrlf = crb.createAssign("crlf", crb.createAnd(cr1, LF));
103    mPB.createIf(CR, {acrlf}, crb);
104    mCRLF = acrlf;
105
106    PabloAST * u8pfx = mCCCompiler.compileCC(makeCC(0xC0, 0xFF));
107    PabloBuilder it = PabloBuilder::Create(mPB);
108    PabloAST * u8pfx2 = mCCCompiler.compileCC(makeCC(0xC2, 0xDF), it);
109    PabloAST * u8pfx3 = mCCCompiler.compileCC(makeCC(0xE0, 0xEF), it);
110    PabloAST * u8pfx4 = mCCCompiler.compileCC(makeCC(0xF0, 0xF4), it);
111    Assign * u8suffix = it.createAssign("u8suffix", mCCCompiler.compileCC(makeCC(0x80, 0xBF)));
112   
113    //
114    // Two-byte sequences
115    PabloBuilder it2 = PabloBuilder::Create(it);
116    Assign * u8scope22 = it2.createAssign("u8scope22", it2.createAdvance(u8pfx2, 1));
117    Assign * NEL = it2.createAssign("NEL", it2.createAnd(it2.createAdvance(mCCCompiler.compileCC(makeCC(0xC2), it2), 1), mCCCompiler.compileCC(makeCC(0x85), it2)));
118    it.createIf(u8pfx2, {u8scope22, NEL}, it2);
119   
120    //
121    // Three-byte sequences
122    PabloBuilder it3 = PabloBuilder::Create(it);
123    Assign * u8scope32 = it3.createAssign("u8scope32", it3.createAdvance(u8pfx3, 1));
124    PabloAST * u8scope33 = it3.createAdvance(u8pfx3, 2);
125    Assign * u8scope3X = it3.createAssign("u8scope3X", it3.createOr(u8scope32, u8scope33));
126    PabloAST * E2_80 = it3.createAnd(it3.createAdvance(mCCCompiler.compileCC(makeCC(0xE2), it3), 1), mCCCompiler.compileCC(makeCC(0x80), it3));
127    Assign * LS_PS = it3.createAssign("LS_PS", it3.createAnd(it3.createAdvance(E2_80, 1), mCCCompiler.compileCC(makeCC(0xA8,0xA9), it3)));
128    PabloAST * E0_invalid = it3.createAnd(it3.createAdvance(mCCCompiler.compileCC(makeCC(0xE0), it3), 1), mCCCompiler.compileCC(makeCC(0x80, 0x9F), it3));
129    PabloAST * ED_invalid = it3.createAnd(it3.createAdvance(mCCCompiler.compileCC(makeCC(0xED), it3), 1), mCCCompiler.compileCC(makeCC(0xA0, 0xBF), it3));
130    Assign * EX_invalid = it3.createAssign("EX_invalid", it3.createOr(E0_invalid, ED_invalid));
131    it.createIf(u8pfx3, {u8scope32, u8scope3X, LS_PS, EX_invalid}, it3);
132 
133    //
134    // Four-byte sequences
135    PabloBuilder it4 = PabloBuilder::Create(it);
136    PabloAST * u8scope42 = it4.createAdvance(u8pfx4, 1, "u8scope42");
137    PabloAST * u8scope43 = it4.createAdvance(u8scope42, 1, "u8scope43");
138    PabloAST * u8scope44 = it4.createAdvance(u8scope43, 1, "u8scope44");
139    Assign * u8scope4nonfinal = it4.createAssign("u8scope4nonfinal", it4.createOr(u8scope42, u8scope43));
140    Assign * u8scope4X = it4.createAssign("u8scope4X", it4.createOr(u8scope4nonfinal, u8scope44));
141    PabloAST * F0_invalid = it4.createAnd(it4.createAdvance(mCCCompiler.compileCC(makeCC(0xF0), it4), 1), mCCCompiler.compileCC(makeCC(0x80, 0x8F), it4));
142    PabloAST * F4_invalid = it4.createAnd(it4.createAdvance(mCCCompiler.compileCC(makeCC(0xF4), it4), 1), mCCCompiler.compileCC(makeCC(0x90, 0xBF), it4));
143    Assign * FX_invalid = it4.createAssign("FX_invalid", it4.createOr(F0_invalid, F4_invalid));
144    it.createIf(u8pfx4, {u8scope4nonfinal, u8scope4X, FX_invalid}, it4);
145
146    //
147    // Invalid cases
148    PabloAST * anyscope = it.createOr(u8scope22, it.createOr(u8scope3X, u8scope4X));
149    PabloAST * legalpfx = it.createOr(it.createOr(u8pfx2, u8pfx3), u8pfx4);
150    //  Any scope that does not have a suffix byte, and any suffix byte that is not in
151    //  a scope is a mismatch, i.e., invalid UTF-8.
152    PabloAST * mismatch = it.createXor(anyscope, u8suffix);
153    //
154    PabloAST * EF_invalid = it.createOr(EX_invalid, FX_invalid);
155    PabloAST * pfx_invalid = it.createXor(u8pfx, legalpfx);
156    Assign * u8invalid = it.createAssign("u8invalid", it.createOr(pfx_invalid, it.createOr(mismatch, EF_invalid)));
157    Assign * u8valid = it.createAssign("u8valid", it.createNot(u8invalid));
158    //
159    //
160   
161    Assign * valid_pfx = it.createAssign("valid_pfx", it.createAnd(u8pfx, u8valid));
162    mNonFinal = it.createAssign("nonfinal", it.createAnd(it.createOr(it.createOr(u8pfx, u8scope32), u8scope4nonfinal), u8valid));
163   
164    Assign * NEL_LS_PS = it.createAssign("NEL_LS_PS", it.createOr(NEL, LS_PS));
165    mPB.createIf(u8pfx, {u8invalid, valid_pfx, mNonFinal, NEL_LS_PS}, it);
166   
167    PabloAST * LB_chars = mPB.createOr(LF_VT_FF_CR, NEL_LS_PS);
168    PabloAST * u8single = mPB.createAnd(mCCCompiler.compileCC(makeCC(0x00, 0x7F)), mPB.createNot(u8invalid));
169    mInitial = mPB.createOr(u8single, valid_pfx, "initial");
170    mFinal = mPB.createNot(mPB.createOr(mNonFinal, u8invalid), "final");
171    mUnicodeLineBreak = mPB.createAnd(LB_chars, mPB.createNot(mCRLF));  // count the CR, but not CRLF
172}
173
174void RE_Compiler::finalizeMatchResult(MarkerType match_result) {
175    //These three lines are specifically for grep.
176    PabloAST * lb = UNICODE_LINE_BREAK ? mUnicodeLineBreak : mLineFeed;
177    PabloAST * v = markerVar(match_result);
178    mFunction.setResult(0, mPB.createAssign("matches", mPB.createAnd(mPB.createMatchStar(v, mPB.createNot(lb)), lb)));
179    mFunction.setResult(1, mPB.createAssign("lf", mPB.createAnd(lb, mPB.createNot(mCRLF))));
180}
181
182MarkerType RE_Compiler::compile(RE * re, PabloBuilder & pb) {
183    return process(re, makeMarker(FinalPostPositionByte, pb.createOnes()), pb);
184}
185
186PabloAST * RE_Compiler::nextUnicodePosition(MarkerType m, PabloBuilder & pb) {
187    if (markerPos(m) == FinalPostPositionByte) {
188        return markerVar(m);
189    }
190    else if (markerPos(m) == InitialPostPositionByte) {
191        return pb.createScanThru(pb.createAnd(mInitial, markerVar(m)), mNonFinal);
192    }
193    else {
194        return pb.createScanThru(pb.createAnd(mInitial, pb.createAdvance(markerVar(m), 1)), mNonFinal);
195    }
196}
197
198MarkerType RE_Compiler::process(RE * re, MarkerType marker, PabloBuilder & pb) {
199    if (Name * name = dyn_cast<Name>(re)) {
200        return process(name, marker, pb);
201    }
202    else if (Seq* seq = dyn_cast<Seq>(re)) {
203        return process(seq, marker, pb);
204    }
205    else if (Alt * alt = dyn_cast<Alt>(re)) {
206        return process(alt, marker, pb);
207    }
208    else if (Rep * rep = dyn_cast<Rep>(re)) {
209        return process(rep, marker, pb);
210    }
211    else if (Assertion * a = dyn_cast<Assertion>(re)) {
212        return process(a, marker, pb);
213    }
214    else if (isa<Any>(re)) {
215        PabloAST * nextPos = nextUnicodePosition(marker, pb);
216        PabloAST * dot = pb.createNot(UNICODE_LINE_BREAK ? pb.createOr(mUnicodeLineBreak, mCRLF) : mLineFeed);
217        return makeMarker(FinalMatchByte, pb.createAnd(nextPos, dot, "dot"));
218    }
219    else if (Diff * diff = dyn_cast<Diff>(re)) {
220        return process(diff, marker, pb);
221    }
222    else if (Intersect * ix = dyn_cast<Intersect>(re)) {
223        return process(ix, marker, pb);
224    }
225    else if (isa<Start>(re)) {
226        MarkerType m = AdvanceMarker(marker, InitialPostPositionByte, pb);
227        if (UNICODE_LINE_BREAK) {
228            PabloAST * line_end = mPB.createOr(mUnicodeLineBreak, mCRLF);
229            PabloAST * sol = pb.createNot(pb.createOr(pb.createAdvance(pb.createNot(line_end), 1), mCRLF));
230            return makeMarker(InitialPostPositionByte, pb.createAnd(markerVar(m), sol, "sol"));
231        }
232        else {
233            PabloAST * sol = pb.createNot(pb.createAdvance(pb.createNot(mLineFeed), 1));
234            return makeMarker(FinalPostPositionByte, pb.createAnd(markerVar(m), sol, "sol"));
235        }
236    }
237    else if (isa<End>(re)) {
238        if (UNICODE_LINE_BREAK) {
239            PabloAST * nextPos = markerVar(AdvanceMarker(marker, FinalPostPositionByte, pb));
240            return makeMarker(FinalPostPositionByte, pb.createAnd(nextPos, mUnicodeLineBreak, "end"));
241        }
242        PabloAST * nextPos = markerVar(AdvanceMarker(marker, InitialPostPositionByte, pb));  // For LF match
243        return makeMarker(FinalPostPositionByte, pb.createAnd(nextPos, mLineFeed, "eol"));
244    }
245    return marker;
246}
247
248MarkerType RE_Compiler::process(Name * name, MarkerType marker, PabloBuilder & pb) {
249    MarkerType nextPos;
250    if (markerPos(marker) == FinalPostPositionByte) {
251        nextPos = marker;
252    }
253    else if (name->getType() == Name::Type::Byte) {
254        nextPos = AdvanceMarker(marker, InitialPostPositionByte, pb);
255    }
256    else {
257        nextPos = AdvanceMarker(marker, FinalPostPositionByte, pb);
258    }
259    return makeMarker(FinalMatchByte, pb.createAnd(markerVar(nextPos), getNamedCharacterClassStream(name, pb), "m"));
260}
261
262PabloAST * RE_Compiler::getNamedCharacterClassStream(Name * name, PabloBuilder & pb) {
263    PabloAST * var = name->getCompiled();
264    if (LLVM_LIKELY(var != nullptr)) {
265        return var;
266    }
267    else if (name->getDefinition() != nullptr) {
268        MarkerType m = compile(name->getDefinition(), pb);
269        assert(markerPos(m) == FinalMatchByte);
270        var = markerVar(m);
271    }
272    else if (name->getType() == Name::Type::UnicodeProperty) {
273        if (DisablePregeneratedUnicode) {
274            UCD::UCDCompiler ucdCompiler(mCCCompiler);
275            var = ucdCompiler.generateWithDefaultIfHierarchy(UCD::resolveUnicodeSet(name), pb);
276        }
277        else {
278            const UCD::ExternalProperty & ep = UCD::resolveExternalProperty(name->getFunctionName());
279            var = pb.createCall(Prototype::Create(name->getFunctionName(), std::get<1>(ep), std::get<2>(ep), std::get<3>(ep), std::get<0>(ep)), mCCCompiler.getBasisBits());
280        }
281    }
282    else {
283        throw std::runtime_error("Unresolved name " + name->getName());
284    }
285    var = pb.createAnd(var, pb.createNot(UNICODE_LINE_BREAK ? mUnicodeLineBreak : mLineFeed));
286    name->setCompiled(var);
287    return var;
288}
289
290MarkerType RE_Compiler::process(Seq * seq, MarkerType marker, PabloBuilder & pb) {
291    // if-hierarchies are not inserted within unbounded repetitions
292    if (mStarDepth > 0) {
293        for (RE * re : *seq) {
294            marker = process(re, marker, pb);
295        }
296        return marker;
297    }
298    else {
299        return processSeqTail(seq->begin(), seq->end(), 0, marker, pb);
300    }
301}
302
303MarkerType RE_Compiler::processSeqTail(Seq::iterator current, Seq::iterator end, int matchLenSoFar, MarkerType marker, PabloBuilder & pb) {
304    if (current == end) return marker;
305    if (matchLenSoFar < IfInsertionGap) {
306        RE * r = *current;
307        marker = process(r, marker, pb);
308        current++;
309        return processSeqTail(current, end, matchLenSoFar + minMatchLength(r), marker, pb);
310    }
311    else {
312        PabloBuilder nested = PabloBuilder::Create(pb);
313        MarkerType m1 = processSeqTail(current, end, 0, marker, nested);
314        Assign * m1a = nested.createAssign("m", markerVar(m1));
315        pb.createIf(markerVar(marker), {m1a}, nested);
316        return makeMarker(m1.pos, m1a);
317    }
318}
319
320MarkerType RE_Compiler::process(Alt * alt, MarkerType marker, PabloBuilder & pb) {
321    std::vector<PabloAST *>  accum = {pb.createZeroes(), pb.createZeroes(), pb.createZeroes()};
322    MarkerType const base = marker;
323    // The following may be useful to force a common Advance rather than separate
324    // Advances in each alternative.
325    // MarkerType const base = makeMarker(InitialPostPositionByte, postPositionVar(marker, pb), pb);
326    for (RE * re : *alt) {
327        MarkerType rslt = process(re, base, pb);
328        MarkerPosition p = markerPos(rslt);
329        accum[p] = pb.createOr(accum[p], markerVar(rslt), "alt");
330    }
331    if (isa<Zeroes>(accum[InitialPostPositionByte]) && isa<Zeroes>(accum[FinalPostPositionByte])) {
332        return makeMarker(FinalMatchByte, accum[FinalMatchByte]);
333    }
334    PabloAST * combine = pb.createOr(accum[InitialPostPositionByte], pb.createAdvance(accum[FinalMatchByte], 1), "alt");
335    if (isa<Zeroes>(accum[FinalPostPositionByte])) {
336        return makeMarker(InitialPostPositionByte, combine);
337    }
338    combine = pb.createOr(pb.createScanThru(pb.createAnd(mInitial, combine), mNonFinal), accum[FinalPostPositionByte], "alt");
339    return makeMarker(FinalPostPositionByte, combine);
340}
341
342MarkerType RE_Compiler::process(Assertion * a, MarkerType marker, PabloBuilder & pb) {
343    RE * asserted = a->getAsserted();
344    if (a->getKind() == Assertion::Kind::Lookbehind) {
345        MarkerType m = marker;
346        MarkerType lookback = compile(asserted, pb);
347        AlignMarkers(m, lookback, pb);
348        PabloAST * lb = markerVar(lookback);
349        if (a->getSense() == Assertion::Sense::Negative) {
350            lb = pb.createNot(lb);
351        }
352        return makeMarker(markerPos(m), pb.createAnd(markerVar(marker), lb, "lookback"));
353    }
354    else if (isUnicodeUnitLength(asserted)) {
355        MarkerType lookahead = compile(asserted, pb);
356        assert(markerPos(lookahead) == FinalMatchByte);
357        PabloAST * la = markerVar(lookahead);
358        if (a->getSense() == Assertion::Sense::Negative) {
359            la = pb.createNot(la);
360        }
361        MarkerType fbyte = AdvanceMarker(marker, FinalPostPositionByte, pb);
362        return makeMarker(FinalPostPositionByte, pb.createAnd(markerVar(fbyte), la, "lookahead"));
363    }
364    else {
365        throw std::runtime_error("Unsupported lookahead assertion.");
366    }
367}
368
369MarkerType RE_Compiler::process(Diff * diff, MarkerType marker, PabloBuilder & pb) {
370    RE * lh = diff->getLH();
371    RE * rh = diff->getRH();
372    if (isUnicodeUnitLength(lh) && isUnicodeUnitLength(rh)) {
373        MarkerType t1 = process(lh, marker, pb);
374        MarkerType t2 = process(rh, marker, pb);
375        AlignMarkers(t1, t2, pb);
376        return makeMarker(markerPos(t1), pb.createAnd(markerVar(t1), pb.createNot(markerVar(t2)), "diff"));
377    }
378    throw std::runtime_error("Unsupported Diff operands: " + Printer_RE::PrintRE(diff));
379}
380
381MarkerType RE_Compiler::process(Intersect * x, MarkerType marker, PabloBuilder & pb) {
382    RE * lh = x->getLH();
383    RE * rh = x->getRH();
384    if (isUnicodeUnitLength(lh) && isUnicodeUnitLength(rh)) {
385        MarkerType t1 = process(lh, marker, pb);
386        MarkerType t2 = process(rh, marker, pb);
387        AlignMarkers(t1, t2, pb);
388        return makeMarker(markerPos(t1), pb.createAnd(markerVar(t1), markerVar(t2), "intersect"));
389    }
390    throw std::runtime_error("Unsupported Intersect operands: " + Printer_RE::PrintRE(x));
391}
392
393MarkerType RE_Compiler::process(Rep * rep, MarkerType marker, PabloBuilder & pb) {
394    int lb = rep->getLB();
395    int ub = rep->getUB();
396    if (lb > 0) {
397        marker = processLowerBound(rep->getRE(), lb, marker, pb);
398    }
399    if (ub == Rep::UNBOUNDED_REP) {
400        return processUnboundedRep(rep->getRE(), marker, pb);
401    }
402    else if (ub == lb) { // if (rep->getUB() != Rep::UNBOUNDED_REP)
403        return marker;
404    }
405    else { // if (rep->getUB() != Rep::UNBOUNDED_REP)
406        return processBoundedRep(rep->getRE(), ub - lb, marker, pb);
407    }
408}
409
410/*
411   Given a stream |repeated| marking positions associated with matches to an item
412   of length |repeated_lgth|, compute a stream marking |repeat_count| consecutive
413   occurrences of such items.
414*/
415
416inline PabloAST * RE_Compiler::consecutive1(PabloAST * repeated, int repeated_lgth, int repeat_count, PabloBuilder & pb) {
417        int i = repeated_lgth;
418        int total_lgth = repeat_count * repeated_lgth;
419        PabloAST * consecutive_i = repeated;
420        while (i * 2 <= total_lgth) {
421            PabloAST * v = consecutive_i;
422            PabloAST * v2 =  pb.createAdvance(v, i);
423            i *= 2;
424            consecutive_i = pb.createAnd(v, v2, "at" + std::to_string(i) + "inarow");
425        }       
426        if (i < total_lgth) {
427            PabloAST * v = consecutive_i;
428            consecutive_i = pb.createAnd(v, pb.createAdvance(v, total_lgth - i), "at" + std::to_string(total_lgth) + "inarow");
429        }
430        return consecutive_i;
431}
432
433inline PabloAST * RE_Compiler::reachable(PabloAST *repeated, int repeated_lgth, int repeat_count, PabloBuilder & pb) {
434        int i = repeated_lgth;
435        int total_lgth = repeat_count * repeated_lgth;
436        if (repeat_count == 0) {
437            return repeated;
438        }
439        PabloAST * reachable_i = pb.createOr(repeated, pb.createAdvance(repeated, 1), "within1");
440        while (i * 2 < total_lgth) {
441            PabloAST * v = reachable_i;
442            PabloAST * v2 =  pb.createAdvance(v, i);
443            i *= 2;
444            reachable_i = pb.createOr(v, v2, "within" + std::to_string(i));
445        }       
446        if (i < total_lgth) {
447            PabloAST * v = reachable_i;
448            reachable_i = pb.createOr(v, pb.createAdvance(v, total_lgth - i), "within" + std::to_string(total_lgth));
449        }
450        return reachable_i;
451}
452
453MarkerType RE_Compiler::processLowerBound(RE * repeated, int lb, MarkerType marker, PabloBuilder & pb) {
454    if (isByteLength(repeated) && !DisableLog2BoundedRepetition) {
455        PabloAST * cc = markerVar(compile(repeated, pb));
456        PabloAST * cc_lb = consecutive1(cc, 1, lb, pb);
457        PabloAST * marker_fwd = pb.createAdvance(markerVar(marker), markerPos(marker) == FinalMatchByte ? lb : lb-1);
458        return makeMarker(FinalMatchByte, pb.createAnd(marker_fwd, cc_lb, "lowerbound"));
459    }
460    // Fall through to general case.
461    while (lb-- != 0) {
462        marker = process(repeated, marker, pb);
463    }
464    return marker;
465}
466
467MarkerType RE_Compiler::processBoundedRep(RE * repeated, int ub, MarkerType marker, PabloBuilder & pb) {
468    if (isByteLength(repeated) && ub > 1 && !DisableLog2BoundedRepetition) {
469        // log2 upper bound for fixed length (=1) class
470        // Create a mask of positions reachable within ub from current marker.
471        // Use matchstar, then apply filter.
472        PabloAST * match = markerVar(AdvanceMarker(marker, InitialPostPositionByte, pb));
473        PabloAST * upperLimitMask = reachable(match, 1, ub, pb);
474        PabloAST * cursor = markerVar(AdvanceMarker(marker, InitialPostPositionByte, pb));
475        PabloAST * rep_class_var = markerVar(compile(repeated, pb));
476        return makeMarker(InitialPostPositionByte, pb.createAnd(pb.createMatchStar(cursor, rep_class_var), upperLimitMask, "bounded"));
477    }
478    // Fall through to general case.
479    while (ub-- != 0) {
480        MarkerType a = process(repeated, marker, pb);
481        MarkerType m = marker;
482        AlignMarkers(a, m, pb);
483        marker = makeMarker(markerPos(a), pb.createOr(markerVar(a), markerVar(m), "m"));
484    }
485    return marker;
486}
487
488MarkerType RE_Compiler::processUnboundedRep(RE * repeated, MarkerType marker, PabloBuilder & pb) {
489    // always use PostPosition markers for unbounded repetition.
490    PabloAST * base = markerVar(AdvanceMarker(marker, InitialPostPositionByte, pb));
491   
492    if (isByteLength(repeated)  && !DisableMatchStar) {
493        PabloAST * cc = markerVar(compile(repeated, pb)); 
494        return makeMarker(InitialPostPositionByte, pb.createMatchStar(base, cc, "unbounded"));
495    }
496    else if (isUnicodeUnitLength(repeated) && !DisableMatchStar && !DisableUnicodeMatchStar) {
497        PabloAST * cc = markerVar(compile(repeated, pb));
498        return makeMarker(FinalPostPositionByte, pb.createAnd(pb.createMatchStar(base, pb.createOr(mNonFinal, cc)), mFinal, "unbounded"));
499    }
500    else if (mStarDepth > 0){
501       
502        PabloBuilder * outerb = pb.getParent();
503       
504        Assign * starPending = outerb->createAssign("pending", outerb->createZeroes());
505        Assign * starAccum = outerb->createAssign("accum", outerb->createZeroes());
506       
507        mStarDepth++;
508        PabloAST * m1 = pb.createOr(base, starPending);
509        PabloAST * m2 = pb.createOr(base, starAccum);
510        PabloAST * loopComputation = markerVar(AdvanceMarker(process(repeated, makeMarker(InitialPostPositionByte, m1), pb), InitialPostPositionByte, pb));
511        Next * nextPending = pb.createNext(starPending, pb.createAnd(loopComputation, pb.createNot(m2)));
512        Next * nextStarAccum = pb.createNext(starAccum, pb.createOr(loopComputation, m2));
513        mWhileTest = pb.createOr(mWhileTest, nextPending);
514        mLoopVariants.push_back(nextPending);
515        mLoopVariants.push_back(nextStarAccum);
516        mStarDepth--;
517       
518        return makeMarker(InitialPostPositionByte, pb.createAssign("unbounded", pb.createOr(base, nextStarAccum)));
519    }   
520    else {
521        Assign * whileTest = pb.createAssign("test", base);
522        Assign * whilePending = pb.createAssign("pending", base);
523        Assign * whileAccum = pb.createAssign("accum", base);
524        mWhileTest = pb.createZeroes();
525
526        PabloBuilder wb = PabloBuilder::Create(pb);
527        mStarDepth++;
528
529        PabloAST * loopComputation = markerVar(AdvanceMarker(process(repeated, makeMarker(InitialPostPositionByte, whilePending), wb), InitialPostPositionByte, wb));
530        Next * nextWhilePending = wb.createNext(whilePending, wb.createAnd(loopComputation, wb.createNot(whileAccum)));
531        Next * nextWhileAccum = wb.createNext(whileAccum, wb.createOr(loopComputation, whileAccum));
532        Next * nextWhileTest = wb.createNext(whileTest, wb.createOr(mWhileTest, nextWhilePending));
533        mLoopVariants.push_back(nextWhilePending);
534        mLoopVariants.push_back(nextWhileAccum);
535        mLoopVariants.push_back(nextWhileTest);
536        pb.createWhile(nextWhileTest, mLoopVariants, wb);
537        mStarDepth--;
538        mLoopVariants.clear();
539        return makeMarker(InitialPostPositionByte, pb.createAssign("unbounded", nextWhileAccum));
540    }   
541} // end of namespace re
542}
Note: See TracBrowser for help on using the repository browser.