source: icGREP/icgrep-devel/icgrep/re/re_compiler.cpp @ 5149

Last change on this file since 5149 was 5137, checked in by cameron, 3 years ago

Some clean ups of encoding info for ccc restructuring.

File size: 30.0 KB
RevLine 
[3850]1/*
2 *  Copyright (c) 2014 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
[4803]6#include <re/re_compiler.h>
[5030]7#include <re/re_toolchain.h>
[5083]8#include <re/re_name_resolve.h>
[4197]9//Regular Expressions
[4199]10#include <re/re_name.h>
[4245]11#include <re/re_any.h>
[4199]12#include <re/re_start.h>
13#include <re/re_end.h>
14#include <re/re_alt.h>
15#include <re/re_cc.h>
16#include <re/re_seq.h>
17#include <re/re_rep.h>
[4255]18#include <re/re_diff.h>
[4298]19#include <re/re_intersect.h>
[4405]20#include <re/re_assertion.h>
[4393]21#include <re/re_analysis.h>
[4820]22#include <re/re_memoizer.hpp>
[4819]23#include <re/printer_re.h>
[4207]24#include <pablo/codegenstate.h>
[4684]25#include <UCD/ucd_compiler.hpp>
[4686]26#include <UCD/resolve_properties.h>
27#ifndef DISABLE_PREGENERATED_UCD_FUNCTIONS
[4684]28#include <UCD/precompiled_properties.h>
[4686]29#endif
[4197]30#include <assert.h>
31#include <stdexcept>
[4716]32#include <iostream>
[4808]33#include <pablo/printer_pablos.h>
[4459]34#include "llvm/Support/CommandLine.h"
[4829]35#include <sstream>
[4831]36#include <unordered_set>
[4459]37
[4829]38
[5030]39#define UNICODE_LINE_BREAK (!AlgorithmOptionIsSet(DisableUnicodeLineBreak))
[4829]40
[4340]41using namespace pablo;
42
43namespace re {
44
[5137]45void RE_Compiler::initializeRequiredStreams(const unsigned encodingBits) {
46    if (encodingBits == 8) {
[5083]47      RE_Compiler::initializeRequiredStreams_utf8();
[5046]48    }
[5137]49    else if (encodingBits == 16) {
[5083]50      RE_Compiler::initializeRequiredStreams_utf16();
[5046]51    }
[5045]52}
[5083]53
[5045]54void RE_Compiler::initializeRequiredStreams_utf16() {
55    Assign * LF = mPB.createAssign("LF", mCCCompiler.compileCC(makeCC(0x000A)));
56    PabloAST * CR = mCCCompiler.compileCC(makeCC(0x000D));
57    PabloAST * LF_VT_FF_CR = mCCCompiler.compileCC(makeCC(0x000A, 0x000D));
58    Assign * NEL = mPB.createAssign("NEL", mCCCompiler.compileCC(makeCC(0x0085)));
59    Assign * LS_PS = mPB.createAssign("LS_PS", mCCCompiler.compileCC(makeCC(0x2028, 0x2029)));
60    Assign * NEL_LS_PS = mPB.createAssign("NEL_LS_PS", mPB.createOr(NEL, LS_PS));
[3850]61
[5045]62    PabloAST * cr1 = mPB.createAdvance(CR, 1, "cr1");
63    Assign * acrlf = mPB.createAssign("crlf", mPB.createAnd(cr1, LF));
64    mCRLF = acrlf;
65
[5046]66    PabloAST * hi_surrogate = mCCCompiler.compileCC(makeCC(0xD800, 0xDBFF));
67    //PabloAST * lo_surrogate = mCCCompiler.compileCC(makeCC(0xDC00, 0xDFFF));
68    PabloAST * u16hi_hi_surrogate = mCCCompiler.compileCC(makeCC(0xD800, 0xDB00));    //u16hi_hi_surrogate = [\xD8-\xDB]
69    PabloAST * u16hi_lo_surrogate = mCCCompiler.compileCC(makeCC(0xDC00, 0xDF00));    //u16hi_lo_surrogate = [\xDC-\xDF]
[5045]70
[5046]71    PabloAST * invalidTemp = mPB.createAdvance(u16hi_hi_surrogate, 1, "InvalidTemp");
[5045]72    Assign * u16invalid = mPB.createAssign("u16invalid", mPB.createXor(invalidTemp, u16hi_lo_surrogate));//errors.Unicode=pablo.Advance(u16hi_hi_surrogate) ^ u16hi_lo_surrogate
73    Assign * u16valid = mPB.createAssign("u16valid", mPB.createNot(u16invalid));
74
75    PabloAST * u16single_temp = mPB.createOr(mCCCompiler.compileCC(makeCC(0x0000, 0xD7FF)), mCCCompiler.compileCC(makeCC(0xE000, 0xFFFF)));
[5046]76    PabloAST * u16single = mPB.createAnd(u16single_temp, mPB.createNot(u16invalid));
[5045]77   
78    mNonFinal = mPB.createAssign("nonfinal", mPB.createAnd(hi_surrogate, u16valid));
79    mFinal = mPB.createNot(mPB.createOr(mNonFinal, u16invalid), "final");
[5046]80    mInitial = mPB.createOr(u16single, hi_surrogate, "initial");
[5045]81   
82    PabloAST * LB_chars = mPB.createOr(LF_VT_FF_CR, NEL_LS_PS);
83    PabloAST * UnicodeLineBreak = mPB.createAnd(LB_chars, mPB.createNot(mCRLF));  // count the CR, but not CRLF
84    PabloAST * lb = UNICODE_LINE_BREAK ? UnicodeLineBreak : LF;
85    PabloAST * unterminatedLineAtEOF = mPB.createAtEOF(mPB.createAdvance(mPB.createNot(LB_chars), 1));
86    mLineBreak = mPB.createOr(lb, unterminatedLineAtEOF);
87    mAny = mPB.createNot(lb, "any");
[5134]88    if (!mCountOnly) mFunction.setResult(1, mPB.createAssign("lf", mLineBreak));
[5046]89    return;
[5045]90}
91void RE_Compiler::initializeRequiredStreams_utf8() {
[4622]92    Assign * LF = mPB.createAssign("LF", mCCCompiler.compileCC(makeCC(0x0A)));
93    PabloAST * CR = mCCCompiler.compileCC(makeCC(0x0D));
94    PabloAST * LF_VT_FF_CR = mCCCompiler.compileCC(makeCC(0x0A, 0x0D));
[4508]95
[4622]96    PabloBuilder crb = PabloBuilder::Create(mPB);
[4439]97    PabloAST * cr1 = crb.createAdvance(CR, 1, "cr1");
[4410]98    Assign * acrlf = crb.createAssign("crlf", crb.createAnd(cr1, LF));
[4510]99    mPB.createIf(CR, {acrlf}, crb);
[4410]100    mCRLF = acrlf;
[4405]101
[4622]102    PabloAST * u8pfx = mCCCompiler.compileCC(makeCC(0xC0, 0xFF));
103    PabloBuilder it = PabloBuilder::Create(mPB);
104    PabloAST * u8pfx2 = mCCCompiler.compileCC(makeCC(0xC2, 0xDF), it);
105    PabloAST * u8pfx3 = mCCCompiler.compileCC(makeCC(0xE0, 0xEF), it);
106    PabloAST * u8pfx4 = mCCCompiler.compileCC(makeCC(0xF0, 0xF4), it);
107    Assign * u8suffix = it.createAssign("u8suffix", mCCCompiler.compileCC(makeCC(0x80, 0xBF)));
[4846]108
[4509]109    //
110    // Two-byte sequences
[4622]111    PabloBuilder it2 = PabloBuilder::Create(it);
[4509]112    Assign * u8scope22 = it2.createAssign("u8scope22", it2.createAdvance(u8pfx2, 1));
[4622]113    Assign * NEL = it2.createAssign("NEL", it2.createAnd(it2.createAdvance(mCCCompiler.compileCC(makeCC(0xC2), it2), 1), mCCCompiler.compileCC(makeCC(0x85), it2)));
[4510]114    it.createIf(u8pfx2, {u8scope22, NEL}, it2);
[4846]115
[4509]116    //
117    // Three-byte sequences
[4622]118    PabloBuilder it3 = PabloBuilder::Create(it);
[4509]119    Assign * u8scope32 = it3.createAssign("u8scope32", it3.createAdvance(u8pfx3, 1));
120    PabloAST * u8scope33 = it3.createAdvance(u8pfx3, 2);
121    Assign * u8scope3X = it3.createAssign("u8scope3X", it3.createOr(u8scope32, u8scope33));
[4622]122    PabloAST * E2_80 = it3.createAnd(it3.createAdvance(mCCCompiler.compileCC(makeCC(0xE2), it3), 1), mCCCompiler.compileCC(makeCC(0x80), it3));
123    Assign * LS_PS = it3.createAssign("LS_PS", it3.createAnd(it3.createAdvance(E2_80, 1), mCCCompiler.compileCC(makeCC(0xA8,0xA9), it3)));
124    PabloAST * E0_invalid = it3.createAnd(it3.createAdvance(mCCCompiler.compileCC(makeCC(0xE0), it3), 1), mCCCompiler.compileCC(makeCC(0x80, 0x9F), it3));
125    PabloAST * ED_invalid = it3.createAnd(it3.createAdvance(mCCCompiler.compileCC(makeCC(0xED), it3), 1), mCCCompiler.compileCC(makeCC(0xA0, 0xBF), it3));
[4509]126    Assign * EX_invalid = it3.createAssign("EX_invalid", it3.createOr(E0_invalid, ED_invalid));
[4510]127    it.createIf(u8pfx3, {u8scope32, u8scope3X, LS_PS, EX_invalid}, it3);
[4846]128
[4509]129    //
130    // Four-byte sequences
[4622]131    PabloBuilder it4 = PabloBuilder::Create(it);
[4509]132    PabloAST * u8scope42 = it4.createAdvance(u8pfx4, 1, "u8scope42");
133    PabloAST * u8scope43 = it4.createAdvance(u8scope42, 1, "u8scope43");
134    PabloAST * u8scope44 = it4.createAdvance(u8scope43, 1, "u8scope44");
135    Assign * u8scope4nonfinal = it4.createAssign("u8scope4nonfinal", it4.createOr(u8scope42, u8scope43));
136    Assign * u8scope4X = it4.createAssign("u8scope4X", it4.createOr(u8scope4nonfinal, u8scope44));
[4622]137    PabloAST * F0_invalid = it4.createAnd(it4.createAdvance(mCCCompiler.compileCC(makeCC(0xF0), it4), 1), mCCCompiler.compileCC(makeCC(0x80, 0x8F), it4));
138    PabloAST * F4_invalid = it4.createAnd(it4.createAdvance(mCCCompiler.compileCC(makeCC(0xF4), it4), 1), mCCCompiler.compileCC(makeCC(0x90, 0xBF), it4));
[4509]139    Assign * FX_invalid = it4.createAssign("FX_invalid", it4.createOr(F0_invalid, F4_invalid));
[4510]140    it.createIf(u8pfx4, {u8scope4nonfinal, u8scope4X, FX_invalid}, it4);
[4509]141
142    //
143    // Invalid cases
144    PabloAST * anyscope = it.createOr(u8scope22, it.createOr(u8scope3X, u8scope4X));
145    PabloAST * legalpfx = it.createOr(it.createOr(u8pfx2, u8pfx3), u8pfx4);
146    //  Any scope that does not have a suffix byte, and any suffix byte that is not in
147    //  a scope is a mismatch, i.e., invalid UTF-8.
148    PabloAST * mismatch = it.createXor(anyscope, u8suffix);
149    //
150    PabloAST * EF_invalid = it.createOr(EX_invalid, FX_invalid);
151    PabloAST * pfx_invalid = it.createXor(u8pfx, legalpfx);
152    Assign * u8invalid = it.createAssign("u8invalid", it.createOr(pfx_invalid, it.createOr(mismatch, EF_invalid)));
[4520]153    Assign * u8valid = it.createAssign("u8valid", it.createNot(u8invalid));
[4509]154    //
155    //
[4846]156
[4520]157    Assign * valid_pfx = it.createAssign("valid_pfx", it.createAnd(u8pfx, u8valid));
158    mNonFinal = it.createAssign("nonfinal", it.createAnd(it.createOr(it.createOr(u8pfx, u8scope32), u8scope4nonfinal), u8valid));
[4846]159
[4352]160    Assign * NEL_LS_PS = it.createAssign("NEL_LS_PS", it.createOr(NEL, LS_PS));
[4510]161    mPB.createIf(u8pfx, {u8invalid, valid_pfx, mNonFinal, NEL_LS_PS}, it);
[4846]162
[4438]163    PabloAST * LB_chars = mPB.createOr(LF_VT_FF_CR, NEL_LS_PS);
[4622]164    PabloAST * u8single = mPB.createAnd(mCCCompiler.compileCC(makeCC(0x00, 0x7F)), mPB.createNot(u8invalid));
[4439]165    mInitial = mPB.createOr(u8single, valid_pfx, "initial");
[4520]166    mFinal = mPB.createNot(mPB.createOr(mNonFinal, u8invalid), "final");
[5042]167    PabloAST * UnicodeLineBreak = mPB.createAnd(LB_chars, mPB.createNot(mCRLF));  // count the CR, but not CRLF
168    PabloAST * lb = UNICODE_LINE_BREAK ? UnicodeLineBreak : LF;
169    PabloAST * unterminatedLineAtEOF = mPB.createAtEOF(mPB.createAdvance(mPB.createNot(LB_chars), 1));
170    mLineBreak = mPB.createOr(lb, unterminatedLineAtEOF);
[4829]171    mAny = mPB.createNot(lb, "any");
[5134]172    if (!mCountOnly) mFunction.setResult(1, mPB.createAssign("lf", mLineBreak));
[4330]173}
[4182]174
[4809]175
[4814]176RE * RE_Compiler::resolveUnicodeProperties(RE * re) {
[5091]177    Name * ZeroWidth = nullptr;
[4820]178    UCD::UCDCompiler::NameMap nameMap;
[4831]179    std::unordered_set<Name *> visited;
[5091]180    nameMap = resolveNames(re, ZeroWidth);
[5083]181   
[4829]182    if (LLVM_LIKELY(nameMap.size() > 0)) {
[4814]183        UCD::UCDCompiler ucdCompiler(mCCCompiler);
[5030]184        if (LLVM_UNLIKELY(AlgorithmOptionIsSet(DisableIfHierarchy))) {
[4919]185            ucdCompiler.generateWithoutIfHierarchy(nameMap, mPB);
186        } else {
187            ucdCompiler.generateWithDefaultIfHierarchy(nameMap, mPB);
188        }
[4814]189        for (auto t : nameMap) {
190            if (t.second) {
[5045]191                mCompiledName.insert(std::make_pair(t.first, makeMarker(MarkerPosition::FinalMatchUnit, t.second)));
[4808]192            }
193        }
194    }
[4814]195
[4829]196    // Now precompile any grapheme segmentation rules
[5091]197    if (ZeroWidth) {
198        auto gcb = compileName(ZeroWidth, mPB);
199        mCompiledName.insert(std::make_pair(ZeroWidth, gcb));
[4829]200    }
[4814]201    return re;
[4808]202}
203
[4814]204void RE_Compiler::compileUnicodeNames(RE *& re) {
205    re = resolveUnicodeProperties(re);
206}
207
[5030]208void RE_Compiler::finalizeMatchResult(MarkerType match_result, bool InvertMatches) {
[5083]209    PabloAST * match_follow = mPB.createMatchStar(markerVar(match_result), mAny);
[4978]210    if (InvertMatches) {
211        match_follow = mPB.createNot(match_follow);
212    }
[5063]213    Assign * matches = mPB.createAssign("matches", mPB.createAnd(match_follow, mLineBreak));
[5134]214    if (mCountOnly) {
215        mFunction.setResultCount(mPB.createCount("matchedLineCount", matches));
216    }
217    else {
218        mFunction.setResult(0, matches);
219    }
[4197]220}
[4405]221
[4622]222MarkerType RE_Compiler::compile(RE * re, PabloBuilder & pb) {
[5045]223    return process(re, makeMarker(MarkerPosition::FinalPostPositionUnit, pb.createOnes()), pb);
[4330]224}
[4405]225
[4622]226MarkerType RE_Compiler::process(RE * re, MarkerType marker, PabloBuilder & pb) {
[4852]227    if (isa<Name>(re)) {
228        return compileName(cast<Name>(re), marker, pb);
229    } else if (isa<Seq>(re)) {
230        return compileSeq(cast<Seq>(re), marker, pb);
231    } else if (isa<Alt>(re)) {
232        return compileAlt(cast<Alt>(re), marker, pb);
233    } else if (isa<Rep>(re)) {
234        return compileRep(cast<Rep>(re), marker, pb);
235    } else if (isa<Assertion>(re)) {
236        return compileAssertion(cast<Assertion>(re), marker, pb);
[4829]237    } else if (isa<Any>(re)) {
238        return compileAny(marker, pb);
[4852]239    } else if (isa<Diff>(re)) {
240        return compileDiff(cast<Diff>(re), marker, pb);
241    } else if (isa<Intersect>(re)) {
242        return compileIntersect(cast<Intersect>(re), marker, pb);
[4829]243    } else if (isa<Start>(re)) {
[4831]244        return compileStart(marker, pb);
[4829]245    } else if (isa<End>(re)) {
[4831]246        return compileEnd(marker, pb);
[4197]247    }
[4831]248    throw std::runtime_error("RE Compiler failed to process " + Printer_RE::PrintRE(re));
[4197]249}
[3850]250
[4829]251inline MarkerType RE_Compiler::compileAny(const MarkerType m, PabloBuilder & pb) {
[5045]252    PabloAST * nextFinalByte = markerVar(AdvanceMarker(m, MarkerPosition::FinalPostPositionUnit, pb));
[5042]253    PabloAST * lb = mLineBreak;
[4829]254    if (UNICODE_LINE_BREAK) {
[5042]255        lb = pb.createOr(mLineBreak, mCRLF);
[4829]256    }
[5045]257    return makeMarker(MarkerPosition::FinalMatchUnit, pb.createAnd(nextFinalByte, pb.createNot(lb), "dot"));
[4829]258}
259
260inline MarkerType RE_Compiler::compileName(Name * name, MarkerType marker, PabloBuilder & pb) {
[5080]261    if (isUnicodeUnitLength(name)) {
262        MarkerType nameMarker = compileName(name, pb);
263        MarkerType nextPos;
264        if (markerPos(marker) == MarkerPosition::FinalPostPositionUnit) {
265            nextPos = marker;
266        } else if (name->getType() == Name::Type::Byte) {
267            nextPos = AdvanceMarker(marker, MarkerPosition::InitialPostPositionUnit, pb);
268        } else {
269            nextPos = AdvanceMarker(marker, MarkerPosition::FinalPostPositionUnit, pb);
270        }
271        nameMarker.stream = pb.createAnd(markerVar(nextPos), markerVar(nameMarker), name->getName());
272        return nameMarker;
[5091]273    } else if (name->getType() == Name::Type::ZeroWidth) {
274        RE * zerowidth = name->getDefinition();
275        MarkerType zero = compile(zerowidth, pb);
276        AlignMarkers(marker, zero, pb);
277        PabloAST * ze = markerVar(zero);
278        const std::string value = name->getName();
279        if (value == "NonGCB") {
280            ze = pb.createNot(ze);
281        }
282        return makeMarker(markerPos(marker), pb.createAnd(markerVar(marker), ze, "zerowidth"));
283    } else {
[5080]284        return process(name->getDefinition(), marker, pb);
285    }
[4197]286}
[4129]287
[4831]288inline MarkerType RE_Compiler::compileName(Name * name, PabloBuilder & pb) {
289    auto f = mCompiledName.find(name);
290    if (LLVM_LIKELY(f != mCompiledName.end())) {
291        return f->second;
[4814]292    } else if (LLVM_LIKELY(name->getDefinition() != nullptr)) {
[4622]293        MarkerType m = compile(name->getDefinition(), pb);
[4831]294        mCompiledName.insert(std::make_pair(name, m));
295        return m;
[4486]296    }
[4829]297    throw std::runtime_error("Unresolved name " + name->getName());
[4486]298}
299
[4831]300MarkerType RE_Compiler::compileSeq(Seq * seq, MarkerType marker, PabloBuilder & pb) {
[4442]301    // if-hierarchies are not inserted within unbounded repetitions
302    if (mStarDepth > 0) {
303        for (RE * re : *seq) {
304            marker = process(re, marker, pb);
305        }
306        return marker;
[4829]307    } else {
[4831]308        return compileSeqTail(seq->begin(), seq->end(), 0, marker, pb);
[4442]309    }
[4197]310}
[3955]311
[4831]312MarkerType RE_Compiler::compileSeqTail(Seq::iterator current, Seq::iterator end, int matchLenSoFar, MarkerType marker, PabloBuilder & pb) {
[4442]313    if (current == end) return marker;
[4459]314    if (matchLenSoFar < IfInsertionGap) {
[4442]315        RE * r = *current;
316        marker = process(r, marker, pb);
317        current++;
[4831]318        return compileSeqTail(current, end, matchLenSoFar + minMatchLength(r), marker, pb);
[4829]319    } else {
[4622]320        PabloBuilder nested = PabloBuilder::Create(pb);
[4831]321        MarkerType m1 = compileSeqTail(current, end, 0, marker, nested);
[4442]322        Assign * m1a = nested.createAssign("m", markerVar(m1));
[4510]323        pb.createIf(markerVar(marker), {m1a}, nested);
[4442]324        return makeMarker(m1.pos, m1a);
325    }
326}
327
[4831]328MarkerType RE_Compiler::compileAlt(Alt * alt, MarkerType marker, PabloBuilder & pb) {
[4411]329    std::vector<PabloAST *>  accum = {pb.createZeroes(), pb.createZeroes(), pb.createZeroes()};
[4340]330    MarkerType const base = marker;
331    // The following may be useful to force a common Advance rather than separate
332    // Advances in each alternative.
[5045]333    // MarkerType const base = makeMarker(InitialPostPositionUnit, postPositionVar(marker, pb), pb);
[4340]334    for (RE * re : *alt) {
335        MarkerType rslt = process(re, base, pb);
[4411]336        MarkerPosition p = markerPos(rslt);
[4439]337        accum[p] = pb.createOr(accum[p], markerVar(rslt), "alt");
[4197]338    }
[5045]339    if (isa<Zeroes>(accum[MarkerPosition::InitialPostPositionUnit]) && isa<Zeroes>(accum[MarkerPosition::FinalPostPositionUnit])) {
340        return makeMarker(MarkerPosition::FinalMatchUnit, accum[MarkerPosition::FinalMatchUnit]);
[4340]341    }
[5045]342    PabloAST * combine = pb.createOr(accum[InitialPostPositionUnit], pb.createAdvance(accum[MarkerPosition::FinalMatchUnit], 1), "alt");
343    if (isa<Zeroes>(accum[FinalPostPositionUnit])) {
344        return makeMarker(InitialPostPositionUnit, combine);
[4340]345    }
[5045]346    combine = pb.createOr(pb.createScanThru(pb.createAnd(mInitial, combine), mNonFinal), accum[MarkerPosition::FinalPostPositionUnit], "alt");
347    return makeMarker(MarkerPosition::FinalPostPositionUnit, combine);
[4197]348}
[3955]349
[4831]350MarkerType RE_Compiler::compileAssertion(Assertion * a, MarkerType marker, PabloBuilder & pb) {
[4411]351    RE * asserted = a->getAsserted();
[4408]352    if (a->getKind() == Assertion::Kind::Lookbehind) {
[4411]353        MarkerType lookback = compile(asserted, pb);
[4835]354        AlignMarkers(marker, lookback, pb);
[4438]355        PabloAST * lb = markerVar(lookback);
[4411]356        if (a->getSense() == Assertion::Sense::Negative) {
[4439]357            lb = pb.createNot(lb);
[4408]358        }
[4835]359        return makeMarker(markerPos(marker), pb.createAnd(markerVar(marker), lb, "lookback"));
[4829]360    } else if (isUnicodeUnitLength(asserted)) {
[4411]361        MarkerType lookahead = compile(asserted, pb);
[5045]362        if (LLVM_LIKELY(markerPos(lookahead) == MarkerPosition::FinalMatchUnit)) {
[4831]363            PabloAST * la = markerVar(lookahead);
364            if (a->getSense() == Assertion::Sense::Negative) {
365                la = pb.createNot(la);
366            }
[5045]367            MarkerType fbyte = AdvanceMarker(marker, MarkerPosition::FinalPostPositionUnit, pb);
368            return makeMarker(MarkerPosition::FinalPostPositionUnit, pb.createAnd(markerVar(fbyte), la, "lookahead"));
[4408]369        }
370    }
[4829]371    throw std::runtime_error("Unsupported lookahead assertion.");
[4405]372}
373
[4829]374inline bool alignedUnicodeLength(const RE * lh, const RE * rh) {
375    const auto lhl = getUnicodeUnitLengthRange(lh);
376    const auto rhl = getUnicodeUnitLengthRange(rh);
377    return (lhl.first == lhl.second && lhl.first == rhl.first && lhl.second == rhl.second);
378}
379
[4831]380MarkerType RE_Compiler::compileDiff(Diff * diff, MarkerType marker, PabloBuilder & pb) {
[4255]381    RE * lh = diff->getLH();
382    RE * rh = diff->getRH();
[4829]383    if (alignedUnicodeLength(lh, rh)) {
[4340]384        MarkerType t1 = process(lh, marker, pb);
385        MarkerType t2 = process(rh, marker, pb);
[4411]386        AlignMarkers(t1, t2, pb);
[4439]387        return makeMarker(markerPos(t1), pb.createAnd(markerVar(t1), pb.createNot(markerVar(t2)), "diff"));
[4255]388    }
389    throw std::runtime_error("Unsupported Diff operands: " + Printer_RE::PrintRE(diff));
390}
391
[4831]392MarkerType RE_Compiler::compileIntersect(Intersect * x, MarkerType marker, PabloBuilder & pb) {
[4298]393    RE * lh = x->getLH();
394    RE * rh = x->getRH();
[4829]395    if (alignedUnicodeLength(lh, rh)) {
[4340]396        MarkerType t1 = process(lh, marker, pb);
397        MarkerType t2 = process(rh, marker, pb);
[4411]398        AlignMarkers(t1, t2, pb);
[4439]399        return makeMarker(markerPos(t1), pb.createAnd(markerVar(t1), markerVar(t2), "intersect"));
[4298]400    }
401    throw std::runtime_error("Unsupported Intersect operands: " + Printer_RE::PrintRE(x));
402}
403
[4831]404MarkerType RE_Compiler::compileRep(Rep * rep, MarkerType marker, PabloBuilder & pb) {
[4261]405    int lb = rep->getLB();
406    int ub = rep->getUB();
407    if (lb > 0) {
[4266]408        marker = processLowerBound(rep->getRE(), lb, marker, pb);
[4208]409    }
[4261]410    if (ub == Rep::UNBOUNDED_REP) {
[4831]411        marker = processUnboundedRep(rep->getRE(), marker, pb);
412    } else if (lb < ub) {
413        marker = processBoundedRep(rep->getRE(), ub - lb, marker, pb);
[4261]414    }
[4831]415    return marker;
[4208]416}
417
[4261]418/*
[4420]419   Given a stream |repeated| marking positions associated with matches to an item
[4846]420   of length |repeated_lgth|, compute a stream marking |repeat_count| consecutive
[4420]421   occurrences of such items.
[4261]422*/
[4405]423
[4831]424inline PabloAST * RE_Compiler::consecutive_matches(PabloAST * repeated, int length, int repeat_count, PabloBuilder & pb) {
[4841]425    int i = length;
426    int total = repeat_count * length;
427    PabloAST * consecutive_i = repeated;
428    while (i * 2 < total) {
429        PabloAST * v = consecutive_i;
430        PabloAST * v2 =  pb.createAdvance(v, i);
431        i *= 2;
432        consecutive_i = pb.createAnd(v, v2, "at" + std::to_string(i) + "of" + std::to_string(total));
433    }
434    if (i < total) {
435        PabloAST * v = consecutive_i;
436        consecutive_i = pb.createAnd(v, pb.createAdvance(v, total - i), "at" + std::to_string(total));
437    }
438    return consecutive_i;
[4261]439}
[4405]440
[4841]441inline PabloAST * RE_Compiler::reachable(PabloAST * repeated, int length, int repeat_count, PabloBuilder & pb) {
442    int i = length;
443    int total_lgth = repeat_count * length;
444    if (repeat_count == 0) {
445        return repeated;
446    }
447    PabloAST * reachable_i = pb.createOr(repeated, pb.createAdvance(repeated, 1), "within1");
448    while (i * 2 < total_lgth) {
449        PabloAST * v = reachable_i;
450        PabloAST * v2 =  pb.createAdvance(v, i);
451        i *= 2;
452        reachable_i = pb.createOr(v, v2, "within" + std::to_string(i));
453    }
454    if (i < total_lgth) {
455        PabloAST * v = reachable_i;
456        reachable_i = pb.createOr(v, pb.createAdvance(v, total_lgth - i), "within" + std::to_string(total_lgth));
457    }
458    return reachable_i;
[4420]459}
460
[4622]461MarkerType RE_Compiler::processLowerBound(RE * repeated, int lb, MarkerType marker, PabloBuilder & pb) {
[5030]462    if (!mGraphemeBoundaryRule && isByteLength(repeated) && !AlgorithmOptionIsSet(DisableLog2BoundedRepetition)) {
[4438]463        PabloAST * cc = markerVar(compile(repeated, pb));
[4831]464        PabloAST * cc_lb = consecutive_matches(cc, 1, lb, pb);
[5045]465        PabloAST * marker_fwd = pb.createAdvance(markerVar(marker), markerPos(marker) == MarkerPosition::FinalMatchUnit ? lb : lb - 1);
466        return makeMarker(MarkerPosition::FinalMatchUnit, pb.createAnd(marker_fwd, cc_lb, "lowerbound"));
[4340]467    }
468    // Fall through to general case.
[4846]469    for (int i = 1; i <= lb; ++i) {
[4340]470        marker = process(repeated, marker, pb);
[4846]471        if (mGraphemeBoundaryRule) {
[5045]472            marker = AdvanceMarker(marker, MarkerPosition::FinalPostPositionUnit, pb);
[4846]473        }
[4340]474    }
475    return marker;
[4267]476}
[4213]477
[4622]478MarkerType RE_Compiler::processBoundedRep(RE * repeated, int ub, MarkerType marker, PabloBuilder & pb) {
[5030]479    if (!mGraphemeBoundaryRule && isByteLength(repeated) && ub > 1 && !AlgorithmOptionIsSet(DisableLog2BoundedRepetition)) {
[4340]480        // log2 upper bound for fixed length (=1) class
[4420]481        // Create a mask of positions reachable within ub from current marker.
[4340]482        // Use matchstar, then apply filter.
[5045]483        PabloAST * match = markerVar(AdvanceMarker(marker, MarkerPosition::InitialPostPositionUnit, pb));
[4438]484        PabloAST * upperLimitMask = reachable(match, 1, ub, pb);
[5045]485        PabloAST * cursor = markerVar(AdvanceMarker(marker, MarkerPosition::InitialPostPositionUnit, pb));
[4411]486        PabloAST * rep_class_var = markerVar(compile(repeated, pb));
[5045]487        return makeMarker(MarkerPosition::InitialPostPositionUnit, pb.createAnd(pb.createMatchStar(cursor, rep_class_var), upperLimitMask, "bounded"));
[4340]488    }
489    // Fall through to general case.
[4846]490    for (int i = 1; i <= ub; ++i) {
[4340]491        MarkerType a = process(repeated, marker, pb);
[4411]492        MarkerType m = marker;
493        AlignMarkers(a, m, pb);
[4846]494        marker = makeMarker(markerPos(a), pb.createOr(markerVar(a), markerVar(m), "upper" + std::to_string(i)));
495        if (mGraphemeBoundaryRule) {
[5045]496            marker = AdvanceMarker(marker, MarkerPosition::FinalPostPositionUnit, pb);
[4846]497        }
[4340]498    }
499    return marker;
[4261]500}
501
[4622]502MarkerType RE_Compiler::processUnboundedRep(RE * repeated, MarkerType marker, PabloBuilder & pb) {
[4340]503    // always use PostPosition markers for unbounded repetition.
[5045]504    PabloAST * base = markerVar(AdvanceMarker(marker, MarkerPosition::InitialPostPositionUnit, pb));
[5030]505    if (!mGraphemeBoundaryRule && isByteLength(repeated)  && !AlgorithmOptionIsSet(DisableMatchStar)) {
[4829]506        PabloAST * cc = markerVar(compile(repeated, pb));
507        PabloAST * mstar = nullptr;
[4980]508        mstar = pb.createMatchStar(base, cc, "unbounded");
[5045]509        return makeMarker(MarkerPosition::InitialPostPositionUnit, mstar);
[5030]510    } else if (isUnicodeUnitLength(repeated) && !AlgorithmOptionIsSet(DisableMatchStar) && !AlgorithmOptionIsSet(DisableUnicodeMatchStar)) {
[4411]511        PabloAST * cc = markerVar(compile(repeated, pb));
[4829]512        PabloAST * mstar = nullptr;
513        PabloAST * nonFinal = mNonFinal;
[4841]514        if (mGraphemeBoundaryRule) {
515            nonFinal = pb.createOr(nonFinal, pb.createNot(mGraphemeBoundaryRule, "gext"));
[4829]516        }
517        cc = pb.createOr(cc, nonFinal);
[4980]518        mstar = pb.createMatchStar(base, cc);
[4829]519        PabloAST * final = mFinal;
[4841]520        if (mGraphemeBoundaryRule) {
521            final = mGraphemeBoundaryRule;
[4829]522        }
[5045]523        return makeMarker(MarkerPosition::FinalPostPositionUnit, pb.createAnd(mstar, final, "unbounded"));
[4829]524    } else if (mStarDepth > 0){
[4846]525        PabloBuilder * outerb = pb.getParent();
[4609]526        Assign * starPending = outerb->createAssign("pending", outerb->createZeroes());
[4846]527        Assign * starAccum = outerb->createAssign("accum", outerb->createZeroes());
[4609]528        mStarDepth++;
529        PabloAST * m1 = pb.createOr(base, starPending);
530        PabloAST * m2 = pb.createOr(base, starAccum);
[5045]531        MarkerType result = process(repeated, makeMarker(MarkerPosition::InitialPostPositionUnit, m1), pb);
532        result = AdvanceMarker(result, MarkerPosition::InitialPostPositionUnit, pb);
[4844]533        PabloAST * loopComputation = markerVar(result);
[4650]534        Next * nextPending = pb.createNext(starPending, pb.createAnd(loopComputation, pb.createNot(m2)));
535        Next * nextStarAccum = pb.createNext(starAccum, pb.createOr(loopComputation, m2));
536        mWhileTest = pb.createOr(mWhileTest, nextPending);
[4641]537        mLoopVariants.push_back(nextPending);
538        mLoopVariants.push_back(nextStarAccum);
[4844]539        mStarDepth--;
540        return makeMarker(markerPos(result), pb.createAssign("unbounded", pb.createOr(base, nextStarAccum)));
[4829]541    } else {
[4340]542        Assign * whileTest = pb.createAssign("test", base);
[4609]543        Assign * whilePending = pb.createAssign("pending", base);
[4340]544        Assign * whileAccum = pb.createAssign("accum", base);
[4609]545        mWhileTest = pb.createZeroes();
[4622]546        PabloBuilder wb = PabloBuilder::Create(pb);
[4442]547        mStarDepth++;
[5045]548        MarkerType result = process(repeated, makeMarker(MarkerPosition::InitialPostPositionUnit, whilePending), wb);
549        result = AdvanceMarker(result, MarkerPosition::InitialPostPositionUnit, wb);
[4844]550        PabloAST * loopComputation = markerVar(result);
[4650]551        Next * nextWhilePending = wb.createNext(whilePending, wb.createAnd(loopComputation, wb.createNot(whileAccum)));
552        Next * nextWhileAccum = wb.createNext(whileAccum, wb.createOr(loopComputation, whileAccum));
553        Next * nextWhileTest = wb.createNext(whileTest, wb.createOr(mWhileTest, nextWhilePending));
[4641]554        mLoopVariants.push_back(nextWhilePending);
555        mLoopVariants.push_back(nextWhileAccum);
556        mLoopVariants.push_back(nextWhileTest);
[4650]557        pb.createWhile(nextWhileTest, mLoopVariants, wb);
[4442]558        mStarDepth--;
[4641]559        mLoopVariants.clear();
[4844]560        return makeMarker(markerPos(result), pb.createAssign("unbounded", nextWhileAccum));
[4846]561    }
[4829]562}
563
[4831]564inline MarkerType RE_Compiler::compileStart(const MarkerType marker, pablo::PabloBuilder & pb) {
[5045]565    MarkerType m = AdvanceMarker(marker, MarkerPosition::InitialPostPositionUnit, pb);
[4831]566    if (UNICODE_LINE_BREAK) {
[5042]567        PabloAST * line_end = mPB.createOr(mLineBreak, mCRLF);
[4831]568        PabloAST * sol = pb.createNot(pb.createOr(pb.createAdvance(pb.createNot(line_end), 1), mCRLF));
[5045]569        return makeMarker(MarkerPosition::InitialPostPositionUnit, pb.createAnd(markerVar(m), sol, "sol"));
[4831]570    } else {
[5042]571        PabloAST * sol = pb.createNot(pb.createAdvance(pb.createNot(mLineBreak), 1));
[5045]572        return makeMarker(MarkerPosition::FinalPostPositionUnit, pb.createAnd(markerVar(m), sol, "sol"));
[4831]573    }
574}
575
576inline MarkerType RE_Compiler::compileEnd(const MarkerType marker, pablo::PabloBuilder & pb) {
577    if (UNICODE_LINE_BREAK) {
[5045]578        PabloAST * nextPos = markerVar(AdvanceMarker(marker, MarkerPosition::FinalPostPositionUnit, pb));
579        return makeMarker(MarkerPosition::FinalPostPositionUnit, pb.createAnd(nextPos, mLineBreak, "eol"));
[4831]580    } else {
[5045]581        PabloAST * nextPos = markerVar(AdvanceMarker(marker, MarkerPosition::InitialPostPositionUnit, pb));  // For LF match
582        return makeMarker(MarkerPosition::FinalPostPositionUnit, pb.createAnd(nextPos, mLineBreak, "eol"));
[4831]583    }
584}
585
[5091]586/*inline MarkerType RE_Compiler::compileGraphemeBoundary(GraphemeBoundary * gb, MarkerType marker, pablo::PabloBuilder & pb) {
[4841]587    auto f = mCompiledName.find(gb->getBoundaryRule());
588    assert ("Internal error: failed to locate grapheme boundary rule!" && (f != mCompiledName.end()));
589    if (gb->getExpression()) {
590        const auto graphemeBoundaryRule = mGraphemeBoundaryRule;
591        mGraphemeBoundaryRule = markerVar(f->second);
592        marker = process(gb->getExpression(), marker, pb);
[5045]593        marker = AdvanceMarker(marker, MarkerPosition::FinalPostPositionUnit, pb);
[4841]594        mGraphemeBoundaryRule = graphemeBoundaryRule;
595    } else {
[5045]596        marker = AdvanceMarker(marker, MarkerPosition::FinalPostPositionUnit, pb);
[4841]597        PabloAST * rule = markerVar(f->second);
598        if (gb->getSense() == GraphemeBoundary::Sense::Negative) {
599            rule = pb.createNot(rule);
600        }
[5045]601        marker = makeMarker(MarkerPosition::FinalPostPositionUnit, pb.createAnd(markerVar(marker), rule, "gb"));
[4831]602    }
[4841]603    return marker;
[5091]604}*/
[4831]605
[4841]606inline MarkerType RE_Compiler::AdvanceMarker(MarkerType marker, const MarkerPosition newpos, PabloBuilder & pb) {
607    if (marker.pos != newpos) {
[5045]608        if (marker.pos == MarkerPosition::FinalMatchUnit) {
[4841]609            marker.stream = pb.createAdvance(marker.stream, 1, "ipp");
[5045]610            marker.pos = MarkerPosition::InitialPostPositionUnit;
[4831]611        }
[5045]612        if (newpos == MarkerPosition::FinalPostPositionUnit) {
[4841]613            PabloAST * nonFinal = mNonFinal;
614            if (mGraphemeBoundaryRule) {
615                nonFinal = pb.createOr(nonFinal, pb.createNot(mGraphemeBoundaryRule, "gext"));
616            }
617            marker.stream = pb.createScanThru(pb.createAnd(mInitial, marker.stream), nonFinal, "fpp");
[5045]618            marker.pos = MarkerPosition::FinalPostPositionUnit;
[4841]619        }
[4829]620    }
[4841]621    return marker;
[4829]622}
623
624inline void RE_Compiler::AlignMarkers(MarkerType & m1, MarkerType & m2, PabloBuilder & pb) {
625    if (m1.pos < m2.pos) {
626        m1 = AdvanceMarker(m1, m2.pos, pb);
627    } else if (m2.pos < m1.pos) {
628        m2 = AdvanceMarker(m2, m1.pos, pb);
629    }
630}
631
[5134]632RE_Compiler::RE_Compiler(pablo::PabloFunction & function, cc::CC_Compiler & ccCompiler, bool CountOnly)
633: mCountOnly(CountOnly)
634, mCCCompiler(ccCompiler)
[5042]635, mLineBreak(nullptr)
[4829]636, mCRLF(nullptr)
637, mAny(nullptr)
[4841]638, mGraphemeBoundaryRule(nullptr)
[4829]639, mInitial(nullptr)
640, mNonFinal(nullptr)
641, mFinal(nullptr)
642, mWhileTest(nullptr)
643, mStarDepth(0)
644, mLoopVariants()
[4870]645, mPB(ccCompiler.getBuilder().getPabloBlock(), ccCompiler.getBuilder())
[4829]646, mFunction(function)
647{
648
649}
650
[4340]651} // end of namespace re
Note: See TracBrowser for help on using the repository browser.