source: icGREP/icgrep-devel/icgrep/re/re_compiler.cpp @ 4405

Last change on this file since 4405 was 4405, checked in by cameron, 4 years ago

AST support for Lookahead/Lookbehind? assertions

File size: 16.7 KB
RevLine 
[3850]1/*
2 *  Copyright (c) 2014 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "re_compiler.h"
[4197]8//Regular Expressions
[4199]9#include <re/re_name.h>
[4245]10#include <re/re_any.h>
[4199]11#include <re/re_start.h>
12#include <re/re_end.h>
13#include <re/re_alt.h>
14#include <re/re_cc.h>
15#include <re/re_seq.h>
16#include <re/re_rep.h>
[4255]17#include <re/re_diff.h>
[4298]18#include <re/re_intersect.h>
[4405]19#include <re/re_assertion.h>
[4393]20#include <re/re_analysis.h>
[4249]21#include <cc/cc_namemap.hpp>
[4207]22#include <pablo/codegenstate.h>
[4182]23
[4197]24#include <assert.h>
25#include <stdexcept>
[4182]26
[4340]27using namespace pablo;
28
29namespace re {
30
31MarkerType makePostPositionMarker(std::string marker_name, PabloAST * s, PabloBlock & pb) {
32    return MarkerType{PostPosition, pb.createAssign(marker_name, s)};
33}
[4405]34
[4340]35MarkerType wrapPostPositionMarker(Assign * s) {
36    return MarkerType{PostPosition, s};
37}
[4405]38
[4340]39MarkerType makeFinalPositionMarker(std::string marker_name, PabloAST * s, PabloBlock & pb) {
40    return MarkerType{FinalByte, pb.createAssign(marker_name, s)};
41}
42
43Assign * markerStream(MarkerType m, PabloBlock & pb) {
44    return m.stream;
45}
46
47Var * markerVar(MarkerType m, PabloBlock & pb) {
48    return pb.createVar(m.stream);
49}
50
51Var * postPositionVar(MarkerType m, PabloBlock & pb) {
52    if (isFinalPositionMarker(m)) return pb.createVar(pb.createAssign("f", pb.createAdvance(pb.createVar(m.stream), 1)));
53    else return pb.createVar(m.stream);
54}
55
[4246]56//Set the 'internal.nonfinal' bit stream for the utf-8 multi-byte encoding.
57//#define USE_IF_FOR_NONFINAL
58
[4199]59
[4194]60
[4249]61RE_Compiler::RE_Compiler(PabloBlock & baseCG, const cc::CC_NameMap & nameMap)
[4213]62: mCG(baseCG)
[4246]63, mLineFeed(nullptr)
64, mInitial(nullptr)
65, mNonFinal(nullptr)
[3850]66{
67
[4197]68}
[4182]69
[4352]70#define USE_IF_FOR_NONFINAL 1
71#define USE_IF_FOR_CRLF
[4342]72#define UNICODE_LINE_BREAK true
[4319]73
[4405]74
[4334]75void RE_Compiler::initializeRequiredStreams(cc::CC_Compiler & ccc) {
[3850]76
[4352]77    const std::string initial = "initial";
78    const std::string nonfinal = "nonfinal";
[4405]79
[4347]80    Assign * LF = mCG.createAssign("LF", ccc.compileCC(makeCC(0x0A)));
81    mLineFeed = mCG.createVar(LF);
[4340]82    PabloAST * CR = ccc.compileCC(makeCC(0x0D));
[4357]83    PabloAST * LF_VT_FF_CR = ccc.compileCC(makeCC(0x0A, 0x0D));
[4347]84#ifndef USE_IF_FOR_CRLF
[4340]85    mCRLF = mCG.createAnd(mCG.createAdvance(CR, 1), mLineFeed);
[4347]86#else
[4404]87    PabloBlock & crb = PabloBlock::Create(mCG);
[4347]88    Assign * cr1 = crb.createAssign("cr1", crb.createAdvance(CR, 1));
89    Assign * acrlf = crb.createAssign("crlf", crb.createAnd(crb.createVar(cr1), crb.createVar(LF)));
[4404]90    mCG.createIf(CR, std::move(std::vector<Assign *>{acrlf}), crb);
[4347]91    mCRLF = mCG.createVar(acrlf);
92#endif
[4405]93
[4357]94#ifndef USE_IF_FOR_NONFINAL
[4334]95    PabloAST * u8single = ccc.compileCC(makeCC(0x00, 0x7F));
96    PabloAST * u8pfx2 = ccc.compileCC(makeCC(0xC2, 0xDF));
97    PabloAST * u8pfx3 = ccc.compileCC(makeCC(0xE0, 0xEF));
98    PabloAST * u8pfx4 = ccc.compileCC(makeCC(0xF0, 0xF4));
[4330]99    PabloAST * u8pfx = mCG.createOr(mCG.createOr(u8pfx2, u8pfx3), u8pfx4);
100    mInitial = mCG.createVar(mCG.createAssign(initial, mCG.createOr(u8pfx, u8single)));
[4405]101
[4330]102    PabloAST * u8scope32 = mCG.createAdvance(u8pfx3, 1);
103    PabloAST * u8scope42 = mCG.createAdvance(u8pfx4, 1);
104    PabloAST * u8scope43 = mCG.createAdvance(u8scope42, 1);
[4352]105    PabloAST * NEL = mCG.createAnd(mCG.createAdvance(ccc.compileCC(makeCC(0xC2)), 1), ccc.compileCC(makeCC(0x85)));
106    PabloAST * E2_80 = mCG.createAnd(mCG.createAdvance(ccc.compileCC(makeCC(0xE2)), 1), ccc.compileCC(makeCC(0x80)));
107    PabloAST * LS_PS = mCG.createAnd(mCG.createAdvance(E2_80, 1), ccc.compileCC(makeCC(0xA8,0xA9)));
108    PabloAST * LB_chars = mCG.createOr(LF_VT_FF_CR, mCG.createOr(NEL, LS_PS));
[4347]109    mNonFinal = mCG.createVar(mCG.createAssign(nonfinal, mCG.createOr(mCG.createOr(u8pfx, u8scope32), mCG.createOr(u8scope42, u8scope43))));
[4352]110    mUnicodeLineBreak = mCG.createAnd(LB_chars, mCG.createNot(mCRLF));  // count the CR, but not CRLF
[4357]111#endif
112
113#ifdef USE_IF_FOR_NONFINAL
114    PabloAST * u8single = ccc.compileCC(makeCC(0x00, 0x7F));
115    PabloAST * u8pfx = ccc.compileCC(makeCC(0xC0, 0xFF));
[4404]116    PabloBlock & it = PabloBlock::Create(mCG);
[4357]117    PabloAST * u8pfx2 = ccc.compileCC(makeCC(0xC2, 0xDF), it);
118    PabloAST * u8pfx3 = ccc.compileCC(makeCC(0xE0, 0xEF), it);
119    PabloAST * u8pfx4 = ccc.compileCC(makeCC(0xF0, 0xF4), it);
120    Assign * valid_pfx = it.createAssign("valid_pfx", it.createOr(it.createOr(u8pfx2, u8pfx3), u8pfx4));
[4347]121    PabloAST * u8scope32 = it.createAdvance(u8pfx3, 1);
122    PabloAST * u8scope42 = it.createVar(it.createAssign("u8scope42", it.createAdvance(u8pfx4, 1)));
123    PabloAST * u8scope43 = it.createAdvance(u8scope42, 1);
[4357]124    Assign * a_nonfinal = it.createAssign(nonfinal, it.createOr(it.createOr(u8pfx, u8scope32), it.createOr(u8scope42, u8scope43)));
125    PabloAST * NEL = it.createAnd(it.createAdvance(ccc.compileCC(makeCC(0xC2), it), 1), ccc.compileCC(makeCC(0x85), it));
126    PabloAST * E2_80 = it.createAnd(it.createAdvance(ccc.compileCC(makeCC(0xE2), it), 1), ccc.compileCC(makeCC(0x80), it));
127    PabloAST * LS_PS = it.createAnd(it.createAdvance(E2_80, 1), ccc.compileCC(makeCC(0xA8,0xA9), it));
[4352]128    Assign * NEL_LS_PS = it.createAssign("NEL_LS_PS", it.createOr(NEL, LS_PS));
[4404]129    mCG.createIf(u8pfx, std::move(std::vector<Assign *>{valid_pfx, a_nonfinal, NEL_LS_PS}), it);
[4352]130    PabloAST * LB_chars = mCG.createOr(LF_VT_FF_CR, mCG.createVar(NEL_LS_PS));
[4357]131    mInitial = mCG.createVar(mCG.createAssign(initial, mCG.createOr(u8single, mCG.createVar(valid_pfx))));
132    mNonFinal = mCG.createVar(a_nonfinal);   
[4352]133    mUnicodeLineBreak = mCG.createAnd(LB_chars, mCG.createNot(mCRLF));  // count the CR, but not CRLF
[4280]134    #endif
[4330]135}
[4182]136
[4340]137void RE_Compiler::finalizeMatchResult(MarkerType match_result) {
[4197]138    //These three lines are specifically for grep.
[4340]139    PabloAST * lb = UNICODE_LINE_BREAK ? mUnicodeLineBreak : mLineFeed;
[4347]140    Var * v = markerVar(match_result, mCG);
141    mCG.createAssign("matches", mCG.createAnd(mCG.createMatchStar(v, mCG.createNot(lb)), lb), 0);
142    mCG.createAssign("lf", mCG.createAnd(lb, mCG.createNot(mCRLF)), 1);
[4197]143}
[4405]144
[4340]145MarkerType RE_Compiler::compile(RE * re, PabloBlock & pb) {
[4347]146    return process(re, makePostPositionMarker("start", pb.createOnes(), pb), pb);
[4330]147}
[4405]148
[4283]149PabloAST * RE_Compiler::character_class_strm(Name * name, PabloBlock & pb) {
[4390]150    Var * var = name->getCompiled();
151    if (var != nullptr) return var;
[4283]152    else {
[4390]153        RE * def = name->getDefinition();
154        if (def != nullptr) {
[4347]155            MarkerType m = compile(def, mCG);
156            assert(isFinalPositionMarker(m));
157            Var * v = pb.createVar(markerStream(m, mCG));
158            name -> setCompiled(v);
159            return v;
[4340]160        }
[4390]161        else if (name->getType() == Name::Type::UnicodeProperty) {
162            return pb.createCall(name->getName());
163        }
164        else {
165            throw std::runtime_error("Unresolved name " + name->getName());
166        }
[4283]167    }
168}
[4214]169
[4340]170PabloAST * RE_Compiler::nextUnicodePosition(MarkerType m, PabloBlock & pb) {
171    if (isPostPositionMarker(m)) {
172        return pb.createScanThru(pb.createVar(pb.createAnd(mInitial, markerVar(m, pb))), mNonFinal);
173    }
174    else {
175        //return pb.createAdvanceThenScanThru(pb.createVar(markerVar(m), pb), mNonFinal);
176        return pb.createScanThru(pb.createAnd(mInitial, pb.createAdvance(pb.createVar(markerVar(m, pb)), 1)), mNonFinal);
177    }
178}
179
180MarkerType RE_Compiler::process(RE * re, MarkerType marker, PabloBlock & pb) {
[4197]181    if (Name * name = dyn_cast<Name>(re)) {
[4340]182        return process(name, marker, pb);
[4197]183    }
184    else if (Seq* seq = dyn_cast<Seq>(re)) {
[4340]185        return process(seq, marker, pb);
[4197]186    }
187    else if (Alt * alt = dyn_cast<Alt>(re)) {
[4340]188        return process(alt, marker, pb);
[4197]189    }
190    else if (Rep * rep = dyn_cast<Rep>(re)) {
[4340]191        return process(rep, marker, pb);
[4197]192    }
[4405]193    else if (Assertion * a = dyn_cast<Assertion>(re)) {
194        return process(a, marker, pb);
195    }
[4245]196    else if (isa<Any>(re)) {
[4340]197        PabloAST * nextPos = nextUnicodePosition(marker, pb);
198        PabloAST * dot = pb.createNot(UNICODE_LINE_BREAK ? pb.createOr(mUnicodeLineBreak, mCRLF) : mLineFeed);
199        return makeFinalPositionMarker("dot", pb.createAnd(nextPos, dot), pb);
[4245]200    }
[4255]201    else if (Diff * diff = dyn_cast<Diff>(re)) {
[4340]202        return process(diff, marker, pb);
[4255]203    }
[4308]204    else if (Intersect * ix = dyn_cast<Intersect>(re)) {
[4340]205        return process(ix, marker, pb);
[4308]206    }
[4197]207    else if (isa<Start>(re)) {
[4340]208        if (UNICODE_LINE_BREAK) {
209            PabloAST * line_end = mCG.createOr(mUnicodeLineBreak, mCRLF);
210            PabloAST * sol = pb.createNot(pb.createOr(pb.createAdvance(pb.createNot(line_end), 1), mCRLF));
211            return makePostPositionMarker("sol", pb.createAnd(postPositionVar(marker, pb), sol), pb);
212        }
213        else {
214            PabloAST * sol = pb.createNot(pb.createAdvance(pb.createNot(mLineFeed), 1));
215            return makePostPositionMarker("sol", pb.createAnd(postPositionVar(marker, pb), sol), pb);
216        }
[4197]217    }
218    else if (isa<End>(re)) {
[4340]219        if (UNICODE_LINE_BREAK) {
[4342]220            PabloAST * nextPos = nextUnicodePosition(marker, pb);
221            return makeFinalPositionMarker("end", pb.createAnd(nextPos, mUnicodeLineBreak), pb);
[4340]222        }
223        PabloAST * nextPos = postPositionVar(marker, pb);  // For LF match
224        return makePostPositionMarker("eol", pb.createAnd(nextPos, mLineFeed), pb);
[4197]225    }
[4266]226    return marker;
[4197]227}
[3850]228
[4340]229MarkerType RE_Compiler::process(Name * name, MarkerType marker, PabloBlock & pb) {
230    PabloAST * nextPos = (name->getType() == Name::Type::Byte) ? postPositionVar(marker, pb): nextUnicodePosition(marker, pb);
231    return makeFinalPositionMarker("m", pb.createAnd(nextPos, character_class_strm(name, pb)), pb);
[4197]232}
[4129]233
[4340]234MarkerType RE_Compiler::process(Seq * seq, MarkerType marker, PabloBlock & pb) {
[4197]235    for (RE * re : *seq) {
[4266]236        marker = process(re, marker, pb);
[4197]237    }
[4266]238    return marker;
[4197]239}
[3955]240
[4340]241MarkerType RE_Compiler::process(Alt * alt, MarkerType marker, PabloBlock & pb) {
242    PabloAST * atPosnAccumulator = nullptr;
243    PabloAST * postPosnAccumulator = nullptr;
244    MarkerType const base = marker;
245    // The following may be useful to force a common Advance rather than separate
246    // Advances in each alternative.
247    // MarkerType const base = makePostPositionMarker(postPositionVar(marker, pb), pb);
248    for (RE * re : *alt) {
249        MarkerType rslt = process(re, base, pb);
250        PabloAST * rsltStream = markerVar(rslt, pb); 
251        if (isFinalPositionMarker(rslt)) {
252            atPosnAccumulator = (atPosnAccumulator == nullptr) ? rsltStream : pb.createOr(atPosnAccumulator, rsltStream);
253        }
254        else {
255            postPosnAccumulator = (postPosnAccumulator == nullptr) ? rsltStream : pb.createOr(postPosnAccumulator, rsltStream);
256        }
[4197]257    }
[4340]258    if (postPosnAccumulator == nullptr) {
259        return makeFinalPositionMarker("alt", atPosnAccumulator == nullptr ? pb.createZeroes() : atPosnAccumulator, pb);
260    }
[4197]261    else {
[4340]262        if (atPosnAccumulator != nullptr) {
263            postPosnAccumulator = pb.createOr(postPosnAccumulator, pb.createAdvance(atPosnAccumulator, 1));
[4197]264        }
[4340]265        return makePostPositionMarker("alt", postPosnAccumulator, pb);
266    }
[4197]267}
[3955]268
[4405]269MarkerType RE_Compiler::process(Assertion * a, MarkerType marker, PabloBlock & pb) {
270    throw std::runtime_error("Assertions not implemented.");
271}
272
[4340]273MarkerType RE_Compiler::process(Diff * diff, MarkerType marker, PabloBlock & pb) {
[4255]274    RE * lh = diff->getLH();
275    RE * rh = diff->getRH();
[4393]276    if (isUnicodeUnitLength(lh) && isUnicodeUnitLength(rh)) {
[4340]277        MarkerType t1 = process(lh, marker, pb);
278        MarkerType t2 = process(rh, marker, pb);
279        assert(isFinalPositionMarker(t1) && isFinalPositionMarker(t2));
280        return makeFinalPositionMarker("diff", pb.createAnd(markerVar(t1, pb), pb.createNot(markerVar(t2, pb))), pb);
[4255]281    }
282    throw std::runtime_error("Unsupported Diff operands: " + Printer_RE::PrintRE(diff));
283}
284
[4340]285MarkerType RE_Compiler::process(Intersect * x, MarkerType marker, PabloBlock & pb) {
[4298]286    RE * lh = x->getLH();
287    RE * rh = x->getRH();
[4393]288    if (isUnicodeUnitLength(lh) && isUnicodeUnitLength(rh)) {
[4340]289        MarkerType t1 = process(lh, marker, pb);
290        MarkerType t2 = process(rh, marker, pb);
291        assert(isFinalPositionMarker(t1) && isFinalPositionMarker(t2));
292        return makeFinalPositionMarker("intersect", pb.createAnd(markerVar(t1, pb), markerVar(t2, pb)), pb);
[4298]293    }
294    throw std::runtime_error("Unsupported Intersect operands: " + Printer_RE::PrintRE(x));
295}
296
[4340]297MarkerType RE_Compiler::process(Rep * rep, MarkerType marker, PabloBlock & pb) {
[4261]298    int lb = rep->getLB();
299    int ub = rep->getUB();
300    if (lb > 0) {
[4266]301        marker = processLowerBound(rep->getRE(), lb, marker, pb);
[4208]302    }
[4261]303    if (ub == Rep::UNBOUNDED_REP) {
[4340]304        return processUnboundedRep(rep->getRE(), marker, pb);
[4261]305    }
[4208]306    else { // if (rep->getUB() != Rep::UNBOUNDED_REP)
[4340]307        return processBoundedRep(rep->getRE(), ub - lb, marker, pb);
[4405]308    }
[4208]309}
310
[4261]311/*
312   Given a stream |repeated| marking positions immediately after matches to an item
313   of length |repeated_lgth|, compute a stream marking positions immediately after
314   |repeat_count| consecutive occurrences of such items.
315*/
[4405]316
[4261]317inline Assign * RE_Compiler::consecutive(Assign * repeated, int repeated_lgth, int repeat_count, pablo::PabloBlock & pb) {
[4340]318        int i = repeated_lgth;
319        int total_lgth = repeat_count * repeated_lgth;
320        Assign * consecutive_i = repeated;
321        while (i * 2 < total_lgth) {
[4261]322        PabloAST * v = pb.createVar(consecutive_i);
[4340]323                consecutive_i = pb.createAssign("consecutive", pb.createAnd(v, pb.createAdvance(v, i)));
324                i *= 2;
325        }
326        if (i < total_lgth) {
[4261]327        PabloAST * v = pb.createVar(consecutive_i);
[4340]328                consecutive_i = pb.createAssign("consecutive", pb.createAnd(v, pb.createAdvance(v, total_lgth - i)));
329        }
330        return consecutive_i;
[4261]331}
[4405]332
[4340]333MarkerType RE_Compiler::processLowerBound(RE * repeated, int lb, MarkerType marker, PabloBlock & pb) {
[4393]334    if (isByteLength(repeated)) {
335        PabloAST * cc = markerVar(compile(repeated, pb), pb);
336        Assign * cc_lb = consecutive(pb.createAssign("repeated", pb.createAdvance(cc,1)), 1, lb, pb);
[4340]337        PabloAST * marker_fwd = pb.createAdvance(markerVar(marker, pb), isFinalPositionMarker(marker) ? lb+ 1 : lb);
338        return makePostPositionMarker("lowerbound", pb.createAnd(marker_fwd, pb.createVar(cc_lb)), pb);
339    }
340    // Fall through to general case.
341    while (lb-- != 0) {
342        marker = process(repeated, marker, pb);
343    }
344    return marker;
[4267]345}
[4213]346
[4340]347MarkerType RE_Compiler::processBoundedRep(RE * repeated, int ub, MarkerType marker, PabloBlock & pb) {
[4393]348    if (isByteLength(repeated)) {
[4340]349        // log2 upper bound for fixed length (=1) class
350        // Mask out any positions that are more than ub positions from a current match.
351        // Use matchstar, then apply filter.
352        Assign * nonMatch = pb.createAssign("nonmatch", pb.createNot(postPositionVar(marker, pb)));
353        PabloAST * upperLimitMask = pb.createNot(pb.createVar(consecutive(nonMatch, 1, ub + 1, pb)));
[4393]354        PabloAST * rep_class_var = markerVar(compile(repeated, pb), pb);
[4340]355        return makePostPositionMarker("bounded", pb.createAnd(pb.createMatchStar(postPositionVar(marker, pb), rep_class_var), upperLimitMask), pb);
356    }
357    // Fall through to general case.
358    while (ub-- != 0) {
359        MarkerType a = process(repeated, marker, pb);
360        if (isFinalPositionMarker(a) && isFinalPositionMarker(marker)) {
361            marker = makeFinalPositionMarker("m", pb.createOr(markerVar(marker, pb), markerVar(a, pb)), pb); 
362        }
363        else {
364            marker = makePostPositionMarker("m", pb.createOr(postPositionVar(marker, pb), postPositionVar(a, pb)), pb);
365        }
366    }
367    return marker;
[4261]368}
369
[4340]370MarkerType RE_Compiler::processUnboundedRep(RE * repeated, MarkerType marker, PabloBlock & pb) {
371    // always use PostPosition markers for unbounded repetition.
372    PabloAST * base = postPositionVar(marker, pb);
373   
[4393]374    if (isByteLength(repeated)) {
375        PabloAST * cc = markerVar(compile(repeated, pb), pb);
376        return makePostPositionMarker("unbounded", pb.createMatchStar(base, cc), pb);
[4197]377    }
[4393]378    else if (isUnicodeUnitLength(repeated)) {
379        PabloAST * cc = markerVar(compile(repeated, pb), pb);
[4340]380        return makePostPositionMarker("unbounded", pb.createAnd(pb.createMatchStar(base, pb.createOr(mNonFinal, cc)), mInitial), pb);
[4283]381    }
[4208]382    else {
[4340]383        Assign * whileTest = pb.createAssign("test", base);
384        Assign * whileAccum = pb.createAssign("accum", base);
[4235]385
[4404]386        PabloBlock & wb = PabloBlock::Create(pb);
[4252]387
[4340]388        Var * loopComputation = postPositionVar(process(repeated, wrapPostPositionMarker(whileTest), wb), wb);
[4252]389
[4264]390        Var * whileAccumVar = wb.createVar(whileAccum);
[4252]391
[4264]392        Next * nextWhileTest = wb.createNext(whileTest, wb.createAnd(loopComputation, wb.createNot(whileAccumVar)));
[4252]393
[4264]394        wb.createNext(whileAccum, wb.createOr(loopComputation, whileAccumVar));
[4252]395
[4404]396        pb.createWhile(wb.createVar(nextWhileTest), wb);
[4252]397
[4340]398        return makePostPositionMarker("unbounded", whileAccumVar, pb);
[4211]399    }   
[4340]400} // end of namespace re
[4197]401}
Note: See TracBrowser for help on using the repository browser.