source: icGREP/icgrep-devel/icgrep/re/re_compiler.cpp @ 4411

Last change on this file since 4411 was 4411, checked in by cameron, 5 years ago

Support for single Unicode position lookahead assertions, \b

File size: 18.1 KB
Line 
1/*
2 *  Copyright (c) 2014 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "re_compiler.h"
8//Regular Expressions
9#include <re/re_name.h>
10#include <re/re_any.h>
11#include <re/re_start.h>
12#include <re/re_end.h>
13#include <re/re_alt.h>
14#include <re/re_cc.h>
15#include <re/re_seq.h>
16#include <re/re_rep.h>
17#include <re/re_diff.h>
18#include <re/re_intersect.h>
19#include <re/re_assertion.h>
20#include <re/re_analysis.h>
21#include <cc/cc_namemap.hpp>
22#include <pablo/codegenstate.h>
23
24#include <assert.h>
25#include <stdexcept>
26
27using namespace pablo;
28
29namespace re {
30
31
32RE_Compiler::RE_Compiler(PabloBlock & baseCG, const cc::CC_NameMap & nameMap)
33: mCG(baseCG)
34, mLineFeed(nullptr)
35, mInitial(nullptr)
36, mNonFinal(nullptr)
37{
38
39}
40   
41   
42MarkerType RE_Compiler::AdvanceMarker(MarkerType m, MarkerPosition newpos, PabloBlock & pb) {
43    if (m.pos == newpos) return m;
44    if (m.pos > newpos) throw std::runtime_error("icgrep internal error: attempt to AdvanceMarker backwards");
45    Assign * a = m.stream;
46    if (m.pos == FinalMatchByte) {
47        // Must advance at least to InitialPostPositionByte
48        a = pb.createAssign("adv", pb.createAdvance(a, 1));
49    }
50    // Now at InitialPostPositionByte; is a further advance needed?
51    if (newpos == FinalPostPositionByte) {
52        // Must advance through nonfinal bytes
53        a = pb.createAssign("scanToFinal", pb.createScanThru(pb.createAnd(mInitial, a), mNonFinal));
54    }
55    return {newpos, a};
56}
57
58void RE_Compiler::AlignMarkers(MarkerType & m1, MarkerType & m2, PabloBlock & pb) {
59    if (m1.pos < m2.pos) {
60        m1 = AdvanceMarker(m1, m2.pos, pb); 
61    }
62    else if (m2.pos < m1.pos) {
63        m2 = AdvanceMarker(m2, m1.pos, pb); 
64    }
65}
66   
67   
68
69#define USE_IF_FOR_NONFINAL 1
70#define USE_IF_FOR_CRLF
71#define UNICODE_LINE_BREAK true
72
73
74void RE_Compiler::initializeRequiredStreams(cc::CC_Compiler & ccc) {
75
76    const std::string initial = "initial";
77    const std::string nonfinal = "nonfinal";
78
79    Assign * LF = mCG.createAssign("LF", ccc.compileCC(makeCC(0x0A)));
80    mLineFeed = LF;
81    PabloAST * CR = ccc.compileCC(makeCC(0x0D));
82    PabloAST * LF_VT_FF_CR = ccc.compileCC(makeCC(0x0A, 0x0D));
83#ifndef USE_IF_FOR_CRLF
84    mCRLF = mCG.createAnd(mCG.createAdvance(CR, 1), mLineFeed);
85#else
86    PabloBlock & crb = PabloBlock::Create(mCG);
87    Assign * cr1 = crb.createAssign("cr1", crb.createAdvance(CR, 1));
88    Assign * acrlf = crb.createAssign("crlf", crb.createAnd(cr1, LF));
89    mCG.createIf(CR, std::move(std::vector<Assign *>{acrlf}), crb);
90    mCRLF = acrlf;
91#endif
92
93#ifndef USE_IF_FOR_NONFINAL
94    PabloAST * u8single = ccc.compileCC(makeCC(0x00, 0x7F));
95    PabloAST * u8pfx2 = ccc.compileCC(makeCC(0xC2, 0xDF));
96    PabloAST * u8pfx3 = ccc.compileCC(makeCC(0xE0, 0xEF));
97    PabloAST * u8pfx4 = ccc.compileCC(makeCC(0xF0, 0xF4));
98    PabloAST * u8pfx = mCG.createOr(mCG.createOr(u8pfx2, u8pfx3), u8pfx4);
99    mInitial = mCG.createAssign(initial, mCG.createOr(u8pfx, u8single));
100
101    PabloAST * u8scope32 = mCG.createAdvance(u8pfx3, 1);
102    PabloAST * u8scope42 = mCG.createAdvance(u8pfx4, 1);
103    PabloAST * u8scope43 = mCG.createAdvance(u8scope42, 1);
104    PabloAST * NEL = mCG.createAnd(mCG.createAdvance(ccc.compileCC(makeCC(0xC2)), 1), ccc.compileCC(makeCC(0x85)));
105    PabloAST * E2_80 = mCG.createAnd(mCG.createAdvance(ccc.compileCC(makeCC(0xE2)), 1), ccc.compileCC(makeCC(0x80)));
106    PabloAST * LS_PS = mCG.createAnd(mCG.createAdvance(E2_80, 1), ccc.compileCC(makeCC(0xA8,0xA9)));
107    PabloAST * LB_chars = mCG.createOr(LF_VT_FF_CR, mCG.createOr(NEL, LS_PS));
108    mNonFinal = mCG.createAssign(nonfinal, mCG.createOr(mCG.createOr(u8pfx, u8scope32), mCG.createOr(u8scope42, u8scope43)));
109    mUnicodeLineBreak = mCG.createAnd(LB_chars, mCG.createNot(mCRLF));  // count the CR, but not CRLF
110#endif
111
112#ifdef USE_IF_FOR_NONFINAL
113    PabloAST * u8single = ccc.compileCC(makeCC(0x00, 0x7F));
114    PabloAST * u8pfx = ccc.compileCC(makeCC(0xC0, 0xFF));
115    PabloBlock & it = PabloBlock::Create(mCG);
116    PabloAST * u8pfx2 = ccc.compileCC(makeCC(0xC2, 0xDF), it);
117    PabloAST * u8pfx3 = ccc.compileCC(makeCC(0xE0, 0xEF), it);
118    PabloAST * u8pfx4 = ccc.compileCC(makeCC(0xF0, 0xF4), it);
119    Assign * valid_pfx = it.createAssign("valid_pfx", it.createOr(it.createOr(u8pfx2, u8pfx3), u8pfx4));
120    PabloAST * u8scope32 = it.createAdvance(u8pfx3, 1);
121    PabloAST * u8scope42 = it.createAssign("u8scope42", it.createAdvance(u8pfx4, 1));
122    PabloAST * u8scope43 = it.createAdvance(u8scope42, 1);
123    Assign * a_nonfinal = it.createAssign(nonfinal, it.createOr(it.createOr(u8pfx, u8scope32), it.createOr(u8scope42, u8scope43)));
124    PabloAST * NEL = it.createAnd(it.createAdvance(ccc.compileCC(makeCC(0xC2), it), 1), ccc.compileCC(makeCC(0x85), it));
125    PabloAST * E2_80 = it.createAnd(it.createAdvance(ccc.compileCC(makeCC(0xE2), it), 1), ccc.compileCC(makeCC(0x80), it));
126    PabloAST * LS_PS = it.createAnd(it.createAdvance(E2_80, 1), ccc.compileCC(makeCC(0xA8,0xA9), it));
127    Assign * NEL_LS_PS = it.createAssign("NEL_LS_PS", it.createOr(NEL, LS_PS));
128    mCG.createIf(u8pfx, std::move(std::vector<Assign *>{valid_pfx, a_nonfinal, NEL_LS_PS}), it);
129    PabloAST * LB_chars = mCG.createOr(LF_VT_FF_CR, NEL_LS_PS);
130    mInitial = mCG.createAssign(initial, mCG.createOr(u8single, valid_pfx));
131    mNonFinal = a_nonfinal;
132    mUnicodeLineBreak = mCG.createAnd(LB_chars, mCG.createNot(mCRLF));  // count the CR, but not CRLF
133    #endif
134}
135
136void RE_Compiler::finalizeMatchResult(MarkerType match_result) {
137    //These three lines are specifically for grep.
138    PabloAST * lb = UNICODE_LINE_BREAK ? mUnicodeLineBreak : mLineFeed;
139    Assign * v = markerVar(match_result);
140    mCG.createAssign("matches", mCG.createAnd(mCG.createMatchStar(v, mCG.createNot(lb)), lb), 0);
141    mCG.createAssign("lf", mCG.createAnd(lb, mCG.createNot(mCRLF)), 1);
142}
143
144MarkerType RE_Compiler::compile(RE * re, PabloBlock & pb) {
145    return process(re, makeMarker(InitialPostPositionByte, pb.createAssign("start", pb.createOnes())), pb);
146}
147
148PabloAST * RE_Compiler::character_class_strm(Name * name, PabloBlock & pb) {
149    Assign * var = name->getCompiled();
150    if (var) {
151        return var;
152    }
153    else {
154        RE * def = name->getDefinition();
155        if (def != nullptr) {
156            MarkerType m = compile(def, mCG);
157            assert(markerPos(m) == FinalMatchByte);
158            Assign * v = markerVar(m);
159            name->setCompiled(markerVar(m));
160            return v;
161        }
162        else if (name->getType() == Name::Type::UnicodeProperty) {
163            return pb.createCall(name->getName());
164        }
165        else {
166            throw std::runtime_error("Unresolved name " + name->getName());
167        }
168    }
169}
170
171PabloAST * RE_Compiler::nextUnicodePosition(MarkerType m, PabloBlock & pb) {
172    if (markerPos(m) == FinalPostPositionByte) {
173        return markerVar(m);
174    }
175    else if (markerPos(m) == InitialPostPositionByte) {
176        return pb.createScanThru(pb.createAnd(mInitial, markerVar(m)), mNonFinal);
177    }
178    else {
179        //return pb.createAdvanceThenScanThru(pb.createVar(markerVar(m), pb), mNonFinal);
180        return pb.createScanThru(pb.createAnd(mInitial, pb.createAdvance(markerVar(m), 1)), mNonFinal);
181    }
182}
183
184MarkerType RE_Compiler::process(RE * re, MarkerType marker, PabloBlock & pb) {
185    if (Name * name = dyn_cast<Name>(re)) {
186        return process(name, marker, pb);
187    }
188    else if (Seq* seq = dyn_cast<Seq>(re)) {
189        return process(seq, marker, pb);
190    }
191    else if (Alt * alt = dyn_cast<Alt>(re)) {
192        return process(alt, marker, pb);
193    }
194    else if (Rep * rep = dyn_cast<Rep>(re)) {
195        return process(rep, marker, pb);
196    }
197    else if (Assertion * a = dyn_cast<Assertion>(re)) {
198        return process(a, marker, pb);
199    }
200    else if (isa<Any>(re)) {
201        PabloAST * nextPos = nextUnicodePosition(marker, pb);
202        PabloAST * dot = pb.createNot(UNICODE_LINE_BREAK ? pb.createOr(mUnicodeLineBreak, mCRLF) : mLineFeed);
203        return makeMarker(FinalMatchByte, pb.createAssign("dot", pb.createAnd(nextPos, dot)));
204    }
205    else if (Diff * diff = dyn_cast<Diff>(re)) {
206        return process(diff, marker, pb);
207    }
208    else if (Intersect * ix = dyn_cast<Intersect>(re)) {
209        return process(ix, marker, pb);
210    }
211    else if (isa<Start>(re)) {
212        MarkerType m = AdvanceMarker(marker, InitialPostPositionByte, pb);
213        if (UNICODE_LINE_BREAK) {
214            PabloAST * line_end = mCG.createOr(mUnicodeLineBreak, mCRLF);
215            PabloAST * sol = pb.createNot(pb.createOr(pb.createAdvance(pb.createNot(line_end), 1), mCRLF));
216            return makeMarker(InitialPostPositionByte, pb.createAssign("sol", pb.createAnd(markerVar(m), sol)));
217        }
218        else {
219            PabloAST * sol = pb.createNot(pb.createAdvance(pb.createNot(mLineFeed), 1));
220            return makeMarker(InitialPostPositionByte, pb.createAssign("sol", pb.createAnd(markerVar(m), sol)));
221        }
222    }
223    else if (isa<End>(re)) {
224        if (UNICODE_LINE_BREAK) {
225            PabloAST * nextPos = nextUnicodePosition(marker, pb);
226            return makeMarker(FinalPostPositionByte, pb.createAssign("end", pb.createAnd(nextPos, mUnicodeLineBreak)));
227        }
228        PabloAST * nextPos = markerVar(AdvanceMarker(marker, InitialPostPositionByte, pb));  // For LF match
229        return makeMarker(InitialPostPositionByte, pb.createAssign("eol", pb.createAnd(nextPos, mLineFeed)));
230    }
231    return marker;
232}
233
234MarkerType RE_Compiler::process(Name * name, MarkerType marker, PabloBlock & pb) {
235    MarkerType nextPos;
236    if (markerPos(marker) == FinalPostPositionByte) nextPos = marker;
237    else if (name->getType() == Name::Type::Byte) {
238        nextPos = AdvanceMarker(marker, InitialPostPositionByte, pb);
239    }
240    else {
241        nextPos = AdvanceMarker(marker, FinalPostPositionByte, pb);
242    }
243    return makeMarker(FinalMatchByte, pb.createAssign("m", pb.createAnd(markerVar(nextPos), character_class_strm(name, pb))));
244}
245
246MarkerType RE_Compiler::process(Seq * seq, MarkerType marker, PabloBlock & pb) {
247    for (RE * re : *seq) {
248        marker = process(re, marker, pb);
249    }
250    return marker;
251}
252
253MarkerType RE_Compiler::process(Alt * alt, MarkerType marker, PabloBlock & pb) {
254    std::vector<PabloAST *>  accum = {pb.createZeroes(), pb.createZeroes(), pb.createZeroes()};
255    MarkerType const base = marker;
256    // The following may be useful to force a common Advance rather than separate
257    // Advances in each alternative.
258    // MarkerType const base = makeMarker(InitialPostPositionByte, postPositionVar(marker, pb), pb);
259    for (RE * re : *alt) {
260        MarkerType rslt = process(re, base, pb);
261        MarkerPosition p = markerPos(rslt);
262        accum[p] = pb.createOr(accum[p], markerVar(rslt));
263    }
264    if (isa<Zeroes>(accum[InitialPostPositionByte]) && isa<Zeroes>(accum[FinalPostPositionByte])) {
265        return makeMarker(FinalMatchByte, pb.createAssign("alt", accum[FinalMatchByte]));
266    }
267    PabloAST * combine = pb.createOr(accum[InitialPostPositionByte], pb.createAdvance(accum[FinalMatchByte], 1));
268    if (isa<Zeroes>(accum[FinalPostPositionByte])) {
269        return makeMarker(InitialPostPositionByte, pb.createAssign("alt", combine));
270    }
271    combine = pb.createOr(pb.createScanThru(pb.createAnd(mInitial, combine), mNonFinal), accum[FinalPostPositionByte]);
272    return makeMarker(FinalPostPositionByte, pb.createAssign("alt", combine));
273}
274
275MarkerType RE_Compiler::process(Assertion * a, MarkerType marker, PabloBlock & pb) {
276    RE * asserted = a->getAsserted();
277    if (a->getKind() == Assertion::Kind::Lookbehind) {
278        MarkerType m = marker;
279        MarkerType lookback = compile(asserted, pb);
280        AlignMarkers(m, lookback, pb);
281        Assign * lb = markerVar(lookback);
282        if (a->getSense() == Assertion::Sense::Negative) {
283            lb = pb.createAssign("not", pb.createNot(lb));
284        }
285        return makeMarker(markerPos(m), pb.createAssign("lookback", pb.createAnd(markerVar(marker), lb)));
286    }
287    else if (isUnicodeUnitLength(asserted)) {
288        MarkerType lookahead = compile(asserted, pb);
289        assert(markerPos(lookahead) == FinalMatchByte);
290        Assign * la = markerVar(lookahead);
291        if (a->getSense() == Assertion::Sense::Negative) {
292            la = pb.createAssign("not", pb.createNot(la));
293        }
294        MarkerType fbyte = AdvanceMarker(marker, FinalPostPositionByte, pb);
295        return makeMarker(FinalPostPositionByte, pb.createAssign("lookahead", pb.createAnd(markerVar(fbyte), la)));
296    }
297    else {
298        throw std::runtime_error("Unsupported lookahead assertion.");
299    }
300}
301
302MarkerType RE_Compiler::process(Diff * diff, MarkerType marker, PabloBlock & pb) {
303    RE * lh = diff->getLH();
304    RE * rh = diff->getRH();
305    if (isUnicodeUnitLength(lh) && isUnicodeUnitLength(rh)) {
306        MarkerType t1 = process(lh, marker, pb);
307        MarkerType t2 = process(rh, marker, pb);
308        AlignMarkers(t1, t2, pb);
309        return makeMarker(markerPos(t1), pb.createAssign("diff", pb.createAnd(markerVar(t1), pb.createNot(markerVar(t2)))));
310    }
311    throw std::runtime_error("Unsupported Diff operands: " + Printer_RE::PrintRE(diff));
312}
313
314MarkerType RE_Compiler::process(Intersect * x, MarkerType marker, PabloBlock & pb) {
315    RE * lh = x->getLH();
316    RE * rh = x->getRH();
317    if (isUnicodeUnitLength(lh) && isUnicodeUnitLength(rh)) {
318        MarkerType t1 = process(lh, marker, pb);
319        MarkerType t2 = process(rh, marker, pb);
320        AlignMarkers(t1, t2, pb);
321        return makeMarker(markerPos(t1), pb.createAssign("intersect", pb.createAnd(markerVar(t1), markerVar(t2))));
322    }
323    throw std::runtime_error("Unsupported Intersect operands: " + Printer_RE::PrintRE(x));
324}
325
326MarkerType RE_Compiler::process(Rep * rep, MarkerType marker, PabloBlock & pb) {
327    int lb = rep->getLB();
328    int ub = rep->getUB();
329    if (lb > 0) {
330        marker = processLowerBound(rep->getRE(), lb, marker, pb);
331    }
332    if (ub == Rep::UNBOUNDED_REP) {
333        return processUnboundedRep(rep->getRE(), marker, pb);
334    }
335    else { // if (rep->getUB() != Rep::UNBOUNDED_REP)
336        return processBoundedRep(rep->getRE(), ub - lb, marker, pb);
337    }
338}
339
340/*
341   Given a stream |repeated| marking positions immediately after matches to an item
342   of length |repeated_lgth|, compute a stream marking positions immediately after
343   |repeat_count| consecutive occurrences of such items.
344*/
345
346inline Assign * RE_Compiler::consecutive(Assign * repeated, int repeated_lgth, int repeat_count, pablo::PabloBlock & pb) {
347        int i = repeated_lgth;
348        int total_lgth = repeat_count * repeated_lgth;
349        Assign * consecutive_i = repeated;
350        while (i * 2 < total_lgth) {
351            PabloAST * v = consecutive_i;
352            consecutive_i = pb.createAssign("consecutive", pb.createAnd(v, pb.createAdvance(v, i)));
353            i *= 2;
354        }       
355        if (i < total_lgth) {
356            PabloAST * v = consecutive_i;
357            consecutive_i = pb.createAssign("consecutive", pb.createAnd(v, pb.createAdvance(v, total_lgth - i)));
358        }
359        return consecutive_i;
360}
361
362MarkerType RE_Compiler::processLowerBound(RE * repeated, int lb, MarkerType marker, PabloBlock & pb) {
363    if (isByteLength(repeated)) {
364        PabloAST * cc = markerVar(compile(repeated, pb));
365        Assign * cc_lb = consecutive(pb.createAssign("repeated", pb.createAdvance(cc,1)), 1, lb, pb);
366        PabloAST * marker_fwd = pb.createAdvance(markerVar(marker), markerPos(marker) == FinalMatchByte ? lb + 1 : lb);
367        return makeMarker(InitialPostPositionByte, pb.createAssign("lowerbound", pb.createAnd(marker_fwd, cc_lb)));
368    }
369    // Fall through to general case.
370    while (lb-- != 0) {
371        marker = process(repeated, marker, pb);
372    }
373    return marker;
374}
375
376MarkerType RE_Compiler::processBoundedRep(RE * repeated, int ub, MarkerType marker, PabloBlock & pb) {
377    if (isByteLength(repeated)) {
378        // log2 upper bound for fixed length (=1) class
379        // Mask out any positions that are more than ub positions from a current match.
380        // Use matchstar, then apply filter.
381        Assign * nonMatch = pb.createAssign("nonmatch", pb.createNot(markerVar(AdvanceMarker(marker, InitialPostPositionByte, pb))));
382        PabloAST * upperLimitMask = pb.createNot(consecutive(nonMatch, 1, ub + 1, pb));
383        PabloAST * cursor = markerVar(AdvanceMarker(marker, InitialPostPositionByte, pb));
384        PabloAST * rep_class_var = markerVar(compile(repeated, pb));
385        return makeMarker(InitialPostPositionByte, pb.createAssign("bounded", pb.createAnd(pb.createMatchStar(cursor, rep_class_var), upperLimitMask)));
386    }
387    // Fall through to general case.
388    while (ub-- != 0) {
389        MarkerType a = process(repeated, marker, pb);
390        MarkerType m = marker;
391        AlignMarkers(a, m, pb);
392        marker = makeMarker(markerPos(a), pb.createAssign("m", pb.createOr(markerVar(a), markerVar(m))));
393    }
394    return marker;
395}
396
397MarkerType RE_Compiler::processUnboundedRep(RE * repeated, MarkerType marker, PabloBlock & pb) {
398    // always use PostPosition markers for unbounded repetition.
399    PabloAST * base = markerVar(AdvanceMarker(marker, InitialPostPositionByte, pb));
400   
401    if (isByteLength(repeated)) {
402        PabloAST * cc = markerVar(compile(repeated, pb)); 
403        return makeMarker(InitialPostPositionByte, pb.createAssign("unbounded", pb.createMatchStar(base, cc)));
404    }
405    else if (isUnicodeUnitLength(repeated)) {
406        PabloAST * cc = markerVar(compile(repeated, pb));
407        return makeMarker(InitialPostPositionByte, pb.createAssign("unbounded", pb.createAnd(pb.createMatchStar(base, pb.createOr(mNonFinal, cc)), mInitial)));
408    }
409    else {
410        Assign * whileTest = pb.createAssign("test", base);
411        Assign * whileAccum = pb.createAssign("accum", base);
412
413        PabloBlock & wb = PabloBlock::Create(pb);
414
415        PabloAST * loopComputation = markerVar(AdvanceMarker(process(repeated, makeMarker(InitialPostPositionByte, whileTest), wb), InitialPostPositionByte, wb));
416        Next * nextWhileTest = wb.createNext(whileTest, wb.createAnd(loopComputation, wb.createNot(whileAccum)));
417        wb.createNext(whileAccum, wb.createOr(loopComputation, whileAccum));
418
419        pb.createWhile(nextWhileTest, wb);
420
421        return makeMarker(InitialPostPositionByte, pb.createAssign("unbounded", whileAccum));
422    }   
423} // end of namespace re
424}
Note: See TracBrowser for help on using the repository browser.