source: icGREP/icgrep-devel/icgrep/re/re_compiler.cpp @ 4405

Last change on this file since 4405 was 4405, checked in by cameron, 4 years ago

AST support for Lookahead/Lookbehind? assertions

File size: 16.7 KB
Line 
1/*
2 *  Copyright (c) 2014 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "re_compiler.h"
8//Regular Expressions
9#include <re/re_name.h>
10#include <re/re_any.h>
11#include <re/re_start.h>
12#include <re/re_end.h>
13#include <re/re_alt.h>
14#include <re/re_cc.h>
15#include <re/re_seq.h>
16#include <re/re_rep.h>
17#include <re/re_diff.h>
18#include <re/re_intersect.h>
19#include <re/re_assertion.h>
20#include <re/re_analysis.h>
21#include <cc/cc_namemap.hpp>
22#include <pablo/codegenstate.h>
23
24#include <assert.h>
25#include <stdexcept>
26
27using namespace pablo;
28
29namespace re {
30
31MarkerType makePostPositionMarker(std::string marker_name, PabloAST * s, PabloBlock & pb) {
32    return MarkerType{PostPosition, pb.createAssign(marker_name, s)};
33}
34
35MarkerType wrapPostPositionMarker(Assign * s) {
36    return MarkerType{PostPosition, s};
37}
38
39MarkerType makeFinalPositionMarker(std::string marker_name, PabloAST * s, PabloBlock & pb) {
40    return MarkerType{FinalByte, pb.createAssign(marker_name, s)};
41}
42
43Assign * markerStream(MarkerType m, PabloBlock & pb) {
44    return m.stream;
45}
46
47Var * markerVar(MarkerType m, PabloBlock & pb) {
48    return pb.createVar(m.stream);
49}
50
51Var * postPositionVar(MarkerType m, PabloBlock & pb) {
52    if (isFinalPositionMarker(m)) return pb.createVar(pb.createAssign("f", pb.createAdvance(pb.createVar(m.stream), 1)));
53    else return pb.createVar(m.stream);
54}
55
56//Set the 'internal.nonfinal' bit stream for the utf-8 multi-byte encoding.
57//#define USE_IF_FOR_NONFINAL
58
59
60
61RE_Compiler::RE_Compiler(PabloBlock & baseCG, const cc::CC_NameMap & nameMap)
62: mCG(baseCG)
63, mLineFeed(nullptr)
64, mInitial(nullptr)
65, mNonFinal(nullptr)
66{
67
68}
69
70#define USE_IF_FOR_NONFINAL 1
71#define USE_IF_FOR_CRLF
72#define UNICODE_LINE_BREAK true
73
74
75void RE_Compiler::initializeRequiredStreams(cc::CC_Compiler & ccc) {
76
77    const std::string initial = "initial";
78    const std::string nonfinal = "nonfinal";
79
80    Assign * LF = mCG.createAssign("LF", ccc.compileCC(makeCC(0x0A)));
81    mLineFeed = mCG.createVar(LF);
82    PabloAST * CR = ccc.compileCC(makeCC(0x0D));
83    PabloAST * LF_VT_FF_CR = ccc.compileCC(makeCC(0x0A, 0x0D));
84#ifndef USE_IF_FOR_CRLF
85    mCRLF = mCG.createAnd(mCG.createAdvance(CR, 1), mLineFeed);
86#else
87    PabloBlock & crb = PabloBlock::Create(mCG);
88    Assign * cr1 = crb.createAssign("cr1", crb.createAdvance(CR, 1));
89    Assign * acrlf = crb.createAssign("crlf", crb.createAnd(crb.createVar(cr1), crb.createVar(LF)));
90    mCG.createIf(CR, std::move(std::vector<Assign *>{acrlf}), crb);
91    mCRLF = mCG.createVar(acrlf);
92#endif
93
94#ifndef USE_IF_FOR_NONFINAL
95    PabloAST * u8single = ccc.compileCC(makeCC(0x00, 0x7F));
96    PabloAST * u8pfx2 = ccc.compileCC(makeCC(0xC2, 0xDF));
97    PabloAST * u8pfx3 = ccc.compileCC(makeCC(0xE0, 0xEF));
98    PabloAST * u8pfx4 = ccc.compileCC(makeCC(0xF0, 0xF4));
99    PabloAST * u8pfx = mCG.createOr(mCG.createOr(u8pfx2, u8pfx3), u8pfx4);
100    mInitial = mCG.createVar(mCG.createAssign(initial, mCG.createOr(u8pfx, u8single)));
101
102    PabloAST * u8scope32 = mCG.createAdvance(u8pfx3, 1);
103    PabloAST * u8scope42 = mCG.createAdvance(u8pfx4, 1);
104    PabloAST * u8scope43 = mCG.createAdvance(u8scope42, 1);
105    PabloAST * NEL = mCG.createAnd(mCG.createAdvance(ccc.compileCC(makeCC(0xC2)), 1), ccc.compileCC(makeCC(0x85)));
106    PabloAST * E2_80 = mCG.createAnd(mCG.createAdvance(ccc.compileCC(makeCC(0xE2)), 1), ccc.compileCC(makeCC(0x80)));
107    PabloAST * LS_PS = mCG.createAnd(mCG.createAdvance(E2_80, 1), ccc.compileCC(makeCC(0xA8,0xA9)));
108    PabloAST * LB_chars = mCG.createOr(LF_VT_FF_CR, mCG.createOr(NEL, LS_PS));
109    mNonFinal = mCG.createVar(mCG.createAssign(nonfinal, mCG.createOr(mCG.createOr(u8pfx, u8scope32), mCG.createOr(u8scope42, u8scope43))));
110    mUnicodeLineBreak = mCG.createAnd(LB_chars, mCG.createNot(mCRLF));  // count the CR, but not CRLF
111#endif
112
113#ifdef USE_IF_FOR_NONFINAL
114    PabloAST * u8single = ccc.compileCC(makeCC(0x00, 0x7F));
115    PabloAST * u8pfx = ccc.compileCC(makeCC(0xC0, 0xFF));
116    PabloBlock & it = PabloBlock::Create(mCG);
117    PabloAST * u8pfx2 = ccc.compileCC(makeCC(0xC2, 0xDF), it);
118    PabloAST * u8pfx3 = ccc.compileCC(makeCC(0xE0, 0xEF), it);
119    PabloAST * u8pfx4 = ccc.compileCC(makeCC(0xF0, 0xF4), it);
120    Assign * valid_pfx = it.createAssign("valid_pfx", it.createOr(it.createOr(u8pfx2, u8pfx3), u8pfx4));
121    PabloAST * u8scope32 = it.createAdvance(u8pfx3, 1);
122    PabloAST * u8scope42 = it.createVar(it.createAssign("u8scope42", it.createAdvance(u8pfx4, 1)));
123    PabloAST * u8scope43 = it.createAdvance(u8scope42, 1);
124    Assign * a_nonfinal = it.createAssign(nonfinal, it.createOr(it.createOr(u8pfx, u8scope32), it.createOr(u8scope42, u8scope43)));
125    PabloAST * NEL = it.createAnd(it.createAdvance(ccc.compileCC(makeCC(0xC2), it), 1), ccc.compileCC(makeCC(0x85), it));
126    PabloAST * E2_80 = it.createAnd(it.createAdvance(ccc.compileCC(makeCC(0xE2), it), 1), ccc.compileCC(makeCC(0x80), it));
127    PabloAST * LS_PS = it.createAnd(it.createAdvance(E2_80, 1), ccc.compileCC(makeCC(0xA8,0xA9), it));
128    Assign * NEL_LS_PS = it.createAssign("NEL_LS_PS", it.createOr(NEL, LS_PS));
129    mCG.createIf(u8pfx, std::move(std::vector<Assign *>{valid_pfx, a_nonfinal, NEL_LS_PS}), it);
130    PabloAST * LB_chars = mCG.createOr(LF_VT_FF_CR, mCG.createVar(NEL_LS_PS));
131    mInitial = mCG.createVar(mCG.createAssign(initial, mCG.createOr(u8single, mCG.createVar(valid_pfx))));
132    mNonFinal = mCG.createVar(a_nonfinal);   
133    mUnicodeLineBreak = mCG.createAnd(LB_chars, mCG.createNot(mCRLF));  // count the CR, but not CRLF
134    #endif
135}
136
137void RE_Compiler::finalizeMatchResult(MarkerType match_result) {
138    //These three lines are specifically for grep.
139    PabloAST * lb = UNICODE_LINE_BREAK ? mUnicodeLineBreak : mLineFeed;
140    Var * v = markerVar(match_result, mCG);
141    mCG.createAssign("matches", mCG.createAnd(mCG.createMatchStar(v, mCG.createNot(lb)), lb), 0);
142    mCG.createAssign("lf", mCG.createAnd(lb, mCG.createNot(mCRLF)), 1);
143}
144
145MarkerType RE_Compiler::compile(RE * re, PabloBlock & pb) {
146    return process(re, makePostPositionMarker("start", pb.createOnes(), pb), pb);
147}
148
149PabloAST * RE_Compiler::character_class_strm(Name * name, PabloBlock & pb) {
150    Var * var = name->getCompiled();
151    if (var != nullptr) return var;
152    else {
153        RE * def = name->getDefinition();
154        if (def != nullptr) {
155            MarkerType m = compile(def, mCG);
156            assert(isFinalPositionMarker(m));
157            Var * v = pb.createVar(markerStream(m, mCG));
158            name -> setCompiled(v);
159            return v;
160        }
161        else if (name->getType() == Name::Type::UnicodeProperty) {
162            return pb.createCall(name->getName());
163        }
164        else {
165            throw std::runtime_error("Unresolved name " + name->getName());
166        }
167    }
168}
169
170PabloAST * RE_Compiler::nextUnicodePosition(MarkerType m, PabloBlock & pb) {
171    if (isPostPositionMarker(m)) {
172        return pb.createScanThru(pb.createVar(pb.createAnd(mInitial, markerVar(m, pb))), mNonFinal);
173    }
174    else {
175        //return pb.createAdvanceThenScanThru(pb.createVar(markerVar(m), pb), mNonFinal);
176        return pb.createScanThru(pb.createAnd(mInitial, pb.createAdvance(pb.createVar(markerVar(m, pb)), 1)), mNonFinal);
177    }
178}
179
180MarkerType RE_Compiler::process(RE * re, MarkerType marker, PabloBlock & pb) {
181    if (Name * name = dyn_cast<Name>(re)) {
182        return process(name, marker, pb);
183    }
184    else if (Seq* seq = dyn_cast<Seq>(re)) {
185        return process(seq, marker, pb);
186    }
187    else if (Alt * alt = dyn_cast<Alt>(re)) {
188        return process(alt, marker, pb);
189    }
190    else if (Rep * rep = dyn_cast<Rep>(re)) {
191        return process(rep, marker, pb);
192    }
193    else if (Assertion * a = dyn_cast<Assertion>(re)) {
194        return process(a, marker, pb);
195    }
196    else if (isa<Any>(re)) {
197        PabloAST * nextPos = nextUnicodePosition(marker, pb);
198        PabloAST * dot = pb.createNot(UNICODE_LINE_BREAK ? pb.createOr(mUnicodeLineBreak, mCRLF) : mLineFeed);
199        return makeFinalPositionMarker("dot", pb.createAnd(nextPos, dot), pb);
200    }
201    else if (Diff * diff = dyn_cast<Diff>(re)) {
202        return process(diff, marker, pb);
203    }
204    else if (Intersect * ix = dyn_cast<Intersect>(re)) {
205        return process(ix, marker, pb);
206    }
207    else if (isa<Start>(re)) {
208        if (UNICODE_LINE_BREAK) {
209            PabloAST * line_end = mCG.createOr(mUnicodeLineBreak, mCRLF);
210            PabloAST * sol = pb.createNot(pb.createOr(pb.createAdvance(pb.createNot(line_end), 1), mCRLF));
211            return makePostPositionMarker("sol", pb.createAnd(postPositionVar(marker, pb), sol), pb);
212        }
213        else {
214            PabloAST * sol = pb.createNot(pb.createAdvance(pb.createNot(mLineFeed), 1));
215            return makePostPositionMarker("sol", pb.createAnd(postPositionVar(marker, pb), sol), pb);
216        }
217    }
218    else if (isa<End>(re)) {
219        if (UNICODE_LINE_BREAK) {
220            PabloAST * nextPos = nextUnicodePosition(marker, pb);
221            return makeFinalPositionMarker("end", pb.createAnd(nextPos, mUnicodeLineBreak), pb);
222        }
223        PabloAST * nextPos = postPositionVar(marker, pb);  // For LF match
224        return makePostPositionMarker("eol", pb.createAnd(nextPos, mLineFeed), pb);
225    }
226    return marker;
227}
228
229MarkerType RE_Compiler::process(Name * name, MarkerType marker, PabloBlock & pb) {
230    PabloAST * nextPos = (name->getType() == Name::Type::Byte) ? postPositionVar(marker, pb): nextUnicodePosition(marker, pb);
231    return makeFinalPositionMarker("m", pb.createAnd(nextPos, character_class_strm(name, pb)), pb);
232}
233
234MarkerType RE_Compiler::process(Seq * seq, MarkerType marker, PabloBlock & pb) {
235    for (RE * re : *seq) {
236        marker = process(re, marker, pb);
237    }
238    return marker;
239}
240
241MarkerType RE_Compiler::process(Alt * alt, MarkerType marker, PabloBlock & pb) {
242    PabloAST * atPosnAccumulator = nullptr;
243    PabloAST * postPosnAccumulator = nullptr;
244    MarkerType const base = marker;
245    // The following may be useful to force a common Advance rather than separate
246    // Advances in each alternative.
247    // MarkerType const base = makePostPositionMarker(postPositionVar(marker, pb), pb);
248    for (RE * re : *alt) {
249        MarkerType rslt = process(re, base, pb);
250        PabloAST * rsltStream = markerVar(rslt, pb); 
251        if (isFinalPositionMarker(rslt)) {
252            atPosnAccumulator = (atPosnAccumulator == nullptr) ? rsltStream : pb.createOr(atPosnAccumulator, rsltStream);
253        }
254        else {
255            postPosnAccumulator = (postPosnAccumulator == nullptr) ? rsltStream : pb.createOr(postPosnAccumulator, rsltStream);
256        }
257    }
258    if (postPosnAccumulator == nullptr) {
259        return makeFinalPositionMarker("alt", atPosnAccumulator == nullptr ? pb.createZeroes() : atPosnAccumulator, pb);
260    }
261    else {
262        if (atPosnAccumulator != nullptr) {
263            postPosnAccumulator = pb.createOr(postPosnAccumulator, pb.createAdvance(atPosnAccumulator, 1));
264        }
265        return makePostPositionMarker("alt", postPosnAccumulator, pb);
266    }
267}
268
269MarkerType RE_Compiler::process(Assertion * a, MarkerType marker, PabloBlock & pb) {
270    throw std::runtime_error("Assertions not implemented.");
271}
272
273MarkerType RE_Compiler::process(Diff * diff, MarkerType marker, PabloBlock & pb) {
274    RE * lh = diff->getLH();
275    RE * rh = diff->getRH();
276    if (isUnicodeUnitLength(lh) && isUnicodeUnitLength(rh)) {
277        MarkerType t1 = process(lh, marker, pb);
278        MarkerType t2 = process(rh, marker, pb);
279        assert(isFinalPositionMarker(t1) && isFinalPositionMarker(t2));
280        return makeFinalPositionMarker("diff", pb.createAnd(markerVar(t1, pb), pb.createNot(markerVar(t2, pb))), pb);
281    }
282    throw std::runtime_error("Unsupported Diff operands: " + Printer_RE::PrintRE(diff));
283}
284
285MarkerType RE_Compiler::process(Intersect * x, MarkerType marker, PabloBlock & pb) {
286    RE * lh = x->getLH();
287    RE * rh = x->getRH();
288    if (isUnicodeUnitLength(lh) && isUnicodeUnitLength(rh)) {
289        MarkerType t1 = process(lh, marker, pb);
290        MarkerType t2 = process(rh, marker, pb);
291        assert(isFinalPositionMarker(t1) && isFinalPositionMarker(t2));
292        return makeFinalPositionMarker("intersect", pb.createAnd(markerVar(t1, pb), markerVar(t2, pb)), pb);
293    }
294    throw std::runtime_error("Unsupported Intersect operands: " + Printer_RE::PrintRE(x));
295}
296
297MarkerType RE_Compiler::process(Rep * rep, MarkerType marker, PabloBlock & pb) {
298    int lb = rep->getLB();
299    int ub = rep->getUB();
300    if (lb > 0) {
301        marker = processLowerBound(rep->getRE(), lb, marker, pb);
302    }
303    if (ub == Rep::UNBOUNDED_REP) {
304        return processUnboundedRep(rep->getRE(), marker, pb);
305    }
306    else { // if (rep->getUB() != Rep::UNBOUNDED_REP)
307        return processBoundedRep(rep->getRE(), ub - lb, marker, pb);
308    }
309}
310
311/*
312   Given a stream |repeated| marking positions immediately after matches to an item
313   of length |repeated_lgth|, compute a stream marking positions immediately after
314   |repeat_count| consecutive occurrences of such items.
315*/
316
317inline Assign * RE_Compiler::consecutive(Assign * repeated, int repeated_lgth, int repeat_count, pablo::PabloBlock & pb) {
318        int i = repeated_lgth;
319        int total_lgth = repeat_count * repeated_lgth;
320        Assign * consecutive_i = repeated;
321        while (i * 2 < total_lgth) {
322        PabloAST * v = pb.createVar(consecutive_i);
323                consecutive_i = pb.createAssign("consecutive", pb.createAnd(v, pb.createAdvance(v, i)));
324                i *= 2;
325        }
326        if (i < total_lgth) {
327        PabloAST * v = pb.createVar(consecutive_i);
328                consecutive_i = pb.createAssign("consecutive", pb.createAnd(v, pb.createAdvance(v, total_lgth - i)));
329        }
330        return consecutive_i;
331}
332
333MarkerType RE_Compiler::processLowerBound(RE * repeated, int lb, MarkerType marker, PabloBlock & pb) {
334    if (isByteLength(repeated)) {
335        PabloAST * cc = markerVar(compile(repeated, pb), pb);
336        Assign * cc_lb = consecutive(pb.createAssign("repeated", pb.createAdvance(cc,1)), 1, lb, pb);
337        PabloAST * marker_fwd = pb.createAdvance(markerVar(marker, pb), isFinalPositionMarker(marker) ? lb+ 1 : lb);
338        return makePostPositionMarker("lowerbound", pb.createAnd(marker_fwd, pb.createVar(cc_lb)), pb);
339    }
340    // Fall through to general case.
341    while (lb-- != 0) {
342        marker = process(repeated, marker, pb);
343    }
344    return marker;
345}
346
347MarkerType RE_Compiler::processBoundedRep(RE * repeated, int ub, MarkerType marker, PabloBlock & pb) {
348    if (isByteLength(repeated)) {
349        // log2 upper bound for fixed length (=1) class
350        // Mask out any positions that are more than ub positions from a current match.
351        // Use matchstar, then apply filter.
352        Assign * nonMatch = pb.createAssign("nonmatch", pb.createNot(postPositionVar(marker, pb)));
353        PabloAST * upperLimitMask = pb.createNot(pb.createVar(consecutive(nonMatch, 1, ub + 1, pb)));
354        PabloAST * rep_class_var = markerVar(compile(repeated, pb), pb);
355        return makePostPositionMarker("bounded", pb.createAnd(pb.createMatchStar(postPositionVar(marker, pb), rep_class_var), upperLimitMask), pb);
356    }
357    // Fall through to general case.
358    while (ub-- != 0) {
359        MarkerType a = process(repeated, marker, pb);
360        if (isFinalPositionMarker(a) && isFinalPositionMarker(marker)) {
361            marker = makeFinalPositionMarker("m", pb.createOr(markerVar(marker, pb), markerVar(a, pb)), pb); 
362        }
363        else {
364            marker = makePostPositionMarker("m", pb.createOr(postPositionVar(marker, pb), postPositionVar(a, pb)), pb);
365        }
366    }
367    return marker;
368}
369
370MarkerType RE_Compiler::processUnboundedRep(RE * repeated, MarkerType marker, PabloBlock & pb) {
371    // always use PostPosition markers for unbounded repetition.
372    PabloAST * base = postPositionVar(marker, pb);
373   
374    if (isByteLength(repeated)) {
375        PabloAST * cc = markerVar(compile(repeated, pb), pb);
376        return makePostPositionMarker("unbounded", pb.createMatchStar(base, cc), pb);
377    }
378    else if (isUnicodeUnitLength(repeated)) {
379        PabloAST * cc = markerVar(compile(repeated, pb), pb);
380        return makePostPositionMarker("unbounded", pb.createAnd(pb.createMatchStar(base, pb.createOr(mNonFinal, cc)), mInitial), pb);
381    }
382    else {
383        Assign * whileTest = pb.createAssign("test", base);
384        Assign * whileAccum = pb.createAssign("accum", base);
385
386        PabloBlock & wb = PabloBlock::Create(pb);
387
388        Var * loopComputation = postPositionVar(process(repeated, wrapPostPositionMarker(whileTest), wb), wb);
389
390        Var * whileAccumVar = wb.createVar(whileAccum);
391
392        Next * nextWhileTest = wb.createNext(whileTest, wb.createAnd(loopComputation, wb.createNot(whileAccumVar)));
393
394        wb.createNext(whileAccum, wb.createOr(loopComputation, whileAccumVar));
395
396        pb.createWhile(wb.createVar(nextWhileTest), wb);
397
398        return makePostPositionMarker("unbounded", whileAccumVar, pb);
399    }   
400} // end of namespace re
401}
Note: See TracBrowser for help on using the repository browser.