source: icGREP/icgrep-devel/icgrep/re/re_compiler.cpp @ 4352

Last change on this file since 4352 was 4352, checked in by cameron, 5 years ago

Embed predefined streams in if-structures

File size: 16.9 KB
Line 
1/*
2 *  Copyright (c) 2014 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "re_compiler.h"
8//Regular Expressions
9#include <re/re_name.h>
10#include <re/re_any.h>
11#include <re/re_start.h>
12#include <re/re_end.h>
13#include <re/re_alt.h>
14#include <re/re_cc.h>
15#include <re/re_seq.h>
16#include <re/re_rep.h>
17#include <re/re_diff.h>
18#include <re/re_intersect.h>
19#include <cc/cc_namemap.hpp>
20#include <pablo/codegenstate.h>
21
22#include <assert.h>
23#include <stdexcept>
24
25using namespace pablo;
26
27namespace re {
28
29MarkerType makePostPositionMarker(std::string marker_name, PabloAST * s, PabloBlock & pb) {
30    return MarkerType{PostPosition, pb.createAssign(marker_name, s)};
31}
32   
33MarkerType wrapPostPositionMarker(Assign * s) {
34    return MarkerType{PostPosition, s};
35}
36   
37MarkerType makeFinalPositionMarker(std::string marker_name, PabloAST * s, PabloBlock & pb) {
38    return MarkerType{FinalByte, pb.createAssign(marker_name, s)};
39}
40
41Assign * markerStream(MarkerType m, PabloBlock & pb) {
42    return m.stream;
43}
44
45Var * markerVar(MarkerType m, PabloBlock & pb) {
46    return pb.createVar(m.stream);
47}
48
49Var * postPositionVar(MarkerType m, PabloBlock & pb) {
50    if (isFinalPositionMarker(m)) return pb.createVar(pb.createAssign("f", pb.createAdvance(pb.createVar(m.stream), 1)));
51    else return pb.createVar(m.stream);
52}
53
54//Set the 'internal.nonfinal' bit stream for the utf-8 multi-byte encoding.
55//#define USE_IF_FOR_NONFINAL
56
57
58
59RE_Compiler::RE_Compiler(PabloBlock & baseCG, const cc::CC_NameMap & nameMap)
60: mCG(baseCG)
61, mLineFeed(nullptr)
62, mInitial(nullptr)
63, mNonFinal(nullptr)
64, mNameMap(nameMap)
65{
66
67}
68
69#define USE_IF_FOR_NONFINAL 1
70#define USE_IF_FOR_CRLF
71#define UNICODE_LINE_BREAK true
72
73   
74void RE_Compiler::initializeRequiredStreams(cc::CC_Compiler & ccc) {
75
76    const std::string initial = "initial";
77    const std::string nonfinal = "nonfinal";
78   
79    Assign * LF = mCG.createAssign("LF", ccc.compileCC(makeCC(0x0A)));
80    mLineFeed = mCG.createVar(LF);
81    PabloAST * CR = ccc.compileCC(makeCC(0x0D));
82#ifndef USE_IF_FOR_CRLF
83    mCRLF = mCG.createAnd(mCG.createAdvance(CR, 1), mLineFeed);
84#else
85    PabloBlock crb(mCG);
86    Assign * cr1 = crb.createAssign("cr1", crb.createAdvance(CR, 1));
87    Assign * acrlf = crb.createAssign("crlf", crb.createAnd(crb.createVar(cr1), crb.createVar(LF)));
88    mCG.createIf(CR, std::move(std::vector<Assign *>{acrlf}), std::move(crb));
89    mCRLF = mCG.createVar(acrlf);
90#endif
91   
92    PabloAST * LF_VT_FF_CR = ccc.compileCC(makeCC(0x0A, 0x0D));
93    PabloAST * u8single = ccc.compileCC(makeCC(0x00, 0x7F));
94    PabloAST * u8pfx2 = ccc.compileCC(makeCC(0xC2, 0xDF));
95    PabloAST * u8pfx3 = ccc.compileCC(makeCC(0xE0, 0xEF));
96    PabloAST * u8pfx4 = ccc.compileCC(makeCC(0xF0, 0xF4));
97    PabloAST * u8pfx = mCG.createOr(mCG.createOr(u8pfx2, u8pfx3), u8pfx4);
98    mInitial = mCG.createVar(mCG.createAssign(initial, mCG.createOr(u8pfx, u8single)));
99
100    #ifndef USE_IF_FOR_NONFINAL
101    PabloAST * u8scope32 = mCG.createAdvance(u8pfx3, 1);
102    PabloAST * u8scope42 = mCG.createAdvance(u8pfx4, 1);
103    PabloAST * u8scope43 = mCG.createAdvance(u8scope42, 1);
104    PabloAST * NEL = mCG.createAnd(mCG.createAdvance(ccc.compileCC(makeCC(0xC2)), 1), ccc.compileCC(makeCC(0x85)));
105    PabloAST * E2_80 = mCG.createAnd(mCG.createAdvance(ccc.compileCC(makeCC(0xE2)), 1), ccc.compileCC(makeCC(0x80)));
106    PabloAST * LS_PS = mCG.createAnd(mCG.createAdvance(E2_80, 1), ccc.compileCC(makeCC(0xA8,0xA9)));
107    PabloAST * LB_chars = mCG.createOr(LF_VT_FF_CR, mCG.createOr(NEL, LS_PS));
108    mNonFinal = mCG.createVar(mCG.createAssign(nonfinal, mCG.createOr(mCG.createOr(u8pfx, u8scope32), mCG.createOr(u8scope42, u8scope43))));
109    mUnicodeLineBreak = mCG.createAnd(LB_chars, mCG.createNot(mCRLF));  // count the CR, but not CRLF
110    #endif
111    #ifdef USE_IF_FOR_NONFINAL
112    PabloBlock it(mCG);
113    PabloAST * u8scope32 = it.createAdvance(u8pfx3, 1);
114    PabloAST * u8scope42 = it.createVar(it.createAssign("u8scope42", it.createAdvance(u8pfx4, 1)));
115    PabloAST * u8scope43 = it.createAdvance(u8scope42, 1);
116    Assign * a = it.createAssign(nonfinal, it.createOr(it.createOr(u8pfx, u8scope32), it.createOr(u8scope42, u8scope43)));
117    PabloAST * NEL = it.createAnd(it.createAdvance(ccc.compileCC(makeCC(0xC2)), 1), ccc.compileCC(makeCC(0x85)));
118    PabloAST * E2_80 = it.createAnd(it.createAdvance(ccc.compileCC(makeCC(0xE2)), 1), ccc.compileCC(makeCC(0x80)));
119    PabloAST * LS_PS = it.createAnd(it.createAdvance(E2_80, 1), ccc.compileCC(makeCC(0xA8,0xA9)));
120    Assign * NEL_LS_PS = it.createAssign("NEL_LS_PS", it.createOr(NEL, LS_PS));
121    mCG.createIf(u8pfx, std::move(std::vector<Assign *>{a, NEL_LS_PS}), std::move(it));
122    PabloAST * LB_chars = mCG.createOr(LF_VT_FF_CR, mCG.createVar(NEL_LS_PS));
123    mNonFinal = mCG.createVar(a);   
124    mUnicodeLineBreak = mCG.createAnd(LB_chars, mCG.createNot(mCRLF));  // count the CR, but not CRLF
125    #endif
126}
127
128void RE_Compiler::finalizeMatchResult(MarkerType match_result) {
129    //These three lines are specifically for grep.
130    PabloAST * lb = UNICODE_LINE_BREAK ? mUnicodeLineBreak : mLineFeed;
131    Var * v = markerVar(match_result, mCG);
132    mCG.createAssign("matches", mCG.createAnd(mCG.createMatchStar(v, mCG.createNot(lb)), lb), 0);
133    mCG.createAssign("lf", mCG.createAnd(lb, mCG.createNot(mCRLF)), 1);
134}
135   
136MarkerType RE_Compiler::compile(RE * re, PabloBlock & pb) {
137    return process(re, makePostPositionMarker("start", pb.createOnes(), pb), pb);
138}
139       
140PabloAST * RE_Compiler::character_class_strm(Name * name, PabloBlock & pb) {
141    if (name->getType() == Name::Type::UnicodeCategory) {
142        return pb.createCall(name->getName());
143    }
144    else {
145        Var * var = name->getCompiled();
146        if (var == nullptr) {
147            RE * def = name->getDefinition();
148            assert(!isa<CC>(def));  //  Names mapping to CCs should have been compiled.
149            assert(name->getType == Name::Type::Unicode);  //
150            // compile in top-level block
151            MarkerType m = compile(def, mCG);
152            assert(isFinalPositionMarker(m));
153            Var * v = pb.createVar(markerStream(m, mCG));
154            name -> setCompiled(v);
155            return v;
156        }
157        else return var;
158    }
159}
160
161PabloAST * RE_Compiler::nextUnicodePosition(MarkerType m, PabloBlock & pb) {
162    if (isPostPositionMarker(m)) {
163        return pb.createScanThru(pb.createVar(pb.createAnd(mInitial, markerVar(m, pb))), mNonFinal);
164    }
165    else {
166        //return pb.createAdvanceThenScanThru(pb.createVar(markerVar(m), pb), mNonFinal);
167        return pb.createScanThru(pb.createAnd(mInitial, pb.createAdvance(pb.createVar(markerVar(m, pb)), 1)), mNonFinal);
168       
169    }
170}
171   
172
173MarkerType RE_Compiler::process(RE * re, MarkerType marker, PabloBlock & pb) {
174    if (Name * name = dyn_cast<Name>(re)) {
175        return process(name, marker, pb);
176    }
177    else if (Seq* seq = dyn_cast<Seq>(re)) {
178        return process(seq, marker, pb);
179    }
180    else if (Alt * alt = dyn_cast<Alt>(re)) {
181        return process(alt, marker, pb);
182    }
183    else if (Rep * rep = dyn_cast<Rep>(re)) {
184        return process(rep, marker, pb);
185    }
186    else if (isa<Any>(re)) {
187        PabloAST * nextPos = nextUnicodePosition(marker, pb);
188        PabloAST * dot = pb.createNot(UNICODE_LINE_BREAK ? pb.createOr(mUnicodeLineBreak, mCRLF) : mLineFeed);
189        return makeFinalPositionMarker("dot", pb.createAnd(nextPos, dot), pb);
190    }
191    else if (Diff * diff = dyn_cast<Diff>(re)) {
192        return process(diff, marker, pb);
193    }
194    else if (Intersect * ix = dyn_cast<Intersect>(re)) {
195        return process(ix, marker, pb);
196    }
197    else if (isa<Start>(re)) {
198        if (UNICODE_LINE_BREAK) {
199            PabloAST * line_end = mCG.createOr(mUnicodeLineBreak, mCRLF);
200            PabloAST * sol = pb.createNot(pb.createOr(pb.createAdvance(pb.createNot(line_end), 1), mCRLF));
201            return makePostPositionMarker("sol", pb.createAnd(postPositionVar(marker, pb), sol), pb);
202        }
203        else {
204            PabloAST * sol = pb.createNot(pb.createAdvance(pb.createNot(mLineFeed), 1));
205            return makePostPositionMarker("sol", pb.createAnd(postPositionVar(marker, pb), sol), pb);
206        }
207    }
208    else if (isa<End>(re)) {
209        if (UNICODE_LINE_BREAK) {
210            PabloAST * nextPos = nextUnicodePosition(marker, pb);
211            return makeFinalPositionMarker("end", pb.createAnd(nextPos, mUnicodeLineBreak), pb);
212        }
213        PabloAST * nextPos = postPositionVar(marker, pb);  // For LF match
214        return makePostPositionMarker("eol", pb.createAnd(nextPos, mLineFeed), pb);
215    }
216    return marker;
217}
218
219MarkerType RE_Compiler::process(Name * name, MarkerType marker, PabloBlock & pb) {
220    PabloAST * nextPos = (name->getType() == Name::Type::Byte) ? postPositionVar(marker, pb): nextUnicodePosition(marker, pb);
221    return makeFinalPositionMarker("m", pb.createAnd(nextPos, character_class_strm(name, pb)), pb);
222}
223
224MarkerType RE_Compiler::process(Seq * seq, MarkerType marker, PabloBlock & pb) {
225    for (RE * re : *seq) {
226        marker = process(re, marker, pb);
227    }
228    return marker;
229}
230
231MarkerType RE_Compiler::process(Alt * alt, MarkerType marker, PabloBlock & pb) {
232    PabloAST * atPosnAccumulator = nullptr;
233    PabloAST * postPosnAccumulator = nullptr;
234    MarkerType const base = marker;
235    // The following may be useful to force a common Advance rather than separate
236    // Advances in each alternative.
237    // MarkerType const base = makePostPositionMarker(postPositionVar(marker, pb), pb);
238    for (RE * re : *alt) {
239        MarkerType rslt = process(re, base, pb);
240        PabloAST * rsltStream = markerVar(rslt, pb); 
241        if (isFinalPositionMarker(rslt)) {
242            atPosnAccumulator = (atPosnAccumulator == nullptr) ? rsltStream : pb.createOr(atPosnAccumulator, rsltStream);
243        }
244        else {
245            postPosnAccumulator = (postPosnAccumulator == nullptr) ? rsltStream : pb.createOr(postPosnAccumulator, rsltStream);
246        }
247    }
248    if (postPosnAccumulator == nullptr) {
249        return makeFinalPositionMarker("alt", atPosnAccumulator == nullptr ? pb.createZeroes() : atPosnAccumulator, pb);
250    }
251    else {
252        if (atPosnAccumulator != nullptr) {
253            postPosnAccumulator = pb.createOr(postPosnAccumulator, pb.createAdvance(atPosnAccumulator, 1));
254        }
255        return makePostPositionMarker("alt", postPosnAccumulator, pb);
256    }
257}
258
259MarkerType RE_Compiler::process(Diff * diff, MarkerType marker, PabloBlock & pb) {
260    RE * lh = diff->getLH();
261    RE * rh = diff->getRH();
262    if ((isa<Any>(lh) || isa<Name>(lh)) && (isa<Any>(rh) || isa<Name>(rh))) {
263        MarkerType t1 = process(lh, marker, pb);
264        MarkerType t2 = process(rh, marker, pb);
265        assert(isFinalPositionMarker(t1) && isFinalPositionMarker(t2));
266        return makeFinalPositionMarker("diff", pb.createAnd(markerVar(t1, pb), pb.createNot(markerVar(t2, pb))), pb);
267    }
268    throw std::runtime_error("Unsupported Diff operands: " + Printer_RE::PrintRE(diff));
269}
270
271MarkerType RE_Compiler::process(Intersect * x, MarkerType marker, PabloBlock & pb) {
272    RE * lh = x->getLH();
273    RE * rh = x->getRH();
274    if ((isa<Any>(lh) || isa<Name>(lh)) && (isa<Any>(rh) || isa<Name>(rh))) {
275        MarkerType t1 = process(lh, marker, pb);
276        MarkerType t2 = process(rh, marker, pb);
277        assert(isFinalPositionMarker(t1) && isFinalPositionMarker(t2));
278        return makeFinalPositionMarker("intersect", pb.createAnd(markerVar(t1, pb), markerVar(t2, pb)), pb);
279    }
280    throw std::runtime_error("Unsupported Intersect operands: " + Printer_RE::PrintRE(x));
281}
282
283MarkerType RE_Compiler::process(Rep * rep, MarkerType marker, PabloBlock & pb) {
284    int lb = rep->getLB();
285    int ub = rep->getUB();
286    if (lb > 0) {
287        marker = processLowerBound(rep->getRE(), lb, marker, pb);
288    }
289    if (ub == Rep::UNBOUNDED_REP) {
290        return processUnboundedRep(rep->getRE(), marker, pb);
291    }
292    else { // if (rep->getUB() != Rep::UNBOUNDED_REP)
293        return processBoundedRep(rep->getRE(), ub - lb, marker, pb);
294    }   
295}
296
297/*
298   Given a stream |repeated| marking positions immediately after matches to an item
299   of length |repeated_lgth|, compute a stream marking positions immediately after
300   |repeat_count| consecutive occurrences of such items.
301*/
302       
303inline Assign * RE_Compiler::consecutive(Assign * repeated, int repeated_lgth, int repeat_count, pablo::PabloBlock & pb) {
304        int i = repeated_lgth;
305        int total_lgth = repeat_count * repeated_lgth;
306        Assign * consecutive_i = repeated;
307        while (i * 2 < total_lgth) {
308        PabloAST * v = pb.createVar(consecutive_i);
309                consecutive_i = pb.createAssign("consecutive", pb.createAnd(v, pb.createAdvance(v, i)));
310                i *= 2;
311        }
312        if (i < total_lgth) {
313        PabloAST * v = pb.createVar(consecutive_i);
314                consecutive_i = pb.createAssign("consecutive", pb.createAnd(v, pb.createAdvance(v, total_lgth - i)));
315        }
316        return consecutive_i;
317}
318               
319inline bool RE_Compiler::isFixedLength(RE * regexp) {
320    return isa<Name>(regexp) && ((cast<Name>(regexp)->getType()) == Name::Type::Byte);
321}
322
323MarkerType RE_Compiler::processLowerBound(RE * repeated, int lb, MarkerType marker, PabloBlock & pb) {
324    if (isFixedLength(repeated)) {
325        Name * name = cast<Name>(repeated);
326        Assign * cc_lb = consecutive(pb.createAssign("repeated", pb.createAdvance(name->getCompiled(),1)), 1, lb, pb);
327        PabloAST * marker_fwd = pb.createAdvance(markerVar(marker, pb), isFinalPositionMarker(marker) ? lb+ 1 : lb);
328        return makePostPositionMarker("lowerbound", pb.createAnd(marker_fwd, pb.createVar(cc_lb)), pb);
329    }
330    // Fall through to general case.
331    while (lb-- != 0) {
332        marker = process(repeated, marker, pb);
333    }
334    return marker;
335}
336
337MarkerType RE_Compiler::processBoundedRep(RE * repeated, int ub, MarkerType marker, PabloBlock & pb) {
338    if (isFixedLength(repeated)) {
339        // log2 upper bound for fixed length (=1) class
340        // Mask out any positions that are more than ub positions from a current match.
341        // Use matchstar, then apply filter.
342        Assign * nonMatch = pb.createAssign("nonmatch", pb.createNot(postPositionVar(marker, pb)));
343        PabloAST * upperLimitMask = pb.createNot(pb.createVar(consecutive(nonMatch, 1, ub + 1, pb)));
344        PabloAST * rep_class_var = cast<Name>(repeated)->getCompiled();
345        return makePostPositionMarker("bounded", pb.createAnd(pb.createMatchStar(postPositionVar(marker, pb), rep_class_var), upperLimitMask), pb);
346    }
347    // Fall through to general case.
348    while (ub-- != 0) {
349        MarkerType a = process(repeated, marker, pb);
350        if (isFinalPositionMarker(a) && isFinalPositionMarker(marker)) {
351            marker = makeFinalPositionMarker("m", pb.createOr(markerVar(marker, pb), markerVar(a, pb)), pb); 
352        }
353        else {
354            marker = makePostPositionMarker("m", pb.createOr(postPositionVar(marker, pb), postPositionVar(a, pb)), pb);
355        }
356    }
357    return marker;
358}
359
360MarkerType RE_Compiler::processUnboundedRep(RE * repeated, MarkerType marker, PabloBlock & pb) {
361    // always use PostPosition markers for unbounded repetition.
362    PabloAST * base = postPositionVar(marker, pb);
363   
364    if (isa<Name>(repeated)) {
365        Name * name = cast<Name>(repeated);
366        PabloAST * cc = character_class_strm(name, pb);
367        if (name->getType() == Name::Type::Byte) {
368            return makePostPositionMarker("unbounded", pb.createMatchStar(base, cc), pb);
369        }
370        else { // Name::Unicode and Name::UnicodeCategory
371            return makePostPositionMarker("unbounded", pb.createAnd(pb.createMatchStar(base, pb.createOr(mNonFinal, cc)), mInitial), pb);
372        }       
373    }
374    else if (isa<Any>(repeated)) {
375        PabloAST * dot = pb.createNot(UNICODE_LINE_BREAK ? mUnicodeLineBreak : mLineFeed);
376        return makePostPositionMarker("unbounded", pb.createAnd(pb.createMatchStar(base, pb.createOr(mNonFinal, dot)), mInitial), pb);
377    }
378    else if (isa<Diff>(repeated) && isa<Any>(cast<Diff>(repeated)->getLH()) && isa<Name>(cast<Diff>(repeated)->getRH())) {
379        Name * name = cast<Name>(cast<Diff>(repeated)->getRH());
380        PabloAST * cc = pb.createNot(pb.createOr(character_class_strm(name, pb), mLineFeed));
381        return makePostPositionMarker("unbounded", pb.createAnd(pb.createMatchStar(base, pb.createOr(mNonFinal, cc)), mInitial), pb);
382    }
383    else {
384        Assign * whileTest = pb.createAssign("test", base);
385        Assign * whileAccum = pb.createAssign("accum", base);
386
387        PabloBlock wb(pb); 
388
389        Var * loopComputation = postPositionVar(process(repeated, wrapPostPositionMarker(whileTest), wb), wb);
390
391        Var * whileAccumVar = wb.createVar(whileAccum);
392
393        Next * nextWhileTest = wb.createNext(whileTest, wb.createAnd(loopComputation, wb.createNot(whileAccumVar)));
394
395        wb.createNext(whileAccum, wb.createOr(loopComputation, whileAccumVar));
396
397        pb.createWhile(wb.createVar(nextWhileTest), std::move(wb));
398
399        return makePostPositionMarker("unbounded", whileAccumVar, pb);
400    }   
401} // end of namespace re
402}
Note: See TracBrowser for help on using the repository browser.