source: icGREP/icgrep-devel/icgrep/re/re_compiler.h @ 6104

Last change on this file since 6104 was 5901, checked in by cameron, 19 months ago

RE compiler can generate UTF8_nonfinal if not provided externally

File size: 6.9 KB
RevLine 
[3850]1/*
[5888]2 *  Copyright (c) 2018 International Characters.
[3850]3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
[4197]7#ifndef RE_TO_PABLO_COMPILER_H
8#define RE_TO_PABLO_COMPILER_H
[3850]9
[5267]10#include <re/re_seq.h>  // for Seq
[5233]11#include <boost/container/flat_map.hpp>
[5357]12#include <pablo/builder.hpp>
[5267]13#include <vector>       // for vector<>::iterator
[5816]14namespace cc { class CC_Compiler; class Alphabet;}
[5267]15namespace pablo { class PabloAST; }
[5888]16namespace pablo { class PabloBlock; }
[5816]17namespace pablo { class Var; }
[5267]18namespace re { class Alt; }
19namespace re { class Assertion; }
20namespace re { class Diff; }
21namespace re { class Intersect; }
22namespace re { class Name; }
23namespace re { class RE; }
24namespace re { class Rep; }
[5555]25namespace re { class CC; }
[3850]26
[4411]27/*   Marker streams represent the results of matching steps.
28     Three types of marker streams are used internally.
[5045]29     FinalMatchUnit markers are used for character classes and
[4411]30     other strings identified by a one bit at their final position.
[5045]31     InitialPostPositionUnit markers are used to mark matches with
32     a 1 bit immediately after a match.   InitialPostPositionUnit markers
[4411]33     are generally required whenever a regular expression element
34     can match the empty string (e.g., * and ? repeated items).
[5045]35     FinalPostPositionUnit markers are used for single code unit
[4411]36     lookahead assertions. 
[4340]37*/
[4411]38
[4194]39namespace re {
[4340]40
[5812]41class RE_Compiler {
42public:
[4340]43
[5812]44    enum MarkerPosition {FinalMatchUnit, InitialPostPositionUnit, FinalPostPositionUnit};
[4340]45
[5812]46    struct MarkerType {
47        MarkerPosition pos;
48        pablo::PabloAST * stream;
49        MarkerType & operator =(const MarkerType &) = default;
50    };
[4340]51
[5888]52    RE_Compiler(pablo::PabloBlock * scope, cc::CC_Compiler & ccCompiler);
[5816]53   
54    //
55    // The CCs (character classes) within a regular expression are generally
56    // expressed using a single alphabet.   But multiple alphabets may be
57    // used under some circumstances.   For example, regular expressions for
58    // Unicode may use both the Unicode alphabet for full Unicode characters
59    // as well as the Byte alphabet for the individual code units of UTF-8.
60    // In other cases, a multiplexed alphabet may be used for a certain
61    // subexpression, for example, if the subexpression involves a local
62    // language or a capture-backreference combination.
63    //
64    // Alphabets are added as needed using the addAlphabet method, giving both
65    // the alphabet value and the set of parallel bit streams that comprise
66    // a basis for the coded alphabet values.
67
[5843]68    void addAlphabet(cc::Alphabet * a, std::vector<pablo::PabloAST* > basis_set);
[5881]69   
70    void addPrecompiled(std::string precompiledName, pablo::PabloAST * precompiledStream);
[5843]71
[5812]72    pablo::PabloAST * compile(RE * re, pablo::PabloAST * const initialCursors = nullptr);
[4210]73
[5308]74    static LLVM_ATTRIBUTE_NORETURN void UnsupportedRE(std::string errmsg);
75   
[4197]76private:
[4214]77
[5233]78    struct NameMap {
79        NameMap(NameMap * parent = nullptr) : mParent(parent), mMap() {}
80        bool get(const Name * name, MarkerType & marker) const {
81            auto f = mMap.find(name);
82            if (f == mMap.end()) {
83                return mParent ? mParent->get(name, marker) : false;
84            } else {
85                marker = f->second;
86                return true;
87            }
88        }
89        void add(const Name * const name, MarkerType marker) {
90            mMap.emplace(name, std::move(marker));
91        }
92        NameMap * getParent() const { return mParent; }
93    private:
94        NameMap * const mParent;
95        boost::container::flat_map<const Name *, MarkerType> mMap;
96    };
97
[4841]98
[5812]99    MarkerType compile(RE * re, pablo::PabloBuilder & pb);
100    MarkerType compile(RE * re, pablo::PabloAST * const cursors, pablo::PabloBuilder & pb);
101
[4622]102    MarkerType process(RE * re, MarkerType marker, pablo::PabloBuilder & pb);
[4829]103    MarkerType compileName(Name * name, MarkerType marker, pablo::PabloBuilder & pb);
[5555]104    MarkerType compileCC(CC * cc, MarkerType marker, pablo::PabloBuilder & pb);
[4831]105    MarkerType compileSeq(Seq * seq, MarkerType marker, pablo::PabloBuilder & pb);
[5821]106    MarkerType compileSeqTail(Seq::const_iterator current, const Seq::const_iterator end, int matchLenSoFar, MarkerType marker, pablo::PabloBuilder & pb);
[5812]107    MarkerType compileAlt(Alt * alt, MarkerType base, pablo::PabloBuilder & pb);
[4831]108    MarkerType compileAssertion(Assertion * a, MarkerType marker, pablo::PabloBuilder & pb);
109    MarkerType compileRep(Rep * rep, MarkerType marker, pablo::PabloBuilder & pb);
110    MarkerType compileDiff(Diff * diff, MarkerType marker, pablo::PabloBuilder & cg);
111    MarkerType compileIntersect(Intersect * x, MarkerType marker, pablo::PabloBuilder & cg);
[5720]112    pablo::PabloAST * consecutive_matches(pablo::PabloAST * repeated_j, int j, int repeat_count, pablo::PabloAST * indexStream, pablo::PabloBuilder & pb);
[5710]113    pablo::PabloAST * reachable(pablo::PabloAST * repeated, int length, int repeat_count, pablo::PabloAST * indexStream, pablo::PabloBuilder & pb);
[4280]114    static bool isFixedLength(RE * regexp);
[5555]115    MarkerType processLowerBound(RE * repeated,  int lb, MarkerType marker, int ifGroupSize, pablo::PabloBuilder & pb);
[4622]116    MarkerType processUnboundedRep(RE * repeated, MarkerType marker, pablo::PabloBuilder & pb);
[5555]117    MarkerType processBoundedRep(RE * repeated, int ub, MarkerType marker, int ifGroupSize,  pablo::PabloBuilder & pb);
[4194]118
[4831]119    MarkerType compileName(Name * name, pablo::PabloBuilder & pb);
[4829]120    MarkerType compileAny(const MarkerType m, pablo::PabloBuilder & pb);
[5782]121    MarkerType compileStart(MarkerType marker, pablo::PabloBuilder & pb);
122    MarkerType compileEnd(MarkerType marker, pablo::PabloBuilder & pb);
[4829]123
[4841]124    MarkerType AdvanceMarker(MarkerType marker, const MarkerPosition newpos, pablo::PabloBuilder & pb);
125    void AlignMarkers(MarkerType & m1, MarkerType & m2, pablo::PabloBuilder & pb);
[5901]126   
127    pablo::PabloAST * u8NonFinal(pablo::PabloBuilder & pb);
128    pablo::PabloAST * u8Final(pablo::PabloBuilder & pb);
[4841]129
[5812]130    static inline MarkerPosition markerPos(const MarkerType & m) {return m.pos; }
131    static inline pablo::PabloAST * markerVar(const MarkerType & m) {return m.stream; }
132    static inline MarkerType makeMarker(MarkerPosition newpos, pablo::PabloAST * strm) {return {newpos, strm};}
133
[4622]134private:
135
[5872]136    pablo::PabloBlock * const                       mEntryScope;
[5816]137    std::vector<cc::Alphabet *>                     mAlphabets;
138    std::vector<std::unique_ptr<cc::CC_Compiler>>   mAlphabetCompilers;
139
[4622]140    cc::CC_Compiler &                               mCCCompiler;
[5091]141    pablo::PabloAST *                               mLineBreak;
[5901]142    re::Name *                                      mNonFinalName;
[4609]143    pablo::PabloAST *                               mWhileTest;
[4442]144    int                                             mStarDepth;
[5233]145    NameMap *                                       mCompiledName;
146    NameMap                                         mBaseMap;
[5881]147    std::map<std::string, pablo::PabloAST *>        mExternalNameMap;
148
[3850]149};
150
[4194]151}
152
[4197]153#endif // COMPILER_H
Note: See TracBrowser for help on using the repository browser.