Changeset 5083


Ignore:
Timestamp:
Jul 6, 2016, 1:48:51 PM (3 years ago)
Author:
xuedongx
Message:

Separate module for resolving names

Location:
icGREP/icgrep-devel/icgrep
Files:
2 added
2 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/CMakeLists.txt

    r5076 r5083  
    6969add_library(PabloADT ${PABLO_SRC})
    7070add_library(RegExpADT re/re_re.cpp re/re_cc.cpp re/re_rep.cpp re/re_diff.cpp re/re_intersect.cpp re/printer_re.cpp)
    71 add_library(RegExpCompiler re/re_parser.cpp re/parsefailure.cpp re/re_nullable.cpp re/re_simplifier.cpp re/re_compiler.cpp re/re_analysis.cpp re/re_toolchain.cpp)
     71add_library(RegExpCompiler re/re_parser.cpp re/parsefailure.cpp re/re_nullable.cpp re/re_simplifier.cpp re/re_compiler.cpp re/re_analysis.cpp re/re_toolchain.cpp re/re_name_resolve.cpp)
    7272add_library(CCADT cc/cc_compiler.cpp utf8_encoder.cpp utf16_encoder.cpp UCD/CaseFolding_txt.cpp)
    7373add_library(UCDlib UCD/unicode_set.cpp UCD/ucd_compiler.cpp UCD/PropertyObjects.cpp UCD/resolve_properties.cpp UCD/UnicodeNameData.cpp)
  • icGREP/icgrep-devel/icgrep/re/re_compiler.cpp

    r5080 r5083  
    66#include <re/re_compiler.h>
    77#include <re/re_toolchain.h>
     8#include <re/re_name_resolve.h>
    89//Regular Expressions
    910#include <re/re_name.h>
     
    4546void RE_Compiler::initializeRequiredStreams(Encoding encoding) {
    4647    if (encoding.getType() == Encoding::Type::UTF_8) {
    47             RE_Compiler::initializeRequiredStreams_utf8();
     48      RE_Compiler::initializeRequiredStreams_utf8();
    4849    }
    4950    else if (encoding.getType() == Encoding::Type::UTF_16) {
    50             RE_Compiler::initializeRequiredStreams_utf16();
    51     }
    52 }
    53                
     51      RE_Compiler::initializeRequiredStreams_utf16();
     52    }
     53}
     54
    5455void RE_Compiler::initializeRequiredStreams_utf16() {
    5556    Assign * LF = mPB.createAssign("LF", mCCCompiler.compileCC(makeCC(0x000A)));
     
    173174}
    174175
    175 static inline CC * getDefinitionIfCC(RE * re) {
    176     if (LLVM_LIKELY(isa<Name>(re))) {
    177         Name * name = cast<Name>(re);
    178         if (name->getDefinition() && isa<CC>(name->getDefinition())) {
    179             return cast<CC>(name->getDefinition());
    180         }
    181     }
    182     return nullptr;
    183 }
    184176
    185177RE * RE_Compiler::resolveUnicodeProperties(RE * re) {
    186 
    187     Memoizer memoizer;
    188178    Name * graphemeClusterRule = nullptr;
    189 
    190     std::function<RE*(RE*)> resolve = [&](RE * re) -> RE * {
    191         if (Name * name = dyn_cast<Name>(re)) {
    192             auto f = memoizer.find(name);
    193             if (f == memoizer.end()) {
    194                 if (LLVM_LIKELY(name->getDefinition() != nullptr)) {
    195                     name->setDefinition(resolve(name->getDefinition()));
    196                 } else if (LLVM_LIKELY(name->getType() == Name::Type::UnicodeProperty)) {
    197                     if (UCD::resolvePropertyDefinition(name)) {
    198                         resolve(name->getDefinition());
    199                     } else {
    200                         #ifndef DISABLE_PREGENERATED_UCD_FUNCTIONS
    201                         if (AlgorithmOptionIsSet(UsePregeneratedUnicode)) {
    202                             const std::string functionName = UCD::resolvePropertyFunction(name);
    203                             const UCD::ExternalProperty & ep = UCD::resolveExternalProperty(functionName);
    204                             Call * call = mPB.createCall(Prototype::Create(functionName, std::get<1>(ep), std::get<2>(ep), std::get<0>(ep)), mCCCompiler.getBasisBits());
    205                             name->setCompiled(call);
    206                         } else {
    207                         #endif
    208                             name->setDefinition(makeCC(UCD::resolveUnicodeSet(name)));
    209                         #ifndef DISABLE_PREGENERATED_UCD_FUNCTIONS
    210                         }
    211                         #endif
    212                     }
    213                 } else {
    214                     throw std::runtime_error("All non-unicode-property Name objects should have been defined prior to Unicode property resolution.");
    215                 }
    216             } else {
    217                 return *f;
    218             }
    219         } else if (Seq * seq = dyn_cast<Seq>(re)) {
    220             for (auto si = seq->begin(); si != seq->end(); ++si) {
    221                 *si = resolve(*si);
    222             }
    223         } else if (Alt * alt = dyn_cast<Alt>(re)) {
    224             CC * unionCC = nullptr;
    225             std::stringstream name;
    226             for (auto ai = alt->begin(); ai != alt->end(); ) {
    227                 RE * re = resolve(*ai);
    228                 if (CC * cc = getDefinitionIfCC(re)) {
    229                     if (unionCC == nullptr) {
    230                         unionCC = cc;
    231                     } else {
    232                         unionCC = makeCC(unionCC, cc);
    233                         name << '+';
    234                     }
    235                     Name * n = cast<Name>(re);
    236                     if (n->hasNamespace()) {
    237                         name << n->getNamespace() << ':';
    238                     }
    239                     name << n->getName();
    240                     ai = alt->erase(ai);
    241                 } else {
    242                     *ai++ = re;
    243                 }
    244             }
    245             if (unionCC) {
    246                 alt->push_back(makeName(name.str(), unionCC));
    247             }
    248             if (alt->size() == 1) {
    249                 return alt->front();
    250             }
    251         } else if (Rep * rep = dyn_cast<Rep>(re)) {
    252             rep->setRE(resolve(rep->getRE()));
    253         } else if (Assertion * a = dyn_cast<Assertion>(re)) {
    254             a->setAsserted(resolve(a->getAsserted()));
    255         } else if (Diff * diff = dyn_cast<Diff>(re)) {
    256             diff->setLH(resolve(diff->getLH()));
    257             diff->setRH(resolve(diff->getRH()));
    258             CC * lh = getDefinitionIfCC(diff->getLH());
    259             CC * rh = getDefinitionIfCC(diff->getRH());
    260             if (lh && rh) {
    261                 return resolve(makeName("diff", subtractCC(lh, rh)));
    262             }
    263         } else if (Intersect * ix = dyn_cast<Intersect>(re)) {
    264             ix->setLH(resolve(ix->getLH()));
    265             ix->setRH(resolve(ix->getRH()));
    266             CC * lh = getDefinitionIfCC(ix->getLH());
    267             CC * rh = getDefinitionIfCC(ix->getRH());
    268             if (lh && rh) {
    269                 return resolve(makeName("intersect", intersectCC(lh, rh)));
    270             }
    271         } else if (GraphemeBoundary * gb = dyn_cast<GraphemeBoundary>(re)) {
    272             if (LLVM_LIKELY(gb->getBoundaryRule() == nullptr)) {
    273                 switch (gb->getType()) {
    274                     case GraphemeBoundary::Type::ClusterBoundary:
    275                         if (graphemeClusterRule == nullptr) {
    276                             graphemeClusterRule = cast<Name>(resolve(generateGraphemeClusterBoundaryRule()));
    277                         }
    278                         gb->setBoundaryRule(graphemeClusterRule);
    279                         break;
    280                     default:
    281                         throw std::runtime_error("Only grapheme cluster boundary rules are supported in icGrep 1.0");
    282                 }
    283             }
    284             if (gb->getExpression()) {
    285                 resolve(gb->getExpression());
    286             }
    287         }
    288         return re;
    289     };
    290 
    291179    UCD::UCDCompiler::NameMap nameMap;
    292180    std::unordered_set<Name *> visited;
    293 
    294     std::function<void(RE*)> gather = [&](RE * re) {
    295         assert ("RE object cannot be null!" && re);
    296         if (isa<Name>(re)) {
    297             if (visited.insert(cast<Name>(re)).second) {
    298                 if (isa<CC>(cast<Name>(re)->getDefinition())) {
    299                     nameMap.emplace(cast<Name>(re), nullptr);
    300                 } else {
    301                     gather(cast<Name>(re)->getDefinition());
    302                 }
    303             }
    304         } else if (isa<Seq>(re)) {
    305             for (RE * item : *cast<Seq>(re)) {
    306                 gather(item);
    307             }
    308         } else if (isa<Alt>(re)) {
    309             for (RE * item : *cast<Alt>(re)) {
    310                 gather(item);
    311             }
    312         } else if (isa<Rep>(re)) {
    313             gather(cast<Rep>(re)->getRE());
    314         } else if (isa<Assertion>(re)) {
    315             gather(cast<Assertion>(re)->getAsserted());
    316         } else if (isa<Diff>(re)) {
    317             gather(cast<Diff>(re)->getLH());
    318             gather(cast<Diff>(re)->getRH());
    319         } else if (isa<Intersect>(re)) {
    320             gather(cast<Intersect>(re)->getLH());
    321             gather(cast<Intersect>(re)->getRH());
    322         } else if (isa<GraphemeBoundary>(re)) {
    323             if (cast<GraphemeBoundary>(re)->getExpression()) {
    324                 gather(cast<GraphemeBoundary>(re)->getExpression());
    325             }
    326             gather(cast<GraphemeBoundary>(re)->getBoundaryRule());
    327         }
    328     };
    329     re = resolve(re);
    330     gather(re);
    331 
     181    nameMap = resolveNames(re, graphemeClusterRule);
     182   
    332183    if (LLVM_LIKELY(nameMap.size() > 0)) {
    333184        UCD::UCDCompiler ucdCompiler(mCCCompiler);
     
    356207}
    357208
    358 Name * RE_Compiler::generateGraphemeClusterBoundaryRule() {
    359     // 3.1.1 Grapheme Cluster Boundary Rules
    360     #define Behind(x) makeLookBehindAssertion(x)
    361     #define Ahead(x) makeLookAheadAssertion(x)
    362 
    363     RE * GCB_Control = makeName("gcb", "cn", Name::Type::UnicodeProperty);
    364     RE * GCB_CR = makeName("gcb", "cr", Name::Type::UnicodeProperty);
    365     RE * GCB_LF = makeName("gcb", "lf", Name::Type::UnicodeProperty);
    366     RE * GCB_Control_CR_LF = makeAlt({GCB_CR, GCB_LF});
    367 
    368     // Break at the start and end of text.
    369     RE * GCB_1 = makeStart();
    370     RE * GCB_2 = makeEnd();
    371     // Do not break between a CR and LF.
    372     RE * GCB_3 = makeSeq({Behind(GCB_CR), Ahead(GCB_LF)});
    373     // Otherwise, break before and after controls.
    374     RE * GCB_4 = Behind(GCB_Control_CR_LF);
    375     RE * GCB_5 = Ahead(GCB_Control_CR_LF);
    376     RE * GCB_1_5 = makeAlt({GCB_1, GCB_2, makeDiff(makeAlt({GCB_4, GCB_5}), GCB_3)});
    377 
    378     RE * GCB_L = makeName("gcb", "l", Name::Type::UnicodeProperty);
    379     RE * GCB_V = makeName("gcb", "v", Name::Type::UnicodeProperty);
    380     RE * GCB_LV = makeName("gcb", "lv", Name::Type::UnicodeProperty);
    381     RE * GCB_LVT = makeName("gcb", "lvt", Name::Type::UnicodeProperty);
    382     RE * GCB_T = makeName("gcb", "t", Name::Type::UnicodeProperty);
    383     RE * GCB_RI = makeName("gcb", "ri", Name::Type::UnicodeProperty);
    384     // Do not break Hangul syllable sequences.
    385     RE * GCB_6 = makeSeq({Behind(GCB_L), Ahead(makeAlt({GCB_L, GCB_V, GCB_LV, GCB_LVT}))});
    386     RE * GCB_7 = makeSeq({Behind(makeAlt({GCB_LV, GCB_V})), Ahead(makeAlt({GCB_V, GCB_T}))});
    387     RE * GCB_8 = makeSeq({Behind(makeAlt({GCB_LVT, GCB_T})), Ahead(GCB_T)});
    388     // Do not break between regional indicator symbols.
    389     RE * GCB_8a = makeSeq({Behind(GCB_RI), Ahead(GCB_RI)});
    390     // Do not break before extending characters.
    391     RE * GCB_9 = Ahead(makeName("gcb", "ex", Name::Type::UnicodeProperty));
    392     // Do not break before SpacingMarks, or after Prepend characters.
    393     RE * GCB_9a = Ahead(makeName("gcb", "sm", Name::Type::UnicodeProperty));
    394     RE * GCB_9b = Behind(makeName("gcb", "pp", Name::Type::UnicodeProperty));
    395     RE * GCB_6_9b = makeAlt({GCB_6, GCB_7, GCB_8, GCB_8a, GCB_9, GCB_9a, GCB_9b});
    396     // Otherwise, break everywhere.
    397     RE * GCB_10 = makeSeq({Behind(makeAny()), Ahead(makeAny())});
    398 
    399     Name * gcb = makeName("gcb", Name::Type::UnicodeProperty);
    400     gcb->setDefinition(makeAlt({GCB_1_5, makeDiff(GCB_10, GCB_6_9b)}));
    401     return gcb;
    402 }
    403 
    404209void RE_Compiler::finalizeMatchResult(MarkerType match_result, bool InvertMatches) {
    405         PabloAST * match_follow = mPB.createMatchStar(markerVar(match_result), mAny);
     210    PabloAST * match_follow = mPB.createMatchStar(markerVar(match_result), mAny);
    406211    if (InvertMatches) {
    407212        match_follow = mPB.createNot(match_follow);
Note: See TracChangeset for help on using the changeset viewer.