Changeset 5786


Ignore:
Timestamp:
Dec 16, 2017, 12:51:48 PM (9 months ago)
Author:
cameron
Message:

Decouple Unicode property support from re_compiler; initial support for (?-m) flag

Location:
icGREP/icgrep-devel/icgrep
Files:
10 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/UCD/resolve_properties.cpp

    r5770 r5786  
    110110            property->setDefinition(makeDiff(makeAny(), unassigned));
    111111            return true;
    112         } else if (value == "GCB" || value == "NonGCB"){
     112        } else if (value == "GCB" || value == "NonGCB") {
    113113            generateGraphemeClusterBoundaryRule(property);
     114            return true;
     115        } else if (value == "^s") {  // "start anchor (^) in single-line mode"
     116            property->setDefinition(makeNegativeLookBehindAssertion(makeCC(0, 0x10FFFF)));
     117            return true;
     118        } else if (value == "$s") { // "end anchor ($) in single-line mode"
     119            property->setDefinition(makeNegativeLookAheadAssertion(makeCC(0, 0x10FFFF)));
    114120            return true;
    115121        }
  • icGREP/icgrep-devel/icgrep/icgrep.cpp

    r5735 r5786  
    4242static cl::opt<int> REsPerGroup("re-num", cl::desc("Number of regular expressions processed by each kernel."), cl::init(1));
    4343
    44 static re::ModeFlagSet globalFlags = 0;
     44static re::ModeFlagSet globalFlags = re::MULTILINE_MODE_FLAG;
    4545
    4646std::vector<re::RE *> readExpressions() {
  • icGREP/icgrep-devel/icgrep/kernels/charclasses.cpp

    r5750 r5786  
    1111#include <re/re_name.h>
    1212#include <boost/uuid/sha1.hpp>
     13#include <pablo/builder.hpp>
    1314#include <llvm/Support/raw_ostream.h>
    1415
     
    99100                pb.createAssign(r, ccc.compileCC(dyn_cast<CC>(t->first->getDefinition())));
    100101            } else {
    101                 pb.createAssign(r, t->second);
     102                pb.createAssign(r, pb.createInFile(t->second));
    102103            }
    103104        } else {
  • icGREP/icgrep-devel/icgrep/re/printer_re.cpp

    r5781 r5786  
    6262        retVal += re_name->getName();
    6363        retVal += "\" ";
     64        if (re_name->getType() == Name::Type::Capture) {
    6465            retVal += "=(" + PrintRE(re_name->getDefinition()) + ")";
     66        }
    6567    } else if (const Range* rg = dyn_cast<const Range>(re)) {
    6668        retVal = "Range (";
  • icGREP/icgrep-devel/icgrep/re/re_compiler.cpp

    r5782 r5786  
    4242
    4343namespace re {
    44 
    45 RE * RE_Compiler::resolveUnicodeProperties(RE * re) {
    46     Name * ZeroWidth = nullptr;
    47     mCompiledName = &mBaseMap;
    48     gatherNames(re, ZeroWidth);
    49     // Now precompile any grapheme segmentation rules
    50     if (ZeroWidth) {
    51         mCompiledName->add(ZeroWidth, compileName(ZeroWidth, mPB));
    52     }
    53     return re;
    54 }
    55 
    56 RE * RE_Compiler::compileUnicodeNames(RE * re) {
    57     return resolveUnicodeProperties(re);
    58 }
    5944
    6045PabloAST * RE_Compiler::compile(RE * re) {
     
    341326    //
    342327    // A bounded repetition with an upper bound of at least 2.
    343     if (!mGraphemeBoundaryRule && !AlgorithmOptionIsSet(DisableLog2BoundedRepetition)) {
     328    if (!AlgorithmOptionIsSet(DisableLog2BoundedRepetition)) {
    344329        // Check for a regular expression that satisfies one of the special conditions that
    345330        // allow implementation using the log2 technique.
     
    390375    for (auto i = 0; i < group; i++) {
    391376        marker = process(repeated, marker, pb);
    392         if (mGraphemeBoundaryRule) {
    393             marker = AdvanceMarker(marker, MarkerPosition::FinalPostPositionUnit, pb);
    394         }
    395377    }
    396378    if (lb == group) {
     
    412394    //
    413395    // A bounded repetition with an upper bound of at least 2.
    414     if (!mGraphemeBoundaryRule && !AlgorithmOptionIsSet(DisableLog2BoundedRepetition) && (ub > 1)) {
     396    if (!AlgorithmOptionIsSet(DisableLog2BoundedRepetition) && (ub > 1)) {
    415397        // Check for a regular expression that satisfies one of the special conditions that
    416398        // allow implementation using the log2 technique.
     
    466448        AlignMarkers(a, m, pb);
    467449        marker = makeMarker(markerPos(a), pb.createOr(markerVar(a), markerVar(m)));
    468         if (mGraphemeBoundaryRule) {
    469             marker = AdvanceMarker(marker, MarkerPosition::FinalPostPositionUnit, pb);
    470         }
    471450    }
    472451    if (ub == group) {
     
    487466    // always use PostPosition markers for unbounded repetition.
    488467    PabloAST * base = markerVar(AdvanceMarker(marker, MarkerPosition::FinalPostPositionUnit, pb));
    489     if (!mGraphemeBoundaryRule && isByteLength(repeated)  && !AlgorithmOptionIsSet(DisableMatchStar)) {
     468    if (isByteLength(repeated)  && !AlgorithmOptionIsSet(DisableMatchStar)) {
    490469        PabloAST * mask = markerVar(compile(repeated, pb));
    491470        PabloAST * nonFinal = mNonFinal;
    492         if (mGraphemeBoundaryRule) {
    493             nonFinal = pb.createOr(nonFinal, pb.createNot(mGraphemeBoundaryRule, "gext"));
    494         }
    495471        // The post position character may land on the initial byte of a multi-byte character. Combine them with the masked range.
    496472        PabloAST * unbounded = pb.createMatchStar(base, pb.createOr(mask, nonFinal), "unbounded");
     
    500476        PabloAST * mstar = nullptr;
    501477        PabloAST * nonFinal = mNonFinal;
    502         if (mGraphemeBoundaryRule) {
    503             nonFinal = pb.createOr(nonFinal, pb.createNot(mGraphemeBoundaryRule, "gext"));
    504         }
    505478        cc = pb.createOr(cc, nonFinal);
    506479        mstar = pb.createMatchStar(base, cc);
    507480        PabloAST * final = mFinal;
    508         if (mGraphemeBoundaryRule) {
    509             final = mGraphemeBoundaryRule;
    510         }
    511481        return makeMarker(MarkerPosition::FinalPostPositionUnit, pb.createAnd(mstar, final, "unbounded"));
    512482    } else if (mStarDepth > 0){
     
    566536            marker.stream = pb.createAdvance(marker.stream, 1, "ipp");
    567537            PabloAST * nonFinal = mNonFinal;
    568             if (mGraphemeBoundaryRule) {
    569                 nonFinal = pb.createOr(nonFinal, pb.createNot(mGraphemeBoundaryRule, "gext"));
    570             }
    571538            PabloAST * starts = pb.createAnd(mInitial, marker.stream);
    572539            marker.stream = pb.createScanThru(starts, nonFinal, "fpp");
     
    594561, mLineBreak(nullptr)
    595562, mCRLF(nullptr)
    596 , mGraphemeBoundaryRule(nullptr)
    597563, mInitial(nullptr)
    598564, mNonFinal(nullptr)
  • icGREP/icgrep-devel/icgrep/re/re_compiler.h

    r5782 r5786  
    6262
    6363    RE_Compiler(pablo::PabloKernel * kernel, cc::CC_Compiler & ccCompiler);
    64     RE * compileUnicodeNames(RE * re);
    6564    pablo::PabloAST * compile(RE * re);
    6665
     
    107106    MarkerType processUnboundedRep(RE * repeated, MarkerType marker, pablo::PabloBuilder & pb);
    108107    MarkerType processBoundedRep(RE * repeated, int ub, MarkerType marker, int ifGroupSize,  pablo::PabloBuilder & pb);
    109     RE * resolveUnicodeProperties(RE * re);
    110108
    111109    MarkerType compileName(Name * name, pablo::PabloBuilder & pb);
     
    124122    pablo::PabloAST *                               mLineBreak;
    125123    pablo::PabloAST *                               mCRLF;
    126     pablo::PabloAST *                               mGraphemeBoundaryRule;
    127124    pablo::PabloAST *                               mInitial;
    128125    pablo::PabloAST *                               mNonFinal;
  • icGREP/icgrep-devel/icgrep/re/re_name_resolve.cpp

    r5781 r5786  
    3232
    3333struct NameResolver {
     34    RE * resolveUnicodeProperties(RE * re) {
     35        if (Name * name = dyn_cast<Name>(re)) {
     36            auto f = mMemoizer.find(name);
     37            if (f == mMemoizer.end()) {
     38                if (LLVM_LIKELY(name->getDefinition() != nullptr)) {
     39                    name->setDefinition(resolveUnicodeProperties(name->getDefinition()));
     40                } else if (LLVM_LIKELY(name->getType() == Name::Type::UnicodeProperty || name->getType() == Name::Type::ZeroWidth)) {
     41                    if (UCD::resolvePropertyDefinition(name)) {
     42                        name->setDefinition(resolveUnicodeProperties(name->getDefinition()));
     43                    } else {
     44                        name->setDefinition(makeCC(UCD::resolveUnicodeSet(name)));
     45                    }
     46                } else {
     47                    UndefinedNameError(name);
     48                }
     49            } else {
     50                return *f;
     51            }
     52        } else if (Seq * seq = dyn_cast<Seq>(re)) {
     53            for (auto si = seq->begin(); si != seq->end(); ++si) {
     54                *si = resolveUnicodeProperties(*si);
     55            }
     56        } else if (Alt * alt = dyn_cast<Alt>(re)) {
     57            for (auto ai = alt->begin(); ai != alt->end(); ++ai) {
     58                *ai = resolveUnicodeProperties(*ai);
     59            }
     60        } else if (Rep * rep = dyn_cast<Rep>(re)) {
     61            rep->setRE(resolveUnicodeProperties(rep->getRE()));
     62        } else if (Assertion * a = dyn_cast<Assertion>(re)) {
     63            a->setAsserted(resolveUnicodeProperties(a->getAsserted()));
     64        } else if (Range * rg = dyn_cast<Range>(re)) {
     65            rg->setLo(resolveUnicodeProperties(rg->getLo()));
     66            rg->setHi(resolveUnicodeProperties(rg->getHi()));
     67        } else if (Diff * diff = dyn_cast<Diff>(re)) {
     68            diff->setLH(resolveUnicodeProperties(diff->getLH()));
     69            diff->setRH(resolveUnicodeProperties(diff->getRH()));
     70        } else if (Intersect * ix = dyn_cast<Intersect>(re)) {
     71            ix->setLH(resolveUnicodeProperties(ix->getLH()));
     72            ix->setRH(resolveUnicodeProperties(ix->getRH()));
     73        } else if (Group * g = dyn_cast<Group>(re)) {
     74            g->setRE(resolveUnicodeProperties(g->getRE()));
     75        }
     76        return re;
     77    }
     78   
    3479    RE * resolve(RE * re) {
    3580        if (Name * name = dyn_cast<Name>(re)) {
     
    3883                if (LLVM_LIKELY(name->getDefinition() != nullptr)) {
    3984                    name->setDefinition(resolve(name->getDefinition()));
    40                 } else if (LLVM_LIKELY(name->getType() == Name::Type::UnicodeProperty || name->getType() == Name::Type::ZeroWidth)) {
    41                     if (UCD::resolvePropertyDefinition(name)) {
    42                         name->setDefinition(resolve(name->getDefinition()));
    43                     } else {
    44                         name->setDefinition(makeCC(UCD::resolveUnicodeSet(name)));
    45                     }
    4685                } else {
    47                     throw std::runtime_error("All non-unicode-property Name objects should have been defined prior to Unicode property resolution.");
     86                    UndefinedNameError(name);
    4887                }
    4988            } else {
     
    104143        return re;
    105144    }
    106 
     145   
    107146private:
    108147    Memoizer                mMemoizer;
    109148};
    110149   
    111 RE * resolveNames(RE * re) {
    112     NameResolver nameResolver;
    113     return nameResolver.resolve(re);   
     150    RE * resolveUnicodeProperties(RE * re) {
     151        NameResolver nameResolver;
     152        return nameResolver.resolveUnicodeProperties(re);
     153    }
     154   
     155    RE * resolveNames(RE * re) {
     156        NameResolver nameResolver;
     157        return nameResolver.resolve(re);
     158    }
     159   
    114160}
    115 
    116 }
  • icGREP/icgrep-devel/icgrep/re/re_name_resolve.h

    r5732 r5786  
    99    class Name;
    1010
     11    RE * resolveUnicodeProperties(RE * re);
    1112    RE * resolveNames(RE * re);
    1213
  • icGREP/icgrep-devel/icgrep/re/re_parser.cpp

    r5772 r5786  
    7474RE_Parser::RE_Parser(const std::string & regular_expression)
    7575: fByteMode(false)
    76 , fModeFlagSet(0)
     76, fModeFlagSet(MULTILINE_MODE_FLAG)
    7777, fNested(false)
    7878, mGroupsOpen(0)
     
    143143            case '^':
    144144                ++mCursor;
     145                if ((fModeFlagSet & ModeFlagType::MULTILINE_MODE_FLAG) == 0) {
     146                    return makeZeroWidth("^s");  //single-line mode
     147                }
    145148                if ((fModeFlagSet & ModeFlagType::UNIX_LINES_MODE_FLAG) != 0) {
    146149                    return makeNegativeLookBehindAssertion(makeByte(makeCC(makeCC(0, '\n'-1), makeCC('\n'+1, 0xFF))));
     
    149152            case '$':
    150153                ++mCursor;
     154                if ((fModeFlagSet & ModeFlagType::MULTILINE_MODE_FLAG) == 0) {
     155                    return makeZeroWidth("$s");  //single-line mode
     156                }
    151157                if ((fModeFlagSet & ModeFlagType::UNIX_LINES_MODE_FLAG) != 0) {
    152158                    return makeLookAheadAssertion(makeCC('\n'));
  • icGREP/icgrep-devel/icgrep/re/re_toolchain.cpp

    r5785 r5786  
    8484    }
    8585    r = RE_Star_Normal::star_normal(r);
    86    
     86
     87    r = resolveGraphemeMode(r, false /* not in grapheme mode at top level*/);
     88    if (PrintOptions.isSet(ShowAllREs)) {
     89        errs() << "resolveGraphemeMode:\n" << Printer_RE::PrintRE(r) << '\n';
     90    }
     91    r = re::resolveUnicodeProperties(r);
     92    if (PrintOptions.isSet(ShowAllREs) || PrintOptions.isSet(ShowStrippedREs)) {
     93        errs() << "resolveUnicodeProperties:\n" << Printer_RE::PrintRE(r) << '\n';
     94    }
     95
    8796    r = RE_Simplifier::simplify(r);
    8897   
     
    98107    if (PrintOptions.isSet(ShowAllREs)) {
    99108        errs() << "resolveCaseInsensitiveMode:\n" << Printer_RE::PrintRE(r) << '\n';
    100     }
    101     r = resolveGraphemeMode(r, false /* not in grapheme mode at top level*/);
    102     if (PrintOptions.isSet(ShowAllREs)) {
    103         errs() << "resolveGraphemeMode:\n" << Printer_RE::PrintRE(r) << '\n';
    104109    }
    105110    r = re::resolveNames(r);
     
    173178    cc::CC_Compiler cc_compiler(kernel, basis);
    174179    RE_Compiler re_compiler(kernel, cc_compiler);
    175     re_ast = re_compiler.compileUnicodeNames(re_ast);
    176180    return re_compiler.compile(re_ast);
    177181}
Note: See TracChangeset for help on using the changeset viewer.