Changeset 5897 for icGREP


Ignore:
Timestamp: Mar 10, 2018, 11:44:44 AM (11 months ago)
Author: cameron
Message: RE compiler restructuring progress
Location: icGREP/icgrep-devel/icgrep
Files: 4 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/grep/grep_engine.cpp

    r5894 r5897  
    7777    pattern = resolveCaseInsensitiveMode(pattern, false);
    7878    pattern = regular_expression_passes(pattern);
    79    
    80    
     79    pattern = re::exclude_CC(pattern, re::makeByte(0x0A));
     80    pattern = resolveAnchors(pattern, re::makeByte(0x0A));
     81
    8182    ParabixDriver pxDriver("codepointEngine");
    8283    auto & idb = pxDriver.getBuilder();
     
    214215    if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
    215216        breakCC = re::makeCC(re::makeCC(0x0A, 0x0D), re::makeCC(re::makeCC(0x85), re::makeCC(0x2028, 0x2029)));
    216         breakName = "UTF8_LB";
    217217    } else if (mGrepRecordBreak == GrepRecordBreakKind::Null) {
    218218        breakCC = re::makeByte(0);  // Null
    219         breakName = "NULL";
    220219    } else {
    221220        breakCC = re::makeByte(0x0A); // LF
    222         breakName = "LF";
     221    }
     222    re::RE * anchorRE = breakCC;
     223    if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
     224        re::Name * anchorName = re::makeName("UTF8_LB", re::Name::Type::Unicode);
     225        anchorName->setDefinition(UCD::UnicodeBreakRE());
     226        anchorRE = anchorName;
    223227    }
    224228
     
    226230        REs[i] = resolveModesAndExternalSymbols(REs[i]);
    227231        REs[i] = re::exclude_CC(REs[i], breakCC);
    228         if (mGrepRecordBreak != GrepRecordBreakKind::Unicode) {
    229             REs[i] = resolveAnchors(REs[i], breakCC);
    230         }
     232        REs[i] = resolveAnchors(REs[i], anchorRE);
    231233        re::gatherUnicodeProperties(REs[i], UnicodeProperties);
    232234        REs[i] = regular_expression_passes(REs[i]);
  • icGREP/icgrep-devel/icgrep/re/re_compiler.cpp

    r5890 r5897  
    5757        mNonFinal = precompiledStream;
    5858        mFinal = pb.createNot(precompiledStream);
    59     }
    60     if (precompiledName == "UTF8_LB") {
    61         mLineBreak = precompiledStream;
    6259    }
    6360}
     
    154151
    155152inline MarkerType RE_Compiler::compileName(Name * const name, MarkerType marker, PabloBuilder & pb) {
     153    if (name->getType() == Name::Type::Capture) {
     154        return process(name->getDefinition(), marker, pb);
     155    } else if (name->getType() == Name::Type::Reference) {
     156        llvm::report_fatal_error("back references not supported in icgrep.");
     157    }
    156158    const auto & nameString = name->getName();
    157     if (nameString == ".") {
    158         return compileAny(marker, pb);
    159     } else if (nameString == "^"){
    160         return compileStart(marker, pb);
    161     } else if (nameString == "$"){
    162         return compileEnd(marker, pb);
    163     } else if (isUnicodeUnitLength(name)) {
    164         MarkerType nameMarker = compileName(name, pb);
     159    MarkerType nameMarker = compileName(name, pb);
     160    if (isUnicodeUnitLength(name)) {
    165161        MarkerType nextPos = AdvanceMarker(marker, FinalPostPositionUnit, pb);
    166162        nameMarker.stream = pb.createAnd(markerVar(nextPos), markerVar(nameMarker), name->getName());
    167163        return nameMarker;
    168164    } else if (name->getType() == Name::Type::ZeroWidth) {
    169         auto f = mExternalNameMap.find(nameString);
    170         if (f != mExternalNameMap.end()) {
    171             MarkerType z = makeMarker(FinalPostPositionUnit, f->second);
    172             AlignMarkers(marker, z, pb);
    173             PabloAST * ze = markerVar(z);
    174             return makeMarker(markerPos(marker), pb.createAnd(markerVar(marker), ze, "zerowidth"));
    175         }
    176         RE * zerowidth = name->getDefinition();
    177         MarkerType zero = compile(zerowidth, pb);
    178         AlignMarkers(marker, zero, pb);
    179         PabloAST * ze = markerVar(zero);
     165        AlignMarkers(marker, nameMarker, pb);
     166        PabloAST * ze = markerVar(nameMarker);
    180167        return makeMarker(markerPos(marker), pb.createAnd(markerVar(marker), ze, "zerowidth"));
    181168    } else {
    182         return process(name->getDefinition(), marker, pb);
     169        llvm::report_fatal_error(nameString + " is neither Unicode unit length nor ZeroWidth.");
    183170    }
    184171}
    185172
    186173inline MarkerType RE_Compiler::compileName(Name * const name, PabloBuilder & pb) {
     174    const auto & nameString = name->getName();
    187175    MarkerType m;
    188176    if (LLVM_LIKELY(mCompiledName->get(name, m))) {
    189177        return m;
    190     } else if (LLVM_LIKELY(name->getDefinition() != nullptr)) {
     178    }
     179    auto f = mExternalNameMap.find(nameString);
     180    if (f != mExternalNameMap.end()) {
     181        if (name->getType() == Name::Type::ZeroWidth) return makeMarker(FinalPostPositionUnit, f->second);
     182        else return makeMarker(FinalMatchUnit, f->second);
     183    }
     184    if (LLVM_LIKELY(name->getDefinition() != nullptr)) {
    191185        m = compile(name->getDefinition(), pb);
    192186        mCompiledName->add(name, m);
     
    569563    PabloAST * const nextPos = markerVar(AdvanceMarker(marker, FinalPostPositionUnit, pb));
    570564    PabloAST * const atEOL = pb.createAnd(mLineBreak, nextPos, "eol");
    571     //PabloAST * const atEOL = pb.createOr(pb.createAnd(mLineBreak, nextPos), pb.createAdvance(pb.createAnd(nextPos, mCRLF), 1), "eol");
    572565    return makeMarker(FinalPostPositionUnit, atEOL);
    573566}
  • icGREP/icgrep-devel/icgrep/re/re_multiplex.cpp

    r5847 r5897  
    104104            if (name->getType() == Name::Type::ZeroWidth)
    105105                re = makeZeroWidth(name->getName(), xfrm);
     106            else if (name->getType() == Name::Type::Capture)
     107                re = makeCapture(name->getName(), xfrm);
    106108            else
    107109                re = makeName(name->getName(), xfrm);
  • icGREP/icgrep-devel/icgrep/re/re_name_resolve.cpp

    r5872 r5897  
    1919#include <cc/alphabet.h>
    2020#include <boost/container/flat_set.hpp>
    21 #include <sstream>
     21#include <llvm/Support/ErrorHandling.h>
     22
    2223
    2324using namespace boost::container;
     
    159160}
    160161
    161 RE * resolveAnchors(RE * r, RE * breakRE) {
     162struct AnchorResolution {
     163    RE * mAnchorRE;
     164    bool mIsNegated;
     165    RE * resolve(RE * r);
     166};
     167   
     168RE * AnchorResolution::resolve(RE * r) {
    162169    if (!hasAnchor(r)) return r;
    163170    if (const Alt * alt = dyn_cast<Alt>(r)) {
     
    165172        list.reserve(alt->size());
    166173        for (RE * item : *alt) {
    167             item = resolveAnchors(item, breakRE);
     174            item = resolve(item);
    168175            list.push_back(item);
    169176        }
     
    173180        list.reserve(seq->size());
    174181        for (RE * item : *seq) {
    175             item = resolveAnchors(item, breakRE);
     182            item = resolve(item);
    176183            list.push_back(item);
    177184        }
    178185        return makeSeq(list.begin(), list.end());
    179186    } else if (Assertion * a = dyn_cast<Assertion>(r)) {
    180         return makeAssertion(resolveAnchors(a->getAsserted(), breakRE), a->getKind(), a->getSense());
     187        return makeAssertion(resolve(a->getAsserted()), a->getKind(), a->getSense());
    181188    } else if (Rep * rep = dyn_cast<Rep>(r)) {
    182         return makeRep(resolveAnchors(rep->getRE(), breakRE), rep->getLB(), rep->getUB());
     189        return makeRep(resolve(rep->getRE()), rep->getLB(), rep->getUB());
    183190    } else if (Diff * diff = dyn_cast<Diff>(r)) {
    184         return makeDiff(resolveAnchors(diff->getLH(), breakRE), resolveAnchors(diff->getRH(), breakRE));
     191        return makeDiff(resolve(diff->getLH()), resolve(diff->getRH()));
    185192    } else if (Intersect * e = dyn_cast<Intersect>(r)) {
    186         return makeIntersect(resolveAnchors(e->getLH(), breakRE), resolveAnchors(e->getRH(), breakRE));
     193        return makeIntersect(resolve(e->getLH()), resolve(e->getRH()));
    187194    } else if (isa<Start>(r)) {
    188         return makeAlt({r, makeLookBehindAssertion(breakRE)});
     195        if (mIsNegated) return makeNegativeLookBehindAssertion(mAnchorRE);
     196        else return makeAlt({makeSOT(),
     197                             makeLookBehindAssertion(mAnchorRE)});
    189198    } else if (isa<End>(r)) {
    190         return makeAlt({r, makeLookAheadAssertion(breakRE)});
    191     }
    192 }
    193 }
     199        if (mIsNegated) return makeNegativeLookAheadAssertion(mAnchorRE);
     200        else return makeAlt({makeEOT(),
     201                             makeLookAheadAssertion(mAnchorRE)});
     202    }
     203}
     204
     205RE * resolveAnchors(RE * r, RE * breakRE) {
     206    AnchorResolution a;
     207    if (const CC * cc = dyn_cast<CC>(breakRE)) {
     208        a.mIsNegated = true;
     209        if (cc->getAlphabet() == &cc::Unicode) {
     210            a.mAnchorRE = makeDiff(makeCC(0, 0x10FFFF), breakRE);
     211        } else if (cc->getAlphabet() == &cc::Byte) {
     212            a.mAnchorRE = makeDiff(makeByte(0, 0xFF), breakRE);
     213        } else {
     214            llvm::report_fatal_error("resolveAnchors: unexpected alphabet " + cc->getAlphabet()->getName());
     215        }
     216    } else {
     217        a.mIsNegated = false;
     218        a.mAnchorRE = breakRE;
     219    }
     220    return a.resolve(r);
     221}
     222                                                       
     223}
Note: See TracChangeset for help on using the changeset viewer.