Changeset 5901 for icGREP


Ignore:
Timestamp:
Mar 11, 2018, 5:53:16 PM (10 months ago)
Author:
cameron
Message:

RE compiler can generate UTF8_nonfinal if not provided externally

Location:
icGREP/icgrep-devel/icgrep/re
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/re/re_compiler.cpp

    r5900 r5901  
    2929#include <cc/alphabet.h>
    3030#include <cc/cc_compiler.h>
     31#include <UCD/ucd_compiler.hpp>
    3132#include "pablo/builder.hpp"        // for PabloBuilder
    3233#include <llvm/ADT/STLExtras.h> // for make_unique
     
    5455    PabloBuilder pb(mEntryScope);
    5556    mExternalNameMap.insert(std::make_pair(precompiledName, precompiledStream));
    56     if (precompiledName == "UTF8_nonfinal") {
    57         mNonFinal = precompiledStream;
    58         mFinal = pb.createNot(precompiledStream);
    59     }
    6057}
    6158
     
    240237        return makeMarker(InitialPostPositionUnit, combine);
    241238    }
    242     combine = pb.createOr(pb.createOr(pb.createAnd(combine, mFinal), pb.createScanThru(pb.createAnd(mNonFinal, combine), mNonFinal)), accum[FinalPostPositionUnit], "alt");
     239    combine = pb.createOr(pb.createOr(pb.createAnd(combine, u8Final(pb)), pb.createScanThru(pb.createAnd(u8NonFinal(pb), combine), u8NonFinal(pb))), accum[FinalPostPositionUnit], "alt");
    243240    return makeMarker(FinalPostPositionUnit, combine);
    244241}
     
    404401        else if (isUnicodeUnitLength(repeated)) {
    405402            PabloAST * cc = markerVar(compile(repeated, pb));
    406             PabloAST * cc_lb = consecutive_matches(cc, 1, lb, mFinal, pb);
     403            PabloAST * cc_lb = consecutive_matches(cc, 1, lb, u8Final(pb), pb);
    407404            const auto pos = markerPos(marker) == FinalMatchUnit ? lb : lb - 1;
    408             PabloAST * marker_fwd = pb.createIndexedAdvance(markerVar(marker), mFinal, pos);
     405            PabloAST * marker_fwd = pb.createIndexedAdvance(markerVar(marker), u8Final(pb), pos);
    409406            return makeMarker(FinalMatchUnit, pb.createAnd(marker_fwd, cc_lb, "lowerbound"));
    410407        }
     
    467464            // MatchStar deposits any cursors on the post position. However those cursors may land on the initial "byte" of a
    468465            // "multi-byte" character. Combine the masked range with any nonFinals.
    469             PabloAST * bounded = pb.createMatchStar(cursor, pb.createOr(masked, mNonFinal), "bounded");
     466            PabloAST * bounded = pb.createMatchStar(cursor, pb.createOr(masked, u8NonFinal(pb)), "bounded");
    470467            return makeMarker(FinalPostPositionUnit, bounded);
    471468        }
    472469        else if (isUnicodeUnitLength(repeated)) {
    473             // For a regexp which represents a single Unicode codepoint, we can use the mFinal stream
     470            // For a regexp which represents a single Unicode codepoint, we can use the u8Final(pb) stream
    474471            // as an index stream for an indexed advance operation.
    475472            PabloAST * cursor = markerVar(AdvanceMarker(marker, FinalPostPositionUnit, pb));
    476             PabloAST * upperLimitMask = reachable(cursor, 1, ub - 1, mFinal, pb);
     473            PabloAST * upperLimitMask = reachable(cursor, 1, ub - 1, u8Final(pb), pb);
    477474            PabloAST * masked = pb.createAnd(markerVar(compile(repeated, pb)), upperLimitMask, "masked");
    478             PabloAST * bounded = pb.createMatchStar(cursor, pb.createOr(masked, mNonFinal), "bounded");
     475            PabloAST * bounded = pb.createMatchStar(cursor, pb.createOr(masked, u8NonFinal(pb)), "bounded");
    479476            return makeMarker(FinalPostPositionUnit, bounded);
    480477        }
     
    486483                PabloAST * upperLimitMask = reachable(cursor, 1, ub - 1, firstCCstream, pb);
    487484                PabloAST * masked = pb.createAnd(markerVar(AdvanceMarker(compile(repeated, pb), FinalPostPositionUnit, pb)), upperLimitMask, "masked");
    488                 PabloAST * bounded = pb.createMatchStar(cursor, pb.createOr(masked, mNonFinal), "bounded");
     485                PabloAST * bounded = pb.createMatchStar(cursor, pb.createOr(masked, u8NonFinal(pb)), "bounded");
    489486                return makeMarker(FinalPostPositionUnit, bounded);
    490487            }
     
    519516        PabloAST * mask = markerVar(compile(repeated, pb));
    520517        // The post position character may land on the initial byte of a multi-byte character. Combine them with the masked range.
    521         mask = pb.createOr(mask, mNonFinal);
     518        mask = pb.createOr(mask, u8NonFinal(pb));
    522519        PabloAST * unbounded = pb.createMatchStar(base, mask, "unbounded");
    523520        return makeMarker(FinalPostPositionUnit, unbounded);
    524521    } else if (isUnicodeUnitLength(repeated) && LLVM_LIKELY(!AlgorithmOptionIsSet(DisableMatchStar) && !AlgorithmOptionIsSet(DisableUnicodeMatchStar))) {
    525522        PabloAST * mask = markerVar(compile(repeated, pb));
    526         mask = pb.createOr(mask, mNonFinal);
     523        mask = pb.createOr(mask, u8NonFinal(pb));
    527524        PabloAST * unbounded = pb.createMatchStar(base, mask);
    528         return makeMarker(FinalPostPositionUnit, pb.createAnd(unbounded, mFinal, "unbounded"));
     525        return makeMarker(FinalPostPositionUnit, pb.createAnd(unbounded, u8Final(pb), "unbounded"));
    529526    } else if (mStarDepth > 0){
    530527        PabloBuilder * const outer = pb.getParent();
     
    583580        }
    584581        if (newpos == FinalPostPositionUnit) {
    585             marker.stream = pb.createOr(pb.createAnd(marker.stream, mFinal), pb.createScanThru(pb.createAnd(mNonFinal, marker.stream), mNonFinal, "fpp"));
     582            marker.stream = pb.createOr(pb.createAnd(marker.stream, u8Final(pb)), pb.createScanThru(pb.createAnd(u8NonFinal(pb), marker.stream), u8NonFinal(pb), "fpp"));
    586583            marker.pos = FinalPostPositionUnit;
    587584        }
     
    597594    }
    598595}
     596
     597pablo::PabloAST * RE_Compiler::u8NonFinal(pablo::PabloBuilder & pb) {
     598    MarkerType m;
     599    auto f = mExternalNameMap.find("UTF8_nonfinal");
     600    if (f!= mExternalNameMap.end()) {
     601        return f->second;
     602    }
     603    if (LLVM_LIKELY(mCompiledName->get(mNonFinalName, m))) {
     604        return markerVar(m);
     605    }
     606    m = compile(mNonFinalName->getDefinition(), pb);
     607    mCompiledName->add(mNonFinalName, m);
     608    return markerVar(m);
     609}
     610
     611pablo::PabloAST * RE_Compiler::u8Final(pablo::PabloBuilder & pb) {
     612    return pb.createNot(u8NonFinal(pb));
     613}
     614
    599615   
    600616LLVM_ATTRIBUTE_NORETURN void RE_Compiler::UnsupportedRE(std::string errmsg) {
     
    606622, mCCCompiler(ccCompiler)
    607623, mLineBreak(nullptr)
    608 , mNonFinal(nullptr)
    609 , mFinal(nullptr)
    610624, mWhileTest(nullptr)
    611625, mStarDepth(0)
     
    613627    PabloBuilder pb(mEntryScope);
    614628    mLineBreak = pb.createZeroes();  // default so "^/$" matches start/end of text only
    615     mNonFinal = pb.createZeroes();
    616     mFinal = pb.createOnes();
     629    mNonFinalName = makeName("u8NonFinal", makeAlt({makeByte(0xC0, 0xFF),
     630                               makeSeq({makeByte(0xE0, 0xFF), makeByte(0x00, 0xFF)}),
     631                               makeSeq({makeByte(0xF0, 0xFF), makeByte(0x00, 0xFF), makeByte(0x00, 0xFF)})}));
    617632}
    618633
  • icGREP/icgrep-devel/icgrep/re/re_compiler.h

    r5888 r5901  
    124124    MarkerType AdvanceMarker(MarkerType marker, const MarkerPosition newpos, pablo::PabloBuilder & pb);
    125125    void AlignMarkers(MarkerType & m1, MarkerType & m2, pablo::PabloBuilder & pb);
     126   
     127    pablo::PabloAST * u8NonFinal(pablo::PabloBuilder & pb);
     128    pablo::PabloAST * u8Final(pablo::PabloBuilder & pb);
    126129
    127130    static inline MarkerPosition markerPos(const MarkerType & m) {return m.pos; }
     
    137140    cc::CC_Compiler &                               mCCCompiler;
    138141    pablo::PabloAST *                               mLineBreak;
    139     pablo::PabloAST *                               mNonFinal;
    140     pablo::PabloAST *                               mFinal;
     142    re::Name *                                      mNonFinalName;
    141143    pablo::PabloAST *                               mWhileTest;
    142144    int                                             mStarDepth;
Note: See TracChangeset for help on using the changeset viewer.