Changeset 5902 for icGREP


Ignore:
Timestamp:
Mar 12, 2018, 7:22:06 AM (13 months ago)
Author:
cameron
Message:

Initial deployment of bytegrep kernel in icgrep

Location:
icGREP/icgrep-devel/icgrep
Files:
7 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/grep/grep_engine.cpp

    r5900 r5902  
    2828#include <re/casing.h>
    2929#include <re/exclude_CC.h>
     30#include <re/to_utf8.h>
    3031#include <re/re_toolchain.h>
    3132#include <toolchain/toolchain.h>
     33#include <re/re_analysis.h>
    3234#include <re/re_name_resolve.h>
    3335#include <re/re_name_gather.h>
     
    237239    }
    238240   
    239     StreamSetBuffer * BasisBits = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(encodingBits, 1), baseBufferSize);
    240     kernel::Kernel * s2pk = nullptr;
    241     if (PabloTransposition) {
    242         s2pk = mGrepDriver->addKernelInstance<kernel::S2P_PabloKernel>(idb);
    243     }
    244     else {
    245         s2pk = mGrepDriver->addKernelInstance<kernel::S2PKernel>(idb);
    246     }
    247     mGrepDriver->makeKernelCall(s2pk, {ByteStream}, {BasisBits});
    248 
    249241    StreamSetBuffer * LineBreakStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
    250     StreamSetBuffer * RequiredStreams = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
    251     StreamSetBuffer * UnicodeLB = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
    252 
    253     StreamSetBuffer * LineFeedStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
    254     kernel::Kernel * linefeedK = mGrepDriver->addKernelInstance<kernel::LineFeedKernelBuilder>(idb, Binding{idb->getStreamSetTy(8), "basis", FixedRate(), Principal()});
    255     mGrepDriver->makeKernelCall(linefeedK, {BasisBits}, {LineFeedStream});
    256    
    257     kernel::Kernel * requiredStreamsK = mGrepDriver->addKernelInstance<kernel::RequiredStreams_UTF8>(idb);
    258     mGrepDriver->makeKernelCall(requiredStreamsK, {BasisBits, LineFeedStream}, {RequiredStreams, UnicodeLB});
    259 
    260     if (mGrepRecordBreak == GrepRecordBreakKind::LF) {
    261         LineBreakStream = LineFeedStream;
    262     } else if (mGrepRecordBreak == GrepRecordBreakKind::Null) {
    263         kernel::Kernel * breakK = mGrepDriver->addKernelInstance<kernel::ParabixCharacterClassKernelBuilder>(idb, "Null", std::vector<re::CC *>{breakCC}, 8);
    264         mGrepDriver->makeKernelCall(breakK, {BasisBits}, {LineBreakStream});
     242    std::vector<StreamSetBuffer *> MatchResultsBufs(nREs);
     243   
     244    // For simple regular expressions with a small number of characters, we
     245    // can bypass transposition and use the Direct CC compiler.
     246    if ((nREs == 1) && (mGrepRecordBreak != GrepRecordBreakKind::Unicode) && (!anyGCB) && byteTestsWithinLimit(REs[0], 6)) {
     247        StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
     248        kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ByteGrepKernel>(idb, REs[0]);
     249        mGrepDriver->makeKernelCall(icgrepK, {ByteStream}, {MatchResults});
     250        MatchResultsBufs[0] = MatchResults;
     251        kernel::Kernel * breakK = mGrepDriver->addKernelInstance<kernel::DirectCharacterClassKernelBuilder>(idb, "Null", std::vector<re::CC *>{breakCC}, 1);
     252        mGrepDriver->makeKernelCall(breakK, {ByteStream}, {LineBreakStream});
    265253    } else {
    266         LineBreakStream = UnicodeLB;
    267     }
    268    
    269     std::map<std::string, StreamSetBuffer *> propertyStream;
    270     if (PropertyKernels) {
    271         for (auto p : UnicodeProperties) {
    272             auto name = p->getFullName();
    273             StreamSetBuffer * s = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
    274             propertyStream.emplace(std::make_pair(name, s));
    275             kernel::Kernel * propertyK = mGrepDriver->addKernelInstance<kernel::UnicodePropertyKernelBuilder>(idb, p);
    276             mGrepDriver->makeKernelCall(propertyK, {BasisBits}, {s});
    277         }
    278     }
    279     StreamSetBuffer * GCB_stream = nullptr;
    280     if (anyGCB) {
    281         GCB_stream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
    282         kernel::Kernel * gcbK = mGrepDriver->addKernelInstance<kernel::GraphemeClusterBreakKernel>(idb);
    283         mGrepDriver->makeKernelCall(gcbK, {BasisBits, RequiredStreams}, {GCB_stream});
    284     }
    285 
    286     std::vector<StreamSetBuffer *> MatchResultsBufs(nREs);
    287     for(unsigned i = 0; i < nREs; ++i) {
    288         std::vector<std::string> externalStreamNames = std::vector<std::string>{"UTF8_LB", "UTF8_nonfinal"};
    289         std::vector<StreamSetBuffer *> icgrepInputSets = {BasisBits, LineBreakStream, RequiredStreams};
    290         std::set<re::Name *> UnicodeProperties;
     254        StreamSetBuffer * BasisBits = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(encodingBits, 1), baseBufferSize);
     255        kernel::Kernel * s2pk = nullptr;
     256        if (PabloTransposition) {
     257            s2pk = mGrepDriver->addKernelInstance<kernel::S2P_PabloKernel>(idb);
     258        }
     259        else {
     260            s2pk = mGrepDriver->addKernelInstance<kernel::S2PKernel>(idb);
     261        }
     262        mGrepDriver->makeKernelCall(s2pk, {ByteStream}, {BasisBits});
     263
     264        StreamSetBuffer * RequiredStreams = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
     265        StreamSetBuffer * UnicodeLB = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
     266
     267        StreamSetBuffer * LineFeedStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
     268        kernel::Kernel * linefeedK = mGrepDriver->addKernelInstance<kernel::LineFeedKernelBuilder>(idb, Binding{idb->getStreamSetTy(8), "basis", FixedRate(), Principal()});
     269        mGrepDriver->makeKernelCall(linefeedK, {BasisBits}, {LineFeedStream});
     270       
     271        kernel::Kernel * requiredStreamsK = mGrepDriver->addKernelInstance<kernel::RequiredStreams_UTF8>(idb);
     272        mGrepDriver->makeKernelCall(requiredStreamsK, {BasisBits, LineFeedStream}, {RequiredStreams, UnicodeLB});
     273
     274        if (mGrepRecordBreak == GrepRecordBreakKind::LF) {
     275            LineBreakStream = LineFeedStream;
     276        } else if (mGrepRecordBreak == GrepRecordBreakKind::Null) {
     277            kernel::Kernel * breakK = mGrepDriver->addKernelInstance<kernel::ParabixCharacterClassKernelBuilder>(idb, "Null", std::vector<re::CC *>{breakCC}, 8);
     278            mGrepDriver->makeKernelCall(breakK, {BasisBits}, {LineBreakStream});
     279        } else {
     280            LineBreakStream = UnicodeLB;
     281        }
     282       
     283        std::map<std::string, StreamSetBuffer *> propertyStream;
    291284        if (PropertyKernels) {
    292             re::gatherUnicodeProperties(REs[i], UnicodeProperties);
    293285            for (auto p : UnicodeProperties) {
    294286                auto name = p->getFullName();
    295                 auto f = propertyStream.find(name);
    296                 if (f == propertyStream.end()) report_fatal_error(name + " not found\n");
    297                 externalStreamNames.push_back(name);
    298                 icgrepInputSets.push_back(f->second);
    299             }
    300         }
    301         if (hasGCB[i]) {
    302             externalStreamNames.push_back("\\b{g}");
    303             icgrepInputSets.push_back(GCB_stream);
    304         }
    305         if (CC_Multiplexing) {
    306             const auto UnicodeSets = re::collectUnicodeSets(REs[i], std::set<re::Name *>({re::makeZeroWidth("\\b{g}")}));
    307             StreamSetBuffer * const MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
    308             if (UnicodeSets.size() <= 1) {
     287                StreamSetBuffer * s = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
     288                propertyStream.emplace(std::make_pair(name, s));
     289                kernel::Kernel * propertyK = mGrepDriver->addKernelInstance<kernel::UnicodePropertyKernelBuilder>(idb, p);
     290                mGrepDriver->makeKernelCall(propertyK, {BasisBits}, {s});
     291            }
     292        }
     293        StreamSetBuffer * GCB_stream = nullptr;
     294        if (anyGCB) {
     295            GCB_stream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
     296            kernel::Kernel * gcbK = mGrepDriver->addKernelInstance<kernel::GraphemeClusterBreakKernel>(idb);
     297            mGrepDriver->makeKernelCall(gcbK, {BasisBits, RequiredStreams}, {GCB_stream});
     298        }
     299
     300        for(unsigned i = 0; i < nREs; ++i) {
     301            std::vector<std::string> externalStreamNames;
     302            std::vector<StreamSetBuffer *> icgrepInputSets = {BasisBits};
     303            if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
     304                externalStreamNames.push_back("UTF8_LB");
     305                icgrepInputSets.push_back(LineBreakStream);
     306                externalStreamNames.push_back("UTF8_nonfinal");
     307                icgrepInputSets.push_back(RequiredStreams);
     308            }
     309            std::set<re::Name *> UnicodeProperties;
     310            if (PropertyKernels) {
     311                re::gatherUnicodeProperties(REs[i], UnicodeProperties);
     312                for (auto p : UnicodeProperties) {
     313                    auto name = p->getFullName();
     314                    auto f = propertyStream.find(name);
     315                    if (f == propertyStream.end()) report_fatal_error(name + " not found\n");
     316                    externalStreamNames.push_back(name);
     317                    icgrepInputSets.push_back(f->second);
     318                }
     319            }
     320            if (hasGCB[i]) {
     321                externalStreamNames.push_back("\\b{g}");
     322                icgrepInputSets.push_back(GCB_stream);
     323            }
     324            if (CC_Multiplexing) {
     325                const auto UnicodeSets = re::collectUnicodeSets(REs[i], std::set<re::Name *>({re::makeZeroWidth("\\b{g}")}));
     326                StreamSetBuffer * const MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
     327                if (UnicodeSets.size() <= 1) {
     328                    kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i], externalStreamNames);
     329                    mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
     330                    MatchResultsBufs[i] = MatchResults;
     331                } else {
     332                    mpx = make_unique<MultiplexedAlphabet>("mpx", UnicodeSets);
     333                    REs[i] = transformCCs(mpx.get(), REs[i]);
     334                    std::vector<re::CC *> mpx_basis = mpx->getMultiplexedCCs();
     335                    auto numOfCharacterClasses = mpx_basis.size();
     336                    StreamSetBuffer * CharClasses = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), baseBufferSize);
     337                    kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis));
     338                    mGrepDriver->makeKernelCall(ccK, {BasisBits}, {CharClasses});
     339    //                kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis), true);
     340    //                mGrepDriver->makeKernelCall(ccK, {ByteStream}, {CharClasses});
     341                    kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i], externalStreamNames, std::vector<cc::Alphabet *>{mpx.get()});
     342                    icgrepInputSets.push_back(CharClasses);
     343                    mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
     344                    MatchResultsBufs[i] = MatchResults;
     345                }
     346            } else {
     347                StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
    309348                kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i], externalStreamNames);
    310349                mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
    311350                MatchResultsBufs[i] = MatchResults;
    312             } else {
    313                 mpx = make_unique<MultiplexedAlphabet>("mpx", UnicodeSets);
    314                 REs[i] = transformCCs(mpx.get(), REs[i]);
    315                 std::vector<re::CC *> mpx_basis = mpx->getMultiplexedCCs();
    316                 auto numOfCharacterClasses = mpx_basis.size();
    317                 StreamSetBuffer * CharClasses = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), baseBufferSize);
    318                 kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis));
    319                 mGrepDriver->makeKernelCall(ccK, {BasisBits}, {CharClasses});
    320 //                kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis), true);
    321 //                mGrepDriver->makeKernelCall(ccK, {ByteStream}, {CharClasses});
    322                 kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i], externalStreamNames, std::vector<cc::Alphabet *>{mpx.get()});
    323                 icgrepInputSets.push_back(CharClasses);
    324                 mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
    325                 MatchResultsBufs[i] = MatchResults;
    326             }
    327         } else {
    328             StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
    329             kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i], externalStreamNames);
    330             mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
    331             MatchResultsBufs[i] = MatchResults;
     351            }
    332352        }
    333353    }
  • icGREP/icgrep-devel/icgrep/kernels/cc_kernel.cpp

    r5872 r5902  
    88#include <cc/cc_compiler.h>
    99#include <kernels/kernel_builder.h>
     10#include <llvm/Support/raw_ostream.h>
    1011
    1112using namespace cc;
     
    2324, mCharClasses(charClasses)
    2425, mCodeUnitSize(codeUnitSize) {
    25 
     26    if (codeUnitSize > 4) errs() << "codeUnitsize of " << codeUnitSize << " too large!\n";
    2627}
    2728
  • icGREP/icgrep-devel/icgrep/kernels/grep_kernel.cpp

    r5900 r5902  
    77#include <boost/uuid/sha1.hpp>
    88#include <re/printer_re.h>
     9#include <re/re_cc.h>
     10#include <re/re_name.h>
    911#include <re/re_toolchain.h>
    1012#include <re/re_reverse.h>
     
    2426#include <cc/multiplex_CCs.h>
    2527#include <re/re_compiler.h>
     28#include <UCD/ucd_compiler.hpp>
    2629#include <llvm/Support/raw_ostream.h>
    2730
     
    4043             digest[0], digest[1], digest[2], digest[3], digest[4]);
    4144    return std::string(buffer);
     45}
     46
     47
     48UnicodeLineBreakKernel::UnicodeLineBreakKernel(const std::unique_ptr<kernel::KernelBuilder> & kb)
     49: PabloKernel(kb,
     50              "UTF8_LB",
     51              {Binding{kb->getStreamSetTy(8), "basis"}, Binding{kb->getStreamSetTy(1), "lf", FixedRate(), LookAhead(1)}},
     52              {Binding{kb->getStreamSetTy(1, 1), "UTF8_LB", FixedRate()}}) {
     53}
     54
     55void UnicodeLineBreakKernel::generatePabloMethod() {
     56        PabloBuilder pb(getEntryScope());
     57        cc::Parabix_CC_Compiler ccc(getEntryScope(), getInputStreamSet("basis"));
     58        UCD::UCDCompiler ucdCompiler(ccc);
     59   
     60    Name * breakChars = re::makeName("breakChars", makeCC(makeCC(makeCC(0x0A, 0x0D), makeCC(0x85)), makeCC(0x2028,0x2029)));
     61    UCD::UCDCompiler::NameMap nameMap;
     62    nameMap.emplace(breakChars, nullptr);
     63    ucdCompiler.generateWithDefaultIfHierarchy(nameMap, pb);
     64    auto f = nameMap.find(breakChars);
     65    if (f == nameMap.end()) llvm::report_fatal_error("UnicodeLineBreakKernel compilation failure");
     66    PabloAST * breakStream = f-> second;
     67    PabloAST * const LF = pb.createExtract(getInput(1), pb.getInteger(0), "LF");
     68    PabloAST * const CR = ccc.compileCC(makeByte(0x0D));
     69    Var * const CR_before_LF = pb.createVar("CR_before_LFCR_before_LF", pb.createZeroes());
     70    auto crb = pb.createScope();
     71    pb.createIf(CR, crb);
     72    PabloAST * const lookaheadLF = crb.createLookahead(LF, 1, "lookaheadLF");
     73    crb.createAssign(CR_before_LF, crb.createAnd(CR, lookaheadLF));
     74    breakStream = pb.createXor(breakStream, CR_before_LF);  // Remove CR_before_LF from breakStream
     75    Var * const UTF8_LB = getOutputStreamVar("UTF8_LB");
     76    pb.createAssign(pb.createExtract(UTF8_LB, pb.getInteger(0)), breakStream);
    4277}
    4378
     
    242277}
    243278
     279
     280ByteGrepSignature::ByteGrepSignature(RE * re)
     281: mRE(re)
     282, mSignature(Printer_RE::PrintRE(re) ) {
     283}
     284
     285ByteGrepKernel::ByteGrepKernel(const std::unique_ptr<kernel::KernelBuilder> & b, RE * const re, std::vector<std::string> externals)
     286: ByteGrepSignature(re)
     287, PabloKernel(b, "bBc" + sha1sum(mSignature),
     288              // inputs
     289{Binding{b->getStreamSetTy(1, 8), "byteData"}},
     290              // output
     291{Binding{b->getStreamSetTy(1, 1), "matches", FixedRate(), Add1()}})
     292, mExternals(externals) {
     293    for (auto & e : externals) {
     294        mStreamSetInputs.push_back(Binding{b->getStreamSetTy(1, 1), e});
     295    }
     296}
     297
     298std::string ByteGrepKernel::makeSignature(const std::unique_ptr<kernel::KernelBuilder> &) {
     299    return mSignature;
     300}
     301
     302
     303void ByteGrepKernel::generatePabloMethod() {
     304    PabloBuilder pb(getEntryScope());
     305    PabloAST * u8bytes = pb.createExtract(getInput(0), pb.getInteger(0));
     306    cc::Direct_CC_Compiler dcc(getEntryScope(), u8bytes);
     307    RE_Compiler re_byte_compiler(getEntryScope(), dcc);
     308    for (auto & e : mExternals) {
     309        re_byte_compiler.addPrecompiled(e, pb.createExtract(getInputStreamVar(e), pb.getInteger(0)));
     310    }
     311    PabloAST * const matches = re_byte_compiler.compile(mRE);
     312   
     313    Var * const output = getOutputStreamVar("matches");
     314    pb.createAssign(pb.createExtract(output, pb.getInteger(0)), matches);
     315}
     316
    244317// Helper to compute stream set inputs to pass into PabloKernel constructor.
    245318inline std::vector<Binding> byteBitGrepInputs(const std::unique_ptr<kernel::KernelBuilder> & b,
    246                                          const std::vector<std::string> & externals) {
     319                                              const std::vector<std::string> & externals) {
    247320    std::vector<Binding> streamSetInputs = {
    248321        Binding{b->getStreamSetTy(1, 8), "bytedata"},
     
    254327}
    255328
    256 
    257329ByteBitGrepSignature::ByteBitGrepSignature(RE * prefix, RE * suffix)
    258330: mPrefixRE(prefix)
    259331, mSuffixRE(suffix)
    260332, mSignature(Printer_RE::PrintRE(mPrefixRE) + Printer_RE::PrintRE(mSuffixRE) ) {
    261    
    262333}
    263334
  • icGREP/icgrep-devel/icgrep/kernels/grep_kernel.h

    r5889 r5902  
    1414
    1515   
     16class UnicodeNonFinalKernel : public pablo::PabloKernel {
     17public:
     18    UnicodeNonFinalKernel(const std::unique_ptr<kernel::KernelBuilder> & kb);
     19    bool isCachable() const override { return true; }
     20    bool hasSignature() const override { return false; }
     21protected:
     22    void generatePabloMethod() override;
     23};
     24
     25class UnicodeLineBreakKernel : public pablo::PabloKernel {
     26public:
     27    UnicodeLineBreakKernel(const std::unique_ptr<kernel::KernelBuilder> & kb);
     28    bool isCachable() const override { return true; }
     29    bool hasSignature() const override { return false; }
     30protected:
     31    void generatePabloMethod() override;
     32};
     33
    1634class RequiredStreams_UTF8 : public pablo::PabloKernel {
    1735public:
     
    5270};
    5371
     72struct ByteGrepSignature {
     73    ByteGrepSignature(re::RE * re);
     74protected:
     75    re::RE * const  mRE;
     76    std::string     mSignature;
     77};
     78
     79
     80class ByteGrepKernel : public ByteGrepSignature, public pablo::PabloKernel {
     81public:
     82    ByteGrepKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, re::RE * const re, std::vector<std::string> externals = {});
     83    std::string makeSignature(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) override;
     84    bool isCachable() const override { return true; }
     85protected:
     86    void generatePabloMethod() override;
     87    std::vector<std::string> mExternals;
     88};
     89   
    5490struct ByteBitGrepSignature {
    5591    ByteBitGrepSignature(re::RE * prefix, re::RE * suffix);
  • icGREP/icgrep-devel/icgrep/re/re_analysis.cpp

    r5899 r5902  
    1414#include <re/re_group.h>
    1515#include <re/re_nullable.h>
     16#include <re/to_utf8.h>
    1617#include <re/printer_re.h>
    1718#include <cc/alphabet.h>
     
    414415    UCD::UnicodeSet equalityTests;
    415416    UCD::UnicodeSet lessThanTests;
     417    unsigned testCount;
     418    unsigned testLimit;
    416419};
    417420
    418421void ByteTestComplexity::gatherTests(RE * re) {
    419422    if (CC * cc = dyn_cast<CC>(re)) {
    420         if (cc->getAlphabet() != &cc::Byte) report_fatal_error("ByteTestComplexity: non Byte alphabet");
    421         for (const auto range : *cc) {
    422             const auto lo = re::lo_codepoint(range);
    423             const auto hi = re::hi_codepoint(range);
    424             if (lo == hi) {
    425                 equalityTests.insert(lo);
    426             } else {
    427                 if (lo > 0) lessThanTests.insert(lo);
    428                 if (hi < 0xFF) lessThanTests.insert(hi+1);
     423        if (cc->getAlphabet() == &cc::Unicode) {
     424            gatherTests(toUTF8(re));
     425        } else {
     426            for (const auto range : *cc) {
     427                const auto lo = re::lo_codepoint(range);
     428                const auto hi = re::hi_codepoint(range);
     429                if (lo == hi) {
     430                    if (!equalityTests.contains(lo)) {
     431                        equalityTests.insert(lo);
     432                        testCount++;
     433                    }
     434                } else {
     435                    if (lo > 0) {
     436                        if (!lessThanTests.contains(lo)) {
     437                            lessThanTests.insert(lo);
     438                            testCount++;
     439                        }
     440                    }
     441                    if (hi < 0xFF) {
     442                        if (!lessThanTests.contains(hi+1)) {
     443                            lessThanTests.insert(hi+1);
     444                            testCount++;
     445                        }
     446                    }
     447                }
     448                if (testCount > testLimit) return;
    429449            }
    430450        }
     
    454474}
    455475
    456 unsigned byteTestComplexity(RE * re) {
     476bool byteTestsWithinLimit(RE * re, unsigned limit) {
    457477    ByteTestComplexity btc_object;
     478    btc_object.testCount = 0;
     479    btc_object.testLimit = limit;
    458480    btc_object.gatherTests(re);
    459     return btc_object.equalityTests.count() + btc_object.lessThanTests.count();
     481    return btc_object.testCount <= btc_object.testLimit;
    460482}
    461483
  • icGREP/icgrep-devel/icgrep/re/re_analysis.h

    r5892 r5902  
    3030bool hasAssertion(const RE * re);
    3131   
    32 unsigned byteTestComplexity(RE * re);
     32bool byteTestsWithinLimit(RE * re, unsigned limit);
    3333   
    3434void UndefinedNameError (const Name * n);
  • icGREP/icgrep-devel/icgrep/re/re_compiler.cpp

    r5901 r5902  
    627627    PabloBuilder pb(mEntryScope);
    628628    mLineBreak = pb.createZeroes();  // default so "^/$" matches start/end of text only
    629     mNonFinalName = makeName("u8NonFinal", makeAlt({makeByte(0xC0, 0xFF),
    630                                makeSeq({makeByte(0xE0, 0xFF), makeByte(0x00, 0xFF)}),
    631                                makeSeq({makeByte(0xF0, 0xFF), makeByte(0x00, 0xFF), makeByte(0x00, 0xFF)})}));
     629    mNonFinalName = makeName("u8NonFinal", makeAlt({makeByte(0xC2, 0xF4),
     630                               makeSeq({makeByte(0xE0, 0xF4), makeByte(0x80, 0xBF)}),
     631                               makeSeq({makeByte(0xF0, 0xF4), makeByte(0x80, 0xBF), makeByte(0x80, 0xBF)})}));
    632632}
    633633
Note: See TracChangeset for help on using the changeset viewer.