Changeset 5908 for icGREP/icgrep-devel


Ignore:
Timestamp:
Mar 13, 2018, 1:17:42 PM (13 months ago)
Author:
cameron
Message:

Use the Byte-Bit grep kernel as an optimization when an RE begins with a trigraph (a three-element prefix within the byte-CC limit)

Location:
icGREP/icgrep-devel/icgrep
Files:
6 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/grep/grep_engine.cpp

    r5902 r5908  
    6262static cl::opt<bool> PropertyKernels("enable-property-kernels", cl::desc("Enable Unicode property kernels."), cl::init(false));
    6363
     64const unsigned DefaultByteCClimit = 6;
     65
     66static cl::opt<unsigned> ByteCClimit("byte-CC-limit", cl::desc("Max number of CCs for byte CC pipeline."), cl::init(DefaultByteCClimit));
     67
    6468
    6569namespace grep {
     
    196200    return (packSize * packSize) / b->getBitBlockWidth();
    197201}
    198 
     202   
    199203std::pair<StreamSetBuffer *, StreamSetBuffer *> GrepEngine::grepPipeline(std::vector<re::RE *> & REs, StreamSetBuffer * ByteStream) {
    200204    auto & idb = mGrepDriver->getBuilder();
     
    242246    std::vector<StreamSetBuffer *> MatchResultsBufs(nREs);
    243247   
     248    re::RE * prefixRE;
     249    re::RE * suffixRE;
    244250    // For simple regular expressions with a small number of characters, we
    245251    // can bypass transposition and use the Direct CC compiler.
    246     if ((nREs == 1) && (mGrepRecordBreak != GrepRecordBreakKind::Unicode) && (!anyGCB) && byteTestsWithinLimit(REs[0], 6)) {
     252    bool isSimple = (nREs == 1) && (mGrepRecordBreak != GrepRecordBreakKind::Unicode) && (!anyGCB);
     253    if (isSimple) {
     254        REs[0] = toUTF8(REs[0]);
     255    }
     256    if (isSimple && byteTestsWithinLimit(REs[0], ByteCClimit)) {
    247257        StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
    248258        kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ByteGrepKernel>(idb, REs[0]);
    249259        mGrepDriver->makeKernelCall(icgrepK, {ByteStream}, {MatchResults});
    250260        MatchResultsBufs[0] = MatchResults;
    251         kernel::Kernel * breakK = mGrepDriver->addKernelInstance<kernel::DirectCharacterClassKernelBuilder>(idb, "Null", std::vector<re::CC *>{breakCC}, 1);
     261        kernel::Kernel * breakK = mGrepDriver->addKernelInstance<kernel::DirectCharacterClassKernelBuilder>(idb, "breakCC", std::vector<re::CC *>{breakCC}, 1);
     262        mGrepDriver->makeKernelCall(breakK, {ByteStream}, {LineBreakStream});
     263    } else if (isSimple && hasTriCCwithinLimit(REs[0], ByteCClimit, prefixRE, suffixRE)) {
     264        StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
     265        kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ByteBitGrepKernel>(idb, prefixRE, suffixRE);
     266        mGrepDriver->makeKernelCall(icgrepK, {ByteStream}, {MatchResults});
     267        MatchResultsBufs[0] = MatchResults;
     268        kernel::Kernel * breakK = mGrepDriver->addKernelInstance<kernel::DirectCharacterClassKernelBuilder>(idb, "breakCC", std::vector<re::CC *>{breakCC}, 1);
    252269        mGrepDriver->makeKernelCall(breakK, {ByteStream}, {LineBreakStream});
    253270    } else {
  • icGREP/icgrep-devel/icgrep/kernels/grep_kernel.cpp

    r5902 r5908  
    357357    }
    358358    PabloAST * const prefixMatches = re_byte_compiler.compile(mPrefixRE);
    359    
     359    Var * const final_matches = pb.createVar("final_matches", pb.createZeroes());
    360360    PabloBlock * scope1 = getEntryScope()->createScope();
    361361    pb.createIf(prefixMatches, scope1);
     
    373373    std::vector<PabloAST *> basis(8);
    374374    for (unsigned i = 0; i < 4; i++) {
     375        // The basis indices are written as 7 - bit because of the confusion
     376        // between little-endian and big-endian bit numbering of bytes.
     377        // TODO: fix this by switching to little-endian numbering throughout.
    375378        basis[7-2*i] = scope1->createPackL(scope1->getInteger(2), bitpairs[i]);
    376379        basis[7-(2*i + 1)] = scope1->createPackH(scope1->getInteger(2), bitpairs[i]);
     
    379382    cc::Parabix_CC_Compiler ccc(scope1, basis);
    380383    RE_Compiler re_compiler(scope1, ccc);
    381     PabloAST * const matches = re_compiler.compile(mSuffixRE, prefixMatches);
     384    scope1->createAssign(final_matches, re_compiler.compile(mSuffixRE, prefixMatches));
    382385    Var * const output = getOutputStreamVar("matches");
    383     pb.createAssign(pb.createExtract(output, pb.getInteger(0)), matches);
    384 }
    385 
    386 
     386    pb.createAssign(pb.createExtract(output, pb.getInteger(0)), final_matches);
     387}
    387388
    388389
  • icGREP/icgrep-devel/icgrep/re/re_analysis.cpp

    r5902 r5908  
    482482}
    483483
     484bool hasTriCCwithinLimit(RE * r, unsigned byteCClimit, RE * & prefixRE, RE * & suffixRE) {
     485    if (Seq * seq = dyn_cast<Seq>(r)) {
     486        if (seq->size() < 4) return false;
     487        prefixRE = makeSeq(seq->begin(), seq->begin()+3);
     488        if (byteTestsWithinLimit(prefixRE, byteCClimit)) {
     489            suffixRE = makeSeq(seq->begin()+3, seq->end());
     490            return true;
     491        }
     492        return false;
     493    }
     494    return false;
     495}
    484496
    485497void UndefinedNameError(const Name * n) {
  • icGREP/icgrep-devel/icgrep/re/re_analysis.h

    r5902 r5908  
    3232bool byteTestsWithinLimit(RE * re, unsigned limit);
    3333   
     34bool hasTriCCwithinLimit(RE * r, unsigned byteCClimit, RE * & prefixRE, RE * & suffixRE);
     35
     36   
    3437void UndefinedNameError (const Name * n);
    3538}
  • icGREP/icgrep-devel/icgrep/re/re_compiler.cpp

    r5903 r5908  
    290290        }
    291291    }
    292     UnsupportedRE("Unsupported lookahead assertion.");
     292    UnsupportedRE("Unsupported lookahead assertion:" + Printer_RE::PrintRE(a));
    293293}
    294294
  • icGREP/icgrep-devel/icgrep/re/to_utf8.cpp

    r5802 r5908  
    88#include <UCD/unicode_set.h>
    99#include <UCD/UTF.h>
     10#include <cc/alphabet.h>
    1011#include <re/re_name.h>
    1112#include <re/re_start.h>
     
    6667        return r;
    6768    } else if (const CC * cc = dyn_cast<CC>(r)) {
     69        if (cc->getAlphabet() != &cc::Unicode) return r;
    6870        std::vector<RE *> alt;
    6971        for (const interval_t & i : *cc) {
Note: See TracChangeset for help on using the changeset viewer.