Ignore:
Timestamp:
Mar 12, 2018, 7:22:06 AM (18 months ago)
Author:
cameron
Message:

Initial deployment of bytegrep kernel in icgrep

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/grep/grep_engine.cpp

    r5900 r5902  
    2828#include <re/casing.h>
    2929#include <re/exclude_CC.h>
     30#include <re/to_utf8.h>
    3031#include <re/re_toolchain.h>
    3132#include <toolchain/toolchain.h>
     33#include <re/re_analysis.h>
    3234#include <re/re_name_resolve.h>
    3335#include <re/re_name_gather.h>
     
    237239    }
    238240   
    239     StreamSetBuffer * BasisBits = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(encodingBits, 1), baseBufferSize);
    240     kernel::Kernel * s2pk = nullptr;
    241     if (PabloTransposition) {
    242         s2pk = mGrepDriver->addKernelInstance<kernel::S2P_PabloKernel>(idb);
    243     }
    244     else {
    245         s2pk = mGrepDriver->addKernelInstance<kernel::S2PKernel>(idb);
    246     }
    247     mGrepDriver->makeKernelCall(s2pk, {ByteStream}, {BasisBits});
    248 
    249241    StreamSetBuffer * LineBreakStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
    250     StreamSetBuffer * RequiredStreams = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
    251     StreamSetBuffer * UnicodeLB = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
    252 
    253     StreamSetBuffer * LineFeedStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
    254     kernel::Kernel * linefeedK = mGrepDriver->addKernelInstance<kernel::LineFeedKernelBuilder>(idb, Binding{idb->getStreamSetTy(8), "basis", FixedRate(), Principal()});
    255     mGrepDriver->makeKernelCall(linefeedK, {BasisBits}, {LineFeedStream});
    256    
    257     kernel::Kernel * requiredStreamsK = mGrepDriver->addKernelInstance<kernel::RequiredStreams_UTF8>(idb);
    258     mGrepDriver->makeKernelCall(requiredStreamsK, {BasisBits, LineFeedStream}, {RequiredStreams, UnicodeLB});
    259 
    260     if (mGrepRecordBreak == GrepRecordBreakKind::LF) {
    261         LineBreakStream = LineFeedStream;
    262     } else if (mGrepRecordBreak == GrepRecordBreakKind::Null) {
    263         kernel::Kernel * breakK = mGrepDriver->addKernelInstance<kernel::ParabixCharacterClassKernelBuilder>(idb, "Null", std::vector<re::CC *>{breakCC}, 8);
    264         mGrepDriver->makeKernelCall(breakK, {BasisBits}, {LineBreakStream});
     242    std::vector<StreamSetBuffer *> MatchResultsBufs(nREs);
     243   
     244    // For simple regular expressions with a small number of characters, we
     245    // can bypass transposition and use the Direct CC compiler.
     246    if ((nREs == 1) && (mGrepRecordBreak != GrepRecordBreakKind::Unicode) && (!anyGCB) && byteTestsWithinLimit(REs[0], 6)) {
     247        StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
     248        kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ByteGrepKernel>(idb, REs[0]);
     249        mGrepDriver->makeKernelCall(icgrepK, {ByteStream}, {MatchResults});
     250        MatchResultsBufs[0] = MatchResults;
     251        kernel::Kernel * breakK = mGrepDriver->addKernelInstance<kernel::DirectCharacterClassKernelBuilder>(idb, "Null", std::vector<re::CC *>{breakCC}, 1);
     252        mGrepDriver->makeKernelCall(breakK, {ByteStream}, {LineBreakStream});
    265253    } else {
    266         LineBreakStream = UnicodeLB;
    267     }
    268    
    269     std::map<std::string, StreamSetBuffer *> propertyStream;
    270     if (PropertyKernels) {
    271         for (auto p : UnicodeProperties) {
    272             auto name = p->getFullName();
    273             StreamSetBuffer * s = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
    274             propertyStream.emplace(std::make_pair(name, s));
    275             kernel::Kernel * propertyK = mGrepDriver->addKernelInstance<kernel::UnicodePropertyKernelBuilder>(idb, p);
    276             mGrepDriver->makeKernelCall(propertyK, {BasisBits}, {s});
    277         }
    278     }
    279     StreamSetBuffer * GCB_stream = nullptr;
    280     if (anyGCB) {
    281         GCB_stream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
    282         kernel::Kernel * gcbK = mGrepDriver->addKernelInstance<kernel::GraphemeClusterBreakKernel>(idb);
    283         mGrepDriver->makeKernelCall(gcbK, {BasisBits, RequiredStreams}, {GCB_stream});
    284     }
    285 
    286     std::vector<StreamSetBuffer *> MatchResultsBufs(nREs);
    287     for(unsigned i = 0; i < nREs; ++i) {
    288         std::vector<std::string> externalStreamNames = std::vector<std::string>{"UTF8_LB", "UTF8_nonfinal"};
    289         std::vector<StreamSetBuffer *> icgrepInputSets = {BasisBits, LineBreakStream, RequiredStreams};
    290         std::set<re::Name *> UnicodeProperties;
     254        StreamSetBuffer * BasisBits = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(encodingBits, 1), baseBufferSize);
     255        kernel::Kernel * s2pk = nullptr;
     256        if (PabloTransposition) {
     257            s2pk = mGrepDriver->addKernelInstance<kernel::S2P_PabloKernel>(idb);
     258        }
     259        else {
     260            s2pk = mGrepDriver->addKernelInstance<kernel::S2PKernel>(idb);
     261        }
     262        mGrepDriver->makeKernelCall(s2pk, {ByteStream}, {BasisBits});
     263
     264        StreamSetBuffer * RequiredStreams = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
     265        StreamSetBuffer * UnicodeLB = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
     266
     267        StreamSetBuffer * LineFeedStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
     268        kernel::Kernel * linefeedK = mGrepDriver->addKernelInstance<kernel::LineFeedKernelBuilder>(idb, Binding{idb->getStreamSetTy(8), "basis", FixedRate(), Principal()});
     269        mGrepDriver->makeKernelCall(linefeedK, {BasisBits}, {LineFeedStream});
     270       
     271        kernel::Kernel * requiredStreamsK = mGrepDriver->addKernelInstance<kernel::RequiredStreams_UTF8>(idb);
     272        mGrepDriver->makeKernelCall(requiredStreamsK, {BasisBits, LineFeedStream}, {RequiredStreams, UnicodeLB});
     273
     274        if (mGrepRecordBreak == GrepRecordBreakKind::LF) {
     275            LineBreakStream = LineFeedStream;
     276        } else if (mGrepRecordBreak == GrepRecordBreakKind::Null) {
     277            kernel::Kernel * breakK = mGrepDriver->addKernelInstance<kernel::ParabixCharacterClassKernelBuilder>(idb, "Null", std::vector<re::CC *>{breakCC}, 8);
     278            mGrepDriver->makeKernelCall(breakK, {BasisBits}, {LineBreakStream});
     279        } else {
     280            LineBreakStream = UnicodeLB;
     281        }
     282       
     283        std::map<std::string, StreamSetBuffer *> propertyStream;
    291284        if (PropertyKernels) {
    292             re::gatherUnicodeProperties(REs[i], UnicodeProperties);
    293285            for (auto p : UnicodeProperties) {
    294286                auto name = p->getFullName();
    295                 auto f = propertyStream.find(name);
    296                 if (f == propertyStream.end()) report_fatal_error(name + " not found\n");
    297                 externalStreamNames.push_back(name);
    298                 icgrepInputSets.push_back(f->second);
    299             }
    300         }
    301         if (hasGCB[i]) {
    302             externalStreamNames.push_back("\\b{g}");
    303             icgrepInputSets.push_back(GCB_stream);
    304         }
    305         if (CC_Multiplexing) {
    306             const auto UnicodeSets = re::collectUnicodeSets(REs[i], std::set<re::Name *>({re::makeZeroWidth("\\b{g}")}));
    307             StreamSetBuffer * const MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
    308             if (UnicodeSets.size() <= 1) {
     287                StreamSetBuffer * s = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
     288                propertyStream.emplace(std::make_pair(name, s));
     289                kernel::Kernel * propertyK = mGrepDriver->addKernelInstance<kernel::UnicodePropertyKernelBuilder>(idb, p);
     290                mGrepDriver->makeKernelCall(propertyK, {BasisBits}, {s});
     291            }
     292        }
     293        StreamSetBuffer * GCB_stream = nullptr;
     294        if (anyGCB) {
     295            GCB_stream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
     296            kernel::Kernel * gcbK = mGrepDriver->addKernelInstance<kernel::GraphemeClusterBreakKernel>(idb);
     297            mGrepDriver->makeKernelCall(gcbK, {BasisBits, RequiredStreams}, {GCB_stream});
     298        }
     299
     300        for(unsigned i = 0; i < nREs; ++i) {
     301            std::vector<std::string> externalStreamNames;
     302            std::vector<StreamSetBuffer *> icgrepInputSets = {BasisBits};
     303            if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
     304                externalStreamNames.push_back("UTF8_LB");
     305                icgrepInputSets.push_back(LineBreakStream);
     306                externalStreamNames.push_back("UTF8_nonfinal");
     307                icgrepInputSets.push_back(RequiredStreams);
     308            }
     309            std::set<re::Name *> UnicodeProperties;
     310            if (PropertyKernels) {
     311                re::gatherUnicodeProperties(REs[i], UnicodeProperties);
     312                for (auto p : UnicodeProperties) {
     313                    auto name = p->getFullName();
     314                    auto f = propertyStream.find(name);
     315                    if (f == propertyStream.end()) report_fatal_error(name + " not found\n");
     316                    externalStreamNames.push_back(name);
     317                    icgrepInputSets.push_back(f->second);
     318                }
     319            }
     320            if (hasGCB[i]) {
     321                externalStreamNames.push_back("\\b{g}");
     322                icgrepInputSets.push_back(GCB_stream);
     323            }
     324            if (CC_Multiplexing) {
     325                const auto UnicodeSets = re::collectUnicodeSets(REs[i], std::set<re::Name *>({re::makeZeroWidth("\\b{g}")}));
     326                StreamSetBuffer * const MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
     327                if (UnicodeSets.size() <= 1) {
     328                    kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i], externalStreamNames);
     329                    mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
     330                    MatchResultsBufs[i] = MatchResults;
     331                } else {
     332                    mpx = make_unique<MultiplexedAlphabet>("mpx", UnicodeSets);
     333                    REs[i] = transformCCs(mpx.get(), REs[i]);
     334                    std::vector<re::CC *> mpx_basis = mpx->getMultiplexedCCs();
     335                    auto numOfCharacterClasses = mpx_basis.size();
     336                    StreamSetBuffer * CharClasses = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), baseBufferSize);
     337                    kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis));
     338                    mGrepDriver->makeKernelCall(ccK, {BasisBits}, {CharClasses});
     339    //                kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis), true);
     340    //                mGrepDriver->makeKernelCall(ccK, {ByteStream}, {CharClasses});
     341                    kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i], externalStreamNames, std::vector<cc::Alphabet *>{mpx.get()});
     342                    icgrepInputSets.push_back(CharClasses);
     343                    mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
     344                    MatchResultsBufs[i] = MatchResults;
     345                }
     346            } else {
     347                StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
    309348                kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i], externalStreamNames);
    310349                mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
    311350                MatchResultsBufs[i] = MatchResults;
    312             } else {
    313                 mpx = make_unique<MultiplexedAlphabet>("mpx", UnicodeSets);
    314                 REs[i] = transformCCs(mpx.get(), REs[i]);
    315                 std::vector<re::CC *> mpx_basis = mpx->getMultiplexedCCs();
    316                 auto numOfCharacterClasses = mpx_basis.size();
    317                 StreamSetBuffer * CharClasses = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), baseBufferSize);
    318                 kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis));
    319                 mGrepDriver->makeKernelCall(ccK, {BasisBits}, {CharClasses});
    320 //                kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis), true);
    321 //                mGrepDriver->makeKernelCall(ccK, {ByteStream}, {CharClasses});
    322                 kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i], externalStreamNames, std::vector<cc::Alphabet *>{mpx.get()});
    323                 icgrepInputSets.push_back(CharClasses);
    324                 mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
    325                 MatchResultsBufs[i] = MatchResults;
    326             }
    327         } else {
    328             StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
    329             kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i], externalStreamNames);
    330             mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
    331             MatchResultsBufs[i] = MatchResults;
     351            }
    332352        }
    333353    }
Note: See TracChangeset for help on using the changeset viewer.