Ignore:
Timestamp:
Jan 27, 2019, 2:54:36 PM (4 months ago)
Author:
cameron
Message:

Merge branch 'master' of https://cs-git-research.cs.surrey.sfu.ca/cameron/parabix-devel

Location:
icGREP/icgrep-devel/icgrep/grep
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/grep/grep_engine.cpp

    r6294 r6297  
    206206// All engines share a common pipeline to compute a stream of Matches from a given input Bytestream.
    207207
    208 std::pair<StreamSet *, StreamSet *> GrepEngine::grepPipeline(const std::unique_ptr<ProgramBuilder> & P, StreamSet *SourceStream) {
     208std::pair<StreamSet *, StreamSet *> GrepEngine::grepPipeline(const std::unique_ptr<ProgramBuilder> & P, StreamSet * InputStream) {
    209209
    210210    Scalar * const callbackObject = P->getInputScalar("callbackObject");
     
    214214    StreamSet * ByteStream = nullptr;
    215215    if (mBinaryFilesMode == argv::Text) {
    216         ByteStream = SourceStream;
     216        ByteStream = InputStream;
    217217    } else if (mBinaryFilesMode == argv::WithoutMatch) {
    218218        ByteStream = P->CreateStreamSet(1, 8);
    219         Kernel * binaryCheckK = P->CreateKernelCall<AbortOnNull>(SourceStream, ByteStream, callbackObject);
     219        Kernel * binaryCheckK = P->CreateKernelCall<AbortOnNull>(InputStream, ByteStream, callbackObject);
    220220        mGrepDriver.LinkFunction(binaryCheckK, "signal_dispatcher", kernel::signal_dispatcher);
    221221    } else {
     
    235235    }
    236236
    237 
    238237    StreamSet * LineBreakStream = P->CreateStreamSet();
    239238    std::vector<StreamSet *> MatchResultsBufs(numOfREs);
     
    245244    // can bypass transposition and use the Direct CC compiler.
    246245    const auto isSimple = (numOfREs == 1) && (mGrepRecordBreak != GrepRecordBreakKind::Unicode) && (!anyGCB);
    247     if (isSimple) {
    248         mREs[0] = toUTF8(mREs[0]);
    249     }
    250 
    251     bool requiresComplexTest = true;
     246    const auto isWithinByteTestLimit = byteTestsWithinLimit(mREs[0], ByteCClimit);
     247    const auto hasTriCC = hasTriCCwithinLimit(mREs[0], ByteCClimit, prefixRE, suffixRE);
     248    const auto internalS2P = isSimple && (isWithinByteTestLimit || hasTriCC);
     249   
    252250    Component internalComponents = Component::NoComponents;
    253 
    254 
    255 
    256     if (isSimple) {
    257         if (hasComponent(mRequiredComponents, Component::MoveMatchesToEOL)) {
     251    if (internalS2P && hasComponent(mRequiredComponents, Component::MoveMatchesToEOL)) {
     252        setComponent(internalComponents, Component::MoveMatchesToEOL);
     253    }
     254
     255    StreamSet * SourceStream = ByteStream;
     256    StreamSet * const RequiredStreams = P->CreateStreamSet();
     257    StreamSet * UnicodeLB = nullptr;
     258    std::map<std::string, StreamSet *> propertyStream;
     259    StreamSet * GCB_stream = nullptr;
     260   
     261    if (!internalS2P) {
     262        StreamSet * BasisBits = P->CreateStreamSet(ENCODING_BITS, 1);
     263        if (PabloTransposition) {
     264            P->CreateKernelCall<S2P_PabloKernel>(SourceStream, BasisBits);
     265        } else {
     266            //P->CreateKernelCall<S2PKernel>(ByteStream, BasisBits);
     267            Kernel * s2pK = P->CreateKernelCall<S2PKernel>(SourceStream, BasisBits, cc::BitNumbering::LittleEndian, callbackObject);
     268            mGrepDriver.LinkFunction(s2pK, "signal_dispatcher", kernel::signal_dispatcher);
     269        }
     270        SourceStream = BasisBits;
     271    }
     272
     273    if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
     274        UnicodeLB = P->CreateStreamSet();
     275        StreamSet * const LineFeedStream = P->CreateStreamSet();
     276        P->CreateKernelCall<LineFeedKernelBuilder>(SourceStream, LineFeedStream);
     277        P->CreateKernelCall<RequiredStreams_UTF8>(SourceStream, LineFeedStream, RequiredStreams, UnicodeLB);
     278        LineBreakStream = UnicodeLB;
     279    }
     280    else if (!internalS2P) {
     281        P->CreateKernelCall<UTF8_nonFinal>(SourceStream, RequiredStreams);
     282        if (mGrepRecordBreak == GrepRecordBreakKind::LF) {
     283            P->CreateKernelCall<LineFeedKernelBuilder>(SourceStream, LineBreakStream);
     284        } else { // if (mGrepRecordBreak == GrepRecordBreakKind::Null) {
     285            P->CreateKernelCall<CharacterClassKernelBuilder>( "Null", std::vector<re::CC *>{mBreakCC}, SourceStream, LineBreakStream);
     286        }
     287    }
     288
     289    if (PropertyKernels) {
     290        for (auto p : mUnicodeProperties) {
     291            auto name = p->getFullName();
     292            StreamSet * property = P->CreateStreamSet(1, 1);
     293            propertyStream.emplace(name, property);
     294            P->CreateKernelCall<UnicodePropertyKernelBuilder>(p, SourceStream, property);
     295        }
     296    }
     297
     298    if (hasComponent(mRequiredComponents, Component::GraphemeClusterBoundary)) {
     299        GCB_stream = P->CreateStreamSet();
     300        P->CreateKernelCall<GraphemeClusterBreakKernel>(SourceStream, RequiredStreams, GCB_stream);
     301    }
     302
     303    for(unsigned i = 0; i < numOfREs; ++i) {
     304        StreamSet * const MatchResults = P->CreateStreamSet(1, 1);
     305        MatchResultsBufs[0] = MatchResults;
     306        std::unique_ptr<GrepKernelOptions> options = make_unique<GrepKernelOptions>();
     307        options->setIndexingAlphabet(&cc::UTF8);
     308        options->setSource(SourceStream);
     309        options->setResults(MatchResults);
     310        if (hasComponent(internalComponents, Component::MoveMatchesToEOL)) {
    258311            re::RE * notBreak = re::makeDiff(re::makeByte(0x00, 0xFF), mBreakCC);
    259             mREs[0] = re::makeSeq({mREs[0], re::makeRep(notBreak, 0, re::Rep::UNBOUNDED_REP), makeNegativeLookAheadAssertion(notBreak)});
    260             setComponent(internalComponents, Component::MoveMatchesToEOL);
    261         }
    262         std::unique_ptr<GrepKernelOptions> options = make_unique<GrepKernelOptions>();
    263         const auto isWithinByteTestLimit = byteTestsWithinLimit(mREs[0], ByteCClimit);
    264         const auto hasTriCC = hasTriCCwithinLimit(mREs[0], ByteCClimit, prefixRE, suffixRE);
    265         if (isWithinByteTestLimit || hasTriCC) {
    266             if (MultithreadedSimpleRE && hasTriCC) {
    267                 auto CCs = re::collectCCs(prefixRE, cc::Byte);
    268                 for (auto cc : CCs) {
    269                     auto ccName = makeName(cc);
    270                     mREs[0] = re::replaceCC(mREs[0], cc, ccName);
    271                     auto ccNameStr = ccName->getFullName();
    272                     StreamSet * const ccStream = P->CreateStreamSet(1, 1);
    273                     P->CreateKernelCall<CharacterClassKernelBuilder>(ccNameStr, std::vector<re::CC *>{cc}, ByteStream, ccStream);
    274                     options->addExternal(ccNameStr, ccStream);
     312            if (isWithinByteTestLimit) {
     313                mREs[i] = re::makeSeq({mREs[i], re::makeRep(notBreak, 0, re::Rep::UNBOUNDED_REP), makeNegativeLookAheadAssertion(notBreak)});
     314            } else {
     315                suffixRE = re::makeSeq({suffixRE, re::makeRep(notBreak, 0, re::Rep::UNBOUNDED_REP), makeNegativeLookAheadAssertion(notBreak)});
     316            }
     317        }
     318        options->setRE(mREs[i]);
     319        if (internalS2P) {
     320            if (!isWithinByteTestLimit) {
     321                if (MultithreadedSimpleRE) {
     322                    auto CCs = re::collectCCs(prefixRE, cc::Byte);
     323                    for (auto cc : CCs) {
     324                        auto ccName = makeName(cc);
     325                        prefixRE = re::replaceCC(prefixRE, cc, ccName);
     326                        suffixRE = re::replaceCC(suffixRE, cc, ccName);
     327                        auto ccNameStr = ccName->getFullName();
     328                        StreamSet * const ccStream = P->CreateStreamSet(1, 1);
     329                        P->CreateKernelCall<CharacterClassKernelBuilder>(ccNameStr, std::vector<re::CC *>{cc}, SourceStream, ccStream);
     330                        options->addExternal(ccNameStr, ccStream);
     331                    }
    275332                }
    276             }
    277             StreamSet * const MatchResults = P->CreateStreamSet(1, 1);
    278             MatchResultsBufs[0] = MatchResults;
    279             if (isWithinByteTestLimit) {
    280                 options->setRE(mREs[0]);
    281                 options->setSource(ByteStream);
    282                 options->setResults(MatchResults);
    283                 P->CreateKernelCall<ICGrepKernel>(std::move(options));
    284             } else {
    285                 //P->CreateKernelCall<ByteBitGrepKernel>(prefixRE, suffixRE, ByteStream, MatchResults, externals);
    286333                options->setPrefixRE(prefixRE);
    287334                options->setRE(suffixRE);
    288                 options->setSource(ByteStream);
    289                 options->setResults(MatchResults);
    290                 P->CreateKernelCall<ICGrepKernel>(std::move(options));
    291             }
    292             Kernel * LB_nullK = P->CreateKernelCall<CharacterClassKernelBuilder>( "breakCC", std::vector<re::CC *>{mBreakCC}, ByteStream, LineBreakStream, callbackObject);
     335            }
     336            Kernel * LB_nullK = P->CreateKernelCall<CharacterClassKernelBuilder>( "breakCC", std::vector<re::CC *>{mBreakCC}, SourceStream, LineBreakStream, callbackObject);
    293337            mGrepDriver.LinkFunction(LB_nullK, "signal_dispatcher", kernel::signal_dispatcher);
    294             requiresComplexTest = false;
    295         }
    296     }
    297 
    298     if (requiresComplexTest) {
    299 
    300         StreamSet * const BasisBits = P->CreateStreamSet(ENCODING_BITS, 1);
    301         if (PabloTransposition) {
    302             P->CreateKernelCall<S2P_PabloKernel>(ByteStream, BasisBits);
    303338        } else {
    304             //P->CreateKernelCall<S2PKernel>(ByteStream, BasisBits);
    305             Kernel * s2pK = P->CreateKernelCall<S2PKernel>(ByteStream, BasisBits, cc::BitNumbering::LittleEndian, callbackObject);
    306             mGrepDriver.LinkFunction(s2pK, "signal_dispatcher", kernel::signal_dispatcher);
    307         }
    308 
    309         StreamSet * const RequiredStreams = P->CreateStreamSet();
    310         StreamSet * UnicodeLB = nullptr;
    311 
    312         if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
    313             UnicodeLB = P->CreateStreamSet();
    314             StreamSet * const LineFeedStream = P->CreateStreamSet();
    315             P->CreateKernelCall<LineFeedKernelBuilder>(BasisBits, LineFeedStream);
    316             P->CreateKernelCall<RequiredStreams_UTF8>(BasisBits, LineFeedStream, RequiredStreams, UnicodeLB);
    317             LineBreakStream = UnicodeLB;
    318         } else {
    319             P->CreateKernelCall<UTF8_nonFinal>(BasisBits, RequiredStreams);
    320             if (mGrepRecordBreak == GrepRecordBreakKind::LF) {
    321                 P->CreateKernelCall<LineFeedKernelBuilder>(BasisBits, LineBreakStream);
    322             } else { // if (mGrepRecordBreak == GrepRecordBreakKind::Null) {
    323                 P->CreateKernelCall<CharacterClassKernelBuilder>( "Null", std::vector<re::CC *>{mBreakCC}, BasisBits, LineBreakStream);
    324             }
    325         }
    326 
    327         std::map<std::string, StreamSet *> propertyStream;
    328         if (PropertyKernels) {
    329             for (auto p : mUnicodeProperties) {
    330                 auto name = p->getFullName();
    331                 StreamSet * property = P->CreateStreamSet(1, 1);
    332                 propertyStream.emplace(name, property);
    333                 P->CreateKernelCall<UnicodePropertyKernelBuilder>(p, BasisBits, property);
    334             }
    335         }
    336 
    337         StreamSet * GCB_stream = nullptr;
    338         if (hasComponent(mRequiredComponents, Component::GraphemeClusterBoundary)) {
    339             GCB_stream = P->CreateStreamSet();
    340             P->CreateKernelCall<GraphemeClusterBreakKernel>(BasisBits, RequiredStreams, GCB_stream);
    341         }
    342 
    343         for(unsigned i = 0; i < numOfREs; ++i) {
    344             std::unique_ptr<GrepKernelOptions> options = make_unique<GrepKernelOptions>();
    345339            options->addExternal("UTF8_nonfinal", RequiredStreams);
    346340            if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
     
    362356                options->addExternal("\\b{g}", GCB_stream);
    363357            }
    364 
    365             StreamSet * const MatchResults = P->CreateStreamSet(1, 1);
    366             MatchResultsBufs[i] = MatchResults;
    367 
    368358            if (CC_Multiplexing) {
    369359                const auto UnicodeSets = re::collectCCs(mREs[i], cc::Unicode, std::set<re::Name *>{re::makeZeroWidth("\\b{g}")});
    370360                if (UnicodeSets.size() <= 1) {
    371361                    options->setRE(mREs[i]);
    372                     options->setSource(BasisBits);
    373                     options->setResults(MatchResults);
    374362                } else {
    375363                    auto mpx = std::make_shared<MultiplexedAlphabet>("mpx", UnicodeSets);
    376364                    mREs[i] = transformCCs(mpx, mREs[i]);
     365                    options->setRE(mREs[i]);
    377366                    auto mpx_basis = mpx->getMultiplexedCCs();
    378367                    StreamSet * const CharClasses = P->CreateStreamSet(mpx_basis.size());
    379                     P->CreateKernelCall<CharClassesKernel>(std::move(mpx_basis), BasisBits, CharClasses);
    380 
    381                     #warning TODO: multiplexed CCs ought to generate unique names. Make the name also dependent on alphabet.
    382                     // Multiplexing Grep Kernel is not cacheable, since for now it uses the string representation of the RE AST as the cache key,
    383                     // while it is possible that two multiplexed REs with the same name "mpx_1" have different alphabets.
    384                     options->setRE(mREs[i]);
    385                     options->setSource(BasisBits);
    386                     options->setResults(MatchResults);
     368                    P->CreateKernelCall<CharClassesKernel>(std::move(mpx_basis), SourceStream, CharClasses);
    387369                    options->addAlphabet(mpx, CharClasses);
    388                     P->CreateKernelCall<ICGrepKernel>(std::move(options));
    389370                }
    390             } else {
    391                 options->setRE(mREs[i]);
    392                 options->setSource(BasisBits);
    393                 options->setResults(MatchResults);
    394                 P->CreateKernelCall<ICGrepKernel>(std::move(options));
    395             }
    396         }
    397 
    398     } // end of requiresComplexTest
     371            }
     372        }
     373        P->CreateKernelCall<ICGrepKernel>(std::move(options));
     374    }
    399375
    400376    StreamSet * Matches = MatchResultsBufs[0];
  • icGREP/icgrep-devel/icgrep/grep/grep_kernel.cpp

    r6294 r6297  
    313313
    314314void GrepKernelOptions::setNumbering(cc::BitNumbering numbering) {mBasisSetNumbering = numbering;}
    315 void GrepKernelOptions::setIndexingAlphabet(cc::Alphabet * a) {mIndexingAlphabet = a;}
     315void GrepKernelOptions::setIndexingAlphabet(const cc::Alphabet * a) {mIndexingAlphabet = a;}
    316316void GrepKernelOptions::setRE(RE * e) {mRE = e;}
    317317void GrepKernelOptions::setPrefixRE(RE * e) {mPrefixRE = e;}
     
    372372    options->streamSetOutputBindings(),
    373373    options->scalarInputBindings(),
    374               options->scalarOutputBindings()), mOptions(std::move(options)) {
     374    options->scalarOutputBindings()), mOptions(std::move(options)) {
    375375}
    376376
     
    389389    }
    390390    //cc::Parabix_CC_Compiler ccc(getEntryScope(), getInputStreamSet("basis"), mOptions->mBasisSetNumbering);
    391     RE_Compiler re_compiler(getEntryScope(), *ccc.get(), mOptions->mBasisSetNumbering);
     391    RE_Compiler re_compiler(getEntryScope(), *ccc.get(), *(mOptions->mIndexingAlphabet), mOptions->mBasisSetNumbering);
    392392    for (const auto & e : mOptions->mExternals) {
    393393        re_compiler.addPrecompiled(e.first, pb.createExtract(getInputStreamVar(e.first), pb.getInteger(0)));
     
    422422
    423423        cc::Parabix_CC_Compiler ccc(scope1, basis);
    424         RE_Compiler re_compiler(scope1, ccc);
     424        RE_Compiler re_compiler(scope1, ccc, *(mOptions->mIndexingAlphabet), mOptions->mBasisSetNumbering);
    425425        scope1->createAssign(final_matches, re_compiler.compile(mOptions->mRE, prefixMatches));
    426426        Var * const output = getOutputStreamVar("matches");
  • icGREP/icgrep-devel/icgrep/grep/grep_kernel.h

    r6294 r6297  
    6262        mPrefixRE(nullptr) {}
    6363    void setNumbering(cc::BitNumbering numbering);
    64     void setIndexingAlphabet(cc::Alphabet * a);
     64    void setIndexingAlphabet(const cc::Alphabet * a);
    6565    void setSource(StreamSet * s);
    6666    void setResults(StreamSet * r);
Note: See TracChangeset for help on using the changeset viewer.