Changeset 6297


Ignore:
Timestamp:
Jan 27, 2019, 2:54:36 PM (8 weeks ago)
Author:
cameron
Message:

Merge branch 'master' of https://cs-git-research.cs.surrey.sfu.ca/cameron/parabix-devel

Location:
icGREP/icgrep-devel
Files:
269 added
16 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/.gitignore

    r6293 r6297  
    11# Don't track build directories
    2 icgrep-build/
    3 libllvm
    4 llvm-build/
    5 debug-build/
     2icgrep*build/
     3libllv*
     4llvm*build/
     5debug*build/
    66# Don't track LLVM source
    77llvm-[0-9].[0-9].[0-9].src/
  • icGREP/icgrep-devel/QA/.gitignore

    r6281 r6297  
    1 abc/
     1abc/TestOutput
    22editd/OutputFiles/
    33u32u8output/
  • icGREP/icgrep-devel/QA/greptest.xml

    r6176 r6297  
    503503<grepcase regexp="=0123[0-9]{496,996};" datafile="bounded_charclass" grepcount="1"/>
    504504<grepcase regexp="=([a-f].{0,2})+;" datafile="bounded_charclass" grepcount="6"/>
     505<grepcase regexp="=[acegikmoq](..)*;" datafile="bounded_charclass" grepcount="9"/>
    505506
    506507
  • icGREP/icgrep-devel/icgrep/CMakeLists.txt

    r6294 r6297  
    131131target_link_libraries (FileSelect GrepEngine)
    132132
     133#add_executable(jitdemo2 jitdemo2.cpp)
    133134add_executable(icgrep icgrep.cpp grep_interface.cpp)
    134135add_executable(u8u16 u8u16.cpp kernels/zeroextend.cpp)
     
    143144add_executable(cachejanitord ${OBJECT_CACHE_DAEMON_SRC})
    144145
     146#target_link_libraries (jitdemo2 ${REQ_LLVM_LIBRARIES})
    145147target_link_libraries (icgrep GrepEngine UCDlib PabloADT RegExpCompiler CCADT CodeGen FileSelect ${REQ_LLVM_LIBRARIES} ${Boost_LIBRARIES} ${CUDA_LIB})
    146148target_link_libraries (u8u16 PabloADT CCADT CodeGen ${REQ_LLVM_LIBRARIES} ${Boost_LIBRARIES} ${CUDA_LIB})
     
    251253# CreateAssert from being able to provide the compilation call stack for each JIT'ed assertion error.
    252254
    253 CHECK_CXX_COMPILER_FLAG("-no-pie" COMPILER_SUPPORTS_NO_PIE)
    254 IF (COMPILER_SUPPORTS_NO_PIE)
    255   set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -no-pie")
     255IF (${CMAKE_SYSTEM_NAME} MATCHES "Linux")
     256  CHECK_CXX_COMPILER_FLAG("-no-pie" COMPILER_SUPPORTS_NO_PIE)
     257  IF (COMPILER_SUPPORTS_NO_PIE)
     258    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -no-pie")
     259  ENDIF()
    256260ENDIF()
    257261
     
    360364add_custom_target (check
    361365  COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure
    362   DEPENDS icgrep u8u16 u32u8 base64 editd abc_gen idisa_test)
     366  DEPENDS icgrep u8u16 u32u8 base64 editd idisa_test)
    363367
    364368add_custom_target (perf_icgrep
  • icGREP/icgrep-devel/icgrep/cc/alphabet.cpp

    r6184 r6297  
    5050const CodeUnitAlphabet Byte("Byte", 8);
    5151   
     52const CodeUnitAlphabet UTF8("UTF8", 8);
     53
    5254const CodeUnitAlphabet UTF16("UTF16", 16);
    5355   
  • icGREP/icgrep-devel/icgrep/cc/alphabet.h

    r6184 r6297  
    9898const extern CodeUnitAlphabet Byte; // Byte("Byte", 8);
    9999   
     100const extern CodeUnitAlphabet UTF8; // UTF8("UTF8", 8);
     101
    100102const extern CodeUnitAlphabet UTF16; // UTF16("UTF16", 16);
    101103   
  • icGREP/icgrep-devel/icgrep/cc/cc_compiler.cpp

    r6133 r6297  
    7575                codepoint_t lo = lo_codepoint(cc->front());
    7676                codepoint_t hi = lo_codepoint(cc->back());
     77                PabloAST * even_odd = getBasisVar(0, pb);
     78                if ((lo & 1) == 0) {
     79                    even_odd = pb.createNot(even_odd);
     80                }
    7781                lo &= (mEncodingMask - 1);
    7882                hi |= (mEncodingMask ^ (mEncodingMask - 1));
    7983                PabloAST * expr = make_range(lo, hi, pb);
    80                 PabloAST * bit0 = getBasisVar(0, pb);
    81                 if ((lo & 1) == 0) {
    82                     bit0 = pb.createNot(bit0);
    83                 }
    84                 return pb.createAnd(expr, bit0);
     84                return pb.createAnd(expr, even_odd);
    8585            }
    8686        }
  • icGREP/icgrep-devel/icgrep/grep/grep_engine.cpp

    r6294 r6297  
    206206// All engines share a common pipeline to compute a stream of Matches from a given input Bytestream.
    207207
    208 std::pair<StreamSet *, StreamSet *> GrepEngine::grepPipeline(const std::unique_ptr<ProgramBuilder> & P, StreamSet *SourceStream) {
     208std::pair<StreamSet *, StreamSet *> GrepEngine::grepPipeline(const std::unique_ptr<ProgramBuilder> & P, StreamSet * InputStream) {
    209209
    210210    Scalar * const callbackObject = P->getInputScalar("callbackObject");
     
    214214    StreamSet * ByteStream = nullptr;
    215215    if (mBinaryFilesMode == argv::Text) {
    216         ByteStream = SourceStream;
     216        ByteStream = InputStream;
    217217    } else if (mBinaryFilesMode == argv::WithoutMatch) {
    218218        ByteStream = P->CreateStreamSet(1, 8);
    219         Kernel * binaryCheckK = P->CreateKernelCall<AbortOnNull>(SourceStream, ByteStream, callbackObject);
     219        Kernel * binaryCheckK = P->CreateKernelCall<AbortOnNull>(InputStream, ByteStream, callbackObject);
    220220        mGrepDriver.LinkFunction(binaryCheckK, "signal_dispatcher", kernel::signal_dispatcher);
    221221    } else {
     
    235235    }
    236236
    237 
    238237    StreamSet * LineBreakStream = P->CreateStreamSet();
    239238    std::vector<StreamSet *> MatchResultsBufs(numOfREs);
     
    245244    // can bypass transposition and use the Direct CC compiler.
    246245    const auto isSimple = (numOfREs == 1) && (mGrepRecordBreak != GrepRecordBreakKind::Unicode) && (!anyGCB);
    247     if (isSimple) {
    248         mREs[0] = toUTF8(mREs[0]);
    249     }
    250 
    251     bool requiresComplexTest = true;
     246    const auto isWithinByteTestLimit = byteTestsWithinLimit(mREs[0], ByteCClimit);
     247    const auto hasTriCC = hasTriCCwithinLimit(mREs[0], ByteCClimit, prefixRE, suffixRE);
     248    const auto internalS2P = isSimple && (isWithinByteTestLimit || hasTriCC);
     249   
    252250    Component internalComponents = Component::NoComponents;
    253 
    254 
    255 
    256     if (isSimple) {
    257         if (hasComponent(mRequiredComponents, Component::MoveMatchesToEOL)) {
     251    if (internalS2P && hasComponent(mRequiredComponents, Component::MoveMatchesToEOL)) {
     252        setComponent(internalComponents, Component::MoveMatchesToEOL);
     253    }
     254
     255    StreamSet * SourceStream = ByteStream;
     256    StreamSet * const RequiredStreams = P->CreateStreamSet();
     257    StreamSet * UnicodeLB = nullptr;
     258    std::map<std::string, StreamSet *> propertyStream;
     259    StreamSet * GCB_stream = nullptr;
     260   
     261    if (!internalS2P) {
     262        StreamSet * BasisBits = P->CreateStreamSet(ENCODING_BITS, 1);
     263        if (PabloTransposition) {
     264            P->CreateKernelCall<S2P_PabloKernel>(SourceStream, BasisBits);
     265        } else {
     266            //P->CreateKernelCall<S2PKernel>(ByteStream, BasisBits);
     267            Kernel * s2pK = P->CreateKernelCall<S2PKernel>(SourceStream, BasisBits, cc::BitNumbering::LittleEndian, callbackObject);
     268            mGrepDriver.LinkFunction(s2pK, "signal_dispatcher", kernel::signal_dispatcher);
     269        }
     270        SourceStream = BasisBits;
     271    }
     272
     273    if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
     274        UnicodeLB = P->CreateStreamSet();
     275        StreamSet * const LineFeedStream = P->CreateStreamSet();
     276        P->CreateKernelCall<LineFeedKernelBuilder>(SourceStream, LineFeedStream);
     277        P->CreateKernelCall<RequiredStreams_UTF8>(SourceStream, LineFeedStream, RequiredStreams, UnicodeLB);
     278        LineBreakStream = UnicodeLB;
     279    }
     280    else if (!internalS2P) {
     281        P->CreateKernelCall<UTF8_nonFinal>(SourceStream, RequiredStreams);
     282        if (mGrepRecordBreak == GrepRecordBreakKind::LF) {
     283            P->CreateKernelCall<LineFeedKernelBuilder>(SourceStream, LineBreakStream);
     284        } else { // if (mGrepRecordBreak == GrepRecordBreakKind::Null) {
     285            P->CreateKernelCall<CharacterClassKernelBuilder>( "Null", std::vector<re::CC *>{mBreakCC}, SourceStream, LineBreakStream);
     286        }
     287    }
     288
     289    if (PropertyKernels) {
     290        for (auto p : mUnicodeProperties) {
     291            auto name = p->getFullName();
     292            StreamSet * property = P->CreateStreamSet(1, 1);
     293            propertyStream.emplace(name, property);
     294            P->CreateKernelCall<UnicodePropertyKernelBuilder>(p, SourceStream, property);
     295        }
     296    }
     297
     298    if (hasComponent(mRequiredComponents, Component::GraphemeClusterBoundary)) {
     299        GCB_stream = P->CreateStreamSet();
     300        P->CreateKernelCall<GraphemeClusterBreakKernel>(SourceStream, RequiredStreams, GCB_stream);
     301    }
     302
     303    for(unsigned i = 0; i < numOfREs; ++i) {
     304        StreamSet * const MatchResults = P->CreateStreamSet(1, 1);
     305        MatchResultsBufs[0] = MatchResults;
     306        std::unique_ptr<GrepKernelOptions> options = make_unique<GrepKernelOptions>();
     307        options->setIndexingAlphabet(&cc::UTF8);
     308        options->setSource(SourceStream);
     309        options->setResults(MatchResults);
     310        if (hasComponent(internalComponents, Component::MoveMatchesToEOL)) {
    258311            re::RE * notBreak = re::makeDiff(re::makeByte(0x00, 0xFF), mBreakCC);
    259             mREs[0] = re::makeSeq({mREs[0], re::makeRep(notBreak, 0, re::Rep::UNBOUNDED_REP), makeNegativeLookAheadAssertion(notBreak)});
    260             setComponent(internalComponents, Component::MoveMatchesToEOL);
    261         }
    262         std::unique_ptr<GrepKernelOptions> options = make_unique<GrepKernelOptions>();
    263         const auto isWithinByteTestLimit = byteTestsWithinLimit(mREs[0], ByteCClimit);
    264         const auto hasTriCC = hasTriCCwithinLimit(mREs[0], ByteCClimit, prefixRE, suffixRE);
    265         if (isWithinByteTestLimit || hasTriCC) {
    266             if (MultithreadedSimpleRE && hasTriCC) {
    267                 auto CCs = re::collectCCs(prefixRE, cc::Byte);
    268                 for (auto cc : CCs) {
    269                     auto ccName = makeName(cc);
    270                     mREs[0] = re::replaceCC(mREs[0], cc, ccName);
    271                     auto ccNameStr = ccName->getFullName();
    272                     StreamSet * const ccStream = P->CreateStreamSet(1, 1);
    273                     P->CreateKernelCall<CharacterClassKernelBuilder>(ccNameStr, std::vector<re::CC *>{cc}, ByteStream, ccStream);
    274                     options->addExternal(ccNameStr, ccStream);
     312            if (isWithinByteTestLimit) {
     313                mREs[i] = re::makeSeq({mREs[i], re::makeRep(notBreak, 0, re::Rep::UNBOUNDED_REP), makeNegativeLookAheadAssertion(notBreak)});
     314            } else {
     315                suffixRE = re::makeSeq({suffixRE, re::makeRep(notBreak, 0, re::Rep::UNBOUNDED_REP), makeNegativeLookAheadAssertion(notBreak)});
     316            }
     317        }
     318        options->setRE(mREs[i]);
     319        if (internalS2P) {
     320            if (!isWithinByteTestLimit) {
     321                if (MultithreadedSimpleRE) {
     322                    auto CCs = re::collectCCs(prefixRE, cc::Byte);
     323                    for (auto cc : CCs) {
     324                        auto ccName = makeName(cc);
     325                        prefixRE = re::replaceCC(prefixRE, cc, ccName);
     326                        suffixRE = re::replaceCC(suffixRE, cc, ccName);
     327                        auto ccNameStr = ccName->getFullName();
     328                        StreamSet * const ccStream = P->CreateStreamSet(1, 1);
     329                        P->CreateKernelCall<CharacterClassKernelBuilder>(ccNameStr, std::vector<re::CC *>{cc}, SourceStream, ccStream);
     330                        options->addExternal(ccNameStr, ccStream);
     331                    }
    275332                }
    276             }
    277             StreamSet * const MatchResults = P->CreateStreamSet(1, 1);
    278             MatchResultsBufs[0] = MatchResults;
    279             if (isWithinByteTestLimit) {
    280                 options->setRE(mREs[0]);
    281                 options->setSource(ByteStream);
    282                 options->setResults(MatchResults);
    283                 P->CreateKernelCall<ICGrepKernel>(std::move(options));
    284             } else {
    285                 //P->CreateKernelCall<ByteBitGrepKernel>(prefixRE, suffixRE, ByteStream, MatchResults, externals);
    286333                options->setPrefixRE(prefixRE);
    287334                options->setRE(suffixRE);
    288                 options->setSource(ByteStream);
    289                 options->setResults(MatchResults);
    290                 P->CreateKernelCall<ICGrepKernel>(std::move(options));
    291             }
    292             Kernel * LB_nullK = P->CreateKernelCall<CharacterClassKernelBuilder>( "breakCC", std::vector<re::CC *>{mBreakCC}, ByteStream, LineBreakStream, callbackObject);
     335            }
     336            Kernel * LB_nullK = P->CreateKernelCall<CharacterClassKernelBuilder>( "breakCC", std::vector<re::CC *>{mBreakCC}, SourceStream, LineBreakStream, callbackObject);
    293337            mGrepDriver.LinkFunction(LB_nullK, "signal_dispatcher", kernel::signal_dispatcher);
    294             requiresComplexTest = false;
    295         }
    296     }
    297 
    298     if (requiresComplexTest) {
    299 
    300         StreamSet * const BasisBits = P->CreateStreamSet(ENCODING_BITS, 1);
    301         if (PabloTransposition) {
    302             P->CreateKernelCall<S2P_PabloKernel>(ByteStream, BasisBits);
    303338        } else {
    304             //P->CreateKernelCall<S2PKernel>(ByteStream, BasisBits);
    305             Kernel * s2pK = P->CreateKernelCall<S2PKernel>(ByteStream, BasisBits, cc::BitNumbering::LittleEndian, callbackObject);
    306             mGrepDriver.LinkFunction(s2pK, "signal_dispatcher", kernel::signal_dispatcher);
    307         }
    308 
    309         StreamSet * const RequiredStreams = P->CreateStreamSet();
    310         StreamSet * UnicodeLB = nullptr;
    311 
    312         if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
    313             UnicodeLB = P->CreateStreamSet();
    314             StreamSet * const LineFeedStream = P->CreateStreamSet();
    315             P->CreateKernelCall<LineFeedKernelBuilder>(BasisBits, LineFeedStream);
    316             P->CreateKernelCall<RequiredStreams_UTF8>(BasisBits, LineFeedStream, RequiredStreams, UnicodeLB);
    317             LineBreakStream = UnicodeLB;
    318         } else {
    319             P->CreateKernelCall<UTF8_nonFinal>(BasisBits, RequiredStreams);
    320             if (mGrepRecordBreak == GrepRecordBreakKind::LF) {
    321                 P->CreateKernelCall<LineFeedKernelBuilder>(BasisBits, LineBreakStream);
    322             } else { // if (mGrepRecordBreak == GrepRecordBreakKind::Null) {
    323                 P->CreateKernelCall<CharacterClassKernelBuilder>( "Null", std::vector<re::CC *>{mBreakCC}, BasisBits, LineBreakStream);
    324             }
    325         }
    326 
    327         std::map<std::string, StreamSet *> propertyStream;
    328         if (PropertyKernels) {
    329             for (auto p : mUnicodeProperties) {
    330                 auto name = p->getFullName();
    331                 StreamSet * property = P->CreateStreamSet(1, 1);
    332                 propertyStream.emplace(name, property);
    333                 P->CreateKernelCall<UnicodePropertyKernelBuilder>(p, BasisBits, property);
    334             }
    335         }
    336 
    337         StreamSet * GCB_stream = nullptr;
    338         if (hasComponent(mRequiredComponents, Component::GraphemeClusterBoundary)) {
    339             GCB_stream = P->CreateStreamSet();
    340             P->CreateKernelCall<GraphemeClusterBreakKernel>(BasisBits, RequiredStreams, GCB_stream);
    341         }
    342 
    343         for(unsigned i = 0; i < numOfREs; ++i) {
    344             std::unique_ptr<GrepKernelOptions> options = make_unique<GrepKernelOptions>();
    345339            options->addExternal("UTF8_nonfinal", RequiredStreams);
    346340            if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
     
    362356                options->addExternal("\\b{g}", GCB_stream);
    363357            }
    364 
    365             StreamSet * const MatchResults = P->CreateStreamSet(1, 1);
    366             MatchResultsBufs[i] = MatchResults;
    367 
    368358            if (CC_Multiplexing) {
    369359                const auto UnicodeSets = re::collectCCs(mREs[i], cc::Unicode, std::set<re::Name *>{re::makeZeroWidth("\\b{g}")});
    370360                if (UnicodeSets.size() <= 1) {
    371361                    options->setRE(mREs[i]);
    372                     options->setSource(BasisBits);
    373                     options->setResults(MatchResults);
    374362                } else {
    375363                    auto mpx = std::make_shared<MultiplexedAlphabet>("mpx", UnicodeSets);
    376364                    mREs[i] = transformCCs(mpx, mREs[i]);
     365                    options->setRE(mREs[i]);
    377366                    auto mpx_basis = mpx->getMultiplexedCCs();
    378367                    StreamSet * const CharClasses = P->CreateStreamSet(mpx_basis.size());
    379                     P->CreateKernelCall<CharClassesKernel>(std::move(mpx_basis), BasisBits, CharClasses);
    380 
    381                     #warning TODO: multiplexed CCs ought to generate unique names. Make the name also dependent on alphabet.
    382                     // Multiplexing Grep Kernel is not Cachable, since for now it use string representation of RE AST as cache key,
    383                     // whileit is possible that two multiplexed REs with the same name "mpx_1" have different alphabets
    384                     options->setRE(mREs[i]);
    385                     options->setSource(BasisBits);
    386                     options->setResults(MatchResults);
     368                    P->CreateKernelCall<CharClassesKernel>(std::move(mpx_basis), SourceStream, CharClasses);
    387369                    options->addAlphabet(mpx, CharClasses);
    388                     P->CreateKernelCall<ICGrepKernel>(std::move(options));
    389370                }
    390             } else {
    391                 options->setRE(mREs[i]);
    392                 options->setSource(BasisBits);
    393                 options->setResults(MatchResults);
    394                 P->CreateKernelCall<ICGrepKernel>(std::move(options));
    395             }
    396         }
    397 
    398     } // end of requiresComplexTest
     371            }
     372        }
     373        P->CreateKernelCall<ICGrepKernel>(std::move(options));
     374    }
    399375
    400376    StreamSet * Matches = MatchResultsBufs[0];
  • icGREP/icgrep-devel/icgrep/grep/grep_kernel.cpp

    r6294 r6297  
    313313
    314314void GrepKernelOptions::setNumbering(cc::BitNumbering numbering) {mBasisSetNumbering = numbering;}
    315 void GrepKernelOptions::setIndexingAlphabet(cc::Alphabet * a) {mIndexingAlphabet = a;}
     315void GrepKernelOptions::setIndexingAlphabet(const cc::Alphabet * a) {mIndexingAlphabet = a;}
    316316void GrepKernelOptions::setRE(RE * e) {mRE = e;}
    317317void GrepKernelOptions::setPrefixRE(RE * e) {mPrefixRE = e;}
     
    372372    options->streamSetOutputBindings(),
    373373    options->scalarInputBindings(),
    374               options->scalarOutputBindings()), mOptions(std::move(options)) {
     374    options->scalarOutputBindings()), mOptions(std::move(options)) {
    375375}
    376376
     
    389389    }
    390390    //cc::Parabix_CC_Compiler ccc(getEntryScope(), getInputStreamSet("basis"), mOptions->mBasisSetNumbering);
    391     RE_Compiler re_compiler(getEntryScope(), *ccc.get(), mOptions->mBasisSetNumbering);
     391    RE_Compiler re_compiler(getEntryScope(), *ccc.get(), *(mOptions->mIndexingAlphabet), mOptions->mBasisSetNumbering);
    392392    for (const auto & e : mOptions->mExternals) {
    393393        re_compiler.addPrecompiled(e.first, pb.createExtract(getInputStreamVar(e.first), pb.getInteger(0)));
     
    422422
    423423        cc::Parabix_CC_Compiler ccc(scope1, basis);
    424         RE_Compiler re_compiler(scope1, ccc);
     424        RE_Compiler re_compiler(scope1, ccc, *(mOptions->mIndexingAlphabet), mOptions->mBasisSetNumbering);
    425425        scope1->createAssign(final_matches, re_compiler.compile(mOptions->mRE, prefixMatches));
    426426        Var * const output = getOutputStreamVar("matches");
  • icGREP/icgrep-devel/icgrep/grep/grep_kernel.h

    r6294 r6297  
    6262        mPrefixRE(nullptr) {}
    6363    void setNumbering(cc::BitNumbering numbering);
    64     void setIndexingAlphabet(cc::Alphabet * a);
     64    void setIndexingAlphabet(const cc::Alphabet * a);
    6565    void setSource(StreamSet * s);
    6666    void setResults(StreamSet * r);
  • icGREP/icgrep-devel/icgrep/kernels/kernel.cpp

    r6296 r6297  
    373373    std::vector<Value *> args;
    374374    args.reserve(mCurrentMethod->arg_size());
    375     for (Argument & arg : mCurrentMethod->getArgumentList()) {
    376         args.push_back(&arg);
     375    for (auto ArgI = mCurrentMethod->arg_begin(); ArgI != mCurrentMethod->arg_end(); ++ArgI) {
     376        args.push_back(&(*ArgI));
    377377    }
    378378    setDoSegmentProperties(b, args);
  • icGREP/icgrep-devel/icgrep/re/re_analysis.cpp

    r6292 r6297  
    1818#include <cc/alphabet.h>
    1919#include <cc/multiplex_CCs.h>
     20#include <UCD/UTF.h>
    2021#include <limits.h>
    2122#include <llvm/Support/ErrorHandling.h>
     
    250251}
    251252
    252 std::pair<int, int> getUnicodeUnitLengthRange(const RE * re) {
     253std::pair<int, int> getLengthRange(const RE * re, const cc::Alphabet * indexAlphabet) {
    253254    if (const Alt * alt = dyn_cast<Alt>(re)) {
    254255        std::pair<int, int> range = std::make_pair(INT_MAX, 0);
    255         for (const RE * re : *alt) {
    256             auto r = getUnicodeUnitLengthRange(re);
    257             range.first = std::min<int>(range.first, r.first);
    258             range.second = std::max<int>(range.second, r.second);
     256        for (const RE * a : *alt) {
     257            auto a_range = getLengthRange(a, indexAlphabet);
     258            range.first = std::min<int>(range.first, a_range.first);
     259            range.second = std::max<int>(range.second, a_range.second);
    259260        }
    260261        return range;
     
    262263        std::pair<int, int> range = std::make_pair(0, 0);
    263264        for (const RE * re : *seq) {
    264             auto tmp = getUnicodeUnitLengthRange(re);
     265            auto tmp = getLengthRange(re, indexAlphabet);
    265266            if (LLVM_LIKELY(tmp.first < (INT_MAX - range.first))) {
    266267                range.first += tmp.first;
     
    276277        return range;
    277278    } else if (const Rep * rep = dyn_cast<Rep>(re)) {
    278         auto range = getUnicodeUnitLengthRange(rep->getRE());
     279        auto range = getLengthRange(rep->getRE(), indexAlphabet);
    279280        if (LLVM_LIKELY(rep->getLB() != Rep::UNBOUNDED_REP && range.first < INT_MAX)) {
    280281            range.first *= rep->getLB();
     
    292293    } else if (const Diff * diff = dyn_cast<Diff>(re)) {
    293294        // The range is determined by the first operand only.
    294         return getUnicodeUnitLengthRange(diff->getLH());
     295        return getLengthRange(diff->getLH(), indexAlphabet);
    295296    } else if (const Intersect * i = dyn_cast<Intersect>(re)) {
    296         const auto r1 = getUnicodeUnitLengthRange(i->getLH());
    297         const auto r2 = getUnicodeUnitLengthRange(i->getRH());
     297        const auto r1 = getLengthRange(i->getLH(), indexAlphabet);
     298        const auto r2 = getLengthRange(i->getRH(), indexAlphabet);
    298299        // The matched string cannot be shorter than the largest of the min lengths
    299300        // nor can it be longer than the smallest of the max lengths.
    300301        return std::make_pair(std::max(r1.first, r2.first), std::min(r1.second, r2.second));
    301     } else if (isa<CC>(re)) {
    302         return std::make_pair(1, 1);
    303     } else if (const Name * n = dyn_cast<Name>(re)) {
    304         // Eventually names might be set up for not unit length items.
     302    } else if (const CC * cc = dyn_cast<CC>(re)) {
     303        auto alphabet = cc->getAlphabet();
     304        if (const cc::MultiplexedAlphabet * a = dyn_cast<cc::MultiplexedAlphabet>(alphabet)) {
     305            alphabet = a->getSourceAlphabet();
     306        }
     307        if (isa<cc::CodeUnitAlphabet>(alphabet)) return std::make_pair(1, 1);
     308        if (indexAlphabet == alphabet) return std::make_pair(1, 1);
     309        if ((indexAlphabet == &cc::UTF8) && (alphabet == &cc::Unicode)) {
     310            return std::make_pair(UTF<8>::encoded_length(lo_codepoint(cc->front())),
     311                                  UTF<8>::encoded_length(hi_codepoint(cc->back())));
     312        }
     313        return std::make_pair(0, INT_MAX);
     314    } else if (const Name * n = dyn_cast<Name>(re)) {
    305315        switch (n->getType()) {
    306             case Name::Type::Unicode:
    307             case Name::Type::UnicodeProperty:
    308                 return std::make_pair(1, 1);
    309             case Name::Type::Capture:
    310             case Name::Type::Reference:
    311                 return getUnicodeUnitLengthRange(n->getDefinition());
    312             case Name::Type::ZeroWidth:
    313                 return std::make_pair(0, 0);
    314316            case Name::Type::Unknown:
    315317                return std::make_pair(0, INT_MAX);
    316         }
    317     }
     318            default:
     319                return getLengthRange(n->getDefinition(), indexAlphabet);
     320        }
     321    }
    318322    return std::make_pair(1, 1);
    319323}
    320    
     324
    321325bool isFixedLength(const RE * re) {
    322326    if (isa<Alt>(re)) {
    323         auto range = getUnicodeUnitLengthRange(re);
     327        auto range = getLengthRange(re, &cc::Unicode);
    324328        return range.first == range.second;
    325329    } else if (const Seq * seq = dyn_cast<Seq>(re)) {
  • icGREP/icgrep-devel/icgrep/re/re_analysis.h

    r6160 r6297  
    44#include <utility>
    55namespace re { class RE; class Name; class CC;}
     6namespace cc { class Alphabet;}
    67
    78namespace re {
     
    2324bool isUnicodeUnitLength(const RE * re);
    2425
    25 std::pair<int, int> getUnicodeUnitLengthRange(const RE * re);
     26std::pair<int, int> getLengthRange(const RE * re, const cc::Alphabet * indexingAlphabet);
    2627
    2728bool isFixedLength(const RE * re);
  • icGREP/icgrep-devel/icgrep/re/re_compiler.cpp

    r6292 r6297  
    2828#include <re/re_toolchain.h>        // for AlgorithmOptionIsSet, RE_Algorith...
    2929#include <cc/alphabet.h>
     30#include <cc/multiplex_CCs.h>
    3031#include <cc/cc_compiler.h>
    3132#include <UCD/ucd_compiler.hpp>
     
    301302
    302303inline bool alignedUnicodeLength(const RE * const lh, const RE * const rh) {
    303     const auto lhl = getUnicodeUnitLengthRange(lh);
    304     const auto rhl = getUnicodeUnitLengthRange(rh);
     304    const auto lhl = getLengthRange(lh, &cc::Unicode);
     305    const auto rhl = getLengthRange(rh, &cc::Unicode);
    305306    return (lhl.first == lhl.second && lhl.first == rhl.first && lhl.second == rhl.second);
    306307}
     
    345346
    346347/*
    347    Given a stream |repeated_j| marking positions associated with |j| conecutive matches to an item
    348    compute a stream marking |repeat_count| consecutive occurrences of such items.
     348   Given a stream |repeated_j| marking positions associated with |j| consecutive matches to an item
     349   of length |match_length| compute a stream marking |repeat_count| consecutive occurrences of such items.
    349350*/
    350351   
    351 PabloAST * RE_Compiler::consecutive_matches(PabloAST * const repeated_j, const int j, const int repeat_count, PabloAST * const indexStream, PabloBuilder & pb) {
     352PabloAST * RE_Compiler::consecutive_matches(PabloAST * const repeated_j, const int j, const int repeat_count, const int match_length, PabloAST * const indexStream, PabloBuilder & pb) {
    352353    if (j == repeat_count) {
    353354        return repeated_j;
     
    355356    const int i = std::min(j, repeat_count - j);
    356357    const int k = j + i;
    357     if (/*j > IfInsertionGap*/ false) {
     358    if (j > IfInsertionGap) {
    358359        Var * repeated = pb.createVar("repeated", pb.createZeroes());
    359360        auto nested = pb.createScope();
    360361        NameMap nestedMap(mCompiledName);
    361362        mCompiledName = &nestedMap;
    362         PabloAST * adv_i = nested.createIndexedAdvance(repeated_j, indexStream, i);
     363        PabloAST * adv_i = nested.createIndexedAdvance(repeated_j, indexStream, i * match_length);
    363364        PabloAST * repeated_k = nested.createAnd(repeated_j, adv_i, "at" + std::to_string(k) + "of" + std::to_string(repeat_count));
    364         nested.createAssign(repeated, consecutive_matches(repeated_k, k, repeat_count, indexStream, nested));
     365        nested.createAssign(repeated, consecutive_matches(repeated_k, k, repeat_count, match_length, indexStream, nested));
    365366        pb.createIf(repeated_j, nested);
    366367        mCompiledName = nestedMap.getParent();
    367368        return repeated;
    368369    } else {
    369         PabloAST * adv_i = pb.createIndexedAdvance(repeated_j, indexStream, i);
     370        PabloAST * adv_i = pb.createIndexedAdvance(repeated_j, indexStream, i * match_length);
    370371        PabloAST * repeated_k = pb.createAnd(repeated_j, adv_i, "at" + std::to_string(k) + "of" + std::to_string(repeat_count));
    371         return consecutive_matches(repeated_k, k, repeat_count, indexStream, pb);
     372        return consecutive_matches(repeated_k, k, repeat_count, match_length, indexStream, pb);
    372373    }
    373374}
     
    405406        // Check for a regular expression that satisfies one of the special conditions that
    406407        // allow implementation using the log2 technique.
    407         if (isByteLength(repeated)) {
     408        auto lengths = getLengthRange(repeated, &mIndexingAlphabet);
     409        if (lengths.first == lengths.second) {
    408410            PabloAST * cc = markerVar(compile(repeated, pb));
    409             PabloAST * cc_lb = consecutive_matches(cc, 1, lb, nullptr, pb);
     411            PabloAST * cc_lb = consecutive_matches(cc, 1, lb, lengths.first, nullptr, pb);
    410412            const auto pos = markerPos(marker) == FinalMatchUnit ? lb : lb - 1;
    411413            PabloAST * marker_fwd = pb.createAdvance(markerVar(marker), pos);
     
    414416        else if (isUnicodeUnitLength(repeated)) {
    415417            PabloAST * cc = markerVar(compile(repeated, pb));
    416             PabloAST * cc_lb = consecutive_matches(cc, 1, lb, u8Final(pb), pb);
     418            PabloAST * cc_lb = consecutive_matches(cc, 1, lb, 1, u8Final(pb), pb);
    417419            const auto pos = markerPos(marker) == FinalMatchUnit ? lb : lb - 1;
    418420            PabloAST * marker_fwd = pb.createIndexedAdvance(markerVar(marker), u8Final(pb), pos);
     
    427429                // Consecutive submatches require that the symbol following the end of one submatch is the first symbol for
    428430                // the next submatch.   lb-1 such submatches are required.
    429                 PabloAST * consecutive_submatch = consecutive_matches(submatch, 1, lb-1, firstCCstream, pb);
     431                PabloAST * consecutive_submatch = consecutive_matches(submatch, 1, lb-1, 1, firstCCstream, pb);
    430432                // Find submatch positions which are lb-2 start symbols forward from the current marker position.
    431433                PabloAST * base = markerVar(AdvanceMarker(marker, FinalPostPositionUnit, pb));
     
    630632}
    631633
    632 RE_Compiler::RE_Compiler(PabloBlock * scope, cc::CC_Compiler & ccCompiler, cc::BitNumbering basisSetNumbering)
     634RE_Compiler::RE_Compiler(PabloBlock * scope,
     635                         cc::CC_Compiler & ccCompiler,
     636                         const cc::Alphabet & indexingAlphabet,
     637                         cc::BitNumbering basisSetNumbering)
    633638: mEntryScope(scope)
    634639, mCCCompiler(ccCompiler)
     640, mIndexingAlphabet(indexingAlphabet)
     641, mBasisSetNumbering(basisSetNumbering)
    635642, mLineBreak(nullptr)
    636643, mWhileTest(nullptr)
    637644, mStarDepth(0)
    638 , mCompiledName(&mBaseMap)
    639 , mBasisSetNumbering(basisSetNumbering) {
     645, mCompiledName(&mBaseMap) {
    640646    PabloBuilder pb(mEntryScope);
    641647    mLineBreak = pb.createZeroes();  // default so "^/$" matches start/end of text only
  • icGREP/icgrep-devel/icgrep/re/re_compiler.h

    r6184 r6297  
    5252    };
    5353
    54     RE_Compiler(pablo::PabloBlock * scope, cc::CC_Compiler & ccCompiler, cc::BitNumbering basisSetNumbering = cc::BitNumbering::LittleEndian);
     54    RE_Compiler(pablo::PabloBlock * scope,
     55                cc::CC_Compiler & ccCompiler,
     56                const cc::Alphabet & indexingAlphabet = cc::Byte,
     57                cc::BitNumbering basisSetNumbering = cc::BitNumbering::LittleEndian);
    5558   
    5659    //
     
    116119    MarkerType compileDiff(Diff * diff, MarkerType marker, pablo::PabloBuilder & cg);
    117120    MarkerType compileIntersect(Intersect * x, MarkerType marker, pablo::PabloBuilder & cg);
    118     pablo::PabloAST * consecutive_matches(pablo::PabloAST * repeated_j, int j, int repeat_count, pablo::PabloAST * indexStream, pablo::PabloBuilder & pb);
     121    pablo::PabloAST * consecutive_matches(pablo::PabloAST * repeated_j, int j, int repeat_count, const int match_length, pablo::PabloAST * indexStream, pablo::PabloBuilder & pb);
    119122    pablo::PabloAST * reachable(pablo::PabloAST * repeated, int length, int repeat_count, pablo::PabloAST * indexStream, pablo::PabloBuilder & pb);
    120123    static bool isFixedLength(RE * regexp);
     
    143146    std::vector<cc::Alphabet *>                     mAlphabets;
    144147    std::vector<std::unique_ptr<cc::CC_Compiler>>   mAlphabetCompilers;
    145 
    146148    cc::CC_Compiler &                               mCCCompiler;
     149    const cc::Alphabet &                            mIndexingAlphabet;
     150    cc::BitNumbering                                mBasisSetNumbering;
    147151    pablo::PabloAST *                               mLineBreak;
    148152    re::Name *                                      mNonFinalName;
     
    152156    NameMap                                         mBaseMap;
    153157    std::map<std::string, pablo::PabloAST *>        mExternalNameMap;
    154     cc::BitNumbering mBasisSetNumbering;
    155158};
    156159
  • icGREP/icgrep-devel/icgrep/toolchain/cpudriver.cpp

    r6288 r6297  
    298298    mEngine->finalizeObject();
    299299    #else
    300     moduleSet.push_back(std::unique_ptr<Module>(mMainModule);
     300    moduleSet.push_back(std::unique_ptr<Module>(mMainModule));
    301301    mCompileLayer->addModuleSet(std::move(moduleSet), make_unique<SectionMemoryManager>(), std::move(Resolver));
    302302    #endif
Note: See TracChangeset for help on using the changeset viewer.