Changeset 5816


Ignore:
Timestamp:
Dec 31, 2017, 7:22:14 PM (12 months ago)
Author:
cameron
Message:

Supporting multiple alphabets in RE compilation - initial check-in

Location:
icGREP/icgrep-devel/icgrep
Files:
11 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/cc/alphabet.h

    r5800 r5816  
    2222public:
    2323    const std::string & getName() const { return mAlphabetName;}
     24    virtual const unsigned getSize() const = 0;
    2425    enum class ClassTypeId : unsigned {UnicodeMappableAlphabet, CodeUnitAlphabet, MultiplexedAlphabet};
    2526    inline ClassTypeId getClassTypeId() const {
     
    5758    unsigned fromUnicode(const UCD::codepoint_t ucp) const;
    5859
     60    const unsigned getSize() const override {return mUnicodeCommon + mAboveCommon.size();}
     61
    5962protected:
    6063    UCD::codepoint_t mCharSet;
     
    7174    static inline bool classof(const void *) {return false;}
    7275    uint8_t getCodeUnitBitWidth() { return mCodeUnitBits;}
    73    
     76    const unsigned getSize() const override {return 1<<mCodeUnitBits;}
     77
    7478private:
    7579    uint8_t mCodeUnitBits;
  • icGREP/icgrep-devel/icgrep/cc/multiplex_CCs.cpp

    r5801 r5816  
    99#include "boost/dynamic_bitset.hpp"
    1010#include <cc/multiplex_CCs.h>
     11#include <re/printer_re.h>
    1112#include <llvm/Support/Casting.h>
    1213#include <llvm/Support/ErrorHandling.h>
     14#include <llvm/Support/raw_ostream.h>
    1315
    1416namespace cc {
     
    138140   
    139141    const auto index = find(mUnicodeSets.begin(), mUnicodeSets.end(), sourceCC) - mUnicodeSets.begin();
     142    if (index >= mUnicodeSets.size()) {
     143        llvm::errs() << Printer_RE::PrintRE(sourceCC) << " not found\n";
     144    }
    140145    const auto exclusive_IDs = mExclusiveSetIDs[index];
    141146    re::CC * CC_union = re::makeCC(this);
  • icGREP/icgrep-devel/icgrep/cc/multiplex_CCs.h

    r5801 r5816  
    2020    }
    2121    static inline bool classof(const void *) {return false;}
     22   
     23    const unsigned getSize() const override {return mUnicodeSets.size() + 1;}
    2224
    2325    const Alphabet * getSourceAlphabet() const;
  • icGREP/icgrep-devel/icgrep/grep_engine.cpp

    r5812 r5816  
    128128        REs[i] = resolveModesAndExternalSymbols(REs[i]);
    129129        REs[i] = excludeUnicodeLineBreak(REs[i]);
    130 #define USE_MULTIPLEX_CC
     130//#define USE_MULTIPLEX_CC
    131131#ifdef USE_MULTIPLEX_CC
    132        
    133132        REs[i] = multiplexing_prepasses(REs[i]);
    134133        const std::vector<const re::CC *> UnicodeSets = re::collectUnicodeSets(REs[i]);
    135         mpx = make_unique<MultiplexedAlphabet>("mpx", UnicodeSets);
    136         REs[i] = transformCCs(mpx.get(), REs[i]);
    137         //llvm::errs() << Printer_RE::PrintRE(REs[i]) << '\n';
    138         std::vector<re::CC *> mpx_basis = mpx->getMultiplexedCCs();
    139         auto numOfCharacterClasses = mpx_basis.size();
    140         StreamSetBuffer * CharClasses = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), segmentSize * bufferSegments);
    141         kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis));
    142         mGrepDriver->makeKernelCall(ccK, {BasisBits}, {CharClasses});
    143         StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
    144         kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i], numOfCharacterClasses);
    145         mGrepDriver->makeKernelCall(icgrepK, {CharClasses, LineBreakStream, CRLFStream, RequiredStreams}, {MatchResults});
     134        if (UnicodeSets.size() <= 1) {
     135            StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
     136            kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i]);
     137            mGrepDriver->makeKernelCall(icgrepK, {BasisBits, LineBreakStream, CRLFStream, RequiredStreams}, {MatchResults});
     138            MatchResultsBufs[i] = MatchResults;
     139        }
     140        else {
     141            mpx = make_unique<MultiplexedAlphabet>("mpx", UnicodeSets);
     142            REs[i] = transformCCs(mpx.get(), REs[i]);
     143            std::vector<re::CC *> mpx_basis = mpx->getMultiplexedCCs();
     144            auto numOfCharacterClasses = mpx_basis.size();
     145            StreamSetBuffer * CharClasses = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), segmentSize * bufferSegments);
     146            kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis));
     147            mGrepDriver->makeKernelCall(ccK, {BasisBits}, {CharClasses});
     148            StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
     149            kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i], std::vector<cc::Alphabet *>{mpx.get()});
     150            mGrepDriver->makeKernelCall(icgrepK, {BasisBits, LineBreakStream, CRLFStream, RequiredStreams, CharClasses}, {MatchResults});
     151            MatchResultsBufs[i] = MatchResults;
     152        }
    146153#else
    147154        REs[i] = regular_expression_passes(REs[i]);
     
    149156        kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i]);
    150157        mGrepDriver->makeKernelCall(icgrepK, {BasisBits, LineBreakStream, CRLFStream, RequiredStreams}, {MatchResults});
     158        MatchResultsBufs[i] = MatchResults;
    151159#endif
    152         MatchResultsBufs[i] = MatchResults;
    153160    }
    154161    StreamSetBuffer * MergedResults = MatchResultsBufs[0];
  • icGREP/icgrep-devel/icgrep/kernels/grep_kernel.cpp

    r5797 r5816  
    1919#include <cc/cc_compiler.h>         // for CC_Compiler
    2020#include <cc/alphabet.h>
     21#include <re/re_compiler.h>
    2122#include <llvm/Support/raw_ostream.h>
    2223
     
    175176}
    176177
    177 ICGrepKernel::ICGrepKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, RE * const re, unsigned numOfCharacterClasses)
     178inline static unsigned ceil_log2(const unsigned v) {
     179    assert ("log2(0) is undefined!" && v != 0);
     180    return (sizeof(unsigned) * CHAR_BIT) - __builtin_clz(v - 1U);
     181}
     182
     183// Helper to compute stream set inputs to pass into PabloKernel constructor.
     184std::vector<Binding> icGrepInputs(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, std::vector<cc::Alphabet *> alphabets) {
     185    std::vector<Binding> streamSetInputs = {Binding{iBuilder->getStreamSetTy(8), "basis"},
     186        Binding{iBuilder->getStreamSetTy(1, 1), "linebreak"},
     187        Binding{iBuilder->getStreamSetTy(1, 1), "cr+lf"},
     188        Binding{iBuilder->getStreamSetTy(3, 1), "required"}};
     189    for (unsigned i = 0; i < alphabets.size(); i++) {
     190        unsigned basis_size = ceil_log2(alphabets[i]->getSize());
     191        streamSetInputs.push_back(Binding{iBuilder->getStreamSetTy(basis_size, 1), "basisSet" + std::to_string(i)});
     192    }
     193    return streamSetInputs;
     194}
     195
     196ICGrepKernel::ICGrepKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, RE * const re, std::vector<cc::Alphabet *> alphabets)
    178197: ICGrepSignature(re)
    179198, PabloKernel(iBuilder, "ic" + sha1sum(mSignature),
    180199// inputs
    181 {Binding{iBuilder->getStreamSetTy(numOfCharacterClasses), "basis"},
    182 Binding{iBuilder->getStreamSetTy(1, 1), "linebreak"},
    183 Binding{iBuilder->getStreamSetTy(1, 1), "cr+lf"},
    184 Binding{iBuilder->getStreamSetTy(3, 1), "required"}},
     200icGrepInputs(iBuilder, alphabets),
    185201// output
    186202{Binding{iBuilder->getStreamSetTy(1, 1), "matches", FixedRate(), Add1()}}) {
    187 
     203    mAlphabets = alphabets;
    188204}
    189205
     
    193209
    194210void ICGrepKernel::generatePabloMethod() {
    195     PabloAST * const match_post = re2pablo_compiler(this, mRE);
     211    Var * const basis = getInputStreamVar("basis");
     212    cc::CC_Compiler cc_compiler(this, basis);
     213    RE_Compiler re_compiler(this, cc_compiler);
     214    for (unsigned i = 0; i < mAlphabets.size(); i++) {
     215        auto basis = getInputStreamVar("basisSet" + std::to_string(i));
     216        re_compiler.addAlphabet(mAlphabets[i], basis);
     217    }
     218    PabloAST * const match_post = re_compiler.compile(mRE);
    196219    PabloBlock * const pb = getEntryBlock();
    197220    Var * const output = getOutputStreamVar("matches");
  • icGREP/icgrep-devel/icgrep/kernels/grep_kernel.h

    r5769 r5816  
    1010namespace IDISA { class IDISA_Builder; }
    1111namespace re { class RE; }
     12namespace cc { class Alphabet; }
    1213namespace kernel {
    1314
     
    4243class ICGrepKernel : public ICGrepSignature, public pablo::PabloKernel {
    4344public:
    44     ICGrepKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, re::RE * const re_ast, const unsigned numOfCharacterClasses = 8);
     45    ICGrepKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, re::RE * const re_ast, std::vector<cc::Alphabet *> alphabets = {});
    4546    std::string makeSignature(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) override;
    4647    bool isCachable() const override { return true; }
    4748protected:
    4849    void generatePabloMethod() override;
     50    std::vector<cc::Alphabet *> mAlphabets;
    4951};
    5052
  • icGREP/icgrep-devel/icgrep/re/printer_re.cpp

    r5786 r5816  
    2222#include <re/re_assertion.h>
    2323#include <re/re_group.h>
     24#include <cc/alphabet.h>
    2425
    2526using namespace re;
     
    5455            retVal += "]";
    5556        }
     57        retVal += "/" + re_cc->getAlphabet()->getName();
    5658    } else if (const Name* re_name = dyn_cast<const Name>(re)) {
    5759        retVal = "Name \"";
     
    6264        retVal += re_name->getName();
    6365        retVal += "\" ";
    64         if (re_name->getType() == Name::Type::Capture) {
     66        //if (re_name->getType() == Name::Type::Capture) {
    6567            retVal += "=(" + PrintRE(re_name->getDefinition()) + ")";
    66         }
     68        //}
    6769    } else if (const Range* rg = dyn_cast<const Range>(re)) {
    6870        retVal = "Range (";
  • icGREP/icgrep-devel/icgrep/re/re_compiler.cpp

    r5813 r5816  
    2929#include <re/to_utf8.h>
    3030#include <re/re_toolchain.h>        // for AlgorithmOptionIsSet, RE_Algorith...
    31 #include "cc/cc_compiler.h"         // for CC_Compiler
     31#include <cc/alphabet.h>
     32#include <cc/cc_compiler.h>
    3233#include "pablo/builder.hpp"        // for PabloBuilder
    3334#include <IR_Gen/idisa_target.h>    // for AVX2_available
     35#include <llvm/ADT/STLExtras.h> // for make_unique
     36#include <llvm/Support/raw_ostream.h>
    3437#include <llvm/Support/ErrorHandling.h>
    3538
    3639namespace pablo { class PabloAST; }
     40namespace pablo { class Var; }
    3741namespace pablo { class PabloKernel; }
    3842namespace re { class Alt; }
     
    4549
    4650namespace re {
     51
     52   
     53void RE_Compiler::addAlphabet(cc::Alphabet * a, pablo::Var * basis_set) {
     54    mAlphabets.push_back(a);
     55    mAlphabetCompilers.push_back(make_unique<cc::CC_Compiler>(mKernel, basis_set));
     56}
    4757
    4858using MarkerType = RE_Compiler::MarkerType;
     
    97107MarkerType RE_Compiler::compileCC(CC * cc, MarkerType marker, PabloBuilder & pb) {
    98108    PabloAST * nextPos = markerVar(marker);
    99     // If Unicode CCs weren't pulled out earlier, we generate the equivalent
    100     // byte sequence as an RE.
    101     if (cc->getAlphabet() == &cc::Unicode) {
    102          MarkerType m = compile(toUTF8(cc), pb);
    103          nextPos = markerVar(AdvanceMarker(marker, FinalPostPositionUnit, pb));
    104          return makeMarker(FinalMatchUnit, pb.createAnd(markerVar(m), nextPos));
    105     }
    106     if (isByteLength(cc)) {
     109    const cc::Alphabet * a = cc->getAlphabet();
     110    if (a == &cc::Byte) {
    107111        if (marker.pos == FinalMatchUnit) {
    108112            nextPos = pb.createAdvance(nextPos, 1);
    109113        }
     114        return makeMarker(FinalMatchUnit, pb.createAnd(nextPos, mCCCompiler.compileCC(cc, pb)));
     115    } else if (a == &cc::Unicode) {
     116        MarkerType m = compile(toUTF8(cc), pb);
     117        nextPos = markerVar(AdvanceMarker(marker, FinalPostPositionUnit, pb));
     118        return makeMarker(FinalMatchUnit, pb.createAnd(markerVar(m), nextPos));
    110119    } else {
    111         nextPos = markerVar(AdvanceMarker(marker, FinalPostPositionUnit, pb));
    112     }
    113     return makeMarker(FinalMatchUnit, pb.createAnd(nextPos, mCCCompiler.compileCC(cc, pb)));
     120        if (isByteLength(cc)) {
     121            if (marker.pos == FinalMatchUnit) {
     122                nextPos = pb.createAdvance(nextPos, 1);
     123            }
     124        } else {
     125            nextPos = markerVar(AdvanceMarker(marker, FinalPostPositionUnit, pb));
     126        }
     127        unsigned i = 0;
     128        while (i < mAlphabets.size() && (a != mAlphabets[i])) i++;
     129        if (i == mAlphabets.size()) llvm::report_fatal_error("Alphabet " + a->getName() + " has no CC compiler");
     130        return makeMarker(FinalMatchUnit, pb.createAnd(nextPos, mAlphabetCompilers[i]->compileCC(cc, pb)));
     131    }
    114132}
    115133
  • icGREP/icgrep-devel/icgrep/re/re_compiler.h

    r5812 r5816  
    1212#include <pablo/builder.hpp>
    1313#include <vector>       // for vector<>::iterator
    14 namespace cc { class CC_Compiler; }
     14namespace cc { class CC_Compiler; class Alphabet;}
    1515namespace pablo { class PabloAST; }
    1616namespace pablo { class PabloBuilder; }
    1717namespace pablo { class PabloKernel; }
     18namespace pablo { class Var; }
    1819namespace re { class Alt; }
    1920namespace re { class Assertion; }
     
    5152
    5253    RE_Compiler(pablo::PabloKernel * kernel, cc::CC_Compiler & ccCompiler);
     54   
     55    //
     56    // The CCs (character classes) within a regular expression are generally
     57    // expressed using a single alphabet.   But multiple alphabets may be
     58    // used under some circumstances.   For example, regular expressions for
     59    // Unicode may use both the Unicode alphabet for full Unicode characters
     60    // as well as the Byte alphabet for the individual code units of UTF-8.
     61    // In other cases, a multiplexed alphabet may be used for a certain
     62    // subexpression, for example, if the subexpression involves a local
     63    // language or a capture-backreference combination.
     64    //
     65    // Alphabets are added as needed using the addAlphabet method, giving both
     66    // the alphabet value and the set of parallel bit streams that comprise
     67    // a basis for the coded alphabet values.
     68   
     69    void addAlphabet(cc::Alphabet * a, pablo::Var * basis_set);
     70
    5371    pablo::PabloAST * compile(RE * re, pablo::PabloAST * const initialCursors = nullptr);
    5472
     
    113131
    114132    pablo::PabloKernel * const                      mKernel;
     133    std::vector<cc::Alphabet *>                     mAlphabets;
     134    std::vector<std::unique_ptr<cc::CC_Compiler>>   mAlphabetCompilers;
     135
    115136    bool                                            mCountOnly;
    116137    cc::CC_Compiler &                               mCCCompiler;
  • icGREP/icgrep-devel/icgrep/re/re_parser.cpp

    r5814 r5816  
    722722        else return createCC(cp);
    723723    }
    724     else return makeCC(parse_escaped_codepoint());
     724    else return createCC(parse_escaped_codepoint());
    725725}
    726726
  • icGREP/icgrep-devel/icgrep/re/re_toolchain.cpp

    r5806 r5816  
    159159    Var * const basis = kernel->getInputStreamVar("basis");
    160160    cc::CC_Compiler cc_compiler(kernel, basis);
    161     // compile Unicode names
    162161    RE_Compiler re_compiler(kernel, cc_compiler);
    163162    return re_compiler.compile(re_ast);
Note: See TracChangeset for help on using the changeset viewer.