Changeset 5881


Ignore:
Timestamp:
Feb 25, 2018, 12:38:51 PM (14 months ago)
Author:
cameron
Message:

Grapheme Cluster Break kernel

Location:
icGREP/icgrep-devel/icgrep
Files:
2 added
14 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/CMakeLists.txt

    r5873 r5881  
    112112target_link_libraries (RegExpCompiler UCDlib CCADT RegExpADT)
    113113
    114 add_executable(icgrep icgrep.cpp grep_interface.cpp grep_engine.cpp kernels/u8u32_kernel.cpp kernels/delmask_kernel.cpp kernels/cc_kernel.cpp kernels/cc_scan_kernel.cpp kernels/charclasses.cpp kernels/linebreak_kernel.cpp kernels/streams_merge.cpp kernels/grep_kernel.cpp kernels/until_n.cpp)
     114add_executable(icgrep icgrep.cpp grep_interface.cpp grep_engine.cpp kernels/u8u32_kernel.cpp kernels/delmask_kernel.cpp kernels/cc_kernel.cpp kernels/cc_scan_kernel.cpp kernels/charclasses.cpp kernels/linebreak_kernel.cpp kernels/streams_merge.cpp kernels/grep_kernel.cpp kernels/until_n.cpp kernels/grapheme_kernel.cpp)
    115115add_executable(u8u16 u8u16.cpp)
    116116add_executable(base64 base64.cpp kernels/radix64.cpp)
  • icGREP/icgrep-devel/icgrep/UCD/resolve_properties.cpp

    r5880 r5881  
    7171            return true;
    7272        } else if (value == "\\b{g}") {
    73             generateGraphemeClusterBoundaryRule(property);
     73            RE * gcb = generateGraphemeClusterBoundaryRule();
     74            property->setDefinition(resolveUnicodeProperties(gcb));
    7475            return true;
    7576        } else if (value == "^s") {  // "start anchor (^) in single-line mode"
  • icGREP/icgrep-devel/icgrep/grep_engine.cpp

    r5867 r5881  
    44 *  icgrep is a trademark of International Characters.
    55 */
    6 
     6#include <set>
    77#include "grep_engine.h"
    88#include "grep_interface.h"
     
    1313#include <kernels/cc_kernel.h>
    1414#include <kernels/grep_kernel.h>
     15#include <kernels/grapheme_kernel.h>
    1516#include <kernels/linebreak_kernel.h>
    1617#include <kernels/streams_merge.h>
     
    2324#include <pablo/pablo_kernel.h>
    2425#include <re/re_cc.h>
     26#include <re/re_name.h>
    2527#include <re/casing.h>
    2628#include <re/exclude_CC.h>
     
    5355static cl::opt<int> Threads("t", cl::desc("Total number of threads."), cl::init(2));
    5456static cl::opt<bool> PabloTransposition("enable-pablo-s2p", cl::desc("Enable experimental pablo transposition."));
    55 static cl::opt<bool> CC_Multiplexing("CC-multiplexing", cl::desc("Enable CC multiplexing."), cl::init(true));
     57static cl::opt<bool> CC_Multiplexing("CC-multiplexing", cl::desc("Enable CC multiplexing."), cl::init(false));
    5658
    5759namespace grep {
     
    147149        REs[i] = resolveModesAndExternalSymbols(REs[i]);
    148150        REs[i] = excludeUnicodeLineBreak(REs[i]);
     151        //re::Name * unicodeLB = re::makeName("UTF8_LB", re::Name::Type::Unicode);
     152        //unicodeLB->setDefinition(re::makeCC(0x0A));
     153        //REs[i] = resolveAnchors(REs[i], unicodeLB);
    149154        REs[i] = regular_expression_passes(REs[i]);
     155        bool hasGCB = hasGraphemeClusterBoundary(REs[i]);
     156        StreamSetBuffer * GCB_stream = nullptr;
     157        std::vector<std::string> externalStreamNames = std::vector<std::string>{"UTF8_LB", "UTF8_nonfinal"};
     158        std::vector<StreamSetBuffer *> icgrepInputSets = {BasisBits, LineBreakStream, RequiredStreams};
     159        if (hasGCB) {
     160            GCB_stream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
     161            kernel::Kernel * gcbK = mGrepDriver->addKernelInstance<kernel::GraphemeClusterBreakKernel>(idb);
     162            mGrepDriver->makeKernelCall(gcbK, {BasisBits, RequiredStreams}, {GCB_stream});
     163            externalStreamNames.push_back("\\b{g}");
     164            icgrepInputSets.push_back(GCB_stream);
     165        }
    150166        if (CC_Multiplexing) {
    151             const auto UnicodeSets = re::collectUnicodeSets(REs[i]);
     167            const auto UnicodeSets = re::collectUnicodeSets(REs[i], std::set<re::Name *>({re::makeZeroWidth("\\b{g}")}));
    152168            StreamSetBuffer * const MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
    153169            if (UnicodeSets.size() <= 1) {
    154                 kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i]);
    155                 mGrepDriver->makeKernelCall(icgrepK, {BasisBits, LineBreakStream, RequiredStreams}, {MatchResults});
     170                kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i], externalStreamNames);
     171                mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
    156172                MatchResultsBufs[i] = MatchResults;
    157173            } else {
     
    165181//                kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis), true);
    166182//                mGrepDriver->makeKernelCall(ccK, {ByteStream}, {CharClasses});
    167                 kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i], std::vector<cc::Alphabet *>{mpx.get()});
    168                 mGrepDriver->makeKernelCall(icgrepK, {BasisBits, LineBreakStream, RequiredStreams, CharClasses}, {MatchResults});
     183                kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i], externalStreamNames, std::vector<cc::Alphabet *>{mpx.get()});
     184                icgrepInputSets.push_back(CharClasses);
     185                mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
    169186                MatchResultsBufs[i] = MatchResults;
    170187            }
    171188        } else {
    172189            StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
    173             kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i]);
    174             mGrepDriver->makeKernelCall(icgrepK, {BasisBits, LineBreakStream, RequiredStreams}, {MatchResults});
     190            kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i], externalStreamNames);
     191            mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
    175192            MatchResultsBufs[i] = MatchResults;
    176193        }
  • icGREP/icgrep-devel/icgrep/kernels/grep_kernel.cpp

    r5872 r5881  
    137137    Var * const required = getOutputStreamVar("nonFinal");
    138138    pb.createAssign(pb.createExtract(required, pb.getInteger(0)), nonFinal);
    139     pb.createAssign(pb.createExtract(getOutputStreamVar("linebreak"), pb.getInteger(0)), pb.createOr(LineBreak, unterminatedLineAtEOF, "EOL"));
     139    pb.createAssign(pb.createExtract(getOutputStreamVar("UnicodeLB"), pb.getInteger(0)), pb.createOr(LineBreak, unterminatedLineAtEOF, "EOL"));
    140140}
    141141
     
    146146// output
    147147{Binding{kb->getStreamSetTy(1), "nonFinal", FixedRate()},
    148  Binding{kb->getStreamSetTy(1), "linebreak", FixedRate(), Add1()}}) {
     148 Binding{kb->getStreamSetTy(1), "UnicodeLB", FixedRate(), Add1()}}) {
    149149
    150150}
     
    194194// Helper to compute stream set inputs to pass into PabloKernel constructor.
    195195inline std::vector<Binding> icGrepInputs(const std::unique_ptr<kernel::KernelBuilder> & b,
     196                                         const std::vector<std::string> & externals,
    196197                                         const std::vector<cc::Alphabet *> & alphabets) {
    197198    std::vector<Binding> streamSetInputs = {
    198199        Binding{b->getStreamSetTy(8), "basis"},
    199         Binding{b->getStreamSetTy(1, 1), "linebreak"},
    200         Binding{b->getStreamSetTy(1, 1), "required"}
    201200    };
     201    for (auto & e : externals) {
     202        streamSetInputs.push_back(Binding{b->getStreamSetTy(1, 1), e});
     203    }
    202204    for (const auto & alphabet : alphabets) {
    203205        unsigned basis_size = cast<cc::MultiplexedAlphabet>(alphabet)->getMultiplexedCCs().size();
     
    207209}
    208210
    209 ICGrepKernel::ICGrepKernel(const std::unique_ptr<kernel::KernelBuilder> & b, RE * const re, std::vector<cc::Alphabet *> alphabets)
     211ICGrepKernel::ICGrepKernel(const std::unique_ptr<kernel::KernelBuilder> & b, RE * const re, std::vector<std::string> externals, std::vector<cc::Alphabet *> alphabets)
    210212: ICGrepSignature(re)
    211213, PabloKernel(b, "ic" + sha1sum(mSignature),
    212214// inputs
    213 icGrepInputs(b, alphabets),
     215icGrepInputs(b, externals, alphabets),
    214216// output
    215217{Binding{b->getStreamSetTy(1, 1), "matches", FixedRate(), Add1()}})
     218, mExternals(externals)
    216219, mAlphabets(alphabets) {
    217 
    218220}
    219221
     
    226228    cc::Parabix_CC_Compiler ccc(getEntryScope(), getInputStreamSet("basis"));
    227229    RE_Compiler re_compiler(this, ccc);
     230    for (auto & e : mExternals) {
     231        re_compiler.addPrecompiled(e, pb.createExtract(getInputStreamVar(e), pb.getInteger(0)));
     232    }
    228233    for (auto a : mAlphabets) {
    229234        auto mpx_basis = getInputStreamSet(a->getName() + "_basis");
  • icGREP/icgrep-devel/icgrep/kernels/grep_kernel.h

    r5816 r5881  
    4343class ICGrepKernel : public ICGrepSignature, public pablo::PabloKernel {
    4444public:
    45     ICGrepKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, re::RE * const re_ast, std::vector<cc::Alphabet *> alphabets = {});
     45    ICGrepKernel(const std::unique_ptr<kernel::KernelBuilder> & iBuilder, re::RE * const re_ast, std::vector<std::string> externals, std::vector<cc::Alphabet *> alphabets = {});
    4646    std::string makeSignature(const std::unique_ptr<kernel::KernelBuilder> & iBuilder) override;
    4747    bool isCachable() const override { return true; }
    4848protected:
    4949    void generatePabloMethod() override;
     50    std::vector<std::string> mExternals;
    5051    std::vector<cc::Alphabet *> mAlphabets;
    5152};
  • icGREP/icgrep-devel/icgrep/re/grapheme_clusters.cpp

    r5880 r5881  
    130130#define Ahead(x) makeLookAheadAssertion(x)
    131131
    132 void generateGraphemeClusterBoundaryRule(Name * const &property) {
     132RE * generateGraphemeClusterBoundaryRule() {
    133133    // 3.1.1 Grapheme Cluster Boundary Rules
    134134   
     
    171171    //Name * gcb = makeName("gcb", Name::Type::UnicodeProperty);
    172172    RE * gcb = makeAlt({GCB_1_5, makeDiff(GCB_10, GCB_6_9b)});
    173     gcb = resolveUnicodeProperties(gcb);
    174     property->setDefinition(gcb);
     173    return gcb;
    175174}
    176175
  • icGREP/icgrep-devel/icgrep/re/grapheme_clusters.h

    r5880 r5881  
    1111RE * resolveGraphemeMode(RE * re, bool inGraphemeMode);
    1212
    13 void generateGraphemeClusterBoundaryRule(Name * const &property);
     13RE * generateGraphemeClusterBoundaryRule();
    1414
    1515}
  • icGREP/icgrep-devel/icgrep/re/re_collect_unicodesets.cpp

    r5819 r5881  
    2323    std::vector<const CC *> UnicodeSets;
    2424    boost::container::flat_set<const RE *>  Visited;
     25    std::set<Name *> ignoredExternals;
    2526};
    2627
     
    3435            }
    3536        } else if (isa<Name>(re)) {
     37            if (ignoredExternals.find(cast<Name>(re)) != ignoredExternals.end()) return;
    3638            auto def = cast<Name>(re)->getDefinition();
    3739            if (def != nullptr)
     
    5961}
    6062
    61 std::vector<const CC *> collectUnicodeSets(RE * const re) {
     63std::vector<const CC *> collectUnicodeSets(RE * const re, std::set<Name *> external) {
    6264    SetCollector collector;
     65    collector.ignoredExternals = external;
    6366    collector.collect(re);
    6467    return collector.UnicodeSets;
  • icGREP/icgrep-devel/icgrep/re/re_collect_unicodesets.h

    r5748 r5881  
    33
    44#include <vector>
     5#include <set>
    56
    67namespace re {
     
    89    class RE;
    910    class CC;
     11    class Name;
    1012
    11     std::vector<const CC *> collectUnicodeSets(RE * const re);
     13    std::vector<const CC *> collectUnicodeSets(RE * const re, std::set<Name *> external = {});
    1214
    1315}
  • icGREP/icgrep-devel/icgrep/re/re_compiler.cpp

    r5880 r5881  
    3030#include <cc/cc_compiler.h>
    3131#include "pablo/builder.hpp"        // for PabloBuilder
    32 #include <IR_Gen/idisa_target.h>    // for AVX2_available
    3332#include <llvm/ADT/STLExtras.h> // for make_unique
    3433#include <llvm/Support/raw_ostream.h>
     
    5049    mAlphabets.push_back(a);
    5150    mAlphabetCompilers.push_back(make_unique<cc::Parabix_CC_Compiler>(mEntryScope, basis_set));
     51}
     52
     53void RE_Compiler::addPrecompiled(std::string precompiledName, PabloAST * precompiledStream) {
     54    PabloBuilder pb(mEntryScope);
     55    mExternalNameMap.insert(std::make_pair(precompiledName, precompiledStream));
     56    if (precompiledName == "UTF8_nonfinal") {
     57        mNonFinal = precompiledStream;
     58        mFinal = pb.createNot(precompiledStream);
     59    }
     60    if (precompiledName == "UTF8_LB") {
     61        mLineBreak = precompiledStream;
     62    }
    5263}
    5364
     
    156167        return nameMarker;
    157168    } else if (name->getType() == Name::Type::ZeroWidth) {
     169        auto f = mExternalNameMap.find(nameString);
     170        if (f != mExternalNameMap.end()) {
     171            MarkerType z = makeMarker(FinalPostPositionUnit, f->second);
     172            AlignMarkers(marker, z, pb);
     173            PabloAST * ze = markerVar(z);
     174            return makeMarker(markerPos(marker), pb.createAnd(markerVar(marker), ze, "zerowidth"));
     175        }
    158176        RE * zerowidth = name->getDefinition();
    159177        MarkerType zero = compile(zerowidth, pb);
     
    179197
    180198MarkerType RE_Compiler::compileSeq(Seq * const seq, MarkerType marker, PabloBuilder & pb) {
     199
    181200    // if-hierarchies are not inserted within unbounded repetitions
    182201    if (mStarDepth > 0) {
     
    589608, mStarDepth(0)
    590609, mCompiledName(&mBaseMap) {
    591     PabloBuilder mPB(mEntryScope);
    592     Var * const linebreak = kernel->getInputStreamVar("linebreak");
    593     mLineBreak = mPB.createExtract(linebreak, 0);
    594     Var * const required = kernel->getInputStreamVar("required");
    595     mNonFinal = mPB.createExtract(required, 0);
    596     mFinal = mPB.createNot(mNonFinal);
     610    PabloBuilder pb(mEntryScope);
     611    mLineBreak = pb.createZeroes();  // default so "^/$" matches start/end of text only
     612    mNonFinal = pb.createZeroes();
     613    mFinal = pb.createOnes();
    597614}
    598615
  • icGREP/icgrep-devel/icgrep/re/re_compiler.h

    r5872 r5881  
    6868
    6969    void addAlphabet(cc::Alphabet * a, std::vector<pablo::PabloAST* > basis_set);
     70   
     71    void addPrecompiled(std::string precompiledName, pablo::PabloAST * precompiledStream);
    7072
    7173    pablo::PabloAST * compile(RE * re, pablo::PabloAST * const initialCursors = nullptr);
     
    143145    NameMap *                                       mCompiledName;
    144146    NameMap                                         mBaseMap;
     147    std::map<std::string, pablo::PabloAST *>        mExternalNameMap;
     148
    145149};
    146150
  • icGREP/icgrep-devel/icgrep/re/re_name_gather.cpp

    r5805 r5881  
    1616#include <UCD/resolve_properties.h>
    1717#include <boost/container/flat_set.hpp>
    18 #include <sstream>
     18#include <llvm/Support/Casting.h>
     19#include <llvm/Support/raw_ostream.h>
    1920
    20 using NameMap = UCD::UCDCompiler::NameMap;
    21 
    22 using namespace boost::container;
    2321using namespace llvm;
    24 
    2522namespace re {
    26 
    2723struct NameGather {
    2824
     
    3026        assert ("RE object cannot be null!" && re);
    3127        if (isa<Name>(re)) {
    32             if (mVisited.insert(cast<Name>(re)).second) {
    33                 RE * defn = cast<Name>(re)->getDefinition();
    34                 if (isa<CC>(defn)) {
    35                     if (cast<CC>(defn)->getAlphabet() == &cc::Unicode)
    36                         mNameMap.emplace(cast<Name>(re), nullptr);
    37                 } else {
    38                     gather(defn);
    39                 }
     28            RE * defn = cast<Name>(re)->getDefinition();
     29            if (defn == nullptr) {
     30                mNameSet.emplace(cast<Name>(re));
    4031            }
    4132        } else if (isa<Seq>(re)) {
     
    6455        }
    6556    }
    66     NameGather(NameMap & nameMap)
    67     : mNameMap(nameMap) {
     57    NameGather(std::set<Name *> & nameSet)
     58    : mNameSet(nameSet) {
    6859
    6960    }
     
    7162private:
    7263
    73     NameMap &               mNameMap;
    74     flat_set<Name *>        mVisited;
     64    std::set<Name *> &               mNameSet;
    7565
    7666};
    7767   
    78 NameMap gatherNames(RE *& re) {
    79     NameMap nameMap;
    80     NameGather nameGather(nameMap);
     68std::set<Name *> gatherExternalNames(RE * re) {
     69    std::set<Name *> nameSet;
     70   
     71    NameGather nameGather(nameSet);
    8172    nameGather.gather(re);
    82     return nameMap;
     73    return nameSet;
    8374   
    8475}
  • icGREP/icgrep-devel/icgrep/re/re_name_gather.h

    r5805 r5881  
    22#define RE_NAME_GATHER_H
    33
    4 #include <UCD/ucd_compiler.hpp>
     4#include <string>
     5#include <set>
    56
    67namespace re {
    78
    8     class RE;
    9     class Name;
     9    class RE; class Name;
    1010
    11     UCD::UCDCompiler::NameMap gatherNames(RE * &re);
     11    std::set<Name *> gatherExternalNames(RE * re);
    1212
    1313}
  • icGREP/icgrep-devel/icgrep/toolchain/grep_pipeline.cpp

    r5868 r5881  
    7979   
    8080    StreamSetBuffer * MatchResults = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
    81     kernel::Kernel * icgrepK = pxDriver.addKernelInstance<kernel::ICGrepKernel>(idb, pattern);
     81    kernel::Kernel * icgrepK = pxDriver.addKernelInstance<kernel::ICGrepKernel>(idb, pattern, std::vector<std::string>{"UTF8_LB", "UTF8_nonfinal"});
    8282    pxDriver.makeKernelCall(icgrepK, {BasisBits, LineBreakStream, RequiredStreams}, {MatchResults});
    8383   
Note: See TracChangeset for help on using the changeset viewer.