Changeset 5795


Ignore:
Timestamp:
Dec 21, 2017, 3:22:41 PM (12 months ago)
Author:
cameron
Message:

Adding Alphabet to CCs: initial check-in

Location:
icGREP/icgrep-devel/icgrep
Files:
9 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/cc/alphabet.cpp

    r5620 r5795  
    77
    88#include "alphabet.h"
     9#include <llvm/Support/ErrorHandling.h>
    910
     11namespace cc {
     12   
     13UnicodeMappableAlphabet::UnicodeMappableAlphabet(std::string alphabetName,
     14                                                 unsigned unicodeCommon,
     15                                                 std::vector <UCD::codepoint_t> aboveCommon) :
     16    Alphabet(alphabetName), mUnicodeCommon(unicodeCommon), mAboveCommon(aboveCommon) {}
    1017
    11 // Default implementation for simple Unicode subsets.  The codepoint value
    12 // of the nth character is just the given value n, if it is in range.
    13 
    14 UCD::codepoint_t Alphabet::toUnicode(const unsigned n) const {
     18UCD::codepoint_t UnicodeMappableAlphabet::toUnicode(const unsigned n) const {
    1519    UCD::codepoint_t cp = n;
    16     if (mCharSet.contains(cp)) return cp;
    17     throw std::runtime_error("toUnicode: n too large.");
    18 }   
     20    if (n < mUnicodeCommon) return cp;
     21    assert(n < mUnicodeCommon + mAboveCommon.size());
     22    return mAboveCommon[n - mUnicodeCommon];
     23}
    1924 
    20 // Default implementation for simple Unicode subsets.  The ord value
    21 // of a Unicode codepoint is just the given codepoint, if it is in range.
    22 
    23 unsigned Alphabet::fromUnicode(const UCD::codepoint_t codepoint) const {
    24     if (mCharSet.contains(codepoint)) return codepoint;
    25     throw std::runtime_error("fromUnicode: codepoint not found in alphabet.");
     25unsigned UnicodeMappableAlphabet::fromUnicode(const UCD::codepoint_t codepoint) const {
     26    unsigned n = codepoint;
     27    if (n < mUnicodeCommon) return n;
     28    for (unsigned i = 0; i < mAboveCommon.size(); i++) {
     29        if (mAboveCommon[i] == codepoint) return mUnicodeCommon + i;
     30    }
     31    llvm::report_fatal_error("fromUnicode: codepoint not found in alphabet.");
    2632}
    2733
     34CodeUnitAlphabet::CodeUnitAlphabet(std::string alphabetName, uint8_t bits) :
     35    Alphabet(alphabetName), mCodeUnitBits(bits) {}
    2836
    29 template <class uint_t> ExtendedASCII<uint_t>::ExtendedASCII(std::string alphabetName, const uint_t (& extendedTable)[128]) {
    30     mAlphabetName = alphabetName;
    31     mExtendedCharacterTable = extendedTable;
    32     mCharSet = UCD::UnicodeSet(0, 127);
    33     for (unsigned i = 0; i < 128; i++) {
    34         mCharSet.insert(extendedTable[i]);
    35     }
    36 }   
    37 
    38 template <class uint_t> UCD::codepoint_t ExtendedASCII<uint_t>::toUnicode(const unsigned n) const {
    39     //  The first 128 characters are just ASCII.
    40     if (n < 128) return n;
    41     if (n < 256) return mExtendedCharacterTable[n-128];
    42     throw std::runtime_error("toUnicode: n too large.");
    43 }   
    44 
    45 template <class uint_t> unsigned ExtendedASCII<uint_t>::fromUnicode(const UCD::codepoint_t codepoint) const {
    46     if (codepoint < 128) return codepoint;
    47     for (unsigned i = 0; i < 128; i++) {
    48         if (mExtendedCharacterTable[i] == codepoint) return i + 128;
    49     }
    50     throw std::runtime_error("fromUnicode: codepoint not found in alphabet.");
    5137}
    52 
  • icGREP/icgrep-devel/icgrep/cc/alphabet.h

    r5620 r5795  
    1010#include <string>
    1111#include <UCD/unicode_set.h>
     12#include <vector>
    1213
     14namespace cc {
    1315//
    1416// An Alphabet is the universe of characters used to form strings in
     
    1921class Alphabet {
    2022public:
     23    const std::string & getName() const { return mAlphabetName;}
     24protected:
     25    Alphabet(std::string alphabetName) : mAlphabetName(alphabetName) {}
     26private:
     27    std::string mAlphabetName;
     28};
    2129
    22     //  Alphabets may simply be a subset of Unicode characters including all
    23     //  characters up to and including a given maximum Unicode codepoint.
     30class UnicodeMappableAlphabet : public Alphabet {
     31public:
     32    //  An alphabet may be formed from a subset of Unicode characters, together
     33    //  with a mapping to and from Unicode.  The mapping has two parts:
     34    //  (1) unicodeCommon, a count such that every character code in the range
     35    //  0..unicodeCommon - 1 has the same numeric value as its Unicode codepoint, and
     36    //  (2) aboveCommon, a vector giving the Unicode codepoints of the consecutive
     37    //  character codes (if any) from unicodeCommon upward.
    2438   
    25     Alphabet(std::string alphabetName, UCD::codepoint_t maxChar) :
    26         mAlphabetName(alphabetName), mCharSet(UCD::UnicodeSet(0, maxChar)) {}
    27        
    28     const std::string & getName() const { return mAlphabetName;}
    29    
    30     const UCD::UnicodeSet & getSet() const { return mCharSet;}
     39    UnicodeMappableAlphabet(std::string alphabetName,
     40                            unsigned unicodeCommon,
     41                            std::vector <UCD::codepoint_t> aboveCommon);
    3142   
    3243    //  The Unicode codepoint of the nth character (the character whose alphabet code is n).
    33     virtual UCD::codepoint_t toUnicode(const unsigned n) const;
     44    UCD::codepoint_t toUnicode(const unsigned n) const;
    3445   
    3546    //  The ordinal position of the character whose Unicode codepoint value is ucp.
    36     virtual unsigned fromUnicode(const UCD::codepoint_t ucp) const;
     47    unsigned fromUnicode(const UCD::codepoint_t ucp) const;
    3748
    3849protected:
    39     std::string mAlphabetName;
    40     UCD::UnicodeSet mCharSet;
     50    UCD::codepoint_t mCharSet;
     51    UCD::codepoint_t mUnicodeCommon;
     52    std::vector <UCD::codepoint_t> mAboveCommon;
    4153};
    4254
    43 
    44 Alphabet Unicode("Unicode", UCD::UNICODE_MAX);
    45 
    46 Alphabet ASCII("ASCII", 0x7F);
    47 
    48 Alphabet ISO_Latin1("ISO_Latin1", 0xFF);
    49 
    50 
    51 // Extended ASCII alphabets can be defined with a table of 128 entries defining
    52 // the codepoints for codes in the 0x80 to 0xFF range.
    53 //
    54 // ExtendedASCII<uint16_t> uses compact tables of 16-bit entries, while
    55 // ExtendedASCII<uint32_t> uses tables of 32-bit entries, necessary if any
    56 // codepoint is above 0xFFFF.
    57 
    58 template <class uint_t> class ExtendedASCII : public Alphabet {
     55class CodeUnitAlphabet : public Alphabet {
    5956public:
    60     ExtendedASCII(std::string alphabetName, const uint_t (& extendedTable)[128]);
    61     UCD::codepoint_t toUnicode(const unsigned n) const final;
    62     unsigned fromUnicode(const UCD::codepoint_t ucp) const final;
     57    CodeUnitAlphabet(std::string alphabetName, uint8_t codeUnitBits);
     58    uint8_t getCodeUnitBitWidth() { return mCodeUnitBits;}
     59   
    6360private:
    64     const uint_t (& mExtendedCharacterTable)[128];
     61    uint8_t mCodeUnitBits;
    6562};
    6663
     64//  Some important alphabets are predefined.
     65
     66const static UnicodeMappableAlphabet Unicode("Unicode", UCD::UNICODE_MAX, {});
     67
     68const static UnicodeMappableAlphabet ASCII("ASCII", 0x7F, {});
     69
     70const static UnicodeMappableAlphabet ISO_Latin1("ISO_Latin1", 0xFF, {});
     71
     72const static CodeUnitAlphabet Byte("Byte", 8);
     73
     74}
    6775
    6876#endif // ALPHABET_H
  • icGREP/icgrep-devel/icgrep/cc/multiplex_CCs.cpp

    r5748 r5795  
    88#include <re/re_cc.h>
    99#include "boost/dynamic_bitset.hpp"
     10#include <cc/multiplex_CCs.h>
     11
     12namespace cc {
    1013
    1114//
     
    104107    }
    105108}
     109
     110
     111
     112MultiplexedAlphabet::MultiplexedAlphabet(std::string alphabetName, const std::vector<const re::CC *> CCs)
     113    : Alphabet(alphabetName) {
     114        cc::doMultiplexCCs(CCs, mExclusiveSetIDs, mMultiplexedCCs);
     115}
     116
     117std::vector<std::vector<unsigned>> MultiplexedAlphabet::getExclusiveSetIDs() {
     118    return mExclusiveSetIDs;
     119}
     120
     121std::vector<re::CC *> MultiplexedAlphabet::getMultiplexedCCs() {
     122    return mMultiplexedCCs;
     123}
     124}
     125
  • icGREP/icgrep-devel/icgrep/cc/multiplex_CCs.h

    r5748 r5795  
    77
    88#include <vector>
     9#include <cc/alphabet.h>
    910
    1011namespace re { class CC; }
    1112
     13namespace cc {
    1214
    13 void doMultiplexCCs(const std::vector<const re::CC *> & CCs,
    14                     std::vector<std::vector<unsigned>> & exclusiveSetIDs,
    15                     std::vector<re::CC *> & multiplexedCCs);
     15class MultiplexedAlphabet : public Alphabet {
     16public:
     17    MultiplexedAlphabet(std::string alphabetName, const std::vector<const re::CC *> CCs);
     18   
     19    std::vector<std::vector<unsigned>> getExclusiveSetIDs();
     20   
     21    std::vector<re::CC *> getMultiplexedCCs();
     22private:
     23    std::vector<std::vector<unsigned>> mExclusiveSetIDs;
     24    std::vector<re::CC *> mMultiplexedCCs;
     25};
     26}
     27
    1628
    1729#endif
  • icGREP/icgrep-devel/icgrep/grep_engine.cpp

    r5793 r5795  
    4747using namespace parabix;
    4848using namespace llvm;
     49using namespace cc;
     50
    4951static cl::opt<int> Threads("t", cl::desc("Total number of threads."), cl::init(2));
    5052
     
    125127#define USE_MULTIPLEX_CC
    126128#ifdef USE_MULTIPLEX_CC
    127         std::tie<re::RE*, std::vector<re::CC *>>(REs[i], charclasses[i]) = multiplexing_passes(REs[i]);
    128         const auto numOfCharacterClasses = charclasses[i].size();
     129       
     130        REs[i] = multiplexing_prepasses(REs[i]);
     131        const std::vector<const re::CC *> UnicodeSets = re::collectUnicodeSets(REs[i]);
     132        std::unique_ptr<cc::MultiplexedAlphabet> mpx = make_unique<MultiplexedAlphabet>("mpx", UnicodeSets);
     133        REs[i] = multiplex(REs[i], UnicodeSets, mpx->getExclusiveSetIDs());
     134        std::vector<re::CC *> mpx_basis = mpx->getMultiplexedCCs();
     135        auto numOfCharacterClasses = mpx_basis.size();
    129136        StreamSetBuffer * CharClasses = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), segmentSize * bufferSegments);
    130         kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(charclasses[i]));
     137        kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis));
    131138        mGrepDriver->makeKernelCall(ccK, {BasisBits}, {CharClasses});
    132139        StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
     
    409416
    410417bool GrepEngine::searchAllFiles() {
    411     const unsigned numOfThreads = Threads; // <- convert the command line value into an integer to allow stack allocation
    412     pthread_t threads[numOfThreads];
     418    const unsigned numOfThreads = std::min(static_cast<unsigned>(Threads), static_cast<unsigned>(inputFiles.size()));
     419    std::vector<pthread_t> threads(numOfThreads);
    413420
    414421    for(unsigned long i = 1; i < numOfThreads; ++i) {
     
    470477        pthread_exit(nullptr);
    471478    } else {
    472         return nullptr;
    473     }
    474 }
    475 
    476 }
     479        // Always perform one final cache cleanup step.
     480        mGrepDriver->performIncrementalCacheCleanupStep();
     481    }
     482}
     483
     484}
  • icGREP/icgrep-devel/icgrep/re/re_cc.cpp

    r5781 r5795  
    3333}
    3434   
    35 CC::CC()
     35CC::CC(const cc::Alphabet * alphabet)
    3636: RE(ClassTypeId::CC)
    37 , UnicodeSet() {
     37, UnicodeSet()
     38, mAlphabet(alphabet) {}
    3839
    39 }
    4040
    4141CC::CC(const CC & cc)
    4242: RE(ClassTypeId::CC)
    43 , UCD::UnicodeSet(cc) {
     43, UCD::UnicodeSet(cc)
     44, mAlphabet(cc.getAlphabet()) {}
    4445
    45 }
    4646
    47 CC::CC(const codepoint_t codepoint)
     47CC::CC(const codepoint_t codepoint, const cc::Alphabet * alphabet)
    4848: RE(ClassTypeId::CC)
    49 , UCD::UnicodeSet(codepoint) {
     49, UCD::UnicodeSet(codepoint)
     50, mAlphabet(alphabet) {}
    5051
    51 }
    5252
    53 CC::CC(const codepoint_t lo_codepoint, const codepoint_t hi_codepoint)
     53CC::CC(const codepoint_t lo_codepoint, const codepoint_t hi_codepoint, const cc::Alphabet * alphabet)
    5454: RE(ClassTypeId::CC)
    55 , UCD::UnicodeSet(lo_codepoint, hi_codepoint) {
     55, UCD::UnicodeSet(lo_codepoint, hi_codepoint)
     56, mAlphabet(alphabet) {}
    5657
    57 }
    5858
    5959CC::CC(const CC * cc1, const CC * cc2)
    6060: RE(ClassTypeId::CC)
    61 , UCD::UnicodeSet(std::move(*cc1 + *cc2)) {
     61, UCD::UnicodeSet(std::move(*cc1 + *cc2))
     62, mAlphabet(cc1->getAlphabet()) {
     63    assert (cc1->getAlphabet() == cc2->getAlphabet());
     64}
     65
     66
     67CC::CC(const UCD::UnicodeSet && set, const cc::Alphabet * alphabet)
     68: RE(ClassTypeId::CC)
     69, UCD::UnicodeSet(std::move(set))
     70, mAlphabet(alphabet) {}
     71
     72
     73CC::CC(std::initializer_list<interval_t>::iterator begin, std::initializer_list<interval_t>::iterator end, const cc::Alphabet * alphabet)
     74: RE(ClassTypeId::CC)
     75, UCD::UnicodeSet(begin, end)
     76, mAlphabet(alphabet) {}
     77
     78
     79CC::CC(const std::vector<interval_t>::iterator begin, const std::vector<interval_t>::iterator end, const cc::Alphabet * alphabet)
     80: RE(ClassTypeId::CC)
     81, UCD::UnicodeSet(begin, end)
     82, mAlphabet(alphabet) {}
    6283
    6384}
    64 
    65 CC::CC(const UCD::UnicodeSet && set)
    66 : RE(ClassTypeId::CC)
    67 , UCD::UnicodeSet(std::move(set)) {
    68 
    69 }
    70 
    71 CC::CC(std::initializer_list<interval_t>::iterator begin, std::initializer_list<interval_t>::iterator end)
    72 : RE(ClassTypeId::CC)
    73 , UCD::UnicodeSet(begin, end)
    74 {
    75 
    76 }
    77 
    78 CC::CC(const std::vector<interval_t>::iterator begin, const std::vector<interval_t>::iterator end)
    79 : RE(ClassTypeId::CC)
    80 , UCD::UnicodeSet(begin, end)
    81 {
    82 
    83 }
    84 
    85 }
  • icGREP/icgrep-devel/icgrep/re/re_cc.h

    r5781 r5795  
    1010#include "re_re.h"
    1111#include <UCD/unicode_set.h>
     12#include <cc/alphabet.h>
    1213
    1314namespace re {
     
    2829    }
    2930
     31    const cc::Alphabet * getAlphabet() const { return mAlphabet;}
    3032
    3133    std::string canonicalName(const CC_type type) const;
     
    4244
    4345protected:
    44     friend CC * makeCC();
    45     friend CC * makeCC(const codepoint_t codepoint);
    46     friend CC * makeCC(const codepoint_t lo, const codepoint_t hi);
     46    friend CC * makeCC(const cc::Alphabet * alphabet);
     47    friend CC * makeCC(const codepoint_t codepoint, const cc::Alphabet * alphabet);
     48    friend CC * makeCC(const codepoint_t lo, const codepoint_t hi, const cc::Alphabet * alphabet);
    4749    friend CC * makeCC(const CC * cc1, const CC * cc2);
    48     friend CC * makeCC(std::initializer_list<interval_t> list);
    49     friend CC * makeCC(std::vector<interval_t> && list);
    50     friend CC * makeCC(UCD::UnicodeSet && set);
     50    friend CC * makeCC(std::initializer_list<interval_t> list, const cc::Alphabet * alphabet);
     51    friend CC * makeCC(std::vector<interval_t> && list, const cc::Alphabet * alphabet);
     52    friend CC * makeCC(UCD::UnicodeSet && set, const cc::Alphabet * alphabet);
    5153    friend CC * subtractCC(const CC * a, const CC * b);
    5254    friend CC * intersectCC(const CC * a, const CC * b);
    5355
    54     CC();
     56    CC(const cc::Alphabet * alphabet);
    5557
    5658    CC(const CC & cc);
    5759
    58     CC(const codepoint_t codepoint);
     60    CC(const codepoint_t codepoint, const cc::Alphabet * alphabet);
    5961
    60     explicit CC(const codepoint_t lo_codepoint, const codepoint_t hi_codepoint);
     62    explicit CC(const codepoint_t lo_codepoint, const codepoint_t hi_codepoint, const cc::Alphabet * alphabet);
    6163
    6264    explicit CC(const CC * cc1, const CC * cc2);
    6365
    64     CC(const UCD::UnicodeSet && set);
     66    CC(const UCD::UnicodeSet && set, const cc::Alphabet * alphabet);
    6567
    66     CC(std::initializer_list<interval_t>::iterator begin, std::initializer_list<interval_t>::iterator end);
     68    CC(std::initializer_list<interval_t>::iterator begin, std::initializer_list<interval_t>::iterator end, const cc::Alphabet * alphabet);
    6769
    68     CC(const std::vector<interval_t>::iterator begin, const std::vector<interval_t>::iterator end);
     70    CC(const std::vector<interval_t>::iterator begin, const std::vector<interval_t>::iterator end, const cc::Alphabet * alphabet);
     71private:
     72    const cc::Alphabet * mAlphabet;
     73   
    6974
    7075};
     
    100105 */
    101106
    102 inline CC * makeCC() {
    103     return new CC();
     107inline CC * makeCC(const cc::Alphabet * alphabet = &cc::Unicode) {
     108    return new CC(alphabet);
    104109}
    105110
    106 inline CC * makeCC(const codepoint_t codepoint) {
    107     return new CC(codepoint);
     111    inline CC * makeCC(const codepoint_t codepoint, const cc::Alphabet * alphabet = &cc::Unicode) {
     112    return new CC(codepoint, alphabet);
    108113}
    109114
    110 inline CC * makeCC(const codepoint_t lo, const codepoint_t hi) {
    111     return new CC(lo, hi);
     115inline CC * makeCC(const codepoint_t lo, const codepoint_t hi, const cc::Alphabet * alphabet = &cc::Unicode) {
     116    return new CC(lo, hi, alphabet);
    112117}
    113118
     
    116121}
    117122
    118 inline CC * makeCC(std::initializer_list<interval_t> list) {
    119     return new CC(list.begin(), list.end());
     123inline CC * makeCC(std::initializer_list<interval_t> list, const cc::Alphabet * alphabet = &cc::Unicode) {
     124    return new CC(list.begin(), list.end(), alphabet);
    120125}
    121126
    122 inline CC * makeCC(std::vector<interval_t> && list) {
    123     return new CC(list.begin(), list.end());
     127inline CC * makeCC(std::vector<interval_t> && list, const cc::Alphabet * alphabet = &cc::Unicode) {
     128    return new CC(list.begin(), list.end(), alphabet);
    124129}
    125130
    126 inline CC * makeCC(UCD::UnicodeSet && set) {
    127     return new CC(std::move(set));
     131inline CC * makeCC(UCD::UnicodeSet && set, const cc::Alphabet * alphabet = &cc::Unicode) {
     132    return new CC(std::move(set), alphabet);
    128133}
    129134
    130135inline CC * subtractCC(const CC * a, const CC * b) {
    131     return new CC(*a - *b);
     136    //assert (a->getAlphabet() == b->getAlphabet());
     137    return new CC(*a - *b, a->getAlphabet());
    132138}
    133139
    134140inline CC * intersectCC(const CC * a, const CC * b) {
    135     return new CC(*a & *b);
     141    //assert (a->getAlphabet() == b->getAlphabet());
     142    return new CC(*a & *b, a->getAlphabet());
    136143}
    137144
  • icGREP/icgrep-devel/icgrep/re/re_toolchain.cpp

    r5792 r5795  
    2222#include <re/exclude_CC.h>
    2323#include <re/re_name_resolve.h>
    24 #include <re/re_collect_unicodesets.h>
    25 #include <re/re_multiplex.h>
    2624#include <re/grapheme_clusters.h>
    27 #include <cc/multiplex_CCs.h>
    2825#include <llvm/Support/raw_ostream.h>
    2926
     
    6562
    6663
    67 std::pair<RE *, std::vector<re::CC *>> multiplexing_passes(RE * r) {
     64RE * multiplexing_prepasses(RE * r) {
    6865    std::vector<re::CC *> charclasses;
    6966    if (PrintOptions.isSet(ShowAllREs) || PrintOptions.isSet(ShowREs)) {
     
    116113        errs() << "exclude_CC:\n" << Printer_RE::PrintRE(r) << '\n';
    117114    }
    118     const auto UnicodeSets = re::collectUnicodeSets(r);
    119     std::vector<std::vector<unsigned>> exclusiveSetIDs;
    120     doMultiplexCCs(UnicodeSets, exclusiveSetIDs, charclasses);
    121     r = multiplex(r, UnicodeSets, exclusiveSetIDs);
    122     if (PrintOptions.isSet(ShowAllREs)) {
    123         errs() << "multiplex:\n" << Printer_RE::PrintRE(r) << '\n';
    124     }
    125     return std::pair<RE *, std::vector<re::CC *>>(r, charclasses);
     115    return r;
    126116}
    127117
  • icGREP/icgrep-devel/icgrep/re/re_toolchain.h

    r5784 r5795  
    3232RE * regular_expression_passes(RE * re_ast);
    3333
    34 std::pair<RE *, std::vector<re::CC *>> multiplexing_passes(RE * r);
     34RE * multiplexing_prepasses(RE * r);
    3535
    3636pablo::PabloAST * re2pablo_compiler(pablo::PabloKernel * kernel, RE * re_ast);
Note: See TracChangeset for help on using the changeset viewer.