Changeset 4617


Ignore:
Timestamp:
Jun 25, 2015, 3:47:56 PM (4 years ago)
Author:
nmedfort
Message:

Upload of an untested (inactive) UCD compiler.

Location:
icGREP/icgrep-devel/icgrep
Files:
2 added
9 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/CMakeLists.txt

    r4602 r4617  
    6868add_library(RegExpADT re/re_re.cpp re/re_cc.cpp re/re_parser.cpp re/re_rep.cpp re/parsefailure.cpp re/re_nullable.cpp re/re_simplifier.cpp re/re_compiler.cpp re/printer_re.cpp re/re_diff.cpp re/re_intersect.cpp re/re_analysis.cpp)
    6969add_library(CCADT cc/cc_namemap.cpp cc/cc_compiler.cpp utf8_encoder.cpp UCD/CaseFolding_txt.cpp)
    70 add_library(UCDlib UCD/unicode_set.cpp UCD/precompiled_gc.cpp UCD/precompiled_sc.cpp UCD/precompiled_scx.cpp UCD/precompiled_blk.cpp UCD/precompiled_derivedcoreproperties.cpp UCD/precompiled_proplist.cpp)
     70add_library(UCDlib UCD/unicode_set.cpp UCD/ucd_compiler.cpp UCD/precompiled_gc.cpp UCD/precompiled_sc.cpp UCD/precompiled_scx.cpp UCD/precompiled_blk.cpp UCD/precompiled_derivedcoreproperties.cpp UCD/precompiled_proplist.cpp)
    7171
    7272
  • icGREP/icgrep-devel/icgrep/UCD/CaseFolding_txt.h

    r4615 r4617  
    1717    re::codepoint_t range_lo;
    1818    int fold_offset;
    19     std::vector<std::pair<re::codepoint_t, re::codepoint_t> > fold_pairs;
     19    std::vector<re::interval_t> fold_pairs;
    2020};
    2121
  • icGREP/icgrep-devel/icgrep/UCD/unicode_set.cpp

    r4616 r4617  
    2222#include <string>
    2323#include <iostream>
     24#include <include/simd-lib/builtins.hpp>
    2425
    2526const size_t QUAD_BITS = (8 * sizeof(bitquad_t));
     
    2829const bitquad_t FULL_QUAD_MASK = -1;
    2930
    30 
    31 inline const RunStructure & get_run(UnicodeSet::iterator i) {
     31inline const RunStructure & get_run(UnicodeSet::quad_iterator i) {
    3232    return std::get<0>(*i);
    3333}
    3434
    35 inline bitquad_t get_quad(UnicodeSet::iterator i) {
     35inline bitquad_t get_quad(UnicodeSet::quad_iterator i) {
    3636    return std::get<1>(*i);
    3737}
    3838
    39 const std::pair<RunStructure, bitquad_t> UnicodeSet::iterator::dereference() const {
     39const std::pair<RunStructure, bitquad_t> UnicodeSet::quad_iterator::dereference() const {
    4040    const RunStructure & t = mUnicodeSet.runs[mRunIndex];
    4141    RunStructure s(t.mType, t.mRunLength - mOffset);
     
    4444}
    4545
    46 void UnicodeSet::iterator::advance(unsigned n) {
     46void UnicodeSet::quad_iterator::advance(unsigned n) {
    4747    while (n > 0) {
    4848        const RunStructure & t = mUnicodeSet.runs[mRunIndex];
     
    107107
    108108void Dump_uset(const UnicodeSet & s) {
    109     for (auto it = s.begin(); it != s.end(); ++it) {
     109    for (auto it = s.quad_begin(); it != s.quad_end(); ++it) {
    110110        RunStructure this_run = get_run(it);
    111111        if (this_run.mType == Empty) {
     
    170170    assert(s.quad_count == UNICODE_QUAD_COUNT);
    171171    UnicodeSet iset;
    172     for (auto itr = s.begin(); itr != s.end(); ) {
     172    for (auto itr = s.quad_begin(); itr != s.quad_end(); ) {
    173173        auto run = get_run(itr);
    174174        if (run.mType == Empty) {
     
    193193    assert(s2.quad_count == UNICODE_QUAD_COUNT);
    194194    UnicodeSet iset;
    195     for (auto i1 = s1.begin(), i2 = s2.begin(); i1 != s1.end(); ) {
     195    for (auto i1 = s1.quad_begin(), i2 = s2.quad_begin(); i1 != s1.quad_end(); ) {
    196196        auto run1 = get_run(i1);
    197197        auto run2 = get_run(i2);
     
    232232    assert(s2.quad_count == UNICODE_QUAD_COUNT);
    233233    UnicodeSet iset;
    234     for (auto i1 = s1.begin(), i2 = s2.begin(); i1 != s1.end(); ) {
     234    for (auto i1 = s1.quad_begin(), i2 = s2.quad_begin(); i1 != s1.quad_end(); ) {
    235235        auto run1 = get_run(i1);
    236236        auto run2 = get_run(i2);
     
    271271    assert(s2.quad_count == UNICODE_QUAD_COUNT);
    272272    UnicodeSet iset;
    273     for (auto i1 = s1.begin(), i2 = s2.begin(); i1 != s1.end(); ) {
     273    for (auto i1 = s1.quad_begin(), i2 = s2.quad_begin(); i1 != s1.quad_end(); ) {
    274274        auto run1 = get_run(i1);
    275275        auto run2 = get_run(i2);
     
    310310    assert(s2.quad_count == UNICODE_QUAD_COUNT);
    311311    UnicodeSet iset;
    312     for (auto i1 = s1.begin(), i2 = s2.begin(); i1 != s1.end(); ) {
     312    for (auto i1 = s1.quad_begin(), i2 = s2.quad_begin(); i1 != s1.quad_end(); ) {
    313313        auto run1 = get_run(i1);
    314314        auto run2 = get_run(i2);
     
    360360    int quad_no = codepoint / QUAD_BITS;
    361361    bitquad_t quad_val = 1 << (codepoint & MOD_QUAD_BIT_MASK);
    362     return (get_quad(s.begin() + quad_no) & quad_val) != 0;
    363 }
     362    return (get_quad(s.quad_begin() + quad_no) & quad_val) != 0;
     363}
     364
     365void UnicodeSet::iterator::advance(unsigned n) {
     366
     367    while (n) {
     368
     369        const RunStructure & t = mUnicodeSet.runs[mRunIndex];
     370
     371        if (t.mType == Full) {
     372            mRight = mBaseCodePoint + t.mRunLength * QUAD_BITS;
     373            --n;
     374        }
     375
     376        if (t.mType != Mixed) {
     377            ++mRunIndex;
     378            mBaseCodePoint += t.mRunLength * QUAD_BITS;
     379            mQuadOffset = 0;
     380            mQuadRunIndex = 0;
     381            continue;
     382        }
     383
     384        while (mQuadRunIndex < t.mRunLength) {
     385
     386            const bitquad_t q = mUnicodeSet.quads[mQuadIndex];
     387            const bitquad_t m = q &(MOD_QUAD_BIT_MASK >> mQuadOffset);
     388
     389            // Nothing left in this quad to add; skip to the next one.
     390            if (m == 0) {
     391                mBaseCodePoint += QUAD_BITS;
     392                mLeft = mBaseCodePoint;
     393                ++mQuadIndex;
     394                if (++mQuadRunIndex == t.mRunLength) {
     395                    ++mRunIndex;
     396                }
     397                continue;
     398            }
     399
     400            mQuadOffset = scan_forward_zeroes(m);
     401            mLeft = mBaseCodePoint + mQuadOffset;
     402            break;
     403        }
     404
     405
     406        while (mQuadRunIndex < t.mRunLength) {
     407
     408            // Although the initial position was in this quad, the final position isn't
     409            // unless this is the last quad of this mixed run and the subsequent quad is
     410            // Empty.
     411
     412            const bitquad_t q = mUnicodeSet.quads[mQuadIndex];
     413            const bitquad_t m = ~q & (MOD_QUAD_BIT_MASK >> mQuadOffset);
     414            // Nothing left in this quad to add; skip to the next one.
     415            if (m == 0) {
     416                mBaseCodePoint += QUAD_BITS;
     417                mRight = mBaseCodePoint;
     418                ++mQuadIndex;
     419                if (++mQuadRunIndex == t.mRunLength) {
     420                    ++mRunIndex;
     421                }
     422                continue;
     423            }
     424
     425            mQuadOffset = scan_forward_zeroes(m);
     426            mRight = mBaseCodePoint + mQuadOffset;
     427            --n;
     428            break;
     429        }
     430    }
     431}
     432
     433
  • icGREP/icgrep-devel/icgrep/UCD/unicode_set.h

    r4616 r4617  
    33#include <stdint.h>
    44#include <vector>
     5#include <re/re_cc.h>
    56#include <boost/iterator/iterator_facade.hpp>
    67
     
    4546public:
    4647
    47     class iterator : public boost::iterator_facade<iterator, const UnicodeSet &, boost::forward_traversal_tag, std::pair<RunStructure, bitquad_t>> {
     48    class quad_iterator : public boost::iterator_facade<quad_iterator, const std::pair<RunStructure, bitquad_t>, boost::forward_traversal_tag> {
    4849        friend class UnicodeSet;
    49     public:
    50         iterator(const UnicodeSet & set, unsigned runIndex) : mUnicodeSet(set), mRunIndex(runIndex), mOffset(0), mQuadIndex(0) {}
     50        friend class boost::iterator_core_access;
    5151    protected:
    52         friend class boost::iterator_core_access;
     52        quad_iterator(const UnicodeSet & set, unsigned runIndex) : mUnicodeSet(set), mRunIndex(runIndex), mOffset(0), mQuadIndex(0) {}
     53
    5354        void advance(unsigned n);
     55
    5456        const std::pair<RunStructure, bitquad_t> dereference() const;
     57
    5558        inline void increment() {
    5659            advance(1);
    5760        }
    58         inline bool equal(iterator const& other) const {
    59             return (mRunIndex == other.mRunIndex) && (&(mUnicodeSet) == &(other.mUnicodeSet)) && (mQuadIndex == other.mQuadIndex) && (mOffset == other.mOffset);
     61
     62        inline bool equal(quad_iterator const& other) const {
     63            assert (&(mUnicodeSet) == &(other.mUnicodeSet));
     64            return (mRunIndex == other.mRunIndex) && (mQuadIndex == other.mQuadIndex) && (mOffset == other.mOffset);
    6065        }
    6166    private:
     
    6671    };
    6772
     73    class iterator : public boost::iterator_facade<iterator, re::interval_t, boost::forward_traversal_tag, re::interval_t> {
     74        friend class UnicodeSet;
     75        friend class boost::iterator_core_access;
     76    protected:
     77        iterator(const UnicodeSet & set, unsigned runIndex, unsigned quadIndex)
     78        : mUnicodeSet(set), mRunIndex(runIndex), mQuadIndex(quadIndex), mQuadOffset(0)
     79        , mQuadRunIndex(0), mBaseCodePoint(0), mLeft(0), mRight(0)
     80        {
     81
     82        }
     83
     84        void advance(unsigned n);
     85
     86        re::interval_t dereference() const {
     87            return std::make_pair(mLeft, mRight);
     88        }
     89
     90        inline void increment() {
     91            advance(1);
     92        }
     93
     94        inline bool equal(iterator const & other) const {
     95            assert (&(mUnicodeSet) == &(other.mUnicodeSet));
     96            return (mRunIndex == other.mRunIndex) && (mQuadIndex == other.mQuadIndex) &&
     97                   (mQuadOffset == other.mQuadOffset) && (mQuadRunIndex == other.mQuadRunIndex);
     98        }
     99    private:
     100        const UnicodeSet &      mUnicodeSet;
     101        unsigned                mRunIndex;
     102        unsigned                mQuadIndex;
     103        bitquad_t               mQuadOffset;
     104        unsigned                mQuadRunIndex;
     105        unsigned                mBaseCodePoint;
     106        re::codepoint_t         mLeft;
     107        re::codepoint_t         mRight;
     108    };
     109
     110    inline quad_iterator quad_begin() const {
     111        return quad_iterator(*this, 0);
     112    }
     113
     114    inline quad_iterator quad_end() const {
     115        return quad_iterator(*this, runs.size());
     116    }
     117
    68118    inline iterator begin() const {
    69         return iterator(*this, 0);
     119        return iterator(*this, 0,0);
    70120    }
    71121
    72122    inline iterator end() const {
    73         return iterator(*this, runs.size());
     123        return iterator(*this, runs.size(), quads.size());
    74124    }
    75125
  • icGREP/icgrep-devel/icgrep/cc/cc_compiler.cpp

    r4614 r4617  
    2727namespace cc {
    2828
    29 CC_Compiler::CC_Compiler(PabloBlock & entry, const Encoding encoding, const std::string basis_pattern)
     29CC_Compiler::CC_Compiler(PabloBlock & entry, const Encoding & encoding, const std::string basis_pattern)
    3030: mBuilder(entry)
    3131, mBasisBit(encoding.getBits())
  • icGREP/icgrep-devel/icgrep/cc/cc_compiler.h

    r4612 r4617  
    2222    using Vars = std::vector<pablo::Var *>;
    2323
    24     CC_Compiler(pablo::PabloBlock & entry, const Encoding encoding, const std::string basis_pattern = "basis");
     24    CC_Compiler(pablo::PabloBlock & entry, const Encoding & encoding, const std::string basis_pattern = "basis");
    2525
    2626    const Vars & getBasisBits(const CC_NameMap & nameMap) const;
     
    5757    pablo::PabloBuilder         mBuilder;
    5858    std::vector<pablo::Var *>   mBasisBit;
    59     const Encoding              mEncoding;
     59    const Encoding &            mEncoding;
    6060};
    6161
  • icGREP/icgrep-devel/icgrep/pablo/builder.hpp

    r4612 r4617  
    1111
    1212    PabloBuilder(PabloBlock & pb) : mPb(pb) {}
     13
     14    inline static PabloBuilder Create(PabloBuilder & parent) {
     15        return PabloBuilder(parent.getPabloBlock().Create(parent.getPabloBlock()), parent);
     16    }
    1317
    1418    inline Zeroes * createZeroes() const {
     
    101105    }
    102106
     107    inline PabloBlock & getPabloBlock() {
     108        return mPb;
     109    }
     110
     111protected:
     112
     113    PabloBuilder(PabloBlock & pb, PabloBuilder & parent) : mPb(pb), mExprTable(&(parent.mExprTable)) {}
     114
    103115private:
    104116
  • icGREP/icgrep-devel/icgrep/re/re_cc.cpp

    r4614 r4617  
    156156    return cci;
    157157}
    158 
    159 /** ------------------------------------------------------------------------------------------------------------- *
    160  * @brief rangeIntersect
    161  * @param cc
    162  * @param lo
    163  * @param hi
    164  ** ------------------------------------------------------------------------------------------------------------- */
    165 CC * rangeIntersect(const CC * cc, const codepoint_t lo, const codepoint_t hi) {
    166     assert ("cc cannot be null" && cc);
    167     CC * intersect = makeCC();
    168     for (const auto & i : *cc) {
    169         if ((lo_codepoint(i) <= hi) && (hi_codepoint(i) >= lo)) {
    170             intersect->insert_range(std::max(lo, lo_codepoint(i)), std::min(hi, hi_codepoint(i)));
    171         }
    172     }
    173     return intersect;
    174 }
    175 
    176 /** ------------------------------------------------------------------------------------------------------------- *
    177  * @brief rangeGaps
    178  * @param cc
    179  * @param lo
    180  * @param hi
    181  ** ------------------------------------------------------------------------------------------------------------- */
    182 CC * rangeGaps(const CC * cc, const codepoint_t lo, const codepoint_t hi) {
    183     assert ("cc cannot be null" && cc);
    184     CC * gaps = makeCC();
    185     codepoint_t cp = lo;
    186     if (cp < hi) {
    187         auto i = cc->cbegin(), end = cc->cend();
    188         for (; i != end && cp < hi; ++i) {
    189             if (hi_codepoint(i) < cp) {
    190                 continue;
    191             }
    192             else if (lo_codepoint(i) > cp) {
    193                 gaps->insert_range(cp, lo_codepoint(i) - 1);
    194             }
    195             cp = hi_codepoint(i) + 1;
    196         }
    197         if (cp < hi) {
    198             gaps->insert_range(cp, hi);
    199         }
    200     }
    201     return gaps;
    202 }
    203 
    204 /** ------------------------------------------------------------------------------------------------------------- *
    205  * @brief outerRanges
    206  * @param cc
    207  ** ------------------------------------------------------------------------------------------------------------- */
    208 CC * outerRanges(const CC * cc) {
    209     assert ("cc cannot be null" && cc);
    210     CC * ranges = makeCC();
    211     auto i = cc->cbegin();
    212     const auto end = cc->cend();
    213     for (auto j = i; ++j != end; ) {
    214         if (hi_codepoint(j) > hi_codepoint(i)) {
    215             ranges->insert_range(lo_codepoint(i), hi_codepoint(i));
    216             i = j;
    217         }
    218     }
    219     return ranges;
    220 }
    221 
    222 /** ------------------------------------------------------------------------------------------------------------- *
    223  * @brief innerRanges
    224  * @param cc
    225  ** ------------------------------------------------------------------------------------------------------------- */
    226 CC * innerRanges(const CC * cc) {
    227     assert ("cc cannot be null" && cc);
    228     CC * ranges = makeCC();
    229     auto i = cc->cbegin();
    230     const auto end = cc->cend();
    231     for (auto j = i; ++j != end; ) {
    232         if (hi_codepoint(j) <= hi_codepoint(i)) {
    233             ranges->insert_range(lo_codepoint(j), hi_codepoint(j));
    234         }
    235         else {
    236             i = j;
    237         }
    238     }
    239     return ranges;
    240 }
    241158   
    242159}
  • icGREP/icgrep-devel/icgrep/re/re_cc.h

    r4614 r4617  
    127127    friend CC * makeCC(const codepoint_t lo, const codepoint_t hi);
    128128    friend CC * makeCC(const CC * cc1, const CC * cc2);
     129    friend CC * makeCC(const std::initializer_list<interval_t> list);
     130    friend CC * makeCC(const std::vector<interval_t> & list);
    129131    friend CC * subtractCC(const CC * cc1, const CC * cc2);
     132
    130133    inline CC()
    131134    : RE(ClassTypeId::CC)
     
    146149    CC(const CC * cc1, const CC * cc2);
    147150
     151    template <typename itr>
     152    CC * initialize(itr begin, itr end);
    148153private:   
    149154    IntervalVector mSparseCharSet;
     
    167172}
    168173
    169 inline codepoint_t & lo_codepoint(CC::reference i) {
     174inline codepoint_t & lo_codepoint(interval_t & i) {
    170175    return std::get<0>(i);
    171176}
    172 inline codepoint_t lo_codepoint(CC::const_reference i) {
     177inline codepoint_t lo_codepoint(const interval_t & i) {
    173178    return std::get<0>(i);
    174179}
     
    180185}
    181186
    182 inline codepoint_t & hi_codepoint(CC::reference i) {
     187inline codepoint_t & hi_codepoint(interval_t & i) {
    183188    return std::get<1>(i);
    184189}
    185 inline codepoint_t hi_codepoint(CC::const_reference i) {
     190inline codepoint_t hi_codepoint(const interval_t & i) {
    186191    return std::get<1>(i);
    187192}
     
    191196inline codepoint_t hi_codepoint(const CC::const_iterator i) {
    192197    return hi_codepoint(*i);
     198}
     199
     200template<typename itr>
     201CC * CC::initialize(itr begin, itr end) {
     202    mSparseCharSet.resize(std::distance(begin, end));
     203    for (auto i = begin; i != end; ++i) {
     204        assert (i == begin || lo_codepoint(i) > max_codepoint());
     205        mSparseCharSet[std::distance(begin, i)] = *i;
     206    }
     207    return this;
    193208}
    194209
     
    218233}
    219234
     235inline CC * makeCC(const std::initializer_list<interval_t> list) {
     236    return makeCC()->initialize(list.begin(), list.end());
     237}
     238
     239inline CC * makeCC(const std::vector<interval_t> & list) {
     240    return makeCC()->initialize(list.begin(), list.end());
     241}
     242
    220243CC * subtractCC(const CC * a, const CC * b);
    221244   
     
    224247CC * caseInsensitize(const CC * cc);
    225248
    226 CC * rangeIntersect(const CC * cc, const codepoint_t lo, const codepoint_t hi);
    227 
    228 CC * rangeGaps(const CC * cc, const codepoint_t lo, const codepoint_t hi);
    229 
    230 CC * outerRanges(const CC * cc);
    231 
    232 CC * innerRanges(const CC * cc);
    233 
    234249}
    235250
Note: See TracChangeset for help on using the changeset viewer.