Ignore:
Timestamp:
Jun 28, 2015, 3:55:39 PM (4 years ago)
Author:
nmedfort
Message:

Bug fix for CC insert_range and UnicodeSet? iterator.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/UCD/unicode_set.cpp

    r4620 r4621  
    2525#include <include/simd-lib/builtins.hpp>
    2626#include <iostream>
    27 #include <iomanip>
     27
     28using namespace re;
     29
     30namespace UCD {
     31
     32using bitquad_t = UnicodeSet::bitquad_t;
     33using run_t = UnicodeSet::run_t;
     34using RunVector = UnicodeSet::RunVector;
     35using QuadVector = UnicodeSet::QuadVector;
    2836
    2937const size_t QUAD_BITS = (8 * sizeof(bitquad_t));
    3038const size_t MOD_QUAD_BIT_MASK = QUAD_BITS - 1;
    31 const size_t UNICODE_QUAD_COUNT = 0x110000 / QUAD_BITS;
     39const size_t UNICODE_QUAD_COUNT = (CC::UNICODE_MAX + 1) / QUAD_BITS;
    3240const bitquad_t FULL_QUAD_MASK = -1;
    3341
    34 std::string run_type_name(const run_type_t type) {
    35     if (type == Empty) {
    36         return "Empty";
    37     }
    38     if (type == Full) {
    39         return "Full";
    40     }
    41     if (type == Mixed) {
    42         return "Mixed";
    43     }
    44     return "???";
    45 }
    46 
    47 using RunVector = UnicodeSet::RunVector;
    48 using QuadVector = UnicodeSet::QuadVector;
     42inline run_type_t typeOf(const run_t & run) {
     43    return std::get<0>(run);
     44}
     45
     46inline UnicodeSet::length_t lengthOf(const run_t & run) {
     47    return std::get<1>(run);
     48}
    4949
    5050/** ------------------------------------------------------------------------------------------------------------- *
     
    5555        return;
    5656    }
    57     else if (!runs.empty() && runs.back().mType == type) {
    58         runs.back().mRunLength += length;
     57    else if (!runs.empty() && typeOf(runs.back()) == type) {
     58        std::get<1>(runs.back()) += length;
    5959        return;
    6060    }
     
    8181
    8282/** ------------------------------------------------------------------------------------------------------------- *
     83 * @brief runLengthSumsUpToUnicodeQuadCount
     84 *
     85 * Sanity check for each function that constructs a new UnicodeSet
     86 ** ------------------------------------------------------------------------------------------------------------- */
     87inline bool runLengthSumsUpToUnicodeQuadCount(const RunVector & runs) {
     88    unsigned sum = 0;
     89    for (auto & run : runs) {
     90        sum += lengthOf(run);
     91    }
     92    return sum == UNICODE_QUAD_COUNT;
     93}
     94
     95/** ------------------------------------------------------------------------------------------------------------- *
    8396 * @brief dump
    8497 ** ------------------------------------------------------------------------------------------------------------- */
    8598void UnicodeSet::dump(llvm::raw_ostream & out) const {
    8699    auto qi = mQuads.cbegin();
    87     for (const RunStructure & run : mRuns) {
    88         if (run.mType == Empty) {
    89             out << "Empty(" << run.mRunLength << ")\n";
    90         }
    91         else if (run.mType == Full) {
    92             out << "Full(" << run.mRunLength << ")\n";
     100    for (const run_t & run : mRuns) {
     101        if (typeOf(run) == Empty) {
     102            out << "Empty(" << lengthOf(run) << ")\n";
     103        }
     104        else if (typeOf(run) == Full) {
     105            out << "Full(" << lengthOf(run) << ")\n";
    93106        }
    94107        else {
    95             for (const auto qi_end = qi + run.mRunLength; qi != qi_end; ++qi) {
     108            for (const auto qi_end = qi + lengthOf(run); qi != qi_end; ++qi) {
    96109                assert (qi != mQuads.cend());
    97110                out << "Mixed(" << llvm::format("%08x", *qi) << ")\n";
     
    105118 * @brief complement
    106119 ** ------------------------------------------------------------------------------------------------------------- */
    107 UnicodeSet UnicodeSet::complement() const {
     120UnicodeSet UnicodeSet::operator~() const {
    108121    RunVector runs;
    109122    QuadVector quads;
     
    111124    quads.reserve(mQuads.size());
    112125    auto qi = quads.cbegin();
    113     for (const RunStructure & run : mRuns) {
    114         if (run.mType == Empty) {
    115             append_run(Full, run.mRunLength, runs);
    116         }
    117         else if (run.mType == Full) {
    118             append_run(Empty, run.mRunLength, runs);
     126    for (const run_t & run : mRuns) {
     127        if (typeOf(run) == Empty) {
     128            append_run(Full, lengthOf(run), runs);
     129        }
     130        else if (typeOf(run) == Full) {
     131            append_run(Empty, lengthOf(run), runs);
    119132        }
    120133        else {
    121             for (const auto qi_end = qi + run.mRunLength; qi != qi_end; ++qi) {
     134            for (const auto qi_end = qi + lengthOf(run); qi != qi_end; ++qi) {
    122135                assert (qi != quads.cend());
    123136                append_quad(FULL_QUAD_MASK ^ *qi, quads, runs);
     
    125138        }
    126139    }
     140    assert (runLengthSumsUpToUnicodeQuadCount(runs));
    127141    return UnicodeSet(std::move(runs), std::move(quads));
    128142}
     
    139153        const auto run1 = i1.getRun();
    140154        const auto run2 = i2.getRun();
    141         const auto n = std::min(run1.mRunLength, run2.mRunLength);
    142         if (run1.mType == run2.mType && run1.mType != Mixed) {
    143             append_run(run1.mType, n, runs);
    144             i1 += n;
    145             i2 += n;
    146         }
    147         else if (run1.mType == Full) {
     155        const auto n = std::min(lengthOf(run1), lengthOf(run2));
     156        if (typeOf(run1) == typeOf(run2) && typeOf(run1) != Mixed) {
     157            append_run(typeOf(run1), n, runs);
     158            i1 += n;
     159            i2 += n;
     160        }
     161        else if (typeOf(run1) == Full) {
    148162            for (unsigned i = 0; i != n; ++i, ++i2) {
    149163                append_quad(i2.getQuad(), quads, runs);
     
    151165            i1 += n;
    152166        }
    153         else if (run2.mType == Full) {
     167        else if (typeOf(run2) == Full) {
    154168            for (unsigned i = 0; i != n; ++i, ++i1) {
    155169                append_quad(i1.getQuad(), quads, runs);
     
    163177        }
    164178    }
     179    assert (runLengthSumsUpToUnicodeQuadCount(runs));
    165180    return UnicodeSet(std::move(runs), std::move(quads));
    166181}
     
    174189    const auto e1 = quad_end();
    175190    const auto e2 = other.quad_end();
    176     for (auto i1 = quad_begin(), i2 = other.quad_begin(); i1 != e1 && i2 != e2; ) {
     191    auto i1 = quad_begin(), i2 = other.quad_begin();
     192    for (; i1 != e1 && i2 != e2; ) {
    177193        const auto run1 = i1.getRun();
    178194        const auto run2 = i2.getRun();
    179 
    180         const auto n = std::min(run1.mRunLength, run2.mRunLength);
    181         if ((run1.mType == Empty) && (run2.mType == Empty)) {
     195        const auto n = std::min(lengthOf(run1), lengthOf(run2));
     196        if ((typeOf(run1) == Empty) && (typeOf(run2) == Empty)) {
    182197            append_run(Empty, n, runs);
    183198            i1 += n;
    184199            i2 += n;
    185200        }
    186         else if ((run1.mType == Full) || (run2.mType == Full)) {
     201        else if ((typeOf(run1) == Full) || (typeOf(run2) == Full)) {
    187202            append_run(Full, n, runs);
    188203            i1 += n;
    189204            i2 += n;
    190205        }
    191         else if (run1.mType == Empty) {
     206        else if (typeOf(run1) == Empty) {
    192207            for (unsigned i = 0; i != n; ++i, ++i2) {
    193208                append_quad(i2.getQuad(), quads, runs);
     
    195210            i1 += n;
    196211        }
    197         else if (run2.mType == Empty) {
     212        else if (typeOf(run2) == Empty) {
    198213            for (unsigned i = 0; i != n; ++i, ++i1) {
    199214                append_quad(i1.getQuad(), quads, runs);
     
    207222        }
    208223    }
     224    assert (runLengthSumsUpToUnicodeQuadCount(runs));
    209225    return UnicodeSet(std::move(runs), std::move(quads));
    210226}
     
    221237        const auto run1 = i1.getRun();
    222238        const auto run2 = i2.getRun();
    223         unsigned n = std::min(run1.mRunLength, run2.mRunLength);
    224         if ((run1.mType == Empty) || (run2.mType == Full) || (run1.mType == Full && run2.mType == Empty)) {
    225             append_run(run1.mType, n, runs);
    226             i1 += n;
    227             i2 += n;
    228         }
    229         else if (run1.mType == Full) {
     239        unsigned n = std::min(lengthOf(run1), lengthOf(run2));
     240        if ((typeOf(run1) == Empty) || (typeOf(run2) == Full) || (typeOf(run1) == Full && typeOf(run2) == Empty)) {
     241            append_run(typeOf(run1), n, runs);
     242            i1 += n;
     243            i2 += n;
     244        }
     245        else if (typeOf(run1) == Full) {
    230246            for (unsigned i = 0; i != n; ++i, ++i2) {
    231247                append_quad(FULL_QUAD_MASK ^ i2.getQuad(), quads, runs);
     
    233249            i1 += n;
    234250        }
    235         else if (run2.mType == Empty) {
     251        else if (typeOf(run2) == Empty) {
    236252            for (unsigned i = 0; i != n; ++i, ++i1) {
    237253                append_quad(i1.getQuad(), quads, runs);
     
    245261        }
    246262    }
     263    assert (runLengthSumsUpToUnicodeQuadCount(runs));
    247264    return UnicodeSet(std::move(runs), std::move(quads));
    248265}
     
    259276        const auto run1 = i1.getRun();
    260277        const auto run2 = i2.getRun();
    261         unsigned n = std::min(run1.mRunLength, run2.mRunLength);
    262         if (run1.mType != Mixed && run2.mType != Mixed) {
    263             append_run(run1.mType == run2.mType ? Empty : Full, n, runs);
    264             i1 += n;
    265             i2 += n;
    266         }
    267         else if (run1.mType == Empty) {
     278        unsigned n = std::min(lengthOf(run1), lengthOf(run2));
     279        if (typeOf(run1) != Mixed && typeOf(run2) != Mixed) {
     280            append_run(typeOf(run1) == typeOf(run2) ? Empty : Full, n, runs);
     281            i1 += n;
     282            i2 += n;
     283        }
     284        else if (typeOf(run1) == Empty) {
    268285            for (unsigned i = 0; i < n; ++i, ++i2) {
    269286                append_quad(i2.getQuad(), quads, runs);
     
    271288            i1 += n;
    272289        }
    273         else if (run2.mType == Empty) {
     290        else if (typeOf(run2) == Empty) {
    274291            for (unsigned i = 0; i < n; ++i, ++i1) {
    275292                append_quad(i1.getQuad(), quads, runs);
     
    277294            i2 += n;
    278295        }
    279         else if (run1.mType == Full) {
     296        else if (typeOf(run1) == Full) {
    280297            for (unsigned i = 0; i < n; ++i, ++i2) {
    281298                append_quad(FULL_QUAD_MASK ^ i2.getQuad(), quads, runs);
     
    283300            i1 += n;
    284301        }
    285         else if (run2.mType == Empty) {
     302        else if (typeOf(run2) == Empty) {
    286303            for (unsigned i = 0; i < n; ++i, ++i1) {
    287304                append_quad(FULL_QUAD_MASK ^ i1.getQuad(), quads, runs);
     
    295312        }
    296313    }
     314    assert (runLengthSumsUpToUnicodeQuadCount(runs));
    297315    return UnicodeSet(std::move(runs), std::move(quads));
    298316}
     317
     318/** ------------------------------------------------------------------------------------------------------------- *
     319 * @brief equality
     320 ** ------------------------------------------------------------------------------------------------------------- */
     321UnicodeSet UnicodeSet::operator==(const UnicodeSet & other) const {
     322    if (mRuns.size() != other.mRuns.size() || mQuads.size() != other.mQuads.size()) {
     323        return false;
     324    }
     325    for (auto i = mQuads.begin(), j = other.mQuads.begin(); i != mQuads.end(); ++i, ++j) {
     326        if (*i != *j) return false;
     327    }
     328    for (auto i = mRuns.begin(), j = other.mRuns.begin(); i != mRuns.end(); ++i, ++j) {
     329        if (*i != *j) return false;
     330    }
     331    return true;
     332}
     333
    299334
    300335/** ------------------------------------------------------------------------------------------------------------- *
     
    305340 ** ------------------------------------------------------------------------------------------------------------- */
    306341bool UnicodeSet::contains(const codepoint_t codepoint) const {
    307 
    308342    auto n = codepoint / QUAD_BITS;
    309     unsigned runIndex = 0;
    310     unsigned quadIndex = 0;
    311 
    312     for (;;) {
    313         const RunStructure & t = mRuns[runIndex];
    314         if (t.mRunLength >= n) {
    315             if (t.mType == Mixed) {
    316                 return (mQuads[quadIndex + n - 1] & (static_cast<bitquad_t>(1) << (codepoint & MOD_QUAD_BIT_MASK))) != 0;
    317             }
    318             return (t.mType == Full);
    319         }
    320         if (t.mType == Mixed) {
    321             quadIndex += n;
    322         }
    323         ++runIndex;
    324         n -= t.mRunLength;
    325     }
    326 
     343    QuadVector::const_iterator qi = mQuads.cbegin();
     344    for (const auto & r : mRuns) {
     345        if (lengthOf(r) >= n) {
     346            if (typeOf(r) == Mixed) {
     347                qi += n - 1;
     348                return (*qi & (static_cast<bitquad_t>(1) << (codepoint & MOD_QUAD_BIT_MASK))) != 0;
     349            }
     350            return (typeOf(r) == Full);
     351        }
     352        if (typeOf(r) == Mixed) {
     353            qi += n;
     354        }       
     355        n -= lengthOf(r);
     356    }
     357    return false;
    327358}
    328359
     
    332363void UnicodeSet::quad_iterator::advance(unsigned n) {
    333364    while (n > 0) {
    334         const unsigned remain = mRunIterator->mRunLength - mOffset;
     365        const unsigned remain = lengthOf(*mRunIterator) - mOffset;
    335366        if (remain > n) {
    336             if (mRunIterator->mType == Mixed) {
     367            if (typeOf(*mRunIterator) == Mixed) {
    337368                mQuadIterator += n;
    338369            }
     
    340371            break;
    341372        }
    342         if (mRunIterator->mType == Mixed) {
     373        if (typeOf(*mRunIterator) == Mixed) {
    343374            mQuadIterator += remain;
    344375        }
     
    354385void UnicodeSet::iterator::advance(const unsigned n) {
    355386
    356     assert (n == 1);
     387    assert (n == 1);   
    357388
    358389    // Find the start of our interval
    359     for ( ; mBaseCodePoint <= re::CC::UNICODE_MAX; ++mRunIterator) {
     390    for ( ; mBaseCodePoint < CC::UNICODE_MAX; ++mRunIterator) {
    360391        // Find the first non-empty block
    361         const RunStructure & run = *mRunIterator;
    362         if (run.mType != Mixed) {
    363             mMinCodePoint = mBaseCodePoint;
    364             mBaseCodePoint += run.mRunLength * QUAD_BITS;
     392        if (typeOf(*mRunIterator) != Mixed) {
     393            mBaseCodePoint += lengthOf(*mRunIterator) * QUAD_BITS;
    365394            mQuadOffset = 0;
    366             mQuadPosition = 0;
    367             if (run.mType == Full) {
     395            mMixedRunIndex = 0;
     396            // If we found a full run, this must be the start of our interval.
     397            // Otherwise it must be empty.
     398            if (typeOf(*mRunIterator) == Full) {
     399                mMinCodePoint = mBaseCodePoint;
    368400                break;
    369401            }
    370402        }
    371         else { // if (left.mType == Mixed)
    372             while (mQuadPosition != run.mRunLength) {
    373                 const bitquad_t q = *mQuadIterator;
    374                 const bitquad_t M = (FULL_QUAD_MASK << mQuadOffset);
    375                 const bitquad_t m = q & M;
    376                 // Nothing left in this quad to add; skip to the next one.
    377                 if (m == 0) {
    378                     mBaseCodePoint += QUAD_BITS;
    379                     ++mQuadPosition;
    380                     ++mQuadIterator;
    381                     continue;
     403        else { // if (leftypeOf(t) == Mixed)
     404            bool found = false;
     405            while (mMixedRunIndex != lengthOf(*mRunIterator)) {
     406                const bitquad_t m = (*mQuadIterator) & (FULL_QUAD_MASK << mQuadOffset);
     407                // If we found a marker in m, it marks the beginning of our current interval.
     408                // Find it and break out of the loop.
     409                if (m) {
     410                    mQuadOffset = scan_forward_zeroes(m);
     411                    mMinCodePoint = mBaseCodePoint + mQuadOffset;
     412                    found = true;
     413                    break;
    382414                }
    383                 mQuadOffset = scan_forward_zeroes(m);
    384                 mMinCodePoint = mBaseCodePoint + mQuadOffset;
     415                mBaseCodePoint += QUAD_BITS;
     416                ++mMixedRunIndex;
     417                ++mQuadIterator;
     418                mQuadOffset = 0;
     419            }
     420            // If we found nothing in the quad, restart the loop.
     421            if (found) {
     422                // std::cerr << "Min: " << mMinCodePoint << " = " << mBaseCodePoint << " + " << mQuadOffset << std::endl;
    385423                break;
    386424            }
     425        }
     426    }
     427
     428    // Find the end of our interval
     429    for ( ; mBaseCodePoint < CC::UNICODE_MAX; ++mRunIterator) {
     430        // If this run is Empty, the max code point is the last computed base code point - 1.
     431        if (typeOf(*mRunIterator) == Empty) {
     432            mMaxCodePoint = mBaseCodePoint - 1;
     433            break;
     434        }
     435        // If this run is Full, increment the base code point; we need to check whether
     436        // the next run is Empty or Mixed to know if we've found the max code point of
     437        // the current interval.
     438        else if (typeOf(*mRunIterator) == Full) {
     439            mBaseCodePoint += lengthOf(*mRunIterator) * QUAD_BITS;
     440            mQuadOffset = 0;
     441            mMixedRunIndex = 0;
     442            continue;
     443        }
     444        else { // if (leftypeOf(t) == Mixed)
     445            bool found = false;
     446            while (mMixedRunIndex != lengthOf(*mRunIterator)) {
     447                const bitquad_t m = (~(*mQuadIterator)) & (FULL_QUAD_MASK << mQuadOffset);
     448                // If we found a marker in m, it marks the end of our current interval.
     449                // Find it and break out of the loop.
     450                if (m) {
     451                    mQuadOffset = scan_forward_zeroes(m);
     452                    mMaxCodePoint = mBaseCodePoint + mQuadOffset - 1;
     453                    found = true;
     454                    break;
     455                }
     456                mBaseCodePoint += QUAD_BITS;
     457                ++mMixedRunIndex;
     458                ++mQuadIterator;
     459                mQuadOffset = 0;
     460            }
    387461            // If we found nothing in the quad, restart the loop.
    388             if (mQuadPosition != run.mRunLength) {
    389                 break;
    390             }
    391         }
    392     }
    393 
    394     // Find the end of our interval
    395     for ( ; mBaseCodePoint <= re::CC::UNICODE_MAX; ++mRunIterator) {
    396         const RunStructure & run = *mRunIterator;
    397         // If the next run is empty, we already know the max code point.
    398         if (run.mType == Empty) {
    399             mMaxCodePoint = mBaseCodePoint;
    400             break;
    401         }
    402         // If the next run is Full, increment the base code point.
    403         else if (run.mType == Full) {
    404             mBaseCodePoint += run.mRunLength * QUAD_BITS;
    405             mMaxCodePoint = mBaseCodePoint;
    406             mQuadOffset = 0;
    407             mQuadPosition = 0;
    408             continue;
    409         }
    410         else { // if (left.mType == Mixed)
    411             while (mQuadPosition != run.mRunLength) {
    412 
    413                 const bitquad_t q = *mQuadIterator;
    414                 const bitquad_t M = (FULL_QUAD_MASK << mQuadOffset);
    415                 const bitquad_t m = ~q & M;
    416 
    417                 // Nothing left in this quad to add; skip to the next one.
    418                 if (m == 0) {
    419                     mBaseCodePoint += QUAD_BITS;
    420                     mMaxCodePoint = mBaseCodePoint;
    421                     ++mQuadPosition;
    422                     ++mQuadIterator;
    423                     continue;
    424                 }
    425 
    426                 mQuadOffset = scan_forward_zeroes(m);
    427                 mMaxCodePoint = mBaseCodePoint + mQuadOffset - 1;
    428                 break;
    429             }
    430             // If we found nothing in the quad, restart the loop.
    431             if (mQuadPosition != run.mRunLength) {
     462            if (found) {
     463                // std::cerr << "Max: " << mMinCodePoint << " = " << mBaseCodePoint << " + " << mQuadOffset << std::endl;
    432464                break;
    433465            }
     
    448480UnicodeSet::UnicodeSet(const codepoint_t codepoint) {
    449481    const codepoint_t quad_no = codepoint / QUAD_BITS;
    450     append_run(Empty, quad_no, mRuns);
     482    if (quad_no > 0) {
     483        append_run(Empty, quad_no, mRuns);
     484    }
    451485    append_quad(static_cast<bitquad_t>(1) << (codepoint & MOD_QUAD_BIT_MASK), mQuads, mRuns);
    452     append_run(Empty, UNICODE_QUAD_COUNT - (quad_no + 1), mRuns);
     486    if (quad_no < UNICODE_QUAD_COUNT - 1) {
     487        append_run(Empty, UNICODE_QUAD_COUNT - (quad_no + 1), mRuns);
     488    }
     489    assert (runLengthSumsUpToUnicodeQuadCount(mRuns));
    453490}
    454491
     
    472509        append_quad(hi_quad, mQuads, mRuns);
    473510    }
    474     append_run(Empty, UNICODE_QUAD_COUNT - (hi_quad_no + 1), mRuns);
    475 }
    476 
     511    if (hi_quad_no < UNICODE_QUAD_COUNT - 1) {
     512        append_run(Empty, UNICODE_QUAD_COUNT - (hi_quad_no + 1), mRuns);
     513    }
     514    assert (runLengthSumsUpToUnicodeQuadCount(mRuns));
     515}
     516
     517}
Note: See TracChangeset for help on using the changeset viewer.