Ignore:
Timestamp:
Nov 5, 2015, 4:41:37 PM (4 years ago)
Author:
nmedfort
Message:

Back up check in. Memory leaks should be fixed.

Location:
icGREP/icgrep-devel/icgrep/UCD
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/UCD/ucd_compiler.cpp

    r4841 r4860  
    229229            // We have a single byte remaining to match for all code points in this CC.
    230230            // Use the byte class compiler to generate matches for these codepoints.
    231             const auto bytes = byteDefinitions(ranges, byte_no);
    232             PabloAST * var = mCharacterClassCompiler.compileCC(makeCC(bytes), builder);
     231            PabloAST * var = mCharacterClassCompiler.compileCC(makeCC(byteDefinitions(ranges, byte_no)), builder);
    233232            if (byte_no > 1) {
    234233                var = builder.createAnd(var, builder.createAdvance(makePrefix(lo, byte_no, builder, prefix), 1));
  • icGREP/icgrep-devel/icgrep/UCD/unicode_set.cpp

    r4850 r4860  
    2424#include <llvm/Support/Format.h>
    2525#include <include/simd-lib/builtins.hpp>
    26 #include <iostream>
     26#include <array>
    2727
    2828namespace UCD {
     
    3030using bitquad_t = UnicodeSet::bitquad_t;
    3131using run_t = UnicodeSet::run_t;
    32 using RunVector = UnicodeSet::RunVector;
    33 using QuadVector = UnicodeSet::QuadVector;
     32using interval_t = UnicodeSet::interval_t;
     33using codepoint_t = UnicodeSet::codepoint_t;
     34
     35UnicodeSet::Allocator UnicodeSet::mAllocator;
    3436
    3537const size_t QUAD_BITS = (8 * sizeof(bitquad_t));
     
    4951 * @brief append_run
    5052 ** ------------------------------------------------------------------------------------------------------------- */
     53template <class RunVector>
    5154inline void append_run(const run_type_t type, const unsigned length, RunVector & runs) {
    5255    if (LLVM_UNLIKELY(length == 0)) {
    5356        return;
    54     }
    55     else if (!runs.empty() && typeOf(runs.back()) == type) {
     57    } else if (!runs.empty() && typeOf(runs.back()) == type) {
    5658        std::get<1>(runs.back()) += length;
    5759        return;
     
    6365 * @brief append_quad
    6466 ** ------------------------------------------------------------------------------------------------------------- */
     67template <class QuadVector, class RunVector>
    6568inline void append_quad(const bitquad_t quad, QuadVector & quads, RunVector & runs) {
    6669    run_type_t type = Empty;
    6770    if (LLVM_UNLIKELY(quad == 0)) {
    6871        type = Empty;
    69     }
    70     else if (LLVM_UNLIKELY(quad == FULL_QUAD_MASK)) {
     72    } else if (LLVM_UNLIKELY(quad == FULL_QUAD_MASK)) {
    7173        type = Full;
    72     }
    73     else {
     74    } else {
    7475        quads.emplace_back(quad);
    7576        type = Mixed;
     
    8081#ifndef NDEBUG
    8182/** ------------------------------------------------------------------------------------------------------------- *
    82  * @brief runLengthSumsUpToUnicodeQuadCount
     83 * @brief verify
    8384 *
    84  * Sanity check for each function that constructs a new UnicodeSet
    85  ** ------------------------------------------------------------------------------------------------------------- */
    86 inline bool runLengthSumsUpToUnicodeQuadCount(const RunVector & runs) {
     85 * Sanity check for each function that constructs or modifies a UnicodeSet
     86 ** ------------------------------------------------------------------------------------------------------------- */
     87template <class RunVector, class QuadVector>
     88inline bool verify(const RunVector & runs, const QuadVector & quads) {
    8789    unsigned sum = 0;
    88     for (auto & run : runs) {
    89         sum += lengthOf(run);
    90     }
    91     return sum == UNICODE_QUAD_COUNT;
     90    unsigned mixedQuads = 0;
     91    for (auto run : runs) {
     92        const auto l = lengthOf(run);
     93        if (l == 0) {
     94            throw std::runtime_error("Zero-length quad found!");
     95        }
     96        if (typeOf(run) == Mixed) {
     97            mixedQuads += l;
     98        }
     99        sum += l;
     100    }
     101    if (sum != UNICODE_QUAD_COUNT) {
     102        throw std::runtime_error("Invalid quad count: found " + std::to_string(sum) + " but expected " + std::to_string(UNICODE_QUAD_COUNT));
     103    }
     104    if (mixedQuads != quads.size()) {
     105        throw std::runtime_error("Invalid mixed quad count: found " + std::to_string(quads.size()) + " but expected " + std::to_string(mixedQuads));
     106    }
     107    for (auto quad : quads) {
     108        if (quad == 0) {
     109            throw std::runtime_error("Empty quad found in Mixed quad array!");
     110        } else if (quad == FULL_QUAD_MASK) {
     111            throw std::runtime_error("Full quad found in Mixed quad array!");
     112        }
     113    }
     114    return true;
    92115}
    93116#endif
     
    148171 ** ------------------------------------------------------------------------------------------------------------- */
    149172UnicodeSet UnicodeSet::operator~() const {
    150     RunVector runs;
    151     QuadVector quads;
     173    std::vector<run_t> runs;
     174    std::vector<bitquad_t> quads;
    152175    runs.reserve(mRuns.size());
    153176    quads.reserve(mQuads.size());
     
    167190        }
    168191    }
    169     assert (runLengthSumsUpToUnicodeQuadCount(runs));
    170192    return UnicodeSet(std::move(runs), std::move(quads));
    171193}
     
    175197 ** ------------------------------------------------------------------------------------------------------------- */
    176198UnicodeSet UnicodeSet::operator&(const UnicodeSet & other) const {
    177     RunVector runs;
    178     QuadVector quads;
     199    std::vector<run_t> runs;
     200    std::vector<bitquad_t> quads;
    179201    const auto e1 = quad_end();
    180202    const auto e2 = other.quad_end();
     
    209231        }
    210232    }
    211     assert (runLengthSumsUpToUnicodeQuadCount(runs));
    212233    return UnicodeSet(std::move(runs), std::move(quads));
    213234}
     
    217238 ** ------------------------------------------------------------------------------------------------------------- */
    218239UnicodeSet UnicodeSet::operator+(const UnicodeSet & other) const {
    219     RunVector runs;
    220     QuadVector quads;
     240    std::vector<run_t> runs;
     241    std::vector<bitquad_t> quads;
    221242    const auto e1 = quad_end();
    222243    const auto e2 = other.quad_end();
     
    228249            i1 += n;
    229250            i2 += n;
    230         }
    231         else if ((i1.type() == Full) || (i2.type() == Full)) {
     251        } else if ((i1.type() == Full) || (i2.type() == Full)) {
    232252            append_run(Full, n, runs);
    233253            i1 += n;
    234254            i2 += n;
    235         }
    236         else if (i1.type() == Empty) {
     255        } else if (i1.type() == Empty) {
    237256            for (unsigned i = 0; i != n; ++i, ++i2) {
    238257                append_quad(i2.quad(), quads, runs);
    239258            }
    240259            i1 += n;
    241         }
    242         else if (i2.type() == Empty) {
     260        } else if (i2.type() == Empty) {
    243261            for (unsigned i = 0; i != n; ++i, ++i1) {
    244262                append_quad(i1.quad(), quads, runs);
    245263            }
    246264            i2 += n;
    247         }
    248         else {
     265        } else {
    249266            for (unsigned i = 0; i < n; ++i, ++i1, ++i2) {
    250267                append_quad(i1.quad() | i2.quad(), quads, runs);
     
    252269        }
    253270    }
    254     assert (runLengthSumsUpToUnicodeQuadCount(runs));
    255271    return UnicodeSet(std::move(runs), std::move(quads));
    256272}
     
    260276 ** ------------------------------------------------------------------------------------------------------------- */
    261277UnicodeSet UnicodeSet::operator-(const UnicodeSet & other) const {
    262     RunVector runs;
    263     QuadVector quads;
     278    std::vector<run_t> runs;
     279    std::vector<bitquad_t> quads;
    264280    const auto e1 = quad_end();
    265281    const auto e2 = other.quad_end();
     
    289305        }
    290306    }
    291     assert (runLengthSumsUpToUnicodeQuadCount(runs));
    292307    return UnicodeSet(std::move(runs), std::move(quads));
    293308}
     
    297312 ** ------------------------------------------------------------------------------------------------------------- */
    298313UnicodeSet UnicodeSet::operator^(const UnicodeSet & other) const {
    299     RunVector runs;
    300     QuadVector quads;
     314    std::vector<run_t> runs;
     315    std::vector<bitquad_t> quads;
    301316    const auto e1 = quad_end();
    302317    const auto e2 = other.quad_end();
     
    338353        }
    339354    }
    340     assert (runLengthSumsUpToUnicodeQuadCount(runs));
    341355    return UnicodeSet(std::move(runs), std::move(quads));
    342356}
     
    385399}
    386400
    387 ///** ------------------------------------------------------------------------------------------------------------- *
    388 // * @brief insert_range
    389 // ** ------------------------------------------------------------------------------------------------------------- */
    390 //void UnicodeSet::insert_range(const codepoint_t lo, const codepoint_t hi)  {
    391 
    392 //    if (LLVM_UNLIKELY(lo > hi)) {
    393 //        throw std::runtime_error('[' + std::to_string(lo) + ',' + std::to_string(hi) + "] is an illegal codepoint range!");
    394 //    } else if (LLVM_UNLIKELY(hi >= 0x110000)) {
    395 //        throw std::runtime_error(std::to_string(hi) + " exceeds maximum code point.");
    396 //    }
    397 
    398 //    auto r = mRuns.begin();
    399 //    auto q = mQuads.begin();
    400 //    unsigned offset = 0;
    401 
    402 //    auto lo_quad_no = lo / QUAD_BITS;
    403 //    auto lo_offset = lo & MOD_QUAD_BIT_MASK;
    404 
    405 //    auto hi_quad_no = hi / QUAD_BITS;
    406 //    auto hi_offset = hi & MOD_QUAD_BIT_MASK;
    407 
    408 //    // Scan up to the lo codepoint
    409 //    for (;;) {
    410 //        assert (r != mRuns.end());
    411 //        const auto l = lengthOf(*r);
    412 //        if ((offset + l) > lo_quad_no) {
    413 //            break;
    414 //        }
    415 //        if (typeOf(*r) == Mixed) {
    416 //            q += lengthOf(*r);
    417 //        }
    418 //        offset += l;
    419 //        ++r;
    420 //    }
    421 
    422 //    // Test whether the range is already 'full' and skip ahead to the first empty or mixed quad.
    423 //    // If the entire [lo,hi] range is already covered by a Full run, abort.
    424 //    while (typeOf(*r) == Full) {
    425 //        const auto l = lengthOf(*r);
    426 //        lo_quad_no += l;
    427 //        offset = lo_quad_no;
    428 //        lo_offset = 0;
    429 //        if (lo_quad_no > hi_quad_no) {
    430 //            return;
    431 //        }
    432 //        ++r;
    433 //    }
    434 
    435 //    // Otherwise, some portion of this range has to be inserted into the current sparse set.
    436 //    // Begin by inserting the initial (potentially) partial lo quad.
    437 //    const bitquad_t lo_quad = (FULL_QUAD_MASK << lo_offset);
    438 //    const bitquad_t hi_quad = (FULL_QUAD_MASK >> (QUAD_BITS - 1 - hi_offset));
    439 //    bitquad_t quad = (lo_quad_no == hi_quad_no) ? (lo_quad & hi_quad) : lo_quad;
    440 //    run_type_t newType = (quad == FULL_QUAD_MASK) ? Full : ((quad == 0) ? Empty : Mixed);
    441 //    run_type_t runType = typeOf(*r);
    442 //    // If the original run is Mixed, we may be able to simply update the quad accordingly.
    443 //    if (runType == Mixed) {
    444 //        q += (lo_quad_no - offset);
    445 //        quad |= *q;
    446 //        if (LLVM_LIKELY(quad != FULL_QUAD_MASK)) {
    447 //            *q = quad;
    448 //            if (lo_quad_no == hi_quad_no) {
    449 //                return;
    450 //            }
    451 //        } else { // we filled a Mixed quad
    452 //            mQuads.erase(q);
    453 //        }
    454 //        newType = Full;
    455 //    }
    456 //    auto length = lengthOf(*r);
    457 //    auto splitAt = length - (lo_quad_no - offset) - 1;
    458 //    if (splitAt) {
    459 //        // reduce the original run length
    460 //        lengthOf(*r) = splitAt;
    461 //        // and add in a new quad
    462 //        r = mRuns.emplace(r, newType, 1);
    463 //    } else { // we're inserting this quad at the beginning of the run
    464 //        typeOf(*r) = newType;
    465 //        lengthOf(*r) = 1;
    466 //    }
    467 //    if (newType == Mixed) {
    468 //        q = mQuads.emplace(q, quad);
    469 //    }
    470 //    length -= splitAt + 1;
    471 //    auto remaining = (hi_quad_no - lo_quad_no);
    472 //    // We're inserting a Full run so if the original run type was Full and exceeds the
    473 //    // length of what we're inserting, we can abort without considering the hi_quad
    474 //    if (runType == Full && length > remaining) {
    475 //        return;
    476 //    }
    477 //    if (remaining) {
    478 //        r = mRuns.emplace(r, Full, remaining);
    479 
    480 
    481 
    482 //    }
    483 
    484 //}
    485 
    486 
     401/** ------------------------------------------------------------------------------------------------------------- *
     402 * @brief insert
     403 ** ------------------------------------------------------------------------------------------------------------- */
     404void UnicodeSet::insert(const codepoint_t cp) {
     405
     406    if (LLVM_UNLIKELY(cp >= UNICODE_MAX)) {
     407        throw std::runtime_error(std::to_string(cp) + " exceeds maximum code point.");
     408    }
     409
     410    codepoint_t offset = cp / QUAD_BITS;
     411    const bitquad_t value = static_cast<bitquad_t>(1) << (cp & MOD_QUAD_BIT_MASK);
     412    auto run = mRuns.begin();
     413    auto quad = mQuads.begin();
     414    length_t l = 0;
     415    run_type_t t = Empty;
     416    for (;;) {
     417        std::tie(t, l) = *run;
     418        if (offset < l) {
     419            break;
     420        }
     421        if (t == Mixed) {
     422            quad += l;
     423        }
     424        offset -= l;
     425        ++run;
     426    }
     427    if (LLVM_LIKELY(t == Mixed)) {
     428        quad += offset;
     429        *quad |= value;
     430        if (LLVM_LIKELY(*quad != FULL_QUAD_MASK)) {
     431            assert (contains(cp));
     432            return;
     433        }
     434        mQuads.erase(quad);
     435    } else if (t == Empty) {
     436        mQuads.insert(quad, value);
     437    } else { // if (t == Full) {
     438        assert (contains(cp));
     439        return;
     440    }
     441    const unsigned remaining = l - (offset + 1);
     442    if (offset == 0) {
     443        *run = std::make_pair(t == Empty ? Mixed : Full, 1);
     444        if (remaining != 0) {
     445            mRuns.insert(++run, std::make_pair(t, remaining));
     446        }
     447    } else {
     448        run->second = offset;
     449        if (remaining == 0) {
     450            mRuns.insert(++run, std::make_pair(t == Empty ? Mixed : Full, 1));
     451        } else {
     452            mRuns.insert(++run, {std::make_pair(t == Empty ? Mixed : Full, 1), std::make_pair(t, remaining)});
     453        }
     454    }
     455
     456    assert (verify(mRuns, mQuads));
     457    assert (contains(cp));
     458}
     459
     460/** ------------------------------------------------------------------------------------------------------------- *
     461 * @brief insert_range
     462 ** ------------------------------------------------------------------------------------------------------------- */
     463void UnicodeSet::insert_range(const codepoint_t lo, const codepoint_t hi)  {
     464
     465    if (LLVM_UNLIKELY(lo > hi)) {
     466        throw std::runtime_error('[' + std::to_string(lo) + ',' + std::to_string(hi) + "] is an illegal codepoint range!");
     467    } else if (LLVM_UNLIKELY(hi > UNICODE_MAX)) {
     468        throw std::runtime_error(std::to_string(hi) + " exceeds maximum code point.");
     469    }
     470
     471    // Create a temporary run and quad set for the given range
     472    std::vector<run_t> runs;
     473    std::vector<bitquad_t> quads;
     474
     475    codepoint_t lo_index = lo / QUAD_BITS;
     476    codepoint_t hi_index = hi / QUAD_BITS;
     477
     478    auto ri = mRuns.cbegin();
     479    auto qi = mQuads.cbegin();
     480
     481    length_t length = 0;
     482    run_type_t type = Empty;
     483    // Advance past any full runs prior to the lo_index
     484    for (;;) {
     485        std::tie(type, length) = *ri;
     486        if (lo_index < length) {
     487            break;
     488        }
     489        if (type == Mixed) {
     490            qi += length;
     491        }
     492        lo_index -= length;
     493        hi_index -= length;
     494        ++ri;
     495    }
     496
     497    // Now record the runs and any quads prior to lo_index
     498    runs.assign(mRuns.cbegin(), ri);
     499    if (lo_index) {
     500        runs.push_back(std::make_pair(type, lo_index));
     501        if (type == Mixed) {
     502            qi += lo_index;
     503        }
     504        hi_index -= lo_index;
     505        length -= lo_index;
     506    }
     507
     508    quads.assign(mQuads.cbegin(), qi);
     509    // We now have everything up to the first quad added to our temporary buffers; merge in the first new quad.
     510    bitquad_t lo_quad = (FULL_QUAD_MASK << (lo & MOD_QUAD_BIT_MASK));
     511    bitquad_t hi_quad = (FULL_QUAD_MASK >> (QUAD_BITS - 1 - (hi & MOD_QUAD_BIT_MASK)));
     512
     513    // If our original quad is full, just append a Full run
     514    if (LLVM_UNLIKELY(type == Full)) {
     515        append_run(Full, 1, runs);
     516    } else { // Otherwise merge the new quad value with the original one
     517        if (hi_index == 0) {
     518            lo_quad &= hi_quad;
     519        }
     520        if (type == Mixed) {
     521            lo_quad |= *qi++;
     522        }
     523        append_quad(lo_quad, quads, runs);
     524    }
     525    --length;
     526
     527    // Now check if we need to write out any Full blocks between the lo and hi code points; adjust our position
     528    // in the original quad to suit.
     529    if (hi_index) {
     530        // Add in any full runs between the lo and hi quads
     531        append_run(Full, hi_index - 1, runs);
     532        // Advance past original quads that were filled in
     533        for (;;) {
     534            if (type == Mixed) {
     535                qi += length;
     536            }
     537            std::tie(type, length) = *++ri;
     538            if (hi_index < length) {
     539                break;
     540            }
     541            hi_index -= length;
     542        }
     543
     544        // Write out the hi_quad value
     545        if (LLVM_UNLIKELY(type == Full)) {
     546            append_run(Full, 1, runs);
     547        } else {
     548            if (type == Mixed) {
     549                qi += hi_index;
     550                hi_quad |= *qi++;
     551            }
     552            append_quad(hi_quad, quads, runs);
     553        }
     554    }
     555
     556    // And append any remaining values from the original data
     557    append_run(type, length - hi_index, runs);
     558    runs.insert(runs.end(), ++ri, mRuns.cend());
     559    quads.insert(quads.end(), qi, mQuads.cend());
     560
     561    assert (verify(runs, quads));
     562
     563    mRuns.assign(runs.cbegin(), runs.cend());
     564    mQuads.assign(quads.cbegin(), quads.cend());
     565}
    487566
    488567/** ------------------------------------------------------------------------------------------------------------- *
     
    492571 * Return whether this UnicodeSet contains the specified code point
    493572 ** ------------------------------------------------------------------------------------------------------------- */
    494 bool UnicodeSet::contains(const codepoint_t codepoint) const {
    495     auto n = codepoint / QUAD_BITS;
    496     auto qi = mQuads.cbegin();
    497     for (const auto & r : mRuns) {
    498         if (lengthOf(r) >= n) {
    499             if (typeOf(r) == Mixed) {
    500                 qi += n - 1;
    501                 return (*qi & (static_cast<bitquad_t>(1) << (codepoint & MOD_QUAD_BIT_MASK))) != 0;
    502             }
    503             return (typeOf(r) == Full);
    504         }
    505         if (typeOf(r) == Mixed) {
    506             qi += lengthOf(r);
     573bool UnicodeSet::contains(const codepoint_t cp) const {
     574    codepoint_t offset = cp / QUAD_BITS;
     575    auto quad = mQuads.cbegin();
     576    for (const auto r : mRuns) {
     577        length_t l = 0;
     578        run_type_t t = Empty;
     579        std::tie(t, l) = r;
     580        if (offset < l) {
     581            if (t == Mixed) {
     582                quad += offset;
     583                return (*quad & static_cast<bitquad_t>(1) << (cp & MOD_QUAD_BIT_MASK)) != 0;
     584            }
     585            return (t == Full);
     586        }
     587        if (t == Mixed) {
     588            quad += l;
    507589        }       
    508         n -= lengthOf(r);
     590        offset -= l;
    509591    }
    510592    return false;
     
    659741 * @brief Empty Set Constructor
    660742 ** ------------------------------------------------------------------------------------------------------------- */
    661 UnicodeSet::UnicodeSet() : mRuns({{{Empty, UNICODE_QUAD_COUNT}}}) { }
     743UnicodeSet::UnicodeSet()
     744: mRuns({{{Empty, UNICODE_QUAD_COUNT}}}, reinterpret_cast<RunAllocator &>(mAllocator))
     745, mQuads(reinterpret_cast<QuadAllocator &>(mAllocator))
     746{
     747    assert (verify(mRuns, mQuads));
     748}
    662749
    663750/** ------------------------------------------------------------------------------------------------------------- *
    664751 * @brief Singleton Set Constructor
    665752 ** ------------------------------------------------------------------------------------------------------------- */
    666 UnicodeSet::UnicodeSet(const codepoint_t codepoint) {
     753UnicodeSet::UnicodeSet(const codepoint_t codepoint)
     754: mRuns(reinterpret_cast<RunAllocator &>(mAllocator))
     755, mQuads(reinterpret_cast<QuadAllocator &>(mAllocator))
     756{
    667757    const codepoint_t quad_no = codepoint / QUAD_BITS;
    668     if (quad_no > 0) {
    669         append_run(Empty, quad_no, mRuns);
    670     }
     758    append_run(Empty, quad_no, mRuns);
    671759    append_quad(static_cast<bitquad_t>(1) << (codepoint & MOD_QUAD_BIT_MASK), mQuads, mRuns);
    672     if (quad_no < UNICODE_QUAD_COUNT - 1) {
    673         append_run(Empty, UNICODE_QUAD_COUNT - (quad_no + 1), mRuns);
    674     }
    675     assert (runLengthSumsUpToUnicodeQuadCount(mRuns));
     760    append_run(Empty, UNICODE_QUAD_COUNT - (quad_no + 1), mRuns);
     761    assert (verify(mRuns, mQuads));
    676762}
    677763
     
    679765 * @brief Range Set Constructor
    680766 ** ------------------------------------------------------------------------------------------------------------- */
    681 UnicodeSet::UnicodeSet(const codepoint_t lo_codepoint, const codepoint_t hi_codepoint) {
    682     const codepoint_t lo_quad_no = lo_codepoint / QUAD_BITS;
    683     const codepoint_t hi_quad_no = hi_codepoint / QUAD_BITS;
    684     const codepoint_t lo_offset = lo_codepoint & MOD_QUAD_BIT_MASK;
    685     const codepoint_t hi_offset = hi_codepoint & MOD_QUAD_BIT_MASK;
     767UnicodeSet::UnicodeSet(const codepoint_t lo, const codepoint_t hi)
     768: mRuns(reinterpret_cast<RunAllocator &>(mAllocator))
     769, mQuads(reinterpret_cast<QuadAllocator &>(mAllocator))
     770{
     771    const codepoint_t lo_index = lo / QUAD_BITS;
     772    const codepoint_t hi_index = hi / QUAD_BITS;
     773    const codepoint_t lo_offset = lo & MOD_QUAD_BIT_MASK;
     774    const codepoint_t hi_offset = hi & MOD_QUAD_BIT_MASK;
    686775    const bitquad_t lo_quad = (FULL_QUAD_MASK << lo_offset);
    687776    const bitquad_t hi_quad = (FULL_QUAD_MASK >> (QUAD_BITS - 1 - hi_offset));
    688     append_run(Empty, lo_quad_no, mRuns);
    689     if (lo_quad_no == hi_quad_no) {       
    690         append_quad(lo_quad & hi_quad, mQuads, mRuns);
    691     }
    692     else {
     777    append_run(Empty, lo_index, mRuns);
     778    bitquad_t mask = hi_quad;
     779    if (lo_index == hi_index) {
     780        mask &= lo_quad;
     781    } else {
    693782        append_quad(lo_quad, mQuads, mRuns);
    694         append_run(Full, hi_quad_no - (lo_quad_no + 1), mRuns);
    695         append_quad(hi_quad, mQuads, mRuns);
    696     }
    697     if (hi_quad_no < UNICODE_QUAD_COUNT - 1) {
    698         append_run(Empty, UNICODE_QUAD_COUNT - (hi_quad_no + 1), mRuns);
    699     }
    700     assert (runLengthSumsUpToUnicodeQuadCount(mRuns));
    701 }
    702 
    703 }
     783        append_run(Full, hi_index - (lo_index + 1), mRuns);
     784    }
     785    append_quad(mask, mQuads, mRuns);
     786    append_run(Empty, UNICODE_QUAD_COUNT - (hi_index + 1), mRuns);
     787    assert (verify(mRuns, mQuads));
     788}
     789
     790/** ------------------------------------------------------------------------------------------------------------- *
     791 * @brief Range List Set Constructor
     792 ** ------------------------------------------------------------------------------------------------------------- */
     793template <typename itr>
     794void convertIntervalRangesToSparseSet(const itr begin, const itr end, UnicodeSet::RunVector & mRuns, UnicodeSet::QuadVector & mQuads) {
     795    assert (std::is_sorted(begin, end, [](const interval_t l, const interval_t r) {
     796        assert (l.first <= l.second);
     797        assert (l.second < UNICODE_MAX);
     798        assert (r.first <= r.second);
     799        assert (r.second < UNICODE_MAX);
     800        return l.second < r.first;
     801    }));
     802
     803    std::vector<run_t> runs;
     804    std::vector<bitquad_t> quads;
     805
     806    codepoint_t prior_index = 0;
     807    bitquad_t mask = 0;
     808    for (auto interval = begin; interval != end; ) {
     809        codepoint_t lo, hi;
     810        std::tie(lo, hi) = *interval;
     811        const codepoint_t lo_index = (lo / QUAD_BITS);
     812        const codepoint_t hi_index = (hi / QUAD_BITS);
     813        const codepoint_t lo_offset = lo & MOD_QUAD_BIT_MASK;
     814        const codepoint_t hi_offset = hi & MOD_QUAD_BIT_MASK;
     815        const bitquad_t lo_quad = (FULL_QUAD_MASK << lo_offset);
     816        const bitquad_t hi_quad = (FULL_QUAD_MASK >> (QUAD_BITS - 1 - hi_offset));
     817        append_run(Empty, lo_index - prior_index, runs);
     818        if (lo_index == hi_index) {
     819            mask |= (lo_quad & hi_quad);
     820        } else {
     821            append_quad(mask | lo_quad, quads, runs);
     822            append_run(Full, hi_index - (lo_index + 1), runs);
     823            mask = hi_quad;
     824        }
     825        // if the next interval is in our current quad, continue to the next range
     826        prior_index = hi_index;
     827        if (LLVM_LIKELY(++interval != end)) {
     828            if (hi_index == (interval->first / QUAD_BITS)) {
     829                continue;
     830            }
     831        }
     832        append_quad(mask, quads, runs);
     833        mask = 0;
     834        ++prior_index;
     835    }
     836    assert (mask == 0);
     837    if (prior_index < UNICODE_QUAD_COUNT) {
     838        append_run(Empty, UNICODE_QUAD_COUNT - prior_index, runs);
     839    }   
     840    assert (verify(runs, quads));
     841    mRuns.assign(runs.begin(), runs.end());
     842    mQuads.assign(quads.begin(), quads.end());
     843}
     844
     845/** ------------------------------------------------------------------------------------------------------------- *
     846 * @brief Interval Range Constructor
     847 ** ------------------------------------------------------------------------------------------------------------- */
     848UnicodeSet::UnicodeSet(std::initializer_list<interval_t>::iterator begin, std::initializer_list<interval_t>::iterator end)
     849: mRuns(0, {Empty, 0}, reinterpret_cast<RunAllocator &>(mAllocator))
     850, mQuads(0, 0, reinterpret_cast<QuadAllocator &>(mAllocator))
     851{
     852    convertIntervalRangesToSparseSet(begin, end, mRuns, mQuads);
     853}
     854
     855/** ------------------------------------------------------------------------------------------------------------- *
     856 * @brief Interval Range Constructor
     857 ** ------------------------------------------------------------------------------------------------------------- */
     858UnicodeSet::UnicodeSet(const std::vector<interval_t>::iterator begin, const std::vector<interval_t>::iterator end)
     859: mRuns(0, {Empty, 0}, reinterpret_cast<RunAllocator &>(mAllocator))
     860, mQuads(0, 0, reinterpret_cast<QuadAllocator &>(mAllocator))
     861{
     862    convertIntervalRangesToSparseSet(begin, end, mRuns, mQuads);
     863}
     864
     865/** ------------------------------------------------------------------------------------------------------------- *
     866 * @brief Copy Constructor
     867 ** ------------------------------------------------------------------------------------------------------------- */
     868UnicodeSet::UnicodeSet(const UnicodeSet & other)
     869: mRuns(other.mRuns, reinterpret_cast<RunAllocator &>(mAllocator))
     870, mQuads(other.mQuads, reinterpret_cast<QuadAllocator &>(mAllocator))
     871{
     872    assert (verify(mRuns, mQuads));
     873}
     874
     875/** ------------------------------------------------------------------------------------------------------------- *
     876 * @brief Initializer Constructor
     877 ** ------------------------------------------------------------------------------------------------------------- */
     878UnicodeSet::UnicodeSet(std::initializer_list<run_t> r, std::initializer_list<bitquad_t> q)
     879: mRuns(r.begin(), r.end(), reinterpret_cast<RunAllocator &>(mAllocator))
     880, mQuads(q.begin(), q.end(), reinterpret_cast<QuadAllocator &>(mAllocator))
     881{
     882    assert (verify(mRuns, mQuads));
     883}
     884
     885/** ------------------------------------------------------------------------------------------------------------- *
     886 * @brief Internal Vector Constructor
     887 ** ------------------------------------------------------------------------------------------------------------- */
     888inline UnicodeSet::UnicodeSet(std::vector<run_t> && r, std::vector<bitquad_t> && q)
     889: mRuns(r.begin(), r.end(), reinterpret_cast<RunAllocator &>(mAllocator))
     890, mQuads(q.begin(), q.end(), reinterpret_cast<QuadAllocator &>(mAllocator))
     891{
     892    assert (verify(mRuns, mQuads));
     893}
     894
     895}
  • icGREP/icgrep-devel/icgrep/UCD/unicode_set.h

    r4818 r4860  
    44#include <vector>
    55#include <boost/iterator/iterator_facade.hpp>
     6#include <slab_allocator.h>
    67
    78//
     
    4849    using interval_t = std::pair<codepoint_t, codepoint_t>;
    4950
    50     using RunVector = std::vector<run_t>;
    51     using QuadVector = std::vector<bitquad_t>;
     51    using Allocator = SlabAllocator<uint32_t>;
     52    using RunAllocator = Allocator::rebind<run_t>::other;
     53    using QuadAllocator = Allocator::rebind<bitquad_t>::other;
     54
     55    using RunVector = std::vector<run_t, RunAllocator>;
     56    using QuadVector = std::vector<bitquad_t, QuadAllocator>;
     57    using RunIterator = RunVector::const_iterator;
     58    using QuadIterator = QuadVector::const_iterator;
    5259
    5360    using size_type = RunVector::size_type;
     
    7885        }
    7986    private:
    80         RunVector::const_iterator           mRunIterator;
    81         QuadVector::const_iterator          mQuadIterator;
    82         unsigned                            mMixedRunIndex;
    83         bitquad_t                           mQuadOffset;
    84         codepoint_t                         mBaseCodePoint;
    85         codepoint_t                         mMinCodePoint;
    86         codepoint_t                         mMaxCodePoint;
     87        RunIterator         mRunIterator;
     88        QuadIterator        mQuadIterator;
     89        unsigned            mMixedRunIndex;
     90        bitquad_t           mQuadOffset;
     91        codepoint_t         mBaseCodePoint;
     92        codepoint_t         mMinCodePoint;
     93        codepoint_t         mMaxCodePoint;
    8794    };
    8895
     
    96103    }
    97104
    98     class quad_iterator : public boost::iterator_facade<quad_iterator, quad_iterator_return_t, boost::random_access_traversal_tag, quad_iterator_return_t> {
    99         friend class UnicodeSet;
    100         friend class boost::iterator_core_access;
    101     public:
    102         quad_iterator(RunVector::const_iterator runIterator, QuadVector::const_iterator quadIterator)
    103             : mRunIterator(runIterator), mQuadIterator(quadIterator), mOffset(0) {}
    104 
    105         void advance(unsigned n);
    106 
    107         inline quad_iterator_return_t dereference() const {
    108             return std::make_pair(std::make_pair(type(), length()), quad());
    109         }
    110 
    111         inline void increment() {
    112             advance(1);
    113         }
    114 
    115         inline run_type_t type() const {
    116             return mRunIterator->first;
    117         }
    118 
    119         inline length_t length() const {
    120             return mRunIterator->second - mOffset;
    121         }
    122 
    123         inline bitquad_t quad() const {
    124             return *mQuadIterator;
    125         }
    126 
    127         inline bool equal(const quad_iterator & other) const {
    128             return (mRunIterator == other.mRunIterator) && (mQuadIterator == other.mQuadIterator);
    129         }
    130 
    131     private:
    132         RunVector::const_iterator   mRunIterator;
    133         QuadVector::const_iterator  mQuadIterator;
    134         unsigned                    mOffset;
    135     };
    136 
    137     inline quad_iterator quad_begin() const {
    138         return quad_iterator(mRuns.cbegin(), mQuads.cbegin());
    139     }
    140 
    141     inline quad_iterator quad_end() const {
    142         return quad_iterator(mRuns.cend(), mQuads.cend());
    143     }
    144 
    145105    bool contains(const codepoint_t codepoint) const;
    146106
    147107    bool intersects(const codepoint_t lo, const codepoint_t hi) const;
    148108
    149     inline void insert(const codepoint_t cp) {
    150         *this = std::move(*this + UnicodeSet(cp));
    151     }
    152 
    153     inline void insert_range(const codepoint_t lo, const codepoint_t hi) {
    154         *this = std::move(*this + UnicodeSet(lo, hi));
    155     }
     109    void insert(const codepoint_t cp);
     110
     111    void insert_range(const codepoint_t lo, const codepoint_t hi);
    156112
    157113    bool empty() const;
     
    178134    UnicodeSet();
    179135    UnicodeSet(const codepoint_t codepoint);
    180     UnicodeSet(const codepoint_t lo_codepoint, const codepoint_t hi_codepoint);
    181     inline UnicodeSet(const UnicodeSet & other) = default;
    182     UnicodeSet(std::initializer_list<run_t> r, std::initializer_list<bitquad_t> q) : mRuns(r), mQuads(q) {}
    183     UnicodeSet(std::vector<run_t> && r, std::vector<bitquad_t> && q) : mRuns(r), mQuads(q) {}
     136    UnicodeSet(const codepoint_t lo, const codepoint_t hi);
     137    UnicodeSet(const UnicodeSet & other);
     138    UnicodeSet(std::initializer_list<run_t> r, std::initializer_list<bitquad_t> q);
     139    UnicodeSet(std::initializer_list<interval_t>::iterator begin, std::initializer_list<interval_t>::iterator end);
     140    UnicodeSet(const std::vector<interval_t>::iterator begin, const std::vector<interval_t>::iterator end);
    184141
    185142    inline void swap(UnicodeSet & other);
    186143    inline void swap(UnicodeSet && other);
    187144
     145protected:
     146
     147    UnicodeSet(std::vector<run_t> && r, std::vector<bitquad_t> && q);
     148
     149    class quad_iterator : public boost::iterator_facade<quad_iterator, quad_iterator_return_t, boost::random_access_traversal_tag, quad_iterator_return_t> {
     150        friend class UnicodeSet;
     151        friend class boost::iterator_core_access;
     152    public:
     153        quad_iterator(RunIterator runIterator, QuadIterator quadIterator)
     154            : mRunIterator(runIterator), mQuadIterator(quadIterator), mOffset(0) {}
     155
     156        void advance(unsigned n);
     157
     158        inline quad_iterator_return_t dereference() const {
     159            return std::make_pair(std::make_pair(type(), length()), quad());
     160        }
     161
     162        inline void increment() {
     163            advance(1);
     164        }
     165
     166        inline run_type_t type() const {
     167            return mRunIterator->first;
     168        }
     169
     170        inline length_t length() const {
     171            return mRunIterator->second - mOffset;
     172        }
     173
     174        inline bitquad_t quad() const {
     175            return *mQuadIterator;
     176        }
     177
     178        inline bool equal(const quad_iterator & other) const {
     179            return (mRunIterator == other.mRunIterator) && (mQuadIterator == other.mQuadIterator);
     180        }
     181
     182    private:
     183        RunIterator     mRunIterator;
     184        QuadIterator    mQuadIterator;
     185        unsigned        mOffset;
     186    };
     187
     188    inline quad_iterator quad_begin() const {
     189        return quad_iterator(mRuns.cbegin(), mQuads.cbegin());
     190    }
     191
     192    inline quad_iterator quad_end() const {
     193        return quad_iterator(mRuns.cend(), mQuads.cend());
     194    }
     195
    188196private:
    189197
    190     RunVector       mRuns;
    191     QuadVector      mQuads;
     198    RunVector           mRuns;
     199    QuadVector          mQuads;
     200    static Allocator    mAllocator;
    192201};
    193202
Note: See TracChangeset for help on using the changeset viewer.