Ignore:
Timestamp:
Oct 1, 2015, 2:54:17 PM (4 years ago)
Author:
nmedfort
Message:

Embedded UnicodeSet into CC objects (will currently cause memory leak)

Location:
icGREP/icgrep-devel/icgrep/re
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/re/re_cc.cpp

    r4797 r4812  
    1111
    1212namespace re {
    13 CC::IntervalAllocator CC::mCharSetAllocator;
    1413
    1514CC::CC(const CC * cc1, const CC * cc2)
    1615: RE(ClassTypeId::CC)
    17 , mSparseCharSet(cc1->cbegin(), cc1->cend(), mCharSetAllocator) {
    18     for (const interval_t & i : cc2->mSparseCharSet) {
    19         insert_range(lo_codepoint(i), hi_codepoint(i));
    20     }
     16, mSparseCharSet(std::move(cc1->mSparseCharSet + cc2->mSparseCharSet)) {
     17
    2118}
    2219
    2320CC::CC(const CC & cc)
    2421: RE(ClassTypeId::CC)
    25 , mSparseCharSet(cc.cbegin(), cc.cend(), mCharSetAllocator) {
     22, mSparseCharSet(cc.mSparseCharSet) {
    2623
    2724}
     
    4946}
    5047
    51 void CC::insert_range(const codepoint_t lo, const codepoint_t hi) {
    52     for (auto i = mSparseCharSet.begin(); i != mSparseCharSet.end(); ) {
    53         if (hi < lo_codepoint(i) - 1) {
    54             mSparseCharSet.emplace(i, lo, hi);
    55             return;
    56         } else if (lo > hi_codepoint(i) + 1) {
    57             ++i;
    58         } else {
    59             // ranges overlap; expand the range to include the overlapp
    60             lo_codepoint(i) = std::min(lo_codepoint(i), lo);
    61             hi_codepoint(i) = std::max(hi_codepoint(i), hi);
    62             // Test whether the new hi code point of this range touches the subsequent
    63             // interval. If so extend it over that one and remove it from the list.
    64             for (auto j = i + 1; j != mSparseCharSet.end(); ) {
    65                 if (LLVM_LIKELY(hi_codepoint(i) + 1 < lo_codepoint(j))) {
    66                     break;
    67                 }
    68                 hi_codepoint(i) = std::max(hi_codepoint(i), hi_codepoint(j));
    69                 j = mSparseCharSet.erase(j);
    70             }
    71             return;
    72         }
    73     }
    74     mSparseCharSet.emplace_back(lo, hi);
    75 }
    76 
    77 void CC::remove_range(const codepoint_t lo, const codepoint_t hi) {
    78     for (auto i = mSparseCharSet.begin(); i != mSparseCharSet.end(); ) {
    79         if (lo > hi_codepoint(i) + 1) {
    80             ++i;
    81         }
    82         else if (hi < lo_codepoint(i) - 1) {
    83             break;
    84         }
    85         else if (lo <= lo_codepoint(i) && hi >= hi_codepoint(i)) {
    86             i = mSparseCharSet.erase(i);
    87         }
    88         else if (lo <= lo_codepoint(i)) {
    89             lo_codepoint(i) = hi + 1;
    90             break;
    91         }
    92         else if (hi >= hi_codepoint(i)) {
    93             hi_codepoint(i) = lo - 1;
    94             ++i;
    95         }
    96         else {         
    97             mSparseCharSet.emplace(++i, hi + 1, hi_codepoint(i));
    98             hi_codepoint(i) = lo - 1;
    99             break;
    100         }
    101     }
    102 }
    103 
    10448CC * subtractCC(const CC * a, const CC * b) {
    105     CC * diff = makeCC();
    106     auto i = a->cbegin();
    107     const auto i_end = a->cend();
    108     auto j = b->cbegin();
    109     const auto j_end = b->cend();
    110     while (i != i_end && j != j_end) {
    111         if (hi_codepoint(j) < lo_codepoint(i)) {
    112             ++j;
    113         }
    114         else { // test whether the intervals overlap
    115             if (lo_codepoint(i) < lo_codepoint(j)) {
    116                 diff->insert_range(lo_codepoint(i), std::min(lo_codepoint(j) - 1, hi_codepoint(i)));
    117             }
    118             if (hi_codepoint(i) > hi_codepoint(j)) {
    119                 diff->insert_range(std::max(hi_codepoint(j) + 1, lo_codepoint(i)), hi_codepoint(i));
    120             }
    121             ++i;
    122         }
    123     }
    124     for (; i != i_end; ++i) {
    125         diff->insert_range(lo_codepoint(i), hi_codepoint(i));
    126     }
    127     return diff;
     49    return makeCC(a->mSparseCharSet - b->mSparseCharSet);
    12850}
    12951   
    13052CC * intersectCC(const CC * a, const CC * b) {
    131     CC * isect = makeCC();
    132     auto ai = a->cbegin();
    133     const auto ai_end = a->cend();
    134     auto bi = b->cbegin();
    135     const auto bi_end = b->cend();
    136     while (ai != ai_end && bi != bi_end) {
    137         if (hi_codepoint(ai) < lo_codepoint(bi)) {
    138             ++ai;
    139         }
    140         else if (hi_codepoint(bi) < lo_codepoint(ai)) {
    141             ++bi;
    142         }
    143         else {
    144             isect->insert_range(std::max(lo_codepoint(ai), lo_codepoint(bi)), std::min(hi_codepoint(ai), hi_codepoint(bi)));
    145             if (hi_codepoint(ai) < hi_codepoint(bi)) {
    146                 ++ai;
    147             }
    148             else {
    149                 ++bi;
    150             }
    151         }
    152     }
    153     return isect;
     53    return makeCC(a->mSparseCharSet & b->mSparseCharSet);
    15454}
    15555   
  • icGREP/icgrep-devel/icgrep/re/re_cc.h

    r4621 r4812  
    1212#include <string>
    1313#include <vector>
     14#include <UCD/unicode_set.h>
    1415#include <slab_allocator.h>
    1516
    1617namespace re {
    1718
    18 using codepoint_t = unsigned;
    19 using interval_t = std::pair<codepoint_t, codepoint_t>;
     19using codepoint_t = UCD::UnicodeSet::codepoint_t;
     20using interval_t = UCD::UnicodeSet::interval_t;
    2021
    2122enum CC_type {UnicodeClass, ByteClass};
     
    3132    }
    3233
    33     using IntervalAllocator = SlabAllocator<interval_t>;
    34     using IntervalVector = std::vector<interval_t, IntervalAllocator>;
    35 
    36     using iterator = IntervalVector::iterator;
    37     using const_iterator = IntervalVector::const_iterator;
    38     using size_type = IntervalVector::size_type;
    39     using reference = IntervalVector::reference;
    40     using const_reference = IntervalVector::const_reference;
    41 
    42     static const codepoint_t UNICODE_MAX = 0x10FFFF;
     34    using iterator = UCD::UnicodeSet::iterator;
     35    using size_type = UCD::UnicodeSet::size_type;
    4336
    4437    std::string canonicalName(const CC_type type) const;
    4538
    46     interval_t & operator [](unsigned i) {
    47         return mSparseCharSet[i];
    48     }
    49 
    50     const interval_t & operator [](unsigned i) const {
    51         return mSparseCharSet[i];
    52     }
    53 
    5439    inline codepoint_t min_codepoint() const {
    55         return empty() ? 0 : std::get<0>(front());
     40        return mSparseCharSet.front().first;
    5641    }
    5742
    5843    inline codepoint_t max_codepoint() const {
    59         return empty() ? 0 : std::get<1>(back());
     44        return mSparseCharSet.back().second;
    6045    }
    6146
    62     void insert_range(const codepoint_t lo, const codepoint_t hi);
    63 
    64     void remove_range(const codepoint_t lo, const codepoint_t hi);
     47    void insert_range(const codepoint_t lo, const codepoint_t hi) {
     48        mSparseCharSet.insert_range(lo, hi);
     49    }
    6550
    6651    inline void insert(const codepoint_t codepoint) {
    67         insert_range(codepoint, codepoint);
     52        mSparseCharSet.insert(codepoint);
    6853    }
    6954
    70     inline void remove(const codepoint_t codepoint) {
    71         remove_range(codepoint, codepoint);
    72     }
    73 
    74     inline iterator begin() {
     55    inline iterator begin() const {
    7556        return mSparseCharSet.begin();
    7657    }
    7758
    78     inline iterator end() {
     59    inline iterator end() const {
    7960        return mSparseCharSet.end();
    8061    }
    8162
    82     inline reference front() {
     63    inline interval_t front() const {
    8364        return mSparseCharSet.front();
    8465    }
    8566
    86     inline reference back() {
    87         return mSparseCharSet.back();
    88     }
    89 
    90     inline const_iterator begin() const {
    91         return mSparseCharSet.cbegin();
    92     }
    93 
    94     inline const_iterator end() const {
    95         return mSparseCharSet.cend();
    96     }
    97 
    98     inline const_iterator cbegin() const {
    99         return mSparseCharSet.cbegin();
    100     }
    101 
    102     inline const_iterator cend() const {
    103         return mSparseCharSet.cend();
    104     }
    105 
    106     inline const_reference front() const {
    107         return mSparseCharSet.front();
    108     }
    109 
    110     inline const_reference back() const {
     67    inline interval_t back() const {
    11168        return mSparseCharSet.back();
    11269    }
     
    12986    friend CC * makeCC(const std::initializer_list<interval_t> list);
    13087    friend CC * makeCC(const std::vector<interval_t> & list);
    131     friend CC * subtractCC(const CC * cc1, const CC * cc2);
     88    friend CC * makeCC(UCD::UnicodeSet && set);
     89    friend CC * subtractCC(const CC * a, const CC * b);
     90    friend CC * intersectCC(const CC * a, const CC * b);
     91    friend CC * caseInsensitize(const CC * a, const CC * b);
    13292
    13393    inline CC()
    13494    : RE(ClassTypeId::CC)
    135     , mSparseCharSet(mCharSetAllocator) {
     95    , mSparseCharSet() {
    13696
    13797    }
     
    13999    inline CC(const codepoint_t codepoint)
    140100    : RE(ClassTypeId::CC)
    141     , mSparseCharSet(mCharSetAllocator) {
    142         insert(codepoint);
     101    , mSparseCharSet(codepoint) {
     102
    143103    }
    144104    inline CC(const codepoint_t lo_codepoint, const codepoint_t hi_codepoint)
    145105    : RE(ClassTypeId::CC)
    146     , mSparseCharSet(mCharSetAllocator) {
    147         insert_range(lo_codepoint, hi_codepoint);
     106    , mSparseCharSet(lo_codepoint, hi_codepoint) {
     107
    148108    }
    149109    CC(const CC * cc1, const CC * cc2);
     110
     111    inline CC(UCD::UnicodeSet && set)
     112    : RE(ClassTypeId::CC)
     113    , mSparseCharSet(std::move(set)) {
     114
     115    }
    150116
    151117    template <typename itr>
    152118    CC * initialize(itr begin, itr end);
    153119private:   
    154     IntervalVector mSparseCharSet;
    155     static IntervalAllocator mCharSetAllocator;
     120    UCD::UnicodeSet mSparseCharSet;
    156121};
    157122
    158 inline static CC::iterator begin(CC & cc) {
     123inline static CC::iterator begin(const CC & cc) {
    159124    return cc.begin();
    160125}
    161126
    162 inline static CC::iterator end(CC & cc) {
     127inline static CC::iterator end(const CC & cc) {
    163128    return cc.end();
    164129}
    165130
    166 inline static CC::const_iterator begin(const CC & cc) {
    167     return cc.cbegin();
    168 }
    169 
    170 inline static CC::const_iterator end(const CC & cc) {
    171     return cc.cend();
    172 }
    173 
    174 inline codepoint_t & lo_codepoint(interval_t & i) {
    175     return std::get<0>(i);
    176 }
    177131inline codepoint_t lo_codepoint(const interval_t & i) {
    178132    return std::get<0>(i);
    179133}
    180 inline codepoint_t & lo_codepoint(const CC::iterator i) {
    181     return lo_codepoint(*i);
    182 }
    183 inline codepoint_t lo_codepoint(const CC::const_iterator i) {
     134inline codepoint_t lo_codepoint(const CC::iterator i) {
    184135    return lo_codepoint(*i);
    185136}
    186137
    187 inline codepoint_t & hi_codepoint(interval_t & i) {
    188     return std::get<1>(i);
    189 }
    190138inline codepoint_t hi_codepoint(const interval_t & i) {
    191139    return std::get<1>(i);
    192140}
    193 inline codepoint_t & hi_codepoint(const CC::iterator i) {
    194     return hi_codepoint(*i);
    195 }
    196 inline codepoint_t hi_codepoint(const CC::const_iterator i) {
     141inline codepoint_t hi_codepoint(const CC::iterator i) {
    197142    return hi_codepoint(*i);
    198143}
     
    200145template<typename itr>
    201146CC * CC::initialize(itr begin, itr end) {
    202     mSparseCharSet.resize(std::distance(begin, end));
    203147    for (auto i = begin; i != end; ++i) {
    204         assert (i == begin || lo_codepoint(*i) > max_codepoint());
    205         mSparseCharSet[std::distance(begin, i)] = *i;
     148        mSparseCharSet.insert_range(i->first, i->second);
    206149    }
    207150    return this;
    208151}
    209 
    210152
    211153/**
     
    241183}
    242184
     185inline CC * makeCC(UCD::UnicodeSet && set) {
     186    return makeCC(std::move(set));
     187}
     188
    243189CC * subtractCC(const CC * a, const CC * b);
    244190   
  • icGREP/icgrep-devel/icgrep/re/re_parser.cpp

    r4809 r4812  
    475475    // It is an error if a 4-byte sequence is used to encode a codepoint
    476476    // above the Unicode maximum.
    477     if (cp > CC::UNICODE_MAX) {
     477    if (cp > UCD::UNICODE_MAX) {
    478478        throw InvalidUTF8Encoding();
    479479    }
     
    929929    }
    930930    if (count < mindigits) throw ParseFailure("Octal sequence has too few digits");
    931     if (value > CC::UNICODE_MAX) throw ParseFailure("Octal value too large");
     931    if (value > UCD::UNICODE_MAX) throw ParseFailure("Octal value too large");
    932932    return value;
    933933}
     
    948948    }
    949949    if (count < mindigits) throw ParseFailure("Hexadecimal sequence has too few digits");
    950     if (value > CC::UNICODE_MAX) throw ParseFailure("Hexadecimal value too large");
     950    if (value > UCD::UNICODE_MAX) throw ParseFailure("Hexadecimal value too large");
    951951    return value;
    952952}
     
    966966    if (fModeFlagSet & CASE_INSENSITIVE_MODE_FLAG) {
    967967        caseInsensitiveInsert(cc, cp);
    968     }
    969     else cc->insert(cp);
     968    } else {
     969        cc->insert(cp);
     970    }
    970971}
    971972
     
    973974    if (fModeFlagSet & CASE_INSENSITIVE_MODE_FLAG) {
    974975        caseInsensitiveInsertRange(cc, lo, hi);
    975     }
    976     else cc->insert_range(lo, hi);
     976    } else {
     977        cc->insert_range(lo, hi);
     978    }
    977979}
    978980
Note: See TracChangeset for help on using the changeset viewer.