source: icGREP/icgrep-devel/icgrep/UCD/unicode_set.h @ 4620

Last change on this file since 4620 was 4620, checked in by nmedfort, 4 years ago

More modifications to the UnicodeSet class. The default iterator now computes code point range intervals as expected by the UCD compiler.

File size: 5.8 KB
Line 
1#ifndef UNICODE_SET_H
2#define UNICODE_SET_H
#include <stdint.h>
#include <initializer_list>
#include <utility>
#include <vector>
#include <re/re_cc.h>
#include <boost/iterator/iterator_facade.hpp>
7
//
// unicode_set.h - representing and manipulating sets of Unicode
// characters, based on data from UCD - the Unicode Character Database
//
// Robert D. Cameron
// September 18, 2014
//
// Licensed under Open Software License 3.0.
//
// Unicode Sparse Bitset Representation
//
// The Unicode Sparse Bitset representation is based on
// (a) Dividing the Unicode codepoint space into groups of 2^k codepoints called quads.
// (b) Specifying the quads using a run-length encoding, in which each run
//     is Empty (quads contain no members), Mixed (quads contain some members and
//     some nonmembers) or Full (all codepoints in each quad are members of the set).
// (c) Explicitly listing all the quads of Mixed type.
//
26
//
// The internal datatype for quads - bitsets of 2^k codepoints.
// Default: 32 codepoints (k=5), the bit width of bitquad_t (uint32_t).
//
31
// Forward declaration of llvm::raw_ostream, which UnicodeSet::dump() takes by
// reference; this header includes no LLVM header itself.
namespace llvm {
class raw_ostream;
}
35
// Storage type for one quad: a 32-bit bitset.
using bitquad_t = uint32_t;
37
// The representation for runs: a run of quads is Empty (no members), Mixed
// (some members and some nonmembers) or Full (every codepoint is a member).
// Stored as a 16-bit value.
enum run_type_t : uint16_t {Empty, Mixed, Full};
40
41struct RunStructure {
42  RunStructure(run_type_t r, uint16_t lgth) : mType(r), mRunLength(lgth) {}
43  run_type_t mType;
44  uint16_t mRunLength;
45};
46
47class UnicodeSet {
48public:
49
50    using codepoint_t = re::codepoint_t;
51    using interval_t = re::interval_t;
52    using RunVector = std::vector<RunStructure>;
53    using QuadVector = std::vector<bitquad_t>;
54
55    class iterator : public boost::iterator_facade<iterator, interval_t, boost::forward_traversal_tag, interval_t> {
56        friend class UnicodeSet;
57        friend class boost::iterator_core_access;
58    protected:
59        iterator(RunVector::const_iterator runIterator, QuadVector::const_iterator quadIterator)
60        : mRunIterator(runIterator), mQuadIterator(quadIterator)
61        , mQuadOffset(0), mQuadPosition(0), mBaseCodePoint(0), mMinCodePoint(0), mMaxCodePoint(0)
62        {
63
64        }
65
66        void advance(const unsigned n);
67
68        re::interval_t dereference() const {
69            return std::make_pair(mMinCodePoint, mMaxCodePoint);
70        }
71
72        inline void increment() {
73            advance(1);
74        }
75
76        inline bool equal(iterator const & other) const {
77            return (mRunIterator == other.mRunIterator) && (mQuadIterator == other.mQuadIterator);
78        }
79    private:
80        RunVector::const_iterator           mRunIterator;
81        const RunVector::const_iterator     mRunEnd;
82        QuadVector::const_iterator          mQuadIterator;
83
84
85        bitquad_t                   mQuadOffset;
86        unsigned                    mQuadPosition;
87        unsigned                    mBaseCodePoint;
88        re::codepoint_t             mMinCodePoint;
89        re::codepoint_t             mMaxCodePoint;
90    };
91
92    inline iterator begin() const {
93        // note: pre-increment is intentional to move the iterator onto the first non-Empty interval.
94        return ++iterator(mRuns.cbegin(), mQuads.cbegin());
95    }
96
97    inline iterator end() const {
98        return iterator(mRuns.cend(), mQuads.cend());
99    }
100
101    class quad_iterator : public boost::iterator_facade<quad_iterator, std::pair<RunStructure, bitquad_t>, boost::random_access_traversal_tag> {
102        friend class UnicodeSet;
103        friend class boost::iterator_core_access;
104    public:
105        quad_iterator(RunVector::const_iterator runIterator, QuadVector::const_iterator quadIterator)
106            : mRunIterator(runIterator), mQuadIterator(quadIterator), mOffset(0) {}
107
108        void advance(unsigned n);
109
110        inline const std::pair<RunStructure, bitquad_t> dereference() const {
111            return std::make_pair(getRun(), getQuad());
112        }
113
114        inline void increment() {
115            advance(1);
116        }
117
118        inline RunStructure getRun() const {
119            const auto & t = *mRunIterator;
120            return RunStructure(t.mType, t.mRunLength - mOffset);
121        }
122
123        inline bitquad_t getQuad() const {
124            return *mQuadIterator;
125        }
126
127        inline bool equal(const quad_iterator & other) const {
128            return (mRunIterator == other.mRunIterator) && (mQuadIterator == other.mQuadIterator);
129        }
130
131    private:
132        RunVector::const_iterator   mRunIterator;
133        QuadVector::const_iterator  mQuadIterator;
134        unsigned                    mOffset;
135    };
136
137    inline quad_iterator quad_begin() const {
138        return quad_iterator(mRuns.cbegin(), mQuads.cbegin());
139    }
140
141    inline quad_iterator quad_end() const {
142        return quad_iterator(mRuns.cend(), mQuads.cend());
143    }
144
145    bool contains(const codepoint_t codepoint) const;
146
147    void dump(llvm::raw_ostream & out) const;
148
149    UnicodeSet complement() const;
150    UnicodeSet operator & (const UnicodeSet & other) const;
151    UnicodeSet operator + (const UnicodeSet & other) const;
152    UnicodeSet operator - (const UnicodeSet & other) const;
153    UnicodeSet operator ^ (const UnicodeSet & other) const;
154
155    UnicodeSet();
156    UnicodeSet(const codepoint_t codepoint);
157    UnicodeSet(const codepoint_t lo_codepoint, const codepoint_t hi_codepoint);
158    UnicodeSet(std::initializer_list<RunStructure> r, std::initializer_list<bitquad_t> q) : mRuns(r), mQuads(q) {}
159    UnicodeSet(std::vector<RunStructure> && r, std::vector<bitquad_t> && q) : mRuns(r), mQuads(q) {}
160
161private:
162
163    std::vector<RunStructure>   mRuns;
164    std::vector<bitquad_t>      mQuads;
165};
166
167inline UnicodeSet uset_complement(const UnicodeSet & s) {
168    return s.complement();
169}
170
171inline UnicodeSet uset_intersection(const UnicodeSet & s1, const UnicodeSet & s2) {
172    return s1 & s2;
173}
174
175inline UnicodeSet uset_union(const UnicodeSet & s1, const UnicodeSet & s2) {
176    return s1 + s2;
177}
178
179inline UnicodeSet uset_difference(const UnicodeSet & s1, const UnicodeSet & s2) {
180    return s1 - s2;
181}
182
183inline UnicodeSet uset_symmetric_difference(const UnicodeSet & s1, const UnicodeSet & s2) {
184    return s1 ^ s2;
185}
186
187#endif
188
Note: See TracBrowser for help on using the repository browser.