source: icGREP/icgrep-devel/icgrep/UCD/unicode_set.h @ 4616

Last change on this file since 4616 was 4616, checked in by nmedfort, 4 years ago

Replaced USet_Iterator with a standard C++ UnicodeSet iterator.

File size: 3.5 KB
Line 
1#ifndef UNICODE_SET_H
2#define UNICODE_SET_H
#include <stdint.h>
#include <initializer_list>
#include <utility>
#include <vector>
#include <boost/iterator/iterator_facade.hpp>
6
//
// unicode_set.h - representing and manipulating sets of Unicode
// characters, based on data from UCD - the Unicode Character Database
//
// Robert D. Cameron
// September 18, 2014
//
// Licensed under Open Software License 3.0.
//
// Unicode Sparse Bitset Representation
//
// The Unicode Sparse Bitset representation is based on
// (a) Dividing the Unicode codepoint space into groups of 2^k codepoints called quads.
// (b) Specifying the quads using a run-length encoding, in which each run
//     is Empty (quads contain no members), Mixed (quads contain some members and
//     some nonmembers) or Full (all codepoints in each quad are members of the set).
// (c) Explicitly listing all the quads of Mixed type.
//
25
//
// The internal datatype for quads - bitsets of 2^k codepoints.
// bitquad_t is 32 bits wide, i.e. one bit for each of 32 codepoints (k=5).
// NOTE(review): this comment previously advertised a default of 64
// codepoints (k=6), which the 32-bit typedef below does not match -
// confirm which of the two is intended.
//

typedef uint32_t bitquad_t;
32
// The representation for runs: how the quads within one run are populated.
//   Empty - the quads contain no members.
//   Mixed - the quads contain some members and some nonmembers.
//   Full  - all codepoints in each quad are members of the set.
// The values written out below are the same ones the enumerators would
// receive implicitly (0, 1, 2); the underlying type is 16 bits wide.
enum run_type_t : uint16_t {
    Empty = 0,
    Mixed = 1,
    Full  = 2
};
35
36struct RunStructure {
37  RunStructure(run_type_t r, uint16_t lgth) : mType(r), mRunLength(lgth) {}
38  run_type_t mType;
39  uint16_t mRunLength;
40};
41
42class UnicodeSet;
43
44class UnicodeSet {
45public:
46
47    class iterator : public boost::iterator_facade<iterator, const UnicodeSet &, boost::forward_traversal_tag, std::pair<RunStructure, bitquad_t>> {
48        friend class UnicodeSet;
49    public:
50        iterator(const UnicodeSet & set, unsigned runIndex) : mUnicodeSet(set), mRunIndex(runIndex), mOffset(0), mQuadIndex(0) {}
51    protected:
52        friend class boost::iterator_core_access;
53        void advance(unsigned n);
54        const std::pair<RunStructure, bitquad_t> dereference() const;
55        inline void increment() {
56            advance(1);
57        }
58        inline bool equal(iterator const& other) const {
59            return (mRunIndex == other.mRunIndex) && (&(mUnicodeSet) == &(other.mUnicodeSet)) && (mQuadIndex == other.mQuadIndex) && (mOffset == other.mOffset);
60        }
61    private:
62        const UnicodeSet &          mUnicodeSet;
63        unsigned                    mRunIndex;
64        unsigned                    mOffset;
65        unsigned                    mQuadIndex;
66    };
67
68    inline iterator begin() const {
69        return iterator(*this, 0);
70    }
71
72    inline iterator end() const {
73        return iterator(*this, runs.size());
74    }
75
76//
77//  The internal fields for a UnicodeSet.
78    std::vector<RunStructure>   runs;
79    std::vector<bitquad_t>      quads;
80    unsigned quad_count;
81   
82// 
83//  Internal helper functions
84    void append_run(run_type_t run_type, int run_length);
85    void append_quad(bitquad_t q);
86//
87//  Nullary constructor for incremental building.
88    UnicodeSet() : quad_count(0) {}
89//
90//  Ternary constructor for constant construction using precomputed data.
91    UnicodeSet(std::initializer_list<RunStructure> r, std::initializer_list<bitquad_t> q, unsigned c) : runs(r), quads(q), quad_count(c) {}
92};
93
94void Dump_uset(UnicodeSet s);
95UnicodeSet empty_uset();
96UnicodeSet singleton_uset(int codepoint);
97UnicodeSet range_uset(int lo_codepoint, int hi_codepoint);
98UnicodeSet uset_complement (const UnicodeSet &s);
99UnicodeSet uset_union(const UnicodeSet & s1, const UnicodeSet & s2);
100UnicodeSet uset_intersection(const UnicodeSet &s1, const UnicodeSet &s2);
101UnicodeSet uset_difference(const UnicodeSet &s1, const UnicodeSet &s2);
102UnicodeSet uset_symmetric_difference(const UnicodeSet & s1, const UnicodeSet & s2);
103bool uset_member(const UnicodeSet & s, int codepoint);
104
105#endif
106
Note: See TracBrowser for help on using the repository browser.