source: icGREP/icgrep-devel/icgrep/UCD/unicode_set.h @ 4611

Last change on this file since 4611 was 4611, checked in by nmedfort, 4 years ago

Temporary check-in

File size: 2.9 KB
Line 
1#ifndef UNICODE_SET_H
2#define UNICODE_SET_H
3#include <stdint.h>
4#include <vector>
5#include <ostream>
6
7//
8// unicode_set.h - representing and manipulating sets of Unicode
9// characters, based on data from UCD - the Unicode Character Database
10//
11// Robert D. Cameron
12// September 18, 2014
13//
14// Licensed under Open Software License 3.0.
15//
16// Unicode Sparse Bitset Representation
17//
18// The Unicode Sparse Bitset representation is based on
19// (a) Dividing the Unicode codepoint space into groups of 2^k codepoints called quads.
20// (b) Specifying the quads using a run-length encoding, in which each run
21//     is Empty (quads contain no members), Mixed (quads contain some members and
22//     some nonmembers) or Full (all codepoints in each quad are members of the set).
23// (c) Explicitly listing all the quads of Mixed type.
24//
25
26//
27// The internal datatype for quads - bitsets of 2^k codepoints.
28// Default: 32 codepoints (k=5), because bitquad_t is a 32-bit unsigned integer.
29//
// Storage word for one quad: one bit per codepoint, so a quad covers
// 8 * sizeof(bitquad_t) consecutive codepoints (32 with the uint32_t alias below).
using bitquad_t = uint32_t;

// Number of codepoints (bits) held by a single quad.
constexpr size_t quad_bits = sizeof(bitquad_t) * 8;

// quad_bits - 1; presumably used to reduce a codepoint to its bit position
// inside a quad (valid as a mask because quad_bits is a power of two).
constexpr size_t mod_quad_bit_mask = quad_bits - 1;

// Number of quads needed to span the 0x110000 codepoints of the Unicode space.
constexpr size_t UnicodeQuadCount = 0x110000 / quad_bits;

// A quad with every bit set.
constexpr bitquad_t FullQuadMask = static_cast<bitquad_t>(-1);
36
// Kind of a run of consecutive quads in the run-length encoding:
//   Empty - the quads contain no members of the set,
//   Mixed - the quads contain some members and some nonmembers,
//   Full  - every codepoint of each quad is a member.
// Stored as a 16-bit value.
enum run_type_t : uint16_t {
    Empty = 0,
    Mixed = 1,
    Full = 2
};
39
40struct RunStructure {
41  RunStructure(run_type_t r, uint16_t lgth) : run_type(r), run_length(lgth) {}
42  run_type_t run_type;
43  uint16_t run_length;
44};
45
46class UnicodeSet {
47friend class Uset_Iterator;
48public:
49//
50//  The internal fields for a UnicodeSet.
51    std::vector<RunStructure> runs;
52    std::vector<bitquad_t> quads;
53    int quad_count;
54   
55// 
56//  Internal helper functions
57    void append_run(run_type_t run_type, int run_length);
58    void append_quad(bitquad_t q);
59//
60//  Nullary constructor for incremental building.
61    UnicodeSet() : runs(std::vector<RunStructure>()), quads(std::vector<bitquad_t>()), quad_count(0) {}
62//
63//  Ternary constructor for constant construction using precomputed data.
64    UnicodeSet(std::initializer_list<RunStructure> r, std::initializer_list<bitquad_t> q, int c) : runs(r), quads(q), quad_count(c) {}
65};
66
67void Dump_uset(UnicodeSet s);
68UnicodeSet empty_uset();
69UnicodeSet singleton_uset(int codepoint);
70UnicodeSet range_uset(int lo_codepoint, int hi_codepoint);
71UnicodeSet uset_complement (const UnicodeSet &s);
72UnicodeSet uset_union(const UnicodeSet & s1, const UnicodeSet & s2);
73UnicodeSet uset_intersection(const UnicodeSet &s1, const UnicodeSet &s2);
74UnicodeSet uset_difference(const UnicodeSet &s1, const UnicodeSet &s2);
75UnicodeSet uset_symmetric_difference(const UnicodeSet & s1, const UnicodeSet & s2);
76bool uset_member(const UnicodeSet & s, int codepoint);
77
78class Uset_Iterator {
79public:
80    Uset_Iterator(const UnicodeSet & s) : uSet(s), run_no(0), offset(0), quad_no(0) {}
81    bool at_end();
82    RunStructure current_run();
83    bitquad_t get_quad();
84    void advance(int n);
85private:
86    const UnicodeSet & uSet;
87    int run_no;
88    int offset;
89    int quad_no;
90};
91
92#endif
93
Note: See TracBrowser for help on using the repository browser.