source: icGREP/icgrep-devel/icgrep/UCD/unicode_set.h @ 4532

Last change on this file since 4532 was 4189, checked in by cameron, 5 years ago

Unicode data files and sparse bitset representation

File size: 2.5 KB
Line 
1#ifndef UNICODE_SET_H
2#define UNICODE_SET_H
#include <stdint.h>
#include <ostream>
#include <utility>
#include <vector>
6//
7// unicode_set.h - representing and manipulating sets of Unicode
8// characters, based on data from UCD - the Unicode Character Database
9//
10// Robert D. Cameron
11// September 18, 2014
12//
13// Licensed under Open Software License 3.0.
14//
15// Unicode Sparse Bitset Representation
16//
17// The Unicode Sparse Bitset representation is based on
18// (a) Dividing the Unicode codepoint space into groups of 2^k codepoints called quads.
19// (b) Specifying the quads using a run-length encoding, in which each run
20//     is Empty (quads contain no members), Mixed (quads contain some members and
21//     some nonmembers) or Full (all codepoints in each quad are members of the set).
22// (c) Explicitly listing all the quads of Mixed type.
23//
24
//
// The internal datatype for quads - bitsets of 2^k codepoints.
// The quad width is derived from the type: with the current uint32_t
// representation each quad holds 32 codepoints (k=5).
//
typedef uint32_t bitquad_t;

// Number of codepoints (bits) represented by a single quad.
const size_t quad_bits = 8 * sizeof(bitquad_t);
// quad_bits - 1; this is a valid bit-position mask only because
// quad_bits is a power of two (checked below).
const size_t mod_quad_bit_mask = quad_bits - 1;
// Number of quads covering the codepoint space 0 .. 0x10FFFF.
const size_t UnicodeQuadCount = 0x110000 / quad_bits;
// A quad with every bit set (the -1 conversion is made explicit).
const bitquad_t FullQuadMask = static_cast<bitquad_t>(-1);

// Guard the assumptions baked into the constants above so that a change
// of bitquad_t cannot silently break them.
static_assert((quad_bits & (quad_bits - 1)) == 0,
              "quad_bits must be a power of two for mod_quad_bit_mask to be a valid mask");
static_assert(0x110000 % quad_bits == 0,
              "the Unicode codepoint space must divide evenly into quads");
35
// The representation for runs.
// Each run of quads is classified as Empty (no members), Mixed (some
// members and some nonmembers) or Full (every codepoint is a member).
// The enumeration is stored in 16 bits.
enum run_type_t : uint16_t {
    Empty = 0,
    Mixed = 1,
    Full  = 2
};
38
39struct RunStructure {
40  RunStructure(run_type_t r, uint16_t lgth) : run_type(r), run_length(lgth) {};
41  run_type_t run_type;
42  uint16_t run_length;
43};
44
45class UnicodeSet {
46friend class Uset_Iterator;
47public:
48//
49//  The internal fields for a UnicodeSet.
50    std::vector<RunStructure> runs;
51    std::vector<bitquad_t> quads;
52    int quad_count;
53   
54// 
55//  Internal helper functions
56    void append_run(run_type_t run_type, int run_length);
57    void append_quad(bitquad_t q);
58//
59//  Nullary constructor for incremental building.
60    UnicodeSet() : runs(std::vector<RunStructure>()), quads(std::vector<bitquad_t>()), quad_count(0) {};
61//
62//  Ternary constructor for constant construction using precomputed data.
63    UnicodeSet(std::vector<RunStructure> r, std::vector<bitquad_t> q, int c) : runs(r), quads(q), quad_count(c) {};
64};
65
66    void Dump_uset(UnicodeSet s);
67    UnicodeSet empty_uset();
68    UnicodeSet singleton_uset(int codepoint);
69    UnicodeSet range_uset(int lo_codepoint, int hi_codepoint);
70    UnicodeSet uset_complement (UnicodeSet s);
71    UnicodeSet uset_union(UnicodeSet s1, UnicodeSet s2);
72    UnicodeSet uset_intersection(UnicodeSet s1, UnicodeSet s2);
73    UnicodeSet uset_difference(UnicodeSet s1, UnicodeSet s2);
74    UnicodeSet uset_symmetric_difference(UnicodeSet s1, UnicodeSet s2);
75    bool uset_member(UnicodeSet s, int codepoint);
76
77#endif
78
Note: See TracBrowser for help on using the repository browser.