Changeset 4616 for icGREP/icgrepdevel/icgrep/UCD/unicode_set.cpp
 Timestamp:
 Jun 24, 2015, 5:26:17 PM (4 years ago)
 File:

 1 edited
Legend:
 Unmodified
 Added
 Removed

icGREP/icgrepdevel/icgrep/UCD/unicode_set.cpp
r4611 r4616 23 23 #include <iostream> 24 24 25 bool Uset_Iterator::at_end() { 26 return run_no == uSet.runs.size(); 27 } 28 29 RunStructure Uset_Iterator::current_run() { 30 RunStructure this_run = uSet.runs[run_no]; 31 return RunStructure(this_run.run_type, this_run.run_length  offset); 32 } 33 34 bitquad_t Uset_Iterator::get_quad() { 35 RunStructure this_run = uSet.runs[run_no]; 36 if (this_run.run_type == Empty) return 0; 37 else if (this_run.run_type == Full) return FullQuadMask; 38 else return uSet.quads[quad_no]; 39 } 40 41 void Uset_Iterator::advance(int n) { 25 const size_t QUAD_BITS = (8 * sizeof(bitquad_t)); 26 const size_t MOD_QUAD_BIT_MASK = QUAD_BITS  1; 27 const size_t UNICODE_QUAD_COUNT = 0x110000 / QUAD_BITS; 28 const bitquad_t FULL_QUAD_MASK = 1; 29 30 31 inline const RunStructure & get_run(UnicodeSet::iterator i) { 32 return std::get<0>(*i); 33 } 34 35 inline bitquad_t get_quad(UnicodeSet::iterator i) { 36 return std::get<1>(*i); 37 } 38 39 const std::pair<RunStructure, bitquad_t> UnicodeSet::iterator::dereference() const { 40 const RunStructure & t = mUnicodeSet.runs[mRunIndex]; 41 RunStructure s(t.mType, t.mRunLength  mOffset); 42 const bitquad_t q = ((t.mType == Empty) ? 0 : (t.mType == Full) ? FULL_QUAD_MASK : mUnicodeSet.quads[mQuadIndex]); 43 return std::make_pair(s, q); 44 } 45 46 void UnicodeSet::iterator::advance(unsigned n) { 42 47 while (n > 0) { 43 RunStructure this_run = uSet.runs[run_no];44 int remain = t his_run.run_length  offset;48 const RunStructure & t = mUnicodeSet.runs[mRunIndex]; 49 int remain = t.mRunLength  mOffset; 45 50 if (remain > n) { 46 offset += n; 47 if (this_run.run_type == Mixed) quad_no += n; 48 n = 0; 51 mOffset += n; 52 if (t.mType == Mixed) { 53 mQuadIndex += n; 54 } 55 break; 49 56 } 50 57 else if (remain == n) { 51 run_no += 1; 52 offset = 0; 53 if (this_run.run_type == Mixed) quad_no += n; 54 n = 0; 55 } 56 else { 57 run_no += 1; 58 offset = 0; 59 if (this_run.run_type == Mixed) quad_no += remain; 58 ++mRunIndex; 59 mOffset = 0; 60 if (t.mType == Mixed) { 61 mQuadIndex += n; 62 } 63 break; 64 } 65 else { 66 ++mRunIndex; 67 mOffset = 0; 68 if (t.mType == Mixed) { 69 mQuadIndex += remain; 70 } 60 71 n = remain; 61 72 } … … 72 83 else { 73 84 RunStructure last_run = runs[runs.size()1]; 74 if (last_run. run_type == run_type) {75 runs [runs.size()1].run_length += run_length;85 if (last_run.mType == run_type) { 86 runs.back().mRunLength += run_length; 76 87 } 77 88 else { … … 86 97 append_run(Empty, 1); 87 98 } 88 else if (q == F ullQuadMask) {99 else if (q == FULL_QUAD_MASK) { 89 100 append_run(Full, 1); 90 101 } … … 96 107 97 108 void Dump_uset(const UnicodeSet & s) { 98 Uset_Iterator it(s); 99 while (!it.at_end()) { 100 RunStructure this_run = it.current_run(); 101 if (this_run.run_type == Empty) { 102 std::cout << "Empty(" << this_run.run_length << ")\n"; 103 it.advance(this_run.run_length); 104 } 105 else if (this_run.run_type == Full) { 106 std::cout << "Full(" << this_run.run_length << ")\n"; 107 it.advance(this_run.run_length); 108 } 109 else { 110 for (int i = 0; i < this_run.run_length; i++) { 111 std::cout << "Mixed(" << std::hex << it.get_quad() << std::dec << ")\n"; 112 it.advance(1); 109 for (auto it = s.begin(); it != s.end(); ++it) { 110 RunStructure this_run = get_run(it); 111 if (this_run.mType == Empty) { 112 std::cout << "Empty(" << this_run.mRunLength << ")\n"; 113 it += this_run.mRunLength; 114 } 115 else if (this_run.mType == Full) { 116 std::cout << "Full(" << this_run.mRunLength << ")\n"; 117 it += this_run.mRunLength; 118 } 119 else { 120 for (int i = 0; i != this_run.mRunLength; i++) { 121 std::cout << "Mixed(" << std::hex << get_quad(it) << std::dec << ")\n"; 122 ++it; 113 123 } 114 124 } … … 118 128 UnicodeSet empty_uset() { 119 129 UnicodeSet iset; 120 iset.runs.emplace_back(Empty, U nicodeQuadCount);121 iset.quad_count = U nicodeQuadCount;130 iset.runs.emplace_back(Empty, UNICODE_QUAD_COUNT); 131 iset.quad_count = UNICODE_QUAD_COUNT; 122 132 return iset; 123 133 } … … 126 136 UnicodeSet singleton_uset(int codepoint) { 127 137 UnicodeSet iset; 128 int quad_no = codepoint / quad_bits;129 bitquad_t quad_val = 1 << (codepoint & mod_quad_bit_mask);138 int quad_no = codepoint / QUAD_BITS; 139 bitquad_t quad_val = 1 << (codepoint & MOD_QUAD_BIT_MASK); 130 140 if (quad_no > 0) iset.append_run(Empty, quad_no); 131 141 iset.append_run(Mixed, 1); 132 142 iset.quads.push_back(quad_val); 133 if (quad_no < U nicodeQuadCount  1) iset.append_run(Empty, UnicodeQuadCount (quad_no + 1));134 iset.quad_count = U nicodeQuadCount;143 if (quad_no < UNICODE_QUAD_COUNT  1) iset.append_run(Empty, UNICODE_QUAD_COUNT  (quad_no + 1)); 144 iset.quad_count = UNICODE_QUAD_COUNT; 135 145 return iset; 136 146 } … … 139 149 UnicodeSet range_uset(int lo_codepoint, int hi_codepoint) { 140 150 UnicodeSet iset; 141 int lo_quad_no = lo_codepoint / quad_bits;142 int hi_quad_no = hi_codepoint / quad_bits;143 int lo_offset = lo_codepoint & mod_quad_bit_mask;144 int hi_offset = hi_codepoint & mod_quad_bit_mask;151 int lo_quad_no = lo_codepoint / QUAD_BITS; 152 int hi_quad_no = hi_codepoint / QUAD_BITS; 153 int lo_offset = lo_codepoint & MOD_QUAD_BIT_MASK; 154 int hi_offset = hi_codepoint & MOD_QUAD_BIT_MASK; 145 155 if (lo_quad_no > 0) iset.append_run(Empty, lo_quad_no); 146 156 if (lo_quad_no == hi_quad_no) { 147 bitquad_t quad = (F ullQuadMask << lo_offset) & (FullQuadMask >> (quad_bits 1  hi_offset));157 bitquad_t quad = (FULL_QUAD_MASK << lo_offset) & (FULL_QUAD_MASK >> (QUAD_BITS  1  hi_offset)); 148 158 iset.append_quad(quad); 149 159 } 150 160 else { 151 iset.append_quad((F ullQuadMask << lo_offset) & FullQuadMask);161 iset.append_quad((FULL_QUAD_MASK << lo_offset) & FULL_QUAD_MASK); 152 162 iset.append_run(Full, hi_quad_no  (lo_quad_no + 1)); 153 iset.append_quad((F ullQuadMask >> (quad_bits  1  hi_offset)) & FullQuadMask);154 } 155 if (hi_quad_no < U nicodeQuadCount  1) iset.append_run(Empty, UnicodeQuadCount (hi_quad_no + 1));163 iset.append_quad((FULL_QUAD_MASK >> (QUAD_BITS  1  hi_offset)) & FULL_QUAD_MASK); 164 } 165 if (hi_quad_no < UNICODE_QUAD_COUNT  1) iset.append_run(Empty, UNICODE_QUAD_COUNT  (hi_quad_no + 1)); 156 166 return iset; 157 167 } 158 168 159 169 UnicodeSet uset_complement (const UnicodeSet & s) { 160 assert(s.quad_count == UnicodeQuadCount); 161 UnicodeSet iset; 162 Uset_Iterator it(s); 163 while (!it.at_end()) { 164 RunStructure this_run = it.current_run(); 165 if (this_run.run_type == Empty) { 166 iset.append_run(Full, this_run.run_length); 167 it.advance(this_run.run_length); 168 } 169 else if (this_run.run_type == Full) { 170 iset.append_run(Empty, this_run.run_length); 171 it.advance(this_run.run_length); 172 } 173 else { 174 for (int i = 0; i < this_run.run_length; i++) { 175 iset.append_quad(FullQuadMask ^ it.get_quad()); 176 it.advance(1); 170 assert(s.quad_count == UNICODE_QUAD_COUNT); 171 UnicodeSet iset; 172 for (auto itr = s.begin(); itr != s.end(); ) { 173 auto run = get_run(itr); 174 if (run.mType == Empty) { 175 iset.append_run(Full, run.mRunLength); 176 itr += run.mRunLength; 177 } 178 else if (run.mType == Full) { 179 iset.append_run(Empty, run.mRunLength); 180 itr += run.mRunLength; 181 } 182 else { 183 for (unsigned i = 0; i != run.mRunLength; i++) { 184 iset.append_quad(FULL_QUAD_MASK ^ get_quad(itr++)); 177 185 } 178 186 } … … 182 190 183 191 UnicodeSet uset_intersection (const UnicodeSet & s1, const UnicodeSet & s2) { 184 assert(s1.quad_count == UnicodeQuadCount); 185 assert(s2.quad_count == UnicodeQuadCount); 186 UnicodeSet iset; 187 Uset_Iterator i1(s1); 188 Uset_Iterator i2(s2); 189 while (!i1.at_end()) { 190 RunStructure run1 = i1.current_run(); 191 RunStructure run2 = i2.current_run(); 192 int n = std::min(run1.run_length, run2.run_length); 193 if ((run1.run_type == Empty)  (run2.run_type == Empty)) { 192 assert(s1.quad_count == UNICODE_QUAD_COUNT); 193 assert(s2.quad_count == UNICODE_QUAD_COUNT); 194 UnicodeSet iset; 195 for (auto i1 = s1.begin(), i2 = s2.begin(); i1 != s1.end(); ) { 196 auto run1 = get_run(i1); 197 auto run2 = get_run(i2); 198 unsigned n = std::min(run1.mRunLength, run2.mRunLength); 199 if ((run1.mType == Empty)  (run2.mType == Empty)) { 194 200 iset.append_run(Empty, n); 195 i1 .advance(n);196 i2 .advance(n);197 } 198 else if ((run1. run_type == Full) && (run2.run_type == Full)) {201 i1 += n; 202 i2 += n; 203 } 204 else if ((run1.mType == Full) && (run2.mType == Full)) { 199 205 iset.append_run(Full, n); 200 i1.advance(n); 201 i2.advance(n); 202 } 203 else if (run1.run_type == Full) { 204 for (int i = 0; i < n; i++) { 205 iset.append_quad(i2.get_quad()); 206 i2.advance(1); 207 } 208 i1.advance(n); 209 } 210 else if (run2.run_type == Full) { 211 for (int i = 0; i < n; i++) { 212 iset.append_quad(i1.get_quad()); 213 i1.advance(1); 214 } 215 i2.advance(n); 216 } 217 else { 218 for (int i = 0; i < n; i++) { 219 iset.append_quad(i1.get_quad() & i2.get_quad()); 220 i1.advance(1); 221 i2.advance(1); 206 i1 += n; 207 i2 += n; 208 } 209 else if (run1.mType == Full) { 210 for (unsigned i = 0; i != n; ++i, ++i2) { 211 iset.append_quad(get_quad(i2)); 212 } 213 i1 += n; 214 } 215 else if (run2.mType == Full) { 216 for (unsigned i = 0; i != n; ++i, ++i1) { 217 iset.append_quad(get_quad(i1)); 218 } 219 i2 += n; 220 } 221 else { 222 for (unsigned i = 0; i < n; ++i, ++i1, ++i2) { 223 iset.append_quad(get_quad(i1) & get_quad(i2)); 222 224 } 223 225 } … … 227 229 228 230 UnicodeSet uset_union (const UnicodeSet & s1, const UnicodeSet & s2) { 229 assert(s1.quad_count == UnicodeQuadCount); 230 assert(s2.quad_count == UnicodeQuadCount); 231 UnicodeSet iset; 232 Uset_Iterator i1(s1); 233 Uset_Iterator i2(s2); 234 while (!i1.at_end()) { 235 RunStructure run1 = i1.current_run(); 236 RunStructure run2 = i2.current_run(); 237 int n = std::min(run1.run_length, run2.run_length); 238 if ((run1.run_type == Empty) && (run2.run_type == Empty)) { 231 assert(s1.quad_count == UNICODE_QUAD_COUNT); 232 assert(s2.quad_count == UNICODE_QUAD_COUNT); 233 UnicodeSet iset; 234 for (auto i1 = s1.begin(), i2 = s2.begin(); i1 != s1.end(); ) { 235 auto run1 = get_run(i1); 236 auto run2 = get_run(i2); 237 unsigned n = std::min(run1.mRunLength, run2.mRunLength); 238 if ((run1.mType == Empty) && (run2.mType == Empty)) { 239 239 iset.append_run(Empty, n); 240 i1 .advance(n);241 i2 .advance(n);242 } 243 else if ((run1. run_type == Full)  (run2.run_type == Full)) {240 i1 += n; 241 i2 += n; 242 } 243 else if ((run1.mType == Full)  (run2.mType == Full)) { 244 244 iset.append_run(Full, n); 245 i1.advance(n); 246 i2.advance(n); 247 } 248 else if (run1.run_type == Empty) { 249 for (int i = 0; i < n; i++) { 250 iset.append_quad(i2.get_quad()); 251 i2.advance(1); 252 } 253 i1.advance(n); 254 } 255 else if (run2.run_type == Empty) { 256 for (int i = 0; i < n; i++) { 257 iset.append_quad(i1.get_quad()); 258 i1.advance(1); 259 } 260 i2.advance(n); 261 } 262 else { 263 for (int i = 0; i < n; i++) { 264 iset.append_quad(i1.get_quad()  i2.get_quad()); 265 i1.advance(1); 266 i2.advance(1); 245 i1 += n; 246 i2 += n; 247 } 248 else if (run1.mType == Empty) { 249 for (unsigned i = 0; i < n; ++i, ++i2) { 250 iset.append_quad(get_quad(i2)); 251 } 252 i1 += n; 253 } 254 else if (run2.mType == Empty) { 255 for (unsigned i = 0; i < n; ++i, ++i1) { 256 iset.append_quad(get_quad(i1)); 257 } 258 i2 += n; 259 } 260 else { 261 for (unsigned i = 0; i != n; ++i, ++i1, ++i2) { 262 iset.append_quad(get_quad(i1)  get_quad(i2)); 267 263 } 268 264 } … … 272 268 273 269 UnicodeSet uset_difference (const UnicodeSet & s1, const UnicodeSet & s2) { 274 assert(s1.quad_count == UnicodeQuadCount); 275 assert(s2.quad_count == UnicodeQuadCount); 276 UnicodeSet iset; 277 Uset_Iterator i1(s1); 278 Uset_Iterator i2(s2); 279 while (!i1.at_end()) { 280 RunStructure run1 = i1.current_run(); 281 RunStructure run2 = i2.current_run(); 282 int n = std::min(run1.run_length, run2.run_length); 283 if ((run1.run_type == Empty)  (run2.run_type == Full)) { 270 assert(s1.quad_count == UNICODE_QUAD_COUNT); 271 assert(s2.quad_count == UNICODE_QUAD_COUNT); 272 UnicodeSet iset; 273 for (auto i1 = s1.begin(), i2 = s2.begin(); i1 != s1.end(); ) { 274 auto run1 = get_run(i1); 275 auto run2 = get_run(i2); 276 unsigned n = std::min(run1.mRunLength, run2.mRunLength); 277 if ((run1.mType == Empty)  (run2.mType == Full)) { 284 278 iset.append_run(Empty, n); 285 i1 .advance(n);286 i2 .advance(n);287 } 288 else if ((run1. run_type == Full) && (run2.run_type == Empty)) {279 i1 += n; 280 i2 += n; 281 } 282 else if ((run1.mType == Full) && (run2.mType == Empty)) { 289 283 iset.append_run(Full, n); 290 i1.advance(n); 291 i2.advance(n); 292 } 293 else if (run1.run_type == Full) { 294 for (int i = 0; i < n; i++) { 295 iset.append_quad(FullQuadMask ^ i2.get_quad()); 296 i2.advance(1); 297 } 298 i1.advance(n); 299 } 300 else if (run2.run_type == Empty) { 301 for (int i = 0; i < n; i++) { 302 iset.append_quad(i1.get_quad()); 303 i1.advance(1); 304 } 305 i2.advance(n); 306 } 307 else { 308 for (int i = 0; i < n; i++) { 309 iset.append_quad(i1.get_quad() & ~i2.get_quad()); 310 i1.advance(1); 311 i2.advance(1); 284 i1 += n; 285 i2 += n; 286 } 287 else if (run1.mType == Full) { 288 for (unsigned i = 0; i != n; ++i, ++i2) { 289 iset.append_quad(FULL_QUAD_MASK ^ get_quad(i2)); 290 } 291 i1 += n; 292 } 293 else if (run2.mType == Empty) { 294 for (unsigned i = 0; i != n; ++i, ++i1) { 295 iset.append_quad(get_quad(i1)); 296 } 297 i2 += n; 298 } 299 else { 300 for (unsigned i = 0; i != n; ++i, ++i1, ++i2) { 301 iset.append_quad(get_quad(i1) & ~get_quad(i2)); 312 302 } 313 303 } … … 317 307 318 308 UnicodeSet uset_symmetric_difference (const UnicodeSet & s1, const UnicodeSet & s2) { 319 assert(s1.quad_count == UnicodeQuadCount); 320 assert(s2.quad_count == UnicodeQuadCount); 321 UnicodeSet iset; 322 Uset_Iterator i1(s1); 323 Uset_Iterator i2(s2); 324 while (!i1.at_end()) { 325 RunStructure run1 = i1.current_run(); 326 RunStructure run2 = i2.current_run(); 327 int n = std::min(run1.run_length, run2.run_length); 328 if (((run1.run_type == Empty) && (run2.run_type == Full))  ((run1.run_type == Full) && (run2.run_type == Empty))) { 309 assert(s1.quad_count == UNICODE_QUAD_COUNT); 310 assert(s2.quad_count == UNICODE_QUAD_COUNT); 311 UnicodeSet iset; 312 for (auto i1 = s1.begin(), i2 = s2.begin(); i1 != s1.end(); ) { 313 auto run1 = get_run(i1); 314 auto run2 = get_run(i2); 315 unsigned n = std::min(run1.mRunLength, run2.mRunLength); 316 if (((run1.mType == Empty) && (run2.mType == Full))  ((run1.mType == Full) && (run2.mType == Empty))) { 329 317 iset.append_run(Full, n); 330 i1 .advance(n);331 i2 .advance(n);332 } 333 else if (((run1. run_type == Full) && (run2.run_type == Full))  ((run1.run_type == Empty) && (run2.run_type == Empty))) {318 i1 += n; 319 i2 += n; 320 } 321 else if (((run1.mType == Full) && (run2.mType == Full))  ((run1.mType == Empty) && (run2.mType == Empty))) { 334 322 iset.append_run(Empty, n); 335 i1.advance(n); 336 i2.advance(n); 337 } 338 else if (run1.run_type == Empty) { 339 for (int i = 0; i < n; i++) { 340 iset.append_quad(i2.get_quad()); 341 i2.advance(1); 342 } 343 i1.advance(n); 344 } 345 else if (run2.run_type == Empty) { 346 for (int i = 0; i < n; i++) { 347 iset.append_quad(i1.get_quad()); 348 i1.advance(1); 349 } 350 i2.advance(n); 351 } 352 else if (run1.run_type == Full) { 353 for (int i = 0; i < n; i++) { 354 iset.append_quad(FullQuadMask ^ i2.get_quad()); 355 i2.advance(1); 356 } 357 i1.advance(n); 358 } 359 else if (run2.run_type == Empty) { 360 for (int i = 0; i < n; i++) { 361 iset.append_quad(FullQuadMask ^ i1.get_quad()); 362 i1.advance(1); 363 } 364 i2.advance(n); 365 } 366 else { 367 for (int i = 0; i < n; i++) { 368 iset.append_quad(i1.get_quad() ^ i2.get_quad()); 369 i1.advance(1); 370 i2.advance(1); 323 i1 += n; 324 i2 += n; 325 } 326 else if (run1.mType == Empty) { 327 for (int i = 0; i < n; ++i, ++i2) { 328 iset.append_quad(get_quad(i2)); 329 } 330 i1 += n; 331 } 332 else if (run2.mType == Empty) { 333 for (int i = 0; i < n; ++i, ++i1) { 334 iset.append_quad(get_quad(i1)); 335 } 336 i2 += n; 337 } 338 else if (run1.mType == Full) { 339 for (int i = 0; i < n; ++i, ++i2) { 340 iset.append_quad(FULL_QUAD_MASK ^ get_quad(i2)); 341 } 342 i1 += n; 343 } 344 else if (run2.mType == Empty) { 345 for (unsigned i = 0; i < n; ++i, ++i1) { 346 iset.append_quad(FULL_QUAD_MASK ^ get_quad(i1)); 347 } 348 i2 += n; 349 } 350 else { 351 for (unsigned i = 0; i != n; ++i, ++i1, ++i2) { 352 iset.append_quad(get_quad(i1) ^ get_quad(i2)); 371 353 } 372 354 } … … 376 358 377 359 bool uset_member(const UnicodeSet & s, int codepoint){ 378 int quad_no = codepoint / quad_bits; 379 bitquad_t quad_val = 1 << (codepoint & mod_quad_bit_mask); 380 Uset_Iterator it(s); 381 it.advance(quad_no); 382 return (it.get_quad() & quad_val) != 0; 383 } 360 int quad_no = codepoint / QUAD_BITS; 361 bitquad_t quad_val = 1 << (codepoint & MOD_QUAD_BIT_MASK); 362 return (get_quad(s.begin() + quad_no) & quad_val) != 0; 363 }
Note: See TracChangeset
for help on using the changeset viewer.