Changeset 4620 for icGREP/icgrepdevel/icgrep/UCD/unicode_set.cpp
 Timestamp:
 Jun 28, 2015, 12:18:14 AM (4 years ago)
 File:

 1 edited
Legend:
 Unmodified
 Added
 Removed

icGREP/icgrepdevel/icgrep/UCD/unicode_set.cpp
r4618 r4620 22 22 #include <string> 23 23 #include <llvm/Support/raw_ostream.h> 24 #include <llvm/Support/Format.h> 24 25 #include <include/simdlib/builtins.hpp> 25 26 #include <iostream> 27 #include <iomanip> 26 28 27 29 const size_t QUAD_BITS = (8 * sizeof(bitquad_t)); … … 30 32 const bitquad_t FULL_QUAD_MASK = 1; 31 33 34 std::string run_type_name(const run_type_t type) { 35 if (type == Empty) { 36 return "Empty"; 37 } 38 if (type == Full) { 39 return "Full"; 40 } 41 if (type == Mixed) { 42 return "Mixed"; 43 } 44 return "???"; 45 } 46 47 using RunVector = UnicodeSet::RunVector; 48 using QuadVector = UnicodeSet::QuadVector; 49 32 50 /**  * 33 51 * @brief append_run 34 52 **  */ 35 inline void UnicodeSet::append_run(const run_type_t type, const unsigned length) {36 if ( length == 0) {53 inline void append_run(const run_type_t type, const unsigned length, RunVector & runs) { 54 if (LLVM_UNLIKELY(length == 0)) { 37 55 return; 38 56 } 39 else if (runs.size() == 0) { 40 runs.emplace_back(type, length); 57 else if (!runs.empty() && runs.back().mType == type) { 58 runs.back().mRunLength += length; 59 return; 60 } 61 runs.emplace_back(type, length); 62 } 63 64 /**  * 65 * @brief append_quad 66 **  */ 67 inline void append_quad(const bitquad_t quad, QuadVector & quads, RunVector & runs) { 68 run_type_t type = Empty; 69 if (LLVM_UNLIKELY(quad == 0)) { 70 type = Empty; 71 } 72 else if (LLVM_UNLIKELY(quad == FULL_QUAD_MASK)) { 73 type = Full; 41 74 } 42 75 else { 43 RunStructure last_run = runs[runs.size()1]; 44 if (last_run.mType == type) { 45 runs.back().mRunLength += length; 46 } 47 else { 48 runs.emplace_back(type, length); 49 } 50 } 51 } 52 53 /**  * 54 * @brief append_quad 55 **  */ 56 inline void UnicodeSet::append_quad(const bitquad_t quad) { 57 if (quad == 0) { 58 append_run(Empty, 1); 59 } 60 else if (quad == FULL_QUAD_MASK) { 61 append_run(Full, 1); 62 } 63 else { 64 quads.push_back(quad); 65 append_run(Mixed, 1); 66 } 76 quads.emplace_back(quad); 77 type = Mixed; 78 } 79 append_run(type, 1, runs); 67 80 } 68 81 … … 71 84 **  */ 72 85 void UnicodeSet::dump(llvm::raw_ostream & out) const { 73 auto q uad_itr = quads.cbegin();74 for (const RunStructure & run : runs) {86 auto qi = mQuads.cbegin(); 87 for (const RunStructure & run : mRuns) { 75 88 if (run.mType == Empty) { 76 89 out << "Empty(" << run.mRunLength << ")\n"; 77 90 } 78 else if (run.mType == Empty) {91 else if (run.mType == Full) { 79 92 out << "Full(" << run.mRunLength << ")\n"; 80 93 } 81 94 else { 82 for (unsigned i = 0; i != run.mRunLength; ++i, ++quad_itr) { 83 assert (quad_itr != quads.cend()); 84 out << "Mixed("; out.write_hex(*quad_itr) << ")\n"; 85 } 86 } 87 } 95 for (const auto qi_end = qi + run.mRunLength; qi != qi_end; ++qi) { 96 assert (qi != mQuads.cend()); 97 out << "Mixed(" << llvm::format("%08x", *qi) << ")\n"; 98 } 99 } 100 } 101 out.flush(); 88 102 } 89 103 … … 92 106 **  */ 93 107 UnicodeSet UnicodeSet::complement() const { 94 UnicodeSet set; 95 auto quad_itr = quads.cbegin(); 96 for (const RunStructure & run : runs) { 108 RunVector runs; 109 QuadVector quads; 110 runs.reserve(mRuns.size()); 111 quads.reserve(mQuads.size()); 112 auto qi = quads.cbegin(); 113 for (const RunStructure & run : mRuns) { 97 114 if (run.mType == Empty) { 98 set.append_run(Full, run.mRunLength);99 } 100 else if (run.mType == Empty) {101 set.append_run(Empty, run.mRunLength);102 } 103 else { 104 for ( unsigned i = 0; i != run.mRunLength; ++i, ++quad_itr) {105 assert (q uad_itr!= quads.cend());106 set.append_quad(FULL_QUAD_MASK ^ *quad_itr);107 } 108 } 109 } 110 return set;115 append_run(Full, run.mRunLength, runs); 116 } 117 else if (run.mType == Full) { 118 append_run(Empty, run.mRunLength, runs); 119 } 120 else { 121 for (const auto qi_end = qi + run.mRunLength; qi != qi_end; ++qi) { 122 assert (qi != quads.cend()); 123 append_quad(FULL_QUAD_MASK ^ *qi, quads, runs); 124 } 125 } 126 } 127 return UnicodeSet(std::move(runs), std::move(quads)); 111 128 } 112 129 … … 115 132 **  */ 116 133 UnicodeSet UnicodeSet::operator&(const UnicodeSet & other) const { 117 UnicodeSet iset; 134 RunVector runs; 135 QuadVector quads; 118 136 const auto e1 = quad_end(); 119 137 const auto e2 = other.quad_end(); … … 123 141 const auto n = std::min(run1.mRunLength, run2.mRunLength); 124 142 if (run1.mType == run2.mType && run1.mType != Mixed) { 125 iset.append_run(run1.mType, n);143 append_run(run1.mType, n, runs); 126 144 i1 += n; 127 145 i2 += n; … … 129 147 else if (run1.mType == Full) { 130 148 for (unsigned i = 0; i != n; ++i, ++i2) { 131 iset.append_quad(i2.getQuad());149 append_quad(i2.getQuad(), quads, runs); 132 150 } 133 151 i1 += n; … … 135 153 else if (run2.mType == Full) { 136 154 for (unsigned i = 0; i != n; ++i, ++i1) { 137 iset.append_quad(i1.getQuad());138 } 139 i2 += n; 140 } 141 else { 142 for (unsigned i = 0; i <n; ++i, ++i1, ++i2) {143 iset.append_quad(i1.getQuad() & i2.getQuad());144 } 145 } 146 } 147 return iset;155 append_quad(i1.getQuad(), quads, runs); 156 } 157 i2 += n; 158 } 159 else { 160 for (unsigned i = 0; i != n; ++i, ++i1, ++i2) { 161 append_quad(i1.getQuad() & i2.getQuad(), quads, runs); 162 } 163 } 164 } 165 return UnicodeSet(std::move(runs), std::move(quads)); 148 166 } 149 167 … … 152 170 **  */ 153 171 UnicodeSet UnicodeSet::operator+(const UnicodeSet & other) const { 154 UnicodeSet iset; 172 RunVector runs; 173 QuadVector quads; 155 174 const auto e1 = quad_end(); 156 175 const auto e2 = other.quad_end(); … … 158 177 const auto run1 = i1.getRun(); 159 178 const auto run2 = i2.getRun(); 179 160 180 const auto n = std::min(run1.mRunLength, run2.mRunLength); 161 if (run1.mType == run2.mType && run1.mType != Mixed) { 162 iset.append_run(run1.mType, n); 181 if ((run1.mType == Empty) && (run2.mType == Empty)) { 182 append_run(Empty, n, runs); 183 i1 += n; 184 i2 += n; 185 } 186 else if ((run1.mType == Full)  (run2.mType == Full)) { 187 append_run(Full, n, runs); 163 188 i1 += n; 164 189 i2 += n; … … 166 191 else if (run1.mType == Empty) { 167 192 for (unsigned i = 0; i != n; ++i, ++i2) { 168 iset.append_quad(i2.getQuad());193 append_quad(i2.getQuad(), quads, runs); 169 194 } 170 195 i1 += n; … … 172 197 else if (run2.mType == Empty) { 173 198 for (unsigned i = 0; i != n; ++i, ++i1) { 174 iset.append_quad(i1.getQuad());199 append_quad(i1.getQuad(), quads, runs); 175 200 } 176 201 i2 += n; … … 178 203 else { 179 204 for (unsigned i = 0; i < n; ++i, ++i1, ++i2) { 180 iset.append_quad(i1.getQuad()  i2.getQuad());181 } 182 } 183 } 184 return iset;205 append_quad(i1.getQuad()  i2.getQuad(), quads, runs); 206 } 207 } 208 } 209 return UnicodeSet(std::move(runs), std::move(quads)); 185 210 } 186 211 … … 189 214 **  */ 190 215 UnicodeSet UnicodeSet::operator(const UnicodeSet & other) const { 191 UnicodeSet iset; 216 RunVector runs; 217 QuadVector quads; 192 218 const auto e1 = quad_end(); 193 219 const auto e2 = other.quad_end(); … … 197 223 unsigned n = std::min(run1.mRunLength, run2.mRunLength); 198 224 if ((run1.mType == Empty)  (run2.mType == Full)  (run1.mType == Full && run2.mType == Empty)) { 199 iset.append_run(run1.mType, n);225 append_run(run1.mType, n, runs); 200 226 i1 += n; 201 227 i2 += n; … … 203 229 else if (run1.mType == Full) { 204 230 for (unsigned i = 0; i != n; ++i, ++i2) { 205 iset.append_quad(FULL_QUAD_MASK ^ i2.getQuad());231 append_quad(FULL_QUAD_MASK ^ i2.getQuad(), quads, runs); 206 232 } 207 233 i1 += n; … … 209 235 else if (run2.mType == Empty) { 210 236 for (unsigned i = 0; i != n; ++i, ++i1) { 211 iset.append_quad(i1.getQuad());237 append_quad(i1.getQuad(), quads, runs); 212 238 } 213 239 i2 += n; … … 215 241 else { 216 242 for (unsigned i = 0; i != n; ++i, ++i1, ++i2) { 217 iset.append_quad(i1.getQuad() &~ i2.getQuad());218 } 219 } 220 } 221 return iset;243 append_quad(i1.getQuad() &~ i2.getQuad(), quads, runs); 244 } 245 } 246 } 247 return UnicodeSet(std::move(runs), std::move(quads)); 222 248 } 223 249 … … 226 252 **  */ 227 253 UnicodeSet UnicodeSet::operator^(const UnicodeSet & other) const { 228 UnicodeSet iset; 254 RunVector runs; 255 QuadVector quads; 229 256 const auto e1 = quad_end(); 230 257 const auto e2 = other.quad_end(); … … 234 261 unsigned n = std::min(run1.mRunLength, run2.mRunLength); 235 262 if (run1.mType != Mixed && run2.mType != Mixed) { 236 iset.append_run(run1.mType == run2.mType ? Empty : Full, n);263 append_run(run1.mType == run2.mType ? Empty : Full, n, runs); 237 264 i1 += n; 238 265 i2 += n; 239 266 } 240 267 else if (run1.mType == Empty) { 241 for (int i = 0; i < n; ++i, ++i2) { 242 iset.append_quad(i2.getQuad()); 243 } 244 i1 += n; 245 } 246 else if (run2.mType == Empty) { 247 for (int i = 0; i < n; ++i, ++i1) { 248 iset.append_quad(i1.getQuad()); 249 } 250 i2 += n; 251 } 252 else if (run1.mType == Full) { 253 for (int i = 0; i < n; ++i, ++i2) { 254 iset.append_quad(FULL_QUAD_MASK ^ i2.getQuad()); 268 for (unsigned i = 0; i < n; ++i, ++i2) { 269 append_quad(i2.getQuad(), quads, runs); 255 270 } 256 271 i1 += n; … … 258 273 else if (run2.mType == Empty) { 259 274 for (unsigned i = 0; i < n; ++i, ++i1) { 260 iset.append_quad(FULL_QUAD_MASK ^ i1.getQuad()); 275 append_quad(i1.getQuad(), quads, runs); 276 } 277 i2 += n; 278 } 279 else if (run1.mType == Full) { 280 for (unsigned i = 0; i < n; ++i, ++i2) { 281 append_quad(FULL_QUAD_MASK ^ i2.getQuad(), quads, runs); 282 } 283 i1 += n; 284 } 285 else if (run2.mType == Empty) { 286 for (unsigned i = 0; i < n; ++i, ++i1) { 287 append_quad(FULL_QUAD_MASK ^ i1.getQuad(), quads, runs); 261 288 } 262 289 i2 += n; … … 264 291 else { 265 292 for (unsigned i = 0; i != n; ++i, ++i1, ++i2) { 266 iset.append_quad(i1.getQuad() ^ i2.getQuad());267 } 268 } 269 } 270 return iset;293 append_quad(i1.getQuad() ^ i2.getQuad(), quads, runs); 294 } 295 } 296 } 297 return UnicodeSet(std::move(runs), std::move(quads)); 271 298 } 272 299 … … 284 311 285 312 for (;;) { 286 const RunStructure & t = runs[runIndex];313 const RunStructure & t = mRuns[runIndex]; 287 314 if (t.mRunLength >= n) { 288 315 if (t.mType == Mixed) { 289 return ( quads[quadIndex + n  1] & (static_cast<bitquad_t>(1) << (codepoint & MOD_QUAD_BIT_MASK))) != 0;316 return (mQuads[quadIndex + n  1] & (static_cast<bitquad_t>(1) << (codepoint & MOD_QUAD_BIT_MASK))) != 0; 290 317 } 291 318 return (t.mType == Full); … … 305 332 void UnicodeSet::quad_iterator::advance(unsigned n) { 306 333 while (n > 0) { 307 const RunStructure & t = mUnicodeSet.runs[mRunIndex]; 308 const unsigned remain = t.mRunLength  mOffset; 334 const unsigned remain = mRunIterator>mRunLength  mOffset; 309 335 if (remain > n) { 336 if (mRunIterator>mType == Mixed) { 337 mQuadIterator += n; 338 } 310 339 mOffset += n; 311 if (t.mType == Mixed) {312 mQuadIndex += n;313 }314 340 break; 315 341 } 316 else if (remain == n) { 317 ++mRunIndex; 318 mOffset = 0; 319 if (t.mType == Mixed) { 320 mQuadIndex += n; 321 } 322 break; 323 } 324 else { 325 ++mRunIndex; 326 mOffset = 0; 327 if (t.mType == Mixed) { 328 mQuadIndex += remain; 329 } 330 n = remain; 331 } 342 if (mRunIterator>mType == Mixed) { 343 mQuadIterator += remain; 344 } 345 ++mRunIterator; 346 mOffset = 0; 347 n = remain; 332 348 } 333 349 } … … 336 352 * @brief UnicodeSet::iterator::advance 337 353 **  */ 338 void UnicodeSet::iterator::advance(unsigned n) { 339 340 std::cerr << "advance(" << n << ")\n"; 341 342 mMinCodePoint = mBaseCodePoint; 343 344 for ( ;n; ++mRunIterator) { 345 346 const RunStructure & t = *mRunIterator; 347 348 std::cerr << "Type:"; 349 switch (t.mType) { 350 case Empty: std::cerr << "Empty"; break; 351 case Full: std::cerr << "Full"; break; 352 case Mixed: std::cerr << "Mixed"; break; 353 } 354 std::cerr << " Length:" << t.mRunLength; 355 std::cerr << " BaseCodePoint:" << mBaseCodePoint; 356 357 358 std::cerr << std::endl; 359 360 if (t.mType != Mixed) { 361 mMaxCodePoint = mBaseCodePoint + t.mRunLength * QUAD_BITS; 362 mBaseCodePoint = mMaxCodePoint; 354 void UnicodeSet::iterator::advance(const unsigned n) { 355 356 assert (n == 1); 357 358 // Find the start of our interval 359 for ( ; mBaseCodePoint <= re::CC::UNICODE_MAX; ++mRunIterator) { 360 // Find the first nonempty block 361 const RunStructure & run = *mRunIterator; 362 if (run.mType != Mixed) { 363 mMinCodePoint = mBaseCodePoint; 364 mBaseCodePoint += run.mRunLength * QUAD_BITS; 363 365 mQuadOffset = 0; 364 366 mQuadPosition = 0; 365 if (t.mType == Full) { 366 n; 367 } 367 if (run.mType == Full) { 368 break; 369 } 370 } 371 else { // if (left.mType == Mixed) 372 while (mQuadPosition != run.mRunLength) { 373 const bitquad_t q = *mQuadIterator; 374 const bitquad_t M = (FULL_QUAD_MASK << mQuadOffset); 375 const bitquad_t m = q & M; 376 // Nothing left in this quad to add; skip to the next one. 377 if (m == 0) { 378 mBaseCodePoint += QUAD_BITS; 379 ++mQuadPosition; 380 ++mQuadIterator; 381 continue; 382 } 383 mQuadOffset = scan_forward_zeroes(m); 384 mMinCodePoint = mBaseCodePoint + mQuadOffset; 385 break; 386 } 387 // If we found nothing in the quad, restart the loop. 388 if (mQuadPosition != run.mRunLength) { 389 break; 390 } 391 } 392 } 393 394 // Find the end of our interval 395 for ( ; mBaseCodePoint <= re::CC::UNICODE_MAX; ++mRunIterator) { 396 const RunStructure & run = *mRunIterator; 397 // If the next run is empty, we already know the max code point. 398 if (run.mType == Empty) { 399 mMaxCodePoint = mBaseCodePoint; 400 break; 401 } 402 // If the next run is Full, increment the base code point. 403 else if (run.mType == Full) { 404 mBaseCodePoint += run.mRunLength * QUAD_BITS; 405 mMaxCodePoint = mBaseCodePoint; 406 mQuadOffset = 0; 407 mQuadPosition = 0; 368 408 continue; 369 409 } 370 371 while (mQuadPosition != t.mRunLength) { 372 373 const bitquad_t q = *mQuadIterator; 374 375 const bitquad_t m = q & ((1) >> mQuadOffset); 376 377 std::cerr << " q:" << std::hex << q << std::endl; 378 std::cerr << " +m:" << std::hex << m << std::dec << " (" << mQuadOffset << ")" << std::endl; 379 380 // Nothing left in this quad to add; skip to the next one. 381 if (m == 0) { 382 mBaseCodePoint += QUAD_BITS; 383 mMinCodePoint = mBaseCodePoint; 384 ++mQuadIterator; 385 continue; 386 } 387 388 mQuadOffset = scan_forward_zeroes(m); 389 mMinCodePoint = mBaseCodePoint + mQuadOffset; 390 391 392 393 break; 394 } 395 396 while (mQuadPosition != t.mRunLength) { 397 398 // Although the initial position was in this quad, the final position isn't 399 // unless this is the last quad of this mixed run and the subsequent quad is 400 // Empty. 401 402 const bitquad_t q = *mQuadIterator; 403 const bitquad_t m = ~q & ((1) >> mQuadOffset); 404 405 std::cerr << " q:" << std::hex << q << std::endl; 406 std::cerr << " m:" << std::hex << m << std::dec << " (" << mQuadOffset << ")" << std::endl; 407 408 // Nothing left in this quad to add; skip to the next one. 409 if (m == 0) { 410 mBaseCodePoint += QUAD_BITS; 411 mMaxCodePoint = mBaseCodePoint; 412 ++mQuadIterator; 413 continue; 414 } 415 416 mQuadOffset = scan_forward_zeroes(m); 417 mMaxCodePoint = mBaseCodePoint + mQuadOffset; 418 n; 419 break; 420 } 421 } 422 } 423 424 UnicodeSet::UnicodeSet() 425 : runs({{{Empty, UNICODE_QUAD_COUNT}}}) 426 { 427 428 } 429 430 // singleton set constructor 410 else { // if (left.mType == Mixed) 411 while (mQuadPosition != run.mRunLength) { 412 413 const bitquad_t q = *mQuadIterator; 414 const bitquad_t M = (FULL_QUAD_MASK << mQuadOffset); 415 const bitquad_t m = ~q & M; 416 417 // Nothing left in this quad to add; skip to the next one. 418 if (m == 0) { 419 mBaseCodePoint += QUAD_BITS; 420 mMaxCodePoint = mBaseCodePoint; 421 ++mQuadPosition; 422 ++mQuadIterator; 423 continue; 424 } 425 426 mQuadOffset = scan_forward_zeroes(m); 427 mMaxCodePoint = mBaseCodePoint + mQuadOffset  1; 428 break; 429 } 430 // If we found nothing in the quad, restart the loop. 431 if (mQuadPosition != run.mRunLength) { 432 break; 433 } 434 } 435 } 436 437 438 } 439 440 /**  * 441 * @brief Empty Set Constructor 442 **  */ 443 UnicodeSet::UnicodeSet() : mRuns({{{Empty, UNICODE_QUAD_COUNT}}}) { } 444 445 /**  * 446 * @brief Singleton Set Constructor 447 **  */ 431 448 UnicodeSet::UnicodeSet(const codepoint_t codepoint) { 432 codepoint_t quad_no = codepoint / QUAD_BITS; 433 if (quad_no > 0) { 434 append_run(Empty, quad_no); 435 } 436 append_run(Mixed, 1); 437 quads.push_back(static_cast<bitquad_t>(1) << (codepoint & MOD_QUAD_BIT_MASK)); 438 if (quad_no < UNICODE_QUAD_COUNT  1) { 439 append_run(Empty, UNICODE_QUAD_COUNT  (quad_no + 1)); 440 } 441 } 442 443 // range set constructor 449 const codepoint_t quad_no = codepoint / QUAD_BITS; 450 append_run(Empty, quad_no, mRuns); 451 append_quad(static_cast<bitquad_t>(1) << (codepoint & MOD_QUAD_BIT_MASK), mQuads, mRuns); 452 append_run(Empty, UNICODE_QUAD_COUNT  (quad_no + 1), mRuns); 453 } 454 455 /**  * 456 * @brief Range Set Constructor 457 **  */ 444 458 UnicodeSet::UnicodeSet(const codepoint_t lo_codepoint, const codepoint_t hi_codepoint) { 445 codepoint_t lo_quad_no = lo_codepoint / QUAD_BITS; 446 codepoint_t hi_quad_no = hi_codepoint / QUAD_BITS; 447 codepoint_t lo_offset = lo_codepoint & MOD_QUAD_BIT_MASK; 448 codepoint_t hi_offset = hi_codepoint & MOD_QUAD_BIT_MASK; 449 if (lo_quad_no > 0) { 450 append_run(Empty, lo_quad_no); 451 } 452 if (lo_quad_no == hi_quad_no) { 453 bitquad_t quad = (FULL_QUAD_MASK << lo_offset) & (FULL_QUAD_MASK >> (QUAD_BITS  1  hi_offset)); 454 append_quad(quad); 459 const codepoint_t lo_quad_no = lo_codepoint / QUAD_BITS; 460 const codepoint_t hi_quad_no = hi_codepoint / QUAD_BITS; 461 const codepoint_t lo_offset = lo_codepoint & MOD_QUAD_BIT_MASK; 462 const codepoint_t hi_offset = hi_codepoint & MOD_QUAD_BIT_MASK; 463 const bitquad_t lo_quad = (FULL_QUAD_MASK << lo_offset); 464 const bitquad_t hi_quad = (FULL_QUAD_MASK >> (QUAD_BITS  1  hi_offset)); 465 append_run(Empty, lo_quad_no, mRuns); 466 if (lo_quad_no == hi_quad_no) { 467 append_quad(lo_quad & hi_quad, mQuads, mRuns); 455 468 } 456 469 else { 457 append_quad((FULL_QUAD_MASK << lo_offset) & FULL_QUAD_MASK); 458 append_run(Full, hi_quad_no  (lo_quad_no + 1)); 459 append_quad((FULL_QUAD_MASK >> (QUAD_BITS  1  hi_offset)) & FULL_QUAD_MASK); 460 } 461 if (hi_quad_no < UNICODE_QUAD_COUNT  1) { 462 append_run(Empty, UNICODE_QUAD_COUNT  (hi_quad_no + 1)); 463 } 464 } 465 470 append_quad(lo_quad, mQuads, mRuns); 471 append_run(Full, hi_quad_no  (lo_quad_no + 1), mRuns); 472 append_quad(hi_quad, mQuads, mRuns); 473 } 474 append_run(Empty, UNICODE_QUAD_COUNT  (hi_quad_no + 1), mRuns); 475 } 476
Note: See TracChangeset
for help on using the changeset viewer.