 Timestamp:
 Nov 17, 2017, 2:49:06 PM (14 months ago)
 Location:
 icGREP/icgrepdevel/icgrep/UCD
 Files:

 2 edited
Legend:
 Unmodified
 Added
 Removed

icGREP/icgrepdevel/icgrep/UCD/unicode_set.cpp
r5727 r5739 33 33 // Select the correct builtin scan function, dependent on whatever 34 34 // bitquad_t resolves to, when scan_forward_zeroes<bitquad_t> is called. 35 template <typename T> int scan_forward_zeroes(T x); 36 template <> inline int scan_forward_zeroes<unsigned int>(unsigned int x){return __builtin_ctz(x);} 37 template <> inline int scan_forward_zeroes<unsigned long>(unsigned long x){return __builtin_ctzl(x);} 38 template <> inline int scan_forward_zeroes<unsigned long long>(unsigned long long x){return __builtin_ctzll(x);} 35 template <typename T> int scan_forward_zeroes(const T x) noexcept; 36 template <> inline int scan_forward_zeroes<unsigned int>(const unsigned int x) noexcept { return __builtin_ctz(x); } 37 template <> inline int scan_forward_zeroes<unsigned long>(const unsigned long x) noexcept { return __builtin_ctzl(x); } 38 template <> inline int scan_forward_zeroes<unsigned long long>(const unsigned long long x) noexcept { return __builtin_ctzll(x); } 39 40 template <typename T> unsigned popcount(const T x) noexcept; 41 template <> inline unsigned popcount<unsigned int>(const unsigned int x) noexcept { return __builtin_popcount(x); } 42 template <> inline unsigned popcount<unsigned long>(const unsigned long x) noexcept { return __builtin_popcountl(x); } 43 template <> inline unsigned popcount<unsigned long long>(const unsigned long long x) noexcept { return __builtin_popcountll(x); } 44 39 45 40 46 SlabAllocator<> UnicodeSet::mAllocator; … … 135 141 } 136 142 143 144 /**  * 145 * @brief at 146 **  */ 147 codepoint_t UnicodeSet::at(const size_type k) const { 148 auto qi = mQuads.cbegin(); 149 size_type base = 0; 150 size_type remaining = k; 151 for (const run_t & r : mRuns) { 152 assert ((base % QUAD_BITS) == 0); 153 if (typeOf(r) == Empty) { 154 base += QUAD_BITS * lengthOf(r); 155 } else if (typeOf(r) == Full) { 156 const auto m = QUAD_BITS * lengthOf(r); 157 if (LLVM_UNLIKELY(remaining < m)) { 158 return base + remaining; 159 } 160 base += m; 161 remaining = m; 162 } else { // if (typeOf(r) == Mixed) { 163 for (auto l = lengthOf(r); l; l, ++qi) { 164 auto q = *qi; 165 assert (q); 166 const auto c = popcount<bitquad_t>(q); 167 if (LLVM_UNLIKELY(remaining < c)) { 168 // iterate through the remaining bits to find the offset 169 for (;;) { 170 assert (q); 171 const bitquad_t k = scan_forward_zeroes<bitquad_t>(q); 172 if (remaining == 0) { 173 return base + k; 174 } 175 q ^= static_cast<bitquad_t>(1) << k; 176 remaining; 177 } 178 } 179 base += QUAD_BITS; 180 remaining = c; 181 } 182 } 183 } 184 throw std::runtime_error("cannot retrieve codepoint " + std::to_string(k) + " from a " 185 " set containing " + std::to_string(count()) + " codepoints"); 186 } 187 137 188 /**  * 138 189 * @brief size … … 140 191 UnicodeSet::size_type UnicodeSet::size() const { 141 192 return std::distance(begin(), end()); 193 } 194 195 /**  * 196 * @brief count 197 **  */ 198 UnicodeSet::size_type UnicodeSet::count() const { 199 size_type count = 0; 200 for (const run_t & r : mRuns) { 201 if (typeOf(r) == Full) { 202 count += lengthOf(r); 203 } 204 } 205 count *= QUAD_BITS; 206 for (const bitquad_t q : mQuads) { 207 count += popcount<bitquad_t>(q); 208 } 209 return count; 142 210 } 143 211 … … 186 254 if (typeOf(run) == Empty) { 187 255 out << "Empty(" << lengthOf(run) << ")\n"; 188 } 189 else if (typeOf(run) == Full) { 256 } else if (typeOf(run) == Full) { 190 257 out << "Full(" << lengthOf(run) << ")\n"; 191 } 192 else { 258 } else { 193 259 for (const auto qi_end = qi + lengthOf(run); qi != qi_end; ++qi) { 194 260 assert (qi != mQuads.cend()); … … 203 269 **  */ 204 270 UnicodeSet UnicodeSet::operator~() const { 205 std::vector<run_t> runs; 206 std::vector<bitquad_t> quads; 207 runs.reserve(mRuns.size()); 208 quads.reserve(mQuads.size()); 209 auto qi = quads.cbegin(); 210 for (const run_t & run : mRuns) { 211 if (typeOf(run) == Empty) { 212 append_run(Full, lengthOf(run), runs); 213 } 214 else if (typeOf(run) == Full) { 215 append_run(Empty, lengthOf(run), runs); 216 } 217 else { 218 for (const auto qi_end = qi + lengthOf(run); qi != qi_end; ++qi) { 219 assert (qi != quads.cend()); 220 append_quad(FULL_QUAD_MASK ^ *qi, quads, runs); 221 } 222 } 271 std::vector<run_t> runs(mRuns.size()); 272 auto ri = runs.begin(); 273 for (const auto run : mRuns) { 274 run_type_t type = Empty; 275 if (typeOf(run) == Empty) { 276 type = Full; 277 } else if (typeOf(run) == Full) { 278 type = Empty; 279 } else { 280 type = Mixed; 281 } 282 *ri++ = { type, lengthOf(run) }; 283 } 284 std::vector<bitquad_t> quads(mQuads.size()); 285 auto qi = quads.begin(); 286 for (const auto quad : mQuads) { 287 *qi++ = ~quad; 223 288 } 224 289 return UnicodeSet(std::move(runs), std::move(quads)); … … 233 298 const auto e1 = quad_end(); 234 299 const auto e2 = other.quad_end(); 235 for (auto i1 = quad_begin(), i2 = other.quad_begin(); i1 != e1 && i2 != e2; ) { 300 auto i1 = quad_begin(), i2 = other.quad_begin(); 301 while ((i1 != e1) && (i2 != e2)) { 236 302 const auto n = std::min(i1.length(), i2.length()); 237 303 if ((i1.type() == Full) && (i2.type() == Full)) { … … 239 305 i1 += n; 240 306 i2 += n; 241 } 242 else if ((i1.type() == Empty)  (i2.type() == Empty)) { 307 } else if ((i1.type() == Empty)  (i2.type() == Empty)) { 243 308 append_run(Empty, n, runs); 244 309 i1 += n; 245 310 i2 += n; 246 } 247 else if (i1.type() == Full) { 311 } else if (i1.type() == Full) { 248 312 for (unsigned i = 0; i != n; ++i, ++i2) { 249 313 append_quad(i2.quad(), quads, runs); 250 314 } 251 315 i1 += n; 252 } 253 else if (i2.type() == Full) { 316 } else if (i2.type() == Full) { 254 317 for (unsigned i = 0; i != n; ++i, ++i1) { 255 318 append_quad(i1.quad(), quads, runs); 256 319 } 257 320 i2 += n; 258 } 259 else { //both Mixed 321 } else { // both Mixed 260 322 for (unsigned i = 0; i != n; ++i, ++i1, ++i2) { 261 323 append_quad(i1.quad() & i2.quad(), quads, runs); … … 275 337 const auto e2 = other.quad_end(); 276 338 auto i1 = quad_begin(), i2 = other.quad_begin(); 277 for (; i1 != e1 && i2 != e2;) {339 while ((i1 != e1) && (i2 != e2)) { 278 340 const auto n = std::min(i1.length(), i2.length()); 279 341 if ((i1.type() == Empty) && (i2.type() == Empty)) { … … 295 357 } 296 358 i2 += n; 297 } else { 359 } else { // both Mixed 298 360 for (unsigned i = 0; i < n; ++i, ++i1, ++i2) { 299 361 append_quad(i1.quad()  i2.quad(), quads, runs); 300 362 } 363 } 364 } 365 // append any remaining blocks 366 if ((i1 != e1)  (i2 != e2)) { 367 assert ((i1 != e1) ^ (i2 != e2)); 368 auto i3 = (i1 != e1) ? i1 : i2; 369 const auto e3 = (i1 != e1) ? e1 : e2; 370 while (i3 != e3) { 371 const auto n = i3.length(); 372 if (i3.type() == Empty  i3.type() == Full) { 373 append_run(i3.type(), n, runs); 374 i3 += n; 375 } else { 376 for (unsigned i = 0; i != n; ++i3) { 377 append_quad(i3.quad(), quads, runs); 378 } 379 } 301 380 } 302 381 } … … 312 391 const auto e1 = quad_end(); 313 392 const auto e2 = other.quad_end(); 314 for (auto i1 = quad_begin(), i2 = other.quad_begin(); i1 != e1 && i2 != e2; ) { 393 auto i1 = quad_begin(), i2 = other.quad_begin(); 394 while ((i1 != e1) && (i2 != e2)) { 315 395 unsigned n = std::min(i1.length(), i2.length()); 316 if ((i1.type() == Empty)  (i2.type() == Full)  (i1.type() == Full && i2.type() == Empty)) { 396 if ((i1.type() == Empty)  (i2.type() == Full)) { 397 append_run(Empty, n, runs); 398 i1 += n; 399 i2 += n; 400 } else if (i1.type() == Full) { 401 if (i2.type() == Empty) { 402 append_run(Full, n, runs); 403 i2 += n; 404 } else { 405 for (unsigned i = 0; i != n; ++i, ++i2) { 406 append_quad(~i2.quad(), quads, runs); 407 } 408 } 409 i1 += n; 410 } else if (i2.type() == Empty) { 411 for (unsigned i = 0; i != n; ++i, ++i1) { 412 append_quad(i1.quad(), quads, runs); 413 } 414 i2 += n; 415 } else { 416 for (unsigned i = 0; i != n; ++i, ++i1, ++i2) { 417 append_quad(i1.quad() &~ i2.quad(), quads, runs); 418 } 419 } 420 } 421 while (i1 != e1) { 422 const auto n = i1.length(); 423 if (i1.type() == Empty  i1.type() == Full) { 317 424 append_run(i1.type(), n, runs); 318 425 i1 += n; 319 i2 += n; 320 } 321 else if (i1.type() == Full) { 322 for (unsigned i = 0; i != n; ++i, ++i2) { 323 append_quad(FULL_QUAD_MASK ^ i2.quad(), quads, runs); 324 } 325 i1 += n; 326 } 327 else if (i2.type() == Empty) { 328 for (unsigned i = 0; i != n; ++i, ++i1) { 426 } else { 427 for (unsigned i = 0; i != n; ++i1) { 329 428 append_quad(i1.quad(), quads, runs); 330 429 } 331 i2 += n; 332 } 333 else { 334 for (unsigned i = 0; i != n; ++i, ++i1, ++i2) { 335 append_quad(i1.quad() &~ i2.quad(), quads, runs); 336 } 337 } 430 } 338 431 } 339 432 return UnicodeSet(std::move(runs), std::move(quads)); … … 348 441 const auto e1 = quad_end(); 349 442 const auto e2 = other.quad_end(); 350 for (auto i1 = quad_begin(), i2 = other.quad_begin(); i1 != e1 && i2 != e2; ) { 443 auto i1 = quad_begin(), i2 = other.quad_begin(); 444 while ((i1 != e1) && (i2 != e2)) { 351 445 unsigned n = std::min(i1.length(), i2.length()); 352 446 if (i1.type() != Mixed && i2.type() != Mixed) { … … 366 460 } else if (i1.type() == Full) { 367 461 for (unsigned i = 0; i < n; ++i, ++i2) { 368 append_quad( FULL_QUAD_MASK ^i2.quad(), quads, runs);462 append_quad(~i2.quad(), quads, runs); 369 463 } 370 464 i1 += n; 371 465 } else if (i2.type() == Full) { 372 466 for (unsigned i = 0; i < n; ++i, ++i1) { 373 append_quad( FULL_QUAD_MASK ^i1.quad(), quads, runs);467 append_quad(~i1.quad(), quads, runs); 374 468 } 375 469 i2 += n; … … 378 472 append_quad(i1.quad() ^ i2.quad(), quads, runs); 379 473 } 474 } 475 } 476 // append any remaining blocks 477 if ((i1 != e1)  (i2 != e2)) { 478 assert ((i1 != e1) ^ (i2 != e2)); 479 auto i3 = (i1 != e1) ? i1 : i2; 480 const auto e3 = (i1 != e1) ? e1 : e2; 481 while (i3 != e3) { 482 const auto n = i3.length(); 483 if (i3.type() == Empty  i3.type() == Full) { 484 append_run(i3.type(), n, runs); 485 i3 += n; 486 } else { 487 for (unsigned i = 0; i != n; ++i3) { 488 append_quad(i3.quad(), quads, runs); 489 } 490 } 380 491 } 381 492 } 
icGREP/icgrepdevel/icgrep/UCD/unicode_set.h
r5727 r5739 105 105 bool full() const; // The set has the full set of possible Unicode codepoints. 106 106 107 codepoint_t at(const size_type k) const; // return the kth codepoint (or throw an error if it doesn't exist) 108 107 109 bool contains(const codepoint_t codepoint) const; 108 110 … … 115 117 void insert_range(const codepoint_t lo, const codepoint_t hi); 116 118 117 size_type size() const; 119 size_type size() const; // number of intervals in this set 120 121 size_type count() const; // number of codepoints in this set 118 122 119 123 interval_t front() const;
Note: See TracChangeset
for help on using the changeset viewer.