- Timestamp:
- Aug 8, 2018, 3:36:26 PM (7 months ago)
- Location:
- icGREP/icgrep-devel/icgrep/re/Unicode
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
icGREP/icgrep-devel/icgrep/re/Unicode/decomposition.cpp
r6139 r6141 245 245 return re; 246 246 } 247 248 /* Reordering of Grapheme Clusters */ 249 250 /* Extract the next grapheme cluster at a given position within a sequence. Starting at `position`, consecutive singleton CC elements of s are collected as codepoints. Collection stops at the end of s, at a non-CC element, at an empty or non-singleton CC, or (after the first codepoint) at a starter with ccc == 0. The result is empty when the element at `position` itself does not qualify. */ 251 252 std::u32string getCluster(Seq * s, unsigned position) { 253 unsigned pos = position; 254 unsigned size = s->size(); 255 std::u32string rslt; 256 while ((pos < size) && isa<CC>((*s)[pos])) { 257 CC * cc = cast<CC>((*s)[pos]); 258 if (cc->empty()) return rslt; 259 codepoint_t lo = lo_codepoint(cc->front()); 260 codepoint_t hi = hi_codepoint(cc->back()); 261 if (lo != hi) // not a singleton CC; end of the cluster. 262 return rslt; 263 if (pos > position) { 264 // After the first codepoint of a cluster, all remaining codepoints 265 // must have ccc > 0. Terminate the cluster when any starter 266 // (codepoint with ccc==0) is found. 267 const auto & cccObj = cast<EnumeratedPropertyObject>(property_object_table[ccc]); 268 const UnicodeSet & ccc0 = cccObj->GetCodepointSet(CCC_ns::NR); 269 if (ccc0.contains(lo)) return rslt; 270 } 271 rslt.push_back(lo); 272 pos++; 273 } 274 return rslt; 275 } 276 277 /* Helper function to insert a given mark at all possible positions within 278 a set of prefixes, subject to constraints on Unicode canonically-equivalent 279 ordering. For each prefix the mark is first appended at the end; it is then also inserted before each trailing codepoint, scanning right to left, until a codepoint is met that lies in the set for mark's own ccc value or in the ccc = 0 set, or until the start of the prefix has been passed. Every string produced this way is returned. */ 280 std::vector<std::u32string> allReorderedInsertions(std::vector<std::u32string> prefixes, codepoint_t mark) { 281 const auto & cccObj = cast<EnumeratedPropertyObject>(property_object_table[ccc]); 282 const UnicodeSet & cccSet = cccObj->GetCodepointSet(cccObj->GetEnumerationValue(mark)); 283 const UnicodeSet & cc0Set = cccObj->GetCodepointSet(CCC_ns::NR); // ccc = 0, NotReorderable. 
284 const UnicodeSet insertBeforeBlocked = cccSet + cc0Set; 285 std::vector<std::u32string> reorderings; 286 for (auto & prefix : prefixes) { 287 reorderings.push_back(prefix + (char32_t) mark); 288 /* Signed index: the scan below ends either at a blocking codepoint or once the index drops below 0. */ int insert_pos = prefix.size() - 1; 289 while ((insert_pos >= 0) && (!insertBeforeBlocked.contains(prefix[insert_pos]))) { 290 reorderings.push_back(prefix.substr(0, insert_pos) + (char32_t) mark + prefix.substr(insert_pos)); 291 insert_pos--; 292 } 293 } 294 return reorderings; 295 } 296 297 /* Build the alternatives for one cluster. The list of orderings is seeded with the first codepoint of the cluster; each following codepoint is then added through allReorderedInsertions. Every resulting string is converted with u32string2re and the results are combined with makeAlt. Callers in this file pass only non-empty clusters. */ RE * allClusterOrderings(std::u32string cluster) { 298 std::vector<std::u32string> orderings = {cluster.substr(0,1)}; 299 for (unsigned i = 1; i < cluster.size(); i++) { 300 orderings = allReorderedInsertions(orderings, cluster[i]); 301 } 302 std::vector<RE *> alts; 303 for (auto a : orderings) { 304 alts.push_back(u32string2re(a)); 305 } 306 return makeAlt(alts.begin(), alts.end()); 307 } 308 309 /* Recursively rebuild re with cluster reorderings added. Within a Seq, each run of elements recognised by getCluster is replaced by allClusterOrderings of that run, and every other element is processed recursively. A CC is returned unchanged. Alt, Assertion, Rep, Diff, Intersect, Range and Group nodes are rebuilt from their recursively processed operands. Any other node is returned as is. */ RE * allOrderings_RE(RE * re) { 310 if (Alt * alt = dyn_cast<Alt>(re)) { 311 std::vector<RE *> list; 312 list.reserve(alt->size()); 313 for (RE * a : *alt) { 314 list.push_back(allOrderings_RE(a)); 315 } 316 return makeAlt(list.begin(), list.end()); 317 } else if (CC * cc = dyn_cast<CC>(re)) { 318 return cc; 319 } else if (Seq * seq = dyn_cast<Seq>(re)) { 320 // find and process all string pieces 321 std::vector<RE *> list; 322 unsigned size = seq->size(); 323 unsigned i = 0; 324 while (i < size) { 325 std::u32string cluster = getCluster(seq, i); 326 if (cluster.size() > 0) { 327 list.push_back(allClusterOrderings(cluster)); 328 i += cluster.size(); 329 } else { 330 list.push_back(allOrderings_RE((*seq)[i])); 331 i++; 332 } 333 } 334 return makeSeq(list.begin(), list.end()); 335 } else if (Assertion * a = dyn_cast<Assertion>(re)) { 336 return makeAssertion(allOrderings_RE(a->getAsserted()), a->getKind(), a->getSense()); 337 } else if (Rep * rep = dyn_cast<Rep>(re)) { 338 RE * expr = allOrderings_RE(rep->getRE()); 339 return makeRep(expr, rep->getLB(), rep->getUB()); 340 } else if (Diff * 
diff = dyn_cast<Diff>(re)) { 341 return makeDiff(allOrderings_RE(diff->getLH()), allOrderings_RE(diff->getRH())); 342 } else if (Intersect * e = dyn_cast<Intersect>(re)) { 343 return makeIntersect(allOrderings_RE(e->getLH()), allOrderings_RE(e->getRH())); 344 } else if (Range * rg = dyn_cast<Range>(re)) { 345 return makeRange(allOrderings_RE(rg->getLo()), allOrderings_RE(rg->getHi())); 346 } else if (Group * g = dyn_cast<Group>(re)) { 347 return makeGroup(g->getMode(), allOrderings_RE(g->getRE()), g->getSense()); 348 } 349 return re; 350 } 351 -
icGREP/icgrep-devel/icgrep/re/Unicode/decomposition.h
r6138 r6141 16 16 string (expanding decomposition). In general, the result is 17 17 a set of alternatives consisting of sequences for each expanding 18 decomposition as well as a single character class for all the19 single dton decompositions as well as the codepoints that map to18 decomposition plus a single character class for all the 19 singleton decompositions as well as the codepoints that map to 20 20 themselves. 21 21 */ … … 36 36 re::RE * Casefold_RE(re::RE * r); 37 37 38 /* For every decomposed sequence in RE, add alternatives for all 39 canonically equivalent reorderings according to Unicode rules. */ 40 re::RE * allOrderings_RE(re::RE * re); 41 42 38 43 #endif
Note: See TracChangeset
for help on using the changeset viewer.