Changeset 6141


Ignore:
Timestamp:
Aug 8, 2018, 3:36:26 PM (9 days ago)
Author:
cameron
Message:

Support for searching grapheme clusters in all possible reorderings

Location:
icGREP/icgrep-devel/icgrep/re/Unicode
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/re/Unicode/decomposition.cpp

    r6139 r6141  
    245245    return re;
    246246}
     247
     248/* Reordering of Grapheme Clusters */
     249
     250/* Extract the next grapheme cluster at a given position within a sequence. */
     251
     252std::u32string getCluster(Seq * s, unsigned position) {
     253    unsigned pos = position;
     254    unsigned size = s->size();
     255    std::u32string rslt;
     256    while ((pos < size) && isa<CC>((*s)[pos])) {
     257        CC * cc = cast<CC>((*s)[pos]);
     258        if (cc->empty()) return rslt;
     259        codepoint_t lo = lo_codepoint(cc->front());
     260        codepoint_t hi = hi_codepoint(cc->back());
     261        if (lo != hi) // not a singleton CC; end of the cluster.
     262            return rslt;
     263        if (pos > position) {
     264            // After the first codepoint of a cluster, all remaining codepoints
     265            // must have ccc > 0.   Terminate the cluster when any starter
     266            // (codepoint with ccc==0) is found.
     267            const auto & cccObj = cast<EnumeratedPropertyObject>(property_object_table[ccc]);
     268            const UnicodeSet & ccc0 = cccObj->GetCodepointSet(CCC_ns::NR);
     269            if (ccc0.contains(lo)) return rslt;
     270        }
     271        rslt.push_back(lo);
     272        pos++;
     273    }
     274    return rslt;
     275}
     276
     277/*  Helper function to insert a given mark at all possible positions within
     278    a set of prefixes, subject to constraints on Unicode canonically-equivalent
     279    ordering. */
     280std::vector<std::u32string> allReorderedInsertions(std::vector<std::u32string> prefixes, codepoint_t mark) {
     281    const auto & cccObj = cast<EnumeratedPropertyObject>(property_object_table[ccc]);
     282    const UnicodeSet & cccSet = cccObj->GetCodepointSet(cccObj->GetEnumerationValue(mark));
     283    const UnicodeSet & cc0Set = cccObj->GetCodepointSet(CCC_ns::NR); // ccc = 0, NotReorderable.
     284    const UnicodeSet insertBeforeBlocked = cccSet + cc0Set;
     285    std::vector<std::u32string> reorderings;
     286    for (auto & prefix : prefixes) {
     287        reorderings.push_back(prefix + (char32_t) mark);
     288        int insert_pos = prefix.size() - 1;
     289        while ((insert_pos >= 0) && (!insertBeforeBlocked.contains(prefix[insert_pos]))) {
     290            reorderings.push_back(prefix.substr(0, insert_pos) + (char32_t) mark + prefix.substr(insert_pos));
     291            insert_pos--;
     292        }
     293    }
     294    return reorderings;
     295}
     296
     297RE * allClusterOrderings(std::u32string cluster) {
     298    std::vector<std::u32string> orderings = {cluster.substr(0,1)};
     299    for (unsigned i = 1; i < cluster.size(); i++) {
     300        orderings = allReorderedInsertions(orderings, cluster[i]);
     301    }
     302    std::vector<RE *> alts;
     303    for (auto a : orderings) {
     304        alts.push_back(u32string2re(a));
     305    }
     306    return makeAlt(alts.begin(), alts.end());
     307}
     308
     309RE * allOrderings_RE(RE * re) {
     310    if (Alt * alt = dyn_cast<Alt>(re)) {
     311        std::vector<RE *> list;
     312        list.reserve(alt->size());
     313        for (RE * a : *alt) {
     314            list.push_back(allOrderings_RE(a));
     315        }
     316        return makeAlt(list.begin(), list.end());
     317    } else if (CC * cc = dyn_cast<CC>(re)) {
     318        return cc;
     319    } else if (Seq * seq = dyn_cast<Seq>(re)) {
     320        // find and process all string pieces
     321        std::vector<RE *> list;
     322        unsigned size = seq->size();
     323        unsigned i = 0;
     324        while (i < size) {
     325            std::u32string cluster = getCluster(seq, i);
     326            if (cluster.size() > 0) {
     327                list.push_back(allClusterOrderings(cluster));
     328                i += cluster.size();
     329            } else {
     330                list.push_back(allOrderings_RE((*seq)[i]));
     331                i++;
     332            }
     333        }
     334        return makeSeq(list.begin(), list.end());
     335    } else if (Assertion * a = dyn_cast<Assertion>(re)) {
     336        return makeAssertion(allOrderings_RE(a->getAsserted()), a->getKind(), a->getSense());
     337    } else if (Rep * rep = dyn_cast<Rep>(re)) {
     338        RE * expr = allOrderings_RE(rep->getRE());
     339        return makeRep(expr, rep->getLB(), rep->getUB());
     340    } else if (Diff * diff = dyn_cast<Diff>(re)) {
     341        return makeDiff(allOrderings_RE(diff->getLH()), allOrderings_RE(diff->getRH()));
     342    } else if (Intersect * e = dyn_cast<Intersect>(re)) {
     343        return makeIntersect(allOrderings_RE(e->getLH()), allOrderings_RE(e->getRH()));
     344    } else if (Range * rg = dyn_cast<Range>(re)) {
     345        return makeRange(allOrderings_RE(rg->getLo()), allOrderings_RE(rg->getHi()));
     346    } else if (Group * g = dyn_cast<Group>(re)) {
     347        return makeGroup(g->getMode(), allOrderings_RE(g->getRE()), g->getSense());
     348    }
     349    return re;
     350}
     351
  • icGREP/icgrep-devel/icgrep/re/Unicode/decomposition.h

    r6138 r6141  
    1616    string (expanding decomposition).   In general, the result is
    1717    a set of alternatives consisting of sequences for each expanding
    18     decomposition as well as a single character class for all the
    19     singledton decompositions as well as the codepoints that map to
     18    decomposition plus a single character class for all the
     19    singleton decompositions as well as the codepoints that map to
    2020    themselves.
    2121*/
     
    3636re::RE * Casefold_RE(re::RE * r);
    3737
     38/* For every decomposed sequence in RE, add alternatives for all
     39   canonically equivalent reorderings according to Unicode rules. */
     40re::RE * allOrderings_RE(re::RE * re);
     41
     42
    3843#endif
Note: See TracChangeset for help on using the changeset viewer.