Changeset 6172


Ignore:
Timestamp:
Oct 6, 2018, 8:27:36 AM (2 months ago)
Author:
cameron
Message:

NFD Transformer

Location:
icGREP/icgrep-devel/icgrep/re/Unicode
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/re/Unicode/decomposition.cpp

    r6146 r6172  
    2525#include <llvm/Support/Casting.h>
    2626
    27 using namespace UCD;
    2827using namespace llvm;
    2928using namespace re;
    3029
     30namespace UCD {
     31   
    3132// Constants for computation of Hangul decompositions, see Unicode Standard, section 3.12.
    3233const codepoint_t Hangul_SBase = 0xAC00;
    3334const codepoint_t Hangul_LBase = 0x1100;
     35//const codepoint_t Hangul_LMax = 0x1112;
    3436const codepoint_t Hangul_VBase = 0x1161;
     37//const codepoint_t Hangul_VMax = 0x1175;
    3538const codepoint_t Hangul_TBase = 0x11A7;
     39//const codepoint_t Hangul_TMax = 0x11C2;
    3640const unsigned Hangul_TCount = 28;
    3741const unsigned Hangul_NCount = 588;
    3842const unsigned Hangul_SCount = 11172;
    39 static UnicodeSet HangulPrecomposed = UnicodeSet(Hangul_SBase, Hangul_SBase + Hangul_SCount - 1);
    40 
    41 static RE * HangulDecomposition(codepoint_t cp) {
    42     auto SIndex = cp - Hangul_SBase;
    43     auto LIndex = SIndex / Hangul_NCount;
    44     auto VIndex = (SIndex % Hangul_NCount) / Hangul_TCount;
    45     auto TIndex = SIndex % Hangul_TCount;
    46     auto L = makeCC(Hangul_LBase + LIndex);
    47     auto V = makeCC(Hangul_VBase + VIndex);
    48     if (TIndex > 0) {
    49         return makeSeq({L, V, makeCC(Hangul_TBase + TIndex)});
    50     } else {
    51         return makeSeq({L, V});
    52     }
    53 }
    54 
    55 RE * NFD_CC(CC * cc) {
    56     std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
    57     const auto & decompMappingObj = cast<StringPropertyObject>(property_object_table[dm]);
    58     const auto & decompTypeObj = cast<EnumeratedPropertyObject>(property_object_table[dt]);
    59     UnicodeSet canonicalMapped = decompTypeObj->GetCodepointSet(DT_ns::Can);
    60     UnicodeSet mappingRequired = *cc & (canonicalMapped + HangulPrecomposed);
    61     if (mappingRequired.empty()) return cc;
    62     std::vector<RE *> alts;
    63     CC * finalCC = makeCC(*cc - mappingRequired);
    64     for (const interval_t & i : mappingRequired) {
    65         for (codepoint_t cp = lo_codepoint(i); cp <= hi_codepoint(i); cp++) {
    66             if (HangulPrecomposed.contains(cp)) {
    67                 alts.push_back(HangulDecomposition(cp));
    68             } else {
    69                 std::u32string dms = conv.from_bytes(decompMappingObj->GetStringValue(cp));
    70                 RE * dm = NFD_RE(u32string2re(dms));
    71                 if (CC * nfd_cc = dyn_cast<CC>(dm)) {
    72                     finalCC = makeCC(finalCC, nfd_cc);
    73                 } else if (Seq * s = dyn_cast<Seq>(dm)) {
    74                     if (s->size() == 1) {
    75                         finalCC = makeCC(finalCC, cast<CC>(s->front()));
    76                     } else {
    77                         alts.push_back(s);
    78                     }
    79                 } else {
    80                     alts.push_back(dm);
    81                 }
    82             }
    83         }
    84     }
    85     if (!finalCC->empty()) alts.push_back(finalCC);
    86     return makeAlt(alts.begin(), alts.end());
    87 }
    88 
    89 
    90 RE * NFKD_CC(CC * cc) {
    91     std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
    92     const auto & decompMappingObj = cast<StringPropertyObject>(property_object_table[dm]);
    93     UnicodeSet reflexiveSet = decompMappingObj->GetReflexiveSet() - HangulPrecomposed;
    94     UnicodeSet mappingRequired = *cc - reflexiveSet;
    95     if (mappingRequired.empty()) return cc;
    96     std::vector<RE *> alts;
    97     CC * finalCC = makeCC(*cc - mappingRequired);
    98     for (const interval_t & i : mappingRequired) {
    99         for (codepoint_t cp = lo_codepoint(i); cp <= hi_codepoint(i); cp++) {
    100             if (HangulPrecomposed.contains(cp)) {
    101                 alts.push_back(HangulDecomposition(cp));
    102             } else {
    103                 std::u32string dms = conv.from_bytes(decompMappingObj->GetStringValue(cp));
    104                 RE * dm = NFKD_RE(u32string2re(dms));
    105                 if (CC * nfkd_cc = dyn_cast<CC>(dm)) {
    106                     finalCC = makeCC(finalCC, nfkd_cc);
    107                 } else if (Seq * s = dyn_cast<Seq>(dm)) {
    108                     if (s->size() == 1) {
    109                         finalCC = makeCC(finalCC, cast<CC>(s->front()));
    110                     } else {
    111                         alts.push_back(s);
    112                     }
    113                 } else {
    114                     alts.push_back(dm);
    115                 }
    116             }
    117         }
    118     }
    119     if (!finalCC->empty()) alts.push_back(finalCC);
    120     return makeAlt(alts.begin(), alts.end());
    121 }
    122 
    123 RE * Casefold_CC(CC * cc) {
    124     std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
    125     const auto & caseFoldObj = cast<StringOverridePropertyObject>(property_object_table[cf]);
    126     UnicodeSet reflexiveSet = caseFoldObj->GetReflexiveSet();
    127     UnicodeSet foldingRequired = *cc - reflexiveSet;
    128     if (foldingRequired.empty()) return cc;
    129     std::vector<RE *> alts;
    130     CC * finalCC = makeCC(*cc - foldingRequired);
    131     for (const interval_t & i : foldingRequired) {
    132         for (codepoint_t cp = lo_codepoint(i); cp <= hi_codepoint(i); cp++) {
    133             std::u32string dms = conv.from_bytes(caseFoldObj->GetStringValue(cp));
    134             RE * dm = NFD_RE(u32string2re(dms));
    135             if (CC * nfd_cc = dyn_cast<CC>(dm)) {
    136                 finalCC = makeCC(finalCC, nfd_cc);
    137             } else if (Seq * s = dyn_cast<Seq>(dm)) {
    138                 if (s->size() == 1) {
    139                     finalCC = makeCC(finalCC, cast<CC>(s->front()));
    140                 } else {
    141                     alts.push_back(s);
    142                 }
    143             } else {
    144                 alts.push_back(dm);
    145             }
    146         }
    147     }
    148     if (!finalCC->empty()) alts.push_back(finalCC);
    149     return makeAlt(alts.begin(), alts.end());
    150 }
    151 
    152 RE * NFD_RE(RE * re) {
    153     if (Alt * alt = dyn_cast<Alt>(re)) {
    154         std::vector<RE *> list;
    155         list.reserve(alt->size());
    156         for (RE * re : *alt) {
    157             list.push_back(NFD_RE(re));
    158         }
    159         return makeAlt(list.begin(), list.end());
    160     } else if (CC * cc = dyn_cast<CC>(re)) {
    161         return NFD_CC(cc);
    162     } else if (Seq * seq = dyn_cast<Seq>(re)) {
    163         std::vector<RE *> list;
    164         list.reserve(seq->size());
    165         for (RE * re : *seq) {
    166             list.push_back(NFD_RE(re));
    167         }
    168         return makeSeq(list.begin(), list.end());
    169     } else if (Assertion * a = dyn_cast<Assertion>(re)) {
    170         return makeAssertion(NFD_RE(a->getAsserted()), a->getKind(), a->getSense());
    171     } else if (Rep * rep = dyn_cast<Rep>(re)) {
    172         RE * expr = NFD_RE(rep->getRE());
    173         return makeRep(expr, rep->getLB(), rep->getUB());
    174     } else if (Diff * diff = dyn_cast<Diff>(re)) {
    175         return makeDiff(NFD_RE(diff->getLH()), NFD_RE(diff->getRH()));
    176     } else if (Intersect * e = dyn_cast<Intersect>(re)) {
    177         return makeIntersect(NFD_RE(e->getLH()), NFD_RE(e->getRH()));
    178     } else if (Range * rg = dyn_cast<Range>(re)) {
    179         return makeRange(NFD_RE(rg->getLo()), NFD_RE(rg->getHi()));
    180     } else if (Group * g = dyn_cast<Group>(re)) {
    181         return makeGroup(g->getMode(), NFD_RE(g->getRE()), g->getSense());
    182     }
    183     return re;
    184 }
    185    
    186 RE * NFKD_RE(RE * re) {
    187     if (Alt * alt = dyn_cast<Alt>(re)) {
    188         std::vector<RE *> list;
    189         list.reserve(alt->size());
    190         for (RE * re : *alt) {
    191             list.push_back(NFKD_RE(re));
    192         }
    193         return makeAlt(list.begin(), list.end());
    194     } else if (CC * cc = dyn_cast<CC>(re)) {
    195         return NFKD_CC(cc);
    196     } else if (Seq * seq = dyn_cast<Seq>(re)) {
    197         std::vector<RE *> list;
    198         list.reserve(seq->size());
    199         for (RE * re : *seq) {
    200             list.push_back(NFKD_RE(re));
    201         }
    202         return makeSeq(list.begin(), list.end());
    203     } else if (Assertion * a = dyn_cast<Assertion>(re)) {
    204         return makeAssertion(NFKD_RE(a->getAsserted()), a->getKind(), a->getSense());
    205     } else if (Rep * rep = dyn_cast<Rep>(re)) {
    206         RE * expr = NFKD_RE(rep->getRE());
    207         return makeRep(expr, rep->getLB(), rep->getUB());
    208     } else if (Diff * diff = dyn_cast<Diff>(re)) {
    209         return makeDiff(NFKD_RE(diff->getLH()), NFKD_RE(diff->getRH()));
    210     } else if (Intersect * e = dyn_cast<Intersect>(re)) {
    211         return makeIntersect(NFKD_RE(e->getLH()), NFKD_RE(e->getRH()));
    212     } else if (Range * rg = dyn_cast<Range>(re)) {
    213         return makeRange(NFKD_RE(rg->getLo()), NFKD_RE(rg->getHi()));
    214     } else if (Group * g = dyn_cast<Group>(re)) {
    215         return makeGroup(g->getMode(), NFKD_RE(g->getRE()), g->getSense());
    216     }
    217     return re;
    218 }
    219 
    220 RE * Casefold_RE(RE * re) {
    221     if (Alt * alt = dyn_cast<Alt>(re)) {
    222         std::vector<RE *> list;
    223         list.reserve(alt->size());
    224         for (RE * re : *alt) {
    225             list.push_back(Casefold_RE(re));
    226         }
    227         return makeAlt(list.begin(), list.end());
    228     } else if (CC * cc = dyn_cast<CC>(re)) {
    229         return Casefold_CC(cc);
    230     } else if (Seq * seq = dyn_cast<Seq>(re)) {
    231         std::vector<RE *> list;
    232         list.reserve(seq->size());
    233         for (RE * re : *seq) {
    234             list.push_back(Casefold_RE(re));
    235         }
    236         return makeSeq(list.begin(), list.end());
    237     } else if (Assertion * a = dyn_cast<Assertion>(re)) {
    238         return makeAssertion(Casefold_RE(a->getAsserted()), a->getKind(), a->getSense());
    239     } else if (Rep * rep = dyn_cast<Rep>(re)) {
    240         RE * expr = Casefold_RE(rep->getRE());
    241         return makeRep(expr, rep->getLB(), rep->getUB());
    242     } else if (Diff * diff = dyn_cast<Diff>(re)) {
    243         return makeDiff(Casefold_RE(diff->getLH()), Casefold_RE(diff->getRH()));
    244     } else if (Intersect * e = dyn_cast<Intersect>(re)) {
    245         return makeIntersect(Casefold_RE(e->getLH()), Casefold_RE(e->getRH()));
    246     } else if (Range * rg = dyn_cast<Range>(re)) {
    247         return makeRange(Casefold_RE(rg->getLo()), Casefold_RE(rg->getHi()));
    248     } else if (Group * g = dyn_cast<Group>(re)) {
    249         return makeGroup(g->getMode(), Casefold_RE(g->getRE()), g->getSense());
    250     }
    251     return re;
    252 }
    253 
    254 /* Reordering of Grapheme Clusters */
    255 
    256 /* Extract the next grapheme cluster at a given position within a sequence. */
    257 
    258 std::u32string getCluster(Seq * s, unsigned position) {
     43
     44static inline std::u32string getStringPiece(Seq * s, unsigned position) {
    25945    unsigned pos = position;
    26046    unsigned size = s->size();
     
    26551        codepoint_t lo = lo_codepoint(cc->front());
    26652        codepoint_t hi = hi_codepoint(cc->back());
    267         if (lo != hi) // not a singleton CC; end of the cluster.
     53        if (lo != hi) // not a singleton CC; end of the string piece.
    26854            return rslt;
    269         if (pos > position) {
    270             // After the first codepoint of a cluster, all remaining codepoints
    271             // must have ccc > 0.   Terminate the cluster when any starter
    272             // (codepoint with ccc==0) is found.
    273             const auto & cccObj = cast<EnumeratedPropertyObject>(property_object_table[ccc]);
    274             const UnicodeSet & ccc0 = cccObj->GetCodepointSet(CCC_ns::NR);
    275             if (ccc0.contains(lo)) return rslt;
    276         }
    27755        rslt.push_back(lo);
    27856        pos++;
     
    28058    return rslt;
    28159}
    282 
    283 /*  Helper function to insert a given mark at all possible positions within
    284     a set of prefixes, subject to constraints on Unicode canonically-equivalent
    285     ordering. */
    286 std::vector<std::u32string> allReorderedInsertions(std::vector<std::u32string> prefixes, codepoint_t mark) {
    287     const auto & cccObj = cast<EnumeratedPropertyObject>(property_object_table[ccc]);
    288     const UnicodeSet & cccSet = cccObj->GetCodepointSet(cccObj->GetEnumerationValue(mark));
    289     const UnicodeSet & cc0Set = cccObj->GetCodepointSet(CCC_ns::NR); // ccc = 0, NotReorderable.
    290     const UnicodeSet insertBeforeBlocked = cccSet + cc0Set;
    291     std::vector<std::u32string> reorderings;
    292     for (auto & prefix : prefixes) {
    293         reorderings.push_back(prefix + (char32_t) mark);
    294         int insert_pos = prefix.size() - 1;
    295         while ((insert_pos >= 0) && (!insertBeforeBlocked.contains(prefix[insert_pos]))) {
    296             reorderings.push_back(prefix.substr(0, insert_pos) + (char32_t) mark + prefix.substr(insert_pos));
    297             insert_pos--;
    298         }
    299     }
    300     return reorderings;
    301 }
    302 
    303 RE * allClusterOrderings(std::u32string cluster) {
    304     std::vector<std::u32string> orderings = {cluster.substr(0,1)};
    305     for (unsigned i = 1; i < cluster.size(); i++) {
    306         orderings = allReorderedInsertions(orderings, cluster[i]);
    307     }
     60   
     61NFD_Transformer::NFD_Transformer(DecompositionOptions opt) :
     62    RE_Transformer("toNFD"),
     63    mOptions(opt),
     64    decompTypeObj(cast<EnumeratedPropertyObject>(property_object_table[dt])),
     65    decompMappingObj(cast<StringPropertyObject>(property_object_table[dm])),
     66    cccObj(cast<EnumeratedPropertyObject>(property_object_table[ccc])),
     67    caseFoldObj(cast<StringOverridePropertyObject>(property_object_table[cf])),
     68    canonicalMapped(decompTypeObj->GetCodepointSet(DT_ns::Can)),
     69    cc0Set(cccObj->GetCodepointSet(CCC_ns::NR)),
     70    selfNFKD(decompMappingObj->GetReflexiveSet()),
     71    selfCaseFold(caseFoldObj->GetReflexiveSet())
     72{}
     73
     74static UnicodeSet HangulPrecomposed = UnicodeSet(Hangul_SBase, Hangul_SBase + Hangul_SCount - 1);
     75
     76bool hasOption(enum DecompositionOptions optionSet, enum DecompositionOptions testOption) {
     77    return (testOption & optionSet) != 0;
     78}
     79   
     80bool NFD_Transformer::reordering_needed(std::u32string & prefix, codepoint_t suffix_cp) {
     81    if (prefix.empty()) return false;
     82    if (cc0Set.contains(suffix_cp)) return false;
     83    auto cc1 = cccObj->GetEnumerationValue(prefix.back());
     84    auto cc2 = cccObj->GetEnumerationValue(suffix_cp);
     85    return cc1 > cc2;
     86}
     87
     88void NFD_Transformer::NFD_append1(std::u32string & NFD_string, codepoint_t cp) {
     89    if (HangulPrecomposed.contains(cp)) {
     90        // Apply NFD normalization; no NFKD or casefolding required
     91        auto SIndex = cp - Hangul_SBase;
     92        auto LIndex = SIndex / Hangul_NCount;
     93        auto VIndex = (SIndex % Hangul_NCount) / Hangul_TCount;
     94        auto TIndex = SIndex % Hangul_TCount;
     95        NFD_string.push_back(Hangul_LBase + LIndex);
     96        NFD_string.push_back(Hangul_VBase + VIndex);
     97        if (TIndex > 0) {
     98            NFD_string.push_back(Hangul_TBase + TIndex);
     99        }
     100    } else if (canonicalMapped.contains(cp)) {
     101        std::string u8decomposed = decompMappingObj->GetStringValue(cp);
     102        std::u32string dms = conv.from_bytes(u8decomposed);
     103        // Recursive normalization may be necessary.
     104        NFD_append(NFD_string, dms);
     105        // After canonical mappings are handled, canonical ordering may be required.
     106        // This should be done before casefolding.
     107    } else if (reordering_needed(NFD_string, cp)) {
     108        // Reorder the last two characters - recursion will handle
     109        // rare multiposition reordering.
     110        std::u32string reordered({cp, NFD_string.back()});
     111        NFD_string.pop_back();
     112        NFD_append(NFD_string, reordered);
     113    } else if (hasOption(mOptions, UCD::CaseFold) && !selfCaseFold.contains(cp)) {
     114        std::u32string dms = conv.from_bytes(caseFoldObj->GetStringValue(cp));
     115        NFD_append(NFD_string, dms);
     116    } else if (hasOption(mOptions, UCD::NFKD) && (!selfNFKD.contains(cp))) {
     117        std::u32string dms = conv.from_bytes(decompMappingObj->GetStringValue(cp));
     118        NFD_append(NFD_string, dms);
     119    } else {
     120        NFD_string.push_back(cp);
     121    }
     122}
     123
     124void NFD_Transformer::NFD_append(std::u32string & NFD_string, std::u32string & to_convert) {
     125    for (unsigned i = 0; i < to_convert.size(); i++) {
     126        NFD_append1(NFD_string, to_convert[i]);
     127    }
     128}
     129
     130RE * NFD_Transformer::transformGroup(Group * g) {
     131    re::Group::Mode mode = g->getMode();
     132    re::Group::Sense sense = g->getSense();
     133    auto r = g->getRE();
     134    UCD::DecompositionOptions saveOptions = mOptions;
     135    if (mode == re::Group::Mode::CaseInsensitiveMode) {
     136        if (sense == re::Group::Sense::On) {
     137            mOptions = static_cast<UCD::DecompositionOptions>(mOptions | UCD::CaseFold);
     138        } else {
     139            mOptions = static_cast<UCD::DecompositionOptions>(mOptions & ~UCD::CaseFold);
     140        }
     141    } else if (mode == re::Group::Mode::CompatibilityMode) {
     142        if (sense == re::Group::Sense::On) {
     143            mOptions = static_cast<UCD::DecompositionOptions>(mOptions | UCD::NFKD);
     144        } else {
     145            mOptions = static_cast<UCD::DecompositionOptions>(mOptions & ~UCD::NFKD);
     146        }
     147    }
     148    RE * t = transform(r);
     149    mOptions = saveOptions;
     150    if (t == r) return g;
     151    return makeGroup(mode, t, sense);
     152   
     153}
     154
     155RE * NFD_Transformer::transformCC(CC * cc) {
     156    UnicodeSet mappingRequired = *cc & (canonicalMapped + HangulPrecomposed);
     157    if (hasOption(mOptions, UCD::CaseFold)) {
     158        mappingRequired = mappingRequired + (*cc - selfCaseFold);
     159    }
     160    if (hasOption(mOptions, UCD::NFKD)) {
     161        mappingRequired = mappingRequired + (*cc - selfNFKD);
     162    }
     163    if (mappingRequired.empty()) return cc;
    308164    std::vector<RE *> alts;
    309     for (auto a : orderings) {
    310         alts.push_back(u32string2re(a));
    311     }
     165    CC * finalCC = makeCC(*cc - mappingRequired);
     166    for (const interval_t & i : mappingRequired) {
     167        for (codepoint_t cp = lo_codepoint(i); cp <= hi_codepoint(i); cp++) {
     168            std::u32string decomp;
     169            NFD_append1(decomp, cp);
     170            if (decomp.size() == 1) {
     171                finalCC = makeCC(finalCC, makeCC(decomp[0]));
     172            } else {
     173                alts.push_back(u32string2re(decomp));
     174            }
     175        }
     176    }
     177    if (!finalCC->empty()) alts.push_back(finalCC);
    312178    return makeAlt(alts.begin(), alts.end());
    313179}
    314180
    315 RE * allOrderings_RE(RE * re) {
    316     if (Alt * alt = dyn_cast<Alt>(re)) {
    317         std::vector<RE *> list;
    318         list.reserve(alt->size());
    319         for (RE * a : *alt) {
    320             list.push_back(allOrderings_RE(a));
    321         }
    322         return makeAlt(list.begin(), list.end());
    323     } else if (CC * cc = dyn_cast<CC>(re)) {
    324         return cc;
    325     } else if (Seq * seq = dyn_cast<Seq>(re)) {
    326         // find and process all string pieces
    327         std::vector<RE *> list;
    328         unsigned size = seq->size();
    329         unsigned i = 0;
    330         while (i < size) {
    331             std::u32string cluster = getCluster(seq, i);
    332             if (cluster.size() > 0) {
    333                 list.push_back(allClusterOrderings(cluster));
    334                 i += cluster.size();
    335             } else {
    336                 list.push_back(allOrderings_RE((*seq)[i]));
    337                 i++;
    338             }
    339         }
    340         return makeSeq(list.begin(), list.end());
    341     } else if (Assertion * a = dyn_cast<Assertion>(re)) {
    342         return makeAssertion(allOrderings_RE(a->getAsserted()), a->getKind(), a->getSense());
    343     } else if (Rep * rep = dyn_cast<Rep>(re)) {
    344         RE * expr = allOrderings_RE(rep->getRE());
    345         return makeRep(expr, rep->getLB(), rep->getUB());
    346     } else if (Diff * diff = dyn_cast<Diff>(re)) {
    347         return makeDiff(allOrderings_RE(diff->getLH()), allOrderings_RE(diff->getRH()));
    348     } else if (Intersect * e = dyn_cast<Intersect>(re)) {
    349         return makeIntersect(allOrderings_RE(e->getLH()), allOrderings_RE(e->getRH()));
    350     } else if (Range * rg = dyn_cast<Range>(re)) {
    351         return makeRange(allOrderings_RE(rg->getLo()), allOrderings_RE(rg->getHi()));
    352     } else if (Group * g = dyn_cast<Group>(re)) {
    353         return makeGroup(g->getMode(), allOrderings_RE(g->getRE()), g->getSense());
    354     }
    355     return re;
    356 }
    357 
     181RE * NFD_Transformer::transformSeq(Seq * seq) {
     182    // find and process all string pieces
     183    unsigned size = seq->size();
     184    if (size == 0) return seq;
     185    std::vector<RE *> list;
     186    unsigned i = 0;
     187    while (i < size) {
     188        std::u32string stringPiece = getStringPiece(seq, i);
     189        if (stringPiece.size() > 0) {
     190            std::u32string s;
     191            NFD_append(s, stringPiece);
     192            list.push_back(u32string2re(s));
     193            i += stringPiece.size();
     194        } else {
     195            list.push_back(transform((*seq)[i]));
     196            i++;
     197        }
     198    }
     199    return makeSeq(list.begin(), list.end());
     200}
     201} // end namespace UCD
  • icGREP/icgrep-devel/icgrep/re/Unicode/decomposition.h

    r6141 r6172  
    88#define DECOMPOSITION_H
    99
    10 namespace re { class RE; class CC;}
     10#include <string>
     11#include <locale>
     12#include <codecvt>
     13#include <re/re_toolchain.h>
     14#include <UCD/unicode_set.h>
    1115
    12 /*  NFD, NFKD and casefold decompositions of a character class.
    13     Each codepoint in a class is mapped to its decomposition
    14     under the appropriate mapping, which may be itself, another
    15     single codepoint (singleton decomposition) or a codepoint
    16     string (expanding decomposition).   In general, the result is
    17     a set of alternatives consisting of sequences for each expanding
    18     decomposition plus a single character class for all the
    19     singleton decompositions as well as the codepoints that map to
    20     themselves.
    21 */
     16namespace re { class RE; class CC; class Seq;}
     17namespace UCD { class EnumeratedPropertyObject; class StringPropertyObject; class StringOverridePropertyObject;}
    2218
    23 re::RE * NFD_CC(re::CC * cc);
    24    
    25 re::RE * NFKD_CC(re::CC * cc);
     19namespace UCD {
     20    enum DecompositionOptions : int {NFD = 0, CaseFold = 1, NFKD = 2};
    2621
    27 re::RE * Casefold_CC(re::CC * cc);
    28 
    29 /*  Systematic NFD, NFKD and casefold decomposition of all character
    30     classes in a regular expression.  */
    31 
    32 re::RE * NFD_RE(re::RE * r);
    33 
    34 re::RE * NFKD_RE(re::RE * r);
    35 
    36 re::RE * Casefold_RE(re::RE * r);
    37 
    38 /* For every decomposed sequence in RE, add alternatives for all
    39    canonically equivalent reorderings according to Unicode rules. */
    40 re::RE * allOrderings_RE(re::RE * re);
    41 
    42 
     22    class NFD_Transformer : public re::RE_Transformer {
     23    public:
     24        /* Transform an RE so that all string pieces and character classes
     25         are converted to NFD form (or NFKD form if the UCD::NFKD option
     26         is used).  The options may also include case folding.  Example:
     27         UCD::NFD_Transformer(UCD::CaseFold | UCD::NFKD).transformRE(r);
     28        */
     29        NFD_Transformer(DecompositionOptions opt = NFD);
     30        /* Helpers to convert and append an individual codepoint or a u32string
     31           to an existing NFD_string.   The process performs any necessary
     32           reordering of marks of the existing string and the appended data
     33           to ensure that the result is overall in NFD form.
     34           These may be used independently of RE transformation, for example:
     35           UCD::NFD_Transformer(UCD::CaseFold).NFD_append1(s, cp);
     36        */
     37        void NFD_append1(std::u32string & NFD_string, codepoint_t cp);
     38        void NFD_append(std::u32string & NFD_string, std::u32string & to_convert);
     39    protected:
     40        re::RE * transformCC(re::CC * cc) override;
     41        re::RE * transformSeq(re::Seq * seq) override;
     42        re::RE * transformGroup(re::Group * g) override;
     43        bool reordering_needed(std::u32string & prefix, codepoint_t suffix_cp);
     44    private:
     45        DecompositionOptions mOptions;
     46        EnumeratedPropertyObject * decompTypeObj;
     47        StringPropertyObject * decompMappingObj;
     48        EnumeratedPropertyObject * cccObj;
     49        StringOverridePropertyObject * caseFoldObj;
     50        const UnicodeSet & canonicalMapped;
     51        const UnicodeSet & cc0Set;
     52        const UnicodeSet selfNFKD;
     53        const UnicodeSet selfCaseFold;
     54        std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
     55    };
     56}
    4357#endif
Note: See TracChangeset for help on using the changeset viewer.