source: icGREP/icgrep-devel/icgrep/re/Unicode/decomposition.cpp @ 6172

Last change on this file since 6172 was 6172, checked in by cameron, 8 months ago

NFD Transformer

File size: 7.4 KB
Line 
1/*
2 *  Copyright (c) 2018 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include <string>
8#include <vector>
9#include <locale>
10#include <codecvt>
11#include <re/Unicode/decomposition.h>
12#include <re/re_cc.h>
13#include <re/re_seq.h>
14#include <re/re_alt.h>
15#include <re/re_group.h>
16#include <re/re_range.h>
17#include <re/re_diff.h>
18#include <re/re_intersect.h>
19#include <re/re_assertion.h>
20#include <UCD/unicode_set.h>
21#include <UCD/PropertyAliases.h>
22#include <UCD/PropertyObjects.h>
23#include <UCD/PropertyObjectTable.h>
24#include <UCD/PropertyValueAliases.h>
25#include <llvm/Support/Casting.h>
26
27using namespace llvm;
28using namespace re;
29
30namespace UCD {
31   
32// Constants for computation of Hangul decompositions, see Unicode Standard, section 3.12.
33const codepoint_t Hangul_SBase = 0xAC00;
34const codepoint_t Hangul_LBase = 0x1100;
35//const codepoint_t Hangul_LMax = 0x1112;
36const codepoint_t Hangul_VBase = 0x1161;
37//const codepoint_t Hangul_VMax = 0x1175;
38const codepoint_t Hangul_TBase = 0x11A7;
39//const codepoint_t Hangul_TMax = 0x11C2;
40const unsigned Hangul_TCount = 28;
41const unsigned Hangul_NCount = 588;
42const unsigned Hangul_SCount = 11172;
43
44static inline std::u32string getStringPiece(Seq * s, unsigned position) {
45    unsigned pos = position;
46    unsigned size = s->size();
47    std::u32string rslt;
48    while ((pos < size) && isa<CC>((*s)[pos])) {
49        CC * cc = cast<CC>((*s)[pos]);
50        if (cc->empty()) return rslt;
51        codepoint_t lo = lo_codepoint(cc->front());
52        codepoint_t hi = hi_codepoint(cc->back());
53        if (lo != hi) // not a singleton CC; end of the string piece.
54            return rslt;
55        rslt.push_back(lo);
56        pos++;
57    }
58    return rslt;
59}
60   
61NFD_Transformer::NFD_Transformer(DecompositionOptions opt) :
62    RE_Transformer("toNFD"),
63    mOptions(opt),
64    decompTypeObj(cast<EnumeratedPropertyObject>(property_object_table[dt])),
65    decompMappingObj(cast<StringPropertyObject>(property_object_table[dm])),
66    cccObj(cast<EnumeratedPropertyObject>(property_object_table[ccc])),
67    caseFoldObj(cast<StringOverridePropertyObject>(property_object_table[cf])),
68    canonicalMapped(decompTypeObj->GetCodepointSet(DT_ns::Can)),
69    cc0Set(cccObj->GetCodepointSet(CCC_ns::NR)),
70    selfNFKD(decompMappingObj->GetReflexiveSet()),
71    selfCaseFold(caseFoldObj->GetReflexiveSet())
72{}
73
74static UnicodeSet HangulPrecomposed = UnicodeSet(Hangul_SBase, Hangul_SBase + Hangul_SCount - 1);
75
76bool hasOption(enum DecompositionOptions optionSet, enum DecompositionOptions testOption) {
77    return (testOption & optionSet) != 0;
78}
79   
80bool NFD_Transformer::reordering_needed(std::u32string & prefix, codepoint_t suffix_cp) {
81    if (prefix.empty()) return false;
82    if (cc0Set.contains(suffix_cp)) return false;
83    auto cc1 = cccObj->GetEnumerationValue(prefix.back());
84    auto cc2 = cccObj->GetEnumerationValue(suffix_cp);
85    return cc1 > cc2;
86}
87
88void NFD_Transformer::NFD_append1(std::u32string & NFD_string, codepoint_t cp) {
89    if (HangulPrecomposed.contains(cp)) {
90        // Apply NFD normalization; no NFKD or casefolding required
91        auto SIndex = cp - Hangul_SBase;
92        auto LIndex = SIndex / Hangul_NCount;
93        auto VIndex = (SIndex % Hangul_NCount) / Hangul_TCount;
94        auto TIndex = SIndex % Hangul_TCount;
95        NFD_string.push_back(Hangul_LBase + LIndex);
96        NFD_string.push_back(Hangul_VBase + VIndex);
97        if (TIndex > 0) {
98            NFD_string.push_back(Hangul_TBase + TIndex);
99        }
100    } else if (canonicalMapped.contains(cp)) {
101        std::string u8decomposed = decompMappingObj->GetStringValue(cp);
102        std::u32string dms = conv.from_bytes(u8decomposed);
103        // Recursive normalization may be necessary.
104        NFD_append(NFD_string, dms);
105        // After canonical mappings are handled, canonical ordering may be required.
106        // This should be done before casefolding.
107    } else if (reordering_needed(NFD_string, cp)) {
108        // Reorder the last two characters - recursion will handle
109        // rare multiposition reordering.
110        std::u32string reordered({cp, NFD_string.back()});
111        NFD_string.pop_back();
112        NFD_append(NFD_string, reordered);
113    } else if (hasOption(mOptions, UCD::CaseFold) && !selfCaseFold.contains(cp)) {
114        std::u32string dms = conv.from_bytes(caseFoldObj->GetStringValue(cp));
115        NFD_append(NFD_string, dms);
116    } else if (hasOption(mOptions, UCD::NFKD) && (!selfNFKD.contains(cp))) {
117        std::u32string dms = conv.from_bytes(decompMappingObj->GetStringValue(cp));
118        NFD_append(NFD_string, dms);
119    } else {
120        NFD_string.push_back(cp);
121    }
122}
123
124void NFD_Transformer::NFD_append(std::u32string & NFD_string, std::u32string & to_convert) {
125    for (unsigned i = 0; i < to_convert.size(); i++) {
126        NFD_append1(NFD_string, to_convert[i]);
127    }
128}
129
130RE * NFD_Transformer::transformGroup(Group * g) {
131    re::Group::Mode mode = g->getMode();
132    re::Group::Sense sense = g->getSense();
133    auto r = g->getRE();
134    UCD::DecompositionOptions saveOptions = mOptions;
135    if (mode == re::Group::Mode::CaseInsensitiveMode) {
136        if (sense == re::Group::Sense::On) {
137            mOptions = static_cast<UCD::DecompositionOptions>(mOptions | UCD::CaseFold);
138        } else {
139            mOptions = static_cast<UCD::DecompositionOptions>(mOptions & ~UCD::CaseFold);
140        }
141    } else if (mode == re::Group::Mode::CompatibilityMode) {
142        if (sense == re::Group::Sense::On) {
143            mOptions = static_cast<UCD::DecompositionOptions>(mOptions | UCD::NFKD);
144        } else {
145            mOptions = static_cast<UCD::DecompositionOptions>(mOptions & ~UCD::NFKD);
146        }
147    }
148    RE * t = transform(r);
149    mOptions = saveOptions;
150    if (t == r) return g;
151    return makeGroup(mode, t, sense);
152   
153}
154
155RE * NFD_Transformer::transformCC(CC * cc) {
156    UnicodeSet mappingRequired = *cc & (canonicalMapped + HangulPrecomposed);
157    if (hasOption(mOptions, UCD::CaseFold)) {
158        mappingRequired = mappingRequired + (*cc - selfCaseFold);
159    }
160    if (hasOption(mOptions, UCD::NFKD)) {
161        mappingRequired = mappingRequired + (*cc - selfNFKD);
162    }
163    if (mappingRequired.empty()) return cc;
164    std::vector<RE *> alts;
165    CC * finalCC = makeCC(*cc - mappingRequired);
166    for (const interval_t & i : mappingRequired) {
167        for (codepoint_t cp = lo_codepoint(i); cp <= hi_codepoint(i); cp++) {
168            std::u32string decomp;
169            NFD_append1(decomp, cp);
170            if (decomp.size() == 1) {
171                finalCC = makeCC(finalCC, makeCC(decomp[0]));
172            } else {
173                alts.push_back(u32string2re(decomp));
174            }
175        }
176    }
177    if (!finalCC->empty()) alts.push_back(finalCC);
178    return makeAlt(alts.begin(), alts.end());
179}
180
181RE * NFD_Transformer::transformSeq(Seq * seq) {
182    // find and process all string pieces
183    unsigned size = seq->size();
184    if (size == 0) return seq;
185    std::vector<RE *> list;
186    unsigned i = 0;
187    while (i < size) {
188        std::u32string stringPiece = getStringPiece(seq, i);
189        if (stringPiece.size() > 0) {
190            std::u32string s;
191            NFD_append(s, stringPiece);
192            list.push_back(u32string2re(s));
193            i += stringPiece.size();
194        } else {
195            list.push_back(transform((*seq)[i]));
196            i++;
197        }
198    }
199    return makeSeq(list.begin(), list.end());
200}
201} // end namespace UCD
Note: See TracBrowser for help on using the repository browser.