source: icGREP/icgrep-devel/icgrep/re/Unicode/decomposition.cpp @ 6186

Last change on this file since 6186 was 6186, checked in by cameron, 12 months ago

Various clean-ups

File size: 9.3 KB
Line 
1/*
2 *  Copyright (c) 2018 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "decomposition.h"
8#include <string>
9#include <vector>
10#include <locale>
11#include <codecvt>
12#include <re/re_cc.h>
13#include <re/re_seq.h>
14#include <re/re_alt.h>
15#include <re/re_group.h>
16#include <re/re_range.h>
17#include <re/re_diff.h>
18#include <re/re_intersect.h>
19#include <re/re_assertion.h>
20#include <re/re_toolchain.h>
21#include <UCD/unicode_set.h>
22#include <UCD/PropertyAliases.h>
23#include <UCD/PropertyObjects.h>
24#include <UCD/PropertyObjectTable.h>
25#include <UCD/PropertyValueAliases.h>
26#include <llvm/Support/Casting.h>
27
28
29using namespace llvm;
30using namespace re;
31
32namespace UCD {
33   
34// Constants for computation of Hangul decompositions, see Unicode Standard, section 3.12.
35const codepoint_t Hangul_SBase = 0xAC00;
36const codepoint_t Hangul_LBase = 0x1100;
37//const codepoint_t Hangul_LMax = 0x1112;
38const codepoint_t Hangul_VBase = 0x1161;
39//const codepoint_t Hangul_VMax = 0x1175;
40const codepoint_t Hangul_TBase = 0x11A7;
41//const codepoint_t Hangul_TMax = 0x11C2;
42const unsigned Hangul_TCount = 28;
43const unsigned Hangul_NCount = 588;
44const unsigned Hangul_SCount = 11172;
45
46static inline std::u32string getStringPiece(Seq * s, unsigned position) {
47    unsigned pos = position;
48    unsigned size = s->size();
49    std::u32string rslt;
50    while ((pos < size) && isa<CC>((*s)[pos])) {
51        CC * cc = cast<CC>((*s)[pos]);
52        if (cc->empty()) return rslt;
53        if (cc->getAlphabet() != &cc::Unicode) return rslt;
54        codepoint_t lo = lo_codepoint(cc->front());
55        codepoint_t hi = hi_codepoint(cc->back());
56        if (lo != hi) // not a singleton CC; end of the string piece.
57            return rslt;
58        rslt.push_back(lo);
59        pos++;
60    }
61    return rslt;
62}
63
64class NFD_Transformer final : public re::RE_Transformer {
65public:
66    /* Transforme an RE so that all string pieces and character classes
67     are converted to NFD form (or NFKD form if the Compatible option
68     is used.  The options may also including case folding.  Example:
69     NFD_Transformer(CaseFold | NFKD).transformRE(r);
70    */
71    NFD_Transformer(DecompositionOptions opt);
72    /* Helpers to convert and append an individual codepoint or a u32string
73       to an existing NFD_string.   The process performs any necessary
74       reordering of marks of the existing string and the appended data
75       to ensure that the result is overall in NFD form.
76       These may be used independently of RE transformation, for example:
77       NFD_Transformer(CaseFold).NFD_append1(s, cp);
78    */
79    void NFD_append1(std::u32string & NFD_string, codepoint_t cp);
80    void NFD_append(std::u32string & NFD_string, std::u32string & to_convert);
81protected:
82    re::RE * transformCC(re::CC * cc) override;
83    re::RE * transformSeq(re::Seq * seq) override;
84    re::RE * transformGroup(re::Group * g) override;
85    bool reordering_needed(std::u32string & prefix, codepoint_t suffix_cp);
86private:
87    DecompositionOptions mOptions;
88    EnumeratedPropertyObject * decompTypeObj;
89    StringPropertyObject * decompMappingObj;
90    EnumeratedPropertyObject * cccObj;
91    StringOverridePropertyObject * caseFoldObj;
92    const UnicodeSet & canonicalMapped;
93    const UnicodeSet & cc0Set;
94    const UnicodeSet selfNFKD;
95    const UnicodeSet selfCaseFold;
96    const UnicodeSet HangulPrecomposed;
97    std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
98};
99   
100NFD_Transformer::NFD_Transformer(DecompositionOptions opt) :
101RE_Transformer("toNFD"),
102mOptions(opt),
103decompTypeObj(cast<EnumeratedPropertyObject>(property_object_table[dt])),
104decompMappingObj(cast<StringPropertyObject>(property_object_table[dm])),
105cccObj(cast<EnumeratedPropertyObject>(property_object_table[ccc])),
106caseFoldObj(cast<StringOverridePropertyObject>(property_object_table[cf])),
107canonicalMapped(decompTypeObj->GetCodepointSet(DT_ns::Can)),
108cc0Set(cccObj->GetCodepointSet(CCC_ns::NR)),
109selfNFKD(decompMappingObj->GetReflexiveSet()),
110selfCaseFold(caseFoldObj->GetReflexiveSet()),
111HangulPrecomposed(Hangul_SBase, Hangul_SBase + Hangul_SCount - 1) {
112
113}
114
115bool hasOption(enum DecompositionOptions optionSet, enum DecompositionOptions testOption) {
116    return (testOption & optionSet) != 0;
117}
118   
119bool NFD_Transformer::reordering_needed(std::u32string & prefix, codepoint_t suffix_cp) {
120    if (prefix.empty()) return false;
121    if (cc0Set.contains(suffix_cp)) return false;
122    auto cc1 = cccObj->GetEnumerationValue(prefix.back());
123    auto cc2 = cccObj->GetEnumerationValue(suffix_cp);
124    return cc1 > cc2;
125}
126
127void NFD_Transformer::NFD_append1(std::u32string & NFD_string, codepoint_t cp) {
128    if (HangulPrecomposed.contains(cp)) {
129        // Apply NFD normalization; no NFKD or casefolding required
130        auto SIndex = cp - Hangul_SBase;
131        auto LIndex = SIndex / Hangul_NCount;
132        auto VIndex = (SIndex % Hangul_NCount) / Hangul_TCount;
133        auto TIndex = SIndex % Hangul_TCount;
134        NFD_string.push_back(Hangul_LBase + LIndex);
135        NFD_string.push_back(Hangul_VBase + VIndex);
136        if (TIndex > 0) {
137            NFD_string.push_back(Hangul_TBase + TIndex);
138        }
139    } else if (canonicalMapped.contains(cp)) {
140        std::string u8decomposed = decompMappingObj->GetStringValue(cp);
141        std::u32string dms = conv.from_bytes(u8decomposed);
142        // Recursive normalization may be necessary.
143        NFD_append(NFD_string, dms);
144        // After canonical mappings are handled, canonical ordering may be required.
145        // This should be done before casefolding.
146    } else if (reordering_needed(NFD_string, cp)) {
147        // Reorder the last two characters - recursion will handle
148        // rare multiposition reordering.
149        std::u32string reordered({cp, NFD_string.back()});
150        NFD_string.pop_back();
151        NFD_append(NFD_string, reordered);
152    } else if (hasOption(mOptions, CaseFold) && !selfCaseFold.contains(cp)) {
153        std::u32string dms = conv.from_bytes(caseFoldObj->GetStringValue(cp));
154        NFD_append(NFD_string, dms);
155    } else if (hasOption(mOptions, NFKD) && (!selfNFKD.contains(cp))) {
156        std::u32string dms = conv.from_bytes(decompMappingObj->GetStringValue(cp));
157        NFD_append(NFD_string, dms);
158    } else {
159        NFD_string.push_back(cp);
160    }
161}
162
163void NFD_Transformer::NFD_append(std::u32string & NFD_string, std::u32string & to_convert) {
164    for (unsigned i = 0; i < to_convert.size(); i++) {
165        NFD_append1(NFD_string, to_convert[i]);
166    }
167}
168
169RE * NFD_Transformer::transformGroup(Group * g) {
170    re::Group::Mode mode = g->getMode();
171    re::Group::Sense sense = g->getSense();
172    auto r = g->getRE();
173    DecompositionOptions saveOptions = mOptions;
174    if (mode == re::Group::Mode::CaseInsensitiveMode) {
175        if (sense == re::Group::Sense::On) {
176            mOptions = static_cast<DecompositionOptions>(mOptions | CaseFold);
177        } else {
178            mOptions = static_cast<DecompositionOptions>(mOptions & ~CaseFold);
179        }
180    } else if (mode == re::Group::Mode::CompatibilityMode) {
181        if (sense == re::Group::Sense::On) {
182            mOptions = static_cast<DecompositionOptions>(mOptions | NFKD);
183        } else {
184            mOptions = static_cast<DecompositionOptions>(mOptions & ~NFKD);
185        }
186    }
187    RE * t = transform(r);
188    mOptions = saveOptions;
189    if (t == r) return g;
190    return makeGroup(mode, t, sense);
191   
192}
193
194RE * NFD_Transformer::transformCC(CC * cc) {
195    if (cc->getAlphabet() != &cc::Unicode) return cc;
196    UnicodeSet mappingRequired = *cc & (canonicalMapped + HangulPrecomposed);
197    if (hasOption(mOptions, CaseFold)) {
198        mappingRequired = mappingRequired + (*cc - selfCaseFold);
199    }
200    if (hasOption(mOptions, NFKD)) {
201        mappingRequired = mappingRequired + (*cc - selfNFKD);
202    }
203    if (mappingRequired.empty()) return cc;
204    std::vector<RE *> alts;
205    CC * finalCC = makeCC(*cc - mappingRequired);
206    for (const interval_t & i : mappingRequired) {
207        for (codepoint_t cp = lo_codepoint(i); cp <= hi_codepoint(i); cp++) {
208            std::u32string decomp;
209            NFD_append1(decomp, cp);
210            if (decomp.size() == 1) {
211                finalCC = makeCC(finalCC, makeCC(decomp[0]));
212            } else {
213                alts.push_back(u32string2re(decomp));
214            }
215        }
216    }
217    if (!finalCC->empty()) alts.push_back(finalCC);
218    return makeAlt(alts.begin(), alts.end());
219}
220
221RE * NFD_Transformer::transformSeq(Seq * seq) {
222    // find and process all string pieces
223    unsigned size = seq->size();
224    if (size == 0) return seq;
225    std::vector<RE *> list;
226    unsigned i = 0;
227    bool unchanged = true;
228    while (i < size) {
229        std::u32string stringPiece = getStringPiece(seq, i);
230        if (stringPiece.size() > 0) {
231            std::u32string s;
232            NFD_append(s, stringPiece);
233            if (s != stringPiece) unchanged = false;
234            list.push_back(u32string2re(s));
235            i += stringPiece.size();
236        } else {
237            RE * r = (*seq)[i];
238            RE * t = transform(r);
239            if (t != r) unchanged = false;
240            list.push_back(t);
241            i++;
242        }
243    }
244    if (unchanged) return seq;
245    return makeSeq(list.begin(), list.end());
246}
247
248RE * toNFD(RE * re, const DecompositionOptions opt) {
249    return NFD_Transformer(opt).transformRE(re);
250}
251
252} // end namespace UCD
Note: See TracBrowser for help on using the repository browser.