source: icGREP/icgrep-devel/icgrep/re/Unicode/decomposition.cpp @ 6178

Last change on this file since 6178 was 6178, checked in by cameron, 5 months ago

RE Validation

File size: 7.5 KB
Line 
1/*
2 *  Copyright (c) 2018 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include <string>
8#include <vector>
9#include <locale>
10#include <codecvt>
11#include <re/Unicode/decomposition.h>
12#include <re/re_cc.h>
13#include <re/re_seq.h>
14#include <re/re_alt.h>
15#include <re/re_group.h>
16#include <re/re_range.h>
17#include <re/re_diff.h>
18#include <re/re_intersect.h>
19#include <re/re_assertion.h>
20#include <UCD/unicode_set.h>
21#include <UCD/PropertyAliases.h>
22#include <UCD/PropertyObjects.h>
23#include <UCD/PropertyObjectTable.h>
24#include <UCD/PropertyValueAliases.h>
25#include <llvm/Support/Casting.h>
26
27using namespace llvm;
28using namespace re;
29
30namespace UCD {
31   
32// Constants for computation of Hangul decompositions, see Unicode Standard, section 3.12.
33const codepoint_t Hangul_SBase = 0xAC00;
34const codepoint_t Hangul_LBase = 0x1100;
35//const codepoint_t Hangul_LMax = 0x1112;
36const codepoint_t Hangul_VBase = 0x1161;
37//const codepoint_t Hangul_VMax = 0x1175;
38const codepoint_t Hangul_TBase = 0x11A7;
39//const codepoint_t Hangul_TMax = 0x11C2;
40const unsigned Hangul_TCount = 28;
41const unsigned Hangul_NCount = 588;
42const unsigned Hangul_SCount = 11172;
43
44static inline std::u32string getStringPiece(Seq * s, unsigned position) {
45    unsigned pos = position;
46    unsigned size = s->size();
47    std::u32string rslt;
48    while ((pos < size) && isa<CC>((*s)[pos])) {
49        CC * cc = cast<CC>((*s)[pos]);
50        if (cc->empty()) return rslt;
51        if (cc->getAlphabet() != &cc::Unicode) return rslt;
52        codepoint_t lo = lo_codepoint(cc->front());
53        codepoint_t hi = hi_codepoint(cc->back());
54        if (lo != hi) // not a singleton CC; end of the string piece.
55            return rslt;
56        rslt.push_back(lo);
57        pos++;
58    }
59    return rslt;
60}
61   
62NFD_Transformer::NFD_Transformer(DecompositionOptions opt) :
63    RE_Transformer("toNFD"),
64    mOptions(opt),
65    decompTypeObj(cast<EnumeratedPropertyObject>(property_object_table[dt])),
66    decompMappingObj(cast<StringPropertyObject>(property_object_table[dm])),
67    cccObj(cast<EnumeratedPropertyObject>(property_object_table[ccc])),
68    caseFoldObj(cast<StringOverridePropertyObject>(property_object_table[cf])),
69    canonicalMapped(decompTypeObj->GetCodepointSet(DT_ns::Can)),
70    cc0Set(cccObj->GetCodepointSet(CCC_ns::NR)),
71    selfNFKD(decompMappingObj->GetReflexiveSet()),
72    selfCaseFold(caseFoldObj->GetReflexiveSet())
73{}
74
75static UnicodeSet HangulPrecomposed = UnicodeSet(Hangul_SBase, Hangul_SBase + Hangul_SCount - 1);
76
77bool hasOption(enum DecompositionOptions optionSet, enum DecompositionOptions testOption) {
78    return (testOption & optionSet) != 0;
79}
80   
81bool NFD_Transformer::reordering_needed(std::u32string & prefix, codepoint_t suffix_cp) {
82    if (prefix.empty()) return false;
83    if (cc0Set.contains(suffix_cp)) return false;
84    auto cc1 = cccObj->GetEnumerationValue(prefix.back());
85    auto cc2 = cccObj->GetEnumerationValue(suffix_cp);
86    return cc1 > cc2;
87}
88
89void NFD_Transformer::NFD_append1(std::u32string & NFD_string, codepoint_t cp) {
90    if (HangulPrecomposed.contains(cp)) {
91        // Apply NFD normalization; no NFKD or casefolding required
92        auto SIndex = cp - Hangul_SBase;
93        auto LIndex = SIndex / Hangul_NCount;
94        auto VIndex = (SIndex % Hangul_NCount) / Hangul_TCount;
95        auto TIndex = SIndex % Hangul_TCount;
96        NFD_string.push_back(Hangul_LBase + LIndex);
97        NFD_string.push_back(Hangul_VBase + VIndex);
98        if (TIndex > 0) {
99            NFD_string.push_back(Hangul_TBase + TIndex);
100        }
101    } else if (canonicalMapped.contains(cp)) {
102        std::string u8decomposed = decompMappingObj->GetStringValue(cp);
103        std::u32string dms = conv.from_bytes(u8decomposed);
104        // Recursive normalization may be necessary.
105        NFD_append(NFD_string, dms);
106        // After canonical mappings are handled, canonical ordering may be required.
107        // This should be done before casefolding.
108    } else if (reordering_needed(NFD_string, cp)) {
109        // Reorder the last two characters - recursion will handle
110        // rare multiposition reordering.
111        std::u32string reordered({cp, NFD_string.back()});
112        NFD_string.pop_back();
113        NFD_append(NFD_string, reordered);
114    } else if (hasOption(mOptions, UCD::CaseFold) && !selfCaseFold.contains(cp)) {
115        std::u32string dms = conv.from_bytes(caseFoldObj->GetStringValue(cp));
116        NFD_append(NFD_string, dms);
117    } else if (hasOption(mOptions, UCD::NFKD) && (!selfNFKD.contains(cp))) {
118        std::u32string dms = conv.from_bytes(decompMappingObj->GetStringValue(cp));
119        NFD_append(NFD_string, dms);
120    } else {
121        NFD_string.push_back(cp);
122    }
123}
124
125void NFD_Transformer::NFD_append(std::u32string & NFD_string, std::u32string & to_convert) {
126    for (unsigned i = 0; i < to_convert.size(); i++) {
127        NFD_append1(NFD_string, to_convert[i]);
128    }
129}
130
131RE * NFD_Transformer::transformGroup(Group * g) {
132    re::Group::Mode mode = g->getMode();
133    re::Group::Sense sense = g->getSense();
134    auto r = g->getRE();
135    UCD::DecompositionOptions saveOptions = mOptions;
136    if (mode == re::Group::Mode::CaseInsensitiveMode) {
137        if (sense == re::Group::Sense::On) {
138            mOptions = static_cast<UCD::DecompositionOptions>(mOptions | UCD::CaseFold);
139        } else {
140            mOptions = static_cast<UCD::DecompositionOptions>(mOptions & ~UCD::CaseFold);
141        }
142    } else if (mode == re::Group::Mode::CompatibilityMode) {
143        if (sense == re::Group::Sense::On) {
144            mOptions = static_cast<UCD::DecompositionOptions>(mOptions | UCD::NFKD);
145        } else {
146            mOptions = static_cast<UCD::DecompositionOptions>(mOptions & ~UCD::NFKD);
147        }
148    }
149    RE * t = transform(r);
150    mOptions = saveOptions;
151    if (t == r) return g;
152    return makeGroup(mode, t, sense);
153   
154}
155
156RE * NFD_Transformer::transformCC(CC * cc) {
157    if (cc->getAlphabet() != &cc::Unicode) return cc;
158    UnicodeSet mappingRequired = *cc & (canonicalMapped + HangulPrecomposed);
159    if (hasOption(mOptions, UCD::CaseFold)) {
160        mappingRequired = mappingRequired + (*cc - selfCaseFold);
161    }
162    if (hasOption(mOptions, UCD::NFKD)) {
163        mappingRequired = mappingRequired + (*cc - selfNFKD);
164    }
165    if (mappingRequired.empty()) return cc;
166    std::vector<RE *> alts;
167    CC * finalCC = makeCC(*cc - mappingRequired);
168    for (const interval_t & i : mappingRequired) {
169        for (codepoint_t cp = lo_codepoint(i); cp <= hi_codepoint(i); cp++) {
170            std::u32string decomp;
171            NFD_append1(decomp, cp);
172            if (decomp.size() == 1) {
173                finalCC = makeCC(finalCC, makeCC(decomp[0]));
174            } else {
175                alts.push_back(u32string2re(decomp));
176            }
177        }
178    }
179    if (!finalCC->empty()) alts.push_back(finalCC);
180    return makeAlt(alts.begin(), alts.end());
181}
182
183RE * NFD_Transformer::transformSeq(Seq * seq) {
184    // find and process all string pieces
185    unsigned size = seq->size();
186    if (size == 0) return seq;
187    std::vector<RE *> list;
188    unsigned i = 0;
189    while (i < size) {
190        std::u32string stringPiece = getStringPiece(seq, i);
191        if (stringPiece.size() > 0) {
192            std::u32string s;
193            NFD_append(s, stringPiece);
194            list.push_back(u32string2re(s));
195            i += stringPiece.size();
196        } else {
197            list.push_back(transform((*seq)[i]));
198            i++;
199        }
200    }
201    return makeSeq(list.begin(), list.end());
202}
203} // end namespace UCD
Note: See TracBrowser for help on using the repository browser.