source: icGREP/icgrep-devel/icgrep/re/Unicode/equivalence.cpp @ 6181

Last change on this file since 6181 was 6181, checked in by cameron, 12 months ago

Enabling Unicode Level 2 matching under canonical and compatible equivalence: -U2 flag

File size: 21.9 KB
Line 
1/*
2 *  Copyright (c) 2018 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "equivalence.h"
8#include <string>
9#include <locale>
10#include <codecvt>
11#include <cc/alphabet.h>
12#include <re/re_re.h>
13#include <re/re_seq.h>
14#include <re/re_alt.h>
15#include <re/re_cc.h>
16#include <re/re_group.h>
17#include <re/re_range.h>
18#include <re/re_diff.h>
19#include <re/re_intersect.h>
20#include <re/re_assertion.h>
21#include <re/re_toolchain.h>
22#include <re/Unicode/decomposition.h>
23#include <re/printer_re.h>
24#include "UCD/PropertyAliases.h"
25#include "UCD/PropertyObjects.h"
26#include "UCD/PropertyObjectTable.h"
27#include "UCD/PropertyValueAliases.h"
28#include <UCD/Equivalence.h>
29#include <UCD/PrecomposedMappings.h>
30#include <UCD/unicode_set.h>
31#include <llvm/Support/Casting.h>
32#include <llvm/Support/raw_ostream.h>
33
34using namespace re;
35using namespace llvm;
36namespace UCD {
37   
38std::u32string getStringPiece(Seq * s, unsigned position) {
39    unsigned pos = position;
40    unsigned size = s->size();
41    std::u32string rslt;
42    while ((pos < size) && isa<CC>((*s)[pos])) {
43        CC * cc = cast<CC>((*s)[pos]);
44        if (cc->getAlphabet() != &cc::Unicode) return rslt;
45        if (cc->empty()) return rslt;
46        codepoint_t lo = lo_codepoint(cc->front());
47        codepoint_t hi = hi_codepoint(cc->back());
48        if (lo != hi) // not a singleton CC; end of the string piece.
49            return rslt;
50        rslt.push_back(lo);
51        pos++;
52    }
53    return rslt;
54}
55   
56
57// Constants for computation of Hangul decompositions, see Unicode Standard, section 3.12.
58const codepoint_t Hangul_SBase = 0xAC00;
59const codepoint_t Hangul_LBase = 0x1100;
60const codepoint_t Hangul_LMax = 0x1112;
61const codepoint_t Hangul_VBase = 0x1161;
62const codepoint_t Hangul_VMax = 0x1175;
63const codepoint_t Hangul_TBase = 0x11A7;
64const codepoint_t Hangul_TMax = 0x11C2;
65const unsigned Hangul_TCount = 28;
66const unsigned Hangul_NCount = 588;
67const unsigned Hangul_SCount = 11172;
68static UnicodeSet HangulPrecomposed = UnicodeSet(Hangul_SBase, Hangul_SBase + Hangul_SCount - 1);
69
70
71typedef std::pair<codepoint_t, std::vector<unsigned>> MatchResult_t;
72
73class ClusterMatchTransformer : public re::RE_Transformer {
74public:
75    /* Givan an RE in decomposed form (NFD + CaseFold and/or NFKD),
76     produce an RE that will match all equivalent strings under the
77     given options.   Note that embedded groups within the RE
78     may change options.  Example:
79     UCD:AllEquivalentsTransformer(UCD::Caseless).transformRE(r);
80     */
81    ClusterMatchTransformer(EquivalenceOptions opt = Canonical) :
82        RE_Transformer("ClusterMatchTransformer"),
83        mOptions(opt),
84        cccObj(cast<EnumeratedPropertyObject>(property_object_table[ccc])),
85        sUCobj(cast<StringPropertyObject>(property_object_table[suc])),
86        sLCobj(cast<StringPropertyObject>(property_object_table[slc])),
87        ccc0set(cccObj->GetCodepointSet(CCC_ns::NR)),
88        selfUC(sUCobj->GetReflexiveSet()),
89        selfLC(sLCobj->GetReflexiveSet())
90        {}
91protected:
92    re::RE * transformSeq(re::Seq * seq) override;
93    re::RE * transformGroup(re::Group * g) override;
94    void find_precomposed(std::u32string & s, std::vector<unsigned> & sss,
95                          unsigned j, std::vector<MatchResult_t> & matches);
96    void getMatches(const Trie * t, std::u32string s, std::vector<unsigned> & ccc, unsigned pos,
97                              std::vector<MatchResult_t> & matches) const;
98    void getMatches(const Trie * t, std::u32string s, std::vector<unsigned> & ccc,
99                              unsigned pos, bool skip, std::vector<unsigned> positions, std::vector<MatchResult_t> & matches) const;
100    RE * addEquivalents(std::u32string NFD_string);
101private:
102    EquivalenceOptions mOptions;
103    EnumeratedPropertyObject * cccObj;
104    const StringPropertyObject * sUCobj;
105    const StringPropertyObject * sLCobj;
106    const UnicodeSet & ccc0set;
107    const UnicodeSet selfUC;
108    const UnicodeSet selfLC;
109};
110
111   
112
113RE * ClusterMatchTransformer::transformGroup(Group * g) {
114    re::Group::Mode mode = g->getMode();
115    re::Group::Sense sense = g->getSense();
116    auto r = g->getRE();
117    EquivalenceOptions saveOptions = mOptions;
118    if (mode == re::Group::Mode::CaseInsensitiveMode) {
119        if (sense == re::Group::Sense::On) {
120            mOptions = static_cast<EquivalenceOptions>(mOptions | UCD::Caseless);
121        } else {
122            mOptions = static_cast<EquivalenceOptions>(mOptions & ~UCD::Caseless);
123        }
124    } else if (mode == re::Group::Mode::CompatibilityMode) {
125        if (sense == re::Group::Sense::On) {
126            mOptions = static_cast<EquivalenceOptions>(mOptions | UCD::Compatible);
127        } else {
128            mOptions = static_cast<EquivalenceOptions>(mOptions & ~UCD::Compatible);
129        }
130    }
131    RE * t = transform(r);
132    mOptions = saveOptions;
133    if (t == r) return g;
134    return makeGroup(mode, t, sense);
135}
136
137RE * ClusterMatchTransformer::transformSeq(Seq * seq) {
138    // find and process all string pieces
139    unsigned size = seq->size();
140    if (size == 0) return seq;
141    std::vector<RE *> list;
142    unsigned i = 0;
143    bool changed = false;
144    while (i < size) {
145        std::u32string stringPiece = getStringPiece(seq, i);
146        if (stringPiece.size() > 0) {
147            RE * e = addEquivalents(stringPiece);
148            if (Seq * t = dyn_cast<Seq>(e)) {
149                unsigned tsize = t->size();
150                if ((tsize != size) || (getStringPiece(t,0) != stringPiece)) changed = true;
151            } else changed = true;
152            list.push_back(addEquivalents(stringPiece));
153            i += stringPiece.size();
154        } else {
155            RE * r = (*seq)[i];
156            RE * t = transform(r);
157            if (t != r) changed = true;
158            list.push_back(t);
159            i++;
160        }
161    }
162    if (!changed) return seq;
163    return makeSeq(list.begin(), list.end());
164}
165   
166// The lookup process to find precomposed characters that match a
167// given NFD string results a vector of match results.   Each
168// match result is a pair consisting of the precomposed codepoint
169// identified, plus a vector of suffix match positions.   Suffix
170// match positions are the position of the final starter (ccc = 0)
171// and the positions of following marks.  When there are multiple
172// consecutive marks in one combining class, only the last of the
173// marks is included.  Mark positions are ordered, but may not be
174// consecutive, because of permissible reordering under Unicode rules.
175   
176
177void ClusterMatchTransformer::find_precomposed(std::u32string & NFD_string, std::vector<unsigned> & combining_class,
178                                                unsigned j, std::vector<MatchResult_t> & matches) {
179    codepoint_t cp = NFD_string[j];
180    //
181    // Canonical equivalents are added in every case.
182    if ((cp >= Hangul_LBase) && (cp <= Hangul_LMax)) {
183        // Algorithmic precomposition for Hangul.
184        // We may have an LV precomposition or both an LV and an
185        // LVT precomposition.
186        if ((j+1 < NFD_string.size()) && (NFD_string[j+1] >= Hangul_VBase) && (NFD_string[j+1] <= Hangul_VMax)) {
187            auto LIndex = cp - Hangul_LBase;
188            auto VIndex = NFD_string[j+1] - Hangul_VBase;
189            auto LVIndex = LIndex * Hangul_NCount + VIndex * Hangul_TCount;
190            matches.push_back(MatchResult_t{Hangul_SBase + LVIndex, {j+1}});
191            if ((j+2 < NFD_string.size()) && (NFD_string[j+2] > Hangul_TBase) && (NFD_string[j+2] <= Hangul_TMax)) {
192                auto TIndex = NFD_string[j+2] - Hangul_TBase;
193                matches.push_back(MatchResult_t{Hangul_SBase + LVIndex + TIndex, {j+2}});
194            }
195        }
196    } else {
197        getMatches(&NFD_Trie, NFD_string, combining_class, j, matches);
198        if (hasOption(mOptions, UCD::Caseless)) {
199            getMatches(&NFDi_Trie, NFD_string, combining_class, j, matches);
200        }
201        if (hasOption(mOptions, UCD::Compatible)) {
202            getMatches(&NFKD_Trie, NFD_string, combining_class, j, matches);
203        }
204    }
205}
206
207
208void ClusterMatchTransformer::getMatches(const Trie * t, std::u32string s, std::vector<unsigned> & combining_class, unsigned pos,
209                                         std::vector<MatchResult_t> & matches) const {
210    getMatches(t, s, combining_class, pos, false, {}, matches);
211}
212
213void ClusterMatchTransformer::getMatches(const Trie * t, std::u32string NFD_string, std::vector<unsigned> & combining_class,
214                                            unsigned pos, bool skip, std::vector<unsigned> positions, std::vector<MatchResult_t> & matches) const {
215    std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
216    if (t->branches.empty() || pos == NFD_string.size()) return;
217    codepoint_t cp = NFD_string[pos];
218    // If we have skipped any marks but now find a starter, matching fails.
219    if (skip && (combining_class[cp] == 0)) return;
220    auto f = t->branches.find(cp);
221    // If we didn't find the codepoint, check for a caseless match.
222    if ((f == t->branches.end()) && hasOption(mOptions, UCD::Caseless)) {
223        if (!selfUC.contains(cp)) {
224            std::u32string cp_UC = conv.from_bytes(sUCobj->GetStringValue(cp));
225            assert(cp_UC.size() == 1);  // Simple upper case should alway yields a single codepoint.
226            cp = cp_UC[0];
227        } else if (!selfLC.contains(cp)) {
228            std::u32string cp_LC = conv.from_bytes(sLCobj->GetStringValue(cp));
229            assert(cp_LC.size() == 1);  // Simple lower case should alway yields a single codepoint.
230            cp = cp_LC[0];
231        }
232        f = t->branches.find(cp);
233    }
234    if (f != t->branches.end()) {
235        // Found a matching character.  Determine a new result position vector.
236        std::vector<unsigned> result_positions;
237        if ((combining_class[pos] == 0) || positions.empty()) result_positions = {pos};
238        else {
239            result_positions = positions;
240            if ((positions.back() == pos - 1) && (combining_class[pos - 1] == combining_class[pos])) {
241                result_positions[positions.size() - 1] = pos;
242            } else {
243                result_positions.push_back(pos);
244            }
245        }
246        const Trie * subtrie = &(f->second);
247        codepoint_t pc = subtrie->getPrecomposed();
248        if (pc <= 0x10FFFF) {
249            matches.push_back(MatchResult_t{pc, result_positions});
250        }
251        // There may be more matches; find them, with skip unchanged.
252        getMatches(subtrie, NFD_string, combining_class, pos + 1, skip, result_positions, matches);
253    }
254    // Now check for matches that skip the current character.   This is only
255    // possible when matching marks and we have at least one match.
256    if (positions.empty() || ccc0set.contains(cp)) return;
257    // Matching with gaps:  a precomposed sequence Q M122a can be matched with
258    // Q M4 M120 M122a, but not with Q M4 M122b M122a where M122a and M122b have
259    // the same CCC.  That is, if one mark is unmatched, then all following marks
260    // in the same CCC must remain unmatched.   Note that marks are in nondescending
261    // order by the NFD property.
262    unsigned ccc = cccObj->GetEnumerationValue(cp);
263    pos++;
264    while ((pos < NFD_string.size()) && (cccObj->GetEnumerationValue(NFD_string[pos]) == ccc)) {
265        pos++;
266    }
267    getMatches(t, NFD_string, combining_class, pos, /* skip = */ true, positions, matches);
268}
269
270   
271
272RE * markCoverageExpression(std::u32string s, std::vector<unsigned> & combining_class, unsigned pos, std::vector<std::pair<unsigned, codepoint_t>> & precomposed_coverage) {
273    CC * starter = makeCC(s[pos], &cc::Unicode);
274    CC * all_starters = makeCC(s[pos], &cc::Unicode);
275    for (auto & a : precomposed_coverage) {
276        all_starters->insert(a.second);
277    }
278    // First determine all marks of the cluster that actually occur, as
279    // well as equivalents for the four nonstarter decompositions.
280    // 0344 = 0308 0301, 0F73 = 0F71 0F72, 0F75 = 0F71 0F74, 0F81 = 0F71 0F80
281    // Note: no reordering possible for 0308 0301 sequences, both in ccc 230.
282    CC * allMarksCC = makeCC();
283    for (unsigned i = pos + 1; combining_class[i] != 0; i++) {
284        allMarksCC->insert(s[i]);
285        if ((s[i-1] == 0x308) && (s[i] == 0x301)) allMarksCC->insert(0x344);
286    }
287    bool has_xF71 = allMarksCC->contains(0xF71);
288    if (has_xF71) {
289        if (allMarksCC->contains(0xF72)) allMarksCC->insert(0xF73);
290        if (allMarksCC->contains(0xF74)) allMarksCC->insert(0xF75);
291        if (allMarksCC->contains(0xF80)) allMarksCC->insert(0xF81);
292    }
293    std::vector<RE *> coverage_terms = {all_starters, makeRep(allMarksCC, 0, Rep::UNBOUNDED_REP)};
294    unsigned i = pos + 1; // mark index
295    unsigned k = 0;       // precomposed_coverage index
296    while (combining_class[i] != 0) {
297        unsigned theCCC = combining_class[i];
298        // Prepare a coverage expression for all the marks of this CCC.
299        CC * combining_classCC = makeCC();
300        for (unsigned j = i; combining_class[j] == theCCC; j++) {
301            combining_classCC->insert(s[j]);
302            if ((s[j-1] == 0x308) && (s[j] == 0x301)) combining_classCC->insert(0x344);
303        }
304        if (has_xF71) {
305            if (combining_classCC->contains(0xF71)) {
306                if (allMarksCC->contains(0xF72)) combining_classCC->insert(0xF73);
307                if (allMarksCC->contains(0xF74)) combining_classCC->insert(0xF75);
308                if (allMarksCC->contains(0xF80)) combining_classCC->insert(0xF81);
309            }
310            if (combining_classCC->contains(0xF72)) combining_classCC->insert(0xF73);
311            if (combining_classCC->contains(0xF74)) combining_classCC->insert(0xF75);
312            if (combining_classCC->contains(0xF80)) combining_classCC->insert(0xF81);
313        }
314        RE * markStar = makeRep(subtractCC(allMarksCC, combining_classCC), 0, Rep::UNBOUNDED_REP);
315        std::vector<RE *> mark_terms;
316        //if (exactCoverageMode) {
317            mark_terms.push_back(starter);
318            mark_terms.push_back(markStar);
319        //}
320        RE * pre308 = nullptr;
321        do {
322            CC * theMark = makeCC(s[i], &cc::Unicode);
323            if (has_xF71) {
324                if (s[i] == 0xF71) {
325                    if (allMarksCC->contains(0xF72)) theMark->insert(0xF73);
326                    if (allMarksCC->contains(0xF74)) theMark->insert(0xF75);
327                    if (allMarksCC->contains(0xF80)) theMark->insert(0xF81);
328                }
329                if (s[i] == 0xF72) theMark->insert(0xF73);
330                if (s[i] == 0xF74) theMark->insert(0xF75);
331                if (s[i] == 0xF80) theMark->insert(0xF81);
332            }
333            if (s[i] == 0x308) {
334                pre308 = makeSeq(mark_terms.begin(), mark_terms.end());
335                mark_terms = std::vector<RE *>{pre308};
336                mark_terms.push_back(theMark);
337            } else if (pre308 && (s[i] == 0x301)) {
338                mark_terms.push_back(theMark);
339                RE * prefix = makeSeq(mark_terms.begin(), mark_terms.end());
340                mark_terms = std::vector<RE *>{makeAlt({prefix, makeSeq({pre308, makeCC(0x344)})})};
341                pre308 = nullptr;
342            } else {
343                pre308 = nullptr;
344                mark_terms.push_back(theMark);
345            }
346            if ((k < precomposed_coverage.size()) && (precomposed_coverage[k].first == i)) {
347                CC * s = makeCC();
348                while ((k < precomposed_coverage.size()) && (precomposed_coverage[k].first == i)) {
349                    s->insert(precomposed_coverage[k].second);
350                    k++;
351                }
352                RE * prefix = makeSeq(mark_terms.begin(), mark_terms.end());
353                mark_terms = std::vector<RE *>{makeAlt({prefix, s})};
354            }
355            mark_terms.push_back(markStar);
356            i++;
357        } while (combining_class[i] == theCCC);
358        coverage_terms.push_back(makeLookBehindAssertion(makeSeq(mark_terms.begin(), mark_terms.end())));
359    }
360    return makeSeq(coverage_terms.begin(), coverage_terms.end());
361}
362
363// Given a list of match results consisting of (codepoint, position vector) pairs,
364// produce a vector of (position, codepoint) pairs sorted by position.  As we
365// expect the vectors to be short and the entries to be mostly generated in
366// sorted order anyway, we use a simple insertion sort.
367
368std::vector<std::pair<unsigned, codepoint_t>> coverageVector(std::vector<MatchResult_t> matches) {
369    std::vector<std::pair<unsigned, codepoint_t>> coverage;
370    for (auto & m : matches) {
371        codepoint_t cp = m.first;
372        for (unsigned k = 1; k < m.second.size(); k++) {
373            unsigned posn = m.second[k];
374            unsigned i = coverage.size();
375            auto pair = std::make_pair(posn, cp);
376            coverage.push_back(pair);
377            while ((i > 0) && (posn < coverage[i-1].first)) {
378                coverage[i] = coverage[i-1];
379                i--;
380            }
381            coverage[i] = pair;
382        }
383    }
384    return coverage;
385}
386
387//
388//  Given an NFD string, construct an RE that will match all equivalent
389//  strings under the given equivalence relation.   Three processes are
390//  used:
391//  (a) finding all possible precomposed equivalents to substrings within
392//      the string.
393//  (b) permitting all possible orderings of marks, subject to the constraints
394//      of the canonical ordering algorithm, and
395//  (c) replacing all characters with their equivalent character class under
396//      the given equivalence relation.
397//
398
399RE * ClusterMatchTransformer::addEquivalents(std::u32string NFD_string) {
400    // Compute a vector of canonical combining class values of each character
401    // in the string, with an extra sentinel position, to allow simpl
402    // comparisons of CCC values of current and next positions.
403    std::vector<unsigned> combining_class(NFD_string.size()+1, 0);
404    for (unsigned i = 0; i < NFD_string.size(); i++) {
405        if (!ccc0set.contains(NFD_string[i])) {
406            combining_class[i] = cccObj->GetEnumerationValue(NFD_string[i]);
407        }
408    }
409   
410    RE * prefixRE = makeSeq();
411    // A vector of alternative prefix REs by NFD_string position.
412    std::vector<std::vector<RE *>> prefixTerms(NFD_string.size());
413   
414    unsigned i = 0;
415    while (i < NFD_string.size()) {
416        std::vector<MatchResult_t> matches;
417        unsigned cccCount = 0;
418        unsigned markCount = 0;
419        for (unsigned j = i+1; combining_class[j] != 0; j++) {
420            cccCount += combining_class[j] != combining_class[j+1];
421            markCount++;
422        }
423        if (combining_class[i] == 0) {
424            find_precomposed(NFD_string, combining_class, i, matches);
425        }
426        if (cccCount > 1) {
427            // We are at a cluster with multiple possible orderings.
428            // We use mark coverage expressions to handle all possible orderings
429            // as well as precomposed equivalents.
430            std::vector<std::pair<unsigned, codepoint_t>> coverage = coverageVector(matches);
431            prefixRE = makeSeq({prefixRE, markCoverageExpression(NFD_string, combining_class, i, coverage)});
432        } else {
433            for (unsigned k = 0; k < matches.size(); k++) {
434                CC * precomposedCC = makeCC(matches[k].first, &cc::Unicode);
435                unsigned last_cluster_pos = matches[k].second.back();
436                prefixTerms[last_cluster_pos].push_back(makeSeq({prefixRE, precomposedCC}));
437            }
438            for (unsigned j = i; j <= i + markCount; j++) {
439                prefixTerms[j].push_back(makeSeq({prefixRE, makeCC(NFD_string[j], &cc::Unicode)}));
440                prefixRE = makeAlt(prefixTerms[j].begin(), prefixTerms[j].end());
441            }
442        }
443        i += 1 + markCount;
444    }
445    return prefixRE;
446}
447
448
449RE * addClusterMatches(RE * r, EquivalenceOptions options) {
450    return ClusterMatchTransformer(options).transformRE(r);
451}
452       
453CC * add_equivalent_codepoints(CC * cc, EquivalenceOptions options) {
454    if (cc->getAlphabet() != &cc::Unicode) return cc;
455    UnicodeSet addedCC;
456    for (const interval_t & i : *cc) {
457        for (codepoint_t cp = lo_codepoint(i); cp <= hi_codepoint(i); cp++) {
458            addedCC = addedCC + equivalentCodepoints(cp, options);
459        }
460    }
461    if ((addedCC - *cc).empty()) return cc;
462    return makeCC(*cc + addedCC);
463}
464       
465
466class EquivalentCodepointsTransformer : public re::RE_Transformer {
467public:
468    EquivalentCodepointsTransformer(EquivalenceOptions opt = Canonical);
469protected:
470    re::RE * transformCC(re::CC * cc) override;
471    re::RE * transformGroup(re::Group * g) override;
472private:
473    EquivalenceOptions mOptions;
474};
475
476EquivalentCodepointsTransformer::EquivalentCodepointsTransformer(UCD::EquivalenceOptions opt) :
477    RE_Transformer("EquivalentCodepoints"),
478    mOptions(opt)
479    {}
480
481
482RE * EquivalentCodepointsTransformer::transformGroup(Group * g) {
483    re::Group::Mode mode = g->getMode();
484    re::Group::Sense sense = g->getSense();
485    auto r = g->getRE();
486    EquivalenceOptions saveOptions = mOptions;
487    if (mode == re::Group::Mode::CaseInsensitiveMode) {
488        if (sense == re::Group::Sense::On) {
489            mOptions = static_cast<EquivalenceOptions>(mOptions | UCD::Caseless);
490        } else {
491            mOptions = static_cast<EquivalenceOptions>(mOptions & ~UCD::Caseless);
492        }
493    } else if (mode == re::Group::Mode::CompatibilityMode) {
494        if (sense == re::Group::Sense::On) {
495            mOptions = static_cast<EquivalenceOptions>(mOptions | UCD::Compatible);
496        } else {
497            mOptions = static_cast<EquivalenceOptions>(mOptions & ~UCD::Compatible);
498        }
499    } else {
500        RE * t = transform(r);
501        // Keep the group/mode info.
502        if (t == r) return g;
503        return makeGroup(mode, t, sense);
504    }
505    // Once transformation is complete, the +i or +K modes are
506    // no longer needed.
507    RE * t = transform(r);
508    mOptions = saveOptions;
509    if (t == r) return r;
510    return t;
511}
512
513
514RE * EquivalentCodepointsTransformer::transformCC(CC * cc) {
515    return add_equivalent_codepoints(cc, mOptions);
516}
517
518RE * addEquivalentCodepoints(RE * re, UCD::EquivalenceOptions eqopt) {
519    return EquivalentCodepointsTransformer(eqopt).transformRE(re);
520}
521
522}
Note: See TracBrowser for help on using the repository browser.