source: icGREP/icgrep-devel/icgrep/re/Unicode/equivalence.cpp @ 6228

Last change on this file since 6228 was 6228, checked in by nmedfort, 10 months ago

redesign of PopCount? calculation + mem leak fix

File size: 21.8 KB
Line 
1/*
2 *  Copyright (c) 2018 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "equivalence.h"
8#include <string>
9#include <locale>
10#include <codecvt>
11#include <cc/alphabet.h>
12#include <re/re_re.h>
13#include <re/re_seq.h>
14#include <re/re_alt.h>
15#include <re/re_cc.h>
16#include <re/re_group.h>
17#include <re/re_range.h>
18#include <re/re_diff.h>
19#include <re/re_intersect.h>
20#include <re/re_assertion.h>
21#include <re/re_toolchain.h>
22#include <re/Unicode/decomposition.h>
23#include <re/printer_re.h>
24#include "UCD/PropertyAliases.h"
25#include "UCD/PropertyObjects.h"
26#include "UCD/PropertyObjectTable.h"
27#include "UCD/PropertyValueAliases.h"
28#include <UCD/Equivalence.h>
29#include <UCD/PrecomposedMappings.h>
30#include <UCD/unicode_set.h>
31#include <llvm/Support/Casting.h>
32#include <llvm/Support/raw_ostream.h>
33
34using namespace re;
35using namespace llvm;
36namespace UCD {
37   
38std::u32string getStringPiece(Seq * s, unsigned position) {
39    unsigned pos = position;
40    unsigned size = s->size();
41    std::u32string rslt;
42    while ((pos < size) && isa<CC>((*s)[pos])) {
43        CC * cc = cast<CC>((*s)[pos]);
44        if (cc->getAlphabet() != &cc::Unicode) return rslt;
45        if (cc->empty()) return rslt;
46        codepoint_t lo = lo_codepoint(cc->front());
47        codepoint_t hi = hi_codepoint(cc->back());
48        if (lo != hi) // not a singleton CC; end of the string piece.
49            return rslt;
50        rslt.push_back(lo);
51        pos++;
52    }
53    return rslt;
54}
55   
56
57// Constants for computation of Hangul decompositions, see Unicode Standard, section 3.12.
58const codepoint_t Hangul_SBase = 0xAC00;
59const codepoint_t Hangul_LBase = 0x1100;
60const codepoint_t Hangul_LMax = 0x1112;
61const codepoint_t Hangul_VBase = 0x1161;
62const codepoint_t Hangul_VMax = 0x1175;
63const codepoint_t Hangul_TBase = 0x11A7;
64const codepoint_t Hangul_TMax = 0x11C2;
65const unsigned Hangul_TCount = 28;
66const unsigned Hangul_NCount = 588;
67const unsigned Hangul_SCount = 11172;
68
69
70typedef std::pair<codepoint_t, std::vector<unsigned>> MatchResult_t;
71
72class ClusterMatchTransformer : public re::RE_Transformer {
73public:
74    /* Givan an RE in decomposed form (NFD + CaseFold and/or NFKD),
75     produce an RE that will match all equivalent strings under the
76     given options.   Note that embedded groups within the RE
77     may change options.  Example:
78     UCD:AllEquivalentsTransformer(UCD::Caseless).transformRE(r);
79     */
80    ClusterMatchTransformer(EquivalenceOptions opt = Canonical) :
81        RE_Transformer("ClusterMatchTransformer"),
82        mOptions(opt),
83        cccObj(cast<EnumeratedPropertyObject>(property_object_table[ccc])),
84        sUCobj(cast<StringPropertyObject>(property_object_table[suc])),
85        sLCobj(cast<StringPropertyObject>(property_object_table[slc])),
86        ccc0set(cccObj->GetCodepointSet(CCC_ns::NR)),
87        selfUC(std::move(sUCobj->GetReflexiveSet())),
88        selfLC(std::move(sLCobj->GetReflexiveSet()))
89        {}
90protected:
91    re::RE * transformSeq(re::Seq * seq) override;
92    re::RE * transformGroup(re::Group * g) override;
93    void find_precomposed(std::u32string & s, std::vector<unsigned> & sss,
94                          unsigned j, std::vector<MatchResult_t> & matches);
95    void getMatches(const Trie * t, std::u32string s, std::vector<unsigned> & ccc, unsigned pos,
96                              std::vector<MatchResult_t> & matches) const;
97    void getMatches(const Trie * t, std::u32string s, std::vector<unsigned> & ccc,
98                              unsigned pos, bool skip, std::vector<unsigned> positions, std::vector<MatchResult_t> & matches) const;
99    RE * addEquivalents(std::u32string NFD_string);
100private:
101    EquivalenceOptions mOptions;
102    EnumeratedPropertyObject * cccObj;
103    const StringPropertyObject * sUCobj;
104    const StringPropertyObject * sLCobj;
105    const UnicodeSet & ccc0set;
106    const UnicodeSet selfUC;
107    const UnicodeSet selfLC;
108};
109
110   
111
112RE * ClusterMatchTransformer::transformGroup(Group * g) {
113    re::Group::Mode mode = g->getMode();
114    re::Group::Sense sense = g->getSense();
115    auto r = g->getRE();
116    EquivalenceOptions saveOptions = mOptions;
117    if (mode == re::Group::Mode::CaseInsensitiveMode) {
118        if (sense == re::Group::Sense::On) {
119            mOptions = static_cast<EquivalenceOptions>(mOptions | UCD::Caseless);
120        } else {
121            mOptions = static_cast<EquivalenceOptions>(mOptions & ~UCD::Caseless);
122        }
123    } else if (mode == re::Group::Mode::CompatibilityMode) {
124        if (sense == re::Group::Sense::On) {
125            mOptions = static_cast<EquivalenceOptions>(mOptions | UCD::Compatible);
126        } else {
127            mOptions = static_cast<EquivalenceOptions>(mOptions & ~UCD::Compatible);
128        }
129    }
130    RE * t = transform(r);
131    mOptions = saveOptions;
132    if (t == r) return g;
133    return makeGroup(mode, t, sense);
134}
135
136RE * ClusterMatchTransformer::transformSeq(Seq * seq) {
137    // find and process all string pieces
138    unsigned size = seq->size();
139    if (size == 0) return seq;
140    std::vector<RE *> list;
141    unsigned i = 0;
142    bool changed = false;
143    while (i < size) {
144        std::u32string stringPiece = getStringPiece(seq, i);
145        if (stringPiece.size() > 0) {
146            RE * e = addEquivalents(stringPiece);
147            if (Seq * t = dyn_cast<Seq>(e)) {
148                unsigned tsize = t->size();
149                if ((tsize != size) || (getStringPiece(t,0) != stringPiece)) changed = true;
150            } else changed = true;
151            list.push_back(addEquivalents(stringPiece));
152            i += stringPiece.size();
153        } else {
154            RE * r = (*seq)[i];
155            RE * t = transform(r);
156            if (t != r) changed = true;
157            list.push_back(t);
158            i++;
159        }
160    }
161    if (!changed) return seq;
162    return makeSeq(list.begin(), list.end());
163}
164   
165// The lookup process to find precomposed characters that match a
166// given NFD string results a vector of match results.   Each
167// match result is a pair consisting of the precomposed codepoint
168// identified, plus a vector of suffix match positions.   Suffix
169// match positions are the position of the final starter (ccc = 0)
170// and the positions of following marks.  When there are multiple
171// consecutive marks in one combining class, only the last of the
172// marks is included.  Mark positions are ordered, but may not be
173// consecutive, because of permissible reordering under Unicode rules.
174   
175
176void ClusterMatchTransformer::find_precomposed(std::u32string & NFD_string, std::vector<unsigned> & combining_class,
177                                                unsigned j, std::vector<MatchResult_t> & matches) {
178    codepoint_t cp = NFD_string[j];
179    //
180    // Canonical equivalents are added in every case.
181    if ((cp >= Hangul_LBase) && (cp <= Hangul_LMax)) {
182        // Algorithmic precomposition for Hangul.
183        // We may have an LV precomposition or both an LV and an
184        // LVT precomposition.
185        if ((j+1 < NFD_string.size()) && (NFD_string[j+1] >= Hangul_VBase) && (NFD_string[j+1] <= Hangul_VMax)) {
186            auto LIndex = cp - Hangul_LBase;
187            auto VIndex = NFD_string[j+1] - Hangul_VBase;
188            auto LVIndex = LIndex * Hangul_NCount + VIndex * Hangul_TCount;
189            matches.push_back(MatchResult_t{Hangul_SBase + LVIndex, {j+1}});
190            if ((j+2 < NFD_string.size()) && (NFD_string[j+2] > Hangul_TBase) && (NFD_string[j+2] <= Hangul_TMax)) {
191                auto TIndex = NFD_string[j+2] - Hangul_TBase;
192                matches.push_back(MatchResult_t{Hangul_SBase + LVIndex + TIndex, {j+2}});
193            }
194        }
195    } else {
196        getMatches(&NFD_Trie, NFD_string, combining_class, j, matches);
197        if (hasOption(mOptions, UCD::Caseless)) {
198            getMatches(&NFDi_Trie, NFD_string, combining_class, j, matches);
199        }
200        if (hasOption(mOptions, UCD::Compatible)) {
201            getMatches(&NFKD_Trie, NFD_string, combining_class, j, matches);
202        }
203    }
204}
205
206
207void ClusterMatchTransformer::getMatches(const Trie * t, std::u32string s, std::vector<unsigned> & combining_class, unsigned pos,
208                                         std::vector<MatchResult_t> & matches) const {
209    getMatches(t, s, combining_class, pos, false, {}, matches);
210}
211
212void ClusterMatchTransformer::getMatches(const Trie * t, std::u32string NFD_string, std::vector<unsigned> & combining_class,
213                                            unsigned pos, bool skip, std::vector<unsigned> positions, std::vector<MatchResult_t> & matches) const {
214    std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
215    if (t->branches.empty() || pos == NFD_string.size()) return;
216    codepoint_t cp = NFD_string[pos];
217    // If we have skipped any marks but now find a starter, matching fails.
218    if (skip && (combining_class[cp] == 0)) return;
219    auto f = t->branches.find(cp);
220    // If we didn't find the codepoint, check for a caseless match.
221    if ((f == t->branches.end()) && hasOption(mOptions, UCD::Caseless)) {
222        if (!selfUC.contains(cp)) {
223            std::u32string cp_UC = conv.from_bytes(sUCobj->GetStringValue(cp));
224            assert(cp_UC.size() == 1);  // Simple upper case should alway yields a single codepoint.
225            cp = cp_UC[0];
226        } else if (!selfLC.contains(cp)) {
227            std::u32string cp_LC = conv.from_bytes(sLCobj->GetStringValue(cp));
228            assert(cp_LC.size() == 1);  // Simple lower case should alway yields a single codepoint.
229            cp = cp_LC[0];
230        }
231        f = t->branches.find(cp);
232    }
233    if (f != t->branches.end()) {
234        // Found a matching character.  Determine a new result position vector.
235        std::vector<unsigned> result_positions;
236        if ((combining_class[pos] == 0) || positions.empty()) result_positions = {pos};
237        else {
238            result_positions = positions;
239            if ((positions.back() == pos - 1) && (combining_class[pos - 1] == combining_class[pos])) {
240                result_positions[positions.size() - 1] = pos;
241            } else {
242                result_positions.push_back(pos);
243            }
244        }
245        const Trie * subtrie = &(f->second);
246        codepoint_t pc = subtrie->getPrecomposed();
247        if (pc <= 0x10FFFF) {
248            matches.push_back(MatchResult_t{pc, result_positions});
249        }
250        // There may be more matches; find them, with skip unchanged.
251        getMatches(subtrie, NFD_string, combining_class, pos + 1, skip, result_positions, matches);
252    }
253    // Now check for matches that skip the current character.   This is only
254    // possible when matching marks and we have at least one match.
255    if (positions.empty() || ccc0set.contains(cp)) return;
256    // Matching with gaps:  a precomposed sequence Q M122a can be matched with
257    // Q M4 M120 M122a, but not with Q M4 M122b M122a where M122a and M122b have
258    // the same CCC.  That is, if one mark is unmatched, then all following marks
259    // in the same CCC must remain unmatched.   Note that marks are in nondescending
260    // order by the NFD property.
261    unsigned ccc = cccObj->GetEnumerationValue(cp);
262    pos++;
263    while ((pos < NFD_string.size()) && (cccObj->GetEnumerationValue(NFD_string[pos]) == ccc)) {
264        pos++;
265    }
266    getMatches(t, NFD_string, combining_class, pos, /* skip = */ true, positions, matches);
267}
268
269   
270
271RE * markCoverageExpression(std::u32string s, std::vector<unsigned> & combining_class, unsigned pos, std::vector<std::pair<unsigned, codepoint_t>> & precomposed_coverage) {
272    CC * starter = makeCC(s[pos], &cc::Unicode);
273    CC * all_starters = makeCC(s[pos], &cc::Unicode);
274    for (auto & a : precomposed_coverage) {
275        all_starters->insert(a.second);
276    }
277    // First determine all marks of the cluster that actually occur, as
278    // well as equivalents for the four nonstarter decompositions.
279    // 0344 = 0308 0301, 0F73 = 0F71 0F72, 0F75 = 0F71 0F74, 0F81 = 0F71 0F80
280    // Note: no reordering possible for 0308 0301 sequences, both in ccc 230.
281    CC * allMarksCC = makeCC();
282    for (unsigned i = pos + 1; combining_class[i] != 0; i++) {
283        allMarksCC->insert(s[i]);
284        if ((s[i-1] == 0x308) && (s[i] == 0x301)) allMarksCC->insert(0x344);
285    }
286    bool has_xF71 = allMarksCC->contains(0xF71);
287    if (has_xF71) {
288        if (allMarksCC->contains(0xF72)) allMarksCC->insert(0xF73);
289        if (allMarksCC->contains(0xF74)) allMarksCC->insert(0xF75);
290        if (allMarksCC->contains(0xF80)) allMarksCC->insert(0xF81);
291    }
292    std::vector<RE *> coverage_terms = {all_starters, makeRep(allMarksCC, 0, Rep::UNBOUNDED_REP)};
293    unsigned i = pos + 1; // mark index
294    unsigned k = 0;       // precomposed_coverage index
295    while (combining_class[i] != 0) {
296        unsigned theCCC = combining_class[i];
297        // Prepare a coverage expression for all the marks of this CCC.
298        CC * combining_classCC = makeCC();
299        for (unsigned j = i; combining_class[j] == theCCC; j++) {
300            combining_classCC->insert(s[j]);
301            if ((s[j-1] == 0x308) && (s[j] == 0x301)) combining_classCC->insert(0x344);
302        }
303        if (has_xF71) {
304            if (combining_classCC->contains(0xF71)) {
305                if (allMarksCC->contains(0xF72)) combining_classCC->insert(0xF73);
306                if (allMarksCC->contains(0xF74)) combining_classCC->insert(0xF75);
307                if (allMarksCC->contains(0xF80)) combining_classCC->insert(0xF81);
308            }
309            if (combining_classCC->contains(0xF72)) combining_classCC->insert(0xF73);
310            if (combining_classCC->contains(0xF74)) combining_classCC->insert(0xF75);
311            if (combining_classCC->contains(0xF80)) combining_classCC->insert(0xF81);
312        }
313        RE * markStar = makeRep(subtractCC(allMarksCC, combining_classCC), 0, Rep::UNBOUNDED_REP);
314        std::vector<RE *> mark_terms;
315        //if (exactCoverageMode) {
316            mark_terms.push_back(starter);
317            mark_terms.push_back(markStar);
318        //}
319        RE * pre308 = nullptr;
320        do {
321            CC * theMark = makeCC(s[i], &cc::Unicode);
322            if (has_xF71) {
323                if (s[i] == 0xF71) {
324                    if (allMarksCC->contains(0xF72)) theMark->insert(0xF73);
325                    if (allMarksCC->contains(0xF74)) theMark->insert(0xF75);
326                    if (allMarksCC->contains(0xF80)) theMark->insert(0xF81);
327                }
328                if (s[i] == 0xF72) theMark->insert(0xF73);
329                if (s[i] == 0xF74) theMark->insert(0xF75);
330                if (s[i] == 0xF80) theMark->insert(0xF81);
331            }
332            if (s[i] == 0x308) {
333                pre308 = makeSeq(mark_terms.begin(), mark_terms.end());
334                mark_terms = std::vector<RE *>{pre308};
335                mark_terms.push_back(theMark);
336            } else if (pre308 && (s[i] == 0x301)) {
337                mark_terms.push_back(theMark);
338                RE * prefix = makeSeq(mark_terms.begin(), mark_terms.end());
339                mark_terms = std::vector<RE *>{makeAlt({prefix, makeSeq({pre308, makeCC(0x344)})})};
340                pre308 = nullptr;
341            } else {
342                pre308 = nullptr;
343                mark_terms.push_back(theMark);
344            }
345            if ((k < precomposed_coverage.size()) && (precomposed_coverage[k].first == i)) {
346                CC * s = makeCC();
347                while ((k < precomposed_coverage.size()) && (precomposed_coverage[k].first == i)) {
348                    s->insert(precomposed_coverage[k].second);
349                    k++;
350                }
351                RE * prefix = makeSeq(mark_terms.begin(), mark_terms.end());
352                mark_terms = std::vector<RE *>{makeAlt({prefix, s})};
353            }
354            mark_terms.push_back(markStar);
355            i++;
356        } while (combining_class[i] == theCCC);
357        coverage_terms.push_back(makeLookBehindAssertion(makeSeq(mark_terms.begin(), mark_terms.end())));
358    }
359    return makeSeq(coverage_terms.begin(), coverage_terms.end());
360}
361
362// Given a list of match results consisting of (codepoint, position vector) pairs,
363// produce a vector of (position, codepoint) pairs sorted by position.  As we
364// expect the vectors to be short and the entries to be mostly generated in
365// sorted order anyway, we use a simple insertion sort.
366
367std::vector<std::pair<unsigned, codepoint_t>> coverageVector(std::vector<MatchResult_t> matches) {
368    std::vector<std::pair<unsigned, codepoint_t>> coverage;
369    for (auto & m : matches) {
370        codepoint_t cp = m.first;
371        for (unsigned k = 1; k < m.second.size(); k++) {
372            unsigned posn = m.second[k];
373            unsigned i = coverage.size();
374            auto pair = std::make_pair(posn, cp);
375            coverage.push_back(pair);
376            while ((i > 0) && (posn < coverage[i-1].first)) {
377                coverage[i] = coverage[i-1];
378                i--;
379            }
380            coverage[i] = pair;
381        }
382    }
383    return coverage;
384}
385
386//
387//  Given an NFD string, construct an RE that will match all equivalent
388//  strings under the given equivalence relation.   Three processes are
389//  used:
390//  (a) finding all possible precomposed equivalents to substrings within
391//      the string.
392//  (b) permitting all possible orderings of marks, subject to the constraints
393//      of the canonical ordering algorithm, and
394//  (c) replacing all characters with their equivalent character class under
395//      the given equivalence relation.
396//
397
398RE * ClusterMatchTransformer::addEquivalents(std::u32string NFD_string) {
399    // Compute a vector of canonical combining class values of each character
400    // in the string, with an extra sentinel position, to allow simpl
401    // comparisons of CCC values of current and next positions.
402    std::vector<unsigned> combining_class(NFD_string.size()+1, 0);
403    for (unsigned i = 0; i < NFD_string.size(); i++) {
404        if (!ccc0set.contains(NFD_string[i])) {
405            combining_class[i] = cccObj->GetEnumerationValue(NFD_string[i]);
406        }
407    }
408   
409    RE * prefixRE = makeSeq();
410    // A vector of alternative prefix REs by NFD_string position.
411    std::vector<std::vector<RE *>> prefixTerms(NFD_string.size());
412   
413    unsigned i = 0;
414    while (i < NFD_string.size()) {
415        std::vector<MatchResult_t> matches;
416        unsigned cccCount = 0;
417        unsigned markCount = 0;
418        for (unsigned j = i+1; combining_class[j] != 0; j++) {
419            cccCount += combining_class[j] != combining_class[j+1];
420            markCount++;
421        }
422        if (combining_class[i] == 0) {
423            find_precomposed(NFD_string, combining_class, i, matches);
424        }
425        if (cccCount > 1) {
426            // We are at a cluster with multiple possible orderings.
427            // We use mark coverage expressions to handle all possible orderings
428            // as well as precomposed equivalents.
429            std::vector<std::pair<unsigned, codepoint_t>> coverage = coverageVector(matches);
430            prefixRE = makeSeq({prefixRE, markCoverageExpression(NFD_string, combining_class, i, coverage)});
431        } else {
432            for (unsigned k = 0; k < matches.size(); k++) {
433                CC * precomposedCC = makeCC(matches[k].first, &cc::Unicode);
434                unsigned last_cluster_pos = matches[k].second.back();
435                prefixTerms[last_cluster_pos].push_back(makeSeq({prefixRE, precomposedCC}));
436            }
437            for (unsigned j = i; j <= i + markCount; j++) {
438                prefixTerms[j].push_back(makeSeq({prefixRE, makeCC(NFD_string[j], &cc::Unicode)}));
439                prefixRE = makeAlt(prefixTerms[j].begin(), prefixTerms[j].end());
440            }
441        }
442        i += 1 + markCount;
443    }
444    return prefixRE;
445}
446
447
448RE * addClusterMatches(RE * r, EquivalenceOptions options) {
449    return ClusterMatchTransformer(options).transformRE(r);
450}
451       
452CC * add_equivalent_codepoints(CC * cc, EquivalenceOptions options) {
453    if (cc->getAlphabet() != &cc::Unicode) return cc;
454    UnicodeSet addedCC;
455    for (const interval_t & i : *cc) {
456        for (codepoint_t cp = lo_codepoint(i); cp <= hi_codepoint(i); cp++) {
457            addedCC = addedCC + equivalentCodepoints(cp, options);
458        }
459    }
460    if ((addedCC - *cc).empty()) return cc;
461    return makeCC(*cc + addedCC);
462}
463       
464
465class EquivalentCodepointsTransformer : public re::RE_Transformer {
466public:
467    EquivalentCodepointsTransformer(EquivalenceOptions opt = Canonical);
468protected:
469    re::RE * transformCC(re::CC * cc) override;
470    re::RE * transformGroup(re::Group * g) override;
471private:
472    EquivalenceOptions mOptions;
473};
474
475EquivalentCodepointsTransformer::EquivalentCodepointsTransformer(UCD::EquivalenceOptions opt) :
476    RE_Transformer("EquivalentCodepoints"),
477    mOptions(opt)
478    {}
479
480
481RE * EquivalentCodepointsTransformer::transformGroup(Group * g) {
482    re::Group::Mode mode = g->getMode();
483    re::Group::Sense sense = g->getSense();
484    auto r = g->getRE();
485    EquivalenceOptions saveOptions = mOptions;
486    if (mode == re::Group::Mode::CaseInsensitiveMode) {
487        if (sense == re::Group::Sense::On) {
488            mOptions = static_cast<EquivalenceOptions>(mOptions | UCD::Caseless);
489        } else {
490            mOptions = static_cast<EquivalenceOptions>(mOptions & ~UCD::Caseless);
491        }
492    } else if (mode == re::Group::Mode::CompatibilityMode) {
493        if (sense == re::Group::Sense::On) {
494            mOptions = static_cast<EquivalenceOptions>(mOptions | UCD::Compatible);
495        } else {
496            mOptions = static_cast<EquivalenceOptions>(mOptions & ~UCD::Compatible);
497        }
498    } else {
499        RE * t = transform(r);
500        // Keep the group/mode info.
501        if (t == r) return g;
502        return makeGroup(mode, t, sense);
503    }
504    // Once transformation is complete, the +i or +K modes are
505    // no longer needed.
506    RE * t = transform(r);
507    mOptions = saveOptions;
508    if (t == r) return r;
509    return t;
510}
511
512
513RE * EquivalentCodepointsTransformer::transformCC(CC * cc) {
514    return add_equivalent_codepoints(cc, mOptions);
515}
516
517RE * addEquivalentCodepoints(RE * re, UCD::EquivalenceOptions eqopt) {
518    return EquivalentCodepointsTransformer(eqopt).transformRE(re);
519}
520
521}
Note: See TracBrowser for help on using the repository browser.