[4316] | 1 | /* |
---|
| 2 | * Copyright (c) 2014 International Characters, Inc. |
---|
| 3 | * This software is licensed to the public under the Open Software License 3.0. |
---|
| 4 | * icgrep is a trademark of International Characters, Inc. |
---|
| 5 | * |
---|
| 6 | */ |
---|
| 7 | |
---|
[5673] | 8 | #include "CaseFolding.h" |
---|
[5770] | 9 | #include <UCD/unicode_set.h> |
---|
[4316] | 10 | #include <algorithm> |
---|
| 11 | |
---|
[5748] | 12 | using namespace UCD; |
---|
[4615] | 13 | |
---|
| 14 | int findFoldEntry(const codepoint_t cp) { |
---|
[4612] | 15 | int lo = 0; |
---|
[4316] | 16 | int hi = foldTableSize; |
---|
| 17 | while (hi - lo > 1) { |
---|
[4612] | 18 | int mid = (lo + hi)/2; |
---|
| 19 | if (cp < foldTable[mid].range_lo) { |
---|
| 20 | hi = mid; |
---|
| 21 | } |
---|
| 22 | else { |
---|
| 23 | lo = mid; |
---|
| 24 | } |
---|
[4316] | 25 | } |
---|
| 26 | return lo; |
---|
| 27 | } |
---|
| 28 | |
---|
[5781] | 29 | void caseInsensitiveInsertRange(UnicodeSet & cc, const codepoint_t lo, const codepoint_t hi) { |
---|
| 30 | cc.insert_range(lo, hi); |
---|
[4316] | 31 | // Find the first foldTable entry overlapping the (lo, hi) range. |
---|
| 32 | int e = findFoldEntry(lo); |
---|
| 33 | // There may be multiple consecutive entries overlapping the range. |
---|
| 34 | // Keep processing until we are done. |
---|
| 35 | while (foldTable[e].range_lo <= hi) { |
---|
[4612] | 36 | const FoldEntry & fe = foldTable[e]; |
---|
| 37 | const FoldEntry & fnext = foldTable[e + 1]; |
---|
| 38 | // Constrain (lo, hi) to this entry only. |
---|
| 39 | codepoint_t lo1 = std::max(lo, fe.range_lo); |
---|
| 40 | codepoint_t hi1 = std::min(hi, fnext.range_lo - 1); |
---|
| 41 | if (fe.fold_offset > 0 && fe.range_lo + fe.fold_offset < fnext.range_lo) { |
---|
| 42 | // |
---|
| 43 | // There are more than fold_offset values in the range, meaning that |
---|
| 44 | // we have an extended range with alternating subranges of positive |
---|
| 45 | // and negative offsets. |
---|
| 46 | // First find the negative offset subrange. |
---|
| 47 | codepoint_t subrange_lo = lo1 - ((lo1 - fe.range_lo) % (2 * fe.fold_offset)); |
---|
| 48 | codepoint_t negative_subrange_lo = subrange_lo + fe.fold_offset; |
---|
| 49 | codepoint_t negative_subrange_hi = subrange_lo + 2 * fe.fold_offset - 1; |
---|
| 50 | if ((lo1 <= negative_subrange_hi) && (hi1 >= negative_subrange_lo)) { |
---|
| 51 | // negative offsets apply |
---|
[5781] | 52 | cc.insert_range(std::max(negative_subrange_lo,lo1) - fe.fold_offset, std::min(negative_subrange_hi, hi1) - fe.fold_offset); |
---|
[4612] | 53 | } |
---|
| 54 | // Now the positive offset subrange. |
---|
| 55 | codepoint_t positive_subrange_lo = hi1 - ((hi1 - fe.range_lo) % (2 * fe.fold_offset)); |
---|
| 56 | codepoint_t positive_subrange_hi = positive_subrange_lo + fe.fold_offset - 1; |
---|
| 57 | if ((lo1 <= positive_subrange_hi) && (hi1 >= positive_subrange_lo)) { |
---|
[5781] | 58 | cc.insert_range(std::max(positive_subrange_lo, lo1) + fe.fold_offset, std::min(positive_subrange_hi, hi1) + fe.fold_offset); |
---|
[4612] | 59 | } |
---|
| 60 | } |
---|
| 61 | else if (fe.fold_offset != 0) { |
---|
| 62 | // We have either a positive or negative offset, and all offsets for |
---|
| 63 | // this entry have the same sign. |
---|
[5781] | 64 | cc.insert_range(lo1 + fe.fold_offset, hi1 + fe.fold_offset); |
---|
[4612] | 65 | } |
---|
| 66 | // Now pick up any individual fold entries. |
---|
[4750] | 67 | for (unsigned i = 0; i < fe.fold_pairs.size(); i++) { |
---|
[4612] | 68 | if (fe.fold_pairs[i].first < lo) continue; // Only possible for first fold_entry. |
---|
| 69 | if (fe.fold_pairs[i].first > hi) break; // Only possible for last fold_entry. |
---|
[5781] | 70 | cc.insert(fe.fold_pairs[i].second); |
---|
[4612] | 71 | } |
---|
| 72 | // Move on to the next fold_entry. |
---|
| 73 | e++; |
---|
[4316] | 74 | } |
---|
| 75 | } |
---|
[5770] | 76 | inline codepoint_t lo_codepoint(const interval_t & i) { |
---|
| 77 | return std::get<0>(i); |
---|
| 78 | } |
---|
[4316] | 79 | |
---|
[5770] | 80 | inline codepoint_t hi_codepoint(const interval_t & i) { |
---|
| 81 | return std::get<1>(i); |
---|
| 82 | } |
---|
[4316] | 83 | |
---|
[5781] | 84 | UnicodeSet caseInsensitize(UnicodeSet & cc) { |
---|
| 85 | UnicodeSet cci; |
---|
| 86 | for (const interval_t i : cc) { |
---|
[5770] | 87 | caseInsensitiveInsertRange(cci, lo_codepoint(i), hi_codepoint(i)); |
---|
| 88 | } |
---|
| 89 | return cci; |
---|
| 90 | } |
---|
| 91 | |
---|
| 92 | |
---|