source: icGREP/icgrep-devel/icgrep/UCD/CaseFolding.cpp @ 6161

Last change on this file since 6161 was 5782, checked in by nmedfort, 23 months ago

Initial check-in of LookAhead? support; modified LineBreakKernel? to compute CR+LF using LookAhead?(1) + misc. fixes.

File size: 3.6 KB
Line 
1/*
2 *  Copyright (c) 2014 International Characters, Inc.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters, Inc.
5 *
6 */
7
8#include "CaseFolding.h"
9#include <UCD/unicode_set.h>
10#include <algorithm>
11
12using namespace UCD;
13
14int findFoldEntry(const codepoint_t cp) {
15    int lo = 0;
16    int hi = foldTableSize;
17    while (hi - lo > 1) {
18        int mid = (lo + hi)/2;
19        if (cp < foldTable[mid].range_lo) {
20            hi = mid;
21        }
22        else {
23            lo = mid;
24        }
25    }
26    return lo;
27}
28
29void caseInsensitiveInsertRange(UnicodeSet & cc, const codepoint_t lo, const codepoint_t hi) {
30    cc.insert_range(lo, hi);
31    // Find the first foldTable entry overlapping the (lo, hi) range.
32    int e = findFoldEntry(lo);
33    // There may be multiple consecutive entries overlapping the range.
34    // Keep processing until we are done.
35    while (foldTable[e].range_lo <= hi) {
36        const FoldEntry & fe = foldTable[e];
37        const FoldEntry & fnext = foldTable[e + 1];
38        // Constrain (lo, hi) to this entry only.
39        codepoint_t lo1 = std::max(lo, fe.range_lo);
40        codepoint_t hi1 = std::min(hi, fnext.range_lo - 1);
41        if (fe.fold_offset > 0 && fe.range_lo + fe.fold_offset < fnext.range_lo) {
42            //
43            // There are more than fold_offset values in the range, meaning that
44            // we have an extended range with alternating subranges of positive
45            // and negative offsets.
46            // First find the negative offset subrange.
47            codepoint_t subrange_lo = lo1 - ((lo1 - fe.range_lo) % (2 * fe.fold_offset));
48            codepoint_t negative_subrange_lo = subrange_lo + fe.fold_offset;
49            codepoint_t negative_subrange_hi = subrange_lo + 2 * fe.fold_offset - 1;
50            if ((lo1 <= negative_subrange_hi) && (hi1 >= negative_subrange_lo)) {
51                // negative offsets apply
52                cc.insert_range(std::max(negative_subrange_lo,lo1) - fe.fold_offset, std::min(negative_subrange_hi, hi1) - fe.fold_offset);
53            }
54            // Now the positive offset subrange.
55            codepoint_t positive_subrange_lo = hi1 - ((hi1 - fe.range_lo) % (2 * fe.fold_offset));
56            codepoint_t positive_subrange_hi = positive_subrange_lo + fe.fold_offset - 1;
57            if ((lo1 <= positive_subrange_hi) && (hi1 >= positive_subrange_lo)) {
58                cc.insert_range(std::max(positive_subrange_lo, lo1) + fe.fold_offset, std::min(positive_subrange_hi, hi1) + fe.fold_offset);
59            }
60        }
61        else if (fe.fold_offset != 0) {
62            // We have either a positive or negative offset, and all offsets for
63            // this entry have the same sign.
64            cc.insert_range(lo1 + fe.fold_offset, hi1 + fe.fold_offset);
65        }
66        // Now pick up any individual fold entries.
67        for (unsigned i = 0; i < fe.fold_pairs.size(); i++) {
68            if (fe.fold_pairs[i].first < lo) continue;  // Only possible for first fold_entry.
69            if (fe.fold_pairs[i].first > hi) break;     // Only possible for last fold_entry.
70            cc.insert(fe.fold_pairs[i].second);
71        }
72        // Move on to the next fold_entry.
73        e++;
74    }
75}
76inline codepoint_t lo_codepoint(const interval_t & i) {
77    return std::get<0>(i);
78}
79
80inline codepoint_t hi_codepoint(const interval_t & i) {
81    return std::get<1>(i);
82}
83
84UnicodeSet caseInsensitize(const UnicodeSet & cc) {
85    UnicodeSet cci;
86    for (const interval_t i : cc) {
87        caseInsensitiveInsertRange(cci, lo_codepoint(i), hi_codepoint(i));
88    }
89    return cci;
90}
91
92
Note: See TracBrowser for help on using the repository browser.