source: icGREP/icgrep-devel/UCD-scripts/casefold.py @ 5673

Last change on this file since 5673 was 5673, checked in by cameron, 2 years ago

Case folding property objects

File size: 5.6 KB
Line 
1#
2# casefold.py - parsing Unicode Character Database (UCD) files
3# and generating C headers for property data using a compact bitset
4# representation.
5#
6# Robert D. Cameron
7# December 2, 2014
8#
9# Licensed under Open Software License 3.0.
10#
11#
12import re, string, cformat
13import UCD_config
14from unicode_set import *
15from UCD_parser import parse_PropertyAlias_txt, parse_CaseFolding_txt
16
17def simple_CaseFolding_BitSets(fold_map):
18   BitDiffSet = {}
19   all_diff_bits = 0
20   for k in fold_map.keys():
21      v = fold_map[k]
22      if not isinstance(v, int): continue # skip nonsimple case folds
23      all_diff_bits |= v ^ k
24   diff_bit_count = 0
25   while all_diff_bits != 0:
26      diff_bit_count += 1
27      all_diff_bits >>= 1
28   for bit in range(diff_bit_count):
29      BitDiffSet[bit] = empty_uset()
30   for k in fold_map.keys():
31      s = singleton_uset(k)
32      v = fold_map[k]
33      if not isinstance(v, int): continue # skip nonsimple case folds
34      diff_bits = v ^ k
35      for b in range(diff_bit_count):
36         if diff_bits & 1 == 1:  BitDiffSet[b] = uset_union(BitDiffSet[b], s)
37         diff_bits >>= 1
38   return BitDiffSet
39
40def simple_CaseClosure_map(fold_data):
41   simpleFoldMap = {}
42   for k in fold_data['S'].keys(): simpleFoldMap[k] = int(fold_data['S'][k], 16)
43   for k in fold_data['C'].keys(): simpleFoldMap[k] = int(fold_data['C'][k], 16)
44   cl_map = {}
45   for k in simpleFoldMap.keys():
46      v = simpleFoldMap[k]
47      if not v in cl_map: cl_map[v] = [k]
48      else: cl_map[v].append(k)
49      if not k in cl_map: cl_map[k] = [v]
50      else: cl_map[k].append(v)
51   newEntries = True
52   while newEntries:
53      newEntries = False
54      for k in cl_map.keys():
55         vlist = cl_map[k]
56         for v in vlist:
57            for w in cl_map[v]:
58               if k != w and not k in cl_map[w]:
59                  cl_map[w].append(k)
60                  newEntries = True
61   return cl_map
62
63#
64# Simple case fold map.     
65# The simple case fold map is an ordered list of fold entries each of
66# the form (lo_codepoint, hicodepoint, offset).  Each entry describes
67# the case fold that applies for the consecutive entries in the given
68# codepoint range, according to the following equations. 
69# casefold(x) = x + offset, if ((x - low_codepoint) div offset) mod 2 = 0
70#             = x - offset, if ((x - low_codepoint) div offset) mod 2 = 1
71#
72#
73def caseFoldRangeMap(casemap):
74   foldable = sorted(casemap.keys())
75   entries = []
76   cp = foldable[0]
77   open_entries = [(cp, f - cp) for f in casemap[cp]]
78   last_cp = cp
79   for cp in foldable[1:]:
80      if cp != last_cp + 1:
81         # Close the pending range entries
82         for (cp0, offset) in open_entries:
83            entries.append((cp0, last_cp, offset))
84         open_entries = [(cp, f - cp) for f in casemap[cp]]
85      else:
86         new_open = []
87         projected = []
88         for (cp0, offset) in open_entries:
89            even_odd_offset_group = int(abs(cp - cp0)/ abs(offset)) & 1
90            if even_odd_offset_group == 0: 
91               projected_foldcp = cp + offset
92            else: projected_foldcp = cp - offset
93            if not projected_foldcp in casemap[cp]:
94               entries.append((cp0, last_cp, offset))
95            else:
96               new_open.append((cp0, offset))
97               projected.append(projected_foldcp)
98         open_entries = new_open
99         for f in casemap[cp]:
100            if not f in projected:
101               open_entries.append((cp, f-cp))
102      last_cp = cp
103   # Close the final entries.
104   for (cp0, offset) in open_entries:
105      entries.append((cp0, last_cp, offset))
106   return entries
107
108
109
110def genFoldEntryData(casemap):
111   rMap = caseFoldRangeMap(casemap)
112   individuals = [(m[0],m[0]+m[2]) for m in rMap if m[0] == m[1]]
113   ranges = [m for m in rMap if m[0] != m[1]]
114   last_hi = -1
115   generated = "const FoldEntry foldTable[foldTableSize] = {\n"
116   foldTableSize = 0
117   for (lo, hi, offset) in ranges:
118      if lo != last_hi + 1:
119         pairs = ["{0x%x, 0x%x}" % (m[0], m[1]) for m in individuals if m[0]>last_hi and m[0]< lo]
120         generated += "  {0x%x, 0, {" % (last_hi + 1) + cformat.multiline_fill(pairs) + "}},\n"
121         foldTableSize += 1
122      last_hi = hi
123      pairs = ["{0x%x, 0x%x}" % (m[0], m[1]) for m in individuals if m[0]>=lo and m[0]<= hi]
124      generated += "  {0x%x, %i, {" % (lo, offset) + cformat.multiline_fill(pairs) + "}},\n"
125      foldTableSize += 1
126   if last_hi != 0x10FFFF:
127      pairs = ["{0x%x, 0x%x}" % (m[0], m[1]) for m in individuals if m[0]>last_hi]
128      generated += "  {0x%x, 0, {" % (last_hi + 1) + cformat.multiline_fill(pairs) + "}},\n"
129      foldTableSize += 1
130   generated += "  {0x110000, 0, {}}};"
131   foldTableSize += 1
132   generated = "\nconst int foldTableSize = %s;\n\n" % foldTableSize  + generated
133   return generated
134
135foldDeclarations = r"""
136typedef unsigned codepoint_t;
137
138struct FoldEntry {
139    re::codepoint_t range_lo;
140    int fold_offset;
141    std::vector<re::interval_t> fold_pairs;
142};
143
144
145void caseInsensitiveInsertRange(re::CC * cc, const re::codepoint_t lo, const re::codepoint_t hi);
146
147inline void caseInsensitiveInsert(re::CC * cc, const re::codepoint_t cp) {
148    caseInsensitiveInsertRange(cc, cp, cp);
149}
150"""
151
152def genCaseFolding_txt_h():
153    (property_enum_name_list, property_object_map) = parse_PropertyAlias_txt()
154    fold_data = parse_CaseFolding_txt(property_object_map)
155    cm = simple_CaseClosure_map(fold_data)
156    f = cformat.open_header_file_for_write('CaseFolding_txt', 'casefold.py')
157    cformat.write_imports(f, ["<vector>", '"re/re_cc.h"'])
158    f.write(foldDeclarations)
159    f.write(genFoldEntryData(cm))
160    #emit_property(f, 'scf')
161    #emit_property(f, 'cf')
162    cformat.close_header_file(f)
163
164if __name__ == "__main__":
165   genCaseFolding_txt_h()
166
167
168
Note: See TracBrowser for help on using the repository browser.