source: proto/charsetcompiler/UCD/casefold.py @ 5143

Last change on this file since 5143 was 5143, checked in by cameron, 3 years ago

Updates for Unicode 9.0; clean-ups

File size: 6.5 KB
Line 
1#
2# casefold.py - parsing Unicode Character Database (UCD) files
3# and generating C headers for property data using a compact bitset
4# representation.
5#
6# Robert D. Cameron
7# December 2, 2014
8#
9# Licensed under Open Software License 3.0.
10#
11#
12import re, string, cformat
13import UCD_config
14from unicode_set import *
15
16
17
18#
19#  Processing files of the UCD
20#
21#  General format for skippable comments, blank lines
22UCD_skip = re.compile("^#.*$|^\s*$")
23
24#
25#  UCD Property File Format 4: property aliases
26#  PropertyAliases.txt
27#
28UCD_case_fold_regexp = re.compile("^([0-9A-F]{4,6})\s*;\s*([CSFT]);\s*((?:[-A-Za-z0-9_]+\s+)*[-A-Za-z0-9_]+)\s*(?:[;#]|$)")
29
30def parse_CaseFolding_txt():
31   fold_type = {}
32   fold_value = {}
33   f = open(UCD_config.UCD_src_dir + "/" + 'CaseFolding.txt')
34   lines = f.readlines()
35   for t in lines:
36      if UCD_skip.match(t): continue  # skip comment and blank lines
37      m = UCD_case_fold_regexp.match(t)
38      if not m: raise Exception("Unknown case fold syntax: %s" % t)
39      codepoint = int(m.group(1), 16)
40      fold_t = m.group(2)
41      fold_type[codepoint] = fold_t
42      fold_val = m.group(3)
43      if fold_t == 'T': 
44         print "Skipping Turkic entry"
45         continue  # skip Turkic
46      if fold_t == 'F':
47          fold_val = [int(x, 16) for x in fold_val.split(" ")]
48      else:
49          fold_val = int(fold_val, 16)
50      if fold_value.has_key(codepoint): fold_value[codepoint].append(fold_val)
51      else: fold_value[codepoint] = [fold_val]
52   return (fold_type, fold_value)
53
54
55def simple_CaseFolding_BitSets(fold_map):
56   BitDiffSet = {}
57   all_diff_bits = 0
58   for k in fold_map.keys():
59      v = fold_map[k]
60      if not isinstance(v, int): continue # skip nonsimple case folds
61      all_diff_bits |= v ^ k
62   diff_bit_count = 0
63   while all_diff_bits != 0:
64      diff_bit_count += 1
65      all_diff_bits >>= 1
66   for bit in range(diff_bit_count):
67      BitDiffSet[bit] = empty_uset()
68   for k in fold_map.keys():
69      s = singleton_uset(k)
70      v = fold_map[k]
71      if not isinstance(v, int): continue # skip nonsimple case folds
72      diff_bits = v ^ k
73      for b in range(diff_bit_count):
74         if diff_bits & 1 == 1:  BitDiffSet[b] = uset_union(BitDiffSet[b], s)
75         diff_bits >>= 1
76   return BitDiffSet
77
78def simple_CaseClosure_map(fold_map):
79   cl_map = {}
80   for k in fold_map.keys():
81      folds = fold_map[k]
82      for v in folds:
83        if not isinstance(v, int): continue # skip nonsimple case folds
84        if not cl_map.has_key(v): cl_map[v] = [k]
85        else: cl_map[v].append(k)
86        if not cl_map.has_key(k): cl_map[k] = [v]
87        else: cl_map[k].append(v)
88   newEntries = True
89   while newEntries:
90      newEntries = False
91      for k in cl_map.keys():
92         vlist = cl_map[k]
93         for v in vlist:
94            for w in cl_map[v]:
95               if k != w and not k in cl_map[w]:
96                  cl_map[w].append(k)
97                  newEntries = True
98   return cl_map
99
100#
101# Simple case fold map.     
102# The simple case fold map is an ordered list of fold entries each of
103# the form (lo_codepoint, hicodepoint, offset).  Each entry describes
104# the case fold that applies for the consecutive entries in the given
105# codepoint range, according to the following equations. 
106# casefold(x) = x + offset, if ((x - low_codepoint) div offset) mod 2 = 0
107#             = x - offset, if ((x - low_codepoint) div offset) mod 2 = 1
108#
109#
110def caseFoldRangeMap(casemap):
111   foldable = sorted(casemap.keys())
112   entries = []
113   cp = foldable[0]
114   open_entries = [(cp, f - cp) for f in casemap[cp]]
115   last_cp = cp
116   for cp in foldable[1:]:
117      if cp != last_cp + 1:
118         # Close the pending range entries
119         for (cp0, offset) in open_entries:
120            entries.append((cp0, last_cp, offset))
121         open_entries = [(cp, f - cp) for f in casemap[cp]]
122      else:
123         new_open = []
124         projected = []
125         for (cp0, offset) in open_entries:
126            even_odd_offset_group = (abs(cp - cp0)/ abs(offset)) & 1
127            if even_odd_offset_group == 0: 
128               projected_foldcp = cp + offset
129            else: projected_foldcp = cp - offset
130            if not projected_foldcp in casemap[cp]:
131               entries.append((cp0, last_cp, offset))
132            else:
133               new_open.append((cp0, offset))
134               projected.append(projected_foldcp)
135         open_entries = new_open
136         for f in casemap[cp]:
137            if not f in projected:
138               open_entries.append((cp, f-cp))
139      last_cp = cp
140   # Close the final entries.
141   for (cp0, offset) in open_entries:
142      entries.append((cp0, last_cp, offset))
143   return entries
144
145
146
147def genFoldEntryData(casemap):
148   rMap = caseFoldRangeMap(casemap)
149   individuals = [(m[0],m[0]+m[2]) for m in rMap if m[0] == m[1]]
150   ranges = [m for m in rMap if m[0] != m[1]]
151   last_hi = -1
152   generated = "const FoldEntry foldTable[foldTableSize] = {\n"
153   foldTableSize = 0
154   for (lo, hi, offset) in ranges:
155      if lo != last_hi + 1:
156         pairs = ["{0x%x, 0x%x}" % (m[0], m[1]) for m in individuals if m[0]>last_hi and m[0]< lo]
157         generated += "  {0x%x, 0, {" % (last_hi + 1) + cformat.multiline_fill(pairs) + "}},\n"
158         foldTableSize += 1
159      last_hi = hi
160      pairs = ["{0x%x, 0x%x}" % (m[0], m[1]) for m in individuals if m[0]>=lo and m[0]<= hi]
161      generated += "  {0x%x, %i, {" % (lo, offset) + cformat.multiline_fill(pairs) + "}},\n"
162      foldTableSize += 1
163   if last_hi != 0x10FFFF:
164      pairs = ["{0x%x, 0x%x}" % (m[0], m[1]) for m in individuals if m[0]>last_hi]
165      generated += "  {0x%x, 0, {" % (last_hi + 1) + cformat.multiline_fill(pairs) + "}},\n"
166      foldTableSize += 1
167   generated += "  {0x110000, 0, {}}};"
168   foldTableSize += 1
169   generated = "\nconst int foldTableSize = %s;\n\n" % foldTableSize  + generated
170   return generated
171
172foldDeclarations = r"""
173typedef unsigned codepoint_t;
174
175struct FoldEntry {
176    re::codepoint_t range_lo;
177    int fold_offset;
178    std::vector<re::interval_t> fold_pairs;
179};
180
181
182void caseInsensitiveInsertRange(re::CC * cc, const re::codepoint_t lo, const re::codepoint_t hi);
183
184inline void caseInsensitiveInsert(re::CC * cc, const re::codepoint_t cp) {
185    caseInsensitiveInsertRange(cc, cp, cp);
186}
187"""
188
189def genCaseFolding_txt_h():
190   (ft, fv) = parse_CaseFolding_txt()
191   cm = simple_CaseClosure_map(fv)
192   f = cformat.open_header_file_for_write('CaseFolding_txt', 'casefold.py')
193   cformat.write_imports(f, ["<vector>", "<utility>", '"re/re_re.h"', '"re/re_cc.h"'])
194   f.write(foldDeclarations)
195   f.write(genFoldEntryData(cm))
196   cformat.close_header_file(f)
197
198if __name__ == "__main__":
199   genCaseFolding_txt_h()
200
201
202
Note: See TracBrowser for help on using the repository browser.