1 | # |
---|
2 | # UCD_properties.py - parsing Unicode Character Database (UCD) files |
---|
3 | # and generating C headers for property data using a compact bitset |
---|
4 | # representation. |
---|
5 | # |
---|
6 | # Robert D. Cameron |
---|
7 | # September 10, 2014 |
---|
8 | # |
---|
9 | # Licensed under Open Software License 3.0. |
---|
10 | # |
---|
11 | # |
---|
12 | import re, string |
---|
13 | from unicode_set import * |
---|
14 | |
---|
# Directory prefix used when opening UCD data files
# (presumably named after the Unicode version it holds).
UCD_dir = "7.0.0"


#
#  Processing files of the UCD
#
#  General format for skippable comments, blank lines.
#  Patterns are raw strings so that \s reaches the regex engine unchanged
#  instead of relying on Python passing an unknown string escape through.
UCD_skip = re.compile(r"^#.*$|^\s*$")

#
#  UCD Property File Format 1: property aliases
#  PropertyAliases.txt
#
#  group(1): first field; group(2): second field;
#  group(3): everything after the second field up to any '#' comment.
UCD_property_alias_regexp = re.compile(r"^([-A-Za-z_0-9]+)\s*;\s*([-A-Za-z_0-9]+)([^#]*)")
---|
29 | |
---|
def parse_PropertyAlias_txt():
    """Parse PropertyAliases.txt from the UCD_dir directory.

    Returns a 3-tuple:
      property_enum_name_list -- first field of every entry, in file order
      full_name_map           -- first field -> second field of the entry
      property_lookup_map     -- canonicalize()d spelling -> first field, for
                                 the first field, the second field and every
                                 further alias found on the line

    Raises Exception for a line that is neither skippable (comment/blank)
    nor matched by UCD_property_alias_regexp.
    """
    property_enum_name_list = []
    full_name_map = {}
    property_lookup_map = {}
    # The context manager closes the file even when a malformed line raises.
    with open(UCD_dir + "/" + 'PropertyAliases.txt') as f:
        for t in f:
            if UCD_skip.match(t):
                continue  # skip comment and blank lines
            m = UCD_property_alias_regexp.match(t)
            if not m:
                raise Exception("Unknown property alias syntax: %s" % t)
            prop_enum = m.group(1)
            prop_preferred_full_name = m.group(2)
            # Whatever follows the second field holds zero or more extra aliases.
            prop_aliases = re.findall("[-A-Za-z_0-9]+", m.group(3))
            property_enum_name_list.append(prop_enum)
            full_name_map[prop_enum] = prop_preferred_full_name
            property_lookup_map[canonicalize(prop_enum)] = prop_enum
            property_lookup_map[canonicalize(prop_preferred_full_name)] = prop_enum
            for a in prop_aliases:
                property_lookup_map[canonicalize(a)] = prop_enum
    return (property_enum_name_list, full_name_map, property_lookup_map)
---|
50 | |
---|
# Characters ignored when comparing names: hyphen, underscore, whitespace.
# Raw string so that \s is handed to the regex engine as written.
trivial_name_char_re = re.compile(r'[-_\s]')
def canonicalize(property_string):
    """Return the lookup key for a property or value name.

    The name is lowercased and every hyphen, underscore and whitespace
    character is removed.  If the result is longer than two characters and
    starts with "is", that prefix is dropped as well; a bare "is" is kept.
    """
    c = trivial_name_char_re.sub('', property_string.lower())
    if len(c) > 2 and c.startswith("is"):
        return c[2:]
    return c
---|
56 | |
---|
57 | |
---|
# Opening text of every generated header: include guard (both %s slots take
# the same guard macro name), licence banner and the standard includes.
# The banner now names the generator as UCD_properties.py (it previously
# read "UCD_properties.y").
header_template = r"""#ifndef %s
#define %s
/*
 *  Copyright (c) 2014 International Characters, Inc.
 *  This software is licensed to the public under the Open Software License 3.0.
 *  icgrep is a trademark of International Characters, Inc.
 *
 *  This file is generated by UCD_properties.py - manual edits may be lost.
 */

#include <string>
#include <unordered_map>

"""
---|
72 | |
---|
def open_header_file_for_write(filename):
    """Create '<filename>.h', write the standard preamble and return the file.

    The include-guard macro is the upper-cased filename followed by '_H'.
    The returned file object is left open for the caller to append to.
    """
    header = open('%s.h' % filename, 'w')
    guard_macro = '%s_H' % filename.upper()
    header.write(header_template % (guard_macro, guard_macro))
    return header
---|
78 | |
---|
def close_header_file(f):
    """Write the closing '#endif' of the include guard and close the file.

    The file is closed even if the final write raises, so the handle is
    never leaked; the write error itself still propagates to the caller.
    """
    try:
        f.write("#endif\n")
    finally:
        f.close()
---|
82 | |
---|
# C++ skeleton for the property alias tables inside namespace UCD.
# The three %s slots are filled, in order, with:
#   1. the enumerator list of property_t,
#   2. the string list of property_full_name[],
#   3. the initializer list of alias_map.
PropertyAliases_template = r"""
namespace UCD {
  enum property_t {
%s
  };
  const std::string property_full_name[] = {
%s
  };
  const std::unordered_map<std::string, property_t> alias_map = {
%s
  };
}
"""
---|
96 | |
---|
def multiline_join(item_list, items_per_line, separator = ",", closer='', indent = 4):
    """Lay out the strings of item_list over several indented lines.

    item_list      -- sliceable sequence of strings
    items_per_line -- maximum number of items on one line; must be >= 1
    separator      -- placed (followed by a space) between items on a line
                      and, without the space, at the end of every line
                      except the last
    closer         -- appended to the last line
    indent         -- number of spaces prefixed to every line

    Returns the lines joined by newlines, with no trailing newline.  An empty
    item_list yields just the indentation followed by closer.

    Raises ValueError when items_per_line is smaller than 1; such a value
    can never consume the list and previously made the loop spin forever.
    """
    if items_per_line < 1:
        raise ValueError("items_per_line must be at least 1, got %r" % (items_per_line,))
    pad = " " * indent
    sep_with_space = separator + " "
    rows = []
    start = 0
    # Strict '>' on purpose: when the remaining items exactly fill a line they
    # form the final line, so the output never ends with an empty line.
    while len(item_list) - start > items_per_line:
        chunk = item_list[start:start + items_per_line]
        rows.append(pad + sep_with_space.join(chunk) + separator)
        start += items_per_line
    rows.append(pad + sep_with_space.join(item_list[start:]) + closer)
    return "\n".join(rows)
---|
106 | |
---|
enums_per_line = 4
def generate_PropertyAliases_h():
    """Write PropertyAliases.h from the parsed PropertyAliases.txt data.

    The header receives three tables: the lower-cased property codes as enum
    constants, the quoted full names in the same order, and one
    {"key", constant} entry for every lookup key in sorted key order.
    """
    (enum_names, full_names, lookup) = parse_PropertyAlias_txt()
    header = open_header_file_for_write('PropertyAliases')
    lowered_enums = [name.lower() for name in enum_names]
    quoted_full_names = ['"%s"' % full_names[name] for name in enum_names]
    alias_entries = []
    for alias in sorted(lookup.keys()):
        alias_entries.append('{"%s", %s}' % (alias, lookup[alias].lower()))
    sections = (
        multiline_join(lowered_enums, enums_per_line, ','),
        multiline_join(quoted_full_names, 2, ','),
        multiline_join(alias_entries, 2, ','),
    )
    header.write(PropertyAliases_template % sections)
    close_header_file(header)
---|
116 | |
---|
117 | # |
---|
118 | # UCD Property File Format 2: property value aliases |
---|
119 | # PropertyValueAliases.txt |
---|
120 | # |
---|
121 | # This file records value aliases for property values for |
---|
122 | # each enumerated property, with the following additional notes: |
---|
123 | # (1) The corresponding integer value of the enum constant is |
---|
124 | # also specified for ccc (second field). |
---|
125 | # (2) The Age property is a numeric type which has decimal float |
---|
126 | # values as the enum constants: these won't be legal in enum syntax. |
---|
127 | # (3) Binary properties also have enumerated values and aliases listed, |
---|
128 | # although this is redundant, because all binary properties have the |
---|
129 | # same value space. |
---|
130 | # |
---|
131 | |
---|
# group(1): property code; group(2), group(3): second and third fields;
# group(4): everything after the third field up to any '#' comment.
# Raw string so that \s is handed to the regex engine as written.
UCD_property_value_alias_regexp = re.compile(r"^([-A-Za-z_0-9.]+)\s*;\s*([-A-Za-z_0-9.]+)\s*;\s*([-A-Za-z_0-9.]+)([^#]*)")
---|
133 | |
---|
def parse_PropertyValueAlias_txt():
    """Parse PropertyValueAliases.txt from the UCD_dir directory.

    Returns a 4-tuple of dicts, each keyed by property code (first field):
      property_value_list          -- list of value enum names in file order
      property_value_enum_integer  -- value enum name -> integer
      property_value_full_name_map -- value enum name -> preferred full name
      property_value_lookup_map    -- canonicalize()d spelling -> value enum
                                      name, for the enum name, the full name
                                      and every alias

    Integers count up from 0 from the first entry seen for a property, except
    for 'ccc', whose integer is read from the second field of each line.

    Raises Exception for a line that is neither skippable nor matched by
    UCD_property_value_alias_regexp, and for a 'ccc' line that carries no
    full name after the enum field.
    """
    property_value_list = {}
    property_value_enum_integer = {}
    property_value_full_name_map = {}
    property_value_lookup_map = {}
    # The context manager closes the file even when a malformed line raises.
    with open(UCD_dir + "/" + 'PropertyValueAliases.txt') as f:
        for t in f:
            if UCD_skip.match(t):
                continue  # skip comment and blank lines
            m = UCD_property_value_alias_regexp.match(t)
            if not m:
                raise Exception("Unknown property value alias syntax: %s" % t)
            prop_code = m.group(1)
            # `in` replaces dict.has_key(), which no longer exists in Python 3.
            if prop_code not in property_value_list:
                property_value_list[prop_code] = []
                property_value_enum_integer[prop_code] = {}
                property_value_full_name_map[prop_code] = {}
                property_value_lookup_map[prop_code] = {}
                # Restart numbering at the first entry of each property.
                enum_integer = 0
            extra = m.group(4)
            # Special case for ccc: second field is enum integer value
            if prop_code == 'ccc':
                enum_integer = int(m.group(2))
                value_enum = m.group(3)
                extra_list = re.findall("[-A-Za-z_0-9.]+", extra)
                if not extra_list:
                    # Report the offending line instead of a bare IndexError.
                    raise Exception("Unknown property value alias syntax: %s" % t)
                value_preferred_full_name = extra_list[0]
                value_aliases = extra_list[1:]
            # Special case for age: second field is numeric, third field is enum
            # treat numeric value as an alias string
            elif prop_code == 'age':
                value_enum = m.group(3)
                value_preferred_full_name = m.group(3)
                value_aliases = [m.group(2)] + re.findall("[-A-Za-z_0-9]+", extra)
            else:
                value_enum = m.group(2)
                value_preferred_full_name = m.group(3)
                value_aliases = re.findall("[-A-Za-z_0-9]+", extra)
            property_value_list[prop_code].append(value_enum)
            property_value_enum_integer[prop_code][value_enum] = enum_integer
            enum_integer += 1
            property_value_full_name_map[prop_code][value_enum] = value_preferred_full_name
            lookup = property_value_lookup_map[prop_code]
            lookup[canonicalize(value_enum)] = value_enum
            lookup[canonicalize(value_preferred_full_name)] = value_enum
            for a in value_aliases:
                lookup[canonicalize(a)] = value_enum
    return (property_value_list, property_value_enum_integer, property_value_full_name_map, property_value_lookup_map)
---|
180 | |
---|
181 | |
---|
# C++ skeleton for the value tables of one property, nested in namespace UCD.
# The four %s slots are filled, in order, with:
#   1. the name of the inner namespace,
#   2. the enumerator list of value_t,
#   3. the string list of value_name[],
#   4. the initializer list of alias_map.
PropertyValueAliases_template = r"""
namespace UCD {
  namespace %s {
    enum value_t {
%s
    };
    const std::string value_name[] = {
%s
    };
    const std::unordered_map<std::string, value_t> alias_map = {
%s
    };
  }
}
"""
---|
197 | |
---|
def generate_PropertyValueAliases_h():
    """Write PropertyValueAliases.h from the two parsed alias files.

    One shared 'Binary' namespace is emitted first.  Then, for every property
    from PropertyAliases.txt that has entries in PropertyValueAliases.txt and
    whose value list is not exactly ['N', 'Y'], a namespace named after the
    upper-cased property code is emitted.  For 'ccc' an extra enum_val[]
    array with the integer of each value is spliced in after the enum.
    """
    (property_enum_name_list, full_name_map, property_lookup_map) = parse_PropertyAlias_txt()
    (property_value_list, property_value_enum_integer, property_value_full_name_map, property_value_lookup_map) = parse_PropertyValueAlias_txt()
    f = open_header_file_for_write('PropertyValueAliases')
    # Generate the aliases for all Binary properties.
    enum_text = multiline_join(['N', 'Y'], 4, ',', '', 6)
    full_name_text = multiline_join(['"No"', '"Yes"'], 4, ',', '', 6)
    map_text = multiline_join(['{"n", N}', '{"y", Y}', '{"no", N}', '{"yes", Y}', '{"f", N}', '{"t", Y}', '{"false", N}', '{"true", Y}'], 4, ',', '', 6)
    f.write(PropertyValueAliases_template % ('Binary', enum_text, full_name_text, map_text))
    #
    for p in property_enum_name_list:
        # `in` replaces dict.has_key(), which no longer exists in Python 3.
        if p not in property_value_list:
            continue
        values = property_value_list[p]
        if values == ['N', 'Y']:
            continue  # skip boolean properties
        enum_text = multiline_join(values, 4, ',', '', 6)
        if p == 'ccc':  # Special case: add numeric value information for ccc.
            # Close the enum early and open a second array; the template's own
            # closing "};" then terminates enum_val[] instead of the enum.
            enum_text += "\n    };\n    const uint8_t enum_val[] = {\n"
            enum_text += multiline_join(["%s" % (property_value_enum_integer[p][e]) for e in values], 4, ',', '', 6)
        full_name_text = multiline_join(['"%s"' % (property_value_full_name_map[p][e]) for e in values], 4, ',', '', 6)
        lookup = property_value_lookup_map[p]
        map_text = multiline_join(['{"%s", %s}' % (k, lookup[k]) for k in sorted(lookup)], 4, ',', '', 6)
        f.write(PropertyValueAliases_template % (p.upper(), enum_text, full_name_text, map_text))
    close_header_file(f)
---|
222 | |
---|
223 | |
---|
224 | |
---|
#
#  UCD Property File Format 3: codepoint -> name maps
#
#  Patterns are raw strings so that \s reaches the regex engine unchanged.
#  UCD_skip repeats the definition given earlier in this file with the same
#  pattern; it is kept so this section does not depend on the earlier one.
UCD_skip = re.compile(r"^#.*$|^\s*$")
# Single codepoint line: group(1) hex codepoint, group(2) name (one or more
# whitespace-separated words), terminated by ';', '#' or end of line.
UCD_point_name_regexp = re.compile(r"^([0-9A-F]{4,6})\s*;\s*((?:[-A-Za-z0-9_]+\s+)*[-A-Za-z0-9_]+)\s*(?:[;#]|$)")
# Codepoint range line "lo..hi": group(1) low, group(2) high, group(3) name.
UCD_range_name_regexp = re.compile(r"^([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})\s*;\s*((?:[-A-Za-z0-9_]+\s+)*[-A-Za-z0-9_]+)\s*(?:[;#]|$)")
---|
231 | |
---|
def parse_UCD_codepoint_name_map(mapfile):
    """Parse a UCD file whose lines map a codepoint or range to a name.

    mapfile -- file name relative to the UCD_dir directory

    Returns (name_list_order, name_map): the names in order of first
    appearance, and a dict from each name to the union of all sets built for
    its lines.  Sets come from singleton_set() / make_range_set() and are
    merged with union(), all provided by the unicode_set import.

    Raises Exception for a line that is neither skippable nor matched by the
    point or range pattern.
    """
    name_map = {}
    name_list_order = []
    # The context manager closes the file even when a malformed line raises.
    with open(UCD_dir + "/" + mapfile) as f:
        for t in f:
            if UCD_skip.match(t):
                continue  # skip comment and blank lines
            m = UCD_point_name_regexp.match(t)
            if m:
                (codepoint, name) = (int(m.group(1), 16), m.group(2))
                newset = singleton_set(codepoint)
            else:
                m = UCD_range_name_regexp.match(t)
                if not m:
                    raise Exception("Unknown syntax: %s" % t)
                (cp_lo, cp_hi, name) = (int(m.group(1), 16), int(m.group(2), 16), m.group(3))
                newset = make_range_set(cp_lo, cp_hi)
            # `in` replaces dict.has_key(), which no longer exists in Python 3.
            if name in name_map:
                name_map[name] = union(name_map[name], newset)
            else:
                name_map[name] = newset
                name_list_order.append(name)
    return (name_list_order, name_map)
---|
253 | |
---|
def generate_PropList_h():
    """Write PropList.h with one entry per property found in PropList.txt.

    Entries appear in order of first appearance in the data file; each is
    whatever the set's showC() returns for the bare property name
    (presumably C source text -- showC is defined outside this file).
    """
    (ordered_names, sets_by_name) = parse_UCD_codepoint_name_map('PropList.txt')
    header = open_header_file_for_write('PropList')
    for prop_name in ordered_names:
        codepoints = sets_by_name[prop_name]
        header.write(codepoints.showC(prop_name))
    close_header_file(header)
---|
260 | |
---|
def generate_Blocks_h():
    """Write Blocks.h with one entry per block found in Blocks.txt.

    Entries appear in order of first appearance in the data file; each set
    is rendered by its showC() under the label block["<name>"]
    (showC is defined outside this file).
    """
    (ordered_names, sets_by_name) = parse_UCD_codepoint_name_map('Blocks.txt')
    header = open_header_file_for_write('Blocks')
    for block_name in ordered_names:
        label = 'block["%s"]' % block_name
        header.write(sets_by_name[block_name].showC(label))
    close_header_file(header)
---|
267 | |
---|
def generate_Scripts_h():
    """Write Scripts.h with one entry per script found in Scripts.txt.

    Entries appear in order of first appearance in the data file; each set
    is rendered by its showC() under the label script["<name>"]
    (showC is defined outside this file).
    """
    (ordered_names, sets_by_name) = parse_UCD_codepoint_name_map('Scripts.txt')
    header = open_header_file_for_write('Scripts')
    for script_name in ordered_names:
        label = 'script["%s"]' % script_name
        header.write(sets_by_name[script_name].showC(label))
    close_header_file(header)
---|
274 | |
---|
def generate_ScriptExtensions_h():
    """Write ScriptExtensions.h with one entry per individual script code.

    Each name parsed from ScriptExtensions.txt is a whitespace-separated list
    of script codes.  The set attached to a list is merged (via union()) into
    the set of every code it mentions, and the resulting per-code sets are
    written in sorted code order under the label scx["<code>"].
    """
    (scx_sets, scx_map) = parse_UCD_codepoint_name_map('ScriptExtensions.txt')
    map2 = {}
    f = open_header_file_for_write('ScriptExtensions')
    for scx_list in scx_sets:
        # split() without an argument: the parsing pattern accepts any run of
        # whitespace between codes, and split(" ") would then produce empty
        # codes for doubled spaces or tabs.
        for scx in scx_list.split():
            # `in` replaces dict.has_key(), which no longer exists in Python 3.
            if scx in map2:
                map2[scx] = union(map2[scx], scx_map[scx_list])
            else:
                map2[scx] = scx_map[scx_list]
    for k in sorted(map2):
        f.write(map2[k].showC('scx["%s"]' % k))
    close_header_file(f)
---|
288 | |
---|
def generate_DerivedGeneralCategory_h():
    """Write DerivedGeneralCategory.h from extracted/DerivedGeneralCategory.txt.

    Entries appear in order of first appearance in the data file; each set
    is rendered by its showC() under the label GC["<name>"]
    (showC is defined outside this file).
    """
    (ordered_names, sets_by_name) = parse_UCD_codepoint_name_map('extracted/DerivedGeneralCategory.txt')
    header = open_header_file_for_write('DerivedGeneralCategory')
    for category in ordered_names:
        label = 'GC["%s"]' % category
        header.write(sets_by_name[category].showC(label))
    close_header_file(header)
---|
295 | |
---|
def generate_DerivedCoreProperties_h():
    """Write DerivedCoreProperties.h from DerivedCoreProperties.txt.

    Entries appear in order of first appearance in the data file; each is
    whatever the set's showC() returns for the bare property name
    (showC is defined outside this file).
    """
    (ordered_names, sets_by_name) = parse_UCD_codepoint_name_map('DerivedCoreProperties.txt')
    header = open_header_file_for_write('DerivedCoreProperties')
    for prop_name in ordered_names:
        codepoints = sets_by_name[prop_name]
        header.write(codepoints.showC(prop_name))
    close_header_file(header)
---|
302 | |
---|
303 | |
---|