source: proto/charsetcompiler/UCD/UCD_properties.py @ 4181

Last change on this file since 4181 was 4181, checked in by cameron, 5 years ago

vector<string> value_name; vector<unordered_map> property_value_alias_map

File size: 15.4 KB
Line 
1#
2# UCD_properties.py - parsing Unicode Character Database (UCD) files
3# and generating C headers for property data using a compact bitset
4# representation.
5#
6# Robert D. Cameron
7# September 10, 2014
8#
9# Licensed under Open Software License 3.0.
10#
11#
12import re, string, os.path
13from unicode_set import *
14
# Directory holding the UCD data files; every parser in this file opens
# UCD_dir + "/" + <file name>.
UCD_dir = "7.0.0"

#
#  Processing files of the UCD
#
#  General format for skippable comments, blank lines
#  (raw string: "\s" in a plain literal is a deprecated invalid escape)
UCD_skip = re.compile(r"^#.*$|^\s*$")

#
#  UCD Property File Format 1: property aliases
#  PropertyAliases.txt
#
#  group(1) and group(2) are the first two ';'-separated names on the line;
#  group(3) is the remainder of the line up to any '#' comment and is scanned
#  separately for further aliases.
UCD_property_alias_regexp = re.compile(r"^([-A-Za-z_0-9]+)\s*;\s*([-A-Za-z_0-9]+)([^#]*)")
28
def parse_PropertyAlias_txt():
    """Parse PropertyAliases.txt from UCD_dir.

    Returns a 3-tuple:
      property_enum_name_list -- lower-cased first field of each data line,
                                 in file order;
      full_name_map           -- enum name -> second field (preferred full
                                 name) as written in the file;
      property_lookup_map     -- canonicalize()d form of every name on the
                                 line (both fields and any further aliases)
                                 -> enum name.

    Raises Exception for a non-comment line that does not match
    UCD_property_alias_regexp.
    """
    property_enum_name_list = []
    full_name_map = {}
    property_lookup_map = {}
    # 'with' closes the data file even when a malformed line raises.
    with open(UCD_dir + "/" + 'PropertyAliases.txt') as f:
        for t in f:
            if UCD_skip.match(t):
                continue  # skip comment and blank lines
            m = UCD_property_alias_regexp.match(t)
            if not m:
                raise Exception("Unknown property alias syntax: %s" % t)
            prop_enum = m.group(1).lower()
            prop_preferred_full_name = m.group(2)
            # Everything after the second field (up to a '#') may hold
            # additional aliases.
            prop_aliases = re.findall("[-A-Za-z_0-9]+", m.group(3))
            property_enum_name_list.append(prop_enum)
            full_name_map[prop_enum] = prop_preferred_full_name
            property_lookup_map[canonicalize(prop_enum)] = prop_enum
            property_lookup_map[canonicalize(prop_preferred_full_name)] = prop_enum
            for a in prop_aliases:
                property_lookup_map[canonicalize(a)] = prop_enum
    return (property_enum_name_list, full_name_map, property_lookup_map)
49
# Characters ignored when comparing property names: hyphen, underscore and
# whitespace (raw string so that "\s" is not an invalid string escape).
trivial_name_char_re = re.compile(r'[-_\s]')

def canonicalize(property_string):
    """Return the loose-matching key for a property or value name.

    The name is lower-cased and all hyphens, underscores and whitespace are
    removed.  A leading "is" is then dropped, but only when more than two
    characters remain, so the bare name "is" is kept as is.
    """
    c = trivial_name_char_re.sub('', property_string.lower())
    if len(c) > 2 and c.startswith("is"):
        return c[2:]
    return c
55
56
# Opening text of every generated C header.  Both %s slots receive the
# include-guard macro name (see open_header_file_for_write); the matching
# "#endif" is written by close_header_file.
# The banner now names this generator as UCD_properties.py (it previously
# said "UCD_properties.y").
header_template = r"""#ifndef %s
#define %s
/*
 *  Copyright (c) 2014 International Characters, Inc.
 *  This software is licensed to the public under the Open Software License 3.0.
 *  icgrep is a trademark of International Characters, Inc.
 *
 *  This file is generated by UCD_properties.py - manual edits may be lost.
 */

"""
68
69
70
def open_header_file_for_write(filename):
    """Create <filename>.h, write the standard prologue and return the file.

    The include-guard macro is the upper-cased filename followed by "_H".
    The returned file is left open for the caller, who is expected to finish
    it with close_header_file.
    """
    out = open(filename + '.h', 'w')
    guard_macro = filename.upper() + '_H'
    prologue = header_template % (guard_macro, guard_macro)
    out.write(prologue)
    return out
76
def close_header_file(f):
    """Write the closing "#endif" of the include guard and close the file."""
    guard_end = "\n#endif\n"
    f.write(guard_end)
    f.close()
80
def write_imports(f, import_list):
    """Write one "#include" line to f for each entry of import_list.

    Entries are written verbatim, so each must already carry its own
    <...> or "..." delimiters.
    """
    for header in import_list:
        include_line = "#include %s\n" % header
        f.write(include_line)
83
# C++ body of the generated PropertyAliases.h.  The three %s slots are
# filled by generate_PropertyAliases_h, in order, with: the property_t
# enumerators, the quoted full names, and the {"alias", enumerator} pairs
# of alias_map.
PropertyAliases_template = r"""
namespace UCD {
  enum property_t {
%s
  };
  const std::string property_full_name[] = {
%s
  };
  const std::unordered_map<std::string, property_t> alias_map = {
%s
  };
}
"""
97
def multiline_join(item_list, items_per_line, separator = ",", closer='', indent = 4):
    """Lay out item_list as indented, separator-joined lines of text.

    Every output line starts with `indent` spaces and holds at most
    `items_per_line` items joined by `separator` plus a space.  Every line
    except the last ends with `separator` and a newline; the last line ends
    with `closer` and has no trailing newline.  An empty item_list yields
    just the indentation followed by `closer`.

    Raises ValueError if items_per_line is less than 1 (such a value made
    the previous loop-based implementation spin forever).
    """
    if items_per_line < 1:
        raise ValueError("items_per_line must be at least 1, got %r" % (items_per_line,))
    pad = " " * indent
    sep_with_space = separator + " "
    # Slice into rows; keep one (empty) row for empty input so that the
    # indentation and closer are still produced.
    rows = [item_list[i:i + items_per_line]
            for i in range(0, len(item_list), items_per_line)] or [[]]
    text_rows = [pad + sep_with_space.join(row) for row in rows]
    return (separator + "\n").join(text_rows) + closer
107
def generate_PropertyAliases_h(property_enum_name_list, full_name_map, property_lookup_map):
    """Write PropertyAliases.h from the parsed property alias tables.

    The header holds the property_t enum (enum names in list order), the
    parallel array of full names, and the alias map whose entries are
    emitted in sorted key order.
    """
    header = open_header_file_for_write('PropertyAliases')
    write_imports(header, ["<string>", "<unordered_map>"])
    enumerators = multiline_join(property_enum_name_list, 4, ',')
    quoted_full_names = []
    for enum_name in property_enum_name_list:
        quoted_full_names.append('"%s"' % full_name_map[enum_name])
    alias_entries = []
    for alias in sorted(property_lookup_map):
        alias_entries.append('{"%s", %s}' % (alias, property_lookup_map[alias]))
    body = PropertyAliases_template % (
        enumerators,
        multiline_join(quoted_full_names, 2, ','),
        multiline_join(alias_entries, 2, ','),
    )
    header.write(body)
    close_header_file(header)
116
117#
118#  UCD Property File Format 2: property value aliases
119#  PropertyValueAliases.txt
120#
121#  This file records value aliases for property values for
122#  each enumerated property, with the following additional notes:
123#  (1) The corresponding integer value of the enum constant is
124#      also specified for ccc (second field).
125#  (2) The Age property is a numeric type which has decimal float
126#      values as the enum constants: these won't be legal in enum syntax.
127#  (3) Binary properties also have enumerated values and aliases listed,
128#      although this is redundant, because all binary properties have the
129#      same value space.
130#
131
# Three ';'-separated fields (letters, digits, '-', '_' and '.'), then
# group(4): the rest of the line up to any '#' comment.  Raw string so that
# "\s" is not a (deprecated) invalid string escape.
UCD_property_value_alias_regexp = re.compile(r"^([-A-Za-z_0-9.]+)\s*;\s*([-A-Za-z_0-9.]+)\s*;\s*([-A-Za-z_0-9.]+)([^#]*)")
133
def parse_PropertyValueAlias_txt(property_lookup_map):
    """Parse PropertyValueAliases.txt from UCD_dir.

    property_lookup_map maps canonicalize()d property names to property
    enum names (as built by parse_PropertyAlias_txt).

    Returns a 4-tuple of dicts, each keyed by property enum name:
      property_value_list          -- value enum names in file order;
      property_value_enum_integer  -- value enum name -> integer (the second
                                      field for ccc, otherwise the value's
                                      position in property_value_list);
      property_value_full_name_map -- value enum name -> preferred full name;
      property_value_lookup_map    -- canonicalize()d alias -> value enum name.

    Raises Exception for an unparsable line or an unknown property name.
    """
    property_value_list = {}
    property_value_enum_integer = {}
    property_value_full_name_map = {}
    property_value_lookup_map = {}
    # 'with' closes the data file even when a malformed line raises.
    with open(UCD_dir + "/" + 'PropertyValueAliases.txt') as f:
        for t in f:
            if UCD_skip.match(t):
                continue  # skip comment and blank lines
            m = UCD_property_value_alias_regexp.match(t)
            if not m:
                raise Exception("Unknown property value alias syntax: %s" % t)
            prop_code = canonicalize(m.group(1))
            if prop_code not in property_lookup_map:
                raise Exception("Property code: '%s' is unknown" % prop_code)
            prop_code = property_lookup_map[prop_code]
            if prop_code not in property_value_list:
                property_value_list[prop_code] = []
                property_value_enum_integer[prop_code] = {}
                property_value_full_name_map[prop_code] = {}
                property_value_lookup_map[prop_code] = {}
            # Default enum integer: the position this value will occupy in
            # the property's list.  Deriving it from the list (rather than a
            # counter shared by all properties) keeps it correct even if a
            # property's lines are not contiguous in the file.
            enum_integer = len(property_value_list[prop_code])
            extra = m.group(4)
            if prop_code == 'ccc':
                # Special case for ccc: second field is enum integer value
                enum_integer = int(m.group(2))
                value_enum = m.group(3)
                extra_list = re.findall("[-A-Za-z_0-9.]+", extra)
                value_preferred_full_name = extra_list[0]
                value_aliases = extra_list[1:]
            elif prop_code == 'age':
                # Special case for age: second field is numeric, third field
                # is enum; treat numeric value as an alias string
                value_enum = m.group(3)
                value_preferred_full_name = m.group(3)
                value_aliases = [m.group(2)] + re.findall("[-A-Za-z_0-9]+", extra)
            else:
                value_enum = m.group(2)
                value_preferred_full_name = m.group(3)
                value_aliases = re.findall("[-A-Za-z_0-9]+", extra)
            property_value_list[prop_code].append(value_enum)
            property_value_enum_integer[prop_code][value_enum] = enum_integer
            property_value_full_name_map[prop_code][value_enum] = value_preferred_full_name
            lookup = property_value_lookup_map[prop_code]
            lookup[canonicalize(value_enum)] = value_enum
            lookup[canonicalize(value_preferred_full_name)] = value_enum
            for a in value_aliases:
                lookup[canonicalize(a)] = value_enum
    return (property_value_list, property_value_enum_integer, property_value_full_name_map, property_value_lookup_map)
182
183
# C++ body of the generated PropertyValueAliases.h.  The three %s slots are
# filled by generate_PropertyValueAliases_h, in order, with: the per-property
# namespaces/enums, the comma-joined value-name initializers, and the
# comma-joined alias-map initializers.
PropertyValueAliases_template = r"""
namespace UCD {
  enum binary_value_t {N, Y};
%s

  const std::vector<std::string> value_name[] = {
%s};

  const std::unordered_map<std::string, int> property_value_alias_map[] = {
%s};

}
"""



# C++ body of the generated PropertyValueSets.h.  The single %s slot is
# filled by generate_PropertyValueSets_h with one vector<UnicodeSet>(n)
# initializer per property.
PropertyValues_template = r"""
using std::vector;

namespace UCD {
  vector<UnicodeSet> value_sets[] = {
%s
  };
}
"""
209
210
211
def generate_PropertyValueAliases_h(property_enum_name_list, property_value_list, property_value_enum_integer, property_value_full_name_map, property_value_lookup_map):
    """Write PropertyValueAliases.h from the parsed property value tables.

    For every property in property_enum_name_list, in order, one entry is
    added to both value_name[] and property_value_alias_map[]:
      * no value list recorded    -> empty initializer "{}";
      * value list is ['N', 'Y']  -> the shared binary names and aliases;
      * anything else             -> a namespace named after the upper-cased
        property code holding "enum value_t", plus that property's full
        names and sorted alias map.  For ccc an enum_val[] array of the
        numeric values is emitted as well.
    """
    f = open_header_file_for_write('PropertyValueAliases')
    # <vector> is listed explicitly because the template declares
    # std::vector<std::string> value_name[].
    write_imports(f, ["<string>", "<vector>", "<unordered_map>", '"unicode_set.h"', '"PropertyAliases.h"'])
    #  Generate the aliases for all Binary properties.
    binary_map_text = multiline_join(['{"n", N}', '{"y", Y}', '{"no", N}', '{"yes", Y}', '{"f", N}', '{"t", Y}', '{"false", N}', '{"true", Y}'], 4, ',', '', 6)
    #
    enum_parts = []
    name_vectors = []
    alias_maps = []
    for p in property_enum_name_list:
        if p not in property_value_list:
            name_vectors.append("    {}")
            alias_maps.append("    {}")
            continue
        values = property_value_list[p]
        if values == ['N', 'Y']:
            name_vectors.append('    {"No", "Yes"}')
            alias_maps.append("    {%s}" % binary_map_text)
            continue
        enum_parts.append("  namespace %s {\n    enum value_t {\n" % p.upper())
        enum_parts.append(multiline_join(values, 4, ',', '', 6))
        if p == 'ccc': # Special case: add numeric value information for ccc.
            enum_parts.append("\n    };\n    const uint8_t enum_val[] = {\n")
            enum_parts.append(multiline_join(["%s" % (property_value_enum_integer[p][e]) for e in values], 4, ',', '', 6))
        enum_parts.append("};\n  }\n")
        full_name_text = multiline_join(['"%s"' % (property_value_full_name_map[p][e]) for e in values], 4, ',', '', 6)
        name_vectors.append("    {%s}" % full_name_text)
        lookup = property_value_lookup_map[p]
        map_text = multiline_join(['{"%s", %s::%s}' % (k, p.upper(), lookup[k]) for k in sorted(lookup)], 4, ',', '', 6)
        alias_maps.append("    {%s}" % map_text)
    f.write(PropertyValueAliases_template % ("".join(enum_parts), ",\n".join(name_vectors), ",\n".join(alias_maps)))
    close_header_file(f)
246
247
248
249
250
def generate_PropertyValueSets_h(property_enum_name_list, property_value_list, property_value_enum_integer, property_value_full_name_map, property_value_lookup_map):
    """Write PropertyValueSets.h: one vector<UnicodeSet>(n) per property.

    n is chosen per property, in property_enum_name_list order:
      * scx                       -> number of sc values (see below);
      * no value list recorded    -> 0;
      * value list is ['N', 'Y']  -> 1;
      * anything else             -> number of values of that property.

    Only the first two parameters are used; the remaining ones are accepted
    so that the signature parallels generate_PropertyValueAliases_h.
    """
    f = open_header_file_for_write('PropertyValueSets')
    write_imports(f, ["<vector>", '"unicode_set.h"'])
    vec_decl_list = []
    for p in property_enum_name_list:
        if p == 'scx' and 'sc' in property_value_list:
            # scx sets are indexed by SC:: enumerators (see
            # generate_ScriptExtensions_h), so the vector must be sized like
            # sc.  This test has to come before the "no value list" test:
            # placed after it, it was skipped whenever scx had no value list
            # of its own, leaving scx with a zero-length vector.
            count = len(property_value_list['sc'])
        elif p not in property_value_list:
            count = 0
        elif property_value_list[p] == ['N', 'Y']:
            count = 1
        else:
            count = len(property_value_list[p])
        vec_decl_list.append("vector<UnicodeSet>(%i)" % count)
    f.write(PropertyValues_template % (multiline_join(vec_decl_list, 4, ',', '', 6)))
    close_header_file(f)
266
267
268#
269#  UCD Property File Format 3:  codepoint -> name maps
270#
# Comment and blank lines (same pattern as the UCD_skip defined near the top
# of the file; kept here so this section stays self-contained).
UCD_skip = re.compile(r"^#.*$|^\s*$")
# "XXXX ; name" -- a single code point (4-6 upper-case hex digits) followed
# by one or more whitespace-separated words; group(2) is the whole name.
UCD_point_name_regexp = re.compile(r"^([0-9A-F]{4,6})\s*;\s*((?:[-A-Za-z0-9_]+\s+)*[-A-Za-z0-9_]+)\s*(?:[;#]|$)")
# "XXXX..YYYY ; name" -- an inclusive code point range; group(3) is the name.
UCD_range_name_regexp = re.compile(r"^([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})\s*;\s*((?:[-A-Za-z0-9_]+\s+)*[-A-Za-z0-9_]+)\s*(?:[;#]|$)")
274
def parse_UCD_codepoint_name_map(mapfile, canonical_name_lookup_map = None):
    """Parse a UCD file of "codepoint ; name" / "lo..hi ; name" lines.

    mapfile is opened relative to UCD_dir.  When canonical_name_lookup_map
    is given, each name is canonicalize()d and replaced by the map's entry
    for it; otherwise names are used exactly as written in the file.

    Returns (name_list_order, value_map): the distinct names in order of
    first appearance, and a dict from name to the union of all sets built
    for that name.

    Raises Exception for an unparsable line, or for a name missing from
    canonical_name_lookup_map.
    """
    value_map = {}
    name_list_order = []
    # 'with' closes the data file even when a malformed line raises.
    with open(UCD_dir + "/" + mapfile) as f:
        for t in f:
            if UCD_skip.match(t):
                continue  # skip comment and blank lines
            m = UCD_point_name_regexp.match(t)
            if m:
                (codepoint, name) = (int(m.group(1), 16), m.group(2))
                newset = singleton_uset(codepoint)
            else:
                m = UCD_range_name_regexp.match(t)
                if not m:
                    raise Exception("Unknown syntax: %s" % t)
                (cp_lo, cp_hi, name) = (int(m.group(1), 16), int(m.group(2), 16), m.group(3))
                newset = range_uset(cp_lo, cp_hi)
            if canonical_name_lookup_map is not None:
                cname = canonicalize(name)
                if cname not in canonical_name_lookup_map:
                    raise Exception("Unknown property or property value name '%s'" % cname)
                name = canonical_name_lookup_map[cname]
            if name not in value_map:
                value_map[name] = newset
                name_list_order.append(name)
            else:
                value_map[name] = uset_union(value_map[name], newset)
    return (name_list_order, value_map)
300
def generate_property_value_file(filename_root, property_code, canonical_property_value_map):
    """Generate <basename of filename_root>.h for one enumerated property.

    filename_root + '.txt' is parsed with canonical_property_value_map, so
    the names written out are the map's values.  One showC() definition is
    written per value, in order of first appearance in the data file,
    targeting value_sets[property_code][PROPERTY_CODE::value].  The summed
    bytes() of all sets is printed as a size report.
    """
    (prop_values, value_map) = parse_UCD_codepoint_name_map(filename_root + '.txt', canonical_property_value_map)
    f = open_header_file_for_write(os.path.basename(filename_root))
    write_imports(f, ["<vector>", '"unicode_set.h"', '"PropertyAliases.h"', '"PropertyValueAliases.h"', '"PropertyValueSets.h"'])
    f.write("\nusing namespace UCD;\n\n")
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print("%s bytes" % sum([value_map[v].bytes() for v in value_map]))
    for v in prop_values:
        f.write(value_map[v].showC('value_sets[%s][%s::%s]' % (property_code, property_code.upper(), v)))
    close_header_file(f)
310   
def generate_binary_properties_file(filename_root, canonical_property_name_map):
    """Generate <basename of filename_root>.h for a file of binary properties.

    filename_root + '.txt' is parsed with canonical_property_name_map, so
    each name in the data file is resolved to the map's value for it.  One
    showC() definition is written per property, in sorted order, targeting
    value_sets[property][0].  The summed bytes() of all sets is printed as
    a size report.
    """
    (props, prop_map) = parse_UCD_codepoint_name_map(filename_root + '.txt', canonical_property_name_map)
    f = open_header_file_for_write(os.path.basename(filename_root))
    write_imports(f, ["<vector>", '"unicode_set.h"', '"PropertyAliases.h"', '"PropertyValueSets.h"'])
    f.write("\nusing namespace UCD;\n\n")
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print("%s bytes" % sum([prop_map[p].bytes() for p in prop_map]))
    for p in sorted(props):
        f.write(prop_map[p].showC('value_sets[%s][0]' % (p)))
    close_header_file(f)
320     
def generate_ScriptExtensions_h():
    """Generate ScriptExtensions.h from ScriptExtensions.txt.

    Each name parsed from the data file is a whitespace-separated list of
    script codes.  The set attached to a list is unioned into the entry of
    every script code it mentions, and one showC() definition per script
    code is written, in sorted order, targeting value_sets[scx][SC::code].
    The summed bytes() of all per-script sets is printed as a size report.
    """
    (scx_sets, scx_map) = parse_UCD_codepoint_name_map('ScriptExtensions.txt')
    map2 = {}
    f = open_header_file_for_write('ScriptExtensions')
    write_imports(f, ["<vector>", '"PropertyAliases.h"', '"PropertyValueAliases.h"', '"unicode_set.h"'])
    f.write("\nusing namespace UCD;\n\n")
    for scx_list in scx_sets:
        # split() with no argument tolerates runs of whitespace, which the
        # name regexp allows between the codes.
        for scx in scx_list.split():
            if scx in map2:
                map2[scx] = uset_union(map2[scx], scx_map[scx_list])
            else:
                map2[scx] = scx_map[scx_list]
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print("%s bytes" % sum([map2[k].bytes() for k in map2]))
    for k in sorted(map2):
        # The SC enumerators are emitted by generate_PropertyValueAliases_h
        # with the case used in PropertyValueAliases.txt, so the script code
        # is written unchanged; lower-casing it (as was done before) yields
        # a name that is not a member of that enum.
        # NOTE(review): assumes ScriptExtensions.txt spells script codes
        # exactly like the sc short names -- confirm against the data files.
        f.write(map2[k].showC('value_sets[scx][SC::%s]' % k))
    close_header_file(f)
337
338
339
def UCD_main():
    """Parse the UCD alias files and generate every property header.

    The alias headers are produced first, then one header per data file in
    a fixed order (Blocks, Scripts, ScriptExtensions, DerivedGeneralCategory,
    PropList, DerivedCoreProperties, LineBreak, EastAsianWidth,
    HangulSyllableType).
    """
    # Property names and their aliases.
    (prop_enums, prop_full_names, prop_lookup) = parse_PropertyAlias_txt()
    generate_PropertyAliases_h(prop_enums, prop_full_names, prop_lookup)

    # Property value names and their aliases.
    (value_lists, value_integers, value_full_names, value_lookup) = parse_PropertyValueAlias_txt(prop_lookup)
    generate_PropertyValueAliases_h(prop_enums, value_lists, value_integers, value_full_names, value_lookup)
    generate_PropertyValueSets_h(prop_enums, value_lists, value_integers, value_full_names, value_lookup)

    # Blocks, then Scripts.
    for (root, code) in (('Blocks', 'blk'), ('Scripts', 'sc')):
        generate_property_value_file(root, code, value_lookup[code])

    # Script Extensions.
    generate_ScriptExtensions_h()

    # General Category.
    generate_property_value_file('extracted/DerivedGeneralCategory', 'gc', value_lookup['gc'])

    # Binary properties from PropList.txt and DerivedCoreProperties.txt.
    for root in ('PropList', 'DerivedCoreProperties'):
        generate_binary_properties_file(root, prop_lookup)

    # LineBreak types, East Asian Width, Hangul Syllable Type.
    for (root, code) in (('LineBreak', 'lb'), ('EastAsianWidth', 'ea'), ('HangulSyllableType', 'hst')):
        generate_property_value_file(root, code, value_lookup[code])

    #
    # Jamo Short Name - AAARGH - property value for 110B is an empty string!!!!!  - Not in PropertyValueAliases.txt
    # generate_property_value_file('Jamo', 'jsn', property_value_lookup_map['jsn'])
381
382
# Generate all headers only when this file is run as a script, not when it
# is imported.
if __name__ == "__main__":
  UCD_main()
Note: See TracBrowser for help on using the repository browser.