source: proto/charsetcompiler/UCD/UCD_properties.py @ 4150

Last change on this file since 4150 was 4150, checked in by cameron, 5 years ago

Property names and aliases

File size: 5.4 KB
Line 
1#
2# UCD_properties.py - parsing Unicode Character Database (UCD) files
3# and generating C headers for property data using a compact bitset
4# representation.
5#
6# Robert D. Cameron
7# September 10, 2014
8#
9# Licensed under Open Software License 3.0.
10#
11#
12import re, string
13from unicode_set import *
14
# Directory (relative to the working directory) holding the UCD data files;
# the name is the Unicode version being processed.
UCD_dir = "7.0.0"


#
#  Processing files of the UCD
#
#  General format for skippable comments, blank lines
#  (raw string so that \s reaches the regex engine, not the string parser)
UCD_skip = re.compile(r"^#.*$|^\s*$")

#
#  UCD Property File Format 1: property aliases
#  PropertyAliases.txt
#
#  group(1): first name field, group(2): second name field,
#  group(3): everything after the second field up to any '#' comment
#  (further ';'-separated aliases, if present).
UCD_property_alias_regexp = re.compile(r"^([-A-Za-z_0-9]+)\s*;\s*([-A-Za-z_0-9]+)([^#]*)")
29
def parse_PropertyAlias_txt():
    """Parse PropertyAliases.txt found in the UCD_dir directory.

    Returns a 3-tuple:
      property_enum_name_list -- first-field names, in file order
      full_name_map           -- first-field name -> second-field (full) name
      property_lookup_map     -- canonicalize(name) -> first-field name, for
                                 the first field, the full name and every
                                 additional alias on the line

    Raises Exception if a line that is neither comment nor blank does not
    match UCD_property_alias_regexp.
    """
    property_enum_name_list = []
    full_name_map = {}
    property_lookup_map = {}
    # "with" guarantees the data file is closed even if a line fails to parse.
    with open(UCD_dir + "/" + 'PropertyAliases.txt') as f:
        for t in f:
            if UCD_skip.match(t):
                continue  # skip comment and blank lines
            m = UCD_property_alias_regexp.match(t)
            if not m:
                raise Exception("Unknown property alias syntax: %s" % t)
            prop_enum = m.group(1)
            prop_preferred_full_name = m.group(2)
            prop_extra = m.group(3)
            # Remaining name-like tokens before any '#' are extra aliases.
            prop_aliases = re.findall("[-A-Za-z_0-9]+", prop_extra)
            property_enum_name_list.append(prop_enum)
            full_name_map[prop_enum] = prop_preferred_full_name
            property_lookup_map[canonicalize(prop_enum)] = prop_enum
            property_lookup_map[canonicalize(prop_preferred_full_name)] = prop_enum
            for a in prop_aliases:
                property_lookup_map[canonicalize(a)] = prop_enum
    return (property_enum_name_list, full_name_map, property_lookup_map)
50
# Characters that are ignored when comparing property names.
# Raw string so that \s is interpreted by the regex engine.
trivial_name_char_re = re.compile(r'[-_\s]')
def canonicalize(property_string):
    """Return the canonical lookup key for a property name.

    The name is lower-cased and all hyphens, underscores and whitespace are
    removed.  If the result is longer than two characters and begins with
    "is", that prefix is dropped as well (so "Is_Alphabetic" and
    "alphabetic" map to the same key, while a bare "is" is left alone).
    """
    c = trivial_name_char_re.sub('', property_string.lower())
    if len(c) > 2 and c.startswith("is"):
        return c[2:]
    return c
56
57
# Skeleton of the generated PropertyAliases.h.  The three %s slots are
# filled, in order, with the enumerator list, the property_full_name
# assignments and the alias_map insertions built by
# generate_PropertyAliases_h().
PropertyAliases_template = r"""
namespace UCD {
  enum class property_t {
    %s
  };
  std::string[] property_full_name;
%s

  std::map<std::string, property> alias_map;
%s

}
"""

# Number of enumerator names emitted per line of the generated enum.
enums_per_line = 8
def generate_PropertyAliases_h():
    """Write PropertyAliases.h from the data in PropertyAliases.txt.

    The header text is produced by filling PropertyAliases_template with
    the parsed property names, their full names and the canonicalized
    alias lookup table (emitted in sorted key order).
    """
    (property_enum_name_list, full_name_map, property_lookup_map) = parse_PropertyAlias_txt()

    # Enumerators are comma separated; a line break (plus indentation) is
    # inserted before every enums_per_line-th name.  Built with join so an
    # empty property list yields an empty string instead of an IndexError.
    enum_parts = []
    for i, name in enumerate(property_enum_name_list):
        if i > 0:
            if i % enums_per_line == 0:
                enum_parts.append(",\n    ")
            else:
                enum_parts.append(", ")
        enum_parts.append(name)
    enum_text = "".join(enum_parts)

    full_name_text = "".join(
        '  property_full_name[%s] = "%s";\n' % (e, full_name_map[e])
        for e in property_enum_name_list)

    map_text = "".join(
        '  alias_map.insert(make_pair("%s", %s));\n' % (k, property_lookup_map[k])
        for k in sorted(property_lookup_map.keys()))

    # "with" closes the output file even if the write fails.
    with open('PropertyAliases.h', 'w') as f:
        f.write(PropertyAliases_template % (enum_text, full_name_text, map_text))
89   
#
#  UCD Property File Format 2:  codepoint -> name maps
#
#  A data line is either  "XXXX ; name"  (single codepoint) or
#  "XXXX..YYYY ; name"  (codepoint range), with 4-6 upper-case hex digits
#  per codepoint.  The name may be several whitespace-separated words and
#  must be followed by ';', '#' or end of line.
#
#  UCD_skip repeats the definition given earlier in this file; it is kept
#  here so this section does not depend on the earlier one.
UCD_skip = re.compile(r"^#.*$|^\s*$")
UCD_point_name_regexp = re.compile(r"^([0-9A-F]{4,6})\s*;\s*((?:[-A-Za-z0-9_]+\s+)*[-A-Za-z0-9_]+)\s*(?:[;#]|$)")
UCD_range_name_regexp = re.compile(r"^([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})\s*;\s*((?:[-A-Za-z0-9_]+\s+)*[-A-Za-z0-9_]+)\s*(?:[;#]|$)")
96
def parse_UCD_codepoint_name_map(mapfile):
    """Parse a "codepoint(s) ; name" UCD file into per-name codepoint sets.

    mapfile is a path relative to UCD_dir.  Each data line contributes a
    single codepoint or a codepoint range to the set associated with its
    name; repeated names are merged with union().

    Returns (name_list_order, name_map): the names in order of first
    appearance, and a dict mapping each name to its accumulated set (as
    built by singleton_set / make_range_set / union, which are not defined
    in this file -- presumably supplied by unicode_set).

    Raises Exception for a data line matching neither the single-codepoint
    nor the range syntax.
    """
    name_map = {}
    name_list_order = []
    # "with" guarantees the data file is closed even on a syntax error.
    with open(UCD_dir + "/" + mapfile) as f:
        for t in f:
            if UCD_skip.match(t):
                continue  # skip comment and blank lines
            m = UCD_point_name_regexp.match(t)
            if m:
                (codepoint, name) = (int(m.group(1), 16), m.group(2))
                newset = singleton_set(codepoint)
            else:
                m = UCD_range_name_regexp.match(t)
                if not m:
                    raise Exception("Unknown syntax: %s" % t)
                (cp_lo, cp_hi, name) = (int(m.group(1), 16), int(m.group(2), 16), m.group(3))
                newset = make_range_set(cp_lo, cp_hi)
            # "in" instead of dict.has_key(), which no longer exists in Python 3.
            if name not in name_map:
                name_map[name] = newset
                name_list_order.append(name)
            else:
                name_map[name] = union(name_map[name], newset)
    return (name_list_order, name_map)
118
def generate_PropList_h():
    """Write PropList.h from PropList.txt.

    For every property name, in order of first appearance in the data
    file, the output of that property's set showC(name) is appended to
    the header.
    """
    (props, prop_map) = parse_UCD_codepoint_name_map('PropList.txt')
    # "with" closes the header even if showC() or a write raises.
    with open('PropList.h', 'w') as f:
        for k in props:
            f.write(prop_map[k].showC(k))
125
def generate_Blocks_h():
    """Write Blocks.h from Blocks.txt.

    For every block name, in order of first appearance in the data file,
    the output of that block's set showC('block["<name>"]') is appended
    to the header.
    """
    (blocks, block_map) = parse_UCD_codepoint_name_map('Blocks.txt')
    # "with" closes the header even if showC() or a write raises.
    with open('Blocks.h', 'w') as f:
        for k in blocks:
            f.write(block_map[k].showC('block["%s"]' % k))
132
def generate_Scripts_h():
    """Write Scripts.h from Scripts.txt.

    For every script name, in order of first appearance in the data file,
    the output of that script's set showC('script["<name>"]') is appended
    to the header.
    """
    (scripts, script_map) = parse_UCD_codepoint_name_map('Scripts.txt')
    # "with" closes the header even if showC() or a write raises.
    with open('Scripts.h', 'w') as f:
        for k in scripts:
            f.write(script_map[k].showC('script["%s"]' % k))
139   
def generate_ScriptExtensions_h():
    """Write ScriptExtensions.h from ScriptExtensions.txt.

    Each name parsed from the data file is a whitespace-separated list of
    script codes.  The codepoint set of every such list is merged (via
    union) into the set of each script code it mentions, and the per-code
    sets are written in sorted code order using showC('scx["<code>"]').
    """
    (scx_sets, scx_map) = parse_UCD_codepoint_name_map('ScriptExtensions.txt')
    map2 = {}
    for scx_list in scx_sets:
        # split() rather than split(" "): the parsing regexp accepts any run
        # of whitespace between codes, so a literal single-space split could
        # produce empty or unsplit codes.
        for scx in scx_list.split():
            # "in" instead of dict.has_key(), which no longer exists in Python 3.
            if scx in map2:
                map2[scx] = union(map2[scx], scx_map[scx_list])
            else:
                map2[scx] = scx_map[scx_list]
    # "with" closes the header even if showC() or a write raises.
    with open('ScriptExtensions.h', 'w') as f:
        for k in sorted(map2.keys()):
            f.write(map2[k].showC('scx["%s"]' % k))
153
def generate_DerivedGeneralCategory_h():
    """Write DerivedGeneralCategory.h from extracted/DerivedGeneralCategory.txt.

    For every category name, in order of first appearance in the data
    file, the output of that category's set showC('GC["<name>"]') is
    appended to the header.
    """
    (categories, cat_map) = parse_UCD_codepoint_name_map('extracted/DerivedGeneralCategory.txt')
    # "with" closes the header even if showC() or a write raises.
    with open('DerivedGeneralCategory.h', 'w') as f:
        for k in categories:
            f.write(cat_map[k].showC('GC["%s"]' % k))
160
def generate_DerivedCoreProperties_h():
    """Write DerivedCoreProperties.h from DerivedCoreProperties.txt.

    For every property name, in order of first appearance in the data
    file, the output of that property's set showC(name) is appended to
    the header.
    """
    (properties, prop_map) = parse_UCD_codepoint_name_map('DerivedCoreProperties.txt')
    # "with" closes the header even if showC() or a write raises.
    with open('DerivedCoreProperties.h', 'w') as f:
        for k in properties:
            f.write(prop_map[k].showC(k))
167
168
Note: See TracBrowser for help on using the repository browser.