source: icGREP/icgrep-devel/UCD-scripts/UCD_parser.py @ 5658

Last change on this file since 5658 was 5658, checked in by cameron, 2 years ago

UCD generator restructuring and improvements

File size: 12.7 KB
Line 
1#
2# UCD_parser.py - parsing Unicode Character Database (UCD) files
3#
4# Robert D. Cameron
5# December 28, 2014
6#
7# Licensed under Open Software License 3.0.
8#
9#
10import re, string, os.path
11import UCD_config
12from unicode_set import *
13from UCD_property_objects import *
14
# Raw string so that `\s` is a regex escape, not an (invalid) string escape.
version_regexp = re.compile(r".*Version\s+([0-9.]*)\s+of the Unicode Standard.*")

def setVersionfromReadMe_txt():
    """Record the Unicode version announced in the UCD ReadMe.txt.

    Reads ReadMe.txt from UCD_config.UCD_src_dir.  For every line that
    matches version_regexp, the captured version string is stored in
    UCD_config.version and printed.  If several lines match, the last
    one wins.
    """
    # The context manager guarantees the file handle is closed.
    with open(UCD_config.UCD_src_dir + "/" + 'ReadMe.txt') as f:
        lines = f.readlines()
    for t in lines:
        m = version_regexp.match(t)
        if m:
            UCD_config.version = m.group(1)
            print("Version %s" % m.group(1))
25
# Characters that are ignored when comparing property names.
# Raw string so that `\s` is a regex escape, not an (invalid) string escape.
trivial_name_char_re = re.compile(r'[-_\s]')
def canonicalize(property_string):
    """Return property_string lowercased with hyphens, underscores and
    whitespace removed, for use as a loose-matching lookup key."""
    return trivial_name_char_re.sub('', property_string.lower())
29
30#
31#  Processing files of the UCD
32#
33#  General format for skippable comments, blank lines
# Matches lines starting with '#' and lines that are empty or all whitespace.
# Raw string so that `\s` is a regex escape, not an (invalid) string escape.
UCD_skip = re.compile(r"^#.*$|^\s*$")
35
36#
37#  UCD Property File Format 1: property aliases
38#  PropertyAliases.txt
39#
# Section header comment, e.g. "# Binary Properties"; group 1 is the kind.
UCD_property_section_regexp = re.compile(r"^#\s*([-A-Za-z_0-9]+)\s*Properties\s*$")
# Alias line: "code ; preferred_full_name [; further aliases...]".
UCD_property_alias_regexp = re.compile(r"^([-A-Za-z_0-9]+)\s*;\s*([-A-Za-z_0-9]+)([^#]*)")

def parse_PropertyAlias_txt():
    """Parse PropertyAliases.txt and create one property object per property.

    The kind of each property is taken from the most recent
    "# <Kind> Properties" section header comment:
      Binary                -> BinaryPropertyObject
      Enumerated, Catalog   -> EnumeratedPropertyObject
      String                -> StringPropertyObject
      Numeric               -> NumericPropertyObject
      anything else         -> ExtensionPropertyObject for "scx",
                               StringPropertyObject otherwise

    Returns:
        (property_enum_name_list, property_object_map): the property codes
        in file order, and a dict mapping each code to its property object
        (with ID and aliases already set).

    Raises:
        Exception: on a non-comment line that is not a valid alias line, or
        on an alias line that appears before any section header.
    """
    property_object_map = {}
    property_enum_name_list = []
    # No section header seen yet; alias lines are not valid until one is.
    property_kind = None
    # The context manager guarantees the file handle is closed.
    with open(UCD_config.UCD_src_dir + "/" + 'PropertyAliases.txt') as f:
        lines = f.readlines()
    for t in lines:
        # Section headers are comments, so check for them before skipping.
        m = UCD_property_section_regexp.match(t)
        if m:
            property_kind = m.group(1)
        if UCD_skip.match(t):
            continue  # skip comment and blank lines
        m = UCD_property_alias_regexp.match(t)
        if not m:
            raise Exception("Unknown property alias syntax: %s" % t)
        if property_kind is None:
            # Previously this surfaced as an UnboundLocalError.
            raise Exception("Property alias before any section header: %s" % t)
        (property_code, prop_preferred_full_name, prop_extra) = m.groups()
        property_enum_name_list.append(property_code)
        if property_kind == "Binary":
            prop = BinaryPropertyObject()
        elif property_kind in ("Enumerated", "Catalog"):   # Catalog: Age, Block, Script
            prop = EnumeratedPropertyObject()
        elif property_kind == "String":
            prop = StringPropertyObject()
        elif property_kind == "Numeric":
            prop = NumericPropertyObject()
        elif property_code == "scx":   # Miscellaneous: Script_Extensions
            prop = ExtensionPropertyObject()
        else:
            # All other Miscellaneous properties have string values
            prop = StringPropertyObject()
        prop.setID(property_code, prop_preferred_full_name)
        prop.setAliases(re.findall("[-A-Za-z_0-9]+", prop_extra))
        property_object_map[property_code] = prop
    return (property_enum_name_list, property_object_map)
77
78#
79#  UCD Property File Format 2: property value aliases
80#  PropertyValueAliases.txt
81#
82#  This file records value aliases for property values for
83#  each enumerated property, with the following additional notes:
84#  (1) The corresponding integer value of the enum constant is
85#      also specified for ccc (second field).
86#  (2) The Age property is a numeric type which has decimal float
87#      values as the enum constants: these won't be legal in enum syntax.
88#  (3) Binary properties also have enumerated values and aliases listed,
89#      although this is redundant, because all binary properties have the
90#      same value space.
91#  (4) @missing lines provide default value information, primarily for some
92#      non-enumerated types
93
def initializePropertyValues(property_object_map, property_lookup_map):
    """Load value aliases and default values from PropertyValueAliases.txt.

    Args:
        property_object_map: dict from property code to property object.
        property_lookup_map: dict from canonicalized property name
            (see canonicalize) to property code.

    For each "# @missing:" comment line, the named property's default value
    is set via setDefaultValue.  For each value-alias line, the value is
    registered with addPropertyValue, but only when the property object is
    an EnumeratedPropertyObject; lines for other properties are parsed and
    then ignored.

    Raises:
        Exception: on a malformed line, an @missing range other than
        0000..10FFFF, or a property name/code that is not known.
    """
    UCD_property_value_missing_regexp = re.compile(r"^#\s*@missing:\s*([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})\s*;\s*([-A-Za-z_0-9.]+)\s*;\s*([-A-Za-z_0-9.<> ]+)\s*([^#]*)")
    UCD_property_value_alias_regexp = re.compile(r"^([-A-Za-z_0-9.]+)\s*;\s*([-A-Za-z_0-9.]+)\s*;\s*([-A-Za-z_0-9.]+)([^#]*)")
    # The context manager guarantees the file handle is closed.
    with open(UCD_config.UCD_src_dir + "/" + 'PropertyValueAliases.txt') as f:
        lines = f.readlines()
    for t in lines:
        if UCD_skip.match(t):
            # @missing specifications are comments, so look for them here.
            m = UCD_property_value_missing_regexp.match(t)
            if m:
                # The offending line is `t`; the original referenced an
                # undefined name here and raised NameError instead.
                if m.group(1) != '0000' or m.group(2) != '10FFFF':
                    raise Exception("Bad missing spec: " + t)
                cname = canonicalize(m.group(3))
                if cname not in property_lookup_map:
                    raise Exception("Bad missing property: " + t)
                property_object_map[property_lookup_map[cname]].setDefaultValue(m.group(4))
            continue  # skip comment and blank lines
        m = UCD_property_value_alias_regexp.match(t)
        if not m:
            raise Exception("Unknown property value alias syntax: %s" % t)
        prop_code = canonicalize(m.group(1))
        if prop_code not in property_lookup_map:
            raise Exception("Property code: '%s' is unknown" % prop_code)
        prop_code = property_lookup_map[prop_code]
        if prop_code not in property_object_map:
            raise Exception("Property object: '%s' is uninitialized" % prop_code)
        po = property_object_map[prop_code]
        extra = m.group(4)
        if prop_code == 'ccc':
            # Special case for ccc: second field is the enum integer value,
            # third field is the enum name, and the preferred full name is
            # the first of the remaining fields.
            value_enum = m.group(3)
            extra_list = re.findall("[-A-Za-z_0-9.]+", extra)
            value_preferred_full_name = extra_list[0]
            # Treat integer string as an alias
            value_aliases = [m.group(2)] + extra_list[1:]
        elif prop_code == 'age':
            # Special case for age: second field is numeric, third field is
            # the enum; treat the numeric value as an alias string.
            value_enum = m.group(3)
            value_preferred_full_name = m.group(3)
            value_aliases = [m.group(2)] + re.findall("[-A-Za-z_0-9]+", extra)
        else:
            value_enum = m.group(2)
            value_preferred_full_name = m.group(3)
            value_aliases = re.findall("[-A-Za-z_0-9]+", extra)
        # Only enumerated properties record their value space.
        if isinstance(po, EnumeratedPropertyObject):
            po.addPropertyValue(value_enum, value_preferred_full_name, value_aliases)
138
139
140#
141#  UCD Property File Format 3:  codepoint/range -> data record maps
142#  Many files have data records consisting of a codepoint or codepoint range
143#  followed by fields separated by semicolons.
144#
145
# A single codepoint followed by its fields, up to a '#' comment or end of line.
UCD_point_regexp = re.compile("^([0-9A-F]{4,6})([^0-9A-F.#][^#]*)(?:#|$)")
# A "lo..hi" codepoint range followed by its fields.
UCD_range_regexp = re.compile("^([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})([^#]*)(?:#|$)")

def parse_data_record(data_line):
    """Split one UCD data line into (cp_lo, cp_hi, fields).

    The line starts with either a single hexadecimal codepoint or a
    "lo..hi" range; for a single codepoint cp_lo == cp_hi.  The remainder,
    up to any '#' comment, must be empty or a ';'-introduced list of
    fields.  Each field is returned with surrounding whitespace removed.

    Raises:
        Exception: if the line matches neither form, or if the field data
        is non-empty and does not begin with ';'.
    """
    point = UCD_point_regexp.match(data_line)
    if point is not None:
        lo_hex, raw_fields = point.groups()
        hi_hex = lo_hex
    else:
        span = UCD_range_regexp.match(data_line)
        if span is None:
            raise Exception("UCD data record parsing error: " + data_line)
        lo_hex, hi_hex, raw_fields = span.groups()
    raw_fields = raw_fields.strip()
    if raw_fields == '':
        pieces = []
    elif raw_fields.startswith(';'):
        # Drop the leading separator before splitting into fields.
        pieces = raw_fields[1:].split(';')
    else:
        raise Exception("Field data syntax: " + raw_fields)
    return (int(lo_hex, 16), int(hi_hex, 16), [piece.strip() for piece in pieces])
170
# "# @missing: lo..hi; field[; field...]" comment line giving default values.
# Raw string so that `\s` is a regex escape, not an (invalid) string escape.
UCD_missing_regexp = re.compile(r"^#\s*@missing:\s*([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})\s*;\s*([^#]*)(?:#|$)")

def parse_missing_spec(data_line):
    """Parse a "# @missing:" line into (cp_lo, cp_hi, fields).

    cp_lo and cp_hi are the integer bounds of the codepoint range; fields
    is the list of ';'-separated fields after the range, each with
    surrounding whitespace removed.

    Raises:
        Exception: if data_line is not an @missing specification.
    """
    m = UCD_missing_regexp.match(data_line)
    if not m:
        raise Exception("UCD missing spec parsing error: " + data_line)
    cp_lo = int(m.group(1), 16)
    cp_hi = int(m.group(2), 16)
    fields = [f.strip() for f in m.group(3).split(';')]
    return (cp_lo, cp_hi, fields)
182
def parse_property_and_value(fields, property_lookup_map):
    """Interpret record fields as a property name and an optional value.

    Args:
        fields: list of one or two strings: the property name and,
            optionally, a value.
        property_lookup_map: dict from canonicalized property name
            (see canonicalize) to property code.

    Returns:
        (property_code, value), where value is None if only the property
        name was given.

    Raises:
        Exception: if there are no fields, more than two fields, or the
        property name is not in property_lookup_map.
    """
    if len(fields) > 2:
        raise Exception("Too many fields")
    if len(fields) == 0:
        raise Exception("Expecting at least 1 field")
    canon = canonicalize(fields[0])
    if canon not in property_lookup_map:
        # Report the name as written in the file; the original referenced
        # an undefined variable here and raised NameError instead.
        raise Exception("Unexpected name: " + fields[0])
    pcode = property_lookup_map[canon]
    if len(fields) == 1:
        return (pcode, None)
    return (pcode, fields[1])
191
def parse_multisection_property_data(pfile, property_object_map, property_lookup_map):
    """Parse a UCD file whose records name the property they belong to.

    Args:
        pfile: file name relative to UCD_config.UCD_src_dir.
        property_object_map: dict from property code to property object.
        property_lookup_map: dict from canonicalized property name to
            property code.

    "# @missing:" lines set the default value of the property they name.
    Every other non-comment line is a data record whose first field names
    the property; a record with no second field is added as (lo, hi), and
    otherwise as (lo, hi, value).

    Returns:
        The list of property codes encountered, in order of first
        appearance.  finalizeProperty() has been called on each.
    """
    # The context manager guarantees the file handle is closed.
    with open(UCD_config.UCD_src_dir + "/" + pfile) as f:
        lines = f.readlines()
    props = []
    for t in lines:
        # @missing lines are comments, so they must be tested before UCD_skip.
        if UCD_missing_regexp.match(t):
            (cp_lo, cp_hi, fields) = parse_missing_spec(t)
            (prop_code, dflt) = parse_property_and_value(fields, property_lookup_map)
            property_object_map[prop_code].setDefaultValue(dflt)
            if prop_code not in props:
                props.append(prop_code)
        elif UCD_skip.match(t):
            continue
        else:
            (cp_lo, cp_hi, fields) = parse_data_record(t)
            (prop_code, v) = parse_property_and_value(fields, property_lookup_map)
            if prop_code not in props:
                props.append(prop_code)
            if v is None:  # binary property
                property_object_map[prop_code].addDataRecord(cp_lo, cp_hi)
            else:
                property_object_map[prop_code].addDataRecord(cp_lo, cp_hi, v)
    for p in props:
        property_object_map[p].finalizeProperty()
    return props
215
def parse_property_data(property_object, pfile):
    """Parse a UCD file holding the data of a single property.

    Args:
        property_object: the property object to populate.
        pfile: file name relative to UCD_config.UCD_src_dir.

    "# @missing:" lines must carry exactly one field, which becomes the
    default value.  For a BinaryPropertyObject, data records without
    fields are added as (lo, hi); all other records are added as
    (lo, hi, first_field).  finalizeProperty() is called at the end.

    Raises:
        Exception: if an @missing line does not have exactly one field.
    """
    # The context manager guarantees the file handle is closed.
    with open(UCD_config.UCD_src_dir + "/" + pfile) as f:
        lines = f.readlines()
    for t in lines:
        # @missing lines are comments, so they must be tested before UCD_skip.
        if UCD_missing_regexp.match(t):
            (cp_lo, cp_hi, fields) = parse_missing_spec(t)
            if len(fields) != 1:
                raise Exception("Expecting exactly 1 field")
            property_object.setDefaultValue(fields[0])
        elif UCD_skip.match(t):
            continue
        else:
            (cp_lo, cp_hi, fields) = parse_data_record(t)
            if isinstance(property_object, BinaryPropertyObject) and len(fields) == 0:
                property_object.addDataRecord(cp_lo, cp_hi)
            else:
                property_object.addDataRecord(cp_lo, cp_hi, fields[0])
    property_object.finalizeProperty()
233
def parse_ScriptExtensions_txt(script_property_object):
    """Populate script_property_object from the ScriptExtensions data file
    by delegating to parse_property_data."""
    data_file = 'ScriptExtensions' + '.txt'
    parse_property_data(script_property_object, data_file)
237
# One UnicodeData.txt record: a codepoint followed by 14 ';'-separated fields.
UnicodeData_txt_regexp = re.compile("^([0-9A-F]{4,6});([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);(.*)$")

# A name field of the form "<...>", i.e. not an actual character name.
NonNameRange_regexp = re.compile("<([^>]*)>")
# A name field marking the first or last codepoint of a range.
NameRange_regexp = re.compile("<([^,]*), (First|Last)>")

def parse_UnicodeData_txt():
    """Parse UnicodeData.txt into per-codepoint and per-range records.

    Returns:
        (data_records, range_records).  All tuple members are strings
        taken directly from the file.
        - data_records: tuples (cp, name, gc, ccc, bidic, decomp, decval,
          digitval, numval, bidim, uc, lc, tc) for records whose name is
          not of the "<...>" form.
        - range_records: tuples (first_cp, last_cp, rangeName, gc, ccc,
          bidic, decomp, decval, digitval, numval, bidim, uc, lc, tc),
          one per "<name, First>" / "<name, Last>" pair; the property
          fields are those of the "Last" record.
        Records whose name is "<...>" but not a First/Last marker are
        skipped.

    Raises:
        Exception: on a line with unknown syntax, or on a "Last" marker
        with no preceding "First" marker of the same name.
    """
    data_records = []
    range_records = []
    name_range_starts = {}
    # The context manager guarantees the file handle is closed.
    with open(UCD_config.UCD_src_dir + "/UnicodeData.txt") as f:
        lines = f.readlines()
    for t in lines:
        if UCD_skip.match(t):
            continue  # skip comment and blank lines
        m = UnicodeData_txt_regexp.match(t)
        if not m:
            raise Exception("Unknown syntax: %s" % t)
        (cp, name, gc) = (m.group(1), m.group(2), m.group(3))
        (ccc, bidic, decomp, bidim) = (m.group(4), m.group(5), m.group(6), m.group(10))
        (decval, digitval, numval) = (m.group(7), m.group(8), m.group(9))
        # Unicode 1 name and ISO comment (groups 11 and 12) are obsolete
        (uc, lc, tc) = (m.group(13), m.group(14), m.group(15))
        if NonNameRange_regexp.match(name):
            rangeMatch = NameRange_regexp.match(name)
            if rangeMatch:
                rangeName = rangeMatch.group(1)
                marker = rangeMatch.group(2)
                print(rangeName, marker)
                if marker == 'First':
                    name_range_starts[rangeName] = cp
                if marker == 'Last':
                    if rangeName not in name_range_starts:
                        raise Exception("UnicodeData range end encountered without prior range start: %s" % t)
                    range_records.append((name_range_starts[rangeName], cp, rangeName, gc, ccc, bidic, decomp, decval, digitval, numval, bidim, uc, lc, tc))
            # "<...>" names never produce a per-codepoint data record.
            continue
        data_records.append((cp, name, gc, ccc, bidic, decomp, decval, digitval, numval, bidim, uc, lc, tc))
    return (data_records, range_records)
272
273#  Parse a decomposition mapping field in one of two forms:
274#  (a) compatibility mappings:  "<" decomp_type:[A-Za-z]* ">" {codepoint}
275#  (b) canonical mappings:  {codepoint} 
# Raw string so that `\s` is a regex escape, not an (invalid) string escape.
compatibility_regexp = re.compile(r"^<([^>]*)>\s*([0-9A-F ]*)$")
codepoints_regexp = re.compile("^[0-9A-F]{4,6}(?: +[0-9A-F]{4,6})*$")
def parse_decomposition(s):
    """Parse a decomposition mapping field.

    Args:
        s: either "<type> cp cp ..." (compatibility mapping) or
            "cp cp ..." (canonical mapping), with hexadecimal codepoints.

    Returns:
        (decomp_type, cps): the type from the angle brackets, or
        "Canonical" when there is none, and the codepoints as integers.

    Raises:
        Exception: if the codepoint list is not well formed.
    """
    m = compatibility_regexp.match(s)
    if m:
        decomp_type = m.group(1)
        mapping = m.group(2)
    else:
        decomp_type = "Canonical"
        mapping = s
    if not codepoints_regexp.match(mapping):
        raise Exception("Bad codepoint string syntax in parse_decomposition: %s" % mapping)
    # codepoints_regexp allows runs of several spaces between codepoints;
    # split() with no argument handles those, whereas split(" ") would
    # yield empty strings that int() rejects.
    cps = [int(x, 16) for x in mapping.split()]
    return (decomp_type, cps)
290
Note: See TracBrowser for help on using the repository browser.