source: icGREP/icgrep-devel/UCD-scripts/UCD_parser.py @ 5668

Last change on this file since 5668 was 5668, checked in by cameron, 23 months ago

PropertyObject? restructuring - remove Miscellaneous and Codepoint objects, add Obsolete

File size: 17.1 KB
Line 
1#
2# UCD_parser.py - parsing Unicode Character Database (UCD) files
3#
4# Robert D. Cameron
5# December 28, 2014
6#
7# Licensed under Open Software License 3.0.
8#
9#
10import re, string, os.path
11import UCD_config
12from unicode_set import *
13from UCD_property_objects import *
14
# Matches the "Version X.Y.Z of the Unicode Standard" statement in ReadMe.txt.
version_regexp = re.compile(r".*Version\s+([0-9.]*)\s+of the Unicode Standard.*")

def setVersionfromReadMe_txt():
    """Set UCD_config.version from the ReadMe.txt file of the UCD.

    Every line of ReadMe.txt (located in UCD_config.UCD_src_dir) is tested
    against version_regexp.  Each matching line stores its version string
    in UCD_config.version and prints it, so if several lines match, the
    last one determines the final value.
    """
    # The with-statement closes the file even if an exception is raised.
    with open(UCD_config.UCD_src_dir + "/" + 'ReadMe.txt') as f:
        for t in f:
            m = version_regexp.match(t)
            if m:
                UCD_config.version = m.group(1)
                print("Version %s" % m.group(1))
25
# Characters ignored when comparing property names: hyphen, underscore, whitespace.
trivial_name_char_re = re.compile(r'[-_\s]')

def canonicalize(property_string):
    """Return property_string lowercased with hyphens, underscores and
    whitespace removed, for use as a lookup key."""
    return trivial_name_char_re.sub('', property_string.lower())
29
#
#  Processing files of the UCD
#
#  General format for skippable lines: comment lines (starting with '#')
#  and lines that are empty or contain only whitespace.
UCD_skip = re.compile(r"^#.*$|^\s*$")
35
#
#  UCD Property File Format 1: property aliases
#  PropertyAliases.txt
#
#  Section header comment, e.g. "# Binary Properties": group 1 is the kind.
UCD_property_section_regexp = re.compile(r"^#\s*([-A-Za-z_0-9]+)\s*Properties\s*$")
#  Alias record: group 1 is the property code, group 2 the preferred full
#  name, group 3 the remainder of the line up to any trailing comment.
UCD_property_alias_regexp = re.compile(r"^([-A-Za-z_0-9]+)\s*;\s*([-A-Za-z_0-9]+)([^#]*)")

# Section 2.3.3 of UAX #44
Obsolete_Properties = ["na1", "Gr_Link", "Hyphen", "isc", "XO_NFC", "XO_NFD", "XO_NFKC", "XO_NFKD", "FC_NFKC"]
45
def parse_PropertyAlias_txt():
    """Parse PropertyAliases.txt and create one property object per property.

    The file is read from UCD_config.UCD_src_dir.  Section header comments
    ("# <Kind> Properties") set the kind applied to the records that follow.
    Each record gives a property code, a preferred full name and optional
    further aliases.

    The class of the object created for a property is chosen as follows:
      * code listed in Obsolete_Properties -> ObsoletePropertyObject
      * kind "Binary"                      -> BinaryPropertyObject
      * kind "Enumerated" or "Catalog"     -> EnumeratedPropertyObject
      * kind "String"                      -> StringPropertyObject
      * kind "Numeric"                     -> NumericPropertyObject
      * any other kind (Miscellaneous)     -> ExtensionPropertyObject for
        "scx", StringPropertyObject otherwise

    Returns a pair (property_enum_name_list, property_object_map): the list
    of property codes in file order, and a dict from property code to the
    property object created for it.

    Raises Exception for a record with unrecognized syntax, or for a
    non-obsolete record that appears before any section header.
    """
    property_object_map = {}
    property_enum_name_list = []
    # No section header seen yet; the original code left this name unbound.
    property_kind = None
    with open(UCD_config.UCD_src_dir + "/" + 'PropertyAliases.txt') as f:
        for t in f:
            # Section headers are comment lines, so test for them before skipping.
            m = UCD_property_section_regexp.match(t)
            if m:
                property_kind = m.group(1)
            if UCD_skip.match(t): continue  # skip comment and blank lines
            m = UCD_property_alias_regexp.match(t)
            if not m: raise Exception("Unknown property alias syntax: %s" % t)
            (property_code, prop_preferred_full_name, prop_extra) = (m.group(1), m.group(2), m.group(3))
            property_enum_name_list.append(property_code)
            if property_code in Obsolete_Properties:
                po = ObsoletePropertyObject()
            elif property_kind is None:
                raise Exception("Property alias before any section header: %s" % t)
            elif property_kind == "Binary":
                po = BinaryPropertyObject()
            elif property_kind in ("Enumerated", "Catalog"):   # Catalog: Age, Block, Script
                po = EnumeratedPropertyObject()
            elif property_kind == "String":
                po = StringPropertyObject()
            elif property_kind == "Numeric":
                po = NumericPropertyObject()
            elif property_code == "scx":   # Miscellaneous properties from here on
                po = ExtensionPropertyObject()
            else:
                # All other Miscellaneous properties have string values
                po = StringPropertyObject()
            po.setID(property_code, prop_preferred_full_name)
            po.setAliases(re.findall("[-A-Za-z_0-9]+", prop_extra))
            property_object_map[property_code] = po
    return (property_enum_name_list, property_object_map)
82
83
#
#  Property Default Value Specifications
#
#  The UCD uses special comment lines ("@missing specifications") to declare default
#  values for properties.   Examples showing the two common formats are:
#  (1)  Blocks.txt                    # @missing: 0000..10FFFF; No_Block
#  (2)  PropertyValueAliases.txt      # @missing: 0000..10FFFF; Case_Folding; <code point>
#  The general format gives a range of codepoints (generally 0000..10FFFF),
#  an optional property name (if the file containing the specification defines
#  many different properties), and the default value.
#
#  There are some important default values for different property types:
#  <code point>: This is a default value for certain String properties,
#                indicating the default for a codepoint under the given property
#                is to map to itself.
#  <none>:       This is a default for certain String properties indicating that
#                the default value for a code point is the empty string.
#  <script>:     The default value for the ScriptExtensions property is the
#                value of the Script property.
#  NaN:          The default value for a numeric property is the NaN (not a number) value.
#
105
#  @missing specification lines have the form
#      # @missing: <cp_lo>..<cp_hi>; [<property name>;] <default value>
#  UCD_missing_check only recognizes such a line; UCD_missing_regexp also
#  captures the two range endpoints (groups 1 and 2) and the remaining
#  field text up to any further '#' (group 3).
UCD_missing_check = re.compile(r"^#\s*@missing:.*")
UCD_missing_regexp = re.compile(r"^#\s*@missing:\s*([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})\s*;\s*([^#]*)(?:#|$)")

def parse_missing_spec(data_line):
    """Parse a @missing specification line.

    Returns a (cp_lo, cp_hi, fields) triple.  fields is the list of
    semicolon-separated values following the range, each stripped of
    surrounding whitespace: an optional property name and the default
    value.  The property name is generally omitted when the file being
    processed defines a single property only.

    Raises Exception if data_line does not match UCD_missing_regexp, or if
    the range is anything other than 0000..10FFFF.
    """
    m = UCD_missing_regexp.match(data_line)
    if not m: raise Exception("UCD missing spec parsing error: " + data_line)
    cp_lo = int(m.group(1), 16)
    cp_hi = int(m.group(2), 16)
    # We may have to restructure in the event that missing specs do not cover the full Unicode range.
    if cp_lo != 0 or cp_hi != 0x10FFFF: raise Exception("Unexpected range error in missing spec: " + data_line)
    fields = [f.strip() for f in m.group(3).split(';')]
    return (cp_lo, cp_hi, fields)
128
#
#  Missing specifications and other types of UCD data records often produce
#  a list of one or two fields which indicate a property and a value.
#

def parse_property_and_value(fields, property_lookup_map):
    """Resolve a [property] or [property, value] field list.

    The first field is canonicalized and looked up in property_lookup_map,
    which maps canonical property identifiers to unique property codes.

    Returns a pair (property_code, value); value is None when only one
    field was given.

    Raises Exception if fields is empty, has more than two entries, or if
    its first entry is not a key of property_lookup_map after
    canonicalization.
    """
    if len(fields) > 2: raise Exception("Too many fields")
    if len(fields) == 0: raise Exception("Expecting at least 1 field")
    canon = canonicalize(fields[0])
    # Report the name as written in the file.  The original referenced an
    # undefined variable here, which raised NameError instead.
    if canon not in property_lookup_map: raise Exception("Unexpected name: " + fields[0])
    pcode = property_lookup_map[canon]
    value = fields[1] if len(fields) == 2 else None
    return (pcode, value)
147
#
#  UCD Property File Format 2: property value aliases
#  PropertyValueAliases.txt
#
#  This file records value aliases for property values for
#  each enumerated property, with the following additional notes:
#  (1) The corresponding integer value of the enum constant is
#      also specified for ccc (second field).
#  (2) The Age property is a numeric type which has decimal float
#      values as the enum constants: these won't be legal in enum syntax.
#  (3) Binary properties also have enumerated values and aliases listed,
#      although this is redundant, because all binary properties have the
#      same value space.
#  (4) @missing lines provide default value information, primarily for some
#      non-enumerated types
#

def initializePropertyValues(property_object_map, property_lookup_map):
    """Populate property objects from PropertyValueAliases.txt.

    property_object_map maps property codes to property objects and
    property_lookup_map maps canonicalized property identifiers to property
    codes.  The file is read from UCD_config.UCD_src_dir.

    For each @missing line, the default value is set on the named property
    object.  For each value alias record, the value is added through
    addPropertyValue, but only when the property object is an instance of
    EnumeratedPropertyObject; records for other objects are parsed and
    then ignored.

    Raises Exception for a record with unrecognized syntax, for a property
    identifier missing from property_lookup_map, or for a property code
    missing from property_object_map.
    """
    # Groups 1-3 are the first three semicolon-separated fields; group 4 is
    # whatever follows, up to any trailing comment.
    UCD_property_value_alias_regexp = re.compile(r"^([-A-Za-z_0-9.]+)\s*;\s*([-A-Za-z_0-9.]+)\s*;\s*([-A-Za-z_0-9.]+)([^#]*)")
    with open(UCD_config.UCD_src_dir + "/" + 'PropertyValueAliases.txt') as f:
        for t in f:
            if UCD_skip.match(t):
                # @missing specifications are comment lines, so they are handled here.
                if UCD_missing_check.match(t):
                    (cp_lo, cp_hi, fields) = parse_missing_spec(t)
                    (property_code, default_value) = parse_property_and_value(fields, property_lookup_map)
                    property_object_map[property_code].setDefaultValue(default_value)
                continue  # skip comment and blank lines
            m = UCD_property_value_alias_regexp.match(t)
            if not m: raise Exception("Unknown property value alias syntax: %s" % t)
            prop_code = canonicalize(m.group(1))
            if prop_code not in property_lookup_map: raise Exception("Property code: '%s' is unknown" % prop_code)
            prop_code = property_lookup_map[prop_code]
            if prop_code not in property_object_map: raise Exception("Property object: '%s' is uninitialized" % prop_code)
            po = property_object_map[prop_code]
            extra = m.group(4)
            if prop_code == 'ccc':
                # Special case for ccc: second field is enum integer value
                value_enum = m.group(3)
                extra_list = re.findall("[-A-Za-z_0-9.]+", extra)
                value_preferred_full_name = extra_list[0]
                # Treat integer string as an alias
                value_aliases = [m.group(2)] + extra_list[1:]
            elif prop_code == 'age':
                # Special case for age: second field is numeric, third field is enum
                # treat numeric value as an alias string
                value_enum = m.group(3)
                value_preferred_full_name = m.group(3)
                value_aliases = [m.group(2)] + re.findall("[-A-Za-z_0-9]+", extra)
            else:
                value_enum = m.group(2)
                value_preferred_full_name = m.group(3)
                value_aliases = re.findall("[-A-Za-z_0-9]+", extra)
            if not isinstance(po, EnumeratedPropertyObject): continue
            po.addPropertyValue(value_enum, value_preferred_full_name, value_aliases)
207
208
#
#  UCD Property File Format 3:  codepoint/range -> data record maps
#  Many files have data records consisting of a codepoint or codepoint range
#  followed by fields separated by semicolons.
#

# A single code point.  The lookahead rejects a longer hex run and the ".."
# of a range, while still accepting a code point that is immediately
# followed by a comment or by the end of the input.
UCD_point_regexp = re.compile("^([0-9A-F]{4,6})(?![0-9A-F.])([^#]*)(?:#|$)")
UCD_range_regexp = re.compile("^([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})([^#]*)(?:#|$)")

def parse_data_record(data_line):
    """Parse a data line beginning with a codepoint or codepoint range.

    This is a generic parser for most of the UCD data files.

    Returns a (cp_lo, cp_hi, fields) triple giving the low and high
    codepoints of the range (equal for a single codepoint) and the list of
    fields.  The semicolon separators are removed, as is leading and
    trailing whitespace of each field.  fields is empty when nothing but
    whitespace or a comment follows the codepoint(s).

    Raises Exception if the line starts with neither a codepoint nor a
    range, or if non-blank field data does not start with a semicolon.
    """
    m = UCD_point_regexp.match(data_line)
    if m:
        cp_lo = cp_hi = int(m.group(1), 16)
        field_data = m.group(2)
    else:
        m = UCD_range_regexp.match(data_line)
        if not m: raise Exception("UCD data record parsing error: " + data_line)
        cp_lo = int(m.group(1), 16)
        cp_hi = int(m.group(2), 16)
        field_data = m.group(3)
    field_data = field_data.strip()
    if field_data == '':
        return (cp_lo, cp_hi, [])
    if field_data[0] != ';':
        raise Exception("Field data syntax: " + field_data)
    fields = [f.strip() for f in field_data[1:].split(';')]
    return (cp_lo, cp_hi, fields)
248
249
#  Some UCD files hold data for several properties, with each record (and
#  each @missing specification) naming the property it applies to.
#  parse_multisection_property_data parses such a file and populates
#  the property objects for each property through successive calls to
#  the corresponding addDataRecord method.
#
def parse_multisection_property_data(pfile, property_object_map, property_lookup_map):
    """Parse the multi-property file pfile in UCD_config.UCD_src_dir.

    @missing lines set the default value of the property they name.  Each
    data record is passed to addDataRecord of the property named by its
    first field.  Once the whole file has been read, finalizeProperty is
    called on every property encountered.

    Returns the list of property codes encountered, in order of first
    appearance.
    """
    props = []
    with open(UCD_config.UCD_src_dir + "/" + pfile) as f:
        for t in f:
            # @missing lines are comments, so they must be tested before UCD_skip.
            if UCD_missing_regexp.match(t):
                (cp_lo, cp_hi, fields) = parse_missing_spec(t)
                (prop_code, dflt) = parse_property_and_value(fields, property_lookup_map)
                property_object_map[prop_code].setDefaultValue(dflt)
                if prop_code not in props: props.append(prop_code)
            elif UCD_skip.match(t):
                continue
            else:
                (cp_lo, cp_hi, fields) = parse_data_record(t)
                (prop_code, v) = parse_property_and_value(fields, property_lookup_map)
                if prop_code not in props: props.append(prop_code)
                property_object_map[prop_code].addDataRecord(cp_lo, cp_hi, v)
    for p in props:
        property_object_map[p].finalizeProperty()
    return props
274
275
#
#   Some UCD files are defined for a single property.
#   parse_property_data deals with such a file, given the property
#   object to populate and the file name.
#

def parse_property_data(property_object, pfile):
    """Populate property_object from the single-property file pfile.

    pfile is a file name within UCD_config.UCD_src_dir.  A @missing line
    must carry exactly one field, which becomes the default value.  Each
    data record contributes its first field as the value for its codepoint
    range; for a BinaryPropertyObject a record without fields is recorded
    with the value None.  finalizeProperty is called once the file has
    been read.

    Raises Exception if a @missing line does not have exactly one field.
    """
    with open(UCD_config.UCD_src_dir + "/" + pfile) as f:
        for t in f:
            # @missing lines are comments, so they must be tested before UCD_skip.
            if UCD_missing_regexp.match(t):
                (cp_lo, cp_hi, fields) = parse_missing_spec(t)
                if len(fields) != 1: raise Exception("Expecting exactly 1 field")
                property_object.setDefaultValue(fields[0])
            elif UCD_skip.match(t):
                continue
            else:
                (cp_lo, cp_hi, fields) = parse_data_record(t)
                if isinstance(property_object, BinaryPropertyObject) and len(fields) == 0:
                    property_object.addDataRecord(cp_lo, cp_hi, None)
                else:
                    property_object.addDataRecord(cp_lo, cp_hi, fields[0])
    property_object.finalizeProperty()
299
300
#
#   Some UCD files are organized to support multiple properties with one
#   property per column.
#   parse_multicolumn_property_data deals with such files given a list of
#   property codes.
#

def parse_multicolumn_property_data(pfile, property_object_map, property_lookup_map, prop_code_list):
    """Parse the one-property-per-column file pfile in UCD_config.UCD_src_dir.

    prop_code_list gives the property code for each column, in column
    order.  For each data record, every non-empty field is passed to
    addDataRecord of the property object for its column.  After the file
    has been read, finalizeProperty is called for every property in
    prop_code_list.

    property_lookup_map is accepted but not used by this function.

    Raises Exception if a record's field count differs from the length of
    prop_code_list.
    """
    with open(UCD_config.UCD_src_dir + "/" + pfile) as f:
        for t in f:
            if UCD_skip.match(t):
                continue
            (cp_lo, cp_hi, fields) = parse_data_record(t)
            if len(fields) != len(prop_code_list): raise Exception("Mutlicolumn field count mismatch, expecting %i: " % len(prop_code_list) + t)
            # The lengths are equal here, so zip pairs every column with its field.
            for (prop_code, value) in zip(prop_code_list, fields):
                if value != '':
                    property_object_map[prop_code].addDataRecord(cp_lo, cp_hi, value)
    for p in prop_code_list:
        property_object_map[p].finalizeProperty()
323
# One UnicodeData.txt record: a code point followed by 14 further
# semicolon-separated fields, captured as groups 1 to 15.
UnicodeData_txt_regexp = re.compile(
    "^([0-9A-F]{4,6});"                  # group 1
    "([^;]*);([^;]*);([^;]*);([^;]*);"   # groups 2-5
    "([^;]*);([^;]*);([^;]*);([^;]*);"   # groups 6-9
    "([^;]*);([^;]*);([^;]*);([^;]*);"   # groups 10-13
    "([^;]*);"                           # group 14
    "(.*)$")                             # group 15

# A name field that is a bracketed placeholder such as "<control>".
NonName_regexp = re.compile("<([^>]*)>")
# A name field marking the start or end of a range, "<Range Name, First>" or
# "<Range Name, Last>": group 1 is the range name, group 2 is First or Last.
NameRange_regexp = re.compile("<([^,]*), (First|Last)>")
328
#  Parse a decomposition mapping field in one of two forms:
#  (a) compatibility mappings:  "<" decomp_type:[A-Za-z]* ">" {codepoint}
#  (b) canonical mappings:  {codepoint}
compatibility_regexp = re.compile(r"^<([^>]*)>\s*([0-9A-F ]*)$")

def parse_decomposition(s):
    """Split a decomposition mapping field into (decomp_type, mapping).

    For a compatibility mapping, decomp_type is the text between the angle
    brackets and mapping is the codepoint sequence that follows.  Any
    other string is treated as a canonical mapping: decomp_type is
    "Canonical" and mapping is s unchanged.
    """
    m = compatibility_regexp.match(s)
    if m:
        return (m.group(1), m.group(2))
    return ("Canonical", s)
342
def parse_UnicodeData_txt(property_object_map):
    """Parse UnicodeData.txt and populate the properties it defines.

    The file is read from UCD_config.UCD_src_dir.  Data records are added
    to the property objects registered under these codes:
      'na'   name, unless the name field is a bracketed placeholder
      'dm'   decomposition mapping, with any <type> tag removed
      'na1'  Unicode 1 name
      'suc', 'slc', 'stc'   simple upper/lower/title case mappings; when
             the uppercase field is present and the titlecase field is
             empty, the uppercase value is also recorded for 'stc'

    Only non-empty fields produce records.  finalizeProperty is then called
    for 'na', 'na1', 'isc', 'dm', 'slc', 'suc' and 'stc'.

    "<Name, First>" / "<Name, Last>" record pairs are printed as they are
    encountered and their ranges collected.

    Raises Exception for a record with unrecognized syntax, or for a range
    end that has no preceding range start.
    """
    # NOTE: collected as (first_cp, last_cp, range_name, gc) but not yet
    # consumed by anything in this function.
    range_records = []
    name_range_starts = {}
    with open(UCD_config.UCD_src_dir + "/UnicodeData.txt") as f:
        for t in f:
            if UCD_skip.match(t):
                continue  # skip comment and blank lines
            m = UnicodeData_txt_regexp.match(t)
            if not m: raise Exception("Unknown syntax: %s" % t)
            # Regexp groups, in UnicodeData.txt field order:
            #   1 cp, 2 name, 3 gc, 4 ccc, 5 bidi class, 6 decomposition,
            #   7 decimal, 8 digit, 9 numeric, 10 bidi mirrored,
            #   11 Unicode 1 name, 12 ISO comment, 13 suc, 14 slc, 15 stc
            (cp, name, gc) = (int(m.group(1), 16), m.group(2), m.group(3))
            decomp = m.group(6)
            # The Unicode 1 name is group 11; group 10 is the bidi mirrored flag.
            na1 = m.group(11)
            (suc, slc, stc) = (m.group(13), m.group(14), m.group(15))
            rangeMatch = NameRange_regexp.match(name)
            if rangeMatch:
                rangeName = rangeMatch.group(1)
                print(rangeName, rangeMatch.group(2))
                if rangeMatch.group(2) == 'First': name_range_starts[rangeName] = cp
                if rangeMatch.group(2) == 'Last':
                    if rangeName not in name_range_starts: raise Exception("UnicodeData range end encountered without prior range start: %s" % t)
                    range_records.append((name_range_starts[rangeName], cp, rangeName, gc))
            if not NonName_regexp.match(name):
                property_object_map['na'].addDataRecord(cp, cp, name)
            if decomp != '':
                (decomp_type, mapping) = parse_decomposition(decomp)
                property_object_map['dm'].addDataRecord(cp, cp, mapping)
            if na1 != '':
                property_object_map['na1'].addDataRecord(cp, cp, na1)
            if suc != '':
                property_object_map['suc'].addDataRecord(cp, cp, suc)
                if stc == '':
                    # An empty titlecase field takes the uppercase mapping.
                    property_object_map['stc'].addDataRecord(cp, cp, suc)
            if slc != '':
                property_object_map['slc'].addDataRecord(cp, cp, slc)
            if stc != '':
                property_object_map['stc'].addDataRecord(cp, cp, stc)
    for code in ('na', 'na1', 'isc', 'dm', 'slc', 'suc', 'stc'):
        property_object_map[code].finalizeProperty()
389
Note: See TracBrowser for help on using the repository browser.