source: icGREP/icgrep-devel/UCD-scripts/UCD_parser.py @ 5919

Last change on this file since 5919 was 5749, checked in by nmedfort, 21 months ago

updated UCD python scripts

File size: 19.7 KB
Line 
1#
2# UCD_parser.py - parsing Unicode Character Database (UCD) files
3#
4# Robert D. Cameron
5# December 28, 2014
6#
7# Licensed under Open Software License 3.0.
8#
9#
10import UCD_config
11from UCD_property_objects import *
12
# Matches the ReadMe.txt line that announces the UCD version; group 1 is the
# dotted version number.  A raw string keeps "\s" from being treated as an
# (invalid) string escape.
version_regexp = re.compile(r".*Version\s+([0-9.]*)\s+of the Unicode Standard.*")
14
def setVersionfromReadMe_txt():
    """Set UCD_config.version from ReadMe.txt in the UCD source directory.

    Every line matching version_regexp updates UCD_config.version and is
    echoed to stdout, so the last matching line in the file wins.
    """
    # The with-statement guarantees the file is closed even if a line fails.
    with open(UCD_config.UCD_src_dir + "/" + 'ReadMe.txt') as f:
        for t in f:
            m = version_regexp.match(t)
            if m:
                UCD_config.version = m.group(1)
                print("Version %s" % m.group(1))
23
# Characters that are insignificant when comparing property names:
# hyphens, underscores and any whitespace.
trivial_name_char_re = re.compile(r'[-_\s]')
def canonicalize(property_string):
    """Return property_string lower-cased with hyphens, underscores and
    whitespace removed, for use as a loose-matching lookup key."""
    return trivial_name_char_re.sub('', property_string.lower())
27
28#
29#  Processing files of the UCD
30#
31#  General format for skippable comments, blank lines
# Matches lines that carry no data record: comment lines starting with '#'
# and lines that are empty or whitespace-only.
UCD_skip = re.compile(r"^#.*$|^\s*$")
33
34#
35#  UCD Property File Format 1: property aliases
36#  PropertyAliases.txt
37#
# Section header comment, e.g. "# Binary Properties"; group 1 is the kind.
UCD_property_section_regexp = re.compile(r"^#\s*([-A-Za-z_0-9]+)\s*Properties\s*$")
# Alias line "code ; Preferred_Name [; more aliases]": group 1 is the property
# code, group 2 the preferred full name, group 3 everything up to a '#'.
UCD_property_alias_regexp = re.compile(r"^([-A-Za-z_0-9]+)\s*;\s*([-A-Za-z_0-9]+)([^#]*)")
40
41# Section 2.3.3 of UAX #44
# Property codes that parse_PropertyAlias_txt maps to ObsoletePropertyObject
# regardless of the section they are listed in.
Obsolete_Properties = [
    "na1",
    "Gr_Link",
    "Hyphen",
    "isc",
    "XO_NFC",
    "XO_NFD",
    "XO_NFKC",
    "XO_NFKD",
    "FC_NFKC",
]
43
def parse_PropertyAlias_txt():
    """Parse PropertyAliases.txt and create one property object per property.

    The kind of object created for each alias line is chosen from the most
    recent "# <Kind> Properties" section header, except that codes listed in
    Obsolete_Properties always get an ObsoletePropertyObject.

    Returns a pair (property_enum_name_list, property_object_map): the list
    holds the property codes in file order and the map takes each code to
    its property object, with its ID and aliases already set.

    Raises Exception for an alias line that cannot be parsed, or for a
    non-obsolete alias line that appears before any section header.
    """
    property_object_map = {}
    property_enum_name_list = []
    # No section header seen yet; previously this was left unbound and an
    # early alias line failed with an opaque UnboundLocalError.
    property_kind = None
    with open(UCD_config.UCD_src_dir + "/" + 'PropertyAliases.txt') as f:
        for t in f:
            # Section headers are comment lines, so look for them before skipping.
            m = UCD_property_section_regexp.match(t)
            if m:
                property_kind = m.group(1)
            if UCD_skip.match(t): continue  # skip comment and blank lines
            m = UCD_property_alias_regexp.match(t)
            if not m: raise Exception("Unknown property alias syntax: %s" % t)
            (property_code, prop_preferred_full_name, prop_extra) = (m.group(1), m.group(2), m.group(3))
            property_enum_name_list.append(property_code)
            if property_code in Obsolete_Properties:
                po = ObsoletePropertyObject()
            elif property_kind is None:
                raise Exception("Property alias before any section header: %s" % t)
            elif property_kind == "Binary":
                po = BinaryPropertyObject()
            elif property_kind == "Enumerated":
                po = EnumeratedPropertyObject()
            elif property_kind == "Catalog":   # Age, Block, Script
                po = EnumeratedPropertyObject()
            elif property_kind == "String":
                if property_code in ["uc", "lc", "tc", "cf"]:
                    po = StringOverridePropertyObject("s" + property_code)
                else:
                    po = StringPropertyObject()
            elif property_kind == "Numeric":
                po = NumericPropertyObject()
            elif property_code == "scx":   # Miscellaneous properties
                po = ExtensionPropertyObject()
            else:
                # All other Miscellaneous properties have string values
                po = StringPropertyObject()
            po.setID(property_code, prop_preferred_full_name)
            po.setAliases(re.findall("[-A-Za-z_0-9]+", prop_extra))
            property_object_map[property_code] = po
    return (property_enum_name_list, property_object_map)
83
84
85#
86#  Property Default Value Specifications
87#
88#  The UCD uses special comment lines ("@missing specifications") to declare default
89#  values for properties.   Examples showing the two common formats are:
90#  (1)  Blocks.txt                    # @missing: 0000..10FFFF; No_Block
91#  (2)  PropertyValueAliases.txt      # @missing: 0000..10FFFF; Case_Folding; <code point>
92#  The general format gives a range of codepoints (generally 0000..10FFFF),
93#  an optional property name (if the file containing the specification defines
94#  many different properties), and the default value.
95#
96#  There are some important default values for different property types:
97#  <code point>: This is a default value for certain String properties,
98#                indicating the default for a codepoint under the given property
99#                is to map to itself.
100#  <none>:       This is a default for certain String properties indicating that
101#                the default value for a code point is the empty string.
102#  <script>:     The default value for the ScriptExtensions property is the
103#                value of the Script property.
104#  NaN           The default value for numeric property is the NaN (not a number) value.
105#
106
107#  Given a line known to contain such a @missing specification,
108#  parse_missing_spec(data_line) returns a (cp_lo, cp_hi, fields) triple.
109#  Generally, cp_lo = 0 and cp_hi = 0x10FFFF
110#  The list of fields contains one or two entries: an optional
111#  property name and the default value specified for the range.
112#  @missing specifications generally omit the property name when
113#  the file being processed is defined for a single property only.
114#
# Cheap test for "is this a @missing comment line at all".
UCD_missing_check = re.compile(r"^#\s*@missing:.*")
# Full @missing syntax: groups 1 and 2 are the hex range bounds, group 3 is
# the semicolon-separated field text up to an optional trailing comment.
UCD_missing_regexp = re.compile(r"^#\s*@missing:\s*([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})\s*;\s*([^#]*)(?:#|$)")

def parse_missing_spec(data_line):
    """Parse a "# @missing:" line into a (cp_lo, cp_hi, fields) triple.

    fields is the list of semicolon-separated values after the range, each
    stripped of surrounding whitespace.

    Raises Exception if the line does not match the @missing syntax or if
    the range is anything other than 0000..10FFFF.
    """
    m = UCD_missing_regexp.match(data_line)
    if not m: raise Exception("UCD missing spec parsing error: " + data_line)
    cp_lo = int(m.group(1), 16)
    cp_hi = int(m.group(2), 16)
    # We may have to restructure in the event that missing specs do not cover the full Unicode range.
    if cp_lo != 0 or cp_hi != 0x10FFFF: raise Exception("Unexpected range error in missing spec: " + data_line)
    fields = [field.strip() for field in m.group(3).split(';')]
    return (cp_lo, cp_hi, fields)
129
130#
131#  Missing specifications and other types of UCD data records often produce
132#  a list of one or two fields which indicate a property and a value.
133#
134#  parse_property_and_value(fields, property_lookup_map) checks that the
135#  first of the given fields is indeed a property identifier identified
136#  in the given lookup map, and returns a pair consisting of the
137#  unique property code for the property, plus a corresponding value
138#  (or None, if only one field was given).
139
def parse_property_and_value(fields, property_lookup_map):
    """Resolve a [property_name] or [property_name, value] field list.

    The first field is canonicalized and looked up in property_lookup_map.
    Returns (property_code, value), where value is None when only one field
    was given.

    Raises Exception if there are no fields, more than two fields, or the
    property name is not in the lookup map.
    """
    if len(fields) > 2: raise Exception("Too many fields")
    if len(fields) == 0: raise Exception("Expecting at least 1 field")
    canon = canonicalize(fields[0])
    if canon not in property_lookup_map:
        # Report the name as written in the file.  This used to reference an
        # undefined variable and so failed with NameError instead.
        raise Exception("Unexpected name: " + fields[0])
    pcode = property_lookup_map[canon]
    value = fields[1] if len(fields) == 2 else None
    return (pcode, value)
148
149#
150#  UCD Property File Format 2: property value aliases
151#  PropertyValueAliases.txt
152#
153#  This file records value aliases for property values for
154#  each enumerated property, with the following additional notes:
155#  (1) The corresponding integer value of the enum constant is
156#      also specified for ccc (second field).
157#  (2) The Age property is a numeric type which has decimal float
158#      values as the enum constants: these won't be legal in enum syntax.
159#  (3) Binary properties also have enumerated values and aliases listed,
160#      although this is redundant, because all binary properties have the
161#      same value space.
162#  (4) @missing lines provide default value information, primarily for some
163#      non-enumerated types
164
165
def initializePropertyValues(property_object_map, property_lookup_map):
    """Load value names and defaults from PropertyValueAliases.txt.

    "@missing" comment lines set the default value of the property they
    name.  Each alias line is resolved to a property object through
    property_lookup_map and property_object_map; for objects that are
    EnumeratedPropertyObject instances the value is registered with
    addPropertyValue(enum, preferred_full_name, aliases).  Alias lines for
    other kinds of property object are validated but otherwise ignored.

    Raises Exception for an unparsable alias line, an unknown property
    code, or a property code with no property object.
    """
    value_alias_regexp = re.compile(r"^([-A-Za-z_0-9.]+)\s*;\s*([-A-Za-z_0-9.]+)\s*;\s*([-A-Za-z_0-9.]+)([^#]*)")
    with open(UCD_config.UCD_src_dir + "/" + 'PropertyValueAliases.txt') as f:
        for t in f:
            if UCD_skip.match(t):
                # @missing specifications are comment lines, so they are
                # handled here before the line is skipped.
                if UCD_missing_check.match(t):
                    (cp_lo, cp_hi, fields) = parse_missing_spec(t)
                    (property_code, default_value) = parse_property_and_value(fields, property_lookup_map)
                    property_object_map[property_code].setDefaultValue(default_value)
                continue  # skip comment and blank lines
            m = value_alias_regexp.match(t)
            if not m: raise Exception("Unknown property value alias syntax: %s" % t)
            prop_code = canonicalize(m.group(1))
            if prop_code not in property_lookup_map: raise Exception("Property code: '%s' is unknown" % prop_code)
            prop_code = property_lookup_map[prop_code]
            if prop_code not in property_object_map: raise Exception("Property object: '%s' is uninitialized" % prop_code)
            po = property_object_map[prop_code]
            # Only enumerated properties record their values.
            if not isinstance(po, EnumeratedPropertyObject): continue
            extra = m.group(4)
            if prop_code == 'ccc':
                # Special case for ccc: second field is enum integer value
                value_enum = m.group(3)
                extra_list = re.findall("[-A-Za-z_0-9.]+", extra)
                value_preferred_full_name = extra_list[0]
                # Treat integer string as an alias
                value_aliases = [m.group(2)] + extra_list[1:]
            elif prop_code == 'age':
                # Special case for age: second field is numeric, third field is enum
                # treat numeric value as an alias string
                value_enum = m.group(3)
                value_preferred_full_name = m.group(3)
                value_aliases = [m.group(2)] + re.findall("[-A-Za-z_0-9]+", extra)
            else:
                value_enum = m.group(2)
                value_preferred_full_name = m.group(3)
                value_aliases = re.findall("[-A-Za-z_0-9]+", extra)
            po.addPropertyValue(value_enum, value_preferred_full_name, value_aliases)
208
209
210#
211#  UCD Property File Format 3:  codepoint/range -> data record maps
212#  Many files have data records consisting of a codepoint or codepoint range
213#  followed by fields separated by semicolons.
214#
215
UCD_point_regexp = re.compile("^([0-9A-F]{4,6})([^0-9A-F.#][^#]*)(?:#|$)")
UCD_range_regexp = re.compile("^([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})([^#]*)(?:#|$)")

#
# parse_data_record is the generic parser for most UCD data files.
# A data line starts with a single codepoint or a lo..hi codepoint range,
# followed by semicolon-separated fields and an optional '#' comment.
# The result is a (cp_lo, cp_hi, fields) triple; cp_lo equals cp_hi for a
# single codepoint.  Separators are dropped and every field is stripped of
# leading and trailing whitespace.
#

def parse_data_record(data_line):
    """Split a UCD data line into (cp_lo, cp_hi, fields).

    Raises Exception if the line starts with neither a codepoint nor a
    range, or if the text after it does not begin with a semicolon.
    """
    point = UCD_point_regexp.match(data_line)
    if point is not None:
        first = last = int(point.group(1), 16)
        remainder = point.group(2)
    else:
        span = UCD_range_regexp.match(data_line)
        if span is None:
            raise Exception("UCD data record parsing error: " + data_line)
        first = int(span.group(1), 16)
        last = int(span.group(2), 16)
        remainder = span.group(3)
    remainder = remainder.strip()
    if not remainder:
        # Nothing but the codepoint(s) on the line.
        return (first, last, [])
    if not remainder.startswith(';'):
        raise Exception("Field data syntax: " + remainder)
    return (first, last, [piece.strip() for piece in remainder[1:].split(';')])
249
250
251#  parse_multisection_property_data parses a multi-property file and populates
252#  the property objects for each property through successive calls to
253#  the corresponding addDataRecord method.
254#
def parse_multisection_property_data(pfile, property_object_map, property_lookup_map):
    """Parse a UCD file whose records each name the property they apply to.

    pfile is read from UCD_config.UCD_src_dir.  "@missing" lines set the
    default value of the property they name; data lines add a
    (cp_lo, cp_hi, value) record to the property named in their first field.
    Every property seen is finalized once the whole file has been read.

    Returns the list of property codes encountered, in first-seen order.
    """
    props = []
    with open(UCD_config.UCD_src_dir + "/" + pfile) as f:
        for t in f:
            # @missing lines are comments too, so test for them before UCD_skip.
            if UCD_missing_regexp.match(t):
                (cp_lo, cp_hi, fields) = parse_missing_spec(t)
                (prop_code, dflt) = parse_property_and_value(fields, property_lookup_map)
                property_object_map[prop_code].setDefaultValue(dflt)
                if prop_code not in props: props.append(prop_code)
            elif UCD_skip.match(t):
                continue
            else:
                (cp_lo, cp_hi, fields) = parse_data_record(t)
                (prop_code, v) = parse_property_and_value(fields, property_lookup_map)
                if prop_code not in props: props.append(prop_code)
                property_object_map[prop_code].addDataRecord(cp_lo, cp_hi, v)
    for p in props:
        property_object_map[p].finalizeProperty()
    return props
275
276
277#
278#   Some UCD files are defined for a single property.   
279#   parse_property_data deals with such a file, given the property
280#   object to populate and the file root.
281#
282
def parse_property_data(property_object, pfile):
    """Populate property_object from a UCD file that defines a single property.

    pfile is read from UCD_config.UCD_src_dir.  A "@missing" line must carry
    exactly one field, which becomes the default value.  Each data line adds
    a record whose value is its first field; for a BinaryPropertyObject a
    line with no fields adds a record with value None.  The property is
    finalized after the whole file has been read.

    Raises Exception if a "@missing" line does not have exactly one field.
    """
    with open(UCD_config.UCD_src_dir + "/" + pfile) as f:
        for t in f:
            # @missing lines are comments too, so test for them before UCD_skip.
            if UCD_missing_regexp.match(t):
                (cp_lo, cp_hi, fields) = parse_missing_spec(t)
                if len(fields) != 1: raise Exception("Expecting exactly 1 field")
                property_object.setDefaultValue(fields[0])
            elif UCD_skip.match(t):
                continue
            else:
                (cp_lo, cp_hi, fields) = parse_data_record(t)
                if isinstance(property_object, BinaryPropertyObject) and len(fields) == 0:
                    property_object.addDataRecord(cp_lo, cp_hi, None)
                else:
                    property_object.addDataRecord(cp_lo, cp_hi, fields[0])
    property_object.finalizeProperty()
300
301
302#
303#   Some UCD files are organized to support multiple properties with one
304#   property per column.
305#   parse_multicolumn_property_data deals with such files given a list of
306#   property codes.
307#
308
def parse_multicolumn_property_data(pfile, property_object_map, property_lookup_map, prop_code_list):
    """Parse a UCD file that holds one property per column.

    pfile is read from UCD_config.UCD_src_dir.  prop_code_list gives the
    property code for each column, in order.  Every non-empty field is added
    as a data record to the property object of its column, provided that
    code is present in property_object_map.  Each listed property that has a
    property object is finalized after the file has been read.
    property_lookup_map is accepted for signature compatibility but unused.

    Raises Exception if a data line's field count differs from
    len(prop_code_list).
    """
    with open(UCD_config.UCD_src_dir + "/" + pfile) as f:
        for t in f:
            if UCD_skip.match(t):
                continue
            (cp_lo, cp_hi, fields) = parse_data_record(t)
            if len(fields) != len(prop_code_list): raise Exception("Multicolumn field count mismatch, expecting %i: " % len(prop_code_list) + t)
            for (code, value) in zip(prop_code_list, fields):
                if value != '' and code in property_object_map:
                    property_object_map[code].addDataRecord(cp_lo, cp_hi, value)
    for p in prop_code_list:
        # Test the code being finalized.  The previous check indexed with the
        # stale inner-loop variable, so it tested the wrong code (or failed
        # with NameError when the file had no data lines).
        if p in property_object_map:
            property_object_map[p].finalizeProperty()
325
# One UnicodeData.txt record: a hex codepoint followed by fourteen further
# semicolon-separated fields, giving capture groups 1 through 15.
UnicodeData_txt_regexp = re.compile(
    "^([0-9A-F]{4,6});"
    "([^;]*);([^;]*);([^;]*);([^;]*);"
    "([^;]*);([^;]*);([^;]*);([^;]*);"
    "([^;]*);([^;]*);([^;]*);([^;]*);"
    "([^;]*);"
    "(.*)$"
)

# A bracketed pseudo-name such as "<control>"; group 1 is the text inside.
NonName_regexp = re.compile("<([^>]*)>")
# A range marker "<Name, First>" or "<Name, Last>"; group 1 is the range
# name and group 2 is "First" or "Last".
NameRange_regexp = re.compile("<([^,]*), (First|Last)>")
330
331#  Parse a decomposition mapping field in one of two forms:
332#  (a) compatibility mappings:  "<" decomp_type:[A-Za-z]* ">" {codepoint}
333#  (b) canonical mappings:  {codepoint} 
# "<type> hex hex ...": group 1 is the decomposition type, group 2 the mapping.
compatibility_regexp = re.compile(r"^<([^>]*)>\s*([0-9A-F ]*)$")
def parse_decomposition(s):
    """Split a decomposition field into (decomp_type, mapping).

    A field of the form "<type> codepoints" yields (type, codepoints); any
    other string is returned unchanged as the mapping with type "Canonical".
    """
    m = compatibility_regexp.match(s)
    if m:
        return (m.group(1), m.group(2))
    return ("Canonical", s)
344
def parse_UnicodeData_txt(property_object_map):
    """Populate several properties from UnicodeData.txt.

    Adds data records to the 'na', 'dm', 'na1', 'suc', 'slc', 'stc' and 'nv'
    property objects and finalizes those plus 'isc'.  Names written in angle
    brackets (e.g. range markers) are not recorded as 'na' values.  When the
    titlecase field is empty, the uppercase mapping (if any) is used for
    'stc'.  Range marker names are echoed to stdout as they are met.

    Raises Exception for a line that does not have the UnicodeData.txt
    syntax, or for a "Last" range marker with no preceding "First".
    """
    range_records = []
    name_range_starts = {}
    with open(UCD_config.UCD_src_dir + "/UnicodeData.txt") as f:
        for t in f:
            if UCD_skip.match(t):
                continue  # skip comment and blank lines
            m = UnicodeData_txt_regexp.match(t)
            if not m: raise Exception("Unknown syntax: %s" % t)
            # Groups follow the UnicodeData.txt column order.  Groups 4 (ccc),
            # 5 (bidi class), 10 (bidi mirrored) and 12 (ISO comment) are not
            # used here.
            (cp, name, gc) = (int(m.group(1), 16), m.group(2), m.group(3))
            decomp = m.group(6)
            (decval, digitval, numval) = (m.group(7), m.group(8), m.group(9))
            # Unicode_1_Name is group 11; group 10, read here previously, is
            # the Bidi_Mirrored flag.
            na1 = m.group(11)
            (upper, lower, title) = (m.group(13), m.group(14), m.group(15))
            rangeMatch = NameRange_regexp.match(name)
            if rangeMatch:
                rangeName = rangeMatch.group(1)
                print(rangeName, rangeMatch.group(2))
                if rangeMatch.group(2) == 'First': name_range_starts[rangeName] = cp
                if rangeMatch.group(2) == 'Last':
                    if rangeName not in name_range_starts: raise Exception("UnicodeData range end encountered without prior range start: %s" % t)
                    range_records.append((name_range_starts[rangeName], cp, rangeName, gc))
            if not NonName_regexp.match(name):
                property_object_map['na'].addDataRecord(cp, cp, name)
            if decomp != '':
                (decomp_type, mapping) = parse_decomposition(decomp)
                property_object_map['dm'].addDataRecord(cp, cp, mapping)
            if na1 != '':
                property_object_map['na1'].addDataRecord(cp, cp, na1)
            if upper != '':
                property_object_map['suc'].addDataRecord(cp, cp, upper)
            if lower != '':
                property_object_map['slc'].addDataRecord(cp, cp, lower)
            if title != '':
                property_object_map['stc'].addDataRecord(cp, cp, title)
            elif upper != '':
                # An empty titlecase field falls back to the uppercase mapping.
                property_object_map['stc'].addDataRecord(cp, cp, upper)
            if decval != '':
                property_object_map['nv'].addDataRecord(cp, cp, decval)
            if digitval != '':
                property_object_map['nv'].addDataRecord(cp, cp, digitval)
            if numval != '':
                property_object_map['nv'].addDataRecord(cp, cp, numval)

    property_object_map['na'].finalizeProperty()
    property_object_map['na1'].finalizeProperty()
    property_object_map['isc'].finalizeProperty()
    property_object_map['dm'].finalizeProperty()
    property_object_map['slc'].finalizeProperty()
    property_object_map['suc'].finalizeProperty()
    property_object_map['stc'].finalizeProperty()
    property_object_map['nv'].finalizeProperty()
399
def parse_SpecialCasing_txt(property_object_map):
    """Populate the 'lc', 'tc' and 'uc' properties from SpecialCasing.txt.

    Only records with exactly four fields whose fourth field is empty are
    used; all other records are ignored as context-dependent.  The first
    three fields are the lower, title and upper mappings, and each non-empty
    one is added as a data record.  All three properties are finalized after
    the file has been read.
    """
    with open(UCD_config.UCD_src_dir + "/SpecialCasing.txt") as f:
        for t in f:
            if UCD_skip.match(t):
                continue  # skip comment and blank lines
            (cp, cp_hi, fields) = parse_data_record(t)
            if len(fields) != 4: continue   #  Ignore context-dependent casing
            if fields[3] != '': continue
            (lower, title, upper) = (fields[0], fields[1], fields[2])
            if lower != '':
                property_object_map['lc'].addDataRecord(cp, cp, lower)
            if title != '':
                property_object_map['tc'].addDataRecord(cp, cp, title)
            if upper != '':
                property_object_map['uc'].addDataRecord(cp, cp, upper)
    property_object_map['lc'].finalizeProperty()
    property_object_map['tc'].finalizeProperty()
    property_object_map['uc'].finalizeProperty()
419
420
421# CaseFolding.txt has four types of fold entries:
422# S, C, F, T:  Simple, Common, Full and Turkic. 
423# The SimpleCaseFold property is the set of mappings S+C,
424# The FullCaseFold property is the set F+C
425# There may be multiple entries per codepoint
426
def parse_CaseFolding_txt(property_object_map):
    """Populate the 'scf' and 'cf' properties from CaseFolding.txt.

    Entries of type S or C are added to 'scf' and entries of type F to 'cf';
    other types are added to neither.  Both properties are finalized after
    the file has been read.

    Returns fold_map, a dict taking each fold type found in the file to a
    dict from codepoint to fold value.
    """
    fold_map = {}
    with open(UCD_config.UCD_src_dir + "/" + 'CaseFolding.txt') as f:
        for t in f:
            if UCD_skip.match(t): continue  # skip comment and blank lines
            (cp, cp_hi, fields) = parse_data_record(t)
            (fold_type, fold_val) = (fields[0], fields[1])
            if fold_type == 'S' or fold_type == 'C':
                # fold value is guaranteed to be a single codepoint
                property_object_map['scf'].addDataRecord(cp, cp, fold_val)
            elif fold_type == 'F':
                property_object_map['cf'].addDataRecord(cp, cp, fold_val)
            # Every entry, whatever its type, is kept in the returned map.
            fold_map.setdefault(fold_type, {})[cp] = fold_val
    property_object_map['scf'].finalizeProperty()
    property_object_map['cf'].finalizeProperty()
    return fold_map
446
Note: See TracBrowser for help on using the repository browser.