source: icGREP/icgrep-devel/UCD-scripts/UCD_parser.py @ 5674

Last change on this file since 5674 was 5674, checked in by cameron, 22 months ago

Fix upper vs title case confusion

File size: 19.7 KB
Line 
1#
2# UCD_parser.py - parsing Unicode Character Database (UCD) files
3#
4# Robert D. Cameron
5# December 28, 2014
6#
7# Licensed under Open Software License 3.0.
8#
9#
10import re, string, os.path
11import UCD_config
12from unicode_set import *
13from UCD_property_objects import *
14
# Raw string: keeps \s as a regex escape rather than a (deprecated) string escape.
version_regexp = re.compile(r".*Version\s+([0-9.]*)\s+of the Unicode Standard.*")

def setVersionfromReadMe_txt():
    """Record the UCD version announced in ReadMe.txt.

    Reads ReadMe.txt from UCD_config.UCD_src_dir.  For every line that
    matches version_regexp, the captured version string is stored in
    UCD_config.version and echoed to standard output.
    """
    # The with-statement closes the file even if an exception is raised.
    with open(UCD_config.UCD_src_dir + "/" + 'ReadMe.txt') as f:
        for t in f:
            m = version_regexp.match(t)
            if m:
                UCD_config.version = m.group(1)
                print("Version %s" % m.group(1))
25
# Characters ignored when comparing names: hyphen, underscore, whitespace.
trivial_name_char_re = re.compile(r'[-_\s]')
def canonicalize(property_string):
    """Return property_string lower-cased with all hyphens, underscores
    and whitespace characters removed."""
    return trivial_name_char_re.sub('', property_string.lower())
29
30#
31#  Processing files of the UCD
32#
33#  General format for skippable comments, blank lines
# Matches a line that starts with '#' or that contains only whitespace.
UCD_skip = re.compile(r"^#.*$|^\s*$")
35
36#
37#  UCD Property File Format 1: property aliases
38#  PropertyAliases.txt
39#
UCD_property_section_regexp = re.compile(r"^#\s*([-A-Za-z_0-9]+)\s*Properties\s*$")
UCD_property_alias_regexp = re.compile(r"^([-A-Za-z_0-9]+)\s*;\s*([-A-Za-z_0-9]+)([^#]*)")

# Section 2.3.3 of UAX #44
Obsolete_Properties = ["na1", "Gr_Link", "Hyphen", "isc", "XO_NFC", "XO_NFD", "XO_NFKC", "XO_NFKD" ,"FC_NFKC"]

def parse_PropertyAlias_txt():
    """Parse PropertyAliases.txt and create one property object per property.

    The file is read from UCD_config.UCD_src_dir.  Comment lines of the
    form "# <Kind> Properties" set the kind applied to the property lines
    that follow; the kind (together with the Obsolete_Properties list)
    selects the class of property object created.

    Returns a pair (property_enum_name_list, property_object_map):
    the property codes in file order, and a dict mapping each property
    code to its property object, with ID and aliases already set.

    Raises Exception for a non-comment line that is not a property alias
    record, or for a non-obsolete property that appears before any
    section header.
    """
    property_object_map = {}
    property_enum_name_list = []
    property_kind = None   # no section header seen yet
    with open(UCD_config.UCD_src_dir + "/" + 'PropertyAliases.txt') as f:
        for t in f:
            # Section headers are themselves comment lines, so check for
            # them before discarding comments.
            m = UCD_property_section_regexp.match(t)
            if m:
                property_kind = m.group(1)
            if UCD_skip.match(t): continue  # skip comment and blank lines
            m = UCD_property_alias_regexp.match(t)
            if not m: raise Exception("Unknown property alias syntax: %s" % t)
            (property_code, prop_preferred_full_name, prop_extra) = (m.group(1), m.group(2), m.group(3))
            property_enum_name_list.append(property_code)
            if property_code in Obsolete_Properties:
                po = ObsoletePropertyObject()
            elif property_kind is None:
                # Previously this surfaced as an UnboundLocalError.
                raise Exception("Property alias found before any section header: %s" % t)
            elif property_kind == "Binary":
                po = BinaryPropertyObject()
            elif property_kind == "Enumerated":
                po = EnumeratedPropertyObject()
            elif property_kind == "Catalog":   # Age, Block, Script
                po = EnumeratedPropertyObject()
            elif property_kind == "String":
                if property_code in ["uc", "lc", "tc", "cf"]:
                    # Constructed with the code of the corresponding simple
                    # property ("suc", "slc", "stc", "scf").
                    po = StringOverridePropertyObject("s" + property_code)
                else:
                    po = StringPropertyObject()
            elif property_kind == "Numeric":
                po = NumericPropertyObject()
            else:  # Miscellaneous properties
                if property_code == "scx":
                    po = ExtensionPropertyObject()
                else:
                    # All other Miscellaneous properties have string values
                    po = StringPropertyObject()
            po.setID(property_code, prop_preferred_full_name)
            # Everything after the second field, up to any '#', is a list of aliases.
            po.setAliases(re.findall("[-A-Za-z_0-9]+", prop_extra))
            property_object_map[property_code] = po
    return (property_enum_name_list, property_object_map)
85
86
87#
88#  Property Default Value Specifications
89#
90#  The UCD uses special comment lines ("@missing specifications") to declare default
91#  values for properties.   Examples showing the two common formats are:
92#  (1)  Blocks.txt                    # @missing: 0000..10FFFF; No_Block
93#  (2)  PropertyValueAliases.txt      # @missing: 0000..10FFFF; Case_Folding; <code point>
94#  The general format gives a range of codepoints (generally 0000..10FFFF),
95#  an optional property name (if the file containing the specification defines
96#  many different properties), and the default value.
97#
98#  There are some important default values for different property types:
99#  <codepoint>:  This is a default value for certain String properties,
100#                indicating the default for a codepoint under the given property
101#                is to map to itself.
102#  <none>:       This is a default for certain String properties indicating that
103#                the default value for a code point is the empty string.
104#  <script>:     The default value for the ScriptExtensions property is the
105#                value of the Script property.
106#  NaN           The default value for numeric property is the NaN (not a number) value.
107#
108
109#  Given a line known to contain such a @missing specification,
110#  parse_missing_spec(data_line) returns a (cp_lo, cp_hi, fields) triple.
111#  Generally, cp_lo = 0 and cp_hi = 0x10FFFF
112#  The list of fields contains one or two entries: an optional
113#  property name and the default value specified for the range.
114#  @missing specifications generally omit the property name when
115#  the file being processed is defined for a single property only.
116#
UCD_missing_check = re.compile(r"^#\s*@missing:.*")
UCD_missing_regexp = re.compile(r"^#\s*@missing:\s*([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})\s*;\s*([^#]*)(?:#|$)")

def parse_missing_spec(data_line):
    """Parse an "@missing" comment line into (cp_lo, cp_hi, fields).

    cp_lo and cp_hi are the integer bounds of the code point range;
    fields is the list of semicolon-separated values that follow the
    range, each stripped of surrounding whitespace.

    Raises Exception if data_line does not match UCD_missing_regexp or
    if the range is anything other than 0000..10FFFF.
    """
    m = UCD_missing_regexp.match(data_line)
    if not m: raise Exception("UCD missing spec parsing error: " + data_line)
    cp_lo = int(m.group(1), 16)
    cp_hi = int(m.group(2), 16)
    # We may have to restructure in the event that missing specs do not cover the full Unicode range.
    if cp_lo != 0 or cp_hi != 0x10FFFF: raise Exception("Unexpected range error in missing spec: " + data_line)
    fields = [field.strip() for field in m.group(3).split(';')]
    return (cp_lo, cp_hi, fields)
131
132#
133#  Missing specifications and other types of UCD data records often produce
134#  a list of one or two fields which indicate a property and a value.
135#
136#  parse_property_and_value(fields, property_lookup_map) checks that
137#  first of the given fields is indeed a property identifier identified
138#  in the given lookup map, and returns a pair consisting of the
139#  unique property code for the property, plus a corresponding value
140#  (or None, if only one field was given).
141
def parse_property_and_value(fields, property_lookup_map):
    """Resolve a [property] or [property, value] field list.

    fields[0] is canonicalized and looked up in property_lookup_map to
    obtain the property code.  Returns (property_code, value), where
    value is fields[1], or None when only one field was supplied.

    Raises Exception if fields is empty, has more than two entries, or
    names a property that is absent from property_lookup_map.
    """
    if len(fields) > 2: raise Exception("Too many fields")
    if len(fields) == 0: raise Exception("Expecting at least 1 field")
    canon = canonicalize(fields[0])
    # Report the name as given in the file.  The original message referenced
    # an undefined variable, so this path raised NameError instead.
    if canon not in property_lookup_map: raise Exception("Unexpected name: " + fields[0])
    pcode = property_lookup_map[canon]
    if len(fields) == 1: return (pcode, None)
    return (pcode, fields[1])
150
151#
152#  UCD Property File Format 2: property value aliases
153#  PropertyValueAliases.txt
154#
155#  This file records value aliases for property values for
156#  each enumerated property, with the following additional notes:
157#  (1) The corresponding integer value of the enum constant is
158#      also specified for ccc (second field).
159#  (2) The Age property is a numeric type which has decimal float
160#      values as the enum constants: these won't be legal in enum syntax.
161#  (3) Binary properties also have enumerated values and aliases listed,
162#      although this is redundant, because all binary properties have the
163#      same value space.
164#  (4) @missing lines provide default value information, primarily for some
165#      non-enumerated types
166
167
def initializePropertyValues(property_object_map, property_lookup_map):
    """Load property value names and defaults from PropertyValueAliases.txt.

    property_object_map maps property codes to property objects;
    property_lookup_map maps canonicalized property names to property codes.

    "@missing" comment lines set the default value of the property they
    name.  Each value alias record is parsed, and for properties whose
    object is an EnumeratedPropertyObject the value is registered through
    addPropertyValue(enum, preferred_full_name, aliases).

    Raises Exception for a record that cannot be parsed or that names an
    unknown or uninitialized property.
    """
    UCD_property_value_alias_regexp = re.compile(r"^([-A-Za-z_0-9.]+)\s*;\s*([-A-Za-z_0-9.]+)\s*;\s*([-A-Za-z_0-9.]+)([^#]*)")
    with open(UCD_config.UCD_src_dir + "/" + 'PropertyValueAliases.txt') as f:
        for t in f:
            if UCD_skip.match(t):
                # @missing specifications are comment lines, so they are
                # recognized here before the line is discarded.
                if UCD_missing_check.match(t):
                    (cp_lo, cp_hi, fields) = parse_missing_spec(t)
                    (property_code, default_value) = parse_property_and_value(fields, property_lookup_map)
                    property_object_map[property_code].setDefaultValue(default_value)
                continue  # skip comment and blank lines
            m = UCD_property_value_alias_regexp.match(t)
            if not m: raise Exception("Unknown property value alias syntax: %s" % t)
            prop_code = canonicalize(m.group(1))
            if not prop_code in property_lookup_map: raise Exception("Property code: '%s' is unknown" % prop_code)
            prop_code = property_lookup_map[prop_code]
            if not prop_code in property_object_map: raise Exception("Property object: '%s' is uninitialized" % prop_code)
            po = property_object_map[prop_code]
            extra = m.group(4)
            # Special case for ccc: second field is enum integer value
            if prop_code == 'ccc':
                value_enum = m.group(3)
                extra_list = re.findall("[-A-Za-z_0-9.]+", extra)
                value_preferred_full_name = extra_list[0]
                # Treat integer string as an alias
                value_aliases = [m.group(2)] + extra_list[1:]
            # Special case for age: second field is numeric, third field is enum
            # treat numeric value as an alias string
            elif prop_code == 'age':
                value_enum = m.group(3)
                value_preferred_full_name = m.group(3)
                value_aliases = [m.group(2)] + re.findall("[-A-Za-z_0-9]+", extra)
            else:
                value_enum = m.group(2)
                value_preferred_full_name = m.group(3)
                value_aliases = re.findall("[-A-Za-z_0-9]+", extra)
            # Only enumerated property objects record their value names.
            if not isinstance(po, EnumeratedPropertyObject): continue
            po.addPropertyValue(value_enum, value_preferred_full_name, value_aliases)
210
211
212#
213#  UCD Property File Format 3:  codepoint/range -> data record maps
214#  Many files have data records consisting of a codepoint or codepoint range
215#  followed by fields separated by semicolons.
216#
217
UCD_point_regexp = re.compile("^([0-9A-F]{4,6})([^0-9A-F.#][^#]*)(?:#|$)")
UCD_range_regexp = re.compile("^([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})([^#]*)(?:#|$)")

#
# parse_data_record is the generic line parser for UCD data files.
# The line must begin with a single hex codepoint or a lo..hi range.
# The result is a (cp_lo, cp_hi, fields) triple giving the low and high
# codepoints (equal for a single codepoint) and the list of field values
# that follow, split on semicolons with surrounding whitespace removed.
# Anything from the first '#' onwards is ignored.
# An Exception is raised if the line starts with neither form, or if
# non-empty field data does not begin with a semicolon.
#
def parse_data_record(data_line):
    point = UCD_point_regexp.match(data_line)
    if point:
        first = last = int(point.group(1), 16)
        rest = point.group(2)
    else:
        span = UCD_range_regexp.match(data_line)
        if span is None:
            raise Exception("UCD data record parsing error: " + data_line)
        first = int(span.group(1), 16)
        last = int(span.group(2), 16)
        rest = span.group(3)
    rest = rest.strip()
    if not rest:
        # A bare codepoint or range with no fields at all.
        return (first, last, [])
    if not rest.startswith(';'):
        raise Exception("Field data syntax: " + rest)
    return (first, last, [piece.strip() for piece in rest[1:].split(';')])
251
252
253#  parse_multisection_property_data parses such a file and populates
254#  the property objects for each property through successive calls to
255#  the corresponding addDataRecord method.
256#
def parse_multisection_property_data(pfile, property_object_map, property_lookup_map):
    """Parse a UCD file whose records each name the property they belong to.

    pfile is the file name within UCD_config.UCD_src_dir.  "@missing"
    lines set the default value of the named property; every data record
    is passed to the addDataRecord method of the property named in its
    first field.  Each property encountered is finalized after the whole
    file has been read.

    Returns the list of property codes encountered, in order of first
    appearance.
    """
    props = []
    with open(UCD_config.UCD_src_dir + "/" + pfile) as f:
        for t in f:
            # @missing lines are comments, so test for them before UCD_skip.
            if UCD_missing_regexp.match(t):
                (cp_lo, cp_hi, fields) = parse_missing_spec(t)
                (prop_code, dflt) = parse_property_and_value(fields, property_lookup_map)
                property_object_map[prop_code].setDefaultValue(dflt)
                if not prop_code in props: props.append(prop_code)
            elif UCD_skip.match(t):
                continue
            else:
                (cp_lo, cp_hi, fields) = parse_data_record(t)
                (prop_code, v) = parse_property_and_value(fields, property_lookup_map)
                if not prop_code in props: props.append(prop_code)
                property_object_map[prop_code].addDataRecord(cp_lo, cp_hi, v)
    for p in props:
        property_object_map[p].finalizeProperty()
    return props
277
278
279#
280#   Some UCD files are defined for a single property.   
281#   parse_property_data deals with such a file, given the property
282#   object to populate and the file root.
283#
284
def parse_property_data(property_object, pfile):
    """Populate property_object from a single-property UCD file.

    pfile is the file name within UCD_config.UCD_src_dir.  An "@missing"
    line must carry exactly one field, which becomes the default value.
    Each data record is added with its first field as the value; for a
    BinaryPropertyObject a record without fields is added with value None.
    The property is finalized once the file has been read.

    Raises Exception if an "@missing" line does not have exactly one field.
    """
    with open(UCD_config.UCD_src_dir + "/" + pfile) as f:
        for t in f:
            # @missing lines are comments, so test for them before UCD_skip.
            if UCD_missing_regexp.match(t):
                (cp_lo, cp_hi, fields) = parse_missing_spec(t)
                if len(fields) != 1: raise Exception("Expecting exactly 1 field")
                property_object.setDefaultValue(fields[0])
            elif UCD_skip.match(t):
                continue
            else:
                (cp_lo, cp_hi, fields) = parse_data_record(t)
                if isinstance(property_object, BinaryPropertyObject) and len(fields) == 0:
                    property_object.addDataRecord(cp_lo, cp_hi, None)
                else:
                    property_object.addDataRecord(cp_lo, cp_hi, fields[0])
    property_object.finalizeProperty()
302
303
304#
305#   Some UCD files are organized to support multiple properties with one
306#   property per column.
307#   parse_multicolumn_property_data deals with such files given a list of
308#   property codes.
309#
310
def parse_multicolumn_property_data(pfile, property_object_map, property_lookup_map, prop_code_list):
    """Parse a UCD file holding one property per column.

    pfile is the file name within UCD_config.UCD_src_dir.  prop_code_list
    gives the property code for each column, in order.  Every non-empty
    field whose column's property code is present in property_object_map
    is added to that property with addDataRecord.  Afterwards each listed
    property that is present in property_object_map is finalized.

    property_lookup_map is accepted for signature compatibility with the
    other parsers and is not used here.

    Raises Exception if a record's field count differs from
    len(prop_code_list).
    """
    with open(UCD_config.UCD_src_dir + "/" + pfile) as f:
        for t in f:
            if UCD_skip.match(t):
                continue
            (cp_lo, cp_hi, fields) = parse_data_record(t)
            if len(fields) != len(prop_code_list): raise Exception("Multicolumn field count mismatch, expecting %i: " % len(prop_code_list) + t)
            for (prop_code, value) in zip(prop_code_list, fields):
                if value != '' and prop_code in property_object_map:
                    property_object_map[prop_code].addDataRecord(cp_lo, cp_hi, value)
    for p in prop_code_list:
        # Test the property being finalized.  The original tested
        # prop_code_list[i] using the index left over from the parsing loop.
        if p in property_object_map:
            property_object_map[p].finalizeProperty()
327
# A UnicodeData.txt record: a hex codepoint, thirteen semicolon-terminated
# fields, and a final field running to end of line (15 groups in total).
UnicodeData_txt_regexp = re.compile("^([0-9A-F]{4,6});" + "([^;]*);" * 13 + "(.*)$")

# A name field wrapped in angle brackets, i.e. not an ordinary character name.
NonName_regexp = re.compile("<([^>]*)>")
# An angle-bracketed name marking the First or Last codepoint of a named range.
NameRange_regexp = re.compile("<([^,]*), (First|Last)>")
332
333#  Parse a decomposition mapping field in one of two forms:
334#  (a) compatibility mappings:  "<" decomp_type:[A-Za-z]* ">" {codepoint}
335#  (b) canonical mappings:  {codepoint} 
compatibility_regexp = re.compile(r"^<([^>]*)>\s*([0-9A-F ]*)$")
def parse_decomposition(s):
    """Split a decomposition mapping field into (decomp_type, mapping).

    If s starts with an angle-bracketed tag followed only by hex digits
    and spaces, the tag text is the type and the remainder is the mapping.
    Otherwise the type is "Canonical" and the mapping is s unchanged.
    """
    m = compatibility_regexp.match(s)
    if m:
        return (m.group(1), m.group(2))
    return ("Canonical", s)
346
def parse_UnicodeData_txt(property_object_map):
    """Parse UnicodeData.txt and populate the properties it defines.

    Data records are added to the 'na', 'dm', 'na1', 'suc', 'slc', 'stc'
    and 'nv' entries of property_object_map; those properties and 'isc'
    are finalized once the file has been read.

    Raises Exception for a line that does not match
    UnicodeData_txt_regexp, or for a "<..., Last>" range marker that has
    no preceding "<..., First>" marker of the same name.
    """
    # NOTE(review): range_records is filled in but never read or returned here.
    range_records = []
    name_range_starts = {}
    with open(UCD_config.UCD_src_dir + "/UnicodeData.txt") as f:
        for t in f:
            if UCD_skip.match(t):
                continue  # skip comment and blank lines
            m = UnicodeData_txt_regexp.match(t)
            if not m: raise Exception("Unknown syntax: %s" % t)
            (cp, name, gc) = (int(m.group(1), 16), m.group(2), m.group(3))
            decomp = m.group(6)
            (decval, digitval, numval) = (m.group(7), m.group(8), m.group(9))
            # Group 10 is Bidi_Mirrored; Unicode_1_Name is group 11 (and
            # ISO_Comment group 12).  The original read na1 from group 10.
            na1 = m.group(11)
            (upper, lower, title) = (m.group(13), m.group(14), m.group(15))
            rangeMatch = NameRange_regexp.match(name)
            if rangeMatch:
                rangeName = rangeMatch.group(1)
                print(rangeName, rangeMatch.group(2))
                if rangeMatch.group(2) == 'First': name_range_starts[rangeName] = cp
                if rangeMatch.group(2) == 'Last':
                    if not rangeName in name_range_starts: raise Exception("UnicodeData range end encountered without prior range start: %s" % t)
                    range_records.append((name_range_starts[rangeName], cp, rangeName, gc))
            # Angle-bracketed entries are not recorded as names.
            if not NonName_regexp.match(name):
                property_object_map['na'].addDataRecord(cp, cp, name)
            if not decomp == '':
                (decomp_type, mapping) = parse_decomposition(decomp)
                property_object_map['dm'].addDataRecord(cp, cp, mapping)
            if not na1 == '':
                property_object_map['na1'].addDataRecord(cp, cp, na1)
            if not upper == '':
                property_object_map['suc'].addDataRecord(cp, cp, upper)
            if not lower == '':
                property_object_map['slc'].addDataRecord(cp, cp, lower)
            # An empty titlecase field falls back to the uppercase mapping.
            if not title == '':
                property_object_map['stc'].addDataRecord(cp, cp, title)
            elif not upper == '':
                property_object_map['stc'].addDataRecord(cp, cp, upper)
            # Each non-empty numeric field is recorded under 'nv'.
            if not decval == '':
                property_object_map['nv'].addDataRecord(cp, cp, decval)
            if not digitval == '':
                property_object_map['nv'].addDataRecord(cp, cp, digitval)
            if not numval == '':
                property_object_map['nv'].addDataRecord(cp, cp, numval)

    property_object_map['na'].finalizeProperty()
    property_object_map['na1'].finalizeProperty()
    property_object_map['isc'].finalizeProperty()
    property_object_map['dm'].finalizeProperty()
    property_object_map['slc'].finalizeProperty()
    property_object_map['suc'].finalizeProperty()
    property_object_map['stc'].finalizeProperty()
    property_object_map['nv'].finalizeProperty()
401
def parse_SpecialCasing_txt(property_object_map):
    """Parse SpecialCasing.txt and populate the 'lc', 'tc' and 'uc' properties.

    Only records with exactly four fields, the last of them empty, are
    used; all other records are skipped as context-dependent casing.
    The first three fields are the lower, title and upper case mappings;
    each non-empty mapping is added to the corresponding property.
    The three properties are finalized once the file has been read.
    """
    with open(UCD_config.UCD_src_dir + "/SpecialCasing.txt") as f:
        for t in f:
            if UCD_skip.match(t):
                continue  # skip comment and blank lines
            (cp, cp_hi, fields) = parse_data_record(t)
            if len(fields) != 4: continue   #  Ignore context-dependent casing
            if fields[3] != '': continue
            (lower, title, upper) = (fields[0], fields[1], fields[2])
            if not lower == '':
                property_object_map['lc'].addDataRecord(cp, cp, lower)
            if not title == '':
                property_object_map['tc'].addDataRecord(cp, cp, title)
            if not upper == '':
                property_object_map['uc'].addDataRecord(cp, cp, upper)
    property_object_map['lc'].finalizeProperty()
    property_object_map['tc'].finalizeProperty()
    property_object_map['uc'].finalizeProperty()
421
422
423# CaseFolding.txt has four types of fold entries:
424# S, C, F, T:  Simple, Common, Full and Turkic. 
425# The SimpleCaseFold property is the set of mappings S+C,
426# The FullCaseFold property is the set F+C
427# There may be multiple entries per codepoint
428
def parse_CaseFolding_txt(property_object_map):
    """Parse CaseFolding.txt and populate the 'scf' and 'cf' properties.

    The first field of each record is the fold type and the second the
    fold value.  Records of type 'S' or 'C' are added to 'scf'; records
    of type 'F' are added to 'cf'.  Both properties are finalized once
    the file has been read.

    Returns fold_map, a dict mapping each fold type found in the file to
    a dict from codepoint to fold value; a later record for the same type
    and codepoint replaces an earlier one.
    """
    fold_map = {}
    with open(UCD_config.UCD_src_dir + "/" + 'CaseFolding.txt') as f:
        for t in f:
            if UCD_skip.match(t): continue  # skip comment and blank lines
            (cp, cp_hi, fields) = parse_data_record(t)
            (fold_type, fold_val) = (fields[0], fields[1])
            if not fold_type in fold_map: fold_map[fold_type] = {}
            if fold_type == 'S' or fold_type == 'C':
                # fold value is guaranteed to be a single codepoint
                property_object_map['scf'].addDataRecord(cp, cp, fold_val)
            elif fold_type == 'F':
                # NOTE(review): 'C' entries are not added to 'cf' here;
                # presumably the 'cf' object falls back to 'scf' - confirm.
                property_object_map['cf'].addDataRecord(cp, cp, fold_val)
            fold_map[fold_type][cp] = fold_val
    property_object_map['scf'].finalizeProperty()
    property_object_map['cf'].finalizeProperty()
    return fold_map
448
Note: See TracBrowser for help on using the repository browser.