source: icGREP/icgrep-devel/UCD-scripts/UCD_parser.py

Last change on this file was 6192, checked in by cameron, 10 months ago

Updates for Unicode 11.0 plus Emoji properties

File size: 20.0 KB
Line 
1#
2# UCD_parser.py - parsing Unicode Character Database (UCD) files
3#
4# Robert D. Cameron
5# December 28, 2014
6#
7# Licensed under Open Software License 3.0.
8#
9#
10import UCD_config
11from UCD_property_objects import *
12
# Matches the ReadMe.txt line announcing the Unicode version; group 1 is
# the dotted version number.
version_regexp = re.compile(r".*Version\s+([0-9.]*)\s+of the Unicode Standard.*")

def setVersionfromReadMe_txt():
    """Record the Unicode version announced in the UCD ReadMe.txt file.

    Reads ReadMe.txt from UCD_config.UCD_src_dir.  For every line that
    matches version_regexp, the captured version string is stored in
    UCD_config.version and printed.
    """
    # The context manager guarantees the file is closed, even on error.
    with open(UCD_config.UCD_src_dir + "/" + 'ReadMe.txt') as f:
        for t in f:
            m = version_regexp.match(t)
            if m:
                UCD_config.version = m.group(1)
                print("Version %s" % m.group(1))
23
# Characters that are ignored when comparing property names.
trivial_name_char_re = re.compile(r'[-_\s]')
def canonicalize(property_string):
    """Return property_string lower-cased with hyphens, underscores and
    whitespace removed, for use as a lookup key."""
    return trivial_name_char_re.sub('', property_string.lower())
27
28#
29#  Processing files of the UCD
30#
31#  General format for skippable comments, blank lines
UCD_skip = re.compile(r"^#.*$|^\s*$")
33
34#
35#  UCD Property File Format 1: property aliases
36#  PropertyAliases.txt
37#
# Section header comment, e.g. "# Binary Properties"; group 1 is the kind.
UCD_property_section_regexp = re.compile(r"^#\s*([-A-Za-z_0-9]+)\s*Properties\s*$")
# Alias record: group 1 is the property code, group 2 the preferred full
# name, group 3 the remainder of the line up to any trailing comment.
UCD_property_alias_regexp = re.compile(r"^([-A-Za-z_0-9]+)\s*;\s*([-A-Za-z_0-9]+)([^#]*)")

# Section 2.3.3 of UAX #44
Obsolete_Properties = ["na1", "Gr_Link", "Hyphen", "isc", "XO_NFC", "XO_NFD", "XO_NFKC", "XO_NFKD" ,"FC_NFKC"]
Emoji_Properties = ["Emoji", "Emoji_Presentation", "Emoji_Modifier", "Emoji_Modifier_Base", "Emoji_Component", "Extended_Pictographic"]
44
def parse_PropertyAlias_txt():
    """Parse PropertyAliases.txt and build one property object per property.

    The file is organised in sections introduced by "# <Kind> Properties"
    comment lines; the kind of the current section decides which property
    object class is created for each alias record that follows.  Properties
    listed in Obsolete_Properties always get an ObsoletePropertyObject.
    After the file has been read, a BinaryPropertyObject is added for every
    name in Emoji_Properties, keyed by its canonicalized name.

    Returns a pair (property_enum_name_list, property_object_map): the
    property codes in order of definition, and a dict from property code
    to its property object.

    Raises Exception on a non-comment line that is not a valid alias
    record, or on an alias record that appears before any section header.
    """
    property_object_map = {}
    property_enum_name_list = []
    property_kind = None  # kind named by the most recent section header
    with open(UCD_config.UCD_src_dir + "/" + 'PropertyAliases.txt') as f:
        for t in f:
            m = UCD_property_section_regexp.match(t)
            if m:
                property_kind = m.group(1)
            if UCD_skip.match(t): continue  # skip comment and blank lines
            m = UCD_property_alias_regexp.match(t)
            if not m: raise Exception("Unknown property alias syntax: %s" % t)
            if property_kind is None:
                # Previously this surfaced as an UnboundLocalError.
                raise Exception("Property alias before any section header: %s" % t)
            (property_code, prop_preferred_full_name, prop_extra) = (m.group(1), m.group(2), m.group(3))
            property_enum_name_list.append(property_code)
            if property_code in Obsolete_Properties:
                prop_obj = ObsoletePropertyObject()
            elif property_kind == "Binary":
                prop_obj = BinaryPropertyObject()
            elif property_kind in ("Enumerated", "Catalog"):   # Catalog: Age, Block, Script
                prop_obj = EnumeratedPropertyObject()
            elif property_kind == "String":
                if property_code in ["uc", "lc", "tc", "cf"]:
                    # Constructed with the code of the corresponding simple
                    # mapping property ("suc", "slc", "stc", "scf").
                    prop_obj = StringOverridePropertyObject("s" + property_code)
                else:
                    prop_obj = StringPropertyObject()
            elif property_kind == "Numeric":
                prop_obj = NumericPropertyObject()
            elif property_code == "scx":   # Miscellaneous: Script_Extensions
                prop_obj = ExtensionPropertyObject()
            else:
                # All other Miscellaneous properties have string values
                prop_obj = StringPropertyObject()
            prop_obj.setID(property_code, prop_preferred_full_name)
            prop_obj.setAliases(re.findall("[-A-Za-z_0-9]+", prop_extra))
            property_object_map[property_code] = prop_obj
    for p in Emoji_Properties:
        prop_code = canonicalize(p)
        property_enum_name_list.append(prop_code)
        emoji_obj = BinaryPropertyObject()
        emoji_obj.setID(prop_code, p)
        emoji_obj.setAliases([])
        property_object_map[prop_code] = emoji_obj
    return (property_enum_name_list, property_object_map)
90
91
92#
93#  Property Default Value Specifications
94#
#  The UCD uses special comment lines ("@missing specifications") to declare default
#  values for properties.   Examples showing the two common formats are:
97#  (1)  Blocks.txt                    # @missing: 0000..10FFFF; No_Block
98#  (2)  PropertyValueAliases.txt      # @missing: 0000..10FFFF; Case_Folding; <code point>
99#  The general format gives a range of codepoints (generally 0000..10FFFF),
100#  an optional property name (if the file containing the specification defines
101#  many different properties), and the default value.
102#
103#  There are some important default values for different property types:
#  <code point>: This is a default value for certain String properties,
#                indicating the default for a codepoint under the given property
#                is to map to itself.
107#  <none>:       This is a default for certain String properties indicating that
108#                the default value for a code point is the empty string.
#  <script>:     The default value for the ScriptExtensions property is the
#                value of the Script property.
111#  NaN           The default value for numeric property is the NaN (not a number) value.
112#
113
114#  Given a line known to contain such a @missing specification,
115#  parse_missing_spec(data_line) returns a (cp_lo, cp_hi, fields) triple.
116#  Generally, cp_lo = 0 and cp_hi = 0x10FFFF
117#  The list of fields contains one or two entries: an optional
118#  property name and the default value specified for the range.
119#  @missing specifications generally omit the property name when
120#  the file being processed is defined for a single property only.
121#
# Cheap test for "is this comment line a @missing specification?"
UCD_missing_check = re.compile(r"^#\s*@missing:.*")
# Full parse: groups 1 and 2 are the hex range bounds, group 3 is the
# semicolon-separated field text up to any trailing comment.
UCD_missing_regexp = re.compile(r"^#\s*@missing:\s*([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})\s*;\s*([^#]*)(?:#|$)")

def parse_missing_spec(data_line):
    """Parse a "# @missing: lo..hi; ..." line into (cp_lo, cp_hi, fields).

    fields is the list of semicolon-separated values after the range,
    each stripped of surrounding whitespace.

    Raises Exception if the line does not have @missing syntax, or if the
    range is anything other than 0000..10FFFF.
    """
    m = UCD_missing_regexp.match(data_line)
    if not m: raise Exception("UCD missing spec parsing error: " + data_line)
    cp_lo = int(m.group(1), 16)
    cp_hi = int(m.group(2), 16)
    # We may have to restructure in the event that missing specs do not cover the full Unicode range.
    if cp_lo != 0 or cp_hi != 0x10FFFF: raise Exception("Unexpected range error in missing spec: " + data_line)
    fields = [field.strip() for field in m.group(3).split(';')]
    return (cp_lo, cp_hi, fields)
136
137#
138#  Missing specifications and other types of UCD data records often produce
139#  a list of one or two fields which indicate a property and a value.
140#
141#  parse_property_and_value(fields, property_lookup_map) checks that
142#  first of the given fields is indeed a property identifier identified
143#  in the given lookup map, and returns a pair consisting of the
144#  unique property code for the property, plus a corresponding value
145#  (or None, if only one field was given).
146
def parse_property_and_value(fields, property_lookup_map):
    """Resolve a one- or two-field record into (property_code, value).

    fields[0] is canonicalized and looked up in property_lookup_map to get
    the property code.  The value is fields[1] when two fields are given,
    otherwise None.

    Raises Exception if fields is empty, has more than two entries, or
    names a property that is not in property_lookup_map.
    """
    if len(fields) > 2: raise Exception("Too many fields")
    if len(fields) == 0: raise Exception("Expecting at least 1 field")
    canon = canonicalize(fields[0])
    # Report the name as written in the file (the original referenced an
    # undefined variable here, producing a NameError instead).
    if canon not in property_lookup_map: raise Exception("Unexpected name: " + fields[0])
    pcode = property_lookup_map[canon]
    if len(fields) == 1: return (pcode, None)
    return (pcode, fields[1])
155
156#
157#  UCD Property File Format 2: property value aliases
158#  PropertyValueAliases.txt
159#
160#  This file records value aliases for property values for
161#  each enumerated property, with the following additional notes:
162#  (1) The corresponding integer value of the enum constant is
163#      also specified for ccc (second field).
164#  (2) The Age property is a numeric type which has decimal float
165#      values as the enum constants: these won't be legal in enum syntax.
166#  (3) Binary properties also have enumerated values and aliases listed,
167#      although this is redundant, because all binary properties have the
168#      same value space.
169#  (4) @missing lines provide default value information, primarily for some
170#      non-enumerated types
171
172
def initializePropertyValues(property_object_map, property_lookup_map):
    """Load PropertyValueAliases.txt into the given property objects.

    For each "@missing" comment line, the named property's default value
    is set via setDefaultValue.  For each value alias record, the property
    named in the first field is resolved through property_lookup_map; if
    its object is an EnumeratedPropertyObject, the value is registered
    with addPropertyValue(enum, preferred_full_name, aliases).  Records
    for other kinds of property objects are validated but otherwise
    ignored.

    Raises Exception on a malformed record or on a record that names an
    unknown property or a property without a property object.
    """
    value_alias_regexp = re.compile(r"^([-A-Za-z_0-9.]+)\s*;\s*([-A-Za-z_0-9.]+)\s*;\s*([-A-Za-z_0-9.]+)([^#]*)")
    with open(UCD_config.UCD_src_dir + "/" + 'PropertyValueAliases.txt') as f:
        for t in f:
            if UCD_skip.match(t):
                # @missing specifications are comment lines, so they must be
                # recognised before the line is discarded.
                if UCD_missing_check.match(t):
                    (cp_lo, cp_hi, fields) = parse_missing_spec(t)
                    (property_code, default_value) = parse_property_and_value(fields, property_lookup_map)
                    property_object_map[property_code].setDefaultValue(default_value)
                continue  # skip comment and blank lines
            m = value_alias_regexp.match(t)
            if not m: raise Exception("Unknown property value alias syntax: %s" % t)
            prop_code = canonicalize(m.group(1))
            if prop_code not in property_lookup_map: raise Exception("Property code: '%s' is unknown" % prop_code)
            prop_code = property_lookup_map[prop_code]
            if prop_code not in property_object_map: raise Exception("Property object: '%s' is uninitialized" % prop_code)
            po = property_object_map[prop_code]
            # Only enumerated properties record their value aliases.
            if not isinstance(po, EnumeratedPropertyObject): continue
            extra = m.group(4)
            if prop_code == 'ccc':
                # Special case for ccc: second field is enum integer value
                value_enum = m.group(3)
                extra_list = re.findall("[-A-Za-z_0-9.]+", extra)
                value_preferred_full_name = extra_list[0]
                # Treat integer string as an alias
                value_aliases = [m.group(2)] + extra_list[1:]
            elif prop_code == 'age':
                # Special case for age: second field is numeric, third field
                # is enum; treat numeric value as an alias string
                value_enum = m.group(3)
                value_preferred_full_name = m.group(3)
                value_aliases = [m.group(2)] + re.findall("[-A-Za-z_0-9]+", extra)
            else:
                value_enum = m.group(2)
                value_preferred_full_name = m.group(3)
                value_aliases = re.findall("[-A-Za-z_0-9]+", extra)
            po.addPropertyValue(value_enum, value_preferred_full_name, value_aliases)
215
216
217#
218#  UCD Property File Format 3:  codepoint/range -> data record maps
219#  Many files have data records consisting of a codepoint or codepoint range
220#  followed by fields separated by semicolons.
221#
222
UCD_point_regexp = re.compile("^([0-9A-F]{4,6})([^0-9A-F.#][^#]*)(?:#|$)")
UCD_range_regexp = re.compile("^([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})([^#]*)(?:#|$)")

#
# parse_data_record is the generic record parser shared by most UCD data
# files.  A record starts with either a single codepoint or a "lo..hi"
# codepoint range, followed by semicolon-separated fields and an optional
# trailing "#" comment.
#

def parse_data_record(data_line):
    """Split a UCD data line into a (cp_lo, cp_hi, fields) triple.

    cp_lo and cp_hi are the integer bounds of the codepoint range; they
    are equal when the line gives a single codepoint.  fields is the list
    of semicolon-separated values after the codepoint(s), each stripped
    of surrounding whitespace; it is empty when nothing follows.

    Raises Exception if the line starts with neither a codepoint nor a
    range, or if the text after it does not begin with a semicolon.
    """
    point = UCD_point_regexp.match(data_line)
    if point is not None:
        lo = hi = int(point.group(1), 16)
        remainder = point.group(2)
    else:
        span = UCD_range_regexp.match(data_line)
        if span is None:
            raise Exception("UCD data record parsing error: " + data_line)
        lo = int(span.group(1), 16)
        hi = int(span.group(2), 16)
        remainder = span.group(3)
    remainder = remainder.strip()
    if not remainder:
        return (lo, hi, [])
    if not remainder.startswith(';'):
        raise Exception("Field data syntax: " + remainder)
    # Drop the leading separator, then split and trim each field.
    return (lo, hi, [piece.strip() for piece in remainder[1:].split(';')])
256
257
258#  parse_multisection_property_data parses such a file and populates
259#  the property objects for each property through successive calls to
260#  the corresponding addDataRecord method.
261#
def parse_multisection_property_data(pfile, property_object_map, property_lookup_map):
    """Parse a UCD file whose records each name the property they define.

    "@missing" lines set the named property's default value; data records
    are passed to the named property's addDataRecord(cp_lo, cp_hi, value).
    Once the whole file has been read, finalizeProperty is called on every
    property encountered.

    pfile is the file name relative to UCD_config.UCD_src_dir.
    Returns the list of property codes encountered, in order of first
    appearance.
    """
    props = []
    with open(UCD_config.UCD_src_dir + "/" + pfile) as f:
        for t in f:
            # @missing lines are comments, so test for them before skipping.
            if UCD_missing_regexp.match(t):
                (cp_lo, cp_hi, fields) = parse_missing_spec(t)
                (prop_code, dflt) = parse_property_and_value(fields, property_lookup_map)
                property_object_map[prop_code].setDefaultValue(dflt)
                if prop_code not in props: props.append(prop_code)
            elif UCD_skip.match(t):
                continue
            else:
                (cp_lo, cp_hi, fields) = parse_data_record(t)
                (prop_code, v) = parse_property_and_value(fields, property_lookup_map)
                if prop_code not in props: props.append(prop_code)
                property_object_map[prop_code].addDataRecord(cp_lo, cp_hi, v)
    for p in props:
        property_object_map[p].finalizeProperty()
    return props
282
283
284#
285#   Some UCD files are defined for a single property.   
286#   parse_property_data deals with such a file, given the property
287#   object to populate and the file root.
288#
289
def parse_property_data(property_object, pfile):
    """Populate property_object from a UCD file that defines one property.

    "@missing" lines must carry exactly one field, which becomes the
    default value.  Each data record is passed to addDataRecord with its
    first field as the value; a BinaryPropertyObject record with no fields
    is recorded with value None.  finalizeProperty is called at the end.

    pfile is the file name relative to UCD_config.UCD_src_dir.
    Raises Exception if an @missing line does not have exactly one field.
    """
    with open(UCD_config.UCD_src_dir + "/" + pfile) as f:
        for t in f:
            # @missing lines are comments, so test for them before skipping.
            if UCD_missing_regexp.match(t):
                (cp_lo, cp_hi, fields) = parse_missing_spec(t)
                if len(fields) != 1: raise Exception("Expecting exactly 1 field")
                property_object.setDefaultValue(fields[0])
            elif UCD_skip.match(t):
                continue
            else:
                (cp_lo, cp_hi, fields) = parse_data_record(t)
                if isinstance(property_object, BinaryPropertyObject) and len(fields) == 0:
                    property_object.addDataRecord(cp_lo, cp_hi, None)
                else:
                    property_object.addDataRecord(cp_lo, cp_hi, fields[0])
    property_object.finalizeProperty()
307
308
309#
310#   Some UCD files are organized to support multiple properties with one
311#   property per column.
312#   parse_multicolumn_property_data deals with such files given a list of
313#   property codes.
314#
315
def parse_multicolumn_property_data(pfile, property_object_map, property_lookup_map, prop_code_list):
    """Parse a UCD file that holds one property per column.

    prop_code_list gives the property code for each column, in order.
    Every data record must have exactly len(prop_code_list) fields.  Each
    non-empty field whose property code is present in property_object_map
    is passed to that property's addDataRecord.  After the file has been
    read, finalizeProperty is called for each listed property that is
    present in property_object_map.

    pfile is the file name relative to UCD_config.UCD_src_dir.
    property_lookup_map is accepted but not used by this function.
    Raises Exception if a record has the wrong number of fields.
    """
    with open(UCD_config.UCD_src_dir + "/" + pfile) as f:
        for t in f:
            if UCD_skip.match(t):
                continue
            (cp_lo, cp_hi, fields) = parse_data_record(t)
            if len(fields) != len(prop_code_list): raise Exception("Multicolumn field count mismatch, expecting %i: " % len(prop_code_list) + t)
            for (code, value) in zip(prop_code_list, fields):
                if value != '' and code in property_object_map:
                    property_object_map[code].addDataRecord(cp_lo, cp_hi, value)
    for p in prop_code_list:
        # Test each property itself.  The original tested prop_code_list[i]
        # with the index left over from the parsing loop, so the check
        # applied to the wrong property (or failed with a NameError when
        # the file had no data records).
        if p in property_object_map:
            property_object_map[p].finalizeProperty()
332
# One UnicodeData.txt record: a codepoint followed by 14 semicolon-separated
# fields (15 groups in total).
UnicodeData_txt_regexp = re.compile("^([0-9A-F]{4,6});([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);(.*)$")

# A name field in angle brackets, e.g. "<control>".
NonName_regexp = re.compile("<([^>]*)>")
# A range delimiter name, e.g. "<CJK Ideograph, First>".
NameRange_regexp = re.compile("<([^,]*), (First|Last)>")

#  Parse a decomposition mapping field in one of two forms:
#  (a) compatibility mappings:  "<" decomp_type:[A-Za-z]* ">" {codepoint}
#  (b) canonical mappings:  {codepoint}
compatibility_regexp = re.compile(r"^<([^>]*)>\s*([0-9A-F ]*)$")
def parse_decomposition(s):
    """Split a decomposition field into (decomp_type, mapping).

    For a field of the form "<type> codepoints", decomp_type is the text
    inside the angle brackets and mapping is the codepoint text after it.
    Any other field is returned unchanged as the mapping, with decomp_type
    "Canonical".
    """
    m = compatibility_regexp.match(s)
    if m:
        return (m.group(1), m.group(2))
    return ("Canonical", s)
351
def parse_UnicodeData_txt(property_object_map):
    """Load UnicodeData.txt into the name, mapping and numeric properties.

    For each record, data is added to these entries of property_object_map:
      'na'   the name, unless the name field is of the form "<...>"
      'dm'   the mapping part of a non-empty decomposition field
      'na1'  a non-empty Unicode_1_Name field
      'suc', 'slc'  non-empty simple uppercase / lowercase mappings
      'stc'  the simple titlecase mapping, falling back to the uppercase
             mapping when the titlecase field is empty
      'nv'   each non-empty decimal, digit and numeric value field
    Finally finalizeProperty is called on 'na', 'na1', 'isc', 'dm', 'slc',
    'suc', 'stc' and 'nv'.

    Raises Exception on a record that does not match the expected syntax,
    or on a "<..., Last>" name with no preceding "<..., First>".
    """
    range_records = []       # (first_cp, last_cp, range_name, gc); collected but not used here
    name_range_starts = {}   # range name -> codepoint of its "First" record
    with open(UCD_config.UCD_src_dir + "/UnicodeData.txt") as f:
        for t in f:
            if UCD_skip.match(t):
                continue  # skip comment and blank lines
            m = UnicodeData_txt_regexp.match(t)
            if not m: raise Exception("Unknown syntax: %s" % t)
            # Regex group n holds UnicodeData.txt field n-1 (UAX #44).
            # Several fields are unpacked only to document the layout.
            (cp, name, gc) = (int(m.group(1), 16), m.group(2), m.group(3))
            (ccc, bidic, decomp, bidim) = (m.group(4), m.group(5), m.group(6), m.group(10))
            (decval, digitval, numval) = (m.group(7), m.group(8), m.group(9))
            # Unicode_1_Name and ISO_Comment follow Bidi_Mirrored (group 10).
            # The original read them from groups 10 and 11, so 'na1' was
            # fed the Bidi_Mirrored flag for every codepoint.
            (na1, isc) = (m.group(11), m.group(12))
            (upper, lower, title) = (m.group(13), m.group(14), m.group(15))
            rangeMatch = NameRange_regexp.match(name)
            if rangeMatch:
                rangeName = rangeMatch.group(1)
                if rangeMatch.group(2) == 'First': name_range_starts[rangeName] = cp
                if rangeMatch.group(2) == 'Last':
                    if rangeName not in name_range_starts: raise Exception("UnicodeData range end encountered without prior range start: %s" % t)
                    range_records.append((name_range_starts[rangeName], cp, rangeName, gc))
            if not NonName_regexp.match(name):
                property_object_map['na'].addDataRecord(cp, cp, name)
            if decomp != '':
                (decomp_type, mapping) = parse_decomposition(decomp)
                property_object_map['dm'].addDataRecord(cp, cp, mapping)
            if na1 != '':
                property_object_map['na1'].addDataRecord(cp, cp, na1)
            if upper != '':
                property_object_map['suc'].addDataRecord(cp, cp, upper)
            if lower != '':
                property_object_map['slc'].addDataRecord(cp, cp, lower)
            if title != '':
                property_object_map['stc'].addDataRecord(cp, cp, title)
            elif upper != '':
                # No explicit titlecase mapping: use the uppercase mapping.
                property_object_map['stc'].addDataRecord(cp, cp, upper)
            if decval != '':
                property_object_map['nv'].addDataRecord(cp, cp, decval)
            if digitval != '':
                property_object_map['nv'].addDataRecord(cp, cp, digitval)
            if numval != '':
                property_object_map['nv'].addDataRecord(cp, cp, numval)

    for code in ('na', 'na1', 'isc', 'dm', 'slc', 'suc', 'stc', 'nv'):
        property_object_map[code].finalizeProperty()
405
def parse_SpecialCasing_txt(property_object_map):
    """Load the unconditional mappings of SpecialCasing.txt.

    Only records with exactly four fields whose fourth field is empty are
    used; all others are skipped.  Non-empty lower, title and upper
    mappings are added to the 'lc', 'tc' and 'uc' entries of
    property_object_map, which are then finalized.
    """
    with open(UCD_config.UCD_src_dir + "/SpecialCasing.txt") as f:
        for t in f:
            if UCD_skip.match(t):
                continue  # skip comment and blank lines
            (cp, cp_hi, fields) = parse_data_record(t)
            if len(fields) != 4: continue   #  Ignore context-dependent casing
            if fields[3] != '': continue
            (lower, title, upper) = (fields[0], fields[1], fields[2])
            if lower != '':
                property_object_map['lc'].addDataRecord(cp, cp, lower)
            if title != '':
                property_object_map['tc'].addDataRecord(cp, cp, title)
            if upper != '':
                property_object_map['uc'].addDataRecord(cp, cp, upper)
    for code in ('lc', 'tc', 'uc'):
        property_object_map[code].finalizeProperty()
425
426
427# CaseFolding.txt has four types of fold entries:
428# S, C, F, T:  Simple, Common, Full and Turkic. 
429# The SimpleCaseFold property is the set of mappings S+C,
430# The FullCaseFold property is the set F+C
431# There may be multiple entries per codepoint
432
def parse_CaseFolding_txt(property_object_map):
    """Load CaseFolding.txt into the 'scf' and 'cf' property objects.

    Entries of type 'S' or 'C' are added to property_object_map['scf'];
    entries of type 'F' are added to property_object_map['cf'].  Entries
    of any other type are recorded only in the returned map.  Both
    property objects are finalized before returning.

    Returns fold_map, a dict from fold type to a dict mapping each
    codepoint to its fold value (the last entry wins if a codepoint
    repeats within one fold type).
    """
    fold_map = {}
    with open(UCD_config.UCD_src_dir + "/" + 'CaseFolding.txt') as f:
        for t in f:
            if UCD_skip.match(t): continue  # skip comment and blank lines
            (cp, cp_hi, fields) = parse_data_record(t)
            (fold_type, fold_val) = (fields[0], fields[1])
            if fold_type in ('S', 'C'):
                # fold value is guaranteed to be a single codepoint
                property_object_map['scf'].addDataRecord(cp, cp, fold_val)
            elif fold_type == 'F':
                property_object_map['cf'].addDataRecord(cp, cp, fold_val)
            fold_map.setdefault(fold_type, {})[cp] = fold_val
    property_object_map['scf'].finalizeProperty()
    property_object_map['cf'].finalizeProperty()
    return fold_map
452
Note: See TracBrowser for help on using the repository browser.