Changeset 4367


Ignore:
Timestamp:
Dec 28, 2014, 11:17:11 AM (4 years ago)
Author:
cameron
Message:

Refactoring to separate UCD parsing from header file generation.

Location:
proto/charsetcompiler/UCD
Files:
1 added
1 edited

Legend:

Unmodified
Added
Removed
  • proto/charsetcompiler/UCD/UCD_properties.py

    r4365 r4367  
    1212import re, string, os.path, cformat
    1313from unicode_set import *
     14from UCD_parser import *
    1415
    1516UCD_dir = "7.0.0"
    16 
    17 #
    18 #  Processing files of the UCD
    19 #
    20 #  General format for skippable comments, blank lines
    21 UCD_skip = re.compile("^#.*$|^\s*$")
    22 
    23 #
    24 #  UCD Property File Format 1: property aliases
    25 #  PropertyAliases.txt
    26 #
    27 UCD_property_section_regexp = re.compile("^#\s*([-A-Za-z_0-9]+)\s*Properties\s*$")
    28 UCD_property_alias_regexp = re.compile("^([-A-Za-z_0-9]+)\s*;\s*([-A-Za-z_0-9]+)([^#]*)")
    29 
    30 trivial_name_char_re = re.compile('[-_\s]')
    31 def canonicalize(property_string):
    32    c = trivial_name_char_re.sub('', property_string.lower())
    33    if len(c) > 2 and c[0:2] == "is": return c[2:]
    34    else: return c
    3517
    3618PropertyAliases_template = r"""
     
    7355"""
    7456
    75 #
    76 #  Union of a list of sets
    77 #
    78 def union_of_all(uset_list):
    79    if uset_list == []: return empty_uset()
    80    else:
    81      accum_set = uset_list[0]
    82      for s in uset_list[1:]:
    83         accum_set = uset_union(accum_set, s)
    84      return accum_set
    85 
    86 #
    87 #  UCD Property File Format 3:  codepoint -> name maps
    88 #
    89 UCD_skip = re.compile("^#.*$|^\s*$")
    90 UCD_missing_regexp1 = re.compile("^#\s*@missing:\s*([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})\s*;\s*([-A-Za-z0-9_]+)\s*(?:[;#]|$)")
    91 UCD_point_name_regexp = re.compile("^([0-9A-F]{4,6})\s*;\s*((?:[-A-Za-z0-9_]+\s+)*[-A-Za-z0-9_]+)\s*(?:[;#]|$)")
    92 UCD_range_name_regexp = re.compile("^([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})\s*;\s*((?:[-A-Za-z0-9_]+\s+)*[-A-Za-z0-9_]+)\s*(?:[;#]|$)")
    93 
    94 def parse_UCD_enumerated_property_map(mapfile, canonical_name_lookup_map, default_value = None):
    95    value_map = {}
    96    name_list_order = []
    97    f = open(UCD_dir + "/" + mapfile)
    98    lines = f.readlines()
    99    for t in lines:
    100       if UCD_skip.match(t):
    101         m = UCD_missing_regexp1.match(t)
    102         if m:
    103           if default_value != None:
    104             raise Exception("Default value already specified, extraneous @missing spec: %s" % t)
    105           (missing_lo, missing_hi, default_value) = (int(m.group(1), 16), int(m.group(2), 16), m.group(3))
    106           default_value = canonicalize(default_value)
    107           if not canonical_name_lookup_map.has_key(default_value):  raise Exception("Unknown defauly property value name '%s'" % default_value)
    108           if missing_lo != 0 or missing_hi != 0x10FFFF: raise Exception("Unexpected missing data range '%x, %x'" % (missing_lo, missing_hi))
    109           default_value = canonical_name_lookup_map[default_value]
    110         continue  # skip comment and blank lines
    111       m = UCD_point_name_regexp.match(t)
    112       if m:
    113         (codepoint, name) = (int(m.group(1), 16), m.group(2))
    114         newset = singleton_uset(codepoint)
    115       else:
    116         m = UCD_range_name_regexp.match(t)
    117         if not m: raise Exception("Unknown syntax: %s" % t)
    118         (cp_lo, cp_hi, name) = (int(m.group(1), 16), int(m.group(2), 16), m.group(3))
    119         newset = range_uset(cp_lo, cp_hi)
    120       cname = canonicalize(name)
    121       if not canonical_name_lookup_map.has_key(cname):  raise Exception("Unknown property or property value name '%s'" % cname)
    122       name = canonical_name_lookup_map[cname]
    123       if not value_map.has_key(name):
    124         value_map[name] = newset
    125         name_list_order.append(name)
    126       else: value_map[name] = uset_union(value_map[name], newset)
    127    explicitly_defined_cps = empty_uset()
    128    for k in value_map.keys(): explicitly_defined_cps = uset_union(explicitly_defined_cps, value_map[k])
    129    need_default_value = uset_complement(explicitly_defined_cps)
    130    if default_value != None:
    131      if value_map.has_key(default_value):
    132        value_map[default_value] = uset_union(value_map[default_value], need_default_value)
    133      else:
    134        value_map[default_value] = need_default_value
    135        name_list_order.append(default_value)
    136    elif uset_popcount(need_default_value) > 0:
    137      print "Warning no default value, but %i codepoints not specified" % uset_popcount(need_default_value)
    138    return (name_list_order, value_map)
    139 
    140 def parse_UCD_codepoint_name_map(mapfile, canonical_name_lookup_map = None):
    141    value_map = {}
    142    name_list_order = []
    143    f = open(UCD_dir + "/" + mapfile)
    144    lines = f.readlines()
    145    for t in lines:
    146       if UCD_skip.match(t):
    147         continue  # skip comment and blank lines
    148       m = UCD_point_name_regexp.match(t)
    149       if m:
    150         (codepoint, name) = (int(m.group(1), 16), m.group(2))
    151         newset = singleton_uset(codepoint)
    152       else:
    153         m = UCD_range_name_regexp.match(t)
    154         if not m: raise Exception("Unknown syntax: %s" % t)
    155         (cp_lo, cp_hi, name) = (int(m.group(1), 16), int(m.group(2), 16), m.group(3))
    156         newset = range_uset(cp_lo, cp_hi)
    157       if not canonical_name_lookup_map == None:
    158         cname = canonicalize(name)
    159         if not canonical_name_lookup_map.has_key(cname):  raise Exception("Unknown property or property value name '%s'" % cname)
    160         name = canonical_name_lookup_map[cname]
    161       if not value_map.has_key(name):
    162         value_map[name] = newset
    163         name_list_order.append(name)
    164       else: value_map[name] = uset_union(value_map[name], newset)
    165    return (name_list_order, value_map)
    166 
    16757   
    16858CodepointProperties = ['scf', 'slc', 'suc', 'stc']
    16959
    17060class UCD_generator():
    171     def __init__(self, UCD_dir):
    172             self.UCD_dir = UCD_dir
     61    def __init__(self):
    17362            self.supported_props = []
    17463            self.property_data_headers = []
    17564            self.missing_specs = {}
    17665
    177     def parse_PropertyAlias_txt(self):
    178        self.property_enum_name_list = []
    179        self.full_name_map = {}
    180        self.property_lookup_map = {}
    181        self.property_kind_map = {}
    182        property_kind = "unspecified"
    183        f = open(self.UCD_dir + "/" + 'PropertyAliases.txt')
    184        lines = f.readlines()
    185        for t in lines:
    186           m = UCD_property_section_regexp.match(t)
    187           if m:
    188             property_kind = m.group(1)
    189           if UCD_skip.match(t): continue  # skip comment and blank lines
    190           m = UCD_property_alias_regexp.match(t)
    191           if not m: raise Exception("Unknown property alias syntax: %s" % t)
    192           prop_enum = m.group(1).lower()
    193           prop_preferred_full_name = m.group(2)
    194           prop_extra = m.group(3)
    195           prop_aliases = re.findall("[-A-Za-z_0-9]+", prop_extra)
    196           self.property_enum_name_list.append(prop_enum)
    197           self.full_name_map[prop_enum] = prop_preferred_full_name
    198           self.property_lookup_map[canonicalize(prop_enum)] = prop_enum
    199           self.property_lookup_map[canonicalize(prop_preferred_full_name)] = prop_enum
    200           for a in prop_aliases: self.property_lookup_map[canonicalize(a)] = prop_enum
    201           self.property_kind_map[prop_enum] = property_kind
     66    def load_property_name_info(self):
     67       (self.property_enum_name_list, self.full_name_map, self.property_lookup_map, self.property_kind_map) = parse_PropertyAlias_txt()
    20268
    20369    def generate_PropertyAliases_h(self):
     
    21076       cformat.close_header_file(f)
    21177
    212     def process_missing_spec_format_2(self, s):
    213       UCD_property_value_missing_regexp = re.compile("^#\s*@missing:\s*([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})\s*;\s*([-A-Za-z_0-9.]+)\s*;\s*([-A-Za-z_0-9.<> ]+)\s*([^#]*)")
    214       m = UCD_property_value_missing_regexp.match(s)
    215       if m:
    216         if m.group(1) != '0000' or m.group(2) != '10FFFF': raise Exception("Bad missing spec: " + s)
    217         cname = canonicalize(m.group(3))
    218         if not self.property_lookup_map.has_key(cname): raise Exception("Bad missing property: " + s)
    219         self.missing_specs[self.property_lookup_map[cname]] = m.group(4)
    220 #
    221 #  UCD Property File Format 2: property value aliases
    222 #  PropertyValueAliases.txt
    223 #
    224 #  This file records value aliases for property values for
    225 #  each enumerated property, with the following additional notes:
    226 #  (1) The corresponding integer value of the enum constant is
    227 #      also specified for ccc (second field).
    228 #  (2) The Age property is a numeric type which has decimal float
    229 #      values as the enum constants: these won't be legal in enum syntax.
    230 #  (3) Binary properties also have enumerated values and aliases listed,
    231 #      although this is redundant, because all binary properties have the
    232 #      same value space.
    233 #  (4) @missing lines provide default value information, primarily for some
    234 #      non-enumerated types
    235 
    236     def parse_PropertyValueAlias_txt(self):
    237         UCD_property_value_alias_regexp = re.compile("^([-A-Za-z_0-9.]+)\s*;\s*([-A-Za-z_0-9.]+)\s*;\s*([-A-Za-z_0-9.]+)([^#]*)")
    238         self.property_value_list = {}
    239         self.property_value_enum_integer = {}
    240         self.property_value_full_name_map = {}
    241         self.property_value_lookup_map = {}
    242         f = open(self.UCD_dir + "/" + 'PropertyValueAliases.txt')
    243         lines = f.readlines()
    244         for t in lines:
    245             if UCD_skip.match(t):
    246               self.process_missing_spec_format_2(t)
    247               continue  # skip comment and blank lines
    248             m = UCD_property_value_alias_regexp.match(t)
    249             if not m: raise Exception("Unknown property value alias syntax: %s" % t)
    250             prop_code = canonicalize(m.group(1))
    251             if not self.property_lookup_map.has_key(prop_code): raise Exception("Property code: '%s' is unknown" % prop_code)
    252             else: prop_code = self.property_lookup_map[prop_code]
    253             if not self.property_value_list.has_key(prop_code):
    254               self.property_value_list[prop_code] = []
    255               self.property_value_enum_integer[prop_code] = {}
    256               self.property_value_full_name_map[prop_code] = {}
    257               self.property_value_lookup_map[prop_code] = {}
    258               enum_integer = 0
    259             # Special case for ccc: second field is enum integer value
    260             if prop_code == 'ccc':
    261               enum_integer = int(m.group(2))
    262               value_enum = m.group(3)
    263               extra = m.group(4)
    264               extra_list = re.findall("[-A-Za-z_0-9.]+", extra)
    265               value_preferred_full_name = extra_list[0]
    266               value_aliases = extra_list[1:]
    267             # Special case for age: second field is numeric, third field is enum
    268             # treat numeric value as an alias string
    269             elif prop_code == 'age':
    270               value_enum = m.group(3)
    271               value_preferred_full_name = m.group(3)
    272               extra = m.group(4)
    273               value_aliases = [m.group(2)] + re.findall("[-A-Za-z_0-9]+", extra)
    274             else:
    275               value_enum = m.group(2)
    276               value_preferred_full_name = m.group(3)
    277               extra = m.group(4)
    278               value_aliases = re.findall("[-A-Za-z_0-9]+", extra)
    279             self.property_value_list[prop_code].append(value_enum)
    280             self.property_value_enum_integer[prop_code][value_enum] = enum_integer
    281             enum_integer += 1
    282             self.property_value_full_name_map[prop_code][value_enum] = value_preferred_full_name
    283             self.property_value_lookup_map[prop_code][canonicalize(value_enum)] = value_enum
    284             self.property_value_lookup_map[prop_code][canonicalize(value_preferred_full_name)] = value_enum
    285             for a in value_aliases: self.property_value_lookup_map[prop_code][canonicalize(a)] = value_enum
     78    def load_property_value_info(self):
     79       (self.property_value_list, self.property_value_enum_integer, self.property_value_full_name_map, self.property_value_lookup_map, self.missing_specs) = parse_PropertyValueAlias_txt(self.property_lookup_map)
    28680
    28781
     
    309103             full_name_text = cformat.multiline_fill(['"%s"' % name for name in full_names], ',', 6)
    310104             canon_full_names = [canonicalize(name) for name in full_names]
    311              aliases_only = [k for k in self.property_value_lookup_map[p].keys() if not canonicalize(k) in canon_full_names]
     105             aliases_only = [k for k in self.property_value_lookup_map[p].keys() if not canonicalize(k) in canon_full_names and k == canonicalize(k)]
    312106             map_text = cformat.multiline_fill(['{"%s", %s::%s}' % (k, p.upper(), self.property_value_lookup_map[p][k]) for k in sorted(aliases_only)], ',', 6)
    313107             f.write(EnumeratedProperty_template % (p.upper(), enum_text, full_name_text, map_text))
     
    318112    def generate_property_value_file(self, filename_root, property_code, default_value = None):
    319113       canonical_property_value_map = self.property_value_lookup_map[property_code]
    320        (prop_values, value_map) = parse_UCD_enumerated_property_map(filename_root + '.txt', canonical_property_value_map, default_value)
     114       (prop_values, value_map) = parse_UCD_enumerated_property_map(property_code, filename_root + '.txt', canonical_property_value_map, default_value)
    321115       for v in self.property_value_list[property_code]:
    322116          if not v in prop_values:
    323117             #raise Exception("Property %s value %s missing" % (self.full_name_map[property_code], v))
    324              print("Warning property %s has no instance of value %s" % (self.full_name_map[property_code], v))
     118             print("Warning: property %s has no instance of value %s" % (self.full_name_map[property_code], v))
    325119             value_map[v] = empty_uset()
    326120       basename = os.path.basename(filename_root)
     
    329123       f.write("\nnamespace UCD {\n")
    330124       f.write("  namespace %s {\n" % property_code.upper())
    331        if property_code == 'gc':
    332          # special logic for derived categories
    333          value_map['LC'] = union_of_all([value_map[v] for v in ['Lu', 'Ll', 'Lt']])
    334          value_map['L'] = union_of_all([value_map[v] for v in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo']])
    335          value_map['M'] = union_of_all([value_map[v] for v in ['Mn', 'Mc', 'Me']])
    336          value_map['N'] = union_of_all([value_map[v] for v in ['Nd', 'Nl', 'No']])
    337          value_map['P'] = union_of_all([value_map[v] for v in ['Pc', 'Pd', 'Ps', 'Pe', 'Pi', 'Pf', 'Po']])
    338          value_map['S'] = union_of_all([value_map[v] for v in ['Sm', 'Sc', 'Sk', 'So']])
    339          value_map['Z'] = union_of_all([value_map[v] for v in ['Zs', 'Zl', 'Zp']])
    340          value_map['C'] = union_of_all([value_map[v] for v in ['Cc', 'Cf', 'Cs', 'Co', 'Cn']])
    341125       for v in self.property_value_list[property_code]:
    342126         f.write("    const UnicodeSet %s_Set \n" % v.lower())
     
    442226
    443227def UCD_main():
    444    ucd = UCD_generator(UCD_dir)
     228   ucd = UCD_generator()
    445229
    446230   # First parse all property names and their aliases
    447    ucd.parse_PropertyAlias_txt()
     231   ucd.load_property_name_info()
    448232   #
    449233   # Generate the PropertyAliases.h file to define all the Unicode property_t enum
     
    452236   #
    453237   # Next parse all property value names and their aliases.  Generate the data.
    454    ucd.parse_PropertyValueAlias_txt()
     238   ucd.load_property_value_info()
    455239   ucd.generate_PropertyValueAliases_h()
    456240   #
Note: See TracChangeset for help on using the changeset viewer.