Ignore:
Timestamp:
Dec 30, 2014, 12:06:48 PM (4 years ago)
Author:
cameron
Message:

Update to UCD 7.0.0; have UCD parsers return totally-defined value maps

File:
1 edited

Legend:

Unmodified
Added
Removed
  • proto/charsetcompiler/UCD/UCD_parser.py

    r4374 r4375  
    156156UCD_range_name_regexp = re.compile("^([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})\s*;\s*((?:[-A-Za-z0-9_]+\s+)*[-A-Za-z0-9_]+)\s*(?:[;#]|$)")
    157157
    158 def parse_UCD_enumerated_property_map(property_code, mapfile, canonical_name_lookup_map, default_value = None):
     158def parse_UCD_enumerated_property_map(property_code, vlist, canon_map, mapfile, default_value = None):
    159159    value_map = {}
    160160    name_list_order = []
     
    169169              (missing_lo, missing_hi, default_value) = (int(m.group(1), 16), int(m.group(2), 16), m.group(3))
    170170              default_value = canonicalize(default_value)
    171               if not canonical_name_lookup_map.has_key(default_value):  raise Exception("Unknown default property value name '%s'" % default_value)
     171              if not canon_map.has_key(default_value):  raise Exception("Unknown default property value name '%s'" % default_value)
    172172              if missing_lo != 0 or missing_hi != 0x10FFFF: raise Exception("Unexpected missing data range '%x, %x'" % (missing_lo, missing_hi))
    173               default_value = canonical_name_lookup_map[default_value]
     173              default_value = canon_map[default_value]
    174174            continue  # skip comment and blank lines
    175175        m = UCD_point_name_regexp.match(t)
     
    183183            newset = range_uset(cp_lo, cp_hi)
    184184        cname = canonicalize(name)
    185         if not canonical_name_lookup_map.has_key(cname):  raise Exception("Unknown property or property value name '%s'" % cname)
    186         name = canonical_name_lookup_map[cname]
     185        if not canon_map.has_key(cname):  raise Exception("Unknown property or property value name '%s'" % cname)
     186        name = canon_map[cname]
    187187        if not value_map.has_key(name):
    188188            value_map[name] = newset
     
    200200        value_map['C'] = union_of_all([value_map[v] for v in ['Cc', 'Cf', 'Cs', 'Co', 'Cn']])
    201201        name_list_order = ['LC', 'L', 'M', 'N', 'P', 'S', 'Z', 'C']+ name_list_order
     202    for v in vlist:
     203        if not v in name_list_order:
     204            #raise Exception("Property %s value %s missing" % (self.full_name_map[property_code], v))
     205            print("Warning: property %s has no instance of value %s" % (property_code, v))
     206            value_map[v] = empty_uset()
     207            name_list_order.append(v)
    202208    explicitly_defined_cps = empty_uset()
    203209    for k in value_map.keys(): explicitly_defined_cps = uset_union(explicitly_defined_cps, value_map[k])
     
    213219    return (name_list_order, value_map)
    214220
    215 def parse_ScriptExtensions_txt(canonical_property_value_map):
     221def parse_ScriptExtensions_txt(scripts, canon_map):
    216222    filename_root = 'ScriptExtensions'
    217223    property_code = 'scx'
    218     (scripts, script_map) = parse_UCD_enumerated_property_map('sc', 'Scripts.txt', canonical_property_value_map)
     224    (scriptlist, script_map) = parse_UCD_enumerated_property_map('sc', scripts, canon_map, 'Scripts.txt')
    219225    (scx_sets, scx_set_map) = parse_UCD_codepoint_name_map('ScriptExtensions.txt')
    220226    value_map = {}
     
    229235            else: value_map[sc] = scx_set_map[scx_list]
    230236        explicitly_defined_set = uset_union(explicitly_defined_set, scx_set_map[scx_list])
    231     for v in canonical_property_value_map.keys():
     237    for v in scripts:
    232238        if value_map.has_key(v):
    233239            value_map[v] = uset_union(value_map[v], uset_difference(script_map[v], explicitly_defined_set))
     
    235241            value_map[v] = script_map[v]
    236242        else: value_map[v] = empty_uset()
    237     return (sorted(canonical_property_value_map.keys()), value_map)
    238 
    239 
    240 def parse_UCD_codepoint_name_map(mapfile, canonical_name_lookup_map = None):
     243    return (scripts, value_map)
     244
     245
     246def parse_UCD_codepoint_name_map(mapfile, canon_map = None):
    241247   value_map = {}
    242248   name_list_order = []
     
    255261        (cp_lo, cp_hi, name) = (int(m.group(1), 16), int(m.group(2), 16), m.group(3))
    256262        newset = range_uset(cp_lo, cp_hi)
    257       if not canonical_name_lookup_map == None:
     263      if not canon_map == None:
    258264        cname = canonicalize(name)
    259         if not canonical_name_lookup_map.has_key(cname):  raise Exception("Unknown property or property value name '%s'" % cname)
    260         name = canonical_name_lookup_map[cname]
     265        if not canon_map.has_key(cname):  raise Exception("Unknown property or property value name '%s'" % cname)
     266        name = canon_map[cname]
    261267      if not value_map.has_key(name):
    262268        value_map[name] = newset
Note: See TracChangeset for help on using the changeset viewer.