Changeset 5658


Ignore:
Timestamp:
Oct 2, 2017, 12:53:39 PM (22 months ago)
Author:
cameron
Message:

UCD generator restructuring and improvements

Location:
icGREP/icgrep-devel/UCD-scripts
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/UCD-scripts/UCD_parser.py

    r5655 r5658  
    1111import UCD_config
    1212from unicode_set import *
     13from UCD_property_objects import *
    1314
    1415version_regexp = re.compile(".*Version\s+([0-9.]*)\s+of the Unicode Standard.*")
     
    4142
    4243def parse_PropertyAlias_txt():
     44    property_object_map = {}
    4345    property_enum_name_list = []
    44     full_name_map = {}
    45     property_lookup_map = {}
    46     property_kind_map = {}
    47     property_kind = "unspecified"
    4846    f = open(UCD_config.UCD_src_dir + "/" + 'PropertyAliases.txt')
    4947    lines = f.readlines()
     
    5553        m = UCD_property_alias_regexp.match(t)
    5654        if not m: raise Exception("Unknown property alias syntax: %s" % t)
    57         (prop_enum, prop_preferred_full_name, prop_extra) = (m.group(1), m.group(2), m.group(3))
     55        (property_code, prop_preferred_full_name, prop_extra) = (m.group(1), m.group(2), m.group(3))
     56        property_enum_name_list.append(property_code)
     57        if property_kind == "Binary":
     58            property_object_map[property_code] = BinaryPropertyObject()
     59        elif property_kind == "Enumerated":
     60            property_object_map[property_code] = EnumeratedPropertyObject()
     61        elif property_kind == "Catalog":   # Age, Block, Script
     62            property_object_map[property_code] = EnumeratedPropertyObject()
     63        elif property_kind == "String":
     64            property_object_map[property_code] = StringPropertyObject()
     65        elif property_kind == "Numeric":
     66            property_object_map[property_code] = NumericPropertyObject()
     67        else:  # Miscellaneous properties
     68            if property_code == "scx":
     69                property_object_map[property_code] = ExtensionPropertyObject()
     70            else:
     71                # All other Miscellaneous properties have string values
     72                property_object_map[property_code] = StringPropertyObject()
     73        property_object_map[property_code].setID(property_code, prop_preferred_full_name)
    5874        prop_aliases = re.findall("[-A-Za-z_0-9]+", prop_extra)
    59         property_enum_name_list.append(prop_enum)
    60         full_name_map[prop_enum] = prop_preferred_full_name
    61         property_lookup_map[canonicalize(prop_enum)] = prop_enum
    62         property_lookup_map[canonicalize(prop_preferred_full_name)] = prop_enum
    63         for a in prop_aliases: property_lookup_map[canonicalize(a)] = prop_enum
    64         property_kind_map[prop_enum] = property_kind
    65     #
    66     # Override the property kind for scx
    67     property_kind_map['scx'] = 'Extension'
    68     return (property_enum_name_list, full_name_map, property_lookup_map, property_kind_map)
    69 
    70 
    71 UCD_property_value_missing_regexp = re.compile("^#\s*@missing:\s*([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})\s*;\s*([-A-Za-z_0-9.]+)\s*;\s*([-A-Za-z_0-9.<> ]+)\s*([^#]*)")
     75        property_object_map[property_code].setAliases(prop_aliases)
     76    return (property_enum_name_list, property_object_map)
     77
    7278#
    7379#  UCD Property File Format 2: property value aliases
     
    8692#      non-enumerated types
    8793
    88 def parse_PropertyValueAlias_txt(property_lookup_map):
     94def initializePropertyValues(property_object_map, property_lookup_map):
     95    UCD_property_value_missing_regexp = re.compile("^#\s*@missing:\s*([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})\s*;\s*([-A-Za-z_0-9.]+)\s*;\s*([-A-Za-z_0-9.<> ]+)\s*([^#]*)")
    8996    UCD_property_value_alias_regexp = re.compile("^([-A-Za-z_0-9.]+)\s*;\s*([-A-Za-z_0-9.]+)\s*;\s*([-A-Za-z_0-9.]+)([^#]*)")
    90     property_value_list = {}
    91     property_value_enum_integer = {}
    92     property_value_full_name_map = {}
    93     property_value_lookup_map = {}
    9497    missing_specs = {}
    9598    f = open(UCD_config.UCD_src_dir + "/" + 'PropertyValueAliases.txt')
     
    102105                cname = canonicalize(m.group(3))
    103106                if not cname in property_lookup_map: raise Exception("Bad missing property: " + s)
    104                 missing_specs[property_lookup_map[cname]] = m.group(4)
     107                property_object_map[property_lookup_map[cname]].setDefaultValue(m.group(4))
    105108            continue  # skip comment and blank lines
    106109        m = UCD_property_value_alias_regexp.match(t)
     
    109112        if not prop_code in property_lookup_map: raise Exception("Property code: '%s' is unknown" % prop_code)
    110113        else: prop_code = property_lookup_map[prop_code]
    111         if not prop_code in property_value_list:
    112             property_value_list[prop_code] = []
    113             property_value_enum_integer[prop_code] = {}
    114             property_value_full_name_map[prop_code] = {}
    115             property_value_lookup_map[prop_code] = {}
    116             enum_integer = 0
     114        if not prop_code in property_object_map: raise Exception("Property object: '%s' is uninitialized" % prop_code)
     115        po = property_object_map[prop_code]
    117116        # Special case for ccc: second field is enum integer value
    118117        if prop_code == 'ccc':
    119             enum_integer = int(m.group(2))
    120118            value_enum = m.group(3)
    121119            extra = m.group(4)
     
    136134            extra = m.group(4)
    137135            value_aliases = re.findall("[-A-Za-z_0-9]+", extra)
    138         property_value_list[prop_code].append(value_enum)
    139         property_value_enum_integer[prop_code][value_enum] = enum_integer
    140         enum_integer += 1
    141         property_value_full_name_map[prop_code][value_enum] = value_preferred_full_name
    142         property_value_lookup_map[prop_code][value_enum] = value_enum
    143         property_value_lookup_map[prop_code][canonicalize(value_enum)] = value_enum
    144         property_value_lookup_map[prop_code][canonicalize(value_preferred_full_name)] = value_enum
    145         for a in value_aliases: property_value_lookup_map[prop_code][canonicalize(a)] = value_enum
    146     # Special case for scx:
    147     property_value_list['scx'] = property_value_list['sc']
    148     property_value_enum_integer['scx'] = property_value_enum_integer['sc']
    149     property_value_full_name_map['scx'] = property_value_full_name_map['sc']
    150     property_value_lookup_map['scx'] = property_value_lookup_map['sc']
    151     return (property_value_list, property_value_enum_integer, property_value_full_name_map, property_value_lookup_map, missing_specs)
    152 
    153 
    154 
    155 #
    156 #  Union of a list of sets
    157 #
    158 def union_of_all(uset_list):
    159     if uset_list == []: return empty_uset()
     136        if not isinstance(po, EnumeratedPropertyObject): continue
     137        po.addPropertyValue(value_enum, value_preferred_full_name, value_aliases)
     138
     139
     140#
     141#  UCD Property File Format 3:  codepoint/range -> data record maps
     142#  Many files have data records consisting of a codepoint or codepoint range
     143#  followed by fields separated by semicolons.
     144#
     145
     146UCD_point_regexp = re.compile("^([0-9A-F]{4,6})([^0-9A-F.#][^#]*)(?:#|$)")
     147UCD_range_regexp = re.compile("^([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})([^#]*)(?:#|$)")
     148
     149def parse_data_record(data_line):
     150    m = UCD_point_regexp.match(data_line)
     151    if m:
     152        cp_lo = int(m.group(1), 16)
     153        cp_hi = cp_lo
     154        field_data = m.group(2)
    160155    else:
    161         accum_set = uset_list[0]
    162         for s in uset_list[1:]:
    163             accum_set = uset_union(accum_set, s)
    164         return accum_set
    165 
    166 #
    167 #  UCD Property File Format 3:  codepoint -> name maps
    168 #
    169 UCD_skip = re.compile("^#.*$|^\s*$")
    170 UCD_missing_regexp1 = re.compile("^#\s*@missing:\s*([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})\s*;\s*([-A-Za-z0-9_]+)\s*(?:[;#]|$)")
    171 UCD_point_name_regexp = re.compile("^([0-9A-F]{4,6})\s*;\s*((?:[-A-Za-z0-9_.]+\s+)*[-A-Za-z0-9_.]+)\s*(?:[;#]|$)")
    172 UCD_range_name_regexp = re.compile("^([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})\s*;\s*((?:[-A-Za-z0-9_.]+\s+)*[-A-Za-z0-9_.]+)\s*(?:[;#]|$)")
    173 
    174 #
    175 # Parse a file defining the enumerated property values for a given enumerated property,
    176 # returning the list of independent property values found, as well as the value map.
    177 # Ensure that the default value for the property is first in the list of property values,
    178 # and that all codepoints not explicitly identified in the file are mapped to this default.
    179 def parse_UCD_enumerated_property_map(property_code, vlist, canon_map, mapfile):
    180     value_map = {}
    181     for v in vlist: value_map[v] = empty_uset()
    182     name_list_order = []
    183     default_specs = []
    184     f = open(UCD_config.UCD_src_dir + "/" + mapfile)
    185     lines = f.readlines()
    186     for t in lines:
    187         if UCD_skip.match(t):
    188             m = UCD_missing_regexp1.match(t)
    189             if m:
    190                 (missing_lo, missing_hi, default_value) = (int(m.group(1), 16), int(m.group(2), 16), m.group(3))
    191                 default_value = canonicalize(default_value)
    192                 if not default_value in canon_map:  raise Exception("Unknown default property value name '%s'" % default_value)
    193                 if missing_lo != 0 or missing_hi != 0x10FFFF: raise Exception("Unexpected missing data range '%x, %x'" % (missing_lo, missing_hi))
    194                 default_value = canon_map[default_value]
    195                 #print "Property %s: setting default_value  %s" % (property_code, default_value)
    196                 # Default value must always be first in the final enumeration order.
    197                 if default_value in name_list_order: name_list_order.remove(default_value)
    198                 name_list_order = [default_value] + name_list_order
    199                 default_specs.append((missing_lo, missing_hi, default_value))
    200             continue  # skip comment and blank lines
    201         m = UCD_point_name_regexp.match(t)
    202         if m:
    203             (codepoint, name) = (int(m.group(1), 16), m.group(2))
    204             newset = singleton_uset(codepoint)
     156        m = UCD_range_regexp.match(data_line)
     157        if not m: raise Exception("UCD data record parsing error: " + data_line)
     158        cp_lo = int(m.group(1), 16)
     159        cp_hi = int(m.group(2), 16)
     160        field_data = m.group(3)
     161    field_data = field_data.lstrip().rstrip()
     162    if field_data == '':
     163        fields = []
     164    else:
     165        if field_data[0] != ';':
     166            raise Exception("Field data syntax: " + field_data)
     167        fields = field_data[1:].split(';')
     168    fields = [f.lstrip().rstrip() for f in fields]
     169    return (cp_lo, cp_hi, fields)
     170
     171UCD_missing_regexp = re.compile("^#\s*@missing:\s*([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})\s*;\s*([^#]*)(?:#|$)")
     172
     173def parse_missing_spec(data_line):
     174    m = UCD_missing_regexp.match(data_line)
     175    if not m: raise Exception("UCD missing spec parsing error: " + data_line)
     176    cp_lo = int(m.group(1), 16)
     177    cp_hi = int(m.group(2), 16)
     178    field_data = m.group(3)
     179    fields = field_data.split(';')
     180    fields = [f.lstrip().rstrip() for f in fields]
     181    return (cp_lo, cp_hi, fields)
     182
     183def parse_property_and_value(fields, property_lookup_map):
     184    if len(fields) > 2: raise Exception("Too many fields")
     185    if len(fields) == 0: raise Exception("Expecting at least 1 field")
     186    canon = canonicalize(fields[0])
     187    if not canon in property_lookup_map: raise Exception("Unexpected name: " + name_str)
     188    pcode = property_lookup_map[canon]
     189    if len(fields) == 1: return (pcode, None)
     190    else: return (pcode, fields[1])
     191
     192def parse_multisection_property_data(pfile, property_object_map, property_lookup_map):
     193    f = open(UCD_config.UCD_src_dir + "/" + pfile)
     194    props = []
     195    lines = f.readlines()
     196    for t in lines:
     197        if UCD_missing_regexp.match(t):
     198            (cp_lo, cp_hi, fields) = parse_missing_spec(t)
     199            (prop_code, dflt) = parse_property_and_value(fields, property_lookup_map)
     200            property_object_map[prop_code].setDefaultValue(dflt)
     201            if not prop_code in props: props.append(prop_code)
     202        elif UCD_skip.match(t):
     203            continue
    205204        else:
    206             m = UCD_range_name_regexp.match(t)
    207             if not m: raise Exception("Unknown syntax: %s" % t)
    208             (cp_lo, cp_hi, name) = (int(m.group(1), 16), int(m.group(2), 16), m.group(3))
    209             newset = range_uset(cp_lo, cp_hi)
    210         cname = canonicalize(name)
    211         if not cname in canon_map:  raise Exception("Unknown property or property value name '%s'" % cname)
    212         name = canon_map[cname]
    213         if not name in name_list_order:
    214             name_list_order.append(name)
    215         value_map[name] = uset_union(value_map[name], newset)
    216     for (default_lo, default_hi, default_val) in default_specs:
    217         value_map = add_Default_Values(value_map, default_lo, default_hi, default_val)
    218     return (name_list_order, value_map)
    219 
    220 def add_Default_Values(value_map, default_lo, default_hi, default_val):
    221     default_region = range_uset(default_lo, default_hi)
    222     explicitly_defined_cps = empty_uset()
    223     for k in value_map.keys(): explicitly_defined_cps = uset_union(explicitly_defined_cps, value_map[k])
    224     need_default_value = uset_difference(default_region, explicitly_defined_cps)
    225     if default_val in value_map:
    226         value_map[default_val] = uset_union(value_map[default_val], need_default_value)
    227     else:
    228         value_map[default_val] = need_default_value
    229     return value_map
    230 
    231 def parse_ScriptExtensions_txt(scripts, canon_map):
     205            (cp_lo, cp_hi, fields) = parse_data_record(t)
     206            (prop_code, v) = parse_property_and_value(fields, property_lookup_map)
     207            if not prop_code in props: props.append(prop_code)
     208            if v == None:  # binary property
     209                property_object_map[prop_code].addDataRecord(cp_lo, cp_hi)
     210            else:
     211                property_object_map[prop_code].addDataRecord(cp_lo, cp_hi, v)
     212    for p in props:
     213        property_object_map[p].finalizeProperty()
     214    return props
     215
     216def parse_property_data(property_object, pfile):
     217    f = open(UCD_config.UCD_src_dir + "/" + pfile)
     218    lines = f.readlines()
     219    for t in lines:
     220        if UCD_missing_regexp.match(t):
     221            (cp_lo, cp_hi, fields) = parse_missing_spec(t)
     222            if len(fields) != 1: raise Exception("Expecting exactly 1 field")
     223            property_object.setDefaultValue(fields[0])
     224        elif UCD_skip.match(t):
     225            continue
     226        else:
     227            (cp_lo, cp_hi, fields) = parse_data_record(t)
     228            if isinstance(property_object, BinaryPropertyObject) and len(fields) == 0:
     229                property_object.addDataRecord(cp_lo, cp_hi)
     230            else:
     231                property_object.addDataRecord(cp_lo, cp_hi, fields[0])
     232    property_object.finalizeProperty()
     233
     234def parse_ScriptExtensions_txt(script_property_object):
    232235    filename_root = 'ScriptExtensions'
    233     property_code = 'scx'
    234     (scriptlist, script_map) = parse_UCD_enumerated_property_map('sc', scripts, canon_map, 'Scripts.txt')
    235     (scx_sets, scx_set_map) = parse_UCD_codepoint_name_map('ScriptExtensions.txt')
    236     value_map = {}
    237     explicitly_defined_set = empty_uset()
    238     for scx_list in scx_sets:
    239         scx_items = scx_list.split(" ")
    240         for scx in scx_items:
    241             # sc = canonical_property_value_map[canonicalize(scx)]
    242             sc = scx
    243             if sc in value_map:
    244                 value_map[sc] = uset_union(value_map[sc], scx_set_map[scx_list])
    245             else: value_map[sc] = scx_set_map[scx_list]
    246         explicitly_defined_set = uset_union(explicitly_defined_set, scx_set_map[scx_list])
    247     for v in scripts:
    248         if v in value_map:
    249             value_map[v] = uset_union(value_map[v], uset_difference(script_map[v], explicitly_defined_set))
    250         elif v in script_map:
    251             value_map[v] = script_map[v]
    252         else: value_map[v] = empty_uset()
    253     return (scripts, value_map)
    254 
    255 
    256 def parse_UCD_codepoint_name_map(mapfile, canon_map = None):
    257     value_map = {}
    258     name_list_order = []
    259     f = open(UCD_config.UCD_src_dir + "/" + mapfile)
    260     lines = f.readlines()
    261     for t in lines:
    262         if UCD_skip.match(t):
    263             continue  # skip comment and blank lines
    264         m = UCD_point_name_regexp.match(t)
    265         if m:
    266             (codepoint, name) = (int(m.group(1), 16), m.group(2))
    267             newset = singleton_uset(codepoint)
    268         else:
    269             m = UCD_range_name_regexp.match(t)
    270             if not m: raise Exception("Unknown syntax: %s" % t)
    271             (cp_lo, cp_hi, name) = (int(m.group(1), 16), int(m.group(2), 16), m.group(3))
    272             newset = range_uset(cp_lo, cp_hi)
    273         if not canon_map == None:
    274             cname = canonicalize(name)
    275             if not cname in canon_map:
    276                 raise Exception("Unknown property or property value name '%s'" % cname)
    277             name = canon_map[cname]
    278         if not name in value_map:
    279             value_map[name] = newset
    280             name_list_order.append(name)
    281         else: value_map[name] = uset_union(value_map[name], newset)
    282     return (name_list_order, value_map)
    283 
    284 # Format 4: simple codepoint sets
    285 
    286 UCD_point_only_regexp = re.compile("^([0-9A-F]{4,6})\s*(?:[#]|$)")
    287 UCD_range_only_regexp = re.compile("^([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})(?:[#]|$)")
    288 
    289 def parse_UCD_codepoint_set(setfile):
    290     cp_set = empty_uset()
    291     f = open(UCD_config.UCD_src_dir + "/" + setfile)
    292     lines = f.readlines()
    293     for t in lines:
    294         if UCD_skip.match(t):
    295             continue  # skip comment and blank lines
    296         m = UCD_point_only_regexp.match(t)
    297         if m:
    298             codepoint = int(m.group(1), 16)
    299             newset = singleton_uset(codepoint)
    300         else:
    301             m = UCD_range_only_regexp.match(t)
    302             if not m: raise Exception("Unknown syntax: %s" % t)
    303             (cp_lo, cp_hi) = (int(m.group(1), 16), int(m.group(2), 16))
    304             newset = range_uset(cp_lo, cp_hi)
    305         cp_set = uset_union(cp_set, newset)
    306     return cp_set
    307 
     236    parse_property_data(script_property_object, filename_root + '.txt')
    308237
    309238UnicodeData_txt_regexp = re.compile("^([0-9A-F]{4,6});([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);(.*)$")
  • icGREP/icgrep-devel/UCD-scripts/UCD_properties.py

    r5655 r5658  
    1313from unicode_set import *
    1414from UCD_parser import *
     15from UCD_property_objects import *
    1516
    1617PropertyAliases_template = r"""
     
    4243CodepointProperties = ['scf', 'slc', 'suc', 'stc']
    4344
     45
     46def emit_string_property(f, property_code, null_set, reflexive_set, string_values):
     47    f.write("    namespace %s_ns {\n" % property_code.upper())
     48    f.write("        /** Code Point Ranges for %s mapping to <none> \n        " % property_code)
     49    f.write(cformat.multiline_fill(['[%04x, %04x]' % (lo, hi) for (lo, hi) in uset_to_range_list(null_set)], ',', 8))
     50    f.write("**/\n")
     51    f.write("        const UnicodeSet null_codepoint_set \n")
     52    f.write(null_set.showC(12) + ";\n")
     53    f.write("        /** Code Point Ranges for %s mapping to <codepoint> \n        " % property_code)
     54    f.write(cformat.multiline_fill(['[%04x, %04x]' % (lo, hi) for (lo, hi) in uset_to_range_list(reflexive_set)], ',', 8))
     55    f.write("**/\n")
     56    f.write("        const UnicodeSet reflexive_set \n")
     57    f.write(reflexive_set.showC(12) + ";\n")
     58    f.write("        const unsigned buffer_length = %s;\n" % string_values.len())
     59    f.write("        const char * string_buffer = u8 R\"__(%s)__\";\n")
     60    f.write("        static StringPropertyObject property_object{%s, null_codepoint_set, reflexive_set, string_buffer, buffer_length};\n    }\n" % property_code)
     61
     62
     63def emit_binary_property(f, property_code, property_set):
     64    f.write("    namespace %s_ns {\n" % property_code.upper())
     65    f.write("        /** Code Point Ranges for %s\n        " % property_code)
     66    f.write(cformat.multiline_fill(['[%04x, %04x]' % (lo, hi) for (lo, hi) in uset_to_range_list(property_set)], ',', 8))
     67    f.write("**/\n")
     68    f.write("        const UnicodeSet codepoint_set \n")
     69    f.write(property_set.showC(12) + ";\n")
     70    f.write("        static BinaryPropertyObject property_object{%s, codepoint_set};\n    }\n" % property_code)
     71
     72def emit_enumerated_property(f, property_code, independent_prop_values, prop_values, value_map):
     73    f.write("  namespace %s_ns {\n" % property_code.upper())
     74    f.write("    const unsigned independent_prop_values = %s;\n" % independent_prop_values)
     75    for v in prop_values:
     76        f.write("    /** Code Point Ranges for %s\n    " % v)
     77        f.write(cformat.multiline_fill(['[%04x, %04x]' % (lo, hi) for (lo, hi) in uset_to_range_list(value_map[v])], ',', 4))
     78        f.write("**/\n")
     79        f.write("    const UnicodeSet %s_Set \n" % v.lower())
     80        f.write(value_map[v].showC(8) + ";\n")
     81    set_list = ['&%s_Set' % v.lower() for v in prop_values]
     82    f.write("    static EnumeratedPropertyObject property_object\n")
     83    f.write("        {%s,\n" % property_code)
     84    f.write("         %s_ns::independent_prop_values,\n" % property_code.upper())
     85    f.write("         %s_ns::enum_names,\n" % property_code.upper())
     86    f.write("         %s_ns::value_names,\n" % property_code.upper())
     87    f.write("         %s_ns::aliases_only_map,\n" % property_code.upper())
     88    f.write("         {")
     89    f.write(cformat.multiline_fill(set_list, ',', 8))
     90    f.write("\n         }};\n    }\n")
     91
    4492class UCD_generator():
    4593    def __init__(self):
     
    5098
    5199    def load_property_name_info(self):
    52         (self.property_enum_name_list, self.full_name_map, self.property_lookup_map, self.property_kind_map) = parse_PropertyAlias_txt()
     100        #(self.property_enum_name_list, self.full_name_map, self.property_lookup_map, self.property_kind_map) = parse_PropertyAlias_txt()
     101        (self.property_enum_name_list, self.property_object_map) = parse_PropertyAlias_txt()
     102        self.property_lookup_map = getPropertyLookupMap(self.property_object_map)
     103        self.full_name_map = {}
     104        for p in self.property_enum_name_list:
     105            self.full_name_map[p] = self.property_object_map[p].getPropertyFullName()
     106
    53107
    54108    def generate_PropertyAliases_h(self):
     
    63117
    64118    def load_property_value_info(self):
    65         (self.property_value_list, self.property_value_enum_integer, self.property_value_full_name_map, self.property_value_lookup_map, self.missing_specs) = parse_PropertyValueAlias_txt(self.property_lookup_map)
    66 
     119        initializePropertyValues(self.property_object_map, self.property_lookup_map)
    67120
    68121    def generate_PropertyValueAliases_h(self):
     
    79132        #
    80133        for p in self.property_enum_name_list:
    81            if p in self.property_value_list:
    82               if not self.property_kind_map[p] == 'Binary':
    83                   enum_text = cformat.multiline_fill(self.property_value_list[p], ',', 12)
    84                   enum_names = cformat.multiline_fill(['"%s"' % s for s in self.property_value_list[p]], ',', 12)
    85                   if p == 'ccc': # Special case: add numeric value information for ccc.
    86                       enum_text += r"""
     134            po = self.property_object_map[p]
     135            if isinstance(po, EnumeratedPropertyObject):
     136                ordered_enum_list = po.property_value_list
     137                enum_text = cformat.multiline_fill(ordered_enum_list, ',', 12)
     138                enum_names = cformat.multiline_fill(['"%s"' % s for s in ordered_enum_list], ',', 12)
     139                if p == 'ccc': # Special case: add numeric value information for ccc.
     140                    enum_text += r"""
    87141        };
    88142        const uint16_t enum_val[] = {
    89     """
    90                       enum_text += "      " + cformat.multiline_fill(["%s" % (self.property_value_enum_integer[p][e]) for e in self.property_value_list['ccc']], ',', 12)
    91                   full_names = [self.property_value_full_name_map[p][e] for e in self.property_value_list[p]]
    92                   full_name_text = cformat.multiline_fill(['"%s"' % name for name in full_names], ',', 12)
    93                   canon_full_names = [canonicalize(name) for name in full_names]
    94                   canon_enums = [canonicalize(e) for e in self.property_value_list[p]]
    95                   canon_keys = [canonicalize(k) for k in self.property_value_lookup_map[p].keys()]
    96                   aliases_only = [k for k in canon_keys if not k in canon_enums + canon_full_names]
    97                   map_text = cformat.multiline_fill(['{"%s", %s_ns::%s}' % (k, p.upper(), self.property_value_lookup_map[p][k]) for k in sorted(aliases_only)], ',', 12)
    98                   f.write(EnumeratedProperty_template % (p.upper(), enum_text, enum_names, full_name_text, map_text))
     143        """
     144                    enum_text += "      " + cformat.multiline_fill(["%s" % (po.property_value_enum_integer[e]) for e in ordered_enum_list], ',', 12)
     145                full_names = [po.property_value_full_name_map[e] for e in ordered_enum_list]
     146                full_name_text = cformat.multiline_fill(['"%s"' % name for name in full_names], ',', 12)
     147                canon_full_names = [canonicalize(name) for name in full_names]
     148                canon_enums = [canonicalize(e) for e in ordered_enum_list]
     149                canon_keys = [canonicalize(k) for k in po.property_value_lookup_map.keys()]
     150                aliases_only = []
     151                for k in canon_keys:
     152                    if k in canon_enums: continue
     153                    if k in canon_full_names: continue
     154                    if k in aliases_only: continue
     155                    aliases_only.append(k)
     156                map_text = cformat.multiline_fill(['{"%s", %s_ns::%s}' % (k, p.upper(), po.property_value_lookup_map[k]) for k in sorted(aliases_only)], ',', 12)
     157                f.write(EnumeratedProperty_template % (p.upper(), enum_text, enum_names, full_name_text, map_text))
    99158        f.write("}\n")
    100159        cformat.close_header_file(f)
    101160
     161    def emit_property(self, f, property_code):
     162        property_object = self.property_object_map[property_code]
     163        if isinstance(property_object, BinaryPropertyObject):
     164            emit_binary_property(f, property_code, property_object.value_map['Y'])
     165            print("%s: %s bytes" % (property_object.getPropertyFullName(), property_object.value_map['Y'].bytes()))
     166        elif isinstance(property_object, EnumeratedPropertyObject):
     167            prop_values = property_object.name_list_order
     168            independent_prop_values = property_object.independent_prop_values
     169            emit_enumerated_property(f, property_code, independent_prop_values, prop_values, property_object.value_map)
     170            print("%s: %s bytes" % (property_object.getPropertyFullName(), sum([property_object.value_map[v].bytes() for v in property_object.value_map.keys()])))
     171        #elif isinstance(property_object, StringPropertyObject):
     172        #    emit_string_property(f, property_code, property_object.value_map)
     173
    102174    def generate_property_value_file(self, filename_root, property_code):
    103         vlist = self.property_value_list[property_code]
    104         canon_map = self.property_value_lookup_map[property_code]
    105         (prop_values, value_map) = parse_UCD_enumerated_property_map(property_code, vlist, canon_map, filename_root + '.txt')
    106         canon_map = self.property_value_lookup_map[property_code]
    107         if property_code in self.missing_specs:
    108             default_value = canon_map[canonicalize(self.missing_specs[property_code])]
    109             value_map = add_Default_Values(value_map, 0, 0x10FFFF, default_value)
    110         independent_prop_values = len(prop_values)
    111         for v in vlist:
    112             if not v in prop_values:
    113                 #raise Exception("Property %s value %s missing" % (self.full_name_map[property_code], v))
    114                 print("Warning: property %s has no instance of value %s" % (property_code, v))
    115                 prop_values.append(v)
    116         #
    117         self.property_value_list[property_code] = prop_values
    118         if property_code == 'gc':
    119             # special logic for derived categories
    120             value_map['LC'] = union_of_all([value_map[v] for v in ['Lu', 'Ll', 'Lt']])
    121             value_map['L'] = union_of_all([value_map[v] for v in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo']])
    122             value_map['M'] = union_of_all([value_map[v] for v in ['Mn', 'Mc', 'Me']])
    123             value_map['N'] = union_of_all([value_map[v] for v in ['Nd', 'Nl', 'No']])
    124             value_map['P'] = union_of_all([value_map[v] for v in ['Pc', 'Pd', 'Ps', 'Pe', 'Pi', 'Pf', 'Po']])
    125             value_map['S'] = union_of_all([value_map[v] for v in ['Sm', 'Sc', 'Sk', 'So']])
    126             value_map['Z'] = union_of_all([value_map[v] for v in ['Zs', 'Zl', 'Zp']])
    127             value_map['C'] = union_of_all([value_map[v] for v in ['Cc', 'Cf', 'Cs', 'Co', 'Cn']])
     175        property_object = self.property_object_map[property_code]
     176        parse_property_data(self.property_object_map[property_code], filename_root + '.txt')
    128177        basename = os.path.basename(filename_root)
    129         f = cformat.open_header_file_for_write(os.path.basename(filename_root))
    130         cformat.write_imports(f, ['"PropertyObjects.h"', '"PropertyValueAliases.h"', '"unicode_set.h"'])
    131         f.write("\nnamespace UCD {\n")
    132         f.write("  namespace %s_ns {\n" % property_code.upper())
    133         f.write("    const unsigned independent_prop_values = %s;\n" % independent_prop_values)
    134         for v in prop_values:
    135             f.write("    /** Code Point Ranges for %s\n    " % v)
    136             f.write(cformat.multiline_fill(['[%04x, %04x]' % (lo, hi) for (lo, hi) in uset_to_range_list(value_map[v])], ',', 4))
    137             f.write("**/\n")
    138             f.write("    const UnicodeSet %s_Set \n" % v.lower())
    139             f.write(value_map[v].showC(8) + ";\n")
    140         print("%s: %s bytes" % (basename, sum([value_map[v].bytes() for v in value_map.keys()])))
    141         set_list = ['&%s_Set' % v.lower() for v in prop_values]
    142         f.write("    static EnumeratedPropertyObject property_object\n")
    143         f.write("        {%s,\n" % property_code)
    144         f.write("         %s_ns::independent_prop_values,\n" % property_code.upper())
    145         f.write("         %s_ns::enum_names,\n" % property_code.upper())
    146         f.write("         %s_ns::value_names,\n" % property_code.upper())
    147         f.write("         %s_ns::aliases_only_map,\n" % property_code.upper())
    148         f.write("         {")
    149         f.write(cformat.multiline_fill(set_list, ',', 8))
    150         f.write("\n         }};\n    }\n}\n")
    151         cformat.close_header_file(f)
    152         self.supported_props.append(property_code)
     178        f = cformat.open_header_file_for_write(basename)
     179        cformat.write_imports(f, ['"PropertyAliases.h"', '"PropertyObjects.h"', '"PropertyValueAliases.h"', '"unicode_set.h"'])
     180        f.write("\nnamespace UCD {\n")
     181        self.emit_property(f, property_code)
     182        f.write("}\n")
     183        cformat.close_header_file(f)
     184        if isinstance(property_object, BinaryPropertyObject) or isinstance(property_object, EnumeratedPropertyObject): self.supported_props.append(property_code)
     185        self.property_data_headers.append(basename)
     186
     187    def generate_multisection_properties_file(self, filename_root):
     188        props = parse_multisection_property_data(filename_root + '.txt', self.property_object_map, self.property_lookup_map)
     189        #(props, prop_map) = parse_UCD_codepoint_name_map(filename_root + '.txt', self.property_lookup_map)
     190        basename = os.path.basename(filename_root)
     191        f = cformat.open_header_file_for_write(basename)
     192        cformat.write_imports(f, ['"PropertyAliases.h"', '"PropertyObjects.h"', '"PropertyValueAliases.h"', '"unicode_set.h"'])
     193        f.write("\nnamespace UCD {\n")
     194        for p in sorted(props):
     195            self.emit_property(f, p)
     196            property_object = self.property_object_map[p]
     197            if isinstance(property_object, BinaryPropertyObject) or isinstance(property_object, EnumeratedPropertyObject): self.supported_props.append(p)
     198        f.write("}\n\n")
     199        cformat.close_header_file(f)
    153200        self.property_data_headers.append(basename)
    154201
     
    156203        filename_root = 'ScriptExtensions'
    157204        property_code = 'scx'
    158         (prop_values, value_map) = parse_ScriptExtensions_txt(self.property_value_list['sc'], self.property_value_lookup_map['sc'])
     205        extension_object = self.property_object_map['scx']
     206        extension_object.setBaseProperty(self.property_object_map['sc'])
     207        parse_ScriptExtensions_txt(extension_object)
    159208        basename = os.path.basename(filename_root)
    160209        f = cformat.open_header_file_for_write(basename)
    161         cformat.write_imports(f, ['"PropertyObjects.h"', '"PropertyValueAliases.h"', '"unicode_set.h"'])
     210        cformat.write_imports(f, ['"PropertyAliases.h"', '"PropertyObjects.h"', '"PropertyValueAliases.h"', '"unicode_set.h"'])
     211        prop_list = self.property_object_map['sc'].name_list_order
     212        value_map = extension_object.value_map
    162213        f.write("\nnamespace UCD {\n")
    163214        f.write("    namespace SCX_ns {\n")
    164         for v in self.property_value_list['sc']:
     215        for v in prop_list:
    165216            f.write("        /** Code Point Ranges for %s\n        " % v)
    166217            f.write(cformat.multiline_fill(['[%04x, %04x]' % (lo, hi) for (lo, hi) in uset_to_range_list(value_map[v])], ',', 8))
     
    168219            f.write("        const UnicodeSet %s_Ext \n" % v.lower())
    169220            f.write(value_map[v].showC(12) + ";\n")
    170         set_list = ['&%s_Ext' % v.lower() for v in self.property_value_list['sc']]
     221        set_list = ['&%s_Ext' % v.lower() for v in prop_list]
    171222        f.write("        static ExtensionPropertyObject property_object\n")
    172223        f.write("       {%s,\n" % property_code)
     
    180231        self.property_data_headers.append(basename)
    181232
    182 
    183     def emit_binary_property(self, f, property_code, property_set):
    184         f.write("    namespace %s_ns {\n" % property_code.upper())
    185         f.write("        /** Code Point Ranges for %s\n        " % property_code)
    186         f.write(cformat.multiline_fill(['[%04x, %04x]' % (lo, hi) for (lo, hi) in uset_to_range_list(property_set)], ',', 8))
    187         f.write("**/\n")
    188         f.write("        const UnicodeSet codepoint_set \n")
    189         f.write(property_set.showC(12) + ";\n")
    190         f.write("        static BinaryPropertyObject property_object{%s, codepoint_set};\n    }\n" % property_code)
    191 
    192     def generate_binary_properties_file(self, filename_root):
    193         (props, prop_map) = parse_UCD_codepoint_name_map(filename_root + '.txt', self.property_lookup_map)
    194         basename = os.path.basename(filename_root)
    195         f = cformat.open_header_file_for_write(basename)
    196         cformat.write_imports(f, ['"PropertyAliases.h"', '"unicode_set.h"', "<vector>"])
    197         f.write("\nnamespace UCD {\n")
    198         for p in sorted(props):
    199             self.emit_binary_property(f, p, prop_map[p])
    200         f.write("}\n\n")
    201         cformat.close_header_file(f)
    202         print("%s: %s bytes" % (basename, sum([prop_map[p].bytes() for p in prop_map.keys()])))
    203         self.supported_props += props
    204         for p in prop_map.keys(): self.binary_properties[p] = prop_map[p]
    205         self.property_data_headers.append(basename)
    206 
    207     def generate_binary_property_file(self, filename_root, property_code):
    208         prop_map = parse_UCD_codepoint_set(filename_root + '.txt')
    209         basename = os.path.basename(filename_root)
    210         f = cformat.open_header_file_for_write(basename)
    211         cformat.write_imports(f, ['"PropertyAliases.h"', '"unicode_set.h"', "<vector>"])
    212         f.write("\nnamespace UCD {\n")
    213         self.emit_binary_property(f, property_code, prop_map)
    214         f.write("}\n\n")
    215         cformat.close_header_file(f)
    216         print("%s: %s bytes" % (basename, prop_map.bytes()))
    217         self.supported_props += [property_code]
    218         self.binary_properties[property_code] = prop_map
    219         self.property_data_headers.append(basename)
    220 
    221233    def generate_PropertyObjectTable_h(self):
    222234        f = cformat.open_header_file_for_write('PropertyObjectTable')
     
    226238        objlist = []
    227239        for p in self.property_enum_name_list:
    228             k = self.property_kind_map[p]
     240            k = self.property_object_map[p].getPropertyKind()
    229241            if p in self.supported_props:
    230242                objlist.append("&%s_ns::property_object" % p.upper())
     
    268280    # The Block property
    269281    ucd.generate_property_value_file('Blocks', 'blk')
    270     #
     282   
    271283    # Scripts
    272284    ucd.generate_property_value_file('Scripts', 'sc')
    273285    #
    274     # Script Extensions
     286    # # Script Extensions
    275287    ucd.generate_ScriptExtensions_h()
    276     #
     288    # #
    277289    # General Category
    278290    ucd.generate_property_value_file('extracted/DerivedGeneralCategory', 'gc')
    279     #
     291   
    280292    # Binary properties from PropList.txt
    281     ucd.generate_binary_properties_file('PropList')
    282     #
     293    ucd.generate_multisection_properties_file('PropList')
     294   
    283295    # Binary properties from DerivedCoreProperties.txt
    284     ucd.generate_binary_properties_file('DerivedCoreProperties')
     296    ucd.generate_multisection_properties_file('DerivedCoreProperties')
    285297    #
    286298    #
     
    298310    ucd.generate_property_value_file('auxiliary/WordBreakProperty', 'WB')
    299311    #
    300     # East Asian Width
     312    # East Asian Width - can use either source
    301313    ucd.generate_property_value_file('EastAsianWidth', 'ea')
    302314    #ucd.generate_property_value_file('extracted/DerivedEastAsianWidth', 'ea')
     
    305317    ucd.generate_property_value_file('HangulSyllableType', 'hst')
    306318    #
    307     # Bidi Mirroroing from DerivedCoreProperties.txt
    308     ucd.generate_binary_properties_file('extracted/DerivedBinaryProperties')
    309     #
    310     # Canonical_Combining_Class
     319    ucd.generate_multisection_properties_file('extracted/DerivedBinaryProperties')
     320    # #
     321    # # Canonical_Combining_Class
    311322    ucd.generate_property_value_file('extracted/DerivedCombiningClass', 'ccc')
    312323    #
     
    322333    #ucd.generate_property_value_file('extracted/DerivedNumericValue', 'nv')
    323334    #
    324     # Binary normalization properties.
    325     ucd.generate_binary_properties_file('DerivedNormalizationProps')
     335    # Normalization properties.
     336    ucd.generate_multisection_properties_file('DerivedNormalizationProps')
    326337    #
    327338    # Bidi_Class
     
    332343    ucd.generate_property_value_file('IndicSyllabicCategory', 'InSC')
    333344
    334     ucd.generate_binary_property_file('CompositionExclusions', 'CE')
    335 
     345    ucd.generate_property_value_file('CompositionExclusions', 'CE')
    336346    #
    337347    # Jamo Short Name - AAARGH - property value for 110B is an empty string!!!!!  - Not in PropertyValueAliases.txt
Note: See TracChangeset for help on using the changeset viewer.