Changeset 4179


Ignore:
Timestamp:
Sep 20, 2014, 1:41:31 PM (5 years ago)
Author:
cameron
Message:

Restructure and update for vector<UnicodeSet> value_sets[]

File:
1 edited

Legend:

Unmodified
Added
Removed
  • proto/charsetcompiler/UCD/UCD_properties.py

    r4174 r4179  
    1010#
    1111#
    12 import re, string
     12import re, string, os.path
    1313from unicode_set import *
    1414
    1515UCD_dir = "7.0.0"
    16 
    1716
    1817#
     
    3837      m = UCD_property_alias_regexp.match(t)
    3938      if not m: raise Exception("Unknown property alias syntax: %s" % t)
    40       prop_enum = m.group(1)
     39      prop_enum = m.group(1).lower()
    4140      prop_preferred_full_name = m.group(2)
    4241      prop_extra = m.group(3)
     
    6665 */
    6766
    68 #include <string>
    69 #include <unordered_map>
    70 
    71 """
     67"""
     68
     69
    7270
    7371def open_header_file_for_write(filename):
     
    7876
    7977def close_header_file(f):
    80    f.write("#endif\n")
     78   f.write("\n#endif\n")
    8179   f.close()
     80
     81def write_imports(f, import_list):
     82   for i in import_list: f.write("#include %s\n" % i)
    8283
    8384PropertyAliases_template = r"""
     
    105106  return lines
    106107
    107 enums_per_line = 4
    108 def generate_PropertyAliases_h():
    109    (property_enum_name_list, full_name_map, property_lookup_map) = parse_PropertyAlias_txt()
     108def generate_PropertyAliases_h(property_enum_name_list, full_name_map, property_lookup_map):
    110109   f = open_header_file_for_write('PropertyAliases')
    111    enum_text = multiline_join([e.lower() for e in property_enum_name_list], enums_per_line, ',')
     110   write_imports(f, ["<string>", "<unordered_map>"])
     111   enum_text = multiline_join(property_enum_name_list, 4, ',')
    112112   full_name_text = multiline_join(['"%s"' % full_name_map[e] for e in property_enum_name_list], 2, ',')
    113    map_text = multiline_join(['{"%s", %s}' % (k, property_lookup_map[k].lower()) for k in sorted(property_lookup_map.keys())], 2, ',')
     113   map_text = multiline_join(['{"%s", %s}' % (k, property_lookup_map[k]) for k in sorted(property_lookup_map.keys())], 2, ',')
    114114   f.write(PropertyAliases_template % (enum_text, full_name_text, map_text))
    115115   close_header_file(f)
    116    
     116
    117117#
    118118#  UCD Property File Format 2: property value aliases
     
    132132UCD_property_value_alias_regexp = re.compile("^([-A-Za-z_0-9.]+)\s*;\s*([-A-Za-z_0-9.]+)\s*;\s*([-A-Za-z_0-9.]+)([^#]*)")
    133133
    134 def parse_PropertyValueAlias_txt():
     134def parse_PropertyValueAlias_txt(property_lookup_map):
    135135    property_value_list = {}
    136136    property_value_enum_integer = {}
     
    143143        m = UCD_property_value_alias_regexp.match(t)
    144144        if not m: raise Exception("Unknown property value alias syntax: %s" % t)
    145         prop_code = m.group(1)
     145        prop_code = canonicalize(m.group(1))
     146        if not property_lookup_map.has_key(prop_code): raise Exception("Property code: '%s' is unknown" % prop_code)
     147        else: prop_code = property_lookup_map[prop_code]
    146148        if not property_value_list.has_key(prop_code):
    147149          property_value_list[prop_code] = []
     
    196198"""
    197199
    198 def generate_PropertyValueAliases_h():
    199    (property_enum_name_list, full_name_map, property_lookup_map) = parse_PropertyAlias_txt()
    200    (property_value_list, property_value_enum_integer, property_value_full_name_map, property_value_lookup_map) = parse_PropertyValueAlias_txt()
     200
     201PropertyValues_template = r"""
     202using std::vector;
     203
     204namespace UCD {
     205  vector<UnicodeSet> value_sets[] = {
     206%s
     207  };
     208}
     209"""
     210
     211
     212
     213def generate_PropertyValueAliases_h(property_enum_name_list, property_value_list, property_value_enum_integer, property_value_full_name_map, property_value_lookup_map):
    201214   f = open_header_file_for_write('PropertyValueAliases')
     215   write_imports(f, ["<string>", "<unordered_map>", '"unicode_set.h"', '"PropertyAliases.h"'])
    202216   #  Generate the aliases for all Binary properties.
    203217   enum_text = multiline_join(['N', 'Y'], 4, ',','', 6)
     
    220234       f.write(PropertyValueAliases_template % (p.upper(), enum_text, full_name_text, map_text))
    221235   close_header_file(f)
    222    
     236
     237def generate_PropertyValueSets_h(property_enum_name_list, property_value_list, property_value_enum_integer, property_value_full_name_map, property_value_lookup_map):
     238   f = open_header_file_for_write('PropertyValueSets')
     239   write_imports(f, ["<vector>", '"unicode_set.h"'])
     240   vec_decl_list = []
     241   for p in property_enum_name_list:
     242     if not property_value_list.has_key(p):
     243       vec_decl_list.append("vector<UnicodeSet>(0)")
     244     elif property_value_list[p] == ['N', 'Y']:
     245       vec_decl_list.append("vector<UnicodeSet>(1)")
     246     elif p == 'scx':
     247       vec_decl_list.append("vector<UnicodeSet>(%i)" % len(property_value_list['sc']))
     248     else:
     249       vec_decl_list.append("vector<UnicodeSet>(%i)" % len(property_value_list[p]))
     250   f.write(PropertyValues_template % (multiline_join(vec_decl_list, 4, ',', '', 6)))
     251   close_header_file(f)
    223252
    224253
     
    230259UCD_range_name_regexp = re.compile("^([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})\s*;\s*((?:[-A-Za-z0-9_]+\s+)*[-A-Za-z0-9_]+)\s*(?:[;#]|$)")
    231260
    232 def parse_UCD_codepoint_name_map(mapfile):
    233    name_map = {}
     261def parse_UCD_codepoint_name_map(mapfile, canonical_name_lookup_map = None):
     262   value_map = {}
    234263   name_list_order = []
    235264   f = open(UCD_dir + "/" + mapfile)
     
    240269      if m:
    241270        (codepoint, name) = (int(m.group(1), 16), m.group(2))
    242         newset = singleton_set(codepoint)
     271        newset = singleton_uset(codepoint)
    243272      else:
    244273        m = UCD_range_name_regexp.match(t)
    245274        if not m: raise Exception("Unknown syntax: %s" % t)
    246275        (cp_lo, cp_hi, name) = (int(m.group(1), 16), int(m.group(2), 16), m.group(3))
    247         newset = make_range_set(cp_lo, cp_hi)
    248       if not name_map.has_key(name):
    249         name_map[name] = newset
     276        newset = range_uset(cp_lo, cp_hi)
     277      if not canonical_name_lookup_map == None:
     278        cname = canonicalize(name)
     279        if not canonical_name_lookup_map.has_key(cname):  raise Exception("Unknown property or property value name '%s'" % cname)
     280        name = canonical_name_lookup_map[cname]
     281      if not value_map.has_key(name):
     282        value_map[name] = newset
    250283        name_list_order.append(name)
    251       else: name_map[name] = union(name_map[name], newset)
    252    return (name_list_order, name_map)
    253 
    254 def generate_PropList_h():
    255    (props, prop_map) = parse_UCD_codepoint_name_map('PropList.txt')
    256    f = open_header_file_for_write('PropList')
    257    for k in props:
    258      f.write(prop_map[k].showC(k))
    259    close_header_file(f)
    260 
    261 def generate_Blocks_h():
    262    (blocks, block_map) = parse_UCD_codepoint_name_map('Blocks.txt')
    263    f = open_header_file_for_write('Blocks')
    264    for k in blocks:
    265      f.write(block_map[k].showC('block["%s"]' % k))
    266    close_header_file(f)
    267 
    268 def generate_Scripts_h():
    269    (scripts, script_map) = parse_UCD_codepoint_name_map('Scripts.txt')
    270    f = open_header_file_for_write('Scripts')
    271    for k in scripts:
    272      f.write(script_map[k].showC('script["%s"]' % k))
     284      else: value_map[name] = uset_union(value_map[name], newset)
     285   return (name_list_order, value_map)
     286
     287def generate_property_value_file(filename_root, property_code, canonical_property_value_map):
     288   (prop_values, value_map) = parse_UCD_codepoint_name_map(filename_root + '.txt', canonical_property_value_map)
     289   f = open_header_file_for_write(os.path.basename(filename_root))
     290   write_imports(f, ["<vector>", '"unicode_set.h"', '"PropertyAliases.h"', '"PropertyValueAliases.h"', '"PropertyValueSets.h"'])
     291   f.write("\nusing namespace UCD;\n\n")
     292   print "%s bytes" % sum([value_map[v].bytes() for v in value_map.keys()])
     293   for v in prop_values:
     294     f.write(value_map[v].showC('value_sets[%s][%s::%s]' % (property_code, property_code.upper(), v)))
    273295   close_header_file(f)
    274296   
     297def generate_binary_properties_file(filename_root, canonical_property_name_map):
     298   (props, prop_map) = parse_UCD_codepoint_name_map(filename_root + '.txt', canonical_property_name_map)
     299   f = open_header_file_for_write(os.path.basename(filename_root))
     300   write_imports(f, ["<vector>", '"unicode_set.h"', '"PropertyAliases.h"', '"PropertyValueSets.h"'])
     301   f.write("\nusing namespace UCD;\n\n")
     302   print "%s bytes" % sum([prop_map[p].bytes() for p in prop_map.keys()])
     303   for p in props:
     304     f.write(prop_map[p].showC('value_sets[%s][0]' % (p)))
     305   close_header_file(f)
     306     
    275307def generate_ScriptExtensions_h():
    276308   (scx_sets, scx_map) = parse_UCD_codepoint_name_map('ScriptExtensions.txt')
    277309   map2 = {}
    278310   f = open_header_file_for_write('ScriptExtensions')
     311   write_imports(f, ["<vector>", '"PropertyAliases.h"', '"PropertyValueAliases.h"', '"unicode_set.h"'])
     312   f.write("\nusing namespace UCD;\n\n")
    279313   for scx_list in scx_sets:
    280314     scx_items = scx_list.split(" ")
    281315     for scx in scx_items:
    282316        if map2.has_key(scx):
    283            map2[scx] = union(map2[scx], scx_map[scx_list])
     317           map2[scx] = uset_union(map2[scx], scx_map[scx_list])
    284318        else: map2[scx] = scx_map[scx_list]
     319   print "%s bytes" % sum([map2[k].bytes() for k in map2.keys()])
    285320   for k in sorted(map2.keys()):
    286      f.write(map2[k].showC('scx["%s"]' % k))
    287    close_header_file(f)
    288 
    289 def generate_DerivedGeneralCategory_h():
    290    (categories, cat_map) = parse_UCD_codepoint_name_map('extracted/DerivedGeneralCategory.txt')
    291    f = open_header_file_for_write('DerivedGeneralCategory')
    292    for k in categories:
    293      f.write(cat_map[k].showC('GC["%s"]' % k))
    294    close_header_file(f)
    295 
    296 def generate_DerivedCoreProperties_h():
    297    (properties, prop_map) = parse_UCD_codepoint_name_map('DerivedCoreProperties.txt')
    298    f = open_header_file_for_write('DerivedCoreProperties')
    299    for k in properties:
    300      f.write(prop_map[k].showC(k))
    301    close_header_file(f)
    302 
    303 
     321     f.write(map2[k].showC('value_sets[scx][SC::%s]' % k.lower()))
     322   close_header_file(f)
     323
     324
     325
     326def UCD_main():
     327   # First parse all property names and their aliases
     328   (property_enum_name_list, full_name_map, property_lookup_map) = parse_PropertyAlias_txt()
     329   generate_PropertyAliases_h(property_enum_name_list, full_name_map, property_lookup_map)
     330   # Next parse all property value names and their aliases
     331   (property_value_list, property_value_enum_integer, property_value_full_name_map, property_value_lookup_map) = parse_PropertyValueAlias_txt(property_lookup_map)
     332   generate_PropertyValueAliases_h(property_enum_name_list, property_value_list, property_value_enum_integer, property_value_full_name_map, property_value_lookup_map)
     333   #
     334   generate_PropertyValueSets_h(property_enum_name_list, property_value_list, property_value_enum_integer, property_value_full_name_map, property_value_lookup_map)
     335   #
     336   # Blocks
     337   generate_property_value_file('Blocks', 'blk', property_value_lookup_map['blk'])
     338   #
     339   # Scripts
     340   generate_property_value_file('Scripts', 'sc', property_value_lookup_map['sc'])
     341   #
     342   # Script Extensions
     343   generate_ScriptExtensions_h()
     344   #
     345   # General Category
     346   generate_property_value_file('extracted/DerivedGeneralCategory', 'gc', property_value_lookup_map['gc'])
     347
     348   #
     349   # Binary properties from PropList.txt
     350   generate_binary_properties_file('PropList', property_lookup_map)
     351   #
     352   # Binary properties from DerivedCoreProperties.txt
     353   generate_binary_properties_file('DerivedCoreProperties', property_lookup_map)
     354   #
     355   # East Asian Width
     356   generate_property_value_file('EastAsianWidth', 'ea', property_value_lookup_map['ea'])
     357
     358
Note: See TracChangeset for help on using the changeset viewer.