Changeset 5655


Ignore:
Timestamp:
Sep 30, 2017, 9:49:24 AM (19 months ago)
Author:
cameron
Message:

UCD_Config.h, add Indic and CompositionExclusion properties

Location:
icGREP/icgrep-devel/UCD-scripts
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/UCD-scripts/UCD_parser.py

    r5653 r5655  
    282282    return (name_list_order, value_map)
    283283
     284# Format 4: simple codepoint sets
     285
     286UCD_point_only_regexp = re.compile("^([0-9A-F]{4,6})\s*(?:[#]|$)")
     287UCD_range_only_regexp = re.compile("^([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})(?:[#]|$)")
     288
     289def parse_UCD_codepoint_set(setfile):
     290    cp_set = empty_uset()
     291    f = open(UCD_config.UCD_src_dir + "/" + setfile)
     292    lines = f.readlines()
     293    for t in lines:
     294        if UCD_skip.match(t):
     295            continue  # skip comment and blank lines
     296        m = UCD_point_only_regexp.match(t)
     297        if m:
     298            codepoint = int(m.group(1), 16)
     299            newset = singleton_uset(codepoint)
     300        else:
     301            m = UCD_range_only_regexp.match(t)
     302            if not m: raise Exception("Unknown syntax: %s" % t)
     303            (cp_lo, cp_hi) = (int(m.group(1), 16), int(m.group(2), 16))
     304            newset = range_uset(cp_lo, cp_hi)
     305        cp_set = uset_union(cp_set, newset)
     306    return cp_set
     307
    284308
    285309UnicodeData_txt_regexp = re.compile("^([0-9A-F]{4,6});([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);(.*)$")
    286310
     311NonNameRange_regexp = re.compile("<([^>]*)>")
     312NameRange_regexp = re.compile("<([^,]*), (First|Last)>")
     313
    287314def parse_UnicodeData_txt():
    288    data_records = []
    289    f = open(UCD_config.UCD_src_dir + "/UnicodeData.txt")
    290    lines = f.readlines()
    291    for t in lines:
    292       if UCD_skip.match(t):
    293         continue  # skip comment and blank lines
    294       m = UnicodeData_txt_regexp.match(t)
    295       if not m: raise Exception("Unknown syntax: %s" % t)
    296       (cp, name, gc) = (m.group(1), m.group(2), m.group(3))
    297       (ccc, bidic, decomp, bidim) = (m.group(4), m.group(5), m.group(6), m.group(10))
    298       (decval, digitval, numval) = (m.group(7), m.group(8), m.group(9))
    299       # Unicode 1 name and ISO comment are obsolete
    300       (uc, lc, tc) = (m.group(13), m.group(14), m.group(15))
    301       data_records.append((cp, name, gc, ccc, bidic, decomp, decval, digitval, numval, bidim, uc, lc, tc))
    302    return data_records
     315    data_records = []
     316    range_records = []
     317    name_range_starts = {}
     318    f = open(UCD_config.UCD_src_dir + "/UnicodeData.txt")
     319    lines = f.readlines()
     320    for t in lines:
     321        if UCD_skip.match(t):
     322            continue  # skip comment and blank lines
     323        m = UnicodeData_txt_regexp.match(t)
     324        if not m: raise Exception("Unknown syntax: %s" % t)
     325        (cp, name, gc) = (m.group(1), m.group(2), m.group(3))
     326        (ccc, bidic, decomp, bidim) = (m.group(4), m.group(5), m.group(6), m.group(10))
     327        (decval, digitval, numval) = (m.group(7), m.group(8), m.group(9))
     328        # Unicode 1 name and ISO comment are obsolete
     329        (uc, lc, tc) = (m.group(13), m.group(14), m.group(15))
     330        nonNameMatch = NonNameRange_regexp.match(name)
     331        if nonNameMatch:
     332            rangeMatch = NameRange_regexp.match(name)
     333            if rangeMatch:
     334                rangeName = rangeMatch.group(1)
     335                print(rangeName, rangeMatch.group(2))
     336                if rangeMatch.group(2) == 'First': name_range_starts[rangeName] = cp
     337                if rangeMatch.group(2) == 'Last':
     338                    if not rangeName in name_range_starts: raise Exception("UnicodeData range end encountered without prior range start: %s" % t)
     339                    range_records.append((name_range_starts[rangeName], cp, rangeName, gc, ccc, bidic, decomp, decval, digitval, numval, bidim, uc, lc, tc))
     340            continue
     341        data_records.append((cp, name, gc, ccc, bidic, decomp, decval, digitval, numval, bidim, uc, lc, tc))
     342    return (data_records, range_records)
    303343
    304344#  Parse a decomposition mapping field in one of two forms:
  • icGREP/icgrep-devel/UCD-scripts/UCD_properties.py

    r5653 r5655  
    164164        for v in self.property_value_list['sc']:
    165165            f.write("        /** Code Point Ranges for %s\n        " % v)
    166             f.write(cformat.multiline_fill(['[%s, %s]' % (lo, hi) for (lo, hi) in uset_to_range_list(value_map[v])], ',', 8))
     166            f.write(cformat.multiline_fill(['[%04x, %04x]' % (lo, hi) for (lo, hi) in uset_to_range_list(value_map[v])], ',', 8))
    167167            f.write("**/\n")
    168168            f.write("        const UnicodeSet %s_Ext \n" % v.lower())
     
    180180        self.property_data_headers.append(basename)
    181181
     182
     183    def emit_binary_property(self, f, property_code, property_set):
     184        f.write("    namespace %s_ns {\n" % property_code.upper())
     185        f.write("        /** Code Point Ranges for %s\n        " % property_code)
     186        f.write(cformat.multiline_fill(['[%04x, %04x]' % (lo, hi) for (lo, hi) in uset_to_range_list(property_set)], ',', 8))
     187        f.write("**/\n")
     188        f.write("        const UnicodeSet codepoint_set \n")
     189        f.write(property_set.showC(12) + ";\n")
     190        f.write("        static BinaryPropertyObject property_object{%s, codepoint_set};\n    }\n" % property_code)
     191
    182192    def generate_binary_properties_file(self, filename_root):
    183193        (props, prop_map) = parse_UCD_codepoint_name_map(filename_root + '.txt', self.property_lookup_map)
     
    187197        f.write("\nnamespace UCD {\n")
    188198        for p in sorted(props):
    189             # f.write("  namespace %s_ns {\n    const UnicodeSet codepoint_set \n" % p.upper())
    190             # f.write(prop_map[p].showC(12) + ";\n")
    191             # f.write("    static BinaryPropertyObject property_object{%s, codepoint_set};\n  }\n" % p)
    192             f.write("    namespace %s_ns {\n" % p.upper())
    193             f.write("        /** Code Point Ranges for %s\n        " % p)
    194             f.write(cformat.multiline_fill(['[%s, %s]' % (lo, hi) for (lo, hi) in uset_to_range_list(prop_map[p])], ',', 8))
    195             f.write("**/\n")
    196             f.write("        const UnicodeSet codepoint_set \n")
    197             f.write(prop_map[p].showC(12) + ";\n")
    198             f.write("        static BinaryPropertyObject property_object{%s, codepoint_set};\n    }\n" % p)
     199            self.emit_binary_property(f, p, prop_map[p])
    199200        f.write("}\n\n")
    200201        cformat.close_header_file(f)
     
    204205        self.property_data_headers.append(basename)
    205206
     207    def generate_binary_property_file(self, filename_root, property_code):
     208        prop_map = parse_UCD_codepoint_set(filename_root + '.txt')
     209        basename = os.path.basename(filename_root)
     210        f = cformat.open_header_file_for_write(basename)
     211        cformat.write_imports(f, ['"PropertyAliases.h"', '"unicode_set.h"', "<vector>"])
     212        f.write("\nnamespace UCD {\n")
     213        self.emit_binary_property(f, property_code, prop_map)
     214        f.write("}\n\n")
     215        cformat.close_header_file(f)
     216        print("%s: %s bytes" % (basename, prop_map.bytes()))
     217        self.supported_props += [property_code]
     218        self.binary_properties[property_code] = prop_map
     219        self.property_data_headers.append(basename)
     220
    206221    def generate_PropertyObjectTable_h(self):
    207222        f = cformat.open_header_file_for_write('PropertyObjectTable')
     
    209224        cformat.write_imports(f, ['"%s.h"' % fname for fname in self.property_data_headers])
    210225        f.write("\nnamespace UCD {\n")
    211         f.write("   const std::string UnicodeVersion = \"%s\";\n" % UCD_config.version)
    212226        objlist = []
    213227        for p in self.property_enum_name_list:
     
    226240        cformat.close_header_file(f)
    227241
     242    def generate_UCD_Config_h(self):
     243        setVersionfromReadMe_txt()
     244        f = cformat.open_header_file_for_write('UCD_Config')
     245        f.write("\nnamespace UCD {\n")
     246        f.write("   const std::string UnicodeVersion = \"%s\";\n" % UCD_config.version)
     247        f.write("}\n")
     248        cformat.close_header_file(f)
     249
    228250
    229251
    230252def UCD_main():
    231     setVersionfromReadMe_txt()
    232    
    233253    ucd = UCD_generator()
    234254
     
    308328    ucd.generate_property_value_file('extracted/DerivedBidiClass', 'bc')
    309329
     330    # Indic properties
     331    ucd.generate_property_value_file('IndicPositionalCategory', 'InPC')
     332    ucd.generate_property_value_file('IndicSyllabicCategory', 'InSC')
     333
     334    ucd.generate_binary_property_file('CompositionExclusions', 'CE')
     335
    310336    #
    311337    # Jamo Short Name - AAARGH - property value for 110B is an empty string!!!!!  - Not in PropertyValueAliases.txt
     
    318344    ucd.generate_PropertyObjectTable_h()
    319345
     346    ucd.generate_UCD_Config_h()
     347
    320348if __name__ == "__main__":
    321349  UCD_main()
  • icGREP/icgrep-devel/UCD-scripts/UnicodeNameData.py

    r5642 r5655  
    1616#include "UnicodeNameData.h"
    1717const int Unamesize = %s;
    18 char __attribute__ ((aligned (32))) Unamedata[Unamesize + %s] = "%s";
     18char __attribute__ ((aligned (32))) Unamedata[Unamesize + %s] = R"___(%s)___";
    1919
    2020char * getUnicodeNameDataPtr() {
     
    2626"""
    2727
     28NonName_regexp = re.compile("<[^>]*>")
    2829
    2930def genUnicodeNameData():
    30     parsed_data = parse_UnicodeData_txt()
     31    (parsed_data, ranges) = parse_UnicodeData_txt()
    3132    name_data_string = ""
    3233    name_data_len = 0
    3334    for record in parsed_data:
    3435        (cp, name, gc, ccc, bidic, decomp, decval, digitval, numval, bidim, uc, lc, tc) = record
    35         name_data_string += cp + ";" + name + "\\n"
    36         name_data_len += len(cp) + len(name) + 2
     36        if NonName_regexp.match(name): continue   # Skip codepoints whose name field is not actually a name.
     37        name_data_string += cp + ";" + name + "\n"
     38    # for range_record in ranges:
     39    #     (lo_cp, hi_cp, range_name, gc, ccc, bidic, decomp, decval, digitval, numval, bidim, uc, lc, tc) = range_record
     40    #     print(lo_cp, hi_cp, range_name)
     41    #     if range_name[:13] == "CJK Ideograph":
     42    #         for cp in range(int(lo_cp,16), int(hi_cp,16)):
     43    #             name_data_string += "%04X;CJK UNIFIED IDEOGRAPH-%04X\n" % (cp, cp)
     44    #     elif range_name[:16] == "Tangut Ideograph":
     45    #         for cp in range(int(lo_cp,16), int(hi_cp,16)):
     46    #             name_data_string += "%04X;TANGUT IDEOGRAPH-%04X\n" % (cp, cp)
     47    #     elif range_name[:5] == "Nushu":
     48    #         for cp in range(int(lo_cp,16), int(hi_cp,16)):
     49    #             name_data_string += "%04X;NUSHU CHARACTER-%04X\n" % (cp, cp)
     50    name_data_len = len(name_data_string)
    3751    f = open(UCD_config.UCD_output_dir + '/UnicodeNameData.cpp', 'w')
    3852    f.write(UnicodeNameData_cpp_template % (name_data_len + 1, 255 - (name_data_len % 256), name_data_string))
  • icGREP/icgrep-devel/UCD-scripts/cformat.py

    r5143 r5655  
    44#define %s
    55/*
    6  *  Copyright (c) 2016 International Characters, Inc.
     6 *  Copyright (c) 2017 International Characters, Inc.
    77 *  This software is licensed to the public under the Open Software License 3.0.
    88 *  icgrep is a trademark of International Characters, Inc.
Note: See TracChangeset for help on using the changeset viewer.