Changeset 5662 for icGREP


Ignore:
Timestamp:
Oct 3, 2017, 2:18:24 PM (19 months ago)
Author:
cameron
Message:

UCD parsing improvements; initial support for generating StringPropertyObjects (not yet functional)

Location:
icGREP/icgrep-devel/UCD-scripts
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/UCD-scripts/UCD_parser.py

    r5661 r5662  
    7676    return (property_enum_name_list, property_object_map)
    7777
     78
     79#
     80#  Property Default Value Specifications
     81#
     82#  The UCD uses special comment lines ("@missing specifications") to declare default
     83#  values for properties.   Examples showing the two common formats are:
     84#  (1)  Blocks.txt                    # @missing: 0000..10FFFF; No_Block
     85#  (2)  PropertyValueAliases.txt      # @missing: 0000..10FFFF; Case_Folding; <code point>
     86#  The general format gives a range of codepoints (generally 0000..10FFFF),
     87#  an optional property name (if the file containing the specification defines
     88#  many different properties), and the default value.
     89#
     90#  There are some important default values for different property types:
     91#  <code point>: This is a default value for certain String properties,
     92#                indicating the default for a codepoint under the given property
     93#                is to map to itself.
     94#  <none>:       This is a default for certain String properties indicating that
     95#                the default value for a code point is the empty string.
     96#  <script>:     The default value for the ScriptExtensions property is the
     97#                value of the Script property.
     98#  NaN:          The default value for numeric properties is the NaN (not a number) value.
     99#
     100
     101#  Given a line known to contain such a @missing specification,
     102#  parse_missing_spec(data_line) returns a (cp_lo, cp_hi, fields) triple.
     103#  Generally, cp_lo = 0 and cp_hi = 0x10FFFF
     104#  The list of fields contains one or two entries: an optional
     105#  property name and the default value specified for the range.
     106#  @missing specifications generally omit the property name when
     107#  the file being processed is defined for a single property only.
     108#
     109UCD_missing_check = re.compile("^#\s*@missing:.*")
     110UCD_missing_regexp = re.compile("^#\s*@missing:\s*([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})\s*;\s*([^#]*)(?:#|$)")
     111
     112def parse_missing_spec(data_line):
     113    m = UCD_missing_regexp.match(data_line)
     114    if not m: raise Exception("UCD missing spec parsing error: " + data_line)
     115    cp_lo = int(m.group(1), 16)
     116    cp_hi = int(m.group(2), 16)
     117    # We may have to restructure in the event that missing specs do not cover the full Unicode range.
     118    if cp_lo != 0 or cp_hi != 0x10FFFF: raise Exception("Unexpected range error in missing spec: " + data_line)
     119    field_data = m.group(3)
     120    fields = field_data.split(';')
     121    fields = [f.lstrip().rstrip() for f in fields]
     122    return (cp_lo, cp_hi, fields)
     123
     124#
     125#  Missing specifications and other types of UCD data records often produce
     126#  a list of one or two fields which indicate a property and a value.
     127#
     128#  parse_property_and_value(fields, property_lookup_map) checks that
     129#  the first of the given fields is indeed a property identifier found
     130#  in the given lookup map, and returns a pair consisting of the
     131#  unique property code for the property, plus a corresponding value
     132#  (or None, if only one field was given).
     133
     134def parse_property_and_value(fields, property_lookup_map):
     135    if len(fields) > 2: raise Exception("Too many fields")
     136    if len(fields) == 0: raise Exception("Expecting at least 1 field")
     137    canon = canonicalize(fields[0])
     138    if not canon in property_lookup_map: raise Exception("Unexpected name: " + name_str)
     139    pcode = property_lookup_map[canon]
     140    if len(fields) == 1: return (pcode, None)
     141    else: return (pcode, fields[1])
     142
    78143#
    79144#  UCD Property File Format 2: property value aliases
     
    92157#      non-enumerated types
    93158
     159
    94160def initializePropertyValues(property_object_map, property_lookup_map):
    95161    UCD_property_value_missing_regexp = re.compile("^#\s*@missing:\s*([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})\s*;\s*([-A-Za-z_0-9.]+)\s*;\s*([-A-Za-z_0-9.<> ]+)\s*([^#]*)")
     
    100166    for t in lines:
    101167        if UCD_skip.match(t):
    102             m = UCD_property_value_missing_regexp.match(t)
    103             if m:
    104                 if m.group(1) != '0000' or m.group(2) != '10FFFF': raise Exception("Bad missing spec: " + s)
    105                 cname = canonicalize(m.group(3))
    106                 if not cname in property_lookup_map: raise Exception("Bad missing property: " + s)
    107                 property_object_map[property_lookup_map[cname]].setDefaultValue(m.group(4))
     168            if UCD_missing_check.match(t):
     169                (cp_lo, cp_hi, fields) = parse_missing_spec(t)
     170                (property_code, default_value) = parse_property_and_value(fields, property_lookup_map)
     171                property_object_map[property_code].setDefaultValue(default_value)
    108172            continue  # skip comment and blank lines
    109173        m = UCD_property_value_alias_regexp.match(t)
     
    147211UCD_range_regexp = re.compile("^([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})([^#]*)(?:#|$)")
    148212
     213#
     214# parse_data_record is a generic parser for most of the UCD data files.
     215# Given a data_line beginning with a codepoint or codepoint range,
     216# this function returns a (cp_lo, cp_hi, fields) triple giving the
     217# low and high codepoints of the range (these values may be equal in
     218# the case of a single codepoint), as well as a list of fields.
     219# The semicolon separators are removed as well as leading or trailing
     220# whitespace for each field value.
     221
    149222def parse_data_record(data_line):
    150223    m = UCD_point_regexp.match(data_line)
     
    169242    return (cp_lo, cp_hi, fields)
    170243
    171 UCD_missing_regexp = re.compile("^#\s*@missing:\s*([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})\s*;\s*([^#]*)(?:#|$)")
    172 
    173 def parse_missing_spec(data_line):
    174     m = UCD_missing_regexp.match(data_line)
    175     if not m: raise Exception("UCD missing spec parsing error: " + data_line)
    176     cp_lo = int(m.group(1), 16)
    177     cp_hi = int(m.group(2), 16)
    178     field_data = m.group(3)
    179     fields = field_data.split(';')
    180     fields = [f.lstrip().rstrip() for f in fields]
    181     return (cp_lo, cp_hi, fields)
    182 
    183 def parse_property_and_value(fields, property_lookup_map):
    184     if len(fields) > 2: raise Exception("Too many fields")
    185     if len(fields) == 0: raise Exception("Expecting at least 1 field")
    186     canon = canonicalize(fields[0])
    187     if not canon in property_lookup_map: raise Exception("Unexpected name: " + name_str)
    188     pcode = property_lookup_map[canon]
    189     if len(fields) == 1: return (pcode, None)
    190     else: return (pcode, fields[1])
    191 
     244
     245#  parse_multisection_property_data parses such a file and populates
     246#  the property objects for each property through successive calls to
     247#  the corresponding addDataRecord method.
     248#
    192249def parse_multisection_property_data(pfile, property_object_map, property_lookup_map):
    193250    f = open(UCD_config.UCD_src_dir + "/" + pfile)
     
    214271    return props
    215272
     273
     274#
     275#   Some UCD files are defined for a single property.   
     276#   parse_property_data deals with such a file, given the property
     277#   object to populate and the file root.
     278#
     279
    216280def parse_property_data(property_object, pfile):
    217281    f = open(UCD_config.UCD_src_dir + "/" + pfile)
     
    232296    property_object.finalizeProperty()
    233297
     298
     299#
     300#   Some UCD files are organized to support multiple properties with one
     301#   property per column.
     302#   parse_multicolumn_property_data deals with such files given a list of
     303#   property codes.
     304#
     305
    234306def parse_multicolumn_property_data(pfile, property_object_map, property_lookup_map, prop_code_list):
    235307    f = open(UCD_config.UCD_src_dir + "/" + pfile)
     
    248320        property_object_map[p].finalizeProperty()
    249321
    250 def parse_ScriptExtensions_txt(script_property_object):
    251     filename_root = 'ScriptExtensions'
    252     parse_property_data(script_property_object, filename_root + '.txt')
    253 
    254322UnicodeData_txt_regexp = re.compile("^([0-9A-F]{4,6});([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);(.*)$")
    255323
    256 NonNameRange_regexp = re.compile("<([^>]*)>")
     324NonName_regexp = re.compile("<([^>]*)>")
    257325NameRange_regexp = re.compile("<([^,]*), (First|Last)>")
    258 
    259 def parse_UnicodeData_txt():
    260     data_records = []
    261     range_records = []
    262     name_range_starts = {}
    263     f = open(UCD_config.UCD_src_dir + "/UnicodeData.txt")
    264     lines = f.readlines()
    265     for t in lines:
    266         if UCD_skip.match(t):
    267             continue  # skip comment and blank lines
    268         m = UnicodeData_txt_regexp.match(t)
    269         if not m: raise Exception("Unknown syntax: %s" % t)
    270         (cp, name, gc) = (m.group(1), m.group(2), m.group(3))
    271         (ccc, bidic, decomp, bidim) = (m.group(4), m.group(5), m.group(6), m.group(10))
    272         (decval, digitval, numval) = (m.group(7), m.group(8), m.group(9))
    273         # Unicode 1 name and ISO comment are obsolete
    274         (uc, lc, tc) = (m.group(13), m.group(14), m.group(15))
    275         nonNameMatch = NonNameRange_regexp.match(name)
    276         if nonNameMatch:
    277             rangeMatch = NameRange_regexp.match(name)
    278             if rangeMatch:
    279                 rangeName = rangeMatch.group(1)
    280                 print(rangeName, rangeMatch.group(2))
    281                 if rangeMatch.group(2) == 'First': name_range_starts[rangeName] = cp
    282                 if rangeMatch.group(2) == 'Last':
    283                     if not rangeName in name_range_starts: raise Exception("UnicodeData range end encountered without prior range start: %s" % t)
    284                     range_records.append((name_range_starts[rangeName], cp, rangeName, gc, ccc, bidic, decomp, decval, digitval, numval, bidim, uc, lc, tc))
    285             continue
    286         data_records.append((cp, name, gc, ccc, bidic, decomp, decval, digitval, numval, bidim, uc, lc, tc))
    287     return (data_records, range_records)
    288326
    289327#  Parse a decomposition mapping field in one of two forms:
     
    291329#  (b) canonical mappings:  {codepoint} 
    292330compatibility_regexp = re.compile("^<([^>]*)>\s*([0-9A-F ]*)$")
    293 codepoints_regexp = re.compile("^[0-9A-F]{4,6}(?: +[0-9A-F]{4,6})*$")
    294331def parse_decomposition(s):
    295332    m = compatibility_regexp.match(s)
     
    300337        decomp_type = "Canonical"
    301338        mapping = s
    302     m = codepoints_regexp.match(mapping)
    303     if not m: raise Exception("Bad codepoint string syntax in parse_decomposition: %s" % mapping)
    304     cps = [int(x, 16) for x in mapping.split(" ")]
    305     return (decomp_type, cps)
    306 
     339    return (decomp_type, mapping)
     340
     341def parse_UnicodeData_txt(property_object_map):
     342    data_records = []
     343    range_records = []
     344    name_range_starts = {}
     345    f = open(UCD_config.UCD_src_dir + "/UnicodeData.txt")
     346    lines = f.readlines()
     347    for t in lines:
     348        if UCD_skip.match(t):
     349            continue  # skip comment and blank lines
     350        m = UnicodeData_txt_regexp.match(t)
     351        if not m: raise Exception("Unknown syntax: %s" % t)
     352        (cp, name, gc) = (int(m.group(1), 16), m.group(2), m.group(3))
     353        (ccc, bidic, decomp, bidim) = (m.group(4), m.group(5), m.group(6), m.group(10))
     354        (decval, digitval, numval) = (m.group(7), m.group(8), m.group(9))
     355        # Unicode 1 name and ISO comment are obsolete
     356        (uc, lc, tc) = (m.group(13), m.group(14), m.group(15))
     357        rangeMatch = NameRange_regexp.match(name)
     358        if rangeMatch:
     359            rangeName = rangeMatch.group(1)
     360            print(rangeName, rangeMatch.group(2))
     361            if rangeMatch.group(2) == 'First': name_range_starts[rangeName] = cp
     362            if rangeMatch.group(2) == 'Last':
     363                if not rangeName in name_range_starts: raise Exception("UnicodeData range end encountered without prior range start: %s" % t)
     364                range_records.append((name_range_starts[rangeName], cp, rangeName, gc, ccc, bidic, decomp, decval, digitval, numval, bidim, uc, lc, tc))
     365            continue
     366        if not NonName_regexp.match(name):
     367            property_object_map['na'].addDataRecord(cp, cp, name)
     368        if not decomp == '':
     369            (decomp_type, mapping) = parse_decomposition(decomp)
     370            property_object_map['dm'].addDataRecord(cp, cp, mapping)
     371        if not uc == '':
     372            property_object_map['suc'].addDataRecord(cp, cp, uc)
     373            if tc == '':
     374                property_object_map['stc'].addDataRecord(cp, cp, uc)
     375        if not lc == '':
     376            property_object_map['slc'].addDataRecord(cp, cp, lc)
     377        if not tc == '':
     378            property_object_map['stc'].addDataRecord(cp, cp, tc)
     379    property_object_map['na'].finalizeProperty()
     380    property_object_map['dm'].finalizeProperty()
     381    property_object_map['slc'].finalizeProperty()
     382    property_object_map['suc'].finalizeProperty()
     383    property_object_map['stc'].finalizeProperty()
     384
  • icGREP/icgrep-devel/UCD-scripts/UCD_properties.py

    r5661 r5662  
    4444
    4545
    46 def emit_string_property(f, property_code, null_set, reflexive_set, string_values):
    47     f.write("    namespace %s_ns {\n" % property_code.upper())
    48     f.write("        /** Code Point Ranges for %s mapping to <none> \n        " % property_code)
    49     f.write(cformat.multiline_fill(['[%04x, %04x]' % (lo, hi) for (lo, hi) in uset_to_range_list(null_set)], ',', 8))
    50     f.write("**/\n")
    51     f.write("        const UnicodeSet null_codepoint_set \n")
    52     f.write(null_set.showC(12) + ";\n")
    53     f.write("        /** Code Point Ranges for %s mapping to <codepoint> \n        " % property_code)
    54     f.write(cformat.multiline_fill(['[%04x, %04x]' % (lo, hi) for (lo, hi) in uset_to_range_list(reflexive_set)], ',', 8))
    55     f.write("**/\n")
    56     f.write("        const UnicodeSet reflexive_set \n")
    57     f.write(reflexive_set.showC(12) + ";\n")
    58     f.write("        const unsigned buffer_length = %s;\n" % string_values.len())
    59     f.write("        const char * string_buffer = u8 R\"__(%s)__\";\n")
    60     f.write("        static StringPropertyObject property_object{%s, null_codepoint_set, reflexive_set, string_buffer, buffer_length};\n    }\n" % property_code)
     46def emit_string_property(f, property_code, null_set, reflexive_set, cp_value_map):
     47    s = string.Template(r"""    namespace ${prop_enum_up}_ns {
     48        /** Code Point Ranges for ${prop_enum} mapping to <none>
     49        ${null_set_ranges}**/
     50
     51        const UnicodeSet null_codepoint_set
     52        ${null_set_value};
     53
     54        /** Code Point Ranges for ${prop_enum} mapping to <codepoint>
     55        ${reflexive_set_ranges}**/
     56        const UnicodeSet reflexive_set
     57        ${reflexive_set_value};
     58
     59        const unsigned buffer_length = ${buffer_length};
     60        const static char __attribute__ ((aligned (32))) string_buffer[${allocation_length}] = u8R"__(${string_buffer})__";
     61
     62        const static std::vector<codepoint_t> defined_cps = {
     63        ${explicitly_defined_cps}};
     64        static StringPropertyObject property_object(${prop_enum},
     65                                                    null_codepoint_set,
     66                                                    reflexive_set,
     67                                                    static_cast<const char *>(string_buffer),
     68                                                    buffer_length,
     69                                                    defined_cps);
     70    }
     71""")
     72    cps = sorted(cp_value_map.keys())
     73    string_buffer = ""
     74    for cp in cps:
     75        string_buffer += cp_value_map[cp] + "\n"
     76    buffer_length = len(string_buffer.encode("utf-8"))
     77    f.write(s.substitute(prop_enum = property_code,
     78    prop_enum_up = property_code.upper(),
     79    string_buffer = string_buffer,
     80    buffer_length = buffer_length,
     81    allocation_length = (buffer_length + 255) & -256,
     82    null_set_ranges = cformat.multiline_fill(['[%04x, %04x]' % (lo, hi) for (lo, hi) in uset_to_range_list(null_set)], ',', 8),
     83    null_set_value = null_set.showC(12),
     84    reflexive_set_ranges = cformat.multiline_fill(['[%04x, %04x]' % (lo, hi) for (lo, hi) in uset_to_range_list(reflexive_set)], ',', 8),
     85    reflexive_set_value = reflexive_set.showC(12),
     86    explicitly_defined_cp_count = len(cps),
     87    explicitly_defined_cps = cformat.multiline_fill(['0x%04x' % cp for cp in cps], ',', 8)
     88    ))
    6189
    6290
     
    98126
    99127    def load_property_name_info(self):
    100         #(self.property_enum_name_list, self.full_name_map, self.property_lookup_map, self.property_kind_map) = parse_PropertyAlias_txt()
    101128        (self.property_enum_name_list, self.property_object_map) = parse_PropertyAlias_txt()
    102129        self.property_lookup_map = getPropertyLookupMap(self.property_object_map)
     
    169196            emit_enumerated_property(f, property_code, independent_prop_values, prop_values, property_object.value_map)
    170197            print("%s: %s bytes" % (property_object.getPropertyFullName(), sum([property_object.value_map[v].bytes() for v in property_object.value_map.keys()])))
    171         #elif isinstance(property_object, StringPropertyObject):
    172         #    emit_string_property(f, property_code, property_object.value_map)
     198        elif isinstance(property_object, StringPropertyObject):
     199            emit_string_property(f, property_code, property_object.null_str_set, property_object.reflexive_set, property_object.cp_value_map)
    173200
    174201    def generate_property_value_file(self, filename_root, property_code):
     
    215242        self.property_data_headers.append(basename)
    216243
     244    def generate_UnicodeData_h(self):
     245        basename = 'UnicodeData'
     246        parse_UnicodeData_txt(self.property_object_map)
     247        f = cformat.open_header_file_for_write(basename)
     248        cformat.write_imports(f, ['"PropertyAliases.h"', '"PropertyObjects.h"', '"PropertyValueAliases.h"', '"unicode_set.h"'])
     249        prop_code_list = ['na', 'dm', 'suc', 'slc', 'stc']
     250        f.write("\nnamespace UCD {\n")
     251        for p in prop_code_list:
     252            self.emit_property(f, p)
     253            property_object = self.property_object_map[p]
     254            self.supported_props.append(p)
     255        f.write("}\n\n")
     256        cformat.close_header_file(f)
     257        self.property_data_headers.append(basename)
     258
    217259    def generate_ScriptExtensions_h(self):
    218260        filename_root = 'ScriptExtensions'
     
    220262        extension_object = self.property_object_map['scx']
    221263        extension_object.setBaseProperty(self.property_object_map['sc'])
    222         parse_ScriptExtensions_txt(extension_object)
     264        parse_property_data(extension_object, filename_root+'.txt')
    223265        basename = os.path.basename(filename_root)
    224266        f = cformat.open_header_file_for_write(basename)
     
    289331    # Next parse all property value names and their aliases.  Generate the data.
    290332    ucd.load_property_value_info()
     333
     334    ucd.generate_UnicodeData_h()
    291335    #
    292336    # The Age property
  • icGREP/icgrep-devel/UCD-scripts/UCD_property_objects.py

    r5659 r5662  
    118118        if not enum_code in self.name_list_order: self.name_list_order.append(enum_code)
    119119
    120     def emit():
    121         f.write("\nnamespace UCD {\n")
    122         f.write("  namespace %s_ns {\n" % self.property_code.upper())
    123         #f.write("    const unsigned independent_prop_values = %s;\n" % self.independent_prop_values)
    124         for v in self.property_values:
    125             f.write("    /** Code Point Ranges for %s\n    " % v)
    126             f.write(cformat.multiline_fill(['[%04x, %04x]' % (lo, hi) for (lo, hi) in uset_to_range_list(value_map[v])], ',', 4))
    127             f.write("**/\n")
    128             f.write("    const UnicodeSet %s_Set \n" % v.lower())
    129             f.write(self.value_map[v].showC(8) + ";\n")
    130         print("%s: %s bytes" % (basename, sum([self.value_map[v].bytes() for v in self.value_map.keys()])))
    131         set_list = ['&%s_Set' % v.lower() for v in self.property_values]
    132         f.write("    static EnumeratedPropertyObject property_object\n")
    133         f.write("        {%s,\n" % self.property_code)
    134         f.write("         %s_ns::independent_prop_values,\n" % self.property_code.upper())
    135         f.write("         %s_ns::enum_names,\n" % self.property_code.upper())
    136         f.write("         %s_ns::value_names,\n" % self.property_code.upper())
    137         f.write("         %s_ns::aliases_only_map,\n" % self.property_code.upper())
    138         f.write("         {")
    139         f.write(cformat.multiline_fill(set_list, ',', 8))
    140         f.write("\n         }};\n    }\n}\n")
    141 
    142120class BinaryPropertyObject(PropertyObject):
    143121    def __init__(self):
     
    197175                self.value_map[k] = uset_union(self.value_map[k], base_set)
    198176
     177codepoint_String_regexp = re.compile("^[A-F0-9]{4,6}(?: [A-F0-9]{4,6})*$")
    199178class StringPropertyObject(PropertyObject):
    200179    def __init__(self):
    201180        PropertyObject.__init__(self)
    202         self.str_value_map = {}
    203 
     181        self.cp_value_map = {}
     182        self.null_str_set = empty_uset()
     183        self.reflexive_set = empty_uset()
     184       
    204185    def getPropertyKind(self):
    205186        if self.property_code in ['scf', 'slc', 'suc', 'stc']:
     
    209190
    210191    def addDataRecord(self, cp_lo, cp_hi, stringValue):
    211         if not self.property_code in ['na', 'JSN', 'na1', 'isc'] and stringValue != '':
    212             s = ""
    213             for cp in [int(x, 16) for x in stringValue.split(' ')]:
    214                 s+= chr(cp)
    215             stringValue = s
    216         for cp in range(cp_lo, cp_hi+1):
    217             self.str_value_map[cp] = stringValue
     192        if stringValue == '':
     193            self.null_str_set = uset_union(self.null_str_set, range_uset(cp_lo, cp_hi))
     194        else:
     195            if codepoint_String_regexp.match(stringValue):
     196                s = ""
     197                for cp in [int(x, 16) for x in stringValue.split(' ')]:
     198                    s += chr(cp)
     199                stringValue = s
     200            for cp in range(cp_lo, cp_hi+1):
     201                if len(stringValue) == 1 and ord(stringValue[0]) == cp:
     202                    print("Found reflexive entry for %s: %s" % (self.property_code, stringValue))
     203                    self.reflexive_set = uset_union(self.reflexive_set, singleton_uset(ord(stringValue[0])))
     204                else:
     205                    self.cp_value_map[cp] = stringValue
     206
     207    def finalizeProperty(self):
     208        explicitly_defined_cps = empty_uset()
     209        for cp in self.cp_value_map.keys():
     210            explicitly_defined_cps = uset_union(explicitly_defined_cps, singleton_uset(cp))
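     211        # apply the declared default: code points with no explicit entry map to themselves when the default is <code point>, otherwise to the empty string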
     212        if self.default_value == "<code point>":
     213            self.reflexive_set = uset_union(self.reflexive_set, uset_complement(uset_union(explicitly_defined_cps, self.null_str_set)))
     214        else:
     215            self.null_str_set = uset_union(self.null_str_set, uset_complement(uset_union(explicitly_defined_cps, self.reflexive_set)))
    218216
    219217def getPropertyLookupMap(property_object_map):
Note: See TracChangeset for help on using the changeset viewer.