Ignore:
Timestamp:
Oct 3, 2017, 2:18:24 PM (2 years ago)
Author:
cameron
Message:

UCD parsing improvements; initial support for generating StringPropertyObjects? - not yet functional

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/UCD-scripts/UCD_parser.py

    r5661 r5662  
    7676    return (property_enum_name_list, property_object_map)
    7777
     78
     79#
     80#  Property Default Value Specifications
     81#
     82#  The UCD uses special comment lines ("@missing specifications") to declare default
     83#  values for properties.   Examples showing the two common formats are:
     84#  (1)  Blocks.txt                    # @missing: 0000..10FFFF; No_Block
     85#  (2)  PropertyValueAliases.txt      # @missing: 0000..10FFFF; Case_Folding; <code point>
     86#  The general format gives a range of codepoints (generally 0000..10FFFF),
     87#  an optional property name (if the file containing the specification defines
     88#  many different properties), and the default value.
     89#
     90#  There are some important default values for different property types:
     91#  <code point>: This is a default value for certain String properties,
     92#                indicating the default for a codepoint under the given property
     93#                is to map to itself.
     94#  <none>:       This is a default for certain String properties indicating that
     95#                the default value for a code point is the empty string.
     96#  <script>:     The default value for the ScriptExtensions property is the
     97#                value of the Script property.
     98#  NaN:          The default value for numeric properties is the NaN (not a number) value.
     99#
     100
     101#  Given a line known to contain such a @missing specification,
     102#  parse_missing_spec(data_line) returns a (cp_lo, cp_hi, fields) triple.
     103#  Generally, cp_lo = 0 and cp_hi = 0x10FFFF
     104#  The list of fields contains one or two entries: an optional
     105#  property name and the default value specified for the range.
     106#  @missing specifications generally omit the property name when
     107#  the file being processed is defined for a single property only.
     108#
     109UCD_missing_check = re.compile("^#\s*@missing:.*")
     110UCD_missing_regexp = re.compile("^#\s*@missing:\s*([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})\s*;\s*([^#]*)(?:#|$)")
     111
     112def parse_missing_spec(data_line):
     113    m = UCD_missing_regexp.match(data_line)
     114    if not m: raise Exception("UCD missing spec parsing error: " + data_line)
     115    cp_lo = int(m.group(1), 16)
     116    cp_hi = int(m.group(2), 16)
     117    # We may have to restructure in the event that missing specs do not cover the full Unicode range.
     118    if cp_lo != 0 or cp_hi != 0x10FFFF: raise Exception("Unexpected range error in missing spec: " + data_line)
     119    field_data = m.group(3)
     120    fields = field_data.split(';')
     121    fields = [f.lstrip().rstrip() for f in fields]
     122    return (cp_lo, cp_hi, fields)
     123
     124#
     125#  Missing specifications and other types of UCD data records often produce
     126#  a list of one or two fields which indicate a property and a value.
     127#
     128#  parse_property_and_value(fields, property_lookup_map) checks that
     129#  the first of the given fields is indeed a property identifier found
     130#  in the given lookup map, and returns a pair consisting of the
     131#  unique property code for the property, plus a corresponding value
     132#  (or None, if only one field was given).
     133
     134def parse_property_and_value(fields, property_lookup_map):
     135    if len(fields) > 2: raise Exception("Too many fields")
     136    if len(fields) == 0: raise Exception("Expecting at least 1 field")
     137    canon = canonicalize(fields[0])
     138    if not canon in property_lookup_map: raise Exception("Unexpected name: " + name_str)
     139    pcode = property_lookup_map[canon]
     140    if len(fields) == 1: return (pcode, None)
     141    else: return (pcode, fields[1])
     142
    78143#
    79144#  UCD Property File Format 2: property value aliases
     
    92157#      non-enumerated types
    93158
     159
    94160def initializePropertyValues(property_object_map, property_lookup_map):
    95161    UCD_property_value_missing_regexp = re.compile("^#\s*@missing:\s*([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})\s*;\s*([-A-Za-z_0-9.]+)\s*;\s*([-A-Za-z_0-9.<> ]+)\s*([^#]*)")
     
    100166    for t in lines:
    101167        if UCD_skip.match(t):
    102             m = UCD_property_value_missing_regexp.match(t)
    103             if m:
    104                 if m.group(1) != '0000' or m.group(2) != '10FFFF': raise Exception("Bad missing spec: " + s)
    105                 cname = canonicalize(m.group(3))
    106                 if not cname in property_lookup_map: raise Exception("Bad missing property: " + s)
    107                 property_object_map[property_lookup_map[cname]].setDefaultValue(m.group(4))
     168            if UCD_missing_check.match(t):
     169                (cp_lo, cp_hi, fields) = parse_missing_spec(t)
     170                (property_code, default_value) = parse_property_and_value(fields, property_lookup_map)
     171                property_object_map[property_code].setDefaultValue(default_value)
    108172            continue  # skip comment and blank lines
    109173        m = UCD_property_value_alias_regexp.match(t)
     
    147211UCD_range_regexp = re.compile("^([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})([^#]*)(?:#|$)")
    148212
     213#
     214# parse_data_record is a generic parser for most of the UCD data files.
     215# Given a data_line beginning with a codepoint or codepoint range,
     216# this function returns a (cp_lo, cp_hi, fields) triple giving the
     217# low and high codepoints of the range (these values may be equal in
     218# the case of a single codepoint), as well as a list of fields.
     219# The semicolon separators are removed as well as leading or trailing
     220# whitespace for each field value.
     221
    149222def parse_data_record(data_line):
    150223    m = UCD_point_regexp.match(data_line)
     
    169242    return (cp_lo, cp_hi, fields)
    170243
    171 UCD_missing_regexp = re.compile("^#\s*@missing:\s*([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})\s*;\s*([^#]*)(?:#|$)")
    172 
    173 def parse_missing_spec(data_line):
    174     m = UCD_missing_regexp.match(data_line)
    175     if not m: raise Exception("UCD missing spec parsing error: " + data_line)
    176     cp_lo = int(m.group(1), 16)
    177     cp_hi = int(m.group(2), 16)
    178     field_data = m.group(3)
    179     fields = field_data.split(';')
    180     fields = [f.lstrip().rstrip() for f in fields]
    181     return (cp_lo, cp_hi, fields)
    182 
    183 def parse_property_and_value(fields, property_lookup_map):
    184     if len(fields) > 2: raise Exception("Too many fields")
    185     if len(fields) == 0: raise Exception("Expecting at least 1 field")
    186     canon = canonicalize(fields[0])
    187     if not canon in property_lookup_map: raise Exception("Unexpected name: " + name_str)
    188     pcode = property_lookup_map[canon]
    189     if len(fields) == 1: return (pcode, None)
    190     else: return (pcode, fields[1])
    191 
     244
     245#  parse_multisection_property_data parses such a file and populates
     246#  the property objects for each property through successive calls to
     247#  the corresponding addDataRecord method.
     248#
    192249def parse_multisection_property_data(pfile, property_object_map, property_lookup_map):
    193250    f = open(UCD_config.UCD_src_dir + "/" + pfile)
     
    214271    return props
    215272
     273
     274#
     275#   Some UCD files are defined for a single property.   
     276#   parse_property_data deals with such a file, given the property
     277#   object to populate and the file root.
     278#
     279
    216280def parse_property_data(property_object, pfile):
    217281    f = open(UCD_config.UCD_src_dir + "/" + pfile)
     
    232296    property_object.finalizeProperty()
    233297
     298
     299#
     300#   Some UCD files are organized to support multiple properties with one
     301#   property per column.
     302#   parse_multicolumn_property_data deals with such files given a list of
     303#   property codes.
     304#
     305
    234306def parse_multicolumn_property_data(pfile, property_object_map, property_lookup_map, prop_code_list):
    235307    f = open(UCD_config.UCD_src_dir + "/" + pfile)
     
    248320        property_object_map[p].finalizeProperty()
    249321
    250 def parse_ScriptExtensions_txt(script_property_object):
    251     filename_root = 'ScriptExtensions'
    252     parse_property_data(script_property_object, filename_root + '.txt')
    253 
    254322UnicodeData_txt_regexp = re.compile("^([0-9A-F]{4,6});([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);(.*)$")
    255323
    256 NonNameRange_regexp = re.compile("<([^>]*)>")
     324NonName_regexp = re.compile("<([^>]*)>")
    257325NameRange_regexp = re.compile("<([^,]*), (First|Last)>")
    258 
    259 def parse_UnicodeData_txt():
    260     data_records = []
    261     range_records = []
    262     name_range_starts = {}
    263     f = open(UCD_config.UCD_src_dir + "/UnicodeData.txt")
    264     lines = f.readlines()
    265     for t in lines:
    266         if UCD_skip.match(t):
    267             continue  # skip comment and blank lines
    268         m = UnicodeData_txt_regexp.match(t)
    269         if not m: raise Exception("Unknown syntax: %s" % t)
    270         (cp, name, gc) = (m.group(1), m.group(2), m.group(3))
    271         (ccc, bidic, decomp, bidim) = (m.group(4), m.group(5), m.group(6), m.group(10))
    272         (decval, digitval, numval) = (m.group(7), m.group(8), m.group(9))
    273         # Unicode 1 name and ISO comment are obolete
    274         (uc, lc, tc) = (m.group(13), m.group(14), m.group(15))
    275         nonNameMatch = NonNameRange_regexp.match(name)
    276         if nonNameMatch:
    277             rangeMatch = NameRange_regexp.match(name)
    278             if rangeMatch:
    279                 rangeName = rangeMatch.group(1)
    280                 print(rangeName, rangeMatch.group(2))
    281                 if rangeMatch.group(2) == 'First': name_range_starts[rangeName] = cp
    282                 if rangeMatch.group(2) == 'Last':
    283                     if not rangeName in name_range_starts: raise Exception("UnicodeData range end encountered without prior range start: %s" % t)
    284                     range_records.append((name_range_starts[rangeName], cp, rangeName, gc, ccc, bidic, decomp, decval, digitval, numval, bidim, uc, lc, tc))
    285             continue
    286         data_records.append((cp, name, gc, ccc, bidic, decomp, decval, digitval, numval, bidim, uc, lc, tc))
    287     return (data_records, range_records)
    288326
    289327#  Parse a decomposition mapping field in one of two forms:
     
    291329#  (b) canonical mappings:  {codepoint} 
    292330compatibility_regexp = re.compile("^<([^>]*)>\s*([0-9A-F ]*)$")
    293 codepoints_regexp = re.compile("^[0-9A-F]{4,6}(?: +[0-9A-F]{4,6})*$")
    294331def parse_decomposition(s):
    295332    m = compatibility_regexp.match(s)
     
    300337        decomp_type = "Canonical"
    301338        mapping = s
    302     m = codepoints_regexp.match(mapping)
    303     if not m: raise Exception("Bad codepoint string syntax in parse_decomposition: %s" % mapping)
    304     cps = [int(x, 16) for x in mapping.split(" ")]
    305     return (decomp_type, cps)
    306 
     339    return (decomp_type, mapping)
     340
     341def parse_UnicodeData_txt(property_object_map):
     342    data_records = []
     343    range_records = []
     344    name_range_starts = {}
     345    f = open(UCD_config.UCD_src_dir + "/UnicodeData.txt")
     346    lines = f.readlines()
     347    for t in lines:
     348        if UCD_skip.match(t):
     349            continue  # skip comment and blank lines
     350        m = UnicodeData_txt_regexp.match(t)
     351        if not m: raise Exception("Unknown syntax: %s" % t)
     352        (cp, name, gc) = (int(m.group(1), 16), m.group(2), m.group(3))
     353        (ccc, bidic, decomp, bidim) = (m.group(4), m.group(5), m.group(6), m.group(10))
     354        (decval, digitval, numval) = (m.group(7), m.group(8), m.group(9))
     355        # Unicode 1 name and ISO comment are obolete
     356        (uc, lc, tc) = (m.group(13), m.group(14), m.group(15))
     357        rangeMatch = NameRange_regexp.match(name)
     358        if rangeMatch:
     359            rangeName = rangeMatch.group(1)
     360            print(rangeName, rangeMatch.group(2))
     361            if rangeMatch.group(2) == 'First': name_range_starts[rangeName] = cp
     362            if rangeMatch.group(2) == 'Last':
     363                if not rangeName in name_range_starts: raise Exception("UnicodeData range end encountered without prior range start: %s" % t)
     364                range_records.append((name_range_starts[rangeName], cp, rangeName, gc, ccc, bidic, decomp, decval, digitval, numval, bidim, uc, lc, tc))
     365            continue
     366        if not NonName_regexp.match(name):
     367            property_object_map['na'].addDataRecord(cp, cp, name)
     368        if not decomp == '':
     369            (decomp_type, mapping) = parse_decomposition(decomp)
     370            property_object_map['dm'].addDataRecord(cp, cp, mapping)
     371        if not uc == '':
     372            property_object_map['suc'].addDataRecord(cp, cp, uc)
     373            if tc == '':
     374                property_object_map['stc'].addDataRecord(cp, cp, uc)
     375        if not lc == '':
     376            property_object_map['slc'].addDataRecord(cp, cp, lc)
     377        if not tc == '':
     378            property_object_map['stc'].addDataRecord(cp, cp, tc)
     379    property_object_map['na'].finalizeProperty()
     380    property_object_map['dm'].finalizeProperty()
     381    property_object_map['slc'].finalizeProperty()
     382    property_object_map['suc'].finalizeProperty()
     383    property_object_map['stc'].finalizeProperty()
     384
Note: See TracChangeset for help on using the changeset viewer.