Ignore:
Timestamp:
Sep 28, 2017, 2:11:38 PM (2 years ago)
Author:
cameron
Message:

Parse Unicode version; parse decomposition mapping fields

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/UCD-scripts/UCD_parser.py

    r5155 r5652  
    1111import UCD_config
    1212from unicode_set import *
     13
     14version_regexp = re.compile(".*Version\s+([0-9.]*)\s+of the Unicode Standard.*")
     15
     16def setVersionfromReadMe_txt():
     17    f = open(UCD_config.UCD_src_dir + "/" + 'ReadMe.txt')
     18    lines = f.readlines()
     19    for t in lines:
     20        m = version_regexp.match(t)
     21        if m:
     22            UCD_config.version = m.group(1)
     23            print "Version %s" % m.group(1)
    1324
    1425trivial_name_char_re = re.compile('[-_\s]')
     
    289300   return data_records
    290301
     302#  Parse a decomposition mapping field in one of two forms:
     303#  (a) compatibility mappings:  "<" decomp_type:[A-Za-z]* ">" {codepoint}
     304#  (b) canonical mappings:  {codepoint} 
     305compatibility_regexp = re.compile("^<([^>]*)>\s*([0-9A-F ]*)$")
     306codepoints_regexp = re.compile("^[0-9A-F]{4,6}(?: +[0-9A-F]{4,6})*$")
     307def parse_decomposition(s):
     308    m = compatibility_regexp.match(s)
     309    if m:
     310        decomp_type = m.group(1)
     311        mapping = m.group(2)
     312    else:
     313        decomp_type = "Canonical"
     314        mapping = s
     315    m = codepoints_regexp.match(mapping)
     316    if not m: raise Exception("Bad codepoint string syntax in parse_decomposition: %s" % mapping)
     317    cps = [int(x, 16) for x in mapping.split(" ")]
     318    return (decomp_type, cps)
     319
Note: See TracChangeset for help on using the changeset viewer.