Ignore:
Timestamp:
Oct 6, 2017, 11:36:55 AM (22 months ago)
Author:
cameron
Message:

StringOverride properties (simple case conversion vs. full case conversion)

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/UCD-scripts/casefold.py

    r5653 r5672  
    1313import UCD_config
    1414from unicode_set import *
    15 
    16 
    17 
    18 #
    19 #  Processing files of the UCD
    20 #
    21 #  General format for skippable comments, blank lines
    22 UCD_skip = re.compile("^#.*$|^\s*$")
    23 
    24 #
    25 #  UCD Property File Format 4: property aliases
    26 #  PropertyAliases.txt
    27 #
    28 UCD_case_fold_regexp = re.compile("^([0-9A-F]{4,6})\s*;\s*([CSFT]);\s*((?:[-A-Za-z0-9_]+\s+)*[-A-Za-z0-9_]+)\s*(?:[;#]|$)")
    29 
    30 def parse_CaseFolding_txt():
    31    fold_type = {}
    32    fold_value = {}
    33    f = open(UCD_config.UCD_src_dir + "/" + 'CaseFolding.txt')
    34    lines = f.readlines()
    35    for t in lines:
    36       if UCD_skip.match(t): continue  # skip comment and blank lines
    37       m = UCD_case_fold_regexp.match(t)
    38       if not m: raise Exception("Unknown case fold syntax: %s" % t)
    39       codepoint = int(m.group(1), 16)
    40       fold_t = m.group(2)
    41       fold_type[codepoint] = fold_t
    42       fold_val = m.group(3)
    43       if fold_t == 'T':
    44          print("Skipping Turkic entry")
    45          continue  # skip Turkic
    46       if fold_t == 'F':
    47           fold_val = [int(x, 16) for x in fold_val.split(" ")]
    48       else:
    49           fold_val = int(fold_val, 16)
    50       if codepoint in fold_value: fold_value[codepoint].append(fold_val)
    51       else: fold_value[codepoint] = [fold_val]
    52    return (fold_type, fold_value)
    53 
     15from UCD_parser import parse_CaseFolding_txt
    5416
    5517def simple_CaseFolding_BitSets(fold_map):
     
    7638   return BitDiffSet
    7739
    78 def simple_CaseClosure_map(fold_map):
     40def simple_CaseClosure_map(fold_data):
     41   simpleFoldMap = {}
     42   for k in fold_data['S'].keys(): simpleFoldMap[k] = fold_data['S'][k]
     43   for k in fold_data['C'].keys(): simpleFoldMap[k] = fold_data['C'][k]
    7944   cl_map = {}
    80    for k in fold_map.keys():
    81       folds = fold_map[k]
    82       for v in folds:
    83         if not isinstance(v, int): continue # skip nonsimple case folds
    84         if not v in cl_map: cl_map[v] = [k]
    85         else: cl_map[v].append(k)
    86         if not k in cl_map: cl_map[k] = [v]
    87         else: cl_map[k].append(v)
     45   for k in simpleFoldMap.keys():
     46      v = simpleFoldMap[k]
     47      if not v in cl_map: cl_map[v] = [k]
     48      else: cl_map[v].append(k)
     49      if not k in cl_map: cl_map[k] = [v]
     50      else: cl_map[k].append(v)
    8851   newEntries = True
    8952   while newEntries:
     
    188151
    189152def genCaseFolding_txt_h():
    190    (ft, fv) = parse_CaseFolding_txt()
    191    cm = simple_CaseClosure_map(fv)
     153   fold_data = parse_CaseFolding_txt()
     154   cm = simple_CaseClosure_map(fold_data)
    192155   f = cformat.open_header_file_for_write('CaseFolding_txt', 'casefold.py')
    193156   cformat.write_imports(f, ["<vector>", '"re/re_cc.h"'])
Note: See TracChangeset for help on using the changeset viewer.