Ignore:
Timestamp:
Oct 6, 2017, 11:36:55 AM (2 years ago)
Author:
cameron
Message:

StringOverride properties (simple case conversion vs. full case conversion)

Location:
icGREP/icgrep-devel/UCD-scripts
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/UCD-scripts/UCD_parser.py

    r5671 r5672  
    6767            property_object_map[property_code] = EnumeratedPropertyObject()
    6868        elif property_kind == "String":
    69             property_object_map[property_code] = StringPropertyObject()
     69            if property_code in ["uc", "lc", "tc", "cf"]:
     70                property_object_map[property_code] = StringOverridePropertyObject("s" + property_code)
     71            else:
     72                property_object_map[property_code] = StringPropertyObject()
    7073        elif property_kind == "Numeric":
    7174            property_object_map[property_code] = NumericPropertyObject()
     
    398401
    399402def parse_SpecialCasing_txt(property_object_map):
    400     data_records = []
    401403    f = open(UCD_config.UCD_src_dir + "/SpecialCasing.txt")
    402404    lines = f.readlines()
     
    418420    property_object_map['tc'].finalizeProperty()
    419421
     422
     423# CaseFolding.txt has four types of fold entries:
     424# S, C, F, T:  Simple, Common, Full and Turkic. 
     425# The SimpleCaseFold property is the set of mappings S+C,
     426# The FullCaseFold property is the set F+C
     427# There may be multiple entries per codepoint
     428
     429def parse_CaseFolding_txt():
     430    fold_map = {}
     431    f = open(UCD_config.UCD_src_dir + "/" + 'CaseFolding.txt')
     432    lines = f.readlines()
     433    for t in lines:
     434        if UCD_skip.match(t): continue  # skip comment and blank lines
     435        (cp, cp_hi, fields) = parse_data_record(t)
     436        (fold_type, fold_val) = (fields[0], fields[1])
     437        if not fold_type in fold_map: fold_map[fold_type] = {}
     438        if fold_type == 'S' or fold_type == 'C':
     439            # fold value is guaranteed to be a single codepoint
     440            fold_val = int(fold_val, 16)
     441        else:
     442            fold_val = [int(x, 16) for x in fold_val.split(" ")]
     443        fold_map[fold_type][cp] = fold_val
     444    return fold_map
     445
  • icGREP/icgrep-devel/UCD-scripts/UCD_properties.py

    r5671 r5672  
    8181    reflexive_set_ranges = cformat.multiline_fill(['[%04x, %04x]' % (lo, hi) for (lo, hi) in uset_to_range_list(reflexive_set)], ',', 8),
    8282    reflexive_set_value = reflexive_set.showC(12),
     83    explicitly_defined_cp_count = len(cps),
     84    explicitly_defined_cps = cformat.multiline_fill(['0x%04x' % cp for cp in cps], ',', 8)
     85    ))
     86
     87def emit_string_override_property(f, property_code, overridden_code, override_set, cp_value_map):
     88    s = string.Template(r"""    namespace ${prop_enum_up}_ns {
     89        /** Code Point Ranges for ${prop_enum} overriding values from ${overridden}
     90        ${overridden_set_ranges}**/
     91
     92        const UnicodeSet overridden_set
     93        ${overridden_set_value};
     94
     95        const unsigned buffer_length = ${buffer_length};
     96        const static char __attribute__ ((aligned (32))) string_buffer[${allocation_length}] = u8R"__(${string_buffer})__";
     97
     98        const static std::vector<codepoint_t> defined_cps = {
     99        ${explicitly_defined_cps}};
     100        static StringOverridePropertyObject property_object(${prop_enum},
     101                                                    ${overridden}_ns::property_object,
     102                                                    overridden_set,
     103                                                    static_cast<const char *>(string_buffer),
     104                                                    buffer_length,
     105                                                    defined_cps);
     106    }
     107""")
     108    cps = sorted(cp_value_map.keys())
     109    string_buffer = ""
     110    for cp in cps:
     111        string_buffer += cp_value_map[cp] + "\n"
     112    buffer_length = len(string_buffer.encode("utf-8"))
     113    f.write(s.substitute(prop_enum = property_code,
     114    prop_enum_up = property_code.upper(),
     115    overridden = overridden_code.upper(),
     116    string_buffer = string_buffer,
     117    buffer_length = buffer_length,
     118    allocation_length = (buffer_length + 255) & -256,
     119    overridden_set_ranges = cformat.multiline_fill(['[%04x, %04x]' % (lo, hi) for (lo, hi) in uset_to_range_list(override_set)], ',', 8),
     120    overridden_set_value = override_set.showC(12),
    83121    explicitly_defined_cp_count = len(cps),
    84122    explicitly_defined_cps = cformat.multiline_fill(['0x%04x' % cp for cp in cps], ',', 8)
     
    239277        elif isinstance(property_object, StringPropertyObject):
    240278            emit_string_property(f, property_code, property_object.null_str_set, property_object.reflexive_set, property_object.cp_value_map)
     279        elif isinstance(property_object, StringOverridePropertyObject):
     280            emit_string_override_property(f, property_code, property_object.overridden_code, property_object.overridden_set, property_object.cp_value_map)
    241281        elif isinstance(property_object, NumericPropertyObject):
    242282            emit_numeric_property(f, property_code, property_object.NaN_set, property_object.cp_value_map)
     
    305345        parse_SpecialCasing_txt(self.property_object_map)
    306346        f = cformat.open_header_file_for_write(basename)
    307         cformat.write_imports(f, ['"PropertyAliases.h"', '"PropertyObjects.h"', '"PropertyValueAliases.h"', '"unicode_set.h"'])
     347        cformat.write_imports(f, ['"PropertyAliases.h"', '"PropertyObjects.h"', '"PropertyValueAliases.h"', '"UnicodeData.h"', '"unicode_set.h"'])
    308348        f.write("\nnamespace UCD {\n")
    309349        for p in ['lc', 'uc', 'tc']:
  • icGREP/icgrep-devel/UCD-scripts/UCD_property_objects.py

    r5670 r5672  
    231231            self.null_str_set = uset_union(self.null_str_set, uset_complement(uset_union(explicitly_defined_cps, self.reflexive_set)))
    232232
     233class StringOverridePropertyObject(PropertyObject):
     234    def __init__(self, overridden_code):
     235        PropertyObject.__init__(self)
     236        self.cp_value_map = {}
     237        self.overridden_code = overridden_code
     238        self.overridden_set = empty_uset()
     239       
     240    def getPropertyKind(self):
     241        return "StringOverride"
     242
     243    def addDataRecord(self, cp_lo, cp_hi, stringValue):
     244        if codepoint_String_regexp.match(stringValue):
     245            s = ""
     246            for cp in [int(x, 16) for x in stringValue.split(' ')]:
     247                s += chr(cp)
     248            stringValue = s
     249        else:
     250            raise Exception("Expecting codepoint string, but got " + stringValue)
     251        self.cp_value_map[cp] = stringValue
     252
     253    def finalizeProperty(self):
     254        explicitly_defined_cps = empty_uset()
     255        for cp in self.cp_value_map.keys():
     256            explicitly_defined_cps = uset_union(explicitly_defined_cps, singleton_uset(cp))
     257        self.overridden_set = explicitly_defined_cps
     258
    233259class ObsoletePropertyObject(PropertyObject):
    234260    def __init__(self):
  • icGREP/icgrep-devel/UCD-scripts/casefold.py

    r5653 r5672  
    1313import UCD_config
    1414from unicode_set import *
    15 
    16 
    17 
    18 #
    19 #  Processing files of the UCD
    20 #
    21 #  General format for skippable comments, blank lines
    22 UCD_skip = re.compile("^#.*$|^\s*$")
    23 
    24 #
    25 #  UCD Property File Format 4: property aliases
    26 #  PropertyAliases.txt
    27 #
    28 UCD_case_fold_regexp = re.compile("^([0-9A-F]{4,6})\s*;\s*([CSFT]);\s*((?:[-A-Za-z0-9_]+\s+)*[-A-Za-z0-9_]+)\s*(?:[;#]|$)")
    29 
    30 def parse_CaseFolding_txt():
    31    fold_type = {}
    32    fold_value = {}
    33    f = open(UCD_config.UCD_src_dir + "/" + 'CaseFolding.txt')
    34    lines = f.readlines()
    35    for t in lines:
    36       if UCD_skip.match(t): continue  # skip comment and blank lines
    37       m = UCD_case_fold_regexp.match(t)
    38       if not m: raise Exception("Unknown case fold syntax: %s" % t)
    39       codepoint = int(m.group(1), 16)
    40       fold_t = m.group(2)
    41       fold_type[codepoint] = fold_t
    42       fold_val = m.group(3)
    43       if fold_t == 'T':
    44          print("Skipping Turkic entry")
    45          continue  # skip Turkic
    46       if fold_t == 'F':
    47           fold_val = [int(x, 16) for x in fold_val.split(" ")]
    48       else:
    49           fold_val = int(fold_val, 16)
    50       if codepoint in fold_value: fold_value[codepoint].append(fold_val)
    51       else: fold_value[codepoint] = [fold_val]
    52    return (fold_type, fold_value)
    53 
     15from UCD_parser import parse_CaseFolding_txt
    5416
    5517def simple_CaseFolding_BitSets(fold_map):
     
    7638   return BitDiffSet
    7739
    78 def simple_CaseClosure_map(fold_map):
     40def simple_CaseClosure_map(fold_data):
     41   simpleFoldMap = {}
     42   for k in fold_data['S'].keys(): simpleFoldMap[k] = fold_data['S'][k]
     43   for k in fold_data['C'].keys(): simpleFoldMap[k] = fold_data['C'][k]
    7944   cl_map = {}
    80    for k in fold_map.keys():
    81       folds = fold_map[k]
    82       for v in folds:
    83         if not isinstance(v, int): continue # skip nonsimple case folds
    84         if not v in cl_map: cl_map[v] = [k]
    85         else: cl_map[v].append(k)
    86         if not k in cl_map: cl_map[k] = [v]
    87         else: cl_map[k].append(v)
     45   for k in simpleFoldMap.keys():
     46      v = simpleFoldMap[k]
     47      if not v in cl_map: cl_map[v] = [k]
     48      else: cl_map[v].append(k)
     49      if not k in cl_map: cl_map[k] = [v]
     50      else: cl_map[k].append(v)
    8851   newEntries = True
    8952   while newEntries:
     
    188151
    189152def genCaseFolding_txt_h():
    190    (ft, fv) = parse_CaseFolding_txt()
    191    cm = simple_CaseClosure_map(fv)
     153   fold_data = parse_CaseFolding_txt()
     154   cm = simple_CaseClosure_map(fold_data)
    192155   f = cformat.open_header_file_for_write('CaseFolding_txt', 'casefold.py')
    193156   cformat.write_imports(f, ["<vector>", '"re/re_cc.h"'])
Note: See TracChangeset for help on using the changeset viewer.