Ignore:
Timestamp:
Oct 6, 2017, 1:22:53 PM (21 months ago)
Author:
cameron
Message:

Case folding property objects

Location:
icGREP/icgrep-devel/UCD-scripts
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/UCD-scripts/UCD_parser.py

    r5672 r5673  
    427427# There may be multiple entries per codepoint
    428428
    429 def parse_CaseFolding_txt():
     429def parse_CaseFolding_txt(property_object_map):
    430430    fold_map = {}
    431431    f = open(UCD_config.UCD_src_dir + "/" + 'CaseFolding.txt')
     
    438438        if fold_type == 'S' or fold_type == 'C':
    439439            # fold value is guaranteed to be a single codepoint
    440             fold_val = int(fold_val, 16)
     440            property_object_map['scf'].addDataRecord(cp, cp, fold_val)
    441441        else:
    442             fold_val = [int(x, 16) for x in fold_val.split(" ")]
     442            if fold_type == 'F':
     443                property_object_map['cf'].addDataRecord(cp, cp, fold_val)
    443444        fold_map[fold_type][cp] = fold_val
     445    property_object_map['scf'].finalizeProperty()
     446    property_object_map['cf'].finalizeProperty()
    444447    return fold_map
    445448
  • icGREP/icgrep-devel/UCD-scripts/UCD_properties.py

    r5672 r5673  
    8787def emit_string_override_property(f, property_code, overridden_code, override_set, cp_value_map):
    8888    s = string.Template(r"""    namespace ${prop_enum_up}_ns {
    89         /** Code Point Ranges for ${prop_enum} overriding values from ${overridden}
     89        /** Code Point Ranges for ${prop_enum} (possibly overriding values from ${overridden})
    9090        ${overridden_set_ranges}**/
    9191
    92         const UnicodeSet overridden_set
     92        const UnicodeSet explicitly_defined_set
    9393        ${overridden_set_value};
    9494
     
    100100        static StringOverridePropertyObject property_object(${prop_enum},
    101101                                                    ${overridden}_ns::property_object,
    102                                                     overridden_set,
     102                                                    explicitly_defined_set,
    103103                                                    static_cast<const char *>(string_buffer),
    104104                                                    buffer_length,
     
    195195""")
    196196    f.write(s.substitute(prop_enum = property_code, prop_enum_up = property_code.upper()))
     197
     198
     199def simple_CaseClosure_map(fold_data):
     200   simpleFoldMap = {}
     201   for k in fold_data['S'].keys(): simpleFoldMap[k] = int(fold_data['S'][k], 16)
     202   for k in fold_data['C'].keys(): simpleFoldMap[k] = int(fold_data['C'][k], 16)
     203   cl_map = {}
     204   for k in simpleFoldMap.keys():
     205      v = simpleFoldMap[k]
     206      if not v in cl_map: cl_map[v] = [k]
     207      else: cl_map[v].append(k)
     208      if not k in cl_map: cl_map[k] = [v]
     209      else: cl_map[k].append(v)
     210   newEntries = True
     211   while newEntries:
     212      newEntries = False
     213      for k in cl_map.keys():
     214         vlist = cl_map[k]
     215         for v in vlist:
     216            for w in cl_map[v]:
     217               if k != w and not k in cl_map[w]:
     218                  cl_map[w].append(k)
     219                  newEntries = True
     220   return cl_map
     221
     222#
     223# Simple case fold map.
     224# The simple case fold map is an ordered list of fold entries, each of
     225# the form (lo_codepoint, hi_codepoint, offset).  Each entry describes
     226# the case fold that applies to the consecutive codepoints in the given
     227# range, according to the following equations:
     228# casefold(x) = x + offset, if ((x - lo_codepoint) div |offset|) mod 2 = 0
     229#             = x - offset, if ((x - lo_codepoint) div |offset|) mod 2 = 1
     230#
     231#
     232def caseFoldRangeMap(casemap):
     233   foldable = sorted(casemap.keys())
     234   entries = []
     235   cp = foldable[0]
     236   open_entries = [(cp, f - cp) for f in casemap[cp]]
     237   last_cp = cp
     238   for cp in foldable[1:]:
     239      if cp != last_cp + 1:
     240         # Close the pending range entries
     241         for (cp0, offset) in open_entries:
     242            entries.append((cp0, last_cp, offset))
     243         open_entries = [(cp, f - cp) for f in casemap[cp]]
     244      else:
     245         new_open = []
     246         projected = []
     247         for (cp0, offset) in open_entries:
     248            even_odd_offset_group = int(abs(cp - cp0)/ abs(offset)) & 1
     249            if even_odd_offset_group == 0:
     250               projected_foldcp = cp + offset
     251            else: projected_foldcp = cp - offset
     252            if not projected_foldcp in casemap[cp]:
     253               entries.append((cp0, last_cp, offset))
     254            else:
     255               new_open.append((cp0, offset))
     256               projected.append(projected_foldcp)
     257         open_entries = new_open
     258         for f in casemap[cp]:
     259            if not f in projected:
     260               open_entries.append((cp, f-cp))
     261      last_cp = cp
     262   # Close the final entries.
     263   for (cp0, offset) in open_entries:
     264      entries.append((cp0, last_cp, offset))
     265   return entries
     266
     267
     268
     269def genFoldEntryData(casemap):
     270   rMap = caseFoldRangeMap(casemap)
     271   individuals = [(m[0],m[0]+m[2]) for m in rMap if m[0] == m[1]]
     272   ranges = [m for m in rMap if m[0] != m[1]]
     273   last_hi = -1
     274   generated = "const FoldEntry foldTable[foldTableSize] = {\n"
     275   foldTableSize = 0
     276   for (lo, hi, offset) in ranges:
     277      if lo != last_hi + 1:
     278         pairs = ["{0x%x, 0x%x}" % (m[0], m[1]) for m in individuals if m[0]>last_hi and m[0]< lo]
     279         generated += "  {0x%x, 0, {" % (last_hi + 1) + cformat.multiline_fill(pairs) + "}},\n"
     280         foldTableSize += 1
     281      last_hi = hi
     282      pairs = ["{0x%x, 0x%x}" % (m[0], m[1]) for m in individuals if m[0]>=lo and m[0]<= hi]
     283      generated += "  {0x%x, %i, {" % (lo, offset) + cformat.multiline_fill(pairs) + "}},\n"
     284      foldTableSize += 1
     285   if last_hi != 0x10FFFF:
     286      pairs = ["{0x%x, 0x%x}" % (m[0], m[1]) for m in individuals if m[0]>last_hi]
     287      generated += "  {0x%x, 0, {" % (last_hi + 1) + cformat.multiline_fill(pairs) + "}},\n"
     288      foldTableSize += 1
     289   generated += "  {0x110000, 0, {}}};"
     290   foldTableSize += 1
     291   generated = "\nconst int foldTableSize = %s;\n\n" % foldTableSize  + generated
     292   return generated
     293
     294foldDeclarations = r"""
     295typedef unsigned codepoint_t;
     296
     297struct FoldEntry {
     298    re::codepoint_t range_lo;
     299    int fold_offset;
     300    std::vector<re::interval_t> fold_pairs;
     301};
     302
     303
     304void caseInsensitiveInsertRange(re::CC * cc, const re::codepoint_t lo, const re::codepoint_t hi);
     305
     306inline void caseInsensitiveInsert(re::CC * cc, const re::codepoint_t cp) {
     307    caseInsensitiveInsertRange(cc, cp, cp);
     308}
     309"""
    197310
    198311
     
    410523
    411524
    def genCaseFolding_h(self):
        """Write the CaseFolding header file.

        Parses CaseFolding.txt (which also records the 'scf' and 'cf' data
        in this object's property_object_map), writes the fold declarations
        and the generated fold table, then emits the 'scf' and 'cf'
        properties inside ``namespace UCD``.  Finally registers both
        property codes and the header basename on this object.
        """
        basename = 'CaseFolding'
        fold_data = parse_CaseFolding_txt(self.property_object_map)
        cm = simple_CaseClosure_map(fold_data)
        f = cformat.open_header_file_for_write(basename, 'casefold.py')
        cformat.write_imports(f, ['"PropertyAliases.h"', '"PropertyObjects.h"', '"PropertyValueAliases.h"', '"unicode_set.h"', "<vector>", '"re/re_cc.h"'])
        f.write(foldDeclarations)
        f.write(genFoldEntryData(cm))
        f.write("\nnamespace UCD {\n")
        self.emit_property(f, 'scf')
        self.emit_property(f, 'cf')
        f.write("}\n")
        cformat.close_header_file(f)
        # Register each property code individually; append() with a list
        # would add one nested list element instead of the two codes.
        # NOTE(review): assumes supported_props is a flat list of property
        # codes -- confirm against its consumers.
        self.supported_props.extend(['scf', 'cf'])
        self.property_data_headers.append(basename)
     540
     541
    412542
    413543def UCD_main():
     
    427557   
    428558    ucd.generate_SpecialCasing_h()
     559   
     560    ucd.genCaseFolding_h()
    429561   
    430562    ucd.generate_multicolumn_properties_file('NameAliases', ['Name_Alias', 'Alias_Kind'])
  • icGREP/icgrep-devel/UCD-scripts/UCD_property_objects.py

    r5672 r5673  
    249249        else:
    250250            raise Exception("Expecting codepoint string, but got " + stringValue)
    251         self.cp_value_map[cp] = stringValue
     251        for cp in range(cp_lo, cp_hi+1): self.cp_value_map[cp] = stringValue
    252252
    253253    def finalizeProperty(self):
  • icGREP/icgrep-devel/UCD-scripts/casefold.py

    r5672 r5673  
    1313import UCD_config
    1414from unicode_set import *
    15 from UCD_parser import parse_CaseFolding_txt
     15from UCD_parser import parse_PropertyAlias_txt, parse_CaseFolding_txt
    1616
    1717def simple_CaseFolding_BitSets(fold_map):
     
    4040def simple_CaseClosure_map(fold_data):
    4141   simpleFoldMap = {}
    42    for k in fold_data['S'].keys(): simpleFoldMap[k] = fold_data['S'][k]
    43    for k in fold_data['C'].keys(): simpleFoldMap[k] = fold_data['C'][k]
     42   for k in fold_data['S'].keys(): simpleFoldMap[k] = int(fold_data['S'][k], 16)
     43   for k in fold_data['C'].keys(): simpleFoldMap[k] = int(fold_data['C'][k], 16)
    4444   cl_map = {}
    4545   for k in simpleFoldMap.keys():
     
    151151
    152152def genCaseFolding_txt_h():
    153    fold_data = parse_CaseFolding_txt()
    154    cm = simple_CaseClosure_map(fold_data)
    155    f = cformat.open_header_file_for_write('CaseFolding_txt', 'casefold.py')
    156    cformat.write_imports(f, ["<vector>", '"re/re_cc.h"'])
    157    f.write(foldDeclarations)
    158    f.write(genFoldEntryData(cm))
    159    cformat.close_header_file(f)
     153    (property_enum_name_list, property_object_map) = parse_PropertyAlias_txt()
     154    fold_data = parse_CaseFolding_txt(property_object_map)
     155    cm = simple_CaseClosure_map(fold_data)
     156    f = cformat.open_header_file_for_write('CaseFolding_txt', 'casefold.py')
     157    cformat.write_imports(f, ["<vector>", '"re/re_cc.h"'])
     158    f.write(foldDeclarations)
     159    f.write(genFoldEntryData(cm))
     160    #emit_property(f, 'scf')
     161    #emit_property(f, 'cf')
     162    cformat.close_header_file(f)
    160163
    161164if __name__ == "__main__":
Note: See TracChangeset for help on using the changeset viewer.