Changeset 5749


Ignore:
Timestamp:
Nov 28, 2017, 1:48:14 AM (17 months ago)
Author:
nmedfort
Message:

updated UCD python scripts

Location:
icGREP/icgrep-devel/UCD-scripts
Files:
7 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/UCD-scripts/UCD_config.py

    r5686 r5749  
    1111
    1212version = "Unknown"
     13
     14UCD_max_code_point = "0x10FFFF"
  • icGREP/icgrep-devel/UCD-scripts/UCD_parser.py

    r5674 r5749  
    88#
    99#
    10 import re, string, os.path
    1110import UCD_config
    12 from unicode_set import *
    1311from UCD_property_objects import *
    1412
  • icGREP/icgrep-devel/UCD-scripts/UCD_properties.py

    r5686 r5749  
    1010#
    1111#
    12 import re, string, os.path, cformat, UCD_config
    13 from unicode_set import *
     12import string, os.path
    1413from UCD_parser import *
    1514from UCD_property_objects import *
     
    4544        /** Code Point Ranges for ${prop_enum} mapping to <none>
    4645        ${null_set_ranges}**/
    47 
    48         const UnicodeSet null_codepoint_set
    49         ${null_set_value};
     46       
     47        ${null_set_value}
    5048
    5149        /** Code Point Ranges for ${prop_enum} mapping to <codepoint>
    5250        ${reflexive_set_ranges}**/
    53         const UnicodeSet reflexive_set
    54         ${reflexive_set_value};
     51       
     52        ${reflexive_set_value}
    5553
    5654        const unsigned buffer_length = ${buffer_length};
    57         const static char __attribute__ ((aligned (32))) string_buffer[${allocation_length}] = u8R"__(${string_buffer})__";
    58 
    59         const static std::vector<codepoint_t> defined_cps = {
     55        const static char string_buffer[${allocation_length}] LLVM_ALIGNAS(32) = u8R"__(${string_buffer})__";
     56
     57        const static std::vector<codepoint_t> defined_cps{
    6058        ${explicitly_defined_cps}};
    6159        static StringPropertyObject property_object(${prop_enum},
    62                                                     null_codepoint_set,
    63                                                     reflexive_set,
     60                                                    std::move(null_codepoint_set),
     61                                                    std::move(reflexive_set),
    6462                                                    static_cast<const char *>(string_buffer),
    6563                                                    buffer_length,
    66                                                     defined_cps);
     64                                                    std::move(defined_cps));
    6765    }
    6866""")
     
    7371    buffer_length = len(string_buffer.encode("utf-8"))
    7472    f.write(s.substitute(prop_enum = property_code,
    75     prop_enum_up = property_code.upper(),
    76     string_buffer = string_buffer,
    77     buffer_length = buffer_length,
    78     allocation_length = (buffer_length + 255) & -256,
    79     null_set_ranges = cformat.multiline_fill(['[%04x, %04x]' % (lo, hi) for (lo, hi) in uset_to_range_list(null_set)], ',', 8),
    80     null_set_value = null_set.showC(12),
    81     reflexive_set_ranges = cformat.multiline_fill(['[%04x, %04x]' % (lo, hi) for (lo, hi) in uset_to_range_list(reflexive_set)], ',', 8),
    82     reflexive_set_value = reflexive_set.showC(12),
    83     explicitly_defined_cp_count = len(cps),
    84     explicitly_defined_cps = cformat.multiline_fill(['0x%04x' % cp for cp in cps], ',', 8)
    85     ))
     73                         prop_enum_up = property_code.upper(),
     74                         string_buffer = string_buffer,
     75                         buffer_length = buffer_length,
     76                         allocation_length = (buffer_length + 255) & -256,
     77                         null_set_ranges = cformat.multiline_fill(['[%04x, %04x]' % (lo, hi) for (lo, hi) in uset_to_range_list(null_set)], ',', 8),
     78                         null_set_value = null_set.generate("null_codepoint_set", 8),
     79                         reflexive_set_ranges = cformat.multiline_fill(['[%04x, %04x]' % (lo, hi) for (lo, hi) in uset_to_range_list(reflexive_set)], ',', 8),
     80                         reflexive_set_value = reflexive_set.generate("reflexive_set", 8),
     81                         explicitly_defined_cp_count = len(cps),
     82                         explicitly_defined_cps = cformat.multiline_fill(['0x%04x' % cp for cp in cps], ',', 8)
     83                         ))
    8684
    8785def emit_string_override_property(f, property_code, overridden_code, override_set, cp_value_map):
     
    9088        ${overridden_set_ranges}**/
    9189
    92         const UnicodeSet explicitly_defined_set
    93         ${overridden_set_value};
     90        ${overridden_set_value}
    9491
    9592        const unsigned buffer_length = ${buffer_length};
    96         const static char __attribute__ ((aligned (32))) string_buffer[${allocation_length}] = u8R"__(${string_buffer})__";
    97 
    98         const static std::vector<codepoint_t> defined_cps = {
     93        const static char string_buffer[${allocation_length}] LLVM_ALIGNAS(32) = u8R"__(${string_buffer})__";
     94
     95        const static std::vector<codepoint_t> defined_cps{
    9996        ${explicitly_defined_cps}};
    10097        static StringOverridePropertyObject property_object(${prop_enum},
    10198                                                    ${overridden}_ns::property_object,
    102                                                     explicitly_defined_set,
     99                                                    std::move(explicitly_defined_set),
    103100                                                    static_cast<const char *>(string_buffer),
    104101                                                    buffer_length,
    105                                                     defined_cps);
     102                                                    std::move(defined_cps));
    106103    }
    107104""")
     
    112109    buffer_length = len(string_buffer.encode("utf-8"))
    113110    f.write(s.substitute(prop_enum = property_code,
    114     prop_enum_up = property_code.upper(),
    115     overridden = overridden_code.upper(),
    116     string_buffer = string_buffer,
    117     buffer_length = buffer_length,
    118     allocation_length = (buffer_length + 255) & -256,
    119     overridden_set_ranges = cformat.multiline_fill(['[%04x, %04x]' % (lo, hi) for (lo, hi) in uset_to_range_list(override_set)], ',', 8),
    120     overridden_set_value = override_set.showC(12),
    121     explicitly_defined_cp_count = len(cps),
    122     explicitly_defined_cps = cformat.multiline_fill(['0x%04x' % cp for cp in cps], ',', 8)
    123     ))
     111                         prop_enum_up = property_code.upper(),
     112                         overridden = overridden_code.upper(),
     113                         string_buffer = string_buffer,
     114                         buffer_length = buffer_length,
     115                         allocation_length = (buffer_length + 255) & -256,
     116                         overridden_set_ranges = cformat.multiline_fill(['[%04x, %04x]' % (lo, hi) for (lo, hi) in uset_to_range_list(override_set)], ',', 8),
     117                         overridden_set_value = override_set.generate("explicitly_defined_set", 8),
     118                         explicitly_defined_cp_count = len(cps),
     119                         explicitly_defined_cps = cformat.multiline_fill(['0x%04x' % cp for cp in cps], ',', 8)
     120                         ))
    124121
    125122def emit_numeric_property(f, property_code, NaN_set, cp_value_map):
     
    128125        ${NaN_set_ranges}**/
    129126
    130         const UnicodeSet NaN_set
    131         ${NaN_set_value};
    132 
    133        const unsigned buffer_length = ${buffer_length};
    134         const static char __attribute__ ((aligned (32))) string_buffer[${allocation_length}] = u8R"__(${string_buffer})__";
     127        ${NaN_set_value}
     128
     129        const unsigned buffer_length = ${buffer_length};
     130        const static char string_buffer[${allocation_length}] LLVM_ALIGNAS(32) = u8R"__(${string_buffer})__";
    135131
    136132        const static std::vector<codepoint_t> defined_cps = {
    137133        ${explicitly_defined_cps}};
    138134        static NumericPropertyObject property_object(${prop_enum},
    139                                                     NaN_set,
     135                                                    std::move(NaN_set),
    140136                                                    static_cast<const char *>(string_buffer),
    141137                                                    buffer_length,
    142                                                     defined_cps);
     138                                                    std::move(defined_cps));
    143139    }
    144140""")
     
    150146    buffer_length = len(string_buffer.encode("utf-8"))
    151147    f.write(s.substitute(prop_enum = property_code,
    152     prop_enum_up = property_code.upper(),
    153     string_buffer = string_buffer,
    154     buffer_length = buffer_length,
    155     allocation_length = (buffer_length + 255) & -256,
    156     NaN_set_ranges = cformat.multiline_fill(['[%04x, %04x]' % (lo, hi) for (lo, hi) in uset_to_range_list(NaN_set)], ',', 8),
    157     NaN_set_value = NaN_set.showC(12),
    158     explicitly_defined_cp_count = len(cps),
    159     explicitly_defined_cps = cformat.multiline_fill(['0x%04x' % cp for cp in cps], ',', 8)
    160     ))
     148                         prop_enum_up = property_code.upper(),
     149                         string_buffer = string_buffer,
     150                         buffer_length = buffer_length,
     151                         allocation_length = (buffer_length + 255) & -256,
     152                         NaN_set_ranges = cformat.multiline_fill(['[%04x, %04x]' % (lo, hi) for (lo, hi) in uset_to_range_list(NaN_set)], ',', 8),
     153                         NaN_set_value = NaN_set.generate("NaN_set", 8),
     154                         explicitly_defined_cp_count = len(cps),
     155                         explicitly_defined_cps = cformat.multiline_fill(['0x%04x' % cp for cp in cps], ',', 8)
     156                         ))
    161157
    162158
     
    165161    f.write("        /** Code Point Ranges for %s\n        " % property_code)
    166162    f.write(cformat.multiline_fill(['[%04x, %04x]' % (lo, hi) for (lo, hi) in uset_to_range_list(property_set)], ',', 8))
    167     f.write("**/\n")
    168     f.write("        const UnicodeSet codepoint_set \n")
    169     f.write(property_set.showC(12) + ";\n")
    170     f.write("        static BinaryPropertyObject property_object{%s, codepoint_set};\n    }\n" % property_code)
     163    f.write("**/\n\n")
     164    f.write(property_set.generate("codepoint_set", 8))
     165    f.write("        static BinaryPropertyObject property_object{%s, std::move(codepoint_set)};\n    }\n" % property_code)
    171166
    172167def emit_enumerated_property(f, property_code, independent_prop_values, prop_values, value_map):
     
    176171        f.write("    /** Code Point Ranges for %s\n    " % v)
    177172        f.write(cformat.multiline_fill(['[%04x, %04x]' % (lo, hi) for (lo, hi) in uset_to_range_list(value_map[v])], ',', 4))
    178         f.write("**/\n")
    179         f.write("    const UnicodeSet %s_Set \n" % v.lower())
    180         f.write(value_map[v].showC(8) + ";\n")
     173        f.write("**/\n\n")
     174        f.write(value_map[v].generate(v.lower() + "_Set", 4))
    181175    set_list = ['&%s_Set' % v.lower() for v in prop_values]
    182176    f.write("    static EnumeratedPropertyObject property_object\n")
    183177    f.write("        {%s,\n" % property_code)
    184     f.write("         %s_ns::independent_prop_values,\n" % property_code.upper())
    185     f.write("         %s_ns::enum_names,\n" % property_code.upper())
    186     f.write("         %s_ns::value_names,\n" % property_code.upper())
    187     f.write("         %s_ns::aliases_only_map,\n" % property_code.upper())
    188     f.write("         {")
    189     f.write(cformat.multiline_fill(set_list, ',', 8))
    190     f.write("\n         }};\n    }\n")
     178    f.write("        %s_ns::independent_prop_values,\n" % property_code.upper())
     179    f.write("        std::move(%s_ns::enum_names),\n" % property_code.upper())
     180    f.write("        std::move(%s_ns::value_names),\n" % property_code.upper())
     181    f.write("        std::move(%s_ns::aliases_only_map),{\n" % property_code.upper())
     182    f.write("        " + cformat.multiline_fill(set_list, ',', 8))
     183    f.write("\n        }};"
     184            "\n    }\n")
    191185
    192186def emit_Obsolete_property(f, property_code):
     
    294288
    295289foldDeclarations = r"""
    296 typedef unsigned codepoint_t;
    297 
    298290struct FoldEntry {
    299     re::codepoint_t range_lo;
    300     int fold_offset;
    301     std::vector<re::interval_t> fold_pairs;
     291    const UCD::codepoint_t range_lo;
     292    const int fold_offset;
     293    const std::vector<UCD::interval_t> fold_pairs;
    302294};
    303295
    304 
    305 void caseInsensitiveInsertRange(re::CC * cc, const re::codepoint_t lo, const re::codepoint_t hi);
    306 
    307 inline void caseInsensitiveInsert(re::CC * cc, const re::codepoint_t cp) {
     296void caseInsensitiveInsertRange(UCD::UnicodeSet * const cc, const UCD::codepoint_t lo, const UCD::codepoint_t hi);
     297
     298inline void caseInsensitiveInsert(UCD::UnicodeSet * const cc, const UCD::codepoint_t cp) {
    308299    caseInsensitiveInsertRange(cc, cp, cp);
    309300}
     
    485476            f.write(cformat.multiline_fill(['[%04x, %04x]' % (lo, hi) for (lo, hi) in uset_to_range_list(value_map[v])], ',', 8))
    486477            f.write("**/\n")
    487             f.write("        const UnicodeSet %s_Ext \n" % v.lower())
    488             f.write(value_map[v].showC(12) + ";\n")
     478            f.write(value_map[v].generate(v.lower() + "_Ext", 8))
    489479        set_list = ['&%s_Ext' % v.lower() for v in prop_list]
    490480        f.write("        static ExtensionPropertyObject property_object\n")
     
    518508        setVersionfromReadMe_txt()
    519509        f = cformat.open_header_file_for_write('UCD_Config')
    520         f.write("\nnamespace UCD {\n")
    521         f.write("   const std::string UnicodeVersion = \"%s\";\n" % UCD_config.version)
     510        f.write("#include <utility>\n")
     511        f.write("namespace UCD {\n")
     512        f.write("\tconst auto UnicodeVersion = \"%s\";\n" % UCD_config.version)
     513        f.write("\tusing codepoint_t = unsigned;\n")
     514        f.write("\tenum : codepoint_t { UNICODE_MAX = %s };\n" % UCD_config.UCD_max_code_point)
     515        f.write("\tusing interval_t = std::pair<codepoint_t, codepoint_t>;\n")
    522516        f.write("}\n")
    523517        cformat.close_header_file(f)
     
    529523        cm = simple_CaseClosure_map(fold_data)
    530524        f = cformat.open_header_file_for_write(basename, 'casefold.py')
    531         cformat.write_imports(f, ['"PropertyAliases.h"', '"PropertyObjects.h"', '"PropertyValueAliases.h"', '"unicode_set.h"', "<vector>", '"re/re_cc.h"'])
     525        cformat.write_imports(f, ['"PropertyAliases.h"', '"PropertyObjects.h"', '"PropertyValueAliases.h"', '"unicode_set.h"', '<vector>'])
    532526        f.write(foldDeclarations)
    533527        f.write(genFoldEntryData(cm))
  • icGREP/icgrep-devel/UCD-scripts/UCD_property_objects.py

    r5673 r5749  
    1 import re, string, os.path
    2 import UCD_config
    31from unicode_set import *
    42trivial_name_char_re = re.compile('[-_\s]')
  • icGREP/icgrep-devel/UCD-scripts/cformat.py

    r5655 r5749  
    3333   return f
    3434
    35 def open_cpp_file_for_write(filename, generator_name='UCD_properties.py'):
    36    f = open(UCD_config.UCD_output_dir + '/' + filename + '.cpp', 'w')
    37    f.write(cpp_template % (generator_name, filename))
    38    return f
    39 
    4035def close_header_file(f):
    4136   f.write("\n#endif\n")
    42    f.close()
    43 
    44 def close_cpp_file(f):
    4537   f.close()
    4638
  • icGREP/icgrep-devel/UCD-scripts/generate_UCD_tests.py

    r5686 r5749  
    99#
    1010#
    11 import re, string, os.path, cformat
    1211from random import randint
    13 from unicode_set import *
     12
    1413from UCD_parser import *
    15 from string import Template
     14
    1615
    1716class UCD_test_generator():
  • icGREP/icgrep-devel/UCD-scripts/unicode_set.py

    r5653 r5749  
    77#
    88# Licensed under Open Software License 3.0.
    9 import re, cformat
     9import cformat
     10import re
     11
    1012#
    1113# Unicode Sparse Bitset Representation
     
    2830quad_bits = 1 << log2_quad_bits
    2931mod_quad_bit_mask = quad_bits - 1
    30 UnicodeQuadCount = int(0x110000 / quad_bits) # 2**log2_quad_bits codepoints per quad
    31 FullQuadMask = (1<<(quad_bits)) - 1
     32UnicodeQuadCount = int(0x110000 / quad_bits)  # 2**log2_quad_bits codepoints per quad
     33FullQuadMask = (1 << (quad_bits)) - 1
    3234run_bytes = 4
    3335
    3436
    3537class UCset:
    36    def __init__(self):
    37       self.runs = []
    38       self.quads = []
    39       self.quad_count = 0
    40      
    41    # internal methods
    42    def append_run(self, runtype, runlength):
    43       if runlength == 0: return
    44       if self.runs == []:  self.runs = [(runtype, runlength)]
    45       else:
    46          (lastruntype, lastrunlength) = self.runs[-1]
    47          if lastruntype == runtype:  self.runs[-1] = (runtype, lastrunlength + runlength)
    48          else: self.runs.append((runtype, runlength))
    49       self.quad_count += runlength
    50    def append_mixed_run(self, n, quadlist):
    51       self.append_run(Mixed, n)
    52       self.quads += quadlist
    53    def append_quad(self, q):
    54       if q == 0:
    55         self.append_run(Empty, 1)
    56       elif q & FullQuadMask == FullQuadMask:
    57         self.append_run(Full, 1)
    58       else:
    59         self.append_run(Mixed, 1)
    60         self.quads.append(q)
    61 
    62    # printing
    63    def showC(self, indent = 4):
    64       hex_specifier =  "%%#0%ix" % (int(quad_bits/4) + 2)
    65       runtype = {-1:"Full", 0:"Empty", 1: "Mixed"}
    66       formatted_runs = ['{%s, %i}' % (runtype[r[0]], r[1]) for r in self.runs]
    67       formatted_quads = [hex_specifier % q for q in self.quads]
    68       setrep = (" " * indent) + "{{"
    69       setrep += cformat.multiline_fill(formatted_runs, ',', indent+2)
    70       setrep += '},\n'
    71       setrep += (" " * indent) + " {"
    72       setrep += cformat.multiline_fill(formatted_quads, ',', indent+2)
    73       setrep += '}}'
    74       return setrep
    75 
    76    def bytes(self):
    77        return (len(self.runs) * run_bytes) + (len(self.quads) * int(quad_bits/8))
     38    def __init__(self):
     39        self.runs = []
     40        self.quads = []
     41
     42    # internal methods
     43    def append_run(self, runtype, runlength):
     44        if runlength == 0: return
     45        if self.runs == []:
     46            self.runs = [(runtype, runlength)]
     47        else:
     48            (lastruntype, lastrunlength) = self.runs[-1]
     49            if lastruntype == runtype:
     50                self.runs[-1] = (runtype, lastrunlength + runlength)
     51            else:
     52                self.runs.append((runtype, runlength))
     53
     54    def append_quad(self, q):
     55        if q == 0:
     56            self.append_run(Empty, 1)
     57        elif (q & FullQuadMask) == FullQuadMask:
     58            self.append_run(Full, 1)
     59        else:
     60            self.append_run(Mixed, 1)
     61            self.quads.append(q)
     62
     63    # printing
     64    def generate(self, propertyName, indent=4):
     65        hex_specifier = "%%#0%ix" % (int(quad_bits / 4) + 2)
     66        runtype = {-1: "Full", 0: "Empty", 1: "Mixed"}
     67
     68        str = "\n" + (" " * indent) + "namespace {\n" + \
     69              (" " * indent) + "const static UnicodeSet::run_t __%s_runs[] = {\n" % propertyName + \
     70              (" " * indent) + cformat.multiline_fill(['{%s, %i}' % (runtype[r[0]], r[1]) for r in self.runs], ',',
     71                                                      indent) + \
     72              "};\n"
     73
     74        if len(self.quads) == 0:
     75            str += (" " * indent) + "const static UnicodeSet::bitquad_t * const __%s_quads = nullptr;\n" % propertyName
     76        else:
     77            str += (" " * indent) + "const static UnicodeSet::bitquad_t  __%s_quads[] = {\n" % propertyName + \
     78                   (" " * indent) + cformat.multiline_fill([hex_specifier % q for q in self.quads], ',', indent) + \
     79                   "};\n"
     80
     81        # Despite being const_cast below, neither runs nor quads will be modified by the UnicodeSet. If any
     82        # modifications are made, they first test the run/quad capacity, observe that it is 0,
     83        # and allocate heap memory before making any changes.
     84
     85        str += (" " * indent) + "}\n\n" + \
     86               (" " * indent) + \
     87               "const static UnicodeSet %s{const_cast<UnicodeSet::run_t *>(__%s_runs), %i, 0, " \
     88               "const_cast<UnicodeSet::bitquad_t *>(__%s_quads), %i, 0};\n\n" \
     89               % (propertyName, propertyName, len(self.runs), propertyName, len(self.quads))
     90
     91        return str
     92
     93    def bytes(self):
     94        return (len(self.runs) * run_bytes) + (len(self.quads) * int(quad_bits / 8))
    7895
    7996
     
    8299#
    83100def empty_uset():
    84    e = UCset()
    85    e.runs = [(Empty, UnicodeQuadCount)]
    86    e.quads = []
    87    e.quad_count = UnicodeQuadCount
    88    return e
     101    e = UCset()
     102    e.runs = [(Empty, UnicodeQuadCount)]
     103    e.quads = []
     104    return e
     105
    89106
    90107def singleton_uset(codepoint):
    91    e = UCset()
    92    quad_no = codepoint >> log2_quad_bits
    93    quad_val = 1 << (codepoint & mod_quad_bit_mask)
    94    if quad_no > 0: e.append_run(Empty, quad_no)
    95    e.append_run(Mixed, 1)
    96    e.quads = [quad_val]
    97    if quad_no < UnicodeQuadCount - 1: e.append_run(Empty, UnicodeQuadCount - (quad_no + 1))
    98    e.quad_count = UnicodeQuadCount
    99    return e
     108    e = UCset()
     109    quad_no = codepoint >> log2_quad_bits
     110    quad_val = 1 << (codepoint & mod_quad_bit_mask)
     111    if quad_no > 0: e.append_run(Empty, quad_no)
     112    e.append_run(Mixed, 1)
     113    e.quads = [quad_val]
     114    if quad_no < UnicodeQuadCount - 1:
     115        e.append_run(Empty, UnicodeQuadCount - (quad_no + 1))
     116    return e
     117
    100118
    101119def range_uset(lo_codepoint, hi_codepoint):
    102    e = UCset()
    103    lo_quad_no = lo_codepoint >> log2_quad_bits   
    104    hi_quad_no = hi_codepoint >> log2_quad_bits
    105    lo_offset = lo_codepoint & mod_quad_bit_mask
    106    hi_offset = hi_codepoint & mod_quad_bit_mask
    107    if lo_quad_no > 0:  e.append_run(Empty, lo_quad_no)
    108    if lo_quad_no == hi_quad_no:
    109       quad = (FullQuadMask << lo_offset) & (FullQuadMask >> (quad_bits - 1 - hi_offset))
    110       e.append_quad(quad)
    111    else:
    112       e.append_quad((FullQuadMask << lo_offset) & FullQuadMask)
    113       e.append_run(Full, hi_quad_no - (lo_quad_no + 1))
    114       e.append_quad((FullQuadMask >> (quad_bits - 1 - hi_offset)) & FullQuadMask)
    115    if hi_quad_no < UnicodeQuadCount - 1: e.append_run(Empty, UnicodeQuadCount - (hi_quad_no + 1))
    116    return e
     120    e = UCset()
     121    lo_quad_no = lo_codepoint >> log2_quad_bits
     122    hi_quad_no = hi_codepoint >> log2_quad_bits
     123    lo_offset = lo_codepoint & mod_quad_bit_mask
     124    hi_offset = hi_codepoint & mod_quad_bit_mask
     125    if lo_quad_no > 0:  e.append_run(Empty, lo_quad_no)
     126    if lo_quad_no == hi_quad_no:
     127        quad = (FullQuadMask << lo_offset) & (FullQuadMask >> (quad_bits - 1 - hi_offset))
     128        e.append_quad(quad)
     129    else:
     130        e.append_quad((FullQuadMask << lo_offset) & FullQuadMask)
     131        e.append_run(Full, hi_quad_no - (lo_quad_no + 1))
     132        e.append_quad((FullQuadMask >> (quad_bits - 1 - hi_offset)) & FullQuadMask)
     133    if hi_quad_no < UnicodeQuadCount - 1:
     134        e.append_run(Empty, UnicodeQuadCount - (hi_quad_no + 1))
     135    return e
    117136
    118137
     
    123142        self.offset = 0
    124143        self.quad_no = 0
     144
    125145    def at_end(self):
    126146        return self.run_no == len(self.uSet.runs)
     147
    127148    def current_run(self):
    128149        (this_run_type, this_run_length) = self.uSet.runs[self.run_no]
    129150        return (this_run_type, this_run_length - self.offset)
     151
    130152    def get_quad(self):
    131153        (this_run_type, this_run_length) = self.uSet.runs[self.run_no]
    132         if this_run_type == Empty: return 0
    133         elif this_run_type == Full: return FullQuadMask
    134         else: return self.uSet.quads[self.quad_no]
     154        if this_run_type == Empty:
     155            return 0
     156        elif this_run_type == Full:
     157            return FullQuadMask
     158        else:
     159            return self.uSet.quads[self.quad_no]
     160
    135161    def advance(self, n):
    136162        while n > 0:
    137            (this_run_type, this_run_length) = self.uSet.runs[self.run_no]
    138            remain = this_run_length - self.offset
    139            if remain > n:
    140                self.offset += n
    141                if this_run_type == Mixed: self.quad_no += n
    142                n = 0
    143            elif remain == n:
    144                self.run_no += 1
    145                self.offset = 0
    146                if this_run_type == Mixed: self.quad_no += n
    147                n = 0
    148            else:
    149                self.run_no += 1
    150                self.offset = 0
    151                if this_run_type == Mixed: self.quad_no += remain
    152                n -= remain
     163            (this_run_type, this_run_length) = self.uSet.runs[self.run_no]
     164            remain = this_run_length - self.offset
     165            if remain > n:
     166                self.offset += n
     167                if this_run_type == Mixed: self.quad_no += n
     168                n = 0
     169            elif remain == n:
     170                self.run_no += 1
     171                self.offset = 0
     172                if this_run_type == Mixed: self.quad_no += n
     173                n = 0
     174            else:
     175                self.run_no += 1
     176                self.offset = 0
     177                if this_run_type == Mixed: self.quad_no += remain
     178                n -= remain
    153179
    154180
    155181def uset_member(s, codepoint):
    156    quad_no = int(codepoint / quad_bits)
    157    quad_val = 1 << (codepoint & mod_quad_bit_mask)
    158    it = Uset_Iterator(s)   
    159    it.advance(quad_no)
    160    return (it.get_quad() & quad_val) != 0
     182    quad_no = int(codepoint / quad_bits)
     183    quad_val = 1 << (codepoint & mod_quad_bit_mask)
     184    it = Uset_Iterator(s)
     185    it.advance(quad_no)
     186    return (it.get_quad() & quad_val) != 0
     187
    161188
    162189def uset_popcount(s):
     
    175202    return popcount
    176203
     204
    177205def popcount_quad(q):
    178206    c = 0
    179207    while q != 0:
    180         q = q & (q - 1) # clear low bit
     208        q = q & (q - 1)  # clear low bit
    181209        c += 1
    182210    return c
    183211
    184 def uset_complement (s):
    185    assert s.quad_count == UnicodeQuadCount
    186    iset = UCset()
    187    it = Uset_Iterator(s)
    188    while not it.at_end():
    189       (runtype, n) = it.current_run()
    190       if runtype == Empty:
    191          iset.append_run(Full, n)
    192          it.advance(n)
    193       elif runtype == Full:
    194          iset.append_run(Empty, n)
    195          it.advance(n)
    196       else:
    197          for i in range(n):
    198             iset.append_quad(FullQuadMask ^ it.get_quad())
    199             it.advance(1)
    200    return iset
    201 
    202 def uset_intersection (s1, s2):
    203    assert s1.quad_count == UnicodeQuadCount
    204    assert s2.quad_count == UnicodeQuadCount
    205    iset = UCset()
    206    i1 = Uset_Iterator(s1)
    207    i2 = Uset_Iterator(s2)
    208    while not i1.at_end():
    209       (s1_type, s1_length) = i1.current_run()
    210       (s2_type, s2_length) = i2.current_run()
    211       n = min(s1_length, s2_length)
    212       if s1_type == Empty or s2_type == Empty:
    213          iset.append_run(Empty, n)
    214          i1.advance(n)
    215          i2.advance(n)
    216       elif s1_type == Full and s2_type == Full:
    217          iset.append_run(Full, n)
    218          i1.advance(n)
    219          i2.advance(n)
    220       elif s1_type == Full:
    221          for i in range(n):
    222             iset.append_quad(i2.get_quad())
    223             i2.advance(1)
    224          i1.advance(n)
    225       elif s2_type == Full:
    226          for i in range(n):
    227             iset.append_quad(i1.get_quad())
    228             i1.advance(1)
    229          i2.advance(n)
    230       else: # both s1 and s2 have mixed blocks; form block-by-block intersection
    231          for i in range(n):
    232             iset.append_quad(i1.get_quad() & i2.get_quad())
    233             i1.advance(1)
    234             i2.advance(1)
    235    return iset
    236 
    237 def uset_union (s1, s2):
    238    assert s1.quad_count == UnicodeQuadCount
    239    assert s2.quad_count == UnicodeQuadCount
    240    iset = UCset()
    241    i1 = Uset_Iterator(s1)
    242    i2 = Uset_Iterator(s2)
    243    while not i1.at_end():
    244       (s1_type, s1_length) = i1.current_run()
    245       (s2_type, s2_length) = i2.current_run()
    246       n = min(s1_length, s2_length)
    247       if s1_type == Empty and s2_type == Empty:
    248          iset.append_run(Empty, n)
    249          i1.advance(n)
    250          i2.advance(n)
    251       elif s1_type == Full or s2_type == Full:
    252          iset.append_run(Full, n)
    253          i1.advance(n)
    254          i2.advance(n)
    255       elif s1_type == Empty:
    256          for i in range(n):
    257             iset.append_quad(i2.get_quad())
    258             i2.advance(1)
    259          i1.advance(n)
    260       elif s2_type == Empty:
    261          for i in range(n):
    262             iset.append_quad(i1.get_quad())
    263             i1.advance(1)
    264          i2.advance(n)
    265       else: # both s1 and s2 have mixed blocks; form block-by-block union
    266          for i in range(n):
    267             iset.append_quad(i1.get_quad() | i2.get_quad())
    268             i1.advance(1)
    269             i2.advance(1)
    270    return iset
    271 
    272 def uset_difference (s1, s2):
    273    assert s1.quad_count == UnicodeQuadCount
    274    assert s2.quad_count == UnicodeQuadCount
    275    iset = UCset()
    276    i1 = Uset_Iterator(s1)
    277    i2 = Uset_Iterator(s2)
    278    while not i1.at_end():
    279       (s1_type, s1_length) = i1.current_run()
    280       (s2_type, s2_length) = i2.current_run()
    281       n = min(s1_length, s2_length)
    282       if s1_type == Empty or s2_type == Full:
    283          iset.append_run(Empty, n)
    284          i1.advance(n)
    285          i2.advance(n)
    286       elif s1_type == Full and s2_type == Empty:
    287          iset.append_run(Full, n)
    288          i1.advance(n)
    289          i2.advance(n)
    290       elif s1_type == Full:
    291          for i in range(n):
    292             iset.append_quad(FullQuadMask ^ i2.get_quad())
    293             i2.advance(1)
    294          i1.advance(n)
    295       elif s2_type == Empty:
    296          for i in range(n):
    297             iset.append_quad(i1.get_quad())
    298             i1.advance(1)
    299          i2.advance(n)
    300       else: # both s1 and s2 have mixed blocks; form block-by-block union
    301          for i in range(n):
    302             iset.append_quad(i1.get_quad() &~ i2.get_quad())
    303             i1.advance(1)
    304             i2.advance(1)
    305    return iset
    306 
    307 def uset_symmetric_difference (s1, s2):
    308    assert s1.quad_count == UnicodeQuadCount
    309    assert s2.quad_count == UnicodeQuadCount
    310    iset = UCset()
    311    i1 = Uset_Iterator(s1)
    312    i2 = Uset_Iterator(s2)
    313    while not i1.at_end():
    314       (s1_type, s1_length) = i1.current_run()
    315       (s2_type, s2_length) = i2.current_run()
    316       n = min(s1_length, s2_length)
    317       if s1_type == Empty and s2_type == Full or s1_type == Full and s2_type == Empty:
    318          iset.append_run(Full, n)
    319          i1.advance(n)
    320          i2.advance(n)
    321       elif s1_type == Full and s2_type == Full or s1_type == Empty and s2_type == Empty:
    322          iset.append_run(Empty, n)
    323          i1.advance(n)
    324          i2.advance(n)
    325       elif s1_type == Empty:
    326          for i in range(n):
    327             iset.append_quad(i2.get_quad())
    328             i2.advance(1)
    329          i1.advance(n)
    330       elif s2_type == Empty:
    331          for i in range(n):
    332             iset.append_quad(i1.get_quad())
    333             i1.advance(1)
    334          i2.advance(n)
    335       elif s1_type == Full:
    336          for i in range(n):
    337             iset.append_quad(FullQuadMask ^ i2.get_quad())
    338             i2.advance(1)
    339          i1.advance(n)
    340       elif s2_type == Full:
    341          for i in range(n):
    342             iset.append_quad(FullQuadMask ^ i1.get_quad())
    343             i1.advance(1)
    344          i2.advance(n)
    345       else: # both s1 and s2 have mixed blocks; form block-by-block union
    346          for i in range(n):
    347             iset.append_quad(i1.get_quad() ^ i2.get_quad())
    348             i1.advance(1)
    349             i2.advance(1)
    350    return iset
     212
     213def uset_complement(s):
     214    iset = UCset()
     215    it = Uset_Iterator(s)
     216    while not it.at_end():
     217        (runtype, n) = it.current_run()
     218        if runtype == Empty:
     219            iset.append_run(Full, n)
     220            it.advance(n)
     221        elif runtype == Full:
     222            iset.append_run(Empty, n)
     223            it.advance(n)
     224        else:
     225            for i in range(n):
     226                iset.append_quad(FullQuadMask ^ it.get_quad())
     227                it.advance(1)
     228    return iset
     229
     230
     231def uset_intersection(s1, s2):
     232    iset = UCset()
     233    i1 = Uset_Iterator(s1)
     234    i2 = Uset_Iterator(s2)
     235    while not i1.at_end():
     236        (s1_type, s1_length) = i1.current_run()
     237        (s2_type, s2_length) = i2.current_run()
     238        n = min(s1_length, s2_length)
     239        if s1_type == Empty or s2_type == Empty:
     240            iset.append_run(Empty, n)
     241            i1.advance(n)
     242            i2.advance(n)
     243        elif s1_type == Full and s2_type == Full:
     244            iset.append_run(Full, n)
     245            i1.advance(n)
     246            i2.advance(n)
     247        elif s1_type == Full:
     248            for i in range(n):
     249                iset.append_quad(i2.get_quad())
     250                i2.advance(1)
     251            i1.advance(n)
     252        elif s2_type == Full:
     253            for i in range(n):
     254                iset.append_quad(i1.get_quad())
     255                i1.advance(1)
     256            i2.advance(n)
     257        else:  # both s1 and s2 have mixed blocks; form block-by-block intersection
     258            for i in range(n):
     259                iset.append_quad(i1.get_quad() & i2.get_quad())
     260                i1.advance(1)
     261                i2.advance(1)
     262    return iset
     263
     264
     265def uset_union(s1, s2):
     266    iset = UCset()
     267    i1 = Uset_Iterator(s1)
     268    i2 = Uset_Iterator(s2)
     269    while not i1.at_end():
     270        (s1_type, s1_length) = i1.current_run()
     271        (s2_type, s2_length) = i2.current_run()
     272        n = min(s1_length, s2_length)
     273        if s1_type == Empty and s2_type == Empty:
     274            iset.append_run(Empty, n)
     275            i1.advance(n)
     276            i2.advance(n)
     277        elif s1_type == Full or s2_type == Full:
     278            iset.append_run(Full, n)
     279            i1.advance(n)
     280            i2.advance(n)
     281        elif s1_type == Empty:
     282            for i in range(n):
     283                iset.append_quad(i2.get_quad())
     284                i2.advance(1)
     285            i1.advance(n)
     286        elif s2_type == Empty:
     287            for i in range(n):
     288                iset.append_quad(i1.get_quad())
     289                i1.advance(1)
     290            i2.advance(n)
     291        else:  # both s1 and s2 have mixed blocks; form block-by-block union
     292            for i in range(n):
     293                iset.append_quad(i1.get_quad() | i2.get_quad())
     294                i1.advance(1)
     295                i2.advance(1)
     296    return iset
     297
     298
     299def uset_difference(s1, s2):
     300    iset = UCset()
     301    i1 = Uset_Iterator(s1)
     302    i2 = Uset_Iterator(s2)
     303    while not i1.at_end():
     304        (s1_type, s1_length) = i1.current_run()
     305        (s2_type, s2_length) = i2.current_run()
     306        n = min(s1_length, s2_length)
     307        if s1_type == Empty or s2_type == Full:
     308            iset.append_run(Empty, n)
     309            i1.advance(n)
     310            i2.advance(n)
     311        elif s1_type == Full and s2_type == Empty:
     312            iset.append_run(Full, n)
     313            i1.advance(n)
     314            i2.advance(n)
     315        elif s1_type == Full:
     316            for i in range(n):
     317                iset.append_quad(FullQuadMask ^ i2.get_quad())
     318                i2.advance(1)
     319            i1.advance(n)
     320        elif s2_type == Empty:
     321            for i in range(n):
     322                iset.append_quad(i1.get_quad())
     323                i1.advance(1)
     324            i2.advance(n)
     325        else:  # both s1 and s2 have mixed blocks; form block-by-block union
     326            for i in range(n):
     327                iset.append_quad(i1.get_quad() & ~ i2.get_quad())
     328                i1.advance(1)
     329                i2.advance(1)
     330    return iset
     331
     332
     333def uset_symmetric_difference(s1, s2):
     334    iset = UCset()
     335    i1 = Uset_Iterator(s1)
     336    i2 = Uset_Iterator(s2)
     337    while not i1.at_end():
     338        (s1_type, s1_length) = i1.current_run()
     339        (s2_type, s2_length) = i2.current_run()
     340        n = min(s1_length, s2_length)
     341        if s1_type == Empty and s2_type == Full or s1_type == Full and s2_type == Empty:
     342            iset.append_run(Full, n)
     343            i1.advance(n)
     344            i2.advance(n)
     345        elif s1_type == Full and s2_type == Full or s1_type == Empty and s2_type == Empty:
     346            iset.append_run(Empty, n)
     347            i1.advance(n)
     348            i2.advance(n)
     349        elif s1_type == Empty:
     350            for i in range(n):
     351                iset.append_quad(i2.get_quad())
     352                i2.advance(1)
     353            i1.advance(n)
     354        elif s2_type == Empty:
     355            for i in range(n):
     356                iset.append_quad(i1.get_quad())
     357                i1.advance(1)
     358            i2.advance(n)
     359        elif s1_type == Full:
     360            for i in range(n):
     361                iset.append_quad(FullQuadMask ^ i2.get_quad())
     362                i2.advance(1)
     363            i1.advance(n)
     364        elif s2_type == Full:
     365            for i in range(n):
     366                iset.append_quad(FullQuadMask ^ i1.get_quad())
     367                i1.advance(1)
     368            i2.advance(n)
     369        else:  # both s1 and s2 have mixed blocks; form block-by-block union
     370            for i in range(n):
     371                iset.append_quad(i1.get_quad() ^ i2.get_quad())
     372                i1.advance(1)
     373                i2.advance(1)
     374    return iset
     375
    351376
    352377def uset_to_range_list(s):
     
    359384        (q_type, q_length) = i.current_run()
    360385        if q_type == Empty:
    361             if open_range: 
     386            if open_range:
    362387                rl.append((range_first, pos - 1))
    363388                open_range = False
     
    373398            q = i.get_quad()
    374399            qpos = pos
    375             for qpos in range(pos, pos+quad_bits):
     400            for qpos in range(pos, pos + quad_bits):
    376401                if q & 1 == 0:
    377402                    if open_range:
     
    390415    return rl
    391416
     417
    392418UCD_point_regexp = re.compile("^([0-9A-F]{4,6})\s+;")
    393419UCD_range_regexp = re.compile("^([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})\s+;")
    394 
    395 def parse_UCD_set(lines):
    396     pset = empty_set()
    397     for t in lines:
    398         m = UCD_point_regexp.match(t)
    399         if m:
    400             point = m.group(1)
    401             pval = int(point, 16)
    402             pset = union(pset, singleton_set(pval))
    403         m = UCD_range_regexp.match(t)
    404         if m:
    405             point1 = m.group(1)
    406             point2 = m.group(2)
    407             pval1 = int(point1, 16)
    408             pval2 = int(point2, 16)
    409             pset = union(pset, make_range_set(pval1, pval2))
    410     return pset
    411 
    412 def parse_UCD_file(fname, vname):
    413     f = open(fname)
    414     lines = f.readlines()
    415     f.close()
    416     s = parse_UCD_set(lines)
    417     print(s.showC(vname))
    418 
    419 
Note: See TracChangeset for help on using the changeset viewer.