Changeset 4424


Ignore:
Timestamp:
Jan 19, 2015, 5:58:01 PM (4 years ago)
Author:
cameron
Message:

Restructure/bug fix for if-hierarchy generation

Location:
proto/charsetcompiler
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • proto/charsetcompiler/generate_UCD_property_functions.py

    r4423 r4424  
    9090            f.write(generateCharClassDefs(defaultIfRangeList, {k: uset_to_range_list(value_map[k])},  template_var))
    9191        f.write(Unicode_dummy_main)
    92         cformat.close_header_file(f)
     92        f.close()
    9393
    9494    def generate_ScriptExtensions_pablo(self):
     
    106106            f.write(generateCharClassDefs(defaultIfRangeList, {k: uset_to_range_list(value_map[k])},  template_var))
    107107        f.write(Unicode_dummy_main)
    108         cformat.close_header_file(f)
     108        f.close()
    109109
    110110    def generate_binary_property_template(self, filename_root):
     
    132132            f.write(generateCharClassDefs(defaultIfRangeList, {'Y': uset_to_range_list(prop_map[p])},  template_var))
    133133        f.write(Unicode_dummy_main)
    134         cformat.close_header_file(f)
     134        f.close()
    135135
    136136
  • proto/charsetcompiler/if_hierarchy.py

    r4370 r4424  
    7979   enclosedRanges = rangeIntersect(ifRangeList, outer_lo, outer_hi)
    8080   missingRanges = rangeGaps(enclosedRanges, outer_lo, outer_hi)
     81   # Codepoints in unenclosed ranges will be computed unconditionally.
     82   # Generate them first so that computed subexpressions may be shared
     83   # with calculations within the if hierarchy.
    8184   for rg in missingRanges:
    8285     (rglo, rghi) = rg
     
    105108     subcc1 = rangeIntersect(charClassMap[k], lo, hi)
    106109     # Divide by UTF-8 length, separating out E0, ED, F0 and F4 ranges
    107      for byte_range in [(0, 0x7F), (0x80, 0x7FF), (0x800, 0xFFF), (0x1000, 0xD7FF), (0xE000, 0xFFFF), (0x10000, 0x3FFFF), (0x40000, 0xFFFFF), (0x100000, 0x10FFFF)]:
     110     for byte_range in [(0, 0x7F), (0x80, 0x7FF), (0x800, 0xFFF), (0x1000, 0xD7FF), (0xD800, 0xDFFF), (0xE000, 0xFFFF), (0x10000, 0x3FFFF), (0x40000, 0xFFFFF), (0x100000, 0x10FFFF)]:
    108111        (lo1, hi1) = byte_range
    109112        subcc2 = rangeIntersect(subcc1, lo1, hi1)
     
    242245
    243246
    244 #
    245 # Partition a list of ranges into a minimum set of
    246 # UTF-8 prefix groups, where a group is
    247 # (a) a range of codepoints with UTF-8 prefixes of the same length
    248 #     such that every codepoint in the range is within the group, or
    249 # (b) a sublist all having the same UTF-8 initial
    250 #     byte
    251 def partition_by_UTF8_group(range_list, byte_no):
    252     if range_list == []: return []
    253     (lo, hi) = range_list[0]
    254     u8len_lo = utf8_length(lo)
    255     u8len_hi = utf8_length(hi)
    256     if u8len_lo != u8len_hi:
    257         mid = max_codepoint_of_length(u8len_lo)
    258         return partition_by_UTF8_group([(lo, mid), (mid+1, hi)] + range_list[1:], byte_no)
    259     lobyte1 = utf8_byte(lo, byte_no)
    260     hibyte1 = utf8_byte(hi, byte_no)
    261     if lobyte1 != hibyte1:
    262         if not is_low_codepoint_after_byte(lo, byte_no):
    263             lo1 = lo | ((1 << (6 * (u8len_lo - byte_no))) - 1)
    264             #print "lo--lo1:  %x--%x" % (lo, lo1)
    265             return [[(lo, lo1)]] + partition_by_UTF8_group([(lo1+1, hi)] + range_list[1:], byte_no)
    266         elif not is_high_codepoint_after_byte(hi, byte_no):
    267             hi1 = hi &~ ((1 << (6 * (u8len_lo - byte_no))) - 1)
    268             #print "lo--hi-1:  %x--%x" % (lo, hi1-1)
    269             return [[(lo, hi1-1)]] + partition_by_UTF8_group([(hi1, hi)] + range_list[1:], byte_no)
    270         else:
    271             # we have a prefix group of type (a)
    272             return [[(lo, hi)]] + partition_by_UTF8_group(range_list[1:], byte_no)
    273     group1 = [(lo, hi)]
    274     subpartitions = partition_by_UTF8_group(range_list[1:], byte_no)
    275     if subpartitions == []: return [group1]
    276     elif utf8_byte(subpartitions[0][0][0], byte_no) == lobyte1:
    277         return [group1 + subpartitions[0]] + subpartitions[1:]
    278     else:
    279         return [group1] + subpartitions
    280247
    281248# Ensure the sequence of preceding bytes is defined, up to, but
     
    294261
    295262
     263
     264
    296265#
    297266# Generate remaining code to match UTF-8 code sequences within
    298 # the codepoint set u8_partition, assuming that the code matching the
     267# the codepoint set cpset, assuming that the code matching the
    299268# sequences up to byte number byte_no has been generated.
    300269#
    301 def utf8_sequence_generator(u8_partition, byte_no, targetVar, cgo):
    302    if len(u8_partition) == 0: return
    303    (lo, hi) = u8_partition[0]
    304    if utf8_length(lo) == byte_no:
    305       # We have a single byte remaining to match for all codepoints
    306       # in this partition.  Use the byte class compiler to generate
    307       # matches for these codepoints.
    308       ensure_preceding_prefix_defined(lo, byte_no, cgo)
    309       byte_pair_list = byte_definitions(u8_partition, byte_no)
    310       #print byte_pair_list
    311       if len(byte_pair_list) == 1:
    312           (lobyte, hibyte) = byte_pair_list[0]
    313           if lo == hi:
    314               final_byte_var = "byte_%x" % lobyte
    315           else:
    316               final_byte_var = "byte_range_%x_%x" % (lobyte, hibyte)
    317           cgo.chardef_canonical(CanonicalCharSetDef(final_byte_var, byte_pair_list))
    318       else:
    319           hi = u8_partition[-1][0]
    320           final_byte_var = "%s_range_%x_%x_%i" % (targetVar[-2:], lo, hi, byte_no)
    321           cgo.chardef2py(CanonicalCharSetDef(final_byte_var, byte_pair_list))
    322       test_expr = Var(final_byte_var)
    323       if byte_no > 1: 
    324          pfx1 = utf8_prefix_var(lo, byte_no-1)
    325          pfx1_adv = pfx1 + "_adv"
    326          cgo.add_canonical_assignment(pfx1_adv, cgo.expr2py(make_shift_forward(Var(pfx1), 1)))
    327          test_expr = make_and(Var(pfx1_adv), test_expr)
    328       cgo.add_assignment(targetVar, cgo.expr2py(make_or(Var(targetVar), test_expr)))
    329    else:
    330      partitions = partition_by_UTF8_group(u8_partition, byte_no)
    331      for p in partitions:
    332        (lo, hi) = p[0]
    333        lbyte = utf8_byte(lo, byte_no)
    334        hbyte = utf8_byte(hi, byte_no)
    335        ensure_preceding_prefix_defined(lo, byte_no, cgo)
    336        if lbyte == hbyte:
    337          byteVar = "byte_%x" % lbyte
    338          cgo.chardef_canonical(CanonicalCharSetDef(byteVar, [(lbyte, lbyte)]))
    339          if byte_no > 1:
    340            last_prefix = utf8_prefix_var(lo, byte_no - 1)
    341            this_prefix = utf8_prefix_var(lo, byte_no)
    342            cgo.add_canonical_assignment(this_prefix, cgo.expr2py(make_and(make_shift_forward(Var(last_prefix), 1), Var(byteVar))))
    343          if byte_no < utf8_length(lo): utf8_sequence_generator(p, byte_no+1, targetVar, cgo)
    344        else:
    345          byteVar = "byte_range_%x_%x" % (lbyte, hbyte)
    346          cgo.chardef_canonical(CanonicalCharSetDef(byteVar, [(lbyte, hbyte)]))
    347          if byte_no > 1:
    348            last_prefix = utf8_prefix_var(lo, byte_no - 1)
    349            this_prefix = last_prefix + "_" + byteVar
    350            cgo.add_canonical_assignment(this_prefix, cgo.expr2py(make_and(make_shift_forward(Var(last_prefix), 1), Var(byteVar))))
    351          else: this_prefix = byteVar
    352          suffixVar = "byte_range_%x_%x" % (0x80, 0xBF)
    353          cgo.chardef_canonical(CanonicalCharSetDef(suffixVar, [(0x80, 0xBF)]))
    354          last_prefix = this_prefix
    355          while byte_no < utf8_length(lo):
    356            byte_no += 1
    357            this_prefix = last_prefix + "_sfx"
    358            cgo.add_assignment(this_prefix, cgo.expr2py(make_and(make_shift_forward(Var(last_prefix), 1), Var(suffixVar))))
    359            last_prefix = this_prefix
    360          cgo.add_assignment(targetVar, cgo.expr2py(make_or(Var(targetVar), Var(last_prefix))))
     270def utf8_sequence_generator(cpset, byte_no, targetVar, cgo):
     271    if len(cpset) == 0: return
     272    (lo, hi) = cpset[0]
     273    u8len_lo = utf8_length(lo)
     274    u8len_max = utf8_length(cpset[-1][1])
     275    if u8len_lo != u8len_max:
     276        mid = max_codepoint_of_length(u8len_lo)
     277        utf8_sequence_generator(range_intersect(cpset, lo, mid), byte_no)
     278        utf8_sequence_generator(range_intersect(cpset, mid+1, hi1), byte_no)
     279        return
     280    if u8len_lo == byte_no:
     281        # We have a single byte remaining to match for all codepoints
     282        # in this cpset.  Use the byte class compiler to generate
     283        # matches for these codepoints.
     284        ensure_preceding_prefix_defined(lo, byte_no, cgo)
     285        byte_pair_list = byte_definitions(cpset, byte_no)
     286        #print byte_pair_list
     287        if len(byte_pair_list) == 1:
     288            (lobyte, hibyte) = byte_pair_list[0]
     289            if lo == hi:
     290                final_byte_var = "byte_%x" % lobyte
     291            else:
     292                final_byte_var = "byte_range_%x_%x" % (lobyte, hibyte)
     293            cgo.chardef_canonical(CanonicalCharSetDef(final_byte_var, byte_pair_list))
     294        else:
     295            hi = cpset[-1][1]
     296            final_byte_var = "%s_range_%x_%x_%i" % (targetVar[-2:], lo, hi, byte_no)
     297            cgo.chardef2py(CanonicalCharSetDef(final_byte_var, byte_pair_list))
     298        test_expr = Var(final_byte_var)
     299        if byte_no > 1: 
     300           pfx1 = utf8_prefix_var(lo, byte_no-1)
     301           pfx1_adv = pfx1 + "_adv"
     302           cgo.add_canonical_assignment(pfx1_adv, cgo.expr2py(make_shift_forward(Var(pfx1), 1)))
     303           test_expr = make_and(Var(pfx1_adv), test_expr)
     304        cgo.add_assignment(targetVar, cgo.expr2py(make_or(Var(targetVar), test_expr)))
     305        return
     306    #
     307    #
     308    for rg in cpset:
     309            (lo, hi) = rg
     310            lbyte = utf8_byte(lo, byte_no)
     311            hbyte = utf8_byte(hi, byte_no)
     312            if lbyte != hbyte:
     313                if not is_low_codepoint_after_byte(lo, byte_no):
     314                    lo1 = lo | ((1 << (6 * (u8len_lo - byte_no))) - 1)
     315                    #print "lo--lo1:  %x--%x (%i)" % (lo, lo1, byte_no)
     316                    utf8_sequence_generator([(lo, lo1)], byte_no, targetVar, cgo)
     317                    utf8_sequence_generator([(lo1+1, hi)], byte_no, targetVar, cgo)
     318                elif not is_high_codepoint_after_byte(hi, byte_no):
     319                    hi1 = hi &~ ((1 << (6 * (u8len_lo - byte_no))) - 1)
     320                    #print "lo--hi1-1:  %x--%x (%i)" % (lo, hi1-1, byte_no)
     321                    utf8_sequence_generator([(lo, hi1-1)], byte_no, targetVar, cgo)
     322                    utf8_sequence_generator([(hi1, hi)], byte_no, targetVar, cgo)
     323                else:
     324                    # we have a prefix group of type (a)
     325                    #print "lo--hi:  %x--%x (%i)" % (lo, hi, byte_no)
     326                    byteVar = "byte_range_%x_%x" % (lbyte, hbyte)
     327                    cgo.chardef_canonical(CanonicalCharSetDef(byteVar, [(lbyte, hbyte)]))
     328                    if byte_no > 1:
     329                           last_prefix = utf8_prefix_var(lo, byte_no - 1)
     330                           this_prefix = last_prefix + "_" + byteVar
     331                           cgo.add_canonical_assignment(this_prefix, cgo.expr2py(make_and(make_shift_forward(Var(last_prefix), 1), Var(byteVar))))
     332                    else: this_prefix = byteVar
     333                    suffixVar = "byte_range_%x_%x" % (0x80, 0xBF)
     334                    cgo.chardef_canonical(CanonicalCharSetDef(suffixVar, [(0x80, 0xBF)]))
     335                    last_prefix = this_prefix
     336                    while byte_no < utf8_length(lo):
     337                           byte_no += 1
     338                           this_prefix = last_prefix + "_sfx"
     339                           cgo.add_assignment(this_prefix, cgo.expr2py(make_and(make_shift_forward(Var(last_prefix), 1), Var(suffixVar))))
     340                           last_prefix = this_prefix
     341                    cgo.add_assignment(targetVar, cgo.expr2py(make_or(Var(targetVar), Var(last_prefix))))
     342            else:
     343                # lbyte == hbyte
     344                byteVar = "byte_%x" % lbyte
     345                cgo.chardef_canonical(CanonicalCharSetDef(byteVar, [(lbyte, lbyte)]))
     346                if byte_no > 1:
     347                    last_prefix = utf8_prefix_var(lo, byte_no - 1)
     348                    this_prefix = utf8_prefix_var(lo, byte_no)
     349                    cgo.add_canonical_assignment(this_prefix, cgo.expr2py(make_and(make_shift_forward(Var(last_prefix), 1), Var(byteVar))))
     350                if byte_no < utf8_length(lo): utf8_sequence_generator([rg], byte_no+1, targetVar, cgo)
     351
     352
    361353
    362354
Note: See TracChangeset for help on using the changeset viewer.