Changeset 3964


Ignore:
Timestamp:
Aug 6, 2014, 9:56:31 AM (5 years ago)
Author:
cameron
Message:

Shorten if-range tests

Location:
proto/charsetcompiler
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • proto/charsetcompiler/CC_compiler.py

    r3960 r3964  
    274274               e = self.expr_string_to_variable(self.expr2py(expr.operand))
    275275               return 'Advance(%s, %i)' % (e, expr.offset)
    276             else: raise Exception("Bad expression: %s" % repr(e))
     276            else: raise Exception("Bad expression: %s" % repr(expr))
    277277
    278278    def chardef2py(self, chardef):
  • proto/charsetcompiler/unicode_category_compiler.py

    r3962 r3964  
    123123
    124124
     125
     126# Generate the simplest possible test for a Unicode codepoint range:
     127# each 1 bit in the result marks a position within a UTF-8 initial
     128# subsequence for which every legal continuation of that subsequence
     129# is within the range.  Return the name of the generated variable.
     130def utf8_ifrange_compiler(cgo, lo, hi):
     131   lo_len = utf8_length(lo)
     132   hi_len = utf8_length(hi)
     133   # If different length code unit sequences are involved, make
     134   # a union of equilength subranges.
     135   if hi_len > lo_len:
     136     m = max_codepoint_of_length(hi_len - 1)
     137     v_lo = utf8_ifrange_compiler(cgo, lo, m)
     138     v_hi = utf8_ifrange_compiler(cgo, m+1, hi)
     139     range_var = "test_%x_%x" % (lo, hi)
     140     cgo.add_assignment(range_var, cgo.expr2py(make_or(Var(v_lo), Var(v_hi))))
     141     return range_var
     142   #
     143   else:
     144     return matched_ifsequence_compiler(cgo, lo, hi, 1, hi_len)
     145
     146
     147def matched_ifsequence_compiler(cgo, lo, hi, n, hlen):
     148   """ Helper function to generate the code necessary to match bytes
     149       n through hlen (1-based indexing) of the range of utf-8 sequences
     150       for codepoints lo through hi. """
     151   hbyte = utf8_byte(hi, n)
     152   lbyte = utf8_byte(lo, n)
     153   if n == hlen:
     154     targetVar = "bytetest_%x_%x" % (lbyte, hbyte)
     155     cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
     156     return
     157   #
     158   # One or more bytes of the lower and upper bound may be the same.
     159   # Build a sequence of byte tests.
     160   if hbyte == lbyte:
     161     sfxVar = matched_ifsequence_compiler(cgo, lo, hi, n+1, hlen)
     162     targetVar = "bytetest_%x" % (lbyte)
     163     cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
     164     cgo.add_assignment(targetVar, cgo.expr2py(make_and(make_shift_forward(Var(targetVar), 1), Var(sfxVar))))
     165     return targetVar
     166   # We now have a range involving different bytes at position n.
     167   following_suffix_mask = (1 << ((hlen - n) * 6)) - 1
     168   # A separate test may be needed for the high byte sequence if
     169   # there are constraints on following suffix bytes.
     170   if hi & following_suffix_mask != following_suffix_mask:
     171     hi_floor = hi &~following_suffix_mask
     172     hiVar = matched_ifsequence_compiler(cgo, hi_floor, hi, n, hlen)
     173     loVar = matched_ifsequence_compiler(cgo, lo, hi_floor - 1, n, hlen)
     174     targetVar = "range_test_%x_%x_%i" % (lo, hi, n)
     175     cgo.add_assignment(targetVar, cgo.expr2py(make_or(Var(hiVar), Var(loVar))))
     176     return targetVar
     177   # A separate test may be needed for the low byte sequence if
     178   # there are constraints on following suffix bytes.
     179   if lo & following_suffix_mask != 0:
     180     low_ceil = lo | following_suffix_mask
     181     hiVar = matched_ifsequence_compiler(cgo, low_ceil + 1, hi, n, hlen)
     182     loVar = matched_ifsequence_compiler(cgo, lo, low_ceil, n, hlen)
     183     targetVar = "range_test_%x_%x_%i" % (lo, hi, n)
     184     cgo.add_assignment(targetVar, cgo.expr2py(make_or(Var(hiVar), Var(loVar))))
     185     return targetVar
     186   #
     187   # Now we have a range that permits all suffix combinations.
     188   # We don't have to test suffixes UNDER THE ASSUMPTION that the utf-8
     189   # has been validated.
     190   targetVar = "bytetest_%x_%x" % (lbyte, hbyte)
     191   cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
     192   return targetVar
     193
     194
     195
     196
    125197def generate_utf8_leading_bytes_test(codepoint, bytecount, targetVar):
    126198  if bytecount == 0: return [make_assign(targetVar, "1")]
     
    194266   for rg in topRanges:
    195267     (rglo, rghi) = rg
    196      range_var = "CC_%x_%x" % (rglo, rghi)
    197      utf8_range_compiler(cgo, rglo, rghi, range_var)
    198      inner_cgo = CC_compiler(UTF8(), range_var + '_tmp%i', False, '')
     268     inner_cgo = CC_compiler(UTF8(), "r%x_%x" % (rglo, rghi) + '_tmp%i', False, '')
    199269     inner_cgo.add_common_expressions(cgo)
    200270     generateCharClassDefsInIfHierarchy(inner_cgo, rg, inner, charClassMap)
    201271     if inner_cgo.generated_code != []:
     272        range_var = utf8_ifrange_compiler(cgo, rglo, rghi)
    202273        cgo.add_if_stmt(Var(range_var), inner_cgo.generated_code)
    203274   return cgo.showcode()
     
    253324 
    254325
    255 defaultIfRangeList = [(0,0x7FF), (0, 0x7F), (0x80, 0x3FF), (0x400,0x7FF), (0x800, 0xFFFF), (0x10000, 0x10FFFF)]
     326#defaultIfRangeList = [(0,0x7FF), (0, 0x7F), (0x80, 0x3FF), (0x400,0x7FF), (0x800, 0xFFFF), (0x10000, 0x10FFFF)]
    256327
    257328#defaultIfRangeList = [(0x80,0x10FFFF), (0x80,0x7FF), (0x800,0xFFFF), (0x10000, 0x10FFFF)]
    258329
     330defaultIfRangeList = [(0x80,0x10FFFF), (0x80,0x7FF), (0x100,0x2FF),
     331(0x300,0x36F), (0x370,0x3FF), (0x400,0x7FF), (0x400,0x4FF),  (0x500, 0x52F), (0x530, 0x58F), (0x590,0x5FF), (0x600,0x6FF), 
     332(0x700,0x7FF), (0x700,0x74F), (0x750,0x77F), (0x750,0x77F), (0x780,0x7BF), (0x7C0,0x7FF),
     333(0x800,0xFFFF),
     334(0x10000, 0x10FFFF)]
    259335
    260336Unicode_CC_struct = "class category_%s:\n\tcc = 0\n\n"
    261337Unicode_CC_header = "def %s(basis_bits, struct_%s):\n"
    262338Unicode_dummy_main = "\n\ndef Main(basis_bits):\n    pass\n"
     339
    263340def generateDefs1(general_category):
    264341  catmap = {}
Note: See TracChangeset for help on using the changeset viewer.