Ignore:
Timestamp:
Dec 24, 2014, 11:09:52 AM (5 years ago)
Author:
cameron
Message:

Improve if-test generation, eliminate some redundancies

File:
1 edited

Legend:

Unmodified
Added
Removed
  • proto/charsetcompiler/unicode_category_compiler.py

    r4223 r4355  
    28 28 # a common multibyte prefix.
    29 29 def utf8_iftest_compiler(cgo, lo, hi):
    30    lo_byte = utf8_byte(lo, 1)
    31    hi_byte = utf8_byte(hi, 1)
    32    if lo_byte == hi_byte:
    33       targetVar = "cp_range_%x_%x" % (lo, hi)
    34       utf8_sequence_generator([(lo, hi)], 1, targetVar, cgo)
    35       return targetVar
    36    if lo > 0 and utf8_byte(lo - 1, 1) == lo_byte:
    37       lo1 = max_codepoint_with_initial_byte(lo_byte)
    38       targetVar = "cp_range_%x_%x" % (lo, lo1)
    39       utf8_sequence_generator([(lo, lo1)], 1, targetVar, cgo)
    40       test_expr1 = Var(targetVar)
    41       lo_byte = utf8_byte(lo1 + 1, 1)
    42    else:
    43       test_expr1 = FalseLiteral()
     30  lo_byte = utf8_byte(lo, 1)
     31  hi_byte = utf8_byte(hi, 1)
     32  targetVar = "cp_range_%x_%x" % (lo, hi)
     33  return utf8_iftest_helper(cgo, lo, hi, 1, targetVar, TrueLiteral())
     34
     35def utf8_iftest_helper(cgo, lo, hi, byte_no, targetVar, marker):
     36  lo_byte = utf8_byte(lo, byte_no)
     37  hi_byte = utf8_byte(hi, byte_no)
     38  at_lo_boundary = lo == 0 or utf8_byte(lo-1, byte_no) != lo_byte
     39  at_hi_boundary = hi == 0x10FFFF or utf8_byte(hi+1, byte_no) != hi_byte
     40  if at_lo_boundary and at_hi_boundary:
     41    if lo_byte == hi_byte:
     42      byteVar = "byte_%x" % lo_byte
     43    else:
    44 44      if lo == 0x80: lo_byte = 0xC0
    45    if hi < 0x10FFFF and utf8_byte(hi + 1, 1) == hi_byte:
    46       hi1 = min_codepoint_with_initial_byte(hi_byte)
    47       targetVar = "cp_range_%x_%x" % (hi1, hi)
    48       utf8_sequence_generator([(hi1, hi)], 1, targetVar, cgo)
    49       test_expr2 = Var(targetVar)
    50       hi_byte = utf8_byte(hi1 - 1, 1)
    51    else:
    52       test_expr2 = FalseLiteral()
    53 45      if hi == 0x10FFFF: hi_byte = 0xFF
    54    if lo_byte > hi_byte: return cgo.expr_string_to_variable(cgo.expr2py(make_or(test_expr1, test_expr2)))
    55    if lo_byte == hi_byte:
    56       byteVar = "byte_%x" % lo_byte
    57    else:
    58 46      byteVar = "byte_range_%x_%x" % (lo_byte, hi_byte)
    59    cgo.chardef_canonical(CanonicalCharSetDef(byteVar, [(lo_byte, hi_byte)]))
    60    return cgo.expr_string_to_variable(cgo.expr2py(make_or(Var(byteVar), make_or(test_expr1, test_expr2))))
     47    cgo.chardef_canonical(CanonicalCharSetDef(byteVar, [(lo_byte, hi_byte)]))
     48    return cgo.expr_string_to_variable(cgo.expr2py(make_and(marker, Var(byteVar))))
     49  elif lo_byte == hi_byte:
     50    byteVar = "byte_%x" % lo_byte
     51    cgo.chardef_canonical(CanonicalCharSetDef(byteVar, [(lo_byte, hi_byte)]))
     52    new_marker = make_shift_forward(make_and(marker, Var(byteVar)), 1)
     53    return utf8_iftest_helper(cgo, lo, hi, byte_no+1, targetVar, new_marker)
     54  elif not at_hi_boundary:
     55    hi1 = min_codepoint_with_common_bytes(hi, byte_no)
     56    e1 = utf8_iftest_helper(cgo, lo, hi1-1, byte_no, targetVar, marker)
     57    e2 = utf8_iftest_helper(cgo, hi1, hi, byte_no, targetVar, marker)
     58    return cgo.expr_string_to_variable(cgo.expr2py(make_or(Var(e1), Var(e2))))
     59  else: # if at_hi_boundary:
     60    lo1 = max_codepoint_with_common_bytes(lo, byte_no)
     61    e1 = utf8_iftest_helper(cgo, lo, lo1, byte_no, targetVar, marker)
     62    e2 = utf8_iftest_helper(cgo, lo1+1, hi, byte_no, targetVar, marker)
     63    return cgo.expr_string_to_variable(cgo.expr2py(make_or(Var(e1), Var(e2))))
     64   
     65def min_codepoint_with_common_bytes(cp, byte_no):
     66  u8len = utf8_length(cp)
     67  mask = (1 << (u8len-byte_no) * 6) - 1
     68  lo_cp = cp &~ mask
     69  if lo_cp == 0: return mask + 1
     70  else: return lo_cp
     71
     72def max_codepoint_with_common_bytes(cp, byte_no):
     73  u8len = utf8_length(cp)
     74  mask = (1 << (u8len-byte_no) * 6) - 1
     75  return cp | mask
    61 76
    62 77
     
    174 189 (0x800, 0x1FFF),
    175 190 (0x800, 0x0FFF),
    176 (0x1000, 0x1FFF),
    177 191 #0800..083F; Samaritan
    178 192 #0840..085F; Mandaic
Note: See TracChangeset for help on using the changeset viewer.