Changeset 4222


Ignore:
Timestamp:
Oct 8, 2014, 8:18:27 AM (5 years ago)
Author:
cameron
Message:

Factor out utf8 support functions; clean out some old code

Location:
proto/charsetcompiler
Files:
1 added
1 edited

Legend:

Unmodified
Added
Removed
  • proto/charsetcompiler/unicode_category_compiler.py

    r4100 r4222  
    77# Licensed under Open Software License 3.0.
    88
     9from utf8_lib import *
    910from pablo_expr import *
    1011from CC_compiler import *
     
    1415import optparse, sys
    1516
    16 #
    17 # Definitions for debugging/prototyping
    18 def make_if(p, s): return ["if %s\n" %p] + ["  " + x for x in s] + ["endif %s\n" %p]
    19 #
    20 def utf8_length(codepoint):
    21    if codepoint <= 0x7F: return 1
    22    elif codepoint <= 0x7FF: return 2
    23    elif codepoint <= 0xFFFF: return 3
    24    else: return 4
    25 
    26 def utf8_byte(codepoint, n):
    27    lgth = utf8_length(codepoint)
    28    if n == 1:
    29      if lgth == 1: return codepoint
    30      elif lgth == 2: return 0xC0 | (codepoint >> 6)
    31      elif lgth == 3: return 0xE0 | (codepoint >> 12)
    32      elif lgth == 4: return 0xF0 | (codepoint >> 18)
    33    else:
    34      bits = (codepoint >> (6 * (lgth - n))) & 0x3F
    35      return 0x80 | bits
    36 
    37 def max_codepoint_of_length(n):
    38    if n == 1: return 0x7F
    39    elif n == 2: return 0x7FF
    40    elif n == 3: return 0xFFFF
    41    else: return 0x10FFFF
    42 
    43 def max_codepoint_with_initial_byte(byte):
    44    if byte <= 0x7F: return 0x7F
    45    elif byte <= 0xDF: return ((byte & 0x1F) <<6) | 0x3F
    46    elif byte == 0xED: return 0xD7FF
    47    elif byte <= 0xEF: return ((byte & 0x0F) <<12) | 0xFFF
    48    elif byte == 0xF4: return 0x10FFFF
    49    else: return ((byte & 0x07) <<18) | 0x3FFFF
    50 
    51 def min_codepoint_with_initial_byte(byte):
    52    if byte <= 0x7F: return 0
    53    elif byte <= 0xDF: return ((byte & 0x1F) <<6)
    54    elif byte == 0xE0: return 0x1000
    55    elif byte <= 0xEF: return ((byte & 0x0F) <<12)
    56    elif byte == 0xF0: return 0x10000
    57    else: return ((byte & 0x07) <<18)
    58 
    59 #
    60 # Given two codepoints lo, hi: return the number of
    61 # leading UTF-8 bytes that their respective UTF-8
    62 # representations have in common.
    63 def common_utf8_leading_bytes(lo, hi):
    64    u8len_lo = utf8_length(lo)
    65    u8len_hi = utf8_length(hi)
    66    if u8len_lo != u8len_hi: return 0
    67    remaining = u8len_lo
    68    while remaining > 0:
    69      if lo == hi: return remaining
    70      lo >>= 6
    71      hi >>= 6
    72      remaining -= 1
    73    return 0
    74 
    75 
    76 def matched_ifsequence_compiler(cgo, lo, hi, hlen):
    77    return matched_ifsequence_helper(cgo, lo, hi, TrueLiteral(), 1, hlen)
    78 
    79 def matched_ifsequence_helper(cgo, lo, hi, prefix, n, hlen):
    80    """ Helper function to generate the code necessary to match bytes
    81        n through hlen (1-based indexing) of the range of utf-8 sequences
    82        for codepoints lo through hi. """
    83    hbyte = utf8_byte(hi, n)
    84    lbyte = utf8_byte(lo, n)
    85    if n == hlen:
    86      targetVar = "bytetest_%x_%x" % (lbyte, hbyte)
    87      cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
    88      if n == 1: return targetVar
    89      else: return cgo.expr_string_to_variable(cgo.expr2py(make_and(make_shift_forward(prefix, 1), Var(targetVar))))
    90    #
    91    # One or more bytes of the lower and upper bound may be the same.
    92    # Build a sequence of byte tests.
    93    if hbyte == lbyte:
    94      targetVar = "bytetest_%x" % (lbyte)
    95      cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
    96      return matched_ifsequence_helper(cgo, lo, hi, make_and(make_shift_forward(prefix, 1), Var(targetVar)), n+1, hlen)
    97    # We now have a range involving different bytes at position n.
    98    following_suffix_mask = (1 << ((hlen - n) * 6)) - 1
    99    # A separate test may be needed for the high byte sequence if
    100    # there are constraints on following suffix bytes.
    101    if hi & following_suffix_mask != following_suffix_mask:
    102      hi_floor = hi &~following_suffix_mask     
    103      hiVar = matched_ifsequence_helper(cgo, hi_floor, hi, prefix, n, hlen)
    104      loVar = matched_ifsequence_helper(cgo, lo, hi_floor - 1, prefix, n, hlen)
    105      return cgo.expr_string_to_variable(cgo.expr2py(make_or(Var(loVar), Var(hiVar))))
    106    # A separate test may be needed for the low byte sequence if
    107    # there are constraints on following suffix bytes.
    108    if lo & following_suffix_mask != 0:
    109      low_ceil = lo | following_suffix_mask
    110      hiVar = matched_ifsequence_helper(cgo, low_ceil + 1, hi, prefix, n, hlen)
    111      loVar = matched_ifsequence_helper(cgo, lo, low_ceil, prefix, n, hlen)
    112      return cgo.expr_string_to_variable(cgo.expr2py(make_or(Var(loVar), Var(hiVar))))
    113    #
    114    # Now we have a range that permits all suffix combinations.
    115    # We don't have to test suffixes UNDER THE ASSUMPTION that the utf-8
    116    # has been validated.
    117    targetVar = "bytetest_%x_%x" % (lbyte, hbyte)
    118    cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
    119    if n == 1: return targetVar
    120    return cgo.expr_string_to_variable(cgo.expr2py(make_and(make_shift_forward(prefix, 1), Var(targetVar))))
    121 
    12217
    12318# Generate a simplest possible test for a Unicode codepoint range
     
    12520# subsequence such that each legal continuation of that subsequence
    12621# is within the range.  Return the generated variable.
    127 def utf8_ifrange_compiler(cgo, lo, hi):
    128    lo_len = utf8_length(lo)
    129    hi_len = utf8_length(hi)
    130    # If different length code unit sequences are involved, make
    131    # a union of equilength subranges.
    132    if hi_len > lo_len:
    133      m = max_codepoint_of_length(hi_len - 1)
    134      v_lo = utf8_ifrange_compiler(cgo, lo, m)
    135      v_hi = utf8_ifrange_compiler(cgo, m+1, hi)
    136      return cgo.expr_string_to_variable(cgo.expr2py(make_or(Var(v_lo), Var(v_hi))))
    137    #
    138    else:
    139      return matched_ifsequence_compiler(cgo, lo, hi, hi_len)
    140 
    14122#
    14223# The test may be made up of up to three parts:
Note: See TracChangeset for help on using the changeset viewer.