Ignore:
Timestamp:
Jul 2, 2015, 10:34:26 AM (4 years ago)
Author:
nmedfort
Message:

Passing last used prefix instead of regenerating it implicitly by name.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • proto/charsetcompiler/if_hierarchy.py

    r4514 r4630  
    88
    99from utf8_lib import *
    10 from pablo_expr import *
    1110from CC_compiler import *
    1211from UTF_encoding import *
     
    2524# It is possible that the low- and high- boundary tests have
    2625# a common multibyte prefix.
     26# def utf8_iftest_compiler(cgo, lo, hi):
     27#     targetVar = "cp_range_%x_%x" % (lo, hi)
     28#     return utf8_iftest_helper(cgo, lo, hi, 1, targetVar, TrueLiteral())
     29#
     30#
     31# def utf8_iftest_helper(cgo, lo, hi, byte_no, targetVar, marker):
     32#     lo_byte = utf8_byte(lo, byte_no)
     33#     hi_byte = utf8_byte(hi, byte_no)
     34#     at_lo_boundary = lo == 0 or utf8_byte(lo - 1, byte_no) != lo_byte
     35#     at_hi_boundary = hi == 0x10FFFF or utf8_byte(hi + 1, byte_no) != hi_byte
     36#     if at_lo_boundary and at_hi_boundary:
     37#         if lo_byte == hi_byte:
     38#             byteVar = "byte_%x" % lo_byte
     39#         else:
     40#             if lo == 0x80: lo_byte = 0xC0
     41#             if hi == 0x10FFFF: hi_byte = 0xFF
     42#             byteVar = "byte_range_%x_%x" % (lo_byte, hi_byte)
     43#         cgo.chardef_canonical(CanonicalCharSetDef(byteVar, [(lo_byte, hi_byte)]))
     44#         return cgo.expr_string_to_variable(cgo.expr2py(make_and(marker, Var(byteVar))))
     45#     elif lo_byte == hi_byte:
     46#         byteVar = "byte_%x" % lo_byte
     47#         cgo.chardef_canonical(CanonicalCharSetDef(byteVar, [(lo_byte, hi_byte)]))
     48#         new_marker = make_shift_forward(make_and(marker, Var(byteVar)), 1)
     49#         return utf8_iftest_helper(cgo, lo, hi, byte_no + 1, targetVar, new_marker)
     50#     elif not at_hi_boundary:
     51#         hi1 = min_codepoint_with_common_bytes(hi, byte_no)
     52#         e1 = utf8_iftest_helper(cgo, lo, hi1 - 1, byte_no, targetVar, marker)
     53#         e2 = utf8_iftest_helper(cgo, hi1, hi, byte_no, targetVar, marker)
     54#         return cgo.expr_string_to_variable(cgo.expr2py(make_or(Var(e1), Var(e2))))
     55#     else:  # if at_hi_boundary:
     56#         lo1 = max_codepoint_with_common_bytes(lo, byte_no)
     57#         e1 = utf8_iftest_helper(cgo, lo, lo1, byte_no, targetVar, marker)
     58#         e2 = utf8_iftest_helper(cgo, lo1 + 1, hi, byte_no, targetVar, marker)
     59#         return cgo.expr_string_to_variable(cgo.expr2py(make_or(Var(e1), Var(e2))))
    2760def utf8_iftest_compiler(cgo, lo, hi):
    28   lo_byte = utf8_byte(lo, 1)
    29   hi_byte = utf8_byte(hi, 1)
    30   targetVar = "cp_range_%x_%x" % (lo, hi)
    31   return utf8_iftest_helper(cgo, lo, hi, 1, targetVar, TrueLiteral())
    32 
    33 def utf8_iftest_helper(cgo, lo, hi, byte_no, targetVar, marker):
    34   lo_byte = utf8_byte(lo, byte_no)
    35   hi_byte = utf8_byte(hi, byte_no)
    36   at_lo_boundary = lo == 0 or utf8_byte(lo-1, byte_no) != lo_byte
    37   at_hi_boundary = hi == 0x10FFFF or utf8_byte(hi+1, byte_no) != hi_byte
    38   if at_lo_boundary and at_hi_boundary:
    39     if lo_byte == hi_byte:
    40       byteVar = "byte_%x" % lo_byte
     61    return utf8_iftest_helper(cgo, lo, hi, 1, TrueLiteral())
     62
     63
     64def utf8_iftest_helper(cgo, lo, hi, byte_no, marker):
     65    lo_byte = utf8_byte(lo, byte_no)
     66    hi_byte = utf8_byte(hi, byte_no)
     67    at_lo_boundary = lo == 0 or utf8_byte(lo - 1, byte_no) != lo_byte
     68    at_hi_boundary = hi == 0x10FFFF or utf8_byte(hi + 1, byte_no) != hi_byte
     69    if at_lo_boundary and at_hi_boundary:
     70        if lo_byte == hi_byte:
     71            byteVar = "byte_%x" % lo_byte
     72        else:
     73            if lo == 0x80: lo_byte = 0xC0
     74            if hi == 0x10FFFF: hi_byte = 0xFF
     75            byteVar = "byte_range_%x_%x" % (lo_byte, hi_byte)
     76        cgo.chardef_canonical(CanonicalCharSetDef(byteVar, [(lo_byte, hi_byte)]))
     77        return cgo.expr_string_to_variable(cgo.expr2py(make_and(marker, Var(byteVar))))
     78    elif lo_byte == hi_byte:
     79        byteVar = "byte_%x" % lo_byte
     80        cgo.chardef_canonical(CanonicalCharSetDef(byteVar, [(lo_byte, hi_byte)]))
     81        new_marker = make_shift_forward(make_and(marker, Var(byteVar)), 1)
     82        return utf8_iftest_helper(cgo, lo, hi, byte_no + 1, new_marker)
     83    elif not at_hi_boundary:
     84        mid = min_codepoint_with_common_bytes(hi, byte_no)
     85        e1 = utf8_iftest_helper(cgo, lo, mid - 1, byte_no, marker)
     86        e2 = utf8_iftest_helper(cgo, mid, hi, byte_no, marker)
     87        return cgo.expr_string_to_variable(cgo.expr2py(make_or(Var(e1), Var(e2))))
     88    else:  # if at_hi_boundary:
     89        mid = max_codepoint_with_common_bytes(lo, byte_no)
     90        e1 = utf8_iftest_helper(cgo, lo, mid, byte_no, marker)
     91        e2 = utf8_iftest_helper(cgo, mid + 1, hi, byte_no, marker)
     92        return cgo.expr_string_to_variable(cgo.expr2py(make_or(Var(e1), Var(e2))))
     93
     94def min_codepoint_with_common_bytes(cp, byte_no):
     95    u8len = utf8_length(cp)
     96    mask = (1 << (u8len - byte_no) * 6) - 1
     97    lo_cp = cp & ~ mask
     98    if lo_cp == 0:
     99        return mask + 1
    41100    else:
    42       if lo == 0x80: lo_byte = 0xC0
    43       if hi == 0x10FFFF: hi_byte = 0xFF
    44       byteVar = "byte_range_%x_%x" % (lo_byte, hi_byte)
    45     cgo.chardef_canonical(CanonicalCharSetDef(byteVar, [(lo_byte, hi_byte)]))
    46     return cgo.expr_string_to_variable(cgo.expr2py(make_and(marker, Var(byteVar))))
    47   elif lo_byte == hi_byte:
    48     byteVar = "byte_%x" % lo_byte
    49     cgo.chardef_canonical(CanonicalCharSetDef(byteVar, [(lo_byte, hi_byte)]))
    50     new_marker = make_shift_forward(make_and(marker, Var(byteVar)), 1)
    51     return utf8_iftest_helper(cgo, lo, hi, byte_no+1, targetVar, new_marker)
    52   elif not at_hi_boundary:
    53     hi1 = min_codepoint_with_common_bytes(hi, byte_no)
    54     e1 = utf8_iftest_helper(cgo, lo, hi1-1, byte_no, targetVar, marker)
    55     e2 = utf8_iftest_helper(cgo, hi1, hi, byte_no, targetVar, marker)
    56     return cgo.expr_string_to_variable(cgo.expr2py(make_or(Var(e1), Var(e2))))
    57   else: # if at_hi_boundary:
    58     lo1 = max_codepoint_with_common_bytes(lo, byte_no)
    59     e1 = utf8_iftest_helper(cgo, lo, lo1, byte_no, targetVar, marker)
    60     e2 = utf8_iftest_helper(cgo, lo1+1, hi, byte_no, targetVar, marker)
    61     return cgo.expr_string_to_variable(cgo.expr2py(make_or(Var(e1), Var(e2))))
    62    
    63 def min_codepoint_with_common_bytes(cp, byte_no):
    64   u8len = utf8_length(cp)
    65   mask = (1 << (u8len-byte_no) * 6) - 1
    66   lo_cp = cp &~ mask
    67   if lo_cp == 0: return mask + 1
    68   else: return lo_cp
     101        return lo_cp
     102
    69103
    70104def max_codepoint_with_common_bytes(cp, byte_no):
    71   u8len = utf8_length(cp)
    72   mask = (1 << (u8len-byte_no) * 6) - 1
    73   return cp | mask
    74 
    75 
    76 def generateCharClassDefsInIfHierarchy(cgo, enclosingRange, ifRangeList, charClassMap, template_var):
    77 #   inner_code = []
    78    (outer_lo, outer_hi) = enclosingRange
    79    enclosedRanges = rangeIntersect(ifRangeList, outer_lo, outer_hi)
    80    missingRanges = rangeGaps(enclosedRanges, outer_lo, outer_hi)
    81    # Codepoints in unenclosed ranges will be computed unconditionally.
    82    # Generate them first so that computed subexpressions may be shared
    83    # with calculations within the if hierarchy.
    84    for rg in missingRanges:
    85      (rglo, rghi) = rg
    86      generateCharClassSubDefs(cgo, rglo, rghi, charClassMap, template_var)
    87    topRanges = outerRanges(enclosedRanges)
    88    inner = innerRanges(enclosedRanges)
    89    for rg in topRanges:
    90      (rglo, rghi) = rg
    91      empty_range = True
    92      for k in charClassMap.keys():
    93         if rangeIntersect(charClassMap[k], rglo, rghi) != []:
    94            empty_range = False
    95            break
    96      if not empty_range:
    97        range_var = utf8_iftest_compiler(cgo, rglo, rghi)
    98        inner_cgo = CC_compiler(UTF8(), "r%x_%x" % (rglo, rghi) + '_tmp%i', False, '')
    99        inner_cgo.add_common_expressions(cgo)
    100        generateCharClassDefsInIfHierarchy(inner_cgo, rg, inner, charClassMap, template_var)
    101        if inner_cgo.generated_code != []:
    102          cgo.add_if_stmt(Var(range_var), inner_cgo.generated_code)
    103    return cgo.showcode()
     105    u8len = utf8_length(cp)
     106    mask = (1 << (u8len - byte_no) * 6) - 1
     107    return cp | mask
     108
     109
     110def generateCharClassDefsInIfHierarchy(cgo, ifRangeList, charClassMap, enclosingRange, template_var):
     111    #   inner_code = []
     112    (lo, hi) = enclosingRange
     113    enclosedRanges = rangeIntersect(ifRangeList, lo, hi)
     114    missingRanges = rangeGaps(enclosedRanges, lo, hi)
     115    # Codepoints in unenclosed ranges will be computed unconditionally.
     116    # Generate them first so that computed subexpressions may be shared
     117    # with calculations within the if hierarchy.
     118    for rg in missingRanges:
     119        (lo, hi) = rg
     120        generateCharClassSubDefs(cgo, lo, hi, charClassMap, template_var)
     121    if len(enclosedRanges) > 0:
     122        topRanges = outerRanges(enclosedRanges)
     123        inner = innerRanges(enclosedRanges)
     124        for rg in topRanges:
     125            (lo, hi) = rg
     126            empty = True
     127            for k in charClassMap.keys():
     128                if len(rangeIntersect(charClassMap[k], lo, hi)) > 0:
     129                    empty = False
     130                    break
     131            if not empty:
     132                inner_cgo = CC_compiler(UTF8(), "r%x_%x" % (lo, hi) + '_tmp%i', False, '')
     133                inner_cgo.add_common_expressions(cgo)
     134                generateCharClassDefsInIfHierarchy(inner_cgo, inner, charClassMap, rg, template_var)
     135                if len(inner_cgo.generated_code) > 0:
     136                    cgo.add_if_stmt(Var(utf8_iftest_compiler(cgo, lo, hi)), inner_cgo.generated_code)
     137    return cgo.showcode()
    104138
    105139def generateCharClassSubDefs(cgo, lo, hi, charClassMap, template_var):
    106    for k in charClassMap.keys():
    107      targetVar = template_var % k
    108      subcc1 = rangeIntersect(charClassMap[k], lo, hi)
    109      # Divide by UTF-8 length, separating out E0, ED, F0 and F4 ranges
    110      for byte_range in [(0, 0x7F), (0x80, 0x7FF), (0x800, 0xFFF), (0x1000, 0xD7FF), (0xD800, 0xDFFF), (0xE000, 0xFFFF), (0x10000, 0x3FFFF), (0x40000, 0xFFFFF), (0x100000, 0x10FFFF)]:
    111         (lo1, hi1) = byte_range
    112         subcc2 = rangeIntersect(subcc1, lo1, hi1)
    113         utf8_sequence_generator(subcc2, 1, targetVar, cgo)
    114 
    115 def rangeIntersect(ccList, lo, hi):
    116     return [(max(lo, p[0]), min(hi, p[1])) for p in ccList if p[0] <= hi and p[1] >= lo]
    117 
    118 def rangeGaps(ccList, lo, hi):
    119     if lo >= hi: return []
    120     if ccList == []: return [(lo, hi)]
    121     (lo1, hi1) = ccList[0]
    122     if hi1 < lo: return rangeGaps(ccList[1:], lo, hi)
    123     if lo1 > lo: return [(lo, lo1 - 1)] + rangeGaps(ccList[1:], hi1+1, hi)
    124     elif hi1 < hi: return rangeGaps(ccList[1:], hi1+1, hi)
    125     else: return []
     140    for k in charClassMap.keys():
     141        targetVar = template_var % k
     142        subcc1 = rangeIntersect(charClassMap[k], lo, hi)
     143        # Divide by UTF-8 length, separating out E0, ED, F0 and F4 ranges
     144        for byte_range in [(0, 0x7F), (0x80, 0x7FF), (0x800, 0xFFF), (0x1000, 0xD7FF), (0xD800, 0xDFFF),
     145                           (0xE000, 0xFFFF), (0x10000, 0x3FFFF), (0x40000, 0xFFFFF), (0x100000, 0x10FFFF)]:
     146            (lo1, hi1) = byte_range
     147            subcc2 = rangeIntersect(subcc1, lo1, hi1)
     148            utf8_sequence_generator(cgo, 1, targetVar, subcc2)
     149
     150
     151def rangeIntersect(range, lo, hi):
     152    return [(max(lo, p[0]), min(hi, p[1])) for p in range if p[0] <= hi and p[1] >= lo]
     153
     154
     155def rangeGaps(range, lo, hi):
     156    gaps = []
     157    if lo < hi:
     158        if len(range) == 0:
     159            gaps.append((lo, hi))
     160        else:
     161            for item in range:
     162                (lo1, hi1) = item
     163                if hi1 < lo:
     164                    continue
     165                elif lo1 > lo:
     166                    gaps.append((lo, lo1 - 1))
     167                elif hi1 >= hi:
     168                    continue
     169                lo = hi1 + 1
     170    return gaps
    126171
    127172def outerRanges(ccList):
    128     if len(ccList) <= 1: return ccList
    129     (lo1, hi1) = ccList[0]
    130     (lo2, hi2) = ccList[1]
    131     if hi2 <= hi1: return outerRanges([(lo1, hi1)] + ccList[2:])
    132     else: return [(lo1, hi1)] + outerRanges(ccList[1:])
     173    ranges = []
     174    if (len(ccList) > 0):
     175        i = 0
     176        for j in range(1, len(ccList)):
     177            (lo1, hi1) = ccList[i]
     178            (lo2, hi2) = ccList[j]
     179            if hi2 > hi1:
     180                ranges.append(ccList[i])
     181                i = j
     182        if i < len(ccList):
     183            ranges.append(ccList[i])
     184    return ranges
    133185
    134186def innerRanges(ccList):
    135     if len(ccList) <= 1: return []
    136     (lo1, hi1) = ccList[0]
    137     (lo2, hi2) = ccList[1]
    138     if hi2 <= hi1: return [(lo2, hi2)] + innerRanges([(lo1, hi1)] + ccList[2:])
    139     else: return innerRanges(ccList[1:])
    140 
    141 
     187    ranges = []
     188    if (len(ccList) > 0):
     189        i = 0
     190        for j in range(1, len(ccList)):
     191            (lo1, hi1) = ccList[i]
     192            (lo2, hi2) = ccList[j]
     193            if hi2 <= hi1:
     194                ranges.append(ccList[j])
     195            else:
     196                i = j
     197    return ranges
    142198
    143199def generateCharClassDefs(ifRangeList, charClassMap, template_var):
    144    cgo = CC_compiler(UTF8(), 'tmp%i', False, '')
    145    for k in charClassMap.keys():
    146      cgo.add_assignment(template_var % k, '0')
    147    generateCharClassDefsInIfHierarchy(cgo, (0, 0x10FFFF), ifRangeList, charClassMap, template_var)
    148    return cgo.showcode()
    149  
    150 
    151 #defaultIfRangeList = [(0,0x7FF), (0, 0x7F), (0x80, 0x3FF), (0x400,0x7FF), (0x800, 0xFFFF), (0x10000, 0x10FFFF)]
    152 
    153 #defaultIfRangeList = [(0x80,0x10FFFF), (0x80,0x7FF), (0x800,0xFFFF), (0x10000, 0x10FFFF)]
     200    cgo = CC_compiler(UTF8(), 'tmp%i', False, '')
     201    for k in charClassMap.keys():
     202        print template_var % k
     203        cgo.add_assignment(template_var % k, '0')
     204    generateCharClassDefsInIfHierarchy(cgo, ifRangeList, charClassMap, (0, 0x10FFFF), template_var)
     205    return cgo.showcode()
     206
     207# defaultIfRangeList = [(0,0x7FF), (0, 0x7F), (0x80, 0x3FF), (0x400,0x7FF), (0x800, 0xFFFF), (0x10000, 0x10FFFF)]
     208
     209# defaultIfRangeList = [(0x80,0x10FFFF), (0x80,0x7FF), (0x800,0xFFFF), (0x10000, 0x10FFFF)]
    154210
    155211
    156212defaultIfRangeList = [
    157 #Non-ASCII
    158 (0x80,0x10FFFF),
    159 #Two-byte sequences
    160 (0x80,0x7FF),
    161 (0x100, 0x3FF),
    162 #0100..017F; Latin Extended-A
    163 #0180..024F; Latin Extended-B
    164 #0250..02AF; IPA Extensions
    165 #02B0..02FF; Spacing Modifier Letters
    166 (0x100, 0x2FF), (0x100, 0x24F), (0x100, 0x17F), (0x180, 0x24F), (0x250, 0x2AF), (0x2B0, 0x2FF),
    167 #0300..036F; Combining Diacritical Marks
    168 #0370..03FF; Greek and Coptic
    169 (0x300, 0x36F), (0x370, 0x3FF),
    170 #0400..04FF; Cyrillic
    171 #0500..052F; Cyrillic Supplement
    172 #0530..058F; Armenian
    173 #0590..05FF; Hebrew
    174 #0600..06FF; Arabic
    175 (0x400, 0x5FF), (0x400, 0x4FF), (0x500, 0x058F), (0x500, 0x52F), (0x530, 0x58F), (0x590, 0x5FF), (0x600, 0x6FF),
    176 #0700..074F; Syriac
    177 #0750..077F; Arabic Supplement
    178 #0780..07BF; Thaana
    179 #07C0..07FF; NKo
    180 (0x700, 0x77F), (0x700, 0x74F), (0x750, 0x77F), (0x780, 0x7FF), (0x780, 0x7BF), (0x7C0, 0x7FF),
    181 #Three-byte sequences
    182 (0x800, 0xFFFF),
    183 (0x800, 0x4DFF),
    184 (0x800, 0x1FFF),
    185 (0x800, 0x0FFF),
    186 #0800..083F; Samaritan
    187 #0840..085F; Mandaic
    188 #08A0..08FF; Arabic Extended-A
    189 #0900..097F; Devanagari
    190 #0980..09FF; Bengali
    191 #0A00..0A7F; Gurmukhi
    192 #0A80..0AFF; Gujarati
    193 #0B00..0B7F; Oriya
    194 #0B80..0BFF; Tamil
    195 #0C00..0C7F; Telugu
    196 #0C80..0CFF; Kannada
    197 #0D00..0D7F; Malayalam
    198 #0D80..0DFF; Sinhala
    199 #0E00..0E7F; Thai
    200 #0E80..0EFF; Lao
    201 #0F00..0FFF; Tibetan
    202 (0x1000, 0x1FFF),
    203 #1000..109F; Myanmar
    204 #10A0..10FF; Georgian
    205 #1100..11FF; Hangul Jamo
    206 #1200..137F; Ethiopic
    207 #1380..139F; Ethiopic Supplement
    208 #13A0..13FF; Cherokee
    209 #1400..167F; Unified Canadian Aboriginal Syllabics
    210 #1680..169F; Ogham
    211 #16A0..16FF; Runic
    212 #1700..171F; Tagalog
    213 #1720..173F; Hanunoo
    214 #1740..175F; Buhid
    215 #1760..177F; Tagbanwa
    216 #1780..17FF; Khmer
    217 #1800..18AF; Mongolian
    218 #18B0..18FF; Unified Canadian Aboriginal Syllabics Extended
    219 #1900..194F; Limbu
    220 #1950..197F; Tai Le
    221 #1980..19DF; New Tai Lue
    222 #19E0..19FF; Khmer Symbols
    223 #1A00..1A1F; Buginese
    224 #1A20..1AAF; Tai Tham
    225 #1AB0..1AFF; Combining Diacritical Marks Extended
    226 #1B00..1B7F; Balinese
    227 #1B80..1BBF; Sundanese
    228 #1BC0..1BFF; Batak
    229 #1C00..1C4F; Lepcha
    230 #1C50..1C7F; Ol Chiki
    231 #1CC0..1CCF; Sundanese Supplement
    232 #1CD0..1CFF; Vedic Extensions
    233 #1D00..1D7F; Phonetic Extensions
    234 #1D80..1DBF; Phonetic Extensions Supplement
    235 #1DC0..1DFF; Combining Diacritical Marks Supplement
    236 #1E00..1EFF; Latin Extended Additional
    237 #1F00..1FFF; Greek Extended
    238 (0x2000, 0x4DFF),(0x2000, 0x2FFF),
    239 (0x3000, 0x4DFF),
    240 (0x4E00,0x9FFF),
    241 #4E00..9FFF; CJK Unified Ideographs
    242 (0xA000,0xFFFF),
    243 
    244 (0x10000, 0x10FFFF)]
    245 
    246 
     213    # Non-ASCII
     214    (0x80, 0x10FFFF),
     215    # Two-byte sequences
     216    (0x80, 0x7FF),
     217    (0x100, 0x3FF),
     218    # 0100..017F; Latin Extended-A
     219    # 0180..024F; Latin Extended-B
     220    # 0250..02AF; IPA Extensions
     221    # 02B0..02FF; Spacing Modifier Letters
     222    (0x100, 0x2FF), (0x100, 0x24F), (0x100, 0x17F), (0x180, 0x24F), (0x250, 0x2AF), (0x2B0, 0x2FF),
     223    # 0300..036F; Combining Diacritical Marks
     224    # 0370..03FF; Greek and Coptic
     225    (0x300, 0x36F), (0x370, 0x3FF),
     226    # 0400..04FF; Cyrillic
     227    # 0500..052F; Cyrillic Supplement
     228    # 0530..058F; Armenian
     229    # 0590..05FF; Hebrew
     230    # 0600..06FF; Arabic
     231    (0x400, 0x5FF), (0x400, 0x4FF), (0x500, 0x058F), (0x500, 0x52F), (0x530, 0x58F), (0x590, 0x5FF), (0x600, 0x6FF),
     232    # 0700..074F; Syriac
     233    # 0750..077F; Arabic Supplement
     234    # 0780..07BF; Thaana
     235    # 07C0..07FF; NKo
     236    (0x700, 0x77F), (0x700, 0x74F), (0x750, 0x77F), (0x780, 0x7FF), (0x780, 0x7BF), (0x7C0, 0x7FF),
     237    # Three-byte sequences
     238    (0x800, 0xFFFF),
     239    (0x800, 0x4DFF),
     240    (0x800, 0x1FFF),
     241    (0x800, 0x0FFF),
     242    # 0800..083F; Samaritan
     243    # 0840..085F; Mandaic
     244    # 08A0..08FF; Arabic Extended-A
     245    # 0900..097F; Devanagari
     246    # 0980..09FF; Bengali
     247    # 0A00..0A7F; Gurmukhi
     248    # 0A80..0AFF; Gujarati
     249    # 0B00..0B7F; Oriya
     250    # 0B80..0BFF; Tamil
     251    # 0C00..0C7F; Telugu
     252    # 0C80..0CFF; Kannada
     253    # 0D00..0D7F; Malayalam
     254    # 0D80..0DFF; Sinhala
     255    # 0E00..0E7F; Thai
     256    # 0E80..0EFF; Lao
     257    # 0F00..0FFF; Tibetan
     258    (0x1000, 0x1FFF),
     259    # 1000..109F; Myanmar
     260    # 10A0..10FF; Georgian
     261    # 1100..11FF; Hangul Jamo
     262    # 1200..137F; Ethiopic
     263    # 1380..139F; Ethiopic Supplement
     264    # 13A0..13FF; Cherokee
     265    # 1400..167F; Unified Canadian Aboriginal Syllabics
     266    # 1680..169F; Ogham
     267    # 16A0..16FF; Runic
     268    # 1700..171F; Tagalog
     269    # 1720..173F; Hanunoo
     270    # 1740..175F; Buhid
     271    # 1760..177F; Tagbanwa
     272    # 1780..17FF; Khmer
     273    # 1800..18AF; Mongolian
     274    # 18B0..18FF; Unified Canadian Aboriginal Syllabics Extended
     275    # 1900..194F; Limbu
     276    # 1950..197F; Tai Le
     277    # 1980..19DF; New Tai Lue
     278    # 19E0..19FF; Khmer Symbols
     279    # 1A00..1A1F; Buginese
     280    # 1A20..1AAF; Tai Tham
     281    # 1AB0..1AFF; Combining Diacritical Marks Extended
     282    # 1B00..1B7F; Balinese
     283    # 1B80..1BBF; Sundanese
     284    # 1BC0..1BFF; Batak
     285    # 1C00..1C4F; Lepcha
     286    # 1C50..1C7F; Ol Chiki
     287    # 1CC0..1CCF; Sundanese Supplement
     288    # 1CD0..1CFF; Vedic Extensions
     289    # 1D00..1D7F; Phonetic Extensions
     290    # 1D80..1DBF; Phonetic Extensions Supplement
     291    # 1DC0..1DFF; Combining Diacritical Marks Supplement
     292    # 1E00..1EFF; Latin Extended Additional
     293    # 1F00..1FFF; Greek Extended
     294    (0x2000, 0x4DFF), (0x2000, 0x2FFF),
     295    (0x3000, 0x4DFF),
     296    (0x4E00, 0x9FFF),
     297    # 4E00..9FFF; CJK Unified Ideographs
     298    (0xA000, 0xFFFF),
     299
     300    (0x10000, 0x10FFFF)]
    247301
    248302# Ensure the sequence of preceding bytes is defined, up to, but
    249303# not including the given byte_no
    250 def ensure_preceding_prefix_defined(codepoint, byte_no, cgo):
    251    for i in range(1, byte_no):
    252       byte_i = utf8_byte(codepoint, i)
    253       byteVar = "byte_%x" % byte_i
    254       cgo.chardef_canonical(CanonicalCharSetDef(byteVar, [(byte_i, byte_i)]))
    255       if i > 1:
    256          pfx1 = utf8_prefix_var(codepoint, i-1)
    257          pfx1_adv = pfx1 + "_adv"
    258          cgo.add_canonical_assignment(pfx1_adv, cgo.expr2py(make_shift_forward(Var(pfx1), 1)))
    259          pfx2 = utf8_prefix_var(codepoint, i)
    260          cgo.add_canonical_assignment(pfx2, cgo.expr2py(make_and(Var(pfx1_adv), Var(byteVar))))
    261 
    262 
    263 
     304def make_prefix(cgo, codepoint, byte_no, prefix):
     305    for i in range(1, byte_no):
     306        byte_i = utf8_byte(codepoint, i)
     307        var = "byte_%x" % byte_i
     308        cgo.chardef_canonical(CanonicalCharSetDef(prefix, [(byte_i, byte_i)]))
     309        if i > 1:
     310            adv_prefix = prefix + "_adv"
     311            cgo.add_canonical_assignment(adv_prefix, cgo.expr2py(make_shift_forward(Var(prefix), 1)))
     312            next_prefix = utf8_prefix_var(codepoint, i)
     313            cgo.add_canonical_assignment(next_prefix, cgo.expr2py(make_and(Var(adv_prefix), Var(var))))
     314            var = next_prefix
     315        prefix = var
     316    return prefix
    264317
    265318#
     
    268321# sequences up to byte number byte_no have been generated.
    269322#
    270 def utf8_sequence_generator(cpset, byte_no, targetVar, cgo):
    271     if len(cpset) == 0: return
    272     (lo, hi) = cpset[0]
    273     u8len_lo = utf8_length(lo)
    274     u8len_max = utf8_length(cpset[-1][1])
    275     if u8len_lo != u8len_max:
    276         mid = max_codepoint_of_length(u8len_lo)
    277         utf8_sequence_generator(range_intersect(cpset, lo, mid), byte_no)
    278         utf8_sequence_generator(range_intersect(cpset, mid+1, hi1), byte_no)
     323def utf8_sequence_generator(cgo, byte_no, target, cc, prefix = None):
     324    if len(cc) == 0:
    279325        return
    280     if u8len_lo == byte_no:
    281         # We have a single byte remaining to match for all codepoints
    282         # in this cpset.  Use the byte class compiler to generate
    283         # matches for these codepoints.
    284         ensure_preceding_prefix_defined(lo, byte_no, cgo)
    285         byte_pair_list = byte_definitions(cpset, byte_no)
    286         #print byte_pair_list
    287         if len(byte_pair_list) == 1:
    288             (lobyte, hibyte) = byte_pair_list[0]
     326    (lo, hi) = cc[0]
     327    u8len_min = utf8_length(lo)
     328    u8len_max = utf8_length(cc[-1][1])
     329
     330    # print " -- ", byte_no, "  ", cc[0], "    ", (u8len_min, u8len_max)
     331
     332    assert(u8len_min == u8len_max)
     333
     334    if u8len_min != u8len_max:
     335
     336        mid = max_codepoint_of_length(u8len_min)
     337        utf8_sequence_generator(cgo, byte_no, target, rangeIntersect(cc, lo, mid), prefix)
     338        utf8_sequence_generator(cgo, byte_no, target, rangeIntersect(cc, mid + 1, hi), prefix)
     339
     340    elif u8len_min == byte_no:
     341        # We have a single byte remaining to match for all code points
     342        # in this cc.  Use the byte class compiler to generate matches
     343        # for these code points.
     344
     345        byte_pairs = byte_definitions(cc, byte_no)
     346        if len(byte_pairs) == 1:
     347            (lobyte, hibyte) = byte_pairs[0]
    289348            if lo == hi:
    290349                final_byte_var = "byte_%x" % lobyte
    291350            else:
    292351                final_byte_var = "byte_range_%x_%x" % (lobyte, hibyte)
    293             cgo.chardef_canonical(CanonicalCharSetDef(final_byte_var, byte_pair_list))
     352            cgo.chardef_canonical(CanonicalCharSetDef(final_byte_var, byte_pairs))
    294353        else:
    295             hi = cpset[-1][1]
    296             final_byte_var = "%s_range_%x_%x_%i" % (targetVar[-2:], lo, hi, byte_no)
    297             cgo.chardef2py(CanonicalCharSetDef(final_byte_var, byte_pair_list))
     354            hi = cc[-1][1]
     355            final_byte_var = "cp_range_%x_%x_%i" % (lo, hi, byte_no)
     356            cgo.chardef2py(CanonicalCharSetDef(final_byte_var, byte_pairs))
    298357        test_expr = Var(final_byte_var)
    299         if byte_no > 1: 
    300            pfx1 = utf8_prefix_var(lo, byte_no-1)
    301            pfx1_adv = pfx1 + "_adv"
    302            cgo.add_canonical_assignment(pfx1_adv, cgo.expr2py(make_shift_forward(Var(pfx1), 1)))
    303            test_expr = make_and(Var(pfx1_adv), test_expr)
    304         cgo.add_assignment(targetVar, cgo.expr2py(make_or(Var(targetVar), test_expr)))
    305         return
    306     #
    307     #
    308     for rg in cpset:
    309         (lo, hi) = rg
    310         lbyte = utf8_byte(lo, byte_no)
    311         hbyte = utf8_byte(hi, byte_no)
    312         if lbyte != hbyte:
    313             if not is_low_codepoint_after_byte(lo, byte_no):
    314                 lo1 = lo | ((1 << (6 * (u8len_lo - byte_no))) - 1)
    315                 #print "lo--lo1:  %x--%x (%i)" % (lo, lo1, byte_no)
    316                 utf8_sequence_generator([(lo, lo1)], byte_no, targetVar, cgo)
    317                 utf8_sequence_generator([(lo1+1, hi)], byte_no, targetVar, cgo)
    318             elif not is_high_codepoint_after_byte(hi, byte_no):
    319                 hi1 = hi &~ ((1 << (6 * (u8len_lo - byte_no))) - 1)
    320                 #print "lo--hi1-1:  %x--%x (%i)" % (lo, hi1-1, byte_no)
    321                 utf8_sequence_generator([(lo, hi1-1)], byte_no, targetVar, cgo)
    322                 utf8_sequence_generator([(hi1, hi)], byte_no, targetVar, cgo)
    323             else:
    324                 # we have a prefix group of type (a)
    325                 #print "lo--hi:  %x--%x (%i)" % (lo, hi, byte_no)
    326                 byteVar = "byte_range_%x_%x" % (lbyte, hbyte)
    327                 cgo.chardef_canonical(CanonicalCharSetDef(byteVar, [(lbyte, hbyte)]))
     358        if byte_no > 1:
     359            pfx1 = make_prefix(cgo, lo, byte_no, prefix)
     360            pfx1_adv = pfx1 + "_adv"
     361            cgo.add_canonical_assignment(pfx1_adv, cgo.expr2py(make_shift_forward(Var(pfx1), 1)))
     362            test_expr = make_and(Var(pfx1_adv), test_expr)
     363        cgo.add_assignment(target, cgo.expr2py(make_or(Var(target), test_expr)))
     364
     365    else:
     366
     367        for rg in cc:
     368            (lo, hi) = rg
     369            lbyte = utf8_byte(lo, byte_no)
     370            hbyte = utf8_byte(hi, byte_no)
     371
     372            if lbyte != hbyte:
     373                if not is_low_codepoint_after_byte(lo, byte_no):
     374                    mid = lo | ((1 << (6 * (u8len_min - byte_no))) - 1)
     375                    utf8_sequence_generator(cgo, byte_no, target, [(lo, mid)], prefix)
     376                    utf8_sequence_generator(cgo, byte_no, target, [(mid + 1, hi)], prefix)
     377                elif not is_high_codepoint_after_byte(hi, byte_no):
     378                    mid = hi & ~ ((1 << (6 * (u8len_min - byte_no))) - 1)
     379                    utf8_sequence_generator(cgo, byte_no, target, [(lo, mid - 1)], prefix)
     380                    utf8_sequence_generator(cgo, byte_no, target, [(mid, hi)], prefix)
     381                else:
     382                    # we have a prefix group of type (a)
     383                    var = "byte_range_%x_%x" % (lbyte, hbyte)
     384                    cgo.chardef_canonical(CanonicalCharSetDef(var, [(lbyte, hbyte)]))
     385                    if byte_no > 1:
     386                        last_prefix = prefix
     387                        assert(last_prefix == utf8_prefix_var(lo, byte_no - 1))
     388                        last_prefix_adv = last_prefix + "_adv"
     389                        this_prefix = last_prefix + "_" + var
     390                        cgo.add_canonical_assignment(last_prefix_adv, cgo.expr2py(make_shift_forward(Var(last_prefix), 1)))
     391                        cgo.add_canonical_assignment(this_prefix, cgo.expr2py(make_and(Var(last_prefix_adv), Var(var))))
     392                    else:
     393                        this_prefix = var
     394                    suffixVar = "byte_range_%x_%x" % (0x80, 0xBF)
     395                    cgo.chardef_canonical(CanonicalCharSetDef(suffixVar, [(0x80, 0xBF)]))
     396                    last_prefix = this_prefix
     397                    while byte_no < utf8_length(lo):
     398                        byte_no += 1
     399                        last_prefix_adv = last_prefix + "_adv"
     400                        cgo.add_canonical_assignment(last_prefix_adv, cgo.expr2py(make_shift_forward(Var(last_prefix), 1)))
     401                        this_prefix = utf8_prefix_var(lo, byte_no)
     402                        cgo.add_assignment(this_prefix, cgo.expr2py(make_and(Var(last_prefix_adv), Var(suffixVar))))
     403                        last_prefix = this_prefix
     404                    cgo.add_assignment(target, cgo.expr2py(make_or(Var(target), Var(last_prefix))))
     405
     406            else: # lobyte1 == hybyte1
     407                var = "byte_%x" % lbyte
     408                cgo.chardef_canonical(CanonicalCharSetDef(var, [(lbyte, lbyte)]))
    328409                if byte_no > 1:
    329                        last_prefix = utf8_prefix_var(lo, byte_no - 1)
    330                        last_prefix_adv = last_prefix + "_adv"
    331                        this_prefix = last_prefix + "_" + byteVar
    332                        cgo.add_canonical_assignment(last_prefix_adv, cgo.expr2py(make_shift_forward(Var(last_prefix), 1)))
    333                        cgo.add_canonical_assignment(this_prefix, cgo.expr2py(make_and(Var(last_prefix_adv), Var(byteVar))))
    334                 else: this_prefix = byteVar
    335                 suffixVar = "byte_range_%x_%x" % (0x80, 0xBF)
    336                 cgo.chardef_canonical(CanonicalCharSetDef(suffixVar, [(0x80, 0xBF)]))
    337                 last_prefix = this_prefix
    338                 while byte_no < utf8_length(lo):
    339                        byte_no += 1
    340                        last_prefix_adv = last_prefix + "_adv"
    341                        cgo.add_canonical_assignment(last_prefix_adv, cgo.expr2py(make_shift_forward(Var(last_prefix), 1)))
    342                        this_prefix = last_prefix + "_sfx"
    343                        cgo.add_assignment(this_prefix, cgo.expr2py(make_and(Var(last_prefix_adv), Var(suffixVar))))
    344                        last_prefix = this_prefix
    345                 cgo.add_assignment(targetVar, cgo.expr2py(make_or(Var(targetVar), Var(last_prefix))))
    346         else:
    347             # lobyte1 == hybyte1
    348             byteVar = "byte_%x" % lbyte
    349             cgo.chardef_canonical(CanonicalCharSetDef(byteVar, [(lbyte, lbyte)]))
    350             if byte_no > 1:
    351                 last_prefix = utf8_prefix_var(lo, byte_no - 1)
    352                 last_prefix_adv = last_prefix + "_adv"
    353                 cgo.add_canonical_assignment(last_prefix_adv, cgo.expr2py(make_shift_forward(Var(last_prefix), 1)))
    354                 this_prefix = utf8_prefix_var(lo, byte_no)
    355                 cgo.add_canonical_assignment(this_prefix, cgo.expr2py(make_and(Var(last_prefix_adv), Var(byteVar))))
    356             if byte_no < utf8_length(lo): utf8_sequence_generator([rg], byte_no+1, targetVar, cgo)
    357 
    358 
    359 
     410                    last_prefix = var if prefix == None else prefix
     411                    assert(last_prefix == utf8_prefix_var(lo, byte_no - 1))
     412                    last_prefix_adv = last_prefix + "_adv"
     413                    cgo.add_canonical_assignment(last_prefix_adv, cgo.expr2py(make_shift_forward(Var(last_prefix), 1)))
     414                    this_prefix = utf8_prefix_var(lo, byte_no)
     415                    cgo.add_canonical_assignment(this_prefix, cgo.expr2py(make_and(Var(last_prefix_adv), Var(var))))
     416                    var = this_prefix
     417                if byte_no < utf8_length(lo):
     418                    utf8_sequence_generator(cgo, byte_no + 1, target, [rg], var)
    360419
    361420
    362421def utf8_prefix_var(codepoint, prefix_bytes):
    363    if prefix_bytes == 0:
    364       raise Exception ("utf8_prefix_var(%x, %i)" % (codepoint, prefix_bytes))
    365    elif prefix_bytes == 1:
    366       return "byte_%x" % utf8_byte(codepoint, 1)
    367    else:
    368       return "_".join(["sequence"] + ["%x" % utf8_byte(codepoint, n+1) for n in range(prefix_bytes)])
    369 
     422    if prefix_bytes == 0:
     423        raise Exception("utf8_prefix_var(%x, %i)" % (codepoint, prefix_bytes))
     424    elif prefix_bytes == 1:
     425        return "byte_%x" % utf8_byte(codepoint, 1)
     426    else:
     427        return "_".join(["sequence"] + ["%x" % utf8_byte(codepoint, n + 1) for n in range(prefix_bytes)])
    370428
    371429def byte_definitions(range_list, n):
    372    #print ["%x--%x" % (p[0], p[1]) for p in range_list]
    373    result = [(utf8_byte(rg[0], n), utf8_byte(rg[1], n)) for rg in range_list]
    374    #print ["%x--%x" % (p[0], p[1]) for p in result]
    375    return result
    376 
    377 
    378 
    379 
    380 
     430    return [(utf8_byte(rg[0], n), utf8_byte(rg[1], n)) for rg in range_list]
Note: See TracChangeset for help on using the changeset viewer.