Changeset 4071 for proto


Ignore:
Timestamp:
Aug 21, 2014, 11:48:52 AM (5 years ago)
Author:
cameron
Message:

Restructuring; if-test simplification; odd/even ranges for Lu/Ll?

Location:
proto/charsetcompiler
Files:
3 added
2 edited
1 moved

Legend:

Unmodified
Added
Removed
  • proto/charsetcompiler/CC_compiler.py

    r3978 r4071  
    2020        predeclared = [self.bit_var(i) for i in range(0, self.mEncoding.bits)]
    2121        for sym in predeclared: self.common_expression_map[sym] = sym
    22 
    23     def add_symbol_to_map(self, sym):
    24         self.common_expression_map[sym] = sym
     22        self.canonical_sym_map = {}
    2523
    def add_common_expressions(self, enclosing_cgo):
        """Import the expression tables of an enclosing code generator.

        Every entry of enclosing_cgo.common_expression_map and
        enclosing_cgo.canonical_sym_map is copied into the corresponding
        map of this object; entries already present under the same key are
        overwritten.
        """
        self.common_expression_map.update(enclosing_cgo.common_expression_map)
        self.canonical_sym_map.update(enclosing_cgo.canonical_sym_map)
    2929
    3030    def bit_var(self, n):
     
    6565                        return self.mEncoding.basis_pattern[2] % (n - 24)
    6666
    67     def make_bitv(self, n):
    68                
     67    def make_bitv(self, n):             
    6968            if self.little_endian == True:
    7069                return Var(self.bit_var(n))
     
    105104              selected_bits &= ~test_bit
    106105              bit_no += 1
    107              
    108106            while len(bit_terms) > 1:
    109107                new_terms = []
     
    112110                if len(bit_terms) % 2 == 1:
    113111                    new_terms.append(bit_terms[-1])
    114                 bit_terms = new_terms
    115            
     112                bit_terms = new_terms     
    116113            return bit_terms[0]
    117114   
    118 
    def char_test_expr(self, chval):
        """Return the bit-pattern expression for chval under the encoding's full mask."""
        full_mask = self.mEncoding.mask
        return self.bit_pattern_expr(chval, full_mask)
     
    188184                if combine == True:
    189185                    #If charset items are all of the form X1 = X0 + 2.
    190                     for i in range(1 , len(chardef.items) - 1):
     186                    for i in range(len(chardef.items) - 1):
    191187                        curr_item = chardef.items[i]
    192188                        next_item = chardef.items[i+1]
     
    197193                    lo = chardef.items[0][0]
    198194                    hi = chardef.items[-1][0]
     195                    print "Combined odd/even range %x-%x" % (lo, hi)
    199196                    utf_temp = self.mEncoding.mask - 1
    200197                    lo &= utf_temp
    201198                    hi |= (self.mEncoding.mask ^ utf_temp)
    202                     return self.char_or_range_expr((lo, hi))
     199                    if lo & 1 == 1: return make_and(self.char_or_range_expr((lo, hi)), self.make_bitv(0))
     200                    else: return make_and(self.char_or_range_expr((lo, hi)), make_not(self.make_bitv(0)))
    203201            e1 = self.char_or_range_expr(chardef.items[0])
    204202            for i in range(1, len(chardef.items)):   
     
    208206
    209207    def add_assignment(self, varname, expr):
    210         self.common_expression_map[expr] = varname
    211         #self.generated_code.append('%s%s = %s;\n' % (self.typedecl, varname, expr))
    212         self.generated_code.append('\t%s%s = %s\n' % (self.typedecl, varname, expr))
     208        #if not self.sym_in_map(varname):
     209            self.common_expression_map[expr] = varname       
     210            self.generated_code.append('\t%s%s = %s\n' % (self.typedecl, varname, expr))
     211
     212    # An assignment to a variable name that uniquely specifies the expr
    def add_canonical_assignment(self, canonical_var, expr):
        """Emit `canonical_var = expr` once per canonical variable name.

        If canonical_var is already recorded in canonical_sym_map nothing
        happens.  Otherwise expr is registered as a common expression bound
        to canonical_var, the name is recorded as defined, and one
        assignment line is appended to generated_code.
        """
        if canonical_var in self.canonical_sym_map:
            return
        self.common_expression_map[expr] = canonical_var
        self.canonical_sym_map[canonical_var] = expr
        statement = '\t%s%s = %s\n' % (self.typedecl, canonical_var, expr)
        self.generated_code.append(statement)
    213218
    214219    def add_if_stmt(self, test_expr, generated_subcode):
     
    def chardef2py(self, chardef):
        """Compile chardef and emit an assignment of the result to chardef.name."""
        compiled = self.charset_expr(chardef)
        rendered = self.expr2py(compiled)
        self.add_assignment(chardef.name, rendered)
     286
    def chardef_canonical(self, chardef):
        """Compile chardef and emit it as a canonical assignment to chardef.name."""
        compiled = self.charset_expr(chardef)
        rendered = self.expr2py(compiled)
        self.add_canonical_assignment(chardef.name, rendered)
    281289   
    282290    def chardeflist2py(self, chardeflist):
  • proto/charsetcompiler/unicode_category_compiler.py

    r3987 r4071  
    1111from UTF_encoding import *
    1212from charset_def import *
     13from UCD.general_category import *
    1314import optparse, sys
    1415
     
    2526
    2627def utf8_byte(codepoint, n):
    27    len = utf8_length(codepoint)
     28   lgth = utf8_length(codepoint)
    2829   if n == 1:
    29      if len == 1: return codepoint
    30      elif len == 2: return 0xC0 + (codepoint >> 6)
    31      elif len == 3: return 0xE0 + (codepoint >> 12)
    32      elif len == 4: return 0xF0 + (codepoint >> 18)
     30     if lgth == 1: return codepoint
     31     elif lgth == 2: return 0xC0 | (codepoint >> 6)
     32     elif lgth == 3: return 0xE0 | (codepoint >> 12)
     33     elif lgth == 4: return 0xF0 | (codepoint >> 18)
    3334   else:
    34      bits = (codepoint >> (6 * (len - n))) & 0x3F
    35      return 0x80 + bits
     35     bits = (codepoint >> (6 * (lgth - n))) & 0x3F
     36     return 0x80 | bits
    3637
    3738def max_codepoint_of_length(n):
     
    4041   elif n == 3: return 0xFFFF
    4142   else: return 0x10FFFF
     43
     44def max_codepoint_with_initial_byte(byte):
     45   if byte <= 0x7F: return 0x7F
     46   elif byte <= 0xDF: return ((byte & 0x1F) <<6) | 0x3F
     47   elif byte == 0xED: return 0xD7FF
     48   elif byte <= 0xEF: return ((byte & 0x0F) <<12) | 0xFFF
     49   elif byte == 0xF4: return 0x10FFFF
     50   else: return ((byte & 0x07) <<18) | 0x3FFFF
     51
     52def min_codepoint_with_initial_byte(byte):
     53   if byte <= 0x7F: return 0
     54   elif byte <= 0xDF: return ((byte & 0x1F) <<6)
     55   elif byte == 0xE0: return 0x1000
     56   elif byte <= 0xEF: return ((byte & 0x0F) <<12)
     57   elif byte == 0xF0: return 0x10000
     58   else: return ((byte & 0x07) <<18)
    4259
    4360#
     
    5673     remaining -= 1
    5774   return 0
    58 
    59 def utf8_range_compiler(cgo, lo, hi, targetVar):
    60    lo_len = utf8_length(lo)
    61    hi_len = utf8_length(hi)
    62    # If different length code unit sequences are involved, make
    63    # a union of equilength subranges.
    64    if hi_len > lo_len:
    65      m = max_codepoint_of_length(hi_len - 1)
    66      targetV_lo = "%s_%i" % (targetVar, lo_len)   
    67      targetV_hi = "%s_%i" % (targetVar, hi_len)
    68      utf8_range_compiler(cgo, lo, m, targetV_lo)
    69      utf8_range_compiler(cgo, m+1, hi, targetV_hi)
    70      cgo.add_assignment(targetVar, cgo.expr2py(make_or(Var(targetV_lo), Var(targetV_hi))))
    71    #
    72    else:
    73      matched_sequence_compiler(cgo, lo, hi, 1, hi_len, targetVar)
    74 
    75 
    76 def matched_sequence_compiler(cgo, lo, hi, hlen):
    77    return matched_sequence_helper(cgo, lo, hi, TrueLiteral(), 1, hlen)
    78 
    79 def matched_sequence_helper(cgo, lo, hi, prefix, n, hlen):
    80    """ Helper function to generate the code necessary to match bytes
    81        n through hlen (1-based indexing) of the range of utf-8 sequences
    82        for codepoints lo through hi. """
    83    hbyte = utf8_byte(hi, n)
    84    lbyte = utf8_byte(lo, n)
    85    if n == hlen:
    86      targetVar = "bytetest_%x_%x" % (lbyte, hbyte)
    87      cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
    88      if n == 1: return targetVar
    89      return cgo.expr_string_to_variable(cgo.expr2py(make_and(make_shift_forward(prefix, 1), Var(targetVar))))
    90    #
    91    # One or more bytes of the lower and upper bound may be the same.
    92    # Build a sequence of byte tests.
    93    if hbyte == lbyte:
    94      targetVar = "bytetest_%x" % (lbyte)
    95      cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
    96      return matched_sequence_helper(cgo, lo, hi, make_and(make_shift_forward(prefix, 1), Var(targetVar)), n+1, hlen)
    97    # We now have a range involving different bytes at position n.
    98    following_suffix_mask = (1 << ((hlen - n) * 6)) - 1
    99    # A separate test may be needed for the high byte sequence if
    100    # there are constraints on following suffix bytes.
    101    if hi & following_suffix_mask != following_suffix_mask:
    102      hi_floor = hi &~following_suffix_mask     
    103      hiVar = matched_sequence_helper(cgo, hi_floor, hi, prefix, n, hlen)
    104      loVar = matched_sequence_helper(cgo, lo, hi_floor - 1, prefix, n, hlen)
    105      return cgo.expr_string_to_variable(cgo.expr2py(make_or(Var(loVar), Var(hiVar))))
    106    # A separate test may be needed for the low byte sequence if
    107    # there are constraints on following suffix bytes.
    108    if lo & following_suffix_mask != 0:
    109      low_ceil = lo | following_suffix_mask
    110      hiVar = matched_sequence_helper(cgo, low_ceil + 1, hi, prefix, n, hlen)
    111      loVar = matched_sequence_helper(cgo, lo, low_ceil, prefix, n, hlen)
    112      return cgo.expr_string_to_variable(cgo.expr2py(make_or(Var(loVar), Var(hiVar))))
    113    #
    114    # Now we have a range that permits all suffix combinations.
    115    # We don't have to test suffixes UNDER THE ASSUMPTION that the utf-8
    116    # has been validated.
    117    targetVar = "bytetest_%x_%x" % (lbyte, hbyte)
    118    cgo.chardef2py(CanonicalCharSetDef(targetVar, [(lbyte, hbyte)]))
    119    if n == 1: return targetVar
    120    return matched_sequence_helper(cgo, lo, hi, make_and(make_shift_forward(prefix, 1), Var(targetVar)), n+1, hlen)
    12175
    12276
     
    186140     return matched_ifsequence_compiler(cgo, lo, hi, hi_len)
    187141
    188 def generate_utf8_leading_bytes_test(codepoint, bytecount, targetVar):
    189   if bytecount == 0: return [make_assign(targetVar, "1")]
    190   byte1 = utf8_byte(codepoint, 1)
    191   stmts = [make_assign(targetVar, ByteClassCompiler(byte1))]
    192   byteno = 1
    193   while byteno < bytecount:
    194     byteno += 1
    195     sfx_byte = utf8_byte(codepoint, byteno)
    196     stmts.append(make_assign(targetVar, make_and(make_shift_forward(targetVar, 1), ByteClassCompiler(sfx_byte))))
    197   return stmts
    198 
    199 def generate_utf8_intermediate_bytes_test(codepoint, startbyte, endbyte, targetVar):
    200   if startbyte == 1: return generate_utf8_leading_bytes_test(codepoint, endbyte, targetVar)
    201   byteno = startbyte
    202   while byteno < endbyte:
    203     byteno += 1
    204     sfx_byte = utf8_byte(codepoint, byteno)
    205     stmts.append(make_assign(targetVar, make_and(make_shift_forward(targetVar, 1), ByteClassCompiler(sfx_byte))))
    206   return stmts
    207 
    208 import re
    209 
    210 Unicode_point_regexp = re.compile("^([0-9A-F]{4,6})\s+;\s+([A-Z][a-z0-9])\s+#")
    211 Unicode_range_regexp = re.compile("^([0-9A-F]{4,6})[.][.]([0-9A-F]{4,6})\s+;\s+([A-Z][a-z0-9])\s+#")
    212 
    213 
    214 def parse_general():
    215   category_size = {}
    216   category_def = {}
    217   f = open("DerivedGeneralCategory.txt")
    218   lines = f.readlines()
    219   for t in lines:
    220     m = Unicode_point_regexp.match(t)
    221     if m:
    222       point = m.group(1)
    223       category = m.group(2)
    224       if not category_size.has_key(category):
    225         category_size[category] = 0
    226         category_def[category] = []
    227       pval = int(point, 16)
    228       category_def[category].append((pval, pval))
    229       category_size[category] += 1     
    230     m = Unicode_range_regexp.match(t)
    231     if m:
    232       point1 = m.group(1)
    233       point2 = m.group(2)
    234       category = m.group(3)
    235       if not category_size.has_key(category):
    236         category_size[category] = 0
    237         category_def[category] = []
    238       pval1 = int(point1, 16)
    239       pval2 = int(point2, 16)
    240       category_def[category].append((pval1, pval2))
    241       category_size[category] += 1
    242   return (category_size, category_def)
    243   f.close()
    244 
     142#
     143# The test may be made up of up to three parts:
     144# (a) a multibyte low-boundary test,
     145# (b) a multibyte high-boundary test, and
     146# (c) a range test.
     147# It is possible that the low- and high- boundary tests have
     148# a common multibyte prefix.
     149def utf8_iftest_compiler(cgo, lo, hi):
     150   lo_byte = utf8_byte(lo, 1)
     151   hi_byte = utf8_byte(hi, 1)
     152   if lo_byte == hi_byte:
     153      targetVar = "cp_range_%x_%x" % (lo, hi)
     154      utf8_sequence_generator([(lo, hi)], 1, targetVar, cgo)
     155      return targetVar
     156   if lo > 0 and utf8_byte(lo - 1, 1) == lo_byte:
     157      lo1 = max_codepoint_with_initial_byte(lo_byte)
     158      targetVar = "cp_range_%x_%x" % (lo, lo1)
     159      utf8_sequence_generator([(lo, lo1)], 1, targetVar, cgo)
     160      test_expr1 = Var(targetVar)
     161      lo_byte = utf8_byte(lo1 + 1, 1)
     162   else:
     163      test_expr1 = FalseLiteral()
     164      if lo == 0x80: lo_byte = 0xC0
     165   if hi < 0x10FFFF and utf8_byte(hi + 1, 1) == hi_byte:
     166      hi1 = min_codepoint_with_initial_byte(hi_byte)
     167      targetVar = "cp_range_%x_%x" % (hi1, hi)
     168      utf8_sequence_generator([(hi1, hi)], 1, targetVar, cgo)
     169      test_expr2 = Var(targetVar)
     170      hi_byte = utf8_byte(hi1 - 1, 1)
     171   else:
     172      test_expr2 = FalseLiteral()
     173      if hi == 0x10FFFF: hi_byte = 0xFF
     174   if lo_byte > hi_byte: return cgo.expr_string_to_variable(cgo.expr2py(make_or(test_expr1, test_expr2)))
     175   if lo_byte == hi_byte:
     176      byteVar = "byte_%x" % lo_byte
     177   else:
     178      byteVar = "byte_range_%x_%x" % (lo_byte, hi_byte)
     179   cgo.chardef_canonical(CanonicalCharSetDef(byteVar, [(lo_byte, hi_byte)]))
     180   return cgo.expr_string_to_variable(cgo.expr2py(make_or(Var(byteVar), make_or(test_expr1, test_expr2))))
    245181
    246182
     
    257193   for rg in topRanges:
    258194     (rglo, rghi) = rg
    259      inner_cgo = CC_compiler(UTF8(), "r%x_%x" % (rglo, rghi) + '_tmp%i', False, '')
    260      inner_cgo.add_common_expressions(cgo)
    261      generateCharClassDefsInIfHierarchy(inner_cgo, rg, inner, charClassMap)
    262      if inner_cgo.generated_code != []:
    263         range_var = utf8_ifrange_compiler(cgo, rglo, rghi)
    264         cgo.add_if_stmt(Var(range_var), inner_cgo.generated_code)
     195     empty_range = True
     196     for k in charClassMap.keys():
     197        if rangeIntersect(charClassMap[k], rglo, rghi) != []:
     198           empty_range = False
     199           break
     200     if not empty_range:
     201       range_var = utf8_iftest_compiler(cgo, rglo, rghi)
     202       inner_cgo = CC_compiler(UTF8(), "r%x_%x" % (rglo, rghi) + '_tmp%i', False, '')
     203       inner_cgo.add_common_expressions(cgo)
     204       generateCharClassDefsInIfHierarchy(inner_cgo, rg, inner, charClassMap)
     205       if inner_cgo.generated_code != []:
     206         cgo.add_if_stmt(Var(range_var), inner_cgo.generated_code)
    265207   return cgo.showcode()
    266208
    267209def generateCharClassSubDefs(cgo, lo, hi, charClassMap):
    268210   for k in charClassMap.keys():
     211     if options.grep:
     212        targetVar = "all_chars"
     213     else:
     214        targetVar = "struct_%s.cc" % k
    269215     subcc1 = rangeIntersect(charClassMap[k], lo, hi)
    270      # Divide by UTF-8 length
    271      for byte_range in [(0, 0x7F), (0x80, 0x7FF), (0x800, 0xFFFF), (0x10000, 0x10FFFF)]:
     216     # Divide by UTF-8 length, separating out E0, ED, F0 and F4 ranges
     217     for byte_range in [(0, 0x7F), (0x80, 0x7FF), (0x800, 0xFFF), (0x1000, 0xD7FF), (0xE000, 0xFFFF), (0x10000, 0x3FFFF), (0x40000, 0xFFFFF), (0x100000, 0x10FFFF)]:
    272218        (lo1, hi1) = byte_range
    273219        subcc2 = rangeIntersect(subcc1, lo1, hi1)
    274         ulen = utf8_length(lo1)
    275         for subrange in subcc2:
    276            (lo2, hi2) = subrange
    277            subrangeE = matched_sequence_compiler(cgo, lo2, hi2, ulen)
    278            if options.grep:
    279               target = "all_chars"
    280            else:
    281               target = "struct_%s.cc" % k
    282            cgo.add_assignment(target, cgo.expr2py(make_or(Var(subrangeE), Var(target))))
     220        utf8_sequence_generator(subcc2, 1, targetVar, cgo)
    283221
    284222def rangeIntersect(ccList, lo, hi):
     
    325263#defaultIfRangeList = [(0x80,0x10FFFF), (0x80,0x7FF), (0x800,0xFFFF), (0x10000, 0x10FFFF)]
    326264
    327 defaultIfRangeList = [(0x80,0x10FFFF), (0x80,0x7FF), (0x100,0x2FF),
    328 (0x300,0x36F), (0x370,0x3FF), (0x400,0x7FF), (0x400,0x4FF),  (0x500, 0x52F), (0x530, 0x58F), (0x590,0x5FF), (0x600,0x6FF), 
    329 (0x700,0x7FF), (0x700,0x74F), (0x750,0x77F), (0x750,0x77F), (0x780,0x7BF), (0x7C0,0x7FF),
    330 (0x800,0xFFFF),
     265
     266defaultIfRangeList = [
     267#Non-ASCII
     268(0x80,0x10FFFF),
     269#Two-byte sequences
     270(0x80,0x7FF),
     271(0x100, 0x3FF),
     272#0100..017F; Latin Extended-A
     273#0180..024F; Latin Extended-B
     274#0250..02AF; IPA Extensions
     275#02B0..02FF; Spacing Modifier Letters
     276(0x100, 0x2FF), (0x100, 0x24F), (0x100, 0x17F), (0x180, 0x24F), (0x250, 0x2AF), (0x2B0, 0x2FF),
     277#0300..036F; Combining Diacritical Marks
     278#0370..03FF; Greek and Coptic
     279(0x300, 0x36F), (0x370, 0x3FF),
     280#0400..04FF; Cyrillic
     281#0500..052F; Cyrillic Supplement
     282#0530..058F; Armenian
     283#0590..05FF; Hebrew
     284#0600..06FF; Arabic
     285(0x400, 0x5FF), (0x400, 0x4FF), (0x500, 0x058F), (0x500, 0x52F), (0x530, 0x58F), (0x590, 0x5FF), (0x600, 0x6FF),
     286#0700..074F; Syriac
     287#0750..077F; Arabic Supplement
     288#0780..07BF; Thaana
     289#07C0..07FF; NKo
     290(0x700, 0x77F), (0x700, 0x74F), (0x750, 0x77F), (0x780, 0x7FF), (0x780, 0x7BF), (0x7C0, 0x7FF),
     291#Three-byte sequences
     292(0x800, 0xFFFF),
     293(0x800, 0x4DFF),
     294(0x800, 0x1FFF),
     295(0x800, 0x0FFF),
     296(0x1000, 0x1FFF),
     297#0800..083F; Samaritan
     298#0840..085F; Mandaic
     299#08A0..08FF; Arabic Extended-A
     300#0900..097F; Devanagari
     301#0980..09FF; Bengali
     302#0A00..0A7F; Gurmukhi
     303#0A80..0AFF; Gujarati
     304#0B00..0B7F; Oriya
     305#0B80..0BFF; Tamil
     306#0C00..0C7F; Telugu
     307#0C80..0CFF; Kannada
     308#0D00..0D7F; Malayalam
     309#0D80..0DFF; Sinhala
     310#0E00..0E7F; Thai
     311#0E80..0EFF; Lao
     312#0F00..0FFF; Tibetan
     313(0x1000, 0x1FFF),
     314#1000..109F; Myanmar
     315#10A0..10FF; Georgian
     316#1100..11FF; Hangul Jamo
     317#1200..137F; Ethiopic
     318#1380..139F; Ethiopic Supplement
     319#13A0..13FF; Cherokee
     320#1400..167F; Unified Canadian Aboriginal Syllabics
     321#1680..169F; Ogham
     322#16A0..16FF; Runic
     323#1700..171F; Tagalog
     324#1720..173F; Hanunoo
     325#1740..175F; Buhid
     326#1760..177F; Tagbanwa
     327#1780..17FF; Khmer
     328#1800..18AF; Mongolian
     329#18B0..18FF; Unified Canadian Aboriginal Syllabics Extended
     330#1900..194F; Limbu
     331#1950..197F; Tai Le
     332#1980..19DF; New Tai Lue
     333#19E0..19FF; Khmer Symbols
     334#1A00..1A1F; Buginese
     335#1A20..1AAF; Tai Tham
     336#1AB0..1AFF; Combining Diacritical Marks Extended
     337#1B00..1B7F; Balinese
     338#1B80..1BBF; Sundanese
     339#1BC0..1BFF; Batak
     340#1C00..1C4F; Lepcha
     341#1C50..1C7F; Ol Chiki
     342#1CC0..1CCF; Sundanese Supplement
     343#1CD0..1CFF; Vedic Extensions
     344#1D00..1D7F; Phonetic Extensions
     345#1D80..1DBF; Phonetic Extensions Supplement
     346#1DC0..1DFF; Combining Diacritical Marks Supplement
     347#1E00..1EFF; Latin Extended Additional
     348#1F00..1FFF; Greek Extended
     349(0x2000, 0x4DFF),(0x2000, 0x2FFF),
     350(0x3000, 0x4DFF),
     351(0x4E00,0x9FFF),
     352#4E00..9FFF; CJK Unified Ideographs
     353(0xA000,0xFFFF),
     354
    331355(0x10000, 0x10FFFF)]
    332 
    333356
    334357
     
    371394"""
    372395        header = "def Demo(basis_bits, lex, output):\n"
    373         main = "\n\ndef Main(basis_bits, lex, output):\n    ParseLines(basis_bits, lex)\n    Demo(basis_bits, lex, output)\n"
    374396  else:
    375397        struct = Unicode_CC_struct % (general_category)
    376398        header = "def %s(basis_bits, struct_%s):\n" % (general_category, general_category)
    377         main = Unicode_dummy_main
    378399  if options.flat:
    379400      code = generateCharClassDefs([], catmap)
     
    390411                output.matches = pablo.MatchStar(all_matches, ~lex.LF) & lex.LF
    391412"""
    392   return struct + header + "".join(code) + main
    393 
     413  return struct + header + "".join(code)
     414
     415def generate_main():
     416  if options.grep:
     417        main = "\n\ndef Main(basis_bits, lex, output):\n    ParseLines(basis_bits, lex)\n    Demo(basis_bits, lex, output)\n"
     418  else:
     419        main = Unicode_dummy_main
     420  return main
     421
     422#
     423# Partition a list of ranges into a minimum set of
     424# UTF-8 prefix groups, where a group is
     425# (a) a range of codepoints with UTF-8 prefixes of the same length
     426#     such that every codepoint in the range is within the group, or
     427# (b) a sublist all having the same UTF-8 initial
     428#     byte
     429def partition_by_UTF8_group(range_list, byte_no):
     430    if range_list == []: return []
     431    (lo, hi) = range_list[0]
     432    u8len_lo = utf8_length(lo)
     433    u8len_hi = utf8_length(hi)
     434    if u8len_lo != u8len_hi:
     435        mid = max_codepoint_of_length(u8len_lo)
     436        return partition_by_UTF8_group([(lo, mid), (mid+1, hi)] + range_list[1:], byte_no)
     437    lobyte1 = utf8_byte(lo, byte_no)
     438    hibyte1 = utf8_byte(hi, byte_no)
     439    if lobyte1 != hibyte1:
     440        if not is_low_codepoint_after_byte(lo, byte_no):
     441            lo1 = lo | ((1 << (6 * (u8len_lo - byte_no))) - 1)
     442            #print "lo--lo1:  %x--%x" % (lo, lo1)
     443            return [[(lo, lo1)]] + partition_by_UTF8_group([(lo1+1, hi)] + range_list[1:], byte_no)
     444        elif not is_high_codepoint_after_byte(hi, byte_no):
     445            hi1 = hi &~ ((1 << (6 * (u8len_lo - byte_no))) - 1)
     446            #print "lo--hi-1:  %x--%x" % (lo, hi1-1)
     447            return [[(lo, hi1-1)]] + partition_by_UTF8_group([(hi1, hi)] + range_list[1:], byte_no)
     448        else:
     449            # we have a prefix group of type (a)
     450            return [[(lo, hi)]] + partition_by_UTF8_group(range_list[1:], byte_no)
     451    group1 = [(lo, hi)]
     452    subpartitions = partition_by_UTF8_group(range_list[1:], byte_no)
     453    if subpartitions == []: return [group1]
     454    elif utf8_byte(subpartitions[0][0][0], byte_no) == lobyte1:
     455        return [group1 + subpartitions[0]] + subpartitions[1:]
     456    else:
     457        return [group1] + subpartitions
     458
     459#
     460def is_low_codepoint_after_byte(codepoint, byte):
     461    for i in range(byte, utf8_length(codepoint)):
     462        if utf8_byte(codepoint, i+1) != 0x80: return False
     463    return True
     464
     465def is_high_codepoint_after_byte(codepoint, byte):
     466    for i in range(byte, utf8_length(codepoint)):
     467        if utf8_byte(codepoint, i+1) != 0xBF: return False
     468    return True
     469
     470# Ensure the sequence of preceding bytes is defined, up to, but
     471# not including the given byte_no
     472def ensure_preceding_prefix_defined(codepoint, byte_no, cgo):
     473   for i in range(1, byte_no):
     474      byte_i = utf8_byte(codepoint, i)
     475      byteVar = "byte_%x" % byte_i
     476      cgo.chardef_canonical(CanonicalCharSetDef(byteVar, [(byte_i, byte_i)]))
     477      if i > 1:
     478         pfx1 = utf8_prefix_var(codepoint, i-1)
     479         pfx1_adv = pfx1 + "_adv"
     480         cgo.add_canonical_assignment(pfx1_adv, cgo.expr2py(make_shift_forward(Var(pfx1), 1)))
     481         pfx2 = utf8_prefix_var(codepoint, i)
     482         cgo.add_canonical_assignment(pfx2, cgo.expr2py(make_and(Var(pfx1_adv), Var(byteVar))))
     483
     484
     485#
     486# Generate remaining code to match UTF-8 code sequences within
     487# the codepoint set u8_partition, assuming that the code matching the
     488# sequences up to byte number byte_no have been generated.
     489#
     490def utf8_sequence_generator(u8_partition, byte_no, targetVar, cgo):
     491   if len(u8_partition) == 0: return
     492   (lo, hi) = u8_partition[0]
     493   if utf8_length(lo) == byte_no:
     494      # We have a single byte remaining to match for all codepoints
     495      # in this partition.  Use the byte class compiler to generate
     496      # matches for these codepoints.
     497      ensure_preceding_prefix_defined(lo, byte_no, cgo)
     498      byte_pair_list = byte_definitions(u8_partition, byte_no)
     499      #print byte_pair_list
     500      if len(byte_pair_list) == 1:
     501          (lobyte, hibyte) = byte_pair_list[0]
     502          if lo == hi:
     503              final_byte_var = "byte_%x" % lobyte
     504          else:
     505              final_byte_var = "byte_range_%x_%x" % (lobyte, hibyte)
     506          cgo.chardef_canonical(CanonicalCharSetDef(final_byte_var, byte_pair_list))
     507      else:
     508          hi = u8_partition[-1][0]
     509          final_byte_var = "%s_range_%x_%x_%i" % (targetVar[-2:], lo, hi, byte_no)
     510          cgo.chardef2py(CanonicalCharSetDef(final_byte_var, byte_pair_list))
     511      test_expr = Var(final_byte_var)
     512      if byte_no > 1: 
     513         pfx1 = utf8_prefix_var(lo, byte_no-1)
     514         pfx1_adv = pfx1 + "_adv"
     515         cgo.add_canonical_assignment(pfx1_adv, cgo.expr2py(make_shift_forward(Var(pfx1), 1)))
     516         test_expr = make_and(Var(pfx1_adv), test_expr)
     517      cgo.add_assignment(targetVar, cgo.expr2py(make_or(Var(targetVar), test_expr)))
     518   else:
     519     partitions = partition_by_UTF8_group(u8_partition, byte_no)
     520     for p in partitions:
     521       (lo, hi) = p[0]
     522       lbyte = utf8_byte(lo, byte_no)
     523       hbyte = utf8_byte(hi, byte_no)
     524       ensure_preceding_prefix_defined(lo, byte_no, cgo)
     525       if lbyte == hbyte:
     526         byteVar = "byte_%x" % lbyte
     527         cgo.chardef_canonical(CanonicalCharSetDef(byteVar, [(lbyte, lbyte)]))
     528         if byte_no > 1:
     529           last_prefix = utf8_prefix_var(lo, byte_no - 1)
     530           this_prefix = utf8_prefix_var(lo, byte_no)
     531           cgo.add_assignment(this_prefix, cgo.expr2py(make_and(make_shift_forward(Var(last_prefix), 1), Var(byteVar))))
     532         if byte_no < utf8_length(lo): utf8_sequence_generator(p, byte_no+1, targetVar, cgo)
     533       else:
     534         byteVar = "byte_range_%x_%x" % (lbyte, hbyte)
     535         cgo.chardef_canonical(CanonicalCharSetDef(byteVar, [(lbyte, hbyte)]))
     536         if byte_no > 1:
     537           last_prefix = utf8_prefix_var(lo, byte_no - 1)
     538           this_prefix = last_prefix + "_" + byteVar
     539           cgo.add_canonical_assignment(this_prefix, cgo.expr2py(make_and(make_shift_forward(Var(last_prefix), 1), Var(byteVar))))
     540         else: this_prefix = byteVar
     541         suffixVar = "byte_range_%x_%x" % (0x80, 0xBF)
     542         cgo.chardef_canonical(CanonicalCharSetDef(suffixVar, [(0x80, 0xBF)]))
     543         last_prefix = this_prefix
     544         while byte_no < utf8_length(lo):
     545           byte_no += 1
     546           this_prefix = last_prefix + "_sfx"
     547           cgo.add_assignment(this_prefix, cgo.expr2py(make_and(make_shift_forward(Var(last_prefix), 1), Var(suffixVar))))
     548           last_prefix = this_prefix
     549         cgo.add_assignment(targetVar, cgo.expr2py(make_or(Var(targetVar), Var(last_prefix))))
     550
     551
     552
     553def utf8_prefix_var(codepoint, prefix_bytes):
     554   if prefix_bytes == 0:
     555      raise Exception ("utf8_prefix_var(%x, %i)" % (codepoint, prefix_bytes))
     556   elif prefix_bytes == 1:
     557      return "byte_%x" % utf8_byte(codepoint, 1)
     558   else:
     559      return "_".join(["sequence"] + ["%x" % utf8_byte(codepoint, n+1) for n in range(prefix_bytes)])
     560
     561
     562def byte_definitions(range_list, n):
     563   #print ["%x--%x" % (p[0], p[1]) for p in range_list]
     564   result = [(utf8_byte(rg[0], n), utf8_byte(rg[1], n)) for rg in range_list]
     565   #print ["%x--%x" % (p[0], p[1]) for p in result]
     566   return result
    394567
    395568def main():   
     
    439612        code = generateDefs1(options.category)
    440613
     614
     615    code += generate_main()
     616
    441617    if (len(args) == 1):
    442618        fh = open(args[0], "w")
     
    449625       
    450626
     627
     628
     629
    451630if __name__ == "__main__": main()
    452631
     
    457636
    458637
    459 
    460 
    461 
    462 
    463 
    464 
    465 
    466 
    467 
    468 
    469 
    470 
    471 
    472 
    473 
    474 
    475 
    476 
Note: See TracChangeset for help on using the changeset viewer.