Changeset 3413 for proto/charsetcompiler


Ignore:
Timestamp:
Jul 29, 2013, 11:26:03 AM (6 years ago)
Author:
cameron
Message:

Autodetect encoding type

Location:
proto/charsetcompiler
Files:
1 added
2 edited

Legend:

Unmodified
Added
Removed
  • proto/charsetcompiler/charset_compiler.py

    r3349 r3413  
    3333import charset_input_parser
    3434
    35 class UTF_Encoding_Type:
    36     def __init__(self, name, bits, mask):
    37         self.name = name
    38         self.bits = bits
    39         self.mask = mask
    40         self.basis_pattern = []
    41     def __str__(self): return self.name
    42 
    43 
    44 Encoding = UTF_Encoding_Type
    45 
     35import UTF_encoding
     36Encoding_Type = UTF_encoding.UTF_Encoding_Type
    4637
    4738#
     
    255246    global Encoding
    256247
    257     if len(Encoding.basis_pattern) == 1:
    258         return Encoding.basis_pattern[0] % n
     248    if len(UTF_encoding.Encoding.basis_pattern) == 1:
     249        return UTF_encoding.Encoding.basis_pattern[0] % n
    259250   
    260     if Encoding.name == "UTF-16":
     251    if UTF_encoding.Encoding.name == UTF_encoding.UTF16.name:
    261252        if options.little_endian == True:
    262253            if n >= 8:
    263                 return Encoding.basis_pattern[0] % (n - 8)
     254                return UTF_encoding.Encoding.basis_pattern[0] % (n - 8)
    264255            else:
    265                 return Encoding.basis_pattern[1] % n
     256                return UTF_encoding.Encoding.basis_pattern[1] % n
    266257        else:
    267258            if n <= 7:
    268                 return Encoding.basis_pattern[0] % n
     259                return UTF_encoding.Encoding.basis_pattern[0] % n
    269260            else:
    270                 return Encoding.basis_pattern[1] % (n - 8)
    271 
    272     if Encoding.name == "UTF-32":
     261                return UTF_encoding.Encoding.basis_pattern[1] % (n - 8)
     262
     263    if UTF_encoding.Encoding.name == UTF_encoding.UTF32.name:
    273264        if options.little_endian == True:
    274265            if n >= 21:
    275266                return "unused_bit%i" % (n - 21)
    276267            elif n < 21 and n >= 16:
    277                 return Encoding.basis_pattern[0] % (n - 16)
     268                return UTF_encoding.Encoding.basis_pattern[0] % (n - 16)
    278269            elif n < 16 and n >= 8:
    279                 return Encoding.basis_pattern[1] % (n - 8)
     270                return UTF_encoding.Encoding.basis_pattern[1] % (n - 8)
    280271            elif n < 8:
    281                 return Encoding.basis_pattern[2] % n
     272                return UTF_encoding.Encoding.basis_pattern[2] % n
    282273        else:
    283274            if n <= 10:
    284275                return "unused_bit%i" % n
    285276            elif n > 10 and n <= 15:
    286                 return Encoding.basis_pattern[0] % (n - 8)
     277                return UTF_encoding.Encoding.basis_pattern[0] % (n - 8)
    287278            elif n > 15 and n <= 23:
    288                 return Encoding.basis_pattern[1] % (n - 16)
     279                return UTF_encoding.Encoding.basis_pattern[1] % (n - 16)
    289280            elif n > 23:
    290                 return Encoding.basis_pattern[2] % (n - 24)
     281                return UTF_encoding.Encoding.basis_pattern[2] % (n - 24)
    291282
    292283
     
    298289        return Var(bit_var(n))
    299290    else:
    300         return Var(bit_var((Encoding.bits - 1) -n))
     291        return Var(bit_var((UTF_encoding.Encoding.bits - 1) -n))
    301292       
    302293def make_bit_test(pattern, bit_count):
     
    306297    for i in range(0, bit_count):
    307298        if (pattern & test_bit) == 0:
    308             bit_terms.append(make_not(make_bitv((Encoding.bits - 1)-i)))   
    309         else: bit_terms.append(make_bitv((Encoding.bits - 1)-i))           
     299            bit_terms.append(make_not(make_bitv((UTF_encoding.Encoding.bits - 1)-i)))   
     300        else: bit_terms.append(make_bitv((UTF_encoding.Encoding.bits - 1)-i))           
    310301        test_bit >>= 1
    311302    while len(bit_terms) > 1:
     
    347338def char_test_expr(ch):
    348339    #return make_bit_test(ord(ch), 8)
    349     return bit_pattern_expr(ord(ch), Encoding.mask) 
     340    return bit_pattern_expr(ord(ch), UTF_encoding.Encoding.mask) 
    350341
    351342def GE_Range(N, n):
     
    390381        diff_count += 1
    391382        diff_bits >>= 1
    392     if n2 < n1 or diff_count > Encoding.bits: raise BadRange() 
     383    if n2 < n1 or diff_count > UTF_encoding.Encoding.bits: raise BadRange() 
    393384    mask = 2**(diff_count) - 1
    394385    #common = make_bit_test(n1 >> diff_count, 8 - diff_count)
    395     common = bit_pattern_expr(n1 & ~mask, Encoding.mask^mask)   
     386    common = bit_pattern_expr(n1 & ~mask, UTF_encoding.Encoding.mask^mask)   
    396387    if diff_count == 0: return common
    397388    mask = 2**(diff_count-1) - 1
     
    490481
    491482def chardeflist2simd(chardeflist):
    492     cgo = CodeGenObject([bit_var(i) for i in range(0,Encoding.bits)])
     483    cgo = CodeGenObject([bit_var(i) for i in range(0, UTF_encoding.Encoding.bits)])
    493484    for d in chardeflist:
    494485        chardef2simd(cgo, d)
     
    540531
    541532def chardeflist2py(chardeflist):
    542     cgo = CodeGenObject([bit_var(i) for i in range(0, Encoding.bits)],'')
     533    cgo = CodeGenObject([bit_var(i) for i in range(0, UTF_encoding.Encoding.bits)],'')
    543534    for d in chardeflist:
    544535        chardef2py(cgo, d)
    545536    return cgo.showcode()# + "  return "+ py_chardefmap(chardeflist) + "\n"
    546537
    547 def main():
    548 
    549     global Encoding   
     538def main():   
    550539
    551540    global options
     
    556545                             dest='character_encoding',
    557546                             type='string',
    558                              default='UTF-8',
     547                             default='Default',
    559548                             help='character encoding; default: UTF-8',
    560549                             ) 
     
    597586    options, args = option_parser.parse_args(sys.argv[1:])
    598587
    599     error = False
    600 
    601588    # Set the encoding.
    602     if options.character_encoding == "UTF-32":
    603         Encoding = UTF_Encoding_Type(options.character_encoding, 32, 0xFFFFFFFF)
    604     elif options.character_encoding == "UTF-16":
    605         Encoding = UTF_Encoding_Type(options.character_encoding, 16, 0xFFFF)
    606     elif options.character_encoding == "UTF-8":
    607         Encoding = UTF_Encoding_Type(options.character_encoding, 8, 0xFF)
     589       
     590    #If the user has entered the encoding type as a command-line argument
     591    #then the encoding type that is to be used is locked.
     592    if options.character_encoding == UTF_encoding.UTF32.name:
     593        UTF_encoding.Encoding = Encoding_Type(options.character_encoding,
     594        UTF_encoding.UTF32.bits, UTF_encoding.UTF32.mask, False, True)
     595    elif options.character_encoding == UTF_encoding.UTF16.name:
     596        UTF_encoding.Encoding = Encoding_Type(options.character_encoding,
     597        UTF_encoding.UTF16.bits, UTF_encoding.UTF16.mask, False, True)
     598    elif options.character_encoding == UTF_encoding.UTF8.name:
     599        UTF_encoding.Encoding = Encoding_Type(options.character_encoding,
     600        UTF_encoding.UTF8.bits, UTF_encoding.UTF8.mask, False, True)
     601    elif options.character_encoding == 'Default':
     602        UTF_encoding.Encoding = Encoding_Type(UTF_encoding.UTF8.name,
     603        UTF_encoding.UTF8.bits, UTF_encoding.UTF8.mask, True, False)
    608604    else:
    609605        print "ERROR: Invalid encoding format."
    610         error = True;
     606        return
    611607
    612608    # If we have a valid encoding format then set the basis pattern.
    613     if error == False:
    614         Encoding.basis_pattern = string.split(options.basis_pattern, ",")
    615         if len(Encoding.basis_pattern) == 1:
    616             # If we have the default basis pattern string then adjust it
    617             # for UTF-16 or UTF-32.  If the encoding is UTF-8 then we will
    618             # leave it as is.
    619             if "basis_bits.bit_%i" in Encoding.basis_pattern[0]:
    620                 if "UTF-16" in Encoding.name:
    621                     Encoding.basis_pattern[0] = "u16_bit%i"
    622                 elif "UTF-32" in Encoding.name:
    623                     Encoding.basis_pattern[0] = "u32_bit%i"
    624         elif len(Encoding.basis_pattern) == 2:
    625             if "UTF-16" not in Encoding.name:
    626                 print "ERROR: Invalid encoding for basis pattern variables."
    627                 error = True
    628         elif len(Encoding.basis_pattern) == 3:
    629             if "UTF-32" not in Encoding.name:
    630                 print "ERROR: Invalid encoding for basis pattern variables."
    631                 error = True
    632         else:
    633             print "ERROR: Invalid number of basis pattern variables."
    634             error = True
     609    UTF_encoding.Encoding.basis_pattern = string.split(options.basis_pattern, ",")
     610    if len(UTF_encoding.Encoding.basis_pattern) == 1:
     611        # If we have the default basis pattern string then adjust it
     612        # for UTF-16 or UTF-32.  If the encoding is UTF-8 then we will
     613        # leave it as is.
     614        if "basis_bits.bit_%i" in UTF_encoding.Encoding.basis_pattern[0]:
     615            if UTF_encoding.UTF16.name in UTF_encoding.Encoding.name:
     616                UTF_encoding.Encoding.basis_pattern[0] = "u16_bit%i"
     617            elif UTF_encoding.UTF32.name in UTF_encoding.Encoding.name:
     618                UTF_encoding.Encoding.basis_pattern[0] = "u32_bit%i"
     619    elif len(UTF_encoding.Encoding.basis_pattern) == 2:
     620        if UTF_encoding.UTF16.name not in UTF_encoding.Encoding.name:
     621            print "ERROR: Invalid encoding for the basis pattern variables."
     622            return
     623    elif len(UTF_encoding.Encoding.basis_pattern) == 3:
     624        if UTF_encoding.UTF32.name not in UTF_encoding.Encoding.name:
     625            print "ERROR: Invalid encoding for the basis pattern variables."
     626            return
     627    else:
     628        print "ERROR: Invalid number of basis pattern variables."
     629        return
    635630               
    636631           
    637632    # Positional arguments
    638     if (len(args) == 1) and (error == False):
     633    if (len(args) == 1):
    639634        # if the specified argument is not in the DefinitionSet, then assume that it's a filename
    640635        if args[0] not in DefinitionSet:
    641636            #define the characters in the list
    642637            defs = charset_input_parser.input_chardef(args[0])
     638            if UTF_encoding.Encoding.encoding_error == True:
     639               if UTF_encoding.Encoding.default:
     640                  print "ERROR: The input file contains characters with mixed encodings."
     641               else:
     642                  print ''.join(["ERROR: The input file contains encodings that are not ",
     643                                  UTF_encoding.Encoding.name, "."])
     644               return
    643645        else: defs = DefinitionSet[args[1]]
    644646        if options.use_EBCDIC:
  • proto/charsetcompiler/charset_input_parser.py

    r3349 r3413  
    66#
    77import charset_def
     8import UTF_encoding
    89
    910debug = False
     
    141142        # read per line
    142143        while (string != ""):
     144
     145                #Before we encode let's check to ensure that we are being presented with
     146                #characters encoded with the encoding that we expect.
     147
     148                #If the input file contains a character that has been explicitly encoded as UTF-8
     149                if string.find(r'\x') > -1:
     150                    #The default encoding is UTF8 so if the encoding isn't UTF8 then
     151                    #we know that the encoding is locked to another encoding type.
     152                    if UTF_encoding.Encoding.name != UTF_encoding.UTF8.name:
     153                        UTF_encoding.Encoding.encoding_error = True
     154                    else:
     155                        UTF_encoding.Encoding.locked = True
     156
     157                #If the input file contains a character that has been explicitly encoded as UTF-16
     158                if string.find(r'\u') > -1:
     159                    if UTF_encoding.Encoding.locked == False:
     160                        UTF_encoding.Encoding.name = UTF_encoding.UTF16.name
     161                        UTF_encoding.Encoding.bits = UTF_encoding.UTF16.bits
     162                        UTF_encoding.Encoding.mask = UTF_encoding.UTF16.mask
     163                        UTF_encoding.Encoding.locked = True
     164                    elif UTF_encoding.Encoding.name != UTF_encoding.UTF16.name:
     165                        UTF_encoding.Encoding.encoding_error = True
     166               
     167                #If the input file contains a character that has been explicitly encoded as UTF-32
     168                if string.find(r'\U') > -1:
     169                    if UTF_encoding.Encoding.locked == False:
     170                        UTF_encoding.Encoding.name = UTF_encoding.UTF32.name
     171                        UTF_encoding.Encoding.bits = UTF_encoding.UTF32.bits
     172                        UTF_encoding.Encoding.mask = UTF_encoding.UTF32.mask
     173                        UTF_encoding.Encoding.locked = True
     174                    elif UTF_encoding.Encoding.name != UTF_encoding.UTF32.name:
     175                        UTF_encoding.Encoding.encoding_error = True
     176               
    143177                string = string.decode('unicode_escape')
    144 
     178               
    145179                # '#' indicates comment
    146180                if string[0] != '#': 
Note: See TracChangeset for help on using the changeset viewer.