Ignore:
Timestamp:
Jul 29, 2013, 11:26:03 AM (6 years ago)
Author:
cameron
Message:

Autodetect encoding type

File:
1 edited

Legend:

Unmodified
Added
Removed
  • proto/charsetcompiler/charset_input_parser.py

    r3349 r3413  
    66#
    77import charset_def
     8import UTF_encoding
    89
    910debug = False
     
    141142        # read per line
    142143        while (string != ""):
     144
     145                #Before we encode let's check to ensure that we are being presented with
     146                #characters encoded with the encoding that we expect.
     147
     148                #If the input file contains a character that has been explicitly encoded as UTF-8
     149                if string.find(r'\x') > -1:
     150                    #The default encoding is UTF8 so if the encoding isn't UTF8 then
     151                    #we know that the encoding is locked to another encoding type.
     152                    if UTF_encoding.Encoding.name != UTF_encoding.UTF8.name:
     153                        UTF_encoding.Encoding.encoding_error = True
     154                    else:
     155                        UTF_encoding.Encoding.locked = True
     156
     157                #If the input file contains a character that has been explicitly encoded as UTF-16
     158                if string.find(r'\u') > -1:
     159                    if UTF_encoding.Encoding.locked == False:
     160                        UTF_encoding.Encoding.name = UTF_encoding.UTF16.name
     161                        UTF_encoding.Encoding.bits = UTF_encoding.UTF16.bits
     162                        UTF_encoding.Encoding.mask = UTF_encoding.UTF16.mask
     163                        UTF_encoding.Encoding.locked = True
     164                    elif UTF_encoding.Encoding.name != UTF_encoding.UTF16.name:
     165                        UTF_encoding.Encoding.encoding_error = True
     166               
     167                #If the input file contains a character that has been explicitly encoded as UTF-32
     168                if string.find(r'\U') > -1:
     169                    if UTF_encoding.Encoding.locked == False:
     170                        UTF_encoding.Encoding.name = UTF_encoding.UTF32.name
     171                        UTF_encoding.Encoding.bits = UTF_encoding.UTF32.bits
     172                        UTF_encoding.Encoding.mask = UTF_encoding.UTF32.mask
     173                        UTF_encoding.Encoding.locked = True
     174                    elif UTF_encoding.Encoding.name != UTF_encoding.UTF32.name:
     175                        UTF_encoding.Encoding.encoding_error = True
     176               
    143177                string = string.decode('unicode_escape')
    144 
     178               
    145179                # '#' indicates comment
    146180                if string[0] != '#': 
Note: See TracChangeset for help on using the changeset viewer.