source: proto/charsetcompiler/charset_input_parser.py @ 4185

Last change on this file since 4185 was 3413, checked in by cameron, 6 years ago

Autodetect encoding type

File size: 8.0 KB
Line 
1# -*- coding: utf-8 -*-
2# charset_input_parser.py
3#
4# This library contains functions to parse line-delimited charset definitions
5# of the form 'character class name = []' and produce (character class name, character class item list) pairs.
6#
7import charset_def
8import UTF_encoding
9
# When true, processCharsetInput prints the parsed (name, items) pairs via
# report_CharSetDef before returning them.
debug = False
11       
def report_CharSetDef(charset_declaration_list):
    """
    Diagnostic function. Prints the character class name and the character
    class item list of every charset definition, framed by two banner lines.

    charset_declaration_list: iterable of (name, items) pairs. The name is
    concatenated into the output line, so it must be a string; the items are
    rendered with str().
    Returns nothing.
    """
    banner = "-----------CharSetDef-----------"

    # print(...) with one parenthesised argument produces the same output as
    # the print statement on Python 2 and is also valid Python 3 syntax.
    print(banner)
    for element in charset_declaration_list:
        print("name: " + element[0] + " | items: " + str(element[1]))
    print(banner)
21
def split(statement):
    """
    Breaks a charset definition statement at the first occurrence of '='.

    Returns an empty list for an empty statement. Otherwise returns a two
    item list [text before '=', text after '='] with surrounding whitespace
    removed from both items; the '=' itself is not included. When the
    statement has no '=', the second item is the empty string.
    """
    if not len(statement):
        return []

    # partition() yields (head, separator, tail); the separator is dropped
    left_side, _delimiter, right_side = statement.partition('=')
    return [left_side.strip(), right_side.strip()]
42       
43
def isValidDeclaration(statement):
    """
    Validates character set definition statement syntax as
    'character class name = [character or range expression]'.

    Returns True when the text after the first '=' (surrounding whitespace
    removed) starts with '[' and ends with ']'. Returns False otherwise,
    including for an empty statement, a statement without '=' and a
    statement with nothing after the '='.

    WARNING: This function does not validate the regular expression on the
    definition statement. The task will be done by genCharSetItems.
    """
    if len(statement) == 0:
        return False

    # Same tokenisation as split(): everything after the first '=', trimmed.
    token = statement.partition('=')[2].strip()

    # startswith/endswith are safe on an empty token, whereas indexing
    # token[0] / token[-1] raises IndexError for input such as "name =".
    return token.startswith('[') and token.endswith(']')
61   
def genCharSetItems(token, items):
    """
    Generates the character class items of a given token (of type string)
    and appends them to items in the order in which they occur in the token.

    token: bracketed expression such as "[A-Za-z_]"; the first and the last
           character are skipped as the enclosing square brackets.
    items: list that receives single characters and three character ranges.

    Returns True if the regular expression is valid. Returns False after
    printing a diagnostic when a range is invalid; items then still holds
    whatever was appended before the invalid range was reached.

    Eg. input = "[A-Za-z_]"
        appended to items = ['A-Z', 'a-z', '_']
    Example of invalid regular expression: [Z-A]
    """
    # index of the closing bracket; scanning stops right before it
    closing = len(token) - 1

    # start at 1 to skip the opening bracket
    index = 1

    while index < closing:
        # a range is 'a-b' where b still lies before the closing bracket
        is_range = index + 2 < closing and token[index + 1] == '-'
        if not is_range:
            items.append(token[index])
            index += 1
            continue

        candidate = token[index:index + 3]

        # '-' may start a range only when it is the first character of the list
        if token[index] == '-' and index != 1:
            print("Invalid range: " + candidate
                  + ", starting point of a range cannot be '-' if it is not at the beginning of the list.")
            return False

        if not isValidCharacterRange(token[index], token[index + 2]):
            print("Invalid range: " + candidate
                  + ", starting point is greater than ending point.")
            return False

        items.append(candidate)
        index += 3

    return True
100
def isValidCharacterRange(c1, c2):
    """
    Checks that a range running from c1 (starting point) to c2 (ending point)
    is ordered correctly.

    Both arguments are single characters. Returns True when the code point
    of c1 does not exceed the code point of c2, and False otherwise.
    """
    start_code = ord(c1)
    end_code = ord(c2)
    return not (start_code > end_code)
107       
def parseCharsetInput(string):
    """
    Takes a line of charset declaration as an argument and generates a
    (name, items) pair from it.

    Returns the pair (name, items) when the declaration and its bracketed
    expression are both valid. Returns an empty tuple when the line is empty
    or when the declaration is not valid; in the latter case a diagnostic
    naming the offending line is printed first.
    """
    # An empty line yields the same empty pair as an invalid one, so callers
    # can always take len() of the result.
    if len(string) == 0:
        return ()

    if not isValidDeclaration(string):
        print("Invalid charset declaration: " + string)
        return ()

    # split up the string into [name, bracketed expression]
    tokens_list = split(string)

    # genCharSetItems fills items in place and reports validity
    items = []
    if not genCharSetItems(tokens_list[1], items):
        print("Invalid regular expression: " + string)
        return ()

    return (tokens_list[0], items)
129
def _updateEncodingFromEscapes(line):
    """
    Updates UTF_encoding.Encoding according to the escape prefixes that
    occur in line (raw text, before any escape decoding).

    '\\x' is treated as explicit UTF-8, '\\u' as explicit UTF-16 and '\\U' as
    explicit UTF-32. The first explicit encoding seen locks the choice; a
    later escape that belongs to a different encoding sets encoding_error.
    """
    encoding = UTF_encoding.Encoding

    # The default encoding is UTF8, so if the current name is not UTF8 the
    # encoding has already been locked to another type.
    if line.find(r'\x') > -1:
        if encoding.name != UTF_encoding.UTF8.name:
            encoding.encoding_error = True
        else:
            encoding.locked = True

    # UTF-16 is checked before UTF-32, matching the order of the escapes.
    for prefix, wide in ((r'\u', UTF_encoding.UTF16), (r'\U', UTF_encoding.UTF32)):
        if line.find(prefix) > -1:
            if encoding.locked == False:
                encoding.name = wide.name
                encoding.bits = wide.bits
                encoding.mask = wide.mask
                encoding.locked = True
            elif encoding.name != wide.name:
                encoding.encoding_error = True


def processCharsetInput(input_filename):
    """
    Takes input_filename as an argument and reads the file line by line to
    generate the charset items and their names.

    For every line the escape prefixes are inspected to select the encoding
    recorded in UTF_encoding.Encoding, then the line is decoded with the
    'unicode_escape' codec. Lines starting with '#' are comments and empty
    lines are ignored; every other line is handed to parseCharsetInput and
    kept when it yields a (name, items) pair.

    Returns a list containing pairs of (name, items) to be passed to the
    CharSetDef class. Errors raised by open() or by decoding propagate to
    the caller; the file is closed in either case.
    """
    # This list contains pairs of (name, items) to be passed to the CharSetDef class
    charset_declaration_list = []

    input_handle = open(input_filename, 'r')
    try:
        for string in input_handle:
            # Inspect the raw text first: once decoded, the escapes are gone.
            # Comment lines are scanned and decoded too, as they always were.
            _updateEncodingFromEscapes(string)

            string = string.decode('unicode_escape')

            # '#' indicates comment; startswith is safe on an empty result
            if string.startswith('#'):
                continue

            # drop the trailing new line (\n) character, if there is one
            if string.endswith('\n'):
                string = string[:-1]

            if len(string) != 0:
                # get the pair of name and items from the declared charsets
                pair = parseCharsetInput(string)
                if len(pair) == 2:
                    charset_declaration_list.append(pair)
    finally:
        # release the handle even when decoding or parsing raises
        input_handle.close()

    # Check if we parse it properly
    if debug:
        report_CharSetDef(charset_declaration_list)
    return charset_declaration_list
201
def input_chardef(filename):
    """
    Builds the list of CharSet definitions declared in the input file.

    The file is parsed with processCharsetInput and each resulting
    (name, items) pair is wrapped in a charset_def.CharSetDef, keeping the
    order of the declarations.
    """
    parsed_pairs = processCharsetInput(filename)
    return [charset_def.CharSetDef(pair[0], pair[1]) for pair in parsed_pairs]
213
Note: See TracBrowser for help on using the repository browser.