source: proto/charsetcompiler/charset_input_parser.py @ 609

Last change on this file since 609 was 609, checked in by cameron, 9 years ago

charset compiler check-in

File size: 5.9 KB
Line 
# charset_input_parser.py
#
#
# This library can be used to parse a charset declaration and produce a (name, items) pair.
# It contains the following functions:
#
# 1. report_CharSetDef(charset_declaration_list):
#       Prints out the name and items of each charset pair in the given list.
#
# 2. split(statement)
#       Splits up a statement (or line) and returns a list of at most 3 elements.
#       The first token will be the first element of the list.
#       The second token will be the second element of the list.
#       The rest will be the third element of the list.
#
# 3. checkValidDeclaration(statement)
#       Checks if a given statement (of type string) contains a valid character set declaration.
#       The expected declaration is:
#           charsetname = [regular expression range]
#
# 4. checkValidDeclarationList(declaration_list):
#       Checks if a declaration list contains a valid character set declaration.
#       The expected declaration is:
#               declaration_list = ["charsetname", "=", "[regular expression range]"]
#
# 5. genCharSetItems(token)
#       Generates a list of items from a given token (of type string).
#       eg. input = "[A-Za-z_]"
#           output = ['A-Z', 'a-z', '_']
#
# 6. parseCharsetInput(string)
#       Takes a line of charset declaration as an argument and generates a (name, items) pair.
#       Returns an empty tuple if the charset declared in the line is not valid.
#
# 7. processCharsetInput(input_filename):
#       Takes input_filename as an argument and reads the file to generate the charset items and its name.
#       Returns a list containing pairs of (name, items) to be passed to the CharSetDef class
#
39
40
def report_CharSetDef(charset_declaration_list):
    """
    Print the name and items of every charset pair in the given list.

    charset_declaration_list is a sequence whose elements are indexable:
    element[0] is the charset name (it must be a string, because it is
    concatenated into the output line) and element[1] is the items object,
    which is rendered with str().

    The report is written to standard output, framed by a banner line before
    and after the entries.  Nothing is returned.
    """
    banner = "-----------CharSetDef-----------"

    # Every print call below receives exactly one parenthesised string, so the
    # output is the same whether print is a statement (Python 2) or a
    # function (Python 3).
    print(banner)
    for element in charset_declaration_list:
        print("name: " + element[0] + " | items: " + str(element[1]))
    print(banner)
49
def split(statement):
    """
    Split a statement (or line) into a list of at most 3 elements.

    The statement is cut at its first two space characters:
      - the text before the first space is the first element,
      - the text between the first and second space is the second element,
      - everything after the second space (further spaces included) is the
        third element.

    Only the space character acts as a separator.  Two consecutive spaces
    therefore produce an empty-string element, and a statement without any
    space comes back as a one-element list.
    """
    # str.split with an explicit separator and maxsplit=2 performs exactly
    # this "cut at the first two spaces" operation.
    return statement.split(" ", 2)
70
def checkValidDeclaration(statement):
    """
    Tell whether a statement string is a valid character set declaration.

    The expected form is:
            charsetname = [regular expression range]

    The statement is tokenised with split() and the resulting token list is
    handed to checkValidDeclarationList(), whose result is returned as is.
    """
    return checkValidDeclarationList(split(statement))
80
def checkValidDeclarationList(declaration_list):
    """
    Check whether a token list forms a valid character set declaration.

    The expected shape is:
            declaration_list = ["charsetname", "=", "[regular expression range]"]

    Returns True when the list has exactly three elements, the second one is
    "=", and the third one starts with "[" and ends with "]".  Returns False
    otherwise, including when the third element is an empty string.

    NOTE: This function does not check for an invalid range; only the
          enclosing square brackets are verified.  A hyphen at the beginning
          or the end of the bracketed text is meant to be treated as a plain
          hyphen character.
          Eg. [A-Z-] is considered as ['A-Z', '-']
    """
    if len(declaration_list) != 3 or declaration_list[1] != '=':
        return False

    token = declaration_list[2]
    # startswith/endswith are used instead of token[0]/token[-1] so that an
    # empty token (e.g. from the line "name = ") is rejected rather than
    # raising IndexError.
    return token.startswith('[') and token.endswith(']')
99   
def genCharSetItems(token):
    """
    Generate the list of items described by a bracketed charset token.

    The first and last characters of the token (the square brackets) are
    skipped.  The text in between is scanned left to right:
      - three characters of the form a-b, where b is not a hyphen, become one
        range item "a-b";
      - any other character becomes a single-character item.

    A hyphen directly before the closing bracket is a plain character, never
    the start of a range end.

    eg. input  = "[A-Za-z_]"
        output = ['A-Z', 'a-z', '_']
    eg. input  = "[A-Z-]"
        output = ['A-Z', '-']
    """
    items = []
    # Index of the closing bracket; only indexes strictly below it hold
    # charset content.
    closing = len(token) - 1

    index = 1
    while index < closing:
        # Range case: pattern a-b where b is not '-'.  The end of the range
        # (index + 2) must lie before the closing bracket, otherwise a token
        # such as "[a-]" would absorb the bracket into a bogus range "a-]".
        if (index + 2 < closing
                and token[index + 1] == '-'
                and token[index + 2] != '-'):
            items.append(token[index:index + 3])
            index += 3
        else:
            items.append(token[index])
            index += 1
    return items
121
def parseCharsetInput(string):
    """
    Turn one line of charset declaration into a (name, items) pair.

    The line is tokenised with split().  When the tokens pass
    checkValidDeclarationList(), the result is the tuple
    (first token, genCharSetItems(third token)).
    Returns an empty tuple if the declaration is not valid.
    """
    pieces = split(string)

    # Invalid declarations are reported to the caller as an empty tuple.
    if not checkValidDeclarationList(pieces):
        return ()

    return (pieces[0], genCharSetItems(pieces[2]))
135
def processCharsetInput(input_filename):
    """
    Read a charset definition file and collect its (name, items) pairs.

    Each line of input_filename is decoded with the 'string_escape' codec,
    stripped of one trailing newline character, and passed to
    parseCharsetInput().  Lines for which parseCharsetInput() does not return
    a two-element result are skipped.

    The collected pairs are printed with report_CharSetDef() and then
    returned as a list, to be passed to the CharSetDef class.

    Any error raised by open() or by decoding a line propagates to the
    caller; the file is closed in every case.
    """
    # This list contains pairs of (name, items) to be passed to the CharSetDef class
    charset_declaration_list = []

    # The with-block guarantees the handle is closed even when decoding or
    # parsing a line raises.
    with open(input_filename, 'r') as input_handle:
        for line in input_handle:
            # Python 2 byte-string codec: expands backslash escape sequences
            # written in the file into the characters they denote.
            line = line.decode('string_escape')

            # Drop one trailing newline; endswith() is also safe on a line
            # that decoded to an empty string.
            if line.endswith('\n'):
                line = line[:-1]

            # get the pair of name and items from the declared charset
            pair = parseCharsetInput(line)
            if len(pair) == 2:
                charset_declaration_list.append(pair)

    # Checking if we parse it properly
    report_CharSetDef(charset_declaration_list)
    return charset_declaration_list
Note: See TracBrowser for help on using the repository browser.