source: proto/charsetcompiler/charset_input_parser.py @ 662

Last change on this file since 662 was 662, checked in by ksherdy, 9 years ago

Remove redundant function comments. Replace manual string whitespace trimming function with standard Python library implementation.

File size: 4.5 KB
Line 
1# charset_input_parser.py
2#
3# This library contains functions that parse line-delimited charset definitions
4# of the form 'character class name = [items]' and produce (character class name, character class item list) pairs.
5#
6
def report_CharSetDef(charset_declaration_list):
    """
    Diagnostic helper: print every parsed charset definition.

    Each entry of charset_declaration_list is read as (name, items). One
    line is printed per entry, framed by a banner line before and after.
    """
    banner = "-----------CharSetDef-----------"
    print(banner)
    for declaration in charset_declaration_list:
        name = declaration[0]
        # A single parenthesised argument prints the same text whether
        # print is a statement or a function.
        print("name: " + name + " | items: " + str(declaration[1]))
    print(banner)
16
def split(statement):
    """
    Break a charset definition statement at its first '='.

    Returns [name, value] when an '=' is present, or a one-element list
    holding the whole statement when it is not. Every element has its
    surrounding whitespace stripped. An empty statement yields None.
    """
    if len(statement) == 0:
        return None

    name, separator, value = statement.partition("=")
    # partition() reports an empty separator when the statement has no '='
    pieces = [name, value] if separator else [statement]
    return [piece.strip() for piece in pieces]
40
def checkValidDeclaration(statement):
    """
    Validate the syntax 'character class name = [character or range expression]'.

    Returns True only when the statement splits into exactly two tokens at
    its first '=' and the second token starts with '[' and ends with ']'.
    An empty statement, or one with nothing after the '=', is reported as
    invalid (False) instead of raising.

    WARNING: range expressions are not validated here. For example,
    [A-Z-] is accepted, and the item generator reads it as ['A-Z', '-'].
    """
    # split up the string to a list of tokens
    declaration_list = split(statement)

    # split() hands back None for an empty statement
    if declaration_list is None or len(declaration_list) != 2:
        return False

    token = declaration_list[1]
    # startswith/endswith are safe on an empty token, unlike token[0] / token[-1]
    return token.startswith('[') and token.endswith(']')
61   
def genCharSetItems(token):
    """
    Generate the list of items held in a bracketed charset token (a string).

    The first and last characters of token (the square brackets of a
    validated declaration) are skipped. Three characters of the form 'a-b',
    where b is not '-', become one range item; any other character becomes
    a single-character item.

    e.g. input  = "[A-Za-z_]"
         output = ['A-Z', 'a-z', '_']

    A '-' directly before the last character is a literal hyphen, never the
    start of a range: "[A-]" yields ['A', '-'].
    """
    items = []
    # Index of the last character, which is skipped together with the first one.
    last_index = len(token) - 1

    index = 1
    while index < last_index:
        # The end point of a range must lie strictly before the last
        # character, otherwise the closing bracket would be swallowed.
        is_range = (index + 2 < last_index
                    and token[index + 1] == '-'
                    and token[index + 2] != '-')
        if is_range:
            items.append(token[index:index + 3])
            index += 3
        else:
            items.append(token[index])
            index += 1
    return items
83       
def parseCharsetInput(string):
    """
    Parse one charset declaration line into a (name, items) pair.

    Returns (name, items) for a valid declaration. Returns an empty tuple
    for an empty line, and prints a diagnostic and returns an empty tuple
    for an invalid declaration, so callers can always test
    len(result) == 2.
    """
    # Empty input gives the same empty tuple as any other unusable line,
    # rather than None, so len() on the result never fails.
    if len(string) == 0:
        return ()

    if not checkValidDeclaration(string):
        print("Invalid declaration: " + string)
        return ()

    # split up the string to a list of tokens: [name, bracketed items]
    tokens_list = split(string)
    return (tokens_list[0], genCharSetItems(tokens_list[1]))
101
def processCharsetInput(input_filename):
    """
    Read charset declarations from input_filename, one per line.

    Each non-empty line is escape-decoded and parsed; lines that parse to a
    (name, items) pair are collected. The collected list is printed with
    report_CharSetDef and then returned, to be passed to the CharSetDef class.
    """
    # This list contains pairs of (name, items) to be passed to the CharSetDef class
    charset_declaration_list = []

    # 'with' closes the file even when decoding or parsing raises
    with open(input_filename, 'r') as input_handle:
        for string in input_handle:
            # Interpret backslash escapes written in the file
            # (Python 2 'string_escape' codec).
            string = string.decode('string_escape')

            # Drop the trailing newline. endswith() also copes with an
            # empty string, which indexing string[-1] would not.
            if string.endswith('\n'):
                string = string[:-1]
            if len(string) != 0:
                # get the pair of name and items from the declared charsets
                pair = parseCharsetInput(string)
                if len(pair) == 2:
                    charset_declaration_list.append(pair)

    # Check if we parsed it properly
    report_CharSetDef(charset_declaration_list)
    return charset_declaration_list
Note: See TracBrowser for help on using the repository browser.