source: proto/charsetcompiler/charset_input_parser.py @ 675

Last change on this file since 675 was 663, checked in by ksherdy, 9 years ago

Add module specific debug variable.

File size: 4.6 KB
Line 
1# charset_input_parser.py
2#
3# This library contains functions to parse line-delimited charset definitions
4# of the form 'character class name = [...]' and to produce (character class name, character class item list) pairs.
5#
6
# Module-level debug switch: when true, processCharsetInput() prints the parsed
# charset definitions through report_CharSetDef() before returning them.
7debug = False
8
def report_CharSetDef(charset_declaration_list):
    """
    Diagnostic function. Prints the character class name and the character
    class item list of every charset definition, framed by two banner lines.

    charset_declaration_list -- iterable of (name, items) pairs. The name must
    be a string because it is concatenated into the output line; the items
    may be any object, they are rendered with str().

    Returns nothing; the report is written to standard output.
    """
    banner = "-----------CharSetDef-----------"

    # A single parenthesised argument prints the same text whether print is
    # parsed as a Python 2 statement or called as a Python 3 function.
    print(banner)
    for element in charset_declaration_list:
        print("name: " + element[0] + " | items: " + str(element[1]))
    print(banner)
18
def split(statement):
    """
    Splits a charset definition statement on the first occurrence of '='.

    Returns a list of whitespace-stripped tokens:
      * [left, right] when the statement contains '=' (only the first '='
        separates; any later '=' stays in the right-hand token),
      * [statement] (a single token) when there is no '=' at all.
    Returns None when the statement is empty.
    """
    if len(statement) == 0:
        return None

    left, separator, right = statement.partition("=")

    # partition() yields an empty separator when '=' is absent; the whole
    # statement is then the only token.
    pieces = [left, right] if separator else [statement]

    return [piece.strip() for piece in pieces]
42
def checkValidDeclaration(statement):
    """
    Validates character set definition statement syntax as
    'character class name = [character or range expression]'.

    Returns True when the statement contains '=' and the text after the first
    '=', with surrounding whitespace stripped, starts with '[' and ends with
    ']'. Returns False otherwise, including for an empty statement and for a
    statement whose right-hand side is empty (e.g. 'name =').

    WARNING: This function does not check for an invalid range. If there is a
    hyphen at the beginning or the end of the declaration, it will be
    considered as a hyphen character.
    Eg. [A-Z-] is considered as ['A-Z', '-']
    """
    # Cut on the first '=' only, the same way split() tokenises a statement.
    _name, separator, token = statement.partition("=")
    if not separator:
        # No '=' at all (this also covers the empty statement).
        return False

    token = token.strip()

    # startswith/endswith are safe on an empty token, unlike token[0] and
    # token[-1], and still reject a lone '[' or ']'.
    return token.startswith('[') and token.endswith(']')
63   
def genCharSetItems(token):
    """
    Generates a list of items from a given token (of type string).

    The token is expected to be enclosed in square brackets; the first and
    the last character are skipped. Items are emitted in the order in which
    they appear: a three-character pattern 'a-b' (where b is not '-') becomes
    one range item, every other character becomes a single-character item.
    A hyphen directly before the closing bracket is a literal hyphen.

    eg. input  = "[A-Za-z_]"
        output = ['A-Z', 'a-z', '_']
        input  = "[A-Z-]"
        output = ['A-Z', '-']
    """
    items = []

    # Index of the closing bracket; nothing at or beyond it may become part
    # of an item.
    closing = len(token) - 1

    # Start at index one to skip the opening bracket.
    index = 1
    while index < closing:
        # Range case: we want the pattern a-b where b is not '-'. The end of
        # the range must lie before the closing bracket, otherwise '[A-]'
        # would yield the bogus range 'A-]'.
        is_range = (index + 2 < closing
                    and token[index + 1] == '-'
                    and token[index + 2] != '-')
        if is_range:
            items.append(token[index:index + 3])
            index += 3
        else:
            items.append(token[index])
            index += 1
    return items
85       
def parseCharsetInput(string):
    """
    Takes one line of charset declaration and generates its (name, items) pair.

    Returns a (name, items) tuple for a valid declaration, where name is the
    stripped text before the first '=' and items is the list produced by
    genCharSetItems() from the bracketed right-hand side.
    Returns an empty tuple when the line is empty or is not a valid
    declaration; an invalid (non-empty) line is also reported on standard
    output.
    """
    # Always hand back a tuple so that callers can test len(result) without
    # having to special-case None.
    if len(string) == 0:
        return ()

    if not checkValidDeclaration(string):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Invalid declaration: " + string)
        return ()

    # split up the string to a list of tokens: [name, '[...]']
    tokens_list = split(string)
    items = genCharSetItems(tokens_list[1])
    return (tokens_list[0], items)
103
def processCharsetInput(input_filename):
    """
    Takes input_filename as an argument and reads the file line by line to
    generate the charset items and their names.

    Each line is escape-decoded, stripped of its trailing newline and handed
    to parseCharsetInput(); empty lines and lines that do not yield a
    (name, items) pair are skipped.

    Returns a list containing pairs of (name, items) to be passed to the
    CharSetDef class. When the module-level debug flag is set, the list is
    also printed through report_CharSetDef().
    """
    # This list contains pairs of (name, items) to be passed to the CharSetDef class
    charset_declaration_list = []

    # The with-block closes the file even if decoding or parsing raises.
    with open(input_filename, 'r') as input_handle:
        for string in input_handle:
            # Turn escape sequences written in the file (e.g. \t, \x41) into
            # the characters they denote.
            # NOTE: the 'string_escape' codec exists on Python 2 only.
            string = string.decode('string_escape')

            # Drop the trailing newline, if any. endswith() is used because
            # decoding can leave an empty string (a line consisting only of
            # backslash-newline), on which string[-1] would raise IndexError.
            if string.endswith('\n'):
                string = string[:-1]

            if len(string) != 0:
                # get the pair of name and items from the declared charset
                pair = parseCharsetInput(string)
                if pair and len(pair) == 2:
                    charset_declaration_list.append(pair)

    # Check if we parsed it properly
    if debug:
        report_CharSetDef(charset_declaration_list)
    return charset_declaration_list
Note: See TracBrowser for help on using the repository browser.