source: proto/charsetcompiler/charset_input_parser.py @ 854

Last change on this file since 854 was 854, checked in by ksherdy, 8 years ago

Removed leading spaces on bitstream statements. Implemented regex range checker.

File size: 5.7 KB
Line 
1# -*- coding: utf-8 -*-
2# charset_input_parser.py
3#
# This library contains functions to parse line-delimited charset definitions
# of the form 'character class name = []' and produce (character class name, character class item list) pairs.
6#
7
# Set to True to have processCharsetInput print the parsed definitions
# through report_CharSetDef before returning them.
debug = False
9       
def report_CharSetDef(charset_declaration_list):
    """
    Diagnostic helper: print every parsed charset definition.

    charset_declaration_list -- sequence of (name, items) entries; only the
                                first two elements of each entry are read.

    Output goes to standard output: a banner line, then one
    "name: <name> | items: <items>" line per entry, then the banner again.
    Nothing is returned.
    """
    banner = "-----------CharSetDef-----------"
    # A single parenthesised argument prints the same text whether print is
    # the Python 2 statement or the Python 3 function.
    print(banner)
    for element in charset_declaration_list:
        # %s formatting also accepts a name that is not a plain string, which
        # string concatenation would reject with TypeError.
        print("name: %s | items: %s" % (element[0], element[1]))
    print(banner)
19
def split(statement):
    """
    Break a charset definition statement at its first '='.

    Returns a two-element list [left, right]: the text before the first '='
    and the text after it, each stripped of surrounding whitespace.  The '='
    itself is dropped.  When the statement contains no '=', right is ''.

    An empty statement yields None instead of a list.
    """
    if len(statement) == 0:
        return None

    # partition() gives (before, separator, after); every second element
    # skips the separator.
    return [part.strip() for part in statement.partition('=')[::2]]
40       
41
def isValidDeclaration(statement):
    """
    Check that a statement has the shape 'character class name = [ ... ]'.

    Only the outer syntax is checked: after splitting on the first '=', the
    right-hand side must start with '[' and end with ']'.  The expression
    between the brackets is NOT validated here; genCharSetItems does that.

    Returns True for a well-formed statement and False otherwise, including
    for an empty statement, a statement without '=' and a statement whose
    right-hand side is empty.
    """
    # split() returns None for an empty statement, so reject that case here
    # rather than calling len() on None.
    if not statement:
        return False

    declaration_list = split(statement)
    if not declaration_list or len(declaration_list) != 2:
        return False

    token = declaration_list[1]
    # startswith/endswith are safe on an empty token, where indexing
    # token[0] / token[-1] would raise IndexError.
    return token.startswith('[') and token.endswith(']')
59   
def genCharSetItems(token, items):
    """
    Expand a bracketed character-class token into its items.

    token -- the class expression including its enclosing brackets, e.g.
             "[A-Za-z_]"; the first and last characters are skipped without
             being inspected.
    items -- list that receives the result: each 'x-y' range is appended as
             a three-character string and every other character on its own.

    Returns True when the whole token was consumed.  Returns False, after
    printing a diagnostic, when a range runs backwards (e.g. "[Z-A]") or
    when a range starts with '-' anywhere but at the very beginning of the
    class.  On failure, items keeps whatever was appended before the error.

    Example: "[A-Za-z_]" appends 'A-Z', 'a-z' and '_', in that order.
    """
    end = len(token) - 1  # index of the closing bracket
    index = 1             # start just past the opening bracket

    while index < end:
        # A range needs three characters before the closing bracket with
        # '-' in the middle; anything else is a single literal character.
        if not (index + 2 < end and token[index + 1] == '-'):
            items.append(token[index])
            index += 1
            continue

        candidate = token[index:index + 3]

        # '-' may open a range only as the first character of the class.
        if token[index] == '-' and index != 1:
            print("Invalid range: " + candidate + ", starting point of a range cannot be '-' if it is not at the beginning of the list.")
            return False

        if not isValidCharacterRange(token[index], token[index + 2]):
            print("Invalid range: " + candidate + ", starting point is greater than ending point.")
            return False

        items.append(candidate)
        index += 3

    return True
98
def isValidCharacterRange(c1, c2):
    """
    Tell whether c1..c2 is a forward (or single-character) range.

    c1 -- first character of the range.
    c2 -- last character of the range.

    Returns True when the ordinal of c1 does not exceed the ordinal of c2,
    and False otherwise.
    """
    low = ord(c1)
    high = ord(c2)
    return not (low > high)
105       
def parseCharsetInput(string):
    """
    Parse one charset declaration line into a (name, items) pair.

    string -- a single declaration of the form 'name = [ ... ]'.

    Returns (name, items) on success, where items is the list filled in by
    genCharSetItems.  Returns the empty tuple () when the line is empty, when
    isValidDeclaration rejects its outer syntax, or when genCharSetItems
    rejects the bracket expression; the last two cases also print a
    diagnostic naming the offending line.
    """
    # Return the same empty pair as every other failure so that callers can
    # always take len() of the result.
    if len(string) == 0:
        return ()

    if not isValidDeclaration(string):
        print("Invalid charset declaration: " + string)
        return ()

    tokens_list = split(string)
    items = []
    if not genCharSetItems(tokens_list[1], items):
        print("Invalid regular expression: " + string)
        return ()

    return (tokens_list[0], items)
127
128
def processCharsetInput(input_filename):
    """
    Read a charset definition file and collect its (name, items) pairs.

    input_filename -- path of a text file holding one declaration per line.
                      A line whose first character is '#' is a comment.

    Each line has its backslash escapes decoded first; comment lines are then
    dropped, one trailing newline is removed, and every remaining non-empty
    line is handed to parseCharsetInput.  Lines it rejects are skipped.

    Returns the list of (name, items) pairs in file order, to be passed to
    the CharSetDef class.  When the module-level debug flag is set, the list
    is also printed with report_CharSetDef.
    """
    charset_declaration_list = []

    # The with-block closes the file even when decoding or parsing raises.
    with open(input_filename, 'r') as input_handle:
        for string in input_handle:
            # NOTE(review): str.decode and the 'string_escape' codec exist on
            # Python 2 only; this call needs replacing before a Python 3 port.
            string = string.decode('string_escape')

            # startswith (unlike string[0]) is also safe if decoding left an
            # empty string.
            if string.startswith('#'):
                continue

            if string.endswith('\n'):
                string = string[:-1]
            if len(string) == 0:
                continue

            pair = parseCharsetInput(string)
            # A rejected line comes back as an empty (falsy) result.
            if pair and len(pair) == 2:
                charset_declaration_list.append(pair)

    # Check if we parsed it properly
    if debug:
        report_CharSetDef(charset_declaration_list)
    return charset_declaration_list
Note: See TracBrowser for help on using the repository browser.