source: proto/charsetcompiler/charset_input_parser.py @ 3310

Last change on this file since 3310 was 2232, checked in by cameron, 7 years ago

Transcode compiler generates charset_compiler input

File size: 6.1 KB
Line 
1# -*- coding: utf-8 -*-
2# charset_input_parser.py
3#
4# This library contains functions to parse line-delimited charset definitions
5# of the form 'character class name = []' and produce (character class name, character class item list) pairs.
6#
7import charset_def
8
# When True, processCharsetInput prints the parsed declarations via report_CharSetDef.
debug = False
10       
def report_CharSetDef(charset_declaration_list):
    """
    Diagnostic helper: dump every parsed charset definition to stdout.

    charset_declaration_list is a sequence of (name, items) pairs. One line
    is printed per pair, showing the character class name and the string
    form of its item list, framed by a banner line before and after.
    """
    banner = "-----------CharSetDef-----------"
    print(banner)
    for declaration in charset_declaration_list:
        class_name = declaration[0]
        class_items = declaration[1]
        print("name: " + class_name + " | items: " + str(class_items))
    print(banner)
20
def split(statement):
    """
    Break a charset definition statement at its first '='.

    Returns an empty list for an empty statement. Otherwise returns a
    two-item list [text before '=', text after '='], each stripped of
    surrounding whitespace. When the statement contains no '=', the second
    item is the empty string.
    """
    if len(statement) < 1:
        return []

    # partition yields (head, separator, tail); the '=' separator is dropped.
    left, _, right = statement.partition('=')
    return [left.strip(), right.strip()]
41       
42
def isValidDeclaration(statement):
    """
    Check that a statement has the shape 'character class name = [ ... ]'.

    The statement is broken up by split(); the text after the first '=' must
    start with '[' and end with ']'. A statement that is empty, has no '=',
    or has nothing after the '=' is rejected (returns False) instead of
    raising IndexError on the empty right-hand side.

    WARNING: the expression between the brackets is not validated here;
    that is done by genCharSetItems.
    """
    # split up the string to a list of tokens
    declaration_list = split(statement)
    if len(declaration_list) != 2:
        return False

    token = declaration_list[1]
    # startswith/endswith are safe on an empty token, unlike token[0] and
    # token[-1], which raise IndexError.
    return token.startswith('[') and token.endswith(']')
60   
def genCharSetItems(token, items):
    """
    Expand a bracketed token (a string such as "[A-Za-z_]") into items.

    Each range 'a-b' or single character found between the brackets is
    appended to the caller-supplied list `items`, in the order it appears.
    Returns True when the whole token was consumed, False as soon as an
    invalid range is met (a message is printed, and `items` keeps whatever
    was appended before the failure).

    Eg. input = "[A-Za-z_]"
        items  = ['A-Z', 'a-z', '_']
    Example of invalid expression: [Z-A]
    """
    # Index of the closing bracket; scanning covers 1 .. closing-1 so both
    # square brackets are skipped.
    closing = len(token) - 1
    pos = 1

    while pos < closing:
        # A range needs a '-' right after the current character and an end
        # character that still lies before the closing bracket.
        is_range = pos + 2 < closing and token[pos + 1] == '-'
        if not is_range:
            items.append(token[pos])
            pos += 1
            continue

        span = token[pos:pos + 3]

        # '-' may only start a range when it is the very first list entry.
        if token[pos] == '-' and pos != 1:
            print("Invalid range: " + span + ", starting point of a range cannot be '-' if it is not at the beginning of the list.")
            return False

        # The starting character must not sort after the ending character.
        if ord(token[pos]) > ord(token[pos + 2]):
            print("Invalid range: " + span + ", starting point is greater than ending point.")
            return False

        items.append(span)
        pos += 3

    return True
99
def isValidCharacterRange(c1, c2):
    """
    Tell whether c1..c2 is a well-ordered character range.

    c1 is the starting character and c2 the ending character. Returns True
    when ord(c1) does not exceed ord(c2), False otherwise.
    """
    start_code = ord(c1)
    end_code = ord(c2)
    return start_code <= end_code
106       
def parseCharsetInput(string):
    """
    Parse one charset declaration line into a (name, items) pair.

    Returns (name, items) when the line passes isValidDeclaration and its
    bracketed expression is accepted by genCharSetItems. Returns an empty
    tuple otherwise: silently for an empty line, and after printing a
    diagnostic for a rejected declaration or expression.

    Every failure path returns the same empty tuple (the empty-line case
    used to return None), so callers can test len(result) == 2 without
    special-casing None.
    """
    if len(string) == 0:
        return ()

    if not isValidDeclaration(string):
        print("Invalid charset declaration: " + string)
        return ()

    # split up the string to [name, bracketed expression]
    tokens_list = split(string)

    # genCharSetItems fills `items` in place and reports success as a boolean.
    items = []
    if not genCharSetItems(tokens_list[1], items):
        print("Invalid regular expression: " + string)
        return ()

    return (tokens_list[0], items)
128
def processCharsetInput(input_filename):
    """
    Read a charset definition file and parse every declaration in it.

    input_filename is opened for reading and processed line by line. Lines
    starting with '#' are comments and are skipped. Every other line is
    decoded with the 'string_escape' codec, has its trailing newline
    removed and, if anything is left, is handed to parseCharsetInput.

    Returns a list of the (name, items) pairs that parsed successfully, in
    file order, to be passed to the CharSetDef class. When the module-level
    `debug` flag is set, the list is also printed via report_CharSetDef.
    """
    # This list contains pairs of (name, items) to be passed to the CharSetDef class
    charset_declaration_list = []

    # The with-block closes the file even if decoding or parsing raises.
    with open(input_filename, 'r') as input_handle:
        for raw_line in input_handle:
            # Skip comment lines before decoding, so that a malformed
            # escape sequence inside a comment cannot abort the whole run.
            if raw_line.startswith('#'):
                continue

            string = raw_line.decode('string_escape')

            # The decoded text may itself begin with '#' (written as an
            # escape), and a line that decodes to nothing must not be
            # indexed.
            if not string or string.startswith('#'):
                continue

            # drop the trailing new line (\n) character, if any
            if string[-1] == '\n':
                string = string[:-1]

            if len(string) != 0:
                # get the pair of name and items from the declared charset
                pair = parseCharsetInput(string)
                if len(pair) == 2:
                    charset_declaration_list.append(pair)

    # Check if we parse it properly
    if debug:
        report_CharSetDef(charset_declaration_list)
    return charset_declaration_list
167
def input_chardef(filename):
    """
    Build the CharSetDef objects declared in an input file.

    The file is parsed by processCharsetInput and each resulting
    (name, items) pair is wrapped in a charset_def.CharSetDef, preserving
    file order. Returns the list of those objects.
    """
    return [
        charset_def.CharSetDef(pair[0], pair[1])
        for pair in processCharsetInput(filename)
    ]
179
Note: See TracBrowser for help on using the repository browser.