source: proto/charsetcompiler/charset_input_parser.py @ 676

Last change on this file since 676 was 676, checked in by ksherdy, 9 years ago

Add JSON character class definitions.

File size: 4.6 KB
Line 
1# -*- coding: utf-8 -*-
2# charset_input_parser.py
3#
# This library contains functions to parse line-delimited charset definitions
# of the form 'character class name = [...]' and produce (character class name, character class item list) pairs.
6#
7
# Diagnostic switch: when true, processCharsetInput() prints the parsed
# definitions through report_CharSetDef() before returning them.
debug = False
9
def report_CharSetDef(charset_declaration_list):
    """
    Diagnostic function. Print the character class name and the character
    class item list of every charset definition to standard output.

    charset_declaration_list -- iterable of (name, items) pairs; name must be
    a string (it is concatenated directly), items is rendered with str().

    Returns None.
    """
    banner = "-----------CharSetDef-----------"
    # print(...) with one parenthesised argument produces the same output
    # under the Python 2 print statement and the Python 3 print function.
    print(banner)
    for element in charset_declaration_list:
        print("name: " + element[0] + " | items: " + str(element[1]))
    print(banner)
19
def split(statement):
    """
    Break a charset definition statement at its first '=' sign.

    Returns None when the statement is empty. Otherwise returns a list of
    whitespace-stripped tokens: [left, right] when an '=' is present, or a
    one-item list holding the whole stripped statement when it is not.
    """
    if len(statement) == 0:
        return None

    left, separator, right = statement.partition("=")

    # Without an '=' the whole statement is the only token.
    if separator:
        pieces = [left, right]
    else:
        pieces = [statement]

    return [piece.strip() for piece in pieces]
43
def checkValidDeclaration(statement):
    """
    Validate character set definition statement syntax as
    'character class name = [character or range expression]'.

    Returns True when the statement splits into exactly two tokens around the
    first '=' and the second token starts with '[' and ends with ']';
    returns False otherwise, including for an empty statement or an empty
    right-hand side.

    WARNING: This function does not check for an invalid range. A hyphen at
                the beginning or the end of the declaration is treated as a
                literal hyphen character.
                Eg. [A-Z-] is considered as ['A-Z', '-']
    """
    # split up the string to a list of tokens
    declaration_list = split(statement)

    # split() yields None for an empty statement and a one-item list when
    # there is no '='; neither form is a declaration.
    if not declaration_list or len(declaration_list) != 2:
        return False

    token = declaration_list[1]

    # startswith/endswith are safe on an empty token (e.g. "name ="), where
    # indexing token[0] would raise IndexError.
    return token.startswith('[') and token.endswith(']')
64   
def genCharSetItems(token):
    """
    Generates a list of items from a given bracketed token (of type string).

    The first and last characters are assumed to be the enclosing square
    brackets and are skipped. Three characters of the form 'x-y' (where y is
    not '-') become one range item; every other character becomes a
    single-character item.

    eg. input  = "[A-Za-z_]"
        output = ['A-Z', 'a-z', '_']
        input  = "[A-Z-]"
        output = ['A-Z', '-']

    Returns the list of items in the order they appear in the token.
    """
    items = []
    # Index of the closing bracket; item characters live strictly before it.
    closing = len(token) - 1

    index = 1
    while index < closing:
        # Range case: pattern a-b where b is not '-'. The range end must lie
        # before the closing bracket, otherwise "[A-]" would swallow the ']'
        # as a range end instead of yielding 'A' and a literal '-'.
        is_range = (index + 2 < closing
                    and token[index + 1] == '-'
                    and token[index + 2] != '-')
        if is_range:
            items.append(token[index:index + 3])
            index += 3
        else:
            items.append(token[index])
            index += 1
    return items
86       
def parseCharsetInput(string):
    """
    Takes one line of charset declaration and generates its (name, items) pair.

    Returns:
      None            when the line is empty;
      ()              when the declaration is not valid (a message is also
                      printed to standard output);
      (name, items)   for a valid declaration, where items is the list
                      produced by genCharSetItems.
    """
    if len(string) == 0:
        return

    if not checkValidDeclaration(string):
        # Single-argument print(...) is valid and equivalent in Python 2 and 3.
        print("Invalid declaration: " + string)
        return ()

    # split up the string to [name, bracketed token]
    tokens_list = split(string)
    # get the items and pair them with the character class name
    items = genCharSetItems(tokens_list[1])
    return (tokens_list[0], items)
104
def processCharsetInput(input_filename):
    """
    Reads input_filename line by line and generates the charset items and
    name for every valid declaration in it.

    Returns a list containing pairs of (name, items) to be passed to the
    CharSetDef class. Empty lines are skipped; invalid declarations are
    reported by parseCharsetInput and left out of the result.

    Errors from open() or from decoding a line propagate to the caller; the
    file is closed in either case.
    """
    # This list contains pairs of (name, items) to be passed to the CharSetDef class
    charset_declaration_list = []

    # 'with' guarantees the handle is closed even if decoding or parsing raises.
    with open(input_filename, 'r') as input_handle:
        for string in input_handle:
            # Interpret backslash escapes written in the definition file.
            # NOTE(review): the 'string_escape' codec and str.decode exist only
            # in Python 2 -- this function is not Python 3 ready.
            string = string.decode('string_escape')

            # Drop a single trailing newline. endswith() is used rather than
            # string[-1] so an empty decoded line cannot raise IndexError.
            if string.endswith('\n'):
                string = string[:-1]

            if len(string) != 0:
                # get the pair of name and items from the declared charset
                pair = parseCharsetInput(string)
                if pair is not None and len(pair) == 2:
                    charset_declaration_list.append(pair)

    # Check if we parsed it properly
    if debug:
        report_CharSetDef(charset_declaration_list)
    return charset_declaration_list
Note: See TracBrowser for help on using the repository browser.