Changeset 660 for proto/charsetcompiler


Ignore:
Timestamp:
Oct 15, 2010, 1:53:03 PM (9 years ago)
Author:
ksherdy
Message:

Update charset_input_parser.py to delimit character class name value pairs on "="

File:
1 edited

Legend:

Unmodified
Added
Removed
  • proto/charsetcompiler/charset_input_parser.py

    r609 r660  
    1414#       The rest will be the third element of the list
    1515#
    16 # 3. checkValidDeclaration(statement)
     16# 3. checkValidDeclaration(statement, token_list)
    1717#       Checks if a given statement (of type string) contains a valid character set declaration.
    1818#       The expected declaration is:
    1919#           charsetname = [regular expression range]
    20 #
    21 # 4. checkValidDeclarationList(declaration_list):
    22 #       Checks if a declaration list contains a valid character set declaration.
    23 #       The expected declaration is:
    24 #               declaration_list = ["charsetname", "=", "[regular expression range]"]
    2520#
    2621# 5. genCharSetItems(token)
     
    3631#       Takes input_filename as an argument and reads the file to generate the charset items and its name.
    3732#       Returns a list containing pairs of (name, items) to be passed to the CharSetDef class
     33#       Expected input file:
     34#               charsetname = [regular expression range]
     35#               charsetname = [regular expression range]
     36#       Note the spaces and new line. Invalid declarations will be ignored.
    3837#
    3938
     
    5049def split(statement):
    5150        """
    52         Splits up a statement (or line) and returns a list of at most 3 elements.
     51        Splits up a statement (or line) with '=' as the delimiter and returns a list of at most 2 elements.
    5352        The first token will be the first element of the list.
    54         The second token will be the second element of the list.
    55         The rest will be the third element of the list
     53        The rest will be the second element of the list.
    5654        """
     55        if len(statement)==0:
     56            return
     57       
    5758        tokens_list = []
    58         token_index = 0
    5959        start = 0
    60         space_index = 0
    61         while (space_index != -1 and token_index < 2):
    62             space_index = statement.find (" ", start)
    63             if space_index > -1:
    64                 tokens_list.append (statement[start:space_index])
    65                 start = space_index + 1
    66                 token_index += 1
     60        equal_index = statement.find ("=", start)
     61        if equal_index > -1: #means that we found '='!
     62            tokens_list.append (statement[start:equal_index])
     63            start = equal_index + 1
    6764
     65        #append the remaining string as the last member of the list
    6866        tokens_list.append (statement[start:])
     67       
     68        #get rid of spaces at the beginning and the end of each token
     69        for i in range(0,len(tokens_list)):
     70            tokens_list[i] = removeSpaces(tokens_list[i])
     71           
    6972        return tokens_list
    7073
     
    7477        The expected declaration is:
    7578                charsetname = [regular expression range]
    76         """
    77         # split up the string to a list of tokens
    78         tokens_list = split(statement)
    79         return checkValidDeclarationList (tokens_list)
    80 
    81 def checkValidDeclarationList(declaration_list):
    82         """
    83         Checks if a declaration list contains a valid character set declaration.
    84         The expected declaration is:
    85                 declaration_list = ["charsetname", "=", "[regular expression range]"]
    8679        NOTE: This function does not check an invalid range. If there is a hyphen at the beginning
    8780                or the end of the declaration, it will be considered as a hyphen character
    8881              Eg. [A-Z-] is considered as ['A-Z', '-']
    8982        """
    90         if len(declaration_list) == 3 and declaration_list[1] == '=':
    91             token = declaration_list[2]
     83        # split up the string to a list of tokens
     84        declaration_list = split(statement)
     85           
     86        if len(declaration_list) == 2:
     87            token = declaration_list[1]
    9288
    9389            # token[0] != '[' or token[-1] != ']' handles the case when the first and the last characters are not a pair of square brackets
    9490            if token[0] != '[' or token[-1] != ']':
    95                 return False
     91                return False
    9692        else:
    9793            return False
     
    10298        Generates a list of items from a given token (of type string).
    10399        eg. input = "[A-Za-z_]"
    104                 output = ['a-z', 'A-Z', '_']
     100            output = ['a-z', 'A-Z', '_']
    105101        """
    106102        items = []
     
    120116        return items
    121117
     118def removeSpaces(string):
     119        """
     120        Return the string with spaces removed from its beginning and end
     121        """
     122        start = 0
     123        end = len(string)-1
     124        while (string[start] == ' '):
     125                start += 1
     126               
     127        while (string[end] == ' '):
     128                end -= 1
     129        return string [start:end+1]
     130       
    122131def parseCharsetInput(string):
    123132        """
     
    125134        Returns an empty pair if the charset declared in the file is not valid.
    126135        """
     136        if len(string) == 0:
     137            return
     138       
    127139        # split up the string to a list of tokens
    128         tokens_list = split(string)
    129 
    130         if checkValidDeclarationList(tokens_list):
     140        if checkValidDeclaration(string):
     141            tokens_list = split (string)
    131142            # get the items and store the (name, items) pair to the charset_declaration_list
    132             items = genCharSetItems(tokens_list[2])
     143            items = genCharSetItems(tokens_list[1])
    133144            return (tokens_list[0], items)
     145        else:
     146            print "Invalid declaration: " + string
    134147        return ()
    135148
     
    153166                if string[-1] == '\n':
    154167                        string = string [:-1]
     168                if len(string) != 0:
     169                        # get the pair of name and items from the declared charsets
     170                        pair = parseCharsetInput(string)
     171                        if len(pair) == 2:
     172                                charset_declaration_list.append(pair)
    155173
    156                 # get the pair of name and items from the declared charsets
    157                 pair = parseCharsetInput(string)
    158                 if len(pair) == 2:
    159                         charset_declaration_list.append(pair)
    160                 string = input_handle.readline()
     174                string = input_handle.readline()
    161175
    162176        input_handle.close()
    163177
    164         # Checking if we parse it properly
     178        # Check if we parse it properly
    165179        report_CharSetDef(charset_declaration_list)
    166180        return charset_declaration_list
Note: See TracChangeset for help on using the changeset viewer.