source: trunk/symbol_table/test/st_test_file_generator.py @ 2026

Last change on this file since 2026 was 1973, checked in by ksherdy, 8 years ago

Added tests.

File size: 10.2 KB
Line 
1import random
2import sys
3import math
4       
5# This script generates 2 files.
6#
7# 1. Output Test File
8#
9#    st_test_UNIFORM_(L_O_U)_(L_O_U)_(L_O_U)_Ag_Mg, where for each (L_O_U),
10
11#    * UNIFORM denotes that equal counts of symbols of each length L are generated
12#    * L denotes length
13#    * O denotes the total number of symbols of length L
14#    * U denotes the number of unique symbols of length L
15#
16#    and
17#
18#    * Ag denotes the average gap (or separation) distance between symbols.
19#    * Mg denotes the maximum absolute deviation gap distance, see Absolute Deviation
20#
21# 2. Meta Results File
22#
23#    st_meta_test_UNIFORM_(L_O_U)_(L_O_U)_(L_O_U)_Ag_Mg
24#
25#    This file contains the start position, length and gap distance for each symbol contained in the generated test file,
26#    as well as the average gap distance and the maximum gap distance.
27#       
28
# Command-line synopsis printed by main() when too few arguments are supplied.
# The script name matches this file (st_test_file_generator.py).
usage = "python st_test_file_generator.py [L,O,U] [L,O,U] [L,O,U] ... AvgGap MaxGap"

# Module-level state shared by the generator functions, SymbolLogger and main().
# unique_symbols and unique_symbol_counts are parallel lists: entry i of one
# belongs to entry i of the other.

# list of unique symbols
unique_symbols = []
# list of counts of the number of occurrences for each unique symbol
# (decremented by genTestFile as occurrences are written)
unique_symbol_counts = []

# a list that contains symbol specification [[L,O,U], [L,O,U], [L,O,U], ...]
symbol_specs = []

# unique_sym_pos[i] collects the start positions at which unique_symbols[i]
# was written to the test file
unique_sym_pos = []
41
class SymbolFileGenerator:
    """Writer for the generated test file.

    The output file is named "st_test_UNIFORM_" + base_filename and is opened
    for writing as soon as an instance is created.
    """

    # Placeholder value; __init__ replaces it with the open file object.
    testFileHandle = 0

    def __init__(self, base_filename):
        """Open the test file whose name is derived from base_filename."""
        output_name = "st_test_UNIFORM_" + base_filename
        self.testFileHandle = open(output_name, 'w')

    def appendUniqueSymbolAndGap(self, symbol_string, gap):
        """Write symbol_string followed by `gap` comma characters."""
        separator = ',' * gap
        self.testFileHandle.write(symbol_string + separator)

    def close(self):
        """Close the underlying test file."""
        handle = self.testFileHandle
        handle.close()
52     
53
class SymbolLogger:
    """Writer for the meta results file that describes the generated test file.

    The log is named "st_meta_test_UNIFORM_" + base_filename.  Several methods
    read the module-level lists unique_symbols, unique_symbol_counts and
    unique_sym_pos.
    """

    # Placeholder value; __init__ replaces it with the open file object.
    logFileHandle = 0

    def __init__(self, base_filename, unique_symbol_total, avg_gap, max_gap):
        """Size unique_sym_pos, open the log and record the requested gaps.

        One empty position list per unique symbol is appended to the
        module-level unique_sym_pos before the log file is opened.
        """
        unique_sym_pos.extend([] for _ in range(unique_symbol_total))

        log_name = "st_meta_test_UNIFORM_" + base_filename
        self.logFileHandle = open(log_name, 'w')
        header = ("Average Gap Distance: " + str(avg_gap)
                  + "\nMaximum Gap Distance: " + str(max_gap))
        self.logFileHandle.write(header)

    def logUniqueSymbolPositions(self, index):
        """Log every recorded start position of unique_symbols[index]."""
        out = self.logFileHandle
        out.write("Symbol \'" + unique_symbols[index] + "\' appeared at positions: \n")
        out.write(str(unique_sym_pos[index]) + "\n")

    def actualUniqueSymbolInfo(self, length, position, gap):
        """Log one written symbol: start position, length and trailing gap."""
        columns = [
            "Start: " + repr(position).rjust(10),
            "Length: " + repr(length).rjust(10),
            "Gap: " + repr(gap).rjust(10),
        ]
        self.logFileHandle.write(" | ".join(columns) + "\n")

    def expectedUniqueSymbolsInfo(self, length, num_occurrences, num_unique_syms):
        """Log the requested figures for one length and the symbols of that length.

        Every entry of unique_symbols whose length equals `length` is listed
        together with its entry in unique_symbol_counts.
        """
        out = self.logFileHandle
        summary = ("Length " + str(length)
                   + "\nNumber of occurrences: " + str(num_occurrences)
                   + "\nNumber of unique symbols: " + str(num_unique_syms)
                   + "\nList unique symbols")
        out.write(summary)
        for slot, symbol in enumerate(unique_symbols):
            if len(symbol) != length:
                continue
            out.write("\n\tSymbol \'" + symbol + "\'"
                      + "\n\tNumber of occurrences: " + str(unique_symbol_counts[slot]))
        out.write("\n")

    def appendAvgGapMaxGap(self, average_gap, max_gap):
        """Log a separator followed by the measured average and maximum gap."""
        self.metadataMsg("")
        footer = ("Average gap distance: " + str(average_gap)
                  + "\nMaximum gap distance: " + str(max_gap))
        self.logFileHandle.write(footer)

    def metadataMsg(self, string):
        """Log a separator line followed by `string` on its own line."""
        self.logFileHandle.write("\n----------\n" + string + "\n")

    def close(self):
        """Close the underlying log file."""
        handle = self.logFileHandle
        handle.close()
93
def genLowerCaseCharactersList():
    """Return a new list holding the 26 lower-case letters 'a'..'z' in order."""
    first, last = ord('a'), ord('z')
    return [chr(code) for code in range(first, last + 1)]
96
def genUpperCaseCharactersList():
    """Return a new list holding the 26 upper-case letters 'A'..'Z' in order."""
    first, last = ord('A'), ord('Z')
    return [chr(code) for code in range(first, last + 1)]
99
def genDigitCharactersList():
    """Return a new list holding the decimal digits '0'..'9' in order."""
    return [str(digit) for digit in range(10)]
102
def genSpecialCharactersList():
    """Return a new list holding the punctuation characters '_', ':', '-', '.'."""
    return list("_:-.")
105
def genUniqueSymbol(length, num_unique_sym):
    """Generate num_unique_sym distinct random symbols of `length` characters.

    Characters are drawn from the digit, lower-case, upper-case and special
    character lists (XML-like characters).  The new symbols are appended, in
    generation order, to the module-level unique_symbols list.

    Args:
        length: number of characters in each generated symbol.
        num_unique_sym: how many distinct symbols to generate.

    Raises:
        ValueError: if fewer than num_unique_sym distinct symbols of this
            length are still available.  Without this check the rejection
            loop below would never terminate.
    """
    # Build the alphabet once instead of once per generated character.
    alphabet = (genDigitCharactersList() + genLowerCaseCharactersList()
                + genUpperCaseCharactersList() + genSpecialCharactersList())  # XML-like characters

    # range() treats a negative length like 0: only the empty symbol exists.
    effective_length = max(length, 0)

    # Symbols of the same length produced by an earlier call are off limits
    # too, otherwise unique_symbols could hold the same symbol twice.
    taken = set(sym for sym in unique_symbols if len(sym) == effective_length)

    available = len(alphabet) ** effective_length - len(taken)
    if num_unique_sym > available:
        raise ValueError(
            "cannot generate %d unique symbols of length %d: only %d available"
            % (num_unique_sym, effective_length, available))

    new_symbols = []
    while len(new_symbols) < num_unique_sym:
        candidate = "".join(random.choice(alphabet) for _ in range(effective_length))
        # set membership keeps the duplicate check O(1) per candidate
        if candidate not in taken:
            taken.add(candidate)
            new_symbols.append(candidate)
    unique_symbols.extend(new_symbols)
118
119# NOTE: This function assumes that num_occurences > num_unique_sym
def genDistributionUNIFORM(num_occurences, num_unique_sym):
    """Append a uniform occurrence count for each unique symbol of one spec.

    num_unique_sym entries are appended to the module-level
    unique_symbol_counts list.  Each entry is
    num_occurences // num_unique_sym; the last entry additionally receives
    the division remainder so that the appended counts always sum to
    num_occurences.

    Args:
        num_occurences: total number of occurrences to distribute.
        num_unique_sym: number of unique symbols sharing those occurrences.

    Raises:
        ValueError: if num_unique_sym is smaller than 1.
    """
    if num_unique_sym < 1:
        raise ValueError("num_unique_sym must be at least 1, got %r" % (num_unique_sym,))

    # The leftover must be taken modulo the number of symbols, not modulo the
    # per-symbol share; otherwise e.g. 10 occurrences over 4 symbols would
    # only add up to 8.
    share, leftover = divmod(num_occurences, num_unique_sym)
    unique_symbol_counts.extend([share] * (num_unique_sym - 1))
    unique_symbol_counts.append(share + leftover)
125   
126# NOTE: This function assumes that num_occurences > num_unique_sym
def genDistributionRANDOM(num_occurences, num_unique_sym):
    """Append a random occurrence count for each unique symbol of one spec.

    num_unique_sym entries are appended to the module-level
    unique_symbol_counts list.  Every entry but the last is drawn with
    random.randint from 1 up to a ceiling that leaves at least one occurrence
    for each symbol still to be assigned; the last entry receives whatever
    remains.
    """
    remaining = num_occurences
    ceiling = remaining - num_unique_sym + 1
    for assigned in range(num_unique_sym - 1):
        share = random.randint(1, ceiling)
        unique_symbol_counts.append(share)
        remaining -= share
        # keep one occurrence in reserve for every symbol after the next one
        ceiling = remaining - (num_unique_sym - assigned - 2)
    unique_symbol_counts.append(remaining)
134
def genSymbols():
    """Return the index of a randomly chosen symbol that has occurrences left.

    The choice is uniform over all indices of unique_symbols whose entry in
    unique_symbol_counts is greater than zero.

    Raises:
        ValueError: if no symbol has occurrences left.  Retrying random
            indices in that situation would never terminate.
    """
    candidates = [index
                  for index, count in zip(range(len(unique_symbols)), unique_symbol_counts)
                  if count > 0]
    if not candidates:
        raise ValueError("no unique symbol has occurrences left to emit")
    return random.choice(candidates)
145
def genTestFile(symbol_file_generator,symbol_logger, avg_gap, max_gap, total_symbols):
    """Write total_symbols symbols, each followed by a gap, and log them.

    For every symbol an index is obtained from genSymbols(), its entry in
    unique_symbol_counts is decremented, and a gap is drawn at random (without
    replacement) from a pool.  Whenever the pool is empty it is refilled with
    avg_gap plus symmetric pairs avg_gap + i / avg_gap - i, so a full pool
    averages avg_gap and never exceeds max_gap.  Each symbol is written through
    symbol_file_generator, logged through symbol_logger and its start position
    is recorded in unique_sym_pos.  Finally the measured average gap and the
    largest gap used are logged.

    Args:
        symbol_file_generator: object providing appendUniqueSymbolAndGap().
        symbol_logger: object providing actualUniqueSymbolInfo() and
            appendAvgGapMaxGap().
        avg_gap: centre value of the generated gaps.
        max_gap: largest gap value placed in a full pool.
        total_symbols: number of symbols to write.  With 0 nothing is written
            and an average of 0.0 and a maximum of -1 are logged.
    """
    gaps = []
    position = 0
    gap_counter = 0
    gap_total = 0
    max_unique_gap = -1
    # number of distinct deviations 0 .. (max_gap - avg_gap) around avg_gap
    gap_list_size = (max_gap - avg_gap) + 1

    while total_symbols > 0:
        index = genSymbols()
        sym_str = unique_symbols[index]
        # update counter
        unique_symbol_counts[index] -= 1
        total_symbols -= 1

        # if we are out of gaps, then generate gaps
        if not gaps:
            if total_symbols > gap_list_size:
                # full pool: avg_gap and every symmetric pair up to max_gap
                gaps.append(avg_gap)
                for i in range(1, gap_list_size):
                    gaps.append(avg_gap + i)
                    gaps.append(avg_gap - i)
            else:
                # reduced pool for the tail of the file; guard the modulo
                # because gap_list_size is 0 when max_gap == avg_gap - 1
                remainder = total_symbols % gap_list_size if gap_list_size else 0
                gaps.append(avg_gap)
                if remainder % 2 == 0:
                    gaps.append(avg_gap)
                # floor division keeps range() working under true division
                for i in range(1, remainder // 2):
                    gaps.append(avg_gap + i)
                    gaps.append(avg_gap - i)

        gap = gaps.pop(random.randint(0, len(gaps) - 1))
        gap_total += gap
        gap_counter += 1
        if max_unique_gap < gap:
            max_unique_gap = gap

        # write test file
        symbol_file_generator.appendUniqueSymbolAndGap(sym_str, gap)
        symbol_logger.actualUniqueSymbolInfo(len(sym_str), position, gap)
        unique_sym_pos[index].append(position)
        # and update counter
        position += len(sym_str) + gap

    # symbol_logger gap statistic; avoid dividing by zero when nothing was written
    if gap_counter:
        measured_average = float(gap_total) / gap_counter
    else:
        measured_average = 0.0
    symbol_logger.appendAvgGapMaxGap(measured_average, max_unique_gap)
201   
202
def main(*arguments):
    """Generate the test file and its meta results file.

    Args:
        *arguments: command-line style strings: one or more "[L,O,U]" symbol
            specifications followed by the average gap and the maximum gap.

    With fewer than three arguments the usage text is printed and nothing is
    generated.  If avg_gap - max_gap // 2 is not strictly positive a message
    is printed and the process exits through sys.exit().  Both output files
    are closed and the shared module-level lists are emptied even when
    generation fails part-way.
    """
    num_args = len(arguments)
    print("%s %s" % (num_args, arguments))
    if num_args < 3:
        print(usage)
        return

    avg_gap = int(arguments[-2])
    print("Average Gap Value: " + str(avg_gap))

    max_gap = int(arguments[-1])
    print("Maximum Gap Value: " + str(max_gap))

    # floor division keeps the integer semantics on every Python version
    if avg_gap - max_gap // 2 <= 0:
        print("Average Gap - Maximum Gap/2 must be strictly greater than 0")
        sys.exit()

    symbol_file_generator = None
    symbol_logger = None
    try:
        # parse unique symbol distributions
        for unique_symbol_dist_str in arguments[:-2]:
            # NOTE(security): eval() executes arbitrary expressions taken from
            # the command line.  Only run this script with trusted arguments;
            # ast.literal_eval would be a safer parser for "[L,O,U]".
            symbol_specs.append(eval(unique_symbol_dist_str))

        print("Generating test file . . . ")

        # generate base filename
        base_filename = "".join(
            "(%s_%s_%s)_" % (spec[0], spec[1], spec[2]) for spec in symbol_specs)
        base_filename += str(arguments[-2]) + "_" + str(arguments[-1])

        # calculate total number of unique symbols over all specifications
        unique_symbol_total = sum(spec[2] for spec in symbol_specs)

        # construct file generator and logger instances
        symbol_file_generator = SymbolFileGenerator(base_filename)
        symbol_logger = SymbolLogger(base_filename, unique_symbol_total, avg_gap, max_gap)

        symbol_logger.metadataMsg("Actual Result Statistics: \n")
        total_symbols = 0

        for symbol_spec in symbol_specs:
            # generate the unique symbols and the distribution of each of the symbols
            genUniqueSymbol(symbol_spec[0], symbol_spec[2])
            genDistributionUNIFORM(symbol_spec[1], symbol_spec[2])
            symbol_logger.expectedUniqueSymbolsInfo(symbol_spec[0], symbol_spec[1], symbol_spec[2])
            total_symbols += symbol_spec[1]

        symbol_logger.metadataMsg("Actual Result Symbols: ")
        genTestFile(symbol_file_generator, symbol_logger, avg_gap, max_gap, total_symbols)

        # symbol_logger statistic of unique symbols position
        symbol_logger.metadataMsg("Unique Symbol Position: ")
        for i in range(len(unique_symbols)):
            symbol_logger.logUniqueSymbolPositions(i)
    finally:
        # release both output files; the test file was previously never closed
        if symbol_logger is not None:
            symbol_logger.close()
        if symbol_file_generator is not None:
            symbol_file_generator.close()

        # reset shared variables so a later call starts from a clean state
        del unique_symbols[:]
        del unique_symbol_counts[:]
        del symbol_specs[:]
        del unique_sym_pos[:]

    print("Done generating test file")
276
# Script entry point: forward the command-line arguments (without the program
# name) to main() when this file is executed directly.
if __name__ == "__main__":
    cli_arguments = sys.argv[1:]
    main(*cli_arguments)
279
280
Note: See TracBrowser for help on using the repository browser.