source: trunk/symbol_table/test/st_test_file_generator.py @ 2075

Last change on this file since 2075 was 2075, checked in by ksherdy, 7 years ago

Boo.

File size: 9.8 KB
Line 
1import random
2import sys
3import math
4       
5# (i) Test Files
6#
7#   [L_O_U]+_Ag_Mg.test, where for each triple (L,O,U)
8
9#    L - length
#    O - total occurrences of length L symbols
#    U - unique occurrences of length L symbols
12#
13#    and
14#
15#    * Ag denotes the average gap (or separation) distance between symbols.
16#    * Mg denotes the maximum absolute deviation gap distance, see Absolute Deviation
17#
18# (ii) Test Meta Files
19#
20#    [L_O_U]+_Ag_Mg.meta
21#
22#    Contains test file meta data, i.e. start position,
23#    length and gap distance for each symbol contained
24#    in the generated test file.
25
# Command-line synopsis printed by main() when too few arguments are supplied.
# NOTE(review): the script name shown here differs from this file's name
# (st_test_file_generator.py) -- confirm which one is intended.
usage = "python testFileGenerator.py [L,O,U] [L,O,U] [L,O,U] ... AvgGap MaxGap"

# parallel lists: unique_symbol_counts[i] is the number of occurrences of
# unique_symbols[i] that still have to be written to the test file
unique_symbols = []
unique_symbol_counts = []

# symbol specs [[L,O,U], [L,O,U], [L,O,U], ...]
symbol_specs = []

# unique_sym_pos[i] collects the start positions at which unique_symbols[i]
# was written to the test file
unique_sym_pos = []
36
37################################################################################
38# Symbol Characters
39################################################################################
def genLowerCaseCharactersList():
    """Return a new list holding the 26 lower-case ASCII letters, 'a' through 'z'."""
    return list('abcdefghijklmnopqrstuvwxyz')
42
def genUpperCaseCharactersList():
    """Return a new list holding the 26 upper-case ASCII letters, 'A' through 'Z'."""
    return list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
45
def genDigitCharactersList():
    """Return a new list holding the ten decimal digit characters, '0' through '9'."""
    return list('0123456789')
48
def genSpecialCharactersList():
    """Return a new list holding the punctuation characters '_', ':', '-' and '.'."""
    return list('_:-.')
51
# Character repeated after each symbol in the test file to form the gap.
delim = ','
# Every character a symbol may be built from: digits, lower case, upper case
# and the special characters, in that order.
# NOTE(review): char_pool is not referenced anywhere else in the visible
# source; genUniqueSymbol builds its own equivalent list.
char_pool = genDigitCharactersList() + genLowerCaseCharactersList() + genUpperCaseCharactersList() + genSpecialCharactersList() # XML-like characters
54
class TestFileGenerator:
    """Writer for the generated "<base_filename>.test" file."""

    # Output file handle; each instance replaces it in __init__.
    test_file = None

    def __init__(self, base_filename):
        """Open base_filename + ".test" for writing."""
        output_path = base_filename + ".test"
        self.test_file = open(output_path, 'w')

    def appendUniqueSymbolAndGap(self, symbol_string, gap):
        """Write symbol_string followed by ``gap`` copies of the module-level ``delim``."""
        # Reading a module-level name needs no ``global`` declaration.
        padding = delim * gap
        self.test_file.write(symbol_string + padding)

    def close(self):
        """Close the underlying test file."""
        self.test_file.close()
67     
68
class SymbolLogger:
    """Writer for the "<base_filename>.meta" file that describes a generated test file."""

    # Meta file handle; each instance replaces it in __init__.
    meta_file = 0

    def __init__(self, base_filename, unique_symbol_total, avg_gap, max_gap):
        """Reserve the position lists, open the meta file and record the requested gaps.

        One empty list per unique symbol is appended to the module-level
        ``unique_sym_pos``; the requested average and maximum gap are written
        as the first two lines of base_filename + ".meta".
        """
        # one (initially empty) position list for every unique symbol
        unique_sym_pos.extend([] for _ in range(unique_symbol_total))

        self.meta_file = open(base_filename + ".meta", 'w')
        header = "".join(["Average Gap Distance: ", str(avg_gap),
                          "\nMaximum Gap Distance: ", str(max_gap)])
        self.meta_file.write(header)

    def logUniqueSymbolPositions(self, index):
        """Write the symbol at ``index`` and the list of positions recorded for it."""
        out = self.meta_file
        out.write("".join(["Symbol \'", unique_symbols[index], "\' appeared at positions: \n"]))
        out.write("".join([str(unique_sym_pos[index]), "\n"]))

    def actualUniqueSymbolInfo(self, length, position, gap):
        """Write one line with start, length and gap, each right-aligned to 10 columns."""
        pieces = [
            "Start: ", repr(position).rjust(10),
            " | Length: ", repr(length).rjust(10),
            " | Gap: ", repr(gap).rjust(10),
            "\n",
        ]
        self.meta_file.write("".join(pieces))

    def expectedUniqueSymbolsInfo(self, length, num_occurrences, num_unique_syms):
        """Write the requested figures for one length, then each symbol of that length with its count."""
        out = self.meta_file
        summary = "".join([
            "Length ", str(length),
            "\nNumber of occurrences: ", str(num_occurrences),
            "\nNumber of unique symbols: ", str(num_unique_syms),
            "\nList unique symbols",
        ])
        out.write(summary)
        for idx, symbol in enumerate(unique_symbols):
            # only symbols of the requested length belong to this section
            if len(symbol) != length:
                continue
            entry = "".join(["\n\tSymbol \'", symbol, "\'",
                             "\n\tNumber of occurrences: ", str(unique_symbol_counts[idx])])
            out.write(entry)
        out.write("\n")

    def appendAvgGapMaxGap(self, average_gap, max_gap):
        """Write a separator with an empty title, then the measured average and maximum gap."""
        self.metadataMsg("")
        footer = "".join(["Average gap distance: ", str(average_gap),
                          "\nMaximum gap distance: ", str(max_gap)])
        self.meta_file.write(footer)

    def metadataMsg(self, string):
        """Write a dashed separator line followed by ``string`` on its own line."""
        self.meta_file.write("".join(["\n----------\n", string, "\n"]))

    def close(self):
        """Close the underlying meta file."""
        self.meta_file.close()
108
def genUniqueSymbol(length, num_unique_sym):
    """Generate num_unique_sym distinct random symbols of the given length.

    Each symbol is built from digits, lower/upper-case letters and the special
    characters. The new symbols are appended to the module-level
    ``unique_symbols`` list and never repeat a symbol of the same length that
    is already in that list.

    Raises:
        ValueError: if fewer than num_unique_sym unused symbols of this length
            exist, which previously made the function loop forever.
    """
    # Build the pool once; it does not change between characters.
    pool = (genDigitCharactersList() + genLowerCaseCharactersList()
            + genUpperCaseCharactersList() + genSpecialCharactersList())  # XML-like characters

    # Symbols of this length registered by an earlier call must not be reused,
    # otherwise unique_symbols would hold the same string twice.
    taken = set(sym for sym in unique_symbols if len(sym) == length)

    # Refuse requests that can never be satisfied instead of retrying forever.
    available = len(pool) ** max(length, 0) - len(taken)
    if num_unique_sym > available:
        raise ValueError(
            "cannot generate %d unique symbols of length %d: only %d available"
            % (num_unique_sym, length, available))

    new_symbols = []
    while len(new_symbols) < num_unique_sym:
        candidate = "".join([random.choice(pool) for _ in range(length)])
        # set membership keeps the duplicate check O(1)
        if candidate not in taken:
            taken.add(candidate)
            new_symbols.append(candidate)
    unique_symbols.extend(new_symbols)
121
# WARNING: PRECONDITION: (num_occurences > num_unique_sym)
def genDistributionUNIFORM(num_occurences, num_unique_sym):
    """Split num_occurences as evenly as possible over num_unique_sym symbols.

    Appends num_unique_sym counts to the module-level ``unique_symbol_counts``.
    Every symbol gets ``num_occurences // num_unique_sym`` occurrences and the
    last one also receives the remainder, so the appended counts always sum
    to num_occurences.

    Raises:
        ZeroDivisionError: if num_unique_sym is 0.
    """
    # ``//`` keeps the integer division explicit on every Python version.
    base = num_occurences // num_unique_sym
    # The remainder is taken modulo the number of symbols, not modulo the
    # per-symbol count, otherwise occurrences are lost (10 over 4 gave 8).
    leftover = num_occurences % num_unique_sym
    unique_symbol_counts.extend([base] * (num_unique_sym - 1))
    unique_symbol_counts.append(base + leftover)
128   
129# WARNING: PRECONDITION: (num_occurences > num_unique_sym)
130#def genDistributionRANDOM(num_occurences, num_unique_sym):
131#    upperbound = num_occurences-num_unique_sym + 1
132#    for i in range (num_unique_sym-1):
133#        unique_symbol_counts.append(random.randint(1,upperbound))
134#        num_occurences = num_occurences - unique_symbol_counts[len(unique_symbol_counts)-1]
135#        upperbound = num_occurences - (num_unique_sym - i - 2)
136#    unique_symbol_counts.append(num_occurences)
137
def genSymbols():
    """Return the index of a randomly chosen symbol that still has occurrences left.

    The index refers to the parallel module-level lists ``unique_symbols`` and
    ``unique_symbol_counts``; every index whose count is positive is equally
    likely to be returned.

    Raises:
        ValueError: if no symbol has a positive remaining count. The previous
            retry loop never terminated once all counts had reached zero.
    """
    # Only indices valid for unique_symbols are considered.
    candidates = [idx
                  for idx, count in enumerate(unique_symbol_counts[:len(unique_symbols)])
                  if count > 0]
    if not candidates:
        raise ValueError("no symbol with remaining occurrences to choose from")
    # Choosing among the candidates directly replaces the unbounded
    # draw-and-retry loop while keeping the choice uniform.
    return random.choice(candidates)
148
def genTestFile(symbol_file_generator,symbol_logger, avg_gap, max_gap, total_symbols):
    """Write total_symbols randomly chosen symbols, each followed by a gap, and log them.

    For every symbol: an index is obtained from genSymbols(), its remaining
    count and total_symbols are decremented, a gap length is drawn from a
    pool of gap values centred on avg_gap, the symbol and gap are written
    through symbol_file_generator, and the start/length/gap are logged
    through symbol_logger. The start position is also appended to the
    module-level unique_sym_pos entry for that symbol. After the loop the
    measured average gap and the largest gap used are logged.

    Parameters:
        symbol_file_generator -- object providing appendUniqueSymbolAndGap(symbol, gap)
        symbol_logger         -- object providing actualUniqueSymbolInfo(length, position, gap)
                                 and appendAvgGapMaxGap(average, maximum)
        avg_gap               -- centre value of the generated gap lengths
        max_gap               -- largest gap length; gaps deviate from avg_gap by
                                 at most (max_gap - avg_gap)
        total_symbols         -- number of symbols to emit

    NOTE(review): if total_symbols <= 0 on entry the loop never runs and the
    final average divides by gap_counter == 0 (ZeroDivisionError).
    """
    gaps = []               # pool of gap lengths that have not been used yet
    position = 0            # start offset of the next symbol in the test file
    gap_counter = 0         # number of gaps emitted so far
    gap_total = 0           # sum of all gap lengths emitted so far
    max_unique_gap = -1     # largest gap length emitted so far
    while total_symbols > 0:
        index = genSymbols()
        sym_str = unique_symbols[index]
        # update counter
        unique_symbol_counts[index] -= 1
        total_symbols -= 1
        gap = 0

        # if we are out of gaps, then generate gaps
        if len(gaps) == 0:
                        # number of distinct gap values from avg_gap up to max_gap
                        gap_list_size = (max_gap - avg_gap) + 1

                        if total_symbols > gap_list_size:
                                # full pool: avg_gap once, plus avg_gap+i and avg_gap-i
                                # for every i in 1 .. gap_list_size-1
                                gaps.append(avg_gap)
                                for i in range(1, gap_list_size):
                                        gaps.append(avg_gap + i)
                                        gaps.append(avg_gap - i)
                            #print avg_gap - i
                        else:   
                                # few symbols left: build a smaller pool
                                # NOTE(review): raises ZeroDivisionError when gap_list_size
                                # is 0 (max_gap == avg_gap - 1) -- confirm callers exclude this.
                                remainder = total_symbols % gap_list_size
                                if remainder % 2 == 0: # k is even
                                        gaps.append(avg_gap)
                                        gaps.append(avg_gap)
                                else:
                                        gaps.append(avg_gap)

                                # NOTE(review): remainder/2 relies on Python 2 integer
                                # division; on Python 3 range() would receive a float.
                                for i in range(1, remainder/2):
                                        gaps.append(avg_gap + i)
                                        gaps.append(avg_gap - i)
                                    #print avg_gap - i

        # draw (and remove) a gap from a random position in the pool
        gap = gaps.pop(random.randint(0, len(gaps) - 1))
        gap_total += gap
        gap_counter += 1
        if max_unique_gap < gap:
            max_unique_gap = gap

        # write test file
        symbol_file_generator.appendUniqueSymbolAndGap(sym_str, gap)
        symbol_logger.actualUniqueSymbolInfo(len(sym_str),position,gap)
        unique_sym_pos[index].append(position)
        # and update counter
        position += len(sym_str) + gap
# symbol_logger gap statistic
   
    # float() forces a true division for the measured average
    symbol_logger.appendAvgGapMaxGap(float(gap_total) / gap_counter, max_unique_gap)
    return
202   
203
def main(*arguments):
    """Generate a .test file and its .meta file from command-line style arguments.

    Parameters:
        arguments -- one or more "[L,O,U]" symbol specs (length, total
                     occurrences, unique symbols) followed by the average gap
                     and the maximum gap, all given as strings.

    Prints the usage text and returns when fewer than three arguments are
    given. Calls sys.exit() when ``avg_gap - max_gap // 2`` is not strictly
    positive. The output files are named
    "[L_O_U]..._<AvgGap>_<MaxGap>.test" and ".meta".

    Both output files are closed and the module-level shared lists are
    emptied even when generation fails part-way.
    """
    num_args = len(arguments)
    if num_args < 3:
        print(usage)
        return

    avg_gap = int(arguments[num_args-2])
    print("Average Gap Value: " + str(avg_gap))

    max_gap = int(arguments[num_args-1])
    print("Maximum Gap Value: " + str(max_gap))

    # ``//`` keeps the integer division explicit on every Python version.
    if avg_gap - max_gap // 2 <= 0:
        print("Average Gap - Maximum Gap/2 must be strictly greater than 0")
        sys.exit()

    try:
        # parse unique symbol distributions
        for unique_symbol_dist_str in arguments[:num_args-2]:
            # NOTE(security): eval() executes arbitrary expressions taken from
            # the command line; only run this script with trusted arguments
            # (ast.literal_eval would be a safer parser for plain lists).
            symbol_specs.append(eval(unique_symbol_dist_str))

        print("Generating test file . . . ")

        # generate base filename
        base_filename = "".join(
            ["[" + str(spec[0]) + "_" + str(spec[1]) + "_" + str(spec[2]) + "]"
             for spec in symbol_specs])
        base_filename += "_" + str(arguments[num_args-2]) + "_" + str(arguments[num_args-1])

        # calculate total unique symbol count
        unique_symbol_total = sum([spec[2] for spec in symbol_specs])

        symbol_file_generator = TestFileGenerator(base_filename)
        try:
            symbol_logger = SymbolLogger(base_filename, unique_symbol_total, avg_gap, max_gap)
            try:
                symbol_logger.metadataMsg("Actual Result Statistics: \n")
                total_symbols = 0

                for symbol_spec in symbol_specs:
                    # generate the unique symbols and the distribution of each of the symbols
                    genUniqueSymbol(symbol_spec[0], symbol_spec[2])
                    genDistributionUNIFORM(symbol_spec[1], symbol_spec[2])
                    symbol_logger.expectedUniqueSymbolsInfo(symbol_spec[0], symbol_spec[1], symbol_spec[2])
                    total_symbols += symbol_spec[1]

                symbol_logger.metadataMsg("Actual Result Symbols: ")
                genTestFile(symbol_file_generator, symbol_logger, avg_gap, max_gap, total_symbols)

                # symbol_logger statistic of unique symbols position
                symbol_logger.metadataMsg("Unique Symbol Position: ")
                for i in range(len(unique_symbols)):
                    symbol_logger.logUniqueSymbolPositions(i)
            finally:
                symbol_logger.close()
        finally:
            # the .test file was previously never closed explicitly
            symbol_file_generator.close()

        print("Done generating test file.")
    finally:
        # reset shared variables in place so other references stay valid
        del unique_symbols[:]
        del unique_symbol_counts[:]
        del symbol_specs[:]
        del unique_sym_pos[:]
277
# Script entry point: pass the command-line arguments (without the script
# name) to main() as separate positional arguments.
if __name__ == "__main__": 
        main(*sys.argv[1:]) 
280
281
Note: See TracBrowser for help on using the repository browser.