source: tags/symbol_table-0.39/test/st_test_file_generator.py @ 2361

Last change on this file since 2361 was 2105, checked in by ksherdy, 7 years ago

Updated test file generator script to prepend a delimiter character.

File size: 9.8 KB
Line 
1import random
2import sys
3import math
4       
5# (i) Test Files
6#
7#   [L_O_U]+_Ag_Mg.test, where for each triple (L,O,U)
8
9#    L - length
10#    O - total occurrences of length L symbols
11#    U - unique occurrences of length L symbols
12#
13#    and
14#
15#    * Ag denotes the average gap (or separation) distance between symbols.
16#    * Mg denotes the maximum absolute deviation gap distance, see Absolute Deviation
17#
18# (ii) Test Meta Files
19#
20#    [L_O_U]+_Ag_Mg.meta
21#
22#    Contains test file meta data, i.e. start position,
23#    length and gap distance for each symbol contained
24#    in the generated test file.
25
26usage = "python testFileGenerator.py (L,O,U) (L,O,U) (L,O,U) ... AvgGap MaxGap"
27
28# parallel lists
29unique_symbols = [] 
30unique_symbol_counts = []
31
32# symbol specs [[L,O,U], [L,O,U], [L,O,U], ...]
33symbol_specs = []
34
35unique_sym_pos = []
36
37################################################################################
38# Symbol Characters
39################################################################################
40def genLowerCaseCharactersList():
41    return ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
42
43def genUpperCaseCharactersList():
44    return ['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z']
45
46def genDigitCharactersList():
47    return ['0','1','2','3','4','5','6','7','8','9']
48
49def genSpecialCharactersList():
50    return ['_',':','-','.']
51
52delim = ','
53char_pool = genDigitCharactersList() + genLowerCaseCharactersList() + genUpperCaseCharactersList() + genSpecialCharactersList() # XML-like characters
54
55class TestFileGenerator:
56    test_file = None
57    def __init__(self,base_filename):   
58        self.test_file = open(base_filename+".test", 'w')
59        self.test_file.write(delim)
60       
61    def appendUniqueSymbolAndGap(self, symbol_string, gap):
62        global delim
63        delims = delim*gap
64        self.test_file.write(symbol_string + delims)
65       
66    def close(self):
67        self.test_file.close()
68     
69
70class SymbolLogger:
71    meta_file = 0
72
73    def __init__(self,base_filename,unique_symbol_total,avg_gap, max_gap):
74
75        #prepare for unique symbol position list
76        for i in range(unique_symbol_total):
77            unique_sym_pos.append([])
78
79        self.meta_file = open (base_filename+".meta", 'w')
80        self.meta_file.write("Average Gap Distance: " + str(avg_gap) + "\nMaximum Gap Distance: " + str(max_gap))
81
82    def logUniqueSymbolPositions(self, index):
83        self.meta_file.write("Symbol \'" + unique_symbols[index] + "\' appeared at positions: \n")
84        self.meta_file.write(str(unique_sym_pos[index]) + "\n")
85
86    def actualUniqueSymbolInfo(self,length,position,gap):
87        self.meta_file.write("Start: " + repr(position).rjust(10) + " | Length: " + repr(length).rjust(10) + " | Gap: " + repr(gap).rjust(10) + "\n")
88
89    def expectedUniqueSymbolsInfo(self, length, num_occurrences, num_unique_syms):
90        self.meta_file.write("Length " + str(length)
91                       + "\nNumber of occurrences: " + str(num_occurrences)
92                       + "\nNumber of unique symbols: " + str(num_unique_syms)
93                       + "\nList unique symbols")
94        for i in range (len(unique_symbols)):
95            if len(unique_symbols[i]) == length:
96                self.meta_file.write ("\n\tSymbol \'" + unique_symbols[i] + "\'"
97                                   + "\n\tNumber of occurrences: " + str(unique_symbol_counts[i]))
98        self.meta_file.write("\n")
99
100    def appendAvgGapMaxGap(self,average_gap, max_gap):
101        self.metadataMsg("")
102        self.meta_file.write("Average gap distance: " + str(average_gap) + "\nMaximum gap distance: " + str(max_gap))
103
104    def metadataMsg(self,string):
105        self.meta_file.write("\n----------\n" + string +"\n")
106           
107    def close(self):
108        self.meta_file.close()
109
110def genUniqueSymbol(length, num_unique_sym):
111    new_symbols = []
112    while (len(new_symbols) < num_unique_sym):
113        tempStr = ""
114        for j in range (length):
115            sym_lst = genDigitCharactersList() + genLowerCaseCharactersList() + genUpperCaseCharactersList() + genSpecialCharactersList() # XML-like characters
116            index = random.randint (0, len(sym_lst)-1)
117            tempStr += sym_lst[index]
118        #search for duplicates
119        if (new_symbols.count(tempStr) == 0):
120            new_symbols.append(tempStr)
121    unique_symbols.extend(new_symbols)
122
123# WARNING: PRECONDITION: (num_occurences > num_unique_sym)
124def genDistributionUNIFORM(num_occurences, num_unique_sym):
125    k = num_occurences/num_unique_sym
126    for i in range (num_unique_sym-1):
127        unique_symbol_counts.append(k)
128    unique_symbol_counts.append(k+(num_occurences % k))
129   
130# WARNING: PRECONDITION: (num_occurences > num_unique_sym)
131#def genDistributionRANDOM(num_occurences, num_unique_sym):
132#    upperbound = num_occurences-num_unique_sym + 1
133#    for i in range (num_unique_sym-1):
134#        unique_symbol_counts.append(random.randint(1,upperbound))
135#        num_occurences = num_occurences - unique_symbol_counts[len(unique_symbol_counts)-1]
136#        upperbound = num_occurences - (num_unique_sym - i - 2)
137#    unique_symbol_counts.append(num_occurences)
138
139def genSymbols():
140    index = -1
141    symbol_counter = 0
142
143    while symbol_counter <= 0:
144        # decide which unique symbol to get
145        index = random.randint (0, len(unique_symbols)-1)
146        symbol_counter = unique_symbol_counts[index]
147
148    return index
149
150def genTestFile(symbol_file_generator,symbol_logger, avg_gap, max_gap, total_symbols):
151    gaps = []
152    position = 0
153    gap_counter = 0
154    gap_total = 0
155    max_unique_gap = -1
156    while total_symbols > 0:
157        index = genSymbols()
158        sym_str = unique_symbols[index]
159        # update counter
160        unique_symbol_counts[index] -= 1
161        total_symbols -= 1
162        gap = 0
163
164        # if we are out of gaps, then generate gaps
165        if len(gaps) == 0:
166                        gap_list_size = (max_gap - avg_gap) + 1
167
168                        if total_symbols > gap_list_size:
169                                gaps.append(avg_gap)
170                                for i in range(1, gap_list_size):
171                                        gaps.append(avg_gap + i)
172                                        gaps.append(avg_gap - i)
173                            #print avg_gap - i
174                        else:   
175                                remainder = total_symbols % gap_list_size
176                                if remainder % 2 == 0: # k is even
177                                        gaps.append(avg_gap)
178                                        gaps.append(avg_gap)
179                                else:
180                                        gaps.append(avg_gap)
181
182                                for i in range(1, remainder/2):
183                                        gaps.append(avg_gap + i)
184                                        gaps.append(avg_gap - i)
185                                    #print avg_gap - i
186
187        gap = gaps.pop(random.randint(0, len(gaps) - 1))
188        gap_total += gap
189        gap_counter += 1
190        if max_unique_gap < gap:
191            max_unique_gap = gap
192
193        # write test file
194        symbol_file_generator.appendUniqueSymbolAndGap(sym_str, gap)
195        symbol_logger.actualUniqueSymbolInfo(len(sym_str),position,gap)
196        unique_sym_pos[index].append(position)
197        # and update counter
198        position += len(sym_str) + gap
199# symbol_logger gap statistic
200   
201    symbol_logger.appendAvgGapMaxGap(float(gap_total) / gap_counter, max_unique_gap)
202    return
203   
204
205def main(*arguments):
206
207    num_args = len(arguments)
208    if num_args < 3:
209        print usage
210    else:
211        avg_gap = int(arguments[num_args-2])
212        print "Average Gap Value: " + str(avg_gap)
213
214        max_gap = int(arguments[num_args-1])
215        print "Maximum Gap Value: " + str(max_gap)
216       
217        if avg_gap - max_gap/2 <= 0 :
218            print "Average Gap - Maximum Gap/2 must be strictly greater than 0"
219            sys.exit()
220
221        # parse unique symbol distributions
222        for i in range (num_args-2):
223            unique_symbol_dist_str = arguments[i]
224            lst = eval(unique_symbol_dist_str)
225            symbol_specs.append(lst)
226
227        print "Generating test file . . . "
228
229        # generate base filename
230        base_filename = ""
231        for lst in symbol_specs:
232            base_filename += "(" + str(lst[0]) + "_" + str(lst[1]) + "_" + str(lst[2]) + ")_"
233        base_filename += str(arguments[num_args-2]) + "_" + str(arguments[num_args-1])
234
235        # calculate total symbol count
236        unique_symbol_total = 0       
237        for lst in symbol_specs:
238            unique_symbol_total += lst[2]       
239         
240        # construct logger instance
241        symbol_file_generator = TestFileGenerator(base_filename)
242        symbol_logger = SymbolLogger(base_filename, unique_symbol_total, avg_gap, max_gap)
243               
244        symbol_logger.metadataMsg("Actual Result Statistics: \n")
245        total_symbols = 0
246
247        for symbol_spec in symbol_specs:
248            # generate the unique symbols and the distribution of each of the symbols
249            genUniqueSymbol(symbol_spec[0], symbol_spec[2])
250            genDistributionUNIFORM(symbol_spec[1], symbol_spec[2])
251            symbol_logger.expectedUniqueSymbolsInfo(symbol_spec[0], symbol_spec[1], symbol_spec[2])
252            total_symbols += symbol_spec[1]
253
254        #print "Unique Symbols: " + str(unique_symbols)
255        #print "Unique Symbols Distribution: " + str(unique_symbol_counts)
256
257        symbol_logger.metadataMsg("Actual Result Symbols: ")
258        genTestFile(symbol_file_generator,symbol_logger, avg_gap, max_gap, total_symbols)
259
260        # symbol_logger statistic of unique symbols position
261        symbol_logger.metadataMsg("Unique Symbol Position: ")
262        for i in range (len(unique_symbols)):
263            symbol_logger.logUniqueSymbolPositions(i)
264
265        symbol_logger.close()
266
267        print "Done generating test file."
268
269        # reset shared variables
270        for i in range (len(unique_symbols)):
271            unique_symbols.pop()
272        for i in range (len(unique_symbol_counts)):
273            unique_symbol_counts.pop()
274        for i in range (len(symbol_specs)):
275            symbol_specs.pop()
276        for i in range (len(unique_sym_pos)):
277            unique_sym_pos.pop()
278
279if __name__ == "__main__": 
280        main(*sys.argv[1:]) 
281
282
Note: See TracBrowser for help on using the repository browser.