source: perf/stream2runs/st_test_file_generator.py @ 3346

Last change on this file since 3346 was 415, checked in by ksherdy, 9 years ago

Initial check in.

File size: 9.8 KB
Line 
1import random
2import sys
3import math
4       
# This script generates 2 files.
6#
7# 1. Output Test File
8#
9#    st_test_UNIFORM_(L_O_U)_(L_O_U)_(L_O_U)_Ag_Mg, where for each (L_O_U),
10
11#    * UNIFORM denotes that equal counts of symbols of each length L are generated
12#    * L denotes length
13#    * O denotes the total number of symbols of length L
14#    * U denotes the number of unique symbols of length L
15#
16#    and
17#
18#    * Ag denotes the average gap (or separation) distance between symbols.
#    * Mg denotes the maximum gap distance; generated gaps deviate from Ag by at most (Mg - Ag).
20#
21# 2. Meta Results File
22#
23#    st_meta_test_UNIFORM_(L_O_U)_(L_O_U)_(L_O_U)_Ag_Mg
24#
#    This file contains the start position, length and gap distance for each symbol contained in the generated
#    test file, as well as the average gap distance and the maximum gap distance.
27#       
28
# Help text printed by main() when too few arguments are supplied.
usage = "python testFileGenerator.py [L,O,U] [L,O,U] [L,O,U] ... AvgGap MaxGap"

# Shared module state, filled in by the generator functions and emptied again
# at the end of main().

# Symbol specifications as given by the caller: [[L,O,U], [L,O,U], ...]
symbol_specs = list()

# unique_symbols and unique_symbol_counts are parallel lists: entry i of
# unique_symbol_counts is the number of occurrences still to be written for
# entry i of unique_symbols.
unique_symbols = list()
unique_symbol_counts = list()

# Entry i holds the start positions at which unique_symbols[i] was written.
unique_sym_pos = list()
41
class SymbolFileGenerator:
    """Writer for the generated test file ``st_test_UNIFORM_<base_filename>``.

    The file is created (or truncated) in the current working directory when
    the object is constructed.  Instances can be used as context managers so
    the handle is closed even if generation fails:

        with SymbolFileGenerator(name) as gen:
            gen.appendUniqueSymbolAndGap("123", 4)
    """

    # Class-level placeholder; replaced by the real file object in __init__.
    testFileHandle = 0

    def __init__(self, base_filename):
        """Open ``st_test_UNIFORM_<base_filename>`` for writing."""
        self.testFileHandle = open("st_test_UNIFORM_" + base_filename, 'w')

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Never swallow an exception raised inside the with-block.
        return False

    def appendUniqueSymbolAndGap(self, symbol_string, gap):
        """Write ``symbol_string`` followed by ``gap`` comma separators.

        Raises ValueError for a negative gap: ``',' * gap`` would silently
        write no separator at all, so the file would no longer match the gap
        value the caller accounts for.
        """
        if gap < 0:
            raise ValueError("gap must be non-negative, got " + repr(gap))
        self.testFileHandle.write(symbol_string + ',' * gap)

    def close(self):
        """Close the underlying file; calling it more than once is harmless."""
        self.testFileHandle.close()
52     
53
class SymbolLogger:
    """Writer for the meta results file ``st_meta_test_UNIFORM_<base_filename>``.

    The logger reads the module-level lists unique_symbols,
    unique_symbol_counts and unique_sym_pos when reporting per-symbol data.
    """

    # Class-level placeholder; replaced by the real file object in __init__.
    logFileHandle = 0

    def __init__(self, base_filename, unique_symbol_total, avg_gap, max_gap):
        """Reserve position lists and open the log with the requested gaps.

        Appends ``unique_symbol_total`` empty lists to the module-level
        unique_sym_pos, then writes the requested average and maximum gap as
        the first lines of the log file.
        """
        # One (initially empty) list of positions per unique symbol.
        unique_sym_pos.extend([] for _ in range(unique_symbol_total))

        self.logFileHandle = open("st_meta_test_UNIFORM_" + base_filename, 'w')
        header = "Average Gap Distance: %s\nMaximum Gap Distance: %s" % (avg_gap, max_gap)
        self.logFileHandle.write(header)

    def logUniqueSymbolPositions(self, index):
        """Write every recorded start position of unique_symbols[index]."""
        report = "".join([
            "Symbol '", unique_symbols[index], "' appeared at positions: \n",
            str(unique_sym_pos[index]), "\n",
        ])
        self.logFileHandle.write(report)

    def actualUniqueSymbolInfo(self, length, position, gap):
        """Write one row (start, length, gap) for a symbol that was emitted."""
        # Each value is right-aligned in a ten-character column.
        row = "Start: %10r | Length: %10r | Gap: %10r\n" % (position, length, gap)
        self.logFileHandle.write(row)

    def expectedUniqueSymbolsInfo(self, length, num_occurrences, num_unique_syms):
        """Write the requested spec and list the symbols of that length."""
        out = self.logFileHandle
        out.write("Length %s\nNumber of occurrences: %s\nNumber of unique symbols: %s\nList unique symbols"
                  % (length, num_occurrences, num_unique_syms))
        for slot, symbol in enumerate(unique_symbols):
            if len(symbol) != length:
                continue
            out.write("\n\tSymbol '" + symbol + "'"
                      + "\n\tNumber of occurrences: " + str(unique_symbol_counts[slot]))
        out.write("\n")

    def appendAvgGapMaxGap(self, average_gap, max_gap):
        """Write a separator followed by the measured average and maximum gap."""
        self.metadataMsg("")
        summary = "Average gap distance: %s\nMaximum gap distance: %s" % (average_gap, max_gap)
        self.logFileHandle.write(summary)

    def metadataMsg(self, string):
        """Write a separator line followed by ``string`` as a section title."""
        self.logFileHandle.write("\n----------\n%s\n" % (string,))

    def close(self):
        """Close the log file."""
        self.logFileHandle.close()
93
def genAlphabetCharactersList():
    """Return a new list of the 26 lowercase letters 'a' through 'z'."""
    return list("abcdefghijklmnopqrstuvwxyz")
96
def genDigitCharactersList():
    """Return a new list of the ten digit characters '0' through '9'."""
    return [str(digit) for digit in range(10)]
99
def genUniqueSymbol(length, num_unique_sym):
    """Append ``num_unique_sym`` distinct random digit strings to unique_symbols.

    Each symbol consists of ``length`` characters drawn from
    genDigitCharactersList().  New symbols are distinct from each other and
    from every symbol already present in unique_symbols.

    Raises ValueError when the request cannot be satisfied, i.e. when fewer
    than ``num_unique_sym`` unused symbols of that length exist.  Without this
    check the rejection loop below would never terminate.
    """
    digits = genDigitCharactersList()
    last_digit = len(digits) - 1
    # range() treats a negative length like zero; mirror that here.
    sym_len = max(length, 0)

    # Symbols registered by earlier calls must not be produced again.
    taken = set(unique_symbols)
    already_used = sum(1 for sym in taken if len(sym) == sym_len)
    available = len(digits) ** sym_len - already_used
    if num_unique_sym > available:
        raise ValueError(
            "cannot generate %d unique symbols of length %d: only %d available"
            % (num_unique_sym, sym_len, available))

    new_symbols = []
    while len(new_symbols) < num_unique_sym:
        candidate = "".join(
            digits[random.randint(0, last_digit)] for _ in range(sym_len))
        # Set membership keeps the duplicate check O(1) per candidate.
        if candidate not in taken:
            taken.add(candidate)
            new_symbols.append(candidate)
    unique_symbols.extend(new_symbols)
112
def genDistributionUNIFORM(num_occurences, num_unique_sym):
    """Append an even split of ``num_occurences`` to unique_symbol_counts.

    ``num_unique_sym`` counts are appended.  Every symbol receives
    ``num_occurences // num_unique_sym`` occurrences and the last one also
    receives the remainder, so the appended counts always sum to
    ``num_occurences``.  When ``num_occurences < num_unique_sym`` all but the
    last symbol get a count of 0.

    Raises ValueError if ``num_unique_sym`` is not positive.
    """
    if num_unique_sym <= 0:
        raise ValueError("num_unique_sym must be positive, got " + repr(num_unique_sym))
    # The remainder must be taken modulo the number of symbols, not modulo the
    # per-symbol share, otherwise occurrences are lost (e.g. 7 over 4 symbols).
    share, leftover = divmod(num_occurences, num_unique_sym)
    unique_symbol_counts.extend([share] * (num_unique_sym - 1))
    unique_symbol_counts.append(share + leftover)
119   
# NOTE: This function assumes that num_occurences > num_unique_sym
def genDistributionRANDOM(num_occurences, num_unique_sym):
    """Append a random split of ``num_occurences`` to unique_symbol_counts.

    ``num_unique_sym`` counts are appended.  Each of the first
    ``num_unique_sym - 1`` counts is drawn with random.randint from 1 up to a
    ceiling that leaves at least one occurrence for every symbol still to be
    assigned; the last symbol receives whatever is left.
    """
    remaining = num_occurences
    ceiling = remaining - num_unique_sym + 1
    for drawn in range(num_unique_sym - 1):
        share = random.randint(1, ceiling)
        unique_symbol_counts.append(share)
        remaining -= share
        # Keep one occurrence in reserve for each symbol after the next one.
        ceiling = remaining - (num_unique_sym - drawn - 2)
    unique_symbol_counts.append(remaining)
128
def genSymbols():
    """Return the index of a randomly chosen symbol that still has occurrences.

    Indices into unique_symbols are drawn with random.randint until one is
    found whose entry in unique_symbol_counts is greater than zero.  The loop
    only ends once such an entry is drawn, so the caller must make sure at
    least one positive count remains.
    """
    while True:
        candidate = random.randint(0, len(unique_symbols) - 1)
        if unique_symbol_counts[candidate] > 0:
            return candidate
139
def _genGapPool(avg_gap, max_gap, total_symbols):
    """Return a fresh list of gap sizes placed symmetrically around avg_gap.

    ``total_symbols`` is the number of symbols still to be written after the
    current one.  While more symbols remain than ``max_gap - avg_gap + 1`` the
    pool holds avg_gap plus every pair (avg_gap + i, avg_gap - i) for i up to
    ``max_gap - avg_gap``.  Otherwise a shorter pool is built from the
    remaining symbol count.
    """
    gap_list_size = (max_gap - avg_gap) + 1
    pool = [avg_gap]

    if total_symbols > gap_list_size:
        pair_limit = gap_list_size
    else:
        # gap_list_size is 0 only when total_symbols is 0 as well; avoid the
        # resulting 0 % 0.
        remainder = total_symbols % gap_list_size if gap_list_size else 0
        if remainder % 2 == 0:
            pool.append(avg_gap)
        # Floor division: range() needs an int on every Python version.
        pair_limit = remainder // 2

    for offset in range(1, pair_limit):
        pool.append(avg_gap + offset)
        pool.append(avg_gap - offset)
    return pool


def genTestFile(symbol_file_generator, symbol_logger, avg_gap, max_gap, total_symbols):
    """Write ``total_symbols`` symbols with random gaps and log each of them.

    For every symbol: an index is picked with genSymbols(), its count in
    unique_symbol_counts is decremented, a gap is popped at random from the
    current gap pool (refilled whenever it runs empty), the symbol and gap are
    written through ``symbol_file_generator``, and its start position is
    logged through ``symbol_logger`` and recorded in unique_sym_pos.

    Finally the measured average and maximum gap are appended to the log.  If
    no symbol was written, both are reported as zero.
    """
    gaps = []
    position = 0
    gap_counter = 0
    gap_total = 0
    max_unique_gap = -1

    while total_symbols > 0:
        index = genSymbols()
        sym_str = unique_symbols[index]
        # update counters
        unique_symbol_counts[index] -= 1
        total_symbols -= 1

        # if we are out of gaps, then generate gaps
        if not gaps:
            gaps = _genGapPool(avg_gap, max_gap, total_symbols)

        gap = gaps.pop(random.randint(0, len(gaps) - 1))
        gap_total += gap
        gap_counter += 1
        if max_unique_gap < gap:
            max_unique_gap = gap

        # write test file
        symbol_file_generator.appendUniqueSymbolAndGap(sym_str, gap)
        symbol_logger.actualUniqueSymbolInfo(len(sym_str), position, gap)
        unique_sym_pos[index].append(position)
        # and update the start position of the next symbol
        position += len(sym_str) + gap

    # symbol_logger gap statistic
    if gap_counter:
        symbol_logger.appendAvgGapMaxGap(float(gap_total) / gap_counter, max_unique_gap)
    else:
        # Nothing was written; there is no gap to average over.
        symbol_logger.appendAvgGapMaxGap(0.0, 0)
    return
195   
196
def main(*arguments):
    """Generate the test file and its meta results file.

    ``arguments`` are the command-line arguments without the program name:
    one or more ``[L,O,U]`` symbol specifications followed by the average gap
    and the maximum gap.  With fewer than three arguments the usage text is
    printed and nothing is generated.  If ``avg_gap - max_gap // 2`` is not
    strictly positive, a message is printed and the process exits via
    sys.exit().

    Both output files are closed and the shared module-level lists are emptied
    even when generation fails part-way.
    """
    num_args = len(arguments)
    print("%s %s" % (num_args, arguments))
    if num_args < 3:
        print(usage)
        return

    avg_gap = int(arguments[num_args - 2])
    print("Average Gap Value: " + str(avg_gap))

    max_gap = int(arguments[num_args - 1])
    print("Maximum Gap Value: " + str(max_gap))

    # Floor division keeps the integer semantics on every Python version.
    if avg_gap - max_gap // 2 <= 0:
        print("Average Gap - Maximum Gap/2 must be strictly greater than 0")
        sys.exit()

    symbol_file_generator = None
    symbol_logger = None
    try:
        # parse unique symbol distributions
        for unique_symbol_dist_str in arguments[:num_args - 2]:
            # NOTE(review): eval() executes arbitrary expressions taken from
            # the command line.  Only run this script with trusted arguments;
            # consider ast.literal_eval if plain [L,O,U] literals are enough.
            symbol_specs.append(eval(unique_symbol_dist_str))

        print("Generating test file . . . ")

        # generate base filename
        base_filename = "".join(
            "(" + str(lst[0]) + "_" + str(lst[1]) + "_" + str(lst[2]) + ")_"
            for lst in symbol_specs)
        base_filename += str(arguments[num_args - 2]) + "_" + str(arguments[num_args - 1])

        # calculate total number of unique symbols
        unique_symbol_total = sum(lst[2] for lst in symbol_specs)

        # construct writer and logger instances
        symbol_file_generator = SymbolFileGenerator(base_filename)
        symbol_logger = SymbolLogger(base_filename, unique_symbol_total, avg_gap, max_gap)

        symbol_logger.metadataMsg("Actual Result Statistics: \n")
        total_symbols = 0

        for symbol_spec in symbol_specs:
            # generate the unique symbols and the distribution of each of the symbols
            genUniqueSymbol(symbol_spec[0], symbol_spec[2])
            genDistributionUNIFORM(symbol_spec[1], symbol_spec[2])
            symbol_logger.expectedUniqueSymbolsInfo(symbol_spec[0], symbol_spec[1], symbol_spec[2])
            total_symbols += symbol_spec[1]

        symbol_logger.metadataMsg("Actual Result Symbols: ")
        genTestFile(symbol_file_generator, symbol_logger, avg_gap, max_gap, total_symbols)

        # symbol_logger statistic of unique symbols position
        symbol_logger.metadataMsg("Unique Symbol Position: ")
        for i in range(len(unique_symbols)):
            symbol_logger.logUniqueSymbolPositions(i)
    finally:
        # Release both output files; the test file handle was previously
        # never closed explicitly.
        if symbol_logger is not None:
            symbol_logger.close()
        if symbol_file_generator is not None:
            symbol_file_generator.close()

        # reset shared variables in place so other references stay valid
        del unique_symbols[:]
        del unique_symbol_counts[:]
        del symbol_specs[:]
        del unique_sym_pos[:]

    print("Done generating test file")
270
# Script entry point: pass the command-line arguments, minus the program
# name, to main() as positional arguments.
if __name__ == "__main__":
    main(*sys.argv[1:])
273
274
Note: See TracBrowser for help on using the repository browser.