source: proto/SymbolTable/wcd_pbgs_div_template.cpp @ 1766

Last change on this file since 1766 was 1741, checked in by vla24, 8 years ago

SymbolTable?: Fixed custom hashing function for parallel bitstream based grouping

File size: 11.2 KB
Line 
1#include "../common_definitions.h"
2#include <pbgs_div_symbol_table.h>
3
4#include "../wcd_common_functions.h"
5#include "../symtab_common_functions.h"
6#include "parser_common_functions_generated.h"
7
8#ifdef BUFFER_PROFILING
9        BOM_Table * parser_timer;
10
11#elif CODE_CLOCKER
12        #define NUM_EVENTS 1
13        int Events[NUM_EVENTS] = {PAPI_TOT_CYC};
14        //int Events[NUM_EVENTS] = {PAPI_L2_DCM};
15        //int Events[NUM_EVENTS] = {PAPI_TOT_CYC, PAPI_BR_MSP};
16        int cal_size = 20;
17        CC * parser_timer = new CC(Events,NUM_EVENTS,cal_size);
18#else
19        void * parser_timer;
20#endif
21
// File-level state shared between do_process(), postprocess_do_block() and
// the *_grouping helpers below.

// Offset of the block currently being processed, relative to `source`
// (do_process sets it before each s2p_do_block call).
22int block_base=0;
// Copy of do_process's running buf_pos; in this file it is only read by the
// DEBUG trace in ElemEnd_grouping.
23int buffer_base=0;
// Not referenced in the portion of the file shown here.
24int buffer_last;
// Start of the character data being scanned (do_process points it at srcbuf).
25char * source;
26
// Word-end stream of the current block (copied from dictionary.Word_ends).
27BitBlock elem_ends;
// Start offset of a long symbol whose end was not found in its own block;
// stored relative to the *following* block (hence usually negative).
28int last_elem_start;
// Set by validate_block_length_grouping<17> when such a symbol is pending,
// cleared by postprocess_do_block once it has been looked up.
29bool block_boundary_case = false;
// [1] holds the hash data of the current block, [0] that of the previous one
// (postprocess_do_block copies [1] into [0] when it finishes a block).
30BytePack hashvalues[2];
31
// One id appended for every symbol passed to the symbol table.
32vector <int> gids;
// Symbol table queried through Lookup_or_Insert_Name.
33PBGSDivSymbolTable pbgs_symbol_table;
34
35static inline void postprocess_do_block(Dictionary& dictionary, Hash_data hash_data);
36template<bool allow_performance_check> void do_process(FILE *infile, FILE *outfile);
37
38template <int L> static inline void validate_block_length_grouping(BitBlockForwardIterator & start, int block_base);
39
40static inline int ElemStart_grouping(int start_pos, int lgth); // lgth > 16
41template <int L> static inline int ElemEnd_grouping(int pos, int length);
42
43int main(int argc, char * argv[]) {
44    char * dictionaryfilename, * infilename, * outfilename;
45    FILE * dictionaryfile, *infile, *outfile;
46
47    getFilenames(argc, argv, dictionaryfilename, infilename, outfilename);
48    openInputOutputFiles(dictionaryfilename, infilename, outfilename,
49                         dictionaryfile, infile, outfile);
50
51    int greatest_GID_in_dictionary;
52    populateDictionary(dictionaryfile, outfile, gids, greatest_GID_in_dictionary);
53
54//      PERF_SEC_BIND(1);
55
56    PERF_SEC_INIT(parser_timer);
57
58    // store symbols form text to Symbol Table
59    do_process<true>(infile, outfile);
60
61    PERF_SEC_DUMP(parser_timer);
62
63    PERF_SEC_DESTROY(parser_timer);
64
65    // gather dictionary statistics
66    int totalKnownWordsInDictionary, totalUnknownWordsInDictionary;
67    wordCountInDictionary(greatest_GID_in_dictionary, gids, totalUnknownWordsInDictionary, totalKnownWordsInDictionary);
68    printWordCountInDictionary(totalUnknownWordsInDictionary, totalKnownWordsInDictionary);
69
70#if PRINT_SYMBOL_DISTRIBUTION
71//    print_GIDS();
72    pbgs_symbol_table.Print_Symbol_Table_Distribution();
73#endif
74
75    fclose(dictionaryfile);
76    fclose(infile);
77    fclose(outfile);
78
79    return(0);
80}
81
82// length in [1,16]
83template <int L>
84static inline int ElemEnd_grouping(int end) {
85    int start = end - L;
86    int hashvalue = compute_hash_value(L, start - block_base, hashvalues);
87    int gid = pbgs_symbol_table.Lookup_or_Insert_Name<L>(source + start, hashvalue);
88    gids.push_back(gid);
89#if DEBUG
90    char* symbol = new char[L+1];
91    strncpy ( symbol, source + start, L );
92    symbol[L] ='\0';
93    printf ("%s | start: %i[%i] | end: %i[%i] | lgth: %i | gid: %i | hashvalue: %i | symbol: %s\n", __FUNCTION__, start, start-buffer_base, end, end-buffer_base, L, gid, hashvalue, symbol );
94    delete symbol; symbol = 0;
95#endif
96    return 0;
97}
98
99// length > 16
100static inline int ElemStart_grouping(int start, int lgth) {
101    int hashvalue = compute_hash_value(lgth, start - block_base, hashvalues);
102    int gid = pbgs_symbol_table.Lookup_or_Insert_Name(source + start, hashvalue, lgth);
103    gids.push_back(gid);
104#if DEBUG
105    char* symbol = new char[lgth+1];
106    strncpy ( symbol, source + start, lgth );
107    symbol[lgth] ='\0';
108    printf ("%s | start: %i[%i] | lgth: %i | hashvalue: %i | gid: %i | symbol: %s\n", __FUNCTION__, start, start - block_base, lgth, hashvalue, gid, symbol);
109#endif
110    return 0;
111}
112
113// L = 2, pass in bitstream for symbols length [1,2]
114// L = 4, pass in bitstream for symbols length [3,4]
115// L = 6, pass in bitstream for symbols length [5,6]
116// L = 8, pass in bitstream for symbols length [7,8]
117// L = 10, pass in bitstream for symbols length [9,10]
118// L = 12, pass in bitstream for symbols length [11,12]
119// L = 14, pass in bitstream for symbols length [13,14]
120// L = 16, pass in bitstream for symbols length [15,16]
121// L = 17, pass in bitstream for symbols length longer than 16
122template <int L>
123static inline void validate_block_length_grouping(BitBlockForwardIterator & start, int block_base) {
124
125    BitBlockForwardIterator end;
126    int end_pos;
127
128    while(start != end) {
129        end_pos = *start;
130        ElemEnd_grouping<L>(end_pos + block_base);
131        start++;
132    }
133}
134
135template <>
136inline void validate_block_length_grouping<17>(BitBlockForwardIterator & start, int block_base) {
137
138    BitBlockForwardIterator end;
139    int start_pos, end_pos;
140
141    while(start != end) {
142        start_pos = *start;
143        end_pos = ScanForwardPos (&elem_ends, start_pos);
144        if (end_pos)
145        {
146            ElemStart_grouping(start_pos - 16 + block_base, end_pos - start_pos + 16);
147        }
148        else
149        {
150#if DEBUG
151            printf ("There is no more 1 bit in the block. pos: %i | sym: %c%c[%c]\n", start_pos,
152                    source[start_pos + block_base-2], source[start_pos + block_base-1], source[start_pos + block_base]);
153#endif
154            //handle boundary case
155            block_boundary_case = true;
156            last_elem_start = start_pos - 16 - BLOCK_SIZE;
157#if DEBUG
158            printf ("last_elem_start: %i\n", last_elem_start);
159#endif
160        }
161        start++;
162    }
163}
164
165static inline void postprocess_do_block(Dictionary& dictionary, Hash_data hash_data){
166
167    elem_ends = dictionary.Word_ends;
168    hashvalues[1] = hash_data.Hash_value;
169
170    // Check for block boundary case for length 16 and above
171    if (block_boundary_case)
172    {
173#if DEBUG
174        printf ("block boundary case! Special handle!\n");
175#endif
176        int lgth = count_forward_zeroes(elem_ends)-last_elem_start;
177        int start = block_base + last_elem_start;
178        int hashvalue = compute_hash_value(lgth, last_elem_start, hashvalues);
179        int gid = pbgs_symbol_table.Lookup_or_Insert_Name(source + start, hashvalue, lgth);
180        gids.push_back(gid);
181#if DEBUG
182        printf ("%s | start: %i[%i] | lgth: %i | hashvalue: %i | gid: %i \n", __FUNCTION__, start, start - block_base, lgth, hashvalue, gid);
183#endif
184        block_boundary_case = false;
185    }
186
187    if ( bitblock_has_bit(dictionary.Word_ends_1_to_2) )
188    {
189        BitBlockForwardIterator iter_length_grouping_2(&dictionary.Word_ends_1_to_2);
190        validate_block_length_grouping<2>(iter_length_grouping_2, block_base);
191    }
192
193    if ( bitblock_has_bit(dictionary.Word_ends_3_to_4) )
194    {
195        BitBlockForwardIterator iter_length_grouping_4(&dictionary.Word_ends_3_to_4);
196        validate_block_length_grouping<4>(iter_length_grouping_4, block_base);
197    }
198
199    if ( bitblock_has_bit(dictionary.Word_ends_5_to_6) )
200    {
201        BitBlockForwardIterator iter_length_grouping_6(&dictionary.Word_ends_5_to_6);
202        validate_block_length_grouping<6>(iter_length_grouping_6, block_base);
203    }
204
205    if ( bitblock_has_bit(dictionary.Word_ends_7_to_8) )
206    {
207        BitBlockForwardIterator iter_length_grouping_8(&dictionary.Word_ends_7_to_8);
208        validate_block_length_grouping<8>(iter_length_grouping_8, block_base);
209    }
210
211    if ( bitblock_has_bit(dictionary.Word_ends_9_to_10) )
212    {
213        BitBlockForwardIterator iter_length_grouping_10(&dictionary.Word_ends_9_to_10);
214        validate_block_length_grouping<10>(iter_length_grouping_10, block_base);
215    }
216
217    if ( bitblock_has_bit(dictionary.Word_ends_11_to_12) )
218    {
219        BitBlockForwardIterator iter_length_grouping_12(&dictionary.Word_ends_11_to_12);
220        validate_block_length_grouping<12>(iter_length_grouping_12, block_base);
221    }
222
223    if ( bitblock_has_bit(dictionary.Word_ends_13_to_14) )
224    {
225        BitBlockForwardIterator iter_length_grouping_14(&dictionary.Word_ends_13_to_14);
226        validate_block_length_grouping<14>(iter_length_grouping_14, block_base);
227    }
228
229    if ( bitblock_has_bit(dictionary.Word_ends_15_to_16) )
230    {
231        BitBlockForwardIterator iter_length_grouping_16(&dictionary.Word_ends_15_to_16);
232        validate_block_length_grouping<16>(iter_length_grouping_16, block_base);
233    }
234
235    if ( bitblock_has_bit(dictionary.Word_remaining_ends) )
236    {
237        BitBlockForwardIterator iter_length_grouping_remaining(&dictionary.Word_remaining_ends);
238        validate_block_length_grouping<17>(iter_length_grouping_remaining, block_base);
239    }
240
241    //copy current hash value data as previous one.
242    memmove (&hashvalues[0], &hashvalues[1], 16);
243}
244
245template<bool allow_performance_check> void do_process(FILE *infile, FILE *outfile) {
246
247@decl
248
249  int buf_pos = 0;
250  int block_pos = 0;
251  int errpos = 0;
252  int chars_avail = 0;
253  int check_pos = 0;
254  int chars_read = 0;
255  BytePack buf[(BUFFER_SIZE+BLOCK_SIZE+OVERLAP_BUFSIZE*2)/sizeof(SIMD_type)];
256
257  char * srcbuf = ((char *) buf) + OVERLAP_BUFSIZE;
258  buffer_base = buf_pos;
259  source = srcbuf;
260
261  chars_read = fread((void *)srcbuf, 1, BUFFER_SIZE + OVERLAP_BUFSIZE, infile);
262  chars_avail = chars_read;
263  if (chars_avail > BUFFER_SIZE) chars_avail = BUFFER_SIZE;
264
265  Entity_Info * e = new Entity_Info;
266  e->AnalyzeSignature((unsigned char *)srcbuf);
267
268  if (e->content_start != 0) {
269        memmove(&srcbuf[0], &srcbuf[e->content_start], chars_read - e->content_start);
270        buf_pos = e->content_start;
271        buffer_base = buf_pos;
272        if (chars_avail == BUFFER_SIZE) {
273                chars_read = chars_read - e->content_start +
274                             fread(&srcbuf[chars_read-e->content_start], 1, e->content_start, infile);
275                chars_avail = chars_read;
276                if (chars_avail > BUFFER_SIZE) chars_avail = BUFFER_SIZE;
277        }
278        else {
279          chars_read -=e->content_start;
280          chars_avail -=e->content_start;
281        }
282  }
283
284@stream_stmts
285
286/* Full Buffers */
287
288    while (chars_avail == BUFFER_SIZE) {
289      if (allow_performance_check)
290      {
291        PERF_SEC_START(parser_timer);
292      }
293
294      for (int blk = 0; blk < SEGMENT_BLOCKS; blk++) {
295          block_base = blk*BLOCK_SIZE;
296          s2p_do_block((BytePack *) &srcbuf[block_base], basis_bits);
297          @block_stmts
298          postprocess_do_block(dictionary, hash_data);
299      }
300
301      if (allow_performance_check)
302      {
303        PERF_SEC_END(parser_timer, chars_avail);
304      }
305      int bytes_left = chars_read - chars_avail;
306      memmove(buf, &srcbuf[BUFFER_SIZE - OVERLAP_BUFSIZE], bytes_left + OVERLAP_BUFSIZE);
307      chars_read = fread(&srcbuf[bytes_left],1, BUFFER_SIZE + OVERLAP_BUFSIZE - bytes_left, infile) + bytes_left;
308      chars_avail = chars_read;
309      if (chars_avail > BUFFER_SIZE) chars_avail = BUFFER_SIZE;
310      buf_pos += chars_avail;
311      buffer_base = buf_pos;
312    }
313/* Final Partial Buffer */
314    if (allow_performance_check)
315    {
316        PERF_SEC_START(parser_timer);
317    }
318
319    block_pos = 0;
320    int remaining = chars_avail;
321/* Full Blocks */
322    while (remaining >= BLOCK_SIZE) {
323          block_base = block_pos;
324          s2p_do_block((BytePack *) &srcbuf[block_pos], basis_bits);
325          @block_stmts
326          postprocess_do_block(dictionary, hash_data);
327          block_pos += BLOCK_SIZE;
328          remaining -= BLOCK_SIZE;
329    }
330    block_base = block_pos;
331    if (remaining > 0 || @any_carry) {
332          EOF_mask = bitblock::srl(simd<1>::constant<1>(), convert(BLOCK_SIZE-remaining));
333          s2p_do_final_block((BytePack *) &srcbuf[block_pos], basis_bits, EOF_mask);
334          @final_block_stmts
335          postprocess_do_block(dictionary, hash_data);
336    }
337    buf_pos += chars_avail;
338    buffer_base = buf_pos;
339    if (allow_performance_check)
340    {
341        PERF_SEC_END(parser_timer, chars_avail);
342    }
343}
Note: See TracBrowser for help on using the repository browser.