Ignore:
Timestamp:
Nov 21, 2011, 4:09:54 PM (8 years ago)
Author:
vla24
Message:

SymbolTable: completed dictionary implementation and refactored templates

File:
1 edited

Legend:

Unmodified
Added
Removed
  • proto/SymbolTable/wcd_pbgs_log_template.cpp

    r1688 r1721  
    11#define USE_MASK_COMPARE    //Comparison using masking technique.
    22
    3 #include "../symtab_global.h"
     3#include "../common_definitions.h"
    44#include <pbgs_log_symbol_table.h>
     5
     6#include "../wcd_common_functions.h"
     7#include "../symtab_common_functions.h"
     8#include "parser_common_functions_generated.h"
    59
    610#ifdef BUFFER_PROFILING
     
    2226int buffer_last;
    2327char * source;
    24 BitBlock EOF_mask = simd<1>::constant<1>();
     28
    2529BitBlock elem_starts;
    2630int previous_block_last_elem_start;
     
    3034PBGSLogSymbolTable pbgs_symbol_table;
    3135
    32 @global
    33 
    34 static inline void s2p_do_block(BytePack U8[], Basis_bits & basis_bits);
    35 static inline void s2p_do_final_block(BytePack U8[], Basis_bits & basis_bits, BitBlock EOF_mask);
    3636static inline void postprocess_do_block(Dictionary& dictionary, Hash_data hash_data);
    37 
    38 void do_process(FILE *infile, FILE *outfile);
     37template<bool allow_performance_check> void do_process(FILE *infile, FILE *outfile);
    3938
    4039template <int L> static inline void validate_block_length_grouping(BitBlockForwardIterator & start, int block_base);
    41 
    42 static inline int ScanBackwardPos(BitBlock * block, int pos);
    43 static inline int compute_hash_value (int lgth, int start);
    4440template <int L> static inline int ElemEnd_grouping(int pos, int length);
    45 template <int L> static inline int StreamScanLengthGrouping(ScanBlock * stream, int blk_count);
    4641
    4742int main(int argc, char * argv[]) {
    48         char * infilename, * outfilename;
    49         FILE *infile, *outfile;
    50         struct stat fileinfo;
    51 
    52         if (argc < 2) {
    53                 printf("Usage: %s <filename> [<outputfile>]\n", argv[0]);
    54                 exit(-1);
    55         }
    56 
    57         infilename = argv[1];
    58         stat(infilename, &fileinfo);
    59         infile = fopen(infilename, "rb");
    60         if (!infile) {
    61                 fprintf(stderr, "Error: cannot open %s for input.\n", infilename);
    62                 exit(-1);
    63         }
    64 
    65         if (argc < 3) outfile = stdout;
    66         else {
    67                 outfilename = argv[2];
    68                 outfile = fopen(outfilename, "wb");
    69                 if (!outfile) {
    70                         fprintf(stderr, "Error: cannot open %s for writing.\n", outfilename);
    71                         exit(-1);
    72                 }
    73         }
     43    char * dictionaryfilename, * infilename, * outfilename;
     44    FILE * dictionaryfile, *infile, *outfile;
     45
     46    getFilenames(argc, argv, dictionaryfilename, infilename, outfilename);
     47    openInputOutputFiles(dictionaryfilename, infilename, outfilename,
     48                         dictionaryfile, infile, outfile);
     49
     50    int greatest_GID_in_dictionary;
     51    populateDictionary(dictionaryfile, outfile, gids, greatest_GID_in_dictionary);
    7452
    7553//      PERF_SEC_BIND(1);
    7654
    77         PERF_SEC_INIT(parser_timer);
    78 
    79         do_process(infile, outfile);
    80 
    81         PERF_SEC_DUMP(parser_timer);
    82 
    83         PERF_SEC_DESTROY(parser_timer);
    84 
    85         fclose(infile);
    86         fclose(outfile);
    87 
    88         return(0);
    89 }
    90 
    91 /* s2p Definitions */
    92 static inline void s2p_do_block(BytePack U8[], Basis_bits & basis_bits) {
    93   s2p(U8[0], U8[1], U8[2], U8[3], U8[4], U8[5], U8[6], U8[7],
    94         basis_bits.bit_0, basis_bits.bit_1, basis_bits.bit_2, basis_bits.bit_3, basis_bits.bit_4, basis_bits.bit_5, basis_bits.bit_6, basis_bits.bit_7);
    95 }
    96 
    97 static inline void s2p_do_final_block(BytePack U8[], Basis_bits & basis_bits, BitBlock EOF_mask) {
    98   s2p_do_block(U8, basis_bits);
    99   basis_bits.bit_0 = simd_and(basis_bits.bit_0, EOF_mask);
    100   basis_bits.bit_1 = simd_and(basis_bits.bit_1, EOF_mask);
    101   basis_bits.bit_2 = simd_and(basis_bits.bit_2, EOF_mask);
    102   basis_bits.bit_3 = simd_and(basis_bits.bit_3, EOF_mask);
    103   basis_bits.bit_4 = simd_and(basis_bits.bit_4, EOF_mask);
    104   basis_bits.bit_5 = simd_and(basis_bits.bit_5, EOF_mask);
    105   basis_bits.bit_6 = simd_and(basis_bits.bit_6, EOF_mask);
    106   basis_bits.bit_7 = simd_and(basis_bits.bit_7, EOF_mask);
    107 }
    108 
    109 
    110 static inline int ScanBackwardPos(BitBlock * block, int pos)
    111 {
    112     BitBlock s = block[0];
    113     BitBlock temp = simd_and( s, simd_not(simd<128>::sll(simd<2>::constant<3>(), convert(pos))) );
    114 
    115     if (bitblock_has_bit(temp))
    116     {
    117         // sizeof (BitBlock)*8 - cbzl( s & ~(~0 << pos)) - 1;
    118         return sizeof(BitBlock)*8 - count_reverse_zeroes (temp) - 1;
    119     }
    120     else
    121     {
    122         //handle boundary case
    123 #if DEBUG
    124         printf ("%s | block boundary case, return %i\n", __FUNCTION__, previous_block_last_elem_start - 1);
    125 #endif
    126         return previous_block_last_elem_start - 1;
    127     }
    128 }
    129 
    130 static inline int compute_hash_value (int lgth, int start)
    131 {
    132     unsigned int offset_bit = start + 128;
    133     uint64_t stream = *((uint64_t*)(((uint32_t*)hashvalues)+(offset_bit>>5)));
    134     return stream >> (offset_bit & 0x1F) & ~(~0 << lgth);
     55    PERF_SEC_INIT(parser_timer);
     56
     57    // store symbols from text to Symbol Table
     58    do_process<true>(infile, outfile);
     59
     60    PERF_SEC_DUMP(parser_timer);
     61
     62    PERF_SEC_DESTROY(parser_timer);
     63
     64    // gather dictionary statistics
     65    int totalKnownWordsInDictionary, totalUnknownWordsInDictionary;
     66    wordCountInDictionary(greatest_GID_in_dictionary, gids, totalUnknownWordsInDictionary, totalKnownWordsInDictionary);
     67    printWordCountInDictionary(totalUnknownWordsInDictionary, totalKnownWordsInDictionary);
     68
     69    fclose(dictionaryfile);
     70    fclose(infile);
     71    fclose(outfile);
     72
     73#if PRINT_SYMBOL_DISTRIBUTION
     74//    print_GIDS();
     75    pbgs_symbol_table.Print_Symbol_Table_Distribution();
     76#endif
    13577}
    13678
     
    14486inline int ElemEnd_grouping<1>(int pos, int length) {
    14587    int start = block_base + pos - length;
    146     int hashvalue = compute_hash_value(length, start - block_base);
     88    int hashvalue = compute_hash_value(length, start - block_base, hashvalues);
    14789    int gid = pbgs_symbol_table.Lookup_or_Insert_Name_1(source + start, hashvalue);
    14890    gids.push_back(gid);
     
    163105inline int ElemEnd_grouping<2>(int pos, int length) {
    164106    int start = block_base + pos - length;
    165     int hashvalue = compute_hash_value(length, start - block_base);
     107    int hashvalue = compute_hash_value(length, start - block_base, hashvalues);
    166108    int gid = pbgs_symbol_table.Lookup_or_Insert_Name_2(source + start, hashvalue);
    167109    gids.push_back(gid);
     
    183125inline int ElemEnd_grouping<4>(int pos, int L) {
    184126    int start = pos + block_base;
    185     int hashvalue = compute_hash_value(L, pos);
     127    int hashvalue = compute_hash_value(L, pos, hashvalues);
    186128    int gid = pbgs_symbol_table.Lookup_or_Insert_Name_4(source + start, hashvalue, L);
    187129    gids.push_back(gid);
     
    201143inline int ElemEnd_grouping<8>(int pos,  int L) {
    202144    int start = pos + block_base;
    203     int hashvalue = compute_hash_value(L, pos);
     145    int hashvalue = compute_hash_value(L, pos, hashvalues);
    204146    int gid = pbgs_symbol_table.Lookup_or_Insert_Name_8(source + start, hashvalue, L);
    205147    gids.push_back(gid);
     
    219161inline int ElemEnd_grouping<16>(int pos, int L) {
    220162    int start = pos + block_base;
    221     int hashvalue = compute_hash_value(L, pos);
     163    int hashvalue = compute_hash_value(L, pos, hashvalues);
    222164    int gid = pbgs_symbol_table.Lookup_or_Insert_Name_16(source + start, hashvalue, L);
    223165    gids.push_back(gid);
     
    237179inline int ElemEnd_grouping<17>(int pos, int lgth) {
    238180    int start = pos + block_base;
    239     int hashvalue = compute_hash_value(lgth, start - block_base);
     181    int hashvalue = compute_hash_value(lgth, start - block_base, hashvalues);
    240182    int gid = 0;
    241183
     
    270212    while(start != end) {
    271213        end_pos = /*block_base + */*start;
    272         start_pos = ScanBackwardPos (&elem_starts, end_pos);
     214        start_pos = ScanBackwardPos (&elem_starts, end_pos, previous_block_last_elem_start);
    273215        length = end_pos - start_pos;
    274216        ElemEnd_grouping<L>(start_pos, length);
     
    286228    while(start != end) {
    287229        end_pos = /*block_base + */*start;
    288         start_pos = ScanBackwardPos (&elem_starts, end_pos);
     230        start_pos = ScanBackwardPos (&elem_starts, end_pos, previous_block_last_elem_start);
    289231        length = end_pos - start_pos;
    290232        ElemEnd_grouping<1>(end_pos, 1);
     
    302244    while(start != end) {
    303245        end_pos = /*block_base + */*start;
    304         start_pos = ScanBackwardPos (&elem_starts, end_pos);
     246        start_pos = ScanBackwardPos (&elem_starts, end_pos, previous_block_last_elem_start);
    305247        length = end_pos - start_pos;
    306248        ElemEnd_grouping<2>(end_pos, 2);
     
    357299}
    358300
    359 static inline void print_GIDS()
    360 {
    361     int span_count = gids.size();
    362     for(int i=0;i<span_count;i++) {
    363              cout << gids[i] << " ";
    364     }
    365     cout << endl;
    366 }
    367 
    368 void do_process(FILE *infile, FILE *outfile) {
     301template<bool allow_performance_check> void do_process(FILE *infile, FILE *outfile) {
    369302
    370303@decl
     
    390323
    391324  if (e->content_start != 0) {
    392         memmove(&srcbuf[0], &srcbuf[e->content_start], chars_read - e->content_start);
     325        memmove(&srcbuf[0], &srcbuf[e->content_start], chars_read - e->content_start);
    393326        buf_pos = e->content_start;
    394327        buffer_base = buf_pos;
    395         if (chars_avail == BUFFER_SIZE) {
    396                 chars_read = chars_read - e->content_start +
     328        if (chars_avail == BUFFER_SIZE) {
     329                chars_read = chars_read - e->content_start +
    397330                             fread(&srcbuf[chars_read-e->content_start], 1, e->content_start, infile);
    398                 chars_avail = chars_read;
     331                chars_avail = chars_read;
    399332                if (chars_avail > BUFFER_SIZE) chars_avail = BUFFER_SIZE;
    400         }
    401         else {
    402           chars_read -=e->content_start;
     333        }
     334        else {
     335          chars_read -=e->content_start;
    403336          chars_avail -=e->content_start;
    404337        }
     
    410343
    411344    while (chars_avail == BUFFER_SIZE) {
    412       PERF_SEC_START(parser_timer);
     345      if (allow_performance_check)
     346      {
     347        PERF_SEC_START(parser_timer);
     348      }
     349
    413350      for (int blk = 0; blk < SEGMENT_BLOCKS; blk++) {
    414351          block_base = blk*BLOCK_SIZE;
    415           s2p_do_block((BytePack *) &srcbuf[block_base], basis_bits);
     352          s2p_do_block((BytePack *) &srcbuf[block_base], basis_bits);
    416353          @block_stmts
    417354          postprocess_do_block(dictionary, hash_data);
    418355      }
    419       PERF_SEC_END(parser_timer, chars_avail);
    420            
     356
     357      if (allow_performance_check)
     358      {
     359        PERF_SEC_END(parser_timer, chars_avail);
     360      }
    421361      int bytes_left = chars_read - chars_avail;
    422362      memmove(buf, &srcbuf[BUFFER_SIZE - OVERLAP_BUFSIZE], bytes_left + OVERLAP_BUFSIZE);
     
    428368    }
    429369/* Final Partial Buffer */
    430     PERF_SEC_START(parser_timer);
     370    if (allow_performance_check)
     371    {
     372        PERF_SEC_START(parser_timer);
     373    }
    431374
    432375    block_pos = 0;
     
    434377/* Full Blocks */
    435378    while (remaining >= BLOCK_SIZE) {
    436           block_base = block_pos;
    437           s2p_do_block((BytePack *) &srcbuf[block_pos], basis_bits);
    438           @block_stmts
    439           postprocess_do_block(dictionary, hash_data);
    440           block_pos += BLOCK_SIZE;
    441           remaining -= BLOCK_SIZE;
     379          block_base = block_pos;
     380          s2p_do_block((BytePack *) &srcbuf[block_pos], basis_bits);
     381          @block_stmts
     382          postprocess_do_block(dictionary, hash_data);
     383          block_pos += BLOCK_SIZE;
     384          remaining -= BLOCK_SIZE;
    442385    }
    443386    block_base = block_pos;
    444387    if (remaining > 0 || @any_carry) {
    445388          EOF_mask = bitblock::srl(simd<1>::constant<1>(), convert(BLOCK_SIZE-remaining));
    446           s2p_do_final_block((BytePack *) &srcbuf[block_pos], basis_bits, EOF_mask);
    447           @final_block_stmts
    448           postprocess_do_block(dictionary, hash_data);
     389          s2p_do_final_block((BytePack *) &srcbuf[block_pos], basis_bits, EOF_mask);
     390          @final_block_stmts
     391          postprocess_do_block(dictionary, hash_data);
    449392    }
    450393    buf_pos += chars_avail;
    451394    buffer_base = buf_pos;
    452 
    453     PERF_SEC_END(parser_timer, chars_avail);
    454 
    455 #if DEBUG
    456 //    print_GIDS();
    457     pbgs_symbol_table.Print_Symbol_Table_Distribution();
    458 #endif
    459 }
     395    if (allow_performance_check)
     396    {
     397        PERF_SEC_END(parser_timer, chars_avail);
     398    }
     399}
Note: See TracChangeset for help on using the changeset viewer.