Ignore:
Timestamp:
Nov 21, 2011, 4:09:54 PM (8 years ago)
Author:
vla24
Message:

SymbolTable: completed dictionary implementation and refactored templates

File:
1 edited

Legend:

Unmodified
Added
Removed
  • proto/SymbolTable/wcd_pbgs_identity_template.cpp

    r1688 r1721  
    1 #include "../symtab_global.h"
     1#include "../common_definitions.h"
    22#include <pbgs_identity_symbol_table.h>
     3
     4#include "../wcd_common_functions.h"
     5#include "../symtab_common_functions.h"
     6#include "parser_common_functions_generated.h"
    37
    48#ifdef BUFFER_PROFILING
     
    2024int buffer_last;
    2125char * source;
    22 LineColTracker tracker;
    23 TagMatcher matcher;
    24 BitBlock EOF_mask = simd<1>::constant<1>();
    25 ErrorTracker error_tracker;
    2626
    2727BitBlock elem_starts;
     
    3232PBGSIdentitySymbolTable pbgs_symbol_table;
    3333
    34 @global
    35 
    36 static inline void s2p_do_block(BytePack U8[], Basis_bits & basis_bits);
    37 static inline void s2p_do_final_block(BytePack U8[], Basis_bits & basis_bits, BitBlock EOF_mask);
    3834static inline void postprocess_do_block(Dictionary& dictionary, Hash_data hash_data);
    39 
    40 void do_process(FILE *infile, FILE *outfile);
     35template<bool allow_performance_check> void do_process(FILE *infile, FILE *outfile);
    4136
    4237template <int L> static inline void validate_block_length_grouping(BitBlockForwardIterator & start, int block_base);
    43 
    44 static inline int ScanBackwardPos(BitBlock * block, int pos);
    45 static inline int compute_hash_value (int lgth, int start);
    4638template <int L> static inline int ElemEnd_grouping(int pos);
    4739
    4840int main(int argc, char * argv[]) {
    49         char * infilename, * outfilename;
    50         FILE *infile, *outfile;
    51         struct stat fileinfo;
    52 
    53         if (argc < 2) {
    54                 printf("Usage: %s <filename> [<outputfile>]\n", argv[0]);
    55                 exit(-1);
    56         }
    57 
    58         infilename = argv[1];
    59         stat(infilename, &fileinfo);
    60         infile = fopen(infilename, "rb");
    61         if (!infile) {
    62                 fprintf(stderr, "Error: cannot open %s for input.\n", infilename);
    63                 exit(-1);
    64         }
    65 
    66         if (argc < 3) outfile = stdout;
    67         else {
    68                 outfilename = argv[2];
    69                 outfile = fopen(outfilename, "wb");
    70                 if (!outfile) {
    71                         fprintf(stderr, "Error: cannot open %s for writing.\n", outfilename);
    72                         exit(-1);
    73                 }
    74         }
     41    char * dictionaryfilename, * infilename, * outfilename;
     42    FILE * dictionaryfile, *infile, *outfile;
     43
     44    getFilenames(argc, argv, dictionaryfilename, infilename, outfilename);
     45    openInputOutputFiles(dictionaryfilename, infilename, outfilename,
     46                         dictionaryfile, infile, outfile);
     47
     48    int greatest_GID_in_dictionary;
     49    populateDictionary(dictionaryfile, outfile, gids, greatest_GID_in_dictionary);
    7550
    7651//      PERF_SEC_BIND(1);
    7752
    78         PERF_SEC_INIT(parser_timer);
    79 
    80         do_process(infile, outfile);
    81 
    82         PERF_SEC_DUMP(parser_timer);
    83 
    84         PERF_SEC_DESTROY(parser_timer);
    85 
    86         fclose(infile);
    87         fclose(outfile);
    88 
    89         return(0);
    90 }
    91 
    92 /* s2p Definitions */
    93 static inline void s2p_do_block(BytePack U8[], Basis_bits & basis_bits) {
    94   s2p(U8[0], U8[1], U8[2], U8[3], U8[4], U8[5], U8[6], U8[7],
    95         basis_bits.bit_0, basis_bits.bit_1, basis_bits.bit_2, basis_bits.bit_3, basis_bits.bit_4, basis_bits.bit_5, basis_bits.bit_6, basis_bits.bit_7);
    96 }
    97 
    98 static inline void s2p_do_final_block(BytePack U8[], Basis_bits & basis_bits, BitBlock EOF_mask) {
    99   s2p_do_block(U8, basis_bits);
    100   basis_bits.bit_0 = simd_and(basis_bits.bit_0, EOF_mask);
    101   basis_bits.bit_1 = simd_and(basis_bits.bit_1, EOF_mask);
    102   basis_bits.bit_2 = simd_and(basis_bits.bit_2, EOF_mask);
    103   basis_bits.bit_3 = simd_and(basis_bits.bit_3, EOF_mask);
    104   basis_bits.bit_4 = simd_and(basis_bits.bit_4, EOF_mask);
    105   basis_bits.bit_5 = simd_and(basis_bits.bit_5, EOF_mask);
    106   basis_bits.bit_6 = simd_and(basis_bits.bit_6, EOF_mask);
    107   basis_bits.bit_7 = simd_and(basis_bits.bit_7, EOF_mask);
    108 }
    109 
    110 static inline int ScanBackwardPos(BitBlock * block, int pos)
    111 {
    112     BitBlock s = block[0];
    113     BitBlock temp = simd_and( s, simd_not(simd<128>::sll(simd<2>::constant<3>(), convert(pos))) );
    114 
    115     if (bitblock_has_bit(temp))
    116     {
    117         // sizeof (BitBlock)*8 - cbzl( s & ~(~0 << pos)) - 1;
    118         return BLOCK_SIZE - count_reverse_zeroes (temp) - 1;
    119     }
    120     else
    121     {
    122         //handle boundary case
    123         return previous_block_last_elem_start - 1;
    124     }
    125 }
    126 
    127 static inline int compute_hash_value (int lgth, int start)
    128 {
    129     unsigned int offset_bit = start + 128;
    130     uint64_t stream = *((uint64_t*)(((uint32_t*)hashvalues)+(offset_bit>>5)));
    131     return stream >> (offset_bit & 0x1F) & ~(~0 << lgth);
     53    PERF_SEC_INIT(parser_timer);
     54
     55    // store symbols from text to Symbol Table
     56    do_process<true>(infile, outfile);
     57
     58    PERF_SEC_DUMP(parser_timer);
     59
     60    PERF_SEC_DESTROY(parser_timer);
     61
     62    // gather dictionary statistics
     63    int totalKnownWordsInDictionary, totalUnknownWordsInDictionary;
     64    wordCountInDictionary(greatest_GID_in_dictionary, gids, totalUnknownWordsInDictionary, totalKnownWordsInDictionary);
     65    printWordCountInDictionary(totalUnknownWordsInDictionary, totalKnownWordsInDictionary);
     66
     67    fclose(dictionaryfile);
     68    fclose(infile);
     69    fclose(outfile);
     70
     71#if PRINT_SYMBOL_DISTRIBUTION
     72//    print_GIDS();
     73    pbgs_symbol_table.Print_Symbol_Table_Distribution();
     74#endif
     75
     76    return(0);
    13277}
    13378
     
    13580static inline int ElemEnd_grouping(int end) {
    13681    int start = end - L;
    137     int hashvalue = compute_hash_value(L, start - block_base);
     82    int hashvalue = compute_hash_value(L, start - block_base, hashvalues);
    13883    int gid = pbgs_symbol_table.Lookup_or_Insert_Name<L>(source + start, hashvalue);
    13984    gids.push_back(gid);
     
    15095template<>
    15196inline int ElemEnd_grouping<17>(int end) {
    152     int start = ScanBackwardPos (&elem_starts, end - block_base) + block_base;
     97    int start = ScanBackwardPos (&elem_starts, end - block_base, previous_block_last_elem_start) + block_base;
    15398    int lgth = end - start;
    154     int hashvalue = compute_hash_value(lgth, start - block_base);
     99    int hashvalue = compute_hash_value(lgth, start - block_base, hashvalues);
    155100    int gid = 0;
    156101
     
    300245}
    301246
    302 static inline void print_GIDS()
    303 {
    304     int span_count = gids.size();
    305     for(int i=0;i<span_count;i++) {
    306              cout << gids[i] << " ";
    307     }
    308     cout << endl;
    309 }
    310 
    311 void do_process(FILE *infile, FILE *outfile) {
     247template<bool allow_performance_check> void do_process(FILE *infile, FILE *outfile) {
    312248
    313249@decl
     
    333269
    334270  if (e->content_start != 0) {
    335         memmove(&srcbuf[0], &srcbuf[e->content_start], chars_read - e->content_start);
     271        memmove(&srcbuf[0], &srcbuf[e->content_start], chars_read - e->content_start);
    336272        buf_pos = e->content_start;
    337273        buffer_base = buf_pos;
    338         if (chars_avail == BUFFER_SIZE) {
    339                 chars_read = chars_read - e->content_start +
     274        if (chars_avail == BUFFER_SIZE) {
     275                chars_read = chars_read - e->content_start +
    340276                             fread(&srcbuf[chars_read-e->content_start], 1, e->content_start, infile);
    341                 chars_avail = chars_read;
     277                chars_avail = chars_read;
    342278                if (chars_avail > BUFFER_SIZE) chars_avail = BUFFER_SIZE;
    343         }
    344         else {
    345           chars_read -=e->content_start;
     279        }
     280        else {
     281          chars_read -=e->content_start;
    346282          chars_avail -=e->content_start;
    347283        }
     
    353289
    354290    while (chars_avail == BUFFER_SIZE) {
    355       PERF_SEC_START(parser_timer);
     291      if (allow_performance_check)
     292      {
     293        PERF_SEC_START(parser_timer);
     294      }
     295
    356296      for (int blk = 0; blk < SEGMENT_BLOCKS; blk++) {
    357297          block_base = blk*BLOCK_SIZE;
    358           s2p_do_block((BytePack *) &srcbuf[block_base], basis_bits);
     298          s2p_do_block((BytePack *) &srcbuf[block_base], basis_bits);
    359299          @block_stmts
    360300          postprocess_do_block(dictionary, hash_data);
    361301      }
    362       PERF_SEC_END(parser_timer, chars_avail);
    363            
     302
     303      if (allow_performance_check)
     304      {
     305        PERF_SEC_END(parser_timer, chars_avail);
     306      }
    364307      int bytes_left = chars_read - chars_avail;
    365308      memmove(buf, &srcbuf[BUFFER_SIZE - OVERLAP_BUFSIZE], bytes_left + OVERLAP_BUFSIZE);
     
    371314    }
    372315/* Final Partial Buffer */
    373     PERF_SEC_START(parser_timer);
     316    if (allow_performance_check)
     317    {
     318        PERF_SEC_START(parser_timer);
     319    }
    374320
    375321    block_pos = 0;
     
    377323/* Full Blocks */
    378324    while (remaining >= BLOCK_SIZE) {
    379           block_base = block_pos;
    380           s2p_do_block((BytePack *) &srcbuf[block_pos], basis_bits);
    381           @block_stmts
    382           postprocess_do_block(dictionary, hash_data);
    383           block_pos += BLOCK_SIZE;
    384           remaining -= BLOCK_SIZE;
     325          block_base = block_pos;
     326          s2p_do_block((BytePack *) &srcbuf[block_pos], basis_bits);
     327          @block_stmts
     328          postprocess_do_block(dictionary, hash_data);
     329          block_pos += BLOCK_SIZE;
     330          remaining -= BLOCK_SIZE;
    385331    }
    386332    block_base = block_pos;
    387333    if (remaining > 0 || @any_carry) {
    388334          EOF_mask = bitblock::srl(simd<1>::constant<1>(), convert(BLOCK_SIZE-remaining));
    389           s2p_do_final_block((BytePack *) &srcbuf[block_pos], basis_bits, EOF_mask);
    390           @final_block_stmts
    391           postprocess_do_block(dictionary, hash_data);
     335          s2p_do_final_block((BytePack *) &srcbuf[block_pos], basis_bits, EOF_mask);
     336          @final_block_stmts
     337          postprocess_do_block(dictionary, hash_data);
    392338    }
    393339    buf_pos += chars_avail;
    394340    buffer_base = buf_pos;
    395 
    396     PERF_SEC_END(parser_timer, chars_avail);
    397 
    398 #if DEBUG
    399 //    print_GIDS();
    400     pbgs_symbol_table.Print_Symbol_Table_Distribution();
    401 #endif
    402 }
     341    if (allow_performance_check)
     342    {
     343        PERF_SEC_END(parser_timer, chars_avail);
     344    }
     345}
Note: See TracChangeset for help on using the changeset viewer.