Ignore:
Timestamp:
Feb 23, 2016, 9:07:05 AM (4 years ago)
Author:
lindanl
Message:

new version using the kernels.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/do_grep.cpp

    r4937 r4939  
    55 */
    66
    7 #include "basis_bits.h"
     7#include "toolchain.h"
    88#include "do_grep.h"
    99
     
    2323#include <cctype>
    2424
    25 #include "include/simd-lib/carryQ.hpp"
    26 #include "include/simd-lib/pabloSupport.hpp"
    27 #include "include/simd-lib/s2p.hpp"
    28 #include "include/simd-lib/buffer.hpp"
    2925
    3026#include <llvm/Support/raw_os_ostream.h>
     
    5753
    5854
    59 void GrepExecutor::write_matched_line(llvm::raw_ostream & out, const char * buffer, ssize_t line_start, ssize_t line_end) {
    60     if (mShowFileNameOption) {
    61         out << mFileName << ':';
    62     }
    63     if (mShowLineNumberingOption) {
    64         out << mLineNum << ":";
    65     }
    66     if ((buffer[line_start] == 0xA) && (line_start != line_end)) {
    67         // The line "starts" on the LF of a CRLF.  Really the end of the last line.
    68         line_start++;
    69     }
    70     if (buffer + line_end == mFileBuffer + mFileSize) {
    71         // The match position is at end-of-file.   We have a final unterminated line.
    72         out.write(&buffer[line_start], line_end - line_start);
    73         if (mNormalizeLineBreaksOption) {
    74             out << '\n';  // terminate it
    75         }
    76         return;
    77     }
    78     unsigned char end_byte = (unsigned char)buffer[line_end];
    79     if (mNormalizeLineBreaksOption) {
    80         if (end_byte == 0x85) {
    81             // Line terminated with NEL, on the second byte.  Back up 1.
    82             line_end--;
    83         } else if (end_byte > 0xD) {
    84             // Line terminated with PS or LS, on the third byte.  Back up 2.
    85             line_end -= 2;
    86         }
    87         out.write(&buffer[line_start], line_end - line_start);
    88         out << '\n';
    89     }
    90     else {
    91         if (end_byte == 0x0D) {
    92             // Check for line_end on first byte of CRLF;  note that we don't
    93             // want to access past the end of buffer.
    94             if ((buffer + line_end + 1 < mFileBuffer + mFileSize) && (buffer[line_end + 1] == 0x0A)) {
    95                 // Found CRLF; preserve both bytes.
    96                 line_end++;
    97             }
    98         }
    99         out.write(&buffer[line_start], line_end - line_start + 1);
    100     }
    101    
    102 }
    103 
    104 ssize_t GrepExecutor::write_matches(llvm::raw_ostream & out, const char * buffer, ssize_t line_start) {
    105 
    106     ssize_t match_pos;
    107     ssize_t line_end;
    108     while (mMatch_scanner.has_next()) {
    109         match_pos = mMatch_scanner.scan_to_next();
    110         // If we found a match, it must be at a line end.
    111         while (true) {
    112             line_end = mLineBreak_scanner.scan_to_next();
    113             if (line_end >= match_pos) {
    114                 break;
    115             }
    116             line_start = line_end + 1;
    117             mLineNum++;
    118         }
    119         write_matched_line(out,buffer, line_start, line_end);
    120         line_start = line_end + 1;
    121         mLineNum++;
    122     }
    123     while(mLineBreak_scanner.has_next()) {
    124         line_end = mLineBreak_scanner.scan_to_next();
    125         line_start = line_end+1;
    126         mLineNum++;
    127     }
    128     return line_start;
    129 }
    130 
    13155bool GrepExecutor::finalLineIsUnterminated() const {
    13256    if (mFileSize == 0) return false;
     
    14569}
    14670
    147 // Extracting codepoint data from UCD name data file.
    148 ssize_t GrepExecutor::extract_codepoints(char * buffer, ssize_t first_line_start) {
    149    
    150     ssize_t line_start = first_line_start;
    151     size_t match_pos;
    152     size_t line_end;
    153    
    154     while (mMatch_scanner.has_next()) {
    155         match_pos = mMatch_scanner.scan_to_next();
    156         // If we found a match, it must be at a line end.
    157         line_end = mLineBreak_scanner.scan_to_next();
    158         while (line_end < match_pos) {
    159             line_start = line_end + 1;
    160             mLineNum++;
    161             line_end = mLineBreak_scanner.scan_to_next();
    162         }
    163        
    164         re::codepoint_t c = 0;
    165         ssize_t line_pos = line_start;
    166         while (isxdigit(buffer[line_pos])) {
    167             if (isdigit(buffer[line_pos])) {
    168                 c = (c << 4) | (buffer[line_pos] - '0');
    169             }
    170             else {
    171                 c = (c << 4) | (tolower(buffer[line_pos]) - 'a' + 10);
    172             }
    173             line_pos++;
    174         }
    175         assert(((line_pos - line_start) >= 4) && ((line_pos - line_start) <= 6)); // UCD format 4 to 6 hex digits.
    176 #ifndef NDEBUG
    177         std::cerr << "\\N expression found codepoint " << std::hex << c << std::dec << std::endl;
    178 #endif
    179        
    180         mParsedCodePointSet->insert(c);
    181        
    182         line_start = line_end + 1;
    183         mLineNum++;
    184     }
    185     while(mLineBreak_scanner.has_next()) {
    186         line_end = mLineBreak_scanner.scan_to_next();
    187         line_start = line_end+1;
    188         mLineNum++;
    189     }
    190     return line_start;
    191    
    192 }
    193 
    194 
    19571void GrepExecutor::doGrep(const std::string & fileName) {
    19672
    197     Basis_bits basis_bits;
    198     BitBlock match_vector = simd<1>::constant<0>();
    199     size_t match_count = 0;
    200     size_t chars_avail = 0;
    201     ssize_t line_start = 0;
    202 
    20373    mFileName = fileName;
    204     mLineNum = 1;
    20574
    20675#ifdef USE_BOOST_MMAP
     
    250119    }
    251120    else {
    252         mFileBuffer = (char *) mmap(nullptr, mFileSize, PROT_READ, MAP_PRIVATE, fdSrc, 0);
     121        mFileBuffer = (char *) mmap(NULL, mFileSize, PROT_READ, MAP_PRIVATE, fdSrc, 0);
    253122        if (mFileBuffer == MAP_FAILED) {
    254123            if (errno ==  ENOMEM) {
     
    264133    }
    265134#endif
    266     size_t segment = 0;
    267     chars_avail = mFileSize;
    268135
    269136    llvm::raw_os_ostream out(std::cout);
    270     mInitializeCarriesFcn();
    271     //////////////////////////////////////////////////////////////////////////////////////////
    272     // Full Segments
    273     //////////////////////////////////////////////////////////////////////////////////////////
    274137
    275     while (chars_avail >= SEGMENT_SIZE) {
     138    uint64_t finalLineUnterminated = 0;
     139    if(finalLineIsUnterminated())
     140        finalLineUnterminated = 1;
    276141
    277         mLineBreak_scanner.init();
    278         mMatch_scanner.init();
     142    mMainFcn(mFileBuffer, mFileSize, fileName.c_str(), finalLineUnterminated);
    279143
    280         for (size_t blk = 0; blk != SEGMENT_BLOCKS; ++blk) {
    281             mTransposeFcn(reinterpret_cast<BytePack *>(mFileBuffer + (blk * BLOCK_SIZE) + (segment * SEGMENT_SIZE)), basis_bits);
    282             Output output;
    283             mProcessBlockFcn(basis_bits, output);
    284             mMatch_scanner.load_block(output.matches, blk);
    285             mLineBreak_scanner.load_block(output.LF, blk);
    286 
    287             if (mCountOnlyOption) {
    288                 if (bitblock::any(output.matches)) {
    289                     if (bitblock::any(simd_and(match_vector, output.matches))) {
    290                         match_count += bitblock::popcount(match_vector);
    291                         match_vector = output.matches;
    292                     } else {
    293                         match_vector = simd_or(match_vector, output.matches);
    294                     }
    295                 }
    296             }
    297         }
    298         if (!mCountOnlyOption) {
    299             if (mGetCodePointsOption) {
    300                 line_start = extract_codepoints(mFileBuffer + (segment * SEGMENT_SIZE), line_start);
    301             }
    302             else {
    303                 line_start = write_matches(out, mFileBuffer + (segment * SEGMENT_SIZE), line_start);
    304             }
    305         }
    306         segment++;
    307         line_start -= SEGMENT_SIZE;  /* Will be negative offset for use within next segment. */
    308         chars_avail -= SEGMENT_SIZE;
    309     }
    310 
    311     //////////////////////////////////////////////////////////////////////////////////////////
    312     // For the Final Partial Segment.
    313     //////////////////////////////////////////////////////////////////////////////////////////
    314 
    315     size_t remaining = chars_avail;
    316     size_t blk = 0;
    317 
    318     mLineBreak_scanner.init();
    319     mMatch_scanner.init();
    320 
    321     /* Full Blocks */
    322     for (; remaining >= BLOCK_SIZE; remaining -= BLOCK_SIZE, ++blk) {
    323         mTransposeFcn(reinterpret_cast<BytePack *>(mFileBuffer + (blk * BLOCK_SIZE) + (segment * SEGMENT_SIZE)), basis_bits);
    324         Output output;
    325         mProcessBlockFcn(basis_bits, output);
    326         mLineBreak_scanner.load_block(output.LF, blk);
    327         mMatch_scanner.load_block(output.matches, blk);
    328         if (mCountOnlyOption) {
    329             if (bitblock::any(output.matches)) {
    330                 if (bitblock::any(simd_and(match_vector, output.matches))) {
    331                     match_count += bitblock::popcount(match_vector);
    332                     match_vector = output.matches;
    333                 } else {
    334                     match_vector = simd_or(match_vector, output.matches);
    335                 }
    336             }
    337         }
    338     }
    339 
    340     //Final Partial Block (may be empty, but there could be carries pending).
     144    PrintTotalCount();
    341145   
    342    
    343     const auto EOF_mask = bitblock::srl(simd<1>::constant<1>(), convert(BLOCK_SIZE - remaining));
    344    
    345     if (remaining == 0) {  // No data, we may be at a page boundary.   Do not access memory.
    346         basis_bits.bit_0 = simd<1>::constant<0>();
    347         basis_bits.bit_1 = simd<1>::constant<0>();
    348         basis_bits.bit_2 = simd<1>::constant<0>();
    349         basis_bits.bit_3 = simd<1>::constant<0>();
    350         basis_bits.bit_4 = simd<1>::constant<0>();
    351         basis_bits.bit_5 = simd<1>::constant<0>();
    352         basis_bits.bit_6 = simd<1>::constant<0>();
    353         basis_bits.bit_7 = simd<1>::constant<0>();
    354     }
    355     else { // At least 1 byte, so we are not at a page boundary yet, safe to access a full block.
    356         mTransposeFcn(reinterpret_cast<BytePack *>(mFileBuffer + (blk * BLOCK_SIZE) + (segment * SEGMENT_SIZE)), basis_bits);
    357     }
    358 
    359     if (finalLineIsUnterminated()) {
    360         // Add a LF at the EOF position
    361         BitBlock EOF_pos = simd_not(simd_or(bitblock::slli<1>(simd_not(EOF_mask)), EOF_mask));
    362         //  LF = 00001010  (bits 4 and 6 set).
    363         basis_bits.bit_4 = simd_or(basis_bits.bit_4, EOF_pos);
    364         basis_bits.bit_6 = simd_or(basis_bits.bit_6, EOF_pos);
    365     }
    366    
    367     Output output;
    368     mProcessBlockFcn(basis_bits, output);
    369     if (mCountOnlyOption) {
    370         match_count += bitblock::popcount(match_vector);
    371         if (bitblock::any(output.matches)) {
    372             match_count += bitblock::popcount(output.matches);
    373         }
    374         if (mShowFileNameOption) {
    375             out << mFileName << ':';
    376         }
    377         out << match_count << '\n';
    378     } else {
    379         mLineBreak_scanner.load_block(output.LF, blk);
    380         mMatch_scanner.load_block(output.matches, blk);
    381         while (++blk < SEGMENT_BLOCKS) {
    382             mLineBreak_scanner.load_block(simd<1>::constant<0>(), blk);
    383             mMatch_scanner.load_block(simd<1>::constant<0>(), blk);
    384         }
    385         if (mGetCodePointsOption) {
    386             line_start = extract_codepoints(mFileBuffer + (segment * SEGMENT_SIZE), line_start);
    387         }
    388         else {
    389             line_start = write_matches(out, mFileBuffer + (segment * SEGMENT_SIZE), line_start);
    390         }
    391     }
    392146#ifdef USE_BOOST_MMAP
    393147    mFile.close();
Note: See TracChangeset for help on using the changeset viewer.