Changeset 4478 for icGREP


Ignore:
Timestamp:
Feb 8, 2015, 9:02:24 AM (5 years ago)
Author:
cameron
Message:

Fix to handle files without a proper line break at EOF

Location:
icGREP/icgrep-devel/icgrep
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/do_grep.cpp

    r4477 r4478  
    6262  size_t match_pos;
    6363  size_t line_end;
    64   while (match_scanner.has_next()) {
    65     match_pos = match_scanner.scan_to_next();
     64  while (mMatch_scanner.has_next()) {
     65    match_pos = mMatch_scanner.scan_to_next();
    6666    // If we found a match, it must be at a line end.
    67     line_end = LF_scanner.scan_to_next();
     67    line_end = mLineBreak_scanner.scan_to_next();
    6868    while (line_end < match_pos) {
    6969      line_start = line_end + 1;
    7070      line_no++;
    71       line_end = LF_scanner.scan_to_next();
     71      line_end = mLineBreak_scanner.scan_to_next();
    7272    }
    7373    if (mShowFileNameOption) {
     
    8383    unsigned char end_byte = (unsigned char) buffer[line_end];
    8484    if (mNormalizeLineBreaksOption) {
    85         if (end_byte <= 0xD) {
    86             // Line terminated with LF, VT, FF or CR. 
    87             std::cout.write(&buffer[line_start], line_end - line_start);
    88             std::cout << std::endl;
    89         }
    90         else if (end_byte == 0x85) {
    91             // Line terminated with NEL, on the second byte. 
    92             std::cout.write(&buffer[line_start], line_end - line_start - 1);
    93             std::cout << std::endl;
    94         }
    95         else  {
    96             // Line terminated with PS or LS, on the third byte.
    97             std::cout.write(&buffer[line_start], line_end - line_start - 2);
    98             std::cout << std::endl;
    99         }
     85      if (end_byte == 0x85) {
     86          // Line terminated with NEL, on the second byte.  Back up 1.
     87          line_end--;
     88      }
     89      else if (end_byte > 0xD) {
     90          // Line terminated with PS or LS, on the third byte.  Back up 2.
     91          line_end -= 2;
     92      }
     93      std::cout.write(&buffer[line_start], line_end - line_start);
     94      std::cout << std::endl;
    10095    }
    10196    else {
    102         // Check for line_end on first byte of CRLF;  note that to safely
    103         // access past line_end, even at the end of buffer, we require the
    104         // mmap_sentinel_bytes >= 1.
    105         if (end_byte == 0x0D) {
    106             if (buffer[line_end + 1] == 0x0A) {
    107                 line_end++;
    108             }
    109         }
    110         std::cout.write(&buffer[line_start], line_end - line_start + 1);
     97      if (end_byte == 0x0) {
     98          // This must be a sentinel byte position at the end of file.
     99          // Do not write it.
     100          line_end--;
     101      }
     102      else if (end_byte == 0x0D) {
     103          // Check for line_end on first byte of CRLF;  note that to safely
     104          // access past line_end, even at the end of buffer, we require the
     105          // mmap_sentinel_bytes >= 1.
     106          if (buffer[line_end + 1] == 0x0A) {
     107              // Found CRLF; preserve both bytes.
     108              line_end++;
     109          }
     110      }
     111      std::cout.write(&buffer[line_start], line_end - line_start + 1);
    111112    }
    112113    line_start = line_end + 1;
    113114    line_no++;
    114 
    115115  }
    116   while(LF_scanner.has_next()) {
    117     line_end = LF_scanner.scan_to_next();
     116  while(mLineBreak_scanner.has_next()) {
     117    line_end = mLineBreak_scanner.scan_to_next();
    118118    line_start = line_end+1;
    119119    line_no++;
     
    122 122 }
    123 123
    124 
     124 bool GrepExecutor::finalLineIsUnterminated() {
     125    if (mFileSize == 0) return false;
     126    unsigned char end_byte = static_cast<unsigned char>(mFileBuffer[mFileSize-1]);
     127    // LF through CR are line break characters
     128    if ((end_byte >= 0xA) && (end_byte <= 0xD)) return false;
     129    // Other line breaks require at least two bytes.
     130    if (mFileSize == 1) return true;
     131    // NEL 
     132    unsigned char penult_byte = static_cast<unsigned char>(mFileBuffer[mFileSize-2]);
     133    if ((end_byte == 0x85) && (penult_byte == 0xC2)) return false;
     134    if (mFileSize == 2) return true;
     135    // LS and PS
     136    if ((end_byte < 0xA8) || (end_byte > 0xA9)) return true;
     137    return (static_cast<unsigned char>(mFileBuffer[mFileSize-3]) != 0xE2) || (penult_byte != 0x80);
     138 }
    125 139
    126 140 void GrepExecutor::doGrep(const std::string infilename) {
     
    149163    int fdSrc;
    150164    struct stat infile_sb;
    151     char * infile_buffer;
    152165    fdSrc = open(infilename.c_str(), O_RDONLY);
    153166    if (fdSrc == -1) {
     
    166179    mFileSize = infile_sb.st_size;
    167180    // Set 2 sentinel bytes, 1 for possible addition of LF for unterminated last line,
    168     // 1 guard byte.
     181    // 1 guard byte.  PROT_WRITE enables writing the sentinel.
    169182    const size_t mmap_sentinel_bytes = 2; 
    170     infile_buffer = (char *) mmap(NULL, mFileSize + mmap_sentinel_bytes, PROT_READ, MAP_PRIVATE, fdSrc, 0);
    171     if (infile_buffer == MAP_FAILED) {
     183    mFileBuffer = (char *) mmap(NULL, mFileSize + mmap_sentinel_bytes, PROT_READ|PROT_WRITE, MAP_PRIVATE, fdSrc, 0);
     184    if (mFileBuffer == MAP_FAILED) {
    172185        std::cerr << "Error: mmap of " << infilename << " failed. Skipped.\n";
    173186        return;
     
    185198
    186199        segment_base = segment * SEGMENT_SIZE;
    187         LF_scanner.init();
    188         match_scanner.init();
     200        mLineBreak_scanner.init();
     201        mMatch_scanner.init();
    189202
    190203        for (blk = 0; blk < SEGMENT_BLOCKS; blk++) {
    191204            block_base = blk*BLOCK_SIZE + segment_base;
    192             s2p_do_block((BytePack *) &infile_buffer[block_base], basis_bits);
     205            s2p_do_block((BytePack *) &mFileBuffer[block_base], basis_bits);
    193206            mProcessBlockFcn(basis_bits, carry_q, advance_q, output);
    194207
    195             LF_scanner.load_block(output.LF, blk);
    196             match_scanner.load_block(output.matches, blk);
     208            mLineBreak_scanner.load_block(output.LF, blk);
     209            mMatch_scanner.load_block(output.matches, blk);
    197210            if (mCountOnlyOption){
    198211                if (bitblock::any(output.matches))
     
    210223        }
    211224
    212         buffer_ptr = &infile_buffer[segment_base];
     225        buffer_ptr = &mFileBuffer[segment_base];
    213226
    214227        if (!mCountOnlyOption) {
     
    228241       
    229242
    230     LF_scanner.init();
    231     match_scanner.init();
     243    mLineBreak_scanner.init();
     244    mMatch_scanner.init();
    232245
    233246    /* Full Blocks */
     
    235248    while (remaining >= BLOCK_SIZE) {
    236249        block_base = block_pos + segment_base;
    237         s2p_do_block((BytePack *) &infile_buffer[block_base], basis_bits);
     250        s2p_do_block((BytePack *) &mFileBuffer[block_base], basis_bits);
    238251        mProcessBlockFcn(basis_bits, carry_q, advance_q, output);
    239252
    240         LF_scanner.load_block(output.LF, blk);
    241         match_scanner.load_block(output.matches, blk);
     253        mLineBreak_scanner.load_block(output.LF, blk);
     254        mMatch_scanner.load_block(output.matches, blk);
    242255        if (mCountOnlyOption)
    243256        {
     
    261274    }
    262275    block_base = block_pos;
    263     //fprintf(stderr, "Remaining = %i\n", remaining);
    264 
    265     //For the last partial block, or for any carry.
    266    
     276
     277    //Final Partial Block (may be empty, but there could be carries pending).
    267278   
    268279    EOF_mask = bitblock::srl(simd<1>::constant<1>(), convert(BLOCK_SIZE-remaining));
     280   
    269281    block_base = block_pos + segment_base;
    270     s2p_do_final_block((BytePack *) &infile_buffer[block_base], basis_bits, EOF_mask);
     282    s2p_do_final_block((BytePack *) &mFileBuffer[block_base], basis_bits, EOF_mask);
     283
     284    if (finalLineIsUnterminated()) {
     285        // Add a LF at the EOF position
     286        BitBlock EOF_pos = simd_not(simd_or(bitblock::slli<1>(simd_not(EOF_mask)), EOF_mask));
     287        //  LF = 00001010  (bits 4 and 6 set).
     288        basis_bits.bit_4 = simd_or(basis_bits.bit_4, EOF_pos);
     289        basis_bits.bit_6 = simd_or(basis_bits.bit_6, EOF_pos);
     290        // Add final sentinel byte so write_matches knows what to do.
     291        mFileBuffer[mFileSize] = 0x0;
     292    }
     293   
    271294    mProcessBlockFcn(basis_bits, carry_q, advance_q, output);
    272295
     
    285308    else
    286309    {
    287         LF_scanner.load_block(output.LF, blk);
    288         match_scanner.load_block(output.matches, blk);
     310        mLineBreak_scanner.load_block(output.LF, blk);
     311        mMatch_scanner.load_block(output.matches, blk);
    289312        blk++;
    290313        for (int i = blk; i < SEGMENT_BLOCKS; i++)
    291314        {
    292             LF_scanner.load_block(simd<1>::constant<0>(), i);
    293             match_scanner.load_block(simd<1>::constant<0>(), i);
    294         }
    295         buffer_ptr = &infile_buffer[segment_base];
     315            mLineBreak_scanner.load_block(simd<1>::constant<0>(), i);
     316            mMatch_scanner.load_block(simd<1>::constant<0>(), i);
     317        }
     318        buffer_ptr = &mFileBuffer[segment_base];
    296319        line_start = write_matches(buffer_ptr, line_start);
    297320    }
    298321   
    299     munmap((void *) infile_buffer, mFileSize + mmap_sentinel_bytes);
     322    munmap((void *) mFileBuffer, mFileSize + mmap_sentinel_bytes);
    300323    close(fdSrc);
    301324   
  • icGREP/icgrep-devel/icgrep/do_grep.h

    r4477 r4478  
    62 62 private:
    63 63     ssize_t write_matches(char * buffer, ssize_t first_line_start);
     64    bool finalLineIsUnterminated();
    64 65
    65 66     bool mCountOnlyOption;
     
    67 68     bool mShowLineNumberingOption;
    68 69     bool mNormalizeLineBreaksOption;
     70
    69 71     int mCarries;
    70 72     int mAdvances;
     
    73 75     std::string mFileName;
    74 76     size_t mFileSize;
    75     ScannerT LF_scanner;
    76     ScannerT match_scanner;
     77    char * mFileBuffer;
     78    ScannerT mLineBreak_scanner;
     79    ScannerT mMatch_scanner;
    77 80     size_t line_no;
    78 81 };
Note: See TracChangeset for help on using the changeset viewer.