source: icGREP/icgrep-devel/icgrep/do_grep.cpp @ 4482

Last change on this file since 4482 was 4482, checked in by cameron, 4 years ago

Ensure line breaks are not matched.

File size: 10.8 KB
Line 
1/*
2 *  Copyright (c) 2014 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "icgrep.h"
8#include "do_grep.h"
9
10#include <fstream>
11#include <sstream>
12#include <iostream>
13#include <string>
14#include <stdint.h>
15
16#include <stdio.h>
17#include <stdlib.h>
18#include <unistd.h>
19#include <errno.h>
20#include <sys/types.h>
21#include <sys/stat.h>
22#include <stdexcept>
23
24#include "include/simd-lib/carryQ.hpp"
25#include "include/simd-lib/pabloSupport.hpp"
26#include "include/simd-lib/s2p.hpp"
27#include "include/simd-lib/buffer.hpp"
28
29// mmap system
30#include <sys/mman.h>
31#include <fcntl.h>
32
33
34#define BUFFER_SEGMENTS 15
35#define BUFFER_SIZE (BUFFER_SEGMENTS * SEGMENT_SIZE)
36
37#define BitBlock_declare(name)  BitBlock name
38
39#define ubitblock_declare(name, n) \
40  ubitblock name[n];\
41  do {int i;\
42      for (i = 0; i < n; i++) name[i]._128 = simd<1>::constant<0>();\
43     }\
44  while (0)
45
46BitBlock EOF_mask = simd<1>::constant<1>();
47
48//
49// Write matched lines from a buffer to an output file, given segment
50// scanners for line ends and matches (where matches are a subset of line ends).
51// The buffer pointer must point to the first byte of the segment
52// corresponding to the scanner indexes.   The first_line_start is the
53// start position of the first line relative to the buffer start position.
54// It must be zero or negative;  if negative, the buffer must permit negative
55// indexing so that the lineup to the buffer start position can also be printed.
56// The start position of the final line in the processed segment is returned.
57//
58
59ssize_t GrepExecutor::write_matches(char * buffer, ssize_t first_line_start) {
60
61  ssize_t line_start = first_line_start;
62  size_t match_pos;
63  size_t line_end;
64  while (mMatch_scanner.has_next()) {
65    match_pos = mMatch_scanner.scan_to_next();
66    // If we found a match, it must be at a line end.
67    line_end = mLineBreak_scanner.scan_to_next();
68    while (line_end < match_pos) {
69      line_start = line_end + 1;
70      line_no++;
71      line_end = mLineBreak_scanner.scan_to_next();
72    }
73    if (mShowFileNameOption) {
74      std::cout << mFileName;
75    }
76    if (mShowLineNumberingOption) {
77      std::cout << line_no << ":";
78    }
79    if ((buffer[line_start] == 0xA) && (line_start != line_end)) {
80        // The LF of a CRLF.  Really the end of the last line. 
81        line_start++;
82    }
83    unsigned char end_byte = (unsigned char) buffer[line_end];
84    if (mNormalizeLineBreaksOption) {
85      if (end_byte == 0x85) {
86          // Line terminated with NEL, on the second byte.  Back up 1.
87          line_end--;
88      }
89      else if (end_byte > 0xD) {
90          // Line terminated with PS or LS, on the third byte.  Back up 2.
91          line_end -= 2;
92      }
93      std::cout.write(&buffer[line_start], line_end - line_start);
94      std::cout << std::endl;
95    }
96    else {
97      if (end_byte == 0x0) {
98          // This must be a sentinel byte position at the end of file.
99          // Do not write it.
100          line_end--;
101      }
102      else if (end_byte == 0x0D) {
103          // Check for line_end on first byte of CRLF;  note that to safely
104          // access past line_end, even at the end of buffer, we require the
105          // mmap_sentinel_bytes >= 1.
106          if (buffer[line_end + 1] == 0x0A) { 
107              // Found CRLF; preserve both bytes.
108              line_end++;
109          }
110      }
111      std::cout.write(&buffer[line_start], line_end - line_start + 1);
112    }
113    line_start = line_end + 1;
114    line_no++;
115  }
116  while(mLineBreak_scanner.has_next()) {
117    line_end = mLineBreak_scanner.scan_to_next();
118    line_start = line_end+1;
119    line_no++;
120  }
121  return line_start;
122}
123
124bool GrepExecutor::finalLineIsUnterminated() {
125    if (mFileSize == 0) return false;
126    unsigned char end_byte = static_cast<unsigned char>(mFileBuffer[mFileSize-1]);
127    // LF through CR are line break characters
128    if ((end_byte >= 0xA) && (end_byte <= 0xD)) return false;
129    // Other line breaks require at least two bytes.
130    if (mFileSize == 1) return true;
131    // NEL 
132    unsigned char penult_byte = static_cast<unsigned char>(mFileBuffer[mFileSize-2]);
133    if ((end_byte == 0x85) && (penult_byte == 0xC2)) return false;
134    if (mFileSize == 2) return true;
135    // LS and PS
136    if ((end_byte < 0xA8) || (end_byte > 0xA9)) return true;
137    return (static_cast<unsigned char>(mFileBuffer[mFileSize-3]) != 0xE2) || (penult_byte != 0x80);
138}
139
140void GrepExecutor::doGrep(const std::string infilename) {
141
142    struct Basis_bits basis_bits;
143    struct Output output;
144    BitBlock match_vector;
145    BitBlock carry_q[mCarries];
146    BitBlock advance_q[mAdvances];
147   
148   
149    mFileName = infilename + ":";
150   
151    size_t match_count = 0;
152    size_t blk = 0;
153    size_t block_base  = 0;
154    size_t block_pos   = 0;
155    size_t chars_avail = 0;
156    ssize_t line_start = 0;
157    line_no = 1;
158
159    match_vector = simd<1>::constant<0>();
160    memset (carry_q, 0, sizeof(BitBlock) * mCarries);
161    memset (advance_q, 0, sizeof(BitBlock) * mAdvances);
162   
163    int fdSrc;
164    struct stat infile_sb;
165    fdSrc = open(infilename.c_str(), O_RDONLY);
166    if (fdSrc == -1) {
167        std::cerr << "Error: cannot open " << infilename << " for processing. Skipped.\n";
168        return;
169    }
170    if (fstat(fdSrc, &infile_sb) == -1) {
171        std::cerr << "Error: cannot stat " << infilename << " for processing. Skipped.\n";
172        return;
173    }
174    if (S_ISDIR(infile_sb.st_mode)) {
175        // Silently ignore directories.
176        // std::cerr << "Error: " << infilename << " is a directory. Skipped.\n";
177        return;
178    }
179    mFileSize = infile_sb.st_size;
180    // Set 2 sentinel bytes, 1 for possible addition of LF for unterminated last line,
181    // 1 guard byte.  PROT_WRITE enables writing the sentinel.
182    const size_t mmap_sentinel_bytes = 2; 
183    mFileBuffer = (char *) mmap(NULL, mFileSize + mmap_sentinel_bytes, PROT_READ|PROT_WRITE, MAP_PRIVATE, fdSrc, 0);
184    if (mFileBuffer == MAP_FAILED) {
185        if (errno ==  ENOMEM) {
186            std::cerr << "Error:  mmap of " << infilename << " failed: out of memory\n";
187        }
188        else {
189            std::cerr << "Error: mmap of " << infilename << " failed with errno " << errno << ". Skipped.\n";
190        }
191        return;
192    }
193    char * buffer_ptr;
194    size_t segment = 0;
195    size_t segment_base = 0;
196    chars_avail = mFileSize;
197   
198//////////////////////////////////////////////////////////////////////////////////////////
199// Full Segments
200//////////////////////////////////////////////////////////////////////////////////////////
201
202    while (chars_avail >= SEGMENT_SIZE) {
203
204        segment_base = segment * SEGMENT_SIZE;
205        mLineBreak_scanner.init();
206        mMatch_scanner.init();
207
208        for (blk = 0; blk < SEGMENT_BLOCKS; blk++) {
209            block_base = blk*BLOCK_SIZE + segment_base;
210            s2p_do_block((BytePack *) &mFileBuffer[block_base], basis_bits);
211            mProcessBlockFcn(basis_bits, carry_q, advance_q, output);
212
213            mLineBreak_scanner.load_block(output.LF, blk);
214            mMatch_scanner.load_block(output.matches, blk);
215            if (mCountOnlyOption){
216                if (bitblock::any(output.matches))
217                {
218                    if (bitblock::any(simd_and(match_vector, output.matches))){
219                        match_count += bitblock::popcount(match_vector);
220                        match_vector = output.matches;
221                    }
222                    else
223                    {
224                        match_vector = simd_or(match_vector, output.matches);
225                    }
226                }
227            }
228        }
229
230        buffer_ptr = &mFileBuffer[segment_base];
231
232        if (!mCountOnlyOption) {
233          line_start = write_matches(buffer_ptr, line_start);
234        }
235        segment++;
236        line_start -= SEGMENT_SIZE;  /* Will be negative offset for use within next segment. */
237        chars_avail -= SEGMENT_SIZE;
238    }
239
240//////////////////////////////////////////////////////////////////////////////////////////
241// For the Final Partial Segment.
242//////////////////////////////////////////////////////////////////////////////////////////
243
244    segment_base = segment * SEGMENT_SIZE;
245    int remaining = chars_avail;
246       
247
248    mLineBreak_scanner.init();
249    mMatch_scanner.init();
250
251    /* Full Blocks */
252    blk = 0;
253    while (remaining >= BLOCK_SIZE) {
254        block_base = block_pos + segment_base;
255        s2p_do_block((BytePack *) &mFileBuffer[block_base], basis_bits);
256        mProcessBlockFcn(basis_bits, carry_q, advance_q, output);
257
258        mLineBreak_scanner.load_block(output.LF, blk);
259        mMatch_scanner.load_block(output.matches, blk);
260        if (mCountOnlyOption)
261        {
262            if (bitblock::any(output.matches))
263            {
264                if (bitblock::any(simd_and(match_vector, output.matches)))
265                {
266                    match_count += bitblock::popcount(match_vector);
267                    match_vector = output.matches;
268                }
269                else
270                {
271                    match_vector = simd_or(match_vector, output.matches);
272                }
273            }
274        }
275
276        block_pos += BLOCK_SIZE;
277        remaining -= BLOCK_SIZE;
278        blk++;
279    }
280    block_base = block_pos;
281
282    //Final Partial Block (may be empty, but there could be carries pending).
283   
284    EOF_mask = bitblock::srl(simd<1>::constant<1>(), convert(BLOCK_SIZE-remaining));
285   
286    block_base = block_pos + segment_base;
287    s2p_do_final_block((BytePack *) &mFileBuffer[block_base], basis_bits, EOF_mask);
288
289    if (finalLineIsUnterminated()) {
290        // Add a LF at the EOF position
291        BitBlock EOF_pos = simd_not(simd_or(bitblock::slli<1>(simd_not(EOF_mask)), EOF_mask));
292        //  LF = 00001010  (bits 4 and 6 set).
293        basis_bits.bit_4 = simd_or(basis_bits.bit_4, EOF_pos);
294        basis_bits.bit_6 = simd_or(basis_bits.bit_6, EOF_pos);
295        // Add final sentinel byte so write_matches knows what to do.
296        mFileBuffer[mFileSize] = 0x0;
297    }
298   
299    mProcessBlockFcn(basis_bits, carry_q, advance_q, output);
300
301    if (mCountOnlyOption)
302    {
303        match_count += bitblock::popcount(match_vector);
304        if (bitblock::any(output.matches))
305        {
306            match_count += bitblock::popcount(output.matches);
307        }
308        if (mShowFileNameOption) {
309            std::cout << mFileName;
310        }
311        std::cout << match_count << std::endl;
312    }
313    else
314    {
315        mLineBreak_scanner.load_block(output.LF, blk);
316        mMatch_scanner.load_block(output.matches, blk);
317        blk++;
318        for (int i = blk; i < SEGMENT_BLOCKS; i++)
319        {
320            mLineBreak_scanner.load_block(simd<1>::constant<0>(), i);
321            mMatch_scanner.load_block(simd<1>::constant<0>(), i);
322        }
323        buffer_ptr = &mFileBuffer[segment_base];
324        line_start = write_matches(buffer_ptr, line_start);
325    }
326   
327    munmap((void *) mFileBuffer, mFileSize + mmap_sentinel_bytes);
328    close(fdSrc);
329   
330}
Note: See TracBrowser for help on using the repository browser.