source: icGREP/icgrep-devel/icgrep/do_grep.cpp @ 4639

Last change on this file since 4639 was 4538, checked in by cameron, 4 years ago

Restructure to use a single process_block_state data area

File size: 10.6 KB
Line 
1/*
2 *  Copyright (c) 2014 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "icgrep.h"
8#include "do_grep.h"
9
10#include <fstream>
11#include <sstream>
12#include <iostream>
13#include <string>
14#include <stdint.h>
15
16#include <stdio.h>
17#include <stdlib.h>
18#include <unistd.h>
19#include <errno.h>
20#include <sys/types.h>
21#include <sys/stat.h>
22#include <stdexcept>
23
24#include "include/simd-lib/carryQ.hpp"
25#include "include/simd-lib/pabloSupport.hpp"
26#include "include/simd-lib/s2p.hpp"
27#include "include/simd-lib/buffer.hpp"
28
29// mmap system
30#include <sys/mman.h>
31#include <fcntl.h>
32
33
// Buffer sizing: BUFFER_SIZE is BUFFER_SEGMENTS segments of SEGMENT_SIZE bytes.
// SEGMENT_SIZE is not defined in this file.
// NOTE(review): neither macro is referenced in the visible part of this file --
// confirm they are still needed.
34#define BUFFER_SEGMENTS 15
35#define BUFFER_SIZE (BUFFER_SEGMENTS * SEGMENT_SIZE)
36
// File-scope mask passed to s2p_do_final_block for the final partial block.
// Initialised from simd<1>::constant<1>() (presumably all bits set -- confirm);
// GrepExecutor::doGrep overwrites it for every file before using it.
37BitBlock EOF_mask = simd<1>::constant<1>();
38
39//
40// Write matched lines from a buffer to an output file, given segment
41// scanners for line ends and matches (where matches are a subset of line ends).
42// The buffer pointer must point to the first byte of the segment
43// corresponding to the scanner indexes.   The first_line_start is the
44// start position of the first line relative to the buffer start position.
45// It must be zero or negative;  if negative, the buffer must permit negative
46// indexing so that the lineup to the buffer start position can also be printed.
47// The start position of the final line in the processed segment is returned.
48//
49
50ssize_t GrepExecutor::write_matches(char * buffer, ssize_t first_line_start) {
51
52  ssize_t line_start = first_line_start;
53  size_t match_pos;
54  size_t line_end;
55  while (mMatch_scanner.has_next()) {
56    match_pos = mMatch_scanner.scan_to_next();
57    // If we found a match, it must be at a line end.
58    line_end = mLineBreak_scanner.scan_to_next();
59    while (line_end < match_pos) {
60      line_start = line_end + 1;
61      line_no++;
62      line_end = mLineBreak_scanner.scan_to_next();
63    }
64    if (mShowFileNameOption) {
65      std::cout << mFileName;
66    }
67    if (mShowLineNumberingOption) {
68      std::cout << line_no << ":";
69    }
70    if ((buffer[line_start] == 0xA) && (line_start != line_end)) {
71        // The LF of a CRLF.  Really the end of the last line. 
72        line_start++;
73    }
74    unsigned char end_byte = (unsigned char) buffer[line_end];
75    if (mNormalizeLineBreaksOption) {
76      if (end_byte == 0x85) {
77          // Line terminated with NEL, on the second byte.  Back up 1.
78          line_end--;
79      }
80      else if (end_byte > 0xD) {
81          // Line terminated with PS or LS, on the third byte.  Back up 2.
82          line_end -= 2;
83      }
84      std::cout.write(&buffer[line_start], line_end - line_start);
85      std::cout << std::endl;
86    }
87    else {
88      if (end_byte == 0x0) {
89          // This must be a sentinel byte position at the end of file.
90          // Do not write it.
91          line_end--;
92      }
93      else if (end_byte == 0x0D) {
94          // Check for line_end on first byte of CRLF;  note that to safely
95          // access past line_end, even at the end of buffer, we require the
96          // mmap_sentinel_bytes >= 1.
97          if (buffer[line_end + 1] == 0x0A) { 
98              // Found CRLF; preserve both bytes.
99              line_end++;
100          }
101      }
102      std::cout.write(&buffer[line_start], line_end - line_start + 1);
103    }
104    line_start = line_end + 1;
105    line_no++;
106  }
107  while(mLineBreak_scanner.has_next()) {
108    line_end = mLineBreak_scanner.scan_to_next();
109    line_start = line_end+1;
110    line_no++;
111  }
112  return line_start;
113}
114
115bool GrepExecutor::finalLineIsUnterminated() {
116    if (mFileSize == 0) return false;
117    unsigned char end_byte = static_cast<unsigned char>(mFileBuffer[mFileSize-1]);
118    // LF through CR are line break characters
119    if ((end_byte >= 0xA) && (end_byte <= 0xD)) return false;
120    // Other line breaks require at least two bytes.
121    if (mFileSize == 1) return true;
122    // NEL 
123    unsigned char penult_byte = static_cast<unsigned char>(mFileBuffer[mFileSize-2]);
124    if ((end_byte == 0x85) && (penult_byte == 0xC2)) return false;
125    if (mFileSize == 2) return true;
126    // LS and PS
127    if ((end_byte < 0xA8) || (end_byte > 0xA9)) return true;
128    return (static_cast<unsigned char>(mFileBuffer[mFileSize-3]) != 0xE2) || (penult_byte != 0x80);
129}
130
131void GrepExecutor::doGrep(const std::string infilename) {
132
133    struct Basis_bits basis_bits;
134    struct Output output;
135    BitBlock match_vector;
136    BitBlock process_block_state_data[(mProcessBlockStateSize + sizeof(BitBlock) - 1)/sizeof(BitBlock)];   
137   
138    mFileName = infilename + ":";
139   
140    size_t match_count = 0;
141    size_t blk = 0;
142    size_t block_base  = 0;
143    size_t block_pos   = 0;
144    size_t chars_avail = 0;
145    ssize_t line_start = 0;
146    line_no = 1;
147
148    match_vector = simd<1>::constant<0>();
149    memset (process_block_state_data, 0, mProcessBlockStateSize);
150    int fdSrc;
151    struct stat infile_sb;
152    fdSrc = open(infilename.c_str(), O_RDONLY);
153    if (fdSrc == -1) {
154        std::cerr << "Error: cannot open " << infilename << " for processing. Skipped.\n";
155        return;
156    }
157    if (fstat(fdSrc, &infile_sb) == -1) {
158        std::cerr << "Error: cannot stat " << infilename << " for processing. Skipped.\n";
159        return;
160    }
161    if (S_ISDIR(infile_sb.st_mode)) {
162        // Silently ignore directories.
163        // std::cerr << "Error: " << infilename << " is a directory. Skipped.\n";
164        return;
165    }
166    mFileSize = infile_sb.st_size;
167    // Set 2 sentinel bytes, 1 for possible addition of LF for unterminated last line,
168    // 1 guard byte.  PROT_WRITE enables writing the sentinel.
169    const size_t mmap_sentinel_bytes = 2; 
170    mFileBuffer = (char *) mmap(NULL, mFileSize + mmap_sentinel_bytes, PROT_READ|PROT_WRITE, MAP_PRIVATE, fdSrc, 0);
171    if (mFileBuffer == MAP_FAILED) {
172        if (errno ==  ENOMEM) {
173            std::cerr << "Error:  mmap of " << infilename << " failed: out of memory\n";
174        }
175        else {
176            std::cerr << "Error: mmap of " << infilename << " failed with errno " << errno << ". Skipped.\n";
177        }
178        return;
179    }
180    char * buffer_ptr;
181    size_t segment = 0;
182    size_t segment_base = 0;
183    chars_avail = mFileSize;
184   
185//////////////////////////////////////////////////////////////////////////////////////////
186// Full Segments
187//////////////////////////////////////////////////////////////////////////////////////////
188
189    while (chars_avail >= SEGMENT_SIZE) {
190
191        segment_base = segment * SEGMENT_SIZE;
192        mLineBreak_scanner.init();
193        mMatch_scanner.init();
194
195        for (blk = 0; blk < SEGMENT_BLOCKS; blk++) {
196            block_base = blk*BLOCK_SIZE + segment_base;
197            s2p_do_block((BytePack *) &mFileBuffer[block_base], basis_bits);
198            mProcessBlockFcn(basis_bits, process_block_state_data, output);
199
200            mLineBreak_scanner.load_block(output.LF, blk);
201            mMatch_scanner.load_block(output.matches, blk);
202            if (mCountOnlyOption){
203                if (bitblock::any(output.matches))
204                {
205                    if (bitblock::any(simd_and(match_vector, output.matches))){
206                        match_count += bitblock::popcount(match_vector);
207                        match_vector = output.matches;
208                    }
209                    else
210                    {
211                        match_vector = simd_or(match_vector, output.matches);
212                    }
213                }
214            }
215        }
216
217        buffer_ptr = &mFileBuffer[segment_base];
218
219        if (!mCountOnlyOption) {
220          line_start = write_matches(buffer_ptr, line_start);
221        }
222        segment++;
223        line_start -= SEGMENT_SIZE;  /* Will be negative offset for use within next segment. */
224        chars_avail -= SEGMENT_SIZE;
225    }
226
227//////////////////////////////////////////////////////////////////////////////////////////
228// For the Final Partial Segment.
229//////////////////////////////////////////////////////////////////////////////////////////
230
231    segment_base = segment * SEGMENT_SIZE;
232    int remaining = chars_avail;
233       
234
235    mLineBreak_scanner.init();
236    mMatch_scanner.init();
237
238    /* Full Blocks */
239    blk = 0;
240    while (remaining >= BLOCK_SIZE) {
241        block_base = block_pos + segment_base;
242        s2p_do_block((BytePack *) &mFileBuffer[block_base], basis_bits);
243        mProcessBlockFcn(basis_bits, process_block_state_data, output);
244
245        mLineBreak_scanner.load_block(output.LF, blk);
246        mMatch_scanner.load_block(output.matches, blk);
247        if (mCountOnlyOption)
248        {
249            if (bitblock::any(output.matches))
250            {
251                if (bitblock::any(simd_and(match_vector, output.matches)))
252                {
253                    match_count += bitblock::popcount(match_vector);
254                    match_vector = output.matches;
255                }
256                else
257                {
258                    match_vector = simd_or(match_vector, output.matches);
259                }
260            }
261        }
262
263        block_pos += BLOCK_SIZE;
264        remaining -= BLOCK_SIZE;
265        blk++;
266    }
267    block_base = block_pos;
268
269    //Final Partial Block (may be empty, but there could be carries pending).
270   
271    EOF_mask = bitblock::srl(simd<1>::constant<1>(), convert(BLOCK_SIZE-remaining));
272   
273    block_base = block_pos + segment_base;
274    s2p_do_final_block((BytePack *) &mFileBuffer[block_base], basis_bits, EOF_mask);
275
276    if (finalLineIsUnterminated()) {
277        // Add a LF at the EOF position
278        BitBlock EOF_pos = simd_not(simd_or(bitblock::slli<1>(simd_not(EOF_mask)), EOF_mask));
279        //  LF = 00001010  (bits 4 and 6 set).
280        basis_bits.bit_4 = simd_or(basis_bits.bit_4, EOF_pos);
281        basis_bits.bit_6 = simd_or(basis_bits.bit_6, EOF_pos);
282        // Add final sentinel byte so write_matches knows what to do.
283        mFileBuffer[mFileSize] = 0x0;
284    }
285   
286    mProcessBlockFcn(basis_bits, process_block_state_data, output);
287
288    if (mCountOnlyOption)
289    {
290        match_count += bitblock::popcount(match_vector);
291        if (bitblock::any(output.matches))
292        {
293            match_count += bitblock::popcount(output.matches);
294        }
295        if (mShowFileNameOption) {
296            std::cout << mFileName;
297        }
298        std::cout << match_count << std::endl;
299    }
300    else
301    {
302        mLineBreak_scanner.load_block(output.LF, blk);
303        mMatch_scanner.load_block(output.matches, blk);
304        blk++;
305        for (int i = blk; i < SEGMENT_BLOCKS; i++)
306        {
307            mLineBreak_scanner.load_block(simd<1>::constant<0>(), i);
308            mMatch_scanner.load_block(simd<1>::constant<0>(), i);
309        }
310        buffer_ptr = &mFileBuffer[segment_base];
311        line_start = write_matches(buffer_ptr, line_start);
312    }
313   
314    munmap((void *) mFileBuffer, mFileSize + mmap_sentinel_bytes);
315    close(fdSrc);
316   
317}
Note: See TracBrowser for help on using the repository browser.