source: icGREP/icgrep-devel/icgrep/do_grep.cpp @ 4778

Last change on this file since 4778 was 4778, checked in by cameron, 4 years ago

Hongpu's option to use Boost mmap; fix an include for std::iota

File size: 11.0 KB
Line 
1/*
2 *  Copyright (c) 2014 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "icgrep.h"
8#include "do_grep.h"
9
10#include <fstream>
11#include <sstream>
12#include <iostream>
13#include <string>
14#include <stdint.h>
15
16#include <stdio.h>
17#include <stdlib.h>
18#include <unistd.h>
19#include <errno.h>
20#include <sys/types.h>
21#include <sys/stat.h>
22#include <stdexcept>
23
24#include "include/simd-lib/carryQ.hpp"
25#include "include/simd-lib/pabloSupport.hpp"
26#include "include/simd-lib/s2p.hpp"
27#include "include/simd-lib/buffer.hpp"
28
29// mmap system
30#ifdef USE_BOOST_MMAP
31#include <boost/iostreams/device/mapped_file.hpp>
32#else
33#include <sys/mman.h>
34#endif
35#include <fcntl.h>
36
37
// Buffer sizing: a buffer is BUFFER_SEGMENTS segments of SEGMENT_SIZE each.
// SEGMENT_SIZE is not defined in this file (presumably it comes from one of
// the simd-lib headers -- TODO confirm).
#define BUFFER_SEGMENTS 15
#define BUFFER_SIZE (BUFFER_SEGMENTS * SEGMENT_SIZE)

// Mask handed to s2p_do_final_block for the last (possibly partial) block of
// a file.  It starts out as the constant-1 bit block and is recomputed by
// GrepExecutor::doGrep from the number of bytes remaining in the final block.
BitBlock EOF_mask = simd<1>::constant<1>();
42
43//
44// Write matched lines from a buffer to an output file, given segment
45// scanners for line ends and matches (where matches are a subset of line ends).
46// The buffer pointer must point to the first byte of the segment
47// corresponding to the scanner indexes.   The first_line_start is the
48// start position of the first line relative to the buffer start position.
49// It must be zero or negative;  if negative, the buffer must permit negative
50// indexing so that the lineup to the buffer start position can also be printed.
51// The start position of the final line in the processed segment is returned.
52//
53
54ssize_t GrepExecutor::write_matches(char * buffer, ssize_t first_line_start) {
55
56  ssize_t line_start = first_line_start;
57  ssize_t match_pos;
58  ssize_t line_end;
59  while (mMatch_scanner.has_next()) {
60    match_pos = mMatch_scanner.scan_to_next();
61    // If we found a match, it must be at a line end.
62    line_end = mLineBreak_scanner.scan_to_next();
63    while (line_end < match_pos) {
64      line_start = line_end + 1;
65      line_no++;
66      line_end = mLineBreak_scanner.scan_to_next();
67    }
68    if (mShowFileNameOption) {
69      std::cout << mFileName;
70    }
71    if (mShowLineNumberingOption) {
72      std::cout << line_no << ":";
73    }
74    if ((buffer[line_start] == 0xA) && (line_start != line_end)) {
75        // The LF of a CRLF.  Really the end of the last line. 
76        line_start++;
77    }
78    unsigned char end_byte = (unsigned char) buffer[line_end];
79    if (mNormalizeLineBreaksOption) {
80      if (end_byte == 0x85) {
81          // Line terminated with NEL, on the second byte.  Back up 1.
82          line_end--;
83      }
84      else if (end_byte > 0xD) {
85          // Line terminated with PS or LS, on the third byte.  Back up 2.
86          line_end -= 2;
87      }
88      std::cout.write(&buffer[line_start], line_end - line_start);
89      std::cout << std::endl;
90    }
91    else {
92      if (end_byte == 0x0) {
93          // This must be a sentinel byte position at the end of file.
94          // Do not write it.
95          line_end--;
96      }
97      else if (end_byte == 0x0D) {
98          // Check for line_end on first byte of CRLF;  note that to safely
99          // access past line_end, even at the end of buffer, we require the
100          // mmap_sentinel_bytes >= 1.
101          if (buffer[line_end + 1] == 0x0A) { 
102              // Found CRLF; preserve both bytes.
103              line_end++;
104          }
105      }
106      std::cout.write(&buffer[line_start], line_end - line_start + 1);
107    }
108    line_start = line_end + 1;
109    line_no++;
110  }
111  while(mLineBreak_scanner.has_next()) {
112    line_end = mLineBreak_scanner.scan_to_next();
113    line_start = line_end+1;
114    line_no++;
115  }
116  return line_start;
117}
118
119bool GrepExecutor::finalLineIsUnterminated() {
120    if (mFileSize == 0) return false;
121    unsigned char end_byte = static_cast<unsigned char>(mFileBuffer[mFileSize-1]);
122    // LF through CR are line break characters
123    if ((end_byte >= 0xA) && (end_byte <= 0xD)) return false;
124    // Other line breaks require at least two bytes.
125    if (mFileSize == 1) return true;
126    // NEL 
127    unsigned char penult_byte = static_cast<unsigned char>(mFileBuffer[mFileSize-2]);
128    if ((end_byte == 0x85) && (penult_byte == 0xC2)) return false;
129    if (mFileSize == 2) return true;
130    // LS and PS
131    if ((end_byte < 0xA8) || (end_byte > 0xA9)) return true;
132    return (static_cast<unsigned char>(mFileBuffer[mFileSize-3]) != 0xE2) || (penult_byte != 0x80);
133}
134
135void GrepExecutor::doGrep(const std::string infilename) {
136
137    Basis_bits basis_bits;
138    BitBlock match_vector;
139   
140    mFileName = infilename + ":";
141   
142    size_t match_count = 0;
143    size_t blk = 0;
144    size_t block_base  = 0;
145    size_t block_pos   = 0;
146    size_t chars_avail = 0;
147    ssize_t line_start = 0;
148    line_no = 1;
149
150    match_vector = simd<1>::constant<0>();
151    int fdSrc;
152    struct stat infile_sb;
153    fdSrc = open(infilename.c_str(), O_RDONLY);
154    if (fdSrc == -1) {
155        std::cerr << "Error: cannot open " << infilename << " for processing. Skipped.\n";
156        return;
157    }
158    if (fstat(fdSrc, &infile_sb) == -1) {
159        std::cerr << "Error: cannot stat " << infilename << " for processing. Skipped.\n";
160        return;
161    }
162    if (S_ISDIR(infile_sb.st_mode)) {
163        // Silently ignore directories.
164        // std::cerr << "Error: " << infilename << " is a directory. Skipped.\n";
165        return;
166    }
167    mFileSize = infile_sb.st_size;
168    // Set 2 sentinel bytes, 1 for possible addition of LF for unterminated last line,
169    // 1 guard byte.  PROT_WRITE enables writing the sentinel.
170    const size_t mmap_sentinel_bytes = 2; 
171#ifdef USE_BOOST_MMAP
172    boost::iostreams::mapped_file mFile;
173    try {
174        mFile.open(
175            infilename,
176            boost::iostreams::mapped_file_base::mapmode::priv,
177            mFileSize + mmap_sentinel_bytes, 0
178        );
179    } catch (std::ios_base::failure e) {
180        std::cerr << "Error: Boost mmap " << e.what() << std::endl;
181        return;
182    }
183    mFileBuffer = mFile.data();
184#else
185    mFileBuffer = (char *) mmap(NULL, mFileSize + mmap_sentinel_bytes, PROT_READ|PROT_WRITE, MAP_PRIVATE, fdSrc, 0);
186    if (mFileBuffer == MAP_FAILED) {
187        if (errno ==  ENOMEM) {
188            std::cerr << "Error:  mmap of " << infilename << " failed: out of memory\n";
189        }
190        else {
191            std::cerr << "Error: mmap of " << infilename << " failed with errno " << errno << ". Skipped.\n";
192        }
193        return;
194    }
195#endif
196    char * buffer_ptr;
197    size_t segment = 0;
198    size_t segment_base = 0;
199    chars_avail = mFileSize;
200   
201//////////////////////////////////////////////////////////////////////////////////////////
202// Full Segments
203//////////////////////////////////////////////////////////////////////////////////////////
204
205    while (chars_avail >= SEGMENT_SIZE) {
206
207        segment_base = segment * SEGMENT_SIZE;
208        mLineBreak_scanner.init();
209        mMatch_scanner.init();
210
211        for (blk = 0; blk < SEGMENT_BLOCKS; blk++) {
212            block_base = blk*BLOCK_SIZE + segment_base;
213            s2p_do_block((BytePack *) &mFileBuffer[block_base], basis_bits);
214            Output output;
215            mProcessBlockFcn(basis_bits, output);
216
217            mMatch_scanner.load_block(output.matches, blk);
218            mLineBreak_scanner.load_block(output.LF, blk);
219
220            if (mCountOnlyOption){
221                if (bitblock::any(output.matches))
222                {
223                    if (bitblock::any(simd_and(match_vector, output.matches))){
224                        match_count += bitblock::popcount(match_vector);
225                        match_vector = output.matches;
226                    }
227                    else
228                    {
229                        match_vector = simd_or(match_vector, output.matches);
230                    }
231                }
232            }
233        }
234
235        buffer_ptr = &mFileBuffer[segment_base];
236
237        if (!mCountOnlyOption) {
238          line_start = write_matches(buffer_ptr, line_start);
239        }
240        segment++;
241        line_start -= SEGMENT_SIZE;  /* Will be negative offset for use within next segment. */
242        chars_avail -= SEGMENT_SIZE;
243    }
244
245//////////////////////////////////////////////////////////////////////////////////////////
246// For the Final Partial Segment.
247//////////////////////////////////////////////////////////////////////////////////////////
248
249    segment_base = segment * SEGMENT_SIZE;
250    int remaining = chars_avail;
251       
252
253    mLineBreak_scanner.init();
254    mMatch_scanner.init();
255
256    /* Full Blocks */
257    blk = 0;
258    while (remaining >= BLOCK_SIZE) {
259        block_base = block_pos + segment_base;
260        s2p_do_block((BytePack *) &mFileBuffer[block_base], basis_bits);
261        Output output;
262        mProcessBlockFcn(basis_bits, output);
263
264        mLineBreak_scanner.load_block(output.LF, blk);
265        mMatch_scanner.load_block(output.matches, blk);
266        if (mCountOnlyOption)
267        {
268            if (bitblock::any(output.matches))
269            {
270                if (bitblock::any(simd_and(match_vector, output.matches)))
271                {
272                    match_count += bitblock::popcount(match_vector);
273                    match_vector = output.matches;
274                }
275                else
276                {
277                    match_vector = simd_or(match_vector, output.matches);
278                }
279            }
280        }
281
282        block_pos += BLOCK_SIZE;
283        remaining -= BLOCK_SIZE;
284        blk++;
285    }
286    block_base = block_pos;
287
288    //Final Partial Block (may be empty, but there could be carries pending).
289   
290    EOF_mask = bitblock::srl(simd<1>::constant<1>(), convert(BLOCK_SIZE-remaining));
291   
292    block_base = block_pos + segment_base;
293    s2p_do_final_block((BytePack *) &mFileBuffer[block_base], basis_bits, EOF_mask);
294
295    if (finalLineIsUnterminated()) {
296        // Add a LF at the EOF position
297        BitBlock EOF_pos = simd_not(simd_or(bitblock::slli<1>(simd_not(EOF_mask)), EOF_mask));
298        //  LF = 00001010  (bits 4 and 6 set).
299        basis_bits.bit_4 = simd_or(basis_bits.bit_4, EOF_pos);
300        basis_bits.bit_6 = simd_or(basis_bits.bit_6, EOF_pos);
301        // Add final sentinel byte so write_matches knows what to do.
302        mFileBuffer[mFileSize] = 0x0;
303    }
304   
305    Output output;
306    mProcessBlockFcn(basis_bits, output);
307
308    if (mCountOnlyOption)
309    {
310        match_count += bitblock::popcount(match_vector);
311        if (bitblock::any(output.matches))
312        {
313            match_count += bitblock::popcount(output.matches);
314        }
315        if (mShowFileNameOption) {
316            std::cout << mFileName;
317        }
318        std::cout << match_count << std::endl;
319    }
320    else
321    {
322        mLineBreak_scanner.load_block(output.LF, blk);
323        mMatch_scanner.load_block(output.matches, blk);
324        blk++;
325        for (int i = blk; i < SEGMENT_BLOCKS; i++)
326        {
327            mLineBreak_scanner.load_block(simd<1>::constant<0>(), i);
328            mMatch_scanner.load_block(simd<1>::constant<0>(), i);
329        }
330        buffer_ptr = &mFileBuffer[segment_base];
331        line_start = write_matches(buffer_ptr, line_start);
332    }
333   
334#ifdef USE_BOOST_MMAP
335    mFile.close();
336#else
337    munmap((void *) mFileBuffer, mFileSize + mmap_sentinel_bytes);
338#endif
339    close(fdSrc);
340   
341}
Note: See TracBrowser for help on using the repository browser.