source: icGREP/icgrep-devel/icgrep/do_grep.cpp @ 4793

Last change on this file since 4793 was 4792, checked in by cameron, 4 years ago

Fix segfaults at page boundary

File size: 11.2 KB
Line 
1/*
2 *  Copyright (c) 2014 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "icgrep.h"
8#include "do_grep.h"
9
10#include <fstream>
11#include <sstream>
12#include <iostream>
13#include <string>
14#include <stdint.h>
15
16#include <stdio.h>
17#include <stdlib.h>
18#include <unistd.h>
19#include <errno.h>
20#include <sys/types.h>
21#include <sys/stat.h>
22#include <stdexcept>
23
24#include "include/simd-lib/carryQ.hpp"
25#include "include/simd-lib/pabloSupport.hpp"
26#include "include/simd-lib/s2p.hpp"
27#include "include/simd-lib/buffer.hpp"
28
29#include <llvm/Support/raw_os_ostream.h>
30
31// mmap system
32#ifdef USE_BOOST_MMAP
33#include <boost/filesystem.hpp>
34#include <boost/iostreams/device/mapped_file.hpp>
35using namespace boost::iostreams;
36using namespace boost::filesystem;
37#else
38#include <sys/mman.h>
39#endif
40#include <fcntl.h>
41
42
43#define BUFFER_SEGMENTS 15
44#define BUFFER_SIZE (BUFFER_SEGMENTS * SEGMENT_SIZE)
45
46//
47// Write matched lines from a buffer to an output file, given segment
48// scanners for line ends and matches (where matches are a subset of line ends).
49// The buffer pointer must point to the first byte of the segment
50// corresponding to the scanner indexes.   The first_line_start is the
51// start position of the first line relative to the buffer start position.
52// It must be zero or negative;  if negative, the buffer must permit negative
53// indexing so that the lineup to the buffer start position can also be printed.
54// The start position of the final line in the processed segment is returned.
55//
56
57ssize_t GrepExecutor::write_matches(llvm::raw_ostream & out, const char * buffer, ssize_t line_start) {
58
59    ssize_t match_pos;
60    ssize_t line_end;
61    while (mMatch_scanner.has_next()) {
62        match_pos = mMatch_scanner.scan_to_next();
63        // If we found a match, it must be at a line end.
64        while (true) {
65            line_end = mLineBreak_scanner.scan_to_next();
66            if (line_end >= match_pos) {
67                break;
68            }
69            line_start = line_end + 1;
70            mLineNum++;
71        }
72        assert (buffer + line_end < mFileBuffer + mFileSize);
73        if (mShowFileNameOption) {
74            out << mFileName << ':';
75        }
76        if (mShowLineNumberingOption) {
77            out << mLineNum << ":";
78        }
79        if ((buffer[line_start] == 0xA) && (line_start != line_end)) {
80            // The LF of a CRLF.  Really the end of the last line.
81            line_start++;
82        }
83        unsigned char end_byte = (unsigned char)buffer[line_end];
84        if (mNormalizeLineBreaksOption) {
85            if (end_byte == 0x85) {
86                // Line terminated with NEL, on the second byte.  Back up 1.
87                line_end--;
88            } else if (end_byte > 0xD) {
89                // Line terminated with PS or LS, on the third byte.  Back up 2.
90                line_end -= 2;
91            }
92            out.write(&buffer[line_start], line_end - line_start);
93            out << '\n';
94        }
95        else {
96            if (end_byte == 0x0) {
97                // This must be a sentinel byte position at the end of file.
98                // Do not write it.
99                line_end--;
100            } else if (end_byte == 0x0D) {
101                // Check for line_end on first byte of CRLF;  note that we don't
102                // want to access past the end of buffer.
103                if ((buffer + line_end + 1 < mFileBuffer + mFileSize) && (buffer[line_end + 1] == 0x0A)) {
104                    // Found CRLF; preserve both bytes.
105                    line_end++;
106                }
107            }
108            out.write(&buffer[line_start], line_end - line_start + 1);
109        }
110        line_start = line_end + 1;
111        mLineNum++;
112    }
113    while(mLineBreak_scanner.has_next()) {
114        line_end = mLineBreak_scanner.scan_to_next();
115        line_start = line_end+1;
116        mLineNum++;
117    }
118    return line_start;
119}
120
121bool GrepExecutor::finalLineIsUnterminated() const {
122    if (mFileSize == 0) return false;
123    unsigned char end_byte = static_cast<unsigned char>(mFileBuffer[mFileSize-1]);
124    // LF through CR are line break characters
125    if ((end_byte >= 0xA) && (end_byte <= 0xD)) return false;
126    // Other line breaks require at least two bytes.
127    if (mFileSize == 1) return true;
128    // NEL
129    unsigned char penult_byte = static_cast<unsigned char>(mFileBuffer[mFileSize-2]);
130    if ((end_byte == 0x85) && (penult_byte == 0xC2)) return false;
131    if (mFileSize == 2) return true;
132    // LS and PS
133    if ((end_byte < 0xA8) || (end_byte > 0xA9)) return true;
134    return (static_cast<unsigned char>(mFileBuffer[mFileSize-3]) != 0xE2) || (penult_byte != 0x80);
135}
136
137void GrepExecutor::doGrep(const std::string & fileName) {
138
139    Basis_bits basis_bits;
140    BitBlock match_vector = simd<1>::constant<0>();
141    size_t match_count = 0;
142    size_t chars_avail = 0;
143    ssize_t line_start = 0;
144
145    mFileName = fileName;
146    mLineNum = 1;
147
148#ifdef USE_BOOST_MMAP
149    const path file(mFileName);
150    if (exists(file)) {
151        if (is_directory(file)) {
152            return;
153        }
154    } else {
155        std::cerr << "Error: cannot open " << mFileName << " for processing. Skipped.\n";
156        return;
157    }
158
159    mFileSize = file_size(file);
160    mapped_file mFile;
161    try {
162        mFile.open(mFileName, mapped_file::priv, mFileSize, 0);
163    } catch (std::ios_base::failure e) {
164        std::cerr << "Error: Boost mmap " << e.what() << std::endl;
165        return;
166    }
167    mFileBuffer = mFile.data();
168#else
169    struct stat infile_sb;
170    const int fdSrc = open(mFileName.c_str(), O_RDONLY);
171    if (fdSrc == -1) {
172        std::cerr << "Error: cannot open " << mFileName << " for processing. Skipped.\n";
173        return;
174    }
175    if (fstat(fdSrc, &infile_sb) == -1) {
176        std::cerr << "Error: cannot stat " << mFileName << " for processing. Skipped.\n";
177        close (fdSrc);
178        return;
179    }
180    if (S_ISDIR(infile_sb.st_mode)) {
181        close (fdSrc);
182        return;
183    }
184    mFileSize = infile_sb.st_size;
185    mFileBuffer = (char *) mmap(NULL, mFileSize, PROT_READ, MAP_PRIVATE, fdSrc, 0);
186    if (mFileBuffer == MAP_FAILED) {
187        if (errno ==  ENOMEM) {
188            std::cerr << "Error:  mmap of " << mFileName << " failed: out of memory\n";
189        }
190        else {
191            std::cerr << "Error: mmap of " << mFileName << " failed with errno " << errno << ". Skipped.\n";
192        }
193        return;
194    }
195#endif
196    size_t segment = 0;
197    chars_avail = mFileSize;
198
199    llvm::raw_os_ostream out(std::cout);
200    //////////////////////////////////////////////////////////////////////////////////////////
201    // Full Segments
202    //////////////////////////////////////////////////////////////////////////////////////////
203
204    while (chars_avail >= SEGMENT_SIZE) {
205
206        mLineBreak_scanner.init();
207        mMatch_scanner.init();
208
209        for (size_t blk = 0; blk != SEGMENT_BLOCKS; ++blk) {
210            s2p_do_block(reinterpret_cast<BytePack *>(mFileBuffer + (blk * BLOCK_SIZE) + (segment * SEGMENT_SIZE)), basis_bits);
211            Output output;
212            mProcessBlockFcn(basis_bits, output);
213
214            mMatch_scanner.load_block(output.matches, blk);
215            mLineBreak_scanner.load_block(output.LF, blk);
216
217            if (mCountOnlyOption) {
218                if (bitblock::any(output.matches)) {
219                    if (bitblock::any(simd_and(match_vector, output.matches))) {
220                        match_count += bitblock::popcount(match_vector);
221                        match_vector = output.matches;
222                    } else {
223                        match_vector = simd_or(match_vector, output.matches);
224                    }
225                }
226            }
227        }
228
229        if (!mCountOnlyOption) {
230            line_start = write_matches(out, mFileBuffer + (segment * SEGMENT_SIZE), line_start);
231        }
232        segment++;
233        line_start -= SEGMENT_SIZE;  /* Will be negative offset for use within next segment. */
234        chars_avail -= SEGMENT_SIZE;
235    }
236
237    //////////////////////////////////////////////////////////////////////////////////////////
238    // For the Final Partial Segment.
239    //////////////////////////////////////////////////////////////////////////////////////////
240
241    size_t remaining = chars_avail;
242    size_t blk = 0;
243
244    mLineBreak_scanner.init();
245    mMatch_scanner.init();
246
247    /* Full Blocks */
248    for (; remaining >= BLOCK_SIZE; remaining -= BLOCK_SIZE, ++blk) {
249        s2p_do_block(reinterpret_cast<BytePack *>(mFileBuffer + (blk * BLOCK_SIZE) + (segment * SEGMENT_SIZE)), basis_bits);
250        Output output;
251        mProcessBlockFcn(basis_bits, output);
252
253        mLineBreak_scanner.load_block(output.LF, blk);
254        mMatch_scanner.load_block(output.matches, blk);
255        if (mCountOnlyOption) {
256            if (bitblock::any(output.matches)) {
257                if (bitblock::any(simd_and(match_vector, output.matches))) {
258                    match_count += bitblock::popcount(match_vector);
259                    match_vector = output.matches;
260                } else {
261                    match_vector = simd_or(match_vector, output.matches);
262                }
263            }
264        }
265    }
266
267    //Final Partial Block (may be empty, but there could be carries pending).
268   
269   
270    const auto EOF_mask = bitblock::srl(simd<1>::constant<1>(), convert(BLOCK_SIZE - remaining));
271   
272    if (remaining == 0) {  // No data, we may be at a page boundary.   Do not access memory.
273        basis_bits.bit_0 = simd<1>::constant<0>();
274        basis_bits.bit_1 = simd<1>::constant<0>();
275        basis_bits.bit_2 = simd<1>::constant<0>();
276        basis_bits.bit_3 = simd<1>::constant<0>();
277        basis_bits.bit_4 = simd<1>::constant<0>();
278        basis_bits.bit_5 = simd<1>::constant<0>();
279        basis_bits.bit_6 = simd<1>::constant<0>();
280        basis_bits.bit_7 = simd<1>::constant<0>();
281    }
282    else { // At least 1 byte, so we are not at a page boundary yet, safe to access a full block.
283        s2p_do_final_block(reinterpret_cast<BytePack *>(mFileBuffer + (blk * BLOCK_SIZE) + (segment * SEGMENT_SIZE)), basis_bits, EOF_mask);
284    }
285
286    if (finalLineIsUnterminated()) {
287        // Add a LF at the EOF position
288        BitBlock EOF_pos = simd_not(simd_or(bitblock::slli<1>(simd_not(EOF_mask)), EOF_mask));
289        //  LF = 00001010  (bits 4 and 6 set).
290        basis_bits.bit_4 = simd_or(basis_bits.bit_4, EOF_pos);
291        basis_bits.bit_6 = simd_or(basis_bits.bit_6, EOF_pos);
292    }
293   
294    Output output;
295    mProcessBlockFcn(basis_bits, output);
296
297    if (mCountOnlyOption) {
298        match_count += bitblock::popcount(match_vector);
299        if (bitblock::any(output.matches)) {
300            match_count += bitblock::popcount(output.matches);
301        }
302        if (mShowFileNameOption) {
303            out << mFileName << ':';
304        }
305        out << match_count << '\n';
306    } else {
307        mLineBreak_scanner.load_block(output.LF, blk);
308        mMatch_scanner.load_block(output.matches, blk);
309        while (++blk < SEGMENT_BLOCKS) {
310            mLineBreak_scanner.load_block(simd<1>::constant<0>(), blk);
311            mMatch_scanner.load_block(simd<1>::constant<0>(), blk);
312        }
313        line_start = write_matches(out, mFileBuffer + (segment * SEGMENT_SIZE), line_start);
314    }
315#ifdef USE_BOOST_MMAP
316    mFile.close();
317#else
318    munmap((void *)mFileBuffer, mFileSize);
319    close(fdSrc);
320#endif   
321}
Note: See TracBrowser for help on using the repository browser.