source: icGREP/icgrep-devel/icgrep/do_grep.cpp @ 4790

Last change on this file since 4790 was 4790, checked in by hongpum, 4 years ago

Fixed a remaining issue (out of bound) in CL4788

If the file ends with a CR, we try to read one more byte past the end of the
buffer to look for an LF. Since we stopped mmapping sentinel bytes, this byte is
now outside the mapped memory region; reading it won't cause SIGBUS, but it is
now a "normal" out-of-bounds access that may read garbage or cause a segfault.

File size: 10.6 KB
Line 
1/*
2 *  Copyright (c) 2014 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "icgrep.h"
8#include "do_grep.h"
9
10#include <fstream>
11#include <sstream>
12#include <iostream>
13#include <string>
14#include <stdint.h>
15
16#include <stdio.h>
17#include <stdlib.h>
18#include <unistd.h>
19#include <errno.h>
20#include <sys/types.h>
21#include <sys/stat.h>
22#include <stdexcept>
23
24#include "include/simd-lib/carryQ.hpp"
25#include "include/simd-lib/pabloSupport.hpp"
26#include "include/simd-lib/s2p.hpp"
27#include "include/simd-lib/buffer.hpp"
28
29#include <llvm/Support/raw_os_ostream.h>
30
31// mmap system
32#ifdef USE_BOOST_MMAP
33#include <boost/filesystem.hpp>
34#include <boost/iostreams/device/mapped_file.hpp>
35using namespace boost::iostreams;
36using namespace boost::filesystem;
37#else
38#include <sys/mman.h>
39#endif
40#include <fcntl.h>
41
42
43#define BUFFER_SEGMENTS 15
44#define BUFFER_SIZE (BUFFER_SEGMENTS * SEGMENT_SIZE)
45
46//
47// Write matched lines from a buffer to an output file, given segment
48// scanners for line ends and matches (where matches are a subset of line ends).
49// The buffer pointer must point to the first byte of the segment
50// corresponding to the scanner indexes.   The first_line_start is the
51// start position of the first line relative to the buffer start position.
52// It must be zero or negative;  if negative, the buffer must permit negative
53// indexing so that the lineup to the buffer start position can also be printed.
54// The start position of the final line in the processed segment is returned.
55//
56
57ssize_t GrepExecutor::write_matches(llvm::raw_ostream & out, const char * buffer, ssize_t line_start) {
58
59    ssize_t match_pos;
60    ssize_t line_end;
61    while (mMatch_scanner.has_next()) {
62        match_pos = mMatch_scanner.scan_to_next();
63        // If we found a match, it must be at a line end.
64        while (true) {
65            line_end = mLineBreak_scanner.scan_to_next();
66            if (line_end >= match_pos) {
67                break;
68            }
69            line_start = line_end + 1;
70            mLineNum++;
71        }
72        assert (buffer + line_end < mFileBuffer + mFileSize);
73        if (mShowFileNameOption) {
74            out << mFileName << ':';
75        }
76        if (mShowLineNumberingOption) {
77            out << mLineNum << ":";
78        }
79        if ((buffer[line_start] == 0xA) && (line_start != line_end)) {
80            // The LF of a CRLF.  Really the end of the last line.
81            line_start++;
82        }
83        unsigned char end_byte = (unsigned char)buffer[line_end];
84        if (mNormalizeLineBreaksOption) {
85            if (end_byte == 0x85) {
86                // Line terminated with NEL, on the second byte.  Back up 1.
87                line_end--;
88            } else if (end_byte > 0xD) {
89                // Line terminated with PS or LS, on the third byte.  Back up 2.
90                line_end -= 2;
91            }
92            out.write(&buffer[line_start], line_end - line_start);
93            out << '\n';
94        }
95        else {
96            if (end_byte == 0x0) {
97                // This must be a sentinel byte position at the end of file.
98                // Do not write it.
99                line_end--;
100            } else if (end_byte == 0x0D) {
101                // Check for line_end on first byte of CRLF;  note that we don't
102                // want to access past the end of buffer.
103                if ((buffer + line_end + 1 < mFileBuffer + mFileSize) && (buffer[line_end + 1] == 0x0A)) {
104                    // Found CRLF; preserve both bytes.
105                    line_end++;
106                }
107            }
108            out.write(&buffer[line_start], line_end - line_start + 1);
109        }
110        line_start = line_end + 1;
111        mLineNum++;
112    }
113    while(mLineBreak_scanner.has_next()) {
114        line_end = mLineBreak_scanner.scan_to_next();
115        line_start = line_end+1;
116        mLineNum++;
117    }
118    return line_start;
119}
120
121bool GrepExecutor::finalLineIsUnterminated() const {
122    if (mFileSize == 0) return false;
123    unsigned char end_byte = static_cast<unsigned char>(mFileBuffer[mFileSize-1]);
124    // LF through CR are line break characters
125    if ((end_byte >= 0xA) && (end_byte <= 0xD)) return false;
126    // Other line breaks require at least two bytes.
127    if (mFileSize == 1) return true;
128    // NEL
129    unsigned char penult_byte = static_cast<unsigned char>(mFileBuffer[mFileSize-2]);
130    if ((end_byte == 0x85) && (penult_byte == 0xC2)) return false;
131    if (mFileSize == 2) return true;
132    // LS and PS
133    if ((end_byte < 0xA8) || (end_byte > 0xA9)) return true;
134    return (static_cast<unsigned char>(mFileBuffer[mFileSize-3]) != 0xE2) || (penult_byte != 0x80);
135}
136
137void GrepExecutor::doGrep(const std::string & fileName) {
138
139    Basis_bits basis_bits;
140    BitBlock match_vector = simd<1>::constant<0>();
141    size_t match_count = 0;
142    size_t chars_avail = 0;
143    ssize_t line_start = 0;
144
145    mFileName = fileName;
146    mLineNum = 1;
147
148#ifdef USE_BOOST_MMAP
149    const path file(mFileName);
150    if (exists(file)) {
151        if (is_directory(file)) {
152            return;
153        }
154    } else {
155        std::cerr << "Error: cannot open " << mFileName << " for processing. Skipped.\n";
156        return;
157    }
158
159    mFileSize = file_size(file);
160    mapped_file mFile;
161    try {
162        mFile.open(mFileName, mapped_file::priv, mFileSize, 0);
163    } catch (std::ios_base::failure e) {
164        std::cerr << "Error: Boost mmap " << e.what() << std::endl;
165        return;
166    }
167    mFileBuffer = mFile.data();
168#else
169    struct stat infile_sb;
170    const int fdSrc = open(mFileName.c_str(), O_RDONLY);
171    if (fdSrc == -1) {
172        std::cerr << "Error: cannot open " << mFileName << " for processing. Skipped.\n";
173        return;
174    }
175    if (fstat(fdSrc, &infile_sb) == -1) {
176        std::cerr << "Error: cannot stat " << mFileName << " for processing. Skipped.\n";
177        close (fdSrc);
178        return;
179    }
180    if (S_ISDIR(infile_sb.st_mode)) {
181        close (fdSrc);
182        return;
183    }
184    mFileSize = infile_sb.st_size;
185    mFileBuffer = (char *) mmap(NULL, mFileSize, PROT_READ, MAP_PRIVATE, fdSrc, 0);
186    if (mFileBuffer == MAP_FAILED) {
187        if (errno ==  ENOMEM) {
188            std::cerr << "Error:  mmap of " << mFileName << " failed: out of memory\n";
189        }
190        else {
191            std::cerr << "Error: mmap of " << mFileName << " failed with errno " << errno << ". Skipped.\n";
192        }
193        return;
194    }
195#endif
196    size_t segment = 0;
197    chars_avail = mFileSize;
198
199    llvm::raw_os_ostream out(std::cout);
200    //////////////////////////////////////////////////////////////////////////////////////////
201    // Full Segments
202    //////////////////////////////////////////////////////////////////////////////////////////
203
204    while (chars_avail >= SEGMENT_SIZE) {
205
206        mLineBreak_scanner.init();
207        mMatch_scanner.init();
208
209        for (size_t blk = 0; blk != SEGMENT_BLOCKS; ++blk) {
210            s2p_do_block(reinterpret_cast<BytePack *>(mFileBuffer + (blk * BLOCK_SIZE) + (segment * SEGMENT_SIZE)), basis_bits);
211            Output output;
212            mProcessBlockFcn(basis_bits, output);
213
214            mMatch_scanner.load_block(output.matches, blk);
215            mLineBreak_scanner.load_block(output.LF, blk);
216
217            if (mCountOnlyOption) {
218                if (bitblock::any(output.matches)) {
219                    if (bitblock::any(simd_and(match_vector, output.matches))) {
220                        match_count += bitblock::popcount(match_vector);
221                        match_vector = output.matches;
222                    } else {
223                        match_vector = simd_or(match_vector, output.matches);
224                    }
225                }
226            }
227        }
228
229        if (!mCountOnlyOption) {
230            line_start = write_matches(out, mFileBuffer + (segment * SEGMENT_SIZE), line_start);
231        }
232        segment++;
233        line_start -= SEGMENT_SIZE;  /* Will be negative offset for use within next segment. */
234        chars_avail -= SEGMENT_SIZE;
235    }
236
237    //////////////////////////////////////////////////////////////////////////////////////////
238    // For the Final Partial Segment.
239    //////////////////////////////////////////////////////////////////////////////////////////
240
241    size_t remaining = chars_avail;
242    size_t blk = 0;
243
244    mLineBreak_scanner.init();
245    mMatch_scanner.init();
246
247    /* Full Blocks */
248    for (; remaining >= BLOCK_SIZE; remaining -= BLOCK_SIZE, ++blk) {
249        s2p_do_block(reinterpret_cast<BytePack *>(mFileBuffer + (blk * BLOCK_SIZE) + (segment * SEGMENT_SIZE)), basis_bits);
250        Output output;
251        mProcessBlockFcn(basis_bits, output);
252
253        mLineBreak_scanner.load_block(output.LF, blk);
254        mMatch_scanner.load_block(output.matches, blk);
255        if (mCountOnlyOption) {
256            if (bitblock::any(output.matches)) {
257                if (bitblock::any(simd_and(match_vector, output.matches))) {
258                    match_count += bitblock::popcount(match_vector);
259                    match_vector = output.matches;
260                } else {
261                    match_vector = simd_or(match_vector, output.matches);
262                }
263            }
264        }
265    }
266
267    //Final Partial Block (may be empty, but there could be carries pending).
268   
269    const auto EOF_mask = bitblock::srl(simd<1>::constant<1>(), convert(BLOCK_SIZE - remaining));
270   
271    s2p_do_final_block(reinterpret_cast<BytePack *>(mFileBuffer + (blk * BLOCK_SIZE) + (segment * SEGMENT_SIZE)), basis_bits, EOF_mask);
272
273    if (finalLineIsUnterminated()) {
274        // Add a LF at the EOF position
275        BitBlock EOF_pos = simd_not(simd_or(bitblock::slli<1>(simd_not(EOF_mask)), EOF_mask));
276        //  LF = 00001010  (bits 4 and 6 set).
277        basis_bits.bit_4 = simd_or(basis_bits.bit_4, EOF_pos);
278        basis_bits.bit_6 = simd_or(basis_bits.bit_6, EOF_pos);
279    }
280   
281    Output output;
282    mProcessBlockFcn(basis_bits, output);
283
284    if (mCountOnlyOption) {
285        match_count += bitblock::popcount(match_vector);
286        if (bitblock::any(output.matches)) {
287            match_count += bitblock::popcount(output.matches);
288        }
289        if (mShowFileNameOption) {
290            out << mFileName << ':';
291        }
292        out << match_count << '\n';
293    } else {
294        mLineBreak_scanner.load_block(output.LF, blk);
295        mMatch_scanner.load_block(output.matches, blk);
296        while (++blk < SEGMENT_BLOCKS) {
297            mLineBreak_scanner.load_block(simd<1>::constant<0>(), blk);
298            mMatch_scanner.load_block(simd<1>::constant<0>(), blk);
299        }
300        line_start = write_matches(out, mFileBuffer + (segment * SEGMENT_SIZE), line_start);
301    }
302#ifdef USE_BOOST_MMAP
303    mFile.close();
304#else
305    munmap((void *)mFileBuffer, mFileSize);
306    close(fdSrc);
307#endif   
308}
Note: See TracBrowser for help on using the repository browser.