source: icGREP/icgrep-devel/icgrep/do_grep.cpp @ 4795

Last change on this file since 4795 was 4795, checked in by cameron, 3 years ago

Remove assert; handle buffer + line_end == mFileBuffer + mFileSize

File size: 11.3 KB
Line 
1/*
2 *  Copyright (c) 2014 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "icgrep.h"
8#include "do_grep.h"
9
10#include <fstream>
11#include <sstream>
12#include <iostream>
13#include <string>
14#include <stdint.h>
15
16#include <stdio.h>
17#include <stdlib.h>
18#include <unistd.h>
19#include <errno.h>
20#include <sys/types.h>
21#include <sys/stat.h>
22#include <stdexcept>
23
24#include "include/simd-lib/carryQ.hpp"
25#include "include/simd-lib/pabloSupport.hpp"
26#include "include/simd-lib/s2p.hpp"
27#include "include/simd-lib/buffer.hpp"
28
29#include <llvm/Support/raw_os_ostream.h>
30
31// mmap system
32#ifdef USE_BOOST_MMAP
33#include <boost/filesystem.hpp>
34#include <boost/iostreams/device/mapped_file.hpp>
35using namespace boost::iostreams;
36using namespace boost::filesystem;
37#else
38#include <sys/mman.h>
39#endif
40#include <fcntl.h>
41
42
43#define BUFFER_SEGMENTS 15
44#define BUFFER_SIZE (BUFFER_SEGMENTS * SEGMENT_SIZE)
45
46//
47// Write matched lines from a buffer to an output file, given segment
48// scanners for line ends and matches (where matches are a subset of line ends).
49// The buffer pointer must point to the first byte of the segment
50// corresponding to the scanner indexes.   The first_line_start is the
51// start position of the first line relative to the buffer start position.
52// It must be zero or negative;  if negative, the buffer must permit negative
53// indexing so that the lineup to the buffer start position can also be printed.
54// The start position of the final line in the processed segment is returned.
55//
56
57ssize_t GrepExecutor::write_matches(llvm::raw_ostream & out, const char * buffer, ssize_t line_start) {
58
59    ssize_t match_pos;
60    ssize_t line_end;
61    while (mMatch_scanner.has_next()) {
62        match_pos = mMatch_scanner.scan_to_next();
63        // If we found a match, it must be at a line end.
64        while (true) {
65            line_end = mLineBreak_scanner.scan_to_next();
66            if (line_end >= match_pos) {
67                break;
68            }
69            line_start = line_end + 1;
70            mLineNum++;
71        }
72        if (mShowFileNameOption) {
73            out << mFileName << ':';
74        }
75        if (mShowLineNumberingOption) {
76            out << mLineNum << ":";
77        }
78        if ((buffer[line_start] == 0xA) && (line_start != line_end)) {
79            // The line "starts" on the LF of a CRLF.  Really the end of the last line.
80            line_start++;
81        }
82        if (buffer + line_end == mFileBuffer + mFileSize) {
83            // The match position is at end-of-file.   We have a final unterminated line.
84            out.write(&buffer[line_start], line_end - line_start);
85            if (mNormalizeLineBreaksOption) {
86              out << '\n';  // terminate it
87            }
88            return line_end;
89        }
90        unsigned char end_byte = (unsigned char)buffer[line_end];
91        if (mNormalizeLineBreaksOption) {
92            if (end_byte == 0x85) {
93                // Line terminated with NEL, on the second byte.  Back up 1.
94                line_end--;
95            } else if (end_byte > 0xD) {
96                // Line terminated with PS or LS, on the third byte.  Back up 2.
97                line_end -= 2;
98            }
99            out.write(&buffer[line_start], line_end - line_start);
100            out << '\n';
101        }
102        else {
103            if (end_byte == 0x0D) {
104                // Check for line_end on first byte of CRLF;  note that we don't
105                // want to access past the end of buffer.
106                if ((buffer + line_end + 1 < mFileBuffer + mFileSize) && (buffer[line_end + 1] == 0x0A)) {
107                    // Found CRLF; preserve both bytes.
108                    line_end++;
109                }
110            }
111            out.write(&buffer[line_start], line_end - line_start + 1);
112        }
113        line_start = line_end + 1;
114        mLineNum++;
115    }
116    while(mLineBreak_scanner.has_next()) {
117        line_end = mLineBreak_scanner.scan_to_next();
118        line_start = line_end+1;
119        mLineNum++;
120    }
121    return line_start;
122}
123
124bool GrepExecutor::finalLineIsUnterminated() const {
125    if (mFileSize == 0) return false;
126    unsigned char end_byte = static_cast<unsigned char>(mFileBuffer[mFileSize-1]);
127    // LF through CR are line break characters
128    if ((end_byte >= 0xA) && (end_byte <= 0xD)) return false;
129    // Other line breaks require at least two bytes.
130    if (mFileSize == 1) return true;
131    // NEL
132    unsigned char penult_byte = static_cast<unsigned char>(mFileBuffer[mFileSize-2]);
133    if ((end_byte == 0x85) && (penult_byte == 0xC2)) return false;
134    if (mFileSize == 2) return true;
135    // LS and PS
136    if ((end_byte < 0xA8) || (end_byte > 0xA9)) return true;
137    return (static_cast<unsigned char>(mFileBuffer[mFileSize-3]) != 0xE2) || (penult_byte != 0x80);
138}
139
140void GrepExecutor::doGrep(const std::string & fileName) {
141
142    Basis_bits basis_bits;
143    BitBlock match_vector = simd<1>::constant<0>();
144    size_t match_count = 0;
145    size_t chars_avail = 0;
146    ssize_t line_start = 0;
147
148    mFileName = fileName;
149    mLineNum = 1;
150
151#ifdef USE_BOOST_MMAP
152    const path file(mFileName);
153    if (exists(file)) {
154        if (is_directory(file)) {
155            return;
156        }
157    } else {
158        std::cerr << "Error: cannot open " << mFileName << " for processing. Skipped.\n";
159        return;
160    }
161
162    mFileSize = file_size(file);
163    mapped_file mFile;
164    try {
165        mFile.open(mFileName, mapped_file::priv, mFileSize, 0);
166    } catch (std::ios_base::failure e) {
167        std::cerr << "Error: Boost mmap " << e.what() << std::endl;
168        return;
169    }
170    mFileBuffer = mFile.data();
171#else
172    struct stat infile_sb;
173    const int fdSrc = open(mFileName.c_str(), O_RDONLY);
174    if (fdSrc == -1) {
175        std::cerr << "Error: cannot open " << mFileName << " for processing. Skipped.\n";
176        return;
177    }
178    if (fstat(fdSrc, &infile_sb) == -1) {
179        std::cerr << "Error: cannot stat " << mFileName << " for processing. Skipped.\n";
180        close (fdSrc);
181        return;
182    }
183    if (S_ISDIR(infile_sb.st_mode)) {
184        close (fdSrc);
185        return;
186    }
187    mFileSize = infile_sb.st_size;
188    mFileBuffer = (char *) mmap(NULL, mFileSize, PROT_READ, MAP_PRIVATE, fdSrc, 0);
189    if (mFileBuffer == MAP_FAILED) {
190        if (errno ==  ENOMEM) {
191            std::cerr << "Error:  mmap of " << mFileName << " failed: out of memory\n";
192        }
193        else {
194            std::cerr << "Error: mmap of " << mFileName << " failed with errno " << errno << ". Skipped.\n";
195        }
196        return;
197    }
198#endif
199    size_t segment = 0;
200    chars_avail = mFileSize;
201
202    llvm::raw_os_ostream out(std::cout);
203    //////////////////////////////////////////////////////////////////////////////////////////
204    // Full Segments
205    //////////////////////////////////////////////////////////////////////////////////////////
206
207    while (chars_avail >= SEGMENT_SIZE) {
208
209        mLineBreak_scanner.init();
210        mMatch_scanner.init();
211
212        for (size_t blk = 0; blk != SEGMENT_BLOCKS; ++blk) {
213            s2p_do_block(reinterpret_cast<BytePack *>(mFileBuffer + (blk * BLOCK_SIZE) + (segment * SEGMENT_SIZE)), basis_bits);
214            Output output;
215            mProcessBlockFcn(basis_bits, output);
216
217            mMatch_scanner.load_block(output.matches, blk);
218            mLineBreak_scanner.load_block(output.LF, blk);
219
220            if (mCountOnlyOption) {
221                if (bitblock::any(output.matches)) {
222                    if (bitblock::any(simd_and(match_vector, output.matches))) {
223                        match_count += bitblock::popcount(match_vector);
224                        match_vector = output.matches;
225                    } else {
226                        match_vector = simd_or(match_vector, output.matches);
227                    }
228                }
229            }
230        }
231
232        if (!mCountOnlyOption) {
233            line_start = write_matches(out, mFileBuffer + (segment * SEGMENT_SIZE), line_start);
234        }
235        segment++;
236        line_start -= SEGMENT_SIZE;  /* Will be negative offset for use within next segment. */
237        chars_avail -= SEGMENT_SIZE;
238    }
239
240    //////////////////////////////////////////////////////////////////////////////////////////
241    // For the Final Partial Segment.
242    //////////////////////////////////////////////////////////////////////////////////////////
243
244    size_t remaining = chars_avail;
245    size_t blk = 0;
246
247    mLineBreak_scanner.init();
248    mMatch_scanner.init();
249
250    /* Full Blocks */
251    for (; remaining >= BLOCK_SIZE; remaining -= BLOCK_SIZE, ++blk) {
252        s2p_do_block(reinterpret_cast<BytePack *>(mFileBuffer + (blk * BLOCK_SIZE) + (segment * SEGMENT_SIZE)), basis_bits);
253        Output output;
254        mProcessBlockFcn(basis_bits, output);
255
256        mLineBreak_scanner.load_block(output.LF, blk);
257        mMatch_scanner.load_block(output.matches, blk);
258        if (mCountOnlyOption) {
259            if (bitblock::any(output.matches)) {
260                if (bitblock::any(simd_and(match_vector, output.matches))) {
261                    match_count += bitblock::popcount(match_vector);
262                    match_vector = output.matches;
263                } else {
264                    match_vector = simd_or(match_vector, output.matches);
265                }
266            }
267        }
268    }
269
270    //Final Partial Block (may be empty, but there could be carries pending).
271   
272   
273    const auto EOF_mask = bitblock::srl(simd<1>::constant<1>(), convert(BLOCK_SIZE - remaining));
274   
275    if (remaining == 0) {  // No data, we may be at a page boundary.   Do not access memory.
276        basis_bits.bit_0 = simd<1>::constant<0>();
277        basis_bits.bit_1 = simd<1>::constant<0>();
278        basis_bits.bit_2 = simd<1>::constant<0>();
279        basis_bits.bit_3 = simd<1>::constant<0>();
280        basis_bits.bit_4 = simd<1>::constant<0>();
281        basis_bits.bit_5 = simd<1>::constant<0>();
282        basis_bits.bit_6 = simd<1>::constant<0>();
283        basis_bits.bit_7 = simd<1>::constant<0>();
284    }
285    else { // At least 1 byte, so we are not at a page boundary yet, safe to access a full block.
286        s2p_do_final_block(reinterpret_cast<BytePack *>(mFileBuffer + (blk * BLOCK_SIZE) + (segment * SEGMENT_SIZE)), basis_bits, EOF_mask);
287    }
288
289    if (finalLineIsUnterminated()) {
290        // Add a LF at the EOF position
291        BitBlock EOF_pos = simd_not(simd_or(bitblock::slli<1>(simd_not(EOF_mask)), EOF_mask));
292        //  LF = 00001010  (bits 4 and 6 set).
293        basis_bits.bit_4 = simd_or(basis_bits.bit_4, EOF_pos);
294        basis_bits.bit_6 = simd_or(basis_bits.bit_6, EOF_pos);
295    }
296   
297    Output output;
298    mProcessBlockFcn(basis_bits, output);
299
300    if (mCountOnlyOption) {
301        match_count += bitblock::popcount(match_vector);
302        if (bitblock::any(output.matches)) {
303            match_count += bitblock::popcount(output.matches);
304        }
305        if (mShowFileNameOption) {
306            out << mFileName << ':';
307        }
308        out << match_count << '\n';
309    } else {
310        mLineBreak_scanner.load_block(output.LF, blk);
311        mMatch_scanner.load_block(output.matches, blk);
312        while (++blk < SEGMENT_BLOCKS) {
313            mLineBreak_scanner.load_block(simd<1>::constant<0>(), blk);
314            mMatch_scanner.load_block(simd<1>::constant<0>(), blk);
315        }
316        line_start = write_matches(out, mFileBuffer + (segment * SEGMENT_SIZE), line_start);
317    }
318#ifdef USE_BOOST_MMAP
319    mFile.close();
320#else
321    munmap((void *)mFileBuffer, mFileSize);
322    close(fdSrc);
323#endif   
324}
Note: See TracBrowser for help on using the repository browser.