source: icGREP/icgrep-devel/icgrep/do_grep.cpp @ 4932

Last change on this file since 4932 was 4904, checked in by cameron, 4 years ago

Refactoring progress towards layered kernels

File size: 13.5 KB
Line 
1/*
2 *  Copyright (c) 2014 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "basis_bits.h"
8#include "do_grep.h"
9
10#include <fstream>
11#include <sstream>
12#include <iostream>
13#include <string>
14#include <stdint.h>
15
16#include <stdio.h>
17#include <stdlib.h>
18#include <unistd.h>
19#include <errno.h>
20#include <sys/types.h>
21#include <sys/stat.h>
22#include <stdexcept>
23#include <cctype>
24
25#include "include/simd-lib/carryQ.hpp"
26#include "include/simd-lib/pabloSupport.hpp"
27#include "include/simd-lib/s2p.hpp"
28#include "include/simd-lib/buffer.hpp"
29
30#include <llvm/Support/raw_os_ostream.h>
31
32// mmap system
33#ifdef USE_BOOST_MMAP
34#include <boost/filesystem.hpp>
35#include <boost/iostreams/device/mapped_file.hpp>
36using namespace boost::iostreams;
37using namespace boost::filesystem;
38#else
39#include <sys/mman.h>
40#endif
41#include <fcntl.h>
42
43
44#define BUFFER_SEGMENTS 15
45#define BUFFER_SIZE (BUFFER_SEGMENTS * SEGMENT_SIZE)
46
47//
48// Write matched lines from a buffer to an output file, given segment
49// scanners for line ends and matches (where matches are a subset of line ends).
50// The buffer pointer must point to the first byte of the segment
51// corresponding to the scanner indexes.   The first_line_start is the
52// start position of the first line relative to the buffer start position.
53// It must be zero or negative;  if negative, the buffer must permit negative
54// indexing so that the lineup to the buffer start position can also be printed.
55// The start position of the final line in the processed segment is returned.
56//
57
58
59void GrepExecutor::write_matched_line(llvm::raw_ostream & out, const char * buffer, ssize_t line_start, ssize_t line_end) {
60    if (mShowFileNameOption) {
61        out << mFileName << ':';
62    }
63    if (mShowLineNumberingOption) {
64        out << mLineNum << ":";
65    }
66    if ((buffer[line_start] == 0xA) && (line_start != line_end)) {
67        // The line "starts" on the LF of a CRLF.  Really the end of the last line.
68        line_start++;
69    }
70    if (buffer + line_end == mFileBuffer + mFileSize) {
71        // The match position is at end-of-file.   We have a final unterminated line.
72        out.write(&buffer[line_start], line_end - line_start);
73        if (mNormalizeLineBreaksOption) {
74            out << '\n';  // terminate it
75        }
76        return;
77    }
78    unsigned char end_byte = (unsigned char)buffer[line_end];
79    if (mNormalizeLineBreaksOption) {
80        if (end_byte == 0x85) {
81            // Line terminated with NEL, on the second byte.  Back up 1.
82            line_end--;
83        } else if (end_byte > 0xD) {
84            // Line terminated with PS or LS, on the third byte.  Back up 2.
85            line_end -= 2;
86        }
87        out.write(&buffer[line_start], line_end - line_start);
88        out << '\n';
89    }
90    else {
91        if (end_byte == 0x0D) {
92            // Check for line_end on first byte of CRLF;  note that we don't
93            // want to access past the end of buffer.
94            if ((buffer + line_end + 1 < mFileBuffer + mFileSize) && (buffer[line_end + 1] == 0x0A)) {
95                // Found CRLF; preserve both bytes.
96                line_end++;
97            }
98        }
99        out.write(&buffer[line_start], line_end - line_start + 1);
100    }
101   
102}
103
104ssize_t GrepExecutor::write_matches(llvm::raw_ostream & out, const char * buffer, ssize_t line_start) {
105
106    ssize_t match_pos;
107    ssize_t line_end;
108    while (mMatch_scanner.has_next()) {
109        match_pos = mMatch_scanner.scan_to_next();
110        // If we found a match, it must be at a line end.
111        while (true) {
112            line_end = mLineBreak_scanner.scan_to_next();
113            if (line_end >= match_pos) {
114                break;
115            }
116            line_start = line_end + 1;
117            mLineNum++;
118        }
119        write_matched_line(out,buffer, line_start, line_end);
120        line_start = line_end + 1;
121        mLineNum++;
122    }
123    while(mLineBreak_scanner.has_next()) {
124        line_end = mLineBreak_scanner.scan_to_next();
125        line_start = line_end+1;
126        mLineNum++;
127    }
128    return line_start;
129}
130
131bool GrepExecutor::finalLineIsUnterminated() const {
132    if (mFileSize == 0) return false;
133    unsigned char end_byte = static_cast<unsigned char>(mFileBuffer[mFileSize-1]);
134    // LF through CR are line break characters
135    if ((end_byte >= 0xA) && (end_byte <= 0xD)) return false;
136    // Other line breaks require at least two bytes.
137    if (mFileSize == 1) return true;
138    // NEL
139    unsigned char penult_byte = static_cast<unsigned char>(mFileBuffer[mFileSize-2]);
140    if ((end_byte == 0x85) && (penult_byte == 0xC2)) return false;
141    if (mFileSize == 2) return true;
142    // LS and PS
143    if ((end_byte < 0xA8) || (end_byte > 0xA9)) return true;
144    return (static_cast<unsigned char>(mFileBuffer[mFileSize-3]) != 0xE2) || (penult_byte != 0x80);
145}
146
147// Extracting codepoint data from UCD name data file.
148ssize_t GrepExecutor::extract_codepoints(char * buffer, ssize_t first_line_start) {
149   
150    ssize_t line_start = first_line_start;
151    size_t match_pos;
152    size_t line_end;
153   
154    while (mMatch_scanner.has_next()) {
155        match_pos = mMatch_scanner.scan_to_next();
156        // If we found a match, it must be at a line end.
157        line_end = mLineBreak_scanner.scan_to_next();
158        while (line_end < match_pos) {
159            line_start = line_end + 1;
160            mLineNum++;
161            line_end = mLineBreak_scanner.scan_to_next();
162        }
163       
164        re::codepoint_t c = 0;
165        ssize_t line_pos = line_start;
166        while (isxdigit(buffer[line_pos])) {
167            if (isdigit(buffer[line_pos])) {
168                c = (c << 4) | (buffer[line_pos] - '0');
169            }
170            else {
171                c = (c << 4) | (tolower(buffer[line_pos]) - 'a' + 10);
172            }
173            line_pos++;
174        }
175        assert(((line_pos - line_start) >= 4) && ((line_pos - line_start) <= 6)); // UCD format 4 to 6 hex digits.
176#ifndef NDEBUG
177        std::cerr << "\\N expression found codepoint " << std::hex << c << std::dec << std::endl;
178#endif
179       
180        mParsedCodePointSet->insert(c);
181       
182        line_start = line_end + 1;
183        mLineNum++;
184    }
185    while(mLineBreak_scanner.has_next()) {
186        line_end = mLineBreak_scanner.scan_to_next();
187        line_start = line_end+1;
188        mLineNum++;
189    }
190    return line_start;
191   
192}
193
194
195void GrepExecutor::doGrep(const std::string & fileName) {
196
197    Basis_bits basis_bits;
198    BitBlock match_vector = simd<1>::constant<0>();
199    size_t match_count = 0;
200    size_t chars_avail = 0;
201    ssize_t line_start = 0;
202
203    mFileName = fileName;
204    mLineNum = 1;
205
206#ifdef USE_BOOST_MMAP
207    const path file(mFileName);
208    if (exists(file)) {
209        if (is_directory(file)) {
210            return;
211        }
212    } else {
213        std::cerr << "Error: cannot open " << mFileName << " for processing. Skipped.\n";
214        return;
215    }
216
217    mFileSize = file_size(file);
218    mapped_file mFile;
219    if (mFileSize == 0) {
220        mFileBuffer = nullptr;
221    }
222    else {
223        try {
224            mFile.open(mFileName, mapped_file::priv, mFileSize, 0);
225        } catch (std::ios_base::failure e) {
226            std::cerr << "Error: Boost mmap of " << mFileName << ": " << e.what() << std::endl;
227            return;
228        }
229        mFileBuffer = mFile.data();
230    }
231#else
232    struct stat infile_sb;
233    const int fdSrc = open(mFileName.c_str(), O_RDONLY);
234    if (fdSrc == -1) {
235        std::cerr << "Error: cannot open " << mFileName << " for processing. Skipped.\n";
236        return;
237    }
238    if (fstat(fdSrc, &infile_sb) == -1) {
239        std::cerr << "Error: cannot stat " << mFileName << " for processing. Skipped.\n";
240        close (fdSrc);
241        return;
242    }
243    if (S_ISDIR(infile_sb.st_mode)) {
244        close (fdSrc);
245        return;
246    }
247    mFileSize = infile_sb.st_size;
248    if (mFileSize == 0) {
249        mFileBuffer = nullptr;
250    }
251    else {
252        mFileBuffer = (char *) mmap(NULL, mFileSize, PROT_READ, MAP_PRIVATE, fdSrc, 0);
253        if (mFileBuffer == MAP_FAILED) {
254            if (errno ==  ENOMEM) {
255                std::cerr << "Error:  mmap of " << mFileName << " failed: out of memory\n";
256                close (fdSrc);
257            }
258            else {
259                std::cerr << "Error: mmap of " << mFileName << " failed with errno " << errno << ". Skipped.\n";
260                close (fdSrc);
261            }
262            return;
263        }
264    }
265#endif
266    size_t segment = 0;
267    chars_avail = mFileSize;
268
269    llvm::raw_os_ostream out(std::cout);
270    mInitializeCarriesFcn();
271    //////////////////////////////////////////////////////////////////////////////////////////
272    // Full Segments
273    //////////////////////////////////////////////////////////////////////////////////////////
274
275    while (chars_avail >= SEGMENT_SIZE) {
276
277        mLineBreak_scanner.init();
278        mMatch_scanner.init();
279
280        for (size_t blk = 0; blk != SEGMENT_BLOCKS; ++blk) {
281            mTransposeFcn(reinterpret_cast<BytePack *>(mFileBuffer + (blk * BLOCK_SIZE) + (segment * SEGMENT_SIZE)), basis_bits);
282            Output output;
283            mProcessBlockFcn(basis_bits, output);
284            mMatch_scanner.load_block(output.matches, blk);
285            mLineBreak_scanner.load_block(output.LF, blk);
286
287            if (mCountOnlyOption) {
288                if (bitblock::any(output.matches)) {
289                    if (bitblock::any(simd_and(match_vector, output.matches))) {
290                        match_count += bitblock::popcount(match_vector);
291                        match_vector = output.matches;
292                    } else {
293                        match_vector = simd_or(match_vector, output.matches);
294                    }
295                }
296            }
297        }
298        if (!mCountOnlyOption) {
299            if (mGetCodePointsOption) {
300                line_start = extract_codepoints(mFileBuffer + (segment * SEGMENT_SIZE), line_start);
301            }
302            else {
303                line_start = write_matches(out, mFileBuffer + (segment * SEGMENT_SIZE), line_start);
304            }
305        }
306        segment++;
307        line_start -= SEGMENT_SIZE;  /* Will be negative offset for use within next segment. */
308        chars_avail -= SEGMENT_SIZE;
309    }
310
311    //////////////////////////////////////////////////////////////////////////////////////////
312    // For the Final Partial Segment.
313    //////////////////////////////////////////////////////////////////////////////////////////
314
315    size_t remaining = chars_avail;
316    size_t blk = 0;
317
318    mLineBreak_scanner.init();
319    mMatch_scanner.init();
320
321    /* Full Blocks */
322    for (; remaining >= BLOCK_SIZE; remaining -= BLOCK_SIZE, ++blk) {
323        mTransposeFcn(reinterpret_cast<BytePack *>(mFileBuffer + (blk * BLOCK_SIZE) + (segment * SEGMENT_SIZE)), basis_bits);
324        Output output;
325        mProcessBlockFcn(basis_bits, output);
326        mLineBreak_scanner.load_block(output.LF, blk);
327        mMatch_scanner.load_block(output.matches, blk);
328        if (mCountOnlyOption) {
329            if (bitblock::any(output.matches)) {
330                if (bitblock::any(simd_and(match_vector, output.matches))) {
331                    match_count += bitblock::popcount(match_vector);
332                    match_vector = output.matches;
333                } else {
334                    match_vector = simd_or(match_vector, output.matches);
335                }
336            }
337        }
338    }
339
340    //Final Partial Block (may be empty, but there could be carries pending).
341   
342   
343    const auto EOF_mask = bitblock::srl(simd<1>::constant<1>(), convert(BLOCK_SIZE - remaining));
344   
345    if (remaining == 0) {  // No data, we may be at a page boundary.   Do not access memory.
346        basis_bits.bit_0 = simd<1>::constant<0>();
347        basis_bits.bit_1 = simd<1>::constant<0>();
348        basis_bits.bit_2 = simd<1>::constant<0>();
349        basis_bits.bit_3 = simd<1>::constant<0>();
350        basis_bits.bit_4 = simd<1>::constant<0>();
351        basis_bits.bit_5 = simd<1>::constant<0>();
352        basis_bits.bit_6 = simd<1>::constant<0>();
353        basis_bits.bit_7 = simd<1>::constant<0>();
354    }
355    else { // At least 1 byte, so we are not at a page boundary yet, safe to access a full block.
356        mTransposeFcn(reinterpret_cast<BytePack *>(mFileBuffer + (blk * BLOCK_SIZE) + (segment * SEGMENT_SIZE)), basis_bits);
357    }
358
359    if (finalLineIsUnterminated()) {
360        // Add a LF at the EOF position
361        BitBlock EOF_pos = simd_not(simd_or(bitblock::slli<1>(simd_not(EOF_mask)), EOF_mask));
362        //  LF = 00001010  (bits 4 and 6 set).
363        basis_bits.bit_4 = simd_or(basis_bits.bit_4, EOF_pos);
364        basis_bits.bit_6 = simd_or(basis_bits.bit_6, EOF_pos);
365    }
366   
367    Output output;
368    mProcessBlockFcn(basis_bits, output);
369    if (mCountOnlyOption) {
370        match_count += bitblock::popcount(match_vector);
371        if (bitblock::any(output.matches)) {
372            match_count += bitblock::popcount(output.matches);
373        }
374        if (mShowFileNameOption) {
375            out << mFileName << ':';
376        }
377        out << match_count << '\n';
378    } else {
379        mLineBreak_scanner.load_block(output.LF, blk);
380        mMatch_scanner.load_block(output.matches, blk);
381        while (++blk < SEGMENT_BLOCKS) {
382            mLineBreak_scanner.load_block(simd<1>::constant<0>(), blk);
383            mMatch_scanner.load_block(simd<1>::constant<0>(), blk);
384        }
385        if (mGetCodePointsOption) {
386            line_start = extract_codepoints(mFileBuffer + (segment * SEGMENT_SIZE), line_start);
387        }
388        else {
389            line_start = write_matches(out, mFileBuffer + (segment * SEGMENT_SIZE), line_start);
390        }
391    }
392#ifdef USE_BOOST_MMAP
393    mFile.close();
394#else
395    munmap((void *)mFileBuffer, mFileSize);
396    close(fdSrc);
397#endif   
398}
Note: See TracBrowser for help on using the repository browser.