source: icGREP/icgrep-devel/icgrep/do_grep.cpp @ 4802

Last change on this file since 4802 was 4802, checked in by cameron, 4 years ago

getCodepointSetOption

File size: 13.0 KB
RevLine 
[4324]1/*
2 *  Copyright (c) 2014 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "icgrep.h"
8#include "do_grep.h"
9
10#include <fstream>
11#include <sstream>
12#include <iostream>
13#include <string>
14#include <stdint.h>
15
16#include <stdio.h>
17#include <stdlib.h>
18#include <unistd.h>
19#include <errno.h>
20#include <sys/types.h>
21#include <sys/stat.h>
[4430]22#include <stdexcept>
[4802]23#include <cctype>
[4324]24
25#include "include/simd-lib/carryQ.hpp"
26#include "include/simd-lib/pabloSupport.hpp"
27#include "include/simd-lib/s2p.hpp"
28#include "include/simd-lib/buffer.hpp"
29
[4788]30#include <llvm/Support/raw_os_ostream.h>
31
[4324]32// mmap system
[4778]33#ifdef USE_BOOST_MMAP
[4788]34#include <boost/filesystem.hpp>
[4778]35#include <boost/iostreams/device/mapped_file.hpp>
[4788]36using namespace boost::iostreams;
37using namespace boost::filesystem;
[4778]38#else
[4324]39#include <sys/mman.h>
[4778]40#endif
[4324]41#include <fcntl.h>
42
43
44#define BUFFER_SEGMENTS 15
45#define BUFFER_SIZE (BUFFER_SEGMENTS * SEGMENT_SIZE)
46
47//
48// Write matched lines from a buffer to an output file, given segment
49// scanners for line ends and matches (where matches are a subset of line ends).
50// The buffer pointer must point to the first byte of the segment
51// corresponding to the scanner indexes.   The first_line_start is the
52// start position of the first line relative to the buffer start position.
53// It must be zero or negative;  if negative, the buffer must permit negative
54// indexing so that the lineup to the buffer start position can also be printed.
55// The start position of the final line in the processed segment is returned.
56//
57
[4788]58ssize_t GrepExecutor::write_matches(llvm::raw_ostream & out, const char * buffer, ssize_t line_start) {
[4324]59
[4788]60    ssize_t match_pos;
61    ssize_t line_end;
62    while (mMatch_scanner.has_next()) {
63        match_pos = mMatch_scanner.scan_to_next();
64        // If we found a match, it must be at a line end.
65        while (true) {
66            line_end = mLineBreak_scanner.scan_to_next();
67            if (line_end >= match_pos) {
68                break;
69            }
70            line_start = line_end + 1;
71            mLineNum++;
72        }
73        if (mShowFileNameOption) {
74            out << mFileName << ':';
75        }
76        if (mShowLineNumberingOption) {
77            out << mLineNum << ":";
78        }
79        if ((buffer[line_start] == 0xA) && (line_start != line_end)) {
[4800]80            // The line "starts" on the LF of a CRLF.  Really the end of the last line.
[4788]81            line_start++;
82        }
[4800]83        if (buffer + line_end == mFileBuffer + mFileSize) {
84            // The match position is at end-of-file.   We have a final unterminated line.
85            out.write(&buffer[line_start], line_end - line_start);
86            if (mNormalizeLineBreaksOption) {
87              out << '\n';  // terminate it
88            }
89            return line_end;
90        }
[4788]91        unsigned char end_byte = (unsigned char)buffer[line_end];
92        if (mNormalizeLineBreaksOption) {
93            if (end_byte == 0x85) {
94                // Line terminated with NEL, on the second byte.  Back up 1.
95                line_end--;
96            } else if (end_byte > 0xD) {
97                // Line terminated with PS or LS, on the third byte.  Back up 2.
98                line_end -= 2;
99            }
100            out.write(&buffer[line_start], line_end - line_start);
101            out << '\n';
102        }
103        else {
[4800]104            if (end_byte == 0x0D) {
[4790]105                // Check for line_end on first byte of CRLF;  note that we don't
106                // want to access past the end of buffer.
107                if ((buffer + line_end + 1 < mFileBuffer + mFileSize) && (buffer[line_end + 1] == 0x0A)) {
[4788]108                    // Found CRLF; preserve both bytes.
109                    line_end++;
110                }
111            }
112            out.write(&buffer[line_start], line_end - line_start + 1);
113        }
114        line_start = line_end + 1;
115        mLineNum++;
[4324]116    }
[4788]117    while(mLineBreak_scanner.has_next()) {
118        line_end = mLineBreak_scanner.scan_to_next();
119        line_start = line_end+1;
120        mLineNum++;
[4325]121    }
[4788]122    return line_start;
[4324]123}
124
[4788]125bool GrepExecutor::finalLineIsUnterminated() const {
[4478]126    if (mFileSize == 0) return false;
127    unsigned char end_byte = static_cast<unsigned char>(mFileBuffer[mFileSize-1]);
128    // LF through CR are line break characters
129    if ((end_byte >= 0xA) && (end_byte <= 0xD)) return false;
130    // Other line breaks require at least two bytes.
131    if (mFileSize == 1) return true;
[4788]132    // NEL
[4478]133    unsigned char penult_byte = static_cast<unsigned char>(mFileBuffer[mFileSize-2]);
134    if ((end_byte == 0x85) && (penult_byte == 0xC2)) return false;
135    if (mFileSize == 2) return true;
136    // LS and PS
137    if ((end_byte < 0xA8) || (end_byte > 0xA9)) return true;
138    return (static_cast<unsigned char>(mFileBuffer[mFileSize-3]) != 0xE2) || (penult_byte != 0x80);
139}
[4324]140
[4802]141// Extracting codepoint data from UCD name data file.
142ssize_t GrepExecutor::extract_codepoints(char * buffer, ssize_t first_line_start) {
143   
144    ssize_t line_start = first_line_start;
145    size_t match_pos;
146    size_t line_end;
147   
148    while (mMatch_scanner.has_next()) {
149        match_pos = mMatch_scanner.scan_to_next();
150        // If we found a match, it must be at a line end.
151        line_end = mLineBreak_scanner.scan_to_next();
152        while (line_end < match_pos) {
153            line_start = line_end + 1;
154            mLineNum++;
155            line_end = mLineBreak_scanner.scan_to_next();
156        }
157       
158        re::codepoint_t c = 0;
159        ssize_t line_pos = line_start;
160        while (isxdigit(buffer[line_pos])) {
161            if (isdigit(buffer[line_pos])) {
162                c = (c << 4) | (buffer[line_pos] - '0');
163            }
164            else {
165                c = (c << 4) | (tolower(buffer[line_pos]) - 'a' + 10);
166            }
167        }
168        assert(((line_pos - line_start) >= 4) && ((line_pos - line_start) <= 6)); // UCD format 4 to 6 hex digits.
169        mParsedCodePointSet->insert(c);
170       
171        line_start = line_end + 1;
172        mLineNum++;
173    }
174    while(mLineBreak_scanner.has_next()) {
175        line_end = mLineBreak_scanner.scan_to_next();
176        line_start = line_end+1;
177        mLineNum++;
178    }
179    return line_start;
180   
181}
182
183
[4788]184void GrepExecutor::doGrep(const std::string & fileName) {
[4324]185
[4659]186    Basis_bits basis_bits;
[4788]187    BitBlock match_vector = simd<1>::constant<0>();
[4477]188    size_t match_count = 0;
189    size_t chars_avail = 0;
190    ssize_t line_start = 0;
[4324]191
[4788]192    mFileName = fileName;
193    mLineNum = 1;
194
195#ifdef USE_BOOST_MMAP
196    const path file(mFileName);
197    if (exists(file)) {
198        if (is_directory(file)) {
199            return;
200        }
201    } else {
202        std::cerr << "Error: cannot open " << mFileName << " for processing. Skipped.\n";
203        return;
204    }
205
206    mFileSize = file_size(file);
207    mapped_file mFile;
208    try {
209        mFile.open(mFileName, mapped_file::priv, mFileSize, 0);
210    } catch (std::ios_base::failure e) {
211        std::cerr << "Error: Boost mmap " << e.what() << std::endl;
212        return;
213    }
214    mFileBuffer = mFile.data();
215#else
[4324]216    struct stat infile_sb;
[4788]217    const int fdSrc = open(mFileName.c_str(), O_RDONLY);
[4324]218    if (fdSrc == -1) {
[4788]219        std::cerr << "Error: cannot open " << mFileName << " for processing. Skipped.\n";
[4430]220        return;
[4324]221    }
222    if (fstat(fdSrc, &infile_sb) == -1) {
[4788]223        std::cerr << "Error: cannot stat " << mFileName << " for processing. Skipped.\n";
224        close (fdSrc);
[4430]225        return;
[4324]226    }
[4430]227    if (S_ISDIR(infile_sb.st_mode)) {
[4788]228        close (fdSrc);
[4430]229        return;
[4324]230    }
[4477]231    mFileSize = infile_sb.st_size;
[4788]232    mFileBuffer = (char *) mmap(NULL, mFileSize, PROT_READ, MAP_PRIVATE, fdSrc, 0);
[4478]233    if (mFileBuffer == MAP_FAILED) {
[4482]234        if (errno ==  ENOMEM) {
[4788]235            std::cerr << "Error:  mmap of " << mFileName << " failed: out of memory\n";
[4482]236        }
237        else {
[4788]238            std::cerr << "Error: mmap of " << mFileName << " failed with errno " << errno << ". Skipped.\n";
[4482]239        }
[4430]240        return;
[4324]241    }
[4778]242#endif
[4477]243    size_t segment = 0;
244    chars_avail = mFileSize;
[4324]245
[4788]246    llvm::raw_os_ostream out(std::cout);
247    //////////////////////////////////////////////////////////////////////////////////////////
248    // Full Segments
249    //////////////////////////////////////////////////////////////////////////////////////////
250
[4324]251    while (chars_avail >= SEGMENT_SIZE) {
252
[4478]253        mLineBreak_scanner.init();
254        mMatch_scanner.init();
[4324]255
[4788]256        for (size_t blk = 0; blk != SEGMENT_BLOCKS; ++blk) {
257            s2p_do_block(reinterpret_cast<BytePack *>(mFileBuffer + (blk * BLOCK_SIZE) + (segment * SEGMENT_SIZE)), basis_bits);
[4659]258            Output output;
[4726]259            mProcessBlockFcn(basis_bits, output);
[4324]260
[4659]261            mMatch_scanner.load_block(output.matches, blk);
[4478]262            mLineBreak_scanner.load_block(output.LF, blk);
[4659]263
[4788]264            if (mCountOnlyOption) {
265                if (bitblock::any(output.matches)) {
266                    if (bitblock::any(simd_and(match_vector, output.matches))) {
[4324]267                        match_count += bitblock::popcount(match_vector);
268                        match_vector = output.matches;
[4788]269                    } else {
[4324]270                        match_vector = simd_or(match_vector, output.matches);
271                    }
272                }
273            }
274        }
275        if (!mCountOnlyOption) {
[4802]276            if (mGetCodePointsOption) {
277                line_start = extract_codepoints(mFileBuffer + (segment * SEGMENT_SIZE), line_start);
278            }
279            else {
280                line_start = write_matches(out, mFileBuffer + (segment * SEGMENT_SIZE), line_start);
281            }
[4324]282        }
[4477]283        segment++;
284        line_start -= SEGMENT_SIZE;  /* Will be negative offset for use within next segment. */
285        chars_avail -= SEGMENT_SIZE;
[4324]286    }
287
[4788]288    //////////////////////////////////////////////////////////////////////////////////////////
289    // For the Final Partial Segment.
290    //////////////////////////////////////////////////////////////////////////////////////////
[4324]291
[4788]292    size_t remaining = chars_avail;
293    size_t blk = 0;
[4324]294
[4478]295    mLineBreak_scanner.init();
296    mMatch_scanner.init();
[4324]297
298    /* Full Blocks */
[4788]299    for (; remaining >= BLOCK_SIZE; remaining -= BLOCK_SIZE, ++blk) {
300        s2p_do_block(reinterpret_cast<BytePack *>(mFileBuffer + (blk * BLOCK_SIZE) + (segment * SEGMENT_SIZE)), basis_bits);
[4659]301        Output output;
[4726]302        mProcessBlockFcn(basis_bits, output);
[4324]303
[4478]304        mLineBreak_scanner.load_block(output.LF, blk);
305        mMatch_scanner.load_block(output.matches, blk);
[4788]306        if (mCountOnlyOption) {
307            if (bitblock::any(output.matches)) {
308                if (bitblock::any(simd_and(match_vector, output.matches))) {
[4324]309                    match_count += bitblock::popcount(match_vector);
310                    match_vector = output.matches;
[4788]311                } else {
[4324]312                    match_vector = simd_or(match_vector, output.matches);
313                }
314            }
315        }
316    }
317
[4478]318    //Final Partial Block (may be empty, but there could be carries pending).
[4477]319   
[4800]320   
[4788]321    const auto EOF_mask = bitblock::srl(simd<1>::constant<1>(), convert(BLOCK_SIZE - remaining));
[4477]322   
[4800]323    if (remaining == 0) {  // No data, we may be at a page boundary.   Do not access memory.
324        basis_bits.bit_0 = simd<1>::constant<0>();
325        basis_bits.bit_1 = simd<1>::constant<0>();
326        basis_bits.bit_2 = simd<1>::constant<0>();
327        basis_bits.bit_3 = simd<1>::constant<0>();
328        basis_bits.bit_4 = simd<1>::constant<0>();
329        basis_bits.bit_5 = simd<1>::constant<0>();
330        basis_bits.bit_6 = simd<1>::constant<0>();
331        basis_bits.bit_7 = simd<1>::constant<0>();
332    }
333    else { // At least 1 byte, so we are not at a page boundary yet, safe to access a full block.
334        s2p_do_final_block(reinterpret_cast<BytePack *>(mFileBuffer + (blk * BLOCK_SIZE) + (segment * SEGMENT_SIZE)), basis_bits, EOF_mask);
335    }
[4478]336
337    if (finalLineIsUnterminated()) {
338        // Add a LF at the EOF position
339        BitBlock EOF_pos = simd_not(simd_or(bitblock::slli<1>(simd_not(EOF_mask)), EOF_mask));
340        //  LF = 00001010  (bits 4 and 6 set).
341        basis_bits.bit_4 = simd_or(basis_bits.bit_4, EOF_pos);
342        basis_bits.bit_6 = simd_or(basis_bits.bit_6, EOF_pos);
343    }
344   
[4659]345    Output output;
[4726]346    mProcessBlockFcn(basis_bits, output);
[4324]347
[4788]348    if (mCountOnlyOption) {
[4324]349        match_count += bitblock::popcount(match_vector);
[4788]350        if (bitblock::any(output.matches)) {
[4324]351            match_count += bitblock::popcount(output.matches);
352        }
[4325]353        if (mShowFileNameOption) {
[4788]354            out << mFileName << ':';
[4325]355        }
[4788]356        out << match_count << '\n';
357    } else {
[4478]358        mLineBreak_scanner.load_block(output.LF, blk);
359        mMatch_scanner.load_block(output.matches, blk);
[4788]360        while (++blk < SEGMENT_BLOCKS) {
361            mLineBreak_scanner.load_block(simd<1>::constant<0>(), blk);
362            mMatch_scanner.load_block(simd<1>::constant<0>(), blk);
[4324]363        }
[4802]364        if (mGetCodePointsOption) {
365            line_start = extract_codepoints(mFileBuffer + (segment * SEGMENT_SIZE), line_start);
366        }
367        else {
368            line_start = write_matches(out, mFileBuffer + (segment * SEGMENT_SIZE), line_start);
369        }
[4324]370    }
[4778]371#ifdef USE_BOOST_MMAP
372    mFile.close();
373#else
[4788]374    munmap((void *)mFileBuffer, mFileSize);
[4324]375    close(fdSrc);
[4788]376#endif   
[4324]377}
Note: See TracBrowser for help on using the repository browser.