source: icGREP/icgrep-devel/icgrep/do_grep.cpp @ 4803

Last change on this file since 4803 was 4803, checked in by cameron, 4 years ago

Work on character name patterns

File size: 13.0 KB
Line 
1/*
2 *  Copyright (c) 2014 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "basis_bits.h"
8#include "do_grep.h"
9
10#include <fstream>
11#include <sstream>
12#include <iostream>
13#include <string>
14#include <stdint.h>
15
16#include <stdio.h>
17#include <stdlib.h>
18#include <unistd.h>
19#include <errno.h>
20#include <sys/types.h>
21#include <sys/stat.h>
22#include <stdexcept>
23#include <cctype>
24
25#include "include/simd-lib/carryQ.hpp"
26#include "include/simd-lib/pabloSupport.hpp"
27#include "include/simd-lib/s2p.hpp"
28#include "include/simd-lib/buffer.hpp"
29
30#include <llvm/Support/raw_os_ostream.h>
31
32// mmap system
33#ifdef USE_BOOST_MMAP
34#include <boost/filesystem.hpp>
35#include <boost/iostreams/device/mapped_file.hpp>
36using namespace boost::iostreams;
37using namespace boost::filesystem;
38#else
39#include <sys/mman.h>
40#endif
41#include <fcntl.h>
42
43
44#define BUFFER_SEGMENTS 15
45#define BUFFER_SIZE (BUFFER_SEGMENTS * SEGMENT_SIZE)
46
47//
48// Write matched lines from a buffer to an output file, given segment
49// scanners for line ends and matches (where matches are a subset of line ends).
50// The buffer pointer must point to the first byte of the segment
51// corresponding to the scanner indexes.   The first_line_start is the
52// start position of the first line relative to the buffer start position.
53// It must be zero or negative;  if negative, the buffer must permit negative
54// indexing so that the lineup to the buffer start position can also be printed.
55// The start position of the final line in the processed segment is returned.
56//
57
58ssize_t GrepExecutor::write_matches(llvm::raw_ostream & out, const char * buffer, ssize_t line_start) {
59
60    ssize_t match_pos;
61    ssize_t line_end;
62    while (mMatch_scanner.has_next()) {
63        match_pos = mMatch_scanner.scan_to_next();
64        // If we found a match, it must be at a line end.
65        while (true) {
66            line_end = mLineBreak_scanner.scan_to_next();
67            if (line_end >= match_pos) {
68                break;
69            }
70            line_start = line_end + 1;
71            mLineNum++;
72        }
73        if (mShowFileNameOption) {
74            out << mFileName << ':';
75        }
76        if (mShowLineNumberingOption) {
77            out << mLineNum << ":";
78        }
79        if ((buffer[line_start] == 0xA) && (line_start != line_end)) {
80            // The line "starts" on the LF of a CRLF.  Really the end of the last line.
81            line_start++;
82        }
83        if (buffer + line_end == mFileBuffer + mFileSize) {
84            // The match position is at end-of-file.   We have a final unterminated line.
85            out.write(&buffer[line_start], line_end - line_start);
86            if (mNormalizeLineBreaksOption) {
87              out << '\n';  // terminate it
88            }
89            return line_end;
90        }
91        unsigned char end_byte = (unsigned char)buffer[line_end];
92        if (mNormalizeLineBreaksOption) {
93            if (end_byte == 0x85) {
94                // Line terminated with NEL, on the second byte.  Back up 1.
95                line_end--;
96            } else if (end_byte > 0xD) {
97                // Line terminated with PS or LS, on the third byte.  Back up 2.
98                line_end -= 2;
99            }
100            out.write(&buffer[line_start], line_end - line_start);
101            out << '\n';
102        }
103        else {
104            if (end_byte == 0x0D) {
105                // Check for line_end on first byte of CRLF;  note that we don't
106                // want to access past the end of buffer.
107                if ((buffer + line_end + 1 < mFileBuffer + mFileSize) && (buffer[line_end + 1] == 0x0A)) {
108                    // Found CRLF; preserve both bytes.
109                    line_end++;
110                }
111            }
112            out.write(&buffer[line_start], line_end - line_start + 1);
113        }
114        line_start = line_end + 1;
115        mLineNum++;
116    }
117    while(mLineBreak_scanner.has_next()) {
118        line_end = mLineBreak_scanner.scan_to_next();
119        line_start = line_end+1;
120        mLineNum++;
121    }
122    return line_start;
123}
124
125bool GrepExecutor::finalLineIsUnterminated() const {
126    if (mFileSize == 0) return false;
127    unsigned char end_byte = static_cast<unsigned char>(mFileBuffer[mFileSize-1]);
128    // LF through CR are line break characters
129    if ((end_byte >= 0xA) && (end_byte <= 0xD)) return false;
130    // Other line breaks require at least two bytes.
131    if (mFileSize == 1) return true;
132    // NEL
133    unsigned char penult_byte = static_cast<unsigned char>(mFileBuffer[mFileSize-2]);
134    if ((end_byte == 0x85) && (penult_byte == 0xC2)) return false;
135    if (mFileSize == 2) return true;
136    // LS and PS
137    if ((end_byte < 0xA8) || (end_byte > 0xA9)) return true;
138    return (static_cast<unsigned char>(mFileBuffer[mFileSize-3]) != 0xE2) || (penult_byte != 0x80);
139}
140
141// Extracting codepoint data from UCD name data file.
142ssize_t GrepExecutor::extract_codepoints(char * buffer, ssize_t first_line_start) {
143   
144    ssize_t line_start = first_line_start;
145    size_t match_pos;
146    size_t line_end;
147   
148    while (mMatch_scanner.has_next()) {
149        match_pos = mMatch_scanner.scan_to_next();
150        // If we found a match, it must be at a line end.
151        line_end = mLineBreak_scanner.scan_to_next();
152        while (line_end < match_pos) {
153            line_start = line_end + 1;
154            mLineNum++;
155            line_end = mLineBreak_scanner.scan_to_next();
156        }
157       
158        re::codepoint_t c = 0;
159        ssize_t line_pos = line_start;
160        while (isxdigit(buffer[line_pos])) {
161            if (isdigit(buffer[line_pos])) {
162                c = (c << 4) | (buffer[line_pos] - '0');
163            }
164            else {
165                c = (c << 4) | (tolower(buffer[line_pos]) - 'a' + 10);
166            }
167        }
168        assert(((line_pos - line_start) >= 4) && ((line_pos - line_start) <= 6)); // UCD format 4 to 6 hex digits.
169        mParsedCodePointSet->insert(c);
170       
171        line_start = line_end + 1;
172        mLineNum++;
173    }
174    while(mLineBreak_scanner.has_next()) {
175        line_end = mLineBreak_scanner.scan_to_next();
176        line_start = line_end+1;
177        mLineNum++;
178    }
179    return line_start;
180   
181}
182
183
184void GrepExecutor::doGrep(const std::string & fileName) {
185
186    Basis_bits basis_bits;
187    BitBlock match_vector = simd<1>::constant<0>();
188    size_t match_count = 0;
189    size_t chars_avail = 0;
190    ssize_t line_start = 0;
191
192    mFileName = fileName;
193    mLineNum = 1;
194
195#ifdef USE_BOOST_MMAP
196    const path file(mFileName);
197    if (exists(file)) {
198        if (is_directory(file)) {
199            return;
200        }
201    } else {
202        std::cerr << "Error: cannot open " << mFileName << " for processing. Skipped.\n";
203        return;
204    }
205
206    mFileSize = file_size(file);
207    mapped_file mFile;
208    try {
209        mFile.open(mFileName, mapped_file::priv, mFileSize, 0);
210    } catch (std::ios_base::failure e) {
211        std::cerr << "Error: Boost mmap " << e.what() << std::endl;
212        return;
213    }
214    mFileBuffer = mFile.data();
215#else
216    struct stat infile_sb;
217    const int fdSrc = open(mFileName.c_str(), O_RDONLY);
218    if (fdSrc == -1) {
219        std::cerr << "Error: cannot open " << mFileName << " for processing. Skipped.\n";
220        return;
221    }
222    if (fstat(fdSrc, &infile_sb) == -1) {
223        std::cerr << "Error: cannot stat " << mFileName << " for processing. Skipped.\n";
224        close (fdSrc);
225        return;
226    }
227    if (S_ISDIR(infile_sb.st_mode)) {
228        close (fdSrc);
229        return;
230    }
231    mFileSize = infile_sb.st_size;
232    mFileBuffer = (char *) mmap(NULL, mFileSize, PROT_READ, MAP_PRIVATE, fdSrc, 0);
233    if (mFileBuffer == MAP_FAILED) {
234        if (errno ==  ENOMEM) {
235            std::cerr << "Error:  mmap of " << mFileName << " failed: out of memory\n";
236        }
237        else {
238            std::cerr << "Error: mmap of " << mFileName << " failed with errno " << errno << ". Skipped.\n";
239        }
240        return;
241    }
242#endif
243    size_t segment = 0;
244    chars_avail = mFileSize;
245
246    llvm::raw_os_ostream out(std::cout);
247    //////////////////////////////////////////////////////////////////////////////////////////
248    // Full Segments
249    //////////////////////////////////////////////////////////////////////////////////////////
250
251    while (chars_avail >= SEGMENT_SIZE) {
252
253        mLineBreak_scanner.init();
254        mMatch_scanner.init();
255
256        for (size_t blk = 0; blk != SEGMENT_BLOCKS; ++blk) {
257            s2p_do_block(reinterpret_cast<BytePack *>(mFileBuffer + (blk * BLOCK_SIZE) + (segment * SEGMENT_SIZE)), basis_bits);
258            Output output;
259            mProcessBlockFcn(basis_bits, output);
260
261            mMatch_scanner.load_block(output.matches, blk);
262            mLineBreak_scanner.load_block(output.LF, blk);
263
264            if (mCountOnlyOption) {
265                if (bitblock::any(output.matches)) {
266                    if (bitblock::any(simd_and(match_vector, output.matches))) {
267                        match_count += bitblock::popcount(match_vector);
268                        match_vector = output.matches;
269                    } else {
270                        match_vector = simd_or(match_vector, output.matches);
271                    }
272                }
273            }
274        }
275        if (!mCountOnlyOption) {
276            if (mGetCodePointsOption) {
277                line_start = extract_codepoints(mFileBuffer + (segment * SEGMENT_SIZE), line_start);
278            }
279            else {
280                line_start = write_matches(out, mFileBuffer + (segment * SEGMENT_SIZE), line_start);
281            }
282        }
283        segment++;
284        line_start -= SEGMENT_SIZE;  /* Will be negative offset for use within next segment. */
285        chars_avail -= SEGMENT_SIZE;
286    }
287
288    //////////////////////////////////////////////////////////////////////////////////////////
289    // For the Final Partial Segment.
290    //////////////////////////////////////////////////////////////////////////////////////////
291
292    size_t remaining = chars_avail;
293    size_t blk = 0;
294
295    mLineBreak_scanner.init();
296    mMatch_scanner.init();
297
298    /* Full Blocks */
299    for (; remaining >= BLOCK_SIZE; remaining -= BLOCK_SIZE, ++blk) {
300        s2p_do_block(reinterpret_cast<BytePack *>(mFileBuffer + (blk * BLOCK_SIZE) + (segment * SEGMENT_SIZE)), basis_bits);
301        Output output;
302        mProcessBlockFcn(basis_bits, output);
303
304        mLineBreak_scanner.load_block(output.LF, blk);
305        mMatch_scanner.load_block(output.matches, blk);
306        if (mCountOnlyOption) {
307            if (bitblock::any(output.matches)) {
308                if (bitblock::any(simd_and(match_vector, output.matches))) {
309                    match_count += bitblock::popcount(match_vector);
310                    match_vector = output.matches;
311                } else {
312                    match_vector = simd_or(match_vector, output.matches);
313                }
314            }
315        }
316    }
317
318    //Final Partial Block (may be empty, but there could be carries pending).
319   
320   
321    const auto EOF_mask = bitblock::srl(simd<1>::constant<1>(), convert(BLOCK_SIZE - remaining));
322   
323    if (remaining == 0) {  // No data, we may be at a page boundary.   Do not access memory.
324        basis_bits.bit_0 = simd<1>::constant<0>();
325        basis_bits.bit_1 = simd<1>::constant<0>();
326        basis_bits.bit_2 = simd<1>::constant<0>();
327        basis_bits.bit_3 = simd<1>::constant<0>();
328        basis_bits.bit_4 = simd<1>::constant<0>();
329        basis_bits.bit_5 = simd<1>::constant<0>();
330        basis_bits.bit_6 = simd<1>::constant<0>();
331        basis_bits.bit_7 = simd<1>::constant<0>();
332    }
333    else { // At least 1 byte, so we are not at a page boundary yet, safe to access a full block.
334        s2p_do_final_block(reinterpret_cast<BytePack *>(mFileBuffer + (blk * BLOCK_SIZE) + (segment * SEGMENT_SIZE)), basis_bits, EOF_mask);
335    }
336
337    if (finalLineIsUnterminated()) {
338        // Add a LF at the EOF position
339        BitBlock EOF_pos = simd_not(simd_or(bitblock::slli<1>(simd_not(EOF_mask)), EOF_mask));
340        //  LF = 00001010  (bits 4 and 6 set).
341        basis_bits.bit_4 = simd_or(basis_bits.bit_4, EOF_pos);
342        basis_bits.bit_6 = simd_or(basis_bits.bit_6, EOF_pos);
343    }
344   
345    Output output;
346    mProcessBlockFcn(basis_bits, output);
347
348    if (mCountOnlyOption) {
349        match_count += bitblock::popcount(match_vector);
350        if (bitblock::any(output.matches)) {
351            match_count += bitblock::popcount(output.matches);
352        }
353        if (mShowFileNameOption) {
354            out << mFileName << ':';
355        }
356        out << match_count << '\n';
357    } else {
358        mLineBreak_scanner.load_block(output.LF, blk);
359        mMatch_scanner.load_block(output.matches, blk);
360        while (++blk < SEGMENT_BLOCKS) {
361            mLineBreak_scanner.load_block(simd<1>::constant<0>(), blk);
362            mMatch_scanner.load_block(simd<1>::constant<0>(), blk);
363        }
364        if (mGetCodePointsOption) {
365            line_start = extract_codepoints(mFileBuffer + (segment * SEGMENT_SIZE), line_start);
366        }
367        else {
368            line_start = write_matches(out, mFileBuffer + (segment * SEGMENT_SIZE), line_start);
369        }
370    }
371#ifdef USE_BOOST_MMAP
372    mFile.close();
373#else
374    munmap((void *)mFileBuffer, mFileSize);
375    close(fdSrc);
376#endif   
377}
Note: See TracBrowser for help on using the repository browser.