source: icGREP/icgrep-devel/icgrep/do_grep.cpp @ 4477

Last change on this file since 4477 was 4477, checked in by cameron, 5 years ago

Bug fix for big files

File size: 9.4 KB
Line 
1/*
2 *  Copyright (c) 2014 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "icgrep.h"
8#include "do_grep.h"
9
10#include <fstream>
11#include <sstream>
12#include <iostream>
13#include <string>
14#include <stdint.h>
15
16#include <stdio.h>
17#include <stdlib.h>
18#include <unistd.h>
19#include <errno.h>
20#include <sys/types.h>
21#include <sys/stat.h>
22#include <stdexcept>
23
24#include "include/simd-lib/carryQ.hpp"
25#include "include/simd-lib/pabloSupport.hpp"
26#include "include/simd-lib/s2p.hpp"
27#include "include/simd-lib/buffer.hpp"
28
29// mmap system
30#include <sys/mman.h>
31#include <fcntl.h>
32
33
34#define BUFFER_SEGMENTS 15
35#define BUFFER_SIZE (BUFFER_SEGMENTS * SEGMENT_SIZE)
36
37#define BitBlock_declare(name)  BitBlock name
38
39#define ubitblock_declare(name, n) \
40  ubitblock name[n];\
41  do {int i;\
42      for (i = 0; i < n; i++) name[i]._128 = simd<1>::constant<0>();\
43     }\
44  while (0)
45
46BitBlock EOF_mask = simd<1>::constant<1>();
47
48//
49// Write matched lines from a buffer to an output file, given segment
50// scanners for line ends and matches (where matches are a subset of line ends).
51// The buffer pointer must point to the first byte of the segment
52// corresponding to the scanner indexes.   The first_line_start is the
53// start position of the first line relative to the buffer start position.
54// It must be zero or negative;  if negative, the buffer must permit negative
55// indexing so that the lineup to the buffer start position can also be printed.
56// The start position of the final line in the processed segment is returned.
57//
58
59ssize_t GrepExecutor::write_matches(char * buffer, ssize_t first_line_start) {
60
61  ssize_t line_start = first_line_start;
62  size_t match_pos;
63  size_t line_end;
64  while (match_scanner.has_next()) {
65    match_pos = match_scanner.scan_to_next();
66    // If we found a match, it must be at a line end.
67    line_end = LF_scanner.scan_to_next();
68    while (line_end < match_pos) {
69      line_start = line_end + 1;
70      line_no++;
71      line_end = LF_scanner.scan_to_next();
72    }
73    if (mShowFileNameOption) {
74      std::cout << mFileName;
75    }
76    if (mShowLineNumberingOption) {
77      std::cout << line_no << ":";
78    }
79    if ((buffer[line_start] == 0xA) && (line_start != line_end)) {
80        // The LF of a CRLF.  Really the end of the last line. 
81        line_start++;
82    }
83    unsigned char end_byte = (unsigned char) buffer[line_end];
84    if (mNormalizeLineBreaksOption) {
85        if (end_byte <= 0xD) {
86            // Line terminated with LF, VT, FF or CR. 
87            std::cout.write(&buffer[line_start], line_end - line_start);
88            std::cout << std::endl;
89        }
90        else if (end_byte == 0x85) {
91            // Line terminated with NEL, on the second byte. 
92            std::cout.write(&buffer[line_start], line_end - line_start - 1);
93            std::cout << std::endl;
94        }
95        else  {
96            // Line terminated with PS or LS, on the third byte.
97            std::cout.write(&buffer[line_start], line_end - line_start - 2);
98            std::cout << std::endl;
99        }
100    }
101    else {
102        // Check for line_end on first byte of CRLF;  note that to safely
103        // access past line_end, even at the end of buffer, we require the
104        // mmap_sentinel_bytes >= 1.
105        if (end_byte == 0x0D) {
106            if (buffer[line_end + 1] == 0x0A) {
107                line_end++;
108            }
109        }
110        std::cout.write(&buffer[line_start], line_end - line_start + 1);
111    }
112    line_start = line_end + 1;
113    line_no++;
114
115  }
116  while(LF_scanner.has_next()) {
117    line_end = LF_scanner.scan_to_next();
118    line_start = line_end+1;
119    line_no++;
120  }
121  return line_start;
122}
123
124
125
126void GrepExecutor::doGrep(const std::string infilename) {
127
128    struct Basis_bits basis_bits;
129    struct Output output;
130    BitBlock match_vector;
131    BitBlock carry_q[mCarries];
132    BitBlock advance_q[mAdvances];
133   
134   
135    mFileName = infilename + ":";
136   
137    size_t match_count = 0;
138    size_t blk = 0;
139    size_t block_base  = 0;
140    size_t block_pos   = 0;
141    size_t chars_avail = 0;
142    ssize_t line_start = 0;
143    line_no = 1;
144
145    match_vector = simd<1>::constant<0>();
146    memset (carry_q, 0, sizeof(BitBlock) * mCarries);
147    memset (advance_q, 0, sizeof(BitBlock) * mAdvances);
148   
149    int fdSrc;
150    struct stat infile_sb;
151    char * infile_buffer;
152    fdSrc = open(infilename.c_str(), O_RDONLY);
153    if (fdSrc == -1) {
154        std::cerr << "Error: cannot open " << infilename << " for processing. Skipped.\n";
155        return;
156    }
157    if (fstat(fdSrc, &infile_sb) == -1) {
158        std::cerr << "Error: cannot stat " << infilename << " for processing. Skipped.\n";
159        return;
160    }
161    if (S_ISDIR(infile_sb.st_mode)) {
162        // Silently ignore directories.
163        // std::cerr << "Error: " << infilename << " is a directory. Skipped.\n";
164        return;
165    }
166    mFileSize = infile_sb.st_size;
167    // Set 2 sentinel bytes, 1 for possible addition of LF for unterminated last line,
168    // 1 guard byte.
169    const size_t mmap_sentinel_bytes = 2; 
170    infile_buffer = (char *) mmap(NULL, mFileSize + mmap_sentinel_bytes, PROT_READ, MAP_PRIVATE, fdSrc, 0);
171    if (infile_buffer == MAP_FAILED) {
172        std::cerr << "Error: mmap of " << infilename << " failed. Skipped.\n";
173        return;
174    }
175    char * buffer_ptr;
176    size_t segment = 0;
177    size_t segment_base = 0;
178    chars_avail = mFileSize;
179   
180//////////////////////////////////////////////////////////////////////////////////////////
181// Full Segments
182//////////////////////////////////////////////////////////////////////////////////////////
183
184    while (chars_avail >= SEGMENT_SIZE) {
185
186        segment_base = segment * SEGMENT_SIZE;
187        LF_scanner.init();
188        match_scanner.init();
189
190        for (blk = 0; blk < SEGMENT_BLOCKS; blk++) {
191            block_base = blk*BLOCK_SIZE + segment_base;
192            s2p_do_block((BytePack *) &infile_buffer[block_base], basis_bits);
193            mProcessBlockFcn(basis_bits, carry_q, advance_q, output);
194
195            LF_scanner.load_block(output.LF, blk);
196            match_scanner.load_block(output.matches, blk);
197            if (mCountOnlyOption){
198                if (bitblock::any(output.matches))
199                {
200                    if (bitblock::any(simd_and(match_vector, output.matches))){
201                        match_count += bitblock::popcount(match_vector);
202                        match_vector = output.matches;
203                    }
204                    else
205                    {
206                        match_vector = simd_or(match_vector, output.matches);
207                    }
208                }
209            }
210        }
211
212        buffer_ptr = &infile_buffer[segment_base];
213
214        if (!mCountOnlyOption) {
215          line_start = write_matches(buffer_ptr, line_start);
216        }
217        segment++;
218        line_start -= SEGMENT_SIZE;  /* Will be negative offset for use within next segment. */
219        chars_avail -= SEGMENT_SIZE;
220    }
221
222//////////////////////////////////////////////////////////////////////////////////////////
223// For the Final Partial Segment.
224//////////////////////////////////////////////////////////////////////////////////////////
225
226    segment_base = segment * SEGMENT_SIZE;
227    int remaining = chars_avail;
228       
229
230    LF_scanner.init();
231    match_scanner.init();
232
233    /* Full Blocks */
234    blk = 0;
235    while (remaining >= BLOCK_SIZE) {
236        block_base = block_pos + segment_base;
237        s2p_do_block((BytePack *) &infile_buffer[block_base], basis_bits);
238        mProcessBlockFcn(basis_bits, carry_q, advance_q, output);
239
240        LF_scanner.load_block(output.LF, blk);
241        match_scanner.load_block(output.matches, blk);
242        if (mCountOnlyOption)
243        {
244            if (bitblock::any(output.matches))
245            {
246                if (bitblock::any(simd_and(match_vector, output.matches)))
247                {
248                    match_count += bitblock::popcount(match_vector);
249                    match_vector = output.matches;
250                }
251                else
252                {
253                    match_vector = simd_or(match_vector, output.matches);
254                }
255            }
256        }
257
258        block_pos += BLOCK_SIZE;
259        remaining -= BLOCK_SIZE;
260        blk++;
261    }
262    block_base = block_pos;
263    //fprintf(stderr, "Remaining = %i\n", remaining);
264
265    //For the last partial block, or for any carry.
266   
267   
268    EOF_mask = bitblock::srl(simd<1>::constant<1>(), convert(BLOCK_SIZE-remaining));
269    block_base = block_pos + segment_base;
270    s2p_do_final_block((BytePack *) &infile_buffer[block_base], basis_bits, EOF_mask);
271    mProcessBlockFcn(basis_bits, carry_q, advance_q, output);
272
273    if (mCountOnlyOption)
274    {
275        match_count += bitblock::popcount(match_vector);
276        if (bitblock::any(output.matches))
277        {
278            match_count += bitblock::popcount(output.matches);
279        }
280        if (mShowFileNameOption) {
281            std::cout << mFileName;
282        }
283        std::cout << match_count << std::endl;
284    }
285    else
286    {
287        LF_scanner.load_block(output.LF, blk);
288        match_scanner.load_block(output.matches, blk);
289        blk++;
290        for (int i = blk; i < SEGMENT_BLOCKS; i++)
291        {
292            LF_scanner.load_block(simd<1>::constant<0>(), i);
293            match_scanner.load_block(simd<1>::constant<0>(), i);
294        }
295        buffer_ptr = &infile_buffer[segment_base];
296        line_start = write_matches(buffer_ptr, line_start);
297    }
298   
299    munmap((void *) infile_buffer, mFileSize + mmap_sentinel_bytes);
300    close(fdSrc);
301   
302}
Note: See TracBrowser for help on using the repository browser.