source: icGREP/icgrep-devel/icgrep/icgrep.cpp @ 4207

Last change on this file since 4207 was 4197, checked in by nmedfort, 5 years ago

More refactoring of the RE system; moved the original re/RE_Compiler to compiler.cpp and the PBIX_Compiler to the re/RE_Compiler.

File size: 15.4 KB
Line 
1/*
2 *  Copyright (c) 2014 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "icgrep.h"
8#include "utf_encoding.h"
9#include "compiler.h"
10
11#include <fstream>
12#include <sstream>
13#include <iostream>
14#include <string>
15#include <stdint.h>
16
17#include <stdio.h>
18#include <stdlib.h>
19#include <unistd.h>
20#include <errno.h>
21#include <sys/types.h>
22#include <sys/stat.h>
23
24#include "include/simd-lib/bitblock.hpp"
25#include "include/simd-lib/carryQ.hpp"
26#include "include/simd-lib/pabloSupport.hpp"
27#include "include/simd-lib/s2p.hpp"
28#include "include/simd-lib/buffer.hpp"
29#include "include/simd-lib/bitblock_iterator.hpp"
30#include "include/simd-lib/transpose.hpp"
31
32#include "hrtime.h"
33
34// mmap system
35#include <sys/mman.h>
36#include <fcntl.h>
37
38#if (BLOCK_SIZE == 128)
39#define SEGMENT_BLOCKS 7
40#endif
41
42#if (BLOCK_SIZE == 256)
43#define SEGMENT_BLOCKS 15
44#endif
45
46#define SEGMENT_SIZE (BLOCK_SIZE * SEGMENT_BLOCKS)
47
48#define BUFFER_SEGMENTS 15
49#define BUFFER_SIZE (BUFFER_SEGMENTS * SEGMENT_SIZE)
50
51#define BitBlock_declare(name)  BitBlock name
52
53#define ubitblock_declare(name, n) \
54  ubitblock name[n];\
55  do {int i;\
56      for (i = 0; i < n; i++) name[i]._128 = simd<1>::constant<0>();\
57     }\
58  while (0)
59
60BitBlock EOF_mask = simd<1>::constant<1>();
61
62struct Output {
63    BitBlock matches;
64    BitBlock LF;
65};
66
67using namespace std;
68
69typedef void (*process_block_fcn)(const Basis_bits &basis_bits, BitBlock carry_q[], Output &output);
70
71
72#define USE_MMAP
73#ifndef USE_MMAP
74void do_process(FILE *infile, FILE *outfile, int count_only_option, int carry_count, process_block_fcn process_block);
75#endif
76#ifdef USE_MMAP
77void do_process(char * infile_buffer, size_t infile_size, FILE *outfile, int count_only_option, int carry_count, process_block_fcn process_block);
78#endif
79
80
81BitBlock get_category(Basis_bits &basis_bits, const char* category);
82
83int main(int argc, char *argv[])
84{
85    char * inregex, * fileregex, * infilename, * outfilename;
86    FILE *infile, *outfile, *regexfile;
87
88#ifdef USE_MMAP
89    int fdSrc;
90    struct stat infile_sb;
91    char * infile_buffer;
92#endif
93
94    int opt_code;
95    int count_only_option = 0;
96    int print_version_option = 0;
97    int regex_from_file_option = 0;
98    int ascii_only_option = 0;
99
100    int compile_time_option = 0;
101
102    unsigned long long cycles = 0;
103    double timer = 0;
104
105    long lSize = 0;
106
107    size_t result;
108
109    while ((opt_code = getopt(argc, argv, "cvfta")) != -1)
110    {
111        switch (opt_code)
112        {
113        case 'c':
114            count_only_option = 1;
115            break;
116        case 'v':
117            print_version_option = 1;
118            break;
119        case 'f':
120            regex_from_file_option = 1;
121            break;
122        case 't':
123            compile_time_option = 1;
124            break;
125        case 'a':
126            ascii_only_option = 1;
127            break;
128        case '?':
129            break;
130        default:
131            printf ("Invalid option: %c\n", opt_code);
132            printf("Usage: %s [-c] [-v] [-f] [-t] [-a] <regex|regexfile> <inputfile> [<outputfile>]\n", argv[0]);
133                    exit(-1);
134        }
135    }
136
137    if (optind >= argc)
138    {
139        printf ("Too few arguments\n");
140        printf("Usage: %s [-c] [-v] [-f] [-t] [-a] <regex|regexfile> <inputfile> [<outputfile>]\n", argv[0]);
141        exit(-1);
142    }
143
144    inregex = argv[optind++];
145    if (inregex == 0)
146    {
147        fprintf(stderr, "Error: cannot read the regular expression.\n");
148        exit(-1);
149    }
150
151    if (regex_from_file_option)
152    {
153        regexfile = fopen(inregex, "rb");
154        if (!regexfile){
155            fprintf(stderr, "Error: cannot open %s for processing.\n", inregex);
156            exit(-1);
157        }
158
159        fseek (regexfile , 0 , SEEK_END);
160        lSize = ftell (regexfile);
161        rewind (regexfile);
162
163        fileregex = (char*) malloc (sizeof(char)*lSize);
164        if (fileregex == NULL) {fputs ("Memory error",stderr); exit (2);}
165
166        result = fread (fileregex, 1, lSize, regexfile);
167        if (result != lSize) {fputs ("Reading error",stderr); exit (3);}
168        fclose(regexfile);
169
170        if (fileregex[lSize - 1] == '\n') fileregex[lSize - 1] = '\0';
171    }
172
173    infilename = argv[optind++];
174#ifndef USE_MMAP
175    infile = fopen(infilename, "rb");
176    if (!infile) {
177        fprintf(stderr, "Error: cannot open %s for processing.\n", infilename);
178        exit(-1);
179    }
180#endif
181
182    if (optind >= argc) {
183        outfile = stdout;
184    }
185    else {
186        outfilename = argv[optind++];
187        if (optind != argc)
188        {
189            printf ("Too many arguments\n");
190            printf("Usage: %s [-c] [-v] [-f] [-t] [-a] <regex|regexfile> <inputfile> [<outputfile>]\n", argv[0]);
191            exit(-1);
192        }
193        outfile = fopen(outfilename, "wb");
194        if (!outfile)
195        {
196            fprintf(stderr, "Error: cannot open %s for writing.\n", outfilename);
197            exit(-1);
198        }
199    }
200
201#ifdef USE_MMAP
202    fdSrc = open(infilename, O_RDONLY);
203    if (fdSrc == -1) {
204        fprintf(stderr, "Error: cannot open %s for processing.\n", infilename);
205        exit(-1);
206    }
207    if (fstat(fdSrc, &infile_sb) == -1) {
208        fprintf(stderr, "Error: cannot stat %s for processing.\n", infilename);
209        exit(-1);
210    }
211    if (infile_sb.st_size == 0) {
212        if (count_only_option) fprintf(outfile, "Matching Lines: %d\n", 0);
213        exit(0);
214    }
215    infile_buffer = (char *) mmap(NULL, infile_sb.st_size, PROT_READ, MAP_PRIVATE, fdSrc, 0);
216    if (infile_buffer == MAP_FAILED) {
217        fprintf(stderr, "Error: mmap of %s failure.\n", infilename);
218        exit(-1);
219    }
220#endif
221
222    if (print_version_option)
223    {
224        fprintf(outfile, "Parabix icgrep implementation: August 2014\n");
225    }
226
227    UTF_Encoding encoding;
228    encoding.setName("UTF-8");
229    encoding.setBits(8);
230    encoding.setMask(0xFF);
231
232    if (compile_time_option)
233    {
234        cycles = get_hrcycles();
235        timer = getElapsedTime();
236    }
237    LLVM_Gen_RetVal llvm_codegen = icgrep::compile(compile_time_option,
238                                                   ascii_only_option,
239                                                   "basis_bits.bit_",
240                                                   "temp",
241                                                   encoding ,
242                                                   (regex_from_file_option ? fileregex : inregex));
243
244    if (compile_time_option)
245    {
246        cycles = get_hrcycles() - cycles;
247        timer = getElapsedTime() - timer;
248        std::cout << "Total compile time - cycles:       " << cycles << std::endl;
249        std::cout << "Total compile time - milliseconds: " << timer << std::endl;
250    }
251
252    if (llvm_codegen.process_block_fptr != 0)
253    {
254        void (*FP)(const Basis_bits &basis_bits, BitBlock carry_q[], Output &output) = (void (*)(const Basis_bits &basis_bits, BitBlock carry_q[], Output &output))(void*)llvm_codegen.process_block_fptr;
255#ifndef USE_MMAP
256        do_process(infile, outfile, count_only_option, llvm_codegen.carry_q_size, FP);
257#endif
258#ifdef USE_MMAP
259        do_process(infile_buffer, infile_sb.st_size, outfile, count_only_option, llvm_codegen.carry_q_size, FP);
260#endif
261    }
262
263#ifndef USE_MMAP
264    fclose(infile);
265#endif
266#ifdef USE_MMAP
267    munmap((void *) infile_buffer, infile_sb.st_size);
268    close(fdSrc);
269#endif
270    fclose(outfile);
271    if (regex_from_file_option) free(fileregex);
272
273    return 0;
274}
275
// Segment-wide bit-stream scanner; the scan word type follows BLOCK_SIZE.
#if (BLOCK_SIZE == 256)
typedef BitStreamScanner<BitBlock, uint64_t, uint64_t, SEGMENT_BLOCKS> ScannerT;
#elif (BLOCK_SIZE == 128)
typedef BitStreamScanner<BitBlock, uint32_t, uint32_t, SEGMENT_BLOCKS> ScannerT;
#endif
283
284//
285// Write matched lines from a buffer to an output file, given segment
286// scanners for line ends and matches (where matches are a subset of line ends).
287// The buffer pointer must point to the first byte of the segment
288// corresponding to the scanner indexes.   The first_line_start is the
289// start position of the first line relative to the buffer start position.
290// It must be zero or negative;  if negative, the buffer must permit negative
291// indexing so that the lineup to the buffer start position can also be printed.
292// The start position of the final line in the processed segment is returned.
293//
294
295ssize_t write_matches(FILE * outfile, ScannerT line_scanner, ScannerT match_scanner, char * buffer, ssize_t first_line_start) {
296
297  ssize_t line_start = first_line_start;
298  size_t match_pos;
299  size_t line_end;
300  while (match_scanner.has_next()) {
301    match_pos = match_scanner.scan_to_next();
302    // If we found a match, it must be at a line end.
303    line_end = line_scanner.scan_to_next();
304    while (line_end < match_pos) {
305      line_start = line_end + 1;
306      line_end = line_scanner.scan_to_next();
307    }
308    fwrite(&buffer[line_start], 1, line_end - line_start + 1, outfile);
309    line_start = line_end + 1;
310
311  }
312  while(line_scanner.has_next()) {
313    line_end = line_scanner.scan_to_next();
314    line_start = line_end+1;
315  }
316  return line_start;
317}
318
319
320
321#ifndef USE_MMAP
322void do_process(FILE *infile, FILE *outfile, int count_only_option, int carry_count, process_block_fcn process_block) {
323#endif
324#ifdef USE_MMAP
325void do_process(char * infile_buffer, size_t infile_size, FILE *outfile, int count_only_option, int carry_count, process_block_fcn process_block) {
326#endif
327
328    struct Basis_bits basis_bits;
329    struct Output output;
330
331    BitBlock carry_q[carry_count];
332    memset (carry_q, 0, sizeof(BitBlock) * carry_count);
333
334    BitBlock match_vector = simd<1>::constant<0>();
335    int match_count=0;
336    int blk = 0;
337    int block_base  = 0;
338    int block_pos   = 0;
339    int buffer_pos  = 0;
340    int chars_avail = 0;
341    int chars_read  = 0;
342
343    int line_start = 0;
344    int line_end = 0;
345    int match_pos = 0;
346    int line_no = 0;
347
348    ScannerT LF_scanner;
349    ScannerT match_scanner;
350
351    char * buffer_ptr;
352#ifndef USE_MMAP
353    ATTRIBUTE_SIMD_ALIGN char src_buffer[SEGMENT_SIZE];
354    buffer_ptr = &src_buffer;
355    chars_read = fread((void *)&src_buffer[0], 1, SEGMENT_SIZE, infile);
356    chars_avail = chars_read;
357    if (chars_avail >= SEGMENT_SIZE) chars_avail = SEGMENT_SIZE;
358#endif
359#ifdef USE_MMAP
360    int segment = 0;
361    int segment_base = 0;
362    chars_avail = infile_size;
363
364#endif
365//////////////////////////////////////////////////////////////////////////////////////////
366// Full Segments
367//////////////////////////////////////////////////////////////////////////////////////////
368
369
370
371    while (chars_avail >= SEGMENT_SIZE) {
372
373#ifdef USE_MMAP
374        segment_base = segment * SEGMENT_SIZE;
375#endif
376        LF_scanner.init();
377        match_scanner.init();
378
379        for (blk = 0; blk < SEGMENT_BLOCKS; blk++) {
380#ifndef USE_MMAP
381            block_base = blk*BLOCK_SIZE;
382            s2p_do_block((BytePack *) &src_buffer[block_base], basis_bits);
383#endif
384#ifdef USE_MMAP
385            block_base = blk*BLOCK_SIZE + segment_base;
386            s2p_do_block((BytePack *) &infile_buffer[block_base], basis_bits);
387#endif
388            process_block(basis_bits, carry_q, output);
389
390            LF_scanner.load_block(output.LF, blk);
391            match_scanner.load_block(output.matches, blk);
392            if (count_only_option){
393                if (bitblock::any(output.matches))
394                {
395                    if (bitblock::any(simd_and(match_vector, output.matches))){
396                        match_count += bitblock::popcount(match_vector);
397                        match_vector = output.matches;
398                    }
399                    else
400                    {
401                        match_vector = simd_or(match_vector, output.matches);
402                    }
403                }
404            }
405        }
406
407#ifndef USE_MMAP
408        int copy_back_pos = 0;
409
410
411        if (LF_scanner.count() > 0) {
412            copy_back_pos = LF_scanner.get_final_pos() + 1;
413            memset (carry_q, 0, sizeof(BitBlock) * carry_count);
414        }
415        else {
416            copy_back_pos =  SEGMENT_SIZE;
417        }
418
419        int  copy_back_size = SEGMENT_SIZE - copy_back_pos;
420#endif
421#ifdef USE_MMAP
422    buffer_ptr = &infile_buffer[segment_base];
423#endif
424
425        if (!count_only_option) {
426          line_start = write_matches(outfile, LF_scanner, match_scanner, buffer_ptr, line_start);
427        }
428#ifndef USE_MMAP
429        memmove(&src_buffer[0], &src_buffer[copy_back_pos], copy_back_size);
430
431        //Do another read.
432        chars_read = fread(&src_buffer[copy_back_size], 1, copy_back_pos, infile);
433        chars_avail = chars_read + copy_back_size;
434        if (chars_avail >= SEGMENT_SIZE) chars_avail = SEGMENT_SIZE;
435        buffer_pos += chars_avail;
436#endif
437#ifdef USE_MMAP
438        segment++;
439        line_start -= SEGMENT_SIZE;  /* Will be negative offset for use within next segment. */
440        chars_avail -= SEGMENT_SIZE;
441
442#endif
443    }
444
445
446//////////////////////////////////////////////////////////////////////////////////////////
447// For the Final Partial Segment.
448//////////////////////////////////////////////////////////////////////////////////////////
449
450#ifdef USE_MMAP
451    segment_base = segment * SEGMENT_SIZE;
452#endif
453    int remaining = chars_avail;
454
455    LF_scanner.init();
456    match_scanner.init();
457
458    /* Full Blocks */
459    blk = 0;
460    while (remaining >= BLOCK_SIZE) {
461    //fprintf(outfile, "Remaining = %i\n", remaining);
462#ifndef USE_MMAP
463        block_base = block_pos;
464        s2p_do_block((BytePack *) &src_buffer[block_base], basis_bits);
465#endif
466#ifdef USE_MMAP
467        block_base = block_pos + segment_base;
468        s2p_do_block((BytePack *) &infile_buffer[block_base], basis_bits);
469#endif
470        process_block(basis_bits, carry_q, output);
471
472        LF_scanner.load_block(output.LF, blk);
473        match_scanner.load_block(output.matches, blk);
474        if (count_only_option)
475        {
476            if (bitblock::any(output.matches))
477            {
478                if (bitblock::any(simd_and(match_vector, output.matches)))
479                {
480                    match_count += bitblock::popcount(match_vector);
481                    match_vector = output.matches;
482                }
483                else
484                {
485                    match_vector = simd_or(match_vector, output.matches);
486                }
487            }
488        }
489
490        block_pos += BLOCK_SIZE;
491        remaining -= BLOCK_SIZE;
492        blk++;
493    }
494    block_base = block_pos;
495    //fprintf(stderr, "Remaining = %i\n", remaining);
496
497    //For the last partial block, or for any carry.
498    EOF_mask = bitblock::srl(simd<1>::constant<1>(), convert(BLOCK_SIZE-remaining));
499#ifndef USE_MMAP
500     block_base = block_pos;
501     s2p_do_final_block((BytePack *) &src_buffer[block_base], basis_bits, EOF_mask);
502#endif
503#ifdef USE_MMAP
504     block_base = block_pos + segment_base;
505     s2p_do_final_block((BytePack *) &infile_buffer[block_base], basis_bits, EOF_mask);
506#endif
507    process_block(basis_bits, carry_q, output);
508
509    if (count_only_option)
510    {
511        match_count += bitblock::popcount(match_vector);
512        if (bitblock::any(output.matches))
513        {
514            match_count += bitblock::popcount(output.matches);
515        }
516        fprintf(outfile, "Matching Lines:%d\n", match_count);
517    }
518    else
519    {
520        LF_scanner.load_block(output.LF, blk);
521        match_scanner.load_block(output.matches, blk);
522        blk++;
523        for (int i = blk; i < SEGMENT_BLOCKS; i++)
524        {
525            LF_scanner.load_block(simd<1>::constant<0>(), i);
526            match_scanner.load_block(simd<1>::constant<0>(), i);
527        }
528#ifndef USE_MMAP
529        line_start = 0;
530#endif
531#ifdef USE_MMAP
532        buffer_ptr = &infile_buffer[segment_base];
533#endif
534        line_start = write_matches(outfile, LF_scanner, match_scanner, buffer_ptr, line_start);
535    }
536
537    buffer_pos += chars_avail;
538}
539
540
541
Note: See TracBrowser for help on using the repository browser.