source: icGREP/icgrep-devel/icgrep/icgrep.cpp @ 4137

Last change on this file since 4137 was 4137, checked in by linmengl, 5 years ago

move USE_UADD_OVERFLOW to llvm_gen.h: it doesn't work in icgrep.cpp; add abc_test in 'make check';

File size: 15.2 KB
RevLine 
[3850]1/*
2 *  Copyright (c) 2014 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
[3965]7#include "icgrep.h"
8
[3850]9#include "utf_encoding.h"
10#include "re_compiler.h"
11
12#include <fstream>
13#include <sstream>
14#include <iostream>
15#include <string>
16#include <stdint.h>
17
18#include <stdio.h>
19#include <stdlib.h>
20#include <unistd.h>
21#include <errno.h>
22#include <sys/types.h>
23#include <sys/stat.h>
24
25#include <simd-lib/bitblock.hpp>
26#include <simd-lib/carryQ.hpp>
27#include <simd-lib/pabloSupport.hpp>
28#include <simd-lib/s2p.hpp>
29#include <simd-lib/buffer.hpp>
30#include <simd-lib/bitblock_iterator.hpp>
31
[4038]32// mmap system
33#include <sys/mman.h>
34#include <fcntl.h>
35
36
// Number of blocks that make up one segment.  do_process() transposes and
// matches one block at a time and scans for line ends / matches one segment
// at a time.
#define SEGMENT_BLOCKS 7
// Segment length in bytes.  BLOCK_SIZE is not defined in this file; it is
// expected to come from one of the included headers.
#define SEGMENT_SIZE (BLOCK_SIZE * SEGMENT_BLOCKS)

#define BUFFER_SEGMENTS 15
#define BUFFER_SIZE (BUFFER_SEGMENTS * SEGMENT_SIZE)

// Declares a single BitBlock variable called `name`.
#define BitBlock_declare(name)  BitBlock name

// Declares `ubitblock name[n]` and sets the _128 member of every element to
// simd<1>::constant<0>().  The element count is parenthesized and the loop
// counter has a macro-specific name so that arguments such as `i` or
// `a ? b : c` expand correctly.
#define ubitblock_declare(name, n) \
  ubitblock name[(n)];\
  do {int ubitblock_declare_idx;\
      for (ubitblock_declare_idx = 0; ubitblock_declare_idx < (n); ubitblock_declare_idx++) name[ubitblock_declare_idx]._128 = simd<1>::constant<0>();\
     }\
  while (0)
51
52BitBlock EOF_mask = simd<1>::constant<1>();
53
54struct Output {
55    BitBlock matches;
56    BitBlock LF;
57};
58
59#include <simd-lib/transpose.hpp>
60
61using namespace std;
62
63typedef void (*process_block_fcn)(const Basis_bits &basis_bits, BitBlock carry_q[], Output &output);
64
[4038]65
[4074]66#define USE_MMAP
[4038]67#ifndef USE_MMAP
[3850]68void do_process(FILE *infile, FILE *outfile, int count_only_option, int carry_count, process_block_fcn process_block);
[4038]69#endif
70#ifdef USE_MMAP
71void do_process(char * infile_buffer, size_t infile_size, FILE *outfile, int count_only_option, int carry_count, process_block_fcn process_block);
72#endif
73
[4118]74
[3969]75BitBlock get_category(Basis_bits &basis_bits, const char* category);
[3850]76
77int main(int argc, char *argv[])
78{
79    char * inregex, * fileregex, * infilename, * outfilename;
80    FILE *infile, *outfile, *regexfile;
81
[4038]82#ifdef USE_MMAP
83    int fdSrc;
84    struct stat infile_sb;
85    char * infile_buffer;
86#endif
[4118]87
[3850]88    int opt_code;
89    int count_only_option = 0;
90    int print_version_option = 0;
91    int regex_from_file_option = 0;
[3914]92    int ascii_only_option = 0;
[3850]93
94    int compile_time_option = 0;
95
[3914]96    unsigned long long cycles = 0;
97    double timer = 0;
98
99    long lSize = 0;
100
[3850]101    size_t result;
102
[3914]103    while ((opt_code = getopt(argc, argv, "cvfta")) != -1)
[3850]104    {
105        switch (opt_code)
106        {
107        case 'c':
108            count_only_option = 1;
109            break;
110        case 'v':
111            print_version_option = 1;
112            break;
113        case 'f':
114            regex_from_file_option = 1;
115            break;
116        case 't':
117            compile_time_option = 1;
118            break;
[3914]119        case 'a':
120            ascii_only_option = 1;
121            break;
[3850]122        case '?':
123            break;
124        default:
125            printf ("Invalid option: %c\n", opt_code);
[3914]126            printf("Usage: %s [-c] [-v] [-f] [-t] [-a] <regex|regexfile> <inputfile> [<outputfile>]\n", argv[0]);
[3850]127                    exit(-1);
128        }
129    }
130
131    if (optind >= argc)
132    {
133        printf ("Too few arguments\n");
[3914]134        printf("Usage: %s [-c] [-v] [-f] [-t] [-a] <regex|regexfile> <inputfile> [<outputfile>]\n", argv[0]);
[3850]135        exit(-1);
136    }
137
138    inregex = argv[optind++];
139    if (inregex == 0)
140    {
141        fprintf(stderr, "Error: cannot read the regular expression.\n");
142        exit(-1);
143    }
144
145    if (regex_from_file_option)
146    {
147        regexfile = fopen(inregex, "rb");
148        if (!regexfile){
149            fprintf(stderr, "Error: cannot open %s for processing.\n", inregex);
150            exit(-1);
151        }
152
153        fseek (regexfile , 0 , SEEK_END);
154        lSize = ftell (regexfile);
155        rewind (regexfile);
156
157        fileregex = (char*) malloc (sizeof(char)*lSize);
158        if (fileregex == NULL) {fputs ("Memory error",stderr); exit (2);}
159
160        result = fread (fileregex, 1, lSize, regexfile);
161        if (result != lSize) {fputs ("Reading error",stderr); exit (3);}
162        fclose(regexfile);
163
164        if (fileregex[lSize - 1] == '\n') fileregex[lSize - 1] = '\0';
165    }
166
167    infilename = argv[optind++];
[4038]168#ifndef USE_MMAP
[3850]169    infile = fopen(infilename, "rb");
170    if (!infile) {
171        fprintf(stderr, "Error: cannot open %s for processing.\n", infilename);
172        exit(-1);
173    }
[4038]174#endif
175#ifdef USE_MMAP
176    fdSrc = open(infilename, O_RDONLY);
177    if (fdSrc == -1) {
178        fprintf(stderr, "Error: cannot open %s for processing.\n", infilename);
179        exit(-1);
180    }
181    if (fstat(fdSrc, &infile_sb) == -1) {
182        fprintf(stderr, "Error: cannot stat %s for processing.\n", infilename);
183        exit(-1);
184    }
185    if (infile_sb.st_size == 0) {
186        if (count_only_option) fprintf(outfile, "Matching Lines%d\n", 0);
187        exit(0);
188    }
189    infile_buffer = (char *) mmap(NULL, infile_sb.st_size, PROT_READ, MAP_PRIVATE, fdSrc, 0);
190    if (infile_buffer == MAP_FAILED) {
191        fprintf(stderr, "Error: mmap of %s failure.\n", infilename);
192        exit(-1);
193    }
194#endif
[3850]195
196    if (optind >= argc) outfile = stdout;
197    else
198    {
199        outfilename = argv[optind++];
200        if (optind != argc)
201        {
202            printf ("Too many arguments\n");
[3914]203            printf("Usage: %s [-c] [-v] [-f] [-t] [-a] <regex|regexfile> <inputfile> [<outputfile>]\n", argv[0]);
[3850]204            exit(-1);
205        }
206        outfile = fopen(outfilename, "wb");
207        if (!outfile)
208        {
209            fprintf(stderr, "Error: cannot open %s for writing.\n", outfilename);
210            exit(-1);
211        }
212    }
213
214    if (print_version_option)
215    {
[4038]216        fprintf(outfile, "Parabix icgrep implementation: August 2014\n");
[3850]217    }
218
219    UTF_Encoding encoding;
220    encoding.setName("UTF-8");
221    encoding.setBits(8);
222    encoding.setMask(0xFF);
223
224    RE_Compiler* re_compiler = new RE_Compiler();
[3914]225    if (compile_time_option)
226    {
227        cycles = get_hrcycles();
228        timer = getElapsedTime();
229    }
230    LLVM_Gen_RetVal llvm_codegen = re_compiler->compile(compile_time_option,
231                                                        ascii_only_option,
232                                                        "basis_bits.bit_",
233                                                        "temp",
234                                                        encoding ,
235                                                        (regex_from_file_option ? fileregex : inregex));
[3850]236
237    if (compile_time_option)
238    {
[3914]239        cycles = get_hrcycles() - cycles;
[3850]240        timer = getElapsedTime() - timer;
[3914]241        std::cout << "Total compile time - cycles:       " << cycles << std::endl;
242        std::cout << "Total compile time - milliseconds: " << timer << std::endl;
[3850]243    }
244
245    if (llvm_codegen.process_block_fptr != 0)
246    {
247        void (*FP)(const Basis_bits &basis_bits, BitBlock carry_q[], Output &output) = (void (*)(const Basis_bits &basis_bits, BitBlock carry_q[], Output &output))(void*)llvm_codegen.process_block_fptr;
[4038]248#ifndef USE_MMAP
[3850]249        do_process(infile, outfile, count_only_option, llvm_codegen.carry_q_size, FP);
[4038]250#endif
251#ifdef USE_MMAP
252        do_process(infile_buffer, infile_sb.st_size, outfile, count_only_option, llvm_codegen.carry_q_size, FP);
253#endif
[3850]254    }
255
256    delete re_compiler;
[4038]257#ifndef USE_MMAP
[3850]258    fclose(infile);
[4038]259#endif
260#ifdef USE_MMAP
[4080]261    munmap((void *) infile_buffer, infile_sb.st_size);
[4038]262    close(fdSrc);
263#endif
[3850]264    fclose(outfile);
265    if (regex_from_file_option) free(fileregex);
266
267    return 0;
268}
269
[4134]270
271typedef BitStreamScanner<BitBlock, uint32_t, uint32_t, SEGMENT_BLOCKS> ScannerT;
272
273//
[4137]274// Write matched lines from a buffer to an output file, given segment
[4134]275// scanners for line ends and matches (where matches are a subset of line ends).
[4137]276// The buffer pointer must point to the first byte of the segment
[4134]277// corresponding to the scanner indexes.   The first_line_start is the
278// start position of the first line relative to the buffer start position.
[4137]279// It must be zero or negative;  if negative, the buffer must permit negative
280// indexing so that the lineup to the buffer start position can also be printed.
281// The start position of the final line in the processed segment is returned.
[4134]282//
283
284ssize_t write_matches(FILE * outfile, ScannerT line_scanner, ScannerT match_scanner, char * buffer, ssize_t first_line_start) {
285
286  ssize_t line_start = first_line_start;
287  size_t match_pos;
288  size_t line_end;
289  while (match_scanner.has_next()) {
290    match_pos = match_scanner.scan_to_next();
291    // If we found a match, it must be at a line end.
292    line_end = line_scanner.scan_to_next();
293    while (line_end < match_pos) {
294      line_start = line_end + 1;
295      line_end = line_scanner.scan_to_next();
296    }
297    fwrite(&buffer[line_start], 1, line_end - line_start + 1, outfile);
298    line_start = line_end + 1;
299
300  }
301  while(line_scanner.has_next()) {
302    line_end = line_scanner.scan_to_next();
303    line_start = line_end+1;
304  }
305  return line_start;
306}
307
308
[4137]309
[4038]310#ifndef USE_MMAP
[3850]311void do_process(FILE *infile, FILE *outfile, int count_only_option, int carry_count, process_block_fcn process_block) {
[4038]312#endif
313#ifdef USE_MMAP
314void do_process(char * infile_buffer, size_t infile_size, FILE *outfile, int count_only_option, int carry_count, process_block_fcn process_block) {
315#endif
[3850]316
317    struct Basis_bits basis_bits;
318    struct Output output;
319
320    BitBlock carry_q[carry_count];
321    memset (carry_q, 0, sizeof(BitBlock) * carry_count);
322
323    BitBlock match_vector = simd<1>::constant<0>();
324    int match_count=0;
325    int blk = 0;
326    int block_base  = 0;
327    int block_pos   = 0;
328    int buffer_pos  = 0;
329    int chars_avail = 0;
330    int chars_read  = 0;
331
332    int line_start = 0;
333    int line_end = 0;
334    int match_pos = 0;
335    int line_no = 0;
336
[4134]337    ScannerT LF_scanner;
338    ScannerT match_scanner;
[4118]339
[4134]340    char * buffer_ptr;
[4038]341#ifndef USE_MMAP
[3850]342    ATTRIBUTE_SIMD_ALIGN char src_buffer[SEGMENT_SIZE];
[4134]343    buffer_ptr = &src_buffer;
[3850]344    chars_read = fread((void *)&src_buffer[0], 1, SEGMENT_SIZE, infile);
345    chars_avail = chars_read;
346    if (chars_avail >= SEGMENT_SIZE) chars_avail = SEGMENT_SIZE;
[4038]347#endif
[4118]348#ifdef USE_MMAP
[4038]349    int segment = 0;
350    int segment_base = 0;
351    chars_avail = infile_size;
[4118]352
[4038]353#endif
[3850]354//////////////////////////////////////////////////////////////////////////////////////////
355// Full Segments
356//////////////////////////////////////////////////////////////////////////////////////////
357
[4118]358
359
[3850]360    while (chars_avail >= SEGMENT_SIZE) {
361
[4038]362#ifdef USE_MMAP
363        segment_base = segment * SEGMENT_SIZE;
364#endif
[3850]365        LF_scanner.init();
366        match_scanner.init();
367
368        for (blk = 0; blk < SEGMENT_BLOCKS; blk++) {
[4038]369#ifndef USE_MMAP
[3850]370            block_base = blk*BLOCK_SIZE;
371            s2p_do_block((BytePack *) &src_buffer[block_base], basis_bits);
[4038]372#endif
373#ifdef USE_MMAP
374            block_base = blk*BLOCK_SIZE + segment_base;
[4118]375            s2p_do_block((BytePack *) &infile_buffer[block_base], basis_bits);
[4038]376#endif
[3850]377            process_block(basis_bits, carry_q, output);
378
379            LF_scanner.load_block(output.LF, blk);
380            match_scanner.load_block(output.matches, blk);
381            if (count_only_option){
382                if (bitblock::any(output.matches))
383                {
384                    if (bitblock::any(simd_and(match_vector, output.matches))){
385                        match_count += bitblock::popcount(match_vector);
386                        match_vector = output.matches;
387                    }
388                    else
389                    {
390                        match_vector = simd_or(match_vector, output.matches);
391                    }
392                }
393            }
394        }
395
[4038]396#ifndef USE_MMAP
[3850]397        int copy_back_pos = 0;
398
[3914]399
[3850]400        if (LF_scanner.count() > 0) {
401            copy_back_pos = LF_scanner.get_final_pos() + 1;
402            memset (carry_q, 0, sizeof(BitBlock) * carry_count);
403        }
404        else {
405            copy_back_pos =  SEGMENT_SIZE;
406        }
407
408        int  copy_back_size = SEGMENT_SIZE - copy_back_pos;
[4038]409#endif
410#ifdef USE_MMAP
[4134]411    buffer_ptr = &infile_buffer[segment_base];
[4038]412#endif
[3850]413
414        if (!count_only_option) {
[4134]415          line_start = write_matches(outfile, LF_scanner, match_scanner, buffer_ptr, line_start);
[3850]416        }
[4038]417#ifndef USE_MMAP
[3850]418        memmove(&src_buffer[0], &src_buffer[copy_back_pos], copy_back_size);
419
[4038]420        //Do another read.
[3850]421        chars_read = fread(&src_buffer[copy_back_size], 1, copy_back_pos, infile);
422        chars_avail = chars_read + copy_back_size;
423        if (chars_avail >= SEGMENT_SIZE) chars_avail = SEGMENT_SIZE;
424        buffer_pos += chars_avail;
[4038]425#endif
426#ifdef USE_MMAP
427        segment++;
428        line_start -= SEGMENT_SIZE;  /* Will be negative offset for use within next segment. */
429        chars_avail -= SEGMENT_SIZE;
[4118]430
[4038]431#endif
[3850]432    }
433
434
435//////////////////////////////////////////////////////////////////////////////////////////
436// For the Final Partial Segment.
437//////////////////////////////////////////////////////////////////////////////////////////
438
[4038]439#ifdef USE_MMAP
440    segment_base = segment * SEGMENT_SIZE;
441#endif
[3850]442    int remaining = chars_avail;
443
444    LF_scanner.init();
445    match_scanner.init();
446
447    /* Full Blocks */
448    blk = 0;
449    while (remaining >= BLOCK_SIZE) {
[4038]450    //fprintf(outfile, "Remaining = %i\n", remaining);
451#ifndef USE_MMAP
[3850]452        block_base = block_pos;
[4038]453        s2p_do_block((BytePack *) &src_buffer[block_base], basis_bits);
454#endif
455#ifdef USE_MMAP
456        block_base = block_pos + segment_base;
[4118]457        s2p_do_block((BytePack *) &infile_buffer[block_base], basis_bits);
[4038]458#endif
[3850]459        process_block(basis_bits, carry_q, output);
460
461        LF_scanner.load_block(output.LF, blk);
462        match_scanner.load_block(output.matches, blk);
463        if (count_only_option)
464        {
465            if (bitblock::any(output.matches))
466            {
467                if (bitblock::any(simd_and(match_vector, output.matches)))
468                {
469                    match_count += bitblock::popcount(match_vector);
470                    match_vector = output.matches;
471                }
472                else
473                {
474                    match_vector = simd_or(match_vector, output.matches);
475                }
476            }
477        }
478
479        block_pos += BLOCK_SIZE;
480        remaining -= BLOCK_SIZE;
481        blk++;
482    }
483    block_base = block_pos;
[4038]484    //fprintf(stderr, "Remaining = %i\n", remaining);
[3850]485
486    //For the last partial block, or for any carry.
487    EOF_mask = bitblock::srl(simd<1>::constant<1>(), convert(BLOCK_SIZE-remaining));
[4038]488#ifndef USE_MMAP
489     block_base = block_pos;
490     s2p_do_final_block((BytePack *) &src_buffer[block_base], basis_bits, EOF_mask);
491#endif
492#ifdef USE_MMAP
493     block_base = block_pos + segment_base;
[4118]494     s2p_do_final_block((BytePack *) &infile_buffer[block_base], basis_bits, EOF_mask);
[4038]495#endif
[3850]496    process_block(basis_bits, carry_q, output);
497
498    if (count_only_option)
499    {
500        match_count += bitblock::popcount(match_vector);
501        if (bitblock::any(output.matches))
502        {
503            match_count += bitblock::popcount(output.matches);
504        }
505        fprintf(outfile, "Matching Lines:%d\n", match_count);
506    }
507    else
508    {
509        LF_scanner.load_block(output.LF, blk);
510        match_scanner.load_block(output.matches, blk);
511        blk++;
512        for (int i = blk; i < SEGMENT_BLOCKS; i++)
513        {
514            LF_scanner.load_block(simd<1>::constant<0>(), i);
515            match_scanner.load_block(simd<1>::constant<0>(), i);
516        }
[4038]517#ifndef USE_MMAP
[3850]518        line_start = 0;
[4038]519#endif
520#ifdef USE_MMAP
[4134]521        buffer_ptr = &infile_buffer[segment_base];
[4038]522#endif
[4134]523        line_start = write_matches(outfile, LF_scanner, match_scanner, buffer_ptr, line_start);
[3850]524    }
525
526    buffer_pos += chars_avail;
527}
528
529
530
Note: See TracBrowser for help on using the repository browser.