source: icGREP/icgrep-devel/icgrep/icgrep.cpp @ 4187

Last change on this file since 4187 was 4187, checked in by nmedfort, 5 years ago

Some refactoring of the RE CC class and CC Compiler; Moved RE into re subdirectory.

File size: 15.5 KB
Line 
1/*
2 *  Copyright (c) 2014 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "icgrep.h"
8
9#include "utf_encoding.h"
10#include "re/re_compiler.h"
11
12#include <fstream>
13#include <sstream>
14#include <iostream>
15#include <string>
16#include <stdint.h>
17
18#include <stdio.h>
19#include <stdlib.h>
20#include <unistd.h>
21#include <errno.h>
22#include <sys/types.h>
23#include <sys/stat.h>
24
25#include <simd-lib/bitblock.hpp>
26#include <simd-lib/carryQ.hpp>
27#include <simd-lib/pabloSupport.hpp>
28#include <simd-lib/s2p.hpp>
29#include <simd-lib/buffer.hpp>
30#include <simd-lib/bitblock_iterator.hpp>
31
32#include "hrtime.h"
33
34// mmap system
35#include <sys/mman.h>
36#include <fcntl.h>
37
38#if (BLOCK_SIZE == 128)
39#define SEGMENT_BLOCKS 7
40#endif
41
42#if (BLOCK_SIZE == 256)
43#define SEGMENT_BLOCKS 15
44#endif
45
46#define SEGMENT_SIZE (BLOCK_SIZE * SEGMENT_BLOCKS)
47
48#define BUFFER_SEGMENTS 15
49#define BUFFER_SIZE (BUFFER_SEGMENTS * SEGMENT_SIZE)
50
51#define BitBlock_declare(name)  BitBlock name
52
53#define ubitblock_declare(name, n) \
54  ubitblock name[n];\
55  do {int i;\
56      for (i = 0; i < n; i++) name[i]._128 = simd<1>::constant<0>();\
57     }\
58  while (0)
59
60BitBlock EOF_mask = simd<1>::constant<1>();
61
62struct Output {
63    BitBlock matches;
64    BitBlock LF;
65};
66
67#include <simd-lib/transpose.hpp>
68
69using namespace std;
70
71typedef void (*process_block_fcn)(const Basis_bits &basis_bits, BitBlock carry_q[], Output &output);
72
73
74#define USE_MMAP
75#ifndef USE_MMAP
76void do_process(FILE *infile, FILE *outfile, int count_only_option, int carry_count, process_block_fcn process_block);
77#endif
78#ifdef USE_MMAP
79void do_process(char * infile_buffer, size_t infile_size, FILE *outfile, int count_only_option, int carry_count, process_block_fcn process_block);
80#endif
81
82
83BitBlock get_category(Basis_bits &basis_bits, const char* category);
84
85int main(int argc, char *argv[])
86{
87    char * inregex, * fileregex, * infilename, * outfilename;
88    FILE *infile, *outfile, *regexfile;
89
90#ifdef USE_MMAP
91    int fdSrc;
92    struct stat infile_sb;
93    char * infile_buffer;
94#endif
95
96    int opt_code;
97    int count_only_option = 0;
98    int print_version_option = 0;
99    int regex_from_file_option = 0;
100    int ascii_only_option = 0;
101
102    int compile_time_option = 0;
103
104    unsigned long long cycles = 0;
105    double timer = 0;
106
107    long lSize = 0;
108
109    size_t result;
110
111    while ((opt_code = getopt(argc, argv, "cvfta")) != -1)
112    {
113        switch (opt_code)
114        {
115        case 'c':
116            count_only_option = 1;
117            break;
118        case 'v':
119            print_version_option = 1;
120            break;
121        case 'f':
122            regex_from_file_option = 1;
123            break;
124        case 't':
125            compile_time_option = 1;
126            break;
127        case 'a':
128            ascii_only_option = 1;
129            break;
130        case '?':
131            break;
132        default:
133            printf ("Invalid option: %c\n", opt_code);
134            printf("Usage: %s [-c] [-v] [-f] [-t] [-a] <regex|regexfile> <inputfile> [<outputfile>]\n", argv[0]);
135                    exit(-1);
136        }
137    }
138
139    if (optind >= argc)
140    {
141        printf ("Too few arguments\n");
142        printf("Usage: %s [-c] [-v] [-f] [-t] [-a] <regex|regexfile> <inputfile> [<outputfile>]\n", argv[0]);
143        exit(-1);
144    }
145
146    inregex = argv[optind++];
147    if (inregex == 0)
148    {
149        fprintf(stderr, "Error: cannot read the regular expression.\n");
150        exit(-1);
151    }
152
153    if (regex_from_file_option)
154    {
155        regexfile = fopen(inregex, "rb");
156        if (!regexfile){
157            fprintf(stderr, "Error: cannot open %s for processing.\n", inregex);
158            exit(-1);
159        }
160
161        fseek (regexfile , 0 , SEEK_END);
162        lSize = ftell (regexfile);
163        rewind (regexfile);
164
165        fileregex = (char*) malloc (sizeof(char)*lSize);
166        if (fileregex == NULL) {fputs ("Memory error",stderr); exit (2);}
167
168        result = fread (fileregex, 1, lSize, regexfile);
169        if (result != lSize) {fputs ("Reading error",stderr); exit (3);}
170        fclose(regexfile);
171
172        if (fileregex[lSize - 1] == '\n') fileregex[lSize - 1] = '\0';
173    }
174
175    infilename = argv[optind++];
176#ifndef USE_MMAP
177    infile = fopen(infilename, "rb");
178    if (!infile) {
179        fprintf(stderr, "Error: cannot open %s for processing.\n", infilename);
180        exit(-1);
181    }
182#endif
183
184    if (optind >= argc) {
185        outfile = stdout;
186    }
187    else {
188        outfilename = argv[optind++];
189        if (optind != argc)
190        {
191            printf ("Too many arguments\n");
192            printf("Usage: %s [-c] [-v] [-f] [-t] [-a] <regex|regexfile> <inputfile> [<outputfile>]\n", argv[0]);
193            exit(-1);
194        }
195        outfile = fopen(outfilename, "wb");
196        if (!outfile)
197        {
198            fprintf(stderr, "Error: cannot open %s for writing.\n", outfilename);
199            exit(-1);
200        }
201    }
202
203#ifdef USE_MMAP
204    fdSrc = open(infilename, O_RDONLY);
205    if (fdSrc == -1) {
206        fprintf(stderr, "Error: cannot open %s for processing.\n", infilename);
207        exit(-1);
208    }
209    if (fstat(fdSrc, &infile_sb) == -1) {
210        fprintf(stderr, "Error: cannot stat %s for processing.\n", infilename);
211        exit(-1);
212    }
213    if (infile_sb.st_size == 0) {
214        if (count_only_option) fprintf(outfile, "Matching Lines: %d\n", 0);
215        exit(0);
216    }
217    infile_buffer = (char *) mmap(NULL, infile_sb.st_size, PROT_READ, MAP_PRIVATE, fdSrc, 0);
218    if (infile_buffer == MAP_FAILED) {
219        fprintf(stderr, "Error: mmap of %s failure.\n", infilename);
220        exit(-1);
221    }
222#endif
223
224    if (print_version_option)
225    {
226        fprintf(outfile, "Parabix icgrep implementation: August 2014\n");
227    }
228
229    UTF_Encoding encoding;
230    encoding.setName("UTF-8");
231    encoding.setBits(8);
232    encoding.setMask(0xFF);
233
234    RE_Compiler* re_compiler = new RE_Compiler();
235    if (compile_time_option)
236    {
237        cycles = get_hrcycles();
238        timer = getElapsedTime();
239    }
240    LLVM_Gen_RetVal llvm_codegen = re_compiler->compile(compile_time_option,
241                                                        ascii_only_option,
242                                                        "basis_bits.bit_",
243                                                        "temp",
244                                                        encoding ,
245                                                        (regex_from_file_option ? fileregex : inregex));
246
247    if (compile_time_option)
248    {
249        cycles = get_hrcycles() - cycles;
250        timer = getElapsedTime() - timer;
251        std::cout << "Total compile time - cycles:       " << cycles << std::endl;
252        std::cout << "Total compile time - milliseconds: " << timer << std::endl;
253    }
254
255    if (llvm_codegen.process_block_fptr != 0)
256    {
257        void (*FP)(const Basis_bits &basis_bits, BitBlock carry_q[], Output &output) = (void (*)(const Basis_bits &basis_bits, BitBlock carry_q[], Output &output))(void*)llvm_codegen.process_block_fptr;
258#ifndef USE_MMAP
259        do_process(infile, outfile, count_only_option, llvm_codegen.carry_q_size, FP);
260#endif
261#ifdef USE_MMAP
262        do_process(infile_buffer, infile_sb.st_size, outfile, count_only_option, llvm_codegen.carry_q_size, FP);
263#endif
264    }
265
266    delete re_compiler;
267#ifndef USE_MMAP
268    fclose(infile);
269#endif
270#ifdef USE_MMAP
271    munmap((void *) infile_buffer, infile_sb.st_size);
272    close(fdSrc);
273#endif
274    fclose(outfile);
275    if (regex_from_file_option) free(fileregex);
276
277    return 0;
278}
279
// Segment scanner type used for the line-end and match streams; the
// template arguments are chosen to go with the configured BLOCK_SIZE.
#if (BLOCK_SIZE == 256)
typedef BitStreamScanner<BitBlock, uint64_t, uint64_t, SEGMENT_BLOCKS> ScannerT;
#endif

#if (BLOCK_SIZE == 128)
typedef BitStreamScanner<BitBlock, uint32_t, uint32_t, SEGMENT_BLOCKS> ScannerT;
#endif
287
288//
289// Write matched lines from a buffer to an output file, given segment
290// scanners for line ends and matches (where matches are a subset of line ends).
291// The buffer pointer must point to the first byte of the segment
292// corresponding to the scanner indexes.   The first_line_start is the
293// start position of the first line relative to the buffer start position.
294// It must be zero or negative;  if negative, the buffer must permit negative
295// indexing so that the lineup to the buffer start position can also be printed.
296// The start position of the final line in the processed segment is returned.
297//
298
299ssize_t write_matches(FILE * outfile, ScannerT line_scanner, ScannerT match_scanner, char * buffer, ssize_t first_line_start) {
300
301  ssize_t line_start = first_line_start;
302  size_t match_pos;
303  size_t line_end;
304  while (match_scanner.has_next()) {
305    match_pos = match_scanner.scan_to_next();
306    // If we found a match, it must be at a line end.
307    line_end = line_scanner.scan_to_next();
308    while (line_end < match_pos) {
309      line_start = line_end + 1;
310      line_end = line_scanner.scan_to_next();
311    }
312    fwrite(&buffer[line_start], 1, line_end - line_start + 1, outfile);
313    line_start = line_end + 1;
314
315  }
316  while(line_scanner.has_next()) {
317    line_end = line_scanner.scan_to_next();
318    line_start = line_end+1;
319  }
320  return line_start;
321}
322
323
324
325#ifndef USE_MMAP
326void do_process(FILE *infile, FILE *outfile, int count_only_option, int carry_count, process_block_fcn process_block) {
327#endif
328#ifdef USE_MMAP
329void do_process(char * infile_buffer, size_t infile_size, FILE *outfile, int count_only_option, int carry_count, process_block_fcn process_block) {
330#endif
331
332    struct Basis_bits basis_bits;
333    struct Output output;
334
335    BitBlock carry_q[carry_count];
336    memset (carry_q, 0, sizeof(BitBlock) * carry_count);
337
338    BitBlock match_vector = simd<1>::constant<0>();
339    int match_count=0;
340    int blk = 0;
341    int block_base  = 0;
342    int block_pos   = 0;
343    int buffer_pos  = 0;
344    int chars_avail = 0;
345    int chars_read  = 0;
346
347    int line_start = 0;
348    int line_end = 0;
349    int match_pos = 0;
350    int line_no = 0;
351
352    ScannerT LF_scanner;
353    ScannerT match_scanner;
354
355    char * buffer_ptr;
356#ifndef USE_MMAP
357    ATTRIBUTE_SIMD_ALIGN char src_buffer[SEGMENT_SIZE];
358    buffer_ptr = &src_buffer;
359    chars_read = fread((void *)&src_buffer[0], 1, SEGMENT_SIZE, infile);
360    chars_avail = chars_read;
361    if (chars_avail >= SEGMENT_SIZE) chars_avail = SEGMENT_SIZE;
362#endif
363#ifdef USE_MMAP
364    int segment = 0;
365    int segment_base = 0;
366    chars_avail = infile_size;
367
368#endif
369//////////////////////////////////////////////////////////////////////////////////////////
370// Full Segments
371//////////////////////////////////////////////////////////////////////////////////////////
372
373
374
375    while (chars_avail >= SEGMENT_SIZE) {
376
377#ifdef USE_MMAP
378        segment_base = segment * SEGMENT_SIZE;
379#endif
380        LF_scanner.init();
381        match_scanner.init();
382
383        for (blk = 0; blk < SEGMENT_BLOCKS; blk++) {
384#ifndef USE_MMAP
385            block_base = blk*BLOCK_SIZE;
386            s2p_do_block((BytePack *) &src_buffer[block_base], basis_bits);
387#endif
388#ifdef USE_MMAP
389            block_base = blk*BLOCK_SIZE + segment_base;
390            s2p_do_block((BytePack *) &infile_buffer[block_base], basis_bits);
391#endif
392            process_block(basis_bits, carry_q, output);
393
394            LF_scanner.load_block(output.LF, blk);
395            match_scanner.load_block(output.matches, blk);
396            if (count_only_option){
397                if (bitblock::any(output.matches))
398                {
399                    if (bitblock::any(simd_and(match_vector, output.matches))){
400                        match_count += bitblock::popcount(match_vector);
401                        match_vector = output.matches;
402                    }
403                    else
404                    {
405                        match_vector = simd_or(match_vector, output.matches);
406                    }
407                }
408            }
409        }
410
411#ifndef USE_MMAP
412        int copy_back_pos = 0;
413
414
415        if (LF_scanner.count() > 0) {
416            copy_back_pos = LF_scanner.get_final_pos() + 1;
417            memset (carry_q, 0, sizeof(BitBlock) * carry_count);
418        }
419        else {
420            copy_back_pos =  SEGMENT_SIZE;
421        }
422
423        int  copy_back_size = SEGMENT_SIZE - copy_back_pos;
424#endif
425#ifdef USE_MMAP
426    buffer_ptr = &infile_buffer[segment_base];
427#endif
428
429        if (!count_only_option) {
430          line_start = write_matches(outfile, LF_scanner, match_scanner, buffer_ptr, line_start);
431        }
432#ifndef USE_MMAP
433        memmove(&src_buffer[0], &src_buffer[copy_back_pos], copy_back_size);
434
435        //Do another read.
436        chars_read = fread(&src_buffer[copy_back_size], 1, copy_back_pos, infile);
437        chars_avail = chars_read + copy_back_size;
438        if (chars_avail >= SEGMENT_SIZE) chars_avail = SEGMENT_SIZE;
439        buffer_pos += chars_avail;
440#endif
441#ifdef USE_MMAP
442        segment++;
443        line_start -= SEGMENT_SIZE;  /* Will be negative offset for use within next segment. */
444        chars_avail -= SEGMENT_SIZE;
445
446#endif
447    }
448
449
450//////////////////////////////////////////////////////////////////////////////////////////
451// For the Final Partial Segment.
452//////////////////////////////////////////////////////////////////////////////////////////
453
454#ifdef USE_MMAP
455    segment_base = segment * SEGMENT_SIZE;
456#endif
457    int remaining = chars_avail;
458
459    LF_scanner.init();
460    match_scanner.init();
461
462    /* Full Blocks */
463    blk = 0;
464    while (remaining >= BLOCK_SIZE) {
465    //fprintf(outfile, "Remaining = %i\n", remaining);
466#ifndef USE_MMAP
467        block_base = block_pos;
468        s2p_do_block((BytePack *) &src_buffer[block_base], basis_bits);
469#endif
470#ifdef USE_MMAP
471        block_base = block_pos + segment_base;
472        s2p_do_block((BytePack *) &infile_buffer[block_base], basis_bits);
473#endif
474        process_block(basis_bits, carry_q, output);
475
476        LF_scanner.load_block(output.LF, blk);
477        match_scanner.load_block(output.matches, blk);
478        if (count_only_option)
479        {
480            if (bitblock::any(output.matches))
481            {
482                if (bitblock::any(simd_and(match_vector, output.matches)))
483                {
484                    match_count += bitblock::popcount(match_vector);
485                    match_vector = output.matches;
486                }
487                else
488                {
489                    match_vector = simd_or(match_vector, output.matches);
490                }
491            }
492        }
493
494        block_pos += BLOCK_SIZE;
495        remaining -= BLOCK_SIZE;
496        blk++;
497    }
498    block_base = block_pos;
499    //fprintf(stderr, "Remaining = %i\n", remaining);
500
501    //For the last partial block, or for any carry.
502    EOF_mask = bitblock::srl(simd<1>::constant<1>(), convert(BLOCK_SIZE-remaining));
503#ifndef USE_MMAP
504     block_base = block_pos;
505     s2p_do_final_block((BytePack *) &src_buffer[block_base], basis_bits, EOF_mask);
506#endif
507#ifdef USE_MMAP
508     block_base = block_pos + segment_base;
509     s2p_do_final_block((BytePack *) &infile_buffer[block_base], basis_bits, EOF_mask);
510#endif
511    process_block(basis_bits, carry_q, output);
512
513    if (count_only_option)
514    {
515        match_count += bitblock::popcount(match_vector);
516        if (bitblock::any(output.matches))
517        {
518            match_count += bitblock::popcount(output.matches);
519        }
520        fprintf(outfile, "Matching Lines:%d\n", match_count);
521    }
522    else
523    {
524        LF_scanner.load_block(output.LF, blk);
525        match_scanner.load_block(output.matches, blk);
526        blk++;
527        for (int i = blk; i < SEGMENT_BLOCKS; i++)
528        {
529            LF_scanner.load_block(simd<1>::constant<0>(), i);
530            match_scanner.load_block(simd<1>::constant<0>(), i);
531        }
532#ifndef USE_MMAP
533        line_start = 0;
534#endif
535#ifdef USE_MMAP
536        buffer_ptr = &infile_buffer[segment_base];
537#endif
538        line_start = write_matches(outfile, LF_scanner, match_scanner, buffer_ptr, line_start);
539    }
540
541    buffer_pos += chars_avail;
542}
543
544
545
Note: See TracBrowser for help on using the repository browser.