source: icGREP/icgrep-devel/icgrep/icgrep.cpp @ 4288

Last change on this file since 4288 was 4288, checked in by cameron, 5 years ago

Separate Advance Queue from Carry Queue

File size: 15.4 KB
Line 
1/*
2 *  Copyright (c) 2014 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "icgrep.h"
8#include "utf_encoding.h"
9#include "compiler.h"
10
11#include <fstream>
12#include <sstream>
13#include <iostream>
14#include <string>
15#include <stdint.h>
16
17#include <stdio.h>
18#include <stdlib.h>
19#include <unistd.h>
20#include <errno.h>
21#include <sys/types.h>
22#include <sys/stat.h>
23
24#include "include/simd-lib/bitblock.hpp"
25#include "include/simd-lib/carryQ.hpp"
26#include "include/simd-lib/pabloSupport.hpp"
27#include "include/simd-lib/s2p.hpp"
28#include "include/simd-lib/buffer.hpp"
29#include "include/simd-lib/bitblock_iterator.hpp"
30#include "include/simd-lib/transpose.hpp"
31
32#include "hrtime.h"
33
34// mmap system
35#include <sys/mman.h>
36#include <fcntl.h>
37
// Number of blocks handled per segment. SEGMENT_BLOCKS is only defined for
// BLOCK_SIZE values of 128 and 256; any other value leaves it undefined.
#if (BLOCK_SIZE == 128)
#define SEGMENT_BLOCKS 7
#endif

#if (BLOCK_SIZE == 256)
#define SEGMENT_BLOCKS 15
#endif

// Segment size, expressed in the same unit as BLOCK_SIZE.
#define SEGMENT_SIZE (BLOCK_SIZE * SEGMENT_BLOCKS)

#define BUFFER_SEGMENTS 15
#define BUFFER_SIZE (BUFFER_SEGMENTS * SEGMENT_SIZE)

// Declares a BitBlock variable called `name`.
#define BitBlock_declare(name)  BitBlock name

// Declares `ubitblock name[n]` and sets the `_128` member of every element to
// all-zero bits.  The expansion is a declaration followed by a statement, so
// it can only be used at block scope.  The loop counter has a macro-specific
// name and `n` is parenthesized in the loop bound so that an arbitrary
// expression (including one that mentions a caller variable `i`) is evaluated
// as the caller wrote it.
#define ubitblock_declare(name, n) \
  ubitblock name[n];\
  do {int ubitblock_declare_idx;\
      for (ubitblock_declare_idx = 0; ubitblock_declare_idx < (n); ubitblock_declare_idx++) name[ubitblock_declare_idx]._128 = simd<1>::constant<0>();\
     }\
  while (0)
59
60BitBlock EOF_mask = simd<1>::constant<1>();
61
62struct Output {
63    BitBlock matches;
64    BitBlock LF;
65};
66
67using namespace std;
68
69typedef void (*process_block_fcn)(const Basis_bits &basis_bits, BitBlock carry_q[], BitBlock advance_q[], Output &output);
70
71
72#define USE_MMAP
73#ifndef USE_MMAP
74void do_process(FILE *infile, FILE *outfile, int count_only_option, int carry_count, int advance_count, process_block_fcn process_block);
75#endif
76#ifdef USE_MMAP
77void do_process(char * infile_buffer, size_t infile_size, FILE *outfile, int count_only_option, int carry_count, int advance_count, process_block_fcn process_block);
78#endif
79
80
81BitBlock get_category(Basis_bits &basis_bits, const char* category);
82
83int main(int argc, char *argv[])
84{
85    char * inregex, * fileregex, * infilename, * outfilename;
86    FILE *infile, *outfile, *regexfile;
87
88#ifdef USE_MMAP
89    int fdSrc;
90    struct stat infile_sb;
91    char * infile_buffer;
92#endif
93
94    int opt_code;
95    bool count_only_option = 0;
96    bool print_version_option = 0;
97    bool regex_from_file_option = 0;
98    bool ascii_only_option = 0;
99    bool compile_time_option = 0;
100    bool enable_multiplexing = 0;
101    bool print_usage = 0;
102
103    unsigned long long cycles = 0;
104
105
106    long lSize = 0;
107
108    size_t result;
109
110    while ((opt_code = getopt(argc, argv, "cvftam")) != -1)
111    {
112        switch (opt_code)
113        {
114        case 'c':
115            count_only_option = 1;
116            break;
117        case 'v':
118            print_version_option = 1;
119            break;
120        case 'f':
121            regex_from_file_option = 1;
122            break;
123        case 't':
124            compile_time_option = 1;
125            break;
126        case 'a':
127            ascii_only_option = 1;
128            break;
129        case 'm':
130            enable_multiplexing = 1;
131            break;
132        case '?':
133            break;
134        default:
135            printf ("Invalid option: %c\n", opt_code);
136            print_usage = 1;
137        }
138    }
139
140    if (optind >= argc)
141    {
142        printf ("Too few arguments\n");
143        print_usage = 1;
144    }
145
146    if (print_usage) {
147        printf("Usage: %s [-a] [-c] [-f] [-m] [-t] [-v] <regex|regexfile> <inputfile> [<outputfile>]\n", argv[0]);
148        exit(-1);
149    }
150
151    inregex = argv[optind++];
152    if (inregex == 0)
153    {
154        fprintf(stderr, "Error: cannot read the regular expression.\n");
155        exit(-1);
156    }
157
158    if (regex_from_file_option)
159    {
160        regexfile = fopen(inregex, "rb");
161        if (!regexfile){
162            fprintf(stderr, "Error: cannot open %s for processing.\n", inregex);
163            exit(-1);
164        }
165
166        fseek (regexfile , 0 , SEEK_END);
167        lSize = ftell (regexfile);
168        rewind (regexfile);
169
170        fileregex = (char*) malloc (sizeof(char)*lSize);
171        if (fileregex == NULL) {fputs ("Memory error",stderr); exit (2);}
172
173        result = fread (fileregex, 1, lSize, regexfile);
174        if (result != lSize) {fputs ("Reading error",stderr); exit (3);}
175        fclose(regexfile);
176
177        if (fileregex[lSize - 1] == '\n') fileregex[lSize - 1] = '\0';
178    }
179
180    infilename = argv[optind++];
181#ifndef USE_MMAP
182    infile = fopen(infilename, "rb");
183    if (!infile) {
184        fprintf(stderr, "Error: cannot open %s for processing.\n", infilename);
185        exit(-1);
186    }
187#endif
188
189    if (optind >= argc) {
190        outfile = stdout;
191    }
192    else {
193        outfilename = argv[optind++];
194        if (optind != argc)
195        {
196            printf("Too many arguments\n");
197            printf("Usage: %s [-a] [-c] [-f] [-m] [-t] [-v] <regex|regexfile> <inputfile> [<outputfile>]\n", argv[0]);
198            exit(-1);
199        }
200        outfile = fopen(outfilename, "wb");
201        if (!outfile)
202        {
203            fprintf(stderr, "Error: cannot open %s for writing.\n", outfilename);
204            exit(-1);
205        }
206    }
207
208#ifdef USE_MMAP
209    fdSrc = open(infilename, O_RDONLY);
210    if (fdSrc == -1) {
211        fprintf(stderr, "Error: cannot open %s for processing.\n", infilename);
212        exit(-1);
213    }
214    if (fstat(fdSrc, &infile_sb) == -1) {
215        fprintf(stderr, "Error: cannot stat %s for processing.\n", infilename);
216        exit(-1);
217    }
218    if (infile_sb.st_size == 0) {
219        if (count_only_option) fprintf(outfile, "Matching Lines: %d\n", 0);
220        exit(0);
221    }
222    infile_buffer = (char *) mmap(NULL, infile_sb.st_size, PROT_READ, MAP_PRIVATE, fdSrc, 0);
223    if (infile_buffer == MAP_FAILED) {
224        fprintf(stderr, "Error: mmap of %s failure.\n", infilename);
225        exit(-1);
226    }
227#endif
228
229    if (print_version_option)
230    {
231        fprintf(outfile, "Parabix icgrep implementation: August 2014\n");
232    }
233
234    Encoding encoding(ascii_only_option ? Encoding::Type::ASCII : Encoding::Type::UTF_8, 8);
235    if (compile_time_option)
236    {
237        cycles = get_hrcycles();
238    }
239    const auto llvm_codegen = icgrep::compile(encoding, (regex_from_file_option ? fileregex : inregex), compile_time_option, enable_multiplexing);
240
241    if (compile_time_option)
242    {
243        cycles = get_hrcycles() - cycles;
244        std::cout << "Total compile time - cycles:       " << cycles << std::endl;
245    }
246
247    if (llvm_codegen.process_block_fptr != 0)
248    {
249        void (*FP)(const Basis_bits &basis_bits, BitBlock carry_q[], BitBlock advance_q[], Output &output) = (void (*)(const Basis_bits &basis_bits, BitBlock carry_q[], BitBlock advance_q[], Output &output))(void*)llvm_codegen.process_block_fptr;
250#ifndef USE_MMAP
251        do_process(infile, outfile, count_only_option, llvm_codegen.carry_q_size, llvm_codegen.advance_q_size, FP);
252#endif
253#ifdef USE_MMAP
254        do_process(infile_buffer, infile_sb.st_size, outfile, count_only_option, llvm_codegen.carry_q_size, llvm_codegen.advance_q_size, FP);
255#endif
256    }
257
258#ifndef USE_MMAP
259    fclose(infile);
260#endif
261#ifdef USE_MMAP
262    munmap((void *) infile_buffer, infile_sb.st_size);
263    close(fdSrc);
264#endif
265    fclose(outfile);
266    if (regex_from_file_option) free(fileregex);
267
268    return 0;
269}
270
// Segment scanner type: the scan word and index types are 32 bits wide for
// 128-bit blocks and 64 bits wide for 256-bit blocks.
#if (BLOCK_SIZE == 128)
using ScannerT = BitStreamScanner<BitBlock, uint32_t, uint32_t, SEGMENT_BLOCKS>;
#elif (BLOCK_SIZE == 256)
using ScannerT = BitStreamScanner<BitBlock, uint64_t, uint64_t, SEGMENT_BLOCKS>;
#endif
278
279//
280// Write matched lines from a buffer to an output file, given segment
281// scanners for line ends and matches (where matches are a subset of line ends).
282// The buffer pointer must point to the first byte of the segment
283// corresponding to the scanner indexes.   The first_line_start is the
284// start position of the first line relative to the buffer start position.
285// It must be zero or negative;  if negative, the buffer must permit negative
286// indexing so that the lineup to the buffer start position can also be printed.
287// The start position of the final line in the processed segment is returned.
288//
289
290ssize_t write_matches(FILE * outfile, ScannerT line_scanner, ScannerT match_scanner, char * buffer, ssize_t first_line_start) {
291
292  ssize_t line_start = first_line_start;
293  size_t match_pos;
294  size_t line_end;
295  while (match_scanner.has_next()) {
296    match_pos = match_scanner.scan_to_next();
297    // If we found a match, it must be at a line end.
298    line_end = line_scanner.scan_to_next();
299    while (line_end < match_pos) {
300      line_start = line_end + 1;
301      line_end = line_scanner.scan_to_next();
302    }
303    fwrite(&buffer[line_start], 1, line_end - line_start + 1, outfile);
304    line_start = line_end + 1;
305
306  }
307  while(line_scanner.has_next()) {
308    line_end = line_scanner.scan_to_next();
309    line_start = line_end+1;
310  }
311  return line_start;
312}
313
314
315
316#ifndef USE_MMAP
317void do_process(FILE *infile, FILE *outfile, int count_only_option, int carry_count, int advance_count, process_block_fcn process_block) {
318#endif
319#ifdef USE_MMAP
320void do_process(char * infile_buffer, size_t infile_size, FILE *outfile, int count_only_option, int carry_count, int advance_count, process_block_fcn process_block) {
321#endif
322
323    struct Basis_bits basis_bits;
324    struct Output output;
325    BitBlock match_vector;
326    BitBlock carry_q[carry_count];
327    BitBlock advance_q[advance_count];
328    int match_count=0;
329    int blk = 0;
330    int block_base  = 0;
331    int block_pos   = 0;
332    int buffer_pos  = 0;
333    int chars_avail = 0;
334    int chars_read  = 0;
335    int line_start = 0;
336    int line_end = 0;
337    int match_pos = 0;
338    int line_no = 0;
339
340    ScannerT LF_scanner;
341    ScannerT match_scanner;
342
343    match_vector = simd<1>::constant<0>();
344    memset (carry_q, 0, sizeof(BitBlock) * carry_count);
345    memset (advance_q, 0, sizeof(BitBlock) * advance_count);
346
347    char * buffer_ptr;
348#ifndef USE_MMAP
349    ATTRIBUTE_SIMD_ALIGN char src_buffer[SEGMENT_SIZE];
350    buffer_ptr = &src_buffer;
351    chars_read = fread((void *)&src_buffer[0], 1, SEGMENT_SIZE, infile);
352    chars_avail = chars_read;
353    if (chars_avail >= SEGMENT_SIZE) chars_avail = SEGMENT_SIZE;
354#endif
355#ifdef USE_MMAP
356    int segment = 0;
357    int segment_base = 0;
358    chars_avail = infile_size;
359
360#endif
361//////////////////////////////////////////////////////////////////////////////////////////
362// Full Segments
363//////////////////////////////////////////////////////////////////////////////////////////
364
365
366
367    while (chars_avail >= SEGMENT_SIZE) {
368
369#ifdef USE_MMAP
370        segment_base = segment * SEGMENT_SIZE;
371#endif
372        LF_scanner.init();
373        match_scanner.init();
374
375        for (blk = 0; blk < SEGMENT_BLOCKS; blk++) {
376#ifndef USE_MMAP
377            block_base = blk*BLOCK_SIZE;
378            s2p_do_block((BytePack *) &src_buffer[block_base], basis_bits);
379#endif
380#ifdef USE_MMAP
381            block_base = blk*BLOCK_SIZE + segment_base;
382            s2p_do_block((BytePack *) &infile_buffer[block_base], basis_bits);
383#endif
384            process_block(basis_bits, carry_q, advance_q, output);
385
386            LF_scanner.load_block(output.LF, blk);
387            match_scanner.load_block(output.matches, blk);
388            if (count_only_option){
389                if (bitblock::any(output.matches))
390                {
391                    if (bitblock::any(simd_and(match_vector, output.matches))){
392                        match_count += bitblock::popcount(match_vector);
393                        match_vector = output.matches;
394                    }
395                    else
396                    {
397                        match_vector = simd_or(match_vector, output.matches);
398                    }
399                }
400            }
401        }
402
403#ifndef USE_MMAP
404        int copy_back_pos = 0;
405
406
407        if (LF_scanner.count() > 0) {
408            copy_back_pos = LF_scanner.get_final_pos() + 1;
409            memset (carry_q, 0, sizeof(BitBlock) * carry_count);
410            memset (advance_q, 0, sizeof(BitBlock) * advance_count);
411        }
412        else {
413            copy_back_pos =  SEGMENT_SIZE;
414        }
415
416        int  copy_back_size = SEGMENT_SIZE - copy_back_pos;
417#endif
418#ifdef USE_MMAP
419    buffer_ptr = &infile_buffer[segment_base];
420#endif
421
422        if (!count_only_option) {
423          line_start = write_matches(outfile, LF_scanner, match_scanner, buffer_ptr, line_start);
424        }
425#ifndef USE_MMAP
426        memmove(&src_buffer[0], &src_buffer[copy_back_pos], copy_back_size);
427
428        //Do another read.
429        chars_read = fread(&src_buffer[copy_back_size], 1, copy_back_pos, infile);
430        chars_avail = chars_read + copy_back_size;
431        if (chars_avail >= SEGMENT_SIZE) chars_avail = SEGMENT_SIZE;
432        buffer_pos += chars_avail;
433#endif
434#ifdef USE_MMAP
435        segment++;
436        line_start -= SEGMENT_SIZE;  /* Will be negative offset for use within next segment. */
437        chars_avail -= SEGMENT_SIZE;
438#endif
439    }
440
441
442//////////////////////////////////////////////////////////////////////////////////////////
443// For the Final Partial Segment.
444//////////////////////////////////////////////////////////////////////////////////////////
445
446#ifdef USE_MMAP
447    segment_base = segment * SEGMENT_SIZE;
448#endif
449    int remaining = chars_avail;
450
451    LF_scanner.init();
452    match_scanner.init();
453
454    /* Full Blocks */
455    blk = 0;
456    while (remaining >= BLOCK_SIZE) {
457    //fprintf(outfile, "Remaining = %i\n", remaining);
458#ifndef USE_MMAP
459        block_base = block_pos;
460        s2p_do_block((BytePack *) &src_buffer[block_base], basis_bits);
461#endif
462#ifdef USE_MMAP
463        block_base = block_pos + segment_base;
464        s2p_do_block((BytePack *) &infile_buffer[block_base], basis_bits);
465#endif
466        process_block(basis_bits, carry_q, advance_q, output);
467
468        LF_scanner.load_block(output.LF, blk);
469        match_scanner.load_block(output.matches, blk);
470        if (count_only_option)
471        {
472            if (bitblock::any(output.matches))
473            {
474                if (bitblock::any(simd_and(match_vector, output.matches)))
475                {
476                    match_count += bitblock::popcount(match_vector);
477                    match_vector = output.matches;
478                }
479                else
480                {
481                    match_vector = simd_or(match_vector, output.matches);
482                }
483            }
484        }
485
486        block_pos += BLOCK_SIZE;
487        remaining -= BLOCK_SIZE;
488        blk++;
489    }
490    block_base = block_pos;
491    //fprintf(stderr, "Remaining = %i\n", remaining);
492
493    //For the last partial block, or for any carry.
494    EOF_mask = bitblock::srl(simd<1>::constant<1>(), convert(BLOCK_SIZE-remaining));
495#ifndef USE_MMAP
496     block_base = block_pos;
497     s2p_do_final_block((BytePack *) &src_buffer[block_base], basis_bits, EOF_mask);
498#endif
499#ifdef USE_MMAP
500     block_base = block_pos + segment_base;
501     s2p_do_final_block((BytePack *) &infile_buffer[block_base], basis_bits, EOF_mask);
502#endif
503    process_block(basis_bits, carry_q, advance_q, output);
504
505    if (count_only_option)
506    {
507        match_count += bitblock::popcount(match_vector);
508        if (bitblock::any(output.matches))
509        {
510            match_count += bitblock::popcount(output.matches);
511        }
512        fprintf(outfile, "Matching Lines:%d\n", match_count);
513    }
514    else
515    {
516        LF_scanner.load_block(output.LF, blk);
517        match_scanner.load_block(output.matches, blk);
518        blk++;
519        for (int i = blk; i < SEGMENT_BLOCKS; i++)
520        {
521            LF_scanner.load_block(simd<1>::constant<0>(), i);
522            match_scanner.load_block(simd<1>::constant<0>(), i);
523        }
524#ifndef USE_MMAP
525        line_start = 0;
526#endif
527#ifdef USE_MMAP
528        buffer_ptr = &infile_buffer[segment_base];
529#endif
530        line_start = write_matches(outfile, LF_scanner, match_scanner, buffer_ptr, line_start);
531    }
532
533    buffer_pos += chars_avail;
534}
535
536
537
Note: See TracBrowser for help on using the repository browser.