source: icGREP/icgrep-devel/icgrep/grep_engine.cpp @ 5707

Last change on this file since 5707 was 5707, checked in by cameron, 20 months ago

Fix for older compilers

File size: 18.6 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "grep_engine.h"
8#include "grep_interface.h"
9#include <llvm/IR/Module.h>
10#include <boost/filesystem.hpp>
11#include <UCD/resolve_properties.h>
12#include <kernels/charclasses.h>
13#include <kernels/cc_kernel.h>
14#include <kernels/grep_kernel.h>
15#include <kernels/linebreak_kernel.h>
16#include <kernels/streams_merge.h>
17#include <kernels/source_kernel.h>
18#include <kernels/s2p_kernel.h>
19#include <kernels/scanmatchgen.h>
20#include <kernels/streamset.h>
21#include <kernels/until_n.h>
22#include <kernels/kernel_builder.h>
23#include <pablo/pablo_kernel.h>
24#include <re/re_cc.h>
25#include <re/re_toolchain.h>
26#include <toolchain/toolchain.h>
27#include <re/re_name_resolve.h>   
28#include <re/re_collect_unicodesets.h>
29#include <re/re_multiplex.h>
30#include <toolchain/toolchain.h>
31#include <toolchain/cpudriver.h>
32#include <iostream>
33#include <cc/multiplex_CCs.h>
34#include <llvm/Support/raw_ostream.h>
35#include <util/aligned_allocator.h>
36#include <sys/stat.h>
37#include <fcntl.h>
38#include <errno.h>
39#include <llvm/ADT/STLExtras.h> // for make_unique
40#include <llvm/Support/CommandLine.h>
41
42using namespace parabix;
43using namespace llvm;
44static cl::opt<int> Threads("t", cl::desc("Total number of threads."), cl::init(2));
45
46namespace grep {
47
48// Grep Engine construction and initialization.
49   
50GrepEngine::GrepEngine() :
51    mGrepDriver(nullptr),
52    grepMatchFound(false),
53    fileCount(0),
54    mMoveMatchesToEOL(true) {}
55   
56GrepEngine::~GrepEngine() {
57    delete mGrepDriver;
58}
59   
60QuietModeEngine::QuietModeEngine() : GrepEngine() {
61    mMoveMatchesToEOL = false;
62}
63
64MatchOnlyEngine::MatchOnlyEngine(bool showFilesWithoutMatch) :
65    GrepEngine(), mRequiredCount(showFilesWithoutMatch) {
66    mFileSuffix = NullFlag ? std::string("\0", 1) : "\n";
67    mMoveMatchesToEOL = false;
68}
69
70CountOnlyEngine::CountOnlyEngine() : GrepEngine() {
71    mFileSuffix = ":";
72}
73
74EmitMatchesEngine::EmitMatchesEngine() : GrepEngine() {
75    mFileSuffix = InitialTabFlag ? "\t:" : ":";
76    if (LineRegexpFlag) mMoveMatchesToEOL = false;
77}
78
79void GrepEngine::initFileResult(std::vector<std::string> & filenames) {
80    const int n = filenames.size();
81    mResultStrs.resize(n);
82    for (unsigned i = 0; i < n; i++) {
83        mResultStrs[i] = make_unique<std::stringstream>();
84    }
85    inputFiles = filenames;
86}
87   
88
89// Code Generation
90//
91// All engines share a common pipeline to compute a stream of Matches from a given input Bytestream.
92
93std::pair<StreamSetBuffer *, StreamSetBuffer *> GrepEngine::grepPipeline(std::vector<re::RE *> & REs, StreamSetBuffer * ByteStream) {
94    auto & idb = mGrepDriver->getBuilder();
95    const unsigned segmentSize = codegen::SegmentSize;
96    const unsigned bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
97    const unsigned encodingBits = 8;
98   
99    StreamSetBuffer * BasisBits = mGrepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(encodingBits, 1), segmentSize * bufferSegments));
100    kernel::Kernel * s2pk = mGrepDriver->addKernelInstance(make_unique<kernel::S2PKernel>(idb));
101    mGrepDriver->makeKernelCall(s2pk, {ByteStream}, {BasisBits});
102   
103    StreamSetBuffer * LineBreakStream = mGrepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
104    kernel::Kernel * linebreakK = mGrepDriver->addKernelInstance(make_unique<kernel::LineBreakKernelBuilder>(idb, encodingBits));
105    mGrepDriver->makeKernelCall(linebreakK, {BasisBits}, {LineBreakStream});
106   
107    kernel::Kernel * requiredStreamsK = mGrepDriver->addKernelInstance(make_unique<kernel::RequiredStreams_UTF8>(idb));
108    StreamSetBuffer * RequiredStreams = mGrepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(4, 1), segmentSize * bufferSegments));
109    mGrepDriver->makeKernelCall(requiredStreamsK, {BasisBits}, {RequiredStreams});
110   
111    const auto n = REs.size();
112   
113    std::vector<std::vector<UCD::UnicodeSet>> charclasses(n);
114   
115    for (unsigned i = 0; i < n; i++) {
116        REs[i] = re::resolveNames(REs[i]);
117        std::vector<UCD::UnicodeSet> UnicodeSets = re::collect_UnicodeSets(REs[i]);
118        std::vector<std::vector<unsigned>> exclusiveSetIDs;
119        doMultiplexCCs(UnicodeSets, exclusiveSetIDs, charclasses[i]);
120        REs[i] = multiplex(REs[i], UnicodeSets, exclusiveSetIDs);
121    }
122   
123    std::vector<StreamSetBuffer *> MatchResultsBufs(n);
124   
125    for(unsigned i = 0; i < n; ++i){
126        const auto numOfCharacterClasses = charclasses[i].size();
127        StreamSetBuffer * CharClasses = mGrepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), segmentSize * bufferSegments));
128        kernel::Kernel * ccK = mGrepDriver->addKernelInstance(make_unique<kernel::CharClassesKernel>(idb, std::move(charclasses[i])));
129        mGrepDriver->makeKernelCall(ccK, {BasisBits}, {CharClasses});
130        StreamSetBuffer * MatchResults = mGrepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
131        kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance(make_unique<kernel::ICGrepKernel>(idb, REs[i], numOfCharacterClasses));
132        mGrepDriver->makeKernelCall(icgrepK, {CharClasses, LineBreakStream, RequiredStreams}, {MatchResults});
133        MatchResultsBufs[i] = MatchResults;
134    }
135    StreamSetBuffer * MergedResults = MatchResultsBufs[0];
136    if (REs.size() > 1) {
137        MergedResults = mGrepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
138        kernel::Kernel * streamsMergeK = mGrepDriver->addKernelInstance(make_unique<kernel::StreamsMerge>(idb, 1, REs.size()));
139        mGrepDriver->makeKernelCall(streamsMergeK, MatchResultsBufs, {MergedResults});
140    }
141    StreamSetBuffer * Matches = MergedResults;
142   
143    if (mMoveMatchesToEOL) {
144        StreamSetBuffer * OriginalMatches = Matches;
145        kernel::Kernel * matchedLinesK = mGrepDriver->addKernelInstance(make_unique<kernel::MatchedLinesKernel>(idb));
146        Matches = mGrepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
147        mGrepDriver->makeKernelCall(matchedLinesK, {OriginalMatches, LineBreakStream}, {Matches});
148    }
149   
150    if (InvertMatchFlag) {
151        kernel::Kernel * invertK = mGrepDriver->addKernelInstance(make_unique<kernel::InvertMatchesKernel>(idb));
152        StreamSetBuffer * OriginalMatches = Matches;
153        Matches = mGrepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
154        mGrepDriver->makeKernelCall(invertK, {OriginalMatches, LineBreakStream}, {Matches});
155    }
156    if (MaxCountFlag > 0) {
157        kernel::Kernel * untilK = mGrepDriver->addKernelInstance(make_unique<kernel::UntilNkernel>(idb));
158        untilK->setInitialArguments({idb->getSize(MaxCountFlag)});
159        StreamSetBuffer * AllMatches = Matches;
160        Matches = mGrepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
161        mGrepDriver->makeKernelCall(untilK, {AllMatches}, {Matches});
162    }
163    return std::pair<StreamSetBuffer *, StreamSetBuffer *>(LineBreakStream, Matches);
164}
165
166// The QuietMode, MatchOnly and CountOnly engines share a common code generation main function,
167// which returns a count of the matches found (possibly subject to a MaxCount).
168//
169
170void GrepEngine::grepCodeGen(std::vector<re::RE *> REs) {
171   
172    assert (mGrepDriver == nullptr);
173    mGrepDriver = new ParabixDriver("engine");
174    auto & idb = mGrepDriver->getBuilder();
175    Module * M = idb->getModule();
176   
177    const unsigned segmentSize = codegen::SegmentSize;
178    const unsigned encodingBits = 8;
179   
180    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getInt64Ty(), idb->getInt32Ty(), nullptr));
181    mainFunc->setCallingConv(CallingConv::C);
182    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
183    auto args = mainFunc->arg_begin();
184   
185    Value * const fileDescriptor = &*(args++);
186    fileDescriptor->setName("fileDescriptor");
187   
188    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer(make_unique<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits)));
189    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance(make_unique<kernel::FDSourceKernel>(idb, segmentSize));
190    sourceK->setInitialArguments({fileDescriptor});
191    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
192   
193    StreamSetBuffer * LineBreakStream;
194    StreamSetBuffer * Matches;
195    std::tie(LineBreakStream, Matches) = grepPipeline(REs, ByteStream);
196   
197    kernel::Kernel * matchCountK = mGrepDriver->addKernelInstance(make_unique<kernel::PopcountKernel>(idb));
198    mGrepDriver->makeKernelCall(matchCountK, {Matches}, {});
199    mGrepDriver->generatePipelineIR();
200    idb->setKernel(matchCountK);
201    Value * matchedLineCount = idb->getAccumulator("countResult");
202    matchedLineCount = idb->CreateZExt(matchedLineCount, idb->getInt64Ty());
203    mGrepDriver->deallocateBuffers();
204    idb->CreateRet(matchedLineCount);
205    mGrepDriver->finalizeObject();
206}
207
208//
209// The EmitMatches engine uses an EmitMatchesAccumulator object to concatenate together
210// matched lines.
211
212class EmitMatch : public MatchAccumulator {
213    friend class EmitMatchesEngine;
214public:
215    EmitMatch(std::string linePrefix, std::stringstream * strm) : mLinePrefix(linePrefix), mLineCount(0), mPrevious_line_end(nullptr), mResultStr(strm) {}
216    void accumulate_match(const size_t lineNum, char * line_start, char * line_end) override;
217    void finalize_match(char * buffer_end) override;
218protected:
219    std::string mLinePrefix;
220    size_t mLineCount;
221    char * mPrevious_line_end;
222    std::stringstream* mResultStr;
223};
224
225//
226//  Default Report Match:  lines are emitted with whatever line terminators are found in the
227//  input.  However, if the final line is not terminated, a new line is appended.
228void EmitMatch::accumulate_match (const size_t lineNum, char * line_start, char * line_end) {
229    if (!(WithFilenameFlag | LineNumberFlag) && (line_start == mPrevious_line_end + 1)) {
230        // Consecutive matches: only one write call needed.
231        mResultStr->write(mPrevious_line_end, line_end - mPrevious_line_end);
232    }
233    else {
234        if (mLineCount > 0) {
235            // deal with the final byte of the previous line.
236            mResultStr->write(mPrevious_line_end, 1);
237        }
238        if (WithFilenameFlag) {
239            *mResultStr << mLinePrefix;
240        }
241        if (LineNumberFlag) {
242            // Internally line numbers are counted from 0.  For display, adjust
243            // the line number so that lines are numbered from 1.
244            if (InitialTabFlag) {
245                *mResultStr << lineNum+1 << "\t:";
246            }
247            else {
248                *mResultStr << lineNum+1 << ":";
249            }
250        }
251        mResultStr->write(line_start, line_end - line_start);
252    }
253    mPrevious_line_end = line_end;
254    mLineCount++;
255}
256
257void EmitMatch::finalize_match(char * buffer_end) {
258    if (mLineCount == 0) return;  // No matches.
259    if (mPrevious_line_end < buffer_end) {
260        mResultStr->write(mPrevious_line_end, 1);
261    }
262    else {
263        // Likely unterminated final line.
264        char last_byte = mPrevious_line_end[-1];
265        if (last_byte == 0x0D) {
266            // The final CR is acceptable as a line_end.
267            return;
268        }
269        // Terminate the line with an LF
270        // (Even if we had an incomplete UTF-8 sequence.)
271        *mResultStr << "\n";
272    }
273}
274
275void EmitMatchesEngine::grepCodeGen(std::vector<re::RE *> REs) {
276    assert (mGrepDriver == nullptr);
277    mGrepDriver = new ParabixDriver("engine");
278    auto & idb = mGrepDriver->getBuilder();
279    Module * M = idb->getModule();
280   
281    const unsigned segmentSize = codegen::SegmentSize;
282    const unsigned encodingBits = 8;
283   
284    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getInt64Ty(), idb->getInt32Ty(), idb->getIntAddrTy(), nullptr));
285    mainFunc->setCallingConv(CallingConv::C);
286    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
287    auto args = mainFunc->arg_begin();
288   
289    Value * const fileDescriptor = &*(args++);
290    fileDescriptor->setName("fileDescriptor");
291    Value * match_accumulator = &*(args++);
292    match_accumulator->setName("match_accumulator");
293   
294    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer(make_unique<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits)));
295    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance(make_unique<kernel::FDSourceKernel>(idb, segmentSize));
296    sourceK->setInitialArguments({fileDescriptor});
297    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
298   
299    StreamSetBuffer * LineBreakStream;
300    StreamSetBuffer * Matches;
301    std::tie(LineBreakStream, Matches) = grepPipeline(REs, ByteStream);
302   
303    kernel::Kernel * scanMatchK = mGrepDriver->addKernelInstance(make_unique<kernel::ScanMatchKernel>(idb));
304    scanMatchK->setInitialArguments({match_accumulator});
305    mGrepDriver->makeKernelCall(scanMatchK, {Matches, LineBreakStream, ByteStream}, {});
306    mGrepDriver->LinkFunction(*scanMatchK, "accumulate_match_wrapper", &accumulate_match_wrapper);
307    mGrepDriver->LinkFunction(*scanMatchK, "finalize_match_wrapper", &finalize_match_wrapper);
308   
309    mGrepDriver->generatePipelineIR();
310    mGrepDriver->deallocateBuffers();
311    idb->CreateRet(idb->getInt64(0));
312    mGrepDriver->finalizeObject();
313}
314
315
316//
317//  The doGrep methods apply a GrepEngine to a single file, processing the results
318//  differently based on the engine type.
319   
320uint64_t GrepEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
321    typedef uint64_t (*GrepFunctionType)(int32_t fileDescriptor);
322    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
323   
324    int32_t fileDescriptor = openFile(fileName, mResultStrs[fileIdx].get());
325    if (fileDescriptor == -1) return 0;
326   
327    uint64_t grepResult = f(fileDescriptor);
328    close(fileDescriptor);
329    return grepResult;
330}
331
332uint64_t CountOnlyEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
333    uint64_t grepResult = GrepEngine::doGrep(fileName, fileIdx);
334    if (WithFilenameFlag) *mResultStrs[fileIdx] << linePrefix(fileName);
335    *mResultStrs[fileIdx] << grepResult << "\n";
336    return grepResult;
337}
338
339std::string GrepEngine::linePrefix(std::string fileName) {
340    if (fileName == "-") {
341        return LabelFlag + mFileSuffix;
342    }
343    else {
344        return fileName + mFileSuffix;
345    }
346}
347   
348uint64_t MatchOnlyEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
349    uint64_t grepResult = GrepEngine::doGrep(fileName, fileIdx);
350    if (grepResult == mRequiredCount) {
351       *mResultStrs[fileIdx] << linePrefix(fileName);
352    }
353    return grepResult;
354}
355
356uint64_t EmitMatchesEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
357    typedef uint64_t (*GrepFunctionType)(int32_t fileDescriptor, intptr_t accum_addr);
358    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
359   
360    int32_t fileDescriptor = openFile(fileName, mResultStrs[fileIdx].get());
361    if (fileDescriptor == -1) return 0;
362    EmitMatch accum(linePrefix(fileName), mResultStrs[fileIdx].get());
363    uint64_t grepResult = f(fileDescriptor, reinterpret_cast<intptr_t>(&accum));
364    close(fileDescriptor);
365    if (accum.mLineCount > 0) grepMatchFound = true;
366    return accum.mLineCount;
367}
368
369// Open a file and return its file desciptor.
370int32_t GrepEngine::openFile(const std::string & fileName, std::stringstream * msgstrm) {
371    if (fileName == "-") {
372        return STDIN_FILENO;
373    }
374    else {
375        struct stat sb;
376        int32_t fileDescriptor = open(fileName.c_str(), O_RDONLY);
377        if (LLVM_UNLIKELY(fileDescriptor == -1)) {
378            if (!NoMessagesFlag) {
379                if (errno == EACCES) {
380                    *msgstrm << "icgrep: " << fileName << ": Permission denied.\n";
381                }
382                else if (errno == ENOENT) {
383                    *msgstrm << "icgrep: " << fileName << ": No such file.\n";
384                }
385                else {
386                    *msgstrm << "icgrep: " << fileName << ": Failed.\n";
387                }
388            }
389            return fileDescriptor;
390        }
391        if (stat(fileName.c_str(), &sb) == 0 && S_ISDIR(sb.st_mode)) {
392            if (!NoMessagesFlag) {
393                *msgstrm << "icgrep: " << fileName << ": Is a directory.\n";
394            }
395            close(fileDescriptor);
396            return -1;
397        }
398        return fileDescriptor;
399    }
400}
401
402// The process of searching a group of files may use a sequential or a task
403// parallel approach.
404
405bool GrepEngine::searchAllFiles() {
406    if (Threads <= 1) {
407        for (unsigned i = 0; i != inputFiles.size(); ++i) {
408            size_t grepResult = doGrep(inputFiles[i], i);
409            if (grepResult > 0) {
410                grepMatchFound = true;
411                if (QuietMode) break;
412            }
413        }
414    } else if (Threads > 1) {
415        const unsigned numOfThreads = Threads; // <- convert the command line value into an integer to allow stack allocation
416        pthread_t threads[numOfThreads];
417       
418        for(unsigned long i = 0; i < numOfThreads; ++i) {
419            const int rc = pthread_create(&threads[i], nullptr, DoGrepThreadFunction, (void *)this);
420            if (rc) {
421                llvm::report_fatal_error("Failed to create thread: code " + std::to_string(rc));
422            }
423        }
424        for(unsigned i = 0; i < numOfThreads; ++i) {
425            void * status = nullptr;
426            const int rc = pthread_join(threads[i], &status);
427            if (rc) {
428                llvm::report_fatal_error("Failed to join thread: code " + std::to_string(rc));
429            }
430        }
431    }
432    return grepMatchFound;
433}
434
435
436// DoGrep thread function.
437void * GrepEngine::DoGrepThreadFunction(void *args) {
438    size_t fileIdx;
439    grep::GrepEngine * grepEngine = (grep::GrepEngine *)args;
440
441    grepEngine->count_mutex.lock();
442    fileIdx = grepEngine->fileCount;
443    grepEngine->fileCount++;
444    grepEngine->count_mutex.unlock();
445
446    while (fileIdx < grepEngine->inputFiles.size()) {
447        size_t grepResult = grepEngine->doGrep(grepEngine->inputFiles[fileIdx], fileIdx);
448       
449        grepEngine->count_mutex.lock();
450        if (grepResult > 0) grepEngine->grepMatchFound = true;
451        fileIdx = grepEngine->fileCount;
452        grepEngine->fileCount++;
453        grepEngine->count_mutex.unlock();
454        if (QuietMode && grepEngine->grepMatchFound) pthread_exit(nullptr);
455    }
456    pthread_exit(nullptr);
457}
458   
459void GrepEngine::writeMatches() {
460    for (unsigned i = 0; i < inputFiles.size(); ++i) {
461        std::cout << mResultStrs[i]->str();
462    }
463}
464
465}
466
Note: See TracBrowser for help on using the repository browser.