source: icGREP/icgrep-devel/icgrep/grep_engine.cpp @ 5700

Last change on this file since 5700 was 5700, checked in by cameron, 22 months ago

Refactoring of grep_engine based on mode

File size: 17.9 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "grep_engine.h"
8#include "grep_interface.h"
9#include <llvm/IR/Module.h>
10#include <boost/filesystem.hpp>
11#include <UCD/resolve_properties.h>
12#include <kernels/charclasses.h>
13#include <kernels/cc_kernel.h>
14#include <kernels/grep_kernel.h>
15#include <kernels/linebreak_kernel.h>
16#include <kernels/streams_merge.h>
17#include <kernels/source_kernel.h>
18#include <kernels/s2p_kernel.h>
19#include <kernels/scanmatchgen.h>
20#include <kernels/streamset.h>
21#include <kernels/until_n.h>
22#include <kernels/kernel_builder.h>
23#include <pablo/pablo_kernel.h>
24#include <re/re_cc.h>
25#include <re/re_toolchain.h>
26#include <toolchain/toolchain.h>
27#include <re/re_name_resolve.h>   
28#include <re/re_collect_unicodesets.h>
29#include <re/re_multiplex.h>
30#include <toolchain/toolchain.h>
31#include <toolchain/cpudriver.h>
32#include <iostream>
33#include <cc/multiplex_CCs.h>
34#include <llvm/Support/raw_ostream.h>
35#include <util/aligned_allocator.h>
36#include <sys/stat.h>
37#include <fcntl.h>
38#include <errno.h>
39#include <llvm/ADT/STLExtras.h> // for make_unique
40#include <llvm/Support/CommandLine.h>
41
42using namespace parabix;
43using namespace llvm;
44static cl::opt<int> Threads("t", cl::desc("Total number of threads."), cl::init(1));
45
46
47namespace grep {
48
49
50// DoGrep thread function.
51void *GrepEngine::DoGrepThreadFunction(void *args) {
52    size_t fileIdx;
53    grep::GrepEngine * grepEngine = (grep::GrepEngine *)args;
54
55    grepEngine->count_mutex.lock();
56    fileIdx = grepEngine->fileCount;
57    grepEngine->fileCount++;
58    grepEngine->count_mutex.unlock();
59
60    while (fileIdx < grepEngine->inputFiles.size()) {
61        size_t grepResult = grepEngine->doGrep(grepEngine->inputFiles[fileIdx], fileIdx);
62       
63        grepEngine->count_mutex.lock();
64        if (grepResult > 0) grepEngine->grepMatchFound = true;
65        fileIdx = grepEngine->fileCount;
66        grepEngine->fileCount++;
67        grepEngine->count_mutex.unlock();
68        if (QuietMode && grepEngine->grepMatchFound) pthread_exit(nullptr);
69    }
70    pthread_exit(nullptr);
71}
72   
73void GrepEngine::run() {
74   
75    if (Threads <= 1) {
76        for (unsigned i = 0; i != inputFiles.size(); ++i) {
77            size_t grepResult = doGrep(inputFiles[i], i);
78            if (grepResult > 0) grepMatchFound = true;
79        }
80    } else if (Threads > 1) {
81        const unsigned numOfThreads = Threads; // <- convert the command line value into an integer to allow stack allocation
82        pthread_t threads[numOfThreads];
83       
84        for(unsigned long i = 0; i < numOfThreads; ++i){
85            const int rc = pthread_create(&threads[i], nullptr, DoGrepThreadFunction, (void *)this);
86            if (rc) {
87                llvm::report_fatal_error("Failed to create thread: code " + std::to_string(rc));
88            }
89        }
90        for(unsigned i = 0; i < numOfThreads; ++i) {
91            void * status = nullptr;
92            const int rc = pthread_join(threads[i], &status);
93            if (rc) {
94                llvm::report_fatal_error("Failed to join thread: code " + std::to_string(rc));
95            }
96        }
97    }
98}
99       
100//
101//  Default Report Match:  lines are emitted with whatever line terminators are found in the
102//  input.  However, if the final line is not terminated, a new line is appended.
103
104
105class EmitMatch : public MatchAccumulator {
106public:
107    EmitMatch(std::string linePrefix, std::stringstream * strm) : mLinePrefix(linePrefix), mLineCount(0), mPrevious_line_end(nullptr), mResultStr(strm) {}
108    void accumulate_match(const size_t lineNum, char * line_start, char * line_end) override;
109    void finalize_match(char * buffer_end) override;
110    std::string mLinePrefix;
111    size_t mLineCount;
112    char * mPrevious_line_end;
113    std::stringstream* mResultStr;
114   
115};
116
117
118void EmitMatch::accumulate_match (const size_t lineNum, char * line_start, char * line_end) {
119    if (!(WithFilenameFlag | LineNumberFlag) && (line_start == mPrevious_line_end + 1)) {
120        // Consecutive matches: only one write call needed.
121        mResultStr->write(mPrevious_line_end, line_end - mPrevious_line_end);
122    }
123    else {
124        if (mLineCount > 0) {
125            // deal with the final byte of the previous line.
126            mResultStr->write(mPrevious_line_end, 1);
127        }
128        if (WithFilenameFlag) {
129            *mResultStr << mLinePrefix;
130        }
131        if (LineNumberFlag) {
132            // Internally line numbers are counted from 0.  For display, adjust
133            // the line number so that lines are numbered from 1.
134            if (InitialTabFlag) {
135                *mResultStr << lineNum+1 << "\t:";
136            }
137            else {
138                *mResultStr << lineNum+1 << ":";
139            }
140        }
141        mResultStr->write(line_start, line_end - line_start);
142    }
143    mPrevious_line_end = line_end;
144    mLineCount++;
145}
146
147void EmitMatch::finalize_match(char * buffer_end) {
148    if (mLineCount == 0) return;  // No matches.
149    if (mPrevious_line_end < buffer_end) {
150        mResultStr->write(mPrevious_line_end, 1);
151    }
152    else {
153        // Likely unterminated final line.
154        char last_byte = mPrevious_line_end[-1];
155        if (last_byte == 0x0D) {
156            // The final CR is acceptable as a line_end.
157            return;
158        }
159        // Terminate the line with an LF
160        // (Even if we had an incomplete UTF-8 sequence.)
161        *mResultStr << "\n";
162    }
163}
164
165
166
167bool matchesNeedToBeMovedToEOL() {
168    if ((Mode == QuietMode) | (Mode == FilesWithMatch) | (Mode == FilesWithoutMatch)) {
169        return false;
170    }
171    else if (LineRegexpFlag) {
172        return false;
173    }
174    // TODO: return false for other cases based on regexp analysis, e.g., regexp ends with $.
175    return true;
176}
177
178   
179int32_t openFile(const std::string & fileName, std::stringstream & msgstrm) {
180    if (fileName == "-") {
181        return STDIN_FILENO;
182    }
183    else {
184        struct stat sb;
185        int32_t fileDescriptor = open(fileName.c_str(), O_RDONLY);
186        if (LLVM_UNLIKELY(fileDescriptor == -1)) {
187            if (!NoMessagesFlag) {
188                if (errno == EACCES) {
189                    msgstrm << "icgrep: " << fileName << ": Permission denied.\n";
190                }
191                else if (errno == ENOENT) {
192                    msgstrm << "icgrep: " << fileName << ": No such file.\n";
193                }
194                else {
195                    msgstrm << "icgrep: " << fileName << ": Failed.\n";
196                }
197            }
198            return fileDescriptor;
199        }
200        if (stat(fileName.c_str(), &sb) == 0 && S_ISDIR(sb.st_mode)) {
201            if (!NoMessagesFlag) {
202                msgstrm << "icgrep: " << fileName << ": Is a directory.\n";
203            }
204            close(fileDescriptor);
205            return -1; 
206        }
207        return fileDescriptor;
208    }
209}
210
211std::string GrepEngine::linePrefix(std::string fileName) {
212    if (fileName == "-") {
213        return LabelFlag + mFileSuffix;
214    }
215    else {
216        return fileName + mFileSuffix;
217    }
218}
219
220uint64_t GrepEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
221    int32_t fileDescriptor = openFile(fileName, mResultStrs[fileIdx]);
222   
223    if (fileDescriptor == -1) return 0;
224   
225    EmitMatch accum(linePrefix(fileName), &mResultStrs[fileIdx]);
226   
227    typedef uint64_t (*GrepFunctionType)(int32_t fileDescriptor, intptr_t accum_addr);
228    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
229   
230    uint64_t grepResult = f(fileDescriptor, reinterpret_cast<intptr_t>(&accum));
231    close(fileDescriptor);
232    if (accum.mLineCount > 0) grepMatchFound = true;
233    return grepResult;
234}
235
236uint64_t CountOnlyGrepEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
237    int32_t fileDescriptor = openFile(fileName, mResultStrs[fileIdx]);
238   
239    if (fileDescriptor == -1) return 0;
240   
241    typedef uint64_t (*GrepFunctionType)(int32_t fileDescriptor);
242    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
243   
244    uint64_t grepResult = f(fileDescriptor);
245    close(fileDescriptor);
246   
247    if (WithFilenameFlag) mResultStrs[fileIdx] << linePrefix(fileName);
248    mResultStrs[fileIdx] << grepResult << "\n";
249    return grepResult;
250}
251
252uint64_t MatchOnlyGrepEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
253    int32_t fileDescriptor = openFile(fileName, mResultStrs[fileIdx]);
254   
255    if (fileDescriptor == -1) return 0;
256   
257    typedef uint64_t (*GrepFunctionType)(int32_t fileDescriptor);
258    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
259   
260    uint64_t grepResult = f(fileDescriptor);
261    close(fileDescriptor);
262   
263    if (QuietMode) {
264        if (grepResult > 0) exit(MatchFoundExitCode);
265    }
266    else {
267        if (grepResult == mRequiredCount) {
268            mResultStrs[fileIdx] << linePrefix(fileName);
269        }
270    }
271    return grepResult;
272}
273
274void GrepEngine::initFileResult(std::vector<std::string> filenames){
275    grepMatchFound = false;
276    const int n = filenames.size();
277    mResultStrs.resize(n);
278    inputFiles = filenames;
279}
280
281   
282void GrepEngine::PrintResults(){
283   
284    for (unsigned i = 0; i < inputFiles.size(); ++i){
285        std::cout << mResultStrs[i].str();
286    }
287    exit(grepMatchFound ? MatchFoundExitCode : MatchNotFoundExitCode);
288}
289
290   
291   
292   
293std::pair<StreamSetBuffer *, StreamSetBuffer *> grepPipeline(Driver * grepDriver, std::vector<re::RE *> & REs, StreamSetBuffer * ByteStream) {
294    auto & idb = grepDriver->getBuilder();
295    const unsigned segmentSize = codegen::SegmentSize;
296    const unsigned bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
297    const unsigned encodingBits = 8;
298
299    StreamSetBuffer * BasisBits = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(encodingBits, 1), segmentSize * bufferSegments));
300    kernel::Kernel * s2pk = grepDriver->addKernelInstance(make_unique<kernel::S2PKernel>(idb));
301    grepDriver->makeKernelCall(s2pk, {ByteStream}, {BasisBits});
302   
303    StreamSetBuffer * LineBreakStream = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
304    kernel::Kernel * linebreakK = grepDriver->addKernelInstance(make_unique<kernel::LineBreakKernelBuilder>(idb, encodingBits));
305    grepDriver->makeKernelCall(linebreakK, {BasisBits}, {LineBreakStream});
306   
307    kernel::Kernel * requiredStreamsK = grepDriver->addKernelInstance(make_unique<kernel::RequiredStreams_UTF8>(idb));
308    StreamSetBuffer * RequiredStreams = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(4, 1), segmentSize * bufferSegments));
309    grepDriver->makeKernelCall(requiredStreamsK, {BasisBits}, {RequiredStreams});
310   
311    const auto n = REs.size();
312   
313    std::vector<std::vector<UCD::UnicodeSet>> charclasses(n);
314
315    for (unsigned i = 0; i < n; i++) {
316        REs[i] = re::resolveNames(REs[i]);
317        std::vector<UCD::UnicodeSet> UnicodeSets = re::collect_UnicodeSets(REs[i]);
318        std::vector<std::vector<unsigned>> exclusiveSetIDs;
319        doMultiplexCCs(UnicodeSets, exclusiveSetIDs, charclasses[i]);
320        REs[i] = multiplex(REs[i], UnicodeSets, exclusiveSetIDs);
321    } 
322
323    std::vector<StreamSetBuffer *> MatchResultsBufs(n);
324
325    for(unsigned i = 0; i < n; ++i){
326        const auto numOfCharacterClasses = charclasses[i].size();
327        StreamSetBuffer * CharClasses = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), segmentSize * bufferSegments));
328        kernel::Kernel * ccK = grepDriver->addKernelInstance(make_unique<kernel::CharClassesKernel>(idb, std::move(charclasses[i])));
329        grepDriver->makeKernelCall(ccK, {BasisBits}, {CharClasses});
330        StreamSetBuffer * MatchResults = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
331        kernel::Kernel * icgrepK = grepDriver->addKernelInstance(make_unique<kernel::ICGrepKernel>(idb, REs[i], numOfCharacterClasses));
332        grepDriver->makeKernelCall(icgrepK, {CharClasses, LineBreakStream, RequiredStreams}, {MatchResults});
333        MatchResultsBufs[i] = MatchResults;
334    }
335    StreamSetBuffer * MergedResults = MatchResultsBufs[0];
336    if (REs.size() > 1) {
337        MergedResults = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
338        kernel::Kernel * streamsMergeK = grepDriver->addKernelInstance(make_unique<kernel::StreamsMerge>(idb, 1, REs.size()));
339        grepDriver->makeKernelCall(streamsMergeK, MatchResultsBufs, {MergedResults});
340    }
341    StreamSetBuffer * Matches = MergedResults;
342   
343    if (matchesNeedToBeMovedToEOL()) {
344        StreamSetBuffer * OriginalMatches = Matches;
345        kernel::Kernel * matchedLinesK = grepDriver->addKernelInstance(make_unique<kernel::MatchedLinesKernel>(idb));
346        Matches = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
347        grepDriver->makeKernelCall(matchedLinesK, {OriginalMatches, LineBreakStream}, {Matches});
348    }
349   
350    if (InvertMatchFlag) {
351        kernel::Kernel * invertK = grepDriver->addKernelInstance(make_unique<kernel::InvertMatchesKernel>(idb));
352        StreamSetBuffer * OriginalMatches = Matches;
353        Matches = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
354        grepDriver->makeKernelCall(invertK, {OriginalMatches, LineBreakStream}, {Matches});
355    }
356    if (MaxCountFlag > 0) {
357        kernel::Kernel * untilK = grepDriver->addKernelInstance(make_unique<kernel::UntilNkernel>(idb));
358        untilK->setInitialArguments({idb->getSize(MaxCountFlag)});
359        StreamSetBuffer * AllMatches = Matches;
360        Matches = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
361        grepDriver->makeKernelCall(untilK, {AllMatches}, {Matches});
362    }
363    return std::pair<StreamSetBuffer *, StreamSetBuffer *>(LineBreakStream, Matches);
364}
365
366    void GrepEngine::grepCodeGen(std::vector<re::RE *> REs) {
367       
368        assert (mGrepDriver == nullptr);
369        mGrepDriver = new ParabixDriver("engine");
370        auto & idb = mGrepDriver->getBuilder();
371        Module * M = idb->getModule();
372       
373        const unsigned segmentSize = codegen::SegmentSize;
374        const unsigned encodingBits = 8;
375       
376        Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getInt64Ty(), idb->getInt32Ty(), idb->getIntAddrTy(), nullptr));
377        mainFunc->setCallingConv(CallingConv::C);
378        idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
379        auto args = mainFunc->arg_begin();
380       
381        Value * const fileDescriptor = &*(args++);
382        fileDescriptor->setName("fileDescriptor");
383        Value * match_accumulator = &*(args++);
384        match_accumulator->setName("match_accumulator");
385       
386        StreamSetBuffer * ByteStream = mGrepDriver->addBuffer(make_unique<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits)));
387        kernel::Kernel * sourceK = mGrepDriver->addKernelInstance(make_unique<kernel::FDSourceKernel>(idb, segmentSize));
388        sourceK->setInitialArguments({fileDescriptor});
389        mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
390       
391        StreamSetBuffer * LineBreakStream;
392        StreamSetBuffer * Matches;
393        std::tie(LineBreakStream, Matches) = grepPipeline(mGrepDriver, REs, ByteStream);
394       
395        kernel::Kernel * scanMatchK = mGrepDriver->addKernelInstance(make_unique<kernel::ScanMatchKernel>(idb));
396        scanMatchK->setInitialArguments({match_accumulator});
397        mGrepDriver->makeKernelCall(scanMatchK, {Matches, LineBreakStream, ByteStream}, {});
398        mGrepDriver->LinkFunction(*scanMatchK, "accumulate_match_wrapper", &accumulate_match_wrapper);
399        mGrepDriver->LinkFunction(*scanMatchK, "finalize_match_wrapper", &finalize_match_wrapper);
400       
401       
402        mGrepDriver->generatePipelineIR();
403        mGrepDriver->deallocateBuffers();
404       
405        idb->CreateRet(idb->getInt64(0));
406        mGrepDriver->finalizeObject();
407    }
408   
409    void CountOnlyGrepEngine::grepCodeGen(std::vector<re::RE *> REs) {
410       
411        assert (mGrepDriver == nullptr);
412        mGrepDriver = new ParabixDriver("engine");
413        auto & idb = mGrepDriver->getBuilder();
414        Module * M = idb->getModule();
415       
416        const unsigned segmentSize = codegen::SegmentSize;
417        const unsigned encodingBits = 8;
418       
419        Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getInt64Ty(), idb->getInt32Ty(), nullptr));
420        mainFunc->setCallingConv(CallingConv::C);
421        idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
422        auto args = mainFunc->arg_begin();
423       
424        Value * const fileDescriptor = &*(args++);
425        fileDescriptor->setName("fileDescriptor");
426       
427        StreamSetBuffer * ByteStream = mGrepDriver->addBuffer(make_unique<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits)));
428        kernel::Kernel * sourceK = mGrepDriver->addKernelInstance(make_unique<kernel::FDSourceKernel>(idb, segmentSize));
429        sourceK->setInitialArguments({fileDescriptor});
430        mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
431       
432        StreamSetBuffer * LineBreakStream;
433        StreamSetBuffer * Matches;
434        std::tie(LineBreakStream, Matches) = grepPipeline(mGrepDriver, REs, ByteStream);
435       
436        kernel::Kernel * matchCountK = mGrepDriver->addKernelInstance(make_unique<kernel::PopcountKernel>(idb));
437        mGrepDriver->makeKernelCall(matchCountK, {Matches}, {});
438        mGrepDriver->generatePipelineIR();
439        idb->setKernel(matchCountK);
440        Value * matchedLineCount = idb->getAccumulator("countResult");
441        matchedLineCount = idb->CreateZExt(matchedLineCount, idb->getInt64Ty());
442        mGrepDriver->deallocateBuffers();
443        idb->CreateRet(matchedLineCount);
444        mGrepDriver->finalizeObject();
445    }
446   
447
448GrepEngine::~GrepEngine() {
449    delete mGrepDriver;
450}
451
452}
Note: See TracBrowser for help on using the repository browser.