source: icGREP/icgrep-devel/icgrep/grep_engine.cpp @ 5703

Last change on this file since 5703 was 5703, checked in by cameron, 2 years ago

Refactoring, clean-up, set initial threads to 2

File size: 18.1 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "grep_engine.h"
8#include "grep_interface.h"
9#include <llvm/IR/Module.h>
10#include <boost/filesystem.hpp>
11#include <UCD/resolve_properties.h>
12#include <kernels/charclasses.h>
13#include <kernels/cc_kernel.h>
14#include <kernels/grep_kernel.h>
15#include <kernels/linebreak_kernel.h>
16#include <kernels/streams_merge.h>
17#include <kernels/source_kernel.h>
18#include <kernels/s2p_kernel.h>
19#include <kernels/scanmatchgen.h>
20#include <kernels/streamset.h>
21#include <kernels/until_n.h>
22#include <kernels/kernel_builder.h>
23#include <pablo/pablo_kernel.h>
24#include <re/re_cc.h>
25#include <re/re_toolchain.h>
26#include <toolchain/toolchain.h>
27#include <re/re_name_resolve.h>   
28#include <re/re_collect_unicodesets.h>
29#include <re/re_multiplex.h>
30#include <toolchain/toolchain.h>
31#include <toolchain/cpudriver.h>
32#include <iostream>
33#include <cc/multiplex_CCs.h>
34#include <llvm/Support/raw_ostream.h>
35#include <util/aligned_allocator.h>
36#include <sys/stat.h>
37#include <fcntl.h>
38#include <errno.h>
39#include <llvm/ADT/STLExtras.h> // for make_unique
40#include <llvm/Support/CommandLine.h>
41
42using namespace parabix;
43using namespace llvm;
44static cl::opt<int> Threads("t", cl::desc("Total number of threads."), cl::init(2));
45
46namespace grep {
47
48
49// DoGrep thread function.
50void *GrepEngine::DoGrepThreadFunction(void *args) {
51    size_t fileIdx;
52    grep::GrepEngine * grepEngine = (grep::GrepEngine *)args;
53
54    grepEngine->count_mutex.lock();
55    fileIdx = grepEngine->fileCount;
56    grepEngine->fileCount++;
57    grepEngine->count_mutex.unlock();
58
59    while (fileIdx < grepEngine->inputFiles.size()) {
60        size_t grepResult = grepEngine->doGrep(grepEngine->inputFiles[fileIdx], fileIdx);
61       
62        grepEngine->count_mutex.lock();
63        if (grepResult > 0) grepEngine->grepMatchFound = true;
64        fileIdx = grepEngine->fileCount;
65        grepEngine->fileCount++;
66        grepEngine->count_mutex.unlock();
67        if (QuietMode && grepEngine->grepMatchFound) pthread_exit(nullptr);
68    }
69    pthread_exit(nullptr);
70}
71   
72bool GrepEngine::searchAllFiles() {
73   
74    if (Threads <= 1) {
75        for (unsigned i = 0; i != inputFiles.size(); ++i) {
76            size_t grepResult = doGrep(inputFiles[i], i);
77            if (grepResult > 0) {
78                grepMatchFound = true;
79                if (QuietMode) break;
80            }
81        }
82    } else if (Threads > 1) {
83        const unsigned numOfThreads = Threads; // <- convert the command line value into an integer to allow stack allocation
84        pthread_t threads[numOfThreads];
85       
86        for(unsigned long i = 0; i < numOfThreads; ++i){
87            const int rc = pthread_create(&threads[i], nullptr, DoGrepThreadFunction, (void *)this);
88            if (rc) {
89                llvm::report_fatal_error("Failed to create thread: code " + std::to_string(rc));
90            }
91        }
92        for(unsigned i = 0; i < numOfThreads; ++i) {
93            void * status = nullptr;
94            const int rc = pthread_join(threads[i], &status);
95            if (rc) {
96                llvm::report_fatal_error("Failed to join thread: code " + std::to_string(rc));
97            }
98        }
99    }
100    return grepMatchFound;
101}
102       
103//
104//  Default Report Match:  lines are emitted with whatever line terminators are found in the
105//  input.  However, if the final line is not terminated, a new line is appended.
106
107class EmitMatch : public MatchAccumulator {
108    friend class EmitMatchesEngine;
109public:
110    EmitMatch(std::string linePrefix, std::stringstream * strm) : mLinePrefix(linePrefix), mLineCount(0), mPrevious_line_end(nullptr), mResultStr(strm) {}
111    void accumulate_match(const size_t lineNum, char * line_start, char * line_end) override;
112    void finalize_match(char * buffer_end) override;
113protected:
114    std::string mLinePrefix;
115    size_t mLineCount;
116    char * mPrevious_line_end;
117    std::stringstream* mResultStr;
118};
119
120void EmitMatch::accumulate_match (const size_t lineNum, char * line_start, char * line_end) {
121    if (!(WithFilenameFlag | LineNumberFlag) && (line_start == mPrevious_line_end + 1)) {
122        // Consecutive matches: only one write call needed.
123        mResultStr->write(mPrevious_line_end, line_end - mPrevious_line_end);
124    }
125    else {
126        if (mLineCount > 0) {
127            // deal with the final byte of the previous line.
128            mResultStr->write(mPrevious_line_end, 1);
129        }
130        if (WithFilenameFlag) {
131            *mResultStr << mLinePrefix;
132        }
133        if (LineNumberFlag) {
134            // Internally line numbers are counted from 0.  For display, adjust
135            // the line number so that lines are numbered from 1.
136            if (InitialTabFlag) {
137                *mResultStr << lineNum+1 << "\t:";
138            }
139            else {
140                *mResultStr << lineNum+1 << ":";
141            }
142        }
143        mResultStr->write(line_start, line_end - line_start);
144    }
145    mPrevious_line_end = line_end;
146    mLineCount++;
147}
148
149void EmitMatch::finalize_match(char * buffer_end) {
150    if (mLineCount == 0) return;  // No matches.
151    if (mPrevious_line_end < buffer_end) {
152        mResultStr->write(mPrevious_line_end, 1);
153    }
154    else {
155        // Likely unterminated final line.
156        char last_byte = mPrevious_line_end[-1];
157        if (last_byte == 0x0D) {
158            // The final CR is acceptable as a line_end.
159            return;
160        }
161        // Terminate the line with an LF
162        // (Even if we had an incomplete UTF-8 sequence.)
163        *mResultStr << "\n";
164    }
165}
166
167
168
169bool matchesNeedToBeMovedToEOL() {
170    if ((Mode == QuietMode) | (Mode == FilesWithMatch) | (Mode == FilesWithoutMatch)) {
171        return false;
172    }
173    else if (LineRegexpFlag) {
174        return false;
175    }
176    // TODO: return false for other cases based on regexp analysis, e.g., regexp ends with $.
177    return true;
178}
179
180// Open a file and return its file desciptor.
181int32_t GrepEngine::openFile(const std::string & fileName, std::stringstream & msgstrm) {
182    if (fileName == "-") {
183        return STDIN_FILENO;
184    }
185    else {
186        struct stat sb;
187        int32_t fileDescriptor = open(fileName.c_str(), O_RDONLY);
188        if (LLVM_UNLIKELY(fileDescriptor == -1)) {
189            if (!NoMessagesFlag) {
190                if (errno == EACCES) {
191                    msgstrm << "icgrep: " << fileName << ": Permission denied.\n";
192                }
193                else if (errno == ENOENT) {
194                    msgstrm << "icgrep: " << fileName << ": No such file.\n";
195                }
196                else {
197                    msgstrm << "icgrep: " << fileName << ": Failed.\n";
198                }
199            }
200            return fileDescriptor;
201        }
202        if (stat(fileName.c_str(), &sb) == 0 && S_ISDIR(sb.st_mode)) {
203            if (!NoMessagesFlag) {
204                msgstrm << "icgrep: " << fileName << ": Is a directory.\n";
205            }
206            close(fileDescriptor);
207            return -1; 
208        }
209        return fileDescriptor;
210    }
211}
212
213std::string GrepEngine::linePrefix(std::string fileName) {
214    if (fileName == "-") {
215        return LabelFlag + mFileSuffix;
216    }
217    else {
218        return fileName + mFileSuffix;
219    }
220}
221
222uint64_t EmitMatchesEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
223    int32_t fileDescriptor = openFile(fileName, mResultStrs[fileIdx]);
224   
225    if (fileDescriptor == -1) return 0;
226   
227    EmitMatch accum(linePrefix(fileName), &mResultStrs[fileIdx]);
228   
229    typedef uint64_t (*GrepFunctionType)(int32_t fileDescriptor, intptr_t accum_addr);
230    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
231   
232    uint64_t grepResult = f(fileDescriptor, reinterpret_cast<intptr_t>(&accum));
233    close(fileDescriptor);
234    if (accum.mLineCount > 0) grepMatchFound = true;
235    return grepResult;
236}
237
238uint64_t CountOnlyEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
239    int32_t fileDescriptor = openFile(fileName, mResultStrs[fileIdx]);
240    if (fileDescriptor == -1) return 0;
241   
242    typedef uint64_t (*GrepFunctionType)(int32_t fileDescriptor);
243    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
244   
245    uint64_t grepResult = f(fileDescriptor);
246    close(fileDescriptor);
247   
248    if (WithFilenameFlag) mResultStrs[fileIdx] << linePrefix(fileName);
249    mResultStrs[fileIdx] << grepResult << "\n";
250    return grepResult;
251}
252
253uint64_t MatchOnlyEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
254    int32_t fileDescriptor = openFile(fileName, mResultStrs[fileIdx]);
255    if (fileDescriptor == -1) return 0;
256   
257    typedef uint64_t (*GrepFunctionType)(int32_t fileDescriptor);
258    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
259   
260    uint64_t grepResult = f(fileDescriptor);
261    close(fileDescriptor);
262   
263    if (QuietMode) {
264        if (grepResult > 0) exit(MatchFoundExitCode);
265    }
266    else {
267        if (grepResult == mRequiredCount) {
268            mResultStrs[fileIdx] << linePrefix(fileName);
269        }
270    }
271    return grepResult;
272}
273
274void GrepEngine::initFileResult(std::vector<std::string> & filenames){
275    const int n = filenames.size();
276    mResultStrs.resize(n);
277    inputFiles = filenames;
278}
279
280std::pair<StreamSetBuffer *, StreamSetBuffer *> grepPipeline(Driver * grepDriver, std::vector<re::RE *> & REs, StreamSetBuffer * ByteStream) {
281    auto & idb = grepDriver->getBuilder();
282    const unsigned segmentSize = codegen::SegmentSize;
283    const unsigned bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
284    const unsigned encodingBits = 8;
285
286    StreamSetBuffer * BasisBits = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(encodingBits, 1), segmentSize * bufferSegments));
287    kernel::Kernel * s2pk = grepDriver->addKernelInstance(make_unique<kernel::S2PKernel>(idb));
288    grepDriver->makeKernelCall(s2pk, {ByteStream}, {BasisBits});
289   
290    StreamSetBuffer * LineBreakStream = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
291    kernel::Kernel * linebreakK = grepDriver->addKernelInstance(make_unique<kernel::LineBreakKernelBuilder>(idb, encodingBits));
292    grepDriver->makeKernelCall(linebreakK, {BasisBits}, {LineBreakStream});
293   
294    kernel::Kernel * requiredStreamsK = grepDriver->addKernelInstance(make_unique<kernel::RequiredStreams_UTF8>(idb));
295    StreamSetBuffer * RequiredStreams = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(4, 1), segmentSize * bufferSegments));
296    grepDriver->makeKernelCall(requiredStreamsK, {BasisBits}, {RequiredStreams});
297   
298    const auto n = REs.size();
299   
300    std::vector<std::vector<UCD::UnicodeSet>> charclasses(n);
301
302    for (unsigned i = 0; i < n; i++) {
303        REs[i] = re::resolveNames(REs[i]);
304        std::vector<UCD::UnicodeSet> UnicodeSets = re::collect_UnicodeSets(REs[i]);
305        std::vector<std::vector<unsigned>> exclusiveSetIDs;
306        doMultiplexCCs(UnicodeSets, exclusiveSetIDs, charclasses[i]);
307        REs[i] = multiplex(REs[i], UnicodeSets, exclusiveSetIDs);
308    } 
309
310    std::vector<StreamSetBuffer *> MatchResultsBufs(n);
311
312    for(unsigned i = 0; i < n; ++i){
313        const auto numOfCharacterClasses = charclasses[i].size();
314        StreamSetBuffer * CharClasses = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), segmentSize * bufferSegments));
315        kernel::Kernel * ccK = grepDriver->addKernelInstance(make_unique<kernel::CharClassesKernel>(idb, std::move(charclasses[i])));
316        grepDriver->makeKernelCall(ccK, {BasisBits}, {CharClasses});
317        StreamSetBuffer * MatchResults = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
318        kernel::Kernel * icgrepK = grepDriver->addKernelInstance(make_unique<kernel::ICGrepKernel>(idb, REs[i], numOfCharacterClasses));
319        grepDriver->makeKernelCall(icgrepK, {CharClasses, LineBreakStream, RequiredStreams}, {MatchResults});
320        MatchResultsBufs[i] = MatchResults;
321    }
322    StreamSetBuffer * MergedResults = MatchResultsBufs[0];
323    if (REs.size() > 1) {
324        MergedResults = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
325        kernel::Kernel * streamsMergeK = grepDriver->addKernelInstance(make_unique<kernel::StreamsMerge>(idb, 1, REs.size()));
326        grepDriver->makeKernelCall(streamsMergeK, MatchResultsBufs, {MergedResults});
327    }
328    StreamSetBuffer * Matches = MergedResults;
329   
330    if (matchesNeedToBeMovedToEOL()) {
331        StreamSetBuffer * OriginalMatches = Matches;
332        kernel::Kernel * matchedLinesK = grepDriver->addKernelInstance(make_unique<kernel::MatchedLinesKernel>(idb));
333        Matches = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
334        grepDriver->makeKernelCall(matchedLinesK, {OriginalMatches, LineBreakStream}, {Matches});
335    }
336   
337    if (InvertMatchFlag) {
338        kernel::Kernel * invertK = grepDriver->addKernelInstance(make_unique<kernel::InvertMatchesKernel>(idb));
339        StreamSetBuffer * OriginalMatches = Matches;
340        Matches = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
341        grepDriver->makeKernelCall(invertK, {OriginalMatches, LineBreakStream}, {Matches});
342    }
343    if (MaxCountFlag > 0) {
344        kernel::Kernel * untilK = grepDriver->addKernelInstance(make_unique<kernel::UntilNkernel>(idb));
345        untilK->setInitialArguments({idb->getSize(MaxCountFlag)});
346        StreamSetBuffer * AllMatches = Matches;
347        Matches = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
348        grepDriver->makeKernelCall(untilK, {AllMatches}, {Matches});
349    }
350    return std::pair<StreamSetBuffer *, StreamSetBuffer *>(LineBreakStream, Matches);
351}
352
353void EmitMatchesEngine::grepCodeGen(std::vector<re::RE *> REs) {
354    assert (mGrepDriver == nullptr);
355    mGrepDriver = new ParabixDriver("engine");
356    auto & idb = mGrepDriver->getBuilder();
357    Module * M = idb->getModule();
358   
359    const unsigned segmentSize = codegen::SegmentSize;
360    const unsigned encodingBits = 8;
361   
362    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getInt64Ty(), idb->getInt32Ty(), idb->getIntAddrTy(), nullptr));
363    mainFunc->setCallingConv(CallingConv::C);
364    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
365    auto args = mainFunc->arg_begin();
366   
367    Value * const fileDescriptor = &*(args++);
368    fileDescriptor->setName("fileDescriptor");
369    Value * match_accumulator = &*(args++);
370    match_accumulator->setName("match_accumulator");
371   
372    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer(make_unique<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits)));
373    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance(make_unique<kernel::FDSourceKernel>(idb, segmentSize));
374    sourceK->setInitialArguments({fileDescriptor});
375    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
376   
377    StreamSetBuffer * LineBreakStream;
378    StreamSetBuffer * Matches;
379    std::tie(LineBreakStream, Matches) = grepPipeline(mGrepDriver, REs, ByteStream);
380   
381    kernel::Kernel * scanMatchK = mGrepDriver->addKernelInstance(make_unique<kernel::ScanMatchKernel>(idb));
382    scanMatchK->setInitialArguments({match_accumulator});
383    mGrepDriver->makeKernelCall(scanMatchK, {Matches, LineBreakStream, ByteStream}, {});
384    mGrepDriver->LinkFunction(*scanMatchK, "accumulate_match_wrapper", &accumulate_match_wrapper);
385    mGrepDriver->LinkFunction(*scanMatchK, "finalize_match_wrapper", &finalize_match_wrapper);
386   
387    mGrepDriver->generatePipelineIR();
388    mGrepDriver->deallocateBuffers();
389    idb->CreateRet(idb->getInt64(0));
390    mGrepDriver->finalizeObject();
391}
392
393void GrepEngine::grepCodeGen(std::vector<re::RE *> REs) {
394   
395    assert (mGrepDriver == nullptr);
396    mGrepDriver = new ParabixDriver("engine");
397    auto & idb = mGrepDriver->getBuilder();
398    Module * M = idb->getModule();
399   
400    const unsigned segmentSize = codegen::SegmentSize;
401    const unsigned encodingBits = 8;
402   
403    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getInt64Ty(), idb->getInt32Ty(), nullptr));
404    mainFunc->setCallingConv(CallingConv::C);
405    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
406    auto args = mainFunc->arg_begin();
407   
408    Value * const fileDescriptor = &*(args++);
409    fileDescriptor->setName("fileDescriptor");
410   
411    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer(make_unique<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits)));
412    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance(make_unique<kernel::FDSourceKernel>(idb, segmentSize));
413    sourceK->setInitialArguments({fileDescriptor});
414    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
415   
416    StreamSetBuffer * LineBreakStream;
417    StreamSetBuffer * Matches;
418    std::tie(LineBreakStream, Matches) = grepPipeline(mGrepDriver, REs, ByteStream);
419   
420    kernel::Kernel * matchCountK = mGrepDriver->addKernelInstance(make_unique<kernel::PopcountKernel>(idb));
421    mGrepDriver->makeKernelCall(matchCountK, {Matches}, {});
422    mGrepDriver->generatePipelineIR();
423    idb->setKernel(matchCountK);
424    Value * matchedLineCount = idb->getAccumulator("countResult");
425    matchedLineCount = idb->CreateZExt(matchedLineCount, idb->getInt64Ty());
426    mGrepDriver->deallocateBuffers();
427    idb->CreateRet(matchedLineCount);
428    mGrepDriver->finalizeObject();
429}
430
431void GrepEngine::writeMatches(){
432    for (unsigned i = 0; i < inputFiles.size(); ++i){
433        std::cout << mResultStrs[i].str();
434    }
435}
436
437GrepEngine::GrepEngine() :
438    mGrepDriver(nullptr),
439    grepMatchFound(false),
440    fileCount(0) {}
441   
442GrepEngine::~GrepEngine() {
443    delete mGrepDriver;
444}
445
446EmitMatchesEngine::EmitMatchesEngine() : GrepEngine()
447    {mFileSuffix = InitialTabFlag ? "\t:" : ":";}
448   
449CountOnlyEngine::CountOnlyEngine() :
450    GrepEngine() {mFileSuffix = ":";}
451
452MatchOnlyEngine::MatchOnlyEngine(bool showFilesWithoutMatch) :
453    GrepEngine(), mRequiredCount(showFilesWithoutMatch)
454    {mFileSuffix = NullFlag ? std::string("\0", 1) : "\n";}
455}
Note: See TracBrowser for help on using the repository browser.