source: icGREP/icgrep-devel/icgrep/grep_engine.cpp @ 5770

Last change on this file since 5770 was 5770, checked in by cameron, 17 months ago

Restructure to eliminate unnecessary dependencies on RegExpCompiler and UCDLIB

File size: 18.7 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "grep_engine.h"
8#include "grep_interface.h"
9#include <llvm/IR/Module.h>
10#include <boost/filesystem.hpp>
11#include <UCD/resolve_properties.h>
12#include <kernels/charclasses.h>
13#include <kernels/cc_kernel.h>
14#include <kernels/grep_kernel.h>
15#include <kernels/linebreak_kernel.h>
16#include <kernels/streams_merge.h>
17#include <kernels/source_kernel.h>
18#include <kernels/s2p_kernel.h>
19#include <kernels/scanmatchgen.h>
20#include <kernels/streamset.h>
21#include <kernels/until_n.h>
22#include <kernels/kernel_builder.h>
23#include <pablo/pablo_kernel.h>
24#include <re/re_cc.h>
25#include <re/casing.h>
26#include <re/re_toolchain.h>
27#include <toolchain/toolchain.h>
28#include <re/re_name_resolve.h>
29#include <re/re_collect_unicodesets.h>
30#include <re/re_multiplex.h>
31#include <toolchain/toolchain.h>
32#include <toolchain/cpudriver.h>
33#include <iostream>
34#include <cc/multiplex_CCs.h>
35#include <llvm/Support/raw_ostream.h>
36#include <util/aligned_allocator.h>
37#include <sys/stat.h>
38#include <fcntl.h>
39#include <errno.h>
40#include <llvm/ADT/STLExtras.h> // for make_unique
41#include <llvm/Support/CommandLine.h>
42#include <llvm/Support/Debug.h>
43#include <sched.h>
44
45using namespace parabix;
46using namespace llvm;
47static cl::opt<int> Threads("t", cl::desc("Total number of threads."), cl::init(2));
48
49namespace grep {
50
51// Grep Engine construction and initialization.
52
53GrepEngine::GrepEngine() :
54    mGrepDriver(nullptr),
55    mNextFileToGrep(0),
56    mNextFileToPrint(0),
57    grepMatchFound(false),
58    mMoveMatchesToEOL(true),
59    mEngineThread(pthread_self()) {}
60
61GrepEngine::~GrepEngine() {
62    delete mGrepDriver;
63}
64
65QuietModeEngine::QuietModeEngine() : GrepEngine() {
66    mMoveMatchesToEOL = false;
67}
68
69MatchOnlyEngine::MatchOnlyEngine(bool showFilesWithoutMatch) :
70    GrepEngine(), mRequiredCount(showFilesWithoutMatch) {
71    mFileSuffix = NullFlag ? std::string("\0", 1) : "\n";
72    mMoveMatchesToEOL = false;
73}
74
75CountOnlyEngine::CountOnlyEngine() : GrepEngine() {
76    mFileSuffix = ":";
77}
78
79EmitMatchesEngine::EmitMatchesEngine() : GrepEngine() {
80    mFileSuffix = InitialTabFlag ? "\t:" : ":";
81    if (LineRegexpFlag) mMoveMatchesToEOL = false;
82}
83
84void GrepEngine::initFileResult(std::vector<std::string> & filenames) {
85    const unsigned n = filenames.size();
86    mResultStrs.resize(n);
87    mFileStatus.resize(n);
88    for (unsigned i = 0; i < n; i++) {
89        mResultStrs[i] = make_unique<std::stringstream>();
90        mFileStatus[i] = FileStatus::Pending;
91    }
92    inputFiles = filenames;
93}
94
95// Code Generation
96//
97// All engines share a common pipeline to compute a stream of Matches from a given input Bytestream.
98
99std::pair<StreamSetBuffer *, StreamSetBuffer *> GrepEngine::grepPipeline(std::vector<re::RE *> & REs, StreamSetBuffer * ByteStream) {
100    auto & idb = mGrepDriver->getBuilder();
101    const unsigned segmentSize = codegen::SegmentSize;
102    const unsigned bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
103    const unsigned encodingBits = 8;
104
105    StreamSetBuffer * BasisBits = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(encodingBits, 1), segmentSize * bufferSegments);
106    kernel::Kernel * s2pk = mGrepDriver->addKernelInstance<kernel::S2PKernel>(idb);
107    mGrepDriver->makeKernelCall(s2pk, {ByteStream}, {BasisBits});
108
109    StreamSetBuffer * LineBreakStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
110    kernel::Kernel * linebreakK = mGrepDriver->addKernelInstance<kernel::LineBreakKernelBuilder>(idb, encodingBits);
111    mGrepDriver->makeKernelCall(linebreakK, {BasisBits}, {LineBreakStream});
112
113    kernel::Kernel * requiredStreamsK = mGrepDriver->addKernelInstance<kernel::RequiredStreams_UTF8>(idb);
114    StreamSetBuffer * RequiredStreams = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(4, 1), segmentSize * bufferSegments);
115    mGrepDriver->makeKernelCall(requiredStreamsK, {BasisBits}, {RequiredStreams});
116
117    const auto n = REs.size();
118    std::vector<std::vector<re::CC *>> charclasses(n);
119    for (unsigned i = 0; i < n; i++) {
120        REs[i] = resolveCaseInsensitiveMode(REs[i], grep::IgnoreCaseFlag);
121        REs[i] = re::resolveNames(REs[i]);
122        const auto UnicodeSets = re::collectUnicodeSets(REs[i]);
123        std::vector<std::vector<unsigned>> exclusiveSetIDs;
124        doMultiplexCCs(UnicodeSets, exclusiveSetIDs, charclasses[i]);
125        REs[i] = multiplex(REs[i], UnicodeSets, exclusiveSetIDs);
126        REs[i] = regular_expression_passes(REs[i]);
127  }
128
129    std::vector<StreamSetBuffer *> MatchResultsBufs(n);
130
131    for(unsigned i = 0; i < n; ++i){
132        const auto numOfCharacterClasses = charclasses[i].size();
133        StreamSetBuffer * CharClasses = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), segmentSize * bufferSegments);
134        kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(charclasses[i]));
135        mGrepDriver->makeKernelCall(ccK, {BasisBits}, {CharClasses});
136        StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
137        kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i], numOfCharacterClasses);
138        mGrepDriver->makeKernelCall(icgrepK, {CharClasses, LineBreakStream, RequiredStreams}, {MatchResults});
139        MatchResultsBufs[i] = MatchResults;
140    }
141    StreamSetBuffer * MergedResults = MatchResultsBufs[0];
142    if (REs.size() > 1) {
143        MergedResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
144        kernel::Kernel * streamsMergeK = mGrepDriver->addKernelInstance<kernel::StreamsMerge>(idb, 1, REs.size());
145        mGrepDriver->makeKernelCall(streamsMergeK, MatchResultsBufs, {MergedResults});
146    }
147    StreamSetBuffer * Matches = MergedResults;
148
149    if (mMoveMatchesToEOL) {
150        StreamSetBuffer * OriginalMatches = Matches;
151        kernel::Kernel * matchedLinesK = mGrepDriver->addKernelInstance<kernel::MatchedLinesKernel>(idb);
152        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
153        mGrepDriver->makeKernelCall(matchedLinesK, {OriginalMatches, LineBreakStream}, {Matches});
154    }
155
156    if (InvertMatchFlag) {
157        kernel::Kernel * invertK = mGrepDriver->addKernelInstance<kernel::InvertMatchesKernel>(idb);
158        StreamSetBuffer * OriginalMatches = Matches;
159        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
160        mGrepDriver->makeKernelCall(invertK, {OriginalMatches, LineBreakStream}, {Matches});
161    }
162    if (MaxCountFlag > 0) {
163        kernel::Kernel * untilK = mGrepDriver->addKernelInstance<kernel::UntilNkernel>(idb);
164        untilK->setInitialArguments({idb->getSize(MaxCountFlag)});
165        StreamSetBuffer * AllMatches = Matches;
166        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
167        mGrepDriver->makeKernelCall(untilK, {AllMatches}, {Matches});
168    }
169    return std::pair<StreamSetBuffer *, StreamSetBuffer *>(LineBreakStream, Matches);
170}
171
172// The QuietMode, MatchOnly and CountOnly engines share a common code generation main function,
173// which returns a count of the matches found (possibly subject to a MaxCount).
174//
175
176void GrepEngine::grepCodeGen(std::vector<re::RE *> REs) {
177
178    assert (mGrepDriver == nullptr);
179    mGrepDriver = new ParabixDriver("engine");
180    auto & idb = mGrepDriver->getBuilder();
181    Module * M = idb->getModule();
182
183    const auto segmentSize = codegen::SegmentSize;
184    const auto bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
185
186    const unsigned encodingBits = 8;
187
188    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getInt64Ty(), idb->getInt32Ty(), nullptr));
189    mainFunc->setCallingConv(CallingConv::C);
190    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
191    auto args = mainFunc->arg_begin();
192
193    Value * const fileDescriptor = &*(args++);
194    fileDescriptor->setName("fileDescriptor");
195
196    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits));
197    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance<kernel::FDSourceKernel>(idb, segmentSize * bufferSegments);
198    sourceK->setInitialArguments({fileDescriptor});
199    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
200
201    StreamSetBuffer * LineBreakStream;
202    StreamSetBuffer * Matches;
203    std::tie(LineBreakStream, Matches) = grepPipeline(REs, ByteStream);
204
205    kernel::Kernel * matchCountK = mGrepDriver->addKernelInstance<kernel::PopcountKernel>(idb);
206    mGrepDriver->makeKernelCall(matchCountK, {Matches}, {});
207    mGrepDriver->generatePipelineIR();
208    idb->setKernel(matchCountK);
209    Value * matchedLineCount = idb->getAccumulator("countResult");
210    matchedLineCount = idb->CreateZExt(matchedLineCount, idb->getInt64Ty());
211    mGrepDriver->deallocateBuffers();
212    idb->CreateRet(matchedLineCount);
213    mGrepDriver->finalizeObject();
214}
215
216//
// The EmitMatches engine uses an EmitMatch accumulator object to concatenate together
// matched lines.
219
220class EmitMatch : public MatchAccumulator {
221    friend class EmitMatchesEngine;
222public:
223    EmitMatch(std::string linePrefix, std::stringstream * strm) : mLinePrefix(linePrefix), mLineCount(0), mTerminated(true), mResultStr(strm) {}
224    void accumulate_match(const size_t lineNum, char * line_start, char * line_end) override;
225    void finalize_match(char * buffer_end) override;
226protected:
227    std::string mLinePrefix;
228    size_t mLineCount;
229    bool mTerminated;
230    std::stringstream* mResultStr;
231};
232
233//
234//  Default Report Match:  lines are emitted with whatever line terminators are found in the
235//  input.  However, if the final line is not terminated, a new line is appended.
236//
237void EmitMatch::accumulate_match (const size_t lineNum, char * line_start, char * line_end) {
238    if (WithFilenameFlag) {
239        *mResultStr << mLinePrefix;
240    }
241    if (LineNumberFlag) {
242        // Internally line numbers are counted from 0.  For display, adjust
243        // the line number so that lines are numbered from 1.
244        if (InitialTabFlag) {
245            *mResultStr << lineNum+1 << "\t:";
246        }
247        else {
248            *mResultStr << lineNum+1 << ":";
249        }
250    }
251    size_t bytes = line_end - line_start + 1;
252    mResultStr->write(line_start, bytes);
253    mLineCount++;
254    unsigned last_byte = *line_end;
255    mTerminated = (last_byte >= 0x0A) && (last_byte <= 0x0D);
256    if (LLVM_UNLIKELY(!mTerminated)) {
257        if (last_byte == 0x85) {  //  Possible NEL terminator.
258            mTerminated = (bytes >= 2) && (static_cast<unsigned>(line_end[-1]) == 0xC2);
259        }
260        else {
261            // Possible LS or PS terminators.
262            mTerminated = (bytes >= 3) && (static_cast<unsigned>(line_end[-2]) == 0xE2)
263                                       && (static_cast<unsigned>(line_end[-1]) == 0x80)
264                                       && ((last_byte == 0xA8) || (last_byte == 0xA9));
265        }
266    }
267}
268
269void EmitMatch::finalize_match(char * buffer_end) {
270    if (!mTerminated) *mResultStr << "\n";
271}
272
273void EmitMatchesEngine::grepCodeGen(std::vector<re::RE *> REs) {
274    assert (mGrepDriver == nullptr);
275    mGrepDriver = new ParabixDriver("engine");
276    auto & idb = mGrepDriver->getBuilder();
277    Module * M = idb->getModule();
278
279    const auto segmentSize = codegen::SegmentSize;
280    const auto bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
281    const unsigned encodingBits = 8;
282
283    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getInt64Ty(), idb->getInt32Ty(), idb->getIntAddrTy(), nullptr));
284    mainFunc->setCallingConv(CallingConv::C);
285    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
286    auto args = mainFunc->arg_begin();
287
288    Value * const fileDescriptor = &*(args++);
289    fileDescriptor->setName("fileDescriptor");
290    Value * match_accumulator = &*(args++);
291    match_accumulator->setName("match_accumulator");
292
293    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits));
294    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance<kernel::FDSourceKernel>(idb, segmentSize * bufferSegments);
295    sourceK->setInitialArguments({fileDescriptor});
296    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
297
298    StreamSetBuffer * LineBreakStream;
299    StreamSetBuffer * Matches;
300    std::tie(LineBreakStream, Matches) = grepPipeline(REs, ByteStream);
301
302    kernel::Kernel * scanMatchK = mGrepDriver->addKernelInstance<kernel::ScanMatchKernel>(idb);
303    scanMatchK->setInitialArguments({match_accumulator});
304    mGrepDriver->makeKernelCall(scanMatchK, {Matches, LineBreakStream, ByteStream}, {});
305    mGrepDriver->LinkFunction(*scanMatchK, "accumulate_match_wrapper", &accumulate_match_wrapper);
306    mGrepDriver->LinkFunction(*scanMatchK, "finalize_match_wrapper", &finalize_match_wrapper);
307
308    mGrepDriver->generatePipelineIR();
309    mGrepDriver->deallocateBuffers();
310    idb->CreateRet(idb->getInt64(0));
311    mGrepDriver->finalizeObject();
312}
313
314
315//
316//  The doGrep methods apply a GrepEngine to a single file, processing the results
317//  differently based on the engine type.
318
319uint64_t GrepEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
320    typedef uint64_t (*GrepFunctionType)(int32_t fileDescriptor);
321    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
322
323    int32_t fileDescriptor = openFile(fileName, mResultStrs[fileIdx].get());
324    if (fileDescriptor == -1) return 0;
325
326    uint64_t grepResult = f(fileDescriptor);
327    close(fileDescriptor);
328    return grepResult;
329}
330
331uint64_t CountOnlyEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
332    uint64_t grepResult = GrepEngine::doGrep(fileName, fileIdx);
333    if (WithFilenameFlag) *mResultStrs[fileIdx] << linePrefix(fileName);
334    *mResultStrs[fileIdx] << grepResult << "\n";
335    return grepResult;
336}
337
338std::string GrepEngine::linePrefix(std::string fileName) {
339    if (fileName == "-") {
340        return LabelFlag + mFileSuffix;
341    }
342    else {
343        return fileName + mFileSuffix;
344    }
345}
346
347uint64_t MatchOnlyEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
348    uint64_t grepResult = GrepEngine::doGrep(fileName, fileIdx);
349    if (grepResult == mRequiredCount) {
350       *mResultStrs[fileIdx] << linePrefix(fileName);
351    }
352    return grepResult;
353}
354
355uint64_t EmitMatchesEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
356    typedef uint64_t (*GrepFunctionType)(int32_t fileDescriptor, intptr_t accum_addr);
357    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
358
359    int32_t fileDescriptor = openFile(fileName, mResultStrs[fileIdx].get());
360    if (fileDescriptor == -1) return 0;
361    EmitMatch accum(linePrefix(fileName), mResultStrs[fileIdx].get());
362    f(fileDescriptor, reinterpret_cast<intptr_t>(&accum));
363    close(fileDescriptor);
364    if (accum.mLineCount > 0) grepMatchFound = true;
365    return accum.mLineCount;
366}
367
368// Open a file and return its file desciptor.
369int32_t GrepEngine::openFile(const std::string & fileName, std::stringstream * msgstrm) {
370    if (fileName == "-") {
371        return STDIN_FILENO;
372    }
373    else {
374        struct stat sb;
375        int32_t fileDescriptor = open(fileName.c_str(), O_RDONLY);
376        if (LLVM_UNLIKELY(fileDescriptor == -1)) {
377            if (!NoMessagesFlag) {
378                if (errno == EACCES) {
379                    *msgstrm << "icgrep: " << fileName << ": Permission denied.\n";
380                }
381                else if (errno == ENOENT) {
382                    *msgstrm << "icgrep: " << fileName << ": No such file.\n";
383                }
384                else {
385                    *msgstrm << "icgrep: " << fileName << ": Failed.\n";
386                }
387            }
388            return fileDescriptor;
389        }
390        if (stat(fileName.c_str(), &sb) == 0 && S_ISDIR(sb.st_mode)) {
391            if (!NoMessagesFlag) {
392                *msgstrm << "icgrep: " << fileName << ": Is a directory.\n";
393            }
394            close(fileDescriptor);
395            return -1;
396        }
397        return fileDescriptor;
398    }
399}
400
401// The process of searching a group of files may use a sequential or a task
402// parallel approach.
403
404void * DoGrepThreadFunction(void *args) {
405    return reinterpret_cast<GrepEngine *>(args)->DoGrepThreadMethod();
406}
407
408bool GrepEngine::searchAllFiles() {
409    const unsigned numOfThreads = Threads; // <- convert the command line value into an integer to allow stack allocation
410    pthread_t threads[numOfThreads];
411
412    for(unsigned long i = 1; i < numOfThreads; ++i) {
413        const int rc = pthread_create(&threads[i], nullptr, DoGrepThreadFunction, (void *)this);
414        if (rc) {
415            llvm::report_fatal_error("Failed to create thread: code " + std::to_string(rc));
416        }
417    }
418    // Main thread also does the work;
419
420    DoGrepThreadMethod();
421    for(unsigned i = 1; i < numOfThreads; ++i) {
422        void * status = nullptr;
423        const int rc = pthread_join(threads[i], &status);
424        if (rc) {
425            llvm::report_fatal_error("Failed to join thread: code " + std::to_string(rc));
426        }
427    }
428    return grepMatchFound;
429}
430
431
432// DoGrep thread function.
433void * GrepEngine::DoGrepThreadMethod() {
434
435    auto fileIdx = mNextFileToGrep++;
436    while (fileIdx < inputFiles.size()) {
437        const size_t grepResult = doGrep(inputFiles[fileIdx], fileIdx);
438        mFileStatus[fileIdx] = FileStatus::GrepComplete;
439        if (grepResult > 0) {
440            grepMatchFound = true;
441        }
442        if (QuietMode && grepMatchFound) {
443            if (pthread_self() != mEngineThread) {
444                pthread_exit(nullptr);
445            }
446            return nullptr;
447        }
448        fileIdx = mNextFileToGrep++;
449    }
450
451    auto printIdx = mNextFileToPrint++;
452    while (printIdx < inputFiles.size()) {
453        const bool readyToPrint = ((printIdx == 0) || (mFileStatus[printIdx - 1] == FileStatus::PrintComplete)) && (mFileStatus[printIdx] == FileStatus::GrepComplete);
454        if (readyToPrint) {
455            const auto output = mResultStrs[printIdx]->str();
456            if (!output.empty()) {
457                mWriteMutex.lock();
458                std::cout << output;
459                mWriteMutex.unlock();
460            }
461            mFileStatus[printIdx] = FileStatus::PrintComplete;
462            printIdx = mNextFileToPrint++;
463        } else {
464            mCacheMutex.lock();
465            mGrepDriver->performIncrementalCacheCleanupStep();
466            mCacheMutex.unlock();
467        }
468        sched_yield();
469    }
470
471    if (pthread_self() != mEngineThread) {
472        pthread_exit(nullptr);
473    } else {
474        return nullptr;
475    }
476}
477
478}
Note: See TracBrowser for help on using the repository browser.