source: icGREP/icgrep-devel/icgrep/grep_engine.cpp @ 5772

Last change on this file since 5772 was 5772, checked in by cameron, 14 months ago

resolveGraphemeMode

File size: 18.5 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "grep_engine.h"
8#include "grep_interface.h"
9#include <llvm/IR/Module.h>
10#include <boost/filesystem.hpp>
11#include <UCD/resolve_properties.h>
12#include <kernels/charclasses.h>
13#include <kernels/cc_kernel.h>
14#include <kernels/grep_kernel.h>
15#include <kernels/linebreak_kernel.h>
16#include <kernels/streams_merge.h>
17#include <kernels/source_kernel.h>
18#include <kernels/s2p_kernel.h>
19#include <kernels/scanmatchgen.h>
20#include <kernels/streamset.h>
21#include <kernels/until_n.h>
22#include <kernels/kernel_builder.h>
23#include <pablo/pablo_kernel.h>
24#include <re/re_cc.h>
25#include <re/casing.h>
26#include <re/re_toolchain.h>
27#include <toolchain/toolchain.h>
28#include <re/re_name_resolve.h>
29#include <re/re_collect_unicodesets.h>
30#include <re/re_multiplex.h>
31#include <re/grapheme_clusters.h>
32#include <toolchain/toolchain.h>
33#include <toolchain/cpudriver.h>
34#include <iostream>
35#include <cc/multiplex_CCs.h>
36#include <llvm/Support/raw_ostream.h>
37#include <util/aligned_allocator.h>
38#include <sys/stat.h>
39#include <fcntl.h>
40#include <errno.h>
41#include <llvm/ADT/STLExtras.h> // for make_unique
42#include <llvm/Support/CommandLine.h>
43#include <llvm/Support/Debug.h>
44#include <sched.h>
45
46using namespace parabix;
47using namespace llvm;
48static cl::opt<int> Threads("t", cl::desc("Total number of threads."), cl::init(2));
49
50namespace grep {
51
52// Grep Engine construction and initialization.
53
54GrepEngine::GrepEngine() :
55    mGrepDriver(nullptr),
56    mNextFileToGrep(0),
57    mNextFileToPrint(0),
58    grepMatchFound(false),
59    mMoveMatchesToEOL(true),
60    mEngineThread(pthread_self()) {}
61
62GrepEngine::~GrepEngine() {
63    delete mGrepDriver;
64}
65
66QuietModeEngine::QuietModeEngine() : GrepEngine() {
67    mMoveMatchesToEOL = false;
68}
69
70MatchOnlyEngine::MatchOnlyEngine(bool showFilesWithoutMatch) :
71    GrepEngine(), mRequiredCount(showFilesWithoutMatch) {
72    mFileSuffix = NullFlag ? std::string("\0", 1) : "\n";
73    mMoveMatchesToEOL = false;
74}
75
76CountOnlyEngine::CountOnlyEngine() : GrepEngine() {
77    mFileSuffix = ":";
78}
79
80EmitMatchesEngine::EmitMatchesEngine() : GrepEngine() {
81    mFileSuffix = InitialTabFlag ? "\t:" : ":";
82    if (LineRegexpFlag) mMoveMatchesToEOL = false;
83}
84
85void GrepEngine::initFileResult(std::vector<std::string> & filenames) {
86    const unsigned n = filenames.size();
87    mResultStrs.resize(n);
88    mFileStatus.resize(n, FileStatus::Pending);
89    inputFiles = filenames;
90}
91
92// Code Generation
93//
94// All engines share a common pipeline to compute a stream of Matches from a given input Bytestream.
95
96std::pair<StreamSetBuffer *, StreamSetBuffer *> GrepEngine::grepPipeline(std::vector<re::RE *> & REs, StreamSetBuffer * ByteStream) {
97    auto & idb = mGrepDriver->getBuilder();
98    const unsigned segmentSize = codegen::SegmentSize;
99    const unsigned bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
100    const unsigned encodingBits = 8;
101
102    StreamSetBuffer * BasisBits = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(encodingBits, 1), segmentSize * bufferSegments);
103    kernel::Kernel * s2pk = mGrepDriver->addKernelInstance<kernel::S2PKernel>(idb);
104    mGrepDriver->makeKernelCall(s2pk, {ByteStream}, {BasisBits});
105
106    StreamSetBuffer * LineBreakStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
107    kernel::Kernel * linebreakK = mGrepDriver->addKernelInstance<kernel::LineBreakKernelBuilder>(idb, encodingBits);
108    mGrepDriver->makeKernelCall(linebreakK, {BasisBits}, {LineBreakStream});
109
110    kernel::Kernel * requiredStreamsK = mGrepDriver->addKernelInstance<kernel::RequiredStreams_UTF8>(idb);
111    StreamSetBuffer * RequiredStreams = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(4, 1), segmentSize * bufferSegments);
112    mGrepDriver->makeKernelCall(requiredStreamsK, {BasisBits}, {RequiredStreams});
113
114    const auto n = REs.size();
115    std::vector<std::vector<re::CC *>> charclasses(n);
116    for (unsigned i = 0; i < n; i++) {
117        REs[i] = resolveCaseInsensitiveMode(REs[i], grep::IgnoreCaseFlag);
118        REs[i] = resolveGraphemeMode(REs[i], false /* not in grapheme mode at top level*/);
119        REs[i] = re::resolveNames(REs[i]);
120        const auto UnicodeSets = re::collectUnicodeSets(REs[i]);
121        std::vector<std::vector<unsigned>> exclusiveSetIDs;
122        doMultiplexCCs(UnicodeSets, exclusiveSetIDs, charclasses[i]);
123        REs[i] = multiplex(REs[i], UnicodeSets, exclusiveSetIDs);
124        REs[i] = regular_expression_passes(REs[i]);
125  }
126
127    std::vector<StreamSetBuffer *> MatchResultsBufs(n);
128
129    for(unsigned i = 0; i < n; ++i){
130        const auto numOfCharacterClasses = charclasses[i].size();
131        StreamSetBuffer * CharClasses = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), segmentSize * bufferSegments);
132        kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(charclasses[i]));
133        mGrepDriver->makeKernelCall(ccK, {BasisBits}, {CharClasses});
134        StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
135        kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i], numOfCharacterClasses);
136        mGrepDriver->makeKernelCall(icgrepK, {CharClasses, LineBreakStream, RequiredStreams}, {MatchResults});
137        MatchResultsBufs[i] = MatchResults;
138    }
139    StreamSetBuffer * MergedResults = MatchResultsBufs[0];
140    if (REs.size() > 1) {
141        MergedResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
142        kernel::Kernel * streamsMergeK = mGrepDriver->addKernelInstance<kernel::StreamsMerge>(idb, 1, REs.size());
143        mGrepDriver->makeKernelCall(streamsMergeK, MatchResultsBufs, {MergedResults});
144    }
145    StreamSetBuffer * Matches = MergedResults;
146
147    if (mMoveMatchesToEOL) {
148        StreamSetBuffer * OriginalMatches = Matches;
149        kernel::Kernel * matchedLinesK = mGrepDriver->addKernelInstance<kernel::MatchedLinesKernel>(idb);
150        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
151        mGrepDriver->makeKernelCall(matchedLinesK, {OriginalMatches, LineBreakStream}, {Matches});
152    }
153
154    if (InvertMatchFlag) {
155        kernel::Kernel * invertK = mGrepDriver->addKernelInstance<kernel::InvertMatchesKernel>(idb);
156        StreamSetBuffer * OriginalMatches = Matches;
157        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
158        mGrepDriver->makeKernelCall(invertK, {OriginalMatches, LineBreakStream}, {Matches});
159    }
160    if (MaxCountFlag > 0) {
161        kernel::Kernel * untilK = mGrepDriver->addKernelInstance<kernel::UntilNkernel>(idb);
162        untilK->setInitialArguments({idb->getSize(MaxCountFlag)});
163        StreamSetBuffer * AllMatches = Matches;
164        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
165        mGrepDriver->makeKernelCall(untilK, {AllMatches}, {Matches});
166    }
167    return std::pair<StreamSetBuffer *, StreamSetBuffer *>(LineBreakStream, Matches);
168}
169
170// The QuietMode, MatchOnly and CountOnly engines share a common code generation main function,
171// which returns a count of the matches found (possibly subject to a MaxCount).
172//
173
174void GrepEngine::grepCodeGen(std::vector<re::RE *> REs) {
175
176    assert (mGrepDriver == nullptr);
177    mGrepDriver = new ParabixDriver("engine");
178    auto & idb = mGrepDriver->getBuilder();
179    Module * M = idb->getModule();
180
181    const auto segmentSize = codegen::SegmentSize;
182    const auto bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
183
184    const unsigned encodingBits = 8;
185
186    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getInt64Ty(), idb->getInt32Ty(), nullptr));
187    mainFunc->setCallingConv(CallingConv::C);
188    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
189    auto args = mainFunc->arg_begin();
190
191    Value * const fileDescriptor = &*(args++);
192    fileDescriptor->setName("fileDescriptor");
193
194    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits));
195    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance<kernel::FDSourceKernel>(idb, segmentSize * bufferSegments);
196    sourceK->setInitialArguments({fileDescriptor});
197    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
198
199    StreamSetBuffer * LineBreakStream;
200    StreamSetBuffer * Matches;
201    std::tie(LineBreakStream, Matches) = grepPipeline(REs, ByteStream);
202
203    kernel::Kernel * matchCountK = mGrepDriver->addKernelInstance<kernel::PopcountKernel>(idb);
204    mGrepDriver->makeKernelCall(matchCountK, {Matches}, {});
205    mGrepDriver->generatePipelineIR();
206    idb->setKernel(matchCountK);
207    Value * matchedLineCount = idb->getAccumulator("countResult");
208    matchedLineCount = idb->CreateZExt(matchedLineCount, idb->getInt64Ty());
209    mGrepDriver->deallocateBuffers();
210    idb->CreateRet(matchedLineCount);
211    mGrepDriver->finalizeObject();
212}
213
214//
215// The EmitMatches engine uses an EmitMatch accumulator object to concatenate together
216// matched lines.
217
218class EmitMatch : public MatchAccumulator {
219    friend class EmitMatchesEngine;
220public:
221    EmitMatch(std::string linePrefix, std::ostringstream & strm) : mLinePrefix(linePrefix), mLineCount(0), mTerminated(true), mResultStr(strm) {}
222    void accumulate_match(const size_t lineNum, char * line_start, char * line_end) override;
223    void finalize_match(char * buffer_end) override;
224protected:
225    std::string mLinePrefix;
226    size_t mLineCount;
227    bool mTerminated;
228    std::ostringstream & mResultStr;
229};
230
231//
232//  Default Report Match:  lines are emitted with whatever line terminators are found in the
233//  input.  However, if the final line is not terminated, a new line is appended.
234//
235void EmitMatch::accumulate_match (const size_t lineNum, char * line_start, char * line_end) {
236    if (WithFilenameFlag) {
237        mResultStr << mLinePrefix;
238    }
239    if (LineNumberFlag) {
240        // Internally line numbers are counted from 0.  For display, adjust
241        // the line number so that lines are numbered from 1.
242        if (InitialTabFlag) {
243            mResultStr << lineNum+1 << "\t:";
244        }
245        else {
246            mResultStr << lineNum+1 << ":";
247        }
248    }
249    size_t bytes = line_end - line_start + 1;
250    mResultStr.write(line_start, bytes);
251    mLineCount++;
252    unsigned last_byte = *line_end;
253    mTerminated = (last_byte >= 0x0A) && (last_byte <= 0x0D);
254    if (LLVM_UNLIKELY(!mTerminated)) {
255        if (last_byte == 0x85) {  //  Possible NEL terminator.
256            mTerminated = (bytes >= 2) && (static_cast<unsigned>(line_end[-1]) == 0xC2);
257        }
258        else {
259            // Possible LS or PS terminators.
260            mTerminated = (bytes >= 3) && (static_cast<unsigned>(line_end[-2]) == 0xE2)
261                                       && (static_cast<unsigned>(line_end[-1]) == 0x80)
262                                       && ((last_byte == 0xA8) || (last_byte == 0xA9));
263        }
264    }
265}
266
267void EmitMatch::finalize_match(char * buffer_end) {
268    if (!mTerminated) mResultStr << "\n";
269}
270
271void EmitMatchesEngine::grepCodeGen(std::vector<re::RE *> REs) {
272    assert (mGrepDriver == nullptr);
273    mGrepDriver = new ParabixDriver("engine");
274    auto & idb = mGrepDriver->getBuilder();
275    Module * M = idb->getModule();
276
277    const auto segmentSize = codegen::SegmentSize;
278    const auto bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
279    const unsigned encodingBits = 8;
280
281    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getInt64Ty(), idb->getInt32Ty(), idb->getIntAddrTy(), nullptr));
282    mainFunc->setCallingConv(CallingConv::C);
283    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
284    auto args = mainFunc->arg_begin();
285
286    Value * const fileDescriptor = &*(args++);
287    fileDescriptor->setName("fileDescriptor");
288    Value * match_accumulator = &*(args++);
289    match_accumulator->setName("match_accumulator");
290
291    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits));
292    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance<kernel::FDSourceKernel>(idb, segmentSize * bufferSegments);
293    sourceK->setInitialArguments({fileDescriptor});
294    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
295
296    StreamSetBuffer * LineBreakStream;
297    StreamSetBuffer * Matches;
298    std::tie(LineBreakStream, Matches) = grepPipeline(REs, ByteStream);
299
300    kernel::Kernel * scanMatchK = mGrepDriver->addKernelInstance<kernel::ScanMatchKernel>(idb);
301    scanMatchK->setInitialArguments({match_accumulator});
302    mGrepDriver->makeKernelCall(scanMatchK, {Matches, LineBreakStream, ByteStream}, {});
303    mGrepDriver->LinkFunction(*scanMatchK, "accumulate_match_wrapper", &accumulate_match_wrapper);
304    mGrepDriver->LinkFunction(*scanMatchK, "finalize_match_wrapper", &finalize_match_wrapper);
305
306    mGrepDriver->generatePipelineIR();
307    mGrepDriver->deallocateBuffers();
308    idb->CreateRet(idb->getInt64(0));
309    mGrepDriver->finalizeObject();
310}
311
312
313//
314//  The doGrep methods apply a GrepEngine to a single file, processing the results
315//  differently based on the engine type.
316
317uint64_t GrepEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
318    typedef uint64_t (*GrepFunctionType)(int32_t fileDescriptor);
319    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
320
321    int32_t fileDescriptor = openFile(fileName, mResultStrs[fileIdx]);
322    if (fileDescriptor == -1) return 0;
323
324    uint64_t grepResult = f(fileDescriptor);
325    close(fileDescriptor);
326    return grepResult;
327}
328
329uint64_t CountOnlyEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
330    uint64_t grepResult = GrepEngine::doGrep(fileName, fileIdx);
331    if (WithFilenameFlag) mResultStrs[fileIdx] << linePrefix(fileName);
332    mResultStrs[fileIdx] << grepResult << "\n";
333    return grepResult;
334}
335
336std::string GrepEngine::linePrefix(std::string fileName) {
337    if (fileName == "-") {
338        return LabelFlag + mFileSuffix;
339    }
340    else {
341        return fileName + mFileSuffix;
342    }
343}
344
345uint64_t MatchOnlyEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
346    uint64_t grepResult = GrepEngine::doGrep(fileName, fileIdx);
347    if (grepResult == mRequiredCount) {
348       mResultStrs[fileIdx] << linePrefix(fileName);
349    }
350    return grepResult;
351}
352
353uint64_t EmitMatchesEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
354    typedef uint64_t (*GrepFunctionType)(int32_t fileDescriptor, intptr_t accum_addr);
355    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
356
357    int32_t fileDescriptor = openFile(fileName, mResultStrs[fileIdx]);
358    if (fileDescriptor == -1) return 0;
359    EmitMatch accum(linePrefix(fileName), mResultStrs[fileIdx]);
360    f(fileDescriptor, reinterpret_cast<intptr_t>(&accum));
361    close(fileDescriptor);
362    if (accum.mLineCount > 0) grepMatchFound = true;
363    return accum.mLineCount;
364}
365
366// Open a file and return its file descriptor.
367int32_t GrepEngine::openFile(const std::string & fileName, std::ostringstream & msgstrm) {
368    if (fileName == "-") {
369        return STDIN_FILENO;
370    }
371    else {
372        struct stat sb;
373        int32_t fileDescriptor = open(fileName.c_str(), O_RDONLY);
374        if (LLVM_UNLIKELY(fileDescriptor == -1)) {
375            if (!NoMessagesFlag) {
376                if (errno == EACCES) {
377                    msgstrm << "icgrep: " << fileName << ": Permission denied.\n";
378                }
379                else if (errno == ENOENT) {
380                    msgstrm << "icgrep: " << fileName << ": No such file.\n";
381                }
382                else {
383                    msgstrm << "icgrep: " << fileName << ": Failed.\n";
384                }
385            }
386            return fileDescriptor;
387        }
388        if (stat(fileName.c_str(), &sb) == 0 && S_ISDIR(sb.st_mode)) {
389            if (!NoMessagesFlag) {
390                msgstrm << "icgrep: " << fileName << ": Is a directory.\n";
391            }
392            close(fileDescriptor);
393            return -1;
394        }
395        return fileDescriptor;
396    }
397}
398
399// The process of searching a group of files may use a sequential or a task
400// parallel approach.
401
402void * DoGrepThreadFunction(void *args) {
403    return reinterpret_cast<GrepEngine *>(args)->DoGrepThreadMethod();
404}
405
406bool GrepEngine::searchAllFiles() {
407    const unsigned numOfThreads = Threads; // <- convert the command line value into an integer to allow stack allocation
408    pthread_t threads[numOfThreads];
409
410    for(unsigned long i = 1; i < numOfThreads; ++i) {
411        const int rc = pthread_create(&threads[i], nullptr, DoGrepThreadFunction, (void *)this);
412        if (rc) {
413            llvm::report_fatal_error("Failed to create thread: code " + std::to_string(rc));
414        }
415    }
416    // Main thread also does the work;
417
418    DoGrepThreadMethod();
419    for(unsigned i = 1; i < numOfThreads; ++i) {
420        void * status = nullptr;
421        const int rc = pthread_join(threads[i], &status);
422        if (rc) {
423            llvm::report_fatal_error("Failed to join thread: code " + std::to_string(rc));
424        }
425    }
426    return grepMatchFound;
427}
428
429
430// DoGrep thread function.
431void * GrepEngine::DoGrepThreadMethod() {
432
433    unsigned fileIdx = mNextFileToGrep++;
434    while (fileIdx < inputFiles.size()) {
435        const auto grepResult = doGrep(inputFiles[fileIdx], fileIdx);
436        mFileStatus[fileIdx] = FileStatus::GrepComplete;
437        if (grepResult > 0) {
438            grepMatchFound = true;
439        }
440        if (QuietMode && grepMatchFound) {
441            if (pthread_self() != mEngineThread) {
442                pthread_exit(nullptr);
443            }
444            return nullptr;
445        }
446        fileIdx = mNextFileToGrep++;
447    }
448
449    unsigned printIdx = mNextFileToPrint++;
450    while (printIdx < inputFiles.size()) {
451        const bool readyToPrint = ((printIdx == 0) || (mFileStatus[printIdx - 1] == FileStatus::PrintComplete)) && (mFileStatus[printIdx] == FileStatus::GrepComplete);
452        if (readyToPrint) {
453            const auto output = mResultStrs[printIdx].str();
454            if (!output.empty()) {
455                llvm::outs() << output;
456            }
457            mFileStatus[printIdx] = FileStatus::PrintComplete;
458            printIdx = mNextFileToPrint++;
459        } else {
460            mGrepDriver->performIncrementalCacheCleanupStep();
461        }
462        sched_yield();
463    }
464
465    if (pthread_self() != mEngineThread) {
466        pthread_exit(nullptr);
467    } else {
468        return nullptr;
469    }
470}
471
472}
Note: See TracBrowser for help on using the repository browser.