source: icGREP/icgrep-devel/icgrep/grep_engine.cpp @ 5782

Last change on this file since 5782 was 5782, checked in by nmedfort, 16 months ago

Initial check-in of LookAhead support; modified LineBreakKernel to compute CR+LF using LookAhead(1) + misc. fixes.

File size: 19.2 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "grep_engine.h"
8#include "grep_interface.h"
9#include <llvm/IR/Module.h>
10#include <boost/filesystem.hpp>
11#include <UCD/resolve_properties.h>
12#include <kernels/charclasses.h>
13#include <kernels/cc_kernel.h>
14#include <kernels/grep_kernel.h>
15#include <kernels/linebreak_kernel.h>
16#include <kernels/streams_merge.h>
17#include <kernels/source_kernel.h>
18#include <kernels/s2p_kernel.h>
19#include <kernels/scanmatchgen.h>
20#include <kernels/streamset.h>
21#include <kernels/until_n.h>
22#include <kernels/kernel_builder.h>
23#include <pablo/pablo_kernel.h>
24#include <re/re_cc.h>
25#include <re/casing.h>
26#include <re/exclude_CC.h>
27#include <re/re_toolchain.h>
28#include <toolchain/toolchain.h>
29#include <re/re_name_resolve.h>
30#include <re/re_collect_unicodesets.h>
31#include <re/re_multiplex.h>
32#include <re/grapheme_clusters.h>
33#include <toolchain/toolchain.h>
34#include <toolchain/cpudriver.h>
35#include <iostream>
36#include <cc/multiplex_CCs.h>
37#include <llvm/Support/raw_ostream.h>
38#include <util/aligned_allocator.h>
39#include <sys/stat.h>
40#include <fcntl.h>
41#include <errno.h>
42#include <llvm/ADT/STLExtras.h> // for make_unique
43#include <llvm/Support/CommandLine.h>
44#include <llvm/Support/Debug.h>
45#include <sched.h>
46
47using namespace parabix;
48using namespace llvm;
49static cl::opt<int> Threads("t", cl::desc("Total number of threads."), cl::init(2));
50
51namespace grep {
52
53// Grep Engine construction and initialization.
54
55GrepEngine::GrepEngine() :
56    mGrepDriver(nullptr),
57    mNextFileToGrep(0),
58    mNextFileToPrint(0),
59    grepMatchFound(false),
60    mMoveMatchesToEOL(true),
61    mEngineThread(pthread_self()) {}
62
63GrepEngine::~GrepEngine() {
64    delete mGrepDriver;
65}
66
67QuietModeEngine::QuietModeEngine() : GrepEngine() {
68    mMoveMatchesToEOL = false;
69}
70
71MatchOnlyEngine::MatchOnlyEngine(bool showFilesWithoutMatch) :
72    GrepEngine(), mRequiredCount(showFilesWithoutMatch) {
73    mFileSuffix = NullFlag ? std::string("\0", 1) : "\n";
74    mMoveMatchesToEOL = false;
75}
76
77CountOnlyEngine::CountOnlyEngine() : GrepEngine() {
78    mFileSuffix = ":";
79}
80
81EmitMatchesEngine::EmitMatchesEngine() : GrepEngine() {
82    mFileSuffix = InitialTabFlag ? "\t:" : ":";
83    if (LineRegexpFlag) mMoveMatchesToEOL = false;
84}
85
86void GrepEngine::initFileResult(std::vector<std::string> & filenames) {
87    const unsigned n = filenames.size();
88    mResultStrs.resize(n);
89    mFileStatus.resize(n, FileStatus::Pending);
90    inputFiles = filenames;
91}
92
93// Code Generation
94//
95// All engines share a common pipeline to compute a stream of Matches from a given input Bytestream.
96
97std::pair<StreamSetBuffer *, StreamSetBuffer *> GrepEngine::grepPipeline(std::vector<re::RE *> & REs, StreamSetBuffer * ByteStream) {
98    auto & idb = mGrepDriver->getBuilder();
99    const unsigned segmentSize = codegen::SegmentSize;
100    const unsigned bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
101    const unsigned encodingBits = 8;
102
103    StreamSetBuffer * BasisBits = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(encodingBits, 1), segmentSize * bufferSegments + 1);
104    kernel::Kernel * s2pk = mGrepDriver->addKernelInstance<kernel::S2PKernel>(idb);
105    mGrepDriver->makeKernelCall(s2pk, {ByteStream}, {BasisBits});
106
107    StreamSetBuffer * LineFeedStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments + 1);
108    kernel::Kernel * linefeedK = mGrepDriver->addKernelInstance<kernel::LineFeedKernelBuilder>(idb, encodingBits);
109    mGrepDriver->makeKernelCall(linefeedK, {BasisBits}, {LineFeedStream});
110
111    StreamSetBuffer * LineBreakStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
112    StreamSetBuffer * CRLFStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
113    kernel::Kernel * linebreakK = mGrepDriver->addKernelInstance<kernel::LineBreakKernelBuilder>(idb, encodingBits);
114    mGrepDriver->makeKernelCall(linebreakK, {BasisBits, LineFeedStream}, {LineBreakStream, CRLFStream});
115
116    kernel::Kernel * requiredStreamsK = mGrepDriver->addKernelInstance<kernel::RequiredStreams_UTF8>(idb);
117    StreamSetBuffer * RequiredStreams = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(3, 1), segmentSize * bufferSegments);
118    mGrepDriver->makeKernelCall(requiredStreamsK, {BasisBits}, {RequiredStreams});
119
120    const auto n = REs.size();
121    std::vector<std::vector<re::CC *>> charclasses(n);
122    for (unsigned i = 0; i < n; i++) {
123        REs[i] = resolveCaseInsensitiveMode(REs[i], grep::IgnoreCaseFlag);
124        REs[i] = resolveGraphemeMode(REs[i], false /* not in grapheme mode at top level*/);
125        REs[i] = re::resolveNames(REs[i]);
126        REs[i] = exclude_CC(REs[i], re::makeCC(re::makeCC(0x0A, 0x0D), re::makeCC(re::makeCC(0x85), re::makeCC(0x2028, 0x2029))));
127
128        const auto UnicodeSets = re::collectUnicodeSets(REs[i]);
129        std::vector<std::vector<unsigned>> exclusiveSetIDs;
130        doMultiplexCCs(UnicodeSets, exclusiveSetIDs, charclasses[i]);
131        REs[i] = multiplex(REs[i], UnicodeSets, exclusiveSetIDs);
132        REs[i] = regular_expression_passes(REs[i]);
133  }
134
135    std::vector<StreamSetBuffer *> MatchResultsBufs(n);
136
137    for(unsigned i = 0; i < n; ++i){
138        const auto numOfCharacterClasses = charclasses[i].size();
139        StreamSetBuffer * CharClasses = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), segmentSize * bufferSegments);
140        kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(charclasses[i]));
141        mGrepDriver->makeKernelCall(ccK, {BasisBits}, {CharClasses});
142        StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
143        kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i], numOfCharacterClasses);
144        mGrepDriver->makeKernelCall(icgrepK, {CharClasses, LineBreakStream, CRLFStream, RequiredStreams}, {MatchResults});
145        MatchResultsBufs[i] = MatchResults;
146    }
147    StreamSetBuffer * MergedResults = MatchResultsBufs[0];
148    if (REs.size() > 1) {
149        MergedResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
150        kernel::Kernel * streamsMergeK = mGrepDriver->addKernelInstance<kernel::StreamsMerge>(idb, 1, REs.size());
151        mGrepDriver->makeKernelCall(streamsMergeK, MatchResultsBufs, {MergedResults});
152    }
153    StreamSetBuffer * Matches = MergedResults;
154
155    if (mMoveMatchesToEOL) {
156        StreamSetBuffer * OriginalMatches = Matches;
157        kernel::Kernel * matchedLinesK = mGrepDriver->addKernelInstance<kernel::MatchedLinesKernel>(idb);
158        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
159        mGrepDriver->makeKernelCall(matchedLinesK, {OriginalMatches, LineBreakStream}, {Matches});
160    }
161
162    if (InvertMatchFlag) {
163        kernel::Kernel * invertK = mGrepDriver->addKernelInstance<kernel::InvertMatchesKernel>(idb);
164        StreamSetBuffer * OriginalMatches = Matches;
165        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
166        mGrepDriver->makeKernelCall(invertK, {OriginalMatches, LineBreakStream}, {Matches});
167    }
168    if (MaxCountFlag > 0) {
169        kernel::Kernel * untilK = mGrepDriver->addKernelInstance<kernel::UntilNkernel>(idb);
170        untilK->setInitialArguments({idb->getSize(MaxCountFlag)});
171        StreamSetBuffer * AllMatches = Matches;
172        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
173        mGrepDriver->makeKernelCall(untilK, {AllMatches}, {Matches});
174    }
175    return std::pair<StreamSetBuffer *, StreamSetBuffer *>(LineBreakStream, Matches);
176}
177
178// The QuietMode, MatchOnly and CountOnly engines share a common code generation main function,
179// which returns a count of the matches found (possibly subject to a MaxCount).
180//
181
182void GrepEngine::grepCodeGen(std::vector<re::RE *> REs) {
183
184    assert (mGrepDriver == nullptr);
185    mGrepDriver = new ParabixDriver("engine");
186    auto & idb = mGrepDriver->getBuilder();
187    Module * M = idb->getModule();
188
189    const auto segmentSize = codegen::SegmentSize;
190    const auto bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
191
192    const unsigned encodingBits = 8;
193
194    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getInt64Ty(), idb->getInt32Ty(), nullptr));
195    mainFunc->setCallingConv(CallingConv::C);
196    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
197    auto args = mainFunc->arg_begin();
198
199    Value * const fileDescriptor = &*(args++);
200    fileDescriptor->setName("fileDescriptor");
201
202    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits));
203    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance<kernel::FDSourceKernel>(idb, segmentSize * bufferSegments);
204    sourceK->setInitialArguments({fileDescriptor});
205    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
206
207    StreamSetBuffer * LineBreakStream;
208    StreamSetBuffer * Matches;
209    std::tie(LineBreakStream, Matches) = grepPipeline(REs, ByteStream);
210
211    kernel::Kernel * matchCountK = mGrepDriver->addKernelInstance<kernel::PopcountKernel>(idb);
212    mGrepDriver->makeKernelCall(matchCountK, {Matches}, {});
213    mGrepDriver->generatePipelineIR();
214    idb->setKernel(matchCountK);
215    Value * matchedLineCount = idb->getAccumulator("countResult");
216    matchedLineCount = idb->CreateZExt(matchedLineCount, idb->getInt64Ty());
217    mGrepDriver->deallocateBuffers();
218    idb->CreateRet(matchedLineCount);
219    mGrepDriver->finalizeObject();
220}
221
//
// The EmitMatches engine uses an EmitMatch accumulator object to concatenate
// together the matched lines of a file.
225
226class EmitMatch : public MatchAccumulator {
227    friend class EmitMatchesEngine;
228public:
229    EmitMatch(std::string linePrefix, std::ostringstream & strm) : mLinePrefix(linePrefix), mLineCount(0), mTerminated(true), mResultStr(strm) {}
230    void accumulate_match(const size_t lineNum, char * line_start, char * line_end) override;
231    void finalize_match(char * buffer_end) override;
232protected:
233    std::string mLinePrefix;
234    size_t mLineCount;
235    bool mTerminated;
236    std::ostringstream & mResultStr;
237};
238
239//
240//  Default Report Match:  lines are emitted with whatever line terminators are found in the
241//  input.  However, if the final line is not terminated, a new line is appended.
242//
243void EmitMatch::accumulate_match (const size_t lineNum, char * line_start, char * line_end) {
244    if (WithFilenameFlag) {
245        mResultStr << mLinePrefix;
246    }
247    if (LineNumberFlag) {
248        // Internally line numbers are counted from 0.  For display, adjust
249        // the line number so that lines are numbered from 1.
250        if (InitialTabFlag) {
251            mResultStr << lineNum+1 << "\t:";
252        }
253        else {
254            mResultStr << lineNum+1 << ":";
255        }
256    }
257    size_t bytes = line_end - line_start + 1;
258    mResultStr.write(line_start, bytes);
259    mLineCount++;
260    unsigned last_byte = *line_end;
261    mTerminated = (last_byte >= 0x0A) && (last_byte <= 0x0D);
262    if (LLVM_UNLIKELY(!mTerminated)) {
263        if (last_byte == 0x85) {  //  Possible NEL terminator.
264            mTerminated = (bytes >= 2) && (static_cast<unsigned>(line_end[-1]) == 0xC2);
265        }
266        else {
267            // Possible LS or PS terminators.
268            mTerminated = (bytes >= 3) && (static_cast<unsigned>(line_end[-2]) == 0xE2)
269                                       && (static_cast<unsigned>(line_end[-1]) == 0x80)
270                                       && ((last_byte == 0xA8) || (last_byte == 0xA9));
271        }
272    }
273}
274
275void EmitMatch::finalize_match(char * buffer_end) {
276    if (!mTerminated) mResultStr << "\n";
277}
278
279void EmitMatchesEngine::grepCodeGen(std::vector<re::RE *> REs) {
280    assert (mGrepDriver == nullptr);
281    mGrepDriver = new ParabixDriver("engine");
282    auto & idb = mGrepDriver->getBuilder();
283    Module * M = idb->getModule();
284
285    const auto segmentSize = codegen::SegmentSize;
286    const auto bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
287    const unsigned encodingBits = 8;
288
289    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getInt64Ty(), idb->getInt32Ty(), idb->getIntAddrTy(), nullptr));
290    mainFunc->setCallingConv(CallingConv::C);
291    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
292    auto args = mainFunc->arg_begin();
293
294    Value * const fileDescriptor = &*(args++);
295    fileDescriptor->setName("fileDescriptor");
296    Value * match_accumulator = &*(args++);
297    match_accumulator->setName("match_accumulator");
298
299    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits));
300    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance<kernel::FDSourceKernel>(idb, segmentSize * bufferSegments);
301    sourceK->setInitialArguments({fileDescriptor});
302    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
303
304    StreamSetBuffer * LineBreakStream;
305    StreamSetBuffer * Matches;
306    std::tie(LineBreakStream, Matches) = grepPipeline(REs, ByteStream);
307
308    kernel::Kernel * scanMatchK = mGrepDriver->addKernelInstance<kernel::ScanMatchKernel>(idb);
309    scanMatchK->setInitialArguments({match_accumulator});
310    mGrepDriver->makeKernelCall(scanMatchK, {Matches, LineBreakStream, ByteStream}, {});
311    mGrepDriver->LinkFunction(*scanMatchK, "accumulate_match_wrapper", &accumulate_match_wrapper);
312    mGrepDriver->LinkFunction(*scanMatchK, "finalize_match_wrapper", &finalize_match_wrapper);
313
314    mGrepDriver->generatePipelineIR();
315    mGrepDriver->deallocateBuffers();
316    idb->CreateRet(idb->getInt64(0));
317    mGrepDriver->finalizeObject();
318}
319
320
321//
322//  The doGrep methods apply a GrepEngine to a single file, processing the results
323//  differently based on the engine type.
324
325uint64_t GrepEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
326    typedef uint64_t (*GrepFunctionType)(int32_t fileDescriptor);
327    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
328
329    int32_t fileDescriptor = openFile(fileName, mResultStrs[fileIdx]);
330    if (fileDescriptor == -1) return 0;
331
332    uint64_t grepResult = f(fileDescriptor);
333    close(fileDescriptor);
334    return grepResult;
335}
336
337uint64_t CountOnlyEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
338    uint64_t grepResult = GrepEngine::doGrep(fileName, fileIdx);
339    if (WithFilenameFlag) mResultStrs[fileIdx] << linePrefix(fileName);
340    mResultStrs[fileIdx] << grepResult << "\n";
341    return grepResult;
342}
343
344std::string GrepEngine::linePrefix(std::string fileName) {
345    if (fileName == "-") {
346        return LabelFlag + mFileSuffix;
347    }
348    else {
349        return fileName + mFileSuffix;
350    }
351}
352
353uint64_t MatchOnlyEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
354    uint64_t grepResult = GrepEngine::doGrep(fileName, fileIdx);
355    if (grepResult == mRequiredCount) {
356       mResultStrs[fileIdx] << linePrefix(fileName);
357    }
358    return grepResult;
359}
360
361uint64_t EmitMatchesEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
362    typedef uint64_t (*GrepFunctionType)(int32_t fileDescriptor, intptr_t accum_addr);
363    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
364
365    int32_t fileDescriptor = openFile(fileName, mResultStrs[fileIdx]);
366    if (fileDescriptor == -1) return 0;
367    EmitMatch accum(linePrefix(fileName), mResultStrs[fileIdx]);
368    f(fileDescriptor, reinterpret_cast<intptr_t>(&accum));
369    close(fileDescriptor);
370    if (accum.mLineCount > 0) grepMatchFound = true;
371    return accum.mLineCount;
372}
373
374// Open a file and return its file desciptor.
375int32_t GrepEngine::openFile(const std::string & fileName, std::ostringstream & msgstrm) {
376    if (fileName == "-") {
377        return STDIN_FILENO;
378    }
379    else {
380        struct stat sb;
381        int32_t fileDescriptor = open(fileName.c_str(), O_RDONLY);
382        if (LLVM_UNLIKELY(fileDescriptor == -1)) {
383            if (!NoMessagesFlag) {
384                if (errno == EACCES) {
385                    msgstrm << "icgrep: " << fileName << ": Permission denied.\n";
386                }
387                else if (errno == ENOENT) {
388                    msgstrm << "icgrep: " << fileName << ": No such file.\n";
389                }
390                else {
391                    msgstrm << "icgrep: " << fileName << ": Failed.\n";
392                }
393            }
394            return fileDescriptor;
395        }
396        if (stat(fileName.c_str(), &sb) == 0 && S_ISDIR(sb.st_mode)) {
397            if (!NoMessagesFlag) {
398                msgstrm << "icgrep: " << fileName << ": Is a directory.\n";
399            }
400            close(fileDescriptor);
401            return -1;
402        }
403        return fileDescriptor;
404    }
405}
406
407// The process of searching a group of files may use a sequential or a task
408// parallel approach.
409
410void * DoGrepThreadFunction(void *args) {
411    return reinterpret_cast<GrepEngine *>(args)->DoGrepThreadMethod();
412}
413
414bool GrepEngine::searchAllFiles() {
415    const unsigned numOfThreads = Threads; // <- convert the command line value into an integer to allow stack allocation
416    pthread_t threads[numOfThreads];
417
418    for(unsigned long i = 1; i < numOfThreads; ++i) {
419        const int rc = pthread_create(&threads[i], nullptr, DoGrepThreadFunction, (void *)this);
420        if (rc) {
421            llvm::report_fatal_error("Failed to create thread: code " + std::to_string(rc));
422        }
423    }
424    // Main thread also does the work;
425
426    DoGrepThreadMethod();
427    for(unsigned i = 1; i < numOfThreads; ++i) {
428        void * status = nullptr;
429        const int rc = pthread_join(threads[i], &status);
430        if (rc) {
431            llvm::report_fatal_error("Failed to join thread: code " + std::to_string(rc));
432        }
433    }
434    return grepMatchFound;
435}
436
437
438// DoGrep thread function.
439void * GrepEngine::DoGrepThreadMethod() {
440
441    unsigned fileIdx = mNextFileToGrep++;
442    while (fileIdx < inputFiles.size()) {
443        const auto grepResult = doGrep(inputFiles[fileIdx], fileIdx);
444        mFileStatus[fileIdx] = FileStatus::GrepComplete;
445        if (grepResult > 0) {
446            grepMatchFound = true;
447        }
448        if (QuietMode && grepMatchFound) {
449            if (pthread_self() != mEngineThread) {
450                pthread_exit(nullptr);
451            }
452            return nullptr;
453        }
454        fileIdx = mNextFileToGrep++;
455    }
456
457    unsigned printIdx = mNextFileToPrint++;
458    while (printIdx < inputFiles.size()) {
459        const bool readyToPrint = ((printIdx == 0) || (mFileStatus[printIdx - 1] == FileStatus::PrintComplete)) && (mFileStatus[printIdx] == FileStatus::GrepComplete);
460        if (readyToPrint) {
461            const auto output = mResultStrs[printIdx].str();
462            if (!output.empty()) {
463                llvm::outs() << output;
464            }
465            mFileStatus[printIdx] = FileStatus::PrintComplete;
466            printIdx = mNextFileToPrint++;
467        } else {
468            mGrepDriver->performIncrementalCacheCleanupStep();
469        }
470        sched_yield();
471    }
472
473    if (pthread_self() != mEngineThread) {
474        pthread_exit(nullptr);
475    } else {
476        return nullptr;
477    }
478}
479
480}
Note: See TracBrowser for help on using the repository browser.