source: icGREP/icgrep-devel/icgrep/grep_engine.cpp @ 5795

Last change on this file since 5795 was 5795, checked in by cameron, 13 months ago

Adding Alphabet to CCs: initial check-in

File size: 19.5 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "grep_engine.h"
8#include "grep_interface.h"
9#include <llvm/IR/Module.h>
10#include <boost/filesystem.hpp>
11#include <UCD/resolve_properties.h>
12#include <kernels/charclasses.h>
13#include <kernels/cc_kernel.h>
14#include <kernels/grep_kernel.h>
15#include <kernels/linebreak_kernel.h>
16#include <kernels/streams_merge.h>
17#include <kernels/source_kernel.h>
18#include <kernels/s2p_kernel.h>
19#include <kernels/scanmatchgen.h>
20#include <kernels/streamset.h>
21#include <kernels/until_n.h>
22#include <kernels/kernel_builder.h>
23#include <pablo/pablo_kernel.h>
24#include <re/re_cc.h>
25#include <re/casing.h>
26#include <re/exclude_CC.h>
27#include <re/re_toolchain.h>
28#include <toolchain/toolchain.h>
29#include <re/re_name_resolve.h>
30#include <re/re_collect_unicodesets.h>
31#include <re/re_multiplex.h>
32#include <re/grapheme_clusters.h>
33#include <toolchain/toolchain.h>
34#include <toolchain/cpudriver.h>
35#include <iostream>
36#include <cc/multiplex_CCs.h>
37#include <llvm/Support/raw_ostream.h>
38#include <util/aligned_allocator.h>
39#include <sys/stat.h>
40#include <fcntl.h>
41#include <errno.h>
42#include <llvm/ADT/STLExtras.h> // for make_unique
43#include <llvm/Support/CommandLine.h>
44#include <llvm/Support/Debug.h>
45#include <sched.h>
46
47using namespace parabix;
48using namespace llvm;
49using namespace cc;
50
51static cl::opt<int> Threads("t", cl::desc("Total number of threads."), cl::init(2));
52
53namespace grep {
54
55// Grep Engine construction and initialization.
56
57GrepEngine::GrepEngine() :
58    mGrepDriver(nullptr),
59    mNextFileToGrep(0),
60    mNextFileToPrint(0),
61    grepMatchFound(false),
62    mMoveMatchesToEOL(true),
63    mEngineThread(pthread_self()) {}
64
65GrepEngine::~GrepEngine() {
66    delete mGrepDriver;
67}
68
69QuietModeEngine::QuietModeEngine() : GrepEngine() {
70    mMoveMatchesToEOL = false;
71}
72
73MatchOnlyEngine::MatchOnlyEngine(bool showFilesWithoutMatch) :
74    GrepEngine(), mRequiredCount(showFilesWithoutMatch) {
75    mFileSuffix = NullFlag ? std::string("\0", 1) : "\n";
76    mMoveMatchesToEOL = false;
77}
78
79CountOnlyEngine::CountOnlyEngine() : GrepEngine() {
80    mFileSuffix = ":";
81}
82
83EmitMatchesEngine::EmitMatchesEngine() : GrepEngine() {
84    mFileSuffix = InitialTabFlag ? "\t:" : ":";
85    if (LineRegexpFlag) mMoveMatchesToEOL = false;
86}
87
88void GrepEngine::initFileResult(std::vector<std::string> & filenames) {
89    const unsigned n = filenames.size();
90    mResultStrs.resize(n);
91    mFileStatus.resize(n, FileStatus::Pending);
92    inputFiles = filenames;
93}
94
95// Code Generation
96//
97// All engines share a common pipeline to compute a stream of Matches from a given input Bytestream.
98
99std::pair<StreamSetBuffer *, StreamSetBuffer *> GrepEngine::grepPipeline(std::vector<re::RE *> & REs, StreamSetBuffer * ByteStream) {
100    auto & idb = mGrepDriver->getBuilder();
101    const unsigned segmentSize = codegen::SegmentSize;
102    const unsigned bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
103    const unsigned encodingBits = 8;
104
105    StreamSetBuffer * BasisBits = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(encodingBits, 1), segmentSize * bufferSegments);
106    kernel::Kernel * s2pk = mGrepDriver->addKernelInstance<kernel::S2PKernel>(idb);
107    mGrepDriver->makeKernelCall(s2pk, {ByteStream}, {BasisBits});
108
109    StreamSetBuffer * LineFeedStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
110    kernel::Kernel * linefeedK = mGrepDriver->addKernelInstance<kernel::LineFeedKernelBuilder>(idb, encodingBits);
111    mGrepDriver->makeKernelCall(linefeedK, {BasisBits}, {LineFeedStream});
112
113    StreamSetBuffer * LineBreakStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
114    StreamSetBuffer * CRLFStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
115    kernel::Kernel * linebreakK = mGrepDriver->addKernelInstance<kernel::LineBreakKernelBuilder>(idb, encodingBits);
116    mGrepDriver->makeKernelCall(linebreakK, {BasisBits, LineFeedStream}, {LineBreakStream, CRLFStream});
117
118    kernel::Kernel * requiredStreamsK = mGrepDriver->addKernelInstance<kernel::RequiredStreams_UTF8>(idb);
119    StreamSetBuffer * RequiredStreams = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(3, 1), segmentSize * bufferSegments);
120    mGrepDriver->makeKernelCall(requiredStreamsK, {BasisBits}, {RequiredStreams});
121
122    const auto n = REs.size();
123    std::vector<std::vector<re::CC *>> charclasses(n);
124    std::vector<StreamSetBuffer *> MatchResultsBufs(n);
125
126    for(unsigned i = 0; i < n; ++i){
127#define USE_MULTIPLEX_CC
128#ifdef USE_MULTIPLEX_CC
129       
130        REs[i] = multiplexing_prepasses(REs[i]);
131        const std::vector<const re::CC *> UnicodeSets = re::collectUnicodeSets(REs[i]);
132        std::unique_ptr<cc::MultiplexedAlphabet> mpx = make_unique<MultiplexedAlphabet>("mpx", UnicodeSets);
133        REs[i] = multiplex(REs[i], UnicodeSets, mpx->getExclusiveSetIDs());
134        std::vector<re::CC *> mpx_basis = mpx->getMultiplexedCCs();
135        auto numOfCharacterClasses = mpx_basis.size();
136        StreamSetBuffer * CharClasses = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), segmentSize * bufferSegments);
137        kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis));
138        mGrepDriver->makeKernelCall(ccK, {BasisBits}, {CharClasses});
139        StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
140        kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i], numOfCharacterClasses);
141        mGrepDriver->makeKernelCall(icgrepK, {CharClasses, LineBreakStream, CRLFStream, RequiredStreams}, {MatchResults});
142#else
143        REs[i] = regular_expression_passes(REs[i]);
144        StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
145        kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i]);
146        mGrepDriver->makeKernelCall(icgrepK, {BasisBits, LineBreakStream, CRLFStream, RequiredStreams}, {MatchResults});
147#endif
148        MatchResultsBufs[i] = MatchResults;
149    }
150    StreamSetBuffer * MergedResults = MatchResultsBufs[0];
151    if (REs.size() > 1) {
152        MergedResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
153        kernel::Kernel * streamsMergeK = mGrepDriver->addKernelInstance<kernel::StreamsMerge>(idb, 1, REs.size());
154        mGrepDriver->makeKernelCall(streamsMergeK, MatchResultsBufs, {MergedResults});
155    }
156    StreamSetBuffer * Matches = MergedResults;
157
158    if (mMoveMatchesToEOL) {
159        StreamSetBuffer * OriginalMatches = Matches;
160        kernel::Kernel * matchedLinesK = mGrepDriver->addKernelInstance<kernel::MatchedLinesKernel>(idb);
161        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
162        mGrepDriver->makeKernelCall(matchedLinesK, {OriginalMatches, LineBreakStream}, {Matches});
163    }
164
165    if (InvertMatchFlag) {
166        kernel::Kernel * invertK = mGrepDriver->addKernelInstance<kernel::InvertMatchesKernel>(idb);
167        StreamSetBuffer * OriginalMatches = Matches;
168        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
169        mGrepDriver->makeKernelCall(invertK, {OriginalMatches, LineBreakStream}, {Matches});
170    }
171    if (MaxCountFlag > 0) {
172        kernel::Kernel * untilK = mGrepDriver->addKernelInstance<kernel::UntilNkernel>(idb);
173        untilK->setInitialArguments({idb->getSize(MaxCountFlag)});
174        StreamSetBuffer * AllMatches = Matches;
175        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
176        mGrepDriver->makeKernelCall(untilK, {AllMatches}, {Matches});
177    }
178    return std::pair<StreamSetBuffer *, StreamSetBuffer *>(LineBreakStream, Matches);
179}
180
181// The QuietMode, MatchOnly and CountOnly engines share a common code generation main function,
182// which returns a count of the matches found (possibly subject to a MaxCount).
183//
184
185void GrepEngine::grepCodeGen(std::vector<re::RE *> REs) {
186
187    assert (mGrepDriver == nullptr);
188    mGrepDriver = new ParabixDriver("engine");
189    auto & idb = mGrepDriver->getBuilder();
190    Module * M = idb->getModule();
191
192    const auto segmentSize = codegen::SegmentSize;
193    const auto bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
194
195    const unsigned encodingBits = 8;
196
197    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getInt64Ty(), idb->getInt32Ty(), nullptr));
198    mainFunc->setCallingConv(CallingConv::C);
199    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
200    auto args = mainFunc->arg_begin();
201
202    Value * const fileDescriptor = &*(args++);
203    fileDescriptor->setName("fileDescriptor");
204
205    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits));
206    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance<kernel::FDSourceKernel>(idb, segmentSize * bufferSegments);
207    sourceK->setInitialArguments({fileDescriptor});
208    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
209
210    StreamSetBuffer * LineBreakStream;
211    StreamSetBuffer * Matches;
212    std::tie(LineBreakStream, Matches) = grepPipeline(REs, ByteStream);
213
214    kernel::Kernel * matchCountK = mGrepDriver->addKernelInstance<kernel::PopcountKernel>(idb);
215    mGrepDriver->makeKernelCall(matchCountK, {Matches}, {});
216    mGrepDriver->generatePipelineIR();
217    idb->setKernel(matchCountK);
218    Value * matchedLineCount = idb->getAccumulator("countResult");
219    matchedLineCount = idb->CreateZExt(matchedLineCount, idb->getInt64Ty());
220    mGrepDriver->deallocateBuffers();
221    idb->CreateRet(matchedLineCount);
222    mGrepDriver->finalizeObject();
223}
224
225//
226// The EmitMatches engine uses an EmitMatchesAccumulator object to concatenate together
227// matched lines.
228
229class EmitMatch : public MatchAccumulator {
230    friend class EmitMatchesEngine;
231public:
232    EmitMatch(std::string linePrefix, std::ostringstream & strm) : mLinePrefix(linePrefix), mLineCount(0), mTerminated(true), mResultStr(strm) {}
233    void accumulate_match(const size_t lineNum, char * line_start, char * line_end) override;
234    void finalize_match(char * buffer_end) override;
235protected:
236    std::string mLinePrefix;
237    size_t mLineCount;
238    bool mTerminated;
239    std::ostringstream & mResultStr;
240};
241
242//
243//  Default Report Match:  lines are emitted with whatever line terminators are found in the
244//  input.  However, if the final line is not terminated, a new line is appended.
245//
246void EmitMatch::accumulate_match (const size_t lineNum, char * line_start, char * line_end) {
247    if (WithFilenameFlag) {
248        mResultStr << mLinePrefix;
249    }
250    if (LineNumberFlag) {
251        // Internally line numbers are counted from 0.  For display, adjust
252        // the line number so that lines are numbered from 1.
253        if (InitialTabFlag) {
254            mResultStr << lineNum+1 << "\t:";
255        }
256        else {
257            mResultStr << lineNum+1 << ":";
258        }
259    }
260    size_t bytes = line_end - line_start + 1;
261    mResultStr.write(line_start, bytes);
262    mLineCount++;
263    unsigned last_byte = *line_end;
264    mTerminated = (last_byte >= 0x0A) && (last_byte <= 0x0D);
265    if (LLVM_UNLIKELY(!mTerminated)) {
266        if (last_byte == 0x85) {  //  Possible NEL terminator.
267            mTerminated = (bytes >= 2) && (static_cast<unsigned>(line_end[-1]) == 0xC2);
268        }
269        else {
270            // Possible LS or PS terminators.
271            mTerminated = (bytes >= 3) && (static_cast<unsigned>(line_end[-2]) == 0xE2)
272                                       && (static_cast<unsigned>(line_end[-1]) == 0x80)
273                                       && ((last_byte == 0xA8) || (last_byte == 0xA9));
274        }
275    }
276}
277
278void EmitMatch::finalize_match(char * buffer_end) {
279    if (!mTerminated) mResultStr << "\n";
280}
281
282void EmitMatchesEngine::grepCodeGen(std::vector<re::RE *> REs) {
283    assert (mGrepDriver == nullptr);
284    mGrepDriver = new ParabixDriver("engine");
285    auto & idb = mGrepDriver->getBuilder();
286    Module * M = idb->getModule();
287
288    const auto segmentSize = codegen::SegmentSize;
289    const auto bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
290    const unsigned encodingBits = 8;
291
292    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getInt64Ty(), idb->getInt32Ty(), idb->getIntAddrTy(), nullptr));
293    mainFunc->setCallingConv(CallingConv::C);
294    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
295    auto args = mainFunc->arg_begin();
296
297    Value * const fileDescriptor = &*(args++);
298    fileDescriptor->setName("fileDescriptor");
299    Value * match_accumulator = &*(args++);
300    match_accumulator->setName("match_accumulator");
301
302    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits));
303    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance<kernel::FDSourceKernel>(idb, segmentSize * bufferSegments);
304    sourceK->setInitialArguments({fileDescriptor});
305    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
306
307    StreamSetBuffer * LineBreakStream;
308    StreamSetBuffer * Matches;
309    std::tie(LineBreakStream, Matches) = grepPipeline(REs, ByteStream);
310
311    kernel::Kernel * scanMatchK = mGrepDriver->addKernelInstance<kernel::ScanMatchKernel>(idb);
312    scanMatchK->setInitialArguments({match_accumulator});
313    mGrepDriver->makeKernelCall(scanMatchK, {Matches, LineBreakStream, ByteStream}, {});
314    mGrepDriver->LinkFunction(*scanMatchK, "accumulate_match_wrapper", &accumulate_match_wrapper);
315    mGrepDriver->LinkFunction(*scanMatchK, "finalize_match_wrapper", &finalize_match_wrapper);
316
317    mGrepDriver->generatePipelineIR();
318    mGrepDriver->deallocateBuffers();
319    idb->CreateRet(idb->getInt64(0));
320    mGrepDriver->finalizeObject();
321}
322
323
324//
325//  The doGrep methods apply a GrepEngine to a single file, processing the results
326//  differently based on the engine type.
327
328uint64_t GrepEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
329    typedef uint64_t (*GrepFunctionType)(int32_t fileDescriptor);
330    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
331
332    int32_t fileDescriptor = openFile(fileName, mResultStrs[fileIdx]);
333    if (fileDescriptor == -1) return 0;
334
335    uint64_t grepResult = f(fileDescriptor);
336    close(fileDescriptor);
337    return grepResult;
338}
339
340uint64_t CountOnlyEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
341    uint64_t grepResult = GrepEngine::doGrep(fileName, fileIdx);
342    if (WithFilenameFlag) mResultStrs[fileIdx] << linePrefix(fileName);
343    mResultStrs[fileIdx] << grepResult << "\n";
344    return grepResult;
345}
346
347std::string GrepEngine::linePrefix(std::string fileName) {
348    if (fileName == "-") {
349        return LabelFlag + mFileSuffix;
350    }
351    else {
352        return fileName + mFileSuffix;
353    }
354}
355
356uint64_t MatchOnlyEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
357    uint64_t grepResult = GrepEngine::doGrep(fileName, fileIdx);
358    if (grepResult == mRequiredCount) {
359       mResultStrs[fileIdx] << linePrefix(fileName);
360    }
361    return grepResult;
362}
363
364uint64_t EmitMatchesEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
365    typedef uint64_t (*GrepFunctionType)(int32_t fileDescriptor, intptr_t accum_addr);
366    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
367
368    int32_t fileDescriptor = openFile(fileName, mResultStrs[fileIdx]);
369    if (fileDescriptor == -1) return 0;
370    EmitMatch accum(linePrefix(fileName), mResultStrs[fileIdx]);
371    f(fileDescriptor, reinterpret_cast<intptr_t>(&accum));
372    close(fileDescriptor);
373    if (accum.mLineCount > 0) grepMatchFound = true;
374    return accum.mLineCount;
375}
376
377// Open a file and return its file desciptor.
378int32_t GrepEngine::openFile(const std::string & fileName, std::ostringstream & msgstrm) {
379    if (fileName == "-") {
380        return STDIN_FILENO;
381    }
382    else {
383        struct stat sb;
384        int32_t fileDescriptor = open(fileName.c_str(), O_RDONLY);
385        if (LLVM_UNLIKELY(fileDescriptor == -1)) {
386            if (!NoMessagesFlag) {
387                if (errno == EACCES) {
388                    msgstrm << "icgrep: " << fileName << ": Permission denied.\n";
389                }
390                else if (errno == ENOENT) {
391                    msgstrm << "icgrep: " << fileName << ": No such file.\n";
392                }
393                else {
394                    msgstrm << "icgrep: " << fileName << ": Failed.\n";
395                }
396            }
397            return fileDescriptor;
398        }
399        if (stat(fileName.c_str(), &sb) == 0 && S_ISDIR(sb.st_mode)) {
400            if (!NoMessagesFlag) {
401                msgstrm << "icgrep: " << fileName << ": Is a directory.\n";
402            }
403            close(fileDescriptor);
404            return -1;
405        }
406        return fileDescriptor;
407    }
408}
409
410// The process of searching a group of files may use a sequential or a task
411// parallel approach.
412
413void * DoGrepThreadFunction(void *args) {
414    return reinterpret_cast<GrepEngine *>(args)->DoGrepThreadMethod();
415}
416
417bool GrepEngine::searchAllFiles() {
418    const unsigned numOfThreads = std::min(static_cast<unsigned>(Threads), static_cast<unsigned>(inputFiles.size())); 
419    std::vector<pthread_t> threads(numOfThreads);
420
421    for(unsigned long i = 1; i < numOfThreads; ++i) {
422        const int rc = pthread_create(&threads[i], nullptr, DoGrepThreadFunction, (void *)this);
423        if (rc) {
424            llvm::report_fatal_error("Failed to create thread: code " + std::to_string(rc));
425        }
426    }
427    // Main thread also does the work;
428
429    DoGrepThreadMethod();
430    for(unsigned i = 1; i < numOfThreads; ++i) {
431        void * status = nullptr;
432        const int rc = pthread_join(threads[i], &status);
433        if (rc) {
434            llvm::report_fatal_error("Failed to join thread: code " + std::to_string(rc));
435        }
436    }
437    return grepMatchFound;
438}
439
440
441// DoGrep thread function.
442void * GrepEngine::DoGrepThreadMethod() {
443
444    unsigned fileIdx = mNextFileToGrep++;
445    while (fileIdx < inputFiles.size()) {
446        const auto grepResult = doGrep(inputFiles[fileIdx], fileIdx);
447        mFileStatus[fileIdx] = FileStatus::GrepComplete;
448        if (grepResult > 0) {
449            grepMatchFound = true;
450        }
451        if (QuietMode && grepMatchFound) {
452            if (pthread_self() != mEngineThread) {
453                pthread_exit(nullptr);
454            }
455            return nullptr;
456        }
457        fileIdx = mNextFileToGrep++;
458    }
459
460    unsigned printIdx = mNextFileToPrint++;
461    while (printIdx < inputFiles.size()) {
462        const bool readyToPrint = ((printIdx == 0) || (mFileStatus[printIdx - 1] == FileStatus::PrintComplete)) && (mFileStatus[printIdx] == FileStatus::GrepComplete);
463        if (readyToPrint) {
464            const auto output = mResultStrs[printIdx].str();
465            if (!output.empty()) {
466                llvm::outs() << output;
467            }
468            mFileStatus[printIdx] = FileStatus::PrintComplete;
469            printIdx = mNextFileToPrint++;
470        } else {
471            mGrepDriver->performIncrementalCacheCleanupStep();
472        }
473        sched_yield();
474    }
475
476    if (pthread_self() != mEngineThread) {
477        pthread_exit(nullptr);
478    } else {
479        // Always perform one final cache cleanup step.
480        mGrepDriver->performIncrementalCacheCleanupStep();
481    }
482}
483
484}
Note: See TracBrowser for help on using the repository browser.