source: icGREP/icgrep-devel/icgrep/grep_engine.cpp @ 5801

Last change on this file since 5801 was 5801, checked in by cameron, 15 months ago

Additional Alphabet analysis and transformation

File size: 19.5 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "grep_engine.h"
8#include "grep_interface.h"
9#include <llvm/IR/Module.h>
10#include <boost/filesystem.hpp>
11#include <UCD/resolve_properties.h>
12#include <kernels/charclasses.h>
13#include <kernels/cc_kernel.h>
14#include <kernels/grep_kernel.h>
15#include <kernels/linebreak_kernel.h>
16#include <kernels/streams_merge.h>
17#include <kernels/source_kernel.h>
18#include <kernels/s2p_kernel.h>
19#include <kernels/scanmatchgen.h>
20#include <kernels/streamset.h>
21#include <kernels/until_n.h>
22#include <kernels/kernel_builder.h>
23#include <pablo/pablo_kernel.h>
24#include <re/re_cc.h>
25#include <re/casing.h>
26#include <re/exclude_CC.h>
27#include <re/re_toolchain.h>
28#include <toolchain/toolchain.h>
29#include <re/re_name_resolve.h>
30#include <re/re_collect_unicodesets.h>
31#include <re/re_multiplex.h>
32#include <re/grapheme_clusters.h>
33#include <re/printer_re.h>
34#include <toolchain/toolchain.h>
35#include <toolchain/cpudriver.h>
36#include <iostream>
37#include <cc/multiplex_CCs.h>
38#include <llvm/Support/raw_ostream.h>
39#include <util/aligned_allocator.h>
40#include <sys/stat.h>
41#include <fcntl.h>
42#include <errno.h>
43#include <llvm/ADT/STLExtras.h> // for make_unique
44#include <llvm/Support/CommandLine.h>
45#include <llvm/Support/Debug.h>
46#include <sched.h>
47
48using namespace parabix;
49using namespace llvm;
50using namespace cc;
51
52static cl::opt<int> Threads("t", cl::desc("Total number of threads."), cl::init(2));
53
54namespace grep {
55
56// Grep Engine construction and initialization.
57
58GrepEngine::GrepEngine() :
59    mGrepDriver(nullptr),
60    mNextFileToGrep(0),
61    mNextFileToPrint(0),
62    grepMatchFound(false),
63    mMoveMatchesToEOL(true),
64    mEngineThread(pthread_self()) {}
65
66GrepEngine::~GrepEngine() {
67    delete mGrepDriver;
68}
69
70QuietModeEngine::QuietModeEngine() : GrepEngine() {
71    mMoveMatchesToEOL = false;
72}
73
74MatchOnlyEngine::MatchOnlyEngine(bool showFilesWithoutMatch) :
75    GrepEngine(), mRequiredCount(showFilesWithoutMatch) {
76    mFileSuffix = NullFlag ? std::string("\0", 1) : "\n";
77    mMoveMatchesToEOL = false;
78}
79
80CountOnlyEngine::CountOnlyEngine() : GrepEngine() {
81    mFileSuffix = ":";
82}
83
84EmitMatchesEngine::EmitMatchesEngine() : GrepEngine() {
85    mFileSuffix = InitialTabFlag ? "\t:" : ":";
86    if (LineRegexpFlag) mMoveMatchesToEOL = false;
87}
88
89void GrepEngine::initFileResult(std::vector<std::string> & filenames) {
90    const unsigned n = filenames.size();
91    mResultStrs.resize(n);
92    mFileStatus.resize(n, FileStatus::Pending);
93    inputFiles = filenames;
94}
95
96// Code Generation
97//
98// All engines share a common pipeline to compute a stream of Matches from a given input Bytestream.
99
100std::pair<StreamSetBuffer *, StreamSetBuffer *> GrepEngine::grepPipeline(std::vector<re::RE *> & REs, StreamSetBuffer * ByteStream) {
101    auto & idb = mGrepDriver->getBuilder();
102    const unsigned segmentSize = codegen::SegmentSize;
103    const unsigned bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
104    const unsigned encodingBits = 8;
105
106    StreamSetBuffer * BasisBits = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(encodingBits, 1), segmentSize * bufferSegments);
107    kernel::Kernel * s2pk = mGrepDriver->addKernelInstance<kernel::S2PKernel>(idb);
108    mGrepDriver->makeKernelCall(s2pk, {ByteStream}, {BasisBits});
109
110    StreamSetBuffer * LineFeedStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
111    kernel::Kernel * linefeedK = mGrepDriver->addKernelInstance<kernel::LineFeedKernelBuilder>(idb, encodingBits);
112    mGrepDriver->makeKernelCall(linefeedK, {BasisBits}, {LineFeedStream});
113
114    StreamSetBuffer * LineBreakStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
115    StreamSetBuffer * CRLFStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
116    kernel::Kernel * linebreakK = mGrepDriver->addKernelInstance<kernel::LineBreakKernelBuilder>(idb, encodingBits);
117    mGrepDriver->makeKernelCall(linebreakK, {BasisBits, LineFeedStream}, {LineBreakStream, CRLFStream});
118
119    kernel::Kernel * requiredStreamsK = mGrepDriver->addKernelInstance<kernel::RequiredStreams_UTF8>(idb);
120    StreamSetBuffer * RequiredStreams = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(3, 1), segmentSize * bufferSegments);
121    mGrepDriver->makeKernelCall(requiredStreamsK, {BasisBits}, {RequiredStreams});
122
123    const auto n = REs.size();
124    std::vector<std::vector<re::CC *>> charclasses(n);
125    std::vector<StreamSetBuffer *> MatchResultsBufs(n);
126
127    for(unsigned i = 0; i < n; ++i){
128#define USE_MULTIPLEX_CC
129#ifdef USE_MULTIPLEX_CC
130       
131        REs[i] = multiplexing_prepasses(REs[i]);
132        const std::vector<const re::CC *> UnicodeSets = re::collectUnicodeSets(REs[i]);
133        std::unique_ptr<cc::MultiplexedAlphabet> mpx = make_unique<MultiplexedAlphabet>("mpx", UnicodeSets);
134        REs[i] = transformCCs(mpx.get(), REs[i]);
135        //llvm::errs() << Printer_RE::PrintRE(REs[i]) << '\n';
136        std::vector<re::CC *> mpx_basis = mpx->getMultiplexedCCs();
137        auto numOfCharacterClasses = mpx_basis.size();
138        StreamSetBuffer * CharClasses = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), segmentSize * bufferSegments);
139        kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis));
140        mGrepDriver->makeKernelCall(ccK, {BasisBits}, {CharClasses});
141        StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
142        kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i], numOfCharacterClasses);
143        mGrepDriver->makeKernelCall(icgrepK, {CharClasses, LineBreakStream, CRLFStream, RequiredStreams}, {MatchResults});
144#else
145        REs[i] = regular_expression_passes(REs[i]);
146        StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
147        kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i]);
148        mGrepDriver->makeKernelCall(icgrepK, {BasisBits, LineBreakStream, CRLFStream, RequiredStreams}, {MatchResults});
149#endif
150        MatchResultsBufs[i] = MatchResults;
151    }
152    StreamSetBuffer * MergedResults = MatchResultsBufs[0];
153    if (REs.size() > 1) {
154        MergedResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
155        kernel::Kernel * streamsMergeK = mGrepDriver->addKernelInstance<kernel::StreamsMerge>(idb, 1, REs.size());
156        mGrepDriver->makeKernelCall(streamsMergeK, MatchResultsBufs, {MergedResults});
157    }
158    StreamSetBuffer * Matches = MergedResults;
159
160    if (mMoveMatchesToEOL) {
161        StreamSetBuffer * OriginalMatches = Matches;
162        kernel::Kernel * matchedLinesK = mGrepDriver->addKernelInstance<kernel::MatchedLinesKernel>(idb);
163        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
164        mGrepDriver->makeKernelCall(matchedLinesK, {OriginalMatches, LineBreakStream}, {Matches});
165    }
166
167    if (InvertMatchFlag) {
168        kernel::Kernel * invertK = mGrepDriver->addKernelInstance<kernel::InvertMatchesKernel>(idb);
169        StreamSetBuffer * OriginalMatches = Matches;
170        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
171        mGrepDriver->makeKernelCall(invertK, {OriginalMatches, LineBreakStream}, {Matches});
172    }
173    if (MaxCountFlag > 0) {
174        kernel::Kernel * untilK = mGrepDriver->addKernelInstance<kernel::UntilNkernel>(idb);
175        untilK->setInitialArguments({idb->getSize(MaxCountFlag)});
176        StreamSetBuffer * AllMatches = Matches;
177        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments);
178        mGrepDriver->makeKernelCall(untilK, {AllMatches}, {Matches});
179    }
180    return std::pair<StreamSetBuffer *, StreamSetBuffer *>(LineBreakStream, Matches);
181}
182
183// The QuietMode, MatchOnly and CountOnly engines share a common code generation main function,
184// which returns a count of the matches found (possibly subject to a MaxCount).
185//
186
187void GrepEngine::grepCodeGen(std::vector<re::RE *> REs) {
188
189    assert (mGrepDriver == nullptr);
190    mGrepDriver = new ParabixDriver("engine");
191    auto & idb = mGrepDriver->getBuilder();
192    Module * M = idb->getModule();
193
194    const auto segmentSize = codegen::SegmentSize;
195    const auto bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
196
197    const unsigned encodingBits = 8;
198
199    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getInt64Ty(), idb->getInt32Ty(), nullptr));
200    mainFunc->setCallingConv(CallingConv::C);
201    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
202    auto args = mainFunc->arg_begin();
203
204    Value * const fileDescriptor = &*(args++);
205    fileDescriptor->setName("fileDescriptor");
206
207    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits));
208    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance<kernel::FDSourceKernel>(idb, segmentSize * bufferSegments);
209    sourceK->setInitialArguments({fileDescriptor});
210    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
211
212    StreamSetBuffer * LineBreakStream;
213    StreamSetBuffer * Matches;
214    std::tie(LineBreakStream, Matches) = grepPipeline(REs, ByteStream);
215
216    kernel::Kernel * matchCountK = mGrepDriver->addKernelInstance<kernel::PopcountKernel>(idb);
217    mGrepDriver->makeKernelCall(matchCountK, {Matches}, {});
218    mGrepDriver->generatePipelineIR();
219    idb->setKernel(matchCountK);
220    Value * matchedLineCount = idb->getAccumulator("countResult");
221    matchedLineCount = idb->CreateZExt(matchedLineCount, idb->getInt64Ty());
222    mGrepDriver->deallocateBuffers();
223    idb->CreateRet(matchedLineCount);
224    mGrepDriver->finalizeObject();
225}
226
227//
228// The EmitMatches engine uses an EmitMatch accumulator object (a MatchAccumulator)
229// to concatenate together matched lines.
230
231class EmitMatch : public MatchAccumulator {
232    friend class EmitMatchesEngine;
233public:
234    EmitMatch(std::string linePrefix, std::ostringstream & strm) : mLinePrefix(linePrefix), mLineCount(0), mTerminated(true), mResultStr(strm) {}
235    void accumulate_match(const size_t lineNum, char * line_start, char * line_end) override;
236    void finalize_match(char * buffer_end) override;
237protected:
238    std::string mLinePrefix;
239    size_t mLineCount;
240    bool mTerminated;
241    std::ostringstream & mResultStr;
242};
243
244//
245//  Default Report Match:  lines are emitted with whatever line terminators are found in the
246//  input.  However, if the final line is not terminated, a new line is appended.
247//
248void EmitMatch::accumulate_match (const size_t lineNum, char * line_start, char * line_end) {
249    if (WithFilenameFlag) {
250        mResultStr << mLinePrefix;
251    }
252    if (LineNumberFlag) {
253        // Internally line numbers are counted from 0.  For display, adjust
254        // the line number so that lines are numbered from 1.
255        if (InitialTabFlag) {
256            mResultStr << lineNum+1 << "\t:";
257        }
258        else {
259            mResultStr << lineNum+1 << ":";
260        }
261    }
262    size_t bytes = line_end - line_start + 1;
263    mResultStr.write(line_start, bytes);
264    mLineCount++;
265    unsigned last_byte = *line_end;
266    mTerminated = (last_byte >= 0x0A) && (last_byte <= 0x0D);
267    if (LLVM_UNLIKELY(!mTerminated)) {
268        if (last_byte == 0x85) {  //  Possible NEL terminator.
269            mTerminated = (bytes >= 2) && (static_cast<unsigned>(line_end[-1]) == 0xC2);
270        }
271        else {
272            // Possible LS or PS terminators.
273            mTerminated = (bytes >= 3) && (static_cast<unsigned>(line_end[-2]) == 0xE2)
274                                       && (static_cast<unsigned>(line_end[-1]) == 0x80)
275                                       && ((last_byte == 0xA8) || (last_byte == 0xA9));
276        }
277    }
278}
279
280void EmitMatch::finalize_match(char * buffer_end) {
281    if (!mTerminated) mResultStr << "\n";
282}
283
284void EmitMatchesEngine::grepCodeGen(std::vector<re::RE *> REs) {
285    assert (mGrepDriver == nullptr);
286    mGrepDriver = new ParabixDriver("engine");
287    auto & idb = mGrepDriver->getBuilder();
288    Module * M = idb->getModule();
289
290    const auto segmentSize = codegen::SegmentSize;
291    const auto bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
292    const unsigned encodingBits = 8;
293
294    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getInt64Ty(), idb->getInt32Ty(), idb->getIntAddrTy(), nullptr));
295    mainFunc->setCallingConv(CallingConv::C);
296    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
297    auto args = mainFunc->arg_begin();
298
299    Value * const fileDescriptor = &*(args++);
300    fileDescriptor->setName("fileDescriptor");
301    Value * match_accumulator = &*(args++);
302    match_accumulator->setName("match_accumulator");
303
304    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits));
305    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance<kernel::FDSourceKernel>(idb, segmentSize * bufferSegments);
306    sourceK->setInitialArguments({fileDescriptor});
307    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
308
309    StreamSetBuffer * LineBreakStream;
310    StreamSetBuffer * Matches;
311    std::tie(LineBreakStream, Matches) = grepPipeline(REs, ByteStream);
312
313    kernel::Kernel * scanMatchK = mGrepDriver->addKernelInstance<kernel::ScanMatchKernel>(idb);
314    scanMatchK->setInitialArguments({match_accumulator});
315    mGrepDriver->makeKernelCall(scanMatchK, {Matches, LineBreakStream, ByteStream}, {});
316    mGrepDriver->LinkFunction(*scanMatchK, "accumulate_match_wrapper", &accumulate_match_wrapper);
317    mGrepDriver->LinkFunction(*scanMatchK, "finalize_match_wrapper", &finalize_match_wrapper);
318
319    mGrepDriver->generatePipelineIR();
320    mGrepDriver->deallocateBuffers();
321    idb->CreateRet(idb->getInt64(0));
322    mGrepDriver->finalizeObject();
323}
324
325
326//
327//  The doGrep methods apply a GrepEngine to a single file, processing the results
328//  differently based on the engine type.
329
330uint64_t GrepEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
331    typedef uint64_t (*GrepFunctionType)(int32_t fileDescriptor);
332    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
333
334    int32_t fileDescriptor = openFile(fileName, mResultStrs[fileIdx]);
335    if (fileDescriptor == -1) return 0;
336
337    uint64_t grepResult = f(fileDescriptor);
338    close(fileDescriptor);
339    return grepResult;
340}
341
342uint64_t CountOnlyEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
343    uint64_t grepResult = GrepEngine::doGrep(fileName, fileIdx);
344    if (WithFilenameFlag) mResultStrs[fileIdx] << linePrefix(fileName);
345    mResultStrs[fileIdx] << grepResult << "\n";
346    return grepResult;
347}
348
349std::string GrepEngine::linePrefix(std::string fileName) {
350    if (fileName == "-") {
351        return LabelFlag + mFileSuffix;
352    }
353    else {
354        return fileName + mFileSuffix;
355    }
356}
357
358uint64_t MatchOnlyEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
359    uint64_t grepResult = GrepEngine::doGrep(fileName, fileIdx);
360    if (grepResult == mRequiredCount) {
361       mResultStrs[fileIdx] << linePrefix(fileName);
362    }
363    return grepResult;
364}
365
366uint64_t EmitMatchesEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
367    typedef uint64_t (*GrepFunctionType)(int32_t fileDescriptor, intptr_t accum_addr);
368    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
369
370    int32_t fileDescriptor = openFile(fileName, mResultStrs[fileIdx]);
371    if (fileDescriptor == -1) return 0;
372    EmitMatch accum(linePrefix(fileName), mResultStrs[fileIdx]);
373    f(fileDescriptor, reinterpret_cast<intptr_t>(&accum));
374    close(fileDescriptor);
375    if (accum.mLineCount > 0) grepMatchFound = true;
376    return accum.mLineCount;
377}
378
379// Open a file and return its file desciptor.
380int32_t GrepEngine::openFile(const std::string & fileName, std::ostringstream & msgstrm) {
381    if (fileName == "-") {
382        return STDIN_FILENO;
383    }
384    else {
385        struct stat sb;
386        int32_t fileDescriptor = open(fileName.c_str(), O_RDONLY);
387        if (LLVM_UNLIKELY(fileDescriptor == -1)) {
388            if (!NoMessagesFlag) {
389                if (errno == EACCES) {
390                    msgstrm << "icgrep: " << fileName << ": Permission denied.\n";
391                }
392                else if (errno == ENOENT) {
393                    msgstrm << "icgrep: " << fileName << ": No such file.\n";
394                }
395                else {
396                    msgstrm << "icgrep: " << fileName << ": Failed.\n";
397                }
398            }
399            return fileDescriptor;
400        }
401        if (stat(fileName.c_str(), &sb) == 0 && S_ISDIR(sb.st_mode)) {
402            if (!NoMessagesFlag) {
403                msgstrm << "icgrep: " << fileName << ": Is a directory.\n";
404            }
405            close(fileDescriptor);
406            return -1;
407        }
408        return fileDescriptor;
409    }
410}
411
412// The process of searching a group of files may use a sequential or a task
413// parallel approach.
414
415void * DoGrepThreadFunction(void *args) {
416    return reinterpret_cast<GrepEngine *>(args)->DoGrepThreadMethod();
417}
418
419bool GrepEngine::searchAllFiles() {
420    const unsigned numOfThreads = std::min(static_cast<unsigned>(Threads), static_cast<unsigned>(inputFiles.size())); 
421    std::vector<pthread_t> threads(numOfThreads);
422
423    for(unsigned long i = 1; i < numOfThreads; ++i) {
424        const int rc = pthread_create(&threads[i], nullptr, DoGrepThreadFunction, (void *)this);
425        if (rc) {
426            llvm::report_fatal_error("Failed to create thread: code " + std::to_string(rc));
427        }
428    }
429    // Main thread also does the work;
430
431    DoGrepThreadMethod();
432    for(unsigned i = 1; i < numOfThreads; ++i) {
433        void * status = nullptr;
434        const int rc = pthread_join(threads[i], &status);
435        if (rc) {
436            llvm::report_fatal_error("Failed to join thread: code " + std::to_string(rc));
437        }
438    }
439    return grepMatchFound;
440}
441
442
443// DoGrep thread function.
444void * GrepEngine::DoGrepThreadMethod() {
445
446    unsigned fileIdx = mNextFileToGrep++;
447    while (fileIdx < inputFiles.size()) {
448        const auto grepResult = doGrep(inputFiles[fileIdx], fileIdx);
449        mFileStatus[fileIdx] = FileStatus::GrepComplete;
450        if (grepResult > 0) {
451            grepMatchFound = true;
452        }
453        if (QuietMode && grepMatchFound) {
454            if (pthread_self() != mEngineThread) {
455                pthread_exit(nullptr);
456            }
457            return nullptr;
458        }
459        fileIdx = mNextFileToGrep++;
460    }
461
462    unsigned printIdx = mNextFileToPrint++;
463    while (printIdx < inputFiles.size()) {
464        const bool readyToPrint = ((printIdx == 0) || (mFileStatus[printIdx - 1] == FileStatus::PrintComplete)) && (mFileStatus[printIdx] == FileStatus::GrepComplete);
465        if (readyToPrint) {
466            const auto output = mResultStrs[printIdx].str();
467            if (!output.empty()) {
468                llvm::outs() << output;
469            }
470            mFileStatus[printIdx] = FileStatus::PrintComplete;
471            printIdx = mNextFileToPrint++;
472        } else {
473            mGrepDriver->performIncrementalCacheCleanupStep();
474        }
475        sched_yield();
476    }
477
478    if (pthread_self() != mEngineThread) {
479        pthread_exit(nullptr);
480    } else {
481        // Always perform one final cache cleanup step.
482        mGrepDriver->performIncrementalCacheCleanupStep();
483    }
484}
485
486}
Note: See TracBrowser for help on using the repository browser.