source: icGREP/icgrep-devel/icgrep/grep/grep_engine.cpp @ 5902

Last change on this file since 5902 was 5902, checked in by cameron, 14 months ago

Initial deployment of bytegrep kernel in icgrep

File size: 29.8 KB
Line 
1/*
2 *  Copyright (c) 2018 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6#include <set>
7#include "grep_engine.h"
8#include "grep_interface.h"
9#include <llvm/IR/Module.h>
10#include <boost/filesystem.hpp>
11#include <UCD/resolve_properties.h>
12#include <kernels/charclasses.h>
13#include <kernels/cc_kernel.h>
14#include <kernels/grep_kernel.h>
15#include <kernels/UCD_property_kernel.h>
16#include <kernels/grapheme_kernel.h>
17#include <kernels/linebreak_kernel.h>
18#include <kernels/streams_merge.h>
19#include <kernels/source_kernel.h>
20#include <kernels/s2p_kernel.h>
21#include <kernels/scanmatchgen.h>
22#include <kernels/streamset.h>
23#include <kernels/until_n.h>
24#include <kernels/kernel_builder.h>
25#include <pablo/pablo_kernel.h>
26#include <re/re_cc.h>
27#include <re/re_name.h>
28#include <re/casing.h>
29#include <re/exclude_CC.h>
30#include <re/to_utf8.h>
31#include <re/re_toolchain.h>
32#include <toolchain/toolchain.h>
33#include <re/re_analysis.h>
34#include <re/re_name_resolve.h>
35#include <re/re_name_gather.h>
36#include <re/re_collect_unicodesets.h>
37#include <re/re_multiplex.h>
38#include <re/grapheme_clusters.h>
39#include <re/printer_re.h>
40#include <toolchain/toolchain.h>
41#include <toolchain/cpudriver.h>
42#include <iostream>
43#include <cc/multiplex_CCs.h>
44#include <llvm/Support/raw_ostream.h>
45#include <util/aligned_allocator.h>
46#include <sys/stat.h>
47#include <fcntl.h>
48#include <errno.h>
49#include <llvm/ADT/STLExtras.h> // for make_unique
50#include <llvm/Support/CommandLine.h>
51#include <llvm/Support/Debug.h>
52#include <sched.h>
53
54using namespace parabix;
55using namespace llvm;
56using namespace cc;
57using namespace kernel;
58
59static cl::opt<int> Threads("t", cl::desc("Total number of threads."), cl::init(2));
60static cl::opt<bool> PabloTransposition("enable-pablo-s2p", cl::desc("Enable experimental pablo transposition."));
61static cl::opt<bool> CC_Multiplexing("CC-multiplexing", cl::desc("Enable CC multiplexing."), cl::init(false));
62static cl::opt<bool> PropertyKernels("enable-property-kernels", cl::desc("Enable Unicode property kernels."), cl::init(false));
63
64
65namespace grep {
66   
67
68void accumulate_match_wrapper(intptr_t accum_addr, const size_t lineNum, char * line_start, char * line_end) {
69    reinterpret_cast<MatchAccumulator *>(accum_addr)->accumulate_match(lineNum, line_start, line_end);
70}
71
72void finalize_match_wrapper(intptr_t accum_addr, char * buffer_end) {
73    reinterpret_cast<MatchAccumulator *>(accum_addr)->finalize_match(buffer_end);
74}
75
76void grepBuffer(re::RE * pattern, const char * search_buffer, size_t bufferLength, MatchAccumulator * accum) {
77    const unsigned segmentSize = codegen::BufferSegments * codegen::SegmentSize * codegen::ThreadNum;
78   
79    pattern = resolveCaseInsensitiveMode(pattern, false);
80    pattern = regular_expression_passes(pattern);
81    pattern = re::exclude_CC(pattern, re::makeByte(0x0A));
82    pattern = resolveAnchors(pattern, re::makeByte(0x0A));
83
84    ParabixDriver pxDriver("codepointEngine");
85    auto & idb = pxDriver.getBuilder();
86    Module * M = idb->getModule();
87   
88    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getVoidTy(), idb->getInt8PtrTy(), idb->getSizeTy(), nullptr));
89    mainFunc->setCallingConv(CallingConv::C);
90    auto args = mainFunc->arg_begin();
91    Value * const buffer = &*(args++);
92    buffer->setName("buffer");
93    Value * length = &*(args++);
94    length->setName("length");
95   
96    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
97   
98    StreamSetBuffer * ByteStream = pxDriver.addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, 8));
99    kernel::Kernel * sourceK = pxDriver.addKernelInstance<kernel::MemorySourceKernel>(idb, idb->getInt8PtrTy());
100    sourceK->setInitialArguments({buffer, length});
101    pxDriver.makeKernelCall(sourceK, {}, {ByteStream});
102   
103   
104    StreamSetBuffer * BasisBits = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(8, 1), segmentSize);
105    kernel::Kernel * s2pk = pxDriver.addKernelInstance<kernel::S2PKernel>(idb);
106    pxDriver.makeKernelCall(s2pk, {ByteStream}, {BasisBits});
107   
108    StreamSetBuffer * LineFeedStream = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
109    kernel::Kernel * linefeedK = pxDriver.addKernelInstance<kernel::LineFeedKernelBuilder>(idb, Binding{idb->getStreamSetTy(8), "basis", FixedRate(), Principal()});
110    pxDriver.makeKernelCall(linefeedK, {BasisBits}, {LineFeedStream});
111   
112    StreamSetBuffer * LineBreakStream = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
113   
114    kernel::Kernel * requiredStreamsK = pxDriver.addKernelInstance<kernel::RequiredStreams_UTF8>(idb);
115    StreamSetBuffer * RequiredStreams = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
116    pxDriver.makeKernelCall(requiredStreamsK, {BasisBits, LineFeedStream}, {RequiredStreams, LineBreakStream});
117   
118    StreamSetBuffer * MatchResults = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
119    kernel::Kernel * icgrepK = pxDriver.addKernelInstance<kernel::ICGrepKernel>(idb, pattern, std::vector<std::string>{"UTF8_LB", "UTF8_nonfinal"});
120    pxDriver.makeKernelCall(icgrepK, {BasisBits, LineBreakStream, RequiredStreams}, {MatchResults});
121   
122    StreamSetBuffer * MatchedLines = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
123    kernel::Kernel * matchedLinesK = pxDriver.addKernelInstance<kernel::MatchedLinesKernel>(idb);
124    pxDriver.makeKernelCall(matchedLinesK, {MatchResults, LineBreakStream}, {MatchedLines});
125   
126    kernel::Kernel * scanMatchK = pxDriver.addKernelInstance<kernel::ScanMatchKernel>(idb);
127    scanMatchK->setInitialArguments({ConstantInt::get(idb->getIntAddrTy(), reinterpret_cast<intptr_t>(accum))});
128    pxDriver.makeKernelCall(scanMatchK, {MatchedLines, LineBreakStream, ByteStream}, {});
129    pxDriver.LinkFunction(*scanMatchK, "accumulate_match_wrapper", &accumulate_match_wrapper);
130    pxDriver.LinkFunction(*scanMatchK, "finalize_match_wrapper", &finalize_match_wrapper);
131   
132    pxDriver.generatePipelineIR();
133    pxDriver.deallocateBuffers();
134    idb->CreateRetVoid();
135    pxDriver.finalizeObject();
136   
137    typedef void (*GrepFunctionType)(const char * buffer, const size_t length);
138    auto f = reinterpret_cast<GrepFunctionType>(pxDriver.getMain());
139    f(search_buffer, bufferLength);
140}
141
142
143
144// Grep Engine construction and initialization.
145
146GrepEngine::GrepEngine() :
147    mGrepDriver(nullptr),
148    mNextFileToGrep(0),
149    mNextFileToPrint(0),
150    grepMatchFound(false),
151    mGrepRecordBreak(GrepRecordBreakKind::LF),
152    mMoveMatchesToEOL(true),
153    mEngineThread(pthread_self()) {}
154
155GrepEngine::~GrepEngine() {
156    delete mGrepDriver;
157}
158
159QuietModeEngine::QuietModeEngine() : GrepEngine() {
160    mMoveMatchesToEOL = false;
161}
162
163MatchOnlyEngine::MatchOnlyEngine(bool showFilesWithoutMatch) :
164    GrepEngine(), mRequiredCount(showFilesWithoutMatch) {
165    mFileSuffix = NullFlag ? std::string("\0", 1) : "\n";
166    mMoveMatchesToEOL = false;
167}
168
169CountOnlyEngine::CountOnlyEngine() : GrepEngine() {
170    mFileSuffix = ":";
171}
172
173EmitMatchesEngine::EmitMatchesEngine() : GrepEngine() {
174    mFileSuffix = InitialTabFlag ? "\t:" : ":";
175    if (LineRegexpFlag) mMoveMatchesToEOL = false;
176}
177
178   
179void GrepEngine::setRecordBreak(GrepRecordBreakKind b) {
180    mGrepRecordBreak = b;
181}
182
183void GrepEngine::initFileResult(std::vector<std::string> & filenames) {
184    const unsigned n = filenames.size();
185    mResultStrs.resize(n);
186    mFileStatus.resize(n, FileStatus::Pending);
187    inputFiles = filenames;
188}
189
190// Code Generation
191//
192// All engines share a common pipeline to compute a stream of Matches from a given input Bytestream.
193
194unsigned LLVM_READNONE calculateMaxCountRate(const std::unique_ptr<kernel::KernelBuilder> & b) {
195    const unsigned packSize = b->getSizeTy()->getBitWidth();
196    return (packSize * packSize) / b->getBitBlockWidth();
197}
198
199std::pair<StreamSetBuffer *, StreamSetBuffer *> GrepEngine::grepPipeline(std::vector<re::RE *> & REs, StreamSetBuffer * ByteStream) {
200    auto & idb = mGrepDriver->getBuilder();
201    const unsigned segmentSize = codegen::SegmentSize;
202    const unsigned bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
203    // TODO: until we automate stream buffer sizing, use this calculation to determine how large our matches buffer needs to be.
204    const unsigned baseBufferSize = segmentSize * (MaxCountFlag > 0 ? (std::max(bufferSegments, calculateMaxCountRate(idb))) : bufferSegments);
205    const unsigned encodingBits = 8;
206   
207   
208    //  Regular Expression Processing and Analysis Phase
209    const auto nREs = REs.size();
210    bool hasGCB[nREs];
211    bool anyGCB = false;
212   
213    std::set<re::Name *> UnicodeProperties;
214   
215    re::CC * breakCC = nullptr;
216    std::string breakName;
217    if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
218        breakCC = re::makeCC(re::makeCC(0x0A, 0x0D), re::makeCC(re::makeCC(0x85), re::makeCC(0x2028, 0x2029)));
219    } else if (mGrepRecordBreak == GrepRecordBreakKind::Null) {
220        breakCC = re::makeByte(0);  // Null
221    } else {
222        breakCC = re::makeByte(0x0A); // LF
223    }
224    re::RE * anchorRE = breakCC;
225    if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
226        re::Name * anchorName = re::makeName("UTF8_LB", re::Name::Type::Unicode);
227        anchorName->setDefinition(UCD::UnicodeBreakRE());
228        anchorRE = anchorName;
229    }
230
231    for(unsigned i = 0; i < nREs; ++i) {
232        REs[i] = resolveModesAndExternalSymbols(REs[i]);
233        REs[i] = re::exclude_CC(REs[i], breakCC);
234        REs[i] = resolveAnchors(REs[i], anchorRE);
235        re::gatherUnicodeProperties(REs[i], UnicodeProperties);
236        REs[i] = regular_expression_passes(REs[i]);
237        hasGCB[i] = hasGraphemeClusterBoundary(REs[i]);
238        anyGCB |= hasGCB[i];
239    }
240   
241    StreamSetBuffer * LineBreakStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
242    std::vector<StreamSetBuffer *> MatchResultsBufs(nREs);
243   
244    // For simple regular expressions with a small number of characters, we
245    // can bypass transposition and use the Direct CC compiler.
246    if ((nREs == 1) && (mGrepRecordBreak != GrepRecordBreakKind::Unicode) && (!anyGCB) && byteTestsWithinLimit(REs[0], 6)) {
247        StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
248        kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ByteGrepKernel>(idb, REs[0]);
249        mGrepDriver->makeKernelCall(icgrepK, {ByteStream}, {MatchResults});
250        MatchResultsBufs[0] = MatchResults;
251        kernel::Kernel * breakK = mGrepDriver->addKernelInstance<kernel::DirectCharacterClassKernelBuilder>(idb, "Null", std::vector<re::CC *>{breakCC}, 1);
252        mGrepDriver->makeKernelCall(breakK, {ByteStream}, {LineBreakStream});
253    } else {
254        StreamSetBuffer * BasisBits = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(encodingBits, 1), baseBufferSize);
255        kernel::Kernel * s2pk = nullptr;
256        if (PabloTransposition) {
257            s2pk = mGrepDriver->addKernelInstance<kernel::S2P_PabloKernel>(idb);
258        }
259        else {
260            s2pk = mGrepDriver->addKernelInstance<kernel::S2PKernel>(idb);
261        }
262        mGrepDriver->makeKernelCall(s2pk, {ByteStream}, {BasisBits});
263
264        StreamSetBuffer * RequiredStreams = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
265        StreamSetBuffer * UnicodeLB = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
266
267        StreamSetBuffer * LineFeedStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
268        kernel::Kernel * linefeedK = mGrepDriver->addKernelInstance<kernel::LineFeedKernelBuilder>(idb, Binding{idb->getStreamSetTy(8), "basis", FixedRate(), Principal()});
269        mGrepDriver->makeKernelCall(linefeedK, {BasisBits}, {LineFeedStream});
270       
271        kernel::Kernel * requiredStreamsK = mGrepDriver->addKernelInstance<kernel::RequiredStreams_UTF8>(idb);
272        mGrepDriver->makeKernelCall(requiredStreamsK, {BasisBits, LineFeedStream}, {RequiredStreams, UnicodeLB});
273
274        if (mGrepRecordBreak == GrepRecordBreakKind::LF) {
275            LineBreakStream = LineFeedStream;
276        } else if (mGrepRecordBreak == GrepRecordBreakKind::Null) {
277            kernel::Kernel * breakK = mGrepDriver->addKernelInstance<kernel::ParabixCharacterClassKernelBuilder>(idb, "Null", std::vector<re::CC *>{breakCC}, 8);
278            mGrepDriver->makeKernelCall(breakK, {BasisBits}, {LineBreakStream});
279        } else {
280            LineBreakStream = UnicodeLB;
281        }
282       
283        std::map<std::string, StreamSetBuffer *> propertyStream;
284        if (PropertyKernels) {
285            for (auto p : UnicodeProperties) {
286                auto name = p->getFullName();
287                StreamSetBuffer * s = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
288                propertyStream.emplace(std::make_pair(name, s));
289                kernel::Kernel * propertyK = mGrepDriver->addKernelInstance<kernel::UnicodePropertyKernelBuilder>(idb, p);
290                mGrepDriver->makeKernelCall(propertyK, {BasisBits}, {s});
291            }
292        }
293        StreamSetBuffer * GCB_stream = nullptr;
294        if (anyGCB) {
295            GCB_stream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
296            kernel::Kernel * gcbK = mGrepDriver->addKernelInstance<kernel::GraphemeClusterBreakKernel>(idb);
297            mGrepDriver->makeKernelCall(gcbK, {BasisBits, RequiredStreams}, {GCB_stream});
298        }
299
300        for(unsigned i = 0; i < nREs; ++i) {
301            std::vector<std::string> externalStreamNames;
302            std::vector<StreamSetBuffer *> icgrepInputSets = {BasisBits};
303            if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
304                externalStreamNames.push_back("UTF8_LB");
305                icgrepInputSets.push_back(LineBreakStream);
306                externalStreamNames.push_back("UTF8_nonfinal");
307                icgrepInputSets.push_back(RequiredStreams);
308            }
309            std::set<re::Name *> UnicodeProperties;
310            if (PropertyKernels) {
311                re::gatherUnicodeProperties(REs[i], UnicodeProperties);
312                for (auto p : UnicodeProperties) {
313                    auto name = p->getFullName();
314                    auto f = propertyStream.find(name);
315                    if (f == propertyStream.end()) report_fatal_error(name + " not found\n");
316                    externalStreamNames.push_back(name);
317                    icgrepInputSets.push_back(f->second);
318                }
319            }
320            if (hasGCB[i]) {
321                externalStreamNames.push_back("\\b{g}");
322                icgrepInputSets.push_back(GCB_stream);
323            }
324            if (CC_Multiplexing) {
325                const auto UnicodeSets = re::collectUnicodeSets(REs[i], std::set<re::Name *>({re::makeZeroWidth("\\b{g}")}));
326                StreamSetBuffer * const MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
327                if (UnicodeSets.size() <= 1) {
328                    kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i], externalStreamNames);
329                    mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
330                    MatchResultsBufs[i] = MatchResults;
331                } else {
332                    mpx = make_unique<MultiplexedAlphabet>("mpx", UnicodeSets);
333                    REs[i] = transformCCs(mpx.get(), REs[i]);
334                    std::vector<re::CC *> mpx_basis = mpx->getMultiplexedCCs();
335                    auto numOfCharacterClasses = mpx_basis.size();
336                    StreamSetBuffer * CharClasses = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), baseBufferSize);
337                    kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis));
338                    mGrepDriver->makeKernelCall(ccK, {BasisBits}, {CharClasses});
339    //                kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis), true);
340    //                mGrepDriver->makeKernelCall(ccK, {ByteStream}, {CharClasses});
341                    kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i], externalStreamNames, std::vector<cc::Alphabet *>{mpx.get()});
342                    icgrepInputSets.push_back(CharClasses);
343                    mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
344                    MatchResultsBufs[i] = MatchResults;
345                }
346            } else {
347                StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
348                kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i], externalStreamNames);
349                mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
350                MatchResultsBufs[i] = MatchResults;
351            }
352        }
353    }
354    StreamSetBuffer * MergedResults = MatchResultsBufs[0];
355    if (REs.size() > 1) {
356        MergedResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
357        kernel::Kernel * streamsMergeK = mGrepDriver->addKernelInstance<kernel::StreamsMerge>(idb, 1, REs.size());
358        mGrepDriver->makeKernelCall(streamsMergeK, MatchResultsBufs, {MergedResults});
359    }
360    StreamSetBuffer * Matches = MergedResults;
361
362    if (mMoveMatchesToEOL) {
363        StreamSetBuffer * OriginalMatches = Matches;
364        kernel::Kernel * matchedLinesK = mGrepDriver->addKernelInstance<kernel::MatchedLinesKernel>(idb);
365        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
366        mGrepDriver->makeKernelCall(matchedLinesK, {OriginalMatches, LineBreakStream}, {Matches});
367    }
368
369    if (InvertMatchFlag) {
370        kernel::Kernel * invertK = mGrepDriver->addKernelInstance<kernel::InvertMatchesKernel>(idb);
371        StreamSetBuffer * OriginalMatches = Matches;
372        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
373        mGrepDriver->makeKernelCall(invertK, {OriginalMatches, LineBreakStream}, {Matches});
374    }
375    if (MaxCountFlag > 0) {
376        kernel::Kernel * untilK = mGrepDriver->addKernelInstance<kernel::UntilNkernel>(idb);
377        untilK->setInitialArguments({idb->getSize(MaxCountFlag)});
378        StreamSetBuffer * const AllMatches = Matches;
379        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
380        mGrepDriver->makeKernelCall(untilK, {AllMatches}, {Matches});
381    }
382    return std::pair<StreamSetBuffer *, StreamSetBuffer *>(LineBreakStream, Matches);
383}
384
385// The QuietMode, MatchOnly and CountOnly engines share a common code generation main function,
386// which returns a count of the matches found (possibly subject to a MaxCount).
387//
388
389void GrepEngine::grepCodeGen(std::vector<re::RE *> REs) {
390
391    assert (mGrepDriver == nullptr);
392    mGrepDriver = new ParabixDriver("engine");
393    auto & idb = mGrepDriver->getBuilder();
394    Module * M = idb->getModule();
395
396    const unsigned encodingBits = 8;
397
398    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getInt64Ty(), idb->getInt32Ty(), nullptr));
399    mainFunc->setCallingConv(CallingConv::C);
400    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
401    auto args = mainFunc->arg_begin();
402
403    Value * const fileDescriptor = &*(args++);
404    fileDescriptor->setName("fileDescriptor");
405
406    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits));
407    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance<kernel::FDSourceKernel>(idb);
408    sourceK->setInitialArguments({fileDescriptor});
409    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
410
411    StreamSetBuffer * LineBreakStream;
412    StreamSetBuffer * Matches;
413    std::tie(LineBreakStream, Matches) = grepPipeline(REs, ByteStream);
414
415    kernel::Kernel * matchCountK = mGrepDriver->addKernelInstance<kernel::PopcountKernel>(idb);
416    mGrepDriver->makeKernelCall(matchCountK, {Matches}, {});
417    mGrepDriver->generatePipelineIR();
418    idb->setKernel(matchCountK);
419    Value * matchedLineCount = idb->getAccumulator("countResult");
420    matchedLineCount = idb->CreateZExt(matchedLineCount, idb->getInt64Ty());
421    mGrepDriver->deallocateBuffers();
422    idb->CreateRet(matchedLineCount);
423    mGrepDriver->finalizeObject();
424}
425
426//
// The EmitMatches engine uses an EmitMatch accumulator object to concatenate together
// matched lines.
429
430class EmitMatch : public MatchAccumulator {
431    friend class EmitMatchesEngine;
432public:
433    EmitMatch(std::string linePrefix, std::ostringstream & strm) : mLinePrefix(linePrefix), mLineCount(0), mTerminated(true), mResultStr(strm) {}
434    void accumulate_match(const size_t lineNum, char * line_start, char * line_end) override;
435    void finalize_match(char * buffer_end) override;
436protected:
437    std::string mLinePrefix;
438    size_t mLineCount;
439    bool mTerminated;
440    std::ostringstream & mResultStr;
441};
442
443//
444//  Default Report Match:  lines are emitted with whatever line terminators are found in the
445//  input.  However, if the final line is not terminated, a new line is appended.
446//
447void EmitMatch::accumulate_match (const size_t lineNum, char * line_start, char * line_end) {
448    if (WithFilenameFlag) {
449        mResultStr << mLinePrefix;
450    }
451    if (LineNumberFlag) {
452        // Internally line numbers are counted from 0.  For display, adjust
453        // the line number so that lines are numbered from 1.
454        if (InitialTabFlag) {
455            mResultStr << lineNum+1 << "\t:";
456        }
457        else {
458            mResultStr << lineNum+1 << ":";
459        }
460    }
461    size_t bytes = line_end - line_start + 1;
462    mResultStr.write(line_start, bytes);
463    mLineCount++;
464    unsigned last_byte = *line_end;
465    mTerminated = (last_byte >= 0x0A) && (last_byte <= 0x0D);
466    if (LLVM_UNLIKELY(!mTerminated)) {
467        if (last_byte == 0x85) {  //  Possible NEL terminator.
468            mTerminated = (bytes >= 2) && (static_cast<unsigned>(line_end[-1]) == 0xC2);
469        }
470        else {
471            // Possible LS or PS terminators.
472            mTerminated = (bytes >= 3) && (static_cast<unsigned>(line_end[-2]) == 0xE2)
473                                       && (static_cast<unsigned>(line_end[-1]) == 0x80)
474                                       && ((last_byte == 0xA8) || (last_byte == 0xA9));
475        }
476    }
477}
478
479void EmitMatch::finalize_match(char * buffer_end) {
480    if (!mTerminated) mResultStr << "\n";
481}
482
483void EmitMatchesEngine::grepCodeGen(std::vector<re::RE *> REs) {
484    assert (mGrepDriver == nullptr);
485    mGrepDriver = new ParabixDriver("engine");
486    auto & idb = mGrepDriver->getBuilder();
487    Module * M = idb->getModule();
488
489    const unsigned encodingBits = 8;
490
491    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getInt64Ty(), idb->getInt32Ty(), idb->getIntAddrTy(), nullptr));
492    mainFunc->setCallingConv(CallingConv::C);
493    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
494    auto args = mainFunc->arg_begin();
495
496    Value * const fileDescriptor = &*(args++);
497    fileDescriptor->setName("fileDescriptor");
498    Value * match_accumulator = &*(args++);
499    match_accumulator->setName("match_accumulator");
500
501    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits));
502    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance<kernel::FDSourceKernel>(idb);
503    sourceK->setInitialArguments({fileDescriptor});
504    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
505
506    StreamSetBuffer * LineBreakStream;
507    StreamSetBuffer * Matches;
508    std::tie(LineBreakStream, Matches) = grepPipeline(REs, ByteStream);
509
510    kernel::Kernel * scanMatchK = mGrepDriver->addKernelInstance<kernel::ScanMatchKernel>(idb);
511    scanMatchK->setInitialArguments({match_accumulator});
512    mGrepDriver->makeKernelCall(scanMatchK, {Matches, LineBreakStream, ByteStream}, {});
513    mGrepDriver->LinkFunction(*scanMatchK, "accumulate_match_wrapper", &accumulate_match_wrapper);
514    mGrepDriver->LinkFunction(*scanMatchK, "finalize_match_wrapper", &finalize_match_wrapper);
515
516    mGrepDriver->generatePipelineIR();
517    mGrepDriver->deallocateBuffers();
518    idb->CreateRet(idb->getInt64(0));
519    mGrepDriver->finalizeObject();
520}
521
522
523//
524//  The doGrep methods apply a GrepEngine to a single file, processing the results
525//  differently based on the engine type.
526
527uint64_t GrepEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
528    typedef uint64_t (*GrepFunctionType)(int32_t fileDescriptor);
529    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
530
531    int32_t fileDescriptor = openFile(fileName, mResultStrs[fileIdx]);
532    if (fileDescriptor == -1) return 0;
533
534    uint64_t grepResult = f(fileDescriptor);
535    close(fileDescriptor);
536    return grepResult;
537}
538
539uint64_t CountOnlyEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
540    uint64_t grepResult = GrepEngine::doGrep(fileName, fileIdx);
541    if (WithFilenameFlag) mResultStrs[fileIdx] << linePrefix(fileName);
542    mResultStrs[fileIdx] << grepResult << "\n";
543    return grepResult;
544}
545
546std::string GrepEngine::linePrefix(std::string fileName) {
547    if (fileName == "-") {
548        return LabelFlag + mFileSuffix;
549    }
550    else {
551        return fileName + mFileSuffix;
552    }
553}
554
555uint64_t MatchOnlyEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
556    uint64_t grepResult = GrepEngine::doGrep(fileName, fileIdx);
557    if (grepResult == mRequiredCount) {
558       mResultStrs[fileIdx] << linePrefix(fileName);
559    }
560    return grepResult;
561}
562
563uint64_t EmitMatchesEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
564    typedef uint64_t (*GrepFunctionType)(int32_t fileDescriptor, intptr_t accum_addr);
565    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
566
567    int32_t fileDescriptor = openFile(fileName, mResultStrs[fileIdx]);
568    if (fileDescriptor == -1) return 0;
569    EmitMatch accum(linePrefix(fileName), mResultStrs[fileIdx]);
570    f(fileDescriptor, reinterpret_cast<intptr_t>(&accum));
571    close(fileDescriptor);
572    if (accum.mLineCount > 0) grepMatchFound = true;
573    return accum.mLineCount;
574}
575
// Open a file and return its file descriptor.
577int32_t GrepEngine::openFile(const std::string & fileName, std::ostringstream & msgstrm) {
578    if (fileName == "-") {
579        return STDIN_FILENO;
580    }
581    else {
582        struct stat sb;
583        int32_t fileDescriptor = open(fileName.c_str(), O_RDONLY);
584        if (LLVM_UNLIKELY(fileDescriptor == -1)) {
585            if (!NoMessagesFlag) {
586                if (errno == EACCES) {
587                    msgstrm << "icgrep: " << fileName << ": Permission denied.\n";
588                }
589                else if (errno == ENOENT) {
590                    msgstrm << "icgrep: " << fileName << ": No such file.\n";
591                }
592                else {
593                    msgstrm << "icgrep: " << fileName << ": Failed.\n";
594                }
595            }
596            return fileDescriptor;
597        }
598        if (stat(fileName.c_str(), &sb) == 0 && S_ISDIR(sb.st_mode)) {
599            if (!NoMessagesFlag) {
600                msgstrm << "icgrep: " << fileName << ": Is a directory.\n";
601            }
602            close(fileDescriptor);
603            return -1;
604        }
605        return fileDescriptor;
606    }
607}
608
609// The process of searching a group of files may use a sequential or a task
610// parallel approach.
611
612void * DoGrepThreadFunction(void *args) {
613    return reinterpret_cast<GrepEngine *>(args)->DoGrepThreadMethod();
614}
615
616bool GrepEngine::searchAllFiles() {
617    const unsigned numOfThreads = std::min(static_cast<unsigned>(Threads), static_cast<unsigned>(inputFiles.size())); 
618    std::vector<pthread_t> threads(numOfThreads);
619
620    for(unsigned long i = 1; i < numOfThreads; ++i) {
621        const int rc = pthread_create(&threads[i], nullptr, DoGrepThreadFunction, (void *)this);
622        if (rc) {
623            llvm::report_fatal_error("Failed to create thread: code " + std::to_string(rc));
624        }
625    }
626    // Main thread also does the work;
627
628    DoGrepThreadMethod();
629    for(unsigned i = 1; i < numOfThreads; ++i) {
630        void * status = nullptr;
631        const int rc = pthread_join(threads[i], &status);
632        if (rc) {
633            llvm::report_fatal_error("Failed to join thread: code " + std::to_string(rc));
634        }
635    }
636    return grepMatchFound;
637}
638
639
640// DoGrep thread function.
641void * GrepEngine::DoGrepThreadMethod() {
642
643    unsigned fileIdx = mNextFileToGrep++;
644    while (fileIdx < inputFiles.size()) {
645        const auto grepResult = doGrep(inputFiles[fileIdx], fileIdx);
646        mFileStatus[fileIdx] = FileStatus::GrepComplete;
647        if (grepResult > 0) {
648            grepMatchFound = true;
649        }
650        if (QuietMode && grepMatchFound) {
651            if (pthread_self() != mEngineThread) {
652                pthread_exit(nullptr);
653            }
654            return nullptr;
655        }
656        fileIdx = mNextFileToGrep++;
657    }
658
659    unsigned printIdx = mNextFileToPrint++;
660    while (printIdx < inputFiles.size()) {
661        const bool readyToPrint = ((printIdx == 0) || (mFileStatus[printIdx - 1] == FileStatus::PrintComplete)) && (mFileStatus[printIdx] == FileStatus::GrepComplete);
662        if (readyToPrint) {
663            const auto output = mResultStrs[printIdx].str();
664            if (!output.empty()) {
665                llvm::outs() << output;
666            }
667            mFileStatus[printIdx] = FileStatus::PrintComplete;
668            printIdx = mNextFileToPrint++;
669        } else {
670            mGrepDriver->performIncrementalCacheCleanupStep();
671        }
672        sched_yield();
673    }
674
675    if (pthread_self() != mEngineThread) {
676        pthread_exit(nullptr);
677    } else {
678        // Always perform one final cache cleanup step.
679        mGrepDriver->performIncrementalCacheCleanupStep();
680    }
681    return nullptr;
682}
683
684}
Note: See TracBrowser for help on using the repository browser.