source: icGREP/icgrep-devel/icgrep/grep/grep_engine.cpp @ 5908

Last change on this file since 5908 was 5908, checked in by cameron, 13 months ago

Byte-Bit grep kernel optimizes when an RE begins with an initial trigraph

File size: 30.8 KB
Line 
1/*
2 *  Copyright (c) 2018 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6#include <set>
7#include "grep_engine.h"
8#include "grep_interface.h"
9#include <llvm/IR/Module.h>
10#include <boost/filesystem.hpp>
11#include <UCD/resolve_properties.h>
12#include <kernels/charclasses.h>
13#include <kernels/cc_kernel.h>
14#include <kernels/grep_kernel.h>
15#include <kernels/UCD_property_kernel.h>
16#include <kernels/grapheme_kernel.h>
17#include <kernels/linebreak_kernel.h>
18#include <kernels/streams_merge.h>
19#include <kernels/source_kernel.h>
20#include <kernels/s2p_kernel.h>
21#include <kernels/scanmatchgen.h>
22#include <kernels/streamset.h>
23#include <kernels/until_n.h>
24#include <kernels/kernel_builder.h>
25#include <pablo/pablo_kernel.h>
26#include <re/re_cc.h>
27#include <re/re_name.h>
28#include <re/casing.h>
29#include <re/exclude_CC.h>
30#include <re/to_utf8.h>
31#include <re/re_toolchain.h>
32#include <toolchain/toolchain.h>
33#include <re/re_analysis.h>
34#include <re/re_name_resolve.h>
35#include <re/re_name_gather.h>
36#include <re/re_collect_unicodesets.h>
37#include <re/re_multiplex.h>
38#include <re/grapheme_clusters.h>
39#include <re/printer_re.h>
40#include <toolchain/toolchain.h>
41#include <toolchain/cpudriver.h>
42#include <iostream>
43#include <cc/multiplex_CCs.h>
44#include <llvm/Support/raw_ostream.h>
45#include <util/aligned_allocator.h>
46#include <sys/stat.h>
47#include <fcntl.h>
48#include <errno.h>
49#include <llvm/ADT/STLExtras.h> // for make_unique
50#include <llvm/Support/CommandLine.h>
51#include <llvm/Support/Debug.h>
52#include <sched.h>
53
54using namespace parabix;
55using namespace llvm;
56using namespace cc;
57using namespace kernel;
58
59static cl::opt<int> Threads("t", cl::desc("Total number of threads."), cl::init(2));
60static cl::opt<bool> PabloTransposition("enable-pablo-s2p", cl::desc("Enable experimental pablo transposition."));
61static cl::opt<bool> CC_Multiplexing("CC-multiplexing", cl::desc("Enable CC multiplexing."), cl::init(false));
62static cl::opt<bool> PropertyKernels("enable-property-kernels", cl::desc("Enable Unicode property kernels."), cl::init(false));
63
64const unsigned DefaultByteCClimit = 6;
65
66static cl::opt<unsigned> ByteCClimit("byte-CC-limit", cl::desc("Max number of CCs for byte CC pipeline."), cl::init(DefaultByteCClimit));
67
68
69namespace grep {
70   
71
72void accumulate_match_wrapper(intptr_t accum_addr, const size_t lineNum, char * line_start, char * line_end) {
73    reinterpret_cast<MatchAccumulator *>(accum_addr)->accumulate_match(lineNum, line_start, line_end);
74}
75
76void finalize_match_wrapper(intptr_t accum_addr, char * buffer_end) {
77    reinterpret_cast<MatchAccumulator *>(accum_addr)->finalize_match(buffer_end);
78}
79
80void grepBuffer(re::RE * pattern, const char * search_buffer, size_t bufferLength, MatchAccumulator * accum) {
81    const unsigned segmentSize = codegen::BufferSegments * codegen::SegmentSize * codegen::ThreadNum;
82   
83    pattern = resolveCaseInsensitiveMode(pattern, false);
84    pattern = regular_expression_passes(pattern);
85    pattern = re::exclude_CC(pattern, re::makeByte(0x0A));
86    pattern = resolveAnchors(pattern, re::makeByte(0x0A));
87
88    ParabixDriver pxDriver("codepointEngine");
89    auto & idb = pxDriver.getBuilder();
90    Module * M = idb->getModule();
91   
92    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getVoidTy(), idb->getInt8PtrTy(), idb->getSizeTy(), nullptr));
93    mainFunc->setCallingConv(CallingConv::C);
94    auto args = mainFunc->arg_begin();
95    Value * const buffer = &*(args++);
96    buffer->setName("buffer");
97    Value * length = &*(args++);
98    length->setName("length");
99   
100    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
101   
102    StreamSetBuffer * ByteStream = pxDriver.addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, 8));
103    kernel::Kernel * sourceK = pxDriver.addKernelInstance<kernel::MemorySourceKernel>(idb, idb->getInt8PtrTy());
104    sourceK->setInitialArguments({buffer, length});
105    pxDriver.makeKernelCall(sourceK, {}, {ByteStream});
106   
107   
108    StreamSetBuffer * BasisBits = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(8, 1), segmentSize);
109    kernel::Kernel * s2pk = pxDriver.addKernelInstance<kernel::S2PKernel>(idb);
110    pxDriver.makeKernelCall(s2pk, {ByteStream}, {BasisBits});
111   
112    StreamSetBuffer * LineFeedStream = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
113    kernel::Kernel * linefeedK = pxDriver.addKernelInstance<kernel::LineFeedKernelBuilder>(idb, Binding{idb->getStreamSetTy(8), "basis", FixedRate(), Principal()});
114    pxDriver.makeKernelCall(linefeedK, {BasisBits}, {LineFeedStream});
115   
116    StreamSetBuffer * LineBreakStream = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
117   
118    kernel::Kernel * requiredStreamsK = pxDriver.addKernelInstance<kernel::RequiredStreams_UTF8>(idb);
119    StreamSetBuffer * RequiredStreams = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
120    pxDriver.makeKernelCall(requiredStreamsK, {BasisBits, LineFeedStream}, {RequiredStreams, LineBreakStream});
121   
122    StreamSetBuffer * MatchResults = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
123    kernel::Kernel * icgrepK = pxDriver.addKernelInstance<kernel::ICGrepKernel>(idb, pattern, std::vector<std::string>{"UTF8_LB", "UTF8_nonfinal"});
124    pxDriver.makeKernelCall(icgrepK, {BasisBits, LineBreakStream, RequiredStreams}, {MatchResults});
125   
126    StreamSetBuffer * MatchedLines = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
127    kernel::Kernel * matchedLinesK = pxDriver.addKernelInstance<kernel::MatchedLinesKernel>(idb);
128    pxDriver.makeKernelCall(matchedLinesK, {MatchResults, LineBreakStream}, {MatchedLines});
129   
130    kernel::Kernel * scanMatchK = pxDriver.addKernelInstance<kernel::ScanMatchKernel>(idb);
131    scanMatchK->setInitialArguments({ConstantInt::get(idb->getIntAddrTy(), reinterpret_cast<intptr_t>(accum))});
132    pxDriver.makeKernelCall(scanMatchK, {MatchedLines, LineBreakStream, ByteStream}, {});
133    pxDriver.LinkFunction(*scanMatchK, "accumulate_match_wrapper", &accumulate_match_wrapper);
134    pxDriver.LinkFunction(*scanMatchK, "finalize_match_wrapper", &finalize_match_wrapper);
135   
136    pxDriver.generatePipelineIR();
137    pxDriver.deallocateBuffers();
138    idb->CreateRetVoid();
139    pxDriver.finalizeObject();
140   
141    typedef void (*GrepFunctionType)(const char * buffer, const size_t length);
142    auto f = reinterpret_cast<GrepFunctionType>(pxDriver.getMain());
143    f(search_buffer, bufferLength);
144}
145
146
147
148// Grep Engine construction and initialization.
149
150GrepEngine::GrepEngine() :
151    mGrepDriver(nullptr),
152    mNextFileToGrep(0),
153    mNextFileToPrint(0),
154    grepMatchFound(false),
155    mGrepRecordBreak(GrepRecordBreakKind::LF),
156    mMoveMatchesToEOL(true),
157    mEngineThread(pthread_self()) {}
158
159GrepEngine::~GrepEngine() {
160    delete mGrepDriver;
161}
162
163QuietModeEngine::QuietModeEngine() : GrepEngine() {
164    mMoveMatchesToEOL = false;
165}
166
167MatchOnlyEngine::MatchOnlyEngine(bool showFilesWithoutMatch) :
168    GrepEngine(), mRequiredCount(showFilesWithoutMatch) {
169    mFileSuffix = NullFlag ? std::string("\0", 1) : "\n";
170    mMoveMatchesToEOL = false;
171}
172
173CountOnlyEngine::CountOnlyEngine() : GrepEngine() {
174    mFileSuffix = ":";
175}
176
177EmitMatchesEngine::EmitMatchesEngine() : GrepEngine() {
178    mFileSuffix = InitialTabFlag ? "\t:" : ":";
179    if (LineRegexpFlag) mMoveMatchesToEOL = false;
180}
181
182   
183void GrepEngine::setRecordBreak(GrepRecordBreakKind b) {
184    mGrepRecordBreak = b;
185}
186
187void GrepEngine::initFileResult(std::vector<std::string> & filenames) {
188    const unsigned n = filenames.size();
189    mResultStrs.resize(n);
190    mFileStatus.resize(n, FileStatus::Pending);
191    inputFiles = filenames;
192}
193
194// Code Generation
195//
196// All engines share a common pipeline to compute a stream of Matches from a given input Bytestream.
197
198unsigned LLVM_READNONE calculateMaxCountRate(const std::unique_ptr<kernel::KernelBuilder> & b) {
199    const unsigned packSize = b->getSizeTy()->getBitWidth();
200    return (packSize * packSize) / b->getBitBlockWidth();
201}
202   
203std::pair<StreamSetBuffer *, StreamSetBuffer *> GrepEngine::grepPipeline(std::vector<re::RE *> & REs, StreamSetBuffer * ByteStream) {
204    auto & idb = mGrepDriver->getBuilder();
205    const unsigned segmentSize = codegen::SegmentSize;
206    const unsigned bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
207    // TODO: until we automate stream buffer sizing, use this calculation to determine how large our matches buffer needs to be.
208    const unsigned baseBufferSize = segmentSize * (MaxCountFlag > 0 ? (std::max(bufferSegments, calculateMaxCountRate(idb))) : bufferSegments);
209    const unsigned encodingBits = 8;
210   
211   
212    //  Regular Expression Processing and Analysis Phase
213    const auto nREs = REs.size();
214    bool hasGCB[nREs];
215    bool anyGCB = false;
216   
217    std::set<re::Name *> UnicodeProperties;
218   
219    re::CC * breakCC = nullptr;
220    std::string breakName;
221    if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
222        breakCC = re::makeCC(re::makeCC(0x0A, 0x0D), re::makeCC(re::makeCC(0x85), re::makeCC(0x2028, 0x2029)));
223    } else if (mGrepRecordBreak == GrepRecordBreakKind::Null) {
224        breakCC = re::makeByte(0);  // Null
225    } else {
226        breakCC = re::makeByte(0x0A); // LF
227    }
228    re::RE * anchorRE = breakCC;
229    if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
230        re::Name * anchorName = re::makeName("UTF8_LB", re::Name::Type::Unicode);
231        anchorName->setDefinition(UCD::UnicodeBreakRE());
232        anchorRE = anchorName;
233    }
234
235    for(unsigned i = 0; i < nREs; ++i) {
236        REs[i] = resolveModesAndExternalSymbols(REs[i]);
237        REs[i] = re::exclude_CC(REs[i], breakCC);
238        REs[i] = resolveAnchors(REs[i], anchorRE);
239        re::gatherUnicodeProperties(REs[i], UnicodeProperties);
240        REs[i] = regular_expression_passes(REs[i]);
241        hasGCB[i] = hasGraphemeClusterBoundary(REs[i]);
242        anyGCB |= hasGCB[i];
243    }
244   
245    StreamSetBuffer * LineBreakStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
246    std::vector<StreamSetBuffer *> MatchResultsBufs(nREs);
247   
248    re::RE * prefixRE;
249    re::RE * suffixRE;
250    // For simple regular expressions with a small number of characters, we
251    // can bypass transposition and use the Direct CC compiler.
252    bool isSimple = (nREs == 1) && (mGrepRecordBreak != GrepRecordBreakKind::Unicode) && (!anyGCB);
253    if (isSimple) {
254        REs[0] = toUTF8(REs[0]);
255    }
256    if (isSimple && byteTestsWithinLimit(REs[0], ByteCClimit)) {
257        StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
258        kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ByteGrepKernel>(idb, REs[0]);
259        mGrepDriver->makeKernelCall(icgrepK, {ByteStream}, {MatchResults});
260        MatchResultsBufs[0] = MatchResults;
261        kernel::Kernel * breakK = mGrepDriver->addKernelInstance<kernel::DirectCharacterClassKernelBuilder>(idb, "breakCC", std::vector<re::CC *>{breakCC}, 1);
262        mGrepDriver->makeKernelCall(breakK, {ByteStream}, {LineBreakStream});
263    } else if (isSimple && hasTriCCwithinLimit(REs[0], ByteCClimit, prefixRE, suffixRE)) {
264        StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
265        kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ByteBitGrepKernel>(idb, prefixRE, suffixRE);
266        mGrepDriver->makeKernelCall(icgrepK, {ByteStream}, {MatchResults});
267        MatchResultsBufs[0] = MatchResults;
268        kernel::Kernel * breakK = mGrepDriver->addKernelInstance<kernel::DirectCharacterClassKernelBuilder>(idb, "breakCC", std::vector<re::CC *>{breakCC}, 1);
269        mGrepDriver->makeKernelCall(breakK, {ByteStream}, {LineBreakStream});
270    } else {
271        StreamSetBuffer * BasisBits = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(encodingBits, 1), baseBufferSize);
272        kernel::Kernel * s2pk = nullptr;
273        if (PabloTransposition) {
274            s2pk = mGrepDriver->addKernelInstance<kernel::S2P_PabloKernel>(idb);
275        }
276        else {
277            s2pk = mGrepDriver->addKernelInstance<kernel::S2PKernel>(idb);
278        }
279        mGrepDriver->makeKernelCall(s2pk, {ByteStream}, {BasisBits});
280
281        StreamSetBuffer * RequiredStreams = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
282        StreamSetBuffer * UnicodeLB = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
283
284        StreamSetBuffer * LineFeedStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
285        kernel::Kernel * linefeedK = mGrepDriver->addKernelInstance<kernel::LineFeedKernelBuilder>(idb, Binding{idb->getStreamSetTy(8), "basis", FixedRate(), Principal()});
286        mGrepDriver->makeKernelCall(linefeedK, {BasisBits}, {LineFeedStream});
287       
288        kernel::Kernel * requiredStreamsK = mGrepDriver->addKernelInstance<kernel::RequiredStreams_UTF8>(idb);
289        mGrepDriver->makeKernelCall(requiredStreamsK, {BasisBits, LineFeedStream}, {RequiredStreams, UnicodeLB});
290
291        if (mGrepRecordBreak == GrepRecordBreakKind::LF) {
292            LineBreakStream = LineFeedStream;
293        } else if (mGrepRecordBreak == GrepRecordBreakKind::Null) {
294            kernel::Kernel * breakK = mGrepDriver->addKernelInstance<kernel::ParabixCharacterClassKernelBuilder>(idb, "Null", std::vector<re::CC *>{breakCC}, 8);
295            mGrepDriver->makeKernelCall(breakK, {BasisBits}, {LineBreakStream});
296        } else {
297            LineBreakStream = UnicodeLB;
298        }
299       
300        std::map<std::string, StreamSetBuffer *> propertyStream;
301        if (PropertyKernels) {
302            for (auto p : UnicodeProperties) {
303                auto name = p->getFullName();
304                StreamSetBuffer * s = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
305                propertyStream.emplace(std::make_pair(name, s));
306                kernel::Kernel * propertyK = mGrepDriver->addKernelInstance<kernel::UnicodePropertyKernelBuilder>(idb, p);
307                mGrepDriver->makeKernelCall(propertyK, {BasisBits}, {s});
308            }
309        }
310        StreamSetBuffer * GCB_stream = nullptr;
311        if (anyGCB) {
312            GCB_stream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
313            kernel::Kernel * gcbK = mGrepDriver->addKernelInstance<kernel::GraphemeClusterBreakKernel>(idb);
314            mGrepDriver->makeKernelCall(gcbK, {BasisBits, RequiredStreams}, {GCB_stream});
315        }
316
317        for(unsigned i = 0; i < nREs; ++i) {
318            std::vector<std::string> externalStreamNames;
319            std::vector<StreamSetBuffer *> icgrepInputSets = {BasisBits};
320            if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
321                externalStreamNames.push_back("UTF8_LB");
322                icgrepInputSets.push_back(LineBreakStream);
323                externalStreamNames.push_back("UTF8_nonfinal");
324                icgrepInputSets.push_back(RequiredStreams);
325            }
326            std::set<re::Name *> UnicodeProperties;
327            if (PropertyKernels) {
328                re::gatherUnicodeProperties(REs[i], UnicodeProperties);
329                for (auto p : UnicodeProperties) {
330                    auto name = p->getFullName();
331                    auto f = propertyStream.find(name);
332                    if (f == propertyStream.end()) report_fatal_error(name + " not found\n");
333                    externalStreamNames.push_back(name);
334                    icgrepInputSets.push_back(f->second);
335                }
336            }
337            if (hasGCB[i]) {
338                externalStreamNames.push_back("\\b{g}");
339                icgrepInputSets.push_back(GCB_stream);
340            }
341            if (CC_Multiplexing) {
342                const auto UnicodeSets = re::collectUnicodeSets(REs[i], std::set<re::Name *>({re::makeZeroWidth("\\b{g}")}));
343                StreamSetBuffer * const MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
344                if (UnicodeSets.size() <= 1) {
345                    kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i], externalStreamNames);
346                    mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
347                    MatchResultsBufs[i] = MatchResults;
348                } else {
349                    mpx = make_unique<MultiplexedAlphabet>("mpx", UnicodeSets);
350                    REs[i] = transformCCs(mpx.get(), REs[i]);
351                    std::vector<re::CC *> mpx_basis = mpx->getMultiplexedCCs();
352                    auto numOfCharacterClasses = mpx_basis.size();
353                    StreamSetBuffer * CharClasses = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), baseBufferSize);
354                    kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis));
355                    mGrepDriver->makeKernelCall(ccK, {BasisBits}, {CharClasses});
356    //                kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis), true);
357    //                mGrepDriver->makeKernelCall(ccK, {ByteStream}, {CharClasses});
358                    kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i], externalStreamNames, std::vector<cc::Alphabet *>{mpx.get()});
359                    icgrepInputSets.push_back(CharClasses);
360                    mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
361                    MatchResultsBufs[i] = MatchResults;
362                }
363            } else {
364                StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
365                kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, REs[i], externalStreamNames);
366                mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
367                MatchResultsBufs[i] = MatchResults;
368            }
369        }
370    }
371    StreamSetBuffer * MergedResults = MatchResultsBufs[0];
372    if (REs.size() > 1) {
373        MergedResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
374        kernel::Kernel * streamsMergeK = mGrepDriver->addKernelInstance<kernel::StreamsMerge>(idb, 1, REs.size());
375        mGrepDriver->makeKernelCall(streamsMergeK, MatchResultsBufs, {MergedResults});
376    }
377    StreamSetBuffer * Matches = MergedResults;
378
379    if (mMoveMatchesToEOL) {
380        StreamSetBuffer * OriginalMatches = Matches;
381        kernel::Kernel * matchedLinesK = mGrepDriver->addKernelInstance<kernel::MatchedLinesKernel>(idb);
382        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
383        mGrepDriver->makeKernelCall(matchedLinesK, {OriginalMatches, LineBreakStream}, {Matches});
384    }
385
386    if (InvertMatchFlag) {
387        kernel::Kernel * invertK = mGrepDriver->addKernelInstance<kernel::InvertMatchesKernel>(idb);
388        StreamSetBuffer * OriginalMatches = Matches;
389        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
390        mGrepDriver->makeKernelCall(invertK, {OriginalMatches, LineBreakStream}, {Matches});
391    }
392    if (MaxCountFlag > 0) {
393        kernel::Kernel * untilK = mGrepDriver->addKernelInstance<kernel::UntilNkernel>(idb);
394        untilK->setInitialArguments({idb->getSize(MaxCountFlag)});
395        StreamSetBuffer * const AllMatches = Matches;
396        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
397        mGrepDriver->makeKernelCall(untilK, {AllMatches}, {Matches});
398    }
399    return std::pair<StreamSetBuffer *, StreamSetBuffer *>(LineBreakStream, Matches);
400}
401
402// The QuietMode, MatchOnly and CountOnly engines share a common code generation main function,
403// which returns a count of the matches found (possibly subject to a MaxCount).
404//
405
406void GrepEngine::grepCodeGen(std::vector<re::RE *> REs) {
407
408    assert (mGrepDriver == nullptr);
409    mGrepDriver = new ParabixDriver("engine");
410    auto & idb = mGrepDriver->getBuilder();
411    Module * M = idb->getModule();
412
413    const unsigned encodingBits = 8;
414
415    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getInt64Ty(), idb->getInt32Ty(), nullptr));
416    mainFunc->setCallingConv(CallingConv::C);
417    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
418    auto args = mainFunc->arg_begin();
419
420    Value * const fileDescriptor = &*(args++);
421    fileDescriptor->setName("fileDescriptor");
422
423    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits));
424    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance<kernel::FDSourceKernel>(idb);
425    sourceK->setInitialArguments({fileDescriptor});
426    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
427
428    StreamSetBuffer * LineBreakStream;
429    StreamSetBuffer * Matches;
430    std::tie(LineBreakStream, Matches) = grepPipeline(REs, ByteStream);
431
432    kernel::Kernel * matchCountK = mGrepDriver->addKernelInstance<kernel::PopcountKernel>(idb);
433    mGrepDriver->makeKernelCall(matchCountK, {Matches}, {});
434    mGrepDriver->generatePipelineIR();
435    idb->setKernel(matchCountK);
436    Value * matchedLineCount = idb->getAccumulator("countResult");
437    matchedLineCount = idb->CreateZExt(matchedLineCount, idb->getInt64Ty());
438    mGrepDriver->deallocateBuffers();
439    idb->CreateRet(matchedLineCount);
440    mGrepDriver->finalizeObject();
441}
442
443//
444// The EmitMatches engine uses an EmitMatchesAccumulator object to concatenate together
445// matched lines.
446
447class EmitMatch : public MatchAccumulator {
448    friend class EmitMatchesEngine;
449public:
450    EmitMatch(std::string linePrefix, std::ostringstream & strm) : mLinePrefix(linePrefix), mLineCount(0), mTerminated(true), mResultStr(strm) {}
451    void accumulate_match(const size_t lineNum, char * line_start, char * line_end) override;
452    void finalize_match(char * buffer_end) override;
453protected:
454    std::string mLinePrefix;
455    size_t mLineCount;
456    bool mTerminated;
457    std::ostringstream & mResultStr;
458};
459
460//
461//  Default Report Match:  lines are emitted with whatever line terminators are found in the
462//  input.  However, if the final line is not terminated, a new line is appended.
463//
464void EmitMatch::accumulate_match (const size_t lineNum, char * line_start, char * line_end) {
465    if (WithFilenameFlag) {
466        mResultStr << mLinePrefix;
467    }
468    if (LineNumberFlag) {
469        // Internally line numbers are counted from 0.  For display, adjust
470        // the line number so that lines are numbered from 1.
471        if (InitialTabFlag) {
472            mResultStr << lineNum+1 << "\t:";
473        }
474        else {
475            mResultStr << lineNum+1 << ":";
476        }
477    }
478    size_t bytes = line_end - line_start + 1;
479    mResultStr.write(line_start, bytes);
480    mLineCount++;
481    unsigned last_byte = *line_end;
482    mTerminated = (last_byte >= 0x0A) && (last_byte <= 0x0D);
483    if (LLVM_UNLIKELY(!mTerminated)) {
484        if (last_byte == 0x85) {  //  Possible NEL terminator.
485            mTerminated = (bytes >= 2) && (static_cast<unsigned>(line_end[-1]) == 0xC2);
486        }
487        else {
488            // Possible LS or PS terminators.
489            mTerminated = (bytes >= 3) && (static_cast<unsigned>(line_end[-2]) == 0xE2)
490                                       && (static_cast<unsigned>(line_end[-1]) == 0x80)
491                                       && ((last_byte == 0xA8) || (last_byte == 0xA9));
492        }
493    }
494}
495
496void EmitMatch::finalize_match(char * buffer_end) {
497    if (!mTerminated) mResultStr << "\n";
498}
499
500void EmitMatchesEngine::grepCodeGen(std::vector<re::RE *> REs) {
501    assert (mGrepDriver == nullptr);
502    mGrepDriver = new ParabixDriver("engine");
503    auto & idb = mGrepDriver->getBuilder();
504    Module * M = idb->getModule();
505
506    const unsigned encodingBits = 8;
507
508    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getInt64Ty(), idb->getInt32Ty(), idb->getIntAddrTy(), nullptr));
509    mainFunc->setCallingConv(CallingConv::C);
510    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
511    auto args = mainFunc->arg_begin();
512
513    Value * const fileDescriptor = &*(args++);
514    fileDescriptor->setName("fileDescriptor");
515    Value * match_accumulator = &*(args++);
516    match_accumulator->setName("match_accumulator");
517
518    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits));
519    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance<kernel::FDSourceKernel>(idb);
520    sourceK->setInitialArguments({fileDescriptor});
521    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
522
523    StreamSetBuffer * LineBreakStream;
524    StreamSetBuffer * Matches;
525    std::tie(LineBreakStream, Matches) = grepPipeline(REs, ByteStream);
526
527    kernel::Kernel * scanMatchK = mGrepDriver->addKernelInstance<kernel::ScanMatchKernel>(idb);
528    scanMatchK->setInitialArguments({match_accumulator});
529    mGrepDriver->makeKernelCall(scanMatchK, {Matches, LineBreakStream, ByteStream}, {});
530    mGrepDriver->LinkFunction(*scanMatchK, "accumulate_match_wrapper", &accumulate_match_wrapper);
531    mGrepDriver->LinkFunction(*scanMatchK, "finalize_match_wrapper", &finalize_match_wrapper);
532
533    mGrepDriver->generatePipelineIR();
534    mGrepDriver->deallocateBuffers();
535    idb->CreateRet(idb->getInt64(0));
536    mGrepDriver->finalizeObject();
537}
538
539
540//
541//  The doGrep methods apply a GrepEngine to a single file, processing the results
542//  differently based on the engine type.
543
544uint64_t GrepEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
545    typedef uint64_t (*GrepFunctionType)(int32_t fileDescriptor);
546    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
547
548    int32_t fileDescriptor = openFile(fileName, mResultStrs[fileIdx]);
549    if (fileDescriptor == -1) return 0;
550
551    uint64_t grepResult = f(fileDescriptor);
552    close(fileDescriptor);
553    return grepResult;
554}
555
556uint64_t CountOnlyEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
557    uint64_t grepResult = GrepEngine::doGrep(fileName, fileIdx);
558    if (WithFilenameFlag) mResultStrs[fileIdx] << linePrefix(fileName);
559    mResultStrs[fileIdx] << grepResult << "\n";
560    return grepResult;
561}
562
563std::string GrepEngine::linePrefix(std::string fileName) {
564    if (fileName == "-") {
565        return LabelFlag + mFileSuffix;
566    }
567    else {
568        return fileName + mFileSuffix;
569    }
570}
571
572uint64_t MatchOnlyEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
573    uint64_t grepResult = GrepEngine::doGrep(fileName, fileIdx);
574    if (grepResult == mRequiredCount) {
575       mResultStrs[fileIdx] << linePrefix(fileName);
576    }
577    return grepResult;
578}
579
580uint64_t EmitMatchesEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
581    typedef uint64_t (*GrepFunctionType)(int32_t fileDescriptor, intptr_t accum_addr);
582    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
583
584    int32_t fileDescriptor = openFile(fileName, mResultStrs[fileIdx]);
585    if (fileDescriptor == -1) return 0;
586    EmitMatch accum(linePrefix(fileName), mResultStrs[fileIdx]);
587    f(fileDescriptor, reinterpret_cast<intptr_t>(&accum));
588    close(fileDescriptor);
589    if (accum.mLineCount > 0) grepMatchFound = true;
590    return accum.mLineCount;
591}
592
593// Open a file and return its file desciptor.
594int32_t GrepEngine::openFile(const std::string & fileName, std::ostringstream & msgstrm) {
595    if (fileName == "-") {
596        return STDIN_FILENO;
597    }
598    else {
599        struct stat sb;
600        int32_t fileDescriptor = open(fileName.c_str(), O_RDONLY);
601        if (LLVM_UNLIKELY(fileDescriptor == -1)) {
602            if (!NoMessagesFlag) {
603                if (errno == EACCES) {
604                    msgstrm << "icgrep: " << fileName << ": Permission denied.\n";
605                }
606                else if (errno == ENOENT) {
607                    msgstrm << "icgrep: " << fileName << ": No such file.\n";
608                }
609                else {
610                    msgstrm << "icgrep: " << fileName << ": Failed.\n";
611                }
612            }
613            return fileDescriptor;
614        }
615        if (stat(fileName.c_str(), &sb) == 0 && S_ISDIR(sb.st_mode)) {
616            if (!NoMessagesFlag) {
617                msgstrm << "icgrep: " << fileName << ": Is a directory.\n";
618            }
619            close(fileDescriptor);
620            return -1;
621        }
622        return fileDescriptor;
623    }
624}
625
626// The process of searching a group of files may use a sequential or a task
627// parallel approach.
628
629void * DoGrepThreadFunction(void *args) {
630    return reinterpret_cast<GrepEngine *>(args)->DoGrepThreadMethod();
631}
632
633bool GrepEngine::searchAllFiles() {
634    const unsigned numOfThreads = std::min(static_cast<unsigned>(Threads), static_cast<unsigned>(inputFiles.size())); 
635    std::vector<pthread_t> threads(numOfThreads);
636
637    for(unsigned long i = 1; i < numOfThreads; ++i) {
638        const int rc = pthread_create(&threads[i], nullptr, DoGrepThreadFunction, (void *)this);
639        if (rc) {
640            llvm::report_fatal_error("Failed to create thread: code " + std::to_string(rc));
641        }
642    }
643    // Main thread also does the work;
644
645    DoGrepThreadMethod();
646    for(unsigned i = 1; i < numOfThreads; ++i) {
647        void * status = nullptr;
648        const int rc = pthread_join(threads[i], &status);
649        if (rc) {
650            llvm::report_fatal_error("Failed to join thread: code " + std::to_string(rc));
651        }
652    }
653    return grepMatchFound;
654}
655
656
657// DoGrep thread function.
658void * GrepEngine::DoGrepThreadMethod() {
659
660    unsigned fileIdx = mNextFileToGrep++;
661    while (fileIdx < inputFiles.size()) {
662        const auto grepResult = doGrep(inputFiles[fileIdx], fileIdx);
663        mFileStatus[fileIdx] = FileStatus::GrepComplete;
664        if (grepResult > 0) {
665            grepMatchFound = true;
666        }
667        if (QuietMode && grepMatchFound) {
668            if (pthread_self() != mEngineThread) {
669                pthread_exit(nullptr);
670            }
671            return nullptr;
672        }
673        fileIdx = mNextFileToGrep++;
674    }
675
676    unsigned printIdx = mNextFileToPrint++;
677    while (printIdx < inputFiles.size()) {
678        const bool readyToPrint = ((printIdx == 0) || (mFileStatus[printIdx - 1] == FileStatus::PrintComplete)) && (mFileStatus[printIdx] == FileStatus::GrepComplete);
679        if (readyToPrint) {
680            const auto output = mResultStrs[printIdx].str();
681            if (!output.empty()) {
682                llvm::outs() << output;
683            }
684            mFileStatus[printIdx] = FileStatus::PrintComplete;
685            printIdx = mNextFileToPrint++;
686        } else {
687            mGrepDriver->performIncrementalCacheCleanupStep();
688        }
689        sched_yield();
690    }
691
692    if (pthread_self() != mEngineThread) {
693        pthread_exit(nullptr);
694    } else {
695        // Always perform one final cache cleanup step.
696        mGrepDriver->performIncrementalCacheCleanupStep();
697    }
698    return nullptr;
699}
700
701}
Note: See TracBrowser for help on using the repository browser.