source: icGREP/icgrep-devel/icgrep/grep/grep_engine.cpp @ 5927

Last change on this file since 5927 was 5927, checked in by cameron, 13 months ago

mmap - command line flag + overridden for stdin, devices, files of size 0 (incl virtual files such as /proc/cpuinfo)

File size: 31.6 KB
Line 
1/*
2 *  Copyright (c) 2018 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6#include <set>
7#include "grep_engine.h"
8#include "grep_interface.h"
9#include <llvm/IR/Module.h>
10#include <boost/filesystem.hpp>
11#include <UCD/resolve_properties.h>
12#include <kernels/charclasses.h>
13#include <kernels/cc_kernel.h>
14#include <kernels/grep_kernel.h>
15#include <kernels/UCD_property_kernel.h>
16#include <kernels/grapheme_kernel.h>
17#include <kernels/linebreak_kernel.h>
18#include <kernels/streams_merge.h>
19#include <kernels/source_kernel.h>
20#include <kernels/s2p_kernel.h>
21#include <kernels/scanmatchgen.h>
22#include <kernels/streamset.h>
23#include <kernels/until_n.h>
24#include <kernels/kernel_builder.h>
25#include <pablo/pablo_kernel.h>
26#include <re/re_cc.h>
27#include <re/re_name.h>
28#include <re/casing.h>
29#include <re/exclude_CC.h>
30#include <re/to_utf8.h>
31#include <re/re_toolchain.h>
32#include <toolchain/toolchain.h>
33#include <re/re_analysis.h>
34#include <re/re_name_resolve.h>
35#include <re/re_name_gather.h>
36#include <re/re_collect_unicodesets.h>
37#include <re/re_multiplex.h>
38#include <re/grapheme_clusters.h>
39#include <re/printer_re.h>
40#include <toolchain/toolchain.h>
41#include <toolchain/cpudriver.h>
42#include <iostream>
43#include <cc/multiplex_CCs.h>
44#include <llvm/Support/raw_ostream.h>
45#include <util/aligned_allocator.h>
46#include <sys/stat.h>
47#include <fcntl.h>
48#include <errno.h>
49#include <llvm/ADT/STLExtras.h> // for make_unique
50#include <llvm/Support/CommandLine.h>
51#include <llvm/Support/Debug.h>
52#include <sched.h>
53
54using namespace parabix;
55using namespace llvm;
56using namespace cc;
57using namespace kernel;
58
59static cl::opt<int> Threads("t", cl::desc("Total number of threads."), cl::init(2));
60static cl::opt<bool> PabloTransposition("enable-pablo-s2p", cl::desc("Enable experimental pablo transposition."));
61static cl::opt<bool> CC_Multiplexing("CC-multiplexing", cl::desc("Enable CC multiplexing."), cl::init(false));
62static cl::opt<bool> PropertyKernels("enable-property-kernels", cl::desc("Enable Unicode property kernels."), cl::init(false));
63
64const unsigned DefaultByteCClimit = 6;
65
66static cl::opt<unsigned> ByteCClimit("byte-CC-limit", cl::desc("Max number of CCs for byte CC pipeline."), cl::init(DefaultByteCClimit));
67
68
69namespace grep {
70   
71
72extern "C" void accumulate_match_wrapper(intptr_t accum_addr, const size_t lineNum, char * line_start, char * line_end) {
73    reinterpret_cast<MatchAccumulator *>(accum_addr)->accumulate_match(lineNum, line_start, line_end);
74}
75
76extern "C" void finalize_match_wrapper(intptr_t accum_addr, char * buffer_end) {
77    reinterpret_cast<MatchAccumulator *>(accum_addr)->finalize_match(buffer_end);
78}
79
80void grepBuffer(re::RE * pattern, const char * search_buffer, size_t bufferLength, MatchAccumulator * accum) {
81    const unsigned segmentSize = codegen::BufferSegments * codegen::SegmentSize * codegen::ThreadNum;
82   
83    pattern = resolveCaseInsensitiveMode(pattern, false);
84    pattern = regular_expression_passes(pattern);
85    pattern = re::exclude_CC(pattern, re::makeByte(0x0A));
86    pattern = resolveAnchors(pattern, re::makeByte(0x0A));
87
88    ParabixDriver pxDriver("codepointEngine");
89    auto & idb = pxDriver.getBuilder();
90    Module * M = idb->getModule();
91   
92    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getVoidTy(), idb->getInt8PtrTy(), idb->getSizeTy(), nullptr));
93    mainFunc->setCallingConv(CallingConv::C);
94    auto args = mainFunc->arg_begin();
95    Value * const buffer = &*(args++);
96    buffer->setName("buffer");
97    Value * length = &*(args++);
98    length->setName("length");
99   
100    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
101    StreamSetBuffer * ByteStream = pxDriver.addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, 8));
102    kernel::Kernel * sourceK = pxDriver.addKernelInstance<kernel::MemorySourceKernel>(idb, idb->getInt8PtrTy());
103    sourceK->setInitialArguments({buffer, length});
104    pxDriver.makeKernelCall(sourceK, {}, {ByteStream});
105   
106   
107    StreamSetBuffer * BasisBits = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(8, 1), segmentSize);
108    kernel::Kernel * s2pk = pxDriver.addKernelInstance<kernel::S2PKernel>(idb);
109    pxDriver.makeKernelCall(s2pk, {ByteStream}, {BasisBits});
110   
111    StreamSetBuffer * LineFeedStream = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
112    kernel::Kernel * linefeedK = pxDriver.addKernelInstance<kernel::LineFeedKernelBuilder>(idb, Binding{idb->getStreamSetTy(8), "basis", FixedRate(), Principal()});
113    pxDriver.makeKernelCall(linefeedK, {BasisBits}, {LineFeedStream});
114   
115    StreamSetBuffer * LineBreakStream = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
116   
117    kernel::Kernel * requiredStreamsK = pxDriver.addKernelInstance<kernel::RequiredStreams_UTF8>(idb);
118    StreamSetBuffer * RequiredStreams = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
119    pxDriver.makeKernelCall(requiredStreamsK, {BasisBits, LineFeedStream}, {RequiredStreams, LineBreakStream});
120   
121    StreamSetBuffer * MatchResults = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
122    kernel::Kernel * icgrepK = pxDriver.addKernelInstance<kernel::ICGrepKernel>(idb, pattern, std::vector<std::string>{"UTF8_LB", "UTF8_nonfinal"});
123    pxDriver.makeKernelCall(icgrepK, {BasisBits, LineBreakStream, RequiredStreams}, {MatchResults});
124   
125    StreamSetBuffer * MatchedLines = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
126    kernel::Kernel * matchedLinesK = pxDriver.addKernelInstance<kernel::MatchedLinesKernel>(idb);
127    pxDriver.makeKernelCall(matchedLinesK, {MatchResults, LineBreakStream}, {MatchedLines});
128   
129    kernel::Kernel * scanMatchK = pxDriver.addKernelInstance<kernel::ScanMatchKernel>(idb);
130    scanMatchK->setInitialArguments({ConstantInt::get(idb->getIntAddrTy(), reinterpret_cast<intptr_t>(accum))});
131    pxDriver.makeKernelCall(scanMatchK, {MatchedLines, LineBreakStream, ByteStream}, {});
132    pxDriver.LinkFunction(*scanMatchK, "accumulate_match_wrapper", &accumulate_match_wrapper);
133    pxDriver.LinkFunction(*scanMatchK, "finalize_match_wrapper", &finalize_match_wrapper);
134   
135    pxDriver.generatePipelineIR();
136    pxDriver.deallocateBuffers();
137    idb->CreateRetVoid();
138    pxDriver.finalizeObject();
139   
140    typedef void (*GrepFunctionType)(const char * buffer, const size_t length);
141    auto f = reinterpret_cast<GrepFunctionType>(pxDriver.getMain());
142    f(search_buffer, bufferLength);
143}
144
145
146
147// Grep Engine construction and initialization.
148
149GrepEngine::GrepEngine() :
150    mGrepDriver(nullptr),
151    mNextFileToGrep(0),
152    mNextFileToPrint(0),
153    grepMatchFound(false),
154    mGrepRecordBreak(GrepRecordBreakKind::LF),
155    mMoveMatchesToEOL(true),
156    mEngineThread(pthread_self()) {}
157
158GrepEngine::~GrepEngine() {
159    delete mGrepDriver;
160}
161
162QuietModeEngine::QuietModeEngine() : GrepEngine() {
163    mMoveMatchesToEOL = false;
164}
165
166MatchOnlyEngine::MatchOnlyEngine(bool showFilesWithoutMatch) :
167    GrepEngine(), mRequiredCount(showFilesWithoutMatch) {
168    mFileSuffix = NullFlag ? std::string("\0", 1) : "\n";
169    mMoveMatchesToEOL = false;
170}
171
172CountOnlyEngine::CountOnlyEngine() : GrepEngine() {
173    mFileSuffix = ":";
174}
175
176EmitMatchesEngine::EmitMatchesEngine() : GrepEngine() {
177    mFileSuffix = InitialTabFlag ? "\t:" : ":";
178    if (LineRegexpFlag) mMoveMatchesToEOL = false;
179}
180
181   
182void GrepEngine::setRecordBreak(GrepRecordBreakKind b) {
183    mGrepRecordBreak = b;
184}
185
186   
187
188   
189void GrepEngine::initFileResult(std::vector<std::string> & filenames) {
190    const unsigned n = filenames.size();
191    mResultStrs.resize(n);
192    mFileStatus.resize(n, FileStatus::Pending);
193    inputFiles = filenames;
194}
195
196void GrepEngine::initREs(std::vector<re::RE *> & REs) {
197    if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
198        mBreakCC = re::makeCC(re::makeCC(0x0A, 0x0D), re::makeCC(re::makeCC(0x85), re::makeCC(0x2028, 0x2029)));
199    } else if (mGrepRecordBreak == GrepRecordBreakKind::Null) {
200        mBreakCC = re::makeByte(0);  // Null
201    } else {
202        mBreakCC = re::makeByte(0x0A); // LF
203    }
204    re::RE * anchorRE = mBreakCC;
205    if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
206        re::Name * anchorName = re::makeName("UTF8_LB", re::Name::Type::Unicode);
207        anchorName->setDefinition(UCD::UnicodeBreakRE());
208        anchorRE = anchorName;
209    }
210   
211    mREs = REs;
212    bool allAnchored = true;
213    for(unsigned i = 0; i < mREs.size(); ++i) {
214        if (!hasEndAnchor(mREs[i])) allAnchored = false;
215        mREs[i] = resolveModesAndExternalSymbols(mREs[i]);
216        mREs[i] = re::exclude_CC(mREs[i], mBreakCC);
217        mREs[i] = resolveAnchors(mREs[i], anchorRE);
218        re::gatherUnicodeProperties(mREs[i], mUnicodeProperties);
219        mREs[i] = regular_expression_passes(mREs[i]);
220    }
221    if (allAnchored && (mGrepRecordBreak != GrepRecordBreakKind::Unicode)) mMoveMatchesToEOL = false;
222
223}
224
225
226   
227// Code Generation
228//
229// All engines share a common pipeline to compute a stream of Matches from a given input Bytestream.
230
231unsigned LLVM_READNONE calculateMaxCountRate(const std::unique_ptr<kernel::KernelBuilder> & b) {
232    const unsigned packSize = b->getSizeTy()->getBitWidth();
233    return (packSize * packSize) / b->getBitBlockWidth();
234}
235   
236std::pair<StreamSetBuffer *, StreamSetBuffer *> GrepEngine::grepPipeline(StreamSetBuffer * ByteStream) {
237    auto & idb = mGrepDriver->getBuilder();
238    const unsigned segmentSize = codegen::SegmentSize;
239    const unsigned bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
240    // TODO: until we automate stream buffer sizing, use this calculation to determine how large our matches buffer needs to be.
241    const unsigned baseBufferSize = segmentSize * (MaxCountFlag > 0 ? (std::max(bufferSegments, calculateMaxCountRate(idb))) : bufferSegments);
242    const unsigned encodingBits = 8;
243   
244   
245    //  Regular Expression Processing and Analysis Phase
246    const auto nREs = mREs.size();
247    bool hasGCB[nREs];
248    bool anyGCB = false;
249
250    for(unsigned i = 0; i < nREs; ++i) {
251        hasGCB[i] = hasGraphemeClusterBoundary(mREs[i]);
252        anyGCB |= hasGCB[i];
253    }
254    StreamSetBuffer * LineBreakStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
255    std::vector<StreamSetBuffer *> MatchResultsBufs(nREs);
256   
257    re::RE * prefixRE;
258    re::RE * suffixRE;
259    // For simple regular expressions with a small number of characters, we
260    // can bypass transposition and use the Direct CC compiler.
261    bool isSimple = (nREs == 1) && (mGrepRecordBreak != GrepRecordBreakKind::Unicode) && (!anyGCB);
262    if (isSimple) {
263        mREs[0] = toUTF8(mREs[0]);
264    }
265    if (isSimple && byteTestsWithinLimit(mREs[0], ByteCClimit)) {
266        StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
267        kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ByteGrepKernel>(idb, mREs[0]);
268        mGrepDriver->makeKernelCall(icgrepK, {ByteStream}, {MatchResults});
269        MatchResultsBufs[0] = MatchResults;
270        kernel::Kernel * breakK = mGrepDriver->addKernelInstance<kernel::DirectCharacterClassKernelBuilder>(idb, "breakCC", std::vector<re::CC *>{mBreakCC}, 1);
271        mGrepDriver->makeKernelCall(breakK, {ByteStream}, {LineBreakStream});
272    } else if (isSimple && hasTriCCwithinLimit(mREs[0], ByteCClimit, prefixRE, suffixRE)) {
273        StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
274        kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ByteBitGrepKernel>(idb, prefixRE, suffixRE);
275        mGrepDriver->makeKernelCall(icgrepK, {ByteStream}, {MatchResults});
276        MatchResultsBufs[0] = MatchResults;
277        kernel::Kernel * breakK = mGrepDriver->addKernelInstance<kernel::DirectCharacterClassKernelBuilder>(idb, "breakCC", std::vector<re::CC *>{mBreakCC}, 1);
278        mGrepDriver->makeKernelCall(breakK, {ByteStream}, {LineBreakStream});
279    } else {
280       
281        StreamSetBuffer * BasisBits = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(encodingBits, 1), baseBufferSize);
282        kernel::Kernel * s2pk = nullptr;
283        if (PabloTransposition) {
284            s2pk = mGrepDriver->addKernelInstance<kernel::S2P_PabloKernel>(idb);
285        }
286        else {
287            s2pk = mGrepDriver->addKernelInstance<kernel::S2PKernel>(idb);
288        }
289        mGrepDriver->makeKernelCall(s2pk, {ByteStream}, {BasisBits});
290
291        StreamSetBuffer * RequiredStreams = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
292        StreamSetBuffer * UnicodeLB = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
293
294        StreamSetBuffer * LineFeedStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
295        kernel::Kernel * linefeedK = mGrepDriver->addKernelInstance<kernel::LineFeedKernelBuilder>(idb, Binding{idb->getStreamSetTy(8), "basis", FixedRate(), Principal()});
296        mGrepDriver->makeKernelCall(linefeedK, {BasisBits}, {LineFeedStream});
297       
298        kernel::Kernel * requiredStreamsK = mGrepDriver->addKernelInstance<kernel::RequiredStreams_UTF8>(idb);
299        mGrepDriver->makeKernelCall(requiredStreamsK, {BasisBits, LineFeedStream}, {RequiredStreams, UnicodeLB});
300
301        if (mGrepRecordBreak == GrepRecordBreakKind::LF) {
302            LineBreakStream = LineFeedStream;
303        } else if (mGrepRecordBreak == GrepRecordBreakKind::Null) {
304            kernel::Kernel * breakK = mGrepDriver->addKernelInstance<kernel::ParabixCharacterClassKernelBuilder>(idb, "Null", std::vector<re::CC *>{mBreakCC}, 8);
305            mGrepDriver->makeKernelCall(breakK, {BasisBits}, {LineBreakStream});
306        } else {
307            LineBreakStream = UnicodeLB;
308        }
309       
310        std::map<std::string, StreamSetBuffer *> propertyStream;
311        if (PropertyKernels) {
312            for (auto p : mUnicodeProperties) {
313                auto name = p->getFullName();
314                StreamSetBuffer * s = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
315                propertyStream.emplace(std::make_pair(name, s));
316                kernel::Kernel * propertyK = mGrepDriver->addKernelInstance<kernel::UnicodePropertyKernelBuilder>(idb, p);
317                mGrepDriver->makeKernelCall(propertyK, {BasisBits}, {s});
318            }
319        }
320        StreamSetBuffer * GCB_stream = nullptr;
321        if (anyGCB) {
322            GCB_stream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
323            kernel::Kernel * gcbK = mGrepDriver->addKernelInstance<kernel::GraphemeClusterBreakKernel>(idb);
324            mGrepDriver->makeKernelCall(gcbK, {BasisBits, RequiredStreams}, {GCB_stream});
325        }
326
327        for(unsigned i = 0; i < nREs; ++i) {
328            std::vector<std::string> externalStreamNames;
329            std::vector<StreamSetBuffer *> icgrepInputSets = {BasisBits};
330            if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
331                externalStreamNames.push_back("UTF8_LB");
332                icgrepInputSets.push_back(LineBreakStream);
333                externalStreamNames.push_back("UTF8_nonfinal");
334                icgrepInputSets.push_back(RequiredStreams);
335            }
336            std::set<re::Name *> UnicodeProperties;
337            if (PropertyKernels) {
338                re::gatherUnicodeProperties(mREs[i], UnicodeProperties);
339                for (auto p : UnicodeProperties) {
340                    auto name = p->getFullName();
341                    auto f = propertyStream.find(name);
342                    if (f == propertyStream.end()) report_fatal_error(name + " not found\n");
343                    externalStreamNames.push_back(name);
344                    icgrepInputSets.push_back(f->second);
345                }
346            }
347            if (hasGCB[i]) {
348                externalStreamNames.push_back("\\b{g}");
349                icgrepInputSets.push_back(GCB_stream);
350            }
351            if (CC_Multiplexing) {
352                const auto UnicodeSets = re::collectUnicodeSets(mREs[i], std::set<re::Name *>({re::makeZeroWidth("\\b{g}")}));
353                StreamSetBuffer * const MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
354                if (UnicodeSets.size() <= 1) {
355                    kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, mREs[i], externalStreamNames);
356                    mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
357                    MatchResultsBufs[i] = MatchResults;
358                } else {
359                    mpx = make_unique<MultiplexedAlphabet>("mpx", UnicodeSets);
360                    mREs[i] = transformCCs(mpx.get(), mREs[i]);
361                    std::vector<re::CC *> mpx_basis = mpx->getMultiplexedCCs();
362                    auto numOfCharacterClasses = mpx_basis.size();
363                    StreamSetBuffer * CharClasses = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), baseBufferSize);
364                    kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis));
365                    mGrepDriver->makeKernelCall(ccK, {BasisBits}, {CharClasses});
366    //                kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis), true);
367    //                mGrepDriver->makeKernelCall(ccK, {ByteStream}, {CharClasses});
368                    kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, mREs[i], externalStreamNames, std::vector<cc::Alphabet *>{mpx.get()});
369                    icgrepInputSets.push_back(CharClasses);
370                    mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
371                    MatchResultsBufs[i] = MatchResults;
372                }
373            } else {
374                StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
375                kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, mREs[i], externalStreamNames);
376                mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
377                MatchResultsBufs[i] = MatchResults;
378            }
379        }
380    }
381
382    StreamSetBuffer * MergedResults = MatchResultsBufs[0];
383    if (mREs.size() > 1) {
384        MergedResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
385        kernel::Kernel * streamsMergeK = mGrepDriver->addKernelInstance<kernel::StreamsMerge>(idb, 1, mREs.size());
386        mGrepDriver->makeKernelCall(streamsMergeK, MatchResultsBufs, {MergedResults});
387    }
388    StreamSetBuffer * Matches = MergedResults;
389    if (mMoveMatchesToEOL) {
390        StreamSetBuffer * OriginalMatches = Matches;
391        kernel::Kernel * matchedLinesK = mGrepDriver->addKernelInstance<kernel::MatchedLinesKernel>(idb);
392        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
393        mGrepDriver->makeKernelCall(matchedLinesK, {OriginalMatches, LineBreakStream}, {Matches});
394    }
395    if (InvertMatchFlag) {
396        kernel::Kernel * invertK = mGrepDriver->addKernelInstance<kernel::InvertMatchesKernel>(idb);
397        StreamSetBuffer * OriginalMatches = Matches;
398        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
399        mGrepDriver->makeKernelCall(invertK, {OriginalMatches, LineBreakStream}, {Matches});
400    }
401    if (MaxCountFlag > 0) {
402        kernel::Kernel * untilK = mGrepDriver->addKernelInstance<kernel::UntilNkernel>(idb);
403        untilK->setInitialArguments({idb->getSize(MaxCountFlag)});
404        StreamSetBuffer * const AllMatches = Matches;
405        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
406        mGrepDriver->makeKernelCall(untilK, {AllMatches}, {Matches});
407    }
408
409    return std::pair<StreamSetBuffer *, StreamSetBuffer *>(LineBreakStream, Matches);
410}
411
412// The QuietMode, MatchOnly and CountOnly engines share a common code generation main function,
413// which returns a count of the matches found (possibly subject to a MaxCount).
414//
415
416void GrepEngine::grepCodeGen() {
417
418    assert (mGrepDriver == nullptr);
419    mGrepDriver = new ParabixDriver("engine");
420    auto & idb = mGrepDriver->getBuilder();
421    Module * M = idb->getModule();
422
423    const unsigned encodingBits = 8;
424
425    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getInt64Ty(), idb->getInt8Ty(), idb->getInt32Ty(), nullptr));
426    mainFunc->setCallingConv(CallingConv::C);
427    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
428    auto args = mainFunc->arg_begin();
429
430    Value * const useMMap = &*(args++);
431    useMMap->setName("useMMap");
432    Value * const fileDescriptor = &*(args++);
433    fileDescriptor->setName("fileDescriptor");
434
435    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits));
436    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance<kernel::FDSourceKernel>(idb);
437    sourceK->setInitialArguments({useMMap, fileDescriptor});
438    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
439
440    StreamSetBuffer * LineBreakStream;
441    StreamSetBuffer * Matches;
442    std::tie(LineBreakStream, Matches) = grepPipeline(ByteStream);
443
444    kernel::Kernel * matchCountK = mGrepDriver->addKernelInstance<kernel::PopcountKernel>(idb);
445    mGrepDriver->makeKernelCall(matchCountK, {Matches}, {});
446    mGrepDriver->generatePipelineIR();
447    idb->setKernel(matchCountK);
448    Value * matchedLineCount = idb->getAccumulator("countResult");
449    matchedLineCount = idb->CreateZExt(matchedLineCount, idb->getInt64Ty());
450    mGrepDriver->deallocateBuffers();
451    idb->CreateRet(matchedLineCount);
452    mGrepDriver->finalizeObject();
453}
454
455//
// The EmitMatches engine uses an EmitMatch accumulator object to concatenate together
// matched lines.
458
459class EmitMatch : public MatchAccumulator {
460    friend class EmitMatchesEngine;
461public:
462    EmitMatch(std::string linePrefix, std::ostringstream & strm) : mLinePrefix(linePrefix), mLineCount(0), mTerminated(true), mResultStr(strm) {}
463    void accumulate_match(const size_t lineNum, char * line_start, char * line_end) override;
464    void finalize_match(char * buffer_end) override;
465protected:
466    std::string mLinePrefix;
467    size_t mLineCount;
468    bool mTerminated;
469    std::ostringstream & mResultStr;
470};
471
472//
473//  Default Report Match:  lines are emitted with whatever line terminators are found in the
474//  input.  However, if the final line is not terminated, a new line is appended.
475//
476void EmitMatch::accumulate_match (const size_t lineNum, char * line_start, char * line_end) {
477    if (WithFilenameFlag) {
478        mResultStr << mLinePrefix;
479    }
480    if (LineNumberFlag) {
481        // Internally line numbers are counted from 0.  For display, adjust
482        // the line number so that lines are numbered from 1.
483        if (InitialTabFlag) {
484            mResultStr << lineNum+1 << "\t:";
485        }
486        else {
487            mResultStr << lineNum+1 << ":";
488        }
489    }
490    size_t bytes = line_end - line_start + 1;
491    mResultStr.write(line_start, bytes);
492    mLineCount++;
493    unsigned last_byte = *line_end;
494    mTerminated = (last_byte >= 0x0A) && (last_byte <= 0x0D);
495    if (LLVM_UNLIKELY(!mTerminated)) {
496        if (last_byte == 0x85) {  //  Possible NEL terminator.
497            mTerminated = (bytes >= 2) && (static_cast<unsigned>(line_end[-1]) == 0xC2);
498        }
499        else {
500            // Possible LS or PS terminators.
501            mTerminated = (bytes >= 3) && (static_cast<unsigned>(line_end[-2]) == 0xE2)
502                                       && (static_cast<unsigned>(line_end[-1]) == 0x80)
503                                       && ((last_byte == 0xA8) || (last_byte == 0xA9));
504        }
505    }
506}
507
508void EmitMatch::finalize_match(char * buffer_end) {
509    if (!mTerminated) mResultStr << "\n";
510}
511
512void EmitMatchesEngine::grepCodeGen() {
513    assert (mGrepDriver == nullptr);
514    mGrepDriver = new ParabixDriver("engine");
515    auto & idb = mGrepDriver->getBuilder();
516    Module * M = idb->getModule();
517
518    const unsigned encodingBits = 8;
519
520    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getInt64Ty(), idb->getInt8Ty(), idb->getInt32Ty(), idb->getIntAddrTy(), nullptr));
521    mainFunc->setCallingConv(CallingConv::C);
522    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
523    auto args = mainFunc->arg_begin();
524
525    Value * const useMMap = &*(args++);
526    useMMap->setName("useMMap");
527    Value * const fileDescriptor = &*(args++);
528    fileDescriptor->setName("fileDescriptor");
529    Value * match_accumulator = &*(args++);
530    match_accumulator->setName("match_accumulator");
531
532    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits));
533    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance<kernel::FDSourceKernel>(idb);
534    sourceK->setInitialArguments({useMMap, fileDescriptor});
535    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
536
537    StreamSetBuffer * LineBreakStream;
538    StreamSetBuffer * Matches;
539    std::tie(LineBreakStream, Matches) = grepPipeline(ByteStream);
540
541    kernel::Kernel * scanMatchK = mGrepDriver->addKernelInstance<kernel::ScanMatchKernel>(idb);
542    scanMatchK->setInitialArguments({match_accumulator});
543    mGrepDriver->makeKernelCall(scanMatchK, {Matches, LineBreakStream, ByteStream}, {});
544    mGrepDriver->LinkFunction(*scanMatchK, "accumulate_match_wrapper", &accumulate_match_wrapper);
545    mGrepDriver->LinkFunction(*scanMatchK, "finalize_match_wrapper", &finalize_match_wrapper);
546
547    mGrepDriver->generatePipelineIR();
548    mGrepDriver->deallocateBuffers();
549    idb->CreateRet(idb->getInt64(0));
550    mGrepDriver->finalizeObject();
551}
552
553
554//
555//  The doGrep methods apply a GrepEngine to a single file, processing the results
556//  differently based on the engine type.
557
558uint64_t GrepEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
559    typedef uint64_t (*GrepFunctionType)(bool useMMap, int32_t fileDescriptor);
560    using namespace boost::filesystem;
561    path p(fileName);
562    bool useMMap = grep::MmapFlag;
563    if (p == "-") useMMap = false;
564    if (!is_regular_file(p)) useMMap = false;
565
566    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
567
568    int32_t fileDescriptor = openFile(fileName, mResultStrs[fileIdx]);
569    if (fileDescriptor == -1) return 0;
570
571    uint64_t grepResult = f(useMMap, fileDescriptor);
572    close(fileDescriptor);
573    return grepResult;
574}
575
576uint64_t CountOnlyEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
577    uint64_t grepResult = GrepEngine::doGrep(fileName, fileIdx);
578    if (WithFilenameFlag) mResultStrs[fileIdx] << linePrefix(fileName);
579    mResultStrs[fileIdx] << grepResult << "\n";
580    return grepResult;
581}
582
583std::string GrepEngine::linePrefix(std::string fileName) {
584    if (fileName == "-") {
585        return LabelFlag + mFileSuffix;
586    }
587    else {
588        return fileName + mFileSuffix;
589    }
590}
591
592uint64_t MatchOnlyEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
593    uint64_t grepResult = GrepEngine::doGrep(fileName, fileIdx);
594    if (grepResult == mRequiredCount) {
595       mResultStrs[fileIdx] << linePrefix(fileName);
596    }
597    return grepResult;
598}
599
600uint64_t EmitMatchesEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
601    typedef uint64_t (*GrepFunctionType)(bool useMMap, int32_t fileDescriptor, intptr_t accum_addr);
602    using namespace boost::filesystem;
603    path p(fileName);
604    bool useMMap = grep::MmapFlag;
605    if (p == "-") useMMap = false;
606    if (!is_regular_file(p)) useMMap = false;
607    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
608    int32_t fileDescriptor = openFile(fileName, mResultStrs[fileIdx]);
609    if (fileDescriptor == -1) return 0;
610    EmitMatch accum(linePrefix(fileName), mResultStrs[fileIdx]);
611    f(useMMap, fileDescriptor, reinterpret_cast<intptr_t>(&accum));
612    close(fileDescriptor);
613    if (accum.mLineCount > 0) grepMatchFound = true;
614    return accum.mLineCount;
615}
616
// Open a file and return its file descriptor.
618int32_t GrepEngine::openFile(const std::string & fileName, std::ostringstream & msgstrm) {
619    if (fileName == "-") {
620        return STDIN_FILENO;
621    }
622    else {
623        struct stat sb;
624        int32_t fileDescriptor = open(fileName.c_str(), O_RDONLY);
625        if (LLVM_UNLIKELY(fileDescriptor == -1)) {
626            if (!NoMessagesFlag) {
627                if (errno == EACCES) {
628                    msgstrm << "icgrep: " << fileName << ": Permission denied.\n";
629                }
630                else if (errno == ENOENT) {
631                    msgstrm << "icgrep: " << fileName << ": No such file.\n";
632                }
633                else {
634                    msgstrm << "icgrep: " << fileName << ": Failed.\n";
635                }
636            }
637            return fileDescriptor;
638        }
639        if (stat(fileName.c_str(), &sb) == 0 && S_ISDIR(sb.st_mode)) {
640            if (!NoMessagesFlag) {
641                msgstrm << "icgrep: " << fileName << ": Is a directory.\n";
642            }
643            close(fileDescriptor);
644            return -1;
645        }
646        return fileDescriptor;
647    }
648}
649
650// The process of searching a group of files may use a sequential or a task
651// parallel approach.
652
653void * DoGrepThreadFunction(void *args) {
654    return reinterpret_cast<GrepEngine *>(args)->DoGrepThreadMethod();
655}
656
657bool GrepEngine::searchAllFiles() {
658    const unsigned numOfThreads = std::min(static_cast<unsigned>(Threads), static_cast<unsigned>(inputFiles.size())); 
659    std::vector<pthread_t> threads(numOfThreads);
660
661    for(unsigned long i = 1; i < numOfThreads; ++i) {
662        const int rc = pthread_create(&threads[i], nullptr, DoGrepThreadFunction, (void *)this);
663        if (rc) {
664            llvm::report_fatal_error("Failed to create thread: code " + std::to_string(rc));
665        }
666    }
667    // Main thread also does the work;
668    DoGrepThreadMethod();
669    for(unsigned i = 1; i < numOfThreads; ++i) {
670        void * status = nullptr;
671        const int rc = pthread_join(threads[i], &status);
672        if (rc) {
673            llvm::report_fatal_error("Failed to join thread: code " + std::to_string(rc));
674        }
675    }
676    return grepMatchFound;
677}
678
679
680// DoGrep thread function.
681void * GrepEngine::DoGrepThreadMethod() {
682
683    unsigned fileIdx = mNextFileToGrep++;
684    while (fileIdx < inputFiles.size()) {
685        const auto grepResult = doGrep(inputFiles[fileIdx], fileIdx);
686        mFileStatus[fileIdx] = FileStatus::GrepComplete;
687        if (grepResult > 0) {
688            grepMatchFound = true;
689        }
690        if (QuietMode && grepMatchFound) {
691            if (pthread_self() != mEngineThread) {
692                pthread_exit(nullptr);
693            }
694            return nullptr;
695        }
696        fileIdx = mNextFileToGrep++;
697    }
698
699    unsigned printIdx = mNextFileToPrint++;
700    while (printIdx < inputFiles.size()) {
701        const bool readyToPrint = ((printIdx == 0) || (mFileStatus[printIdx - 1] == FileStatus::PrintComplete)) && (mFileStatus[printIdx] == FileStatus::GrepComplete);
702        if (readyToPrint) {
703            const auto output = mResultStrs[printIdx].str();
704            if (!output.empty()) {
705                llvm::outs() << output;
706            }
707            mFileStatus[printIdx] = FileStatus::PrintComplete;
708            printIdx = mNextFileToPrint++;
709        } else {
710            mGrepDriver->performIncrementalCacheCleanupStep();
711        }
712        sched_yield();
713    }
714
715    if (pthread_self() != mEngineThread) {
716        pthread_exit(nullptr);
717    } else {
718        // Always perform one final cache cleanup step.
719        mGrepDriver->performIncrementalCacheCleanupStep();
720    }
721    return nullptr;
722}
723
724}
Note: See TracBrowser for help on using the repository browser.