source: icGREP/icgrep-devel/icgrep/grep/grep_engine.cpp @ 5941

Last change on this file since 5941 was 5941, checked in by xwa163, 12 months ago
  1. Add attributes to disable some features of multiblock kernel
  2. Fix bug for lz4d new approach in large data, pass all test cases
  3. Disable lz4d related test cases for old approach
File size: 33.3 KB
Line 
1/*
2 *  Copyright (c) 2018 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6#include <set>
7#include "grep_engine.h"
8#include "grep_interface.h"
9#include <llvm/IR/Module.h>
10#include <boost/filesystem.hpp>
11#include <UCD/resolve_properties.h>
12#include <kernels/charclasses.h>
13#include <kernels/cc_kernel.h>
14#include <kernels/grep_kernel.h>
15#include <kernels/UCD_property_kernel.h>
16#include <kernels/grapheme_kernel.h>
17#include <kernels/linebreak_kernel.h>
18#include <kernels/streams_merge.h>
19#include <kernels/source_kernel.h>
20#include <kernels/s2p_kernel.h>
21#include <kernels/scanmatchgen.h>
22#include <kernels/streamset.h>
23#include <kernels/until_n.h>
24#include <kernels/kernel_builder.h>
25#include <pablo/pablo_kernel.h>
26#include <cc/alphabet.h>
27#include <re/re_cc.h>
28#include <re/re_name.h>
29#include <re/casing.h>
30#include <re/exclude_CC.h>
31#include <re/to_utf8.h>
32#include <re/re_toolchain.h>
33#include <toolchain/toolchain.h>
34#include <re/re_analysis.h>
35#include <re/re_name_resolve.h>
36#include <re/re_name_gather.h>
37#include <re/collect_ccs.h>
38#include <re/replaceCC.h>
39#include <re/re_multiplex.h>
40#include <re/grapheme_clusters.h>
41#include <re/printer_re.h>
42#include <toolchain/toolchain.h>
43#include <toolchain/cpudriver.h>
44#include <iostream>
45#include <cc/multiplex_CCs.h>
46#include <llvm/Support/raw_ostream.h>
47#include <util/aligned_allocator.h>
48#include <sys/stat.h>
49#include <fcntl.h>
50#include <errno.h>
51#include <llvm/ADT/STLExtras.h> // for make_unique
52#include <llvm/Support/CommandLine.h>
53#include <llvm/Support/Debug.h>
54#include <sched.h>
55
56using namespace parabix;
57using namespace llvm;
58using namespace cc;
59using namespace kernel;
60
61static cl::opt<int> Threads("t", cl::desc("Total number of threads."), cl::init(2));
62static cl::opt<bool> PabloTransposition("enable-pablo-s2p", cl::desc("Enable experimental pablo transposition."));
63static cl::opt<bool> CC_Multiplexing("CC-multiplexing", cl::desc("Enable CC multiplexing."), cl::init(false));
64static cl::opt<bool> PropertyKernels("enable-property-kernels", cl::desc("Enable Unicode property kernels."), cl::init(false));
65static cl::opt<bool> MultithreadedSimpleRE("enable-simple-RE-kernels", cl::desc("Enable individual CC kernels for simple REs."), cl::init(false));
66const unsigned DefaultByteCClimit = 6;
67
68static cl::opt<unsigned> ByteCClimit("byte-CC-limit", cl::desc("Max number of CCs for byte CC pipeline."), cl::init(DefaultByteCClimit));
69
70
71namespace grep {
72   
73
74extern "C" void accumulate_match_wrapper(intptr_t accum_addr, const size_t lineNum, char * line_start, char * line_end) {
75    reinterpret_cast<MatchAccumulator *>(accum_addr)->accumulate_match(lineNum, line_start, line_end);
76}
77
78extern "C" void finalize_match_wrapper(intptr_t accum_addr, char * buffer_end) {
79    reinterpret_cast<MatchAccumulator *>(accum_addr)->finalize_match(buffer_end);
80}
81
82void grepBuffer(re::RE * pattern, const char * search_buffer, size_t bufferLength, MatchAccumulator * accum) {
83    const unsigned segmentSize = codegen::BufferSegments * codegen::SegmentSize * codegen::ThreadNum;
84    auto segParallelModeSave = codegen::SegmentPipelineParallel;
85    codegen::SegmentPipelineParallel = false;
86   
87    pattern = resolveCaseInsensitiveMode(pattern, false);
88    pattern = regular_expression_passes(pattern);
89    pattern = re::exclude_CC(pattern, re::makeByte(0x0A));
90    pattern = resolveAnchors(pattern, re::makeByte(0x0A));
91
92    ParabixDriver pxDriver("codepointEngine");
93    auto & idb = pxDriver.getBuilder();
94    Module * M = idb->getModule();
95   
96    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getVoidTy(), idb->getInt8PtrTy(), idb->getSizeTy(), nullptr));
97    mainFunc->setCallingConv(CallingConv::C);
98    auto args = mainFunc->arg_begin();
99    Value * const buffer = &*(args++);
100    buffer->setName("buffer");
101    Value * length = &*(args++);
102    length->setName("length");
103   
104    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
105    StreamSetBuffer * ByteStream = pxDriver.addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, 8));
106    kernel::Kernel * sourceK = pxDriver.addKernelInstance<kernel::MemorySourceKernel>(idb, idb->getInt8PtrTy());
107    sourceK->setInitialArguments({buffer, length});
108    pxDriver.makeKernelCall(sourceK, {}, {ByteStream});
109   
110   
111    StreamSetBuffer * BasisBits = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(8, 1), segmentSize);
112    kernel::Kernel * s2pk = pxDriver.addKernelInstance<kernel::S2PKernel>(idb);
113    pxDriver.makeKernelCall(s2pk, {ByteStream}, {BasisBits});
114   
115    StreamSetBuffer * LineFeedStream = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
116    kernel::Kernel * linefeedK = pxDriver.addKernelInstance<kernel::LineFeedKernelBuilder>(idb, Binding{idb->getStreamSetTy(8), "basis", FixedRate(), Principal()});
117    pxDriver.makeKernelCall(linefeedK, {BasisBits}, {LineFeedStream});
118   
119    StreamSetBuffer * LineBreakStream = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
120   
121    kernel::Kernel * requiredStreamsK = pxDriver.addKernelInstance<kernel::RequiredStreams_UTF8>(idb);
122    StreamSetBuffer * RequiredStreams = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
123    pxDriver.makeKernelCall(requiredStreamsK, {BasisBits, LineFeedStream}, {RequiredStreams, LineBreakStream});
124   
125    StreamSetBuffer * MatchResults = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
126    kernel::Kernel * icgrepK = pxDriver.addKernelInstance<kernel::ICGrepKernel>(idb, pattern, std::vector<std::string>{"UTF8_LB", "UTF8_nonfinal"});
127    pxDriver.makeKernelCall(icgrepK, {BasisBits, LineBreakStream, RequiredStreams}, {MatchResults});
128   
129    StreamSetBuffer * MatchedLines = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
130    kernel::Kernel * matchedLinesK = pxDriver.addKernelInstance<kernel::MatchedLinesKernel>(idb);
131    pxDriver.makeKernelCall(matchedLinesK, {MatchResults, LineBreakStream}, {MatchedLines});
132   
133    kernel::Kernel * scanMatchK = pxDriver.addKernelInstance<kernel::ScanMatchKernel>(idb);
134    scanMatchK->setInitialArguments({ConstantInt::get(idb->getIntAddrTy(), reinterpret_cast<intptr_t>(accum))});
135    pxDriver.makeKernelCall(scanMatchK, {MatchedLines, LineBreakStream, ByteStream}, {});
136    pxDriver.LinkFunction(*scanMatchK, "accumulate_match_wrapper", &accumulate_match_wrapper);
137    pxDriver.LinkFunction(*scanMatchK, "finalize_match_wrapper", &finalize_match_wrapper);
138   
139    pxDriver.generatePipelineIR();
140    pxDriver.deallocateBuffers();
141    idb->CreateRetVoid();
142    pxDriver.finalizeObject();
143   
144    typedef void (*GrepFunctionType)(const char * buffer, const size_t length);
145    auto f = reinterpret_cast<GrepFunctionType>(pxDriver.getMain());
146    f(search_buffer, bufferLength);
147    codegen::SegmentPipelineParallel = segParallelModeSave;
148}
149
150
151
152// Grep Engine construction and initialization.
153
154GrepEngine::GrepEngine() :
155    mGrepDriver(nullptr),
156    mNextFileToGrep(0),
157    mNextFileToPrint(0),
158    grepMatchFound(false),
159    mGrepRecordBreak(GrepRecordBreakKind::LF),
160    mMoveMatchesToEOL(true),
161    mEngineThread(pthread_self()) {}
162
163GrepEngine::~GrepEngine() {
164    delete mGrepDriver;
165}
166
167QuietModeEngine::QuietModeEngine() : GrepEngine() {
168    mMoveMatchesToEOL = false;
169}
170
171MatchOnlyEngine::MatchOnlyEngine(bool showFilesWithoutMatch) :
172    GrepEngine(), mRequiredCount(showFilesWithoutMatch) {
173    mFileSuffix = NullFlag ? std::string("\0", 1) : "\n";
174    mMoveMatchesToEOL = false;
175}
176
177CountOnlyEngine::CountOnlyEngine() : GrepEngine() {
178    mFileSuffix = ":";
179}
180
181EmitMatchesEngine::EmitMatchesEngine() : GrepEngine() {
182    mFileSuffix = InitialTabFlag ? "\t:" : ":";
183    if (LineRegexpFlag) mMoveMatchesToEOL = false;
184}
185
186   
187void GrepEngine::setRecordBreak(GrepRecordBreakKind b) {
188    mGrepRecordBreak = b;
189}
190
191   
192
193   
194void GrepEngine::initFileResult(std::vector<std::string> & filenames) {
195    const unsigned n = filenames.size();
196    mResultStrs.resize(n);
197    mFileStatus.resize(n, FileStatus::Pending);
198    inputFiles = filenames;
199}
200
201void GrepEngine::initREs(std::vector<re::RE *> & REs) {
202    if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
203        mBreakCC = re::makeCC(re::makeCC(0x0A, 0x0D), re::makeCC(re::makeCC(0x85), re::makeCC(0x2028, 0x2029)));
204    } else if (mGrepRecordBreak == GrepRecordBreakKind::Null) {
205        mBreakCC = re::makeByte(0);  // Null
206    } else {
207        mBreakCC = re::makeByte(0x0A); // LF
208    }
209    re::RE * anchorRE = mBreakCC;
210    if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
211        re::Name * anchorName = re::makeName("UTF8_LB", re::Name::Type::Unicode);
212        anchorName->setDefinition(UCD::UnicodeBreakRE());
213        anchorRE = anchorName;
214    }
215   
216    mREs = REs;
217    bool allAnchored = true;
218    for(unsigned i = 0; i < mREs.size(); ++i) {
219        if (!hasEndAnchor(mREs[i])) allAnchored = false;
220        mREs[i] = resolveModesAndExternalSymbols(mREs[i]);
221        mREs[i] = re::exclude_CC(mREs[i], mBreakCC);
222        mREs[i] = resolveAnchors(mREs[i], anchorRE);
223        re::gatherUnicodeProperties(mREs[i], mUnicodeProperties);
224        mREs[i] = regular_expression_passes(mREs[i]);
225    }
226    if (allAnchored && (mGrepRecordBreak != GrepRecordBreakKind::Unicode)) mMoveMatchesToEOL = false;
227
228}
229
230
231   
232// Code Generation
233//
234// All engines share a common pipeline to compute a stream of Matches from a given input Bytestream.
235
236unsigned LLVM_READNONE calculateMaxCountRate(const std::unique_ptr<kernel::KernelBuilder> & b) {
237    const unsigned packSize = b->getSizeTy()->getBitWidth();
238    return (packSize * packSize) / b->getBitBlockWidth();
239}
240   
241std::pair<StreamSetBuffer *, StreamSetBuffer *> GrepEngine::grepPipeline(StreamSetBuffer * ByteStream) {
242    auto & idb = mGrepDriver->getBuilder();
243    const unsigned segmentSize = codegen::SegmentSize;
244    const unsigned bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
245    // TODO: until we automate stream buffer sizing, use this calculation to determine how large our matches buffer needs to be.
246    const unsigned baseBufferSize = segmentSize * (MaxCountFlag > 0 ? (std::max(bufferSegments, calculateMaxCountRate(idb))) : bufferSegments);
247    const unsigned encodingBits = 8;
248   
249   
250    //  Regular Expression Processing and Analysis Phase
251    const auto nREs = mREs.size();
252    bool hasGCB[nREs];
253    bool anyGCB = false;
254
255    for(unsigned i = 0; i < nREs; ++i) {
256        hasGCB[i] = hasGraphemeClusterBoundary(mREs[i]);
257        anyGCB |= hasGCB[i];
258    }
259    StreamSetBuffer * LineBreakStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
260    std::vector<StreamSetBuffer *> MatchResultsBufs(nREs);
261   
262    re::RE * prefixRE;
263    re::RE * suffixRE;
264    // For simple regular expressions with a small number of characters, we
265    // can bypass transposition and use the Direct CC compiler.
266    bool isSimple = (nREs == 1) && (mGrepRecordBreak != GrepRecordBreakKind::Unicode) && (!anyGCB);
267    if (isSimple) {
268        mREs[0] = toUTF8(mREs[0]);
269    }
270    if (isSimple && byteTestsWithinLimit(mREs[0], ByteCClimit)) {
271        std::vector<std::string> externalStreamNames;
272        std::vector<StreamSetBuffer *> icgrepInputSets = {ByteStream};
273        if (MultithreadedSimpleRE && hasTriCCwithinLimit(mREs[0], ByteCClimit, prefixRE, suffixRE)) {
274            auto CCs = re::collectCCs(prefixRE, &cc::Byte);
275            for (auto cc : CCs) {
276                auto ccName = makeName(cc);
277                mREs[0] = re::replaceCC(mREs[0], cc, ccName);
278                std::string ccNameStr = ccName->getFullName();
279                StreamSetBuffer * ccStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
280                kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::DirectCharacterClassKernelBuilder>(idb, ccNameStr, std::vector<re::CC *>{cc});
281                mGrepDriver->makeKernelCall(ccK, {ByteStream}, {ccStream});
282                externalStreamNames.push_back(ccNameStr);
283                icgrepInputSets.push_back(ccStream);
284            }
285        }
286        StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
287        kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ByteGrepKernel>(idb, mREs[0], externalStreamNames);
288        mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
289        MatchResultsBufs[0] = MatchResults;
290        kernel::Kernel * breakK = mGrepDriver->addKernelInstance<kernel::DirectCharacterClassKernelBuilder>(idb, "breakCC", std::vector<re::CC *>{mBreakCC});
291        mGrepDriver->makeKernelCall(breakK, {ByteStream}, {LineBreakStream});
292    } else if (isSimple && hasTriCCwithinLimit(mREs[0], ByteCClimit, prefixRE, suffixRE)) {
293        std::vector<std::string> externalStreamNames;
294        std::vector<StreamSetBuffer *> icgrepInputSets = {ByteStream};
295        if (MultithreadedSimpleRE) {
296            auto CCs = re::collectCCs(prefixRE, &cc::Byte);
297            for (auto cc : CCs) {
298                auto ccName = makeName(cc);
299                mREs[0] = re::replaceCC(mREs[0], cc, ccName);
300                std::string ccNameStr = ccName->getFullName();
301                StreamSetBuffer * ccStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
302                kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::DirectCharacterClassKernelBuilder>(idb, ccNameStr, std::vector<re::CC *>{cc});
303                mGrepDriver->makeKernelCall(ccK, {ByteStream}, {ccStream});
304                externalStreamNames.push_back(ccNameStr);
305                icgrepInputSets.push_back(ccStream);
306            }
307        }
308        StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
309        kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ByteBitGrepKernel>(idb, prefixRE, suffixRE, externalStreamNames);
310        mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
311        MatchResultsBufs[0] = MatchResults;
312        kernel::Kernel * breakK = mGrepDriver->addKernelInstance<kernel::DirectCharacterClassKernelBuilder>(idb, "breakCC", std::vector<re::CC *>{mBreakCC});
313        mGrepDriver->makeKernelCall(breakK, {ByteStream}, {LineBreakStream});
314    } else {
315       
316        StreamSetBuffer * BasisBits = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(encodingBits, 1), baseBufferSize);
317        kernel::Kernel * s2pk = nullptr;
318        if (PabloTransposition) {
319            s2pk = mGrepDriver->addKernelInstance<kernel::S2P_PabloKernel>(idb);
320        }
321        else {
322            s2pk = mGrepDriver->addKernelInstance<kernel::S2PKernel>(idb);
323        }
324        mGrepDriver->makeKernelCall(s2pk, {ByteStream}, {BasisBits});
325
326        StreamSetBuffer * RequiredStreams = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
327        StreamSetBuffer * UnicodeLB = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
328
329        StreamSetBuffer * LineFeedStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
330        kernel::Kernel * linefeedK = mGrepDriver->addKernelInstance<kernel::LineFeedKernelBuilder>(idb, Binding{idb->getStreamSetTy(8), "basis", FixedRate(), Principal()});
331        mGrepDriver->makeKernelCall(linefeedK, {BasisBits}, {LineFeedStream});
332       
333        kernel::Kernel * requiredStreamsK = mGrepDriver->addKernelInstance<kernel::RequiredStreams_UTF8>(idb);
334        mGrepDriver->makeKernelCall(requiredStreamsK, {BasisBits, LineFeedStream}, {RequiredStreams, UnicodeLB});
335
336        if (mGrepRecordBreak == GrepRecordBreakKind::LF) {
337            LineBreakStream = LineFeedStream;
338        } else if (mGrepRecordBreak == GrepRecordBreakKind::Null) {
339            kernel::Kernel * breakK = mGrepDriver->addKernelInstance<kernel::ParabixCharacterClassKernelBuilder>(idb, "Null", std::vector<re::CC *>{mBreakCC}, 8);
340            mGrepDriver->makeKernelCall(breakK, {BasisBits}, {LineBreakStream});
341        } else {
342            LineBreakStream = UnicodeLB;
343        }
344       
345        std::map<std::string, StreamSetBuffer *> propertyStream;
346        if (PropertyKernels) {
347            for (auto p : mUnicodeProperties) {
348                auto name = p->getFullName();
349                StreamSetBuffer * s = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
350                propertyStream.emplace(std::make_pair(name, s));
351                kernel::Kernel * propertyK = mGrepDriver->addKernelInstance<kernel::UnicodePropertyKernelBuilder>(idb, p);
352                mGrepDriver->makeKernelCall(propertyK, {BasisBits}, {s});
353            }
354        }
355        StreamSetBuffer * GCB_stream = nullptr;
356        if (anyGCB) {
357            GCB_stream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
358            kernel::Kernel * gcbK = mGrepDriver->addKernelInstance<kernel::GraphemeClusterBreakKernel>(idb);
359            mGrepDriver->makeKernelCall(gcbK, {BasisBits, RequiredStreams}, {GCB_stream});
360        }
361
362        for(unsigned i = 0; i < nREs; ++i) {
363            std::vector<std::string> externalStreamNames;
364            std::vector<StreamSetBuffer *> icgrepInputSets = {BasisBits};
365            if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
366                externalStreamNames.push_back("UTF8_LB");
367                icgrepInputSets.push_back(LineBreakStream);
368                externalStreamNames.push_back("UTF8_nonfinal");
369                icgrepInputSets.push_back(RequiredStreams);
370            }
371            std::set<re::Name *> UnicodeProperties;
372            if (PropertyKernels) {
373                re::gatherUnicodeProperties(mREs[i], UnicodeProperties);
374                for (auto p : UnicodeProperties) {
375                    auto name = p->getFullName();
376                    auto f = propertyStream.find(name);
377                    if (f == propertyStream.end()) report_fatal_error(name + " not found\n");
378                    externalStreamNames.push_back(name);
379                    icgrepInputSets.push_back(f->second);
380                }
381            }
382            if (hasGCB[i]) {
383                externalStreamNames.push_back("\\b{g}");
384                icgrepInputSets.push_back(GCB_stream);
385            }
386            if (CC_Multiplexing) {
387                const auto UnicodeSets = re::collectCCs(mREs[i], &cc::Unicode, std::set<re::Name *>({re::makeZeroWidth("\\b{g}")}));
388                StreamSetBuffer * const MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
389                if (UnicodeSets.size() <= 1) {
390                    kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, mREs[i], externalStreamNames);
391                    mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
392                    MatchResultsBufs[i] = MatchResults;
393                } else {
394                    mpx = make_unique<MultiplexedAlphabet>("mpx", UnicodeSets);
395                    mREs[i] = transformCCs(mpx.get(), mREs[i]);
396                    std::vector<re::CC *> mpx_basis = mpx->getMultiplexedCCs();
397                    auto numOfCharacterClasses = mpx_basis.size();
398                    StreamSetBuffer * CharClasses = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), baseBufferSize);
399                    kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis));
400                    mGrepDriver->makeKernelCall(ccK, {BasisBits}, {CharClasses});
401    //                kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis), true);
402    //                mGrepDriver->makeKernelCall(ccK, {ByteStream}, {CharClasses});
403                    kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, mREs[i], externalStreamNames, std::vector<cc::Alphabet *>{mpx.get()});
404                    icgrepInputSets.push_back(CharClasses);
405                    mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
406                    MatchResultsBufs[i] = MatchResults;
407                }
408            } else {
409                StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
410                kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, mREs[i], externalStreamNames);
411                mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
412                MatchResultsBufs[i] = MatchResults;
413            }
414        }
415    }
416
417    StreamSetBuffer * MergedResults = MatchResultsBufs[0];
418    if (mREs.size() > 1) {
419        MergedResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
420        kernel::Kernel * streamsMergeK = mGrepDriver->addKernelInstance<kernel::StreamsMerge>(idb, 1, mREs.size());
421        mGrepDriver->makeKernelCall(streamsMergeK, MatchResultsBufs, {MergedResults});
422    }
423    StreamSetBuffer * Matches = MergedResults;
424    if (mMoveMatchesToEOL) {
425        StreamSetBuffer * OriginalMatches = Matches;
426        kernel::Kernel * matchedLinesK = mGrepDriver->addKernelInstance<kernel::MatchedLinesKernel>(idb);
427        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
428        mGrepDriver->makeKernelCall(matchedLinesK, {OriginalMatches, LineBreakStream}, {Matches});
429    }
430    if (InvertMatchFlag) {
431        kernel::Kernel * invertK = mGrepDriver->addKernelInstance<kernel::InvertMatchesKernel>(idb);
432        StreamSetBuffer * OriginalMatches = Matches;
433        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
434        mGrepDriver->makeKernelCall(invertK, {OriginalMatches, LineBreakStream}, {Matches});
435    }
436    if (MaxCountFlag > 0) {
437        kernel::Kernel * untilK = mGrepDriver->addKernelInstance<kernel::UntilNkernel>(idb);
438        untilK->setInitialArguments({idb->getSize(MaxCountFlag)});
439        StreamSetBuffer * const AllMatches = Matches;
440        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
441        mGrepDriver->makeKernelCall(untilK, {AllMatches}, {Matches});
442    }
443
444    return std::pair<StreamSetBuffer *, StreamSetBuffer *>(LineBreakStream, Matches);
445}
446
447// The QuietMode, MatchOnly and CountOnly engines share a common code generation main function,
448// which returns a count of the matches found (possibly subject to a MaxCount).
449//
450
451void GrepEngine::grepCodeGen() {
452
453    assert (mGrepDriver == nullptr);
454    mGrepDriver = new ParabixDriver("engine");
455    auto & idb = mGrepDriver->getBuilder();
456    Module * M = idb->getModule();
457
458    const unsigned encodingBits = 8;
459
460    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getInt64Ty(), idb->getInt8Ty(), idb->getInt32Ty(), nullptr));
461    mainFunc->setCallingConv(CallingConv::C);
462    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
463    auto args = mainFunc->arg_begin();
464
465    Value * const useMMap = &*(args++);
466    useMMap->setName("useMMap");
467    Value * const fileDescriptor = &*(args++);
468    fileDescriptor->setName("fileDescriptor");
469
470    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits));
471    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance<kernel::FDSourceKernel>(idb);
472    sourceK->setInitialArguments({useMMap, fileDescriptor});
473    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
474
475    StreamSetBuffer * LineBreakStream;
476    StreamSetBuffer * Matches;
477    std::tie(LineBreakStream, Matches) = grepPipeline(ByteStream);
478
479    kernel::Kernel * matchCountK = mGrepDriver->addKernelInstance<kernel::PopcountKernel>(idb);
480    mGrepDriver->makeKernelCall(matchCountK, {Matches}, {});
481    mGrepDriver->generatePipelineIR();
482    idb->setKernel(matchCountK);
483    Value * matchedLineCount = idb->getAccumulator("countResult");
484    matchedLineCount = idb->CreateZExt(matchedLineCount, idb->getInt64Ty());
485    mGrepDriver->deallocateBuffers();
486    idb->CreateRet(matchedLineCount);
487    mGrepDriver->finalizeObject();
488}
489
490
491//
492//  Default Report Match:  lines are emitted with whatever line terminators are found in the
493//  input.  However, if the final line is not terminated, a new line is appended.
494//
495void EmitMatch::accumulate_match (const size_t lineNum, char * line_start, char * line_end) {
496    if (WithFilenameFlag) {
497        mResultStr << mLinePrefix;
498    }
499    if (LineNumberFlag) {
500        // Internally line numbers are counted from 0.  For display, adjust
501        // the line number so that lines are numbered from 1.
502        if (InitialTabFlag) {
503            mResultStr << lineNum+1 << "\t:";
504        }
505        else {
506            mResultStr << lineNum+1 << ":";
507        }
508    }
509    size_t bytes = line_end - line_start + 1;
510    mResultStr.write(line_start, bytes);
511    mLineCount++;
512    unsigned last_byte = *line_end;
513    mTerminated = (last_byte >= 0x0A) && (last_byte <= 0x0D);
514    if (LLVM_UNLIKELY(!mTerminated)) {
515        if (last_byte == 0x85) {  //  Possible NEL terminator.
516            mTerminated = (bytes >= 2) && (static_cast<unsigned>(line_end[-1]) == 0xC2);
517        }
518        else {
519            // Possible LS or PS terminators.
520            mTerminated = (bytes >= 3) && (static_cast<unsigned>(line_end[-2]) == 0xE2)
521                                       && (static_cast<unsigned>(line_end[-1]) == 0x80)
522                                       && ((last_byte == 0xA8) || (last_byte == 0xA9));
523        }
524    }
525}
526
527void EmitMatch::finalize_match(char * buffer_end) {
528    if (!mTerminated) mResultStr << "\n";
529}
530
531void EmitMatchesEngine::grepCodeGen() {
532    assert (mGrepDriver == nullptr);
533    mGrepDriver = new ParabixDriver("engine");
534    auto & idb = mGrepDriver->getBuilder();
535    Module * M = idb->getModule();
536
537    const unsigned encodingBits = 8;
538
539    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getInt64Ty(), idb->getInt8Ty(), idb->getInt32Ty(), idb->getIntAddrTy(), nullptr));
540    mainFunc->setCallingConv(CallingConv::C);
541    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
542    auto args = mainFunc->arg_begin();
543
544    Value * const useMMap = &*(args++);
545    useMMap->setName("useMMap");
546    Value * const fileDescriptor = &*(args++);
547    fileDescriptor->setName("fileDescriptor");
548    Value * match_accumulator = &*(args++);
549    match_accumulator->setName("match_accumulator");
550
551    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits));
552    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance<kernel::FDSourceKernel>(idb);
553    sourceK->setInitialArguments({useMMap, fileDescriptor});
554    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
555
556    StreamSetBuffer * LineBreakStream;
557    StreamSetBuffer * Matches;
558    std::tie(LineBreakStream, Matches) = grepPipeline(ByteStream);
559
560    kernel::Kernel * scanMatchK = mGrepDriver->addKernelInstance<kernel::ScanMatchKernel>(idb);
561    scanMatchK->setInitialArguments({match_accumulator});
562    mGrepDriver->makeKernelCall(scanMatchK, {Matches, LineBreakStream, ByteStream}, {});
563    mGrepDriver->LinkFunction(*scanMatchK, "accumulate_match_wrapper", &accumulate_match_wrapper);
564    mGrepDriver->LinkFunction(*scanMatchK, "finalize_match_wrapper", &finalize_match_wrapper);
565
566    mGrepDriver->generatePipelineIR();
567    mGrepDriver->deallocateBuffers();
568    idb->CreateRet(idb->getInt64(0));
569    mGrepDriver->finalizeObject();
570}
571
572
573//
574//  The doGrep methods apply a GrepEngine to a single file, processing the results
575//  differently based on the engine type.
576
577uint64_t GrepEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
578    typedef uint64_t (*GrepFunctionType)(bool useMMap, int32_t fileDescriptor);
579    using namespace boost::filesystem;
580    path p(fileName);
581    bool useMMap = grep::MmapFlag;
582    if (p == "-") useMMap = false;
583    if (!is_regular_file(p)) useMMap = false;
584
585    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
586
587    int32_t fileDescriptor = openFile(fileName, mResultStrs[fileIdx]);
588    if (fileDescriptor == -1) return 0;
589
590    uint64_t grepResult = f(useMMap, fileDescriptor);
591    close(fileDescriptor);
592    return grepResult;
593}
594
595uint64_t CountOnlyEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
596    uint64_t grepResult = GrepEngine::doGrep(fileName, fileIdx);
597    if (WithFilenameFlag) mResultStrs[fileIdx] << linePrefix(fileName);
598    mResultStrs[fileIdx] << grepResult << "\n";
599    return grepResult;
600}
601
602std::string GrepEngine::linePrefix(std::string fileName) {
603    if (fileName == "-") {
604        return LabelFlag + mFileSuffix;
605    }
606    else {
607        return fileName + mFileSuffix;
608    }
609}
610
611uint64_t MatchOnlyEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
612    uint64_t grepResult = GrepEngine::doGrep(fileName, fileIdx);
613    if (grepResult == mRequiredCount) {
614       mResultStrs[fileIdx] << linePrefix(fileName);
615    }
616    return grepResult;
617}
618
619uint64_t EmitMatchesEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
620    typedef uint64_t (*GrepFunctionType)(bool useMMap, int32_t fileDescriptor, intptr_t accum_addr);
621    using namespace boost::filesystem;
622    path p(fileName);
623    bool useMMap = grep::MmapFlag;
624    if (p == "-") useMMap = false;
625    if (!is_regular_file(p)) useMMap = false;
626    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
627    int32_t fileDescriptor = openFile(fileName, mResultStrs[fileIdx]);
628    if (fileDescriptor == -1) return 0;
629    EmitMatch accum(linePrefix(fileName), mResultStrs[fileIdx]);
630    f(useMMap, fileDescriptor, reinterpret_cast<intptr_t>(&accum));
631    close(fileDescriptor);
632    if (accum.mLineCount > 0) grepMatchFound = true;
633    return accum.mLineCount;
634}
635
636// Open a file and return its file desciptor.
637int32_t GrepEngine::openFile(const std::string & fileName, std::ostringstream & msgstrm) {
638    if (fileName == "-") {
639        return STDIN_FILENO;
640    }
641    else {
642        struct stat sb;
643        int32_t fileDescriptor = open(fileName.c_str(), O_RDONLY);
644        if (LLVM_UNLIKELY(fileDescriptor == -1)) {
645            if (!NoMessagesFlag) {
646                if (errno == EACCES) {
647                    msgstrm << "icgrep: " << fileName << ": Permission denied.\n";
648                }
649                else if (errno == ENOENT) {
650                    msgstrm << "icgrep: " << fileName << ": No such file.\n";
651                }
652                else {
653                    msgstrm << "icgrep: " << fileName << ": Failed.\n";
654                }
655            }
656            return fileDescriptor;
657        }
658        if (stat(fileName.c_str(), &sb) == 0 && S_ISDIR(sb.st_mode)) {
659            if (!NoMessagesFlag) {
660                msgstrm << "icgrep: " << fileName << ": Is a directory.\n";
661            }
662            close(fileDescriptor);
663            return -1;
664        }
665        return fileDescriptor;
666    }
667}
668
669// The process of searching a group of files may use a sequential or a task
670// parallel approach.
671
672void * DoGrepThreadFunction(void *args) {
673    return reinterpret_cast<GrepEngine *>(args)->DoGrepThreadMethod();
674}
675
676bool GrepEngine::searchAllFiles() {
677    const unsigned numOfThreads = std::min(static_cast<unsigned>(Threads), static_cast<unsigned>(inputFiles.size())); 
678    std::vector<pthread_t> threads(numOfThreads);
679
680    for(unsigned long i = 1; i < numOfThreads; ++i) {
681        const int rc = pthread_create(&threads[i], nullptr, DoGrepThreadFunction, (void *)this);
682        if (rc) {
683            llvm::report_fatal_error("Failed to create thread: code " + std::to_string(rc));
684        }
685    }
686    // Main thread also does the work;
687    DoGrepThreadMethod();
688    for(unsigned i = 1; i < numOfThreads; ++i) {
689        void * status = nullptr;
690        const int rc = pthread_join(threads[i], &status);
691        if (rc) {
692            llvm::report_fatal_error("Failed to join thread: code " + std::to_string(rc));
693        }
694    }
695    return grepMatchFound;
696}
697
698
699// DoGrep thread function.
700void * GrepEngine::DoGrepThreadMethod() {
701
702    unsigned fileIdx = mNextFileToGrep++;
703    while (fileIdx < inputFiles.size()) {
704        const auto grepResult = doGrep(inputFiles[fileIdx], fileIdx);
705        mFileStatus[fileIdx] = FileStatus::GrepComplete;
706        if (grepResult > 0) {
707            grepMatchFound = true;
708        }
709        if (QuietMode && grepMatchFound) {
710            if (pthread_self() != mEngineThread) {
711                pthread_exit(nullptr);
712            }
713            return nullptr;
714        }
715        fileIdx = mNextFileToGrep++;
716    }
717
718    unsigned printIdx = mNextFileToPrint++;
719    while (printIdx < inputFiles.size()) {
720        const bool readyToPrint = ((printIdx == 0) || (mFileStatus[printIdx - 1] == FileStatus::PrintComplete)) && (mFileStatus[printIdx] == FileStatus::GrepComplete);
721        if (readyToPrint) {
722            const auto output = mResultStrs[printIdx].str();
723            if (!output.empty()) {
724                llvm::outs() << output;
725            }
726            mFileStatus[printIdx] = FileStatus::PrintComplete;
727            printIdx = mNextFileToPrint++;
728        } else {
729            mGrepDriver->performIncrementalCacheCleanupStep();
730        }
731        sched_yield();
732    }
733
734    if (pthread_self() != mEngineThread) {
735        pthread_exit(nullptr);
736    } else {
737        // Always perform one final cache cleanup step.
738        mGrepDriver->performIncrementalCacheCleanupStep();
739    }
740    return nullptr;
741}
742
743}
Note: See TracBrowser for help on using the repository browser.