source: icGREP/icgrep-devel/icgrep/grep/grep_engine.cpp @ 5944

Last change on this file since 5944 was 5944, checked in by cameron, 12 months ago

Common command line file selection utility for icgrep, wc ...

File size: 33.3 KB
Line 
1/*
2 *  Copyright (c) 2018 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6#include <set>
7#include "grep_engine.h"
8#include "grep_interface.h"
9#include <llvm/IR/Module.h>
10#include <boost/filesystem.hpp>
11#include <UCD/resolve_properties.h>
12#include <kernels/charclasses.h>
13#include <kernels/cc_kernel.h>
14#include <kernels/grep_kernel.h>
15#include <kernels/UCD_property_kernel.h>
16#include <kernels/grapheme_kernel.h>
17#include <kernels/linebreak_kernel.h>
18#include <kernels/streams_merge.h>
19#include <kernels/source_kernel.h>
20#include <kernels/s2p_kernel.h>
21#include <kernels/scanmatchgen.h>
22#include <kernels/streamset.h>
23#include <kernels/until_n.h>
24#include <kernels/kernel_builder.h>
25#include <pablo/pablo_kernel.h>
26#include <cc/alphabet.h>
27#include <re/re_cc.h>
28#include <re/re_name.h>
29#include <re/casing.h>
30#include <re/exclude_CC.h>
31#include <re/to_utf8.h>
32#include <re/re_toolchain.h>
33#include <toolchain/toolchain.h>
34#include <re/re_analysis.h>
35#include <re/re_name_resolve.h>
36#include <re/re_name_gather.h>
37#include <re/collect_ccs.h>
38#include <re/replaceCC.h>
39#include <re/re_multiplex.h>
40#include <re/grapheme_clusters.h>
41#include <re/printer_re.h>
42#include <toolchain/toolchain.h>
43#include <toolchain/cpudriver.h>
44#include <iostream>
45#include <cc/multiplex_CCs.h>
46#include <llvm/Support/raw_ostream.h>
47#include <util/aligned_allocator.h>
48#include <util/file_select.h>
49#include <sys/stat.h>
50#include <fcntl.h>
51#include <errno.h>
52#include <llvm/ADT/STLExtras.h> // for make_unique
53#include <llvm/Support/CommandLine.h>
54#include <llvm/Support/Debug.h>
55#include <sched.h>
56
57using namespace parabix;
58using namespace llvm;
59using namespace cc;
60using namespace kernel;
61
62static cl::opt<int> Threads("t", cl::desc("Total number of threads."), cl::init(2));
63static cl::opt<bool> PabloTransposition("enable-pablo-s2p", cl::desc("Enable experimental pablo transposition."));
64static cl::opt<bool> CC_Multiplexing("CC-multiplexing", cl::desc("Enable CC multiplexing."), cl::init(false));
65static cl::opt<bool> PropertyKernels("enable-property-kernels", cl::desc("Enable Unicode property kernels."), cl::init(false));
66static cl::opt<bool> MultithreadedSimpleRE("enable-simple-RE-kernels", cl::desc("Enable individual CC kernels for simple REs."), cl::init(false));
67const unsigned DefaultByteCClimit = 6;
68
69static cl::opt<unsigned> ByteCClimit("byte-CC-limit", cl::desc("Max number of CCs for byte CC pipeline."), cl::init(DefaultByteCClimit));
70
71
72namespace grep {
73   
74
75extern "C" void accumulate_match_wrapper(intptr_t accum_addr, const size_t lineNum, char * line_start, char * line_end) {
76    reinterpret_cast<MatchAccumulator *>(accum_addr)->accumulate_match(lineNum, line_start, line_end);
77}
78
79extern "C" void finalize_match_wrapper(intptr_t accum_addr, char * buffer_end) {
80    reinterpret_cast<MatchAccumulator *>(accum_addr)->finalize_match(buffer_end);
81}
82
83void grepBuffer(re::RE * pattern, const char * search_buffer, size_t bufferLength, MatchAccumulator * accum) {
84    const unsigned segmentSize = codegen::BufferSegments * codegen::SegmentSize * codegen::ThreadNum;
85    auto segParallelModeSave = codegen::SegmentPipelineParallel;
86    codegen::SegmentPipelineParallel = false;
87   
88    pattern = resolveCaseInsensitiveMode(pattern, false);
89    pattern = regular_expression_passes(pattern);
90    pattern = re::exclude_CC(pattern, re::makeByte(0x0A));
91    pattern = resolveAnchors(pattern, re::makeByte(0x0A));
92
93    ParabixDriver pxDriver("codepointEngine");
94    auto & idb = pxDriver.getBuilder();
95    Module * M = idb->getModule();
96   
97    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getVoidTy(), idb->getInt8PtrTy(), idb->getSizeTy(), nullptr));
98    mainFunc->setCallingConv(CallingConv::C);
99    auto args = mainFunc->arg_begin();
100    Value * const buffer = &*(args++);
101    buffer->setName("buffer");
102    Value * length = &*(args++);
103    length->setName("length");
104   
105    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
106    StreamSetBuffer * ByteStream = pxDriver.addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, 8));
107    kernel::Kernel * sourceK = pxDriver.addKernelInstance<kernel::MemorySourceKernel>(idb, idb->getInt8PtrTy());
108    sourceK->setInitialArguments({buffer, length});
109    pxDriver.makeKernelCall(sourceK, {}, {ByteStream});
110   
111   
112    StreamSetBuffer * BasisBits = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(8, 1), segmentSize);
113    kernel::Kernel * s2pk = pxDriver.addKernelInstance<kernel::S2PKernel>(idb);
114    pxDriver.makeKernelCall(s2pk, {ByteStream}, {BasisBits});
115   
116    StreamSetBuffer * LineFeedStream = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
117    kernel::Kernel * linefeedK = pxDriver.addKernelInstance<kernel::LineFeedKernelBuilder>(idb, Binding{idb->getStreamSetTy(8), "basis", FixedRate(), Principal()});
118    pxDriver.makeKernelCall(linefeedK, {BasisBits}, {LineFeedStream});
119   
120    StreamSetBuffer * LineBreakStream = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
121   
122    kernel::Kernel * requiredStreamsK = pxDriver.addKernelInstance<kernel::RequiredStreams_UTF8>(idb);
123    StreamSetBuffer * RequiredStreams = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
124    pxDriver.makeKernelCall(requiredStreamsK, {BasisBits, LineFeedStream}, {RequiredStreams, LineBreakStream});
125   
126    StreamSetBuffer * MatchResults = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
127    kernel::Kernel * icgrepK = pxDriver.addKernelInstance<kernel::ICGrepKernel>(idb, pattern, std::vector<std::string>{"UTF8_LB", "UTF8_nonfinal"});
128    pxDriver.makeKernelCall(icgrepK, {BasisBits, LineBreakStream, RequiredStreams}, {MatchResults});
129   
130    StreamSetBuffer * MatchedLines = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
131    kernel::Kernel * matchedLinesK = pxDriver.addKernelInstance<kernel::MatchedLinesKernel>(idb);
132    pxDriver.makeKernelCall(matchedLinesK, {MatchResults, LineBreakStream}, {MatchedLines});
133   
134    kernel::Kernel * scanMatchK = pxDriver.addKernelInstance<kernel::ScanMatchKernel>(idb);
135    scanMatchK->setInitialArguments({ConstantInt::get(idb->getIntAddrTy(), reinterpret_cast<intptr_t>(accum))});
136    pxDriver.makeKernelCall(scanMatchK, {MatchedLines, LineBreakStream, ByteStream}, {});
137    pxDriver.LinkFunction(*scanMatchK, "accumulate_match_wrapper", &accumulate_match_wrapper);
138    pxDriver.LinkFunction(*scanMatchK, "finalize_match_wrapper", &finalize_match_wrapper);
139   
140    pxDriver.generatePipelineIR();
141    pxDriver.deallocateBuffers();
142    idb->CreateRetVoid();
143    pxDriver.finalizeObject();
144   
145    typedef void (*GrepFunctionType)(const char * buffer, const size_t length);
146    auto f = reinterpret_cast<GrepFunctionType>(pxDriver.getMain());
147    f(search_buffer, bufferLength);
148    codegen::SegmentPipelineParallel = segParallelModeSave;
149}
150
151
152
153// Grep Engine construction and initialization.
154
155GrepEngine::GrepEngine() :
156    mGrepDriver(nullptr),
157    mNextFileToGrep(0),
158    mNextFileToPrint(0),
159    grepMatchFound(false),
160    mGrepRecordBreak(GrepRecordBreakKind::LF),
161    mMoveMatchesToEOL(true),
162    mEngineThread(pthread_self()) {}
163
164GrepEngine::~GrepEngine() {
165    delete mGrepDriver;
166}
167
168QuietModeEngine::QuietModeEngine() : GrepEngine() {
169    mMoveMatchesToEOL = false;
170}
171
172MatchOnlyEngine::MatchOnlyEngine(bool showFilesWithoutMatch) :
173    GrepEngine(), mRequiredCount(showFilesWithoutMatch) {
174    mFileSuffix = NullFlag ? std::string("\0", 1) : "\n";
175    mMoveMatchesToEOL = false;
176}
177
178CountOnlyEngine::CountOnlyEngine() : GrepEngine() {
179    mFileSuffix = ":";
180}
181
182EmitMatchesEngine::EmitMatchesEngine() : GrepEngine() {
183    mFileSuffix = InitialTabFlag ? "\t:" : ":";
184    if (LineRegexpFlag) mMoveMatchesToEOL = false;
185}
186
187   
188void GrepEngine::setRecordBreak(GrepRecordBreakKind b) {
189    mGrepRecordBreak = b;
190}
191
192   
193
194   
195void GrepEngine::initFileResult(std::vector<std::string> & filenames) {
196    const unsigned n = filenames.size();
197    mResultStrs.resize(n);
198    mFileStatus.resize(n, FileStatus::Pending);
199    inputFiles = filenames;
200}
201
202void GrepEngine::initREs(std::vector<re::RE *> & REs) {
203    if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
204        mBreakCC = re::makeCC(re::makeCC(0x0A, 0x0D), re::makeCC(re::makeCC(0x85), re::makeCC(0x2028, 0x2029)));
205    } else if (mGrepRecordBreak == GrepRecordBreakKind::Null) {
206        mBreakCC = re::makeByte(0);  // Null
207    } else {
208        mBreakCC = re::makeByte(0x0A); // LF
209    }
210    re::RE * anchorRE = mBreakCC;
211    if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
212        re::Name * anchorName = re::makeName("UTF8_LB", re::Name::Type::Unicode);
213        anchorName->setDefinition(UCD::UnicodeBreakRE());
214        anchorRE = anchorName;
215    }
216   
217    mREs = REs;
218    bool allAnchored = true;
219    for(unsigned i = 0; i < mREs.size(); ++i) {
220        if (!hasEndAnchor(mREs[i])) allAnchored = false;
221        mREs[i] = resolveModesAndExternalSymbols(mREs[i]);
222        mREs[i] = re::exclude_CC(mREs[i], mBreakCC);
223        mREs[i] = resolveAnchors(mREs[i], anchorRE);
224        re::gatherUnicodeProperties(mREs[i], mUnicodeProperties);
225        mREs[i] = regular_expression_passes(mREs[i]);
226    }
227    if (allAnchored && (mGrepRecordBreak != GrepRecordBreakKind::Unicode)) mMoveMatchesToEOL = false;
228
229}
230
231
232   
233// Code Generation
234//
235// All engines share a common pipeline to compute a stream of Matches from a given input Bytestream.
236
237unsigned LLVM_READNONE calculateMaxCountRate(const std::unique_ptr<kernel::KernelBuilder> & b) {
238    const unsigned packSize = b->getSizeTy()->getBitWidth();
239    return (packSize * packSize) / b->getBitBlockWidth();
240}
241   
242std::pair<StreamSetBuffer *, StreamSetBuffer *> GrepEngine::grepPipeline(StreamSetBuffer * ByteStream) {
243    auto & idb = mGrepDriver->getBuilder();
244    const unsigned segmentSize = codegen::SegmentSize;
245    const unsigned bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
246    // TODO: until we automate stream buffer sizing, use this calculation to determine how large our matches buffer needs to be.
247    const unsigned baseBufferSize = segmentSize * (MaxCountFlag > 0 ? (std::max(bufferSegments, calculateMaxCountRate(idb))) : bufferSegments);
248    const unsigned encodingBits = 8;
249   
250   
251    //  Regular Expression Processing and Analysis Phase
252    const auto nREs = mREs.size();
253    bool hasGCB[nREs];
254    bool anyGCB = false;
255
256    for(unsigned i = 0; i < nREs; ++i) {
257        hasGCB[i] = hasGraphemeClusterBoundary(mREs[i]);
258        anyGCB |= hasGCB[i];
259    }
260    StreamSetBuffer * LineBreakStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
261    std::vector<StreamSetBuffer *> MatchResultsBufs(nREs);
262   
263    re::RE * prefixRE;
264    re::RE * suffixRE;
265    // For simple regular expressions with a small number of characters, we
266    // can bypass transposition and use the Direct CC compiler.
267    bool isSimple = (nREs == 1) && (mGrepRecordBreak != GrepRecordBreakKind::Unicode) && (!anyGCB);
268    if (isSimple) {
269        mREs[0] = toUTF8(mREs[0]);
270    }
271    if (isSimple && byteTestsWithinLimit(mREs[0], ByteCClimit)) {
272        std::vector<std::string> externalStreamNames;
273        std::vector<StreamSetBuffer *> icgrepInputSets = {ByteStream};
274        if (MultithreadedSimpleRE && hasTriCCwithinLimit(mREs[0], ByteCClimit, prefixRE, suffixRE)) {
275            auto CCs = re::collectCCs(prefixRE, &cc::Byte);
276            for (auto cc : CCs) {
277                auto ccName = makeName(cc);
278                mREs[0] = re::replaceCC(mREs[0], cc, ccName);
279                std::string ccNameStr = ccName->getFullName();
280                StreamSetBuffer * ccStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
281                kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::DirectCharacterClassKernelBuilder>(idb, ccNameStr, std::vector<re::CC *>{cc});
282                mGrepDriver->makeKernelCall(ccK, {ByteStream}, {ccStream});
283                externalStreamNames.push_back(ccNameStr);
284                icgrepInputSets.push_back(ccStream);
285            }
286        }
287        StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
288        kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ByteGrepKernel>(idb, mREs[0], externalStreamNames);
289        mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
290        MatchResultsBufs[0] = MatchResults;
291        kernel::Kernel * breakK = mGrepDriver->addKernelInstance<kernel::DirectCharacterClassKernelBuilder>(idb, "breakCC", std::vector<re::CC *>{mBreakCC});
292        mGrepDriver->makeKernelCall(breakK, {ByteStream}, {LineBreakStream});
293    } else if (isSimple && hasTriCCwithinLimit(mREs[0], ByteCClimit, prefixRE, suffixRE)) {
294        std::vector<std::string> externalStreamNames;
295        std::vector<StreamSetBuffer *> icgrepInputSets = {ByteStream};
296        if (MultithreadedSimpleRE) {
297            auto CCs = re::collectCCs(prefixRE, &cc::Byte);
298            for (auto cc : CCs) {
299                auto ccName = makeName(cc);
300                mREs[0] = re::replaceCC(mREs[0], cc, ccName);
301                std::string ccNameStr = ccName->getFullName();
302                StreamSetBuffer * ccStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
303                kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::DirectCharacterClassKernelBuilder>(idb, ccNameStr, std::vector<re::CC *>{cc});
304                mGrepDriver->makeKernelCall(ccK, {ByteStream}, {ccStream});
305                externalStreamNames.push_back(ccNameStr);
306                icgrepInputSets.push_back(ccStream);
307            }
308        }
309        StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
310        kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ByteBitGrepKernel>(idb, prefixRE, suffixRE, externalStreamNames);
311        mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
312        MatchResultsBufs[0] = MatchResults;
313        kernel::Kernel * breakK = mGrepDriver->addKernelInstance<kernel::DirectCharacterClassKernelBuilder>(idb, "breakCC", std::vector<re::CC *>{mBreakCC});
314        mGrepDriver->makeKernelCall(breakK, {ByteStream}, {LineBreakStream});
315    } else {
316       
317        StreamSetBuffer * BasisBits = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(encodingBits, 1), baseBufferSize);
318        kernel::Kernel * s2pk = nullptr;
319        if (PabloTransposition) {
320            s2pk = mGrepDriver->addKernelInstance<kernel::S2P_PabloKernel>(idb);
321        }
322        else {
323            s2pk = mGrepDriver->addKernelInstance<kernel::S2PKernel>(idb);
324        }
325        mGrepDriver->makeKernelCall(s2pk, {ByteStream}, {BasisBits});
326
327        StreamSetBuffer * RequiredStreams = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
328        StreamSetBuffer * UnicodeLB = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
329
330        StreamSetBuffer * LineFeedStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
331        kernel::Kernel * linefeedK = mGrepDriver->addKernelInstance<kernel::LineFeedKernelBuilder>(idb, Binding{idb->getStreamSetTy(8), "basis", FixedRate(), Principal()});
332        mGrepDriver->makeKernelCall(linefeedK, {BasisBits}, {LineFeedStream});
333       
334        kernel::Kernel * requiredStreamsK = mGrepDriver->addKernelInstance<kernel::RequiredStreams_UTF8>(idb);
335        mGrepDriver->makeKernelCall(requiredStreamsK, {BasisBits, LineFeedStream}, {RequiredStreams, UnicodeLB});
336
337        if (mGrepRecordBreak == GrepRecordBreakKind::LF) {
338            LineBreakStream = LineFeedStream;
339        } else if (mGrepRecordBreak == GrepRecordBreakKind::Null) {
340            kernel::Kernel * breakK = mGrepDriver->addKernelInstance<kernel::ParabixCharacterClassKernelBuilder>(idb, "Null", std::vector<re::CC *>{mBreakCC}, 8);
341            mGrepDriver->makeKernelCall(breakK, {BasisBits}, {LineBreakStream});
342        } else {
343            LineBreakStream = UnicodeLB;
344        }
345       
346        std::map<std::string, StreamSetBuffer *> propertyStream;
347        if (PropertyKernels) {
348            for (auto p : mUnicodeProperties) {
349                auto name = p->getFullName();
350                StreamSetBuffer * s = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
351                propertyStream.emplace(std::make_pair(name, s));
352                kernel::Kernel * propertyK = mGrepDriver->addKernelInstance<kernel::UnicodePropertyKernelBuilder>(idb, p);
353                mGrepDriver->makeKernelCall(propertyK, {BasisBits}, {s});
354            }
355        }
356        StreamSetBuffer * GCB_stream = nullptr;
357        if (anyGCB) {
358            GCB_stream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
359            kernel::Kernel * gcbK = mGrepDriver->addKernelInstance<kernel::GraphemeClusterBreakKernel>(idb);
360            mGrepDriver->makeKernelCall(gcbK, {BasisBits, RequiredStreams}, {GCB_stream});
361        }
362
363        for(unsigned i = 0; i < nREs; ++i) {
364            std::vector<std::string> externalStreamNames;
365            std::vector<StreamSetBuffer *> icgrepInputSets = {BasisBits};
366            if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
367                externalStreamNames.push_back("UTF8_LB");
368                icgrepInputSets.push_back(LineBreakStream);
369                externalStreamNames.push_back("UTF8_nonfinal");
370                icgrepInputSets.push_back(RequiredStreams);
371            }
372            std::set<re::Name *> UnicodeProperties;
373            if (PropertyKernels) {
374                re::gatherUnicodeProperties(mREs[i], UnicodeProperties);
375                for (auto p : UnicodeProperties) {
376                    auto name = p->getFullName();
377                    auto f = propertyStream.find(name);
378                    if (f == propertyStream.end()) report_fatal_error(name + " not found\n");
379                    externalStreamNames.push_back(name);
380                    icgrepInputSets.push_back(f->second);
381                }
382            }
383            if (hasGCB[i]) {
384                externalStreamNames.push_back("\\b{g}");
385                icgrepInputSets.push_back(GCB_stream);
386            }
387            if (CC_Multiplexing) {
388                const auto UnicodeSets = re::collectCCs(mREs[i], &cc::Unicode, std::set<re::Name *>({re::makeZeroWidth("\\b{g}")}));
389                StreamSetBuffer * const MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
390                if (UnicodeSets.size() <= 1) {
391                    kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, mREs[i], externalStreamNames);
392                    mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
393                    MatchResultsBufs[i] = MatchResults;
394                } else {
395                    mpx = make_unique<MultiplexedAlphabet>("mpx", UnicodeSets);
396                    mREs[i] = transformCCs(mpx.get(), mREs[i]);
397                    std::vector<re::CC *> mpx_basis = mpx->getMultiplexedCCs();
398                    auto numOfCharacterClasses = mpx_basis.size();
399                    StreamSetBuffer * CharClasses = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), baseBufferSize);
400                    kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis));
401                    mGrepDriver->makeKernelCall(ccK, {BasisBits}, {CharClasses});
402    //                kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis), true);
403    //                mGrepDriver->makeKernelCall(ccK, {ByteStream}, {CharClasses});
404                    kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, mREs[i], externalStreamNames, std::vector<cc::Alphabet *>{mpx.get()});
405                    icgrepInputSets.push_back(CharClasses);
406                    mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
407                    MatchResultsBufs[i] = MatchResults;
408                }
409            } else {
410                StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
411                kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, mREs[i], externalStreamNames);
412                mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
413                MatchResultsBufs[i] = MatchResults;
414            }
415        }
416    }
417
418    StreamSetBuffer * MergedResults = MatchResultsBufs[0];
419    if (mREs.size() > 1) {
420        MergedResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
421        kernel::Kernel * streamsMergeK = mGrepDriver->addKernelInstance<kernel::StreamsMerge>(idb, 1, mREs.size());
422        mGrepDriver->makeKernelCall(streamsMergeK, MatchResultsBufs, {MergedResults});
423    }
424    StreamSetBuffer * Matches = MergedResults;
425    if (mMoveMatchesToEOL) {
426        StreamSetBuffer * OriginalMatches = Matches;
427        kernel::Kernel * matchedLinesK = mGrepDriver->addKernelInstance<kernel::MatchedLinesKernel>(idb);
428        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
429        mGrepDriver->makeKernelCall(matchedLinesK, {OriginalMatches, LineBreakStream}, {Matches});
430    }
431    if (InvertMatchFlag) {
432        kernel::Kernel * invertK = mGrepDriver->addKernelInstance<kernel::InvertMatchesKernel>(idb);
433        StreamSetBuffer * OriginalMatches = Matches;
434        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
435        mGrepDriver->makeKernelCall(invertK, {OriginalMatches, LineBreakStream}, {Matches});
436    }
437    if (MaxCountFlag > 0) {
438        kernel::Kernel * untilK = mGrepDriver->addKernelInstance<kernel::UntilNkernel>(idb);
439        untilK->setInitialArguments({idb->getSize(MaxCountFlag)});
440        StreamSetBuffer * const AllMatches = Matches;
441        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
442        mGrepDriver->makeKernelCall(untilK, {AllMatches}, {Matches});
443    }
444
445    return std::pair<StreamSetBuffer *, StreamSetBuffer *>(LineBreakStream, Matches);
446}
447
448// The QuietMode, MatchOnly and CountOnly engines share a common code generation main function,
449// which returns a count of the matches found (possibly subject to a MaxCount).
450//
451
452void GrepEngine::grepCodeGen() {
453
454    assert (mGrepDriver == nullptr);
455    mGrepDriver = new ParabixDriver("engine");
456    auto & idb = mGrepDriver->getBuilder();
457    Module * M = idb->getModule();
458
459    const unsigned encodingBits = 8;
460
461    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getInt64Ty(), idb->getInt8Ty(), idb->getInt32Ty(), nullptr));
462    mainFunc->setCallingConv(CallingConv::C);
463    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
464    auto args = mainFunc->arg_begin();
465
466    Value * const useMMap = &*(args++);
467    useMMap->setName("useMMap");
468    Value * const fileDescriptor = &*(args++);
469    fileDescriptor->setName("fileDescriptor");
470
471    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits));
472    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance<kernel::FDSourceKernel>(idb);
473    sourceK->setInitialArguments({useMMap, fileDescriptor});
474    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
475
476    StreamSetBuffer * LineBreakStream;
477    StreamSetBuffer * Matches;
478    std::tie(LineBreakStream, Matches) = grepPipeline(ByteStream);
479
480    kernel::Kernel * matchCountK = mGrepDriver->addKernelInstance<kernel::PopcountKernel>(idb);
481    mGrepDriver->makeKernelCall(matchCountK, {Matches}, {});
482    mGrepDriver->generatePipelineIR();
483    idb->setKernel(matchCountK);
484    Value * matchedLineCount = idb->getAccumulator("countResult");
485    matchedLineCount = idb->CreateZExt(matchedLineCount, idb->getInt64Ty());
486    mGrepDriver->deallocateBuffers();
487    idb->CreateRet(matchedLineCount);
488    mGrepDriver->finalizeObject();
489}
490
491
492//
493//  Default Report Match:  lines are emitted with whatever line terminators are found in the
494//  input.  However, if the final line is not terminated, a new line is appended.
495//
496void EmitMatch::accumulate_match (const size_t lineNum, char * line_start, char * line_end) {
497    if (WithFilenameFlag) {
498        mResultStr << mLinePrefix;
499    }
500    if (LineNumberFlag) {
501        // Internally line numbers are counted from 0.  For display, adjust
502        // the line number so that lines are numbered from 1.
503        if (InitialTabFlag) {
504            mResultStr << lineNum+1 << "\t:";
505        }
506        else {
507            mResultStr << lineNum+1 << ":";
508        }
509    }
510    size_t bytes = line_end - line_start + 1;
511    mResultStr.write(line_start, bytes);
512    mLineCount++;
513    unsigned last_byte = *line_end;
514    mTerminated = (last_byte >= 0x0A) && (last_byte <= 0x0D);
515    if (LLVM_UNLIKELY(!mTerminated)) {
516        if (last_byte == 0x85) {  //  Possible NEL terminator.
517            mTerminated = (bytes >= 2) && (static_cast<unsigned>(line_end[-1]) == 0xC2);
518        }
519        else {
520            // Possible LS or PS terminators.
521            mTerminated = (bytes >= 3) && (static_cast<unsigned>(line_end[-2]) == 0xE2)
522                                       && (static_cast<unsigned>(line_end[-1]) == 0x80)
523                                       && ((last_byte == 0xA8) || (last_byte == 0xA9));
524        }
525    }
526}
527
528void EmitMatch::finalize_match(char * buffer_end) {
529    if (!mTerminated) mResultStr << "\n";
530}
531
532void EmitMatchesEngine::grepCodeGen() {
533    assert (mGrepDriver == nullptr);
534    mGrepDriver = new ParabixDriver("engine");
535    auto & idb = mGrepDriver->getBuilder();
536    Module * M = idb->getModule();
537
538    const unsigned encodingBits = 8;
539
540    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getInt64Ty(), idb->getInt8Ty(), idb->getInt32Ty(), idb->getIntAddrTy(), nullptr));
541    mainFunc->setCallingConv(CallingConv::C);
542    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
543    auto args = mainFunc->arg_begin();
544
545    Value * const useMMap = &*(args++);
546    useMMap->setName("useMMap");
547    Value * const fileDescriptor = &*(args++);
548    fileDescriptor->setName("fileDescriptor");
549    Value * match_accumulator = &*(args++);
550    match_accumulator->setName("match_accumulator");
551
552    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits));
553    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance<kernel::FDSourceKernel>(idb);
554    sourceK->setInitialArguments({useMMap, fileDescriptor});
555    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
556
557    StreamSetBuffer * LineBreakStream;
558    StreamSetBuffer * Matches;
559    std::tie(LineBreakStream, Matches) = grepPipeline(ByteStream);
560
561    kernel::Kernel * scanMatchK = mGrepDriver->addKernelInstance<kernel::ScanMatchKernel>(idb);
562    scanMatchK->setInitialArguments({match_accumulator});
563    mGrepDriver->makeKernelCall(scanMatchK, {Matches, LineBreakStream, ByteStream}, {});
564    mGrepDriver->LinkFunction(*scanMatchK, "accumulate_match_wrapper", &accumulate_match_wrapper);
565    mGrepDriver->LinkFunction(*scanMatchK, "finalize_match_wrapper", &finalize_match_wrapper);
566
567    mGrepDriver->generatePipelineIR();
568    mGrepDriver->deallocateBuffers();
569    idb->CreateRet(idb->getInt64(0));
570    mGrepDriver->finalizeObject();
571}
572
573
574//
575//  The doGrep methods apply a GrepEngine to a single file, processing the results
576//  differently based on the engine type.
577
578uint64_t GrepEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
579    typedef uint64_t (*GrepFunctionType)(bool useMMap, int32_t fileDescriptor);
580    using namespace boost::filesystem;
581    path p(fileName);
582    bool useMMap = argv::MmapFlag;
583    if (p == "-") useMMap = false;
584    if (!is_regular_file(p)) useMMap = false;
585
586    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
587
588    int32_t fileDescriptor = openFile(fileName, mResultStrs[fileIdx]);
589    if (fileDescriptor == -1) return 0;
590
591    uint64_t grepResult = f(useMMap, fileDescriptor);
592    close(fileDescriptor);
593    return grepResult;
594}
595
596uint64_t CountOnlyEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
597    uint64_t grepResult = GrepEngine::doGrep(fileName, fileIdx);
598    if (WithFilenameFlag) mResultStrs[fileIdx] << linePrefix(fileName);
599    mResultStrs[fileIdx] << grepResult << "\n";
600    return grepResult;
601}
602
603std::string GrepEngine::linePrefix(std::string fileName) {
604    if (fileName == "-") {
605        return LabelFlag + mFileSuffix;
606    }
607    else {
608        return fileName + mFileSuffix;
609    }
610}
611
612uint64_t MatchOnlyEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
613    uint64_t grepResult = GrepEngine::doGrep(fileName, fileIdx);
614    if (grepResult == mRequiredCount) {
615       mResultStrs[fileIdx] << linePrefix(fileName);
616    }
617    return grepResult;
618}
619
620uint64_t EmitMatchesEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
621    typedef uint64_t (*GrepFunctionType)(bool useMMap, int32_t fileDescriptor, intptr_t accum_addr);
622    using namespace boost::filesystem;
623    path p(fileName);
624    bool useMMap = argv::MmapFlag;
625    if (p == "-") useMMap = false;
626    if (!is_regular_file(p)) useMMap = false;
627    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
628    int32_t fileDescriptor = openFile(fileName, mResultStrs[fileIdx]);
629    if (fileDescriptor == -1) return 0;
630    EmitMatch accum(linePrefix(fileName), mResultStrs[fileIdx]);
631    f(useMMap, fileDescriptor, reinterpret_cast<intptr_t>(&accum));
632    close(fileDescriptor);
633    if (accum.mLineCount > 0) grepMatchFound = true;
634    return accum.mLineCount;
635}
636
637// Open a file and return its file desciptor.
638int32_t GrepEngine::openFile(const std::string & fileName, std::ostringstream & msgstrm) {
639    if (fileName == "-") {
640        return STDIN_FILENO;
641    }
642    else {
643        struct stat sb;
644        int32_t fileDescriptor = open(fileName.c_str(), O_RDONLY);
645        if (LLVM_UNLIKELY(fileDescriptor == -1)) {
646            if (!NoMessagesFlag) {
647                if (errno == EACCES) {
648                    msgstrm << "icgrep: " << fileName << ": Permission denied.\n";
649                }
650                else if (errno == ENOENT) {
651                    msgstrm << "icgrep: " << fileName << ": No such file.\n";
652                }
653                else {
654                    msgstrm << "icgrep: " << fileName << ": Failed.\n";
655                }
656            }
657            return fileDescriptor;
658        }
659        if (stat(fileName.c_str(), &sb) == 0 && S_ISDIR(sb.st_mode)) {
660            if (!NoMessagesFlag) {
661                msgstrm << "icgrep: " << fileName << ": Is a directory.\n";
662            }
663            close(fileDescriptor);
664            return -1;
665        }
666        return fileDescriptor;
667    }
668}
669
670// The process of searching a group of files may use a sequential or a task
671// parallel approach.
672
673void * DoGrepThreadFunction(void *args) {
674    return reinterpret_cast<GrepEngine *>(args)->DoGrepThreadMethod();
675}
676
677bool GrepEngine::searchAllFiles() {
678    const unsigned numOfThreads = std::min(static_cast<unsigned>(Threads), static_cast<unsigned>(inputFiles.size())); 
679    std::vector<pthread_t> threads(numOfThreads);
680
681    for(unsigned long i = 1; i < numOfThreads; ++i) {
682        const int rc = pthread_create(&threads[i], nullptr, DoGrepThreadFunction, (void *)this);
683        if (rc) {
684            llvm::report_fatal_error("Failed to create thread: code " + std::to_string(rc));
685        }
686    }
687    // Main thread also does the work;
688    DoGrepThreadMethod();
689    for(unsigned i = 1; i < numOfThreads; ++i) {
690        void * status = nullptr;
691        const int rc = pthread_join(threads[i], &status);
692        if (rc) {
693            llvm::report_fatal_error("Failed to join thread: code " + std::to_string(rc));
694        }
695    }
696    return grepMatchFound;
697}
698
699
700// DoGrep thread function.
701void * GrepEngine::DoGrepThreadMethod() {
702
703    unsigned fileIdx = mNextFileToGrep++;
704    while (fileIdx < inputFiles.size()) {
705        const auto grepResult = doGrep(inputFiles[fileIdx], fileIdx);
706        mFileStatus[fileIdx] = FileStatus::GrepComplete;
707        if (grepResult > 0) {
708            grepMatchFound = true;
709        }
710        if (QuietMode && grepMatchFound) {
711            if (pthread_self() != mEngineThread) {
712                pthread_exit(nullptr);
713            }
714            return nullptr;
715        }
716        fileIdx = mNextFileToGrep++;
717    }
718
719    unsigned printIdx = mNextFileToPrint++;
720    while (printIdx < inputFiles.size()) {
721        const bool readyToPrint = ((printIdx == 0) || (mFileStatus[printIdx - 1] == FileStatus::PrintComplete)) && (mFileStatus[printIdx] == FileStatus::GrepComplete);
722        if (readyToPrint) {
723            const auto output = mResultStrs[printIdx].str();
724            if (!output.empty()) {
725                llvm::outs() << output;
726            }
727            mFileStatus[printIdx] = FileStatus::PrintComplete;
728            printIdx = mNextFileToPrint++;
729        } else {
730            mGrepDriver->performIncrementalCacheCleanupStep();
731        }
732        sched_yield();
733    }
734
735    if (pthread_self() != mEngineThread) {
736        pthread_exit(nullptr);
737    } else {
738        // Always perform one final cache cleanup step.
739        mGrepDriver->performIncrementalCacheCleanupStep();
740    }
741    return nullptr;
742}
743
744}
Note: See TracBrowser for help on using the repository browser.