source: icGREP/icgrep-devel/icgrep/grep/grep_engine.cpp @ 5964

Last change on this file since 5964 was 5964, checked in by cameron, 12 months ago

Restructuring in preparation for command-line file include/exclude GLOBs

File size: 37.9 KB
Line 
1/*
2 *  Copyright (c) 2018 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6#include <set>
7#include "grep_engine.h"
8#include <llvm/IR/Module.h>
9#include <boost/filesystem.hpp>
10#include <UCD/resolve_properties.h>
11#include <kernels/charclasses.h>
12#include <kernels/cc_kernel.h>
13#include <kernels/grep_kernel.h>
14#include <kernels/UCD_property_kernel.h>
15#include <kernels/grapheme_kernel.h>
16#include <kernels/linebreak_kernel.h>
17#include <kernels/streams_merge.h>
18#include <kernels/source_kernel.h>
19#include <kernels/s2p_kernel.h>
20#include <kernels/scanmatchgen.h>
21#include <kernels/streamset.h>
22#include <kernels/until_n.h>
23#include <kernels/kernel_builder.h>
24#include <pablo/pablo_kernel.h>
25#include <cc/alphabet.h>
26#include <re/re_cc.h>
27#include <re/re_name.h>
28#include <re/casing.h>
29#include <re/exclude_CC.h>
30#include <re/to_utf8.h>
31#include <re/re_toolchain.h>
32#include <toolchain/toolchain.h>
33#include <re/re_analysis.h>
34#include <re/re_name_resolve.h>
35#include <re/re_name_gather.h>
36#include <re/collect_ccs.h>
37#include <re/replaceCC.h>
38#include <re/re_multiplex.h>
39#include <re/grapheme_clusters.h>
40#include <re/printer_re.h>
41#include <toolchain/toolchain.h>
42#include <toolchain/cpudriver.h>
43#include <iostream>
44#include <cc/multiplex_CCs.h>
45#include <llvm/Support/raw_ostream.h>
46#include <util/file_select.h>
47#include <util/aligned_allocator.h>
48#include <sys/stat.h>
49#include <fcntl.h>
50#include <errno.h>
51#include <llvm/ADT/STLExtras.h> // for make_unique
52#include <llvm/Support/CommandLine.h>
53#include <llvm/Support/Debug.h>
54#include <llvm/Support/Casting.h>
55#include <sched.h>
56
57using namespace parabix;
58using namespace llvm;
59using namespace cc;
60using namespace kernel;
61
62static cl::opt<int> Threads("t", cl::desc("Total number of threads."), cl::init(2));
63static cl::opt<bool> PabloTransposition("enable-pablo-s2p", cl::desc("Enable experimental pablo transposition."));
64static cl::opt<bool> CC_Multiplexing("CC-multiplexing", cl::desc("Enable CC multiplexing."), cl::init(false));
65static cl::opt<bool> PropertyKernels("enable-property-kernels", cl::desc("Enable Unicode property kernels."), cl::init(false));
66static cl::opt<bool> MultithreadedSimpleRE("enable-simple-RE-kernels", cl::desc("Enable individual CC kernels for simple REs."), cl::init(false));
67const unsigned DefaultByteCClimit = 6;
68
69static cl::opt<unsigned> ByteCClimit("byte-CC-limit", cl::desc("Max number of CCs for byte CC pipeline."), cl::init(DefaultByteCClimit));
70
71
72namespace grep {
73   
74
75extern "C" void accumulate_match_wrapper(intptr_t accum_addr, const size_t lineNum, char * line_start, char * line_end) {
76    reinterpret_cast<MatchAccumulator *>(accum_addr)->accumulate_match(lineNum, line_start, line_end);
77}
78
79extern "C" void finalize_match_wrapper(intptr_t accum_addr, char * buffer_end) {
80    reinterpret_cast<MatchAccumulator *>(accum_addr)->finalize_match(buffer_end);
81}
82   
83
// Returns ceil(log2(v)), i.e. the smallest k such that (1 << k) >= v.
// Precondition: v != 0 (checked by assert in debug builds).
inline static size_t ceil_log2(const size_t v) {
    assert ("log2(0) is undefined!" && v != 0);
    static_assert(sizeof(size_t) <= sizeof(unsigned long long),
                  "size_t must fit in unsigned long long for __builtin_clzll");
    // ceil(log2(1)) == 0.  Handle it explicitly: the general formula would
    // evaluate __builtin_clz*(0), whose result is undefined.
    if (v == 1) {
        return 0;
    }
    const unsigned long long predecessor = static_cast<unsigned long long>(v) - 1ULL;
    return (sizeof(unsigned long long) * CHAR_BIT) - __builtin_clzll(predecessor);
}
89
90void SearchableBuffer::addSearchCandidate(const char * C_string_ptr) {
91    size_t length = strlen(C_string_ptr)+1;
92    if (mSpace_used + length >= mAllocated_capacity) {
93        size_t new_capacity = size_t{1} << (ceil_log2(mSpace_used + length + 1));
94        AlignedAllocator<char, BUFFER_ALIGNMENT> alloc;
95        char * new_buffer = mAllocator.allocate(new_capacity, 0);
96        memcpy(new_buffer, mBuffer_base, mSpace_used);
97        memset(&new_buffer[mSpace_used], 0, new_capacity-mSpace_used);
98        if (mBuffer_base != mInitial_buffer) {
99            alloc.deallocate(mBuffer_base, 0);
100        }
101        mBuffer_base = new_buffer;
102        mAllocated_capacity = new_capacity;
103    }
104    memcpy((void * ) &mBuffer_base[mSpace_used], C_string_ptr, length);
105    mSpace_used += length;
106    assert("Search candidate not null terminated" && (mBuffer_base[mSpace_used] == '\0'));
107    mEntries++;
108}
109
110SearchableBuffer::SearchableBuffer() :
111    mAllocated_capacity(INITIAL_CAPACITY),
112    mSpace_used(0),
113    mEntries(0),
114    mBuffer_base(mInitial_buffer) {
115    memset(mBuffer_base, 0, INITIAL_CAPACITY);
116}
117
118SearchableBuffer::~SearchableBuffer() {
119    if (mBuffer_base != mInitial_buffer) {
120        mAllocator.deallocate(mBuffer_base, 0);
121    }
122}
123
124
125
126// Grep Engine construction and initialization.
127
128GrepEngine::GrepEngine() :
129    mSuppressFileMessages(false),
130    mPreferMMap(true),
131    mShowFileNames(false),
132    mStdinLabel("(stdin)"),
133    mShowLineNumbers(false),
134    mInitialTab(false),
135    mCaseInsensitive(false),
136    mInvertMatches(false),
137    mMaxCount(0),
138    mGrepStdIn(false),
139    mGrepDriver(nullptr),
140    mNextFileToGrep(0),
141    mNextFileToPrint(0),
142    grepMatchFound(false),
143    mGrepRecordBreak(GrepRecordBreakKind::LF),
144    mMoveMatchesToEOL(true),
145    mEngineThread(pthread_self()) {}
146
147GrepEngine::~GrepEngine() {
148    delete mGrepDriver;
149}
150
151QuietModeEngine::QuietModeEngine() : GrepEngine() {
152    mEngineKind = EngineKind::QuietMode;
153    mMoveMatchesToEOL = false;
154    mMaxCount = 1;
155}
156
157MatchOnlyEngine::MatchOnlyEngine(bool showFilesWithoutMatch, bool useNullSeparators) :
158    GrepEngine(), mRequiredCount(showFilesWithoutMatch) {
159    mEngineKind = EngineKind::MatchOnly;
160    mFileSuffix = useNullSeparators ? std::string("\0", 1) : "\n";
161    mMoveMatchesToEOL = false;
162    mMaxCount = 1;
163}
164
165CountOnlyEngine::CountOnlyEngine() : GrepEngine() {
166    mEngineKind = EngineKind::CountOnly;
167    mFileSuffix = ":";
168}
169
170EmitMatchesEngine::EmitMatchesEngine() : GrepEngine() {
171    mEngineKind = EngineKind::EmitMatches;
172    mFileSuffix = mInitialTab ? "\t:" : ":";
173}
174
175   
176void GrepEngine::setRecordBreak(GrepRecordBreakKind b) {
177    mGrepRecordBreak = b;
178}
179
180   
181
182   
183void GrepEngine::initFileResult(std::vector<boost::filesystem::path> & paths) {
184    const unsigned n = paths.size();
185    mResultStrs.resize(n);
186    mFileStatus.resize(n, FileStatus::Pending);
187    inputPaths = paths;
188}
189
190void GrepEngine::initREs(std::vector<re::RE *> & REs) {
191    if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
192        mBreakCC = re::makeCC(re::makeCC(0x0A, 0x0D), re::makeCC(re::makeCC(0x85), re::makeCC(0x2028, 0x2029)));
193    } else if (mGrepRecordBreak == GrepRecordBreakKind::Null) {
194        mBreakCC = re::makeByte(0);  // Null
195    } else {
196        mBreakCC = re::makeByte(0x0A); // LF
197    }
198    re::RE * anchorRE = mBreakCC;
199    if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
200        re::Name * anchorName = re::makeName("UTF8_LB", re::Name::Type::Unicode);
201        anchorName->setDefinition(UCD::UnicodeBreakRE());
202        anchorRE = anchorName;
203    }
204   
205    mREs = REs;
206    bool allAnchored = true;
207    for(unsigned i = 0; i < mREs.size(); ++i) {
208        if (!hasEndAnchor(mREs[i])) allAnchored = false;
209        mREs[i] = resolveModesAndExternalSymbols(mREs[i], mCaseInsensitive);
210        mREs[i] = re::exclude_CC(mREs[i], mBreakCC);
211        mREs[i] = resolveAnchors(mREs[i], anchorRE);
212        re::gatherUnicodeProperties(mREs[i], mUnicodeProperties);
213        mREs[i] = regular_expression_passes(mREs[i]);
214    }
215    if (allAnchored && (mGrepRecordBreak != GrepRecordBreakKind::Unicode)) mMoveMatchesToEOL = false;
216
217}
218
219
220   
221// Code Generation
222//
223// All engines share a common pipeline to compute a stream of Matches from a given input Bytestream.
224
225unsigned LLVM_READNONE calculateMaxCountRate(const std::unique_ptr<kernel::KernelBuilder> & b) {
226    const unsigned packSize = b->getSizeTy()->getBitWidth();
227    return (packSize * packSize) / b->getBitBlockWidth();
228}
229   
230std::pair<StreamSetBuffer *, StreamSetBuffer *> GrepEngine::grepPipeline(StreamSetBuffer * ByteStream) {
231    auto & idb = mGrepDriver->getBuilder();
232    const unsigned segmentSize = codegen::SegmentSize;
233    const unsigned bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
234    // TODO: until we automate stream buffer sizing, use this calculation to determine how large our matches buffer needs to be.
235    const unsigned baseBufferSize = segmentSize * (mMaxCount > 0 ? (std::max(bufferSegments, calculateMaxCountRate(idb))) : bufferSegments);
236    const unsigned encodingBits = 8;
237   
238   
239    //  Regular Expression Processing and Analysis Phase
240    const auto nREs = mREs.size();
241    bool hasGCB[nREs];
242    bool anyGCB = false;
243
244    for(unsigned i = 0; i < nREs; ++i) {
245        hasGCB[i] = hasGraphemeClusterBoundary(mREs[i]);
246        anyGCB |= hasGCB[i];
247    }
248    StreamSetBuffer * LineBreakStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
249    std::vector<StreamSetBuffer *> MatchResultsBufs(nREs);
250   
251    re::RE * prefixRE;
252    re::RE * suffixRE;
253    // For simple regular expressions with a small number of characters, we
254    // can bypass transposition and use the Direct CC compiler.
255    bool isSimple = (nREs == 1) && (mGrepRecordBreak != GrepRecordBreakKind::Unicode) && (!anyGCB);
256    if (isSimple) {
257        mREs[0] = toUTF8(mREs[0]);
258    }
259    if (isSimple && byteTestsWithinLimit(mREs[0], ByteCClimit)) {
260        std::vector<std::string> externalStreamNames;
261        std::vector<StreamSetBuffer *> icgrepInputSets = {ByteStream};
262        if (MultithreadedSimpleRE && hasTriCCwithinLimit(mREs[0], ByteCClimit, prefixRE, suffixRE)) {
263            auto CCs = re::collectCCs(prefixRE, &cc::Byte);
264            for (auto cc : CCs) {
265                auto ccName = makeName(cc);
266                mREs[0] = re::replaceCC(mREs[0], cc, ccName);
267                std::string ccNameStr = ccName->getFullName();
268                StreamSetBuffer * ccStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
269                kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::DirectCharacterClassKernelBuilder>(idb, ccNameStr, std::vector<re::CC *>{cc});
270                mGrepDriver->makeKernelCall(ccK, {ByteStream}, {ccStream});
271                externalStreamNames.push_back(ccNameStr);
272                icgrepInputSets.push_back(ccStream);
273            }
274        }
275        StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
276        kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ByteGrepKernel>(idb, mREs[0], externalStreamNames);
277        mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
278        MatchResultsBufs[0] = MatchResults;
279        kernel::Kernel * breakK = mGrepDriver->addKernelInstance<kernel::DirectCharacterClassKernelBuilder>(idb, "breakCC", std::vector<re::CC *>{mBreakCC});
280        mGrepDriver->makeKernelCall(breakK, {ByteStream}, {LineBreakStream});
281    } else if (isSimple && hasTriCCwithinLimit(mREs[0], ByteCClimit, prefixRE, suffixRE)) {
282        std::vector<std::string> externalStreamNames;
283        std::vector<StreamSetBuffer *> icgrepInputSets = {ByteStream};
284        if (MultithreadedSimpleRE) {
285            auto CCs = re::collectCCs(prefixRE, &cc::Byte);
286            for (auto cc : CCs) {
287                auto ccName = makeName(cc);
288                mREs[0] = re::replaceCC(mREs[0], cc, ccName);
289                std::string ccNameStr = ccName->getFullName();
290                StreamSetBuffer * ccStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
291                kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::DirectCharacterClassKernelBuilder>(idb, ccNameStr, std::vector<re::CC *>{cc});
292                mGrepDriver->makeKernelCall(ccK, {ByteStream}, {ccStream});
293                externalStreamNames.push_back(ccNameStr);
294                icgrepInputSets.push_back(ccStream);
295            }
296        }
297        StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
298        kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ByteBitGrepKernel>(idb, prefixRE, suffixRE, externalStreamNames);
299        mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
300        MatchResultsBufs[0] = MatchResults;
301        kernel::Kernel * breakK = mGrepDriver->addKernelInstance<kernel::DirectCharacterClassKernelBuilder>(idb, "breakCC", std::vector<re::CC *>{mBreakCC});
302        mGrepDriver->makeKernelCall(breakK, {ByteStream}, {LineBreakStream});
303    } else {
304       
305        StreamSetBuffer * BasisBits = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(encodingBits, 1), baseBufferSize);
306        kernel::Kernel * s2pk = nullptr;
307        if (PabloTransposition) {
308            s2pk = mGrepDriver->addKernelInstance<kernel::S2P_PabloKernel>(idb);
309        }
310        else {
311            s2pk = mGrepDriver->addKernelInstance<kernel::S2PKernel>(idb);
312        }
313        mGrepDriver->makeKernelCall(s2pk, {ByteStream}, {BasisBits});
314
315        StreamSetBuffer * RequiredStreams = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
316        StreamSetBuffer * UnicodeLB = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
317
318        StreamSetBuffer * LineFeedStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
319        kernel::Kernel * linefeedK = mGrepDriver->addKernelInstance<kernel::LineFeedKernelBuilder>(idb, Binding{idb->getStreamSetTy(8), "basis", FixedRate(), Principal()});
320        mGrepDriver->makeKernelCall(linefeedK, {BasisBits}, {LineFeedStream});
321       
322        kernel::Kernel * requiredStreamsK = mGrepDriver->addKernelInstance<kernel::RequiredStreams_UTF8>(idb);
323        mGrepDriver->makeKernelCall(requiredStreamsK, {BasisBits, LineFeedStream}, {RequiredStreams, UnicodeLB});
324
325        if (mGrepRecordBreak == GrepRecordBreakKind::LF) {
326            LineBreakStream = LineFeedStream;
327        } else if (mGrepRecordBreak == GrepRecordBreakKind::Null) {
328            kernel::Kernel * breakK = mGrepDriver->addKernelInstance<kernel::ParabixCharacterClassKernelBuilder>(idb, "Null", std::vector<re::CC *>{mBreakCC}, 8);
329            mGrepDriver->makeKernelCall(breakK, {BasisBits}, {LineBreakStream});
330        } else {
331            LineBreakStream = UnicodeLB;
332        }
333       
334        std::map<std::string, StreamSetBuffer *> propertyStream;
335        if (PropertyKernels) {
336            for (auto p : mUnicodeProperties) {
337                auto name = p->getFullName();
338                StreamSetBuffer * s = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
339                propertyStream.emplace(std::make_pair(name, s));
340                kernel::Kernel * propertyK = mGrepDriver->addKernelInstance<kernel::UnicodePropertyKernelBuilder>(idb, p);
341                mGrepDriver->makeKernelCall(propertyK, {BasisBits}, {s});
342            }
343        }
344        StreamSetBuffer * GCB_stream = nullptr;
345        if (anyGCB) {
346            GCB_stream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
347            kernel::Kernel * gcbK = mGrepDriver->addKernelInstance<kernel::GraphemeClusterBreakKernel>(idb);
348            mGrepDriver->makeKernelCall(gcbK, {BasisBits, RequiredStreams}, {GCB_stream});
349        }
350
351        for(unsigned i = 0; i < nREs; ++i) {
352            std::vector<std::string> externalStreamNames;
353            std::vector<StreamSetBuffer *> icgrepInputSets = {BasisBits};
354            if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
355                externalStreamNames.push_back("UTF8_LB");
356                icgrepInputSets.push_back(LineBreakStream);
357                externalStreamNames.push_back("UTF8_nonfinal");
358                icgrepInputSets.push_back(RequiredStreams);
359            }
360            std::set<re::Name *> UnicodeProperties;
361            if (PropertyKernels) {
362                re::gatherUnicodeProperties(mREs[i], UnicodeProperties);
363                for (auto p : UnicodeProperties) {
364                    auto name = p->getFullName();
365                    auto f = propertyStream.find(name);
366                    if (f == propertyStream.end()) report_fatal_error(name + " not found\n");
367                    externalStreamNames.push_back(name);
368                    icgrepInputSets.push_back(f->second);
369                }
370            }
371            if (hasGCB[i]) {
372                externalStreamNames.push_back("\\b{g}");
373                icgrepInputSets.push_back(GCB_stream);
374            }
375            if (CC_Multiplexing) {
376                const auto UnicodeSets = re::collectCCs(mREs[i], &cc::Unicode, std::set<re::Name *>({re::makeZeroWidth("\\b{g}")}));
377                StreamSetBuffer * const MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
378                if (UnicodeSets.size() <= 1) {
379                    kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, mREs[i], externalStreamNames);
380                    mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
381                    MatchResultsBufs[i] = MatchResults;
382                } else {
383                    mpx = make_unique<MultiplexedAlphabet>("mpx", UnicodeSets);
384                    mREs[i] = transformCCs(mpx.get(), mREs[i]);
385                    std::vector<re::CC *> mpx_basis = mpx->getMultiplexedCCs();
386                    auto numOfCharacterClasses = mpx_basis.size();
387                    StreamSetBuffer * CharClasses = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), baseBufferSize);
388                    kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis));
389                    mGrepDriver->makeKernelCall(ccK, {BasisBits}, {CharClasses});
390    //                kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis), true);
391    //                mGrepDriver->makeKernelCall(ccK, {ByteStream}, {CharClasses});
392                    kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, mREs[i], externalStreamNames, std::vector<cc::Alphabet *>{mpx.get()});
393                    icgrepInputSets.push_back(CharClasses);
394                    mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
395                    MatchResultsBufs[i] = MatchResults;
396                }
397            } else {
398                StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
399                kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, mREs[i], externalStreamNames);
400                mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
401                MatchResultsBufs[i] = MatchResults;
402            }
403        }
404    }
405
406    StreamSetBuffer * MergedResults = MatchResultsBufs[0];
407    if (mREs.size() > 1) {
408        MergedResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
409        kernel::Kernel * streamsMergeK = mGrepDriver->addKernelInstance<kernel::StreamsMerge>(idb, 1, mREs.size());
410        mGrepDriver->makeKernelCall(streamsMergeK, MatchResultsBufs, {MergedResults});
411    }
412    StreamSetBuffer * Matches = MergedResults;
413    if (mMoveMatchesToEOL) {
414        StreamSetBuffer * OriginalMatches = Matches;
415        kernel::Kernel * matchedLinesK = mGrepDriver->addKernelInstance<kernel::MatchedLinesKernel>(idb);
416        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
417        mGrepDriver->makeKernelCall(matchedLinesK, {OriginalMatches, LineBreakStream}, {Matches});
418    }
419    if (mInvertMatches) {
420        kernel::Kernel * invertK = mGrepDriver->addKernelInstance<kernel::InvertMatchesKernel>(idb);
421        StreamSetBuffer * OriginalMatches = Matches;
422        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
423        mGrepDriver->makeKernelCall(invertK, {OriginalMatches, LineBreakStream}, {Matches});
424    }
425    if (mMaxCount > 0) {
426        kernel::Kernel * untilK = mGrepDriver->addKernelInstance<kernel::UntilNkernel>(idb);
427        untilK->setInitialArguments({idb->getSize(mMaxCount)});
428        StreamSetBuffer * const AllMatches = Matches;
429        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
430        mGrepDriver->makeKernelCall(untilK, {AllMatches}, {Matches});
431    }
432
433    return std::pair<StreamSetBuffer *, StreamSetBuffer *>(LineBreakStream, Matches);
434}
435
436// The QuietMode, MatchOnly and CountOnly engines share a common code generation main function,
437// which returns a count of the matches found (possibly subject to a MaxCount).
438//
439
440void GrepEngine::grepCodeGen() {
441
442    assert (mGrepDriver == nullptr);
443    mGrepDriver = new ParabixDriver("engine");
444    auto & idb = mGrepDriver->getBuilder();
445    Module * M = idb->getModule();
446
447    const unsigned encodingBits = 8;
448
449    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getInt64Ty(), idb->getInt8Ty(), idb->getInt32Ty(), nullptr));
450    mainFunc->setCallingConv(CallingConv::C);
451    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
452    auto args = mainFunc->arg_begin();
453
454    Value * const useMMap = &*(args++);
455    useMMap->setName("useMMap");
456    Value * const fileDescriptor = &*(args++);
457    fileDescriptor->setName("fileDescriptor");
458
459    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits));
460    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance<kernel::FDSourceKernel>(idb);
461    sourceK->setInitialArguments({useMMap, fileDescriptor});
462    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
463
464    StreamSetBuffer * LineBreakStream;
465    StreamSetBuffer * Matches;
466    std::tie(LineBreakStream, Matches) = grepPipeline(ByteStream);
467
468    kernel::Kernel * matchCountK = mGrepDriver->addKernelInstance<kernel::PopcountKernel>(idb);
469    mGrepDriver->makeKernelCall(matchCountK, {Matches}, {});
470    mGrepDriver->generatePipelineIR();
471    idb->setKernel(matchCountK);
472    Value * matchedLineCount = idb->getAccumulator("countResult");
473    matchedLineCount = idb->CreateZExt(matchedLineCount, idb->getInt64Ty());
474    mGrepDriver->deallocateBuffers();
475    idb->CreateRet(matchedLineCount);
476    mGrepDriver->finalizeObject();
477}
478
479
480//
481//  Default Report Match:  lines are emitted with whatever line terminators are found in the
482//  input.  However, if the final line is not terminated, a new line is appended.
483//
484void EmitMatch::accumulate_match (const size_t lineNum, char * line_start, char * line_end) {
485    mResultStr << mLinePrefix;
486    if (mShowLineNumbers) {
487        // Internally line numbers are counted from 0.  For display, adjust
488        // the line number so that lines are numbered from 1.
489        if (mInitialTab) {
490            mResultStr << lineNum+1 << "\t:";
491        }
492        else {
493            mResultStr << lineNum+1 << ":";
494        }
495    }
496    size_t bytes = line_end - line_start + 1;
497    mResultStr.write(line_start, bytes);
498    mLineCount++;
499    unsigned last_byte = *line_end;
500    mTerminated = (last_byte >= 0x0A) && (last_byte <= 0x0D);
501    if (LLVM_UNLIKELY(!mTerminated)) {
502        if (last_byte == 0x85) {  //  Possible NEL terminator.
503            mTerminated = (bytes >= 2) && (static_cast<unsigned>(line_end[-1]) == 0xC2);
504        }
505        else {
506            // Possible LS or PS terminators.
507            mTerminated = (bytes >= 3) && (static_cast<unsigned>(line_end[-2]) == 0xE2)
508                                       && (static_cast<unsigned>(line_end[-1]) == 0x80)
509                                       && ((last_byte == 0xA8) || (last_byte == 0xA9));
510        }
511    }
512}
513
514void EmitMatch::finalize_match(char * buffer_end) {
515    if (!mTerminated) mResultStr << "\n";
516}
517
518void EmitMatchesEngine::grepCodeGen() {
519    assert (mGrepDriver == nullptr);
520    mGrepDriver = new ParabixDriver("engine");
521    auto & idb = mGrepDriver->getBuilder();
522    Module * M = idb->getModule();
523
524    const unsigned encodingBits = 8;
525
526    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getInt64Ty(), idb->getInt8Ty(), idb->getInt32Ty(), idb->getIntAddrTy(), nullptr));
527    mainFunc->setCallingConv(CallingConv::C);
528    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
529    auto args = mainFunc->arg_begin();
530
531    Value * const useMMap = &*(args++);
532    useMMap->setName("useMMap");
533    Value * const fileDescriptor = &*(args++);
534    fileDescriptor->setName("fileDescriptor");
535    Value * match_accumulator = &*(args++);
536    match_accumulator->setName("match_accumulator");
537
538    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits));
539    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance<kernel::FDSourceKernel>(idb);
540    sourceK->setInitialArguments({useMMap, fileDescriptor});
541    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
542
543    StreamSetBuffer * LineBreakStream;
544    StreamSetBuffer * Matches;
545    std::tie(LineBreakStream, Matches) = grepPipeline(ByteStream);
546
547    kernel::Kernel * scanMatchK = mGrepDriver->addKernelInstance<kernel::ScanMatchKernel>(idb);
548    scanMatchK->setInitialArguments({match_accumulator});
549    mGrepDriver->makeKernelCall(scanMatchK, {Matches, LineBreakStream, ByteStream}, {});
550    mGrepDriver->LinkFunction(*scanMatchK, "accumulate_match_wrapper", &accumulate_match_wrapper);
551    mGrepDriver->LinkFunction(*scanMatchK, "finalize_match_wrapper", &finalize_match_wrapper);
552
553    mGrepDriver->generatePipelineIR();
554    mGrepDriver->deallocateBuffers();
555    idb->CreateRet(idb->getInt64(0));
556    mGrepDriver->finalizeObject();
557}
558
559
560//
561//  The doGrep methods apply a GrepEngine to a single file, processing the results
562//  differently based on the engine type.
563
564uint64_t GrepEngine::doGrep(const std::string & fileName, std::ostringstream & strm) {
565    typedef uint64_t (*GrepFunctionType)(bool useMMap, int32_t fileDescriptor);
566    using namespace boost::filesystem;
567    path p(fileName);
568    bool useMMap = mPreferMMap;
569    if (p == "-") useMMap = false;
570    if (!is_regular_file(p)) useMMap = false;
571
572    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
573
574    int32_t fileDescriptor = openFile(fileName, strm);
575    if (fileDescriptor == -1) return 0;
576
577    uint64_t grepResult = f(useMMap, fileDescriptor);
578    close(fileDescriptor);
579    return grepResult;
580}
581
582uint64_t CountOnlyEngine::doGrep(const std::string & fileName, std::ostringstream & strm) {
583    uint64_t grepResult = GrepEngine::doGrep(fileName, strm);
584    if (mShowFileNames) strm << linePrefix(fileName);
585    strm << grepResult << "\n";
586    return grepResult;
587}
588
589std::string GrepEngine::linePrefix(std::string fileName) {
590    if (!mShowFileNames) return "";
591    if (fileName == "-") {
592        return mStdinLabel + mFileSuffix;
593    }
594    else {
595        return fileName + mFileSuffix;
596    }
597}
598
599uint64_t MatchOnlyEngine::doGrep(const std::string & fileName, std::ostringstream & strm) {
600    uint64_t grepResult = GrepEngine::doGrep(fileName, strm);
601    if (grepResult == mRequiredCount) {
602       strm << linePrefix(fileName);
603    }
604    return grepResult;
605}
606
607uint64_t EmitMatchesEngine::doGrep(const std::string & fileName, std::ostringstream & strm) {
608    typedef uint64_t (*GrepFunctionType)(bool useMMap, int32_t fileDescriptor, intptr_t accum_addr);
609    using namespace boost::filesystem;
610    path p(fileName);
611    bool useMMap = mPreferMMap;
612    if (p == "-") useMMap = false;
613    if (!is_regular_file(p)) useMMap = false;
614    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
615    int32_t fileDescriptor = openFile(fileName, strm);
616    if (fileDescriptor == -1) return 0;
617    EmitMatch accum(linePrefix(fileName), mShowLineNumbers, mInitialTab, strm);
618    f(useMMap, fileDescriptor, reinterpret_cast<intptr_t>(&accum));
619    close(fileDescriptor);
620    if (accum.mLineCount > 0) grepMatchFound = true;
621    return accum.mLineCount;
622}
623
624// Open a file and return its file desciptor.
625int32_t GrepEngine::openFile(const std::string & fileName, std::ostringstream & msgstrm) {
626    if (fileName == "-") {
627        return STDIN_FILENO;
628    }
629    else {
630        struct stat sb;
631        int32_t fileDescriptor = open(fileName.c_str(), O_RDONLY);
632        if (LLVM_UNLIKELY(fileDescriptor == -1)) {
633            if (!mSuppressFileMessages) {
634                if (errno == EACCES) {
635                    msgstrm << "icgrep: " << fileName << ": Permission denied.\n";
636                }
637                else if (errno == ENOENT) {
638                    msgstrm << "icgrep: " << fileName << ": No such file.\n";
639                }
640                else {
641                    msgstrm << "icgrep: " << fileName << ": Failed.\n";
642                }
643            }
644            return fileDescriptor;
645        }
646        if (stat(fileName.c_str(), &sb) == 0 && S_ISDIR(sb.st_mode)) {
647            if (!mSuppressFileMessages) {
648                msgstrm << "icgrep: " << fileName << ": Is a directory.\n";
649            }
650            close(fileDescriptor);
651            return -1;
652        }
653        return fileDescriptor;
654    }
655}
656
657// The process of searching a group of files may use a sequential or a task
658// parallel approach.
659
660void * DoGrepThreadFunction(void *args) {
661    return reinterpret_cast<GrepEngine *>(args)->DoGrepThreadMethod();
662}
663
664bool GrepEngine::searchAllFiles() {
665    const unsigned numOfThreads = std::min(static_cast<unsigned>(Threads), static_cast<unsigned>(inputPaths.size()));
666    std::vector<pthread_t> threads(numOfThreads);
667
668    for(unsigned long i = 1; i < numOfThreads; ++i) {
669        const int rc = pthread_create(&threads[i], nullptr, DoGrepThreadFunction, (void *)this);
670        if (rc) {
671            llvm::report_fatal_error("Failed to create thread: code " + std::to_string(rc));
672        }
673    }
674    // Main thread also does the work;
675    DoGrepThreadMethod();
676    for(unsigned i = 1; i < numOfThreads; ++i) {
677        void * status = nullptr;
678        const int rc = pthread_join(threads[i], &status);
679        if (rc) {
680            llvm::report_fatal_error("Failed to join thread: code " + std::to_string(rc));
681        }
682    }
683    return grepMatchFound;
684}
685
686
687// DoGrep thread function.
688void * GrepEngine::DoGrepThreadMethod() {
689
690    unsigned fileIdx = mNextFileToGrep++;
691    while (fileIdx < inputPaths.size()) {
692        if (codegen::DebugOptionIsSet(codegen::TraceCounts)) {
693            errs() << "Tracing " << inputPaths[fileIdx].string() << "\n";
694        }
695        const auto grepResult = doGrep(inputPaths[fileIdx].string(), mResultStrs[fileIdx]);
696        mFileStatus[fileIdx] = FileStatus::GrepComplete;
697        if (grepResult > 0) {
698            grepMatchFound = true;
699        }
700        if ((mEngineKind == EngineKind::QuietMode) && grepMatchFound) {
701            if (pthread_self() != mEngineThread) {
702                pthread_exit(nullptr);
703            }
704            return nullptr;
705        }
706        fileIdx = mNextFileToGrep++;
707    }
708
709    unsigned printIdx = mNextFileToPrint++;
710    while (printIdx < inputPaths.size()) {
711        const bool readyToPrint = ((printIdx == 0) || (mFileStatus[printIdx - 1] == FileStatus::PrintComplete)) && (mFileStatus[printIdx] == FileStatus::GrepComplete);
712        if (readyToPrint) {
713            const auto output = mResultStrs[printIdx].str();
714            if (!output.empty()) {
715                llvm::outs() << output;
716            }
717            mFileStatus[printIdx] = FileStatus::PrintComplete;
718            printIdx = mNextFileToPrint++;
719        } else {
720            mGrepDriver->performIncrementalCacheCleanupStep();
721        }
722        sched_yield();
723    }
724
725    if (pthread_self() != mEngineThread) {
726        pthread_exit(nullptr);
727    } else {
728        // Always perform one final cache cleanup step.
729        mGrepDriver->performIncrementalCacheCleanupStep();
730        if (mGrepStdIn) {
731            std::ostringstream s;
732            const auto grepResult = doGrep("-", s);
733            llvm::outs() << s.str();
734            if (grepResult) grepMatchFound = true;
735        }
736    }
737    return nullptr;
738}
739
740   
741   
742InternalSearchEngine::InternalSearchEngine() :
743    mGrepRecordBreak(GrepRecordBreakKind::LF),
744    mCaseInsensitive(false),
745    mGrepDriver(nullptr) {}
746   
747InternalSearchEngine::~InternalSearchEngine() {
748    delete mGrepDriver;
749}
750
751void InternalSearchEngine::grepCodeGen(re::RE * matchingRE, re::RE * excludedRE, MatchAccumulator * accum) {
752    mGrepDriver = new ParabixDriver("InternalEngine");
753    auto & idb = mGrepDriver->getBuilder();
754    Module * M = idb->getModule();
755   
756    const unsigned segmentSize = codegen::BufferSegments * codegen::SegmentSize * codegen::ThreadNum;
757   
758    re::CC * breakCC = nullptr;
759    if (mGrepRecordBreak == GrepRecordBreakKind::Null) {
760        breakCC = re::makeByte(0x0);
761    } else {// if (mGrepRecordBreak == GrepRecordBreakKind::LF)
762        breakCC = re::makeByte(0x0A);
763    }
764    if (matchingRE != nullptr) {
765        matchingRE = resolveCaseInsensitiveMode(matchingRE, mCaseInsensitive);
766        matchingRE = regular_expression_passes(matchingRE);
767        matchingRE = re::exclude_CC(matchingRE, breakCC);
768        matchingRE = resolveAnchors(matchingRE, breakCC);
769        matchingRE = toUTF8(matchingRE);
770    }
771    if (excludedRE != nullptr) {
772        excludedRE = resolveCaseInsensitiveMode(excludedRE, mCaseInsensitive);
773        excludedRE = regular_expression_passes(excludedRE);
774        excludedRE = re::exclude_CC(excludedRE, breakCC);
775        excludedRE = resolveAnchors(excludedRE, breakCC);
776        excludedRE = toUTF8(excludedRE);
777    }
778    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getVoidTy(), idb->getInt8PtrTy(), idb->getSizeTy(), nullptr));
779    mainFunc->setCallingConv(CallingConv::C);
780    auto args = mainFunc->arg_begin();
781    Value * const buffer = &*(args++);
782    buffer->setName("buffer");
783    Value * length = &*(args++);
784    length->setName("length");
785   
786    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
787    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, 8));
788    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance<kernel::MemorySourceKernel>(idb, idb->getInt8PtrTy());
789    sourceK->setInitialArguments({buffer, length});
790    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
791    StreamSetBuffer * BasisBits = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(8, 1), segmentSize);
792    kernel::Kernel * s2pk = mGrepDriver->addKernelInstance<kernel::S2PKernel>(idb);
793    mGrepDriver->makeKernelCall(s2pk, {ByteStream}, {BasisBits});
794   
795    StreamSetBuffer * RecordBreakStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
796    std::string RBname = (mGrepRecordBreak == GrepRecordBreakKind::Null) ? "Null" : "LF";
797    kernel::Kernel * breakK = mGrepDriver->addKernelInstance<kernel::ParabixCharacterClassKernelBuilder>(idb, RBname, std::vector<re::CC *>{breakCC}, 8);
798    mGrepDriver->makeKernelCall(breakK, {BasisBits}, {RecordBreakStream});
799   
800   
801    std::vector<std::string> externalStreamNames;
802    StreamSetBuffer * MatchingRecords = nullptr;
803    if (matchingRE != nullptr) {
804        StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
805        kernel::Kernel * includeK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, matchingRE, externalStreamNames);
806        mGrepDriver->makeKernelCall(includeK, {BasisBits}, {MatchResults});
807        MatchingRecords = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
808        kernel::Kernel * matchedLinesK = mGrepDriver->addKernelInstance<kernel::MatchedLinesKernel>(idb);
809        mGrepDriver->makeKernelCall(matchedLinesK, {MatchResults, RecordBreakStream}, {MatchingRecords});
810    }
811    if (excludedRE != nullptr) {
812        StreamSetBuffer * ExcludedResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
813        kernel::Kernel * excludeK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, excludedRE, externalStreamNames);
814        mGrepDriver->makeKernelCall(excludeK, {BasisBits}, {ExcludedResults});
815        StreamSetBuffer * ExcludedRecords = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
816        kernel::Kernel * matchedLinesK = mGrepDriver->addKernelInstance<kernel::MatchedLinesKernel>(idb);
817        mGrepDriver->makeKernelCall(matchedLinesK, {ExcludedResults, RecordBreakStream}, {ExcludedRecords});
818
819        kernel::Kernel * invertK = mGrepDriver->addKernelInstance<kernel::InvertMatchesKernel>(idb);
820        if (matchingRE != nullptr) {
821            StreamSetBuffer * nonExcluded = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
822            mGrepDriver->makeKernelCall(invertK, {ExcludedRecords, RecordBreakStream}, {nonExcluded});
823            StreamSetBuffer * included = MatchingRecords;
824            MatchingRecords = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
825            kernel::Kernel * streamsIntersectK = mGrepDriver->addKernelInstance<kernel::StreamsIntersect>(idb, 1, 2);
826            mGrepDriver->makeKernelCall(streamsIntersectK, {included, nonExcluded}, {MatchingRecords});
827        }
828        else {
829            MatchingRecords = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
830            mGrepDriver->makeKernelCall(invertK, {ExcludedRecords, RecordBreakStream}, {MatchingRecords});
831        }
832    }
833    kernel::Kernel * scanMatchK = mGrepDriver->addKernelInstance<kernel::ScanMatchKernel>(idb);
834    scanMatchK->setInitialArguments({ConstantInt::get(idb->getIntAddrTy(), reinterpret_cast<intptr_t>(accum))});
835    mGrepDriver->makeKernelCall(scanMatchK, {MatchingRecords, RecordBreakStream, ByteStream}, {});
836    mGrepDriver->LinkFunction(*scanMatchK, "accumulate_match_wrapper", &accumulate_match_wrapper);
837    mGrepDriver->LinkFunction(*scanMatchK, "finalize_match_wrapper", &finalize_match_wrapper);
838    mGrepDriver->generatePipelineIR();
839    mGrepDriver->deallocateBuffers();
840    idb->CreateRetVoid();
841    mGrepDriver->finalizeObject();
842}
843
844void InternalSearchEngine::doGrep(const char * search_buffer, size_t bufferLength) {
845    typedef void (*GrepFunctionType)(const char * buffer, const size_t length);
846    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
847    f(search_buffer, bufferLength);
848}
849
850}
Note: See TracBrowser for help on using the repository browser.