source: icGREP/icgrep-devel/icgrep/grep/grep_engine.cpp @ 5969

Last change on this file since 5969 was 5969, checked in by cameron, 10 months ago

Fix for -L/-l

File size: 38.1 KB
Line 
1/*
2 *  Copyright (c) 2018 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6#include <set>
7#include "grep_engine.h"
8#include <llvm/IR/Module.h>
9#include <boost/filesystem.hpp>
10#include <UCD/resolve_properties.h>
11#include <kernels/charclasses.h>
12#include <kernels/cc_kernel.h>
13#include <kernels/grep_kernel.h>
14#include <kernels/UCD_property_kernel.h>
15#include <kernels/grapheme_kernel.h>
16#include <kernels/linebreak_kernel.h>
17#include <kernels/streams_merge.h>
18#include <kernels/source_kernel.h>
19#include <kernels/s2p_kernel.h>
20#include <kernels/scanmatchgen.h>
21#include <kernels/streamset.h>
22#include <kernels/until_n.h>
23#include <kernels/kernel_builder.h>
24#include <pablo/pablo_kernel.h>
25#include <cc/alphabet.h>
26#include <re/re_cc.h>
27#include <re/re_name.h>
28#include <re/casing.h>
29#include <re/exclude_CC.h>
30#include <re/to_utf8.h>
31#include <re/re_toolchain.h>
32#include <toolchain/toolchain.h>
33#include <re/re_analysis.h>
34#include <re/re_name_resolve.h>
35#include <re/re_name_gather.h>
36#include <re/collect_ccs.h>
37#include <re/replaceCC.h>
38#include <re/re_multiplex.h>
39#include <re/grapheme_clusters.h>
40#include <re/printer_re.h>
41#include <toolchain/toolchain.h>
42#include <toolchain/cpudriver.h>
43#include <iostream>
44#include <cc/multiplex_CCs.h>
45#include <llvm/Support/raw_ostream.h>
46#include <util/file_select.h>
47#include <util/aligned_allocator.h>
48#include <sys/stat.h>
49#include <fcntl.h>
50#include <errno.h>
51#include <llvm/ADT/STLExtras.h> // for make_unique
52#include <llvm/Support/CommandLine.h>
53#include <llvm/Support/Debug.h>
54#include <llvm/Support/Casting.h>
55#include <sched.h>
56
57using namespace parabix;
58using namespace llvm;
59using namespace cc;
60using namespace kernel;
61
62static cl::opt<int> Threads("t", cl::desc("Total number of threads."), cl::init(2));
63static cl::opt<bool> PabloTransposition("enable-pablo-s2p", cl::desc("Enable experimental pablo transposition."));
64static cl::opt<bool> CC_Multiplexing("CC-multiplexing", cl::desc("Enable CC multiplexing."), cl::init(false));
65static cl::opt<bool> PropertyKernels("enable-property-kernels", cl::desc("Enable Unicode property kernels."), cl::init(false));
66static cl::opt<bool> MultithreadedSimpleRE("enable-simple-RE-kernels", cl::desc("Enable individual CC kernels for simple REs."), cl::init(false));
67const unsigned DefaultByteCClimit = 6;
68
69static cl::opt<unsigned> ByteCClimit("byte-CC-limit", cl::desc("Max number of CCs for byte CC pipeline."), cl::init(DefaultByteCClimit));
70
71
72namespace grep {
73   
74
75extern "C" void accumulate_match_wrapper(intptr_t accum_addr, const size_t lineNum, char * line_start, char * line_end) {
76    reinterpret_cast<MatchAccumulator *>(accum_addr)->accumulate_match(lineNum, line_start, line_end);
77}
78
79extern "C" void finalize_match_wrapper(intptr_t accum_addr, char * buffer_end) {
80    reinterpret_cast<MatchAccumulator *>(accum_addr)->finalize_match(buffer_end);
81}
82   
83
// Returns ceil(log2(v)), i.e. the smallest exponent e with (size_t{1} << e) >= v.
// v must be non-zero (checked by assert in debug builds).
inline static size_t ceil_log2(const size_t v) {
    assert ("log2(0) is undefined!" && v != 0);
    // The count-leading-zeros builtins are undefined for an argument of 0,
    // which is what v - 1 is when v == 1; answer that case directly.
    if (v == 1) return 0;
    // Use the unsigned long long builtin so the result does not depend on
    // size_t and long having the same width.
    const unsigned long long widened = static_cast<unsigned long long>(v - 1);
    return (sizeof(unsigned long long) * CHAR_BIT) - static_cast<size_t>(__builtin_clzll(widened));
}
89
90void SearchableBuffer::addSearchCandidate(const char * C_string_ptr) {
91    size_t length = strlen(C_string_ptr)+1;
92    if (mSpace_used + length >= mAllocated_capacity) {
93        size_t new_capacity = size_t{1} << (ceil_log2(mSpace_used + length + 1));
94        AlignedAllocator<char, BUFFER_ALIGNMENT> alloc;
95        char * new_buffer = mAllocator.allocate(new_capacity, 0);
96        memcpy(new_buffer, mBuffer_base, mSpace_used);
97        memset(&new_buffer[mSpace_used], 0, new_capacity-mSpace_used);
98        if (mBuffer_base != mInitial_buffer) {
99            alloc.deallocate(mBuffer_base, 0);
100        }
101        mBuffer_base = new_buffer;
102        mAllocated_capacity = new_capacity;
103    }
104    memcpy((void * ) &mBuffer_base[mSpace_used], C_string_ptr, length);
105    mSpace_used += length;
106    assert("Search candidate not null terminated" && (mBuffer_base[mSpace_used] == '\0'));
107    mEntries++;
108}
109
110SearchableBuffer::SearchableBuffer() :
111    mAllocated_capacity(INITIAL_CAPACITY),
112    mSpace_used(0),
113    mEntries(0),
114    mBuffer_base(mInitial_buffer) {
115    memset(mBuffer_base, 0, INITIAL_CAPACITY);
116}
117
118SearchableBuffer::~SearchableBuffer() {
119    if (mBuffer_base != mInitial_buffer) {
120        mAllocator.deallocate(mBuffer_base, 0);
121    }
122}
123
124
125
126// Grep Engine construction and initialization.
127
128GrepEngine::GrepEngine() :
129    mSuppressFileMessages(false),
130    mPreferMMap(true),
131    mShowFileNames(false),
132    mStdinLabel("(stdin)"),
133    mShowLineNumbers(false),
134    mInitialTab(false),
135    mCaseInsensitive(false),
136    mInvertMatches(false),
137    mMaxCount(0),
138    mGrepStdIn(false),
139    mGrepDriver(nullptr),
140    mNextFileToGrep(0),
141    mNextFileToPrint(0),
142    grepMatchFound(false),
143    mGrepRecordBreak(GrepRecordBreakKind::LF),
144    mMoveMatchesToEOL(true),
145    mEngineThread(pthread_self()) {}
146
147GrepEngine::~GrepEngine() {
148    delete mGrepDriver;
149}
150
151QuietModeEngine::QuietModeEngine() : GrepEngine() {
152    mEngineKind = EngineKind::QuietMode;
153    mMoveMatchesToEOL = false;
154    mMaxCount = 1;
155}
156
157MatchOnlyEngine::MatchOnlyEngine(bool showFilesWithMatch, bool useNullSeparators) :
158    GrepEngine(), mRequiredCount(showFilesWithMatch) {
159    mEngineKind = EngineKind::MatchOnly;
160    mFileSuffix = useNullSeparators ? std::string("\0", 1) : "\n";
161    mMoveMatchesToEOL = false;
162    mMaxCount = 1;
163    mShowFileNames = true;
164}
165
166CountOnlyEngine::CountOnlyEngine() : GrepEngine() {
167    mEngineKind = EngineKind::CountOnly;
168    mFileSuffix = ":";
169}
170
171EmitMatchesEngine::EmitMatchesEngine() : GrepEngine() {
172    mEngineKind = EngineKind::EmitMatches;
173    mFileSuffix = mInitialTab ? "\t:" : ":";
174}
175
176   
177void GrepEngine::setRecordBreak(GrepRecordBreakKind b) {
178    mGrepRecordBreak = b;
179}
180
181   
182
183   
184void GrepEngine::initFileResult(std::vector<boost::filesystem::path> & paths) {
185    const unsigned n = paths.size();
186    mResultStrs.resize(n);
187    mFileStatus.resize(n, FileStatus::Pending);
188    inputPaths = paths;
189}
190
191void GrepEngine::initREs(std::vector<re::RE *> & REs) {
192    if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
193        mBreakCC = re::makeCC(re::makeCC(0x0A, 0x0D), re::makeCC(re::makeCC(0x85), re::makeCC(0x2028, 0x2029)));
194    } else if (mGrepRecordBreak == GrepRecordBreakKind::Null) {
195        mBreakCC = re::makeByte(0);  // Null
196    } else {
197        mBreakCC = re::makeByte(0x0A); // LF
198    }
199    re::RE * anchorRE = mBreakCC;
200    if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
201        re::Name * anchorName = re::makeName("UTF8_LB", re::Name::Type::Unicode);
202        anchorName->setDefinition(UCD::UnicodeBreakRE());
203        anchorRE = anchorName;
204    }
205   
206    mREs = REs;
207    bool allAnchored = true;
208    for(unsigned i = 0; i < mREs.size(); ++i) {
209        if (!hasEndAnchor(mREs[i])) allAnchored = false;
210        mREs[i] = resolveModesAndExternalSymbols(mREs[i], mCaseInsensitive);
211        mREs[i] = re::exclude_CC(mREs[i], mBreakCC);
212        mREs[i] = resolveAnchors(mREs[i], anchorRE);
213        re::gatherUnicodeProperties(mREs[i], mUnicodeProperties);
214        mREs[i] = regular_expression_passes(mREs[i]);
215    }
216    if (allAnchored && (mGrepRecordBreak != GrepRecordBreakKind::Unicode)) mMoveMatchesToEOL = false;
217
218}
219
220
221   
222// Code Generation
223//
224// All engines share a common pipeline to compute a stream of Matches from a given input Bytestream.
225
226unsigned LLVM_READNONE calculateMaxCountRate(const std::unique_ptr<kernel::KernelBuilder> & b) {
227    const unsigned packSize = b->getSizeTy()->getBitWidth();
228    return (packSize * packSize) / b->getBitBlockWidth();
229}
230   
231std::pair<StreamSetBuffer *, StreamSetBuffer *> GrepEngine::grepPipeline(StreamSetBuffer * ByteStream) {
232    auto & idb = mGrepDriver->getBuilder();
233    const unsigned segmentSize = codegen::SegmentSize;
234    const unsigned bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
235    // TODO: until we automate stream buffer sizing, use this calculation to determine how large our matches buffer needs to be.
236    const unsigned baseBufferSize = segmentSize * (mMaxCount > 0 ? (std::max(bufferSegments, calculateMaxCountRate(idb))) : bufferSegments);
237    const unsigned encodingBits = 8;
238   
239   
240    //  Regular Expression Processing and Analysis Phase
241    const auto nREs = mREs.size();
242    bool hasGCB[nREs];
243    bool anyGCB = false;
244
245    for(unsigned i = 0; i < nREs; ++i) {
246        hasGCB[i] = hasGraphemeClusterBoundary(mREs[i]);
247        anyGCB |= hasGCB[i];
248    }
249    StreamSetBuffer * LineBreakStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
250    std::vector<StreamSetBuffer *> MatchResultsBufs(nREs);
251   
252    re::RE * prefixRE;
253    re::RE * suffixRE;
254    // For simple regular expressions with a small number of characters, we
255    // can bypass transposition and use the Direct CC compiler.
256    bool isSimple = (nREs == 1) && (mGrepRecordBreak != GrepRecordBreakKind::Unicode) && (!anyGCB);
257    if (isSimple) {
258        mREs[0] = toUTF8(mREs[0]);
259    }
260    if (isSimple && byteTestsWithinLimit(mREs[0], ByteCClimit)) {
261        std::vector<std::string> externalStreamNames;
262        std::vector<StreamSetBuffer *> icgrepInputSets = {ByteStream};
263        if (MultithreadedSimpleRE && hasTriCCwithinLimit(mREs[0], ByteCClimit, prefixRE, suffixRE)) {
264            auto CCs = re::collectCCs(prefixRE, &cc::Byte);
265            for (auto cc : CCs) {
266                auto ccName = makeName(cc);
267                mREs[0] = re::replaceCC(mREs[0], cc, ccName);
268                std::string ccNameStr = ccName->getFullName();
269                StreamSetBuffer * ccStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
270                kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::DirectCharacterClassKernelBuilder>(idb, ccNameStr, std::vector<re::CC *>{cc});
271                mGrepDriver->makeKernelCall(ccK, {ByteStream}, {ccStream});
272                externalStreamNames.push_back(ccNameStr);
273                icgrepInputSets.push_back(ccStream);
274            }
275        }
276        StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
277        kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ByteGrepKernel>(idb, mREs[0], externalStreamNames);
278        mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
279        MatchResultsBufs[0] = MatchResults;
280        kernel::Kernel * breakK = mGrepDriver->addKernelInstance<kernel::DirectCharacterClassKernelBuilder>(idb, "breakCC", std::vector<re::CC *>{mBreakCC});
281        mGrepDriver->makeKernelCall(breakK, {ByteStream}, {LineBreakStream});
282    } else if (isSimple && hasTriCCwithinLimit(mREs[0], ByteCClimit, prefixRE, suffixRE)) {
283        std::vector<std::string> externalStreamNames;
284        std::vector<StreamSetBuffer *> icgrepInputSets = {ByteStream};
285        if (MultithreadedSimpleRE) {
286            auto CCs = re::collectCCs(prefixRE, &cc::Byte);
287            for (auto cc : CCs) {
288                auto ccName = makeName(cc);
289                mREs[0] = re::replaceCC(mREs[0], cc, ccName);
290                std::string ccNameStr = ccName->getFullName();
291                StreamSetBuffer * ccStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
292                kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::DirectCharacterClassKernelBuilder>(idb, ccNameStr, std::vector<re::CC *>{cc});
293                mGrepDriver->makeKernelCall(ccK, {ByteStream}, {ccStream});
294                externalStreamNames.push_back(ccNameStr);
295                icgrepInputSets.push_back(ccStream);
296            }
297        }
298        StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
299        kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ByteBitGrepKernel>(idb, prefixRE, suffixRE, externalStreamNames);
300        mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
301        MatchResultsBufs[0] = MatchResults;
302        kernel::Kernel * breakK = mGrepDriver->addKernelInstance<kernel::DirectCharacterClassKernelBuilder>(idb, "breakCC", std::vector<re::CC *>{mBreakCC});
303        mGrepDriver->makeKernelCall(breakK, {ByteStream}, {LineBreakStream});
304    } else {
305       
306        StreamSetBuffer * BasisBits = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(encodingBits, 1), baseBufferSize);
307        kernel::Kernel * s2pk = nullptr;
308        if (PabloTransposition) {
309            s2pk = mGrepDriver->addKernelInstance<kernel::S2P_PabloKernel>(idb);
310        }
311        else {
312            s2pk = mGrepDriver->addKernelInstance<kernel::S2PKernel>(idb);
313        }
314        mGrepDriver->makeKernelCall(s2pk, {ByteStream}, {BasisBits});
315
316        StreamSetBuffer * RequiredStreams = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
317        StreamSetBuffer * UnicodeLB = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
318
319        StreamSetBuffer * LineFeedStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
320        kernel::Kernel * linefeedK = mGrepDriver->addKernelInstance<kernel::LineFeedKernelBuilder>(idb, Binding{idb->getStreamSetTy(8), "basis", FixedRate(), Principal()});
321        mGrepDriver->makeKernelCall(linefeedK, {BasisBits}, {LineFeedStream});
322       
323        kernel::Kernel * requiredStreamsK = mGrepDriver->addKernelInstance<kernel::RequiredStreams_UTF8>(idb);
324        mGrepDriver->makeKernelCall(requiredStreamsK, {BasisBits, LineFeedStream}, {RequiredStreams, UnicodeLB});
325
326        if (mGrepRecordBreak == GrepRecordBreakKind::LF) {
327            LineBreakStream = LineFeedStream;
328        } else if (mGrepRecordBreak == GrepRecordBreakKind::Null) {
329            kernel::Kernel * breakK = mGrepDriver->addKernelInstance<kernel::ParabixCharacterClassKernelBuilder>(idb, "Null", std::vector<re::CC *>{mBreakCC}, 8);
330            mGrepDriver->makeKernelCall(breakK, {BasisBits}, {LineBreakStream});
331        } else {
332            LineBreakStream = UnicodeLB;
333        }
334       
335        std::map<std::string, StreamSetBuffer *> propertyStream;
336        if (PropertyKernels) {
337            for (auto p : mUnicodeProperties) {
338                auto name = p->getFullName();
339                StreamSetBuffer * s = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
340                propertyStream.emplace(std::make_pair(name, s));
341                kernel::Kernel * propertyK = mGrepDriver->addKernelInstance<kernel::UnicodePropertyKernelBuilder>(idb, p);
342                mGrepDriver->makeKernelCall(propertyK, {BasisBits}, {s});
343            }
344        }
345        StreamSetBuffer * GCB_stream = nullptr;
346        if (anyGCB) {
347            GCB_stream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
348            kernel::Kernel * gcbK = mGrepDriver->addKernelInstance<kernel::GraphemeClusterBreakKernel>(idb);
349            mGrepDriver->makeKernelCall(gcbK, {BasisBits, RequiredStreams}, {GCB_stream});
350        }
351
352        for(unsigned i = 0; i < nREs; ++i) {
353            std::vector<std::string> externalStreamNames;
354            std::vector<StreamSetBuffer *> icgrepInputSets = {BasisBits};
355            if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
356                externalStreamNames.push_back("UTF8_LB");
357                icgrepInputSets.push_back(LineBreakStream);
358                externalStreamNames.push_back("UTF8_nonfinal");
359                icgrepInputSets.push_back(RequiredStreams);
360            }
361            std::set<re::Name *> UnicodeProperties;
362            if (PropertyKernels) {
363                re::gatherUnicodeProperties(mREs[i], UnicodeProperties);
364                for (auto p : UnicodeProperties) {
365                    auto name = p->getFullName();
366                    auto f = propertyStream.find(name);
367                    if (f == propertyStream.end()) report_fatal_error(name + " not found\n");
368                    externalStreamNames.push_back(name);
369                    icgrepInputSets.push_back(f->second);
370                }
371            }
372            if (hasGCB[i]) {
373                externalStreamNames.push_back("\\b{g}");
374                icgrepInputSets.push_back(GCB_stream);
375            }
376            if (CC_Multiplexing) {
377                const auto UnicodeSets = re::collectCCs(mREs[i], &cc::Unicode, std::set<re::Name *>({re::makeZeroWidth("\\b{g}")}));
378                StreamSetBuffer * const MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
379                if (UnicodeSets.size() <= 1) {
380                    kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, mREs[i], externalStreamNames);
381                    mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
382                    MatchResultsBufs[i] = MatchResults;
383                } else {
384                    mpx = make_unique<MultiplexedAlphabet>("mpx", UnicodeSets);
385                    mREs[i] = transformCCs(mpx.get(), mREs[i]);
386                    std::vector<re::CC *> mpx_basis = mpx->getMultiplexedCCs();
387                    auto numOfCharacterClasses = mpx_basis.size();
388                    StreamSetBuffer * CharClasses = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), baseBufferSize);
389                    kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis));
390                    mGrepDriver->makeKernelCall(ccK, {BasisBits}, {CharClasses});
391    //                kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis), true);
392    //                mGrepDriver->makeKernelCall(ccK, {ByteStream}, {CharClasses});
393                    kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, mREs[i], externalStreamNames, std::vector<cc::Alphabet *>{mpx.get()});
394                    icgrepInputSets.push_back(CharClasses);
395                    mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
396                    MatchResultsBufs[i] = MatchResults;
397                }
398            } else {
399                StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
400                kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, mREs[i], externalStreamNames);
401                mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
402                MatchResultsBufs[i] = MatchResults;
403            }
404        }
405    }
406
407    StreamSetBuffer * MergedResults = MatchResultsBufs[0];
408    if (mREs.size() > 1) {
409        MergedResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
410        kernel::Kernel * streamsMergeK = mGrepDriver->addKernelInstance<kernel::StreamsMerge>(idb, 1, mREs.size());
411        mGrepDriver->makeKernelCall(streamsMergeK, MatchResultsBufs, {MergedResults});
412    }
413    StreamSetBuffer * Matches = MergedResults;
414    if (mMoveMatchesToEOL) {
415        StreamSetBuffer * OriginalMatches = Matches;
416        kernel::Kernel * matchedLinesK = mGrepDriver->addKernelInstance<kernel::MatchedLinesKernel>(idb);
417        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
418        mGrepDriver->makeKernelCall(matchedLinesK, {OriginalMatches, LineBreakStream}, {Matches});
419    }
420    if (mInvertMatches) {
421        kernel::Kernel * invertK = mGrepDriver->addKernelInstance<kernel::InvertMatchesKernel>(idb);
422        StreamSetBuffer * OriginalMatches = Matches;
423        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
424        mGrepDriver->makeKernelCall(invertK, {OriginalMatches, LineBreakStream}, {Matches});
425    }
426    if (mMaxCount > 0) {
427        kernel::Kernel * untilK = mGrepDriver->addKernelInstance<kernel::UntilNkernel>(idb);
428        untilK->setInitialArguments({idb->getSize(mMaxCount)});
429        StreamSetBuffer * const AllMatches = Matches;
430        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
431        mGrepDriver->makeKernelCall(untilK, {AllMatches}, {Matches});
432    }
433
434    return std::pair<StreamSetBuffer *, StreamSetBuffer *>(LineBreakStream, Matches);
435}
436
437// The QuietMode, MatchOnly and CountOnly engines share a common code generation main function,
438// which returns a count of the matches found (possibly subject to a MaxCount).
439//
440
441void GrepEngine::grepCodeGen() {
442
443    assert (mGrepDriver == nullptr);
444    mGrepDriver = new ParabixDriver("engine");
445    auto & idb = mGrepDriver->getBuilder();
446    Module * M = idb->getModule();
447
448    const unsigned encodingBits = 8;
449
450    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getInt64Ty(), idb->getInt8Ty(), idb->getInt32Ty(), nullptr));
451    mainFunc->setCallingConv(CallingConv::C);
452    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
453    auto args = mainFunc->arg_begin();
454
455    Value * const useMMap = &*(args++);
456    useMMap->setName("useMMap");
457    Value * const fileDescriptor = &*(args++);
458    fileDescriptor->setName("fileDescriptor");
459
460    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits));
461    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance<kernel::FDSourceKernel>(idb);
462    sourceK->setInitialArguments({useMMap, fileDescriptor});
463    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
464
465    StreamSetBuffer * LineBreakStream;
466    StreamSetBuffer * Matches;
467    std::tie(LineBreakStream, Matches) = grepPipeline(ByteStream);
468
469    kernel::Kernel * matchCountK = mGrepDriver->addKernelInstance<kernel::PopcountKernel>(idb);
470    mGrepDriver->makeKernelCall(matchCountK, {Matches}, {});
471    mGrepDriver->generatePipelineIR();
472    idb->setKernel(matchCountK);
473    Value * matchedLineCount = idb->getAccumulator("countResult");
474    matchedLineCount = idb->CreateZExt(matchedLineCount, idb->getInt64Ty());
475    mGrepDriver->deallocateBuffers();
476    idb->CreateRet(matchedLineCount);
477   
478    mGrepDriver->finalizeObject();
479}
480
481
482//
483//  Default Report Match:  lines are emitted with whatever line terminators are found in the
484//  input.  However, if the final line is not terminated, a new line is appended.
485//
486void EmitMatch::accumulate_match (const size_t lineNum, char * line_start, char * line_end) {
487    mResultStr << mLinePrefix;
488    if (mShowLineNumbers) {
489        // Internally line numbers are counted from 0.  For display, adjust
490        // the line number so that lines are numbered from 1.
491        if (mInitialTab) {
492            mResultStr << lineNum+1 << "\t:";
493        }
494        else {
495            mResultStr << lineNum+1 << ":";
496        }
497    }
498    size_t bytes = line_end - line_start + 1;
499    mResultStr.write(line_start, bytes);
500    mLineCount++;
501    unsigned last_byte = *line_end;
502    mTerminated = (last_byte >= 0x0A) && (last_byte <= 0x0D);
503    if (LLVM_UNLIKELY(!mTerminated)) {
504        if (last_byte == 0x85) {  //  Possible NEL terminator.
505            mTerminated = (bytes >= 2) && (static_cast<unsigned>(line_end[-1]) == 0xC2);
506        }
507        else {
508            // Possible LS or PS terminators.
509            mTerminated = (bytes >= 3) && (static_cast<unsigned>(line_end[-2]) == 0xE2)
510                                       && (static_cast<unsigned>(line_end[-1]) == 0x80)
511                                       && ((last_byte == 0xA8) || (last_byte == 0xA9));
512        }
513    }
514}
515
516void EmitMatch::finalize_match(char * buffer_end) {
517    if (!mTerminated) mResultStr << "\n";
518}
519
520void EmitMatchesEngine::grepCodeGen() {
521    assert (mGrepDriver == nullptr);
522    mGrepDriver = new ParabixDriver("engine");
523    auto & idb = mGrepDriver->getBuilder();
524    Module * M = idb->getModule();
525
526    const unsigned encodingBits = 8;
527
528    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getInt64Ty(), idb->getInt8Ty(), idb->getInt32Ty(), idb->getIntAddrTy(), nullptr));
529    mainFunc->setCallingConv(CallingConv::C);
530    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
531    auto args = mainFunc->arg_begin();
532
533    Value * const useMMap = &*(args++);
534    useMMap->setName("useMMap");
535    Value * const fileDescriptor = &*(args++);
536    fileDescriptor->setName("fileDescriptor");
537    Value * match_accumulator = &*(args++);
538    match_accumulator->setName("match_accumulator");
539
540    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits));
541    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance<kernel::FDSourceKernel>(idb);
542    sourceK->setInitialArguments({useMMap, fileDescriptor});
543    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
544
545    StreamSetBuffer * LineBreakStream;
546    StreamSetBuffer * Matches;
547    std::tie(LineBreakStream, Matches) = grepPipeline(ByteStream);
548
549    kernel::Kernel * scanMatchK = mGrepDriver->addKernelInstance<kernel::ScanMatchKernel>(idb);
550    scanMatchK->setInitialArguments({match_accumulator});
551    mGrepDriver->makeKernelCall(scanMatchK, {Matches, LineBreakStream, ByteStream}, {});
552    mGrepDriver->LinkFunction(*scanMatchK, "accumulate_match_wrapper", &accumulate_match_wrapper);
553    mGrepDriver->LinkFunction(*scanMatchK, "finalize_match_wrapper", &finalize_match_wrapper);
554
555    mGrepDriver->generatePipelineIR();
556    mGrepDriver->deallocateBuffers();
557    idb->CreateRet(idb->getInt64(0));
558    mGrepDriver->finalizeObject();
559}
560
561
562//
563//  The doGrep methods apply a GrepEngine to a single file, processing the results
564//  differently based on the engine type.
565
566uint64_t GrepEngine::doGrep(const std::string & fileName, std::ostringstream & strm) {
567    typedef uint64_t (*GrepFunctionType)(bool useMMap, int32_t fileDescriptor);
568    using namespace boost::filesystem;
569    path p(fileName);
570    bool useMMap = mPreferMMap;
571    if (p == "-") useMMap = false;
572    if (!is_regular_file(p)) useMMap = false;
573
574    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
575
576    int32_t fileDescriptor = openFile(fileName, strm);
577    if (fileDescriptor == -1) return 0;
578
579    uint64_t grepResult = f(useMMap, fileDescriptor);
580    close(fileDescriptor);
581    return grepResult;
582}
583
584uint64_t CountOnlyEngine::doGrep(const std::string & fileName, std::ostringstream & strm) {
585    uint64_t grepResult = GrepEngine::doGrep(fileName, strm);
586    if (mShowFileNames) strm << linePrefix(fileName);
587    strm << grepResult << "\n";
588    return grepResult;
589}
590
591std::string GrepEngine::linePrefix(std::string fileName) {
592    if (!mShowFileNames) return "";
593    if (fileName == "-") {
594        return mStdinLabel + mFileSuffix;
595    }
596    else {
597        return fileName + mFileSuffix;
598    }
599}
600
601uint64_t MatchOnlyEngine::doGrep(const std::string & fileName, std::ostringstream & strm) {
602    uint64_t grepResult = GrepEngine::doGrep(fileName, strm);
603    if (grepResult == mRequiredCount) {
604       strm << linePrefix(fileName);
605    }
606    return grepResult;
607}
608
609uint64_t EmitMatchesEngine::doGrep(const std::string & fileName, std::ostringstream & strm) {
610    typedef uint64_t (*GrepFunctionType)(bool useMMap, int32_t fileDescriptor, intptr_t accum_addr);
611    using namespace boost::filesystem;
612    path p(fileName);
613    bool useMMap = mPreferMMap;
614    if (p == "-") useMMap = false;
615    if (!is_regular_file(p)) useMMap = false;
616    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
617    int32_t fileDescriptor = openFile(fileName, strm);
618    if (fileDescriptor == -1) return 0;
619    EmitMatch accum(linePrefix(fileName), mShowLineNumbers, mInitialTab, strm);
620    f(useMMap, fileDescriptor, reinterpret_cast<intptr_t>(&accum));
621    close(fileDescriptor);
622    if (accum.mLineCount > 0) grepMatchFound = true;
623    return accum.mLineCount;
624}
625
626// Open a file and return its file desciptor.
627int32_t GrepEngine::openFile(const std::string & fileName, std::ostringstream & msgstrm) {
628    if (fileName == "-") {
629        return STDIN_FILENO;
630    }
631    else {
632        struct stat sb;
633        int32_t fileDescriptor = open(fileName.c_str(), O_RDONLY);
634        if (LLVM_UNLIKELY(fileDescriptor == -1)) {
635            if (!mSuppressFileMessages) {
636                if (errno == EACCES) {
637                    msgstrm << "icgrep: " << fileName << ": Permission denied.\n";
638                }
639                else if (errno == ENOENT) {
640                    msgstrm << "icgrep: " << fileName << ": No such file.\n";
641                }
642                else {
643                    msgstrm << "icgrep: " << fileName << ": Failed.\n";
644                }
645            }
646            return fileDescriptor;
647        }
648        if (stat(fileName.c_str(), &sb) == 0 && S_ISDIR(sb.st_mode)) {
649            if (!mSuppressFileMessages) {
650                msgstrm << "icgrep: " << fileName << ": Is a directory.\n";
651            }
652            close(fileDescriptor);
653            return -1;
654        }
655        return fileDescriptor;
656    }
657}
658
659// The process of searching a group of files may use a sequential or a task
660// parallel approach.
661
662void * DoGrepThreadFunction(void *args) {
663    return reinterpret_cast<GrepEngine *>(args)->DoGrepThreadMethod();
664}
665
666bool GrepEngine::searchAllFiles() {
667    const unsigned numOfThreads = std::min(static_cast<unsigned>(Threads), static_cast<unsigned>(inputPaths.size()));
668    std::vector<pthread_t> threads(numOfThreads);
669
670    for(unsigned long i = 1; i < numOfThreads; ++i) {
671        const int rc = pthread_create(&threads[i], nullptr, DoGrepThreadFunction, (void *)this);
672        if (rc) {
673            llvm::report_fatal_error("Failed to create thread: code " + std::to_string(rc));
674        }
675    }
676    // Main thread also does the work;
677    DoGrepThreadMethod();
678    for(unsigned i = 1; i < numOfThreads; ++i) {
679        void * status = nullptr;
680        const int rc = pthread_join(threads[i], &status);
681        if (rc) {
682            llvm::report_fatal_error("Failed to join thread: code " + std::to_string(rc));
683        }
684    }
685    return grepMatchFound;
686}
687
688
689// DoGrep thread function.
690void * GrepEngine::DoGrepThreadMethod() {
691
692    unsigned fileIdx = mNextFileToGrep++;
693    while (fileIdx < inputPaths.size()) {
694        if (codegen::DebugOptionIsSet(codegen::TraceCounts)) {
695            errs() << "Tracing " << inputPaths[fileIdx].string() << "\n";
696        }
697        const auto grepResult = doGrep(inputPaths[fileIdx].string(), mResultStrs[fileIdx]);
698        mFileStatus[fileIdx] = FileStatus::GrepComplete;
699        if (grepResult > 0) {
700            grepMatchFound = true;
701        }
702        if ((mEngineKind == EngineKind::QuietMode) && grepMatchFound) {
703            if (pthread_self() != mEngineThread) {
704                pthread_exit(nullptr);
705            }
706            return nullptr;
707        }
708        fileIdx = mNextFileToGrep++;
709    }
710
711    unsigned printIdx = mNextFileToPrint++;
712    while (printIdx < inputPaths.size()) {
713        const bool readyToPrint = ((printIdx == 0) || (mFileStatus[printIdx - 1] == FileStatus::PrintComplete)) && (mFileStatus[printIdx] == FileStatus::GrepComplete);
714        if (readyToPrint) {
715            const auto output = mResultStrs[printIdx].str();
716            if (!output.empty()) {
717                llvm::outs() << output;
718            }
719            mFileStatus[printIdx] = FileStatus::PrintComplete;
720            printIdx = mNextFileToPrint++;
721        } else {
722            mGrepDriver->performIncrementalCacheCleanupStep();
723        }
724        sched_yield();
725    }
726
727    if (pthread_self() != mEngineThread) {
728        pthread_exit(nullptr);
729    } else {
730        // Always perform one final cache cleanup step.
731        mGrepDriver->performIncrementalCacheCleanupStep();
732        if (mGrepStdIn) {
733            std::ostringstream s;
734            const auto grepResult = doGrep("-", s);
735            llvm::outs() << s.str();
736            if (grepResult) grepMatchFound = true;
737        }
738    }
739    return nullptr;
740}
741
742   
743   
744InternalSearchEngine::InternalSearchEngine() :
745    mGrepRecordBreak(GrepRecordBreakKind::LF),
746    mCaseInsensitive(false),
747    mGrepDriver(nullptr) {}
748   
749InternalSearchEngine::~InternalSearchEngine() {
750    delete mGrepDriver;
751}
752
753void InternalSearchEngine::grepCodeGen(re::RE * matchingRE, re::RE * excludedRE, MatchAccumulator * accum) {
754    mGrepDriver = new ParabixDriver("InternalEngine");
755    auto & idb = mGrepDriver->getBuilder();
756    Module * M = idb->getModule();
757   
758    mSaveSegmentPipelineParallel = codegen::SegmentPipelineParallel;
759    codegen::SegmentPipelineParallel = false;
760    const unsigned segmentSize = codegen::BufferSegments * codegen::SegmentSize * codegen::ThreadNum;
761   
762    re::CC * breakCC = nullptr;
763    if (mGrepRecordBreak == GrepRecordBreakKind::Null) {
764        breakCC = re::makeByte(0x0);
765    } else {// if (mGrepRecordBreak == GrepRecordBreakKind::LF)
766        breakCC = re::makeByte(0x0A);
767    }
768    if (matchingRE != nullptr) {
769        matchingRE = resolveCaseInsensitiveMode(matchingRE, mCaseInsensitive);
770        matchingRE = regular_expression_passes(matchingRE);
771        matchingRE = re::exclude_CC(matchingRE, breakCC);
772        matchingRE = resolveAnchors(matchingRE, breakCC);
773        matchingRE = toUTF8(matchingRE);
774    }
775    if (excludedRE != nullptr) {
776        excludedRE = resolveCaseInsensitiveMode(excludedRE, mCaseInsensitive);
777        excludedRE = regular_expression_passes(excludedRE);
778        excludedRE = re::exclude_CC(excludedRE, breakCC);
779        excludedRE = resolveAnchors(excludedRE, breakCC);
780        excludedRE = toUTF8(excludedRE);
781    }
782    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getVoidTy(), idb->getInt8PtrTy(), idb->getSizeTy(), nullptr));
783    mainFunc->setCallingConv(CallingConv::C);
784    auto args = mainFunc->arg_begin();
785    Value * const buffer = &*(args++);
786    buffer->setName("buffer");
787    Value * length = &*(args++);
788    length->setName("length");
789   
790    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
791    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, 8));
792    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance<kernel::MemorySourceKernel>(idb, idb->getInt8PtrTy());
793    sourceK->setInitialArguments({buffer, length});
794    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
795    StreamSetBuffer * BasisBits = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(8, 1), segmentSize);
796    kernel::Kernel * s2pk = mGrepDriver->addKernelInstance<kernel::S2PKernel>(idb);
797    mGrepDriver->makeKernelCall(s2pk, {ByteStream}, {BasisBits});
798   
799    StreamSetBuffer * RecordBreakStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
800    std::string RBname = (mGrepRecordBreak == GrepRecordBreakKind::Null) ? "Null" : "LF";
801    kernel::Kernel * breakK = mGrepDriver->addKernelInstance<kernel::ParabixCharacterClassKernelBuilder>(idb, RBname, std::vector<re::CC *>{breakCC}, 8);
802    mGrepDriver->makeKernelCall(breakK, {BasisBits}, {RecordBreakStream});
803   
804   
805    std::vector<std::string> externalStreamNames;
806    StreamSetBuffer * MatchingRecords = nullptr;
807    if (matchingRE != nullptr) {
808        StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
809        kernel::Kernel * includeK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, matchingRE, externalStreamNames);
810        mGrepDriver->makeKernelCall(includeK, {BasisBits}, {MatchResults});
811        MatchingRecords = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
812        kernel::Kernel * matchedLinesK = mGrepDriver->addKernelInstance<kernel::MatchedLinesKernel>(idb);
813        mGrepDriver->makeKernelCall(matchedLinesK, {MatchResults, RecordBreakStream}, {MatchingRecords});
814    }
815    if (excludedRE != nullptr) {
816        StreamSetBuffer * ExcludedResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
817        kernel::Kernel * excludeK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, excludedRE, externalStreamNames);
818        mGrepDriver->makeKernelCall(excludeK, {BasisBits}, {ExcludedResults});
819        StreamSetBuffer * ExcludedRecords = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
820        kernel::Kernel * matchedLinesK = mGrepDriver->addKernelInstance<kernel::MatchedLinesKernel>(idb);
821        mGrepDriver->makeKernelCall(matchedLinesK, {ExcludedResults, RecordBreakStream}, {ExcludedRecords});
822
823        kernel::Kernel * invertK = mGrepDriver->addKernelInstance<kernel::InvertMatchesKernel>(idb);
824        if (matchingRE != nullptr) {
825            StreamSetBuffer * nonExcluded = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
826            mGrepDriver->makeKernelCall(invertK, {ExcludedRecords, RecordBreakStream}, {nonExcluded});
827            StreamSetBuffer * included = MatchingRecords;
828            MatchingRecords = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
829            kernel::Kernel * streamsIntersectK = mGrepDriver->addKernelInstance<kernel::StreamsIntersect>(idb, 1, 2);
830            mGrepDriver->makeKernelCall(streamsIntersectK, {included, nonExcluded}, {MatchingRecords});
831        }
832        else {
833            MatchingRecords = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
834            mGrepDriver->makeKernelCall(invertK, {ExcludedRecords, RecordBreakStream}, {MatchingRecords});
835        }
836    }
837    kernel::Kernel * scanMatchK = mGrepDriver->addKernelInstance<kernel::ScanMatchKernel>(idb);
838    scanMatchK->setInitialArguments({ConstantInt::get(idb->getIntAddrTy(), reinterpret_cast<intptr_t>(accum))});
839    mGrepDriver->makeKernelCall(scanMatchK, {MatchingRecords, RecordBreakStream, ByteStream}, {});
840    mGrepDriver->LinkFunction(*scanMatchK, "accumulate_match_wrapper", &accumulate_match_wrapper);
841    mGrepDriver->LinkFunction(*scanMatchK, "finalize_match_wrapper", &finalize_match_wrapper);
842    mGrepDriver->generatePipelineIR();
843    mGrepDriver->deallocateBuffers();
844    idb->CreateRetVoid();
845    mGrepDriver->finalizeObject();
846}
847
848void InternalSearchEngine::doGrep(const char * search_buffer, size_t bufferLength) {
849    typedef void (*GrepFunctionType)(const char * buffer, const size_t length);
850    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
851    f(search_buffer, bufferLength);
852    codegen::SegmentPipelineParallel = mSaveSegmentPipelineParallel;
853}
854
855}
Note: See TracBrowser for help on using the repository browser.