source: icGREP/icgrep-devel/icgrep/grep/grep_engine.cpp @ 5954

Last change on this file since 5954 was 5954, checked in by cameron, 13 months ago

InternalSearchEngine?

File size: 37.4 KB
Line 
1/*
2 *  Copyright (c) 2018 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6#include <set>
7#include "grep_engine.h"
8#include <llvm/IR/Module.h>
9#include <boost/filesystem.hpp>
10#include <UCD/resolve_properties.h>
11#include <kernels/charclasses.h>
12#include <kernels/cc_kernel.h>
13#include <kernels/grep_kernel.h>
14#include <kernels/UCD_property_kernel.h>
15#include <kernels/grapheme_kernel.h>
16#include <kernels/linebreak_kernel.h>
17#include <kernels/streams_merge.h>
18#include <kernels/source_kernel.h>
19#include <kernels/s2p_kernel.h>
20#include <kernels/scanmatchgen.h>
21#include <kernels/streamset.h>
22#include <kernels/until_n.h>
23#include <kernels/kernel_builder.h>
24#include <pablo/pablo_kernel.h>
25#include <cc/alphabet.h>
26#include <re/re_cc.h>
27#include <re/re_name.h>
28#include <re/casing.h>
29#include <re/exclude_CC.h>
30#include <re/to_utf8.h>
31#include <re/re_toolchain.h>
32#include <toolchain/toolchain.h>
33#include <re/re_analysis.h>
34#include <re/re_name_resolve.h>
35#include <re/re_name_gather.h>
36#include <re/collect_ccs.h>
37#include <re/replaceCC.h>
38#include <re/re_multiplex.h>
39#include <re/grapheme_clusters.h>
40#include <re/printer_re.h>
41#include <toolchain/toolchain.h>
42#include <toolchain/cpudriver.h>
43#include <iostream>
44#include <cc/multiplex_CCs.h>
45#include <llvm/Support/raw_ostream.h>
46#include <util/file_select.h>
47#include <util/aligned_allocator.h>
48#include <sys/stat.h>
49#include <fcntl.h>
50#include <errno.h>
51#include <llvm/ADT/STLExtras.h> // for make_unique
52#include <llvm/Support/CommandLine.h>
53#include <llvm/Support/Debug.h>
54#include <llvm/Support/Casting.h>
55#include <sched.h>
56
57using namespace parabix;
58using namespace llvm;
59using namespace cc;
60using namespace kernel;
61
62static cl::opt<int> Threads("t", cl::desc("Total number of threads."), cl::init(2));
63static cl::opt<bool> PabloTransposition("enable-pablo-s2p", cl::desc("Enable experimental pablo transposition."));
64static cl::opt<bool> CC_Multiplexing("CC-multiplexing", cl::desc("Enable CC multiplexing."), cl::init(false));
65static cl::opt<bool> PropertyKernels("enable-property-kernels", cl::desc("Enable Unicode property kernels."), cl::init(false));
66static cl::opt<bool> MultithreadedSimpleRE("enable-simple-RE-kernels", cl::desc("Enable individual CC kernels for simple REs."), cl::init(false));
67const unsigned DefaultByteCClimit = 6;
68
69static cl::opt<unsigned> ByteCClimit("byte-CC-limit", cl::desc("Max number of CCs for byte CC pipeline."), cl::init(DefaultByteCClimit));
70
71
72namespace grep {
73   
74
75extern "C" void accumulate_match_wrapper(intptr_t accum_addr, const size_t lineNum, char * line_start, char * line_end) {
76    reinterpret_cast<MatchAccumulator *>(accum_addr)->accumulate_match(lineNum, line_start, line_end);
77}
78
79extern "C" void finalize_match_wrapper(intptr_t accum_addr, char * buffer_end) {
80    reinterpret_cast<MatchAccumulator *>(accum_addr)->finalize_match(buffer_end);
81}
82   
83
// Returns ceil(log2(v)), i.e. the smallest k such that (1 << k) >= v.
// v must be non-zero.
inline static size_t ceil_log2(const size_t v) {
    assert ("log2(0) is undefined!" && v != 0);
    static_assert(sizeof(size_t) <= sizeof(unsigned long long), "size_t must fit in unsigned long long");
    // v == 1 would ask the builtin for the leading zeros of 0, which is
    // undefined; the mathematically correct answer is simply 0.
    if (v == 1) {
        return 0;
    }
    const unsigned long long predecessor = static_cast<unsigned long long>(v) - 1ULL;
    const size_t totalBits = sizeof(unsigned long long) * CHAR_BIT;
    return totalBits - static_cast<size_t>(__builtin_clzll(predecessor));
}
89
90void SearchableBuffer::addSearchCandidate(char * C_string_ptr, size_t length) {
91    if (mSpace_used + length >= mAllocated_capacity) {
92        size_t new_capacity = size_t{1} << (ceil_log2(mSpace_used + length + 1));
93        AlignedAllocator<char, BUFFER_ALIGNMENT> alloc;
94        char * new_buffer = alloc.allocate(new_capacity, 0);
95        memcpy(new_buffer, mBuffer_base, mSpace_used);
96        memset(&new_buffer[mSpace_used], 0, new_capacity-mSpace_used);
97        if (mBuffer_base != mInitial_buffer) {
98            alloc.deallocate(mBuffer_base, 0);
99        }
100        mBuffer_base = new_buffer;
101        mAllocated_capacity = new_capacity;
102    }
103    memcpy((void * ) &mBuffer_base[mSpace_used], C_string_ptr, length+1);
104    mSpace_used += length+1;
105    assert("Search candidate not null terminated" && (buffer_base[mSpace_used] == '\0'));
106    mEntries++;
107}
108
109SearchableBuffer::SearchableBuffer() :
110    mAllocated_capacity(INITIAL_CAPACITY), mBuffer_base(mInitial_buffer) {
111    memset(mBuffer_base, 0, INITIAL_CAPACITY);
112}
113
114SearchableBuffer::~SearchableBuffer() {
115    if (mBuffer_base != mInitial_buffer) {
116        AlignedAllocator<char, BUFFER_ALIGNMENT> alloc;
117        alloc.deallocate(mBuffer_base, 0);
118    }
119}
120
121
122
123// Grep Engine construction and initialization.
124
125GrepEngine::GrepEngine() :
126    mSuppressFileMessages(false),
127    mPreferMMap(true),
128    mShowFileNames(false),
129    mStdinLabel("(stdin)"),
130    mShowLineNumbers(false),
131    mInitialTab(false),
132    mCaseInsensitive(false),
133    mInvertMatches(false),
134    mMaxCount(0),
135    mGrepDriver(nullptr),
136    mNextFileToGrep(0),
137    mNextFileToPrint(0),
138    grepMatchFound(false),
139    mGrepRecordBreak(GrepRecordBreakKind::LF),
140    mMoveMatchesToEOL(true),
141    mEngineThread(pthread_self()) {}
142
143GrepEngine::~GrepEngine() {
144    delete mGrepDriver;
145}
146
147QuietModeEngine::QuietModeEngine() : GrepEngine() {
148    mEngineKind = EngineKind::QuietMode;
149    mMoveMatchesToEOL = false;
150    mMaxCount = 1;
151}
152
153MatchOnlyEngine::MatchOnlyEngine(bool showFilesWithoutMatch, bool useNullSeparators) :
154    GrepEngine(), mRequiredCount(showFilesWithoutMatch) {
155    mEngineKind = EngineKind::MatchOnly;
156    mFileSuffix = useNullSeparators ? std::string("\0", 1) : "\n";
157    mMoveMatchesToEOL = false;
158    mMaxCount = 1;
159}
160
161CountOnlyEngine::CountOnlyEngine() : GrepEngine() {
162    mEngineKind = EngineKind::CountOnly;
163    mFileSuffix = ":";
164}
165
166EmitMatchesEngine::EmitMatchesEngine() : GrepEngine() {
167    mEngineKind = EngineKind::EmitMatches;
168    mFileSuffix = mInitialTab ? "\t:" : ":";
169}
170
171   
172void GrepEngine::setRecordBreak(GrepRecordBreakKind b) {
173    mGrepRecordBreak = b;
174}
175
176   
177
178   
179void GrepEngine::initFileResult(std::vector<std::string> & filenames) {
180    const unsigned n = filenames.size();
181    mResultStrs.resize(n);
182    mFileStatus.resize(n, FileStatus::Pending);
183    inputFiles = filenames;
184}
185
186void GrepEngine::initREs(std::vector<re::RE *> & REs) {
187    if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
188        mBreakCC = re::makeCC(re::makeCC(0x0A, 0x0D), re::makeCC(re::makeCC(0x85), re::makeCC(0x2028, 0x2029)));
189    } else if (mGrepRecordBreak == GrepRecordBreakKind::Null) {
190        mBreakCC = re::makeByte(0);  // Null
191    } else {
192        mBreakCC = re::makeByte(0x0A); // LF
193    }
194    re::RE * anchorRE = mBreakCC;
195    if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
196        re::Name * anchorName = re::makeName("UTF8_LB", re::Name::Type::Unicode);
197        anchorName->setDefinition(UCD::UnicodeBreakRE());
198        anchorRE = anchorName;
199    }
200   
201    mREs = REs;
202    bool allAnchored = true;
203    for(unsigned i = 0; i < mREs.size(); ++i) {
204        if (!hasEndAnchor(mREs[i])) allAnchored = false;
205        mREs[i] = resolveModesAndExternalSymbols(mREs[i], mCaseInsensitive);
206        mREs[i] = re::exclude_CC(mREs[i], mBreakCC);
207        mREs[i] = resolveAnchors(mREs[i], anchorRE);
208        re::gatherUnicodeProperties(mREs[i], mUnicodeProperties);
209        mREs[i] = regular_expression_passes(mREs[i]);
210    }
211    if (allAnchored && (mGrepRecordBreak != GrepRecordBreakKind::Unicode)) mMoveMatchesToEOL = false;
212
213}
214
215
216   
217// Code Generation
218//
219// All engines share a common pipeline to compute a stream of Matches from a given input Bytestream.
220
221unsigned LLVM_READNONE calculateMaxCountRate(const std::unique_ptr<kernel::KernelBuilder> & b) {
222    const unsigned packSize = b->getSizeTy()->getBitWidth();
223    return (packSize * packSize) / b->getBitBlockWidth();
224}
225   
226std::pair<StreamSetBuffer *, StreamSetBuffer *> GrepEngine::grepPipeline(StreamSetBuffer * ByteStream) {
227    auto & idb = mGrepDriver->getBuilder();
228    const unsigned segmentSize = codegen::SegmentSize;
229    const unsigned bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
230    // TODO: until we automate stream buffer sizing, use this calculation to determine how large our matches buffer needs to be.
231    const unsigned baseBufferSize = segmentSize * (mMaxCount > 0 ? (std::max(bufferSegments, calculateMaxCountRate(idb))) : bufferSegments);
232    const unsigned encodingBits = 8;
233   
234   
235    //  Regular Expression Processing and Analysis Phase
236    const auto nREs = mREs.size();
237    bool hasGCB[nREs];
238    bool anyGCB = false;
239
240    for(unsigned i = 0; i < nREs; ++i) {
241        hasGCB[i] = hasGraphemeClusterBoundary(mREs[i]);
242        anyGCB |= hasGCB[i];
243    }
244    StreamSetBuffer * LineBreakStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
245    std::vector<StreamSetBuffer *> MatchResultsBufs(nREs);
246   
247    re::RE * prefixRE;
248    re::RE * suffixRE;
249    // For simple regular expressions with a small number of characters, we
250    // can bypass transposition and use the Direct CC compiler.
251    bool isSimple = (nREs == 1) && (mGrepRecordBreak != GrepRecordBreakKind::Unicode) && (!anyGCB);
252    if (isSimple) {
253        mREs[0] = toUTF8(mREs[0]);
254    }
255    if (isSimple && byteTestsWithinLimit(mREs[0], ByteCClimit)) {
256        std::vector<std::string> externalStreamNames;
257        std::vector<StreamSetBuffer *> icgrepInputSets = {ByteStream};
258        if (MultithreadedSimpleRE && hasTriCCwithinLimit(mREs[0], ByteCClimit, prefixRE, suffixRE)) {
259            auto CCs = re::collectCCs(prefixRE, &cc::Byte);
260            for (auto cc : CCs) {
261                auto ccName = makeName(cc);
262                mREs[0] = re::replaceCC(mREs[0], cc, ccName);
263                std::string ccNameStr = ccName->getFullName();
264                StreamSetBuffer * ccStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
265                kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::DirectCharacterClassKernelBuilder>(idb, ccNameStr, std::vector<re::CC *>{cc});
266                mGrepDriver->makeKernelCall(ccK, {ByteStream}, {ccStream});
267                externalStreamNames.push_back(ccNameStr);
268                icgrepInputSets.push_back(ccStream);
269            }
270        }
271        StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
272        kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ByteGrepKernel>(idb, mREs[0], externalStreamNames);
273        mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
274        MatchResultsBufs[0] = MatchResults;
275        kernel::Kernel * breakK = mGrepDriver->addKernelInstance<kernel::DirectCharacterClassKernelBuilder>(idb, "breakCC", std::vector<re::CC *>{mBreakCC});
276        mGrepDriver->makeKernelCall(breakK, {ByteStream}, {LineBreakStream});
277    } else if (isSimple && hasTriCCwithinLimit(mREs[0], ByteCClimit, prefixRE, suffixRE)) {
278        std::vector<std::string> externalStreamNames;
279        std::vector<StreamSetBuffer *> icgrepInputSets = {ByteStream};
280        if (MultithreadedSimpleRE) {
281            auto CCs = re::collectCCs(prefixRE, &cc::Byte);
282            for (auto cc : CCs) {
283                auto ccName = makeName(cc);
284                mREs[0] = re::replaceCC(mREs[0], cc, ccName);
285                std::string ccNameStr = ccName->getFullName();
286                StreamSetBuffer * ccStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
287                kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::DirectCharacterClassKernelBuilder>(idb, ccNameStr, std::vector<re::CC *>{cc});
288                mGrepDriver->makeKernelCall(ccK, {ByteStream}, {ccStream});
289                externalStreamNames.push_back(ccNameStr);
290                icgrepInputSets.push_back(ccStream);
291            }
292        }
293        StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
294        kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ByteBitGrepKernel>(idb, prefixRE, suffixRE, externalStreamNames);
295        mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
296        MatchResultsBufs[0] = MatchResults;
297        kernel::Kernel * breakK = mGrepDriver->addKernelInstance<kernel::DirectCharacterClassKernelBuilder>(idb, "breakCC", std::vector<re::CC *>{mBreakCC});
298        mGrepDriver->makeKernelCall(breakK, {ByteStream}, {LineBreakStream});
299    } else {
300       
301        StreamSetBuffer * BasisBits = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(encodingBits, 1), baseBufferSize);
302        kernel::Kernel * s2pk = nullptr;
303        if (PabloTransposition) {
304            s2pk = mGrepDriver->addKernelInstance<kernel::S2P_PabloKernel>(idb);
305        }
306        else {
307            s2pk = mGrepDriver->addKernelInstance<kernel::S2PKernel>(idb);
308        }
309        mGrepDriver->makeKernelCall(s2pk, {ByteStream}, {BasisBits});
310
311        StreamSetBuffer * RequiredStreams = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
312        StreamSetBuffer * UnicodeLB = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
313
314        StreamSetBuffer * LineFeedStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
315        kernel::Kernel * linefeedK = mGrepDriver->addKernelInstance<kernel::LineFeedKernelBuilder>(idb, Binding{idb->getStreamSetTy(8), "basis", FixedRate(), Principal()});
316        mGrepDriver->makeKernelCall(linefeedK, {BasisBits}, {LineFeedStream});
317       
318        kernel::Kernel * requiredStreamsK = mGrepDriver->addKernelInstance<kernel::RequiredStreams_UTF8>(idb);
319        mGrepDriver->makeKernelCall(requiredStreamsK, {BasisBits, LineFeedStream}, {RequiredStreams, UnicodeLB});
320
321        if (mGrepRecordBreak == GrepRecordBreakKind::LF) {
322            LineBreakStream = LineFeedStream;
323        } else if (mGrepRecordBreak == GrepRecordBreakKind::Null) {
324            kernel::Kernel * breakK = mGrepDriver->addKernelInstance<kernel::ParabixCharacterClassKernelBuilder>(idb, "Null", std::vector<re::CC *>{mBreakCC}, 8);
325            mGrepDriver->makeKernelCall(breakK, {BasisBits}, {LineBreakStream});
326        } else {
327            LineBreakStream = UnicodeLB;
328        }
329       
330        std::map<std::string, StreamSetBuffer *> propertyStream;
331        if (PropertyKernels) {
332            for (auto p : mUnicodeProperties) {
333                auto name = p->getFullName();
334                StreamSetBuffer * s = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
335                propertyStream.emplace(std::make_pair(name, s));
336                kernel::Kernel * propertyK = mGrepDriver->addKernelInstance<kernel::UnicodePropertyKernelBuilder>(idb, p);
337                mGrepDriver->makeKernelCall(propertyK, {BasisBits}, {s});
338            }
339        }
340        StreamSetBuffer * GCB_stream = nullptr;
341        if (anyGCB) {
342            GCB_stream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
343            kernel::Kernel * gcbK = mGrepDriver->addKernelInstance<kernel::GraphemeClusterBreakKernel>(idb);
344            mGrepDriver->makeKernelCall(gcbK, {BasisBits, RequiredStreams}, {GCB_stream});
345        }
346
347        for(unsigned i = 0; i < nREs; ++i) {
348            std::vector<std::string> externalStreamNames;
349            std::vector<StreamSetBuffer *> icgrepInputSets = {BasisBits};
350            if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
351                externalStreamNames.push_back("UTF8_LB");
352                icgrepInputSets.push_back(LineBreakStream);
353                externalStreamNames.push_back("UTF8_nonfinal");
354                icgrepInputSets.push_back(RequiredStreams);
355            }
356            std::set<re::Name *> UnicodeProperties;
357            if (PropertyKernels) {
358                re::gatherUnicodeProperties(mREs[i], UnicodeProperties);
359                for (auto p : UnicodeProperties) {
360                    auto name = p->getFullName();
361                    auto f = propertyStream.find(name);
362                    if (f == propertyStream.end()) report_fatal_error(name + " not found\n");
363                    externalStreamNames.push_back(name);
364                    icgrepInputSets.push_back(f->second);
365                }
366            }
367            if (hasGCB[i]) {
368                externalStreamNames.push_back("\\b{g}");
369                icgrepInputSets.push_back(GCB_stream);
370            }
371            if (CC_Multiplexing) {
372                const auto UnicodeSets = re::collectCCs(mREs[i], &cc::Unicode, std::set<re::Name *>({re::makeZeroWidth("\\b{g}")}));
373                StreamSetBuffer * const MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
374                if (UnicodeSets.size() <= 1) {
375                    kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, mREs[i], externalStreamNames);
376                    mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
377                    MatchResultsBufs[i] = MatchResults;
378                } else {
379                    mpx = make_unique<MultiplexedAlphabet>("mpx", UnicodeSets);
380                    mREs[i] = transformCCs(mpx.get(), mREs[i]);
381                    std::vector<re::CC *> mpx_basis = mpx->getMultiplexedCCs();
382                    auto numOfCharacterClasses = mpx_basis.size();
383                    StreamSetBuffer * CharClasses = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), baseBufferSize);
384                    kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis));
385                    mGrepDriver->makeKernelCall(ccK, {BasisBits}, {CharClasses});
386    //                kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis), true);
387    //                mGrepDriver->makeKernelCall(ccK, {ByteStream}, {CharClasses});
388                    kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, mREs[i], externalStreamNames, std::vector<cc::Alphabet *>{mpx.get()});
389                    icgrepInputSets.push_back(CharClasses);
390                    mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
391                    MatchResultsBufs[i] = MatchResults;
392                }
393            } else {
394                StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
395                kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, mREs[i], externalStreamNames);
396                mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
397                MatchResultsBufs[i] = MatchResults;
398            }
399        }
400    }
401
402    StreamSetBuffer * MergedResults = MatchResultsBufs[0];
403    if (mREs.size() > 1) {
404        MergedResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
405        kernel::Kernel * streamsMergeK = mGrepDriver->addKernelInstance<kernel::StreamsMerge>(idb, 1, mREs.size());
406        mGrepDriver->makeKernelCall(streamsMergeK, MatchResultsBufs, {MergedResults});
407    }
408    StreamSetBuffer * Matches = MergedResults;
409    if (mMoveMatchesToEOL) {
410        StreamSetBuffer * OriginalMatches = Matches;
411        kernel::Kernel * matchedLinesK = mGrepDriver->addKernelInstance<kernel::MatchedLinesKernel>(idb);
412        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
413        mGrepDriver->makeKernelCall(matchedLinesK, {OriginalMatches, LineBreakStream}, {Matches});
414    }
415    if (mInvertMatches) {
416        kernel::Kernel * invertK = mGrepDriver->addKernelInstance<kernel::InvertMatchesKernel>(idb);
417        StreamSetBuffer * OriginalMatches = Matches;
418        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
419        mGrepDriver->makeKernelCall(invertK, {OriginalMatches, LineBreakStream}, {Matches});
420    }
421    if (mMaxCount > 0) {
422        kernel::Kernel * untilK = mGrepDriver->addKernelInstance<kernel::UntilNkernel>(idb);
423        untilK->setInitialArguments({idb->getSize(mMaxCount)});
424        StreamSetBuffer * const AllMatches = Matches;
425        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
426        mGrepDriver->makeKernelCall(untilK, {AllMatches}, {Matches});
427    }
428
429    return std::pair<StreamSetBuffer *, StreamSetBuffer *>(LineBreakStream, Matches);
430}
431
432// The QuietMode, MatchOnly and CountOnly engines share a common code generation main function,
433// which returns a count of the matches found (possibly subject to a MaxCount).
434//
435
436void GrepEngine::grepCodeGen() {
437
438    assert (mGrepDriver == nullptr);
439    mGrepDriver = new ParabixDriver("engine");
440    auto & idb = mGrepDriver->getBuilder();
441    Module * M = idb->getModule();
442
443    const unsigned encodingBits = 8;
444
445    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getInt64Ty(), idb->getInt8Ty(), idb->getInt32Ty(), nullptr));
446    mainFunc->setCallingConv(CallingConv::C);
447    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
448    auto args = mainFunc->arg_begin();
449
450    Value * const useMMap = &*(args++);
451    useMMap->setName("useMMap");
452    Value * const fileDescriptor = &*(args++);
453    fileDescriptor->setName("fileDescriptor");
454
455    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits));
456    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance<kernel::FDSourceKernel>(idb);
457    sourceK->setInitialArguments({useMMap, fileDescriptor});
458    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
459
460    StreamSetBuffer * LineBreakStream;
461    StreamSetBuffer * Matches;
462    std::tie(LineBreakStream, Matches) = grepPipeline(ByteStream);
463
464    kernel::Kernel * matchCountK = mGrepDriver->addKernelInstance<kernel::PopcountKernel>(idb);
465    mGrepDriver->makeKernelCall(matchCountK, {Matches}, {});
466    mGrepDriver->generatePipelineIR();
467    idb->setKernel(matchCountK);
468    Value * matchedLineCount = idb->getAccumulator("countResult");
469    matchedLineCount = idb->CreateZExt(matchedLineCount, idb->getInt64Ty());
470    mGrepDriver->deallocateBuffers();
471    idb->CreateRet(matchedLineCount);
472    mGrepDriver->finalizeObject();
473}
474
475
476//
477//  Default Report Match:  lines are emitted with whatever line terminators are found in the
478//  input.  However, if the final line is not terminated, a new line is appended.
479//
480void EmitMatch::accumulate_match (const size_t lineNum, char * line_start, char * line_end) {
481    mResultStr << mLinePrefix;
482    if (mShowLineNumbers) {
483        // Internally line numbers are counted from 0.  For display, adjust
484        // the line number so that lines are numbered from 1.
485        if (mInitialTab) {
486            mResultStr << lineNum+1 << "\t:";
487        }
488        else {
489            mResultStr << lineNum+1 << ":";
490        }
491    }
492    size_t bytes = line_end - line_start + 1;
493    mResultStr.write(line_start, bytes);
494    mLineCount++;
495    unsigned last_byte = *line_end;
496    mTerminated = (last_byte >= 0x0A) && (last_byte <= 0x0D);
497    if (LLVM_UNLIKELY(!mTerminated)) {
498        if (last_byte == 0x85) {  //  Possible NEL terminator.
499            mTerminated = (bytes >= 2) && (static_cast<unsigned>(line_end[-1]) == 0xC2);
500        }
501        else {
502            // Possible LS or PS terminators.
503            mTerminated = (bytes >= 3) && (static_cast<unsigned>(line_end[-2]) == 0xE2)
504                                       && (static_cast<unsigned>(line_end[-1]) == 0x80)
505                                       && ((last_byte == 0xA8) || (last_byte == 0xA9));
506        }
507    }
508}
509
510void EmitMatch::finalize_match(char * buffer_end) {
511    if (!mTerminated) mResultStr << "\n";
512}
513
514void EmitMatchesEngine::grepCodeGen() {
515    assert (mGrepDriver == nullptr);
516    mGrepDriver = new ParabixDriver("engine");
517    auto & idb = mGrepDriver->getBuilder();
518    Module * M = idb->getModule();
519
520    const unsigned encodingBits = 8;
521
522    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getInt64Ty(), idb->getInt8Ty(), idb->getInt32Ty(), idb->getIntAddrTy(), nullptr));
523    mainFunc->setCallingConv(CallingConv::C);
524    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
525    auto args = mainFunc->arg_begin();
526
527    Value * const useMMap = &*(args++);
528    useMMap->setName("useMMap");
529    Value * const fileDescriptor = &*(args++);
530    fileDescriptor->setName("fileDescriptor");
531    Value * match_accumulator = &*(args++);
532    match_accumulator->setName("match_accumulator");
533
534    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits));
535    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance<kernel::FDSourceKernel>(idb);
536    sourceK->setInitialArguments({useMMap, fileDescriptor});
537    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
538
539    StreamSetBuffer * LineBreakStream;
540    StreamSetBuffer * Matches;
541    std::tie(LineBreakStream, Matches) = grepPipeline(ByteStream);
542
543    kernel::Kernel * scanMatchK = mGrepDriver->addKernelInstance<kernel::ScanMatchKernel>(idb);
544    scanMatchK->setInitialArguments({match_accumulator});
545    mGrepDriver->makeKernelCall(scanMatchK, {Matches, LineBreakStream, ByteStream}, {});
546    mGrepDriver->LinkFunction(*scanMatchK, "accumulate_match_wrapper", &accumulate_match_wrapper);
547    mGrepDriver->LinkFunction(*scanMatchK, "finalize_match_wrapper", &finalize_match_wrapper);
548
549    mGrepDriver->generatePipelineIR();
550    mGrepDriver->deallocateBuffers();
551    idb->CreateRet(idb->getInt64(0));
552    mGrepDriver->finalizeObject();
553}
554
555
556//
557//  The doGrep methods apply a GrepEngine to a single file, processing the results
558//  differently based on the engine type.
559
560uint64_t GrepEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
561    typedef uint64_t (*GrepFunctionType)(bool useMMap, int32_t fileDescriptor);
562    using namespace boost::filesystem;
563    path p(fileName);
564    bool useMMap = mPreferMMap;
565    if (p == "-") useMMap = false;
566    if (!is_regular_file(p)) useMMap = false;
567
568    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
569
570    int32_t fileDescriptor = openFile(fileName, mResultStrs[fileIdx]);
571    if (fileDescriptor == -1) return 0;
572
573    uint64_t grepResult = f(useMMap, fileDescriptor);
574    close(fileDescriptor);
575    return grepResult;
576}
577
578uint64_t CountOnlyEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
579    uint64_t grepResult = GrepEngine::doGrep(fileName, fileIdx);
580    if (mShowFileNames) mResultStrs[fileIdx] << linePrefix(fileName);
581    mResultStrs[fileIdx] << grepResult << "\n";
582    return grepResult;
583}
584
585std::string GrepEngine::linePrefix(std::string fileName) {
586    if (!mShowFileNames) return "";
587    if (fileName == "-") {
588        return mStdinLabel + mFileSuffix;
589    }
590    else {
591        return fileName + mFileSuffix;
592    }
593}
594
595uint64_t MatchOnlyEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
596    uint64_t grepResult = GrepEngine::doGrep(fileName, fileIdx);
597    if (grepResult == mRequiredCount) {
598       mResultStrs[fileIdx] << linePrefix(fileName);
599    }
600    return grepResult;
601}
602
603uint64_t EmitMatchesEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
604    typedef uint64_t (*GrepFunctionType)(bool useMMap, int32_t fileDescriptor, intptr_t accum_addr);
605    using namespace boost::filesystem;
606    path p(fileName);
607    bool useMMap = mPreferMMap;
608    if (p == "-") useMMap = false;
609    if (!is_regular_file(p)) useMMap = false;
610    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
611    int32_t fileDescriptor = openFile(fileName, mResultStrs[fileIdx]);
612    if (fileDescriptor == -1) return 0;
613    EmitMatch accum(linePrefix(fileName), mShowLineNumbers, mInitialTab, mResultStrs[fileIdx]);
614    f(useMMap, fileDescriptor, reinterpret_cast<intptr_t>(&accum));
615    close(fileDescriptor);
616    if (accum.mLineCount > 0) grepMatchFound = true;
617    return accum.mLineCount;
618}
619
620// Open a file and return its file desciptor.
621int32_t GrepEngine::openFile(const std::string & fileName, std::ostringstream & msgstrm) {
622    if (fileName == "-") {
623        return STDIN_FILENO;
624    }
625    else {
626        struct stat sb;
627        int32_t fileDescriptor = open(fileName.c_str(), O_RDONLY);
628        if (LLVM_UNLIKELY(fileDescriptor == -1)) {
629            if (!mSuppressFileMessages) {
630                if (errno == EACCES) {
631                    msgstrm << "icgrep: " << fileName << ": Permission denied.\n";
632                }
633                else if (errno == ENOENT) {
634                    msgstrm << "icgrep: " << fileName << ": No such file.\n";
635                }
636                else {
637                    msgstrm << "icgrep: " << fileName << ": Failed.\n";
638                }
639            }
640            return fileDescriptor;
641        }
642        if (stat(fileName.c_str(), &sb) == 0 && S_ISDIR(sb.st_mode)) {
643            if (!mSuppressFileMessages) {
644                msgstrm << "icgrep: " << fileName << ": Is a directory.\n";
645            }
646            close(fileDescriptor);
647            return -1;
648        }
649        return fileDescriptor;
650    }
651}
652
653// The process of searching a group of files may use a sequential or a task
654// parallel approach.
655
656void * DoGrepThreadFunction(void *args) {
657    return reinterpret_cast<GrepEngine *>(args)->DoGrepThreadMethod();
658}
659
660bool GrepEngine::searchAllFiles() {
661    const unsigned numOfThreads = std::min(static_cast<unsigned>(Threads), static_cast<unsigned>(inputFiles.size())); 
662    std::vector<pthread_t> threads(numOfThreads);
663
664    for(unsigned long i = 1; i < numOfThreads; ++i) {
665        const int rc = pthread_create(&threads[i], nullptr, DoGrepThreadFunction, (void *)this);
666        if (rc) {
667            llvm::report_fatal_error("Failed to create thread: code " + std::to_string(rc));
668        }
669    }
670    // Main thread also does the work;
671    DoGrepThreadMethod();
672    for(unsigned i = 1; i < numOfThreads; ++i) {
673        void * status = nullptr;
674        const int rc = pthread_join(threads[i], &status);
675        if (rc) {
676            llvm::report_fatal_error("Failed to join thread: code " + std::to_string(rc));
677        }
678    }
679    return grepMatchFound;
680}
681
682
683// DoGrep thread function.
684void * GrepEngine::DoGrepThreadMethod() {
685
686    unsigned fileIdx = mNextFileToGrep++;
687    while (fileIdx < inputFiles.size()) {
688        if (codegen::DebugOptionIsSet(codegen::TraceCounts)) {
689            errs() << "Tracing " << inputFiles[fileIdx] << "\n";
690        }
691        const auto grepResult = doGrep(inputFiles[fileIdx], fileIdx);
692        mFileStatus[fileIdx] = FileStatus::GrepComplete;
693        if (grepResult > 0) {
694            grepMatchFound = true;
695        }
696        if ((mEngineKind == EngineKind::QuietMode) && grepMatchFound) {
697            if (pthread_self() != mEngineThread) {
698                pthread_exit(nullptr);
699            }
700            return nullptr;
701        }
702        fileIdx = mNextFileToGrep++;
703    }
704
705    unsigned printIdx = mNextFileToPrint++;
706    while (printIdx < inputFiles.size()) {
707        const bool readyToPrint = ((printIdx == 0) || (mFileStatus[printIdx - 1] == FileStatus::PrintComplete)) && (mFileStatus[printIdx] == FileStatus::GrepComplete);
708        if (readyToPrint) {
709            const auto output = mResultStrs[printIdx].str();
710            if (!output.empty()) {
711                llvm::outs() << output;
712            }
713            mFileStatus[printIdx] = FileStatus::PrintComplete;
714            printIdx = mNextFileToPrint++;
715        } else {
716            mGrepDriver->performIncrementalCacheCleanupStep();
717        }
718        sched_yield();
719    }
720
721    if (pthread_self() != mEngineThread) {
722        pthread_exit(nullptr);
723    } else {
724        // Always perform one final cache cleanup step.
725        mGrepDriver->performIncrementalCacheCleanupStep();
726    }
727    return nullptr;
728}
729
730   
731   
732InternalSearchEngine::InternalSearchEngine() :
733    mGrepRecordBreak(GrepRecordBreakKind::LF),
734    mCaseInsensitive(false),
735    mGrepDriver(nullptr),
736    grepMatchFound(false) {}
737   
738InternalSearchEngine::~InternalSearchEngine() {
739    delete mGrepDriver;
740}
741
742void InternalSearchEngine::grepCodeGen(re::RE * matchingRE, re::RE * excludedRE, MatchAccumulator * accum) {
743    mGrepDriver = new ParabixDriver("InternalEngine");
744    auto & idb = mGrepDriver->getBuilder();
745    Module * M = idb->getModule();
746   
747    const unsigned encodingBits = 8;
748    const unsigned segmentSize = codegen::BufferSegments * codegen::SegmentSize * codegen::ThreadNum;
749   
750    re::CC * breakCC = nullptr;
751    if (mGrepRecordBreak == GrepRecordBreakKind::Null) {
752        breakCC = re::makeByte(0xA);
753    } else {// if (mGrepRecordBreak == GrepRecordBreakKind::LF)
754        breakCC = re::makeByte(0x0A);
755    }
756    if (matchingRE != nullptr) {
757        matchingRE = resolveCaseInsensitiveMode(matchingRE, mCaseInsensitive);
758        matchingRE = regular_expression_passes(matchingRE);
759        matchingRE = re::exclude_CC(matchingRE, breakCC);
760        matchingRE = resolveAnchors(matchingRE, breakCC);
761    }
762    if (excludedRE != nullptr) {
763        excludedRE = resolveCaseInsensitiveMode(excludedRE, mCaseInsensitive);
764        excludedRE = regular_expression_passes(excludedRE);
765        excludedRE = re::exclude_CC(excludedRE, breakCC);
766        excludedRE = resolveAnchors(excludedRE, breakCC);
767    }
768    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getVoidTy(), idb->getInt8PtrTy(), idb->getSizeTy(), nullptr));
769    mainFunc->setCallingConv(CallingConv::C);
770    auto args = mainFunc->arg_begin();
771    Value * const buffer = &*(args++);
772    buffer->setName("buffer");
773    Value * length = &*(args++);
774    length->setName("length");
775   
776    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
777    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, 8));
778    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance<kernel::MemorySourceKernel>(idb, idb->getInt8PtrTy());
779    sourceK->setInitialArguments({buffer, length});
780    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
781    StreamSetBuffer * BasisBits = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(8, 1), segmentSize);
782    kernel::Kernel * s2pk = mGrepDriver->addKernelInstance<kernel::S2PKernel>(idb);
783    mGrepDriver->makeKernelCall(s2pk, {ByteStream}, {BasisBits});
784   
785    StreamSetBuffer * RecordBreakStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
786    std::string RBname = (mGrepRecordBreak == GrepRecordBreakKind::Null) ? "Null" : "LF";
787    kernel::Kernel * breakK = mGrepDriver->addKernelInstance<kernel::ParabixCharacterClassKernelBuilder>(idb, RBname, std::vector<re::CC *>{breakCC}, 8);
788    mGrepDriver->makeKernelCall(breakK, {BasisBits}, {RecordBreakStream});
789   
790    StreamSetBuffer * MatchingRecords = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
791   
792    std::vector<std::string> externalStreamNames;
793    if (matchingRE != nullptr) {
794        StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
795        kernel::Kernel * includeK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, matchingRE, externalStreamNames);
796        mGrepDriver->makeKernelCall(includeK, {BasisBits}, {MatchResults});
797        kernel::Kernel * matchedLinesK = mGrepDriver->addKernelInstance<kernel::MatchedLinesKernel>(idb);
798        mGrepDriver->makeKernelCall(matchedLinesK, {MatchResults, RecordBreakStream}, {MatchingRecords});
799    }
800   
801    if (excludedRE != nullptr) {
802        StreamSetBuffer * ExcludedResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
803        kernel::Kernel * excludeK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, excludedRE, externalStreamNames);
804        mGrepDriver->makeKernelCall(excludeK, {BasisBits}, {ExcludedResults});
805        StreamSetBuffer * ExcludedRecords = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
806        kernel::Kernel * matchedLinesK = mGrepDriver->addKernelInstance<kernel::MatchedLinesKernel>(idb);
807        mGrepDriver->makeKernelCall(matchedLinesK, {ExcludedResults, RecordBreakStream}, {ExcludedRecords});
808
809        kernel::Kernel * invertK = mGrepDriver->addKernelInstance<kernel::InvertMatchesKernel>(idb);
810        if (matchingRE != nullptr) {
811            StreamSetBuffer * nonExcluded = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
812            mGrepDriver->makeKernelCall(invertK, {ExcludedRecords, RecordBreakStream}, {nonExcluded});
813            StreamSetBuffer * included = MatchingRecords;
814            kernel::Kernel * streamsMergeK = mGrepDriver->addKernelInstance<kernel::StreamsMerge>(idb, 1, 2);
815            mGrepDriver->makeKernelCall(streamsMergeK, {included, nonExcluded}, {MatchingRecords});
816        }
817        else {
818            mGrepDriver->makeKernelCall(invertK, {ExcludedRecords, RecordBreakStream}, {MatchingRecords});
819        }
820    }
821
822    kernel::Kernel * scanMatchK = mGrepDriver->addKernelInstance<kernel::ScanMatchKernel>(idb);
823    scanMatchK->setInitialArguments({ConstantInt::get(idb->getIntAddrTy(), reinterpret_cast<intptr_t>(accum))});
824    mGrepDriver->makeKernelCall(scanMatchK, {MatchingRecords, RecordBreakStream, ByteStream}, {});
825    mGrepDriver->LinkFunction(*scanMatchK, "accumulate_match_wrapper", &accumulate_match_wrapper);
826    mGrepDriver->LinkFunction(*scanMatchK, "finalize_match_wrapper", &finalize_match_wrapper);
827    mGrepDriver->generatePipelineIR();
828    mGrepDriver->deallocateBuffers();
829    idb->CreateRetVoid();
830    mGrepDriver->finalizeObject();
831}
832
833void InternalSearchEngine::doGrep(const char * search_buffer, size_t bufferLength) {
834    typedef void (*GrepFunctionType)(const char * buffer, const size_t length);
835    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
836    f(search_buffer, bufferLength);
837}
838
839}
Note: See TracBrowser for help on using the repository browser.