source: icGREP/icgrep-devel/icgrep/grep/grep_engine.cpp @ 5947

Last change on this file since 5947 was 5947, checked in by cameron, 12 months ago

include boost/align

File size: 35.1 KB
Line 
1/*
2 *  Copyright (c) 2018 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6#include <set>
7#include "grep_engine.h"
8#include <llvm/IR/Module.h>
9#include <boost/filesystem.hpp>
10#include <UCD/resolve_properties.h>
11#include <kernels/charclasses.h>
12#include <kernels/cc_kernel.h>
13#include <kernels/grep_kernel.h>
14#include <kernels/UCD_property_kernel.h>
15#include <kernels/grapheme_kernel.h>
16#include <kernels/linebreak_kernel.h>
17#include <kernels/streams_merge.h>
18#include <kernels/source_kernel.h>
19#include <kernels/s2p_kernel.h>
20#include <kernels/scanmatchgen.h>
21#include <kernels/streamset.h>
22#include <kernels/until_n.h>
23#include <kernels/kernel_builder.h>
24#include <pablo/pablo_kernel.h>
25#include <cc/alphabet.h>
26#include <re/re_cc.h>
27#include <re/re_name.h>
28#include <re/casing.h>
29#include <re/exclude_CC.h>
30#include <re/to_utf8.h>
31#include <re/re_toolchain.h>
32#include <toolchain/toolchain.h>
33#include <re/re_analysis.h>
34#include <re/re_name_resolve.h>
35#include <re/re_name_gather.h>
36#include <re/collect_ccs.h>
37#include <re/replaceCC.h>
38#include <re/re_multiplex.h>
39#include <re/grapheme_clusters.h>
40#include <re/printer_re.h>
41#include <toolchain/toolchain.h>
42#include <toolchain/cpudriver.h>
43#include <iostream>
44#include <cc/multiplex_CCs.h>
45#include <llvm/Support/raw_ostream.h>
46#include <util/file_select.h>
47#include <boost/align/aligned_allocator.hpp>
48#include <sys/stat.h>
49#include <fcntl.h>
50#include <errno.h>
51#include <llvm/ADT/STLExtras.h> // for make_unique
52#include <llvm/Support/CommandLine.h>
53#include <llvm/Support/Debug.h>
54#include <llvm/Support/Casting.h>
55#include <sched.h>
56
57using namespace parabix;
58using namespace llvm;
59using namespace cc;
60using namespace kernel;
61
62static cl::opt<int> Threads("t", cl::desc("Total number of threads."), cl::init(2));
63static cl::opt<bool> PabloTransposition("enable-pablo-s2p", cl::desc("Enable experimental pablo transposition."));
64static cl::opt<bool> CC_Multiplexing("CC-multiplexing", cl::desc("Enable CC multiplexing."), cl::init(false));
65static cl::opt<bool> PropertyKernels("enable-property-kernels", cl::desc("Enable Unicode property kernels."), cl::init(false));
66static cl::opt<bool> MultithreadedSimpleRE("enable-simple-RE-kernels", cl::desc("Enable individual CC kernels for simple REs."), cl::init(false));
67const unsigned DefaultByteCClimit = 6;
68
69static cl::opt<unsigned> ByteCClimit("byte-CC-limit", cl::desc("Max number of CCs for byte CC pipeline."), cl::init(DefaultByteCClimit));
70
71
72namespace grep {
73   
74
75extern "C" void accumulate_match_wrapper(intptr_t accum_addr, const size_t lineNum, char * line_start, char * line_end) {
76    reinterpret_cast<MatchAccumulator *>(accum_addr)->accumulate_match(lineNum, line_start, line_end);
77}
78
79extern "C" void finalize_match_wrapper(intptr_t accum_addr, char * buffer_end) {
80    reinterpret_cast<MatchAccumulator *>(accum_addr)->finalize_match(buffer_end);
81}
82   
// Returns ceil(log2(v)), i.e. the smallest k such that (1 << k) >= v.
// v must be non-zero (checked by assert in debug builds).
inline static size_t ceil_log2(const size_t v) {
    assert ("log2(0) is undefined!" && v != 0);
    // For v == 1 the expression (v - 1) is zero and __builtin_clzll(0) is
    // undefined, so answer that case directly.
    if (v <= 1) return 0;
    // __builtin_clzll counts leading zeros of an unsigned long long, so the
    // bit width used below must be that of unsigned long long, not size_t.
    const unsigned long long widened = static_cast<unsigned long long>(v) - 1ULL;
    return (sizeof(unsigned long long) * CHAR_BIT) - static_cast<size_t>(__builtin_clzll(widened));
}
87
88void SearchableBuffer::addSearchCandidate(char * string_ptr, size_t length) {
89    if (space_used + length >= allocated_capacity) {
90        size_t new_capacity = size_t{1} << (ceil_log2(space_used + length + 1));
91        char * new_buffer = (char *) boost::alignment::aligned_alloc(BUFFER_ALIGNMENT, new_capacity);
92        memcpy(new_buffer, buffer_base, space_used);
93        memset(&new_buffer[space_used], 0, new_capacity-space_used);
94        if (buffer_base != initial_buffer) {
95            free(buffer_base);
96        }
97        buffer_base = new_buffer;
98        allocated_capacity = new_capacity;
99    }
100    memcpy((void * ) &buffer_base[space_used], string_ptr, length);
101    space_used += length;
102    buffer_base[space_used] = '\0';
103    space_used++;
104    entries++;
105}
106
107SearchableBuffer::SearchableBuffer() :
108allocated_capacity(INITIAL_CAPACITY), buffer_base(initial_buffer) {
109    memset(buffer_base, 0, INITIAL_CAPACITY);
110}
111
112SearchableBuffer::~SearchableBuffer() {
113    if (buffer_base != initial_buffer) {
114        free(buffer_base);
115    }
116}
117
118void grepBuffer(re::RE * pattern, const char * search_buffer, size_t bufferLength, MatchAccumulator * accum) {
119    const unsigned segmentSize = codegen::BufferSegments * codegen::SegmentSize * codegen::ThreadNum;
120    auto segParallelModeSave = codegen::SegmentPipelineParallel;
121    codegen::SegmentPipelineParallel = false;
122   
123    pattern = resolveCaseInsensitiveMode(pattern, false);
124    pattern = regular_expression_passes(pattern);
125    pattern = re::exclude_CC(pattern, re::makeByte(0x0A));
126    pattern = resolveAnchors(pattern, re::makeByte(0x0A));
127
128    ParabixDriver pxDriver("codepointEngine");
129    auto & idb = pxDriver.getBuilder();
130    Module * M = idb->getModule();
131   
132    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getVoidTy(), idb->getInt8PtrTy(), idb->getSizeTy(), nullptr));
133    mainFunc->setCallingConv(CallingConv::C);
134    auto args = mainFunc->arg_begin();
135    Value * const buffer = &*(args++);
136    buffer->setName("buffer");
137    Value * length = &*(args++);
138    length->setName("length");
139   
140    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
141    StreamSetBuffer * ByteStream = pxDriver.addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, 8));
142    kernel::Kernel * sourceK = pxDriver.addKernelInstance<kernel::MemorySourceKernel>(idb, idb->getInt8PtrTy());
143    sourceK->setInitialArguments({buffer, length});
144    pxDriver.makeKernelCall(sourceK, {}, {ByteStream});
145   
146   
147    StreamSetBuffer * BasisBits = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(8, 1), segmentSize);
148    kernel::Kernel * s2pk = pxDriver.addKernelInstance<kernel::S2PKernel>(idb);
149    pxDriver.makeKernelCall(s2pk, {ByteStream}, {BasisBits});
150   
151    StreamSetBuffer * LineFeedStream = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
152    kernel::Kernel * linefeedK = pxDriver.addKernelInstance<kernel::LineFeedKernelBuilder>(idb, Binding{idb->getStreamSetTy(8), "basis", FixedRate(), Principal()});
153    pxDriver.makeKernelCall(linefeedK, {BasisBits}, {LineFeedStream});
154   
155    StreamSetBuffer * LineBreakStream = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
156   
157    kernel::Kernel * requiredStreamsK = pxDriver.addKernelInstance<kernel::RequiredStreams_UTF8>(idb);
158    StreamSetBuffer * RequiredStreams = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
159    pxDriver.makeKernelCall(requiredStreamsK, {BasisBits, LineFeedStream}, {RequiredStreams, LineBreakStream});
160   
161    StreamSetBuffer * MatchResults = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
162    kernel::Kernel * icgrepK = pxDriver.addKernelInstance<kernel::ICGrepKernel>(idb, pattern, std::vector<std::string>{"UTF8_LB", "UTF8_nonfinal"});
163    pxDriver.makeKernelCall(icgrepK, {BasisBits, LineBreakStream, RequiredStreams}, {MatchResults});
164   
165    StreamSetBuffer * MatchedLines = pxDriver.addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize);
166    kernel::Kernel * matchedLinesK = pxDriver.addKernelInstance<kernel::MatchedLinesKernel>(idb);
167    pxDriver.makeKernelCall(matchedLinesK, {MatchResults, LineBreakStream}, {MatchedLines});
168   
169    kernel::Kernel * scanMatchK = pxDriver.addKernelInstance<kernel::ScanMatchKernel>(idb);
170    scanMatchK->setInitialArguments({ConstantInt::get(idb->getIntAddrTy(), reinterpret_cast<intptr_t>(accum))});
171    pxDriver.makeKernelCall(scanMatchK, {MatchedLines, LineBreakStream, ByteStream}, {});
172    pxDriver.LinkFunction(*scanMatchK, "accumulate_match_wrapper", &accumulate_match_wrapper);
173    pxDriver.LinkFunction(*scanMatchK, "finalize_match_wrapper", &finalize_match_wrapper);
174   
175    pxDriver.generatePipelineIR();
176    pxDriver.deallocateBuffers();
177    idb->CreateRetVoid();
178    pxDriver.finalizeObject();
179   
180    typedef void (*GrepFunctionType)(const char * buffer, const size_t length);
181    auto f = reinterpret_cast<GrepFunctionType>(pxDriver.getMain());
182    f(search_buffer, bufferLength);
183    codegen::SegmentPipelineParallel = segParallelModeSave;
184}
185
186
187
188// Grep Engine construction and initialization.
189
190GrepEngine::GrepEngine() :
191    mSuppressFileMessages(false),
192    mPreferMMap(true),
193    mShowFileNames(false),
194    mStdinLabel("(stdin)"),
195    mShowLineNumbers(false),
196    mInitialTab(false),
197    mCaseInsensitive(false),
198    mInvertMatches(false),
199    mMaxCount(0),
200    mGrepDriver(nullptr),
201    mNextFileToGrep(0),
202    mNextFileToPrint(0),
203    grepMatchFound(false),
204    mGrepRecordBreak(GrepRecordBreakKind::LF),
205    mMoveMatchesToEOL(true),
206    mEngineThread(pthread_self()) {}
207
208GrepEngine::~GrepEngine() {
209    delete mGrepDriver;
210}
211
212QuietModeEngine::QuietModeEngine() : GrepEngine() {
213    mEngineKind = EngineKind::QuietMode;
214    mMoveMatchesToEOL = false;
215    mMaxCount = 1;
216}
217
218MatchOnlyEngine::MatchOnlyEngine(bool showFilesWithoutMatch, bool useNullSeparators) :
219    GrepEngine(), mRequiredCount(showFilesWithoutMatch) {
220    mEngineKind = EngineKind::MatchOnly;
221    mFileSuffix = useNullSeparators ? std::string("\0", 1) : "\n";
222    mMoveMatchesToEOL = false;
223    mMaxCount = 1;
224}
225
226CountOnlyEngine::CountOnlyEngine() : GrepEngine() {
227    mEngineKind = EngineKind::CountOnly;
228    mFileSuffix = ":";
229}
230
231EmitMatchesEngine::EmitMatchesEngine() : GrepEngine() {
232    mEngineKind = EngineKind::EmitMatches;
233    mFileSuffix = mInitialTab ? "\t:" : ":";
234}
235
236   
237void GrepEngine::setRecordBreak(GrepRecordBreakKind b) {
238    mGrepRecordBreak = b;
239}
240
241   
242
243   
244void GrepEngine::initFileResult(std::vector<std::string> & filenames) {
245    const unsigned n = filenames.size();
246    mResultStrs.resize(n);
247    mFileStatus.resize(n, FileStatus::Pending);
248    inputFiles = filenames;
249}
250
251void GrepEngine::initREs(std::vector<re::RE *> & REs) {
252    if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
253        mBreakCC = re::makeCC(re::makeCC(0x0A, 0x0D), re::makeCC(re::makeCC(0x85), re::makeCC(0x2028, 0x2029)));
254    } else if (mGrepRecordBreak == GrepRecordBreakKind::Null) {
255        mBreakCC = re::makeByte(0);  // Null
256    } else {
257        mBreakCC = re::makeByte(0x0A); // LF
258    }
259    re::RE * anchorRE = mBreakCC;
260    if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
261        re::Name * anchorName = re::makeName("UTF8_LB", re::Name::Type::Unicode);
262        anchorName->setDefinition(UCD::UnicodeBreakRE());
263        anchorRE = anchorName;
264    }
265   
266    mREs = REs;
267    bool allAnchored = true;
268    for(unsigned i = 0; i < mREs.size(); ++i) {
269        if (!hasEndAnchor(mREs[i])) allAnchored = false;
270        mREs[i] = resolveModesAndExternalSymbols(mREs[i], mCaseInsensitive);
271        mREs[i] = re::exclude_CC(mREs[i], mBreakCC);
272        mREs[i] = resolveAnchors(mREs[i], anchorRE);
273        re::gatherUnicodeProperties(mREs[i], mUnicodeProperties);
274        mREs[i] = regular_expression_passes(mREs[i]);
275    }
276    if (allAnchored && (mGrepRecordBreak != GrepRecordBreakKind::Unicode)) mMoveMatchesToEOL = false;
277
278}
279
280
281   
282// Code Generation
283//
284// All engines share a common pipeline to compute a stream of Matches from a given input Bytestream.
285
286unsigned LLVM_READNONE calculateMaxCountRate(const std::unique_ptr<kernel::KernelBuilder> & b) {
287    const unsigned packSize = b->getSizeTy()->getBitWidth();
288    return (packSize * packSize) / b->getBitBlockWidth();
289}
290   
291std::pair<StreamSetBuffer *, StreamSetBuffer *> GrepEngine::grepPipeline(StreamSetBuffer * ByteStream) {
292    auto & idb = mGrepDriver->getBuilder();
293    const unsigned segmentSize = codegen::SegmentSize;
294    const unsigned bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
295    // TODO: until we automate stream buffer sizing, use this calculation to determine how large our matches buffer needs to be.
296    const unsigned baseBufferSize = segmentSize * (mMaxCount > 0 ? (std::max(bufferSegments, calculateMaxCountRate(idb))) : bufferSegments);
297    const unsigned encodingBits = 8;
298   
299   
300    //  Regular Expression Processing and Analysis Phase
301    const auto nREs = mREs.size();
302    bool hasGCB[nREs];
303    bool anyGCB = false;
304
305    for(unsigned i = 0; i < nREs; ++i) {
306        hasGCB[i] = hasGraphemeClusterBoundary(mREs[i]);
307        anyGCB |= hasGCB[i];
308    }
309    StreamSetBuffer * LineBreakStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
310    std::vector<StreamSetBuffer *> MatchResultsBufs(nREs);
311   
312    re::RE * prefixRE;
313    re::RE * suffixRE;
314    // For simple regular expressions with a small number of characters, we
315    // can bypass transposition and use the Direct CC compiler.
316    bool isSimple = (nREs == 1) && (mGrepRecordBreak != GrepRecordBreakKind::Unicode) && (!anyGCB);
317    if (isSimple) {
318        mREs[0] = toUTF8(mREs[0]);
319    }
320    if (isSimple && byteTestsWithinLimit(mREs[0], ByteCClimit)) {
321        std::vector<std::string> externalStreamNames;
322        std::vector<StreamSetBuffer *> icgrepInputSets = {ByteStream};
323        if (MultithreadedSimpleRE && hasTriCCwithinLimit(mREs[0], ByteCClimit, prefixRE, suffixRE)) {
324            auto CCs = re::collectCCs(prefixRE, &cc::Byte);
325            for (auto cc : CCs) {
326                auto ccName = makeName(cc);
327                mREs[0] = re::replaceCC(mREs[0], cc, ccName);
328                std::string ccNameStr = ccName->getFullName();
329                StreamSetBuffer * ccStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
330                kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::DirectCharacterClassKernelBuilder>(idb, ccNameStr, std::vector<re::CC *>{cc});
331                mGrepDriver->makeKernelCall(ccK, {ByteStream}, {ccStream});
332                externalStreamNames.push_back(ccNameStr);
333                icgrepInputSets.push_back(ccStream);
334            }
335        }
336        StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
337        kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ByteGrepKernel>(idb, mREs[0], externalStreamNames);
338        mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
339        MatchResultsBufs[0] = MatchResults;
340        kernel::Kernel * breakK = mGrepDriver->addKernelInstance<kernel::DirectCharacterClassKernelBuilder>(idb, "breakCC", std::vector<re::CC *>{mBreakCC});
341        mGrepDriver->makeKernelCall(breakK, {ByteStream}, {LineBreakStream});
342    } else if (isSimple && hasTriCCwithinLimit(mREs[0], ByteCClimit, prefixRE, suffixRE)) {
343        std::vector<std::string> externalStreamNames;
344        std::vector<StreamSetBuffer *> icgrepInputSets = {ByteStream};
345        if (MultithreadedSimpleRE) {
346            auto CCs = re::collectCCs(prefixRE, &cc::Byte);
347            for (auto cc : CCs) {
348                auto ccName = makeName(cc);
349                mREs[0] = re::replaceCC(mREs[0], cc, ccName);
350                std::string ccNameStr = ccName->getFullName();
351                StreamSetBuffer * ccStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
352                kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::DirectCharacterClassKernelBuilder>(idb, ccNameStr, std::vector<re::CC *>{cc});
353                mGrepDriver->makeKernelCall(ccK, {ByteStream}, {ccStream});
354                externalStreamNames.push_back(ccNameStr);
355                icgrepInputSets.push_back(ccStream);
356            }
357        }
358        StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
359        kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ByteBitGrepKernel>(idb, prefixRE, suffixRE, externalStreamNames);
360        mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
361        MatchResultsBufs[0] = MatchResults;
362        kernel::Kernel * breakK = mGrepDriver->addKernelInstance<kernel::DirectCharacterClassKernelBuilder>(idb, "breakCC", std::vector<re::CC *>{mBreakCC});
363        mGrepDriver->makeKernelCall(breakK, {ByteStream}, {LineBreakStream});
364    } else {
365       
366        StreamSetBuffer * BasisBits = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(encodingBits, 1), baseBufferSize);
367        kernel::Kernel * s2pk = nullptr;
368        if (PabloTransposition) {
369            s2pk = mGrepDriver->addKernelInstance<kernel::S2P_PabloKernel>(idb);
370        }
371        else {
372            s2pk = mGrepDriver->addKernelInstance<kernel::S2PKernel>(idb);
373        }
374        mGrepDriver->makeKernelCall(s2pk, {ByteStream}, {BasisBits});
375
376        StreamSetBuffer * RequiredStreams = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
377        StreamSetBuffer * UnicodeLB = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
378
379        StreamSetBuffer * LineFeedStream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
380        kernel::Kernel * linefeedK = mGrepDriver->addKernelInstance<kernel::LineFeedKernelBuilder>(idb, Binding{idb->getStreamSetTy(8), "basis", FixedRate(), Principal()});
381        mGrepDriver->makeKernelCall(linefeedK, {BasisBits}, {LineFeedStream});
382       
383        kernel::Kernel * requiredStreamsK = mGrepDriver->addKernelInstance<kernel::RequiredStreams_UTF8>(idb);
384        mGrepDriver->makeKernelCall(requiredStreamsK, {BasisBits, LineFeedStream}, {RequiredStreams, UnicodeLB});
385
386        if (mGrepRecordBreak == GrepRecordBreakKind::LF) {
387            LineBreakStream = LineFeedStream;
388        } else if (mGrepRecordBreak == GrepRecordBreakKind::Null) {
389            kernel::Kernel * breakK = mGrepDriver->addKernelInstance<kernel::ParabixCharacterClassKernelBuilder>(idb, "Null", std::vector<re::CC *>{mBreakCC}, 8);
390            mGrepDriver->makeKernelCall(breakK, {BasisBits}, {LineBreakStream});
391        } else {
392            LineBreakStream = UnicodeLB;
393        }
394       
395        std::map<std::string, StreamSetBuffer *> propertyStream;
396        if (PropertyKernels) {
397            for (auto p : mUnicodeProperties) {
398                auto name = p->getFullName();
399                StreamSetBuffer * s = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
400                propertyStream.emplace(std::make_pair(name, s));
401                kernel::Kernel * propertyK = mGrepDriver->addKernelInstance<kernel::UnicodePropertyKernelBuilder>(idb, p);
402                mGrepDriver->makeKernelCall(propertyK, {BasisBits}, {s});
403            }
404        }
405        StreamSetBuffer * GCB_stream = nullptr;
406        if (anyGCB) {
407            GCB_stream = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
408            kernel::Kernel * gcbK = mGrepDriver->addKernelInstance<kernel::GraphemeClusterBreakKernel>(idb);
409            mGrepDriver->makeKernelCall(gcbK, {BasisBits, RequiredStreams}, {GCB_stream});
410        }
411
412        for(unsigned i = 0; i < nREs; ++i) {
413            std::vector<std::string> externalStreamNames;
414            std::vector<StreamSetBuffer *> icgrepInputSets = {BasisBits};
415            if (mGrepRecordBreak == GrepRecordBreakKind::Unicode) {
416                externalStreamNames.push_back("UTF8_LB");
417                icgrepInputSets.push_back(LineBreakStream);
418                externalStreamNames.push_back("UTF8_nonfinal");
419                icgrepInputSets.push_back(RequiredStreams);
420            }
421            std::set<re::Name *> UnicodeProperties;
422            if (PropertyKernels) {
423                re::gatherUnicodeProperties(mREs[i], UnicodeProperties);
424                for (auto p : UnicodeProperties) {
425                    auto name = p->getFullName();
426                    auto f = propertyStream.find(name);
427                    if (f == propertyStream.end()) report_fatal_error(name + " not found\n");
428                    externalStreamNames.push_back(name);
429                    icgrepInputSets.push_back(f->second);
430                }
431            }
432            if (hasGCB[i]) {
433                externalStreamNames.push_back("\\b{g}");
434                icgrepInputSets.push_back(GCB_stream);
435            }
436            if (CC_Multiplexing) {
437                const auto UnicodeSets = re::collectCCs(mREs[i], &cc::Unicode, std::set<re::Name *>({re::makeZeroWidth("\\b{g}")}));
438                StreamSetBuffer * const MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
439                if (UnicodeSets.size() <= 1) {
440                    kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, mREs[i], externalStreamNames);
441                    mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
442                    MatchResultsBufs[i] = MatchResults;
443                } else {
444                    mpx = make_unique<MultiplexedAlphabet>("mpx", UnicodeSets);
445                    mREs[i] = transformCCs(mpx.get(), mREs[i]);
446                    std::vector<re::CC *> mpx_basis = mpx->getMultiplexedCCs();
447                    auto numOfCharacterClasses = mpx_basis.size();
448                    StreamSetBuffer * CharClasses = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), baseBufferSize);
449                    kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis));
450                    mGrepDriver->makeKernelCall(ccK, {BasisBits}, {CharClasses});
451    //                kernel::Kernel * ccK = mGrepDriver->addKernelInstance<kernel::CharClassesKernel>(idb, std::move(mpx_basis), true);
452    //                mGrepDriver->makeKernelCall(ccK, {ByteStream}, {CharClasses});
453                    kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, mREs[i], externalStreamNames, std::vector<cc::Alphabet *>{mpx.get()});
454                    icgrepInputSets.push_back(CharClasses);
455                    mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
456                    MatchResultsBufs[i] = MatchResults;
457                }
458            } else {
459                StreamSetBuffer * MatchResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
460                kernel::Kernel * icgrepK = mGrepDriver->addKernelInstance<kernel::ICGrepKernel>(idb, mREs[i], externalStreamNames);
461                mGrepDriver->makeKernelCall(icgrepK, icgrepInputSets, {MatchResults});
462                MatchResultsBufs[i] = MatchResults;
463            }
464        }
465    }
466
467    StreamSetBuffer * MergedResults = MatchResultsBufs[0];
468    if (mREs.size() > 1) {
469        MergedResults = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
470        kernel::Kernel * streamsMergeK = mGrepDriver->addKernelInstance<kernel::StreamsMerge>(idb, 1, mREs.size());
471        mGrepDriver->makeKernelCall(streamsMergeK, MatchResultsBufs, {MergedResults});
472    }
473    StreamSetBuffer * Matches = MergedResults;
474    if (mMoveMatchesToEOL) {
475        StreamSetBuffer * OriginalMatches = Matches;
476        kernel::Kernel * matchedLinesK = mGrepDriver->addKernelInstance<kernel::MatchedLinesKernel>(idb);
477        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
478        mGrepDriver->makeKernelCall(matchedLinesK, {OriginalMatches, LineBreakStream}, {Matches});
479    }
480    if (mInvertMatches) {
481        kernel::Kernel * invertK = mGrepDriver->addKernelInstance<kernel::InvertMatchesKernel>(idb);
482        StreamSetBuffer * OriginalMatches = Matches;
483        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
484        mGrepDriver->makeKernelCall(invertK, {OriginalMatches, LineBreakStream}, {Matches});
485    }
486    if (mMaxCount > 0) {
487        kernel::Kernel * untilK = mGrepDriver->addKernelInstance<kernel::UntilNkernel>(idb);
488        untilK->setInitialArguments({idb->getSize(mMaxCount)});
489        StreamSetBuffer * const AllMatches = Matches;
490        Matches = mGrepDriver->addBuffer<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), baseBufferSize);
491        mGrepDriver->makeKernelCall(untilK, {AllMatches}, {Matches});
492    }
493
494    return std::pair<StreamSetBuffer *, StreamSetBuffer *>(LineBreakStream, Matches);
495}
496
497// The QuietMode, MatchOnly and CountOnly engines share a common code generation main function,
498// which returns a count of the matches found (possibly subject to a MaxCount).
499//
500
501void GrepEngine::grepCodeGen() {
502
503    assert (mGrepDriver == nullptr);
504    mGrepDriver = new ParabixDriver("engine");
505    auto & idb = mGrepDriver->getBuilder();
506    Module * M = idb->getModule();
507
508    const unsigned encodingBits = 8;
509
510    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getInt64Ty(), idb->getInt8Ty(), idb->getInt32Ty(), nullptr));
511    mainFunc->setCallingConv(CallingConv::C);
512    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
513    auto args = mainFunc->arg_begin();
514
515    Value * const useMMap = &*(args++);
516    useMMap->setName("useMMap");
517    Value * const fileDescriptor = &*(args++);
518    fileDescriptor->setName("fileDescriptor");
519
520    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits));
521    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance<kernel::FDSourceKernel>(idb);
522    sourceK->setInitialArguments({useMMap, fileDescriptor});
523    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
524
525    StreamSetBuffer * LineBreakStream;
526    StreamSetBuffer * Matches;
527    std::tie(LineBreakStream, Matches) = grepPipeline(ByteStream);
528
529    kernel::Kernel * matchCountK = mGrepDriver->addKernelInstance<kernel::PopcountKernel>(idb);
530    mGrepDriver->makeKernelCall(matchCountK, {Matches}, {});
531    mGrepDriver->generatePipelineIR();
532    idb->setKernel(matchCountK);
533    Value * matchedLineCount = idb->getAccumulator("countResult");
534    matchedLineCount = idb->CreateZExt(matchedLineCount, idb->getInt64Ty());
535    mGrepDriver->deallocateBuffers();
536    idb->CreateRet(matchedLineCount);
537    mGrepDriver->finalizeObject();
538}
539
540
541//
542//  Default Report Match:  lines are emitted with whatever line terminators are found in the
543//  input.  However, if the final line is not terminated, a new line is appended.
544//
545void EmitMatch::accumulate_match (const size_t lineNum, char * line_start, char * line_end) {
546    mResultStr << mLinePrefix;
547    if (mShowLineNumbers) {
548        // Internally line numbers are counted from 0.  For display, adjust
549        // the line number so that lines are numbered from 1.
550        if (mInitialTab) {
551            mResultStr << lineNum+1 << "\t:";
552        }
553        else {
554            mResultStr << lineNum+1 << ":";
555        }
556    }
557    size_t bytes = line_end - line_start + 1;
558    mResultStr.write(line_start, bytes);
559    mLineCount++;
560    unsigned last_byte = *line_end;
561    mTerminated = (last_byte >= 0x0A) && (last_byte <= 0x0D);
562    if (LLVM_UNLIKELY(!mTerminated)) {
563        if (last_byte == 0x85) {  //  Possible NEL terminator.
564            mTerminated = (bytes >= 2) && (static_cast<unsigned>(line_end[-1]) == 0xC2);
565        }
566        else {
567            // Possible LS or PS terminators.
568            mTerminated = (bytes >= 3) && (static_cast<unsigned>(line_end[-2]) == 0xE2)
569                                       && (static_cast<unsigned>(line_end[-1]) == 0x80)
570                                       && ((last_byte == 0xA8) || (last_byte == 0xA9));
571        }
572    }
573}
574
575void EmitMatch::finalize_match(char * buffer_end) {
576    if (!mTerminated) mResultStr << "\n";
577}
578
579void EmitMatchesEngine::grepCodeGen() {
580    assert (mGrepDriver == nullptr);
581    mGrepDriver = new ParabixDriver("engine");
582    auto & idb = mGrepDriver->getBuilder();
583    Module * M = idb->getModule();
584
585    const unsigned encodingBits = 8;
586
587    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getInt64Ty(), idb->getInt8Ty(), idb->getInt32Ty(), idb->getIntAddrTy(), nullptr));
588    mainFunc->setCallingConv(CallingConv::C);
589    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
590    auto args = mainFunc->arg_begin();
591
592    Value * const useMMap = &*(args++);
593    useMMap->setName("useMMap");
594    Value * const fileDescriptor = &*(args++);
595    fileDescriptor->setName("fileDescriptor");
596    Value * match_accumulator = &*(args++);
597    match_accumulator->setName("match_accumulator");
598
599    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits));
600    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance<kernel::FDSourceKernel>(idb);
601    sourceK->setInitialArguments({useMMap, fileDescriptor});
602    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
603
604    StreamSetBuffer * LineBreakStream;
605    StreamSetBuffer * Matches;
606    std::tie(LineBreakStream, Matches) = grepPipeline(ByteStream);
607
608    kernel::Kernel * scanMatchK = mGrepDriver->addKernelInstance<kernel::ScanMatchKernel>(idb);
609    scanMatchK->setInitialArguments({match_accumulator});
610    mGrepDriver->makeKernelCall(scanMatchK, {Matches, LineBreakStream, ByteStream}, {});
611    mGrepDriver->LinkFunction(*scanMatchK, "accumulate_match_wrapper", &accumulate_match_wrapper);
612    mGrepDriver->LinkFunction(*scanMatchK, "finalize_match_wrapper", &finalize_match_wrapper);
613
614    mGrepDriver->generatePipelineIR();
615    mGrepDriver->deallocateBuffers();
616    idb->CreateRet(idb->getInt64(0));
617    mGrepDriver->finalizeObject();
618}
619
620
621//
622//  The doGrep methods apply a GrepEngine to a single file, processing the results
623//  differently based on the engine type.
624
625uint64_t GrepEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
626    typedef uint64_t (*GrepFunctionType)(bool useMMap, int32_t fileDescriptor);
627    using namespace boost::filesystem;
628    path p(fileName);
629    bool useMMap = mPreferMMap;
630    if (p == "-") useMMap = false;
631    if (!is_regular_file(p)) useMMap = false;
632
633    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
634
635    int32_t fileDescriptor = openFile(fileName, mResultStrs[fileIdx]);
636    if (fileDescriptor == -1) return 0;
637
638    uint64_t grepResult = f(useMMap, fileDescriptor);
639    close(fileDescriptor);
640    return grepResult;
641}
642
643uint64_t CountOnlyEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
644    uint64_t grepResult = GrepEngine::doGrep(fileName, fileIdx);
645    if (mShowFileNames) mResultStrs[fileIdx] << linePrefix(fileName);
646    mResultStrs[fileIdx] << grepResult << "\n";
647    return grepResult;
648}
649
650std::string GrepEngine::linePrefix(std::string fileName) {
651    if (!mShowFileNames) return "";
652    if (fileName == "-") {
653        return mStdinLabel + mFileSuffix;
654    }
655    else {
656        return fileName + mFileSuffix;
657    }
658}
659
660uint64_t MatchOnlyEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
661    uint64_t grepResult = GrepEngine::doGrep(fileName, fileIdx);
662    if (grepResult == mRequiredCount) {
663       mResultStrs[fileIdx] << linePrefix(fileName);
664    }
665    return grepResult;
666}
667
668uint64_t EmitMatchesEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
669    typedef uint64_t (*GrepFunctionType)(bool useMMap, int32_t fileDescriptor, intptr_t accum_addr);
670    using namespace boost::filesystem;
671    path p(fileName);
672    bool useMMap = mPreferMMap;
673    if (p == "-") useMMap = false;
674    if (!is_regular_file(p)) useMMap = false;
675    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
676    int32_t fileDescriptor = openFile(fileName, mResultStrs[fileIdx]);
677    if (fileDescriptor == -1) return 0;
678    EmitMatch accum(linePrefix(fileName), mShowLineNumbers, mInitialTab, mResultStrs[fileIdx]);
679    f(useMMap, fileDescriptor, reinterpret_cast<intptr_t>(&accum));
680    close(fileDescriptor);
681    if (accum.mLineCount > 0) grepMatchFound = true;
682    return accum.mLineCount;
683}
684
685// Open a file and return its file desciptor.
686int32_t GrepEngine::openFile(const std::string & fileName, std::ostringstream & msgstrm) {
687    if (fileName == "-") {
688        return STDIN_FILENO;
689    }
690    else {
691        struct stat sb;
692        int32_t fileDescriptor = open(fileName.c_str(), O_RDONLY);
693        if (LLVM_UNLIKELY(fileDescriptor == -1)) {
694            if (!mSuppressFileMessages) {
695                if (errno == EACCES) {
696                    msgstrm << "icgrep: " << fileName << ": Permission denied.\n";
697                }
698                else if (errno == ENOENT) {
699                    msgstrm << "icgrep: " << fileName << ": No such file.\n";
700                }
701                else {
702                    msgstrm << "icgrep: " << fileName << ": Failed.\n";
703                }
704            }
705            return fileDescriptor;
706        }
707        if (stat(fileName.c_str(), &sb) == 0 && S_ISDIR(sb.st_mode)) {
708            if (!mSuppressFileMessages) {
709                msgstrm << "icgrep: " << fileName << ": Is a directory.\n";
710            }
711            close(fileDescriptor);
712            return -1;
713        }
714        return fileDescriptor;
715    }
716}
717
718// The process of searching a group of files may use a sequential or a task
719// parallel approach.
720
721void * DoGrepThreadFunction(void *args) {
722    return reinterpret_cast<GrepEngine *>(args)->DoGrepThreadMethod();
723}
724
725bool GrepEngine::searchAllFiles() {
726    const unsigned numOfThreads = std::min(static_cast<unsigned>(Threads), static_cast<unsigned>(inputFiles.size())); 
727    std::vector<pthread_t> threads(numOfThreads);
728
729    for(unsigned long i = 1; i < numOfThreads; ++i) {
730        const int rc = pthread_create(&threads[i], nullptr, DoGrepThreadFunction, (void *)this);
731        if (rc) {
732            llvm::report_fatal_error("Failed to create thread: code " + std::to_string(rc));
733        }
734    }
735    // Main thread also does the work;
736    DoGrepThreadMethod();
737    for(unsigned i = 1; i < numOfThreads; ++i) {
738        void * status = nullptr;
739        const int rc = pthread_join(threads[i], &status);
740        if (rc) {
741            llvm::report_fatal_error("Failed to join thread: code " + std::to_string(rc));
742        }
743    }
744    return grepMatchFound;
745}
746
747
748// DoGrep thread function.
749void * GrepEngine::DoGrepThreadMethod() {
750
751    unsigned fileIdx = mNextFileToGrep++;
752    while (fileIdx < inputFiles.size()) {
753        if (codegen::DebugOptionIsSet(codegen::TraceCounts)) {
754            errs() << "Tracing " << inputFiles[fileIdx] << "\n";
755        }
756        const auto grepResult = doGrep(inputFiles[fileIdx], fileIdx);
757        mFileStatus[fileIdx] = FileStatus::GrepComplete;
758        if (grepResult > 0) {
759            grepMatchFound = true;
760        }
761        if ((mEngineKind == EngineKind::QuietMode) && grepMatchFound) {
762            if (pthread_self() != mEngineThread) {
763                pthread_exit(nullptr);
764            }
765            return nullptr;
766        }
767        fileIdx = mNextFileToGrep++;
768    }
769
770    unsigned printIdx = mNextFileToPrint++;
771    while (printIdx < inputFiles.size()) {
772        const bool readyToPrint = ((printIdx == 0) || (mFileStatus[printIdx - 1] == FileStatus::PrintComplete)) && (mFileStatus[printIdx] == FileStatus::GrepComplete);
773        if (readyToPrint) {
774            const auto output = mResultStrs[printIdx].str();
775            if (!output.empty()) {
776                llvm::outs() << output;
777            }
778            mFileStatus[printIdx] = FileStatus::PrintComplete;
779            printIdx = mNextFileToPrint++;
780        } else {
781            mGrepDriver->performIncrementalCacheCleanupStep();
782        }
783        sched_yield();
784    }
785
786    if (pthread_self() != mEngineThread) {
787        pthread_exit(nullptr);
788    } else {
789        // Always perform one final cache cleanup step.
790        mGrepDriver->performIncrementalCacheCleanupStep();
791    }
792    return nullptr;
793}
794
795}
Note: See TracBrowser for help on using the repository browser.