source: icGREP/icgrep-devel/icgrep/grep_engine.cpp @ 5418

Last change on this file since 5418 was 5418, checked in by nmedfort, 2 years ago

Removed non-functional CUDA code from icgrep and consolidated grep and multigrep mode into a single function; allowed segment parallel pipeline to utilize process as its initial thread; modified MMapSourceKernel to map and perform mmap directly and advise the OS to drop consumed data streams.

File size: 14.8 KB
Line 
1/*
2 *  Copyright (c) 2016 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "grep_engine.h"
8#include <llvm/IR/Module.h>
9#include <llvm/ExecutionEngine/MCJIT.h>
10#include <llvm/IR/Verifier.h>
11#include <llvm/Support/CommandLine.h>
12#include <boost/filesystem.hpp>
13#include <IR_Gen/idisa_builder.h>
14#include <IR_Gen/idisa_target.h>
15#include <UCD/UnicodeNameData.h>
16#include <UCD/resolve_properties.h>
17#include <kernels/cc_kernel.h>
18#include <kernels/grep_kernel.h>
19#include <kernels/linebreak_kernel.h>
20#include <kernels/streams_merge.h>
21#include <kernels/match_count.h>
22#include <kernels/mmap_kernel.h>
23#include <kernels/s2p_kernel.h>
24#include <kernels/scanmatchgen.h>
25#include <kernels/streamset.h>
26#include <kernels/stdin_kernel.h>
27#include <pablo/pablo_kernel.h>
28#include <re/re_cc.h>
29#include <re/re_toolchain.h>
30#include <kernels/toolchain.h>
31#include <iostream>
32#include <sstream>
33#include <cc/multiplex_CCs.h>
34#include <llvm/Support/raw_ostream.h>
35#include <util/aligned_allocator.h>
36#include <sys/stat.h>
37#include <fcntl.h>
38
39using namespace parabix;
40using namespace llvm;
41
// Command-line option category under which the output-related flags below are registered.
42static cl::OptionCategory bGrepOutputOptions("Output Options",
43                                             "These options control the output.");
// -s : suppress messages for file errors (defaults to false).
44static cl::opt<bool> SilenceFileErrors("s", cl::desc("Suppress messages for file errors."), cl::init(false),  cl::cat(bGrepOutputOptions));
45
// -q : suppress normal output; only the return code is set (defaults to false).
46static cl::opt<bool> SuppressOutput("q", cl::desc("Suppress normal output; set return code only."), cl::init(false),  cl::cat(bGrepOutputOptions));
47
// -normalize-line-breaks : when set, wrapped_report_match strips the original line terminator
// from each reported line and appends '\n' instead (defaults to false).
48static cl::opt<bool> NormalizeLineBreaks("normalize-line-breaks", cl::desc("Normalize line breaks to std::endl."), cl::init(false),  cl::cat(bGrepOutputOptions));
49
// -H / -with-filename : prefix each reported line with its file name.
// initFileResult also forces this on when more than one input file is given.
50static cl::opt<bool> ShowFileNames("H", cl::desc("Show the file name with each matching line."), cl::cat(bGrepOutputOptions));
51static cl::alias ShowFileNamesLong("with-filename", cl::desc("Alias for -H"), cl::aliasopt(ShowFileNames));
52
// -n / -line-number : prefix each reported line with its line number.
53static cl::opt<bool> ShowLineNumbers("n", cl::desc("Show the line number with each matching line."), cl::cat(bGrepOutputOptions));
54static cl::alias ShowLineNumbersLong("line-number", cl::desc("Alias for -n"), cl::aliasopt(ShowLineNumbers));
55
// Code point set filled by insert_codepoints; replaced with a fresh set on every
// call to GrepEngine::grepCodepoints, which also returns it.
56static re::CC * parsedCodePointSet = nullptr;
57
// Strings collected by insert_property_values; cleared and returned by
// GrepEngine::grepPropertyValues.
58static std::vector<std::string> parsedPropertyValues;
59
60uint64_t GrepEngine::doGrep(const std::string & fileName, const int fileIdx) const {
61    const int fd = open(fileName.c_str(), O_RDONLY);
62    if (LLVM_UNLIKELY(fd == -1)) {
63        return 0;
64    }
65    const auto result = doGrep(fd, fileIdx);
66    close(fd);
67    return result;
68}
69
70uint64_t GrepEngine::doGrep(const uint32_t fileDescriptor, const int fileIdx) const {
71    assert (mGrepFunction);
72    typedef uint64_t (*GrepFunctionType)(size_t fileDescriptor, const int fileIdx);
73    return reinterpret_cast<GrepFunctionType>(mGrepFunction)(fileDescriptor, fileIdx);
74}
75
76void GrepEngine::doGrep(const char * buffer, const uint64_t length, const int fileIdx) const {
77    assert (mGrepFunction);
78    typedef uint64_t (*GrepFunctionType)(const char * buffer, const uint64_t length, const int fileIdx);
79    reinterpret_cast<GrepFunctionType>(mGrepFunction)(buffer, length, fileIdx);
80}
81
// Per-file counter array; allocated with one zeroed entry per input file by initFileResult.
82static int * total_count;
// Per-file output buffers; allocated by initFileResult, appended to by
// wrapped_report_match and written to std::cout by PrintResult.
83static std::stringstream * resultStrs = nullptr;
// Names of the input files, indexed by file index; set by initFileResult.
84static std::vector<std::string> inputFiles;
85
86void initFileResult(std::vector<std::string> filenames){
87    const int n = filenames.size();
88    if (n > 1) {
89        ShowFileNames = true;
90    }
91    inputFiles = filenames;
92    resultStrs = new std::stringstream[n];
93    total_count = new int[n];
94    for (unsigned i = 0; i < inputFiles.size(); ++i){
95        total_count[i] = 0;
96    }
97
98}
99
100template<typename CodeUnit>
101void wrapped_report_match(const size_t lineNum, size_t line_start, size_t line_end, const CodeUnit * const buffer, const size_t filesize, const size_t fileIdx) {
102
103//    errs().write_hex((size_t)buffer) << " : " << lineNum << " (" << line_start << ", " << line_end << ", " << filesize << ")\n";
104
105    assert (buffer);
106    assert (line_start <= line_end);
107    assert (line_end <= filesize);
108
109    if (ShowFileNames) {
110        resultStrs[fileIdx] << inputFiles[fileIdx] << ':';
111    }
112    if (ShowLineNumbers) {
113        resultStrs[fileIdx] << lineNum << ":";
114    }
115
116    // If the line "starts" on the LF of a CRLF, it is actually the end of the last line.
117    if ((buffer[line_start] == 0xA) && (line_start != line_end)) {
118        ++line_start;
119    }
120
121    if (LLVM_UNLIKELY(line_end == filesize)) {
122        // The match position is at end-of-file.   We have a final unterminated line.
123        resultStrs[fileIdx].write((char *)&buffer[line_start], (line_end - line_start) * sizeof(CodeUnit));
124        if (NormalizeLineBreaks) {
125            resultStrs[fileIdx] << '\n';  // terminate it
126        }
127    } else {
128        const auto end_byte = buffer[line_end];
129        if (NormalizeLineBreaks) {
130            if (LLVM_UNLIKELY(end_byte == 0x85)) {
131                // Line terminated with NEL, on the second byte.  Back up 1.
132                line_end -= 1;
133            } else if (LLVM_UNLIKELY(end_byte > 0xD)) {
134                // Line terminated with PS or LS, on the third byte.  Back up 2.
135                line_end -= 2;
136            }
137            resultStrs[fileIdx].write((char *)&buffer[line_start], (line_end - line_start) * sizeof(CodeUnit));
138            resultStrs[fileIdx] << '\n';
139        } else {
140            if (end_byte == 0x0D) {
141                // Check for line_end on first byte of CRLF; we don't want to access past the end of buffer.
142                if ((line_end + 1) < filesize) {
143                    if (buffer[line_end + 1] == 0x0A) {
144                        // Found CRLF; preserve both bytes.
145                        ++line_end;
146                    }
147                }
148            }
149            resultStrs[fileIdx].write((char *)&buffer[line_start], (line_end - line_start + 1) * sizeof(CodeUnit));
150        }
151    }
152}
153
154void PrintResult(bool CountOnly, std::vector<size_t> & total_CountOnly){
155    if (CountOnly) {
156        if (!ShowFileNames) {
157            for (unsigned i = 0; i < inputFiles.size(); ++i){
158                std::cout << total_CountOnly[i] << std::endl;
159            }
160        } else {
161            for (unsigned i = 0; i < inputFiles.size(); ++i){
162                std::cout << inputFiles[i] << ':' << total_CountOnly[i] << std::endl;
163            };
164        }
165    } else {
166        for (unsigned i = 0; i < inputFiles.size(); ++i){
167            std::cout << resultStrs[i].str();
168        }
169    }
170}
171
172void insert_codepoints(const size_t lineNum, const size_t line_start, const size_t line_end, const char * const buffer) {
173    assert (buffer);
174    assert (line_start <= line_end);
175    re::codepoint_t c = 0;
176    size_t line_pos = line_start;
177    while (isxdigit(buffer[line_pos])) {
178        assert (line_pos < line_end);
179        if (isdigit(buffer[line_pos])) {
180            c = (c << 4) | (buffer[line_pos] - '0');
181        }
182        else {
183            c = (c << 4) | (tolower(buffer[line_pos]) - 'a' + 10);
184        }
185        line_pos++;
186    }
187    assert(((line_pos - line_start) >= 4) && ((line_pos - line_start) <= 6)); // UCD format 4 to 6 hex digits.
188    parsedCodePointSet->insert(c);
189}
190
191void insert_property_values(size_t lineNum, size_t line_start, size_t line_end, const char * buffer) {
192    assert (line_start <= line_end);
193    parsedPropertyValues.emplace_back(buffer + line_start, buffer + line_end);
194}
195
196inline void linkGrepFunction(ParabixDriver & pxDriver, const GrepType grepType, const bool UTF_16, kernel::KernelBuilder & kernel) {
197    switch (grepType) {
198        case GrepType::Normal:
199            if (UTF_16) {
200                pxDriver.addExternalLink(kernel, "matcher", &wrapped_report_match<uint16_t>);
201            } else {
202                pxDriver.addExternalLink(kernel, "matcher", &wrapped_report_match<uint8_t>);
203            }
204            break;
205        case GrepType::NameExpression:
206            pxDriver.addExternalLink(kernel, "matcher", &insert_codepoints);
207            break;
208        case GrepType::PropertyValue:
209            pxDriver.addExternalLink(kernel, "matcher", &insert_property_values);
210            break;
211    }
212}
213
// Builds the grep pipeline for the given regular expressions as LLVM IR in a new
// module named moduleName + ":icgrep", finalizes it through a ParabixDriver and
// stores the driver's pointer to the generated "Main" in mGrepFunction.
//
// Pipeline, in the order it is assembled below:
//   source kernel -> S2P -> line break -> one ICgrep kernel per RE
//   -> StreamsMerge (only when there is more than one RE)
//   -> InvertMatches (only when the re::InvertMatches option is set)
//   -> MatchCount (CountOnly) or ScanMatch (otherwise).
//
// moduleName : prefix of the LLVM module name.
// REs        : regular expressions; each gets its own ICgrep kernel.
// CountOnly  : true  -> "Main" returns the MatchCount kernel's "matchedLineCount" scalar;
//              false -> matches go to the "matcher" callback and "Main" returns 0.
// UTF_16     : selects 16 instead of 8 encoding bits for the line-break and scan
//              kernels and the 16-bit variant of the Normal match callback.
// grepSource : Internal -> Main(i8 * buffer, i64 length, i64 fileIdx);
//              otherwise -> Main(i32 fileDescriptor, i64 fileIdx), reading via the
//              mmap source kernel for File and the stdin kernel for anything else.
// grepType   : selects the callback linked as "matcher" (see linkGrepFunction).
//
// NOTE(review): MatchResultsBufs[0] is read unconditionally, so REs is presumably
// expected to be non-empty — confirm with callers.
214void GrepEngine::grepCodeGen(std::string moduleName, std::vector<re::RE *> REs, const bool CountOnly, const bool UTF_16, GrepSource grepSource, const GrepType grepType) {
215
216    Module * M = new Module(moduleName + ":icgrep", getGlobalContext());;
217    IDISA::IDISA_Builder * iBuilder = IDISA::GetIDISA_Builder(M);;
218    ParabixDriver pxDriver(iBuilder);
219
    // Buffer sizing parameters taken from the codegen options.
220    const unsigned segmentSize = codegen::SegmentSize;
221    const unsigned bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
222    const unsigned encodingBits = UTF_16 ? 16 : 8;
223
224    Type * const int64Ty = iBuilder->getInt64Ty();
225
    // Filled in by whichever source branch is taken below.
226    Function * mainFunc = nullptr;
227    Value * fileIdx = nullptr;
228    StreamSetBuffer * ByteStream = nullptr;
229    kernel::KernelBuilder * sourceK = nullptr;
230
231    if (grepSource == GrepSource::Internal) {
232
        // In-memory source: i64 Main(i8 * buffer, i64 length, i64 fileIdx).
233        mainFunc = cast<Function>(M->getOrInsertFunction("Main", int64Ty, iBuilder->getInt8PtrTy(), int64Ty, int64Ty, nullptr));
234        mainFunc->setCallingConv(CallingConv::C);
235        iBuilder->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
236        Function::arg_iterator args = mainFunc->arg_begin();
237
238        Value * const buffer = &*(args++);
239        buffer->setName("buffer");
240        Value * const length = &*(args++);
241        length->setName("length");
242        fileIdx = &*(args++);
243        fileIdx->setName("fileIdx");
244
245        ByteStream = pxDriver.addBuffer(make_unique<SourceFileBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8)));
246
247        sourceK = pxDriver.addKernelInstance(make_unique<kernel::FileSourceKernel>(iBuilder, iBuilder->getInt8PtrTy(), segmentSize));
248        sourceK->setInitialArguments({buffer, length});
249
250    } else {
251
        // Descriptor-based source: i64 Main(i32 fileDescriptor, i64 fileIdx).
252        mainFunc = cast<Function>(M->getOrInsertFunction("Main", int64Ty, iBuilder->getInt32Ty(), int64Ty, nullptr));
253        mainFunc->setCallingConv(CallingConv::C);
254        iBuilder->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
255        Function::arg_iterator args = mainFunc->arg_begin();
256
257        Value * const fileDescriptor = &*(args++);
258        fileDescriptor->setName("fileDescriptor");
259        fileIdx = &*(args++);
260        fileIdx->setName("fileIdx");
261
262        if (grepSource == GrepSource::File) {
263            ByteStream = pxDriver.addBuffer(make_unique<SourceFileBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8)));
264            sourceK = pxDriver.addKernelInstance(make_unique<kernel::MMapSourceKernel>(iBuilder, segmentSize));
265            sourceK->setInitialArguments({fileDescriptor});
266        } else { // if (grepSource == GrepSource::StdIn) {
            // The stdin kernel receives no initial arguments; fileDescriptor is not passed to it.
267            ByteStream = pxDriver.addBuffer(make_unique<ExtensibleBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), segmentSize));
268            sourceK = pxDriver.addKernelInstance(make_unique<kernel::StdInKernel>(iBuilder, segmentSize));
269        }
270    }
271
    // Source kernel produces ByteStream; S2P turns it into the 8 x 1-bit BasisBits set.
272    pxDriver.makeKernelCall(sourceK, {}, {ByteStream});
273    StreamSetBuffer * BasisBits = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(8, 1), segmentSize * bufferSegments));
274   
275    kernel::KernelBuilder * s2pk = pxDriver.addKernelInstance(make_unique<kernel::S2PKernel>(iBuilder));
276    pxDriver.makeKernelCall(s2pk, {ByteStream}, {BasisBits});
277   
    // Line-break kernel: BasisBits -> LineBreakStream.
278    kernel::KernelBuilder * linebreakK = pxDriver.addKernelInstance(make_unique<kernel::LineBreakKernelBuilder>(iBuilder, encodingBits));
279    StreamSetBuffer * LineBreakStream = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), segmentSize * bufferSegments));
280    pxDriver.makeKernelCall(linebreakK, {BasisBits}, {LineBreakStream});
281   
282    const auto n = REs.size();
283
284    std::vector<StreamSetBuffer *> MatchResultsBufs(n);
285
    // One ICgrep kernel and one match-result stream per regular expression.
286    for(unsigned i = 0; i < n; ++i){
287        StreamSetBuffer * MatchResults = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), segmentSize * bufferSegments));
288        kernel::KernelBuilder * icgrepK = pxDriver.addKernelInstance(make_unique<kernel::ICgrepKernelBuilder>(iBuilder, REs[i]));
289        pxDriver.makeKernelCall(icgrepK, {BasisBits, LineBreakStream}, {MatchResults});
290        MatchResultsBufs[i] = MatchResults;
291    }
    // With a single RE its result stream is used directly; otherwise all result
    // streams are fed through a StreamsMerge kernel into one stream.
292    StreamSetBuffer * MergedResults = MatchResultsBufs[0];
293    if (REs.size() > 1) {
294        MergedResults = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), segmentSize * bufferSegments));
295        kernel::KernelBuilder * streamsMergeK = pxDriver.addKernelInstance(make_unique<kernel::StreamsMerge>(iBuilder, 1, REs.size()));
296        pxDriver.makeKernelCall(streamsMergeK, MatchResultsBufs, {MergedResults});
297    }
298   
    // Optional inversion stage: MergedResults is replaced by the InvertMatches kernel's output.
299    if (AlgorithmOptionIsSet(re::InvertMatches)) {
300        kernel::KernelBuilder * invertK = pxDriver.addKernelInstance(make_unique<kernel::InvertMatchesKernel>(iBuilder));
301        StreamSetBuffer * OriginalMatches = MergedResults;
302        MergedResults = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), segmentSize * bufferSegments));
303        pxDriver.makeKernelCall(invertK, {OriginalMatches, LineBreakStream}, {MergedResults});
304    }
305    if (CountOnly) {
        // Counting mode: "Main" returns the MatchCount kernel's "matchedLineCount" scalar.
306        kernel::MatchCount matchCountK(iBuilder);
307        pxDriver.addKernelCall(matchCountK, {MergedResults}, {});
308        pxDriver.generatePipelineIR();
309        iBuilder->CreateRet(matchCountK.getScalarField("matchedLineCount"));
310        pxDriver.linkAndFinalize();
311    } else {
        // Reporting mode: the ScanMatch kernel is given fileIdx and the "matcher"
        // callback selected by grepType; "Main" returns a constant 0.
312        kernel::ScanMatchKernel scanMatchK(iBuilder, grepType, encodingBits);
313        scanMatchK.setInitialArguments({fileIdx});
314        pxDriver.addKernelCall(scanMatchK, {MergedResults, LineBreakStream, ByteStream}, {});
315        linkGrepFunction(pxDriver, grepType, UTF_16, scanMatchK);
316        pxDriver.generatePipelineIR();
317        iBuilder->CreateRet(iBuilder->getInt64(0));
318        pxDriver.linkAndFinalize();
319    }
320
    // Keep the entry point; the doGrep overloads cast it back to a callable type.
321    mGrepFunction = pxDriver.getPointerToMain();
322}
323
324re::CC * GrepEngine::grepCodepoints() {
325    parsedCodePointSet = re::makeCC();
326    char * mFileBuffer = getUnicodeNameDataPtr();
327    size_t mFileSize = getUnicodeNameDataSize();
328    doGrep(mFileBuffer, mFileSize, 0);
329    return parsedCodePointSet;
330}
331
332const std::vector<std::string> & GrepEngine::grepPropertyValues(const std::string& propertyName) {
333    enum { MaxSupportedVectorWidthInBytes = 32 };
334    AlignedAllocator<char, MaxSupportedVectorWidthInBytes> alloc;
335    parsedPropertyValues.clear();
336    const std::string & str = UCD::getPropertyValueGrepString(propertyName);
337    const auto n = str.length();
338    // NOTE: MaxSupportedVectorWidthInBytes of trailing 0s are needed to prevent the grep function from
339    // erroneously matching garbage data when loading the final partial block.
340    char * aligned = alloc.allocate(n + MaxSupportedVectorWidthInBytes, 0);
341    std::memcpy(aligned, str.data(), n);
342    std::memset(aligned + n, 0, MaxSupportedVectorWidthInBytes);
343    doGrep(aligned, n, 0);
344    alloc.deallocate(aligned, 0);
345    return parsedPropertyValues;
346}
347
348GrepEngine::GrepEngine()
349: mGrepFunction(nullptr) {
350
351}
Note: See TracBrowser for help on using the repository browser.