source: icGREP/icgrep-devel/icgrep/grep_engine.cpp @ 5440

Last change on this file since 5440 was 5440, checked in by nmedfort, 2 years ago

Large refactoring step. Removed IR generation code from Kernel (formerly KernelBuilder) and moved it into the new KernelBuilder class.

File size: 14.6 KB
Line 
1/*
2 *  Copyright (c) 2016 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "grep_engine.h"
8#include <llvm/IR/Module.h>
9//#include <llvm/ExecutionEngine/MCJIT.h>
10#include <llvm/IR/Verifier.h>
11#include <llvm/Support/CommandLine.h>
12#include <boost/filesystem.hpp>
13#include <UCD/UnicodeNameData.h>
14#include <UCD/resolve_properties.h>
15#include <kernels/cc_kernel.h>
16#include <kernels/grep_kernel.h>
17#include <kernels/linebreak_kernel.h>
18#include <kernels/streams_merge.h>
19#include <kernels/match_count.h>
20#include <kernels/source_kernel.h>
21#include <kernels/s2p_kernel.h>
22#include <kernels/scanmatchgen.h>
23#include <kernels/streamset.h>
24#include <kernels/kernel_builder.h>
25#include <pablo/pablo_kernel.h>
26#include <re/re_cc.h>
27#include <re/re_toolchain.h>
28#include <toolchain/toolchain.h>
29#include <iostream>
30#include <sstream>
31#include <cc/multiplex_CCs.h>
32#include <llvm/Support/raw_ostream.h>
33#include <util/aligned_allocator.h>
34#include <sys/stat.h>
35#include <fcntl.h>
36
37using namespace parabix;
38using namespace llvm;
39
40static cl::OptionCategory bGrepOutputOptions("Output Options",
41                                             "These options control the output.");
42static cl::opt<bool> SilenceFileErrors("s", cl::desc("Suppress messages for file errors."), cl::init(false),  cl::cat(bGrepOutputOptions));
43
44static cl::opt<bool> SuppressOutput("q", cl::desc("Suppress normal output; set return code only."), cl::init(false),  cl::cat(bGrepOutputOptions));
45
46static cl::opt<bool> NormalizeLineBreaks("normalize-line-breaks", cl::desc("Normalize line breaks to std::endl."), cl::init(false),  cl::cat(bGrepOutputOptions));
47
48static cl::opt<bool> ShowFileNames("H", cl::desc("Show the file name with each matching line."), cl::cat(bGrepOutputOptions));
49static cl::alias ShowFileNamesLong("with-filename", cl::desc("Alias for -H"), cl::aliasopt(ShowFileNames));
50
51static cl::opt<bool> ShowLineNumbers("n", cl::desc("Show the line number with each matching line."), cl::cat(bGrepOutputOptions));
52static cl::alias ShowLineNumbersLong("line-number", cl::desc("Alias for -n"), cl::aliasopt(ShowLineNumbers));
53
54static re::CC * parsedCodePointSet = nullptr;
55
56static std::vector<std::string> parsedPropertyValues;
57
58uint64_t GrepEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) const {
59    const int32_t fd = open(fileName.c_str(), O_RDONLY);
60    if (LLVM_UNLIKELY(fd == -1)) {
61        return 0;
62    }
63    const auto result = doGrep(fd, fileIdx);
64    close(fd);
65    return result;
66}
67
68uint64_t GrepEngine::doGrep(const int32_t fileDescriptor, const uint32_t fileIdx) const {
69    assert (mGrepFunction);
70    typedef uint64_t (*GrepFunctionType)(int32_t fileDescriptor, const uint32_t fileIdx);
71    return reinterpret_cast<GrepFunctionType>(mGrepFunction)(fileDescriptor, fileIdx);
72}
73
74void GrepEngine::doGrep(const char * buffer, const uint64_t length, const uint32_t fileIdx) const {
75    assert (mGrepFunction);
76    typedef uint64_t (*GrepFunctionType)(const char * buffer, const uint64_t length, const uint32_t fileIdx);
77    reinterpret_cast<GrepFunctionType>(mGrepFunction)(buffer, length, fileIdx);
78}
79
80static int * total_count;
81static std::stringstream * resultStrs = nullptr;
82static std::vector<std::string> inputFiles;
83
84void initFileResult(std::vector<std::string> filenames){
85    const int n = filenames.size();
86    if (n > 1) {
87        ShowFileNames = true;
88    }
89    inputFiles = filenames;
90    resultStrs = new std::stringstream[n];
91    total_count = new int[n];
92    for (unsigned i = 0; i < inputFiles.size(); ++i){
93        total_count[i] = 0;
94    }
95
96}
97
98template<typename CodeUnit>
99void wrapped_report_match(const size_t lineNum, size_t line_start, size_t line_end, const CodeUnit * const buffer, const size_t filesize, const size_t fileIdx) {
100
101//    errs().write_hex((size_t)buffer) << " : " << lineNum << " (" << line_start << ", " << line_end << ", " << filesize << ")\n";
102
103    assert (buffer);
104    assert (line_start <= line_end);
105    assert (line_end <= filesize);
106
107    if (ShowFileNames) {
108        resultStrs[fileIdx] << inputFiles[fileIdx] << ':';
109    }
110    if (ShowLineNumbers) {
111        resultStrs[fileIdx] << lineNum << ":";
112    }
113
114    // If the line "starts" on the LF of a CRLF, it is actually the end of the last line.
115    if ((buffer[line_start] == 0xA) && (line_start != line_end)) {
116        ++line_start;
117    }
118
119    if (LLVM_UNLIKELY(line_end == filesize)) {
120        // The match position is at end-of-file.   We have a final unterminated line.
121        resultStrs[fileIdx].write((char *)&buffer[line_start], (line_end - line_start) * sizeof(CodeUnit));
122        if (NormalizeLineBreaks) {
123            resultStrs[fileIdx] << '\n';  // terminate it
124        }
125    } else {
126        const auto end_byte = buffer[line_end];
127        if (NormalizeLineBreaks) {
128            if (LLVM_UNLIKELY(end_byte == 0x85)) {
129                // Line terminated with NEL, on the second byte.  Back up 1.
130                line_end -= 1;
131            } else if (LLVM_UNLIKELY(end_byte > 0xD)) {
132                // Line terminated with PS or LS, on the third byte.  Back up 2.
133                line_end -= 2;
134            }
135            resultStrs[fileIdx].write((char *)&buffer[line_start], (line_end - line_start) * sizeof(CodeUnit));
136            resultStrs[fileIdx] << '\n';
137        } else {
138            if (end_byte == 0x0D) {
139                // Check for line_end on first byte of CRLF; we don't want to access past the end of buffer.
140                if ((line_end + 1) < filesize) {
141                    if (buffer[line_end + 1] == 0x0A) {
142                        // Found CRLF; preserve both bytes.
143                        ++line_end;
144                    }
145                }
146            }
147            resultStrs[fileIdx].write((char *)&buffer[line_start], (line_end - line_start + 1) * sizeof(CodeUnit));
148        }
149    }
150}
151
152void PrintResult(bool CountOnly, std::vector<size_t> & total_CountOnly){
153    if (CountOnly) {
154        if (!ShowFileNames) {
155            for (unsigned i = 0; i < inputFiles.size(); ++i){
156                std::cout << total_CountOnly[i] << std::endl;
157            }
158        } else {
159            for (unsigned i = 0; i < inputFiles.size(); ++i){
160                std::cout << inputFiles[i] << ':' << total_CountOnly[i] << std::endl;
161            };
162        }
163    } else {
164        for (unsigned i = 0; i < inputFiles.size(); ++i){
165            std::cout << resultStrs[i].str();
166        }
167    }
168}
169
170void insert_codepoints(const size_t lineNum, const size_t line_start, const size_t line_end, const char * const buffer) {
171    assert (buffer);
172    assert (line_start <= line_end);
173    re::codepoint_t c = 0;
174    size_t line_pos = line_start;
175    while (isxdigit(buffer[line_pos])) {
176        assert (line_pos < line_end);
177        if (isdigit(buffer[line_pos])) {
178            c = (c << 4) | (buffer[line_pos] - '0');
179        }
180        else {
181            c = (c << 4) | (tolower(buffer[line_pos]) - 'a' + 10);
182        }
183        line_pos++;
184    }
185    assert(((line_pos - line_start) >= 4) && ((line_pos - line_start) <= 6)); // UCD format 4 to 6 hex digits.
186    parsedCodePointSet->insert(c);
187}
188
189void insert_property_values(size_t lineNum, size_t line_start, size_t line_end, const char * buffer) {
190    assert (line_start <= line_end);
191    parsedPropertyValues.emplace_back(buffer + line_start, buffer + line_end);
192}
193
194void GrepEngine::grepCodeGen(const std::string & moduleName, std::vector<re::RE *> REs, const bool CountOnly, const bool UTF_16, GrepSource grepSource, const GrepType grepType) {
195
196    ParabixDriver pxDriver(moduleName + ":icgrep");
197    auto & idb = pxDriver.getBuilder();
198    Module * M = idb->getModule();
199
200    const unsigned segmentSize = codegen::SegmentSize;
201    const unsigned bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
202    const unsigned encodingBits = UTF_16 ? 16 : 8;
203
204    Type * const int64Ty = idb->getInt64Ty();
205    Type * const int32Ty = idb->getInt32Ty();
206
207    Function * mainFunc = nullptr;
208    Value * fileIdx = nullptr;
209    StreamSetBuffer * ByteStream = nullptr;
210    kernel::Kernel * sourceK = nullptr;
211
212    if (grepSource == GrepSource::Internal) {
213
214        mainFunc = cast<Function>(M->getOrInsertFunction("Main", int64Ty, idb->getInt8PtrTy(), int64Ty, int32Ty, nullptr));
215        mainFunc->setCallingConv(CallingConv::C);
216        idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
217        auto args = mainFunc->arg_begin();
218
219        Value * const buffer = &*(args++);
220        buffer->setName("buffer");
221
222        Value * length = &*(args++);
223        length->setName("length");
224        length = idb->CreateZExtOrTrunc(length, idb->getSizeTy());
225
226        fileIdx = &*(args++);
227        fileIdx->setName("fileIdx");
228
229        ByteStream = pxDriver.addBuffer(make_unique<SourceBuffer>(idb, idb->getStreamSetTy(1, 8)));
230
231        sourceK = pxDriver.addKernelInstance(make_unique<kernel::MemorySourceKernel>(idb, idb->getInt8PtrTy(), segmentSize));
232        sourceK->setInitialArguments({buffer, length});
233
234    } else {
235
236        mainFunc = cast<Function>(M->getOrInsertFunction("Main", int64Ty, idb->getInt32Ty(), int32Ty, nullptr));
237        mainFunc->setCallingConv(CallingConv::C);
238        idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
239        auto args = mainFunc->arg_begin();
240
241        Value * const fileDescriptor = &*(args++);
242        fileDescriptor->setName("fileDescriptor");
243        fileIdx = &*(args++);
244        fileIdx->setName("fileIdx");
245
246        ByteStream = pxDriver.addBuffer(make_unique<SourceBuffer>(idb, idb->getStreamSetTy(1, 8)));
247
248        if (grepSource == GrepSource::File) {
249            sourceK = pxDriver.addKernelInstance(make_unique<kernel::MMapSourceKernel>(idb, segmentSize));
250            sourceK->setInitialArguments({fileDescriptor});
251        } else { // if (grepSource == GrepSource::StdIn) {
252            sourceK = pxDriver.addKernelInstance(make_unique<kernel::ReadSourceKernel>(idb, segmentSize));
253            sourceK->setInitialArguments({idb->getInt32(STDIN_FILENO)});
254        }
255    }
256
257    pxDriver.makeKernelCall(sourceK, {}, {ByteStream});
258    StreamSetBuffer * BasisBits = pxDriver.addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(8, 1), segmentSize * bufferSegments));
259   
260    kernel::Kernel * s2pk = pxDriver.addKernelInstance(make_unique<kernel::S2PKernel>(idb));
261    pxDriver.makeKernelCall(s2pk, {ByteStream}, {BasisBits});
262   
263    kernel::Kernel * linebreakK = pxDriver.addKernelInstance(make_unique<kernel::LineBreakKernelBuilder>(idb, encodingBits));
264    StreamSetBuffer * LineBreakStream = pxDriver.addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
265    pxDriver.makeKernelCall(linebreakK, {BasisBits}, {LineBreakStream});
266   
267    const auto n = REs.size();
268
269    std::vector<StreamSetBuffer *> MatchResultsBufs(n);
270
271    for(unsigned i = 0; i < n; ++i){
272        StreamSetBuffer * MatchResults = pxDriver.addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
273        kernel::Kernel * icgrepK = pxDriver.addKernelInstance(make_unique<kernel::ICgrepKernelBuilder>(idb, REs[i]));
274        pxDriver.makeKernelCall(icgrepK, {BasisBits, LineBreakStream}, {MatchResults});
275        MatchResultsBufs[i] = MatchResults;
276    }
277    StreamSetBuffer * MergedResults = MatchResultsBufs[0];
278    if (REs.size() > 1) {
279        MergedResults = pxDriver.addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
280        kernel::Kernel * streamsMergeK = pxDriver.addKernelInstance(make_unique<kernel::StreamsMerge>(idb, 1, REs.size()));
281        pxDriver.makeKernelCall(streamsMergeK, MatchResultsBufs, {MergedResults});
282    }
283   
284    if (AlgorithmOptionIsSet(re::InvertMatches)) {
285        kernel::Kernel * invertK = pxDriver.addKernelInstance(make_unique<kernel::InvertMatchesKernel>(idb));
286        StreamSetBuffer * OriginalMatches = MergedResults;
287        MergedResults = pxDriver.addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
288        pxDriver.makeKernelCall(invertK, {OriginalMatches, LineBreakStream}, {MergedResults});
289    }
290    if (CountOnly) {
291        kernel::MatchCount matchCountK(idb);
292        pxDriver.addKernelCall(matchCountK, {MergedResults}, {});
293        pxDriver.generatePipelineIR();
294        idb->setKernel(&matchCountK);
295        Value * matchedLineCount = idb->getScalarField("matchedLineCount");
296        matchedLineCount = idb->CreateZExt(matchedLineCount, int64Ty);
297        idb->CreateRet(matchedLineCount);
298        pxDriver.linkAndFinalize();
299    } else {
300        kernel::ScanMatchKernel scanMatchK(idb, grepType, encodingBits);
301        scanMatchK.setInitialArguments({fileIdx});
302        pxDriver.addKernelCall(scanMatchK, {MergedResults, LineBreakStream, ByteStream}, {});
303        switch (grepType) {
304            case GrepType::Normal:
305                if (UTF_16) {
306                    pxDriver.LinkFunction(scanMatchK, "matcher", &wrapped_report_match<uint16_t>);
307                } else {
308                    pxDriver.LinkFunction(scanMatchK, "matcher", &wrapped_report_match<uint8_t>);
309                }
310                break;
311            case GrepType::NameExpression:
312                pxDriver.LinkFunction(scanMatchK, "matcher", &insert_codepoints);
313                break;
314            case GrepType::PropertyValue:
315                pxDriver.LinkFunction(scanMatchK, "matcher", &insert_property_values);
316                break;
317        }
318        pxDriver.generatePipelineIR();
319        idb->CreateRet(idb->getInt64(0));
320        pxDriver.linkAndFinalize();
321    }
322
323    mGrepFunction = pxDriver.getPointerToMain();
324}
325
326re::CC * GrepEngine::grepCodepoints() {
327    parsedCodePointSet = re::makeCC();
328    char * mFileBuffer = getUnicodeNameDataPtr();
329    size_t mFileSize = getUnicodeNameDataSize();
330    doGrep(mFileBuffer, mFileSize, 0);
331    return parsedCodePointSet;
332}
333
334const std::vector<std::string> & GrepEngine::grepPropertyValues(const std::string& propertyName) {
335    enum { MaxSupportedVectorWidthInBytes = 32 };
336    AlignedAllocator<char, MaxSupportedVectorWidthInBytes> alloc;
337    parsedPropertyValues.clear();
338    const std::string & str = UCD::getPropertyValueGrepString(propertyName);
339    const auto n = str.length();
340    // NOTE: MaxSupportedVectorWidthInBytes of trailing 0s are needed to prevent the grep function from
341    // erroneously matching garbage data when loading the final partial block.
342    char * aligned = alloc.allocate(n + MaxSupportedVectorWidthInBytes, 0);
343    std::memcpy(aligned, str.data(), n);
344    std::memset(aligned + n, 0, MaxSupportedVectorWidthInBytes);
345    doGrep(aligned, n, 0);
346    alloc.deallocate(aligned, 0);
347    return parsedPropertyValues;
348}
349
350GrepEngine::GrepEngine()
351: mGrepFunction(nullptr) {
352
353}
Note: See TracBrowser for help on using the repository browser.