source: icGREP/icgrep-devel/icgrep/grep_engine.cpp @ 5425

Last change on this file since 5425 was 5425, checked in by nmedfort, 2 years ago

Changes towards separate compilation

File size: 14.7 KB
Line 
1/*
2 *  Copyright (c) 2016 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "grep_engine.h"
8#include <llvm/IR/Module.h>
9#include <llvm/ExecutionEngine/MCJIT.h>
10#include <llvm/IR/Verifier.h>
11#include <llvm/Support/CommandLine.h>
12#include <boost/filesystem.hpp>
13#include <IR_Gen/idisa_builder.h>
14#include <UCD/UnicodeNameData.h>
15#include <UCD/resolve_properties.h>
16#include <kernels/cc_kernel.h>
17#include <kernels/grep_kernel.h>
18#include <kernels/linebreak_kernel.h>
19#include <kernels/streams_merge.h>
20#include <kernels/match_count.h>
21#include <kernels/mmap_kernel.h>
22#include <kernels/s2p_kernel.h>
23#include <kernels/scanmatchgen.h>
24#include <kernels/streamset.h>
25#include <kernels/stdin_kernel.h>
26#include <pablo/pablo_kernel.h>
27#include <re/re_cc.h>
28#include <re/re_toolchain.h>
29#include <toolchain/toolchain.h>
30#include <iostream>
31#include <sstream>
32#include <cc/multiplex_CCs.h>
33#include <llvm/Support/raw_ostream.h>
34#include <util/aligned_allocator.h>
35#include <sys/stat.h>
36#include <fcntl.h>
37
38using namespace parabix;
39using namespace llvm;
40
41static cl::OptionCategory bGrepOutputOptions("Output Options",
42                                             "These options control the output.");
43static cl::opt<bool> SilenceFileErrors("s", cl::desc("Suppress messages for file errors."), cl::init(false),  cl::cat(bGrepOutputOptions));
44
45static cl::opt<bool> SuppressOutput("q", cl::desc("Suppress normal output; set return code only."), cl::init(false),  cl::cat(bGrepOutputOptions));
46
47static cl::opt<bool> NormalizeLineBreaks("normalize-line-breaks", cl::desc("Normalize line breaks to std::endl."), cl::init(false),  cl::cat(bGrepOutputOptions));
48
49static cl::opt<bool> ShowFileNames("H", cl::desc("Show the file name with each matching line."), cl::cat(bGrepOutputOptions));
50static cl::alias ShowFileNamesLong("with-filename", cl::desc("Alias for -H"), cl::aliasopt(ShowFileNames));
51
52static cl::opt<bool> ShowLineNumbers("n", cl::desc("Show the line number with each matching line."), cl::cat(bGrepOutputOptions));
53static cl::alias ShowLineNumbersLong("line-number", cl::desc("Alias for -n"), cl::aliasopt(ShowLineNumbers));
54
55static re::CC * parsedCodePointSet = nullptr;
56
57static std::vector<std::string> parsedPropertyValues;
58
59uint64_t GrepEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) const {
60    const int32_t fd = open(fileName.c_str(), O_RDONLY);
61    if (LLVM_UNLIKELY(fd == -1)) {
62        return 0;
63    }
64    const auto result = doGrep(fd, fileIdx);
65    close(fd);
66    return result;
67}
68
69uint64_t GrepEngine::doGrep(const int32_t fileDescriptor, const uint32_t fileIdx) const {
70    assert (mGrepFunction);
71    typedef uint64_t (*GrepFunctionType)(int32_t fileDescriptor, const uint32_t fileIdx);
72    return reinterpret_cast<GrepFunctionType>(mGrepFunction)(fileDescriptor, fileIdx);
73}
74
75void GrepEngine::doGrep(const char * buffer, const uint64_t length, const uint32_t fileIdx) const {
76    assert (mGrepFunction);
77    typedef uint64_t (*GrepFunctionType)(const char * buffer, const uint64_t length, const uint32_t fileIdx);
78    reinterpret_cast<GrepFunctionType>(mGrepFunction)(buffer, length, fileIdx);
79}
80
// Per-file state, indexed by the position of the file in inputFiles.
// The two arrays are allocated by initFileResult().
static int * total_count = nullptr;                // one counter per input file
static std::stringstream * resultStrs = nullptr;   // buffered output text per input file
static std::vector<std::string> inputFiles;        // file names as passed to initFileResult()
84
85void initFileResult(std::vector<std::string> filenames){
86    const int n = filenames.size();
87    if (n > 1) {
88        ShowFileNames = true;
89    }
90    inputFiles = filenames;
91    resultStrs = new std::stringstream[n];
92    total_count = new int[n];
93    for (unsigned i = 0; i < inputFiles.size(); ++i){
94        total_count[i] = 0;
95    }
96
97}
98
99template<typename CodeUnit>
100void wrapped_report_match(const size_t lineNum, size_t line_start, size_t line_end, const CodeUnit * const buffer, const size_t filesize, const size_t fileIdx) {
101
102//    errs().write_hex((size_t)buffer) << " : " << lineNum << " (" << line_start << ", " << line_end << ", " << filesize << ")\n";
103
104    assert (buffer);
105    assert (line_start <= line_end);
106    assert (line_end <= filesize);
107
108    if (ShowFileNames) {
109        resultStrs[fileIdx] << inputFiles[fileIdx] << ':';
110    }
111    if (ShowLineNumbers) {
112        resultStrs[fileIdx] << lineNum << ":";
113    }
114
115    // If the line "starts" on the LF of a CRLF, it is actually the end of the last line.
116    if ((buffer[line_start] == 0xA) && (line_start != line_end)) {
117        ++line_start;
118    }
119
120    if (LLVM_UNLIKELY(line_end == filesize)) {
121        // The match position is at end-of-file.   We have a final unterminated line.
122        resultStrs[fileIdx].write((char *)&buffer[line_start], (line_end - line_start) * sizeof(CodeUnit));
123        if (NormalizeLineBreaks) {
124            resultStrs[fileIdx] << '\n';  // terminate it
125        }
126    } else {
127        const auto end_byte = buffer[line_end];
128        if (NormalizeLineBreaks) {
129            if (LLVM_UNLIKELY(end_byte == 0x85)) {
130                // Line terminated with NEL, on the second byte.  Back up 1.
131                line_end -= 1;
132            } else if (LLVM_UNLIKELY(end_byte > 0xD)) {
133                // Line terminated with PS or LS, on the third byte.  Back up 2.
134                line_end -= 2;
135            }
136            resultStrs[fileIdx].write((char *)&buffer[line_start], (line_end - line_start) * sizeof(CodeUnit));
137            resultStrs[fileIdx] << '\n';
138        } else {
139            if (end_byte == 0x0D) {
140                // Check for line_end on first byte of CRLF; we don't want to access past the end of buffer.
141                if ((line_end + 1) < filesize) {
142                    if (buffer[line_end + 1] == 0x0A) {
143                        // Found CRLF; preserve both bytes.
144                        ++line_end;
145                    }
146                }
147            }
148            resultStrs[fileIdx].write((char *)&buffer[line_start], (line_end - line_start + 1) * sizeof(CodeUnit));
149        }
150    }
151}
152
153void PrintResult(bool CountOnly, std::vector<size_t> & total_CountOnly){
154    if (CountOnly) {
155        if (!ShowFileNames) {
156            for (unsigned i = 0; i < inputFiles.size(); ++i){
157                std::cout << total_CountOnly[i] << std::endl;
158            }
159        } else {
160            for (unsigned i = 0; i < inputFiles.size(); ++i){
161                std::cout << inputFiles[i] << ':' << total_CountOnly[i] << std::endl;
162            };
163        }
164    } else {
165        for (unsigned i = 0; i < inputFiles.size(); ++i){
166            std::cout << resultStrs[i].str();
167        }
168    }
169}
170
171void insert_codepoints(const size_t lineNum, const size_t line_start, const size_t line_end, const char * const buffer) {
172    assert (buffer);
173    assert (line_start <= line_end);
174    re::codepoint_t c = 0;
175    size_t line_pos = line_start;
176    while (isxdigit(buffer[line_pos])) {
177        assert (line_pos < line_end);
178        if (isdigit(buffer[line_pos])) {
179            c = (c << 4) | (buffer[line_pos] - '0');
180        }
181        else {
182            c = (c << 4) | (tolower(buffer[line_pos]) - 'a' + 10);
183        }
184        line_pos++;
185    }
186    assert(((line_pos - line_start) >= 4) && ((line_pos - line_start) <= 6)); // UCD format 4 to 6 hex digits.
187    parsedCodePointSet->insert(c);
188}
189
190void insert_property_values(size_t lineNum, size_t line_start, size_t line_end, const char * buffer) {
191    assert (line_start <= line_end);
192    parsedPropertyValues.emplace_back(buffer + line_start, buffer + line_end);
193}
194
195void GrepEngine::grepCodeGen(const std::string & moduleName, std::vector<re::RE *> REs, const bool CountOnly, const bool UTF_16, GrepSource grepSource, const GrepType grepType) {
196
197    ParabixDriver pxDriver(moduleName + ":icgrep");
198    auto idb = pxDriver.getIDISA_Builder();
199    Module * M = idb->getModule();
200
201    const unsigned segmentSize = codegen::SegmentSize;
202    const unsigned bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
203    const unsigned encodingBits = UTF_16 ? 16 : 8;
204
205    Type * const int64Ty = idb->getInt64Ty();
206    Type * const int32Ty = idb->getInt32Ty();
207
208    Function * mainFunc = nullptr;
209    Value * fileIdx = nullptr;
210    StreamSetBuffer * ByteStream = nullptr;
211    kernel::KernelBuilder * sourceK = nullptr;
212
213    if (grepSource == GrepSource::Internal) {
214
215        mainFunc = cast<Function>(M->getOrInsertFunction("Main", int64Ty, idb->getInt8PtrTy(), int64Ty, int32Ty, nullptr));
216        mainFunc->setCallingConv(CallingConv::C);
217        idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
218        auto args = mainFunc->arg_begin();
219
220        Value * const buffer = &*(args++);
221        buffer->setName("buffer");
222
223        Value * length = &*(args++);
224        length->setName("length");
225        length = idb->CreateZExtOrTrunc(length, idb->getSizeTy());
226
227        fileIdx = &*(args++);
228        fileIdx->setName("fileIdx");
229
230        ByteStream = pxDriver.addBuffer(make_unique<SourceFileBuffer>(idb, idb->getStreamSetTy(1, 8)));
231
232        sourceK = pxDriver.addKernelInstance(make_unique<kernel::FileSourceKernel>(idb, idb->getInt8PtrTy(), segmentSize));
233        sourceK->setInitialArguments({buffer, length});
234
235    } else {
236
237        mainFunc = cast<Function>(M->getOrInsertFunction("Main", int64Ty, idb->getInt32Ty(), int32Ty, nullptr));
238        mainFunc->setCallingConv(CallingConv::C);
239        idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
240        auto args = mainFunc->arg_begin();
241
242        Value * const fileDescriptor = &*(args++);
243        fileDescriptor->setName("fileDescriptor");
244        fileIdx = &*(args++);
245        fileIdx->setName("fileIdx");
246
247        if (grepSource == GrepSource::File) {
248            ByteStream = pxDriver.addBuffer(make_unique<SourceFileBuffer>(idb, idb->getStreamSetTy(1, 8)));
249            sourceK = pxDriver.addKernelInstance(make_unique<kernel::MMapSourceKernel>(idb, segmentSize));
250            sourceK->setInitialArguments({fileDescriptor});
251        } else { // if (grepSource == GrepSource::StdIn) {
252            ByteStream = pxDriver.addBuffer(make_unique<ExtensibleBuffer>(idb, idb->getStreamSetTy(1, 8), segmentSize));
253            sourceK = pxDriver.addKernelInstance(make_unique<kernel::StdInKernel>(idb, segmentSize));
254        }
255    }
256
257    pxDriver.makeKernelCall(sourceK, {}, {ByteStream});
258    StreamSetBuffer * BasisBits = pxDriver.addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(8, 1), segmentSize * bufferSegments));
259   
260    kernel::KernelBuilder * s2pk = pxDriver.addKernelInstance(make_unique<kernel::S2PKernel>(idb));
261    pxDriver.makeKernelCall(s2pk, {ByteStream}, {BasisBits});
262   
263    kernel::KernelBuilder * linebreakK = pxDriver.addKernelInstance(make_unique<kernel::LineBreakKernelBuilder>(idb, encodingBits));
264    StreamSetBuffer * LineBreakStream = pxDriver.addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
265    pxDriver.makeKernelCall(linebreakK, {BasisBits}, {LineBreakStream});
266   
267    const auto n = REs.size();
268
269    std::vector<StreamSetBuffer *> MatchResultsBufs(n);
270
271    for(unsigned i = 0; i < n; ++i){
272        StreamSetBuffer * MatchResults = pxDriver.addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
273        kernel::KernelBuilder * icgrepK = pxDriver.addKernelInstance(make_unique<kernel::ICgrepKernelBuilder>(idb, REs[i]));
274        pxDriver.makeKernelCall(icgrepK, {BasisBits, LineBreakStream}, {MatchResults});
275        MatchResultsBufs[i] = MatchResults;
276    }
277    StreamSetBuffer * MergedResults = MatchResultsBufs[0];
278    if (REs.size() > 1) {
279        MergedResults = pxDriver.addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
280        kernel::KernelBuilder * streamsMergeK = pxDriver.addKernelInstance(make_unique<kernel::StreamsMerge>(idb, 1, REs.size()));
281        pxDriver.makeKernelCall(streamsMergeK, MatchResultsBufs, {MergedResults});
282    }
283   
284    if (AlgorithmOptionIsSet(re::InvertMatches)) {
285        kernel::KernelBuilder * invertK = pxDriver.addKernelInstance(make_unique<kernel::InvertMatchesKernel>(idb));
286        StreamSetBuffer * OriginalMatches = MergedResults;
287        MergedResults = pxDriver.addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
288        pxDriver.makeKernelCall(invertK, {OriginalMatches, LineBreakStream}, {MergedResults});
289    }
290    if (CountOnly) {
291        kernel::MatchCount matchCountK(idb);
292        pxDriver.addKernelCall(matchCountK, {MergedResults}, {});
293        pxDriver.generatePipelineIR();
294        Value * matchedLineCount = matchCountK.getScalarField("matchedLineCount");
295        matchedLineCount = idb->CreateZExt(matchedLineCount, int64Ty);
296        idb->CreateRet(matchedLineCount);
297        pxDriver.linkAndFinalize();
298    } else {
299        kernel::ScanMatchKernel scanMatchK(idb, grepType, encodingBits);
300        scanMatchK.setInitialArguments({fileIdx});
301        pxDriver.addKernelCall(scanMatchK, {MergedResults, LineBreakStream, ByteStream}, {});
302        switch (grepType) {
303            case GrepType::Normal:
304                if (UTF_16) {
305                    pxDriver.LinkFunction(scanMatchK, "matcher", &wrapped_report_match<uint16_t>);
306                } else {
307                    pxDriver.LinkFunction(scanMatchK, "matcher", &wrapped_report_match<uint8_t>);
308                }
309                break;
310            case GrepType::NameExpression:
311                pxDriver.LinkFunction(scanMatchK, "matcher", &insert_codepoints);
312                break;
313            case GrepType::PropertyValue:
314                pxDriver.LinkFunction(scanMatchK, "matcher", &insert_property_values);
315                break;
316        }
317        pxDriver.generatePipelineIR();
318        idb->CreateRet(idb->getInt64(0));
319        pxDriver.linkAndFinalize();
320    }
321
322    mGrepFunction = pxDriver.getPointerToMain();
323}
324
325re::CC * GrepEngine::grepCodepoints() {
326    parsedCodePointSet = re::makeCC();
327    char * mFileBuffer = getUnicodeNameDataPtr();
328    size_t mFileSize = getUnicodeNameDataSize();
329    doGrep(mFileBuffer, mFileSize, 0);
330    return parsedCodePointSet;
331}
332
333const std::vector<std::string> & GrepEngine::grepPropertyValues(const std::string& propertyName) {
334    enum { MaxSupportedVectorWidthInBytes = 32 };
335    AlignedAllocator<char, MaxSupportedVectorWidthInBytes> alloc;
336    parsedPropertyValues.clear();
337    const std::string & str = UCD::getPropertyValueGrepString(propertyName);
338    const auto n = str.length();
339    // NOTE: MaxSupportedVectorWidthInBytes of trailing 0s are needed to prevent the grep function from
340    // erroneously matching garbage data when loading the final partial block.
341    char * aligned = alloc.allocate(n + MaxSupportedVectorWidthInBytes, 0);
342    std::memcpy(aligned, str.data(), n);
343    std::memset(aligned + n, 0, MaxSupportedVectorWidthInBytes);
344    doGrep(aligned, n, 0);
345    alloc.deallocate(aligned, 0);
346    return parsedPropertyValues;
347}
348
349GrepEngine::GrepEngine()
350: mGrepFunction(nullptr) {
351
352}
Note: See TracBrowser for help on using the repository browser.