source: icGREP/icgrep-devel/icgrep/grep_engine.cpp @ 5419

Last change on this file since 5419 was 5419, checked in by nmedfort, 2 years ago

Bug fixes for 32-bit systems

File size: 15.1 KB
Line 
1/*
2 *  Copyright (c) 2016 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "grep_engine.h"
8#include <llvm/IR/Module.h>
9#include <llvm/ExecutionEngine/MCJIT.h>
10#include <llvm/IR/Verifier.h>
11#include <llvm/Support/CommandLine.h>
12#include <boost/filesystem.hpp>
13#include <IR_Gen/idisa_builder.h>
14#include <IR_Gen/idisa_target.h>
15#include <UCD/UnicodeNameData.h>
16#include <UCD/resolve_properties.h>
17#include <kernels/cc_kernel.h>
18#include <kernels/grep_kernel.h>
19#include <kernels/linebreak_kernel.h>
20#include <kernels/streams_merge.h>
21#include <kernels/match_count.h>
22#include <kernels/mmap_kernel.h>
23#include <kernels/s2p_kernel.h>
24#include <kernels/scanmatchgen.h>
25#include <kernels/streamset.h>
26#include <kernels/stdin_kernel.h>
27#include <pablo/pablo_kernel.h>
28#include <re/re_cc.h>
29#include <re/re_toolchain.h>
30#include <kernels/toolchain.h>
31#include <iostream>
32#include <sstream>
33#include <cc/multiplex_CCs.h>
34#include <llvm/Support/raw_ostream.h>
35#include <util/aligned_allocator.h>
36#include <sys/stat.h>
37#include <fcntl.h>
38
39using namespace parabix;
40using namespace llvm;
41
42static cl::OptionCategory bGrepOutputOptions("Output Options",
43                                             "These options control the output.");
44static cl::opt<bool> SilenceFileErrors("s", cl::desc("Suppress messages for file errors."), cl::init(false),  cl::cat(bGrepOutputOptions));
45
46static cl::opt<bool> SuppressOutput("q", cl::desc("Suppress normal output; set return code only."), cl::init(false),  cl::cat(bGrepOutputOptions));
47
48static cl::opt<bool> NormalizeLineBreaks("normalize-line-breaks", cl::desc("Normalize line breaks to std::endl."), cl::init(false),  cl::cat(bGrepOutputOptions));
49
50static cl::opt<bool> ShowFileNames("H", cl::desc("Show the file name with each matching line."), cl::cat(bGrepOutputOptions));
51static cl::alias ShowFileNamesLong("with-filename", cl::desc("Alias for -H"), cl::aliasopt(ShowFileNames));
52
53static cl::opt<bool> ShowLineNumbers("n", cl::desc("Show the line number with each matching line."), cl::cat(bGrepOutputOptions));
54static cl::alias ShowLineNumbersLong("line-number", cl::desc("Alias for -n"), cl::aliasopt(ShowLineNumbers));
55
56static re::CC * parsedCodePointSet = nullptr;
57
58static std::vector<std::string> parsedPropertyValues;
59
60uint64_t GrepEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) const {
61    const int32_t fd = open(fileName.c_str(), O_RDONLY);
62    if (LLVM_UNLIKELY(fd == -1)) {
63        return 0;
64    }
65    const auto result = doGrep(fd, fileIdx);
66    close(fd);
67    return result;
68}
69
70uint64_t GrepEngine::doGrep(const int32_t fileDescriptor, const uint32_t fileIdx) const {
71    assert (mGrepFunction);
72    typedef uint64_t (*GrepFunctionType)(int32_t fileDescriptor, const uint32_t fileIdx);
73    return reinterpret_cast<GrepFunctionType>(mGrepFunction)(fileDescriptor, fileIdx);
74}
75
76void GrepEngine::doGrep(const char * buffer, const uint64_t length, const uint32_t fileIdx) const {
77    assert (mGrepFunction);
78    typedef uint64_t (*GrepFunctionType)(const char * buffer, const uint64_t length, const uint32_t fileIdx);
79    reinterpret_cast<GrepFunctionType>(mGrepFunction)(buffer, length, fileIdx);
80}
81
// Per-file reporting state. Both arrays are allocated by initFileResult()
// with one slot per entry of inputFiles.
static int * total_count = nullptr;                 // per-file counters
static std::stringstream * resultStrs = nullptr;    // per-file buffered output
static std::vector<std::string> inputFiles;         // names of the files being searched
85
86void initFileResult(std::vector<std::string> filenames){
87    const int n = filenames.size();
88    if (n > 1) {
89        ShowFileNames = true;
90    }
91    inputFiles = filenames;
92    resultStrs = new std::stringstream[n];
93    total_count = new int[n];
94    for (unsigned i = 0; i < inputFiles.size(); ++i){
95        total_count[i] = 0;
96    }
97
98}
99
100template<typename CodeUnit>
101void wrapped_report_match(const size_t lineNum, size_t line_start, size_t line_end, const CodeUnit * const buffer, const size_t filesize, const size_t fileIdx) {
102
103//    errs().write_hex((size_t)buffer) << " : " << lineNum << " (" << line_start << ", " << line_end << ", " << filesize << ")\n";
104
105    assert (buffer);
106    assert (line_start <= line_end);
107    assert (line_end <= filesize);
108
109    if (ShowFileNames) {
110        resultStrs[fileIdx] << inputFiles[fileIdx] << ':';
111    }
112    if (ShowLineNumbers) {
113        resultStrs[fileIdx] << lineNum << ":";
114    }
115
116    // If the line "starts" on the LF of a CRLF, it is actually the end of the last line.
117    if ((buffer[line_start] == 0xA) && (line_start != line_end)) {
118        ++line_start;
119    }
120
121    if (LLVM_UNLIKELY(line_end == filesize)) {
122        // The match position is at end-of-file.   We have a final unterminated line.
123        resultStrs[fileIdx].write((char *)&buffer[line_start], (line_end - line_start) * sizeof(CodeUnit));
124        if (NormalizeLineBreaks) {
125            resultStrs[fileIdx] << '\n';  // terminate it
126        }
127    } else {
128        const auto end_byte = buffer[line_end];
129        if (NormalizeLineBreaks) {
130            if (LLVM_UNLIKELY(end_byte == 0x85)) {
131                // Line terminated with NEL, on the second byte.  Back up 1.
132                line_end -= 1;
133            } else if (LLVM_UNLIKELY(end_byte > 0xD)) {
134                // Line terminated with PS or LS, on the third byte.  Back up 2.
135                line_end -= 2;
136            }
137            resultStrs[fileIdx].write((char *)&buffer[line_start], (line_end - line_start) * sizeof(CodeUnit));
138            resultStrs[fileIdx] << '\n';
139        } else {
140            if (end_byte == 0x0D) {
141                // Check for line_end on first byte of CRLF; we don't want to access past the end of buffer.
142                if ((line_end + 1) < filesize) {
143                    if (buffer[line_end + 1] == 0x0A) {
144                        // Found CRLF; preserve both bytes.
145                        ++line_end;
146                    }
147                }
148            }
149            resultStrs[fileIdx].write((char *)&buffer[line_start], (line_end - line_start + 1) * sizeof(CodeUnit));
150        }
151    }
152}
153
154void PrintResult(bool CountOnly, std::vector<size_t> & total_CountOnly){
155    if (CountOnly) {
156        if (!ShowFileNames) {
157            for (unsigned i = 0; i < inputFiles.size(); ++i){
158                std::cout << total_CountOnly[i] << std::endl;
159            }
160        } else {
161            for (unsigned i = 0; i < inputFiles.size(); ++i){
162                std::cout << inputFiles[i] << ':' << total_CountOnly[i] << std::endl;
163            };
164        }
165    } else {
166        for (unsigned i = 0; i < inputFiles.size(); ++i){
167            std::cout << resultStrs[i].str();
168        }
169    }
170}
171
172void insert_codepoints(const size_t lineNum, const size_t line_start, const size_t line_end, const char * const buffer) {
173    assert (buffer);
174    assert (line_start <= line_end);
175    re::codepoint_t c = 0;
176    size_t line_pos = line_start;
177    while (isxdigit(buffer[line_pos])) {
178        assert (line_pos < line_end);
179        if (isdigit(buffer[line_pos])) {
180            c = (c << 4) | (buffer[line_pos] - '0');
181        }
182        else {
183            c = (c << 4) | (tolower(buffer[line_pos]) - 'a' + 10);
184        }
185        line_pos++;
186    }
187    assert(((line_pos - line_start) >= 4) && ((line_pos - line_start) <= 6)); // UCD format 4 to 6 hex digits.
188    parsedCodePointSet->insert(c);
189}
190
191void insert_property_values(size_t lineNum, size_t line_start, size_t line_end, const char * buffer) {
192    assert (line_start <= line_end);
193    parsedPropertyValues.emplace_back(buffer + line_start, buffer + line_end);
194}
195
196inline void linkGrepFunction(ParabixDriver & pxDriver, const GrepType grepType, const bool UTF_16, kernel::KernelBuilder & kernel) {
197    switch (grepType) {
198        case GrepType::Normal:
199            if (UTF_16) {
200                pxDriver.addExternalLink(kernel, "matcher", &wrapped_report_match<uint16_t>);
201            } else {
202                pxDriver.addExternalLink(kernel, "matcher", &wrapped_report_match<uint8_t>);
203            }
204            break;
205        case GrepType::NameExpression:
206            pxDriver.addExternalLink(kernel, "matcher", &insert_codepoints);
207            break;
208        case GrepType::PropertyValue:
209            pxDriver.addExternalLink(kernel, "matcher", &insert_property_values);
210            break;
211    }
212}
213
214void GrepEngine::grepCodeGen(std::string moduleName, std::vector<re::RE *> REs, const bool CountOnly, const bool UTF_16, GrepSource grepSource, const GrepType grepType) {
215
216    Module * M = new Module(moduleName + ":icgrep", getGlobalContext());;
217    IDISA::IDISA_Builder * iBuilder = IDISA::GetIDISA_Builder(M);;
218    ParabixDriver pxDriver(iBuilder);
219
220    const unsigned segmentSize = codegen::SegmentSize;
221    const unsigned bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
222    const unsigned encodingBits = UTF_16 ? 16 : 8;
223
224    Type * const int64Ty = iBuilder->getInt64Ty();
225    Type * const int32Ty = iBuilder->getInt32Ty();
226
227    Function * mainFunc = nullptr;
228    Value * fileIdx = nullptr;
229    StreamSetBuffer * ByteStream = nullptr;
230    kernel::KernelBuilder * sourceK = nullptr;
231
232    if (grepSource == GrepSource::Internal) {
233
234        mainFunc = cast<Function>(M->getOrInsertFunction("Main", int64Ty, iBuilder->getInt8PtrTy(), int64Ty, int32Ty, nullptr));
235        mainFunc->setCallingConv(CallingConv::C);
236        iBuilder->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
237        Function::arg_iterator args = mainFunc->arg_begin();
238
239        Value * const buffer = &*(args++);
240        buffer->setName("buffer");
241
242        Value * length = &*(args++);
243        length->setName("length");
244        length = iBuilder->CreateZExtOrTrunc(length, iBuilder->getSizeTy());
245
246        fileIdx = &*(args++);
247        fileIdx->setName("fileIdx");
248
249        ByteStream = pxDriver.addBuffer(make_unique<SourceFileBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8)));
250
251        sourceK = pxDriver.addKernelInstance(make_unique<kernel::FileSourceKernel>(iBuilder, iBuilder->getInt8PtrTy(), segmentSize));
252        sourceK->setInitialArguments({buffer, length});
253
254    } else {
255
256        mainFunc = cast<Function>(M->getOrInsertFunction("Main", int64Ty, iBuilder->getInt32Ty(), int32Ty, nullptr));
257        mainFunc->setCallingConv(CallingConv::C);
258        iBuilder->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
259        Function::arg_iterator args = mainFunc->arg_begin();
260
261        Value * const fileDescriptor = &*(args++);
262        fileDescriptor->setName("fileDescriptor");
263        fileIdx = &*(args++);
264        fileIdx->setName("fileIdx");
265
266        if (grepSource == GrepSource::File) {
267            ByteStream = pxDriver.addBuffer(make_unique<SourceFileBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8)));
268            sourceK = pxDriver.addKernelInstance(make_unique<kernel::MMapSourceKernel>(iBuilder, segmentSize));
269            sourceK->setInitialArguments({fileDescriptor});
270        } else { // if (grepSource == GrepSource::StdIn) {
271            ByteStream = pxDriver.addBuffer(make_unique<ExtensibleBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), segmentSize));
272            sourceK = pxDriver.addKernelInstance(make_unique<kernel::StdInKernel>(iBuilder, segmentSize));
273        }
274    }
275
276    pxDriver.makeKernelCall(sourceK, {}, {ByteStream});
277    StreamSetBuffer * BasisBits = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(8, 1), segmentSize * bufferSegments));
278   
279    kernel::KernelBuilder * s2pk = pxDriver.addKernelInstance(make_unique<kernel::S2PKernel>(iBuilder));
280    pxDriver.makeKernelCall(s2pk, {ByteStream}, {BasisBits});
281   
282    kernel::KernelBuilder * linebreakK = pxDriver.addKernelInstance(make_unique<kernel::LineBreakKernelBuilder>(iBuilder, encodingBits));
283    StreamSetBuffer * LineBreakStream = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), segmentSize * bufferSegments));
284    pxDriver.makeKernelCall(linebreakK, {BasisBits}, {LineBreakStream});
285   
286    const auto n = REs.size();
287
288    std::vector<StreamSetBuffer *> MatchResultsBufs(n);
289
290    for(unsigned i = 0; i < n; ++i){
291        StreamSetBuffer * MatchResults = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), segmentSize * bufferSegments));
292        kernel::KernelBuilder * icgrepK = pxDriver.addKernelInstance(make_unique<kernel::ICgrepKernelBuilder>(iBuilder, REs[i]));
293        pxDriver.makeKernelCall(icgrepK, {BasisBits, LineBreakStream}, {MatchResults});
294        MatchResultsBufs[i] = MatchResults;
295    }
296    StreamSetBuffer * MergedResults = MatchResultsBufs[0];
297    if (REs.size() > 1) {
298        MergedResults = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), segmentSize * bufferSegments));
299        kernel::KernelBuilder * streamsMergeK = pxDriver.addKernelInstance(make_unique<kernel::StreamsMerge>(iBuilder, 1, REs.size()));
300        pxDriver.makeKernelCall(streamsMergeK, MatchResultsBufs, {MergedResults});
301    }
302   
303    if (AlgorithmOptionIsSet(re::InvertMatches)) {
304        kernel::KernelBuilder * invertK = pxDriver.addKernelInstance(make_unique<kernel::InvertMatchesKernel>(iBuilder));
305        StreamSetBuffer * OriginalMatches = MergedResults;
306        MergedResults = pxDriver.addBuffer(make_unique<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 1), segmentSize * bufferSegments));
307        pxDriver.makeKernelCall(invertK, {OriginalMatches, LineBreakStream}, {MergedResults});
308    }
309    if (CountOnly) {
310        kernel::MatchCount matchCountK(iBuilder);
311        pxDriver.addKernelCall(matchCountK, {MergedResults}, {});
312        pxDriver.generatePipelineIR();
313        Value * matchedLineCount = matchCountK.getScalarField("matchedLineCount");
314        matchedLineCount = iBuilder->CreateZExt(matchedLineCount, int64Ty);
315        iBuilder->CreateRet(matchedLineCount);
316        pxDriver.linkAndFinalize();
317    } else {
318        kernel::ScanMatchKernel scanMatchK(iBuilder, grepType, encodingBits);
319        scanMatchK.setInitialArguments({fileIdx});
320        pxDriver.addKernelCall(scanMatchK, {MergedResults, LineBreakStream, ByteStream}, {});
321        linkGrepFunction(pxDriver, grepType, UTF_16, scanMatchK);
322        pxDriver.generatePipelineIR();
323        iBuilder->CreateRet(iBuilder->getInt64(0));
324        pxDriver.linkAndFinalize();
325    }
326
327    mGrepFunction = pxDriver.getPointerToMain();
328}
329
330re::CC * GrepEngine::grepCodepoints() {
331    parsedCodePointSet = re::makeCC();
332    char * mFileBuffer = getUnicodeNameDataPtr();
333    size_t mFileSize = getUnicodeNameDataSize();
334    doGrep(mFileBuffer, mFileSize, 0);
335    return parsedCodePointSet;
336}
337
338const std::vector<std::string> & GrepEngine::grepPropertyValues(const std::string& propertyName) {
339    enum { MaxSupportedVectorWidthInBytes = 32 };
340    AlignedAllocator<char, MaxSupportedVectorWidthInBytes> alloc;
341    parsedPropertyValues.clear();
342    const std::string & str = UCD::getPropertyValueGrepString(propertyName);
343    const auto n = str.length();
344    // NOTE: MaxSupportedVectorWidthInBytes of trailing 0s are needed to prevent the grep function from
345    // erroneously matching garbage data when loading the final partial block.
346    char * aligned = alloc.allocate(n + MaxSupportedVectorWidthInBytes, 0);
347    std::memcpy(aligned, str.data(), n);
348    std::memset(aligned + n, 0, MaxSupportedVectorWidthInBytes);
349    doGrep(aligned, n, 0);
350    alloc.deallocate(aligned, 0);
351    return parsedPropertyValues;
352}
353
354GrepEngine::GrepEngine()
355: mGrepFunction(nullptr) {
356
357}
Note: See TracBrowser for help on using the repository browser.