source: icGREP/icgrep-devel/icgrep/grep_engine.cpp @ 5450

Last change on this file since 5450 was 5450, checked in by cameron, 2 years ago

Until_N kernel and icgrep -max-count option: initial check-in

File size: 15.4 KB
Line 
1/*
2 *  Copyright (c) 2016 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "grep_engine.h"
8#include <llvm/IR/Module.h>
9//#include <llvm/ExecutionEngine/MCJIT.h>
10#include <llvm/IR/Verifier.h>
11#include <llvm/Support/CommandLine.h>
12#include <boost/filesystem.hpp>
13#include <UCD/UnicodeNameData.h>
14#include <UCD/resolve_properties.h>
15#include <kernels/cc_kernel.h>
16#include <kernels/grep_kernel.h>
17#include <kernels/linebreak_kernel.h>
18#include <kernels/streams_merge.h>
19#include <kernels/match_count.h>
20#include <kernels/source_kernel.h>
21#include <kernels/s2p_kernel.h>
22#include <kernels/scanmatchgen.h>
23#include <kernels/streamset.h>
24#include <kernels/until_n.h>
25#include <kernels/kernel_builder.h>
26#include <pablo/pablo_kernel.h>
27#include <re/re_cc.h>
28#include <re/re_toolchain.h>
29#include <toolchain/toolchain.h>
30#include <iostream>
31#include <sstream>
32#include <cc/multiplex_CCs.h>
33#include <llvm/Support/raw_ostream.h>
34#include <util/aligned_allocator.h>
35#include <sys/stat.h>
36#include <fcntl.h>
37
38using namespace parabix;
39using namespace llvm;
40
41static cl::OptionCategory bGrepOutputOptions("Output Options",
42                                             "These options control the output.");
43static cl::opt<bool> SilenceFileErrors("s", cl::desc("Suppress messages for file errors."), cl::init(false),  cl::cat(bGrepOutputOptions));
44
45static cl::opt<bool> SuppressOutput("q", cl::desc("Suppress normal output; set return code only."), cl::init(false),  cl::cat(bGrepOutputOptions));
46
47static cl::opt<bool> NormalizeLineBreaks("normalize-line-breaks", cl::desc("Normalize line breaks to std::endl."), cl::init(false),  cl::cat(bGrepOutputOptions));
48
49static cl::opt<bool> ShowFileNames("H", cl::desc("Show the file name with each matching line."), cl::cat(bGrepOutputOptions));
50static cl::alias ShowFileNamesLong("with-filename", cl::desc("Alias for -H"), cl::aliasopt(ShowFileNames));
51
52static cl::opt<bool> ShowLineNumbers("n", cl::desc("Show the line number with each matching line."), cl::cat(bGrepOutputOptions));
53static cl::alias ShowLineNumbersLong("line-number", cl::desc("Alias for -n"), cl::aliasopt(ShowLineNumbers));
54
55static cl::opt<int> MaxCount("m", cl::desc("Limit the number of matches per file."), cl::cat(bGrepOutputOptions), cl::init((size_t) -1));
56static cl::alias MaxCountLong("max-count", cl::desc("Alias for -m"), cl::aliasopt(MaxCount));
57
58static re::CC * parsedCodePointSet = nullptr;
59
60static std::vector<std::string> parsedPropertyValues;
61
62uint64_t GrepEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) const {
63    const int32_t fd = open(fileName.c_str(), O_RDONLY);
64    if (LLVM_UNLIKELY(fd == -1)) {
65        return 0;
66    }
67    const auto result = doGrep(fd, fileIdx);
68    close(fd);
69    return result;
70}
71
72uint64_t GrepEngine::doGrep(const int32_t fileDescriptor, const uint32_t fileIdx) const {
73    assert (mGrepFunction);
74    typedef uint64_t (*GrepFunctionType)(int32_t fileDescriptor, const uint32_t fileIdx);
75    return reinterpret_cast<GrepFunctionType>(mGrepFunction)(fileDescriptor, fileIdx);
76}
77
78void GrepEngine::doGrep(const char * buffer, const uint64_t length, const uint32_t fileIdx) const {
79    assert (mGrepFunction);
80    typedef uint64_t (*GrepFunctionType)(const char * buffer, const uint64_t length, const uint32_t fileIdx);
81    reinterpret_cast<GrepFunctionType>(mGrepFunction)(buffer, length, fileIdx);
82}
83
84static int * total_count;
85static std::stringstream * resultStrs = nullptr;
86static std::vector<std::string> inputFiles;
87
88void initFileResult(std::vector<std::string> filenames){
89    const int n = filenames.size();
90    if (n > 1) {
91        ShowFileNames = true;
92    }
93    inputFiles = filenames;
94    resultStrs = new std::stringstream[n];
95    total_count = new int[n];
96    for (unsigned i = 0; i < inputFiles.size(); ++i){
97        total_count[i] = 0;
98    }
99
100}
101
102template<typename CodeUnit>
103void wrapped_report_match(const size_t lineNum, size_t line_start, size_t line_end, const CodeUnit * const buffer, const size_t filesize, const size_t fileIdx) {
104
105//    errs().write_hex((size_t)buffer) << " : " << lineNum << " (" << line_start << ", " << line_end << ", " << filesize << ")\n";
106
107    assert (buffer);
108    assert (line_start <= line_end);
109    assert (line_end <= filesize);
110
111    if (ShowFileNames) {
112        resultStrs[fileIdx] << inputFiles[fileIdx] << ':';
113    }
114    if (ShowLineNumbers) {
115        // Internally line numbers are counted from 0.  For display, adjust
116        // the line number so that lines are numbered from 1.
117        resultStrs[fileIdx] << lineNum+1 << ":";
118    }
119
120    // If the line "starts" on the LF of a CRLF, it is actually the end of the last line.
121    if ((buffer[line_start] == 0xA) && (line_start != line_end)) {
122        ++line_start;
123    }
124
125    if (LLVM_UNLIKELY(line_end == filesize)) {
126        // The match position is at end-of-file.   We have a final unterminated line.
127        resultStrs[fileIdx].write((char *)&buffer[line_start], (line_end - line_start) * sizeof(CodeUnit));
128        if (NormalizeLineBreaks) {
129            resultStrs[fileIdx] << '\n';  // terminate it
130        }
131    } else {
132        const auto end_byte = buffer[line_end];
133        if (NormalizeLineBreaks) {
134            if (LLVM_UNLIKELY(end_byte == 0x85)) {
135                // Line terminated with NEL, on the second byte.  Back up 1.
136                line_end -= 1;
137            } else if (LLVM_UNLIKELY(end_byte > 0xD)) {
138                // Line terminated with PS or LS, on the third byte.  Back up 2.
139                line_end -= 2;
140            }
141            resultStrs[fileIdx].write((char *)&buffer[line_start], (line_end - line_start) * sizeof(CodeUnit));
142            resultStrs[fileIdx] << '\n';
143        } else {
144            if (end_byte == 0x0D) {
145                // Check for line_end on first byte of CRLF; we don't want to access past the end of buffer.
146                if ((line_end + 1) < filesize) {
147                    if (buffer[line_end + 1] == 0x0A) {
148                        // Found CRLF; preserve both bytes.
149                        ++line_end;
150                    }
151                }
152            }
153            resultStrs[fileIdx].write((char *)&buffer[line_start], (line_end - line_start + 1) * sizeof(CodeUnit));
154        }
155    }
156}
157
158void PrintResult(bool CountOnly, std::vector<size_t> & total_CountOnly){
159    if (CountOnly) {
160        if (!ShowFileNames) {
161            for (unsigned i = 0; i < inputFiles.size(); ++i){
162                std::cout << total_CountOnly[i] << std::endl;
163            }
164        } else {
165            for (unsigned i = 0; i < inputFiles.size(); ++i){
166                std::cout << inputFiles[i] << ':' << total_CountOnly[i] << std::endl;
167            };
168        }
169    } else {
170        for (unsigned i = 0; i < inputFiles.size(); ++i){
171            std::cout << resultStrs[i].str();
172        }
173    }
174}
175
176void insert_codepoints(const size_t lineNum, const size_t line_start, const size_t line_end, const char * const buffer) {
177    assert (buffer);
178    assert (line_start <= line_end);
179    re::codepoint_t c = 0;
180    size_t line_pos = line_start;
181    while (isxdigit(buffer[line_pos])) {
182        assert (line_pos < line_end);
183        if (isdigit(buffer[line_pos])) {
184            c = (c << 4) | (buffer[line_pos] - '0');
185        }
186        else {
187            c = (c << 4) | (tolower(buffer[line_pos]) - 'a' + 10);
188        }
189        line_pos++;
190    }
191    assert(((line_pos - line_start) >= 4) && ((line_pos - line_start) <= 6)); // UCD format 4 to 6 hex digits.
192    parsedCodePointSet->insert(c);
193}
194
195void insert_property_values(size_t lineNum, size_t line_start, size_t line_end, const char * buffer) {
196    assert (line_start <= line_end);
197    parsedPropertyValues.emplace_back(buffer + line_start, buffer + line_end);
198}
199
200void GrepEngine::grepCodeGen(const std::string & moduleName, std::vector<re::RE *> REs, const bool CountOnly, const bool UTF_16, GrepSource grepSource, const GrepType grepType) {
201
202    ParabixDriver pxDriver(moduleName + ":icgrep");
203    auto & idb = pxDriver.getBuilder();
204    Module * M = idb->getModule();
205
206    const unsigned segmentSize = codegen::SegmentSize;
207    const unsigned bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
208    const unsigned encodingBits = UTF_16 ? 16 : 8;
209
210    Type * const int64Ty = idb->getInt64Ty();
211    Type * const int32Ty = idb->getInt32Ty();
212
213    Function * mainFunc = nullptr;
214    Value * fileIdx = nullptr;
215    StreamSetBuffer * ByteStream = nullptr;
216    kernel::Kernel * sourceK = nullptr;
217
218    if (grepSource == GrepSource::Internal) {
219
220        mainFunc = cast<Function>(M->getOrInsertFunction("Main", int64Ty, idb->getInt8PtrTy(), int64Ty, int32Ty, nullptr));
221        mainFunc->setCallingConv(CallingConv::C);
222        idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
223        auto args = mainFunc->arg_begin();
224
225        Value * const buffer = &*(args++);
226        buffer->setName("buffer");
227
228        Value * length = &*(args++);
229        length->setName("length");
230        length = idb->CreateZExtOrTrunc(length, idb->getSizeTy());
231
232        fileIdx = &*(args++);
233        fileIdx->setName("fileIdx");
234
235        ByteStream = pxDriver.addBuffer(make_unique<SourceBuffer>(idb, idb->getStreamSetTy(1, 8)));
236
237        sourceK = pxDriver.addKernelInstance(make_unique<kernel::MemorySourceKernel>(idb, idb->getInt8PtrTy(), segmentSize));
238        sourceK->setInitialArguments({buffer, length});
239
240    } else {
241
242        mainFunc = cast<Function>(M->getOrInsertFunction("Main", int64Ty, idb->getInt32Ty(), int32Ty, nullptr));
243        mainFunc->setCallingConv(CallingConv::C);
244        idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
245        auto args = mainFunc->arg_begin();
246
247        Value * const fileDescriptor = &*(args++);
248        fileDescriptor->setName("fileDescriptor");
249        fileIdx = &*(args++);
250        fileIdx->setName("fileIdx");
251
252        ByteStream = pxDriver.addBuffer(make_unique<SourceBuffer>(idb, idb->getStreamSetTy(1, 8)));
253
254        if (grepSource == GrepSource::File) {
255            sourceK = pxDriver.addKernelInstance(make_unique<kernel::MMapSourceKernel>(idb, segmentSize));
256            sourceK->setInitialArguments({fileDescriptor});
257        } else { // if (grepSource == GrepSource::StdIn) {
258            sourceK = pxDriver.addKernelInstance(make_unique<kernel::ReadSourceKernel>(idb, segmentSize));
259            sourceK->setInitialArguments({idb->getInt32(STDIN_FILENO)});
260        }
261    }
262
263    pxDriver.makeKernelCall(sourceK, {}, {ByteStream});
264    StreamSetBuffer * BasisBits = pxDriver.addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(8, 1), segmentSize * bufferSegments));
265   
266    kernel::Kernel * s2pk = pxDriver.addKernelInstance(make_unique<kernel::S2PKernel>(idb));
267    pxDriver.makeKernelCall(s2pk, {ByteStream}, {BasisBits});
268   
269    kernel::Kernel * linebreakK = pxDriver.addKernelInstance(make_unique<kernel::LineBreakKernelBuilder>(idb, encodingBits));
270    StreamSetBuffer * LineBreakStream = pxDriver.addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
271    pxDriver.makeKernelCall(linebreakK, {BasisBits}, {LineBreakStream});
272   
273    const auto n = REs.size();
274
275    std::vector<StreamSetBuffer *> MatchResultsBufs(n);
276
277    for(unsigned i = 0; i < n; ++i){
278        StreamSetBuffer * MatchResults = pxDriver.addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
279        kernel::Kernel * icgrepK = pxDriver.addKernelInstance(make_unique<kernel::ICgrepKernelBuilder>(idb, REs[i]));
280        pxDriver.makeKernelCall(icgrepK, {BasisBits, LineBreakStream}, {MatchResults});
281        MatchResultsBufs[i] = MatchResults;
282    }
283    StreamSetBuffer * MergedResults = MatchResultsBufs[0];
284    if (REs.size() > 1) {
285        MergedResults = pxDriver.addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
286        kernel::Kernel * streamsMergeK = pxDriver.addKernelInstance(make_unique<kernel::StreamsMerge>(idb, 1, REs.size()));
287        pxDriver.makeKernelCall(streamsMergeK, MatchResultsBufs, {MergedResults});
288    }
289   
290    if (AlgorithmOptionIsSet(re::InvertMatches)) {
291        kernel::Kernel * invertK = pxDriver.addKernelInstance(make_unique<kernel::InvertMatchesKernel>(idb));
292        StreamSetBuffer * OriginalMatches = MergedResults;
293        MergedResults = pxDriver.addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
294        pxDriver.makeKernelCall(invertK, {OriginalMatches, LineBreakStream}, {MergedResults});
295    }
296    if (MaxCount > 0) {
297        kernel::Kernel * untilK = pxDriver.addKernelInstance(make_unique<kernel::UntilNkernel>(idb));
298        untilK->setInitialArguments({idb->getSize(MaxCount)});
299        StreamSetBuffer * AllMatches = MergedResults;
300        MergedResults = pxDriver.addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
301        pxDriver.makeKernelCall(untilK, {AllMatches}, {MergedResults});
302    }
303    if (CountOnly) {
304        kernel::MatchCount matchCountK(idb);
305        pxDriver.addKernelCall(matchCountK, {MergedResults}, {});
306        pxDriver.generatePipelineIR();
307        idb->setKernel(&matchCountK);
308        Value * matchedLineCount = idb->getScalarField("matchedLineCount");
309        matchedLineCount = idb->CreateZExt(matchedLineCount, int64Ty);
310        idb->CreateRet(matchedLineCount);
311        pxDriver.linkAndFinalize();
312    } else {
313        kernel::ScanMatchKernel scanMatchK(idb, grepType, encodingBits);
314        scanMatchK.setInitialArguments({fileIdx});
315        pxDriver.addKernelCall(scanMatchK, {MergedResults, LineBreakStream, ByteStream}, {});
316        switch (grepType) {
317            case GrepType::Normal:
318                if (UTF_16) {
319                    pxDriver.LinkFunction(scanMatchK, "matcher", &wrapped_report_match<uint16_t>);
320                } else {
321                    pxDriver.LinkFunction(scanMatchK, "matcher", &wrapped_report_match<uint8_t>);
322                }
323                break;
324            case GrepType::NameExpression:
325                pxDriver.LinkFunction(scanMatchK, "matcher", &insert_codepoints);
326                break;
327            case GrepType::PropertyValue:
328                pxDriver.LinkFunction(scanMatchK, "matcher", &insert_property_values);
329                break;
330        }
331        pxDriver.generatePipelineIR();
332        idb->CreateRet(idb->getInt64(0));
333        pxDriver.linkAndFinalize();
334    }
335
336    mGrepFunction = pxDriver.getPointerToMain();
337}
338
339re::CC * GrepEngine::grepCodepoints() {
340    parsedCodePointSet = re::makeCC();
341    char * mFileBuffer = getUnicodeNameDataPtr();
342    size_t mFileSize = getUnicodeNameDataSize();
343    doGrep(mFileBuffer, mFileSize, 0);
344    return parsedCodePointSet;
345}
346
347const std::vector<std::string> & GrepEngine::grepPropertyValues(const std::string& propertyName) {
348    enum { MaxSupportedVectorWidthInBytes = 32 };
349    AlignedAllocator<char, MaxSupportedVectorWidthInBytes> alloc;
350    parsedPropertyValues.clear();
351    const std::string & str = UCD::getPropertyValueGrepString(propertyName);
352    const auto n = str.length();
353    // NOTE: MaxSupportedVectorWidthInBytes of trailing 0s are needed to prevent the grep function from
354    // erroneously matching garbage data when loading the final partial block.
355    char * aligned = alloc.allocate(n + MaxSupportedVectorWidthInBytes, 0);
356    std::memcpy(aligned, str.data(), n);
357    std::memset(aligned + n, 0, MaxSupportedVectorWidthInBytes);
358    doGrep(aligned, n, 0);
359    alloc.deallocate(aligned, 0);
360    return parsedPropertyValues;
361}
362
363GrepEngine::GrepEngine()
364: mGrepFunction(nullptr) {
365
366}
Note: See TracBrowser for help on using the repository browser.