source: icGREP/icgrep-devel/icgrep/grep_engine.cpp @ 5693

Last change on this file since 5693 was 5693, checked in by cameron, 2 years ago

Unified FDsource kernel; filename - now interpreted as stdin

File size: 15.7 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "grep_engine.h"
8#include "grep_interface.h"
9#include <llvm/IR/Module.h>
10#include <boost/filesystem.hpp>
11#include <UCD/resolve_properties.h>
12#include <kernels/charclasses.h>
13#include <kernels/cc_kernel.h>
14#include <kernels/grep_kernel.h>
15#include <kernels/linebreak_kernel.h>
16#include <kernels/streams_merge.h>
17#include <kernels/source_kernel.h>
18#include <kernels/s2p_kernel.h>
19#include <kernels/scanmatchgen.h>
20#include <kernels/streamset.h>
21#include <kernels/until_n.h>
22#include <kernels/kernel_builder.h>
23#include <pablo/pablo_kernel.h>
24#include <re/re_cc.h>
25#include <re/re_toolchain.h>
26#include <toolchain/toolchain.h>
27#include <re/re_name_resolve.h>   
28#include <re/re_collect_unicodesets.h>
29#include <re/re_multiplex.h>
30#include <toolchain/cpudriver.h>
31#include <toolchain/NVPTXDriver.h>
32#include <iostream>
33#include <sstream>
34#include <cc/multiplex_CCs.h>
35#include <llvm/Support/raw_ostream.h>
36#include <util/aligned_allocator.h>
37#include <sys/stat.h>
38#include <fcntl.h>
39#include <errno.h>
40#include <mutex>
41
42using namespace parabix;
43using namespace llvm;
44
45namespace grep {
46
47static std::stringstream * resultStrs = nullptr;
48static std::vector<std::string> inputFiles;
49static std::vector<std::string> linePrefix;
50static bool grepMatchFound;
51
52size_t * startPoints = nullptr;
53size_t * accumBytes = nullptr;
54
55
56std::mutex count_mutex;
57size_t fileCount;
58
59// DoGrep thread function.
60void *DoGrepThreadFunction(void *args)
61{
62    size_t fileIdx;
63    grep::GrepEngine * grepEngine = (grep::GrepEngine *)args;
64
65    count_mutex.lock();
66    fileIdx = fileCount;
67    fileCount++;
68    count_mutex.unlock();
69
70    while (fileIdx < inputFiles.size()) {
71        size_t grepResult = grepEngine->doGrep(inputFiles[fileIdx], fileIdx);
72       
73        count_mutex.lock();
74        if (grepResult > 0) grepMatchFound = true;
75        fileIdx = fileCount;
76        fileCount++;
77        count_mutex.unlock();
78        if (QuietMode && grepMatchFound) pthread_exit(nullptr);
79    }
80
81    pthread_exit(nullptr);
82}
83
84bool matchesNeedToBeMovedToEOL() {
85    if ((Mode == QuietMode) | (Mode == FilesWithMatch) | (Mode == FilesWithoutMatch)) {
86        return false;
87    }
88    else if (LineRegexpFlag) {
89        return false;
90    }
91    // TODO: return false for other cases based on regexp analysis, e.g., regexp ends with $.
92    return true;
93}
94   
95uint64_t GrepEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) const {
96    if (fileName == "-") {
97        return doGrep(STDIN_FILENO, fileIdx);
98    }
99    struct stat sb;
100    const int32_t fd = open(fileName.c_str(), O_RDONLY);
101    if (LLVM_UNLIKELY(fd == -1)) {
102        if (!NoMessagesFlag  && !(Mode == QuietMode)) {
103            if (errno == EACCES) {
104                resultStrs[fileIdx] << "icgrep: " << fileName << ": Permission denied.\n";
105            }
106            else if (errno == ENOENT) {
107                resultStrs[fileIdx] << "icgrep: " << fileName << ": No such file.\n";
108            }
109            else {
110                resultStrs[fileIdx] << "icgrep: " << fileName << ": Failed.\n";
111            }
112        }
113        return 0;
114    }
115    if (stat(fileName.c_str(), &sb) == 0 && S_ISDIR(sb.st_mode)) {
116        if (!NoMessagesFlag  && !(Mode == QuietMode)) {
117            resultStrs[fileIdx] << "icgrep: " << fileName << ": Is a directory.\n";
118        }
119        close(fd);
120        return 0;
121    }
122    const auto result = doGrep(fd, fileIdx);
123    close(fd);
124    return result;
125}
126
127uint64_t GrepEngine::doGrep(const int32_t fileDescriptor, const uint32_t fileIdx) const {
128    typedef uint64_t (*GrepFunctionType)(int32_t fileDescriptor, const uint32_t fileIdx);
129    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
130   
131    uint64_t grepResult = f(fileDescriptor, fileIdx);
132    if (grepResult > 0) grepMatchFound = true;
133    else if ((Mode == NormalMode) && !resultStrs[fileIdx].str().empty()) grepMatchFound = true;
134   
135    if (Mode == CountOnly) {
136        resultStrs[fileIdx] << linePrefix[fileIdx] << grepResult << "\n";
137    }
138    else if (Mode == FilesWithMatch || Mode == FilesWithoutMatch ) {
139        size_t requiredCount = Mode == FilesWithMatch ? 1 : 0;
140        if (grepResult == requiredCount) {
141            resultStrs[fileIdx] << linePrefix[fileIdx];
142        }
143    }
144    else if (Mode == QuietMode) {
145        if (grepMatchFound) exit(MatchFoundExitCode);
146    }
147    return grepResult;
148}
149
150void initFileResult(std::vector<std::string> filenames){
151    grepMatchFound = false;
152    const int n = filenames.size();
153    linePrefix.resize(n);
154    if ((n > 1) && !NoFilenameFlag) {
155        WithFilenameFlag = true;
156    }
157    std::string fileSuffix = "";
158    bool setLinePrefix = WithFilenameFlag || (Mode == FilesWithMatch) || (Mode == FilesWithoutMatch);
159    if (setLinePrefix) {
160        if (NullFlag) {
161            fileSuffix = std::string("\0", 1);
162        }
163        else if ((Mode == NormalMode) && InitialTabFlag && !(LineNumberFlag || ByteOffsetFlag)) {
164            fileSuffix = "\t:";
165        }
166        else if ((Mode == NormalMode) || (Mode == CountOnly)) {
167            fileSuffix = ":";
168        }
169        else if ((Mode == FilesWithMatch) || (Mode == FilesWithoutMatch)) {
170            fileSuffix = "\n";
171        }
172    }
173    inputFiles = filenames;
174    resultStrs = new std::stringstream[n];
175    for (unsigned i = 0; i < inputFiles.size(); ++i) {
176        if (setLinePrefix) {
177            if (inputFiles[i] == "-") {
178                linePrefix[i] = LabelFlag + fileSuffix;
179            }
180            else {
181                linePrefix[i] = inputFiles[i] + fileSuffix;
182            }
183        }
184    }
185}
186
187template<typename CodeUnit>
188void wrapped_report_match(const size_t lineNum, size_t line_start, size_t line_end, const CodeUnit * const buffer, const size_t filesize, const size_t fileIdx) {
189
190//    errs().write_hex((size_t)buffer) << " : " << lineNum << " (" << line_start << ", " << line_end << ", " << filesize << ")\n";
191
192    assert (buffer);
193    assert (line_start <= line_end);
194    assert (line_end <= filesize);
195
196    if (WithFilenameFlag) {
197        resultStrs[fileIdx] << linePrefix[fileIdx];
198    }
199    if (LineNumberFlag) {
200        // Internally line numbers are counted from 0.  For display, adjust
201        // the line number so that lines are numbered from 1.
202        if (InitialTabFlag) {
203            resultStrs[fileIdx] << lineNum+1 << "\t:";
204        }
205        else {
206            resultStrs[fileIdx] << lineNum+1 << ":";
207        }
208    }
209
210    // If the line "starts" on the LF of a CRLF, it is actually the end of the last line.
211    if ((buffer[line_start] == 0xA) && (line_start != line_end)) {
212        ++line_start;
213    }
214
215    if (LLVM_UNLIKELY(line_end == filesize)) {
216        // The match position is at end-of-file.   We have a final unterminated line.
217        resultStrs[fileIdx].write((char *)&buffer[line_start], (line_end - line_start) * sizeof(CodeUnit));
218        if (NormalizeLineBreaksFlag) {
219            resultStrs[fileIdx] << '\n';  // terminate it
220        }
221    } else {
222        const auto end_byte = buffer[line_end];
223        if (grep::NormalizeLineBreaksFlag) {
224            if (LLVM_UNLIKELY(end_byte == 0x85)) {
225                // Line terminated with NEL, on the second byte.  Back up 1.
226                line_end -= 1;
227            } else if (LLVM_UNLIKELY(end_byte > 0xD)) {
228                // Line terminated with PS or LS, on the third byte.  Back up 2.
229                line_end -= 2;
230            }
231            resultStrs[fileIdx].write((char *)&buffer[line_start], (line_end - line_start) * sizeof(CodeUnit));
232            resultStrs[fileIdx] << '\n';
233        } else {
234            if (end_byte == 0x0D) {
235                // Check for line_end on first byte of CRLF; we don't want to access past the end of buffer.
236                if ((line_end + 1) < filesize) {
237                    if (buffer[line_end + 1] == 0x0A) {
238                        // Found CRLF; preserve both bytes.
239                        ++line_end;
240                    }
241                }
242            }
243            resultStrs[fileIdx].write((char *)&buffer[line_start], (line_end - line_start + 1) * sizeof(CodeUnit));
244        }
245    }
246}
247
248void PrintResults(){
249   
250    for (unsigned i = 0; i < inputFiles.size(); ++i){
251        std::cout << resultStrs[i].str();
252    }
253    exit(grepMatchFound ? MatchFoundExitCode : MatchNotFoundExitCode);
254}
255
256   
257std::pair<StreamSetBuffer *, StreamSetBuffer *> grepPipeline(Driver * grepDriver, std::vector<re::RE *> & REs, const GrepModeType grepMode, unsigned encodingBits, StreamSetBuffer * ByteStream) {
258    auto & idb = grepDriver->getBuilder();
259    const unsigned segmentSize = codegen::SegmentSize;
260    const unsigned bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
261    size_t MatchLimit = ((grepMode == QuietMode) | (grepMode == FilesWithMatch) | (grepMode == FilesWithoutMatch)) ? 1 : MaxCountFlag;
262   
263
264    StreamSetBuffer * BasisBits = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(encodingBits, 1), segmentSize * bufferSegments));
265    kernel::Kernel * s2pk = grepDriver->addKernelInstance(make_unique<kernel::S2PKernel>(idb));
266    grepDriver->makeKernelCall(s2pk, {ByteStream}, {BasisBits});
267   
268    StreamSetBuffer * LineBreakStream = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
269    kernel::Kernel * linebreakK = grepDriver->addKernelInstance(make_unique<kernel::LineBreakKernelBuilder>(idb, encodingBits));
270    grepDriver->makeKernelCall(linebreakK, {BasisBits}, {LineBreakStream});
271   
272    kernel::Kernel * requiredStreamsK = grepDriver->addKernelInstance(make_unique<kernel::RequiredStreams_UTF8>(idb));
273    StreamSetBuffer * RequiredStreams = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(4, 1), segmentSize * bufferSegments));
274    grepDriver->makeKernelCall(requiredStreamsK, {BasisBits}, {RequiredStreams});
275   
276    const auto n = REs.size();
277   
278    std::vector<std::vector<UCD::UnicodeSet>> charclasses(n);
279
280    for (unsigned i = 0; i < n; i++) {
281        REs[i] = re::resolveNames(REs[i]);
282        std::vector<UCD::UnicodeSet> UnicodeSets = re::collect_UnicodeSets(REs[i]);
283        std::vector<std::vector<unsigned>> exclusiveSetIDs;
284        doMultiplexCCs(UnicodeSets, exclusiveSetIDs, charclasses[i]);
285        REs[i] = multiplex(REs[i], UnicodeSets, exclusiveSetIDs);
286    } 
287
288    std::vector<StreamSetBuffer *> MatchResultsBufs(n);
289
290    for(unsigned i = 0; i < n; ++i){
291        const auto numOfCharacterClasses = charclasses[i].size();
292        StreamSetBuffer * CharClasses = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), segmentSize * bufferSegments));
293        kernel::Kernel * ccK = grepDriver->addKernelInstance(make_unique<kernel::CharClassesKernel>(idb, std::move(charclasses[i])));
294        grepDriver->makeKernelCall(ccK, {BasisBits}, {CharClasses});
295        StreamSetBuffer * MatchResults = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
296        kernel::Kernel * icgrepK = grepDriver->addKernelInstance(make_unique<kernel::ICGrepKernel>(idb, REs[i], numOfCharacterClasses));
297        grepDriver->makeKernelCall(icgrepK, {CharClasses, LineBreakStream, RequiredStreams}, {MatchResults});
298        MatchResultsBufs[i] = MatchResults;
299    }
300    StreamSetBuffer * MergedResults = MatchResultsBufs[0];
301    if (REs.size() > 1) {
302        MergedResults = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
303        kernel::Kernel * streamsMergeK = grepDriver->addKernelInstance(make_unique<kernel::StreamsMerge>(idb, 1, REs.size()));
304        grepDriver->makeKernelCall(streamsMergeK, MatchResultsBufs, {MergedResults});
305    }
306    StreamSetBuffer * Matches = MergedResults;
307   
308    if (matchesNeedToBeMovedToEOL()) {
309        StreamSetBuffer * OriginalMatches = Matches;
310        kernel::Kernel * matchedLinesK = grepDriver->addKernelInstance(make_unique<kernel::MatchedLinesKernel>(idb));
311        Matches = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
312        grepDriver->makeKernelCall(matchedLinesK, {OriginalMatches, LineBreakStream}, {Matches});
313    }
314   
315    if (InvertMatchFlag) {
316        kernel::Kernel * invertK = grepDriver->addKernelInstance(make_unique<kernel::InvertMatchesKernel>(idb));
317        StreamSetBuffer * OriginalMatches = Matches;
318        Matches = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
319        grepDriver->makeKernelCall(invertK, {OriginalMatches, LineBreakStream}, {Matches});
320    }
321    if (MatchLimit > 0) {
322        kernel::Kernel * untilK = grepDriver->addKernelInstance(make_unique<kernel::UntilNkernel>(idb));
323        untilK->setInitialArguments({idb->getSize(MatchLimit)});
324        StreamSetBuffer * AllMatches = Matches;
325        Matches = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
326        grepDriver->makeKernelCall(untilK, {AllMatches}, {Matches});
327    }
328    return std::pair<StreamSetBuffer *, StreamSetBuffer *>(LineBreakStream, Matches);
329}
330
331void GrepEngine::grepCodeGen(std::vector<re::RE *> REs, const GrepModeType grepMode) {
332
333    assert (mGrepDriver == nullptr);
334    mGrepDriver = new ParabixDriver("engine");
335    auto & idb = mGrepDriver->getBuilder();
336    Module * M = idb->getModule();
337
338    const unsigned segmentSize = codegen::SegmentSize;
339    const unsigned encodingBits = 8;
340
341    Type * const int64Ty = idb->getInt64Ty();
342    Type * const int32Ty = idb->getInt32Ty();
343
344    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", int64Ty, idb->getInt32Ty(), int32Ty, nullptr));
345    mainFunc->setCallingConv(CallingConv::C);
346    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
347    auto args = mainFunc->arg_begin();
348
349    Value * const fileDescriptor = &*(args++);
350    fileDescriptor->setName("fileDescriptor");
351    Value * fileIdx = &*(args++);
352    fileIdx->setName("fileIdx");
353
354    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer(make_unique<SourceBuffer>(idb, idb->getStreamSetTy(1, 8)));
355    kernel::Kernel * sourceK = mGrepDriver->addKernelInstance(make_unique<kernel::FDSourceKernel>(idb, segmentSize));
356    sourceK->setInitialArguments({fileDescriptor});
357    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
358   
359    StreamSetBuffer * LineBreakStream;
360    StreamSetBuffer * Matches;
361    std::tie(LineBreakStream, Matches) = grepPipeline(mGrepDriver, REs, grepMode, encodingBits, ByteStream);
362   
363    if (grepMode == NormalMode) {
364        kernel::Kernel * scanMatchK = mGrepDriver->addKernelInstance(make_unique<kernel::ScanMatchKernel>(idb, GrepType::Normal, encodingBits));
365        scanMatchK->setInitialArguments({fileIdx});
366        mGrepDriver->makeKernelCall(scanMatchK, {Matches, LineBreakStream, ByteStream}, {});
367        mGrepDriver->LinkFunction(*scanMatchK, "matcher", &wrapped_report_match<uint8_t>);
368        mGrepDriver->generatePipelineIR();
369        mGrepDriver->deallocateBuffers();
370
371        idb->CreateRet(idb->getInt64(0));
372    } else {
373        kernel::Kernel * matchCountK = mGrepDriver->addKernelInstance(make_unique<kernel::PopcountKernel>(idb));
374        mGrepDriver->makeKernelCall(matchCountK, {Matches}, {});
375        mGrepDriver->generatePipelineIR();
376        idb->setKernel(matchCountK);
377        Value * matchedLineCount = idb->getAccumulator("countResult");
378        matchedLineCount = idb->CreateZExt(matchedLineCount, int64Ty);
379        mGrepDriver->deallocateBuffers();
380        idb->CreateRet(matchedLineCount);
381    }
382    mGrepDriver->finalizeObject();
383}
384
385GrepEngine::GrepEngine()
386: mGrepDriver(nullptr) {
387
388}
389
390GrepEngine::~GrepEngine() {
391    delete mGrepDriver;
392}
393
394}
Note: See TracBrowser for help on using the repository browser.