source: icGREP/icgrep-devel/icgrep/grep_engine.cpp @ 5698

Last change on this file since 5698 was 5698, checked in by cameron, 22 months ago

Modularization progress

File size: 16.7 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "grep_engine.h"
8#include "grep_interface.h"
9#include <llvm/IR/Module.h>
10#include <boost/filesystem.hpp>
11#include <UCD/resolve_properties.h>
12#include <kernels/charclasses.h>
13#include <kernels/cc_kernel.h>
14#include <kernels/grep_kernel.h>
15#include <kernels/linebreak_kernel.h>
16#include <kernels/streams_merge.h>
17#include <kernels/source_kernel.h>
18#include <kernels/s2p_kernel.h>
19#include <kernels/scanmatchgen.h>
20#include <kernels/streamset.h>
21#include <kernels/until_n.h>
22#include <kernels/kernel_builder.h>
23#include <pablo/pablo_kernel.h>
24#include <re/re_cc.h>
25#include <re/re_toolchain.h>
26#include <toolchain/toolchain.h>
27#include <re/re_name_resolve.h>   
28#include <re/re_collect_unicodesets.h>
29#include <re/re_multiplex.h>
30#include <toolchain/cpudriver.h>
31#include <toolchain/NVPTXDriver.h>
32#include <iostream>
33#include <cc/multiplex_CCs.h>
34#include <llvm/Support/raw_ostream.h>
35#include <util/aligned_allocator.h>
36#include <sys/stat.h>
37#include <fcntl.h>
38#include <errno.h>
39#include <llvm/ADT/STLExtras.h> // for make_unique
40
41using namespace parabix;
42using namespace llvm;
43
44namespace grep {
45
46
47// DoGrep thread function.
48void *DoGrepThreadFunction(void *args)
49{
50    size_t fileIdx;
51    grep::GrepEngine * grepEngine = (grep::GrepEngine *)args;
52
53    grepEngine->count_mutex.lock();
54    fileIdx = grepEngine->fileCount;
55    grepEngine->fileCount++;
56    grepEngine->count_mutex.unlock();
57
58    while (fileIdx < grepEngine->inputFiles.size()) {
59        size_t grepResult = grepEngine->doGrep(grepEngine->inputFiles[fileIdx], fileIdx);
60       
61        grepEngine->count_mutex.lock();
62        if (grepResult > 0) grepEngine->grepMatchFound = true;
63        fileIdx = grepEngine->fileCount;
64        grepEngine->fileCount++;
65        grepEngine->count_mutex.unlock();
66        if (QuietMode && grepEngine->grepMatchFound) pthread_exit(nullptr);
67    }
68
69    pthread_exit(nullptr);
70}
71   
72    //
73    //  Default Report Match:  lines are emitted with whatever line terminators are found in the
74    //  input.  However, if the final line is not terminated, a new line is appended.
75   
76    void NonNormalizingReportMatch::accumulate_match (const size_t lineNum, char * line_start, char * line_end) {
77        if (!(WithFilenameFlag | LineNumberFlag) && (line_start == mPrevious_line_end + 1)) {
78            // Consecutive matches: only one write call needed.
79            mResultStr.write(mPrevious_line_end, line_end - mPrevious_line_end);
80        }
81        else {
82            if (mLineCount > 0) {
83                // deal with the final byte of the previous line.
84                mResultStr.write(mPrevious_line_end, 1);
85            }
86            if (WithFilenameFlag) {
87                mResultStr << mLinePrefix;
88            }
89            if (LineNumberFlag) {
90                // Internally line numbers are counted from 0.  For display, adjust
91                // the line number so that lines are numbered from 1.
92                if (InitialTabFlag) {
93                    mResultStr << lineNum+1 << "\t:";
94                }
95                else {
96                    mResultStr << lineNum+1 << ":";
97                }
98            }
99            mResultStr.write(line_start, line_end - line_start);
100        }
101        mPrevious_line_end = line_end;
102        mLineCount++;
103    }
104   
105    void NonNormalizingReportMatch::finalize_match(char * buffer_end) {
106        if (mLineCount == 0) return;  // No matches.
107        if (mPrevious_line_end < buffer_end) {
108            mResultStr.write(mPrevious_line_end, 1);
109        }
110        else {
111            // Likely unterminated final line.
112            char last_byte = mPrevious_line_end[-1];
113            if (last_byte == 0x0D) {
114                // The final CR is acceptable as a line_end.
115                return;
116            }
117            // Terminate the line with an LF
118            // (Even if we had an incomplete UTF-8 sequence.)
119            mResultStr << "\n";
120        }
121    }
122   
123   
124
125bool matchesNeedToBeMovedToEOL() {
126    if ((Mode == QuietMode) | (Mode == FilesWithMatch) | (Mode == FilesWithoutMatch)) {
127        return false;
128    }
129    else if (LineRegexpFlag) {
130        return false;
131    }
132    // TODO: return false for other cases based on regexp analysis, e.g., regexp ends with $.
133    return true;
134}
135   
136uint64_t GrepEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) {
137    if (fileName == "-") {
138        return doGrep(STDIN_FILENO, fileIdx);
139    }
140    struct stat sb;
141    const int32_t fd = open(fileName.c_str(), O_RDONLY);
142    if (LLVM_UNLIKELY(fd == -1)) {
143        if (!NoMessagesFlag) {
144            if (errno == EACCES) {
145                resultAccums[fileIdx]->mResultStr << "icgrep: " << fileName << ": Permission denied.\n";
146            }
147            else if (errno == ENOENT) {
148                resultAccums[fileIdx]->mResultStr << "icgrep: " << fileName << ": No such file.\n";
149            }
150            else {
151                resultAccums[fileIdx]->mResultStr << "icgrep: " << fileName << ": Failed.\n";
152            }
153        }
154        return 0;
155    }
156    if (stat(fileName.c_str(), &sb) == 0 && S_ISDIR(sb.st_mode)) {
157        if (!NoMessagesFlag) {
158            resultAccums[fileIdx]->mResultStr << "icgrep: " << fileName << ": Is a directory.\n";
159        }
160        close(fd);
161        return 0;
162    }
163    const auto result = doGrep(fd, fileIdx);
164    close(fd);
165    return result;
166}
167
168uint64_t GrepEngine::doGrep(const int32_t fileDescriptor, const uint32_t fileIdx) {
169    typedef uint64_t (*GrepFunctionType)(int32_t fileDescriptor, intptr_t accum_addr);
170    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
171   
172    uint64_t grepResult = f(fileDescriptor, reinterpret_cast<intptr_t>(resultAccums[fileIdx].get()));
173    if (grepResult > 0) grepMatchFound = true;
174    else if ((Mode == NormalMode) && !resultAccums[fileIdx]->mResultStr.str().empty()) grepMatchFound = true;
175   
176    if (Mode == CountOnly) {
177        resultAccums[fileIdx]->mResultStr << resultAccums[fileIdx]->mLinePrefix << grepResult << "\n";
178    }
179    else if (Mode == FilesWithMatch || Mode == FilesWithoutMatch ) {
180        size_t requiredCount = Mode == FilesWithMatch ? 1 : 0;
181        if (grepResult == requiredCount) {
182            resultAccums[fileIdx]->mResultStr << resultAccums[fileIdx]->mLinePrefix;
183        }
184    }
185    else if (Mode == QuietMode) {
186        if (grepMatchFound) exit(MatchFoundExitCode);
187    }
188    return grepResult;
189}
190
191void GrepEngine::initFileResult(std::vector<std::string> filenames){
192    grepMatchFound = false;
193    const int n = filenames.size();
194    if ((n > 1) && !NoFilenameFlag) {
195        WithFilenameFlag = true;
196    }
197    std::string fileSuffix = "";
198    bool setLinePrefix = WithFilenameFlag || (Mode == FilesWithMatch) || (Mode == FilesWithoutMatch);
199    if (setLinePrefix) {
200        if (NullFlag) {
201            fileSuffix = std::string("\0", 1);
202        }
203        else if ((Mode == NormalMode) && InitialTabFlag && !(LineNumberFlag || ByteOffsetFlag)) {
204            fileSuffix = "\t:";
205        }
206        else if ((Mode == NormalMode) || (Mode == CountOnly)) {
207            fileSuffix = ":";
208        }
209        else if ((Mode == FilesWithMatch) || (Mode == FilesWithoutMatch)) {
210            fileSuffix = "\n";
211        }
212    }
213    inputFiles = filenames;
214    for (unsigned i = 0; i < inputFiles.size(); ++i) {
215        std::string linePrefix;
216        if (setLinePrefix) {
217            if (inputFiles[i] == "-") {
218                linePrefix = LabelFlag + fileSuffix;
219            }
220            else {
221                linePrefix = inputFiles[i] + fileSuffix;
222            }
223        }
224        resultAccums.push_back(make_unique<NonNormalizingReportMatch>(linePrefix));
225    }
226}
227
228
229void GrepEngine::PrintResults(){
230   
231    for (unsigned i = 0; i < inputFiles.size(); ++i){
232        std::cout << resultAccums[i]->mResultStr.str();
233    }
234    exit(grepMatchFound ? MatchFoundExitCode : MatchNotFoundExitCode);
235}
236
237   
238std::pair<StreamSetBuffer *, StreamSetBuffer *> grepPipeline(Driver * grepDriver, std::vector<re::RE *> & REs, StreamSetBuffer * ByteStream) {
239    auto & idb = grepDriver->getBuilder();
240    const unsigned segmentSize = codegen::SegmentSize;
241    const unsigned bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
242    const unsigned encodingBits = 8;
243
244    StreamSetBuffer * BasisBits = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(encodingBits, 1), segmentSize * bufferSegments));
245    kernel::Kernel * s2pk = grepDriver->addKernelInstance(make_unique<kernel::S2PKernel>(idb));
246    grepDriver->makeKernelCall(s2pk, {ByteStream}, {BasisBits});
247   
248    StreamSetBuffer * LineBreakStream = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
249    kernel::Kernel * linebreakK = grepDriver->addKernelInstance(make_unique<kernel::LineBreakKernelBuilder>(idb, encodingBits));
250    grepDriver->makeKernelCall(linebreakK, {BasisBits}, {LineBreakStream});
251   
252    kernel::Kernel * requiredStreamsK = grepDriver->addKernelInstance(make_unique<kernel::RequiredStreams_UTF8>(idb));
253    StreamSetBuffer * RequiredStreams = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(4, 1), segmentSize * bufferSegments));
254    grepDriver->makeKernelCall(requiredStreamsK, {BasisBits}, {RequiredStreams});
255   
256    const auto n = REs.size();
257   
258    std::vector<std::vector<UCD::UnicodeSet>> charclasses(n);
259
260    for (unsigned i = 0; i < n; i++) {
261        REs[i] = re::resolveNames(REs[i]);
262        std::vector<UCD::UnicodeSet> UnicodeSets = re::collect_UnicodeSets(REs[i]);
263        std::vector<std::vector<unsigned>> exclusiveSetIDs;
264        doMultiplexCCs(UnicodeSets, exclusiveSetIDs, charclasses[i]);
265        REs[i] = multiplex(REs[i], UnicodeSets, exclusiveSetIDs);
266    } 
267
268    std::vector<StreamSetBuffer *> MatchResultsBufs(n);
269
270    for(unsigned i = 0; i < n; ++i){
271        const auto numOfCharacterClasses = charclasses[i].size();
272        StreamSetBuffer * CharClasses = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), segmentSize * bufferSegments));
273        kernel::Kernel * ccK = grepDriver->addKernelInstance(make_unique<kernel::CharClassesKernel>(idb, std::move(charclasses[i])));
274        grepDriver->makeKernelCall(ccK, {BasisBits}, {CharClasses});
275        StreamSetBuffer * MatchResults = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
276        kernel::Kernel * icgrepK = grepDriver->addKernelInstance(make_unique<kernel::ICGrepKernel>(idb, REs[i], numOfCharacterClasses));
277        grepDriver->makeKernelCall(icgrepK, {CharClasses, LineBreakStream, RequiredStreams}, {MatchResults});
278        MatchResultsBufs[i] = MatchResults;
279    }
280    StreamSetBuffer * MergedResults = MatchResultsBufs[0];
281    if (REs.size() > 1) {
282        MergedResults = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
283        kernel::Kernel * streamsMergeK = grepDriver->addKernelInstance(make_unique<kernel::StreamsMerge>(idb, 1, REs.size()));
284        grepDriver->makeKernelCall(streamsMergeK, MatchResultsBufs, {MergedResults});
285    }
286    StreamSetBuffer * Matches = MergedResults;
287   
288    if (matchesNeedToBeMovedToEOL()) {
289        StreamSetBuffer * OriginalMatches = Matches;
290        kernel::Kernel * matchedLinesK = grepDriver->addKernelInstance(make_unique<kernel::MatchedLinesKernel>(idb));
291        Matches = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
292        grepDriver->makeKernelCall(matchedLinesK, {OriginalMatches, LineBreakStream}, {Matches});
293    }
294   
295    if (InvertMatchFlag) {
296        kernel::Kernel * invertK = grepDriver->addKernelInstance(make_unique<kernel::InvertMatchesKernel>(idb));
297        StreamSetBuffer * OriginalMatches = Matches;
298        Matches = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
299        grepDriver->makeKernelCall(invertK, {OriginalMatches, LineBreakStream}, {Matches});
300    }
301    if (MaxCountFlag > 0) {
302        kernel::Kernel * untilK = grepDriver->addKernelInstance(make_unique<kernel::UntilNkernel>(idb));
303        untilK->setInitialArguments({idb->getSize(MaxCountFlag)});
304        StreamSetBuffer * AllMatches = Matches;
305        Matches = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
306        grepDriver->makeKernelCall(untilK, {AllMatches}, {Matches});
307    }
308    return std::pair<StreamSetBuffer *, StreamSetBuffer *>(LineBreakStream, Matches);
309}
310
311    void GrepEngine::grepCodeGen(std::vector<re::RE *> REs) {
312       
313        assert (mGrepDriver == nullptr);
314        mGrepDriver = new ParabixDriver("engine");
315        auto & idb = mGrepDriver->getBuilder();
316        Module * M = idb->getModule();
317       
318        const unsigned segmentSize = codegen::SegmentSize;
319        const unsigned encodingBits = 8;
320       
321        Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getInt64Ty(), idb->getInt32Ty(), idb->getIntAddrTy(), nullptr));
322        mainFunc->setCallingConv(CallingConv::C);
323        idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
324        auto args = mainFunc->arg_begin();
325       
326        Value * const fileDescriptor = &*(args++);
327        fileDescriptor->setName("fileDescriptor");
328        Value * match_accumulator = &*(args++);
329        match_accumulator->setName("match_accumulator");
330       
331        StreamSetBuffer * ByteStream = mGrepDriver->addBuffer(make_unique<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits)));
332        kernel::Kernel * sourceK = mGrepDriver->addKernelInstance(make_unique<kernel::FDSourceKernel>(idb, segmentSize));
333        sourceK->setInitialArguments({fileDescriptor});
334        mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
335       
336        StreamSetBuffer * LineBreakStream;
337        StreamSetBuffer * Matches;
338        std::tie(LineBreakStream, Matches) = grepPipeline(mGrepDriver, REs, ByteStream);
339       
340        kernel::Kernel * scanMatchK = mGrepDriver->addKernelInstance(make_unique<kernel::ScanMatchKernel>(idb));
341        scanMatchK->setInitialArguments({match_accumulator});
342        mGrepDriver->makeKernelCall(scanMatchK, {Matches, LineBreakStream, ByteStream}, {});
343        mGrepDriver->LinkFunction(*scanMatchK, "accumulate_match_wrapper", &accumulate_match_wrapper);
344        mGrepDriver->LinkFunction(*scanMatchK, "finalize_match_wrapper", &finalize_match_wrapper);
345       
346       
347        mGrepDriver->generatePipelineIR();
348        mGrepDriver->deallocateBuffers();
349       
350        idb->CreateRet(idb->getInt64(0));
351        mGrepDriver->finalizeObject();
352    }
353   
354    void CountOnlyGrepEngine::grepCodeGen(std::vector<re::RE *> REs) {
355       
356        assert (mGrepDriver == nullptr);
357        mGrepDriver = new ParabixDriver("engine");
358        auto & idb = mGrepDriver->getBuilder();
359        Module * M = idb->getModule();
360       
361        const unsigned segmentSize = codegen::SegmentSize;
362        const unsigned encodingBits = 8;
363       
364        Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", idb->getInt64Ty(), idb->getInt32Ty(), idb->getIntAddrTy(), nullptr));
365        mainFunc->setCallingConv(CallingConv::C);
366        idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
367        auto args = mainFunc->arg_begin();
368       
369        Value * const fileDescriptor = &*(args++);
370        fileDescriptor->setName("fileDescriptor");
371        Value * match_accumulator = &*(args++);
372        match_accumulator->setName("match_accumulator");
373       
374        StreamSetBuffer * ByteStream = mGrepDriver->addBuffer(make_unique<SourceBuffer>(idb, idb->getStreamSetTy(1, encodingBits)));
375        kernel::Kernel * sourceK = mGrepDriver->addKernelInstance(make_unique<kernel::FDSourceKernel>(idb, segmentSize));
376        sourceK->setInitialArguments({fileDescriptor});
377        mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
378       
379        StreamSetBuffer * LineBreakStream;
380        StreamSetBuffer * Matches;
381        std::tie(LineBreakStream, Matches) = grepPipeline(mGrepDriver, REs, ByteStream);
382       
383        kernel::Kernel * matchCountK = mGrepDriver->addKernelInstance(make_unique<kernel::PopcountKernel>(idb));
384        mGrepDriver->makeKernelCall(matchCountK, {Matches}, {});
385        mGrepDriver->generatePipelineIR();
386        idb->setKernel(matchCountK);
387        Value * matchedLineCount = idb->getAccumulator("countResult");
388        matchedLineCount = idb->CreateZExt(matchedLineCount, idb->getInt64Ty());
389        mGrepDriver->deallocateBuffers();
390        idb->CreateRet(matchedLineCount);
391        mGrepDriver->finalizeObject();
392    }
393   
394
395GrepEngine::~GrepEngine() {
396    delete mGrepDriver;
397}
398
399}
Note: See TracBrowser for help on using the repository browser.