source: icGREP/icgrep-devel/icgrep/grep_engine.cpp @ 5692

Last change on this file since 5692 was 5692, checked in by cameron, 18 months ago

Cleaning up icgrep - removing experimental nvptx and incomplete UTF-16 modes

File size: 15.9 KB
Line 
1/*
2 *  Copyright (c) 2017 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "grep_engine.h"
8#include "grep_interface.h"
9#include <llvm/IR/Module.h>
10#include <boost/filesystem.hpp>
11#include <UCD/resolve_properties.h>
12#include <kernels/charclasses.h>
13#include <kernels/cc_kernel.h>
14#include <kernels/grep_kernel.h>
15#include <kernels/linebreak_kernel.h>
16#include <kernels/streams_merge.h>
17#include <kernels/source_kernel.h>
18#include <kernels/s2p_kernel.h>
19#include <kernels/scanmatchgen.h>
20#include <kernels/streamset.h>
21#include <kernels/until_n.h>
22#include <kernels/kernel_builder.h>
23#include <pablo/pablo_kernel.h>
24#include <re/re_cc.h>
25#include <re/re_toolchain.h>
26#include <toolchain/toolchain.h>
27#include <re/re_name_resolve.h>   
28#include <re/re_collect_unicodesets.h>
29#include <re/re_multiplex.h>
30#include <toolchain/cpudriver.h>
31#include <toolchain/NVPTXDriver.h>
32#include <iostream>
33#include <sstream>
34#include <cc/multiplex_CCs.h>
35#include <llvm/Support/raw_ostream.h>
36#include <util/aligned_allocator.h>
37#include <sys/stat.h>
38#include <fcntl.h>
39#include <errno.h>
40#include <mutex>
41
42using namespace parabix;
43using namespace llvm;
44
45namespace grep {
46
47static std::stringstream * resultStrs = nullptr;
48static std::vector<std::string> inputFiles;
49static std::vector<std::string> linePrefix;
50static bool grepMatchFound;
51
52size_t * startPoints = nullptr;
53size_t * accumBytes = nullptr;
54
55
56std::mutex count_mutex;
57size_t fileCount;
58
59// DoGrep thread function.
60void *DoGrepThreadFunction(void *args)
61{
62    size_t fileIdx;
63    grep::GrepEngine * grepEngine = (grep::GrepEngine *)args;
64
65    count_mutex.lock();
66    fileIdx = fileCount;
67    fileCount++;
68    count_mutex.unlock();
69
70    while (fileIdx < inputFiles.size()) {
71        size_t grepResult = grepEngine->doGrep(inputFiles[fileIdx], fileIdx);
72       
73        count_mutex.lock();
74        if (grepResult > 0) grepMatchFound = true;
75        fileIdx = fileCount;
76        fileCount++;
77        count_mutex.unlock();
78        if (QuietMode && grepMatchFound) pthread_exit(nullptr);
79    }
80
81    pthread_exit(nullptr);
82}
83
84bool matchesNeedToBeMovedToEOL() {
85    if ((Mode == QuietMode) | (Mode == FilesWithMatch) | (Mode == FilesWithoutMatch)) {
86        return false;
87    }
88    else if (LineRegexpFlag) {
89        return false;
90    }
91    // TODO: return false for other cases based on regexp analysis, e.g., regexp ends with $.
92    return true;
93}
94   
95uint64_t GrepEngine::doGrep(const std::string & fileName, const uint32_t fileIdx) const {
96    struct stat sb;
97    const int32_t fd = open(fileName.c_str(), O_RDONLY);
98    if (LLVM_UNLIKELY(fd == -1)) {
99        if (!NoMessagesFlag  && !(Mode == QuietMode)) {
100            if (errno == EACCES) {
101                resultStrs[fileIdx] << "icgrep: " << fileName << ": Permission denied.\n";
102            }
103            else if (errno == ENOENT) {
104                resultStrs[fileIdx] << "icgrep: " << fileName << ": No such file.\n";
105            }
106            else {
107                resultStrs[fileIdx] << "icgrep: " << fileName << ": Failed.\n";
108            }
109        }
110        return 0;
111    }
112    if (stat(fileName.c_str(), &sb) == 0 && S_ISDIR(sb.st_mode)) {
113        if (!NoMessagesFlag  && !(Mode == QuietMode)) {
114            resultStrs[fileIdx] << "icgrep: " << fileName << ": Is a directory.\n";
115        }
116        close(fd);
117        return 0;
118    }
119    const auto result = doGrep(fd, fileIdx);
120    close(fd);
121    return result;
122}
123
124uint64_t GrepEngine::doGrep(const int32_t fileDescriptor, const uint32_t fileIdx) const {
125    assert (mGrepDriver);
126    typedef uint64_t (*GrepFunctionType)(int32_t fileDescriptor, const uint32_t fileIdx);
127    auto f = reinterpret_cast<GrepFunctionType>(mGrepDriver->getMain());
128   
129    uint64_t grepResult = f(fileDescriptor, fileIdx);
130    if (grepResult > 0) grepMatchFound = true;
131    else if ((Mode == NormalMode) && !resultStrs[fileIdx].str().empty()) grepMatchFound = true;
132   
133    if (Mode == CountOnly) {
134        resultStrs[fileIdx] << linePrefix[fileIdx] << grepResult << "\n";
135    }
136    else if (Mode == FilesWithMatch || Mode == FilesWithoutMatch ) {
137        size_t requiredCount = Mode == FilesWithMatch ? 1 : 0;
138        if (grepResult == requiredCount) {
139            resultStrs[fileIdx] << linePrefix[fileIdx];
140        }
141    }
142    else if (Mode == QuietMode) {
143        if (grepMatchFound) exit(MatchFoundExitCode);
144    }
145    return grepResult;
146}
147
148void initFileResult(std::vector<std::string> filenames){
149    grepMatchFound = false;
150    const int n = filenames.size();
151    linePrefix.resize(n);
152    if ((n > 1) && !NoFilenameFlag) {
153        WithFilenameFlag = true;
154    }
155    std::string fileSuffix = "";
156    bool setLinePrefix = WithFilenameFlag || (Mode == FilesWithMatch) || (Mode == FilesWithoutMatch);
157    if (setLinePrefix) {
158        if (NullFlag) {
159            fileSuffix = std::string("\0", 1);
160        }
161        else if ((Mode == NormalMode) && InitialTabFlag && !(LineNumberFlag || ByteOffsetFlag)) {
162            fileSuffix = "\t:";
163        }
164        else if ((Mode == NormalMode) || (Mode == CountOnly)) {
165            fileSuffix = ":";
166        }
167        else if ((Mode == FilesWithMatch) || (Mode == FilesWithoutMatch)) {
168            fileSuffix = "\n";
169        }
170    }
171    inputFiles = filenames;
172    resultStrs = new std::stringstream[n];
173    for (unsigned i = 0; i < inputFiles.size(); ++i) {
174        if (setLinePrefix) {
175            if (inputFiles[i] == "-") {
176                linePrefix[i] = LabelFlag + fileSuffix;
177            }
178            else {
179                linePrefix[i] = inputFiles[i] + fileSuffix;
180            }
181        }
182    }
183}
184
185template<typename CodeUnit>
186void wrapped_report_match(const size_t lineNum, size_t line_start, size_t line_end, const CodeUnit * const buffer, const size_t filesize, const size_t fileIdx) {
187
188//    errs().write_hex((size_t)buffer) << " : " << lineNum << " (" << line_start << ", " << line_end << ", " << filesize << ")\n";
189
190    assert (buffer);
191    assert (line_start <= line_end);
192    assert (line_end <= filesize);
193
194    if (WithFilenameFlag) {
195        resultStrs[fileIdx] << linePrefix[fileIdx];
196    }
197    if (LineNumberFlag) {
198        // Internally line numbers are counted from 0.  For display, adjust
199        // the line number so that lines are numbered from 1.
200        if (InitialTabFlag) {
201            resultStrs[fileIdx] << lineNum+1 << "\t:";
202        }
203        else {
204            resultStrs[fileIdx] << lineNum+1 << ":";
205        }
206    }
207
208    // If the line "starts" on the LF of a CRLF, it is actually the end of the last line.
209    if ((buffer[line_start] == 0xA) && (line_start != line_end)) {
210        ++line_start;
211    }
212
213    if (LLVM_UNLIKELY(line_end == filesize)) {
214        // The match position is at end-of-file.   We have a final unterminated line.
215        resultStrs[fileIdx].write((char *)&buffer[line_start], (line_end - line_start) * sizeof(CodeUnit));
216        if (NormalizeLineBreaksFlag) {
217            resultStrs[fileIdx] << '\n';  // terminate it
218        }
219    } else {
220        const auto end_byte = buffer[line_end];
221        if (grep::NormalizeLineBreaksFlag) {
222            if (LLVM_UNLIKELY(end_byte == 0x85)) {
223                // Line terminated with NEL, on the second byte.  Back up 1.
224                line_end -= 1;
225            } else if (LLVM_UNLIKELY(end_byte > 0xD)) {
226                // Line terminated with PS or LS, on the third byte.  Back up 2.
227                line_end -= 2;
228            }
229            resultStrs[fileIdx].write((char *)&buffer[line_start], (line_end - line_start) * sizeof(CodeUnit));
230            resultStrs[fileIdx] << '\n';
231        } else {
232            if (end_byte == 0x0D) {
233                // Check for line_end on first byte of CRLF; we don't want to access past the end of buffer.
234                if ((line_end + 1) < filesize) {
235                    if (buffer[line_end + 1] == 0x0A) {
236                        // Found CRLF; preserve both bytes.
237                        ++line_end;
238                    }
239                }
240            }
241            resultStrs[fileIdx].write((char *)&buffer[line_start], (line_end - line_start + 1) * sizeof(CodeUnit));
242        }
243    }
244}
245
246void PrintResults(){
247   
248    for (unsigned i = 0; i < inputFiles.size(); ++i){
249        std::cout << resultStrs[i].str();
250    }
251    exit(grepMatchFound ? MatchFoundExitCode : MatchNotFoundExitCode);
252}
253
254   
255std::pair<StreamSetBuffer *, StreamSetBuffer *> grepPipeline(Driver * grepDriver, std::vector<re::RE *> & REs, const GrepModeType grepMode, unsigned encodingBits, StreamSetBuffer * ByteStream) {
256    auto & idb = grepDriver->getBuilder();
257    const unsigned segmentSize = codegen::SegmentSize;
258    const unsigned bufferSegments = codegen::BufferSegments * codegen::ThreadNum;
259    size_t MatchLimit = ((grepMode == QuietMode) | (grepMode == FilesWithMatch) | (grepMode == FilesWithoutMatch)) ? 1 : MaxCountFlag;
260   
261
262    StreamSetBuffer * BasisBits = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(encodingBits, 1), segmentSize * bufferSegments));
263    kernel::Kernel * s2pk = grepDriver->addKernelInstance(make_unique<kernel::S2PKernel>(idb));
264    grepDriver->makeKernelCall(s2pk, {ByteStream}, {BasisBits});
265   
266    StreamSetBuffer * LineBreakStream = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
267    kernel::Kernel * linebreakK = grepDriver->addKernelInstance(make_unique<kernel::LineBreakKernelBuilder>(idb, encodingBits));
268    grepDriver->makeKernelCall(linebreakK, {BasisBits}, {LineBreakStream});
269   
270    kernel::Kernel * requiredStreamsK = grepDriver->addKernelInstance(make_unique<kernel::RequiredStreams_UTF8>(idb));
271    StreamSetBuffer * RequiredStreams = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(4, 1), segmentSize * bufferSegments));
272    grepDriver->makeKernelCall(requiredStreamsK, {BasisBits}, {RequiredStreams});
273   
274    const auto n = REs.size();
275   
276    std::vector<std::vector<UCD::UnicodeSet>> charclasses(n);
277
278    for (unsigned i = 0; i < n; i++) {
279        REs[i] = re::resolveNames(REs[i]);
280        std::vector<UCD::UnicodeSet> UnicodeSets = re::collect_UnicodeSets(REs[i]);
281        std::vector<std::vector<unsigned>> exclusiveSetIDs;
282        doMultiplexCCs(UnicodeSets, exclusiveSetIDs, charclasses[i]);
283        REs[i] = multiplex(REs[i], UnicodeSets, exclusiveSetIDs);
284    } 
285
286    std::vector<StreamSetBuffer *> MatchResultsBufs(n);
287
288    for(unsigned i = 0; i < n; ++i){
289        const auto numOfCharacterClasses = charclasses[i].size();
290        StreamSetBuffer * CharClasses = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(numOfCharacterClasses), segmentSize * bufferSegments));
291        kernel::Kernel * ccK = grepDriver->addKernelInstance(make_unique<kernel::CharClassesKernel>(idb, std::move(charclasses[i])));
292        grepDriver->makeKernelCall(ccK, {BasisBits}, {CharClasses});
293        StreamSetBuffer * MatchResults = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
294        kernel::Kernel * icgrepK = grepDriver->addKernelInstance(make_unique<kernel::ICGrepKernel>(idb, REs[i], numOfCharacterClasses));
295        grepDriver->makeKernelCall(icgrepK, {CharClasses, LineBreakStream, RequiredStreams}, {MatchResults});
296        MatchResultsBufs[i] = MatchResults;
297    }
298    StreamSetBuffer * MergedResults = MatchResultsBufs[0];
299    if (REs.size() > 1) {
300        MergedResults = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
301        kernel::Kernel * streamsMergeK = grepDriver->addKernelInstance(make_unique<kernel::StreamsMerge>(idb, 1, REs.size()));
302        grepDriver->makeKernelCall(streamsMergeK, MatchResultsBufs, {MergedResults});
303    }
304    StreamSetBuffer * Matches = MergedResults;
305   
306    if (matchesNeedToBeMovedToEOL()) {
307        StreamSetBuffer * OriginalMatches = Matches;
308        kernel::Kernel * matchedLinesK = grepDriver->addKernelInstance(make_unique<kernel::MatchedLinesKernel>(idb));
309        Matches = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
310        grepDriver->makeKernelCall(matchedLinesK, {OriginalMatches, LineBreakStream}, {Matches});
311    }
312   
313    if (InvertMatchFlag) {
314        kernel::Kernel * invertK = grepDriver->addKernelInstance(make_unique<kernel::InvertMatchesKernel>(idb));
315        StreamSetBuffer * OriginalMatches = Matches;
316        Matches = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
317        grepDriver->makeKernelCall(invertK, {OriginalMatches, LineBreakStream}, {Matches});
318    }
319    if (MatchLimit > 0) {
320        kernel::Kernel * untilK = grepDriver->addKernelInstance(make_unique<kernel::UntilNkernel>(idb));
321        untilK->setInitialArguments({idb->getSize(MatchLimit)});
322        StreamSetBuffer * AllMatches = Matches;
323        Matches = grepDriver->addBuffer(make_unique<CircularBuffer>(idb, idb->getStreamSetTy(1, 1), segmentSize * bufferSegments));
324        grepDriver->makeKernelCall(untilK, {AllMatches}, {Matches});
325    }
326    return std::pair<StreamSetBuffer *, StreamSetBuffer *>(LineBreakStream, Matches);
327}
328
329void GrepEngine::grepCodeGen(std::vector<re::RE *> REs, const GrepModeType grepMode, GrepSource grepSource) {
330
331    assert (mGrepDriver == nullptr);
332    mGrepDriver = new ParabixDriver("engine");
333    auto & idb = mGrepDriver->getBuilder();
334    Module * M = idb->getModule();
335
336    const unsigned segmentSize = codegen::SegmentSize;
337    const unsigned encodingBits = 8;
338
339    Type * const int64Ty = idb->getInt64Ty();
340    Type * const int32Ty = idb->getInt32Ty();
341
342    kernel::Kernel * sourceK = nullptr;
343   
344    Function * mainFunc = cast<Function>(M->getOrInsertFunction("Main", int64Ty, idb->getInt32Ty(), int32Ty, nullptr));
345    mainFunc->setCallingConv(CallingConv::C);
346    idb->SetInsertPoint(BasicBlock::Create(M->getContext(), "entry", mainFunc, 0));
347    auto args = mainFunc->arg_begin();
348
349    Value * const fileDescriptor = &*(args++);
350    fileDescriptor->setName("fileDescriptor");
351    Value * fileIdx = &*(args++);
352    fileIdx->setName("fileIdx");
353
354    StreamSetBuffer * ByteStream = mGrepDriver->addBuffer(make_unique<SourceBuffer>(idb, idb->getStreamSetTy(1, 8)));
355
356    if (grepSource == GrepSource::File) {
357        sourceK = mGrepDriver->addKernelInstance(make_unique<kernel::MMapSourceKernel>(idb, segmentSize));
358    } else {
359        sourceK = mGrepDriver->addKernelInstance(make_unique<kernel::ReadSourceKernel>(idb, segmentSize));
360    }
361    sourceK->setInitialArguments({fileDescriptor});
362
363    mGrepDriver->makeKernelCall(sourceK, {}, {ByteStream});
364   
365    StreamSetBuffer * LineBreakStream;
366    StreamSetBuffer * Matches;
367    std::tie(LineBreakStream, Matches) = grepPipeline(mGrepDriver, REs, grepMode, encodingBits, ByteStream);
368   
369    if (grepMode == NormalMode) {
370        kernel::Kernel * scanMatchK = mGrepDriver->addKernelInstance(make_unique<kernel::ScanMatchKernel>(idb, GrepType::Normal, encodingBits));
371        scanMatchK->setInitialArguments({fileIdx});
372        mGrepDriver->makeKernelCall(scanMatchK, {Matches, LineBreakStream, ByteStream}, {});
373        mGrepDriver->LinkFunction(*scanMatchK, "matcher", &wrapped_report_match<uint8_t>);
374        mGrepDriver->generatePipelineIR();
375        mGrepDriver->deallocateBuffers();
376
377        idb->CreateRet(idb->getInt64(0));
378    } else {
379        kernel::Kernel * matchCountK = mGrepDriver->addKernelInstance(make_unique<kernel::PopcountKernel>(idb));
380        mGrepDriver->makeKernelCall(matchCountK, {Matches}, {});
381        mGrepDriver->generatePipelineIR();
382        idb->setKernel(matchCountK);
383        Value * matchedLineCount = idb->getAccumulator("countResult");
384        matchedLineCount = idb->CreateZExt(matchedLineCount, int64Ty);
385        mGrepDriver->deallocateBuffers();
386        idb->CreateRet(matchedLineCount);
387    }
388    mGrepDriver->finalizeObject();
389}
390
391GrepEngine::GrepEngine()
392: mGrepDriver(nullptr) {
393
394}
395
396GrepEngine::~GrepEngine() {
397    delete mGrepDriver;
398}
399
400}
Note: See TracBrowser for help on using the repository browser.