source: icGREP/icgrep-devel/icgrep/icgrep.cpp @ 5423

Last change on this file since 5423 was 5418, checked in by nmedfort, 2 years ago

Removed non-functional CUDA code from icgrep and consolidated grep and multigrep mode into a single function; allowed segment parallel pipeline to utilize process as its initial thread; modified MMapSourceKernel to map and perform mmap directly and advise the OS to drop consumed data streams.

File size: 17.7 KB
RevLine 
/*
 *  Copyright (c) 2016 International Characters.
 *  This software is licensed to the public under the Open Software License 3.0.
 *  icgrep is a trademark of International Characters.
 */
6
#include <cerrno>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream> // MEEE
#include <mutex>
#include <stdexcept>
#include <string>
#include <vector>
#include <fcntl.h>
#include <pthread.h>
#include <sys/stat.h>
#include <unistd.h>
#include <boost/filesystem.hpp>
#include <boost/uuid/sha1.hpp>
#include <llvm/Support/CommandLine.h>
#include <llvm/Support/ErrorHandling.h>
#include <llvm/Support/Signals.h>
#include <llvm/Support/raw_ostream.h>
#include <re/re_alt.h>
#include <re/re_seq.h>
#include <re/re_start.h>
#include <re/re_end.h>
#include <re/re_parser.h>
#include <re/re_utility.h>
#include <grep_engine.h>
#include <kernels/toolchain.h>
#include <re/re_toolchain.h>
#include <pablo/pablo_toolchain.h>
#ifdef PRINT_TIMING_INFORMATION
#include <hrtime.h>
#include <util/papi_helper.hpp>
#endif

using namespace llvm;
37
[5026]38static cl::OptionCategory LegacyGrepOptions("A. Standard Grep Options",
39                                       "These are standard grep options intended for compatibility with typical grep usage.");
[5167]40
41#ifdef FUTURE
42static cl::OptionCategory RegexpOptions("Regular Expression Interpretation", "These options control regular expression interpretation");
[5180]43static cl::opt<re::RE_Syntax> RegexpSyntax(cl::desc("Regular expression syntax:"),
[5167]44    cl::values(
[5180]45        clEnumValN(re::RE_Syntax::FixedStrings, "F", "Fixed strings, separated by newlines"),
46        clEnumValN(re::RE_Syntax::BRE, "G", "Posix basic regular expression (BRE) syntax"),
47        clEnumValN(re::RE_Syntax::ERE, "E", "Posix extended regular expression (ERE) syntax"),
[5218]48        clEnumValN(re::RE_Syntax::PROSITE, "PRO", "PROSITE protein patterns syntax"),
[5180]49        clEnumValN(re::RE_Syntax::PCRE, "P", "Perl-compatible regular expression (PCRE) syntax - default"),
50               clEnumValEnd), cl::cat(LegacyGrepOptions), cl::Grouping, cl::init(re::RE_Syntax::PCRE));
[5167]51#endif
52
[5197]53static cl::opt<bool> EntireLineMatching("x", cl::desc("Require that entire lines be matched."), cl::cat(LegacyGrepOptions), cl::Grouping);
54static cl::alias  EntireLineMatchingAlias("line-regexp", cl::desc("Alias for -x"), cl::aliasopt(EntireLineMatching));
55
56static cl::opt<bool> WholeWordMatching("w", cl::desc("Require that whole words be matched."), cl::cat(LegacyGrepOptions), cl::Grouping);
57static cl::alias WholeWordMatchingAlias("word-regexp", cl::desc("Alias for -w"), cl::aliasopt(WholeWordMatching));
58
[5045]59static cl::opt<bool> UTF_16("UTF-16", cl::desc("Regular expressions over the UTF-16 representation of Unicode."), cl::cat(LegacyGrepOptions));
[5026]60static cl::OptionCategory EnhancedGrepOptions("B. Enhanced Grep Options",
61                                       "These are additional options for icgrep functionality and performance.");
[5197]62
63static cl::opt<bool> FileNamesOnly("l", cl::desc("Display only the names of matching files."), cl::cat(LegacyGrepOptions), cl::Grouping);
64static cl::alias FileNamesAlias("files-with-matches", cl::desc("Alias for -l"), cl::aliasopt(FileNamesOnly));
65
[5355]66static cl::opt<bool> NonMatchingFileNamesOnly("L", cl::desc("Display only the names of nonmatching files."), cl::cat(LegacyGrepOptions), cl::Grouping);
[5197]67static cl::alias NonMatchingFileNamesAlias("files-without-match", cl::desc("Alias for -L"), cl::aliasopt(NonMatchingFileNamesOnly));
68
69
[5163]70static cl::opt<bool> CountOnly("c", cl::desc("Count and display the matching lines per file only."), cl::cat(LegacyGrepOptions), cl::Grouping);
[5025]71static cl::alias CountOnlyLong("count", cl::desc("Alias for -c"), cl::aliasopt(CountOnly));
[5016]72
[5197]73
[4544]74static cl::list<std::string> inputFiles(cl::Positional, cl::desc("<regex> <input file ...>"), cl::OneOrMore);
75
[5163]76static cl::opt<bool> EnterDirectoriesRecursively("r", cl::desc("Recursively process files within directories, (but follow only top-level symlinks unless -R)."), cl::cat(LegacyGrepOptions), cl::Grouping);
77static cl::opt<bool> FollowSubdirectorySymlinks("R", cl::desc("Recursively process files within directories, following symlinks at all levels."), cl::cat(LegacyGrepOptions), cl::Grouping);
78static cl::opt<bool> CaseInsensitive("i", cl::desc("Ignore case distinctions in the pattern and the file."), cl::cat(LegacyGrepOptions), cl::Grouping);
[5197]79static cl::alias CaseInsensitiveAlisas("ignore-case", cl::desc("Ignore case distinctions in the pattern and the file."), cl::aliasopt(CaseInsensitive));
[4544]80
[5026]81static cl::list<std::string> regexVector("e", cl::desc("Regular expression"), cl::ZeroOrMore, cl::cat(LegacyGrepOptions));
82static cl::opt<std::string> RegexFilename("f", cl::desc("Take regular expressions (one per line) from a file"), cl::value_desc("regex file"), cl::init(""), cl::cat(LegacyGrepOptions));
83static cl::opt<std::string> IRFileName("precompiled", cl::desc("Use precompiled regular expression"), cl::value_desc("LLVM IR file"), cl::init(""));
[4544]84
[4967]85static cl::opt<int> Threads("t", cl::desc("Total number of threads."), cl::init(1));
[4544]86
[5016]87static cl::opt<bool> GrepSupport("gs", cl::desc("Grep support. Pipe the output of icgrep into grep. \
[5026]88         Gives you colored output + back-referencing capability."), cl::cat(EnhancedGrepOptions));
[5016]89
[5344]90static cl::opt<bool> MultiGrepKernels("enable-multigrep-kernels", cl::desc("Construct separated kernels for each regular expression"), cl::cat(EnhancedGrepOptions));
91static cl::opt<int> REsPerGroup("re-num", cl::desc("Number of regular expressions processed by each kernel."), cl::init(1));
[5163]92static std::vector<std::string> allFiles;
//
// Callback for errors raised through llvm::report_fatal_error.
// Debug builds (NDEBUG undefined) rethrow the message as std::runtime_error;
// release builds print it to file descriptor 2 and exit with status 2
// (grep convention).
//
static void icgrep_error_handler(void *UserData, const std::string &Message, bool GenCrashDiag) {
#ifndef NDEBUG
    throw std::runtime_error(Message);
#else
    // Modified from LLVM's internal report_fatal_error logic.
    SmallVector<char, 64> storage;
    raw_svector_ostream stream(storage);
    stream << "icgrep ERROR: " << Message << "\n";
    const StringRef text = stream.str();
    // If the write fails there is nothing sensible left to do, so the result is dropped.
    const ssize_t ignored = ::write(2, text.data(), text.size());
    (void)ignored;
    // Run the interrupt handlers so that special cleanups happen, in particular
    // removal of files registered with RemoveFileOnSignal.
    llvm::sys::RunInterruptHandlers();
    exit(2);
#endif
}
114
[4961]115static std::string allREs;
[4963]116static re::ModeFlagSet globalFlags = 0;
[4734]117
[5418]118std::vector<re::RE *> readExpressions() {
[4734]119 
120    if (RegexFilename != "") {
121        std::ifstream regexFile(RegexFilename.c_str());
122        std::string r;
123        if (regexFile.is_open()) {
124            while (std::getline(regexFile, r)) {
125                regexVector.push_back(r);
126            }
127            regexFile.close();
128        }
129    }
130   
131    // if there are no regexes specified through -e or -f, the first positional argument
132    // must be a regex, not an input file.
133   
134    if (regexVector.size() == 0) {
135        regexVector.push_back(inputFiles[0]);
[5015]136        inputFiles.erase(inputFiles.begin());
[4734]137    }
[5418]138    if (CaseInsensitive) {
139        globalFlags |= re::CASE_INSENSITIVE_MODE_FLAG;
140    }
[4734]141
[5344]142    std::vector<re::RE *> REs;
[4750]143    for (unsigned i = 0; i < regexVector.size(); i++) {
[5180]144#ifdef FUTURE
[5418]145        re::RE * re_ast = re::RE_Parser::parse(regexVector[i], globalFlags, RegexpSyntax);
[5180]146#else
[5418]147        re::RE * re_ast = re::RE_Parser::parse(regexVector[i], globalFlags);
[5180]148#endif
[4734]149        REs.push_back(re_ast);
[4961]150        allREs += regexVector[i] + "\n";
[4734]151    }
[5344]152
[5418]153    if (MultiGrepKernels) {
154        std::vector<re::RE *> groups;
155        auto start = REs.begin();
156        auto end = start + REsPerGroup;
157        while (end < REs.end()) {
158            groups.push_back(re::makeAlt(start, end));
159            start = end;
160            end += REsPerGroup;
161        }
162        if ((REs.end() - start) > 1) {
163            groups.push_back(re::makeAlt(start, REs.end()));
164        } else {
165            groups.push_back(*start);
166        }
167        REs.swap(groups);
168    } else if (REs.size() > 1) {
169        re::RE * re_ast = re::makeAlt(REs.begin(), REs.end());
170        REs.assign({re_ast});
[5344]171    }
172
[5418]173    for (re::RE *& re_ast : REs) {
174        if (WholeWordMatching) {
175            re_ast = re::makeSeq({re::makeWordBoundary(), re_ast, re::makeWordBoundary()});
176        }
177        if (EntireLineMatching) {
178            re_ast = re::makeSeq({re::makeStart(), re_ast, re::makeEnd()});
179        }
[4734]180    }
[5418]181
182    return REs;
[4734]183}
184
[4961]185std::string sha1sum(const std::string & str) {
186    char buffer[41];    // 40 hex-digits and the terminating null
187    unsigned int digest[5];     // 160 bits in total
[4775]188
[4961]189    boost::uuids::detail::sha1 sha1;
190    sha1.process_bytes(str.c_str(), str.size());
191    sha1.get_digest(digest);
192    snprintf(buffer, sizeof(buffer), "%.8x%.8x%.8x%.8x%.8x",
193             digest[0], digest[1], digest[2], digest[3], digest[4]);
194    return std::string(buffer);
195}
196
[5107]197std::vector<size_t> total_CountOnly;
[4972]198std::mutex count_mutex;
199size_t fileCount;
[4979]200void *DoGrep(void *args)
[4967]201{
[4972]202    size_t fileIdx;
[4979]203    GrepEngine * grepEngine = (GrepEngine *)args;
[4967]204
[4972]205    count_mutex.lock();
[5028]206    fileIdx = fileCount;
[4972]207    fileCount++;
208    count_mutex.unlock();
[4979]209
[5418]210    while (fileIdx < allFiles.size()) {
211        total_CountOnly[fileIdx] = grepEngine->doGrep(allFiles[fileIdx], fileIdx);
[4972]212       
213        count_mutex.lock();
[5028]214        fileIdx = fileCount;
[4972]215        fileCount++;
216        count_mutex.unlock();
217    }
218
[5267]219    pthread_exit(nullptr);
[4967]220}
221
[5016]222
// Returns true if the command line argument shouldn't be passed to icGrep or Grep.
// A null argument is never considered unwanted.
bool isArgUnwantedForAll(char *argument) {
    // Fixed table: avoids building a vector of strings on every call.
    static const char * const unwantedFlags[] = {"-gs"};
    if (argument == nullptr) {
        return false;
    }
    for (const char * flag : unwantedFlags) {
        if (std::strcmp(argument, flag) == 0) {
            return true;
        }
    }
    return false;
}
233// Filters out the command line strings that shouldn't be passed on to Grep
234bool isArgUnwantedForGrep(char *argument) {
[5180]235#ifdef FUTURE
[5218]236    std::vector<std::string> unwantedFlags = {"-n", "-P", "-G", "-E", "-PRO"};
[5180]237#else
[5016]238    std::vector<std::string> unwantedFlags = {"-n"};
[5180]239#endif
[5016]240
[5180]241    for (unsigned i = 0; i < unwantedFlags.size(); ++i){
[5016]242        if (strcmp(argument, unwantedFlags[i].c_str()) == 0) {
[5037]243            return true;
[5016]244        }
245    }
246
[5037]247    for (unsigned i = 0; i < inputFiles.size(); ++i){    // filter out input content files.
[5016]248        if (strcmp(argument, inputFiles[i].c_str()) == 0) {
[5037]249            return true;
[5016]250        }
251    }
252
[5037]253    return false;
[5016]254}
// Filters out the command line strings that shouldn't be passed on to IcGrep.
// A null argument is never considered unwanted.
bool isArgUnwantedForIcGrep(char *argument) {
    // Fixed table: avoids building a vector of strings on every call.
    static const char * const unwantedFlags[] = {"-c"};
    if (argument == nullptr) {
        return false;
    }
    for (const char * flag : unwantedFlags) {
        if (std::strcmp(argument, flag) == 0) {
            return true;
        }
    }
    return false;
}
268
269/*
270* Constructs a shell command that calls icgrep and then pipes the output to grep.
271* Then executs this shell command using the "system()" function.
272* This allows the output to be colored since all output is piped to grep.
273*/ 
274void pipeIcGrepOutputToGrep(int argc, char *argv[]) {
275    std::string icGrepArguments = "";
276    std::string grepArguments = "";
277
278    // Construct the shell arguments for icgrep and grep
279    // by filtering out the command line arguments passed into this process.
[5037]280    for (int i = 1; i < argc; i++) {
[5016]281        if (!isArgUnwantedForAll(argv[i])) {
282
283            if (!isArgUnwantedForIcGrep(argv[i])) {
[5138]284                // Wrap everything in quotes since the arguments passed into this program had them stripped by bash.
285                icGrepArguments.append("\"");       
[5016]286                icGrepArguments.append(argv[i]);
[5138]287                icGrepArguments.append("\" ");
[5016]288            }
289
290            if (!isArgUnwantedForGrep(argv[i])) {
[5138]291                grepArguments.append("\"");
[5016]292                grepArguments.append(argv[i]);
[5138]293                grepArguments.append("\" ");
[5016]294            }
295        }
296    }
297
[5180]298#ifdef FUTURE
299    switch (RegexpSyntax) {
300        case re::RE_Syntax::BRE:
301            grepArguments.append("\"-G\" ");
302            break;
303        case re::RE_Syntax::ERE:
304            grepArguments.append("\"-E\" ");
305            break;
[5218]306        case re::RE_Syntax::PROSITE:
307            grepArguments.append("\"-PRO\" ");
308            break;
[5180]309        case re::RE_Syntax::PCRE:
310            grepArguments.append("\"-P\" ");
311            break;
312        default:
313            //TODO: handle fix string
314            break;
315    }
316#endif
317
[5154]318    std::string systemCall = argv[0];
319    systemCall.append(" ");
[5016]320    systemCall.append(icGrepArguments);
321    systemCall.append(" ");
[5180]322#ifdef FUTURE
323    systemCall.append(" | grep --color=always ");
324#else
[5016]325    systemCall.append(" | grep --color=always -P ");
[5180]326#endif
[5016]327    systemCall.append(grepArguments);
[5138]328
[5236]329    const auto rc = system(systemCall.c_str());
330    if (LLVM_UNLIKELY(rc < 0)) {
331        throw std::runtime_error("Error calling grep: " + std::string(strerror(errno)));
332    }
[5016]333}
334
335
[5163]336// This is a stub, to be expanded later.
337bool excludeDirectory(boost::filesystem::path dirpath) { return dirpath.filename() == ".svn";}
338
339std::vector<std::string> getFullFileList(cl::list<std::string> & inputFiles) {
340    using namespace boost::filesystem;
341    symlink_option follow_symlink = FollowSubdirectorySymlinks ? symlink_option::recurse : symlink_option::none;
342    std::vector<std::string> expanded_paths;
343    boost::system::error_code errc;
344    if (FollowSubdirectorySymlinks) {
345        EnterDirectoriesRecursively = true;
346    }
[5379]347    for (const std::string & f : inputFiles) {
348//        if (f == "-") {
349//            continue;
350//        }
[5163]351        path p(f);
[5379]352        if (LLVM_UNLIKELY(EnterDirectoriesRecursively && is_directory(p))) {
[5163]353            if (!excludeDirectory(p)) {
354                recursive_directory_iterator di(p, follow_symlink, errc), end;
355                if (errc) {
356                    // If we cannot enter the directory, keep it in the list of files.
357                    expanded_paths.push_back(f); 
358                    continue;
359                }
360                while (di != end) {
361                    auto & e = di->path();
362                    if (is_directory(e)) {
[5379]363                        if (LLVM_UNLIKELY(excludeDirectory(e))) {
364                            di.no_push();
365                        }
366                    } else {
367                        expanded_paths.push_back(e.string());
[5163]368                    }
369                    di.increment(errc);
370                    if (errc) {
371                        expanded_paths.push_back(e.string()); 
372                    }
373                }
374            }
[5379]375        } else {
376            expanded_paths.push_back(p.string());
[5163]377        }
378    }
379    return expanded_paths;
380}
381
382
[4325]383int main(int argc, char *argv[]) {
[5161]384    llvm::install_fatal_error_handler(&icgrep_error_handler);
[5373]385    AddParabixVersionPrinter();
[5199]386#ifndef USE_LLVM_3_6
[5036]387    cl::HideUnrelatedOptions(ArrayRef<const cl::OptionCategory *>{&LegacyGrepOptions, &EnhancedGrepOptions, re::re_toolchain_flags(), pablo::pablo_toolchain_flags(), codegen::codegen_flags()});
[5186]388#endif
[4544]389    cl::ParseCommandLineOptions(argc, argv);
[5167]390#ifdef FUTURE
[5180]391    if (RegexpSyntax == re::RE_Syntax::FixedStrings) {
392        llvm::report_fatal_error("Sorry, FixedStrings syntax is not fully supported\n.");
[5167]393    }
394#endif
[5418]395
396    const auto REs = readExpressions();
397
[4963]398    std::string module_name = "grepcode:" + sha1sum(allREs) + ":" + std::to_string(globalFlags);
[5197]399   
[5016]400    if (GrepSupport) {  // Calls icgrep again on command line and passes output to grep.
401        pipeIcGrepOutputToGrep(argc, argv);
402        return 0;   // icgrep is called again, so we need to end this process.
403    }
[5377]404
405
[5379]406    allFiles = getFullFileList(inputFiles);
407
[4979]408    GrepEngine grepEngine;
[5087]409
[5379]410    if (allFiles.empty()) {
[5377]411
[5418]412        grepEngine.grepCodeGen(module_name, REs, CountOnly, UTF_16, GrepSource::StdIn);
[5379]413        allFiles = { "-" };
[5377]414        initFileResult(allFiles);
[5418]415        total_CountOnly.resize(1);
416        total_CountOnly[0] = grepEngine.doGrep(STDIN_FILENO, 0);
[5377]417
418    } else {
419
[5418]420        grepEngine.grepCodeGen(module_name, REs, CountOnly, UTF_16, GrepSource::File);
[5377]421
422        if (FileNamesOnly && NonMatchingFileNamesOnly) {
423            // Strange request: print names of all matching files and all non-matching files: i.e., all of them.
424            // (Although GNU grep prints nothing.)
425            for (auto & f : allFiles) {
426                if (boost::filesystem::exists(f)) {
427                    std::cout << f << "\n";
428                } else {
429                    std::cerr << "Error: cannot open " << f << " for processing. Skipped.\n";
430                }
[5197]431            }
[5377]432            exit(0);
[5197]433        }
[4967]434
[5377]435        if (FileNamesOnly) {
436            llvm::report_fatal_error("Sorry, -l/-files-with-matches not yet supported\n.");
437        }
438        if (NonMatchingFileNamesOnly) {
439            llvm::report_fatal_error("Sorry, -L/-files-without-match not yet supported\n.");
440        }
441        initFileResult(allFiles);
[5418]442        total_CountOnly.resize(allFiles.size());
[5156]443
[5377]444        if (Threads <= 1) {
445            for (unsigned i = 0; i != allFiles.size(); ++i) {
[5418]446                total_CountOnly[i] = grepEngine.doGrep(allFiles[i], i);
[5377]447            }
448        } else if (Threads > 1) {
449            const unsigned numOfThreads = Threads; // <- convert the command line value into an integer to allow stack allocation
450            pthread_t threads[numOfThreads];
[4967]451
[5377]452            for(unsigned long i = 0; i < numOfThreads; ++i){
453                const int rc = pthread_create(&threads[i], nullptr, DoGrep, (void *)&grepEngine);
454                if (rc) {
455                    llvm::report_fatal_error("Failed to create thread: code " + std::to_string(rc));
456                }
[4968]457            }
[5377]458            for(unsigned i = 0; i < numOfThreads; ++i) {
459                void * status = nullptr;
460                const int rc = pthread_join(threads[i], &status);
461                if (rc) {
462                    llvm::report_fatal_error("Failed to join thread: code " + std::to_string(rc));
463                }
[4967]464            }
465        }
[5377]466
[3850]467    }
[5377]468   
[5025]469    PrintResult(CountOnly, total_CountOnly);
[4327]470   
[3850]471    return 0;
472}
Note: See TracBrowser for help on using the repository browser.