source: icGREP/icgrep-devel/icgrep/icgrep.cpp @ 5464

Last change on this file since 5464 was 5464, checked in by nmedfort, 2 years ago

Restructuring work for the Driver classes. Start of work to eliminate the memory leaks with the ExecutionEngine. Replaced custom AlignedMalloc with backend call to std::aligned_malloc. Salvaged some work on DistributionPass for reevaluation.

File size: 17.4 KB
Line 
1/*
2 *  Copyright (c) 2016 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include <cstdio>
8#include <vector>
9#include <llvm/Support/CommandLine.h>
10#include <llvm/Support/ErrorHandling.h>
11#include <llvm/Support/Signals.h>
12#include <llvm/Support/raw_ostream.h>
13#include <re/re_alt.h>
14#include <re/re_seq.h>
15#include <re/re_start.h>
16#include <re/re_end.h>
17#include <re/re_parser.h>
18#include <re/re_utility.h>
19#include <grep_engine.h>
20#include <fstream>
21#include <string>
22#include <toolchain/toolchain.h>
23#include <re/re_toolchain.h>
24#include <pablo/pablo_toolchain.h>
25#include <mutex>
26#include <boost/filesystem.hpp>
27#include <iostream> // MEEE
28#ifdef PRINT_TIMING_INFORMATION
29#include <hrtime.h>
30#include <util/papi_helper.hpp>
31#endif
32#include <sys/stat.h>
33#include <fcntl.h>
34
35using namespace llvm;
36
37static cl::OptionCategory LegacyGrepOptions("A. Standard Grep Options",
38                                       "These are standard grep options intended for compatibility with typical grep usage.");
39
40#ifdef FUTURE
41static cl::OptionCategory RegexpOptions("Regular Expression Interpretation", "These options control regular expression interpretation");
42static cl::opt<re::RE_Syntax> RegexpSyntax(cl::desc("Regular expression syntax:"),
43    cl::values(
44        clEnumValN(re::RE_Syntax::FixedStrings, "F", "Fixed strings, separated by newlines"),
45        clEnumValN(re::RE_Syntax::BRE, "G", "Posix basic regular expression (BRE) syntax"),
46        clEnumValN(re::RE_Syntax::ERE, "E", "Posix extended regular expression (ERE) syntax"),
47        clEnumValN(re::RE_Syntax::PROSITE, "PRO", "PROSITE protein patterns syntax"),
48        clEnumValN(re::RE_Syntax::PCRE, "P", "Perl-compatible regular expression (PCRE) syntax - default"),
49               clEnumValEnd), cl::cat(LegacyGrepOptions), cl::Grouping, cl::init(re::RE_Syntax::PCRE));
50#endif
51
52static cl::opt<bool> EntireLineMatching("x", cl::desc("Require that entire lines be matched."), cl::cat(LegacyGrepOptions), cl::Grouping);
53static cl::alias  EntireLineMatchingAlias("line-regexp", cl::desc("Alias for -x"), cl::aliasopt(EntireLineMatching));
54
55static cl::opt<bool> WholeWordMatching("w", cl::desc("Require that whole words be matched."), cl::cat(LegacyGrepOptions), cl::Grouping);
56static cl::alias WholeWordMatchingAlias("word-regexp", cl::desc("Alias for -w"), cl::aliasopt(WholeWordMatching));
57
58static cl::opt<bool> UTF_16("UTF-16", cl::desc("Regular expressions over the UTF-16 representation of Unicode."), cl::cat(LegacyGrepOptions));
59static cl::OptionCategory EnhancedGrepOptions("B. Enhanced Grep Options",
60                                       "These are additional options for icgrep functionality and performance.");
61
62static cl::opt<bool> FileNamesOnly("l", cl::desc("Display only the names of matching files."), cl::cat(LegacyGrepOptions), cl::Grouping);
63static cl::alias FileNamesAlias("files-with-matches", cl::desc("Alias for -l"), cl::aliasopt(FileNamesOnly));
64
65static cl::opt<bool> NonMatchingFileNamesOnly("L", cl::desc("Display only the names of nonmatching files."), cl::cat(LegacyGrepOptions), cl::Grouping);
66static cl::alias NonMatchingFileNamesAlias("files-without-match", cl::desc("Alias for -L"), cl::aliasopt(NonMatchingFileNamesOnly));
67
68
69static cl::opt<bool> CountOnly("c", cl::desc("Count and display the matching lines per file only."), cl::cat(LegacyGrepOptions), cl::Grouping);
70static cl::alias CountOnlyLong("count", cl::desc("Alias for -c"), cl::aliasopt(CountOnly));
71
72
73static cl::list<std::string> inputFiles(cl::Positional, cl::desc("<regex> <input file ...>"), cl::OneOrMore);
74
75static cl::opt<bool> EnterDirectoriesRecursively("r", cl::desc("Recursively process files within directories, (but follow only top-level symlinks unless -R)."), cl::cat(LegacyGrepOptions), cl::Grouping);
76static cl::opt<bool> FollowSubdirectorySymlinks("R", cl::desc("Recursively process files within directories, following symlinks at all levels."), cl::cat(LegacyGrepOptions), cl::Grouping);
77static cl::opt<bool> CaseInsensitive("i", cl::desc("Ignore case distinctions in the pattern and the file."), cl::cat(LegacyGrepOptions), cl::Grouping);
78static cl::alias CaseInsensitiveAlisas("ignore-case", cl::desc("Ignore case distinctions in the pattern and the file."), cl::aliasopt(CaseInsensitive));
79
80static cl::list<std::string> regexVector("e", cl::desc("Regular expression"), cl::ZeroOrMore, cl::cat(LegacyGrepOptions));
81static cl::opt<std::string> RegexFilename("f", cl::desc("Take regular expressions (one per line) from a file"), cl::value_desc("regex file"), cl::init(""), cl::cat(LegacyGrepOptions));
82static cl::opt<std::string> IRFileName("precompiled", cl::desc("Use precompiled regular expression"), cl::value_desc("LLVM IR file"), cl::init(""));
83
84static cl::opt<int> Threads("t", cl::desc("Total number of threads."), cl::init(1));
85
86static cl::opt<bool> GrepSupport("gs", cl::desc("Grep support. Pipe the output of icgrep into grep. \
87         Gives you colored output + back-referencing capability."), cl::cat(EnhancedGrepOptions));
88
89static cl::opt<bool> MultiGrepKernels("enable-multigrep-kernels", cl::desc("Construct separated kernels for each regular expression"), cl::cat(EnhancedGrepOptions));
90static cl::opt<int> REsPerGroup("re-num", cl::desc("Number of regular expressions processed by each kernel."), cl::init(1));
91static std::vector<std::string> allFiles;
//
// Callback for errors raised through llvm::report_fatal_error.
// When NDEBUG is not defined the message is thrown as a std::runtime_error;
// otherwise it is written to file descriptor 2 and the process exits with
// status 2 (grep convention).
//
static void icgrep_error_handler(void *UserData, const std::string &Message, bool GenCrashDiag) {
#ifdef NDEBUG
    // Modified from LLVM's internal report_fatal_error logic.
    SmallVector<char, 64> Storage;
    raw_svector_ostream Stream(Storage);
    Stream << "icgrep ERROR: " << Message << "\n";
    const StringRef Text = Stream.str();
    // If the write fails we deliberately just give up; the result is only
    // captured to avoid an unused-result warning.
    const ssize_t written = ::write(2, Text.data(), Text.size());
    (void)written;
    // Run the interrupt handlers so that special cleanups happen, in particular
    // the removal of files registered with RemoveFileOnSignal.
    llvm::sys::RunInterruptHandlers();
    exit(2);
#else
    throw std::runtime_error(Message);
#endif
}
113
114static re::ModeFlagSet globalFlags = 0;
115
116std::vector<re::RE *> readExpressions() {
117 
118    if (RegexFilename != "") {
119        std::ifstream regexFile(RegexFilename.c_str());
120        std::string r;
121        if (regexFile.is_open()) {
122            while (std::getline(regexFile, r)) {
123                regexVector.push_back(r);
124            }
125            regexFile.close();
126        }
127    }
128   
129    // if there are no regexes specified through -e or -f, the first positional argument
130    // must be a regex, not an input file.
131   
132    if (regexVector.size() == 0) {
133        regexVector.push_back(inputFiles[0]);
134        inputFiles.erase(inputFiles.begin());
135    }
136    if (CaseInsensitive) {
137        globalFlags |= re::CASE_INSENSITIVE_MODE_FLAG;
138    }
139
140    std::vector<re::RE *> REs;
141    for (unsigned i = 0; i < regexVector.size(); i++) {
142#ifdef FUTURE
143        re::RE * re_ast = re::RE_Parser::parse(regexVector[i], globalFlags, RegexpSyntax);
144#else
145        re::RE * re_ast = re::RE_Parser::parse(regexVector[i], globalFlags);
146#endif
147        REs.push_back(re_ast);
148    }
149
150    if (MultiGrepKernels) {
151        std::vector<re::RE *> groups;
152        auto start = REs.begin();
153        auto end = start + REsPerGroup;
154        while (end < REs.end()) {
155            groups.push_back(re::makeAlt(start, end));
156            start = end;
157            end += REsPerGroup;
158        }
159        if ((REs.end() - start) > 1) {
160            groups.push_back(re::makeAlt(start, REs.end()));
161        } else {
162            groups.push_back(*start);
163        }
164        REs.swap(groups);
165    } else if (REs.size() > 1) {
166        re::RE * re_ast = re::makeAlt(REs.begin(), REs.end());
167        REs.assign({re_ast});
168    }
169
170    for (re::RE *& re_ast : REs) {
171        if (WholeWordMatching) {
172            re_ast = re::makeSeq({re::makeWordBoundary(), re_ast, re::makeWordBoundary()});
173        }
174        if (EntireLineMatching) {
175            re_ast = re::makeSeq({re::makeStart(), re_ast, re::makeEnd()});
176        }
177    }
178
179    return REs;
180}
181
182std::vector<size_t> total_CountOnly;
183std::mutex count_mutex;
184size_t fileCount;
185void *DoGrep(void *args)
186{
187    size_t fileIdx;
188    GrepEngine * grepEngine = (GrepEngine *)args;
189
190    count_mutex.lock();
191    fileIdx = fileCount;
192    fileCount++;
193    count_mutex.unlock();
194
195    while (fileIdx < allFiles.size()) {
196        total_CountOnly[fileIdx] = grepEngine->doGrep(allFiles[fileIdx], fileIdx);
197       
198        count_mutex.lock();
199        fileIdx = fileCount;
200        fileCount++;
201        count_mutex.unlock();
202    }
203
204    pthread_exit(nullptr);
205}
206
207
// Returns true if the command line argument must be forwarded to neither icGrep nor Grep.
bool isArgUnwantedForAll(char *argument) {
    static const char * const droppedFlags[] = {"-gs"};
    for (const char * flag : droppedFlags) {
        if (strcmp(argument, flag) == 0) {
            return true;
        }
    }
    return false;
}
218// Filters out the command line strings that shouldn't be passed on to Grep
219bool isArgUnwantedForGrep(char *argument) {
220#ifdef FUTURE
221    std::vector<std::string> unwantedFlags = {"-n", "-P", "-G", "-E", "-PRO"};
222#else
223    std::vector<std::string> unwantedFlags = {"-n"};
224#endif
225
226    for (unsigned i = 0; i < unwantedFlags.size(); ++i){
227        if (strcmp(argument, unwantedFlags[i].c_str()) == 0) {
228            return true;
229        }
230    }
231
232    for (unsigned i = 0; i < inputFiles.size(); ++i){    // filter out input content files.
233        if (strcmp(argument, inputFiles[i].c_str()) == 0) {
234            return true;
235        }
236    }
237
238    return false;
239}
// Returns true for command line strings that must not be forwarded to IcGrep.
bool isArgUnwantedForIcGrep(char *argument) {
    static const char * const droppedFlags[] = {"-c"};
    for (const char * flag : droppedFlags) {
        if (strcmp(argument, flag) == 0) {
            return true;
        }
    }
    return false;
}
253
254/*
255* Constructs a shell command that calls icgrep and then pipes the output to grep.
256* Then executs this shell command using the "system()" function.
257* This allows the output to be colored since all output is piped to grep.
258*/ 
259void pipeIcGrepOutputToGrep(int argc, char *argv[]) {
260    std::string icGrepArguments = "";
261    std::string grepArguments = "";
262
263    // Construct the shell arguments for icgrep and grep
264    // by filtering out the command line arguments passed into this process.
265    for (int i = 1; i < argc; i++) {
266        if (!isArgUnwantedForAll(argv[i])) {
267
268            if (!isArgUnwantedForIcGrep(argv[i])) {
269                // Wrap everything in quotes since the arguments passed into this program had them stripped by bash.
270                icGrepArguments.append("\"");       
271                icGrepArguments.append(argv[i]);
272                icGrepArguments.append("\" ");
273            }
274
275            if (!isArgUnwantedForGrep(argv[i])) {
276                grepArguments.append("\"");
277                grepArguments.append(argv[i]);
278                grepArguments.append("\" ");
279            }
280        }
281    }
282
283#ifdef FUTURE
284    switch (RegexpSyntax) {
285        case re::RE_Syntax::BRE:
286            grepArguments.append("\"-G\" ");
287            break;
288        case re::RE_Syntax::ERE:
289            grepArguments.append("\"-E\" ");
290            break;
291        case re::RE_Syntax::PROSITE:
292            grepArguments.append("\"-PRO\" ");
293            break;
294        case re::RE_Syntax::PCRE:
295            grepArguments.append("\"-P\" ");
296            break;
297        default:
298            //TODO: handle fix string
299            break;
300    }
301#endif
302
303    std::string systemCall = argv[0];
304    systemCall.append(" ");
305    systemCall.append(icGrepArguments);
306    systemCall.append(" ");
307#ifdef FUTURE
308    systemCall.append(" | grep --color=always ");
309#else
310    systemCall.append(" | grep --color=always -P ");
311#endif
312    systemCall.append(grepArguments);
313
314    const auto rc = system(systemCall.c_str());
315    if (LLVM_UNLIKELY(rc < 0)) {
316        throw std::runtime_error("Error calling grep: " + std::string(strerror(errno)));
317    }
318}
319
320
321// This is a stub, to be expanded later.
322bool excludeDirectory(boost::filesystem::path dirpath) { return dirpath.filename() == ".svn";}
323
324std::vector<std::string> getFullFileList(cl::list<std::string> & inputFiles) {
325    using namespace boost::filesystem;
326    symlink_option follow_symlink = FollowSubdirectorySymlinks ? symlink_option::recurse : symlink_option::none;
327    std::vector<std::string> expanded_paths;
328    boost::system::error_code errc;
329    if (FollowSubdirectorySymlinks) {
330        EnterDirectoriesRecursively = true;
331    }
332    for (const std::string & f : inputFiles) {
333//        if (f == "-") {
334//            continue;
335//        }
336        path p(f);
337        if (LLVM_UNLIKELY(EnterDirectoriesRecursively && is_directory(p))) {
338            if (!excludeDirectory(p)) {
339                recursive_directory_iterator di(p, follow_symlink, errc), end;
340                if (errc) {
341                    // If we cannot enter the directory, keep it in the list of files.
342                    expanded_paths.push_back(f); 
343                    continue;
344                }
345                while (di != end) {
346                    auto & e = di->path();
347                    if (is_directory(e)) {
348                        if (LLVM_UNLIKELY(excludeDirectory(e))) {
349                            di.no_push();
350                        }
351                    } else {
352                        expanded_paths.push_back(e.string());
353                    }
354                    di.increment(errc);
355                    if (errc) {
356                        expanded_paths.push_back(e.string()); 
357                    }
358                }
359            }
360        } else {
361            expanded_paths.push_back(p.string());
362        }
363    }
364    return expanded_paths;
365}
366
367
368int main(int argc, char *argv[]) {
369    llvm::install_fatal_error_handler(&icgrep_error_handler);
370    AddParabixVersionPrinter();
371#ifndef USE_LLVM_3_6
372    cl::HideUnrelatedOptions(ArrayRef<const cl::OptionCategory *>{&LegacyGrepOptions, &EnhancedGrepOptions, re::re_toolchain_flags(), pablo::pablo_toolchain_flags(), codegen::codegen_flags()});
373#endif
374    cl::ParseCommandLineOptions(argc, argv);
375#ifdef FUTURE
376    if (RegexpSyntax == re::RE_Syntax::FixedStrings) {
377        llvm::report_fatal_error("Sorry, FixedStrings syntax is not fully supported\n.");
378    }
379#endif
380
381    const auto REs = readExpressions();
382
383    if (GrepSupport) {  // Calls icgrep again on command line and passes output to grep.
384        pipeIcGrepOutputToGrep(argc, argv);
385        return 0;   // icgrep is called again, so we need to end this process.
386    }
387
388
389    allFiles = getFullFileList(inputFiles);
390
391    GrepEngine grepEngine;
392
393    if (allFiles.empty()) {
394
395        grepEngine.grepCodeGen(REs, CountOnly, UTF_16, GrepSource::StdIn);
396        allFiles = { "-" };
397        initFileResult(allFiles);
398        total_CountOnly.resize(1);
399        total_CountOnly[0] = grepEngine.doGrep(STDIN_FILENO, 0);
400
401    } else {
402       
403        setNVPTXOption();
404       
405        if(codegen::NVPTX){
406            grepEngine.grepCodeGen_nvptx(REs, CountOnly, UTF_16);
407            for (unsigned i = 0; i != allFiles.size(); ++i) {
408                grepEngine.doGrep(allFiles[i]);
409            }         
410            return 0;
411        }
412        else{
413            grepEngine.grepCodeGen(REs, CountOnly, UTF_16, GrepSource::File);
414        }
415
416        if (FileNamesOnly && NonMatchingFileNamesOnly) {
417            // Strange request: print names of all matching files and all non-matching files: i.e., all of them.
418            // (Although GNU grep prints nothing.)
419            for (auto & f : allFiles) {
420                if (boost::filesystem::exists(f)) {
421                    std::cout << f << "\n";
422                } else {
423                    std::cerr << "Error: cannot open " << f << " for processing. Skipped.\n";
424                }
425            }
426            exit(0);
427        }
428
429        if (FileNamesOnly) {
430            llvm::report_fatal_error("Sorry, -l/-files-with-matches not yet supported\n.");
431        }
432        if (NonMatchingFileNamesOnly) {
433            llvm::report_fatal_error("Sorry, -L/-files-without-match not yet supported\n.");
434        }
435        initFileResult(allFiles);
436        total_CountOnly.resize(allFiles.size());
437
438        if (Threads <= 1) {
439            for (unsigned i = 0; i != allFiles.size(); ++i) {
440                total_CountOnly[i] = grepEngine.doGrep(allFiles[i], i);
441            }
442        } else if (Threads > 1) {
443            const unsigned numOfThreads = Threads; // <- convert the command line value into an integer to allow stack allocation
444            pthread_t threads[numOfThreads];
445
446            for(unsigned long i = 0; i < numOfThreads; ++i){
447                const int rc = pthread_create(&threads[i], nullptr, DoGrep, (void *)&grepEngine);
448                if (rc) {
449                    llvm::report_fatal_error("Failed to create thread: code " + std::to_string(rc));
450                }
451            }
452            for(unsigned i = 0; i < numOfThreads; ++i) {
453                void * status = nullptr;
454                const int rc = pthread_join(threads[i], &status);
455                if (rc) {
456                    llvm::report_fatal_error("Failed to join thread: code " + std::to_string(rc));
457                }
458            }
459        }
460
461    }
462   
463    PrintResult(CountOnly, total_CountOnly);
464   
465    return 0;
466}
Note: See TracBrowser for help on using the repository browser.