source: icGREP/icgrep-devel/icgrep/icgrep.cpp @ 5401

Last change on this file since 5401 was 5401, checked in by nmedfort, 3 years ago

Updated all projects to use ParabixDriver. Deprecated original pipeline generation methods. Enabled LLVM optimizations, IR and ASM printing for Kernel modules. Enabled object cache by default. Began work on moving consumed position information back to producing kernels.

File size: 18.5 KB
Line 
1/*
2 *  Copyright (c) 2016 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include <cstdio>
8#include <vector>
9#include <llvm/Support/CommandLine.h>
10#include <llvm/Support/ErrorHandling.h>
11#include <llvm/Support/Signals.h>
12#include <llvm/Support/raw_ostream.h>
13#include <re/re_alt.h>
14#include <re/re_seq.h>
15#include <re/re_start.h>
16#include <re/re_end.h>
17#include <re/re_parser.h>
18#include <re/re_utility.h>
19#include <grep_engine.h>
20#include <fstream>
21#include <string>
22#include <boost/uuid/sha1.hpp>
23#include <toolchain.h>
24#include <re/re_toolchain.h>
25#include <pablo/pablo_toolchain.h>
26#include <mutex>
27#include <boost/filesystem.hpp>
28#include <iostream> // MEEE
29#ifdef PRINT_TIMING_INFORMATION
30#include <hrtime.h>
31#include <util/papi_helper.hpp>
32#endif
33
34using namespace llvm;
35
36static cl::OptionCategory LegacyGrepOptions("A. Standard Grep Options",
37                                       "These are standard grep options intended for compatibility with typical grep usage.");
38
39#ifdef FUTURE
40static cl::OptionCategory RegexpOptions("Regular Expression Interpretation", "These options control regular expression interpretation");
41static cl::opt<re::RE_Syntax> RegexpSyntax(cl::desc("Regular expression syntax:"),
42    cl::values(
43        clEnumValN(re::RE_Syntax::FixedStrings, "F", "Fixed strings, separated by newlines"),
44        clEnumValN(re::RE_Syntax::BRE, "G", "Posix basic regular expression (BRE) syntax"),
45        clEnumValN(re::RE_Syntax::ERE, "E", "Posix extended regular expression (ERE) syntax"),
46        clEnumValN(re::RE_Syntax::PROSITE, "PRO", "PROSITE protein patterns syntax"),
47        clEnumValN(re::RE_Syntax::PCRE, "P", "Perl-compatible regular expression (PCRE) syntax - default"),
48               clEnumValEnd), cl::cat(LegacyGrepOptions), cl::Grouping, cl::init(re::RE_Syntax::PCRE));
49#endif
50
51static cl::opt<bool> EntireLineMatching("x", cl::desc("Require that entire lines be matched."), cl::cat(LegacyGrepOptions), cl::Grouping);
52static cl::alias  EntireLineMatchingAlias("line-regexp", cl::desc("Alias for -x"), cl::aliasopt(EntireLineMatching));
53
54static cl::opt<bool> WholeWordMatching("w", cl::desc("Require that whole words be matched."), cl::cat(LegacyGrepOptions), cl::Grouping);
55static cl::alias WholeWordMatchingAlias("word-regexp", cl::desc("Alias for -w"), cl::aliasopt(WholeWordMatching));
56
57static cl::opt<bool> UTF_16("UTF-16", cl::desc("Regular expressions over the UTF-16 representation of Unicode."), cl::cat(LegacyGrepOptions));
58static cl::OptionCategory EnhancedGrepOptions("B. Enhanced Grep Options",
59                                       "These are additional options for icgrep functionality and performance.");
60
61static cl::opt<bool> FileNamesOnly("l", cl::desc("Display only the names of matching files."), cl::cat(LegacyGrepOptions), cl::Grouping);
62static cl::alias FileNamesAlias("files-with-matches", cl::desc("Alias for -l"), cl::aliasopt(FileNamesOnly));
63
64static cl::opt<bool> NonMatchingFileNamesOnly("L", cl::desc("Display only the names of nonmatching files."), cl::cat(LegacyGrepOptions), cl::Grouping);
65static cl::alias NonMatchingFileNamesAlias("files-without-match", cl::desc("Alias for -L"), cl::aliasopt(NonMatchingFileNamesOnly));
66
67
68static cl::opt<bool> CountOnly("c", cl::desc("Count and display the matching lines per file only."), cl::cat(LegacyGrepOptions), cl::Grouping);
69static cl::alias CountOnlyLong("count", cl::desc("Alias for -c"), cl::aliasopt(CountOnly));
70
71
72static cl::list<std::string> inputFiles(cl::Positional, cl::desc("<regex> <input file ...>"), cl::OneOrMore);
73
74static cl::opt<bool> EnterDirectoriesRecursively("r", cl::desc("Recursively process files within directories, (but follow only top-level symlinks unless -R)."), cl::cat(LegacyGrepOptions), cl::Grouping);
75static cl::opt<bool> FollowSubdirectorySymlinks("R", cl::desc("Recursively process files within directories, following symlinks at all levels."), cl::cat(LegacyGrepOptions), cl::Grouping);
76static cl::opt<bool> CaseInsensitive("i", cl::desc("Ignore case distinctions in the pattern and the file."), cl::cat(LegacyGrepOptions), cl::Grouping);
77static cl::alias CaseInsensitiveAlisas("ignore-case", cl::desc("Ignore case distinctions in the pattern and the file."), cl::aliasopt(CaseInsensitive));
78
79static cl::list<std::string> regexVector("e", cl::desc("Regular expression"), cl::ZeroOrMore, cl::cat(LegacyGrepOptions));
80static cl::opt<std::string> RegexFilename("f", cl::desc("Take regular expressions (one per line) from a file"), cl::value_desc("regex file"), cl::init(""), cl::cat(LegacyGrepOptions));
81static cl::opt<std::string> IRFileName("precompiled", cl::desc("Use precompiled regular expression"), cl::value_desc("LLVM IR file"), cl::init(""));
82
83static cl::opt<int> Threads("t", cl::desc("Total number of threads."), cl::init(1));
84
85static cl::opt<bool> GrepSupport("gs", cl::desc("Grep support. Pipe the output of icgrep into grep. \
86         Gives you colored output + back-referencing capability."), cl::cat(EnhancedGrepOptions));
87
88static cl::opt<bool> MultiGrepKernels("enable-multigrep-kernels", cl::desc("Construct separated kernels for each regular expression"), cl::cat(EnhancedGrepOptions));
89static cl::opt<int> REsPerGroup("re-num", cl::desc("Number of regular expressions processed by each kernel."), cl::init(1));
90static std::vector<std::string> allFiles;
//
// Callback for errors raised through llvm::report_fatal_error.
//
// Builds without NDEBUG rethrow the message as a std::runtime_error.
// Builds with NDEBUG print "icgrep ERROR: <message>" on file descriptor 2,
// run LLVM's interrupt handlers and exit with status 2 (grep convention).
// UserData and GenCrashDiag are not used.
//
static void icgrep_error_handler(void *UserData, const std::string &Message, bool GenCrashDiag) {
#ifdef NDEBUG
    // Modified from LLVM's internal report_fatal_error logic.
    SmallVector<char, 64> text;
    raw_svector_ostream stream(text);
    stream << "icgrep ERROR: " << Message << "\n";
    const StringRef line = stream.str();
    // If the write fails or is short we deliberately just give up.
    const ssize_t ignored = ::write(2, line.data(), line.size());
    (void)ignored;
    // Run the interrupt handlers to make sure any special cleanups get done, in
    // particular that we remove files registered with RemoveFileOnSignal.
    llvm::sys::RunInterruptHandlers();
    exit(2);
#else
    throw std::runtime_error(Message);
#endif
}
112
113static std::string allREs;
114static re::ModeFlagSet globalFlags = 0;
115std::vector<re::RE *> RELists;
116
117re::RE * get_icgrep_RE() {
118 
119    //std::vector<std::string> regexVector;
120    if (RegexFilename != "") {
121        std::ifstream regexFile(RegexFilename.c_str());
122        std::string r;
123        if (regexFile.is_open()) {
124            while (std::getline(regexFile, r)) {
125                regexVector.push_back(r);
126            }
127            regexFile.close();
128        }
129    }
130   
131    // if there are no regexes specified through -e or -f, the first positional argument
132    // must be a regex, not an input file.
133   
134    if (regexVector.size() == 0) {
135        regexVector.push_back(inputFiles[0]);
136        inputFiles.erase(inputFiles.begin());
137    }
138    if (CaseInsensitive) globalFlags |= re::CASE_INSENSITIVE_MODE_FLAG;
139
140    std::vector<re::RE *> REs;
141    re::RE * re_ast = nullptr;
142    for (unsigned i = 0; i < regexVector.size(); i++) {
143#ifdef FUTURE
144        re_ast = re::RE_Parser::parse(regexVector[i], globalFlags, RegexpSyntax);
145#else
146        re_ast = re::RE_Parser::parse(regexVector[i], globalFlags);
147#endif
148        REs.push_back(re_ast);
149        allREs += regexVector[i] + "\n";
150    }
151
152    std::vector<re::RE *>::iterator start = REs.begin();
153    std::vector<re::RE *>::iterator end = start + REsPerGroup;
154    while(end < REs.end()) {
155        RELists.push_back(re::makeAlt(start, end));
156        start = end;
157        end += REsPerGroup;
158    }
159    if(REs.end()-start>1)
160        RELists.push_back(re::makeAlt(start, REs.end()));
161    else
162        RELists.push_back(*start);
163
164    if (REs.size() > 1) {
165        re_ast = re::makeAlt(REs.begin(), REs.end());
166    }
167    if (WholeWordMatching) {
168        re_ast = re::makeSeq({re::makeWordBoundary(), re_ast, re::makeWordBoundary()});
169    }
170    if (EntireLineMatching) {
171        re_ast = re::makeSeq({re::makeStart(), re_ast, re::makeEnd()});
172    }   
173    return re_ast;
174}
175
176std::string sha1sum(const std::string & str) {
177    char buffer[41];    // 40 hex-digits and the terminating null
178    unsigned int digest[5];     // 160 bits in total
179
180    boost::uuids::detail::sha1 sha1;
181    sha1.process_bytes(str.c_str(), str.size());
182    sha1.get_digest(digest);
183    snprintf(buffer, sizeof(buffer), "%.8x%.8x%.8x%.8x%.8x",
184             digest[0], digest[1], digest[2], digest[3], digest[4]);
185    return std::string(buffer);
186}
187
188std::vector<size_t> total_CountOnly;
189std::mutex count_mutex;
190size_t fileCount;
191void *DoGrep(void *args)
192{
193    size_t fileIdx;
194    GrepEngine * grepEngine = (GrepEngine *)args;
195
196    count_mutex.lock();
197    fileIdx = fileCount;
198    fileCount++;
199    count_mutex.unlock();
200
201    while (fileIdx < allFiles.size()){
202        grepEngine->doGrep(allFiles[fileIdx], fileIdx, CountOnly, total_CountOnly);
203       
204        count_mutex.lock();
205        fileIdx = fileCount;
206        fileCount++;
207        count_mutex.unlock();
208    }
209
210    pthread_exit(nullptr);
211}
212
213
// Returns true if the command line argument must be forwarded to neither the
// re-invoked icgrep nor grep.
bool isArgUnwantedForAll(char *argument) {
    static const char * const blockedFlags[] = {"-gs"};
    for (const char * flag : blockedFlags) {
        if (strcmp(argument, flag) == 0) {
            return true;
        }
    }
    return false;
}
224// Filters out the command line strings that shouldn't be passed on to Grep
225bool isArgUnwantedForGrep(char *argument) {
226#ifdef FUTURE
227    std::vector<std::string> unwantedFlags = {"-n", "-P", "-G", "-E", "-PRO"};
228#else
229    std::vector<std::string> unwantedFlags = {"-n"};
230#endif
231
232    for (unsigned i = 0; i < unwantedFlags.size(); ++i){
233        if (strcmp(argument, unwantedFlags[i].c_str()) == 0) {
234            return true;
235        }
236    }
237
238    for (unsigned i = 0; i < inputFiles.size(); ++i){    // filter out input content files.
239        if (strcmp(argument, inputFiles[i].c_str()) == 0) {
240            return true;
241        }
242    }
243
244    return false;
245}
// Returns true for command line strings that must not be passed on to the
// re-invoked icgrep.
bool isArgUnwantedForIcGrep(char *argument) {
    static const char * const blockedFlags[] = {"-c"};

    for (const char * flag : blockedFlags) {
        if (strcmp(argument, flag) == 0) {
            return true;
        }
    }

    return false;
}
259
260/*
261* Constructs a shell command that calls icgrep and then pipes the output to grep.
262* Then executs this shell command using the "system()" function.
263* This allows the output to be colored since all output is piped to grep.
264*/ 
265void pipeIcGrepOutputToGrep(int argc, char *argv[]) {
266    std::string icGrepArguments = "";
267    std::string grepArguments = "";
268
269    // Construct the shell arguments for icgrep and grep
270    // by filtering out the command line arguments passed into this process.
271    for (int i = 1; i < argc; i++) {
272        if (!isArgUnwantedForAll(argv[i])) {
273
274            if (!isArgUnwantedForIcGrep(argv[i])) {
275                // Wrap everything in quotes since the arguments passed into this program had them stripped by bash.
276                icGrepArguments.append("\"");       
277                icGrepArguments.append(argv[i]);
278                icGrepArguments.append("\" ");
279            }
280
281            if (!isArgUnwantedForGrep(argv[i])) {
282                grepArguments.append("\"");
283                grepArguments.append(argv[i]);
284                grepArguments.append("\" ");
285            }
286        }
287    }
288
289#ifdef FUTURE
290    switch (RegexpSyntax) {
291        case re::RE_Syntax::BRE:
292            grepArguments.append("\"-G\" ");
293            break;
294        case re::RE_Syntax::ERE:
295            grepArguments.append("\"-E\" ");
296            break;
297        case re::RE_Syntax::PROSITE:
298            grepArguments.append("\"-PRO\" ");
299            break;
300        case re::RE_Syntax::PCRE:
301            grepArguments.append("\"-P\" ");
302            break;
303        default:
304            //TODO: handle fix string
305            break;
306    }
307#endif
308
309    std::string systemCall = argv[0];
310    systemCall.append(" ");
311    systemCall.append(icGrepArguments);
312    systemCall.append(" ");
313#ifdef FUTURE
314    systemCall.append(" | grep --color=always ");
315#else
316    systemCall.append(" | grep --color=always -P ");
317#endif
318    systemCall.append(grepArguments);
319
320    const auto rc = system(systemCall.c_str());
321    if (LLVM_UNLIKELY(rc < 0)) {
322        throw std::runtime_error("Error calling grep: " + std::string(strerror(errno)));
323    }
324}
325
326
327// This is a stub, to be expanded later.
328bool excludeDirectory(boost::filesystem::path dirpath) { return dirpath.filename() == ".svn";}
329
330std::vector<std::string> getFullFileList(cl::list<std::string> & inputFiles) {
331    using namespace boost::filesystem;
332    symlink_option follow_symlink = FollowSubdirectorySymlinks ? symlink_option::recurse : symlink_option::none;
333    std::vector<std::string> expanded_paths;
334    boost::system::error_code errc;
335    if (FollowSubdirectorySymlinks) {
336        EnterDirectoriesRecursively = true;
337    }
338    for (const std::string & f : inputFiles) {
339//        if (f == "-") {
340//            continue;
341//        }
342        path p(f);
343        if (LLVM_UNLIKELY(EnterDirectoriesRecursively && is_directory(p))) {
344            if (!excludeDirectory(p)) {
345                recursive_directory_iterator di(p, follow_symlink, errc), end;
346                if (errc) {
347                    // If we cannot enter the directory, keep it in the list of files.
348                    expanded_paths.push_back(f); 
349                    continue;
350                }
351                while (di != end) {
352                    auto & e = di->path();
353                    if (is_directory(e)) {
354                        if (LLVM_UNLIKELY(excludeDirectory(e))) {
355                            di.no_push();
356                        }
357                    } else {
358                        expanded_paths.push_back(e.string());
359                    }
360                    di.increment(errc);
361                    if (errc) {
362                        expanded_paths.push_back(e.string()); 
363                    }
364                }
365            }
366        } else {
367            expanded_paths.push_back(p.string());
368        }
369    }
370    return expanded_paths;
371}
372
373
374int main(int argc, char *argv[]) {
375    llvm::install_fatal_error_handler(&icgrep_error_handler);
376    AddParabixVersionPrinter();
377#ifndef USE_LLVM_3_6
378    cl::HideUnrelatedOptions(ArrayRef<const cl::OptionCategory *>{&LegacyGrepOptions, &EnhancedGrepOptions, re::re_toolchain_flags(), pablo::pablo_toolchain_flags(), codegen::codegen_flags()});
379#endif
380    cl::ParseCommandLineOptions(argc, argv);
381#ifdef FUTURE
382    if (RegexpSyntax == re::RE_Syntax::FixedStrings) {
383        llvm::report_fatal_error("Sorry, FixedStrings syntax is not fully supported\n.");
384    }
385#endif
386    re::RE * re_ast = get_icgrep_RE();
387    std::string module_name = "grepcode:" + sha1sum(allREs) + ":" + std::to_string(globalFlags);
388   
389    if (GrepSupport) {  // Calls icgrep again on command line and passes output to grep.
390        pipeIcGrepOutputToGrep(argc, argv);
391        return 0;   // icgrep is called again, so we need to end this process.
392    }
393
394
395    allFiles = getFullFileList(inputFiles);
396
397    GrepEngine grepEngine;
398
399    if (allFiles.empty()) {
400
401        grepEngine.grepCodeGen(module_name, re_ast, CountOnly, UTF_16, GrepType::Normal, true);
402        allFiles = { "-" };
403        initFileResult(allFiles);
404        total_CountOnly.push_back(0);
405        grepEngine.doGrep(0, CountOnly, total_CountOnly);
406
407    } else {
408
409        if (MultiGrepKernels) {
410            grepEngine.grepCodeGen(module_name, RELists, CountOnly, UTF_16);
411        } else {
412            grepEngine.grepCodeGen(module_name, re_ast, CountOnly, UTF_16, GrepType::Normal, false);
413        }
414
415        if (FileNamesOnly && NonMatchingFileNamesOnly) {
416            // Strange request: print names of all matching files and all non-matching files: i.e., all of them.
417            // (Although GNU grep prints nothing.)
418            for (auto & f : allFiles) {
419                if (boost::filesystem::exists(f)) {
420                    std::cout << f << "\n";
421                } else {
422                    std::cerr << "Error: cannot open " << f << " for processing. Skipped.\n";
423                }
424            }
425            exit(0);
426        }
427
428        if (FileNamesOnly) {
429            llvm::report_fatal_error("Sorry, -l/-files-with-matches not yet supported\n.");
430        }
431        if (NonMatchingFileNamesOnly) {
432            llvm::report_fatal_error("Sorry, -L/-files-without-match not yet supported\n.");
433        }
434        initFileResult(allFiles);
435
436        for (unsigned i=0; i < allFiles.size(); ++i){
437            total_CountOnly.push_back(0);
438        }
439
440        if (Threads <= 1) {
441
442            #ifdef PRINT_TIMING_INFORMATION
443            // PAPI_RES_STL, PAPI_STL_CCY, PAPI_FUL_CCY, PAPI_MEM_WCY
444            // PAPI_RES_STL, PAPI_BR_MSP, PAPI_LST_INS, PAPI_L1_TCM
445            papi::PapiCounter<4> papiCounters({PAPI_RES_STL, PAPI_STL_CCY, PAPI_FUL_CCY, PAPI_MEM_WCY});
446            #endif
447            for (unsigned i = 0; i != allFiles.size(); ++i) {
448                #ifdef PRINT_TIMING_INFORMATION
449                papiCounters.start();
450                const timestamp_t execution_start = read_cycle_counter();
451                #endif
452                grepEngine.doGrep(allFiles[i], i, CountOnly, total_CountOnly);
453                #ifdef PRINT_TIMING_INFORMATION
454                const timestamp_t execution_end = read_cycle_counter();
455                papiCounters.stop();
456                std::cerr << "EXECUTION TIME: " << allFiles[i] << ":" << "CYCLES|" << (execution_end - execution_start) << papiCounters << std::endl;
457                #endif
458            }
459        } else if (Threads > 1) {
460            const unsigned numOfThreads = Threads; // <- convert the command line value into an integer to allow stack allocation
461            pthread_t threads[numOfThreads];
462
463            for(unsigned long i = 0; i < numOfThreads; ++i){
464                const int rc = pthread_create(&threads[i], nullptr, DoGrep, (void *)&grepEngine);
465                if (rc) {
466                    llvm::report_fatal_error("Failed to create thread: code " + std::to_string(rc));
467                }
468            }
469
470            for(unsigned i = 0; i < numOfThreads; ++i) {
471                void * status = nullptr;
472                const int rc = pthread_join(threads[i], &status);
473                if (rc) {
474                    llvm::report_fatal_error("Failed to join thread: code " + std::to_string(rc));
475                }
476            }
477        }
478
479    }
480
481   
482    PrintResult(CountOnly, total_CountOnly);
483   
484    return 0;
485}
Note: See TracBrowser for help on using the repository browser.