source: icGREP/icgrep-devel/icgrep/icgrep.cpp @ 5377

Last change on this file since 5377 was 5377, checked in by nmedfort, 2 years ago

Support for stdin. Needs more testing.

File size: 18.5 KB
Line 
1/*
2 *  Copyright (c) 2016 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include <cstdio>
8#include <vector>
9#include <llvm/Support/CommandLine.h>
10#include <llvm/Support/ErrorHandling.h>
11#include <llvm/Support/Signals.h>
12#include <llvm/Support/raw_ostream.h>
13#include <re/re_alt.h>
14#include <re/re_seq.h>
15#include <re/re_start.h>
16#include <re/re_end.h>
17#include <re/re_parser.h>
18#include <re/re_utility.h>
19#include <grep_engine.h>
20#include <fstream>
21#include <string>
22#include <boost/uuid/sha1.hpp>
23#include <toolchain.h>
24#include <re/re_toolchain.h>
25#include <pablo/pablo_toolchain.h>
26#include <mutex>
27#include <boost/filesystem.hpp>
28#include <iostream> // MEEE
29#ifdef PRINT_TIMING_INFORMATION
30#include <hrtime.h>
31#include <util/papi_helper.hpp>
32#endif
33#include <poll.h>
34
/// Reports whether standard input has something to read right now.
///
/// Performs a zero-timeout poll() on STDIN_FILENO, so the call never blocks.
///
/// @return true when poll() flags stdin with a read-type event or a hang-up
///         (for instance a pipe whose writer has already closed); false when
///         nothing is pending, when poll() fails, or when the descriptor is
///         reported as invalid or in error.
inline bool hasInputFromStdIn() {
    pollfd stdin_poll;
    stdin_poll.fd = STDIN_FILENO;
    stdin_poll.events = POLLIN | POLLRDBAND | POLLRDNORM | POLLPRI;
    stdin_poll.revents = 0;
    if (poll(&stdin_poll, 1, 0) != 1) {
        return false;
    }
    // poll() also counts descriptors whose only returned events are POLLNVAL or
    // POLLERR (e.g. stdin was closed by the caller). Those are not readable, so
    // they must not be mistaken for piped input.
    return (stdin_poll.revents & (POLLNVAL | POLLERR)) == 0;
}
41
42using namespace llvm;
43
44static cl::OptionCategory LegacyGrepOptions("A. Standard Grep Options",
45                                       "These are standard grep options intended for compatibility with typical grep usage.");
46
47#ifdef FUTURE
48static cl::OptionCategory RegexpOptions("Regular Expression Interpretation", "These options control regular expression interpretation");
49static cl::opt<re::RE_Syntax> RegexpSyntax(cl::desc("Regular expression syntax:"),
50    cl::values(
51        clEnumValN(re::RE_Syntax::FixedStrings, "F", "Fixed strings, separated by newlines"),
52        clEnumValN(re::RE_Syntax::BRE, "G", "Posix basic regular expression (BRE) syntax"),
53        clEnumValN(re::RE_Syntax::ERE, "E", "Posix extended regular expression (ERE) syntax"),
54        clEnumValN(re::RE_Syntax::PROSITE, "PRO", "PROSITE protein patterns syntax"),
55        clEnumValN(re::RE_Syntax::PCRE, "P", "Perl-compatible regular expression (PCRE) syntax - default"),
56               clEnumValEnd), cl::cat(LegacyGrepOptions), cl::Grouping, cl::init(re::RE_Syntax::PCRE));
57#endif
58
59static cl::opt<bool> EntireLineMatching("x", cl::desc("Require that entire lines be matched."), cl::cat(LegacyGrepOptions), cl::Grouping);
60static cl::alias  EntireLineMatchingAlias("line-regexp", cl::desc("Alias for -x"), cl::aliasopt(EntireLineMatching));
61
62static cl::opt<bool> WholeWordMatching("w", cl::desc("Require that whole words be matched."), cl::cat(LegacyGrepOptions), cl::Grouping);
63static cl::alias WholeWordMatchingAlias("word-regexp", cl::desc("Alias for -w"), cl::aliasopt(WholeWordMatching));
64
65static cl::opt<bool> UTF_16("UTF-16", cl::desc("Regular expressions over the UTF-16 representation of Unicode."), cl::cat(LegacyGrepOptions));
66static cl::OptionCategory EnhancedGrepOptions("B. Enhanced Grep Options",
67                                       "These are additional options for icgrep functionality and performance.");
68
69static cl::opt<bool> FileNamesOnly("l", cl::desc("Display only the names of matching files."), cl::cat(LegacyGrepOptions), cl::Grouping);
70static cl::alias FileNamesAlias("files-with-matches", cl::desc("Alias for -l"), cl::aliasopt(FileNamesOnly));
71
72static cl::opt<bool> NonMatchingFileNamesOnly("L", cl::desc("Display only the names of nonmatching files."), cl::cat(LegacyGrepOptions), cl::Grouping);
73static cl::alias NonMatchingFileNamesAlias("files-without-match", cl::desc("Alias for -L"), cl::aliasopt(NonMatchingFileNamesOnly));
74
75
76static cl::opt<bool> CountOnly("c", cl::desc("Count and display the matching lines per file only."), cl::cat(LegacyGrepOptions), cl::Grouping);
77static cl::alias CountOnlyLong("count", cl::desc("Alias for -c"), cl::aliasopt(CountOnly));
78
79
80static cl::list<std::string> inputFiles(cl::Positional, cl::desc("<regex> <input file ...>"), cl::OneOrMore);
81
82static cl::opt<bool> EnterDirectoriesRecursively("r", cl::desc("Recursively process files within directories, (but follow only top-level symlinks unless -R)."), cl::cat(LegacyGrepOptions), cl::Grouping);
83static cl::opt<bool> FollowSubdirectorySymlinks("R", cl::desc("Recursively process files within directories, following symlinks at all levels."), cl::cat(LegacyGrepOptions), cl::Grouping);
84static cl::opt<bool> CaseInsensitive("i", cl::desc("Ignore case distinctions in the pattern and the file."), cl::cat(LegacyGrepOptions), cl::Grouping);
85static cl::alias CaseInsensitiveAlisas("ignore-case", cl::desc("Ignore case distinctions in the pattern and the file."), cl::aliasopt(CaseInsensitive));
86
87static cl::list<std::string> regexVector("e", cl::desc("Regular expression"), cl::ZeroOrMore, cl::cat(LegacyGrepOptions));
88static cl::opt<std::string> RegexFilename("f", cl::desc("Take regular expressions (one per line) from a file"), cl::value_desc("regex file"), cl::init(""), cl::cat(LegacyGrepOptions));
89static cl::opt<std::string> IRFileName("precompiled", cl::desc("Use precompiled regular expression"), cl::value_desc("LLVM IR file"), cl::init(""));
90
91static cl::opt<int> Threads("t", cl::desc("Total number of threads."), cl::init(1));
92
93static cl::opt<bool> GrepSupport("gs", cl::desc("Grep support. Pipe the output of icgrep into grep. \
94         Gives you colored output + back-referencing capability."), cl::cat(EnhancedGrepOptions));
95
96static cl::opt<bool> MultiGrepKernels("enable-multigrep-kernels", cl::desc("Construct separated kernels for each regular expression"), cl::cat(EnhancedGrepOptions));
97static cl::opt<int> REsPerGroup("re-num", cl::desc("Number of regular expressions processed by each kernel."), cl::init(1));
98static std::vector<std::string> allFiles;
//
// Callback for failures raised through llvm::report_fatal_error (main() installs it
// with llvm::install_fatal_error_handler).
//
// Builds without NDEBUG rethrow the message as a std::runtime_error. Builds with
// NDEBUG print "icgrep ERROR: <message>" to file descriptor 2, run LLVM's interrupt
// handlers and exit with status 2 (grep convention).
//
// UserData and GenCrashDiag are part of the handler signature and are not used.
//
static void icgrep_error_handler(void *UserData, const std::string &Message, bool GenCrashDiag) {
#ifdef NDEBUG
    // Modified from LLVM's internal report_fatal_error logic.
    SmallVector<char, 64> Storage;
    raw_svector_ostream Stream(Storage);
    Stream << "icgrep ERROR: " << Message << "\n";
    StringRef Text = Stream.str();
    // A failed or short write is deliberately ignored: we are already bailing out.
    ssize_t ignored = ::write(2, Text.data(), Text.size());
    (void)ignored;
    // Give registered cleanups a chance to run, in particular the removal of files
    // registered with RemoveFileOnSignal.
    llvm::sys::RunInterruptHandlers();
    exit(2);
#else
    throw std::runtime_error(Message);
#endif
}
120
121static std::string allREs;
122static re::ModeFlagSet globalFlags = 0;
123std::vector<re::RE *> RELists;
124
125re::RE * get_icgrep_RE() {
126 
127    //std::vector<std::string> regexVector;
128    if (RegexFilename != "") {
129        std::ifstream regexFile(RegexFilename.c_str());
130        std::string r;
131        if (regexFile.is_open()) {
132            while (std::getline(regexFile, r)) {
133                regexVector.push_back(r);
134            }
135            regexFile.close();
136        }
137    }
138   
139    // if there are no regexes specified through -e or -f, the first positional argument
140    // must be a regex, not an input file.
141   
142    if (regexVector.size() == 0) {
143        regexVector.push_back(inputFiles[0]);
144        inputFiles.erase(inputFiles.begin());
145    }
146    if (CaseInsensitive) globalFlags |= re::CASE_INSENSITIVE_MODE_FLAG;
147
148    std::vector<re::RE *> REs;
149    re::RE * re_ast = nullptr;
150    for (unsigned i = 0; i < regexVector.size(); i++) {
151#ifdef FUTURE
152        re_ast = re::RE_Parser::parse(regexVector[i], globalFlags, RegexpSyntax);
153#else
154        re_ast = re::RE_Parser::parse(regexVector[i], globalFlags);
155#endif
156        REs.push_back(re_ast);
157        allREs += regexVector[i] + "\n";
158    }
159
160    std::vector<re::RE *>::iterator start = REs.begin();
161    std::vector<re::RE *>::iterator end = start + REsPerGroup;
162    while(end < REs.end()) {
163        RELists.push_back(re::makeAlt(start, end));
164        start = end;
165        end += REsPerGroup;
166    }
167    if(REs.end()-start>1)
168        RELists.push_back(re::makeAlt(start, REs.end()));
169    else
170        RELists.push_back(*start);
171
172    if (REs.size() > 1) {
173        re_ast = re::makeAlt(REs.begin(), REs.end());
174    }
175    if (WholeWordMatching) {
176        re_ast = re::makeSeq({re::makeWordBoundary(), re_ast, re::makeWordBoundary()});
177    }
178    if (EntireLineMatching) {
179        re_ast = re::makeSeq({re::makeStart(), re_ast, re::makeEnd()});
180    }   
181    return re_ast;
182}
183
184std::string sha1sum(const std::string & str) {
185    char buffer[41];    // 40 hex-digits and the terminating null
186    unsigned int digest[5];     // 160 bits in total
187
188    boost::uuids::detail::sha1 sha1;
189    sha1.process_bytes(str.c_str(), str.size());
190    sha1.get_digest(digest);
191    snprintf(buffer, sizeof(buffer), "%.8x%.8x%.8x%.8x%.8x",
192             digest[0], digest[1], digest[2], digest[3], digest[4]);
193    return std::string(buffer);
194}
195
196std::vector<size_t> total_CountOnly;
197std::mutex count_mutex;
198size_t fileCount;
199void *DoGrep(void *args)
200{
201    size_t fileIdx;
202    GrepEngine * grepEngine = (GrepEngine *)args;
203
204    count_mutex.lock();
205    fileIdx = fileCount;
206    fileCount++;
207    count_mutex.unlock();
208
209    while (fileIdx < allFiles.size()){
210        grepEngine->doGrep(allFiles[fileIdx], fileIdx, CountOnly, total_CountOnly);
211       
212        count_mutex.lock();
213        fileIdx = fileCount;
214        fileCount++;
215        count_mutex.unlock();
216    }
217
218    pthread_exit(nullptr);
219}
220
221
// Returns true if the command line argument shouldn't be passed to icGrep or Grep.
// The comparison is an exact, whole-string match against the list below.
bool isArgUnwantedForAll(char *argument) {
    const std::vector<std::string> dropped = {"-gs"};
    for (const std::string & flag : dropped) {
        if (flag == argument) {
            return true;
        }
    }
    return false;
}
232// Filters out the command line strings that shouldn't be passed on to Grep
233bool isArgUnwantedForGrep(char *argument) {
234#ifdef FUTURE
235    std::vector<std::string> unwantedFlags = {"-n", "-P", "-G", "-E", "-PRO"};
236#else
237    std::vector<std::string> unwantedFlags = {"-n"};
238#endif
239
240    for (unsigned i = 0; i < unwantedFlags.size(); ++i){
241        if (strcmp(argument, unwantedFlags[i].c_str()) == 0) {
242            return true;
243        }
244    }
245
246    for (unsigned i = 0; i < inputFiles.size(); ++i){    // filter out input content files.
247        if (strcmp(argument, inputFiles[i].c_str()) == 0) {
248            return true;
249        }
250    }
251
252    return false;
253}
// Filters out the command line strings that shouldn't be passed on to IcGrep.
// Returns true when argument is exactly one of the flags listed below.
bool isArgUnwantedForIcGrep(char *argument) {
    const std::vector<std::string> icGrepRejects = {"-c"};

    bool rejected = false;
    for (const std::string & flag : icGrepRejects) {
        rejected = rejected || (flag == argument);
    }
    return rejected;
}
267
268/*
269* Constructs a shell command that calls icgrep and then pipes the output to grep.
270* Then executs this shell command using the "system()" function.
271* This allows the output to be colored since all output is piped to grep.
272*/ 
273void pipeIcGrepOutputToGrep(int argc, char *argv[]) {
274    std::string icGrepArguments = "";
275    std::string grepArguments = "";
276
277    // Construct the shell arguments for icgrep and grep
278    // by filtering out the command line arguments passed into this process.
279    for (int i = 1; i < argc; i++) {
280        if (!isArgUnwantedForAll(argv[i])) {
281
282            if (!isArgUnwantedForIcGrep(argv[i])) {
283                // Wrap everything in quotes since the arguments passed into this program had them stripped by bash.
284                icGrepArguments.append("\"");       
285                icGrepArguments.append(argv[i]);
286                icGrepArguments.append("\" ");
287            }
288
289            if (!isArgUnwantedForGrep(argv[i])) {
290                grepArguments.append("\"");
291                grepArguments.append(argv[i]);
292                grepArguments.append("\" ");
293            }
294        }
295    }
296
297#ifdef FUTURE
298    switch (RegexpSyntax) {
299        case re::RE_Syntax::BRE:
300            grepArguments.append("\"-G\" ");
301            break;
302        case re::RE_Syntax::ERE:
303            grepArguments.append("\"-E\" ");
304            break;
305        case re::RE_Syntax::PROSITE:
306            grepArguments.append("\"-PRO\" ");
307            break;
308        case re::RE_Syntax::PCRE:
309            grepArguments.append("\"-P\" ");
310            break;
311        default:
312            //TODO: handle fix string
313            break;
314    }
315#endif
316
317    std::string systemCall = argv[0];
318    systemCall.append(" ");
319    systemCall.append(icGrepArguments);
320    systemCall.append(" ");
321#ifdef FUTURE
322    systemCall.append(" | grep --color=always ");
323#else
324    systemCall.append(" | grep --color=always -P ");
325#endif
326    systemCall.append(grepArguments);
327
328    const auto rc = system(systemCall.c_str());
329    if (LLVM_UNLIKELY(rc < 0)) {
330        throw std::runtime_error("Error calling grep: " + std::string(strerror(errno)));
331    }
332}
333
334
335// This is a stub, to be expanded later.
336bool excludeDirectory(boost::filesystem::path dirpath) { return dirpath.filename() == ".svn";}
337
338std::vector<std::string> getFullFileList(cl::list<std::string> & inputFiles) {
339    using namespace boost::filesystem;
340    symlink_option follow_symlink = FollowSubdirectorySymlinks ? symlink_option::recurse : symlink_option::none;
341    std::vector<std::string> expanded_paths;
342    boost::system::error_code errc;
343    if (FollowSubdirectorySymlinks) {
344        EnterDirectoriesRecursively = true;
345    }
346    for (auto & f : inputFiles) {
347        path p(f);
348        if (EnterDirectoriesRecursively && is_directory(p)) {
349            if (!excludeDirectory(p)) {
350                recursive_directory_iterator di(p, follow_symlink, errc), end;
351                if (errc) {
352                    // If we cannot enter the directory, keep it in the list of files.
353                    expanded_paths.push_back(f); 
354                    continue;
355                }
356                while (di != end) {
357                    auto & e = di->path();
358                    if (is_directory(e)) {
359                        if (excludeDirectory(e)) di.no_push();
360                    }
361                    else expanded_paths.push_back(e.string());
362                    di.increment(errc);
363                    if (errc) {
364                        expanded_paths.push_back(e.string()); 
365                    }
366                }
367            }
368        }
369        else expanded_paths.push_back(p.string());
370    }
371    return expanded_paths;
372}
373
374
375int main(int argc, char *argv[]) {
376    llvm::install_fatal_error_handler(&icgrep_error_handler);
377    AddParabixVersionPrinter();
378#ifndef USE_LLVM_3_6
379    cl::HideUnrelatedOptions(ArrayRef<const cl::OptionCategory *>{&LegacyGrepOptions, &EnhancedGrepOptions, re::re_toolchain_flags(), pablo::pablo_toolchain_flags(), codegen::codegen_flags()});
380#endif
381    cl::ParseCommandLineOptions(argc, argv);
382#ifdef FUTURE
383    if (RegexpSyntax == re::RE_Syntax::FixedStrings) {
384        llvm::report_fatal_error("Sorry, FixedStrings syntax is not fully supported\n.");
385    }
386#endif
387    re::RE * re_ast = get_icgrep_RE();
388    std::string module_name = "grepcode:" + sha1sum(allREs) + ":" + std::to_string(globalFlags);
389   
390    if (GrepSupport) {  // Calls icgrep again on command line and passes output to grep.
391        pipeIcGrepOutputToGrep(argc, argv);
392        return 0;   // icgrep is called again, so we need to end this process.
393    }
394
395    const bool usingStdIn = hasInputFromStdIn();
396
397    GrepEngine grepEngine;
398    if (MultiGrepKernels) {
399        grepEngine.multiGrepCodeGen(module_name, RELists, CountOnly, UTF_16);
400    } else {
401        grepEngine.grepCodeGen(module_name, re_ast, CountOnly, UTF_16, GrepType::Normal, usingStdIn);
402    }
403
404    if (usingStdIn)  {
405
406        allFiles = { "stdin" };
407        initFileResult(allFiles);
408        total_CountOnly.push_back(0);
409        grepEngine.doGrep(0, CountOnly, total_CountOnly);
410
411    } else {
412
413        allFiles = getFullFileList(inputFiles);
414
415        if (FileNamesOnly && NonMatchingFileNamesOnly) {
416            // Strange request: print names of all matching files and all non-matching files: i.e., all of them.
417            // (Although GNU grep prints nothing.)
418            for (auto & f : allFiles) {
419                if (boost::filesystem::exists(f)) {
420                    std::cout << f << "\n";
421                } else {
422                    std::cerr << "Error: cannot open " << f << " for processing. Skipped.\n";
423                }
424            }
425            exit(0);
426        }
427
428        if (FileNamesOnly) {
429            llvm::report_fatal_error("Sorry, -l/-files-with-matches not yet supported\n.");
430        }
431        if (NonMatchingFileNamesOnly) {
432            llvm::report_fatal_error("Sorry, -L/-files-without-match not yet supported\n.");
433        }
434        initFileResult(allFiles);
435
436        for (unsigned i=0; i < allFiles.size(); ++i){
437            total_CountOnly.push_back(0);
438        }
439
440        if (Threads <= 1) {
441
442            #ifdef PRINT_TIMING_INFORMATION
443            // PAPI_RES_STL, PAPI_STL_CCY, PAPI_FUL_CCY, PAPI_MEM_WCY
444            // PAPI_RES_STL, PAPI_BR_MSP, PAPI_LST_INS, PAPI_L1_TCM
445            papi::PapiCounter<4> papiCounters({PAPI_RES_STL, PAPI_STL_CCY, PAPI_FUL_CCY, PAPI_MEM_WCY});
446            #endif
447            for (unsigned i = 0; i != allFiles.size(); ++i) {
448                #ifdef PRINT_TIMING_INFORMATION
449                papiCounters.start();
450                const timestamp_t execution_start = read_cycle_counter();
451                #endif
452                grepEngine.doGrep(allFiles[i], i, CountOnly, total_CountOnly);
453                #ifdef PRINT_TIMING_INFORMATION
454                const timestamp_t execution_end = read_cycle_counter();
455                papiCounters.stop();
456                std::cerr << "EXECUTION TIME: " << allFiles[i] << ":" << "CYCLES|" << (execution_end - execution_start) << papiCounters << std::endl;
457                #endif
458            }
459        } else if (Threads > 1) {
460            const unsigned numOfThreads = Threads; // <- convert the command line value into an integer to allow stack allocation
461            pthread_t threads[numOfThreads];
462
463            for(unsigned long i = 0; i < numOfThreads; ++i){
464                const int rc = pthread_create(&threads[i], nullptr, DoGrep, (void *)&grepEngine);
465                if (rc) {
466                    llvm::report_fatal_error("Failed to create thread: code " + std::to_string(rc));
467                }
468            }
469
470            for(unsigned i = 0; i < numOfThreads; ++i) {
471                void * status = nullptr;
472                const int rc = pthread_join(threads[i], &status);
473                if (rc) {
474                    llvm::report_fatal_error("Failed to join thread: code " + std::to_string(rc));
475                }
476            }
477        }
478
479    }
480   
481
482   
483    PrintResult(CountOnly, total_CountOnly);
484   
485    return 0;
486}
Note: See TracBrowser for help on using the repository browser.