source: icGREP/icgrep-devel/icgrep/icgrep.cpp @ 5320

Last change on this file since 5320 was 5320, checked in by nmedfort, 2 years ago

memcpy/memset support for 32-bit systems; more error messages/handling; bug fix for ParabixCharacterClassKernelBuilder; continued work on parenthesis matching and expandable buffers.

File size: 16.9 KB
Line 
1/*
2 *  Copyright (c) 2016 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include <cstdio>
8#include <vector>
9#include <llvm/Support/CommandLine.h>
10#include <llvm/Support/ErrorHandling.h>
11#include <llvm/Support/Signals.h>
12#include <llvm/Support/raw_ostream.h>
13#include <re/re_alt.h>
14#include <re/re_seq.h>
15#include <re/re_start.h>
16#include <re/re_end.h>
17#include <re/re_parser.h>
18#include <re/re_utility.h>
19#include <grep_engine.h>
20#include <fstream>
21#include <string>
22#include <boost/uuid/sha1.hpp>
23#include <toolchain.h>
24#include <re/re_toolchain.h>
25#include <pablo/pablo_toolchain.h>
26#include <mutex>
27#include <boost/filesystem.hpp>
28#include <iostream> // MEEE
29#ifdef PRINT_TIMING_INFORMATION
30#include <hrtime.h>
31#include <util/papi_helper.hpp>
32#endif
33
34using namespace llvm;
35
36static cl::OptionCategory LegacyGrepOptions("A. Standard Grep Options",
37                                       "These are standard grep options intended for compatibility with typical grep usage.");
38
39#ifdef FUTURE
40static cl::OptionCategory RegexpOptions("Regular Expression Interpretation", "These options control regular expression interpretation");
41static cl::opt<re::RE_Syntax> RegexpSyntax(cl::desc("Regular expression syntax:"),
42    cl::values(
43        clEnumValN(re::RE_Syntax::FixedStrings, "F", "Fixed strings, separated by newlines"),
44        clEnumValN(re::RE_Syntax::BRE, "G", "Posix basic regular expression (BRE) syntax"),
45        clEnumValN(re::RE_Syntax::ERE, "E", "Posix extended regular expression (ERE) syntax"),
46        clEnumValN(re::RE_Syntax::PROSITE, "PRO", "PROSITE protein patterns syntax"),
47        clEnumValN(re::RE_Syntax::PCRE, "P", "Perl-compatible regular expression (PCRE) syntax - default"),
48               clEnumValEnd), cl::cat(LegacyGrepOptions), cl::Grouping, cl::init(re::RE_Syntax::PCRE));
49#endif
50
51static cl::opt<bool> EntireLineMatching("x", cl::desc("Require that entire lines be matched."), cl::cat(LegacyGrepOptions), cl::Grouping);
52static cl::alias  EntireLineMatchingAlias("line-regexp", cl::desc("Alias for -x"), cl::aliasopt(EntireLineMatching));
53
54static cl::opt<bool> WholeWordMatching("w", cl::desc("Require that whole words be matched."), cl::cat(LegacyGrepOptions), cl::Grouping);
55static cl::alias WholeWordMatchingAlias("word-regexp", cl::desc("Alias for -w"), cl::aliasopt(WholeWordMatching));
56
57static cl::opt<bool> UTF_16("UTF-16", cl::desc("Regular expressions over the UTF-16 representation of Unicode."), cl::cat(LegacyGrepOptions));
58static cl::OptionCategory EnhancedGrepOptions("B. Enhanced Grep Options",
59                                       "These are additional options for icgrep functionality and performance.");
60
61static cl::opt<bool> FileNamesOnly("l", cl::desc("Display only the names of matching files."), cl::cat(LegacyGrepOptions), cl::Grouping);
62static cl::alias FileNamesAlias("files-with-matches", cl::desc("Alias for -l"), cl::aliasopt(FileNamesOnly));
63
64static cl::opt<bool> NonMatchingFileNamesOnly("L", cl::desc("Display only the names of matching files."), cl::cat(LegacyGrepOptions), cl::Grouping);
65static cl::alias NonMatchingFileNamesAlias("files-without-match", cl::desc("Alias for -L"), cl::aliasopt(NonMatchingFileNamesOnly));
66
67
68static cl::opt<bool> CountOnly("c", cl::desc("Count and display the matching lines per file only."), cl::cat(LegacyGrepOptions), cl::Grouping);
69static cl::alias CountOnlyLong("count", cl::desc("Alias for -c"), cl::aliasopt(CountOnly));
70
71
72static cl::list<std::string> inputFiles(cl::Positional, cl::desc("<regex> <input file ...>"), cl::OneOrMore);
73
74static cl::opt<bool> EnterDirectoriesRecursively("r", cl::desc("Recursively process files within directories, (but follow only top-level symlinks unless -R)."), cl::cat(LegacyGrepOptions), cl::Grouping);
75static cl::opt<bool> FollowSubdirectorySymlinks("R", cl::desc("Recursively process files within directories, following symlinks at all levels."), cl::cat(LegacyGrepOptions), cl::Grouping);
76static cl::opt<bool> CaseInsensitive("i", cl::desc("Ignore case distinctions in the pattern and the file."), cl::cat(LegacyGrepOptions), cl::Grouping);
77static cl::alias CaseInsensitiveAlisas("ignore-case", cl::desc("Ignore case distinctions in the pattern and the file."), cl::aliasopt(CaseInsensitive));
78
79static cl::list<std::string> regexVector("e", cl::desc("Regular expression"), cl::ZeroOrMore, cl::cat(LegacyGrepOptions));
80static cl::opt<std::string> RegexFilename("f", cl::desc("Take regular expressions (one per line) from a file"), cl::value_desc("regex file"), cl::init(""), cl::cat(LegacyGrepOptions));
81static cl::opt<std::string> IRFileName("precompiled", cl::desc("Use precompiled regular expression"), cl::value_desc("LLVM IR file"), cl::init(""));
82
83static cl::opt<int> Threads("t", cl::desc("Total number of threads."), cl::init(1));
84
85static cl::opt<bool> GrepSupport("gs", cl::desc("Grep support. Pipe the output of icgrep into grep. \
86         Gives you colored output + back-referencing capability."), cl::cat(EnhancedGrepOptions));
87
88
89static std::vector<std::string> allFiles;
//
// Handler for errors reported through llvm::report_fatal_error.
// Builds without NDEBUG rethrow the message as a std::runtime_error.
// Builds with NDEBUG print the message on file descriptor 2 and terminate
// the process with exit status 2 (grep convention).
//
static void icgrep_error_handler(void *UserData, const std::string &Message, bool GenCrashDiag) {
#ifdef NDEBUG
    // Adapted from LLVM's internal report_fatal_error logic.
    SmallVector<char, 64> storage;
    raw_svector_ostream stream(storage);
    stream << "icgrep ERROR: " << Message << "\n";
    const StringRef text = stream.str();
    // Best effort only: a failed or short write is deliberately ignored.
    const ssize_t ignored = ::write(2, text.data(), text.size());
    (void)ignored;
    // Let the registered interrupt handlers perform their special cleanups, in
    // particular removal of files registered with RemoveFileOnSignal.
    llvm::sys::RunInterruptHandlers();
    exit(2);
#else
    throw std::runtime_error(Message);
#endif
}
111
112static std::string allREs;
113static re::ModeFlagSet globalFlags = 0;
114
115re::RE * get_icgrep_RE() {
116 
117    //std::vector<std::string> regexVector;
118    if (RegexFilename != "") {
119        std::ifstream regexFile(RegexFilename.c_str());
120        std::string r;
121        if (regexFile.is_open()) {
122            while (std::getline(regexFile, r)) {
123                regexVector.push_back(r);
124            }
125            regexFile.close();
126        }
127    }
128   
129    // if there are no regexes specified through -e or -f, the first positional argument
130    // must be a regex, not an input file.
131   
132    if (regexVector.size() == 0) {
133        regexVector.push_back(inputFiles[0]);
134        inputFiles.erase(inputFiles.begin());
135    }
136    if (CaseInsensitive) globalFlags |= re::CASE_INSENSITIVE_MODE_FLAG;
137
138 
139    std::vector<re::RE *> REs;
140    re::RE * re_ast = nullptr;
141    for (unsigned i = 0; i < regexVector.size(); i++) {
142#ifdef FUTURE
143        re_ast = re::RE_Parser::parse(regexVector[i], globalFlags, RegexpSyntax);
144#else
145        re_ast = re::RE_Parser::parse(regexVector[i], globalFlags);
146#endif
147        REs.push_back(re_ast);
148        allREs += regexVector[i] + "\n";
149    }
150    if (REs.size() > 1) {
151        re_ast = re::makeAlt(REs.begin(), REs.end());
152    }
153    if (WholeWordMatching) {
154        re_ast = re::makeSeq({re::makeWordBoundary(), re_ast, re::makeWordBoundary()});
155    }
156    if (EntireLineMatching) {
157        re_ast = re::makeSeq({re::makeStart(), re_ast, re::makeEnd()});
158    }   
159    return re_ast;
160}
161
162std::string sha1sum(const std::string & str) {
163    char buffer[41];    // 40 hex-digits and the terminating null
164    unsigned int digest[5];     // 160 bits in total
165
166    boost::uuids::detail::sha1 sha1;
167    sha1.process_bytes(str.c_str(), str.size());
168    sha1.get_digest(digest);
169    snprintf(buffer, sizeof(buffer), "%.8x%.8x%.8x%.8x%.8x",
170             digest[0], digest[1], digest[2], digest[3], digest[4]);
171    return std::string(buffer);
172}
173
174std::vector<size_t> total_CountOnly;
175std::mutex count_mutex;
176size_t fileCount;
177void *DoGrep(void *args)
178{
179    size_t fileIdx;
180    GrepEngine * grepEngine = (GrepEngine *)args;
181
182    count_mutex.lock();
183    fileIdx = fileCount;
184    fileCount++;
185    count_mutex.unlock();
186
187    while (fileIdx < allFiles.size()){
188        grepEngine->doGrep(allFiles[fileIdx], fileIdx, CountOnly, total_CountOnly, UTF_16);
189       
190        count_mutex.lock();
191        fileIdx = fileCount;
192        fileCount++;
193        count_mutex.unlock();
194    }
195
196    pthread_exit(nullptr);
197}
198
199
// Tells whether a command line argument must be forwarded to neither icgrep
// nor grep when the pipeline for -gs is assembled.
bool isArgUnwantedForAll(char *argument) {
    static const char * const neverForwarded[] = {"-gs"};
    const std::string candidate(argument);
    for (const char * flag : neverForwarded) {
        if (candidate == flag) {
            return true;
        }
    }
    return false;
}
210// Filters out the command line strings that shouldn't be passed on to Grep
211bool isArgUnwantedForGrep(char *argument) {
212#ifdef FUTURE
213    std::vector<std::string> unwantedFlags = {"-n", "-P", "-G", "-E", "-PRO"};
214#else
215    std::vector<std::string> unwantedFlags = {"-n"};
216#endif
217
218    for (unsigned i = 0; i < unwantedFlags.size(); ++i){
219        if (strcmp(argument, unwantedFlags[i].c_str()) == 0) {
220            return true;
221        }
222    }
223
224    for (unsigned i = 0; i < inputFiles.size(); ++i){    // filter out input content files.
225        if (strcmp(argument, inputFiles[i].c_str()) == 0) {
226            return true;
227        }
228    }
229
230    return false;
231}
// Tells whether a command line argument must be withheld from the nested
// icgrep invocation when the pipeline for -gs is assembled.
bool isArgUnwantedForIcGrep(char *argument) {
    static const char * const withheldFlags[] = {"-c"};
    const std::string candidate(argument);

    for (const char * flag : withheldFlags) {
        if (candidate == flag) {
            return true;
        }
    }

    return false;
}
245
246/*
247* Constructs a shell command that calls icgrep and then pipes the output to grep.
248* Then executs this shell command using the "system()" function.
249* This allows the output to be colored since all output is piped to grep.
250*/ 
251void pipeIcGrepOutputToGrep(int argc, char *argv[]) {
252    std::string icGrepArguments = "";
253    std::string grepArguments = "";
254
255    // Construct the shell arguments for icgrep and grep
256    // by filtering out the command line arguments passed into this process.
257    for (int i = 1; i < argc; i++) {
258        if (!isArgUnwantedForAll(argv[i])) {
259
260            if (!isArgUnwantedForIcGrep(argv[i])) {
261                // Wrap everything in quotes since the arguments passed into this program had them stripped by bash.
262                icGrepArguments.append("\"");       
263                icGrepArguments.append(argv[i]);
264                icGrepArguments.append("\" ");
265            }
266
267            if (!isArgUnwantedForGrep(argv[i])) {
268                grepArguments.append("\"");
269                grepArguments.append(argv[i]);
270                grepArguments.append("\" ");
271            }
272        }
273    }
274
275#ifdef FUTURE
276    switch (RegexpSyntax) {
277        case re::RE_Syntax::BRE:
278            grepArguments.append("\"-G\" ");
279            break;
280        case re::RE_Syntax::ERE:
281            grepArguments.append("\"-E\" ");
282            break;
283        case re::RE_Syntax::PROSITE:
284            grepArguments.append("\"-PRO\" ");
285            break;
286        case re::RE_Syntax::PCRE:
287            grepArguments.append("\"-P\" ");
288            break;
289        default:
290            //TODO: handle fix string
291            break;
292    }
293#endif
294
295    std::string systemCall = argv[0];
296    systemCall.append(" ");
297    systemCall.append(icGrepArguments);
298    systemCall.append(" ");
299#ifdef FUTURE
300    systemCall.append(" | grep --color=always ");
301#else
302    systemCall.append(" | grep --color=always -P ");
303#endif
304    systemCall.append(grepArguments);
305
306    const auto rc = system(systemCall.c_str());
307    if (LLVM_UNLIKELY(rc < 0)) {
308        throw std::runtime_error("Error calling grep: " + std::string(strerror(errno)));
309    }
310}
311
312
313// This is a stub, to be expanded later.
314bool excludeDirectory(boost::filesystem::path dirpath) { return dirpath.filename() == ".svn";}
315
316std::vector<std::string> getFullFileList(cl::list<std::string> & inputFiles) {
317    using namespace boost::filesystem;
318    symlink_option follow_symlink = FollowSubdirectorySymlinks ? symlink_option::recurse : symlink_option::none;
319    std::vector<std::string> expanded_paths;
320    boost::system::error_code errc;
321    if (FollowSubdirectorySymlinks) {
322        EnterDirectoriesRecursively = true;
323    }
324    for (auto & f : inputFiles) {
325        path p(f);
326        if (EnterDirectoriesRecursively && is_directory(p)) {
327            if (!excludeDirectory(p)) {
328                recursive_directory_iterator di(p, follow_symlink, errc), end;
329                if (errc) {
330                    // If we cannot enter the directory, keep it in the list of files.
331                    expanded_paths.push_back(f); 
332                    continue;
333                }
334                while (di != end) {
335                    auto & e = di->path();
336                    if (is_directory(e)) {
337                        if (excludeDirectory(e)) di.no_push();
338                    }
339                    else expanded_paths.push_back(e.string());
340                    di.increment(errc);
341                    if (errc) {
342                        expanded_paths.push_back(e.string()); 
343                    }
344                }
345            }
346        }
347        else expanded_paths.push_back(p.string());
348    }
349    return expanded_paths;
350}
351
352
353int main(int argc, char *argv[]) {
354    llvm::install_fatal_error_handler(&icgrep_error_handler);
355#ifndef USE_LLVM_3_6
356    cl::HideUnrelatedOptions(ArrayRef<const cl::OptionCategory *>{&LegacyGrepOptions, &EnhancedGrepOptions, re::re_toolchain_flags(), pablo::pablo_toolchain_flags(), codegen::codegen_flags()});
357#endif
358    cl::ParseCommandLineOptions(argc, argv);
359#ifdef FUTURE
360    if (RegexpSyntax == re::RE_Syntax::FixedStrings) {
361        llvm::report_fatal_error("Sorry, FixedStrings syntax is not fully supported\n.");
362    }
363#endif
364    re::RE * re_ast = get_icgrep_RE();
365    std::string module_name = "grepcode:" + sha1sum(allREs) + ":" + std::to_string(globalFlags);
366   
367    if (GrepSupport) {  // Calls icgrep again on command line and passes output to grep.
368        pipeIcGrepOutputToGrep(argc, argv);
369        return 0;   // icgrep is called again, so we need to end this process.
370    }
371   
372    GrepEngine grepEngine;
373    grepEngine.grepCodeGen(module_name, re_ast, CountOnly, UTF_16);
374
375    allFiles = getFullFileList(inputFiles);
376   
377    if (FileNamesOnly && NonMatchingFileNamesOnly) {
378        // Strange request: print names of all matching files and all non-matching files: i.e., all of them.
379        // (Although GNU grep prints nothing.)
380        for (auto & f : allFiles) {
381            if (boost::filesystem::exists(f)) {
382                std::cout << f << "\n";
383            } else {
384                std::cerr << "Error: cannot open " << f << " for processing. Skipped.\n";
385            }
386        }
387        exit(0);
388    }
389    if (FileNamesOnly) {
390        llvm::report_fatal_error("Sorry, -l/-files-with-matches not yet supported\n.");
391    }
392    if (NonMatchingFileNamesOnly) {
393        llvm::report_fatal_error("Sorry, -L/-files-without-match not yet supported\n.");
394    }
395   
396    initResult(allFiles);
397    for (unsigned i=0; i < allFiles.size(); ++i){
398        total_CountOnly.push_back(0);
399    }
400
401    if (Threads <= 1) {
402
403        #ifdef PRINT_TIMING_INFORMATION
404        // PAPI_RES_STL, PAPI_STL_CCY, PAPI_FUL_CCY, PAPI_MEM_WCY
405        // PAPI_RES_STL, PAPI_BR_MSP, PAPI_LST_INS, PAPI_L1_TCM
406        papi::PapiCounter<4> papiCounters({PAPI_RES_STL, PAPI_STL_CCY, PAPI_FUL_CCY, PAPI_MEM_WCY});
407        #endif
408        for (unsigned i = 0; i != allFiles.size(); ++i) {
409            #ifdef PRINT_TIMING_INFORMATION
410            papiCounters.start();
411            const timestamp_t execution_start = read_cycle_counter();
412            #endif
413            grepEngine.doGrep(allFiles[i], i, CountOnly, total_CountOnly, UTF_16);
414            #ifdef PRINT_TIMING_INFORMATION
415            const timestamp_t execution_end = read_cycle_counter();
416            papiCounters.stop();
417            std::cerr << "EXECUTION TIME: " << allFiles[i] << ":" << "CYCLES|" << (execution_end - execution_start) << papiCounters << std::endl;
418            #endif
419        }       
420    } else if (Threads > 1) {
421        const unsigned numOfThreads = Threads; // <- convert the command line value into an integer to allow stack allocation
422        pthread_t threads[numOfThreads];
423
424        for(unsigned long i = 0; i < numOfThreads; ++i){
425            const int rc = pthread_create(&threads[i], nullptr, DoGrep, (void *)&grepEngine);
426            if (rc) {
427                llvm::report_fatal_error("Failed to create thread: code " + std::to_string(rc));
428            }
429        }
430
431        for(unsigned i = 0; i < numOfThreads; ++i) {
432            void * status = nullptr;
433            const int rc = pthread_join(threads[i], &status);
434            if (rc) {
435                llvm::report_fatal_error("Failed to join thread: code " + std::to_string(rc));
436            }
437        }
438    }
439   
440    PrintResult(CountOnly, total_CountOnly);
441   
442    return 0;
443}
Note: See TracBrowser for help on using the repository browser.