source: icGREP/icgrep-devel/icgrep/icgrep.cpp @ 5197

Last change on this file since 5197 was 5197, checked in by cameron, 3 years ago

Support for -x, -w, -l, -L flags (in progress)

File size: 16.6 KB
RevLine 
[3850]1/*
[4947]2 *  Copyright (c) 2016 International Characters.
[3850]3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
[4961]7#include <cstdio>
[5025]8#include <vector>
[4730]9#include <llvm/Support/CommandLine.h>
[5161]10#include <llvm/Support/ErrorHandling.h>
11#include <llvm/Support/Signals.h>
[4968]12#include <re/re_alt.h>
[5197]13#include <re/re_seq.h>
14#include <re/re_start.h>
15#include <re/re_end.h>
[4734]16#include <re/re_parser.h>
[5197]17#include <re/re_utility.h>
[4946]18#include <grep_engine.h>
[4968]19#include <fstream>
20#include <string>
[3850]21
[4961]22#include <boost/uuid/sha1.hpp>
[4967]23#include <toolchain.h>
[5030]24#include <re/re_toolchain.h>
[5031]25#include <pablo/pablo_toolchain.h>
[4972]26#include <mutex>
[5163]27#include <boost/filesystem.hpp>
[4961]28
[5016]29#include <iostream> // MEEE
[5156]30
31#ifdef PRINT_TIMING_INFORMATION
32#include <hrtime.h>
33#include <util/papi_helper.hpp>
34#endif
35
[5026]36static cl::OptionCategory LegacyGrepOptions("A. Standard Grep Options",
37                                       "These are standard grep options intended for compatibility with typical grep usage.");
[5167]38
39#ifdef FUTURE
40static cl::OptionCategory RegexpOptions("Regular Expression Interpretation", "These options control regular expression interpretation");
[5180]41static cl::opt<re::RE_Syntax> RegexpSyntax(cl::desc("Regular expression syntax:"),
[5167]42    cl::values(
[5180]43        clEnumValN(re::RE_Syntax::FixedStrings, "F", "Fixed strings, separated by newlines"),
44        clEnumValN(re::RE_Syntax::BRE, "G", "Posix basic regular expression (BRE) syntax"),
45        clEnumValN(re::RE_Syntax::ERE, "E", "Posix extended regular expression (ERE) syntax"),
46        clEnumValN(re::RE_Syntax::PCRE, "P", "Perl-compatible regular expression (PCRE) syntax - default"),
47               clEnumValEnd), cl::cat(LegacyGrepOptions), cl::Grouping, cl::init(re::RE_Syntax::PCRE));
[5167]48#endif
49
[5197]50static cl::opt<bool> EntireLineMatching("x", cl::desc("Require that entire lines be matched."), cl::cat(LegacyGrepOptions), cl::Grouping);
51static cl::alias  EntireLineMatchingAlias("line-regexp", cl::desc("Alias for -x"), cl::aliasopt(EntireLineMatching));
52
53static cl::opt<bool> WholeWordMatching("w", cl::desc("Require that whole words be matched."), cl::cat(LegacyGrepOptions), cl::Grouping);
54static cl::alias WholeWordMatchingAlias("word-regexp", cl::desc("Alias for -w"), cl::aliasopt(WholeWordMatching));
55
[5045]56static cl::opt<bool> UTF_16("UTF-16", cl::desc("Regular expressions over the UTF-16 representation of Unicode."), cl::cat(LegacyGrepOptions));
[5026]57static cl::OptionCategory EnhancedGrepOptions("B. Enhanced Grep Options",
58                                       "These are additional options for icgrep functionality and performance.");
[5197]59
60static cl::opt<bool> FileNamesOnly("l", cl::desc("Display only the names of matching files."), cl::cat(LegacyGrepOptions), cl::Grouping);
61static cl::alias FileNamesAlias("files-with-matches", cl::desc("Alias for -l"), cl::aliasopt(FileNamesOnly));
62
63static cl::opt<bool> NonMatchingFileNamesOnly("L", cl::desc("Display only the names of matching files."), cl::cat(LegacyGrepOptions), cl::Grouping);
64static cl::alias NonMatchingFileNamesAlias("files-without-match", cl::desc("Alias for -L"), cl::aliasopt(NonMatchingFileNamesOnly));
65
66
[5163]67static cl::opt<bool> CountOnly("c", cl::desc("Count and display the matching lines per file only."), cl::cat(LegacyGrepOptions), cl::Grouping);
[5025]68static cl::alias CountOnlyLong("count", cl::desc("Alias for -c"), cl::aliasopt(CountOnly));
[5016]69
[5197]70
[4544]71static cl::list<std::string> inputFiles(cl::Positional, cl::desc("<regex> <input file ...>"), cl::OneOrMore);
72
[5163]73static cl::opt<bool> EnterDirectoriesRecursively("r", cl::desc("Recursively process files within directories, (but follow only top-level symlinks unless -R)."), cl::cat(LegacyGrepOptions), cl::Grouping);
74static cl::opt<bool> FollowSubdirectorySymlinks("R", cl::desc("Recursively process files within directories, following symlinks at all levels."), cl::cat(LegacyGrepOptions), cl::Grouping);
75static cl::opt<bool> CaseInsensitive("i", cl::desc("Ignore case distinctions in the pattern and the file."), cl::cat(LegacyGrepOptions), cl::Grouping);
[5197]76static cl::alias CaseInsensitiveAlisas("ignore-case", cl::desc("Ignore case distinctions in the pattern and the file."), cl::aliasopt(CaseInsensitive));
[4544]77
[5026]78static cl::list<std::string> regexVector("e", cl::desc("Regular expression"), cl::ZeroOrMore, cl::cat(LegacyGrepOptions));
79static cl::opt<std::string> RegexFilename("f", cl::desc("Take regular expressions (one per line) from a file"), cl::value_desc("regex file"), cl::init(""), cl::cat(LegacyGrepOptions));
80static cl::opt<std::string> IRFileName("precompiled", cl::desc("Use precompiled regular expression"), cl::value_desc("LLVM IR file"), cl::init(""));
[4544]81
[4967]82static cl::opt<int> Threads("t", cl::desc("Total number of threads."), cl::init(1));
[4544]83
[5016]84static cl::opt<bool> GrepSupport("gs", cl::desc("Grep support. Pipe the output of icgrep into grep. \
[5026]85         Gives you colored output + back-referencing capability."), cl::cat(EnhancedGrepOptions));
[5016]86
[5163]87
88static std::vector<std::string> allFiles;
[5161]89//
90// Handler for errors reported through llvm::report_fatal_error.  Report
91// and signal error code 2 (grep convention).
92//
93static void icgrep_error_handler(void *UserData, const std::string &Message,
94                             bool GenCrashDiag) {
[5016]95
[5161]96    // Modified from LLVM's internal report_fatal_error logic.
97    SmallVector<char, 64> Buffer;
98    raw_svector_ostream OS(Buffer);
99    OS << "icgrep ERROR: " << Message << "\n";
100    StringRef MessageStr = OS.str();
101    ssize_t written = ::write(2, MessageStr.data(), MessageStr.size());
102    (void)written; // If something went wrong, we deliberately just give up.
103
104    // Run the interrupt handlers to make sure any special cleanups get done, in
105    // particular that we remove files registered with RemoveFileOnSignal.
106    llvm::sys::RunInterruptHandlers();
107    exit(2);
108}
109
[4961]110static std::string allREs;
[4963]111static re::ModeFlagSet globalFlags = 0;
[4734]112
113re::RE * get_icgrep_RE() {
114 
115    //std::vector<std::string> regexVector;
116    if (RegexFilename != "") {
117        std::ifstream regexFile(RegexFilename.c_str());
118        std::string r;
119        if (regexFile.is_open()) {
120            while (std::getline(regexFile, r)) {
121                regexVector.push_back(r);
122            }
123            regexFile.close();
124        }
125    }
126   
127    // if there are no regexes specified through -e or -f, the first positional argument
128    // must be a regex, not an input file.
129   
130    if (regexVector.size() == 0) {
131        regexVector.push_back(inputFiles[0]);
[5015]132        inputFiles.erase(inputFiles.begin());
[4734]133    }
134    if (CaseInsensitive) globalFlags |= re::CASE_INSENSITIVE_MODE_FLAG;
135
136 
137    std::vector<re::RE *> REs;
138    re::RE * re_ast = nullptr;
[4750]139    for (unsigned i = 0; i < regexVector.size(); i++) {
[5180]140#ifdef FUTURE
141        re_ast = re::RE_Parser::parse(regexVector[i], globalFlags, RegexpSyntax);
142#else
[4868]143        re_ast = re::RE_Parser::parse(regexVector[i], globalFlags);
[5180]144#endif
[4734]145        REs.push_back(re_ast);
[4961]146        allREs += regexVector[i] + "\n";
[4734]147    }
148    if (REs.size() > 1) {
149        re_ast = re::makeAlt(REs.begin(), REs.end());
150    }
[5197]151    if (WholeWordMatching) {
152        re_ast = re::makeSeq({re::makeWordBoundary(), re_ast, re::makeWordBoundary()});
153    }
154    if (EntireLineMatching) {
155        re_ast = re::makeSeq({re::makeStart(), re_ast, re::makeEnd()});
156    }   
[4734]157    return re_ast;
158}
159
[4961]160std::string sha1sum(const std::string & str) {
161    char buffer[41];    // 40 hex-digits and the terminating null
162    unsigned int digest[5];     // 160 bits in total
[4775]163
[4961]164    boost::uuids::detail::sha1 sha1;
165    sha1.process_bytes(str.c_str(), str.size());
166    sha1.get_digest(digest);
167    snprintf(buffer, sizeof(buffer), "%.8x%.8x%.8x%.8x%.8x",
168             digest[0], digest[1], digest[2], digest[3], digest[4]);
169    return std::string(buffer);
170}
171
[5107]172std::vector<size_t> total_CountOnly;
[4972]173std::mutex count_mutex;
174size_t fileCount;
[4979]175void *DoGrep(void *args)
[4967]176{
[4972]177    size_t fileIdx;
[4979]178    GrepEngine * grepEngine = (GrepEngine *)args;
[4967]179
[4972]180    count_mutex.lock();
[5028]181    fileIdx = fileCount;
[4972]182    fileCount++;
183    count_mutex.unlock();
[4979]184
[5163]185    while (fileIdx < allFiles.size()){
186        grepEngine->doGrep(allFiles[fileIdx], fileIdx, CountOnly, total_CountOnly, UTF_16);
[4972]187       
188        count_mutex.lock();
[5028]189        fileIdx = fileCount;
[4972]190        fileCount++;
191        count_mutex.unlock();
192    }
193
[4967]194    pthread_exit(NULL);
195}
196
[5016]197
// Returns true if the command line argument shouldn't be passed to icGrep or Grep.
// Only an exact match against one of the listed flags counts.
bool isArgUnwantedForAll(char *argument) {
    // Constant table: nothing is allocated per call.
    static const char * const unwantedFlags[] = {"-gs"};
    for (const char * flag : unwantedFlags) {
        if (strcmp(argument, flag) == 0) {
            return true;
        }
    }
    return false;
}
208// Filters out the command line strings that shouldn't be passed on to Grep
209bool isArgUnwantedForGrep(char *argument) {
[5180]210#ifdef FUTURE
211    std::vector<std::string> unwantedFlags = {"-n", "-P", "-G", "-E"};
212#else
[5016]213    std::vector<std::string> unwantedFlags = {"-n"};
[5180]214#endif
[5016]215
[5180]216    for (unsigned i = 0; i < unwantedFlags.size(); ++i){
[5016]217        if (strcmp(argument, unwantedFlags[i].c_str()) == 0) {
[5037]218            return true;
[5016]219        }
220    }
221
[5037]222    for (unsigned i = 0; i < inputFiles.size(); ++i){    // filter out input content files.
[5016]223        if (strcmp(argument, inputFiles[i].c_str()) == 0) {
[5037]224            return true;
[5016]225        }
226    }
227
[5037]228    return false;
[5016]229}
// Filters out the command line strings that shouldn't be passed on to IcGrep.
// Only an exact match against one of the listed flags counts.
bool isArgUnwantedForIcGrep(char *argument) {
    // Constant table: nothing is allocated per call.
    static const char * const unwantedFlags[] = {"-c"};

    for (const char * flag : unwantedFlags) {
        if (strcmp(argument, flag) == 0) {
            return true;
        }
    }

    return false;
}
243
244/*
245* Constructs a shell command that calls icgrep and then pipes the output to grep.
246* Then executs this shell command using the "system()" function.
247* This allows the output to be colored since all output is piped to grep.
248*/ 
249void pipeIcGrepOutputToGrep(int argc, char *argv[]) {
250    std::string icGrepArguments = "";
251    std::string grepArguments = "";
252
253    // Construct the shell arguments for icgrep and grep
254    // by filtering out the command line arguments passed into this process.
[5037]255    for (int i = 1; i < argc; i++) {
[5016]256        if (!isArgUnwantedForAll(argv[i])) {
257
258            if (!isArgUnwantedForIcGrep(argv[i])) {
[5138]259                // Wrap everything in quotes since the arguments passed into this program had them stripped by bash.
260                icGrepArguments.append("\"");       
[5016]261                icGrepArguments.append(argv[i]);
[5138]262                icGrepArguments.append("\" ");
[5016]263            }
264
265            if (!isArgUnwantedForGrep(argv[i])) {
[5138]266                grepArguments.append("\"");
[5016]267                grepArguments.append(argv[i]);
[5138]268                grepArguments.append("\" ");
[5016]269            }
270        }
271    }
272
[5180]273#ifdef FUTURE
274    switch (RegexpSyntax) {
275        case re::RE_Syntax::BRE:
276            grepArguments.append("\"-G\" ");
277            break;
278        case re::RE_Syntax::ERE:
279            grepArguments.append("\"-E\" ");
280            break;
281        case re::RE_Syntax::PCRE:
282            grepArguments.append("\"-P\" ");
283            break;
284        default:
285            //TODO: handle fix string
286            break;
287    }
288#endif
289
[5154]290    std::string systemCall = argv[0];
291    systemCall.append(" ");
[5016]292    systemCall.append(icGrepArguments);
293    systemCall.append(" ");
[5180]294#ifdef FUTURE
295    systemCall.append(" | grep --color=always ");
296#else
[5016]297    systemCall.append(" | grep --color=always -P ");
[5180]298#endif
[5016]299    systemCall.append(grepArguments);
[5138]300
[5016]301    system(systemCall.c_str());
302}
303
304
[5163]305// This is a stub, to be expanded later.
306bool excludeDirectory(boost::filesystem::path dirpath) { return dirpath.filename() == ".svn";}
307
308std::vector<std::string> getFullFileList(cl::list<std::string> & inputFiles) {
309    using namespace boost::filesystem;
310    symlink_option follow_symlink = FollowSubdirectorySymlinks ? symlink_option::recurse : symlink_option::none;
311    std::vector<std::string> expanded_paths;
312    boost::system::error_code errc;
313    if (FollowSubdirectorySymlinks) {
314        EnterDirectoriesRecursively = true;
315    }
316    for (auto & f : inputFiles) {
317        path p(f);
318        if (EnterDirectoriesRecursively && is_directory(p)) {
319            if (!excludeDirectory(p)) {
320                recursive_directory_iterator di(p, follow_symlink, errc), end;
321                if (errc) {
322                    // If we cannot enter the directory, keep it in the list of files.
323                    expanded_paths.push_back(f); 
324                    continue;
325                }
326                while (di != end) {
327                    auto & e = di->path();
328                    if (is_directory(e)) {
329                        if (excludeDirectory(e)) di.no_push();
330                    }
331                    else expanded_paths.push_back(e.string());
332                    di.increment(errc);
333                    if (errc) {
334                        expanded_paths.push_back(e.string()); 
335                    }
336                }
337            }
338        }
339        else expanded_paths.push_back(p.string());
340    }
341    return expanded_paths;
342}
343
344
[4325]345int main(int argc, char *argv[]) {
[5161]346    llvm::install_fatal_error_handler(&icgrep_error_handler);
[5197]347#if LLVM_VERSION_MINOR > 6
[5036]348    cl::HideUnrelatedOptions(ArrayRef<const cl::OptionCategory *>{&LegacyGrepOptions, &EnhancedGrepOptions, re::re_toolchain_flags(), pablo::pablo_toolchain_flags(), codegen::codegen_flags()});
[5186]349#endif
[4544]350    cl::ParseCommandLineOptions(argc, argv);
[5167]351#ifdef FUTURE
[5180]352    if (RegexpSyntax == re::RE_Syntax::FixedStrings) {
353        llvm::report_fatal_error("Sorry, FixedStrings syntax is not fully supported\n.");
[5167]354    }
355#endif
[4939]356    re::RE * re_ast = get_icgrep_RE();
[4963]357    std::string module_name = "grepcode:" + sha1sum(allREs) + ":" + std::to_string(globalFlags);
[5197]358   
[5016]359    if (GrepSupport) {  // Calls icgrep again on command line and passes output to grep.
360        pipeIcGrepOutputToGrep(argc, argv);
361        return 0;   // icgrep is called again, so we need to end this process.
362    }
[4730]363   
[4979]364    GrepEngine grepEngine;
[5045]365    grepEngine.grepCodeGen(module_name, re_ast, CountOnly, UTF_16);
[5052]366    //std::cerr << "grepCodeGen complete";
[5087]367
368    releaseSlabAllocatorMemory();
[5163]369   
370    allFiles = getFullFileList(inputFiles);
371   
[5197]372    if (FileNamesOnly && NonMatchingFileNamesOnly) {
373        // Strange request: print names of all matching files and all non-matching files: i.e., all of them.
374        // (Although GNU grep prints nothing.)
375        for (auto & f : allFiles) {
376            if (boost::filesystem::exists(f)) {
377                std::cout << f << "\n";
378            }
379            else {
380                std::cerr << "Error: cannot open " << f << " for processing. Skipped.\n";
381            }
382        }
383        exit(0);
384    }
385    if (FileNamesOnly) {
386        llvm::report_fatal_error("Sorry, -l/-files-with-matches not yet supported\n.");
387    }
388    if (NonMatchingFileNamesOnly) {
389        llvm::report_fatal_error("Sorry, -L/-files-without-match not yet supported\n.");
390    }
391   
[5163]392    initResult(allFiles);
393    for (unsigned i=0; i < allFiles.size(); ++i){
[5025]394        total_CountOnly.push_back(0);
395    }
[4967]396
[4968]397    if (Threads <= 1) {
[5156]398
399        #ifdef PRINT_TIMING_INFORMATION
400        // PAPI_RES_STL, PAPI_STL_CCY, PAPI_FUL_CCY, PAPI_MEM_WCY
401        // PAPI_RES_STL, PAPI_BR_MSP, PAPI_LST_INS, PAPI_L1_TCM
402        papi::PapiCounter<4> papiCounters({PAPI_RES_STL, PAPI_STL_CCY, PAPI_FUL_CCY, PAPI_MEM_WCY});
403        #endif
[5163]404        for (unsigned i = 0; i != allFiles.size(); ++i) {
[5156]405            #ifdef PRINT_TIMING_INFORMATION
406            papiCounters.start();
407            const timestamp_t execution_start = read_cycle_counter();
408            #endif
[5163]409            grepEngine.doGrep(allFiles[i], i, CountOnly, total_CountOnly, UTF_16);
[5156]410            #ifdef PRINT_TIMING_INFORMATION
411            const timestamp_t execution_end = read_cycle_counter();
412            papiCounters.stop();
[5163]413            std::cerr << "EXECUTION TIME: " << allFiles[i] << ":" << "CYCLES|" << (execution_end - execution_start) << papiCounters << std::endl;
[5156]414            #endif
[4967]415        }       
[4968]416    } else if (Threads > 1) {
417        const unsigned numOfThreads = Threads; // <- convert the command line value into an integer to allow stack allocation
418        pthread_t threads[numOfThreads];
[4967]419
[4968]420        for(unsigned long i = 0; i < numOfThreads; ++i){
[4979]421            const int rc = pthread_create(&threads[i], NULL, DoGrep, (void *)&grepEngine);
[4968]422            if (rc) {
[5161]423                llvm::report_fatal_error("Failed to create thread: code " + std::to_string(rc));
[4968]424            }
[4947]425        }
[4967]426
[4968]427        for(unsigned i = 0; i < numOfThreads; ++i) {
428            void * status = nullptr;
429            const int rc = pthread_join(threads[i], &status);
[4967]430            if (rc) {
[5161]431                llvm::report_fatal_error("Failed to join thread: code " + std::to_string(rc));
[4967]432            }
433        }
[3850]434    }
[5063]435   
[5025]436    PrintResult(CountOnly, total_CountOnly);
[4327]437   
[3850]438    return 0;
439}
Note: See TracBrowser for help on using the repository browser.