source: icGREP/icgrep-devel/icgrep/icgrep.cpp @ 5371

Last change on this file since 5371 was 5355, checked in by cameron, 2 years ago

Swizzled bitstream deletion and -enable-AVX-deletion in u8u16

File size: 17.8 KB
RevLine 
/*
 *  Copyright (c) 2016 International Characters.
 *  This software is licensed to the public under the Open Software License 3.0.
 *  icgrep is a trademark of International Characters.
 */

#include <cerrno>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream> // MEEE
#include <mutex>
#include <stdexcept>
#include <string>
#include <vector>
#include <boost/filesystem.hpp>
#include <boost/uuid/sha1.hpp>
#include <llvm/Support/CommandLine.h>
#include <llvm/Support/ErrorHandling.h>
#include <llvm/Support/Signals.h>
#include <llvm/Support/raw_ostream.h>
#include <re/re_alt.h>
#include <re/re_seq.h>
#include <re/re_start.h>
#include <re/re_end.h>
#include <re/re_parser.h>
#include <re/re_utility.h>
#include <grep_engine.h>
#include <toolchain.h>
#include <re/re_toolchain.h>
#include <pablo/pablo_toolchain.h>
#ifdef PRINT_TIMING_INFORMATION
#include <hrtime.h>
#include <util/papi_helper.hpp>
#endif

using namespace llvm;

[5026]36static cl::OptionCategory LegacyGrepOptions("A. Standard Grep Options",
37                                       "These are standard grep options intended for compatibility with typical grep usage.");
[5167]38
39#ifdef FUTURE
40static cl::OptionCategory RegexpOptions("Regular Expression Interpretation", "These options control regular expression interpretation");
[5180]41static cl::opt<re::RE_Syntax> RegexpSyntax(cl::desc("Regular expression syntax:"),
[5167]42    cl::values(
[5180]43        clEnumValN(re::RE_Syntax::FixedStrings, "F", "Fixed strings, separated by newlines"),
44        clEnumValN(re::RE_Syntax::BRE, "G", "Posix basic regular expression (BRE) syntax"),
45        clEnumValN(re::RE_Syntax::ERE, "E", "Posix extended regular expression (ERE) syntax"),
[5218]46        clEnumValN(re::RE_Syntax::PROSITE, "PRO", "PROSITE protein patterns syntax"),
[5180]47        clEnumValN(re::RE_Syntax::PCRE, "P", "Perl-compatible regular expression (PCRE) syntax - default"),
48               clEnumValEnd), cl::cat(LegacyGrepOptions), cl::Grouping, cl::init(re::RE_Syntax::PCRE));
[5167]49#endif
50
[5197]51static cl::opt<bool> EntireLineMatching("x", cl::desc("Require that entire lines be matched."), cl::cat(LegacyGrepOptions), cl::Grouping);
52static cl::alias  EntireLineMatchingAlias("line-regexp", cl::desc("Alias for -x"), cl::aliasopt(EntireLineMatching));
53
54static cl::opt<bool> WholeWordMatching("w", cl::desc("Require that whole words be matched."), cl::cat(LegacyGrepOptions), cl::Grouping);
55static cl::alias WholeWordMatchingAlias("word-regexp", cl::desc("Alias for -w"), cl::aliasopt(WholeWordMatching));
56
[5045]57static cl::opt<bool> UTF_16("UTF-16", cl::desc("Regular expressions over the UTF-16 representation of Unicode."), cl::cat(LegacyGrepOptions));
[5026]58static cl::OptionCategory EnhancedGrepOptions("B. Enhanced Grep Options",
59                                       "These are additional options for icgrep functionality and performance.");
[5197]60
61static cl::opt<bool> FileNamesOnly("l", cl::desc("Display only the names of matching files."), cl::cat(LegacyGrepOptions), cl::Grouping);
62static cl::alias FileNamesAlias("files-with-matches", cl::desc("Alias for -l"), cl::aliasopt(FileNamesOnly));
63
[5355]64static cl::opt<bool> NonMatchingFileNamesOnly("L", cl::desc("Display only the names of nonmatching files."), cl::cat(LegacyGrepOptions), cl::Grouping);
[5197]65static cl::alias NonMatchingFileNamesAlias("files-without-match", cl::desc("Alias for -L"), cl::aliasopt(NonMatchingFileNamesOnly));
66
67
[5163]68static cl::opt<bool> CountOnly("c", cl::desc("Count and display the matching lines per file only."), cl::cat(LegacyGrepOptions), cl::Grouping);
[5025]69static cl::alias CountOnlyLong("count", cl::desc("Alias for -c"), cl::aliasopt(CountOnly));
[5016]70
[5197]71
[4544]72static cl::list<std::string> inputFiles(cl::Positional, cl::desc("<regex> <input file ...>"), cl::OneOrMore);
73
[5163]74static cl::opt<bool> EnterDirectoriesRecursively("r", cl::desc("Recursively process files within directories, (but follow only top-level symlinks unless -R)."), cl::cat(LegacyGrepOptions), cl::Grouping);
75static cl::opt<bool> FollowSubdirectorySymlinks("R", cl::desc("Recursively process files within directories, following symlinks at all levels."), cl::cat(LegacyGrepOptions), cl::Grouping);
76static cl::opt<bool> CaseInsensitive("i", cl::desc("Ignore case distinctions in the pattern and the file."), cl::cat(LegacyGrepOptions), cl::Grouping);
[5197]77static cl::alias CaseInsensitiveAlisas("ignore-case", cl::desc("Ignore case distinctions in the pattern and the file."), cl::aliasopt(CaseInsensitive));
[4544]78
[5026]79static cl::list<std::string> regexVector("e", cl::desc("Regular expression"), cl::ZeroOrMore, cl::cat(LegacyGrepOptions));
80static cl::opt<std::string> RegexFilename("f", cl::desc("Take regular expressions (one per line) from a file"), cl::value_desc("regex file"), cl::init(""), cl::cat(LegacyGrepOptions));
81static cl::opt<std::string> IRFileName("precompiled", cl::desc("Use precompiled regular expression"), cl::value_desc("LLVM IR file"), cl::init(""));
[4544]82
[4967]83static cl::opt<int> Threads("t", cl::desc("Total number of threads."), cl::init(1));
[4544]84
[5016]85static cl::opt<bool> GrepSupport("gs", cl::desc("Grep support. Pipe the output of icgrep into grep. \
[5026]86         Gives you colored output + back-referencing capability."), cl::cat(EnhancedGrepOptions));
[5016]87
[5344]88static cl::opt<bool> MultiGrepKernels("enable-multigrep-kernels", cl::desc("Construct separated kernels for each regular expression"), cl::cat(EnhancedGrepOptions));
89static cl::opt<int> REsPerGroup("re-num", cl::desc("Number of regular expressions processed by each kernel."), cl::init(1));
[5163]90static std::vector<std::string> allFiles;
//
// Callback for errors raised through llvm::report_fatal_error.
// Builds without NDEBUG rethrow the message as a std::runtime_error; builds with
// NDEBUG print it to stderr and terminate with exit code 2 (grep convention).
//
static void icgrep_error_handler(void *UserData, const std::string &Message, bool GenCrashDiag) {
#ifndef NDEBUG
    throw std::runtime_error(Message);
#else
    // Modified from LLVM's internal report_fatal_error logic.
    SmallVector<char, 64> text;
    raw_svector_ostream stream(text);
    stream << "icgrep ERROR: " << Message << "\n";
    const StringRef report = stream.str();
    // If the write fails or is short we deliberately just give up.
    const ssize_t ignored = ::write(2, report.data(), report.size());
    (void)ignored;
    // Let the interrupt handlers perform their cleanups, in particular removing
    // files registered with RemoveFileOnSignal.
    llvm::sys::RunInterruptHandlers();
    exit(2);
#endif
}

[4961]113static std::string allREs;
[4963]114static re::ModeFlagSet globalFlags = 0;
[5344]115std::vector<re::RE *> RELists;
[4734]116
117re::RE * get_icgrep_RE() {
118 
119    //std::vector<std::string> regexVector;
120    if (RegexFilename != "") {
121        std::ifstream regexFile(RegexFilename.c_str());
122        std::string r;
123        if (regexFile.is_open()) {
124            while (std::getline(regexFile, r)) {
125                regexVector.push_back(r);
126            }
127            regexFile.close();
128        }
129    }
130   
131    // if there are no regexes specified through -e or -f, the first positional argument
132    // must be a regex, not an input file.
133   
134    if (regexVector.size() == 0) {
135        regexVector.push_back(inputFiles[0]);
[5015]136        inputFiles.erase(inputFiles.begin());
[4734]137    }
138    if (CaseInsensitive) globalFlags |= re::CASE_INSENSITIVE_MODE_FLAG;
139
[5344]140    std::vector<re::RE *> REs;
[4734]141    re::RE * re_ast = nullptr;
[4750]142    for (unsigned i = 0; i < regexVector.size(); i++) {
[5180]143#ifdef FUTURE
144        re_ast = re::RE_Parser::parse(regexVector[i], globalFlags, RegexpSyntax);
145#else
[4868]146        re_ast = re::RE_Parser::parse(regexVector[i], globalFlags);
[5180]147#endif
[4734]148        REs.push_back(re_ast);
[4961]149        allREs += regexVector[i] + "\n";
[4734]150    }
[5344]151
152    std::vector<re::RE *>::iterator start = REs.begin();
153    std::vector<re::RE *>::iterator end = start + REsPerGroup;
154    while(end < REs.end()) {
155        RELists.push_back(re::makeAlt(start, end));
156        start = end;
157        end += REsPerGroup;
158    }
159    if(REs.end()-start>1)
160        RELists.push_back(re::makeAlt(start, REs.end()));
161    else
162        RELists.push_back(*start);
163
[4734]164    if (REs.size() > 1) {
165        re_ast = re::makeAlt(REs.begin(), REs.end());
166    }
[5197]167    if (WholeWordMatching) {
168        re_ast = re::makeSeq({re::makeWordBoundary(), re_ast, re::makeWordBoundary()});
169    }
170    if (EntireLineMatching) {
171        re_ast = re::makeSeq({re::makeStart(), re_ast, re::makeEnd()});
172    }   
[4734]173    return re_ast;
174}
175
[4961]176std::string sha1sum(const std::string & str) {
177    char buffer[41];    // 40 hex-digits and the terminating null
178    unsigned int digest[5];     // 160 bits in total
[4775]179
[4961]180    boost::uuids::detail::sha1 sha1;
181    sha1.process_bytes(str.c_str(), str.size());
182    sha1.get_digest(digest);
183    snprintf(buffer, sizeof(buffer), "%.8x%.8x%.8x%.8x%.8x",
184             digest[0], digest[1], digest[2], digest[3], digest[4]);
185    return std::string(buffer);
186}
187
[5107]188std::vector<size_t> total_CountOnly;
[4972]189std::mutex count_mutex;
190size_t fileCount;
[4979]191void *DoGrep(void *args)
[4967]192{
[4972]193    size_t fileIdx;
[4979]194    GrepEngine * grepEngine = (GrepEngine *)args;
[4967]195
[4972]196    count_mutex.lock();
[5028]197    fileIdx = fileCount;
[4972]198    fileCount++;
199    count_mutex.unlock();
[4979]200
[5163]201    while (fileIdx < allFiles.size()){
202        grepEngine->doGrep(allFiles[fileIdx], fileIdx, CountOnly, total_CountOnly, UTF_16);
[4972]203       
204        count_mutex.lock();
[5028]205        fileIdx = fileCount;
[4972]206        fileCount++;
207        count_mutex.unlock();
208    }
209
[5267]210    pthread_exit(nullptr);
[4967]211}
212
[5016]213
// Returns true if the command line argument shouldn't be passed to icGrep or Grep.
// argument must be a non-null, null-terminated string.
bool isArgUnwantedForAll(char *argument) {
    // Static table: nothing is allocated per call (this runs once per argv entry).
    static const char * const unwantedFlags[] = {"-gs"};
    for (const char * flag : unwantedFlags) {
        if (std::strcmp(argument, flag) == 0) {
            return true;
        }
    }
    return false;
}
224// Filters out the command line strings that shouldn't be passed on to Grep
225bool isArgUnwantedForGrep(char *argument) {
[5180]226#ifdef FUTURE
[5218]227    std::vector<std::string> unwantedFlags = {"-n", "-P", "-G", "-E", "-PRO"};
[5180]228#else
[5016]229    std::vector<std::string> unwantedFlags = {"-n"};
[5180]230#endif
[5016]231
[5180]232    for (unsigned i = 0; i < unwantedFlags.size(); ++i){
[5016]233        if (strcmp(argument, unwantedFlags[i].c_str()) == 0) {
[5037]234            return true;
[5016]235        }
236    }
237
[5037]238    for (unsigned i = 0; i < inputFiles.size(); ++i){    // filter out input content files.
[5016]239        if (strcmp(argument, inputFiles[i].c_str()) == 0) {
[5037]240            return true;
[5016]241        }
242    }
243
[5037]244    return false;
[5016]245}
// Filters out the command line strings that shouldn't be passed on to IcGrep.
// argument must be a non-null, null-terminated string.
bool isArgUnwantedForIcGrep(char *argument) {
    // Static table: nothing is allocated per call (this runs once per argv entry).
    static const char * const unwantedFlags[] = {"-c"};
    for (const char * flag : unwantedFlags) {
        if (std::strcmp(argument, flag) == 0) {
            return true;
        }
    }
    return false;
}

260/*
261* Constructs a shell command that calls icgrep and then pipes the output to grep.
262* Then executs this shell command using the "system()" function.
263* This allows the output to be colored since all output is piped to grep.
264*/ 
265void pipeIcGrepOutputToGrep(int argc, char *argv[]) {
266    std::string icGrepArguments = "";
267    std::string grepArguments = "";
268
269    // Construct the shell arguments for icgrep and grep
270    // by filtering out the command line arguments passed into this process.
[5037]271    for (int i = 1; i < argc; i++) {
[5016]272        if (!isArgUnwantedForAll(argv[i])) {
273
274            if (!isArgUnwantedForIcGrep(argv[i])) {
[5138]275                // Wrap everything in quotes since the arguments passed into this program had them stripped by bash.
276                icGrepArguments.append("\"");       
[5016]277                icGrepArguments.append(argv[i]);
[5138]278                icGrepArguments.append("\" ");
[5016]279            }
280
281            if (!isArgUnwantedForGrep(argv[i])) {
[5138]282                grepArguments.append("\"");
[5016]283                grepArguments.append(argv[i]);
[5138]284                grepArguments.append("\" ");
[5016]285            }
286        }
287    }
288
[5180]289#ifdef FUTURE
290    switch (RegexpSyntax) {
291        case re::RE_Syntax::BRE:
292            grepArguments.append("\"-G\" ");
293            break;
294        case re::RE_Syntax::ERE:
295            grepArguments.append("\"-E\" ");
296            break;
[5218]297        case re::RE_Syntax::PROSITE:
298            grepArguments.append("\"-PRO\" ");
299            break;
[5180]300        case re::RE_Syntax::PCRE:
301            grepArguments.append("\"-P\" ");
302            break;
303        default:
304            //TODO: handle fix string
305            break;
306    }
307#endif
308
[5154]309    std::string systemCall = argv[0];
310    systemCall.append(" ");
[5016]311    systemCall.append(icGrepArguments);
312    systemCall.append(" ");
[5180]313#ifdef FUTURE
314    systemCall.append(" | grep --color=always ");
315#else
[5016]316    systemCall.append(" | grep --color=always -P ");
[5180]317#endif
[5016]318    systemCall.append(grepArguments);
[5138]319
[5236]320    const auto rc = system(systemCall.c_str());
321    if (LLVM_UNLIKELY(rc < 0)) {
322        throw std::runtime_error("Error calling grep: " + std::string(strerror(errno)));
323    }
[5016]324}
325
326
[5163]327// This is a stub, to be expanded later.
328bool excludeDirectory(boost::filesystem::path dirpath) { return dirpath.filename() == ".svn";}
329
330std::vector<std::string> getFullFileList(cl::list<std::string> & inputFiles) {
331    using namespace boost::filesystem;
332    symlink_option follow_symlink = FollowSubdirectorySymlinks ? symlink_option::recurse : symlink_option::none;
333    std::vector<std::string> expanded_paths;
334    boost::system::error_code errc;
335    if (FollowSubdirectorySymlinks) {
336        EnterDirectoriesRecursively = true;
337    }
338    for (auto & f : inputFiles) {
339        path p(f);
340        if (EnterDirectoriesRecursively && is_directory(p)) {
341            if (!excludeDirectory(p)) {
342                recursive_directory_iterator di(p, follow_symlink, errc), end;
343                if (errc) {
344                    // If we cannot enter the directory, keep it in the list of files.
345                    expanded_paths.push_back(f); 
346                    continue;
347                }
348                while (di != end) {
349                    auto & e = di->path();
350                    if (is_directory(e)) {
351                        if (excludeDirectory(e)) di.no_push();
352                    }
353                    else expanded_paths.push_back(e.string());
354                    di.increment(errc);
355                    if (errc) {
356                        expanded_paths.push_back(e.string()); 
357                    }
358                }
359            }
360        }
361        else expanded_paths.push_back(p.string());
362    }
363    return expanded_paths;
364}
365
366
[4325]367int main(int argc, char *argv[]) {
[5161]368    llvm::install_fatal_error_handler(&icgrep_error_handler);
[5199]369#ifndef USE_LLVM_3_6
[5036]370    cl::HideUnrelatedOptions(ArrayRef<const cl::OptionCategory *>{&LegacyGrepOptions, &EnhancedGrepOptions, re::re_toolchain_flags(), pablo::pablo_toolchain_flags(), codegen::codegen_flags()});
[5186]371#endif
[4544]372    cl::ParseCommandLineOptions(argc, argv);
[5167]373#ifdef FUTURE
[5180]374    if (RegexpSyntax == re::RE_Syntax::FixedStrings) {
375        llvm::report_fatal_error("Sorry, FixedStrings syntax is not fully supported\n.");
[5167]376    }
377#endif
[4939]378    re::RE * re_ast = get_icgrep_RE();
[4963]379    std::string module_name = "grepcode:" + sha1sum(allREs) + ":" + std::to_string(globalFlags);
[5197]380   
[5016]381    if (GrepSupport) {  // Calls icgrep again on command line and passes output to grep.
382        pipeIcGrepOutputToGrep(argc, argv);
383        return 0;   // icgrep is called again, so we need to end this process.
384    }
[4730]385   
[4979]386    GrepEngine grepEngine;
[5338]387    if(MultiGrepKernels){
[5344]388        grepEngine.multiGrepCodeGen(module_name, RELists, CountOnly, UTF_16);
[5338]389    }
390    else{
391        grepEngine.grepCodeGen(module_name, re_ast, CountOnly, UTF_16);
392    }
[5087]393
[5163]394    allFiles = getFullFileList(inputFiles);
395   
[5197]396    if (FileNamesOnly && NonMatchingFileNamesOnly) {
397        // Strange request: print names of all matching files and all non-matching files: i.e., all of them.
398        // (Although GNU grep prints nothing.)
399        for (auto & f : allFiles) {
400            if (boost::filesystem::exists(f)) {
401                std::cout << f << "\n";
[5202]402            } else {
[5197]403                std::cerr << "Error: cannot open " << f << " for processing. Skipped.\n";
404            }
405        }
406        exit(0);
407    }
408    if (FileNamesOnly) {
409        llvm::report_fatal_error("Sorry, -l/-files-with-matches not yet supported\n.");
410    }
411    if (NonMatchingFileNamesOnly) {
412        llvm::report_fatal_error("Sorry, -L/-files-without-match not yet supported\n.");
413    }
414   
[5163]415    initResult(allFiles);
416    for (unsigned i=0; i < allFiles.size(); ++i){
[5025]417        total_CountOnly.push_back(0);
418    }
[4967]419
[4968]420    if (Threads <= 1) {
[5156]421
422        #ifdef PRINT_TIMING_INFORMATION
423        // PAPI_RES_STL, PAPI_STL_CCY, PAPI_FUL_CCY, PAPI_MEM_WCY
424        // PAPI_RES_STL, PAPI_BR_MSP, PAPI_LST_INS, PAPI_L1_TCM
425        papi::PapiCounter<4> papiCounters({PAPI_RES_STL, PAPI_STL_CCY, PAPI_FUL_CCY, PAPI_MEM_WCY});
426        #endif
[5163]427        for (unsigned i = 0; i != allFiles.size(); ++i) {
[5156]428            #ifdef PRINT_TIMING_INFORMATION
429            papiCounters.start();
430            const timestamp_t execution_start = read_cycle_counter();
431            #endif
[5163]432            grepEngine.doGrep(allFiles[i], i, CountOnly, total_CountOnly, UTF_16);
[5156]433            #ifdef PRINT_TIMING_INFORMATION
434            const timestamp_t execution_end = read_cycle_counter();
435            papiCounters.stop();
[5163]436            std::cerr << "EXECUTION TIME: " << allFiles[i] << ":" << "CYCLES|" << (execution_end - execution_start) << papiCounters << std::endl;
[5156]437            #endif
[4967]438        }       
[4968]439    } else if (Threads > 1) {
440        const unsigned numOfThreads = Threads; // <- convert the command line value into an integer to allow stack allocation
441        pthread_t threads[numOfThreads];
[4967]442
[4968]443        for(unsigned long i = 0; i < numOfThreads; ++i){
[5267]444            const int rc = pthread_create(&threads[i], nullptr, DoGrep, (void *)&grepEngine);
[4968]445            if (rc) {
[5161]446                llvm::report_fatal_error("Failed to create thread: code " + std::to_string(rc));
[4968]447            }
[4947]448        }
[4967]449
[4968]450        for(unsigned i = 0; i < numOfThreads; ++i) {
451            void * status = nullptr;
452            const int rc = pthread_join(threads[i], &status);
[4967]453            if (rc) {
[5161]454                llvm::report_fatal_error("Failed to join thread: code " + std::to_string(rc));
[4967]455            }
456        }
[3850]457    }
[5063]458   
[5025]459    PrintResult(CountOnly, total_CountOnly);
[4327]460   
[3850]461    return 0;
462}
Note: See TracBrowser for help on using the repository browser.