source: icGREP/icgrep-devel/icgrep/icgrep.cpp @ 5187

Last change on this file since 5187 was 5187, checked in by faldebey, 3 years ago

LLVM-3.6 Support

File size: 14.4 KB
Line 
1/*
2 *  Copyright (c) 2016 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include <cstdio>
8#include <vector>
9#include <llvm/Support/CommandLine.h>
10#include <llvm/Support/ErrorHandling.h>
11#include <llvm/Support/Signals.h>
12#include <re/re_alt.h>
13#include <re/re_parser.h>
14#include <grep_engine.h>
15#include <fstream>
16#include <string>
17
18#include <boost/uuid/sha1.hpp>
19#include <toolchain.h>
20#include <re/re_toolchain.h>
21#include <pablo/pablo_toolchain.h>
22#include <mutex>
23#include <boost/filesystem.hpp>
24
25#include <iostream> // MEEE
26
27#ifdef PRINT_TIMING_INFORMATION
28#include <hrtime.h>
29#include <util/papi_helper.hpp>
30#endif
31
32static cl::OptionCategory LegacyGrepOptions("A. Standard Grep Options",
33                                       "These are standard grep options intended for compatibility with typical grep usage.");
34
35#ifdef FUTURE
36static cl::OptionCategory RegexpOptions("Regular Expression Interpretation", "These options control regular expression interpretation");
37static cl::opt<re::RE_Syntax> RegexpSyntax(cl::desc("Regular expression syntax:"),
38    cl::values(
39        clEnumValN(re::RE_Syntax::FixedStrings, "F", "Fixed strings, separated by newlines"),
40        clEnumValN(re::RE_Syntax::BRE, "G", "Posix basic regular expression (BRE) syntax"),
41        clEnumValN(re::RE_Syntax::ERE, "E", "Posix extended regular expression (ERE) syntax"),
42        clEnumValN(re::RE_Syntax::PCRE, "P", "Perl-compatible regular expression (PCRE) syntax - default"),
43               clEnumValEnd), cl::cat(LegacyGrepOptions), cl::Grouping, cl::init(re::RE_Syntax::PCRE));
44#endif
45
46static cl::opt<bool> UTF_16("UTF-16", cl::desc("Regular expressions over the UTF-16 representation of Unicode."), cl::cat(LegacyGrepOptions));
47static cl::OptionCategory EnhancedGrepOptions("B. Enhanced Grep Options",
48                                       "These are additional options for icgrep functionality and performance.");
49static cl::opt<bool> CountOnly("c", cl::desc("Count and display the matching lines per file only."), cl::cat(LegacyGrepOptions), cl::Grouping);
50static cl::alias CountOnlyLong("count", cl::desc("Alias for -c"), cl::aliasopt(CountOnly));
51
52static cl::list<std::string> inputFiles(cl::Positional, cl::desc("<regex> <input file ...>"), cl::OneOrMore);
53
54static cl::opt<bool> EnterDirectoriesRecursively("r", cl::desc("Recursively process files within directories, (but follow only top-level symlinks unless -R)."), cl::cat(LegacyGrepOptions), cl::Grouping);
55static cl::opt<bool> FollowSubdirectorySymlinks("R", cl::desc("Recursively process files within directories, following symlinks at all levels."), cl::cat(LegacyGrepOptions), cl::Grouping);
56static cl::opt<bool> CaseInsensitive("i", cl::desc("Ignore case distinctions in the pattern and the file."), cl::cat(LegacyGrepOptions), cl::Grouping);
57
58
59static cl::list<std::string> regexVector("e", cl::desc("Regular expression"), cl::ZeroOrMore, cl::cat(LegacyGrepOptions));
60static cl::opt<std::string> RegexFilename("f", cl::desc("Take regular expressions (one per line) from a file"), cl::value_desc("regex file"), cl::init(""), cl::cat(LegacyGrepOptions));
61static cl::opt<std::string> IRFileName("precompiled", cl::desc("Use precompiled regular expression"), cl::value_desc("LLVM IR file"), cl::init(""));
62
63static cl::opt<int> Threads("t", cl::desc("Total number of threads."), cl::init(1));
64
65static cl::opt<bool> GrepSupport("gs", cl::desc("Grep support. Pipe the output of icgrep into grep. \
66         Gives you colored output + back-referencing capability."), cl::cat(EnhancedGrepOptions));
67
68
69static std::vector<std::string> allFiles;
70//
71// Handler for errors reported through llvm::report_fatal_error.  Report
72// and signal error code 2 (grep convention).
73//
74static void icgrep_error_handler(void *UserData, const std::string &Message,
75                             bool GenCrashDiag) {
76
77    // Modified from LLVM's internal report_fatal_error logic.
78    SmallVector<char, 64> Buffer;
79    raw_svector_ostream OS(Buffer);
80    OS << "icgrep ERROR: " << Message << "\n";
81    StringRef MessageStr = OS.str();
82    ssize_t written = ::write(2, MessageStr.data(), MessageStr.size());
83    (void)written; // If something went wrong, we deliberately just give up.
84
85    // Run the interrupt handlers to make sure any special cleanups get done, in
86    // particular that we remove files registered with RemoveFileOnSignal.
87    llvm::sys::RunInterruptHandlers();
88    exit(2);
89}
90
91static std::string allREs;
92static re::ModeFlagSet globalFlags = 0;
93
94re::RE * get_icgrep_RE() {
95 
96    //std::vector<std::string> regexVector;
97    if (RegexFilename != "") {
98        std::ifstream regexFile(RegexFilename.c_str());
99        std::string r;
100        if (regexFile.is_open()) {
101            while (std::getline(regexFile, r)) {
102                regexVector.push_back(r);
103            }
104            regexFile.close();
105        }
106    }
107   
108    // if there are no regexes specified through -e or -f, the first positional argument
109    // must be a regex, not an input file.
110   
111    if (regexVector.size() == 0) {
112        regexVector.push_back(inputFiles[0]);
113        inputFiles.erase(inputFiles.begin());
114    }
115    if (CaseInsensitive) globalFlags |= re::CASE_INSENSITIVE_MODE_FLAG;
116
117 
118    std::vector<re::RE *> REs;
119    re::RE * re_ast = nullptr;
120    for (unsigned i = 0; i < regexVector.size(); i++) {
121#ifdef FUTURE
122        re_ast = re::RE_Parser::parse(regexVector[i], globalFlags, RegexpSyntax);
123#else
124        re_ast = re::RE_Parser::parse(regexVector[i], globalFlags);
125#endif
126        REs.push_back(re_ast);
127        allREs += regexVector[i] + "\n";
128    }
129    if (REs.size() > 1) {
130        re_ast = re::makeAlt(REs.begin(), REs.end());
131    }
132   
133    return re_ast;
134}
135
136std::string sha1sum(const std::string & str) {
137    char buffer[41];    // 40 hex-digits and the terminating null
138    unsigned int digest[5];     // 160 bits in total
139
140    boost::uuids::detail::sha1 sha1;
141    sha1.process_bytes(str.c_str(), str.size());
142    sha1.get_digest(digest);
143    snprintf(buffer, sizeof(buffer), "%.8x%.8x%.8x%.8x%.8x",
144             digest[0], digest[1], digest[2], digest[3], digest[4]);
145    return std::string(buffer);
146}
147
148std::vector<size_t> total_CountOnly;
149std::mutex count_mutex;
150size_t fileCount;
151void *DoGrep(void *args)
152{
153    size_t fileIdx;
154    GrepEngine * grepEngine = (GrepEngine *)args;
155
156    count_mutex.lock();
157    fileIdx = fileCount;
158    fileCount++;
159    count_mutex.unlock();
160
161    while (fileIdx < allFiles.size()){
162        grepEngine->doGrep(allFiles[fileIdx], fileIdx, CountOnly, total_CountOnly, UTF_16);
163       
164        count_mutex.lock();
165        fileIdx = fileCount;
166        fileCount++;
167        count_mutex.unlock();
168    }
169
170    pthread_exit(NULL);
171}
172
173
// Returns true if the command line argument shouldn't be passed to icGrep or Grep.
// Both spellings of the grep-support switch are filtered: the LLVM option parser
// accepts "--gs" as well as "-gs", and forwarding either one to the child icgrep
// would make it re-enter grep-support mode and spawn itself again.
bool isArgUnwantedForAll(char *argument) {
    const std::vector<std::string> unwantedFlags = {"-gs", "--gs"};
    for (const std::string & flag : unwantedFlags) {
        if (flag == argument) {
            return true;
        }
    }
    return false;
}
184// Filters out the command line strings that shouldn't be passed on to Grep
185bool isArgUnwantedForGrep(char *argument) {
186#ifdef FUTURE
187    std::vector<std::string> unwantedFlags = {"-n", "-P", "-G", "-E"};
188#else
189    std::vector<std::string> unwantedFlags = {"-n"};
190#endif
191
192    for (unsigned i = 0; i < unwantedFlags.size(); ++i){
193        if (strcmp(argument, unwantedFlags[i].c_str()) == 0) {
194            return true;
195        }
196    }
197
198    for (unsigned i = 0; i < inputFiles.size(); ++i){    // filter out input content files.
199        if (strcmp(argument, inputFiles[i].c_str()) == 0) {
200            return true;
201        }
202    }
203
204    return false;
205}
// Filters out the command line strings that shouldn't be passed on to IcGrep.
// Returns true exactly when the argument equals one of the listed flags.
bool isArgUnwantedForIcGrep(char *argument) {
    const std::vector<std::string> icGrepRejects = {"-c"};

    for (const std::string & flag : icGrepRejects) {
        if (flag.compare(argument) == 0) return true;
    }
    return false;
}
219
220/*
221* Constructs a shell command that calls icgrep and then pipes the output to grep.
222* Then executs this shell command using the "system()" function.
223* This allows the output to be colored since all output is piped to grep.
224*/ 
225void pipeIcGrepOutputToGrep(int argc, char *argv[]) {
226    std::string icGrepArguments = "";
227    std::string grepArguments = "";
228
229    // Construct the shell arguments for icgrep and grep
230    // by filtering out the command line arguments passed into this process.
231    for (int i = 1; i < argc; i++) {
232        if (!isArgUnwantedForAll(argv[i])) {
233
234            if (!isArgUnwantedForIcGrep(argv[i])) {
235                // Wrap everything in quotes since the arguments passed into this program had them stripped by bash.
236                icGrepArguments.append("\"");       
237                icGrepArguments.append(argv[i]);
238                icGrepArguments.append("\" ");
239            }
240
241            if (!isArgUnwantedForGrep(argv[i])) {
242                grepArguments.append("\"");
243                grepArguments.append(argv[i]);
244                grepArguments.append("\" ");
245            }
246        }
247    }
248
249#ifdef FUTURE
250    switch (RegexpSyntax) {
251        case re::RE_Syntax::BRE:
252            grepArguments.append("\"-G\" ");
253            break;
254        case re::RE_Syntax::ERE:
255            grepArguments.append("\"-E\" ");
256            break;
257        case re::RE_Syntax::PCRE:
258            grepArguments.append("\"-P\" ");
259            break;
260        default:
261            //TODO: handle fix string
262            break;
263    }
264#endif
265
266    std::string systemCall = argv[0];
267    systemCall.append(" ");
268    systemCall.append(icGrepArguments);
269    systemCall.append(" ");
270#ifdef FUTURE
271    systemCall.append(" | grep --color=always ");
272#else
273    systemCall.append(" | grep --color=always -P ");
274#endif
275    systemCall.append(grepArguments);
276
277    system(systemCall.c_str());
278}
279
280
281// This is a stub, to be expanded later.
282bool excludeDirectory(boost::filesystem::path dirpath) { return dirpath.filename() == ".svn";}
283
284std::vector<std::string> getFullFileList(cl::list<std::string> & inputFiles) {
285    using namespace boost::filesystem;
286    symlink_option follow_symlink = FollowSubdirectorySymlinks ? symlink_option::recurse : symlink_option::none;
287    std::vector<std::string> expanded_paths;
288    boost::system::error_code errc;
289    if (FollowSubdirectorySymlinks) {
290        EnterDirectoriesRecursively = true;
291    }
292    for (auto & f : inputFiles) {
293        path p(f);
294        if (EnterDirectoriesRecursively && is_directory(p)) {
295            if (!excludeDirectory(p)) {
296                recursive_directory_iterator di(p, follow_symlink, errc), end;
297                if (errc) {
298                    // If we cannot enter the directory, keep it in the list of files.
299                    expanded_paths.push_back(f); 
300                    continue;
301                }
302                while (di != end) {
303                    auto & e = di->path();
304                    if (is_directory(e)) {
305                        if (excludeDirectory(e)) di.no_push();
306                    }
307                    else expanded_paths.push_back(e.string());
308                    di.increment(errc);
309                    if (errc) {
310                        expanded_paths.push_back(e.string()); 
311                    }
312                }
313            }
314        }
315        else expanded_paths.push_back(p.string());
316    }
317    return expanded_paths;
318}
319
320
321int main(int argc, char *argv[]) {
322    llvm::install_fatal_error_handler(&icgrep_error_handler);
323#if LLVM_VERSION_MINOR > 6
324    cl::HideUnrelatedOptions(ArrayRef<const cl::OptionCategory *>{&LegacyGrepOptions, &EnhancedGrepOptions, re::re_toolchain_flags(), pablo::pablo_toolchain_flags(), codegen::codegen_flags()});
325#endif
326    cl::ParseCommandLineOptions(argc, argv);
327#ifdef FUTURE
328    if (RegexpSyntax == re::RE_Syntax::FixedStrings) {
329        llvm::report_fatal_error("Sorry, FixedStrings syntax is not fully supported\n.");
330    }
331#endif
332    re::RE * re_ast = get_icgrep_RE();
333    std::string module_name = "grepcode:" + sha1sum(allREs) + ":" + std::to_string(globalFlags);
334
335    if (GrepSupport) {  // Calls icgrep again on command line and passes output to grep.
336        pipeIcGrepOutputToGrep(argc, argv);
337        return 0;   // icgrep is called again, so we need to end this process.
338    }
339   
340    GrepEngine grepEngine;
341    grepEngine.grepCodeGen(module_name, re_ast, CountOnly, UTF_16);
342    //std::cerr << "grepCodeGen complete";
343
344    releaseSlabAllocatorMemory();
345   
346    allFiles = getFullFileList(inputFiles);
347   
348    initResult(allFiles);
349    for (unsigned i=0; i < allFiles.size(); ++i){
350        total_CountOnly.push_back(0);
351    }
352
353    if (Threads <= 1) {
354
355        #ifdef PRINT_TIMING_INFORMATION
356        // PAPI_RES_STL, PAPI_STL_CCY, PAPI_FUL_CCY, PAPI_MEM_WCY
357        // PAPI_RES_STL, PAPI_BR_MSP, PAPI_LST_INS, PAPI_L1_TCM
358        papi::PapiCounter<4> papiCounters({PAPI_RES_STL, PAPI_STL_CCY, PAPI_FUL_CCY, PAPI_MEM_WCY});
359        #endif
360        for (unsigned i = 0; i != allFiles.size(); ++i) {
361            #ifdef PRINT_TIMING_INFORMATION
362            papiCounters.start();
363            const timestamp_t execution_start = read_cycle_counter();
364            #endif
365            grepEngine.doGrep(allFiles[i], i, CountOnly, total_CountOnly, UTF_16);
366            #ifdef PRINT_TIMING_INFORMATION
367            const timestamp_t execution_end = read_cycle_counter();
368            papiCounters.stop();
369            std::cerr << "EXECUTION TIME: " << allFiles[i] << ":" << "CYCLES|" << (execution_end - execution_start) << papiCounters << std::endl;
370            #endif
371        }       
372    } else if (Threads > 1) {
373        const unsigned numOfThreads = Threads; // <- convert the command line value into an integer to allow stack allocation
374        pthread_t threads[numOfThreads];
375
376        for(unsigned long i = 0; i < numOfThreads; ++i){
377            const int rc = pthread_create(&threads[i], NULL, DoGrep, (void *)&grepEngine);
378            if (rc) {
379                llvm::report_fatal_error("Failed to create thread: code " + std::to_string(rc));
380            }
381        }
382
383        for(unsigned i = 0; i < numOfThreads; ++i) {
384            void * status = nullptr;
385            const int rc = pthread_join(threads[i], &status);
386            if (rc) {
387                llvm::report_fatal_error("Failed to join thread: code " + std::to_string(rc));
388            }
389        }
390    }
391   
392    PrintResult(CountOnly, total_CountOnly);
393   
394    return 0;
395}
Note: See TracBrowser for help on using the repository browser.