source: icGREP/icgrep-devel/icgrep/icgrep.cpp @ 5163

Last change on this file since 5163 was 5163, checked in by cameron, 3 years ago

Initial support for -r/-R recursive file processing

File size: 12.7 KB
Line 
1/*
2 *  Copyright (c) 2016 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include <cstdio>
8#include <vector>
9#include <llvm/Support/CommandLine.h>
10#include <llvm/Support/ErrorHandling.h>
11#include <llvm/Support/Signals.h>
12#include <re/re_alt.h>
13#include <re/re_parser.h>
14#include <grep_engine.h>
15#include <fstream>
16#include <string>
17
18#include <boost/uuid/sha1.hpp>
19#include <toolchain.h>
20#include <re/re_toolchain.h>
21#include <pablo/pablo_toolchain.h>
22#include <mutex>
23#include <boost/filesystem.hpp>
24
25#include <iostream> // MEEE
26
27#ifdef PRINT_TIMING_INFORMATION
28#include <hrtime.h>
29#include <util/papi_helper.hpp>
30#endif
31
32static cl::OptionCategory LegacyGrepOptions("A. Standard Grep Options",
33                                       "These are standard grep options intended for compatibility with typical grep usage.");
34static cl::opt<bool> UTF_16("UTF-16", cl::desc("Regular expressions over the UTF-16 representation of Unicode."), cl::cat(LegacyGrepOptions));
35static cl::OptionCategory EnhancedGrepOptions("B. Enhanced Grep Options",
36                                       "These are additional options for icgrep functionality and performance.");
37static cl::opt<bool> CountOnly("c", cl::desc("Count and display the matching lines per file only."), cl::cat(LegacyGrepOptions), cl::Grouping);
38static cl::alias CountOnlyLong("count", cl::desc("Alias for -c"), cl::aliasopt(CountOnly));
39
40static cl::list<std::string> inputFiles(cl::Positional, cl::desc("<regex> <input file ...>"), cl::OneOrMore);
41
42static cl::opt<bool> EnterDirectoriesRecursively("r", cl::desc("Recursively process files within directories, (but follow only top-level symlinks unless -R)."), cl::cat(LegacyGrepOptions), cl::Grouping);
43static cl::opt<bool> FollowSubdirectorySymlinks("R", cl::desc("Recursively process files within directories, following symlinks at all levels."), cl::cat(LegacyGrepOptions), cl::Grouping);
44static cl::opt<bool> CaseInsensitive("i", cl::desc("Ignore case distinctions in the pattern and the file."), cl::cat(LegacyGrepOptions), cl::Grouping);
45
46
47static cl::list<std::string> regexVector("e", cl::desc("Regular expression"), cl::ZeroOrMore, cl::cat(LegacyGrepOptions));
48static cl::opt<std::string> RegexFilename("f", cl::desc("Take regular expressions (one per line) from a file"), cl::value_desc("regex file"), cl::init(""), cl::cat(LegacyGrepOptions));
49static cl::opt<std::string> IRFileName("precompiled", cl::desc("Use precompiled regular expression"), cl::value_desc("LLVM IR file"), cl::init(""));
50
51static cl::opt<int> Threads("t", cl::desc("Total number of threads."), cl::init(1));
52
53static cl::opt<bool> GrepSupport("gs", cl::desc("Grep support. Pipe the output of icgrep into grep. \
54         Gives you colored output + back-referencing capability."), cl::cat(EnhancedGrepOptions));
55
56
57static std::vector<std::string> allFiles;
58//
59// Handler for errors reported through llvm::report_fatal_error.  Report
60// and signal error code 2 (grep convention).
61//
62static void icgrep_error_handler(void *UserData, const std::string &Message,
63                             bool GenCrashDiag) {
64
65    // Modified from LLVM's internal report_fatal_error logic.
66    SmallVector<char, 64> Buffer;
67    raw_svector_ostream OS(Buffer);
68    OS << "icgrep ERROR: " << Message << "\n";
69    StringRef MessageStr = OS.str();
70    ssize_t written = ::write(2, MessageStr.data(), MessageStr.size());
71    (void)written; // If something went wrong, we deliberately just give up.
72
73    // Run the interrupt handlers to make sure any special cleanups get done, in
74    // particular that we remove files registered with RemoveFileOnSignal.
75    llvm::sys::RunInterruptHandlers();
76    exit(2);
77}
78
79static std::string allREs;
80static re::ModeFlagSet globalFlags = 0;
81
82re::RE * get_icgrep_RE() {
83 
84    //std::vector<std::string> regexVector;
85    if (RegexFilename != "") {
86        std::ifstream regexFile(RegexFilename.c_str());
87        std::string r;
88        if (regexFile.is_open()) {
89            while (std::getline(regexFile, r)) {
90                regexVector.push_back(r);
91            }
92            regexFile.close();
93        }
94    }
95   
96    // if there are no regexes specified through -e or -f, the first positional argument
97    // must be a regex, not an input file.
98   
99    if (regexVector.size() == 0) {
100        regexVector.push_back(inputFiles[0]);
101        inputFiles.erase(inputFiles.begin());
102    }
103    if (CaseInsensitive) globalFlags |= re::CASE_INSENSITIVE_MODE_FLAG;
104
105 
106    std::vector<re::RE *> REs;
107    re::RE * re_ast = nullptr;
108    for (unsigned i = 0; i < regexVector.size(); i++) {
109        re_ast = re::RE_Parser::parse(regexVector[i], globalFlags);
110        REs.push_back(re_ast);
111        allREs += regexVector[i] + "\n";
112    }
113    if (REs.size() > 1) {
114        re_ast = re::makeAlt(REs.begin(), REs.end());
115    }
116   
117    return re_ast;
118}
119
120std::string sha1sum(const std::string & str) {
121    char buffer[41];    // 40 hex-digits and the terminating null
122    unsigned int digest[5];     // 160 bits in total
123
124    boost::uuids::detail::sha1 sha1;
125    sha1.process_bytes(str.c_str(), str.size());
126    sha1.get_digest(digest);
127    snprintf(buffer, sizeof(buffer), "%.8x%.8x%.8x%.8x%.8x",
128             digest[0], digest[1], digest[2], digest[3], digest[4]);
129    return std::string(buffer);
130}
131
132std::vector<size_t> total_CountOnly;
133std::mutex count_mutex;
134size_t fileCount;
135void *DoGrep(void *args)
136{
137    size_t fileIdx;
138    GrepEngine * grepEngine = (GrepEngine *)args;
139
140    count_mutex.lock();
141    fileIdx = fileCount;
142    fileCount++;
143    count_mutex.unlock();
144
145    while (fileIdx < allFiles.size()){
146        grepEngine->doGrep(allFiles[fileIdx], fileIdx, CountOnly, total_CountOnly, UTF_16);
147       
148        count_mutex.lock();
149        fileIdx = fileCount;
150        fileCount++;
151        count_mutex.unlock();
152    }
153
154    pthread_exit(NULL);
155}
156
157
// Tells whether a command line argument must be withheld from both icGrep
// and Grep when the command is re-issued.  Only an exact match counts.
bool isArgUnwantedForAll(char *argument) {
    const std::vector<std::string> droppedFlags = {"-gs"};
    for (const std::string & flag : droppedFlags) {
        if (flag.compare(argument) == 0) {
            return true;
        }
    }
    return false;
}
168// Filters out the command line strings that shouldn't be passed on to Grep
169bool isArgUnwantedForGrep(char *argument) {
170    std::vector<std::string> unwantedFlags = {"-n"};
171
172    for (unsigned i = 0; i < inputFiles.size(); ++i){
173        if (strcmp(argument, unwantedFlags[i].c_str()) == 0) {
174            return true;
175        }
176    }
177
178    for (unsigned i = 0; i < inputFiles.size(); ++i){    // filter out input content files.
179        if (strcmp(argument, inputFiles[i].c_str()) == 0) {
180            return true;
181        }
182    }
183
184    return false;
185}
// Tells whether a command line string must be withheld from the re-issued
// IcGrep command.  Only an exact match against a listed flag counts.
bool isArgUnwantedForIcGrep(char *argument) {
    const std::vector<std::string> droppedFlags = {"-c"};

    for (const std::string & flag : droppedFlags) {
        if (flag.compare(argument) == 0) {
            return true;
        }
    }

    return false;
}
199
200/*
201* Constructs a shell command that calls icgrep and then pipes the output to grep.
202* Then executs this shell command using the "system()" function.
203* This allows the output to be colored since all output is piped to grep.
204*/ 
205void pipeIcGrepOutputToGrep(int argc, char *argv[]) {
206    std::string icGrepArguments = "";
207    std::string grepArguments = "";
208
209    // Construct the shell arguments for icgrep and grep
210    // by filtering out the command line arguments passed into this process.
211    for (int i = 1; i < argc; i++) {
212        if (!isArgUnwantedForAll(argv[i])) {
213
214            if (!isArgUnwantedForIcGrep(argv[i])) {
215                // Wrap everything in quotes since the arguments passed into this program had them stripped by bash.
216                icGrepArguments.append("\"");       
217                icGrepArguments.append(argv[i]);
218                icGrepArguments.append("\" ");
219            }
220
221            if (!isArgUnwantedForGrep(argv[i])) {
222                grepArguments.append("\"");
223                grepArguments.append(argv[i]);
224                grepArguments.append("\" ");
225            }
226        }
227    }
228
229    std::string systemCall = argv[0];
230    systemCall.append(" ");
231    systemCall.append(icGrepArguments);
232    systemCall.append(" ");
233    systemCall.append(" | grep --color=always -P ");
234    systemCall.append(grepArguments);
235
236    system(systemCall.c_str());
237}
238
239
240// This is a stub, to be expanded later.
241bool excludeDirectory(boost::filesystem::path dirpath) { return dirpath.filename() == ".svn";}
242
243std::vector<std::string> getFullFileList(cl::list<std::string> & inputFiles) {
244    using namespace boost::filesystem;
245    symlink_option follow_symlink = FollowSubdirectorySymlinks ? symlink_option::recurse : symlink_option::none;
246    std::vector<std::string> expanded_paths;
247    boost::system::error_code errc;
248    if (FollowSubdirectorySymlinks) {
249        EnterDirectoriesRecursively = true;
250    }
251    for (auto & f : inputFiles) {
252        path p(f);
253        if (EnterDirectoriesRecursively && is_directory(p)) {
254            if (!excludeDirectory(p)) {
255                recursive_directory_iterator di(p, follow_symlink, errc), end;
256                if (errc) {
257                    // If we cannot enter the directory, keep it in the list of files.
258                    expanded_paths.push_back(f); 
259                    continue;
260                }
261                while (di != end) {
262                    auto & e = di->path();
263                    if (is_directory(e)) {
264                        if (excludeDirectory(e)) di.no_push();
265                    }
266                    else expanded_paths.push_back(e.string());
267                    di.increment(errc);
268                    if (errc) {
269                        expanded_paths.push_back(e.string()); 
270                    }
271                }
272            }
273        }
274        else expanded_paths.push_back(p.string());
275    }
276    return expanded_paths;
277}
278
279
280int main(int argc, char *argv[]) {
281    llvm::install_fatal_error_handler(&icgrep_error_handler);
282    cl::HideUnrelatedOptions(ArrayRef<const cl::OptionCategory *>{&LegacyGrepOptions, &EnhancedGrepOptions, re::re_toolchain_flags(), pablo::pablo_toolchain_flags(), codegen::codegen_flags()});
283    cl::ParseCommandLineOptions(argc, argv);
284   
285    re::RE * re_ast = get_icgrep_RE();
286    std::string module_name = "grepcode:" + sha1sum(allREs) + ":" + std::to_string(globalFlags);
287
288    if (GrepSupport) {  // Calls icgrep again on command line and passes output to grep.
289        pipeIcGrepOutputToGrep(argc, argv);
290        return 0;   // icgrep is called again, so we need to end this process.
291    }
292   
293    GrepEngine grepEngine;
294    grepEngine.grepCodeGen(module_name, re_ast, CountOnly, UTF_16);
295    //std::cerr << "grepCodeGen complete";
296
297    releaseSlabAllocatorMemory();
298   
299    allFiles = getFullFileList(inputFiles);
300   
301    initResult(allFiles);
302    for (unsigned i=0; i < allFiles.size(); ++i){
303        total_CountOnly.push_back(0);
304    }
305
306    if (Threads <= 1) {
307
308        #ifdef PRINT_TIMING_INFORMATION
309        // PAPI_RES_STL, PAPI_STL_CCY, PAPI_FUL_CCY, PAPI_MEM_WCY
310        // PAPI_RES_STL, PAPI_BR_MSP, PAPI_LST_INS, PAPI_L1_TCM
311        papi::PapiCounter<4> papiCounters({PAPI_RES_STL, PAPI_STL_CCY, PAPI_FUL_CCY, PAPI_MEM_WCY});
312        #endif
313        for (unsigned i = 0; i != allFiles.size(); ++i) {
314            #ifdef PRINT_TIMING_INFORMATION
315            papiCounters.start();
316            const timestamp_t execution_start = read_cycle_counter();
317            #endif
318            grepEngine.doGrep(allFiles[i], i, CountOnly, total_CountOnly, UTF_16);
319            #ifdef PRINT_TIMING_INFORMATION
320            const timestamp_t execution_end = read_cycle_counter();
321            papiCounters.stop();
322            std::cerr << "EXECUTION TIME: " << allFiles[i] << ":" << "CYCLES|" << (execution_end - execution_start) << papiCounters << std::endl;
323            #endif
324        }       
325    } else if (Threads > 1) {
326        const unsigned numOfThreads = Threads; // <- convert the command line value into an integer to allow stack allocation
327        pthread_t threads[numOfThreads];
328
329        for(unsigned long i = 0; i < numOfThreads; ++i){
330            const int rc = pthread_create(&threads[i], NULL, DoGrep, (void *)&grepEngine);
331            if (rc) {
332                llvm::report_fatal_error("Failed to create thread: code " + std::to_string(rc));
333            }
334        }
335
336        for(unsigned i = 0; i < numOfThreads; ++i) {
337            void * status = nullptr;
338            const int rc = pthread_join(threads[i], &status);
339            if (rc) {
340                llvm::report_fatal_error("Failed to join thread: code " + std::to_string(rc));
341            }
342        }
343    }
344   
345    PrintResult(CountOnly, total_CountOnly);
346   
347    return 0;
348}
Note: See TracBrowser for help on using the repository browser.