source: icGREP/icgrep-devel/icgrep/util/file_select.cpp @ 5997

Last change on this file since 5997 was 5997, checked in by cameron, 13 months ago

Optimization for the case when no include/exclude processing of files is required

File size: 16.5 KB
Line 
1/*
2 *  Copyright (c) 2018 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "file_select.h"
8#include <llvm/Support/CommandLine.h>
9#include <llvm/Support/ErrorHandling.h>
10#include <llvm/Support/Signals.h>
11#include <llvm/Support/raw_ostream.h>
12#include <boost/filesystem.hpp>
13#include <toolchain/toolchain.h>
14#include <re/parsers/parser.h>
15#include <re/re_alt.h>
16#include <re/re_seq.h>
17#include <re/re_start.h>
18#include <re/re_end.h>
19#include <re/re_cc.h>
20#include <re/re_toolchain.h>
21#include <re/printer_re.h>
22#include <grep/grep_engine.h>
23#include <fstream>
24#include <string>
25
26using namespace llvm;
27
28namespace argv {
29   
30static cl::OptionCategory Input_Options("File Selection Options", "These options control the input sources.");
31
32bool RecursiveFlag;
33static cl::opt<bool, true> RecursiveOption("r", cl::location(RecursiveFlag), cl::desc("Recursively process files within directories, (but follow only top-level symlinks unless -R)."), cl::cat(Input_Options), cl::Grouping);
34static cl::alias RecursiveAlias("recursive", cl::desc("Alias for -r"), cl::aliasopt(RecursiveOption));
35
36bool DereferenceRecursiveFlag;
37static cl::opt<bool, true> DereferenceRecursiveOption("R", cl::location(DereferenceRecursiveFlag), cl::desc("Recursively process files within directories, following symlinks at all levels."), cl::cat(Input_Options), cl::Grouping);
38static cl::alias DereferenceRecursiveAlias("dereference-recursive", cl::desc("Alias for -R"), cl::aliasopt(DereferenceRecursiveOption));
39
40
41bool MmapFlag;
42static cl::opt<bool, true> MmapOption("mmap", cl::location(MmapFlag),  cl::init(1), cl::desc("Use mmap for file input (default)."), cl::cat(Input_Options));
43
44std::string ExcludeFlag;
45static cl::opt<std::string, true> ExcludeOption("exclude", cl::location(ExcludeFlag), cl::desc("Exclude files matching the given filename GLOB pattern."), cl::cat(Input_Options));
46
47std::string ExcludeFromFlag;
48static cl::opt<std::string, true> ExcludeFromOption("exclude-from", cl::location(ExcludeFromFlag), cl::desc("Exclude files matching filename GLOB patterns from the given file."), cl::cat(Input_Options));
49
50std::string ExcludeDirFlag;
51static cl::opt<std::string, true> ExcludeDirOption("exclude-dir", cl::location(ExcludeDirFlag), cl::desc("Exclude directories matching the given pattern."),
52                                                   cl::init(".svn"), cl::cat(Input_Options));
53
54std::string IncludeDirFlag;
55static cl::opt<std::string, true> IncludeDirOption("include-dir", cl::location(IncludeDirFlag), cl::desc("Include directories matching the given pattern."), cl::cat(Input_Options));
56
57std::string IncludeFlag;
58static cl::opt<std::string, true> IncludeOption("include", cl::location(IncludeFlag), cl::desc("Include only files matching the given filename GLOB pattern."), cl::cat(Input_Options));
59
60DevDirAction DevicesFlag;
61static cl::opt<DevDirAction, true> DevicesOption("D", cl::desc("Processing mode for devices:"),
62                                                 cl::values(clEnumValN(Read, "read", "Treat devices as files to be searched."),
63                                                            clEnumValN(Skip, "skip", "Silently skip devices.")
64                                                            CL_ENUM_VAL_SENTINEL), cl::cat(Input_Options), cl::location(DevicesFlag), cl::init(Read));
65static cl::alias DevicesAlias("devices", cl::desc("Alias for -D"), cl::aliasopt(DevicesOption));
66
67DevDirAction DirectoriesFlag;
68static cl::opt<DevDirAction, true> DirectoriesOption("d", cl::desc("Processing mode for directories:"),
69                                                     cl::values(clEnumValN(Read, "read", "Print an error message for any listed directories."),
70                                                                clEnumValN(Skip, "skip", "Silently skip directories."),
71                                                                clEnumValN(Recurse, "recurse", "Recursive process directories, equivalent to -r.")
72                                                                CL_ENUM_VAL_SENTINEL), cl::cat(Input_Options), cl::location(DirectoriesFlag), cl::init(Read));
73static cl::alias DirectoriesAlias("directories", cl::desc("Alias for -d"), cl::aliasopt(DirectoriesOption));
74
75// Command line arguments to specify file and directory includes/excludes
76// use GLOB syntax, matching any full pathname suffix after a "/", or
77// the full filename of any recursively selected file or directory.
78re::RE * anchorToFullFileName(re::RE * glob) {
79    return re::makeSeq({re::makeAlt({re::makeStart(), re::makeCC('/')}), glob, re::makeEnd()});
80}
81
// Set to true by getFullFileList when "-" appears among the input files.
bool UseStdIn = false;
83
84re::RE * getDirectoryExcludePattern() {
85    if (ExcludeDirFlag != "") {
86        auto excludeDir = re::RE_Parser::parse(ExcludeDirFlag, re::DEFAULT_MODE, re::RE_Syntax::FileGLOB);
87        return anchorToFullFileName(excludeDir);
88    } else {
89        return re::makeAlt();  // matches nothing, so excludes nothing.
90    }
91}
92
93re::RE * getDirectoryIncludePattern() {
94    if (IncludeDirFlag != "") {
95        auto dir = re::RE_Parser::parse(IncludeDirFlag, re::DEFAULT_MODE, re::RE_Syntax::FileGLOB);
96        return anchorToFullFileName(dir);
97    } else {
98        return re::makeEnd();  // matches every line..
99    }
100}
101
102re::RE * getFileExcludePattern() {
103    std::vector<re::RE *> patterns;
104    if (ExcludeFlag != "") {
105        re::RE * glob = re::RE_Parser::parse(ExcludeFlag, re::DEFAULT_MODE, re::RE_Syntax::FileGLOB);
106        patterns.push_back(glob);
107    }
108    if (ExcludeFromFlag != "") {
109        std::ifstream globFile(ExcludeFromFlag.c_str());
110        std::string r;
111        if (globFile.is_open()) {
112            while (std::getline(globFile, r)) {
113                re::RE * glob = re::RE_Parser::parse(r, re::DEFAULT_MODE, re::RE_Syntax::FileGLOB);
114                patterns.push_back(glob);
115            }
116            globFile.close();
117        }
118    }
119    if (patterns.empty()) return re::makeAlt();  // matches nothing, so excludes nothing.
120    return anchorToFullFileName(re::makeAlt(patterns.begin(), patterns.end()));
121}
122   
123re::RE * getFileIncludePattern() {
124    if (IncludeFlag != "") {
125        re::RE * includeSpec = re::RE_Parser::parse(IncludeFlag, re::DEFAULT_MODE, re::RE_Syntax::FileGLOB);
126        includeSpec = anchorToFullFileName(includeSpec);
127        return includeSpec;
128    } else {
129        return re::makeEnd();  // matches every line.
130    }
131}
132
133namespace fs = boost::filesystem;
134
135//
136//  Directory List: a set of directory paths that have been
137//  examined to identify candidate files for searching, together
138//  with a count of the number of candidate files in each directory.
139//
140//  FileName Buffer: an ordered sequence of NUL terminated filenames
141//  for each candidate produced in the directory traversal.
142//  The first mFullPathEntries entries are CWD paths.  Subsequent entries
143//  are base file names relative to a directory.   The set
144//  of all entries for a given directory are consecutive in the
145//  buffer, and the sets are ordered consecutively by directory
146//  index in the Directory List.
147//
148//  CollectedPaths: a vector of file paths to which the
149//  selected files are added.
150
151class FileSelectAccumulator : public grep::MatchAccumulator {
152public:
153    FileSelectAccumulator(std::vector<fs::path> & collectedPaths) :
154        mCollectedPaths(collectedPaths),
155        mFullPathEntries(0)
156    {}
157    void setFullPathEntries(unsigned entries) {mFullPathEntries = entries; mDirectoryIndex = 0;}
158    void reset();
159    void addDirectory(fs::path dirPath, unsigned cumulativeEntryCount);
160    void accumulate_match(const size_t lineNum, char * line_start, char * line_end) override;
161protected:
162    std::vector<fs::path> & mCollectedPaths;
163    unsigned mFullPathEntries;
164    unsigned mDirectoryIndex;
165    std::vector<fs::path> mDirectoryList;
166    std::vector<unsigned> mCumulativeEntryCount;
167};
168   
169void FileSelectAccumulator::reset() {
170    mCollectedPaths.clear();
171    mFullPathEntries = 0;
172    mDirectoryIndex = 0;
173    mDirectoryList.clear();
174    mCumulativeEntryCount.clear();
175}
176
177void FileSelectAccumulator::addDirectory(fs::path dirPath, unsigned cumulativeEntryCount) {
178    mDirectoryList.push_back(dirPath);
179    mCumulativeEntryCount.push_back(cumulativeEntryCount);
180}
181
182void FileSelectAccumulator::accumulate_match(const size_t fileIdx, char * name_start, char * name_end) {
183    fs::path p(std::string(name_start, name_end - name_start));
184    if (fileIdx < mFullPathEntries) {
185        mCollectedPaths.push_back(p);
186   } else {
187        assert(mDirectoryIndex < mDirectoryList.size());
188        while (fileIdx >= mCumulativeEntryCount[mDirectoryIndex]) {
189            mDirectoryIndex++;
190        }
191        mCollectedPaths.emplace_back(mDirectoryList[mDirectoryIndex]/std::string(name_start, name_end - name_start));
192    }
193}
194   
195std::vector<fs::path> getFullFileList(cl::list<std::string> & inputFiles) {
196    // The vector to accumulate the full list of collected files to be searched.
197    std::vector<fs::path> collectedPaths;
198   
199    // In this pass through command line arguments and the file hierarchy,
200    // we are just gathering file and subdirectory entries, so we silently
201    // ignore errors.  We use the boost::filesystem operations that set
202    // error codes rather than raise exceptions.
203    boost::system::error_code errc;
204   
205    // In non-recursive greps with no include/exclude processing, we simply assemble the
206    // paths.
207    if ((DirectoriesFlag != Recurse) && (ExcludeFlag == "") && (IncludeFlag == "") && (ExcludeFromFlag == "")) {
208        for (const std::string & f : inputFiles) {
209            if (f == "-") {  // stdin, will always be searched.
210                argv::UseStdIn = true;
211                continue;
212            }
213            fs::path p(f);
214            if (errc) {
215                // If there was an error, we leave the file in the fileCandidates
216                // list for later error processing.
217                collectedPaths.push_back(p);
218            } else if (fs::is_directory(p)) {
219                if (DirectoriesFlag == Read) {
220                    collectedPaths.push_back(p);
221                }
222            } else if (fs::is_regular_file(p)) {
223                collectedPaths.push_back(p);
224            } else {
225                // Devices and unknown file types
226                if (DevicesFlag == Read) {
227                    collectedPaths.push_back(p);
228                }
229            }
230        }
231        return collectedPaths;
232    }
233   
234    // Otherwise we need to filter paths according to some include/exclude rules.
235   
236    FileSelectAccumulator fileAccum(collectedPaths);
237   
238    // At each level we gather candidate file and directory names and then
239    // filter the names based on -include, -exclude, -include-dir, -excclude-dir,
240    // and -exclude-from settings.
241    //
242    grep::SearchableBuffer dirCandidates;
243    grep::SearchableBuffer fileCandidates;
244
245    // First level of processing:  command line files and directories.
246    for (const std::string & f : inputFiles) {
247        if (f == "-") {  // stdin, will always be searched.
248            argv::UseStdIn = true;
249            continue;
250        }
251        fs::path p(f);
252        if (errc) {
253            // If there was an error, we leave the file in the fileCandidates
254            // list for later error processing.
255            fileCandidates.addSearchCandidate(p.c_str());
256        } else if (fs::is_directory(p)) {
257            if (DirectoriesFlag == Recurse) {
258                dirCandidates.addSearchCandidate(p.c_str());
259            } else if (DirectoriesFlag == Read) {
260                fileCandidates.addSearchCandidate(p.c_str());
261            }
262        } else if (fs::is_regular_file(p)) {
263            fileCandidates.addSearchCandidate(p.c_str());
264        } else {
265            // Devices and unknown file types
266            if (DevicesFlag == Read) {
267                fileCandidates.addSearchCandidate(p.c_str());
268            }
269        }
270    }
271   
272    auto commandLineDirCandidates = dirCandidates.getCandidateCount();
273    auto commandLineFileCandidates = fileCandidates.getCandidateCount();
274    fileAccum.setFullPathEntries(commandLineFileCandidates);
275    if (commandLineDirCandidates > 0) {
276        // Recursive processing of directories has been requested and we have
277        // candidate directories from the command line.
278   
279        // selectedDirectories will accumulate hold the results of directory
280        // include/exclude filtering at each level of processing.
281        std::vector<fs::path> selectedDirectories;
282       
283        FileSelectAccumulator directoryAccum(selectedDirectories);
284        grep::InternalSearchEngine directorySelectEngine;
285        directorySelectEngine.setRecordBreak(grep::GrepRecordBreakKind::Null);
286        directorySelectEngine.grepCodeGen
287            (getDirectoryIncludePattern(), getDirectoryExcludePattern(), & directoryAccum);
288       
289        // The initial grep search determines which of the command line directories to process.
290        // Each of these candidates is a full path return from command line argument processing.
291        directoryAccum.setFullPathEntries(dirCandidates.getCandidateCount());
292        directorySelectEngine.doGrep(dirCandidates.getBufferBase(), dirCandidates.getBufferSize());
293
294        while (!selectedDirectories.empty()) {
295            // We now iterate through the full list of directories, gathering
296            // entries from each.
297            // (a) File entries are added into the global list of fileCandidates.
298            // (b) Directory entries are added into a new list of candidates at each level.
299
300            grep::SearchableBuffer subdirCandidates;
301            std::vector<fs::path> currentDirectories = selectedDirectories;
302            directoryAccum.reset();
303            // Iterate through all directories, collecting subdirectory and file candidates.
304            for (auto & dirpath : currentDirectories) {
305                boost::system::error_code errc;
306                fs::directory_iterator di_end;
307                fs::directory_iterator di(dirpath, errc);
308                if (errc) {
309                    // If we cannot enter the directory, keep it in the list of files,
310                    // for possible error reporting.
311                    fileCandidates.addSearchCandidate(dirpath.filename().c_str());
312                    continue;
313                }
314                while (di != di_end) {
315                    auto & e = di->path();
316                    if (fs::is_directory(e)) {
317                        if (fs::is_symlink(e) && !DereferenceRecursiveFlag) {
318                            di.increment(errc);
319                            continue;
320                        }
321                        subdirCandidates.addSearchCandidate(e.filename().c_str());
322                    } else if (fs::is_regular_file(e)) {
323                        fileCandidates.addSearchCandidate(e.filename().c_str());
324                    } else {
325                        // Devices and unknown file types
326                        if (DevicesFlag == Read) {
327                            fileCandidates.addSearchCandidate(e.filename().c_str());
328                        }
329                    }
330                    di.increment(errc);
331                    if (errc) break;
332                }
333                // For each directory, update counts for candidates generated at this level.
334                //
335                directoryAccum.addDirectory(dirpath, subdirCandidates.getCandidateCount());
336                fileAccum.addDirectory(dirpath, fileCandidates.getCandidateCount());
337            }
338            // Directory traversal at this level is complete.  Clear the directoryList,
339            // so that it will accumulate only the selected entries from the gathered
340            // buffer of subdirCandidates.
341            selectedDirectories.clear();
342            //
343            //  Now do the search to produce the next level of selected subdirectories
344            directorySelectEngine.doGrep(subdirCandidates.getBufferBase(), subdirCandidates.getBufferSize());
345            // Thre search result has been written to directoryList, continue while we
346            // have new subdirectories.
347        } while (!selectedDirectories.empty());
348    }
349    //  All directories have been processed and all the fileCandidates in the SearchBuffer.
350    //  Now determine which of the candidates should included or excluded from the search.
351    //  The results will be accumulated in collectedPaths.
352    grep::InternalSearchEngine fileSelectEngine;
353    fileSelectEngine.setRecordBreak(grep::GrepRecordBreakKind::Null);
354    fileSelectEngine.grepCodeGen
355       (getFileIncludePattern(), getFileExcludePattern(), & fileAccum);
356    fileSelectEngine.doGrep(fileCandidates.getBufferBase(), fileCandidates.getBufferSize());
357    return collectedPaths;
358}
359
360}
Note: See TracBrowser for help on using the repository browser.