source: icGREP/icgrep-devel/icgrep/util/file_select.cpp @ 5965

Last change on this file since 5965 was 5965, checked in by cameron, 14 months ago

Support for file/directory include/exclude

File size: 15.3 KB
Line 
1/*
2 *  Copyright (c) 2018 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "file_select.h"
8#include <llvm/Support/CommandLine.h>
9#include <llvm/Support/ErrorHandling.h>
10#include <llvm/Support/Signals.h>
11#include <llvm/Support/raw_ostream.h>
12#include <boost/filesystem.hpp>
13#include <toolchain/toolchain.h>
14#include <re/parsers/parser.h>
15#include <re/re_alt.h>
16#include <re/re_seq.h>
17#include <re/re_start.h>
18#include <re/re_end.h>
19#include <re/re_cc.h>
20#include <re/re_toolchain.h>
21#include <re/printer_re.h>
22#include <grep/grep_engine.h>
23#include <fstream>
24#include <string>
25
26using namespace llvm;
27
28namespace argv {
29   
30static cl::OptionCategory Input_Options("File Selection Options", "These options control the input sources.");
31
32bool RecursiveFlag;
33static cl::opt<bool, true> RecursiveOption("r", cl::location(RecursiveFlag), cl::desc("Recursively process files within directories, (but follow only top-level symlinks unless -R)."), cl::cat(Input_Options), cl::Grouping);
34static cl::alias RecursiveAlias("recursive", cl::desc("Alias for -r"), cl::aliasopt(RecursiveOption));
35
36bool DereferenceRecursiveFlag;
37static cl::opt<bool, true> DereferenceRecursiveOption("R", cl::location(DereferenceRecursiveFlag), cl::desc("Recursively process files within directories, following symlinks at all levels."), cl::cat(Input_Options), cl::Grouping);
38static cl::alias DereferenceRecursiveAlias("dereference-recursive", cl::desc("Alias for -R"), cl::aliasopt(DereferenceRecursiveOption));
39
40
41bool MmapFlag;
42static cl::opt<bool, true> MmapOption("mmap", cl::location(MmapFlag),  cl::init(1), cl::desc("Use mmap for file input (default)."), cl::cat(Input_Options));
43
44std::string ExcludeFlag;
45static cl::opt<std::string, true> ExcludeOption("exclude", cl::location(ExcludeFlag), cl::desc("Exclude files matching the given filename GLOB pattern."), cl::cat(Input_Options));
46
47std::string ExcludeFromFlag;
48static cl::opt<std::string, true> ExcludeFromOption("exclude-from", cl::location(ExcludeFromFlag), cl::desc("Exclude files matching filename GLOB patterns from the given file."), cl::cat(Input_Options));
49
50std::string ExcludeDirFlag;
51static cl::opt<std::string, true> ExcludeDirOption("exclude-dir", cl::location(ExcludeDirFlag), cl::desc("Exclude directories matching the given pattern."),
52                                                   cl::init(".svn"), cl::cat(Input_Options));
53
54std::string IncludeDirFlag;
55static cl::opt<std::string, true> IncludeDirOption("include-dir", cl::location(IncludeDirFlag), cl::desc("Include directories matching the given pattern."), cl::cat(Input_Options));
56
57std::string IncludeFlag;
58static cl::opt<std::string, true> IncludeOption("include", cl::location(IncludeFlag), cl::desc("Include only files matching the given filename GLOB pattern."), cl::cat(Input_Options));
59
60DevDirAction DevicesFlag;
61static cl::opt<DevDirAction, true> DevicesOption("D", cl::desc("Processing mode for devices:"),
62                                                 cl::values(clEnumValN(Read, "read", "Treat devices as files to be searched."),
63                                                            clEnumValN(Skip, "skip", "Silently skip devices.")
64                                                            CL_ENUM_VAL_SENTINEL), cl::cat(Input_Options), cl::location(DevicesFlag), cl::init(Read));
65static cl::alias DevicesAlias("devices", cl::desc("Alias for -D"), cl::aliasopt(DevicesOption));
66
67DevDirAction DirectoriesFlag;
68static cl::opt<DevDirAction, true> DirectoriesOption("d", cl::desc("Processing mode for directories:"),
69                                                     cl::values(clEnumValN(Read, "read", "Print an error message for any listed directories."),
70                                                                clEnumValN(Skip, "skip", "Silently skip directories."),
71                                                                clEnumValN(Recurse, "recurse", "Recursive process directories, equivalent to -r.")
72                                                                CL_ENUM_VAL_SENTINEL), cl::cat(Input_Options), cl::location(DirectoriesFlag), cl::init(Read));
73static cl::alias DirectoriesAlias("directories", cl::desc("Alias for -d"), cl::aliasopt(DirectoriesOption));
74
75// Command line arguments to specify file and directory includes/excludes
76// use GLOB syntax, matching any full pathname suffix after a "/", or
77// the full filename of any recursively selected file or directory.
78re::RE * anchorToFullFileName(re::RE * glob) {
79    return re::makeSeq({re::makeAlt({re::makeStart(), re::makeCC('/')}), glob, re::makeEnd()});
80}
81
// Set to true by getFullFileList when "-" appears among the input files.
bool UseStdIn = false;
83
84re::RE * getDirectoryExcludePattern() {
85    if (ExcludeDirFlag != "") {
86        auto excludeDir = re::RE_Parser::parse(ExcludeDirFlag, re::DEFAULT_MODE, re::RE_Syntax::FileGLOB);
87        return anchorToFullFileName(excludeDir);
88    } else {
89        return re::makeAlt();  // matches nothing, so excludes nothing.
90    }
91}
92
93re::RE * getDirectoryIncludePattern() {
94    if (IncludeDirFlag != "") {
95        auto dir = re::RE_Parser::parse(IncludeDirFlag, re::DEFAULT_MODE, re::RE_Syntax::FileGLOB);
96        return anchorToFullFileName(dir);
97    } else {
98        return re::makeEnd();  // matches every line..
99    }
100}
101
102re::RE * getFileExcludePattern() {
103    std::vector<re::RE *> patterns;
104    if (ExcludeFlag != "") {
105        re::RE * glob = re::RE_Parser::parse(ExcludeFlag, re::DEFAULT_MODE, re::RE_Syntax::FileGLOB);
106        patterns.push_back(glob);
107    }
108    if (ExcludeFromFlag != "") {
109        std::ifstream globFile(ExcludeFromFlag.c_str());
110        std::string r;
111        if (globFile.is_open()) {
112            while (std::getline(globFile, r)) {
113                re::RE * glob = re::RE_Parser::parse(r, re::DEFAULT_MODE, re::RE_Syntax::FileGLOB);
114                patterns.push_back(glob);
115            }
116            globFile.close();
117        }
118    }
119    if (patterns.empty()) return re::makeAlt();  // matches nothing, so excludes nothing.
120    return anchorToFullFileName(re::makeAlt(patterns.begin(), patterns.end()));
121}
122   
123re::RE * getFileIncludePattern() {
124    if (IncludeFlag != "") {
125        re::RE * includeSpec = re::RE_Parser::parse(IncludeFlag, re::DEFAULT_MODE, re::RE_Syntax::FileGLOB);
126        includeSpec = anchorToFullFileName(includeSpec);
127        return includeSpec;
128    } else {
129        return re::makeEnd();  // matches every line.
130    }
131}
132
133namespace fs = boost::filesystem;
134
135//
136//  Directory List: a set of directory paths that have been
137//  examined to identify candidate files for searching, together
138//  with a count of the number of candidate files in each directory.
139//
140//  FileName Buffer: an ordered sequence of NUL terminated filenames
141//  for each candidate produced in the directory traversal.
142//  The first mFullPathEntries entries are CWD paths.  Subsequent entries
143//  are base file names relative to a directory.   The set
144//  of all entries for a given directory are consecutive in the
145//  buffer, and the sets are ordered consecutively by directory
146//  index in the Directory List.
147//
148//  CollectedPaths: a vector of file paths to which the
149//  selected files are added.
150
151class FileSelectAccumulator : public grep::MatchAccumulator {
152public:
153    FileSelectAccumulator(std::vector<fs::path> & collectedPaths) :
154        mCollectedPaths(collectedPaths),
155        mFullPathEntries(0)
156    {}
157    void setFullPathEntries(unsigned entries) {mFullPathEntries = entries; mDirectoryIndex = 0;}
158    void reset();
159    void addDirectory(fs::path dirPath, unsigned cumulativeEntryCount);
160    void accumulate_match(const size_t lineNum, char * line_start, char * line_end) override;
161protected:
162    std::vector<fs::path> & mCollectedPaths;
163    unsigned mFullPathEntries;
164    unsigned mDirectoryIndex;
165    std::vector<fs::path> mDirectoryList;
166    std::vector<unsigned> mCumulativeEntryCount;
167};
168   
169void FileSelectAccumulator::reset() {
170    mCollectedPaths.clear();
171    mFullPathEntries = 0;
172    mDirectoryIndex = 0;
173    mDirectoryList.clear();
174    mCumulativeEntryCount.clear();
175}
176
177void FileSelectAccumulator::addDirectory(fs::path dirPath, unsigned cumulativeEntryCount) {
178    mDirectoryList.push_back(dirPath);
179    mCumulativeEntryCount.push_back(cumulativeEntryCount);
180}
181
182void FileSelectAccumulator::accumulate_match(const size_t fileIdx, char * name_start, char * name_end) {
183    fs::path p(std::string(name_start, name_end - name_start));
184    if (fileIdx < mFullPathEntries) {
185        mCollectedPaths.push_back(p);
186   } else {
187        assert(mDirectoryIndex < mDirectoryList.size());
188        while (fileIdx >= mCumulativeEntryCount[mDirectoryIndex]) {
189            mDirectoryIndex++;
190        }
191        mCollectedPaths.emplace_back(mDirectoryList[mDirectoryIndex]/std::string(name_start, name_end - name_start));
192    }
193}
194   
195std::vector<fs::path> getFullFileList(cl::list<std::string> & inputFiles) {
196    // The vector to accumulate the full list of collected files to be searched.
197    std::vector<fs::path> collectedPaths;
198    FileSelectAccumulator fileAccum(collectedPaths);
199
200    // In this pass through command line arguments and the file hierarchy,
201    // we are just gathering file and subdirectory entries, so we silently
202    // ignore errors.  We use the boost::filesystem operations that set
203    // error codes rather than raise exceptions.
204    boost::system::error_code errc;
205   
206    // At each level we gather candidate file and directory names and then
207    // filter the names based on -include, -exclude, -include-dir, -excclude-dir,
208    // and -exclude-from settings.
209    //
210    grep::SearchableBuffer dirCandidates;
211    grep::SearchableBuffer fileCandidates;
212
213    // First level of processing:  command line files and directories.
214    for (const std::string & f : inputFiles) {
215        if (f == "-") {  // stdin, will always be searched.
216            argv::UseStdIn = true;
217            continue;
218        }
219        fs::path p(f);
220        if (errc) {
221            // If there was an error, we leave the file in the fileCandidates
222            // list for later error processing.
223            fileCandidates.addSearchCandidate(p.c_str());
224        } else if (fs::is_directory(p)) {
225            if (DirectoriesFlag == Recurse) {
226                dirCandidates.addSearchCandidate(p.c_str());
227            } else if (DirectoriesFlag == Read) {
228                fileCandidates.addSearchCandidate(p.c_str());
229            }
230        } else if (fs::is_regular_file(p)) {
231            fileCandidates.addSearchCandidate(p.c_str());
232        } else {
233            // Devices and unknown file types
234            if (DevicesFlag == Read) {
235                fileCandidates.addSearchCandidate(p.c_str());
236            }
237        }
238    }
239   
240    auto commandLineDirCandidates = dirCandidates.getCandidateCount();
241    auto commandLineFileCandidates = fileCandidates.getCandidateCount();
242    fileAccum.setFullPathEntries(commandLineFileCandidates);
243    if (commandLineDirCandidates > 0) {
244        // Recursive processing of directories has been requested and we have
245        // candidate directories from the command line.
246   
247        // selectedDirectories will accumulate hold the results of directory
248        // include/exclude filtering at each level of processing.
249        std::vector<fs::path> selectedDirectories;
250       
251        FileSelectAccumulator directoryAccum(selectedDirectories);
252        grep::InternalSearchEngine directorySelectEngine;
253        directorySelectEngine.setRecordBreak(grep::GrepRecordBreakKind::Null);
254        directorySelectEngine.grepCodeGen
255            (getDirectoryIncludePattern(), getDirectoryExcludePattern(), & directoryAccum);
256       
257        // The initial grep search determines which of the command line directories to process.
258        // Each of these candidates is a full path return from command line argument processing.
259        directoryAccum.setFullPathEntries(dirCandidates.getCandidateCount());
260        directorySelectEngine.doGrep(dirCandidates.getBufferBase(), dirCandidates.getBufferSize());
261
262        while (!selectedDirectories.empty()) {
263            // We now iterate through the full list of directories, gathering
264            // entries from each.
265            // (a) File entries are added into the global list of fileCandidates.
266            // (b) Directory entries are added into a new list of candidates at each level.
267
268            grep::SearchableBuffer subdirCandidates;
269            std::vector<fs::path> currentDirectories = selectedDirectories;
270            directoryAccum.reset();
271            // Iterate through all directories, collecting subdirectory and file candidates.
272            for (auto & dirpath : currentDirectories) {
273                boost::system::error_code errc;
274                fs::directory_iterator di_end;
275                fs::directory_iterator di(dirpath, errc);
276                if (errc) {
277                    // If we cannot enter the directory, keep it in the list of files,
278                    // for possible error reporting.
279                    fileCandidates.addSearchCandidate(dirpath.filename().c_str());
280                    continue;
281                }
282                while (di != di_end) {
283                    auto & e = di->path();
284                    if (fs::is_directory(e)) {
285                        if (fs::is_symlink(e) && !DereferenceRecursiveFlag) {
286                            di.increment(errc);
287                            continue;
288                        }
289                        subdirCandidates.addSearchCandidate(e.filename().c_str());
290                    } else if (fs::is_regular_file(e)) {
291                        fileCandidates.addSearchCandidate(e.filename().c_str());
292                    } else {
293                        // Devices and unknown file types
294                        if (DevicesFlag == Read) {
295                            fileCandidates.addSearchCandidate(e.filename().c_str());
296                        }
297                    }
298                    di.increment(errc);
299                    if (errc) break;
300                }
301                // For each directory, update counts for candidates generated at this level.
302                //
303                directoryAccum.addDirectory(dirpath, subdirCandidates.getCandidateCount());
304                fileAccum.addDirectory(dirpath, fileCandidates.getCandidateCount());
305            }
306            // Directory traversal at this level is complete.  Clear the directoryList,
307            // so that it will accumulate only the selected entries from the gathered
308            // buffer of subdirCandidates.
309            selectedDirectories.clear();
310            //
311            //  Now do the search to produce the next level of selected subdirectories
312            directorySelectEngine.doGrep(subdirCandidates.getBufferBase(), subdirCandidates.getBufferSize());
313            // Thre search result has been written to directoryList, continue while we
314            // have new subdirectories.
315        } while (!selectedDirectories.empty());
316    }
317    //  All directories have been processed and all the fileCandidates in the SearchBuffer.
318    //  Now determine which of the candidates should included or excluded from the search.
319    //  The results will be accumulated in collectedPaths.
320    grep::InternalSearchEngine fileSelectEngine;
321    fileSelectEngine.setRecordBreak(grep::GrepRecordBreakKind::Null);
322    fileSelectEngine.grepCodeGen
323       (getFileIncludePattern(), getFileExcludePattern(), & fileAccum);
324    fileSelectEngine.doGrep(fileCandidates.getBufferBase(), fileCandidates.getBufferSize());
325    return collectedPaths;
326}
327
328}
Note: See TracBrowser for help on using the repository browser.