source: icGREP/icgrep-devel/icgrep/util/file_select.cpp @ 5999

Last change on this file since 5999 was 5999, checked in by cameron, 13 months ago

Handling of file system errors

File size: 17.3 KB
Line 
1/*
2 *  Copyright (c) 2018 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include "file_select.h"
8#include <llvm/Support/CommandLine.h>
9#include <llvm/Support/ErrorHandling.h>
10#include <llvm/Support/Signals.h>
11#include <llvm/Support/raw_ostream.h>
12#include <boost/filesystem.hpp>
13#include <toolchain/toolchain.h>
14#include <re/parsers/parser.h>
15#include <re/re_alt.h>
16#include <re/re_seq.h>
17#include <re/re_start.h>
18#include <re/re_end.h>
19#include <re/re_cc.h>
20#include <re/re_toolchain.h>
21#include <re/printer_re.h>
22#include <grep/grep_engine.h>
23#include <fstream>
24#include <string>
25
26using namespace llvm;
27
28namespace argv {
29   
30static cl::OptionCategory Input_Options("File Selection Options", "These options control the input sources.");
31
32bool NoMessagesFlag;
33static cl::opt<bool, true> NoMessagesOption("s", cl::location(NoMessagesFlag), cl::desc("Suppress messages for file errors."), cl::cat(Input_Options), cl::Grouping);
34static cl::alias NoMessagesAlias("no-messages", cl::desc("Alias for -s"), cl::aliasopt(NoMessagesOption));
35
36bool RecursiveFlag;
37static cl::opt<bool, true> RecursiveOption("r", cl::location(RecursiveFlag), cl::desc("Recursively process files within directories, (but follow only top-level symlinks unless -R)."), cl::cat(Input_Options), cl::Grouping);
38static cl::alias RecursiveAlias("recursive", cl::desc("Alias for -r"), cl::aliasopt(RecursiveOption));
39
40bool DereferenceRecursiveFlag;
41static cl::opt<bool, true> DereferenceRecursiveOption("R", cl::location(DereferenceRecursiveFlag), cl::desc("Recursively process files within directories, following symlinks at all levels."), cl::cat(Input_Options), cl::Grouping);
42static cl::alias DereferenceRecursiveAlias("dereference-recursive", cl::desc("Alias for -R"), cl::aliasopt(DereferenceRecursiveOption));
43
44
45bool MmapFlag;
46static cl::opt<bool, true> MmapOption("mmap", cl::location(MmapFlag),  cl::init(1), cl::desc("Use mmap for file input (default)."), cl::cat(Input_Options));
47
48std::string ExcludeFlag;
49static cl::opt<std::string, true> ExcludeOption("exclude", cl::location(ExcludeFlag), cl::desc("Exclude files matching the given filename GLOB pattern."), cl::cat(Input_Options));
50
51std::string ExcludeFromFlag;
52static cl::opt<std::string, true> ExcludeFromOption("exclude-from", cl::location(ExcludeFromFlag), cl::desc("Exclude files matching filename GLOB patterns from the given file."), cl::cat(Input_Options));
53
54std::string ExcludeDirFlag;
55static cl::opt<std::string, true> ExcludeDirOption("exclude-dir", cl::location(ExcludeDirFlag), cl::desc("Exclude directories matching the given pattern."),
56                                                   cl::init(".svn"), cl::cat(Input_Options));
57
58std::string IncludeDirFlag;
59static cl::opt<std::string, true> IncludeDirOption("include-dir", cl::location(IncludeDirFlag), cl::desc("Include directories matching the given pattern."), cl::cat(Input_Options));
60
61std::string IncludeFlag;
62static cl::opt<std::string, true> IncludeOption("include", cl::location(IncludeFlag), cl::desc("Include only files matching the given filename GLOB pattern."), cl::cat(Input_Options));
63
64DevDirAction DevicesFlag;
65static cl::opt<DevDirAction, true> DevicesOption("D", cl::desc("Processing mode for devices:"),
66                                                 cl::values(clEnumValN(Read, "read", "Treat devices as files to be searched."),
67                                                            clEnumValN(Skip, "skip", "Silently skip devices.")
68                                                            CL_ENUM_VAL_SENTINEL), cl::cat(Input_Options), cl::location(DevicesFlag), cl::init(Read));
69static cl::alias DevicesAlias("devices", cl::desc("Alias for -D"), cl::aliasopt(DevicesOption));
70
71DevDirAction DirectoriesFlag;
72static cl::opt<DevDirAction, true> DirectoriesOption("d", cl::desc("Processing mode for directories:"),
73                                                     cl::values(clEnumValN(Read, "read", "Print an error message for any listed directories."),
74                                                                clEnumValN(Skip, "skip", "Silently skip directories."),
75                                                                clEnumValN(Recurse, "recurse", "Recursive process directories, equivalent to -r.")
76                                                                CL_ENUM_VAL_SENTINEL), cl::cat(Input_Options), cl::location(DirectoriesFlag), cl::init(Read));
77static cl::alias DirectoriesAlias("directories", cl::desc("Alias for -d"), cl::aliasopt(DirectoriesOption));
78
79// Command line arguments to specify file and directory includes/excludes
80// use GLOB syntax, matching any full pathname suffix after a "/", or
81// the full filename of any recursively selected file or directory.
82re::RE * anchorToFullFileName(re::RE * glob) {
83    return re::makeSeq({re::makeAlt({re::makeStart(), re::makeCC('/')}), glob, re::makeEnd()});
84}
85
// Set to true by getFullFileList when "-" appears among the input files.
bool UseStdIn = false;
87
88re::RE * getDirectoryExcludePattern() {
89    if (ExcludeDirFlag != "") {
90        auto excludeDir = re::RE_Parser::parse(ExcludeDirFlag, re::DEFAULT_MODE, re::RE_Syntax::FileGLOB);
91        return anchorToFullFileName(excludeDir);
92    } else {
93        return re::makeAlt();  // matches nothing, so excludes nothing.
94    }
95}
96
97re::RE * getDirectoryIncludePattern() {
98    if (IncludeDirFlag != "") {
99        auto dir = re::RE_Parser::parse(IncludeDirFlag, re::DEFAULT_MODE, re::RE_Syntax::FileGLOB);
100        return anchorToFullFileName(dir);
101    } else {
102        return re::makeEnd();  // matches every line..
103    }
104}
105
106re::RE * getFileExcludePattern() {
107    std::vector<re::RE *> patterns;
108    if (ExcludeFlag != "") {
109        re::RE * glob = re::RE_Parser::parse(ExcludeFlag, re::DEFAULT_MODE, re::RE_Syntax::FileGLOB);
110        patterns.push_back(glob);
111    }
112    if (ExcludeFromFlag != "") {
113        std::ifstream globFile(ExcludeFromFlag.c_str());
114        std::string r;
115        if (globFile.is_open()) {
116            while (std::getline(globFile, r)) {
117                re::RE * glob = re::RE_Parser::parse(r, re::DEFAULT_MODE, re::RE_Syntax::FileGLOB);
118                patterns.push_back(glob);
119            }
120            globFile.close();
121        }
122    }
123    if (patterns.empty()) return re::makeAlt();  // matches nothing, so excludes nothing.
124    return anchorToFullFileName(re::makeAlt(patterns.begin(), patterns.end()));
125}
126   
127re::RE * getFileIncludePattern() {
128    if (IncludeFlag != "") {
129        re::RE * includeSpec = re::RE_Parser::parse(IncludeFlag, re::DEFAULT_MODE, re::RE_Syntax::FileGLOB);
130        includeSpec = anchorToFullFileName(includeSpec);
131        return includeSpec;
132    } else {
133        return re::makeEnd();  // matches every line.
134    }
135}
136
137namespace fs = boost::filesystem;
138
139//
140//  Directory List: a set of directory paths that have been
141//  examined to identify candidate files for searching, together
142//  with a count of the number of candidate files in each directory.
143//
144//  FileName Buffer: an ordered sequence of NUL terminated filenames
145//  for each candidate produced in the directory traversal.
146//  The first mFullPathEntries entries are CWD paths.  Subsequent entries
147//  are base file names relative to a directory.   The set
148//  of all entries for a given directory are consecutive in the
149//  buffer, and the sets are ordered consecutively by directory
150//  index in the Directory List.
151//
152//  CollectedPaths: a vector of file paths to which the
153//  selected files are added.
154
155class FileSelectAccumulator : public grep::MatchAccumulator {
156public:
157    FileSelectAccumulator(std::vector<fs::path> & collectedPaths) :
158        mCollectedPaths(collectedPaths),
159        mFullPathEntries(0)
160    {}
161    void setFullPathEntries(unsigned entries) {mFullPathEntries = entries; mDirectoryIndex = 0;}
162    void reset();
163    void addDirectory(fs::path dirPath, unsigned cumulativeEntryCount);
164    void accumulate_match(const size_t lineNum, char * line_start, char * line_end) override;
165protected:
166    std::vector<fs::path> & mCollectedPaths;
167    unsigned mFullPathEntries;
168    unsigned mDirectoryIndex;
169    std::vector<fs::path> mDirectoryList;
170    std::vector<unsigned> mCumulativeEntryCount;
171};
172   
173void FileSelectAccumulator::reset() {
174    mCollectedPaths.clear();
175    mFullPathEntries = 0;
176    mDirectoryIndex = 0;
177    mDirectoryList.clear();
178    mCumulativeEntryCount.clear();
179}
180
181void FileSelectAccumulator::addDirectory(fs::path dirPath, unsigned cumulativeEntryCount) {
182    mDirectoryList.push_back(dirPath);
183    mCumulativeEntryCount.push_back(cumulativeEntryCount);
184}
185
186void FileSelectAccumulator::accumulate_match(const size_t fileIdx, char * name_start, char * name_end) {
187    fs::path p(std::string(name_start, name_end - name_start));
188    if (fileIdx < mFullPathEntries) {
189        mCollectedPaths.push_back(p);
190   } else {
191        assert(mDirectoryIndex < mDirectoryList.size());
192        while (fileIdx >= mCumulativeEntryCount[mDirectoryIndex]) {
193            mDirectoryIndex++;
194        }
195        mCollectedPaths.emplace_back(mDirectoryList[mDirectoryIndex]/std::string(name_start, name_end - name_start));
196    }
197}
198
199
200
201   
202std::vector<fs::path> getFullFileList(cl::list<std::string> & inputFiles) {
203    // The vector to accumulate the full list of collected files to be searched.
204    std::vector<fs::path> collectedPaths;
205   
206    // In this pass through command line arguments and the file hierarchy,
207    // we are just gathering file and subdirectory entries, so we silently
208    // ignore errors.  We use the boost::filesystem operations that set
209    // error codes rather than raise exceptions.
210    boost::system::error_code errc;
211   
212    // In non-recursive greps with no include/exclude processing, we simply assemble the
213    // paths.
214    if ((DirectoriesFlag != Recurse) && (ExcludeFlag == "") && (IncludeFlag == "") && (ExcludeFromFlag == "")) {
215        for (const std::string & f : inputFiles) {
216            if (f == "-") {  // stdin, will always be searched.
217                argv::UseStdIn = true;
218                continue;
219            }
220            fs::path p(f);
221            errs() << "path: " << p.string() << "\n";
222            fs::file_status s = fs::status(p, errc);
223            if (errc) {
224                // If there was an error, we leave the file in the fileCandidates
225                // list for later error processing.
226                if (!NoMessagesFlag) collectedPaths.push_back(p);
227            } else if (fs::is_directory(s)) {
228                if (DirectoriesFlag == Read) {
229                    collectedPaths.push_back(p);
230                }
231            } else if (fs::is_regular_file(s)) {
232                collectedPaths.push_back(p);
233            } else {
234                // Devices and unknown file types
235                if (DevicesFlag == Read) {
236                    collectedPaths.push_back(p);
237                }
238            }
239        }
240        return collectedPaths;
241    }
242   
243    // Otherwise we need to filter paths according to some include/exclude rules.
244   
245    FileSelectAccumulator fileAccum(collectedPaths);
246   
247    // At each level we gather candidate file and directory names and then
248    // filter the names based on -include, -exclude, -include-dir, -excclude-dir,
249    // and -exclude-from settings.
250    //
251    grep::SearchableBuffer dirCandidates;
252    grep::SearchableBuffer fileCandidates;
253
254    // First level of processing:  command line files and directories.
255    for (const std::string & f : inputFiles) {
256        if (f == "-") {  // stdin, will always be searched.
257            argv::UseStdIn = true;
258            continue;
259        }
260        fs::path p(f);
261        fs::file_status s = fs::status(p, errc);
262        if (errc) {
263            // If there was an error, we leave the file in the fileCandidates
264            // list for later error processing.
265            if (!NoMessagesFlag) fileCandidates.addSearchCandidate(p.c_str());
266        } else if (fs::is_directory(s)) {
267            if (DirectoriesFlag == Recurse) {
268                dirCandidates.addSearchCandidate(p.c_str());
269            } else if (DirectoriesFlag == Read) {
270                fileCandidates.addSearchCandidate(p.c_str());
271            }
272        } else if (fs::is_regular_file(s)) {
273            fileCandidates.addSearchCandidate(p.c_str());
274        } else {
275            // Devices and unknown file types
276            if (DevicesFlag == Read) {
277                fileCandidates.addSearchCandidate(p.c_str());
278            }
279        }
280    }
281   
282    auto commandLineDirCandidates = dirCandidates.getCandidateCount();
283    auto commandLineFileCandidates = fileCandidates.getCandidateCount();
284    fileAccum.setFullPathEntries(commandLineFileCandidates);
285    if (commandLineDirCandidates > 0) {
286        // Recursive processing of directories has been requested and we have
287        // candidate directories from the command line.
288   
289        // selectedDirectories will accumulate hold the results of directory
290        // include/exclude filtering at each level of processing.
291        std::vector<fs::path> selectedDirectories;
292       
293        FileSelectAccumulator directoryAccum(selectedDirectories);
294        grep::InternalSearchEngine directorySelectEngine;
295        directorySelectEngine.setRecordBreak(grep::GrepRecordBreakKind::Null);
296        directorySelectEngine.grepCodeGen
297            (getDirectoryIncludePattern(), getDirectoryExcludePattern(), & directoryAccum);
298       
299        // The initial grep search determines which of the command line directories to process.
300        // Each of these candidates is a full path return from command line argument processing.
301        directoryAccum.setFullPathEntries(dirCandidates.getCandidateCount());
302        directorySelectEngine.doGrep(dirCandidates.getBufferBase(), dirCandidates.getBufferSize());
303
304        while (!selectedDirectories.empty()) {
305            // We now iterate through the full list of directories, gathering
306            // entries from each.
307            // (a) File entries are added into the global list of fileCandidates.
308            // (b) Directory entries are added into a new list of candidates at each level.
309
310            grep::SearchableBuffer subdirCandidates;
311            std::vector<fs::path> currentDirectories = selectedDirectories;
312            directoryAccum.reset();
313            // Iterate through all directories, collecting subdirectory and file candidates.
314            for (auto & dirpath : currentDirectories) {
315                boost::system::error_code errc;
316                fs::directory_iterator di_end;
317                fs::directory_iterator di(dirpath, errc);
318                if (errc) {
319                    // If we cannot enter the directory, keep it in the list of files,
320                    // for possible error reporting.
321                    if (!NoMessagesFlag) fileCandidates.addSearchCandidate(dirpath.filename().c_str());
322                    continue;
323                }
324                while (di != di_end) {
325                    auto & e = di->path();
326                    fs::file_status s = fs::status(e, errc);
327                    if (errc) {
328                        // If there was an error, we leave the file in the fileCandidates
329                        // list for later error processing.
330                        if (!NoMessagesFlag) fileCandidates.addSearchCandidate(e.filename().c_str());
331                    } else if (fs::is_directory(s)) {
332                        if (fs::is_symlink(s) && !DereferenceRecursiveFlag) {
333                            di.increment(errc);
334                            continue;
335                        }
336                        subdirCandidates.addSearchCandidate(e.filename().c_str());
337                    } else if (fs::is_regular_file(s)) {
338                        fileCandidates.addSearchCandidate(e.filename().c_str());
339                    } else {
340                        // Devices and unknown file types
341                        if (DevicesFlag == Read) {
342                            fileCandidates.addSearchCandidate(e.filename().c_str());
343                        }
344                    }
345                    di.increment(errc);
346                    if (errc) break;
347                }
348                // For each directory, update counts for candidates generated at this level.
349                //
350                directoryAccum.addDirectory(dirpath, subdirCandidates.getCandidateCount());
351                fileAccum.addDirectory(dirpath, fileCandidates.getCandidateCount());
352            }
353            // Directory traversal at this level is complete.  Clear the directoryList,
354            // so that it will accumulate only the selected entries from the gathered
355            // buffer of subdirCandidates.
356            selectedDirectories.clear();
357            //
358            //  Now do the search to produce the next level of selected subdirectories
359            directorySelectEngine.doGrep(subdirCandidates.getBufferBase(), subdirCandidates.getBufferSize());
360            // Thre search result has been written to directoryList, continue while we
361            // have new subdirectories.
362        } while (!selectedDirectories.empty());
363    }
364    //  All directories have been processed and all the fileCandidates in the SearchBuffer.
365    //  Now determine which of the candidates should included or excluded from the search.
366    //  The results will be accumulated in collectedPaths.
367    grep::InternalSearchEngine fileSelectEngine;
368    fileSelectEngine.setRecordBreak(grep::GrepRecordBreakKind::Null);
369    fileSelectEngine.grepCodeGen
370       (getFileIncludePattern(), getFileExcludePattern(), & fileAccum);
371    fileSelectEngine.doGrep(fileCandidates.getBufferBase(), fileCandidates.getBufferSize());
372    return collectedPaths;
373}
374
375}
Note: See TracBrowser for help on using the repository browser.