source: icGREP/icgrep-devel/icgrep/wc.cpp @ 5944

Last change on this file since 5944 was 5944, checked in by cameron, 17 months ago

Common command line file selection utility for icgrep, wc ...

File size: 10.7 KB
Line 
/*
 *  Copyright (c) 2018 International Characters.
 *  This software is licensed to the public under the Open Software License 3.0.
 *  icgrep is a trademark of International Characters.
 */
6
#include <fcntl.h>

#include <algorithm>
#include <iomanip>
#include <iostream>
#include <string>
#include <vector>

#include <llvm/IR/Function.h>
#include <llvm/IR/Module.h>
#include <llvm/Support/CommandLine.h>
#include <llvm/Support/raw_ostream.h>

#include <toolchain/toolchain.h>
#include <cc/cc_compiler.h>
#include <pablo/pablo_kernel.h>
#include <kernels/kernel_builder.h>
#include <IR_Gen/idisa_target.h>
#include <kernels/streamset.h>
#include <kernels/source_kernel.h>
#include <kernels/s2p_kernel.h>
#include <pablo/pablo_compiler.h>
#include <pablo/pablo_toolchain.h>
#include <toolchain/cpudriver.h>
#include <util/file_select.h>
28
29using namespace llvm;
30
31static cl::OptionCategory wcFlags("Command Flags", "wc options");
32
33static cl::list<std::string> inputFiles(cl::Positional, cl::desc("<input file ...>"), cl::OneOrMore, cl::cat(wcFlags));
34
35std::vector<std::string> allFiles;
36
// The kinds of count that can be requested on the command line.
enum CountOptions {
    LineOption,   // -l
    WordOption,   // -w
    CharOption,   // -m
    ByteOption    // -c
};
40
41static cl::list<CountOptions> wcOptions(
42  cl::values(clEnumValN(LineOption, "l", "Report the number of lines in each input file."),
43             clEnumValN(WordOption, "w", "Report the number of words in each input file."),
44             clEnumValN(CharOption, "m", "Report the number of characters in each input file (override -c)."),
45             clEnumValN(ByteOption, "c", "Report the number of bytes in each input file (override -m).")
46             CL_ENUM_VAL_SENTINEL), cl::cat(wcFlags), cl::Grouping);
47                                                 
// Letters of the active count modes ("l", "w", "m", "c"); assembled in main()
// and appended to the counting kernel's name.
static std::string wc_modes;

// Minimum width of an output column.
static int defaultDisplayColumnWidth{7};

// Which counts were requested; set in main() from the command line.
bool CountLines{false};
bool CountWords{false};
bool CountChars{false};
bool CountBytes{false};

// Per-file results, indexed by the file's position in allFiles.
std::vector<uint64_t> lineCount;
std::vector<uint64_t> wordCount;
std::vector<uint64_t> charCount;
std::vector<uint64_t> byteCount;

// Sums of the per-file results over all processed files.
uint64_t TotalLines{0};
uint64_t TotalWords{0};
uint64_t TotalChars{0};
uint64_t TotalBytes{0};
68
69using namespace pablo;
70using namespace kernel;
71using namespace parabix;
72
73//  The callback routine that records counts in progress.
74//
75extern "C" {
76    void record_counts(uint64_t lines, uint64_t words, uint64_t chars, uint64_t bytes, uint64_t fileIdx) {
77        lineCount[fileIdx] = lines;
78        wordCount[fileIdx] = words;
79        charCount[fileIdx] = chars;
80        byteCount[fileIdx] = bytes;
81        TotalLines += lines;
82        TotalWords += words;
83        TotalChars += chars;
84        TotalBytes += bytes;
85    }
86}
87
88class WordCountKernel final: public pablo::PabloKernel {
89public:
90    WordCountKernel(const std::unique_ptr<kernel::KernelBuilder> & b, Binding && inputStreamSet);
91    bool isCachable() const override { return true; }
92    bool hasSignature() const override { return false; }
93protected:
94    void generatePabloMethod() override;
95};
96
97WordCountKernel::WordCountKernel (const std::unique_ptr<kernel::KernelBuilder> & b, Binding && inputStreamSet)
98: PabloKernel(b, "wc_" + wc_modes,
99    {inputStreamSet},
100    {},
101    {},
102    {Binding{b->getSizeTy(), "lineCount"}, Binding{b->getSizeTy(), "wordCount"}, Binding{b->getSizeTy(), "charCount"}}) {
103
104}
105
106void WordCountKernel::generatePabloMethod() {
107    PabloBuilder pb(getEntryScope());
108    std::unique_ptr<cc::CC_Compiler> ccc;
109    if (CountWords || CountChars) {
110        ccc = make_unique<cc::Parabix_CC_Compiler>(getEntryScope(), getInputStreamSet("u8bit"));
111    } else {
112        ccc = make_unique<cc::Direct_CC_Compiler>(getEntryScope(), pb.createExtract(getInput(0), pb.getInteger(0)));
113    }
114
115    //  output: 3 counters
116    Var * lc = getOutputScalarVar("lineCount");
117    Var * wc = getOutputScalarVar("wordCount");
118    Var * cc = getOutputScalarVar("charCount");
119
120    if (CountLines) {
121        PabloAST * LF = ccc->compileCC(re::makeByte(0x0A));
122        pb.createAssign(lc, pb.createCount(LF));
123    }
124    if (CountWords) {
125        PabloAST * WS = ccc->compileCC(re::makeCC(re::makeByte(0x09, 0x0D), re::makeByte(0x20)));
126        PabloAST * wordChar = pb.createNot(WS);
127        // WS_follow_or_start = 1 past WS or at start of file
128        PabloAST * WS_follow_or_start = pb.createNot(pb.createAdvance(wordChar, 1));
129        PabloAST * wordStart = pb.createInFile(pb.createAnd(wordChar, WS_follow_or_start));
130        pb.createAssign(wc, pb.createCount(wordStart));
131    }
132    if (CountChars) {
133        //
134        // FIXME: This correctly counts characters assuming valid UTF-8 input.  But what if input is
135        // not UTF-8, or is not valid?
136        //
137        PabloAST * u8Begin = ccc->compileCC(re::makeCC(re::makeByte(0, 0x7F), re::makeByte(0xC2, 0xF4)));
138        pb.createAssign(cc, pb.createCount(u8Begin));
139    }
140}
141
// Signature of the generated entry point: it takes the descriptor of an open
// file and the index of that file in the results tables.
using WordCountFunctionType = void (*)(uint32_t fd, size_t fileIdx);
143
144void wcPipelineGen(ParabixDriver & pxDriver) {
145
146    auto & iBuilder = pxDriver.getBuilder();
147    Module * m = iBuilder->getModule();
148    const unsigned segmentSize = codegen::SegmentSize;
149    const unsigned bufferSegments = codegen::ThreadNum+1;
150
151   
152    Type * const int32Ty = iBuilder->getInt32Ty();
153    Type * const sizeTy = iBuilder->getSizeTy();
154    Type * const voidTy = iBuilder->getVoidTy();
155
156    FunctionType * const recordCountsType = FunctionType::get(voidTy, {sizeTy, sizeTy, sizeTy, sizeTy, sizeTy}, false);
157    Constant * const recordCounts = m->getOrInsertFunction("record_counts", recordCountsType);
158
159    FunctionType * const mainType = FunctionType::get(voidTy, {int32Ty, sizeTy}, false);
160    Function * const main = cast<Function>(m->getOrInsertFunction("Main", mainType));
161    main->setCallingConv(CallingConv::C);
162    Function::arg_iterator args = main->arg_begin();   
163    Value * const fileDecriptor = &*(args++);
164    fileDecriptor->setName("fileDecriptor");
165    Value * const fileIdx = &*(args++);
166    fileIdx->setName("fileIdx");
167
168    iBuilder->SetInsertPoint(BasicBlock::Create(m->getContext(), "entry", main,0));
169
170    StreamSetBuffer * const ByteStream = pxDriver.addBuffer<SourceBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8));
171
172
173    Kernel * mmapK = pxDriver.addKernelInstance<MMapSourceKernel>(iBuilder);
174    mmapK->setInitialArguments({fileDecriptor});
175    pxDriver.makeKernelCall(mmapK, {}, {ByteStream});
176   
177    Kernel * wck  = nullptr;
178    if (CountWords || CountChars) {
179        StreamSetBuffer * const BasisBits = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(8, 1), segmentSize * bufferSegments);
180        Kernel * s2pk = pxDriver.addKernelInstance<S2PKernel>(iBuilder);
181        pxDriver.makeKernelCall(s2pk, {ByteStream}, {BasisBits});
182       
183        wck = pxDriver.addKernelInstance<WordCountKernel>(iBuilder, Binding{iBuilder->getStreamSetTy(8, 1), "u8bit"});
184        pxDriver.makeKernelCall(wck, {BasisBits}, {});
185
186
187    } else {
188        wck = pxDriver.addKernelInstance<WordCountKernel>(iBuilder, Binding{iBuilder->getStreamSetTy(1, 8), "u8byte"});
189        pxDriver.makeKernelCall(wck, {ByteStream}, {});
190    }
191
192    pxDriver.generatePipelineIR();
193   
194    iBuilder->setKernel(mmapK);
195    Value * const fileSize = iBuilder->getAccumulator("fileSize");
196    iBuilder->setKernel(wck);
197    Value * const lineCount = iBuilder->getAccumulator("lineCount");
198    Value * const wordCount = iBuilder->getAccumulator("wordCount");
199    Value * const charCount = iBuilder->getAccumulator("charCount");
200
201    iBuilder->CreateCall(recordCounts, {lineCount, wordCount, charCount, fileSize, fileIdx});
202    pxDriver.deallocateBuffers();
203    iBuilder->CreateRetVoid();
204
205    pxDriver.finalizeObject();
206}
207
208void wc(WordCountFunctionType fn_ptr, const int64_t fileIdx) {
209    std::string fileName = allFiles[fileIdx];
210    const int fd = open(fileName.c_str(), O_RDONLY);
211    if (LLVM_UNLIKELY(fd == -1)) {
212        std::cerr << "Error: cannot open " << fileName << " for processing. Skipped.\n";
213    } else {
214        fn_ptr(fd, fileIdx);
215        close(fd);
216    }
217}
218
219int main(int argc, char *argv[]) {
220    codegen::ParseCommandLineOptions(argc, argv, {&wcFlags, pablo_toolchain_flags(), codegen::codegen_flags()});
221    if (wcOptions.size() == 0) {
222        CountLines = true;
223        CountWords = true;
224        CountBytes = true;
225    } else {
226        CountLines = false;
227        CountWords = false;
228        CountBytes = false;
229        CountChars = false;
230        for (unsigned i = 0; i < wcOptions.size(); i++) {
231            switch (wcOptions[i]) {
232                case WordOption: CountWords = true; break;
233                case LineOption: CountLines = true; break;
234                case CharOption: CountChars = true; CountBytes = false; break;
235                case ByteOption: CountBytes = true; CountChars = false; break;
236            }
237        }
238    }
239    if (CountLines) wc_modes += "l";
240    if (CountWords) wc_modes += "w";
241    if (CountChars) wc_modes += "m";
242    if (CountBytes) wc_modes += "c";
243
244    ParabixDriver pxDriver("wc");
245    wcPipelineGen(pxDriver);
246    auto wordCountFunctionPtr = reinterpret_cast<WordCountFunctionType>(pxDriver.getMain());
247
248    allFiles = argv::getFullFileList(inputFiles);
249    const auto fileCount = allFiles.size();
250    lineCount.resize(fileCount);
251    wordCount.resize(fileCount);
252    charCount.resize(fileCount);
253    byteCount.resize(fileCount);
254   
255    for (unsigned i = 0; i < fileCount; ++i) {
256        wc(wordCountFunctionPtr, i);
257    }
258   
259    size_t maxCount = 0;
260    if (CountLines) maxCount = TotalLines;
261    if (CountWords) maxCount = TotalWords;
262    if (CountChars) maxCount = TotalChars;
263    if (CountBytes) maxCount = TotalBytes;
264   
265   
266   
267    int displayColumnWidth = std::to_string(maxCount).size() + 1;
268    if (displayColumnWidth < defaultDisplayColumnWidth) displayColumnWidth = defaultDisplayColumnWidth;
269
270    for (unsigned i = 0; i < fileCount; ++i) {
271        std::cout << std::setw(displayColumnWidth-1);
272        if (CountLines) {
273            std::cout << lineCount[i] << std::setw(displayColumnWidth);
274        }
275        if (CountWords) {
276            std::cout << wordCount[i] << std::setw(displayColumnWidth);
277        }
278        if (CountChars) {
279            std::cout << charCount[i] << std::setw(displayColumnWidth);
280        }
281        if (CountBytes) {
282            std::cout << byteCount[i];
283        }
284        std::cout << " " << allFiles[i] << std::endl;
285    }
286    if (inputFiles.size() > 1) {
287        std::cout << std::setw(displayColumnWidth-1);
288        if (CountLines) {
289            std::cout << TotalLines << std::setw(displayColumnWidth);
290        }
291        if (CountWords) {
292            std::cout << TotalWords << std::setw(displayColumnWidth);
293        }
294        if (CountChars) {
295            std::cout << TotalChars << std::setw(displayColumnWidth);
296        }
297        if (CountBytes) {
298            std::cout << TotalBytes;
299        }
300        std::cout << " total" << std::endl;
301    }
302
303    return 0;
304}
Note: See TracBrowser for help on using the repository browser.