source: icGREP/icgrep-devel/icgrep/wc.cpp @ 5924

Last change on this file since 5924 was 5924, checked in by cameron, 13 months ago

Various cleanups

File size: 10.4 KB
Line 
1/*
2 *  Copyright (c) 2015 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include <iostream>
8#include <iomanip>
9#include <sstream>
10#include <toolchain/toolchain.h>
11#include <llvm/IR/Function.h>
12#include <llvm/IR/Module.h>
13#include <llvm/Support/CommandLine.h>
14#include <llvm/Support/raw_ostream.h>
15#include <cc/cc_compiler.h>
16#include <pablo/pablo_kernel.h>
17#include <kernels/kernel_builder.h>
18#include <IR_Gen/idisa_target.h>
19#include <kernels/streamset.h>
20#include <kernels/source_kernel.h>
21#include <kernels/s2p_kernel.h>
22#include <pablo/pablo_compiler.h>
23#include <pablo/pablo_toolchain.h>
24#include <toolchain/cpudriver.h>
25#include <fcntl.h>
26
27using namespace llvm;
28
29static cl::OptionCategory wcFlags("Command Flags", "wc options");
30
31cl::list<std::string> inputFiles(cl::Positional, cl::desc("<input file ...>"), cl::OneOrMore, cl::cat(wcFlags));
32
33enum CountOptions {
34    LineOption, WordOption, CharOption, ByteOption
35};
36
37static cl::list<CountOptions> wcOptions(
38  cl::values(clEnumValN(LineOption, "l", "Report the number of lines in each input file."),
39             clEnumValN(WordOption, "w", "Report the number of words in each input file."),
40             clEnumValN(CharOption, "m", "Report the number of characters in each input file (override -c)."),
41             clEnumValN(ByteOption, "c", "Report the number of bytes in each input file (override -m).")
42             CL_ENUM_VAL_SENTINEL), cl::cat(wcFlags), cl::Grouping);
43                                                 
// Letters of the selected counting modes; main() fills this in and the
// kernel constructor appends it to the kernel name.
static std::string wc_modes;

// Minimum width used for each printed column.
static int defaultFieldWidth = 7;

// Which counts were requested (set in main() from the command line).
bool CountLines = false;
bool CountWords = false;
bool CountChars = false;
bool CountBytes = false;

// Per-file results, indexed by the position of the file in inputFiles.
std::vector<uint64_t> lineCount;
std::vector<uint64_t> wordCount;
std::vector<uint64_t> charCount;
std::vector<uint64_t> byteCount;

// Running totals over all files processed so far.
uint64_t TotalLines = 0;
uint64_t TotalWords = 0;
uint64_t TotalChars = 0;
uint64_t TotalBytes = 0;
63
64using namespace pablo;
65using namespace kernel;
66using namespace parabix;
67
68//  The callback routine that records counts in progress.
69//
70extern "C" {
71    void record_counts(uint64_t lines, uint64_t words, uint64_t chars, uint64_t bytes, uint64_t fileIdx) {
72        lineCount[fileIdx] = lines;
73        wordCount[fileIdx] = words;
74        charCount[fileIdx] = chars;
75        byteCount[fileIdx] = bytes;
76        TotalLines += lines;
77        TotalWords += words;
78        TotalChars += chars;
79        TotalBytes += bytes;
80    }
81}
82
83class WordCountKernel final: public pablo::PabloKernel {
84public:
85    WordCountKernel(const std::unique_ptr<kernel::KernelBuilder> & b, Binding && inputStreamSet);
86    bool isCachable() const override { return true; }
87    bool hasSignature() const override { return false; }
88protected:
89    void generatePabloMethod() override;
90};
91
92WordCountKernel::WordCountKernel (const std::unique_ptr<kernel::KernelBuilder> & b, Binding && inputStreamSet)
93: PabloKernel(b, "wc_" + wc_modes,
94    {inputStreamSet},
95    {},
96    {},
97    {Binding{b->getSizeTy(), "lineCount"}, Binding{b->getSizeTy(), "wordCount"}, Binding{b->getSizeTy(), "charCount"}}) {
98
99}
100
101void WordCountKernel::generatePabloMethod() {
102    PabloBuilder pb(getEntryScope());
103    std::unique_ptr<cc::CC_Compiler> ccc;
104    if (CountWords || CountChars) {
105        ccc = make_unique<cc::Parabix_CC_Compiler>(getEntryScope(), getInputStreamSet("u8bit"));
106    } else {
107        ccc = make_unique<cc::Direct_CC_Compiler>(getEntryScope(), pb.createExtract(getInput(0), pb.getInteger(0)));
108    }
109
110    //  output: 3 counters
111    Var * lc = getOutputScalarVar("lineCount");
112    Var * wc = getOutputScalarVar("wordCount");
113    Var * cc = getOutputScalarVar("charCount");
114
115    if (CountLines) {
116        PabloAST * LF = ccc->compileCC(re::makeByte(0x0A));
117        pb.createAssign(lc, pb.createCount(LF));
118    }
119    if (CountWords) {
120        PabloAST * WS = ccc->compileCC(re::makeCC(re::makeByte(0x09, 0x0D), re::makeByte(0x20)));
121        PabloAST * wordChar = pb.createNot(WS);
122        // WS_follow_or_start = 1 past WS or at start of file
123        PabloAST * WS_follow_or_start = pb.createNot(pb.createAdvance(wordChar, 1));
124        PabloAST * wordStart = pb.createInFile(pb.createAnd(wordChar, WS_follow_or_start));
125        pb.createAssign(wc, pb.createCount(wordStart));
126    }
127    if (CountChars) {
128        //
129        // FIXME: This correctly counts characters assuming valid UTF-8 input.  But what if input is
130        // not UTF-8, or is not valid?
131        //
132        PabloAST * u8Begin = ccc->compileCC(re::makeCC(re::makeByte(0, 0x7F), re::makeByte(0xC2, 0xF4)));
133        pb.createAssign(cc, pb.createCount(u8Begin));
134    }
135}
136
// Signature of the generated entry point: the file descriptor to read and
// the index of the file whose counts are to be recorded.
using WordCountFunctionType = void (*)(uint32_t fd, size_t fileIdx);
138
139void wcPipelineGen(ParabixDriver & pxDriver) {
140
141    auto & iBuilder = pxDriver.getBuilder();
142    Module * m = iBuilder->getModule();
143    const unsigned segmentSize = codegen::SegmentSize;
144    const unsigned bufferSegments = codegen::ThreadNum+1;
145
146   
147    Type * const int32Ty = iBuilder->getInt32Ty();
148    Type * const sizeTy = iBuilder->getSizeTy();
149    Type * const voidTy = iBuilder->getVoidTy();
150
151    FunctionType * const recordCountsType = FunctionType::get(voidTy, {sizeTy, sizeTy, sizeTy, sizeTy, sizeTy}, false);
152    Constant * const recordCounts = m->getOrInsertFunction("record_counts", recordCountsType);
153
154    FunctionType * const mainType = FunctionType::get(voidTy, {int32Ty, sizeTy}, false);
155    Function * const main = cast<Function>(m->getOrInsertFunction("Main", mainType));
156    main->setCallingConv(CallingConv::C);
157    Function::arg_iterator args = main->arg_begin();   
158    Value * const fileDecriptor = &*(args++);
159    fileDecriptor->setName("fileDecriptor");
160    Value * const fileIdx = &*(args++);
161    fileIdx->setName("fileIdx");
162
163    iBuilder->SetInsertPoint(BasicBlock::Create(m->getContext(), "entry", main,0));
164
165    StreamSetBuffer * const ByteStream = pxDriver.addBuffer<SourceBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8));
166
167
168    Kernel * mmapK = pxDriver.addKernelInstance<MMapSourceKernel>(iBuilder);
169    mmapK->setInitialArguments({fileDecriptor});
170    pxDriver.makeKernelCall(mmapK, {}, {ByteStream});
171   
172    Kernel * wck  = nullptr;
173    if (CountWords || CountChars) {
174        StreamSetBuffer * const BasisBits = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(8, 1), segmentSize * bufferSegments);
175        Kernel * s2pk = pxDriver.addKernelInstance<S2PKernel>(iBuilder);
176        pxDriver.makeKernelCall(s2pk, {ByteStream}, {BasisBits});
177       
178        wck = pxDriver.addKernelInstance<WordCountKernel>(iBuilder, Binding{iBuilder->getStreamSetTy(8, 1), "u8bit"});
179        pxDriver.makeKernelCall(wck, {BasisBits}, {});
180
181
182    } else {
183        wck = pxDriver.addKernelInstance<WordCountKernel>(iBuilder, Binding{iBuilder->getStreamSetTy(1, 8), "u8byte"});
184        pxDriver.makeKernelCall(wck, {ByteStream}, {});
185    }
186
187    pxDriver.generatePipelineIR();
188   
189    iBuilder->setKernel(mmapK);
190    Value * const fileSize = iBuilder->getAccumulator("fileSize");
191    iBuilder->setKernel(wck);
192    Value * const lineCount = iBuilder->getAccumulator("lineCount");
193    Value * const wordCount = iBuilder->getAccumulator("wordCount");
194    Value * const charCount = iBuilder->getAccumulator("charCount");
195
196    iBuilder->CreateCall(recordCounts, {lineCount, wordCount, charCount, fileSize, fileIdx});
197    pxDriver.deallocateBuffers();
198    iBuilder->CreateRetVoid();
199
200    pxDriver.finalizeObject();
201}
202
203void wc(WordCountFunctionType fn_ptr, const int64_t fileIdx) {
204    std::string fileName = inputFiles[fileIdx];
205    const int fd = open(fileName.c_str(), O_RDONLY);
206    if (LLVM_UNLIKELY(fd == -1)) {
207        std::cerr << "Error: cannot open " << fileName << " for processing. Skipped.\n";
208    } else {
209        fn_ptr(fd, fileIdx);
210        close(fd);
211    }
212}
213
214int main(int argc, char *argv[]) {
215    codegen::ParseCommandLineOptions(argc, argv, {&wcFlags, pablo_toolchain_flags(), codegen::codegen_flags()});
216    if (wcOptions.size() == 0) {
217        CountLines = true;
218        CountWords = true;
219        CountBytes = true;
220    } else {
221        CountLines = false;
222        CountWords = false;
223        CountBytes = false;
224        CountChars = false;
225        for (unsigned i = 0; i < wcOptions.size(); i++) {
226            switch (wcOptions[i]) {
227                case WordOption: CountWords = true; break;
228                case LineOption: CountLines = true; break;
229                case CharOption: CountChars = true; CountBytes = false; break;
230                case ByteOption: CountBytes = true; CountChars = false; break;
231            }
232        }
233    }
234    if (CountLines) wc_modes += "l";
235    if (CountWords) wc_modes += "w";
236    if (CountChars) wc_modes += "m";
237    if (CountBytes) wc_modes += "c";
238
239    ParabixDriver pxDriver("wc");
240    wcPipelineGen(pxDriver);
241    auto wordCountFunctionPtr = reinterpret_cast<WordCountFunctionType>(pxDriver.getMain());
242
243    const auto fileCount = inputFiles.size();
244    lineCount.resize(fileCount);
245    wordCount.resize(fileCount);
246    charCount.resize(fileCount);
247    byteCount.resize(fileCount);
248   
249    for (unsigned i = 0; i < fileCount; ++i) {
250        wc(wordCountFunctionPtr, i);
251    }
252   
253    size_t maxCount = 0;
254    if (CountLines) maxCount = TotalLines;
255    if (CountWords) maxCount = TotalWords;
256    if (CountChars) maxCount = TotalChars;
257    if (CountBytes) maxCount = TotalBytes;
258   
259   
260   
261    int fieldWidth = std::to_string(maxCount).size() + 1;
262    if (fieldWidth < defaultFieldWidth) fieldWidth = defaultFieldWidth;
263
264    for (unsigned i = 0; i < inputFiles.size(); ++i) {
265        std::cout << std::setw(fieldWidth-1);
266        if (CountLines) {
267            std::cout << lineCount[i] << std::setw(fieldWidth);
268        }
269        if (CountWords) {
270            std::cout << wordCount[i] << std::setw(fieldWidth);
271        }
272        if (CountChars) {
273            std::cout << charCount[i] << std::setw(fieldWidth);
274        }
275        if (CountBytes) {
276            std::cout << byteCount[i];
277        }
278        std::cout << " " << inputFiles[i] << std::endl;
279    }
280    if (inputFiles.size() > 1) {
281        std::cout << std::setw(fieldWidth-1);
282        if (CountLines) {
283            std::cout << TotalLines << std::setw(fieldWidth);
284        }
285        if (CountWords) {
286            std::cout << TotalWords << std::setw(fieldWidth);
287        }
288        if (CountChars) {
289            std::cout << TotalChars << std::setw(fieldWidth);
290        }
291        if (CountBytes) {
292            std::cout << TotalBytes;
293        }
294        std::cout << " total" << std::endl;
295    }
296
297    return 0;
298}
Note: See TracBrowser for help on using the repository browser.