source: icGREP/icgrep-devel/icgrep/wc.cpp @ 5814

Last change on this file since 5814 was 5814, checked in by cameron, 17 months ago

Parsing of byte CCs

File size: 10.3 KB
Line 
1/*
2 *  Copyright (c) 2015 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include <iostream>
8#include <iomanip>
9#include <sstream>
10#include <toolchain/toolchain.h>
11#include <llvm/IR/Function.h>
12#include <llvm/IR/Module.h>
13#include <llvm/Support/CommandLine.h>
14#include <llvm/Support/raw_ostream.h>
15#include <cc/cc_compiler.h>
16#include <pablo/pablo_kernel.h>
17#include <kernels/kernel_builder.h>
18#include <IR_Gen/idisa_target.h>
19#include <kernels/streamset.h>
20#include <kernels/source_kernel.h>
21#include <kernels/s2p_kernel.h>
22#include <pablo/pablo_compiler.h>
23#include <pablo/pablo_toolchain.h>
24#include <toolchain/cpudriver.h>
25#include <fcntl.h>
26
27using namespace llvm;
28
29static cl::OptionCategory wcFlags("Command Flags", "wc options");
30
31static cl::list<std::string> inputFiles(cl::Positional, cl::desc("<input file ...>"), cl::OneOrMore, cl::cat(wcFlags));
32
33enum CountOptions {
34    LineOption, WordOption, CharOption, ByteOption
35};
36
37static cl::list<CountOptions> wcOptions(
38  cl::values(clEnumValN(LineOption, "l", "Report the number of lines in each input file."),
39             clEnumValN(WordOption, "w", "Report the number of words in each input file."),
40             clEnumValN(CharOption, "m", "Report the number of characters in each input file (override -c)."),
41             clEnumValN(ByteOption, "c", "Report the number of bytes in each input file (override -m).")
42             CL_ENUM_VAL_SENTINEL), cl::cat(wcFlags), cl::Grouping);
43                                                 
44
45
// Smallest column width used when printing the counts.
static int defaultFieldWidth{7};

// Which counts to report; assigned in main() from the command-line options.
bool CountLines{false};
bool CountWords{false};
bool CountChars{false};
bool CountBytes{false};

// Per-file results, written by record_counts() at the given file index.
std::vector<uint64_t> lineCount{};
std::vector<uint64_t> wordCount{};
std::vector<uint64_t> charCount{};
std::vector<uint64_t> byteCount{};

// Running sums over every file reported through record_counts().
uint64_t TotalLines{0};
uint64_t TotalWords{0};
uint64_t TotalChars{0};
uint64_t TotalBytes{0};
63
64using namespace pablo;
65using namespace kernel;
66using namespace parabix;
67
68//  The callback routine that records counts in progress.
69//
70extern "C" {
71    void record_counts(uint64_t lines, uint64_t words, uint64_t chars, uint64_t bytes, uint64_t fileIdx) {
72        lineCount[fileIdx] = lines;
73        wordCount[fileIdx] = words;
74        charCount[fileIdx] = chars;
75        byteCount[fileIdx] = bytes;
76        TotalLines += lines;
77        TotalWords += words;
78        TotalChars += chars;
79        TotalBytes += bytes;
80    }
81}
82
83class WordCountKernel final: public pablo::PabloKernel {
84public:
85    WordCountKernel(const std::unique_ptr<kernel::KernelBuilder> & b);
86    bool isCachable() const override { return true; }
87    bool hasSignature() const override { return false; }
88protected:
89    void generatePabloMethod() override;
90};
91
92WordCountKernel::WordCountKernel (const std::unique_ptr<kernel::KernelBuilder> & b)
93: PabloKernel(b, "wc",
94    {Binding{b->getStreamSetTy(8, 1), "u8bit"}, Binding{b->getStreamSetTy(1, 8), "u8byte"}},
95    {Binding{b->getStreamSetTy(1, 8), "dblbyte"}},
96    {},
97    {Binding{b->getSizeTy(), "lineCount"}, Binding{b->getSizeTy(), "wordCount"}, Binding{b->getSizeTy(), "charCount"}}) {
98
99}
100
101void WordCountKernel::generatePabloMethod() {
102
103    //  input: 8 basis bit streams
104    const auto u8bitSet = getInputStreamVar("u8bit");
105    const auto u8byteSet = getInputStreamVar("u8byte");
106    const auto dblbyteSet = getOutputStreamVar("dblbyte");
107    //  output: 3 counters
108
109    cc::CC_Compiler ccc(this, u8bitSet);
110
111    PabloBuilder & pb = ccc.getBuilder();
112
113    PabloAST * bytes = pb.createExtract(u8byteSet, pb.getInteger(0));
114    PabloAST * dbl = pb.createAdd(bytes, bytes);
115    pb.createAssign(pb.createExtract(dblbyteSet, pb.getInteger(0)), dbl);
116   
117    Var * lc = getOutputScalarVar("lineCount");
118    Var * wc = getOutputScalarVar("wordCount");
119    Var * cc = getOutputScalarVar("charCount");
120
121    if (CountLines) {
122        PabloAST * LF = ccc.compileCC(re::makeCC(0x0A));
123        pb.createAssign(lc, pb.createCount(LF));
124    }
125    if (CountWords) {
126        PabloAST * WS = ccc.compileCC(re::makeCC(re::makeCC(0x09, 0x0D), re::makeCC(0x20)));
127        PabloAST * wordChar = pb.createNot(WS);
128        // WS_follow_or_start = 1 past WS or at start of file
129        PabloAST * WS_follow_or_start = pb.createNot(pb.createAdvance(wordChar, 1));
130        PabloAST * wordStart = pb.createInFile(pb.createAnd(wordChar, WS_follow_or_start));
131        pb.createAssign(wc, pb.createCount(wordStart));
132    }
133    if (CountChars) {
134        //
135        // FIXME: This correctly counts characters assuming valid UTF-8 input.  But what if input is
136        // not UTF-8, or is not valid?
137        //
138        PabloAST * u8Begin = ccc.compileCC(re::makeCC(re::makeCC(0, 0x7F), re::makeCC(0xC2, 0xF4)));
139        pb.createAssign(cc, pb.createCount(u8Begin));
140    }
141}
142
// Signature of the JIT-compiled entry point: (file descriptor, index of the file).
using WordCountFunctionType = void (*)(uint32_t fd, size_t fileIdx);
144
145void wcPipelineGen(ParabixDriver & pxDriver) {
146
147    auto & iBuilder = pxDriver.getBuilder();
148    Module * m = iBuilder->getModule();
149    const unsigned segmentSize = codegen::SegmentSize;
150    const unsigned bufferSegments = codegen::ThreadNum+1;
151
152   
153    Type * const int32Ty = iBuilder->getInt32Ty();
154    Type * const sizeTy = iBuilder->getSizeTy();
155    Type * const voidTy = iBuilder->getVoidTy();
156
157    FunctionType * const recordCountsType = FunctionType::get(voidTy, {sizeTy, sizeTy, sizeTy, sizeTy, sizeTy}, false);
158    Constant * const recordCounts = m->getOrInsertFunction("record_counts", recordCountsType);
159
160    FunctionType * const mainType = FunctionType::get(voidTy, {int32Ty, sizeTy}, false);
161    Function * const main = cast<Function>(m->getOrInsertFunction("Main", mainType));
162    main->setCallingConv(CallingConv::C);
163    Function::arg_iterator args = main->arg_begin();   
164    Value * const fileDecriptor = &*(args++);
165    fileDecriptor->setName("fileDecriptor");
166    Value * const fileIdx = &*(args++);
167    fileIdx->setName("fileIdx");
168
169    iBuilder->SetInsertPoint(BasicBlock::Create(m->getContext(), "entry", main,0));
170
171    StreamSetBuffer * const ByteStream = pxDriver.addBuffer<SourceBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8));
172    StreamSetBuffer * const ByteOut = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8), segmentSize * bufferSegments);
173
174    StreamSetBuffer * const BasisBits = pxDriver.addBuffer<CircularBuffer>(iBuilder, iBuilder->getStreamSetTy(8, 1), segmentSize * bufferSegments);
175
176    Kernel * mmapK = pxDriver.addKernelInstance<MMapSourceKernel>(iBuilder);
177    mmapK->setInitialArguments({fileDecriptor});
178    pxDriver.makeKernelCall(mmapK, {}, {ByteStream});
179
180    Kernel * s2pk = pxDriver.addKernelInstance<S2PKernel>(iBuilder);
181    pxDriver.makeKernelCall(s2pk, {ByteStream}, {BasisBits});
182   
183    Kernel * wck = pxDriver.addKernelInstance<WordCountKernel>(iBuilder);
184    pxDriver.makeKernelCall(wck, {BasisBits,ByteStream}, {ByteOut});
185
186    pxDriver.generatePipelineIR();
187   
188    iBuilder->setKernel(mmapK);
189    Value * const fileSize = iBuilder->getAccumulator("fileSize");
190    iBuilder->setKernel(wck);
191    Value * const lineCount = iBuilder->getAccumulator("lineCount");
192    Value * const wordCount = iBuilder->getAccumulator("wordCount");
193    Value * const charCount = iBuilder->getAccumulator("charCount");
194
195    iBuilder->CreateCall(recordCounts, {lineCount, wordCount, charCount, fileSize, fileIdx});
196    pxDriver.deallocateBuffers();
197    iBuilder->CreateRetVoid();
198
199    pxDriver.finalizeObject();
200}
201
202void wc(WordCountFunctionType fn_ptr, const int64_t fileIdx) {
203    std::string fileName = inputFiles[fileIdx];
204    const int fd = open(fileName.c_str(), O_RDONLY);
205    if (LLVM_UNLIKELY(fd == -1)) {
206        std::cerr << "Error: cannot open " << fileName << " for processing. Skipped.\n";
207    } else {
208        fn_ptr(fd, fileIdx);
209        close(fd);
210    }
211}
212
213int main(int argc, char *argv[]) {
214    codegen::ParseCommandLineOptions(argc, argv, {&wcFlags, pablo_toolchain_flags(), codegen::codegen_flags()});
215    if (wcOptions.size() == 0) {
216        CountLines = true;
217        CountWords = true;
218        CountBytes = true;
219    } else {
220        CountLines = false;
221        CountWords = false;
222        CountBytes = false;
223        CountChars = false;
224        for (unsigned i = 0; i < wcOptions.size(); i++) {
225            switch (wcOptions[i]) {
226                case WordOption: CountWords = true; break;
227                case LineOption: CountLines = true; break;
228                case CharOption: CountBytes = true; CountChars = false; break;
229                case ByteOption: CountChars = true; CountBytes = false; break;
230            }
231        }
232    }
233   
234    ParabixDriver pxDriver("wc");
235    wcPipelineGen(pxDriver);
236    auto wordCountFunctionPtr = reinterpret_cast<WordCountFunctionType>(pxDriver.getMain());
237
238    const auto fileCount = inputFiles.size();
239    lineCount.resize(fileCount);
240    wordCount.resize(fileCount);
241    charCount.resize(fileCount);
242    byteCount.resize(fileCount);
243   
244    for (unsigned i = 0; i < fileCount; ++i) {
245        wc(wordCountFunctionPtr, i);
246    }
247   
248    size_t maxCount = 0;
249    if (CountLines) maxCount = TotalLines;
250    if (CountWords) maxCount = TotalWords;
251    if (CountChars) maxCount = TotalChars;
252    if (CountBytes) maxCount = TotalBytes;
253   
254    int fieldWidth = std::to_string(maxCount).size() + 1;
255    if (fieldWidth < defaultFieldWidth) fieldWidth = defaultFieldWidth;
256
257    for (unsigned i = 0; i < inputFiles.size(); ++i) {
258        std::cout << std::setw(fieldWidth-1);
259        if (CountLines) {
260            std::cout << lineCount[i] << std::setw(fieldWidth);
261        }
262        if (CountWords) {
263            std::cout << wordCount[i] << std::setw(fieldWidth);
264        }
265        if (CountChars) {
266            std::cout << charCount[i] << std::setw(fieldWidth);
267        }
268        if (CountBytes) {
269            std::cout << byteCount[i];
270        }
271        std::cout << " " << inputFiles[i] << std::endl;
272    }
273    if (inputFiles.size() > 1) {
274        std::cout << std::setw(fieldWidth-1);
275        if (CountLines) {
276            std::cout << TotalLines << std::setw(fieldWidth);
277        }
278        if (CountWords) {
279            std::cout << TotalWords << std::setw(fieldWidth);
280        }
281        if (CountChars) {
282            std::cout << TotalChars << std::setw(fieldWidth);
283        }
284        if (CountBytes) {
285            std::cout << TotalBytes;
286        }
287        std::cout << " total" << std::endl;
288    }
289
290    return 0;
291}
Note: See TracBrowser for help on using the repository browser.