source: icGREP/icgrep-devel/icgrep/wc.cpp @ 5069

Last change on this file since 5069 was 5069, checked in by cameron, 3 years ago

Simplification: replace wcPipelineBuilder object by wcPipeline function.

File size: 12.6 KB
Line 
1/*
2 *  Copyright (c) 2015 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include <string>
8#include <iostream>
9#include <iomanip>
10#include <fstream>
11#include <sstream>
12
13
14#include <toolchain.h>
15#include <pablo/pablo_toolchain.h>
16#include <llvm/IR/Function.h>
17#include <llvm/IR/Module.h>
18#include <llvm/ExecutionEngine/ExecutionEngine.h>
19#include <llvm/ExecutionEngine/MCJIT.h>
20
21#include <llvm/Support/CommandLine.h>
22#include <llvm/Support/raw_ostream.h>
23
24#include <utf_encoding.h>
25#include <re/re_cc.h>
26#include <cc/cc_compiler.h>
27#include <pablo/function.h>
28#include <pablo/pablo_kernel.h>
29#include <IDISA/idisa_builder.h>
30#include <IDISA/idisa_target.h>
31#include <kernels/interface.h>
32#include <kernels/kernel.h>
33#include <kernels/s2p_kernel.h>
34
35#include <pablo/pablo_compiler.h>
36#include <pablo/pablo_toolchain.h>
37
38
39#include <utf_encoding.h>
40
41// mmap system
42#include <boost/filesystem.hpp>
43#include <boost/iostreams/device/mapped_file.hpp>
44using namespace boost::iostreams;
45using namespace boost::filesystem;
46
47#include <fcntl.h>
48static cl::OptionCategory wcFlags("Command Flags", "wc options");
49
50static cl::list<std::string> inputFiles(cl::Positional, cl::desc("<input file ...>"), cl::OneOrMore, cl::cat(wcFlags));
51
// The kinds of count that can be requested on the command line.
enum CountOptions {
    LineOption,   // -l
    WordOption,   // -w
    CharOption,   // -m
    ByteOption    // -c
};
55
56static cl::list<CountOptions> wcOptions(
57  cl::values(clEnumValN(LineOption, "l", "Report the number of lines in each input file."),
58             clEnumValN(WordOption, "w", "Report the number of words in each input file."),
59             clEnumValN(CharOption, "m", "Report the number of characters in each input file (override -c)."),
60             clEnumValN(ByteOption, "c", "Report the number of bytes in each input file (override -m)."),
61             clEnumValEnd), cl::cat(wcFlags), cl::Grouping);
62                                                 
63
64
// Minimum width of each printed count column.
static int defaultFieldWidth{7};

// Which counts were requested; set by main() from the command line flags.
bool CountLines{false};
bool CountWords{false};
bool CountChars{false};
bool CountBytes{false};

// Per-file results, indexed by the file's position in inputFiles.
// main() resizes these before any file is processed.
std::vector<uint64_t> lineCount;
std::vector<uint64_t> wordCount;
std::vector<uint64_t> charCount;
std::vector<uint64_t> byteCount;

// Running totals over all files processed so far.
uint64_t TotalLines{0};
uint64_t TotalWords{0};
uint64_t TotalChars{0};
uint64_t TotalBytes{0};
82
83
84//  The callback routine that records counts in progress.
85//
86extern "C" {
87    void record_counts(uint64_t lines, uint64_t words, uint64_t chars, uint64_t bytes, uint64_t fileIdx) {
88        lineCount[fileIdx] = lines;
89        wordCount[fileIdx] = words;
90        charCount[fileIdx] = chars;
91        byteCount[fileIdx] = bytes;
92        TotalLines += lines;
93        TotalWords += words;
94        TotalChars += chars;
95        TotalBytes += bytes;
96    }
97}
98
99//
100//
101
102pablo::PabloFunction * wc_gen(Encoding encoding) {
103    //  input: 8 basis bit streams
104    //  output: 3 counters
105   
106    pablo::PabloFunction * function = pablo::PabloFunction::Create("wc", 8, 0);
107    cc::CC_Compiler ccc(*function, encoding);
108   
109    pablo::PabloBuilder pBuilder(ccc.getBuilder().getPabloBlock(), ccc.getBuilder());
110    const std::vector<pablo::Var *> u8_bits = ccc.getBasisBits();
111
112    if (CountLines) {
113        pablo::PabloAST * LF = ccc.compileCC(re::makeCC(0x0A));
114        function->setResultCount(pBuilder.createCount("lineCount", LF));
115    }
116    if (CountWords) {
117        pablo::PabloAST * WS = ccc.compileCC(re::makeCC(re::makeCC(0x09, 0x0D), re::makeCC(0x20)));
118       
119        pablo::PabloAST * wordChar = pBuilder.createNot(WS);
120        // WS_follow_or_start = 1 past WS or at start of file
121        pablo::PabloAST * WS_follow_or_start = pBuilder.createNot(pBuilder.createAdvance(wordChar, 1));
122        //
123        pablo::PabloAST * wordStart = pBuilder.createInFile(pBuilder.createAnd(wordChar, WS_follow_or_start));
124        function->setResultCount(pBuilder.createCount("wordCount", wordStart));
125    }
126    if (CountChars) {
127        //
128        // FIXME: This correctly counts characters assuming valid UTF-8 input.  But what if input is
129        // not UTF-8, or is not valid?
130        //
131        pablo::PabloAST * u8Begin = ccc.compileCC(re::makeCC(re::makeCC(0, 0x7F), re::makeCC(0xC2, 0xF4)));
132        function->setResultCount(pBuilder.createCount("charCount", u8Begin));
133    }
134    return function;
135}
136
137using namespace kernel;
138
139
140
141class wcPipelineBuilder {
142public:
143    wcPipelineBuilder(llvm::Module * m, IDISA::IDISA_Builder * b);
144   
145    ~wcPipelineBuilder();
146   
147    llvm::Function * ExecuteKernels(pablo::PabloFunction * function);
148   
149private:
150    llvm::Module *                      mMod;
151    IDISA::IDISA_Builder *              iBuilder;
152    llvm::Type *                        mBitBlockType;
153    int                                 mBlockSize;
154};
155
156
157using namespace pablo;
158using namespace kernel;
159
160
161Function * wcPipeline(Module * mMod, IDISA::IDISA_Builder * iBuilder, PabloFunction * function) {
162    Type * mBitBlockType = iBuilder->getBitBlockType();
163    unsigned mBlockSize = iBuilder->getBitBlockWidth();
164    s2pKernel  s2pk(iBuilder);
165    s2pk.generateKernel();
166   
167    pablo_function_passes(function);
168    PabloKernel  wck(iBuilder, "wc", function, {"lineCount", "wordCount", "charCount"});
169    wck.prepareKernel();
170    wck.generateKernel();
171
172    Constant * record_counts_routine;
173    Type * const int64ty = iBuilder->getInt64Ty();
174    Type * const voidTy = Type::getVoidTy(mMod->getContext());
175    record_counts_routine = mMod->getOrInsertFunction("record_counts", voidTy, int64ty, int64ty, int64ty, int64ty, int64ty, nullptr);
176    Type * const inputType = PointerType::get(ArrayType::get(ArrayType::get(mBitBlockType, 8), 1), 0);
177   
178    Function * const main = cast<Function>(mMod->getOrInsertFunction("Main", voidTy, inputType, int64ty, int64ty, nullptr));
179    main->setCallingConv(CallingConv::C);
180    Function::arg_iterator args = main->arg_begin();
181   
182    Value * const inputStream = &*(args++);
183    inputStream->setName("input");
184    Value * const bufferSize = &*(args++);
185    bufferSize->setName("bufferSize");
186    Value * const fileIdx = &*(args++);
187    fileIdx->setName("fileIdx");
188   
189    iBuilder->SetInsertPoint(BasicBlock::Create(mMod->getContext(), "entry", main,0));
190   
191    BasicBlock * entryBlock = iBuilder->GetInsertBlock();
192
193    BasicBlock * fullCondBlock = BasicBlock::Create(mMod->getContext(), "fullCond", main, 0);
194    BasicBlock * fullBodyBlock = BasicBlock::Create(mMod->getContext(), "fullBody", main, 0);
195    BasicBlock * finalBlock = BasicBlock::Create(mMod->getContext(), "final", main, 0);
196
197    StreamSetBuffer ByteStream(iBuilder, StreamSetType(1, 8), 0);
198    StreamSetBuffer BasisBits(iBuilder, StreamSetType(8, 1), 1);
199    ByteStream.setStreamSetBuffer(inputStream);
200    Value * basisBits = BasisBits.allocateBuffer();
201
202    Value * s2pInstance = s2pk.createInstance({});
203    Value * wcInstance = wck.createInstance({});
204   
205    Value * initialBufferSize = bufferSize;
206    BasicBlock * initialBlock = entryBlock;
207    Value * initialBlockNo = iBuilder->getInt64(0);
208
209    iBuilder->CreateBr(fullCondBlock);
210
211   
212    iBuilder->SetInsertPoint(fullCondBlock);
213    PHINode * remainingBytes = iBuilder->CreatePHI(int64ty, 2, "remainingBytes");
214    remainingBytes->addIncoming(initialBufferSize, initialBlock);
215    PHINode * blockNo = iBuilder->CreatePHI(int64ty, 2, "blockNo");
216    blockNo->addIncoming(initialBlockNo, initialBlock);
217
218    Constant * const step = ConstantInt::get(int64ty, mBlockSize);
219    Value * fullCondTest = iBuilder->CreateICmpULT(remainingBytes, step);
220    iBuilder->CreateCondBr(fullCondTest, finalBlock, fullBodyBlock);
221   
222    iBuilder->SetInsertPoint(fullBodyBlock);
223
224    s2pk.createDoBlockCall(s2pInstance, {ByteStream.getBlockPointer(blockNo), basisBits});
225    wck.createDoBlockCall(wcInstance, {basisBits});
226
227    Value * diff = iBuilder->CreateSub(remainingBytes, step);
228
229    remainingBytes->addIncoming(diff, fullBodyBlock);
230    blockNo->addIncoming(iBuilder->CreateAdd(blockNo, iBuilder->getInt64(1)), fullBodyBlock);
231    iBuilder->CreateBr(fullCondBlock);
232   
233    iBuilder->SetInsertPoint(finalBlock);
234    s2pk.createFinalBlockCall(s2pInstance, remainingBytes, {ByteStream.getBlockPointer(blockNo), basisBits});
235    wck.createFinalBlockCall(wcInstance, remainingBytes, {basisBits});
236   
237    Value * lineCount = wck.createGetAccumulatorCall(wcInstance, "lineCount");
238    Value * wordCount = wck.createGetAccumulatorCall(wcInstance, "wordCount");
239    Value * charCount = wck.createGetAccumulatorCall(wcInstance, "charCount");;
240
241    iBuilder->CreateCall(record_counts_routine, std::vector<Value *>({lineCount, wordCount, charCount, bufferSize, fileIdx}));
242   
243    iBuilder->CreateRetVoid();
244    return main;
245}
246
247
248typedef void (*wcFunctionType)(char * byte_data, size_t filesize, size_t fileIdx);
249
250static ExecutionEngine * wcEngine = nullptr;
251
252wcFunctionType wcCodeGen(void) {
253                           
254    Module * M = new Module("wc", getGlobalContext());
255    IDISA::IDISA_Builder * idb = IDISA::GetIDISA_Builder(M);
256
257    Encoding encoding(Encoding::Type::UTF_8, 8);
258    pablo::PabloFunction * function = wc_gen(encoding);
259    llvm::Function * main_IR = wcPipeline(M, idb, function);
260
261    wcEngine = JIT_to_ExecutionEngine(M);
262   
263    wcEngine->finalizeObject();
264
265    delete idb;
266    return reinterpret_cast<wcFunctionType>(wcEngine->getPointerToFunction(main_IR));
267}
268
269void wc(wcFunctionType fn_ptr, const int64_t fileIdx) {
270    std::string fileName = inputFiles[fileIdx];
271    size_t fileSize;
272    char * fileBuffer;
273   
274    const path file(fileName);
275    if (exists(file)) {
276        if (is_directory(file)) {
277            return;
278        }
279    } else {
280        std::cerr << "Error: cannot open " << fileName << " for processing. Skipped.\n";
281        return;
282    }
283   
284    fileSize = file_size(file);
285    mapped_file_source mappedFile;
286    if (fileSize == 0) {
287        fileBuffer = nullptr;
288    }
289    else {
290        try {
291            mappedFile.open(fileName);
292        } catch (std::exception &e) {
293            std::cerr << "Error: Boost mmap of " << fileName << ": " << e.what() << std::endl;
294            return;
295        }
296        fileBuffer = const_cast<char *>(mappedFile.data());
297    }
298    fn_ptr(fileBuffer, fileSize, fileIdx);
299
300    mappedFile.close();
301   
302}
303
304
305
306int main(int argc, char *argv[]) {
307    cl::HideUnrelatedOptions(ArrayRef<const cl::OptionCategory *>{&wcFlags, pablo::pablo_toolchain_flags(), codegen::codegen_flags()});
308    cl::ParseCommandLineOptions(argc, argv);
309    if (wcOptions.size() == 0) {
310        CountLines = true;
311        CountWords = true;
312        CountBytes = true;
313    }
314    else {
315        CountLines = false;
316        CountWords = false;
317        CountBytes = false;
318        CountChars = false;
319        for (unsigned i = 0; i < wcOptions.size(); i++) {
320            switch (wcOptions[i]) {
321                case WordOption: CountWords = true; break;
322                case LineOption: CountLines = true; break;
323                case CharOption: CountBytes = true; CountChars = false; break;
324                case ByteOption: CountChars = true; CountBytes = false; break;
325            }
326        }
327    }
328   
329   
330    wcFunctionType fn_ptr = wcCodeGen();
331
332    int fileCount = inputFiles.size();
333    lineCount.resize(fileCount);
334    wordCount.resize(fileCount);
335    charCount.resize(fileCount);
336    byteCount.resize(fileCount);
337   
338    for (unsigned i = 0; i < inputFiles.size(); ++i) {
339        wc(fn_ptr, i);
340    }
341   
342    delete wcEngine;
343   
344    size_t maxCount = 0;
345    if (CountLines) maxCount = TotalLines;
346    if (CountWords) maxCount = TotalWords;
347    if (CountChars) maxCount = TotalChars;
348    if (CountBytes) maxCount = TotalBytes;
349   
350    int fieldWidth = std::to_string(maxCount).size() + 1;
351    if (fieldWidth < defaultFieldWidth) fieldWidth = defaultFieldWidth;
352
353    for (unsigned i = 0; i < inputFiles.size(); ++i) {
354        std::cout << std::setw(fieldWidth-1);
355        if (CountLines) {
356            std::cout << lineCount[i] << std::setw(fieldWidth);
357        }
358        if (CountWords) {
359            std::cout << wordCount[i] << std::setw(fieldWidth);
360        }
361        if (CountChars) {
362            std::cout << charCount[i] << std::setw(fieldWidth);
363        }
364        if (CountBytes) {
365            std::cout << byteCount[i];
366        }
367        std::cout << " " << inputFiles[i] << std::endl;
368    }
369    if (inputFiles.size() > 1) {
370        std::cout << std::setw(fieldWidth-1);
371        if (CountLines) {
372            std::cout << TotalLines << std::setw(fieldWidth);
373        }
374        if (CountWords) {
375            std::cout << TotalWords << std::setw(fieldWidth);
376        }
377        if (CountChars) {
378            std::cout << TotalChars << std::setw(fieldWidth);
379        }
380        if (CountBytes) {
381            std::cout << TotalBytes;
382        }
383        std::cout << " total" << std::endl;
384    }
385
386    return 0;
387}
388
389                       
Note: See TracBrowser for help on using the repository browser.