source: icGREP/icgrep-devel/icgrep/wc.cpp @ 5070

Last change on this file since 5070 was 5070, checked in by cameron, 3 years ago

Further wc clean-up.

File size: 12.2 KB
Line 
1/*
2 *  Copyright (c) 2015 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include <string>
8#include <iostream>
9#include <iomanip>
10#include <fstream>
11#include <sstream>
12
13
14#include <toolchain.h>
15#include <pablo/pablo_toolchain.h>
16#include <llvm/IR/Function.h>
17#include <llvm/IR/Module.h>
18#include <llvm/ExecutionEngine/ExecutionEngine.h>
19#include <llvm/ExecutionEngine/MCJIT.h>
20
21#include <llvm/Support/CommandLine.h>
22#include <llvm/Support/raw_ostream.h>
23
24#include <utf_encoding.h>
25#include <re/re_cc.h>
26#include <cc/cc_compiler.h>
27#include <pablo/function.h>
28#include <pablo/pablo_kernel.h>
29#include <IDISA/idisa_builder.h>
30#include <IDISA/idisa_target.h>
31#include <kernels/interface.h>
32#include <kernels/kernel.h>
33#include <kernels/s2p_kernel.h>
34
35#include <pablo/pablo_compiler.h>
36#include <pablo/pablo_toolchain.h>
37
38
39#include <utf_encoding.h>
40
41// mmap system
42#include <boost/filesystem.hpp>
43#include <boost/iostreams/device/mapped_file.hpp>
44using namespace boost::iostreams;
45using namespace boost::filesystem;
46
47#include <fcntl.h>
48static cl::OptionCategory wcFlags("Command Flags", "wc options");
49
50static cl::list<std::string> inputFiles(cl::Positional, cl::desc("<input file ...>"), cl::OneOrMore, cl::cat(wcFlags));
51
52enum CountOptions {
53    LineOption, WordOption, CharOption, ByteOption
54};
55
56static cl::list<CountOptions> wcOptions(
57  cl::values(clEnumValN(LineOption, "l", "Report the number of lines in each input file."),
58             clEnumValN(WordOption, "w", "Report the number of words in each input file."),
59             clEnumValN(CharOption, "m", "Report the number of characters in each input file (override -c)."),
60             clEnumValN(ByteOption, "c", "Report the number of bytes in each input file (override -m)."),
61             clEnumValEnd), cl::cat(wcFlags), cl::Grouping);
62                                                 
63
64
// Smallest column width used when printing the report.
static int defaultFieldWidth{7};

// Which counts are to be produced.
bool CountLines{false};
bool CountWords{false};
bool CountChars{false};
bool CountBytes{false};

// Per-file results, indexed by file index.
std::vector<uint64_t> lineCount;
std::vector<uint64_t> wordCount;
std::vector<uint64_t> charCount;
std::vector<uint64_t> byteCount;

// Sums of the per-file results over every file recorded so far.
uint64_t TotalLines{0};
uint64_t TotalWords{0};
uint64_t TotalChars{0};
uint64_t TotalBytes{0};
82
83
84//  The callback routine that records counts in progress.
85//
86extern "C" {
87    void record_counts(uint64_t lines, uint64_t words, uint64_t chars, uint64_t bytes, uint64_t fileIdx) {
88        lineCount[fileIdx] = lines;
89        wordCount[fileIdx] = words;
90        charCount[fileIdx] = chars;
91        byteCount[fileIdx] = bytes;
92        TotalLines += lines;
93        TotalWords += words;
94        TotalChars += chars;
95        TotalBytes += bytes;
96    }
97}
98
99//
100//
101
102pablo::PabloFunction * wc_gen(Encoding encoding) {
103    //  input: 8 basis bit streams
104    //  output: 3 counters
105   
106    pablo::PabloFunction * function = pablo::PabloFunction::Create("wc", 8, 0);
107    cc::CC_Compiler ccc(*function, encoding);
108   
109    pablo::PabloBuilder pBuilder(ccc.getBuilder().getPabloBlock(), ccc.getBuilder());
110    const std::vector<pablo::Var *> u8_bits = ccc.getBasisBits();
111
112    if (CountLines) {
113        pablo::PabloAST * LF = ccc.compileCC(re::makeCC(0x0A));
114        function->setResultCount(pBuilder.createCount("lineCount", LF));
115    }
116    if (CountWords) {
117        pablo::PabloAST * WS = ccc.compileCC(re::makeCC(re::makeCC(0x09, 0x0D), re::makeCC(0x20)));
118       
119        pablo::PabloAST * wordChar = pBuilder.createNot(WS);
120        // WS_follow_or_start = 1 past WS or at start of file
121        pablo::PabloAST * WS_follow_or_start = pBuilder.createNot(pBuilder.createAdvance(wordChar, 1));
122        //
123        pablo::PabloAST * wordStart = pBuilder.createInFile(pBuilder.createAnd(wordChar, WS_follow_or_start));
124        function->setResultCount(pBuilder.createCount("wordCount", wordStart));
125    }
126    if (CountChars) {
127        //
128        // FIXME: This correctly counts characters assuming valid UTF-8 input.  But what if input is
129        // not UTF-8, or is not valid?
130        //
131        pablo::PabloAST * u8Begin = ccc.compileCC(re::makeCC(re::makeCC(0, 0x7F), re::makeCC(0xC2, 0xF4)));
132        function->setResultCount(pBuilder.createCount("charCount", u8Begin));
133    }
134    return function;
135}
136
137using namespace kernel;
138
139
140Function * wcPipeline(Module * mMod, IDISA::IDISA_Builder * iBuilder, pablo::PabloFunction * function) {
141    Type * mBitBlockType = iBuilder->getBitBlockType();
142    unsigned mBlockSize = iBuilder->getBitBlockWidth();
143    s2pKernel  s2pk(iBuilder);
144    s2pk.generateKernel();
145   
146    pablo_function_passes(function);
147    pablo::PabloKernel  wck(iBuilder, "wc", function, {"lineCount", "wordCount", "charCount"});
148    wck.prepareKernel();
149    wck.generateKernel();
150
151    Constant * record_counts_routine;
152    Type * const int64ty = iBuilder->getInt64Ty();
153    Type * const voidTy = Type::getVoidTy(mMod->getContext());
154    record_counts_routine = mMod->getOrInsertFunction("record_counts", voidTy, int64ty, int64ty, int64ty, int64ty, int64ty, nullptr);
155    Type * const inputType = PointerType::get(ArrayType::get(ArrayType::get(mBitBlockType, 8), 1), 0);
156   
157    Function * const main = cast<Function>(mMod->getOrInsertFunction("Main", voidTy, inputType, int64ty, int64ty, nullptr));
158    main->setCallingConv(CallingConv::C);
159    Function::arg_iterator args = main->arg_begin();
160   
161    Value * const inputStream = &*(args++);
162    inputStream->setName("input");
163    Value * const bufferSize = &*(args++);
164    bufferSize->setName("bufferSize");
165    Value * const fileIdx = &*(args++);
166    fileIdx->setName("fileIdx");
167   
168    iBuilder->SetInsertPoint(BasicBlock::Create(mMod->getContext(), "entry", main,0));
169   
170    BasicBlock * entryBlock = iBuilder->GetInsertBlock();
171
172    BasicBlock * fullCondBlock = BasicBlock::Create(mMod->getContext(), "fullCond", main, 0);
173    BasicBlock * fullBodyBlock = BasicBlock::Create(mMod->getContext(), "fullBody", main, 0);
174    BasicBlock * finalBlock = BasicBlock::Create(mMod->getContext(), "final", main, 0);
175
176    StreamSetBuffer ByteStream(iBuilder, StreamSetType(1, 8), 0);
177    StreamSetBuffer BasisBits(iBuilder, StreamSetType(8, 1), 1);
178    ByteStream.setStreamSetBuffer(inputStream);
179    Value * basisBits = BasisBits.allocateBuffer();
180
181    Value * s2pInstance = s2pk.createInstance({});
182    Value * wcInstance = wck.createInstance({});
183   
184    Value * initialBufferSize = bufferSize;
185    BasicBlock * initialBlock = entryBlock;
186    Value * initialBlockNo = iBuilder->getInt64(0);
187
188    iBuilder->CreateBr(fullCondBlock);
189
190   
191    iBuilder->SetInsertPoint(fullCondBlock);
192    PHINode * remainingBytes = iBuilder->CreatePHI(int64ty, 2, "remainingBytes");
193    remainingBytes->addIncoming(initialBufferSize, initialBlock);
194    PHINode * blockNo = iBuilder->CreatePHI(int64ty, 2, "blockNo");
195    blockNo->addIncoming(initialBlockNo, initialBlock);
196
197    Constant * const step = ConstantInt::get(int64ty, mBlockSize);
198    Value * fullCondTest = iBuilder->CreateICmpULT(remainingBytes, step);
199    iBuilder->CreateCondBr(fullCondTest, finalBlock, fullBodyBlock);
200   
201    iBuilder->SetInsertPoint(fullBodyBlock);
202
203    s2pk.createDoBlockCall(s2pInstance, {ByteStream.getBlockPointer(blockNo), basisBits});
204    wck.createDoBlockCall(wcInstance, {basisBits});
205
206    Value * diff = iBuilder->CreateSub(remainingBytes, step);
207
208    remainingBytes->addIncoming(diff, fullBodyBlock);
209    blockNo->addIncoming(iBuilder->CreateAdd(blockNo, iBuilder->getInt64(1)), fullBodyBlock);
210    iBuilder->CreateBr(fullCondBlock);
211   
212    iBuilder->SetInsertPoint(finalBlock);
213    s2pk.createFinalBlockCall(s2pInstance, remainingBytes, {ByteStream.getBlockPointer(blockNo), basisBits});
214    wck.createFinalBlockCall(wcInstance, remainingBytes, {basisBits});
215   
216    Value * lineCount = wck.createGetAccumulatorCall(wcInstance, "lineCount");
217    Value * wordCount = wck.createGetAccumulatorCall(wcInstance, "wordCount");
218    Value * charCount = wck.createGetAccumulatorCall(wcInstance, "charCount");;
219
220    iBuilder->CreateCall(record_counts_routine, std::vector<Value *>({lineCount, wordCount, charCount, bufferSize, fileIdx}));
221   
222    iBuilder->CreateRetVoid();
223    return main;
224}
225
226
227typedef void (*wcFunctionType)(char * byte_data, size_t filesize, size_t fileIdx);
228
229static ExecutionEngine * wcEngine = nullptr;
230
231wcFunctionType wcCodeGen(void) {
232                           
233    Module * M = new Module("wc", getGlobalContext());
234    IDISA::IDISA_Builder * idb = IDISA::GetIDISA_Builder(M);
235
236    Encoding encoding(Encoding::Type::UTF_8, 8);
237    pablo::PabloFunction * function = wc_gen(encoding);
238    llvm::Function * main_IR = wcPipeline(M, idb, function);
239
240    wcEngine = JIT_to_ExecutionEngine(M);
241   
242    wcEngine->finalizeObject();
243
244    delete idb;
245    return reinterpret_cast<wcFunctionType>(wcEngine->getPointerToFunction(main_IR));
246}
247
248void wc(wcFunctionType fn_ptr, const int64_t fileIdx) {
249    std::string fileName = inputFiles[fileIdx];
250    size_t fileSize;
251    char * fileBuffer;
252   
253    const path file(fileName);
254    if (exists(file)) {
255        if (is_directory(file)) {
256            return;
257        }
258    } else {
259        std::cerr << "Error: cannot open " << fileName << " for processing. Skipped.\n";
260        return;
261    }
262   
263    fileSize = file_size(file);
264    mapped_file_source mappedFile;
265    if (fileSize == 0) {
266        fileBuffer = nullptr;
267    }
268    else {
269        try {
270            mappedFile.open(fileName);
271        } catch (std::exception &e) {
272            std::cerr << "Error: Boost mmap of " << fileName << ": " << e.what() << std::endl;
273            return;
274        }
275        fileBuffer = const_cast<char *>(mappedFile.data());
276    }
277    fn_ptr(fileBuffer, fileSize, fileIdx);
278
279    mappedFile.close();
280   
281}
282
283
284
285int main(int argc, char *argv[]) {
286    cl::HideUnrelatedOptions(ArrayRef<const cl::OptionCategory *>{&wcFlags, pablo::pablo_toolchain_flags(), codegen::codegen_flags()});
287    cl::ParseCommandLineOptions(argc, argv);
288    if (wcOptions.size() == 0) {
289        CountLines = true;
290        CountWords = true;
291        CountBytes = true;
292    }
293    else {
294        CountLines = false;
295        CountWords = false;
296        CountBytes = false;
297        CountChars = false;
298        for (unsigned i = 0; i < wcOptions.size(); i++) {
299            switch (wcOptions[i]) {
300                case WordOption: CountWords = true; break;
301                case LineOption: CountLines = true; break;
302                case CharOption: CountBytes = true; CountChars = false; break;
303                case ByteOption: CountChars = true; CountBytes = false; break;
304            }
305        }
306    }
307   
308   
309    wcFunctionType fn_ptr = wcCodeGen();
310
311    int fileCount = inputFiles.size();
312    lineCount.resize(fileCount);
313    wordCount.resize(fileCount);
314    charCount.resize(fileCount);
315    byteCount.resize(fileCount);
316   
317    for (unsigned i = 0; i < inputFiles.size(); ++i) {
318        wc(fn_ptr, i);
319    }
320   
321    delete wcEngine;
322   
323    size_t maxCount = 0;
324    if (CountLines) maxCount = TotalLines;
325    if (CountWords) maxCount = TotalWords;
326    if (CountChars) maxCount = TotalChars;
327    if (CountBytes) maxCount = TotalBytes;
328   
329    int fieldWidth = std::to_string(maxCount).size() + 1;
330    if (fieldWidth < defaultFieldWidth) fieldWidth = defaultFieldWidth;
331
332    for (unsigned i = 0; i < inputFiles.size(); ++i) {
333        std::cout << std::setw(fieldWidth-1);
334        if (CountLines) {
335            std::cout << lineCount[i] << std::setw(fieldWidth);
336        }
337        if (CountWords) {
338            std::cout << wordCount[i] << std::setw(fieldWidth);
339        }
340        if (CountChars) {
341            std::cout << charCount[i] << std::setw(fieldWidth);
342        }
343        if (CountBytes) {
344            std::cout << byteCount[i];
345        }
346        std::cout << " " << inputFiles[i] << std::endl;
347    }
348    if (inputFiles.size() > 1) {
349        std::cout << std::setw(fieldWidth-1);
350        if (CountLines) {
351            std::cout << TotalLines << std::setw(fieldWidth);
352        }
353        if (CountWords) {
354            std::cout << TotalWords << std::setw(fieldWidth);
355        }
356        if (CountChars) {
357            std::cout << TotalChars << std::setw(fieldWidth);
358        }
359        if (CountBytes) {
360            std::cout << TotalBytes;
361        }
362        std::cout << " total" << std::endl;
363    }
364
365    return 0;
366}
367
368                       
Note: See TracBrowser for help on using the repository browser.