source: icGREP/icgrep-devel/icgrep/wc.cpp @ 5121

Last change on this file since 5121 was 5109, checked in by cameron, 3 years ago

u8u16 pipeline

File size: 10.9 KB
Line 
1/*
2 *  Copyright (c) 2015 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include <string>
8#include <iostream>
9#include <iomanip>
10#include <fstream>
11#include <sstream>
12
13
14#include <toolchain.h>
15#include <pablo/pablo_toolchain.h>
16#include <llvm/IR/Function.h>
17#include <llvm/IR/Module.h>
18#include <llvm/ExecutionEngine/ExecutionEngine.h>
19#include <llvm/ExecutionEngine/MCJIT.h>
20#include "llvm/Linker/Linker.h"
21
22#include <llvm/Support/CommandLine.h>
23#include <llvm/Support/raw_ostream.h>
24
25#include <utf_encoding.h>
26#include <re/re_cc.h>
27#include <cc/cc_compiler.h>
28#include <pablo/function.h>
29#include <pablo/pablo_kernel.h>
30#include <IDISA/idisa_builder.h>
31#include <IDISA/idisa_target.h>
32#include <kernels/streamset.h>
33#include <kernels/interface.h>
34#include <kernels/kernel.h>
35#include <kernels/s2p_kernel.h>
36#include <kernels/pipeline.h>
37
38#include <pablo/pablo_compiler.h>
39#include <pablo/pablo_toolchain.h>
40
41
42#include <utf_encoding.h>
43
44// mmap system
45#include <boost/filesystem.hpp>
46#include <boost/iostreams/device/mapped_file.hpp>
47
48#include <fcntl.h>
49static cl::OptionCategory wcFlags("Command Flags", "wc options");
50
51static cl::list<std::string> inputFiles(cl::Positional, cl::desc("<input file ...>"), cl::OneOrMore, cl::cat(wcFlags));
52
53enum CountOptions {
54    LineOption, WordOption, CharOption, ByteOption
55};
56
57static cl::list<CountOptions> wcOptions(
58  cl::values(clEnumValN(LineOption, "l", "Report the number of lines in each input file."),
59             clEnumValN(WordOption, "w", "Report the number of words in each input file."),
60             clEnumValN(CharOption, "m", "Report the number of characters in each input file (override -c)."),
61             clEnumValN(ByteOption, "c", "Report the number of bytes in each input file (override -m)."),
62             clEnumValEnd), cl::cat(wcFlags), cl::Grouping);
63                                                 
64
65
66static int defaultFieldWidth = 7;  // default field width
67
68
// Which counts were requested; main() sets these from the command-line flags.
bool CountLines = false;
bool CountWords = false;
bool CountChars = false;
bool CountBytes = false;

// Per-file results, indexed by the file's position in the input file list.
// main() sizes these vectors before any file is processed.
std::vector<uint64_t> lineCount;
std::vector<uint64_t> wordCount;
std::vector<uint64_t> charCount;
std::vector<uint64_t> byteCount;

// Running totals over every file recorded so far.
uint64_t TotalLines = 0;
uint64_t TotalWords = 0;
uint64_t TotalChars = 0;
uint64_t TotalBytes = 0;


//  Callback that stores the counts for one file and folds them into the totals.
//  It has C linkage because the generated "Main" function calls it by the
//  symbol name "record_counts".  fileIdx must be a valid index into the
//  per-file vectors; it is not range-checked here.
//
extern "C" void record_counts(uint64_t lines, uint64_t words, uint64_t chars, uint64_t bytes, uint64_t fileIdx) {
    lineCount[fileIdx] = lines;
    TotalLines += lines;

    wordCount[fileIdx] = words;
    TotalWords += words;

    charCount[fileIdx] = chars;
    TotalChars += chars;

    byteCount[fileIdx] = bytes;
    TotalBytes += bytes;
}
99
100//
101//
102
103pablo::PabloFunction * wc_gen(Encoding encoding) {
104    //  input: 8 basis bit streams
105    //  output: 3 counters
106   
107    pablo::PabloFunction * function = pablo::PabloFunction::Create("wc", 8, 0);
108    cc::CC_Compiler ccc(*function, encoding);
109   
110    pablo::PabloBuilder pBuilder(ccc.getBuilder().getPabloBlock(), ccc.getBuilder());
111    const std::vector<pablo::Var *> u8_bits = ccc.getBasisBits();
112
113    if (CountLines) {
114        pablo::PabloAST * LF = ccc.compileCC(re::makeCC(0x0A));
115        function->setResultCount(pBuilder.createCount("lineCount", LF));
116    }
117    if (CountWords) {
118        pablo::PabloAST * WS = ccc.compileCC(re::makeCC(re::makeCC(0x09, 0x0D), re::makeCC(0x20)));
119       
120        pablo::PabloAST * wordChar = pBuilder.createNot(WS);
121        // WS_follow_or_start = 1 past WS or at start of file
122        pablo::PabloAST * WS_follow_or_start = pBuilder.createNot(pBuilder.createAdvance(wordChar, 1));
123        //
124        pablo::PabloAST * wordStart = pBuilder.createInFile(pBuilder.createAnd(wordChar, WS_follow_or_start));
125        function->setResultCount(pBuilder.createCount("wordCount", wordStart));
126    }
127    if (CountChars) {
128        //
129        // FIXME: This correctly counts characters assuming valid UTF-8 input.  But what if input is
130        // not UTF-8, or is not valid?
131        //
132        pablo::PabloAST * u8Begin = ccc.compileCC(re::makeCC(re::makeCC(0, 0x7F), re::makeCC(0xC2, 0xF4)));
133        function->setResultCount(pBuilder.createCount("charCount", u8Begin));
134    }
135    return function;
136}
137
138using namespace kernel;
139using namespace parabix;
140
141
142Function * wcPipeline(Module * mMod, IDISA::IDISA_Builder * iBuilder, pablo::PabloFunction * function) {
143    Type * mBitBlockType = iBuilder->getBitBlockType();
144   
145    ExternalUnboundedBuffer ByteStream(iBuilder, StreamSetType(1, i8));
146    SingleBlockBuffer BasisBits(iBuilder, StreamSetType(8, i1));
147   
148    s2pKernel  s2pk(iBuilder, ByteStream, BasisBits);
149    std::unique_ptr<Module> s2pM = s2pk.createKernelModule();
150   
151    pablo_function_passes(function);
152    pablo::PabloKernel  wck(iBuilder, "wc", function, BasisBits, {"lineCount", "wordCount", "charCount"});
153   
154    std::unique_ptr<Module> wcM = wck.createKernelModule();
155   
156    s2pk.addKernelDeclarations(mMod);
157    wck.addKernelDeclarations(mMod);
158
159    Constant * record_counts_routine;
160    Type * const size_ty = iBuilder->getSizeTy();
161    Type * const voidTy = Type::getVoidTy(mMod->getContext());
162    record_counts_routine = mMod->getOrInsertFunction("record_counts", voidTy, size_ty, size_ty, size_ty, size_ty, size_ty, nullptr);
163    Type * const inputType = PointerType::get(ArrayType::get(ArrayType::get(mBitBlockType, 8), 1), 0);
164   
165    Function * const main = cast<Function>(mMod->getOrInsertFunction("Main", voidTy, inputType, size_ty, size_ty, nullptr));
166    main->setCallingConv(CallingConv::C);
167    Function::arg_iterator args = main->arg_begin();
168   
169    Value * const inputStream = &*(args++);
170    inputStream->setName("input");
171    Value * const fileSize = &*(args++);
172    fileSize->setName("fileSize");
173    Value * const fileIdx = &*(args++);
174    fileIdx->setName("fileIdx");
175   
176    iBuilder->SetInsertPoint(BasicBlock::Create(mMod->getContext(), "entry", main,0));
177
178    ByteStream.setStreamSetBuffer(inputStream);
179    BasisBits.allocateBuffer();
180   
181    Value * s2pInstance = s2pk.createInstance({}, {&ByteStream}, {&BasisBits});;
182    Value * wcInstance = wck.createInstance({}, {&BasisBits}, {});
183   
184    generatePipelineLoop(iBuilder, {&s2pk, &wck}, {s2pInstance, wcInstance}, fileSize);
185   
186    Value * lineCount = wck.createGetAccumulatorCall(wcInstance, "lineCount");
187    Value * wordCount = wck.createGetAccumulatorCall(wcInstance, "wordCount");
188    Value * charCount = wck.createGetAccumulatorCall(wcInstance, "charCount");;
189
190    iBuilder->CreateCall(record_counts_routine, std::vector<Value *>({lineCount, wordCount, charCount, fileSize, fileIdx}));
191   
192    iBuilder->CreateRetVoid();
193   
194    Linker L(*mMod);
195    L.linkInModule(std::move(s2pM));
196    L.linkInModule(std::move(wcM));
197   
198    return main;
199}
200
201
202typedef void (*wcFunctionType)(char * byte_data, size_t filesize, size_t fileIdx);
203
204static ExecutionEngine * wcEngine = nullptr;
205
206wcFunctionType wcCodeGen(void) {
207                           
208    Module * M = new Module("wc", getGlobalContext());
209    IDISA::IDISA_Builder * idb = IDISA::GetIDISA_Builder(M);
210
211    Encoding encoding(Encoding::Type::UTF_8, 8);
212    pablo::PabloFunction * function = wc_gen(encoding);
213    llvm::Function * main_IR = wcPipeline(M, idb, function);
214
215    wcEngine = JIT_to_ExecutionEngine(M);
216   
217    wcEngine->finalizeObject();
218
219    delete idb;
220    return reinterpret_cast<wcFunctionType>(wcEngine->getPointerToFunction(main_IR));
221}
222
223void wc(wcFunctionType fn_ptr, const int64_t fileIdx) {
224    std::string fileName = inputFiles[fileIdx];
225    size_t fileSize;
226    char * fileBuffer;
227   
228    const boost::filesystem::path file(fileName);
229    if (exists(file)) {
230        if (is_directory(file)) {
231            return;
232        }
233    } else {
234        std::cerr << "Error: cannot open " << fileName << " for processing. Skipped.\n";
235        return;
236    }
237   
238    fileSize = file_size(file);
239    boost::iostreams::mapped_file_source mappedFile;
240    if (fileSize == 0) {
241        fileBuffer = nullptr;
242    }
243    else {
244        try {
245            mappedFile.open(fileName);
246        } catch (std::exception &e) {
247            std::cerr << "Error: Boost mmap of " << fileName << ": " << e.what() << std::endl;
248            return;
249        }
250        fileBuffer = const_cast<char *>(mappedFile.data());
251    }
252    fn_ptr(fileBuffer, fileSize, fileIdx);
253
254    mappedFile.close();
255   
256}
257
258
259
260int main(int argc, char *argv[]) {
261    cl::HideUnrelatedOptions(ArrayRef<const cl::OptionCategory *>{&wcFlags, pablo::pablo_toolchain_flags(), codegen::codegen_flags()});
262    cl::ParseCommandLineOptions(argc, argv);
263    if (wcOptions.size() == 0) {
264        CountLines = true;
265        CountWords = true;
266        CountBytes = true;
267    }
268    else {
269        CountLines = false;
270        CountWords = false;
271        CountBytes = false;
272        CountChars = false;
273        for (unsigned i = 0; i < wcOptions.size(); i++) {
274            switch (wcOptions[i]) {
275                case WordOption: CountWords = true; break;
276                case LineOption: CountLines = true; break;
277                case CharOption: CountBytes = true; CountChars = false; break;
278                case ByteOption: CountChars = true; CountBytes = false; break;
279            }
280        }
281    }
282   
283   
284    wcFunctionType fn_ptr = wcCodeGen();
285
286    int fileCount = inputFiles.size();
287    lineCount.resize(fileCount);
288    wordCount.resize(fileCount);
289    charCount.resize(fileCount);
290    byteCount.resize(fileCount);
291   
292    for (unsigned i = 0; i < inputFiles.size(); ++i) {
293        wc(fn_ptr, i);
294    }
295   
296    delete wcEngine;
297   
298    size_t maxCount = 0;
299    if (CountLines) maxCount = TotalLines;
300    if (CountWords) maxCount = TotalWords;
301    if (CountChars) maxCount = TotalChars;
302    if (CountBytes) maxCount = TotalBytes;
303   
304    int fieldWidth = std::to_string(maxCount).size() + 1;
305    if (fieldWidth < defaultFieldWidth) fieldWidth = defaultFieldWidth;
306
307    for (unsigned i = 0; i < inputFiles.size(); ++i) {
308        std::cout << std::setw(fieldWidth-1);
309        if (CountLines) {
310            std::cout << lineCount[i] << std::setw(fieldWidth);
311        }
312        if (CountWords) {
313            std::cout << wordCount[i] << std::setw(fieldWidth);
314        }
315        if (CountChars) {
316            std::cout << charCount[i] << std::setw(fieldWidth);
317        }
318        if (CountBytes) {
319            std::cout << byteCount[i];
320        }
321        std::cout << " " << inputFiles[i] << std::endl;
322    }
323    if (inputFiles.size() > 1) {
324        std::cout << std::setw(fieldWidth-1);
325        if (CountLines) {
326            std::cout << TotalLines << std::setw(fieldWidth);
327        }
328        if (CountWords) {
329            std::cout << TotalWords << std::setw(fieldWidth);
330        }
331        if (CountChars) {
332            std::cout << TotalChars << std::setw(fieldWidth);
333        }
334        if (CountBytes) {
335            std::cout << TotalBytes;
336        }
337        std::cout << " total" << std::endl;
338    }
339
340    return 0;
341}
342
343                       
Note: See TracBrowser for help on using the repository browser.