source: icGREP/icgrep-devel/icgrep/wc.cpp @ 5088

Last change on this file since 5088 was 5088, checked in by cameron, 3 years ago

wc using doSegment; pipeline generation

File size: 10.9 KB
Line 
1/*
2 *  Copyright (c) 2015 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include <string>
8#include <iostream>
9#include <iomanip>
10#include <fstream>
11#include <sstream>
12
13
14#include <toolchain.h>
15#include <pablo/pablo_toolchain.h>
16#include <llvm/IR/Function.h>
17#include <llvm/IR/Module.h>
18#include <llvm/ExecutionEngine/ExecutionEngine.h>
19#include <llvm/ExecutionEngine/MCJIT.h>
20#include "llvm/Linker/Linker.h"
21
22#include <llvm/Support/CommandLine.h>
23#include <llvm/Support/raw_ostream.h>
24
25#include <utf_encoding.h>
26#include <re/re_cc.h>
27#include <cc/cc_compiler.h>
28#include <pablo/function.h>
29#include <pablo/pablo_kernel.h>
30#include <IDISA/idisa_builder.h>
31#include <IDISA/idisa_target.h>
32#include <kernels/interface.h>
33#include <kernels/kernel.h>
34#include <kernels/s2p_kernel.h>
35#include <kernels/pipeline.h>
36
37#include <pablo/pablo_compiler.h>
38#include <pablo/pablo_toolchain.h>
39
40
41#include <utf_encoding.h>
42
43// mmap system
44#include <boost/filesystem.hpp>
45#include <boost/iostreams/device/mapped_file.hpp>
46using namespace boost::iostreams;
47using namespace boost::filesystem;
48
49#include <fcntl.h>
50static cl::OptionCategory wcFlags("Command Flags", "wc options");
51
52static cl::list<std::string> inputFiles(cl::Positional, cl::desc("<input file ...>"), cl::OneOrMore, cl::cat(wcFlags));
53
54enum CountOptions {
55    LineOption, WordOption, CharOption, ByteOption
56};
57
58static cl::list<CountOptions> wcOptions(
59  cl::values(clEnumValN(LineOption, "l", "Report the number of lines in each input file."),
60             clEnumValN(WordOption, "w", "Report the number of words in each input file."),
61             clEnumValN(CharOption, "m", "Report the number of characters in each input file (override -c)."),
62             clEnumValN(ByteOption, "c", "Report the number of bytes in each input file (override -m)."),
63             clEnumValEnd), cl::cat(wcFlags), cl::Grouping);
64                                                 
65
66
// Minimum width of each numeric column in the report.
static int defaultFieldWidth{7};

// Which counts were requested; set in main() from the command line flags.
bool CountLines{false};
bool CountWords{false};
bool CountChars{false};
bool CountBytes{false};

// Per-file results, indexed by the position of the file in inputFiles.
std::vector<uint64_t> lineCount{};
std::vector<uint64_t> wordCount{};
std::vector<uint64_t> charCount{};
std::vector<uint64_t> byteCount{};

// Running totals over all files processed so far.
uint64_t TotalLines{0};
uint64_t TotalWords{0};
uint64_t TotalChars{0};
uint64_t TotalBytes{0};
84
85
86//  The callback routine that records counts in progress.
87//
88extern "C" {
89    void record_counts(uint64_t lines, uint64_t words, uint64_t chars, uint64_t bytes, uint64_t fileIdx) {
90        lineCount[fileIdx] = lines;
91        wordCount[fileIdx] = words;
92        charCount[fileIdx] = chars;
93        byteCount[fileIdx] = bytes;
94        TotalLines += lines;
95        TotalWords += words;
96        TotalChars += chars;
97        TotalBytes += bytes;
98    }
99}
100
101//
102//
103
104pablo::PabloFunction * wc_gen(Encoding encoding) {
105    //  input: 8 basis bit streams
106    //  output: 3 counters
107   
108    pablo::PabloFunction * function = pablo::PabloFunction::Create("wc", 8, 0);
109    cc::CC_Compiler ccc(*function, encoding);
110   
111    pablo::PabloBuilder pBuilder(ccc.getBuilder().getPabloBlock(), ccc.getBuilder());
112    const std::vector<pablo::Var *> u8_bits = ccc.getBasisBits();
113
114    if (CountLines) {
115        pablo::PabloAST * LF = ccc.compileCC(re::makeCC(0x0A));
116        function->setResultCount(pBuilder.createCount("lineCount", LF));
117    }
118    if (CountWords) {
119        pablo::PabloAST * WS = ccc.compileCC(re::makeCC(re::makeCC(0x09, 0x0D), re::makeCC(0x20)));
120       
121        pablo::PabloAST * wordChar = pBuilder.createNot(WS);
122        // WS_follow_or_start = 1 past WS or at start of file
123        pablo::PabloAST * WS_follow_or_start = pBuilder.createNot(pBuilder.createAdvance(wordChar, 1));
124        //
125        pablo::PabloAST * wordStart = pBuilder.createInFile(pBuilder.createAnd(wordChar, WS_follow_or_start));
126        function->setResultCount(pBuilder.createCount("wordCount", wordStart));
127    }
128    if (CountChars) {
129        //
130        // FIXME: This correctly counts characters assuming valid UTF-8 input.  But what if input is
131        // not UTF-8, or is not valid?
132        //
133        pablo::PabloAST * u8Begin = ccc.compileCC(re::makeCC(re::makeCC(0, 0x7F), re::makeCC(0xC2, 0xF4)));
134        function->setResultCount(pBuilder.createCount("charCount", u8Begin));
135    }
136    return function;
137}
138
139using namespace kernel;
140
141
142Function * wcPipeline(Module * mMod, IDISA::IDISA_Builder * iBuilder, pablo::PabloFunction * function) {
143    Type * mBitBlockType = iBuilder->getBitBlockType();
144   
145    s2pKernel  s2pk(iBuilder);
146    std::unique_ptr<Module> s2pM = s2pk.createKernelModule();
147    pablo_function_passes(function);
148    pablo::PabloKernel  wck(iBuilder, "wc", function, {"lineCount", "wordCount", "charCount"});
149    std::unique_ptr<Module> wcM = wck.createKernelModule();
150   
151    s2pk.addKernelDeclarations(mMod);
152    wck.addKernelDeclarations(mMod);
153
154    Constant * record_counts_routine;
155    Type * const int64ty = iBuilder->getInt64Ty();
156    Type * const voidTy = Type::getVoidTy(mMod->getContext());
157    record_counts_routine = mMod->getOrInsertFunction("record_counts", voidTy, int64ty, int64ty, int64ty, int64ty, int64ty, nullptr);
158    Type * const inputType = PointerType::get(ArrayType::get(ArrayType::get(mBitBlockType, 8), 1), 0);
159   
160    Function * const main = cast<Function>(mMod->getOrInsertFunction("Main", voidTy, inputType, int64ty, int64ty, nullptr));
161    main->setCallingConv(CallingConv::C);
162    Function::arg_iterator args = main->arg_begin();
163   
164    Value * const inputStream = &*(args++);
165    inputStream->setName("input");
166    Value * const fileSize = &*(args++);
167    fileSize->setName("fileSize");
168    Value * const fileIdx = &*(args++);
169    fileIdx->setName("fileIdx");
170   
171    iBuilder->SetInsertPoint(BasicBlock::Create(mMod->getContext(), "entry", main,0));
172    kernel::StreamSetBuffer ByteStream(iBuilder, kernel::StreamSetType(1, 8), 0);
173    kernel::StreamSetBuffer BasisBits(iBuilder, kernel::StreamSetType(8, 1), codegen::SegmentSize);
174
175    ByteStream.setStreamSetBuffer(inputStream);
176    BasisBits.allocateBuffer();
177   
178    Value * s2pInstance = s2pk.createInstance({}, {&ByteStream}, {&BasisBits});;
179    Value * wcInstance = wck.createInstance({}, {&BasisBits}, {});
180   
181    generatePipelineLoop(iBuilder, {&s2pk, &wck}, {s2pInstance, wcInstance}, fileSize);
182   
183    Value * lineCount = wck.createGetAccumulatorCall(wcInstance, "lineCount");
184    Value * wordCount = wck.createGetAccumulatorCall(wcInstance, "wordCount");
185    Value * charCount = wck.createGetAccumulatorCall(wcInstance, "charCount");;
186
187    iBuilder->CreateCall(record_counts_routine, std::vector<Value *>({lineCount, wordCount, charCount, fileSize, fileIdx}));
188   
189    iBuilder->CreateRetVoid();
190   
191    Linker L(*mMod);
192    L.linkInModule(std::move(s2pM));
193    L.linkInModule(std::move(wcM));
194   
195    return main;
196}
197
198
199typedef void (*wcFunctionType)(char * byte_data, size_t filesize, size_t fileIdx);
200
201static ExecutionEngine * wcEngine = nullptr;
202
203wcFunctionType wcCodeGen(void) {
204                           
205    Module * M = new Module("wc", getGlobalContext());
206    IDISA::IDISA_Builder * idb = IDISA::GetIDISA_Builder(M);
207
208    Encoding encoding(Encoding::Type::UTF_8, 8);
209    pablo::PabloFunction * function = wc_gen(encoding);
210    llvm::Function * main_IR = wcPipeline(M, idb, function);
211
212    wcEngine = JIT_to_ExecutionEngine(M);
213   
214    wcEngine->finalizeObject();
215
216    delete idb;
217    return reinterpret_cast<wcFunctionType>(wcEngine->getPointerToFunction(main_IR));
218}
219
220void wc(wcFunctionType fn_ptr, const int64_t fileIdx) {
221    std::string fileName = inputFiles[fileIdx];
222    size_t fileSize;
223    char * fileBuffer;
224   
225    const path file(fileName);
226    if (exists(file)) {
227        if (is_directory(file)) {
228            return;
229        }
230    } else {
231        std::cerr << "Error: cannot open " << fileName << " for processing. Skipped.\n";
232        return;
233    }
234   
235    fileSize = file_size(file);
236    mapped_file_source mappedFile;
237    if (fileSize == 0) {
238        fileBuffer = nullptr;
239    }
240    else {
241        try {
242            mappedFile.open(fileName);
243        } catch (std::exception &e) {
244            std::cerr << "Error: Boost mmap of " << fileName << ": " << e.what() << std::endl;
245            return;
246        }
247        fileBuffer = const_cast<char *>(mappedFile.data());
248    }
249    fn_ptr(fileBuffer, fileSize, fileIdx);
250
251    mappedFile.close();
252   
253}
254
255
256
257int main(int argc, char *argv[]) {
258    cl::HideUnrelatedOptions(ArrayRef<const cl::OptionCategory *>{&wcFlags, pablo::pablo_toolchain_flags(), codegen::codegen_flags()});
259    cl::ParseCommandLineOptions(argc, argv);
260    if (wcOptions.size() == 0) {
261        CountLines = true;
262        CountWords = true;
263        CountBytes = true;
264    }
265    else {
266        CountLines = false;
267        CountWords = false;
268        CountBytes = false;
269        CountChars = false;
270        for (unsigned i = 0; i < wcOptions.size(); i++) {
271            switch (wcOptions[i]) {
272                case WordOption: CountWords = true; break;
273                case LineOption: CountLines = true; break;
274                case CharOption: CountBytes = true; CountChars = false; break;
275                case ByteOption: CountChars = true; CountBytes = false; break;
276            }
277        }
278    }
279   
280   
281    wcFunctionType fn_ptr = wcCodeGen();
282
283    int fileCount = inputFiles.size();
284    lineCount.resize(fileCount);
285    wordCount.resize(fileCount);
286    charCount.resize(fileCount);
287    byteCount.resize(fileCount);
288   
289    for (unsigned i = 0; i < inputFiles.size(); ++i) {
290        wc(fn_ptr, i);
291    }
292   
293    delete wcEngine;
294   
295    size_t maxCount = 0;
296    if (CountLines) maxCount = TotalLines;
297    if (CountWords) maxCount = TotalWords;
298    if (CountChars) maxCount = TotalChars;
299    if (CountBytes) maxCount = TotalBytes;
300   
301    int fieldWidth = std::to_string(maxCount).size() + 1;
302    if (fieldWidth < defaultFieldWidth) fieldWidth = defaultFieldWidth;
303
304    for (unsigned i = 0; i < inputFiles.size(); ++i) {
305        std::cout << std::setw(fieldWidth-1);
306        if (CountLines) {
307            std::cout << lineCount[i] << std::setw(fieldWidth);
308        }
309        if (CountWords) {
310            std::cout << wordCount[i] << std::setw(fieldWidth);
311        }
312        if (CountChars) {
313            std::cout << charCount[i] << std::setw(fieldWidth);
314        }
315        if (CountBytes) {
316            std::cout << byteCount[i];
317        }
318        std::cout << " " << inputFiles[i] << std::endl;
319    }
320    if (inputFiles.size() > 1) {
321        std::cout << std::setw(fieldWidth-1);
322        if (CountLines) {
323            std::cout << TotalLines << std::setw(fieldWidth);
324        }
325        if (CountWords) {
326            std::cout << TotalWords << std::setw(fieldWidth);
327        }
328        if (CountChars) {
329            std::cout << TotalChars << std::setw(fieldWidth);
330        }
331        if (CountBytes) {
332            std::cout << TotalBytes;
333        }
334        std::cout << " total" << std::endl;
335    }
336
337    return 0;
338}
339
340                       
Note: See TracBrowser for help on using the repository browser.