source: icGREP/icgrep-devel/icgrep/wc.cpp @ 5100

Last change on this file since 5100 was 5100, checked in by cameron, 3 years ago

Buffer class hierarchy; s2p kernel demonstrates specialization for different buffer strategies

File size: 10.9 KB
Line 
1/*
2 *  Copyright (c) 2015 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include <string>
8#include <iostream>
9#include <iomanip>
10#include <fstream>
11#include <sstream>
12
13
14#include <toolchain.h>
15#include <pablo/pablo_toolchain.h>
16#include <llvm/IR/Function.h>
17#include <llvm/IR/Module.h>
18#include <llvm/ExecutionEngine/ExecutionEngine.h>
19#include <llvm/ExecutionEngine/MCJIT.h>
20#include "llvm/Linker/Linker.h"
21
22#include <llvm/Support/CommandLine.h>
23#include <llvm/Support/raw_ostream.h>
24
25#include <utf_encoding.h>
26#include <re/re_cc.h>
27#include <cc/cc_compiler.h>
28#include <pablo/function.h>
29#include <pablo/pablo_kernel.h>
30#include <IDISA/idisa_builder.h>
31#include <IDISA/idisa_target.h>
32#include <kernels/streamset.h>
33#include <kernels/interface.h>
34#include <kernels/kernel.h>
35#include <kernels/s2p_kernel.h>
36#include <kernels/pipeline.h>
37
38#include <pablo/pablo_compiler.h>
39#include <pablo/pablo_toolchain.h>
40
41
42#include <utf_encoding.h>
43
44// mmap system
45#include <boost/filesystem.hpp>
46#include <boost/iostreams/device/mapped_file.hpp>
47using namespace boost::iostreams;
48using namespace boost::filesystem;
49
50#include <fcntl.h>
51static cl::OptionCategory wcFlags("Command Flags", "wc options");
52
53static cl::list<std::string> inputFiles(cl::Positional, cl::desc("<input file ...>"), cl::OneOrMore, cl::cat(wcFlags));
54
55enum CountOptions {
56    LineOption, WordOption, CharOption, ByteOption
57};
58
59static cl::list<CountOptions> wcOptions(
60  cl::values(clEnumValN(LineOption, "l", "Report the number of lines in each input file."),
61             clEnumValN(WordOption, "w", "Report the number of words in each input file."),
62             clEnumValN(CharOption, "m", "Report the number of characters in each input file (override -c)."),
63             clEnumValN(ByteOption, "c", "Report the number of bytes in each input file (override -m)."),
64             clEnumValEnd), cl::cat(wcFlags), cl::Grouping);
65                                                 
66
67
// Minimum width of a printed count column.
static int defaultFieldWidth = 7;

// Which counts are to be computed and reported; assigned in main().
bool CountLines = false;
bool CountWords = false;
bool CountChars = false;
bool CountBytes = false;

// Per-file results, indexed by the file's position in inputFiles.
// main() sizes these; record_counts() fills them in.
std::vector<uint64_t> lineCount;
std::vector<uint64_t> wordCount;
std::vector<uint64_t> charCount;
std::vector<uint64_t> byteCount;

// Sums of the per-file results, accumulated by record_counts().
uint64_t TotalLines = 0;
uint64_t TotalWords = 0;
uint64_t TotalChars = 0;
uint64_t TotalBytes = 0;
85
86
87//  The callback routine that records counts in progress.
88//
89extern "C" {
90    void record_counts(uint64_t lines, uint64_t words, uint64_t chars, uint64_t bytes, uint64_t fileIdx) {
91        lineCount[fileIdx] = lines;
92        wordCount[fileIdx] = words;
93        charCount[fileIdx] = chars;
94        byteCount[fileIdx] = bytes;
95        TotalLines += lines;
96        TotalWords += words;
97        TotalChars += chars;
98        TotalBytes += bytes;
99    }
100}
101
102//
103//
104
105pablo::PabloFunction * wc_gen(Encoding encoding) {
106    //  input: 8 basis bit streams
107    //  output: 3 counters
108   
109    pablo::PabloFunction * function = pablo::PabloFunction::Create("wc", 8, 0);
110    cc::CC_Compiler ccc(*function, encoding);
111   
112    pablo::PabloBuilder pBuilder(ccc.getBuilder().getPabloBlock(), ccc.getBuilder());
113    const std::vector<pablo::Var *> u8_bits = ccc.getBasisBits();
114
115    if (CountLines) {
116        pablo::PabloAST * LF = ccc.compileCC(re::makeCC(0x0A));
117        function->setResultCount(pBuilder.createCount("lineCount", LF));
118    }
119    if (CountWords) {
120        pablo::PabloAST * WS = ccc.compileCC(re::makeCC(re::makeCC(0x09, 0x0D), re::makeCC(0x20)));
121       
122        pablo::PabloAST * wordChar = pBuilder.createNot(WS);
123        // WS_follow_or_start = 1 past WS or at start of file
124        pablo::PabloAST * WS_follow_or_start = pBuilder.createNot(pBuilder.createAdvance(wordChar, 1));
125        //
126        pablo::PabloAST * wordStart = pBuilder.createInFile(pBuilder.createAnd(wordChar, WS_follow_or_start));
127        function->setResultCount(pBuilder.createCount("wordCount", wordStart));
128    }
129    if (CountChars) {
130        //
131        // FIXME: This correctly counts characters assuming valid UTF-8 input.  But what if input is
132        // not UTF-8, or is not valid?
133        //
134        pablo::PabloAST * u8Begin = ccc.compileCC(re::makeCC(re::makeCC(0, 0x7F), re::makeCC(0xC2, 0xF4)));
135        function->setResultCount(pBuilder.createCount("charCount", u8Begin));
136    }
137    return function;
138}
139
140using namespace kernel;
141using namespace parabix;
142
143
144Function * wcPipeline(Module * mMod, IDISA::IDISA_Builder * iBuilder, pablo::PabloFunction * function) {
145    Type * mBitBlockType = iBuilder->getBitBlockType();
146   
147    ExternalUnboundedBuffer ByteStream(iBuilder, StreamSetType(1, i8));
148    SingleBlockBuffer BasisBits(iBuilder, StreamSetType(8, i1));
149   
150    s2pKernel  s2pk(iBuilder, ByteStream, BasisBits);
151    std::unique_ptr<Module> s2pM = s2pk.createKernelModule();
152    pablo_function_passes(function);
153    pablo::PabloKernel  wck(iBuilder, "wc", function, {"lineCount", "wordCount", "charCount"});
154    std::unique_ptr<Module> wcM = wck.createKernelModule();
155   
156    s2pk.addKernelDeclarations(mMod);
157    wck.addKernelDeclarations(mMod);
158
159    Constant * record_counts_routine;
160    Type * const int64ty = iBuilder->getInt64Ty();
161    Type * const voidTy = Type::getVoidTy(mMod->getContext());
162    record_counts_routine = mMod->getOrInsertFunction("record_counts", voidTy, int64ty, int64ty, int64ty, int64ty, int64ty, nullptr);
163    Type * const inputType = PointerType::get(ArrayType::get(ArrayType::get(mBitBlockType, 8), 1), 0);
164   
165    Function * const main = cast<Function>(mMod->getOrInsertFunction("Main", voidTy, inputType, int64ty, int64ty, nullptr));
166    main->setCallingConv(CallingConv::C);
167    Function::arg_iterator args = main->arg_begin();
168   
169    Value * const inputStream = &*(args++);
170    inputStream->setName("input");
171    Value * const fileSize = &*(args++);
172    fileSize->setName("fileSize");
173    Value * const fileIdx = &*(args++);
174    fileIdx->setName("fileIdx");
175   
176    iBuilder->SetInsertPoint(BasicBlock::Create(mMod->getContext(), "entry", main,0));
177
178    ByteStream.setStreamSetBuffer(inputStream);
179    BasisBits.allocateBuffer();
180   
181    Value * s2pInstance = s2pk.createInstance({}, {&ByteStream}, {&BasisBits});;
182    Value * wcInstance = wck.createInstance({}, {&BasisBits}, {});
183   
184    generatePipelineLoop(iBuilder, {&s2pk, &wck}, {s2pInstance, wcInstance}, fileSize);
185   
186    Value * lineCount = wck.createGetAccumulatorCall(wcInstance, "lineCount");
187    Value * wordCount = wck.createGetAccumulatorCall(wcInstance, "wordCount");
188    Value * charCount = wck.createGetAccumulatorCall(wcInstance, "charCount");;
189
190    iBuilder->CreateCall(record_counts_routine, std::vector<Value *>({lineCount, wordCount, charCount, fileSize, fileIdx}));
191   
192    iBuilder->CreateRetVoid();
193   
194    Linker L(*mMod);
195    L.linkInModule(std::move(s2pM));
196    L.linkInModule(std::move(wcM));
197   
198    return main;
199}
200
201
202typedef void (*wcFunctionType)(char * byte_data, size_t filesize, size_t fileIdx);
203
204static ExecutionEngine * wcEngine = nullptr;
205
206wcFunctionType wcCodeGen(void) {
207                           
208    Module * M = new Module("wc", getGlobalContext());
209    IDISA::IDISA_Builder * idb = IDISA::GetIDISA_Builder(M);
210
211    Encoding encoding(Encoding::Type::UTF_8, 8);
212    pablo::PabloFunction * function = wc_gen(encoding);
213    llvm::Function * main_IR = wcPipeline(M, idb, function);
214
215    wcEngine = JIT_to_ExecutionEngine(M);
216   
217    wcEngine->finalizeObject();
218
219    delete idb;
220    return reinterpret_cast<wcFunctionType>(wcEngine->getPointerToFunction(main_IR));
221}
222
223void wc(wcFunctionType fn_ptr, const int64_t fileIdx) {
224    std::string fileName = inputFiles[fileIdx];
225    size_t fileSize;
226    char * fileBuffer;
227   
228    const path file(fileName);
229    if (exists(file)) {
230        if (is_directory(file)) {
231            return;
232        }
233    } else {
234        std::cerr << "Error: cannot open " << fileName << " for processing. Skipped.\n";
235        return;
236    }
237   
238    fileSize = file_size(file);
239    mapped_file_source mappedFile;
240    if (fileSize == 0) {
241        fileBuffer = nullptr;
242    }
243    else {
244        try {
245            mappedFile.open(fileName);
246        } catch (std::exception &e) {
247            std::cerr << "Error: Boost mmap of " << fileName << ": " << e.what() << std::endl;
248            return;
249        }
250        fileBuffer = const_cast<char *>(mappedFile.data());
251    }
252    fn_ptr(fileBuffer, fileSize, fileIdx);
253
254    mappedFile.close();
255   
256}
257
258
259
260int main(int argc, char *argv[]) {
261    cl::HideUnrelatedOptions(ArrayRef<const cl::OptionCategory *>{&wcFlags, pablo::pablo_toolchain_flags(), codegen::codegen_flags()});
262    cl::ParseCommandLineOptions(argc, argv);
263    if (wcOptions.size() == 0) {
264        CountLines = true;
265        CountWords = true;
266        CountBytes = true;
267    }
268    else {
269        CountLines = false;
270        CountWords = false;
271        CountBytes = false;
272        CountChars = false;
273        for (unsigned i = 0; i < wcOptions.size(); i++) {
274            switch (wcOptions[i]) {
275                case WordOption: CountWords = true; break;
276                case LineOption: CountLines = true; break;
277                case CharOption: CountBytes = true; CountChars = false; break;
278                case ByteOption: CountChars = true; CountBytes = false; break;
279            }
280        }
281    }
282   
283   
284    wcFunctionType fn_ptr = wcCodeGen();
285
286    int fileCount = inputFiles.size();
287    lineCount.resize(fileCount);
288    wordCount.resize(fileCount);
289    charCount.resize(fileCount);
290    byteCount.resize(fileCount);
291   
292    for (unsigned i = 0; i < inputFiles.size(); ++i) {
293        wc(fn_ptr, i);
294    }
295   
296    delete wcEngine;
297   
298    size_t maxCount = 0;
299    if (CountLines) maxCount = TotalLines;
300    if (CountWords) maxCount = TotalWords;
301    if (CountChars) maxCount = TotalChars;
302    if (CountBytes) maxCount = TotalBytes;
303   
304    int fieldWidth = std::to_string(maxCount).size() + 1;
305    if (fieldWidth < defaultFieldWidth) fieldWidth = defaultFieldWidth;
306
307    for (unsigned i = 0; i < inputFiles.size(); ++i) {
308        std::cout << std::setw(fieldWidth-1);
309        if (CountLines) {
310            std::cout << lineCount[i] << std::setw(fieldWidth);
311        }
312        if (CountWords) {
313            std::cout << wordCount[i] << std::setw(fieldWidth);
314        }
315        if (CountChars) {
316            std::cout << charCount[i] << std::setw(fieldWidth);
317        }
318        if (CountBytes) {
319            std::cout << byteCount[i];
320        }
321        std::cout << " " << inputFiles[i] << std::endl;
322    }
323    if (inputFiles.size() > 1) {
324        std::cout << std::setw(fieldWidth-1);
325        if (CountLines) {
326            std::cout << TotalLines << std::setw(fieldWidth);
327        }
328        if (CountWords) {
329            std::cout << TotalWords << std::setw(fieldWidth);
330        }
331        if (CountChars) {
332            std::cout << TotalChars << std::setw(fieldWidth);
333        }
334        if (CountBytes) {
335            std::cout << TotalBytes;
336        }
337        std::cout << " total" << std::endl;
338    }
339
340    return 0;
341}
342
343                       
Note: See TracBrowser for help on using the repository browser.