source: icGREP/icgrep-devel/icgrep/wc.cpp @ 5101

Last change on this file since 5101 was 5101, checked in by cameron, 3 years ago

Clean out using boost namespaces

File size: 10.9 KB
Line 
1/*
2 *  Copyright (c) 2015 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include <string>
8#include <iostream>
9#include <iomanip>
10#include <fstream>
11#include <sstream>
12
13
14#include <toolchain.h>
15#include <pablo/pablo_toolchain.h>
16#include <llvm/IR/Function.h>
17#include <llvm/IR/Module.h>
18#include <llvm/ExecutionEngine/ExecutionEngine.h>
19#include <llvm/ExecutionEngine/MCJIT.h>
20#include "llvm/Linker/Linker.h"
21
22#include <llvm/Support/CommandLine.h>
23#include <llvm/Support/raw_ostream.h>
24
25#include <utf_encoding.h>
26#include <re/re_cc.h>
27#include <cc/cc_compiler.h>
28#include <pablo/function.h>
29#include <pablo/pablo_kernel.h>
30#include <IDISA/idisa_builder.h>
31#include <IDISA/idisa_target.h>
32#include <kernels/streamset.h>
33#include <kernels/interface.h>
34#include <kernels/kernel.h>
35#include <kernels/s2p_kernel.h>
36#include <kernels/pipeline.h>
37
38#include <pablo/pablo_compiler.h>
39#include <pablo/pablo_toolchain.h>
40
41
42#include <utf_encoding.h>
43
44// mmap system
45#include <boost/filesystem.hpp>
46#include <boost/iostreams/device/mapped_file.hpp>
47
48#include <fcntl.h>
49static cl::OptionCategory wcFlags("Command Flags", "wc options");
50
51static cl::list<std::string> inputFiles(cl::Positional, cl::desc("<input file ...>"), cl::OneOrMore, cl::cat(wcFlags));
52
53enum CountOptions {
54    LineOption, WordOption, CharOption, ByteOption
55};
56
57static cl::list<CountOptions> wcOptions(
58  cl::values(clEnumValN(LineOption, "l", "Report the number of lines in each input file."),
59             clEnumValN(WordOption, "w", "Report the number of words in each input file."),
60             clEnumValN(CharOption, "m", "Report the number of characters in each input file (override -c)."),
61             clEnumValN(ByteOption, "c", "Report the number of bytes in each input file (override -m)."),
62             clEnumValEnd), cl::cat(wcFlags), cl::Grouping);
63                                                 
64
65
// Minimum column width used when the counts are printed.
static int defaultFieldWidth = 7;

// Which counts are to be computed and reported.  All start out disabled;
// main() switches them on according to the command-line options.
bool CountLines = false;
bool CountWords = false;
bool CountChars = false;
bool CountBytes = false;
73
// Per-file results; element i holds the counts recorded for file index i.
std::vector<uint64_t> lineCount;
std::vector<uint64_t> wordCount;
std::vector<uint64_t> charCount;
std::vector<uint64_t> byteCount;

// Running totals over every file recorded so far.
uint64_t TotalLines = 0;
uint64_t TotalWords = 0;
uint64_t TotalChars = 0;
uint64_t TotalBytes = 0;


//  Callback that records the counts for one file.
//
//  Stores the four counts in slot fileIdx of the per-file vectors and adds
//  them to the running totals.  No bounds check is made: the vectors must
//  already hold more than fileIdx elements.
extern "C" void record_counts(uint64_t lines, uint64_t words, uint64_t chars, uint64_t bytes, uint64_t fileIdx) {
    lineCount[fileIdx] = lines;
    TotalLines += lines;

    wordCount[fileIdx] = words;
    TotalWords += words;

    charCount[fileIdx] = chars;
    TotalChars += chars;

    byteCount[fileIdx] = bytes;
    TotalBytes += bytes;
}
99
100//
101//
102
103pablo::PabloFunction * wc_gen(Encoding encoding) {
104    //  input: 8 basis bit streams
105    //  output: 3 counters
106   
107    pablo::PabloFunction * function = pablo::PabloFunction::Create("wc", 8, 0);
108    cc::CC_Compiler ccc(*function, encoding);
109   
110    pablo::PabloBuilder pBuilder(ccc.getBuilder().getPabloBlock(), ccc.getBuilder());
111    const std::vector<pablo::Var *> u8_bits = ccc.getBasisBits();
112
113    if (CountLines) {
114        pablo::PabloAST * LF = ccc.compileCC(re::makeCC(0x0A));
115        function->setResultCount(pBuilder.createCount("lineCount", LF));
116    }
117    if (CountWords) {
118        pablo::PabloAST * WS = ccc.compileCC(re::makeCC(re::makeCC(0x09, 0x0D), re::makeCC(0x20)));
119       
120        pablo::PabloAST * wordChar = pBuilder.createNot(WS);
121        // WS_follow_or_start = 1 past WS or at start of file
122        pablo::PabloAST * WS_follow_or_start = pBuilder.createNot(pBuilder.createAdvance(wordChar, 1));
123        //
124        pablo::PabloAST * wordStart = pBuilder.createInFile(pBuilder.createAnd(wordChar, WS_follow_or_start));
125        function->setResultCount(pBuilder.createCount("wordCount", wordStart));
126    }
127    if (CountChars) {
128        //
129        // FIXME: This correctly counts characters assuming valid UTF-8 input.  But what if input is
130        // not UTF-8, or is not valid?
131        //
132        pablo::PabloAST * u8Begin = ccc.compileCC(re::makeCC(re::makeCC(0, 0x7F), re::makeCC(0xC2, 0xF4)));
133        function->setResultCount(pBuilder.createCount("charCount", u8Begin));
134    }
135    return function;
136}
137
138using namespace kernel;
139using namespace parabix;
140
141
142Function * wcPipeline(Module * mMod, IDISA::IDISA_Builder * iBuilder, pablo::PabloFunction * function) {
143    Type * mBitBlockType = iBuilder->getBitBlockType();
144   
145    ExternalUnboundedBuffer ByteStream(iBuilder, StreamSetType(1, i8));
146    SingleBlockBuffer BasisBits(iBuilder, StreamSetType(8, i1));
147   
148    s2pKernel  s2pk(iBuilder, ByteStream, BasisBits);
149    std::unique_ptr<Module> s2pM = s2pk.createKernelModule();
150    pablo_function_passes(function);
151    pablo::PabloKernel  wck(iBuilder, "wc", function, {"lineCount", "wordCount", "charCount"});
152    std::unique_ptr<Module> wcM = wck.createKernelModule();
153   
154    s2pk.addKernelDeclarations(mMod);
155    wck.addKernelDeclarations(mMod);
156
157    Constant * record_counts_routine;
158    Type * const int64ty = iBuilder->getInt64Ty();
159    Type * const voidTy = Type::getVoidTy(mMod->getContext());
160    record_counts_routine = mMod->getOrInsertFunction("record_counts", voidTy, int64ty, int64ty, int64ty, int64ty, int64ty, nullptr);
161    Type * const inputType = PointerType::get(ArrayType::get(ArrayType::get(mBitBlockType, 8), 1), 0);
162   
163    Function * const main = cast<Function>(mMod->getOrInsertFunction("Main", voidTy, inputType, int64ty, int64ty, nullptr));
164    main->setCallingConv(CallingConv::C);
165    Function::arg_iterator args = main->arg_begin();
166   
167    Value * const inputStream = &*(args++);
168    inputStream->setName("input");
169    Value * const fileSize = &*(args++);
170    fileSize->setName("fileSize");
171    Value * const fileIdx = &*(args++);
172    fileIdx->setName("fileIdx");
173   
174    iBuilder->SetInsertPoint(BasicBlock::Create(mMod->getContext(), "entry", main,0));
175
176    ByteStream.setStreamSetBuffer(inputStream);
177    BasisBits.allocateBuffer();
178   
179    Value * s2pInstance = s2pk.createInstance({}, {&ByteStream}, {&BasisBits});;
180    Value * wcInstance = wck.createInstance({}, {&BasisBits}, {});
181   
182    generatePipelineLoop(iBuilder, {&s2pk, &wck}, {s2pInstance, wcInstance}, fileSize);
183   
184    Value * lineCount = wck.createGetAccumulatorCall(wcInstance, "lineCount");
185    Value * wordCount = wck.createGetAccumulatorCall(wcInstance, "wordCount");
186    Value * charCount = wck.createGetAccumulatorCall(wcInstance, "charCount");;
187
188    iBuilder->CreateCall(record_counts_routine, std::vector<Value *>({lineCount, wordCount, charCount, fileSize, fileIdx}));
189   
190    iBuilder->CreateRetVoid();
191   
192    Linker L(*mMod);
193    L.linkInModule(std::move(s2pM));
194    L.linkInModule(std::move(wcM));
195   
196    return main;
197}
198
199
200typedef void (*wcFunctionType)(char * byte_data, size_t filesize, size_t fileIdx);
201
202static ExecutionEngine * wcEngine = nullptr;
203
204wcFunctionType wcCodeGen(void) {
205                           
206    Module * M = new Module("wc", getGlobalContext());
207    IDISA::IDISA_Builder * idb = IDISA::GetIDISA_Builder(M);
208
209    Encoding encoding(Encoding::Type::UTF_8, 8);
210    pablo::PabloFunction * function = wc_gen(encoding);
211    llvm::Function * main_IR = wcPipeline(M, idb, function);
212
213    wcEngine = JIT_to_ExecutionEngine(M);
214   
215    wcEngine->finalizeObject();
216
217    delete idb;
218    return reinterpret_cast<wcFunctionType>(wcEngine->getPointerToFunction(main_IR));
219}
220
221void wc(wcFunctionType fn_ptr, const int64_t fileIdx) {
222    std::string fileName = inputFiles[fileIdx];
223    size_t fileSize;
224    char * fileBuffer;
225   
226    const boost::filesystem::path file(fileName);
227    if (exists(file)) {
228        if (is_directory(file)) {
229            return;
230        }
231    } else {
232        std::cerr << "Error: cannot open " << fileName << " for processing. Skipped.\n";
233        return;
234    }
235   
236    fileSize = file_size(file);
237    boost::iostreams::mapped_file_source mappedFile;
238    if (fileSize == 0) {
239        fileBuffer = nullptr;
240    }
241    else {
242        try {
243            mappedFile.open(fileName);
244        } catch (std::exception &e) {
245            std::cerr << "Error: Boost mmap of " << fileName << ": " << e.what() << std::endl;
246            return;
247        }
248        fileBuffer = const_cast<char *>(mappedFile.data());
249    }
250    fn_ptr(fileBuffer, fileSize, fileIdx);
251
252    mappedFile.close();
253   
254}
255
256
257
258int main(int argc, char *argv[]) {
259    cl::HideUnrelatedOptions(ArrayRef<const cl::OptionCategory *>{&wcFlags, pablo::pablo_toolchain_flags(), codegen::codegen_flags()});
260    cl::ParseCommandLineOptions(argc, argv);
261    if (wcOptions.size() == 0) {
262        CountLines = true;
263        CountWords = true;
264        CountBytes = true;
265    }
266    else {
267        CountLines = false;
268        CountWords = false;
269        CountBytes = false;
270        CountChars = false;
271        for (unsigned i = 0; i < wcOptions.size(); i++) {
272            switch (wcOptions[i]) {
273                case WordOption: CountWords = true; break;
274                case LineOption: CountLines = true; break;
275                case CharOption: CountBytes = true; CountChars = false; break;
276                case ByteOption: CountChars = true; CountBytes = false; break;
277            }
278        }
279    }
280   
281   
282    wcFunctionType fn_ptr = wcCodeGen();
283
284    int fileCount = inputFiles.size();
285    lineCount.resize(fileCount);
286    wordCount.resize(fileCount);
287    charCount.resize(fileCount);
288    byteCount.resize(fileCount);
289   
290    for (unsigned i = 0; i < inputFiles.size(); ++i) {
291        wc(fn_ptr, i);
292    }
293   
294    delete wcEngine;
295   
296    size_t maxCount = 0;
297    if (CountLines) maxCount = TotalLines;
298    if (CountWords) maxCount = TotalWords;
299    if (CountChars) maxCount = TotalChars;
300    if (CountBytes) maxCount = TotalBytes;
301   
302    int fieldWidth = std::to_string(maxCount).size() + 1;
303    if (fieldWidth < defaultFieldWidth) fieldWidth = defaultFieldWidth;
304
305    for (unsigned i = 0; i < inputFiles.size(); ++i) {
306        std::cout << std::setw(fieldWidth-1);
307        if (CountLines) {
308            std::cout << lineCount[i] << std::setw(fieldWidth);
309        }
310        if (CountWords) {
311            std::cout << wordCount[i] << std::setw(fieldWidth);
312        }
313        if (CountChars) {
314            std::cout << charCount[i] << std::setw(fieldWidth);
315        }
316        if (CountBytes) {
317            std::cout << byteCount[i];
318        }
319        std::cout << " " << inputFiles[i] << std::endl;
320    }
321    if (inputFiles.size() > 1) {
322        std::cout << std::setw(fieldWidth-1);
323        if (CountLines) {
324            std::cout << TotalLines << std::setw(fieldWidth);
325        }
326        if (CountWords) {
327            std::cout << TotalWords << std::setw(fieldWidth);
328        }
329        if (CountChars) {
330            std::cout << TotalChars << std::setw(fieldWidth);
331        }
332        if (CountBytes) {
333            std::cout << TotalBytes;
334        }
335        std::cout << " total" << std::endl;
336    }
337
338    return 0;
339}
340
341                       
Note: See TracBrowser for help on using the repository browser.