source: icGREP/icgrep-devel/icgrep/wc.cpp @ 5377

Last change on this file since 5377 was 5377, checked in by nmedfort, 2 years ago

Support for stdin. Needs more testing.

File size: 10.6 KB
Line 
1/*
2 *  Copyright (c) 2015 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include <iostream>
8#include <iomanip>
9#include <sstream>
10#include <toolchain.h>
11#include <llvm/IR/Function.h>
12#include <llvm/IR/Module.h>
13#include <llvm/ExecutionEngine/ExecutionEngine.h>
14#include "llvm/Linker/Linker.h"
15#include <llvm/Support/CommandLine.h>
16#include <llvm/Support/raw_ostream.h>
17#include <cc/cc_compiler.h>
18#include <pablo/pablo_kernel.h>
19#include <IR_Gen/idisa_builder.h>
20#include <IR_Gen/idisa_target.h>
21#include <kernels/streamset.h>
22#include <kernels/mmap_kernel.h>
23#include <kernels/s2p_kernel.h>
24#include <kernels/pipeline.h>
25#include <pablo/pablo_compiler.h>
26#include <pablo/pablo_toolchain.h>
27#include <boost/filesystem.hpp>
28#include <boost/iostreams/device/mapped_file.hpp>
29
30
31using namespace llvm;
32
33static cl::OptionCategory wcFlags("Command Flags", "wc options");
34
35static cl::list<std::string> inputFiles(cl::Positional, cl::desc("<input file ...>"), cl::OneOrMore, cl::cat(wcFlags));
36
37enum CountOptions {
38    LineOption, WordOption, CharOption, ByteOption
39};
40
41static cl::list<CountOptions> wcOptions(
42  cl::values(clEnumValN(LineOption, "l", "Report the number of lines in each input file."),
43             clEnumValN(WordOption, "w", "Report the number of words in each input file."),
44             clEnumValN(CharOption, "m", "Report the number of characters in each input file (override -c)."),
45             clEnumValN(ByteOption, "c", "Report the number of bytes in each input file (override -m)."),
46             clEnumValEnd), cl::cat(wcFlags), cl::Grouping);
47                                                 
48
49
// Minimum width of each printed count column.
static int defaultFieldWidth{7};

// For each metric: whether it was requested (set in main()), the per-file
// results indexed by position in inputFiles (written by record_counts()),
// and the running total over all processed files.

bool CountLines{false};
std::vector<uint64_t> lineCount;
uint64_t TotalLines{0};

bool CountWords{false};
std::vector<uint64_t> wordCount;
uint64_t TotalWords{0};

bool CountChars{false};
std::vector<uint64_t> charCount;
uint64_t TotalChars{0};

bool CountBytes{false};
std::vector<uint64_t> byteCount;
uint64_t TotalBytes{0};
67
68using namespace pablo;
69using namespace kernel;
70using namespace parabix;
71
72//  The callback routine that records counts in progress.
73//
74extern "C" {
75    void record_counts(uint64_t lines, uint64_t words, uint64_t chars, uint64_t bytes, uint64_t fileIdx) {
76        lineCount[fileIdx] = lines;
77        wordCount[fileIdx] = words;
78        charCount[fileIdx] = chars;
79        byteCount[fileIdx] = bytes;
80        TotalLines += lines;
81        TotalWords += words;
82        TotalChars += chars;
83        TotalBytes += bytes;
84    }
85}
86
87//
88//
89
90void wc_gen(PabloKernel * kernel) {
91    //  input: 8 basis bit streams
92    const auto u8bitSet = kernel->getInputStreamVar("u8bit");
93    //  output: 3 counters
94   
95    cc::CC_Compiler ccc(kernel, u8bitSet);
96   
97    PabloBuilder & pb = ccc.getBuilder();
98
99    Var * lc = kernel->getOutputScalarVar("lineCount");
100    Var * wc = kernel->getOutputScalarVar("wordCount");
101    Var * cc = kernel->getOutputScalarVar("charCount");
102
103    if (CountLines) {
104        PabloAST * LF = ccc.compileCC(re::makeCC(0x0A));
105        pb.createAssign(lc, pb.createCount(LF));
106    }
107    if (CountWords) {
108        PabloAST * WS = ccc.compileCC(re::makeCC(re::makeCC(0x09, 0x0D), re::makeCC(0x20)));
109        PabloAST * wordChar = pb.createNot(WS);
110        // WS_follow_or_start = 1 past WS or at start of file
111        PabloAST * WS_follow_or_start = pb.createNot(pb.createAdvance(wordChar, 1));
112        PabloAST * wordStart = pb.createInFile(pb.createAnd(wordChar, WS_follow_or_start));
113        pb.createAssign(wc, pb.createCount(wordStart));
114    }
115    if (CountChars) {
116        //
117        // FIXME: This correctly counts characters assuming valid UTF-8 input.  But what if input is
118        // not UTF-8, or is not valid?
119        //
120        PabloAST * u8Begin = ccc.compileCC(re::makeCC(re::makeCC(0, 0x7F), re::makeCC(0xC2, 0xF4)));       
121        pb.createAssign(cc, pb.createCount(u8Begin));
122    }
123}
124
125Function * pipeline(Module * m, IDISA::IDISA_Builder * iBuilder) {
126    Type * mBitBlockType = iBuilder->getBitBlockType();
127    Constant * record_counts_routine;
128    Type * const size_ty = iBuilder->getSizeTy();
129    Type * const voidTy = iBuilder->getVoidTy();
130    record_counts_routine = m->getOrInsertFunction("record_counts", voidTy, size_ty, size_ty, size_ty, size_ty, size_ty, nullptr);
131    Type * const inputType = PointerType::get(ArrayType::get(ArrayType::get(mBitBlockType, 8), 1), 0);
132   
133    Function * const main = cast<Function>(m->getOrInsertFunction("Main", voidTy, inputType, size_ty, size_ty, nullptr));
134    main->setCallingConv(CallingConv::C);
135    Function::arg_iterator args = main->arg_begin();
136   
137    Value * const inputStream = &*(args++);
138    inputStream->setName("input");
139    Value * const fileSize = &*(args++);
140    fileSize->setName("fileSize");
141    Value * const fileIdx = &*(args++);
142    fileIdx->setName("fileIdx");
143   
144    ExternalFileBuffer ByteStream(iBuilder, iBuilder->getStreamSetTy(1, 8));
145
146    SingleBlockBuffer BasisBits(iBuilder, iBuilder->getStreamSetTy(8, 1));
147   
148    MMapSourceKernel mmapK(iBuilder);
149    std::unique_ptr<Module> mmapM = mmapK.createKernelModule({}, {&ByteStream});
150    mmapK.setInitialArguments({fileSize});
151   
152    S2PKernel  s2pk(iBuilder);
153    std::unique_ptr<Module> s2pM = s2pk.createKernelModule({&ByteStream}, {&BasisBits});
154   
155    PabloKernel wck(iBuilder, "wc",
156        {Binding{iBuilder->getStreamSetTy(8, 1), "u8bit"}},
157        {},
158        {},
159        {Binding{iBuilder->getSizeTy(), "lineCount"}, Binding{iBuilder->getSizeTy(), "wordCount"}, Binding{iBuilder->getSizeTy(), "charCount"}});
160
161    wc_gen(&wck);
162    pablo_function_passes(&wck);
163   
164    std::unique_ptr<Module> wcM = wck.createKernelModule({&BasisBits}, {});
165   
166    mmapK.addKernelDeclarations(m);
167    s2pk.addKernelDeclarations(m);
168    wck.addKernelDeclarations(m);
169   
170    iBuilder->SetInsertPoint(BasicBlock::Create(m->getContext(), "entry", main,0));
171
172    ByteStream.setStreamSetBuffer(inputStream);
173    BasisBits.allocateBuffer();
174   
175    generatePipeline(iBuilder, {&mmapK, &s2pk, &wck});
176   
177    Value * lineCount = wck.createGetAccumulatorCall(wck.getInstance(), "lineCount");
178    Value * wordCount = wck.createGetAccumulatorCall(wck.getInstance(), "wordCount");
179    Value * charCount = wck.createGetAccumulatorCall(wck.getInstance(), "charCount");
180
181    iBuilder->CreateCall(record_counts_routine, std::vector<Value *>({lineCount, wordCount, charCount, fileSize, fileIdx}));
182   
183    iBuilder->CreateRetVoid();
184   
185    Linker L(*m);
186    L.linkInModule(std::move(mmapM));
187    L.linkInModule(std::move(s2pM));
188    L.linkInModule(std::move(wcM));
189   
190    return main;
191}
192
193
194typedef void (*wcFunctionType)(char * byte_data, size_t filesize, size_t fileIdx);
195
196static ExecutionEngine * wcEngine = nullptr;
197
198wcFunctionType wcCodeGen(void) {
199    Module * M = new Module("wc", getGlobalContext());
200    IDISA::IDISA_Builder * idb = IDISA::GetIDISA_Builder(M);
201
202    llvm::Function * main_IR = pipeline(M, idb);
203
204    wcEngine = JIT_to_ExecutionEngine(M);
205   
206    wcEngine->finalizeObject();
207
208    delete idb;
209    return reinterpret_cast<wcFunctionType>(wcEngine->getPointerToFunction(main_IR));
210}
211
212void wc(wcFunctionType fn_ptr, const int64_t fileIdx) {
213    std::string fileName = inputFiles[fileIdx];
214    size_t fileSize;
215    char * fileBuffer;
216   
217    const boost::filesystem::path file(fileName);
218    if (exists(file)) {
219        if (is_directory(file)) {
220            return;
221        }
222    } else {
223        std::cerr << "Error: cannot open " << fileName << " for processing. Skipped.\n";
224        return;
225    }
226   
227    fileSize = file_size(file);
228    boost::iostreams::mapped_file_source mappedFile;
229    if (fileSize == 0) {
230        fileBuffer = nullptr;
231    }
232    else {
233        try {
234            mappedFile.open(fileName);
235        } catch (std::exception &e) {
236            std::cerr << "Error: Boost mmap of " << fileName << ": " << e.what() << std::endl;
237            return;
238        }
239        fileBuffer = const_cast<char *>(mappedFile.data());
240    }
241    fn_ptr(fileBuffer, fileSize, fileIdx);
242
243    mappedFile.close();
244   
245}
246
247
248
249int main(int argc, char *argv[]) {
250    AddParabixVersionPrinter();
251    cl::HideUnrelatedOptions(ArrayRef<const cl::OptionCategory *>{&wcFlags, pablo_toolchain_flags(), codegen::codegen_flags()});
252    cl::ParseCommandLineOptions(argc, argv);
253    if (wcOptions.size() == 0) {
254        CountLines = true;
255        CountWords = true;
256        CountBytes = true;
257    }
258    else {
259        CountLines = false;
260        CountWords = false;
261        CountBytes = false;
262        CountChars = false;
263        for (unsigned i = 0; i < wcOptions.size(); i++) {
264            switch (wcOptions[i]) {
265                case WordOption: CountWords = true; break;
266                case LineOption: CountLines = true; break;
267                case CharOption: CountBytes = true; CountChars = false; break;
268                case ByteOption: CountChars = true; CountBytes = false; break;
269            }
270        }
271    }
272   
273   
274    wcFunctionType fn_ptr = wcCodeGen();
275
276    int fileCount = inputFiles.size();
277    lineCount.resize(fileCount);
278    wordCount.resize(fileCount);
279    charCount.resize(fileCount);
280    byteCount.resize(fileCount);
281   
282    for (unsigned i = 0; i < inputFiles.size(); ++i) {
283        wc(fn_ptr, i);
284    }
285   
286    size_t maxCount = 0;
287    if (CountLines) maxCount = TotalLines;
288    if (CountWords) maxCount = TotalWords;
289    if (CountChars) maxCount = TotalChars;
290    if (CountBytes) maxCount = TotalBytes;
291   
292    int fieldWidth = std::to_string(maxCount).size() + 1;
293    if (fieldWidth < defaultFieldWidth) fieldWidth = defaultFieldWidth;
294
295    for (unsigned i = 0; i < inputFiles.size(); ++i) {
296        std::cout << std::setw(fieldWidth-1);
297        if (CountLines) {
298            std::cout << lineCount[i] << std::setw(fieldWidth);
299        }
300        if (CountWords) {
301            std::cout << wordCount[i] << std::setw(fieldWidth);
302        }
303        if (CountChars) {
304            std::cout << charCount[i] << std::setw(fieldWidth);
305        }
306        if (CountBytes) {
307            std::cout << byteCount[i];
308        }
309        std::cout << " " << inputFiles[i] << std::endl;
310    }
311    if (inputFiles.size() > 1) {
312        std::cout << std::setw(fieldWidth-1);
313        if (CountLines) {
314            std::cout << TotalLines << std::setw(fieldWidth);
315        }
316        if (CountWords) {
317            std::cout << TotalWords << std::setw(fieldWidth);
318        }
319        if (CountChars) {
320            std::cout << TotalChars << std::setw(fieldWidth);
321        }
322        if (CountBytes) {
323            std::cout << TotalBytes;
324        }
325        std::cout << " total" << std::endl;
326    }
327
328    return 0;
329}
Note: See TracBrowser for help on using the repository browser.