source: icGREP/icgrep-devel/icgrep/wc.cpp @ 5417

Last change on this file since 5417 was 5414, checked in by cameron, 2 years ago

Parabix driver can take ownership of kernelbuilder instances; uniquify mmap kernel name

File size: 10.6 KB
Line 
1/*
2 *  Copyright (c) 2015 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include <iostream>
8#include <iomanip>
9#include <sstream>
10#include <kernels/toolchain.h>
11#include <llvm/IR/Function.h>
12#include <llvm/IR/Module.h>
13#include <llvm/ExecutionEngine/ExecutionEngine.h>
14#include "llvm/Linker/Linker.h"
15#include <llvm/Support/CommandLine.h>
16#include <llvm/Support/raw_ostream.h>
17#include <cc/cc_compiler.h>
18#include <pablo/pablo_kernel.h>
19#include <IR_Gen/idisa_builder.h>
20#include <IR_Gen/idisa_target.h>
21#include <kernels/streamset.h>
22#include <kernels/mmap_kernel.h>
23#include <kernels/s2p_kernel.h>
24#include <kernels/pipeline.h>
25#include <pablo/pablo_compiler.h>
26#include <pablo/pablo_toolchain.h>
27#include <boost/filesystem.hpp>
28#include <boost/iostreams/device/mapped_file.hpp>
29
30
31using namespace llvm;
32
33static cl::OptionCategory wcFlags("Command Flags", "wc options");
34
35static cl::list<std::string> inputFiles(cl::Positional, cl::desc("<input file ...>"), cl::OneOrMore, cl::cat(wcFlags));
36
37enum CountOptions {
38    LineOption, WordOption, CharOption, ByteOption
39};
40
41static cl::list<CountOptions> wcOptions(
42  cl::values(clEnumValN(LineOption, "l", "Report the number of lines in each input file."),
43             clEnumValN(WordOption, "w", "Report the number of words in each input file."),
44             clEnumValN(CharOption, "m", "Report the number of characters in each input file (override -c)."),
45             clEnumValN(ByteOption, "c", "Report the number of bytes in each input file (override -m)."),
46             clEnumValEnd), cl::cat(wcFlags), cl::Grouping);
47                                                 
48
49
// Minimum width of a printed count column.
static int defaultFieldWidth{7};

// Global reporting state, grouped per metric:
//   Count*  - whether the metric was requested (decided in main()),
//   *Count  - per-file results, indexed by input file position,
//   Total*  - running sum over all processed files.

// Lines
bool CountLines{false};
std::vector<uint64_t> lineCount{};
uint64_t TotalLines{0};

// Words
bool CountWords{false};
std::vector<uint64_t> wordCount{};
uint64_t TotalWords{0};

// Characters
bool CountChars{false};
std::vector<uint64_t> charCount{};
uint64_t TotalChars{0};

// Bytes
bool CountBytes{false};
std::vector<uint64_t> byteCount{};
uint64_t TotalBytes{0};
67
68using namespace pablo;
69using namespace kernel;
70using namespace parabix;
71
72//  The callback routine that records counts in progress.
73//
74extern "C" {
75    void record_counts(uint64_t lines, uint64_t words, uint64_t chars, uint64_t bytes, uint64_t fileIdx) {
76        lineCount[fileIdx] = lines;
77        wordCount[fileIdx] = words;
78        charCount[fileIdx] = chars;
79        byteCount[fileIdx] = bytes;
80        TotalLines += lines;
81        TotalWords += words;
82        TotalChars += chars;
83        TotalBytes += bytes;
84    }
85}
86
87//
88//
89
90std::unique_ptr<PabloKernel> wc_gen(IDISA::IDISA_Builder * iBuilder) {
91   
92    auto kernel = std::unique_ptr<PabloKernel>(new PabloKernel(iBuilder, "Parabix:wc",
93                    {Binding{iBuilder->getStreamSetTy(8, 1), "u8bit"}},
94                    {},
95                    {},
96                    {Binding{iBuilder->getSizeTy(), "lineCount"}, Binding{iBuilder->getSizeTy(), "wordCount"}, Binding{iBuilder->getSizeTy(), "charCount"}}));
97   
98    //  input: 8 basis bit streams
99    const auto u8bitSet = kernel->getInputStreamVar("u8bit");
100    //  output: 3 counters
101   
102    cc::CC_Compiler ccc(kernel.get(), u8bitSet);
103   
104    PabloBuilder & pb = ccc.getBuilder();
105
106    Var * lc = kernel->getOutputScalarVar("lineCount");
107    Var * wc = kernel->getOutputScalarVar("wordCount");
108    Var * cc = kernel->getOutputScalarVar("charCount");
109
110    if (CountLines) {
111        PabloAST * LF = ccc.compileCC(re::makeCC(0x0A));
112        pb.createAssign(lc, pb.createCount(LF));
113    }
114    if (CountWords) {
115        PabloAST * WS = ccc.compileCC(re::makeCC(re::makeCC(0x09, 0x0D), re::makeCC(0x20)));
116        PabloAST * wordChar = pb.createNot(WS);
117        // WS_follow_or_start = 1 past WS or at start of file
118        PabloAST * WS_follow_or_start = pb.createNot(pb.createAdvance(wordChar, 1));
119        PabloAST * wordStart = pb.createInFile(pb.createAnd(wordChar, WS_follow_or_start));
120        pb.createAssign(wc, pb.createCount(wordStart));
121    }
122    if (CountChars) {
123        //
124        // FIXME: This correctly counts characters assuming valid UTF-8 input.  But what if input is
125        // not UTF-8, or is not valid?
126        //
127        PabloAST * u8Begin = ccc.compileCC(re::makeCC(re::makeCC(0, 0x7F), re::makeCC(0xC2, 0xF4)));       
128        pb.createAssign(cc, pb.createCount(u8Begin));
129    }
130    pablo_function_passes(kernel.get());
131    return kernel;
132}
133
134
135
136
// Signature of the generated entry point returned by wcCodeGen():
// (pointer to the file data, file size, index of the file among the inputs).
using wcFunctionType = void (*)(char * byte_data, size_t filesize, size_t fileIdx);
138
139void wcPipelineGen(ParabixDriver & pxDriver) {
140
141    IDISA::IDISA_Builder * iBuilder = pxDriver.getIDISA_Builder();
142    Module * m = iBuilder->getModule();
143   
144    Type * mBitBlockType = iBuilder->getBitBlockType();
145    Constant * record_counts_routine;
146    Type * const size_ty = iBuilder->getSizeTy();
147    Type * const voidTy = iBuilder->getVoidTy();
148    record_counts_routine = m->getOrInsertFunction("record_counts", voidTy, size_ty, size_ty, size_ty, size_ty, size_ty, nullptr);
149    Type * const inputType = PointerType::get(ArrayType::get(ArrayType::get(mBitBlockType, 8), 1), 0);
150   
151    Function * const main = cast<Function>(m->getOrInsertFunction("Main", voidTy, inputType, size_ty, size_ty, nullptr));
152    main->setCallingConv(CallingConv::C);
153    Function::arg_iterator args = main->arg_begin();
154   
155    Value * const inputStream = &*(args++);
156    inputStream->setName("input");
157    Value * const fileSize = &*(args++);
158    fileSize->setName("fileSize");
159    Value * const fileIdx = &*(args++);
160    fileIdx->setName("fileIdx");
161    iBuilder->SetInsertPoint(BasicBlock::Create(m->getContext(), "entry", main,0));
162
163    StreamSetBuffer * ByteStream = pxDriver.addExternalBuffer(make_unique<ExternalFileBuffer>(iBuilder, iBuilder->getStreamSetTy(1, 8)), inputStream);
164
165    StreamSetBuffer * BasisBits = pxDriver.addBuffer(make_unique<SingleBlockBuffer>(iBuilder, iBuilder->getStreamSetTy(8, 1)));
166
167    KernelBuilder * mmapK = pxDriver.addKernelInstance(make_unique<MMapSourceKernel>(iBuilder));
168    mmapK->setInitialArguments({fileSize});
169    pxDriver.makeKernelCall(mmapK, {}, {ByteStream});
170
171    KernelBuilder * s2pk = pxDriver.addKernelInstance(make_unique<S2PKernel>(iBuilder));
172    pxDriver.makeKernelCall(s2pk, {ByteStream}, {BasisBits});
173   
174    KernelBuilder * wck = pxDriver.addKernelInstance(wc_gen(iBuilder));
175    pxDriver.makeKernelCall(wck, {BasisBits}, {});
176
177
178    pxDriver.generatePipelineIR();
179   
180    Value * lineCount = wck->createGetAccumulatorCall("lineCount");
181    Value * wordCount = wck->createGetAccumulatorCall("wordCount");
182    Value * charCount = wck->createGetAccumulatorCall("charCount");
183
184    iBuilder->CreateCall(record_counts_routine, std::vector<Value *>({lineCount, wordCount, charCount, fileSize, fileIdx}));
185   
186    iBuilder->CreateRetVoid();
187
188    pxDriver.linkAndFinalize();
189}
190
191
192wcFunctionType wcCodeGen(void) {
193    Module * M = new Module("wc", getGlobalContext());
194    IDISA::IDISA_Builder * idb = IDISA::GetIDISA_Builder(M);
195    ParabixDriver pxDriver(idb);
196   
197    wcPipelineGen(pxDriver);
198
199    wcFunctionType main = reinterpret_cast<wcFunctionType>(pxDriver.getPointerToMain());
200    delete idb;
201    return main;
202}
203
204void wc(wcFunctionType fn_ptr, const int64_t fileIdx) {
205    std::string fileName = inputFiles[fileIdx];
206    size_t fileSize;
207    char * fileBuffer;
208   
209    const boost::filesystem::path file(fileName);
210    if (exists(file)) {
211        if (is_directory(file)) {
212            return;
213        }
214    } else {
215        std::cerr << "Error: cannot open " << fileName << " for processing. Skipped.\n";
216        return;
217    }
218   
219    fileSize = file_size(file);
220    boost::iostreams::mapped_file_source mappedFile;
221    if (fileSize == 0) {
222        fileBuffer = nullptr;
223    }
224    else {
225        try {
226            mappedFile.open(fileName);
227        } catch (std::exception &e) {
228            std::cerr << "Error: Boost mmap of " << fileName << ": " << e.what() << std::endl;
229            return;
230        }
231        fileBuffer = const_cast<char *>(mappedFile.data());
232    }
233    fn_ptr(fileBuffer, fileSize, fileIdx);
234
235    mappedFile.close();
236   
237}
238
239
240
241int main(int argc, char *argv[]) {
242    AddParabixVersionPrinter();
243    cl::HideUnrelatedOptions(ArrayRef<const cl::OptionCategory *>{&wcFlags, pablo_toolchain_flags(), codegen::codegen_flags()});
244    cl::ParseCommandLineOptions(argc, argv);
245    if (wcOptions.size() == 0) {
246        CountLines = true;
247        CountWords = true;
248        CountBytes = true;
249    }
250    else {
251        CountLines = false;
252        CountWords = false;
253        CountBytes = false;
254        CountChars = false;
255        for (unsigned i = 0; i < wcOptions.size(); i++) {
256            switch (wcOptions[i]) {
257                case WordOption: CountWords = true; break;
258                case LineOption: CountLines = true; break;
259                case CharOption: CountBytes = true; CountChars = false; break;
260                case ByteOption: CountChars = true; CountBytes = false; break;
261            }
262        }
263    }
264   
265   
266    wcFunctionType fn_ptr = wcCodeGen();
267
268    int fileCount = inputFiles.size();
269    lineCount.resize(fileCount);
270    wordCount.resize(fileCount);
271    charCount.resize(fileCount);
272    byteCount.resize(fileCount);
273   
274    for (unsigned i = 0; i < inputFiles.size(); ++i) {
275        wc(fn_ptr, i);
276    }
277   
278    size_t maxCount = 0;
279    if (CountLines) maxCount = TotalLines;
280    if (CountWords) maxCount = TotalWords;
281    if (CountChars) maxCount = TotalChars;
282    if (CountBytes) maxCount = TotalBytes;
283   
284    int fieldWidth = std::to_string(maxCount).size() + 1;
285    if (fieldWidth < defaultFieldWidth) fieldWidth = defaultFieldWidth;
286
287    for (unsigned i = 0; i < inputFiles.size(); ++i) {
288        std::cout << std::setw(fieldWidth-1);
289        if (CountLines) {
290            std::cout << lineCount[i] << std::setw(fieldWidth);
291        }
292        if (CountWords) {
293            std::cout << wordCount[i] << std::setw(fieldWidth);
294        }
295        if (CountChars) {
296            std::cout << charCount[i] << std::setw(fieldWidth);
297        }
298        if (CountBytes) {
299            std::cout << byteCount[i];
300        }
301        std::cout << " " << inputFiles[i] << std::endl;
302    }
303    if (inputFiles.size() > 1) {
304        std::cout << std::setw(fieldWidth-1);
305        if (CountLines) {
306            std::cout << TotalLines << std::setw(fieldWidth);
307        }
308        if (CountWords) {
309            std::cout << TotalWords << std::setw(fieldWidth);
310        }
311        if (CountChars) {
312            std::cout << TotalChars << std::setw(fieldWidth);
313        }
314        if (CountBytes) {
315            std::cout << TotalBytes;
316        }
317        std::cout << " total" << std::endl;
318    }
319
320    return 0;
321}
Note: See TracBrowser for help on using the repository browser.