source: icGREP/icgrep-devel/icgrep/wc.cpp @ 5029

Last change on this file since 5029 was 5029, checked in by cameron, 3 years ago

Command line and output format improvements for wc - Posix compatibility

File size: 18.2 KB
Line 
1/*
2 *  Copyright (c) 2015 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
7#include <string>
8#include <iostream>
9#include <iomanip>
10#include <fstream>
11#include <sstream>
12
13#include <llvm/IR/Function.h>
14#include <llvm/IR/Module.h>
15#include <llvm/ExecutionEngine/ExecutionEngine.h>
16#include <llvm/ExecutionEngine/MCJIT.h>
17#include <llvm/IRReader/IRReader.h>
18#include <llvm/IR/Verifier.h>
19#include <llvm/Support/Debug.h>
20
21#include <llvm/Support/CommandLine.h>
22#include <llvm/CodeGen/CommandFlags.h>
23#include <llvm/Support/SourceMgr.h>
24#include <llvm/Support/TargetSelect.h>
25#include <llvm/Support/Host.h>
26#include <llvm/Support/raw_ostream.h>
27
28#include <utf_encoding.h>
29#include <re/re_cc.h>
30#include <cc/cc_compiler.h>
31#include <pablo/function.h>
32#include <IDISA/idisa_builder.h>
33#include <IDISA/idisa_target.h>
34#include <kernels/instance.h>
35#include <kernels/kernel.h>
36#include <kernels/s2p_kernel.h>
37
38#include <pablo/pablo_compiler.h>
39#include <pablo/pablo_toolchain.h>
40
41// Dynamic processor detection
42#define ISPC_LLVM_VERSION ISPC_LLVM_3_6
43#include <util/ispc.cpp>
44
45#include <utf_encoding.h>
46
47// mmap system
48#include <boost/filesystem.hpp>
49#include <boost/iostreams/device/mapped_file.hpp>
50using namespace boost::iostreams;
51using namespace boost::filesystem;
52
53#include <fcntl.h>
54static cl::OptionCategory wcFlags("Command Flags", "wc options");
55
56static cl::list<std::string> inputFiles(cl::Positional, cl::desc("<input file ...>"), cl::OneOrMore, cl::cat(wcFlags));
57
58enum CountOptions {
59    LineOption, WordOption, CharOption, ByteOption
60};
61
62static cl::list<CountOptions> wcOptions(cl::desc("Counting options."),
63  cl::values(clEnumValN(LineOption, "l", "Report the number of lines in each input file."),
64             clEnumValN(WordOption, "w", "Report the number of words in each input file."),
65             clEnumValN(CharOption, "m", "Report the number of characters in each input file (override -c)."),
66             clEnumValN(ByteOption, "c", "Report the number of bytes in each input file (override -m)."),
67             clEnumValEnd), cl::cat(wcFlags), cl::Grouping);
68                                                 
69static cl::OptionCategory eIRDumpOptions("LLVM IR Dump Options", "These options control dumping of LLVM IR.");
70static cl::opt<bool> DumpGeneratedIR("dump-generated-IR", cl::init(false), cl::desc("Print LLVM IR generated by Pablo Compiler."), cl::cat(eIRDumpOptions));
71
72static cl::OptionCategory cMachineCodeOptimization("Machine Code Optimizations", "These options control back-end compilier optimization levels.");
73
74static cl::opt<char> OptLevel("O", cl::desc("Optimization level. [-O0, -O1, -O2, or -O3] (default = '-O0')"),
75                              cl::cat(cMachineCodeOptimization), cl::Prefix, cl::ZeroOrMore, cl::init('0'));
76
77static cl::opt<unsigned> SegmentSize("segment-size", cl::desc("Segment Size"), cl::value_desc("positive integer"), cl::init(1));
78
79
// Minimum width of a count column in the report printed by main().
static int defaultFieldWidth{7};

// Which counts are to be produced; set by main() from the command line and
// read by wc_gen() when the Pablo function is generated.
bool CountLines{false};
bool CountWords{false};
bool CountChars{false};
bool CountBytes{false};

// Per-file results, indexed by the file's position in inputFiles.
// Sized in main(), filled in by record_counts().
std::vector<uint64_t> lineCount{};
std::vector<uint64_t> wordCount{};
std::vector<uint64_t> charCount{};
std::vector<uint64_t> byteCount{};

// Running sums over all files processed so far (the "total" row).
uint64_t TotalLines{0};
uint64_t TotalWords{0};
uint64_t TotalChars{0};
uint64_t TotalBytes{0};
97
98
99//  The callback routine that records counts in progress.
100//
101extern "C" {
102    void record_counts(uint64_t lines, uint64_t words, uint64_t chars, uint64_t bytes, uint64_t fileIdx) {
103        lineCount[fileIdx] = lines;
104        wordCount[fileIdx] = words;
105        charCount[fileIdx] = chars;
106        byteCount[fileIdx] = bytes;
107        TotalLines += lines;
108        TotalWords += words;
109        TotalChars += chars;
110        TotalBytes += bytes;
111    }
112}
113
114//
115//
116
117ExecutionEngine * wcJIT_to_ExecutionEngine (Module * m) {
118
119    InitializeNativeTarget();
120    InitializeNativeTargetAsmPrinter();
121    InitializeNativeTargetAsmParser();
122
123    PassRegistry * Registry = PassRegistry::getPassRegistry();
124    initializeCore(*Registry);
125    initializeCodeGen(*Registry);
126    initializeLowerIntrinsicsPass(*Registry);
127
128    std::string errMessage;
129    EngineBuilder builder(std::move(std::unique_ptr<Module>(m)));
130    builder.setErrorStr(&errMessage);
131    builder.setMCPU(sys::getHostCPUName());
132    CodeGenOpt::Level optLevel = CodeGenOpt::Level::None;
133    switch (OptLevel) {
134        case '0': optLevel = CodeGenOpt::None; break;
135        case '1': optLevel = CodeGenOpt::Less; break;
136        case '2': optLevel = CodeGenOpt::Default; break;
137        case '3': optLevel = CodeGenOpt::Aggressive; break;
138        default: errs() << OptLevel << " is an invalid optimization level.\n";
139    }
140    builder.setOptLevel(optLevel);
141
142    if ((strncmp(lGetSystemISA(), "avx2", 4) == 0)) {
143            std::vector<std::string> attrs;
144            attrs.push_back("avx2");
145            builder.setMAttrs(attrs);
146    }
147
148    // builder.selectTarget();
149
150    //builder.setOptLevel(mMaxWhileDepth ? CodeGenOpt::Level::Less : CodeGenOpt::Level::None);
151    ExecutionEngine * engine = builder.create();
152    if (engine == nullptr) {
153        throw std::runtime_error("Could not create ExecutionEngine: " + errMessage);
154    }
155    return engine;
156}
157
158
159pablo::PabloFunction * wc_gen(Encoding encoding) {
160    //  input: 8 basis bit streams
161    //  output: 3 count streams
162   
163    pablo::PabloFunction * function = pablo::PabloFunction::Create("wc", 8, 3);
164    cc::CC_Compiler ccc(*function, encoding);
165   
166    pablo::PabloBuilder pBuilder(ccc.getBuilder().getPabloBlock(), ccc.getBuilder());
167    const std::vector<pablo::Var *> u8_bits = ccc.getBasisBits();
168
169    if (CountLines) {
170        pablo::PabloAST * LF = ccc.compileCC(re::makeCC(0x0A));
171        function->setResult(0, pBuilder.createAssign("lineCount", pBuilder.createCount(LF)));
172    }
173    else function->setResult(0, pBuilder.createAssign("lineCount", pBuilder.createZeroes()));
174    // FIXME - we need to limit this to pablo.inFile() because null bytes past EOF are matched by wordChar
175    if (CountWords) {
176        pablo::PabloAST * WS = ccc.compileCC(re::makeCC(re::makeCC(0x09, 0x0D), re::makeCC(0x20)));
177       
178        pablo::PabloAST * wordChar = ccc.compileCC(re::makeCC(re::makeCC(re::makeCC(0x00, 0x08), re::makeCC(0xE, 0x1F)), re::makeCC(0x21, 0xFF)));
179        // WS_follow_or_start = 1 past WS or at start of file
180        pablo::PabloAST * WS_follow_or_start = pBuilder.createNot(pBuilder.createAdvance(pBuilder.createNot(WS), 1));
181        //
182        pablo::PabloAST * wordStart = pBuilder.createAnd(wordChar, WS_follow_or_start);
183        function->setResult(1, pBuilder.createAssign("wordCount", pBuilder.createCount(wordStart)));
184    }
185    else function->setResult(1, pBuilder.createAssign("wordCount", pBuilder.createZeroes()));
186    if (CountChars) {
187        //
188        // FIXME: This correctly counts characters assuming valid UTF-8 input.  But what if input is
189        // not UTF-8, or is not valid?
190        //
191        pablo::PabloAST * u8Begin = ccc.compileCC(re::makeCC(re::makeCC(0, 0x7F), re::makeCC(0xC2, 0xF4)));
192        function->setResult(2, pBuilder.createAssign("charCount", pBuilder.createCount(u8Begin)));
193    }
194    else function->setResult(2, pBuilder.createAssign("charCount", pBuilder.createZeroes()));
195    return function;
196}
197
198using namespace kernel;
199
200
201class wcPipelineBuilder {
202public:
203    wcPipelineBuilder(llvm::Module * m, IDISA::IDISA_Builder * b);
204   
205    ~wcPipelineBuilder();
206   
207    void CreateKernels(pablo::PabloFunction * function);
208    llvm::Function * ExecuteKernels();
209   
210private:
211    llvm::Module *                      mMod;
212    IDISA::IDISA_Builder *              iBuilder;
213    KernelBuilder *                     mS2PKernel;
214    KernelBuilder *                     mWC_Kernel;
215    llvm::Type *                        mBitBlockType;
216    int                                 mBlockSize;
217};
218
219
220using namespace pablo;
221using namespace kernel;
222
223wcPipelineBuilder::wcPipelineBuilder(Module * m, IDISA::IDISA_Builder * b)
224: mMod(m)
225, iBuilder(b)
226, mBitBlockType(b->getBitBlockType())
227, mBlockSize(b->getBitBlockWidth()){
228   
229}
230
231wcPipelineBuilder::~wcPipelineBuilder(){
232    delete mS2PKernel;
233    delete mWC_Kernel;
234}
235
236void wcPipelineBuilder::CreateKernels(PabloFunction * function){
237    mS2PKernel = new KernelBuilder(iBuilder, "s2p", SegmentSize);
238    mWC_Kernel = new KernelBuilder(iBuilder, "wc", SegmentSize);
239   
240    generateS2PKernel(mMod, iBuilder, mS2PKernel);
241   
242    pablo_function_passes(function);
243   
244    PabloCompiler pablo_compiler(mMod, iBuilder);
245    try {
246        pablo_compiler.setKernel(mWC_Kernel);
247        pablo_compiler.compile(function);
248        delete function;
249        releaseSlabAllocatorMemory();
250    } catch (std::runtime_error e) {
251        delete function;
252        releaseSlabAllocatorMemory();
253        std::cerr << "Runtime error: " << e.what() << std::endl;
254        exit(1);
255    }
256   
257}
258
259
260
261
262Function * wcPipelineBuilder::ExecuteKernels() {
263    Constant * record_counts_routine;
264    Type * const int64ty = iBuilder->getInt64Ty();
265    Type * const voidTy = Type::getVoidTy(mMod->getContext());
266    record_counts_routine = mMod->getOrInsertFunction("record_counts", voidTy, int64ty, int64ty, int64ty, int64ty, int64ty, nullptr);
267    Type * const inputType = PointerType::get(ArrayType::get(StructType::get(mMod->getContext(), std::vector<Type *>({ArrayType::get(mBitBlockType, 8)})), 1), 0);
268   
269    Function * const main = cast<Function>(mMod->getOrInsertFunction("Main", voidTy, inputType, int64ty, int64ty, nullptr));
270    main->setCallingConv(CallingConv::C);
271    Function::arg_iterator args = main->arg_begin();
272   
273    Value * const inputStream = &*(args++);
274    inputStream->setName("input");
275    Value * const bufferSize = &*(args++);
276    bufferSize->setName("bufferSize");
277    Value * const fileIdx = &*(args++);
278    fileIdx->setName("fileIdx");
279   
280    iBuilder->SetInsertPoint(BasicBlock::Create(mMod->getContext(), "entry", main,0));
281   
282    BasicBlock * entryBlock = iBuilder->GetInsertBlock();
283
284    BasicBlock * segmentCondBlock = nullptr;
285    BasicBlock * segmentBodyBlock = nullptr;
286    const unsigned segmentSize = SegmentSize;
287    if (segmentSize > 1) {
288        segmentCondBlock = BasicBlock::Create(mMod->getContext(), "segmentCond", main, 0);
289        segmentBodyBlock = BasicBlock::Create(mMod->getContext(), "segmentBody", main, 0);
290    }
291    BasicBlock * fullCondBlock = BasicBlock::Create(mMod->getContext(), "fullCond", main, 0);
292    BasicBlock * fullBodyBlock = BasicBlock::Create(mMod->getContext(), "fullBody", main, 0);
293    BasicBlock * finalBlock = BasicBlock::Create(mMod->getContext(), "final", main, 0);
294    BasicBlock * finalPartialBlock = BasicBlock::Create(mMod->getContext(), "partial", main, 0);
295    BasicBlock * finalEmptyBlock = BasicBlock::Create(mMod->getContext(), "empty", main, 0);
296    BasicBlock * endBlock = BasicBlock::Create(mMod->getContext(), "end", main, 0);
297
298    Instance * s2pInstance = mS2PKernel->instantiate(inputStream);
299    Instance * wcInstance = mWC_Kernel->instantiate(s2pInstance->getOutputStreamBuffer());
300
301    Value * initialBufferSize = nullptr;
302    BasicBlock * initialBlock = nullptr;
303   
304    if (segmentSize > 1) {
305        iBuilder->CreateBr(segmentCondBlock);
306        iBuilder->SetInsertPoint(segmentCondBlock);
307        PHINode * remainingBytes = iBuilder->CreatePHI(int64ty, 2, "remainingBytes");
308        remainingBytes->addIncoming(bufferSize, entryBlock);
309        Constant * const step = ConstantInt::get(int64ty, mBlockSize * segmentSize);
310        Value * segmentCondTest = iBuilder->CreateICmpULT(remainingBytes, step);
311        iBuilder->CreateCondBr(segmentCondTest, fullCondBlock, segmentBodyBlock);
312        iBuilder->SetInsertPoint(segmentBodyBlock);
313        for (unsigned i = 0; i < segmentSize; ++i) {
314            s2pInstance->CreateDoBlockCall();
315        }
316        for (unsigned i = 0; i < segmentSize; ++i) {
317            wcInstance->CreateDoBlockCall();
318        }
319        remainingBytes->addIncoming(iBuilder->CreateSub(remainingBytes, step), segmentBodyBlock);
320        iBuilder->CreateBr(segmentCondBlock);
321        initialBufferSize = remainingBytes;
322        initialBlock = segmentCondBlock;
323    } else {
324        initialBufferSize = bufferSize;
325        initialBlock = entryBlock;
326        iBuilder->CreateBr(fullCondBlock);
327    }
328
329    iBuilder->SetInsertPoint(fullCondBlock);
330    PHINode * remainingBytes = iBuilder->CreatePHI(int64ty, 2, "remainingBytes");
331    remainingBytes->addIncoming(initialBufferSize, initialBlock);
332
333    Constant * const step = ConstantInt::get(int64ty, mBlockSize);
334    Value * fullCondTest = iBuilder->CreateICmpULT(remainingBytes, step);
335    iBuilder->CreateCondBr(fullCondTest, finalBlock, fullBodyBlock);
336   
337    iBuilder->SetInsertPoint(fullBodyBlock);
338
339    s2pInstance->CreateDoBlockCall();
340    wcInstance->CreateDoBlockCall();
341
342    Value * diff = iBuilder->CreateSub(remainingBytes, step);
343
344    remainingBytes->addIncoming(diff, fullBodyBlock);
345    iBuilder->CreateBr(fullCondBlock);
346   
347    iBuilder->SetInsertPoint(finalBlock);
348    Value * emptyBlockCond = iBuilder->CreateICmpEQ(remainingBytes, ConstantInt::get(int64ty, 0));
349    iBuilder->CreateCondBr(emptyBlockCond, finalEmptyBlock, finalPartialBlock);
350   
351   
352    iBuilder->SetInsertPoint(finalPartialBlock);
353    s2pInstance->CreateDoBlockCall();
354    iBuilder->CreateBr(endBlock);
355   
356    iBuilder->SetInsertPoint(finalEmptyBlock);
357    s2pInstance->clearOutputStreamSet();
358    iBuilder->CreateBr(endBlock);
359   
360    iBuilder->SetInsertPoint(endBlock);
361
362    wcInstance->CreateDoBlockCall();
363   
364    Value * lineCount = iBuilder->CreateExtractElement(iBuilder->CreateBlockAlignedLoad(wcInstance->getOutputStream((int) 0)), iBuilder->getInt32(0));
365    Value * wordCount = iBuilder->CreateExtractElement(iBuilder->CreateBlockAlignedLoad(wcInstance->getOutputStream(1)), iBuilder->getInt32(0));
366    Value * charCount = iBuilder->CreateExtractElement(iBuilder->CreateBlockAlignedLoad(wcInstance->getOutputStream(2)), iBuilder->getInt32(0));
367   
368    iBuilder->CreateCall(record_counts_routine, std::vector<Value *>({lineCount, wordCount, charCount, bufferSize, fileIdx}));
369   
370    iBuilder->CreateRetVoid();
371   
372    return main;
373}
374
375
376typedef void (*wcFunctionType)(char * byte_data, size_t filesize, size_t fileIdx);
377
378static ExecutionEngine * wcEngine = nullptr;
379
380wcFunctionType wcCodeGen(void) {
381                           
382    Module * M = new Module("wc", getGlobalContext());
383   
384    IDISA::IDISA_Builder * idb = GetIDISA_Builder(M);
385
386    wcPipelineBuilder pipelineBuilder(M, idb);
387
388    Encoding encoding(Encoding::Type::UTF_8, 8);
389   
390    pablo::PabloFunction * function = wc_gen(encoding);
391   
392
393    pipelineBuilder.CreateKernels(function);
394
395    llvm::Function * main_IR = pipelineBuilder.ExecuteKernels();
396   
397    if (DumpGeneratedIR) {
398        M->dump();
399    }
400   
401    //verifyModule(*M, &dbgs());
402    //std::cerr << "ExecuteKernels(); done\n";
403    wcEngine = wcJIT_to_ExecutionEngine(M);
404   
405    wcEngine->finalizeObject();
406    //std::cerr << "finalizeObject(); done\n";
407
408    delete idb;
409    return reinterpret_cast<wcFunctionType>(wcEngine->getPointerToFunction(main_IR));
410}
411
412void wc(wcFunctionType fn_ptr, const int64_t fileIdx) {
413    std::string fileName = inputFiles[fileIdx];
414    size_t fileSize;
415    char * fileBuffer;
416   
417    const path file(fileName);
418    if (exists(file)) {
419        if (is_directory(file)) {
420            return;
421        }
422    } else {
423        std::cerr << "Error: cannot open " << fileName << " for processing. Skipped.\n";
424        return;
425    }
426   
427    fileSize = file_size(file);
428    mapped_file_source mappedFile;
429    if (fileSize == 0) {
430        fileBuffer = nullptr;
431    }
432    else {
433        try {
434            mappedFile.open(fileName);
435        } catch (std::exception &e) {
436            std::cerr << "Error: Boost mmap of " << fileName << ": " << e.what() << std::endl;
437            return;
438        }
439        fileBuffer = const_cast<char *>(mappedFile.data());
440    }
441    fn_ptr(fileBuffer, fileSize, fileIdx);
442
443    mappedFile.close();
444   
445}
446
447
448
449
450int main(int argc, char *argv[]) {
451    HideUnrelatedOptions(wcFlags);
452
453    cl::ParseCommandLineOptions(argc, argv);
454    if (wcOptions.size() == 0) {
455        CountLines = true;
456        CountWords = true;
457        CountBytes = true;
458    }
459    else {
460        CountLines = false;
461        CountWords = false;
462        CountBytes = false;
463        CountChars = false;
464        for (unsigned i = 0; i < wcOptions.size(); i++) {
465            switch (wcOptions[i]) {
466                case WordOption: CountWords = true; break;
467                case LineOption: CountLines = true; break;
468                case CharOption: CountBytes = true; CountChars = false; break;
469                case ByteOption: CountChars = true; CountBytes = false; break;
470            }
471        }
472    }
473   
474   
475    wcFunctionType fn_ptr = wcCodeGen();
476
477    int fileCount = inputFiles.size();
478    lineCount.resize(fileCount);
479    wordCount.resize(fileCount);
480    charCount.resize(fileCount);
481    byteCount.resize(fileCount);
482   
483    for (unsigned i = 0; i < inputFiles.size(); ++i) {
484        wc(fn_ptr, i);
485    }
486   
487    delete wcEngine;
488   
489    size_t maxCount = 0;
490    if (CountLines) maxCount = TotalLines;
491    if (CountWords) maxCount = TotalWords;
492    if (CountChars) maxCount = TotalChars;
493    if (CountBytes) maxCount = TotalBytes;
494   
495    int fieldWidth = std::to_string(maxCount).size() + 1;
496    if (fieldWidth < defaultFieldWidth) fieldWidth = defaultFieldWidth;
497
498    for (unsigned i = 0; i < inputFiles.size(); ++i) {
499        std::cout << std::setw(fieldWidth-1);
500        if (CountLines) {
501            std::cout << lineCount[i] << std::setw(fieldWidth);
502        }
503        if (CountWords) {
504            std::cout << wordCount[i] << std::setw(fieldWidth);
505        }
506        if (CountChars) {
507            std::cout << charCount[i] << std::setw(fieldWidth);
508        }
509        if (CountBytes) {
510            std::cout << byteCount[i];
511        }
512        std::cout << " " << inputFiles[i] << std::endl;
513    }
514    if (inputFiles.size() > 1) {
515        std::cout << std::setw(fieldWidth-1);
516        if (CountLines) {
517            std::cout << TotalLines << std::setw(fieldWidth);
518        }
519        if (CountWords) {
520            std::cout << TotalWords << std::setw(fieldWidth);
521        }
522        if (CountChars) {
523            std::cout << TotalChars << std::setw(fieldWidth);
524        }
525        if (CountBytes) {
526            std::cout << TotalBytes;
527        }
528        std::cout << " total" << std::endl;
529    }
530
531    return 0;
532}
533
534                       
Note: See TracBrowser for help on using the repository browser.