source: icGREP/icgrep-devel/icgrep/editd/editd.cpp @ 5177

Last change on this file since 5177 was 5177, checked in by cameron, 3 years ago

Editd fixes for LLVM 3.9

  • Property svn:executable set to *
File size: 13.4 KB
RevLine 
[5172]1/*
2 *  Copyright (c) 2015 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
#include <algorithm>
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <limits>
#include <sstream>
#include <string>
#include <vector>


#include <toolchain.h>
#include <pablo/pablo_toolchain.h>
#include <llvm/IR/Function.h>
#include <llvm/IR/Module.h>
#include <llvm/ExecutionEngine/ExecutionEngine.h>
#include <llvm/ExecutionEngine/MCJIT.h>
#include "llvm/Linker/Linker.h"

#include <llvm/Support/CommandLine.h>
#include <llvm/Support/raw_ostream.h>

#include <re/re_cc.h>
#include <cc/cc_compiler.h>
#include <pablo/function.h>
#include <pablo/pablo_compiler.h>
#include <pablo/pablo_kernel.h>
#include <IDISA/idisa_builder.h>
#include <IDISA/idisa_target.h>
#include <kernels/streamset.h>
#include <kernels/interface.h>
#include <kernels/kernel.h>
#include <kernels/s2p_kernel.h>
#include <editd/editdscan_kernel.h>
#include <kernels/pipeline.h>

#include <re/re_alt.h>
#include <editd/pattern_compiler.h>

// mmap system
#include <boost/filesystem.hpp>
#include <boost/iostreams/device/mapped_file.hpp>
#include <fcntl.h>
46static cl::list<std::string> inputFiles(cl::Positional, cl::desc("<regex> <input file ...>"), cl::OneOrMore);
47
48static cl::list<std::string> pattVector("e", cl::desc("pattern"), cl::ZeroOrMore);
49static cl::opt<std::string> PatternFilename("f", cl::desc("Take patterns (one per line) from a file"), cl::value_desc("regex file"), cl::init(""));
50
51static cl::opt<bool> CaseInsensitive("i", cl::desc("Ignore case distinctions in the pattern and the file."));
52
53static cl::opt<int> editDistance("edit-dist", cl::desc("Edit Distance Value"), cl::init(2));
54static cl::opt<int> optPosition("opt-pos", cl::desc("Optimize position"), cl::init(8));
55static cl::opt<int> stepSize("step-size", cl::desc("Step Size"), cl::init(3));
56
57using namespace kernel;
58using namespace pablo;
59
// One candidate match as reported through wrapped_report_pos.
struct matchPosition {
    size_t pos;   // reported match position
    size_t dist;  // distance value reported together with the position
};

// Every match reported so far, in the order the reports arrived.
std::vector<matchPosition> matchList;
67
// Placeholder: currently performs no work. The sorting of matchList is done
// inside run_second_filter instead.
void sort_match_list() {
}
71
72void run_second_filter(int total_len, int pattern_segs, float errRate){
73   
74    if(matchList.size() == 0) return;
75
76    //Sort match position
77    bool exchanged = true;
78    while(exchanged){
79        exchanged = false;
[5177]80        for (unsigned i=0; i<matchList.size()-1; i++){
[5173]81            if(matchList[i].pos > matchList[i+1].pos){
82                size_t tmp_pos = matchList[i].pos;
83                size_t tmp_dist = matchList[i].dist;
84                matchList[i].pos = matchList[i+1].pos;
85                matchList[i].dist = matchList[i+1].dist;
86                matchList[i+1].pos = tmp_pos;
87                matchList[i+1].dist = tmp_dist;
88                exchanged = true;
89            }
90        }
91    }
92
93    std::cerr << "pattern_segs = " << pattern_segs << ", total_len = " << total_len << std::endl;
94
95    int v = pattern_segs * (editDistance+1) - total_len * errRate;
96
97    int startPos = matchList[0].pos;
98    int sum = matchList[0].dist;
99    int curIdx = 0;
[5177]100    unsigned i = 0;
[5173]101    int count = 0;
102    while (i < matchList.size()){
103        if(matchList[i].pos - startPos < total_len * (errRate+1)){
104            sum += matchList[i].dist;
105            i++;
106        }
107        else{
108            if(sum > v) count++;
109            sum -= matchList[curIdx].dist;
110            curIdx++;
111            startPos = matchList[curIdx].pos;
112        }
113    }
114    std::cout << "matching value is " << v << std::endl;
115    std::cout << "total candidate from the first filter is " << matchList.size() << std::endl;
116    std::cout << "total candidate from the second filter is " << count << std::endl;
117}
118
[5172]119extern "C" {
120void wrapped_report_pos(size_t match_pos, int dist) {
[5173]121        struct matchPosition curMatch;
122        curMatch.pos = match_pos;
123        curMatch.dist = dist;
124        matchList.push_back(curMatch);
[5172]125        std::cout << "pos: " << match_pos << ", dist:" << dist << "\n";
126    }
127
128}
129
130void icgrep_Linking(Module * m, ExecutionEngine * e) {
131    Module::FunctionListType & fns = m->getFunctionList();
132    for (Module::FunctionListType::iterator it = fns.begin(), it_end = fns.end(); it != it_end; ++it) {
133        std::string fnName = it->getName().str();
134        if (fnName == "wrapped_report_pos") {
135            e->addGlobalMapping(cast<GlobalValue>(it), (void *)&wrapped_report_pos);
136        }
137    }
138}
139
[5173]140void get_editd_pattern(int & pattern_segs, int & total_len) {
[5172]141 
142    if (PatternFilename != "") {
143        std::ifstream pattFile(PatternFilename.c_str());
144        std::string r;
145        if (pattFile.is_open()) {
146            while (std::getline(pattFile, r)) {
147                pattVector.push_back(r);
[5173]148                pattern_segs ++; 
149                total_len += r.size(); 
[5172]150            }
151            pattFile.close();
152        }
153    }
154   
155    // if there are no regexes specified through -e or -f, the first positional argument
156    // must be a regex, not an input file.
157   
158    if (pattVector.size() == 0) {
159        pattVector.push_back(inputFiles[0]);
160        inputFiles.erase(inputFiles.begin());
161    }
162}
163
164Function * editdPipeline(Module * mMod, IDISA::IDISA_Builder * iBuilder, pablo::PabloFunction * function) {
165    Type * mBitBlockType = iBuilder->getBitBlockType();
166   
167    ExternalFileBuffer ChStream(iBuilder, StreamSetType(4, i1));
168    SingleBlockBuffer MatchResults(iBuilder, StreamSetType(editDistance+1, i1));
169
170    pablo_function_passes(function);
171    pablo::PabloKernel  editdk(iBuilder, "editd", function, {});
[5173]172    kernel::editdScanKernel editdScanK(iBuilder, editDistance);
[5172]173   
174    std::unique_ptr<Module> editdM = editdk.createKernelModule({&ChStream}, {&MatchResults});
175    std::unique_ptr<Module> scanM = editdScanK.createKernelModule({&MatchResults}, {});               
176   
177    editdk.addKernelDeclarations(mMod);
178    editdScanK.addKernelDeclarations(mMod);
179
180    Type * const size_ty = iBuilder->getSizeTy();
181    Type * const voidTy = Type::getVoidTy(mMod->getContext());
182    Type * const inputType = PointerType::get(ArrayType::get(ArrayType::get(mBitBlockType, 8), 1), 0);
183   
184    Function * const main = cast<Function>(mMod->getOrInsertFunction("Main", voidTy, inputType, size_ty, nullptr));
185    main->setCallingConv(CallingConv::C);
186    Function::arg_iterator args = main->arg_begin();
187   
188    Value * const inputStream = &*(args++);
189    inputStream->setName("input");
190    Value * const fileSize = &*(args++);
191    fileSize->setName("fileSize");
192   
193    iBuilder->SetInsertPoint(BasicBlock::Create(mMod->getContext(), "entry", main,0));
194
195    ChStream.setStreamSetBuffer(inputStream, fileSize);
196    MatchResults.allocateBuffer();
197   
198    Value * editdInstance = editdk.createInstance({});
199    Value * scanMatchInstance = editdScanK.createInstance({});
200   
201    generatePipelineLoop(iBuilder, {&editdk, &editdScanK}, {editdInstance, scanMatchInstance}, fileSize);
202       
203    iBuilder->CreateRetVoid();
204   
205    Linker L(*mMod);
206    L.linkInModule(std::move(editdM));
207    L.linkInModule(std::move(scanM));
208   
209    return main;
210}
211
212Function * preprocessPipeline(Module * mMod, IDISA::IDISA_Builder * iBuilder, pablo::PabloFunction * function) {
213    Type * mBitBlockType = iBuilder->getBitBlockType();
214   
215    ExternalFileBuffer ByteStream(iBuilder, StreamSetType(1, i8));
216    SingleBlockBuffer BasisBits(iBuilder, StreamSetType(8, i1));
217    ExternalFileBuffer CCResults(iBuilder, StreamSetType(4, i1));
218
219    s2pKernel  s2pk(iBuilder);
220    std::unique_ptr<Module> s2pM = s2pk.createKernelModule({&ByteStream}, {&BasisBits});
221
222    pablo_function_passes(function);
223    pablo::PabloKernel  ccck(iBuilder, "ccc", function, {});
224   
225    std::unique_ptr<Module> cccM = ccck.createKernelModule({&BasisBits}, {&CCResults});
226   
227    s2pk.addKernelDeclarations(mMod);
228    ccck.addKernelDeclarations(mMod);
229
230    Type * const size_ty = iBuilder->getSizeTy();
231    Type * const voidTy = Type::getVoidTy(mMod->getContext());
232    Type * const inputType = PointerType::get(ArrayType::get(ArrayType::get(mBitBlockType, 8), 1), 0);
233    Type * const outputType = PointerType::get(ArrayType::get(mBitBlockType, 4), 0);
234   
235    Function * const main = cast<Function>(mMod->getOrInsertFunction("Main", voidTy, inputType, size_ty, outputType, nullptr));
236    main->setCallingConv(CallingConv::C);
237    Function::arg_iterator args = main->arg_begin();
238   
239    Value * const inputStream = &*(args++);
240    inputStream->setName("input");
241    Value * const fileSize = &*(args++);
242    fileSize->setName("fileSize");
243    Value * const outputStream = &*(args++);
244    outputStream->setName("output");
245   
246    iBuilder->SetInsertPoint(BasicBlock::Create(mMod->getContext(), "entry", main,0));
247
248    ByteStream.setStreamSetBuffer(inputStream, fileSize);
249    BasisBits.allocateBuffer();
250    CCResults.setStreamSetBuffer(outputStream, fileSize);
251   
252    Value * s2pInstance = s2pk.createInstance({});
253    Value * cccInstance = ccck.createInstance({});
254   
255    generatePipelineLoop(iBuilder, {&s2pk, &ccck}, {s2pInstance, cccInstance}, fileSize);
256       
257    iBuilder->CreateRetVoid();
258   
259    Linker L(*mMod);
260    L.linkInModule(std::move(s2pM));
261    L.linkInModule(std::move(cccM));
262   
263    return main;
264}
265
266
267typedef void (*preprocessFunctionType)(char * byte_data, size_t filesize, char * output_data);
268static ExecutionEngine * preprocessEngine = nullptr;
269
270preprocessFunctionType preprocessCodeGen() {
271                           
[5176]272    LLVMContext TheContext;
273    Module * M = new Module("preprocess", TheContext);
[5172]274    IDISA::IDISA_Builder * idb = IDISA::GetIDISA_Builder(M);
275
276    PabloFunction * function = PabloFunction::Create("preprocess", 8, 4);
277    cc::CC_Compiler ccc(*function);
278    pablo::PabloBuilder pBuilder(ccc.getBuilder().getPabloBlock(), ccc.getBuilder());
279
280    pablo::PabloAST * A = ccc.compileCC(re::makeCC(re::makeCC(0x41), re::makeCC(0x61)));
281    pablo::PabloAST * C = ccc.compileCC(re::makeCC(re::makeCC(0x43), re::makeCC(0x63)));
282    pablo::PabloAST * T = ccc.compileCC(re::makeCC(re::makeCC(0x54), re::makeCC(0x74)));
283    pablo::PabloAST * G = ccc.compileCC(re::makeCC(re::makeCC(0x47), re::makeCC(0x67)));
284
285    function->setResult(0, pBuilder.createAssign("A", A));
286    function->setResult(1, pBuilder.createAssign("C", C));
287    function->setResult(2, pBuilder.createAssign("T", T));
288    function->setResult(3, pBuilder.createAssign("G", G));
289
290    llvm::Function * main_IR = preprocessPipeline(M, idb, function);
291
292    preprocessEngine = JIT_to_ExecutionEngine(M);
293   
294    preprocessEngine->finalizeObject();
295
296    delete idb;
297    return reinterpret_cast<preprocessFunctionType>(preprocessEngine->getPointerToFunction(main_IR));
298}
299
300typedef void (*editdFunctionType)(char * byte_data, size_t filesize);
301static ExecutionEngine * editdEngine = nullptr;
302
303editdFunctionType editdCodeGen() {
304                           
[5177]305    LLVMContext TheContext;
[5176]306    Module * M = new Module("editd", TheContext);
[5172]307    IDISA::IDISA_Builder * idb = IDISA::GetIDISA_Builder(M);
308
309    PabloFunction * function = PabloFunction::Create("editd", 4, editDistance+1);
310    pablo::PabloBuilder main (function->getEntryBlock());
311
312    const PabloType * streamType = getPabloType(PabloType::Stream, 1);
313
314    std::vector<pablo::Var *>   basisBits(4);
315    function->setParameter(0, basisBits[0] = main.createVar("PatA", streamType));
316    function->setParameter(1, basisBits[1] = main.createVar("PatC", streamType));
317    function->setParameter(2, basisBits[2] = main.createVar("PatT", streamType));
318    function->setParameter(3, basisBits[3] = main.createVar("PatG", streamType));
319
320    re::Pattern_Compiler pattern_compiler(*function);
321    pattern_compiler.compile(pattVector, main, basisBits, editDistance, optPosition, stepSize);
322
323    llvm::Function * main_IR = editdPipeline(M, idb, function);
324
325    editdEngine = JIT_to_ExecutionEngine(M);
326   
327    editdEngine->finalizeObject();
328
329    delete idb;
330    return reinterpret_cast<editdFunctionType>(editdEngine->getPointerToFunction(main_IR));
331}
332
333char * preprocess(preprocessFunctionType fn_ptr, int & size) {
334    std::string fileName = inputFiles[0];
335    size_t fileSize;
336    char * fileBuffer;
337   
338    const boost::filesystem::path file(fileName);
339    if (exists(file)) {
340        if (is_directory(file)) {
341            exit(0);
342        }
343    } else {
344        std::cerr << "Error: cannot open " << fileName << " for processing. Skipped.\n";
345        exit(0);
346    }
347   
348    fileSize = file_size(file);
349    boost::iostreams::mapped_file_source mappedFile;
350    if (fileSize == 0) {
351        fileBuffer = nullptr;
352    }
353    else {
354        try {
355            mappedFile.open(fileName);
356        } catch (std::exception &e) {
357            std::cerr << "Error: Boost mmap of " << fileName << ": " << e.what() << std::endl;
358            exit(0);
359        }
360        fileBuffer = const_cast<char *>(mappedFile.data());
361    }
362    char * chStream = (char *) aligned_alloc(32, fileSize);
363    fn_ptr(fileBuffer, fileSize, chStream);
364    size = fileSize;
365
366    mappedFile.close();
367
368    return chStream;
369   
370}
371
372void editd(editdFunctionType fn_ptr, char * chStream, int size) {
373 
374    if (size == 0) {
375        chStream = nullptr;
376    }
377
378    fn_ptr(chStream, size);
379   
380}
381
382int main(int argc, char *argv[]) {
383
384    cl::ParseCommandLineOptions(argc, argv);
385
[5173]386    int pattern_segs = 0;
387    int total_len = 0;
[5172]388
[5173]389    get_editd_pattern(pattern_segs, total_len);
390 
[5172]391    preprocessFunctionType preprocess_ptr = preprocessCodeGen();
392    int size = 0;
393    char * chStream = preprocess(preprocess_ptr, size);
[5173]394   
[5172]395    editdFunctionType editd_ptr = editdCodeGen();
396    editd(editd_ptr, chStream, size);
397
[5173]398    if(pattVector.size()>1)
399        run_second_filter(pattern_segs, total_len, 0.15);
400
[5172]401
402    return 0;
403}
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
Note: See TracBrowser for help on using the repository browser.