source: icGREP/icgrep-devel/icgrep/editd/editd.cpp @ 5173

Last change on this file since 5173 was 5173, checked in by lindanl, 3 years ago

Edit Distance: second level filter.

  • Property svn:executable set to *
File size: 13.4 KB
Line 
1/*
2 *  Copyright (c) 2015 International Characters.
3 *  This software is licensed to the public under the Open Software License 3.0.
4 *  icgrep is a trademark of International Characters.
5 */
6
#include <algorithm>
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <limits>
#include <sstream>
#include <string>
#include <vector>
12
13
14#include <toolchain.h>
15#include <pablo/pablo_toolchain.h>
16#include <llvm/IR/Function.h>
17#include <llvm/IR/Module.h>
18#include <llvm/ExecutionEngine/ExecutionEngine.h>
19#include <llvm/ExecutionEngine/MCJIT.h>
20#include "llvm/Linker/Linker.h"
21
22#include <llvm/Support/CommandLine.h>
23#include <llvm/Support/raw_ostream.h>
24
25#include <re/re_cc.h>
26#include <cc/cc_compiler.h>
27#include <pablo/function.h>
28#include <pablo/pablo_compiler.h>
29#include <pablo/pablo_kernel.h>
30#include <IDISA/idisa_builder.h>
31#include <IDISA/idisa_target.h>
32#include <kernels/streamset.h>
33#include <kernels/interface.h>
34#include <kernels/kernel.h>
35#include <kernels/s2p_kernel.h>
36#include <editd/editdscan_kernel.h>
37#include <kernels/pipeline.h>
38
39#include <re/re_alt.h>
40#include <editd/pattern_compiler.h>
41
42// mmap system
43#include <boost/filesystem.hpp>
44#include <boost/iostreams/device/mapped_file.hpp>
45#include <fcntl.h>
46static cl::list<std::string> inputFiles(cl::Positional, cl::desc("<regex> <input file ...>"), cl::OneOrMore);
47
48static cl::list<std::string> pattVector("e", cl::desc("pattern"), cl::ZeroOrMore);
49static cl::opt<std::string> PatternFilename("f", cl::desc("Take patterns (one per line) from a file"), cl::value_desc("regex file"), cl::init(""));
50
51static cl::opt<bool> CaseInsensitive("i", cl::desc("Ignore case distinctions in the pattern and the file."));
52
53static cl::opt<int> editDistance("edit-dist", cl::desc("Edit Distance Value"), cl::init(2));
54static cl::opt<int> optPosition("opt-pos", cl::desc("Optimize position"), cl::init(8));
55static cl::opt<int> stepSize("step-size", cl::desc("Step Size"), cl::init(3));
56
57using namespace kernel;
58using namespace pablo;
59
// One match reported through wrapped_report_pos: the reported position and
// the distance value reported alongside it.
struct matchPosition
{
    size_t pos;
    size_t dist;
};

// Every match reported so far, in the order in which it was reported.
std::vector<struct matchPosition> matchList;

// Sorts matchList by ascending position.  The sort is stable, so matches
// reported at the same position keep their original relative order.
void sort_match_list(){
    std::stable_sort(matchList.begin(), matchList.end(),
                     [](const matchPosition & lhs, const matchPosition & rhs) {
                         return lhs.pos < rhs.pos;
                     });
}
71
72void run_second_filter(int total_len, int pattern_segs, float errRate){
73   
74    if(matchList.size() == 0) return;
75
76    //Sort match position
77    bool exchanged = true;
78    while(exchanged){
79        exchanged = false;
80        for (int i=0; i<matchList.size()-1; i++){
81            if(matchList[i].pos > matchList[i+1].pos){
82                size_t tmp_pos = matchList[i].pos;
83                size_t tmp_dist = matchList[i].dist;
84                matchList[i].pos = matchList[i+1].pos;
85                matchList[i].dist = matchList[i+1].dist;
86                matchList[i+1].pos = tmp_pos;
87                matchList[i+1].dist = tmp_dist;
88                exchanged = true;
89            }
90        }
91    }
92
93    std::cerr << "pattern_segs = " << pattern_segs << ", total_len = " << total_len << std::endl;
94
95    int v = pattern_segs * (editDistance+1) - total_len * errRate;
96
97    int startPos = matchList[0].pos;
98    int sum = matchList[0].dist;
99    int curIdx = 0;
100    int i = 0;
101    int count = 0;
102    while (i < matchList.size()){
103        if(matchList[i].pos - startPos < total_len * (errRate+1)){
104            sum += matchList[i].dist;
105            i++;
106        }
107        else{
108            if(sum > v) count++;
109            sum -= matchList[curIdx].dist;
110            curIdx++;
111            startPos = matchList[curIdx].pos;
112        }
113    }
114    std::cout << "matching value is " << v << std::endl;
115    std::cout << "total candidate from the first filter is " << matchList.size() << std::endl;
116    std::cout << "total candidate from the second filter is " << count << std::endl;
117}
118
119extern "C" {
120void wrapped_report_pos(size_t match_pos, int dist) {
121        struct matchPosition curMatch;
122        curMatch.pos = match_pos;
123        curMatch.dist = dist;
124        matchList.push_back(curMatch);
125        std::cout << "pos: " << match_pos << ", dist:" << dist << "\n";
126    }
127
128}
129
130void icgrep_Linking(Module * m, ExecutionEngine * e) {
131    Module::FunctionListType & fns = m->getFunctionList();
132    for (Module::FunctionListType::iterator it = fns.begin(), it_end = fns.end(); it != it_end; ++it) {
133        std::string fnName = it->getName().str();
134        if (fnName == "wrapped_report_pos") {
135            e->addGlobalMapping(cast<GlobalValue>(it), (void *)&wrapped_report_pos);
136        }
137    }
138}
139
140void get_editd_pattern(int & pattern_segs, int & total_len) {
141 
142    if (PatternFilename != "") {
143        std::ifstream pattFile(PatternFilename.c_str());
144        std::string r;
145        if (pattFile.is_open()) {
146            while (std::getline(pattFile, r)) {
147                pattVector.push_back(r);
148                pattern_segs ++; 
149                total_len += r.size(); 
150            }
151            pattFile.close();
152        }
153    }
154   
155    // if there are no regexes specified through -e or -f, the first positional argument
156    // must be a regex, not an input file.
157   
158    if (pattVector.size() == 0) {
159        pattVector.push_back(inputFiles[0]);
160        inputFiles.erase(inputFiles.begin());
161    }
162}
163
164Function * editdPipeline(Module * mMod, IDISA::IDISA_Builder * iBuilder, pablo::PabloFunction * function) {
165    Type * mBitBlockType = iBuilder->getBitBlockType();
166   
167    ExternalFileBuffer ChStream(iBuilder, StreamSetType(4, i1));
168    SingleBlockBuffer MatchResults(iBuilder, StreamSetType(editDistance+1, i1));
169
170    pablo_function_passes(function);
171    pablo::PabloKernel  editdk(iBuilder, "editd", function, {});
172    kernel::editdScanKernel editdScanK(iBuilder, editDistance);
173   
174    std::unique_ptr<Module> editdM = editdk.createKernelModule({&ChStream}, {&MatchResults});
175    std::unique_ptr<Module> scanM = editdScanK.createKernelModule({&MatchResults}, {});               
176   
177    editdk.addKernelDeclarations(mMod);
178    editdScanK.addKernelDeclarations(mMod);
179
180    Type * const size_ty = iBuilder->getSizeTy();
181    Type * const voidTy = Type::getVoidTy(mMod->getContext());
182    Type * const inputType = PointerType::get(ArrayType::get(ArrayType::get(mBitBlockType, 8), 1), 0);
183   
184    Function * const main = cast<Function>(mMod->getOrInsertFunction("Main", voidTy, inputType, size_ty, nullptr));
185    main->setCallingConv(CallingConv::C);
186    Function::arg_iterator args = main->arg_begin();
187   
188    Value * const inputStream = &*(args++);
189    inputStream->setName("input");
190    Value * const fileSize = &*(args++);
191    fileSize->setName("fileSize");
192   
193    iBuilder->SetInsertPoint(BasicBlock::Create(mMod->getContext(), "entry", main,0));
194
195    ChStream.setStreamSetBuffer(inputStream, fileSize);
196    MatchResults.allocateBuffer();
197   
198    Value * editdInstance = editdk.createInstance({});
199    Value * scanMatchInstance = editdScanK.createInstance({});
200   
201    generatePipelineLoop(iBuilder, {&editdk, &editdScanK}, {editdInstance, scanMatchInstance}, fileSize);
202       
203    iBuilder->CreateRetVoid();
204   
205    Linker L(*mMod);
206    L.linkInModule(std::move(editdM));
207    L.linkInModule(std::move(scanM));
208   
209    return main;
210}
211
212Function * preprocessPipeline(Module * mMod, IDISA::IDISA_Builder * iBuilder, pablo::PabloFunction * function) {
213    Type * mBitBlockType = iBuilder->getBitBlockType();
214   
215    ExternalFileBuffer ByteStream(iBuilder, StreamSetType(1, i8));
216    SingleBlockBuffer BasisBits(iBuilder, StreamSetType(8, i1));
217    ExternalFileBuffer CCResults(iBuilder, StreamSetType(4, i1));
218
219    s2pKernel  s2pk(iBuilder);
220    std::unique_ptr<Module> s2pM = s2pk.createKernelModule({&ByteStream}, {&BasisBits});
221
222    pablo_function_passes(function);
223    pablo::PabloKernel  ccck(iBuilder, "ccc", function, {});
224   
225    std::unique_ptr<Module> cccM = ccck.createKernelModule({&BasisBits}, {&CCResults});
226   
227    s2pk.addKernelDeclarations(mMod);
228    ccck.addKernelDeclarations(mMod);
229
230    Type * const size_ty = iBuilder->getSizeTy();
231    Type * const voidTy = Type::getVoidTy(mMod->getContext());
232    Type * const inputType = PointerType::get(ArrayType::get(ArrayType::get(mBitBlockType, 8), 1), 0);
233    Type * const outputType = PointerType::get(ArrayType::get(mBitBlockType, 4), 0);
234   
235    Function * const main = cast<Function>(mMod->getOrInsertFunction("Main", voidTy, inputType, size_ty, outputType, nullptr));
236    main->setCallingConv(CallingConv::C);
237    Function::arg_iterator args = main->arg_begin();
238   
239    Value * const inputStream = &*(args++);
240    inputStream->setName("input");
241    Value * const fileSize = &*(args++);
242    fileSize->setName("fileSize");
243    Value * const outputStream = &*(args++);
244    outputStream->setName("output");
245   
246    iBuilder->SetInsertPoint(BasicBlock::Create(mMod->getContext(), "entry", main,0));
247
248    ByteStream.setStreamSetBuffer(inputStream, fileSize);
249    BasisBits.allocateBuffer();
250    CCResults.setStreamSetBuffer(outputStream, fileSize);
251   
252    Value * s2pInstance = s2pk.createInstance({});
253    Value * cccInstance = ccck.createInstance({});
254   
255    generatePipelineLoop(iBuilder, {&s2pk, &ccck}, {s2pInstance, cccInstance}, fileSize);
256       
257    iBuilder->CreateRetVoid();
258   
259    Linker L(*mMod);
260    L.linkInModule(std::move(s2pM));
261    L.linkInModule(std::move(cccM));
262   
263    return main;
264}
265
266
267typedef void (*preprocessFunctionType)(char * byte_data, size_t filesize, char * output_data);
268static ExecutionEngine * preprocessEngine = nullptr;
269
270preprocessFunctionType preprocessCodeGen() {
271                           
272    Module * M = new Module("preprocess", getGlobalContext());
273    IDISA::IDISA_Builder * idb = IDISA::GetIDISA_Builder(M);
274
275    PabloFunction * function = PabloFunction::Create("preprocess", 8, 4);
276    cc::CC_Compiler ccc(*function);
277    pablo::PabloBuilder pBuilder(ccc.getBuilder().getPabloBlock(), ccc.getBuilder());
278
279    pablo::PabloAST * A = ccc.compileCC(re::makeCC(re::makeCC(0x41), re::makeCC(0x61)));
280    pablo::PabloAST * C = ccc.compileCC(re::makeCC(re::makeCC(0x43), re::makeCC(0x63)));
281    pablo::PabloAST * T = ccc.compileCC(re::makeCC(re::makeCC(0x54), re::makeCC(0x74)));
282    pablo::PabloAST * G = ccc.compileCC(re::makeCC(re::makeCC(0x47), re::makeCC(0x67)));
283
284    function->setResult(0, pBuilder.createAssign("A", A));
285    function->setResult(1, pBuilder.createAssign("C", C));
286    function->setResult(2, pBuilder.createAssign("T", T));
287    function->setResult(3, pBuilder.createAssign("G", G));
288
289    llvm::Function * main_IR = preprocessPipeline(M, idb, function);
290
291    preprocessEngine = JIT_to_ExecutionEngine(M);
292   
293    preprocessEngine->finalizeObject();
294
295    delete idb;
296    return reinterpret_cast<preprocessFunctionType>(preprocessEngine->getPointerToFunction(main_IR));
297}
298
299typedef void (*editdFunctionType)(char * byte_data, size_t filesize);
300static ExecutionEngine * editdEngine = nullptr;
301
302editdFunctionType editdCodeGen() {
303                           
304    Module * M = new Module("editd", getGlobalContext());
305    IDISA::IDISA_Builder * idb = IDISA::GetIDISA_Builder(M);
306
307    PabloFunction * function = PabloFunction::Create("editd", 4, editDistance+1);
308    pablo::PabloBuilder main (function->getEntryBlock());
309
310    const PabloType * streamType = getPabloType(PabloType::Stream, 1);
311
312    std::vector<pablo::Var *>   basisBits(4);
313    function->setParameter(0, basisBits[0] = main.createVar("PatA", streamType));
314    function->setParameter(1, basisBits[1] = main.createVar("PatC", streamType));
315    function->setParameter(2, basisBits[2] = main.createVar("PatT", streamType));
316    function->setParameter(3, basisBits[3] = main.createVar("PatG", streamType));
317
318    re::Pattern_Compiler pattern_compiler(*function);
319    pattern_compiler.compile(pattVector, main, basisBits, editDistance, optPosition, stepSize);
320
321    llvm::Function * main_IR = editdPipeline(M, idb, function);
322
323    editdEngine = JIT_to_ExecutionEngine(M);
324   
325    editdEngine->finalizeObject();
326
327    delete idb;
328    return reinterpret_cast<editdFunctionType>(editdEngine->getPointerToFunction(main_IR));
329}
330
331char * preprocess(preprocessFunctionType fn_ptr, int & size) {
332    std::string fileName = inputFiles[0];
333    size_t fileSize;
334    char * fileBuffer;
335   
336    const boost::filesystem::path file(fileName);
337    if (exists(file)) {
338        if (is_directory(file)) {
339            exit(0);
340        }
341    } else {
342        std::cerr << "Error: cannot open " << fileName << " for processing. Skipped.\n";
343        exit(0);
344    }
345   
346    fileSize = file_size(file);
347    boost::iostreams::mapped_file_source mappedFile;
348    if (fileSize == 0) {
349        fileBuffer = nullptr;
350    }
351    else {
352        try {
353            mappedFile.open(fileName);
354        } catch (std::exception &e) {
355            std::cerr << "Error: Boost mmap of " << fileName << ": " << e.what() << std::endl;
356            exit(0);
357        }
358        fileBuffer = const_cast<char *>(mappedFile.data());
359    }
360    char * chStream = (char *) aligned_alloc(32, fileSize);
361    fn_ptr(fileBuffer, fileSize, chStream);
362    size = fileSize;
363
364    mappedFile.close();
365
366    return chStream;
367   
368}
369
370void editd(editdFunctionType fn_ptr, char * chStream, int size) {
371 
372    if (size == 0) {
373        chStream = nullptr;
374    }
375
376    fn_ptr(chStream, size);
377   
378}
379
380int main(int argc, char *argv[]) {
381
382    cl::ParseCommandLineOptions(argc, argv);
383
384    int pattern_segs = 0;
385    int total_len = 0;
386
387    get_editd_pattern(pattern_segs, total_len);
388 
389    preprocessFunctionType preprocess_ptr = preprocessCodeGen();
390    int size = 0;
391    char * chStream = preprocess(preprocess_ptr, size);
392   
393    editdFunctionType editd_ptr = editdCodeGen();
394    editd(editd_ptr, chStream, size);
395
396    if(pattVector.size()>1)
397        run_second_filter(pattern_segs, total_len, 0.15);
398
399    delete editdEngine;
400    delete preprocessEngine;
401
402    return 0;
403}
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
Note: See TracBrowser for help on using the repository browser.