Ignore:
Timestamp:
Feb 13, 2017, 2:50:03 PM (2 years ago)
Author:
lindanl
Message:

Extend icgrep to use multiple groups of threads on the GPU.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/grep_engine.cpp

    r5310 r5314  
    3232#ifdef CUDA_ENABLED
    3333#include <IR_Gen/CudaDriver.h>
     34#include "preprocess.cpp"
    3435#endif
    3536#include <util/aligned_allocator.h>
     
    6263static re::CC * parsedCodePointSet = nullptr;
    6364static std::vector<std::string> parsedPropertyValues;
     65
     66#ifdef CUDA_ENABLED
     67int blockNo = 0;
     68size_t * startPoints = nullptr;
     69size_t * accumBytes = nullptr;
     70#endif
    6471
    6572void GrepEngine::doGrep(const std::string & fileName, const int fileIdx, bool CountOnly, std::vector<size_t> & total_CountOnly, bool UTF_16) {
     
    8188            boost::iostreams::mapped_file_source source(fileName, fileSize, 0);
    8289            char * fileBuffer = const_cast<char *>(source.data());
     90           
    8391#ifdef CUDA_ENABLED 
    8492            if(codegen::NVPTX){
    85                 ulong * rslt = RunPTX(PTXFilename, fileBuffer, fileSize, CountOnly);
     93                codegen::BlockSize = 128;
     94                std::vector<size_t> LFPositions = preprocess(fileBuffer, fileSize);
     95
     96                const unsigned numOfGroups = codegen::GroupNum;
     97                if (posix_memalign((void**)&startPoints, 8, (numOfGroups+1)*sizeof(size_t)) ||
     98                    posix_memalign((void**)&accumBytes, 8, (numOfGroups+1)*sizeof(size_t))) {
     99                    std::cerr << "Cannot allocate memory for startPoints or accumBytes.\n";
     100                    exit(-1);
     101                }
     102
     103                ulong * rslt = RunPTX(PTXFilename, fileBuffer, fileSize, CountOnly, LFPositions, startPoints, accumBytes);
    86104                if (CountOnly){
    87105                    exit(0);
    88106                }
    89107                else{
    90                     mGrepFunction_CPU((char *)rslt, fileBuffer, fileSize, fileIdx);
     108                    size_t intputSize = startPoints[numOfGroups]-accumBytes[numOfGroups]+accumBytes[numOfGroups-1];
     109                    mGrepFunction_CPU((char *)rslt, fileBuffer, intputSize, fileIdx);
    91110                    return;
    92111                }
     
    121140}
    122141
     142
    123143Function * generateGPUKernel(Module * m, IDISA::IDISA_Builder * iBuilder, bool CountOnly){
    124144    Type * const int64ty = iBuilder->getInt64Ty();
    125     Type * const inputType = PointerType::get(int64ty, 1);
     145    Type * const size_ty = iBuilder->getSizeTy();
     146    Type * const int32ty = iBuilder->getInt32Ty();
     147    Type * const sizeTyPtr = PointerType::get(size_ty, 1);
     148    Type * const int64tyPtr = PointerType::get(int64ty, 1);
     149    Type * const inputType = PointerType::get(iBuilder->getInt8Ty(), 1);
    126150    Type * const resultTy = iBuilder->getVoidTy();
    127     Function * kernelFunc = cast<Function>(m->getOrInsertFunction("GPU_Main", resultTy, inputType, inputType, inputType, nullptr));
     151    Function * kernelFunc = cast<Function>(m->getOrInsertFunction("GPU_Main", resultTy, inputType, sizeTyPtr, sizeTyPtr, int64tyPtr, nullptr));
    128152    kernelFunc->setCallingConv(CallingConv::C);
    129153    Function::arg_iterator args = kernelFunc->arg_begin();
     
    131155    Value * const inputPtr = &*(args++);
    132156    inputPtr->setName("inputPtr");
    133     Value * const bufferSizePtr = &*(args++);
    134     bufferSizePtr->setName("bufferSizePtr");
     157    Value * const startPointsPtr = &*(args++);
     158    startPointsPtr->setName("startPointsPtr");
     159    Value * const bufferSizesPtr = &*(args++);
     160    bufferSizesPtr->setName("bufferSizesPtr");
    135161    Value * const outputPtr = &*(args++);
    136162    outputPtr->setName("resultPtr");
     
    140166
    141167    Function * tidFunc = m->getFunction("llvm.nvvm.read.ptx.sreg.tid.x");
    142     Value * id = iBuilder->CreateCall(tidFunc);
     168    Value * tid = iBuilder->CreateCall(tidFunc);
     169    Function * bidFunc = cast<Function>(m->getOrInsertFunction("llvm.nvvm.read.ptx.sreg.ctaid.x", int32ty, nullptr));
     170    Value * bid = iBuilder->CreateCall(bidFunc);
     171
     172    Value * startPoint = iBuilder->CreateLoad(iBuilder->CreateGEP(startPointsPtr, bid));
    143173
    144174    Function * mainFunc = m->getFunction("Main");
     175    Value * startBlock = iBuilder->CreateUDiv(startPoint, ConstantInt::get(int64ty, iBuilder->getBitBlockWidth()));
    145176    Type * const inputStreamType = PointerType::get(ArrayType::get(ArrayType::get(iBuilder->getBitBlockType(), 8), 1), 1);   
    146     Value * inputStreamPtr = iBuilder->CreateBitCast(inputPtr, inputStreamType);
    147     Value * inputStream = iBuilder->CreateGEP(inputStreamPtr, id);
    148 
    149     Value * bufferSize = iBuilder->CreateLoad(bufferSizePtr);
     177    Value * inputStreamPtr = iBuilder->CreateGEP(iBuilder->CreateBitCast(inputPtr, inputStreamType), startBlock);
     178    Value * inputStream = iBuilder->CreateGEP(inputStreamPtr, tid);
     179    Value * bufferSize = iBuilder->CreateLoad(iBuilder->CreateGEP(bufferSizesPtr, bid));
     180
    150181    if (CountOnly){
    151         Value * outputThreadPtr = iBuilder->CreateGEP(outputPtr, id);
     182        Value * strideBlocks = ConstantInt::get(int32ty, iBuilder->getStride() / iBuilder->getBitBlockWidth());
     183        Value * outputThreadPtr = iBuilder->CreateGEP(outputPtr, iBuilder->CreateAdd(iBuilder->CreateMul(bid, strideBlocks), tid));
    152184        Value * result = iBuilder->CreateCall(mainFunc, {inputStream, bufferSize});
    153185        iBuilder->CreateStore(result, outputThreadPtr);
     
    155187    else {
    156188        Type * const outputStremType = PointerType::get(ArrayType::get(iBuilder->getBitBlockType(), 2), 1);
    157         Value * outputStreamPtr = iBuilder->CreateBitCast(outputPtr, outputStremType);
    158         Value * outputStream = iBuilder->CreateGEP(outputStreamPtr, id);
     189        Value * outputStreamPtr = iBuilder->CreateGEP(iBuilder->CreateBitCast(outputPtr, outputStremType), startBlock);
     190        Value * outputStream = iBuilder->CreateGEP(outputStreamPtr, tid);
    159191        iBuilder->CreateCall(mainFunc, {inputStream, bufferSize, outputStream});
    160192    }   
     
    184216    const unsigned segmentSize = codegen::SegmentSize;
    185217
    186     ExternalFileBuffer MatchResults(iBuilder, iBuilder->getStreamSetTy( 2, 1));
     218    ExternalFileBuffer MatchResults(iBuilder, iBuilder->getStreamSetTy(2, 1));
    187219    MatchResults.setStreamSetBuffer(rsltStream, fileSize);
    188220
     
    440472extern "C" {
    441473    void wrapped_report_match(size_t lineNum, size_t line_start, size_t line_end, const char * buffer, size_t filesize, int fileIdx) {
     474
     475#ifdef CUDA_ENABLED
     476    if (codegen::NVPTX){
     477        while(line_start>startPoints[blockNo]) blockNo++;
     478        line_start -= accumBytes[blockNo-1];
     479        line_end -= accumBytes[blockNo-1];
     480    }
     481#endif
    442482        int index = isUTF_16 ? 2 : 1;
    443483        int idx = fileIdx;
Note: See TracChangeset for help on using the changeset viewer.