Ignore:
Timestamp:
Feb 13, 2017, 2:50:03 PM (3 years ago)
Author:
lindanl
Message:

Extend icgrep to use multiple groups of threads on the GPU.

Location:
icGREP/icgrep-devel/icgrep
Files:
5 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/CMakeLists.txt

    r5298 r5314  
    7575target_link_libraries (RegExpCompiler RegExpADT)
    7676
    77 add_executable(icgrep icgrep.cpp toolchain.cpp grep_engine.cpp kernels/scanmatchgen.cpp kernels/cc_kernel.cpp)
     77add_executable(icgrep icgrep.cpp toolchain.cpp grep_engine.cpp kernels/scanmatchgen.cpp kernels/cc_kernel.cpp kernels/cc_scan_kernel.cpp)
    7878add_executable(u8u16 u8u16.cpp toolchain.cpp kernels/p2s_kernel.cpp kernels/deletion.cpp kernels/stdout_kernel.cpp)
    7979add_executable(base64 base64.cpp kernels/radix64.cpp toolchain.cpp kernels/p2s_kernel.cpp kernels/deletion.cpp kernels/stdout_kernel.cpp)
  • icGREP/icgrep-devel/icgrep/IR_Gen/CudaDriver.h

    r5294 r5314  
    66#include <unistd.h>
    77#include <cassert>
     8#include <toolchain.h>
    89#include "cuda.h"
    910
     
    1516
    1617/// main - Program entry point
    17 ulong * RunPTX(std::string PTXFilename, char * fileBuffer, ulong filesize, bool CountOnly) {
     18ulong * RunPTX(std::string PTXFilename, char * fileBuffer, ulong filesize, bool CountOnly, std::vector<size_t> LFPositions, ulong * startPoints, ulong * accumBytes) {
    1819 
    1920  CUdevice    device;
     
    5960  // Device data
    6061  CUdeviceptr devBufferInput;
    61   CUdeviceptr devBufferSize;
     62  CUdeviceptr devStartPoints;
     63  CUdeviceptr devBufferSizes;
    6264  CUdeviceptr devBufferOutput;
    6365
    6466  int groupSize = GROUPTHREADS * sizeof(ulong) * 8;
    65   int groups = filesize/groupSize + 1;
    66   int bufferSize = groups * groupSize;
     67  const unsigned numOfGroups = codegen::GroupNum;
     68
     69  if(LFPositions.size() < numOfGroups){
     70    std::cerr << "Line Breaks less than " << numOfGroups << std::endl;
     71    exit(-1);
     72  }
     73
     74  unsigned avg = LFPositions.size()/numOfGroups;
     75  unsigned left = LFPositions.size()%numOfGroups;
     76
     77  size_t divPoints[numOfGroups + 1];
     78  size_t bufferSizes[numOfGroups];
     79  divPoints[0] = 0;
     80  startPoints[0] = 0;
     81  unsigned i = 1;
     82  unsigned pos = 0;
     83  while (i < numOfGroups){
     84    if (i < left)
     85      pos += avg + 1;
     86    else
     87      pos += avg;
     88
     89    divPoints[i] = LFPositions[pos]+1;
     90    bufferSizes[i-1] = divPoints[i]-divPoints[i-1];
     91    startPoints[i] = startPoints[i-1] + ((bufferSizes[i-1]-1)/groupSize+1)*groupSize;
     92
     93    i++;
     94  }
     95
     96  divPoints[numOfGroups] = filesize;
     97  bufferSizes[i-1] = divPoints[i]-divPoints[i-1];
     98  startPoints[i] = startPoints[i-1] + ((bufferSizes[i-1]-1)/groupSize+1)*groupSize;
     99   
     100  checkCudaErrors(cuMemAlloc(&devBufferInput, startPoints[numOfGroups]));
     101  checkCudaErrors(cuMemsetD8(devBufferInput,0,startPoints[numOfGroups]));
     102  checkCudaErrors(cuMemAlloc(&devStartPoints, sizeof(ulong) * (numOfGroups + 1)));
     103  checkCudaErrors(cuMemAlloc(&devBufferSizes, sizeof(ulong) * numOfGroups));
     104
    67105  int outputSize = 0;
    68 
    69   checkCudaErrors(cuMemAlloc(&devBufferInput, bufferSize));
    70   checkCudaErrors(cuMemAlloc(&devBufferSize, sizeof(ulong)));
    71106  if (CountOnly){
    72     outputSize = sizeof(ulong) * GROUPTHREADS;
     107    outputSize = sizeof(ulong) * GROUPTHREADS * numOfGroups;
    73108  }
    74109  else{
    75     outputSize = sizeof(ulong) * 2 * GROUPTHREADS * groups;
     110    outputSize = startPoints[numOfGroups]/4;
    76111  }
    77 
    78112  checkCudaErrors(cuMemAlloc(&devBufferOutput, outputSize));
    79113
    80114  //Copy from host to device
    81   checkCudaErrors(cuMemcpyHtoD(devBufferInput, fileBuffer, bufferSize));
    82   checkCudaErrors(cuMemcpyHtoD(devBufferSize, &filesize, sizeof(ulong)));
     115  for(unsigned i=0; i<numOfGroups; i++){
     116    checkCudaErrors(cuMemcpyHtoD(devBufferInput+startPoints[i], fileBuffer+divPoints[i], bufferSizes[i]));
     117  }
     118  checkCudaErrors(cuMemcpyHtoD(devStartPoints, startPoints, sizeof(ulong) * (numOfGroups + 1)));
     119  checkCudaErrors(cuMemcpyHtoD(devBufferSizes, bufferSizes, sizeof(ulong) * numOfGroups));
    83120
    84121  unsigned blockSizeX = GROUPTHREADS;
    85122  unsigned blockSizeY = 1;
    86123  unsigned blockSizeZ = 1;
    87   unsigned gridSizeX  = 1;
     124  unsigned gridSizeX  = numOfGroups;
    88125  unsigned gridSizeY  = 1;
    89126  unsigned gridSizeZ  = 1;
    90127
    91128  // Kernel parameters
    92   void *KernelParams[] = { &devBufferInput, &devBufferSize, &devBufferOutput};
     129  void *KernelParams[] = { &devBufferInput, &devStartPoints, &devBufferSizes, &devBufferOutput};
    93130
    94131  // std::cout << "Launching kernel\n";
    95 
    96   // Kernel launch
    97   checkCudaErrors(cuLaunchKernel(function, gridSizeX, gridSizeY, gridSizeZ,
    98                                  blockSizeX, blockSizeY, blockSizeZ,
    99                                  0, NULL, KernelParams, NULL));
    100   // std::cout << "kernel success.\n";
    101   // Retrieve device data
    102132
    103133  CUevent start;
     
    108138  cuEventRecord(start,0);
    109139
    110   ulong * matchRslt;
    111   int ret = posix_memalign((void**)&matchRslt, 32, outputSize);
    112   if (ret) {
    113     std::cerr << "Cannot allocate memory for output.\n";
    114     exit(-1);
    115   }
    116   checkCudaErrors(cuMemcpyDtoH(matchRslt, devBufferOutput, outputSize));
    117   if (CountOnly){
    118     int count = 0;
    119     for (unsigned i = 0; i < GROUPTHREADS; ++i) {
    120       count += matchRslt[i];
    121     }
    122     std::cout << count << "\n";
    123   }
     140  // Kernel launch
     141  checkCudaErrors(cuLaunchKernel(function, gridSizeX, gridSizeY, gridSizeZ,
     142                                 blockSizeX, blockSizeY, blockSizeZ,
     143                                 0, NULL, KernelParams, NULL));
     144  // std::cout << "kernel success.\n";
    124145
    125146  cuEventCreate(&stop, CU_EVENT_BLOCKING_SYNC);
     
    130151  printf("GPU Kernel time : %f ms\n" ,elapsedTime);
    131152
     153  // Retrieve device data
     154  ulong * matchRslt;
     155  if (posix_memalign((void**)&matchRslt, 32, outputSize)) {
     156    std::cerr << "Cannot allocate memory for output.\n";
     157    exit(-1);
     158  }
     159  checkCudaErrors(cuMemcpyDtoH(matchRslt, devBufferOutput, outputSize));
     160
     161  if (CountOnly){
     162    int count = 0;
     163    for (unsigned i = 0; i < GROUPTHREADS * numOfGroups; ++i) {
     164      // std::cout << i << ":" << matchRslt[i] << "\n";
     165      count += matchRslt[i];
     166    }
     167    std::cout << count << "\n";
     168  }
     169  else{
     170    for(unsigned i=0; i<=numOfGroups; i++){
     171      accumBytes[i] = startPoints[i] - divPoints[i];
     172    }
     173  }
     174
    132175
    133176  // Clean-up
    134177  checkCudaErrors(cuMemFree(devBufferInput));
    135   checkCudaErrors(cuMemFree(devBufferSize));
     178  checkCudaErrors(cuMemFree(devStartPoints));
     179  checkCudaErrors(cuMemFree(devBufferSizes));
    136180  checkCudaErrors(cuMemFree(devBufferOutput));
    137181  checkCudaErrors(cuModuleUnload(cudaModule));
  • icGREP/icgrep-devel/icgrep/grep_engine.cpp

    r5310 r5314  
    3232#ifdef CUDA_ENABLED
    3333#include <IR_Gen/CudaDriver.h>
     34#include "preprocess.cpp"
    3435#endif
    3536#include <util/aligned_allocator.h>
     
    6263static re::CC * parsedCodePointSet = nullptr;
    6364static std::vector<std::string> parsedPropertyValues;
     65
     66#ifdef CUDA_ENABLED
     67int blockNo = 0;
     68size_t * startPoints = nullptr;
     69size_t * accumBytes = nullptr;
     70#endif
    6471
    6572void GrepEngine::doGrep(const std::string & fileName, const int fileIdx, bool CountOnly, std::vector<size_t> & total_CountOnly, bool UTF_16) {
     
    8188            boost::iostreams::mapped_file_source source(fileName, fileSize, 0);
    8289            char * fileBuffer = const_cast<char *>(source.data());
     90           
    8391#ifdef CUDA_ENABLED 
    8492            if(codegen::NVPTX){
    85                 ulong * rslt = RunPTX(PTXFilename, fileBuffer, fileSize, CountOnly);
     93                codegen::BlockSize = 128;
     94                std::vector<size_t> LFPositions = preprocess(fileBuffer, fileSize);
     95
     96                const unsigned numOfGroups = codegen::GroupNum;
     97                if (posix_memalign((void**)&startPoints, 8, (numOfGroups+1)*sizeof(size_t)) ||
     98                    posix_memalign((void**)&accumBytes, 8, (numOfGroups+1)*sizeof(size_t))) {
     99                    std::cerr << "Cannot allocate memory for startPoints or accumBytes.\n";
     100                    exit(-1);
     101                }
     102
     103                ulong * rslt = RunPTX(PTXFilename, fileBuffer, fileSize, CountOnly, LFPositions, startPoints, accumBytes);
    86104                if (CountOnly){
    87105                    exit(0);
    88106                }
    89107                else{
    90                     mGrepFunction_CPU((char *)rslt, fileBuffer, fileSize, fileIdx);
     108                    size_t intputSize = startPoints[numOfGroups]-accumBytes[numOfGroups]+accumBytes[numOfGroups-1];
     109                    mGrepFunction_CPU((char *)rslt, fileBuffer, intputSize, fileIdx);
    91110                    return;
    92111                }
     
    121140}
    122141
     142
    123143Function * generateGPUKernel(Module * m, IDISA::IDISA_Builder * iBuilder, bool CountOnly){
    124144    Type * const int64ty = iBuilder->getInt64Ty();
    125     Type * const inputType = PointerType::get(int64ty, 1);
     145    Type * const size_ty = iBuilder->getSizeTy();
     146    Type * const int32ty = iBuilder->getInt32Ty();
     147    Type * const sizeTyPtr = PointerType::get(size_ty, 1);
     148    Type * const int64tyPtr = PointerType::get(int64ty, 1);
     149    Type * const inputType = PointerType::get(iBuilder->getInt8Ty(), 1);
    126150    Type * const resultTy = iBuilder->getVoidTy();
    127     Function * kernelFunc = cast<Function>(m->getOrInsertFunction("GPU_Main", resultTy, inputType, inputType, inputType, nullptr));
     151    Function * kernelFunc = cast<Function>(m->getOrInsertFunction("GPU_Main", resultTy, inputType, sizeTyPtr, sizeTyPtr, int64tyPtr, nullptr));
    128152    kernelFunc->setCallingConv(CallingConv::C);
    129153    Function::arg_iterator args = kernelFunc->arg_begin();
     
    131155    Value * const inputPtr = &*(args++);
    132156    inputPtr->setName("inputPtr");
    133     Value * const bufferSizePtr = &*(args++);
    134     bufferSizePtr->setName("bufferSizePtr");
     157    Value * const startPointsPtr = &*(args++);
     158    startPointsPtr->setName("startPointsPtr");
     159    Value * const bufferSizesPtr = &*(args++);
     160    bufferSizesPtr->setName("bufferSizesPtr");
    135161    Value * const outputPtr = &*(args++);
    136162    outputPtr->setName("resultPtr");
     
    140166
    141167    Function * tidFunc = m->getFunction("llvm.nvvm.read.ptx.sreg.tid.x");
    142     Value * id = iBuilder->CreateCall(tidFunc);
     168    Value * tid = iBuilder->CreateCall(tidFunc);
     169    Function * bidFunc = cast<Function>(m->getOrInsertFunction("llvm.nvvm.read.ptx.sreg.ctaid.x", int32ty, nullptr));
     170    Value * bid = iBuilder->CreateCall(bidFunc);
     171
     172    Value * startPoint = iBuilder->CreateLoad(iBuilder->CreateGEP(startPointsPtr, bid));
    143173
    144174    Function * mainFunc = m->getFunction("Main");
     175    Value * startBlock = iBuilder->CreateUDiv(startPoint, ConstantInt::get(int64ty, iBuilder->getBitBlockWidth()));
    145176    Type * const inputStreamType = PointerType::get(ArrayType::get(ArrayType::get(iBuilder->getBitBlockType(), 8), 1), 1);   
    146     Value * inputStreamPtr = iBuilder->CreateBitCast(inputPtr, inputStreamType);
    147     Value * inputStream = iBuilder->CreateGEP(inputStreamPtr, id);
    148 
    149     Value * bufferSize = iBuilder->CreateLoad(bufferSizePtr);
     177    Value * inputStreamPtr = iBuilder->CreateGEP(iBuilder->CreateBitCast(inputPtr, inputStreamType), startBlock);
     178    Value * inputStream = iBuilder->CreateGEP(inputStreamPtr, tid);
     179    Value * bufferSize = iBuilder->CreateLoad(iBuilder->CreateGEP(bufferSizesPtr, bid));
     180
    150181    if (CountOnly){
    151         Value * outputThreadPtr = iBuilder->CreateGEP(outputPtr, id);
     182        Value * strideBlocks = ConstantInt::get(int32ty, iBuilder->getStride() / iBuilder->getBitBlockWidth());
     183        Value * outputThreadPtr = iBuilder->CreateGEP(outputPtr, iBuilder->CreateAdd(iBuilder->CreateMul(bid, strideBlocks), tid));
    152184        Value * result = iBuilder->CreateCall(mainFunc, {inputStream, bufferSize});
    153185        iBuilder->CreateStore(result, outputThreadPtr);
     
    155187    else {
    156188        Type * const outputStremType = PointerType::get(ArrayType::get(iBuilder->getBitBlockType(), 2), 1);
    157         Value * outputStreamPtr = iBuilder->CreateBitCast(outputPtr, outputStremType);
    158         Value * outputStream = iBuilder->CreateGEP(outputStreamPtr, id);
     189        Value * outputStreamPtr = iBuilder->CreateGEP(iBuilder->CreateBitCast(outputPtr, outputStremType), startBlock);
     190        Value * outputStream = iBuilder->CreateGEP(outputStreamPtr, tid);
    159191        iBuilder->CreateCall(mainFunc, {inputStream, bufferSize, outputStream});
    160192    }   
     
    184216    const unsigned segmentSize = codegen::SegmentSize;
    185217
    186     ExternalFileBuffer MatchResults(iBuilder, iBuilder->getStreamSetTy( 2, 1));
     218    ExternalFileBuffer MatchResults(iBuilder, iBuilder->getStreamSetTy(2, 1));
    187219    MatchResults.setStreamSetBuffer(rsltStream, fileSize);
    188220
     
    440472extern "C" {
    441473    void wrapped_report_match(size_t lineNum, size_t line_start, size_t line_end, const char * buffer, size_t filesize, int fileIdx) {
     474
     475#ifdef CUDA_ENABLED
     476    if (codegen::NVPTX){
     477        while(line_start>startPoints[blockNo]) blockNo++;
     478        line_start -= accumBytes[blockNo-1];
     479        line_end -= accumBytes[blockNo-1];
     480    }
     481#endif
    442482        int index = isUTF_16 ? 2 : 1;
    443483        int idx = fileIdx;
  • icGREP/icgrep-devel/icgrep/toolchain.cpp

    r5295 r5314  
    7373#ifdef CUDA_ENABLED
    7474bool NVPTX;
     75int GroupNum;
    7576static cl::opt<bool> USENVPTX("NVPTX", cl::desc("Run on GPU only."), cl::init(false));
     77static cl::opt<int, true> GroupNumOption("group-num", cl::location(GroupNum), cl::desc("NUmber of groups declared on GPU"), cl::value_desc("positive integer"), cl::init(256));
    7678#endif
    7779
  • icGREP/icgrep-devel/icgrep/toolchain.h

    r5295 r5314  
    3434#ifdef CUDA_ENABLED
    3535extern bool NVPTX;
     36extern int GroupNum;
    3637#endif
    3738}
Note: See TracChangeset for help on using the changeset viewer.