Ignore:
Timestamp:
Feb 13, 2017, 2:50:03 PM (2 years ago)
Author:
lindanl
Message:

Extend icgrep to use multiple groups of threads on the GPU.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • icGREP/icgrep-devel/icgrep/IR_Gen/CudaDriver.h

    r5294 r5314  
    66#include <unistd.h>
    77#include <cassert>
     8#include <toolchain.h>
    89#include "cuda.h"
    910
     
    1516
    1617/// main - Program entry point
    17 ulong * RunPTX(std::string PTXFilename, char * fileBuffer, ulong filesize, bool CountOnly) {
     18ulong * RunPTX(std::string PTXFilename, char * fileBuffer, ulong filesize, bool CountOnly, std::vector<size_t> LFPositions, ulong * startPoints, ulong * accumBytes) {
    1819 
    1920  CUdevice    device;
     
    5960  // Device data
    6061  CUdeviceptr devBufferInput;
    61   CUdeviceptr devBufferSize;
     62  CUdeviceptr devStartPoints;
     63  CUdeviceptr devBufferSizes;
    6264  CUdeviceptr devBufferOutput;
    6365
    6466  int groupSize = GROUPTHREADS * sizeof(ulong) * 8;
    65   int groups = filesize/groupSize + 1;
    66   int bufferSize = groups * groupSize;
     67  const unsigned numOfGroups = codegen::GroupNum;
     68
     69  if(LFPositions.size() < numOfGroups){
     70    std::cerr << "Line Breaks less than " << numOfGroups << std::endl;
     71    exit(-1);
     72  }
     73
     74  unsigned avg = LFPositions.size()/numOfGroups;
     75  unsigned left = LFPositions.size()%numOfGroups;
     76
     77  size_t divPoints[numOfGroups + 1];
     78  size_t bufferSizes[numOfGroups];
     79  divPoints[0] = 0;
     80  startPoints[0] = 0;
     81  unsigned i = 1;
     82  unsigned pos = 0;
     83  while (i < numOfGroups){
     84    if (i < left)
     85      pos += avg + 1;
     86    else
     87      pos += avg;
     88
     89    divPoints[i] = LFPositions[pos]+1;
     90    bufferSizes[i-1] = divPoints[i]-divPoints[i-1];
     91    startPoints[i] = startPoints[i-1] + ((bufferSizes[i-1]-1)/groupSize+1)*groupSize;
     92
     93    i++;
     94  }
     95
     96  divPoints[numOfGroups] = filesize;
     97  bufferSizes[i-1] = divPoints[i]-divPoints[i-1];
     98  startPoints[i] = startPoints[i-1] + ((bufferSizes[i-1]-1)/groupSize+1)*groupSize;
     99   
     100  checkCudaErrors(cuMemAlloc(&devBufferInput, startPoints[numOfGroups]));
     101  checkCudaErrors(cuMemsetD8(devBufferInput,0,startPoints[numOfGroups]));
     102  checkCudaErrors(cuMemAlloc(&devStartPoints, sizeof(ulong) * (numOfGroups + 1)));
     103  checkCudaErrors(cuMemAlloc(&devBufferSizes, sizeof(ulong) * numOfGroups));
     104
    67105  int outputSize = 0;
    68 
    69   checkCudaErrors(cuMemAlloc(&devBufferInput, bufferSize));
    70   checkCudaErrors(cuMemAlloc(&devBufferSize, sizeof(ulong)));
    71106  if (CountOnly){
    72     outputSize = sizeof(ulong) * GROUPTHREADS;
     107    outputSize = sizeof(ulong) * GROUPTHREADS * numOfGroups;
    73108  }
    74109  else{
    75     outputSize = sizeof(ulong) * 2 * GROUPTHREADS * groups;
     110    outputSize = startPoints[numOfGroups]/4;
    76111  }
    77 
    78112  checkCudaErrors(cuMemAlloc(&devBufferOutput, outputSize));
    79113
    80114  //Copy from host to device
    81   checkCudaErrors(cuMemcpyHtoD(devBufferInput, fileBuffer, bufferSize));
    82   checkCudaErrors(cuMemcpyHtoD(devBufferSize, &filesize, sizeof(ulong)));
     115  for(unsigned i=0; i<numOfGroups; i++){
     116    checkCudaErrors(cuMemcpyHtoD(devBufferInput+startPoints[i], fileBuffer+divPoints[i], bufferSizes[i]));
     117  }
     118  checkCudaErrors(cuMemcpyHtoD(devStartPoints, startPoints, sizeof(ulong) * (numOfGroups + 1)));
     119  checkCudaErrors(cuMemcpyHtoD(devBufferSizes, bufferSizes, sizeof(ulong) * numOfGroups));
    83120
    84121  unsigned blockSizeX = GROUPTHREADS;
    85122  unsigned blockSizeY = 1;
    86123  unsigned blockSizeZ = 1;
    87   unsigned gridSizeX  = 1;
     124  unsigned gridSizeX  = numOfGroups;
    88125  unsigned gridSizeY  = 1;
    89126  unsigned gridSizeZ  = 1;
    90127
    91128  // Kernel parameters
    92   void *KernelParams[] = { &devBufferInput, &devBufferSize, &devBufferOutput};
     129  void *KernelParams[] = { &devBufferInput, &devStartPoints, &devBufferSizes, &devBufferOutput};
    93130
    94131  // std::cout << "Launching kernel\n";
    95 
    96   // Kernel launch
    97   checkCudaErrors(cuLaunchKernel(function, gridSizeX, gridSizeY, gridSizeZ,
    98                                  blockSizeX, blockSizeY, blockSizeZ,
    99                                  0, NULL, KernelParams, NULL));
    100   // std::cout << "kernel success.\n";
    101   // Retrieve device data
    102132
    103133  CUevent start;
     
    108138  cuEventRecord(start,0);
    109139
    110   ulong * matchRslt;
    111   int ret = posix_memalign((void**)&matchRslt, 32, outputSize);
    112   if (ret) {
    113     std::cerr << "Cannot allocate memory for output.\n";
    114     exit(-1);
    115   }
    116   checkCudaErrors(cuMemcpyDtoH(matchRslt, devBufferOutput, outputSize));
    117   if (CountOnly){
    118     int count = 0;
    119     for (unsigned i = 0; i < GROUPTHREADS; ++i) {
    120       count += matchRslt[i];
    121     }
    122     std::cout << count << "\n";
    123   }
     140  // Kernel launch
     141  checkCudaErrors(cuLaunchKernel(function, gridSizeX, gridSizeY, gridSizeZ,
     142                                 blockSizeX, blockSizeY, blockSizeZ,
     143                                 0, NULL, KernelParams, NULL));
     144  // std::cout << "kernel success.\n";
    124145
    125146  cuEventCreate(&stop, CU_EVENT_BLOCKING_SYNC);
     
    130151  printf("GPU Kernel time : %f ms\n" ,elapsedTime);
    131152
     153  // Retrieve device data
     154  ulong * matchRslt;
     155  if (posix_memalign((void**)&matchRslt, 32, outputSize)) {
     156    std::cerr << "Cannot allocate memory for output.\n";
     157    exit(-1);
     158  }
     159  checkCudaErrors(cuMemcpyDtoH(matchRslt, devBufferOutput, outputSize));
     160
     161  if (CountOnly){
     162    int count = 0;
     163    for (unsigned i = 0; i < GROUPTHREADS * numOfGroups; ++i) {
     164      // std::cout << i << ":" << matchRslt[i] << "\n";
     165      count += matchRslt[i];
     166    }
     167    std::cout << count << "\n";
     168  }
     169  else{
     170    for(unsigned i=0; i<=numOfGroups; i++){
     171      accumBytes[i] = startPoints[i] - divPoints[i];
     172    }
     173  }
     174
    132175
    133176  // Clean-up
    134177  checkCudaErrors(cuMemFree(devBufferInput));
    135   checkCudaErrors(cuMemFree(devBufferSize));
     178  checkCudaErrors(cuMemFree(devStartPoints));
     179  checkCudaErrors(cuMemFree(devBufferSizes));
    136180  checkCudaErrors(cuMemFree(devBufferOutput));
    137181  checkCudaErrors(cuModuleUnload(cudaModule));
Note: See TracChangeset for help on using the changeset viewer.