source: icGREP/icgrep-devel/icgrep/IR_Gen/CudaDriver.h @ 5486

Last change on this file since 5486 was 5458, checked in by lindanl, 2 years ago

Add NVPTX driver.

File size: 5.6 KB
Line 
1#include <string>
2#include <iostream>
3#include <fstream>
4#include <sys/stat.h>
5#include <fcntl.h>
6#include <unistd.h>
7#include <cassert>
8#include <toolchain/toolchain.h>
9#include "cuda.h"
10
// Number of threads per GPU group. RunPTX uses it as the X dimension of each
// thread block and when sizing the per-group input padding and the
// count-only output buffer.
#define GROUPTHREADS 64
12
13void checkCudaErrors(CUresult err) {
14  assert(err == CUDA_SUCCESS);
15}
16
17/// main - Program entry point
18ulong * RunPTX(std::string PTXFilename, char * fileBuffer, ulong filesize, bool CountOnly, std::vector<size_t> LFPositions, ulong * startPoints, ulong * accumBytes) {
19 
20  CUdevice    device;
21  CUmodule    cudaModule;
22  CUcontext   context;
23  CUfunction  function;
24  int         devCount;
25
26  // CUDA initialization
27  checkCudaErrors(cuInit(0));
28  checkCudaErrors(cuDeviceGetCount(&devCount));
29  checkCudaErrors(cuDeviceGet(&device, 0));
30
31  char name[128];
32  checkCudaErrors(cuDeviceGetName(name, 128, device));
33  // std::cout << "Using CUDA Device [0]: " << name << "\n";
34
35  int devMajor, devMinor;
36  checkCudaErrors(cuDeviceComputeCapability(&devMajor, &devMinor, device));
37  // std::cout << "Device Compute Capability: " << devMajor << "." << devMinor << "\n";
38  if (devMajor < 2) {
39    std::cerr << "ERROR: Device 0 is not SM 2.0 or greater\n";
40    exit(-1);
41  }
42
43  std::ifstream t(PTXFilename);
44  if (!t.is_open()) {
45    std::cerr << "Error: cannot open " << PTXFilename << " for processing. Skipped.\n";
46    exit(-1);
47  }
48 
49  std::string ptx_str((std::istreambuf_iterator<char>(t)), std::istreambuf_iterator<char>());
50
51  // Create driver context
52  checkCudaErrors(cuCtxCreate(&context, 0, device));
53
54  // Create module for object
55  checkCudaErrors(cuModuleLoadDataEx(&cudaModule, ptx_str.c_str(), 0, 0, 0));
56
57  // Get kernel function
58  checkCudaErrors(cuModuleGetFunction(&function, cudaModule, "Main"));
59
60  // Device data
61  CUdeviceptr devBufferInput;
62  CUdeviceptr devStartPoints;
63  CUdeviceptr devBufferSizes;
64  CUdeviceptr devBufferOutput;
65
66  int groupSize = GROUPTHREADS * sizeof(ulong) * 8;
67  const unsigned numOfGroups = codegen::GroupNum;
68
69  if(LFPositions.size() < numOfGroups){
70    std::cerr << "Number of line Breaks:" << LFPositions.size() << std::endl;
71    std::cerr << "Number of GPU groups:" << numOfGroups << std::endl;
72    std::cerr << "Line breaks must be more than GPU groups. Use -group-num option to change the group size." << std::endl;
73    exit(-1);
74  }
75
76  unsigned avg = LFPositions.size()/numOfGroups;
77  unsigned left = LFPositions.size()%numOfGroups;
78
79  size_t divPoints[numOfGroups + 1];
80  size_t bufferSizes[numOfGroups];
81  divPoints[0] = 0;
82  startPoints[0] = 0;
83  unsigned i = 1;
84  unsigned pos = 0;
85  while (i < numOfGroups){
86    if (i < left)
87      pos += avg + 1;
88    else
89      pos += avg;
90
91    divPoints[i] = LFPositions[pos]+1;
92    bufferSizes[i-1] = divPoints[i]-divPoints[i-1];
93    startPoints[i] = startPoints[i-1] + ((bufferSizes[i-1]-1)/groupSize+1)*groupSize;
94
95    i++;
96  }
97
98  divPoints[numOfGroups] = filesize;
99  bufferSizes[i-1] = divPoints[i]-divPoints[i-1];
100  startPoints[i] = startPoints[i-1] + ((bufferSizes[i-1]-1)/groupSize+1)*groupSize;
101   
102  checkCudaErrors(cuMemAlloc(&devBufferInput, startPoints[numOfGroups]));
103  checkCudaErrors(cuMemsetD8(devBufferInput,0,startPoints[numOfGroups]));
104  checkCudaErrors(cuMemAlloc(&devStartPoints, sizeof(ulong) * (numOfGroups + 1)));
105  checkCudaErrors(cuMemAlloc(&devBufferSizes, sizeof(ulong) * numOfGroups));
106
107  int outputSize = 0;
108  if (CountOnly){
109    outputSize = sizeof(ulong) * GROUPTHREADS * numOfGroups;
110  }
111  else{
112    outputSize = startPoints[numOfGroups]/4;
113  }
114  checkCudaErrors(cuMemAlloc(&devBufferOutput, outputSize));
115
116  //Copy from host to device
117  for(unsigned i=0; i<numOfGroups; i++){
118    checkCudaErrors(cuMemcpyHtoD(devBufferInput+startPoints[i], fileBuffer+divPoints[i], bufferSizes[i]));
119  }
120  checkCudaErrors(cuMemcpyHtoD(devStartPoints, startPoints, sizeof(ulong) * (numOfGroups + 1)));
121  checkCudaErrors(cuMemcpyHtoD(devBufferSizes, bufferSizes, sizeof(ulong) * numOfGroups));
122
123  unsigned blockSizeX = GROUPTHREADS;
124  unsigned blockSizeY = 1;
125  unsigned blockSizeZ = 1;
126  unsigned gridSizeX  = numOfGroups;
127  unsigned gridSizeY  = 1;
128  unsigned gridSizeZ  = 1;
129
130  // Kernel parameters
131  void *KernelParams[] = { &devBufferInput, &devStartPoints, &devBufferSizes, &devBufferOutput};
132
133  // std::cerr << "Launching kernel\n";
134
135  CUevent start;
136  CUevent stop;
137  float elapsedTime;
138
139  cuEventCreate(&start, CU_EVENT_BLOCKING_SYNC);
140  cuEventRecord(start,0);
141
142  // Kernel launch
143  checkCudaErrors(cuLaunchKernel(function, gridSizeX, gridSizeY, gridSizeZ,
144                                 blockSizeX, blockSizeY, blockSizeZ,
145                                 0, NULL, KernelParams, NULL));
146  // std::cerr << "kernel success.\n";
147
148  cuEventCreate(&stop, CU_EVENT_BLOCKING_SYNC);
149  cuEventRecord(stop,0);
150  cuEventSynchronize(stop);
151
152  cuEventElapsedTime(&elapsedTime, start, stop);
153  // printf("GPU Kernel time : %f ms\n" ,elapsedTime);
154
155  // Retrieve device data
156  ulong * matchRslt;
157  if (posix_memalign((void**)&matchRslt, 32, outputSize)) {
158    std::cerr << "Cannot allocate memory for output.\n";
159    exit(-1);
160  }
161  checkCudaErrors(cuMemcpyDtoH(matchRslt, devBufferOutput, outputSize));
162
163  if (CountOnly){
164    int count = 0;
165    for (unsigned i = 0; i < GROUPTHREADS * numOfGroups; ++i) {
166      // std::cout << i << ":" << matchRslt[i] << "\n";
167      count += matchRslt[i];
168    }
169    std::cout << count << "\n";
170  }
171  else{
172    for(unsigned i=0; i<=numOfGroups; i++){
173      accumBytes[i] = startPoints[i] - divPoints[i];
174    }
175  }
176
177
178  // Clean-up
179  checkCudaErrors(cuMemFree(devBufferInput));
180  checkCudaErrors(cuMemFree(devStartPoints));
181  checkCudaErrors(cuMemFree(devBufferSizes));
182  checkCudaErrors(cuMemFree(devBufferOutput));
183  checkCudaErrors(cuModuleUnload(cudaModule));
184  checkCudaErrors(cuCtxDestroy(context));
185
186  return matchRslt;
187}
Note: See TracBrowser for help on using the repository browser.