source: icGREP/icgrep-devel/icgrep/IR_Gen/CudaDriver.h @ 5294

Last change on this file since 5294 was 5294, checked in by lindanl, 2 years ago

Bug fixed for icgrep GPU version.

File size: 3.9 KB
Line 
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>
#include "cuda.h"
9
// Thread count used as the X dimension of the single block launched by
// RunPTX; it also scales the input group size and the output buffer sizes.
10#define GROUPTHREADS 64
11
12void checkCudaErrors(CUresult err) {
13  assert(err == CUDA_SUCCESS);
14}
15
16/// main - Program entry point
17ulong * RunPTX(std::string PTXFilename, char * fileBuffer, ulong filesize, bool CountOnly) {
18 
19  CUdevice    device;
20  CUmodule    cudaModule;
21  CUcontext   context;
22  CUfunction  function;
23  int         devCount;
24
25  // CUDA initialization
26  checkCudaErrors(cuInit(0));
27  checkCudaErrors(cuDeviceGetCount(&devCount));
28  checkCudaErrors(cuDeviceGet(&device, 0));
29
30  char name[128];
31  checkCudaErrors(cuDeviceGetName(name, 128, device));
32  // std::cout << "Using CUDA Device [0]: " << name << "\n";
33
34  int devMajor, devMinor;
35  checkCudaErrors(cuDeviceComputeCapability(&devMajor, &devMinor, device));
36  // std::cout << "Device Compute Capability: " << devMajor << "." << devMinor << "\n";
37  if (devMajor < 2) {
38    std::cerr << "ERROR: Device 0 is not SM 2.0 or greater\n";
39    exit(-1);
40  }
41
42  std::ifstream t(PTXFilename);
43  if (!t.is_open()) {
44    std::cerr << "Error: cannot open " << PTXFilename << " for processing. Skipped.\n";
45    exit(-1);
46  }
47 
48  std::string ptx_str((std::istreambuf_iterator<char>(t)), std::istreambuf_iterator<char>());
49
50  // Create driver context
51  checkCudaErrors(cuCtxCreate(&context, 0, device));
52
53  // Create module for object
54  checkCudaErrors(cuModuleLoadDataEx(&cudaModule, ptx_str.c_str(), 0, 0, 0));
55
56  // Get kernel function
57  checkCudaErrors(cuModuleGetFunction(&function, cudaModule, "GPU_Main"));
58
59  // Device data
60  CUdeviceptr devBufferInput;
61  CUdeviceptr devBufferSize;
62  CUdeviceptr devBufferOutput;
63
64  int groupSize = GROUPTHREADS * sizeof(ulong) * 8;
65  int groups = filesize/groupSize + 1;
66  int bufferSize = groups * groupSize;
67  int outputSize = 0;
68
69  checkCudaErrors(cuMemAlloc(&devBufferInput, bufferSize));
70  checkCudaErrors(cuMemAlloc(&devBufferSize, sizeof(ulong)));
71  if (CountOnly){
72    outputSize = sizeof(ulong) * GROUPTHREADS;
73  }
74  else{
75    outputSize = sizeof(ulong) * 2 * GROUPTHREADS * groups;
76  }
77
78  checkCudaErrors(cuMemAlloc(&devBufferOutput, outputSize));
79
80  //Copy from host to device
81  checkCudaErrors(cuMemcpyHtoD(devBufferInput, fileBuffer, bufferSize));
82  checkCudaErrors(cuMemcpyHtoD(devBufferSize, &filesize, sizeof(ulong)));
83
84  unsigned blockSizeX = GROUPTHREADS;
85  unsigned blockSizeY = 1;
86  unsigned blockSizeZ = 1;
87  unsigned gridSizeX  = 1;
88  unsigned gridSizeY  = 1;
89  unsigned gridSizeZ  = 1;
90
91  // Kernel parameters
92  void *KernelParams[] = { &devBufferInput, &devBufferSize, &devBufferOutput};
93
94  // std::cout << "Launching kernel\n";
95
96  // Kernel launch
97  checkCudaErrors(cuLaunchKernel(function, gridSizeX, gridSizeY, gridSizeZ,
98                                 blockSizeX, blockSizeY, blockSizeZ,
99                                 0, NULL, KernelParams, NULL));
100  // std::cout << "kernel success.\n";
101  // Retrieve device data
102
103  CUevent start;
104  CUevent stop;
105  float elapsedTime;
106
107  cuEventCreate(&start, CU_EVENT_BLOCKING_SYNC);
108  cuEventRecord(start,0);
109
110  ulong * matchRslt;
111  int ret = posix_memalign((void**)&matchRslt, 32, outputSize);
112  if (ret) {
113    std::cerr << "Cannot allocate memory for output.\n";
114    exit(-1);
115  }
116  checkCudaErrors(cuMemcpyDtoH(matchRslt, devBufferOutput, outputSize));
117  if (CountOnly){
118    int count = 0;
119    for (unsigned i = 0; i < GROUPTHREADS; ++i) {
120      count += matchRslt[i];
121    }
122    std::cout << count << "\n";
123  }
124
125  cuEventCreate(&stop, CU_EVENT_BLOCKING_SYNC);
126  cuEventRecord(stop,0);
127  cuEventSynchronize(stop);
128
129  cuEventElapsedTime(&elapsedTime, start, stop);
130  printf("GPU Kernel time : %f ms\n" ,elapsedTime);
131
132
133  // Clean-up
134  checkCudaErrors(cuMemFree(devBufferInput));
135  checkCudaErrors(cuMemFree(devBufferSize));
136  checkCudaErrors(cuMemFree(devBufferOutput));
137  checkCudaErrors(cuModuleUnload(cudaModule));
138  checkCudaErrors(cuCtxDestroy(context));
139
140  return matchRslt;
141}
Note: See TracBrowser for help on using the repository browser.