source: icGREP/icgrep-devel/icgrep/editd/EditdCudaDriver.h @ 5828

Last change on this file since 5828 was 5528, checked in by lindanl, 2 years ago

editd: number of groups is now defined by number of pattern segments.

File size: 5.0 KB
Line 
1#include <string>
2#include <iostream>
3#include <fstream>
4#include <sys/stat.h>
5#include <fcntl.h>
6#include <unistd.h>
7#include <cassert>
8#include <toolchain/toolchain.h>
9#include "cuda.h"
10
11#define GROUPTHREADS 64
12
13void checkCudaErrors(CUresult err) {
14  assert(err == CUDA_SUCCESS);
15}
16
17/// main - Program entry point
18ulong * RunPTX(std::string PTXFilename, char * fileBuffer, ulong filesize, const char * patternStr, unsigned patternLen, int dist) {
19 
20  CUdevice    device;
21  CUmodule    cudaModule;
22  CUcontext   context;
23  CUfunction  function;
24  int         devCount;
25
26  // CUDA initialization
27  checkCudaErrors(cuInit(0));
28  checkCudaErrors(cuDeviceGetCount(&devCount));
29  checkCudaErrors(cuDeviceGet(&device, 0));
30
31  char name[128];
32  checkCudaErrors(cuDeviceGetName(name, 128, device));
33  // std::cout << "Using CUDA Device [0]: " << name << "\n";
34
35  int devMajor, devMinor;
36  checkCudaErrors(cuDeviceComputeCapability(&devMajor, &devMinor, device));
37  // std::cout << "Device Compute Capability: " << devMajor << "." << devMinor << "\n";
38  if (devMajor < 2) {
39    std::cerr << "ERROR: Device 0 is not SM 2.0 or greater\n";
40    exit(-1);
41  }
42
43  std::ifstream f_editd(PTXFilename);
44  if (!f_editd.is_open()) {
45    std::cerr << "Error: cannot open " << PTXFilename << " for processing. Skipped.\n";
46    exit(-1);
47  }
48 
49  std::string ptx_str((std::istreambuf_iterator<char>(f_editd)), std::istreambuf_iterator<char>());
50
51  checkCudaErrors(cuCtxCreate(&context, 0, device));
52  checkCudaErrors(cuModuleLoadDataEx(&cudaModule, ptx_str.c_str(), 0, 0, 0));
53  checkCudaErrors(cuModuleGetFunction(&function, cudaModule, "Main"));
54
55  CUfunction  mergefunction;
56  CUmodule    mergeModule;
57  std::ifstream f_merge("merge.ptx");
58  if (!f_merge.is_open()) {
59    std::cerr << "Error: cannot open " << "merge.ptx" << " for processing. Skipped.\n";
60    exit(-1);
61  }
62 
63  std::string mergePTX((std::istreambuf_iterator<char>(f_merge)), std::istreambuf_iterator<char>());
64
65  checkCudaErrors(cuModuleLoadDataEx(&mergeModule, mergePTX.c_str(), 0, 0, 0));
66  checkCudaErrors(cuModuleGetFunction(&mergefunction, mergeModule, "Main"));
67
68
69  // Device data
70  CUdeviceptr devBufferInput;
71  CUdeviceptr devInputSize;
72  CUdeviceptr devPatterns;
73  CUdeviceptr devBufferOutput;
74  CUdeviceptr devStrides;
75
76  const unsigned numOfGroups = codegen::GroupNum;
77  int strideSize = GROUPTHREADS * sizeof(ulong) * 4;
78  int strides = filesize/(strideSize * 2) + 1;
79  int bufferSize = strides * strideSize;
80  int outputSize = sizeof(ulong) * GROUPTHREADS * strides * (dist + 1) * numOfGroups;
81
82  checkCudaErrors(cuMemAlloc(&devBufferInput, bufferSize));
83  // checkCudaErrors(cuMemsetD8(devBufferInput, 0, bufferSize));
84  checkCudaErrors(cuMemAlloc(&devInputSize, sizeof(ulong)));
85  checkCudaErrors(cuMemAlloc(&devPatterns, patternLen));
86  checkCudaErrors(cuMemAlloc(&devBufferOutput, outputSize));
87  // checkCudaErrors(cuMemsetD8(devBufferOutput, 0, outputSize));
88  checkCudaErrors(cuMemAlloc(&devStrides, sizeof(int)));
89
90  //Copy from host to device
91  checkCudaErrors(cuMemcpyHtoD(devBufferInput, fileBuffer, bufferSize));
92  checkCudaErrors(cuMemcpyHtoD(devInputSize, &filesize, sizeof(ulong)));
93  checkCudaErrors(cuMemcpyHtoD(devPatterns, patternStr, patternLen));
94  checkCudaErrors(cuMemcpyHtoD(devStrides, &strides, sizeof(int)));
95
96  unsigned blockSizeX = GROUPTHREADS;
97  unsigned blockSizeY = 1;
98  unsigned blockSizeZ = 1;
99  unsigned gridSizeX  = numOfGroups;
100  unsigned gridSizeY  = 1;
101  unsigned gridSizeZ  = 1;
102
103  // Kernel parameters
104  void *KernelParams[] = { &devBufferInput, &devInputSize, &devPatterns, &devBufferOutput, &devStrides};
105
106  void *MergeKernelParams[] = {&devBufferOutput, &devStrides};
107
108  // std::cout << "Launching kernel\n";
109
110  CUevent start;
111  CUevent stop;
112  float elapsedTime;
113
114  cuEventCreate(&start, CU_EVENT_BLOCKING_SYNC);
115  cuEventRecord(start,0);
116
117  // Kernel launch
118  checkCudaErrors(cuLaunchKernel(function, gridSizeX, gridSizeY, gridSizeZ,
119                                 blockSizeX, blockSizeY, blockSizeZ,
120                                 0, NULL, KernelParams, NULL));
121
122  cuCtxSynchronize();
123
124  checkCudaErrors(cuLaunchKernel(mergefunction, dist+1, gridSizeY, gridSizeZ,
125                                 blockSizeX, blockSizeY, blockSizeZ,
126                                 0, NULL, MergeKernelParams, NULL));
127
128  cuEventCreate(&stop, CU_EVENT_BLOCKING_SYNC);
129  cuEventRecord(stop,0);
130  cuEventSynchronize(stop);
131
132  cuEventElapsedTime(&elapsedTime, start, stop);
133  printf("Elapsed time : %f ms\n" ,elapsedTime);
134
135  // Retrieve device data
136  ulong * matchRslt;
137  if (posix_memalign((void**)&matchRslt, 32, outputSize/numOfGroups)) {
138    std::cerr << "Cannot allocate memory for output.\n";
139    exit(-1);
140  }
141 
142  checkCudaErrors(cuMemcpyDtoH(matchRslt, devBufferOutput, outputSize/numOfGroups));
143
144  // Clean-up
145  checkCudaErrors(cuMemFree(devBufferInput));
146  checkCudaErrors(cuMemFree(devInputSize));
147  checkCudaErrors(cuMemFree(devBufferOutput));
148  checkCudaErrors(cuMemFree(devPatterns));
149  checkCudaErrors(cuMemFree(devStrides));
150  checkCudaErrors(cuModuleUnload(cudaModule));
151  checkCudaErrors(cuCtxDestroy(context));
152
153  return matchRslt;
154}
Note: See TracBrowser for help on using the repository browser.