Point Cloud Library (PCL)  1.14.1
cutil_inline_runtime.h
1 /*
2  * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
3  *
4  * Please refer to the NVIDIA end user license agreement (EULA) associated
5  * with this source code for terms and conditions that govern your use of
6  * this software. Any use, reproduction, disclosure, or distribution of
7  * this software and related documentation outside the terms of the EULA
8  * is strictly prohibited.
9  *
10  */
11 
12 #pragma once
13 
14 #ifdef _WIN32
15 #ifdef _DEBUG // Do this only in debug mode...
16 # define WINDOWS_LEAN_AND_MEAN
17 # include <windows.h>
18 # include <stdlib.h>
19 # undef min
20 # undef max
21 #endif
22 #endif
23 
24 #include <stdio.h>
25 #include <string.h>
26 #include <stdlib.h>
27 
28 #include <cufft.h>
29 
30 // We define these calls here, so the user doesn't need to include __FILE__ and __LINE__
31 // The advantage is the developers gets to use the inline function so they can debug
32 #define cutilSafeCallNoSync(err) __cudaSafeCallNoSync(err, __FILE__, __LINE__)
33 #define cutilSafeCall(err) __cudaSafeCall (err, __FILE__, __LINE__)
34 #define cutilSafeThreadSync() __cudaSafeThreadSync(__FILE__, __LINE__)
35 #define cufftSafeCall(err) __cufftSafeCall (err, __FILE__, __LINE__)
36 #define cutilCheckError(err) __cutilCheckError (err, __FILE__, __LINE__)
37 #define cutilCheckMsg(msg) __cutilGetLastError (msg, __FILE__, __LINE__)
38 #define cutilCheckMsgAndSync(msg) __cutilGetLastErrorAndSync (msg, __FILE__, __LINE__)
39 #define cutilSafeMalloc(mallocCall) __cutilSafeMalloc ((mallocCall), __FILE__, __LINE__)
40 #define cutilCondition(val) __cutilCondition (val, __FILE__, __LINE__)
41 #define cutilExit(argc, argv) __cutilExit (argc, argv)
42 
43 inline cudaError cutilDeviceSynchronize()
44 {
45  return cudaDeviceSynchronize();
46 }
47 
48 inline cudaError cutilDeviceReset()
49 {
50  return cudaDeviceReset();
51 }
52 
53 inline void __cutilCondition(int val, char *file, int line)
54 {
55  if( CUTFalse == cutCheckCondition( val, file, line ) ) {
56  exit(EXIT_FAILURE);
57  }
58 }
59 
60 inline void __cutilExit(int argc, char **argv)
61 {
62  if (!cutCheckCmdLineFlag(argc, (const char**)argv, "noprompt")) {
63  printf("\nPress ENTER to exit...\n");
64  fflush( stdout);
65  fflush( stderr);
66  getchar();
67  }
68  exit(EXIT_SUCCESS);
69 }
70 
// Smaller / larger of two values.
// Every use of an argument is parenthesized so that expressions built from
// low-precedence operators (e.g. MIN(x | 1, y)) expand correctly.
// As with any function-like macro, the selected argument is evaluated twice,
// so do not pass expressions with side effects.
#define MIN(a,b) (((a) < (b)) ? (a) : (b))
#define MAX(a,b) (((a) > (b)) ? (a) : (b))
73 
74 // Beginning of GPU Architecture definitions
// Maps a compute capability (major.minor) to the number of cores per
// multiprocessor.
//
//   major, minor  SM version of the device
//
// Returns the core count for a known SM version. For an unknown version a
// diagnostic is printed to stdout and -1 is returned.
//
// The table covers architectures newer than SM 2.1 as well, so that current
// GPUs are not reported as "undefined". The core counts for SM 3.0 and later
// follow the table published in NVIDIA's helper_cuda.h.
inline int _ConvertSMVer2Cores(int major, int minor)
{
    // One entry per supported SM version.
    struct sSMtoCores {
        int SM;    // 0xMm (hexadecimal notation), M = SM major version, m = SM minor version
        int Cores; // cores per multiprocessor
    };

    static const sSMtoCores nGpuArchCoresPerSM[] =
    {
        { 0x10,   8 },
        { 0x11,   8 },
        { 0x12,   8 },
        { 0x13,   8 },
        { 0x20,  32 },
        { 0x21,  48 },
        { 0x30, 192 },
        { 0x32, 192 },
        { 0x35, 192 },
        { 0x37, 192 },
        { 0x50, 128 },
        { 0x52, 128 },
        { 0x53, 128 },
        { 0x60,  64 },
        { 0x61, 128 },
        { 0x62, 128 },
        { 0x70,  64 },
        { 0x72,  64 },
        { 0x75,  64 },
        { 0x80,  64 },
        { 0x86, 128 },
        { 0x87, 128 },
        { 0x89, 128 },
        { 0x90, 128 }
    };

    // Same encoding as the SM column: major in the high nibble, minor in the low.
    const int smVersion = (major << 4) + minor;
    const size_t entryCount = sizeof(nGpuArchCoresPerSM) / sizeof(nGpuArchCoresPerSM[0]);

    for (size_t i = 0; i < entryCount; ++i) {
        if (nGpuArchCoresPerSM[i].SM == smVersion) {
            return nGpuArchCoresPerSM[i].Cores;
        }
    }

    printf("MapSMtoCores undefined SMversion %d.%d!\n", major, minor);
    return -1;
}
103 // end of GPU Architecture definitions
104 
105 // This function returns the best GPU (with maximum GFLOPS)
106 inline int cutGetMaxGflopsDeviceId()
107 {
108  int current_device = 0;
109  int max_compute_perf = 0;
110  int max_perf_device = 0;
111  int device_count = 0;
112  int best_SM_arch = 0;
113 
114  cudaGetDeviceCount( &device_count );
115  // Find the best major SM Architecture GPU device
116  while ( current_device < device_count ) {
117  cudaDeviceProp deviceProp;
118  cudaGetDeviceProperties( &deviceProp, current_device );
119  if (deviceProp.major > 0 && deviceProp.major < 9999) {
120  best_SM_arch = MAX(best_SM_arch, deviceProp.major);
121  }
122  current_device++;
123  }
124 
125  // Find the best CUDA capable GPU device
126  current_device = 0;
127  while( current_device < device_count ) {
128  cudaDeviceProp deviceProp;
129  cudaGetDeviceProperties( &deviceProp, current_device );
130  int sm_per_multiproc = (deviceProp.major == 9999 && deviceProp.minor == 9999) ? 1 : _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
131 
132  int compute_perf = deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;
133  if( compute_perf > max_compute_perf ) {
134  // If we find GPU with SM major > 2, search only these
135  if ( best_SM_arch > 2 ) {
136  // If our device==dest_SM_arch, choose this, or else pass
137  if (deviceProp.major == best_SM_arch) {
138  max_compute_perf = compute_perf;
139  max_perf_device = current_device;
140  }
141  } else {
142  max_compute_perf = compute_perf;
143  max_perf_device = current_device;
144  }
145  }
146  ++current_device;
147  }
148  return max_perf_device;
149 }
150 
151 // This function returns the best GPU (with maximum GFLOPS)
152 inline int cutGetMaxGflopsGraphicsDeviceId()
153 {
154  int current_device = 0;
155  int max_compute_perf = 0;
156  int max_perf_device = 0;
157  int device_count = 0;
158  int best_SM_arch = 0;
159  int bTCC = 0;
160 
161  cudaGetDeviceCount( &device_count );
162  // Find the best major SM Architecture GPU device that is graphics capable
163  while ( current_device < device_count ) {
164  cudaDeviceProp deviceProp;
165  cudaGetDeviceProperties( &deviceProp, current_device );
166 
167  if (deviceProp.tccDriver) bTCC = 1;
168 
169  if (!bTCC) {
170  if (deviceProp.major > 0 && deviceProp.major < 9999) {
171  best_SM_arch = MAX(best_SM_arch, deviceProp.major);
172  }
173  }
174  current_device++;
175  }
176 
177  // Find the best CUDA capable GPU device
178  current_device = 0;
179  while( current_device < device_count ) {
180  cudaDeviceProp deviceProp;
181  cudaGetDeviceProperties( &deviceProp, current_device );
182  int sm_per_multiproc = (deviceProp.major == 9999 && deviceProp.minor == 9999) ? 1 : _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
183 
184  if (deviceProp.tccDriver) bTCC = 1;
185 
186  if (!bTCC) // Is this GPU running the TCC driver? If so we pass on this
187  {
188  int compute_perf = deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;
189  if( compute_perf > max_compute_perf ) {
190  // If we find GPU with SM major > 2, search only these
191  if ( best_SM_arch > 2 ) {
192  // If our device==dest_SM_arch, choose this, or else pass
193  if (deviceProp.major == best_SM_arch) {
194  max_compute_perf = compute_perf;
195  max_perf_device = current_device;
196  }
197  }
198  else {
199  max_compute_perf = compute_perf;
200  max_perf_device = current_device;
201  }
202  }
203  }
204  ++current_device;
205  }
206  return max_perf_device;
207 }
208 
// Give a little more for Windows: the console window often disappears before
// we can read the message, so debug builds also send the text to the debugger
// through OutputDebugStringA().
//
// FPRINTF takes its whole argument list in one extra pair of parentheses:
//     FPRINTF((stderr, "value = %d\n", v));
#ifdef _WIN32
#  if 1 //ndef UNICODE
#    ifdef _DEBUG // Do this only in debug mode...
#      include <stdarg.h>
// Formats `fmt` with the variadic arguments, then writes the result both to
// the debugger (OutputDebugStringA) and to `file`.
//
// The buffer starts at 2048 bytes and doubles while the text does not fit, up
// to 1 MiB. If no buffer can be obtained, or the text still does not fit, the
// message is written to `file` only.
inline void VSPrintf(FILE *file, LPCSTR fmt, ...)
{
    const size_t max_sz = (size_t)1 << 20;
    char *text = NULL;

    for (size_t cap = 2048; cap <= max_sz; cap *= 2) {
        char *buf = (char*)malloc(cap);
        if (!buf) {
            break;
        }

        // Restart the argument list on every attempt: a va_list must not be
        // reused after a v*printf call has consumed it.
        va_list args;
        va_start(args, fmt);
        // Reserve one byte: _vsnprintf does not write a terminator when the
        // output exactly fills the count it was given.
        const int written = _vsnprintf(buf, cap - 1, fmt, args);
        va_end(args);

        if (written >= 0) {
            buf[written] = '\0';
            text = buf;
            break;
        }
        free(buf); // means there wasn't enough room; retry with a larger buffer
    }

    if (!text) {
        va_list args;
        va_start(args, fmt);
        vfprintf(file, fmt, args);
        va_end(args);
        return;
    }

    OutputDebugStringA(text);
    // The text is already formatted: emit it verbatim instead of passing it
    // to fprintf() as a format string.
    fputs(text, file);
    free(text);
}
#      define FPRINTF(a) VSPrintf a
#    else // !_DEBUG
#      define FPRINTF(a) fprintf a
#    endif // _DEBUG
#  else // UNICODE
// Unicode case... let's give up for now and keep basic printf
#    define FPRINTF(a) fprintf a
#  endif // UNICODE
#else // !_WIN32
// For other than Win32
#  define FPRINTF(a) fprintf a
#endif // _WIN32
241 
242 // NOTE: "%s(%i) : " allows Visual Studio to directly jump to the file at the right line
243 // when the user double clicks on the error line in the Output pane. Like any compile error.
244 
245 inline void __cudaSafeCallNoSync( cudaError err, const char *file, const int line )
246 {
247  if( cudaSuccess != err) {
248  FPRINTF((stderr, "%s(%i) : cudaSafeCallNoSync() Runtime API error : %s.\n",
249  file, line, cudaGetErrorString( err) ));
250  exit(-1);
251  }
252 }
253 
254 inline void __cudaSafeCall( cudaError err, const char *file, const int line )
255 {
256  if( cudaSuccess != err) {
257  FPRINTF((stderr, "%s(%i) : cudaSafeCall() Runtime API error : %s.\n",
258  file, line, cudaGetErrorString( err) ));
259  exit(-1);
260  }
261 }
262 
263 inline void __cudaSafeThreadSync( const char *file, const int line )
264 {
265  cudaError err = cutilDeviceSynchronize();
266  if ( cudaSuccess != err) {
267  FPRINTF((stderr, "%s(%i) : cudaDeviceSynchronize() Runtime API error : %s.\n",
268  file, line, cudaGetErrorString( err) ));
269  exit(-1);
270  }
271 }
272 
273 inline void __cufftSafeCall( cufftResult err, const char *file, const int line )
274 {
275  if( CUFFT_SUCCESS != err) {
276  FPRINTF((stderr, "%s(%i) : cufftSafeCall() CUFFT error.\n",
277  file, line));
278  exit(-1);
279  }
280 }
281 
282 inline void __cutilCheckError( CUTBoolean err, const char *file, const int line )
283 {
284  if( CUTTrue != err) {
285  FPRINTF((stderr, "%s(%i) : CUTIL CUDA error.\n",
286  file, line));
287  exit(-1);
288  }
289 }
290 
291 inline void __cutilGetLastError( const char *errorMessage, const char *file, const int line )
292 {
293  cudaError_t err = cudaGetLastError();
294  if( cudaSuccess != err) {
295  FPRINTF((stderr, "%s(%i) : cutilCheckMsg() CUTIL CUDA error : %s : %s.\n",
296  file, line, errorMessage, cudaGetErrorString( err) ));
297  exit(-1);
298  }
299 }
300 
301 inline void __cutilGetLastErrorAndSync( const char *errorMessage, const char *file, const int line )
302 {
303  cudaError_t err = cudaGetLastError();
304  if( cudaSuccess != err) {
305  FPRINTF((stderr, "%s(%i) : cutilCheckMsg() CUTIL CUDA error : %s : %s.\n",
306  file, line, errorMessage, cudaGetErrorString( err) ));
307  exit(-1);
308  }
309 
310  err = cutilDeviceSynchronize();
311  if( cudaSuccess != err) {
312  FPRINTF((stderr, "%s(%i) : cutilCheckMsg cudaDeviceSynchronize error: %s : %s.\n",
313  file, line, errorMessage, cudaGetErrorString( err) ));
314  exit(-1);
315  }
316 }
317 
318 inline void __cutilSafeMalloc( void *pointer, const char *file, const int line )
319 {
320  if( !(pointer)) {
321  FPRINTF((stderr, "%s(%i) : cutilSafeMalloc host malloc failure\n",
322  file, line));
323  exit(-1);
324  }
325 }
326 
327 #if __DEVICE_EMULATION__
// Device-emulation stubs: no device is selected, both report device 0.
// They return a value explicitly because flowing off the end of a non-void
// function is undefined behavior as soon as the caller uses the result.
inline int cutilDeviceInit(int ARGC, char **ARGV)
{
    (void)ARGC;
    (void)ARGV;
    return 0;
}

inline int cutilChooseCudaDevice(int ARGC, char **ARGV)
{
    (void)ARGC;
    (void)ARGV;
    return 0;
}
330 #else
331  inline int cutilDeviceInit(int ARGC, char **ARGV)
332  {
333  int deviceCount;
334  cutilSafeCallNoSync(cudaGetDeviceCount(&deviceCount));
335  if (deviceCount == 0) {
336  FPRINTF((stderr, "CUTIL CUDA error: no devices supporting CUDA.\n"));
337  exit(-1);
338  }
339  int dev = 0;
340  cutGetCmdLineArgumenti(ARGC, (const char **) ARGV, "device", &dev);
341  if (dev < 0)
342  dev = 0;
343  if (dev > deviceCount-1) {
344  fprintf(stderr, "\n");
345  fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);
346  fprintf(stderr, ">> cutilDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev);
347  fprintf(stderr, "\n");
348  return -dev;
349  }
350  cudaDeviceProp deviceProp;
351  cutilSafeCallNoSync(cudaGetDeviceProperties(&deviceProp, dev));
352  if (deviceProp.major < 1) {
353  FPRINTF((stderr, "cutil error: GPU device does not support CUDA.\n"));
354  exit(-1); \
355  }
356  printf("> Using CUDA device [%d]: %s\n", dev, deviceProp.name);
357  cutilSafeCall(cudaSetDevice(dev));
358 
359  return dev;
360  }
361 
362  // General initialization call to pick the best CUDA Device
363  inline int cutilChooseCudaDevice(int argc, char **argv)
364  {
365  cudaDeviceProp deviceProp;
366  int devID = 0;
367  // If the command-line has a device number specified, use it
368  if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
369  devID = cutilDeviceInit(argc, argv);
370  if (devID < 0) {
371  printf("exiting...\n");
372  cutilExit(argc, argv);
373  exit(0);
374  }
375  } else {
376  // Otherwise pick the device with highest Gflops/s
377  devID = cutGetMaxGflopsDeviceId();
378  cutilSafeCallNoSync( cudaSetDevice( devID ) );
379  cutilSafeCallNoSync( cudaGetDeviceProperties(&deviceProp, devID) );
380  printf("> Using CUDA device [%d]: %s\n", devID, deviceProp.name);
381  }
382  return devID;
383  }
384 #endif
385 
386 
387 //! Check for CUDA context lost
388 inline void cutilCudaCheckCtxLost(const char *errorMessage, const char *file, const int line )
389 {
390  cudaError_t err = cudaGetLastError();
391  if( cudaSuccess != err) {
392  FPRINTF((stderr, "%s(%i) : CUDA error: %s : %s.\n",
393  file, line, errorMessage, cudaGetErrorString( err) ));
394  exit(-1);
395  }
396  err = cutilDeviceSynchronize();
397  if( cudaSuccess != err) {
398  FPRINTF((stderr, "%s(%i) : CUDA error: %s : %s.\n",
399  file, line, errorMessage, cudaGetErrorString( err) ));
400  exit(-1);
401  }
402 }
403 
404 #ifndef STRCASECMP
405 #ifdef _WIN32
406 #define STRCASECMP _stricmp
407 #else
408 #define STRCASECMP strcasecmp
409 #endif
410 #endif
411 
412 #ifndef STRNCASECMP
413 #ifdef _WIN32
414 #define STRNCASECMP _strnicmp
415 #else
416 #define STRNCASECMP strncasecmp
417 #endif
418 #endif
419 
420 inline void __cutilQAFinish(int argc, char **argv, bool bStatus)
421 {
422  const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
423 
424  bool bFlag = false;
425  for (int i=1; i < argc; i++) {
426  if (!STRCASECMP(argv[i], "-qatest") || !STRCASECMP(argv[i], "-noprompt")) {
427  bFlag |= true;
428  }
429  }
430 
431  if (bFlag) {
432  printf("&&&& %s %s", sStatus[bStatus], argv[0]);
433  for (int i=1; i < argc; i++) printf(" %s", argv[i]);
434  }
435  else {
436  printf("[%s] test result\n%s\n", argv[0], sStatus[bStatus]);
437  }
438 }
439 
440 // General check for CUDA GPU SM Capabilities
441 inline bool cutilCudaCapabilities(int major_version, int minor_version, int argc, char **argv)
442 {
443  cudaDeviceProp deviceProp;
444  deviceProp.major = 0;
445  deviceProp.minor = 0;
446  int dev;
447 
448 #ifdef __DEVICE_EMULATION__
449  printf("> Compute Device Emulation Mode \n");
450 #endif
451 
452  cutilSafeCall( cudaGetDevice(&dev) );
453  cutilSafeCall( cudaGetDeviceProperties(&deviceProp, dev));
454 
455  if((deviceProp.major > major_version) ||
456  (deviceProp.major == major_version && deviceProp.minor >= minor_version)) {
457  printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", dev, deviceProp.name, deviceProp.major, deviceProp.minor);
458  return true;
459  }
460  else {
461  printf("There is no device supporting CUDA compute capability %d.%d.\n", major_version, minor_version);
462  __cutilQAFinish(argc, argv, true);
463  return false;
464  }
465 }