#ifndef MXNET_COMMON_CUDA_UTILS_H_
#define MXNET_COMMON_CUDA_UTILS_H_

#include <dmlc/logging.h>
#include <dmlc/parameter.h>
#include <dmlc/optional.h>
#include <mshadow/base.h>

// No-op stand-ins for CUDA built-ins so JetBrains IDEs can parse and index
// device code. These are only seen by the IDE's parser, never by nvcc, so
// their bodies do not need to be functionally correct.
#ifdef __JETBRAINS_IDE__
#define __forceinline__
inline void __syncthreads() {}
inline void __threadfence_block() {}
// NOTE(review): this stub merely echoes its argument; the real __clz counts
// leading zero bits. Acceptable only because IDE-parse code never executes.
template <class T>
inline T __clz(const T val) { return val; }
// Fake 3-component index type standing in for dim3/uint3 built-ins.
struct __cuda_fake_struct {
  int x;
  int y;
  int z;
};
extern __cuda_fake_struct blockDim;
extern __cuda_fake_struct threadIdx;
extern __cuda_fake_struct blockIdx;
#endif  // __JETBRAINS_IDE__
#include <cuda_runtime.h>
#include <cublas_v2.h>

/*!
 * \brief Get string representation of cuBLAS errors.
 * \param error The cuBLAS status code.
 * \return String representation of the status code.
 */
// NOTE(review): CUBLAS_CALL below refers to this as
// common::cuda::CublasGetErrorString, so in the complete header this lives
// inside namespace mxnet::common::cuda (openers elided in this chunk).
inline const char* CublasGetErrorString(cublasStatus_t error) {
  switch (error) {
    case CUBLAS_STATUS_SUCCESS:
      return "CUBLAS_STATUS_SUCCESS";
    case CUBLAS_STATUS_NOT_INITIALIZED:
      return "CUBLAS_STATUS_NOT_INITIALIZED";
    case CUBLAS_STATUS_ALLOC_FAILED:
      return "CUBLAS_STATUS_ALLOC_FAILED";
    case CUBLAS_STATUS_INVALID_VALUE:
      return "CUBLAS_STATUS_INVALID_VALUE";
    case CUBLAS_STATUS_ARCH_MISMATCH:
      return "CUBLAS_STATUS_ARCH_MISMATCH";
    case CUBLAS_STATUS_MAPPING_ERROR:
      return "CUBLAS_STATUS_MAPPING_ERROR";
    case CUBLAS_STATUS_EXECUTION_FAILED:
      return "CUBLAS_STATUS_EXECUTION_FAILED";
    case CUBLAS_STATUS_INTERNAL_ERROR:
      return "CUBLAS_STATUS_INTERNAL_ERROR";
    case CUBLAS_STATUS_NOT_SUPPORTED:
      return "CUBLAS_STATUS_NOT_SUPPORTED";
    default:
      break;
  }
  return "Unknown cuBLAS status";
}
/*!
 * \brief Get string representation of cuSOLVER errors.
 * \param error The cuSOLVER status code.
 * \return String representation of the status code.
 */
inline const char* CusolverGetErrorString(cusolverStatus_t error) {
  switch (error) {
    case CUSOLVER_STATUS_SUCCESS:
      return "CUSOLVER_STATUS_SUCCESS";
    case CUSOLVER_STATUS_NOT_INITIALIZED:
      return "CUSOLVER_STATUS_NOT_INITIALIZED";
    case CUSOLVER_STATUS_ALLOC_FAILED:
      return "CUSOLVER_STATUS_ALLOC_FAILED";
    case CUSOLVER_STATUS_INVALID_VALUE:
      return "CUSOLVER_STATUS_INVALID_VALUE";
    case CUSOLVER_STATUS_ARCH_MISMATCH:
      return "CUSOLVER_STATUS_ARCH_MISMATCH";
    case CUSOLVER_STATUS_EXECUTION_FAILED:
      return "CUSOLVER_STATUS_EXECUTION_FAILED";
    case CUSOLVER_STATUS_INTERNAL_ERROR:
      return "CUSOLVER_STATUS_INTERNAL_ERROR";
    case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
      return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
    default:
      break;
  }
  return "Unknown cuSOLVER status";
}
/*!
 * \brief Get string representation of cuRAND errors.
 * \param status The cuRAND status code.
 * \return String representation of the status code.
 */
inline const char* CurandGetErrorString(curandStatus_t status) {
  switch (status) {
    case CURAND_STATUS_SUCCESS:
      return "CURAND_STATUS_SUCCESS";
    case CURAND_STATUS_VERSION_MISMATCH:
      return "CURAND_STATUS_VERSION_MISMATCH";
    case CURAND_STATUS_NOT_INITIALIZED:
      return "CURAND_STATUS_NOT_INITIALIZED";
    case CURAND_STATUS_ALLOCATION_FAILED:
      return "CURAND_STATUS_ALLOCATION_FAILED";
    case CURAND_STATUS_TYPE_ERROR:
      return "CURAND_STATUS_TYPE_ERROR";
    case CURAND_STATUS_OUT_OF_RANGE:
      return "CURAND_STATUS_OUT_OF_RANGE";
    case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
      return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
    case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
      return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
    case CURAND_STATUS_LAUNCH_FAILURE:
      return "CURAND_STATUS_LAUNCH_FAILURE";
    case CURAND_STATUS_PREEXISTING_FAILURE:
      return "CURAND_STATUS_PREEXISTING_FAILURE";
    case CURAND_STATUS_INITIALIZATION_FAILED:
      return "CURAND_STATUS_INITIALIZATION_FAILED";
    case CURAND_STATUS_ARCH_MISMATCH:
      return "CURAND_STATUS_ARCH_MISMATCH";
    case CURAND_STATUS_INTERNAL_ERROR:
      return "CURAND_STATUS_INTERNAL_ERROR";
    default:
      break;
  }
  return "Unknown cuRAND status";
}
/*!
 * \brief Device-usable elementwise maximum.
 * \param a First operand.
 * \param b Second operand.
 * \return a if a > b, otherwise b (so b is returned when either compare
 *         operand is NaN, matching the original ternary form).
 */
template <typename DType>
inline DType __device__ CudaMax(DType a, DType b) {
  if (a > b) {
    return a;
  }
  return b;
}
/*!
 * \brief Device-usable elementwise minimum.
 * \param a First operand.
 * \param b Second operand.
 * \return a if a < b, otherwise b (so b is returned when either compare
 *         operand is NaN, matching the original ternary form).
 */
template <typename DType>
inline DType __device__ CudaMin(DType a, DType b) {
  if (a < b) {
    return a;
  }
  return b;
}
/*!
 * \brief Check for CUDA errors raised by earlier asynchronous work.
 * \param msg Message to print when an error is found.
 */
#define CHECK_CUDA_ERROR(msg)                                                \
  do {                                                                       \
    cudaError_t e = cudaGetLastError();                                      \
    CHECK_EQ(e, cudaSuccess) << (msg) << " CUDA: " << cudaGetErrorString(e); \
  } while (0)

/*!
 * \brief Protected CUDA call.
 * \param func Expression to call.
 *
 * cudaErrorCudartUnloading is tolerated because it is raised during process
 * teardown after the CUDA runtime has already begun shutting down.
 * Wrapped in do/while(0) so the macro behaves as a single statement
 * (safe inside unbraced if/else).
 */
#define CUDA_CALL(func)                                      \
  do {                                                       \
    cudaError_t e = (func);                                  \
    CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \
        << "CUDA: " << cudaGetErrorString(e);                \
  } while (0)

/*!
 * \brief Protected cuBLAS call.
 * \param func Expression to call.
 */
#define CUBLAS_CALL(func)                                       \
  do {                                                          \
    cublasStatus_t e = (func);                                  \
    CHECK_EQ(e, CUBLAS_STATUS_SUCCESS)                          \
        << "cuBLAS: " << common::cuda::CublasGetErrorString(e); \
  } while (0)

/*!
 * \brief Protected cuSOLVER call.
 * \param func Expression to call.
 */
#define CUSOLVER_CALL(func)                                         \
  do {                                                              \
    cusolverStatus_t e = (func);                                    \
    CHECK_EQ(e, CUSOLVER_STATUS_SUCCESS)                            \
        << "cuSolver: " << common::cuda::CusolverGetErrorString(e); \
  } while (0)

/*!
 * \brief Protected cuRAND call.
 * \param func Expression to call.
 */
#define CURAND_CALL(func)                                       \
  do {                                                          \
    curandStatus_t e = (func);                                  \
    CHECK_EQ(e, CURAND_STATUS_SUCCESS)                          \
        << "cuRAND: " << common::cuda::CurandGetErrorString(e); \
  } while (0)

// Loop-unrolling pragmas; MSVC lacks _Pragma, so they expand to nothing there.
#if !defined(_MSC_VER)
#define CUDA_UNROLL _Pragma("unroll")
#define CUDA_NOUNROLL _Pragma("nounroll")
#else
#define CUDA_UNROLL
#define CUDA_NOUNROLL
#endif

/*!
 * \brief Determine major version number of the gpu's cuda compute architecture.
 * \param device_id The device index of the cuda-capable gpu of interest.
 * \return the major version number of the gpu's cuda compute architecture.
 */
inline int ComputeCapabilityMajor(int device_id) {
  int major = 0;
  CUDA_CALL(cudaDeviceGetAttribute(&major,
                                   cudaDevAttrComputeCapabilityMajor,
                                   device_id));
  return major;
}

/*!
 * \brief Determine minor version number of the gpu's cuda compute architecture.
 * \param device_id The device index of the cuda-capable gpu of interest.
 * \return the minor version number of the gpu's cuda compute architecture.
 */
inline int ComputeCapabilityMinor(int device_id) {
  int minor = 0;
  CUDA_CALL(cudaDeviceGetAttribute(&minor,
                                   cudaDevAttrComputeCapabilityMinor,
                                   device_id));
  return minor;
}

/*!
 * \brief Return the integer SM architecture (e.g. Volta = 70).
 * \param device_id The device index of the cuda-capable gpu of interest.
 * \return major * 10 + minor of the device's compute capability.
 */
inline int SMArch(int device_id) {
  int major = ComputeCapabilityMajor(device_id);
  int minor = ComputeCapabilityMinor(device_id);
  return 10 * major + minor;
}
/*!
 * \brief Determine whether a cuda-capable gpu's architecture supports float16 math.
 * \param device_id The device index of the cuda-capable gpu of interest.
 * \return whether the gpu's architecture supports float16 math.
 */
inline bool SupportsFloat16Compute(int device_id) {
  int computeCapabilityMajor = ComputeCapabilityMajor(device_id);
  int computeCapabilityMinor = ComputeCapabilityMinor(device_id);
  // Native fp16 arithmetic first appears at compute capability 5.3 and is
  // present on all 6.x-and-later architectures.
  return (computeCapabilityMajor > 5) ||
         (computeCapabilityMajor == 5 && computeCapabilityMinor >= 3);
}

/*!
 * \brief Determine whether a cuda-capable gpu's architecture supports Tensor Core math.
 * \param device_id The device index of the cuda-capable gpu of interest.
 * \return whether the gpu's architecture supports Tensor Core math (Volta, CC 7.0+).
 */
inline bool SupportsTensorCore(int device_id) {
  int computeCapabilityMajor = ComputeCapabilityMajor(device_id);
  return (computeCapabilityMajor >= 7);
}
/*! \brief Compiled-in default policy for TensorCore algo use. */
#define MXNET_CUDA_ALLOW_TENSOR_CORE_DEFAULT true

/*!
 * \brief Returns global policy for TensorCore algo use.
 * \return whether TensorCore algos are allowed; the compiled-in default can
 *         be overridden via the MXNET_CUDA_ALLOW_TENSOR_CORE environment
 *         variable.
 */
inline bool GetEnvAllowTensorCore() {
  bool default_value = MXNET_CUDA_ALLOW_TENSOR_CORE_DEFAULT;
  return dmlc::GetEnv("MXNET_CUDA_ALLOW_TENSOR_CORE",
                      dmlc::optional<bool>(default_value)).value();
}
#endif  // MXNET_USE_CUDA

// NOTE(review): the matching "#endif // MXNET_USE_CUDNN" appears further down,
// so the guard and cudnn.h include elided by extraction are restored here.
#if MXNET_USE_CUDNN == 1
#include <cudnn.h>

/*!
 * \brief Protected cuDNN call.
 * \param func Expression to call.
 */
#define CUDNN_CALL(func)                                                      \
  do {                                                                        \
    cudnnStatus_t e = (func);                                                 \
    CHECK_EQ(e, CUDNN_STATUS_SUCCESS) << "cuDNN: " << cudnnGetErrorString(e); \
  } while (0)

/*!
 * \brief Return the maximum number of algorithms the cuDNN forward-convolution
 *        algo finder may report. Requires cuDNN 7+ for the MaxCount query.
 * \param cudnn_handle cuDNN handle to query.
 * \return max number of forward-convolution algorithms.
 */
inline int MaxForwardAlgos(cudnnHandle_t cudnn_handle) {
  int max_algos = 0;
  CUDNN_CALL(cudnnGetConvolutionForwardAlgorithmMaxCount(cudnn_handle,
                                                         &max_algos));
  return max_algos;
}
/*!
 * \brief Return the maximum number of algorithms the cuDNN
 *        backward-filter-convolution algo finder may report.
 *        Requires cuDNN 7+ for the MaxCount query.
 * \param cudnn_handle cuDNN handle to query.
 * \return max number of backward-filter convolution algorithms.
 */
inline int MaxBackwardFilterAlgos(cudnnHandle_t cudnn_handle) {
  int max_algos = 0;
  CUDNN_CALL(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(cudnn_handle,
                                                                &max_algos));
  return max_algos;
}
/*!
 * \brief Return the maximum number of algorithms the cuDNN
 *        backward-data-convolution algo finder may report.
 *        Requires cuDNN 7+ for the MaxCount query.
 * \param cudnn_handle cuDNN handle to query.
 * \return max number of backward-data convolution algorithms.
 */
inline int MaxBackwardDataAlgos(cudnnHandle_t cudnn_handle) {
  int max_algos = 0;
  CUDNN_CALL(cudnnGetConvolutionBackwardDataAlgorithmMaxCount(cudnn_handle,
                                                              &max_algos));
  return max_algos;
}
#endif  // MXNET_USE_CUDNN

// Overload atomicAdd for doubles on architectures that lack the native
// instruction (pre-SM60): emulate via a 64-bit atomicCAS loop.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
static inline __device__ void atomicAdd(double *address, double val) {
  unsigned long long* address_as_ull =
      reinterpret_cast<unsigned long long*>(address);
  unsigned long long old = *address_as_ull;
  unsigned long long assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val +
                                         __longlong_as_double(assumed)));
    // Uses integer comparison to avoid a hang when the value is NaN
    // (since NaN != NaN in floating-point comparison).
  } while (assumed != old);
}
#endif  // defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
// Overload atomicAdd for half precision. There is no 16-bit CAS, so the
// enclosing 4-byte-aligned word is updated with atomicCAS while the other
// 16 bits of the word are preserved.
#if defined(__CUDA_ARCH__)
static inline __device__ void atomicAdd(mshadow::half::half_t *address,
                                        mshadow::half::half_t val) {
  // Round the address down to the containing 4-byte-aligned word.
  unsigned int *address_as_ui =
      reinterpret_cast<unsigned int *>(reinterpret_cast<char *>(address) -
                                       (reinterpret_cast<size_t>(address) & 2));
  unsigned int old = *address_as_ui;
  unsigned int assumed;
  do {
    assumed = old;
    mshadow::half::half_t hsum;
    // Pull out the 16 bits belonging to *address (high or low half-word).
    hsum.half_ =
        reinterpret_cast<size_t>(address) & 2 ? (old >> 16) : (old & 0xffff);
    // NOTE(review): this accumulate line was dropped by extraction and is
    // restored from upstream MXNet — confirm against the original header.
    hsum += val;
    // Splice the updated half back into its position in the 32-bit word.
    old = reinterpret_cast<size_t>(address) & 2
              ? (old & 0xffff) | (hsum.half_ << 16)
              : (old & 0xffff0000) | hsum.half_;
    old = atomicCAS(address_as_ui, assumed, old);
  } while (assumed != old);
}
#endif  // defined(__CUDA_ARCH__)
#if defined(__CUDA_ARCH__)
/*!
 * \brief Load through the read-only data cache when the architecture
 *        supports it (SM35+); plain dereference otherwise.
 * \param address device pointer to load from.
 * \return the loaded value.
 */
template <typename DType>
__device__ inline DType ldg(const DType* address) {
#if __CUDA_ARCH__ >= 350
  return __ldg(address);
#else
  return *address;
#endif
}
#endif  // defined(__CUDA_ARCH__)
432 #endif // MXNET_COMMON_CUDA_UTILS_H_ int ComputeCapabilityMajor(int device_id)
Determine the major version number of the GPU's CUDA compute architecture.
Definition: cuda_utils.h:245
namespace of mxnet
Definition: base.h:126
bool GetEnvAllowTensorCore()
Returns global policy for TensorCore algo use.
Definition: cuda_utils.h:306
int SMArch(int device_id)
Return the integer SM architecture (e.g. Volta = 70).
Definition: cuda_utils.h:269
DType __device__ CudaMin(DType a, DType b)
Definition: cuda_utils.h:162
bool SupportsFloat16Compute(int device_id)
Determine whether a CUDA-capable GPU's architecture supports float16 math.
Definition: cuda_utils.h:280
DType __device__ CudaMax(DType a, DType b)
Definition: cuda_utils.h:157
bool SupportsTensorCore(int device_id)
Determine whether a CUDA-capable GPU's architecture supports Tensor Core math.
Definition: cuda_utils.h:293
const char * CusolverGetErrorString(cusolverStatus_t error)
Get string representation of cuSOLVER errors.
Definition: cuda_utils.h:95
#define MXNET_CUDA_ALLOW_TENSOR_CORE_DEFAULT
Definition: cuda_utils.h:300
const char * CurandGetErrorString(curandStatus_t status)
Get string representation of cuRAND errors.
Definition: cuda_utils.h:124
int ComputeCapabilityMinor(int device_id)
Determine the minor version number of the GPU's CUDA compute architecture.
Definition: cuda_utils.h:257
#define CUDA_CALL(func)
Protected CUDA call.
Definition: cuda_utils.h:186
const char * CublasGetErrorString(cublasStatus_t error)
Get string representation of cuBLAS errors.
Definition: cuda_utils.h:64