#ifndef MXNET_COMMON_CUDA_UTILS_H_
#define MXNET_COMMON_CUDA_UTILS_H_
#include <dmlc/logging.h>
#include <dmlc/parameter.h>
#include <mshadow/base.h>

#include <vector>
/*! \brief Macros/inlines to assist CLion's parser with CUDA sources. */
#ifdef __JETBRAINS_IDE__
#define __CUDACC__ 1
#define __host__
#define __device__
#define __global__
#define __forceinline__
#define __shared__
inline void __syncthreads() {}
inline void __threadfence_block() {}
template <typename T>
inline T __clz(const T val) {
  return val;
}
struct __cuda_fake_struct {
  int x;
  int y;
  int z;
};
extern __cuda_fake_struct blockDim;
extern __cuda_fake_struct threadIdx;
extern __cuda_fake_struct blockIdx;
#endif  // __JETBRAINS_IDE__
#define QUOTE(x) #x
#define QUOTEVALUE(x) QUOTE(x)
#if MXNET_USE_CUDA

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusolverDn.h>
#include <curand.h>

#if MXNET_USE_NVML
#include <nvml.h>
#endif  // MXNET_USE_NVML
#define STATIC_ASSERT_CUDA_VERSION_GE(min_version)                            \
  static_assert(CUDA_VERSION >= min_version, "Compiled-against CUDA version " \
      QUOTEVALUE(CUDA_VERSION) " is too old, please upgrade system to version " \
      QUOTEVALUE(min_version) " or later.")
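
// Usage sketch (illustrative, not part of the original header): fail the build
// early when a feature needs a newer toolkit, e.g. cooperative kernel launch
// requires CUDA 9.0:
//   STATIC_ASSERT_CUDA_VERSION_GE(9000);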
/*! \brief Reject compilation of device code for unsupported GPU architectures. */
inline __device__ bool __is_supported_cuda_architecture() {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
#error "Fermi and earlier GPU architectures are not supported (architecture versions less than 3.0)"
  return false;
#else
  return true;
#endif  // __CUDA_ARCH__ < 300
}
#define CHECK_CUDA_ERROR(msg)                                                \
  {                                                                          \
    cudaError_t e = cudaGetLastError();                                      \
    CHECK_EQ(e, cudaSuccess) << (msg) << " CUDA: " << cudaGetErrorString(e); \
  }
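
// Usage sketch (illustrative; my_kernel is hypothetical): kernel launches do
// not return an error directly, so check the sticky error state afterwards:
//   my_kernel<<<grid, block>>>(args);
//   CHECK_CUDA_ERROR("Kernel 'my_kernel'");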
#define CUDA_CALL(func)                                                                            \
  {                                                                                                \
    cudaError_t e = (func);                                                                        \
    CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) << "CUDA: " << cudaGetErrorString(e); \
  }
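
// Usage sketch (illustrative): wrap every CUDA runtime call so a failure aborts
// with a readable message.
//   void* dptr = nullptr;
//   CUDA_CALL(cudaMalloc(&dptr, 1024));
//   CUDA_CALL(cudaMemset(dptr, 0, 1024));
// cudaErrorCudartUnloading is tolerated so calls made during process teardown
// (after the CUDA runtime has begun unloading) do not abort.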
#define CUBLAS_CALL(func)                                                  \
  {                                                                        \
    cublasStatus_t e = (func);                                             \
    CHECK_EQ(e, CUBLAS_STATUS_SUCCESS)                                     \
        << "cuBLAS: " << mxnet::common::cuda::CublasGetErrorString(e);     \
  }
#define CUSOLVER_CALL(func)                                                \
  {                                                                        \
    cusolverStatus_t e = (func);                                           \
    CHECK_EQ(e, CUSOLVER_STATUS_SUCCESS)                                   \
        << "cuSolver: " << mxnet::common::cuda::CusolverGetErrorString(e); \
  }
#define CURAND_CALL(func)                                                  \
  {                                                                        \
    curandStatus_t e = (func);                                             \
    CHECK_EQ(e, CURAND_STATUS_SUCCESS)                                     \
        << "cuRAND: " << mxnet::common::cuda::CurandGetErrorString(e);     \
  }
#define NVRTC_CALL(x)                                                                           \
  {                                                                                             \
    nvrtcResult result = x;                                                                     \
    CHECK_EQ(result, NVRTC_SUCCESS) << #x " failed with error " << nvrtcGetErrorString(result); \
  }
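
// Usage sketch (illustrative; kernel_src is hypothetical):
//   nvrtcProgram prog;
//   NVRTC_CALL(nvrtcCreateProgram(&prog, kernel_src, "op.cu", 0, nullptr, nullptr));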
#define CUDA_DRIVER_CALL(func)                                             \
  {                                                                        \
    CUresult e = (func);                                                   \
    if (e != CUDA_SUCCESS) {                                               \
      char const* err_msg = nullptr;                                       \
      if (cuGetErrorString(e, &err_msg) == CUDA_ERROR_INVALID_VALUE) {     \
        LOG(FATAL) << "CUDA Driver: Unknown error " << e;                  \
      } else {                                                             \
        LOG(FATAL) << "CUDA Driver: " << e << " " << err_msg;              \
      }                                                                    \
    }                                                                      \
  }
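
// Usage sketch (illustrative): guard CUDA Driver API (cu*) calls, e.g.
//   CUdevice dev;
//   CUDA_DRIVER_CALL(cuDeviceGet(&dev, 0));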
#if MXNET_USE_NVML
#define NVML_CALL(func)                                                                       \
  {                                                                                           \
    nvmlReturn_t result = (func);                                                             \
    CHECK_EQ(result, NVML_SUCCESS) << #func " failed with error " << nvmlErrorString(result); \
  }
#endif  // MXNET_USE_NVML
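
// Usage sketch (illustrative, only when built with MXNET_USE_NVML):
//   NVML_CALL(nvmlInit());
//   unsigned int device_count = 0;
//   NVML_CALL(nvmlDeviceGetCount(&device_count));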
#if !defined(_MSC_VER)
#define CUDA_UNROLL _Pragma("unroll")
#define CUDA_NOUNROLL _Pragma("nounroll")
#else
#define CUDA_UNROLL
#define CUDA_NOUNROLL
#endif
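
// Usage sketch (illustrative): portable loop-unrolling hint in device code.
//   CUDA_UNROLL
//   for (int i = 0; i < 4; ++i) {
//     acc += vals[i];
//   }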
namespace mxnet {
namespace common {
/*! \brief Common CUDA utilities. */
namespace cuda {

/*! \brief Converts between C++ datatypes and the enums/constants needed by cuBLAS. */
template <typename DType>
struct CublasType;

// With CUDA v8, cuBLAS adopted cudaDataType_t in place of its own datatype enum.
template <>
struct CublasType<float> {
#if CUDA_VERSION >= 8000
  static const cudaDataType_t kCudaFlag = CUDA_R_32F;
#endif
  static const float one;
  static const float zero;
};

template <>
struct CublasType<double> {
#if CUDA_VERSION >= 8000
  static const cudaDataType_t kCudaFlag = CUDA_R_64F;
#endif
  static const double one;
  static const double zero;
};

template <>
struct CublasType<mshadow::half::half_t> {
#if CUDA_VERSION >= 8000
  static const cudaDataType_t kCudaFlag = CUDA_R_16F;
#endif
  static const mshadow::half::half_t one;
  static const mshadow::half::half_t zero;
};

template <>
struct CublasType<uint8_t> {
#if CUDA_VERSION >= 8000
  static const cudaDataType_t kCudaFlag = CUDA_R_8I;
#endif
  static const uint8_t one = 1;
  static const uint8_t zero = 0;
};

template <>
struct CublasType<int32_t> {
#if CUDA_VERSION >= 8000
  static const cudaDataType_t kCudaFlag = CUDA_R_32I;
#endif
  static const int32_t one = 1;
  static const int32_t zero = 0;
};
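
// Usage sketch (illustrative, float-family DType, CUDA >= 8000): templated
// wrappers use these traits to pass the right datatype enum and scaling
// constants to cuBLAS, along the lines of
//   const float alpha = 1.0f, beta = 0.0f;  // compute type CUDA_R_32F
//   CUBLAS_CALL(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha,
//                            a, CublasType<DType>::kCudaFlag, m,
//                            b, CublasType<DType>::kCudaFlag, k, &beta,
//                            c, CublasType<DType>::kCudaFlag, m,
//                            CUDA_R_32F, CUBLAS_GEMM_DEFAULT));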
/*!
 * \brief Get string representation of cuBLAS errors.
 * \param error The error.
 * \return String representation.
 */
inline const char* CublasGetErrorString(cublasStatus_t error) {
  switch (error) {
    case CUBLAS_STATUS_SUCCESS:
      return "CUBLAS_STATUS_SUCCESS";
    case CUBLAS_STATUS_NOT_INITIALIZED:
      return "CUBLAS_STATUS_NOT_INITIALIZED";
    case CUBLAS_STATUS_ALLOC_FAILED:
      return "CUBLAS_STATUS_ALLOC_FAILED";
    case CUBLAS_STATUS_INVALID_VALUE:
      return "CUBLAS_STATUS_INVALID_VALUE";
    case CUBLAS_STATUS_ARCH_MISMATCH:
      return "CUBLAS_STATUS_ARCH_MISMATCH";
    case CUBLAS_STATUS_MAPPING_ERROR:
      return "CUBLAS_STATUS_MAPPING_ERROR";
    case CUBLAS_STATUS_EXECUTION_FAILED:
      return "CUBLAS_STATUS_EXECUTION_FAILED";
    case CUBLAS_STATUS_INTERNAL_ERROR:
      return "CUBLAS_STATUS_INTERNAL_ERROR";
    case CUBLAS_STATUS_NOT_SUPPORTED:
      return "CUBLAS_STATUS_NOT_SUPPORTED";
    default:
      break;
  }
  return "Unknown cuBLAS status";
}
#if CUDA_VERSION >= 8000
/*!
 * \brief Create the proper constant for indicating cuBLAS transposition, if desired.
 * \param transpose Whether transposition is desired.
 * \return the yes/no transposition-indicating constant.
 */
inline cublasOperation_t CublasTransposeOp(bool transpose) {
  return transpose ? CUBLAS_OP_T : CUBLAS_OP_N;
}
#endif  // CUDA_VERSION >= 8000
/*!
 * \brief Get string representation of cuSOLVER errors.
 * \param error The error.
 * \return String representation.
 */
inline const char* CusolverGetErrorString(cusolverStatus_t error) {
  switch (error) {
    case CUSOLVER_STATUS_SUCCESS:
      return "CUSOLVER_STATUS_SUCCESS";
    case CUSOLVER_STATUS_NOT_INITIALIZED:
      return "CUSOLVER_STATUS_NOT_INITIALIZED";
    case CUSOLVER_STATUS_ALLOC_FAILED:
      return "CUSOLVER_STATUS_ALLOC_FAILED";
    case CUSOLVER_STATUS_INVALID_VALUE:
      return "CUSOLVER_STATUS_INVALID_VALUE";
    case CUSOLVER_STATUS_ARCH_MISMATCH:
      return "CUSOLVER_STATUS_ARCH_MISMATCH";
    case CUSOLVER_STATUS_EXECUTION_FAILED:
      return "CUSOLVER_STATUS_EXECUTION_FAILED";
    case CUSOLVER_STATUS_INTERNAL_ERROR:
      return "CUSOLVER_STATUS_INTERNAL_ERROR";
    case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
      return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
    default:
      break;
  }
  return "Unknown cuSOLVER status";
}
/*!
 * \brief Get string representation of cuRAND errors.
 * \param status The status.
 * \return String representation.
 */
inline const char* CurandGetErrorString(curandStatus_t status) {
  switch (status) {
    case CURAND_STATUS_SUCCESS:
      return "CURAND_STATUS_SUCCESS";
    case CURAND_STATUS_VERSION_MISMATCH:
      return "CURAND_STATUS_VERSION_MISMATCH";
    case CURAND_STATUS_NOT_INITIALIZED:
      return "CURAND_STATUS_NOT_INITIALIZED";
    case CURAND_STATUS_ALLOCATION_FAILED:
      return "CURAND_STATUS_ALLOCATION_FAILED";
    case CURAND_STATUS_TYPE_ERROR:
      return "CURAND_STATUS_TYPE_ERROR";
    case CURAND_STATUS_OUT_OF_RANGE:
      return "CURAND_STATUS_OUT_OF_RANGE";
    case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
      return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
    case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
      return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
    case CURAND_STATUS_LAUNCH_FAILURE:
      return "CURAND_STATUS_LAUNCH_FAILURE";
    case CURAND_STATUS_PREEXISTING_FAILURE:
      return "CURAND_STATUS_PREEXISTING_FAILURE";
    case CURAND_STATUS_INITIALIZATION_FAILED:
      return "CURAND_STATUS_INITIALIZATION_FAILED";
    case CURAND_STATUS_ARCH_MISMATCH:
      return "CURAND_STATUS_ARCH_MISMATCH";
    case CURAND_STATUS_INTERNAL_ERROR:
      return "CURAND_STATUS_INTERNAL_ERROR";
  }
  return "Unknown cuRAND status";
}
/*! \brief Maximum of two values on the device. */
template <typename DType>
inline DType __device__ CudaMax(DType a, DType b) {
  return a > b ? a : b;
}

/*! \brief Minimum of two values on the device. */
template <typename DType>
inline DType __device__ CudaMin(DType a, DType b) {
  return a < b ? a : b;
}
/*! \brief RAII helper: sets the current device, optionally restoring the previous one. */
class DeviceStore {
 public:
  /*! \brief Default constructor: only optionally restores the previous device. */
  explicit DeviceStore(int requested_device = -1, bool restore = true)
      : restore_device_(-1), current_device_(requested_device), restore_(restore) {
    if (restore_)
      CUDA_CALL(cudaGetDevice(&restore_device_));
    if (requested_device != restore_device_) {
      SetDevice(requested_device);
    }
  }

  ~DeviceStore() {
    if (restore_ && current_device_ != restore_device_ && current_device_ != -1 &&
        restore_device_ != -1)
      CUDA_CALL(cudaSetDevice(restore_device_));
  }

  void SetDevice(int device) {
    if (device != -1) {
      CUDA_CALL(cudaSetDevice(device));
      current_device_ = device;
    }
  }

 private:
  int restore_device_;
  int current_device_;
  bool restore_;
};
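
// Usage sketch (illustrative): make device 1 current for the enclosing scope,
// restoring whichever device was current beforehand.
//   {
//     DeviceStore device_guard(1);
//     CUDA_CALL(cudaMalloc(&dptr, nbytes));  // happens on device 1
//   }  // previous device restored when device_guard is destroyed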
/*! \brief Maximum number of GPUs per node (assumed bound used to size the attribute caches). */
constexpr size_t kMaxNumGpus = 64;

/*!
 * \brief Return a cached GPU attribute, querying the device only on first use.
 * \param device_id The device index of the cuda-capable gpu of interest.
 * \param cached_values Per-device cache of already-looked-up attribute values.
 * \param attr The attribute, by number.
 * \param attr_name A string representation of the attribute, for error messages.
 * \return the gpu's attribute value.
 */
inline int cudaAttributeLookup(int device_id,
                               std::vector<int32_t>* cached_values,
                               cudaDeviceAttr attr,
                               const char* attr_name) {
  if (device_id < 0 || device_id >= static_cast<int>(cached_values->size())) {
    LOG(FATAL) << attr_name << "(device_id) called with invalid id: " << device_id;
  } else if ((*cached_values)[device_id] < 0) {
    int temp = -1;
    CUDA_CALL(cudaDeviceGetAttribute(&temp, attr, device_id));
    (*cached_values)[device_id] = static_cast<int32_t>(temp);
  }
  return (*cached_values)[device_id];
}
/*! \brief Determine the major version number of the gpu's cuda compute architecture. */
inline int ComputeCapabilityMajor(int device_id) {
  static std::vector<int32_t> capability_major(kMaxNumGpus, -1);
  return cudaAttributeLookup(
      device_id, &capability_major, cudaDevAttrComputeCapabilityMajor, "ComputeCapabilityMajor");
}
/*! \brief Determine the minor version number of the gpu's cuda compute architecture. */
inline int ComputeCapabilityMinor(int device_id) {
  static std::vector<int32_t> capability_minor(kMaxNumGpus, -1);
  return cudaAttributeLookup(
      device_id, &capability_minor, cudaDevAttrComputeCapabilityMinor, "ComputeCapabilityMinor");
}
/*! \brief Return the integer SM architecture (e.g. Volta = 70). */
inline int SMArch(int device_id) {
  auto major = ComputeCapabilityMajor(device_id);
  auto minor = ComputeCapabilityMinor(device_id);
  return 10 * major + minor;
}
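
// Example (illustrative): a Volta V100 (compute capability 7.0) yields
// SMArch(device_id) == 70; a Turing T4 (7.5) yields 75.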
/*! \brief Return the number of streaming multiprocessors of GPU `device_id`. */
inline int MultiprocessorCount(int device_id) {
  static std::vector<int32_t> sm_counts(kMaxNumGpus, -1);
  return cudaAttributeLookup(
      device_id, &sm_counts, cudaDevAttrMultiProcessorCount, "MultiprocessorCount");
}
/*! \brief Return the shared memory size in bytes of each streaming multiprocessor. */
inline int MaxSharedMemoryPerMultiprocessor(int device_id) {
  static std::vector<int32_t> max_smem_per_multiprocessor(kMaxNumGpus, -1);
  return cudaAttributeLookup(device_id,
                             &max_smem_per_multiprocessor,
                             cudaDevAttrMaxSharedMemoryPerMultiprocessor,
                             "MaxSharedMemoryPerMultiprocessor");
}
/*! \brief Return whether the GPU `device_id` supports cooperative-group kernel launch. */
inline bool SupportsCooperativeLaunch(int device_id) {
  static std::vector<int32_t> coop_launch(kMaxNumGpus, -1);
  return cudaAttributeLookup(
      device_id, &coop_launch, cudaDevAttrCooperativeLaunch, "SupportsCooperativeLaunch");
}
/*!
 * \brief Determine whether a cuda-capable gpu's architecture supports float16 math.
 *        Assume false if device_id is negative.
 */
inline bool SupportsFloat16Compute(int device_id) {
  if (device_id < 0) {
    return false;
  } else {
    // Kepler and most Maxwell GPUs do not support fp16 compute
    int computeCapabilityMajor = ComputeCapabilityMajor(device_id);
    return (computeCapabilityMajor > 5) ||
           (computeCapabilityMajor == 5 && ComputeCapabilityMinor(device_id) >= 3);
  }
}
#define MXNET_CUDA_ALLOW_TENSOR_CORE_DEFAULT true

/*! \brief Returns the global policy for TensorCore algo use. */
inline bool GetEnvAllowTensorCore() {
  // Since these statics are in the '.h' file, they will exist and will be set
  // separately in each compilation unit.  Not ideal, but cleaner than creating a
  // cuda_utils.cc solely to have a single instance and initialization.
  static bool allow_tensor_core = false;
  static bool is_set = false;
  if (!is_set) {
    bool default_value = MXNET_CUDA_ALLOW_TENSOR_CORE_DEFAULT;
    allow_tensor_core = dmlc::GetEnv("MXNET_CUDA_ALLOW_TENSOR_CORE", default_value);
    is_set = true;
  }
  return allow_tensor_core;
}
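
// Usage sketch (illustrative): callers consult the policy before opting in to
// TensorCore-enabled algos, e.g.
//   cublasMath_t desired = GetEnvAllowTensorCore() ? CUBLAS_TENSOR_OP_MATH
//                                                  : CUBLAS_DEFAULT_MATH;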
#define MXNET_CUDA_TENSOR_OP_MATH_ALLOW_CONVERSION_DEFAULT false

/*! \brief Returns the global policy for implicit float32-to-float16 TensorCore conversion. */
inline bool GetEnvAllowTensorCoreConversion() {
  return dmlc::GetEnv("MXNET_CUDA_TENSOR_OP_MATH_ALLOW_CONVERSION",
                      MXNET_CUDA_TENSOR_OP_MATH_ALLOW_CONVERSION_DEFAULT);
}
#if CUDA_VERSION >= 9000
// Sets the cuBLAS math mode used by subsequent calls on the handle and returns
// the previous math mode so the caller can restore it.
inline cublasMath_t SetCublasMathMode(cublasHandle_t blas_handle, cublasMath_t new_math_type) {
  auto handle_math_mode = CUBLAS_DEFAULT_MATH;
  CUBLAS_CALL(cublasGetMathMode(blas_handle, &handle_math_mode));
  CUBLAS_CALL(cublasSetMathMode(blas_handle, new_math_type));
  return handle_math_mode;
}
#endif  // CUDA_VERSION >= 9000
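
// Usage sketch (illustrative): temporarily switch a handle to TensorCore math
// and restore the prior mode afterwards.
//   cublasMath_t saved = SetCublasMathMode(handle, CUBLAS_TENSOR_OP_MATH);
//   // ... cuBLAS calls that may use TensorCore algos ...
//   SetCublasMathMode(handle, saved);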
}  // namespace cuda
}  // namespace common
}  // namespace mxnet

#endif  // MXNET_USE_CUDA
#if MXNET_USE_CUDNN

#include <cudnn.h>

// Creating CUDNN_VERSION_AS_STRING this way avoids a static_assert message that
// would otherwise show the formula for CUDNN_VERSION rather than the version itself.
static_assert(CUDNN_PATCHLEVEL < 100 && CUDNN_MINOR < 10,
              "CUDNN_VERSION_AS_STRING macro assumptions violated.");
#if CUDNN_PATCHLEVEL >= 10
#define CUDNN_VERSION_AS_STRING \
  QUOTEVALUE(CUDNN_MAJOR)       \
  QUOTEVALUE(CUDNN_MINOR)       \
  QUOTEVALUE(CUDNN_PATCHLEVEL)
#else
#define CUDNN_VERSION_AS_STRING \
  QUOTEVALUE(CUDNN_MAJOR)       \
  QUOTEVALUE(CUDNN_MINOR)       \
  "0" QUOTEVALUE(CUDNN_PATCHLEVEL)
#endif
#define STATIC_ASSERT_CUDNN_VERSION_GE(min_version)                       \
  static_assert(                                                          \
      CUDNN_VERSION >= min_version,                                       \
      "Compiled-against cuDNN version " CUDNN_VERSION_AS_STRING           \
      " is too old, please upgrade system to version " QUOTEVALUE(min_version) " or later.")
#define CUDNN_CALL_S(f, s)                                       \
  {                                                              \
    cudnnStatus_t unclash_cxx_e = (f);                           \
    if (unclash_cxx_e != CUDNN_STATUS_SUCCESS)                   \
      LOG(s) << "cuDNN: " << cudnnGetErrorString(unclash_cxx_e); \
  }

#define CUDNN_CALL(f) CUDNN_CALL_S(f, FATAL)
#define CUDNN_CALL_NONFATAL(f) CUDNN_CALL_S(f, WARNING)
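
// Usage sketch (illustrative):
//   cudnnTensorDescriptor_t desc;
//   CUDNN_CALL(cudnnCreateTensorDescriptor(&desc));           // fatal on failure
//   CUDNN_CALL_NONFATAL(cudnnDestroyTensorDescriptor(desc));  // warn only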
#define CUTENSOR_CALL(func)                                                            \
  {                                                                                    \
    cutensorStatus_t e = (func);                                                       \
    CHECK_EQ(e, CUTENSOR_STATUS_SUCCESS) << "cuTensor: " << cutensorGetErrorString(e); \
  }
/*!
 * \brief Return the max number of perf structs that
 *        cudnnFindConvolutionForwardAlgorithm() may want to populate.
 */
inline int MaxForwardAlgos(cudnnHandle_t cudnn_handle) {
  STATIC_ASSERT_CUDNN_VERSION_GE(7000);
  int max_algos = 0;
  CUDNN_CALL(cudnnGetConvolutionForwardAlgorithmMaxCount(cudnn_handle, &max_algos));
  return max_algos;
}
/*!
 * \brief Return the max number of perf structs that
 *        cudnnFindConvolutionBackwardFilterAlgorithm() may want to populate.
 */
inline int MaxBackwardFilterAlgos(cudnnHandle_t cudnn_handle) {
  STATIC_ASSERT_CUDNN_VERSION_GE(7000);
  int max_algos = 0;
  CUDNN_CALL(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(cudnn_handle, &max_algos));
  return max_algos;
}
/*!
 * \brief Return the max number of perf structs that
 *        cudnnFindConvolutionBackwardDataAlgorithm() may want to populate.
 */
inline int MaxBackwardDataAlgos(cudnnHandle_t cudnn_handle) {
  STATIC_ASSERT_CUDNN_VERSION_GE(7000);
  int max_algos = 0;
  CUDNN_CALL(cudnnGetConvolutionBackwardDataAlgorithmMaxCount(cudnn_handle, &max_algos));
  return max_algos;
}
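
// Usage sketch (illustrative; the descriptors are hypothetical): size the
// perf-result buffer before an exhaustive cuDNN algo search.
//   int max_algos = MaxForwardAlgos(handle);
//   std::vector<cudnnConvolutionFwdAlgoPerf_t> results(max_algos);
//   int actual_count = 0;
//   CUDNN_CALL(cudnnFindConvolutionForwardAlgorithm(handle, x_desc, w_desc,
//       conv_desc, y_desc, max_algos, &actual_count, results.data()));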
#endif  // MXNET_USE_CUDNN
// The CUDA runtime provides atomicAdd(double) natively only from sm_60 (Pascal)
// on; supply the canonical compare-and-swap emulation for older architectures.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
static inline __device__ void atomicAdd(double* address, double val) {
  unsigned long long* address_as_ull = reinterpret_cast<unsigned long long*>(address);
  unsigned long long old = *address_as_ull;
  unsigned long long assumed;

  do {
    assumed = old;
    old = atomicCAS(
        address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
  } while (assumed != old);
}
#endif  // defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
#ifdef __CUDACC__
// Overload atomicAdd for half precision; the value is updated by a CAS on the
// aligned 32-bit word that contains the 16-bit half.
static inline __device__ void atomicAdd(mshadow::half::half_t* address,
                                        mshadow::half::half_t val) {
  unsigned int* address_as_ui = reinterpret_cast<unsigned int*>(
      reinterpret_cast<char*>(address) - (reinterpret_cast<size_t>(address) & 2));
  unsigned int old = *address_as_ui;
  unsigned int assumed;

  do {
    assumed = old;
    mshadow::half::half_t hsum;
    hsum.half_ = reinterpret_cast<size_t>(address) & 2 ? (old >> 16) : (old & 0xffff);
    hsum += val;
    old = reinterpret_cast<size_t>(address) & 2 ? (old & 0xffff) | (hsum.half_ << 16) :
                                                  (old & 0xffff0000) | hsum.half_;
    old = atomicCAS(address_as_ui, assumed, old);
  } while (assumed != old);
}
// Overload atomicAdd for uint8_t via a CAS on the containing aligned 32-bit word.
static inline __device__ void atomicAdd(uint8_t* address, uint8_t val) {
  unsigned int* address_as_ui = (unsigned int*)(address - ((size_t)address & 0x3));
  unsigned int old = *address_as_ui;
  unsigned int shift = (((size_t)address & 0x3) << 3);
  unsigned int sum;
  unsigned int assumed;

  do {
    assumed = old;
    sum = val + static_cast<uint8_t>((old >> shift) & 0xff);
    old = (old & ~(0x000000ff << shift)) | (sum << shift);
    old = atomicCAS(address_as_ui, assumed, old);
  } while (assumed != old);
}
// Overload atomicAdd for int8_t via a CAS on the containing aligned 32-bit word.
static inline __device__ void atomicAdd(int8_t* address, int8_t val) {
  unsigned int* address_as_ui = (unsigned int*)(address - ((size_t)address & 0x3));
  unsigned int old = *address_as_ui;
  unsigned int shift = (((size_t)address & 0x3) << 3);
  unsigned int sum;
  unsigned int assumed;

  do {
    assumed = old;
    sum = val + static_cast<int8_t>((old >> shift) & 0xff);
    old = (old & ~(0x000000ff << shift)) | (sum << shift);
    old = atomicCAS(address_as_ui, assumed, old);
  } while (assumed != old);
}
// Overload atomicAdd for int64_t, which CUDA's native atomics cover via
// the same-width unsigned long long.
static inline __device__ void atomicAdd(int64_t* address, int64_t val) {
  atomicAdd(reinterpret_cast<unsigned long long*>(address),
            static_cast<unsigned long long>(val));
}
// Read-only load through the texture cache where the architecture supports it.
template <typename DType>
__device__ inline DType ldg(const DType* address) {
#if __CUDA_ARCH__ >= 350
  return __ldg(address);
#else
  return *address;
#endif
}
static constexpr const int warp_size = 32;
/*! \brief Reduce a value across a warp using the supplied binary reduction functor. */
template <int NVALUES = warp_size, typename OP, typename T>
__device__ inline T warp_reduce(T value, OP redfun) {
#pragma unroll
  for (int i = warp_size / 2; i >= 1; i /= 2) {
    if (NVALUES > i)
      value = redfun(value, __shfl_down_sync(0xffffffff, value, i));
  }
  return value;
}
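
// Usage sketch (illustrative): sum a per-thread float across the warp; after
// the shuffle-down sequence, lane 0 holds the full warp sum.
//   float warp_sum = warp_reduce(thread_val, [](float a, float b) { return a + b; });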
/*! \brief All-reduce within groups of group_size consecutive lanes of a warp. */
template <typename OP, typename T>
__device__ inline T grouped_warp_allreduce(T value, OP redfun, const int group_size) {
  for (int i = 1; i < group_size; i *= 2) {
    value = redfun(value, __shfl_down_sync(0xffffffff, value, i));
  }
  return __shfl_sync(0xffffffff, value, 0, group_size);
}
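
// Usage sketch (illustrative): all-reduce within groups of 4 consecutive lanes;
// the final width-limited __shfl_sync broadcast leaves every lane of a group
// holding its group's sum.
//   float group_sum =
//       grouped_warp_allreduce(v, [](float a, float b) { return a + b; }, 4);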
/*! \brief Warp reduction for half precision, performed in float to preserve accuracy. */
template <int NValues = warp_size, typename OP>
__device__ inline mshadow::half::half_t warp_reduce(mshadow::half::half_t value, OP redfun) {
  float v = static_cast<float>(value);
#pragma unroll
  for (int i = warp_size / 2; i >= 1; i /= 2) {
    if (NValues > i)
      v = redfun(v, __shfl_down_sync(0xffffffff, v, i));
  }
  return mshadow::half::half_t(v);
}
/*!
 * \brief Block-wide reduction; every thread in the block must participate.
 *        If all_reduce is true, every thread receives the result; otherwise
 *        only thread 0 is guaranteed to hold it.
 */
template <int NTHREADS, bool all_reduce = true, typename OP, typename T>
__device__ inline T reduce(const T& value, OP redfun) {
  static_assert(NTHREADS <= warp_size * warp_size, "Number of threads too large for reduction");
  __shared__ T scratch[NTHREADS / warp_size];
  const int thread_idx_in_warp = threadIdx.x % warp_size;
  const int warp_id            = threadIdx.x / warp_size;
  const T my_val               = warp_reduce<warp_size>(value, redfun);
  if (thread_idx_in_warp == 0) {
    scratch[warp_id] = my_val;
  }
  __syncthreads();
  T ret = 0;
  if (warp_id == 0) {
    const T prev_val = threadIdx.x < (NTHREADS / warp_size) ? scratch[threadIdx.x] : 0;
    const T my_val   = warp_reduce<NTHREADS / warp_size>(prev_val, redfun);
    if (all_reduce) {
      scratch[threadIdx.x] = my_val;
    } else {
      ret = my_val;
    }
  }
  // Synchronize so the shared scratch space can be safely reused by a later call.
  __syncthreads();
  if (all_reduce) {
    ret = scratch[0];
    __syncthreads();
  }
  return ret;
}
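
// Usage sketch (illustrative): block-wide sum for a 128-thread block; with the
// default all_reduce = true, every thread receives the total.
//   const float total = reduce<128>(thread_val, [](float a, float b) { return a + b; });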
#endif  // __CUDACC__

#endif  // MXNET_COMMON_CUDA_UTILS_H_