mxnet
Classes | Namespaces | Macros | Functions | Variables
cuda_utils.h File Reference

Common CUDA utilities. More...

#include <dmlc/logging.h>
#include <dmlc/parameter.h>
#include <dmlc/optional.h>
#include <mshadow/base.h>
#include <mxnet/libinfo.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <curand.h>
#include <vector>
Include dependency graph for cuda_utils.h:

Go to the source code of this file.

Classes

struct  mxnet::common::cuda::CublasType< DType >
 Converts between C++ datatypes and enums/constants needed by cuBLAS. More...
 
struct  mxnet::common::cuda::CublasType< float >
 
struct  mxnet::common::cuda::CublasType< double >
 
struct  mxnet::common::cuda::CublasType< mshadow::half::half_t >
 
struct  mxnet::common::cuda::CublasType< uint8_t >
 
struct  mxnet::common::cuda::CublasType< int32_t >
 
class  mxnet::common::cuda::DeviceStore
 

Namespaces

 mxnet
 namespace of mxnet
 
 mxnet::common
 
 mxnet::common::cuda
 common utils for cuda
 

Macros

#define QUOTE(x)   #x
 Macros/inlines to assist CLion to parse Cuda files (*.cu, *.cuh) More...
 
#define QUOTEVALUE(x)   QUOTE(x)
 
#define STATIC_ASSERT_CUDA_VERSION_GE(min_version)
 
#define CHECK_CUDA_ERROR(msg)
 When compiling a device function, check that the architecture is >= Kepler (3.0) Note that __CUDA_ARCH__ is not defined outside of a device function. More...
 
#define CUDA_CALL(func)
 Protected CUDA call. More...
 
#define CUBLAS_CALL(func)
 Protected cuBLAS call. More...
 
#define CUSOLVER_CALL(func)
 Protected cuSolver call. More...
 
#define CURAND_CALL(func)
 Protected cuRAND call. More...
 
#define NVRTC_CALL(x)
 Protected NVRTC call. More...
 
#define CUDA_DRIVER_CALL(func)
 Protected CUDA driver call. More...
 
#define CUDA_UNROLL   _Pragma("unroll")
 
#define CUDA_NOUNROLL   _Pragma("nounroll")
 
#define MXNET_CUDA_ALLOW_TENSOR_CORE_DEFAULT   true
 
#define MXNET_CUDA_TENSOR_OP_MATH_ALLOW_CONVERSION_DEFAULT   false
 

Functions

const char * mxnet::common::cuda::CublasGetErrorString (cublasStatus_t error)
 Get string representation of cuBLAS errors. More...
 
const char * mxnet::common::cuda::CusolverGetErrorString (cusolverStatus_t error)
 Get string representation of cuSOLVER errors. More...
 
const char * mxnet::common::cuda::CurandGetErrorString (curandStatus_t status)
 Get string representation of cuRAND errors. More...
 
template<typename DType >
DType __device__ mxnet::common::cuda::CudaMax (DType a, DType b)
 
template<typename DType >
DType __device__ mxnet::common::cuda::CudaMin (DType a, DType b)
 
int mxnet::common::cuda::get_load_type (size_t N)
 Get the largest datatype suitable to read requested number of bytes. More...
 
int mxnet::common::cuda::get_rows_per_block (size_t row_size, int num_threads_per_block)
 Determine how many rows in a 2D matrix should a block of threads handle based on the row size and the number of threads in a block. More...
 
int cudaAttributeLookup (int device_id, std::vector< int32_t > *cached_values, cudaDeviceAttr attr, const char *attr_name)
 Return an attribute of GPU device_id. More...
 
int ComputeCapabilityMajor (int device_id)
 Determine major version number of the gpu's cuda compute architecture. More...
 
int ComputeCapabilityMinor (int device_id)
 Determine minor version number of the gpu's cuda compute architecture. More...
 
int SMArch (int device_id)
 Return the integer SM architecture (e.g. Volta = 70). More...
 
int MultiprocessorCount (int device_id)
 Return the number of streaming multiprocessors of GPU device_id. More...
 
int MaxSharedMemoryPerMultiprocessor (int device_id)
 Return the shared memory size in bytes of each of the GPU's streaming multiprocessors. More...
 
bool SupportsCooperativeLaunch (int device_id)
 Return whether the GPU device_id supports cooperative-group kernel launching. More...
 
bool SupportsFloat16Compute (int device_id)
 Determine whether a cuda-capable gpu's architecture supports float16 math. Assume not if device_id is negative. More...
 
bool SupportsTensorCore (int device_id)
 Determine whether a cuda-capable gpu's architecture supports Tensor Core math. Assume not if device_id is negative. More...
 
bool GetEnvAllowTensorCore ()
 Returns global policy for TensorCore algo use. More...
 
bool GetEnvAllowTensorCoreConversion ()
 Returns global policy for TensorCore implicit type casting. More...
 

Variables

constexpr size_t kMaxNumGpus = 64
 Maximum number of GPUs. More...
 

Detailed Description

Common CUDA utilities.

Copyright (c) 2015 by Contributors

Macro Definition Documentation

#define CHECK_CUDA_ERROR (   msg)
Value:
{ \
cudaError_t e = cudaGetLastError(); \
CHECK_EQ(e, cudaSuccess) << (msg) << " CUDA: " << cudaGetErrorString(e); \
}

When compiling a device function, check that the architecture is >= Kepler (3.0). Note that __CUDA_ARCH__ is not defined outside of a device function.

Check CUDA error.

Parameters
msg	Message to print if an error occurred.
#define CUBLAS_CALL (   func)
Value:
{ \
cublasStatus_t e = (func); \
CHECK_EQ(e, CUBLAS_STATUS_SUCCESS) \
}
const char * CublasGetErrorString(cublasStatus_t error)
Get string representation of cuBLAS errors.
Definition: cuda_utils.h:258

Protected cuBLAS call.

Parameters
func	Expression to call.

It checks for cuBLAS errors after invocation of the expression.

#define CUDA_CALL (   func)
Value:
{ \
cudaError_t e = (func); \
CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \
<< "CUDA: " << cudaGetErrorString(e); \
}

Protected CUDA call.

Parameters
func	Expression to call.

It checks for CUDA errors after invocation of the expression.

#define CUDA_DRIVER_CALL (   func)
Value:
{ \
CUresult e = (func); \
if (e != CUDA_SUCCESS) { \
char const * err_msg = nullptr; \
if (cuGetErrorString(e, &err_msg) == CUDA_ERROR_INVALID_VALUE) { \
LOG(FATAL) << "CUDA Driver: Unknown error " << e; \
} else { \
LOG(FATAL) << "CUDA Driver: " << err_msg; \
} \
} \
}

Protected CUDA driver call.

Parameters
func	Expression to call.

It checks for CUDA driver errors after invocation of the expression.

#define CUDA_NOUNROLL   _Pragma("nounroll")
#define CUDA_UNROLL   _Pragma("unroll")
#define CURAND_CALL (   func)
Value:
{ \
curandStatus_t e = (func); \
CHECK_EQ(e, CURAND_STATUS_SUCCESS) \
}
const char * CurandGetErrorString(curandStatus_t status)
Get string representation of cuRAND errors.
Definition: cuda_utils.h:329

Protected cuRAND call.

Parameters
func	Expression to call.

It checks for cuRAND errors after invocation of the expression.

#define CUSOLVER_CALL (   func)
Value:
{ \
cusolverStatus_t e = (func); \
CHECK_EQ(e, CUSOLVER_STATUS_SUCCESS) \
}
const char * CusolverGetErrorString(cusolverStatus_t error)
Get string representation of cuSOLVER errors.
Definition: cuda_utils.h:300

Protected cuSolver call.

Parameters
func	Expression to call.

It checks for cuSolver errors after invocation of the expression.

#define MXNET_CUDA_ALLOW_TENSOR_CORE_DEFAULT   true
#define MXNET_CUDA_TENSOR_OP_MATH_ALLOW_CONVERSION_DEFAULT   false
#define NVRTC_CALL (   x)
Value:
{ \
nvrtcResult result = x; \
CHECK_EQ(result, NVRTC_SUCCESS) \
<< #x " failed with error " \
<< nvrtcGetErrorString(result); \
}

Protected NVRTC call.

Parameters
x	Expression to call.

It checks for NVRTC errors after invocation of the expression.

#define QUOTE (   x)    #x

Macros/inlines to assist CLion to parse Cuda files (*.cu, *.cuh)

#define QUOTEVALUE (   x)    QUOTE(x)
#define STATIC_ASSERT_CUDA_VERSION_GE (   min_version)
Value:
static_assert(CUDA_VERSION >= min_version, "Compiled-against CUDA version " \
QUOTEVALUE(CUDA_VERSION) " is too old, please upgrade system to version " \
QUOTEVALUE(min_version) " or later.")
#define QUOTEVALUE(x)
Definition: cuda_utils.h:52

Function Documentation

int ComputeCapabilityMajor ( int  device_id)
inline

Determine major version number of the gpu's cuda compute architecture.

Parameters
device_id	The device index of the cuda-capable gpu of interest.
Returns
the major version number of the gpu's cuda compute architecture.
int ComputeCapabilityMinor ( int  device_id)
inline

Determine minor version number of the gpu's cuda compute architecture.

Parameters
device_id	The device index of the cuda-capable gpu of interest.
Returns
the minor version number of the gpu's cuda compute architecture.
int cudaAttributeLookup ( int  device_id,
std::vector< int32_t > *  cached_values,
cudaDeviceAttr  attr,
const char *  attr_name 
)
inline

Return an attribute of GPU device_id.

Parameters
device_id	The device index of the cuda-capable gpu of interest.
cached_values	An array of attributes for already-looked-up GPUs.
attr	The attribute, by number.
attr_name	A string representation of the attribute, for error messages.
Returns
the gpu's attribute value.
bool GetEnvAllowTensorCore ( )
inline

Returns global policy for TensorCore algo use.

Returns
whether to allow TensorCore algo (if not specified by the Operator locally).
bool GetEnvAllowTensorCoreConversion ( )
inline

Returns global policy for TensorCore implicit type casting.

int MaxSharedMemoryPerMultiprocessor ( int  device_id)
inline

Return the shared memory size in bytes of each of the GPU's streaming multiprocessors.

Parameters
device_id	The device index of the cuda-capable gpu of interest.
Returns
the shared memory size per streaming multiprocessor.
int MultiprocessorCount ( int  device_id)
inline

Return the number of streaming multiprocessors of GPU device_id.

Parameters
device_id	The device index of the cuda-capable gpu of interest.
Returns
the gpu's count of streaming multiprocessors.
int SMArch ( int  device_id)
inline

Return the integer SM architecture (e.g. Volta = 70).

Parameters
device_id	The device index of the cuda-capable gpu of interest.
Returns
the gpu's cuda compute architecture as an int.
bool SupportsCooperativeLaunch ( int  device_id)
inline

Return whether the GPU device_id supports cooperative-group kernel launching.

Parameters
device_id	The device index of the cuda-capable gpu of interest.
Returns
the gpu's ability to run cooperative-group kernels.
bool SupportsFloat16Compute ( int  device_id)
inline

Determine whether a cuda-capable gpu's architecture supports float16 math. Assume not if device_id is negative.

Parameters
device_id	The device index of the cuda-capable gpu of interest.
Returns
whether the gpu's architecture supports float16 math.
bool SupportsTensorCore ( int  device_id)
inline

Determine whether a cuda-capable gpu's architecture supports Tensor Core math. Assume not if device_id is negative.

Parameters
device_id	The device index of the cuda-capable gpu of interest.
Returns
whether the gpu's architecture supports Tensor Core math.

Variable Documentation

constexpr size_t kMaxNumGpus = 64

Maximum number of GPUs.