mxnet
Classes | Namespaces | Macros | Functions | Variables
cuda_utils.h File Reference

Common CUDA utilities. More...

#include <dmlc/logging.h>
#include <dmlc/parameter.h>
#include <dmlc/optional.h>
#include <mshadow/base.h>
#include <mxnet/libinfo.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <curand.h>
#include <vector>
Include dependency graph for cuda_utils.h:

Go to the source code of this file.

Classes

struct  mxnet::common::cuda::CublasType< DType >
 Converts between C++ datatypes and enums/constants needed by cuBLAS. More...
 
struct  mxnet::common::cuda::CublasType< float >
 
struct  mxnet::common::cuda::CublasType< double >
 
struct  mxnet::common::cuda::CublasType< mshadow::half::half_t >
 
struct  mxnet::common::cuda::CublasType< uint8_t >
 
struct  mxnet::common::cuda::CublasType< int32_t >
 
class  mxnet::common::cuda::DeviceStore
 

Namespaces

 mxnet
 namespace of mxnet
 
 mxnet::common
 
 mxnet::common::cuda
 common utils for cuda
 

Macros

#define QUOTE(x)   #x
 Macros/inlines to assist CLion to parse Cuda files (*.cu, *.cuh) More...
 
#define QUOTEVALUE(x)   QUOTE(x)
 
#define STATIC_ASSERT_CUDA_VERSION_GE(min_version)
 
#define CHECK_CUDA_ERROR(msg)
 When compiling a device function, check that the architecture is >= Kepler (3.0) Note that __CUDA_ARCH__ is not defined outside of a device function. More...
 
#define CUDA_CALL(func)
 Protected CUDA call. More...
 
#define CUBLAS_CALL(func)
 Protected cuBLAS call. More...
 
#define CUSOLVER_CALL(func)
 Protected cuSolver call. More...
 
#define CURAND_CALL(func)
 Protected cuRAND call. More...
 
#define NVRTC_CALL(x)
 Protected NVRTC call. More...
 
#define CUDA_DRIVER_CALL(func)
 Protected CUDA driver call. More...
 
#define CUDA_UNROLL   _Pragma("unroll")
 
#define CUDA_NOUNROLL   _Pragma("nounroll")
 
#define MXNET_CUDA_ALLOW_TENSOR_CORE_DEFAULT   true
 
#define MXNET_CUDA_TENSOR_OP_MATH_ALLOW_CONVERSION_DEFAULT   false
 

Functions

const char * mxnet::common::cuda::CublasGetErrorString (cublasStatus_t error)
 Get string representation of cuBLAS errors. More...
 
const char * mxnet::common::cuda::CusolverGetErrorString (cusolverStatus_t error)
 Get string representation of cuSOLVER errors. More...
 
const char * mxnet::common::cuda::CurandGetErrorString (curandStatus_t status)
 Get string representation of cuRAND errors. More...
 
template<typename DType >
DType __device__ mxnet::common::cuda::CudaMax (DType a, DType b)
 
template<typename DType >
DType __device__ mxnet::common::cuda::CudaMin (DType a, DType b)
 
int mxnet::common::cuda::get_load_type (size_t N)
 Get the largest datatype suitable to read requested number of bytes. More...
 
int mxnet::common::cuda::get_rows_per_block (size_t row_size, int num_threads_per_block)
 Determine how many rows in a 2D matrix should a block of threads handle based on the row size and the number of threads in a block. More...
 
int cudaAttributeLookup (int device_id, std::vector< int32_t > *cached_values, cudaDeviceAttr attr, const char *attr_name)
 Return an attribute of GPU device_id. More...
 
int ComputeCapabilityMajor (int device_id)
 Determine major version number of the gpu's cuda compute architecture. More...
 
int ComputeCapabilityMinor (int device_id)
 Determine minor version number of the gpu's cuda compute architecture. More...
 
int SMArch (int device_id)
 Return the integer SM architecture (e.g. Volta = 70). More...
 
int MultiprocessorCount (int device_id)
 Return the number of streaming multiprocessors of GPU device_id. More...
 
int MaxSharedMemoryPerMultiprocessor (int device_id)
 Return the shared memory size in bytes of each of the GPU's streaming multiprocessors. More...
 
bool SupportsCooperativeLaunch (int device_id)
 Return whether the GPU device_id supports cooperative-group kernel launching. More...
 
bool SupportsFloat16Compute (int device_id)
 Determine whether a cuda-capable gpu's architecture supports float16 math. Assume not if device_id is negative. More...
 
bool SupportsTensorCore (int device_id)
 Determine whether a cuda-capable gpu's architecture supports Tensor Core math. Assume not if device_id is negative. More...
 
bool GetEnvAllowTensorCore ()
 Returns global policy for TensorCore algo use. More...
 
bool GetEnvAllowTensorCoreConversion ()
 Returns global policy for TensorCore implicit type casting. More...
 

Variables

constexpr size_t kMaxNumGpus = 64
 Maximum number of GPUs. More...
 

Detailed Description

Common CUDA utilities.

Copyright (c) 2015 by Contributors

Macro Definition Documentation

#define CHECK_CUDA_ERROR (   msg)
Value:
{ \
cudaError_t e = cudaGetLastError(); \
CHECK_EQ(e, cudaSuccess) << (msg) << " CUDA: " << cudaGetErrorString(e); \
}

When compiling a device function, check that the architecture is >= Kepler (3.0). Note that __CUDA_ARCH__ is not defined outside of a device function.

Check CUDA error.

Parameters
msg	Message to print if an error occurred.
#define CUBLAS_CALL (   func)
Value:
{ \
cublasStatus_t e = (func); \
CHECK_EQ(e, CUBLAS_STATUS_SUCCESS) \
}
const char * CublasGetErrorString(cublasStatus_t error)
Get string representation of cuBLAS errors.
Definition: cuda_utils.h:258

Protected cuBLAS call.

Parameters
func	Expression to call.

It checks for cuBLAS errors after invocation of the expression.

#define CUDA_CALL (   func)
Value:
{ \
cudaError_t e = (func); \
CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \
<< "CUDA: " << cudaGetErrorString(e); \
}

Protected CUDA call.

Parameters
func	Expression to call.

It checks for CUDA errors after invocation of the expression.

#define CUDA_DRIVER_CALL (   func)
Value:
{ \
CUresult e = (func); \
if (e != CUDA_SUCCESS) { \
char const * err_msg = nullptr; \
if (cuGetErrorString(e, &err_msg) == CUDA_ERROR_INVALID_VALUE) { \
LOG(FATAL) << "CUDA Driver: Unknown error " << e; \
} else { \
LOG(FATAL) << "CUDA Driver: " << err_msg; \
} \
} \
}

Protected CUDA driver call.

Parameters
func	Expression to call.

It checks for CUDA driver errors after invocation of the expression.

#define CUDA_NOUNROLL   _Pragma("nounroll")
#define CUDA_UNROLL   _Pragma("unroll")
#define CURAND_CALL (   func)
Value:
{ \
curandStatus_t e = (func); \
CHECK_EQ(e, CURAND_STATUS_SUCCESS) \
}
const char * CurandGetErrorString(curandStatus_t status)
Get string representation of cuRAND errors.
Definition: cuda_utils.h:329

Protected cuRAND call.

Parameters
func	Expression to call.

It checks for cuRAND errors after invocation of the expression.

#define CUSOLVER_CALL (   func)
Value:
{ \
cusolverStatus_t e = (func); \
CHECK_EQ(e, CUSOLVER_STATUS_SUCCESS) \
}
const char * CusolverGetErrorString(cusolverStatus_t error)
Get string representation of cuSOLVER errors.
Definition: cuda_utils.h:300

Protected cuSolver call.

Parameters
func	Expression to call.

It checks for cuSolver errors after invocation of the expression.

#define MXNET_CUDA_ALLOW_TENSOR_CORE_DEFAULT   true
#define MXNET_CUDA_TENSOR_OP_MATH_ALLOW_CONVERSION_DEFAULT   false
#define NVRTC_CALL (   x)
Value:
{ \
nvrtcResult result = x; \
CHECK_EQ(result, NVRTC_SUCCESS) \
<< #x " failed with error " \
<< nvrtcGetErrorString(result); \
}

Protected NVRTC call.

Parameters
x	Expression to call.

It checks for NVRTC errors after invocation of the expression.

#define QUOTE (   x)    #x

Macros/inlines to assist CLion to parse Cuda files (*.cu, *.cuh)

#define QUOTEVALUE (   x)    QUOTE(x)
#define STATIC_ASSERT_CUDA_VERSION_GE (   min_version)
Value:
static_assert(CUDA_VERSION >= min_version, "Compiled-against CUDA version " \
QUOTEVALUE(CUDA_VERSION) " is too old, please upgrade system to version " \
QUOTEVALUE(min_version) " or later.")
#define QUOTEVALUE(x)
Definition: cuda_utils.h:52

Function Documentation

int ComputeCapabilityMajor ( int  device_id)
inline

Determine major version number of the gpu's cuda compute architecture.

Parameters
device_id	The device index of the cuda-capable gpu of interest.
Returns
the major version number of the gpu's cuda compute architecture.
int ComputeCapabilityMinor ( int  device_id)
inline

Determine minor version number of the gpu's cuda compute architecture.

Parameters
device_id	The device index of the cuda-capable gpu of interest.
Returns
the minor version number of the gpu's cuda compute architecture.
int cudaAttributeLookup ( int  device_id,
std::vector< int32_t > *  cached_values,
cudaDeviceAttr  attr,
const char *  attr_name 
)
inline

Return an attribute of GPU device_id.

Parameters
device_id	The device index of the cuda-capable gpu of interest.
cached_values	An array of attributes for already-looked-up GPUs.
attr	The attribute, by number.
attr_name	A string representation of the attribute, for error messages.
Returns
the gpu's attribute value.
bool GetEnvAllowTensorCore ( )
inline

Returns global policy for TensorCore algo use.

Returns
whether to allow TensorCore algo (if not specified by the Operator locally).
bool GetEnvAllowTensorCoreConversion ( )
inline

Returns global policy for TensorCore implicit type casting.

int MaxSharedMemoryPerMultiprocessor ( int  device_id)
inline

Return the shared memory size in bytes of each of the GPU's streaming multiprocessors.

Parameters
device_id	The device index of the cuda-capable gpu of interest.
Returns
the shared memory size per streaming multiprocessor.
int MultiprocessorCount ( int  device_id)
inline

Return the number of streaming multiprocessors of GPU device_id.

Parameters
device_id	The device index of the cuda-capable gpu of interest.
Returns
the gpu's count of streaming multiprocessors.
int SMArch ( int  device_id)
inline

Return the integer SM architecture (e.g. Volta = 70).

Parameters
device_id	The device index of the cuda-capable gpu of interest.
Returns
the gpu's cuda compute architecture as an int.
bool SupportsCooperativeLaunch ( int  device_id)
inline

Return whether the GPU device_id supports cooperative-group kernel launching.

Parameters
device_id	The device index of the cuda-capable gpu of interest.
Returns
the gpu's ability to run cooperative-group kernels.
bool SupportsFloat16Compute ( int  device_id)
inline

Determine whether a cuda-capable gpu's architecture supports float16 math. Assume not if device_id is negative.

Parameters
device_id	The device index of the cuda-capable gpu of interest.
Returns
whether the gpu's architecture supports float16 math.
bool SupportsTensorCore ( int  device_id)
inline

Determine whether a cuda-capable gpu's architecture supports Tensor Core math. Assume not if device_id is negative.

Parameters
device_id	The device index of the cuda-capable gpu of interest.
Returns
whether the gpu's architecture supports Tensor Core math.

Variable Documentation

constexpr size_t kMaxNumGpus = 64

Maximum number of GPUs.