#ifndef MXNET_COMMON_CUDA_UTILS_H_
#define MXNET_COMMON_CUDA_UTILS_H_
#include <dmlc/logging.h>
#include <dmlc/parameter.h>
#include <mshadow/base.h>

#include <vector>
/*! \brief Macros/inlines to assist CLion's parser with CUDA sources. */
#ifdef __JETBRAINS_IDE__
#define __CUDACC__ 1
#define __host__
#define __device__
#define __global__
#define __forceinline__
#define __shared__
inline void __syncthreads() {}
inline void __threadfence_block() {}
template <typename T>
inline T __clz(const T val) {
  return val;
}
struct __cuda_fake_struct {
  int x;
  int y;
  int z;
};
extern __cuda_fake_struct blockDim;
extern __cuda_fake_struct threadIdx;
extern __cuda_fake_struct blockIdx;
#endif  // __JETBRAINS_IDE__
#define QUOTE(x) #x
#define QUOTEVALUE(x) QUOTE(x)
#if MXNET_USE_CUDA

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusolverDn.h>
#include <curand.h>

#if MXNET_USE_NVML
#include <nvml.h>
#endif  // MXNET_USE_NVML
#define STATIC_ASSERT_CUDA_VERSION_GE(min_version)                            \
  static_assert(CUDA_VERSION >= min_version, "Compiled-against CUDA version " \
      QUOTEVALUE(CUDA_VERSION) " is too old, please upgrade system to version " \
      QUOTEVALUE(min_version) " or later.")
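
// Usage sketch (illustrative, not part of the original header): fail the build
// early when a feature needs a newer toolkit, e.g. cooperative kernel launch
// requires CUDA 9.0:
//   STATIC_ASSERT_CUDA_VERSION_GE(9000);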
/*! \brief Reject compilation of device code for unsupported GPU architectures. */
inline __device__ bool __is_supported_cuda_architecture() {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
#error "Fermi and earlier GPU architectures are not supported (architecture versions less than 3.0)"
  return false;
#else
  return true;
#endif  // __CUDA_ARCH__ < 300
}
#define CHECK_CUDA_ERROR(msg)                                                \
  {                                                                          \
    cudaError_t e = cudaGetLastError();                                      \
    CHECK_EQ(e, cudaSuccess) << (msg) << " CUDA: " << cudaGetErrorString(e); \
  }
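
// Usage sketch (illustrative; my_kernel is hypothetical): kernel launches do
// not return an error directly, so check the sticky error state afterwards:
//   my_kernel<<<grid, block>>>(args);
//   CHECK_CUDA_ERROR("Kernel 'my_kernel'");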
#define CUDA_CALL(func)                                                                            \
  {                                                                                                \
    cudaError_t e = (func);                                                                        \
    CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) << "CUDA: " << cudaGetErrorString(e); \
  }
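
// Usage sketch (illustrative): wrap every CUDA runtime call so a failure aborts
// with a readable message.
//   void* dptr = nullptr;
//   CUDA_CALL(cudaMalloc(&dptr, 1024));
//   CUDA_CALL(cudaMemset(dptr, 0, 1024));
// cudaErrorCudartUnloading is tolerated so calls made during process teardown
// (after the CUDA runtime has begun unloading) do not abort.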
#define CUBLAS_CALL(func)                                                  \
  {                                                                        \
    cublasStatus_t e = (func);                                             \
    CHECK_EQ(e, CUBLAS_STATUS_SUCCESS)                                     \
        << "cuBLAS: " << mxnet::common::cuda::CublasGetErrorString(e);     \
  }
#define CUSOLVER_CALL(func)                                                \
  {                                                                        \
    cusolverStatus_t e = (func);                                           \
    CHECK_EQ(e, CUSOLVER_STATUS_SUCCESS)                                   \
        << "cuSolver: " << mxnet::common::cuda::CusolverGetErrorString(e); \
  }
#define CURAND_CALL(func)                                                  \
  {                                                                        \
    curandStatus_t e = (func);                                             \
    CHECK_EQ(e, CURAND_STATUS_SUCCESS)                                     \
        << "cuRAND: " << mxnet::common::cuda::CurandGetErrorString(e);     \
  }
#define NVRTC_CALL(x)                                                                           \
  {                                                                                             \
    nvrtcResult result = x;                                                                     \
    CHECK_EQ(result, NVRTC_SUCCESS) << #x " failed with error " << nvrtcGetErrorString(result); \
  }
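
// Usage sketch (illustrative; kernel_src is hypothetical):
//   nvrtcProgram prog;
//   NVRTC_CALL(nvrtcCreateProgram(&prog, kernel_src, "op.cu", 0, nullptr, nullptr));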
#define CUDA_DRIVER_CALL(func)                                             \
  {                                                                        \
    CUresult e = (func);                                                   \
    if (e != CUDA_SUCCESS) {                                               \
      char const* err_msg = nullptr;                                       \
      if (cuGetErrorString(e, &err_msg) == CUDA_ERROR_INVALID_VALUE) {     \
        LOG(FATAL) << "CUDA Driver: Unknown error " << e;                  \
      } else {                                                             \
        LOG(FATAL) << "CUDA Driver: " << e << " " << err_msg;              \
      }                                                                    \
    }                                                                      \
  }
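
// Usage sketch (illustrative): guard CUDA Driver API (cu*) calls, e.g.
//   CUdevice dev;
//   CUDA_DRIVER_CALL(cuDeviceGet(&dev, 0));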
#if MXNET_USE_NVML
#define NVML_CALL(func)                                                                       \
  {                                                                                           \
    nvmlReturn_t result = (func);                                                             \
    CHECK_EQ(result, NVML_SUCCESS) << #func " failed with error " << nvmlErrorString(result); \
  }
#endif  // MXNET_USE_NVML
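
// Usage sketch (illustrative, only when built with MXNET_USE_NVML):
//   NVML_CALL(nvmlInit());
//   unsigned int device_count = 0;
//   NVML_CALL(nvmlDeviceGetCount(&device_count));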
#if !defined(_MSC_VER)
#define CUDA_UNROLL _Pragma("unroll")
#define CUDA_NOUNROLL _Pragma("nounroll")
#else
#define CUDA_UNROLL
#define CUDA_NOUNROLL
#endif
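
// Usage sketch (illustrative): portable loop-unrolling hint in device code.
//   CUDA_UNROLL
//   for (int i = 0; i < 4; ++i) {
//     acc += vals[i];
//   }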
namespace mxnet {
namespace common {
/*! \brief Common CUDA utilities. */
namespace cuda {

/*! \brief Converts between C++ datatypes and the enums/constants needed by cuBLAS. */
template <typename DType>
struct CublasType;

// With CUDA v8, cuBLAS adopted cudaDataType_t in place of its own datatype enum.
template <>
struct CublasType<float> {
#if CUDA_VERSION >= 8000
  static const cudaDataType_t kCudaFlag = CUDA_R_32F;
#endif
  static const float one;
  static const float zero;
};

template <>
struct CublasType<double> {
#if CUDA_VERSION >= 8000
  static const cudaDataType_t kCudaFlag = CUDA_R_64F;
#endif
  static const double one;
  static const double zero;
};

template <>
struct CublasType<mshadow::half::half_t> {
#if CUDA_VERSION >= 8000
  static const cudaDataType_t kCudaFlag = CUDA_R_16F;
#endif
  static const mshadow::half::half_t one;
  static const mshadow::half::half_t zero;
};

template <>
struct CublasType<uint8_t> {
#if CUDA_VERSION >= 8000
  static const cudaDataType_t kCudaFlag = CUDA_R_8I;
#endif
  static const uint8_t one = 1;
  static const uint8_t zero = 0;
};

template <>
struct CublasType<int32_t> {
#if CUDA_VERSION >= 8000
  static const cudaDataType_t kCudaFlag = CUDA_R_32I;
#endif
  static const int32_t one = 1;
  static const int32_t zero = 0;
};
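
// Usage sketch (illustrative, float-family DType, CUDA >= 8000): templated
// wrappers use these traits to pass the right datatype enum and scaling
// constants to cuBLAS, along the lines of
//   const float alpha = 1.0f, beta = 0.0f;  // compute type CUDA_R_32F
//   CUBLAS_CALL(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha,
//                            a, CublasType<DType>::kCudaFlag, m,
//                            b, CublasType<DType>::kCudaFlag, k, &beta,
//                            c, CublasType<DType>::kCudaFlag, m,
//                            CUDA_R_32F, CUBLAS_GEMM_DEFAULT));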
/*!
 * \brief Get string representation of cuBLAS errors.
 * \param error The error.
 * \return String representation.
 */
inline const char* CublasGetErrorString(cublasStatus_t error) {
  switch (error) {
    case CUBLAS_STATUS_SUCCESS:
      return "CUBLAS_STATUS_SUCCESS";
    case CUBLAS_STATUS_NOT_INITIALIZED:
      return "CUBLAS_STATUS_NOT_INITIALIZED";
    case CUBLAS_STATUS_ALLOC_FAILED:
      return "CUBLAS_STATUS_ALLOC_FAILED";
    case CUBLAS_STATUS_INVALID_VALUE:
      return "CUBLAS_STATUS_INVALID_VALUE";
    case CUBLAS_STATUS_ARCH_MISMATCH:
      return "CUBLAS_STATUS_ARCH_MISMATCH";
    case CUBLAS_STATUS_MAPPING_ERROR:
      return "CUBLAS_STATUS_MAPPING_ERROR";
    case CUBLAS_STATUS_EXECUTION_FAILED:
      return "CUBLAS_STATUS_EXECUTION_FAILED";
    case CUBLAS_STATUS_INTERNAL_ERROR:
      return "CUBLAS_STATUS_INTERNAL_ERROR";
    case CUBLAS_STATUS_NOT_SUPPORTED:
      return "CUBLAS_STATUS_NOT_SUPPORTED";
    default:
      break;
  }
  return "Unknown cuBLAS status";
}
#if CUDA_VERSION >= 8000
/*!
 * \brief Create the proper constant for indicating cuBLAS transposition, if desired.
 * \param transpose Whether transposition is desired.
 * \return the yes/no transposition-indicating constant.
 */
inline cublasOperation_t CublasTransposeOp(bool transpose) {
  return transpose ? CUBLAS_OP_T : CUBLAS_OP_N;
}
#endif  // CUDA_VERSION >= 8000
/*!
 * \brief Get string representation of cuSOLVER errors.
 * \param error The error.
 * \return String representation.
 */
inline const char* CusolverGetErrorString(cusolverStatus_t error) {
  switch (error) {
    case CUSOLVER_STATUS_SUCCESS:
      return "CUSOLVER_STATUS_SUCCESS";
    case CUSOLVER_STATUS_NOT_INITIALIZED:
      return "CUSOLVER_STATUS_NOT_INITIALIZED";
    case CUSOLVER_STATUS_ALLOC_FAILED:
      return "CUSOLVER_STATUS_ALLOC_FAILED";
    case CUSOLVER_STATUS_INVALID_VALUE:
      return "CUSOLVER_STATUS_INVALID_VALUE";
    case CUSOLVER_STATUS_ARCH_MISMATCH:
      return "CUSOLVER_STATUS_ARCH_MISMATCH";
    case CUSOLVER_STATUS_EXECUTION_FAILED:
      return "CUSOLVER_STATUS_EXECUTION_FAILED";
    case CUSOLVER_STATUS_INTERNAL_ERROR:
      return "CUSOLVER_STATUS_INTERNAL_ERROR";
    case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
      return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
    default:
      break;
  }
  return "Unknown cuSOLVER status";
}
/*!
 * \brief Get string representation of cuRAND errors.
 * \param status The status.
 * \return String representation.
 */
inline const char* CurandGetErrorString(curandStatus_t status) {
  switch (status) {
    case CURAND_STATUS_SUCCESS:
      return "CURAND_STATUS_SUCCESS";
    case CURAND_STATUS_VERSION_MISMATCH:
      return "CURAND_STATUS_VERSION_MISMATCH";
    case CURAND_STATUS_NOT_INITIALIZED:
      return "CURAND_STATUS_NOT_INITIALIZED";
    case CURAND_STATUS_ALLOCATION_FAILED:
      return "CURAND_STATUS_ALLOCATION_FAILED";
    case CURAND_STATUS_TYPE_ERROR:
      return "CURAND_STATUS_TYPE_ERROR";
    case CURAND_STATUS_OUT_OF_RANGE:
      return "CURAND_STATUS_OUT_OF_RANGE";
    case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
      return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
    case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
      return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
    case CURAND_STATUS_LAUNCH_FAILURE:
      return "CURAND_STATUS_LAUNCH_FAILURE";
    case CURAND_STATUS_PREEXISTING_FAILURE:
      return "CURAND_STATUS_PREEXISTING_FAILURE";
    case CURAND_STATUS_INITIALIZATION_FAILED:
      return "CURAND_STATUS_INITIALIZATION_FAILED";
    case CURAND_STATUS_ARCH_MISMATCH:
      return "CURAND_STATUS_ARCH_MISMATCH";
    case CURAND_STATUS_INTERNAL_ERROR:
      return "CURAND_STATUS_INTERNAL_ERROR";
  }
  return "Unknown cuRAND status";
}
/*! \brief Maximum of two values on the device. */
template <typename DType>
inline DType __device__ CudaMax(DType a, DType b) {
  return a > b ? a : b;
}

/*! \brief Minimum of two values on the device. */
template <typename DType>
inline DType __device__ CudaMin(DType a, DType b) {
  return a < b ? a : b;
}
/*! \brief RAII helper: sets the current device, optionally restoring the previous one. */
class DeviceStore {
 public:
  /*! \brief Default constructor: only optionally restores the previous device. */
  explicit DeviceStore(int requested_device = -1, bool restore = true)
      : restore_device_(-1), current_device_(requested_device), restore_(restore) {
    if (restore_)
      CUDA_CALL(cudaGetDevice(&restore_device_));
    if (requested_device != restore_device_) {
      SetDevice(requested_device);
    }
  }

  ~DeviceStore() {
    if (restore_ && current_device_ != restore_device_ && current_device_ != -1 &&
        restore_device_ != -1)
      CUDA_CALL(cudaSetDevice(restore_device_));
  }

  void SetDevice(int device) {
    if (device != -1) {
      CUDA_CALL(cudaSetDevice(device));
      current_device_ = device;
    }
  }

 private:
  int restore_device_;
  int current_device_;
  bool restore_;
};
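
// Usage sketch (illustrative): make device 1 current for the enclosing scope,
// restoring whichever device was current beforehand.
//   {
//     DeviceStore device_guard(1);
//     CUDA_CALL(cudaMalloc(&dptr, nbytes));  // happens on device 1
//   }  // previous device restored when device_guard is destroyed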
/*! \brief Maximum number of GPUs per node (assumed bound used to size the attribute caches). */
constexpr size_t kMaxNumGpus = 64;

/*!
 * \brief Return a cached GPU attribute, querying the device only on first use.
 * \param device_id The device index of the cuda-capable gpu of interest.
 * \param cached_values Per-device cache of already-looked-up attribute values.
 * \param attr The attribute, by number.
 * \param attr_name A string representation of the attribute, for error messages.
 * \return the gpu's attribute value.
 */
inline int cudaAttributeLookup(int device_id,
                               std::vector<int32_t>* cached_values,
                               cudaDeviceAttr attr,
                               const char* attr_name) {
  if (device_id < 0 || device_id >= static_cast<int>(cached_values->size())) {
    LOG(FATAL) << attr_name << "(device_id) called with invalid id: " << device_id;
  } else if ((*cached_values)[device_id] < 0) {
    int temp = -1;
    CUDA_CALL(cudaDeviceGetAttribute(&temp, attr, device_id));
    (*cached_values)[device_id] = static_cast<int32_t>(temp);
  }
  return (*cached_values)[device_id];
}
/*! \brief Determine the major version number of the gpu's cuda compute architecture. */
inline int ComputeCapabilityMajor(int device_id) {
  static std::vector<int32_t> capability_major(kMaxNumGpus, -1);
  return cudaAttributeLookup(
      device_id, &capability_major, cudaDevAttrComputeCapabilityMajor, "ComputeCapabilityMajor");
}
/*! \brief Determine the minor version number of the gpu's cuda compute architecture. */
inline int ComputeCapabilityMinor(int device_id) {
  static std::vector<int32_t> capability_minor(kMaxNumGpus, -1);
  return cudaAttributeLookup(
      device_id, &capability_minor, cudaDevAttrComputeCapabilityMinor, "ComputeCapabilityMinor");
}
/*! \brief Return the integer SM architecture (e.g. Volta = 70). */
inline int SMArch(int device_id) {
  auto major = ComputeCapabilityMajor(device_id);
  auto minor = ComputeCapabilityMinor(device_id);
  return 10 * major + minor;
}
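
// Example (illustrative): a Volta V100 (compute capability 7.0) yields
// SMArch(device_id) == 70; a Turing T4 (7.5) yields 75.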
/*! \brief Return the number of streaming multiprocessors of GPU `device_id`. */
inline int MultiprocessorCount(int device_id) {
  static std::vector<int32_t> sm_counts(kMaxNumGpus, -1);
  return cudaAttributeLookup(
      device_id, &sm_counts, cudaDevAttrMultiProcessorCount, "MultiprocessorCount");
}
/*! \brief Return the shared memory size in bytes of each streaming multiprocessor. */
inline int MaxSharedMemoryPerMultiprocessor(int device_id) {
  static std::vector<int32_t> max_smem_per_multiprocessor(kMaxNumGpus, -1);
  return cudaAttributeLookup(device_id,
                             &max_smem_per_multiprocessor,
                             cudaDevAttrMaxSharedMemoryPerMultiprocessor,
                             "MaxSharedMemoryPerMultiprocessor");
}
/*! \brief Return whether the GPU `device_id` supports cooperative-group kernel launch. */
inline bool SupportsCooperativeLaunch(int device_id) {
  static std::vector<int32_t> coop_launch(kMaxNumGpus, -1);
  return cudaAttributeLookup(
      device_id, &coop_launch, cudaDevAttrCooperativeLaunch, "SupportsCooperativeLaunch");
}
/*!
 * \brief Determine whether a cuda-capable gpu's architecture supports float16 math.
 *        Assume false if device_id is negative.
 */
inline bool SupportsFloat16Compute(int device_id) {
  if (device_id < 0) {
    return false;
  } else {
    // Kepler and most Maxwell GPUs do not support fp16 compute
    int computeCapabilityMajor = ComputeCapabilityMajor(device_id);
    return (computeCapabilityMajor > 5) ||
           (computeCapabilityMajor == 5 && ComputeCapabilityMinor(device_id) >= 3);
  }
}
#define MXNET_CUDA_ALLOW_TENSOR_CORE_DEFAULT true

/*! \brief Returns the global policy for TensorCore algo use. */
inline bool GetEnvAllowTensorCore() {
  // Since these statics are in the '.h' file, they will exist and will be set
  // separately in each compilation unit.  Not ideal, but cleaner than creating a
  // cuda_utils.cc solely to have a single instance and initialization.
  static bool allow_tensor_core = false;
  static bool is_set = false;
  if (!is_set) {
    bool default_value = MXNET_CUDA_ALLOW_TENSOR_CORE_DEFAULT;
    allow_tensor_core = dmlc::GetEnv("MXNET_CUDA_ALLOW_TENSOR_CORE", default_value);
    is_set = true;
  }
  return allow_tensor_core;
}
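
// Usage sketch (illustrative): callers consult the policy before opting in to
// TensorCore-enabled algos, e.g.
//   cublasMath_t desired = GetEnvAllowTensorCore() ? CUBLAS_TENSOR_OP_MATH
//                                                  : CUBLAS_DEFAULT_MATH;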
#define MXNET_CUDA_TENSOR_OP_MATH_ALLOW_CONVERSION_DEFAULT false

/*! \brief Returns the global policy for implicit float32-to-float16 TensorCore conversion. */
inline bool GetEnvAllowTensorCoreConversion() {
  return dmlc::GetEnv("MXNET_CUDA_TENSOR_OP_MATH_ALLOW_CONVERSION",
                      MXNET_CUDA_TENSOR_OP_MATH_ALLOW_CONVERSION_DEFAULT);
}
#if CUDA_VERSION >= 9000
// Sets the cuBLAS math mode used by subsequent calls on the handle and returns
// the previous math mode so the caller can restore it.
inline cublasMath_t SetCublasMathMode(cublasHandle_t blas_handle, cublasMath_t new_math_type) {
  auto handle_math_mode = CUBLAS_DEFAULT_MATH;
  CUBLAS_CALL(cublasGetMathMode(blas_handle, &handle_math_mode));
  CUBLAS_CALL(cublasSetMathMode(blas_handle, new_math_type));
  return handle_math_mode;
}
#endif  // CUDA_VERSION >= 9000
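
// Usage sketch (illustrative): temporarily switch a handle to TensorCore math
// and restore the prior mode afterwards.
//   cublasMath_t saved = SetCublasMathMode(handle, CUBLAS_TENSOR_OP_MATH);
//   // ... cuBLAS calls that may use TensorCore algos ...
//   SetCublasMathMode(handle, saved);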
}  // namespace cuda
}  // namespace common
}  // namespace mxnet

#endif  // MXNET_USE_CUDA
#if MXNET_USE_CUDNN

#include <cudnn.h>

// Creating CUDNN_VERSION_AS_STRING this way avoids a static_assert message that
// would otherwise show the formula for CUDNN_VERSION rather than the version itself.
static_assert(CUDNN_PATCHLEVEL < 100 && CUDNN_MINOR < 10,
              "CUDNN_VERSION_AS_STRING macro assumptions violated.");
#if CUDNN_PATCHLEVEL >= 10
#define CUDNN_VERSION_AS_STRING \
  QUOTEVALUE(CUDNN_MAJOR)       \
  QUOTEVALUE(CUDNN_MINOR)       \
  QUOTEVALUE(CUDNN_PATCHLEVEL)
#else
#define CUDNN_VERSION_AS_STRING \
  QUOTEVALUE(CUDNN_MAJOR)       \
  QUOTEVALUE(CUDNN_MINOR)       \
  "0" QUOTEVALUE(CUDNN_PATCHLEVEL)
#endif
#define STATIC_ASSERT_CUDNN_VERSION_GE(min_version)                       \
  static_assert(                                                          \
      CUDNN_VERSION >= min_version,                                       \
      "Compiled-against cuDNN version " CUDNN_VERSION_AS_STRING           \
      " is too old, please upgrade system to version " QUOTEVALUE(min_version) " or later.")
#define CUDNN_CALL_S(f, s)                                       \
  {                                                              \
    cudnnStatus_t unclash_cxx_e = (f);                           \
    if (unclash_cxx_e != CUDNN_STATUS_SUCCESS)                   \
      LOG(s) << "cuDNN: " << cudnnGetErrorString(unclash_cxx_e); \
  }

#define CUDNN_CALL(f) CUDNN_CALL_S(f, FATAL)
#define CUDNN_CALL_NONFATAL(f) CUDNN_CALL_S(f, WARNING)
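
// Usage sketch (illustrative):
//   cudnnTensorDescriptor_t desc;
//   CUDNN_CALL(cudnnCreateTensorDescriptor(&desc));           // fatal on failure
//   CUDNN_CALL_NONFATAL(cudnnDestroyTensorDescriptor(desc));  // warn only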
#define CUTENSOR_CALL(func)                                                            \
  {                                                                                    \
    cutensorStatus_t e = (func);                                                       \
    CHECK_EQ(e, CUTENSOR_STATUS_SUCCESS) << "cuTensor: " << cutensorGetErrorString(e); \
  }
/*!
 * \brief Return the max number of perf structs that
 *        cudnnFindConvolutionForwardAlgorithm() may want to populate.
 */
inline int MaxForwardAlgos(cudnnHandle_t cudnn_handle) {
  STATIC_ASSERT_CUDNN_VERSION_GE(7000);
  int max_algos = 0;
  CUDNN_CALL(cudnnGetConvolutionForwardAlgorithmMaxCount(cudnn_handle, &max_algos));
  return max_algos;
}
/*!
 * \brief Return the max number of perf structs that
 *        cudnnFindConvolutionBackwardFilterAlgorithm() may want to populate.
 */
inline int MaxBackwardFilterAlgos(cudnnHandle_t cudnn_handle) {
  STATIC_ASSERT_CUDNN_VERSION_GE(7000);
  int max_algos = 0;
  CUDNN_CALL(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(cudnn_handle, &max_algos));
  return max_algos;
}
/*!
 * \brief Return the max number of perf structs that
 *        cudnnFindConvolutionBackwardDataAlgorithm() may want to populate.
 */
inline int MaxBackwardDataAlgos(cudnnHandle_t cudnn_handle) {
  STATIC_ASSERT_CUDNN_VERSION_GE(7000);
  int max_algos = 0;
  CUDNN_CALL(cudnnGetConvolutionBackwardDataAlgorithmMaxCount(cudnn_handle, &max_algos));
  return max_algos;
}
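
// Usage sketch (illustrative; the descriptors are hypothetical): size the
// perf-result buffer before an exhaustive cuDNN algo search.
//   int max_algos = MaxForwardAlgos(handle);
//   std::vector<cudnnConvolutionFwdAlgoPerf_t> results(max_algos);
//   int actual_count = 0;
//   CUDNN_CALL(cudnnFindConvolutionForwardAlgorithm(handle, x_desc, w_desc,
//       conv_desc, y_desc, max_algos, &actual_count, results.data()));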
#endif  // MXNET_USE_CUDNN
// The CUDA runtime provides atomicAdd(double) natively only from sm_60 (Pascal)
// on; supply the canonical compare-and-swap emulation for older architectures.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
static inline __device__ void atomicAdd(double* address, double val) {
  unsigned long long* address_as_ull = reinterpret_cast<unsigned long long*>(address);
  unsigned long long old = *address_as_ull;
  unsigned long long assumed;

  do {
    assumed = old;
    old = atomicCAS(
        address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
  } while (assumed != old);
}
#endif  // defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
#ifdef __CUDACC__
// Overload atomicAdd for half precision; the value is updated by a CAS on the
// aligned 32-bit word that contains the 16-bit half.
static inline __device__ void atomicAdd(mshadow::half::half_t* address,
                                        mshadow::half::half_t val) {
  unsigned int* address_as_ui = reinterpret_cast<unsigned int*>(
      reinterpret_cast<char*>(address) - (reinterpret_cast<size_t>(address) & 2));
  unsigned int old = *address_as_ui;
  unsigned int assumed;

  do {
    assumed = old;
    mshadow::half::half_t hsum;
    hsum.half_ = reinterpret_cast<size_t>(address) & 2 ? (old >> 16) : (old & 0xffff);
    hsum += val;
    old = reinterpret_cast<size_t>(address) & 2 ? (old & 0xffff) | (hsum.half_ << 16) :
                                                  (old & 0xffff0000) | hsum.half_;
    old = atomicCAS(address_as_ui, assumed, old);
  } while (assumed != old);
}
// Overload atomicAdd for uint8_t via a CAS on the containing aligned 32-bit word.
static inline __device__ void atomicAdd(uint8_t* address, uint8_t val) {
  unsigned int* address_as_ui = (unsigned int*)(address - ((size_t)address & 0x3));
  unsigned int old = *address_as_ui;
  unsigned int shift = (((size_t)address & 0x3) << 3);
  unsigned int sum;
  unsigned int assumed;

  do {
    assumed = old;
    sum = val + static_cast<uint8_t>((old >> shift) & 0xff);
    old = (old & ~(0x000000ff << shift)) | (sum << shift);
    old = atomicCAS(address_as_ui, assumed, old);
  } while (assumed != old);
}
// Overload atomicAdd for int8_t via a CAS on the containing aligned 32-bit word.
static inline __device__ void atomicAdd(int8_t* address, int8_t val) {
  unsigned int* address_as_ui = (unsigned int*)(address - ((size_t)address & 0x3));
  unsigned int old = *address_as_ui;
  unsigned int shift = (((size_t)address & 0x3) << 3);
  unsigned int sum;
  unsigned int assumed;

  do {
    assumed = old;
    sum = val + static_cast<int8_t>((old >> shift) & 0xff);
    old = (old & ~(0x000000ff << shift)) | (sum << shift);
    old = atomicCAS(address_as_ui, assumed, old);
  } while (assumed != old);
}
// Overload atomicAdd for int64_t, which CUDA's native atomics cover via
// the same-width unsigned long long.
static inline __device__ void atomicAdd(int64_t* address, int64_t val) {
  atomicAdd(reinterpret_cast<unsigned long long*>(address),
            static_cast<unsigned long long>(val));
}
// Read-only load through the texture cache where the architecture supports it.
template <typename DType>
__device__ inline DType ldg(const DType* address) {
#if __CUDA_ARCH__ >= 350
  return __ldg(address);
#else
  return *address;
#endif
}
static constexpr const int warp_size = 32;
/*! \brief Reduce a value across a warp using the supplied binary reduction functor. */
template <int NVALUES = warp_size, typename OP, typename T>
__device__ inline T warp_reduce(T value, OP redfun) {
#pragma unroll
  for (int i = warp_size / 2; i >= 1; i /= 2) {
    if (NVALUES > i)
      value = redfun(value, __shfl_down_sync(0xffffffff, value, i));
  }
  return value;
}
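
// Usage sketch (illustrative): sum a per-thread float across the warp; after
// the shuffle-down sequence, lane 0 holds the full warp sum.
//   float warp_sum = warp_reduce(thread_val, [](float a, float b) { return a + b; });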
/*! \brief All-reduce within groups of group_size consecutive lanes of a warp. */
template <typename OP, typename T>
__device__ inline T grouped_warp_allreduce(T value, OP redfun, const int group_size) {
  for (int i = 1; i < group_size; i *= 2) {
    value = redfun(value, __shfl_down_sync(0xffffffff, value, i));
  }
  return __shfl_sync(0xffffffff, value, 0, group_size);
}
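
// Usage sketch (illustrative): all-reduce within groups of 4 consecutive lanes;
// the final width-limited __shfl_sync broadcast leaves every lane of a group
// holding its group's sum.
//   float group_sum =
//       grouped_warp_allreduce(v, [](float a, float b) { return a + b; }, 4);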
/*! \brief Warp reduction for half precision, performed in float to preserve accuracy. */
template <int NValues = warp_size, typename OP>
__device__ inline mshadow::half::half_t warp_reduce(mshadow::half::half_t value, OP redfun) {
  float v = static_cast<float>(value);
#pragma unroll
  for (int i = warp_size / 2; i >= 1; i /= 2) {
    if (NValues > i)
      v = redfun(v, __shfl_down_sync(0xffffffff, v, i));
  }
  return mshadow::half::half_t(v);
}
/*!
 * \brief Block-wide reduction; every thread in the block must participate.
 *        If all_reduce is true, every thread receives the result; otherwise
 *        only thread 0 is guaranteed to hold it.
 */
template <int NTHREADS, bool all_reduce = true, typename OP, typename T>
__device__ inline T reduce(const T& value, OP redfun) {
  static_assert(NTHREADS <= warp_size * warp_size, "Number of threads too large for reduction");
  __shared__ T scratch[NTHREADS / warp_size];
  const int thread_idx_in_warp = threadIdx.x % warp_size;
  const int warp_id            = threadIdx.x / warp_size;
  const T my_val               = warp_reduce<warp_size>(value, redfun);
  if (thread_idx_in_warp == 0) {
    scratch[warp_id] = my_val;
  }
  __syncthreads();
  T ret = 0;
  if (warp_id == 0) {
    const T prev_val = threadIdx.x < (NTHREADS / warp_size) ? scratch[threadIdx.x] : 0;
    const T my_val   = warp_reduce<NTHREADS / warp_size>(prev_val, redfun);
    if (all_reduce) {
      scratch[threadIdx.x] = my_val;
    } else {
      ret = my_val;
    }
  }
  // Synchronize so the shared scratch space can be safely reused by a later call.
  __syncthreads();
  if (all_reduce) {
    ret = scratch[0];
    __syncthreads();
  }
  return ret;
}
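
// Usage sketch (illustrative): block-wide sum for a 128-thread block; with the
// default all_reduce = true, every thread receives the total.
//   const float total = reduce<128>(thread_val, [](float a, float b) { return a + b; });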
#endif  // __CUDACC__

#endif  // MXNET_COMMON_CUDA_UTILS_H_