mxnet
|
Namespaces | |
util | |
Functions | |
template<typename Params > | |
void | VectorizedKernelRTCLauncher (const std::string &parameters, const std::string &kernel_name, const std::string &code, int nvec, const index_t lead_dim, const index_t other_dim, mshadow::Stream< gpu > *s, const Params params, const std::vector< TBlob > &inputs, const std::vector< TBlob > &outputs, const int dev_id, const int lead_input_num=0, const index_t blocks=0) |
Launcher helper for the kernels using vectorization. More... | |
int | GetMaxSupportedArch () |
CUfunction | get_function (const std::string &parameters, const std::string &kernel_name, const std::string &code, int dev_id) |
Compile and get the GPU kernel. Uses cache in order to eliminate the overhead of compilation. More... | |
void | launch (CUfunction function, const dim3 grid_dim, const dim3 block_dim, unsigned int shared_mem_bytes, mshadow::Stream< gpu > *stream, std::vector< const void * > *args) |
Launch a GPU kernel. More... | |
Variables | |
const char | backward_function_definitions [] |
const char | grad_function_definitions [] |
const char | function_definitions_util [] |
const char | function_definitions_binary [] |
const char | function_definitions_unary [] |
const char | fp16_support_string [] |
const char | reducer [] |
const char | logic_reducer [] |
const char | special_functions_definitions [] |
const char | type_support_string [] |
const char | util_string [] |
const char | limits [] |
const char | vectorization_support_string [] |
std::mutex | lock |
CUfunction mxnet::common::cuda::rtc::get_function | ( | const std::string & | parameters, |
const std::string & | kernel_name, | ||
const std::string & | code, | ||
int | dev_id | ||
) |
Compile and get the GPU kernel. Uses cache in order to eliminate the overhead of compilation.
parameters | of the kernel (e.g. values of the template arguments, types used) |
kernel_name | name of the kernel |
code | used for compilation of the kernel if not found in cache |
dev_id | id of the device which the kernel will be launched on |
int mxnet::common::cuda::rtc::GetMaxSupportedArch | ( | ) |
void mxnet::common::cuda::rtc::launch | ( | CUfunction | function, |
const dim3 | grid_dim, | ||
const dim3 | block_dim, | ||
unsigned int | shared_mem_bytes, | ||
mshadow::Stream< gpu > * | stream, | ||
std::vector< const void * > * | args | ||
) |
Launch a GPU kernel.
function | to launch |
grid_dim | grid dimensions |
block_dim | block dimensions |
shared_mem_bytes | amount of dynamic shared memory needed by the kernel |
stream | used for launching the kernel |
args | arguments of the kernel |
void mxnet::common::cuda::rtc::VectorizedKernelRTCLauncher | ( | const std::string & | parameters, |
const std::string & | kernel_name, | ||
const std::string & | code, | ||
int | nvec, | ||
const index_t | lead_dim, | ||
const index_t | other_dim, | ||
mshadow::Stream< gpu > * | s, | ||
const Params | params, | ||
const std::vector< TBlob > & | inputs, | ||
const std::vector< TBlob > & | outputs, | ||
const int | dev_id, | ||
const int | lead_input_num = 0 , |
||
const index_t | blocks = 0 |
||
) |
Launcher helper for the kernels using vectorization.
parameters | of the kernel (e.g. values of the template arguments) |
kernel_name | name of the kernel |
code | used for compilation of the kernel if not found in cache |
nvec | length of the vector used for loading/storing data |
lead_dim | size of leading dimension of the tensors |
other_dim | maximum of the total size of all the other dimensions of the tensors |
s | stream used to launch the kernel |
inputs | to the kernel |
outputs | of the kernel |
dev_id | id of the device which the kernel will be launched on |
lead_input_num | number of the input to use for checking alignment (in case only a subset of the inputs is vectorized). Default is 0. |
blocks | if provided and not 0, will launch the specified number of thread blocks. Default is 0. |
const char mxnet::common::cuda::rtc::backward_function_definitions[] |
const char mxnet::common::cuda::rtc::fp16_support_string[] |
const char mxnet::common::cuda::rtc::function_definitions_binary[] |
const char mxnet::common::cuda::rtc::function_definitions_unary[] |
const char mxnet::common::cuda::rtc::function_definitions_util[] |
const char mxnet::common::cuda::rtc::grad_function_definitions[] |
const char mxnet::common::cuda::rtc::limits[] |
std::mutex mxnet::common::cuda::rtc::lock |
const char mxnet::common::cuda::rtc::logic_reducer[] |
const char mxnet::common::cuda::rtc::reducer[] |
const char mxnet::common::cuda::rtc::special_functions_definitions[] |
const char mxnet::common::cuda::rtc::type_support_string[] |
const char mxnet::common::cuda::rtc::util_string[] |
const char mxnet::common::cuda::rtc::vectorization_support_string[] |