Namespaces
	util

Functions
template<typename Params >
void	VectorizedKernelRTCLauncher (const std::string &parameters, const std::string &kernel_name, const std::string &code, int nvec, const index_t lead_dim, const index_t other_dim, mshadow::Stream< gpu > *s, const Params params, const std::vector< TBlob > &inputs, const std::vector< TBlob > &outputs, const int dev_id, const int lead_input_num=0, const index_t blocks=0)
	Launcher helper for the kernels using vectorization. More...

int	GetMaxSupportedArch ()

CUfunction	get_function (const std::string &parameters, const std::string &kernel_name, const std::string &code, int dev_id)
	Compile and get the GPU kernel. Uses cache in order to eliminate the overhead of compilation. More...

void	launch (CUfunction function, const dim3 grid_dim, const dim3 block_dim, unsigned int shared_mem_bytes, mshadow::Stream< gpu > *stream, std::vector< const void * > *args)
	Launch a GPU kernel. More...

Variables
const char	backward_function_definitions []

const char	grad_function_definitions []

const char	function_definitions_util []

const char	function_definitions_binary []

const char	function_definitions_unary []

const char	fp16_support_string []

const char	reducer []

const char	logic_reducer []

const char	special_functions_definitions []

const char	type_support_string []

const char	util_string []

const char	limits []

const char	vectorization_support_string []

std::mutex	lock

Function Documentation

◆ get_function()

CUfunction mxnet::common::cuda::rtc::get_function	(	const std::string &	parameters,
		const std::string &	kernel_name,
		const std::string &	code,
		int	dev_id
	)

Compile and get the GPU kernel. Uses cache in order to eliminate the overhead of compilation.

Parameters

parameters	of the kernel (e.g. values of the template arguments, types used)
kernel_name	name of the kernel
code	used for compilation of the kernel if not found in cache
dev_id	id of the device which the kernel will be launched on

◆ GetMaxSupportedArch()

int mxnet::common::cuda::rtc::GetMaxSupportedArch ( )

◆ launch()

void mxnet::common::cuda::rtc::launch	(	CUfunction	function,
		const dim3	grid_dim,
		const dim3	block_dim,
		unsigned int	shared_mem_bytes,
		mshadow::Stream< gpu > *	stream,
		std::vector< const void * > *	args
	)

Launch a GPU kernel.

Parameters

function	to launch
grid_dim	grid dimensions
block_dim	block dimensions
shared_mem_bytes	amount of dynamic shared memory needed by the kernel
stream	used for launching the kernel
args	arguments of the kernel

◆ VectorizedKernelRTCLauncher()

template<typename Params >

void mxnet::common::cuda::rtc::VectorizedKernelRTCLauncher	(	const std::string &	parameters,
		const std::string &	kernel_name,
		const std::string &	code,
		int	nvec,
		const index_t	lead_dim,
		const index_t	other_dim,
		mshadow::Stream< gpu > *	s,
		const Params	params,
		const std::vector< TBlob > &	inputs,
		const std::vector< TBlob > &	outputs,
		const int	dev_id,
		const int	lead_input_num = `0`,
		const index_t	blocks = `0`
	)

Launcher helper for the kernels using vectorization.

Parameters

parameters	of the kernel (e.g. values of the template arguments)
kernel_name	name of the kernel
code	used for compilation of the kernel if not found in cache
nvec	length of the vector used for loading/storing data
lead_dim	size of leading dimension of the tensors
other_dim	maximum of the total size of all the other dimensions of the tensors
s	stream used to launch the kernel
inputs	to the kernel
outputs	of the kernel
dev_id	id of the device which the kernel will be launched on
lead_input_num	number of the input to use for checking alignment (in case only a subset of the inputs is vectorized). Default is 0.
blocks	if provided and not 0, will launch the specified number of thread blocks. Default is 0.

Variable Documentation

◆ backward_function_definitions

const char mxnet::common::cuda::rtc::backward_function_definitions[]

◆ fp16_support_string

const char mxnet::common::cuda::rtc::fp16_support_string[]

◆ function_definitions_binary

const char mxnet::common::cuda::rtc::function_definitions_binary[]

◆ function_definitions_unary

const char mxnet::common::cuda::rtc::function_definitions_unary[]

◆ function_definitions_util

const char mxnet::common::cuda::rtc::function_definitions_util[]

◆ grad_function_definitions

const char mxnet::common::cuda::rtc::grad_function_definitions[]

◆ limits

const char mxnet::common::cuda::rtc::limits[]

◆ lock

std::mutex mxnet::common::cuda::rtc::lock

◆ logic_reducer

const char mxnet::common::cuda::rtc::logic_reducer[]

◆ reducer

const char mxnet::common::cuda::rtc::reducer[]

◆ special_functions_definitions

const char mxnet::common::cuda::rtc::special_functions_definitions[]

◆ type_support_string

const char mxnet::common::cuda::rtc::type_support_string[]

◆ util_string

const char mxnet::common::cuda::rtc::util_string[]

◆ vectorization_support_string

const char mxnet::common::cuda::rtc::vectorization_support_string[]

Namespaces

Functions

Variables

Function Documentation

◆ get_function()

◆ GetMaxSupportedArch()

◆ launch()

◆ VectorizedKernelRTCLauncher()

Variable Documentation

◆ backward_function_definitions

◆ fp16_support_string

◆ function_definitions_binary

◆ function_definitions_unary

◆ function_definitions_util

◆ grad_function_definitions

◆ limits

◆ lock

◆ logic_reducer

◆ reducer

◆ special_functions_definitions

◆ type_support_string

◆ util_string

◆ vectorization_support_string