26 #ifndef MSHADOW_TENSOR_GPU_INL_H_ 27 #define MSHADOW_TENSOR_GPU_INL_H_ 38 cudaGetDeviceCount(&device_count);
39 CHECK_GT(device_count, 0) <<
"Cannot find CUDA device. Please check CUDA-Configuration";
45 CHECK_LT(device_id, device_count) <<
"Incorrect Device ID";
56 template<
int dim,
typename DType>
62 obj->
size(dim - 1) *
sizeof(DType),
63 obj->
shape_.FlatTo2D()[0]));
68 obj->
shape_.Size() *
sizeof(DType), 1));
71 template<
int dim,
typename DType>
76 template<
typename A,
typename B,
int dim,
typename DType>
81 CHECK_EQ(_dst.
shape_, _src.
shape_) <<
"Copy:shape mismatch";
86 dst.
size(1) *
sizeof(DType),
94 template<
int dim,
typename DType>
98 Copy(dst, src, cudaMemcpyDeviceToHost, stream);
100 template<
int dim,
typename DType>
104 Copy(dst, src, cudaMemcpyDeviceToDevice, stream);
106 template<
int dim,
typename DType>
110 Copy(dst, src, cudaMemcpyHostToDevice, stream);
112 #endif // MSHADOW_USE_CUDA 117 #include "./cuda/tensor_gpu-inl.cuh" 120 template<
typename Saver,
typename R,
int dim,
121 typename DType,
typename E,
int etype>
125 ::Error_All_Tensor_in_Exp_Must_Have_Same_Type();
128 CHECK(eshape[0] == 0 || eshape == dshape)
129 <<
"Assignment: Shape of Tensors are not consistent with target, " 130 <<
"eshape: " << eshape <<
" dshape:" << dshape;
137 template<
typename Saver,
typename Reducer,
138 typename R,
typename DType,
typename E,
int etype>
143 ::Error_TypeCheck_Not_Pass_For_Reduce_Exp();
145 ::Check(exp.
self()).FlatTo2D();
147 CHECK_EQ(eshape[1], dshape[0]) <<
"MapReduceKeepLowest::reduction dimension do not match";
148 CHECK_NE(eshape[0], 0U) <<
"can not reduce over empty tensor";
149 cuda::MapReduceKeepLowest<Saver, Reducer>
154 template<
typename Saver,
typename Reducer,
int dimkeep,
155 typename R,
typename DType,
typename E,
int etype>
160 ::Error_TypeCheck_Not_Pass_For_Reduce_Exp();
165 CHECK_EQ(eshape[dimkeep], dshape[0]) <<
"MapReduceKeepHighDim::reduction dimension do not match";
169 eshape.ProdShape(dimkeep + 1, EShape::kSubdim),
170 eshape[EShape::kSubdim]);
172 cuda::MapReduceKeepDim1<Saver, Reducer>
176 template<
typename DType>
182 template<
typename DType>
188 template<
typename DType>
195 template<
typename DType>
203 template<
typename DType>
207 const DType &ignore_label) {
211 template<
typename DType>
215 const DType &ignore_label,
220 template<
typename DType>
227 template<
typename DType>
231 const DType &ignore_label) {
235 template<
bool clip,
typename IndexType,
typename DType>
239 cuda::AddTakeGrad<clip, IndexType, DType>(dst, index, src);
242 template<
typename IndexType,
typename DType>
250 template<
typename KDType,
typename VDType>
256 template<
typename IndexType,
typename DType>
264 #endif // MSHADOW_TENSOR_GPU_INL_H_ void FreeSpace(Tensor< cpu, dim, DType > *obj)
CPU/GPU: free the space of the tensor; this will set obj.dptr to NULL.
Definition: tensor_cpu-inl.h:141
void IndexFill(Tensor< cpu, 2, DType > dst, const Tensor< cpu, 1, IndexType > &index, const Tensor< cpu, 2, DType > &src)
CPU/GPU: Fill the values of the destination matrix to specific rows in the source matrix...
Definition: tensor_cpu-inl.h:548
void SoftmaxGrad(Tensor< cpu, 2, DType > dst, const Tensor< cpu, 2, DType > &src, const Tensor< cpu, 1, DType > &label)
CPU/GPU: softmax gradient.
Definition: tensor_cpu-inl.h:307
void SmoothSoftmaxGrad(Tensor< cpu, 2, DType > dst, const Tensor< cpu, 2, DType > &src, const Tensor< cpu, 1, DType > &label, const float alpha)
Definition: tensor_cpu-inl.h:324
PaddingExp< SrcExp, DType, ExpInfo< SrcExp >::kDim > pad(const Exp< SrcExp, DType, etype > &src, index_t pad)
padding expression: pad an image with zeros on the boundaries; padding affects shape[0] and shape[1]
Definition: pad.h:72
DType * dptr_
pointer to the data
Definition: tensor.h:435
Tensor RValue, this is the super type of all kinds of possible tensors.
Definition: tensor.h:410
used to help static type check
Definition: expr_engine-inl.h:331
void Copy(Tensor< cpu, dim, DType > dst, const Tensor< cpu, dim, DType > &src, Stream< cpu > *stream=NULL)
copy data from one tensor to another, with same shape
Definition: tensor_cpu-inl.h:146
void MapExp(TRValue< R, cpu, dim, DType > *dst, const expr::Exp< E, DType, etype > &exp)
CPU/GPU: map an expression to a tensor; this function calls MapPlan.
Definition: tensor_cpu-inl.h:208
Definition: stream_gpu-inl.h:38
Shape< dimension > shape_
shape of the tensor
Definition: tensor.h:437
MSHADOW_XINLINE Shape< 4 > Shape4(index_t s0, index_t s1, index_t s2, index_t s3)
construct a four-dimensional shape; stride will equal s0
Definition: tensor.h:241
void SortByKey(Tensor< cpu, 1, KDType > keys, Tensor< cpu, 1, VDType > values, bool is_ascend=true)
CPU/GPU: Sort key-value pairs stored in separate places. (Stable sort is performed!) ...
Definition: tensor_cpu-inl.h:559
void Softmax(Tensor< cpu, 2, DType > dst, const Tensor< cpu, 2, DType > &energy)
CPU/GPU: normalize softmax: dst[i][j] = exp(energy[i][j]) /(sum_j exp(energy[i][j])) ...
Definition: tensor_cpu-inl.h:484
#define MSHADOW_CUDA_CALL(func)
Protected cuda call in mshadow.
Definition: base.h:271
void MapReduceKeepLowest(TRValue< R, cpu, 1, DType > *dst, const expr::Exp< E, DType, etype > &exp, DType scale=1)
CPU/GPU: map an expression, then reduce to a 1D Tensor in the lowest dimension (dimension 0) ...
Definition: tensor_cpu-inl.h:224
static Shape< dim > Check(const E &t)
header file of tensor data structure and functions. This lib requires explicit memory allocation and d...
MSHADOW_XINLINE Tensor< Device, 2, DType > FlatTo2D(void) const
flatten the tensor to 2 dimension, collapse the higher dimensions together
Definition: tensor.h:520
Definition: expr_engine-inl.h:346
int32_t index_t
type that will be used for index
Definition: base.h:336
void AllocSpace(Tensor< cpu, dim, DType > *obj, bool pad=MSHADOW_ALLOC_PAD)
CPU/GPU: allocate space for CTensor, according to the shape in the obj; this function is responsible t...
Definition: tensor_cpu-inl.h:117
void ShutdownTensorEngine< gpu >(void)
Definition: tensor_gpu-inl.h:50
void AddTakeGradLargeBatch(Tensor< cpu, 2, DType > dst, const Tensor< cpu, 1, IndexType > &sorted, const Tensor< cpu, 1, IndexType > &index, const Tensor< cpu, 2, DType > &src)
CPU/GPU: Gradient accumulation of the embedding matrix: dst[sorted[i]] += src[index[i]]. Called when the bat...
Definition: tensor_cpu-inl.h:538
runtime shape checking template: get the shape of an expression, report an error if the shapes mismatch ...
Definition: expr_engine-inl.h:365
void InitTensorEngine< gpu >(int dev_id)
Definition: tensor_gpu-inl.h:34
void MapReduceKeepHighDim(TRValue< R, cpu, 1, DType > *dst, const expr::Exp< E, DType, etype > &exp, DType scale=1)
CPU/GPU: map an expression, then reduce to a 1D Tensor in the third dimension (dimension 2) ...
Definition: tensor_cpu-inl.h:251
defines how expression exp can be evaluated and stored into dst
Definition: expression.h:80
const SubType & self(void) const
Definition: expression.h:83
Plan< BinaryMapExp< OP, TA, TB, DType, etype >, DType > MakePlan(const BinaryMapExp< OP, TA, TB, DType, etype > &e)
Definition: expr_engine-inl.h:240
void SetDevice< gpu >(int devid)
Definition: tensor_gpu-inl.h:53
void AddTakeGrad(Tensor< cpu, 2, DType > dst, const Tensor< cpu, 1, IndexType > &index, const Tensor< cpu, 2, DType > &src)
CPU/GPU: Gradient accumulation of the embedding matrix: dst[index[i]] += src[i]. Called when the featuredim ...
Definition: tensor_cpu-inl.h:517
overloaded + operator between half_t and bf16_t
Definition: base.h:327
MSHADOW_XINLINE index_t size(int idx) const
return size of i-th dimension, start counting from highest dimension
Definition: tensor.h:506
index_t stride_
stores the stride information in the x dimension; this is used to deal with pitch allocation in gpu or ss...
Definition: tensor.h:442
general tensor
Definition: tensor.h:421
#define MSHADOW_MIN_PAD_RATIO
x dimension of data must be bigger than pad_size * ratio to be allocated padded memory; otherwise use tide a...
Definition: base.h:84
computation stream structure, used for asynchronous computations
Definition: tensor.h:384