Go to the documentation of this file.
25 #ifndef MSHADOW_TENSOR_CPU_INL_H_
26 #define MSHADOW_TENSOR_CPU_INL_H_
49 bool create_dnn_handle,
61 for (
int i = 0; i < ndim; ++i) {
62 if (i != 0) os <<
',';
66 if (ndim == 1) os <<
',';
71 template<
typename xpu>
73 template<
typename xpu>
78 inline void *AllocHost_<gpu>(
size_t size) {
84 inline void FreeHost_<gpu>(
void *dptr) {
99 template<
typename xpu,
int dim,
typename DType>
103 void *dptr = AllocHost_<xpu>(obj->
MSize() *
sizeof(DType));
104 obj->
dptr_ =
reinterpret_cast<DType*
>(dptr);
106 template<
typename xpu,
int dim,
typename DType>
108 if (obj->
dptr_ == NULL) {
109 LOG(FATAL) <<
"FreeHost:: double free";
111 FreeHost_<xpu>(obj->
dptr_);
115 template<
int dim,
typename DType>
121 (&pitch, obj->
size(dim - 1) *
sizeof(DType), obj->
shape_.FlatTo2D()[0]);
126 (&pitch, obj->
shape_.Size() *
sizeof(DType), 1);
128 obj->
dptr_ =
reinterpret_cast<DType*
>(dptr);
130 template<
typename Device,
typename DType,
int dim>
131 inline Tensor<Device, dim, DType>
139 template<
int dim,
typename DType>
144 template<
int dim,
typename DType>
148 #pragma GCC diagnostic push
150 #pragma GCC diagnostic ignored "-Wclass-memaccess"
153 <<
"Copy:shape mismatch:" << _dst.
shape_ <<
" vs " << _src.
shape_;
160 memcpy(dst[y].dptr_, src[y].dptr_,
sizeof(DType) * dst.
size(1));
163 #pragma GCC diagnostic pop
166 template<
typename Saver,
typename R,
int dim,
167 typename DType,
typename E>
173 #pragma omp parallel for
177 for (
index_t x = 0; x < shape[1]; ++x) {
179 Saver::template Save<DType>(dplan.REval(y, x), plan.
Eval(y, x));
184 template<
bool pass_check,
typename Saver,
186 typename DType,
typename E,
int etype>
194 template<
typename SV,
int dim,
typename DType,
typename E,
int etype>
196 dim, DType, E, etype> {
201 expr::MapPacketPlan<SV>(dst->
self(),
202 expr::MakePacketPlan<MSHADOW_DEFAULT_PACKET>(exp.
self()));
210 template<
typename Saver,
typename R,
int dim,
211 typename DType,
typename E,
int etype>
215 ::Error_All_Tensor_in_Exp_Must_Have_Same_Type();
218 CHECK(eshape[0] == 0 || eshape == dshape)
219 <<
"Assignment: Shape of Tensors are not consistent with target, "
220 <<
"eshape: " << eshape <<
" dshape:" << dshape;
222 Saver, R, dim, DType, E, etype>
226 template<
typename Saver,
typename Reducer,
227 typename R,
typename DType,
typename E,
int etype>
232 ::Error_TypeCheck_Not_Pass_For_Reduce_Exp();
234 ::Check(exp.
self()).FlatTo2D();
236 CHECK_EQ(eshape[1], dshape[0]) <<
"MapReduceKeepLowest::reduction dimension do not match";
237 CHECK_NE(eshape[0], 0U) <<
"can not reduce over empty tensor";
242 #pragma omp parallel for
245 DType res = splan.
Eval(0, x);
246 for (
index_t y = 1; y < eshape[0]; ++y) {
247 Reducer::Reduce(res, splan.
Eval(y, x));
249 Saver::template Save<DType>(dplan.REval(0, x), res * scale);
253 template<
typename Saver,
typename Reducer,
int dimkeep,
254 typename R,
typename DType,
typename E,
int etype>
259 ::Error_TypeCheck_Not_Pass_For_Reduce_Exp();
264 CHECK_EQ(eshape[dimkeep], dshape[0])
265 <<
"MapReduceKeepHighDim::reduction dimension do not match";
269 eshape.ProdShape(dimkeep + 1, EShape::kSubdim),
270 eshape[EShape::kSubdim]);
275 #pragma omp parallel for
278 DType res; Reducer::SetInitValue(res);
279 for (
index_t n = 0; n < pshape[0]; ++n) {
280 DType tres; Reducer::SetInitValue(tres);
281 for (
index_t y = 0; y < pshape[2]; ++y) {
282 for (
index_t x = 0; x < pshape[3]; ++x) {
283 Reducer::Reduce(tres,
284 splan.
Eval((n * pshape[1] + c) * pshape[2] + y, x));
287 Reducer::Reduce(res, tres);
289 Saver::template Save<DType>(dplan.REval(0, c), DType(res * scale));
293 template<
typename DType>
296 DType mmax = energy[0];
298 if (mmax < energy[x]) mmax = energy[x];
300 DType sum = DType(0.0f);
302 dst[x] = std::exp(energy[x] - mmax);
310 template<
typename DType>
314 #pragma omp parallel for
316 const index_t k =
static_cast<int>(label[y]);
319 dst[y][k] = src[y][k] - 1.0f;
321 dst[y][x] = src[y][x];
327 template<
typename DType>
332 const float smooth_grad = (alpha / (dst.
size(1) - 1));
333 #pragma omp parallel for
335 const index_t k =
static_cast<int>(label[y]);
338 dst[y][k] = src[y][k] - 1.0f + alpha;
340 dst[y][x] = src[y][x] - smooth_grad;
347 template<
typename DType>
351 const DType &ignore_label) {
352 #pragma omp parallel for
354 const int k =
static_cast<int>(label[y]);
355 for (
int x = 0; x < static_cast<int>(dst.
size(1)); ++x) {
356 if (
static_cast<int>(ignore_label) == k) {
360 dst[y][k] = src[y][k] - 1.0f;
362 dst[y][x] = src[y][x];
369 template<
typename DType>
373 const DType &ignore_label,
375 const float smooth_grad = (alpha / (dst.
size(1) - 1));
376 #pragma omp parallel for
378 const int k =
static_cast<int>(label[y]);
379 for (
int x = 0; x < static_cast<int>(dst.
size(1)); ++x) {
380 if (
static_cast<int>(ignore_label) == k) {
384 dst[y][k] = src[y][k] - 1.0f + alpha;
386 dst[y][x] = src[y][x] - smooth_grad;
393 template<
typename DType>
397 #pragma omp parallel for
400 const int k =
static_cast<int>(label[y][n]);
401 for (
int x = 0; x < static_cast<int>(dst.
size(1)); ++x) {
403 dst[y][k][n] = src[y][k][n] - 1.0f;
405 dst[y][x][n] = src[y][x][n];
412 template<
typename DType>
417 const float smooth_grad = (alpha / (dst.
size(1) - 1));
418 #pragma omp parallel for
421 const int k =
static_cast<int>(label[y][n]);
422 for (
int x = 0; x < static_cast<int>(dst.
size(1)); ++x) {
424 dst[y][k][n] = src[y][k][n] - 1.0f + alpha;
426 dst[y][x][n] = src[y][x][n] - smooth_grad;
433 template<
typename DType>
437 const DType &ignore_label) {
438 #pragma omp parallel for
441 const int k =
static_cast<int>(label[y][n]);
442 if (k ==
static_cast<int>(ignore_label)) {
443 for (
int x = 0; x < static_cast<int>(dst.
size(1)); ++x) {
444 dst[y][x][n] = DType(0.0f);
447 for (
int x = 0; x < static_cast<int>(dst.
size(1)); ++x) {
449 dst[y][k][n] = src[y][k][n] - 1.0f;
451 dst[y][x][n] = src[y][x][n];
459 template<
typename DType>
463 const DType &ignore_label,
465 const float smooth_grad = (alpha / (dst.
size(1) - 1));
466 #pragma omp parallel for
469 const int k =
static_cast<int>(label[y][n]);
470 if (k ==
static_cast<int>(ignore_label)) {
471 for (
int x = 0; x < static_cast<int>(dst.
size(1)); ++x) {
472 dst[y][x][n] = DType(0.0f);
475 for (
int x = 0; x < static_cast<int>(dst.
size(1)); ++x) {
477 dst[y][k][n] = src[y][k][n] - 1.0f + alpha;
479 dst[y][x][n] = src[y][x][n] - smooth_grad;
487 template<
typename DType>
490 CHECK_EQ(dst.
shape_, energy.
shape_) <<
"Softmax: shape mismatch";
491 #pragma omp parallel for
497 template<
typename DType>
500 CHECK_EQ(dst.
shape_, energy.
shape_) <<
"Softmax: shape mismatch";
501 #pragma omp parallel for
504 DType mmax = energy[y][0][n];
506 if (mmax < energy[y][x][n]) mmax = energy[y][x][n];
508 DType sum = DType(0.0f);
510 dst[y][x][n] = std::exp(energy[y][x][n] - mmax);
520 template<
bool clip,
typename IndexType,
typename DType>
530 else if (j >= K) j = K - 1;
535 for (
index_t i = 0; i < C; ++i) {
536 dst[j][i] += src[y][i];
542 template<
bool clip,
typename IndexType,
typename DType,
typename AType>
549 for (
index_t j = 0; j < K; ++j) {
550 for (
index_t i = 0; i < C; ++i) {
551 temp[j][i] = dst[j][i];
558 else if (j >= K) j = K - 1;
563 for (
index_t i = 0; i < C; ++i) {
564 temp[j][i] += src[y][i];
567 for (
index_t j = 0; j < K; ++j) {
568 for (
index_t i = 0; i < C; ++i) {
569 dst[j][i] = temp[j][i];
574 template<
typename IndexType,
typename DType>
580 dst[sorted[y]] += src[index[y]];
584 template<
typename IndexType,
typename DType>
590 dst[index[y]][j] = src[y][j];
595 template<
typename KDType,
typename VDType>
600 CHECK_EQ(keys.
size(0), values.
size(0))
601 <<
"The sizes of key/value are not equal! keys_size: " << keys.
size(0)
602 <<
"values_size: " << values.
size(0);
603 std::vector<size_t> idx(keys.
size(0));
604 std::vector<KDType> keys_vec(keys.
size(0));
605 std::vector<VDType> values_vec(values.
size(0));
606 for (
int i = 0; i < keys.
size(0); i++) {
608 keys_vec[i] = keys[i];
609 values_vec[i] = values[i];
612 std::stable_sort(idx.begin(), idx.end(),
613 [&keys_vec](
size_t i1,
size_t i2)
614 {return keys_vec[i1] < keys_vec[i2]; });
616 std::stable_sort(idx.begin(), idx.end(),
617 [&keys_vec](
size_t i1,
size_t i2)
618 {return keys_vec[i1] > keys_vec[i2]; });
621 keys[i] = keys_vec[idx[i]];
622 values[i] = values_vec[idx[i]];
626 template<
typename Device,
typename VDType,
typename SDType>
634 template<
typename Device,
typename DType>
639 <<
"VectorDot: Shape mismatch";
640 CHECK_EQ(dst.
size(0), 1U)
641 <<
"VectorDot: expect dst to be scalar";
647 template<
bool transpose_left,
bool transpose_right,
typename Device,
typename DType>
663 CHECK(sleft[0] == batch_size && sright[0] == batch_size)
664 <<
"BatchGEMM: batchsize must be equal."
665 <<
"dst: " << dst.
shape_ <<
"\n"
666 <<
"lhs: " << sleft <<
"\n"
667 <<
"rhs: " << sright <<
"\n";
668 CHECK(dst.
size(1) == sleft[1] && dst.
size(2) == sright[2] && sleft[2] == sright[1])
669 <<
"BatchGEMM: matrix shape mismatch"
670 <<
"dst: " << dst.
shape_ <<
"\n"
671 <<
"lhs: " << sleft <<
"\n"
672 <<
"rhs: " << sright <<
"\n";
673 CHECK(workspace.
size(0) >= 3 * batch_size)
674 <<
"Workspace Size must be bigger than " << 3 * batch_size;
679 transpose_right, transpose_left,
680 transpose_right ? rhs.
size(1) : rhs.
size(2),
681 transpose_left ? lhs.
size(2) : lhs.
size(1),
682 transpose_right ? rhs.
size(2) : rhs.
size(1),
691 #endif // MSHADOW_TENSOR_CPU_INL_H_
MSHADOW_XINLINE Shape< 4 > Shape4(index_t s0, index_t s1, index_t s2, index_t s3)
construct a four dimension shape, stride will equal s0
Definition: tensor.h:254
index_t openmp_index_t
openmp index for linux
Definition: base.h:336
void SortByKey(Tensor< cpu, 1, KDType > keys, Tensor< cpu, 1, VDType > values, bool is_ascend=true)
CPU/GPU: Sort key-value pairs stored in separate places. (Stable sort is performed!...
Definition: tensor_cpu-inl.h:596
const Container & self(void) const
Definition: expression.h:82
MSHADOW_XINLINE index_t MSize(void) const
Definition: tensor.h:602
computation stream structure, used for asynchronous computations
Definition: tensor.h:488
static void SetStream(Stream< Device > *stream)
Definition: dot_engine-inl.h:82
Tensor RValue, this is the super type of all kinds of possible tensors.
Definition: tensor.h:514
used to help static type check
Definition: expr_engine-inl.h:330
static void dot(Stream< Device > *stream, int n, const DType *X, int incX, const DType *Y, int incY, DType *ret)
Definition: dot_engine-inl.h:125
void Copy(Tensor< cpu, dim, DType > dst, const Tensor< cpu, dim, DType > &src, Stream< cpu > *stream=NULL)
copy data from one tensor to another, with same shape
Definition: tensor_cpu-inl.h:145
void FreeHost_(void *dptr)
void FreeSpace(Tensor< cpu, dim, DType > *obj)
CPU/GPU: free the space of tensor, will set obj.dptr to NULL.
Definition: tensor_cpu-inl.h:140
Container * ptrself(void)
Definition: expression.h:86
void BatchGEMM(Tensor< Device, 3, DType > dst, const Tensor< Device, 3, DType > &lhs, const Tensor< Device, 3, DType > &rhs, DType alpha, DType beta, Tensor< Device, 1, DType * > workspace)
CPU/GPU: dst = alpha * op(lhs) op(rhs) + beta * dst.
Definition: tensor_cpu-inl.h:648
#define MSHADOW_CUDA_CALL(func)
Protected cuda call in mshadow.
Definition: base.h:264
void IndexFill(Tensor< cpu, 2, DType > dst, const Tensor< cpu, 1, IndexType > &index, const Tensor< cpu, 2, DType > &src)
CPU/GPU: Fill the values of the source matrix into specific rows of the destination matrix....
Definition: tensor_cpu-inl.h:585
void SetDevice< cpu >(int devid)
Definition: tensor_cpu-inl.h:45
definitions of how Matrix Multiplications can be evaluated
void SoftmaxGrad(Tensor< cpu, 2, DType > dst, const Tensor< cpu, 2, DType > &src, const Tensor< cpu, 1, DType > &label)
CPU/GPU: softmax gradient.
Definition: tensor_cpu-inl.h:311
Stream< Device > * stream_
Definition: tensor.h:679
void MapReduceKeepLowest(TRValue< R, cpu, 1, DType > *dst, const expr::Exp< E, DType, etype > &exp, DType scale=1)
CPU/GPU: map an expression and reduce it to a 1D Tensor in the lowest dimension (dimension 0)
Definition: tensor_cpu-inl.h:228
general tensor
Definition: tensor.h:525
void * AlignedMallocPitch(size_t *out_pitch, size_t lspace, size_t num_line)
analogous to cudaMallocPitch; allocates an aligned space with num_line * lspace cells
Definition: packet-inl.h:77
void VectorizedSort(Tensor< Device, 1, VDType > values, Tensor< Device, 1, SDType > segments)
CPU/GPU: Sort the keys within each segment. (Stable sort is performed!) Segments is defined as an asc...
Definition: tensor_cpu-inl.h:627
runtime shape checking template: get the shape of an expression, report an error if shapes mismatch
Definition: expr_engine-inl.h:364
void FreeHost_< cpu >(void *dptr)
Definition: tensor_cpu-inl.h:95
std::ostream & operator<<(std::ostream &os, const Shape< ndim > &shape)
allow string printing of the shape
Definition: tensor_cpu-inl.h:59
static void batched_gemm(Stream< Device > *stream, bool transa, bool transb, int m, int n, int k, DType alpha, const DType *A, int lda, const DType *B, int ldb, DType beta, DType *C, int ldc, int batch_count, DType **workspace)
Definition: dot_engine-inl.h:91
MSHADOW_XINLINE index_t size(index_t i) const
Definition: tensor.h:711
static Shape< dim > Check(const E &t)
device name CPU
Definition: tensor.h:39
void Softmax(Tensor< cpu, 2, DType > dst, const Tensor< cpu, 2, DType > &energy)
CPU/GPU: normalize softmax: dst[i][j] = exp(energy[i][j]) /(sum_j exp(energy[i][j]))
Definition: tensor_cpu-inl.h:488
header file of tensor data structure and functions. This lib requires explicit memory allocation and d...
Plan< BinaryMapExp< OP, TA, TB, DType, etype >, DType > MakePlan(const BinaryMapExp< OP, TA, TB, DType, etype > &e)
Definition: expr_engine-inl.h:239
void AllocHost(Tensor< cpu, dim, DType > *obj)
Definition: tensor_cpu-inl.h:100
Stream< Device > * stream_
stream where the computation lies; stream is a device-dependent concept where each computation
Definition: tensor.h:551
void * AllocHost_(size_t size)
MSHADOW_XINLINE Shape< 2 > FlatTo2D(void) const
Definition: tensor.h:146
Definition: packet-inl.h:379
Generic packet vectorization code.
Definition: tensor_cpu-inl.h:187
MSHADOW_XINLINE bool CheckContiguous(void) const
Definition: tensor.h:596
Shape< dimension > shape_
shape of the tensor
Definition: tensor.h:541
MSHADOW_XINLINE DType Eval(index_t y, index_t x) const
evaluate the expression at index [y][x] to be implemented by SubType, for RValue, the return type wil...
int32_t index_t
type that will be used for index
Definition: base.h:328
DType * dptr_
Definition: tensor.h:676
Definition: expr_engine-inl.h:58
void AddTakeGradLargeBatch(Tensor< cpu, 2, DType > dst, const Tensor< cpu, 1, IndexType > &sorted, const Tensor< cpu, 1, IndexType > &index, const Tensor< cpu, 2, DType > &src)
CPU/GPU: Gradient accumulate of embedding matrix with safe accumulation. dst[index[i]] += src[i].
Definition: tensor_cpu-inl.h:575
defines how expression exp can be evaluated and stored into dst
Definition: expression.h:79
PaddingExp< SrcExp, DType, ExpInfo< SrcExp >::kDim > pad(const Exp< SrcExp, DType, etype > &src, index_t pad)
padding expression, pad an image with zeros on boundaries, padding affects shape[0],...
Definition: pad.h:71
void AllocSpace(Tensor< cpu, dim, DType > *obj, bool pad=MSHADOW_ALLOC_PAD)
CPU/CPU: allocate space for CTensor, according to the shape in the obj this function is responsible t...
Definition: tensor_cpu-inl.h:116
void MapReduceKeepHighDim(TRValue< R, cpu, 1, DType > *dst, const expr::Exp< E, DType, etype > &exp, DType scale=1)
CPU/GPU: map an expression and reduce it to a 1D Tensor in the third dimension (dimension 2)
Definition: tensor_cpu-inl.h:255
void InitTensorEngine< cpu >(int dev_id)
Definition: tensor_cpu-inl.h:38
void * AllocHost_< cpu >(size_t size)
Definition: tensor_cpu-inl.h:90
overloaded + operator between half_t and bf16_t
Definition: base.h:319
void MapPlan(TRValue< R, cpu, dim, DType > *dst, const expr::Plan< E, DType > &plan)
Definition: tensor_cpu-inl.h:168
MSHADOW_XINLINE Tensor< Device, 2, DType > FlatTo2D(void) const
flatten the tensor to 2 dimension, collapse the higher dimensions together
Definition: tensor.h:624
shape of a tensor
Definition: tensor.h:64
DType * dptr_
pointer to the data
Definition: tensor.h:539
#define MSHADOW_DEFAULT_PACKET
Definition: packet-inl.h:47
void VectorDot(Tensor< Device, 1, DType > dst, const Tensor< Device, 1, DType > &lhs, const Tensor< Device, 1, DType > &rhs)
CPU/GPU: 1 dimension vector dot.
Definition: tensor_cpu-inl.h:635
MSHADOW_XINLINE index_t size(int idx) const
return size of i-th dimension, start counting from highest dimension
Definition: tensor.h:610
Tensor< Device, dim, DType > NewTensor(const Shape< dim > &shape, DType initv, bool pad=MSHADOW_ALLOC_PAD, Stream< Device > *stream=NULL)
CPU/GPU: short cut to allocate and initialize a Tensor.
Definition: tensor_cpu-inl.h:132
void AddTakeGrad(Tensor< cpu, 2, DType > dst, const Tensor< cpu, 1, IndexType > &index, const Tensor< cpu, 2, DType > &src)
CPU/GPU: Gradient accumulate of embedding matrix. dst[index[i]] += src[i] Called when the featuredim ...
Definition: tensor_cpu-inl.h:521
Stream< cpu > * NewStream< cpu >(bool create_blas_handle, bool create_dnn_handle, int dev_id)
Definition: tensor_cpu-inl.h:48
void SmoothSoftmaxGrad(Tensor< cpu, 2, DType > dst, const Tensor< cpu, 2, DType > &src, const Tensor< cpu, 1, DType > &label, const float alpha)
Definition: tensor_cpu-inl.h:328
definitions of base types, operators, macros functions
scalar expression
Definition: expression.h:95
static void Map(Tensor< cpu, dim, DType > *dst, const expr::Exp< E, DType, etype > &exp)
Definition: tensor_cpu-inl.h:197
static void Map(TRValue< R, cpu, dim, DType > *dst, const expr::Exp< E, DType, etype > &exp)
Definition: tensor_cpu-inl.h:188
MSHADOW_XINLINE Shape< 3 > Shape3(index_t s0, index_t s1, index_t s2)
construct a three dimension shape, stride will equal s0
Definition: tensor.h:241
void DeleteStream< cpu >(Stream< cpu > *stream)
Definition: tensor_cpu-inl.h:54
void FreeHost(Tensor< cpu, dim, DType > *obj)
Definition: tensor_cpu-inl.h:107
void ShutdownTensorEngine< cpu >(void)
Definition: tensor_cpu-inl.h:41
index_t stride_
storing the stride information in x dimension; this is used to deal with pitch allocation in gpu or ss...
Definition: tensor.h:546
void AlignedFree(void *ptr)
free aligned space
Definition: packet-inl.h:106
void MapExp(TRValue< R, cpu, dim, DType > *dst, const expr::Exp< E, DType, etype > &exp)
CPU/GPU: map an expression to a tensor; this function calls MapPlan.
Definition: tensor_cpu-inl.h:212