tensor.h
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

#ifndef MSHADOW_TENSOR_H_
#define MSHADOW_TENSOR_H_
#include <string>
#include <iostream>
#include <vector>  // std::vector, used by getTranspAxes below
#include "./base.h"
#include "./expression.h"

namespace mshadow {
/*! \brief device name CPU */
struct cpu {
  /*! \brief whether this device is CPU or not */
  static const bool kDevCPU = true;
  /*! \brief device flag number, identifies this device */
  static const int kDevMask = 1 << 0;
};
/*! \brief device name GPU */
struct gpu {
  /*! \brief whether this device is CPU or not */
  static const bool kDevCPU = false;
  /*! \brief device flag number, identifies this device */
  static const int kDevMask = 1 << 1;
};

template <typename xpu>
struct LapackIndex {
  using IndexT = lapack_index_t;
};

template <>
struct LapackIndex<gpu> {
  using IndexT = int;
};

template<int ndim>
struct Shape;

/*! \brief allow string printing of the shape */
template<int ndim>
inline std::ostream &operator<<(std::ostream &os, const Shape<ndim> &shape);  // NOLINT(*)

/*! \brief shape of a tensor */
template<int dimension>
struct Shape {
  /*! \brief dimension of current shape */
  static const int kDimension = dimension;
  /*! \brief dimension of current shape minus one */
  static const int kSubdim = dimension - 1;
  /*! \brief storing the dimension information */
  index_t shape_[kDimension];
  /*! \brief default constructor, do nothing */
  MSHADOW_XINLINE Shape(void) {}
  /*! \brief constructor from another shape */
  MSHADOW_XINLINE Shape(const Shape<kDimension> &s) {
    #pragma unroll
    for (int i = 0; i < kDimension; ++i) {
      this->shape_[i] = s[i];
    }
  }
  /*! \brief get corresponding index */
  MSHADOW_XINLINE index_t &operator[](int idx) {
    return shape_[idx];
  }
  /*! \brief get corresponding index */
  MSHADOW_XINLINE const index_t &operator[](int idx) const {
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Warray-bounds"
    return shape_[idx];
#pragma GCC diagnostic pop
  }
  /*! \brief whether two shapes are equal */
  MSHADOW_XINLINE bool operator==(const Shape<kDimension> &s) const {
    #pragma unroll
    for (int i = 0; i < kDimension; ++i) {
      if (s.shape_[i] != this->shape_[i]) return false;
    }
    return true;
  }
  /*! \brief whether two shapes are not equal */
  MSHADOW_XINLINE bool operator!=(const Shape<kDimension> &s) const {
    return !(*this == s);
  }
  /*! \brief flatten the shape into one dimension */
  MSHADOW_XINLINE Shape<1> FlatTo1D(void) const {
    Shape<1> s;
    s[0] = this->Size();
    return s;
  }
  /*! \brief flatten the higher dimensions into the first axis, returning a 2D shape */
  MSHADOW_XINLINE Shape<2> FlatTo2D(void) const {
    Shape<2> s;
    s.shape_[1] = this->shape_[kDimension - 1];
    index_t ymax = 1;
    #pragma unroll
    for (int i = 0; i < kDimension - 1; ++i) {
      ymax *= this->shape_[i];
    }
    s.shape_[0] = ymax;
    return s;
  }
  /*! \brief total number of elements in the shape */
  MSHADOW_XINLINE index_t Size(void) const {
    index_t size = this->shape_[0];
    #pragma unroll
    for (int i = 1; i < kDimension; ++i) {
      size *= this->shape_[i];
    }
    return size;
  }
  /*! \brief product of the shape over the range [dimstart, dimend) */
  MSHADOW_XINLINE index_t ProdShape(int dimstart, int dimend) const {
    index_t num = 1;
    #pragma unroll
    for (int i = dimstart; i < dimend; ++i) {
      num *= this->shape_[i];
    }
    return num;
  }
  /*! \brief the shape with dimension 0 removed */
  MSHADOW_XINLINE Shape<kSubdim> SubShape(void) const {
    Shape<kSubdim> s;
    // for cuda
    #pragma unroll
    for (int i = 0; i < kSubdim; ++i) {
      s.shape_[i] = this->shape_[i + 1];
    }
    return s;
  }
  /*! \brief slice the shape from start to end */
  template<int dimstart, int dimend>
  MSHADOW_XINLINE Shape<dimend - dimstart> Slice(void) const {
    Shape<dimend - dimstart> s;
    #pragma unroll
    for (int i = dimstart; i < dimend; ++i) {
      s[i - dimstart] = this->shape_[i];
    }
    return s;
  }
  template<int dim>
  friend std::ostream &operator<<(std::ostream &os, const Shape<dim> &shape);  // NOLINT(*)
};  // Shape
//------------------------------------------------
// useful construction functions to generate shape
//-------------------------------------------------
/*! \brief construct a one dimension shape, stride will equal s0 */
MSHADOW_XINLINE Shape<1> Shape1(index_t s0) {
  Shape<1> s; s[0] = s0;
  return s;
}
/*! \brief construct a two dimension shape, stride will equal s0 */
MSHADOW_XINLINE Shape<2> Shape2(index_t s0, index_t s1) {
  Shape<2> s; s[0] = s0; s[1] = s1;
  return s;
}
/*! \brief construct a three dimension shape, stride will equal s0 */
MSHADOW_XINLINE Shape<3> Shape3(index_t s0, index_t s1, index_t s2) {
  Shape<3> s;
  s[0] = s0; s[1] = s1; s[2] = s2;
  return s;
}
/*! \brief construct a four dimension shape, stride will equal s0 */
MSHADOW_XINLINE Shape<4> Shape4(index_t s0, index_t s1,
                                index_t s2, index_t s3) {
  Shape<4> s;
  s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
  return s;
}
/*! \brief construct a five dimension shape, stride will equal s0 */
MSHADOW_XINLINE Shape<5> Shape5(index_t s0, index_t s1, index_t s2,
                                index_t s3, index_t s4) {
  Shape<5> s;
  s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; s[4] = s4;
  return s;
}
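// Usage sketch (illustrative): building a shape and querying it with the
// members defined above.
//
//   Shape<3> s = Shape3(2, 3, 4);
//   index_t total = s.Size();        // 24
//   Shape<2> flat = s.FlatTo2D();    // (6, 4): higher dims collapsed into dim 0
//   Shape<2> sub  = s.SubShape();    // (3, 4): drops dimension 0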

/*! \brief Convert shape in src_layout to shape in dst_layout */
inline Shape<3> ConvertLayout(const Shape<3>& src, int src_layout, int dst_layout) {
  Shape<3> dst;
  switch (src_layout) {
  case kNCW:
    dst = src;
    break;
  case kNWC:
    dst[0] = src[0];
    dst[1] = src[2];
    dst[2] = src[1];
    break;
  default:
    LOG(FATAL) << "Invalid layout for 3d shape " << src_layout;
  }
  switch (dst_layout) {
  case kNCW:
    return dst;
  case kNWC:
    {
      index_t tmp = dst[1];
      dst[1] = dst[2];
      dst[2] = tmp;
    }
    break;
  default:
    LOG(FATAL) << "Invalid layout for 3d shape " << dst_layout;
  }
  return dst;
}

/*! \brief Convert shape in src_layout to shape in dst_layout */
inline Shape<4> ConvertLayout(const Shape<4>& src, int src_layout, int dst_layout) {
  Shape<4> dst;
  switch (src_layout) {
  case kNCHW:
    dst = src;
    break;
  case kNHWC:
    dst[0] = src[0];
    dst[2] = src[1];
    dst[3] = src[2];
    dst[1] = src[3];
    break;
  default:
    LOG(FATAL) << "Invalid layout for 4d shape " << src_layout;
    dst = src;  // fixes compiler warning
  }
  Shape<4> dst2;
  switch (dst_layout) {
  case kNCHW:
    return dst;
  case kNHWC:
    dst2[0] = dst[0];
    dst2[1] = dst[2];
    dst2[2] = dst[3];
    dst2[3] = dst[1];
    break;
  default:
    LOG(FATAL) << "Invalid layout for 4d shape " << dst_layout;
    dst2 = src;  // fixes compiler warning
  }
  return dst2;
}

/*! \brief Convert shape in src_layout to shape in dst_layout */
inline Shape<5> ConvertLayout(const Shape<5>& src, int src_layout, int dst_layout) {
  Shape<5> dst;
  switch (src_layout) {
  case kNCDHW:
    dst = src;
    break;
  case kNDHWC:
    dst[0] = src[0];
    dst[2] = src[1];
    dst[3] = src[2];
    dst[4] = src[3];
    dst[1] = src[4];
    break;
  default:
    LOG(FATAL) << "Invalid layout for 5d shape " << src_layout;
  }
  Shape<5> dst2;
  switch (dst_layout) {
  case kNCDHW:
    return dst;
  case kNDHWC:
    dst2[0] = dst[0];
    dst2[1] = dst[2];
    dst2[2] = dst[3];
    dst2[3] = dst[4];
    dst2[4] = dst[1];
    break;
  default:
    LOG(FATAL) << "Invalid layout for 5d shape " << dst_layout;
  }
  return dst2;
}
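// Usage sketch (illustrative): converting a 4d shape between layouts.
//
//   Shape<4> nchw = Shape4(1, 3, 32, 32);               // N=1, C=3, H=32, W=32
//   Shape<4> nhwc = ConvertLayout(nchw, kNCHW, kNHWC);  // (1, 32, 32, 3)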

/*!
 * \brief returns axes of the transpose operation that needs to be performed
 *        between src layout and dst layout
 */
template <typename dim_t>
inline std::vector<dim_t> getTranspAxes(const LayoutFlag src_layout, const LayoutFlag dst_layout) {
  auto apply = [](const std::vector<dim_t>& v, const std::vector<dim_t>& op) {
    CHECK_EQ(v.size(), op.size()) << "Layout ndims does not match";
    std::vector<dim_t> ret(v.size());
    for (size_t i = 0; i < v.size(); i++) {
      ret[i] = v[op[i]];
    }
    return ret;
  };
  std::vector<dim_t> axes;
  // transpose from `case` to ND?H?WC
  switch (src_layout) {
    case kUNKNOWN:
      LOG(FATAL) << "Unknown source layout";
      break;
    case kNHWC:
      axes = std::vector<dim_t>({0, 1, 2, 3});
      break;
    case kNCHW:
      axes = std::vector<dim_t>({0, 2, 3, 1});
      break;
    case kCHWN:
      axes = std::vector<dim_t>({3, 1, 2, 0});
      break;
    case kNWC:
      axes = std::vector<dim_t>({0, 1, 2});
      break;
    case kNCW:
      axes = std::vector<dim_t>({0, 2, 1});
      break;
    case kCWN:
      axes = std::vector<dim_t>({2, 1, 0});
      break;
    case kNDHWC:
      axes = std::vector<dim_t>({0, 1, 2, 3, 4});
      break;
    case kNCDHW:
      axes = std::vector<dim_t>({0, 2, 3, 4, 1});
      break;
    case kCDHWN:
      axes = std::vector<dim_t>({4, 1, 2, 3, 0});
      break;
    default:
      LOG(FATAL) << "Invalid source layout " << src_layout;
  }
  // transpose from ND?H?WC to `case`
  switch (dst_layout) {
    case kUNKNOWN:
      LOG(FATAL) << "Unknown destination layout";
      break;
    case kNHWC:
      axes = apply(axes, {0, 1, 2, 3});
      break;
    case kNCHW:
      axes = apply(axes, {0, 3, 1, 2});
      break;
    case kCHWN:
      axes = apply(axes, {3, 1, 2, 0});
      break;
    case kNWC:
      axes = apply(axes, {0, 1, 2});
      break;
    case kNCW:
      axes = apply(axes, {0, 2, 1});
      break;
    case kCWN:
      axes = apply(axes, {2, 1, 0});
      break;
    case kNDHWC:
      axes = apply(axes, {0, 1, 2, 3, 4});
      break;
    case kNCDHW:
      axes = apply(axes, {0, 4, 1, 2, 3});
      break;
    case kCDHWN:
      axes = apply(axes, {4, 1, 2, 3, 0});
      break;
    default:
      LOG(FATAL) << "Invalid destination layout " << dst_layout;
  }
  return axes;
}
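// Usage sketch (illustrative): axes needed to transpose NCHW data into NHWC.
//
//   std::vector<index_t> axes = getTranspAxes<index_t>(kNCHW, kNHWC);
//   // axes == {0, 2, 3, 1}: output dimension i is taken from input dimension axes[i]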

/*! \brief computation stream structure, used for asynchronous computations */
template<typename Device>
struct Stream {
  // this is only a dummy implementation for CPU
  // for GPU, the actual implementation will be specialized in tensor_gpu-inl.h
  /*! \brief wait for all the computations associated with this stream to complete */
  inline void Wait(void) {}
  /*! \brief query whether the stream is idle */
  inline bool CheckIdle(void) {
    return true;
  }
  /*! \brief create a blas handle */
  inline void CreateBlasHandle() {}
};
/*! \brief Tensor RValue, this is the super type of all kinds of possible tensors */
template<typename Container, typename Device, int dimension, typename DType>
struct TRValue: public expr::RValueExp<Container, DType> {
};
// more compact template
/*! \brief general tensor */
template<typename Device, int dimension,
         typename DType MSHADOW_DEFAULT_DTYPE>
struct Tensor: public TRValue<Tensor<Device, dimension, DType>,
                              Device, dimension, DType> {
 public:
  //--------------------------------
  // struct members
  //--------------------------------
  /*! \brief whether current type lies in cpu */
  static const bool kDevCPU = Device::kDevCPU;
  /*! \brief dimension of subtype */
  static const int kSubdim = dimension - 1;
  //--------------------------------
  // struct members
  //--------------------------------
  /*! \brief pointer to the data */
  DType *dptr_ = nullptr;
  /*! \brief shape of the tensor */
  Shape<dimension> shape_;
  /*!
   * \brief storing the stride information in x dimension;
   *        used to deal with pitch allocation on gpu or with sse alignment
   */
  index_t stride_;
  /*!
   * \brief stream where the computation lies;
   *        a device dependency concept where each computation is bound to a stream
   */
  Stream<Device> *stream_;
  //--------------------------------
  // functions
  //--------------------------------
  /*! \brief default constructor */
  MSHADOW_XINLINE Tensor(void) : stream_(NULL) {}
  /*! \brief constructor from shape */
  MSHADOW_XINLINE Tensor(const Shape<dimension> &shape)
      : shape_(shape), stream_(NULL) {}
  /*! \brief constructor from data pointer and shape, without stride */
  MSHADOW_XINLINE Tensor(DType *dptr, const Shape<dimension> &shape)
      : dptr_(dptr), shape_(shape), stride_(shape[kSubdim]), stream_(NULL) {}
  /*! \brief constructor from data pointer, shape and stream, without stride */
  MSHADOW_XINLINE Tensor(DType *dptr, const Shape<dimension> &shape,
                         Stream<Device> *stream)
      : dptr_(dptr), shape_(shape), stride_(shape[kSubdim]), stream_(stream) {}
  /*! \brief constructor from data pointer, shape and stride */
  MSHADOW_XINLINE Tensor(DType *dptr,
                         const Shape<dimension> &shape,
                         index_t stride, Stream<Device> *stream)
      : dptr_(dptr), shape_(shape), stride_(stride), stream_(stream) {}
  /*! \brief set the stream to do computation of current tensor */
  inline void set_stream(Stream<Device> *stream) {
    this->stream_ = stream;
  }
  /*! \brief memory cost of the tensor starting from startdim, including the aligned x dimension */
  template<int startdim>
  MSHADOW_XINLINE index_t MemSize(void) const {
    index_t memsz = this->stride_;
    #pragma unroll
    for (int i = startdim; i < kSubdim; ++i) {
      memsz *= this->shape_[i];
    }
    return memsz;
  }
  /*! \brief whether the memory of this tensor is contiguous */
  MSHADOW_XINLINE bool CheckContiguous(void) const {
    return this->shape_[dimension - 1] == stride_;
  }
  /*! \brief memory cost of the whole tensor, including the aligned x dimension */
  MSHADOW_XINLINE index_t MSize(void) const {
    return this->MemSize<0>();
  }
  /*! \brief return size of i-th dimension, start counting from highest dimension */
  MSHADOW_XINLINE index_t size(int idx) const {
    return shape_[idx];
  }
  /*! \brief flatten the tensor to 1 dimension */
  MSHADOW_XINLINE Tensor<Device, 1, DType> FlatTo1D(void) const {
    return Tensor<Device, 1, DType>(dptr_, shape_.FlatTo1D(), stride_, stream_);
  }
  /*! \brief flatten the tensor to 2 dimension, collapse the higher dimensions together */
  MSHADOW_XINLINE Tensor<Device, 2, DType> FlatTo2D(void) const {
    return Tensor<Device, 2, DType>(dptr_, shape_.FlatTo2D(), stride_, stream_);
  }
  /*! \brief get an element of dimension - 1 */
  MSHADOW_XINLINE Tensor<Device, kSubdim, DType> operator[](index_t idx) const {
    return Tensor<Device, kSubdim, DType>(dptr_ + this->MemSize<1>() * idx,
                                          shape_.SubShape(), stride_, stream_);
  }
  /*! \brief slice the tensor in highest dimension [begin,end) */
  MSHADOW_XINLINE Tensor<Device, dimension, DType>
  Slice(index_t begin, index_t end) const {
    Shape<dimension> s = this->shape_;
    s[0] = end - begin;
    return Tensor<Device, dimension, DType>(dptr_ + this->MemSize<1>() * begin,
                                            s, stride_, stream_);
  }
  /*! \brief implement the assignment of same type */
  inline Tensor<Device, dimension, DType> &
  operator=(const Tensor<Device, dimension, DType> &exp) {
    dptr_ = exp.dptr_;
    shape_ = exp.shape_;
    stride_ = exp.stride_;
    stream_ = exp.stream_;
    return *this;
  }
  /*! \brief functions to fit expression template */
  template<typename E, int etype>
  inline Tensor<Device, dimension, DType> &
  operator=(const expr::Exp<E, DType, etype> &exp) {
    return this->__assign(exp);
  }
  /*! \brief functions to fit expression template */
  inline Tensor<Device, dimension, DType> &operator=(const DType &exp) {
    return this->__assign(exp);
  }
};
/*
 * respecialized class Tensor1D, this is due to different implementation in operator[]
 */
template<typename Device, typename DType>
struct Tensor<Device, 1, DType>:
      public TRValue<Tensor<Device, 1, DType>, Device, 1, DType> {
 public:
  DType *dptr_;
  Shape<1> shape_;
  index_t stride_;
  Stream<Device> *stream_;
  // constructor
  MSHADOW_XINLINE Tensor(void) : stream_(NULL) {}
  MSHADOW_XINLINE Tensor(const Shape<1> &shape)
      : shape_(shape), stream_(NULL) {}
  MSHADOW_XINLINE Tensor(DType *dptr, Shape<1> shape)
      : dptr_(dptr), shape_(shape), stride_(shape[0]), stream_(NULL) {}
  MSHADOW_XINLINE Tensor(DType *dptr, Shape<1> shape, Stream<Device> *stream)
      : dptr_(dptr), shape_(shape), stride_(shape[0]), stream_(stream) {}
  MSHADOW_XINLINE Tensor(DType *dptr, Shape<1> shape,
                         index_t stride, Stream<Device> *stream)
      : dptr_(dptr), shape_(shape), stride_(stride), stream_(stream) {}
  inline void set_stream(Stream<Device> *stream) {
    this->stream_ = stream;
  }
  MSHADOW_XINLINE Tensor<Device, 1, DType> FlatTo1D(void) const {
    return *this;
  }
  MSHADOW_XINLINE Tensor<Device, 2, DType> FlatTo2D(void) const {
    return Tensor<Device, 2, DType>(dptr_, shape_.FlatTo2D(), stride_, stream_);
  }
  MSHADOW_XINLINE Tensor<Device, 1, DType> Slice(index_t begin, index_t end) const {
    Shape<1> s;
    s[0] = end - begin;
    return Tensor<Device, 1, DType>(dptr_ + begin, s, s[0], stream_);
  }
  MSHADOW_XINLINE bool CheckContiguous(void) const {
    return true;
  }
  MSHADOW_XINLINE index_t MSize(void) const {
    return shape_[0];
  }
  MSHADOW_XINLINE index_t size(index_t i) const {
    return shape_[0];
  }
  MSHADOW_XINLINE DType &operator[](index_t idx) {
    return dptr_[idx];
  }
  MSHADOW_XINLINE const DType &operator[](index_t idx) const {
    return dptr_[idx];
  }
  /*! \brief implement the assignment of same type */
  inline Tensor<Device, 1, DType> &
  operator=(const Tensor<Device, 1, DType> &exp) {
    dptr_ = exp.dptr_;
    shape_ = exp.shape_;
    stride_ = exp.stride_;
    stream_ = exp.stream_;
    return *this;
  }
  template<typename E, int etype>
  inline Tensor<Device, 1, DType> &
  operator=(const expr::Exp<E, DType, etype> &exp) {
    return this->__assign(exp);
  }
  inline Tensor<Device, 1, DType> &operator=(const DType &exp) {
    return this->__assign(exp);
  }
};
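// Usage sketch (illustrative): wrapping an existing buffer and slicing it.
//
//   float data[6] = {0, 1, 2, 3, 4, 5};
//   Tensor<cpu, 2, float> t(data, Shape2(2, 3));   // stride_ == 3
//   Tensor<cpu, 1, float> row = t[1];              // second row: {3, 4, 5}
//   float v = row[2];                              // 5, via the 1D specialization above
//   Tensor<cpu, 2, float> top = t.Slice(0, 1);     // rows [0, 1): a 1 x 3 view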
//------------------------
// Function Declarations
//-----------------------
/*!
 * \brief initialize tensor engine, used to call initialization functions of dependent libs;
 *        this function should be called before all GPU tensor operations
 */
template<typename Device>
inline void InitTensorEngine(int device_id = 0);
/*!
 * \brief shutdown tensor engine on current device;
 *        this function should be called after all GPU tensor operations
 */
template<typename Device>
inline void ShutdownTensorEngine(void);
/*! \brief set the device of current thread to work on */
template<typename Device>
inline void SetDevice(int devid);
/*! \brief create a new stream from system */
template<typename Device>
inline Stream<Device> *NewStream(bool create_blas_handle,
                                 bool create_dnn_handle,
                                 int dev_id = -1);
/*! \brief create a new stream with a blas handle but no dnn handle */
template<typename Device>
inline Stream<Device> *NewStream(int dev_id) {
  return NewStream<Device>(true, false, dev_id);
}
/*! \brief delete the computing stream */
template<typename Device>
inline void DeleteStream(Stream<Device> *stream);
/*!
 * \brief CPU/GPU: allocate space for the tensor according to the shape in the obj;
 *        this function is responsible for setting the stride_ in obj
 */
template<int dim, typename DType>
inline void AllocSpace(Tensor<cpu, dim, DType> *obj,
                       bool pad = MSHADOW_ALLOC_PAD);
/*!
 * \brief CPU/GPU: allocate space for the tensor according to the shape in the obj;
 *        this function is responsible for setting the stride_ in obj
 */
template<int dim, typename DType>
inline void AllocSpace(Tensor<gpu, dim, DType> *obj,
                       bool pad = MSHADOW_ALLOC_PAD);
/*! \brief CPU/GPU: free the space of tensor, will set obj.dptr_ to NULL */
template<int dim, typename DType>
inline void FreeSpace(Tensor<cpu, dim, DType> *obj);
/*! \brief CPU/GPU: free the space of tensor, will set obj.dptr_ to NULL */
template<int dim, typename DType>
inline void FreeSpace(Tensor<gpu, dim, DType> *obj);
/*! \brief CPU/GPU: short cut to allocate and initialize a Tensor */
template<typename Device, typename DType, int dim>
inline Tensor<Device, dim, DType> NewTensor(const Shape<dim> &shape,
                                            DType initv,
                                            bool pad = MSHADOW_ALLOC_PAD,
                                            Stream<Device> *stream = NULL);
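// Usage sketch (illustrative): the typical allocation lifecycle on CPU.
//
//   Tensor<cpu, 2, float> t = NewTensor<cpu>(Shape2(4, 5), 0.0f);  // allocated, zero-filled
//   t[2][3] = 1.0f;
//   FreeSpace(&t);                                                 // dptr_ set back to NULL
//
// or, with an uninitialized tensor:
//
//   Tensor<cpu, 2, float> u(Shape2(4, 5));
//   AllocSpace(&u);   // sets u.dptr_ and u.stride_
//   FreeSpace(&u);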
/*! \brief copy data from one tensor to another, with same shape (CPU to CPU) */
template<int dim, typename DType>
inline void Copy(Tensor<cpu, dim, DType> dst,
                 const Tensor<cpu, dim, DType> &src,
                 Stream<cpu> *stream = NULL);
/*! \brief copy data from one tensor to another, with same shape (GPU to CPU) */
template<int dim, typename DType>
inline void Copy(Tensor<cpu, dim, DType> dst,
                 const Tensor<gpu, dim, DType> &src,
                 Stream<gpu> *stream = NULL);
/*! \brief copy data from one tensor to another, with same shape (CPU to GPU) */
template<int dim, typename DType>
inline void Copy(Tensor<gpu, dim, DType> dst,
                 const Tensor<cpu, dim, DType> &src,
                 Stream<gpu> *stream = NULL);
/*! \brief copy data from one tensor to another, with same shape (GPU to GPU) */
template<int dim, typename DType>
inline void Copy(Tensor<gpu, dim, DType> dst,
                 const Tensor<gpu, dim, DType> &src,
                 Stream<gpu> *stream = NULL);
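// Usage sketch (illustrative): copying between two same-shape tensors.
//
//   Tensor<cpu, 2, float> a = NewTensor<cpu>(Shape2(2, 2), 1.0f);
//   Tensor<cpu, 2, float> b = NewTensor<cpu>(Shape2(2, 2), 0.0f);
//   Copy(b, a);   // b now holds the contents of a
//   FreeSpace(&a); FreeSpace(&b);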
/*! \brief CPU/GPU: normalize softmax: dst[i][j] = exp(energy[i][j]) / (sum_j exp(energy[i][j])) */
template<typename DType>
inline void Softmax(Tensor<cpu, 2, DType> dst, const Tensor<cpu, 2, DType> &energy);
/*! \brief CPU/GPU: normalize softmax: dst[i][j] = exp(energy[i][j]) / (sum_j exp(energy[i][j])) */
template<typename DType>
inline void Softmax(Tensor<gpu, 2, DType> dst, const Tensor<gpu, 2, DType> &energy);
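// Usage sketch (illustrative): row-wise softmax on CPU.
//
//   Tensor<cpu, 2, float> energy = NewTensor<cpu>(Shape2(2, 3), 1.0f);
//   Tensor<cpu, 2, float> prob   = NewTensor<cpu>(Shape2(2, 3), 0.0f);
//   Softmax(prob, energy);   // each row of prob now sums to 1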

/*! \brief CPU/GPU: softmax gradient */
template<typename DType>
inline void SoftmaxGrad(Tensor<cpu, 2, DType> dst,
                        const Tensor<cpu, 2, DType> &src,
                        const Tensor<cpu, 1, DType> &label);
/*! \brief CPU/GPU: softmax gradient */
template<typename DType>
inline void SoftmaxGrad(const Tensor<gpu, 2, DType> &dst,
                        const Tensor<gpu, 2, DType> &src,
                        const Tensor<gpu, 1, DType> &label);
/*! \brief CPU/GPU: Gradient accumulate of embedding matrix. dst[index[i]] += src[i] */
template<bool clip = true, typename IndexType, typename DType>
inline void AddTakeGrad(Tensor<cpu, 2, DType> dst,
                        const Tensor<cpu, 1, IndexType>& index,
                        const Tensor<cpu, 2, DType> &src);
/*! \brief CPU/GPU: Gradient accumulate of embedding matrix with safe accumulation. dst[index[i]] += src[i] */
template<bool clip = true, typename IndexType, typename DType, typename AType>
inline void AddTakeGrad(Tensor<cpu, 2, DType> dst,
                        Tensor<cpu, 2, AType> temp,
                        const Tensor<cpu, 1, IndexType>& index,
                        const Tensor<cpu, 2, DType> &src);
/*! \brief CPU/GPU: Gradient accumulate of embedding matrix. dst[index[i]] += src[i] */
template<bool clip = true, typename IndexType, typename DType>
inline void AddTakeGrad(Tensor<gpu, 2, DType> dst,
                        const Tensor<gpu, 1, IndexType>& index,
                        const Tensor<gpu, 2, DType> &src);
/*! \brief CPU/GPU: Gradient accumulate of embedding matrix with safe accumulation. dst[index[i]] += src[i] */
template<bool clip = true, typename IndexType, typename DType, typename AType>
inline void AddTakeGrad(Tensor<gpu, 2, DType> dst,
                        Tensor<gpu, 2, AType> temp,
                        const Tensor<gpu, 1, IndexType>& index,
                        const Tensor<gpu, 2, DType> &src);
/*!
 * \brief CPU/GPU: Gradient accumulate of embedding matrix for large batches,
 *        using an index that has been pre-sorted
 */
template<typename IndexType, typename DType>
inline void AddTakeGradLargeBatch(Tensor<cpu, 2, DType> dst,
                                  const Tensor<cpu, 1, IndexType>& sorted,
                                  const Tensor<cpu, 1, IndexType>& index,
                                  const Tensor<cpu, 2, DType> &src);
/*!
 * \brief CPU/GPU: Gradient accumulate of embedding matrix for large batches,
 *        using an index that has been pre-sorted
 */
template<typename IndexType, typename DType>
inline void AddTakeGradLargeBatch(Tensor<gpu, 2, DType> dst,
                                  const Tensor<gpu, 1, IndexType>& sorted,
                                  const Tensor<gpu, 1, IndexType>& index,
                                  const Tensor<gpu, 2, DType> &src);
/*!
 * \brief CPU/GPU: Fill specific rows of the destination matrix with values
 *        from the source matrix: dst[index[i]] = src[i]
 */
template<typename IndexType, typename DType>
inline void IndexFill(Tensor<cpu, 2, DType> dst,
                      const Tensor<cpu, 1, IndexType>& index,
                      const Tensor<cpu, 2, DType> &src);
/*!
 * \brief CPU/GPU: Fill specific rows of the destination matrix with values
 *        from the source matrix: dst[index[i]] = src[i]
 */
template<typename IndexType, typename DType>
inline void IndexFill(Tensor<gpu, 2, DType> dst,
                      const Tensor<gpu, 1, IndexType>& index,
                      const Tensor<gpu, 2, DType> &src);
/*! \brief CPU/GPU: Sort key-value pairs stored in separate places. (Stable sort is performed!) */
template<typename KDType, typename VDType>
inline void SortByKey(Tensor<cpu, 1, KDType> keys, Tensor<cpu, 1, VDType> values,
                      bool is_ascend = true);
/*! \brief CPU/GPU: Sort key-value pairs stored in separate places. (Stable sort is performed!) */
template<typename KDType, typename VDType>
inline void SortByKey(Tensor<gpu, 1, KDType> keys, Tensor<gpu, 1, VDType> values,
                      bool is_ascend = true);
/*!
 * \brief CPU/GPU: Sort the keys within each segment. (Stable sort is performed!)
 *        Segments are defined as an ascending sequence of segment ids.
 */
template<typename Device, typename VDType, typename SDType>
inline void VectorizedSort(Tensor<Device, 1, VDType> values, Tensor<Device, 1, SDType> segments);

// function declarations to support expression, no need to understand them
// these functions do not need to be directly used
/*! \brief CPU/GPU: map an expression to a tensor, this function calls MapPlan */
template<typename Saver, typename R, int dim,
         typename DType, typename E, int etype>
inline void MapExp(TRValue<R, cpu, dim, DType> *dst,
                   const expr::Exp<E, DType, etype> &exp);
/*! \brief CPU/GPU: map an expression to a tensor, this function calls MapPlan */
template<typename Saver, typename R, int dim,
         typename DType, typename E, int etype>
inline void MapExp(TRValue<R, gpu, dim, DType> *dst,
                   const expr::Exp<E, DType, etype> &exp);
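// These are dispatched automatically by Tensor::operator= on an expression;
// a user-level sketch (illustrative):
//
//   Tensor<cpu, 2, float> a = NewTensor<cpu>(Shape2(2, 2), 1.0f);
//   Tensor<cpu, 2, float> b = NewTensor<cpu>(Shape2(2, 2), 0.0f);
//   b = a * 2.0f + 1.0f;   // element-wise, evaluated through MapExp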
/*! \brief CPU/GPU: map an expression, do reduction to 1D Tensor in lowest dimension (dimension 0) */
template<typename Saver, typename Reducer,
         typename R, typename DType, typename E, int etype>
inline void MapReduceKeepLowest(TRValue<R, cpu, 1, DType> *dst,
                                const expr::Exp<E, DType, etype> &exp,
                                DType scale = 1);
/*! \brief CPU/GPU: map an expression, do reduction to 1D Tensor in lowest dimension (dimension 0) */
template<typename Saver, typename Reducer, typename R,
         typename DType, typename E, int etype>
inline void MapReduceKeepLowest(TRValue<R, gpu, 1, DType> *dst,
                                const expr::Exp<E, DType, etype> &exp,
                                DType scale = 1);
/*! \brief CPU/GPU: map an expression, do reduction to 1D Tensor in third dimension (dimension 2) */
template<typename Saver, typename Reducer, int dimkeep,
         typename R, typename DType, typename E, int etype>
inline void MapReduceKeepHighDim(TRValue<R, cpu, 1, DType> *dst,
                                 const expr::Exp<E, DType, etype> &exp,
                                 DType scale = 1);
/*! \brief CPU/GPU: map an expression, do reduction to 1D Tensor in third dimension (dimension 2) */
template<typename Saver, typename Reducer, int dimkeep,
         typename R, typename DType, typename E, int etype>
inline void MapReduceKeepHighDim(TRValue<R, gpu, 1, DType> *dst,
                                 const expr::Exp<E, DType, etype> &exp,
                                 DType scale = 1);
/*! \brief CPU/GPU: 1 dimension vector dot */
template<typename Device, typename DType>
inline void VectorDot(Tensor<Device, 1, DType> dst,
                      const Tensor<Device, 1, DType> &lhs,
                      const Tensor<Device, 1, DType> &rhs);
/*! \brief CPU/GPU: dst = alpha * op(lhs) op(rhs) + beta * dst */
template<bool transpose_left, bool transpose_right, typename Device, typename DType>
inline void BatchGEMM(Tensor<Device, 3, DType> dst,
                      const Tensor<Device, 3, DType> &lhs,
                      const Tensor<Device, 3, DType> &rhs,
                      DType alpha,
                      DType beta,
                      Tensor<Device, 1, DType*> workspace);
}  // namespace mshadow
// include headers
#include "./stream_gpu-inl.h"
#include "./extension.h"
#include "./expr_engine-inl.h"
#include "./tensor_cpu-inl.h"
#include "./tensor_gpu-inl.h"
#include "./io.h"
#include "./tensor_container.h"
#include "./random.h"
// add definition of scalar related operators
#ifdef MSHADOW_SCALAR_
  #error "MSHADOW_SCALAR_ must not be defined"
#endif
// enumerate all the scalar data types we aim to be good at
#define MSHADOW_SCALAR_ float
#include "./expr_scalar-inl.h"
#undef MSHADOW_SCALAR_
#define MSHADOW_SCALAR_ double
#include "./expr_scalar-inl.h"
#undef MSHADOW_SCALAR_
#define MSHADOW_SCALAR_ int32_t
#include "./expr_scalar-inl.h"
#undef MSHADOW_SCALAR_
#define MSHADOW_SCALAR_ int64_t
#include "./expr_scalar-inl.h"
#undef MSHADOW_SCALAR_
#define MSHADOW_SCALAR_ mshadow::half::half_t
#include "./expr_scalar-inl.h"
#undef MSHADOW_SCALAR_
#endif  // MSHADOW_TENSOR_H_