mxnet
packet-inl.h
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License. You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied. See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
25 #ifndef MSHADOW_PACKET_INL_H_
26 #define MSHADOW_PACKET_INL_H_
27 
28 #if defined(__APPLE__) || defined(__FreeBSD__)
29 #include <stdlib.h>
30 #else
31 #include <malloc.h>
32 #endif
33 #include "./base.h"
34 #include "./tensor.h"
35 #include "./expression.h"
36 
37 
38 namespace mshadow {
40 namespace packet {
41 
42 enum PacketArch {
45 };
46 
47 #if MSHADOW_USE_SSE
48 #define MSHADOW_DEFAULT_PACKET ::mshadow::packet::kSSE2
49 #else
50 #define MSHADOW_DEFAULT_PACKET ::mshadow::packet::kPlain
51 #endif
52 
53 // whether packet operator is enabled.
59 template<typename DType, PacketArch Arch = MSHADOW_DEFAULT_PACKET>
60 struct Packet;
61 
// Alignment parameter for a packet architecture (generic definition: 4).
// NOTE(review): despite the name, CheckAlign in this file uses `value` as a
// shift count (`1 << bits`), so 4 means a 16-byte boundary there, not 4 bytes.
62 template<PacketArch Arch>
63 struct AlignBytes {
64  static const index_t value = 4;
65 };
66 
67 } // namespace packet
68 } // namespace mshadow
69 
70 namespace mshadow {
71 namespace packet {
78 inline void* AlignedMallocPitch(size_t *out_pitch,
79  size_t lspace,
80  size_t num_line) {
82  const index_t mask = (1 << bits) - 1;
83 
84  size_t pitch = ((lspace + mask) >> bits) << bits;
85  *out_pitch = pitch;
86 #ifdef _MSC_VER
87  void *res = _aligned_malloc(pitch * num_line, 1 << bits);
88 #else
89  void *res;
90  int ret = posix_memalign(&res, 1 << bits, pitch * num_line);
91  CHECK_EQ(ret, 0) << "AlignedMallocPitch failed";
92 #endif
93  if (res == NULL) {
94  LOG(FATAL) << "AlignedMallocPitch failed";
95  }
96  return res;
97 }
98 
// Free aligned space.
// Releases `ptr` with the platform deallocator: _aligned_free under MSVC,
// plain free() everywhere else.
// NOTE(review): presumably the counterpart of AlignedMallocPitch, which
// allocates with _aligned_malloc / posix_memalign — confirm at call sites.
103 inline void AlignedFree(void *ptr) {
104 #ifdef _MSC_VER
105  _aligned_free(ptr);
106 #else
107  free(ptr);
108 #endif
109 }
110 
// Check whether a size/address value is aligned for packet architecture Arch.
// Returns true when the low `bits` bits of `pitch` are all zero, i.e. `pitch`
// is a multiple of (1 << AlignBytes<Arch>::value).
112 template<PacketArch Arch>
113 inline bool CheckAlign(size_t pitch) {
114  const index_t bits = AlignBytes<Arch>::value;
115  return !(pitch & ((1 << bits) - 1));
116 }
117 
// Pointer overload: converts the address to size_t and forwards it to
// CheckAlign<Arch>(size_t), so the same power-of-two mask test is applied.
119 template<PacketArch Arch>
120 inline bool CheckAlign(void *ptr) {
121  return CheckAlign<Arch>(reinterpret_cast<size_t>(ptr));
122 }
123 
129 template<typename DType, PacketArch Arch>
130 inline index_t UpperAlign(index_t size) {
132  const index_t mask = (1 << bits) - 1;
133  const index_t fsize = sizeof(DType);
134  return (((size * fsize + mask) >> bits) << bits) / fsize;
135 }
136 
142 template<typename DType, PacketArch Arch>
143 inline index_t LowerAlign(index_t size) {
145  const index_t fsize = sizeof(DType);
146  return (((size * fsize) >> bits) << bits) / fsize;
147 }
148 
// Generic packet operator (primary template).
// kEnabled is false here; the specializations that follow in this file
// (op::plus, op::minus, op::mul, op::div, op::identity) set it to true.
// PacketCheck for BinaryMapExp reads kEnabled to decide whether an
// expression may use the packet path.
155 template<typename OP, typename DType, PacketArch Arch>
156 struct PacketOp {
157  static const bool kEnabled = false;
158 };
159 // specialization of operators
160 template<typename DType, PacketArch Arch>
161 struct PacketOp<op::plus, DType, Arch> {
162  static const bool kEnabled = true;
164  const Packet<DType, Arch>& rhs) {
165  return lhs + rhs;
166  }
167 };
168 template<typename DType, PacketArch Arch>
169 struct PacketOp<op::minus, DType, Arch> {
170  static const bool kEnabled = true;
172  const Packet<DType, Arch>& rhs) {
173  return lhs - rhs;
174  }
175 };
176 template<typename DType, PacketArch Arch>
177 struct PacketOp<op::mul, DType, Arch> {
178  static const bool kEnabled = true;
180  const Packet<DType, Arch>& rhs) {
181  return lhs * rhs;
182  }
183 };
184 template<typename DType, PacketArch Arch>
185 struct PacketOp<op::div, DType, Arch> {
186  static const bool kEnabled = true;
188  const Packet<DType, Arch>& rhs) {
189  return lhs / rhs;
190  }
191 };
192 
193 template<typename DType, PacketArch Arch>
194 struct PacketOp<op::identity, DType, Arch> {
195  static const bool kEnabled = true;
197  return src;
198  }
199 };
200 
201 
202 // savers to do storage
203 template<typename SV, typename TFloat, PacketArch Arch>
204 struct Saver{
205  MSHADOW_CINLINE static void Save(TFloat *dst, const Packet<TFloat, Arch>& src) {
208  ans.Store(dst);
209  }
210 };
// Saver specialization for sv::saveto: Save() writes the packet to `dst`
// by calling src.Store(dst) directly, with no other step in between.
211 template<typename TFloat, PacketArch Arch>
212 struct Saver<sv::saveto, TFloat, Arch> {
213  MSHADOW_CINLINE static void Save(TFloat *dst, const Packet<TFloat, Arch>& src) {
214  src.Store(dst);
215  }
216 };
217 } // namespace packet
218 } // namespace mshadow
219 
220 #include "packet/plain-inl.h"
221 #if MSHADOW_USE_SSE && !defined(__CUDACC__)
222 #include "packet/sse-inl.h"
223 #endif
224 
225 namespace mshadow {
226 namespace expr {
227 
229 
230 // same as plan, but use packet
231 template<typename ExpType, typename DType, PacketArch Arch>
232 class PacketPlan {
233  public:
239  MSHADOW_CINLINE DType Eval(index_t y, index_t x) const;
240 };
241 
242 template <typename Device, int dim, typename DType, PacketArch Arch>
243 class PacketPlan<Tensor<Device, dim, DType>, DType, Arch> {
244  public:
246  :dptr_(t.dptr_), stride_(t.stride_) {}
248  return packet::Packet<DType, Arch>::Load(&dptr_[y * stride_ + x]);
249  }
250  MSHADOW_CINLINE DType Eval(index_t y, index_t x) const {
251  return dptr_[y * stride_ + x];
252  }
253 
254  private:
255  const DType *dptr_;
256  index_t stride_;
257 };
258 
259 template<typename DType, PacketArch Arch>
260 class PacketPlan<ScalarExp<DType>, DType, Arch> {
261  public:
262  explicit PacketPlan(DType scalar) : scalar_(scalar) {}
264  return packet::Packet<DType, Arch>::Fill(scalar_);
265  }
266  MSHADOW_CINLINE DType Eval(index_t y, index_t x) const {
267  return scalar_;
268  }
269 
270  private:
271  DType scalar_;
272 };
273 
274 template<typename OP, typename TA, typename TB, int etype, typename DType, PacketArch Arch>
275 class PacketPlan<BinaryMapExp<OP, TA, TB, DType, etype>, DType, Arch> {
276  public:
278  : lhs_(lhs), rhs_(rhs) {}
280  return packet::PacketOp<OP, DType, Arch>::Map(lhs_.EvalPacket(y, x), rhs_.EvalPacket(y, x));
281  }
282  MSHADOW_CINLINE DType Eval(index_t y, index_t x) const {
283  return OP::Map(lhs_.Eval(y, x), rhs_.Eval(y, x));
284  }
285 
286  private:
289 };
290 
291 template<typename OP, typename TA, int etype, typename DType, PacketArch Arch>
292 class PacketPlan<UnaryMapExp<OP, TA, DType, etype>, DType, Arch> {
293  public:
294  PacketPlan(const PacketPlan<TA, DType, Arch> &src) : src_(src) {}
296  return packet::PacketOp<OP, DType, Arch>::Map(src_.EvalPacket(y, x));
297  }
298  MSHADOW_CINLINE DType Eval(index_t y, index_t x) const {
299  return OP::Map(src_.Eval(y, x));
300  }
301 
302  private:
304 };
305 
306 template<PacketArch Arch, typename OP, typename TA, typename TB, typename DType, int etype>
309 
310 template<PacketArch Arch, typename DType>
312  return PacketPlan<ScalarExp<DType>, DType, Arch>(e.scalar_);
313 }
314 template<PacketArch Arch, typename T, typename DType>
316  return PacketPlan<T, DType, Arch>(e.self());
317 }
318 template<PacketArch Arch, typename T, int dim, typename DType>
322 }
323 template<PacketArch Arch, typename OP, typename TA, typename DType, int etype>
326  return PacketPlan<UnaryMapExp<OP, TA, DType, etype>, DType, Arch>(MakePacketPlan<Arch>(e.src_));
327 }
328 template<PacketArch Arch, typename OP, typename TA, typename TB, typename DType, int etype>
329 inline PacketPlan<BinaryMapExp<OP, TA, TB, DType, etype>, DType, Arch>
331  return PacketPlan<BinaryMapExp<OP, TA, TB, DType, etype>,
332  DType, Arch>(MakePacketPlan<Arch>(e.lhs_), MakePacketPlan<Arch>(e.rhs_));
333 }
334 
// Static (compile-time) check of whether packet evaluation is enabled for
// expression type E on architecture Arch.
// Primary template: kPass is false; the specializations below opt in.
342 template<typename E, PacketArch Arch>
343 struct PacketCheck{
344  static const bool kPass = false;
345 };
// float passes the packet check for every Arch.
346 template<PacketArch Arch>
347 struct PacketCheck<float, Arch> {
348  static const bool kPass = true;
349 };
// double passes the packet check for every Arch.
350 template<PacketArch Arch>
351 struct PacketCheck<double, Arch> {
352  static const bool kPass = true;
353 };
// A scalar expression passes exactly when its element type DType passes.
354 template<typename DType, PacketArch Arch>
355 struct PacketCheck<ScalarExp<DType>, Arch> {
356  static const bool kPass = PacketCheck<DType, Arch>::kPass;
357 };
// A cpu tensor of any dimension passes exactly when its element type DType
// passes. Only Tensor<cpu, ...> is specialized here; other device tensors
// fall back to the primary template (kPass == false).
358 template<int dim, typename DType, PacketArch Arch>
359 struct PacketCheck<Tensor<cpu, dim, DType>, Arch> {
360  static const bool kPass = PacketCheck<DType, Arch>::kPass;
361 };
362 template<typename OP, typename TA, typename DType, int etype, PacketArch Arch>
363 struct PacketCheck<UnaryMapExp<OP, TA, DType, etype>, Arch> {
364  static const bool kPass = PacketCheck<TA, Arch>::kPass &&
366 };
367 template<typename OP, typename TA, typename TB, typename DType, int etype, PacketArch Arch>
368 struct PacketCheck< BinaryMapExp<OP, TA, TB, DType, etype>, Arch> {
369  static const bool kPass = packet::PacketOp<OP, DType, Arch>::kEnabled &&
371 };
372 //----------------------------------------------------
373 // Check if data is aligned and allow packet operation
374 //----------------------------------------------------
375 template<int dim, typename E, PacketArch Arch>
377  inline static bool Check(const E &exp) {
378  return false;
379  }
380 };
// Runtime alignment check for a scalar expression: always returns true.
// The `exp` argument is not inspected.
381 template<int dim, typename DType, PacketArch Arch>
382 struct PacketAlignCheck<dim, ScalarExp<DType>, Arch> {
383  inline static bool Check(const ScalarExp<DType> &exp) {
384  return true;
385  }
386 };
// Runtime alignment check for a cpu tensor.
// Returns true only when both the data pointer t.dptr_ and the stride
// converted to bytes (t.stride_ * sizeof(DType)) satisfy CheckAlign<Arch>.
387 template<int dim, typename DType, PacketArch Arch>
388 struct PacketAlignCheck<dim, Tensor<cpu, dim, DType>, Arch> {
389  inline static bool Check(const Tensor<cpu, dim, DType> &t) {
390  return packet::CheckAlign<Arch>(t.dptr_) &&
391  packet::CheckAlign<Arch>(t.stride_ * sizeof(DType));
392  }
393 };
394 template<int dim, typename OP, typename TA, typename DType, int etype, PacketArch Arch>
395 struct PacketAlignCheck<dim, UnaryMapExp<OP, TA, DType, etype>, Arch> {
396  inline static bool Check(const UnaryMapExp<OP, TA, DType, etype> &t) {
398  }
399 };
400 template<int dim, typename OP, typename TA, typename TB,
401  typename DType, int etype, PacketArch Arch>
402 struct PacketAlignCheck<dim, BinaryMapExp<OP, TA, TB, DType, etype>, Arch> {
403  inline static bool Check(const BinaryMapExp<OP, TA, TB, DType, etype> &t) {
406  }
407 };
408 
412 template<typename SV, typename E, int dim, typename DType, PacketArch Arch>
414  const expr::PacketPlan<E, DType, Arch>& plan) {
415  Tensor<cpu, 2, DType> dst = _dst.FlatTo2D();
416  const index_t xlen = packet::LowerAlign<DType, Arch>(dst.size(1));
417  const size_t packetSize = packet::Packet<DType, Arch>::size;
418 #ifndef __CUDACC__
419  #pragma omp parallel for
420 #endif
421  for (openmp_index_t y = 0; y < dst.size(0); ++y) {
422  for (index_t x = 0; x < xlen; x += packetSize) {
423  packet::Saver<SV, DType, Arch>::Save(&dst[y][x], plan.EvalPacket(y, x));
424  }
425  for (index_t x = xlen; x < dst.size(1); ++x) {
426  SV::Save(dst[y][x], plan.Eval(y, x));
427  }
428  }
429 }
430 } // namespace expr
431 } // namespace mshadow
432 #endif // MSHADOW_PACKET_INL_H_
MSHADOW_CINLINE packet::Packet< DType, Arch > EvalPacket(index_t y, index_t x) const
Definition: packet-inl.h:263
static MSHADOW_CINLINE Packet< DType, Arch > Map(const Packet< DType, Arch > &lhs, const Packet< DType, Arch > &rhs)
Definition: packet-inl.h:187
ScalarExp< DType > scalar(DType s)
create a scalar expression
Definition: expression.h:104
Definition: packet-inl.h:232
DType * dptr_
pointer to the data
Definition: tensor.h:435
MSHADOW_CINLINE packet::Packet< DType > EvalPacket(index_t y, index_t x) const
Definition: packet-inl.h:295
MSHADOW_CINLINE DType Eval(index_t y, index_t x) const
Definition: packet-inl.h:250
MSHADOW_CINLINE DType Eval(index_t y, index_t x) const
void AlignedFree(void *ptr)
free aligned space
Definition: packet-inl.h:103
const TB & rhs_
right operand
Definition: expression.h:340
Definition: packet-inl.h:376
MSHADOW_CINLINE DType Eval(index_t y, index_t x) const
Definition: packet-inl.h:298
static bool Check(const E &exp)
Definition: packet-inl.h:377
binary map expression lhs [op] rhs
Definition: expression.h:335
static const index_t value
Definition: packet-inl.h:64
PacketPlan< UnaryMapExp< OP, TA, DType, etype >, DType, Arch > MakePacketPlan(const UnaryMapExp< OP, TA, DType, etype > &e)
Definition: packet-inl.h:325
void * AlignedMallocPitch(size_t *out_pitch, size_t lspace, size_t num_line)
analogous to cudaMallocPitch: allocate an aligned space with num_line * lspace cells
Definition: packet-inl.h:78
Definition: packet-inl.h:43
base class of all rvalues
Definition: expression.h:149
DType scalar_
scalar value
Definition: expression.h:98
MSHADOW_CINLINE packet::Packet< DType, Arch > EvalPacket(index_t y, index_t x) const
Definition: packet-inl.h:279
PacketArch
Definition: packet-inl.h:42
PacketPlan(const PacketPlan< TA, DType, Arch > &lhs, const PacketPlan< TB, DType, Arch > &rhs)
Definition: packet-inl.h:277
static MSHADOW_CINLINE Packet< DType, Arch > Map(const Packet< DType, Arch > &src)
Definition: packet-inl.h:196
header file of tensor data structure and functions. This lib requires explicit memory allocation and d...
device name CPU
Definition: tensor.h:40
MSHADOW_XINLINE Tensor< Device, 2, DType > FlatTo2D(void) const
flatten the tensor to 2 dimension, collapse the higher dimensions together
Definition: tensor.h:520
MSHADOW_CINLINE packet::Packet< DType, Arch > EvalPacket(index_t y, index_t x) const
evaluate the expression at index [y][x], x will be aligned to Packet<DType, Arch>::Size() ...
MaskExp< IndexExp, SrcExp, DType > mask(const Exp< IndexExp, DType, e1 > &index, const Exp< SrcExp, DType, e2 > &src)
Definition: mask.h:58
definitions of abstract expressions and expressions template
static MSHADOW_CINLINE Packet< DType, Arch > Map(const Packet< DType, Arch > &lhs, const Packet< DType, Arch > &rhs)
Definition: packet-inl.h:171
int32_t index_t
type that will be used for index
Definition: base.h:336
PacketPlan(const PacketPlan< TA, DType, Arch > &src)
Definition: packet-inl.h:294
static MSHADOW_CINLINE Packet< DType, Arch > Map(const Packet< DType, Arch > &lhs, const Packet< DType, Arch > &rhs)
Definition: packet-inl.h:179
static bool Check(const BinaryMapExp< OP, TA, TB, DType, etype > &t)
Definition: packet-inl.h:403
Definition: packet-inl.h:44
support of sse2 packet optimization of some operations
static MSHADOW_CINLINE void Save(TFloat *dst, const Packet< TFloat, Arch > &src)
Definition: packet-inl.h:213
generic Packet operator
Definition: packet-inl.h:156
PacketPlan(DType scalar)
Definition: packet-inl.h:262
PacketPlan(const Tensor< Device, dim, DType > &t)
Definition: packet-inl.h:245
bool CheckAlign(size_t pitch)
check if a pointer is aligned
Definition: packet-inl.h:113
index_t LowerAlign(index_t size)
get lower bound of aligned index of size
Definition: packet-inl.h:143
Definition: packet-inl.h:63
const TA & src_
source expression
Definition: expression.h:408
#define MSHADOW_CINLINE
cpu force inline
Definition: base.h:226
index_t UpperAlign(index_t size)
get upper bound of aligned index of size
Definition: packet-inl.h:130
unary map expression op(src)
Definition: expression.h:405
MSHADOW_CINLINE DType Eval(index_t y, index_t x) const
Definition: packet-inl.h:282
scalar expression
Definition: expression.h:96
Definition: packet-inl.h:204
MSHADOW_CINLINE packet::Packet< DType, Arch > EvalPacket(index_t y, index_t x) const
Definition: packet-inl.h:247
static MSHADOW_CINLINE void Save(TFloat *dst, const Packet< TFloat, Arch > &src)
Definition: packet-inl.h:205
const Container & self(void) const
Definition: expression.h:83
const SubType & real_self(void) const
true self of subtype
Definition: expr_engine-inl.h:50
static MSHADOW_CINLINE Packet< DType, Arch > Map(const Packet< DType, Arch > &lhs, const Packet< DType, Arch > &rhs)
Definition: packet-inl.h:163
a general class that allows extension that makes tensors of some shape
Definition: expr_engine-inl.h:44
const TA & lhs_
left operand
Definition: expression.h:338
overloaded + operator between half_t and bf16_t
Definition: base.h:327
MSHADOW_XINLINE index_t size(int idx) const
return size of i-th dimension, start counting from highest dimension
Definition: tensor.h:506
index_t stride_
storing the stride information in x dimension; this is used to deal with pitch allocation in gpu or ss...
Definition: tensor.h:442
static bool Check(const UnaryMapExp< OP, TA, DType, etype > &t)
Definition: packet-inl.h:396
general tensor
Definition: tensor.h:421
static bool Check(const Tensor< cpu, dim, DType > &t)
Definition: packet-inl.h:389
void MapPacketPlan(Tensor< cpu, dim, DType > _dst, const expr::PacketPlan< E, DType, Arch > &plan)
use PacketPlan to compute result
Definition: packet-inl.h:413
support of plain packet that use the plain datatype.
Generic packet type.
Definition: packet-inl.h:60
static bool Check(const ScalarExp< DType > &exp)
Definition: packet-inl.h:383
index_t openmp_index_t
openmp index for linux
Definition: base.h:344
MSHADOW_CINLINE DType Eval(index_t y, index_t x) const
Definition: packet-inl.h:266
static check packet enable
Definition: packet-inl.h:343