mxnet
sse-inl.h
Go to the documentation of this file.
1 
7 #ifndef MSHADOW_PACKET_SSE_INL_H_
8 #define MSHADOW_PACKET_SSE_INL_H_
9 
10 #include <emmintrin.h>
11 #include "../base.h"
12 #include "../packet-inl.h"
13 
14 namespace mshadow {
15 namespace packet {
16 template<>
17 struct Packet<float, kSSE2> {
18  public:
20  static constexpr index_t size = 4;
22  __m128 data_;
23  // enable default copy constructor
24  Packet(void) {}
25  // constructor from the intrinsic type
26  explicit Packet(__m128 data) : data_(data) {}
27  // create a fill with the target value s
29  return Packet<float, kSSE2>(_mm_set1_ps(s));
30  }
31  // load from address
32  MSHADOW_CINLINE static Packet<float, kSSE2> Load(const float* src) {
33  return Packet<float, kSSE2>(_mm_load_ps(src));
34  }
35  // load from address
37  return Packet<float, kSSE2>(_mm_loadu_ps(src));
38  }
39  // fill it with value s
41  data_ = _mm_set1_ps(s);
42  return *this;
43  }
44  // store data into dst
45  MSHADOW_CINLINE void Store(float* dst) const {
46  _mm_store_ps(dst, data_);
47  }
48  // get the sum of all contents
49  MSHADOW_CINLINE float Sum() const {
50  __m128 ans = _mm_add_ps(data_, _mm_movehl_ps(data_, data_));
51  __m128 rst = _mm_add_ss(ans, _mm_shuffle_ps(ans, ans, 1));
52 #if defined(_MSC_VER) && (_MSC_VER <= 1500) && defined(_WIN64)
53  return rst.m128_f32[0];
54 #else
55  float rr = _mm_cvtss_f32(rst);
56  return rr;
57 #endif
58  }
59 };
60 
61 
63 template<>
64 struct Packet<double, kSSE2> {
66  static constexpr index_t size = 2;
67  // internal data
68  __m128d data_;
69  // constructor
70  Packet(void) {}
71  explicit Packet(__m128d data) : data_(data) {}
72  // create a fill with the target value s
74  return Packet<double, kSSE2>(_mm_set1_pd(s));
75  }
76  // load from address
77  MSHADOW_CINLINE static Packet<double, kSSE2> Load(const double* src) {
78  return Packet<double, kSSE2>(_mm_load_pd(src));
79  }
81  return Packet<double, kSSE2>(_mm_loadu_pd(src));
82  }
83  // fill it with value s
85  data_ = _mm_set1_pd(s);
86  return *this;
87  }
88  // store data into dst
89  MSHADOW_CINLINE void Store(double* dst) const {
90  _mm_store_pd(dst, data_);
91  }
92  // get sum of all content
93  inline double Sum(void) const {
94  __m128d tmp = _mm_add_sd(data_, _mm_unpackhi_pd(data_, data_));
95 #if defined(_MSC_VER) && (_MSC_VER <= 1500) && defined(_WIN64)
96  return tmp.m128d_f64[0];
97 #else
98  double ans = _mm_cvtsd_f64(tmp);
99  return ans;
100 #endif
101  }
102 };
103 
105  const Packet<float, kSSE2>& rhs) {
106  return Packet<float, kSSE2>(_mm_add_ps(lhs.data_, rhs.data_));
107 }
108 
110  const Packet<double, kSSE2>& rhs) {
111  return Packet<double, kSSE2>(_mm_add_pd(lhs.data_, rhs.data_));
112 }
113 
115  const Packet<float, kSSE2>& rhs) {
116  return Packet<float, kSSE2>(_mm_sub_ps(lhs.data_, rhs.data_));
117 }
118 
120  const Packet<double, kSSE2>& rhs) {
121  return Packet<double, kSSE2>(_mm_sub_pd(lhs.data_, rhs.data_));
122 }
123 
125  const Packet<float, kSSE2>& rhs) {
126  return Packet<float, kSSE2>(_mm_mul_ps(lhs.data_, rhs.data_));
127 }
128 
130  const Packet<double, kSSE2>& rhs) {
131  return Packet<double, kSSE2>(_mm_mul_pd(lhs.data_, rhs.data_));
132 }
133 
134 
136  const Packet<float, kSSE2>& rhs) {
137  return Packet<float, kSSE2>(_mm_div_ps(lhs.data_, rhs.data_));
138 }
139 
141  const Packet<double, kSSE2>& rhs) {
142  return Packet<double, kSSE2>(_mm_div_pd(lhs.data_, rhs.data_));
143 }
144 
145 } // namespace packet
146 } // namespace mshadow
147 #endif // MSHADOW_PACKET_SSE_INL_H_
vector real type for double
Definition: sse-inl.h:64
static MSHADOW_CINLINE Packet< float, kSSE2 > Fill(float s)
Definition: sse-inl.h:28
MSHADOW_CINLINE Packet< float, kSSE2 > & operator=(float s)
Definition: sse-inl.h:40
static MSHADOW_CINLINE Packet< float, kSSE2 > LoadUnAligned(const float *src)
Definition: sse-inl.h:36
MSHADOW_CINLINE Packet< DType, kPlain > operator-(const Packet< DType, kPlain > &lhs, const Packet< DType, kPlain > &rhs)
Definition: plain-inl.h:59
MSHADOW_CINLINE Packet< DType, kPlain > operator/(const Packet< DType, kPlain > &lhs, const Packet< DType, kPlain > &rhs)
Definition: plain-inl.h:70
MSHADOW_CINLINE void Store(float *dst) const
Definition: sse-inl.h:45
static MSHADOW_CINLINE Packet< double, kSSE2 > Load(const double *src)
Definition: sse-inl.h:77
Packet(__m128 data)
Definition: sse-inl.h:26
__m128d data_
Definition: sse-inl.h:68
MSHADOW_CINLINE float Sum() const
Definition: sse-inl.h:49
int32_t index_t
type that will be used for index
Definition: base.h:291
Packet(__m128d data)
Definition: sse-inl.h:71
MSHADOW_CINLINE Packet< DType, kPlain > operator*(const Packet< DType, kPlain > &lhs, const Packet< DType, kPlain > &rhs)
Definition: plain-inl.h:64
Definition: packet-inl.h:25
MSHADOW_CINLINE Packet< double, kSSE2 > & operator=(double s)
Definition: sse-inl.h:84
double Sum(void) const
Definition: sse-inl.h:93
__m128 data_
The internal data.
Definition: sse-inl.h:22
MSHADOW_CINLINE Packet< DType, kPlain > operator+(const Packet< DType, kPlain > &lhs, const Packet< DType, kPlain > &rhs)
Definition: plain-inl.h:53
#define MSHADOW_CINLINE
cpu force inline
Definition: base.h:207
static MSHADOW_CINLINE Packet< float, kSSE2 > Load(const float *src)
Definition: sse-inl.h:32
namespace for mshadow
Definition: base.h:282
static MSHADOW_CINLINE Packet< double, kSSE2 > LoadUnAligned(const double *src)
Definition: sse-inl.h:80
Packet(void)
Definition: sse-inl.h:24
static MSHADOW_CINLINE Packet< double, kSSE2 > Fill(double s)
Definition: sse-inl.h:73
MSHADOW_CINLINE void Store(double *dst) const
Definition: sse-inl.h:89
Generic packet type.
Definition: packet-inl.h:41
Packet(void)
Definition: sse-inl.h:70