mxnet
sse-inl.h
Go to the documentation of this file.
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License. You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied. See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
25 #ifndef MSHADOW_PACKET_SSE_INL_H_
26 #define MSHADOW_PACKET_SSE_INL_H_
27 
28 #include <emmintrin.h>
29 #include "../base.h"
30 #include "../packet-inl.h"
31 
32 namespace mshadow {
33 namespace packet {
34 template<>
35 struct Packet<float, kSSE2> {
36  public:
38  static constexpr index_t size = 4;
40  __m128 data_;
41  // enable default copy constructor
42  Packet(void) {}
43  // constructor from the intrinsic type
44  explicit Packet(__m128 data) : data_(data) {}
45  // create a fill with the target value s
47  return Packet<float, kSSE2>(_mm_set1_ps(s));
48  }
49  // load from address
50  MSHADOW_CINLINE static Packet<float, kSSE2> Load(const float* src) {
51  return Packet<float, kSSE2>(_mm_load_ps(src));
52  }
53  // load from address
55  return Packet<float, kSSE2>(_mm_loadu_ps(src));
56  }
57  // fill it with value s
59  data_ = _mm_set1_ps(s);
60  return *this;
61  }
62  // store data into dst
63  MSHADOW_CINLINE void Store(float* dst) const {
64  _mm_store_ps(dst, data_);
65  }
66  // get the sum of all contents
67  MSHADOW_CINLINE float Sum() const {
68  __m128 ans = _mm_add_ps(data_, _mm_movehl_ps(data_, data_));
69  __m128 rst = _mm_add_ss(ans, _mm_shuffle_ps(ans, ans, 1));
70 #if defined(_MSC_VER) && (_MSC_VER <= 1500) && defined(_WIN64)
71  return rst.m128_f32[0];
72 #else
73  float rr = _mm_cvtss_f32(rst);
74  return rr;
75 #endif
76  }
77 };
78 
79 
81 template<>
82 struct Packet<double, kSSE2> {
84  static constexpr index_t size = 2;
85  // internal data
86  __m128d data_;
87  // constructor
88  Packet(void) {}
89  explicit Packet(__m128d data) : data_(data) {}
90  // create a fill with the target value s
92  return Packet<double, kSSE2>(_mm_set1_pd(s));
93  }
94  // load from address
95  MSHADOW_CINLINE static Packet<double, kSSE2> Load(const double* src) {
96  return Packet<double, kSSE2>(_mm_load_pd(src));
97  }
99  return Packet<double, kSSE2>(_mm_loadu_pd(src));
100  }
101  // fill it with value s
103  data_ = _mm_set1_pd(s);
104  return *this;
105  }
106  // store data into dst
107  MSHADOW_CINLINE void Store(double* dst) const {
108  _mm_store_pd(dst, data_);
109  }
110  // get sum of all content
111  inline double Sum(void) const {
112  __m128d tmp = _mm_add_sd(data_, _mm_unpackhi_pd(data_, data_));
113 #if defined(_MSC_VER) && (_MSC_VER <= 1500) && defined(_WIN64)
114  return tmp.m128d_f64[0];
115 #else
116  double ans = _mm_cvtsd_f64(tmp);
117  return ans;
118 #endif
119  }
120 };
121 
123  const Packet<float, kSSE2>& rhs) {
124  return Packet<float, kSSE2>(_mm_add_ps(lhs.data_, rhs.data_));
125 }
126 
128  const Packet<double, kSSE2>& rhs) {
129  return Packet<double, kSSE2>(_mm_add_pd(lhs.data_, rhs.data_));
130 }
131 
133  const Packet<float, kSSE2>& rhs) {
134  return Packet<float, kSSE2>(_mm_sub_ps(lhs.data_, rhs.data_));
135 }
136 
138  const Packet<double, kSSE2>& rhs) {
139  return Packet<double, kSSE2>(_mm_sub_pd(lhs.data_, rhs.data_));
140 }
141 
143  const Packet<float, kSSE2>& rhs) {
144  return Packet<float, kSSE2>(_mm_mul_ps(lhs.data_, rhs.data_));
145 }
146 
148  const Packet<double, kSSE2>& rhs) {
149  return Packet<double, kSSE2>(_mm_mul_pd(lhs.data_, rhs.data_));
150 }
151 
152 
154  const Packet<float, kSSE2>& rhs) {
155  return Packet<float, kSSE2>(_mm_div_ps(lhs.data_, rhs.data_));
156 }
157 
159  const Packet<double, kSSE2>& rhs) {
160  return Packet<double, kSSE2>(_mm_div_pd(lhs.data_, rhs.data_));
161 }
162 
163 } // namespace packet
164 } // namespace mshadow
165 #endif // MSHADOW_PACKET_SSE_INL_H_
mshadow::packet::Packet< float, kSSE2 >::LoadUnAligned
static MSHADOW_CINLINE Packet< float, kSSE2 > LoadUnAligned(const float *src)
Definition: sse-inl.h:54
mshadow::packet::kSSE2
@ kSSE2
Definition: packet-inl.h:43
mshadow::packet::Packet< double, kSSE2 >::Packet
Packet(__m128d data)
Definition: sse-inl.h:89
mshadow::packet::operator/
MSHADOW_CINLINE Packet< DType, kPlain > operator/(const Packet< DType, kPlain > &lhs, const Packet< DType, kPlain > &rhs)
Definition: plain-inl.h:88
mshadow::packet::Packet< float, kSSE2 >::Store
MSHADOW_CINLINE void Store(float *dst) const
Definition: sse-inl.h:63
mshadow::packet::Packet< float, kSSE2 >::Sum
MSHADOW_CINLINE float Sum() const
Definition: sse-inl.h:67
mshadow::packet::Packet
Generic packet type.
Definition: packet-inl.h:59
mshadow::packet::Packet< double, kSSE2 >::data_
__m128d data_
Definition: sse-inl.h:86
mshadow::packet::Packet< float, kSSE2 >::Fill
static MSHADOW_CINLINE Packet< float, kSSE2 > Fill(float s)
Definition: sse-inl.h:46
mshadow::packet::Packet< float, kSSE2 >::Load
static MSHADOW_CINLINE Packet< float, kSSE2 > Load(const float *src)
Definition: sse-inl.h:50
mshadow::packet::Packet< float, kSSE2 >::Packet
Packet(void)
Definition: sse-inl.h:42
MSHADOW_CINLINE
#define MSHADOW_CINLINE
cpu force inline
Definition: base.h:231
mshadow::packet::operator-
MSHADOW_CINLINE Packet< DType, kPlain > operator-(const Packet< DType, kPlain > &lhs, const Packet< DType, kPlain > &rhs)
Definition: plain-inl.h:77
mshadow::index_t
int32_t index_t
type that will be used for index
Definition: base.h:328
mshadow::packet::Packet< double, kSSE2 >::Store
MSHADOW_CINLINE void Store(double *dst) const
Definition: sse-inl.h:107
mshadow::packet::Packet< float, kSSE2 >::Packet
Packet(__m128 data)
Definition: sse-inl.h:44
mshadow::packet::Packet< float, kSSE2 >::data_
__m128 data_
The internal data.
Definition: sse-inl.h:40
mshadow
overloaded + operator between half_t and bf16_t
Definition: base.h:319
mshadow::packet::Packet< double, kSSE2 >::Packet
Packet(void)
Definition: sse-inl.h:88
mshadow::packet::Packet< double, kSSE2 >::Load
static MSHADOW_CINLINE Packet< double, kSSE2 > Load(const double *src)
Definition: sse-inl.h:95
mshadow::packet::Packet< double, kSSE2 >::Sum
double Sum(void) const
Definition: sse-inl.h:111
mshadow::packet::Packet< float, kSSE2 >::operator=
MSHADOW_CINLINE Packet< float, kSSE2 > & operator=(float s)
Definition: sse-inl.h:58
mshadow::packet::Packet< double, kSSE2 >::LoadUnAligned
static MSHADOW_CINLINE Packet< double, kSSE2 > LoadUnAligned(const double *src)
Definition: sse-inl.h:98
mshadow::packet::operator*
MSHADOW_CINLINE Packet< DType, kPlain > operator*(const Packet< DType, kPlain > &lhs, const Packet< DType, kPlain > &rhs)
Definition: plain-inl.h:82
mshadow::packet::operator+
MSHADOW_CINLINE Packet< DType, kPlain > operator+(const Packet< DType, kPlain > &lhs, const Packet< DType, kPlain > &rhs)
Definition: plain-inl.h:71
mshadow::packet::Packet< double, kSSE2 >::operator=
MSHADOW_CINLINE Packet< double, kSSE2 > & operator=(double s)
Definition: sse-inl.h:102
mshadow::packet::Packet< double, kSSE2 >
vector real type for double
Definition: sse-inl.h:82
mshadow::packet::Packet< float, kSSE2 >
Definition: sse-inl.h:35
mshadow::packet::Packet< double, kSSE2 >::Fill
static MSHADOW_CINLINE Packet< double, kSSE2 > Fill(double s)
Definition: sse-inl.h:91