26 #ifndef MSHADOW_PACKET_SSE_INL_H_ 27 #define MSHADOW_PACKET_SSE_INL_H_ 29 #include <emmintrin.h> 31 #include "../packet-inl.h" 45 explicit Packet(__m128 data) : data_(data) {}
60 data_ = _mm_set1_ps(s);
65 _mm_store_ps(dst, data_);
69 __m128 ans = _mm_add_ps(data_, _mm_movehl_ps(data_, data_));
70 __m128 rst = _mm_add_ss(ans, _mm_shuffle_ps(ans, ans, 1));
71 #if defined(_MSC_VER) && (_MSC_VER <= 1500) && defined(_WIN64) 72 return rst.m128_f32[0];
74 float rr = _mm_cvtss_f32(rst);
90 explicit Packet(__m128d data) : data_(data) {}
104 data_ = _mm_set1_pd(s);
109 _mm_store_pd(dst, data_);
112 inline double Sum(
void)
const {
113 __m128d tmp = _mm_add_sd(data_, _mm_unpackhi_pd(data_, data_));
114 #if defined(_MSC_VER) && (_MSC_VER <= 1500) && defined(_WIN64) 115 return tmp.m128d_f64[0];
117 double ans = _mm_cvtsd_f64(tmp);
166 #endif // MSHADOW_PACKET_SSE_INL_H_
vector real type for double
Definition: sse-inl.h:83
static MSHADOW_CINLINE Packet< float, kSSE2 > Fill(float s)
Definition: sse-inl.h:47
MSHADOW_CINLINE Packet< float, kSSE2 > & operator=(float s)
Definition: sse-inl.h:59
static MSHADOW_CINLINE Packet< float, kSSE2 > LoadUnAligned(const float *src)
Definition: sse-inl.h:55
MSHADOW_CINLINE Packet< DType, kPlain > operator-(const Packet< DType, kPlain > &lhs, const Packet< DType, kPlain > &rhs)
Definition: plain-inl.h:78
MSHADOW_CINLINE Packet< DType, kPlain > operator/(const Packet< DType, kPlain > &lhs, const Packet< DType, kPlain > &rhs)
Definition: plain-inl.h:89
MSHADOW_CINLINE void Store(float *dst) const
Definition: sse-inl.h:64
static MSHADOW_CINLINE Packet< double, kSSE2 > Load(const double *src)
Definition: sse-inl.h:96
Packet(__m128 data)
Definition: sse-inl.h:45
__m128d data_
Definition: sse-inl.h:87
MSHADOW_CINLINE float Sum() const
Definition: sse-inl.h:68
int32_t index_t
type that will be used for index
Definition: base.h:336
Packet(__m128d data)
Definition: sse-inl.h:90
MSHADOW_CINLINE Packet< DType, kPlain > operator*(const Packet< DType, kPlain > &lhs, const Packet< DType, kPlain > &rhs)
Definition: plain-inl.h:83
Definition: packet-inl.h:44
MSHADOW_CINLINE Packet< double, kSSE2 > & operator=(double s)
Definition: sse-inl.h:103
double Sum(void) const
Definition: sse-inl.h:112
__m128 data_
The internal data.
Definition: sse-inl.h:41
MSHADOW_CINLINE Packet< DType, kPlain > operator+(const Packet< DType, kPlain > &lhs, const Packet< DType, kPlain > &rhs)
Definition: plain-inl.h:72
#define MSHADOW_CINLINE
cpu force inline
Definition: base.h:226
static MSHADOW_CINLINE Packet< float, kSSE2 > Load(const float *src)
Definition: sse-inl.h:51
overloaded + operator between half_t and bf16_t
Definition: base.h:327
static MSHADOW_CINLINE Packet< double, kSSE2 > LoadUnAligned(const double *src)
Definition: sse-inl.h:99
Packet(void)
Definition: sse-inl.h:43
static MSHADOW_CINLINE Packet< double, kSSE2 > Fill(double s)
Definition: sse-inl.h:92
MSHADOW_CINLINE void Store(double *dst) const
Definition: sse-inl.h:108
Generic packet type.
Definition: packet-inl.h:60
Packet(void)
Definition: sse-inl.h:89