30 #ifndef GDALSSE_PRIV_H_INCLUDED
31 #define GDALSSE_PRIV_H_INCLUDED
35 #if (defined(__x86_64) || defined(_M_X64)) && !defined(USE_SSE2_EMULATION)
38 #include <emmintrin.h>
// Static helper returning an XMMReg2Double that is filled by calling
// nsLoad2ValAligned(ptr) on a local `reg`.
// NOTE(review): the declaration of `reg` and the return statement are not
// visible in this excerpt.
71 static inline XMMReg2Double Load2ValAligned(
const double* ptr)
// Delegates the actual load to the member function.
74 reg.nsLoad2ValAligned(ptr);
78 static inline XMMReg2Double Load2Val(
const unsigned char* ptr)
92 static inline XMMReg2Double Load2Val(
const unsigned short* ptr)
// Loads two consecutive doubles starting at ptr into the member xmm.
99 inline void nsLoad2Val(
const double* ptr)
// _mm_loadu_pd is the unaligned load: ptr needs no particular alignment.
101 xmm = _mm_loadu_pd(ptr);
// Loads two consecutive doubles starting at pval into the member xmm.
104 inline void nsLoad2ValAligned(
const double* pval)
// _mm_load_pd is the aligned load: pval must be 16-byte aligned.
106 xmm = _mm_load_pd(pval);
// Loads two consecutive floats starting at pval and widens them to the two
// doubles held in the member xmm.
109 inline void nsLoad2Val(
const float* pval)
// Each _mm_load_ss reads one float into lane 0 and zeroes the other lanes:
// temp1 = [pval[0], 0, 0, 0], temp2 = [pval[1], 0, 0, 0].
111 __m128 temp1 = _mm_load_ss(pval);
112 __m128 temp2 = _mm_load_ss(pval + 1);
// Take lanes 0,1 of temp1 and lanes 0,1 of temp2 -> [pval[0], 0, pval[1], 0].
113 temp1 = _mm_shuffle_ps(temp1, temp2, _MM_SHUFFLE(1,0,1,0));
// Compact lanes 0 and 2 into lanes 0 and 1 -> [pval[0], pval[1], 0, 0].
114 temp1 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3,3,2,0));
// Convert the two low floats to doubles.
115 xmm = _mm_cvtps_pd(temp1);
// Loads two consecutive unsigned bytes starting at ptr and converts them to
// the two doubles held in the member xmm.
118 inline void nsLoad2Val(
const unsigned char* ptr)
// Reads both bytes in one go as a 16-bit value placed in the low 32 bits.
// NOTE(review): the cast drops const and reads unsigned char storage through
// an unsigned short lvalue (type punning, possibly unaligned); copying into a
// local with memcpy would avoid relying on that - confirm before changing.
120 __m128i xmm_i = _mm_cvtsi32_si128(*(
unsigned short*)(ptr));
// Interleave with zero bytes: each 8-bit value becomes a zero-extended 16-bit value.
121 xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
// Interleave with zero words: each 16-bit value becomes a zero-extended 32-bit value.
122 xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
// Convert the two low 32-bit integers to doubles.
123 xmm = _mm_cvtepi32_pd(xmm_i);
// Converts two signed 16-bit values to the two doubles held in the member xmm.
// NOTE(review): the lines that declare and fill `i` are not visible in this
// excerpt; presumably it holds the two shorts read from ptr - confirm.
126 inline void nsLoad2Val(
const short* ptr)
// Place the 32-bit value i in the low lane of an integer register.
130 __m128i xmm_i = _mm_cvtsi32_si128(i);
// Duplicate each 16-bit word so it occupies both halves of a 32-bit lane.
131 xmm_i = _mm_unpacklo_epi16(xmm_i,xmm_i);
// Arithmetic right shift by 16 leaves each word sign-extended to 32 bits.
132 xmm_i = _mm_srai_epi32(xmm_i, 16);
// Convert the two low 32-bit integers to doubles.
133 xmm = _mm_cvtepi32_pd(xmm_i);
// Converts two unsigned 16-bit values to the two doubles held in the member xmm.
// NOTE(review): the lines that declare and fill `i` are not visible in this
// excerpt; presumably it holds the two unsigned shorts read from ptr - confirm.
136 inline void nsLoad2Val(
const unsigned short* ptr)
// Place the 32-bit value i in the low lane of an integer register.
140 __m128i xmm_i = _mm_cvtsi32_si128(i);
// Duplicate each 16-bit word so it occupies both halves of a 32-bit lane.
141 xmm_i = _mm_unpacklo_epi16(xmm_i,xmm_i);
// Logical right shift by 16 leaves each word zero-extended to 32 bits.
142 xmm_i = _mm_srli_epi32(xmm_i, 16);
// Convert the two low 32-bit integers to doubles.
143 xmm = _mm_cvtepi32_pd(xmm_i);
148 __m128i xmm_i = _mm_cvtsi32_si128(*(
int*)(ptr));
149 xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
150 xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
151 low.xmm = _mm_cvtepi32_pd(xmm_i);
152 high.xmm = _mm_cvtepi32_pd(_mm_shuffle_epi32(xmm_i,_MM_SHUFFLE(3,2,3,2)));
158 high.nsLoad2Val(ptr+2);
164 high.nsLoad2Val(ptr+2);
170 high.nsLoad2Val(ptr+2);
175 __m128 temp1 = _mm_loadu_ps(ptr);
176 __m128 temp2 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3,2,3,2));
177 low.xmm = _mm_cvtps_pd(temp1);
178 high.xmm = _mm_cvtps_pd(temp2);
// Sets both double lanes of the member xmm to 0.0.
181 inline void Zeroize()
183 xmm = _mm_setzero_pd();
194 xmm = _mm_add_pd(xmm, other.xmm);
201 ret.xmm = _mm_add_pd(xmm, other.xmm);
208 ret.xmm = _mm_sub_pd(xmm, other.xmm);
215 ret.xmm = _mm_mul_pd(xmm, other.xmm);
221 xmm = _mm_mul_pd(xmm, other.xmm);
// Horizontal add: after the call both lanes of xmm hold (low + high).
// NOTE(review): the declaration of xmm2 is not visible in this excerpt.
225 inline void AddLowAndHigh()
// _MM_SHUFFLE2(0,1) selects the high lane into lane 0 and the low lane into
// lane 1, i.e. xmm2 is xmm with its two lanes swapped.
228 xmm2 = _mm_shuffle_pd(xmm,xmm,_MM_SHUFFLE2(0,1));
// Lane-wise add of the original and the swapped copy.
229 xmm = _mm_add_pd(xmm, xmm2);
// Writes the two doubles held in xmm to pval[0] and pval[1].
232 inline void Store2Double(
double* pval)
// _mm_storeu_pd is the unaligned store: pval needs no particular alignment.
234 _mm_storeu_pd(pval, xmm);
// Writes the two doubles held in xmm to pval[0] and pval[1].
237 inline void Store2DoubleAligned(
double* pval)
// _mm_store_pd is the aligned store: pval must be 16-byte aligned.
239 _mm_store_pd(pval, xmm);
// Conversion to double based on the low lane of xmm.
// NOTE(review): the declaration of `val` and the return statement are not
// visible in this excerpt; presumably `val` is what gets returned - confirm.
242 inline operator double ()
const
// _mm_store_sd writes only the low double of xmm into val.
245 _mm_store_sd(&val, xmm);
252 #warning "Software emulation of SSE2 !"
// Software-emulation counterpart of the static Load2ValAligned helper: fills a
// local `reg` by calling nsLoad2ValAligned(ptr) on it.
// NOTE(review): the declaration of `reg` and the return statement are not
// visible in this excerpt.
278 static inline XMMReg2Double Load2ValAligned(
const double* ptr)
// Delegates the actual load to the member function.
281 reg.nsLoad2ValAligned(ptr);
292 static inline XMMReg2Double Load2Val(
const unsigned char* ptr)
306 inline void nsLoad2Val(
const double* pval)
312 inline void nsLoad2ValAligned(
const double* pval)
318 inline void nsLoad2Val(
const float* pval)
324 inline void nsLoad2Val(
const unsigned char* ptr)
330 inline void nsLoad2Val(
const short* ptr)
336 inline void nsLoad2Val(
const unsigned short* ptr)
353 high.nsLoad2Val(ptr+2);
359 high.nsLoad2Val(ptr+2);
365 high.nsLoad2Val(ptr+2);
371 high.nsLoad2Val(ptr+2);
374 inline void Zeroize()
397 ret.low = low + other.low;
398 ret.high = high + other.high;
405 ret.low = low - other.low;
406 ret.high = high - other.high;
413 ret.low = low * other.low;
414 ret.high = high * other.high;
// Software-emulation horizontal add over the `low` and `high` members.
// NOTE(review): the statements that consume `add` are not visible in this
// excerpt; presumably the sum is written back to both members - confirm.
425 inline void AddLowAndHigh()
// Sum of the two emulated lanes.
427 double add = low + high;
432 inline void Store2Double(
double* pval)
438 inline void Store2DoubleAligned(
double* pval)
444 inline operator double ()
const
// Static helper returning an XMMReg4Double built from unsigned bytes at ptr.
// NOTE(review): the declaration of `reg` and the return statement are not
// visible in this excerpt.
468 static inline XMMReg4Double Load4Val(
const unsigned char* ptr)
// Hands both halves of reg to XMMReg2Double::Load4Val as destinations;
// presumably that call fills them from ptr[0..3] - confirm against its definition.
471 XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
478 reg.low.nsLoad2Val(ptr);
479 reg.high.nsLoad2Val(ptr+2);
// Static helper returning an XMMReg4Double built from unsigned shorts at ptr.
// NOTE(review): the declaration of `reg` and the return statement are not
// visible in this excerpt.
483 static inline XMMReg4Double Load4Val(
const unsigned short* ptr)
// The low half is loaded starting at ptr, the high half starting at ptr+2.
486 reg.low.nsLoad2Val(ptr);
487 reg.high.nsLoad2Val(ptr+2);
494 reg.low.nsLoad2Val(ptr);
495 reg.high.nsLoad2Val(ptr+2);
// Static helper returning an XMMReg4Double built from doubles at ptr using the
// aligned two-value loader for each half.
// NOTE(review): the declaration of `reg` and the return statement are not
// visible in this excerpt.
499 static inline XMMReg4Double Load4ValAligned(
const double* ptr)
// The low half is loaded starting at ptr, the high half starting at ptr+2.
// With 8-byte doubles ptr+2 is 16 bytes past ptr, so it shares ptr's 16-byte
// alignment.
502 reg.low.nsLoad2ValAligned(ptr);
503 reg.high.nsLoad2ValAligned(ptr+2);
510 XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
531 ret.low = low + other.low;
532 ret.high = high + other.high;
539 ret.low = low - other.low;
540 ret.high = high - other.high;
547 ret.low = low * other.low;
548 ret.high = high * other.high;
559 inline void AddLowAndHigh()
Definition: gdalsse_priv.h:254
Definition: gdalsse_priv.h:452