77 #ifndef INCLUDED_volk_32f_acos_32f_a_H
78 #define INCLUDED_volk_32f_acos_32f_a_H
81 #include <smmintrin.h>
/*
 * Arccosine kernel, 4 floats per iteration, using aligned SSE loads and stores.
 *
 * bVector    - output buffer; receives one result per input element.
 * aVector    - input buffer of floats.
 * num_points - number of elements to process.
 *
 * The vector loop handles num_points / 4 groups of four floats with
 * _mm_load_ps / _mm_store_ps (both require 16-byte aligned addresses);
 * the leftover num_points % 4 elements go through the C library acos().
 *
 * NOTE(review): this text looks like a rendered listing -- every line carries
 * a stray leading line number and the numbering has gaps. The return type,
 * the braces, the declarations of i and j, the first assignments of d, z, x,
 * y and arccosine, the loop header over j, and the per-iteration advance of
 * aPtr / bPtr are not visible here. Confirm against the real header.
 */
84 volk_32f_acos_32f_a_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
86 float* bPtr = bVector;
87 const float* aPtr = aVector;
89 unsigned int number = 0;
/* Number of complete 4-float groups. */
90 unsigned int quarterPoints = num_points / 4;
93 __m128 aVal, d, pi, pio2, x, y, z, arccosine;
94 __m128 fzeroes, fones, ftwos, ffours, condition;
/* Broadcast constants: pi, pi/2, 0, 1, 2 and 4 in every lane. */
96 pi = _mm_set1_ps(3.14159265358979323846);
97 pio2 = _mm_set1_ps(3.14159265358979323846/2);
98 fzeroes = _mm_setzero_ps();
99 fones = _mm_set1_ps(1.0);
100 ftwos = _mm_set1_ps(2.0);
101 ffours = _mm_set1_ps(4.0);
103 for(;number < quarterPoints; number++){
104 aVal = _mm_load_ps(aPtr);
/* aVal <- sqrt((1 + a) * (1 - a)) / a, i.e. sqrt(1 - a^2) / a. For a in
 * [-1, 1], a != 0, that is tan(acos(a)), so what follows evaluates an
 * arctangent. */
106 aVal = _mm_div_ps(_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))), aVal);
/* Lanes with z < 0 get 2*z subtracted, which yields |z|.
 * NOTE(review): no assignment to z is visible before this read. */
108 condition = _mm_cmplt_ps(z, fzeroes);
109 z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
/* Lanes with z < 1 add (1/z - z) to x; if x held z beforehand this turns x
 * into 1/z. NOTE(review): the initial value of x is not visible here. */
111 condition = _mm_cmplt_ps(z, fones);
112 x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
/* Two rounds of x <- x + sqrt(1 + x^2), then the reciprocal. This appears to
 * be argument reduction via the cotangent half-angle identity, matching the
 * factor of 4 applied to y below -- TODO confirm. */
114 for(i = 0; i < 2; i++)
115 x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
116 x = _mm_div_ps(fones, x);
/* One Horner step in x^2 with coefficient (-1)^j / (2j + 1), the arctangent
 * series coefficients. NOTE(review): the enclosing loop over j and the
 * initial value of y are not visible here. */
119 y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
/* Multiply the polynomial by x and by 4. */
121 y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
/* Lanes with z > 1: y <- y + (pi/2 - 2*y) = pi/2 - y. */
122 condition = _mm_cmpgt_ps(z, fones);
124 y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
/* Lanes with aVal < 0: negate arccosine (subtract twice its value).
 * NOTE(review): no assignment to arccosine is visible before this read. */
126 condition = _mm_cmplt_ps(aVal, fzeroes);
127 arccosine = _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
/* Lanes with d < 0: add pi. NOTE(review): d is never assigned in the visible
 * lines; presumably it holds the unmodified input -- confirm. */
128 condition = _mm_cmplt_ps(d, fzeroes);
129 arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
/* Aligned store of four results. */
131 _mm_store_ps(bPtr, arccosine);
/* Scalar tail: elements from quarterPoints * 4 up to num_points. */
136 number = quarterPoints * 4;
137 for(;number < num_points; number++){
138 *bPtr++ = acos(*aPtr++);
147 #ifndef INCLUDED_volk_32f_acos_32f_u_H
148 #define INCLUDED_volk_32f_acos_32f_u_H
150 #ifdef LV_HAVE_SSE4_1
151 #include <smmintrin.h>
/*
 * Arccosine kernel, 4 floats per iteration, using unaligned SSE loads and
 * stores.
 *
 * bVector    - output buffer; receives one result per input element.
 * aVector    - input buffer of floats.
 * num_points - number of elements to process.
 *
 * The vector loop handles num_points / 4 groups of four floats with
 * _mm_loadu_ps / _mm_storeu_ps (no alignment requirement on the addresses);
 * the leftover num_points % 4 elements go through the C library acos().
 *
 * NOTE(review): this text looks like a rendered listing -- every line carries
 * a stray leading line number and the numbering has gaps. The return type,
 * the braces, the declarations of i and j, the first assignments of d, z, x,
 * y and arccosine, the loop header over j, and the per-iteration advance of
 * aPtr / bPtr are not visible here. Confirm against the real header.
 */
154 volk_32f_acos_32f_u_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
156 float* bPtr = bVector;
157 const float* aPtr = aVector;
159 unsigned int number = 0;
/* Number of complete 4-float groups. */
160 unsigned int quarterPoints = num_points / 4;
163 __m128 aVal, d, pi, pio2, x, y, z, arccosine;
164 __m128 fzeroes, fones, ftwos, ffours, condition;
/* Broadcast constants: pi, pi/2, 0, 1, 2 and 4 in every lane. */
166 pi = _mm_set1_ps(3.14159265358979323846);
167 pio2 = _mm_set1_ps(3.14159265358979323846/2);
168 fzeroes = _mm_setzero_ps();
169 fones = _mm_set1_ps(1.0);
170 ftwos = _mm_set1_ps(2.0);
171 ffours = _mm_set1_ps(4.0);
173 for(;number < quarterPoints; number++){
174 aVal = _mm_loadu_ps(aPtr);
/* aVal <- sqrt((1 + a) * (1 - a)) / a, i.e. sqrt(1 - a^2) / a. For a in
 * [-1, 1], a != 0, that is tan(acos(a)), so what follows evaluates an
 * arctangent. */
176 aVal = _mm_div_ps(_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))), aVal);
/* Lanes with z < 0 get 2*z subtracted, which yields |z|.
 * NOTE(review): no assignment to z is visible before this read. */
178 condition = _mm_cmplt_ps(z, fzeroes);
179 z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
/* Lanes with z < 1 add (1/z - z) to x; if x held z beforehand this turns x
 * into 1/z. NOTE(review): the initial value of x is not visible here. */
181 condition = _mm_cmplt_ps(z, fones);
182 x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
/* Two rounds of x <- x + sqrt(1 + x^2), then the reciprocal. This appears to
 * be argument reduction via the cotangent half-angle identity, matching the
 * factor of 4 applied to y below -- TODO confirm. */
184 for(i = 0; i < 2; i++)
185 x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
186 x = _mm_div_ps(fones, x);
/* One Horner step in x^2 with coefficient (-1)^j / (2j + 1), the arctangent
 * series coefficients. NOTE(review): the enclosing loop over j and the
 * initial value of y are not visible here. */
190 y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
/* Multiply the polynomial by x and by 4. */
192 y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
/* Lanes with z > 1: y <- y + (pi/2 - 2*y) = pi/2 - y. */
193 condition = _mm_cmpgt_ps(z, fones);
195 y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
/* Lanes with aVal < 0: negate arccosine (subtract twice its value).
 * NOTE(review): no assignment to arccosine is visible before this read. */
197 condition = _mm_cmplt_ps(aVal, fzeroes);
198 arccosine = _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
/* Lanes with d < 0: add pi. NOTE(review): d is never assigned in the visible
 * lines; presumably it holds the unmodified input -- confirm. */
199 condition = _mm_cmplt_ps(d, fzeroes);
200 arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
/* Unaligned store of four results. */
202 _mm_storeu_ps(bPtr, arccosine);
/* Scalar tail: elements from quarterPoints * 4 up to num_points. */
207 number = quarterPoints * 4;
208 for(;number < num_points; number++){
209 *bPtr++ = acos(*aPtr++);
215 #ifdef LV_HAVE_GENERIC
/*
 * Portable arccosine kernel: writes acos(aVector[k]) to bVector[k] for each
 * k in [0, num_points).
 *
 * bVector    - output buffer; receives one result per input element.
 * aVector    - input buffer of floats.
 * num_points - number of elements to process.
 *
 * NOTE(review): the return type and the enclosing braces are not visible in
 * this listing, and each line carries a stray leading line number; confirm
 * against the real header.
 */
218 volk_32f_acos_32f_generic(
float* bVector,
const float* aVector,
unsigned int num_points)
220 float* bPtr = bVector;
221 const float* aPtr = aVector;
222 unsigned int number = 0;
/* acos() is the double-precision libm routine: each float input is promoted
 * to double and the result is converted back to float on store. */
224 for(number = 0; number < num_points; number++){
225 *bPtr++ = acos(*aPtr++);
#define ACOS_TERMS
Definition: volk_32f_acos_32f.h:75