77 #ifndef INCLUDED_volk_32f_asin_32f_a_H
78 #define INCLUDED_volk_32f_asin_32f_a_H
81 #include <smmintrin.h>
84 volk_32f_asin_32f_a_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
86 float* bPtr = bVector;
87 const float* aPtr = aVector;
89 unsigned int number = 0;
90 unsigned int quarterPoints = num_points / 4;
93 __m128 aVal, pio2, x, y, z, arcsine;
94 __m128 fzeroes, fones, ftwos, ffours, condition;
96 pio2 = _mm_set1_ps(3.14159265358979323846/2);
97 fzeroes = _mm_setzero_ps();
98 fones = _mm_set1_ps(1.0);
99 ftwos = _mm_set1_ps(2.0);
100 ffours = _mm_set1_ps(4.0);
102 for(;number < quarterPoints; number++){
103 aVal = _mm_load_ps(aPtr);
104 aVal = _mm_div_ps(aVal, _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
106 condition = _mm_cmplt_ps(z, fzeroes);
107 z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
109 condition = _mm_cmplt_ps(z, fones);
110 x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
112 for(i = 0; i < 2; i++){
113 x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
115 x = _mm_div_ps(fones, x);
118 y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
121 y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
122 condition = _mm_cmpgt_ps(z, fones);
124 y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
126 condition = _mm_cmplt_ps(aVal, fzeroes);
127 arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));
129 _mm_store_ps(bPtr, arcsine);
134 number = quarterPoints * 4;
135 for(;number < num_points; number++){
136 *bPtr++ = asin(*aPtr++);
144 #ifndef INCLUDED_volk_32f_asin_32f_u_H
145 #define INCLUDED_volk_32f_asin_32f_u_H
147 #ifdef LV_HAVE_SSE4_1
148 #include <smmintrin.h>
151 volk_32f_asin_32f_u_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
153 float* bPtr = bVector;
154 const float* aPtr = aVector;
156 unsigned int number = 0;
157 unsigned int quarterPoints = num_points / 4;
160 __m128 aVal, pio2, x, y, z, arcsine;
161 __m128 fzeroes, fones, ftwos, ffours, condition;
163 pio2 = _mm_set1_ps(3.14159265358979323846/2);
164 fzeroes = _mm_setzero_ps();
165 fones = _mm_set1_ps(1.0);
166 ftwos = _mm_set1_ps(2.0);
167 ffours = _mm_set1_ps(4.0);
169 for(;number < quarterPoints; number++){
170 aVal = _mm_loadu_ps(aPtr);
171 aVal = _mm_div_ps(aVal, _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
173 condition = _mm_cmplt_ps(z, fzeroes);
174 z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
176 condition = _mm_cmplt_ps(z, fones);
177 x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
179 for(i = 0; i < 2; i++){
180 x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
182 x = _mm_div_ps(fones, x);
185 y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
188 y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
189 condition = _mm_cmpgt_ps(z, fones);
191 y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
193 condition = _mm_cmplt_ps(aVal, fzeroes);
194 arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));
196 _mm_storeu_ps(bPtr, arcsine);
201 number = quarterPoints * 4;
202 for(;number < num_points; number++){
203 *bPtr++ = asin(*aPtr++);
209 #ifdef LV_HAVE_GENERIC
212 volk_32f_asin_32f_u_generic(
float* bVector,
const float* aVector,
unsigned int num_points)
214 float* bPtr = bVector;
215 const float* aPtr = aVector;
216 unsigned int number = 0;
218 for(number = 0; number < num_points; number++){
219 *bPtr++ = asin(*aPtr++);
/* ASIN_TERMS (the number of arctangent series terms summed by the SSE
 * kernels) is defined at volk_32f_asin_32f.h:75. */