78 #ifndef INCLUDED_volk_32f_tan_32f_a_H
79 #define INCLUDED_volk_32f_tan_32f_a_H
82 #include <smmintrin.h>
85 volk_32f_tan_32f_a_sse4_1(
float* bVector,
const float* aVector,
86 unsigned int num_points)
88 float* bPtr = bVector;
89 const float* aPtr = aVector;
91 unsigned int number = 0;
92 unsigned int quarterPoints = num_points / 4;
95 __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
96 __m128 sine, cosine, tangent, condition1, condition2, condition3;
97 __m128i q, r, ones, twos, fours;
99 m4pi = _mm_set1_ps(1.273239545);
100 pio4A = _mm_set1_ps(0.78515625);
101 pio4B = _mm_set1_ps(0.241876e-3);
102 ffours = _mm_set1_ps(4.0);
103 ftwos = _mm_set1_ps(2.0);
104 fones = _mm_set1_ps(1.0);
105 fzeroes = _mm_setzero_ps();
106 ones = _mm_set1_epi32(1);
107 twos = _mm_set1_epi32(2);
108 fours = _mm_set1_epi32(4);
110 cp1 = _mm_set1_ps(1.0);
111 cp2 = _mm_set1_ps(0.83333333e-1);
112 cp3 = _mm_set1_ps(0.2777778e-2);
113 cp4 = _mm_set1_ps(0.49603e-4);
114 cp5 = _mm_set1_ps(0.551e-6);
116 for(;number < quarterPoints; number++){
117 aVal = _mm_load_ps(aPtr);
118 s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
119 q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi));
120 r = _mm_add_epi32(q, _mm_and_si128(q, ones));
122 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
123 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
125 s = _mm_div_ps(s, _mm_set1_ps(8.0));
126 s = _mm_mul_ps(s, s);
128 s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
130 for(i = 0; i < 3; i++){
131 s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
133 s = _mm_div_ps(s, ftwos);
135 sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
136 cosine = _mm_sub_ps(fones, s);
138 condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
139 condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes));
140 condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
142 __m128 temp = cosine;
143 cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
144 sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
145 sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
146 cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
147 tangent = _mm_div_ps(sine, cosine);
148 _mm_store_ps(bPtr, tangent);
153 number = quarterPoints * 4;
154 for(;number < num_points; number++){
155 *bPtr++ = tan(*aPtr++);
164 #ifndef INCLUDED_volk_32f_tan_32f_u_H
165 #define INCLUDED_volk_32f_tan_32f_u_H
167 #ifdef LV_HAVE_SSE4_1
168 #include <smmintrin.h>
171 volk_32f_tan_32f_u_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
173 float* bPtr = bVector;
174 const float* aPtr = aVector;
176 unsigned int number = 0;
177 unsigned int quarterPoints = num_points / 4;
180 __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
181 __m128 sine, cosine, tangent, condition1, condition2, condition3;
182 __m128i q, r, ones, twos, fours;
184 m4pi = _mm_set1_ps(1.273239545);
185 pio4A = _mm_set1_ps(0.78515625);
186 pio4B = _mm_set1_ps(0.241876e-3);
187 ffours = _mm_set1_ps(4.0);
188 ftwos = _mm_set1_ps(2.0);
189 fones = _mm_set1_ps(1.0);
190 fzeroes = _mm_setzero_ps();
191 ones = _mm_set1_epi32(1);
192 twos = _mm_set1_epi32(2);
193 fours = _mm_set1_epi32(4);
195 cp1 = _mm_set1_ps(1.0);
196 cp2 = _mm_set1_ps(0.83333333e-1);
197 cp3 = _mm_set1_ps(0.2777778e-2);
198 cp4 = _mm_set1_ps(0.49603e-4);
199 cp5 = _mm_set1_ps(0.551e-6);
201 for(;number < quarterPoints; number++){
202 aVal = _mm_loadu_ps(aPtr);
203 s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
204 q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi));
205 r = _mm_add_epi32(q, _mm_and_si128(q, ones));
207 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
208 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
210 s = _mm_div_ps(s, _mm_set1_ps(8.0));
211 s = _mm_mul_ps(s, s);
213 s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
215 for(i = 0; i < 3; i++){
216 s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
218 s = _mm_div_ps(s, ftwos);
220 sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
221 cosine = _mm_sub_ps(fones, s);
223 condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
224 condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes));
225 condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
227 __m128 temp = cosine;
228 cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
229 sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
230 sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
231 cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
232 tangent = _mm_div_ps(sine, cosine);
233 _mm_storeu_ps(bPtr, tangent);
238 number = quarterPoints * 4;
239 for(;number < num_points; number++){
240 *bPtr++ = tan(*aPtr++);
247 #ifdef LV_HAVE_GENERIC
250 volk_32f_tan_32f_generic(
float* bVector,
const float* aVector,
251 unsigned int num_points)
253 float* bPtr = bVector;
254 const float* aPtr = aVector;
255 unsigned int number = 0;
257 for(; number < num_points; number++){
258 *bPtr++ = tan(*aPtr++);