#ifndef INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
#define INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H

#include <volk/volk_complex.h>

/* Interval, in loop iterations, at which the running phase is renormalized
 * to unit magnitude so floating-point rounding error cannot accumulate. */
#define ROTATOR_RELOAD 512

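/* A minimal usage sketch (illustrative values, not part of the kernel API):
 * rotate a block of 1024 samples by 0.3 rad per sample with the generic
 * implementation below. lv_cmake() comes from volk_complex.h.
 *
 *     lv_32fc_t in[1024], out[1024];
 *     lv_32fc_t phase = lv_cmake(1.f, 0.f);
 *     const lv_32fc_t phase_inc = lv_cmake(cosf(0.3f), sinf(0.3f));
 *     // ... fill in[] with samples ...
 *     volk_32fc_s32fc_x2_rotator_32fc_generic(out, in, phase_inc, &phase, 1024);
 *     // *phase now carries the rotator state into the next block
 */
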
#ifdef LV_HAVE_GENERIC
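
/*!
  \brief Rotates each input sample by a steadily advancing phase:
  out[n] = in[n] * (*phase) * phase_inc^n. The running phase is divided by
  its magnitude every ROTATOR_RELOAD samples to keep it on the unit circle,
  and the final phase is written back through the phase pointer.
*/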
static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector,
                                                           const lv_32fc_t* inVector,
                                                           const lv_32fc_t phase_inc,
                                                           lv_32fc_t* phase,
                                                           unsigned int num_points)
{
    unsigned int i = 0;
    unsigned int j = 0;
    for(i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
        for(j = 0; j < ROTATOR_RELOAD; ++j) {
            *outVector++ = *inVector++ * (*phase);
            (*phase) *= phase_inc;
        }
        /* Pull the phase back onto the unit circle. */
#ifdef __cplusplus
        (*phase) /= std::abs((*phase));
#else
        (*phase) /= cabsf((*phase));
#endif
    }
    /* Remaining num_points % ROTATOR_RELOAD samples. */
    for(i = 0; i < num_points % ROTATOR_RELOAD; ++i) {
        *outVector++ = *inVector++ * (*phase);
        (*phase) *= phase_inc;
    }
}

#endif /* LV_HAVE_GENERIC */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
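
/*!
  \brief SSE4.1 rotator for 16-byte-aligned in/out buffers. Processes two
  complex samples per iteration: phase_Val carries two consecutive phase
  values that advance by inc_Val = phase_inc^2 each step, and the
  moveldup/movehdup/shuffle/addsub sequence implements the complex multiply.
  The phase register is renormalized every ROTATOR_RELOAD iterations.
*/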
static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector,
                                                            const lv_32fc_t* inVector,
                                                            const lv_32fc_t phase_inc,
                                                            lv_32fc_t* phase,
                                                            unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)};

    unsigned int i, j = 0;

    for(i = 0; i < 2; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }
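
    /* phase_Ptr now holds {phase, phase * phase_inc} and incr has become
       phase_inc^2, so a single vector complex multiply advances both phase
       lanes by two samples at once. */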
    __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm_loadu_ps((float*)phase_Ptr);
    inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));

    const unsigned int halfPoints = num_points / 2;

    for(i = 0; i < (unsigned int)(halfPoints / ROTATOR_RELOAD); i++) {
        for(j = 0; j < ROTATOR_RELOAD; ++j) {

            aVal = _mm_load_ps((float*)aPtr);

            /* Duplicate the real (yl) and imaginary (yh) parts of the phase
               and of the increment into both slots of each complex lane. */
            yl = _mm_moveldup_ps(phase_Val);
            yh = _mm_movehdup_ps(phase_Val);
            ylp = _mm_moveldup_ps(inc_Val);
            yhp = _mm_movehdup_ps(inc_Val);

            tmp1 = _mm_mul_ps(aVal, yl);
            tmp1p = _mm_mul_ps(phase_Val, ylp);

            /* Swap re/im in each lane (0xB1), form the cross products, and
               let addsub combine them into full complex multiplies:
               z = a * phase and phase_Val = phase_Val * inc_Val. */
            aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm_mul_ps(aVal, yh);
            tmp2p = _mm_mul_ps(phase_Val, yhp);

            z = _mm_addsub_ps(tmp1, tmp2);
            phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

            _mm_store_ps((float*)cPtr, z);

            aPtr += 2;
            cPtr += 2;
        }
        /* Renormalize both phase lanes: squared magnitudes via hadd, spread
           them back across the re/im slots (0xD8), then divide by the sqrt. */
        tmp1 = _mm_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm_hadd_ps(tmp1, tmp1);
        tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm_sqrt_ps(tmp1);
        phase_Val = _mm_div_ps(phase_Val, tmp2);
    }

    /* Leftover pairs after the last full ROTATOR_RELOAD block. */
    for(i = 0; i < halfPoints % ROTATOR_RELOAD; ++i) {
        aVal = _mm_load_ps((float*)aPtr);

        yl = _mm_moveldup_ps(phase_Val);
        yh = _mm_movehdup_ps(phase_Val);
        ylp = _mm_moveldup_ps(inc_Val);
        yhp = _mm_movehdup_ps(inc_Val);

        tmp1 = _mm_mul_ps(aVal, yl);
        tmp1p = _mm_mul_ps(phase_Val, ylp);

        aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm_mul_ps(aVal, yh);
        tmp2p = _mm_mul_ps(phase_Val, yhp);

        z = _mm_addsub_ps(tmp1, tmp2);
        phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

        _mm_store_ps((float*)cPtr, z);

        aPtr += 2;
        cPtr += 2;
    }

    /* Scalar tail for an odd sample count, then hand the phase back. */
    _mm_storeu_ps((float*)phase_Ptr, phase_Val);
    for(i = 0; i < num_points % 2; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];
}

#endif /* LV_HAVE_SSE4_1 */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
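
/*!
  \brief Unaligned-buffer variant of the SSE4.1 rotator above; identical
  except that samples are accessed with _mm_loadu_ps/_mm_storeu_ps.
*/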
static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector,
                                                            const lv_32fc_t* inVector,
                                                            const lv_32fc_t phase_inc,
                                                            lv_32fc_t* phase,
                                                            unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)};

    unsigned int i, j = 0;

    for(i = 0; i < 2; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm_loadu_ps((float*)phase_Ptr);
    inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));

    const unsigned int halfPoints = num_points / 2;

    for(i = 0; i < (unsigned int)(halfPoints / ROTATOR_RELOAD); i++) {
        for(j = 0; j < ROTATOR_RELOAD; ++j) {

            aVal = _mm_loadu_ps((float*)aPtr);

            yl = _mm_moveldup_ps(phase_Val);
            yh = _mm_movehdup_ps(phase_Val);
            ylp = _mm_moveldup_ps(inc_Val);
            yhp = _mm_movehdup_ps(inc_Val);

            tmp1 = _mm_mul_ps(aVal, yl);
            tmp1p = _mm_mul_ps(phase_Val, ylp);

            aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm_mul_ps(aVal, yh);
            tmp2p = _mm_mul_ps(phase_Val, yhp);

            z = _mm_addsub_ps(tmp1, tmp2);
            phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

            _mm_storeu_ps((float*)cPtr, z);

            aPtr += 2;
            cPtr += 2;
        }
        /* Periodic renormalization; see the aligned version for details. */
        tmp1 = _mm_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm_hadd_ps(tmp1, tmp1);
        tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm_sqrt_ps(tmp1);
        phase_Val = _mm_div_ps(phase_Val, tmp2);
    }

    for(i = 0; i < halfPoints % ROTATOR_RELOAD; ++i) {
        aVal = _mm_loadu_ps((float*)aPtr);

        yl = _mm_moveldup_ps(phase_Val);
        yh = _mm_movehdup_ps(phase_Val);
        ylp = _mm_moveldup_ps(inc_Val);
        yhp = _mm_movehdup_ps(inc_Val);

        tmp1 = _mm_mul_ps(aVal, yl);
        tmp1p = _mm_mul_ps(phase_Val, ylp);

        aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm_mul_ps(aVal, yh);
        tmp2p = _mm_mul_ps(phase_Val, yhp);

        z = _mm_addsub_ps(tmp1, tmp2);
        phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

        _mm_storeu_ps((float*)cPtr, z);

        aPtr += 2;
        cPtr += 2;
    }

    _mm_storeu_ps((float*)phase_Ptr, phase_Val);
    for(i = 0; i < num_points % 2; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];
}

#endif /* LV_HAVE_SSE4_1 */

#ifdef LV_HAVE_AVX
#include <immintrin.h>
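
/*!
  \brief AVX rotator for 32-byte-aligned buffers; same scheme as the SSE4.1
  kernels, widened to four complex samples per 256-bit register.
*/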
static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector,
                                                         const lv_32fc_t* inVector,
                                                         const lv_32fc_t phase_inc,
                                                         lv_32fc_t* phase,
                                                         unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};

    unsigned int i, j = 0;

    for(i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }
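
    /* As in the SSE4.1 setup but with four lanes: phase_Ptr holds
       {phase, phase*inc, phase*inc^2, phase*inc^3} and incr has become
       phase_inc^4, the per-iteration increment of the whole vector. */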
    __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
    inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),
                            lv_cimag(incr), lv_creal(incr),
                            lv_cimag(incr), lv_creal(incr),
                            lv_cimag(incr), lv_creal(incr));

    const unsigned int fourthPoints = num_points / 4;

    for(i = 0; i < (unsigned int)(fourthPoints / ROTATOR_RELOAD); i++) {
        for(j = 0; j < ROTATOR_RELOAD; ++j) {

            aVal = _mm256_load_ps((float*)aPtr);

            yl = _mm256_moveldup_ps(phase_Val);
            yh = _mm256_movehdup_ps(phase_Val);
            ylp = _mm256_moveldup_ps(inc_Val);
            yhp = _mm256_movehdup_ps(inc_Val);

            tmp1 = _mm256_mul_ps(aVal, yl);
            tmp1p = _mm256_mul_ps(phase_Val, ylp);

            aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm256_mul_ps(aVal, yh);
            tmp2p = _mm256_mul_ps(phase_Val, yhp);

            z = _mm256_addsub_ps(tmp1, tmp2);
            phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);

            _mm256_store_ps((float*)cPtr, z);

            aPtr += 4;
            cPtr += 4;
        }
        /* Per-128-bit-lane renormalization, as in the SSE4.1 kernels. */
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp2);
    }

    for(i = 0; i < fourthPoints % ROTATOR_RELOAD; ++i) {
        aVal = _mm256_load_ps((float*)aPtr);

        yl = _mm256_moveldup_ps(phase_Val);
        yh = _mm256_movehdup_ps(phase_Val);
        ylp = _mm256_moveldup_ps(inc_Val);
        yhp = _mm256_movehdup_ps(inc_Val);

        tmp1 = _mm256_mul_ps(aVal, yl);
        tmp1p = _mm256_mul_ps(phase_Val, ylp);

        aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm256_mul_ps(aVal, yh);
        tmp2p = _mm256_mul_ps(phase_Val, yhp);

        z = _mm256_addsub_ps(tmp1, tmp2);
        phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);

        _mm256_store_ps((float*)cPtr, z);

        aPtr += 4;
        cPtr += 4;
    }

    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    for(i = 0; i < num_points % 4; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];
}

#endif /* LV_HAVE_AVX */

#ifdef LV_HAVE_AVX
#include <immintrin.h>
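
/*!
  \brief Unaligned-buffer variant of the AVX rotator above; identical except
  that samples are accessed with _mm256_loadu_ps/_mm256_storeu_ps.
*/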
static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector,
                                                         const lv_32fc_t* inVector,
                                                         const lv_32fc_t phase_inc,
                                                         lv_32fc_t* phase,
                                                         unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};

    unsigned int i, j = 0;

    for(i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
    inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),
                            lv_cimag(incr), lv_creal(incr),
                            lv_cimag(incr), lv_creal(incr),
                            lv_cimag(incr), lv_creal(incr));

    const unsigned int fourthPoints = num_points / 4;

    for(i = 0; i < (unsigned int)(fourthPoints / ROTATOR_RELOAD); i++) {
        for(j = 0; j < ROTATOR_RELOAD; ++j) {

            aVal = _mm256_loadu_ps((float*)aPtr);

            yl = _mm256_moveldup_ps(phase_Val);
            yh = _mm256_movehdup_ps(phase_Val);
            ylp = _mm256_moveldup_ps(inc_Val);
            yhp = _mm256_movehdup_ps(inc_Val);

            tmp1 = _mm256_mul_ps(aVal, yl);
            tmp1p = _mm256_mul_ps(phase_Val, ylp);

            aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm256_mul_ps(aVal, yh);
            tmp2p = _mm256_mul_ps(phase_Val, yhp);

            z = _mm256_addsub_ps(tmp1, tmp2);
            phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);

            _mm256_storeu_ps((float*)cPtr, z);

            aPtr += 4;
            cPtr += 4;
        }
        /* Periodic renormalization; see the SSE4.1 kernels for details. */
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp2);
    }

    for(i = 0; i < fourthPoints % ROTATOR_RELOAD; ++i) {
        aVal = _mm256_loadu_ps((float*)aPtr);

        yl = _mm256_moveldup_ps(phase_Val);
        yh = _mm256_movehdup_ps(phase_Val);
        ylp = _mm256_moveldup_ps(inc_Val);
        yhp = _mm256_movehdup_ps(inc_Val);

        tmp1 = _mm256_mul_ps(aVal, yl);
        tmp1p = _mm256_mul_ps(phase_Val, ylp);

        aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm256_mul_ps(aVal, yh);
        tmp2p = _mm256_mul_ps(phase_Val, yhp);

        z = _mm256_addsub_ps(tmp1, tmp2);
        phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);

        _mm256_storeu_ps((float*)cPtr, z);

        aPtr += 4;
        cPtr += 4;
    }

    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    for(i = 0; i < num_points % 4; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];
}

#endif /* LV_HAVE_AVX */

#endif /* INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H */