Vector Optimized Library of Kernels 3.1.2
Architecture-tuned implementations of math kernels
volk_32fc_s32fc_x2_rotator2_32fc.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2013, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_32fc_s32fc_rotator2_32fc_a_H
#define INCLUDED_volk_32fc_s32fc_rotator2_32fc_a_H


#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <volk/volk_complex.h>
#define ROTATOR_RELOAD 512
#define ROTATOR_RELOAD_2 (ROTATOR_RELOAD / 2)
#define ROTATOR_RELOAD_4 (ROTATOR_RELOAD / 4)

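Every kernel in this file follows the same recurrence: multiply each input sample by a running phase, advance the phase by phase_inc, and pull the phase back onto the unit circle every ROTATOR_RELOAD samples so that single-precision rounding cannot make its magnitude drift. ROTATOR_RELOAD_2 and ROTATOR_RELOAD_4 express the same interval in 2-sample (SSE) and 4-sample (NEON/AVX) vector iterations. The scalar model below is for orientation only; rotator_model is an illustrative name, not part of the VOLK API.

/* Scalar model of the rotator recurrence implemented by every kernel below. */
static inline void rotator_model(lv_32fc_t* out,
                                 const lv_32fc_t* in,
                                 lv_32fc_t phase_inc,
                                 lv_32fc_t* phase,
                                 unsigned int num_points)
{
    for (unsigned int n = 0; n < num_points; ++n) {
        out[n] = in[n] * (*phase);
        (*phase) *= phase_inc;
        if ((n + 1) % ROTATOR_RELOAD == 0) // keep |phase| == 1 despite rounding
            (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
    }
}
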
#ifdef LV_HAVE_GENERIC

static inline void volk_32fc_s32fc_x2_rotator2_32fc_generic(lv_32fc_t* outVector,
                                                            const lv_32fc_t* inVector,
                                                            const lv_32fc_t* phase_inc,
                                                            lv_32fc_t* phase,
                                                            unsigned int num_points)
{
    unsigned int i = 0;
    int j = 0;
    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
        for (j = 0; j < ROTATOR_RELOAD; ++j) {
            *outVector++ = *inVector++ * (*phase);
            (*phase) *= *phase_inc;
        }

        (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
    }
    for (i = 0; i < num_points % ROTATOR_RELOAD; ++i) {
        *outVector++ = *inVector++ * (*phase);
        (*phase) *= *phase_inc;
    }
    if (i) {
        // Make sure we normalize phase on every call!
        (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
    }
}

#endif /* LV_HAVE_GENERIC */
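
A minimal calling sketch for the generic kernel above. It assumes this header is available on the include path as <volk/volk_32fc_s32fc_x2_rotator2_32fc.h>; rotate_block and its parameters are illustrative names, not VOLK API.

#include <math.h>
#include <volk/volk_32fc_s32fc_x2_rotator2_32fc.h>

/* Rotate n samples by w radians per sample. *phase carries the state between
 * calls and should start out as lv_cmake(1.0f, 0.0f). */
static void rotate_block(lv_32fc_t* out,
                         const lv_32fc_t* in,
                         unsigned int n,
                         float w,
                         lv_32fc_t* phase)
{
    lv_32fc_t phase_inc = lv_cmake(cosf(w), sinf(w)); // unit-magnitude step e^{jw}
    volk_32fc_s32fc_x2_rotator2_32fc_generic(out, in, &phase_inc, phase, n);
}

Because the kernel renormalizes *phase internally, the same pointer can be reused for arbitrarily long streams without the phase magnitude drifting.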


#ifdef LV_HAVE_NEON
#include <arm_neon.h>
#include <volk/volk_neon_intrinsics.h>

static inline void volk_32fc_s32fc_x2_rotator2_32fc_neon(lv_32fc_t* outVector,
                                                         const lv_32fc_t* inVector,
                                                         const lv_32fc_t* phase_inc,
                                                         lv_32fc_t* phase,
                                                         unsigned int num_points)

{
    lv_32fc_t* outputVectorPtr = outVector;
    const lv_32fc_t* inputVectorPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phasePtr[4] = { (*phase), (*phase), (*phase), (*phase) };
    float32x4x2_t input_vec;
    float32x4x2_t output_vec;

    unsigned int i = 0, j = 0;
    // const unsigned int quarter_points = num_points / 4;

    for (i = 0; i < 4; ++i) {
        phasePtr[i] *= incr;
        incr *= (*phase_inc);
    }

    // Notice that incr now equals phase_inc^4, advanced in the previous loop
    const lv_32fc_t incrPtr[4] = { incr, incr, incr, incr };
    const float32x4x2_t incr_vec = vld2q_f32((float*)incrPtr);
    float32x4x2_t phase_vec = vld2q_f32((float*)phasePtr);

    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
        for (j = 0; j < ROTATOR_RELOAD_4; j++) {
            input_vec = vld2q_f32((float*)inputVectorPtr);
            // Prefetch next one, speeds things up
            __VOLK_PREFETCH(inputVectorPtr + 4);
            // Rotate
            output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
            // Increase phase
            phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
            // Store output
            vst2q_f32((float*)outputVectorPtr, output_vec);

            outputVectorPtr += 4;
            inputVectorPtr += 4;
        }
        // Normalize phase so magnitude doesn't grow because of
        // floating point rounding error
        const float32x4_t mag_squared = _vmagnitudesquaredq_f32(phase_vec);
        const float32x4_t inv_mag = _vinvsqrtq_f32(mag_squared);
        // Multiply complex with real
        phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
        phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);
    }

    for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; i++) {
        input_vec = vld2q_f32((float*)inputVectorPtr);
        // Prefetch next one, speeds things up
        __VOLK_PREFETCH(inputVectorPtr + 4);
        // Rotate
        output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
        // Increase phase
        phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
        // Store output
        vst2q_f32((float*)outputVectorPtr, output_vec);

        outputVectorPtr += 4;
        inputVectorPtr += 4;
    }
    // if (i) is nonzero, we looped above
    if (i) {
        // Normalize phase so magnitude doesn't grow because of
        // floating point rounding error
        const float32x4_t mag_squared = _vmagnitudesquaredq_f32(phase_vec);
        const float32x4_t inv_mag = _vinvsqrtq_f32(mag_squared);
        // Multiply complex with real
        phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
        phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);
    }
    // Store current phase
    vst2q_f32((float*)phasePtr, phase_vec);

    // Deal with the rest
    for (i = 0; i < num_points % 4; i++) {
        *outputVectorPtr++ = *inputVectorPtr++ * phasePtr[0];
        phasePtr[0] *= (*phase_inc);
    }

    // For continuous phase next time we need to call this function
    (*phase) = phasePtr[0];
}

#endif /* LV_HAVE_NEON */
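
For readers new to the NEON layout: vld2q_f32 de-interleaves four complex samples so that val[0] holds the real parts and val[1] the imaginary parts, and the kernel keeps four phase values spaced one sample apart so that all lanes can advance together by phase_inc^4. A scalar sketch of that lane bookkeeping follows (four_lane_phases is illustrative, not part of the kernel):

/* lane[k] = phase * phase_inc^k and lane_step = phase_inc^4, mirroring how
 * phasePtr[] and incrPtr[] are filled in the NEON kernel above. */
static void four_lane_phases(lv_32fc_t phase,
                             lv_32fc_t phase_inc,
                             lv_32fc_t lane[4],
                             lv_32fc_t* lane_step)
{
    lv_32fc_t incr = 1;
    for (int k = 0; k < 4; ++k) {
        lane[k] = phase * incr;
        incr *= phase_inc;
    }
    *lane_step = incr; // advancing every lane by incr moves the block 4 samples ahead
}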


#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void volk_32fc_s32fc_x2_rotator2_32fc_a_sse4_1(lv_32fc_t* outVector,
                                                             const lv_32fc_t* inVector,
                                                             const lv_32fc_t* phase_inc,
                                                             lv_32fc_t* phase,
                                                             unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) };

    unsigned int i, j = 0;

    // Seed the two lanes with phase and phase * phase_inc; incr ends up as phase_inc^2
    for (i = 0; i < 2; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (*phase_inc);
    }

    __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm_loadu_ps((float*)phase_Ptr);
    inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));

    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
        for (j = 0; j < ROTATOR_RELOAD_2; ++j) {

            aVal = _mm_load_ps((float*)aPtr);

            yl = _mm_moveldup_ps(phase_Val);
            yh = _mm_movehdup_ps(phase_Val);
            ylp = _mm_moveldup_ps(inc_Val);
            yhp = _mm_movehdup_ps(inc_Val);

            tmp1 = _mm_mul_ps(aVal, yl);
            tmp1p = _mm_mul_ps(phase_Val, ylp);

            aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm_mul_ps(aVal, yh);
            tmp2p = _mm_mul_ps(phase_Val, yhp);

            // Complex multiply: rotate the samples and advance the phase lanes
            z = _mm_addsub_ps(tmp1, tmp2);
            phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

            _mm_store_ps((float*)cPtr, z);

            aPtr += 2;
            cPtr += 2;
        }
        // Renormalize the phase lanes so rounding error doesn't change their magnitude
        tmp1 = _mm_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm_hadd_ps(tmp1, tmp1);
        tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm_sqrt_ps(tmp1);
        phase_Val = _mm_div_ps(phase_Val, tmp2);
    }
    for (i = 0; i < (num_points % ROTATOR_RELOAD) / 2; ++i) {
        aVal = _mm_load_ps((float*)aPtr);

        yl = _mm_moveldup_ps(phase_Val);
        yh = _mm_movehdup_ps(phase_Val);
        ylp = _mm_moveldup_ps(inc_Val);
        yhp = _mm_movehdup_ps(inc_Val);

        tmp1 = _mm_mul_ps(aVal, yl);

        tmp1p = _mm_mul_ps(phase_Val, ylp);

        aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm_mul_ps(aVal, yh);
        tmp2p = _mm_mul_ps(phase_Val, yhp);

        z = _mm_addsub_ps(tmp1, tmp2);
        phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

        _mm_store_ps((float*)cPtr, z);

        aPtr += 2;
        cPtr += 2;
    }
    if (i) {
        tmp1 = _mm_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm_hadd_ps(tmp1, tmp1);
        tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm_sqrt_ps(tmp1);
        phase_Val = _mm_div_ps(phase_Val, tmp2);
    }

    _mm_storeu_ps((float*)phase_Ptr, phase_Val);
    // Handle an odd final sample, then hand the updated phase back to the caller
    if (num_points & 1) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (*phase_inc);
    }

    (*phase) = phase_Ptr[0];
}

#endif /* LV_HAVE_SSE4_1 for aligned */
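
The moveldup/movehdup/shuffle/addsub sequence used by both SSE4.1 kernels is the usual SSE3-style complex multiply. For a sample a + jb and a phase c + jd, tmp1 = (ac, bc) comes from the duplicated real parts and tmp2 = (bd, ad) from the swapped sample times the duplicated imaginary parts, so _mm_addsub_ps produces

    (a + jb)(c + jd) = (ac - bd) + j(bc + ad),

which is the rotated output z. The same trick applied to phase_Val and inc_Val advances both phase lanes by phase_inc^2 per iteration.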


#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void volk_32fc_s32fc_x2_rotator2_32fc_u_sse4_1(lv_32fc_t* outVector,
                                                             const lv_32fc_t* inVector,
                                                             const lv_32fc_t* phase_inc,
                                                             lv_32fc_t* phase,
                                                             unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 2; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (*phase_inc);
    }

    /*printf("%f, %f\n", lv_creal(phase_Ptr[0]), lv_cimag(phase_Ptr[0]));
    printf("%f, %f\n", lv_creal(phase_Ptr[1]), lv_cimag(phase_Ptr[1]));
    printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/
    __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm_loadu_ps((float*)phase_Ptr);
    inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));

    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
        for (j = 0; j < ROTATOR_RELOAD_2; ++j) {

            aVal = _mm_loadu_ps((float*)aPtr);

            yl = _mm_moveldup_ps(phase_Val);
            yh = _mm_movehdup_ps(phase_Val);
            ylp = _mm_moveldup_ps(inc_Val);
            yhp = _mm_movehdup_ps(inc_Val);

            tmp1 = _mm_mul_ps(aVal, yl);
            tmp1p = _mm_mul_ps(phase_Val, ylp);

            aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm_mul_ps(aVal, yh);
            tmp2p = _mm_mul_ps(phase_Val, yhp);

            z = _mm_addsub_ps(tmp1, tmp2);
            phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

            _mm_storeu_ps((float*)cPtr, z);

            aPtr += 2;
            cPtr += 2;
        }
        tmp1 = _mm_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm_hadd_ps(tmp1, tmp1);
        tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm_sqrt_ps(tmp1);
        phase_Val = _mm_div_ps(phase_Val, tmp2);
    }
    for (i = 0; i < (num_points % ROTATOR_RELOAD) / 2; ++i) {
        aVal = _mm_loadu_ps((float*)aPtr);

        yl = _mm_moveldup_ps(phase_Val);
        yh = _mm_movehdup_ps(phase_Val);
        ylp = _mm_moveldup_ps(inc_Val);
        yhp = _mm_movehdup_ps(inc_Val);

        tmp1 = _mm_mul_ps(aVal, yl);

        tmp1p = _mm_mul_ps(phase_Val, ylp);

        aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm_mul_ps(aVal, yh);
        tmp2p = _mm_mul_ps(phase_Val, yhp);

        z = _mm_addsub_ps(tmp1, tmp2);
        phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

        _mm_storeu_ps((float*)cPtr, z);

        aPtr += 2;
        cPtr += 2;
    }
    if (i) {
        tmp1 = _mm_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm_hadd_ps(tmp1, tmp1);
        tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm_sqrt_ps(tmp1);
        phase_Val = _mm_div_ps(phase_Val, tmp2);
    }

    _mm_storeu_ps((float*)phase_Ptr, phase_Val);
    if (num_points & 1) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (*phase_inc);
    }

    (*phase) = phase_Ptr[0];
}

#endif /* LV_HAVE_SSE4_1 */
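
The _a_ variants in this file use aligned SIMD loads and stores (_mm_load_ps, _mm256_load_ps) and therefore require inVector and outVector to be aligned to the machine's SIMD width, while the _u_ variants accept arbitrary pointers at a small speed cost. A minimal allocation sketch, assuming the standard VOLK helpers volk_get_alignment(), volk_malloc() and volk_free():

#include <volk/volk.h>
#include <volk/volk_malloc.h>

lv_32fc_t* buf = (lv_32fc_t*)volk_malloc(4096 * sizeof(lv_32fc_t), volk_get_alignment());
/* ... run the _a_ kernels on buf ... */
volk_free(buf);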


#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void volk_32fc_s32fc_x2_rotator2_32fc_a_avx(lv_32fc_t* outVector,
                                                          const lv_32fc_t* inVector,
                                                          const lv_32fc_t* phase_inc,
                                                          lv_32fc_t* phase,
                                                          unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = lv_cmake(1.0f, 0.0f);
    lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (*phase_inc);
    }

    __m256 aVal, phase_Val, z;

    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);

    const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr));

    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
        for (j = 0; j < ROTATOR_RELOAD_4; ++j) {

            aVal = _mm256_load_ps((float*)aPtr);

            z = _mm256_complexmul_ps(aVal, phase_Val);
            phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);

            _mm256_store_ps((float*)cPtr, z);

            aPtr += 4;
            cPtr += 4;
        }
        phase_Val = _mm256_normalize_ps(phase_Val);
    }

    for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
        aVal = _mm256_load_ps((float*)aPtr);

        z = _mm256_complexmul_ps(aVal, phase_Val);
        phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);

        _mm256_store_ps((float*)cPtr, z);

        aPtr += 4;
        cPtr += 4;
    }
    if (i) {
        phase_Val = _mm256_normalize_ps(phase_Val);
    }

    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    (*phase) = phase_Ptr[0];
    // The remaining (num_points % 4) samples are rotated by the generic kernel,
    // which also keeps *phase up to date
    volk_32fc_s32fc_x2_rotator2_32fc_generic(
        cPtr, aPtr, phase_inc, phase, num_points % 4);
}

#endif /* LV_HAVE_AVX for aligned */
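
The two AVX kernels lean on helpers from volk_avx_intrinsics.h: _mm256_complexmul_ps multiplies four complex pairs at once, and _mm256_normalize_ps rescales each complex lane back to unit magnitude. Per lane that normalization is, roughly, the same operation the generic kernel performs with hypotf; normalize_one below is an illustrative scalar stand-in, not VOLK API.

static inline lv_32fc_t normalize_one(lv_32fc_t p)
{
    return p / hypotf(lv_creal(p), lv_cimag(p)); // same rescaling as the generic kernel
}

Note also that the AVX kernels hand the last (num_points % 4) samples to the generic kernel, which rotates them and leaves *phase ready for the next call.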


#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void volk_32fc_s32fc_x2_rotator2_32fc_u_avx(lv_32fc_t* outVector,
                                                          const lv_32fc_t* inVector,
                                                          const lv_32fc_t* phase_inc,
                                                          lv_32fc_t* phase,
                                                          unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = lv_cmake(1.0f, 0.0f);
    lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (*phase_inc);
    }

    __m256 aVal, phase_Val, z;

    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);

    const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr));

    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
        for (j = 0; j < ROTATOR_RELOAD_4; ++j) {

            aVal = _mm256_loadu_ps((float*)aPtr);

            z = _mm256_complexmul_ps(aVal, phase_Val);
            phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);

            _mm256_storeu_ps((float*)cPtr, z);

            aPtr += 4;
            cPtr += 4;
        }
        phase_Val = _mm256_normalize_ps(phase_Val);
    }

    for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
        aVal = _mm256_loadu_ps((float*)aPtr);

        z = _mm256_complexmul_ps(aVal, phase_Val);
        phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);

        _mm256_storeu_ps((float*)cPtr, z);

        aPtr += 4;
        cPtr += 4;
    }
    if (i) {
        phase_Val = _mm256_normalize_ps(phase_Val);
    }

    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    (*phase) = phase_Ptr[0];
    // Let the generic kernel rotate the remaining (num_points % 4) samples
    volk_32fc_s32fc_x2_rotator2_32fc_generic(
        cPtr, aPtr, phase_inc, phase, num_points % 4);
}

#endif /* LV_HAVE_AVX */

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32fc_s32fc_x2_rotator2_32fc_a_avx_fma(lv_32fc_t* outVector,
                                                              const lv_32fc_t* inVector,
                                                              const lv_32fc_t* phase_inc,
                                                              lv_32fc_t* phase,
                                                              unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    __VOLK_ATTR_ALIGNED(32)
    lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (*phase_inc);
    }

    __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm256_load_ps((float*)phase_Ptr);
    inc_Val = _mm256_set_ps(lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr));

    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
        for (j = 0; j < ROTATOR_RELOAD_4; ++j) {

            aVal = _mm256_load_ps((float*)aPtr);

            yl = _mm256_moveldup_ps(phase_Val);
            yh = _mm256_movehdup_ps(phase_Val);
            ylp = _mm256_moveldup_ps(inc_Val);
            yhp = _mm256_movehdup_ps(inc_Val);

            tmp1 = aVal;
            tmp1p = phase_Val;

            aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm256_mul_ps(aVal, yh);
            tmp2p = _mm256_mul_ps(phase_Val, yhp);

            z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
            phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

            _mm256_store_ps((float*)cPtr, z);

            aPtr += 4;
            cPtr += 4;
        }
        // Renormalize the phase lanes after each ROTATOR_RELOAD block
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp2);
    }
    for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
        aVal = _mm256_load_ps((float*)aPtr);

        yl = _mm256_moveldup_ps(phase_Val);
        yh = _mm256_movehdup_ps(phase_Val);
        ylp = _mm256_moveldup_ps(inc_Val);
        yhp = _mm256_movehdup_ps(inc_Val);

        tmp1 = aVal;
        tmp1p = phase_Val;

        aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm256_mul_ps(aVal, yh);
        tmp2p = _mm256_mul_ps(phase_Val, yhp);

        z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
        phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

        _mm256_store_ps((float*)cPtr, z);

        aPtr += 4;
        cPtr += 4;
    }
    if (i) {
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp2);
    }

    _mm256_store_ps((float*)phase_Ptr, phase_Val);
    for (i = 0; i < num_points % 4; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (*phase_inc);
    }

    (*phase) = phase_Ptr[0];
}

#endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned */
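
The only difference from the plain SSE/AVX paths is that the FMA kernels fold the first multiply into the alternating add/subtract: _mm256_fmaddsub_ps(a, b, c) computes a*b - c in the even (real) lanes and a*b + c in the odd (imaginary) lanes, so the separate multiply and addsub steps collapse into one fused instruction per operand:

/* even lanes: z = tmp1 * yl - tmp2  ->  real part a*c - b*d
 * odd  lanes: z = tmp1 * yl + tmp2  ->  imag part b*c + a*d */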

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32fc_s32fc_x2_rotator2_32fc_u_avx_fma(lv_32fc_t* outVector,
                                                              const lv_32fc_t* inVector,
                                                              const lv_32fc_t* phase_inc,
                                                              lv_32fc_t* phase,
                                                              unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (*phase_inc);
    }

    __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
    inc_Val = _mm256_set_ps(lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr));

    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
        for (j = 0; j < ROTATOR_RELOAD_4; ++j) {

            aVal = _mm256_loadu_ps((float*)aPtr);

            yl = _mm256_moveldup_ps(phase_Val);
            yh = _mm256_movehdup_ps(phase_Val);
            ylp = _mm256_moveldup_ps(inc_Val);
            yhp = _mm256_movehdup_ps(inc_Val);

            tmp1 = aVal;
            tmp1p = phase_Val;

            aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm256_mul_ps(aVal, yh);
            tmp2p = _mm256_mul_ps(phase_Val, yhp);

            z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
            phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

            _mm256_storeu_ps((float*)cPtr, z);

            aPtr += 4;
            cPtr += 4;
        }
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp2);
    }
    for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
        aVal = _mm256_loadu_ps((float*)aPtr);

        yl = _mm256_moveldup_ps(phase_Val);
        yh = _mm256_movehdup_ps(phase_Val);
        ylp = _mm256_moveldup_ps(inc_Val);
        yhp = _mm256_movehdup_ps(inc_Val);

        tmp1 = aVal;
        tmp1p = phase_Val;

        aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm256_mul_ps(aVal, yh);
        tmp2p = _mm256_mul_ps(phase_Val, yhp);

        z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
        phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

        _mm256_storeu_ps((float*)cPtr, z);

        aPtr += 4;
        cPtr += 4;
    }
    if (i) {
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp2);
    }

    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    for (i = 0; i < num_points % 4; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (*phase_inc);
    }

    (*phase) = phase_Ptr[0];
}

#endif /* LV_HAVE_AVX && LV_HAVE_FMA */

#endif /* INCLUDED_volk_32fc_s32fc_rotator2_32fc_a_H */