Vector Optimized Library of Kernels  3.1.2
Architecture-tuned implementations of math kernels
volk_32fc_32f_dot_prod_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2013, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
50 #ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
51 #define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
52 
53 #include <stdio.h>
54 #include <volk/volk_common.h>
55 
56 #ifdef LV_HAVE_GENERIC
57 
59  const lv_32fc_t* input,
60  const float* taps,
61  unsigned int num_points)
62 {
63 
64  lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
65  const float* aPtr = (float*)input;
66  const float* bPtr = taps;
67  unsigned int number = 0;
68 
69  for (number = 0; number < num_points; number++) {
70  returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
71  aPtr += 2;
72  bPtr += 1;
73  }
74 
75  *result = returnValue;
76 }
77 
78 #endif /*LV_HAVE_GENERIC*/
79 
80 #if LV_HAVE_AVX2 && LV_HAVE_FMA
81 
82 #include <immintrin.h>
83 
84 static inline void volk_32fc_32f_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result,
85  const lv_32fc_t* input,
86  const float* taps,
87  unsigned int num_points)
88 {
89 
90  unsigned int number = 0;
91  const unsigned int sixteenthPoints = num_points / 16;
92 
93  lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
94  const float* aPtr = (float*)input;
95  const float* bPtr = taps;
96 
97  __m256 a0Val, a1Val, a2Val, a3Val;
98  __m256 b0Val, b1Val, b2Val, b3Val;
99  __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
100 
101  __m256 dotProdVal0 = _mm256_setzero_ps();
102  __m256 dotProdVal1 = _mm256_setzero_ps();
103  __m256 dotProdVal2 = _mm256_setzero_ps();
104  __m256 dotProdVal3 = _mm256_setzero_ps();
105 
106  for (; number < sixteenthPoints; number++) {
107 
108  a0Val = _mm256_load_ps(aPtr);
109  a1Val = _mm256_load_ps(aPtr + 8);
110  a2Val = _mm256_load_ps(aPtr + 16);
111  a3Val = _mm256_load_ps(aPtr + 24);
112 
113  x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
114  x1Val = _mm256_load_ps(bPtr + 8);
115  x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
116  x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
117  x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
118  x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
119 
120  // TODO: it may be possible to rearrange swizzling to better pipeline data
121  b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
122  b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
123  b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
124  b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
125 
126  dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
127  dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
128  dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
129  dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
130 
131  aPtr += 32;
132  bPtr += 16;
133  }
134 
135  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
136  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
137  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
138 
139  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
140 
141  _mm256_store_ps(dotProductVector,
142  dotProdVal0); // Store the results back into the dot product vector
143 
144  returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
145  returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
146  returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
147  returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);
148 
149  number = sixteenthPoints * 16;
150  for (; number < num_points; number++) {
151  returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
152  aPtr += 2;
153  bPtr += 1;
154  }
155 
156  *result = returnValue;
157 }
158 
159 #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
160 
161 #ifdef LV_HAVE_AVX
162 
163 #include <immintrin.h>
164 
165 static inline void volk_32fc_32f_dot_prod_32fc_a_avx(lv_32fc_t* result,
166  const lv_32fc_t* input,
167  const float* taps,
168  unsigned int num_points)
169 {
170 
171  unsigned int number = 0;
172  const unsigned int sixteenthPoints = num_points / 16;
173 
174  lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
175  const float* aPtr = (float*)input;
176  const float* bPtr = taps;
177 
178  __m256 a0Val, a1Val, a2Val, a3Val;
179  __m256 b0Val, b1Val, b2Val, b3Val;
180  __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
181  __m256 c0Val, c1Val, c2Val, c3Val;
182 
183  __m256 dotProdVal0 = _mm256_setzero_ps();
184  __m256 dotProdVal1 = _mm256_setzero_ps();
185  __m256 dotProdVal2 = _mm256_setzero_ps();
186  __m256 dotProdVal3 = _mm256_setzero_ps();
187 
188  for (; number < sixteenthPoints; number++) {
189 
190  a0Val = _mm256_load_ps(aPtr);
191  a1Val = _mm256_load_ps(aPtr + 8);
192  a2Val = _mm256_load_ps(aPtr + 16);
193  a3Val = _mm256_load_ps(aPtr + 24);
194 
195  x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
196  x1Val = _mm256_load_ps(bPtr + 8);
197  x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
198  x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
199  x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
200  x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
201 
202  // TODO: it may be possible to rearrange swizzling to better pipeline data
203  b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
204  b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
205  b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
206  b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
207 
208  c0Val = _mm256_mul_ps(a0Val, b0Val);
209  c1Val = _mm256_mul_ps(a1Val, b1Val);
210  c2Val = _mm256_mul_ps(a2Val, b2Val);
211  c3Val = _mm256_mul_ps(a3Val, b3Val);
212 
213  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
214  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
215  dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
216  dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
217 
218  aPtr += 32;
219  bPtr += 16;
220  }
221 
222  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
223  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
224  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
225 
226  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
227 
228  _mm256_store_ps(dotProductVector,
229  dotProdVal0); // Store the results back into the dot product vector
230 
231  returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
232  returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
233  returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
234  returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);
235 
236  number = sixteenthPoints * 16;
237  for (; number < num_points; number++) {
238  returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
239  aPtr += 2;
240  bPtr += 1;
241  }
242 
243  *result = returnValue;
244 }
245 
246 #endif /*LV_HAVE_AVX*/
247 
248 
249 #ifdef LV_HAVE_SSE
250 
251 
252 static inline void volk_32fc_32f_dot_prod_32fc_a_sse(lv_32fc_t* result,
253  const lv_32fc_t* input,
254  const float* taps,
255  unsigned int num_points)
256 {
257 
258  unsigned int number = 0;
259  const unsigned int eighthPoints = num_points / 8;
260 
261  lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
262  const float* aPtr = (float*)input;
263  const float* bPtr = taps;
264 
265  __m128 a0Val, a1Val, a2Val, a3Val;
266  __m128 b0Val, b1Val, b2Val, b3Val;
267  __m128 x0Val, x1Val, x2Val, x3Val;
268  __m128 c0Val, c1Val, c2Val, c3Val;
269 
270  __m128 dotProdVal0 = _mm_setzero_ps();
271  __m128 dotProdVal1 = _mm_setzero_ps();
272  __m128 dotProdVal2 = _mm_setzero_ps();
273  __m128 dotProdVal3 = _mm_setzero_ps();
274 
275  for (; number < eighthPoints; number++) {
276 
277  a0Val = _mm_load_ps(aPtr);
278  a1Val = _mm_load_ps(aPtr + 4);
279  a2Val = _mm_load_ps(aPtr + 8);
280  a3Val = _mm_load_ps(aPtr + 12);
281 
282  x0Val = _mm_load_ps(bPtr);
283  x1Val = _mm_load_ps(bPtr);
284  x2Val = _mm_load_ps(bPtr + 4);
285  x3Val = _mm_load_ps(bPtr + 4);
286  b0Val = _mm_unpacklo_ps(x0Val, x1Val);
287  b1Val = _mm_unpackhi_ps(x0Val, x1Val);
288  b2Val = _mm_unpacklo_ps(x2Val, x3Val);
289  b3Val = _mm_unpackhi_ps(x2Val, x3Val);
290 
291  c0Val = _mm_mul_ps(a0Val, b0Val);
292  c1Val = _mm_mul_ps(a1Val, b1Val);
293  c2Val = _mm_mul_ps(a2Val, b2Val);
294  c3Val = _mm_mul_ps(a3Val, b3Val);
295 
296  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
297  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
298  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
299  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
300 
301  aPtr += 16;
302  bPtr += 8;
303  }
304 
305  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
306  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
307  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
308 
309  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
310 
311  _mm_store_ps(dotProductVector,
312  dotProdVal0); // Store the results back into the dot product vector
313 
314  returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
315  returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
316 
317  number = eighthPoints * 8;
318  for (; number < num_points; number++) {
319  returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
320  aPtr += 2;
321  bPtr += 1;
322  }
323 
324  *result = returnValue;
325 }
326 
327 #endif /*LV_HAVE_SSE*/
328 
329 #if LV_HAVE_AVX2 && LV_HAVE_FMA
330 
331 #include <immintrin.h>
332 
333 static inline void volk_32fc_32f_dot_prod_32fc_u_avx2_fma(lv_32fc_t* result,
334  const lv_32fc_t* input,
335  const float* taps,
336  unsigned int num_points)
337 {
338 
339  unsigned int number = 0;
340  const unsigned int sixteenthPoints = num_points / 16;
341 
342  lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
343  const float* aPtr = (float*)input;
344  const float* bPtr = taps;
345 
346  __m256 a0Val, a1Val, a2Val, a3Val;
347  __m256 b0Val, b1Val, b2Val, b3Val;
348  __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
349 
350  __m256 dotProdVal0 = _mm256_setzero_ps();
351  __m256 dotProdVal1 = _mm256_setzero_ps();
352  __m256 dotProdVal2 = _mm256_setzero_ps();
353  __m256 dotProdVal3 = _mm256_setzero_ps();
354 
355  for (; number < sixteenthPoints; number++) {
356 
357  a0Val = _mm256_loadu_ps(aPtr);
358  a1Val = _mm256_loadu_ps(aPtr + 8);
359  a2Val = _mm256_loadu_ps(aPtr + 16);
360  a3Val = _mm256_loadu_ps(aPtr + 24);
361 
362  x0Val = _mm256_loadu_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
363  x1Val = _mm256_loadu_ps(bPtr + 8);
364  x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
365  x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
366  x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
367  x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
368 
369  // TODO: it may be possible to rearrange swizzling to better pipeline data
370  b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
371  b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
372  b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
373  b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
374 
375  dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
376  dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
377  dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
378  dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
379 
380  aPtr += 32;
381  bPtr += 16;
382  }
383 
384  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
385  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
386  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
387 
388  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
389 
390  _mm256_store_ps(dotProductVector,
391  dotProdVal0); // Store the results back into the dot product vector
392 
393  returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
394  returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
395  returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
396  returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);
397 
398  number = sixteenthPoints * 16;
399  for (; number < num_points; number++) {
400  returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
401  aPtr += 2;
402  bPtr += 1;
403  }
404 
405  *result = returnValue;
406 }
407 
408 #endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
409 
410 #ifdef LV_HAVE_AVX
411 
412 #include <immintrin.h>
413 
414 static inline void volk_32fc_32f_dot_prod_32fc_u_avx(lv_32fc_t* result,
415  const lv_32fc_t* input,
416  const float* taps,
417  unsigned int num_points)
418 {
419 
420  unsigned int number = 0;
421  const unsigned int sixteenthPoints = num_points / 16;
422 
423  lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
424  const float* aPtr = (float*)input;
425  const float* bPtr = taps;
426 
427  __m256 a0Val, a1Val, a2Val, a3Val;
428  __m256 b0Val, b1Val, b2Val, b3Val;
429  __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
430  __m256 c0Val, c1Val, c2Val, c3Val;
431 
432  __m256 dotProdVal0 = _mm256_setzero_ps();
433  __m256 dotProdVal1 = _mm256_setzero_ps();
434  __m256 dotProdVal2 = _mm256_setzero_ps();
435  __m256 dotProdVal3 = _mm256_setzero_ps();
436 
437  for (; number < sixteenthPoints; number++) {
438 
439  a0Val = _mm256_loadu_ps(aPtr);
440  a1Val = _mm256_loadu_ps(aPtr + 8);
441  a2Val = _mm256_loadu_ps(aPtr + 16);
442  a3Val = _mm256_loadu_ps(aPtr + 24);
443 
444  x0Val = _mm256_loadu_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
445  x1Val = _mm256_loadu_ps(bPtr + 8);
446  x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
447  x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
448  x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
449  x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
450 
451  // TODO: it may be possible to rearrange swizzling to better pipeline data
452  b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
453  b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
454  b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
455  b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
456 
457  c0Val = _mm256_mul_ps(a0Val, b0Val);
458  c1Val = _mm256_mul_ps(a1Val, b1Val);
459  c2Val = _mm256_mul_ps(a2Val, b2Val);
460  c3Val = _mm256_mul_ps(a3Val, b3Val);
461 
462  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
463  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
464  dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
465  dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
466 
467  aPtr += 32;
468  bPtr += 16;
469  }
470 
471  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
472  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
473  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
474 
475  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
476 
477  _mm256_store_ps(dotProductVector,
478  dotProdVal0); // Store the results back into the dot product vector
479 
480  returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
481  returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
482  returnValue += lv_cmake(dotProductVector[4], dotProductVector[5]);
483  returnValue += lv_cmake(dotProductVector[6], dotProductVector[7]);
484 
485  number = sixteenthPoints * 16;
486  for (; number < num_points; number++) {
487  returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
488  aPtr += 2;
489  bPtr += 1;
490  }
491 
492  *result = returnValue;
493 }
494 #endif /*LV_HAVE_AVX*/
495 
496 #ifdef LV_HAVE_NEON
497 #include <arm_neon.h>
498 
/*
 * NEON kernel, unrolled by two: dot product of a complex vector with a real
 * vector.
 *
 * Computes the sum over i in [0, num_points) of input[i] * taps[i] and stores
 * the complex sum in *result. Eight complex points are handled per vector
 * iteration using two pairs of (real, imag) accumulators; the remaining
 * num_points % 8 points are handled by a scalar loop.
 *
 * result     - output; receives the accumulated complex sum
 * input      - complex samples (interleaved real/imag floats)
 * taps       - real taps, one per complex sample
 * num_points - number of complex samples (and taps) to process
 */
static inline void
volk_32fc_32f_dot_prod_32fc_neon_unroll(lv_32fc_t* __restrict result,
                                        const lv_32fc_t* __restrict input,
                                        const float* __restrict taps,
                                        unsigned int num_points)
{
    unsigned int number;
    const unsigned int eighthPoints = num_points / 8;

    lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
    /* View the complex input as a flat float array without dropping const. */
    const float* inputPtr = (const float*)input;
    const float* tapsPtr = taps;
    float accVector_real[4];
    float accVector_imag[4];

    float32x4x2_t inputVector0, inputVector1;
    float32x4_t tapsVector0, tapsVector1;
    float32x4_t tmp_real0, tmp_imag0;
    float32x4_t tmp_real1, tmp_imag1;

    /* zero out accumulators */
    float32x4_t real_accumulator0 = vdupq_n_f32(0.0f);
    float32x4_t imag_accumulator0 = vdupq_n_f32(0.0f);
    float32x4_t real_accumulator1 = vdupq_n_f32(0.0f);
    float32x4_t imag_accumulator1 = vdupq_n_f32(0.0f);

    for (number = 0; number < eighthPoints; number++) {
        /* load eight consecutive taps, four per vector */
        tapsVector0 = vld1q_f32(tapsPtr);
        tapsVector1 = vld1q_f32(tapsPtr + 4);

        /* de-interleaving loads of four complex samples each:
         * val[0] holds the real parts, val[1] the imaginary parts */
        inputVector0 = vld2q_f32(inputPtr);
        inputVector1 = vld2q_f32(inputPtr + 8);

        tmp_real0 = vmulq_f32(tapsVector0, inputVector0.val[0]);
        tmp_imag0 = vmulq_f32(tapsVector0, inputVector0.val[1]);

        tmp_real1 = vmulq_f32(tapsVector1, inputVector1.val[0]);
        tmp_imag1 = vmulq_f32(tapsVector1, inputVector1.val[1]);

        real_accumulator0 = vaddq_f32(real_accumulator0, tmp_real0);
        imag_accumulator0 = vaddq_f32(imag_accumulator0, tmp_imag0);

        real_accumulator1 = vaddq_f32(real_accumulator1, tmp_real1);
        imag_accumulator1 = vaddq_f32(imag_accumulator1, tmp_imag1);

        tapsPtr += 8;
        inputPtr += 16;
    }

    /* merge the two unrolled accumulator pairs */
    real_accumulator0 = vaddq_f32(real_accumulator0, real_accumulator1);
    imag_accumulator0 = vaddq_f32(imag_accumulator0, imag_accumulator1);

    /* spill to memory and sum the four lanes of each accumulator */
    vst1q_f32(accVector_real, real_accumulator0);
    vst1q_f32(accVector_imag, imag_accumulator0);
    returnValue += lv_cmake(
        accVector_real[0] + accVector_real[1] + accVector_real[2] + accVector_real[3],
        accVector_imag[0] + accVector_imag[1] + accVector_imag[2] + accVector_imag[3]);

    /* clean up the remainder */
    for (number = eighthPoints * 8; number < num_points; number++) {
        returnValue += lv_cmake(inputPtr[0] * tapsPtr[0], inputPtr[1] * tapsPtr[0]);
        inputPtr += 2;
        tapsPtr += 1;
    }

    *result = returnValue;
}
575 
576 #endif /*LV_HAVE_NEON*/
577 
578 #ifdef LV_HAVE_NEON
579 #include <arm_neon.h>
580 
/*
 * NEON kernel: dot product of a complex vector with a real vector.
 *
 * Computes the sum over i in [0, num_points) of input[i] * taps[i] and stores
 * the complex sum in *result. Four complex points are handled per vector
 * iteration; the remaining num_points % 4 points are handled by a scalar
 * loop.
 *
 * result     - output; receives the accumulated complex sum
 * input      - complex samples (interleaved real/imag floats)
 * taps       - real taps, one per complex sample
 * num_points - number of complex samples (and taps) to process
 */
static inline void volk_32fc_32f_dot_prod_32fc_a_neon(lv_32fc_t* __restrict result,
                                                      const lv_32fc_t* __restrict input,
                                                      const float* __restrict taps,
                                                      unsigned int num_points)
{
    unsigned int number;
    const unsigned int quarterPoints = num_points / 4;

    lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
    /* View the complex input as a flat float array without dropping const. */
    const float* inputPtr = (const float*)input;
    const float* tapsPtr = taps;
    float accVector_real[4];
    float accVector_imag[4];

    float32x4x2_t inputVector;
    float32x4_t tapsVector;
    float32x4_t tmp_real, tmp_imag;

    /* zero out accumulators */
    float32x4_t real_accumulator = vdupq_n_f32(0.0f);
    float32x4_t imag_accumulator = vdupq_n_f32(0.0f);

    for (number = 0; number < quarterPoints; number++) {
        /* load four consecutive taps */
        tapsVector = vld1q_f32(tapsPtr);

        /* de-interleaving load of four complex samples:
         * val[0] holds the real parts, val[1] the imaginary parts */
        inputVector = vld2q_f32(inputPtr);

        tmp_real = vmulq_f32(tapsVector, inputVector.val[0]);
        tmp_imag = vmulq_f32(tapsVector, inputVector.val[1]);

        real_accumulator = vaddq_f32(real_accumulator, tmp_real);
        imag_accumulator = vaddq_f32(imag_accumulator, tmp_imag);

        tapsPtr += 4;
        inputPtr += 8;
    }

    /* spill to memory and sum the four lanes of each accumulator */
    vst1q_f32(accVector_real, real_accumulator);
    vst1q_f32(accVector_imag, imag_accumulator);
    returnValue += lv_cmake(
        accVector_real[0] + accVector_real[1] + accVector_real[2] + accVector_real[3],
        accVector_imag[0] + accVector_imag[1] + accVector_imag[2] + accVector_imag[3]);

    /* clean up the remainder */
    for (number = quarterPoints * 4; number < num_points; number++) {
        returnValue += lv_cmake(inputPtr[0] * tapsPtr[0], inputPtr[1] * tapsPtr[0]);
        inputPtr += 2;
        tapsPtr += 1;
    }

    *result = returnValue;
}
643 
644 #endif /*LV_HAVE_NEON*/
645 
646 #ifdef LV_HAVE_NEONV7
/* Declaration only: this kernel is defined outside this header. It takes the
 * same (result, input, taps, num_points) parameter list as the other
 * dot-product kernels here.
 * NOTE(review): the name suggests a hand-written NEON assembly
 * implementation - confirm against the build's assembly sources. */
647 extern void volk_32fc_32f_dot_prod_32fc_a_neonasm(lv_32fc_t* result,
648  const lv_32fc_t* input,
649  const float* taps,
650  unsigned int num_points);
651 #endif /*LV_HAVE_NEONV7*/
652 
653 #ifdef LV_HAVE_NEONV7
/* Declaration only: this kernel is defined outside this header. It takes the
 * same (result, input, taps, num_points) parameter list as the other
 * dot-product kernels here.
 * NOTE(review): the name suggests an assembly variant built around the VMLA
 * multiply-accumulate instruction - confirm against the assembly sources. */
654 extern void volk_32fc_32f_dot_prod_32fc_a_neonasmvmla(lv_32fc_t* result,
655  const lv_32fc_t* input,
656  const float* taps,
657  unsigned int num_points);
658 #endif /*LV_HAVE_NEONV7*/
659 
660 #ifdef LV_HAVE_NEONV7
/* Declaration only: this kernel is defined outside this header. It takes the
 * same (result, input, taps, num_points) parameter list as the other
 * dot-product kernels here.
 * NOTE(review): the name suggests a software-pipelined assembly variant -
 * confirm against the assembly sources. */
661 extern void volk_32fc_32f_dot_prod_32fc_a_neonpipeline(lv_32fc_t* result,
662  const lv_32fc_t* input,
663  const float* taps,
664  unsigned int num_points);
665 #endif /*LV_HAVE_NEONV7*/
666 
667 #ifdef LV_HAVE_SSE
668 
669 static inline void volk_32fc_32f_dot_prod_32fc_u_sse(lv_32fc_t* result,
670  const lv_32fc_t* input,
671  const float* taps,
672  unsigned int num_points)
673 {
674 
675  unsigned int number = 0;
676  const unsigned int eighthPoints = num_points / 8;
677 
678  lv_32fc_t returnValue = lv_cmake(0.0f, 0.0f);
679  const float* aPtr = (float*)input;
680  const float* bPtr = taps;
681 
682  __m128 a0Val, a1Val, a2Val, a3Val;
683  __m128 b0Val, b1Val, b2Val, b3Val;
684  __m128 x0Val, x1Val, x2Val, x3Val;
685  __m128 c0Val, c1Val, c2Val, c3Val;
686 
687  __m128 dotProdVal0 = _mm_setzero_ps();
688  __m128 dotProdVal1 = _mm_setzero_ps();
689  __m128 dotProdVal2 = _mm_setzero_ps();
690  __m128 dotProdVal3 = _mm_setzero_ps();
691 
692  for (; number < eighthPoints; number++) {
693 
694  a0Val = _mm_loadu_ps(aPtr);
695  a1Val = _mm_loadu_ps(aPtr + 4);
696  a2Val = _mm_loadu_ps(aPtr + 8);
697  a3Val = _mm_loadu_ps(aPtr + 12);
698 
699  x0Val = _mm_loadu_ps(bPtr);
700  x1Val = _mm_loadu_ps(bPtr);
701  x2Val = _mm_loadu_ps(bPtr + 4);
702  x3Val = _mm_loadu_ps(bPtr + 4);
703  b0Val = _mm_unpacklo_ps(x0Val, x1Val);
704  b1Val = _mm_unpackhi_ps(x0Val, x1Val);
705  b2Val = _mm_unpacklo_ps(x2Val, x3Val);
706  b3Val = _mm_unpackhi_ps(x2Val, x3Val);
707 
708  c0Val = _mm_mul_ps(a0Val, b0Val);
709  c1Val = _mm_mul_ps(a1Val, b1Val);
710  c2Val = _mm_mul_ps(a2Val, b2Val);
711  c3Val = _mm_mul_ps(a3Val, b3Val);
712 
713  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
714  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
715  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
716  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
717 
718  aPtr += 16;
719  bPtr += 8;
720  }
721 
722  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
723  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
724  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
725 
726  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
727 
728  _mm_store_ps(dotProductVector,
729  dotProdVal0); // Store the results back into the dot product vector
730 
731  returnValue += lv_cmake(dotProductVector[0], dotProductVector[1]);
732  returnValue += lv_cmake(dotProductVector[2], dotProductVector[3]);
733 
734  number = eighthPoints * 8;
735  for (; number < num_points; number++) {
736  returnValue += lv_cmake(aPtr[0] * bPtr[0], aPtr[1] * bPtr[0]);
737  aPtr += 2;
738  bPtr += 1;
739  }
740 
741  *result = returnValue;
742 }
743 
744 #endif /*LV_HAVE_SSE*/
745 
746 
747 #endif /*INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H*/
static void volk_32fc_32f_dot_prod_32fc_a_avx(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:165
static void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:58
static void volk_32fc_32f_dot_prod_32fc_a_neon(lv_32fc_t *__restrict result, const lv_32fc_t *__restrict input, const float *__restrict taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:581
#define lv_cmake(r, i)
Definition: volk_complex.h:77
static void volk_32fc_32f_dot_prod_32fc_u_sse(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:669
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:62
static void volk_32fc_32f_dot_prod_32fc_u_avx(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:414
float complex lv_32fc_t
Definition: volk_complex.h:74
static void volk_32fc_32f_dot_prod_32fc_a_sse(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:252
static void volk_32fc_32f_dot_prod_32fc_neon_unroll(lv_32fc_t *__restrict result, const lv_32fc_t *__restrict input, const float *__restrict taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc.h:500