Vector Optimized Library of Kernels  3.1.2
Architecture-tuned implementations of math kernels
volk_32f_x2_dot_prod_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
58 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
59 #define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
60 
61 #include <stdio.h>
62 #include <volk/volk_common.h>
63 
64 
65 #ifdef LV_HAVE_GENERIC
66 
67 
/*
 * Portable reference kernel: multiplies input[i] by taps[i] for every
 * i in [0, num_points) and accumulates the products, in index order, into a
 * single float.
 *
 * result     - receives the accumulated sum (0 when num_points is 0)
 * input      - first operand, num_points floats
 * taps       - second operand, num_points floats
 * num_points - number of element pairs to process
 */
static inline void volk_32f_x2_dot_prod_32f_generic(float* result,
                                                    const float* input,
                                                    const float* taps,
                                                    unsigned int num_points)
{
    float sum = 0;

    for (unsigned int i = 0; i < num_points; i++) {
        sum += input[i] * taps[i];
    }

    *result = sum;
}
85 
86 #endif /*LV_HAVE_GENERIC*/
87 
88 
89 #ifdef LV_HAVE_SSE
90 
91 
92 static inline void volk_32f_x2_dot_prod_32f_u_sse(float* result,
93  const float* input,
94  const float* taps,
95  unsigned int num_points)
96 {
97 
98  unsigned int number = 0;
99  const unsigned int sixteenthPoints = num_points / 16;
100 
101  float dotProduct = 0;
102  const float* aPtr = input;
103  const float* bPtr = taps;
104 
105  __m128 a0Val, a1Val, a2Val, a3Val;
106  __m128 b0Val, b1Val, b2Val, b3Val;
107  __m128 c0Val, c1Val, c2Val, c3Val;
108 
109  __m128 dotProdVal0 = _mm_setzero_ps();
110  __m128 dotProdVal1 = _mm_setzero_ps();
111  __m128 dotProdVal2 = _mm_setzero_ps();
112  __m128 dotProdVal3 = _mm_setzero_ps();
113 
114  for (; number < sixteenthPoints; number++) {
115 
116  a0Val = _mm_loadu_ps(aPtr);
117  a1Val = _mm_loadu_ps(aPtr + 4);
118  a2Val = _mm_loadu_ps(aPtr + 8);
119  a3Val = _mm_loadu_ps(aPtr + 12);
120  b0Val = _mm_loadu_ps(bPtr);
121  b1Val = _mm_loadu_ps(bPtr + 4);
122  b2Val = _mm_loadu_ps(bPtr + 8);
123  b3Val = _mm_loadu_ps(bPtr + 12);
124 
125  c0Val = _mm_mul_ps(a0Val, b0Val);
126  c1Val = _mm_mul_ps(a1Val, b1Val);
127  c2Val = _mm_mul_ps(a2Val, b2Val);
128  c3Val = _mm_mul_ps(a3Val, b3Val);
129 
130  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
131  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
132  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
133  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
134 
135  aPtr += 16;
136  bPtr += 16;
137  }
138 
139  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
140  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
141  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
142 
143  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
144 
145  _mm_store_ps(dotProductVector,
146  dotProdVal0); // Store the results back into the dot product vector
147 
148  dotProduct = dotProductVector[0];
149  dotProduct += dotProductVector[1];
150  dotProduct += dotProductVector[2];
151  dotProduct += dotProductVector[3];
152 
153  number = sixteenthPoints * 16;
154  for (; number < num_points; number++) {
155  dotProduct += ((*aPtr++) * (*bPtr++));
156  }
157 
158  *result = dotProduct;
159 }
160 
161 #endif /*LV_HAVE_SSE*/
162 
163 #ifdef LV_HAVE_SSE3
164 
165 #include <pmmintrin.h>
166 
167 static inline void volk_32f_x2_dot_prod_32f_u_sse3(float* result,
168  const float* input,
169  const float* taps,
170  unsigned int num_points)
171 {
172  unsigned int number = 0;
173  const unsigned int sixteenthPoints = num_points / 16;
174 
175  float dotProduct = 0;
176  const float* aPtr = input;
177  const float* bPtr = taps;
178 
179  __m128 a0Val, a1Val, a2Val, a3Val;
180  __m128 b0Val, b1Val, b2Val, b3Val;
181  __m128 c0Val, c1Val, c2Val, c3Val;
182 
183  __m128 dotProdVal0 = _mm_setzero_ps();
184  __m128 dotProdVal1 = _mm_setzero_ps();
185  __m128 dotProdVal2 = _mm_setzero_ps();
186  __m128 dotProdVal3 = _mm_setzero_ps();
187 
188  for (; number < sixteenthPoints; number++) {
189 
190  a0Val = _mm_loadu_ps(aPtr);
191  a1Val = _mm_loadu_ps(aPtr + 4);
192  a2Val = _mm_loadu_ps(aPtr + 8);
193  a3Val = _mm_loadu_ps(aPtr + 12);
194  b0Val = _mm_loadu_ps(bPtr);
195  b1Val = _mm_loadu_ps(bPtr + 4);
196  b2Val = _mm_loadu_ps(bPtr + 8);
197  b3Val = _mm_loadu_ps(bPtr + 12);
198 
199  c0Val = _mm_mul_ps(a0Val, b0Val);
200  c1Val = _mm_mul_ps(a1Val, b1Val);
201  c2Val = _mm_mul_ps(a2Val, b2Val);
202  c3Val = _mm_mul_ps(a3Val, b3Val);
203 
204  dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
205  dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
206  dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
207  dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
208 
209  aPtr += 16;
210  bPtr += 16;
211  }
212 
213  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
214  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
215  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
216 
217  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
218  _mm_store_ps(dotProductVector,
219  dotProdVal0); // Store the results back into the dot product vector
220 
221  dotProduct = dotProductVector[0];
222  dotProduct += dotProductVector[1];
223  dotProduct += dotProductVector[2];
224  dotProduct += dotProductVector[3];
225 
226  number = sixteenthPoints * 16;
227  for (; number < num_points; number++) {
228  dotProduct += ((*aPtr++) * (*bPtr++));
229  }
230 
231  *result = dotProduct;
232 }
233 
234 #endif /*LV_HAVE_SSE3*/
235 
236 #ifdef LV_HAVE_SSE4_1
237 
238 #include <smmintrin.h>
239 
240 static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float* result,
241  const float* input,
242  const float* taps,
243  unsigned int num_points)
244 {
245  unsigned int number = 0;
246  const unsigned int sixteenthPoints = num_points / 16;
247 
248  float dotProduct = 0;
249  const float* aPtr = input;
250  const float* bPtr = taps;
251 
252  __m128 aVal1, bVal1, cVal1;
253  __m128 aVal2, bVal2, cVal2;
254  __m128 aVal3, bVal3, cVal3;
255  __m128 aVal4, bVal4, cVal4;
256 
257  __m128 dotProdVal = _mm_setzero_ps();
258 
259  for (; number < sixteenthPoints; number++) {
260 
261  aVal1 = _mm_loadu_ps(aPtr);
262  aPtr += 4;
263  aVal2 = _mm_loadu_ps(aPtr);
264  aPtr += 4;
265  aVal3 = _mm_loadu_ps(aPtr);
266  aPtr += 4;
267  aVal4 = _mm_loadu_ps(aPtr);
268  aPtr += 4;
269 
270  bVal1 = _mm_loadu_ps(bPtr);
271  bPtr += 4;
272  bVal2 = _mm_loadu_ps(bPtr);
273  bPtr += 4;
274  bVal3 = _mm_loadu_ps(bPtr);
275  bPtr += 4;
276  bVal4 = _mm_loadu_ps(bPtr);
277  bPtr += 4;
278 
279  cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
280  cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
281  cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
282  cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
283 
284  cVal1 = _mm_or_ps(cVal1, cVal2);
285  cVal3 = _mm_or_ps(cVal3, cVal4);
286  cVal1 = _mm_or_ps(cVal1, cVal3);
287 
288  dotProdVal = _mm_add_ps(dotProdVal, cVal1);
289  }
290 
291  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
292  _mm_store_ps(dotProductVector,
293  dotProdVal); // Store the results back into the dot product vector
294 
295  dotProduct = dotProductVector[0];
296  dotProduct += dotProductVector[1];
297  dotProduct += dotProductVector[2];
298  dotProduct += dotProductVector[3];
299 
300  number = sixteenthPoints * 16;
301  for (; number < num_points; number++) {
302  dotProduct += ((*aPtr++) * (*bPtr++));
303  }
304 
305  *result = dotProduct;
306 }
307 
308 #endif /*LV_HAVE_SSE4_1*/
309 
310 #ifdef LV_HAVE_AVX
311 
312 #include <immintrin.h>
313 
314 static inline void volk_32f_x2_dot_prod_32f_u_avx(float* result,
315  const float* input,
316  const float* taps,
317  unsigned int num_points)
318 {
319 
320  unsigned int number = 0;
321  const unsigned int sixteenthPoints = num_points / 16;
322 
323  float dotProduct = 0;
324  const float* aPtr = input;
325  const float* bPtr = taps;
326 
327  __m256 a0Val, a1Val;
328  __m256 b0Val, b1Val;
329  __m256 c0Val, c1Val;
330 
331  __m256 dotProdVal0 = _mm256_setzero_ps();
332  __m256 dotProdVal1 = _mm256_setzero_ps();
333 
334  for (; number < sixteenthPoints; number++) {
335 
336  a0Val = _mm256_loadu_ps(aPtr);
337  a1Val = _mm256_loadu_ps(aPtr + 8);
338  b0Val = _mm256_loadu_ps(bPtr);
339  b1Val = _mm256_loadu_ps(bPtr + 8);
340 
341  c0Val = _mm256_mul_ps(a0Val, b0Val);
342  c1Val = _mm256_mul_ps(a1Val, b1Val);
343 
344  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
345  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
346 
347  aPtr += 16;
348  bPtr += 16;
349  }
350 
351  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
352 
353  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
354 
355  _mm256_storeu_ps(dotProductVector,
356  dotProdVal0); // Store the results back into the dot product vector
357 
358  dotProduct = dotProductVector[0];
359  dotProduct += dotProductVector[1];
360  dotProduct += dotProductVector[2];
361  dotProduct += dotProductVector[3];
362  dotProduct += dotProductVector[4];
363  dotProduct += dotProductVector[5];
364  dotProduct += dotProductVector[6];
365  dotProduct += dotProductVector[7];
366 
367  number = sixteenthPoints * 16;
368  for (; number < num_points; number++) {
369  dotProduct += ((*aPtr++) * (*bPtr++));
370  }
371 
372  *result = dotProduct;
373 }
374 
375 #endif /*LV_HAVE_AVX*/
376 
377 #if LV_HAVE_AVX2 && LV_HAVE_FMA
378 #include <immintrin.h>
379 static inline void volk_32f_x2_dot_prod_32f_u_avx2_fma(float* result,
380  const float* input,
381  const float* taps,
382  unsigned int num_points)
383 {
384  unsigned int number;
385  const unsigned int eighthPoints = num_points / 8;
386 
387  const float* aPtr = input;
388  const float* bPtr = taps;
389 
390  __m256 dotProdVal = _mm256_setzero_ps();
391  __m256 aVal1, bVal1;
392 
393  for (number = 0; number < eighthPoints; number++) {
394 
395  aVal1 = _mm256_loadu_ps(aPtr);
396  bVal1 = _mm256_loadu_ps(bPtr);
397  aPtr += 8;
398  bPtr += 8;
399 
400  dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal);
401  }
402 
403  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
404  _mm256_storeu_ps(dotProductVector,
405  dotProdVal); // Store the results back into the dot product vector
406 
407  float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
408  dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
409  dotProductVector[6] + dotProductVector[7];
410 
411  for (number = eighthPoints * 8; number < num_points; number++) {
412  dotProduct += ((*aPtr++) * (*bPtr++));
413  }
414 
415  *result = dotProduct;
416 }
417 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
418 
419 #if LV_HAVE_AVX512F
420 #include <immintrin.h>
421 static inline void volk_32f_x2_dot_prod_32f_u_avx512f(float* result,
422  const float* input,
423  const float* taps,
424  unsigned int num_points)
425 {
426  unsigned int number;
427  const unsigned int sixteenthPoints = num_points / 16;
428 
429  const float* aPtr = input;
430  const float* bPtr = taps;
431 
432  __m512 dotProdVal = _mm512_setzero_ps();
433  __m512 aVal1, bVal1;
434 
435  for (number = 0; number < sixteenthPoints; number++) {
436 
437  aVal1 = _mm512_loadu_ps(aPtr);
438  bVal1 = _mm512_loadu_ps(bPtr);
439  aPtr += 16;
440  bPtr += 16;
441 
442  dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal);
443  }
444 
445  __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
446  _mm512_storeu_ps(dotProductVector,
447  dotProdVal); // Store the results back into the dot product vector
448 
449  float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
450  dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
451  dotProductVector[6] + dotProductVector[7] + dotProductVector[8] +
452  dotProductVector[9] + dotProductVector[10] + dotProductVector[11] +
453  dotProductVector[12] + dotProductVector[13] +
454  dotProductVector[14] + dotProductVector[15];
455 
456  for (number = sixteenthPoints * 16; number < num_points; number++) {
457  dotProduct += ((*aPtr++) * (*bPtr++));
458  }
459 
460  *result = dotProduct;
461 }
462 #endif /* LV_HAVE_AVX512F */
463 
464 #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/
465 
466 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H
467 #define INCLUDED_volk_32f_x2_dot_prod_32f_a_H
468 
469 #include <stdio.h>
470 #include <volk/volk_common.h>
471 
472 
473 #ifdef LV_HAVE_SSE
474 
475 
476 static inline void volk_32f_x2_dot_prod_32f_a_sse(float* result,
477  const float* input,
478  const float* taps,
479  unsigned int num_points)
480 {
481 
482  unsigned int number = 0;
483  const unsigned int sixteenthPoints = num_points / 16;
484 
485  float dotProduct = 0;
486  const float* aPtr = input;
487  const float* bPtr = taps;
488 
489  __m128 a0Val, a1Val, a2Val, a3Val;
490  __m128 b0Val, b1Val, b2Val, b3Val;
491  __m128 c0Val, c1Val, c2Val, c3Val;
492 
493  __m128 dotProdVal0 = _mm_setzero_ps();
494  __m128 dotProdVal1 = _mm_setzero_ps();
495  __m128 dotProdVal2 = _mm_setzero_ps();
496  __m128 dotProdVal3 = _mm_setzero_ps();
497 
498  for (; number < sixteenthPoints; number++) {
499 
500  a0Val = _mm_load_ps(aPtr);
501  a1Val = _mm_load_ps(aPtr + 4);
502  a2Val = _mm_load_ps(aPtr + 8);
503  a3Val = _mm_load_ps(aPtr + 12);
504  b0Val = _mm_load_ps(bPtr);
505  b1Val = _mm_load_ps(bPtr + 4);
506  b2Val = _mm_load_ps(bPtr + 8);
507  b3Val = _mm_load_ps(bPtr + 12);
508 
509  c0Val = _mm_mul_ps(a0Val, b0Val);
510  c1Val = _mm_mul_ps(a1Val, b1Val);
511  c2Val = _mm_mul_ps(a2Val, b2Val);
512  c3Val = _mm_mul_ps(a3Val, b3Val);
513 
514  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
515  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
516  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
517  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
518 
519  aPtr += 16;
520  bPtr += 16;
521  }
522 
523  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
524  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
525  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
526 
527  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
528 
529  _mm_store_ps(dotProductVector,
530  dotProdVal0); // Store the results back into the dot product vector
531 
532  dotProduct = dotProductVector[0];
533  dotProduct += dotProductVector[1];
534  dotProduct += dotProductVector[2];
535  dotProduct += dotProductVector[3];
536 
537  number = sixteenthPoints * 16;
538  for (; number < num_points; number++) {
539  dotProduct += ((*aPtr++) * (*bPtr++));
540  }
541 
542  *result = dotProduct;
543 }
544 
545 #endif /*LV_HAVE_SSE*/
546 
547 #ifdef LV_HAVE_SSE3
548 
549 #include <pmmintrin.h>
550 
551 static inline void volk_32f_x2_dot_prod_32f_a_sse3(float* result,
552  const float* input,
553  const float* taps,
554  unsigned int num_points)
555 {
556  unsigned int number = 0;
557  const unsigned int sixteenthPoints = num_points / 16;
558 
559  float dotProduct = 0;
560  const float* aPtr = input;
561  const float* bPtr = taps;
562 
563  __m128 a0Val, a1Val, a2Val, a3Val;
564  __m128 b0Val, b1Val, b2Val, b3Val;
565  __m128 c0Val, c1Val, c2Val, c3Val;
566 
567  __m128 dotProdVal0 = _mm_setzero_ps();
568  __m128 dotProdVal1 = _mm_setzero_ps();
569  __m128 dotProdVal2 = _mm_setzero_ps();
570  __m128 dotProdVal3 = _mm_setzero_ps();
571 
572  for (; number < sixteenthPoints; number++) {
573 
574  a0Val = _mm_load_ps(aPtr);
575  a1Val = _mm_load_ps(aPtr + 4);
576  a2Val = _mm_load_ps(aPtr + 8);
577  a3Val = _mm_load_ps(aPtr + 12);
578  b0Val = _mm_load_ps(bPtr);
579  b1Val = _mm_load_ps(bPtr + 4);
580  b2Val = _mm_load_ps(bPtr + 8);
581  b3Val = _mm_load_ps(bPtr + 12);
582 
583  c0Val = _mm_mul_ps(a0Val, b0Val);
584  c1Val = _mm_mul_ps(a1Val, b1Val);
585  c2Val = _mm_mul_ps(a2Val, b2Val);
586  c3Val = _mm_mul_ps(a3Val, b3Val);
587 
588  dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
589  dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
590  dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
591  dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
592 
593  aPtr += 16;
594  bPtr += 16;
595  }
596 
597  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
598  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
599  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
600 
601  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
602  _mm_store_ps(dotProductVector,
603  dotProdVal0); // Store the results back into the dot product vector
604 
605  dotProduct = dotProductVector[0];
606  dotProduct += dotProductVector[1];
607  dotProduct += dotProductVector[2];
608  dotProduct += dotProductVector[3];
609 
610  number = sixteenthPoints * 16;
611  for (; number < num_points; number++) {
612  dotProduct += ((*aPtr++) * (*bPtr++));
613  }
614 
615  *result = dotProduct;
616 }
617 
618 #endif /*LV_HAVE_SSE3*/
619 
620 #ifdef LV_HAVE_SSE4_1
621 
622 #include <smmintrin.h>
623 
624 static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float* result,
625  const float* input,
626  const float* taps,
627  unsigned int num_points)
628 {
629  unsigned int number = 0;
630  const unsigned int sixteenthPoints = num_points / 16;
631 
632  float dotProduct = 0;
633  const float* aPtr = input;
634  const float* bPtr = taps;
635 
636  __m128 aVal1, bVal1, cVal1;
637  __m128 aVal2, bVal2, cVal2;
638  __m128 aVal3, bVal3, cVal3;
639  __m128 aVal4, bVal4, cVal4;
640 
641  __m128 dotProdVal = _mm_setzero_ps();
642 
643  for (; number < sixteenthPoints; number++) {
644 
645  aVal1 = _mm_load_ps(aPtr);
646  aPtr += 4;
647  aVal2 = _mm_load_ps(aPtr);
648  aPtr += 4;
649  aVal3 = _mm_load_ps(aPtr);
650  aPtr += 4;
651  aVal4 = _mm_load_ps(aPtr);
652  aPtr += 4;
653 
654  bVal1 = _mm_load_ps(bPtr);
655  bPtr += 4;
656  bVal2 = _mm_load_ps(bPtr);
657  bPtr += 4;
658  bVal3 = _mm_load_ps(bPtr);
659  bPtr += 4;
660  bVal4 = _mm_load_ps(bPtr);
661  bPtr += 4;
662 
663  cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
664  cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
665  cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
666  cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
667 
668  cVal1 = _mm_or_ps(cVal1, cVal2);
669  cVal3 = _mm_or_ps(cVal3, cVal4);
670  cVal1 = _mm_or_ps(cVal1, cVal3);
671 
672  dotProdVal = _mm_add_ps(dotProdVal, cVal1);
673  }
674 
675  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
676  _mm_store_ps(dotProductVector,
677  dotProdVal); // Store the results back into the dot product vector
678 
679  dotProduct = dotProductVector[0];
680  dotProduct += dotProductVector[1];
681  dotProduct += dotProductVector[2];
682  dotProduct += dotProductVector[3];
683 
684  number = sixteenthPoints * 16;
685  for (; number < num_points; number++) {
686  dotProduct += ((*aPtr++) * (*bPtr++));
687  }
688 
689  *result = dotProduct;
690 }
691 
692 #endif /*LV_HAVE_SSE4_1*/
693 
694 #ifdef LV_HAVE_AVX
695 
696 #include <immintrin.h>
697 
698 static inline void volk_32f_x2_dot_prod_32f_a_avx(float* result,
699  const float* input,
700  const float* taps,
701  unsigned int num_points)
702 {
703 
704  unsigned int number = 0;
705  const unsigned int sixteenthPoints = num_points / 16;
706 
707  float dotProduct = 0;
708  const float* aPtr = input;
709  const float* bPtr = taps;
710 
711  __m256 a0Val, a1Val;
712  __m256 b0Val, b1Val;
713  __m256 c0Val, c1Val;
714 
715  __m256 dotProdVal0 = _mm256_setzero_ps();
716  __m256 dotProdVal1 = _mm256_setzero_ps();
717 
718  for (; number < sixteenthPoints; number++) {
719 
720  a0Val = _mm256_load_ps(aPtr);
721  a1Val = _mm256_load_ps(aPtr + 8);
722  b0Val = _mm256_load_ps(bPtr);
723  b1Val = _mm256_load_ps(bPtr + 8);
724 
725  c0Val = _mm256_mul_ps(a0Val, b0Val);
726  c1Val = _mm256_mul_ps(a1Val, b1Val);
727 
728  dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
729  dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
730 
731  aPtr += 16;
732  bPtr += 16;
733  }
734 
735  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
736 
737  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
738 
739  _mm256_store_ps(dotProductVector,
740  dotProdVal0); // Store the results back into the dot product vector
741 
742  dotProduct = dotProductVector[0];
743  dotProduct += dotProductVector[1];
744  dotProduct += dotProductVector[2];
745  dotProduct += dotProductVector[3];
746  dotProduct += dotProductVector[4];
747  dotProduct += dotProductVector[5];
748  dotProduct += dotProductVector[6];
749  dotProduct += dotProductVector[7];
750 
751  number = sixteenthPoints * 16;
752  for (; number < num_points; number++) {
753  dotProduct += ((*aPtr++) * (*bPtr++));
754  }
755 
756  *result = dotProduct;
757 }
758 #endif /*LV_HAVE_AVX*/
759 
760 
761 #if LV_HAVE_AVX2 && LV_HAVE_FMA
762 #include <immintrin.h>
763 static inline void volk_32f_x2_dot_prod_32f_a_avx2_fma(float* result,
764  const float* input,
765  const float* taps,
766  unsigned int num_points)
767 {
768  unsigned int number;
769  const unsigned int eighthPoints = num_points / 8;
770 
771  const float* aPtr = input;
772  const float* bPtr = taps;
773 
774  __m256 dotProdVal = _mm256_setzero_ps();
775  __m256 aVal1, bVal1;
776 
777  for (number = 0; number < eighthPoints; number++) {
778 
779  aVal1 = _mm256_load_ps(aPtr);
780  bVal1 = _mm256_load_ps(bPtr);
781  aPtr += 8;
782  bPtr += 8;
783 
784  dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal);
785  }
786 
787  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
788  _mm256_store_ps(dotProductVector,
789  dotProdVal); // Store the results back into the dot product vector
790 
791  float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
792  dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
793  dotProductVector[6] + dotProductVector[7];
794 
795  for (number = eighthPoints * 8; number < num_points; number++) {
796  dotProduct += ((*aPtr++) * (*bPtr++));
797  }
798 
799  *result = dotProduct;
800 }
801 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
802 
803 #if LV_HAVE_AVX512F
804 #include <immintrin.h>
805 static inline void volk_32f_x2_dot_prod_32f_a_avx512f(float* result,
806  const float* input,
807  const float* taps,
808  unsigned int num_points)
809 {
810  unsigned int number;
811  const unsigned int sixteenthPoints = num_points / 16;
812 
813  const float* aPtr = input;
814  const float* bPtr = taps;
815 
816  __m512 dotProdVal = _mm512_setzero_ps();
817  __m512 aVal1, bVal1;
818 
819  for (number = 0; number < sixteenthPoints; number++) {
820 
821  aVal1 = _mm512_load_ps(aPtr);
822  bVal1 = _mm512_load_ps(bPtr);
823  aPtr += 16;
824  bPtr += 16;
825 
826  dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal);
827  }
828 
829  __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
830  _mm512_store_ps(dotProductVector,
831  dotProdVal); // Store the results back into the dot product vector
832 
833  float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
834  dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
835  dotProductVector[6] + dotProductVector[7] + dotProductVector[8] +
836  dotProductVector[9] + dotProductVector[10] + dotProductVector[11] +
837  dotProductVector[12] + dotProductVector[13] +
838  dotProductVector[14] + dotProductVector[15];
839 
840  for (number = sixteenthPoints * 16; number < num_points; number++) {
841  dotProduct += ((*aPtr++) * (*bPtr++));
842  }
843 
844  *result = dotProduct;
845 }
846 #endif /* LV_HAVE_AVX512F */
847 
848 #ifdef LV_HAVE_NEON
849 #include <arm_neon.h>
850 
851 static inline void volk_32f_x2_dot_prod_32f_neonopts(float* result,
852  const float* input,
853  const float* taps,
854  unsigned int num_points)
855 {
856 
857  unsigned int quarter_points = num_points / 16;
858  float dotProduct = 0;
859  const float* aPtr = input;
860  const float* bPtr = taps;
861  unsigned int number = 0;
862 
863  float32x4x4_t a_val, b_val, accumulator0;
864  accumulator0.val[0] = vdupq_n_f32(0);
865  accumulator0.val[1] = vdupq_n_f32(0);
866  accumulator0.val[2] = vdupq_n_f32(0);
867  accumulator0.val[3] = vdupq_n_f32(0);
868  // factor of 4 loop unroll with independent accumulators
869  // uses 12 out of 16 neon q registers
870  for (number = 0; number < quarter_points; ++number) {
871  a_val = vld4q_f32(aPtr);
872  b_val = vld4q_f32(bPtr);
873  accumulator0.val[0] = vmlaq_f32(accumulator0.val[0], a_val.val[0], b_val.val[0]);
874  accumulator0.val[1] = vmlaq_f32(accumulator0.val[1], a_val.val[1], b_val.val[1]);
875  accumulator0.val[2] = vmlaq_f32(accumulator0.val[2], a_val.val[2], b_val.val[2]);
876  accumulator0.val[3] = vmlaq_f32(accumulator0.val[3], a_val.val[3], b_val.val[3]);
877  aPtr += 16;
878  bPtr += 16;
879  }
880  accumulator0.val[0] = vaddq_f32(accumulator0.val[0], accumulator0.val[1]);
881  accumulator0.val[2] = vaddq_f32(accumulator0.val[2], accumulator0.val[3]);
882  accumulator0.val[0] = vaddq_f32(accumulator0.val[2], accumulator0.val[0]);
883  __VOLK_ATTR_ALIGNED(32) float accumulator[4];
884  vst1q_f32(accumulator, accumulator0.val[0]);
885  dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];
886 
887  for (number = quarter_points * 16; number < num_points; number++) {
888  dotProduct += ((*aPtr++) * (*bPtr++));
889  }
890 
891  *result = dotProduct;
892 }
893 
894 #endif
895 
896 
897 #ifdef LV_HAVE_NEON
898 static inline void volk_32f_x2_dot_prod_32f_neon(float* result,
899  const float* input,
900  const float* taps,
901  unsigned int num_points)
902 {
903 
904  unsigned int quarter_points = num_points / 8;
905  float dotProduct = 0;
906  const float* aPtr = input;
907  const float* bPtr = taps;
908  unsigned int number = 0;
909 
910  float32x4x2_t a_val, b_val, accumulator_val;
911  accumulator_val.val[0] = vdupq_n_f32(0);
912  accumulator_val.val[1] = vdupq_n_f32(0);
913  // factor of 2 loop unroll with independent accumulators
914  for (number = 0; number < quarter_points; ++number) {
915  a_val = vld2q_f32(aPtr);
916  b_val = vld2q_f32(bPtr);
917  accumulator_val.val[0] =
918  vmlaq_f32(accumulator_val.val[0], a_val.val[0], b_val.val[0]);
919  accumulator_val.val[1] =
920  vmlaq_f32(accumulator_val.val[1], a_val.val[1], b_val.val[1]);
921  aPtr += 8;
922  bPtr += 8;
923  }
924  accumulator_val.val[0] = vaddq_f32(accumulator_val.val[0], accumulator_val.val[1]);
925  __VOLK_ATTR_ALIGNED(32) float accumulator[4];
926  vst1q_f32(accumulator, accumulator_val.val[0]);
927  dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];
928 
929  for (number = quarter_points * 8; number < num_points; number++) {
930  dotProduct += ((*aPtr++) * (*bPtr++));
931  }
932 
933  *result = dotProduct;
934 }
935 
936 #endif /* LV_HAVE_NEON */
937 
938 #ifdef LV_HAVE_NEONV7
/*
 * Prototype for a kernel whose body lives outside this header; it is only
 * declared when LV_HAVE_NEONV7 is defined.
 * NOTE(review): the signature mirrors the other kernels in this file, so
 * cVector presumably receives the dot product of aVector and bVector over
 * num_points elements -- confirm against the external implementation.
 */
extern void volk_32f_x2_dot_prod_32f_a_neonasm(float* cVector,
                                               const float* aVector,
                                               const float* bVector,
                                               unsigned int num_points);
943 #endif /* LV_HAVE_NEONV7 */
944 
945 #ifdef LV_HAVE_NEONV7
/*
 * Prototype for a second externally implemented kernel, declared only when
 * LV_HAVE_NEONV7 is defined; its body is not part of this header.
 * NOTE(review): the signature mirrors the other kernels in this file, so
 * cVector presumably receives the dot product of aVector and bVector over
 * num_points elements -- confirm against the external implementation.
 */
extern void volk_32f_x2_dot_prod_32f_a_neonasm_opts(float* cVector,
                                                    const float* aVector,
                                                    const float* bVector,
                                                    unsigned int num_points);
950 #endif /* LV_HAVE_NEONV7 */
951 
952 #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/
static void volk_32f_x2_dot_prod_32f_u_avx(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:314
static void volk_32f_x2_dot_prod_32f_a_sse3(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:551
static void volk_32f_x2_dot_prod_32f_a_avx(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:698
static void volk_32f_x2_dot_prod_32f_u_sse(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:92
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:62
static void volk_32f_x2_dot_prod_32f_u_sse3(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:167
static void volk_32f_x2_dot_prod_32f_a_sse(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:476
static void volk_32f_x2_dot_prod_32f_neon(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:898
static void volk_32f_x2_dot_prod_32f_generic(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:68
static void volk_32f_x2_dot_prod_32f_neonopts(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_32f.h:851