1 #ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
2 #define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
/* Larger of X and Y. X is evaluated in the comparison and again if it is
 * selected (likewise Y), so arguments should be free of side effects. */
9 #define MAX(X,Y) ((X) > (Y)?(X):(Y))
/*
 * SSE3 kernel that accumulates a clamped fourth-order polynomial over src0.
 *
 * Every sample x is first raised to at least *cutoff, and the running sum
 * then gains
 *     c[0]*x + c[1]*x^2 + c[2]*x^3 + c[3]*x^4
 * where c is center_point_array. c[4] is added once per processed sample
 * after the loops.
 *
 * target             - output pointer; it is not written in the lines visible
 *                      here (NOTE(review): presumably receives `result` -
 *                      confirm against the full file)
 * src0               - input samples; read with _mm_load_ps, which requires a
 *                      16-byte aligned address
 * center_point_array - coefficients; indices 0..4 are read
 * cutoff             - pointer to the lower clamp applied to every sample
 * num_bytes          - input size in bytes; num_bytes >> 2 floats are counted
 *
 * NOTE(review): the number leading each line is listing residue and the
 * sequence skips values, so some statements are missing from this view: the
 * declarations of i, result, fst, sq, thrd and frth, the closing braces, and
 * any update of src0 inside the vector loop are not visible here.
 */
16 static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(
float* target,
float* src0,
float* center_point_array,
float* cutoff,
unsigned int num_bytes) {
28 __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
/* xmm9 and xmm1 are the two running vector accumulators. */
30 xmm9 = _mm_setzero_ps();
31 xmm1 = _mm_setzero_ps();
/* Broadcast each of the four polynomial coefficients to all four lanes. */
33 xmm0 = _mm_load1_ps(&center_point_array[0]);
34 xmm6 = _mm_load1_ps(&center_point_array[1]);
35 xmm7 = _mm_load1_ps(&center_point_array[2]);
36 xmm8 = _mm_load1_ps(&center_point_array[3]);
/* Broadcast the clamp value. */
38 xmm10 = _mm_load1_ps(cutoff);
/* bound: number of whole 16-byte groups (four floats each);
 * leftovers: floats remaining after those groups (0..3). */
40 int bound = num_bytes >> 4;
41 int leftovers = (num_bytes >> 2) & 3;
44 for(; i < bound; ++i) {
45 xmm2 = _mm_load_ps(src0);
/* Clamp: per-lane max of the cutoff and the sample. */
46 xmm2 = _mm_max_ps(xmm10, xmm2);
/* Powers of the clamped sample: xmm3 = x^2, xmm4 = x^3, xmm5 = x^4. */
47 xmm3 = _mm_mul_ps(xmm2, xmm2);
48 xmm4 = _mm_mul_ps(xmm2, xmm3);
49 xmm5 = _mm_mul_ps(xmm3, xmm3);
/* Scale each power by its coefficient. */
52 xmm2 = _mm_mul_ps(xmm2, xmm0);
53 xmm3 = _mm_mul_ps(xmm3, xmm6);
54 xmm4 = _mm_mul_ps(xmm4, xmm7);
55 xmm5 = _mm_mul_ps(xmm5, xmm8);
/* Pair the terms: (x + x^2 terms) and (x^3 + x^4 terms). */
58 xmm2 = _mm_add_ps(xmm2, xmm3);
59 xmm3 = _mm_add_ps(xmm4, xmm5);
/* Fold each pair into its own accumulator. */
63 xmm9 = _mm_add_ps(xmm2, xmm9);
65 xmm1 = _mm_add_ps(xmm3, xmm1);
/* Horizontal reduction: after three hadd steps lane 0 of xmm4 holds the sum
 * of all eight accumulator lanes, which is stored into result. */
70 xmm2 = _mm_hadd_ps(xmm9, xmm1);
71 xmm3 = _mm_hadd_ps(xmm2, xmm2);
72 xmm4 = _mm_hadd_ps(xmm3, xmm3);
74 _mm_store_ss(&result, xmm4);
/* Scalar pass over the samples that did not fill a whole vector. How fst, sq,
 * thrd and frth are assigned from the input is not visible in this listing. */
78 for(i = 0; i < leftovers; ++i) {
80 fst =
MAX(fst, *cutoff);
86 result += (center_point_array[0] * fst +
87 center_point_array[1] * sq +
88 center_point_array[2] * thrd +
89 center_point_array[3] * frth);
/* Constant term: center_point_array[4] once per processed sample. */
93 result += ((float)((bound * 4) + leftovers)) * center_point_array[4];
101 #ifdef LV_HAVE_GENERIC
/*
 * Portable scalar kernel that accumulates a clamped fourth-order polynomial.
 *
 * The loop runs once per float (num_bytes >> 2 iterations). In each iteration
 * fst is raised to at least *cutoff and the running sum gains
 *     c[0]*fst + c[1]*sq + c[2]*thrd + c[3]*frth
 * where c is center_point_array. c[4] is added once per sample afterwards.
 *
 * target             - output pointer; not written in the lines visible here
 *                      (NOTE(review): presumably receives `result` - confirm)
 * src0               - input pointer; not read in the lines visible here
 *                      (NOTE(review): presumably the source of fst - confirm)
 * center_point_array - coefficients; indices 0..4 are read
 * cutoff             - pointer to the lower clamp applied to fst
 * num_bytes          - input size in bytes
 *
 * NOTE(review): the number leading each line is listing residue and the
 * sequence skips values; the declarations of i, result, fst, sq, thrd and
 * frth, their assignments, and the end of the function are not visible here.
 */
103 static inline void volk_32f_x3_sum_of_poly_32f_a_generic(
float* target,
float* src0,
float* center_point_array,
float* cutoff,
unsigned int num_bytes) {
/* One iteration per 4-byte float. */
118 for(; i < num_bytes >> 2; ++i) {
/* Clamp the current value from below by *cutoff. */
120 fst =
MAX(fst, *cutoff);
127 result += (center_point_array[0] * fst +
128 center_point_array[1] * sq +
129 center_point_array[2] * thrd +
130 center_point_array[3] * frth);
/* Constant term: center_point_array[4] once per sample. */
141 result += ((float)(num_bytes >> 2)) * (center_point_array[4]);