68 #ifndef INCLUDED_volk_32f_s32f_stddev_32f_a_H
69 #define INCLUDED_volk_32f_s32f_stddev_32f_a_H
77 #include <smmintrin.h>
80 volk_32f_s32f_stddev_32f_a_sse4_1(
float* stddev,
const float* inputBuffer,
81 const float mean,
unsigned int num_points)
83 float returnValue = 0;
85 unsigned int number = 0;
86 const unsigned int sixteenthPoints = num_points / 16;
88 const float* aPtr = inputBuffer;
92 __m128 squareAccumulator = _mm_setzero_ps();
93 __m128 aVal1, aVal2, aVal3, aVal4;
94 __m128 cVal1, cVal2, cVal3, cVal4;
95 for(;number < sixteenthPoints; number++) {
96 aVal1 = _mm_load_ps(aPtr); aPtr += 4;
97 cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
99 aVal2 = _mm_load_ps(aPtr); aPtr += 4;
100 cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
102 aVal3 = _mm_load_ps(aPtr); aPtr += 4;
103 cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
105 aVal4 = _mm_load_ps(aPtr); aPtr += 4;
106 cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
108 cVal1 = _mm_or_ps(cVal1, cVal2);
109 cVal3 = _mm_or_ps(cVal3, cVal4);
110 cVal1 = _mm_or_ps(cVal1, cVal3);
112 squareAccumulator = _mm_add_ps(squareAccumulator, cVal1);
114 _mm_store_ps(squareBuffer,squareAccumulator);
115 returnValue = squareBuffer[0];
116 returnValue += squareBuffer[1];
117 returnValue += squareBuffer[2];
118 returnValue += squareBuffer[3];
120 number = sixteenthPoints * 16;
121 for(;number < num_points; number++){
122 returnValue += (*aPtr) * (*aPtr);
125 returnValue /= num_points;
126 returnValue -= (mean * mean);
127 returnValue = sqrtf(returnValue);
129 *stddev = returnValue;
136 #include <xmmintrin.h>
139 volk_32f_s32f_stddev_32f_a_sse(
float* stddev,
const float* inputBuffer,
140 const float mean,
unsigned int num_points)
142 float returnValue = 0;
144 unsigned int number = 0;
145 const unsigned int quarterPoints = num_points / 4;
147 const float* aPtr = inputBuffer;
151 __m128 squareAccumulator = _mm_setzero_ps();
152 __m128 aVal = _mm_setzero_ps();
153 for(;number < quarterPoints; number++) {
154 aVal = _mm_load_ps(aPtr);
155 aVal = _mm_mul_ps(aVal, aVal);
156 squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
159 _mm_store_ps(squareBuffer,squareAccumulator);
160 returnValue = squareBuffer[0];
161 returnValue += squareBuffer[1];
162 returnValue += squareBuffer[2];
163 returnValue += squareBuffer[3];
165 number = quarterPoints * 4;
166 for(;number < num_points; number++){
167 returnValue += (*aPtr) * (*aPtr);
170 returnValue /= num_points;
171 returnValue -= (mean * mean);
172 returnValue = sqrtf(returnValue);
174 *stddev = returnValue;
179 #ifdef LV_HAVE_GENERIC
182 volk_32f_s32f_stddev_32f_generic(
float* stddev,
const float* inputBuffer,
183 const float mean,
unsigned int num_points)
185 float returnValue = 0;
187 const float* aPtr = inputBuffer;
188 unsigned int number = 0;
190 for(number = 0; number < num_points; number++){
191 returnValue += (*aPtr) * (*aPtr);
195 returnValue /= num_points;
196 returnValue -= (mean * mean);
197 returnValue = sqrtf(returnValue);
199 *stddev = returnValue;
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:27