71 #ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
72 #define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
80 #include <smmintrin.h>
83 volk_32f_stddev_and_mean_32f_x2_a_sse4_1(
float* stddev,
float* mean,
84 const float* inputBuffer,
85 unsigned int num_points)
87 float returnValue = 0;
90 unsigned int number = 0;
91 const unsigned int sixteenthPoints = num_points / 16;
93 const float* aPtr = inputBuffer;
97 __m128 accumulator = _mm_setzero_ps();
98 __m128 squareAccumulator = _mm_setzero_ps();
99 __m128 aVal1, aVal2, aVal3, aVal4;
100 __m128 cVal1, cVal2, cVal3, cVal4;
101 for(;number < sixteenthPoints; number++) {
102 aVal1 = _mm_load_ps(aPtr); aPtr += 4;
103 cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
104 accumulator = _mm_add_ps(accumulator, aVal1);
106 aVal2 = _mm_load_ps(aPtr); aPtr += 4;
107 cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
108 accumulator = _mm_add_ps(accumulator, aVal2);
110 aVal3 = _mm_load_ps(aPtr); aPtr += 4;
111 cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
112 accumulator = _mm_add_ps(accumulator, aVal3);
114 aVal4 = _mm_load_ps(aPtr); aPtr += 4;
115 cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
116 accumulator = _mm_add_ps(accumulator, aVal4);
118 cVal1 = _mm_or_ps(cVal1, cVal2);
119 cVal3 = _mm_or_ps(cVal3, cVal4);
120 cVal1 = _mm_or_ps(cVal1, cVal3);
122 squareAccumulator = _mm_add_ps(squareAccumulator, cVal1);
124 _mm_store_ps(meanBuffer,accumulator);
125 _mm_store_ps(squareBuffer,squareAccumulator);
126 newMean = meanBuffer[0];
127 newMean += meanBuffer[1];
128 newMean += meanBuffer[2];
129 newMean += meanBuffer[3];
130 returnValue = squareBuffer[0];
131 returnValue += squareBuffer[1];
132 returnValue += squareBuffer[2];
133 returnValue += squareBuffer[3];
135 number = sixteenthPoints * 16;
136 for(;number < num_points; number++){
137 returnValue += (*aPtr) * (*aPtr);
140 newMean /= num_points;
141 returnValue /= num_points;
142 returnValue -= (newMean * newMean);
143 returnValue = sqrtf(returnValue);
145 *stddev = returnValue;
152 #include <xmmintrin.h>
155 volk_32f_stddev_and_mean_32f_x2_a_sse(
float* stddev,
float* mean,
156 const float* inputBuffer,
157 unsigned int num_points)
159 float returnValue = 0;
162 unsigned int number = 0;
163 const unsigned int quarterPoints = num_points / 4;
165 const float* aPtr = inputBuffer;
169 __m128 accumulator = _mm_setzero_ps();
170 __m128 squareAccumulator = _mm_setzero_ps();
171 __m128 aVal = _mm_setzero_ps();
172 for(;number < quarterPoints; number++) {
173 aVal = _mm_load_ps(aPtr);
174 accumulator = _mm_add_ps(accumulator, aVal);
175 aVal = _mm_mul_ps(aVal, aVal);
176 squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
179 _mm_store_ps(meanBuffer,accumulator);
180 _mm_store_ps(squareBuffer,squareAccumulator);
181 newMean = meanBuffer[0];
182 newMean += meanBuffer[1];
183 newMean += meanBuffer[2];
184 newMean += meanBuffer[3];
185 returnValue = squareBuffer[0];
186 returnValue += squareBuffer[1];
187 returnValue += squareBuffer[2];
188 returnValue += squareBuffer[3];
190 number = quarterPoints * 4;
191 for(;number < num_points; number++){
192 returnValue += (*aPtr) * (*aPtr);
195 newMean /= num_points;
196 returnValue /= num_points;
197 returnValue -= (newMean * newMean);
198 returnValue = sqrtf(returnValue);
200 *stddev = returnValue;
206 #ifdef LV_HAVE_GENERIC
209 volk_32f_stddev_and_mean_32f_x2_generic(
float* stddev,
float* mean,
210 const float* inputBuffer,
211 unsigned int num_points)
213 float returnValue = 0;
216 const float* aPtr = inputBuffer;
217 unsigned int number = 0;
219 for(number = 0; number < num_points; number++){
220 returnValue += (*aPtr) * (*aPtr);
223 newMean /= num_points;
224 returnValue /= num_points;
225 returnValue -= (newMean * newMean);
226 returnValue = sqrtf(returnValue);
228 *stddev = returnValue;
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:27