73 #ifndef INCLUDED_volk_32f_x2_add_32f_u_H
74 #define INCLUDED_volk_32f_x2_add_32f_u_H
80 #include <xmmintrin.h>
/**
 * Element-wise addition of two float vectors using SSE:
 * cVector[i] = aVector[i] + bVector[i] for i in [0, num_points).
 *
 * Uses unaligned loads and stores (_mm_loadu_ps / _mm_storeu_ps), so the
 * buffers do not have to be 16-byte aligned.
 *
 * @param cVector     output buffer, at least num_points floats
 * @param aVector     first input buffer, at least num_points floats
 * @param bVector     second input buffer, at least num_points floats
 * @param num_points  number of floats to add; 0 performs no reads or writes
 */
static inline void
volk_32f_x2_add_32f_u_sse(float* cVector, const float* aVector,
                          const float* bVector, unsigned int num_points)
{
  const unsigned int quarterPoints = num_points / 4;
  float* cPtr = cVector;
  const float* aPtr = aVector;
  const float* bPtr = bVector;
  unsigned int number;

  /* Vector part: four floats per iteration. */
  for (number = 0; number < quarterPoints; number++) {
    const __m128 aVal = _mm_loadu_ps(aPtr);
    const __m128 bVal = _mm_loadu_ps(bPtr);
    const __m128 cVal = _mm_add_ps(aVal, bVal);

    _mm_storeu_ps(cPtr, cVal);

    /* Step to the next group of four; without this every iteration would
     * reprocess the first four elements and the tail loop below would
     * start at the wrong offset. */
    aPtr += 4;
    bPtr += 4;
    cPtr += 4;
  }

  /* Scalar tail: the num_points % 4 elements the vector loop did not cover. */
  for (number = quarterPoints * 4; number < num_points; number++) {
    *cPtr++ = (*aPtr++) + (*bPtr++);
  }
}
116 #ifdef LV_HAVE_GENERIC
/**
 * Portable scalar element-wise addition of two float vectors:
 * cVector[i] = aVector[i] + bVector[i] for i in [0, num_points).
 *
 * @param cVector     output buffer, at least num_points floats
 * @param aVector     first input buffer, at least num_points floats
 * @param bVector     second input buffer, at least num_points floats
 * @param num_points  number of floats to add; 0 performs no reads or writes
 */
static inline void
volk_32f_x2_add_32f_generic(float* cVector, const float* aVector,
                            const float* bVector, unsigned int num_points)
{
  float* cPtr = cVector;
  const float* aPtr = aVector;
  const float* bPtr = bVector;
  unsigned int number;

  for (number = 0; number < num_points; number++) {
    *cPtr++ = (*aPtr++) + (*bPtr++);
  }
}
135 #ifndef INCLUDED_volk_32f_x2_add_32f_a_H
136 #define INCLUDED_volk_32f_x2_add_32f_a_H
142 #include <xmmintrin.h>
/**
 * Element-wise addition of two float vectors using SSE with aligned memory
 * access: cVector[i] = aVector[i] + bVector[i] for i in [0, num_points).
 *
 * Uses _mm_load_ps / _mm_store_ps, which require their address to be
 * 16-byte aligned; when num_points >= 4 all three buffers must therefore be
 * 16-byte aligned.
 *
 * @param cVector     output buffer, at least num_points floats
 * @param aVector     first input buffer, at least num_points floats
 * @param bVector     second input buffer, at least num_points floats
 * @param num_points  number of floats to add; 0 performs no reads or writes
 */
static inline void
volk_32f_x2_add_32f_a_sse(float* cVector, const float* aVector,
                          const float* bVector, unsigned int num_points)
{
  const unsigned int quarterPoints = num_points / 4;
  float* cPtr = cVector;
  const float* aPtr = aVector;
  const float* bPtr = bVector;
  unsigned int number;

  /* Vector part: four floats per iteration. */
  for (number = 0; number < quarterPoints; number++) {
    const __m128 aVal = _mm_load_ps(aPtr);
    const __m128 bVal = _mm_load_ps(bPtr);
    const __m128 cVal = _mm_add_ps(aVal, bVal);

    _mm_store_ps(cPtr, cVal);

    /* Step to the next group of four; without this every iteration would
     * reprocess the first four elements and the tail loop below would
     * start at the wrong offset. */
    aPtr += 4;
    bPtr += 4;
    cPtr += 4;
  }

  /* Scalar tail: the num_points % 4 elements the vector loop did not cover. */
  for (number = quarterPoints * 4; number < num_points; number++) {
    *cPtr++ = (*aPtr++) + (*bPtr++);
  }
}
177 #include <arm_neon.h>
/**
 * Element-wise addition of two float vectors using ARM NEON:
 * cVector[i] = aVector[i] + bVector[i] for i in [0, num_points).
 *
 * @param cVector     output buffer, at least num_points floats
 * @param aVector     first input buffer, at least num_points floats
 * @param bVector     second input buffer, at least num_points floats
 * @param num_points  number of floats to add; 0 performs no reads or writes
 */
static inline void
volk_32f_x2_add_32f_u_neon(float* cVector, const float* aVector,
                           const float* bVector, unsigned int num_points)
{
  const unsigned int quarterPoints = num_points / 4;
  float* cPtr = cVector;
  const float* aPtr = aVector;
  const float* bPtr = bVector;
  unsigned int number;

  /* Vector part: four floats per iteration. */
  for (number = 0; number < quarterPoints; number++) {
    const float32x4_t aVal = vld1q_f32(aPtr);
    const float32x4_t bVal = vld1q_f32(bPtr);

    /* Hint the cache about the next group of four inputs. */
    __builtin_prefetch(aPtr + 4);
    __builtin_prefetch(bPtr + 4);

    vst1q_f32(cPtr, vaddq_f32(aVal, bVal));

    /* Step to the next group of four; without this every iteration would
     * reprocess the first four elements and the tail loop below would
     * start at the wrong offset. */
    aPtr += 4;
    bPtr += 4;
    cPtr += 4;
  }

  /* Scalar tail: the num_points % 4 elements the vector loop did not cover. */
  for (number = quarterPoints * 4; number < num_points; number++) {
    *cPtr++ = (*aPtr++) + (*bPtr++);
  }
}
216 #ifdef LV_HAVE_GENERIC
/**
 * Portable scalar element-wise addition of two float vectors:
 * cVector[i] = aVector[i] + bVector[i] for i in [0, num_points).
 *
 * The loop is plain scalar C and places no alignment requirement on the
 * buffers beyond that of float.
 *
 * @param cVector     output buffer, at least num_points floats
 * @param aVector     first input buffer, at least num_points floats
 * @param bVector     second input buffer, at least num_points floats
 * @param num_points  number of floats to add; 0 performs no reads or writes
 */
static inline void
volk_32f_x2_add_32f_a_generic(float* cVector, const float* aVector,
                              const float* bVector, unsigned int num_points)
{
  float* cPtr = cVector;
  const float* aPtr = aVector;
  const float* bPtr = bVector;
  unsigned int number;

  for (number = 0; number < num_points; number++) {
    *cPtr++ = (*aPtr++) + (*bPtr++);
  }
}
/*
 * Prototype of the externally defined implementation that
 * volk_32f_x2_add_32f_u_orc forwards to. Takes the same
 * (output, input A, input B, element count) arguments as the other add
 * kernels in this file. The definition is not part of this file.
 * NOTE(review): presumably generated by ORC, judging by the name - confirm.
 */
extern void
volk_32f_x2_add_32f_a_orc_impl(float* cVector, const float* aVector,
                               const float* bVector, unsigned int num_points);
/**
 * Thin wrapper that forwards all arguments unchanged to
 * volk_32f_x2_add_32f_a_orc_impl.
 *
 * @param cVector     output buffer, passed through as is
 * @param aVector     first input buffer, passed through as is
 * @param bVector     second input buffer, passed through as is
 * @param num_points  element count, passed through as is
 */
static inline void
volk_32f_x2_add_32f_u_orc(float* cVector, const float* aVector,
                          const float* bVector, unsigned int num_points)
{
  volk_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}