61 #ifndef INCLUDED_volk_32f_x2_add_32f_u_H
62 #define INCLUDED_volk_32f_x2_add_32f_u_H
67 #ifdef LV_HAVE_AVX512F
68 #include <immintrin.h>
/*!
 * \brief Element-wise sum of two float buffers: cVector[i] = aVector[i] + bVector[i].
 *
 * Processes 16 floats per iteration with AVX-512F unaligned loads/stores
 * (_mm512_loadu_ps / _mm512_storeu_ps), then finishes the remaining
 * num_points % 16 elements with a scalar loop.
 *
 * \param cVector    Output buffer; num_points floats are written.
 * \param aVector    First input buffer; num_points floats are read.
 * \param bVector    Second input buffer; num_points floats are read.
 * \param num_points Number of elements to add.
 */
static inline void volk_32f_x2_add_32f_u_avx512f(float* cVector,
                                                 const float* aVector,
                                                 const float* bVector,
                                                 unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m512 aVal, bVal, cVal;
    for (; number < sixteenthPoints; number++) {
        aVal = _mm512_loadu_ps(aPtr);
        bVal = _mm512_loadu_ps(bPtr);

        cVal = _mm512_add_ps(aVal, bVal);

        _mm512_storeu_ps(cPtr, cVal);

        /* Step past the 16 floats just handled; the scalar tail below relies
         * on the pointers sitting at the first unprocessed element. */
        aPtr += 16;
        bPtr += 16;
        cPtr += 16;
    }

    /* Scalar tail: elements that did not fill a whole 16-wide vector. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}
108 #include <immintrin.h>
/*!
 * \brief Element-wise sum of two float buffers: cVector[i] = aVector[i] + bVector[i].
 *
 * Processes 8 floats per iteration with AVX unaligned loads/stores
 * (_mm256_loadu_ps / _mm256_storeu_ps), then finishes the remaining
 * num_points % 8 elements with a scalar loop.
 *
 * \param cVector    Output buffer; num_points floats are written.
 * \param aVector    First input buffer; num_points floats are read.
 * \param bVector    Second input buffer; num_points floats are read.
 * \param num_points Number of elements to add.
 */
static inline void volk_32f_x2_add_32f_u_avx(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;
    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;
    __m256 aVal, bVal, cVal;
    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        bVal = _mm256_loadu_ps(bPtr);

        cVal = _mm256_add_ps(aVal, bVal);

        _mm256_storeu_ps(cPtr, cVal);

        /* Step past the 8 floats just handled; the scalar tail below relies
         * on the pointers sitting at the first unprocessed element. */
        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail: elements that did not fill a whole 8-wide vector. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}
145 #include <xmmintrin.h>
/*!
 * \brief Element-wise sum of two float buffers: cVector[i] = aVector[i] + bVector[i].
 *
 * Processes 4 floats per iteration with SSE unaligned loads/stores
 * (_mm_loadu_ps / _mm_storeu_ps), then finishes the remaining
 * num_points % 4 elements with a scalar loop.
 *
 * \param cVector    Output buffer; num_points floats are written.
 * \param aVector    First input buffer; num_points floats are read.
 * \param bVector    Second input buffer; num_points floats are read.
 * \param num_points Number of elements to add.
 */
static inline void volk_32f_x2_add_32f_u_sse(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m128 aVal, bVal, cVal;
    for (; number < quarterPoints; number++) {
        aVal = _mm_loadu_ps(aPtr);
        bVal = _mm_loadu_ps(bPtr);

        cVal = _mm_add_ps(aVal, bVal);

        _mm_storeu_ps(cPtr, cVal);

        /* Step past the 4 floats just handled; the scalar tail below relies
         * on the pointers sitting at the first unprocessed element. */
        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail: elements that did not fill a whole 4-wide vector. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}
182 #ifdef LV_HAVE_GENERIC
/*!
 * \brief Portable scalar element-wise sum: cVector[i] = aVector[i] + bVector[i].
 *
 * Uses no SIMD intrinsics; one float is added per loop iteration.
 *
 * \param cVector    Output buffer; num_points floats are written.
 * \param aVector    First input buffer; num_points floats are read.
 * \param bVector    Second input buffer; num_points floats are read.
 * \param num_points Number of elements to add; 0 leaves cVector untouched.
 */
static inline void volk_32f_x2_add_32f_generic(float* cVector,
                                               const float* aVector,
                                               const float* bVector,
                                               unsigned int num_points)
{
    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}
202 #ifndef INCLUDED_volk_32f_x2_add_32f_a_H
203 #define INCLUDED_volk_32f_x2_add_32f_a_H
205 #include <inttypes.h>
208 #ifdef LV_HAVE_AVX512F
209 #include <immintrin.h>
/*!
 * \brief Element-wise sum of two float buffers: cVector[i] = aVector[i] + bVector[i].
 *
 * Processes 16 floats per iteration with AVX-512F aligned loads/stores
 * (_mm512_load_ps / _mm512_store_ps), then finishes the remaining
 * num_points % 16 elements with a scalar loop. The aligned intrinsics require
 * all three buffers to start on a 64-byte boundary.
 *
 * \param cVector    Output buffer (64-byte aligned); num_points floats are written.
 * \param aVector    First input buffer (64-byte aligned); num_points floats are read.
 * \param bVector    Second input buffer (64-byte aligned); num_points floats are read.
 * \param num_points Number of elements to add.
 */
static inline void volk_32f_x2_add_32f_a_avx512f(float* cVector,
                                                 const float* aVector,
                                                 const float* bVector,
                                                 unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m512 aVal, bVal, cVal;
    for (; number < sixteenthPoints; number++) {
        aVal = _mm512_load_ps(aPtr);
        bVal = _mm512_load_ps(bPtr);

        cVal = _mm512_add_ps(aVal, bVal);

        _mm512_store_ps(cPtr, cVal);

        /* Step past the 16 floats just handled; this also keeps the pointers
         * 64-byte aligned for the next aligned load/store. */
        aPtr += 16;
        bPtr += 16;
        cPtr += 16;
    }

    /* Scalar tail: elements that did not fill a whole 16-wide vector. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}
249 #include <immintrin.h>
/*!
 * \brief Element-wise sum of two float buffers: cVector[i] = aVector[i] + bVector[i].
 *
 * Processes 8 floats per iteration with AVX aligned loads/stores
 * (_mm256_load_ps / _mm256_store_ps), then finishes the remaining
 * num_points % 8 elements with a scalar loop. The aligned intrinsics require
 * all three buffers to start on a 32-byte boundary.
 *
 * \param cVector    Output buffer (32-byte aligned); num_points floats are written.
 * \param aVector    First input buffer (32-byte aligned); num_points floats are read.
 * \param bVector    Second input buffer (32-byte aligned); num_points floats are read.
 * \param num_points Number of elements to add.
 */
static inline void volk_32f_x2_add_32f_a_avx(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m256 aVal, bVal, cVal;
    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        bVal = _mm256_load_ps(bPtr);

        cVal = _mm256_add_ps(aVal, bVal);

        _mm256_store_ps(cPtr, cVal);

        /* Step past the 8 floats just handled; this also keeps the pointers
         * 32-byte aligned for the next aligned load/store. */
        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail: elements that did not fill a whole 8-wide vector. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}
286 #include <xmmintrin.h>
/*!
 * \brief Element-wise sum of two float buffers: cVector[i] = aVector[i] + bVector[i].
 *
 * Processes 4 floats per iteration with SSE aligned loads/stores
 * (_mm_load_ps / _mm_store_ps), then finishes the remaining
 * num_points % 4 elements with a scalar loop. The aligned intrinsics require
 * all three buffers to start on a 16-byte boundary.
 *
 * \param cVector    Output buffer (16-byte aligned); num_points floats are written.
 * \param aVector    First input buffer (16-byte aligned); num_points floats are read.
 * \param bVector    Second input buffer (16-byte aligned); num_points floats are read.
 * \param num_points Number of elements to add.
 */
static inline void volk_32f_x2_add_32f_a_sse(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m128 aVal, bVal, cVal;
    for (; number < quarterPoints; number++) {
        aVal = _mm_load_ps(aPtr);
        bVal = _mm_load_ps(bPtr);

        cVal = _mm_add_ps(aVal, bVal);

        _mm_store_ps(cPtr, cVal);

        /* Step past the 4 floats just handled; this also keeps the pointers
         * 16-byte aligned for the next aligned load/store. */
        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail: elements that did not fill a whole 4-wide vector. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}
323 #include <arm_neon.h>
/*!
 * \brief Element-wise sum of two float buffers: cVector[i] = aVector[i] + bVector[i].
 *
 * Processes 4 floats per iteration with NEON (vld1q_f32 / vaddq_f32 /
 * vst1q_f32), then finishes the remaining num_points % 4 elements with a
 * scalar loop.
 *
 * \param cVector    Output buffer; num_points floats are written.
 * \param aVector    First input buffer; num_points floats are read.
 * \param bVector    Second input buffer; num_points floats are read.
 * \param num_points Number of elements to add.
 */
static inline void volk_32f_x2_add_32f_u_neon(float* cVector,
                                              const float* aVector,
                                              const float* bVector,
                                              unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;
    float32x4_t aVal, bVal, cVal;
    for (number = 0; number < quarterPoints; number++) {
        aVal = vld1q_f32(aPtr);
        bVal = vld1q_f32(bPtr);
        /* NOTE(review): the file's cross-references mention __VOLK_PREFETCH;
         * a prefetch hint for the next vectors may belong here - confirm
         * against the upstream header. It would not change the results. */

        cVal = vaddq_f32(aVal, bVal);

        vst1q_f32(cPtr, cVal);

        /* Step past the 4 floats just handled; the scalar tail below relies
         * on the pointers sitting at the first unprocessed element. */
        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail: elements that did not fill a whole 4-wide vector. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}
362 #ifdef LV_HAVE_NEONV7
/*!
 * \brief Declaration of a kernel implemented outside this header.
 *
 * Takes the same parameter list as the inline add kernels in this file.
 * NOTE(review): presumably computes cVector[i] = aVector[i] + bVector[i] in
 * hand-written NEON assembly - confirm against the implementation.
 *
 * \param cVector    Output buffer.
 * \param aVector    First input buffer.
 * \param bVector    Second input buffer.
 * \param num_points Number of elements.
 */
extern void volk_32f_x2_add_32f_a_neonasm(float* cVector,
                                          const float* aVector,
                                          const float* bVector,
                                          unsigned int num_points);
369 #ifdef LV_HAVE_NEONV7
/*!
 * \brief Declaration of a kernel implemented outside this header.
 *
 * Takes the same parameter list as the inline add kernels in this file.
 * NOTE(review): presumably a software-pipelined NEON assembly variant of
 * cVector[i] = aVector[i] + bVector[i] - confirm against the implementation.
 *
 * \param cVector    Output buffer.
 * \param aVector    First input buffer.
 * \param bVector    Second input buffer.
 * \param num_points Number of elements.
 */
extern void volk_32f_x2_add_32f_a_neonpipeline(float* cVector,
                                               const float* aVector,
                                               const float* bVector,
                                               unsigned int num_points);
/*!
 * \brief Declaration of the externally defined ORC implementation that
 *        volk_32f_x2_add_32f_u_orc forwards to.
 *
 * The definition is not part of this header.
 *
 * \param cVector    Output buffer.
 * \param aVector    First input buffer.
 * \param bVector    Second input buffer.
 * \param num_points Number of elements.
 */
extern void volk_32f_x2_add_32f_a_orc_impl(float* cVector,
                                           const float* aVector,
                                           const float* bVector,
                                           unsigned int num_points);
/*!
 * \brief Thin wrapper that forwards all arguments unchanged to
 *        volk_32f_x2_add_32f_a_orc_impl.
 *
 * \param cVector    Output buffer, passed through.
 * \param aVector    First input buffer, passed through.
 * \param bVector    Second input buffer, passed through.
 * \param num_points Number of elements, passed through.
 */
static inline void volk_32f_x2_add_32f_u_orc(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points)
{
    volk_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
static void volk_32f_x2_add_32f_u_neon(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_add_32f.h:325
static void volk_32f_x2_add_32f_generic(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_add_32f.h:184
static void volk_32f_x2_add_32f_u_sse(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_add_32f.h:147
static void volk_32f_x2_add_32f_a_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_add_32f.h:251
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:68
static void volk_32f_x2_add_32f_u_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_add_32f.h:110
static void volk_32f_x2_add_32f_a_sse(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_add_32f.h:288