Vector Optimized Library of Kernels  3.1.2
Architecture-tuned implementations of math kernels
volk_32f_x2_subtract_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
58 #ifndef INCLUDED_volk_32f_x2_subtract_32f_a_H
59 #define INCLUDED_volk_32f_x2_subtract_32f_a_H
60 
61 #include <inttypes.h>
62 #include <stdio.h>
63 
64 
65 #ifdef LV_HAVE_GENERIC
66 
/*
 * Portable scalar kernel: element-wise single-precision subtraction.
 *
 * Writes cVector[i] = aVector[i] - bVector[i] for every i in [0, num_points).
 *
 * cVector    - output buffer; receives num_points floats.
 * aVector    - minuend input; num_points floats are read.
 * bVector    - subtrahend input; num_points floats are read.
 * num_points - number of elements to process; 0 leaves cVector untouched.
 */
static inline void volk_32f_x2_subtract_32f_generic(float* cVector,
                                                    const float* aVector,
                                                    const float* bVector,
                                                    unsigned int num_points)
{
    for (unsigned int number = 0; number < num_points; number++) {
        *cVector++ = (*aVector++) - (*bVector++);
    }
}
76 #endif /* LV_HAVE_GENERIC */
77 
78 
79 #ifdef LV_HAVE_AVX512F
80 #include <immintrin.h>
81 
/*
 * AVX-512F kernel (aligned variant): cVector[i] = aVector[i] - bVector[i].
 *
 * Processes 16 floats per iteration with _mm512_load_ps/_mm512_store_ps, so
 * all three pointers must satisfy the 64-byte alignment those intrinsics
 * require. The trailing num_points % 16 elements are handed to the scalar
 * generic kernel.
 *
 * cVector    - output buffer; receives num_points floats.
 * aVector    - minuend input.
 * bVector    - subtrahend input.
 * num_points - number of elements to process.
 */
static inline void volk_32f_x2_subtract_32f_a_avx512f(float* cVector,
                                                      const float* aVector,
                                                      const float* bVector,
                                                      unsigned int num_points)
{
    /* Number of complete 16-float vectors. */
    const unsigned int sixteenthPoints = num_points / 16;

    for (unsigned int number = 0; number < sixteenthPoints; number++) {
        __m512 aVal = _mm512_load_ps(aVector);
        __m512 bVal = _mm512_load_ps(bVector);

        __m512 cVal = _mm512_sub_ps(aVal, bVal);

        _mm512_store_ps(cVector, cVal); // Store the results back into the C container

        aVector += 16;
        bVector += 16;
        cVector += 16;
    }

    /* The pointers now sit just past the vectorised part; finish the tail. */
    volk_32f_x2_subtract_32f_generic(
        cVector, aVector, bVector, num_points - sixteenthPoints * 16);
}
105 #endif /* LV_HAVE_AVX512F */
106 
107 #ifdef LV_HAVE_AVX
108 #include <immintrin.h>
109 
/*
 * AVX kernel (aligned variant): cVector[i] = aVector[i] - bVector[i].
 *
 * Processes 8 floats per iteration with _mm256_load_ps/_mm256_store_ps, so
 * all three pointers must satisfy the 32-byte alignment those intrinsics
 * require. The trailing num_points % 8 elements are handed to the scalar
 * generic kernel.
 *
 * cVector    - output buffer; receives num_points floats.
 * aVector    - minuend input.
 * bVector    - subtrahend input.
 * num_points - number of elements to process.
 */
static inline void volk_32f_x2_subtract_32f_a_avx(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    /* Number of complete 8-float vectors. */
    const unsigned int eighthPoints = num_points / 8;

    for (unsigned int number = 0; number < eighthPoints; number++) {
        __m256 aVal = _mm256_load_ps(aVector);
        __m256 bVal = _mm256_load_ps(bVector);

        __m256 cVal = _mm256_sub_ps(aVal, bVal);

        _mm256_store_ps(cVector, cVal); // Store the results back into the C container

        aVector += 8;
        bVector += 8;
        cVector += 8;
    }

    /* The pointers now sit just past the vectorised part; finish the tail. */
    volk_32f_x2_subtract_32f_generic(
        cVector, aVector, bVector, num_points - eighthPoints * 8);
}
133 #endif /* LV_HAVE_AVX */
134 
135 #ifdef LV_HAVE_SSE
136 #include <xmmintrin.h>
137 
/*
 * SSE kernel (aligned variant): cVector[i] = aVector[i] - bVector[i].
 *
 * Processes 4 floats per iteration with _mm_load_ps/_mm_store_ps, so all
 * three pointers must satisfy the 16-byte alignment those intrinsics
 * require. The trailing num_points % 4 elements are handed to the scalar
 * generic kernel.
 *
 * cVector    - output buffer; receives num_points floats.
 * aVector    - minuend input.
 * bVector    - subtrahend input.
 * num_points - number of elements to process.
 */
static inline void volk_32f_x2_subtract_32f_a_sse(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    /* Number of complete 4-float vectors. */
    const unsigned int quarterPoints = num_points / 4;

    for (unsigned int number = 0; number < quarterPoints; number++) {
        __m128 aVal = _mm_load_ps(aVector);
        __m128 bVal = _mm_load_ps(bVector);

        __m128 cVal = _mm_sub_ps(aVal, bVal);

        _mm_store_ps(cVector, cVal); // Store the results back into the C container

        aVector += 4;
        bVector += 4;
        cVector += 4;
    }

    /* The pointers now sit just past the vectorised part; finish the tail. */
    volk_32f_x2_subtract_32f_generic(
        cVector, aVector, bVector, num_points - quarterPoints * 4);
}
161 #endif /* LV_HAVE_SSE */
162 
163 
164 #ifdef LV_HAVE_NEON
165 #include <arm_neon.h>
166 
/*
 * NEON kernel: cVector[i] = aVector[i] - bVector[i].
 *
 * Processes 4 floats per iteration using vld1q_f32 / vsubq_f32 / vst1q_f32.
 * The trailing num_points % 4 elements are handed to the scalar generic
 * kernel.
 *
 * cVector    - output buffer; receives num_points floats.
 * aVector    - minuend input.
 * bVector    - subtrahend input.
 * num_points - number of elements to process.
 */
static inline void volk_32f_x2_subtract_32f_neon(float* cVector,
                                                 const float* aVector,
                                                 const float* bVector,
                                                 unsigned int num_points)
{
    /* Number of complete 4-float vectors. */
    const unsigned int quarterPoints = num_points / 4;

    for (unsigned int number = 0; number < quarterPoints; number++) {
        float32x4_t a_vec = vld1q_f32(aVector);
        float32x4_t b_vec = vld1q_f32(bVector);

        float32x4_t c_vec = vsubq_f32(a_vec, b_vec);

        vst1q_f32(cVector, c_vec);

        aVector += 4;
        bVector += 4;
        cVector += 4;
    }

    /* The pointers now sit just past the vectorised part; finish the tail. */
    volk_32f_x2_subtract_32f_generic(
        cVector, aVector, bVector, num_points - quarterPoints * 4);
}
190 #endif /* LV_HAVE_NEON */
191 
192 
193 #ifdef LV_HAVE_ORC
/*
 * Externally defined subtraction kernel. Only this declaration is visible
 * here; presumably the definition is produced by the ORC compiler from an
 * .orc source -- confirm against the build.
 */
extern void volk_32f_x2_subtract_32f_a_orc_impl(float* cVector,
                                                const float* aVector,
                                                const float* bVector,
                                                int num_points);

/*
 * Thin wrapper that forwards all arguments to
 * volk_32f_x2_subtract_32f_a_orc_impl.
 *
 * cVector    - output buffer, forwarded unchanged.
 * aVector    - minuend input, forwarded unchanged.
 * bVector    - subtrahend input, forwarded unchanged.
 * num_points - element count; narrowed to int for the callee.
 *
 * NOTE(review): the wrapper is named "_u_" but calls the "_a_" impl and sits
 * in the section guarded by the "_a_H" include guard -- confirm this is
 * intentional.
 */
static inline void volk_32f_x2_subtract_32f_u_orc(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    /* The impl takes a signed count; counts above INT_MAX are not
     * representable and would be converted in an implementation-defined way. */
    volk_32f_x2_subtract_32f_a_orc_impl(cVector, aVector, bVector, (int)num_points);
}
206 #endif /* LV_HAVE_ORC */
207 
208 
209 #endif /* INCLUDED_volk_32f_x2_subtract_32f_a_H */
210 
211 
212 #ifndef INCLUDED_volk_32f_x2_subtract_32f_u_H
213 #define INCLUDED_volk_32f_x2_subtract_32f_u_H
214 
215 #include <inttypes.h>
216 #include <stdio.h>
217 
218 #ifdef LV_HAVE_AVX512F
219 #include <immintrin.h>
220 
/*
 * AVX-512F kernel (unaligned variant): cVector[i] = aVector[i] - bVector[i].
 *
 * Processes 16 floats per iteration with the unaligned intrinsics
 * _mm512_loadu_ps/_mm512_storeu_ps. The trailing num_points % 16 elements
 * are handed to the scalar generic kernel.
 *
 * cVector    - output buffer; receives num_points floats.
 * aVector    - minuend input.
 * bVector    - subtrahend input.
 * num_points - number of elements to process.
 */
static inline void volk_32f_x2_subtract_32f_u_avx512f(float* cVector,
                                                      const float* aVector,
                                                      const float* bVector,
                                                      unsigned int num_points)
{
    /* Number of complete 16-float vectors. */
    const unsigned int sixteenthPoints = num_points / 16;

    for (unsigned int number = 0; number < sixteenthPoints; number++) {
        __m512 aVal = _mm512_loadu_ps(aVector);
        __m512 bVal = _mm512_loadu_ps(bVector);

        __m512 cVal = _mm512_sub_ps(aVal, bVal);

        _mm512_storeu_ps(cVector, cVal); // Store the results back into the C container

        aVector += 16;
        bVector += 16;
        cVector += 16;
    }

    /* The pointers now sit just past the vectorised part; finish the tail. */
    volk_32f_x2_subtract_32f_generic(
        cVector, aVector, bVector, num_points - sixteenthPoints * 16);
}
244 #endif /* LV_HAVE_AVX512F */
245 
246 
247 #ifdef LV_HAVE_AVX
248 #include <immintrin.h>
249 
/*
 * AVX kernel (unaligned variant): cVector[i] = aVector[i] - bVector[i].
 *
 * Processes 8 floats per iteration with the unaligned intrinsics
 * _mm256_loadu_ps/_mm256_storeu_ps. The trailing num_points % 8 elements
 * are handed to the scalar generic kernel.
 *
 * cVector    - output buffer; receives num_points floats.
 * aVector    - minuend input.
 * bVector    - subtrahend input.
 * num_points - number of elements to process.
 */
static inline void volk_32f_x2_subtract_32f_u_avx(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    /* Number of complete 8-float vectors. */
    const unsigned int eighthPoints = num_points / 8;

    for (unsigned int number = 0; number < eighthPoints; number++) {
        __m256 aVal = _mm256_loadu_ps(aVector);
        __m256 bVal = _mm256_loadu_ps(bVector);

        __m256 cVal = _mm256_sub_ps(aVal, bVal);

        _mm256_storeu_ps(cVector, cVal); // Store the results back into the C container

        aVector += 8;
        bVector += 8;
        cVector += 8;
    }

    /* The pointers now sit just past the vectorised part; finish the tail. */
    volk_32f_x2_subtract_32f_generic(
        cVector, aVector, bVector, num_points - eighthPoints * 8);
}
273 #endif /* LV_HAVE_AVX */
274 
275 #endif /* INCLUDED_volk_32f_x2_subtract_32f_u_H */
static void volk_32f_x2_subtract_32f_a_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_subtract_32f.h:110
static void volk_32f_x2_subtract_32f_neon(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_subtract_32f.h:167
static void volk_32f_x2_subtract_32f_generic(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_subtract_32f.h:67
static void volk_32f_x2_subtract_32f_a_sse(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_subtract_32f.h:138
static void volk_32f_x2_subtract_32f_u_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_subtract_32f.h:250