Vector Optimized Library of Kernels  3.1.2
Architecture-tuned implementations of math kernels
volk_8i_s32f_convert_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
41 #ifndef INCLUDED_volk_8i_s32f_convert_32f_u_H
42 #define INCLUDED_volk_8i_s32f_convert_32f_u_H
43 
44 #include <inttypes.h>
45 #include <stdio.h>
46 
47 #ifdef LV_HAVE_AVX2
48 #include <immintrin.h>
49 
50 static inline void volk_8i_s32f_convert_32f_u_avx2(float* outputVector,
51  const int8_t* inputVector,
52  const float scalar,
53  unsigned int num_points)
54 {
55  unsigned int number = 0;
56  const unsigned int sixteenthPoints = num_points / 16;
57 
58  float* outputVectorPtr = outputVector;
59  const float iScalar = 1.0 / scalar;
60  __m256 invScalar = _mm256_set1_ps(iScalar);
61  const int8_t* inputVectorPtr = inputVector;
62  __m256 ret;
63  __m128i inputVal128;
64  __m256i interimVal;
65 
66  for (; number < sixteenthPoints; number++) {
67  inputVal128 = _mm_loadu_si128((__m128i*)inputVectorPtr);
68 
69  interimVal = _mm256_cvtepi8_epi32(inputVal128);
70  ret = _mm256_cvtepi32_ps(interimVal);
71  ret = _mm256_mul_ps(ret, invScalar);
72  _mm256_storeu_ps(outputVectorPtr, ret);
73  outputVectorPtr += 8;
74 
75  inputVal128 = _mm_srli_si128(inputVal128, 8);
76  interimVal = _mm256_cvtepi8_epi32(inputVal128);
77  ret = _mm256_cvtepi32_ps(interimVal);
78  ret = _mm256_mul_ps(ret, invScalar);
79  _mm256_storeu_ps(outputVectorPtr, ret);
80  outputVectorPtr += 8;
81 
82  inputVectorPtr += 16;
83  }
84 
85  number = sixteenthPoints * 16;
86  for (; number < num_points; number++) {
87  outputVector[number] = (float)(inputVector[number]) * iScalar;
88  }
89 }
90 #endif /* LV_HAVE_AVX2 */
91 
92 
93 #ifdef LV_HAVE_SSE4_1
94 #include <smmintrin.h>
95 
96 static inline void volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector,
97  const int8_t* inputVector,
98  const float scalar,
99  unsigned int num_points)
100 {
101  unsigned int number = 0;
102  const unsigned int sixteenthPoints = num_points / 16;
103 
104  float* outputVectorPtr = outputVector;
105  const float iScalar = 1.0 / scalar;
106  __m128 invScalar = _mm_set_ps1(iScalar);
107  const int8_t* inputVectorPtr = inputVector;
108  __m128 ret;
109  __m128i inputVal;
110  __m128i interimVal;
111 
112  for (; number < sixteenthPoints; number++) {
113  inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr);
114 
115  interimVal = _mm_cvtepi8_epi32(inputVal);
116  ret = _mm_cvtepi32_ps(interimVal);
117  ret = _mm_mul_ps(ret, invScalar);
118  _mm_storeu_ps(outputVectorPtr, ret);
119  outputVectorPtr += 4;
120 
121  inputVal = _mm_srli_si128(inputVal, 4);
122  interimVal = _mm_cvtepi8_epi32(inputVal);
123  ret = _mm_cvtepi32_ps(interimVal);
124  ret = _mm_mul_ps(ret, invScalar);
125  _mm_storeu_ps(outputVectorPtr, ret);
126  outputVectorPtr += 4;
127 
128  inputVal = _mm_srli_si128(inputVal, 4);
129  interimVal = _mm_cvtepi8_epi32(inputVal);
130  ret = _mm_cvtepi32_ps(interimVal);
131  ret = _mm_mul_ps(ret, invScalar);
132  _mm_storeu_ps(outputVectorPtr, ret);
133  outputVectorPtr += 4;
134 
135  inputVal = _mm_srli_si128(inputVal, 4);
136  interimVal = _mm_cvtepi8_epi32(inputVal);
137  ret = _mm_cvtepi32_ps(interimVal);
138  ret = _mm_mul_ps(ret, invScalar);
139  _mm_storeu_ps(outputVectorPtr, ret);
140  outputVectorPtr += 4;
141 
142  inputVectorPtr += 16;
143  }
144 
145  number = sixteenthPoints * 16;
146  for (; number < num_points; number++) {
147  outputVector[number] = (float)(inputVector[number]) * iScalar;
148  }
149 }
150 #endif /* LV_HAVE_SSE4_1 */
151 
152 #ifdef LV_HAVE_GENERIC
153 
/*!
 * \brief Portable reference kernel: converts signed 8-bit samples to float
 *        and multiplies each one by 1/scalar.
 *
 * \param outputVector Destination buffer; num_points floats are written.
 * \param inputVector  Source buffer; num_points int8_t values are read.
 * \param scalar       Divisor; each sample is multiplied by its reciprocal.
 * \param num_points   Number of samples to convert.
 */
static inline void volk_8i_s32f_convert_32f_generic(float* outputVector,
                                                    const int8_t* inputVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    /* The reciprocal is computed once so the loop only multiplies. */
    const float reciprocal = 1.0 / scalar;
    unsigned int idx;

    for (idx = 0; idx < num_points; ++idx) {
        const float sample = (float)inputVector[idx];
        outputVector[idx] = sample * reciprocal;
    }
}
168 #endif /* LV_HAVE_GENERIC */
169 
170 
171 #endif /* INCLUDED_volk_8i_s32f_convert_32f_u_H */
172 
173 #ifndef INCLUDED_volk_8i_s32f_convert_32f_a_H
174 #define INCLUDED_volk_8i_s32f_convert_32f_a_H
175 
176 #include <inttypes.h>
177 #include <stdio.h>
178 
179 #ifdef LV_HAVE_AVX2
180 #include <immintrin.h>
181 
182 static inline void volk_8i_s32f_convert_32f_a_avx2(float* outputVector,
183  const int8_t* inputVector,
184  const float scalar,
185  unsigned int num_points)
186 {
187  unsigned int number = 0;
188  const unsigned int sixteenthPoints = num_points / 16;
189 
190  float* outputVectorPtr = outputVector;
191  const float iScalar = 1.0 / scalar;
192  __m256 invScalar = _mm256_set1_ps(iScalar);
193  const int8_t* inputVectorPtr = inputVector;
194  __m256 ret;
195  __m128i inputVal128;
196  __m256i interimVal;
197 
198  for (; number < sixteenthPoints; number++) {
199  inputVal128 = _mm_load_si128((__m128i*)inputVectorPtr);
200 
201  interimVal = _mm256_cvtepi8_epi32(inputVal128);
202  ret = _mm256_cvtepi32_ps(interimVal);
203  ret = _mm256_mul_ps(ret, invScalar);
204  _mm256_store_ps(outputVectorPtr, ret);
205  outputVectorPtr += 8;
206 
207  inputVal128 = _mm_srli_si128(inputVal128, 8);
208  interimVal = _mm256_cvtepi8_epi32(inputVal128);
209  ret = _mm256_cvtepi32_ps(interimVal);
210  ret = _mm256_mul_ps(ret, invScalar);
211  _mm256_store_ps(outputVectorPtr, ret);
212  outputVectorPtr += 8;
213 
214  inputVectorPtr += 16;
215  }
216 
217  number = sixteenthPoints * 16;
218  for (; number < num_points; number++) {
219  outputVector[number] = (float)(inputVector[number]) * iScalar;
220  }
221 }
222 #endif /* LV_HAVE_AVX2 */
223 
224 #ifdef LV_HAVE_SSE4_1
225 #include <smmintrin.h>
226 
227 static inline void volk_8i_s32f_convert_32f_a_sse4_1(float* outputVector,
228  const int8_t* inputVector,
229  const float scalar,
230  unsigned int num_points)
231 {
232  unsigned int number = 0;
233  const unsigned int sixteenthPoints = num_points / 16;
234 
235  float* outputVectorPtr = outputVector;
236  const float iScalar = 1.0 / scalar;
237  __m128 invScalar = _mm_set_ps1(iScalar);
238  const int8_t* inputVectorPtr = inputVector;
239  __m128 ret;
240  __m128i inputVal;
241  __m128i interimVal;
242 
243  for (; number < sixteenthPoints; number++) {
244  inputVal = _mm_load_si128((__m128i*)inputVectorPtr);
245 
246  interimVal = _mm_cvtepi8_epi32(inputVal);
247  ret = _mm_cvtepi32_ps(interimVal);
248  ret = _mm_mul_ps(ret, invScalar);
249  _mm_store_ps(outputVectorPtr, ret);
250  outputVectorPtr += 4;
251 
252  inputVal = _mm_srli_si128(inputVal, 4);
253  interimVal = _mm_cvtepi8_epi32(inputVal);
254  ret = _mm_cvtepi32_ps(interimVal);
255  ret = _mm_mul_ps(ret, invScalar);
256  _mm_store_ps(outputVectorPtr, ret);
257  outputVectorPtr += 4;
258 
259  inputVal = _mm_srli_si128(inputVal, 4);
260  interimVal = _mm_cvtepi8_epi32(inputVal);
261  ret = _mm_cvtepi32_ps(interimVal);
262  ret = _mm_mul_ps(ret, invScalar);
263  _mm_store_ps(outputVectorPtr, ret);
264  outputVectorPtr += 4;
265 
266  inputVal = _mm_srli_si128(inputVal, 4);
267  interimVal = _mm_cvtepi8_epi32(inputVal);
268  ret = _mm_cvtepi32_ps(interimVal);
269  ret = _mm_mul_ps(ret, invScalar);
270  _mm_store_ps(outputVectorPtr, ret);
271  outputVectorPtr += 4;
272 
273  inputVectorPtr += 16;
274  }
275 
276  number = sixteenthPoints * 16;
277  for (; number < num_points; number++) {
278  outputVector[number] = (float)(inputVector[number]) * iScalar;
279  }
280 }
281 #endif /* LV_HAVE_SSE4_1 */
282 
283 #ifdef LV_HAVE_NEON
284 #include <arm_neon.h>
285 
/*!
 * \brief Converts signed 8-bit samples to float and scales them by 1/scalar
 *        using NEON intrinsics.
 *
 * \param outputVector Destination buffer; num_points floats are written.
 * \param inputVector  Source buffer; num_points int8_t values are read.
 * \param scalar       Divisor; each sample is multiplied by its reciprocal.
 * \param num_points   Number of samples to convert.
 */
static inline void volk_8i_s32f_convert_32f_neon(float* outputVector,
                                                 const int8_t* inputVector,
                                                 const float scalar,
                                                 unsigned int num_points)
{
    const float reciprocal = 1.0 / scalar;
    const float32x4_t vReciprocal = vdupq_n_f32(reciprocal);
    const unsigned int blockCount = num_points / 16;

    const int8_t* src = inputVector;
    float* dst = outputVector;
    unsigned int block;
    unsigned int idx;

    /* Each block widens 16 bytes: s8 -> s16 -> s32 -> f32, then scales. */
    for (block = 0; block < blockCount; ++block) {
        const int8x16_t bytes = vld1q_s8(src);

        const int16x8_t wideLo = vmovl_s8(vget_low_s8(bytes));
        const int16x8_t wideHi = vmovl_s8(vget_high_s8(bytes));

        const int32x4_t quad0 = vmovl_s16(vget_low_s16(wideLo));
        const int32x4_t quad1 = vmovl_s16(vget_high_s16(wideLo));
        const int32x4_t quad2 = vmovl_s16(vget_low_s16(wideHi));
        const int32x4_t quad3 = vmovl_s16(vget_high_s16(wideHi));

        vst1q_f32(dst + 0, vmulq_f32(vcvtq_f32_s32(quad0), vReciprocal));
        vst1q_f32(dst + 4, vmulq_f32(vcvtq_f32_s32(quad1), vReciprocal));
        vst1q_f32(dst + 8, vmulq_f32(vcvtq_f32_s32(quad2), vReciprocal));
        vst1q_f32(dst + 12, vmulq_f32(vcvtq_f32_s32(quad3), vReciprocal));

        src += 16;
        dst += 16;
    }

    /* Scalar tail for the remaining (num_points % 16) samples. */
    for (idx = blockCount * 16; idx < num_points; ++idx) {
        outputVector[idx] = ((float)inputVector[idx]) * reciprocal;
    }
}
334 
335 #endif /* LV_HAVE_NEON */
336 
337 #ifdef LV_HAVE_ORC
/* ORC-generated implementation, defined outside this header. */
extern void volk_8i_s32f_convert_32f_a_orc_impl(float* outputVector,
                                                const int8_t* inputVector,
                                                const float scalar,
                                                int num_points);

/*!
 * \brief ORC wrapper: forwards the buffers to the ORC implementation,
 *        passing 1/scalar in its scalar argument.
 *
 * \param outputVector Destination buffer handed to the ORC implementation.
 * \param inputVector  Source buffer handed to the ORC implementation.
 * \param scalar       Divisor; its reciprocal is what gets forwarded.
 * \param num_points   Number of samples, forwarded as-is.
 */
static inline void volk_8i_s32f_convert_32f_u_orc(float* outputVector,
                                                  const int8_t* inputVector,
                                                  const float scalar,
                                                  unsigned int num_points)
{
    const float reciprocal = 1.0 / scalar;

    volk_8i_s32f_convert_32f_a_orc_impl(
        outputVector, inputVector, reciprocal, num_points);
}
351 #endif /* LV_HAVE_ORC */
352 
353 
354 #endif /* INCLUDED_volk_8i_s32f_convert_32f_a_H */
static void volk_8i_s32f_convert_32f_neon(float *outputVector, const int8_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_8i_s32f_convert_32f.h:286
static void volk_8i_s32f_convert_32f_generic(float *outputVector, const int8_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_8i_s32f_convert_32f.h:154