54 #ifndef INCLUDED_volk_16i_s32f_convert_32f_u_H
55 #define INCLUDED_volk_16i_s32f_convert_32f_u_H
61 #include <immintrin.h>
/**
 * Convert 16-bit signed integers to floats and divide them by a scale factor
 * (AVX kernel using unaligned loads and stores).
 *
 * Full groups of eight samples are sign-extended to 32 bits, converted to
 * float and multiplied by 1/scalar. Samples left over after the last full
 * group are converted one at a time with a plain division.
 *
 * @param outputVector  destination buffer, receives num_points floats
 * @param inputVector   source buffer holding num_points int16_t samples
 * @param scalar        divisor applied to every converted sample
 * @param num_points    number of samples to convert
 */
static inline void
volk_16i_s32f_convert_32f_u_avx(float* outputVector, const int16_t* inputVector,
                                const float scalar, unsigned int num_points)
{
  const unsigned int eighthPoints = num_points / 8;
  const __m128 invScalar = _mm_set_ps1(1.0f / scalar);
  const __m256 zero = _mm256_setzero_ps();
  const int16_t* inputPtr = inputVector;
  float* outputVectorPtr = outputVector;
  unsigned int number;

  for(number = 0; number < eighthPoints; number++){
    /* Load eight 16-bit samples at once. */
    const __m128i packed = _mm_loadu_si128((const __m128i*)inputPtr);
    /* Sign-extend the lower four samples, then shift the upper four down
     * by 8 bytes so they can be sign-extended the same way. */
    const __m128i lowInts = _mm_cvtepi16_epi32(packed);
    const __m128i highInts = _mm_cvtepi16_epi32(_mm_srli_si128(packed, 8));
    const __m128 lowScaled = _mm_mul_ps(_mm_cvtepi32_ps(lowInts), invScalar);
    const __m128 highScaled = _mm_mul_ps(_mm_cvtepi32_ps(highInts), invScalar);
    /* Assemble both 128-bit halves into one 256-bit result. */
    __m256 output = _mm256_insertf128_ps(zero, lowScaled, 0);
    output = _mm256_insertf128_ps(output, highScaled, 1);

    _mm256_storeu_ps(outputVectorPtr, output);

    /* Advance both cursors to the next group of eight samples. */
    inputPtr += 8;
    outputVectorPtr += 8;
  }

  /* Scalar tail for the samples that do not fill a whole vector. */
  for(number = eighthPoints * 8; number < num_points; number++){
    outputVector[number] = ((float)(inputVector[number])) / scalar;
  }
}
113 #ifdef LV_HAVE_SSE4_1
114 #include <smmintrin.h>
/**
 * Convert 16-bit signed integers to floats and divide them by a scale factor
 * (SSE4.1 kernel using unaligned loads and stores).
 *
 * Eight samples are processed per iteration: they are sign-extended to
 * 32 bits, converted to float and multiplied by 1/scalar. Samples left over
 * after the last full group are converted one at a time with a division.
 *
 * @param outputVector  destination buffer, receives num_points floats
 * @param inputVector   source buffer holding num_points int16_t samples
 * @param scalar        divisor applied to every converted sample
 * @param num_points    number of samples to convert
 */
static inline void
volk_16i_s32f_convert_32f_u_sse4_1(float* outputVector, const int16_t* inputVector,
                                   const float scalar, unsigned int num_points)
{
  const unsigned int eighthPoints = num_points / 8;
  const __m128 invScalar = _mm_set_ps1(1.0f / scalar);
  const int16_t* inputPtr = inputVector;
  float* outputVectorPtr = outputVector;
  unsigned int number;

  for(number = 0; number < eighthPoints; number++){
    /* Load eight 16-bit samples at once. */
    const __m128i packed = _mm_loadu_si128((const __m128i*)inputPtr);
    /* Sign-extend the lower four samples, then shift the upper four down
     * by 8 bytes so they can be sign-extended the same way. */
    const __m128i lowInts = _mm_cvtepi16_epi32(packed);
    const __m128i highInts = _mm_cvtepi16_epi32(_mm_srli_si128(packed, 8));

    _mm_storeu_ps(outputVectorPtr,
                  _mm_mul_ps(_mm_cvtepi32_ps(lowInts), invScalar));
    _mm_storeu_ps(outputVectorPtr + 4,
                  _mm_mul_ps(_mm_cvtepi32_ps(highInts), invScalar));

    /* Advance both cursors to the next group of eight samples. */
    inputPtr += 8;
    outputVectorPtr += 8;
  }

  /* Scalar tail for the samples that do not fill a whole vector. */
  for(number = eighthPoints * 8; number < num_points; number++){
    outputVector[number] = ((float)(inputVector[number])) / scalar;
  }
}
164 #include <xmmintrin.h>
/**
 * Convert 16-bit signed integers to floats and divide them by a scale factor
 * (SSE kernel using unaligned stores).
 *
 * Four samples are processed per iteration: each is converted to float with
 * a scalar cast, the four floats are packed into one register and multiplied
 * by 1/scalar. Samples left over after the last full group are converted one
 * at a time with a division.
 *
 * @param outputVector  destination buffer, receives num_points floats
 * @param inputVector   source buffer holding num_points int16_t samples
 * @param scalar        divisor applied to every converted sample
 * @param num_points    number of samples to convert
 */
static inline void
volk_16i_s32f_convert_32f_u_sse(float* outputVector, const int16_t* inputVector,
                                const float scalar, unsigned int num_points)
{
  const unsigned int quarterPoints = num_points / 4;
  const __m128 invScalar = _mm_set_ps1(1.0f / scalar);
  const int16_t* inputPtr = inputVector;
  float* outputVectorPtr = outputVector;
  unsigned int number;

  for(number = 0; number < quarterPoints; number++){
    /* _mm_set_ps takes its arguments from the highest lane to the lowest,
     * so sample 0 is passed last to land in lane 0. */
    const __m128 samples = _mm_set_ps((float)(inputPtr[3]),
                                      (float)(inputPtr[2]),
                                      (float)(inputPtr[1]),
                                      (float)(inputPtr[0]));

    _mm_storeu_ps(outputVectorPtr, _mm_mul_ps(samples, invScalar));

    /* Advance both cursors to the next group of four samples. */
    inputPtr += 4;
    outputVectorPtr += 4;
  }

  /* Scalar tail for the samples that do not fill a whole vector. */
  for(number = quarterPoints * 4; number < num_points; number++){
    outputVector[number] = (float)(inputVector[number]) / scalar;
  }
}
195 #ifdef LV_HAVE_GENERIC
/**
 * Convert 16-bit signed integers to floats and divide them by a scale factor
 * (portable C implementation).
 *
 * @param outputVector  destination buffer, receives num_points floats
 * @param inputVector   source buffer holding num_points int16_t samples
 * @param scalar        divisor applied to every converted sample
 * @param num_points    number of samples to convert
 */
static inline void
volk_16i_s32f_convert_32f_generic(float* outputVector, const int16_t* inputVector,
                                  const float scalar, unsigned int num_points)
{
  unsigned int number;

  for(number = 0; number < num_points; number++){
    outputVector[number] = ((float)(inputVector[number])) / scalar;
  }
}
/**
 * Convert 16-bit signed integers to floats and divide them by a scale factor
 * (NEON kernel).
 *
 * Eight samples are processed per iteration. They are loaded de-interleaved
 * into two 4-lane vectors, widened to 32 bits, converted to float, multiplied
 * by 1/scalar and stored re-interleaved, which restores the original sample
 * order. Samples left over after the last full group are converted one at a
 * time with a division.
 *
 * @param outputVector  destination buffer, receives num_points floats
 * @param inputVector   source buffer holding num_points int16_t samples
 * @param scalar        divisor applied to every converted sample
 * @param num_points    number of samples to convert
 */
static inline void
volk_16i_s32f_convert_32f_neon(float* outputVector, const int16_t* inputVector,
                               const float scalar, unsigned int num_points)
{
  const unsigned int eighth_points = num_points / 8;
  const float32x4_t inv_scale = vdupq_n_f32(1.0f / scalar);
  const int16_t* inputPtr = inputVector;
  float* outputPtr = outputVector;
  unsigned int number;

  for(number = 0; number < eighth_points; number++){
    /* val[0] receives the even-indexed samples, val[1] the odd-indexed. */
    const int16x4x2_t input16 = vld2_s16(inputPtr);
    float32x4x2_t output_float;

    /* Widen to 32-bit integers, convert to float, then apply the scale. */
    output_float.val[0] =
        vmulq_f32(vcvtq_f32_s32(vmovl_s16(input16.val[0])), inv_scale);
    output_float.val[1] =
        vmulq_f32(vcvtq_f32_s32(vmovl_s16(input16.val[1])), inv_scale);

    /* The interleaving store undoes the de-interleaving load. */
    vst2q_f32(outputPtr, output_float);

    /* Advance both cursors; the tail loop below continues from here. */
    inputPtr += 8;
    outputPtr += 8;
  }

  /* Scalar tail for the samples that do not fill a whole vector. */
  for(number = eighth_points * 8; number < num_points; number++){
    *outputPtr++ = ((float)(*inputPtr++)) / scalar;
  }
}
257 #ifndef INCLUDED_volk_16i_s32f_convert_32f_a_H
258 #define INCLUDED_volk_16i_s32f_convert_32f_a_H
264 #include <immintrin.h>
/**
 * Convert 16-bit signed integers to floats and divide them by a scale factor
 * (AVX kernel using aligned loads and stores).
 *
 * Full groups of eight samples are sign-extended to 32 bits, converted to
 * float and multiplied by 1/scalar. Samples left over after the last full
 * group are converted one at a time with a plain division.
 *
 * The vector loop uses _mm_load_si128 and _mm256_store_ps, so inputVector
 * must be 16-byte aligned and outputVector 32-byte aligned.
 *
 * @param outputVector  destination buffer, receives num_points floats
 * @param inputVector   source buffer holding num_points int16_t samples
 * @param scalar        divisor applied to every converted sample
 * @param num_points    number of samples to convert
 */
static inline void
volk_16i_s32f_convert_32f_a_avx(float* outputVector, const int16_t* inputVector,
                                const float scalar, unsigned int num_points)
{
  const unsigned int eighthPoints = num_points / 8;
  const __m128 invScalar = _mm_set_ps1(1.0f / scalar);
  const __m256 zero = _mm256_setzero_ps();
  const int16_t* inputPtr = inputVector;
  float* outputVectorPtr = outputVector;
  unsigned int number;

  for(number = 0; number < eighthPoints; number++){
    /* Load eight 16-bit samples at once (aligned). */
    const __m128i packed = _mm_load_si128((const __m128i*)inputPtr);
    /* Sign-extend the lower four samples, then shift the upper four down
     * by 8 bytes so they can be sign-extended the same way. */
    const __m128i lowInts = _mm_cvtepi16_epi32(packed);
    const __m128i highInts = _mm_cvtepi16_epi32(_mm_srli_si128(packed, 8));
    const __m128 lowScaled = _mm_mul_ps(_mm_cvtepi32_ps(lowInts), invScalar);
    const __m128 highScaled = _mm_mul_ps(_mm_cvtepi32_ps(highInts), invScalar);
    /* Assemble both 128-bit halves into one 256-bit result. */
    __m256 output = _mm256_insertf128_ps(zero, lowScaled, 0);
    output = _mm256_insertf128_ps(output, highScaled, 1);

    _mm256_store_ps(outputVectorPtr, output);

    /* Advance both cursors to the next group of eight samples. */
    inputPtr += 8;
    outputVectorPtr += 8;
  }

  /* Scalar tail for the samples that do not fill a whole vector. */
  for(number = eighthPoints * 8; number < num_points; number++){
    outputVector[number] = ((float)(inputVector[number])) / scalar;
  }
}
316 #ifdef LV_HAVE_SSE4_1
317 #include <smmintrin.h>
/**
 * Convert 16-bit signed integers to floats and divide them by a scale factor
 * (SSE4.1 kernel, "aligned" variant).
 *
 * Eight samples are processed per iteration: they are sign-extended to
 * 32 bits, converted to float and multiplied by 1/scalar. Samples left over
 * after the last full group are converted one at a time with a division.
 *
 * Despite the "_a_" name, the vector loop uses the unaligned load and store
 * intrinsics, so no alignment is required by this implementation.
 *
 * @param outputVector  destination buffer, receives num_points floats
 * @param inputVector   source buffer holding num_points int16_t samples
 * @param scalar        divisor applied to every converted sample
 * @param num_points    number of samples to convert
 */
static inline void
volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const int16_t* inputVector,
                                   const float scalar, unsigned int num_points)
{
  const unsigned int eighthPoints = num_points / 8;
  const __m128 invScalar = _mm_set_ps1(1.0f / scalar);
  const int16_t* inputPtr = inputVector;
  float* outputVectorPtr = outputVector;
  unsigned int number;

  for(number = 0; number < eighthPoints; number++){
    /* Load eight 16-bit samples at once. */
    const __m128i packed = _mm_loadu_si128((const __m128i*)inputPtr);
    /* Sign-extend the lower four samples, then shift the upper four down
     * by 8 bytes so they can be sign-extended the same way. */
    const __m128i lowInts = _mm_cvtepi16_epi32(packed);
    const __m128i highInts = _mm_cvtepi16_epi32(_mm_srli_si128(packed, 8));

    _mm_storeu_ps(outputVectorPtr,
                  _mm_mul_ps(_mm_cvtepi32_ps(lowInts), invScalar));
    _mm_storeu_ps(outputVectorPtr + 4,
                  _mm_mul_ps(_mm_cvtepi32_ps(highInts), invScalar));

    /* Advance both cursors to the next group of eight samples. */
    inputPtr += 8;
    outputVectorPtr += 8;
  }

  /* Scalar tail for the samples that do not fill a whole vector. */
  for(number = eighthPoints * 8; number < num_points; number++){
    outputVector[number] = ((float)(inputVector[number])) / scalar;
  }
}
367 #include <xmmintrin.h>
/**
 * Convert 16-bit signed integers to floats and divide them by a scale factor
 * (SSE kernel, "aligned" variant).
 *
 * Four samples are processed per iteration: each is converted to float with
 * a scalar cast, the four floats are packed into one register and multiplied
 * by 1/scalar. Samples left over after the last full group are converted one
 * at a time with a division.
 *
 * Despite the "_a_" name, results are written with the unaligned store
 * intrinsic, so no alignment is required by this implementation.
 *
 * @param outputVector  destination buffer, receives num_points floats
 * @param inputVector   source buffer holding num_points int16_t samples
 * @param scalar        divisor applied to every converted sample
 * @param num_points    number of samples to convert
 */
static inline void
volk_16i_s32f_convert_32f_a_sse(float* outputVector, const int16_t* inputVector,
                                const float scalar, unsigned int num_points)
{
  const unsigned int quarterPoints = num_points / 4;
  const __m128 invScalar = _mm_set_ps1(1.0f / scalar);
  const int16_t* inputPtr = inputVector;
  float* outputVectorPtr = outputVector;
  unsigned int number;

  for(number = 0; number < quarterPoints; number++){
    /* _mm_set_ps takes its arguments from the highest lane to the lowest,
     * so sample 0 is passed last to land in lane 0. */
    const __m128 samples = _mm_set_ps((float)(inputPtr[3]),
                                      (float)(inputPtr[2]),
                                      (float)(inputPtr[1]),
                                      (float)(inputPtr[0]));

    _mm_storeu_ps(outputVectorPtr, _mm_mul_ps(samples, invScalar));

    /* Advance both cursors to the next group of four samples. */
    inputPtr += 4;
    outputVectorPtr += 4;
  }

  /* Scalar tail for the samples that do not fill a whole vector. */
  for(number = quarterPoints * 4; number < num_points; number++){
    outputVector[number] = (float)(inputVector[number]) / scalar;
  }
}
398 #ifdef LV_HAVE_GENERIC
/**
 * Convert 16-bit signed integers to floats and divide them by a scale factor
 * (portable C implementation, "aligned" entry point).
 *
 * The loop uses only plain element accesses, so this implementation places
 * no alignment requirement on either buffer.
 *
 * @param outputVector  destination buffer, receives num_points floats
 * @param inputVector   source buffer holding num_points int16_t samples
 * @param scalar        divisor applied to every converted sample
 * @param num_points    number of samples to convert
 */
static inline void
volk_16i_s32f_convert_32f_a_generic(float* outputVector, const int16_t* inputVector,
                                    const float scalar, unsigned int num_points)
{
  unsigned int number;

  for(number = 0; number < num_points; number++){
    outputVector[number] = ((float)(inputVector[number])) / scalar;
  }
}
signed short int16_t
Definition: stdint.h:76