71 #ifndef INCLUDED_volk_32f_s32f_convert_8i_u_H
72 #define INCLUDED_volk_32f_s32f_convert_8i_u_H
78 #include <emmintrin.h>
/*
 * SSE2 kernel, unaligned variant: multiplies each float in inputVector by
 * scalar, limits the product to [min_val, max_val] and writes it to
 * outputVector as int8_t.
 *
 * outputVector  destination buffer, num_points int8_t values are written
 * inputVector   source buffer, num_points floats are read
 * scalar        factor applied to every input sample before conversion
 * num_points    number of samples to convert
 *
 * Unaligned loads/stores (_mm_loadu_ps / _mm_storeu_si128) are used, so the
 * buffers need no particular alignment.
 *
 * NOTE(review): the declarations of min_val, max_val and r, the return type
 * and the braces are not visible in this listing.
 */
81 volk_32f_s32f_convert_8i_u_sse2(
int8_t* outputVector,
const float* inputVector,
82 const float scalar,
unsigned int num_points)
84 unsigned int number = 0;
/* Number of full 16-sample groups handled by the vector loop. */
86 const unsigned int sixteenthPoints = num_points / 16;
88 const float* inputVectorPtr = (
const float*)inputVector;
89 int8_t* outputVectorPtr = outputVector;
/* Broadcast the scale factor and both clamp bounds to all four lanes. */
95 __m128 vScalar = _mm_set_ps1(scalar);
96 __m128 inputVal1, inputVal2, inputVal3, inputVal4;
97 __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
98 __m128 vmin_val = _mm_set_ps1(min_val);
99 __m128 vmax_val = _mm_set_ps1(max_val);
101 for(;number < sixteenthPoints; number++){
/* Fetch 16 consecutive floats as four 4-lane vectors. */
102 inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
103 inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
104 inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
105 inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
/* Scale, then clamp: min() against the upper bound, max() against the lower. */
107 inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
108 inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
109 inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
110 inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
/* float -> int32; the intrinsic rounds per the current MXCSR rounding mode,
 * unlike the truncating C cast used in the scalar tail below. */
112 intInputVal1 = _mm_cvtps_epi32(inputVal1);
113 intInputVal2 = _mm_cvtps_epi32(inputVal2);
114 intInputVal3 = _mm_cvtps_epi32(inputVal3);
115 intInputVal4 = _mm_cvtps_epi32(inputVal4);
/* Narrow with signed saturation: 4x int32 pairs -> 8x int16 each. */
117 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
118 intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
/* Narrow again with signed saturation: 2x 8 int16 -> 16 int8. */
120 intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
/* Emit the 16 converted bytes and advance the output cursor. */
122 _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
123 outputVectorPtr += 16;
/* Scalar tail: handle the num_points % 16 samples the vector loop skipped. */
126 number = sixteenthPoints * 16;
127 for(; number < num_points; number++){
128 r = inputVector[number] * scalar;
/* NOTE(review): any range clamp applied to r between the multiply and the
 * store is not visible in this listing. */
/* Cast straight to the element type of outputVector (was int16_t), matching
 * the aligned variant of this kernel. */
133 outputVector[number] = (
int8_t)(r);
141 #include <xmmintrin.h>
/*
 * SSE kernel, unaligned variant: multiplies each float in inputVector by
 * scalar, limits the product to [min_val, max_val] and writes it to
 * outputVector as int8_t.
 *
 * outputVector  destination buffer, num_points int8_t values are written
 * inputVector   source buffer, num_points floats are read
 * scalar        factor applied to every input sample before conversion
 * num_points    number of samples to convert
 *
 * The input is read with the unaligned load _mm_loadu_ps.
 *
 * NOTE(review): the declarations of max_val, r, ret and outputFloatBuffer,
 * the return type and the braces are not visible in this listing.
 */
144 volk_32f_s32f_convert_8i_u_sse(
int8_t* outputVector,
const float* inputVector,
145 const float scalar,
unsigned int num_points)
147 unsigned int number = 0;
/* Number of full 4-sample groups handled by the vector loop. */
149 const unsigned int quarterPoints = num_points / 4;
151 const float* inputVectorPtr = (
const float*)inputVector;
152 int8_t* outputVectorPtr = outputVector;
/* Lower clamp bound. */
154 float min_val = -128;
/* Broadcast the scale factor and both clamp bounds to all four lanes. */
158 __m128 vScalar = _mm_set_ps1(scalar);
160 __m128 vmin_val = _mm_set_ps1(min_val);
161 __m128 vmax_val = _mm_set_ps1(max_val);
165 for(;number < quarterPoints; number++){
166 ret = _mm_loadu_ps(inputVectorPtr);
/* NOTE(review): the advance of inputVectorPtr is not visible in this listing. */
/* Scale, then clamp: min() against the upper bound, max() against the lower. */
169 ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
/* Spill the four clamped floats to memory; _mm_store_ps needs a 16-byte
 * aligned destination (presumably outputFloatBuffer is declared aligned -
 * TODO confirm). */
171 _mm_store_ps(outputFloatBuffer, ret);
/* Convert lane by lane with a C cast, which truncates toward zero. */
172 *outputVectorPtr++ = (
int8_t)(outputFloatBuffer[0]);
173 *outputVectorPtr++ = (
int8_t)(outputFloatBuffer[1]);
174 *outputVectorPtr++ = (
int8_t)(outputFloatBuffer[2]);
175 *outputVectorPtr++ = (
int8_t)(outputFloatBuffer[3]);
/* Scalar tail: handle the num_points % 4 samples the vector loop skipped. */
178 number = quarterPoints * 4;
179 for(; number < num_points; number++){
180 r = inputVector[number] * scalar;
/* NOTE(review): any range clamp applied to r between the multiply and the
 * store is not visible in this listing. */
/* Cast straight to the element type of outputVector (was int16_t), matching
 * the per-lane casts in the vector loop above. */
185 outputVector[number] = (
int8_t)(r);
192 #ifdef LV_HAVE_GENERIC
/*
 * Portable kernel: multiplies each float in inputVector by scalar and writes
 * the result to outputVector as int8_t, one sample per iteration.
 *
 * outputVector  destination buffer, num_points int8_t values are written
 * inputVector   source buffer, num_points floats are read
 * scalar        factor applied to every input sample before conversion
 * num_points    number of samples to convert
 *
 * NOTE(review): the declarations of max_val and r, the return type and the
 * braces are not visible in this listing.
 */
195 volk_32f_s32f_convert_8i_generic(
int8_t* outputVector,
const float* inputVector,
196 const float scalar,
unsigned int num_points)
/* Running cursors over the output and input buffers. */
198 int8_t* outputVectorPtr = outputVector;
199 const float* inputVectorPtr = inputVector;
200 unsigned int number = 0;
/* Lower clamp bound. */
201 float min_val = -128;
205 for(number = 0; number < num_points; number++){
206 r = *inputVectorPtr++ * scalar;
/* NOTE(review): any range clamp applied to r between the multiply and the
 * store is not visible in this listing. */
/* Cast straight to the element type of outputVector (was int16_t), matching
 * volk_32f_s32f_convert_8i_a_generic; the C cast truncates toward zero. */
211 *outputVectorPtr++ = (
int8_t)(r);
219 #ifndef INCLUDED_volk_32f_s32f_convert_8i_a_H
220 #define INCLUDED_volk_32f_s32f_convert_8i_a_H
227 #include <emmintrin.h>
/*
 * SSE2 kernel, aligned variant: multiplies each float in inputVector by
 * scalar, limits the product to [min_val, max_val] and writes it to
 * outputVector as int8_t.
 *
 * outputVector  destination buffer, num_points int8_t values are written
 * inputVector   source buffer, num_points floats are read
 * scalar        factor applied to every input sample before conversion
 * num_points    number of samples to convert
 *
 * Uses _mm_load_ps and _mm_store_si128, which require 16-byte aligned
 * addresses, so both buffers must be 16-byte aligned.
 *
 * NOTE(review): the declarations of max_val and r, the return type and the
 * braces are not visible in this listing.
 */
230 volk_32f_s32f_convert_8i_a_sse2(
int8_t* outputVector,
const float* inputVector,
231 const float scalar,
unsigned int num_points)
233 unsigned int number = 0;
/* Number of full 16-sample groups handled by the vector loop. */
235 const unsigned int sixteenthPoints = num_points / 16;
237 const float* inputVectorPtr = (
const float*)inputVector;
238 int8_t* outputVectorPtr = outputVector;
/* Lower clamp bound. */
240 float min_val = -128;
/* Broadcast the scale factor and both clamp bounds to all four lanes. */
244 __m128 vScalar = _mm_set_ps1(scalar);
245 __m128 inputVal1, inputVal2, inputVal3, inputVal4;
246 __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
247 __m128 vmin_val = _mm_set_ps1(min_val);
248 __m128 vmax_val = _mm_set_ps1(max_val);
250 for(;number < sixteenthPoints; number++){
/* Fetch 16 consecutive floats as four 4-lane vectors (aligned loads). */
251 inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
252 inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
253 inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
254 inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
/* Scale, then clamp: min() against the upper bound, max() against the lower. */
256 inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
257 inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
258 inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
259 inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
/* float -> int32; the intrinsic rounds per the current MXCSR rounding mode,
 * unlike the truncating C cast used in the scalar tail below. */
261 intInputVal1 = _mm_cvtps_epi32(inputVal1);
262 intInputVal2 = _mm_cvtps_epi32(inputVal2);
263 intInputVal3 = _mm_cvtps_epi32(inputVal3);
264 intInputVal4 = _mm_cvtps_epi32(inputVal4);
/* Narrow with signed saturation: 4x int32 pairs -> 8x int16 each. */
266 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
267 intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
/* Narrow again with signed saturation: 2x 8 int16 -> 16 int8. */
269 intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
/* Emit the 16 converted bytes (aligned store) and advance the output cursor. */
271 _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
272 outputVectorPtr += 16;
/* Scalar tail: handle the num_points % 16 samples the vector loop skipped. */
275 number = sixteenthPoints * 16;
276 for(; number < num_points; number++){
277 r = inputVector[number] * scalar;
/* NOTE(review): any range clamp applied to r between the multiply and the
 * store is not visible in this listing. */
282 outputVector[number] = (
int8_t)(r);
289 #include <xmmintrin.h>
/*
 * SSE kernel, aligned variant: multiplies each float in inputVector by
 * scalar, limits the product to [min_val, max_val] and writes it to
 * outputVector as int8_t.
 *
 * outputVector  destination buffer, num_points int8_t values are written
 * inputVector   source buffer, num_points floats are read
 * scalar        factor applied to every input sample before conversion
 * num_points    number of samples to convert
 *
 * The input is read with _mm_load_ps, which requires a 16-byte aligned
 * address, so inputVector must be 16-byte aligned.
 *
 * NOTE(review): the declarations of max_val, r, ret and outputFloatBuffer,
 * the return type and the braces are not visible in this listing.
 */
292 volk_32f_s32f_convert_8i_a_sse(
int8_t* outputVector,
const float* inputVector,
293 const float scalar,
unsigned int num_points)
295 unsigned int number = 0;
/* Number of full 4-sample groups handled by the vector loop. */
297 const unsigned int quarterPoints = num_points / 4;
299 const float* inputVectorPtr = (
const float*)inputVector;
/* Lower clamp bound. */
301 float min_val = -128;
305 int8_t* outputVectorPtr = outputVector;
/* Broadcast the scale factor and both clamp bounds to all four lanes. */
306 __m128 vScalar = _mm_set_ps1(scalar);
308 __m128 vmin_val = _mm_set_ps1(min_val);
309 __m128 vmax_val = _mm_set_ps1(max_val);
313 for(;number < quarterPoints; number++){
314 ret = _mm_load_ps(inputVectorPtr);
/* NOTE(review): the advance of inputVectorPtr is not visible in this listing. */
/* Scale, then clamp: min() against the upper bound, max() against the lower. */
317 ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
/* Spill the four clamped floats to memory; _mm_store_ps needs a 16-byte
 * aligned destination (presumably outputFloatBuffer is declared aligned -
 * TODO confirm). */
319 _mm_store_ps(outputFloatBuffer, ret);
/* Convert lane by lane with a C cast, which truncates toward zero. */
320 *outputVectorPtr++ = (
int8_t)(outputFloatBuffer[0]);
321 *outputVectorPtr++ = (
int8_t)(outputFloatBuffer[1]);
322 *outputVectorPtr++ = (
int8_t)(outputFloatBuffer[2]);
323 *outputVectorPtr++ = (
int8_t)(outputFloatBuffer[3]);
/* Scalar tail: handle the num_points % 4 samples the vector loop skipped. */
326 number = quarterPoints * 4;
327 for(; number < num_points; number++){
328 r = inputVector[number] * scalar;
/* NOTE(review): any range clamp applied to r between the multiply and the
 * store is not visible in this listing. */
333 outputVector[number] = (
int8_t)(r);
340 #ifdef LV_HAVE_GENERIC
/*
 * Portable kernel: multiplies each float in inputVector by scalar and writes
 * the result to outputVector as int8_t, one sample per iteration.
 *
 * outputVector  destination buffer, num_points int8_t values are written
 * inputVector   source buffer, num_points floats are read
 * scalar        factor applied to every input sample before conversion
 * num_points    number of samples to convert
 *
 * NOTE(review): the declarations of max_val and r, the return type and the
 * braces are not visible in this listing.
 */
343 volk_32f_s32f_convert_8i_a_generic(
int8_t* outputVector,
const float* inputVector,
344 const float scalar,
unsigned int num_points)
/* Running cursors over the output and input buffers. */
346 int8_t* outputVectorPtr = outputVector;
347 const float* inputVectorPtr = inputVector;
348 unsigned int number = 0;
/* Lower clamp bound. */
349 float min_val = -128;
353 for(number = 0; number < num_points; number++){
354 r = *inputVectorPtr++ * scalar;
/* NOTE(review): any range clamp applied to r between the multiply and the
 * store is not visible in this listing. */
/* The C cast truncates toward zero. */
359 *outputVectorPtr++ = (
int8_t)(r);
signed short int16_t
Definition: stdint.h:76
signed char int8_t
Definition: stdint.h:75
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:27