Vector Optimized Library of Kernels  3.1.2
Architecture-tuned implementations of math kernels
volk_32f_s32f_convert_8i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
60 #ifndef INCLUDED_volk_32f_s32f_convert_8i_u_H
61 #define INCLUDED_volk_32f_s32f_convert_8i_u_H
62 
63 #include <inttypes.h>
64 
65 static inline void volk_32f_s32f_convert_8i_single(int8_t* out, const float in)
66 {
67  const float min_val = INT8_MIN;
68  const float max_val = INT8_MAX;
69  if (in > max_val) {
70  *out = (int8_t)(max_val);
71  } else if (in < min_val) {
72  *out = (int8_t)(min_val);
73  } else {
74  *out = (int8_t)(rintf(in));
75  }
76 }
77 
78 #ifdef LV_HAVE_GENERIC
79 
/*!
 * \brief Portable kernel: multiply each input float by \p scalar and convert
 *        the product to int8_t.
 *
 * The per-sample clamping and rounding is delegated to
 * volk_32f_s32f_convert_8i_single().
 *
 * \param outputVector Destination buffer; num_points elements are written.
 * \param inputVector  Source buffer; num_points elements are read.
 * \param scalar       Factor applied to every input sample before conversion.
 * \param num_points   Number of samples to process.
 */
static inline void volk_32f_s32f_convert_8i_generic(int8_t* outputVector,
                                                    const float* inputVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    for (unsigned int idx = 0; idx < num_points; ++idx) {
        const float scaled = inputVector[idx] * scalar;
        volk_32f_s32f_convert_8i_single(outputVector + idx, scaled);
    }
}
92 
93 #endif /* LV_HAVE_GENERIC */
94 
95 
96 #ifdef LV_HAVE_AVX2
97 #include <immintrin.h>
98 
/*!
 * \brief AVX2 kernel (unaligned loads/stores): scale floats by \p scalar and
 *        convert them to saturated int8_t, 32 samples per iteration.
 *
 * Each iteration loads four 8-float vectors, multiplies by the scalar, clamps
 * to [INT8_MIN, INT8_MAX], converts to 32-bit integers and narrows
 * 32 -> 16 -> 8 bits with saturating packs. Per Intel's documentation the
 * 256-bit packs interleave their operands per 128-bit lane, so every pack is
 * followed by a 64-bit permute that puts the values back in input order.
 * Samples beyond the last full group of 32 are handled by
 * volk_32f_s32f_convert_8i_single().
 *
 * \param outputVector Destination buffer; num_points elements are written.
 * \param inputVector  Source buffer; num_points elements are read.
 * \param scalar       Factor applied to every input sample before conversion.
 * \param num_points   Number of samples to process.
 */
static inline void volk_32f_s32f_convert_8i_u_avx2(int8_t* outputVector,
                                                   const float* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    /* 0xD8 == binary 11011000: picks the 64-bit quarters in order 0, 2, 1, 3. */
    enum { kQuarterOrder = 0xD8 };

    const unsigned int num_blocks = num_points / 32;

    const __m256 lower = _mm256_set1_ps((float)INT8_MIN);
    const __m256 upper = _mm256_set1_ps((float)INT8_MAX);
    const __m256 gain = _mm256_set1_ps(scalar);

    const float* src = inputVector;
    int8_t* dst = outputVector;

    for (unsigned int blk = 0; blk < num_blocks; ++blk) {
        /* Load and scale 4 x 8 floats. */
        __m256 f0 = _mm256_mul_ps(_mm256_loadu_ps(src + 0), gain);
        __m256 f1 = _mm256_mul_ps(_mm256_loadu_ps(src + 8), gain);
        __m256 f2 = _mm256_mul_ps(_mm256_loadu_ps(src + 16), gain);
        __m256 f3 = _mm256_mul_ps(_mm256_loadu_ps(src + 24), gain);

        /* Clamp: upper bound first, then lower bound. */
        f0 = _mm256_max_ps(_mm256_min_ps(f0, upper), lower);
        f1 = _mm256_max_ps(_mm256_min_ps(f1, upper), lower);
        f2 = _mm256_max_ps(_mm256_min_ps(f2, upper), lower);
        f3 = _mm256_max_ps(_mm256_min_ps(f3, upper), lower);

        const __m256i w0 = _mm256_cvtps_epi32(f0);
        const __m256i w1 = _mm256_cvtps_epi32(f1);
        const __m256i w2 = _mm256_cvtps_epi32(f2);
        const __m256i w3 = _mm256_cvtps_epi32(f3);

        /* 32 -> 16 bit, restoring sample order after each pack. */
        const __m256i h01 =
            _mm256_permute4x64_epi64(_mm256_packs_epi32(w0, w1), kQuarterOrder);
        const __m256i h23 =
            _mm256_permute4x64_epi64(_mm256_packs_epi32(w2, w3), kQuarterOrder);

        /* 16 -> 8 bit, again followed by the reordering permute. */
        const __m256i bytes =
            _mm256_permute4x64_epi64(_mm256_packs_epi16(h01, h23), kQuarterOrder);

        _mm256_storeu_si256((__m256i*)dst, bytes);

        src += 32;
        dst += 32;
    }

    /* Scalar tail for the samples that do not fill a whole block of 32. */
    for (unsigned int idx = num_blocks * 32; idx < num_points; ++idx) {
        const float scaled = inputVector[idx] * scalar;
        volk_32f_s32f_convert_8i_single(&outputVector[idx], scaled);
    }
}
157 
158 #endif /* LV_HAVE_AVX2 */
159 
160 
161 #ifdef LV_HAVE_SSE2
162 #include <emmintrin.h>
163 
/*!
 * \brief SSE2 kernel (unaligned loads/stores): scale floats by \p scalar and
 *        convert them to saturated int8_t, 16 samples per iteration.
 *
 * Each iteration loads four 4-float vectors, multiplies by the scalar, clamps
 * to [INT8_MIN, INT8_MAX], converts to 32-bit integers and narrows
 * 32 -> 16 -> 8 bits with saturating packs. Samples beyond the last full
 * group of 16 are handled by volk_32f_s32f_convert_8i_single().
 *
 * \param outputVector Destination buffer; num_points elements are written.
 * \param inputVector  Source buffer; num_points elements are read.
 * \param scalar       Factor applied to every input sample before conversion.
 * \param num_points   Number of samples to process.
 */
static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector,
                                                   const float* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const unsigned int num_blocks = num_points / 16;

    const __m128 lower = _mm_set_ps1((float)INT8_MIN);
    const __m128 upper = _mm_set_ps1((float)INT8_MAX);
    const __m128 gain = _mm_set_ps1(scalar);

    const float* src = inputVector;
    int8_t* dst = outputVector;

    for (unsigned int blk = 0; blk < num_blocks; ++blk) {
        /* Load and scale 4 x 4 floats. */
        __m128 f0 = _mm_mul_ps(_mm_loadu_ps(src + 0), gain);
        __m128 f1 = _mm_mul_ps(_mm_loadu_ps(src + 4), gain);
        __m128 f2 = _mm_mul_ps(_mm_loadu_ps(src + 8), gain);
        __m128 f3 = _mm_mul_ps(_mm_loadu_ps(src + 12), gain);

        /* Clamp: upper bound first, then lower bound. */
        f0 = _mm_max_ps(_mm_min_ps(f0, upper), lower);
        f1 = _mm_max_ps(_mm_min_ps(f1, upper), lower);
        f2 = _mm_max_ps(_mm_min_ps(f2, upper), lower);
        f3 = _mm_max_ps(_mm_min_ps(f3, upper), lower);

        /* float -> int32 -> int16 -> int8 */
        const __m128i h01 = _mm_packs_epi32(_mm_cvtps_epi32(f0), _mm_cvtps_epi32(f1));
        const __m128i h23 = _mm_packs_epi32(_mm_cvtps_epi32(f2), _mm_cvtps_epi32(f3));
        const __m128i bytes = _mm_packs_epi16(h01, h23);

        _mm_storeu_si128((__m128i*)dst, bytes);

        src += 16;
        dst += 16;
    }

    /* Scalar tail for the samples that do not fill a whole block of 16. */
    for (unsigned int idx = num_blocks * 16; idx < num_points; ++idx) {
        const float scaled = inputVector[idx] * scalar;
        volk_32f_s32f_convert_8i_single(&outputVector[idx], scaled);
    }
}
219 
220 #endif /* LV_HAVE_SSE2 */
221 
222 
223 #ifdef LV_HAVE_SSE
224 #include <xmmintrin.h>
225 
226 static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector,
227  const float* inputVector,
228  const float scalar,
229  unsigned int num_points)
230 {
231  const unsigned int quarterPoints = num_points / 4;
232 
233  const float* inputVectorPtr = (const float*)inputVector;
234  int8_t* outputVectorPtr = outputVector;
235 
236  const float min_val = INT8_MIN;
237  const float max_val = INT8_MAX;
238  const __m128 vmin_val = _mm_set_ps1(min_val);
239  const __m128 vmax_val = _mm_set_ps1(max_val);
240 
241  const __m128 vScalar = _mm_set_ps1(scalar);
242 
243  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
244 
245  for (unsigned int number = 0; number < quarterPoints; number++) {
246  __m128 ret = _mm_loadu_ps(inputVectorPtr);
247  inputVectorPtr += 4;
248 
249  ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
250 
251  _mm_store_ps(outputFloatBuffer, ret);
252  for (size_t inner_loop = 0; inner_loop < 4; inner_loop++) {
253  *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop]));
254  }
255  }
256 
257  for (unsigned int number = quarterPoints * 4; number < num_points; number++) {
258  const float r = inputVector[number] * scalar;
259  volk_32f_s32f_convert_8i_single(&outputVector[number], r);
260  }
261 }
262 
263 #endif /* LV_HAVE_SSE */
264 
265 
266 #endif /* INCLUDED_volk_32f_s32f_convert_8i_u_H */
267 #ifndef INCLUDED_volk_32f_s32f_convert_8i_a_H
268 #define INCLUDED_volk_32f_s32f_convert_8i_a_H
269 
270 #include <inttypes.h>
271 
272 #ifdef LV_HAVE_AVX2
273 #include <immintrin.h>
274 
/*!
 * \brief AVX2 kernel (aligned loads/stores): scale floats by \p scalar and
 *        convert them to saturated int8_t, 32 samples per iteration.
 *
 * Uses _mm256_load_ps / _mm256_store_si256, so both buffers must satisfy the
 * alignment those intrinsics require. Each iteration loads four 8-float
 * vectors, multiplies by the scalar, clamps to [INT8_MIN, INT8_MAX], converts
 * to 32-bit integers and narrows 32 -> 16 -> 8 bits with saturating packs.
 * Per Intel's documentation the 256-bit packs interleave their operands per
 * 128-bit lane, so every pack is followed by a 64-bit permute that puts the
 * values back in input order. Samples beyond the last full group of 32 are
 * handled by volk_32f_s32f_convert_8i_single().
 *
 * \param outputVector Destination buffer; num_points elements are written.
 * \param inputVector  Source buffer; num_points elements are read.
 * \param scalar       Factor applied to every input sample before conversion.
 * \param num_points   Number of samples to process.
 */
static inline void volk_32f_s32f_convert_8i_a_avx2(int8_t* outputVector,
                                                   const float* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    /* 0xD8 == binary 11011000: picks the 64-bit quarters in order 0, 2, 1, 3. */
    enum { kQuarterOrder = 0xD8 };

    const unsigned int num_blocks = num_points / 32;

    const __m256 lower = _mm256_set1_ps((float)INT8_MIN);
    const __m256 upper = _mm256_set1_ps((float)INT8_MAX);
    const __m256 gain = _mm256_set1_ps(scalar);

    const float* src = inputVector;
    int8_t* dst = outputVector;

    for (unsigned int blk = 0; blk < num_blocks; ++blk) {
        /* Load and scale 4 x 8 floats. */
        __m256 f0 = _mm256_mul_ps(_mm256_load_ps(src + 0), gain);
        __m256 f1 = _mm256_mul_ps(_mm256_load_ps(src + 8), gain);
        __m256 f2 = _mm256_mul_ps(_mm256_load_ps(src + 16), gain);
        __m256 f3 = _mm256_mul_ps(_mm256_load_ps(src + 24), gain);

        /* Clamp: upper bound first, then lower bound. */
        f0 = _mm256_max_ps(_mm256_min_ps(f0, upper), lower);
        f1 = _mm256_max_ps(_mm256_min_ps(f1, upper), lower);
        f2 = _mm256_max_ps(_mm256_min_ps(f2, upper), lower);
        f3 = _mm256_max_ps(_mm256_min_ps(f3, upper), lower);

        const __m256i w0 = _mm256_cvtps_epi32(f0);
        const __m256i w1 = _mm256_cvtps_epi32(f1);
        const __m256i w2 = _mm256_cvtps_epi32(f2);
        const __m256i w3 = _mm256_cvtps_epi32(f3);

        /* 32 -> 16 bit, restoring sample order after each pack. */
        const __m256i h01 =
            _mm256_permute4x64_epi64(_mm256_packs_epi32(w0, w1), kQuarterOrder);
        const __m256i h23 =
            _mm256_permute4x64_epi64(_mm256_packs_epi32(w2, w3), kQuarterOrder);

        /* 16 -> 8 bit, again followed by the reordering permute. */
        const __m256i bytes =
            _mm256_permute4x64_epi64(_mm256_packs_epi16(h01, h23), kQuarterOrder);

        _mm256_store_si256((__m256i*)dst, bytes);

        src += 32;
        dst += 32;
    }

    /* Scalar tail for the samples that do not fill a whole block of 32. */
    for (unsigned int idx = num_blocks * 32; idx < num_points; ++idx) {
        const float scaled = inputVector[idx] * scalar;
        volk_32f_s32f_convert_8i_single(&outputVector[idx], scaled);
    }
}
333 
334 #endif /* LV_HAVE_AVX2 */
335 
336 
337 #ifdef LV_HAVE_SSE2
338 #include <emmintrin.h>
339 
/*!
 * \brief SSE2 kernel (aligned loads/stores): scale floats by \p scalar and
 *        convert them to saturated int8_t, 16 samples per iteration.
 *
 * Uses _mm_load_ps / _mm_store_si128, so both buffers must satisfy the
 * alignment those intrinsics require. Each iteration loads four 4-float
 * vectors, multiplies by the scalar, clamps to [INT8_MIN, INT8_MAX], converts
 * to 32-bit integers and narrows 32 -> 16 -> 8 bits with saturating packs.
 * Samples beyond the last full group of 16 are handled by
 * volk_32f_s32f_convert_8i_single().
 *
 * \param outputVector Destination buffer; num_points elements are written.
 * \param inputVector  Source buffer; num_points elements are read.
 * \param scalar       Factor applied to every input sample before conversion.
 * \param num_points   Number of samples to process.
 */
static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector,
                                                   const float* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const unsigned int num_blocks = num_points / 16;

    const __m128 lower = _mm_set_ps1((float)INT8_MIN);
    const __m128 upper = _mm_set_ps1((float)INT8_MAX);
    const __m128 gain = _mm_set_ps1(scalar);

    const float* src = inputVector;
    int8_t* dst = outputVector;

    for (unsigned int blk = 0; blk < num_blocks; ++blk) {
        /* Load and scale 4 x 4 floats. */
        __m128 f0 = _mm_mul_ps(_mm_load_ps(src + 0), gain);
        __m128 f1 = _mm_mul_ps(_mm_load_ps(src + 4), gain);
        __m128 f2 = _mm_mul_ps(_mm_load_ps(src + 8), gain);
        __m128 f3 = _mm_mul_ps(_mm_load_ps(src + 12), gain);

        /* Clamp: upper bound first, then lower bound. */
        f0 = _mm_max_ps(_mm_min_ps(f0, upper), lower);
        f1 = _mm_max_ps(_mm_min_ps(f1, upper), lower);
        f2 = _mm_max_ps(_mm_min_ps(f2, upper), lower);
        f3 = _mm_max_ps(_mm_min_ps(f3, upper), lower);

        /* float -> int32 -> int16 -> int8 */
        const __m128i h01 = _mm_packs_epi32(_mm_cvtps_epi32(f0), _mm_cvtps_epi32(f1));
        const __m128i h23 = _mm_packs_epi32(_mm_cvtps_epi32(f2), _mm_cvtps_epi32(f3));
        const __m128i bytes = _mm_packs_epi16(h01, h23);

        _mm_store_si128((__m128i*)dst, bytes);

        src += 16;
        dst += 16;
    }

    /* Scalar tail for the samples that do not fill a whole block of 16. */
    for (unsigned int idx = num_blocks * 16; idx < num_points; ++idx) {
        const float scaled = inputVector[idx] * scalar;
        volk_32f_s32f_convert_8i_single(&outputVector[idx], scaled);
    }
}
395 #endif /* LV_HAVE_SSE2 */
396 
397 
398 #ifdef LV_HAVE_SSE
399 #include <xmmintrin.h>
400 
401 static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector,
402  const float* inputVector,
403  const float scalar,
404  unsigned int num_points)
405 {
406  const unsigned int quarterPoints = num_points / 4;
407 
408  const float* inputVectorPtr = (const float*)inputVector;
409  int8_t* outputVectorPtr = outputVector;
410 
411  const float min_val = INT8_MIN;
412  const float max_val = INT8_MAX;
413  const __m128 vmin_val = _mm_set_ps1(min_val);
414  const __m128 vmax_val = _mm_set_ps1(max_val);
415 
416  const __m128 vScalar = _mm_set_ps1(scalar);
417 
418  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
419 
420  for (unsigned int number = 0; number < quarterPoints; number++) {
421  __m128 ret = _mm_load_ps(inputVectorPtr);
422  inputVectorPtr += 4;
423 
424  ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
425 
426  _mm_store_ps(outputFloatBuffer, ret);
427  for (size_t inner_loop = 0; inner_loop < 4; inner_loop++) {
428  *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop]));
429  }
430  }
431 
432  for (unsigned int number = quarterPoints * 4; number < num_points; number++) {
433  const float r = inputVector[number] * scalar;
434  volk_32f_s32f_convert_8i_single(&outputVector[number], r);
435  }
436 }
437 
438 #endif /* LV_HAVE_SSE */
439 
440 
441 #endif /* INCLUDED_volk_32f_s32f_convert_8i_a_H */
static void volk_32f_s32f_convert_8i_single(int8_t *out, const float in)
Definition: volk_32f_s32f_convert_8i.h:65
static void volk_32f_s32f_convert_8i_generic(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:80
static float rintf(float x)
Definition: config.h:45
static void volk_32f_s32f_convert_8i_u_sse(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:226
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:62
static void volk_32f_s32f_convert_8i_a_sse2(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:340
static void volk_32f_s32f_convert_8i_u_sse2(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:164
static void volk_32f_s32f_convert_8i_a_sse(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:401