71 #ifndef INCLUDED_volk_32f_binary_slicer_8i_H
72 #define INCLUDED_volk_32f_binary_slicer_8i_H
#ifdef LV_HAVE_GENERIC

/*!
 * \brief Scalar binary slicer: maps each float to a hard 0/1 decision.
 *
 * Writes 1 to cVector[i] when aVector[i] >= 0 and 0 otherwise. Because the
 * decision is a plain `>=` comparison, -0.0f yields 1 and NaN yields 0.
 *
 * \param cVector    Output buffer; receives num_points decisions (0 or 1).
 * \param aVector    Input buffer of num_points floats.
 * \param num_points Number of elements to process; 0 leaves cVector untouched.
 *
 * NOTE(review): the loop body was missing from the listing this file was
 * recovered from; it is restored here with the same `>= 0` rule used by the
 * branchless variant. Confirm against the upstream kernel.
 */
static inline void
volk_32f_binary_slicer_8i_generic(int8_t* cVector,
                                  const float* aVector,
                                  unsigned int num_points)
{
  int8_t* cPtr = cVector;
  const float* aPtr = aVector;
  unsigned int number = 0;

  for(number = 0; number < num_points; number++) {
    if(*aPtr++ >= 0) {
      *cPtr++ = 1;
    }
    else {
      *cPtr++ = 0;
    }
  }
}

#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_GENERIC

/*!
 * \brief Scalar binary slicer without a data-dependent branch in the loop.
 *
 * Stores the integer result of the comparison `aVector[i] >= 0` (1 or 0)
 * directly into cVector[i]. -0.0f yields 1 and NaN yields 0.
 *
 * \param cVector    Output buffer; receives num_points decisions (0 or 1).
 * \param aVector    Input buffer of num_points floats.
 * \param num_points Number of elements to process; 0 leaves cVector untouched.
 */
static inline void
volk_32f_binary_slicer_8i_generic_branchless(int8_t* cVector,
                                             const float* aVector,
                                             unsigned int num_points)
{
  int8_t* cPtr = cVector;
  const float* aPtr = aVector;
  unsigned int number = 0;

  for(number = 0; number < num_points; number++){
    /* A relational expression evaluates to int 0 or 1 in C. */
    *cPtr++ = (*aPtr++ >= 0);
  }
}

#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

/*!
 * \brief SSE2 binary slicer for 16-byte aligned buffers.
 *
 * Writes 1 to cVector[i] when aVector[i] >= 0 and 0 otherwise, processing
 * 16 floats per iteration and finishing the remainder with scalar code.
 *
 * \param cVector    Output buffer of num_points bytes. Must be 16-byte
 *                   aligned: it is written with _mm_store_si128.
 * \param aVector    Input buffer of num_points floats. Must be 16-byte
 *                   aligned: it is read with _mm_load_ps.
 * \param num_points Number of elements to process.
 */
static inline void
volk_32f_binary_slicer_8i_a_sse2(int8_t* cVector,
                                 const float* aVector,
                                 unsigned int num_points)
{
  int8_t* cPtr = cVector;
  const float* aPtr = aVector;
  unsigned int number = 0;

  unsigned int n16points = num_points / 16;
  __m128 a0_val, a1_val, a2_val, a3_val;
  __m128 res0_f, res1_f, res2_f, res3_f;
  __m128i res0_i, res1_i, res2_i, res3_i;
  __m128 zero_val;
  zero_val = _mm_set1_ps(0.0f);

  for(number = 0; number < n16points; number++) {
    a0_val = _mm_load_ps(aPtr);
    a1_val = _mm_load_ps(aPtr+4);
    a2_val = _mm_load_ps(aPtr+8);
    a3_val = _mm_load_ps(aPtr+12);

    /* Compare >= 0: each lane becomes all-ones (true) or all-zeros (false). */
    res0_f = _mm_cmpge_ps(a0_val, zero_val);
    res1_f = _mm_cmpge_ps(a1_val, zero_val);
    res2_f = _mm_cmpge_ps(a2_val, zero_val);
    res3_f = _mm_cmpge_ps(a3_val, zero_val);

    /* Turn each mask lane into 0 or 1. An all-ones lane is a NaN bit
     * pattern, which the float->int conversion maps to 0x80000000; the
     * logical shift by 31 leaves 1. An all-zeros lane converts to 0. */
    res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31);
    res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31);
    res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31);
    res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31);

    /* Narrow 32-bit -> 16-bit lanes; element order is preserved. */
    res0_i = _mm_packs_epi32(res0_i, res1_i);
    res2_i = _mm_packs_epi32(res2_i, res3_i);

    /* Narrow 16-bit -> 8-bit lanes: 16 decisions in one register. */
    res0_i = _mm_packs_epi16(res0_i, res2_i);

    _mm_store_si128((__m128i*)cPtr, res0_i);

    /* Advance to the next group of 16 elements. */
    cPtr += 16;
    aPtr += 16;
  }

  /* Scalar tail for the final num_points % 16 elements. */
  for(number = n16points * 16; number < num_points; number++) {
    if(*aPtr++ >= 0) {
      *cPtr++ = 1;
    }
    else {
      *cPtr++ = 0;
    }
  }
}

#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

/*!
 * \brief SSE2 binary slicer for buffers with no alignment requirement.
 *
 * Writes 1 to cVector[i] when aVector[i] >= 0 and 0 otherwise, processing
 * 16 floats per iteration with unaligned loads/stores (_mm_loadu_ps,
 * _mm_storeu_si128) and finishing the remainder with scalar code.
 *
 * \param cVector    Output buffer; receives num_points decisions (0 or 1).
 * \param aVector    Input buffer of num_points floats.
 * \param num_points Number of elements to process.
 */
static inline void
volk_32f_binary_slicer_8i_u_sse2(int8_t* cVector,
                                 const float* aVector,
                                 unsigned int num_points)
{
  int8_t* cPtr = cVector;
  const float* aPtr = aVector;
  unsigned int number = 0;

  unsigned int n16points = num_points / 16;
  __m128 a0_val, a1_val, a2_val, a3_val;
  __m128 res0_f, res1_f, res2_f, res3_f;
  __m128i res0_i, res1_i, res2_i, res3_i;
  __m128 zero_val;
  zero_val = _mm_set1_ps (0.0f);

  for(number = 0; number < n16points; number++) {
    a0_val = _mm_loadu_ps(aPtr);
    a1_val = _mm_loadu_ps(aPtr+4);
    a2_val = _mm_loadu_ps(aPtr+8);
    a3_val = _mm_loadu_ps(aPtr+12);

    /* Compare >= 0: each lane becomes all-ones (true) or all-zeros (false). */
    res0_f = _mm_cmpge_ps(a0_val, zero_val);
    res1_f = _mm_cmpge_ps(a1_val, zero_val);
    res2_f = _mm_cmpge_ps(a2_val, zero_val);
    res3_f = _mm_cmpge_ps(a3_val, zero_val);

    /* Turn each mask lane into 0 or 1. An all-ones lane is a NaN bit
     * pattern, which the float->int conversion maps to 0x80000000; the
     * logical shift by 31 leaves 1. An all-zeros lane converts to 0. */
    res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31);
    res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31);
    res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31);
    res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31);

    /* Narrow 32-bit -> 16-bit lanes; element order is preserved. */
    res0_i = _mm_packs_epi32(res0_i, res1_i);
    res2_i = _mm_packs_epi32(res2_i, res3_i);

    /* Narrow 16-bit -> 8-bit lanes: 16 decisions in one register. */
    res0_i = _mm_packs_epi16(res0_i, res2_i);

    _mm_storeu_si128((__m128i*)cPtr, res0_i);

    /* Advance to the next group of 16 elements. */
    cPtr += 16;
    aPtr += 16;
  }

  /* Scalar tail for the final num_points % 16 elements. */
  for(number = n16points * 16; number < num_points; number++) {
    if(*aPtr++ >= 0) {
      *cPtr++ = 1;
    }
    else {
      *cPtr++ = 0;
    }
  }
}

#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*!
 * \brief NEON binary slicer.
 *
 * Writes 1 to cVector[i] when aVector[i] >= 0 and 0 otherwise, processing
 * 16 floats per iteration and finishing the remainder with scalar code.
 *
 * \param cVector    Output buffer; receives num_points decisions (0 or 1).
 * \param aVector    Input buffer of num_points floats.
 * \param num_points Number of elements to process.
 */
static inline void
volk_32f_binary_slicer_8i_neon(int8_t* cVector,
                               const float* aVector,
                               unsigned int num_points)
{
  int8_t* cPtr = cVector;
  const float* aPtr = aVector;
  unsigned int number = 0;
  unsigned int n16points = num_points / 16;

  float32x4x2_t input_val0, input_val1;
  float32x4_t zero_val;
  uint32x4x2_t res0_u32, res1_u32;
  uint16x4x2_t res0_u16x4, res1_u16x4;
  uint16x8x2_t res_u16x8;
  uint8x8x2_t res_u8;
  uint8x8_t one;

  zero_val = vdupq_n_f32(0.0);
  one = vdup_n_u8(0x01);

  for(number = 0; number < n16points; number++) {
    /* De-interleaving loads: val[0] holds the even-indexed floats and
     * val[1] the odd-indexed ones of each group of 8. */
    input_val0 = vld2q_f32(aPtr);
    input_val1 = vld2q_f32(aPtr+8);

    /* Compare >= 0: each lane becomes all-ones (true) or all-zeros (false). */
    res0_u32.val[0] = vcgeq_f32(input_val0.val[0], zero_val);
    res0_u32.val[1] = vcgeq_f32(input_val0.val[1], zero_val);
    res1_u32.val[0] = vcgeq_f32(input_val1.val[0], zero_val);
    res1_u32.val[1] = vcgeq_f32(input_val1.val[1], zero_val);

    /* Narrow the 32-bit masks to 16 bits. */
    res0_u16x4.val[0] = vmovn_u32(res0_u32.val[0]);
    res0_u16x4.val[1] = vmovn_u32(res0_u32.val[1]);
    res1_u16x4.val[0] = vmovn_u32(res1_u32.val[0]);
    res1_u16x4.val[1] = vmovn_u32(res1_u32.val[1]);

    /* Gather all 8 even-indexed masks in val[0] and all 8 odd-indexed
     * masks in val[1]. */
    res_u16x8.val[0] = vcombine_u16(res0_u16x4.val[0], res1_u16x4.val[0]);
    res_u16x8.val[1] = vcombine_u16(res0_u16x4.val[1], res1_u16x4.val[1]);

    /* Narrow the 16-bit masks to 8 bits (0xFF or 0x00). */
    res_u8.val[0] = vmovn_u16(res_u16x8.val[0]);
    res_u8.val[1] = vmovn_u16(res_u16x8.val[1]);

    /* Keep only the low bit so each byte is exactly 1 or 0. */
    res_u8.val[0] = vand_u8(one, res_u8.val[0]);
    res_u8.val[1] = vand_u8(one, res_u8.val[1]);

    /* Interleaving store restores the original element order. */
    vst2_u8((unsigned char*)cPtr, res_u8);

    /* Advance to the next group of 16 elements. */
    cPtr += 16;
    aPtr += 16;
  }

  /* Scalar tail for the final num_points % 16 elements. */
  for(number = n16points * 16; number < num_points; number++) {
    if(*aPtr++ >= 0) {
      *cPtr++ = 1;
    }
    else {
      *cPtr++ = 0;
    }
  }
}

#endif /* LV_HAVE_NEON */
/* Cross-reference carried over from the documentation listing:
 * int8_t is a typedef for signed char (Definition: stdint.h:75). */