GNU Radio Manual and C++ API Reference  3.7.7
The Free & Open Software Radio Ecosystem
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
volk_32f_binary_slicer_8i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 /*!
24  * \page volk_32f_binary_slicer_8i
25  *
26  * \b Overview
27  *
28  * Slices input floats and returns 1 when the input >= 0 and 0
29  * when < 0. Results are converted to 8-bit chars.
30  *
31  * <b>Dispatcher Prototype</b>
32  * \code
33  * void volk_32f_binary_slicer_8i(int8_t* cVector, const float* aVector, unsigned int num_points)
34  * \endcode
35  *
36  * \b Inputs
37  * \li aVector: The input vector of floats.
38  * \li num_points: The number of data points.
39  *
40  * \b Outputs
41  * \li cVector: The output vector of 8-bit chars.
42  *
43  * \b Example
44  * Generate bytes of a 7-bit barker code from floats.
45  * \code
46  int N = 7;
47  unsigned int alignment = volk_get_alignment();
48  float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
49  int8_t* out = (int8_t*)volk_malloc(sizeof(int8_t)*N, alignment);
50 
51  in[0] = 0.9f;
52  in[1] = 1.1f;
53  in[2] = 0.4f;
54  in[3] = -0.7f;
55  in[4] = -1.2f;
56  in[5] = 0.2f;
57  in[6] = -0.8f;
58 
59  volk_32f_binary_slicer_8i(out, in, N);
60 
61  for(unsigned int ii = 0; ii < N; ++ii){
62  printf("out(%i) = %i\n", ii, out[ii]);
63  }
64 
65  volk_free(in);
66  volk_free(out);
67 
68  * \endcode
69  */
70 
71 #ifndef INCLUDED_volk_32f_binary_slicer_8i_H
72 #define INCLUDED_volk_32f_binary_slicer_8i_H
73 
74 
75 #ifdef LV_HAVE_GENERIC
76 
/*!
 * \brief Portable reference slicer (branching form).
 *
 * Writes 1 to the output for every input sample that compares >= 0 and
 * 0 otherwise.
 *
 * \param cVector    output buffer, receives num_points bytes
 * \param aVector    input buffer of num_points floats
 * \param num_points number of samples to process
 */
static inline void
volk_32f_binary_slicer_8i_generic(int8_t* cVector, const float* aVector,
                                  unsigned int num_points)
{
  const float* src = aVector;
  int8_t* dst = cVector;
  unsigned int remaining;

  for(remaining = num_points; remaining > 0; --remaining) {
    int8_t bit = 0;
    if(*src >= 0) {
      bit = 1;
    }
    *dst = bit;
    ++src;
    ++dst;
  }
}
94 #endif /* LV_HAVE_GENERIC */
95 
96 
97 #ifdef LV_HAVE_GENERIC
98 
/*!
 * \brief Portable slicer that stores the comparison result directly.
 *
 * The value of the expression (sample >= 0) is 1 or 0, so it is written
 * to the output as-is instead of going through an if/else.
 *
 * \param cVector    output buffer, receives num_points bytes
 * \param aVector    input buffer of num_points floats
 * \param num_points number of samples to process
 */
static inline void
volk_32f_binary_slicer_8i_generic_branchless(int8_t* cVector, const float* aVector,
                                             unsigned int num_points)
{
  unsigned int idx;

  for(idx = 0; idx < num_points; ++idx) {
    cVector[idx] = (aVector[idx] >= 0);
  }
}
111 #endif /* LV_HAVE_GENERIC */
112 
113 
114 #ifdef LV_HAVE_SSE2
115 #include <emmintrin.h>
116 
/*!
 * \brief SSE2 slicer for 16-byte aligned buffers.
 *
 * Processes 16 floats per iteration and writes one byte (1 if the float
 * is >= 0, else 0) per input sample; any remaining samples are handled
 * by a scalar tail loop.
 *
 * The vector path uses _mm_load_ps / _mm_store_si128, so both aVector
 * and cVector must be 16-byte aligned.
 *
 * \param cVector    output buffer, receives num_points bytes
 * \param aVector    input buffer of num_points floats
 * \param num_points number of samples to process
 */
static inline void
volk_32f_binary_slicer_8i_a_sse2(int8_t* cVector, const float* aVector,
                                 unsigned int num_points)
{
  int8_t* cPtr = cVector;
  const float* aPtr = aVector;
  unsigned int number = 0;

  unsigned int n16points = num_points / 16;
  __m128 a0_val, a1_val, a2_val, a3_val;
  __m128 res0_f, res1_f, res2_f, res3_f;
  __m128i res0_i, res1_i, res2_i, res3_i;
  __m128 zero_val;
  zero_val = _mm_set1_ps(0.0f);

  for(number = 0; number < n16points; number++) {
    a0_val = _mm_load_ps(aPtr);
    a1_val = _mm_load_ps(aPtr+4);
    a2_val = _mm_load_ps(aPtr+8);
    a3_val = _mm_load_ps(aPtr+12);

    // compare >= 0; each lane becomes all-ones (true) or all-zeros (false)
    res0_f = _mm_cmpge_ps(a0_val, zero_val);
    res1_f = _mm_cmpge_ps(a1_val, zero_val);
    res2_f = _mm_cmpge_ps(a2_val, zero_val);
    res3_f = _mm_cmpge_ps(a3_val, zero_val);

    // Reinterpret the mask bits as 32-bit integers and >> 31 so that a
    // true lane becomes 1. A bit-cast is used rather than a float->int
    // conversion: the all-ones mask is a NaN bit pattern, and converting
    // it would raise the SSE invalid-operation exception.
    res0_i = _mm_srli_epi32(_mm_castps_si128(res0_f), 31);
    res1_i = _mm_srli_epi32(_mm_castps_si128(res1_f), 31);
    res2_i = _mm_srli_epi32(_mm_castps_si128(res2_f), 31);
    res3_i = _mm_srli_epi32(_mm_castps_si128(res3_f), 31);

    // pack into 16-bit results
    res0_i = _mm_packs_epi32(res0_i, res1_i);
    res2_i = _mm_packs_epi32(res2_i, res3_i);

    // pack into 8-bit results
    res0_i = _mm_packs_epi16(res0_i, res2_i);

    _mm_store_si128((__m128i*)cPtr, res0_i);

    cPtr += 16;
    aPtr += 16;
  }

  // scalar tail for the last (num_points % 16) samples
  for(number = n16points * 16; number < num_points; number++) {
    if( *aPtr++ >= 0) {
      *cPtr++ = 1;
    }
    else {
      *cPtr++ = 0;
    }
  }
}
172 #endif /* LV_HAVE_SSE2 */
173 
174 
175 
176 #ifdef LV_HAVE_SSE2
177 #include <emmintrin.h>
178 
/*!
 * \brief SSE2 slicer for buffers with no alignment requirement.
 *
 * Processes 16 floats per iteration and writes one byte (1 if the float
 * is >= 0, else 0) per input sample; any remaining samples are handled
 * by a scalar tail loop. Unaligned loads and stores (_mm_loadu_ps /
 * _mm_storeu_si128) are used throughout the vector path.
 *
 * \param cVector    output buffer, receives num_points bytes
 * \param aVector    input buffer of num_points floats
 * \param num_points number of samples to process
 */
static inline void
volk_32f_binary_slicer_8i_u_sse2(int8_t* cVector, const float* aVector,
                                 unsigned int num_points)
{
  int8_t* cPtr = cVector;
  const float* aPtr = aVector;
  unsigned int number = 0;

  unsigned int n16points = num_points / 16;
  __m128 a0_val, a1_val, a2_val, a3_val;
  __m128 res0_f, res1_f, res2_f, res3_f;
  __m128i res0_i, res1_i, res2_i, res3_i;
  __m128 zero_val;
  zero_val = _mm_set1_ps (0.0f);

  for(number = 0; number < n16points; number++) {
    a0_val = _mm_loadu_ps(aPtr);
    a1_val = _mm_loadu_ps(aPtr+4);
    a2_val = _mm_loadu_ps(aPtr+8);
    a3_val = _mm_loadu_ps(aPtr+12);

    // compare >= 0; each lane becomes all-ones (true) or all-zeros (false)
    res0_f = _mm_cmpge_ps(a0_val, zero_val);
    res1_f = _mm_cmpge_ps(a1_val, zero_val);
    res2_f = _mm_cmpge_ps(a2_val, zero_val);
    res3_f = _mm_cmpge_ps(a3_val, zero_val);

    // Reinterpret the mask bits as 32-bit integers and >> 31 so that a
    // true lane becomes 1. A bit-cast is used rather than a float->int
    // conversion: the all-ones mask is a NaN bit pattern, and converting
    // it would raise the SSE invalid-operation exception.
    res0_i = _mm_srli_epi32(_mm_castps_si128(res0_f), 31);
    res1_i = _mm_srli_epi32(_mm_castps_si128(res1_f), 31);
    res2_i = _mm_srli_epi32(_mm_castps_si128(res2_f), 31);
    res3_i = _mm_srli_epi32(_mm_castps_si128(res3_f), 31);

    // pack into 16-bit results
    res0_i = _mm_packs_epi32(res0_i, res1_i);
    res2_i = _mm_packs_epi32(res2_i, res3_i);

    // pack into 8-bit results
    res0_i = _mm_packs_epi16(res0_i, res2_i);

    _mm_storeu_si128((__m128i*)cPtr, res0_i);

    cPtr += 16;
    aPtr += 16;
  }

  // scalar tail for the last (num_points % 16) samples
  for(number = n16points * 16; number < num_points; number++) {
    if( *aPtr++ >= 0) {
      *cPtr++ = 1;
    }
    else {
      *cPtr++ = 0;
    }
  }
}
234 #endif /* LV_HAVE_SSE2 */
235 
236 
237 #ifdef LV_HAVE_NEON
238 #include <arm_neon.h>
239 
/*!
 * \brief NEON slicer.
 *
 * Processes 16 floats per iteration and writes one byte (1 if the float
 * is >= 0, else 0) per input sample; any remaining samples are handled
 * by a scalar tail loop.
 *
 * \param cVector    output buffer, receives num_points bytes
 * \param aVector    input buffer of num_points floats
 * \param num_points number of samples to process
 */
static inline void
volk_32f_binary_slicer_8i_neon(int8_t* cVector, const float* aVector,
                               unsigned int num_points)
{
  const unsigned int blocks = num_points / 16;
  const float* src = aVector;
  int8_t* dst = cVector;
  unsigned int i;

  float32x4x2_t in_lo, in_hi;
  uint16x4_t lo_v0, lo_v1, hi_v0, hi_v1;
  uint8x8x2_t packed;

  const float32x4_t zeros = vdupq_n_f32(0.0);
  const uint8x8_t lsb_mask = vdup_n_u8(0x01);

  // TODO: this is a good candidate for asm because the vcombines
  // can be eliminated simply by picking dst registers that are
  // adjacent.
  for(i = 0; i < blocks; i++) {
    // 2-way de-interleaving loads of 8 floats each; the matching
    // interleaving store below puts the results back in sample order
    in_lo = vld2q_f32(src);
    in_hi = vld2q_f32(src + 8);

    // test against 0 (all-ones / all-zeros per lane), then narrow
    // uint32 -> uint16
    lo_v0 = vmovn_u32(vcgeq_f32(in_lo.val[0], zeros));
    lo_v1 = vmovn_u32(vcgeq_f32(in_lo.val[1], zeros));
    hi_v0 = vmovn_u32(vcgeq_f32(in_hi.val[0], zeros));
    hi_v1 = vmovn_u32(vcgeq_f32(in_hi.val[1], zeros));

    // combine to 8-element vectors, narrow uint16x8 -> uint8x8 and keep
    // only the low bit so a true lane becomes 1.
    // we *could* load twice as much data and do another vcombine here
    // to get a uint8x16x2 vector, still only do 2 vandqs and a single store
    // but that turns out to be ~16% slower than this version on zc702
    // it's possible register contention in GCC scheduler slows it down
    // and a hand-written asm with quad-word u8 registers is much faster.
    packed.val[0] = vand_u8(lsb_mask, vmovn_u16(vcombine_u16(lo_v0, hi_v0)));
    packed.val[1] = vand_u8(lsb_mask, vmovn_u16(vcombine_u16(lo_v1, hi_v1)));

    vst2_u8((unsigned char*)dst, packed);
    dst += 16;
    src += 16;
  }

  // scalar tail for the last (num_points % 16) samples
  for(i = blocks * 16; i < num_points; i++) {
    if(*src++ >= 0) {
      *dst++ = 1;
    }
    else {
      *dst++ = 0;
    }
  }
}
309 #endif /* LV_HAVE_NEON */
310 
311 
312 #endif /* INCLUDED_volk_32f_binary_slicer_8i_H */
signed char int8_t
Definition: stdint.h:75