GNU Radio Manual and C++ API Reference  3.7.7
The Free & Open Software Radio Ecosystem
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
volk_16i_32fc_dot_prod_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 /*!
24  * \page volk_16i_32fc_dot_prod_32fc
25  *
26  * \b Overview
27  *
28  * This block computes the dot product (or inner product) between two
29  * vectors, the \p input and \p taps vectors. Given a set of \p
30  * num_points taps, the result is the sum of products between the two
31  * vectors. The result is a single value stored in the \p result
32  * address and will be complex.
33  *
34  * <b>Dispatcher Prototype</b>
35  * \code
36  * void volk_16i_32fc_dot_prod_32fc(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points)
37  * \endcode
38  *
39  * \b Inputs
40  * \li input: vector of shorts.
41  * \li taps: complex taps.
42  * \li num_points: number of samples in both \p input and \p taps.
43  *
44  * \b Outputs
45  * \li result: pointer to a complex value to hold the dot product result.
46  *
47  * \b Example
48  * \code
49  * int N = 10000;
50  *
51  * <FIXME>
52  *
53  * volk_16i_32fc_dot_prod_32fc();
54  *
55  * \endcode
56  */
57 
58 #ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_H
59 #define INCLUDED_volk_16i_32fc_dot_prod_32fc_H
60 
61 #include <volk/volk_common.h>
62 #include <stdio.h>
63 
64 
65 #ifdef LV_HAVE_GENERIC
66 
67 static inline void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) {
68 
69  static const int N_UNROLL = 4;
70 
71  lv_32fc_t acc0 = 0;
72  lv_32fc_t acc1 = 0;
73  lv_32fc_t acc2 = 0;
74  lv_32fc_t acc3 = 0;
75 
76  unsigned i = 0;
77  unsigned n = (num_points / N_UNROLL) * N_UNROLL;
78 
79  for(i = 0; i < n; i += N_UNROLL) {
80  acc0 += taps[i + 0] * (float)input[i + 0];
81  acc1 += taps[i + 1] * (float)input[i + 1];
82  acc2 += taps[i + 2] * (float)input[i + 2];
83  acc3 += taps[i + 3] * (float)input[i + 3];
84  }
85 
86  for(; i < num_points; i++) {
87  acc0 += taps[i] * (float)input[i];
88  }
89 
90  *result = acc0 + acc1 + acc2 + acc3;
91 }
92 
93 #endif /*LV_HAVE_GENERIC*/
94 
95 #ifdef LV_HAVE_NEON
96 #include <arm_neon.h>
97 static inline void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) {
98 
99  unsigned ii;
100  unsigned quarter_points = num_points / 4;
101  lv_32fc_t* tapsPtr = (lv_32fc_t*) taps;
102  short* inputPtr = (short*) input;
103  lv_32fc_t accumulator_vec[4];
104 
105  float32x4x2_t tapsVal, accumulator_val;
106  int16x4_t input16;
107  int32x4_t input32;
108  float32x4_t input_float, prod_re, prod_im;
109 
110  accumulator_val.val[0] = vdupq_n_f32(0.0);
111  accumulator_val.val[1] = vdupq_n_f32(0.0);
112 
113  for(ii = 0; ii < quarter_points; ++ii) {
114  tapsVal = vld2q_f32((float*)tapsPtr);
115  input16 = vld1_s16(inputPtr);
116  // widen 16-bit int to 32-bit int
117  input32 = vmovl_s16(input16);
118  // convert 32-bit int to float with scale
119  input_float = vcvtq_f32_s32(input32);
120 
121  prod_re = vmulq_f32(input_float, tapsVal.val[0]);
122  prod_im = vmulq_f32(input_float, tapsVal.val[1]);
123 
124  accumulator_val.val[0] = vaddq_f32(prod_re, accumulator_val.val[0]);
125  accumulator_val.val[1] = vaddq_f32(prod_im, accumulator_val.val[1]);
126 
127  tapsPtr += 4;
128  inputPtr += 4;
129  }
130  vst2q_f32((float*)accumulator_vec, accumulator_val);
131  accumulator_vec[0] += accumulator_vec[1];
132  accumulator_vec[2] += accumulator_vec[3];
133  accumulator_vec[0] += accumulator_vec[2];
134 
135  for(ii = quarter_points * 4; ii < num_points; ++ii) {
136  accumulator_vec[0] += *(tapsPtr++) * (float)(*(inputPtr++));
137  }
138 
139  *result = accumulator_vec[0];
140 }
141 
142 #endif /*LV_HAVE_NEON*/
143 
144 #if LV_HAVE_SSE && LV_HAVE_MMX
145 
146 static inline void volk_16i_32fc_dot_prod_32fc_u_sse( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) {
147 
148  unsigned int number = 0;
149  const unsigned int sixteenthPoints = num_points / 8;
150 
151  float res[2];
152  float *realpt = &res[0], *imagpt = &res[1];
153  const short* aPtr = input;
154  const float* bPtr = (float*)taps;
155 
156  __m64 m0, m1;
157  __m128 f0, f1, f2, f3;
158  __m128 a0Val, a1Val, a2Val, a3Val;
159  __m128 b0Val, b1Val, b2Val, b3Val;
160  __m128 c0Val, c1Val, c2Val, c3Val;
161 
162  __m128 dotProdVal0 = _mm_setzero_ps();
163  __m128 dotProdVal1 = _mm_setzero_ps();
164  __m128 dotProdVal2 = _mm_setzero_ps();
165  __m128 dotProdVal3 = _mm_setzero_ps();
166 
167  for(;number < sixteenthPoints; number++){
168 
169  m0 = _mm_set_pi16(*(aPtr+3), *(aPtr+2), *(aPtr+1), *(aPtr+0));
170  m1 = _mm_set_pi16(*(aPtr+7), *(aPtr+6), *(aPtr+5), *(aPtr+4));
171  f0 = _mm_cvtpi16_ps(m0);
172  f1 = _mm_cvtpi16_ps(m0);
173  f2 = _mm_cvtpi16_ps(m1);
174  f3 = _mm_cvtpi16_ps(m1);
175 
176  a0Val = _mm_unpacklo_ps(f0, f1);
177  a1Val = _mm_unpackhi_ps(f0, f1);
178  a2Val = _mm_unpacklo_ps(f2, f3);
179  a3Val = _mm_unpackhi_ps(f2, f3);
180 
181  b0Val = _mm_loadu_ps(bPtr);
182  b1Val = _mm_loadu_ps(bPtr+4);
183  b2Val = _mm_loadu_ps(bPtr+8);
184  b3Val = _mm_loadu_ps(bPtr+12);
185 
186  c0Val = _mm_mul_ps(a0Val, b0Val);
187  c1Val = _mm_mul_ps(a1Val, b1Val);
188  c2Val = _mm_mul_ps(a2Val, b2Val);
189  c3Val = _mm_mul_ps(a3Val, b3Val);
190 
191  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
192  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
193  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
194  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
195 
196  aPtr += 8;
197  bPtr += 16;
198  }
199 
200  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
201  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
202  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
203 
204  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
205 
206  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
207 
208  *realpt = dotProductVector[0];
209  *imagpt = dotProductVector[1];
210  *realpt += dotProductVector[2];
211  *imagpt += dotProductVector[3];
212 
213  number = sixteenthPoints*8;
214  for(;number < num_points; number++){
215  *realpt += ((*aPtr) * (*bPtr++));
216  *imagpt += ((*aPtr++) * (*bPtr++));
217  }
218 
219  *result = *(lv_32fc_t*)(&res[0]);
220 }
221 
222 #endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
223 
224 
225 
226 
227 #if LV_HAVE_SSE && LV_HAVE_MMX
228 
229 
230 static inline void volk_16i_32fc_dot_prod_32fc_a_sse( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) {
231 
232  unsigned int number = 0;
233  const unsigned int sixteenthPoints = num_points / 8;
234 
235  float res[2];
236  float *realpt = &res[0], *imagpt = &res[1];
237  const short* aPtr = input;
238  const float* bPtr = (float*)taps;
239 
240  __m64 m0, m1;
241  __m128 f0, f1, f2, f3;
242  __m128 a0Val, a1Val, a2Val, a3Val;
243  __m128 b0Val, b1Val, b2Val, b3Val;
244  __m128 c0Val, c1Val, c2Val, c3Val;
245 
246  __m128 dotProdVal0 = _mm_setzero_ps();
247  __m128 dotProdVal1 = _mm_setzero_ps();
248  __m128 dotProdVal2 = _mm_setzero_ps();
249  __m128 dotProdVal3 = _mm_setzero_ps();
250 
251  for(;number < sixteenthPoints; number++){
252 
253  m0 = _mm_set_pi16(*(aPtr+3), *(aPtr+2), *(aPtr+1), *(aPtr+0));
254  m1 = _mm_set_pi16(*(aPtr+7), *(aPtr+6), *(aPtr+5), *(aPtr+4));
255  f0 = _mm_cvtpi16_ps(m0);
256  f1 = _mm_cvtpi16_ps(m0);
257  f2 = _mm_cvtpi16_ps(m1);
258  f3 = _mm_cvtpi16_ps(m1);
259 
260  a0Val = _mm_unpacklo_ps(f0, f1);
261  a1Val = _mm_unpackhi_ps(f0, f1);
262  a2Val = _mm_unpacklo_ps(f2, f3);
263  a3Val = _mm_unpackhi_ps(f2, f3);
264 
265  b0Val = _mm_load_ps(bPtr);
266  b1Val = _mm_load_ps(bPtr+4);
267  b2Val = _mm_load_ps(bPtr+8);
268  b3Val = _mm_load_ps(bPtr+12);
269 
270  c0Val = _mm_mul_ps(a0Val, b0Val);
271  c1Val = _mm_mul_ps(a1Val, b1Val);
272  c2Val = _mm_mul_ps(a2Val, b2Val);
273  c3Val = _mm_mul_ps(a3Val, b3Val);
274 
275  dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
276  dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
277  dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
278  dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
279 
280  aPtr += 8;
281  bPtr += 16;
282  }
283 
284  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
285  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
286  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
287 
288  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
289 
290  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
291 
292  *realpt = dotProductVector[0];
293  *imagpt = dotProductVector[1];
294  *realpt += dotProductVector[2];
295  *imagpt += dotProductVector[3];
296 
297  number = sixteenthPoints*8;
298  for(;number < num_points; number++){
299  *realpt += ((*aPtr) * (*bPtr++));
300  *imagpt += ((*aPtr++) * (*bPtr++));
301  }
302 
303  *result = *(lv_32fc_t*)(&res[0]);
304 }
305 
306 #endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
307 
308 
309 #endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_H*/
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:27
static const float taps[NSTEPS+1][NTAPS]
Definition: interpolator_taps.h:9
float complex lv_32fc_t
Definition: volk_complex.h:56