GNU Radio Manual and C++ API Reference  3.7.7
The Free & Open Software Radio Ecosystem
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
volk_32f_x2_dot_prod_16i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 /*!
24  * \page volk_32f_x2_dot_prod_16i
25  *
26  * \b Overview
27  *
28  * This block computes the dot product (or inner product) between two
29  * vectors, the \p input and \p taps vectors. Given a set of \p
30  * num_points taps, the result is the sum of products between the two
31  * vectors. The result is a single value stored in the \p result
32  * address and is converted to a fixed-point short.
33  *
34  * <b>Dispatcher Prototype</b>
35  * \code
36  * void volk_32f_x2_dot_prod_16i(int16_t* result, const float* input, const float* taps, unsigned int num_points)
37  * \endcode
38  *
39  * \b Inputs
40  * \li input: vector of floats.
41  * \li taps: float taps.
42  * \li num_points: number of samples in both \p input and \p taps.
43  *
44  * \b Outputs
45  * \li result: pointer to a short value to hold the dot product result.
46  *
47  * \b Example
48  * \code
49  * int N = 10000;
50  *
51  * <FIXME>
52  *
53  * volk_32f_x2_dot_prod_16i();
54  *
55  * \endcode
56  */
57 
58 #ifndef INCLUDED_volk_32f_x2_dot_prod_16i_H
59 #define INCLUDED_volk_32f_x2_dot_prod_16i_H
60 
61 #include <volk/volk_common.h>
62 #include <stdio.h>
63 
64 
65 #ifdef LV_HAVE_GENERIC
66 
67 
/*!
 * \brief Portable C dot product of two float vectors, returned as a 16-bit integer.
 *
 * Accumulates input[i] * taps[i] for i in [0, num_points) in a single-precision
 * accumulator and converts the sum to int16_t by truncation toward zero.
 *
 * A plain float-to-integer cast is undefined behavior in C when the value does
 * not fit the destination type, so the conversion saturates instead: sums at or
 * above 32767 yield 32767, sums at or below -32768 yield -32768, and a NaN sum
 * yields 0. Sums that fit in int16_t convert exactly as a plain cast would.
 *
 * \param result     where the converted dot product is written.
 * \param input      vector of floats (num_points elements are read).
 * \param taps       float taps (num_points elements are read).
 * \param num_points number of samples in both \p input and \p taps; 0 gives a result of 0.
 */
static inline void volk_32f_x2_dot_prod_16i_generic(int16_t* result, const float* input, const float* taps, unsigned int num_points) {

  float dotProduct = 0;
  unsigned int number;

  for(number = 0; number < num_points; number++){
    dotProduct += input[number] * taps[number];
  }

  /* Saturating conversion: the unchecked cast is only reached for values
   * whose truncation is representable in int16_t. */
  if(dotProduct >= 32767.0f){
    *result = (int16_t)32767;
  }
  else if(dotProduct <= -32768.0f){
    *result = (int16_t)(-32768);
  }
  else if(dotProduct != dotProduct){
    /* NaN compares unequal to itself; there is no meaningful integer for it. */
    *result = 0;
  }
  else{
    *result = (int16_t)dotProduct;
  }
}
81 
82 #endif /*LV_HAVE_GENERIC*/
83 
84 
85 #ifdef LV_HAVE_SSE
86 
/*!
 * \brief SSE dot product of two float vectors, returned as a 16-bit integer (aligned loads).
 *
 * Processes 16 floats per iteration using four independent 4-lane accumulators,
 * folds the accumulators together, sums the four lanes in order, and then
 * handles the remaining (num_points % 16) samples with scalar code.
 *
 * The vector loop uses _mm_load_ps, so \p input and \p taps must be 16-byte
 * aligned whenever num_points >= 16.
 *
 * The final sum is converted to int16_t by truncation toward zero. Because an
 * out-of-range float-to-integer cast is undefined behavior in C, the conversion
 * saturates: sums at or above 32767 yield 32767, sums at or below -32768 yield
 * -32768, and a NaN sum yields 0.
 *
 * \param result     where the converted dot product is written.
 * \param input      vector of floats (num_points elements are read).
 * \param taps       float taps (num_points elements are read).
 * \param num_points number of samples in both \p input and \p taps.
 */
static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result, const float* input, const float* taps, unsigned int num_points) {

  const unsigned int sixteenthPoints = num_points / 16;
  unsigned int number;

  const float* aPtr = input;
  const float* bPtr = taps;

  /* Four accumulators keep the multiply/add chains independent. */
  __m128 dotProdVal0 = _mm_setzero_ps();
  __m128 dotProdVal1 = _mm_setzero_ps();
  __m128 dotProdVal2 = _mm_setzero_ps();
  __m128 dotProdVal3 = _mm_setzero_ps();

  for(number = 0; number < sixteenthPoints; number++){
    const __m128 c0Val = _mm_mul_ps(_mm_load_ps(aPtr),      _mm_load_ps(bPtr));
    const __m128 c1Val = _mm_mul_ps(_mm_load_ps(aPtr + 4),  _mm_load_ps(bPtr + 4));
    const __m128 c2Val = _mm_mul_ps(_mm_load_ps(aPtr + 8),  _mm_load_ps(bPtr + 8));
    const __m128 c3Val = _mm_mul_ps(_mm_load_ps(aPtr + 12), _mm_load_ps(bPtr + 12));

    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);

    aPtr += 16;
    bPtr += 16;
  }

  /* Fold the four accumulators into one vector. */
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

  /* Unaligned store: the scratch array then needs no alignment attribute,
   * so this cannot fault if the compiler does not over-align it. */
  float dotProductVector[4];
  _mm_storeu_ps(dotProductVector, dotProdVal0);

  float dotProduct = dotProductVector[0];
  dotProduct += dotProductVector[1];
  dotProduct += dotProductVector[2];
  dotProduct += dotProductVector[3];

  /* Scalar tail for the samples that did not fill a block of 16. */
  for(number = sixteenthPoints * 16; number < num_points; number++){
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  /* Saturating conversion: the unchecked cast is only reached for values
   * whose truncation is representable in int16_t. */
  if(dotProduct >= 32767.0f){
    *result = (int16_t)32767;
  }
  else if(dotProduct <= -32768.0f){
    *result = (int16_t)(-32768);
  }
  else if(dotProduct != dotProduct){
    /* NaN compares unequal to itself; there is no meaningful integer for it. */
    *result = 0;
  }
  else{
    *result = (int16_t)dotProduct;
  }
}
150 
151 #endif /*LV_HAVE_SSE*/
152 
153 
154 #ifdef LV_HAVE_SSE
155 
/*!
 * \brief SSE dot product of two float vectors, returned as a 16-bit integer (unaligned loads).
 *
 * Processes 16 floats per iteration using four independent 4-lane accumulators,
 * folds the accumulators together, sums the four lanes in order, and then
 * handles the remaining (num_points % 16) samples with scalar code.
 *
 * All vector memory accesses use the unaligned intrinsics (_mm_loadu_ps /
 * _mm_storeu_ps), so neither \p input nor \p taps needs 16-byte alignment.
 *
 * The final sum is converted to int16_t by truncation toward zero. Because an
 * out-of-range float-to-integer cast is undefined behavior in C, the conversion
 * saturates: sums at or above 32767 yield 32767, sums at or below -32768 yield
 * -32768, and a NaN sum yields 0.
 *
 * \param result     where the converted dot product is written.
 * \param input      vector of floats (num_points elements are read).
 * \param taps       float taps (num_points elements are read).
 * \param num_points number of samples in both \p input and \p taps.
 */
static inline void volk_32f_x2_dot_prod_16i_u_sse(int16_t* result, const float* input, const float* taps, unsigned int num_points) {

  const unsigned int sixteenthPoints = num_points / 16;
  unsigned int number;

  const float* aPtr = input;
  const float* bPtr = taps;

  /* Four accumulators keep the multiply/add chains independent. */
  __m128 dotProdVal0 = _mm_setzero_ps();
  __m128 dotProdVal1 = _mm_setzero_ps();
  __m128 dotProdVal2 = _mm_setzero_ps();
  __m128 dotProdVal3 = _mm_setzero_ps();

  for(number = 0; number < sixteenthPoints; number++){
    const __m128 c0Val = _mm_mul_ps(_mm_loadu_ps(aPtr),      _mm_loadu_ps(bPtr));
    const __m128 c1Val = _mm_mul_ps(_mm_loadu_ps(aPtr + 4),  _mm_loadu_ps(bPtr + 4));
    const __m128 c2Val = _mm_mul_ps(_mm_loadu_ps(aPtr + 8),  _mm_loadu_ps(bPtr + 8));
    const __m128 c3Val = _mm_mul_ps(_mm_loadu_ps(aPtr + 12), _mm_loadu_ps(bPtr + 12));

    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);

    aPtr += 16;
    bPtr += 16;
  }

  /* Fold the four accumulators into one vector. */
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

  /* Unaligned store: the scratch array then needs no alignment attribute,
   * so this cannot fault if the compiler does not over-align it. */
  float dotProductVector[4];
  _mm_storeu_ps(dotProductVector, dotProdVal0);

  float dotProduct = dotProductVector[0];
  dotProduct += dotProductVector[1];
  dotProduct += dotProductVector[2];
  dotProduct += dotProductVector[3];

  /* Scalar tail for the samples that did not fill a block of 16. */
  for(number = sixteenthPoints * 16; number < num_points; number++){
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  /* Saturating conversion: the unchecked cast is only reached for values
   * whose truncation is representable in int16_t. */
  if(dotProduct >= 32767.0f){
    *result = (int16_t)32767;
  }
  else if(dotProduct <= -32768.0f){
    *result = (int16_t)(-32768);
  }
  else if(dotProduct != dotProduct){
    /* NaN compares unequal to itself; there is no meaningful integer for it. */
    *result = 0;
  }
  else{
    *result = (int16_t)dotProduct;
  }
}
219 
220 #endif /*LV_HAVE_SSE*/
221 
222 #endif /*INCLUDED_volk_32f_x2_dot_prod_16i_H*/
signed short int16_t
Definition: stdint.h:76
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:27
static const float taps[NSTEPS+1][NTAPS]
Definition: interpolator_taps.h:9