GNU Radio Manual and C++ API Reference  3.7.7
The Free & Open Software Radio Ecosystem
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
volk_32fc_s32fc_multiply_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 /*!
24  * \page volk_32fc_s32fc_multiply_32fc
25  *
26  * \b Overview
27  *
28  * Multiplies the input complex vector by a complex scalar and returns
29  * the results.
30  *
31  * <b>Dispatcher Prototype</b>
32  * \code
33  * void volk_32fc_s32fc_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points);
34  * \endcode
35  *
36  * \b Inputs
37  * \li aVector: The input vector to be multiplied.
38  * \li scalar: The complex scalar to multiply against aVector.
39  * \li num_points: The number of complex values in aVector.
40  *
41  * \b Outputs
42  * \li cVector: The vector where the results will be stored.
43  *
44  * \b Example
45  * Generate points around the unit circle and shift the phase pi/3 rad.
46  * \code
47  * int N = 10;
48  * unsigned int alignment = volk_get_alignment();
49  * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
50  * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
51  * lv_32fc_t scalar = lv_cmake((float)std::cos(M_PI/3.f), (float)std::sin(M_PI/3.f));
52  *
53  * float delta = 2.f*M_PI / (float)N;
54  * for(unsigned int ii = 0; ii < N/2; ++ii){
55  * // Generate points around the unit circle
56  * float real = std::cos(delta * (float)ii);
57  * float imag = std::sin(delta * (float)ii);
58  * in[ii] = lv_cmake(real, imag);
59  * in[ii+N/2] = lv_cmake(-real, -imag);
60  * }
61  *
62  * volk_32fc_s32fc_multiply_32fc(out, in, scalar, N);
63  *
64  * printf(" mag phase | mag phase\n");
65  * for(unsigned int ii = 0; ii < N; ++ii){
66  * printf("%+1.2f %+1.2f | %+1.2f %+1.2f\n",
67  * std::abs(in[ii]), std::arg(in[ii]),
68  * std::abs(out[ii]), std::arg(out[ii]));
69  * }
70  *
71  * volk_free(in);
72  * volk_free(out);
73  * \endcode
74  */
75 
76 #ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
77 #define INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
78 
79 #include <inttypes.h>
80 #include <stdio.h>
81 #include <volk/volk_complex.h>
82 #include <float.h>
83 
84 #ifdef LV_HAVE_AVX
85 #include <immintrin.h>
86 
87 static inline void volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
88  unsigned int number = 0;
89  unsigned int i = 0;
90  const unsigned int quarterPoints = num_points / 4;
91  unsigned int isodd = num_points & 3;
92  __m256 x, yl, yh, z, tmp1, tmp2;
93  lv_32fc_t* c = cVector;
94  const lv_32fc_t* a = aVector;
95 
96  // Set up constant scalar vector
97  yl = _mm256_set1_ps(lv_creal(scalar));
98  yh = _mm256_set1_ps(lv_cimag(scalar));
99 
100  for(;number < quarterPoints; number++){
101  x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
102 
103  tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
104 
105  x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
106 
107  tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
108 
109  z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
110 
111  _mm256_storeu_ps((float*)c,z); // Store the results back into the C container
112 
113  a += 4;
114  c += 4;
115  }
116 
117  for(i = num_points-isodd; i < num_points; i++) {
118  *c++ = (*a++) * scalar;
119  }
120 
121 }
122 #endif /* LV_HAVE_AVX */
123 
124 #ifdef LV_HAVE_SSE3
125 #include <pmmintrin.h>
126 
127 static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
128  unsigned int number = 0;
129  const unsigned int halfPoints = num_points / 2;
130 
131  __m128 x, yl, yh, z, tmp1, tmp2;
132  lv_32fc_t* c = cVector;
133  const lv_32fc_t* a = aVector;
134 
135  // Set up constant scalar vector
136  yl = _mm_set_ps1(lv_creal(scalar));
137  yh = _mm_set_ps1(lv_cimag(scalar));
138 
139  for(;number < halfPoints; number++){
140 
141  x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
142 
143  tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
144 
145  x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
146 
147  tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
148 
149  z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
150 
151  _mm_storeu_ps((float*)c,z); // Store the results back into the C container
152 
153  a += 2;
154  c += 2;
155  }
156 
157  if((num_points % 2) != 0) {
158  *c = (*a) * scalar;
159  }
160 }
161 #endif /* LV_HAVE_SSE3 */
162 
163 #ifdef LV_HAVE_GENERIC
164 
165 static inline void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
166  lv_32fc_t* cPtr = cVector;
167  const lv_32fc_t* aPtr = aVector;
168  unsigned int number = num_points;
169 
170  // unwrap loop
171  while (number >= 8){
172  *cPtr++ = (*aPtr++) * scalar;
173  *cPtr++ = (*aPtr++) * scalar;
174  *cPtr++ = (*aPtr++) * scalar;
175  *cPtr++ = (*aPtr++) * scalar;
176  *cPtr++ = (*aPtr++) * scalar;
177  *cPtr++ = (*aPtr++) * scalar;
178  *cPtr++ = (*aPtr++) * scalar;
179  *cPtr++ = (*aPtr++) * scalar;
180  number -= 8;
181  }
182 
183  // clean up any remaining
184  while (number-- > 0)
185  *cPtr++ = *aPtr++ * scalar;
186 }
187 #endif /* LV_HAVE_GENERIC */
188 
189 
190 #endif /* INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H */
191 #ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H
192 #define INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H
193 
194 #include <inttypes.h>
195 #include <stdio.h>
196 #include <volk/volk_complex.h>
197 #include <float.h>
198 
199 #ifdef LV_HAVE_AVX
200 #include <immintrin.h>
201 
202 static inline void volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
203  unsigned int number = 0;
204  unsigned int i = 0;
205  const unsigned int quarterPoints = num_points / 4;
206  unsigned int isodd = num_points & 3;
207  __m256 x, yl, yh, z, tmp1, tmp2;
208  lv_32fc_t* c = cVector;
209  const lv_32fc_t* a = aVector;
210 
211  // Set up constant scalar vector
212  yl = _mm256_set1_ps(lv_creal(scalar));
213  yh = _mm256_set1_ps(lv_cimag(scalar));
214 
215  for(;number < quarterPoints; number++){
216  x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
217 
218  tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
219 
220  x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
221 
222  tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
223 
224  z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
225 
226  _mm256_store_ps((float*)c,z); // Store the results back into the C container
227 
228  a += 4;
229  c += 4;
230  }
231 
232  for(i = num_points-isodd; i < num_points; i++) {
233  *c++ = (*a++) * scalar;
234  }
235 
236 }
237 #endif /* LV_HAVE_AVX */
238 
239 #ifdef LV_HAVE_SSE3
240 #include <pmmintrin.h>
241 
242 static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
243  unsigned int number = 0;
244  const unsigned int halfPoints = num_points / 2;
245 
246  __m128 x, yl, yh, z, tmp1, tmp2;
247  lv_32fc_t* c = cVector;
248  const lv_32fc_t* a = aVector;
249 
250  // Set up constant scalar vector
251  yl = _mm_set_ps1(lv_creal(scalar));
252  yh = _mm_set_ps1(lv_cimag(scalar));
253 
254  for(;number < halfPoints; number++){
255 
256  x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
257 
258  tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
259 
260  x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
261 
262  tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
263 
264  z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
265 
266  _mm_store_ps((float*)c,z); // Store the results back into the C container
267 
268  a += 2;
269  c += 2;
270  }
271 
272  if((num_points % 2) != 0) {
273  *c = (*a) * scalar;
274  }
275 }
276 #endif /* LV_HAVE_SSE3 */
277 
278 #ifdef LV_HAVE_NEON
279 #include <arm_neon.h>
280 
281 static inline void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
282  lv_32fc_t* cPtr = cVector;
283  const lv_32fc_t* aPtr = aVector;
284  unsigned int number = num_points;
285  unsigned int quarter_points = num_points / 4;
286 
287  float32x4x2_t a_val, scalar_val;
288  float32x4x2_t tmp_imag;
289 
290  scalar_val = vld2q_f32((const float*)&scalar);
291  for(number = 0; number < quarter_points; ++number) {
292  a_val = vld2q_f32((float*)aPtr);
293  tmp_imag.val[1] = vmulq_f32(a_val.val[1], scalar_val.val[0]);
294  tmp_imag.val[0] = vmulq_f32(a_val.val[0], scalar_val.val[0]);
295 
296  tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], scalar_val.val[1]);
297  tmp_imag.val[0] = vmlaq_f32(tmp_imag.val[0], a_val.val[1], scalar_val.val[1]);
298 
299  vst2q_f32((float*)cVector, tmp_imag);
300  aPtr += 4;
301  cVector += 4;
302  }
303 
304  for(number = quarter_points*4; number < num_points; number++){
305  *cPtr++ = *aPtr++ * scalar;
306  }
307 }
308 #endif /* LV_HAVE_NEON */
309 
310 #ifdef LV_HAVE_GENERIC
311 
312 static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
313  lv_32fc_t* cPtr = cVector;
314  const lv_32fc_t* aPtr = aVector;
315  unsigned int number = num_points;
316 
317  // unwrap loop
318  while (number >= 8){
319  *cPtr++ = (*aPtr++) * scalar;
320  *cPtr++ = (*aPtr++) * scalar;
321  *cPtr++ = (*aPtr++) * scalar;
322  *cPtr++ = (*aPtr++) * scalar;
323  *cPtr++ = (*aPtr++) * scalar;
324  *cPtr++ = (*aPtr++) * scalar;
325  *cPtr++ = (*aPtr++) * scalar;
326  *cPtr++ = (*aPtr++) * scalar;
327  number -= 8;
328  }
329 
330  // clean up any remaining
331  while (number-- > 0)
332  *cPtr++ = *aPtr++ * scalar;
333 }
334 #endif /* LV_HAVE_GENERIC */
335 
336 #endif /* INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H */
float complex lv_32fc_t
Definition: volk_complex.h:56
#define lv_creal(x)
Definition: volk_complex.h:76
#define lv_cimag(x)
Definition: volk_complex.h:78
uint32_t i[4]
Definition: volk_common.h:80