GNU Radio Manual and C++ API Reference  3.7.7
The Free & Open Software Radio Ecosystem
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
volk_32fc_x2_multiply_conjugate_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 /*!
24  * \page volk_32fc_x2_multiply_conjugate_32fc
25  *
26  * \b Overview
27  *
28  * Multiplies a complex vector by the conjugate of a secod complex
29  * vector and returns the complex result.
30  *
31  * <b>Dispatcher Prototype</b>
32  * \code
33  * void volk_32fc_x2_multiply_conjugate_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points);
34  * \endcode
35  *
36  * \b Inputs
37  * \li aVector: The first input vector of complex floats.
38  * \li bVector: The second input vector of complex floats that is conjugated.
39  * \li num_points: The number of data points.
40  *
41  * \b Outputs
42  * \li outputVector: The output vector complex floats.
43  *
44  * \b Example
45  * Calculate mag^2 of a signal using x * conj(x).
46  * \code
47  * int N = 10;
48  * unsigned int alignment = volk_get_alignment();
49  * lv_32fc_t* sig_1 = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
50  * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
51  *
52  * float delta = 2.f*M_PI / (float)N;
53  * for(unsigned int ii = 0; ii < N; ++ii){
54  * float real_1 = std::cos(0.3f * (float)ii);
55  * float imag_1 = std::sin(0.3f * (float)ii);
56  * sig_1[ii] = lv_cmake(real_1, imag_1);
57  * }
58  *
59  * volk_32fc_x2_multiply_conjugate_32fc(out, sig_1, sig_1, N);
60  *
61  * for(unsigned int ii = 0; ii < N; ++ii){
62  * printf("%1.4f%+1.4fj,", lv_creal(out[ii]), lv_cimag(out[ii]));
63  * }
64  * printf("\n");
65  *
66  * volk_free(sig_1);
67  * volk_free(out);
68  * \endcode
69  */
70 
71 #ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
72 #define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
73 
74 #include <inttypes.h>
75 #include <stdio.h>
76 #include <volk/volk_complex.h>
77 #include <float.h>
78 
79 #ifdef LV_HAVE_AVX
80 #include <immintrin.h>
81 
82 static inline void
83 volk_32fc_x2_multiply_conjugate_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
84  const lv_32fc_t* bVector, unsigned int num_points)
85 {
86  unsigned int number = 0;
87  const unsigned int quarterPoints = num_points / 4;
88 
89  __m256 x, y, yl, yh, z, tmp1, tmp2;
90  lv_32fc_t* c = cVector;
91  const lv_32fc_t* a = aVector;
92  const lv_32fc_t* b = bVector;
93 
94  __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
95 
96  for(;number < quarterPoints; number++){
97 
98  x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
99  y = _mm256_loadu_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
100 
101  y = _mm256_xor_ps(y, conjugator); // conjugate y
102 
103  yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ...
104  yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ...
105 
106  tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
107 
108  x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ...
109 
110  tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
111 
112  z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ...
113 
114  _mm256_storeu_ps((float*)c,z); // Store the results back into the C container
115 
116  a += 4;
117  b += 4;
118  c += 4;
119  }
120 
121  number = quarterPoints * 4;
122 
123  for(; number < num_points; number++) {
124  *c++ = (*a++) * lv_conj(*b++);
125  }
126 }
127 #endif /* LV_HAVE_AVX */
128 
129 
130 #ifdef LV_HAVE_SSE3
131 #include <pmmintrin.h>
132 
133 static inline void
134 volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector,
135  const lv_32fc_t* bVector, unsigned int num_points)
136 {
137  unsigned int number = 0;
138  const unsigned int halfPoints = num_points / 2;
139 
140  __m128 x, y, yl, yh, z, tmp1, tmp2;
141  lv_32fc_t* c = cVector;
142  const lv_32fc_t* a = aVector;
143  const lv_32fc_t* b = bVector;
144 
145  __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
146 
147  for(;number < halfPoints; number++){
148 
149  x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
150  y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
151 
152  y = _mm_xor_ps(y, conjugator); // conjugate y
153 
154  yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
155  yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
156 
157  tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
158 
159  x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
160 
161  tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
162 
163  z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
164 
165  _mm_storeu_ps((float*)c,z); // Store the results back into the C container
166 
167  a += 2;
168  b += 2;
169  c += 2;
170  }
171 
172  if((num_points % 2) != 0) {
173  *c = (*a) * lv_conj(*b);
174  }
175 }
176 #endif /* LV_HAVE_SSE */
177 
178 
179 #ifdef LV_HAVE_GENERIC
180 
181 static inline void
182 volk_32fc_x2_multiply_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
183  const lv_32fc_t* bVector, unsigned int num_points)
184 {
185  lv_32fc_t* cPtr = cVector;
186  const lv_32fc_t* aPtr = aVector;
187  const lv_32fc_t* bPtr= bVector;
188  unsigned int number = 0;
189 
190  for(number = 0; number < num_points; number++){
191  *cPtr++ = (*aPtr++) * lv_conj(*bPtr++);
192  }
193 }
194 #endif /* LV_HAVE_GENERIC */
195 
196 
197 
198 #endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H */
199 #ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
200 #define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
201 
202 #include <inttypes.h>
203 #include <stdio.h>
204 #include <volk/volk_complex.h>
205 #include <float.h>
206 
207 #ifdef LV_HAVE_AVX
208 #include <immintrin.h>
209 
210 static inline void
211 volk_32fc_x2_multiply_conjugate_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
212  const lv_32fc_t* bVector, unsigned int num_points)
213 {
214  unsigned int number = 0;
215  const unsigned int quarterPoints = num_points / 4;
216 
217  __m256 x, y, yl, yh, z, tmp1, tmp2;
218  lv_32fc_t* c = cVector;
219  const lv_32fc_t* a = aVector;
220  const lv_32fc_t* b = bVector;
221 
222  __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
223 
224  for(;number < quarterPoints; number++){
225 
226  x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
227  y = _mm256_load_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
228 
229  y = _mm256_xor_ps(y, conjugator); // conjugate y
230 
231  yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ...
232  yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ...
233 
234  tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
235 
236  x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ...
237 
238  tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
239 
240  z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ...
241 
242  _mm256_store_ps((float*)c,z); // Store the results back into the C container
243 
244  a += 4;
245  b += 4;
246  c += 4;
247  }
248 
249  number = quarterPoints * 4;
250 
251  for(; number < num_points; number++) {
252  *c++ = (*a++) * lv_conj(*b++);
253  }
254 }
255 #endif /* LV_HAVE_AVX */
256 
257 
258 #ifdef LV_HAVE_SSE3
259 #include <pmmintrin.h>
260 
261 static inline void
262 volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector,
263  const lv_32fc_t* bVector, unsigned int num_points)
264 {
265  unsigned int number = 0;
266  const unsigned int halfPoints = num_points / 2;
267 
268  __m128 x, y, yl, yh, z, tmp1, tmp2;
269  lv_32fc_t* c = cVector;
270  const lv_32fc_t* a = aVector;
271  const lv_32fc_t* b = bVector;
272 
273  __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
274 
275  for(;number < halfPoints; number++){
276 
277  x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
278  y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
279 
280  y = _mm_xor_ps(y, conjugator); // conjugate y
281 
282  yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
283  yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
284 
285  tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
286 
287  x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
288 
289  tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
290 
291  z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
292 
293  _mm_store_ps((float*)c,z); // Store the results back into the C container
294 
295  a += 2;
296  b += 2;
297  c += 2;
298  }
299 
300  if((num_points % 2) != 0) {
301  *c = (*a) * lv_conj(*b);
302  }
303 }
304 #endif /* LV_HAVE_SSE */
305 
306 
307 #ifdef LV_HAVE_NEON
308 #include <arm_neon.h>
309 
310 static inline void
311 volk_32fc_x2_multiply_conjugate_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector,
312  const lv_32fc_t* bVector, unsigned int num_points)
313 {
314  lv_32fc_t *a_ptr = (lv_32fc_t*) aVector;
315  lv_32fc_t *b_ptr = (lv_32fc_t*) bVector;
316  unsigned int quarter_points = num_points / 4;
317  float32x4x2_t a_val, b_val, c_val;
318  float32x4x2_t tmp_real, tmp_imag;
319  unsigned int number = 0;
320 
321  for(number = 0; number < quarter_points; ++number) {
322  a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
323  b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
324  b_val.val[1] = vnegq_f32(b_val.val[1]);
325  __builtin_prefetch(a_ptr+4);
326  __builtin_prefetch(b_ptr+4);
327 
328  // multiply the real*real and imag*imag to get real result
329  // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
330  tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
331  // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
332  tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
333 
334  // Multiply cross terms to get the imaginary result
335  // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
336  tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
337  // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
338  tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
339 
340  // store the results
341  c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
342  c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
343  vst2q_f32((float*)cVector, c_val);
344 
345  a_ptr += 4;
346  b_ptr += 4;
347  cVector += 4;
348  }
349 
350  for(number = quarter_points*4; number < num_points; number++){
351  *cVector++ = (*a_ptr++) * conj(*b_ptr++);
352  }
353 }
354 #endif /* LV_HAVE_NEON */
355 
356 
357 #ifdef LV_HAVE_GENERIC
358 
359 static inline void
360 volk_32fc_x2_multiply_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
361  const lv_32fc_t* bVector, unsigned int num_points)
362 {
363  lv_32fc_t* cPtr = cVector;
364  const lv_32fc_t* aPtr = aVector;
365  const lv_32fc_t* bPtr= bVector;
366  unsigned int number = 0;
367 
368  for(number = 0; number < num_points; number++){
369  *cPtr++ = (*aPtr++) * lv_conj(*bPtr++);
370  }
371 }
372 #endif /* LV_HAVE_GENERIC */
373 
374 
375 #endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H */
#define lv_conj(x)
Definition: volk_complex.h:80
float complex lv_32fc_t
Definition: volk_complex.h:56