71 #ifndef INCLUDED_volk_32fc_magnitude_32f_u_H
72 #define INCLUDED_volk_32fc_magnitude_32f_u_H
79 #include <immintrin.h>
81 volk_32fc_magnitude_32f_u_avx(
float* magnitudeVector,
const lv_32fc_t* complexVector,
82 unsigned int num_points)
84 unsigned int number = 0;
85 const unsigned int eighthPoints = num_points / 8;
87 const float* complexVectorPtr = (
float*)complexVector;
88 float* magnitudeVectorPtr = magnitudeVector;
90 __m256 cplxValue1, cplxValue2, complex1, complex2, result;
91 for(;number < eighthPoints; number++){
92 cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
93 complexVectorPtr += 8;
95 cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
96 complexVectorPtr += 8;
98 cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1);
99 cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2);
101 complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
102 complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
104 result = _mm256_hadd_ps(complex1, complex2);
106 result = _mm256_sqrt_ps(result);
108 _mm256_storeu_ps(magnitudeVectorPtr, result);
109 magnitudeVectorPtr += 8;
112 number = eighthPoints * 8;
113 for(; number < num_points; number++){
114 float val1Real = *complexVectorPtr++;
115 float val1Imag = *complexVectorPtr++;
116 *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
122 #include <pmmintrin.h>
125 volk_32fc_magnitude_32f_u_sse3(
float* magnitudeVector,
const lv_32fc_t* complexVector,
126 unsigned int num_points)
128 unsigned int number = 0;
129 const unsigned int quarterPoints = num_points / 4;
131 const float* complexVectorPtr = (
float*)complexVector;
132 float* magnitudeVectorPtr = magnitudeVector;
134 __m128 cplxValue1, cplxValue2, result;
135 for(;number < quarterPoints; number++){
136 cplxValue1 = _mm_loadu_ps(complexVectorPtr);
137 complexVectorPtr += 4;
139 cplxValue2 = _mm_loadu_ps(complexVectorPtr);
140 complexVectorPtr += 4;
142 cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1);
143 cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2);
145 result = _mm_hadd_ps(cplxValue1, cplxValue2);
147 result = _mm_sqrt_ps(result);
149 _mm_storeu_ps(magnitudeVectorPtr, result);
150 magnitudeVectorPtr += 4;
153 number = quarterPoints * 4;
154 for(; number < num_points; number++){
155 float val1Real = *complexVectorPtr++;
156 float val1Imag = *complexVectorPtr++;
157 *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
164 #include <xmmintrin.h>
167 volk_32fc_magnitude_32f_u_sse(
float* magnitudeVector,
const lv_32fc_t* complexVector,
unsigned int num_points)
169 unsigned int number = 0;
170 const unsigned int quarterPoints = num_points / 4;
172 const float* complexVectorPtr = (
float*)complexVector;
173 float* magnitudeVectorPtr = magnitudeVector;
175 __m128 cplxValue1, cplxValue2, iValue, qValue, result;
176 for(;number < quarterPoints; number++){
177 cplxValue1 = _mm_loadu_ps(complexVectorPtr);
178 complexVectorPtr += 4;
180 cplxValue2 = _mm_loadu_ps(complexVectorPtr);
181 complexVectorPtr += 4;
184 iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
186 qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
188 iValue = _mm_mul_ps(iValue, iValue);
189 qValue = _mm_mul_ps(qValue, qValue);
191 result = _mm_add_ps(iValue, qValue);
193 result = _mm_sqrt_ps(result);
195 _mm_storeu_ps(magnitudeVectorPtr, result);
196 magnitudeVectorPtr += 4;
199 number = quarterPoints * 4;
200 for(; number < num_points; number++){
201 float val1Real = *complexVectorPtr++;
202 float val1Imag = *complexVectorPtr++;
203 *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
209 #ifdef LV_HAVE_GENERIC
212 volk_32fc_magnitude_32f_generic(
float* magnitudeVector,
const lv_32fc_t* complexVector,
unsigned int num_points)
214 const float* complexVectorPtr = (
float*)complexVector;
215 float* magnitudeVectorPtr = magnitudeVector;
216 unsigned int number = 0;
217 for(number = 0; number < num_points; number++){
218 const float real = *complexVectorPtr++;
219 const float imag = *complexVectorPtr++;
220 *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
228 #ifndef INCLUDED_volk_32fc_magnitude_32f_a_H
229 #define INCLUDED_volk_32fc_magnitude_32f_a_H
236 #include <immintrin.h>
239 volk_32fc_magnitude_32f_a_avx(
float* magnitudeVector,
const lv_32fc_t* complexVector,
240 unsigned int num_points)
242 unsigned int number = 0;
243 const unsigned int eighthPoints = num_points / 8;
245 const float* complexVectorPtr = (
float*)complexVector;
246 float* magnitudeVectorPtr = magnitudeVector;
248 __m256 cplxValue1, cplxValue2, complex1, complex2, result;
249 for(;number < eighthPoints; number++){
250 cplxValue1 = _mm256_load_ps(complexVectorPtr);
251 complexVectorPtr += 8;
253 cplxValue2 = _mm256_load_ps(complexVectorPtr);
254 complexVectorPtr += 8;
256 cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1);
257 cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2);
259 complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
260 complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
262 result = _mm256_hadd_ps(complex1, complex2);
264 result = _mm256_sqrt_ps(result);
266 _mm256_store_ps(magnitudeVectorPtr, result);
267 magnitudeVectorPtr += 8;
270 number = eighthPoints * 8;
271 for(; number < num_points; number++){
272 float val1Real = *complexVectorPtr++;
273 float val1Imag = *complexVectorPtr++;
274 *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
280 #include <pmmintrin.h>
283 volk_32fc_magnitude_32f_a_sse3(
float* magnitudeVector,
const lv_32fc_t* complexVector,
284 unsigned int num_points)
286 unsigned int number = 0;
287 const unsigned int quarterPoints = num_points / 4;
289 const float* complexVectorPtr = (
float*)complexVector;
290 float* magnitudeVectorPtr = magnitudeVector;
292 __m128 cplxValue1, cplxValue2, result;
293 for(;number < quarterPoints; number++){
294 cplxValue1 = _mm_load_ps(complexVectorPtr);
295 complexVectorPtr += 4;
297 cplxValue2 = _mm_load_ps(complexVectorPtr);
298 complexVectorPtr += 4;
300 cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1);
301 cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2);
303 result = _mm_hadd_ps(cplxValue1, cplxValue2);
305 result = _mm_sqrt_ps(result);
307 _mm_store_ps(magnitudeVectorPtr, result);
308 magnitudeVectorPtr += 4;
311 number = quarterPoints * 4;
312 for(; number < num_points; number++){
313 float val1Real = *complexVectorPtr++;
314 float val1Imag = *complexVectorPtr++;
315 *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
321 #include <xmmintrin.h>
324 volk_32fc_magnitude_32f_a_sse(
float* magnitudeVector,
const lv_32fc_t* complexVector,
325 unsigned int num_points)
327 unsigned int number = 0;
328 const unsigned int quarterPoints = num_points / 4;
330 const float* complexVectorPtr = (
float*)complexVector;
331 float* magnitudeVectorPtr = magnitudeVector;
333 __m128 cplxValue1, cplxValue2, iValue, qValue, result;
334 for(;number < quarterPoints; number++){
335 cplxValue1 = _mm_load_ps(complexVectorPtr);
336 complexVectorPtr += 4;
338 cplxValue2 = _mm_load_ps(complexVectorPtr);
339 complexVectorPtr += 4;
342 iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
344 qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
346 iValue = _mm_mul_ps(iValue, iValue);
347 qValue = _mm_mul_ps(qValue, qValue);
349 result = _mm_add_ps(iValue, qValue);
351 result = _mm_sqrt_ps(result);
353 _mm_store_ps(magnitudeVectorPtr, result);
354 magnitudeVectorPtr += 4;
357 number = quarterPoints * 4;
358 for(; number < num_points; number++){
359 float val1Real = *complexVectorPtr++;
360 float val1Imag = *complexVectorPtr++;
361 *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
367 #ifdef LV_HAVE_GENERIC
370 volk_32fc_magnitude_32f_a_generic(
float* magnitudeVector,
const lv_32fc_t* complexVector,
371 unsigned int num_points)
373 const float* complexVectorPtr = (
float*)complexVector;
374 float* magnitudeVectorPtr = magnitudeVector;
375 unsigned int number = 0;
376 for(number = 0; number < num_points; number++){
377 const float real = *complexVectorPtr++;
378 const float imag = *complexVectorPtr++;
379 *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
386 #include <arm_neon.h>
/*!
 * \brief Computes the magnitude of every complex sample (ARM NEON).
 *
 * The vector loop handles four samples at a time. It does not call a true
 * square root: it chains the reciprocal-square-root estimate (vrsqrteq_f32)
 * with the reciprocal estimate (vrecpeq_f32), so those results are
 * approximations of sqrt(real^2 + imag^2). The scalar tail loop uses sqrtf.
 *
 * \param magnitudeVector Output buffer; one float is written per input sample.
 * \param complexVector   Input buffer holding num_points complex samples.
 * \param num_points      Number of complex samples to process.
 */
static inline void
volk_32fc_magnitude_32f_neon(float* magnitudeVector,
                             const lv_32fc_t* complexVector,
                             unsigned int num_points)
{
    unsigned int number;
    unsigned int quarter_points = num_points / 4;
    /* View the complex input as interleaved floats without dropping const. */
    const float* complexVectorPtr = (const float*)complexVector;
    float* magnitudeVectorPtr = magnitudeVector;

    float32x4x2_t complex_vec;
    float32x4_t magnitude_vec;
    for (number = 0; number < quarter_points; number++) {
        /* De-interleaving load: val[0] = real parts, val[1] = imaginary parts. */
        complex_vec = vld2q_f32(complexVectorPtr);
        complex_vec.val[0] = vmulq_f32(complex_vec.val[0], complex_vec.val[0]);
        /* real^2 + imag * imag via multiply-accumulate. */
        magnitude_vec =
            vmlaq_f32(complex_vec.val[0], complex_vec.val[1], complex_vec.val[1]);
        /* sqrt(x) approximated as 1 / (1 / sqrt(x)) using the two estimates. */
        magnitude_vec = vrsqrteq_f32(magnitude_vec);
        magnitude_vec = vrecpeq_f32(magnitude_vec);
        vst1q_f32(magnitudeVectorPtr, magnitude_vec);

        complexVectorPtr += 8;
        magnitudeVectorPtr += 4;
    }

    /* Scalar loop for the samples left over after the blocks of four. */
    for (number = quarter_points * 4; number < num_points; number++) {
        const float real = *complexVectorPtr++;
        const float imag = *complexVectorPtr++;
        *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
    }
}
/*!
 * \brief Approximates the magnitude of every complex sample without a square
 *        root (ARM NEON).
 *
 * For each sample the vector loop computes
 *     a * max(|real|, |imag|) + b * min(|real|, |imag|)
 * where (a, b) is (0.84, 0.561) when min > threshold * max and (0.99, 0.197)
 * otherwise, with threshold = 0.4142135. The scalar tail loop computes the
 * exact value with sqrtf.
 *
 * \param magnitudeVector Output buffer; one float is written per input sample.
 * \param complexVector   Input buffer holding num_points complex samples.
 * \param num_points      Number of complex samples to process.
 */
static inline void
volk_32fc_magnitude_32f_neon_fancy_sweet(float* magnitudeVector,
                                         const lv_32fc_t* complexVector,
                                         unsigned int num_points)
{
    unsigned int number;
    unsigned int quarter_points = num_points / 4;
    /* View the complex input as interleaved floats without dropping const. */
    const float* complexVectorPtr = (const float*)complexVector;
    float* magnitudeVectorPtr = magnitudeVector;

    const float threshold = 0.4142135;

    /* Coefficient bit patterns, kept as integers so they can be masked with
     * the comparison results below. */
    const uint32x4_t a_high_bits = vreinterpretq_u32_f32(vdupq_n_f32(0.84));
    const uint32x4_t b_high_bits = vreinterpretq_u32_f32(vdupq_n_f32(0.561));
    const uint32x4_t a_low_bits = vreinterpretq_u32_f32(vdupq_n_f32(0.99));
    const uint32x4_t b_low_bits = vreinterpretq_u32_f32(vdupq_n_f32(0.197));

    uint32x4_t comp0, comp1;

    float32x4x2_t complex_vec;
    float32x4_t a_vec, b_vec;
    float32x4_t min_vec, max_vec, scaled_max, magnitude_vec;
    float32x4_t real_abs, imag_abs;
    for (number = 0; number < quarter_points; number++) {
        /* De-interleaving load: val[0] = real parts, val[1] = imaginary parts. */
        complex_vec = vld2q_f32(complexVectorPtr);

        real_abs = vabsq_f32(complex_vec.val[0]);
        imag_abs = vabsq_f32(complex_vec.val[1]);

        min_vec = vminq_f32(real_abs, imag_abs);
        max_vec = vmaxq_f32(real_abs, imag_abs);

        /* Per-lane region masks: comp0 selects the "high" coefficients,
         * comp1 the "low" ones. */
        scaled_max = vmulq_n_f32(max_vec, threshold);
        comp0 = vcgtq_f32(min_vec, scaled_max);
        comp1 = vcleq_f32(min_vec, scaled_max);

        /* Mask each coefficient set and combine; the integer add merges the
         * two masked bit patterns. */
        a_vec = vreinterpretq_f32_u32(
            vaddq_u32(vandq_u32(comp0, a_high_bits), vandq_u32(comp1, a_low_bits)));
        b_vec = vreinterpretq_f32_u32(
            vaddq_u32(vandq_u32(comp0, b_high_bits), vandq_u32(comp1, b_low_bits)));

        min_vec = vmulq_f32(min_vec, b_vec);
        max_vec = vmulq_f32(max_vec, a_vec);

        magnitude_vec = vaddq_f32(min_vec, max_vec);
        vst1q_f32(magnitudeVectorPtr, magnitude_vec);

        complexVectorPtr += 8;
        magnitudeVectorPtr += 4;
    }

    /* Scalar loop for the samples left over after the blocks of four. */
    for (number = quarter_points * 4; number < num_points; number++) {
        const float real = *complexVectorPtr++;
        const float imag = *complexVectorPtr++;
        *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
    }
}
499 volk_32fc_magnitude_32f_a_orc_impl(
float* magnitudeVector,
const lv_32fc_t* complexVector,
500 unsigned int num_points);
503 volk_32fc_magnitude_32f_u_orc(
float* magnitudeVector,
const lv_32fc_t* complexVector,
504 unsigned int num_points)
506 volk_32fc_magnitude_32f_a_orc_impl(magnitudeVector, complexVector, num_points);
/* lv_32fc_t: float complex (definition: volk_complex.h:56) */