79 #ifndef INCLUDED_volk_32f_log2_32f_a_H
80 #define INCLUDED_volk_32f_log2_32f_a_H
87 #define LOG_POLY_DEGREE 6
89 #ifdef LV_HAVE_GENERIC
94 float* bPtr = bVector;
95 const float* aPtr = aVector;
96 unsigned int number = 0;
98 for (number = 0; number < num_points; number++)
103 #if LV_HAVE_AVX2 && LV_HAVE_FMA
104 #include <immintrin.h>
/*
 * Horner-scheme polynomial helpers for 8-lane AVX registers, using FMA.
 *
 * POLYn_FMAAVX2(x, c0, ..., cn) expands to
 *     c0 + x * (c1 + x * (... + x * cn))
 * where every step is a single _mm256_fmadd_ps and each scalar coefficient
 * is broadcast with _mm256_set1_ps.  POLY0_FMAAVX2 accepts x only so that
 * the recursion is uniform; it does not use it.
 *
 * x is pasted into the expansion once per step, so pass a plain __m256
 * variable rather than an expression with side effects.
 *
 * This macro family is defined a second time, with the same text, further
 * down in this file.  C only accepts a macro redefinition whose parameter
 * list and replacement list are identical, so both copies must be edited
 * together.
 */
106 #define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
107 #define POLY1_FMAAVX2(x, c0, c1) \
108 _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
109 #define POLY2_FMAAVX2(x, c0, c1, c2) \
110 _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
111 #define POLY3_FMAAVX2(x, c0, c1, c2, c3) \
112 _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
113 #define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \
114 _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
115 #define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \
116 _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
118 static inline void volk_32f_log2_32f_a_avx2_fma(
float* bVector,
119 const float* aVector,
120 unsigned int num_points)
122 float* bPtr = bVector;
123 const float* aPtr = aVector;
125 unsigned int number = 0;
126 const unsigned int eighthPoints = num_points / 8;
128 __m256 aVal, bVal, mantissa, frac, leadingOne;
131 for (; number < eighthPoints; number++) {
133 aVal = _mm256_load_ps(aPtr);
134 bias = _mm256_set1_epi32(127);
135 leadingOne = _mm256_set1_ps(1.0f);
136 exp = _mm256_sub_epi32(
137 _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
138 _mm256_set1_epi32(0x7f800000)),
141 bVal = _mm256_cvtepi32_ps(exp);
146 _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
148 #if LOG_POLY_DEGREE == 6
149 mantissa = POLY5_FMAAVX2(frac,
156 #elif LOG_POLY_DEGREE == 5
157 mantissa = POLY4_FMAAVX2(frac,
158 2.8882704548164776201f,
159 -2.52074962577807006663f,
160 1.48116647521213171641f,
161 -0.465725644288844778798f,
162 0.0596515482674574969533f);
163 #elif LOG_POLY_DEGREE == 4
164 mantissa = POLY3_FMAAVX2(frac,
165 2.61761038894603480148f,
166 -1.75647175389045657003f,
167 0.688243882994381274313f,
168 -0.107254423828329604454f);
169 #elif LOG_POLY_DEGREE == 3
170 mantissa = POLY2_FMAAVX2(frac,
171 2.28330284476918490682f,
172 -1.04913055217340124191f,
173 0.204446009836232697516f);
178 bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
179 _mm256_store_ps(bPtr, bVal);
185 number = eighthPoints * 8;
192 #include <immintrin.h>
/*
 * Horner-scheme polynomial helpers for 8-lane AVX registers, without FMA.
 *
 * POLYn_AVX2(x, c0, ..., cn) expands to
 *     c0 + x * (c1 + x * (... + x * cn))
 * where every step is a separate _mm256_mul_ps followed by _mm256_add_ps
 * and each scalar coefficient is broadcast with _mm256_set1_ps.
 * POLY0_AVX2 accepts x only so that the recursion is uniform; it does not
 * use it.
 *
 * x is pasted into the expansion once per step, so pass a plain __m256
 * variable rather than an expression with side effects.
 *
 * This macro family is defined a second time, with the same text, further
 * down in this file.  C only accepts a macro redefinition whose parameter
 * list and replacement list are identical, so both copies must be edited
 * together.
 */
194 #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
195 #define POLY1_AVX2(x, c0, c1) \
196 _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
197 #define POLY2_AVX2(x, c0, c1, c2) \
198 _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
199 #define POLY3_AVX2(x, c0, c1, c2, c3) \
200 _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
201 #define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
202 _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
203 #define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
204 _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
207 volk_32f_log2_32f_a_avx2(
float* bVector,
const float* aVector,
unsigned int num_points)
209 float* bPtr = bVector;
210 const float* aPtr = aVector;
212 unsigned int number = 0;
213 const unsigned int eighthPoints = num_points / 8;
215 __m256 aVal, bVal, mantissa, frac, leadingOne;
218 for (; number < eighthPoints; number++) {
220 aVal = _mm256_load_ps(aPtr);
221 bias = _mm256_set1_epi32(127);
222 leadingOne = _mm256_set1_ps(1.0f);
223 exp = _mm256_sub_epi32(
224 _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
225 _mm256_set1_epi32(0x7f800000)),
228 bVal = _mm256_cvtepi32_ps(exp);
233 _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
235 #if LOG_POLY_DEGREE == 6
236 mantissa = POLY5_AVX2(frac,
243 #elif LOG_POLY_DEGREE == 5
244 mantissa = POLY4_AVX2(frac,
245 2.8882704548164776201f,
246 -2.52074962577807006663f,
247 1.48116647521213171641f,
248 -0.465725644288844778798f,
249 0.0596515482674574969533f);
250 #elif LOG_POLY_DEGREE == 4
251 mantissa = POLY3_AVX2(frac,
252 2.61761038894603480148f,
253 -1.75647175389045657003f,
254 0.688243882994381274313f,
255 -0.107254423828329604454f);
256 #elif LOG_POLY_DEGREE == 3
257 mantissa = POLY2_AVX2(frac,
258 2.28330284476918490682f,
259 -1.04913055217340124191f,
260 0.204446009836232697516f);
266 _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
267 _mm256_store_ps(bPtr, bVal);
273 number = eighthPoints * 8;
279 #ifdef LV_HAVE_SSE4_1
280 #include <smmintrin.h>
/*
 * Horner-scheme polynomial helpers for 4-lane SSE registers.
 *
 * POLYn(x, c0, ..., cn) expands to
 *     c0 + x * (c1 + x * (... + x * cn))
 * where every step is an _mm_mul_ps followed by _mm_add_ps and each scalar
 * coefficient is broadcast with _mm_set1_ps.  POLY0 accepts x only so that
 * the recursion is uniform; it does not use it.
 *
 * x is pasted into the expansion once per step, so pass a plain __m128
 * variable rather than an expression with side effects.
 *
 * This macro family is defined a second time, with the same text, further
 * down in this file.  C only accepts a macro redefinition whose parameter
 * list and replacement list are identical, so both copies must be edited
 * together.
 */
282 #define POLY0(x, c0) _mm_set1_ps(c0)
283 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
284 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
285 #define POLY3(x, c0, c1, c2, c3) \
286 _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
287 #define POLY4(x, c0, c1, c2, c3, c4) \
288 _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
289 #define POLY5(x, c0, c1, c2, c3, c4, c5) \
290 _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
293 volk_32f_log2_32f_a_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
295 float* bPtr = bVector;
296 const float* aPtr = aVector;
298 unsigned int number = 0;
299 const unsigned int quarterPoints = num_points / 4;
301 __m128 aVal, bVal, mantissa, frac, leadingOne;
304 for (; number < quarterPoints; number++) {
306 aVal = _mm_load_ps(aPtr);
307 bias = _mm_set1_epi32(127);
308 leadingOne = _mm_set1_ps(1.0f);
311 _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
313 bVal = _mm_cvtepi32_ps(exp);
316 frac = _mm_or_ps(leadingOne,
317 _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
319 #if LOG_POLY_DEGREE == 6
320 mantissa = POLY5(frac,
327 #elif LOG_POLY_DEGREE == 5
328 mantissa = POLY4(frac,
329 2.8882704548164776201f,
330 -2.52074962577807006663f,
331 1.48116647521213171641f,
332 -0.465725644288844778798f,
333 0.0596515482674574969533f);
334 #elif LOG_POLY_DEGREE == 4
335 mantissa = POLY3(frac,
336 2.61761038894603480148f,
337 -1.75647175389045657003f,
338 0.688243882994381274313f,
339 -0.107254423828329604454f);
340 #elif LOG_POLY_DEGREE == 3
341 mantissa = POLY2(frac,
342 2.28330284476918490682f,
343 -1.04913055217340124191f,
344 0.204446009836232697516f);
349 bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
350 _mm_store_ps(bPtr, bVal);
356 number = quarterPoints * 4;
363 #include <arm_neon.h>
/*
 * Declares, in the invoking scope, every constant that VLOG2Q_NEON_F32
 * reads: the bit-field masks and exponent bias used to take a 32-bit float
 * apart, the implicit leading-one bit of the significand, and the seven
 * polynomial coefficients p0..p6.
 *
 * Because it expands to plain declarations it can be used only once per
 * scope, and it must appear before the first VLOG2Q_NEON_F32 in that scope.
 */
#define VLOG2Q_NEON_PREAMBLE()                                          \
    /* exponent field, significand field and exponent bias */           \
    int32x4_t exp_mask = vdupq_n_s32(0x7f800000);                       \
    int32x4_t sig_mask = vdupq_n_s32(0x007fffff);                       \
    int32x4_t exp_bias = vdupq_n_s32(127);                              \
    /* bit 23: the leading one that the stored significand leaves out */ \
    int32x4_t one = vdupq_n_s32(0x00800000);                            \
    /* polynomial coefficients, lowest order first */                   \
    float32x4_t p0 = vdupq_n_f32(-3.0400402727048585);                  \
    float32x4_t p1 = vdupq_n_f32(6.1129631282966113);                   \
    float32x4_t p2 = vdupq_n_f32(-5.3419892024633207);                  \
    float32x4_t p3 = vdupq_n_f32(3.2865287703753912);                   \
    float32x4_t p4 = vdupq_n_f32(-1.2669182593441635);                  \
    float32x4_t p5 = vdupq_n_f32(0.2751487703421256);                   \
    float32x4_t p6 = vdupq_n_f32(-0.0256910888150985);
/*
 * Computes a polynomial log2 approximation for four lanes at once.
 *
 *   log2_approx  float32x4_t lvalue that receives the result
 *   aval         int32x4_t holding the raw bit patterns of four floats
 *
 * Each lane is split into its unbiased exponent e and a significand s that
 * is rebuilt as a fixed-point number with 23 fractional bits (leading one
 * restored) and converted to float.  The result is accumulated, in this
 * order, as  e + p0 + p1*s + p2*s^2 + p3*s^3 + p4*s^4 + p5*s^5 + p6*s^6.
 * The sign bit is masked away and never inspected.
 *
 * Reads the names declared by VLOG2Q_NEON_PREAMBLE() and declares its own
 * locals in the invoking scope, so it can be expanded only once per scope.
 */
#define VLOG2Q_NEON_F32(log2_approx, aval)                              \
    /* isolate the two bit fields */                                    \
    int32x4_t exponent_i = vandq_s32(aval, exp_mask);                   \
    int32x4_t significand_i = vandq_s32(aval, sig_mask);                \
    /* significand: restore the leading one, then Q23 fixed -> float */ \
    significand_i = vorrq_s32(one, significand_i);                      \
    float32x4_t significand_f = vcvtq_n_f32_s32(significand_i, 23);     \
    /* exponent: shift down, remove the bias, convert to float */       \
    exponent_i = vshrq_n_s32(exponent_i, 23);                           \
    exponent_i = vsubq_s32(exponent_i, exp_bias);                       \
    float32x4_t exponent_f = vcvtq_f32_s32(exponent_i);                 \
    /* powers s^2 .. s^6 of the significand */                          \
    float32x4_t sig_2 = vmulq_f32(significand_f, significand_f);        \
    float32x4_t sig_3 = vmulq_f32(sig_2, significand_f);                \
    float32x4_t sig_4 = vmulq_f32(sig_2, sig_2);                        \
    float32x4_t sig_5 = vmulq_f32(sig_3, sig_2);                        \
    float32x4_t sig_6 = vmulq_f32(sig_3, sig_3);                        \
    /* add the polynomial terms onto the exponent, lowest order first */ \
    log2_approx = vaddq_f32(exponent_f, p0);                            \
    float32x4_t tmp1 = vmulq_f32(significand_f, p1);                    \
    log2_approx = vaddq_f32(log2_approx, tmp1);                         \
    tmp1 = vmulq_f32(sig_2, p2);                                        \
    log2_approx = vaddq_f32(log2_approx, tmp1);                         \
    tmp1 = vmulq_f32(sig_3, p3);                                        \
    log2_approx = vaddq_f32(log2_approx, tmp1);                         \
    tmp1 = vmulq_f32(sig_4, p4);                                        \
    log2_approx = vaddq_f32(log2_approx, tmp1);                         \
    tmp1 = vmulq_f32(sig_5, p5);                                        \
    log2_approx = vaddq_f32(log2_approx, tmp1);                         \
    tmp1 = vmulq_f32(sig_6, p6);                                        \
    log2_approx = vaddq_f32(log2_approx, tmp1);
420 float* bPtr = bVector;
421 const float* aPtr = aVector;
423 const unsigned int quarterPoints = num_points / 4;
426 float32x4_t log2_approx;
437 for (number = 0; number < quarterPoints; ++number) {
439 aval = vld1q_s32((
int*)aPtr);
443 vst1q_f32(bPtr, log2_approx);
449 number = quarterPoints * 4;
458 #ifndef INCLUDED_volk_32f_log2_32f_u_H
459 #define INCLUDED_volk_32f_log2_32f_u_H
462 #ifdef LV_HAVE_SSE4_1
463 #include <smmintrin.h>
/*
 * Horner-scheme polynomial helpers for 4-lane SSE registers.
 *
 * POLYn(x, c0, ..., cn) expands to
 *     c0 + x * (c1 + x * (... + x * cn))
 * where every step is an _mm_mul_ps followed by _mm_add_ps and each scalar
 * coefficient is broadcast with _mm_set1_ps.  POLY0 accepts x only so that
 * the recursion is uniform; it does not use it.
 *
 * x is pasted into the expansion once per step, so pass a plain __m128
 * variable rather than an expression with side effects.
 *
 * This is the second definition of this macro family in this file; the
 * first one appears earlier with the same text.  C only accepts a macro
 * redefinition whose parameter list and replacement list are identical,
 * so both copies must be edited together.
 */
465 #define POLY0(x, c0) _mm_set1_ps(c0)
466 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
467 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
468 #define POLY3(x, c0, c1, c2, c3) \
469 _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
470 #define POLY4(x, c0, c1, c2, c3, c4) \
471 _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
472 #define POLY5(x, c0, c1, c2, c3, c4, c5) \
473 _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
476 volk_32f_log2_32f_u_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
478 float* bPtr = bVector;
479 const float* aPtr = aVector;
481 unsigned int number = 0;
482 const unsigned int quarterPoints = num_points / 4;
484 __m128 aVal, bVal, mantissa, frac, leadingOne;
487 for (; number < quarterPoints; number++) {
489 aVal = _mm_loadu_ps(aPtr);
490 bias = _mm_set1_epi32(127);
491 leadingOne = _mm_set1_ps(1.0f);
494 _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
496 bVal = _mm_cvtepi32_ps(exp);
499 frac = _mm_or_ps(leadingOne,
500 _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
502 #if LOG_POLY_DEGREE == 6
503 mantissa = POLY5(frac,
510 #elif LOG_POLY_DEGREE == 5
511 mantissa = POLY4(frac,
512 2.8882704548164776201f,
513 -2.52074962577807006663f,
514 1.48116647521213171641f,
515 -0.465725644288844778798f,
516 0.0596515482674574969533f);
517 #elif LOG_POLY_DEGREE == 4
518 mantissa = POLY3(frac,
519 2.61761038894603480148f,
520 -1.75647175389045657003f,
521 0.688243882994381274313f,
522 -0.107254423828329604454f);
523 #elif LOG_POLY_DEGREE == 3
524 mantissa = POLY2(frac,
525 2.28330284476918490682f,
526 -1.04913055217340124191f,
527 0.204446009836232697516f);
532 bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
533 _mm_storeu_ps(bPtr, bVal);
539 number = quarterPoints * 4;
545 #if LV_HAVE_AVX2 && LV_HAVE_FMA
546 #include <immintrin.h>
/*
 * Horner-scheme polynomial helpers for 8-lane AVX registers, using FMA.
 *
 * POLYn_FMAAVX2(x, c0, ..., cn) expands to
 *     c0 + x * (c1 + x * (... + x * cn))
 * where every step is a single _mm256_fmadd_ps and each scalar coefficient
 * is broadcast with _mm256_set1_ps.  POLY0_FMAAVX2 accepts x only so that
 * the recursion is uniform; it does not use it.
 *
 * x is pasted into the expansion once per step, so pass a plain __m256
 * variable rather than an expression with side effects.
 *
 * This is the second definition of this macro family in this file; the
 * first one appears earlier with the same text.  C only accepts a macro
 * redefinition whose parameter list and replacement list are identical,
 * so both copies must be edited together.
 */
548 #define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
549 #define POLY1_FMAAVX2(x, c0, c1) \
550 _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
551 #define POLY2_FMAAVX2(x, c0, c1, c2) \
552 _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
553 #define POLY3_FMAAVX2(x, c0, c1, c2, c3) \
554 _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
555 #define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \
556 _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
557 #define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \
558 _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
560 static inline void volk_32f_log2_32f_u_avx2_fma(
float* bVector,
561 const float* aVector,
562 unsigned int num_points)
564 float* bPtr = bVector;
565 const float* aPtr = aVector;
567 unsigned int number = 0;
568 const unsigned int eighthPoints = num_points / 8;
570 __m256 aVal, bVal, mantissa, frac, leadingOne;
573 for (; number < eighthPoints; number++) {
575 aVal = _mm256_loadu_ps(aPtr);
576 bias = _mm256_set1_epi32(127);
577 leadingOne = _mm256_set1_ps(1.0f);
578 exp = _mm256_sub_epi32(
579 _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
580 _mm256_set1_epi32(0x7f800000)),
583 bVal = _mm256_cvtepi32_ps(exp);
588 _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
590 #if LOG_POLY_DEGREE == 6
591 mantissa = POLY5_FMAAVX2(frac,
598 #elif LOG_POLY_DEGREE == 5
599 mantissa = POLY4_FMAAVX2(frac,
600 2.8882704548164776201f,
601 -2.52074962577807006663f,
602 1.48116647521213171641f,
603 -0.465725644288844778798f,
604 0.0596515482674574969533f);
605 #elif LOG_POLY_DEGREE == 4
606 mantissa = POLY3_FMAAVX2(frac,
607 2.61761038894603480148f,
608 -1.75647175389045657003f,
609 0.688243882994381274313f,
610 -0.107254423828329604454f);
611 #elif LOG_POLY_DEGREE == 3
612 mantissa = POLY2_FMAAVX2(frac,
613 2.28330284476918490682f,
614 -1.04913055217340124191f,
615 0.204446009836232697516f);
620 bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
621 _mm256_storeu_ps(bPtr, bVal);
627 number = eighthPoints * 8;
634 #include <immintrin.h>
/*
 * Horner-scheme polynomial helpers for 8-lane AVX registers, without FMA.
 *
 * POLYn_AVX2(x, c0, ..., cn) expands to
 *     c0 + x * (c1 + x * (... + x * cn))
 * where every step is a separate _mm256_mul_ps followed by _mm256_add_ps
 * and each scalar coefficient is broadcast with _mm256_set1_ps.
 * POLY0_AVX2 accepts x only so that the recursion is uniform; it does not
 * use it.
 *
 * x is pasted into the expansion once per step, so pass a plain __m256
 * variable rather than an expression with side effects.
 *
 * This is the second definition of this macro family in this file; the
 * first one appears earlier with the same text.  C only accepts a macro
 * redefinition whose parameter list and replacement list are identical,
 * so both copies must be edited together.
 */
636 #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
637 #define POLY1_AVX2(x, c0, c1) \
638 _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
639 #define POLY2_AVX2(x, c0, c1, c2) \
640 _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
641 #define POLY3_AVX2(x, c0, c1, c2, c3) \
642 _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
643 #define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
644 _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
645 #define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
646 _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
649 volk_32f_log2_32f_u_avx2(
float* bVector,
const float* aVector,
unsigned int num_points)
651 float* bPtr = bVector;
652 const float* aPtr = aVector;
654 unsigned int number = 0;
655 const unsigned int eighthPoints = num_points / 8;
657 __m256 aVal, bVal, mantissa, frac, leadingOne;
660 for (; number < eighthPoints; number++) {
662 aVal = _mm256_loadu_ps(aPtr);
663 bias = _mm256_set1_epi32(127);
664 leadingOne = _mm256_set1_ps(1.0f);
665 exp = _mm256_sub_epi32(
666 _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
667 _mm256_set1_epi32(0x7f800000)),
670 bVal = _mm256_cvtepi32_ps(exp);
675 _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
677 #if LOG_POLY_DEGREE == 6
678 mantissa = POLY5_AVX2(frac,
685 #elif LOG_POLY_DEGREE == 5
686 mantissa = POLY4_AVX2(frac,
687 2.8882704548164776201f,
688 -2.52074962577807006663f,
689 1.48116647521213171641f,
690 -0.465725644288844778798f,
691 0.0596515482674574969533f);
692 #elif LOG_POLY_DEGREE == 4
693 mantissa = POLY3_AVX2(frac,
694 2.61761038894603480148f,
695 -1.75647175389045657003f,
696 0.688243882994381274313f,
697 -0.107254423828329604454f);
698 #elif LOG_POLY_DEGREE == 3
699 mantissa = POLY2_AVX2(frac,
700 2.28330284476918490682f,
701 -1.04913055217340124191f,
702 0.204446009836232697516f);
708 _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
709 _mm256_storeu_ps(bPtr, bVal);
715 number = eighthPoints * 8;
static void volk_32f_log2_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_log2_32f.h:92
#define VLOG2Q_NEON_PREAMBLE()
Definition: volk_32f_log2_32f.h:366
static float log2f_non_ieee(float f)
Definition: volk_common.h:155
#define VLOG2Q_NEON_F32(log2_approx, aval)
Definition: volk_32f_log2_32f.h:381
static void volk_32f_log2_32f_neon(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_log2_32f.h:418