GNU Radio Manual and C++ API Reference  3.7.7
The Free & Open Software Radio Ecosystem
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
volk_32f_x2_pow_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 /*!
24  * \page volk_32f_x2_pow_32f
25  *
26  * \b Overview
27  *
28  * Raises the sample in aVector to the power of the number in bVector.
29  *
30  * c[i] = pow(a[i], b[i])
31  *
32  * <b>Dispatcher Prototype</b>
33  * \code
34  * void volk_32f_x2_pow_32f(float* cVector, const float* bVector, const float* aVector, unsigned int num_points)
35  * \endcode
36  *
37  * \b Inputs
38  * \li bVector: The input vector of indices (power values).
39  * \li aVector: The input vector of base values.
40  * \li num_points: The number of values in both input vectors.
41  *
42  * \b Outputs
43  * \li cVector: The output vector.
44  *
45  * \b Example
46  * Calculate the first two powers of two (2^x).
47  * \code
48  * int N = 10;
49  * unsigned int alignment = volk_get_alignment();
50  * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
51  * float* twos = (float*)volk_malloc(sizeof(float)*N, alignment);
52  * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
53  *
54  * for(unsigned int ii = 0; ii < N; ++ii){
55  * increasing[ii] = (float)ii;
56  * twos[ii] = 2.f;
57  * }
58  *
59  * volk_32f_x2_pow_32f(out, increasing, twos, N);
60  *
61  * for(unsigned int ii = 0; ii < N; ++ii){
62  * printf("out[%u] = %1.2f\n", ii, out[ii]);
63  * }
64  *
65  * volk_free(increasing);
66  * volk_free(twos);
67  * volk_free(out);
68  * \endcode
69  */
70 
71 #ifndef INCLUDED_volk_32f_x2_pow_32f_a_H
72 #define INCLUDED_volk_32f_x2_pow_32f_a_H
73 
74 #include <stdio.h>
75 #include <stdlib.h>
76 #include <inttypes.h>
77 #include <math.h>
78 
79 #define POLY0(x, c0) _mm_set1_ps(c0)
80 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
81 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
82 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
83 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
84 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
85 
86 #define POW_POLY_DEGREE 3
87 
88 #ifdef LV_HAVE_SSE4_1
89 #include <smmintrin.h>
90 
91 static inline void
92 volk_32f_x2_pow_32f_a_sse4_1(float* cVector, const float* bVector,
93  const float* aVector, unsigned int num_points)
94 {
95  float* cPtr = cVector;
96  const float* bPtr = bVector;
97  const float* aPtr = aVector;
98 
99  unsigned int number = 0;
100  const unsigned int quarterPoints = num_points / 4;
101 
102  __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
103  __m128 tmp, fx, mask, pow2n, z, y;
104  __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
105  __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
106  __m128i bias, exp, emm0, pi32_0x7f;
107 
108  one = _mm_set1_ps(1.0);
109  exp_hi = _mm_set1_ps(88.3762626647949);
110  exp_lo = _mm_set1_ps(-88.3762626647949);
111  ln2 = _mm_set1_ps(0.6931471805);
112  log2EF = _mm_set1_ps(1.44269504088896341);
113  half = _mm_set1_ps(0.5);
114  exp_C1 = _mm_set1_ps(0.693359375);
115  exp_C2 = _mm_set1_ps(-2.12194440e-4);
116  pi32_0x7f = _mm_set1_epi32(0x7f);
117 
118  exp_p0 = _mm_set1_ps(1.9875691500e-4);
119  exp_p1 = _mm_set1_ps(1.3981999507e-3);
120  exp_p2 = _mm_set1_ps(8.3334519073e-3);
121  exp_p3 = _mm_set1_ps(4.1665795894e-2);
122  exp_p4 = _mm_set1_ps(1.6666665459e-1);
123  exp_p5 = _mm_set1_ps(5.0000001201e-1);
124 
125  for(;number < quarterPoints; number++){
126  // First compute the logarithm
127  aVal = _mm_load_ps(aPtr);
128  bias = _mm_set1_epi32(127);
129  leadingOne = _mm_set1_ps(1.0f);
130  exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
131  logarithm = _mm_cvtepi32_ps(exp);
132 
133  frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
134 
135 #if POW_POLY_DEGREE == 6
136  mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
137 #elif POW_POLY_DEGREE == 5
138  mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
139 #elif POW_POLY_DEGREE == 4
140  mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
141 #elif POW_POLY_DEGREE == 3
142  mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
143 #else
144 #error
145 #endif
146 
147  logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
148  logarithm = _mm_mul_ps(logarithm, ln2);
149 
150 
151  // Now calculate b*lna
152  bVal = _mm_load_ps(bPtr);
153  bVal = _mm_mul_ps(bVal, logarithm);
154 
155  // Now compute exp(b*lna)
156  tmp = _mm_setzero_ps();
157 
158  bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
159 
160  fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
161 
162  emm0 = _mm_cvttps_epi32(fx);
163  tmp = _mm_cvtepi32_ps(emm0);
164 
165  mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
166  fx = _mm_sub_ps(tmp, mask);
167 
168  tmp = _mm_mul_ps(fx, exp_C1);
169  z = _mm_mul_ps(fx, exp_C2);
170  bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
171  z = _mm_mul_ps(bVal, bVal);
172 
173  y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
174  y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
175  y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
176  y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
177  y = _mm_add_ps(y, one);
178 
179  emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
180 
181  pow2n = _mm_castsi128_ps(emm0);
182  cVal = _mm_mul_ps(y, pow2n);
183 
184  _mm_store_ps(cPtr, cVal);
185 
186  aPtr += 4;
187  bPtr += 4;
188  cPtr += 4;
189  }
190 
191  number = quarterPoints * 4;
192  for(;number < num_points; number++){
193  *cPtr++ = pow(*aPtr++, *bPtr++);
194  }
195 }
196 
197 #endif /* LV_HAVE_SSE4_1 for aligned */
198 
199 #endif /* INCLUDED_volk_32f_x2_pow_32f_a_H */
200 
201 
202 
203 #ifndef INCLUDED_volk_32f_x2_pow_32f_u_H
204 #define INCLUDED_volk_32f_x2_pow_32f_u_H
205 
206 #ifdef LV_HAVE_GENERIC
207 
208 static inline void
209 volk_32f_x2_pow_32f_generic(float* cVector, const float* bVector,
210  const float* aVector, unsigned int num_points)
211 {
212  float* cPtr = cVector;
213  const float* bPtr = bVector;
214  const float* aPtr = aVector;
215  unsigned int number = 0;
216 
217  for(number = 0; number < num_points; number++){
218  *cPtr++ = pow(*aPtr++, *bPtr++);
219  }
220 }
221 #endif /* LV_HAVE_GENERIC */
222 
223 
224 #ifdef LV_HAVE_SSE4_1
225 #include <smmintrin.h>
226 
227 static inline void
228 volk_32f_x2_pow_32f_u_sse4_1(float* cVector, const float* bVector,
229  const float* aVector, unsigned int num_points)
230 {
231  float* cPtr = cVector;
232  const float* bPtr = bVector;
233  const float* aPtr = aVector;
234 
235  unsigned int number = 0;
236  const unsigned int quarterPoints = num_points / 4;
237 
238  __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
239  __m128 tmp, fx, mask, pow2n, z, y;
240  __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
241  __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
242  __m128i bias, exp, emm0, pi32_0x7f;
243 
244  one = _mm_set1_ps(1.0);
245  exp_hi = _mm_set1_ps(88.3762626647949);
246  exp_lo = _mm_set1_ps(-88.3762626647949);
247  ln2 = _mm_set1_ps(0.6931471805);
248  log2EF = _mm_set1_ps(1.44269504088896341);
249  half = _mm_set1_ps(0.5);
250  exp_C1 = _mm_set1_ps(0.693359375);
251  exp_C2 = _mm_set1_ps(-2.12194440e-4);
252  pi32_0x7f = _mm_set1_epi32(0x7f);
253 
254  exp_p0 = _mm_set1_ps(1.9875691500e-4);
255  exp_p1 = _mm_set1_ps(1.3981999507e-3);
256  exp_p2 = _mm_set1_ps(8.3334519073e-3);
257  exp_p3 = _mm_set1_ps(4.1665795894e-2);
258  exp_p4 = _mm_set1_ps(1.6666665459e-1);
259  exp_p5 = _mm_set1_ps(5.0000001201e-1);
260 
261  for(;number < quarterPoints; number++){
262  // First compute the logarithm
263  aVal = _mm_loadu_ps(aPtr);
264  bias = _mm_set1_epi32(127);
265  leadingOne = _mm_set1_ps(1.0f);
266  exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
267  logarithm = _mm_cvtepi32_ps(exp);
268 
269  frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
270 
271 #if POW_POLY_DEGREE == 6
272  mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
273 #elif POW_POLY_DEGREE == 5
274  mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
275 #elif POW_POLY_DEGREE == 4
276  mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
277 #elif POW_POLY_DEGREE == 3
278  mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
279 #else
280 #error
281 #endif
282 
283  logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
284  logarithm = _mm_mul_ps(logarithm, ln2);
285 
286 
287  // Now calculate b*lna
288  bVal = _mm_loadu_ps(bPtr);
289  bVal = _mm_mul_ps(bVal, logarithm);
290 
291  // Now compute exp(b*lna)
292  tmp = _mm_setzero_ps();
293 
294  bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
295 
296  fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
297 
298  emm0 = _mm_cvttps_epi32(fx);
299  tmp = _mm_cvtepi32_ps(emm0);
300 
301  mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
302  fx = _mm_sub_ps(tmp, mask);
303 
304  tmp = _mm_mul_ps(fx, exp_C1);
305  z = _mm_mul_ps(fx, exp_C2);
306  bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
307  z = _mm_mul_ps(bVal, bVal);
308 
309  y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
310  y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
311  y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
312  y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
313  y = _mm_add_ps(y, one);
314 
315  emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
316 
317  pow2n = _mm_castsi128_ps(emm0);
318  cVal = _mm_mul_ps(y, pow2n);
319 
320  _mm_storeu_ps(cPtr, cVal);
321 
322  aPtr += 4;
323  bPtr += 4;
324  cPtr += 4;
325  }
326 
327  number = quarterPoints * 4;
328  for(;number < num_points; number++){
329  *cPtr++ = pow(*aPtr++, *bPtr++);
330  }
331 }
332 
333 #endif /* LV_HAVE_SSE4_1 for unaligned */
334 
335 #endif /* INCLUDED_volk_32f_x2_log2_32f_u_H */
#define POLY3(x, c0, c1, c2, c3)
Definition: volk_32f_x2_pow_32f.h:82
#define POLY5(x, c0, c1, c2, c3, c4, c5)
Definition: volk_32f_x2_pow_32f.h:84
#define POLY2(x, c0, c1, c2)
Definition: volk_32f_x2_pow_32f.h:81
#define POLY4(x, c0, c1, c2, c3, c4)
Definition: volk_32f_x2_pow_32f.h:83