Vector Optimized Library of Kernels 3.1.2
Architecture-tuned implementations of math kernels
volk_32f_exp_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2015-2020 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
10 /* SIMD (SSE4) implementation of exp
11  Inspired by Intel Approximate Math library, and based on the
12  corresponding algorithms of the cephes math library
13 */
14 
15 /* Copyright (C) 2007 Julien Pommier
16 
17  This software is provided 'as-is', without any express or implied
18  warranty. In no event will the authors be held liable for any damages
19  arising from the use of this software.
20 
21  Permission is granted to anyone to use this software for any purpose,
22  including commercial applications, and to alter it and redistribute it
23  freely, subject to the following restrictions:
24 
25  1. The origin of this software must not be misrepresented; you must not
26  claim that you wrote the original software. If you use this software
27  in a product, an acknowledgment in the product documentation would be
28  appreciated but is not required.
29  2. Altered source versions must be plainly marked as such, and must not be
30  misrepresented as being the original software.
31  3. This notice may not be removed or altered from any source distribution.
32 
33  (this is the zlib license)
34 */
35 
82 #include <inttypes.h>
83 #include <math.h>
84 #include <stdio.h>
85 
86 #ifndef INCLUDED_volk_32f_exp_32f_a_H
87 #define INCLUDED_volk_32f_exp_32f_a_H
88 
89 #ifdef LV_HAVE_SSE2
90 #include <emmintrin.h>
91 
92 static inline void
93 volk_32f_exp_32f_a_sse2(float* bVector, const float* aVector, unsigned int num_points)
94 {
95  float* bPtr = bVector;
96  const float* aPtr = aVector;
97 
98  unsigned int number = 0;
99  unsigned int quarterPoints = num_points / 4;
100 
101  // Declare variables and constants
102  __m128 aVal, bVal, tmp, fx, mask, pow2n, z, y;
103  __m128 one, exp_hi, exp_lo, log2EF, half, exp_C1, exp_C2;
104  __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
105  __m128i emm0, pi32_0x7f;
106 
107  one = _mm_set1_ps(1.0);
108  exp_hi = _mm_set1_ps(88.3762626647949);
109  exp_lo = _mm_set1_ps(-88.3762626647949);
110  log2EF = _mm_set1_ps(1.44269504088896341);
111  half = _mm_set1_ps(0.5);
112  exp_C1 = _mm_set1_ps(0.693359375);
113  exp_C2 = _mm_set1_ps(-2.12194440e-4);
114  pi32_0x7f = _mm_set1_epi32(0x7f);
115 
116  exp_p0 = _mm_set1_ps(1.9875691500e-4);
117  exp_p1 = _mm_set1_ps(1.3981999507e-3);
118  exp_p2 = _mm_set1_ps(8.3334519073e-3);
119  exp_p3 = _mm_set1_ps(4.1665795894e-2);
120  exp_p4 = _mm_set1_ps(1.6666665459e-1);
121  exp_p5 = _mm_set1_ps(5.0000001201e-1);
122 
123  for (; number < quarterPoints; number++) {
124  aVal = _mm_load_ps(aPtr);
125  tmp = _mm_setzero_ps();
126 
127  aVal = _mm_max_ps(_mm_min_ps(aVal, exp_hi), exp_lo);
128 
129  /* express exp(x) as exp(g + n*log(2)) */
130  fx = _mm_add_ps(_mm_mul_ps(aVal, log2EF), half);
131 
132  emm0 = _mm_cvttps_epi32(fx);
133  tmp = _mm_cvtepi32_ps(emm0);
134 
135  mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
136  fx = _mm_sub_ps(tmp, mask);
137 
138  tmp = _mm_mul_ps(fx, exp_C1);
139  z = _mm_mul_ps(fx, exp_C2);
140  aVal = _mm_sub_ps(_mm_sub_ps(aVal, tmp), z);
141  z = _mm_mul_ps(aVal, aVal);
142 
143  y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, aVal), exp_p1), aVal);
144  y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), aVal), exp_p3);
145  y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, aVal), exp_p4), aVal);
146  y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), aVal);
147  y = _mm_add_ps(y, one);
148 
149  emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
150 
151  pow2n = _mm_castsi128_ps(emm0);
152  bVal = _mm_mul_ps(y, pow2n);
153 
154  _mm_store_ps(bPtr, bVal);
155  aPtr += 4;
156  bPtr += 4;
157  }
158 
159  number = quarterPoints * 4;
160  for (; number < num_points; number++) {
161  *bPtr++ = expf(*aPtr++);
162  }
163 }
164 
165 #endif /* LV_HAVE_SSE2 for aligned */
166 
167 
168 #endif /* INCLUDED_volk_32f_exp_32f_a_H */
169 
170 #ifndef INCLUDED_volk_32f_exp_32f_u_H
171 #define INCLUDED_volk_32f_exp_32f_u_H
172 
173 #ifdef LV_HAVE_SSE2
174 #include <emmintrin.h>
175 
176 static inline void
177 volk_32f_exp_32f_u_sse2(float* bVector, const float* aVector, unsigned int num_points)
178 {
179  float* bPtr = bVector;
180  const float* aPtr = aVector;
181 
182  unsigned int number = 0;
183  unsigned int quarterPoints = num_points / 4;
184 
185  // Declare variables and constants
186  __m128 aVal, bVal, tmp, fx, mask, pow2n, z, y;
187  __m128 one, exp_hi, exp_lo, log2EF, half, exp_C1, exp_C2;
188  __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
189  __m128i emm0, pi32_0x7f;
190 
191  one = _mm_set1_ps(1.0);
192  exp_hi = _mm_set1_ps(88.3762626647949);
193  exp_lo = _mm_set1_ps(-88.3762626647949);
194  log2EF = _mm_set1_ps(1.44269504088896341);
195  half = _mm_set1_ps(0.5);
196  exp_C1 = _mm_set1_ps(0.693359375);
197  exp_C2 = _mm_set1_ps(-2.12194440e-4);
198  pi32_0x7f = _mm_set1_epi32(0x7f);
199 
200  exp_p0 = _mm_set1_ps(1.9875691500e-4);
201  exp_p1 = _mm_set1_ps(1.3981999507e-3);
202  exp_p2 = _mm_set1_ps(8.3334519073e-3);
203  exp_p3 = _mm_set1_ps(4.1665795894e-2);
204  exp_p4 = _mm_set1_ps(1.6666665459e-1);
205  exp_p5 = _mm_set1_ps(5.0000001201e-1);
206 
207 
208  for (; number < quarterPoints; number++) {
209  aVal = _mm_loadu_ps(aPtr);
210  tmp = _mm_setzero_ps();
211 
212  aVal = _mm_max_ps(_mm_min_ps(aVal, exp_hi), exp_lo);
213 
214  /* express exp(x) as exp(g + n*log(2)) */
215  fx = _mm_add_ps(_mm_mul_ps(aVal, log2EF), half);
216 
217  emm0 = _mm_cvttps_epi32(fx);
218  tmp = _mm_cvtepi32_ps(emm0);
219 
220  mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
221  fx = _mm_sub_ps(tmp, mask);
222 
223  tmp = _mm_mul_ps(fx, exp_C1);
224  z = _mm_mul_ps(fx, exp_C2);
225  aVal = _mm_sub_ps(_mm_sub_ps(aVal, tmp), z);
226  z = _mm_mul_ps(aVal, aVal);
227 
228  y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, aVal), exp_p1), aVal);
229  y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), aVal), exp_p3);
230  y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, aVal), exp_p4), aVal);
231  y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), aVal);
232  y = _mm_add_ps(y, one);
233 
234  emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
235 
236  pow2n = _mm_castsi128_ps(emm0);
237  bVal = _mm_mul_ps(y, pow2n);
238 
239  _mm_storeu_ps(bPtr, bVal);
240  aPtr += 4;
241  bPtr += 4;
242  }
243 
244  number = quarterPoints * 4;
245  for (; number < num_points; number++) {
246  *bPtr++ = expf(*aPtr++);
247  }
248 }
249 
250 #endif /* LV_HAVE_SSE2 for unaligned */
251 
252 
253 #ifdef LV_HAVE_GENERIC
254 
255 static inline void
256 volk_32f_exp_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
257 {
258  float* bPtr = bVector;
259  const float* aPtr = aVector;
260  unsigned int number = 0;
261 
262  for (number = 0; number < num_points; number++) {
263  *bPtr++ = expf(*aPtr++);
264  }
265 }
266 
267 #endif /* LV_HAVE_GENERIC */
268 
269 #endif /* INCLUDED_volk_32f_exp_32f_u_H */
static void volk_32f_exp_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_exp_32f.h:256
static void volk_32f_exp_32f_a_sse2(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_exp_32f.h:93
static void volk_32f_exp_32f_u_sse2(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_exp_32f.h:177