68 #ifndef INCLUDED_volk_32f_tanh_32f_a_H
69 #define INCLUDED_volk_32f_tanh_32f_a_H
76 #ifdef LV_HAVE_GENERIC
79 volk_32f_tanh_32f_generic(
float* cVector,
const float* aVector,
80 unsigned int num_points)
82 unsigned int number = 0;
83 float* cPtr = cVector;
84 const float* aPtr = aVector;
85 for(; number < num_points; number++) {
86 *cPtr++ = tanh(*aPtr++);
#ifdef LV_HAVE_GENERIC
/*
 * Scalar tanh approximation without a libm call.
 *
 * Inside the interval (-4.97, 4.97] the result is the rational expression
 *
 *     x * (135135 + 17325 x^2 + 378 x^4 + x^6)
 *     ----------------------------------------
 *     135135 + 62370 x^2 + 3150 x^4 + 28 x^6
 *
 * Outside that interval the output saturates to +1 or -1.
 *
 * cVector    - output buffer; receives num_points floats
 * aVector    - input buffer; num_points floats are read
 * num_points - number of samples to process
 */
static inline void
volk_32f_tanh_32f_series(float* cVector,
                         const float* aVector,
                         unsigned int num_points)
{
    unsigned int number = 0;
    float* cPtr = cVector;
    const float* aPtr = aVector;

    for (; number < num_points; number++) {
        /* Read the sample once; the input pointer advances on every
         * iteration, whether or not the sample saturates. */
        const float x = *aPtr++;

        /* The thresholds stay double literals so the saturation boundaries
         * are exactly those of the comparison against 4.97. */
        if (x > 4.97) {
            *cPtr++ = 1;
        } else if (x <= -4.97) {
            *cPtr++ = -1;
        } else {
            float x2 = x * x;
            float a = x * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
            float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
            *cPtr++ = a / b;
        }
    }
}
#endif /* LV_HAVE_GENERIC */
/* NOTE(review): guard name follows the LV_HAVE_* pattern used by the generic
 * kernels in this file - confirm against the build configuration. */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>

/*
 * SSE tanh approximation for 16-byte aligned buffers.
 *
 * Four samples per iteration are evaluated with the rational expression
 *
 *     x * (135135 + 17325 x^2 + 378 x^4 + x^6)
 *     ----------------------------------------
 *     135135 + 62370 x^2 + 3150 x^4 + 28 x^6
 *
 * and lanes beyond +/-4.97 are forced to +1 / -1, matching the scalar loop
 * that handles the remaining (num_points % 4) samples. Without that
 * saturation the expression grows roughly like x/28 and leaves [-1, 1].
 *
 * cVector    - output buffer, 16-byte aligned (written with _mm_store_ps)
 * aVector    - input buffer, 16-byte aligned (read with _mm_load_ps)
 * num_points - number of samples to process
 */
static inline void
volk_32f_tanh_32f_a_sse(float* cVector,
                        const float* aVector,
                        unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    __m128 aVal, cVal, x2, a, b, tooHigh, tooLow;
    const __m128 const1 = _mm_set_ps1(135135.0f);
    const __m128 const2 = _mm_set_ps1(17325.0f);
    const __m128 const3 = _mm_set_ps1(378.0f);
    const __m128 const4 = _mm_set_ps1(62370.0f);
    const __m128 const5 = _mm_set_ps1(3150.0f);
    const __m128 const6 = _mm_set_ps1(28.0f);
    const __m128 plusOne = _mm_set_ps1(1.0f);
    const __m128 minusOne = _mm_set_ps1(-1.0f);
    /* 4.97f rounds to just below the double literal 4.97 used by the scalar
     * loop, so the strict single-precision tests x > 4.97f and x < -4.97f
     * pick exactly the samples the scalar comparisons pick. */
    const __m128 upperLimit = _mm_set_ps1(4.97f);
    const __m128 lowerLimit = _mm_set_ps1(-4.97f);

    for (; number < quarterPoints; number++) {
        aVal = _mm_load_ps(aPtr);
        x2 = _mm_mul_ps(aVal, aVal);
        a = _mm_mul_ps(aVal, _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
        b = _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const4, _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));

        cVal = _mm_div_ps(a, b);

        /* Saturate out-of-range lanes: mask ? +/-1 : cVal. NaN lanes fail
         * both comparisons and keep the computed value. */
        tooHigh = _mm_cmpgt_ps(aVal, upperLimit);
        tooLow = _mm_cmplt_ps(aVal, lowerLimit);
        cVal = _mm_or_ps(_mm_and_ps(tooHigh, plusOne), _mm_andnot_ps(tooHigh, cVal));
        cVal = _mm_or_ps(_mm_and_ps(tooLow, minusOne), _mm_andnot_ps(tooLow, cVal));

        _mm_store_ps(cPtr, cVal);

        aPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail for the samples that do not fill a whole vector. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        const float x = *aPtr++;

        if (x > 4.97) {
            *cPtr++ = 1;
        } else if (x <= -4.97) {
            *cPtr++ = -1;
        } else {
            float x2 = x * x;
            float a = x * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
            float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
            *cPtr++ = a / b;
        }
    }
}
#endif /* LV_HAVE_SSE */
/* NOTE(review): guard name follows the LV_HAVE_* pattern used by the generic
 * kernels in this file - confirm against the build configuration. */
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*
 * AVX tanh approximation for 32-byte aligned buffers.
 *
 * Eight samples per iteration are evaluated with the rational expression
 *
 *     x * (135135 + 17325 x^2 + 378 x^4 + x^6)
 *     ----------------------------------------
 *     135135 + 62370 x^2 + 3150 x^4 + 28 x^6
 *
 * and lanes beyond +/-4.97 are forced to +1 / -1, matching the scalar loop
 * that handles the remaining (num_points % 8) samples. Without that
 * saturation the expression grows roughly like x/28 and leaves [-1, 1].
 *
 * cVector    - output buffer, 32-byte aligned (written with _mm256_store_ps)
 * aVector    - input buffer, 32-byte aligned (read with _mm256_load_ps)
 * num_points - number of samples to process
 */
static inline void
volk_32f_tanh_32f_a_avx(float* cVector,
                        const float* aVector,
                        unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    __m256 aVal, cVal, x2, a, b, tooHigh, tooLow;
    const __m256 const1 = _mm256_set1_ps(135135.0f);
    const __m256 const2 = _mm256_set1_ps(17325.0f);
    const __m256 const3 = _mm256_set1_ps(378.0f);
    const __m256 const4 = _mm256_set1_ps(62370.0f);
    const __m256 const5 = _mm256_set1_ps(3150.0f);
    const __m256 const6 = _mm256_set1_ps(28.0f);
    const __m256 plusOne = _mm256_set1_ps(1.0f);
    const __m256 minusOne = _mm256_set1_ps(-1.0f);
    /* 4.97f rounds to just below the double literal 4.97 used by the scalar
     * loop, so the strict single-precision tests x > 4.97f and x < -4.97f
     * pick exactly the samples the scalar comparisons pick. */
    const __m256 upperLimit = _mm256_set1_ps(4.97f);
    const __m256 lowerLimit = _mm256_set1_ps(-4.97f);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        x2 = _mm256_mul_ps(aVal, aVal);
        a = _mm256_mul_ps(aVal, _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const2, _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
        b = _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const4, _mm256_mul_ps(x2, _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));

        cVal = _mm256_div_ps(a, b);

        /* Saturate out-of-range lanes. The ordered, quiet predicates are
         * false for NaN, so NaN lanes keep the computed value. */
        tooHigh = _mm256_cmp_ps(aVal, upperLimit, _CMP_GT_OQ);
        tooLow = _mm256_cmp_ps(aVal, lowerLimit, _CMP_LT_OQ);
        cVal = _mm256_blendv_ps(cVal, plusOne, tooHigh);
        cVal = _mm256_blendv_ps(cVal, minusOne, tooLow);

        _mm256_store_ps(cPtr, cVal);

        aPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail for the samples that do not fill a whole vector. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        const float x = *aPtr++;

        if (x > 4.97) {
            *cPtr++ = 1;
        } else if (x <= -4.97) {
            *cPtr++ = -1;
        } else {
            float x2 = x * x;
            float a = x * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
            float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
            *cPtr++ = a / b;
        }
    }
}
#endif /* LV_HAVE_AVX */
/* NOTE(review): guard name follows the LV_HAVE_* pattern used by the generic
 * kernels in this file - confirm against the build configuration. */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>

/*
 * SSE tanh approximation for buffers with no alignment requirement.
 *
 * Four samples per iteration are evaluated with the rational expression
 *
 *     x * (135135 + 17325 x^2 + 378 x^4 + x^6)
 *     ----------------------------------------
 *     135135 + 62370 x^2 + 3150 x^4 + 28 x^6
 *
 * and lanes beyond +/-4.97 are forced to +1 / -1, matching the scalar loop
 * that handles the remaining (num_points % 4) samples. Without that
 * saturation the expression grows roughly like x/28 and leaves [-1, 1].
 *
 * cVector    - output buffer (written with _mm_storeu_ps)
 * aVector    - input buffer (read with _mm_loadu_ps)
 * num_points - number of samples to process
 */
static inline void
volk_32f_tanh_32f_u_sse(float* cVector,
                        const float* aVector,
                        unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    __m128 aVal, cVal, x2, a, b, tooHigh, tooLow;
    const __m128 const1 = _mm_set_ps1(135135.0f);
    const __m128 const2 = _mm_set_ps1(17325.0f);
    const __m128 const3 = _mm_set_ps1(378.0f);
    const __m128 const4 = _mm_set_ps1(62370.0f);
    const __m128 const5 = _mm_set_ps1(3150.0f);
    const __m128 const6 = _mm_set_ps1(28.0f);
    const __m128 plusOne = _mm_set_ps1(1.0f);
    const __m128 minusOne = _mm_set_ps1(-1.0f);
    /* 4.97f rounds to just below the double literal 4.97 used by the scalar
     * loop, so the strict single-precision tests x > 4.97f and x < -4.97f
     * pick exactly the samples the scalar comparisons pick. */
    const __m128 upperLimit = _mm_set_ps1(4.97f);
    const __m128 lowerLimit = _mm_set_ps1(-4.97f);

    for (; number < quarterPoints; number++) {
        aVal = _mm_loadu_ps(aPtr);
        x2 = _mm_mul_ps(aVal, aVal);
        a = _mm_mul_ps(aVal, _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
        b = _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const4, _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));

        cVal = _mm_div_ps(a, b);

        /* Saturate out-of-range lanes: mask ? +/-1 : cVal. NaN lanes fail
         * both comparisons and keep the computed value. */
        tooHigh = _mm_cmpgt_ps(aVal, upperLimit);
        tooLow = _mm_cmplt_ps(aVal, lowerLimit);
        cVal = _mm_or_ps(_mm_and_ps(tooHigh, plusOne), _mm_andnot_ps(tooHigh, cVal));
        cVal = _mm_or_ps(_mm_and_ps(tooLow, minusOne), _mm_andnot_ps(tooLow, cVal));

        _mm_storeu_ps(cPtr, cVal);

        aPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail for the samples that do not fill a whole vector. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        const float x = *aPtr++;

        if (x > 4.97) {
            *cPtr++ = 1;
        } else if (x <= -4.97) {
            *cPtr++ = -1;
        } else {
            float x2 = x * x;
            float a = x * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
            float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
            *cPtr++ = a / b;
        }
    }
}
#endif /* LV_HAVE_SSE */
/* NOTE(review): guard name follows the LV_HAVE_* pattern used by the generic
 * kernels in this file - confirm against the build configuration. */
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*
 * AVX tanh approximation for buffers with no alignment requirement.
 *
 * Eight samples per iteration are evaluated with the rational expression
 *
 *     x * (135135 + 17325 x^2 + 378 x^4 + x^6)
 *     ----------------------------------------
 *     135135 + 62370 x^2 + 3150 x^4 + 28 x^6
 *
 * and lanes beyond +/-4.97 are forced to +1 / -1, matching the scalar loop
 * that handles the remaining (num_points % 8) samples. Without that
 * saturation the expression grows roughly like x/28 and leaves [-1, 1].
 *
 * cVector    - output buffer (written with _mm256_storeu_ps)
 * aVector    - input buffer (read with _mm256_loadu_ps)
 * num_points - number of samples to process
 */
static inline void
volk_32f_tanh_32f_u_avx(float* cVector,
                        const float* aVector,
                        unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    __m256 aVal, cVal, x2, a, b, tooHigh, tooLow;
    const __m256 const1 = _mm256_set1_ps(135135.0f);
    const __m256 const2 = _mm256_set1_ps(17325.0f);
    const __m256 const3 = _mm256_set1_ps(378.0f);
    const __m256 const4 = _mm256_set1_ps(62370.0f);
    const __m256 const5 = _mm256_set1_ps(3150.0f);
    const __m256 const6 = _mm256_set1_ps(28.0f);
    const __m256 plusOne = _mm256_set1_ps(1.0f);
    const __m256 minusOne = _mm256_set1_ps(-1.0f);
    /* 4.97f rounds to just below the double literal 4.97 used by the scalar
     * loop, so the strict single-precision tests x > 4.97f and x < -4.97f
     * pick exactly the samples the scalar comparisons pick. */
    const __m256 upperLimit = _mm256_set1_ps(4.97f);
    const __m256 lowerLimit = _mm256_set1_ps(-4.97f);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        x2 = _mm256_mul_ps(aVal, aVal);
        a = _mm256_mul_ps(aVal, _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const2, _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
        b = _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const4, _mm256_mul_ps(x2, _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));

        cVal = _mm256_div_ps(a, b);

        /* Saturate out-of-range lanes. The ordered, quiet predicates are
         * false for NaN, so NaN lanes keep the computed value. */
        tooHigh = _mm256_cmp_ps(aVal, upperLimit, _CMP_GT_OQ);
        tooLow = _mm256_cmp_ps(aVal, lowerLimit, _CMP_LT_OQ);
        cVal = _mm256_blendv_ps(cVal, plusOne, tooHigh);
        cVal = _mm256_blendv_ps(cVal, minusOne, tooLow);

        _mm256_storeu_ps(cPtr, cVal);

        aPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail for the samples that do not fill a whole vector. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        const float x = *aPtr++;

        if (x > 4.97) {
            *cPtr++ = 1;
        } else if (x <= -4.97) {
            *cPtr++ = -1;
        } else {
            float x2 = x * x;
            float a = x * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
            float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
            *cPtr++ = a / b;
        }
    }
}
#endif /* LV_HAVE_AVX */