Vector Optimized Library of Kernels 3.1.2
Architecture-tuned implementations of math kernels
volk_32f_atan_32f.h
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
5  *
6  * This file is part of VOLK
7  *
8  * SPDX-License-Identifier: LGPL-3.0-or-later
9  */
10 
57 #include <math.h>
58 
59 #ifndef INCLUDED_volk_32f_atan_32f_a_H
60 #define INCLUDED_volk_32f_atan_32f_a_H
61 
62 #if LV_HAVE_AVX2 && LV_HAVE_FMA
63 #include <immintrin.h>
64 #include <volk/volk_avx2_fma_intrinsics.h>
65 static inline void
66 volk_32f_atan_32f_a_avx2_fma(float* out, const float* in, unsigned int num_points)
67 {
68  const __m256 one = _mm256_set1_ps(1.f);
69  const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f);
70  const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
71  const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
72 
73  unsigned int number = 0;
74  unsigned int eighth_points = num_points / 8;
75  for (; number < eighth_points; number++) {
76  __m256 x = _mm256_load_ps(in);
77  __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
78  __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
79  _mm256_blendv_ps(one, x, swap_mask));
80  __m256 result = _m256_arctan_poly_avx2_fma(x_star);
81  __m256 term = _mm256_and_ps(x_star, sign_mask);
82  term = _mm256_or_ps(pi_over_2, term);
83  term = _mm256_sub_ps(term, result);
84  result = _mm256_blendv_ps(result, term, swap_mask);
85  _mm256_store_ps(out, result);
86  in += 8;
87  out += 8;
88  }
89 
90  number = eighth_points * 8;
91  for (; number < num_points; number++) {
92  *out++ = volk_arctan(*in++);
93  }
94 }
95 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
96 
97 #if LV_HAVE_AVX
98 #include <immintrin.h>
99 #include <volk/volk_avx_intrinsics.h>
100 static inline void
101 volk_32f_atan_32f_a_avx2(float* out, const float* in, unsigned int num_points)
102 {
103  const __m256 one = _mm256_set1_ps(1.f);
104  const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f);
105  const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
106  const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
107 
108  unsigned int number = 0;
109  unsigned int eighth_points = num_points / 8;
110  for (; number < eighth_points; number++) {
111  __m256 x = _mm256_load_ps(in);
112  __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
113  __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
114  _mm256_blendv_ps(one, x, swap_mask));
115  __m256 result = _m256_arctan_poly_avx(x_star);
116  __m256 term = _mm256_and_ps(x_star, sign_mask);
117  term = _mm256_or_ps(pi_over_2, term);
118  term = _mm256_sub_ps(term, result);
119  result = _mm256_blendv_ps(result, term, swap_mask);
120  _mm256_store_ps(out, result);
121  in += 8;
122  out += 8;
123  }
124 
125  number = eighth_points * 8;
126  for (; number < num_points; number++) {
127  *out++ = volk_arctan(*in++);
128  }
129 }
130 #endif /* LV_HAVE_AVX for aligned */
131 
132 #ifdef LV_HAVE_SSE4_1
133 #include <smmintrin.h>
134 #include <volk/volk_sse_intrinsics.h>
135 static inline void
136 volk_32f_atan_32f_a_sse4_1(float* out, const float* in, unsigned int num_points)
137 {
138  const __m128 one = _mm_set1_ps(1.f);
139  const __m128 pi_over_2 = _mm_set1_ps(0x1.921fb6p0f);
140  const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
141  const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
142 
143  unsigned int number = 0;
144  unsigned int quarter_points = num_points / 4;
145  for (; number < quarter_points; number++) {
146  __m128 x = _mm_load_ps(in);
147  __m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one);
148  __m128 x_star = _mm_div_ps(_mm_blendv_ps(x, one, swap_mask),
149  _mm_blendv_ps(one, x, swap_mask));
150  __m128 result = _mm_arctan_poly_sse(x_star);
151  __m128 term = _mm_and_ps(x_star, sign_mask);
152  term = _mm_or_ps(pi_over_2, term);
153  term = _mm_sub_ps(term, result);
154  result = _mm_blendv_ps(result, term, swap_mask);
155  _mm_store_ps(out, result);
156  in += 4;
157  out += 4;
158  }
159 
160  number = quarter_points * 4;
161  for (; number < num_points; number++) {
162  *out++ = volk_arctan(*in++);
163  }
164 }
165 #endif /* LV_HAVE_SSE4_1 for aligned */
166 #endif /* INCLUDED_volk_32f_atan_32f_a_H */
167 
168 #ifndef INCLUDED_volk_32f_atan_32f_u_H
169 #define INCLUDED_volk_32f_atan_32f_u_H
170 
171 #if LV_HAVE_AVX2 && LV_HAVE_FMA
172 #include <immintrin.h>
173 static inline void
174 volk_32f_atan_32f_u_avx2_fma(float* out, const float* in, unsigned int num_points)
175 {
176  const __m256 one = _mm256_set1_ps(1.f);
177  const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f);
178  const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
179  const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
180 
181  unsigned int number = 0;
182  unsigned int eighth_points = num_points / 8;
183  for (; number < eighth_points; number++) {
184  __m256 x = _mm256_loadu_ps(in);
185  __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
186  __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
187  _mm256_blendv_ps(one, x, swap_mask));
188  __m256 result = _m256_arctan_poly_avx2_fma(x_star);
189  __m256 term = _mm256_and_ps(x_star, sign_mask);
190  term = _mm256_or_ps(pi_over_2, term);
191  term = _mm256_sub_ps(term, result);
192  result = _mm256_blendv_ps(result, term, swap_mask);
193  _mm256_storeu_ps(out, result);
194  in += 8;
195  out += 8;
196  }
197 
198  number = eighth_points * 8;
199  for (; number < num_points; number++) {
200  *out++ = volk_arctan(*in++);
201  }
202 }
203 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
204 
205 #if LV_HAVE_AVX
206 #include <immintrin.h>
207 static inline void
208 volk_32f_atan_32f_u_avx2(float* out, const float* in, unsigned int num_points)
209 {
210  const __m256 one = _mm256_set1_ps(1.f);
211  const __m256 pi_over_2 = _mm256_set1_ps(0x1.921fb6p0f);
212  const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
213  const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
214 
215  unsigned int number = 0;
216  unsigned int eighth_points = num_points / 8;
217  for (; number < eighth_points; number++) {
218  __m256 x = _mm256_loadu_ps(in);
219  __m256 swap_mask = _mm256_cmp_ps(_mm256_and_ps(x, abs_mask), one, _CMP_GT_OS);
220  __m256 x_star = _mm256_div_ps(_mm256_blendv_ps(x, one, swap_mask),
221  _mm256_blendv_ps(one, x, swap_mask));
222  __m256 result = _m256_arctan_poly_avx(x_star);
223  __m256 term = _mm256_and_ps(x_star, sign_mask);
224  term = _mm256_or_ps(pi_over_2, term);
225  term = _mm256_sub_ps(term, result);
226  result = _mm256_blendv_ps(result, term, swap_mask);
227  _mm256_storeu_ps(out, result);
228  in += 8;
229  out += 8;
230  }
231 
232  number = eighth_points * 8;
233  for (; number < num_points; number++) {
234  *out++ = volk_arctan(*in++);
235  }
236 }
237 #endif /* LV_HAVE_AVX for unaligned */
238 
239 #ifdef LV_HAVE_SSE4_1
240 #include <smmintrin.h>
241 #include <volk/volk_sse_intrinsics.h>
242 static inline void
243 volk_32f_atan_32f_u_sse4_1(float* out, const float* in, unsigned int num_points)
244 {
245  const __m128 one = _mm_set1_ps(1.f);
246  const __m128 pi_over_2 = _mm_set1_ps(0x1.921fb6p0f);
247  const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
248  const __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
249 
250  unsigned int number = 0;
251  unsigned int quarter_points = num_points / 4;
252  for (; number < quarter_points; number++) {
253  __m128 x = _mm_loadu_ps(in);
254  __m128 swap_mask = _mm_cmpgt_ps(_mm_and_ps(x, abs_mask), one);
255  __m128 x_star = _mm_div_ps(_mm_blendv_ps(x, one, swap_mask),
256  _mm_blendv_ps(one, x, swap_mask));
257  __m128 result = _mm_arctan_poly_sse(x_star);
258  __m128 term = _mm_and_ps(x_star, sign_mask);
259  term = _mm_or_ps(pi_over_2, term);
260  term = _mm_sub_ps(term, result);
261  result = _mm_blendv_ps(result, term, swap_mask);
262  _mm_storeu_ps(out, result);
263  in += 4;
264  out += 4;
265  }
266 
267  number = quarter_points * 4;
268  for (; number < num_points; number++) {
269  *out++ = volk_arctan(*in++);
270  }
271 }
272 #endif /* LV_HAVE_SSE4_1 for unaligned */
273 
274 #ifdef LV_HAVE_GENERIC
275 static inline void
276 volk_32f_atan_32f_polynomial(float* out, const float* in, unsigned int num_points)
277 {
278  unsigned int number = 0;
279  for (; number < num_points; number++) {
280  *out++ = volk_arctan(*in++);
281  }
282 }
283 #endif /* LV_HAVE_GENERIC */
284 
285 #ifdef LV_HAVE_GENERIC
286 static inline void
287 volk_32f_atan_32f_generic(float* out, const float* in, unsigned int num_points)
288 {
289  unsigned int number = 0;
290  for (; number < num_points; number++) {
291  *out++ = atanf(*in++);
292  }
293 }
294 #endif /* LV_HAVE_GENERIC */
295 
296 #endif /* INCLUDED_volk_32f_atan_32f_u_H */
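All of the SIMD kernels above share the same branch-free range reduction: the polynomial helpers (_m256_arctan_poly_avx2_fma, _m256_arctan_poly_avx, _mm_arctan_poly_sse) approximate arctan only on [-1, 1], and inputs with |x| > 1 are folded back through the identity arctan(x) = sign(x) * pi/2 - arctan(1/x). The scalar sketch below mirrors that vector logic (blendv becomes a ternary, the sign-bit mask becomes copysignf); atan_range_reduced and arctan_poly_unit are illustrative names and not part of the VOLK API.

#include <math.h>

/* Scalar mirror of the SIMD range reduction used by the kernels above.
 * arctan_poly_unit stands in for the per-ISA polynomial helpers and is
 * only assumed to be accurate on [-1, 1]. */
static float atan_range_reduced(float x, float (*arctan_poly_unit)(float))
{
    const float pi_over_2 = 0x1.921fb6p0f;
    const int swap = fabsf(x) > 1.f;         /* swap_mask in the SIMD code     */
    const float x_star = swap ? 1.f / x : x; /* blendv + div: use x or 1/x     */
    float result = arctan_poly_unit(x_star); /* polynomial valid on [-1, 1]    */
    if (swap) {
        /* term = copysign(pi/2, x_star) - result, i.e. for |x| > 1
         * arctan(x) = sign(x) * pi/2 - arctan(1/x), and sign(1/x) == sign(x). */
        result = copysignf(pi_over_2, x_star) - result;
    }
    return result;
}

The final blendv in the vector code selects between result and term per lane, which is what the if (swap) branch does here one element at a time.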
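For callers, a minimal usage sketch follows. It assumes the kernels are reached through the volk_32f_atan_32f dispatcher from <volk/volk.h> and uses volk_malloc/volk_get_alignment so the aligned protokernels can be selected; the error check against atanf is purely illustrative.

#include <math.h>
#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    const unsigned int num_points = 1024;
    const size_t alignment = volk_get_alignment();
    float* in = (float*)volk_malloc(sizeof(float) * num_points, alignment);
    float* out = (float*)volk_malloc(sizeof(float) * num_points, alignment);

    /* Sweep inputs across both the |x| <= 1 and |x| > 1 paths. */
    for (unsigned int i = 0; i < num_points; i++) {
        in[i] = -10.f + 20.f * (float)i / (float)num_points;
    }

    volk_32f_atan_32f(out, in, num_points); /* dispatcher picks a kernel above */

    float max_err = 0.f;
    for (unsigned int i = 0; i < num_points; i++) {
        const float err = fabsf(out[i] - atanf(in[i]));
        if (err > max_err) {
            max_err = err;
        }
    }
    printf("max abs error vs atanf: %g\n", max_err);

    volk_free(in);
    volk_free(out);
    return 0;
}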