volk_avx2_intrinsics.h

/* -*- c++ -*- */
/*
 * Copyright 2015 Free Software Foundation, Inc.
 * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

/*
 * This file holds composite AVX2 intrinsics that are built from other
 * intrinsics. They should be used in VOLK kernels to avoid copy-paste.
 */

#ifndef INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_
#define INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_
#include "volk_avx_intrinsics.h"
#include <immintrin.h>

/*
 * Extract the real parts of the 8 complex floats held in z1 (values 0..3)
 * and z2 (values 4..7) into one packed-float vector, in order.
 */
static inline __m256 _mm256_real(const __m256 z1, const __m256 z2)
{
    const __m256i permute_mask = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    __m256 r = _mm256_shuffle_ps(z1, z2, _MM_SHUFFLE(2, 0, 2, 0));
    return _mm256_permutevar8x32_ps(r, permute_mask);
}

/*
 * Extract the imaginary parts of the 8 complex floats held in z1 (values 0..3)
 * and z2 (values 4..7) into one packed-float vector, in order.
 */
static inline __m256 _mm256_imag(const __m256 z1, const __m256 z2)
{
    const __m256i permute_mask = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    __m256 i = _mm256_shuffle_ps(z1, z2, _MM_SHUFFLE(3, 1, 3, 1));
    return _mm256_permutevar8x32_ps(i, permute_mask);
}

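/*
 * Usage sketch (illustrative only, not part of the original file): deinterleave
 * eight complex floats from a buffer of interleaved (re, im) pairs. The names
 * 'in', 're' and 'im' are hypothetical.
 *
 *   const float* p = (const float*)in;   // 8 complex samples = 16 floats
 *   __m256 z1 = _mm256_loadu_ps(p);      // samples 0..3, interleaved
 *   __m256 z2 = _mm256_loadu_ps(p + 8);  // samples 4..7, interleaved
 *   __m256 re = _mm256_real(z1, z2);     // re0..re7
 *   __m256 im = _mm256_imag(z1, z2);     // im0..im7
 */
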
/*
 * Expand the 8 byte flags in fbits into a packed-float sign mask: float k has
 * its sign bit set iff byte k of fbits is greater than zero (signed compare);
 * all other bits are cleared.
 */
static inline __m256 _mm256_polar_sign_mask_avx2(__m128i fbits)
{
    const __m128i zeros = _mm_set1_epi8(0x00);
    const __m128i sign_extract = _mm_set1_epi8(0x80);
    const __m256i shuffle_mask = _mm256_setr_epi8(0xff, 0xff, 0xff, 0x00,
                                                  0xff, 0xff, 0xff, 0x01,
                                                  0xff, 0xff, 0xff, 0x02,
                                                  0xff, 0xff, 0xff, 0x03,
                                                  0xff, 0xff, 0xff, 0x04,
                                                  0xff, 0xff, 0xff, 0x05,
                                                  0xff, 0xff, 0xff, 0x06,
                                                  0xff, 0xff, 0xff, 0x07);
    __m256i sign_bits = _mm256_setzero_si256();

    fbits = _mm_cmpgt_epi8(fbits, zeros);
    fbits = _mm_and_si128(fbits, sign_extract);
    sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 0);
    sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 1);
    sign_bits = _mm256_shuffle_epi8(sign_bits, shuffle_mask);

    return _mm256_castsi256_ps(sign_bits);
}

static inline __m256
_mm256_polar_fsign_add_llrs_avx2(__m256 src0, __m256 src1, __m128i fbits)
{
    // prepare sign mask for correct +-
    __m256 sign_mask = _mm256_polar_sign_mask_avx2(fbits);

    __m256 llr0, llr1;
    _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);

    // calculate result
    llr0 = _mm256_xor_ps(llr0, sign_mask);
    __m256 dst = _mm256_add_ps(llr0, llr1);
    return dst;
}

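/*
 * Usage sketch (illustrative only, not from the original file): combine two
 * registers of interleaved child LLRs under frozen-bit signs, as a polar
 * decoder butterfly step might do. 'llr_in', 'llr_out' and 'signs' are
 * hypothetical names.
 *
 *   __m256 src0 = _mm256_loadu_ps(llr_in);      // LLR pairs 0..3
 *   __m256 src1 = _mm256_loadu_ps(llr_in + 8);  // LLR pairs 4..7
 *   __m128i fbits = _mm_loadu_si128((const __m128i*)signs); // only low 8 bytes used
 *   __m256 out = _mm256_polar_fsign_add_llrs_avx2(src0, src1, fbits);
 *   _mm256_storeu_ps(llr_out, out);
 */
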
/*
 * Compute |z|^2 for the 8 complex floats held in cplxValue0 (values 0..3) and
 * cplxValue1 (values 4..7); the results are returned in order.
 */
static inline __m256 _mm256_magnitudesquared_ps_avx2(const __m256 cplxValue0,
                                                     const __m256 cplxValue1)
{
    const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    const __m256 squared0 = _mm256_mul_ps(cplxValue0, cplxValue0); // Square the values
    const __m256 squared1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
    const __m256 complex_result = _mm256_hadd_ps(squared0, squared1);
    return _mm256_permutevar8x32_ps(complex_result, idx);
}

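/*
 * Usage sketch (illustrative only, not from the original file): squared
 * magnitudes of 8 complex floats. 'in' and 'out' are hypothetical names.
 *
 *   const float* p = (const float*)in;   // 8 complex samples = 16 floats
 *   __m256 z0 = _mm256_loadu_ps(p);      // samples 0..3
 *   __m256 z1 = _mm256_loadu_ps(p + 8);  // samples 4..7
 *   _mm256_storeu_ps(out, _mm256_magnitudesquared_ps_avx2(z0, z1));
 */
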
static inline __m256 _mm256_scaled_norm_dist_ps_avx2(const __m256 symbols0,
                                                     const __m256 symbols1,
                                                     const __m256 points0,
                                                     const __m256 points1,
                                                     const __m256 scalar)
{
    /*
     * Calculate: |y - x|^2 * SNR_lin
     * Consider 'symbolsX' and 'pointsX' to be complex float
     * 'symbolsX' are 'y' and 'pointsX' are 'x'
     */
    const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
    const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
    const __m256 norms = _mm256_magnitudesquared_ps_avx2(diff0, diff1);
    return _mm256_mul_ps(norms, scalar);
}

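/*
 * Usage sketch (illustrative only, not from the original file): scaled squared
 * distances between 8 received symbols and 8 constellation points.
 * 'symbols', 'points', 'out' and 'snr_lin' are hypothetical names.
 *
 *   const __m256 scalar = _mm256_set1_ps(snr_lin);
 *   __m256 s0 = _mm256_loadu_ps((const float*)symbols);      // symbols 0..3
 *   __m256 s1 = _mm256_loadu_ps((const float*)symbols + 8);  // symbols 4..7
 *   __m256 p0 = _mm256_loadu_ps((const float*)points);       // points 0..3
 *   __m256 p1 = _mm256_loadu_ps((const float*)points + 8);   // points 4..7
 *   _mm256_storeu_ps(out, _mm256_scaled_norm_dist_ps_avx2(s0, s1, p0, p1, scalar));
 */
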
/*
 * The function below vectorizes the inner loop of the following code:
 *
 * float max_values[8] = {0.f};
 * unsigned max_indices[8] = {0};
 * unsigned current_indices[8] = {0, 1, 2, 3, 4, 5, 6, 7};
 * for (unsigned i = 0; i < num_points / 8; ++i) {
 *     for (unsigned j = 0; j < 8; ++j) {
 *         float abs_squared = real(src0) * real(src0) + imag(src0) * imag(src0);
 *         bool compare = abs_squared > max_values[j];
 *         max_values[j] = compare ? abs_squared : max_values[j];
 *         max_indices[j] = compare ? current_indices[j] : max_indices[j];
 *         current_indices[j] += 8; // update for next outer loop iteration
 *         ++src0;
 *     }
 * }
 */
static inline void vector_32fc_index_max_variant0(__m256 in0,
                                                  __m256 in1,
                                                  __m256* max_values,
                                                  __m256i* max_indices,
                                                  __m256i* current_indices,
                                                  __m256i indices_increment)
{
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);

    /*
     * Given the vectors a = (a_7, a_6, …, a_1, a_0) and b = (b_7, b_6, …, b_1, b_0)
     * hadd_ps(a, b) computes
     * (b_7 + b_6,
     *  b_5 + b_4,
     *  ---------
     *  a_7 + a_6,
     *  a_5 + a_4,
     *  ---------
     *  b_3 + b_2,
     *  b_1 + b_0,
     *  ---------
     *  a_3 + a_2,
     *  a_1 + a_0).
     * The result is the squared absolute value of complex numbers at index
     * offsets (7, 6, 3, 2, 5, 4, 1, 0). This must be the initial value of
     * current_indices!
     */
    __m256 abs_squared = _mm256_hadd_ps(in0, in1);

    /*
     * Compare the recently computed squared absolute values with the
     * previously determined maximum values. cmp_ps(a, b) determines
     * a > b ? 0xFFFFFFFF : 0 for each element in the vectors =>
     * compare_mask = abs_squared > max_values ? 0xFFFFFFFF : 0
     *
     * If either operand is NaN, 0 is returned as an “ordered” comparison is
     * used => the blend operation will select the value from *max_values.
     */
    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *max_values, _CMP_GT_OS);

    /* Select maximum by blending. This is the only line which differs from variant1 */
    *max_values = _mm256_blendv_ps(*max_values, abs_squared, compare_mask);

    /*
     * Update indices: blendv_ps(a, b, mask) determines mask ? b : a for
     * each element in the vectors =>
     * max_indices = compare_mask ? current_indices : max_indices
     *
     * Note: The casting of data types is required to make the compiler happy
     * and does not change values.
     */
    *max_indices =
        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*max_indices),
                                             _mm256_castsi256_ps(*current_indices),
                                             compare_mask));

    /* compute indices of complex numbers which will be loaded in the next iteration */
    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
}

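/*
 * Usage sketch (illustrative only, not from the original file): find the index
 * of the complex sample with the largest magnitude. 'src0' points to
 * interleaved complex floats and 'num_points' is a multiple of 8; both names
 * are hypothetical. Note the lane order required for current_indices.
 *
 *   const __m256i indices_increment = _mm256_set1_epi32(8);
 *   __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
 *   __m256 max_values = _mm256_setzero_ps();
 *   __m256i max_indices = _mm256_setzero_si256();
 *
 *   for (unsigned i = 0; i < num_points / 8; ++i) {
 *       __m256 in0 = _mm256_loadu_ps(src0);      // complex samples 0..3
 *       __m256 in1 = _mm256_loadu_ps(src0 + 8);  // complex samples 4..7
 *       vector_32fc_index_max_variant0(
 *           in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
 *       src0 += 16;                              // advance by 8 complex samples
 *   }
 *   // Reduce the 8 lanes: store max_values and max_indices to arrays and scan
 *   // them with a scalar loop to get the overall maximum and its index.
 */
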
/* See _variant0 for details */
static inline void vector_32fc_index_max_variant1(__m256 in0,
                                                  __m256 in1,
                                                  __m256* max_values,
                                                  __m256i* max_indices,
                                                  __m256i* current_indices,
                                                  __m256i indices_increment)
{
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);

    __m256 abs_squared = _mm256_hadd_ps(in0, in1);
    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *max_values, _CMP_GT_OS);

    /*
     * This is the only line which differs from variant0. Using maxps instead of
     * blendvps is faster on Intel CPUs (on the ones tested with).
     *
     * Note: The order of arguments matters if a NaN is encountered, in which
     * case the value of the second argument is selected. This is consistent
     * with the “ordered” comparison and the blend operation: the comparison
     * returns false if a NaN is encountered and the blend operation
     * consequently selects the value from max_indices.
     */
    *max_values = _mm256_max_ps(abs_squared, *max_values);

    *max_indices =
        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*max_indices),
                                             _mm256_castsi256_ps(*current_indices),
                                             compare_mask));

    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
}

/*
 * The function below vectorizes the inner loop of the following code:
 *
 * float min_values[8] = {FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX,
 *                        FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX};
 * unsigned min_indices[8] = {0};
 * unsigned current_indices[8] = {0, 1, 2, 3, 4, 5, 6, 7};
 * for (unsigned i = 0; i < num_points / 8; ++i) {
 *     for (unsigned j = 0; j < 8; ++j) {
 *         float abs_squared = real(src0) * real(src0) + imag(src0) * imag(src0);
 *         bool compare = abs_squared < min_values[j];
 *         min_values[j] = compare ? abs_squared : min_values[j];
 *         min_indices[j] = compare ? current_indices[j] : min_indices[j];
 *         current_indices[j] += 8; // update for next outer loop iteration
 *         ++src0;
 *     }
 * }
 */
static inline void vector_32fc_index_min_variant0(__m256 in0,
                                                  __m256 in1,
                                                  __m256* min_values,
                                                  __m256i* min_indices,
                                                  __m256i* current_indices,
                                                  __m256i indices_increment)
{
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);

    /*
     * Given the vectors a = (a_7, a_6, …, a_1, a_0) and b = (b_7, b_6, …, b_1, b_0)
     * hadd_ps(a, b) computes
     * (b_7 + b_6,
     *  b_5 + b_4,
     *  ---------
     *  a_7 + a_6,
     *  a_5 + a_4,
     *  ---------
     *  b_3 + b_2,
     *  b_1 + b_0,
     *  ---------
     *  a_3 + a_2,
     *  a_1 + a_0).
     * The result is the squared absolute value of complex numbers at index
     * offsets (7, 6, 3, 2, 5, 4, 1, 0). This must be the initial value of
     * current_indices!
     */
    __m256 abs_squared = _mm256_hadd_ps(in0, in1);

    /*
     * Compare the recently computed squared absolute values with the
     * previously determined minimum values. cmp_ps(a, b) determines
     * a < b ? 0xFFFFFFFF : 0 for each element in the vectors =>
     * compare_mask = abs_squared < min_values ? 0xFFFFFFFF : 0
     *
     * If either operand is NaN, 0 is returned as an “ordered” comparison is
     * used => the blend operation will select the value from *min_values.
     */
    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *min_values, _CMP_LT_OS);

    /* Select minimum by blending. This is the only line which differs from variant1 */
    *min_values = _mm256_blendv_ps(*min_values, abs_squared, compare_mask);

    /*
     * Update indices: blendv_ps(a, b, mask) determines mask ? b : a for
     * each element in the vectors =>
     * min_indices = compare_mask ? current_indices : min_indices
     *
     * Note: The casting of data types is required to make the compiler happy
     * and does not change values.
     */
    *min_indices =
        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*min_indices),
                                             _mm256_castsi256_ps(*current_indices),
                                             compare_mask));

    /* compute indices of complex numbers which will be loaded in the next iteration */
    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
}

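/*
 * Usage sketch (illustrative only, not from the original file): the min search
 * mirrors the max search sketched above, except that the running minima start
 * at FLT_MAX (requires <float.h>) instead of zero.
 *
 *   __m256 min_values = _mm256_set1_ps(FLT_MAX);
 *   __m256i min_indices = _mm256_setzero_si256();
 *   __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
 *   const __m256i indices_increment = _mm256_set1_epi32(8);
 *   // then call vector_32fc_index_min_variant0() (or _variant1) per 8 samples
 */
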
/* See _variant0 for details */
static inline void vector_32fc_index_min_variant1(__m256 in0,
                                                  __m256 in1,
                                                  __m256* min_values,
                                                  __m256i* min_indices,
                                                  __m256i* current_indices,
                                                  __m256i indices_increment)
{
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);

    __m256 abs_squared = _mm256_hadd_ps(in0, in1);
    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *min_values, _CMP_LT_OS);

    /*
     * This is the only line which differs from variant0. Using minps instead of
     * blendvps is faster on Intel CPUs (on the ones tested with).
     *
     * Note: The order of arguments matters if a NaN is encountered, in which
     * case the value of the second argument is selected. This is consistent
     * with the “ordered” comparison and the blend operation: the comparison
     * returns false if a NaN is encountered and the blend operation
     * consequently selects the value from min_indices.
     */
    *min_values = _mm256_min_ps(abs_squared, *min_values);

    *min_indices =
        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*min_indices),
                                             _mm256_castsi256_ps(*current_indices),
                                             compare_mask));

    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
}

#endif /* INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_ */