63 #ifndef INCLUDED_volk_32fc_index_min_16u_a_H
64 #define INCLUDED_volk_32fc_index_min_16u_a_H
73 #include <immintrin.h>
76 static inline void volk_32fc_index_min_16u_a_avx2_variant_0(uint16_t* target,
80 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
82 const __m256i indices_increment = _mm256_set1_epi32(8);
88 __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
90 __m256 min_values = _mm256_set1_ps(FLT_MAX);
91 __m256i min_indices = _mm256_setzero_si256();
93 for (
unsigned i = 0;
i < num_points / 8u; ++
i) {
94 __m256 in0 = _mm256_load_ps((
float*)source);
95 __m256 in1 = _mm256_load_ps((
float*)(source + 4));
97 in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
104 _mm256_store_ps(min_values_buffer, min_values);
105 _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
109 for (
unsigned i = 0; i < 8; i++) {
110 if (min_values_buffer[i] < min) {
111 min = min_values_buffer[
i];
112 index = min_indices_buffer[
i];
117 for (
unsigned i = num_points & (~7u); i < num_points; ++
i) {
118 const float abs_squared =
120 if (abs_squared < min) {
133 #include <immintrin.h>
136 static inline void volk_32fc_index_min_16u_a_avx2_variant_1(uint16_t* target,
140 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
142 const __m256i indices_increment = _mm256_set1_epi32(8);
148 __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
150 __m256 min_values = _mm256_set1_ps(FLT_MAX);
151 __m256i min_indices = _mm256_setzero_si256();
153 for (
unsigned i = 0; i < num_points / 8u; ++
i) {
154 __m256 in0 = _mm256_load_ps((
float*)source);
155 __m256 in1 = _mm256_load_ps((
float*)(source + 4));
157 in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
164 _mm256_store_ps(min_values_buffer, min_values);
165 _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
169 for (
unsigned i = 0; i < 8; i++) {
170 if (min_values_buffer[i] < min) {
171 min = min_values_buffer[
i];
172 index = min_indices_buffer[
i];
177 for (
unsigned i = num_points & (~7u); i < num_points; ++
i) {
178 const float abs_squared =
180 if (abs_squared < min) {
193 #include <pmmintrin.h>
194 #include <xmmintrin.h>
200 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
207 __m128 xmm1, xmm2, xmm3;
208 __m128i xmm8, xmm11, xmm12, xmm9, xmm10;
210 xmm5.
int_vec = _mm_setzero_si128();
211 xmm4.
int_vec = _mm_setzero_si128();
212 holderf.
int_vec = _mm_setzero_si128();
213 holderi.
int_vec = _mm_setzero_si128();
215 xmm8 = _mm_setr_epi32(0, 1, 2, 3);
216 xmm9 = _mm_setzero_si128();
217 xmm10 = _mm_setr_epi32(4, 4, 4, 4);
218 xmm3 = _mm_set_ps1(FLT_MAX);
220 int bound = num_points >> 2;
222 for (
int i = 0; i < bound; ++
i) {
223 xmm1 = _mm_load_ps((
float*)source);
224 xmm2 = _mm_load_ps((
float*)&source[2]);
228 xmm1 = _mm_mul_ps(xmm1, xmm1);
229 xmm2 = _mm_mul_ps(xmm2, xmm2);
231 xmm1 = _mm_hadd_ps(xmm1, xmm2);
233 xmm3 = _mm_min_ps(xmm1, xmm3);
235 xmm4.
float_vec = _mm_cmpgt_ps(xmm1, xmm3);
236 xmm5.
float_vec = _mm_cmpeq_ps(xmm1, xmm3);
238 xmm11 = _mm_and_si128(xmm8, xmm5.
int_vec);
239 xmm12 = _mm_and_si128(xmm9, xmm4.
int_vec);
241 xmm9 = _mm_add_epi32(xmm11, xmm12);
243 xmm8 = _mm_add_epi32(xmm8, xmm10);
246 if (num_points >> 1 & 1) {
247 xmm2 = _mm_load_ps((
float*)source);
252 xmm2 = _mm_mul_ps(xmm2, xmm2);
256 xmm1 = _mm_hadd_ps(xmm2, xmm2);
258 xmm3 = _mm_min_ps(xmm1, xmm3);
260 xmm10 = _mm_setr_epi32(2, 2, 2, 2);
262 xmm4.
float_vec = _mm_cmpgt_ps(xmm1, xmm3);
263 xmm5.
float_vec = _mm_cmpeq_ps(xmm1, xmm3);
265 xmm11 = _mm_and_si128(xmm8, xmm5.
int_vec);
266 xmm12 = _mm_and_si128(xmm9, xmm4.
int_vec);
268 xmm9 = _mm_add_epi32(xmm11, xmm12);
270 xmm8 = _mm_add_epi32(xmm8, xmm10);
273 if (num_points & 1) {
277 xmm2 = _mm_load1_ps(&sq_dist);
281 xmm3 = _mm_min_ss(xmm3, xmm2);
283 xmm4.
float_vec = _mm_cmpgt_ps(xmm1, xmm3);
284 xmm5.
float_vec = _mm_cmpeq_ps(xmm1, xmm3);
286 xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
288 xmm11 = _mm_and_si128(xmm8, xmm4.
int_vec);
289 xmm12 = _mm_and_si128(xmm9, xmm5.
int_vec);
291 xmm9 = _mm_add_epi32(xmm11, xmm12);
294 _mm_store_ps((
float*)&(holderf.
f), xmm3);
295 _mm_store_si128(&(holderi.
int_vec), xmm9);
297 target[0] = holderi.
i[0];
298 sq_dist = holderf.
f[0];
299 target[0] = (holderf.
f[1] < sq_dist) ? holderi.
i[1] : target[0];
300 sq_dist = (holderf.
f[1] < sq_dist) ? holderf.
f[1] : sq_dist;
301 target[0] = (holderf.
f[2] < sq_dist) ? holderi.
i[2] : target[0];
302 sq_dist = (holderf.
f[2] < sq_dist) ? holderf.
f[2] : sq_dist;
303 target[0] = (holderf.
f[3] < sq_dist) ? holderi.
i[3] : target[0];
304 sq_dist = (holderf.
f[3] < sq_dist) ? holderf.
f[3] : sq_dist;
309 #ifdef LV_HAVE_GENERIC
314 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
320 for (uint32_t i = 0; i < num_points; ++
i) {
336 #ifndef INCLUDED_volk_32fc_index_min_16u_u_H
337 #define INCLUDED_volk_32fc_index_min_16u_u_H
339 #include <inttypes.h>
346 #include <immintrin.h>
349 static inline void volk_32fc_index_min_16u_u_avx2_variant_0(uint16_t* target,
353 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
355 const __m256i indices_increment = _mm256_set1_epi32(8);
361 __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
363 __m256 min_values = _mm256_set1_ps(FLT_MAX);
364 __m256i min_indices = _mm256_setzero_si256();
366 for (
unsigned i = 0; i < num_points / 8u; ++
i) {
367 __m256 in0 = _mm256_loadu_ps((
float*)source);
368 __m256 in1 = _mm256_loadu_ps((
float*)(source + 4));
370 in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
377 _mm256_store_ps(min_values_buffer, min_values);
378 _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
382 for (
unsigned i = 0; i < 8; i++) {
383 if (min_values_buffer[i] < min) {
384 min = min_values_buffer[
i];
385 index = min_indices_buffer[
i];
390 for (
unsigned i = num_points & (~7u); i < num_points; ++
i) {
391 const float abs_squared =
393 if (abs_squared < min) {
406 #include <immintrin.h>
409 static inline void volk_32fc_index_min_16u_u_avx2_variant_1(uint16_t* target,
413 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
415 const __m256i indices_increment = _mm256_set1_epi32(8);
421 __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
423 __m256 min_values = _mm256_set1_ps(FLT_MAX);
424 __m256i min_indices = _mm256_setzero_si256();
426 for (
unsigned i = 0; i < num_points / 8u; ++
i) {
427 __m256 in0 = _mm256_loadu_ps((
float*)source);
428 __m256 in1 = _mm256_loadu_ps((
float*)(source + 4));
430 in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
437 _mm256_store_ps(min_values_buffer, min_values);
438 _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
442 for (
unsigned i = 0; i < 8; i++) {
443 if (min_values_buffer[i] < min) {
444 min = min_values_buffer[
i];
445 index = min_indices_buffer[
i];
450 for (
unsigned i = num_points & (~7u); i < num_points; ++
i) {
451 const float abs_squared =
453 if (abs_squared < min) {
#define bit128_p(x)
Definition: volk_common.h:147
__m128i int_vec
Definition: volk_common.h:128
static void volk_32fc_index_min_16u_generic(uint16_t *target, const lv_32fc_t *source, uint32_t num_points)
Definition: volk_32fc_index_min_16u.h:310
for i
Definition: volk_config_fixed.tmpl.h:13
static void vector_32fc_index_min_variant0(__m256 in0, __m256 in1, __m256 *min_values, __m256i *min_indices, __m256i *current_indices, __m256i indices_increment)
Definition: volk_avx2_intrinsics.h:253
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:62
__m128 float_vec
Definition: volk_common.h:124
float complex lv_32fc_t
Definition: volk_complex.h:74
float f[4]
Definition: volk_common.h:120
static void vector_32fc_index_min_variant1(__m256 in0, __m256 in1, __m256 *min_values, __m256i *min_indices, __m256i *current_indices, __m256i indices_increment)
Definition: volk_avx2_intrinsics.h:315
static void volk_32fc_index_min_16u_a_sse3(uint16_t *target, const lv_32fc_t *source, uint32_t num_points)
Definition: volk_32fc_index_min_16u.h:196
Definition: volk_common.h:116
#define lv_creal(x)
Definition: volk_complex.h:96
#define lv_cimag(x)
Definition: volk_complex.h:98
uint32_t i[4]
Definition: volk_common.h:119