Vector Optimized Library of Kernels  3.1.2
Architecture-tuned implementations of math kernels
volk_8u_x4_conv_k7_r2_8u.h
/* -*- c++ -*- */
/*
 * Copyright 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_8u_x4_conv_k7_r2_8u_H
#define INCLUDED_volk_8u_x4_conv_k7_r2_8u_H

typedef union {
    unsigned char /*DECISIONTYPE*/ t[64 /*NUMSTATES*/ / 8 /*DECISIONTYPE_BITSIZE*/];
    unsigned int w[64 /*NUMSTATES*/ / 32];
    unsigned short s[64 /*NUMSTATES*/ / 16];
    unsigned char c[64 /*NUMSTATES*/ / 8];
#ifdef _MSC_VER
} decision_t;
#else
} decision_t __attribute__((aligned(16)));
#endif

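The decision_t union overlays byte-, halfword-, and word-sized views of the 64 per-state decision bits produced at each trellis step; BFLY below ORs two bits per butterfly into the w words. As an illustration only, a traceback routine could read a single decision back like this (decision_bit is a hypothetical helper, not part of VOLK, assuming the bit layout written by BFLY):

/* Hypothetical helper, illustration only: test the decision bit for
 * `state` (0..63) at trellis step `s`, given the packing used by BFLY:
 * bit (2 * i) holds decision0 and bit (2 * i + 1) holds decision1
 * within the 32-bit words of each step's decision_t. */
static inline int decision_bit(const decision_t* d, int s, int state)
{
    return (d[s].w[state / 32] >> (state & 31)) & 1;
}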

// Subtract the smallest path metric from all 64 states so the metrics
// stay well inside the unsigned char range across iterations.
static inline void renormalize(unsigned char* X)
{
    int NUMSTATES = 64;
    int i;

    unsigned char min = X[0];
    for (i = 0; i < NUMSTATES; i++)
        if (min > X[i])
            min = X[i];
    for (i = 0; i < NUMSTATES; i++)
        X[i] -= min;
}


// helper BFLY for GENERIC version
static inline void BFLY(int i,
                        int s,
                        unsigned char* syms,
                        unsigned char* Y,
                        unsigned char* X,
                        decision_t* d,
                        unsigned char* Branchtab)
{
    int j;
    unsigned int decision0, decision1;
    unsigned char metric, m0, m1, m2, m3;
    unsigned short metricsum;

    int NUMSTATES = 64;
    int RATE = 2;
    int METRICSHIFT = 1;
    int PRECISIONSHIFT = 2;

    // Branch metric: accumulate the XOR distance between the received
    // symbols and the expected branch symbols, then shift down to fit
    // the 8-bit metric range.
    metricsum = 1;
    for (j = 0; j < RATE; j++)
        metricsum += (Branchtab[i + j * NUMSTATES / 2] ^ syms[s * RATE + j]);
    metric = (metricsum >> METRICSHIFT) >> PRECISIONSHIFT;

    // max is the largest possible shifted branch metric, so (max - metric)
    // is the metric of the complementary branch.
    unsigned char max = ((RATE * ((256 - 1) >> METRICSHIFT)) >> PRECISIONSHIFT);

    // Add-compare-select: candidate metrics for successor states 2*i and
    // 2*i+1, fed from predecessor states i and i + NUMSTATES/2.
    m0 = X[i] + metric;
    m1 = X[i + NUMSTATES / 2] + (max - metric);
    m2 = X[i] + (max - metric);
    m3 = X[i + NUMSTATES / 2] + metric;

    decision0 = (signed int)(m0 - m1) >= 0;
    decision1 = (signed int)(m2 - m3) >= 0;

    Y[2 * i] = decision0 ? m1 : m0;
    Y[2 * i + 1] = decision1 ? m3 : m2;

    // Pack the two survivor decisions for this butterfly into the
    // 32-bit decision words for step s.
    d->w[i / (sizeof(unsigned int) * 8 / 2) +
         s * (sizeof(decision_t) / sizeof(unsigned int))] |=
        (decision0 | decision1 << 1) << ((2 * i) & (sizeof(unsigned int) * 8 - 1));
}

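Each BFLY call is one add-compare-select butterfly of the Viterbi forward pass: predecessor states i and i + 32 compete for successor states 2 * i and 2 * i + 1, the smaller candidate metric survives into Y, and the identity of the winning predecessor is ORed into the packed decision word for step s. The complementary branch metric (max - metric) relies on the two branches of a butterfly expecting complementary symbols, a symmetry that holds for the conventional K = 7, rate-1/2 polynomials this kernel targets.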

#if LV_HAVE_AVX2

#include <immintrin.h>
#include <stdio.h>

static inline void volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y,
                                                 unsigned char* X,
                                                 unsigned char* syms,
                                                 unsigned char* dec,
                                                 unsigned int framebits,
                                                 unsigned int excess,
                                                 unsigned char* Branchtab)
{
    unsigned int i;
    for (i = 0; i < framebits + excess; i++) {
        unsigned char* tmp;
        unsigned int* dec_int = (unsigned int*)dec;
        __m256i a76, a78, a79, a82, a84, a85, a86, a88, a89, a90, d10, d9, m23, m24, m25,
            m26, s18, s19, s22, s23, t14, t15;

        // Butterfly
        s18 = ((__m256i*)X)[0];
        s19 = ((__m256i*)X)[1];
        a76 = _mm256_set1_epi8(syms[2 * i]);
        a78 = ((__m256i*)Branchtab)[0];
        a79 = _mm256_xor_si256(a76, a78);
        a82 = _mm256_set1_epi8(syms[2 * i + 1]);
        a84 = ((__m256i*)Branchtab)[1];
        a85 = _mm256_xor_si256(a82, a84);
        a86 = _mm256_avg_epu8(a79, a85);
        a88 = _mm256_srli_epi16(a86, 2);
        t14 = _mm256_and_si256(a88, _mm256_set1_epi8(63));
        t15 = _mm256_subs_epu8(_mm256_set1_epi8(63), t14);
        m23 = _mm256_adds_epu8(s18, t14);
        m24 = _mm256_adds_epu8(s19, t15);
        m25 = _mm256_adds_epu8(s18, t15);
        m26 = _mm256_adds_epu8(s19, t14);
        a89 = _mm256_min_epu8(m24, m23);
        d9 = _mm256_cmpeq_epi8(a89, m24);
        a90 = _mm256_min_epu8(m26, m25);
        d10 = _mm256_cmpeq_epi8(a90, m26);
        s22 = _mm256_unpacklo_epi8(d9, d10);
        s23 = _mm256_unpackhi_epi8(d9, d10);
        dec_int[2 * i] = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x20));
        dec_int[2 * i + 1] =
            _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x31));
        s22 = _mm256_unpacklo_epi8(a89, a90);
        s23 = _mm256_unpackhi_epi8(a89, a90);
        ((__m256i*)Y)[0] = _mm256_permute2x128_si256(s22, s23, 0x20);
        ((__m256i*)Y)[1] = _mm256_permute2x128_si256(s22, s23, 0x31);

        // Renormalize
        __m256i m5, m6;
        m5 = ((__m256i*)Y)[0];
        m5 = _mm256_min_epu8(m5, ((__m256i*)Y)[1]);
        m5 = _mm256_min_epu8(_mm256_permute2x128_si256(m5, m5, 0x21), m5);
        __m256i m7;
        m7 = _mm256_min_epu8(_mm256_srli_si256(m5, 8), m5);
        m7 = _mm256_min_epu8(_mm256_srli_epi64(m7, 32), m7);
        m7 = _mm256_min_epu8(_mm256_srli_epi64(m7, 16), m7);
        m7 = _mm256_min_epu8(_mm256_srli_epi64(m7, 8), m7);
        m7 = _mm256_unpacklo_epi8(m7, m7);
        m7 = _mm256_shufflelo_epi16(m7, 0);
        m6 = _mm256_unpacklo_epi64(m7, m7);
        // Copy the lower half of m6 to the upper half, since the ops above
        // operate within 128-bit lanes.
        m6 = _mm256_permute2x128_si256(m6, m6, 0);
        ((__m256i*)Y)[0] = _mm256_subs_epu8(((__m256i*)Y)[0], m6);
        ((__m256i*)Y)[1] = _mm256_subs_epu8(((__m256i*)Y)[1], m6);

        // Swap pointers to old and new metrics
        tmp = X;
        X = Y;
        Y = tmp;
    }
}

#endif /*LV_HAVE_AVX2*/
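The SSE3 variant below computes the same butterfly in two halves, since the 64 8-bit metrics need four 128-bit registers instead of two 256-bit ones; its decision masks are correspondingly stored as four 16-bit groups per step where the AVX2 path stores two 32-bit words.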


#if LV_HAVE_SSE3

#include <emmintrin.h>
#include <mmintrin.h>
#include <pmmintrin.h>
#include <stdio.h>
#include <xmmintrin.h>

static inline void volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y,
                                                   unsigned char* X,
                                                   unsigned char* syms,
                                                   unsigned char* dec,
                                                   unsigned int framebits,
                                                   unsigned int excess,
                                                   unsigned char* Branchtab)
{
    unsigned int i;
    for (i = 0; i < framebits + excess; i++) {
        unsigned char* tmp;
        unsigned short* dec_short = (unsigned short*)dec;
        __m128i a100, a101, a103, a104, a105, a107, a108, a109, a76, a78, a79, a82, a84,
            a85, a86, a88, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29,
            m30, s18, s19, s24, s25, t14, t15, t17, t18;

        // First half of butterfly
        s18 = ((__m128i*)X)[0];
        s19 = ((__m128i*)X)[2];
        a76 = _mm_set1_epi8(syms[2 * i]);
        a78 = ((__m128i*)Branchtab)[0];
        a79 = _mm_xor_si128(a76, a78);
        a82 = _mm_set1_epi8(syms[2 * i + 1]);
        a84 = ((__m128i*)Branchtab)[2];
        a85 = _mm_xor_si128(a82, a84);
        a86 = _mm_avg_epu8(a79, a85);
        a88 = _mm_srli_epi16(a86, 2);
        t14 = _mm_and_si128(a88, _mm_set1_epi8(63));
        t15 = _mm_subs_epu8(_mm_set1_epi8(63), t14);
        m23 = _mm_adds_epu8(s18, t14);
        m24 = _mm_adds_epu8(s19, t15);
        m25 = _mm_adds_epu8(s18, t15);
        m26 = _mm_adds_epu8(s19, t14);
        a89 = _mm_min_epu8(m24, m23);
        d9 = _mm_cmpeq_epi8(a89, m24);
        a90 = _mm_min_epu8(m26, m25);
        d10 = _mm_cmpeq_epi8(a90, m26);
        dec_short[4 * i] = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10));
        dec_short[4 * i + 1] = _mm_movemask_epi8(_mm_unpackhi_epi8(d9, d10));
        ((__m128i*)Y)[0] = _mm_unpacklo_epi8(a89, a90);
        ((__m128i*)Y)[1] = _mm_unpackhi_epi8(a89, a90);

        // Second half of butterfly
        s24 = ((__m128i*)X)[1];
        s25 = ((__m128i*)X)[3];
        a100 = ((__m128i*)Branchtab)[1];
        a101 = _mm_xor_si128(a76, a100);
        a103 = ((__m128i*)Branchtab)[3];
        a104 = _mm_xor_si128(a82, a103);
        a105 = _mm_avg_epu8(a101, a104);
        a107 = _mm_srli_epi16(a105, 2);
        t17 = _mm_and_si128(a107, _mm_set1_epi8(63));
        t18 = _mm_subs_epu8(_mm_set1_epi8(63), t17);
        m27 = _mm_adds_epu8(s24, t17);
        m28 = _mm_adds_epu8(s25, t18);
        m29 = _mm_adds_epu8(s24, t18);
        m30 = _mm_adds_epu8(s25, t17);
        a108 = _mm_min_epu8(m28, m27);
        d11 = _mm_cmpeq_epi8(a108, m28);
        a109 = _mm_min_epu8(m30, m29);
        d12 = _mm_cmpeq_epi8(a109, m30);
        dec_short[4 * i + 2] = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12));
        dec_short[4 * i + 3] = _mm_movemask_epi8(_mm_unpackhi_epi8(d11, d12));
        ((__m128i*)Y)[2] = _mm_unpacklo_epi8(a108, a109);
        ((__m128i*)Y)[3] = _mm_unpackhi_epi8(a108, a109);

        // Renormalize
        __m128i m5, m6;
        m5 = ((__m128i*)Y)[0];
        m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]);
        m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]);
        m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]);
        __m128i m7;
        m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
        m7 = _mm_min_epu8(_mm_srli_epi64(m7, 32), m7);
        m7 = _mm_min_epu8(_mm_srli_epi64(m7, 16), m7);
        m7 = _mm_min_epu8(_mm_srli_epi64(m7, 8), m7);
        m7 = _mm_unpacklo_epi8(m7, m7);
        m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
        m6 = _mm_unpacklo_epi64(m7, m7);
        ((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6);
        ((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6);
        ((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6);
        ((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6);

        // Swap pointers to old and new metrics
        tmp = X;
        X = Y;
        Y = tmp;
    }
}

#endif /*LV_HAVE_SSE3*/

#if LV_HAVE_NEON

#include <arm_neon.h>

static inline void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char* Y,
                                                       unsigned char* X,
                                                       unsigned char* syms,
                                                       unsigned char* dec,
                                                       unsigned int framebits,
                                                       unsigned int excess,
                                                       unsigned char* Branchtab)
{
    unsigned int i;
    for (i = 0; i < framebits + excess; i++) {
        unsigned char* tmp;
        unsigned int* dec_int = (unsigned int*)dec;
        uint8x16_t a100, a101, a103, a104, a105, a108, a109, a76, a78, a79, a82, a84, a85,
            a86, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29, m30, s18,
            s19, s24, s25, t14, t15, t17, t18;
        uint16x8_t high_bits;
        uint32x4_t paired16;
        uint8x16_t paired32;
        uint8x8_t left, right;
        uint8x8x2_t both;

        // First half of butterfly
        s18 = ((uint8x16_t*)X)[0];
        s19 = ((uint8x16_t*)X)[2];
        a76 = vdupq_n_u8(syms[2 * i]);
        a78 = ((uint8x16_t*)Branchtab)[0];
        a79 = veorq_u8(a76, a78);
        a82 = vdupq_n_u8(syms[2 * i + 1]);
        a84 = ((uint8x16_t*)Branchtab)[2];
        a85 = veorq_u8(a82, a84);
        a86 = vrhaddq_u8(a79, a85);
        t14 = vshrq_n_u8(a86, 2);
        t15 = vqsubq_u8(vdupq_n_u8(63), t14);
        m23 = vqaddq_u8(s18, t14);
        m24 = vqaddq_u8(s19, t15);
        m25 = vqaddq_u8(s18, t15);
        m26 = vqaddq_u8(s19, t14);
        a89 = vminq_u8(m24, m23);
        d9 = vceqq_u8(a89, m24);
        a90 = vminq_u8(m26, m25);
        d10 = vceqq_u8(a90, m26);
        high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d9, 7));
        paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6));
        paired32 = vreinterpretq_u8_u32(vsraq_n_u32(paired16, paired16, 12));
        dec_int[2 * i] = ((unsigned int)vgetq_lane_u8(paired32, 0) << 0) |
                         ((unsigned int)vgetq_lane_u8(paired32, 4) << 8) |
                         ((unsigned int)vgetq_lane_u8(paired32, 8) << 16) |
                         ((unsigned int)vgetq_lane_u8(paired32, 12) << 24);
        high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d10, 7));
        paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6));
        paired32 = vreinterpretq_u8_u32(vsraq_n_u32(paired16, paired16, 12));
        dec_int[2 * i] |= ((unsigned int)vgetq_lane_u8(paired32, 0) << 1) |
                          ((unsigned int)vgetq_lane_u8(paired32, 4) << 9) |
                          ((unsigned int)vgetq_lane_u8(paired32, 8) << 17) |
                          ((unsigned int)vgetq_lane_u8(paired32, 12) << 25);
        left = vget_low_u8(a89);
        right = vget_low_u8(a90);
        both = vzip_u8(left, right);
        ((uint8x16_t*)Y)[0] = vcombine_u8(both.val[0], both.val[1]);
        left = vget_high_u8(a89);
        right = vget_high_u8(a90);
        both = vzip_u8(left, right);
        ((uint8x16_t*)Y)[1] = vcombine_u8(both.val[0], both.val[1]);

        // Second half of butterfly
        s24 = ((uint8x16_t*)X)[1];
        s25 = ((uint8x16_t*)X)[3];
        a100 = ((uint8x16_t*)Branchtab)[1];
        a101 = veorq_u8(a76, a100);
        a103 = ((uint8x16_t*)Branchtab)[3];
        a104 = veorq_u8(a82, a103);
        a105 = vrhaddq_u8(a101, a104);
        t17 = vshrq_n_u8(a105, 2);
        t18 = vqsubq_u8(vdupq_n_u8(63), t17);
        m27 = vqaddq_u8(s24, t17);
        m28 = vqaddq_u8(s25, t18);
        m29 = vqaddq_u8(s24, t18);
        m30 = vqaddq_u8(s25, t17);
        a108 = vminq_u8(m28, m27);
        d11 = vceqq_u8(a108, m28);
        a109 = vminq_u8(m30, m29);
        d12 = vceqq_u8(a109, m30);
        high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d11, 7));
        paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6));
        paired32 = vreinterpretq_u8_u32(vsraq_n_u32(paired16, paired16, 12));
        dec_int[2 * i + 1] = ((unsigned int)vgetq_lane_u8(paired32, 0) << 0) |
                             ((unsigned int)vgetq_lane_u8(paired32, 4) << 8) |
                             ((unsigned int)vgetq_lane_u8(paired32, 8) << 16) |
                             ((unsigned int)vgetq_lane_u8(paired32, 12) << 24);
        high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d12, 7));
        paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6));
        paired32 = vreinterpretq_u8_u32(vsraq_n_u32(paired16, paired16, 12));
        dec_int[2 * i + 1] |= ((unsigned int)vgetq_lane_u8(paired32, 0) << 1) |
                              ((unsigned int)vgetq_lane_u8(paired32, 4) << 9) |
                              ((unsigned int)vgetq_lane_u8(paired32, 8) << 17) |
                              ((unsigned int)vgetq_lane_u8(paired32, 12) << 25);
        left = vget_low_u8(a108);
        right = vget_low_u8(a109);
        both = vzip_u8(left, right);
        ((uint8x16_t*)Y)[2] = vcombine_u8(both.val[0], both.val[1]);
        left = vget_high_u8(a108);
        right = vget_high_u8(a109);
        both = vzip_u8(left, right);
        ((uint8x16_t*)Y)[3] = vcombine_u8(both.val[0], both.val[1]);

        // Renormalize
        uint8x16_t m5, m6;
        m5 = ((uint8x16_t*)Y)[0];
        m5 = vminq_u8(m5, ((uint8x16_t*)Y)[1]);
        m5 = vminq_u8(m5, ((uint8x16_t*)Y)[2]);
        m5 = vminq_u8(m5, ((uint8x16_t*)Y)[3]);
        uint8x8_t m7;
        m7 = vpmin_u8(vget_low_u8(m5), vget_high_u8(m5));
        m7 = vpmin_u8(m7, m7);
        m7 = vpmin_u8(m7, m7);
        m7 = vpmin_u8(m7, m7);
        m6 = vcombine_u8(m7, m7);
        ((uint8x16_t*)Y)[0] = vqsubq_u8(((uint8x16_t*)Y)[0], m6);
        ((uint8x16_t*)Y)[1] = vqsubq_u8(((uint8x16_t*)Y)[1], m6);
        ((uint8x16_t*)Y)[2] = vqsubq_u8(((uint8x16_t*)Y)[2], m6);
        ((uint8x16_t*)Y)[3] = vqsubq_u8(((uint8x16_t*)Y)[3], m6);

        // Swap pointers to old and new metrics
        tmp = X;
        X = Y;
        Y = tmp;
    }
}

#endif /*LV_HAVE_NEON*/
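NEON has no direct counterpart to _mm_movemask_epi8, so the variant above first isolates each comparison's sign bit with vshrq_n_u8(d, 7), then folds neighboring bits together with the shift-and-accumulate steps vsraq_n_u16 and vsraq_n_u32 until byte 0 of each 32-bit lane carries that lane's four decision bits in the even bit positions; those bytes are gathered with vgetq_lane_u8, and the second mask is merged in at the odd bit positions via the << 1 offsets.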

#if LV_HAVE_GENERIC

static inline void volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y,
                                                    unsigned char* X,
                                                    unsigned char* syms,
                                                    unsigned char* dec,
                                                    unsigned int framebits,
                                                    unsigned int excess,
                                                    unsigned char* Branchtab)
{
    int nbits = framebits + excess;
    int NUMSTATES = 64;

    int s, i;
    for (s = 0; s < nbits; s++) {
        void* tmp;
        for (i = 0; i < NUMSTATES / 2; i++) {
            BFLY(i, s, syms, Y, X, (decision_t*)dec, Branchtab);
        }

        renormalize(Y);

        // Swap pointers to old and new metrics
        tmp = (void*)X;
        X = Y;
        Y = (unsigned char*)tmp;
    }
}

#endif /* LV_HAVE_GENERIC */

#endif /*INCLUDED_volk_8u_x4_conv_k7_r2_8u_H*/
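As a usage illustration (not part of the header), here is a minimal sketch of driving the generic kernel; example_decode is a hypothetical harness, the zeroed Branchtab stands in for the expected branch symbols derived from the code polynomials, and the traceback over dec is omitted:

#include <stdlib.h>

static void example_decode(unsigned char* syms, unsigned int framebits, unsigned int excess)
{
    unsigned char X[64] = { 0 };         /* path metrics, one per state   */
    unsigned char Y[64] = { 0 };         /* scratch metrics for the swap  */
    unsigned char Branchtab[64] = { 0 }; /* placeholder branch symbols    */
    /* calloc, not malloc: BFLY ORs decision bits into this buffer. */
    unsigned char* dec =
        (unsigned char*)calloc(framebits + excess, sizeof(decision_t));

    volk_8u_x4_conv_k7_r2_8u_generic(Y, X, syms, dec, framebits, excess, Branchtab);

    /* ... a traceback over dec would recover the decoded bits ... */
    free(dec);
}

Note that every variant swaps its local X and Y pointers each iteration, so whether the caller's X or Y buffer holds the final path metrics depends on the parity of framebits + excess. The SIMD variants additionally cast X, Y, and Branchtab to vector types, so those buffers need suitably aligned allocation there.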