#ifndef INCLUDED_volk_8u_x4_conv_k7_r2_8u_H
#define INCLUDED_volk_8u_x4_conv_k7_r2_8u_H
typedef union {
    /* 64 decision bits per trellis stage (one per state), viewable as bytes,
       32-bit words, or 16-bit shorts */
    unsigned char t[64 / 8];
    unsigned int w[64 / 32];
    unsigned short s[64 / 16];
    unsigned char c[64 / 8];
} decision_t;
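/* Note (not stated in this header, inferred from how BFLY() indexes the buffer
   below): the kernels treat the caller's 'dec' buffer as an array with one
   decision_t per trellis stage, so decoding framebits + excess stages needs
   (framebits + excess) * sizeof(decision_t) bytes of decision storage. */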
    unsigned char min = X[0];
    for (i = 0; i < NUMSTATES; i++)
        if (X[i] < min)
            min = X[i];
    for (i = 0; i < NUMSTATES; i++)
        X[i] -= min;
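/* Worked example: if the metrics start as {12, 7, 9, ...}, the minimum is 7 and
   renormalize() leaves {5, 0, 2, ...}. Survivor selection in BFLY() depends only
   on metric differences, so subtracting a common offset changes no decision; it
   just keeps the unsigned char metrics away from saturation. */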
static inline void BFLY(int i,
                        int s,
                        unsigned char* syms,
                        unsigned char* Y,
                        unsigned char* X,
                        decision_t* d,
                        unsigned char* Branchtab)
{
    int j, decision0, decision1;
    unsigned char metric, m0, m1, m2, m3;
    int NUMSTATES = 64;
    int RATE = 2;
    int METRICSHIFT = 2;
    int PRECISIONSHIFT = 2;

    /* accumulate the branch metric over the RATE received soft symbols */
    metric = 0;
    for (j = 0; j < RATE; j++)
        metric += (Branchtab[i + j * NUMSTATES / 2] ^ syms[s * RATE + j]) >> METRICSHIFT;
    metric = metric >> PRECISIONSHIFT;

    /* largest possible branch metric, used to form the complementary metric */
    unsigned char max = ((RATE * ((256 - 1) >> METRICSHIFT)) >> PRECISIONSHIFT);
    m0 = X[i] + metric;
    m1 = X[i + NUMSTATES / 2] + (max - metric);
    m2 = X[i] + (max - metric);
    m3 = X[i + NUMSTATES / 2] + metric;
    decision0 = (signed int)(m0 - m1) > 0;
    decision1 = (signed int)(m2 - m3) > 0;

    /* keep the smaller path metric for each of the two successor states */
    Y[2 * i] = decision0 ? m1 : m0;
    Y[2 * i + 1] = decision1 ? m3 : m2;
    /* pack the two decision bits for this butterfly into the decision word */
    d->w[i / (sizeof(unsigned int) * 8 / 2) +
         s * (sizeof(decision_t) / sizeof(unsigned int))] |=
        (decision0 | decision1 << 1) << ((2 * i) & (sizeof(unsigned int) * 8 - 1));
}
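/* Illustrative only (not part of this header): a traceback would undo the
   packing above by reading one bit per state and stage.  The helper name and
   the flat per-stage decisions[] array are assumptions for this sketch. */
static inline int get_decision(const decision_t* decisions, int s, int state)
{
    /* BFLY() stores the survivor choice for 'state' at trellis stage 's' in
       bit (state & 31) of 32-bit word (state / 32) of that stage's record */
    return (decisions[s].w[state / 32] >> (state & 31)) & 1;
}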
#include <pmmintrin.h>
#include <emmintrin.h>
#include <xmmintrin.h>
#include <mmintrin.h>
static inline void volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y,
                                                   unsigned char* X,
                                                   unsigned char* syms,
                                                   unsigned char* dec,
                                                   unsigned int framebits,
                                                   unsigned int excess,
                                                   unsigned char* Branchtab)
{
    unsigned int i9;
    for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
        unsigned char a75, a81;
        short int s20, s21, s26, s27;
        unsigned char *a74, *a80, *b6;
        short int *a110, *a111, *a91, *a93, *a94;
        __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83,
            *a95, *a96, *a97, *a98, *a99;
        __m128i a105, a106, a86, a87;
        __m128i a100, a101, a103, a104, a107, a108, a109,
            a76, a78, a79, a82, a84, a85, a88, a89,
            a90, d10, d11, d12, d9, m23, m24, m25,
            m26, m27, m28, m29, m30, s18, s19, s22,
            s23, s24, s25, s28, s29, t13, t14, t15,
            t16, t17, t18;
        a71 = ((__m128i *) X);                  /* old path metrics */
        a76 = _mm_set1_epi8(a75);               /* broadcast first received symbol */
        a77 = ((__m128i *) Branchtab);
        a79 = _mm_xor_si128(a76, a78);          /* soft distance to first code bit */
        a82 = _mm_set1_epi8(a81);               /* broadcast second received symbol */
        a85 = _mm_xor_si128(a82, a84);          /* soft distance to second code bit */
        t13 = _mm_avg_epu8(a79, a85);           /* combine the two distances */
        a86 = ((__m128i ) t13);
        a87 = _mm_srli_epi16(a86, 2);           /* scale metrics down */
        a88 = ((__m128i ) a87);
        t14 = _mm_and_si128(a88,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63,
                         63, 63, 63, 63, 63, 63, 63, 63));  /* branch metric, 0..63 */
        t15 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63,
                         63, 63, 63, 63, 63, 63, 63, 63),
            t14);                                           /* complementary metric */
        /* add, compare, select for 16 butterflies at once */
        m23 = _mm_adds_epu8(s18, t14);
        m24 = _mm_adds_epu8(s19, t15);
        m25 = _mm_adds_epu8(s18, t15);
        m26 = _mm_adds_epu8(s19, t14);
        a89 = _mm_min_epu8(m24, m23);
        d9 = _mm_cmpeq_epi8(a89, m24);
        a90 = _mm_min_epu8(m26, m25);
        d10 = _mm_cmpeq_epi8(a90, m26);
        s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10));
        a91 = ((short int *) dec);
        s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9, d10));
        s22 = _mm_unpacklo_epi8(a89, a90);
        s23 = _mm_unpackhi_epi8(a89, a90);
        a95 = ((__m128i *) Y);
        a101 = _mm_xor_si128(a76, a100);
        a104 = _mm_xor_si128(a82, a103);
        t16 = _mm_avg_epu8(a101, a104);
        a105 = ((__m128i ) t16);
        a106 = _mm_srli_epi16(a105, 2);
        a107 = ((__m128i ) a106);
        t17 = _mm_and_si128(a107,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63,
                         63, 63, 63, 63, 63, 63, 63, 63));
        t18 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63,
                         63, 63, 63, 63, 63, 63, 63, 63),
            t17);
        m27 = _mm_adds_epu8(s24, t17);
        m28 = _mm_adds_epu8(s25, t18);
        m29 = _mm_adds_epu8(s24, t18);
        m30 = _mm_adds_epu8(s25, t17);
        a108 = _mm_min_epu8(m28, m27);
        d11 = _mm_cmpeq_epi8(a108, m28);
        a109 = _mm_min_epu8(m30, m29);
        d12 = _mm_cmpeq_epi8(a109, m30);
        s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12));
        s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11, d12));
        s28 = _mm_unpacklo_epi8(a108, a109);
        s29 = _mm_unpackhi_epi8(a108, a109);
        /* renormalize when the state-0 metric exceeds the renormalization
           threshold: subtract the minimum metric from all 64 states */
        if ((((unsigned char *) Y)[0] > 210)) {
            /* horizontal minimum over all 64 metrics in Y */
            m5 = ((__m128i *) Y)[0];
            m5 = _mm_min_epu8(m5, ((__m128i *) Y)[1]);
            m5 = _mm_min_epu8(m5, ((__m128i *) Y)[2]);
            m5 = _mm_min_epu8(m5, ((__m128i *) Y)[3]);
            m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
            m7 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m7, 32)), ((__m128i ) m7)));
            m7 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m7, 16)), ((__m128i ) m7)));
            m7 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m7, 8)), ((__m128i ) m7)));
            /* broadcast the minimum to all 16 lanes and subtract it everywhere */
            m7 = _mm_unpacklo_epi8(m7, m7);
            m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
            m6 = _mm_unpacklo_epi64(m7, m7);
            ((__m128i *) Y)[0] = _mm_subs_epu8(((__m128i *) Y)[0], m6);
            ((__m128i *) Y)[1] = _mm_subs_epu8(((__m128i *) Y)[1], m6);
            ((__m128i *) Y)[2] = _mm_subs_epu8(((__m128i *) Y)[2], m6);
            ((__m128i *) Y)[3] = _mm_subs_epu8(((__m128i *) Y)[3], m6);
        }
        /* second trellis stage of this iteration: metrics flow from Y back into X */
        unsigned char a188, a194;
        short int s48, s49, s54, s55;
        unsigned char *a187, *a193, *b15;
        short int *a204, *a206, *a207, *a223, *a224, *b16;
        __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210,
            *a211, *a212, *a215, *a225, *a226;
        __m128i a199, a200, a218, a219;
        __m128i a189, a191, a192, a195, a197, a198, a201,
            a202, a203, a213, a214, a216, a217, a220, a221,
            a222, d17, d18, d19, d20, m39, m40, m41,
            m42, m43, m44, m45, m46, s46, s47, s50,
            s51, s52, s53, s56, s57, t25, t26, t27,
            t28, t29, t30;
        a184 = ((__m128i *) Y);                 /* metrics produced by the first stage */
        a189 = _mm_set1_epi8(a188);
        a190 = ((__m128i *) Branchtab);
        a192 = _mm_xor_si128(a189, a191);
        a195 = _mm_set1_epi8(a194);
        a198 = _mm_xor_si128(a195, a197);
        t25 = _mm_avg_epu8(a192, a198);
        a199 = ((__m128i ) t25);
        a200 = _mm_srli_epi16(a199, 2);
        a201 = ((__m128i ) a200);
        t26 = _mm_and_si128(a201,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63,
                         63, 63, 63, 63, 63, 63, 63, 63));
        t27 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63,
                         63, 63, 63, 63, 63, 63, 63, 63),
            t26);
        m39 = _mm_adds_epu8(s46, t26);
        m40 = _mm_adds_epu8(s47, t27);
        m41 = _mm_adds_epu8(s46, t27);
        m42 = _mm_adds_epu8(s47, t26);
        a202 = _mm_min_epu8(m40, m39);
        d17 = _mm_cmpeq_epi8(a202, m40);
        a203 = _mm_min_epu8(m42, m41);
        d18 = _mm_cmpeq_epi8(a203, m42);
        s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17, d18));
        a204 = ((short int *) dec);
        s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17, d18));
        s50 = _mm_unpacklo_epi8(a202, a203);
        s51 = _mm_unpackhi_epi8(a202, a203);
        a208 = ((__m128i *) X);
        a214 = _mm_xor_si128(a189, a213);
        a217 = _mm_xor_si128(a195, a216);
        t28 = _mm_avg_epu8(a214, a217);
        a218 = ((__m128i ) t28);
        a219 = _mm_srli_epi16(a218, 2);
        a220 = ((__m128i ) a219);
        t29 = _mm_and_si128(a220,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63,
                         63, 63, 63, 63, 63, 63, 63, 63));
        t30 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63,
                         63, 63, 63, 63, 63, 63, 63, 63),
            t29);
        m43 = _mm_adds_epu8(s52, t29);
        m44 = _mm_adds_epu8(s53, t30);
        m45 = _mm_adds_epu8(s52, t30);
        m46 = _mm_adds_epu8(s53, t29);
        a221 = _mm_min_epu8(m44, m43);
        d19 = _mm_cmpeq_epi8(a221, m44);
        a222 = _mm_min_epu8(m46, m45);
        d20 = _mm_cmpeq_epi8(a222, m46);
        s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19, d20));
        s55 = _mm_movemask_epi8(_mm_unpackhi_epi8(d19, d20));
        s56 = _mm_unpacklo_epi8(a221, a222);
        s57 = _mm_unpackhi_epi8(a221, a222);
        /* renormalize the metrics now held in X, as above for Y */
        if ((((unsigned char *) X)[0] > 210)) {
            m12 = ((__m128i *) X)[0];
            m12 = _mm_min_epu8(m12, ((__m128i *) X)[1]);
            m12 = _mm_min_epu8(m12, ((__m128i *) X)[2]);
            m12 = _mm_min_epu8(m12, ((__m128i *) X)[3]);
            m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12);
            m14 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m14, 32)), ((__m128i ) m14)));
            m14 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m14, 16)), ((__m128i ) m14)));
            m14 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m14, 8)), ((__m128i ) m14)));
            m14 = _mm_unpacklo_epi8(m14, m14);
            m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0));
            m13 = _mm_unpacklo_epi64(m14, m14);
            ((__m128i *) X)[0] = _mm_subs_epu8(((__m128i *) X)[0], m13);
            ((__m128i *) X)[1] = _mm_subs_epu8(((__m128i *) X)[1], m13);
            ((__m128i *) X)[2] = _mm_subs_epu8(((__m128i *) X)[2], m13);
            ((__m128i *) X)[3] = _mm_subs_epu8(((__m128i *) X)[3], m13);
        }
    }
    /* handle the odd trailing stage, if any, with the scalar butterfly */
    for (j = 0; j < (framebits + excess) % 2; ++j) {
        for (i = 0; i < 64 / 2; i++)
            BFLY(i, (((framebits + excess) >> 1) << 1) + j, syms, Y, X,
                 (decision_t *) dec, Branchtab);
static inline void volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y,
                                                    unsigned char* X,
                                                    unsigned char* syms,
                                                    unsigned char* dec,
                                                    unsigned int framebits,
                                                    unsigned int excess,
                                                    unsigned char* Branchtab)
{
    int nbits = framebits + excess;
    int NUMSTATES = 64;
    int RENORMALIZE_THRESHOLD = 210;
    int s, i;

    for (s = 0; s < nbits; s++) {
        void* tmp;
        for (i = 0; i < NUMSTATES / 2; i++) {
            BFLY(i, s, syms, Y, X, (decision_t *) dec, Branchtab);
        }

        renormalize(Y, RENORMALIZE_THRESHOLD);

        /* swap old and new metric buffers for the next stage */
        tmp = (void *) X;
        X = Y;
        Y = (unsigned char *) tmp;
    }
}
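/*
 * Illustrative only: a minimal calling sketch for the generic kernel above.
 * The wrapper name, buffer initialization, and sizing are assumptions for the
 * example; sizes follow from how the kernel indexes its arguments (64 states,
 * rate 1/2, one decision_t per stage).  How Branchtab and the starting metrics
 * are filled depends on the caller and is not shown.
 */
#include <stdlib.h>
#include <string.h>

static void example_decode_stage_metrics(unsigned char* syms,
                                         unsigned int framebits,
                                         unsigned int excess,
                                         unsigned char* Branchtab /* 2 * 32 entries */)
{
    unsigned int nbits = framebits + excess;
    unsigned char X[64];                 /* path metrics, one per state        */
    unsigned char Y[64];                 /* scratch metrics for the next stage */
    unsigned char* dec = (unsigned char*)calloc(nbits, sizeof(decision_t));

    memset(X, 0, sizeof(X));             /* assumed start: all states equal    */
    memset(Y, 0, sizeof(Y));

    volk_8u_x4_conv_k7_r2_8u_generic(Y, X, syms, dec, framebits, excess, Branchtab);

    /* 'dec' now holds one packed decision_t per trellis stage; a traceback
       such as the get_decision() sketch above would recover the decoded bits. */
    free(dec);
}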