blit_argb8_sse.h
1 /*
2 ** ClanLib SDK
3 ** Copyright (c) 1997-2013 The ClanLib Team
4 **
5 ** This software is provided 'as-is', without any express or implied
6 ** warranty. In no event will the authors be held liable for any damages
7 ** arising from the use of this software.
8 **
9 ** Permission is granted to anyone to use this software for any purpose,
10 ** including commercial applications, and to alter it and redistribute it
11 ** freely, subject to the following restrictions:
12 **
13 ** 1. The origin of this software must not be misrepresented; you must not
14 ** claim that you wrote the original software. If you use this software
15 ** in a product, an acknowledgment in the product documentation would be
16 ** appreciated but is not required.
17 ** 2. Altered source versions must be plainly marked as such, and must not be
18 ** misrepresented as being the original software.
19 ** 3. This notice may not be removed or altered from any source distribution.
20 **
21 ** Note: Some of the libraries ClanLib may link to may have additional
22 ** requirements or restrictions.
23 **
24 ** File Author(s):
25 **
26 ** Magnus Norddahl
27 */
28 
29 
30 #pragma once
31 
32 #include "api_swrender.h"
33 
34 #if defined(__GNUC__) && !defined(__SSE2__)
35 // Do not attempt to compile SSE2 code if the compiler does not support it
36 #else
37 
38 #ifndef DISABLE_SSE2
39 
40 #include <emmintrin.h>
41 
42 namespace clan
43 {
46 
/// \brief SSE accelerated rendering operations for ARGB8888
///
/// Most operations work on pixels whose four 8-bit channels have been widened
/// to 16-bit lanes, two pixels per __m128i. As laid out by set_color(), the
/// lanes of each pixel are blue, green, red, alpha from the lowest lane up.
class BlitARGB8SSE
{
/// \name Operations
/// \{
public:
	/// \brief Copies 64 bits (two 32-bit pixels) from src to dest.
	static void copy_pixels(unsigned int *dest, const unsigned int *src);

	/// \brief Widens one packed pixel to 16-bit lanes in the low half of xmm; the high half is zero.
	static void load_pixel(__m128i &xmm, const unsigned int &pixel);

	/// \brief Widens the two packed pixels at pixels[0] and pixels[1] to 16-bit lanes.
	static void load_pixels(__m128i &xmm, const unsigned int *pixels);

	/// \brief Widens two separately stored pixels to 16-bit lanes (p1 in the low half, p2 in the high half).
	// NOTE(review): p2 is a non-const reference although it is only read; unlike p1
	// it cannot bind to const objects or temporaries. Changing it requires changing
	// the out-of-class definition in the same commit.
	static void load_pixels(__m128i &xmm, const unsigned int &p1, unsigned int &p2);

	/// \brief Blends four pixels with weights derived from ifracx/ifracy, where 0x80 stands for 1.0.
	static void load_pixel_linear(__m128i &xmm, const unsigned int &p1, const unsigned int &p2, const unsigned int &p3, const unsigned int &p4, unsigned int ifracx, unsigned int ifracy);

	/// \brief Fills every 16-bit lane with 0x0100.
	static void set_one(__m128i &xmm);

	/// \brief Fills every 16-bit lane with 0x007f (the rounding term added before the >> 8 in the blend functions).
	static void set_half(__m128i &xmm);

	/// \brief Stores the same blue, green, red, alpha lanes for both pixels.
	static void set_color(__m128i &xmm, unsigned short red, unsigned short green, unsigned short blue, unsigned short alpha);

	/// \brief Stores color 1 in the low four lanes and color 2 in the high four lanes.
	static void set_color(__m128i &xmm, unsigned short r1, unsigned short g1, unsigned short b1, unsigned short a1, unsigned short r2, unsigned short g2, unsigned short b2, unsigned short a2);

	/// \brief Per lane: src = (src * primcolor) >> 8, keeping the low 16 bits of the product.
#ifdef _MSC_VER
	static void multiply_color(__m128i &src, __m128i &primcolor);
#else
	// Fix to compile on gcc
	static void multiply_color(__m128i &src, __m128i primcolor);
#endif

	/// \brief Per lane: dest = (dest * (one - a) + src * a + half) >> 8, with a = src alpha lane. Overwrites src with src * a.
	static void blend_normal(__m128i &dest, __m128i &src, __m128i &one, __m128i &half);

	/// \brief Per lane: dest = ((dest * (one - a) + half) >> 8) + src, with a = src alpha lane.
	static void blend_premultiplied(__m128i &dest, __m128i &src, __m128i &one, __m128i &half);

	/// \brief Per lane: dest = (src * color + dest * (one - (src + (src >> 7))) + half) >> 8.
	static void blend_lcd(__m128i &dest, __m128i &src, __m128i &one, __m128i &half, __m128i &color);

	/// \brief Packs the lanes back to bytes (unsigned saturation) and stores one pixel. Overwrites xmm with the packed value.
	static void store_pixel(unsigned int &pixel, __m128i &xmm);

	/// \brief Packs the lanes back to bytes (unsigned saturation) and stores two pixels. Overwrites xmm with the packed value.
	static void store_pixels(unsigned int *pixels, __m128i &xmm);

	/// \brief Splits two registers of four packed pixels into one register per channel.
	static void pixels_to_channels(__m128i &red, __m128i &green, __m128i &blue, __m128i &alpha, const __m128i &src0, const __m128i &src1);

	/// \brief Inverse of pixels_to_channels(): rebuilds two registers of four packed pixels.
	static void channels_to_pixels(__m128i &dest0, __m128i &dest1, __m128i &red, __m128i &green, __m128i &blue, __m128i &alpha);

//	static void sample_nearest(__m128i &out0, __m128i tx, __m128i ty, const unsigned int *data, int width);
/// \}
};
78 
79 inline void BlitARGB8SSE::copy_pixels(unsigned int *dest, const unsigned int *src)
80 {
81  __m128i src0;
82  src0 = _mm_loadl_epi64((const __m128i *) src);
83  _mm_storel_epi64((__m128i *) dest, src0);
84 }
85 
86 inline void BlitARGB8SSE::load_pixel(__m128i &xmm, const unsigned int &pixel)
87 {
88  xmm = _mm_cvtsi32_si128(pixel);
89  xmm = _mm_unpacklo_epi8(xmm, _mm_setzero_si128());
90 }
91 
92 inline void BlitARGB8SSE::load_pixels(__m128i &xmm, const unsigned int *pixels)
93 {
94  xmm = _mm_loadl_epi64((const __m128i *) pixels);
95  xmm = _mm_unpacklo_epi8(xmm, _mm_setzero_si128());
96 }
97 
98 inline void BlitARGB8SSE::load_pixels(__m128i &xmm, const unsigned int &p1, unsigned int &p2)
99 {
100  xmm = _mm_set_epi32(0, 0, p2, p1);
101  xmm = _mm_unpacklo_epi8(xmm, _mm_setzero_si128());
102 }
103 
104 inline void BlitARGB8SSE::load_pixel_linear(__m128i &xmm, const unsigned int &pixel1, const unsigned int &pixel2, const unsigned int &pixel3, const unsigned int &pixel4, unsigned int ifracx, unsigned int ifracy)
105 {
106  __m128i src0, src1, src2, src3;
107  __m128i frac0, frac1, frac2, frac3;
108  __m128i fracx, inv_fracx, fracy, inv_fracy;
109  __m128i half = _mm_set1_epi16(64);
110  fracx = _mm_set1_epi16(ifracx);
111  fracy = _mm_set1_epi16(ifracy);
112  inv_fracx = _mm_set1_epi16(0x80-ifracx);
113  inv_fracy = _mm_set1_epi16(0x80-ifracy);
114  frac0 = _mm_srli_epi16(_mm_mullo_epi16(inv_fracx, inv_fracy), 7);
115  frac1 = _mm_srli_epi16(_mm_mullo_epi16(fracx, inv_fracy), 7);
116  frac2 = _mm_srli_epi16(_mm_mullo_epi16(inv_fracx, fracy), 7);
117  frac3 = _mm_srli_epi16(_mm_mullo_epi16(fracx, fracy), 7);
118  src0 = _mm_mullo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(pixel1), _mm_setzero_si128()), frac0);
119  src1 = _mm_mullo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(pixel2), _mm_setzero_si128()), frac1);
120  src2 = _mm_mullo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(pixel3), _mm_setzero_si128()), frac2);
121  src3 = _mm_mullo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(pixel4), _mm_setzero_si128()), frac3);
122  xmm = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_add_epi16(_mm_add_epi16(src0, src1), src2), src3), half), 7);
123 }
124 
125 inline void BlitARGB8SSE::set_one(__m128i &xmm)
126 {
127  xmm = _mm_set1_epi16(0x0100);
128 }
129 
130 inline void BlitARGB8SSE::set_half(__m128i &xmm)
131 {
132  xmm = _mm_set1_epi16(0x007f);
133 }
134 
135 inline void BlitARGB8SSE::set_color(__m128i &xmm, unsigned short red, unsigned short green, unsigned short blue, unsigned short alpha)
136 {
137  xmm = _mm_set_epi16(alpha, red, green, blue, alpha, red, green, blue);
138 }
139 
140 inline void BlitARGB8SSE::set_color(__m128i &xmm, unsigned short r1, unsigned short g1, unsigned short b1, unsigned short a1, unsigned short r2, unsigned short g2, unsigned short b2, unsigned short a2)
141 {
142  xmm = _mm_set_epi16(a2, r2, g2, b2, a1, r1, g1, b1);
143 }
144 
145 #ifdef _MSC_VER
146 inline void BlitARGB8SSE::multiply_color(__m128i &src, __m128i &primcolor)
147 {
148  src = _mm_mullo_epi16(src, primcolor);
149  src = _mm_srli_epi16(src, 8);
150 }
151 #else
152  // For some reason "primcolor" cannot be a reference on gcc
153 inline void BlitARGB8SSE::multiply_color(__m128i &src, __m128i primcolor)
154 {
155  src = _mm_mullo_epi16(src, primcolor);
156  src = _mm_srli_epi16(src, 8);
157 }
158 #endif
159 
// Macro form of BlitARGB8SSE::multiply_color: per 16-bit lane,
// src = (low 16 bits of src * primcolor) >> 8. src must be a modifiable
// __m128i lvalue and is evaluated more than once.
//
// The expansion is a plain brace block (kept for compatibility with existing
// call sites), so "if (c) cl_blitargb8sse_multiply_color(a, b); else ..."
// does not compile; wrap the call in braces there.
#define cl_blitargb8sse_multiply_color(src, primcolor) \
{ \
	(src) = _mm_mullo_epi16((src), (primcolor)); \
	(src) = _mm_srli_epi16((src), 8); \
}
165 
166 inline void BlitARGB8SSE::blend_normal(__m128i &dest, __m128i &src, __m128i &one, __m128i &half)
167 {
168  __m128i src_alpha, invsrc_alpha;
169 
170  src_alpha = src;
171  src_alpha = _mm_shufflelo_epi16(src_alpha, 0xff);
172  src_alpha = _mm_shufflehi_epi16(src_alpha, 0xff);
173 
174  invsrc_alpha = _mm_sub_epi16(one, src_alpha);
175 
176  src = _mm_mullo_epi16(src, src_alpha);
177  dest = _mm_mullo_epi16(dest, invsrc_alpha);
178 
179  dest = _mm_add_epi16(dest, src);
180  dest = _mm_add_epi16(dest, half); // round up
181  dest = _mm_srli_epi16(dest, 8);
182 }
183 
// Macro form of BlitARGB8SSE::blend_normal. Per 16-bit lane:
//   dest = (dest * (one - a) + src * a + half) >> 8
// where a is the alpha lane (lane 3 of each pixel) of src broadcast to that
// pixel's four lanes. src is overwritten with src * a. dest and src must be
// modifiable __m128i lvalues; every argument is evaluated more than once.
//
// The temporaries carry a cl_bn_ prefix and trailing underscore so that call
// sites may pass variables named src_alpha / invsrc_alpha without the macro's
// own locals shadowing them.
//
// The expansion is a plain brace block (kept for compatibility with existing
// call sites), so it cannot be used as the unbraced body of an if that has
// an else.
#define cl_blitargb8sse_blend_normal(dest, src, one, half) \
{ \
	__m128i cl_bn_alpha_, cl_bn_inv_alpha_; \
\
	cl_bn_alpha_ = (src); \
	cl_bn_alpha_ = _mm_shufflelo_epi16(cl_bn_alpha_, 0xff); \
	cl_bn_alpha_ = _mm_shufflehi_epi16(cl_bn_alpha_, 0xff); \
\
	cl_bn_inv_alpha_ = _mm_sub_epi16((one), cl_bn_alpha_); \
\
	(src) = _mm_mullo_epi16((src), cl_bn_alpha_); \
	(dest) = _mm_mullo_epi16((dest), cl_bn_inv_alpha_); \
\
	(dest) = _mm_add_epi16((dest), (src)); \
	(dest) = _mm_add_epi16((dest), (half)); \
	(dest) = _mm_srli_epi16((dest), 8); \
}
201 
202 inline void BlitARGB8SSE::blend_premultiplied(__m128i &dest, __m128i &src, __m128i &one, __m128i &half)
203 {
204  __m128i src_alpha, invsrc_alpha;
205 
206  src_alpha = src;
207  src_alpha = _mm_shufflelo_epi16(src_alpha, 0xff);
208  src_alpha = _mm_shufflehi_epi16(src_alpha, 0xff);
209 
210  invsrc_alpha = _mm_sub_epi16(one, src_alpha);
211 
212  dest = _mm_mullo_epi16(dest, invsrc_alpha);
213  dest = _mm_add_epi16(dest, half); // round up
214  dest = _mm_srli_epi16(dest, 8);
215  dest = _mm_add_epi16(dest, src);
216 }
217 
218 inline void BlitARGB8SSE::blend_lcd(__m128i &dest, __m128i &src, __m128i &one, __m128i &half, __m128i &color)
219 {
220  __m128i invsrc;
221  invsrc = _mm_sub_epi16(one, _mm_add_epi16(_mm_srli_epi16(src, 7), src));
222 
223  dest = _mm_add_epi16(_mm_mullo_epi16(src, color), _mm_mullo_epi16(dest, invsrc));
224  dest = _mm_add_epi16(dest, half); // round up
225  dest = _mm_srli_epi16(dest, 8);
226 }
227 
228 inline void BlitARGB8SSE::store_pixel(unsigned int &pixel, __m128i &xmm)
229 {
230  xmm = _mm_packus_epi16(xmm, _mm_setzero_si128());
231  pixel = _mm_cvtsi128_si32(xmm);
232 }
233 
234 inline void BlitARGB8SSE::store_pixels(unsigned int *pixels, __m128i &xmm)
235 {
236  xmm = _mm_packus_epi16(xmm, _mm_setzero_si128());
237  _mm_storel_epi64((__m128i *) pixels, xmm);
238 }
239 
240 inline void BlitARGB8SSE::pixels_to_channels(__m128i &red, __m128i &green, __m128i &blue, __m128i &alpha, const __m128i &src0, const __m128i &src1)
241 {
242  __m128i alpha_mask = _mm_set1_epi32(0xff000000);
243  __m128i red_mask = _mm_set1_epi32(0x00ff0000);
244  __m128i green_mask = _mm_set1_epi32(0x0000ff00);
245  __m128i blue_mask = _mm_set1_epi32(0x000000ff);
246 
247  alpha = _mm_srli_si128(_mm_and_si128(alpha_mask, src0), 1);
248  alpha = _mm_or_si128(alpha, _mm_srli_si128(_mm_and_si128(alpha_mask, src1), 3));
249 
250  red = _mm_and_si128(red_mask, src0);
251  red = _mm_or_si128(red, _mm_srli_si128(_mm_and_si128(red_mask, src1), 2));
252 
253  green = _mm_slli_si128(_mm_and_si128(green_mask, src0), 1);
254  green = _mm_or_si128(green, _mm_srli_si128(_mm_and_si128(green_mask, src1), 1));
255 
256  blue = _mm_slli_si128(_mm_and_si128(blue_mask, src0), 2);
257  blue = _mm_or_si128(blue, _mm_and_si128(blue_mask, src1));
258 }
259 
260 inline void BlitARGB8SSE::channels_to_pixels(__m128i &dest0, __m128i &dest1, __m128i &red, __m128i &green, __m128i &blue, __m128i &alpha)
261 {
262  __m128i alpha_mask = _mm_set1_epi32(0xff000000);
263  __m128i red_mask = _mm_set1_epi32(0x00ff0000);
264  __m128i green_mask = _mm_set1_epi32(0x0000ff00);
265  __m128i blue_mask = _mm_set1_epi32(0x000000ff);
266 
267  dest0 = _mm_and_si128(alpha_mask, _mm_slli_si128(alpha, 1));
268  dest1 = _mm_and_si128(alpha_mask, _mm_slli_si128(alpha, 3));
269 
270  dest0 = _mm_or_si128(dest0, _mm_and_si128(red_mask, red));
271  dest1 = _mm_or_si128(dest1, _mm_and_si128(red_mask, _mm_slli_si128(red, 2)));
272 
273  dest0 = _mm_or_si128(dest0, _mm_and_si128(green_mask, _mm_srli_si128(green, 1)));
274  dest1 = _mm_or_si128(dest1, _mm_and_si128(green_mask, _mm_slli_si128(green, 1)));
275 
276  dest0 = _mm_or_si128(dest0, _mm_and_si128(blue_mask, _mm_srli_si128(blue, 2)));
277  dest1 = _mm_or_si128(dest1, _mm_and_si128(blue_mask, blue));
278 }
279 
// Compiler-specific spelling of "align this declaration to 16 bytes",
// required because _mm_store_si128 needs an aligned destination.
#ifdef _MSC_VER
#define cl_blitargb8sse_align16 __declspec(align(16))
#else
#define cl_blitargb8sse_align16 __attribute__ ((aligned(16)))
#endif

// Fetches four texels from data using the four 32-bit lanes of tx/ty.
// Each lane is arithmetic-shifted right by 16 to obtain its integer part,
// and the texel is read from data[x + y * width].
//
// _mm_set_epi32 lists the highest lane first, so the texel addressed by
// lane 0 of tx/ty ends up in lane 3 of out0, lane 1 in lane 2, and so on.
// NOTE(review): confirm that callers expect this reversed lane order.
//
// No bounds checking is performed. The integer parts are stored as unsigned
// values, so a negative coordinate does not index backwards.
// NOTE(review): presumably callers wrap coordinates first (e.g. with
// cl_blitargb8sse_texture_repeat); verify against call sites.
//
// data and width are evaluated four times. The temporaries carry a cl_sn_
// prefix so that arguments named x or y are not shadowed, and width is
// parenthesized so that an argument such as "w + 1" multiplies as a whole.
//
// The expansion is a plain brace block (kept for compatibility with existing
// call sites), so it cannot be used as the unbraced body of an if that has
// an else.
#define cl_blitargb8sse_sample_nearest(out0, tx, ty, data, width) \
{ \
	cl_blitargb8sse_align16 unsigned int cl_sn_x_[4], cl_sn_y_[4]; \
	_mm_store_si128((__m128i*) cl_sn_x_, _mm_srai_epi32((tx), 16)); \
	_mm_store_si128((__m128i*) cl_sn_y_, _mm_srai_epi32((ty), 16)); \
	(out0) = _mm_set_epi32( \
		(data)[cl_sn_x_[0]+cl_sn_y_[0]*(width)], \
		(data)[cl_sn_x_[1]+cl_sn_y_[1]*(width)], \
		(data)[cl_sn_x_[2]+cl_sn_y_[2]*(width)], \
		(data)[cl_sn_x_[3]+cl_sn_y_[3]*(width)]); \
}
301 
// Wraps the four 32-bit lanes of tx into [0, width) and of ty into
// [0, height) by repeatedly adding or subtracting width/height, but only in
// the lanes that are still out of range. tx and ty must be modifiable
// __m128i lvalues; width and height are __m128i values compared lane by lane.
//
// Sadly it seems that the Visual C++ 2008 compiler is unable to optimize
// BlitARGB8SSE::texture_repeat properly when implemented as an inline
// function. Maybe it is the branching or the loops that does it?
// Implemented as a macro instead.
//
// Each loop runs once per multiple of width/height that the furthest lane is
// out of range, and never terminates if an out-of-range lane has a
// width/height of zero.
//
// The temporary carries a cl_tr_ prefix so that an argument named
// compare_result is not shadowed.
//
// The expansion is a plain brace block (kept for compatibility with existing
// call sites), so it cannot be used as the unbraced body of an if that has
// an else.
#define cl_blitargb8sse_texture_repeat(tx, ty, width, height) \
{ \
	/* tx < 0: add width to the negative lanes */ \
	for (;;) \
	{ \
		__m128i cl_tr_mask_ = _mm_cmplt_epi32((tx), _mm_setzero_si128()); \
		if (_mm_movemask_epi8(cl_tr_mask_) == 0) \
			break; \
		(tx) = _mm_add_epi32((tx), _mm_and_si128(cl_tr_mask_, (width))); \
	} \
	/* tx >= width: subtract width from the lanes that are not below it */ \
	for (;;) \
	{ \
		__m128i cl_tr_mask_ = _mm_cmplt_epi32((tx), (width)); \
		if (_mm_movemask_epi8(cl_tr_mask_) == 0xffff) \
			break; \
		(tx) = _mm_sub_epi32((tx), _mm_andnot_si128(cl_tr_mask_, (width))); \
	} \
	/* ty < 0: add height to the negative lanes */ \
	for (;;) \
	{ \
		__m128i cl_tr_mask_ = _mm_cmplt_epi32((ty), _mm_setzero_si128()); \
		if (_mm_movemask_epi8(cl_tr_mask_) == 0) \
			break; \
		(ty) = _mm_add_epi32((ty), _mm_and_si128(cl_tr_mask_, (height))); \
	} \
	/* ty >= height: subtract height from the lanes that are not below it */ \
	for (;;) \
	{ \
		__m128i cl_tr_mask_ = _mm_cmplt_epi32((ty), (height)); \
		if (_mm_movemask_epi8(cl_tr_mask_) == 0xffff) \
			break; \
		(ty) = _mm_sub_epi32((ty), _mm_andnot_si128(cl_tr_mask_, (height))); \
	} \
}
340 
341 
342 }
343 
344 #endif
345 #endif
346 
348 
static void channels_to_pixels(__m128i &dest0, __m128i &dest1, __m128i &red, __m128i &green, __m128i &blue, __m128i &alpha)
Definition: blit_argb8_sse.h:260
SSE accelerated rendering operations for ARGB8888.
Definition: blit_argb8_sse.h:48
static void store_pixel(unsigned int &pixel, __m128i &xmm)
Definition: blit_argb8_sse.h:228
static void multiply_color(__m128i &src, __m128i primcolor)
Definition: blit_argb8_sse.h:153
static void copy_pixels(unsigned int *dest, const unsigned int *src)
Operations.
Definition: blit_argb8_sse.h:79
static void set_color(__m128i &xmm, unsigned short red, unsigned short green, unsigned short blue, unsigned short alpha)
Definition: blit_argb8_sse.h:135
static void set_half(__m128i &xmm)
Definition: blit_argb8_sse.h:130
static void pixels_to_channels(__m128i &red, __m128i &green, __m128i &blue, __m128i &alpha, const __m128i &src0, const __m128i &src1)
Definition: blit_argb8_sse.h:240
static void blend_normal(__m128i &dest, __m128i &src, __m128i &one, __m128i &half)
Definition: blit_argb8_sse.h:166
static void blend_lcd(__m128i &dest, __m128i &src, __m128i &one, __m128i &half, __m128i &color)
Definition: blit_argb8_sse.h:218
static void set_one(__m128i &xmm)
Definition: blit_argb8_sse.h:125
static void load_pixels(__m128i &xmm, const unsigned int *pixels)
Definition: blit_argb8_sse.h:92
static void load_pixel_linear(__m128i &xmm, const unsigned int &p1, const unsigned int &p2, const unsigned int &p3, const unsigned int &p4, unsigned int ifracx, unsigned int ifracy)
Definition: blit_argb8_sse.h:104
static void store_pixels(unsigned int *pixels, __m128i &xmm)
Definition: blit_argb8_sse.h:234
static void load_pixel(__m128i &xmm, const unsigned int &pixel)
Definition: blit_argb8_sse.h:86
static void blend_premultiplied(__m128i &dest, __m128i &src, __m128i &one, __m128i &half)
Definition: blit_argb8_sse.h:202