/*
 * software RGB to RGB converter
 * pluralize by software PAL8 to RGB converter
 *              software YUV to YUV converter
 *              software YUV to RGB converter
 * Written by Nick Kurshev.
 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
 * lot of big-endian byte order fixes by Alex Beregszaszi
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stddef.h>

#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef PAVGB

#if COMPILE_TEMPLATE_AMD3DNOW
#define PREFETCH "prefetch"
#define PAVGB    "pavgusb"
#elif COMPILE_TEMPLATE_MMXEXT
#define PREFETCH "prefetchnta"
#define PAVGB    "pavgb"
#else
#define PREFETCH " # nop"
#endif

#if COMPILE_TEMPLATE_AMD3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

#if COMPILE_TEMPLATE_MMXEXT
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif

#if !COMPILE_TEMPLATE_SSE2

#if !COMPILE_TEMPLATE_AMD3DNOW

static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 23;
    __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "punpckldq 3(%1), %%mm0 \n\t"
            "movd 6(%1), %%mm1 \n\t"
            "punpckldq 9(%1), %%mm1 \n\t"
            "movd 12(%1), %%mm2 \n\t"
            "punpckldq 15(%1), %%mm2 \n\t"
            "movd 18(%1), %%mm3 \n\t"
            "punpckldq 21(%1), %%mm3 \n\t"
            "por %%mm7, %%mm0 \n\t"
            "por %%mm7, %%mm1 \n\t"
            "por %%mm7, %%mm2 \n\t"
            "por %%mm7, %%mm3 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            MOVNTQ" %%mm1, 8(%0) \n\t"
            MOVNTQ" %%mm2, 16(%0) \n\t"
            MOVNTQ" %%mm3, 24(%0)"
            :: "r"(dest), "r"(s)
            : "memory");
        dest += 32;
        s += 24;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 255;
    }
}

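/*
 * Editorial note (not in the original file): STORE_BGR24_MMX below expects
 * eight BGR0 dwords spread over mm0/mm1/mm4/mm5 with byte-identical copies
 * in mm2/mm3/mm6/mm7. The mask24l/mask24h pands drop every fourth (padding)
 * byte, and the shift/or cascade closes the gaps, so the three MOVNTQs emit
 * 24 packed bytes for every 32 bytes of 32-bit input.
 */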
#define STORE_BGR24_MMX \
    "psrlq $8, %%mm2 \n\t" \
    "psrlq $8, %%mm3 \n\t" \
    "psrlq $8, %%mm6 \n\t" \
    "psrlq $8, %%mm7 \n\t" \
    "pand "MANGLE(mask24l)", %%mm0 \n\t" \
    "pand "MANGLE(mask24l)", %%mm1 \n\t" \
    "pand "MANGLE(mask24l)", %%mm4 \n\t" \
    "pand "MANGLE(mask24l)", %%mm5 \n\t" \
    "pand "MANGLE(mask24h)", %%mm2 \n\t" \
    "pand "MANGLE(mask24h)", %%mm3 \n\t" \
    "pand "MANGLE(mask24h)", %%mm6 \n\t" \
    "pand "MANGLE(mask24h)", %%mm7 \n\t" \
    "por %%mm2, %%mm0 \n\t" \
    "por %%mm3, %%mm1 \n\t" \
    "por %%mm6, %%mm4 \n\t" \
    "por %%mm7, %%mm5 \n\t" \
    \
    "movq %%mm1, %%mm2 \n\t" \
    "movq %%mm4, %%mm3 \n\t" \
    "psllq $48, %%mm2 \n\t" \
    "psllq $32, %%mm3 \n\t" \
    "pand "MANGLE(mask24hh)", %%mm2 \n\t" \
    "pand "MANGLE(mask24hhh)", %%mm3 \n\t" \
    "por %%mm2, %%mm0 \n\t" \
    "psrlq $16, %%mm1 \n\t" \
    "psrlq $32, %%mm4 \n\t" \
    "psllq $16, %%mm5 \n\t" \
    "por %%mm3, %%mm1 \n\t" \
    "pand "MANGLE(mask24hhhh)", %%mm5 \n\t" \
    "por %%mm5, %%mm4 \n\t" \
    \
    MOVNTQ" %%mm0, (%0) \n\t" \
    MOVNTQ" %%mm1, 8(%0) \n\t" \
    MOVNTQ" %%mm4, 16(%0)"

static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 31;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq 8(%1), %%mm1 \n\t"
            "movq 16(%1), %%mm4 \n\t"
            "movq 24(%1), %%mm5 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"
            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            STORE_BGR24_MMX
            :: "r"(dest), "r"(s)
            : "memory");
        dest += 24;
        s += 32;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++;
    }
}

/*
   original by Strepto/Astral
   ported to gcc & bugfixed: A'rpi
   MMXEXT, 3DNOW optimization by Nick Kurshev
   32-bit C version, and and&add trick by Michael Niedermayer
*/
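/*
 * Editorial note (not in the original file) on the and&add trick used in
 * the scalar tail below: RGB555 -> RGB565 means moving the red and green
 * fields (bits 5-14) up by one bit while blue (bits 0-4) stays put.
 * (x & 0x7FE0) selects exactly those ten bits, so x + (x & 0x7FE0) doubles
 * them in place; the 32-bit variant handles two pixels per add. Example:
 * white 0x7FFF becomes 0x7FFF + 0x7FE0 = 0xFFDF, i.e. the freed low green
 * bit stays zero (compare the bit-replication note further down).
 */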
static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size)
{
    register const uint8_t *s = src;
    register uint8_t *d = dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq 8(%1), %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "pand %%mm4, %%mm0 \n\t"
            "pand %%mm4, %%mm2 \n\t"
            "paddw %%mm1, %%mm0 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            MOVNTQ" %%mm2, 8(%0)"
            :: "r"(d), "r"(s)
        );
        d += 16;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    mm_end = end - 3;
    while (s < mm_end) {
        register unsigned x = *((const uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d += 4;
        s += 4;
    }
    if (s < end) {
        register unsigned short x = *((const uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}

static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size)
{
    register const uint8_t *s = src;
    register uint8_t *d = dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
    __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq 8(%1), %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $1, %%mm0 \n\t"
            "psrlq $1, %%mm2 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm3 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            MOVNTQ" %%mm2, 8(%0)"
            :: "r"(d), "r"(s)
        );
        d += 16;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    mm_end = end - 3;
    while (s < mm_end) {
        register uint32_t x = *((const uint32_t *)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s += 4;
        d += 4;
    }
    if (s < end) {
        register uint16_t x = *((const uint16_t *)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
    }
}

static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = end - 15;
    __asm__ volatile(
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        "jmp 2f \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $5, %%mm0 \n\t"
        "pslld $11, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "add $16, %1 \n\t"
        "add $8, %0 \n\t"
        "2: \n\t"
        "cmp %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
    );
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
    }
}

static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 4(%1), %%mm3 \n\t"
            "punpckldq 8(%1), %%mm0 \n\t"
            "punpckldq 12(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $8, %%mm0 \n\t"
            "psllq $8, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            :: "r"(d),"r"(s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
    }
}

static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = end - 15;
    __asm__ volatile(
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        "jmp 2f \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $6, %%mm0 \n\t"
        "pslld $10, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "add $16, %1 \n\t"
        "add $8, %0 \n\t"
        "2: \n\t"
        "cmp %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
    );
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
    }
}

static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 4(%1), %%mm3 \n\t"
            "punpckldq 8(%1), %%mm0 \n\t"
            "punpckldq 12(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $7, %%mm0 \n\t"
            "psllq $7, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
    }
}

static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 11;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 3(%1), %%mm3 \n\t"
            "punpckldq 6(%1), %%mm0 \n\t"
            "punpckldq 9(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}

static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 3(%1), %%mm3 \n\t"
            "punpckldq 6(%1), %%mm0 \n\t"
            "punpckldq 9(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $8, %%mm0 \n\t"
            "psllq $8, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}

static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 11;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 3(%1), %%mm3 \n\t"
            "punpckldq 6(%1), %%mm0 \n\t"
            "punpckldq 9(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $9, %%mm2 \n\t"
            "psrlq $9, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}

static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 3(%1), %%mm3 \n\t"
            "punpckldq 6(%1), %%mm0 \n\t"
            "punpckldq 9(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $7, %%mm0 \n\t"
            "psllq $7, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}

/*
   I use a less accurate approximation here by simply left-shifting the input
   value and filling the low order bits with zeroes. This method improves PNG
   compression but this scheme cannot reproduce white exactly, since it does
   not generate an all-ones maximum value; the net effect is to darken the
   image slightly.

   The better method should be "left bit replication":

   4 3 2 1 0
   ---------
   1 1 0 1 1

   7 6 5 4 3  2 1 0
   ----------------
   1 1 0 1 1  1 1 0
   |=======|  |===|
       |      leftmost bits repeated to fill open bits
       |
   original bits
*/
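/*
 * Editorial sketch, not part of the original file: left bit replication for
 * one 5-bit field, as described above. The helper name expand5to8 is
 * hypothetical; RENAME() keeps it unique per template instantiation.
 */
static inline uint8_t RENAME(expand5to8)(uint8_t v)
{
    /* Shift the 5-bit value (0..31) to the top of the byte and repeat its
     * three leftmost bits in the freed low bits: 0 -> 0, 31 -> 255. */
    return (uint8_t)((v << 3) | (v >> 2));
}
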
static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            "movq %%mm0, %%mm6 \n\t"
            "movq %%mm3, %%mm7 \n\t"

            "movq 8(%1), %%mm0 \n\t"
            "movq 8(%1), %%mm1 \n\t"
            "movq 8(%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            :"=m"(*d)
            :"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
            :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "movq %%mm6, %%mm0 \n\t"
            "movq %%mm7, %%mm1 \n\t"

            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"

            STORE_BGR24_MMX

            :: "r"(d), "m"(*s)
            : "memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
    }
}

static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            "movq %%mm0, %%mm6 \n\t"
            "movq %%mm3, %%mm7 \n\t"

            "movq 8(%1), %%mm0 \n\t"
            "movq 8(%1), %%mm1 \n\t"
            "movq 8(%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"
            :"=m"(*d)
            :"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
            :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "movq %%mm6, %%mm0 \n\t"
            "movq %%mm7, %%mm1 \n\t"

            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"

            STORE_BGR24_MMX

            :: "r"(d), "m"(*s)
            : "memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
    }
}

/*
 * mm0 = 00 B3 00 B2 00 B1 00 B0
 * mm1 = 00 G3 00 G2 00 G1 00 G0
 * mm2 = 00 R3 00 R2 00 R1 00 R0
 * mm6 = FF FF FF FF FF FF FF FF
 * mm7 = 00 00 00 00 00 00 00 00
 */
#define PACK_RGB32 \
    "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
    "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
    "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
    "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
    "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
    "movq %%mm0, %%mm3 \n\t" \
    "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
    "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
    MOVNTQ" %%mm0, (%0) \n\t" \
    MOVNTQ" %%mm3, 8(%0) \n\t" \

static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
    mm_end = end - 3;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            PACK_RGB32
            ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = 255;
    }
}

static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
    mm_end = end - 3;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            PACK_RGB32
            ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
        *d++ = 255;
    }
}

static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size)
{
    x86_reg idx = 15 - src_size;
    const uint8_t *s = src - idx;
    uint8_t *d = dst - idx;
    __asm__ volatile(
        "test %0, %0 \n\t"
        "jns 2f \n\t"
        PREFETCH" (%1, %0) \n\t"
        "movq %3, %%mm7 \n\t"
        "pxor %4, %%mm7 \n\t"
        "movq %%mm7, %%mm6 \n\t"
        "pxor %5, %%mm7 \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1, %0) \n\t"
        "movq (%1, %0), %%mm0 \n\t"
        "movq 8(%1, %0), %%mm1 \n\t"
#if COMPILE_TEMPLATE_MMXEXT
        "pshufw $177, %%mm0, %%mm3 \n\t"
        "pshufw $177, %%mm1, %%mm5 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm6, %%mm5 \n\t"
        "por %%mm3, %%mm0 \n\t"
        "por %%mm5, %%mm1 \n\t"
#else
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm6, %%mm4 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "pslld $16, %%mm2 \n\t"
        "psrld $16, %%mm3 \n\t"
        "pslld $16, %%mm4 \n\t"
        "psrld $16, %%mm5 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "por %%mm4, %%mm1 \n\t"
        "por %%mm3, %%mm0 \n\t"
        "por %%mm5, %%mm1 \n\t"
#endif
        MOVNTQ" %%mm0, (%2, %0) \n\t"
        MOVNTQ" %%mm1, 8(%2, %0) \n\t"
        "add $16, %0 \n\t"
        "js 1b \n\t"
        SFENCE" \n\t"
        EMMS" \n\t"
        "2: \n\t"
        : "+&r"(idx)
        : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
        : "memory");
    for (; idx < 15; idx += 4) {
        register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
        v &= 0xff00ff;
        *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
    }
}

static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
{
    unsigned i;
    x86_reg mmx_size = 23 - src_size;
    __asm__ volatile (
        "test %%"REG_a", %%"REG_a" \n\t"
        "jns 2f \n\t"
        "movq "MANGLE(mask24r)", %%mm5 \n\t"
        "movq "MANGLE(mask24g)", %%mm6 \n\t"
        "movq "MANGLE(mask24b)", %%mm7 \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1, %%"REG_a") \n\t"
        "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
        "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
        "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
        "psllq $16, %%mm0 \n\t" // 00 BGR BGR
        "pand %%mm5, %%mm0 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
        MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
        "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
        "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
        MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
        "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
        "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm5, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
        "add $24, %%"REG_a" \n\t"
        " js 1b \n\t"
        "2: \n\t"
        : "+a" (mmx_size)
        : "r" (src-mmx_size), "r"(dst-mmx_size)
    );

    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");

    if (mmx_size == 23) return; // finished, was multiple of 8

    src += src_size;
    dst += src_size;
    src_size = 23 - mmx_size;
    src -= src_size;
    dst -= src_size;
    for (i = 0; i < src_size; i += 3) {
        register uint8_t x;
        x          = src[i + 2];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
        dst[i + 0] = x;
    }
}

static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           int width, int height,
                                           int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
    int y;
    const x86_reg chromWidth = width >> 1;
    for (y = 0; y < height; y++) {
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
            PREFETCH" 32(%2, %%"REG_a") \n\t"
            PREFETCH" 32(%3, %%"REG_a") \n\t"
            "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

            "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
            "movq %%mm3, %%mm4 \n\t" // Y(0)
            "movq %%mm5, %%mm6 \n\t" // Y(8)
            "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)

            MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst  += dstStride;
    }
    __asm__(EMMS" \n\t"
            SFENCE" \n\t"
            :::"memory");
}
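
/*
 * Editorial sketch, not part of the original file: a scalar reference of
 * the YUY2 packing performed above, for one output line. Two horizontally
 * adjacent luma samples share one U/V pair, giving the byte order
 * Y0 U0 Y1 V0. The function name is hypothetical.
 */
static inline void RENAME(yuy2_pack_line_ref)(const uint8_t *y, const uint8_t *u,
                                              const uint8_t *v, uint8_t *dst, int width)
{
    int i;
    for (i = 0; i < width / 2; i++) {
        dst[4*i + 0] = y[2*i];     /* Y0 */
        dst[4*i + 1] = u[i];       /* U  */
        dst[4*i + 2] = y[2*i + 1]; /* Y1 */
        dst[4*i + 3] = v[i];       /* V  */
    }
}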

/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      int width, int height,
                                      int lumStride, int chromStride, int dstStride)
{
    //FIXME interpolate chroma
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}

static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           int width, int height,
                                           int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
    int y;
    const x86_reg chromWidth = width >> 1;
    for (y = 0; y < height; y++) {
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
            PREFETCH" 32(%2, %%"REG_a") \n\t"
            PREFETCH" 32(%3, %%"REG_a") \n\t"
            "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

            "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
            "movq %%mm0, %%mm4 \n\t" // UVUV UVUV(0)
            "movq %%mm2, %%mm6 \n\t" // UVUV UVUV(8)
            "punpcklbw %%mm3, %%mm0 \n\t" // UYVY UYVY(0)
            "punpckhbw %%mm3, %%mm4 \n\t" // UYVY UYVY(4)
            "punpcklbw %%mm5, %%mm2 \n\t" // UYVY UYVY(8)
            "punpckhbw %%mm5, %%mm6 \n\t" // UYVY UYVY(12)

            MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst  += dstStride;
    }
    __asm__(EMMS" \n\t"
            SFENCE" \n\t"
            :::"memory");
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      int width, int height,
                                      int lumStride, int chromStride, int dstStride)
{
    //FIXME interpolate chroma
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}

/**
 * Width should be a multiple of 16.
 */
static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         int width, int height,
                                         int lumStride, int chromStride, int dstStride)
{
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}

/**
 * Width should be a multiple of 16.
 */
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         int width, int height,
                                         int lumStride, int chromStride, int dstStride)
{
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      int width, int height,
                                      int lumStride, int chromStride, int srcStride)
{
    int y;
    const x86_reg chromWidth = width >> 1;
    for (y = 0; y < height; y += 2) {
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
            "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)

            MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"

            "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
            "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
            "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"

            "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)

            MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
            MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );

        ydst += lumStride;
        src  += srcStride;

        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
            "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"

            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
    __asm__ volatile(EMMS" \n\t"
                     SFENCE" \n\t"
                     :::"memory");
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */

#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
{
    int x, y;

    dst[0] = src[0];

    // first line
    for (x = 0; x < srcWidth - 1; x++) {
        dst[2*x+1] = (3*src[x] +   src[x+1]) >> 2;
        dst[2*x+2] = (  src[x] + 3*src[x+1]) >> 2;
    }
    dst[2*srcWidth-1] = src[srcWidth-1];

    dst += dstStride;

    for (y = 1; y < srcHeight; y++) {
        const x86_reg mmxSize = srcWidth & ~15;
        __asm__ volatile(
            "mov %4, %%"REG_a" \n\t"
            "movq "MANGLE(mmx_ff)", %%mm0 \n\t"
            "movq (%0, %%"REG_a"), %%mm4 \n\t"
            "movq %%mm4, %%mm2 \n\t"
            "psllq $8, %%mm4 \n\t"
            "pand %%mm0, %%mm2 \n\t"
            "por %%mm2, %%mm4 \n\t"
            "movq (%1, %%"REG_a"), %%mm5 \n\t"
            "movq %%mm5, %%mm3 \n\t"
            "psllq $8, %%mm5 \n\t"
            "pand %%mm0, %%mm3 \n\t"
            "por %%mm3, %%mm5 \n\t"
            "1: \n\t"
            "movq (%0, %%"REG_a"), %%mm0 \n\t"
            "movq (%1, %%"REG_a"), %%mm1 \n\t"
            "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
            "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
            PAVGB" %%mm0, %%mm5 \n\t"
            PAVGB" %%mm0, %%mm3 \n\t"
            PAVGB" %%mm0, %%mm5 \n\t"
            PAVGB" %%mm0, %%mm3 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm1, %%mm2 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm1, %%mm2 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm4, %%mm6 \n\t"
            "punpcklbw %%mm3, %%mm5 \n\t"
            "punpckhbw %%mm3, %%mm7 \n\t"
            "punpcklbw %%mm2, %%mm4 \n\t"
            "punpckhbw %%mm2, %%mm6 \n\t"
            MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
            "add $8, %%"REG_a" \n\t"
            "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
            "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
            " js 1b \n\t"
            :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
               "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
               "g" (-mmxSize)
            : "%"REG_a
        );

        for (x = mmxSize - 1; x < srcWidth - 1; x++) {
            dst[2*x          +1] = (3*src[x+0] +   src[x+srcStride+1]) >> 2;
            dst[2*x+dstStride+2] = (  src[x+0] + 3*src[x+srcStride+1]) >> 2;
            dst[2*x+dstStride+1] = (  src[x+1] + 3*src[x+srcStride  ]) >> 2;
            dst[2*x          +2] = (3*src[x+1] +   src[x+srcStride  ]) >> 2;
        }
        dst[srcWidth*2 -1            ] = (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride]) >> 2;
        dst[srcWidth*2 -1 + dstStride] = (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride]) >> 2;

        dst += dstStride*2;
        src += srcStride;
    }

    // last line
    dst[0] = src[0];

    for (x = 0; x < srcWidth - 1; x++) {
        dst[2*x+1] = (3*src[x] +   src[x+1]) >> 2;
        dst[2*x+2] = (  src[x] + 3*src[x+1]) >> 2;
    }
    dst[2*srcWidth-1] = src[srcWidth-1];

    __asm__ volatile(EMMS" \n\t"
                     SFENCE" \n\t"
                     :::"memory");
}
#endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */

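/*
 * Editorial sketch, not part of the original file: the 3:1 interpolation
 * kernel that planar2x applies between neighboring samples. The MMX path
 * reaches the same weighting through two PAVGB steps, which round and may
 * therefore differ by one LSB from this truncating form. The helper name
 * interp31 is hypothetical.
 */
static inline uint8_t RENAME(interp31)(uint8_t a, uint8_t b)
{
    /* Weighted average favoring a, as in dst[2*x+1] = (3*src[x] + src[x+1]) >> 2. */
    return (uint8_t)((3*a + b) >> 2);
}
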
#if !COMPILE_TEMPLATE_AMD3DNOW

/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Chrominance data is only taken from every second line, others are ignored.
 * FIXME: Write HQ version.
 */
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      int width, int height,
                                      int lumStride, int chromStride, int srcStride)
{
    int y;
    const x86_reg chromWidth = width >> 1;
    for (y = 0; y < height; y += 2) {
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
            "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
            "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
            "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
            "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
            "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)

            MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"

            "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
            "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
            "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
            "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
            "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"

            "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)

            MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
            MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );

        ydst += lumStride;
        src  += srcStride;

        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
            "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(8)
            "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // UYVY UYVY(12)
            "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
            "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"

            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
    __asm__ volatile(EMMS" \n\t"
                     SFENCE" \n\t"
                     :::"memory");
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */

1628 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1629  int width, int height,
1630  int lumStride, int chromStride, int srcStride)
1631 {
1632  int y;
1633  const x86_reg chromWidth= width>>1;
1634  for (y=0; y<height-2; y+=2) {
1635  int i;
1636  for (i=0; i<2; i++) {
1637  __asm__ volatile(
1638  "mov %2, %%"REG_a" \n\t"
1639  "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
1640  "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1641  "pxor %%mm7, %%mm7 \n\t"
1642  "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1643  ".p2align 4 \n\t"
1644  "1: \n\t"
1645  PREFETCH" 64(%0, %%"REG_d") \n\t"
1646  "movd (%0, %%"REG_d"), %%mm0 \n\t"
1647  "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
1648  "punpcklbw %%mm7, %%mm0 \n\t"
1649  "punpcklbw %%mm7, %%mm1 \n\t"
1650  "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
1651  "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
1652  "punpcklbw %%mm7, %%mm2 \n\t"
1653  "punpcklbw %%mm7, %%mm3 \n\t"
1654  "pmaddwd %%mm6, %%mm0 \n\t"
1655  "pmaddwd %%mm6, %%mm1 \n\t"
1656  "pmaddwd %%mm6, %%mm2 \n\t"
1657  "pmaddwd %%mm6, %%mm3 \n\t"
1658 #ifndef FAST_BGR2YV12
1659  "psrad $8, %%mm0 \n\t"
1660  "psrad $8, %%mm1 \n\t"
1661  "psrad $8, %%mm2 \n\t"
1662  "psrad $8, %%mm3 \n\t"
1663 #endif
1664  "packssdw %%mm1, %%mm0 \n\t"
1665  "packssdw %%mm3, %%mm2 \n\t"
1666  "pmaddwd %%mm5, %%mm0 \n\t"
1667  "pmaddwd %%mm5, %%mm2 \n\t"
1668  "packssdw %%mm2, %%mm0 \n\t"
1669  "psraw $7, %%mm0 \n\t"
1670 
1671  "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
1672  "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
1673  "punpcklbw %%mm7, %%mm4 \n\t"
1674  "punpcklbw %%mm7, %%mm1 \n\t"
1675  "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
1676  "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
1677  "punpcklbw %%mm7, %%mm2 \n\t"
1678  "punpcklbw %%mm7, %%mm3 \n\t"
1679  "pmaddwd %%mm6, %%mm4 \n\t"
1680  "pmaddwd %%mm6, %%mm1 \n\t"
1681  "pmaddwd %%mm6, %%mm2 \n\t"
1682  "pmaddwd %%mm6, %%mm3 \n\t"
1683 #ifndef FAST_BGR2YV12
1684  "psrad $8, %%mm4 \n\t"
1685  "psrad $8, %%mm1 \n\t"
1686  "psrad $8, %%mm2 \n\t"
1687  "psrad $8, %%mm3 \n\t"
1688 #endif
1689  "packssdw %%mm1, %%mm4 \n\t"
1690  "packssdw %%mm3, %%mm2 \n\t"
1691  "pmaddwd %%mm5, %%mm4 \n\t"
1692  "pmaddwd %%mm5, %%mm2 \n\t"
1693  "add $24, %%"REG_d" \n\t"
1694  "packssdw %%mm2, %%mm4 \n\t"
1695  "psraw $7, %%mm4 \n\t"
1696 
1697  "packuswb %%mm4, %%mm0 \n\t"
1698  "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
1699 
1700  MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
1701  "add $8, %%"REG_a" \n\t"
1702  " js 1b \n\t"
1703  : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
1704  : "%"REG_a, "%"REG_d
1705  );
1706  ydst += lumStride;
1707  src += srcStride;
1708  }
1709  src -= srcStride*2;
1710  __asm__ volatile(
1711  "mov %4, %%"REG_a" \n\t"
1712  "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1713  "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
1714  "pxor %%mm7, %%mm7 \n\t"
1715  "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1716  "add %%"REG_d", %%"REG_d" \n\t"
1717  ".p2align 4 \n\t"
1718  "1: \n\t"
1719  PREFETCH" 64(%0, %%"REG_d") \n\t"
1720  PREFETCH" 64(%1, %%"REG_d") \n\t"
1721 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
1722  "movq (%0, %%"REG_d"), %%mm0 \n\t"
1723  "movq (%1, %%"REG_d"), %%mm1 \n\t"
1724  "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
1725  "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
1726  PAVGB" %%mm1, %%mm0 \n\t"
1727  PAVGB" %%mm3, %%mm2 \n\t"
1728  "movq %%mm0, %%mm1 \n\t"
1729  "movq %%mm2, %%mm3 \n\t"
1730  "psrlq $24, %%mm0 \n\t"
1731  "psrlq $24, %%mm2 \n\t"
1732  PAVGB" %%mm1, %%mm0 \n\t"
1733  PAVGB" %%mm3, %%mm2 \n\t"
1734  "punpcklbw %%mm7, %%mm0 \n\t"
1735  "punpcklbw %%mm7, %%mm2 \n\t"
1736 #else
1737  "movd (%0, %%"REG_d"), %%mm0 \n\t"
1738  "movd (%1, %%"REG_d"), %%mm1 \n\t"
1739  "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
1740  "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
1741  "punpcklbw %%mm7, %%mm0 \n\t"
1742  "punpcklbw %%mm7, %%mm1 \n\t"
1743  "punpcklbw %%mm7, %%mm2 \n\t"
1744  "punpcklbw %%mm7, %%mm3 \n\t"
1745  "paddw %%mm1, %%mm0 \n\t"
1746  "paddw %%mm3, %%mm2 \n\t"
1747  "paddw %%mm2, %%mm0 \n\t"
1748  "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
1749  "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
1750  "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
1751  "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
1752  "punpcklbw %%mm7, %%mm4 \n\t"
1753  "punpcklbw %%mm7, %%mm1 \n\t"
1754  "punpcklbw %%mm7, %%mm2 \n\t"
1755  "punpcklbw %%mm7, %%mm3 \n\t"
1756  "paddw %%mm1, %%mm4 \n\t"
1757  "paddw %%mm3, %%mm2 \n\t"
1758  "paddw %%mm4, %%mm2 \n\t"
1759  "psrlw $2, %%mm0 \n\t"
1760  "psrlw $2, %%mm2 \n\t"
1761 #endif
1762  "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
1763  "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
1764 
1765  "pmaddwd %%mm0, %%mm1 \n\t"
1766  "pmaddwd %%mm2, %%mm3 \n\t"
1767  "pmaddwd %%mm6, %%mm0 \n\t"
1768  "pmaddwd %%mm6, %%mm2 \n\t"
1769 #ifndef FAST_BGR2YV12
1770  "psrad $8, %%mm0 \n\t"
1771  "psrad $8, %%mm1 \n\t"
1772  "psrad $8, %%mm2 \n\t"
1773  "psrad $8, %%mm3 \n\t"
1774 #endif
1775  "packssdw %%mm2, %%mm0 \n\t"
1776  "packssdw %%mm3, %%mm1 \n\t"
1777  "pmaddwd %%mm5, %%mm0 \n\t"
1778  "pmaddwd %%mm5, %%mm1 \n\t"
1779  "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1780  "psraw $7, %%mm0 \n\t"
1781 
1782 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
1783  "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
1784  "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
1785  "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
1786  "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
1787  PAVGB" %%mm1, %%mm4 \n\t"
1788  PAVGB" %%mm3, %%mm2 \n\t"
1789  "movq %%mm4, %%mm1 \n\t"
1790  "movq %%mm2, %%mm3 \n\t"
1791  "psrlq $24, %%mm4 \n\t"
1792  "psrlq $24, %%mm2 \n\t"
1793  PAVGB" %%mm1, %%mm4 \n\t"
1794  PAVGB" %%mm3, %%mm2 \n\t"
1795  "punpcklbw %%mm7, %%mm4 \n\t"
1796  "punpcklbw %%mm7, %%mm2 \n\t"
1797 #else
1798  "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
1799  "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
1800  "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
1801  "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
1802  "punpcklbw %%mm7, %%mm4 \n\t"
1803  "punpcklbw %%mm7, %%mm1 \n\t"
1804  "punpcklbw %%mm7, %%mm2 \n\t"
1805  "punpcklbw %%mm7, %%mm3 \n\t"
1806  "paddw %%mm1, %%mm4 \n\t"
1807  "paddw %%mm3, %%mm2 \n\t"
1808  "paddw %%mm2, %%mm4 \n\t"
1809  "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
1810  "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
1811  "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
1812  "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
1813  "punpcklbw %%mm7, %%mm5 \n\t"
1814  "punpcklbw %%mm7, %%mm1 \n\t"
1815  "punpcklbw %%mm7, %%mm2 \n\t"
1816  "punpcklbw %%mm7, %%mm3 \n\t"
1817  "paddw %%mm1, %%mm5 \n\t"
1818  "paddw %%mm3, %%mm2 \n\t"
1819  "paddw %%mm5, %%mm2 \n\t"
1820  "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1821  "psrlw $2, %%mm4 \n\t"
1822  "psrlw $2, %%mm2 \n\t"
1823 #endif
1824  "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
1825  "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
1826 
1827  "pmaddwd %%mm4, %%mm1 \n\t"
1828  "pmaddwd %%mm2, %%mm3 \n\t"
1829  "pmaddwd %%mm6, %%mm4 \n\t"
1830  "pmaddwd %%mm6, %%mm2 \n\t"
1831 #ifndef FAST_BGR2YV12
1832  "psrad $8, %%mm4 \n\t"
1833  "psrad $8, %%mm1 \n\t"
1834  "psrad $8, %%mm2 \n\t"
1835  "psrad $8, %%mm3 \n\t"
1836 #endif
1837  "packssdw %%mm2, %%mm4 \n\t"
1838  "packssdw %%mm3, %%mm1 \n\t"
1839  "pmaddwd %%mm5, %%mm4 \n\t"
1840  "pmaddwd %%mm5, %%mm1 \n\t"
1841  "add $24, %%"REG_d" \n\t"
1842  "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
1843  "psraw $7, %%mm4 \n\t"
1844 
1845  "movq %%mm0, %%mm1 \n\t"
1846  "punpckldq %%mm4, %%mm0 \n\t"
1847  "punpckhdq %%mm4, %%mm1 \n\t"
1848  "packsswb %%mm1, %%mm0 \n\t"
1849  "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
1850  "movd %%mm0, (%2, %%"REG_a") \n\t"
1851  "punpckhdq %%mm0, %%mm0 \n\t"
1852  "movd %%mm0, (%3, %%"REG_a") \n\t"
1853  "add $4, %%"REG_a" \n\t"
1854  " js 1b \n\t"
1855  : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
1856  : "%"REG_a, "%"REG_d
1857  );
1858 
1859  udst += chromStride;
1860  vdst += chromStride;
1861  src += srcStride*2;
1862  }
1863 
1864  __asm__ volatile(EMMS" \n\t"
1865  SFENCE" \n\t"
1866  :::"memory");
1867 
1868  rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride);
1869 }
1870 #endif /* !COMPILE_TEMPLATE_SSE2 */
1871 
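/*
 * For reference: the chroma half of the asm above takes a 2x2 block of
 * BGR24 pixels (two pixels from each of two rows), averages it, multiplies
 * by the U/V coefficient tables (ff_bgr2UCoeff/ff_bgr2VCoeff), scales, and
 * adds the +128 chroma bias from ff_bgr2UVOffset. A minimal scalar sketch
 * of that step follows -- the coefficient values are illustrative JFIF-style
 * BT.601 constants, not the exact tables used by libswscale, and the byte
 * order (B, G, R) follows the C fallback rgb24toyv12_c().
 */
#define REF_SHIFT 15
#define REF_RU ((int)(-0.169 * (1 << REF_SHIFT)))
#define REF_GU ((int)(-0.331 * (1 << REF_SHIFT)))
#define REF_BU ((int)( 0.500 * (1 << REF_SHIFT)))
#define REF_RV ((int)( 0.500 * (1 << REF_SHIFT)))
#define REF_GV ((int)(-0.419 * (1 << REF_SHIFT)))
#define REF_BV ((int)(-0.081 * (1 << REF_SHIFT)))

static void chroma_2x2_ref(const uint8_t *row0, const uint8_t *row1,
                           uint8_t *udst, uint8_t *vdst, int chromWidth)
{
    int i;
    for (i = 0; i < chromWidth; i++) {
        /* sum the 2x2 block: pixels 2*i and 2*i+1 of both rows */
        int b = row0[6*i+0] + row0[6*i+3] + row1[6*i+0] + row1[6*i+3];
        int g = row0[6*i+1] + row0[6*i+4] + row1[6*i+1] + row1[6*i+4];
        int r = row0[6*i+2] + row0[6*i+5] + row1[6*i+2] + row1[6*i+5];
        /* the extra +2 in the shift is the division by 4 of the box sum */
        udst[i] = ((REF_RU*r + REF_GU*g + REF_BU*b) >> (REF_SHIFT+2)) + 128;
        vdst[i] = ((REF_RV*r + REF_GV*g + REF_BV*b) >> (REF_SHIFT+2)) + 128;
    }
}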
1872 #if !COMPILE_TEMPLATE_AMD3DNOW
1873 static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
1874  int width, int height, int src1Stride,
1875  int src2Stride, int dstStride)
1876 {
1877  int h;
1878 
1879  for (h=0; h < height; h++) {
1880  int w;
1881 
1882 #if COMPILE_TEMPLATE_SSE2
1883  __asm__(
1884  "xor %%"REG_a", %%"REG_a" \n\t"
1885  "1: \n\t"
1886  PREFETCH" 64(%1, %%"REG_a") \n\t"
1887  PREFETCH" 64(%2, %%"REG_a") \n\t"
1888  "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
1889  "movdqa (%1, %%"REG_a"), %%xmm1 \n\t" /* second load of the same data: xmm1 = copy of xmm0 */
1890  "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
1891  "punpcklbw %%xmm2, %%xmm0 \n\t"
1892  "punpckhbw %%xmm2, %%xmm1 \n\t"
1893  "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
1894  "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
1895  "add $16, %%"REG_a" \n\t"
1896  "cmp %3, %%"REG_a" \n\t"
1897  " jb 1b \n\t"
1898  ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
1899  : "memory", "%"REG_a""
1900  );
1901 #else
1902  __asm__(
1903  "xor %%"REG_a", %%"REG_a" \n\t"
1904  "1: \n\t"
1905  PREFETCH" 64(%1, %%"REG_a") \n\t"
1906  PREFETCH" 64(%2, %%"REG_a") \n\t"
1907  "movq (%1, %%"REG_a"), %%mm0 \n\t"
1908  "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
1909  "movq %%mm0, %%mm1 \n\t"
1910  "movq %%mm2, %%mm3 \n\t"
1911  "movq (%2, %%"REG_a"), %%mm4 \n\t"
1912  "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
1913  "punpcklbw %%mm4, %%mm0 \n\t"
1914  "punpckhbw %%mm4, %%mm1 \n\t"
1915  "punpcklbw %%mm5, %%mm2 \n\t"
1916  "punpckhbw %%mm5, %%mm3 \n\t"
1917  MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
1918  MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
1919  MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
1920  MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
1921  "add $16, %%"REG_a" \n\t"
1922  "cmp %3, %%"REG_a" \n\t"
1923  " jb 1b \n\t"
1924  ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
1925  : "memory", "%"REG_a
1926  );
1927 #endif
1928  for (w= (width&(~15)); w < width; w++) {
1929  dest[2*w+0] = src1[w];
1930  dest[2*w+1] = src2[w];
1931  }
1932  dest += dstStride;
1933  src1 += src1Stride;
1934  src2 += src2Stride;
1935  }
1936  __asm__(
1937  EMMS" \n\t"
1938  SFENCE" \n\t"
1939  ::: "memory"
1940  );
1941 }
1942 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
1943 
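/*
 * interleaveBytes() zips two planes into one: dest[2*w] = src1[w],
 * dest[2*w+1] = src2[w] (e.g. merging separate U and V planes into the
 * interleaved chroma plane of NV12). That is exactly what the unpack
 * instructions compute: "punpcklbw %src, %dst" interleaves the low halves
 * byte by byte, destination byte first, and punpckhbw does the same for the
 * high halves, so each 16-byte iteration emits 32 interleaved bytes -- hence
 * the counter advances by 16 while the stores index with (%0, REG_a, 2).
 * The jb comparison is unsigned, so the fast loop presumes width >= 16; the
 * scalar loop then finishes the last width & 15 bytes. MOVNTQ/movntdq are
 * non-temporal stores, which is why the function ends with SFENCE.
 */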
1944 #if !COMPILE_TEMPLATE_SSE2
1945 #if !COMPILE_TEMPLATE_AMD3DNOW
1946 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
1947  uint8_t *dst1, uint8_t *dst2,
1948  int width, int height,
1949  int srcStride1, int srcStride2,
1950  int dstStride1, int dstStride2)
1951 {
1952  x86_reg x, y;
1953  int w,h;
1954  w=width/2; h=height/2;
1955  __asm__ volatile(
1956  PREFETCH" %0 \n\t"
1957  PREFETCH" %1 \n\t"
1958  ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
1959  for (y=0;y<h;y++) {
1960  const uint8_t* s1=src1+srcStride1*(y>>1);
1961  uint8_t* d=dst1+dstStride1*y;
1962  x=0;
1963  for (;x<w-31;x+=32) {
1964  __asm__ volatile(
1965  PREFETCH" 32(%1,%2) \n\t"
1966  "movq (%1,%2), %%mm0 \n\t"
1967  "movq 8(%1,%2), %%mm2 \n\t"
1968  "movq 16(%1,%2), %%mm4 \n\t"
1969  "movq 24(%1,%2), %%mm6 \n\t"
1970  "movq %%mm0, %%mm1 \n\t"
1971  "movq %%mm2, %%mm3 \n\t"
1972  "movq %%mm4, %%mm5 \n\t"
1973  "movq %%mm6, %%mm7 \n\t"
1974  "punpcklbw %%mm0, %%mm0 \n\t"
1975  "punpckhbw %%mm1, %%mm1 \n\t"
1976  "punpcklbw %%mm2, %%mm2 \n\t"
1977  "punpckhbw %%mm3, %%mm3 \n\t"
1978  "punpcklbw %%mm4, %%mm4 \n\t"
1979  "punpckhbw %%mm5, %%mm5 \n\t"
1980  "punpcklbw %%mm6, %%mm6 \n\t"
1981  "punpckhbw %%mm7, %%mm7 \n\t"
1982  MOVNTQ" %%mm0, (%0,%2,2) \n\t"
1983  MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
1984  MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
1985  MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
1986  MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
1987  MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
1988  MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
1989  MOVNTQ" %%mm7, 56(%0,%2,2)"
1990  :: "r"(d), "r"(s1), "r"(x)
1991  :"memory");
1992  }
1993  for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
1994  }
1995  for (y=0;y<h;y++) {
1996  const uint8_t* s2=src2+srcStride2*(y>>1);
1997  uint8_t* d=dst2+dstStride2*y;
1998  x=0;
1999  for (;x<w-31;x+=32) {
2000  __asm__ volatile(
2001  PREFETCH" 32(%1,%2) \n\t"
2002  "movq (%1,%2), %%mm0 \n\t"
2003  "movq 8(%1,%2), %%mm2 \n\t"
2004  "movq 16(%1,%2), %%mm4 \n\t"
2005  "movq 24(%1,%2), %%mm6 \n\t"
2006  "movq %%mm0, %%mm1 \n\t"
2007  "movq %%mm2, %%mm3 \n\t"
2008  "movq %%mm4, %%mm5 \n\t"
2009  "movq %%mm6, %%mm7 \n\t"
2010  "punpcklbw %%mm0, %%mm0 \n\t"
2011  "punpckhbw %%mm1, %%mm1 \n\t"
2012  "punpcklbw %%mm2, %%mm2 \n\t"
2013  "punpckhbw %%mm3, %%mm3 \n\t"
2014  "punpcklbw %%mm4, %%mm4 \n\t"
2015  "punpckhbw %%mm5, %%mm5 \n\t"
2016  "punpcklbw %%mm6, %%mm6 \n\t"
2017  "punpckhbw %%mm7, %%mm7 \n\t"
2018  MOVNTQ" %%mm0, (%0,%2,2) \n\t"
2019  MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
2020  MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
2021  MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
2022  MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
2023  MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
2024  MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
2025  MOVNTQ" %%mm7, 56(%0,%2,2)"
2026  :: "r"(d), "r"(s2), "r"(x)
2027  :"memory");
2028  }
2029  for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2030  }
2031  __asm__(
2032  EMMS" \n\t"
2033  SFENCE" \n\t"
2034  ::: "memory"
2035  );
2036 }
2037 
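/*
 * vu9_to_vu12() doubles a chroma plane in both directions by replication:
 * punpcklbw/punpckhbw of a register with itself duplicates every byte
 * horizontally, and s1 = src1 + srcStride1*(y>>1) reuses each source row
 * for two output rows. A scalar sketch of one plane (illustrative only,
 * not part of this template):
 */
static void double_plane_ref(const uint8_t *src, uint8_t *dst,
                             int w, int h, int srcStride, int dstStride)
{
    int x, y;
    for (y = 0; y < h; y++) {
        const uint8_t *s = src + srcStride * (y >> 1); /* each row twice  */
        uint8_t *d = dst + dstStride * y;
        for (x = 0; x < w; x++)
            d[2*x] = d[2*x+1] = s[x];                  /* each byte twice */
    }
}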
2038 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2039  uint8_t *dst,
2040  int width, int height,
2041  int srcStride1, int srcStride2,
2042  int srcStride3, int dstStride)
2043 {
2044  x86_reg x;
2045  int y,w,h;
2046  w=width/2; h=height;
2047  for (y=0;y<h;y++) {
2048  const uint8_t* yp=src1+srcStride1*y;
2049  const uint8_t* up=src2+srcStride2*(y>>2);
2050  const uint8_t* vp=src3+srcStride3*(y>>2);
2051  uint8_t* d=dst+dstStride*y;
2052  x=0;
2053  for (;x<w-7;x+=8) {
2054  __asm__ volatile(
2055  PREFETCH" 32(%1, %0) \n\t"
2056  PREFETCH" 32(%2, %0) \n\t"
2057  PREFETCH" 32(%3, %0) \n\t"
2058  "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2059  "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
2060  "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
2061  "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2062  "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
2063  "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
2064  "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
2065  "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
2066  "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
2067  "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
2068 
2069  "movq %%mm1, %%mm6 \n\t"
2070  "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
2071  "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2072  "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2073  MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
2074  MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
2075 
2076  "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
2077  "movq 8(%1, %0, 4), %%mm0 \n\t"
2078  "movq %%mm0, %%mm3 \n\t"
2079  "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
2080  "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
2081  MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
2082  MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
2083 
2084  "movq %%mm4, %%mm6 \n\t"
2085  "movq 16(%1, %0, 4), %%mm0 \n\t"
2086  "movq %%mm0, %%mm3 \n\t"
2087  "punpcklbw %%mm5, %%mm4 \n\t"
2088  "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
2089  "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
2090  MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
2091  MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
2092 
2093  "punpckhbw %%mm5, %%mm6 \n\t"
2094  "movq 24(%1, %0, 4), %%mm0 \n\t"
2095  "movq %%mm0, %%mm3 \n\t"
2096  "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
2097  "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
2098  MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
2099  MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
2100 
2101  : "+r" (x)
2102  : "r"(yp), "r" (up), "r"(vp), "r"(d)
2103  :"memory");
2104  }
2105  for (; x<w; x++) {
2106  const int x2 = x<<2;
2107  d[8*x+0] = yp[x2];
2108  d[8*x+1] = up[x];
2109  d[8*x+2] = yp[x2+1];
2110  d[8*x+3] = vp[x];
2111  d[8*x+4] = yp[x2+2];
2112  d[8*x+5] = up[x];
2113  d[8*x+6] = yp[x2+3];
2114  d[8*x+7] = vp[x];
2115  }
2116  }
2117  __asm__(
2118  EMMS" \n\t"
2119  SFENCE" \n\t"
2120  ::: "memory"
2121  );
2122 }
2123 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2124 
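/*
 * yvu9_to_yuy2() expands planar YVU9 -- one U and one V sample per 4x4
 * block of luma -- into packed YUY2 (Y0 U0 Y1 V0). Each chroma sample is
 * therefore reused for four luma pixels horizontally and, via
 * srcStride*(y>>2), for four source rows vertically; per group of four
 * luma pixels the output bytes are
 *     Y0 U  Y1 V  Y2 U  Y3 V
 * with the same U/V pair written twice, as the scalar tail loop above
 * spells out.
 */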
2125 static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
2126 {
2127  dst += count;
2128  src += 2*count;
2129  count= - count;
2130 
2131  if(count <= -16) {
2132  count += 15;
2133  __asm__ volatile(
2134  "pcmpeqw %%mm7, %%mm7 \n\t"
2135  "psrlw $8, %%mm7 \n\t"
2136  "1: \n\t"
2137  "movq -30(%1, %0, 2), %%mm0 \n\t"
2138  "movq -22(%1, %0, 2), %%mm1 \n\t"
2139  "movq -14(%1, %0, 2), %%mm2 \n\t"
2140  "movq -6(%1, %0, 2), %%mm3 \n\t"
2141  "pand %%mm7, %%mm0 \n\t"
2142  "pand %%mm7, %%mm1 \n\t"
2143  "pand %%mm7, %%mm2 \n\t"
2144  "pand %%mm7, %%mm3 \n\t"
2145  "packuswb %%mm1, %%mm0 \n\t"
2146  "packuswb %%mm3, %%mm2 \n\t"
2147  MOVNTQ" %%mm0,-15(%2, %0) \n\t"
2148  MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
2149  "add $16, %0 \n\t"
2150  " js 1b \n\t"
2151  : "+r"(count)
2152  : "r"(src), "r"(dst)
2153  );
2154  count -= 15;
2155  }
2156  while(count<0) {
2157  dst[count]= src[2*count];
2158  count++;
2159  }
2160 }
2161 
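/*
 * extract_even() keeps every second byte: dst[i] = src[2*i], i.e. the luma
 * of YUY2 (or, when called with src + 1, the luma of UYVY). The asm builds
 * the 0x00FF00FF00FF00FF mask with pcmpeqw/psrlw, ands it over four
 * quadwords, and compresses the survivors with packuswb. The counter is
 * biased by +15 up front so a single "add/js" pair both advances and tests
 * it -- that bias is what the odd-looking -30/-22/.../-15/-7 offsets
 * compensate for, and it is undone again before the scalar tail runs.
 */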
2162 #if !COMPILE_TEMPLATE_AMD3DNOW
2163 static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2164 {
2165  dst0+= count;
2166  dst1+= count;
2167  src += 4*count;
2168  count= - count;
2169  if(count <= -8) {
2170  count += 7;
2171  __asm__ volatile(
2172  "pcmpeqw %%mm7, %%mm7 \n\t"
2173  "psrlw $8, %%mm7 \n\t"
2174  "1: \n\t"
2175  "movq -28(%1, %0, 4), %%mm0 \n\t"
2176  "movq -20(%1, %0, 4), %%mm1 \n\t"
2177  "movq -12(%1, %0, 4), %%mm2 \n\t"
2178  "movq -4(%1, %0, 4), %%mm3 \n\t"
2179  "pand %%mm7, %%mm0 \n\t"
2180  "pand %%mm7, %%mm1 \n\t"
2181  "pand %%mm7, %%mm2 \n\t"
2182  "pand %%mm7, %%mm3 \n\t"
2183  "packuswb %%mm1, %%mm0 \n\t"
2184  "packuswb %%mm3, %%mm2 \n\t"
2185  "movq %%mm0, %%mm1 \n\t"
2186  "movq %%mm2, %%mm3 \n\t"
2187  "psrlw $8, %%mm0 \n\t"
2188  "psrlw $8, %%mm2 \n\t"
2189  "pand %%mm7, %%mm1 \n\t"
2190  "pand %%mm7, %%mm3 \n\t"
2191  "packuswb %%mm2, %%mm0 \n\t"
2192  "packuswb %%mm3, %%mm1 \n\t"
2193  MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2194  MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2195  "add $8, %0 \n\t"
2196  " js 1b \n\t"
2197  : "+r"(count)
2198  : "r"(src), "r"(dst0), "r"(dst1)
2199  );
2200  count -= 7;
2201  }
2202  while(count<0) {
2203  dst0[count]= src[4*count+0];
2204  dst1[count]= src[4*count+2];
2205  count++;
2206  }
2207 }
2208 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2209 
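/*
 * extract_even2() splits the even bytes of each 4-byte group into two
 * planes -- dst0[i] = src[4*i], dst1[i] = src[4*i+2], the U and V samples
 * of UYVY. After the 0x00FF mask and the first packuswb, the two
 * destinations still alternate within each register, so a second
 * mask/shift/pack round separates them before the non-temporal stores.
 */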
2210 static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2211 {
2212  dst0 += count;
2213  dst1 += count;
2214  src0 += 4*count;
2215  src1 += 4*count;
2216  count= - count;
2217 #ifdef PAVGB
2218  if(count <= -8) {
2219  count += 7;
2220  __asm__ volatile(
2221  "pcmpeqw %%mm7, %%mm7 \n\t"
2222  "psrlw $8, %%mm7 \n\t"
2223  "1: \n\t"
2224  "movq -28(%1, %0, 4), %%mm0 \n\t"
2225  "movq -20(%1, %0, 4), %%mm1 \n\t"
2226  "movq -12(%1, %0, 4), %%mm2 \n\t"
2227  "movq -4(%1, %0, 4), %%mm3 \n\t"
2228  PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2229  PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2230  PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2231  PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2232  "pand %%mm7, %%mm0 \n\t"
2233  "pand %%mm7, %%mm1 \n\t"
2234  "pand %%mm7, %%mm2 \n\t"
2235  "pand %%mm7, %%mm3 \n\t"
2236  "packuswb %%mm1, %%mm0 \n\t"
2237  "packuswb %%mm3, %%mm2 \n\t"
2238  "movq %%mm0, %%mm1 \n\t"
2239  "movq %%mm2, %%mm3 \n\t"
2240  "psrlw $8, %%mm0 \n\t"
2241  "psrlw $8, %%mm2 \n\t"
2242  "pand %%mm7, %%mm1 \n\t"
2243  "pand %%mm7, %%mm3 \n\t"
2244  "packuswb %%mm2, %%mm0 \n\t"
2245  "packuswb %%mm3, %%mm1 \n\t"
2246  MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2247  MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2248  "add $8, %0 \n\t"
2249  " js 1b \n\t"
2250  : "+r"(count)
2251  : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
2252  );
2253  count -= 7;
2254  }
2255 #endif
2256  while(count<0) {
2257  dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2258  dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2259  count++;
2260  }
2261 }
2262 
2263 #if !COMPILE_TEMPLATE_AMD3DNOW
2264 static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2265 {
2266  dst0+= count;
2267  dst1+= count;
2268  src += 4*count;
2269  count= - count;
2270  if(count <= -8) {
2271  count += 7;
2272  __asm__ volatile(
2273  "pcmpeqw %%mm7, %%mm7 \n\t"
2274  "psrlw $8, %%mm7 \n\t"
2275  "1: \n\t"
2276  "movq -28(%1, %0, 4), %%mm0 \n\t"
2277  "movq -20(%1, %0, 4), %%mm1 \n\t"
2278  "movq -12(%1, %0, 4), %%mm2 \n\t"
2279  "movq -4(%1, %0, 4), %%mm3 \n\t"
2280  "psrlw $8, %%mm0 \n\t"
2281  "psrlw $8, %%mm1 \n\t"
2282  "psrlw $8, %%mm2 \n\t"
2283  "psrlw $8, %%mm3 \n\t"
2284  "packuswb %%mm1, %%mm0 \n\t"
2285  "packuswb %%mm3, %%mm2 \n\t"
2286  "movq %%mm0, %%mm1 \n\t"
2287  "movq %%mm2, %%mm3 \n\t"
2288  "psrlw $8, %%mm0 \n\t"
2289  "psrlw $8, %%mm2 \n\t"
2290  "pand %%mm7, %%mm1 \n\t"
2291  "pand %%mm7, %%mm3 \n\t"
2292  "packuswb %%mm2, %%mm0 \n\t"
2293  "packuswb %%mm3, %%mm1 \n\t"
2294  MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2295  MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2296  "add $8, %0 \n\t"
2297  " js 1b \n\t"
2298  : "+r"(count)
2299  : "r"(src), "r"(dst0), "r"(dst1)
2300  );
2301  count -= 7;
2302  }
2303  src++;
2304  while(count<0) {
2305  dst0[count]= src[4*count+0];
2306  dst1[count]= src[4*count+2];
2307  count++;
2308  }
2309 }
2310 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2311 
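/*
 * extract_odd2() is the YUYV counterpart of extract_even2(): psrlw $8 keeps
 * the odd byte of every 16-bit lane (the chroma of YUYV). The src++ after
 * the asm shifts the scalar fallback onto the same odd offsets, so the tail
 * loop can reuse the even-indexed expressions src[4*count+0]/src[4*count+2].
 */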
2312 static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2313 {
2314  dst0 += count;
2315  dst1 += count;
2316  src0 += 4*count;
2317  src1 += 4*count;
2318  count= - count;
2319 #ifdef PAVGB
2320  if(count <= -8) {
2321  count += 7;
2322  __asm__ volatile(
2323  "pcmpeqw %%mm7, %%mm7 \n\t"
2324  "psrlw $8, %%mm7 \n\t"
2325  "1: \n\t"
2326  "movq -28(%1, %0, 4), %%mm0 \n\t"
2327  "movq -20(%1, %0, 4), %%mm1 \n\t"
2328  "movq -12(%1, %0, 4), %%mm2 \n\t"
2329  "movq -4(%1, %0, 4), %%mm3 \n\t"
2330  PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2331  PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2332  PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2333  PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2334  "psrlw $8, %%mm0 \n\t"
2335  "psrlw $8, %%mm1 \n\t"
2336  "psrlw $8, %%mm2 \n\t"
2337  "psrlw $8, %%mm3 \n\t"
2338  "packuswb %%mm1, %%mm0 \n\t"
2339  "packuswb %%mm3, %%mm2 \n\t"
2340  "movq %%mm0, %%mm1 \n\t"
2341  "movq %%mm2, %%mm3 \n\t"
2342  "psrlw $8, %%mm0 \n\t"
2343  "psrlw $8, %%mm2 \n\t"
2344  "pand %%mm7, %%mm1 \n\t"
2345  "pand %%mm7, %%mm3 \n\t"
2346  "packuswb %%mm2, %%mm0 \n\t"
2347  "packuswb %%mm3, %%mm1 \n\t"
2348  MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2349  MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2350  "add $8, %0 \n\t"
2351  " js 1b \n\t"
2352  : "+r"(count)
2353  : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
2354  );
2355  count -= 7;
2356  }
2357 #endif
2358  src0++;
2359  src1++;
2360  while(count<0) {
2361  dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2362  dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2363  count++;
2364  }
2365 }
2366 
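/*
 * extract_even2avg() and extract_odd2avg() fold two source rows into one
 * chroma row for 4:2:0 output. Note a subtle rounding difference: PAVGB
 * (pavgb/pavgusb) computes (a+b+1)>>1, rounding halves up, while the scalar
 * tails compute (a+b)>>1 and truncate, so the SIMD and fallback paths can
 * differ by one LSB.
 */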
2367 static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2368  int width, int height,
2369  int lumStride, int chromStride, int srcStride)
2370 {
2371  int y;
2372  const int chromWidth= -((-width)>>1);
2373 
2374  for (y=0; y<height; y++) {
2375  RENAME(extract_even)(src, ydst, width);
2376  if(y&1) {
2377  RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
2378  udst+= chromStride;
2379  vdst+= chromStride;
2380  }
2381 
2382  src += srcStride;
2383  ydst+= lumStride;
2384  }
2385  __asm__(
2386  EMMS" \n\t"
2387  SFENCE" \n\t"
2388  ::: "memory"
2389  );
2390 }
2391 
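/*
 * In these packed-to-planar converters, chromWidth = -((-width)>>1) is
 * ceil(width/2) written with an arithmetic right shift: (-width)>>1 rounds
 * toward negative infinity, so the double negation rounds the halving up.
 * For width = 7, (-7)>>1 == -4 and chromWidth == 4, so an odd trailing luma
 * column still gets a chroma sample. (Right-shifting a negative int is
 * implementation-defined in C; this relies on the arithmetic shift that the
 * supported compilers provide.)
 */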
2392 #if !COMPILE_TEMPLATE_AMD3DNOW
2393 static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2394  int width, int height,
2395  int lumStride, int chromStride, int srcStride)
2396 {
2397  int y;
2398  const int chromWidth= -((-width)>>1);
2399 
2400  for (y=0; y<height; y++) {
2401  RENAME(extract_even)(src, ydst, width);
2402  RENAME(extract_odd2)(src, udst, vdst, chromWidth);
2403 
2404  src += srcStride;
2405  ydst+= lumStride;
2406  udst+= chromStride;
2407  vdst+= chromStride;
2408  }
2409  __asm__(
2410  EMMS" \n\t"
2411  SFENCE" \n\t"
2412  ::: "memory"
2413  );
2414 }
2415 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2416 
2417 static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2418  int width, int height,
2419  int lumStride, int chromStride, int srcStride)
2420 {
2421  int y;
2422  const int chromWidth= -((-width)>>1);
2423 
2424  for (y=0; y<height; y++) {
2425  RENAME(extract_even)(src+1, ydst, width);
2426  if(y&1) {
2427  RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
2428  udst+= chromStride;
2429  vdst+= chromStride;
2430  }
2431 
2432  src += srcStride;
2433  ydst+= lumStride;
2434  }
2435  __asm__(
2436  EMMS" \n\t"
2437  SFENCE" \n\t"
2438  ::: "memory"
2439  );
2440 }
2441 
2442 #if !COMPILE_TEMPLATE_AMD3DNOW
2443 static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2444  int width, int height,
2445  int lumStride, int chromStride, int srcStride)
2446 {
2447  int y;
2448  const int chromWidth= -((-width)>>1);
2449 
2450  for (y=0; y<height; y++) {
2451  RENAME(extract_even)(src+1, ydst, width);
2452  RENAME(extract_even2)(src, udst, vdst, chromWidth);
2453 
2454  src += srcStride;
2455  ydst+= lumStride;
2456  udst+= chromStride;
2457  vdst+= chromStride;
2458  }
2459  __asm__(
2460  EMMS" \n\t"
2461  SFENCE" \n\t"
2462  ::: "memory"
2463  );
2464 }
2465 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2466 #endif /* !COMPILE_TEMPLATE_SSE2 */
2467 
2468 static inline void RENAME(rgb2rgb_init)(void)
2469 {
2470 #if !COMPILE_TEMPLATE_SSE2
2471 #if !COMPILE_TEMPLATE_AMD3DNOW
2499 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2500 
2501 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
2503 #endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */
2505 
2508 #endif /* !COMPILE_TEMPLATE_SSE2 */
2509 
2510 #if !COMPILE_TEMPLATE_AMD3DNOW
2512 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2513 }
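/*
 * RENAME(rgb2rgb_init) installs the RENAME()d implementations from this
 * template into the function pointers exported by rgb2rgb.c, under the same
 * compile-time guards used above. A plausible sketch of a few registrations,
 * assuming pointer names that mirror the static functions defined here:
 *
 *     interleaveBytes = RENAME(interleaveBytes);
 *     yuyvtoyuv422    = RENAME(yuyvtoyuv422);
 *     uyvytoyuv422    = RENAME(uyvytoyuv422);
 */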