    { 0x8000000080000000ULL, 0x8000000080000000ULL };
#define JUMPALIGN()     __asm__ volatile (".p2align 3"::)
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
#define MOVQ_BFE(regd)                                  \
    __asm__ volatile (                                  \
        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
        "paddb   %%"#regd", %%"#regd"   \n\t" ::)
#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
#define MOVQ_BONE(regd)                                 \
    __asm__ volatile (                                  \
        "pcmpeqd  %%"#regd", %%"#regd"  \n\t"           \
        "psrlw    $15,       %%"#regd"  \n\t"           \
        "packuswb %%"#regd", %%"#regd"  \n\t" ::)

#define MOVQ_WTWO(regd)                                 \
    __asm__ volatile (                                  \
        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
        "psrlw   $15,       %%"#regd"   \n\t"           \
        "psllw   $1,        %%"#regd"   \n\t" ::)
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe)       \
    "movq  "#rega", "#regr"             \n\t"           \
    "pand  "#regb", "#regr"             \n\t"           \
    "pxor  "#rega", "#regb"             \n\t"           \
    "pand  "#regfe", "#regb"            \n\t"           \
    "psrlq $1, "#regb"                  \n\t"           \
    "paddb "#regb", "#regr"             \n\t"
#define PAVGB_MMX(rega, regb, regr, regfe)              \
    "movq  "#rega", "#regr"             \n\t"           \
    "por   "#regb", "#regr"             \n\t"           \
    "pxor  "#rega", "#regb"             \n\t"           \
    "pand  "#regfe", "#regb"            \n\t"           \
    "psrlq $1, "#regb"                  \n\t"           \
    "psubb "#regb", "#regr"             \n\t"
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
    "movq  "#rega", "#regr"             \n\t"           \
    "movq  "#regc", "#regp"             \n\t"           \
    "pand  "#regb", "#regr"             \n\t"           \
    "pand  "#regd", "#regp"             \n\t"           \
    "pxor  "#rega", "#regb"             \n\t"           \
    "pxor  "#regc", "#regd"             \n\t"           \
    "pand  %%mm6,   "#regb"             \n\t"           \
    "pand  %%mm6,   "#regd"             \n\t"           \
    "psrlq $1, "#regb"                  \n\t"           \
    "psrlq $1, "#regd"                  \n\t"           \
    "paddb "#regb", "#regr"             \n\t"           \
    "paddb "#regd", "#regp"             \n\t"
#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp)  \
    "movq  "#rega", "#regr"             \n\t"           \
    "movq  "#regc", "#regp"             \n\t"           \
    "por   "#regb", "#regr"             \n\t"           \
    "por   "#regd", "#regp"             \n\t"           \
    "pxor  "#rega", "#regb"             \n\t"           \
    "pxor  "#regc", "#regd"             \n\t"           \
    "pand  %%mm6,   "#regb"             \n\t"           \
    "pand  %%mm6,   "#regd"             \n\t"           \
    "psrlq $1, "#regd"                  \n\t"           \
    "psrlq $1, "#regb"                  \n\t"           \
    "psubb "#regb", "#regr"             \n\t"           \
    "psubb "#regd", "#regp"             \n\t"
#define DEF(x, y)                x ## _no_rnd_ ## y ## _mmx
#define SET_RND                  MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)        PAVGB_MMX_NO_RND(a, b, c, e)
#define OP_AVG(a, b, c, e)       PAVGB_MMX(a, b, c, e)
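/* DEF/SET_RND/PAVGBP/PAVGB parameterize the shared pixel-op template
 * that is #included with these definitions in force: this block
 * generates the _no_rnd_ variants (word rounder 1), the block below
 * the rounding variants (word rounder 2). */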
#define DEF(x, y)                x ## _ ## y ## _mmx
#define SET_RND                  MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)        PAVGB_MMX(a, b, c, e)
#define DEF(x) x ## _3dnow
#define PAVGB "pavgusb"
#define SKIP_FOR_3DNOW

#undef SKIP_FOR_3DNOW
#define DEF(x) x ## _mmxext

#define PAVGB "pavgb"
#define put_no_rnd_pixels16_mmx    put_pixels16_mmx
#define put_no_rnd_pixels8_mmx     put_pixels8_mmx
#define put_pixels16_mmxext        put_pixels16_mmx
#define put_pixels8_mmxext         put_pixels8_mmx
#define put_pixels4_mmxext         put_pixels4_mmx
#define put_no_rnd_pixels16_mmxext put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_mmxext  put_no_rnd_pixels8_mmx
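/* These aliases are exact: a plain copy involves no averaging, so the
 * rounding and no-rounding puts are identical, and the MMX versions can
 * be reused unchanged as the MMXEXT entry points. */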
246 "movq (%3), %%mm0 \n\t"
247 "movq 8(%3), %%mm1 \n\t"
248 "movq 16(%3), %%mm2 \n\t"
249 "movq 24(%3), %%mm3 \n\t"
250 "movq 32(%3), %%mm4 \n\t"
251 "movq 40(%3), %%mm5 \n\t"
252 "movq 48(%3), %%mm6 \n\t"
253 "movq 56(%3), %%mm7 \n\t"
254 "packuswb %%mm1, %%mm0 \n\t"
255 "packuswb %%mm3, %%mm2 \n\t"
256 "packuswb %%mm5, %%mm4 \n\t"
257 "packuswb %%mm7, %%mm6 \n\t"
258 "movq %%mm0, (%0) \n\t"
259 "movq %%mm2, (%0, %1) \n\t"
260 "movq %%mm4, (%0, %1, 2) \n\t"
261 "movq %%mm6, (%0, %2) \n\t"
262 ::
"r"(pix),
"r"((
x86_reg)line_size),
"r"((
x86_reg)line_size * 3),
265 pix += line_size * 4;
272 "movq (%3), %%mm0 \n\t"
273 "movq 8(%3), %%mm1 \n\t"
274 "movq 16(%3), %%mm2 \n\t"
275 "movq 24(%3), %%mm3 \n\t"
276 "movq 32(%3), %%mm4 \n\t"
277 "movq 40(%3), %%mm5 \n\t"
278 "movq 48(%3), %%mm6 \n\t"
279 "movq 56(%3), %%mm7 \n\t"
280 "packuswb %%mm1, %%mm0 \n\t"
281 "packuswb %%mm3, %%mm2 \n\t"
282 "packuswb %%mm5, %%mm4 \n\t"
283 "packuswb %%mm7, %%mm6 \n\t"
284 "movq %%mm0, (%0) \n\t"
285 "movq %%mm2, (%0, %1) \n\t"
286 "movq %%mm4, (%0, %1, 2) \n\t"
287 "movq %%mm6, (%0, %2) \n\t"
288 ::
"r"(pix),
"r"((
x86_reg)line_size),
"r"((
x86_reg)line_size * 3),
"r"(p)
#define put_signed_pixels_clamped_mmx_half(off)             \
    "movq          "#off"(%2), %%mm1    \n\t"               \
    "movq     16 + "#off"(%2), %%mm2    \n\t"               \
    "movq     32 + "#off"(%2), %%mm3    \n\t"               \
    "movq     48 + "#off"(%2), %%mm4    \n\t"               \
    "packsswb  8 + "#off"(%2), %%mm1    \n\t"               \
    "packsswb 24 + "#off"(%2), %%mm2    \n\t"               \
    "packsswb 40 + "#off"(%2), %%mm3    \n\t"               \
    "packsswb 56 + "#off"(%2), %%mm4    \n\t"               \
    "paddb %%mm0, %%mm1                 \n\t"               \
    "paddb %%mm0, %%mm2                 \n\t"               \
    "paddb %%mm0, %%mm3                 \n\t"               \
    "paddb %%mm0, %%mm4                 \n\t"               \
    "movq  %%mm1, (%0)                  \n\t"               \
    "movq  %%mm2, (%0, %3)              \n\t"               \
    "movq  %%mm3, (%0, %3, 2)           \n\t"               \
    "movq  %%mm4, (%0, %1)              \n\t"
318 "lea (%3, %3, 2), %1 \n\t"
319 put_signed_pixels_clamped_mmx_half(0)
320 "lea (%0, %3, 4), %0 \n\t"
321 put_signed_pixels_clamped_mmx_half(64)
322 :
"+&r"(pixels),
"=&r"(line_skip3)
323 :
"r"(block),
"r"(line_skip)
341 "movq (%2), %%mm0 \n\t"
342 "movq 8(%2), %%mm1 \n\t"
343 "movq 16(%2), %%mm2 \n\t"
344 "movq 24(%2), %%mm3 \n\t"
345 "movq %0, %%mm4 \n\t"
346 "movq %1, %%mm6 \n\t"
347 "movq %%mm4, %%mm5 \n\t"
348 "punpcklbw %%mm7, %%mm4 \n\t"
349 "punpckhbw %%mm7, %%mm5 \n\t"
350 "paddsw %%mm4, %%mm0 \n\t"
351 "paddsw %%mm5, %%mm1 \n\t"
352 "movq %%mm6, %%mm5 \n\t"
353 "punpcklbw %%mm7, %%mm6 \n\t"
354 "punpckhbw %%mm7, %%mm5 \n\t"
355 "paddsw %%mm6, %%mm2 \n\t"
356 "paddsw %%mm5, %%mm3 \n\t"
357 "packuswb %%mm1, %%mm0 \n\t"
358 "packuswb %%mm3, %%mm2 \n\t"
359 "movq %%mm0, %0 \n\t"
360 "movq %%mm2, %1 \n\t"
361 :
"+m"(*pix),
"+m"(*(pix + line_size))
364 pix += line_size * 2;
static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
                            int line_size, int h)
        "lea  (%3, %3), %%"REG_a"       \n\t"
        "movq (%1    ), %%mm0           \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "movq    %%mm0, (%2)            \n\t"
        "movq    %%mm1, (%2, %3)        \n\t"
        "add %%"REG_a", %1              \n\t"
        "add %%"REG_a", %2              \n\t"
        "movq (%1    ), %%mm0           \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "movq    %%mm0, (%2)            \n\t"
        "movq    %%mm1, (%2, %3)        \n\t"
        "add %%"REG_a", %1              \n\t"
        "add %%"REG_a", %2              \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
                             int line_size, int h)
        "lea   (%3, %3), %%"REG_a"      \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq 8(%1    ), %%mm4          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq 8(%1, %3), %%mm5          \n\t"
        "movq     %%mm0, (%2)           \n\t"
        "movq     %%mm4, 8(%2)          \n\t"
        "movq     %%mm1, (%2, %3)       \n\t"
        "movq     %%mm5, 8(%2, %3)      \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq 8(%1    ), %%mm4          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq 8(%1, %3), %%mm5          \n\t"
        "movq     %%mm0, (%2)           \n\t"
        "movq     %%mm4, 8(%2)          \n\t"
        "movq     %%mm1, (%2, %3)       \n\t"
        "movq     %%mm5, 8(%2, %3)      \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
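/* Both copy loops move four rows per iteration (two per unrolled half),
 * so h must be a multiple of 4, which holds for the 8- and 16-pixel
 * block heights they are used with. */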
#define CLEAR_BLOCKS(name, n)                           \
static void name(DCTELEM *blocks)                       \
        "pxor %%mm7, %%mm7              \n\t"           \
        "mov     %1, %%"REG_a"          \n\t"           \
        "movq %%mm7,   (%0, %%"REG_a")  \n\t"           \
        "movq %%mm7,  8(%0, %%"REG_a")  \n\t"           \
        "movq %%mm7, 16(%0, %%"REG_a")  \n\t"           \
        "movq %%mm7, 24(%0, %%"REG_a")  \n\t"           \
        "add    $32, %%"REG_a"          \n\t"           \
        :: "r"(((uint8_t *)blocks) + 128 * n),          \
CLEAR_BLOCKS(clear_blocks_mmx, 6)
CLEAR_BLOCKS(clear_block_mmx, 1)
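/* One DCT block is 64 DCTELEMs = 128 bytes, so clear_blocks_mmx wipes
 * the usual six-block macroblock buffer (4 luma + 2 chroma) and
 * clear_block_mmx a single block. */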
static void clear_block_sse(DCTELEM *block)
        "xorps  %%xmm0, %%xmm0          \n"
        "movaps %%xmm0,    (%0)         \n"
        "movaps %%xmm0,  16(%0)         \n"
        "movaps %%xmm0,  32(%0)         \n"
        "movaps %%xmm0,  48(%0)         \n"
        "movaps %%xmm0,  64(%0)         \n"
        "movaps %%xmm0,  80(%0)         \n"
        "movaps %%xmm0,  96(%0)         \n"
        "movaps %%xmm0, 112(%0)         \n"
static void clear_blocks_sse(DCTELEM *blocks)
        "xorps  %%xmm0, %%xmm0              \n"
        "mov        %1, %%"REG_a"           \n"
        "movaps %%xmm0,    (%0, %%"REG_a")  \n"
        "movaps %%xmm0,  16(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  32(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  48(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  64(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  80(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  96(%0, %%"REG_a")  \n"
        "movaps %%xmm0, 112(%0, %%"REG_a")  \n"
        "add      $128, %%"REG_a"           \n"
        :: "r"(((uint8_t *)blocks) + 128 * 6),
497 "movq (%1, %0), %%mm0 \n\t"
498 "movq (%2, %0), %%mm1 \n\t"
499 "paddb %%mm0, %%mm1 \n\t"
500 "movq %%mm1, (%2, %0) \n\t"
501 "movq 8(%1, %0), %%mm0 \n\t"
502 "movq 8(%2, %0), %%mm1 \n\t"
503 "paddb %%mm0, %%mm1 \n\t"
504 "movq %%mm1, 8(%2, %0) \n\t"
510 :
"r"(src),
"r"(dst),
"r"((
x86_reg)w - 15)
        dst[i + 0] += src[i + 0];
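    /* Scalar tail: the MMX loop handles 16 bytes per iteration, the
     * remaining (w & 15) bytes are added one at a time. */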
static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
                                            const uint8_t *diff, int w,
                                            int *left, int *left_top)
    int l  = *left     & 0xff;
    int tl = *left_top & 0xff;
529 "movzbl (%3, %4), %2 \n"
542 "add (%6, %4), %b0 \n"
543 "mov %b0, (%5, %4) \n"
        : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
        : "r"(dst + w), "r"(diff + w), "rm"(top + w)
556 "movd (%1), %%mm0 \n\t"
558 "movd (%1), %%mm1 \n\t"
559 "movd (%1,%3,1), %%mm2 \n\t"
560 "movd (%1,%3,2), %%mm3 \n\t"
561 "punpcklbw %%mm1, %%mm0 \n\t"
562 "punpcklbw %%mm3, %%mm2 \n\t"
563 "movq %%mm0, %%mm1 \n\t"
564 "punpcklwd %%mm2, %%mm0 \n\t"
565 "punpckhwd %%mm2, %%mm1 \n\t"
566 "movd %%mm0, (%0) \n\t"
568 "punpckhdq %%mm0, %%mm0 \n\t"
569 "movd %%mm0, (%0) \n\t"
570 "movd %%mm1, (%0,%2,1) \n\t"
571 "punpckhdq %%mm1, %%mm1 \n\t"
572 "movd %%mm1, (%0,%2,2) \n\t"
#define H263_LOOP_FILTER                        \
    "pxor      %%mm7, %%mm7             \n\t"   \
    "movq         %0, %%mm0             \n\t"   \
    "movq         %0, %%mm1             \n\t"   \
    "movq         %3, %%mm2             \n\t"   \
    "movq         %3, %%mm3             \n\t"   \
    "punpcklbw %%mm7, %%mm0             \n\t"   \
    "punpckhbw %%mm7, %%mm1             \n\t"   \
    "punpcklbw %%mm7, %%mm2             \n\t"   \
    "punpckhbw %%mm7, %%mm3             \n\t"   \
    "psubw     %%mm2, %%mm0             \n\t"   \
    "psubw     %%mm3, %%mm1             \n\t"   \
    "movq         %1, %%mm2             \n\t"   \
    "movq         %1, %%mm3             \n\t"   \
    "movq         %2, %%mm4             \n\t"   \
    "movq         %2, %%mm5             \n\t"   \
    "punpcklbw %%mm7, %%mm2             \n\t"   \
    "punpckhbw %%mm7, %%mm3             \n\t"   \
    "punpcklbw %%mm7, %%mm4             \n\t"   \
    "punpckhbw %%mm7, %%mm5             \n\t"   \
    "psubw     %%mm2, %%mm4             \n\t"   \
    "psubw     %%mm3, %%mm5             \n\t"   \
    "psllw        $2, %%mm4             \n\t"   \
    "psllw        $2, %%mm5             \n\t"   \
    "paddw     %%mm0, %%mm4             \n\t"   \
    "paddw     %%mm1, %%mm5             \n\t"   \
    "pxor      %%mm6, %%mm6             \n\t"   \
    "pcmpgtw   %%mm4, %%mm6             \n\t"   \
    "pcmpgtw   %%mm5, %%mm7             \n\t"   \
    "pxor      %%mm6, %%mm4             \n\t"   \
    "pxor      %%mm7, %%mm5             \n\t"   \
    "psubw     %%mm6, %%mm4             \n\t"   \
    "psubw     %%mm7, %%mm5             \n\t"   \
    "psrlw        $3, %%mm4             \n\t"   \
    "psrlw        $3, %%mm5             \n\t"   \
    "packuswb  %%mm5, %%mm4             \n\t"   \
    "packsswb  %%mm7, %%mm6             \n\t"   \
    "pxor      %%mm7, %%mm7             \n\t"   \
    "movd         %4, %%mm2             \n\t"   \
    "punpcklbw %%mm2, %%mm2             \n\t"   \
    "punpcklbw %%mm2, %%mm2             \n\t"   \
    "punpcklbw %%mm2, %%mm2             \n\t"   \
    "psubusb   %%mm4, %%mm2             \n\t"   \
    "movq      %%mm2, %%mm3             \n\t"   \
    "psubusb   %%mm4, %%mm3             \n\t"   \
    "psubb     %%mm3, %%mm2             \n\t"   \
    "movq         %1, %%mm3             \n\t"   \
    "movq         %2, %%mm4             \n\t"   \
    "pxor      %%mm6, %%mm3             \n\t"   \
    "pxor      %%mm6, %%mm4             \n\t"   \
    "paddusb   %%mm2, %%mm3             \n\t"   \
    "psubusb   %%mm2, %%mm4             \n\t"   \
    "pxor      %%mm6, %%mm3             \n\t"   \
    "pxor      %%mm6, %%mm4             \n\t"   \
    "paddusb   %%mm2, %%mm2             \n\t"   \
    "packsswb  %%mm1, %%mm0             \n\t"   \
    "pcmpgtb   %%mm0, %%mm7             \n\t"   \
    "pxor      %%mm7, %%mm0             \n\t"   \
    "psubb     %%mm7, %%mm0             \n\t"   \
    "movq      %%mm0, %%mm1             \n\t"   \
    "psubusb   %%mm2, %%mm0             \n\t"   \
    "psubb     %%mm0, %%mm1             \n\t"   \
    "pand         %5, %%mm1             \n\t"   \
    "psrlw        $2, %%mm1             \n\t"   \
    "pxor      %%mm7, %%mm1             \n\t"   \
    "psubb     %%mm7, %%mm1             \n\t"   \
    "movq         %0, %%mm5             \n\t"   \
    "movq         %3, %%mm6             \n\t"   \
    "psubb     %%mm1, %%mm5             \n\t"   \
    "paddb     %%mm1, %%mm6             \n\t"
static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale)
        "movq %%mm3, %1                 \n\t"
        "movq %%mm4, %2                 \n\t"
        "movq %%mm5, %0                 \n\t"
        "movq %%mm6, %3                 \n\t"
        : "+m"(*(uint64_t*)(src - 2 * stride)),
          "+m"(*(uint64_t*)(src - 1 * stride)),
          "+m"(*(uint64_t*)(src + 0 * stride)),
          "+m"(*(uint64_t*)(src + 1 * stride))
static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale)
    transpose4x4(btemp, src, 8, stride);
    transpose4x4(btemp + 4, src + 4 * stride, 8, stride);
696 "movq %%mm5, %%mm1 \n\t"
697 "movq %%mm4, %%mm0 \n\t"
698 "punpcklbw %%mm3, %%mm5 \n\t"
699 "punpcklbw %%mm6, %%mm4 \n\t"
700 "punpckhbw %%mm3, %%mm1 \n\t"
701 "punpckhbw %%mm6, %%mm0 \n\t"
702 "movq %%mm5, %%mm3 \n\t"
703 "movq %%mm1, %%mm6 \n\t"
704 "punpcklwd %%mm4, %%mm5 \n\t"
705 "punpcklwd %%mm0, %%mm1 \n\t"
706 "punpckhwd %%mm4, %%mm3 \n\t"
707 "punpckhwd %%mm0, %%mm6 \n\t"
708 "movd %%mm5, (%0) \n\t"
709 "punpckhdq %%mm5, %%mm5 \n\t"
710 "movd %%mm5, (%0, %2) \n\t"
711 "movd %%mm3, (%0, %2, 2) \n\t"
712 "punpckhdq %%mm3, %%mm3 \n\t"
713 "movd %%mm3, (%0, %3) \n\t"
714 "movd %%mm1, (%1) \n\t"
715 "punpckhdq %%mm1, %%mm1 \n\t"
716 "movd %%mm1, (%1, %2) \n\t"
717 "movd %%mm6, (%1, %2, 2) \n\t"
718 "punpckhdq %%mm6, %%mm6 \n\t"
719 "movd %%mm6, (%1, %3) \n\t"
721 "r"(src + 4 * stride),
                           int w, int h, int sides)
    last_line = buf + (height - 1) * wrap;
742 "movd (%0), %%mm0 \n\t"
743 "punpcklbw %%mm0, %%mm0 \n\t"
744 "punpcklwd %%mm0, %%mm0 \n\t"
745 "punpckldq %%mm0, %%mm0 \n\t"
746 "movq %%mm0, -8(%0) \n\t"
747 "movq -8(%0, %2), %%mm1 \n\t"
748 "punpckhbw %%mm1, %%mm1 \n\t"
749 "punpckhwd %%mm1, %%mm1 \n\t"
750 "punpckhdq %%mm1, %%mm1 \n\t"
751 "movq %%mm1, (%0, %2) \n\t"
761 "movd (%0), %%mm0 \n\t"
762 "punpcklbw %%mm0, %%mm0 \n\t"
763 "punpcklwd %%mm0, %%mm0 \n\t"
764 "punpckldq %%mm0, %%mm0 \n\t"
765 "movq %%mm0, -8(%0) \n\t"
766 "movq %%mm0, -16(%0) \n\t"
767 "movq -8(%0, %2), %%mm1 \n\t"
768 "punpckhbw %%mm1, %%mm1 \n\t"
769 "punpckhwd %%mm1, %%mm1 \n\t"
770 "punpckhdq %%mm1, %%mm1 \n\t"
771 "movq %%mm1, (%0, %2) \n\t"
772 "movq %%mm1, 8(%0, %2) \n\t"
    for (i = 0; i < h; i += 4) {
        ptr = buf - (i + 1) * wrap - w;
            "movq (%1, %0), %%mm0       \n\t"
            "movq    %%mm0, (%0)        \n\t"
            "movq    %%mm0, (%0, %2)    \n\t"
            "movq    %%mm0, (%0, %2, 2) \n\t"
            "movq    %%mm0, (%0, %3)    \n\t"
              "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
    for (i = 0; i < h; i += 4) {
        ptr = last_line + (i + 1) * wrap - w;
            "movq (%1, %0), %%mm0       \n\t"
            "movq    %%mm0, (%0)        \n\t"
            "movq    %%mm0, (%0, %2)    \n\t"
            "movq    %%mm0, (%0, %2, 2) \n\t"
            "movq    %%mm0, (%0, %3)    \n\t"
              "r"(ptr + width + 2 * w)
#define QPEL_V_LOW(m3, m4, m5, m6, pw_20, pw_3, rnd,    \
                   in0, in1, in2, in7, out, OP)         \
    "paddw  "#m4", "#m3"                \n\t"           \
    "movq   "MANGLE(ff_pw_20)", %%mm4   \n\t"           \
    "pmullw "#m3", %%mm4                \n\t"           \
    "movq   "#in7", "#m3"               \n\t"           \
    "movq   "#in0", %%mm5               \n\t"           \
    "paddw  "#m3", %%mm5                \n\t"           \
    "psubw  %%mm5, %%mm4                \n\t"           \
    "movq   "#in1", %%mm5               \n\t"           \
    "movq   "#in2", %%mm6               \n\t"           \
    "paddw  "#m6", %%mm5                \n\t"           \
    "paddw  "#m5", %%mm6                \n\t"           \
    "paddw  %%mm6, %%mm6                \n\t"           \
    "psubw  %%mm6, %%mm5                \n\t"           \
    "pmullw "MANGLE(ff_pw_3)", %%mm5    \n\t"           \
    "paddw  "#rnd", %%mm4               \n\t"           \
    "paddw  %%mm4, %%mm5                \n\t"           \
    "psraw  $5, %%mm5                   \n\t"           \
    "packuswb %%mm5, %%mm5              \n\t"           \
    OP(%%mm5, out, %%mm7, d)
#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMXEXT)                      \
static void OPNAME ## mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst,       \
        "pxor      %%mm7, %%mm7         \n\t"                           \
        "movq       (%0), %%mm0         \n\t"                           \
        "movq      %%mm0, %%mm1         \n\t"                           \
        "movq      %%mm0, %%mm2         \n\t"                           \
        "punpcklbw %%mm7, %%mm0         \n\t"                           \
        "punpckhbw %%mm7, %%mm1         \n\t"                           \
        "pshufw    $0x90, %%mm0, %%mm5  \n\t"                           \
        "pshufw    $0x41, %%mm0, %%mm6  \n\t"                           \
        "movq      %%mm2, %%mm3         \n\t"                           \
        "movq      %%mm2, %%mm4         \n\t"                           \
        "psllq        $8, %%mm2         \n\t"                           \
        "psllq       $16, %%mm3         \n\t"                           \
        "psllq       $24, %%mm4         \n\t"                           \
        "punpckhbw %%mm7, %%mm2         \n\t"                           \
        "punpckhbw %%mm7, %%mm3         \n\t"                           \
        "punpckhbw %%mm7, %%mm4         \n\t"                           \
        "paddw     %%mm3, %%mm5         \n\t"                           \
        "paddw     %%mm2, %%mm6         \n\t"                           \
        "paddw     %%mm5, %%mm5         \n\t"                           \
        "psubw     %%mm5, %%mm6         \n\t"                           \
        "pshufw    $0x06, %%mm0, %%mm5  \n\t"                           \
        "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t"                          \
        "paddw     %%mm4, %%mm0         \n\t"                           \
        "paddw     %%mm1, %%mm5         \n\t"                           \
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t"                         \
        "psubw     %%mm5, %%mm0         \n\t"                           \
        "paddw        %6, %%mm6         \n\t"                           \
        "paddw     %%mm6, %%mm0         \n\t"                           \
        "psraw        $5, %%mm0         \n\t"                           \
        "movq      %%mm0, %5            \n\t"                           \
889 "movq 5(%0), %%mm0 \n\t" \
890 "movq %%mm0, %%mm5 \n\t" \
891 "movq %%mm0, %%mm6 \n\t" \
892 "psrlq $8, %%mm0 \n\t" \
893 "psrlq $16, %%mm5 \n\t" \
894 "punpcklbw %%mm7, %%mm0 \n\t" \
895 "punpcklbw %%mm7, %%mm5 \n\t" \
896 "paddw %%mm0, %%mm2 \n\t" \
897 "paddw %%mm5, %%mm3 \n\t" \
898 "paddw %%mm2, %%mm2 \n\t" \
899 "psubw %%mm2, %%mm3 \n\t" \
900 "movq %%mm6, %%mm2 \n\t" \
901 "psrlq $24, %%mm6 \n\t" \
902 "punpcklbw %%mm7, %%mm2 \n\t" \
903 "punpcklbw %%mm7, %%mm6 \n\t" \
904 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" \
905 "paddw %%mm2, %%mm1 \n\t" \
906 "paddw %%mm6, %%mm4 \n\t" \
907 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" \
908 "psubw %%mm4, %%mm3 \n\t" \
909 "paddw %6, %%mm1 \n\t" \
910 "paddw %%mm1, %%mm3 \n\t" \
911 "psraw $5, %%mm3 \n\t" \
912 "movq %5, %%mm1 \n\t" \
913 "packuswb %%mm3, %%mm1 \n\t" \
914 OP_MMXEXT(%%mm1, (%1), %%mm4, q) \
917 "movq 9(%0), %%mm1 \n\t" \
918 "movq %%mm1, %%mm4 \n\t" \
919 "movq %%mm1, %%mm3 \n\t" \
920 "psrlq $8, %%mm1 \n\t" \
921 "psrlq $16, %%mm4 \n\t" \
922 "punpcklbw %%mm7, %%mm1 \n\t" \
923 "punpcklbw %%mm7, %%mm4 \n\t" \
924 "paddw %%mm1, %%mm5 \n\t" \
925 "paddw %%mm4, %%mm0 \n\t" \
926 "paddw %%mm5, %%mm5 \n\t" \
927 "psubw %%mm5, %%mm0 \n\t" \
928 "movq %%mm3, %%mm5 \n\t" \
929 "psrlq $24, %%mm3 \n\t" \
930 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" \
931 "punpcklbw %%mm7, %%mm3 \n\t" \
932 "paddw %%mm3, %%mm2 \n\t" \
933 "psubw %%mm2, %%mm0 \n\t" \
934 "movq %%mm5, %%mm2 \n\t" \
935 "punpcklbw %%mm7, %%mm2 \n\t" \
936 "punpckhbw %%mm7, %%mm5 \n\t" \
937 "paddw %%mm2, %%mm6 \n\t" \
938 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" \
939 "paddw %6, %%mm0 \n\t" \
940 "paddw %%mm6, %%mm0 \n\t" \
941 "psraw $5, %%mm0 \n\t" \
945 "paddw %%mm5, %%mm3 \n\t" \
946 "pshufw $0xF9, %%mm5, %%mm6 \n\t" \
947 "paddw %%mm4, %%mm6 \n\t" \
948 "pshufw $0xBE, %%mm5, %%mm4 \n\t" \
949 "pshufw $0x6F, %%mm5, %%mm5 \n\t" \
950 "paddw %%mm1, %%mm4 \n\t" \
951 "paddw %%mm2, %%mm5 \n\t" \
952 "paddw %%mm6, %%mm6 \n\t" \
953 "psubw %%mm6, %%mm4 \n\t" \
954 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" \
955 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" \
956 "psubw %%mm5, %%mm3 \n\t" \
957 "paddw %6, %%mm4 \n\t" \
958 "paddw %%mm3, %%mm4 \n\t" \
959 "psraw $5, %%mm4 \n\t" \
960 "packuswb %%mm4, %%mm0 \n\t" \
961 OP_MMXEXT(%%mm0, 8(%1), %%mm4, q) \
967 : "+a"(src), "+c"(dst), "+D"(h) \
968 : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), \
969 "m"(temp), "m"(ROUNDER) \
static void OPNAME ## mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst,        \
        "pxor      %%mm7, %%mm7         \n\t"                           \
        "movq       (%0), %%mm0         \n\t"                           \
        "movq      %%mm0, %%mm1         \n\t"                           \
        "movq      %%mm0, %%mm2         \n\t"                           \
        "punpcklbw %%mm7, %%mm0         \n\t"                           \
        "punpckhbw %%mm7, %%mm1         \n\t"                           \
        "pshufw    $0x90, %%mm0, %%mm5  \n\t"                           \
        "pshufw    $0x41, %%mm0, %%mm6  \n\t"                           \
        "movq      %%mm2, %%mm3         \n\t"                           \
        "movq      %%mm2, %%mm4         \n\t"                           \
        "psllq        $8, %%mm2         \n\t"                           \
        "psllq       $16, %%mm3         \n\t"                           \
        "psllq       $24, %%mm4         \n\t"                           \
        "punpckhbw %%mm7, %%mm2         \n\t"                           \
        "punpckhbw %%mm7, %%mm3         \n\t"                           \
        "punpckhbw %%mm7, %%mm4         \n\t"                           \
        "paddw     %%mm3, %%mm5         \n\t"                           \
        "paddw     %%mm2, %%mm6         \n\t"                           \
        "paddw     %%mm5, %%mm5         \n\t"                           \
        "psubw     %%mm5, %%mm6         \n\t"                           \
        "pshufw    $0x06, %%mm0, %%mm5  \n\t"                           \
        "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t"                          \
        "paddw     %%mm4, %%mm0         \n\t"                           \
        "paddw     %%mm1, %%mm5         \n\t"                           \
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t"                         \
        "psubw     %%mm5, %%mm0         \n\t"                           \
        "paddw        %5, %%mm6         \n\t"                           \
        "paddw     %%mm6, %%mm0         \n\t"                           \
        "psraw        $5, %%mm0         \n\t"                           \
1013 "movd 5(%0), %%mm5 \n\t" \
1014 "punpcklbw %%mm7, %%mm5 \n\t" \
1015 "pshufw $0xF9, %%mm5, %%mm6 \n\t" \
1016 "paddw %%mm5, %%mm1 \n\t" \
1017 "paddw %%mm6, %%mm2 \n\t" \
1018 "pshufw $0xBE, %%mm5, %%mm6 \n\t" \
1019 "pshufw $0x6F, %%mm5, %%mm5 \n\t" \
1020 "paddw %%mm6, %%mm3 \n\t" \
1021 "paddw %%mm5, %%mm4 \n\t" \
1022 "paddw %%mm2, %%mm2 \n\t" \
1023 "psubw %%mm2, %%mm3 \n\t" \
1024 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" \
1025 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" \
1026 "psubw %%mm4, %%mm3 \n\t" \
1027 "paddw %5, %%mm1 \n\t" \
1028 "paddw %%mm1, %%mm3 \n\t" \
1029 "psraw $5, %%mm3 \n\t" \
1030 "packuswb %%mm3, %%mm0 \n\t" \
1031 OP_MMXEXT(%%mm0, (%1), %%mm4, q) \
1037 : "+a"(src), "+c"(dst), "+d"(h) \
1038 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), \
#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)                          \
static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst,      \
    uint64_t temp[17 * 4];                                              \
    uint64_t *temp_ptr = temp;                                          \
    __asm__ volatile (                                                  \
        "pxor      %%mm7, %%mm7         \n\t"                           \
        "movq       (%0), %%mm0         \n\t"                           \
        "movq       (%0), %%mm1         \n\t"                           \
        "movq      8(%0), %%mm2         \n\t"                           \
        "movq      8(%0), %%mm3         \n\t"                           \
        "punpcklbw %%mm7, %%mm0         \n\t"                           \
        "punpckhbw %%mm7, %%mm1         \n\t"                           \
        "punpcklbw %%mm7, %%mm2         \n\t"                           \
        "punpckhbw %%mm7, %%mm3         \n\t"                           \
        "movq      %%mm0, (%1)          \n\t"                           \
        "movq      %%mm1, 17 * 8(%1)    \n\t"                           \
        "movq      %%mm2, 2 * 17 * 8(%1) \n\t"                          \
        "movq      %%mm3, 3 * 17 * 8(%1) \n\t"                          \
        : "+r"(src), "+r"(temp_ptr), "+r"(count)                        \
        : "r"((x86_reg)srcStride)                                       \
    __asm__ volatile (                                                  \
        "movq      (%0), %%mm0          \n\t"                           \
        "movq     8(%0), %%mm1          \n\t"                           \
        "movq    16(%0), %%mm2          \n\t"                           \
        "movq    24(%0), %%mm3          \n\t"                           \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0), 104(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0), 112(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0), 120(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0), 128(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0), 128(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0), 104(%0), 120(%0), (%1),    OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0), 104(%0), 112(%0), 112(%0), (%1, %3), OP) \
        "add $136, %0                   \n\t"                           \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)                        \
        : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride),         \
          "g"(4 - 14 * (x86_reg)dstStride)                              \
static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst,       \
    uint64_t temp[9 * 2];                                               \
    uint64_t *temp_ptr = temp;                                          \
    __asm__ volatile (                                                  \
        "pxor      %%mm7, %%mm7         \n\t"                           \
        "movq       (%0), %%mm0         \n\t"                           \
        "movq       (%0), %%mm1         \n\t"                           \
        "punpcklbw %%mm7, %%mm0         \n\t"                           \
        "punpckhbw %%mm7, %%mm1         \n\t"                           \
        "movq      %%mm0, (%1)          \n\t"                           \
        "movq      %%mm1, 9 * 8(%1)     \n\t"                           \
        : "+r"(src), "+r"(temp_ptr), "+r"(count)                        \
        : "r"((x86_reg)srcStride)                                       \
    __asm__ volatile (                                                  \
        "movq      (%0), %%mm0          \n\t"                           \
        "movq     8(%0), %%mm1          \n\t"                           \
        "movq    16(%0), %%mm2          \n\t"                           \
        "movq    24(%0), %%mm3          \n\t"                           \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP) \
        "add $72, %0                    \n\t"                           \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)                        \
        : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride),         \
          "g"(4 - 6 * (x86_reg)dstStride)                               \
static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src,   \
    OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);                     \
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src,    \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,           \
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);    \
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src,    \
    OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride,           \
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src,    \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,           \
    OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride,            \
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src,    \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);  \
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);    \
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src,    \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);  \
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src,    \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);  \
    OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,       \
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src,    \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);  \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);     \
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src,    \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,          \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);     \
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src,    \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);  \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src,    \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,          \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src,    \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);     \
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src,    \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src,    \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);  \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);     \
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src,    \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,          \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);     \
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src,    \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);     \
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src,  \
    OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);                   \
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src,   \
    uint64_t temp[32];                                                  \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,         \
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);  \
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src,   \
    OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src,                  \
                                             stride, stride, 16);      \
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src,   \
    uint64_t temp[32];                                                  \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,         \
    OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half,                   \
                                  stride, stride, 16);                  \
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src,   \
    uint64_t temp[32];                                                  \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,         \
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);  \
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src,   \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride); \
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src,   \
    uint64_t temp[32];                                                  \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,         \
    OPNAME ## pixels16_l2_ ## MMX(dst, src + stride, half,              \
                                  stride, stride, 16);                  \
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src,   \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,            \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);  \
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src,   \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,        \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);  \
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src,   \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,            \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,      \
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src,   \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,        \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,      \
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src,   \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);  \
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src,   \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,      \
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src,   \
    uint64_t half[17 * 2];                                              \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,            \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);   \
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src,   \
    uint64_t half[17 * 2];                                              \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,        \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);   \
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src,   \
    uint64_t half[17 * 2];                                              \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);   \
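/* Naming: halfH is the horizontally filtered intermediate, halfHV the   \
 * additionally vertically filtered one; the mcXY suffix encodes the     \
 * quarter-pel position (X horizontal, Y vertical, in quarter-pel        \
 * units), matching the pixels_tab indexing used at init time. */        \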
#define PUT_OP(a, b, temp, size)                        \
    "mov"#size" "#a", "#b"              \n\t"

#define AVG_MMXEXT_OP(a, b, temp, size)                 \
    "mov"#size" "#b", "#temp"           \n\t"           \
    "pavgb      "#temp", "#a"           \n\t"           \
    "mov"#size" "#a", "#b"              \n\t"
QPEL_BASE(put_,        ff_pw_16, _,        PUT_OP)
QPEL_BASE(avg_,        ff_pw_16, _,        AVG_MMXEXT_OP)
QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP)
QPEL_OP(put_,          ff_pw_16, _,        PUT_OP,        mmxext)
QPEL_OP(avg_,          ff_pw_16, _,        AVG_MMXEXT_OP, mmxext)
QPEL_OP(put_no_rnd_,   ff_pw_15, _no_rnd_, PUT_OP,        mmxext)
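/* Rounder choice: the rounding filters compute (x + 16) >> 5, the
 * no_rnd variants (x + 15) >> 5, hence ff_pw_16 vs. ff_pw_15. */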
#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)                             \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
    OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);                 \

#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)                       \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
    OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src + S0, stride, SIZE,   \

#define QPEL_2TAP(OPNAME, SIZE, MMX)                                          \
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)                              \
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)                              \
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)                                 \
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =      \
    OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;                                  \
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =      \
    OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;                             \
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =      \
    OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;                             \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst,        \
    OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src + 1, stride, SIZE);      \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst,        \
    OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src + stride,                \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0,          1,       0)                   \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1,         -1,       0)                   \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0,          stride,  0)                   \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride,    -stride,  0)                   \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0,          stride,  1)                   \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1,          stride, -1)                   \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride,    -stride,  1)                   \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride + 1, -stride, -1)                  \
QPEL_2TAP(put_, 16, mmxext)
QPEL_2TAP(avg_, 16, mmxext)
QPEL_2TAP(put_,  8, mmxext)
QPEL_2TAP(avg_,  8, mmxext)
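/* The 2-tap qpel functions approximate the 8-tap filter with cheap
 * half-pel averages: QPEL_2TAP_XY maps positions that coincide with
 * half-pel ones straight onto the hpel routines, while QPEL_2TAP_L3
 * covers the remaining positions with the three-point _l3_ helpers
 * (source offsets S0, S0 + S1 and S0 + S2). */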
    put_pixels8_xy2_mmx(dst, src, stride, 8);

    put_pixels16_xy2_mmx(dst, src, stride, 16);

    avg_pixels8_xy2_mmx(dst, src, stride, 8);

    avg_pixels16_xy2_mmx(dst, src, stride, 16);
                    int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
    const int ix = ox >> (16 + shift);
    const int iy = oy >> (16 + shift);
    const int oxs  = ox  >> 4;
    const int oys  = oy  >> 4;
    const int dxxs = dxx >> 4;
    const int dxys = dxy >> 4;
    const int dyxs = dyx >> 4;
    const int dyys = dyy >> 4;
    const uint16_t r4[4]   = { r, r, r, r };
    const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
    const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
    const uint64_t shift2  = 2 * shift;
    const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
    const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
    const int dxh = dxy * (h - 1);
    const int dyw = dyx * (w - 1);
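    /* The MMX path keeps coordinates in 16-bit registers after dropping
     * the low 4 subpel bits (the >> 4 above). Fall back to ff_gmc_c
     * when the fullpel offset is not constant over the block, when the
     * dropped low bits of dxx/dxy/dyx/dyy are nonzero, or when the block
     * would read outside the picture. */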
    if (((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
         (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
        || (dxx | dxy | dyx | dyy) & 15 ||
        (unsigned)ix >= width  - w ||
        (unsigned)iy >= height - h) {
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
                 shift, r, width, height);
1678 "movd %0, %%mm6 \n\t"
1679 "pxor %%mm7, %%mm7 \n\t"
1680 "punpcklwd %%mm6, %%mm6 \n\t"
1681 "punpcklwd %%mm6, %%mm6 \n\t"
    for (x = 0; x < w; x += 4) {
        uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
                            oxs - dxys + dxxs * (x + 1),
                            oxs - dxys + dxxs * (x + 2),
                            oxs - dxys + dxxs * (x + 3) };
        uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
                            oys - dyys + dyxs * (x + 1),
                            oys - dyys + dyxs * (x + 2),
                            oys - dyys + dyxs * (x + 3) };
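        /* Per row: advance the four x/y coordinates by dxy4/dyy4, take
         * their fractional parts dx, dy (extracted by the psrlw $12
         * below), and blend the four neighbouring source pixels
         * bilinearly:
         *     dst = (s00*(S-dx)*(S-dy) + s01*dx*(S-dy) +
         *            s10*(S-dx)*dy + s11*dx*dy + r) >> (2*shift),
         * with S = 1 << shift broadcast into %%mm6 above. */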
        for (y = 0; y < h; y++) {
            __asm__ volatile (
                "movq     %0, %%mm4     \n\t"
                "movq     %1, %%mm5     \n\t"
                "paddw    %2, %%mm4     \n\t"
                "paddw    %3, %%mm5     \n\t"
                "movq  %%mm4, %0        \n\t"
                "movq  %%mm5, %1        \n\t"
                "psrlw   $12, %%mm4     \n\t"
                "psrlw   $12, %%mm5     \n\t"
                : "+m"(*dx4), "+m"(*dy4)
                : "m"(*dxy4), "m"(*dyy4)
1710 "movq %%mm6, %%mm2 \n\t"
1711 "movq %%mm6, %%mm1 \n\t"
1712 "psubw %%mm4, %%mm2 \n\t"
1713 "psubw %%mm5, %%mm1 \n\t"
1714 "movq %%mm2, %%mm0 \n\t"
1715 "movq %%mm4, %%mm3 \n\t"
1716 "pmullw %%mm1, %%mm0 \n\t"
1717 "pmullw %%mm5, %%mm3 \n\t"
1718 "pmullw %%mm5, %%mm2 \n\t"
1719 "pmullw %%mm4, %%mm1 \n\t"
1721 "movd %4, %%mm5 \n\t"
1722 "movd %3, %%mm4 \n\t"
1723 "punpcklbw %%mm7, %%mm5 \n\t"
1724 "punpcklbw %%mm7, %%mm4 \n\t"
1725 "pmullw %%mm5, %%mm3 \n\t"
1726 "pmullw %%mm4, %%mm2 \n\t"
1728 "movd %2, %%mm5 \n\t"
1729 "movd %1, %%mm4 \n\t"
1730 "punpcklbw %%mm7, %%mm5 \n\t"
1731 "punpcklbw %%mm7, %%mm4 \n\t"
1732 "pmullw %%mm5, %%mm1 \n\t"
1733 "pmullw %%mm4, %%mm0 \n\t"
1734 "paddw %5, %%mm1 \n\t"
1735 "paddw %%mm3, %%mm2 \n\t"
1736 "paddw %%mm1, %%mm0 \n\t"
1737 "paddw %%mm2, %%mm0 \n\t"
1739 "psrlw %6, %%mm0 \n\t"
1740 "packuswb %%mm0, %%mm0 \n\t"
1741 "movd %%mm0, %0 \n\t"
                : "=m"(dst[x + y * stride])
                : "m"(src[0]), "m"(src[1]),
                  "m"(src[stride]), "m"(src[stride + 1]),
                                 int stride, int h, int x, int y);
                                 int stride, int h, int x, int y);
                                 int stride, int h, int x, int y);
                                 int stride, int h, int x, int y);
                                 int stride, int h, int x, int y);
                                 int stride, int h, int x, int y);
                                 int stride, int h, int x, int y);
                                 int stride, int h, int x, int y);
                                 int stride, int h, int x, int y);
                                 int stride, int h, int x, int y);
                                 int stride, int h, int x, int y);
                                 int stride, int h, int x, int y);
#define CHROMA_MC(OP, NUM, DEPTH, OPT)                                  \
void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT      \
                            (uint8_t *dst, uint8_t *src,                \
                             int stride, int h, int x, int y);
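/* CHROMA_MC declares the externally assembled H.264 chroma MC entry
 * points for a given op, block width, bit depth and instruction set,
 * matching the hand-written prototypes above. */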
    put_pixels8_mmx(dst, src, stride, 8);

    avg_pixels8_mmx(dst, src, stride, 8);

    put_pixels16_mmx(dst, src, stride, 16);

    avg_pixels16_mmx(dst, src, stride, 16);
                                       int stride, int rnd)
    put_pixels8_mmx(dst, src, stride, 8);

                                       int stride, int rnd)
    avg_pixels8_mmxext(dst, src, stride, 8);
static void vorbis_inverse_coupling_3dnow(float *mag, float *ang,
                                          int blocksize)
    __asm__ volatile ("pxor %%mm7, %%mm7":);
    for (i = 0; i < blocksize; i += 2) {
        __asm__ volatile (
            "movq       %0, %%mm0       \n\t"
            "movq       %1, %%mm1       \n\t"
            "movq    %%mm0, %%mm2       \n\t"
            "movq    %%mm1, %%mm3       \n\t"
            "pfcmpge %%mm7, %%mm2       \n\t"
            "pfcmpge %%mm7, %%mm3       \n\t"
            "pslld     $31, %%mm2       \n\t"
            "pxor    %%mm2, %%mm1       \n\t"
            "movq    %%mm3, %%mm4       \n\t"
            "pand    %%mm1, %%mm3       \n\t"
            "pandn   %%mm1, %%mm4       \n\t"
            "pfadd   %%mm0, %%mm3       \n\t"
            "pfsub   %%mm4, %%mm0       \n\t"
            "movq    %%mm3, %1          \n\t"
            "movq    %%mm0, %0          \n\t"
            : "+m"(mag[i]), "+m"(ang[i])
    __asm__ volatile ("femms");
static void vorbis_inverse_coupling_sse(float *mag, float *ang,
                                        int blocksize)
        "movaps %0, %%xmm5              \n\t"
    for (i = 0; i < blocksize; i += 4) {
        __asm__ volatile (
            "movaps      %0, %%xmm0     \n\t"
            "movaps      %1, %%xmm1     \n\t"
            "xorps   %%xmm2, %%xmm2     \n\t"
            "xorps   %%xmm3, %%xmm3     \n\t"
            "cmpleps %%xmm0, %%xmm2     \n\t"
            "cmpleps %%xmm1, %%xmm3     \n\t"
            "andps   %%xmm5, %%xmm2     \n\t"
            "xorps   %%xmm2, %%xmm1     \n\t"
            "movaps  %%xmm3, %%xmm4     \n\t"
            "andps   %%xmm1, %%xmm3     \n\t"
            "andnps  %%xmm1, %%xmm4     \n\t"
            "addps   %%xmm0, %%xmm3     \n\t"
            "subps   %%xmm4, %%xmm0     \n\t"
            "movaps  %%xmm3, %1         \n\t"
            "movaps  %%xmm0, %0         \n\t"
            : "+m"(mag[i]), "+m"(ang[i])
static void vector_clipf_sse(float *dst, const float *src,
                             float min, float max, int len)
        "movss          %3, %%xmm4      \n\t"
        "movss          %4, %%xmm5      \n\t"
        "shufps $0, %%xmm4, %%xmm4      \n\t"
        "shufps $0, %%xmm5, %%xmm5      \n\t"
        "1:                             \n\t"
        "movaps   (%2, %0), %%xmm0      \n\t"
        "movaps 16(%2, %0), %%xmm1      \n\t"
        "movaps 32(%2, %0), %%xmm2      \n\t"
        "movaps 48(%2, %0), %%xmm3      \n\t"
        "maxps      %%xmm4, %%xmm0      \n\t"
        "maxps      %%xmm4, %%xmm1      \n\t"
        "maxps      %%xmm4, %%xmm2      \n\t"
        "maxps      %%xmm4, %%xmm3      \n\t"
        "minps      %%xmm5, %%xmm0      \n\t"
        "minps      %%xmm5, %%xmm1      \n\t"
        "minps      %%xmm5, %%xmm2      \n\t"
        "minps      %%xmm5, %%xmm3      \n\t"
        "movaps     %%xmm0,   (%1, %0)  \n\t"
        "movaps     %%xmm1, 16(%1, %0)  \n\t"
        "movaps     %%xmm2, 32(%1, %0)  \n\t"
        "movaps     %%xmm3, 48(%1, %0)  \n\t"
        : "r"(dst), "r"(src), "m"(min), "m"(max)
                                     int order, int mul);
                                     int order, int mul);
                                     int order, int mul);
                                  const int16_t *window, unsigned int len);
                                  const int16_t *window, unsigned int len);
                                  const int16_t *window, unsigned int len);
                                  const int16_t *window, unsigned int len);
                                  const int16_t *window, unsigned int len);
                                  const int16_t *window, unsigned int len);
                                  int *left, int *left_top);
                               const float *src1, int len);
                               const float *src1, int len);
                               const float *src2, int len);
                               const float *src2, int len);
                               const float *src1, int len);
                               const float *src1, int len);
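/* SET_QPEL_FUNCS below fills a 16-entry motion-compensation table
 * indexed by quarter-pel position x + 4*y: entry 0 is the fullpel copy,
 * entry 10 (x = 2, y = 2) the centre half-pel position. */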
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX)                          \
    c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                 \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU;     \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU;  \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU;  \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
#define H264_QPEL_FUNCS(x, y, CPU)                                                        \
    c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \
    c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc  ## x ## y ## _ ## CPU; \
    c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \
    c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc  ## x ## y ## _ ## CPU; \

#define H264_QPEL_FUNCS_10(x, y, CPU)                                                           \
    c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
    c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc  ## x ## y ## _10_ ## CPU; \
    c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
    c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc  ## x ## y ## _10_ ## CPU; \
    if (!high_bit_depth) {

    const int high_bit_depth = bit_depth > 8;

    if (!high_bit_depth) {

    if (!high_bit_depth) {

#if HAVE_MMXEXT_EXTERNAL
    if (!high_bit_depth) {
    } else if (bit_depth == 10) {

    if (!high_bit_depth) {

    if (!high_bit_depth) {

    const int high_bit_depth = bit_depth > 8;

#if HAVE_SSE2_INLINE

#if HAVE_SSE2_EXTERNAL
    if (!high_bit_depth) {

    if (bit_depth == 10) {

    } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {

#if HAVE_SSSE3_EXTERNAL

#if HAVE_SSE4_EXTERNAL

#if HAVE_AVX_EXTERNAL
    if (bit_depth == 10) {

#if HAVE_7REGS && HAVE_INLINE_ASM