simple_idct.c
Go to the documentation of this file.
1 /*
2  * Simple IDCT MMX
3  *
4  * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
5  *
6  * This file is part of Libav.
7  *
8  * Libav is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * Libav is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with Libav; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 #include "libavcodec/dsputil.h"
23 #include "libavcodec/simple_idct.h"
24 #include "libavutil/internal.h"
25 #include "libavutil/mem.h"
26 #include "dsputil_mmx.h"
27 
28 #if HAVE_INLINE_ASM
29 
30 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) for i = 0..7, before rounding (see C0..C7 below):
31 23170.475006
32 22725.260826
33 21406.727617
34 19265.545870
35 16384.000000
36 12872.826198
37 8866.956905
38 4520.335430
39 */
/*
 * Fixed-point cosine factors Ci = cos(i*M_PI/16)*sqrt(2)*(1<<14), i = 0..7.
 * All are rounded to nearest except C4, which is rounded down on purpose
 * (16383 instead of the exact 16384); the reason is not stated in this file.
 */
40 #define C0 23170 //cos(0*M_PI/16)*sqrt(2)*(1<<14) + 0.5
41 #define C1 22725 //cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5
42 #define C2 21407 //cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5
43 #define C3 19266 //cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5
44 #define C4 16383 //cos(4*M_PI/16)*sqrt(2)*(1<<14) - 0.5 (exact value is 16384.0, so this is one less)
45 #define C5 12873 //cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5
46 #define C6 8867 //cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5
47 #define C7 4520 //cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5
48 
/*
 * Right-shift amounts of the row and column passes. ROW_SHIFT feeds the
 * rounding constants in coeffs[]; the macro invocations inside idct() pass
 * the literal values 11 and 20 as their shift argument instead of these
 * names, so the two must be kept in sync by hand.
 */
49 #define ROW_SHIFT 11
50 #define COL_SHIFT 20 // 6
51 
/*
 * 64-bit constants referenced by name (through MANGLE) from the inline asm
 * in idct().
 */
52 DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL; // mask keeping the upper 16-bit word of each 32-bit lane; ANDed with the first source quadword in the DC-only test
53 DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL; // 1<<18 in the low 32-bit lane only; added in the DC-only shortcut just before its >>13
54 
/*
 * Constant table read by the inline asm in idct(). Each row of four int16_t
 * below is one 8-byte quadword; its byte offset from the start of the table
 * is noted at the right. The asm loads these rows as N(%2), and the comments
 * next to those loads list the four words from high to low, i.e. in the
 * reverse of the order written here.
 * NOTE(review): the asm operand list is outside this view -- confirm that %2
 * is bound to this table.
 * The order and size of the rows must not change: the asm uses hard-coded
 * byte offsets.
 */
55 DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= {
56  1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0, // offset 0: row-pass rounder, 1<<(ROW_SHIFT-1) in each 32-bit lane
57 // 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
58 // 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
59  1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0, // offset 8: rounder passed to the first row-pass invocation only
60  // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :) i.e. 32<<11 = 1<<16, which is a 1 in the high word of the first 32-bit lane; presumably this folds the column-pass rounding term into the row pass -- TODO confirm
61 // 0, 0, 0, 0,
62 // 0, 0, 0, 0,
63 
64  C4, C4, C4, C4, // offset 16
65  C4, -C4, C4, -C4, // offset 24
66 
67  C2, C6, C2, C6, // offset 32
68  C6, -C2, C6, -C2, // offset 40
69 
70  C1, C3, C1, C3, // offset 48
71  C5, C7, C5, C7, // offset 56
72 
73  C3, -C7, C3, -C7, // offset 64
74 -C1, -C5, -C1, -C5, // offset 72
75 
76  C5, -C1, C5, -C1, // offset 80
77  C7, C3, C7, C3, // offset 88
78 
79  C7, -C5, C7, -C5, // offset 96
80  C3, -C1, C3, -C1 // offset 104
81 };
82 
83 static inline void idct(int16_t *block)
84 {
85  DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
86  int16_t * const temp= (int16_t*)align_tmp;
87 
88  __asm__ volatile(
89 #if 0 //Alternative, simpler variant
90 
91 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
92  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
93  "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
94  "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
95  "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
96  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
97  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
98  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
99  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
100  "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
101  "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
102  "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
103  "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
104  "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
105  "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
106  #rounder ", %%mm4 \n\t"\
107  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
108  "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
109  "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
110  "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
111  "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
112  #rounder ", %%mm0 \n\t"\
113  "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
114  "paddd %%mm0, %%mm0 \n\t" \
115  "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
116  "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
117  "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
118  "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
119  "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
120  "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
121  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
122  "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
123  "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
124  "psrad $" #shift ", %%mm7 \n\t"\
125  "psrad $" #shift ", %%mm4 \n\t"\
126  "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
127  "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
128  "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
129  "psrad $" #shift ", %%mm1 \n\t"\
130  "psrad $" #shift ", %%mm2 \n\t"\
131  "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
132  "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
133  "movq %%mm7, " #dst " \n\t"\
134  "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
135  "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
136  "movq %%mm2, 24+" #dst " \n\t"\
137  "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
138  "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
139  "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
140  "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
141  "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
142  "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
143  "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
144  "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
145  "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
146  "psrad $" #shift ", %%mm2 \n\t"\
147  "psrad $" #shift ", %%mm0 \n\t"\
148  "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
149  "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
150  "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
151  "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
152  "psrad $" #shift ", %%mm6 \n\t"\
153  "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
154  "movq %%mm2, 8+" #dst " \n\t"\
155  "psrad $" #shift ", %%mm4 \n\t"\
156  "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
157  "movq %%mm4, 16+" #dst " \n\t"\
158 
159 #define COL_IDCT(src0, src4, src1, src5, dst, shift) \
160  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
161  "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
162  "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
163  "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
164  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
165  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
166  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
167  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
168  "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
169  "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
170  "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
171  "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
172  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
173  "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
174  "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
175  "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
176  "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
177  "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
178  "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
179  "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
180  "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
181  "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
182  "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
183  "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
184  "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
185  "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
186  "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
187  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
188  "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
189  "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
190  "psrad $" #shift ", %%mm7 \n\t"\
191  "psrad $" #shift ", %%mm4 \n\t"\
192  "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
193  "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
194  "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
195  "psrad $" #shift ", %%mm0 \n\t"\
196  "psrad $" #shift ", %%mm2 \n\t"\
197  "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
198  "movd %%mm7, " #dst " \n\t"\
199  "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
200  "movd %%mm0, 16+" #dst " \n\t"\
201  "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
202  "movd %%mm2, 96+" #dst " \n\t"\
203  "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
204  "movd %%mm4, 112+" #dst " \n\t"\
205  "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
206  "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
207  "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
208  "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
209  "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
210  "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
211  "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
212  "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
213  "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
214  "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
215  "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
216  "psrad $" #shift ", %%mm2 \n\t"\
217  "psrad $" #shift ", %%mm5 \n\t"\
218  "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
219  "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
220  "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
221  "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
222  "psrad $" #shift ", %%mm6 \n\t"\
223  "psrad $" #shift ", %%mm4 \n\t"\
224  "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
225  "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
226  "movd %%mm2, 32+" #dst " \n\t"\
227  "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
228  "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
229  "movd %%mm6, 48+" #dst " \n\t"\
230  "movd %%mm4, 64+" #dst " \n\t"\
231  "movd %%mm5, 80+" #dst " \n\t"\
232 
233 
234 #define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
235  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
236  "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
237  "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
238  "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
239  "movq "MANGLE(wm1010)", %%mm4 \n\t"\
240  "pand %%mm0, %%mm4 \n\t"\
241  "por %%mm1, %%mm4 \n\t"\
242  "por %%mm2, %%mm4 \n\t"\
243  "por %%mm3, %%mm4 \n\t"\
244  "packssdw %%mm4,%%mm4 \n\t"\
245  "movd %%mm4, %%eax \n\t"\
246  "orl %%eax, %%eax \n\t"\
247  "jz 1f \n\t"\
248  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
249  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
250  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
251  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
252  "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
253  "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
254  "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
255  "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
256  "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
257  "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
258  #rounder ", %%mm4 \n\t"\
259  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
260  "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
261  "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
262  "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
263  "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
264  #rounder ", %%mm0 \n\t"\
265  "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
266  "paddd %%mm0, %%mm0 \n\t" \
267  "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
268  "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
269  "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
270  "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
271  "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
272  "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
273  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
274  "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
275  "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
276  "psrad $" #shift ", %%mm7 \n\t"\
277  "psrad $" #shift ", %%mm4 \n\t"\
278  "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
279  "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
280  "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
281  "psrad $" #shift ", %%mm1 \n\t"\
282  "psrad $" #shift ", %%mm2 \n\t"\
283  "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
284  "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
285  "movq %%mm7, " #dst " \n\t"\
286  "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
287  "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
288  "movq %%mm2, 24+" #dst " \n\t"\
289  "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
290  "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
291  "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
292  "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
293  "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
294  "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
295  "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
296  "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
297  "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
298  "psrad $" #shift ", %%mm2 \n\t"\
299  "psrad $" #shift ", %%mm0 \n\t"\
300  "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
301  "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
302  "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
303  "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
304  "psrad $" #shift ", %%mm6 \n\t"\
305  "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
306  "movq %%mm2, 8+" #dst " \n\t"\
307  "psrad $" #shift ", %%mm4 \n\t"\
308  "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
309  "movq %%mm4, 16+" #dst " \n\t"\
310  "jmp 2f \n\t"\
311  "1: \n\t"\
312  "pslld $16, %%mm0 \n\t"\
313  "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
314  "psrad $13, %%mm0 \n\t"\
315  "packssdw %%mm0, %%mm0 \n\t"\
316  "movq %%mm0, " #dst " \n\t"\
317  "movq %%mm0, 8+" #dst " \n\t"\
318  "movq %%mm0, 16+" #dst " \n\t"\
319  "movq %%mm0, 24+" #dst " \n\t"\
320  "2: \n\t"
321 
322 
323 //IDCT( src0, src4, src1, src5, dst, rounder, shift)
324 ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
325 /*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
326 ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
327 ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
328 
329 DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
330 DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
331 DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
332 
333 
334 //IDCT( src0, src4, src1, src5, dst, shift)
335 COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
336 COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
337 COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
338 COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
339 
340 #else
341 
342 #define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
343  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
344  "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
345  "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
346  "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
347  "movq "MANGLE(wm1010)", %%mm4 \n\t"\
348  "pand %%mm0, %%mm4 \n\t"\
349  "por %%mm1, %%mm4 \n\t"\
350  "por %%mm2, %%mm4 \n\t"\
351  "por %%mm3, %%mm4 \n\t"\
352  "packssdw %%mm4,%%mm4 \n\t"\
353  "movd %%mm4, %%eax \n\t"\
354  "orl %%eax, %%eax \n\t"\
355  "jz 1f \n\t"\
356  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
357  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
358  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
359  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
360  "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
361  "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
362  "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
363  "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
364  "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
365  "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
366  #rounder ", %%mm4 \n\t"\
367  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
368  "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
369  "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
370  "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
371  "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
372  #rounder ", %%mm0 \n\t"\
373  "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
374  "paddd %%mm0, %%mm0 \n\t" \
375  "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
376  "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
377  "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
378  "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
379  "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
380  "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
381  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
382  "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
383  "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
384  "psrad $" #shift ", %%mm7 \n\t"\
385  "psrad $" #shift ", %%mm4 \n\t"\
386  "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
387  "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
388  "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
389  "psrad $" #shift ", %%mm1 \n\t"\
390  "psrad $" #shift ", %%mm2 \n\t"\
391  "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
392  "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
393  "movq %%mm7, " #dst " \n\t"\
394  "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
395  "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
396  "movq %%mm2, 24+" #dst " \n\t"\
397  "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
398  "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
399  "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
400  "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
401  "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
402  "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
403  "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
404  "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
405  "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
406  "psrad $" #shift ", %%mm2 \n\t"\
407  "psrad $" #shift ", %%mm0 \n\t"\
408  "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
409  "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
410  "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
411  "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
412  "psrad $" #shift ", %%mm6 \n\t"\
413  "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
414  "movq %%mm2, 8+" #dst " \n\t"\
415  "psrad $" #shift ", %%mm4 \n\t"\
416  "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
417  "movq %%mm4, 16+" #dst " \n\t"\
418  "jmp 2f \n\t"\
419  "1: \n\t"\
420  "pslld $16, %%mm0 \n\t"\
421  "paddd "MANGLE(d40000)", %%mm0 \n\t"\
422  "psrad $13, %%mm0 \n\t"\
423  "packssdw %%mm0, %%mm0 \n\t"\
424  "movq %%mm0, " #dst " \n\t"\
425  "movq %%mm0, 8+" #dst " \n\t"\
426  "movq %%mm0, 16+" #dst " \n\t"\
427  "movq %%mm0, 24+" #dst " \n\t"\
428  "2: \n\t"
429 
430 #define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
431  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
432  "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
433  "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
434  "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
435  "movq %%mm0, %%mm4 \n\t"\
436  "por %%mm1, %%mm4 \n\t"\
437  "por %%mm2, %%mm4 \n\t"\
438  "por %%mm3, %%mm4 \n\t"\
439  "packssdw %%mm4,%%mm4 \n\t"\
440  "movd %%mm4, %%eax \n\t"\
441  "orl %%eax, %%eax \n\t"\
442  "jz " #bt " \n\t"\
443  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
444  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
445  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
446  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
447  "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
448  "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
449  "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
450  "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
451  "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
452  "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
453  #rounder ", %%mm4 \n\t"\
454  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
455  "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
456  "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
457  "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
458  "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
459  #rounder ", %%mm0 \n\t"\
460  "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
461  "paddd %%mm0, %%mm0 \n\t" \
462  "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
463  "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
464  "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
465  "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
466  "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
467  "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
468  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
469  "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
470  "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
471  "psrad $" #shift ", %%mm7 \n\t"\
472  "psrad $" #shift ", %%mm4 \n\t"\
473  "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
474  "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
475  "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
476  "psrad $" #shift ", %%mm1 \n\t"\
477  "psrad $" #shift ", %%mm2 \n\t"\
478  "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
479  "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
480  "movq %%mm7, " #dst " \n\t"\
481  "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
482  "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
483  "movq %%mm2, 24+" #dst " \n\t"\
484  "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
485  "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
486  "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
487  "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
488  "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
489  "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
490  "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
491  "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
492  "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
493  "psrad $" #shift ", %%mm2 \n\t"\
494  "psrad $" #shift ", %%mm0 \n\t"\
495  "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
496  "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
497  "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
498  "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
499  "psrad $" #shift ", %%mm6 \n\t"\
500  "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
501  "movq %%mm2, 8+" #dst " \n\t"\
502  "psrad $" #shift ", %%mm4 \n\t"\
503  "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
504  "movq %%mm4, 16+" #dst " \n\t"\
505 
506 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
507  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
508  "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
509  "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
510  "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
511  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
512  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
513  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
514  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
515  "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
516  "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
517  "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
518  "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
519  "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
520  "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
521  #rounder ", %%mm4 \n\t"\
522  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
523  "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
524  "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
525  "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
526  "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
527  #rounder ", %%mm0 \n\t"\
528  "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
529  "paddd %%mm0, %%mm0 \n\t" \
530  "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
531  "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
532  "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
533  "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
534  "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
535  "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
536  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
537  "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
538  "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
539  "psrad $" #shift ", %%mm7 \n\t"\
540  "psrad $" #shift ", %%mm4 \n\t"\
541  "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
542  "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
543  "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
544  "psrad $" #shift ", %%mm1 \n\t"\
545  "psrad $" #shift ", %%mm2 \n\t"\
546  "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
547  "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
548  "movq %%mm7, " #dst " \n\t"\
549  "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
550  "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
551  "movq %%mm2, 24+" #dst " \n\t"\
552  "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
553  "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
554  "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
555  "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
556  "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
557  "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
558  "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
559  "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
560  "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
561  "psrad $" #shift ", %%mm2 \n\t"\
562  "psrad $" #shift ", %%mm0 \n\t"\
563  "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
564  "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
565  "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
566  "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
567  "psrad $" #shift ", %%mm6 \n\t"\
568  "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
569  "movq %%mm2, 8+" #dst " \n\t"\
570  "psrad $" #shift ", %%mm4 \n\t"\
571  "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
572  "movq %%mm4, 16+" #dst " \n\t"\
573 
574 //IDCT( src0, src4, src1, src5, dst, rounder, shift)
575 DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
576 Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
577 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
578 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
579 
580 #undef IDCT
581 #define IDCT(src0, src4, src1, src5, dst, shift) \
582  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
583  "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
584  "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
585  "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
586  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
587  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
588  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
589  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
590  "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
591  "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
592  "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
593  "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
594  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
595  "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
596  "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
597  "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
598  "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
599  "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
600  "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
601  "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
602  "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
603  "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
604  "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
605  "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
606  "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
607  "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
608  "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
609  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
610  "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
611  "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
612  "psrad $" #shift ", %%mm7 \n\t"\
613  "psrad $" #shift ", %%mm4 \n\t"\
614  "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
615  "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
616  "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
617  "psrad $" #shift ", %%mm0 \n\t"\
618  "psrad $" #shift ", %%mm2 \n\t"\
619  "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
620  "movd %%mm7, " #dst " \n\t"\
621  "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
622  "movd %%mm0, 16+" #dst " \n\t"\
623  "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
624  "movd %%mm2, 96+" #dst " \n\t"\
625  "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
626  "movd %%mm4, 112+" #dst " \n\t"\
627  "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
628  "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
629  "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
630  "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
631  "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
632  "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
633  "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
634  "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
635  "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
636  "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
637  "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
638  "psrad $" #shift ", %%mm2 \n\t"\
639  "psrad $" #shift ", %%mm5 \n\t"\
640  "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
641  "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
642  "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
643  "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
644  "psrad $" #shift ", %%mm6 \n\t"\
645  "psrad $" #shift ", %%mm4 \n\t"\
646  "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
647  "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
648  "movd %%mm2, 32+" #dst " \n\t"\
649  "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
650  "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
651  "movd %%mm6, 48+" #dst " \n\t"\
652  "movd %%mm4, 64+" #dst " \n\t"\
653  "movd %%mm5, 80+" #dst " \n\t"
654 
655 
656 //IDCT( src0, src4, src1, src5, dst, shift)
657 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
658 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
659 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
660 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
661  "jmp 9f \n\t"
662 
663  "# .p2align 4 \n\t"\
664  "4: \n\t"
665 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
666 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
667 
668 #undef IDCT
669 #define IDCT(src0, src4, src1, src5, dst, shift) \
670  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
671  "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
672  "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
673  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
674  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
675  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
676  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
677  "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
678  "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
679  "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
680  "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
681  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
682  "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
683  "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
684  "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
685  "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
686  "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
687  "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
688  "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
689  "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
690  "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
691  "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
692  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
693  "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
694  "psrad $" #shift ", %%mm1 \n\t"\
695  "psrad $" #shift ", %%mm4 \n\t"\
696  "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
697  "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
698  "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
699  "psrad $" #shift ", %%mm0 \n\t"\
700  "psrad $" #shift ", %%mm2 \n\t"\
701  "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
702  "movd %%mm1, " #dst " \n\t"\
703  "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
704  "movd %%mm0, 16+" #dst " \n\t"\
705  "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
706  "movd %%mm2, 96+" #dst " \n\t"\
707  "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
708  "movd %%mm4, 112+" #dst " \n\t"\
709  "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
710  "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
711  "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
712  "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
713  "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
714  "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
715  "psrad $" #shift ", %%mm2 \n\t"\
716  "psrad $" #shift ", %%mm5 \n\t"\
717  "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
718  "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
719  "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
720  "psrad $" #shift ", %%mm6 \n\t"\
721  "psrad $" #shift ", %%mm1 \n\t"\
722  "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
723  "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
724  "movd %%mm2, 32+" #dst " \n\t"\
725  "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
726  "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
727  "movd %%mm6, 48+" #dst " \n\t"\
728  "movd %%mm1, 64+" #dst " \n\t"\
729  "movd %%mm5, 80+" #dst " \n\t"
730 
731 //IDCT( src0, src4, src1, src5, dst, shift)
732 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
733 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
734 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
735 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
736  "jmp 9f \n\t"
737 
738  "# .p2align 4 \n\t"\
739  "6: \n\t"
740 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
741 
742 #undef IDCT
743 #define IDCT(src0, src4, src1, src5, dst, shift) \
744  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
745  "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
746  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
747  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
748  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
749  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
750  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
751  "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
752  "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
753  "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
754  "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
755  "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
756  "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
757  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
758  "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
759  "psrad $" #shift ", %%mm1 \n\t"\
760  "psrad $" #shift ", %%mm4 \n\t"\
761  "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
762  "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
763  "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
764  "psrad $" #shift ", %%mm0 \n\t"\
765  "psrad $" #shift ", %%mm2 \n\t"\
766  "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
767  "movd %%mm1, " #dst " \n\t"\
768  "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
769  "movd %%mm0, 16+" #dst " \n\t"\
770  "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
771  "movd %%mm2, 96+" #dst " \n\t"\
772  "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
773  "movd %%mm4, 112+" #dst " \n\t"\
774  "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
775  "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
776  "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
777  "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
778  "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
779  "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
780  "psrad $" #shift ", %%mm2 \n\t"\
781  "psrad $" #shift ", %%mm5 \n\t"\
782  "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
783  "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
784  "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
785  "psrad $" #shift ", %%mm6 \n\t"\
786  "psrad $" #shift ", %%mm1 \n\t"\
787  "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
788  "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
789  "movd %%mm2, 32+" #dst " \n\t"\
790  "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
791  "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
792  "movd %%mm6, 48+" #dst " \n\t"\
793  "movd %%mm1, 64+" #dst " \n\t"\
794  "movd %%mm5, 80+" #dst " \n\t"
795 
796 
797 //IDCT( src0, src4, src1, src5, dst, shift)
798 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
799 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
800 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
801 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
802  "jmp 9f \n\t"
803 
804  "# .p2align 4 \n\t"\
805  "2: \n\t"
806 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
807 
808 #undef IDCT
809 #define IDCT(src0, src4, src1, src5, dst, shift) \
810  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
811  "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
812  "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
813  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
814  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
815  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
816  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
817  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
818  "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
819  "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
820  "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
821  "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
822  "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
823  "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
824  "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
825  "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
826  "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
827  "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
828  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
829  "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
830  "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
831  "psrad $" #shift ", %%mm7 \n\t"\
832  "psrad $" #shift ", %%mm4 \n\t"\
833  "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
834  "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
835  "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
836  "psrad $" #shift ", %%mm0 \n\t"\
837  "psrad $" #shift ", %%mm2 \n\t"\
838  "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
839  "movd %%mm7, " #dst " \n\t"\
840  "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
841  "movd %%mm0, 16+" #dst " \n\t"\
842  "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
843  "movd %%mm2, 96+" #dst " \n\t"\
844  "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
845  "movd %%mm4, 112+" #dst " \n\t"\
846  "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
847  "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
848  "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
849  "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
850  "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
851  "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
852  "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
853  "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
854  "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
855  "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
856  "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
857  "psrad $" #shift ", %%mm2 \n\t"\
858  "psrad $" #shift ", %%mm5 \n\t"\
859  "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
860  "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
861  "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
862  "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
863  "psrad $" #shift ", %%mm6 \n\t"\
864  "psrad $" #shift ", %%mm4 \n\t"\
865  "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
866  "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
867  "movd %%mm2, 32+" #dst " \n\t"\
868  "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
869  "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
870  "movd %%mm6, 48+" #dst " \n\t"\
871  "movd %%mm4, 64+" #dst " \n\t"\
872  "movd %%mm5, 80+" #dst " \n\t"
873 
874 //IDCT( src0, src4, src1, src5, dst, shift)
875 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
876 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
877 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
878 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
879  "jmp 9f \n\t"
880 
881  "# .p2align 4 \n\t"\
882  "3: \n\t"
883 #undef IDCT
884 #define IDCT(src0, src4, src1, src5, dst, shift) \
885  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
886  "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
887  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
888  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
889  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
890  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
891  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
892  "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
893  "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
894  "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
895  "movq 64(%2), %%mm3 \n\t"\
896  "pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
897  "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
898  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
899  "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
900  "psrad $" #shift ", %%mm7 \n\t"\
901  "psrad $" #shift ", %%mm4 \n\t"\
902  "movq %%mm0, %%mm1 \n\t" /* A1 a1 */\
903  "paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\
904  "psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\
905  "psrad $" #shift ", %%mm0 \n\t"\
906  "psrad $" #shift ", %%mm1 \n\t"\
907  "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
908  "movd %%mm7, " #dst " \n\t"\
909  "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
910  "movd %%mm0, 16+" #dst " \n\t"\
911  "packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\
912  "movd %%mm1, 96+" #dst " \n\t"\
913  "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
914  "movd %%mm4, 112+" #dst " \n\t"\
915  "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
916  "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
917  "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
918  "movq %%mm5, %%mm1 \n\t" /* A2 a2 */\
919  "paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\
920  "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
921  "psrad $" #shift ", %%mm1 \n\t"\
922  "psrad $" #shift ", %%mm5 \n\t"\
923  "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
924  "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
925  "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\
926  "psrad $" #shift ", %%mm6 \n\t"\
927  "psrad $" #shift ", %%mm4 \n\t"\
928  "packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\
929  "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
930  "movd %%mm1, 32+" #dst " \n\t"\
931  "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
932  "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
933  "movd %%mm6, 48+" #dst " \n\t"\
934  "movd %%mm4, 64+" #dst " \n\t"\
935  "movd %%mm5, 80+" #dst " \n\t"
936 
937 
938 //IDCT( src0, src4, src1, src5, dst, shift)
939 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
940 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
941 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
942 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
943  "jmp 9f \n\t"
944 
945  "# .p2align 4 \n\t"\
946  "5: \n\t"
947 #undef IDCT
948 #define IDCT(src0, src4, src1, src5, dst, shift) \
949  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
950  "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
951  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
952  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
953  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
954  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
955  "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
956  "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
957  "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
958  "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
959  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
960  "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
961  "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
962  "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
963  "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
964  "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
965  "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
966  "movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 */\
967  "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
968  "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
969  "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
970  "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
971  "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
972  "pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
973  "pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
974  "paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\
975  "paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\
976  "psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\
977  "paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\
978  "paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\
979  "psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\
980  "psrad $" #shift ", %%mm4 \n\t"\
981  "psrad $" #shift ", %%mm7 \n\t"\
982  "psrad $" #shift ", %%mm3 \n\t"\
983  "packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\
984  "movq %%mm4, " #dst " \n\t"\
985  "psrad $" #shift ", %%mm0 \n\t"\
986  "packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\
987  "movq %%mm0, 16+" #dst " \n\t"\
988  "movq %%mm0, 96+" #dst " \n\t"\
989  "movq %%mm4, 112+" #dst " \n\t"\
990  "psrad $" #shift ", %%mm5 \n\t"\
991  "psrad $" #shift ", %%mm6 \n\t"\
992  "psrad $" #shift ", %%mm2 \n\t"\
993  "packssdw %%mm2, %%mm5 \n\t" /* A2-B2 a2-b2 */\
994  "movq %%mm5, 32+" #dst " \n\t"\
995  "psrad $" #shift ", %%mm1 \n\t"\
996  "packssdw %%mm1, %%mm6 \n\t" /* A3+B3 a3+b3 */\
997  "movq %%mm6, 48+" #dst " \n\t"\
998  "movq %%mm6, 64+" #dst " \n\t"\
999  "movq %%mm5, 80+" #dst " \n\t"
1000 
1001 
1002 //IDCT( src0, src4, src1, src5, dst, shift)
1003 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1004 //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1005 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1006 //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1007  "jmp 9f \n\t"
1008 
1009 
1010  "# .p2align 4 \n\t"\
1011  "1: \n\t"
1012 #undef IDCT
1013 #define IDCT(src0, src4, src1, src5, dst, shift) \
1014  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1015  "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
1016  "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
1017  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1018  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1019  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1020  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1021  "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
1022  "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
1023  "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
1024  "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
1025  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1026  "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
1027  "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
1028  "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
1029  "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
1030  "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1031  "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
1032  "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
1033  "movq 64(%2), %%mm1 \n\t"\
1034  "pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
1035  "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1036  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
1037  "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1038  "psrad $" #shift ", %%mm7 \n\t"\
1039  "psrad $" #shift ", %%mm4 \n\t"\
1040  "movq %%mm0, %%mm3 \n\t" /* A1 a1 */\
1041  "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1042  "psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\
1043  "psrad $" #shift ", %%mm0 \n\t"\
1044  "psrad $" #shift ", %%mm3 \n\t"\
1045  "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1046  "movd %%mm7, " #dst " \n\t"\
1047  "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1048  "movd %%mm0, 16+" #dst " \n\t"\
1049  "packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\
1050  "movd %%mm3, 96+" #dst " \n\t"\
1051  "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1052  "movd %%mm4, 112+" #dst " \n\t"\
1053  "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
1054  "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
1055  "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
1056  "movq %%mm5, %%mm3 \n\t" /* A2 a2 */\
1057  "paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\
1058  "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
1059  "psrad $" #shift ", %%mm3 \n\t"\
1060  "psrad $" #shift ", %%mm5 \n\t"\
1061  "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
1062  "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1063  "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\
1064  "psrad $" #shift ", %%mm6 \n\t"\
1065  "packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\
1066  "movd %%mm3, 32+" #dst " \n\t"\
1067  "psrad $" #shift ", %%mm4 \n\t"\
1068  "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1069  "movd %%mm6, 48+" #dst " \n\t"\
1070  "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1071  "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1072  "movd %%mm4, 64+" #dst " \n\t"\
1073  "movd %%mm5, 80+" #dst " \n\t"
1074 
1075 
1076 //IDCT( src0, src4, src1, src5, dst, shift)
1077 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1078 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1079 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1080 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1081  "jmp 9f \n\t"
1082 
1083 
1084  "# .p2align 4 \n\t"
1085  "7: \n\t"
1086 #undef IDCT
1087 #define IDCT(src0, src4, src1, src5, dst, shift) \
1088  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1089  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1090  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1091  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1092  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1093  "psrad $" #shift ", %%mm4 \n\t"\
1094  "psrad $" #shift ", %%mm0 \n\t"\
1095  "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
1096  "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
1097  "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1098  "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
1099  "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1100  "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
1101  "psrad $" #shift ", %%mm1 \n\t"\
1102  "packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\
1103  "movq %%mm4, " #dst " \n\t"\
1104  "psrad $" #shift ", %%mm2 \n\t"\
1105  "packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\
1106  "movq %%mm0, 16+" #dst " \n\t"\
1107  "movq %%mm0, 96+" #dst " \n\t"\
1108  "movq %%mm4, 112+" #dst " \n\t"\
1109  "movq %%mm0, 32+" #dst " \n\t"\
1110  "movq %%mm4, 48+" #dst " \n\t"\
1111  "movq %%mm4, 64+" #dst " \n\t"\
1112  "movq %%mm0, 80+" #dst " \n\t"
1113 
1114 //IDCT( src0, src4, src1, src5, dst, shift)
1115 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1116 //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1117 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1118 //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1119 
1120 
1121 #endif
1122 
1123 /*
1124 Input
1125  00 40 04 44 20 60 24 64
1126  10 30 14 34 50 70 54 74
1127  01 41 03 43 21 61 23 63
1128  11 31 13 33 51 71 53 73
1129  02 42 06 46 22 62 26 66
1130  12 32 16 36 52 72 56 76
1131  05 45 07 47 25 65 27 67
1132  15 35 17 37 55 75 57 77
1133 
1134 Temp
1135  00 04 10 14 20 24 30 34
1136  40 44 50 54 60 64 70 74
1137  01 03 11 13 21 23 31 33
1138  41 43 51 53 61 63 71 73
1139  02 06 12 16 22 26 32 36
1140  42 46 52 56 62 66 72 76
1141  05 07 15 17 25 27 35 37
1142  45 47 55 57 65 67 75 77
1143 */
1144 
1145 "9: \n\t"
1146  :: "r" (block), "r" (temp), "r" (coeffs)
1147  : "%eax"
1148  );
1149 }
1150 
/**
 * In-place inverse DCT entry point (MMX implementation).
 *
 * Thin public wrapper: the block pointer is forwarded unchanged to the
 * file-local idct() routine, whose final pass stores its results back
 * through the same pointer.
 *
 * @param block coefficient buffer handed to idct(); presumably one 8x8
 *              block of 64 coefficients -- TODO confirm against the
 *              idct() prologue.
 */
void ff_simple_idct_mmx(int16_t *block)
{
    idct(block);
}
1155 
1156 //FIXME merge add/put into the idct
1157 
1158 void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
1159 {
1160  idct(block);
1161  ff_put_pixels_clamped_mmx(block, dest, line_size);
1162 }
1163 void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
1164 {
1165  idct(block);
1166  ff_add_pixels_clamped_mmx(block, dest, line_size);
1167 }
1168 
1169 #endif /* HAVE_INLINE_ASM */