Go to the documentation of this file.
/*
 * Assembly-text helper macros for the build where COMPILE_TEMPLATE_SSE2 is
 * defined. Each macro expands to adjacent string literals that are pasted
 * into an inline-assembly statement.
 * MM and MOV are used here but are defined outside this listing.
 * NOTE(review): the embedded line numbers jump (21 -> 25 -> 27), so some
 * lines of the original file are missing from this view - confirm against
 * the original file before relying on this excerpt.
 */
21 #ifdef COMPILE_TEMPLATE_SSE2
/* Move mnemonic used for loads at odd byte offsets such as -1(%[cur],%[mrefs]); movdqu accepts unaligned addresses. */
25 #define MOVQU "movdqu"
/*
 * LOAD: read mem into dst with MOV, then interleave the low bytes of dst with
 * the low bytes of register 7. The loop body zeroes register 7 with pxor
 * before its first LOAD, in which case this widens bytes to 16-bit words.
 */
27 #define LOAD(mem,dst) \
28 MOV" "mem", "dst" \n\t"\
29 "punpcklbw "MM"7, "dst" \n\t"
/* PSRL1 / PSRL2: shift the whole 128-bit register right by 1 or 2 bytes. */
30 #define PSRL1(reg) "psrldq $1, "reg" \n\t"
31 #define PSRL2(reg) "psrldq $2, "reg" \n\t"
/*
 * PSHUF: copy one register into another, then shift the copy right by 2 bytes.
 * NOTE(review): with AT&T operand order (source operand first) the register
 * passed as dst is the one read and the register passed as src is the one
 * written, so the parameter names read as swapped - confirm intent.
 */
32 #define PSHUF(src,dst) "movdqa "dst", "src" \n\t"\
33 "psrldq $2, "src" \n\t"
/*
 * Second set of the same helper macros, built on 64-bit shifts and pshufw
 * instead of the 128-bit psrldq/movdqa forms.
 * NOTE(review): the embedded line numbers jump from 33 to 40, so the
 * preprocessor branch that separates this set from the SSE2 one (presumably
 * an #else plus further defines) is not visible here - confirm.
 */
/* LOAD: read mem into dst with MOV, then interleave the low bytes of dst with the low bytes of register 7. */
40 #define LOAD(mem,dst) \
41 MOV" "mem", "dst" \n\t"\
42 "punpcklbw "MM"7, "dst" \n\t"
/* PSRL1 / PSRL2: shift the 64-bit register right by 8 or 16 bits, i.e. by 1 or 2 bytes. */
43 #define PSRL1(reg) "psrlq $8, "reg" \n\t"
44 #define PSRL2(reg) "psrlq $16, "reg" \n\t"
/*
 * PSHUF: pshufw with selector 9 (binary 00 00 10 01) writes source words
 * 1, 2, 0, 0 into destination word lanes 0..3.
 * NOTE(review): with AT&T operand order the register passed as dst is the
 * shuffle source and the register passed as src receives the result.
 */
45 #define PSHUF(src,dst) "pshufw $9, "dst", "src" \n\t"
/*
 * PABS(tmp,dst): replace each signed 16-bit lane of dst with its absolute value.
 * NOTE(review): the embedded line numbers jump (50 -> 52, 55 -> 58), so the
 * #else and #endif of this conditional are presumably missing from this
 * listing - confirm against the original file.
 */
48 #ifdef COMPILE_TEMPLATE_SSSE3
/* SSSE3 form: a single pabsw; tmp is accepted but not used. */
49 #define PABS(tmp,dst) \
50 "pabsw "dst", "dst" \n\t"
/* Fallback form: tmp = 0 - dst, then dst = max(dst, tmp) per signed word. Clobbers tmp. */
52 #define PABS(tmp,dst) \
53 "pxor "tmp", "tmp" \n\t"\
54 "psubw "dst", "tmp" \n\t"\
55 "pmaxsw "tmp", "dst" \n\t"
/*
 * CHECK(pj,mj): emit assembly that compares two rows of cur at the byte
 * offsets pj and mj (both are stringized and used as displacements).
 * Registers written: 2, 3, 4 and 5. Register 7 is read as the interleave
 * partner of punpcklbw; the widening comments below assume it holds zero.
 * Visible results: register 5 holds a widened per-byte average of the two
 * loads and register 2 holds a sum of three widened absolute-difference copies.
 * NOTE(review): the embedded line numbers skip 67, 75 and 76, so up to three
 * instructions of the original macro are missing from this listing
 * (presumably byte shifts of registers 5, 3 and 4) - confirm before relying
 * on the lane alignment described here.
 */
58 #define CHECK(pj,mj) \
59 MOVQU" "#pj"(%[cur],%[mrefs]), "MM"2 \n\t" /* reg2 = bytes at cur + mrefs + pj */ \
60 MOVQU" "#mj"(%[cur],%[prefs]), "MM"3 \n\t" /* reg3 = bytes at cur + prefs + mj */ \
61 MOVQ" "MM"2, "MM"4 \n\t"\
62 MOVQ" "MM"2, "MM"5 \n\t"\
63 "pxor "MM"3, "MM"4 \n\t" /* reg4 = reg2 XOR reg3 */\
64 "pavgb "MM"3, "MM"5 \n\t" /* reg5 = per-byte average of reg2 and reg3, rounded up */\
65 "pand "MANGLE(pb_1)", "MM"4 \n\t" /* mask reg4 with pb_1; presumably bytes of 1 - confirm */\
66 "psubusb "MM"4, "MM"5 \n\t" /* saturating subtract; with pb_1 = 1 this turns the average into a rounded-down one */\
68 "punpcklbw "MM"7, "MM"5 \n\t" /* widen low bytes of reg5 to words */ \
69 MOVQ" "MM"2, "MM"4 \n\t"\
70 "psubusb "MM"3, "MM"2 \n\t"\
71 "psubusb "MM"4, "MM"3 \n\t"\
72 "pmaxub "MM"3, "MM"2 \n\t" /* reg2 = per-byte absolute difference of the two loads */\
73 MOVQ" "MM"2, "MM"3 \n\t"\
74 MOVQ" "MM"2, "MM"4 \n\t" /* reg3 and reg4 = copies of the difference */ \
77 "punpcklbw "MM"7, "MM"2 \n\t"\
78 "punpcklbw "MM"7, "MM"3 \n\t"\
79 "punpcklbw "MM"7, "MM"4 \n\t"\
80 "paddw "MM"3, "MM"2 \n\t"\
81 "paddw "MM"4, "MM"2 \n\t" /* reg2 = reg2 + reg3 + reg4 as 16-bit words */
84 MOVQ" "MM"0, "MM"3 \n\t" /* NOTE(review): the #define line naming this fragment is not in this listing (numbering jumps 81 -> 84) */\
85 "pcmpgtw "MM"2, "MM"3 \n\t" /* reg3 = per-word mask, all ones where reg0 > reg2 */ \
86 "pminsw "MM"2, "MM"0 \n\t" /* reg0 = min(reg0, reg2), signed words */ \
87 MOVQ" "MM"3, "MM"6 \n\t" /* keep a copy of the mask in reg6 */\
88 "pand "MM"3, "MM"5 \n\t" /* reg5 kept only in lanes where the mask is set */\
89 "pandn "MM"1, "MM"3 \n\t" /* reg3 = reg1 in lanes where the mask is clear */\
90 "por "MM"5, "MM"3 \n\t"\
91 MOVQ" "MM"3, "MM"1 \n\t" /* reg1 = reg5 where reg2 was smaller than reg0, otherwise unchanged */
95 "paddw "MANGLE(pw_1)", "MM"6 \n\t" /* NOTE(review): the #define line naming this fragment is not in this listing (numbering jumps 91 -> 95); pw_1 is presumably words of 1 - confirm */\
96 "psllw $14, "MM"6 \n\t" /* with pw_1 = 1: an all-ones mask lane becomes 0, a zero lane becomes 0x4000 */\
97 "paddsw "MM"6, "MM"2 \n\t" /* saturating add of that value to reg2; assumes reg6 still holds a comparison mask - confirm against callers */\
98 MOVQ" "MM"0, "MM"3 \n\t"\
99 "pcmpgtw "MM"2, "MM"3 \n\t" /* reg3 = per-word mask, all ones where reg0 > reg2 */\
100 "pminsw "MM"2, "MM"0 \n\t" /* reg0 = min(reg0, reg2), signed words */\
101 "pand "MM"3, "MM"5 \n\t"\
102 "pandn "MM"1, "MM"3 \n\t"\
103 "por "MM"5, "MM"3 \n\t"\
104 MOVQ" "MM"3, "MM"1 \n\t" /* reg1 = reg5 where the mask is set, otherwise unchanged */
/*
 * Tail of a function parameter list.
 * NOTE(review): the line carrying the return type, function name and leading
 * parameters is not part of this listing (numbering jumps 104 -> 107), so
 * the notes below only describe how each name is used in the visible lines.
 */
107 uint8_t *next, /* presumably bound to the %[next] asm operand; the operand list is not fully visible - confirm */
int w, /* loop bound: the loop body runs while x < w */
int prefs, /* passed to the asm as a register operand cast to x86_reg; used as an index added to row base addresses */
108 int mrefs, /* passed and used the same way as prefs */
int parity, /* not referenced in the visible lines */
int mode) /* presumably the %[mode] asm operand, which is compared against 2 - confirm */
114 for(x=0; x<w; x+=STEP){ /* NOTE(review): macro header not in this listing (numbering jumps 108 -> 114); STEP is defined elsewhere */\
116 "pxor "MM"7, "MM"7 \n\t" /* reg7 = 0, used by LOAD to widen bytes to words */\
117 LOAD("(%[cur],%[mrefs])", MM"0") \
118 LOAD("(%[cur],%[prefs])", MM"1") \
119 LOAD("(%["prev2"])", MM"2") \
120 LOAD("(%["next2"])", MM"3") \
121 MOVQ" "MM"3, "MM"4 \n\t"\
122 "paddw "MM"2, "MM"3 \n\t"\
123 "psraw $1, "MM"3 \n\t" /* reg3 = (prev2 row + next2 row) >> 1 */ \
124 MOVQ" "MM"0, (%[tmp]) \n\t" /* spill the cur+mrefs row to tmp+0 */ \
125 MOVQ" "MM"3, 16(%[tmp]) \n\t" /* spill the prev2/next2 average to tmp+16 */ \
126 MOVQ" "MM"1, 32(%[tmp]) \n\t" /* spill the cur+prefs row to tmp+32 */ \
127 "psubw "MM"4, "MM"2 \n\t"\
128 PABS( MM"4", MM"2") /* reg2 = abs(prev2 row - next2 row); reg4 is scratch */ \
129 LOAD("(%[prev],%[mrefs])", MM"3") \
130 LOAD("(%[prev],%[prefs])", MM"4") \
131 "psubw "MM"0, "MM"3 \n\t"\
132 "psubw "MM"1, "MM"4 \n\t" /* NOTE(review): listing lines 133-134 are missing after this line - confirm */\
135 "paddw "MM"4, "MM"3 \n\t" \
136 "psrlw $1, "MM"2 \n\t"\
137 "psrlw $1, "MM"3 \n\t"\
138 "pmaxsw "MM"3, "MM"2 \n\t" /* reg2 = max of the halved prev2/next2 difference and the halved prev-row term */\
139 LOAD("(%[next],%[mrefs])", MM"3") \
140 LOAD("(%[next],%[prefs])", MM"4") \
141 "psubw "MM"0, "MM"3 \n\t"\
142 "psubw "MM"1, "MM"4 \n\t" /* NOTE(review): listing lines 143-144 are missing after this line - confirm */\
145 "paddw "MM"4, "MM"3 \n\t" \
146 "psrlw $1, "MM"3 \n\t"\
147 "pmaxsw "MM"3, "MM"2 \n\t" /* fold the halved next-row term into the running max */\
148 MOVQ" "MM"2, 48(%[tmp]) \n\t" /* spill the running max to tmp+48 */ \
150 "paddw "MM"0, "MM"1 \n\t"\
151 "paddw "MM"0, "MM"0 \n\t"\
152 "psubw "MM"1, "MM"0 \n\t" /* reg0 = (cur+mrefs row) - (cur+prefs row) */\
153 "psrlw $1, "MM"1 \n\t" /* reg1 = average of the two cur rows */ \
154 PABS( MM"2", MM"0") /* reg0 = abs of that row difference; reg2 is scratch */ \
156 MOVQU" -1(%[cur],%[mrefs]), "MM"2 \n\t" /* reload both cur rows one byte to the left */ \
157 MOVQU" -1(%[cur],%[prefs]), "MM"3 \n\t" \
158 MOVQ" "MM"2, "MM"4 \n\t"\
159 "psubusb "MM"3, "MM"2 \n\t"\
160 "psubusb "MM"4, "MM"3 \n\t"\
161 "pmaxub "MM"3, "MM"2 \n\t" /* reg2 = per-byte absolute difference of the shifted rows */\
162 PSHUF(MM"3", MM"2") /* reg3 = rearranged copy of reg2, see PSHUF */ \
163 "punpcklbw "MM"7, "MM"2 \n\t" \
164 "punpcklbw "MM"7, "MM"3 \n\t" \
165 "paddw "MM"2, "MM"0 \n\t"\
166 "paddw "MM"3, "MM"0 \n\t"\
167 "psubw "MANGLE(pw_1)", "MM"0 \n\t" /* reg0 = reg0 + reg2 + reg3 - pw_1; NOTE(review): listing lines 168-178 are missing after this line */ \
179 MOVQ" 48(%[tmp]), "MM"6 \n\t" /* reg6 = running max saved at tmp+48 */ \
180 "cmpl $2, %[mode] \n\t" /* NOTE(review): the instruction that consumes these flags (listing line 181) is not visible */\
182 LOAD("(%["prev2"],%[mrefs],2)", MM"2") \
183 LOAD("(%["next2"],%[mrefs],2)", MM"4") \
184 LOAD("(%["prev2"],%[prefs],2)", MM"3") \
185 LOAD("(%["next2"],%[prefs],2)", MM"5") \
186 "paddw "MM"4, "MM"2 \n\t"\
187 "paddw "MM"5, "MM"3 \n\t"\
188 "psrlw $1, "MM"2 \n\t" /* reg2 = prev2/next2 average at offset 2*mrefs */ \
189 "psrlw $1, "MM"3 \n\t" /* reg3 = prev2/next2 average at offset 2*prefs */ \
190 MOVQ" (%[tmp]), "MM"4 \n\t" /* reload the values spilled at tmp+0, tmp+16 and tmp+32 */ \
191 MOVQ" 16(%[tmp]), "MM"5 \n\t" \
192 MOVQ" 32(%[tmp]), "MM"7 \n\t" /* reg7 is overwritten here and no longer holds zero */ \
193 "psubw "MM"4, "MM"2 \n\t" \
194 "psubw "MM"7, "MM"3 \n\t" \
195 MOVQ" "MM"5, "MM"0 \n\t"\
196 "psubw "MM"4, "MM"5 \n\t" \
197 "psubw "MM"7, "MM"0 \n\t" \
198 MOVQ" "MM"2, "MM"4 \n\t"\
199 "pminsw "MM"3, "MM"2 \n\t"\
200 "pmaxsw "MM"4, "MM"3 \n\t"\
201 "pmaxsw "MM"5, "MM"2 \n\t"\
202 "pminsw "MM"5, "MM"3 \n\t"\
203 "pmaxsw "MM"0, "MM"2 \n\t" /* reg2 = max(min(a, b), c, d) of the four differences computed above */ \
204 "pminsw "MM"0, "MM"3 \n\t" /* reg3 = min(max(a, b), c, d) of the same four differences */ \
205 "pxor "MM"4, "MM"4 \n\t"\
206 "pmaxsw "MM"3, "MM"6 \n\t"\
207 "psubw "MM"2, "MM"4 \n\t" \
208 "pmaxsw "MM"4, "MM"6 \n\t" /* reg6 = max(reg6, reg3, -reg2); NOTE(review): listing lines 209-210 are missing after this line */ \
211 MOVQ" 16(%[tmp]), "MM"2 \n\t" \
212 MOVQ" "MM"2, "MM"3 \n\t"\
213 "psubw "MM"6, "MM"2 \n\t" \
214 "paddw "MM"6, "MM"3 \n\t" \
215 "pmaxsw "MM"2, "MM"1 \n\t"\
216 "pminsw "MM"3, "MM"1 \n\t" /* clamp reg1 to the range [tmp16 - reg6, tmp16 + reg6] */ \
217 "packuswb "MM"1, "MM"1 \n\t" /* pack words back to bytes with unsigned saturation; NOTE(review): listing lines 218-221 are missing */\
222 [prefs]"r"((x86_reg)prefs), /* asm input operands: both offsets are passed in registers */\
223 [mrefs]"r"((x86_reg)mrefs),\
227 __asm__ volatile(MOV" "MM"1, %0" :"=m"(*dst)); /* store reg1 to *dst through a memory output operand */\