swscale_template.c 87 KB
Newer Older
1
/*
2
    Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3 4 5 6 7

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
8

9 10 11 12
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
13

14 15 16 17
    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
18

19
/*
 * CPU-feature dependent instruction mnemonics.
 *
 * Every name is #undef'd first and then redefined according to which of
 * HAVE_3DNOW / HAVE_MMX2 is defined at this point.
 */
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE

#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

#ifdef HAVE_3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#elif defined ( HAVE_MMX2 )
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH "/nop"
#define PREFETCHW "/nop"
#endif

#ifdef HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE "/nop"
#endif

/* PAVGB is only defined when MMX2 or 3DNow! is available. */
#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

/*
 * MOVNTQ(a,b) emits "movntq a, b" with MMX2 and a plain "movq a, b" otherwise.
 * It goes through REAL_MOVNTQ so that macro arguments are expanded before
 * they are stringified with '#'.
 */
#ifdef HAVE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
63

64 65 66 67
#ifdef HAVE_ALTIVEC
#include "swscale_altivec_template.c"
#endif

Michael Niedermayer's avatar
Michael Niedermayer committed
68
/*
 * Vertical multi-tap scaling of one plane to 8-bit output.
 *
 *   x      - displacement (in bytes) added to every source-line address
 *   offset - string literal: displacement from %0 of the filter list to walk
 *
 * Asm operands: %0 = base address for VROUNDER_OFFSET / offset,
 *               %1 = destination, %2 = end value compared against REG_a.
 *
 * For every 8 output samples the two accumulators (mm3/mm4) start from the
 * value stored at VROUNDER_OFFSET(%0); each 16-byte filter entry supplies a
 * source-line pointer at +0 and a coefficient quadword at +8.  The inner loop
 * ends when the next source pointer is 0.  The sums are shifted right by 3,
 * packed to unsigned bytes with saturation and stored at (%1, REG_a).
 */
#define YSCALEYUV2YV12X(x, offset) \
			"xor %%"REG_a", %%"REG_a"	\n\t"\
			"movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
			"movq %%mm3, %%mm4		\n\t"\
			"lea " offset "(%0), %%"REG_d"	\n\t"\
			"mov (%%"REG_d"), %%"REG_S"	\n\t"\
			".balign 16			\n\t" /* FIXME Unroll? */\
			"1:				\n\t"\
			"movq 8(%%"REG_d"), %%mm0	\n\t" /* filterCoeff */\
			"movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
			"movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5\n\t" /* srcData */\
			"add $16, %%"REG_d"		\n\t"\
			"mov (%%"REG_d"), %%"REG_S"	\n\t"\
			"test %%"REG_S", %%"REG_S"	\n\t"\
			"pmulhw %%mm0, %%mm2		\n\t"\
			"pmulhw %%mm0, %%mm5		\n\t"\
			"paddw %%mm2, %%mm3		\n\t"\
			"paddw %%mm5, %%mm4		\n\t"\
			" jnz 1b			\n\t"\
			"psraw $3, %%mm3		\n\t"\
			"psraw $3, %%mm4		\n\t"\
			"packuswb %%mm4, %%mm3		\n\t"\
			MOVNTQ(%%mm3, (%1, %%REGa))\
			"add $8, %%"REG_a"		\n\t"\
			"cmp %2, %%"REG_a"		\n\t"\
			"movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
			"movq %%mm3, %%mm4		\n\t"\
			"lea " offset "(%0), %%"REG_d"	\n\t"\
			"mov (%%"REG_d"), %%"REG_S"	\n\t"\
			"jb 1b				\n\t"

/*
 * Unscaled vertical pass: converts 16-bit intermediate samples to 8-bit.
 *
 * Asm operands: %0 = source (indexed with REG_a*2), %1 = destination
 * (indexed with REG_a), %2 = initial value of REG_a.
 *
 * Each iteration shifts 8 words right by 7, packs them to unsigned bytes with
 * saturation and stores them.  The loop runs until "add $8" produces a carry
 * (jnc), so the caller presumably passes a negative start index together with
 * end-biased pointers - confirm at the call site.
 */
#define YSCALEYUV2YV121 \
			"mov %2, %%"REG_a"		\n\t"\
			".balign 16			\n\t" /* FIXME Unroll? */\
			"1:				\n\t"\
			"movq (%0, %%"REG_a", 2), %%mm0	\n\t"\
			"movq 8(%0, %%"REG_a", 2), %%mm1\n\t"\
			"psraw $7, %%mm0		\n\t"\
			"psraw $7, %%mm1		\n\t"\
			"packuswb %%mm1, %%mm0		\n\t"\
			MOVNTQ(%%mm0, (%1, %%REGa))\
			"add $8, %%"REG_a"		\n\t"\
			"jnc 1b				\n\t"

/*
			:: "m" (-lumFilterSize), "m" (-chrFilterSize),
			   "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
			   "r" (dest), "m" (dstW),
			   "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
			: "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
Michael Niedermayer's avatar
Michael Niedermayer committed
119
/*
 * Loop head for multi-tap vertical scaling feeding a packed-pixel writer.
 *
 * Asm operand %0 is the base address for CHR_MMX_FILTER_OFFSET,
 * LUM_MMX_FILTER_OFFSET and VROUNDER_OFFSET.  REG_a is zeroed and used as
 * the running index; REG_d / REG_S are used as scratch.
 *
 * Opens the outer loop at label "1:" (it is not closed here; the macro
 * appended after this one must provide the matching "jb 1b").  Two inner
 * loops, both using local label "2:", walk 16-byte filter entries (source
 * pointer at +0, coefficient at +8) until a zero source pointer is found:
 *   first loop : mm3 = U sum, mm4 = V sum (V read 4096 bytes after U)
 *   second loop: mm1 = Y1 sum, mm7 = Y2 sum
 * All four accumulators start from the value at VROUNDER_OFFSET(%0).
 */
#define YSCALEYUV2PACKEDX \
		"xor %%"REG_a", %%"REG_a"	\n\t"\
		".balign 16			\n\t"\
		"nop				\n\t"\
		"1:				\n\t"\
		"lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
		"mov (%%"REG_d"), %%"REG_S"	\n\t"\
		"movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
		"movq %%mm3, %%mm4		\n\t"\
		".balign 16			\n\t"\
		"2:				\n\t"\
		"movq 8(%%"REG_d"), %%mm0	\n\t" /* filterCoeff */\
		"movq (%%"REG_S", %%"REG_a"), %%mm2	\n\t" /* UsrcData */\
		"movq 4096(%%"REG_S", %%"REG_a"), %%mm5	\n\t" /* VsrcData */\
		"add $16, %%"REG_d"		\n\t"\
		"mov (%%"REG_d"), %%"REG_S"	\n\t"\
		"pmulhw %%mm0, %%mm2		\n\t"\
		"pmulhw %%mm0, %%mm5		\n\t"\
		"paddw %%mm2, %%mm3		\n\t"\
		"paddw %%mm5, %%mm4		\n\t"\
		"test %%"REG_S", %%"REG_S"	\n\t"\
		" jnz 2b			\n\t"\
\
		"lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
		"mov (%%"REG_d"), %%"REG_S"	\n\t"\
		"movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
		"movq %%mm1, %%mm7		\n\t"\
		".balign 16			\n\t"\
		"2:				\n\t"\
		"movq 8(%%"REG_d"), %%mm0	\n\t" /* filterCoeff */\
		"movq (%%"REG_S", %%"REG_a", 2), %%mm2	\n\t" /* Y1srcData */\
		"movq 8(%%"REG_S", %%"REG_a", 2), %%mm5	\n\t" /* Y2srcData */\
		"add $16, %%"REG_d"		\n\t"\
		"mov (%%"REG_d"), %%"REG_S"	\n\t"\
		"pmulhw %%mm0, %%mm2		\n\t"\
		"pmulhw %%mm0, %%mm5		\n\t"\
		"paddw %%mm2, %%mm1		\n\t"\
		"paddw %%mm5, %%mm7		\n\t"\
		"test %%"REG_S", %%"REG_S"	\n\t"\
		" jnz 2b			\n\t"
Michael Niedermayer's avatar
Michael Niedermayer committed
159 160 161 162


/*
 * YSCALEYUV2PACKEDX followed by the YUV -> RGB conversion.
 *
 * Uses the sums left by YSCALEYUV2PACKEDX (mm3 = U, mm4 = V, mm1 = Y1,
 * mm7 = Y2) and the offsets/coefficients found at U_OFFSET, V_OFFSET,
 * Y_OFFSET, UG_COEFF, VG_COEFF, UB_COEFF, VR_COEFF and Y_COEFF relative to
 * asm operand %0.
 *
 * On exit: mm2 = 8 packed B bytes, mm4 = 8 packed G bytes, mm5 = 8 packed R
 * bytes, mm7 = 0.  The outer loop opened at "1:" is still open.
 */
#define YSCALEYUV2RGBX \
		YSCALEYUV2PACKEDX\
		"psubw "U_OFFSET"(%0), %%mm3	\n\t" /* (U-128)8*/\
		"psubw "V_OFFSET"(%0), %%mm4	\n\t" /* (V-128)8*/\
		"movq %%mm3, %%mm2		\n\t" /* (U-128)8*/\
		"movq %%mm4, %%mm5		\n\t" /* (V-128)8*/\
		"pmulhw "UG_COEFF"(%0), %%mm3	\n\t"\
		"pmulhw "VG_COEFF"(%0), %%mm4	\n\t"\
	/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
		"pmulhw "UB_COEFF"(%0), %%mm2	\n\t"\
		"pmulhw "VR_COEFF"(%0), %%mm5	\n\t"\
		"psubw "Y_OFFSET"(%0), %%mm1	\n\t" /* 8(Y-16)*/\
		"psubw "Y_OFFSET"(%0), %%mm7	\n\t" /* 8(Y-16)*/\
		"pmulhw "Y_COEFF"(%0), %%mm1	\n\t"\
		"pmulhw "Y_COEFF"(%0), %%mm7	\n\t"\
	/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
		"paddw %%mm3, %%mm4		\n\t"\
		"movq %%mm2, %%mm0		\n\t"\
		"movq %%mm5, %%mm6		\n\t"\
		"movq %%mm4, %%mm3		\n\t"\
		"punpcklwd %%mm2, %%mm2		\n\t"\
		"punpcklwd %%mm5, %%mm5		\n\t"\
		"punpcklwd %%mm4, %%mm4		\n\t"\
		"paddw %%mm1, %%mm2		\n\t"\
		"paddw %%mm1, %%mm5		\n\t"\
		"paddw %%mm1, %%mm4		\n\t"\
		"punpckhwd %%mm0, %%mm0		\n\t"\
		"punpckhwd %%mm6, %%mm6		\n\t"\
		"punpckhwd %%mm3, %%mm3		\n\t"\
		"paddw %%mm7, %%mm0		\n\t"\
		"paddw %%mm7, %%mm6		\n\t"\
		"paddw %%mm7, %%mm3		\n\t"\
		/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
		"packuswb %%mm0, %%mm2		\n\t"\
		"packuswb %%mm6, %%mm5		\n\t"\
		"packuswb %%mm3, %%mm4		\n\t"\
		"pxor %%mm7, %%mm7		\n\t"
Michael Niedermayer's avatar
Michael Niedermayer committed
198
#if 0
/*
 * Disabled (compiled out) full-horizontal-resolution bilinear YUV -> RGB.
 *
 * Asm operands as used below: %0 = buf0, %1 = buf1, %2 = uvbuf0,
 * %3 = uvbuf1, %6 = yalpha1, %7 = uvalpha1.  The two alpha values are
 * broadcast to all four words of mm6 / mm5.  Opens a loop at "1:" and leaves
 * packed B in mm3, R in mm0 and G in mm1.
 */
#define FULL_YSCALEYUV2RGB \
		"pxor %%mm7, %%mm7		\n\t"\
		"movd %6, %%mm6			\n\t" /*yalpha1*/\
		"punpcklwd %%mm6, %%mm6		\n\t"\
		"punpcklwd %%mm6, %%mm6		\n\t"\
		"movd %7, %%mm5			\n\t" /*uvalpha1*/\
		"punpcklwd %%mm5, %%mm5		\n\t"\
		"punpcklwd %%mm5, %%mm5		\n\t"\
		"xor %%"REG_a", %%"REG_a"		\n\t"\
		".balign 16			\n\t"\
		"1:				\n\t"\
		"movq (%0, %%"REG_a", 2), %%mm0	\n\t" /*buf0[eax]*/\
		"movq (%1, %%"REG_a", 2), %%mm1	\n\t" /*buf1[eax]*/\
		"movq (%2, %%"REG_a",2), %%mm2	\n\t" /* uvbuf0[eax]*/\
		"movq (%3, %%"REG_a",2), %%mm3	\n\t" /* uvbuf1[eax]*/\
		"psubw %%mm1, %%mm0		\n\t" /* buf0[eax] - buf1[eax]*/\
		"psubw %%mm3, %%mm2		\n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
		"pmulhw %%mm6, %%mm0		\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
		"pmulhw %%mm5, %%mm2		\n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
		"psraw $4, %%mm1		\n\t" /* buf1[eax] >>4*/\
		"movq 4096(%2, %%"REG_a",2), %%mm4	\n\t" /* uvbuf0[eax+2048]*/\
		"psraw $4, %%mm3		\n\t" /* uvbuf1[eax] >>4*/\
		"paddw %%mm0, %%mm1		\n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
		"movq 4096(%3, %%"REG_a",2), %%mm0	\n\t" /* uvbuf1[eax+2048]*/\
		"paddw %%mm2, %%mm3		\n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
		"psubw %%mm0, %%mm4		\n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
		"psubw "MANGLE(w80)", %%mm1	\n\t" /* 8(Y-16)*/\
		"psubw "MANGLE(w400)", %%mm3	\n\t" /* 8(U-128)*/\
		"pmulhw "MANGLE(yCoeff)", %%mm1	\n\t"\
\
\
		"pmulhw %%mm5, %%mm4		\n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
		"movq %%mm3, %%mm2		\n\t" /* (U-128)8*/\
		"pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
		"psraw $4, %%mm0		\n\t" /* uvbuf1[eax+2048] >>4*/\
		"pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
		"paddw %%mm4, %%mm0		\n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
		"psubw "MANGLE(w400)", %%mm0	\n\t" /* (V-128)8*/\
\
\
		"movq %%mm0, %%mm4		\n\t" /* (V-128)8*/\
		"pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
		"pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
		"paddw %%mm1, %%mm3		\n\t" /* B*/\
		"paddw %%mm1, %%mm0		\n\t" /* R*/\
		"packuswb %%mm3, %%mm3		\n\t"\
\
		"packuswb %%mm0, %%mm0		\n\t"\
		"paddw %%mm4, %%mm2		\n\t"\
		"paddw %%mm2, %%mm1		\n\t" /* G*/\
\
		"packuswb %%mm1, %%mm1		\n\t"
#endif
252

253
/*
 * Two-line vertical blend producing interpolated Y/U/V for a packed-YUV
 * writer.
 *
 *   index - register used as the loop index (zeroed here)
 *   c     - register holding the base address for the *_MMX_FILTER_OFFSET
 *           fields
 *
 * Asm operands per the inline comments: %0 = buf0, %1 = buf1, %2 = uvbuf0,
 * %3 = uvbuf1.
 *
 * NOTE: before the loop the coefficients stored at
 * CHR_MMX_FILTER_OFFSET+8(c) and LUM_MMX_FILTER_OFFSET+8(c) are shifted right
 * by 3 and written back, i.e. this macro modifies that memory.
 *
 * Opens the loop at "1:" and leaves mm3 = U, mm4 = V, mm1 = Y1, mm7 = Y2;
 * the macro appended after it must close the loop.
 * YSCALEYUV2PACKED is the wrapper that expands its arguments first.
 */
#define REAL_YSCALEYUV2PACKED(index, c) \
		"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
		"movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
		"psraw $3, %%mm0		\n\t"\
		"psraw $3, %%mm1		\n\t"\
		"movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
		"movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
		"xor "#index", "#index"		\n\t"\
		".balign 16			\n\t"\
		"1:				\n\t"\
		"movq (%2, "#index"), %%mm2	\n\t" /* uvbuf0[eax]*/\
		"movq (%3, "#index"), %%mm3	\n\t" /* uvbuf1[eax]*/\
		"movq 4096(%2, "#index"), %%mm5	\n\t" /* uvbuf0[eax+2048]*/\
		"movq 4096(%3, "#index"), %%mm4	\n\t" /* uvbuf1[eax+2048]*/\
		"psubw %%mm3, %%mm2		\n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
		"psubw %%mm4, %%mm5		\n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
		"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
		"pmulhw %%mm0, %%mm2		\n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
		"pmulhw %%mm0, %%mm5		\n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
		"psraw $7, %%mm3		\n\t" /* uvbuf1[eax] >>7*/\
		"psraw $7, %%mm4		\n\t" /* uvbuf1[eax+2048] >>7*/\
		"paddw %%mm2, %%mm3		\n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
		"paddw %%mm5, %%mm4		\n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
		"movq (%0, "#index", 2), %%mm0	\n\t" /*buf0[eax]*/\
		"movq (%1, "#index", 2), %%mm1	\n\t" /*buf1[eax]*/\
		"movq 8(%0, "#index", 2), %%mm6	\n\t" /*buf0[eax+4]*/\
		"movq 8(%1, "#index", 2), %%mm7	\n\t" /*buf1[eax+4]*/\
		"psubw %%mm1, %%mm0		\n\t" /* buf0[eax] - buf1[eax]*/\
		"psubw %%mm7, %%mm6		\n\t" /* buf0[eax+4] - buf1[eax+4]*/\
		"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
		"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax+4] - buf1[eax+4])yalpha1>>16*/\
		"psraw $7, %%mm1		\n\t" /* buf1[eax] >>7*/\
		"psraw $7, %%mm7		\n\t" /* buf1[eax+4] >>7*/\
		"paddw %%mm0, %%mm1		\n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
		"paddw %%mm6, %%mm7		\n\t" /* buf0[eax+4]yalpha1 + buf1[eax+4](1-yalpha1) >>16*/

#define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
                
/*
 * Two-line vertical blend followed by YUV -> RGB conversion.
 *
 *   index - register used as the loop index (zeroed here)
 *   c     - register holding the base address for CHR_/LUM_MMX_FILTER_OFFSET,
 *           U_OFFSET, V_OFFSET, Y_OFFSET and the *_COEFF fields
 *
 * Asm operands per the inline comments: %0 = buf0, %1 = buf1, %2 = uvbuf0,
 * %3 = uvbuf1.
 *
 * Opens the loop at "1:".  On exit mm2 = packed B, mm4 = packed G,
 * mm5 = packed R, mm7 = 0; a WRITE* macro appended after it is expected to
 * store the pixels and close the loop.
 * YSCALEYUV2RGB is the wrapper that expands its arguments first.
 */
#define REAL_YSCALEYUV2RGB(index, c) \
		"xor "#index", "#index"	\n\t"\
		".balign 16			\n\t"\
		"1:				\n\t"\
		"movq (%2, "#index"), %%mm2	\n\t" /* uvbuf0[eax]*/\
		"movq (%3, "#index"), %%mm3	\n\t" /* uvbuf1[eax]*/\
		"movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\
		"movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\
		"psubw %%mm3, %%mm2		\n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
		"psubw %%mm4, %%mm5		\n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
		"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
		"pmulhw %%mm0, %%mm2		\n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
		"pmulhw %%mm0, %%mm5		\n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
		"psraw $4, %%mm3		\n\t" /* uvbuf1[eax] >>4*/\
		"psraw $4, %%mm4		\n\t" /* uvbuf1[eax+2048] >>4*/\
		"paddw %%mm2, %%mm3		\n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
		"paddw %%mm5, %%mm4		\n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
		"psubw "U_OFFSET"("#c"), %%mm3	\n\t" /* (U-128)8*/\
		"psubw "V_OFFSET"("#c"), %%mm4	\n\t" /* (V-128)8*/\
		"movq %%mm3, %%mm2		\n\t" /* (U-128)8*/\
		"movq %%mm4, %%mm5		\n\t" /* (V-128)8*/\
		"pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
		"pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
	/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
		"movq (%0, "#index", 2), %%mm0	\n\t" /*buf0[eax]*/\
		"movq (%1, "#index", 2), %%mm1	\n\t" /*buf1[eax]*/\
		"movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax+4]*/\
		"movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax+4]*/\
		"psubw %%mm1, %%mm0		\n\t" /* buf0[eax] - buf1[eax]*/\
		"psubw %%mm7, %%mm6		\n\t" /* buf0[eax+4] - buf1[eax+4]*/\
		"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
		"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax+4] - buf1[eax+4])yalpha1>>16*/\
		"psraw $4, %%mm1		\n\t" /* buf1[eax] >>4*/\
		"psraw $4, %%mm7		\n\t" /* buf1[eax+4] >>4*/\
		"paddw %%mm0, %%mm1		\n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
		"paddw %%mm6, %%mm7		\n\t" /* buf0[eax+4]yalpha1 + buf1[eax+4](1-yalpha1) >>16*/\
		"pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
		"pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
		"psubw "Y_OFFSET"("#c"), %%mm1	\n\t" /* 8(Y-16)*/\
		"psubw "Y_OFFSET"("#c"), %%mm7	\n\t" /* 8(Y-16)*/\
		"pmulhw "Y_COEFF"("#c"), %%mm1	\n\t"\
		"pmulhw "Y_COEFF"("#c"), %%mm7	\n\t"\
	/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
		"paddw %%mm3, %%mm4		\n\t"\
		"movq %%mm2, %%mm0		\n\t"\
		"movq %%mm5, %%mm6		\n\t"\
		"movq %%mm4, %%mm3		\n\t"\
		"punpcklwd %%mm2, %%mm2		\n\t"\
		"punpcklwd %%mm5, %%mm5		\n\t"\
		"punpcklwd %%mm4, %%mm4		\n\t"\
		"paddw %%mm1, %%mm2		\n\t"\
		"paddw %%mm1, %%mm5		\n\t"\
		"paddw %%mm1, %%mm4		\n\t"\
		"punpckhwd %%mm0, %%mm0		\n\t"\
		"punpckhwd %%mm6, %%mm6		\n\t"\
		"punpckhwd %%mm3, %%mm3		\n\t"\
		"paddw %%mm7, %%mm0		\n\t"\
		"paddw %%mm7, %%mm6		\n\t"\
		"paddw %%mm7, %%mm3		\n\t"\
		/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
		"packuswb %%mm0, %%mm2		\n\t"\
		"packuswb %%mm6, %%mm5		\n\t"\
		"packuswb %%mm3, %%mm4		\n\t"\
		"pxor %%mm7, %%mm7		\n\t"
#define YSCALEYUV2RGB(index, c)  REAL_YSCALEYUV2RGB(index, c)
Michael Niedermayer's avatar
Michael Niedermayer committed
356
                
357 358
/*
 * Single-line (no vertical interpolation) loop head for a packed-YUV writer.
 *
 *   index - register used as the loop index (zeroed here)
 *   c     - not referenced by this macro
 *
 * Asm operands per the inline comments: %0 = buf0, %2 = uvbuf0.
 * Opens the loop at "1:" and leaves mm3 = U, mm4 = V, mm1 = Y1, mm7 = Y2,
 * each shifted right by 7; the macro appended after it must close the loop.
 * YSCALEYUV2PACKED1 is the wrapper that expands its arguments first.
 */
#define REAL_YSCALEYUV2PACKED1(index, c) \
		"xor "#index", "#index"		\n\t"\
		".balign 16			\n\t"\
		"1:				\n\t"\
		"movq (%2, "#index"), %%mm3	\n\t" /* uvbuf0[eax]*/\
		"movq 4096(%2, "#index"), %%mm4	\n\t" /* uvbuf0[eax+2048]*/\
		"psraw $7, %%mm3		\n\t" \
		"psraw $7, %%mm4		\n\t" \
		"movq (%0, "#index", 2), %%mm1	\n\t" /*buf0[eax]*/\
		"movq 8(%0, "#index", 2), %%mm7	\n\t" /*buf0[eax+4]*/\
		"psraw $7, %%mm1		\n\t" \
		"psraw $7, %%mm7		\n\t"

#define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
                
/*
 * Single-line (no vertical interpolation) YUV -> RGB conversion.
 *
 *   index - register used as the loop index (zeroed here)
 *   c     - register holding the base address for U_OFFSET, V_OFFSET,
 *           Y_OFFSET and the *_COEFF fields
 *
 * Asm operands per the inline comments: %0 = buf0, %2 = uvbuf0.
 * Opens the loop at "1:".  On exit mm2 = packed B, mm4 = packed G,
 * mm5 = packed R, mm7 = 0; a WRITE* macro appended after it is expected to
 * store the pixels and close the loop.
 * YSCALEYUV2RGB1 is the wrapper that expands its arguments first.
 */
#define REAL_YSCALEYUV2RGB1(index, c) \
		"xor "#index", "#index"	\n\t"\
		".balign 16			\n\t"\
		"1:				\n\t"\
		"movq (%2, "#index"), %%mm3	\n\t" /* uvbuf0[eax]*/\
		"movq 4096(%2, "#index"), %%mm4	\n\t" /* uvbuf0[eax+2048]*/\
		"psraw $4, %%mm3		\n\t" /* uvbuf0[eax] >>4*/\
		"psraw $4, %%mm4		\n\t" /* uvbuf0[eax+2048] >>4*/\
		"psubw "U_OFFSET"("#c"), %%mm3	\n\t" /* (U-128)8*/\
		"psubw "V_OFFSET"("#c"), %%mm4	\n\t" /* (V-128)8*/\
		"movq %%mm3, %%mm2		\n\t" /* (U-128)8*/\
		"movq %%mm4, %%mm5		\n\t" /* (V-128)8*/\
		"pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
		"pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
	/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
		"movq (%0, "#index", 2), %%mm1	\n\t" /*buf0[eax]*/\
		"movq 8(%0, "#index", 2), %%mm7	\n\t" /*buf0[eax+4]*/\
		"psraw $4, %%mm1		\n\t" /* buf0[eax] >>4*/\
		"psraw $4, %%mm7		\n\t" /* buf0[eax+4] >>4*/\
		"pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
		"pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
		"psubw "Y_OFFSET"("#c"), %%mm1	\n\t" /* 8(Y-16)*/\
		"psubw "Y_OFFSET"("#c"), %%mm7	\n\t" /* 8(Y-16)*/\
		"pmulhw "Y_COEFF"("#c"), %%mm1	\n\t"\
		"pmulhw "Y_COEFF"("#c"), %%mm7	\n\t"\
	/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
		"paddw %%mm3, %%mm4		\n\t"\
		"movq %%mm2, %%mm0		\n\t"\
		"movq %%mm5, %%mm6		\n\t"\
		"movq %%mm4, %%mm3		\n\t"\
		"punpcklwd %%mm2, %%mm2		\n\t"\
		"punpcklwd %%mm5, %%mm5		\n\t"\
		"punpcklwd %%mm4, %%mm4		\n\t"\
		"paddw %%mm1, %%mm2		\n\t"\
		"paddw %%mm1, %%mm5		\n\t"\
		"paddw %%mm1, %%mm4		\n\t"\
		"punpckhwd %%mm0, %%mm0		\n\t"\
		"punpckhwd %%mm6, %%mm6		\n\t"\
		"punpckhwd %%mm3, %%mm3		\n\t"\
		"paddw %%mm7, %%mm0		\n\t"\
		"paddw %%mm7, %%mm6		\n\t"\
		"paddw %%mm7, %%mm3		\n\t"\
		/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
		"packuswb %%mm0, %%mm2		\n\t"\
		"packuswb %%mm6, %%mm5		\n\t"\
		"packuswb %%mm3, %%mm4		\n\t"\
		"pxor %%mm7, %%mm7		\n\t"
#define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
420

421 422
/*
 * Loop head for a packed-YUV writer: luma from a single line, chroma as the
 * sum of two lines.
 *
 *   index - register used as the loop index (zeroed here)
 *   c     - not referenced by this macro
 *
 * Asm operands per the inline comments: %0 = buf0, %2 = uvbuf0, %3 = uvbuf1.
 * Chroma: the two lines are added and the sum is shifted right (logically)
 * by 8.  Luma: shifted right (arithmetically) by 7.
 * Opens the loop at "1:" and leaves mm3 = U, mm4 = V, mm1 = Y1, mm7 = Y2;
 * the macro appended after it must close the loop.
 * YSCALEYUV2PACKED1b is the wrapper that expands its arguments first.
 */
#define REAL_YSCALEYUV2PACKED1b(index, c) \
		"xor "#index", "#index"		\n\t"\
		".balign 16			\n\t"\
		"1:				\n\t"\
		"movq (%2, "#index"), %%mm2	\n\t" /* uvbuf0[eax]*/\
		"movq (%3, "#index"), %%mm3	\n\t" /* uvbuf1[eax]*/\
		"movq 4096(%2, "#index"), %%mm5	\n\t" /* uvbuf0[eax+2048]*/\
		"movq 4096(%3, "#index"), %%mm4	\n\t" /* uvbuf1[eax+2048]*/\
		"paddw %%mm2, %%mm3		\n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
		"paddw %%mm5, %%mm4		\n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
		"psrlw $8, %%mm3		\n\t" \
		"psrlw $8, %%mm4		\n\t" \
		"movq (%0, "#index", 2), %%mm1	\n\t" /*buf0[eax]*/\
		"movq 8(%0, "#index", 2), %%mm7	\n\t" /*buf0[eax+4]*/\
		"psraw $7, %%mm1		\n\t" \
		"psraw $7, %%mm7		\n\t"
#define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
Michael Niedermayer's avatar
Michael Niedermayer committed
438
                
439
/*
 * YUV -> RGB conversion that does vertical chrominance interpolation: luma
 * comes from a single line, chroma is the sum of two lines.
 *
 *   index - register used as the loop index (zeroed here)
 *   c     - register holding the base address for U_OFFSET, V_OFFSET,
 *           Y_OFFSET and the *_COEFF fields
 *
 * Asm operands per the inline comments: %0 = buf0, %2 = uvbuf0, %3 = uvbuf1.
 * Opens the loop at "1:".  On exit mm2 = packed B, mm4 = packed G,
 * mm5 = packed R, mm7 = 0; a WRITE* macro appended after it is expected to
 * store the pixels and close the loop.
 * YSCALEYUV2RGB1b is the wrapper that expands its arguments first.
 */
#define REAL_YSCALEYUV2RGB1b(index, c) \
		"xor "#index", "#index"		\n\t"\
		".balign 16			\n\t"\
		"1:				\n\t"\
		"movq (%2, "#index"), %%mm2	\n\t" /* uvbuf0[eax]*/\
		"movq (%3, "#index"), %%mm3	\n\t" /* uvbuf1[eax]*/\
		"movq 4096(%2, "#index"), %%mm5	\n\t" /* uvbuf0[eax+2048]*/\
		"movq 4096(%3, "#index"), %%mm4	\n\t" /* uvbuf1[eax+2048]*/\
		"paddw %%mm2, %%mm3		\n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
		"paddw %%mm5, %%mm4		\n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
		"psrlw $5, %%mm3		\n\t" /*FIXME might overflow*/\
		"psrlw $5, %%mm4		\n\t" /*FIXME might overflow*/\
		"psubw "U_OFFSET"("#c"), %%mm3	\n\t" /* (U-128)8*/\
		"psubw "V_OFFSET"("#c"), %%mm4	\n\t" /* (V-128)8*/\
		"movq %%mm3, %%mm2		\n\t" /* (U-128)8*/\
		"movq %%mm4, %%mm5		\n\t" /* (V-128)8*/\
		"pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
		"pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
	/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
		"movq (%0, "#index", 2), %%mm1	\n\t" /*buf0[eax]*/\
		"movq 8(%0, "#index", 2), %%mm7	\n\t" /*buf0[eax+4]*/\
		"psraw $4, %%mm1		\n\t" /* buf0[eax] >>4*/\
		"psraw $4, %%mm7		\n\t" /* buf0[eax+4] >>4*/\
		"pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
		"pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
		"psubw "Y_OFFSET"("#c"), %%mm1	\n\t" /* 8(Y-16)*/\
		"psubw "Y_OFFSET"("#c"), %%mm7	\n\t" /* 8(Y-16)*/\
		"pmulhw "Y_COEFF"("#c"), %%mm1	\n\t"\
		"pmulhw "Y_COEFF"("#c"), %%mm7	\n\t"\
	/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
		"paddw %%mm3, %%mm4		\n\t"\
		"movq %%mm2, %%mm0		\n\t"\
		"movq %%mm5, %%mm6		\n\t"\
		"movq %%mm4, %%mm3		\n\t"\
		"punpcklwd %%mm2, %%mm2		\n\t"\
		"punpcklwd %%mm5, %%mm5		\n\t"\
		"punpcklwd %%mm4, %%mm4		\n\t"\
		"paddw %%mm1, %%mm2		\n\t"\
		"paddw %%mm1, %%mm5		\n\t"\
		"paddw %%mm1, %%mm4		\n\t"\
		"punpckhwd %%mm0, %%mm0		\n\t"\
		"punpckhwd %%mm6, %%mm6		\n\t"\
		"punpckhwd %%mm3, %%mm3		\n\t"\
		"paddw %%mm7, %%mm0		\n\t"\
		"paddw %%mm7, %%mm6		\n\t"\
		"paddw %%mm7, %%mm3		\n\t"\
		/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
		"packuswb %%mm0, %%mm2		\n\t"\
		"packuswb %%mm6, %%mm5		\n\t"\
		"packuswb %%mm3, %%mm4		\n\t"\
		"pxor %%mm7, %%mm7		\n\t"
#define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
492

493
/*
 * Stores 8 pixels as 32 bits per pixel and closes the loop opened at "1:".
 *
 *   dst   - destination base register
 *   dstw  - operand compared against index to decide whether to loop again
 *   index - pixel index register; advanced by 8 here
 *
 * Expects mm2 = 8 B bytes, mm4 = 8 G bytes, mm5 = 8 R bytes, mm7 = 0.
 * The bytes are interleaved into four quadwords of two pixels each (byte
 * order in memory B, G, R, 0) and written to dst + index*4 with MOVNTQ.
 * WRITEBGR32 is the wrapper that expands its arguments first.
 */
#define REAL_WRITEBGR32(dst, dstw, index) \
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
			"movq %%mm2, %%mm1		\n\t" /* B */\
			"movq %%mm5, %%mm6		\n\t" /* R */\
			"punpcklbw %%mm4, %%mm2		\n\t" /* GBGBGBGB 0 */\
			"punpcklbw %%mm7, %%mm5		\n\t" /* 0R0R0R0R 0 */\
			"punpckhbw %%mm4, %%mm1		\n\t" /* GBGBGBGB 2 */\
			"punpckhbw %%mm7, %%mm6		\n\t" /* 0R0R0R0R 2 */\
			"movq %%mm2, %%mm0		\n\t" /* GBGBGBGB 0 */\
			"movq %%mm1, %%mm3		\n\t" /* GBGBGBGB 2 */\
			"punpcklwd %%mm5, %%mm0		\n\t" /* 0RGB0RGB 0 */\
			"punpckhwd %%mm5, %%mm2		\n\t" /* 0RGB0RGB 1 */\
			"punpcklwd %%mm6, %%mm1		\n\t" /* 0RGB0RGB 2 */\
			"punpckhwd %%mm6, %%mm3		\n\t" /* 0RGB0RGB 3 */\
\
			MOVNTQ(%%mm0, (dst, index, 4))\
			MOVNTQ(%%mm2, 8(dst, index, 4))\
			MOVNTQ(%%mm1, 16(dst, index, 4))\
			MOVNTQ(%%mm3, 24(dst, index, 4))\
\
			"add $8, "#index"		\n\t"\
			"cmp "#dstw", "#index"		\n\t"\
			" jb 1b				\n\t"
#define WRITEBGR32(dst, dstw, index)  REAL_WRITEBGR32(dst, dstw, index)
517

518
/*
 * Stores 8 pixels as 16 bits per pixel and closes the loop opened at "1:".
 *
 *   dst   - destination base register
 *   dstw  - operand compared against index to decide whether to loop again
 *   index - pixel index register; advanced by 8 here
 *
 * Expects mm2 = 8 B bytes, mm4 = 8 G bytes, mm5 = 8 R bytes, mm7 = 0.
 * B and R are masked with bF8 and G with bFC (presumably 0xF8 / 0xFC byte
 * masks - defined outside this chunk).  Each output word is then
 * (R << 8) | (G << 3) | (B >> 3); two quadwords are written to
 * dst + index*2 with MOVNTQ.
 * WRITEBGR16 is the wrapper that expands its arguments first.
 */
#define REAL_WRITEBGR16(dst, dstw, index) \
			"pand "MANGLE(bF8)", %%mm2	\n\t" /* B */\
			"pand "MANGLE(bFC)", %%mm4	\n\t" /* G */\
			"pand "MANGLE(bF8)", %%mm5	\n\t" /* R */\
			"psrlq $3, %%mm2		\n\t"\
\
			"movq %%mm2, %%mm1		\n\t"\
			"movq %%mm4, %%mm3		\n\t"\
\
			"punpcklbw %%mm7, %%mm3		\n\t"\
			"punpcklbw %%mm5, %%mm2		\n\t"\
			"punpckhbw %%mm7, %%mm4		\n\t"\
			"punpckhbw %%mm5, %%mm1		\n\t"\
\
			"psllq $3, %%mm3		\n\t"\
			"psllq $3, %%mm4		\n\t"\
\
			"por %%mm3, %%mm2		\n\t"\
			"por %%mm4, %%mm1		\n\t"\
\
			MOVNTQ(%%mm2, (dst, index, 2))\
			MOVNTQ(%%mm1, 8(dst, index, 2))\
\
			"add $8, "#index"		\n\t"\
			"cmp "#dstw", "#index"		\n\t"\
			" jb 1b				\n\t"
#define WRITEBGR16(dst, dstw, index)  REAL_WRITEBGR16(dst, dstw, index)
545

546
/*
 * Stores 8 pixels as 15 bits per pixel (in 16-bit words) and closes the loop
 * opened at "1:".
 *
 *   dst   - destination base register
 *   dstw  - operand compared against index to decide whether to loop again
 *   index - pixel index register; advanced by 8 here
 *
 * Expects mm2 = 8 B bytes, mm4 = 8 G bytes, mm5 = 8 R bytes, mm7 = 0.
 * All three channels are masked with bF8 (presumably a 0xF8 byte mask -
 * defined outside this chunk).  Each output word is then
 * ((R >> 1) << 8) | (G << 2) | (B >> 3); two quadwords are written to
 * dst + index*2 with MOVNTQ.
 * WRITEBGR15 is the wrapper that expands its arguments first.
 */
#define REAL_WRITEBGR15(dst, dstw, index) \
			"pand "MANGLE(bF8)", %%mm2	\n\t" /* B */\
			"pand "MANGLE(bF8)", %%mm4	\n\t" /* G */\
			"pand "MANGLE(bF8)", %%mm5	\n\t" /* R */\
			"psrlq $3, %%mm2		\n\t"\
			"psrlq $1, %%mm5		\n\t"\
\
			"movq %%mm2, %%mm1		\n\t"\
			"movq %%mm4, %%mm3		\n\t"\
\
			"punpcklbw %%mm7, %%mm3		\n\t"\
			"punpcklbw %%mm5, %%mm2		\n\t"\
			"punpckhbw %%mm7, %%mm4		\n\t"\
			"punpckhbw %%mm5, %%mm1		\n\t"\
\
			"psllq $2, %%mm3		\n\t"\
			"psllq $2, %%mm4		\n\t"\
\
			"por %%mm3, %%mm2		\n\t"\
			"por %%mm4, %%mm1		\n\t"\
\
			MOVNTQ(%%mm2, (dst, index, 2))\
			MOVNTQ(%%mm1, 8(dst, index, 2))\
\
			"add $8, "#index"		\n\t"\
			"cmp "#dstw", "#index"		\n\t"\
			" jb 1b				\n\t"
#define WRITEBGR15(dst, dstw, index)  REAL_WRITEBGR15(dst, dstw, index)
Michael Niedermayer's avatar
Michael Niedermayer committed
574

Michael Niedermayer's avatar
Michael Niedermayer committed
575
/*
 * Stores 8 pixels as 24 bits per pixel (24 bytes) and closes the loop opened
 * at "1:".
 *
 *   dst   - destination pointer register; advanced by 24 here
 *   dstw  - operand compared against index to decide whether to loop again
 *   index - pixel index register; advanced by 8 here
 *
 * Expects mm2 = 8 B bytes, mm4 = 8 G bytes, mm5 = 8 R bytes, mm7 = 0.
 * The pixels are first expanded to four 0RGB0RGB quadwords, then the padding
 * bytes are squeezed out using the masks bm00000111, bm11111000 and
 * bm00001111 (defined outside this chunk) so that three quadwords remain.
 * The per-line comments show the byte layout, most significant byte first.
 */
#define WRITEBGR24OLD(dst, dstw, index) \
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
			"movq %%mm2, %%mm1		\n\t" /* B */\
			"movq %%mm5, %%mm6		\n\t" /* R */\
			"punpcklbw %%mm4, %%mm2		\n\t" /* GBGBGBGB 0 */\
			"punpcklbw %%mm7, %%mm5		\n\t" /* 0R0R0R0R 0 */\
			"punpckhbw %%mm4, %%mm1		\n\t" /* GBGBGBGB 2 */\
			"punpckhbw %%mm7, %%mm6		\n\t" /* 0R0R0R0R 2 */\
			"movq %%mm2, %%mm0		\n\t" /* GBGBGBGB 0 */\
			"movq %%mm1, %%mm3		\n\t" /* GBGBGBGB 2 */\
			"punpcklwd %%mm5, %%mm0		\n\t" /* 0RGB0RGB 0 */\
			"punpckhwd %%mm5, %%mm2		\n\t" /* 0RGB0RGB 1 */\
			"punpcklwd %%mm6, %%mm1		\n\t" /* 0RGB0RGB 2 */\
			"punpckhwd %%mm6, %%mm3		\n\t" /* 0RGB0RGB 3 */\
\
			"movq %%mm0, %%mm4		\n\t" /* 0RGB0RGB 0 */\
			"psrlq $8, %%mm0		\n\t" /* 00RGB0RG 0 */\
			"pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
			"pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
			"por %%mm4, %%mm0		\n\t" /* 00RGBRGB 0 */\
			"movq %%mm2, %%mm4		\n\t" /* 0RGB0RGB 1 */\
			"psllq $48, %%mm2		\n\t" /* GB000000 1 */\
			"por %%mm2, %%mm0		\n\t" /* GBRGBRGB 0 */\
\
			"movq %%mm4, %%mm2		\n\t" /* 0RGB0RGB 1 */\
			"psrld $16, %%mm4		\n\t" /* 000R000R 1 */\
			"psrlq $24, %%mm2		\n\t" /* 0000RGB0 1.5 */\
			"por %%mm4, %%mm2		\n\t" /* 000RRGBR 1 */\
			"pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
			"movq %%mm1, %%mm4		\n\t" /* 0RGB0RGB 2 */\
			"psrlq $8, %%mm1		\n\t" /* 00RGB0RG 2 */\
			"pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
			"pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
			"por %%mm4, %%mm1		\n\t" /* 00RGBRGB 2 */\
			"movq %%mm1, %%mm4		\n\t" /* 00RGBRGB 2 */\
			"psllq $32, %%mm1		\n\t" /* BRGB0000 2 */\
			"por %%mm1, %%mm2		\n\t" /* BRGBRGBR 1 */\
\
			"psrlq $32, %%mm4		\n\t" /* 000000RG 2.5 */\
			"movq %%mm3, %%mm5		\n\t" /* 0RGB0RGB 3 */\
			"psrlq $8, %%mm3		\n\t" /* 00RGB0RG 3 */\
			"pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
			"pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
			"por %%mm5, %%mm3		\n\t" /* 00RGBRGB 3 */\
			"psllq $16, %%mm3		\n\t" /* RGBRGB00 3 */\
			"por %%mm4, %%mm3		\n\t" /* RGBRGBRG 2.5 */\
\
			MOVNTQ(%%mm0, (dst))\
			MOVNTQ(%%mm2, 8(dst))\
			MOVNTQ(%%mm3, 16(dst))\
			"add $24, "#dst"		\n\t"\
\
			"add $8, "#index"		\n\t"\
			"cmp "#dstw", "#index"		\n\t"\
			" jb 1b				\n\t"

Michael Niedermayer's avatar
Michael Niedermayer committed
631
/*
 * WRITEBGR24MMX(dst, dstw, index): asm fragment that packs eight pixels into
 * 24 bytes of 3-byte-per-pixel output using baseline MMX shifts and unpacks.
 *
 * Expected on entry (see the register note below): mm2 = B, mm4 = G, mm5 = R,
 * mm7 = 0, eight bytes each.  All of mm0-mm7 are overwritten.
 *
 * The fragment issues three 8-byte MOVNTQ stores at dst, 8(dst) and 16(dst),
 * then advances dst by 24 and index by 8 and jumps back to the local label
 * "1" (which must be defined by the code emitted before this fragment) while
 * index is unsigned-below dstw.
 *
 * dst, dstw and index are stringified with '#', so callers must go through a
 * forwarding macro (WRITEBGR24) if the arguments themselves contain macros.
 * The per-instruction comments show the byte layout, most significant byte
 * first, followed by the index of the pixel pair being assembled.
 */
#define WRITEBGR24MMX(dst, dstw, index) \
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
			"movq %%mm2, %%mm1		\n\t" /* B */\
			"movq %%mm5, %%mm6		\n\t" /* R */\
			"punpcklbw %%mm4, %%mm2		\n\t" /* GBGBGBGB 0 */\
			"punpcklbw %%mm7, %%mm5		\n\t" /* 0R0R0R0R 0 */\
			"punpckhbw %%mm4, %%mm1		\n\t" /* GBGBGBGB 2 */\
			"punpckhbw %%mm7, %%mm6		\n\t" /* 0R0R0R0R 2 */\
			"movq %%mm2, %%mm0		\n\t" /* GBGBGBGB 0 */\
			"movq %%mm1, %%mm3		\n\t" /* GBGBGBGB 2 */\
			"punpcklwd %%mm5, %%mm0		\n\t" /* 0RGB0RGB 0 */\
			"punpckhwd %%mm5, %%mm2		\n\t" /* 0RGB0RGB 1 */\
			"punpcklwd %%mm6, %%mm1		\n\t" /* 0RGB0RGB 2 */\
			"punpckhwd %%mm6, %%mm3		\n\t" /* 0RGB0RGB 3 */\
\
			"movq %%mm0, %%mm4		\n\t" /* 0RGB0RGB 0 */\
			"movq %%mm2, %%mm6		\n\t" /* 0RGB0RGB 1 */\
			"movq %%mm1, %%mm5		\n\t" /* 0RGB0RGB 2 */\
			"movq %%mm3, %%mm7		\n\t" /* 0RGB0RGB 3 */\
\
			"psllq $40, %%mm0		\n\t" /* RGB00000 0 */\
			"psllq $40, %%mm2		\n\t" /* RGB00000 1 */\
			"psllq $40, %%mm1		\n\t" /* RGB00000 2 */\
			"psllq $40, %%mm3		\n\t" /* RGB00000 3 */\
\
			"punpckhdq %%mm4, %%mm0		\n\t" /* 0RGBRGB0 0 */\
			"punpckhdq %%mm6, %%mm2		\n\t" /* 0RGBRGB0 1 */\
			"punpckhdq %%mm5, %%mm1		\n\t" /* 0RGBRGB0 2 */\
			"punpckhdq %%mm7, %%mm3		\n\t" /* 0RGBRGB0 3 */\
\
			"psrlq $8, %%mm0		\n\t" /* 00RGBRGB 0 */\
			"movq %%mm2, %%mm6		\n\t" /* 0RGBRGB0 1 */\
			"psllq $40, %%mm2		\n\t" /* GB000000 1 */\
			"por %%mm2, %%mm0		\n\t" /* GBRGBRGB 0 */\
			MOVNTQ(%%mm0, (dst))\
\
			"psrlq $24, %%mm6		\n\t" /* 0000RGBR 1 */\
			"movq %%mm1, %%mm5		\n\t" /* 0RGBRGB0 2 */\
			"psllq $24, %%mm1		\n\t" /* BRGB0000 2 */\
			"por %%mm1, %%mm6		\n\t" /* BRGBRGBR 1 */\
			MOVNTQ(%%mm6, 8(dst))\
\
			"psrlq $40, %%mm5		\n\t" /* 000000RG 2 */\
			"psllq $8, %%mm3		\n\t" /* RGBRGB00 3 */\
			"por %%mm3, %%mm5		\n\t" /* RGBRGBRG 2 */\
			MOVNTQ(%%mm5, 16(dst))\
\
			"add $24, "#dst"		\n\t"\
\
			"add $8, "#index"			\n\t"\
			"cmp "#dstw", "#index"			\n\t"\
			" jb 1b				\n\t"

Michael Niedermayer's avatar
Michael Niedermayer committed
684
/*
 * WRITEBGR24MMX2(dst, dstw, index): same contract as WRITEBGR24MMX (eight
 * pixels -> 24 bytes at dst, dst += 24, index += 8, loop to local label "1"
 * while index is unsigned-below dstw), implemented with pshufw and the
 * M24A / M24B / M24C byte masks instead of shift/unpack chains.
 *
 * Expected on entry: mm2 = B, mm4 = G, mm5 = R.  The incoming value of mm7
 * is not used: it is overwritten with M24C by the second instruction.
 * mm0, mm1, mm3, mm6 and mm7 are scratch; mm4 is shifted right by one byte
 * in the middle of the sequence; mm2 and mm5 are only read.
 *
 * dst, dstw and index are stringified with '#', so callers must go through a
 * forwarding macro (WRITEBGR24) if the arguments themselves contain macros.
 */
#define WRITEBGR24MMX2(dst, dstw, index) \
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7 is overwritten below */\
			"movq "MANGLE(M24A)", %%mm0	\n\t"\
			"movq "MANGLE(M24C)", %%mm7	\n\t"\
			"pshufw $0x50, %%mm2, %%mm1	\n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
			"pshufw $0x50, %%mm4, %%mm3	\n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
			"pshufw $0x00, %%mm5, %%mm6	\n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
			"pand %%mm0, %%mm1		\n\t" /*    B2        B1       B0 */\
			"pand %%mm0, %%mm3		\n\t" /*    G2        G1       G0 */\
			"pand %%mm7, %%mm6		\n\t" /*       R1        R0       */\
\
			"psllq $8, %%mm3		\n\t" /* G2        G1       G0    */\
			"por %%mm1, %%mm6		\n\t"\
			"por %%mm3, %%mm6		\n\t"\
			MOVNTQ(%%mm6, (dst))\
\
			"psrlq $8, %%mm4		\n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
			"pshufw $0xA5, %%mm2, %%mm1	\n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
			"pshufw $0x55, %%mm4, %%mm3	\n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
			"pshufw $0xA5, %%mm5, %%mm6	\n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
			"pand "MANGLE(M24B)", %%mm1	\n\t" /* B5       B4        B3    */\
			"pand %%mm7, %%mm3		\n\t" /*       G4        G3       */\
			"pand %%mm0, %%mm6		\n\t" /*    R4        R3       R2 */\
\
			"por %%mm1, %%mm3		\n\t" /* B5    G4 B4     G3 B3    */\
			"por %%mm3, %%mm6		\n\t"\
			MOVNTQ(%%mm6, 8(dst))\
\
			"pshufw $0xFF, %%mm2, %%mm1	\n\t" /* B7 B6 B7 B6  B7 B6 B7 B6 */\
			"pshufw $0xFA, %%mm4, %%mm3	\n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
			"pshufw $0xFA, %%mm5, %%mm6	\n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
			"pand %%mm7, %%mm1		\n\t" /*       B7        B6       */\
			"pand %%mm0, %%mm3		\n\t" /*    G7        G6       G5 */\
			"pand "MANGLE(M24B)", %%mm6	\n\t" /* R7       R6        R5    */\
\
			"por %%mm1, %%mm3		\n\t"\
			"por %%mm3, %%mm6		\n\t"\
			MOVNTQ(%%mm6, 16(dst))\
\
			"add $24, "#dst"		\n\t"\
\
			"add $8, "#index"		\n\t"\
			"cmp "#dstw", "#index"		\n\t"\
			" jb 1b				\n\t"

/*
 * WRITEBGR24 forwards to the pshufw-based writer when MMX2 is available and
 * to the baseline MMX writer otherwise.  The extra macro level also lets the
 * arguments be macro-expanded before the writers stringify them with '#'.
 * Any definition left over from a previous inclusion is dropped first.
 */
#undef WRITEBGR24
#ifdef HAVE_MMX2
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
#else
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
#endif

740
/*
 * REAL_WRITEYUY2(dst, dstw, index): asm fragment that packs the 16-bit
 * values held in mm1/mm7 and mm3/mm4 to bytes, interleaves them and stores
 * 16 bytes at dst + 2*index (two MOVNTQ stores).  It then adds 8 to index
 * and jumps back to the local label "1" while index is unsigned-below dstw.
 *
 * NOTE(review): from the interleave order mm1/mm7 presumably carry luma and
 * mm3/mm4 the two chroma components -- confirm against YSCALEYUV2PACKED*.
 *
 * WRITEYUY2 is the entry point; the extra macro level makes sure the
 * arguments are macro-expanded before '#' stringifies them.
 */
#define REAL_WRITEYUY2(dst, dstw, index) \
			"packuswb %%mm3, %%mm3		\n\t"\
			"packuswb %%mm4, %%mm4		\n\t"\
			"packuswb %%mm7, %%mm1		\n\t"\
			"punpcklbw %%mm4, %%mm3		\n\t"\
			"movq %%mm1, %%mm7		\n\t"\
			"punpcklbw %%mm3, %%mm1		\n\t"\
			"punpckhbw %%mm3, %%mm7		\n\t"\
\
			MOVNTQ(%%mm1, (dst, index, 2))\
			MOVNTQ(%%mm7, 8(dst, index, 2))\
\
			"add $8, "#index"		\n\t"\
			"cmp "#dstw", "#index"		\n\t"\
			" jb 1b				\n\t"
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
Michael Niedermayer's avatar
Michael Niedermayer committed
756 757


Michael Niedermayer's avatar
Michael Niedermayer committed
758
/**
 * Vertically filter the intermediate luma/chroma lines into planar output.
 *
 * With HAVE_MMX the work is done by the YSCALEYUV2YV12X asm macro, run once
 * for the U plane and once for the V plane (skipped when uDest is NULL) and
 * once for the luma plane.  The asm receives &c->redDither, the destination
 * pointer and the plane width; the filter arguments are not passed to it.
 * Without MMX the call is forwarded unchanged to yuv2yuvX_altivec_real()
 * (HAVE_ALTIVEC) or yuv2yuvXinC().
 *
 * @param dest            luma destination line
 * @param uDest,vDest     chroma destination lines; uDest == NULL disables
 *                        chroma output in the MMX path
 * @param dstW,chrDstW    luma / chroma widths
 */
static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
				    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
				    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
{
#ifdef HAVE_MMX
	/* "m" operands must be addressable lvalues of register width; a cast
	   expression such as (long)dstW is not an lvalue, so widen into named
	   locals instead. */
	long dstW_reg= dstW;
	long chrDstW_reg= chrDstW;

	if(uDest != NULL)
	{
		asm volatile(
				YSCALEYUV2YV12X(0, CHR_MMX_FILTER_OFFSET)
				:: "r" (&c->redDither),
				"r" (uDest), "m" (chrDstW_reg)
				: "%"REG_a, "%"REG_d, "%"REG_S
			);

		asm volatile(
				YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET)
				:: "r" (&c->redDither),
				"r" (vDest), "m" (chrDstW_reg)
				: "%"REG_a, "%"REG_d, "%"REG_S
			);
	}

	asm volatile(
			YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET)
			:: "r" (&c->redDither),
			   "r" (dest), "m" (dstW_reg)
			: "%"REG_a, "%"REG_d, "%"REG_S
		);
#else
#ifdef HAVE_ALTIVEC
	yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
			      chrFilter, chrSrc, chrFilterSize,
			      dest, uDest, vDest, dstW, chrDstW);
#else //HAVE_ALTIVEC
	yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
		    chrFilter, chrSrc, chrFilterSize,
		    dest, uDest, vDest, dstW, chrDstW);
#endif //!HAVE_ALTIVEC
#endif
}
798

799
static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
800
				    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
801 802 803
{
#ifdef HAVE_MMX
	if(uDest != NULL)
804
	{
805 806
		asm volatile(
				YSCALEYUV2YV121
807
				:: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
808 809
				"g" ((long)-chrDstW)
				: "%"REG_a
810 811 812 813
			);

		asm volatile(
				YSCALEYUV2YV121
814
				:: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
815 816
				"g" ((long)-chrDstW)
				: "%"REG_a
817
			);
818 819
	}

820 821 822
	asm volatile(
		YSCALEYUV2YV121
		:: "r" (lumSrc + dstW), "r" (dest + dstW),
823 824
		"g" ((long)-dstW)
		: "%"REG_a
825 826 827 828
	);
#else
	int i;
	for(i=0; i<dstW; i++)
829
	{
830
		int val= lumSrc[i]>>7;
831 832 833 834 835
		
		if(val&256){
			if(val<0) val=0;
			else      val=255;
		}
836

837
		dest[i]= val;
838 839 840
	}

	if(uDest != NULL)
841
		for(i=0; i<chrDstW; i++)
842
		{
843 844 845
			int u=chrSrc[i]>>7;
			int v=chrSrc[i + 2048]>>7;

846 847 848 849 850 851 852 853 854
			if((u|v)&256){
				if(u<0)         u=0;
				else if (u>255) u=255;
				if(v<0)         v=0;
				else if (v>255) v=255;
			}

			uDest[i]= u;
			vDest[i]= v;
855
		}
856
#endif
857 858
}

859

860 861 862
/**
 * vertical scale YV12 to RGB
 */
Michael Niedermayer's avatar
Michael Niedermayer committed
863
static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
864
				    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
Michael Niedermayer's avatar
Michael Niedermayer committed
865
			    uint8_t *dest, int dstW, int dstY)
866
{
Michael Niedermayer's avatar
Michael Niedermayer committed
867
	int dummy=0;
868
	switch(c->dstFormat)
869 870
	{
#ifdef HAVE_MMX
871
	case IMGFMT_BGR32:
872 873 874
		{
			asm volatile(
				YSCALEYUV2RGBX
875
				WRITEBGR32(%4, %5, %%REGa)
876

Michael Niedermayer's avatar
Michael Niedermayer committed
877 878 879
			:: "r" (&c->redDither), 
			   "m" (dummy), "m" (dummy), "m" (dummy),
			   "r" (dest), "m" (dstW)
880
			: "%"REG_a, "%"REG_d, "%"REG_S
881 882
			);
		}
883 884
		break;
	case IMGFMT_BGR24:
885 886 887
		{
			asm volatile(
				YSCALEYUV2RGBX
888 889 890
				"lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" //FIXME optimize
				"add %4, %%"REG_b"			\n\t"
				WRITEBGR24(%%REGb, %5, %%REGa)
891

Michael Niedermayer's avatar
Michael Niedermayer committed
892 893 894
			:: "r" (&c->redDither), 
			   "m" (dummy), "m" (dummy), "m" (dummy),
			   "r" (dest), "m" (dstW)
895
			: "%"REG_a, "%"REG_b, "%"REG_d, "%"REG_S //FIXME ebx
896 897
			);
		}
898 899
		break;
	case IMGFMT_BGR15:
900 901 902 903 904
		{
			asm volatile(
				YSCALEYUV2RGBX
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
905 906 907
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g5Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
908 909
#endif

910
				WRITEBGR15(%4, %5, %%REGa)
911

Michael Niedermayer's avatar
Michael Niedermayer committed
912 913 914
			:: "r" (&c->redDither), 
			   "m" (dummy), "m" (dummy), "m" (dummy),
			   "r" (dest), "m" (dstW)
915
			: "%"REG_a, "%"REG_d, "%"REG_S
916 917
			);
		}
918 919
		break;
	case IMGFMT_BGR16:
920 921 922 923 924
		{
			asm volatile(
				YSCALEYUV2RGBX
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
925 926 927
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g6Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
928 929
#endif

930
				WRITEBGR16(%4, %5, %%REGa)
931

Michael Niedermayer's avatar
Michael Niedermayer committed
932 933 934
			:: "r" (&c->redDither), 
			   "m" (dummy), "m" (dummy), "m" (dummy),
			   "r" (dest), "m" (dstW)
935
			: "%"REG_a, "%"REG_d, "%"REG_S
936 937
			);
		}
938
		break;
Michael Niedermayer's avatar
Michael Niedermayer committed
939 940 941 942 943 944 945 946 947 948
	case IMGFMT_YUY2:
		{
			asm volatile(
				YSCALEYUV2PACKEDX
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

				"psraw $3, %%mm3		\n\t"
				"psraw $3, %%mm4		\n\t"
				"psraw $3, %%mm1		\n\t"
				"psraw $3, %%mm7		\n\t"
949
				WRITEYUY2(%4, %5, %%REGa)
Michael Niedermayer's avatar
Michael Niedermayer committed
950

Michael Niedermayer's avatar
Michael Niedermayer committed
951 952 953
			:: "r" (&c->redDither), 
			   "m" (dummy), "m" (dummy), "m" (dummy),
			   "r" (dest), "m" (dstW)
954
			: "%"REG_a, "%"REG_d, "%"REG_S
Michael Niedermayer's avatar
Michael Niedermayer committed
955 956 957
			);
		}
		break;
958
#endif
959
	default:
960 961 962 963 964
#ifdef HAVE_ALTIVEC
		altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
			    chrFilter, chrSrc, chrFilterSize,
			    dest, dstW, dstY);
#else
Michael Niedermayer's avatar
Michael Niedermayer committed
965
		yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
966 967
			    chrFilter, chrSrc, chrFilterSize,
			    dest, dstW, dstY);
968
#endif
969 970
		break;
	}
971 972 973 974 975
}

/**
 * vertical bilinear scale YV12 to RGB
 */
Michael Niedermayer's avatar
Michael Niedermayer committed
976
static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
977
			    uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
978 979 980
{
	int yalpha1=yalpha^4095;
	int uvalpha1=uvalpha^4095;
981
	int i;
982

983
#if 0 //isn't used
984
	if(flags&SWS_FULL_CHR_H_INT)
985
	{
986
		switch(dstFormat)
987
		{
988 989
#ifdef HAVE_MMX
		case IMGFMT_BGR32:
990 991 992 993 994 995 996 997 998 999 1000
			asm volatile(


FULL_YSCALEYUV2RGB
			"punpcklbw %%mm1, %%mm3		\n\t" // BGBGBGBG
			"punpcklbw %%mm7, %%mm0		\n\t" // R0R0R0R0

			"movq %%mm3, %%mm1		\n\t"
			"punpcklwd %%mm0, %%mm3		\n\t" // BGR0BGR0
			"punpckhwd %%mm0, %%mm1		\n\t" // BGR0BGR0

1001 1002
			MOVNTQ(%%mm3, (%4, %%REGa, 4))
			MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1003

1004 1005
			"add $4, %%"REG_a"		\n\t"
			"cmp %5, %%"REG_a"		\n\t"
1006 1007 1008
			" jb 1b				\n\t"


1009
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1010
			"m" (yalpha1), "m" (uvalpha1)
1011
			: "%"REG_a
1012
			);
1013 1014
			break;
		case IMGFMT_BGR24:
1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028
			asm volatile(

FULL_YSCALEYUV2RGB

								// lsb ... msb
			"punpcklbw %%mm1, %%mm3		\n\t" // BGBGBGBG
			"punpcklbw %%mm7, %%mm0		\n\t" // R0R0R0R0

			"movq %%mm3, %%mm1		\n\t"
			"punpcklwd %%mm0, %%mm3		\n\t" // BGR0BGR0
			"punpckhwd %%mm0, %%mm1		\n\t" // BGR0BGR0

			"movq %%mm3, %%mm2		\n\t" // BGR0BGR0
			"psrlq $8, %%mm3		\n\t" // GR0BGR00
1029 1030
			"pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
			"pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
1031 1032 1033 1034 1035 1036 1037 1038 1039 1040
			"por %%mm2, %%mm3		\n\t" // BGRBGR00
			"movq %%mm1, %%mm2		\n\t"
			"psllq $48, %%mm1		\n\t" // 000000BG
			"por %%mm1, %%mm3		\n\t" // BGRBGRBG

			"movq %%mm2, %%mm1		\n\t" // BGR0BGR0
			"psrld $16, %%mm2		\n\t" // R000R000
			"psrlq $24, %%mm1		\n\t" // 0BGR0000
			"por %%mm2, %%mm1		\n\t" // RBGRR000

1041 1042
			"mov %4, %%"REG_b"		\n\t"
			"add %%"REG_a", %%"REG_b"	\n\t"
1043 1044 1045

#ifdef HAVE_MMX2
			//FIXME Alignment
1046 1047
			"movntq %%mm3, (%%"REG_b", %%"REG_a", 2)\n\t"
			"movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)\n\t"
1048
#else
1049
			"movd %%mm3, (%%"REG_b", %%"REG_a", 2)	\n\t"
1050
			"psrlq $32, %%mm3		\n\t"
1051 1052
			"movd %%mm3, 4(%%"REG_b", %%"REG_a", 2)	\n\t"
			"movd %%mm1, 8(%%"REG_b", %%"REG_a", 2)	\n\t"
1053
#endif
1054 1055
			"add $4, %%"REG_a"		\n\t"
			"cmp %5, %%"REG_a"		\n\t"
1056 1057
			" jb 1b				\n\t"

Michael Niedermayer's avatar
Michael Niedermayer committed
1058
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1059
			"m" (yalpha1), "m" (uvalpha1)
1060
			: "%"REG_a, "%"REG_b
1061
			);
1062 1063
			break;
		case IMGFMT_BGR15:
1064 1065 1066 1067
			asm volatile(

FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
1068 1069 1070
			"paddusb "MANGLE(g5Dither)", %%mm1\n\t"
			"paddusb "MANGLE(r5Dither)", %%mm0\n\t"
			"paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1071 1072 1073 1074 1075 1076 1077 1078
#endif
			"punpcklbw %%mm7, %%mm1		\n\t" // 0G0G0G0G
			"punpcklbw %%mm7, %%mm3		\n\t" // 0B0B0B0B
			"punpcklbw %%mm7, %%mm0		\n\t" // 0R0R0R0R

			"psrlw $3, %%mm3		\n\t"
			"psllw $2, %%mm1		\n\t"
			"psllw $7, %%mm0		\n\t"
1079 1080
			"pand "MANGLE(g15Mask)", %%mm1	\n\t"
			"pand "MANGLE(r15Mask)", %%mm0	\n\t"
1081 1082 1083 1084

			"por %%mm3, %%mm1		\n\t"
			"por %%mm1, %%mm0		\n\t"

1085
			MOVNTQ(%%mm0, (%4, %%REGa, 2))
1086

1087 1088
			"add $4, %%"REG_a"		\n\t"
			"cmp %5, %%"REG_a"		\n\t"
1089 1090
			" jb 1b				\n\t"

Michael Niedermayer's avatar
Michael Niedermayer committed
1091
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1092
			"m" (yalpha1), "m" (uvalpha1)
1093
			: "%"REG_a
1094
			);
1095 1096
			break;
		case IMGFMT_BGR16:
1097 1098 1099 1100
			asm volatile(

FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
1101 1102 1103
			"paddusb "MANGLE(g6Dither)", %%mm1\n\t"
			"paddusb "MANGLE(r5Dither)", %%mm0\n\t"
			"paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1104 1105 1106 1107 1108 1109 1110 1111
#endif
			"punpcklbw %%mm7, %%mm1		\n\t" // 0G0G0G0G
			"punpcklbw %%mm7, %%mm3		\n\t" // 0B0B0B0B
			"punpcklbw %%mm7, %%mm0		\n\t" // 0R0R0R0R

			"psrlw $3, %%mm3		\n\t"
			"psllw $3, %%mm1		\n\t"
			"psllw $8, %%mm0		\n\t"
1112 1113
			"pand "MANGLE(g16Mask)", %%mm1	\n\t"
			"pand "MANGLE(r16Mask)", %%mm0	\n\t"
1114 1115 1116 1117

			"por %%mm3, %%mm1		\n\t"
			"por %%mm1, %%mm0		\n\t"

1118
			MOVNTQ(%%mm0, (%4, %%REGa, 2))
1119

1120 1121
			"add $4, %%"REG_a"		\n\t"
			"cmp %5, %%"REG_a"		\n\t"
1122 1123
			" jb 1b				\n\t"

Michael Niedermayer's avatar
Michael Niedermayer committed
1124
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1125
			"m" (yalpha1), "m" (uvalpha1)
1126
			: "%"REG_a
1127
			);
1128 1129 1130 1131 1132 1133
		break;
#endif
		case IMGFMT_RGB32:
#ifndef HAVE_MMX
		case IMGFMT_BGR32:
#endif
1134 1135
		if(dstFormat==IMGFMT_BGR32)
		{
Michael Niedermayer's avatar
Michael Niedermayer committed
1136
			int i;
1137 1138 1139
#ifdef WORDS_BIGENDIAN
			dest++;
#endif
1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151
			for(i=0;i<dstW;i++){
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
				int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
				dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
				dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
				dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
				dest+= 4;
			}
		}
		else if(dstFormat==IMGFMT_BGR24)
1152
		{
Michael Niedermayer's avatar
Michael Niedermayer committed
1153
			int i;
Michael Niedermayer's avatar
Michael Niedermayer committed
1154
			for(i=0;i<dstW;i++){
1155 1156 1157 1158
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
				int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
Michael Niedermayer's avatar
Michael Niedermayer committed
1159 1160 1161
				dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
				dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
				dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1162
				dest+= 3;
1163 1164
			}
		}
1165
		else if(dstFormat==IMGFMT_BGR16)
1166
		{
Michael Niedermayer's avatar
Michael Niedermayer committed
1167
			int i;
Michael Niedermayer's avatar
Michael Niedermayer committed
1168
			for(i=0;i<dstW;i++){
1169 1170 1171 1172 1173
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
				int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);

1174
				((uint16_t*)dest)[i] =
1175 1176 1177
					clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
					clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
					clip_table16r[(Y + yuvtab_3343[V]) >>13];
1178 1179
			}
		}
1180
		else if(dstFormat==IMGFMT_BGR15)
1181
		{
Michael Niedermayer's avatar
Michael Niedermayer committed
1182
			int i;
Michael Niedermayer's avatar
Michael Niedermayer committed
1183
			for(i=0;i<dstW;i++){
1184 1185 1186 1187 1188
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
				int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);

1189
				((uint16_t*)dest)[i] =
1190 1191 1192
					clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
					clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
					clip_table15r[(Y + yuvtab_3343[V]) >>13];
1193 1194 1195 1196 1197
			}
		}
	}//FULL_UV_IPOL
	else
	{
1198
#endif // if 0
1199
#ifdef HAVE_MMX
1200 1201
	switch(c->dstFormat)
	{
1202
//Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1203
	case IMGFMT_BGR32:
1204
			asm volatile(
1205 1206 1207 1208 1209
				"mov %%"REG_SP", "ESP_OFFSET"(%5)	\n\t"
				"mov %4, %%"REG_SP"			\n\t"
				YSCALEYUV2RGB(%%REGa, %5)
				WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
				"mov "ESP_OFFSET"(%5), %%"REG_SP"	\n\t"
Michael Niedermayer's avatar
Michael Niedermayer committed
1210 1211 1212

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
1213
			: "%"REG_a
1214
			);
1215 1216
			return;
	case IMGFMT_BGR24:
1217
			asm volatile(
1218 1219 1220 1221 1222
				"mov %%"REG_SP", "ESP_OFFSET"(%5)	\n\t"
				"mov %4, %%"REG_SP"			\n\t"
				YSCALEYUV2RGB(%%REGa, %5)
				WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
				"mov "ESP_OFFSET"(%5), %%"REG_SP"	\n\t"
Michael Niedermayer's avatar
Michael Niedermayer committed
1223 1224
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
1225
			: "%"REG_a
1226
			);
1227 1228
			return;
	case IMGFMT_BGR15:
1229
			asm volatile(
1230 1231 1232
				"mov %%"REG_SP", "ESP_OFFSET"(%5)	\n\t"
				"mov %4, %%"REG_SP"			\n\t"
				YSCALEYUV2RGB(%%REGa, %5)
1233 1234
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
1235 1236 1237
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g5Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1238 1239
#endif

1240 1241
				WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
				"mov "ESP_OFFSET"(%5), %%"REG_SP"	\n\t"
1242

Michael Niedermayer's avatar
Michael Niedermayer committed
1243 1244
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
1245
			: "%"REG_a
1246
			);
1247 1248
			return;
	case IMGFMT_BGR16:
1249
			asm volatile(
1250 1251 1252
				"mov %%"REG_SP", "ESP_OFFSET"(%5)	\n\t"
				"mov %4, %%"REG_SP"			\n\t"
				YSCALEYUV2RGB(%%REGa, %5)
1253 1254
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
1255 1256 1257
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g6Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1258 1259
#endif

1260 1261
				WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
				"mov "ESP_OFFSET"(%5), %%"REG_SP"	\n\t"
Michael Niedermayer's avatar
Michael Niedermayer committed
1262 1263
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
1264
			: "%"REG_a
Michael Niedermayer's avatar
Michael Niedermayer committed
1265 1266 1267 1268
			);
			return;
	case IMGFMT_YUY2:
			asm volatile(
1269 1270 1271 1272 1273
				"mov %%"REG_SP", "ESP_OFFSET"(%5)	\n\t"
				"mov %4, %%"REG_SP"			\n\t"
				YSCALEYUV2PACKED(%%REGa, %5)
				WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
				"mov "ESP_OFFSET"(%5), %%"REG_SP"	\n\t"
Michael Niedermayer's avatar
Michael Niedermayer committed
1274 1275
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
1276
			: "%"REG_a
1277
			);
1278 1279 1280 1281
			return;
	default: break;
	}
#endif //HAVE_MMX
Michael Niedermayer's avatar
Michael Niedermayer committed
1282
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
1283 1284 1285 1286 1287
}

/**
 * YV12 to RGB without scaling or interpolating
 */
Michael Niedermayer's avatar
Michael Niedermayer committed
1288
static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1289
			    uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1290
{
1291
	const int yalpha1=0;
1292 1293 1294 1295
	int i;
	
	uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
	const int yalpha= 4096; //FIXME ...
Michael Niedermayer's avatar
Michael Niedermayer committed
1296

1297
	if(flags&SWS_FULL_CHR_H_INT)
1298
	{
Michael Niedermayer's avatar
Michael Niedermayer committed
1299
		RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1300 1301
		return;
	}
Michael Niedermayer's avatar
Michael Niedermayer committed
1302 1303

#ifdef HAVE_MMX
1304 1305
	if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster
	{
1306
		switch(dstFormat)
1307
		{
1308
		case IMGFMT_BGR32:
1309
			asm volatile(
1310 1311 1312 1313 1314
				"mov %%"REG_SP", "ESP_OFFSET"(%5)	\n\t"
				"mov %4, %%"REG_SP"			\n\t"
				YSCALEYUV2RGB1(%%REGa, %5)
				WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
				"mov "ESP_OFFSET"(%5), %%"REG_SP"	\n\t"
Michael Niedermayer's avatar
Michael Niedermayer committed
1315 1316 1317

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
1318
			: "%"REG_a
1319
			);
1320 1321
			return;
		case IMGFMT_BGR24:
1322
			asm volatile(
1323 1324 1325 1326 1327
				"mov %%"REG_SP", "ESP_OFFSET"(%5)	\n\t"
				"mov %4, %%"REG_SP"			\n\t"
				YSCALEYUV2RGB1(%%REGa, %5)
				WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
				"mov "ESP_OFFSET"(%5), %%"REG_SP"	\n\t"
Michael Niedermayer's avatar
Michael Niedermayer committed
1328 1329 1330

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
1331
			: "%"REG_a
1332
			);
1333 1334
			return;
		case IMGFMT_BGR15:
1335
			asm volatile(
1336 1337 1338
				"mov %%"REG_SP", "ESP_OFFSET"(%5)	\n\t"
				"mov %4, %%"REG_SP"			\n\t"
				YSCALEYUV2RGB1(%%REGa, %5)
1339 1340
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
1341 1342 1343
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g5Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1344
#endif
1345 1346
				WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
				"mov "ESP_OFFSET"(%5), %%"REG_SP"	\n\t"
Michael Niedermayer's avatar
Michael Niedermayer committed
1347 1348 1349

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
1350
			: "%"REG_a
1351
			);
1352 1353
			return;
		case IMGFMT_BGR16:
1354
			asm volatile(
1355 1356 1357
				"mov %%"REG_SP", "ESP_OFFSET"(%5)	\n\t"
				"mov %4, %%"REG_SP"			\n\t"
				YSCALEYUV2RGB1(%%REGa, %5)
1358 1359
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
1360 1361 1362
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g6Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1363 1364
#endif

1365 1366
				WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
				"mov "ESP_OFFSET"(%5), %%"REG_SP"	\n\t"
Michael Niedermayer's avatar
Michael Niedermayer committed
1367 1368 1369

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
1370
			: "%"REG_a
1371
			);
1372
			return;
Michael Niedermayer's avatar
Michael Niedermayer committed
1373 1374
		case IMGFMT_YUY2:
			asm volatile(
1375 1376 1377 1378 1379
				"mov %%"REG_SP", "ESP_OFFSET"(%5)	\n\t"
				"mov %4, %%"REG_SP"			\n\t"
				YSCALEYUV2PACKED1(%%REGa, %5)
				WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
				"mov "ESP_OFFSET"(%5), %%"REG_SP"	\n\t"
Michael Niedermayer's avatar
Michael Niedermayer committed
1380 1381 1382

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
1383
			: "%"REG_a
Michael Niedermayer's avatar
Michael Niedermayer committed
1384 1385
			);
			return;
1386
		}
1387 1388 1389
	}
	else
	{
1390
		switch(dstFormat)
1391
		{
1392
		case IMGFMT_BGR32:
1393
			asm volatile(
1394 1395 1396 1397 1398
				"mov %%"REG_SP", "ESP_OFFSET"(%5)	\n\t"
				"mov %4, %%"REG_SP"			\n\t"
				YSCALEYUV2RGB1b(%%REGa, %5)
				WRITEBGR32(%%REGSP, 8280(%5), %%REGa)
				"mov "ESP_OFFSET"(%5), %%"REG_SP"	\n\t"
Michael Niedermayer's avatar
Michael Niedermayer committed
1399 1400 1401

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
1402
			: "%"REG_a
1403
			);
1404 1405
			return;
		case IMGFMT_BGR24:
1406
			asm volatile(
1407 1408 1409 1410 1411
				"mov %%"REG_SP", "ESP_OFFSET"(%5)	\n\t"
				"mov %4, %%"REG_SP"			\n\t"
				YSCALEYUV2RGB1b(%%REGa, %5)
				WRITEBGR24(%%REGSP, 8280(%5), %%REGa)
				"mov "ESP_OFFSET"(%5), %%"REG_SP"	\n\t"
Michael Niedermayer's avatar
Michael Niedermayer committed
1412 1413 1414

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
1415
			: "%"REG_a
1416
			);
1417 1418
			return;
		case IMGFMT_BGR15:
1419
			asm volatile(
1420 1421 1422
				"mov %%"REG_SP", "ESP_OFFSET"(%5)	\n\t"
				"mov %4, %%"REG_SP"			\n\t"
				YSCALEYUV2RGB1b(%%REGa, %5)
1423 1424
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
1425 1426 1427
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g5Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1428
#endif
1429 1430
				WRITEBGR15(%%REGSP, 8280(%5), %%REGa)
				"mov "ESP_OFFSET"(%5), %%"REG_SP"	\n\t"
Michael Niedermayer's avatar
Michael Niedermayer committed
1431 1432 1433

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
1434
			: "%"REG_a
1435
			);
1436 1437
			return;
		case IMGFMT_BGR16:
1438
			asm volatile(
1439 1440 1441
				"mov %%"REG_SP", "ESP_OFFSET"(%5)	\n\t"
				"mov %4, %%"REG_SP"			\n\t"
				YSCALEYUV2RGB1b(%%REGa, %5)
1442 1443
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
1444 1445 1446
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g6Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1447
#endif
1448

1449 1450
				WRITEBGR16(%%REGSP, 8280(%5), %%REGa)
				"mov "ESP_OFFSET"(%5), %%"REG_SP"	\n\t"
Michael Niedermayer's avatar
Michael Niedermayer committed
1451 1452 1453

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
1454
			: "%"REG_a
1455
			);
1456
			return;
Michael Niedermayer's avatar
Michael Niedermayer committed
1457 1458
		case IMGFMT_YUY2:
			asm volatile(
1459 1460 1461 1462 1463
				"mov %%"REG_SP", "ESP_OFFSET"(%5)	\n\t"
				"mov %4, %%"REG_SP"			\n\t"
				YSCALEYUV2PACKED1b(%%REGa, %5)
				WRITEYUY2(%%REGSP, 8280(%5), %%REGa)
				"mov "ESP_OFFSET"(%5), %%"REG_SP"	\n\t"
Michael Niedermayer's avatar
Michael Niedermayer committed
1464 1465 1466

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest),
			"r" (&c->redDither)
1467
			: "%"REG_a
Michael Niedermayer's avatar
Michael Niedermayer committed
1468 1469
			);
			return;
1470
		}
1471
	}
1472
#endif
1473
	if( uvalpha < 2048 )
1474
	{
Michael Niedermayer's avatar
Michael Niedermayer committed
1475
		YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
1476
	}else{
Michael Niedermayer's avatar
Michael Niedermayer committed
1477
		YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
1478
	}
1479 1480
}

1481 1482
//FIXME yuy2* can read up to 7 samples too many

1483 1484
/**
 * Extract the luma samples from a packed line: dst[i] = src[2*i].
 *
 * The MMX path masks every 16-bit word with bm01010101, packs the result to
 * bytes and handles 8 output samples (16 source bytes) per iteration.  It
 * indexes from the end of both buffers with a negative counter that runs up
 * to zero, so when width is not a multiple of 8 the last iteration reads and
 * writes past the requested width.
 *
 * @param dst    output, width bytes
 * @param src    packed input, 2*width bytes
 * @param width  number of luma samples
 */
static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, int width)
{
#ifdef HAVE_MMX
	asm volatile(
		"movq "MANGLE(bm01010101)", %%mm2\n\t"
		"mov %0, %%"REG_a"		\n\t"
		"1:				\n\t"
		"movq (%1, %%"REG_a",2), %%mm0	\n\t"
		"movq 8(%1, %%"REG_a",2), %%mm1	\n\t"
		"pand %%mm2, %%mm0		\n\t"
		"pand %%mm2, %%mm1		\n\t"
		"packuswb %%mm1, %%mm0		\n\t"
		"movq %%mm0, (%2, %%"REG_a")	\n\t"
		"add $8, %%"REG_a"		\n\t"
		" js 1b				\n\t"
		: : "g" ((long)-width), "r" (src+width*2), "r" (dst+width)
		: "%"REG_a
	);
#else
	int i;
	for(i=0; i<width; i++)
		dst[i]= src[2*i];
#endif
}

/**
 * Extract chroma from two packed lines, averaging them vertically.
 *
 * For every output sample i, U is taken from byte 4*i+1 and V from byte
 * 4*i+3 of each source line.  The C path computes (a+b)>>1; the
 * MMX2/3DNow path averages with PAVGB, whose result can differ from the
 * truncating C average by one.  The asm produces 4 output samples per
 * iteration and indexes from the end of the buffers with a negative counter
 * that runs up to zero, so it can run past width when width is not a
 * multiple of 4.
 *
 * @param dstU,dstV   outputs, width bytes each
 * @param src1,src2   the two packed input lines, 4*width bytes each
 * @param width       number of chroma samples
 */
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	asm volatile(
		"movq "MANGLE(bm01010101)", %%mm4\n\t"
		"mov %0, %%"REG_a"		\n\t"
		"1:				\n\t"
		"movq (%1, %%"REG_a",4), %%mm0	\n\t"
		"movq 8(%1, %%"REG_a",4), %%mm1	\n\t"
		"movq (%2, %%"REG_a",4), %%mm2	\n\t"
		"movq 8(%2, %%"REG_a",4), %%mm3	\n\t"
		PAVGB(%%mm2, %%mm0)
		PAVGB(%%mm3, %%mm1)
		"psrlw $8, %%mm0		\n\t"
		"psrlw $8, %%mm1		\n\t"
		"packuswb %%mm1, %%mm0		\n\t"
		"movq %%mm0, %%mm1		\n\t"
		"psrlw $8, %%mm0		\n\t"
		"pand %%mm4, %%mm1		\n\t"
		"packuswb %%mm0, %%mm0		\n\t"
		"packuswb %%mm1, %%mm1		\n\t"
		"movd %%mm0, (%4, %%"REG_a")	\n\t"
		"movd %%mm1, (%3, %%"REG_a")	\n\t"
		"add $4, %%"REG_a"		\n\t"
		" js 1b				\n\t"
		: : "g" ((long)-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
		: "%"REG_a
	);
#else
	int i;
	for(i=0; i<width; i++)
	{
		dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
		dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
	}
#endif
}

Michael Niedermayer's avatar
Michael Niedermayer committed
1546 1547 1548 1549 1550
//this is almost identical to the previous, and exists only because yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses
/**
 * Extract the luma samples of one packed line: dst[i] = src[2*i+1].
 *
 * @param dst   output buffer, receives width bytes
 * @param src   packed input line, 2 bytes per output sample
 * @param width number of luma samples to produce
 *
 * The MMX path produces 8 output bytes per iteration (16 source bytes), so
 * when width is not a multiple of 8 it reads and writes past the nominal end.
 */
static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, int width)
{
#ifdef HAVE_MMX
	asm volatile(
		"mov %0, %%"REG_a"		\n\t" // index runs from -width up to 0
		"1:				\n\t"
		"movq (%1, %%"REG_a",2), %%mm0	\n\t"
		"movq 8(%1, %%"REG_a",2), %%mm1	\n\t"
		"psrlw $8, %%mm0		\n\t" // keep the odd byte of every word
		"psrlw $8, %%mm1		\n\t"
		"packuswb %%mm1, %%mm0		\n\t"
		"movq %%mm0, (%2, %%"REG_a")	\n\t"
		"add $8, %%"REG_a"		\n\t"
		" js 1b				\n\t"
		: : "g" ((long)-width), "r" (src+width*2), "r" (dst+width)
		: "%"REG_a, "memory" // the asm stores to dst
	);
#else
	int i;
	for(i=0; i<width; i++)
		dst[i]= src[2*i+1];
#endif
}

/**
 * Extract chroma from two packed lines, averaging them vertically.
 * Per output sample: U comes from byte 0 and V from byte 2 of each 4-byte
 * group, averaged between src1 and src2.
 *
 * @param dstU  output U samples (width bytes)
 * @param dstV  output V samples (width bytes)
 * @param src1  first packed input line, 4 bytes per output sample
 * @param src2  second packed input line, same layout
 * @param width number of chroma samples to produce
 *
 * The asm path produces 4 samples per iteration, so a width that is not a
 * multiple of 4 is rounded up. It averages with PAVGB while the C fallback
 * uses (a+b)>>1.
 * NOTE(review): PAVGB presumably rounds up, so results may differ by one -- confirm.
 */
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	asm volatile(
		"movq "MANGLE(bm01010101)", %%mm4\n\t"
		"mov %0, %%"REG_a"		\n\t" // index runs from -width up to 0
		"1:				\n\t"
		"movq (%1, %%"REG_a",4), %%mm0	\n\t"
		"movq 8(%1, %%"REG_a",4), %%mm1	\n\t"
		"movq (%2, %%"REG_a",4), %%mm2	\n\t"
		"movq 8(%2, %%"REG_a",4), %%mm3	\n\t"
		PAVGB(%%mm2, %%mm0)
		PAVGB(%%mm3, %%mm1)
		"pand %%mm4, %%mm0		\n\t" // keep the even bytes (chroma)
		"pand %%mm4, %%mm1		\n\t"
		"packuswb %%mm1, %%mm0		\n\t"
		"movq %%mm0, %%mm1		\n\t"
		"psrlw $8, %%mm0		\n\t"
		"pand %%mm4, %%mm1		\n\t"
		"packuswb %%mm0, %%mm0		\n\t"
		"packuswb %%mm1, %%mm1		\n\t"
		"movd %%mm0, (%4, %%"REG_a")	\n\t" // dstV
		"movd %%mm1, (%3, %%"REG_a")	\n\t" // dstU
		"add $4, %%"REG_a"		\n\t"
		" js 1b				\n\t"
		: : "g" ((long)-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
		: "%"REG_a, "memory" // the asm stores to dstU/dstV
	);
#else
	int i;
	for(i=0; i<width; i++)
	{
		dstU[i]= (src1[4*i + 0] + src2[4*i + 0])>>1;
		dstV[i]= (src1[4*i + 2] + src2[4*i + 2])>>1;
	}
#endif
}

1609 1610 1611 1612 1613 1614 1615
static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
{
#ifdef HAVE_MMXFIXME
#else
	int i;
	for(i=0; i<width; i++)
	{
1616 1617
		int b=  ((uint32_t*)src)[i]&0xFF;
		int g= (((uint32_t*)src)[i]>>8)&0xFF;
Michael Niedermayer's avatar
Michael Niedermayer committed
1618
		int r= (((uint32_t*)src)[i]>>16)&0xFF;
1619

1620
		dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631
	}
#endif
}

static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
#ifdef HAVE_MMXFIXME
#else
	int i;
	for(i=0; i<width; i++)
	{
1632 1633 1634 1635 1636 1637 1638 1639 1640
		const int a= ((uint32_t*)src1)[2*i+0];
		const int e= ((uint32_t*)src1)[2*i+1];
		const int c= ((uint32_t*)src2)[2*i+0];
		const int d= ((uint32_t*)src2)[2*i+1];
		const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
		const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
 		const int b=  l&0x3FF;
		const int g=  h>>8;
		const int r=  l>>16;
1641 1642 1643 1644 1645 1646 1647 1648 1649

		dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
		dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
	}
#endif
}

static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, int width)
{
Michael Niedermayer's avatar
Michael Niedermayer committed
1650 1651
#ifdef HAVE_MMX
	asm volatile(
1652
		"mov %2, %%"REG_a"		\n\t"
Felix Bünemann's avatar
Felix Bünemann committed
1653 1654
		"movq "MANGLE(bgr2YCoeff)", %%mm6		\n\t"
		"movq "MANGLE(w1111)", %%mm5		\n\t"
Michael Niedermayer's avatar
Michael Niedermayer committed
1655
		"pxor %%mm7, %%mm7		\n\t"
1656
		"lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
Michael Niedermayer's avatar
Michael Niedermayer committed
1657 1658
		".balign 16			\n\t"
		"1:				\n\t"
1659 1660 1661
		PREFETCH" 64(%0, %%"REG_b")	\n\t"
		"movd (%0, %%"REG_b"), %%mm0	\n\t"
		"movd 3(%0, %%"REG_b"), %%mm1	\n\t"
Michael Niedermayer's avatar
Michael Niedermayer committed
1662 1663
		"punpcklbw %%mm7, %%mm0		\n\t"
		"punpcklbw %%mm7, %%mm1		\n\t"
1664 1665
		"movd 6(%0, %%"REG_b"), %%mm2	\n\t"
		"movd 9(%0, %%"REG_b"), %%mm3	\n\t"
Michael Niedermayer's avatar
Michael Niedermayer committed
1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684
		"punpcklbw %%mm7, %%mm2		\n\t"
		"punpcklbw %%mm7, %%mm3		\n\t"
		"pmaddwd %%mm6, %%mm0		\n\t"
		"pmaddwd %%mm6, %%mm1		\n\t"
		"pmaddwd %%mm6, %%mm2		\n\t"
		"pmaddwd %%mm6, %%mm3		\n\t"
#ifndef FAST_BGR2YV12
		"psrad $8, %%mm0		\n\t"
		"psrad $8, %%mm1		\n\t"
		"psrad $8, %%mm2		\n\t"
		"psrad $8, %%mm3		\n\t"
#endif
		"packssdw %%mm1, %%mm0		\n\t"
		"packssdw %%mm3, %%mm2		\n\t"
		"pmaddwd %%mm5, %%mm0		\n\t"
		"pmaddwd %%mm5, %%mm2		\n\t"
		"packssdw %%mm2, %%mm0		\n\t"
		"psraw $7, %%mm0		\n\t"

1685 1686
		"movd 12(%0, %%"REG_b"), %%mm4	\n\t"
		"movd 15(%0, %%"REG_b"), %%mm1	\n\t"
Michael Niedermayer's avatar
Michael Niedermayer committed
1687 1688
		"punpcklbw %%mm7, %%mm4		\n\t"
		"punpcklbw %%mm7, %%mm1		\n\t"
1689 1690
		"movd 18(%0, %%"REG_b"), %%mm2	\n\t"
		"movd 21(%0, %%"REG_b"), %%mm3	\n\t"
Michael Niedermayer's avatar
Michael Niedermayer committed
1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706
		"punpcklbw %%mm7, %%mm2		\n\t"
		"punpcklbw %%mm7, %%mm3		\n\t"
		"pmaddwd %%mm6, %%mm4		\n\t"
		"pmaddwd %%mm6, %%mm1		\n\t"
		"pmaddwd %%mm6, %%mm2		\n\t"
		"pmaddwd %%mm6, %%mm3		\n\t"
#ifndef FAST_BGR2YV12
		"psrad $8, %%mm4		\n\t"
		"psrad $8, %%mm1		\n\t"
		"psrad $8, %%mm2		\n\t"
		"psrad $8, %%mm3		\n\t"
#endif
		"packssdw %%mm1, %%mm4		\n\t"
		"packssdw %%mm3, %%mm2		\n\t"
		"pmaddwd %%mm5, %%mm4		\n\t"
		"pmaddwd %%mm5, %%mm2		\n\t"
1707
		"add $24, %%"REG_b"		\n\t"
Michael Niedermayer's avatar
Michael Niedermayer committed
1708 1709 1710 1711
		"packssdw %%mm2, %%mm4		\n\t"
		"psraw $7, %%mm4		\n\t"

		"packuswb %%mm4, %%mm0		\n\t"
Felix Bünemann's avatar
Felix Bünemann committed
1712
		"paddusb "MANGLE(bgr2YOffset)", %%mm0	\n\t"
Michael Niedermayer's avatar
Michael Niedermayer committed
1713

1714 1715
		"movq %%mm0, (%1, %%"REG_a")	\n\t"
		"add $8, %%"REG_a"		\n\t"
Michael Niedermayer's avatar
Michael Niedermayer committed
1716
		" js 1b				\n\t"
1717 1718
		: : "r" (src+width*3), "r" (dst+width), "g" ((long)-width)
		: "%"REG_a, "%"REG_b
Michael Niedermayer's avatar
Michael Niedermayer committed
1719
	);
1720 1721 1722 1723 1724 1725 1726 1727
#else
	int i;
	for(i=0; i<width; i++)
	{
		int b= src[i*3+0];
		int g= src[i*3+1];
		int r= src[i*3+2];

Michael Niedermayer's avatar
Michael Niedermayer committed
1728
		dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1729 1730 1731 1732 1733 1734
	}
#endif
}

/**
 * Convert two lines of packed 24-bit pixels (byte order B, G, R) to chroma
 * with 2x2 averaging: every output sample is built from two horizontally
 * adjacent pixels on each of src1 and src2.
 *
 * @param dstU  output U samples (width bytes)
 * @param dstV  output V samples (width bytes)
 * @param src1  first input line, 6 bytes per output sample
 * @param src2  second input line, same layout
 * @param width number of chroma samples to produce
 *
 * The MMX path produces 4 samples (24 source bytes per line) per iteration,
 * so a width that is not a multiple of 4 is rounded up. With MMX2/3DNow the
 * 2x2 average is formed by two PAVGB steps, otherwise by summing four
 * unpacked pixels and shifting right by 2.
 * The coefficient/offset constants (bgr2UCoeff, bgr2VCoeff, w1111,
 * bgr2UVOffset) are defined elsewhere.
 */
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
#ifdef HAVE_MMX
	asm volatile(
		"mov %4, %%"REG_a"		\n\t" // output index, -width .. 0
		"movq "MANGLE(w1111)", %%mm5		\n\t"
		"movq "MANGLE(bgr2UCoeff)", %%mm6		\n\t"
		"pxor %%mm7, %%mm7		\n\t"
		"lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"	\n\t"
		"add %%"REG_b", %%"REG_b"	\n\t" // source index = 6*output index
		".balign 16			\n\t"
		"1:				\n\t"
		PREFETCH" 64(%0, %%"REG_b")	\n\t"
		PREFETCH" 64(%1, %%"REG_b")	\n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
		"movq (%0, %%"REG_b"), %%mm0	\n\t"
		"movq (%1, %%"REG_b"), %%mm1	\n\t"
		"movq 6(%0, %%"REG_b"), %%mm2	\n\t"
		"movq 6(%1, %%"REG_b"), %%mm3	\n\t"
		PAVGB(%%mm1, %%mm0)
		PAVGB(%%mm3, %%mm2)
		"movq %%mm0, %%mm1		\n\t"
		"movq %%mm2, %%mm3		\n\t"
		"psrlq $24, %%mm0		\n\t"
		"psrlq $24, %%mm2		\n\t"
		PAVGB(%%mm1, %%mm0)
		PAVGB(%%mm3, %%mm2)
		"punpcklbw %%mm7, %%mm0		\n\t"
		"punpcklbw %%mm7, %%mm2		\n\t"
#else
		"movd (%0, %%"REG_b"), %%mm0	\n\t"
		"movd (%1, %%"REG_b"), %%mm1	\n\t"
		"movd 3(%0, %%"REG_b"), %%mm2	\n\t"
		"movd 3(%1, %%"REG_b"), %%mm3	\n\t"
		"punpcklbw %%mm7, %%mm0		\n\t"
		"punpcklbw %%mm7, %%mm1		\n\t"
		"punpcklbw %%mm7, %%mm2		\n\t"
		"punpcklbw %%mm7, %%mm3		\n\t"
		"paddw %%mm1, %%mm0		\n\t"
		"paddw %%mm3, %%mm2		\n\t"
		"paddw %%mm2, %%mm0		\n\t"
		"movd 6(%0, %%"REG_b"), %%mm4	\n\t"
		"movd 6(%1, %%"REG_b"), %%mm1	\n\t"
		"movd 9(%0, %%"REG_b"), %%mm2	\n\t"
		"movd 9(%1, %%"REG_b"), %%mm3	\n\t"
		"punpcklbw %%mm7, %%mm4		\n\t"
		"punpcklbw %%mm7, %%mm1		\n\t"
		"punpcklbw %%mm7, %%mm2		\n\t"
		"punpcklbw %%mm7, %%mm3		\n\t"
		"paddw %%mm1, %%mm4		\n\t"
		"paddw %%mm3, %%mm2		\n\t"
		"paddw %%mm4, %%mm2		\n\t"
		"psrlw $2, %%mm0		\n\t"
		"psrlw $2, %%mm2		\n\t"
#endif
		"movq "MANGLE(bgr2VCoeff)", %%mm1		\n\t"
		"movq "MANGLE(bgr2VCoeff)", %%mm3		\n\t"

		"pmaddwd %%mm0, %%mm1		\n\t"
		"pmaddwd %%mm2, %%mm3		\n\t"
		"pmaddwd %%mm6, %%mm0		\n\t"
		"pmaddwd %%mm6, %%mm2		\n\t"
#ifndef FAST_BGR2YV12
		"psrad $8, %%mm0		\n\t"
		"psrad $8, %%mm1		\n\t"
		"psrad $8, %%mm2		\n\t"
		"psrad $8, %%mm3		\n\t"
#endif
		"packssdw %%mm2, %%mm0		\n\t"
		"packssdw %%mm3, %%mm1		\n\t"
		"pmaddwd %%mm5, %%mm0		\n\t"
		"pmaddwd %%mm5, %%mm1		\n\t"
		"packssdw %%mm1, %%mm0		\n\t" // V1 V0 U1 U0
		"psraw $7, %%mm0		\n\t"

#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
		"movq 12(%0, %%"REG_b"), %%mm4	\n\t"
		"movq 12(%1, %%"REG_b"), %%mm1	\n\t"
		"movq 18(%0, %%"REG_b"), %%mm2	\n\t"
		"movq 18(%1, %%"REG_b"), %%mm3	\n\t"
		PAVGB(%%mm1, %%mm4)
		PAVGB(%%mm3, %%mm2)
		"movq %%mm4, %%mm1		\n\t"
		"movq %%mm2, %%mm3		\n\t"
		"psrlq $24, %%mm4		\n\t"
		"psrlq $24, %%mm2		\n\t"
		PAVGB(%%mm1, %%mm4)
		PAVGB(%%mm3, %%mm2)
		"punpcklbw %%mm7, %%mm4		\n\t"
		"punpcklbw %%mm7, %%mm2		\n\t"
#else
		"movd 12(%0, %%"REG_b"), %%mm4	\n\t"
		"movd 12(%1, %%"REG_b"), %%mm1	\n\t"
		"movd 15(%0, %%"REG_b"), %%mm2	\n\t"
		"movd 15(%1, %%"REG_b"), %%mm3	\n\t"
		"punpcklbw %%mm7, %%mm4		\n\t"
		"punpcklbw %%mm7, %%mm1		\n\t"
		"punpcklbw %%mm7, %%mm2		\n\t"
		"punpcklbw %%mm7, %%mm3		\n\t"
		"paddw %%mm1, %%mm4		\n\t"
		"paddw %%mm3, %%mm2		\n\t"
		"paddw %%mm2, %%mm4		\n\t"
		"movd 18(%0, %%"REG_b"), %%mm5	\n\t" // mm5 is borrowed as scratch here ...
		"movd 18(%1, %%"REG_b"), %%mm1	\n\t"
		"movd 21(%0, %%"REG_b"), %%mm2	\n\t"
		"movd 21(%1, %%"REG_b"), %%mm3	\n\t"
		"punpcklbw %%mm7, %%mm5		\n\t"
		"punpcklbw %%mm7, %%mm1		\n\t"
		"punpcklbw %%mm7, %%mm2		\n\t"
		"punpcklbw %%mm7, %%mm3		\n\t"
		"paddw %%mm1, %%mm5		\n\t"
		"paddw %%mm3, %%mm2		\n\t"
		"paddw %%mm5, %%mm2		\n\t"
		"movq "MANGLE(w1111)", %%mm5		\n\t" // ... and restored to w1111
		"psrlw $2, %%mm4		\n\t"
		"psrlw $2, %%mm2		\n\t"
#endif
		"movq "MANGLE(bgr2VCoeff)", %%mm1		\n\t"
		"movq "MANGLE(bgr2VCoeff)", %%mm3		\n\t"

		"pmaddwd %%mm4, %%mm1		\n\t"
		"pmaddwd %%mm2, %%mm3		\n\t"
		"pmaddwd %%mm6, %%mm4		\n\t"
		"pmaddwd %%mm6, %%mm2		\n\t"
#ifndef FAST_BGR2YV12
		"psrad $8, %%mm4		\n\t"
		"psrad $8, %%mm1		\n\t"
		"psrad $8, %%mm2		\n\t"
		"psrad $8, %%mm3		\n\t"
#endif
		"packssdw %%mm2, %%mm4		\n\t"
		"packssdw %%mm3, %%mm1		\n\t"
		"pmaddwd %%mm5, %%mm4		\n\t"
		"pmaddwd %%mm5, %%mm1		\n\t"
		"add $24, %%"REG_b"		\n\t"
		"packssdw %%mm1, %%mm4		\n\t" // V3 V2 U3 U2
		"psraw $7, %%mm4		\n\t"

		"movq %%mm0, %%mm1		\n\t"
		"punpckldq %%mm4, %%mm0		\n\t"
		"punpckhdq %%mm4, %%mm1		\n\t"
		"packsswb %%mm1, %%mm0		\n\t"
		"paddb "MANGLE(bgr2UVOffset)", %%mm0	\n\t"

		"movd %%mm0, (%2, %%"REG_a")	\n\t" // dstU
		"punpckhdq %%mm0, %%mm0		\n\t"
		"movd %%mm0, (%3, %%"REG_a")	\n\t" // dstV
		"add $4, %%"REG_a"		\n\t"
		" js 1b				\n\t"
		: : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" ((long)-width)
		: "%"REG_a, "%"REG_b, "memory" // the asm stores to dstU/dstV
	);
#else
	int i;
	for(i=0; i<width; i++)
	{
		int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
		int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
		int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];

		// the extra +2 in the shift divides the 4-pixel sums by 4
		dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
		dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
	}
#endif
}

1899 1900 1901 1902 1903
static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
1904
		int d= ((uint16_t*)src)[i];
1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917
		int b= d&0x1F;
		int g= (d>>5)&0x3F;
		int r= (d>>11)&0x1F;

		dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
	}
}

static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
1918 1919
		int d0= ((uint32_t*)src1)[i];
		int d1= ((uint32_t*)src2)[i];
Michael Niedermayer's avatar
Michael Niedermayer committed
1920 1921 1922 1923 1924 1925 1926 1927 1928 1929
		
		int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
		int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);

		int dh2= (dh>>11) + (dh<<21);
		int d= dh2 + dl;

		int b= d&0x7F;
		int r= (d>>11)&0x7F;
		int g= d>>21;
1930 1931 1932 1933 1934
		dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
		dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
	}
}

1935 1936 1937 1938 1939
static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
1940
		int d= ((uint16_t*)src)[i];
1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953
		int b= d&0x1F;
		int g= (d>>5)&0x1F;
		int r= (d>>10)&0x1F;

		dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
	}
}

static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
1954 1955
		int d0= ((uint32_t*)src1)[i];
		int d1= ((uint32_t*)src2)[i];
1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971
		
		int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
		int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);

		int dh2= (dh>>11) + (dh<<21);
		int d= dh2 + dl;

		int b= d&0x7F;
		int r= (d>>10)&0x7F;
		int g= d>>21;
		dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
		dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
	}
}


1972 1973 1974 1975 1976
static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
1977 1978
		int r=  ((uint32_t*)src)[i]&0xFF;
		int g= (((uint32_t*)src)[i]>>8)&0xFF;
Michael Niedermayer's avatar
Michael Niedermayer committed
1979
		int b= (((uint32_t*)src)[i]>>16)&0xFF;
1980

1981
		dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1982 1983 1984 1985 1986 1987 1988 1989
	}
}

static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
1990 1991 1992 1993 1994 1995 1996 1997 1998
		const int a= ((uint32_t*)src1)[2*i+0];
		const int e= ((uint32_t*)src1)[2*i+1];
		const int c= ((uint32_t*)src2)[2*i+0];
		const int d= ((uint32_t*)src2)[2*i+1];
		const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
		const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
 		const int r=  l&0x3FF;
		const int g=  h>>8;
		const int b=  l>>16;
1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013

		dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
		dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
	}
}

static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
		int r= src[i*3+0];
		int g= src[i*3+1];
		int b= src[i*3+2];

2014
		dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031
	}
}

/**
 * Convert two lines of packed 24-bit pixels (byte order R, G, B) to chroma
 * with 2x2 averaging: every output sample is built from two horizontally
 * adjacent pixels on each of src1 and src2.
 *
 * @param dstU  output U samples (width bytes)
 * @param dstV  output V samples (width bytes)
 * @param src1  first input line, 6 bytes per output sample
 * @param src2  second input line, same layout
 * @param width number of chroma samples to produce
 */
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
	int x;
	for(x=0; x<width; x++)
	{
		// the two pixel pairs that feed output sample x
		const uint8_t *top= src1 + 6*x;
		const uint8_t *bot= src2 + 6*x;

		const int r= top[0] + top[3] + bot[0] + bot[3];
		const int g= top[1] + top[4] + bot[1] + bot[4];
		const int b= top[2] + top[5] + bot[2] + bot[5];

		// the extra +2 in the shift divides the 4-pixel sums by 4
		dstU[x]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
		dstV[x]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
	}
}

2032

2033 2034 2035
// Bilinear / Bicubic scaling
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
				  int16_t *filter, int16_t *filterPos, int filterSize)
Michael Niedermayer's avatar
Michael Niedermayer committed
2036
{
2037
#ifdef HAVE_MMX
2038
	assert(filterSize % 4 == 0 && filterSize>0);
2039 2040
	if(filterSize==4) // allways true for upscaling, sometimes for down too
	{
2041
		long counter= -2*dstW;
2042 2043 2044 2045 2046
		filter-= counter*2;
		filterPos-= counter/2;
		dst-= counter/2;
		asm volatile(
			"pxor %%mm7, %%mm7		\n\t"
2047
			"movq "MANGLE(w02)", %%mm6	\n\t"
2048 2049
			"push %%"REG_BP"		\n\t" // we use 7 regs here ...
			"mov %%"REG_a", %%"REG_BP"	\n\t"
2050 2051
			".balign 16			\n\t"
			"1:				\n\t"
2052 2053
			"movzwl (%2, %%"REG_BP"), %%eax	\n\t"
			"movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
2054 2055 2056 2057
			"movq (%1, %%"REG_BP", 4), %%mm1\n\t"
			"movq 8(%1, %%"REG_BP", 4), %%mm3\n\t"
			"movd (%3, %%"REG_a"), %%mm0	\n\t"
			"movd (%3, %%"REG_b"), %%mm2	\n\t"
2058 2059 2060 2061 2062 2063 2064 2065 2066
			"punpcklbw %%mm7, %%mm0		\n\t"
			"punpcklbw %%mm7, %%mm2		\n\t"
			"pmaddwd %%mm1, %%mm0		\n\t"
			"pmaddwd %%mm2, %%mm3		\n\t"
			"psrad $8, %%mm0		\n\t"
			"psrad $8, %%mm3		\n\t"
			"packssdw %%mm3, %%mm0		\n\t"
			"pmaddwd %%mm6, %%mm0		\n\t"
			"packssdw %%mm0, %%mm0		\n\t"
2067 2068
			"movd %%mm0, (%4, %%"REG_BP")	\n\t"
			"add $4, %%"REG_BP"		\n\t"
2069
			" jnc 1b			\n\t"
2070

2071
			"pop %%"REG_BP"			\n\t"
2072 2073
			: "+a" (counter)
			: "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2074
			: "%"REG_b
2075 2076 2077 2078
		);
	}
	else if(filterSize==8)
	{
2079
		long counter= -2*dstW;
2080 2081 2082 2083 2084
		filter-= counter*4;
		filterPos-= counter/2;
		dst-= counter/2;
		asm volatile(
			"pxor %%mm7, %%mm7		\n\t"
2085
			"movq "MANGLE(w02)", %%mm6	\n\t"
2086 2087
			"push %%"REG_BP"		\n\t" // we use 7 regs here ...
			"mov %%"REG_a", %%"REG_BP"	\n\t"
2088 2089
			".balign 16			\n\t"
			"1:				\n\t"
2090 2091
			"movzwl (%2, %%"REG_BP"), %%eax	\n\t"
			"movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
2092 2093 2094 2095
			"movq (%1, %%"REG_BP", 8), %%mm1\n\t"
			"movq 16(%1, %%"REG_BP", 8), %%mm3\n\t"
			"movd (%3, %%"REG_a"), %%mm0	\n\t"
			"movd (%3, %%"REG_b"), %%mm2	\n\t"
2096 2097 2098 2099 2100
			"punpcklbw %%mm7, %%mm0		\n\t"
			"punpcklbw %%mm7, %%mm2		\n\t"
			"pmaddwd %%mm1, %%mm0		\n\t"
			"pmaddwd %%mm2, %%mm3		\n\t"

2101 2102 2103 2104
			"movq 8(%1, %%"REG_BP", 8), %%mm1\n\t"
			"movq 24(%1, %%"REG_BP", 8), %%mm5\n\t"
			"movd 4(%3, %%"REG_a"), %%mm4	\n\t"
			"movd 4(%3, %%"REG_b"), %%mm2	\n\t"
2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116
			"punpcklbw %%mm7, %%mm4		\n\t"
			"punpcklbw %%mm7, %%mm2		\n\t"
			"pmaddwd %%mm1, %%mm4		\n\t"
			"pmaddwd %%mm2, %%mm5		\n\t"
			"paddd %%mm4, %%mm0		\n\t"
			"paddd %%mm5, %%mm3		\n\t"
						
			"psrad $8, %%mm0		\n\t"
			"psrad $8, %%mm3		\n\t"
			"packssdw %%mm3, %%mm0		\n\t"
			"pmaddwd %%mm6, %%mm0		\n\t"
			"packssdw %%mm0, %%mm0		\n\t"
2117 2118
			"movd %%mm0, (%4, %%"REG_BP")	\n\t"
			"add $4, %%"REG_BP"		\n\t"
2119
			" jnc 1b			\n\t"
2120

2121
			"pop %%"REG_BP"			\n\t"
2122 2123
			: "+a" (counter)
			: "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2124
			: "%"REG_b
2125 2126 2127 2128
		);
	}
	else
	{
2129
		long counter= -2*dstW;
2130 2131 2132 2133 2134
//		filter-= counter*filterSize/2;
		filterPos-= counter/2;
		dst-= counter/2;
		asm volatile(
			"pxor %%mm7, %%mm7		\n\t"
2135
			"movq "MANGLE(w02)", %%mm6	\n\t"
2136 2137
			".balign 16			\n\t"
			"1:				\n\t"
2138
			"mov %2, %%"REG_c"		\n\t"
2139 2140
			"movzwl (%%"REG_c", %0), %%eax	\n\t"
			"movzwl 2(%%"REG_c", %0), %%ebx	\n\t"
2141
			"mov %5, %%"REG_c"		\n\t"
2142 2143 2144 2145 2146
			"pxor %%mm4, %%mm4		\n\t"
			"pxor %%mm5, %%mm5		\n\t"
			"2:				\n\t"
			"movq (%1), %%mm1		\n\t"
			"movq (%1, %6), %%mm3		\n\t"
2147 2148
			"movd (%%"REG_c", %%"REG_a"), %%mm0\n\t"
			"movd (%%"REG_c", %%"REG_b"), %%mm2\n\t"
2149 2150 2151 2152 2153 2154
			"punpcklbw %%mm7, %%mm0		\n\t"
			"punpcklbw %%mm7, %%mm2		\n\t"
			"pmaddwd %%mm1, %%mm0		\n\t"
			"pmaddwd %%mm2, %%mm3		\n\t"
			"paddd %%mm3, %%mm5		\n\t"
			"paddd %%mm0, %%mm4		\n\t"
2155 2156 2157
			"add $8, %1			\n\t"
			"add $4, %%"REG_c"		\n\t"
			"cmp %4, %%"REG_c"		\n\t"
2158
			" jb 2b				\n\t"
2159
			"add %6, %1			\n\t"
2160 2161 2162 2163 2164
			"psrad $8, %%mm4		\n\t"
			"psrad $8, %%mm5		\n\t"
			"packssdw %%mm5, %%mm4		\n\t"
			"pmaddwd %%mm6, %%mm4		\n\t"
			"packssdw %%mm4, %%mm4		\n\t"
2165 2166 2167
			"mov %3, %%"REG_a"		\n\t"
			"movd %%mm4, (%%"REG_a", %0)	\n\t"
			"add $4, %0			\n\t"
2168
			" jnc 1b			\n\t"
2169

2170 2171
			: "+r" (counter), "+r" (filter)
			: "m" (filterPos), "m" (dst), "m"(src+filterSize),
2172 2173
			  "m" (src), "r" ((long)filterSize*2)
			: "%"REG_b, "%"REG_a, "%"REG_c
2174 2175
		);
	}
2176 2177 2178
#else
#ifdef HAVE_ALTIVEC
	hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2179 2180 2181 2182 2183 2184 2185
#else
	int i;
	for(i=0; i<dstW; i++)
	{
		int j;
		int srcPos= filterPos[i];
		int val=0;
2186
//		printf("filterPos: %d\n", filterPos[i]);
2187 2188 2189 2190 2191 2192 2193 2194 2195 2196
		for(j=0; j<filterSize; j++)
		{
//			printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
			val += ((int)src[srcPos + j])*filter[filterSize*i + j];
		}
//		filter += hFilterSize;
		dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
//		dst[i] = val>>7;
	}
#endif
2197
#endif
2198
}
Michael Niedermayer's avatar
Michael Niedermayer committed
2199
      // *** horizontal scale Y line to temp buffer
2200 2201
static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
				   int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2202
				   int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode, 
2203 2204
				   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
				   int32_t *mmx2FilterPos)
2205
{
2206 2207 2208 2209 2210
    if(srcFormat==IMGFMT_YUY2)
    {
	RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
Michael Niedermayer's avatar
Michael Niedermayer committed
2211 2212 2213 2214 2215
    else if(srcFormat==IMGFMT_UYVY)
    {
	RENAME(uyvyToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
2216 2217 2218 2219 2220 2221 2222 2223 2224 2225
    else if(srcFormat==IMGFMT_BGR32)
    {
	RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR24)
    {
	RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
2226 2227 2228 2229 2230
    else if(srcFormat==IMGFMT_BGR16)
    {
	RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
2231 2232 2233 2234 2235
    else if(srcFormat==IMGFMT_BGR15)
    {
	RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
2236 2237 2238 2239 2240 2241 2242 2243 2244 2245
    else if(srcFormat==IMGFMT_RGB32)
    {
	RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_RGB24)
    {
	RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
	src= formatConvBuffer;
    }
2246

2247
#ifdef HAVE_MMX
2248
	// use the new MMX scaler if the mmx2 can't be used (its faster than the x86asm one)
2249
    if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2250
#else
2251
    if(!(flags&SWS_FAST_BILINEAR))
2252
#endif
2253 2254 2255 2256 2257
    {
    	RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    }
    else // Fast Bilinear upscale / crap downscale
    {
2258
#if defined(ARCH_X86) || defined(ARCH_X86_64)
Michael Niedermayer's avatar
Michael Niedermayer committed
2259
#ifdef HAVE_MMX2
Michael Niedermayer's avatar
Michael Niedermayer committed
2260
	int i;
Michael Niedermayer's avatar
Michael Niedermayer committed
2261 2262 2263 2264
	if(canMMX2BeUsed)
	{
		asm volatile(
			"pxor %%mm7, %%mm7		\n\t"
2265 2266 2267 2268 2269 2270 2271 2272
			"mov %0, %%"REG_c"		\n\t"
			"mov %1, %%"REG_D"		\n\t"
			"mov %2, %%"REG_d"		\n\t"
			"mov %3, %%"REG_b"		\n\t"
			"xor %%"REG_a", %%"REG_a"	\n\t" // i
			PREFETCH" (%%"REG_c")		\n\t"
			PREFETCH" 32(%%"REG_c")		\n\t"
			PREFETCH" 64(%%"REG_c")		\n\t"
2273

Michael Niedermayer's avatar
Michael Niedermayer committed
2274
#define FUNNY_Y_CODE \
2275
			"mov (%%"REG_b"), %%"REG_S"	\n\t"\
2276
			"call *%4			\n\t"\
2277 2278 2279
			"addl (%%"REG_b", %%"REG_a"), %%ecx\n\t"\
			"add %%"REG_a", %%"REG_d"	\n\t"\
			"xor %%"REG_a", %%"REG_a"	\n\t"\
2280

Michael Niedermayer's avatar
Michael Niedermayer committed
2281 2282 2283 2284 2285 2286 2287 2288 2289
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE

2290 2291
			:: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
			"m" (funnyYCode)
2292
			: "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_d
Michael Niedermayer's avatar
Michael Niedermayer committed
2293
		);
2294
		for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
Michael Niedermayer's avatar
Michael Niedermayer committed
2295 2296 2297 2298 2299 2300
	}
	else
	{
#endif
	//NO MMX just normal asm ...
	asm volatile(
2301 2302
		"xor %%"REG_a", %%"REG_a"	\n\t" // i
		"xor %%"REG_b", %%"REG_b"	\n\t" // xx
Michael Niedermayer's avatar
Michael Niedermayer committed
2303
		"xorl %%ecx, %%ecx		\n\t" // 2*xalpha
2304
		".balign 16			\n\t"
Michael Niedermayer's avatar
Michael Niedermayer committed
2305
		"1:				\n\t"
2306 2307
		"movzbl  (%0, %%"REG_b"), %%edi	\n\t" //src[xx]
		"movzbl 1(%0, %%"REG_b"), %%esi	\n\t" //src[xx+1]
Michael Niedermayer's avatar
Michael Niedermayer committed
2308 2309 2310 2311
		"subl %%edi, %%esi		\n\t" //src[xx+1] - src[xx]
		"imull %%ecx, %%esi		\n\t" //(src[xx+1] - src[xx])*2*xalpha
		"shll $16, %%edi		\n\t"
		"addl %%edi, %%esi		\n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2312
		"mov %1, %%"REG_D"		\n\t"
Michael Niedermayer's avatar
Michael Niedermayer committed
2313
		"shrl $9, %%esi			\n\t"
2314
		"movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
Michael Niedermayer's avatar
Michael Niedermayer committed
2315
		"addw %4, %%cx			\n\t" //2*xalpha += xInc&0xFF
2316
		"adc %3, %%"REG_b"		\n\t" //xx+= xInc>>8 + carry
Michael Niedermayer's avatar
Michael Niedermayer committed
2317

2318 2319
		"movzbl (%0, %%"REG_b"), %%edi	\n\t" //src[xx]
		"movzbl 1(%0, %%"REG_b"), %%esi	\n\t" //src[xx+1]
Michael Niedermayer's avatar
Michael Niedermayer committed
2320 2321 2322 2323
		"subl %%edi, %%esi		\n\t" //src[xx+1] - src[xx]
		"imull %%ecx, %%esi		\n\t" //(src[xx+1] - src[xx])*2*xalpha
		"shll $16, %%edi		\n\t"
		"addl %%edi, %%esi		\n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2324
		"mov %1, %%"REG_D"		\n\t"
Michael Niedermayer's avatar
Michael Niedermayer committed
2325
		"shrl $9, %%esi			\n\t"
2326
		"movw %%si, 2(%%"REG_D", %%"REG_a", 2)\n\t"
Michael Niedermayer's avatar
Michael Niedermayer committed
2327
		"addw %4, %%cx			\n\t" //2*xalpha += xInc&0xFF
2328
		"adc %3, %%"REG_b"		\n\t" //xx+= xInc>>8 + carry
Michael Niedermayer's avatar
Michael Niedermayer committed
2329 2330


2331 2332
		"add $2, %%"REG_a"		\n\t"
		"cmp %2, %%"REG_a"		\n\t"
Michael Niedermayer's avatar
Michael Niedermayer committed
2333 2334 2335 2336
		" jb 1b				\n\t"


		:: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
2337
		: "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
Michael Niedermayer's avatar
Michael Niedermayer committed
2338 2339
		);
#ifdef HAVE_MMX2
2340
	} //if MMX2 can't be used
Michael Niedermayer's avatar
Michael Niedermayer committed
2341 2342
#endif
#else
Michael Niedermayer's avatar
Michael Niedermayer committed
2343 2344 2345 2346 2347 2348 2349 2350 2351
	int i;
	unsigned int xpos=0;
	for(i=0;i<dstWidth;i++)
	{
		register unsigned int xx=xpos>>16;
		register unsigned int xalpha=(xpos&0xFFFF)>>9;
		dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
		xpos+=xInc;
	}
Michael Niedermayer's avatar
Michael Niedermayer committed
2352
#endif
2353
    }
Michael Niedermayer's avatar
Michael Niedermayer committed
2354 2355
}

2356 2357
inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
				   int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2358
				   int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2359 2360
				   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
				   int32_t *mmx2FilterPos)
Michael Niedermayer's avatar
Michael Niedermayer committed
2361
{
2362 2363 2364 2365 2366 2367
    if(srcFormat==IMGFMT_YUY2)
    {
	RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
Michael Niedermayer's avatar
Michael Niedermayer committed
2368 2369 2370 2371 2372 2373
    else if(srcFormat==IMGFMT_UYVY)
    {
	RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385
    else if(srcFormat==IMGFMT_BGR32)
    {
	RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR24)
    {
	RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
2386 2387 2388 2389 2390 2391
    else if(srcFormat==IMGFMT_BGR16)
    {
	RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
2392 2393 2394 2395 2396 2397
    else if(srcFormat==IMGFMT_BGR15)
    {
	RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409
    else if(srcFormat==IMGFMT_RGB32)
    {
	RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_RGB24)
    {
	RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
	src1= formatConvBuffer;
	src2= formatConvBuffer+2048;
    }
2410 2411 2412 2413
    else if(isGray(srcFormat))
    {
    	return;
    }
2414

2415
#ifdef HAVE_MMX
2416
	// use the new MMX scaler if the mmx2 can't be used (its faster than the x86asm one)
2417
    if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2418
#else
2419
    if(!(flags&SWS_FAST_BILINEAR))
2420
#endif
2421 2422 2423 2424 2425 2426
    {
    	RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
    	RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
    }
    else // Fast Bilinear upscale / crap downscale
    {
2427
#if defined(ARCH_X86) || defined(ARCH_X86_64)
Michael Niedermayer's avatar
Michael Niedermayer committed
2428
#ifdef HAVE_MMX2
Michael Niedermayer's avatar
Michael Niedermayer committed
2429
	int i;
Michael Niedermayer's avatar
Michael Niedermayer committed
2430 2431 2432
	if(canMMX2BeUsed)
	{
		asm volatile(
2433
			"pxor %%mm7, %%mm7		\n\t"
2434 2435 2436 2437 2438 2439 2440 2441
			"mov %0, %%"REG_c"		\n\t"
			"mov %1, %%"REG_D"		\n\t"
			"mov %2, %%"REG_d"		\n\t"
			"mov %3, %%"REG_b"		\n\t"
			"xor %%"REG_a", %%"REG_a"	\n\t" // i
			PREFETCH" (%%"REG_c")		\n\t"
			PREFETCH" 32(%%"REG_c")		\n\t"
			PREFETCH" 64(%%"REG_c")		\n\t"
2442 2443

#define FUNNY_UV_CODE \
2444
			"movl (%%"REG_b"), %%esi	\n\t"\
2445
			"call *%4			\n\t"\
2446 2447 2448
			"addl (%%"REG_b", %%"REG_a"), %%ecx\n\t"\
			"add %%"REG_a", %%"REG_D"	\n\t"\
			"xor %%"REG_a", %%"REG_a"	\n\t"\
2449 2450 2451 2452 2453

FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
2454 2455 2456 2457 2458 2459 2460
			"xor %%"REG_a", %%"REG_a"	\n\t" // i
			"mov %5, %%"REG_c"		\n\t" // src
			"mov %1, %%"REG_D"		\n\t" // buf1
			"add $4096, %%"REG_D"		\n\t"
			PREFETCH" (%%"REG_c")		\n\t"
			PREFETCH" 32(%%"REG_c")		\n\t"
			PREFETCH" 64(%%"REG_c")		\n\t"
2461 2462 2463 2464 2465 2466 2467 2468

FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE

			:: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
			"m" (funnyUVCode), "m" (src2)
2469
			: "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%esi", "%"REG_D
2470
		);
2471
		for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
Michael Niedermayer's avatar
Michael Niedermayer committed
2472
		{
2473 2474 2475
//			printf("%d %d %d\n", dstWidth, i, srcW);
			dst[i] = src1[srcW-1]*128;
			dst[i+2048] = src2[srcW-1]*128;
Michael Niedermayer's avatar
Michael Niedermayer committed
2476 2477 2478 2479 2480 2481
		}
	}
	else
	{
#endif
	asm volatile(
2482 2483
		"xor %%"REG_a", %%"REG_a"	\n\t" // i
		"xor %%"REG_b", %%"REG_b"		\n\t" // xx
Michael Niedermayer's avatar
Michael Niedermayer committed
2484
		"xorl %%ecx, %%ecx		\n\t" // 2*xalpha
2485
		".balign 16			\n\t"
Michael Niedermayer's avatar
Michael Niedermayer committed
2486
		"1:				\n\t"
2487 2488 2489
		"mov %0, %%"REG_S"		\n\t"
		"movzbl  (%%"REG_S", %%"REG_b"), %%edi	\n\t" //src[xx]
		"movzbl 1(%%"REG_S", %%"REG_b"), %%esi	\n\t" //src[xx+1]
Michael Niedermayer's avatar
Michael Niedermayer committed
2490 2491 2492 2493
		"subl %%edi, %%esi		\n\t" //src[xx+1] - src[xx]
		"imull %%ecx, %%esi		\n\t" //(src[xx+1] - src[xx])*2*xalpha
		"shll $16, %%edi		\n\t"
		"addl %%edi, %%esi		\n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2494
		"mov %1, %%"REG_D"		\n\t"
Michael Niedermayer's avatar
Michael Niedermayer committed
2495
		"shrl $9, %%esi			\n\t"
2496
		"movw %%si, (%%"REG_d", %%"REG_a", 2)\n\t"
Michael Niedermayer's avatar
Michael Niedermayer committed
2497

2498 2499
		"movzbl  (%5, %%"REG_b"), %%edi	\n\t" //src[xx]
		"movzbl 1(%5, %%"REG_b"), %%esi	\n\t" //src[xx+1]
Michael Niedermayer's avatar
Michael Niedermayer committed
2500 2501 2502 2503
		"subl %%edi, %%esi		\n\t" //src[xx+1] - src[xx]
		"imull %%ecx, %%esi		\n\t" //(src[xx+1] - src[xx])*2*xalpha
		"shll $16, %%edi		\n\t"
		"addl %%edi, %%esi		\n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2504
		"mov %1, %%"REG_D"		\n\t"
Michael Niedermayer's avatar
Michael Niedermayer committed
2505
		"shrl $9, %%esi			\n\t"
2506
		"movw %%si, 4096(%%"REG_D", %%"REG_a", 2)\n\t"
Michael Niedermayer's avatar
Michael Niedermayer committed
2507 2508

		"addw %4, %%cx			\n\t" //2*xalpha += xInc&0xFF
2509 2510 2511
		"adc %3, %%"REG_b"		\n\t" //xx+= xInc>>8 + carry
		"add $1, %%"REG_a"		\n\t"
		"cmp %2, %%"REG_a"		\n\t"
Michael Niedermayer's avatar
Michael Niedermayer committed
2512 2513
		" jb 1b				\n\t"

2514
		:: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" ((long)(xInc>>16)), "m" ((xInc&0xFFFF)),
Michael Niedermayer's avatar
Michael Niedermayer committed
2515
		"r" (src2)
2516
		: "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
Michael Niedermayer's avatar
Michael Niedermayer committed
2517 2518
		);
#ifdef HAVE_MMX2
2519
	} //if MMX2 can't be used
Michael Niedermayer's avatar
Michael Niedermayer committed
2520 2521
#endif
#else
Michael Niedermayer's avatar
Michael Niedermayer committed
2522 2523 2524 2525 2526 2527 2528 2529
	int i;
	unsigned int xpos=0;
	for(i=0;i<dstWidth;i++)
	{
		register unsigned int xx=xpos>>16;
		register unsigned int xalpha=(xpos&0xFFFF)>>9;
		dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
		dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
Michael Niedermayer's avatar
Michael Niedermayer committed
2530 2531 2532 2533
/* slower
	  dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
	  dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
*/
Michael Niedermayer's avatar
Michael Niedermayer committed
2534 2535
		xpos+=xInc;
	}
Michael Niedermayer's avatar
Michael Niedermayer committed
2536
#endif
2537 2538 2539
   }
}

Michael Niedermayer's avatar
Michael Niedermayer committed
2540 2541
/**
 * Scale one slice of the source picture into the destination picture.
 *
 * Source lines are horizontally scaled into the luma/chroma ring buffers of
 * the context; every destination line whose vertical filter inputs are all
 * available is then produced by the vertical scaler / output converter.
 * The ring-buffer state (dstY, lumBufIndex, chrBufIndex, lastInLumBuf,
 * lastInChrBuf) is read from the context on entry and written back on exit,
 * so consecutive slices continue where the previous call stopped.
 *
 * @param c          scaler context
 * @param src        source plane pointers; for packed source formats src[1]
 *                   and src[2] are overwritten with src[0]
 * @param srcStride  source strides; modified in place (aliased to
 *                   srcStride[0] for packed formats, and entries 1 and 2 are
 *                   shifted left by c->vChrDrop)
 * @param srcSliceY  index of the first source line contained in this slice;
 *                   a value of 0 resets the ring-buffer state
 * @param srcSliceH  number of source lines in this slice
 * @param dst        destination plane pointers
 * @param dstStride  destination strides
 * @return number of destination lines output by this call
 */
static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]){

	/* load a few things into local vars to make the code more readable? and faster */
	const int srcW= c->srcW;
	const int dstW= c->dstW;
	const int dstH= c->dstH;
	const int chrDstW= c->chrDstW;
	const int chrSrcW= c->chrSrcW;
	const int lumXInc= c->lumXInc;
	const int chrXInc= c->chrXInc;
	const int dstFormat= c->dstFormat;
	const int srcFormat= c->srcFormat;
	const int flags= c->flags;
	const int canMMX2BeUsed= c->canMMX2BeUsed;
	int16_t *vLumFilterPos= c->vLumFilterPos;
	int16_t *vChrFilterPos= c->vChrFilterPos;
	int16_t *hLumFilterPos= c->hLumFilterPos;
	int16_t *hChrFilterPos= c->hChrFilterPos;
	int16_t *vLumFilter= c->vLumFilter;
	int16_t *vChrFilter= c->vChrFilter;
	int16_t *hLumFilter= c->hLumFilter;
	int16_t *hChrFilter= c->hChrFilter;
	int32_t *lumMmxFilter= c->lumMmxFilter;
	int32_t *chrMmxFilter= c->chrMmxFilter;
	const int vLumFilterSize= c->vLumFilterSize;
	const int vChrFilterSize= c->vChrFilterSize;
	const int hLumFilterSize= c->hLumFilterSize;
	const int hChrFilterSize= c->hChrFilterSize;
	int16_t **lumPixBuf= c->lumPixBuf;
	int16_t **chrPixBuf= c->chrPixBuf;
	const int vLumBufSize= c->vLumBufSize;
	const int vChrBufSize= c->vChrBufSize;
	uint8_t *funnyYCode= c->funnyYCode;
	uint8_t *funnyUVCode= c->funnyUVCode;
	uint8_t *formatConvBuffer= c->formatConvBuffer;
	const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
	/* negate-shift-negate rounds the chroma slice height up instead of down */
	const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
	int lastDstY;

	/* vars which will change and which we need to store back in the context */
	int dstY= c->dstY;
	int lumBufIndex= c->lumBufIndex;
	int chrBufIndex= c->chrBufIndex;
	int lastInLumBuf= c->lastInLumBuf;
	int lastInChrBuf= c->lastInChrBuf;

	/* packed sources keep all components in plane 0, so the chroma readers
	   are pointed at the same data (this writes into the caller's arrays) */
	if(isPacked(c->srcFormat)){
		src[0]=
		src[1]=
		src[2]= src[0];
		srcStride[0]=
		srcStride[1]=
		srcStride[2]= srcStride[0];
	}
	/* NOTE(review): these shifts modify the caller's srcStride array on every
	   call - confirm callers pass a fresh copy per slice */
	srcStride[1]<<= c->vChrDrop;
	srcStride[2]<<= c->vChrDrop;

//	printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
//		(int)dst[0], (int)dst[1], (int)dst[2]);

#if 0 //self test FIXME move to a vfilter or something
{
static volatile int i=0;
i++;
if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH)
	selfTest(src, srcStride, c->srcW, c->srcH);
i--;
}
#endif

//printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
//dstStride[0],dstStride[1],dstStride[2]);

	if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
	{
		static int firstTime=1; //FIXME move this into the context perhaps
		if(flags & SWS_PRINT_INFO && firstTime)
		{
			MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n"
					"SwScaler:          ->cannot do aligned memory acesses anymore\n");
			firstTime=0;
		}
	}

	/* Note the user might start scaling the picture in the middle so this will not get executed
	   this is not really intended but works currently, so ppl might do it */
	if(srcSliceY ==0){
		lumBufIndex=0;
		chrBufIndex=0;
		dstY=0;
		lastInLumBuf= -1;
		lastInChrBuf= -1;
	}

	lastDstY= dstY;

	for(;dstY < dstH; dstY++){
		unsigned char *dest =dst[0]+dstStride[0]*dstY;
		const int chrDstY= dstY>>c->chrDstVSubSample;
		unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
		unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;

		const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
		const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
		const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
		const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input

//printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
// dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize,  c->chrSrcVSubSample);
		//handle holes (FAST_BILINEAR & weird filters)
		if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
		if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
//printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
		ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
		ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)

		// Do we have enough lines in this slice to output the dstY line
		if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
		{
			//Do horizontal scaling
			while(lastInLumBuf < lastLumSrcY)
			{
				uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
				lumBufIndex++;
//				printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf,  lastLumSrcY);
				ASSERT(lumBufIndex < 2*vLumBufSize)
				ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
				ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
//				printf("%d %d\n", lumBufIndex, vLumBufSize);
				RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
						flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
						funnyYCode, c->srcFormat, formatConvBuffer,
						c->lumMmx2Filter, c->lumMmx2FilterPos);
				lastInLumBuf++;
			}
			while(lastInChrBuf < lastChrSrcY)
			{
				uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
				uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
				chrBufIndex++;
				ASSERT(chrBufIndex < 2*vChrBufSize)
				ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
				ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
				//FIXME replace parameters through context struct (some at least)

				if(!(isGray(srcFormat) || isGray(dstFormat)))
					RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
						flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
						funnyUVCode, c->srcFormat, formatConvBuffer,
						c->chrMmx2Filter, c->chrMmx2FilterPos);
				lastInChrBuf++;
			}
			//wrap buf index around to stay inside the ring buffer
			if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
			if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
		}
		else // not enough lines left in this slice -> load the rest in the buffer
		{
/*		printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
			firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
			lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
			vChrBufSize, vLumBufSize);*/

			//Do horizontal scaling
			while(lastInLumBuf+1 < srcSliceY + srcSliceH)
			{
				uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
				lumBufIndex++;
				ASSERT(lumBufIndex < 2*vLumBufSize)
				ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
				ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
				RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
						flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
						funnyYCode, c->srcFormat, formatConvBuffer,
						c->lumMmx2Filter, c->lumMmx2FilterPos);
				lastInLumBuf++;
			}
			while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
			{
				uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
				uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
				chrBufIndex++;
				ASSERT(chrBufIndex < 2*vChrBufSize)
				ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
				ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)

				if(!(isGray(srcFormat) || isGray(dstFormat)))
					RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
						flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
						funnyUVCode, c->srcFormat, formatConvBuffer,
						c->chrMmx2Filter, c->chrMmx2FilterPos);
				lastInChrBuf++;
			}
			//wrap buf index around to stay inside the ring buffer
			if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
			if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
			break; //we can't output a dstY line so let's try with the next slice
		}

#ifdef HAVE_MMX
		/* dither patterns alternate with the parity of the output line */
		b5Dither= dither8[dstY&1];
		g6Dither= dither4[dstY&1];
		g5Dither= dither8[dstY&1];
		r5Dither= dither8[(dstY+1)&1];
#endif
	    if(dstY < dstH-2)
	    {
		int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
		int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
#ifdef HAVE_MMX
		/* Fill the MMX filter tables: slot 0 holds the source line pointer,
		   slots 2 and 3 the 16 bit coefficient replicated into both halves.
		   NOTE(review): the (int32_t) cast keeps only the low 32 bits of the
		   pointer - confirm this is adequate on ARCH_X86_64 */
		int i;
		for(i=0; i<vLumFilterSize; i++)
		{
			lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
			lumMmxFilter[4*i+2]=
			lumMmxFilter[4*i+3]=
				((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
		}
		for(i=0; i<vChrFilterSize; i++)
		{
			chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
			chrMmxFilter[4*i+2]=
			chrMmxFilter[4*i+3]=
				((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
		}
#endif
		if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
		{
			const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
			if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
			if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
			{
				int16_t *lumBuf = lumPixBuf[0];
				int16_t *chrBuf= chrPixBuf[0];
				RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
			}
			else //General YV12
			{
				RENAME(yuv2yuvX)(c,
					vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
					vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
					dest, uDest, vDest, dstW, chrDstW);
			}
		}
		else
		{
			ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
			ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
			if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
			{
				int chrAlpha= vChrFilter[2*dstY+1];
				RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
						 dest, dstW, chrAlpha, dstFormat, flags, dstY);
			}
			else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
			{
				int lumAlpha= vLumFilter[2*dstY+1];
				int chrAlpha= vChrFilter[2*dstY+1];
				RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
						 dest, dstW, lumAlpha, chrAlpha, dstY);
			}
			else //General RGB
			{
				RENAME(yuv2packedX)(c,
					vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
					vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
					dest, dstW, dstY);
			}
		}
            }
	    else // hmm looks like we can't use MMX here without overwriting this array's tail
	    {
		int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
		int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
		if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
		{
			const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
			if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
			yuv2yuvXinC(
				vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
				vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
				dest, uDest, vDest, dstW, chrDstW);
		}
		else
		{
			ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
			ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
			yuv2packedXinC(c,
				vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
				vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
				dest, dstW, dstY);
		}
	    }
	}

#ifdef HAVE_MMX
	__asm __volatile(SFENCE:::"memory");
	__asm __volatile(EMMS:::"memory");
#endif
	/* store changed local vars back in the context */
	c->dstY= dstY;
	c->lumBufIndex= lumBufIndex;
	c->chrBufIndex= chrBufIndex;
	c->lastInLumBuf= lastInLumBuf;
	c->lastInChrBuf= lastInChrBuf;

	return dstY - lastDstY;
}