/*
 * yuv2rgb_mmx.c, Software YUV to RGB converter with Intel MMX "technology"
 *
 * Copyright (C) 2000, Silicon Integrated System Corp.
 * All Rights Reserved.
 *
 * Author: Olie Lho <ollie@sis.com.tw>
 *
 * This file is part of mpeg2dec, a free MPEG-2 video decoder
 *
 * mpeg2dec is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * mpeg2dec is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Make; see the file COPYING. If not, write to
 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * 15,24 bpp and dithering from Michael Niedermayer (michaelni@gmx.at)
 * MMX/MMX2 Template stuff from Michael Niedermayer (needed for fast movntq support)
 * context / deglobalize stuff by Michael Niedermayer
 */

/*
 * Per-instantiation instruction selection.  The macros are #undef'ed first,
 * then redefined according to HAVE_3DNOW / HAVE_MMX2.
 */
#undef MOVNTQ
#undef EMMS
#undef SFENCE

#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped onto emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

#ifdef HAVE_MMX2
/* Non-temporal 64-bit store plus the matching store fence. */
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
/* Plain 64-bit store. */
#define MOVNTQ "movq"
/* NOTE(review): "/nop" presumably relies on the assembler treating '/' as the
   start of a comment so that nothing is emitted -- confirm for the target
   assembler before changing it. */
#define SFENCE "/nop"
#endif

/*
 * YUV2RGB: shared MMX core that turns 8 pixels of YUV into three registers of
 * B, R and G bytes.  It expands to a sequence of string literals and is meant
 * to be pasted into an extended asm statement.
 *
 * Expected on entry:
 *   mm0 = 4 Cb bytes in the low dword, mm1 = 4 Cr bytes in the low dword,
 *   mm6 = 8 Y bytes, mm4 = 0,
 *   %4  = base address used with the *_OFFSET / *_COEFF displacements.
 * On exit:
 *   mm0 = B7..B0, mm1 = R7..R0, mm2 = G7..G0 (unsigned-saturated bytes);
 *   mm3, mm4, mm5, mm6 and mm7 have been overwritten.
 */
#define YUV2RGB \
    /* Do the multiply part of the conversion for even and odd pixels, */ \
    /* register usage: */ \
    /* mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels, */ \
    /* mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels, */ \
    /* mm6 -> Y even, mm7 -> Y odd */ \
    /* convert the chroma part */ \
    "punpcklbw %%mm4, %%mm0;" /* scatter 4 Cb 00 u3 00 u2 00 u1 00 u0 */ \
    "punpcklbw %%mm4, %%mm1;" /* scatter 4 Cr 00 v3 00 v2 00 v1 00 v0 */ \
\
    "psllw $3, %%mm0;" /* Promote precision */ \
    "psllw $3, %%mm1;" /* Promote precision */ \
\
    "psubsw "U_OFFSET"(%4), %%mm0;" /* Cb -= 128 */ \
    "psubsw "V_OFFSET"(%4), %%mm1;" /* Cr -= 128 */ \
\
    "movq %%mm0, %%mm2;" /* Copy 4 Cb 00 u3 00 u2 00 u1 00 u0 */ \
    "movq %%mm1, %%mm3;" /* Copy 4 Cr 00 v3 00 v2 00 v1 00 v0 */ \
\
    "pmulhw "UG_COEFF"(%4), %%mm2;" /* Mul Cb with green coeff -> Cb green */ \
    "pmulhw "VG_COEFF"(%4), %%mm3;" /* Mul Cr with green coeff -> Cr green */ \
\
    "pmulhw "UB_COEFF"(%4), %%mm0;" /* Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0 */ \
    "pmulhw "VR_COEFF"(%4), %%mm1;" /* Mul Cr -> Cred 00 r3 00 r2 00 r1 00 r0 */ \
\
    "paddsw %%mm3, %%mm2;" /* Cb green + Cr green -> Cgreen */ \
\
    /* convert the luma part */ \
    "movq %%mm6, %%mm7;" /* Copy 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \
    "pand "MANGLE(mmx_00ffw)", %%mm6;" /* get Y even 00 Y6 00 Y4 00 Y2 00 Y0 */ \
\
    "psrlw $8, %%mm7;" /* get Y odd 00 Y7 00 Y5 00 Y3 00 Y1 */ \
\
    "psllw $3, %%mm6;" /* Promote precision */ \
    "psllw $3, %%mm7;" /* Promote precision */ \
\
    "psubw "Y_OFFSET"(%4), %%mm6;" /* Y -= 16 */ \
    "psubw "Y_OFFSET"(%4), %%mm7;" /* Y -= 16 */ \
\
    "pmulhw "Y_COEFF"(%4), %%mm6;" /* Mul 4 Y even 00 y6 00 y4 00 y2 00 y0 */ \
    "pmulhw "Y_COEFF"(%4), %%mm7;" /* Mul 4 Y odd 00 y7 00 y5 00 y3 00 y1 */ \
\
    /* Do the addition part of the conversion for even and odd pixels, */ \
    /* register usage: */ \
    /* mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels, */ \
    /* mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels, */ \
    /* mm6 -> Y even, mm7 -> Y odd */ \
    "movq %%mm0, %%mm3;" /* Copy Cblue */ \
    "movq %%mm1, %%mm4;" /* Copy Cred */ \
    "movq %%mm2, %%mm5;" /* Copy Cgreen */ \
\
    "paddsw %%mm6, %%mm0;" /* Y even + Cblue 00 B6 00 B4 00 B2 00 B0 */ \
    "paddsw %%mm7, %%mm3;" /* Y odd + Cblue 00 B7 00 B5 00 B3 00 B1 */ \
\
    "paddsw %%mm6, %%mm1;" /* Y even + Cred 00 R6 00 R4 00 R2 00 R0 */ \
    "paddsw %%mm7, %%mm4;" /* Y odd + Cred 00 R7 00 R5 00 R3 00 R1 */ \
\
    "paddsw %%mm6, %%mm2;" /* Y even + Cgreen 00 G6 00 G4 00 G2 00 G0 */ \
    "paddsw %%mm7, %%mm5;" /* Y odd + Cgreen 00 G7 00 G5 00 G3 00 G1 */ \
\
    /* Limit RGB even to 0..255 */ \
    "packuswb %%mm0, %%mm0;" /* B6 B4 B2 B0  B6 B4 B2 B0 */ \
    "packuswb %%mm1, %%mm1;" /* R6 R4 R2 R0  R6 R4 R2 R0 */ \
    "packuswb %%mm2, %%mm2;" /* G6 G4 G2 G0  G6 G4 G2 G0 */ \
\
    /* Limit RGB odd to 0..255 */ \
    "packuswb %%mm3, %%mm3;" /* B7 B5 B3 B1  B7 B5 B3 B1 */ \
    "packuswb %%mm4, %%mm4;" /* R7 R5 R3 R1  R7 R5 R3 R1 */ \
    "packuswb %%mm5, %%mm5;" /* G7 G5 G3 G1  G7 G5 G3 G1 */ \
\
    /* Interleave RGB even and odd */ \
    "punpcklbw %%mm3, %%mm0;" /* B7 B6 B5 B4 B3 B2 B1 B0 */ \
    "punpcklbw %%mm4, %%mm1;" /* R7 R6 R5 R4 R3 R2 R1 R0 */ \
    "punpcklbw %%mm5, %%mm2;" /* G7 G6 G5 G4 G3 G2 G1 G0 */

/*
 * Convert a slice of planar YUV to packed 16 bpp RGB (5 bits red, 6 bits
 * green, 5 bits blue per pixel).
 *
 * c          context; srcFormat, dstW and the address of redDither are used
 *            (the latter is the base for the YUV2RGB offset/coefficient loads)
 * src        src[0] = Y plane, src[1] = Cb plane, src[2] = Cr plane
 * srcStride  byte strides of the three source planes; not modified
 * srcSliceY  first destination row of the slice
 * srcSliceH  number of rows to convert
 * dst        dst[0] = destination image
 * dstStride  dstStride[0] = byte stride of the destination image
 *
 * Returns srcSliceH.
 *
 * For PIX_FMT_YUV422P input the chroma strides are doubled so that (y>>1)
 * addresses every second chroma row.
 *
 * NOTE(review): the asm loop tests its counter at the bottom, so one group of
 * 8 pixels is always written even when h_size ends up <= 0.  It also pre-loads
 * the next group's input, i.e. it reads 8 luma / 4 chroma bytes beyond the
 * last converted group.
 */
static inline int RENAME(yuv420_rgb16)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]){
    int y, h_size;
    /* Work on copies: doubling srcStride[] in place would corrupt the
       caller's array and compound on every call. */
    int uStride = srcStride[1];
    int vStride = srcStride[2];

    if(c->srcFormat == PIX_FMT_YUV422P){
        uStride *= 2;
        vStride *= 2;
    }

    /* width rounded up to whole groups of 8 pixels, minus one group if that
       would not fit into a destination row */
    h_size= (c->dstW+7)&~7;
    if(h_size*2 > dstStride[0]) h_size-=8;

    __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );

    for (y= 0; y<srcSliceH; y++ ) {
        uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0];
        uint8_t *_py = src[0] + y*srcStride[0];
        uint8_t *_pu = src[1] + (y>>1)*uStride;
        uint8_t *_pv = src[2] + (y>>1)*vStride;
        /* negative chroma index counting up to 0; the plane pointers passed
           to the asm are biased by it so they point at the end of the row */
        long index= -h_size/2;

        b5Dither= dither8[y&1];
        g6Dither= dither4[y&1];
        g5Dither= dither8[y&1];
        r5Dither= dither8[(y+1)&1];

        /* this mmx assembly code deals with a SINGLE scan line at a time, it
           converts 8 pixels in each iteration */
        __asm__ __volatile__ (
        /* load data for start of next scan line */
            "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
            "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
            "movq (%5, %0, 2), %%mm6;" /* Load 8  Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
            "1: \n\t"
/* no speed difference on my p3@500 with prefetch,
 * if it is faster for anyone with -benchmark then tell me
            PREFETCH" 64(%0) \n\t"
            PREFETCH" 64(%1) \n\t"
            PREFETCH" 64(%2) \n\t"
*/
YUV2RGB

#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm0;"
            "paddusb "MANGLE(g6Dither)", %%mm2;"
            "paddusb "MANGLE(r5Dither)", %%mm1;"
#endif
            /* mask unneeded bits off */
            "pand "MANGLE(mmx_redmask)", %%mm0;" /* b7b6b5b4 b3_0_0_0 b7b6b5b4 b3_0_0_0 */
            "pand "MANGLE(mmx_grnmask)", %%mm2;" /* g7g6g5g4 g3g2_0_0 g7g6g5g4 g3g2_0_0 */
            "pand "MANGLE(mmx_redmask)", %%mm1;" /* r7r6r5r4 r3_0_0_0 r7r6r5r4 r3_0_0_0 */

            "psrlw $3,%%mm0;" /* 0_0_0_b7 b6b5b4b3 0_0_0_b7 b6b5b4b3 */
            "pxor %%mm4, %%mm4;" /* zero mm4 */

            "movq %%mm0, %%mm5;" /* Copy B7-B0 */
            "movq %%mm2, %%mm7;" /* Copy G7-G0 */

            /* convert rgb24 plane to rgb16 pack for pixel 0-3 */
            "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */
            "punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */

            "psllw $3, %%mm2;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */
            "por %%mm2, %%mm0;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */

            "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
            MOVNTQ " %%mm0, (%1);" /* store pixel 0-3 */

            /* convert rgb24 plane to rgb16 pack for pixel 4-7 */
            "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */
            "punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */

            "psllw $3, %%mm7;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */
            "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */

            "por %%mm7, %%mm5;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */
            "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */

            MOVNTQ " %%mm5, 8 (%1);" /* store pixel 4-7 */

            "add $16, %1 \n\t"
            "add $4, %0 \n\t"
            " js 1b \n\t"

            : "+r" (index), "+r" (_image)
            : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index)
            /* the asm stores to the destination and reads the source planes
               and dither globals behind the compiler's back */
            : "memory"
            );
    }

    __asm__ __volatile__ (EMMS);

    return srcSliceH;
}

/*
 * Convert a slice of planar YUV to packed 15 bpp RGB (5 bits each for red,
 * green and blue; the top bit of every 16-bit pixel is 0).
 *
 * c          context; srcFormat, dstW and the address of redDither are used
 *            (the latter is the base for the YUV2RGB offset/coefficient loads)
 * src        src[0] = Y plane, src[1] = Cb plane, src[2] = Cr plane
 * srcStride  byte strides of the three source planes; not modified
 * srcSliceY  first destination row of the slice
 * srcSliceH  number of rows to convert
 * dst        dst[0] = destination image
 * dstStride  dstStride[0] = byte stride of the destination image
 *
 * Returns srcSliceH.
 *
 * For PIX_FMT_YUV422P input the chroma strides are doubled so that (y>>1)
 * addresses every second chroma row.
 *
 * NOTE(review): the asm loop tests its counter at the bottom, so one group of
 * 8 pixels is always written even when h_size ends up <= 0.  It also pre-loads
 * the next group's input, i.e. it reads 8 luma / 4 chroma bytes beyond the
 * last converted group.
 */
static inline int RENAME(yuv420_rgb15)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]){
    int y, h_size;
    /* Work on copies: doubling srcStride[] in place would corrupt the
       caller's array and compound on every call. */
    int uStride = srcStride[1];
    int vStride = srcStride[2];

    if(c->srcFormat == PIX_FMT_YUV422P){
        uStride *= 2;
        vStride *= 2;
    }

    /* width rounded up to whole groups of 8 pixels, minus one group if that
       would not fit into a destination row */
    h_size= (c->dstW+7)&~7;
    if(h_size*2 > dstStride[0]) h_size-=8;

    __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );

    for (y= 0; y<srcSliceH; y++ ) {
        uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0];
        uint8_t *_py = src[0] + y*srcStride[0];
        uint8_t *_pu = src[1] + (y>>1)*uStride;
        uint8_t *_pv = src[2] + (y>>1)*vStride;
        /* negative chroma index counting up to 0; the plane pointers passed
           to the asm are biased by it so they point at the end of the row */
        long index= -h_size/2;

        b5Dither= dither8[y&1];
        g6Dither= dither4[y&1];
        g5Dither= dither8[y&1];
        r5Dither= dither8[(y+1)&1];

        /* this mmx assembly code deals with a SINGLE scan line at a time, it
           converts 8 pixels in each iteration */
        __asm__ __volatile__ (
        /* load data for start of next scan line */
            "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
            "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
            "movq (%5, %0, 2), %%mm6;" /* Load 8  Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
            "1: \n\t"
YUV2RGB

#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm0 \n\t"
            "paddusb "MANGLE(g5Dither)", %%mm2 \n\t"
            "paddusb "MANGLE(r5Dither)", %%mm1 \n\t"
#endif

            /* mask unneeded bits off */
            "pand "MANGLE(mmx_redmask)", %%mm0;" /* b7b6b5b4 b3_0_0_0 b7b6b5b4 b3_0_0_0 */
            "pand "MANGLE(mmx_redmask)", %%mm2;" /* g7g6g5g4 g3_0_0_0 g7g6g5g4 g3_0_0_0 */
            "pand "MANGLE(mmx_redmask)", %%mm1;" /* r7r6r5r4 r3_0_0_0 r7r6r5r4 r3_0_0_0 */

            "psrlw $3,%%mm0;" /* 0_0_0_b7 b6b5b4b3 0_0_0_b7 b6b5b4b3 */
            "psrlw $1,%%mm1;" /* 0_r7r6r5  r4r3_0_0 0_r7r6r5 r4r3_0_0 */
            "pxor %%mm4, %%mm4;" /* zero mm4 */

            "movq %%mm0, %%mm5;" /* Copy B7-B0 */
            "movq %%mm2, %%mm7;" /* Copy G7-G0 */

            /* convert rgb24 plane to rgb15 pack for pixel 0-3 */
            "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3_0_0_0 */
            "punpcklbw %%mm1, %%mm0;" /* 0_r7r6r5 r4r3_0_0 0_0_0_b7 b6b5b4b3 */

            "psllw $2, %%mm2;" /* 0_0_0_0 0_0_g7g6 g5g4g3_0 0_0_0_0 */
            "por %%mm2, %%mm0;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */

            "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
            MOVNTQ " %%mm0, (%1);" /* store pixel 0-3 */

            /* convert rgb24 plane to rgb15 pack for pixel 4-7 */
            "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3_0_0_0 */
            "punpckhbw %%mm1, %%mm5;" /* 0_r7r6r5 r4r3_0_0 0_0_0_b7 b6b5b4b3 */

            "psllw $2, %%mm7;" /* 0_0_0_0 0_0_g7g6 g5g4g3_0 0_0_0_0 */
            "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */

            "por %%mm7, %%mm5;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */
            "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */

            MOVNTQ " %%mm5, 8 (%1);" /* store pixel 4-7 */

            "add $16, %1 \n\t"
            "add $4, %0 \n\t"
            " js 1b \n\t"
            : "+r" (index), "+r" (_image)
            : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index)
            /* the asm stores to the destination and reads the source planes
               and dither globals behind the compiler's back */
            : "memory"
            );
    }

    __asm__ __volatile__ (EMMS);
    return srcSliceH;
}

/*
 * Convert a slice of planar YUV to packed 24 bpp output, 3 bytes per pixel
 * stored in memory as B, G, R.
 *
 * c          context; srcFormat, dstW and the address of redDither are used
 *            (the latter is the base for the YUV2RGB offset/coefficient loads)
 * src        src[0] = Y plane, src[1] = Cb plane, src[2] = Cr plane
 * srcStride  byte strides of the three source planes; not modified
 * srcSliceY  first destination row of the slice
 * srcSliceH  number of rows to convert
 * dst        dst[0] = destination image
 * dstStride  dstStride[0] = byte stride of the destination image
 *
 * Returns srcSliceH.
 *
 * For PIX_FMT_YUV422P input the chroma strides are doubled so that (y>>1)
 * addresses every second chroma row.  With HAVE_MMX2 the packing uses pshufw
 * and the M24A/M24B/M24C masks; otherwise it uses unpack/shift sequences.
 *
 * NOTE(review): the asm loop tests its counter at the bottom, so one group of
 * 8 pixels is always written even when h_size ends up <= 0.  It also pre-loads
 * the next group's input, i.e. it reads 8 luma / 4 chroma bytes beyond the
 * last converted group.
 */
static inline int RENAME(yuv420_rgb24)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]){
    int y, h_size;
    /* Work on copies: doubling srcStride[] in place would corrupt the
       caller's array and compound on every call. */
    int uStride = srcStride[1];
    int vStride = srcStride[2];

    if(c->srcFormat == PIX_FMT_YUV422P){
        uStride *= 2;
        vStride *= 2;
    }

    /* width rounded up to whole groups of 8 pixels, minus one group if that
       would not fit into a destination row */
    h_size= (c->dstW+7)&~7;
    if(h_size*3 > dstStride[0]) h_size-=8;

    __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );

    for (y= 0; y<srcSliceH; y++ ) {
        uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0];
        uint8_t *_py = src[0] + y*srcStride[0];
        uint8_t *_pu = src[1] + (y>>1)*uStride;
        uint8_t *_pv = src[2] + (y>>1)*vStride;
        /* negative chroma index counting up to 0; the plane pointers passed
           to the asm are biased by it so they point at the end of the row */
        long index= -h_size/2;

        /* this mmx assembly code deals with a SINGLE scan line at a time, it
           converts 8 pixels in each iteration */
        __asm__ __volatile__ (
        /* load data for start of next scan line */
            "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
            "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
            "movq (%5, %0, 2), %%mm6;" /* Load 8  Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
            "1: \n\t"
YUV2RGB
        /* mm0=B, %%mm2=G, %%mm1=R */
#ifdef HAVE_MMX2
            "movq "MANGLE(M24A)", %%mm4 \n\t"
            "movq "MANGLE(M24C)", %%mm7 \n\t"
            "pshufw $0x50, %%mm0, %%mm5 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */
            "pshufw $0x50, %%mm2, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */
            "pshufw $0x00, %%mm1, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */

            "pand %%mm4, %%mm5 \n\t" /*    B2        B1       B0 */
            "pand %%mm4, %%mm3 \n\t" /*    G2        G1       G0 */
            "pand %%mm7, %%mm6 \n\t" /*       R1        R0       */

            "psllq $8, %%mm3 \n\t" /* G2        G1       G0    */
            "por %%mm5, %%mm6 \n\t"
            "por %%mm3, %%mm6 \n\t"
            MOVNTQ" %%mm6, (%1) \n\t"

            "psrlq $8, %%mm2 \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */
            "pshufw $0xA5, %%mm0, %%mm5 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */
            "pshufw $0x55, %%mm2, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */
            "pshufw $0xA5, %%mm1, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */

            "pand "MANGLE(M24B)", %%mm5 \n\t" /* B5       B4        B3    */
            "pand %%mm7, %%mm3 \n\t" /*       G4        G3       */
            "pand %%mm4, %%mm6 \n\t" /*    R4        R3       R2 */

            "por %%mm5, %%mm3 \n\t" /* B5    G4 B4     G3 B3    */
            "por %%mm3, %%mm6 \n\t"
            MOVNTQ" %%mm6, 8(%1) \n\t"

            "pshufw $0xFF, %%mm0, %%mm5 \n\t" /* B7 B6 B7 B6  B7 B6 B7 B6 */
            "pshufw $0xFA, %%mm2, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */
            "pshufw $0xFA, %%mm1, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */
            "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */

            "pand %%mm7, %%mm5 \n\t" /*       B7        B6       */
            "pand %%mm4, %%mm3 \n\t" /*    G7        G6       G5 */
            "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7       R6        R5    */
            "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */

            "por %%mm5, %%mm3 \n\t"
            "por %%mm3, %%mm6 \n\t"
            MOVNTQ" %%mm6, 16(%1) \n\t"
            "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
            "pxor %%mm4, %%mm4 \n\t"

#else

            "pxor %%mm4, %%mm4 \n\t"
            "movq %%mm0, %%mm5 \n\t" /* B */
            "movq %%mm1, %%mm6 \n\t" /* R */
            "punpcklbw %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */
            "punpcklbw %%mm4, %%mm1 \n\t" /* 0R0R0R0R 0 */
            "punpckhbw %%mm2, %%mm5 \n\t" /* GBGBGBGB 2 */
            "punpckhbw %%mm4, %%mm6 \n\t" /* 0R0R0R0R 2 */
            "movq %%mm0, %%mm7 \n\t" /* GBGBGBGB 0 */
            "movq %%mm5, %%mm3 \n\t" /* GBGBGBGB 2 */
            "punpcklwd %%mm1, %%mm7 \n\t" /* 0RGB0RGB 0 */
            "punpckhwd %%mm1, %%mm0 \n\t" /* 0RGB0RGB 1 */
            "punpcklwd %%mm6, %%mm5 \n\t" /* 0RGB0RGB 2 */
            "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */

            "movq %%mm7, %%mm2 \n\t" /* 0RGB0RGB 0 */
            "movq %%mm0, %%mm6 \n\t" /* 0RGB0RGB 1 */
            "movq %%mm5, %%mm1 \n\t" /* 0RGB0RGB 2 */
            "movq %%mm3, %%mm4 \n\t" /* 0RGB0RGB 3 */

            "psllq $40, %%mm7 \n\t" /* RGB00000 0 */
            "psllq $40, %%mm0 \n\t" /* RGB00000 1 */
            "psllq $40, %%mm5 \n\t" /* RGB00000 2 */
            "psllq $40, %%mm3 \n\t" /* RGB00000 3 */

            "punpckhdq %%mm2, %%mm7 \n\t" /* 0RGBRGB0 0 */
            "punpckhdq %%mm6, %%mm0 \n\t" /* 0RGBRGB0 1 */
            "punpckhdq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */
            "punpckhdq %%mm4, %%mm3 \n\t" /* 0RGBRGB0 3 */

            "psrlq $8, %%mm7 \n\t" /* 00RGBRGB 0 */
            "movq %%mm0, %%mm6 \n\t" /* 0RGBRGB0 1 */
            "psllq $40, %%mm0 \n\t" /* GB000000 1 */
            "por %%mm0, %%mm7 \n\t" /* GBRGBRGB 0 */
            MOVNTQ" %%mm7, (%1) \n\t"

            "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */

            "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */
            "movq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */
            "psllq $24, %%mm5 \n\t" /* BRGB0000 2 */
            "por %%mm5, %%mm6 \n\t" /* BRGBRGBR 1 */
            MOVNTQ" %%mm6, 8(%1) \n\t"

            "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */

            "psrlq $40, %%mm1 \n\t" /* 000000RG 2 */
            "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */
            "por %%mm3, %%mm1 \n\t" /* RGBRGBRG 2 */
            MOVNTQ" %%mm1, 16(%1) \n\t"

            "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
            "pxor %%mm4, %%mm4 \n\t"
#endif

            "add $24, %1 \n\t"
            "add $4, %0 \n\t"
            " js 1b \n\t"

            : "+r" (index), "+r" (_image)
            : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index)
            /* the asm stores to the destination and reads the source planes
               behind the compiler's back */
            : "memory"
            );
    }

    __asm__ __volatile__ (EMMS);
    return srcSliceH;
}

/*
 * Convert a slice of planar YUV to packed 32 bpp output, 4 bytes per pixel
 * stored in memory as B, G, R, 0.
 *
 * c          context; srcFormat, dstW and the address of redDither are used
 *            (the latter is the base for the YUV2RGB offset/coefficient loads)
 * src        src[0] = Y plane, src[1] = Cb plane, src[2] = Cr plane
 * srcStride  byte strides of the three source planes; not modified
 * srcSliceY  first destination row of the slice
 * srcSliceH  number of rows to convert
 * dst        dst[0] = destination image
 * dstStride  dstStride[0] = byte stride of the destination image
 *
 * Returns srcSliceH.
 *
 * For PIX_FMT_YUV422P input the chroma strides are doubled so that (y>>1)
 * addresses every second chroma row.
 *
 * NOTE(review): the asm loop tests its counter at the bottom, so one group of
 * 8 pixels is always written even when h_size ends up <= 0.  It also pre-loads
 * the next group's input, i.e. it reads 8 luma / 4 chroma bytes beyond the
 * last converted group.
 */
static inline int RENAME(yuv420_rgb32)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
             int srcSliceH, uint8_t* dst[], int dstStride[]){
    int y, h_size;
    /* Work on copies: doubling srcStride[] in place would corrupt the
       caller's array and compound on every call. */
    int uStride = srcStride[1];
    int vStride = srcStride[2];

    if(c->srcFormat == PIX_FMT_YUV422P){
        uStride *= 2;
        vStride *= 2;
    }

    /* width rounded up to whole groups of 8 pixels, minus one group if that
       would not fit into a destination row */
    h_size= (c->dstW+7)&~7;
    if(h_size*4 > dstStride[0]) h_size-=8;

    __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );

    for (y= 0; y<srcSliceH; y++ ) {
        uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0];
        uint8_t *_py = src[0] + y*srcStride[0];
        uint8_t *_pu = src[1] + (y>>1)*uStride;
        uint8_t *_pv = src[2] + (y>>1)*vStride;
        /* negative chroma index counting up to 0; the plane pointers passed
           to the asm are biased by it so they point at the end of the row */
        long index= -h_size/2;

        /* this mmx assembly code deals with a SINGLE scan line at a time, it
           converts 8 pixels in each iteration */
        __asm__ __volatile__ (
        /* load data for start of next scan line */
            "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
            "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
            "movq (%5, %0, 2), %%mm6;" /* Load 8  Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
            "1: \n\t"
YUV2RGB
            /* convert RGB plane to RGB packed format,
               mm0 -> B, mm1 -> R, mm2 -> G, mm3 -> 0,
               mm4 -> GB, mm5 -> AR pixel 4-7,
               mm6 -> GB, mm7 -> AR pixel 0-3 */
            "pxor %%mm3, %%mm3;" /* zero mm3 */

            "movq %%mm0, %%mm6;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
            "movq %%mm1, %%mm7;" /* R7 R6 R5 R4 R3 R2 R1 R0 */

            "movq %%mm0, %%mm4;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
            "movq %%mm1, %%mm5;" /* R7 R6 R5 R4 R3 R2 R1 R0 */

            "punpcklbw %%mm2, %%mm6;" /* G3 B3 G2 B2 G1 B1 G0 B0 */
            "punpcklbw %%mm3, %%mm7;" /* 00 R3 00 R2 00 R1 00 R0 */

            "punpcklwd %%mm7, %%mm6;" /* 00 R1 G1 B1 00 R0 G0 B0 */
            MOVNTQ " %%mm6, (%1);" /* Store ARGB1 ARGB0 */

            "movq %%mm0, %%mm6;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
            "punpcklbw %%mm2, %%mm6;" /* G3 B3 G2 B2 G1 B1 G0 B0 */

            "punpckhwd %%mm7, %%mm6;" /* 00 R3 G3 B3 00 R2 G2 B2 */
            MOVNTQ " %%mm6, 8 (%1);" /* Store ARGB3 ARGB2 */

            "punpckhbw %%mm2, %%mm4;" /* G7 B7 G6 B6 G5 B5 G4 B4 */
            "punpckhbw %%mm3, %%mm5;" /* 00 R7 00 R6 00 R5 00 R4 */

            "punpcklwd %%mm5, %%mm4;" /* 00 R5 G5 B5 00 R4 G4 B4 */
            MOVNTQ " %%mm4, 16 (%1);" /* Store ARGB5 ARGB4 */

            "movq %%mm0, %%mm4;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
            "punpckhbw %%mm2, %%mm4;" /* G7 B7 G6 B6 G5 B5 G4 B4 */

            "punpckhwd %%mm5, %%mm4;" /* 00 R7 G7 B7 00 R6 G6 B6 */
            MOVNTQ " %%mm4, 24 (%1);" /* Store ARGB7 ARGB6 */

            "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
            "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */

            "pxor %%mm4, %%mm4;" /* zero mm4 */
            "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */

            "add $32, %1 \n\t"
            "add $4, %0 \n\t"
            " js 1b \n\t"

            : "+r" (index), "+r" (_image)
            : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index)
            /* the asm stores to the destination and reads the source planes
               behind the compiler's back */
            : "memory"
            );
    }

    __asm__ __volatile__ (EMMS);
    return srcSliceH;
}