Commit e2bad983 authored by Ronald S. Bultje

swscale: reformat x86/swscale_template.c.

Interleave macros and code so that it's easier to find the
actual code that belongs to a function. Also reindent where
appropriate and remove dead code.
parent 71d9c33c
......@@ -73,6 +73,24 @@
: "%"REG_a, "%"REG_d, "%"REG_S\
);
/**
 * Emit one output line per destination plane by invoking the
 * YSCALEYUV2YV12X asm macro once for each plane that is present.
 *
 * Plane handling, as visible in this body:
 *  - chroma: only when uDest is non-NULL; U uses source offset "0" and V uses
 *    AV_STRINGIFY(VOF), both with CHR_MMX_FILTER_OFFSET and width chrDstW.
 *    vDest is not checked separately from uDest.
 *  - alpha:  only when CONFIG_SWSCALE_ALPHA is enabled and aDest is non-NULL,
 *    with ALP_MMX_FILTER_OFFSET and width dstW.
 *  - luma:   always, with LUM_MMX_FILTER_OFFSET and width dstW.
 *
 * The filter/source pointer and size parameters are not referenced directly
 * here; presumably the macro reads what it needs through `c` and the given
 * filter offset -- the macro body is not fully visible, confirm there.
 *
 * @param c        scaler context
 * @param dest     luma destination line
 * @param uDest    U destination line, or NULL to skip both chroma planes
 * @param vDest    V destination line (written whenever uDest is non-NULL)
 * @param aDest    alpha destination line, or NULL
 * @param dstW     width passed for the luma and alpha planes
 * @param chrDstW  width passed for the chroma planes
 */
static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc,
int chrFilterSize, const int16_t **alpSrc,
uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
uint8_t *aDest, long dstW, long chrDstW)
{
/* chroma: U and V are handled as a pair, gated on uDest alone */
if (uDest) {
YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
}
/* alpha: compiled out unless CONFIG_SWSCALE_ALPHA is set */
if (CONFIG_SWSCALE_ALPHA && aDest) {
YSCALEYUV2YV12X( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
}
/* luma is unconditional */
YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
}
#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
__asm__ volatile(\
"lea " offset "(%0), %%"REG_d" \n\t"\
......@@ -135,6 +153,24 @@
: "%"REG_a, "%"REG_d, "%"REG_S\
);
/**
 * Same plane dispatch as the non-"_ar" variant, but every plane goes through
 * the YSCALEYUV2YV12X_ACCURATE asm macro instead of YSCALEYUV2YV12X.
 *
 * Plane handling, as visible in this body:
 *  - chroma: only when uDest is non-NULL; U uses source offset "0" and V uses
 *    AV_STRINGIFY(VOF), both with CHR_MMX_FILTER_OFFSET and width chrDstW.
 *    vDest is not checked separately from uDest.
 *  - alpha:  only when CONFIG_SWSCALE_ALPHA is enabled and aDest is non-NULL,
 *    with ALP_MMX_FILTER_OFFSET and width dstW.
 *  - luma:   always, with LUM_MMX_FILTER_OFFSET and width dstW.
 *
 * The filter/source pointer and size parameters are not referenced directly
 * here; presumably the macro reads what it needs through `c` -- confirm
 * against the macro definition.
 *
 * @param c        scaler context
 * @param dest     luma destination line
 * @param uDest    U destination line, or NULL to skip both chroma planes
 * @param vDest    V destination line (written whenever uDest is non-NULL)
 * @param aDest    alpha destination line, or NULL
 * @param dstW     width passed for the luma and alpha planes
 * @param chrDstW  width passed for the chroma planes
 */
static inline void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc,
int chrFilterSize, const int16_t **alpSrc,
uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
uint8_t *aDest, long dstW, long chrDstW)
{
/* chroma: U and V are handled as a pair, gated on uDest alone */
if (uDest) {
YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
}
/* alpha: compiled out unless CONFIG_SWSCALE_ALPHA is set */
if (CONFIG_SWSCALE_ALPHA && aDest) {
YSCALEYUV2YV12X_ACCURATE( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
}
/* luma is unconditional */
YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
}
#define YSCALEYUV2YV121 \
"mov %2, %%"REG_a" \n\t"\
".p2align 4 \n\t" /* FIXME Unroll? */\
......@@ -148,6 +184,28 @@
"add $8, %%"REG_a" \n\t"\
"jnc 1b \n\t"
/**
 * Convert one line of each present plane with the YSCALEYUV2YV121 asm loop.
 *
 * The four planes are visited in the order V, U, luma, alpha (index 3 down
 * to 0); a plane whose destination pointer is NULL is skipped.  For each
 * plane the asm receives:
 *   %0  the source pointer advanced by the plane width (in int16_t elements),
 *   %1  the destination pointer advanced by the plane width (in bytes),
 *   %2  the negated plane width, which the loop loads into REG_a and steps
 *       by 8 until the add carries.
 *
 * @param c        scaler context (not referenced in this body)
 * @param lumSrc   luma source line
 * @param chrSrc   chroma source line; V is read at chrSrc + VOFW
 * @param alpSrc   alpha source line
 * @param dest     luma destination, or NULL to skip
 * @param uDest    U destination, or NULL to skip
 * @param vDest    V destination, or NULL to skip
 * @param aDest    alpha destination, or NULL to skip
 * @param dstW     width used for the luma and alpha planes
 * @param chrDstW  width used for the chroma planes
 */
static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc,
                                    const int16_t *chrSrc, const int16_t *alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
                                    uint8_t *aDest, long dstW, long chrDstW)
{
    long p= 4;
    /* The sources are int16_t lines, so the table keeps that element type;
     * declaring it as uint8_t* made every initialiser an incompatible
     * pointer conversion.  The pointer values handed to the asm are the same. */
    const int16_t *src[4]= { alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW };
    uint8_t *dst[4]= { aDest, dest, uDest, vDest };
    x86_reg counter[4]= { dstW, dstW, chrDstW, chrDstW };

    while (p--) {
        if (dst[p]) {
            __asm__ volatile(
                YSCALEYUV2YV121
                :: "r" (src[p]), "r" (dst[p] + counter[p]),
                   "g" (-counter[p])
                : "%"REG_a
            );
        }
    }
}
#define YSCALEYUV2YV121_ACCURATE \
"mov %2, %%"REG_a" \n\t"\
"pcmpeqw %%mm7, %%mm7 \n\t"\
......@@ -166,13 +224,28 @@
"add $8, %%"REG_a" \n\t"\
"jnc 1b \n\t"
/*
:: "m" (-lumFilterSize), "m" (-chrFilterSize),
"m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
"r" (dest), "m" (dstW_reg),
"m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
: "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
/**
 * Convert one line of each present plane with the YSCALEYUV2YV121_ACCURATE
 * asm loop.
 *
 * The four planes are visited in the order V, U, luma, alpha (index 3 down
 * to 0); a plane whose destination pointer is NULL is skipped.  For each
 * plane the asm receives:
 *   %0  the source pointer advanced by the plane width (in int16_t elements),
 *   %1  the destination pointer advanced by the plane width (in bytes),
 *   %2  the negated plane width, which the loop loads into REG_a and steps
 *       by 8 until the add carries.
 *
 * @param c        scaler context (not referenced in this body)
 * @param lumSrc   luma source line
 * @param chrSrc   chroma source line; V is read at chrSrc + VOFW
 * @param alpSrc   alpha source line
 * @param dest     luma destination, or NULL to skip
 * @param uDest    U destination, or NULL to skip
 * @param vDest    V destination, or NULL to skip
 * @param aDest    alpha destination, or NULL to skip
 * @param dstW     width used for the luma and alpha planes
 * @param chrDstW  width used for the chroma planes
 */
static inline void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc,
                                       const int16_t *chrSrc, const int16_t *alpSrc,
                                       uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
                                       uint8_t *aDest, long dstW, long chrDstW)
{
    long p= 4;
    /* The sources are int16_t lines, so the table keeps that element type;
     * declaring it as uint8_t* made every initialiser an incompatible
     * pointer conversion.  The pointer values handed to the asm are the same. */
    const int16_t *src[4]= { alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW };
    uint8_t *dst[4]= { aDest, dest, uDest, vDest };
    x86_reg counter[4]= { dstW, dstW, chrDstW, chrDstW };

    while (p--) {
        if (dst[p]) {
            __asm__ volatile(
                YSCALEYUV2YV121_ACCURATE
                :: "r" (src[p]), "r" (dst[p] + counter[p]),
                   "g" (-counter[p])
                : "%"REG_a
            );
        }
    }
}
#define YSCALEYUV2PACKEDX_UV \
__asm__ volatile(\
"xor %%"REG_a", %%"REG_a" \n\t"\
......@@ -362,314 +435,159 @@
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
#define REAL_YSCALEYUV2PACKED(index, c) \
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
"movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
"psraw $3, %%mm0 \n\t"\
"psraw $3, %%mm1 \n\t"\
"movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
"movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
"xor "#index", "#index" \n\t"\
".p2align 4 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
"movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
"psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
"pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
"pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
"psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
"psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
"movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
"movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
"movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
"movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
"psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
"paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
"movq "#b", "#q2" \n\t" /* B */\
"movq "#r", "#t" \n\t" /* R */\
"punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
"punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
"punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
"punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
"movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
"movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
"punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
"punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
"punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
"punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
\
MOVNTQ( q0, (dst, index, 4))\
MOVNTQ( b, 8(dst, index, 4))\
MOVNTQ( q2, 16(dst, index, 4))\
MOVNTQ( q3, 24(dst, index, 4))\
\
"add $8, "#index" \n\t"\
"cmp "#dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
static inline void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc,
int chrFilterSize, const int16_t **alpSrc,
uint8_t *dest, long dstW, long dstY)
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
#define REAL_YSCALEYUV2RGB_UV(index, c) \
"xor "#index", "#index" \n\t"\
".p2align 4 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
"movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
"psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
"pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
"pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
"psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
"psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
"psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
"pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
"pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
YSCALEYUV2PACKEDX_ACCURATE
YSCALEYUV2RGBX
"movq %%mm2, "U_TEMP"(%0) \n\t"
"movq %%mm4, "V_TEMP"(%0) \n\t"
"movq %%mm5, "Y_TEMP"(%0) \n\t"
YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
"movq "Y_TEMP"(%0), %%mm5 \n\t"
"psraw $3, %%mm1 \n\t"
"psraw $3, %%mm7 \n\t"
"packuswb %%mm7, %%mm1 \n\t"
WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
YSCALEYUV2PACKEDX_END
} else {
YSCALEYUV2PACKEDX_ACCURATE
YSCALEYUV2RGBX
"pcmpeqd %%mm7, %%mm7 \n\t"
WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
YSCALEYUV2PACKEDX_END
}
}
#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
"movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
"movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
"movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
"movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
"psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
"paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
static inline void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc,
int chrFilterSize, const int16_t **alpSrc,
uint8_t *dest, long dstW, long dstY)
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
#define REAL_YSCALEYUV2RGB_COEFF(c) \
"pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
"pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
"psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
"pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
"pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
"paddw %%mm3, %%mm4 \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"movq %%mm5, %%mm6 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
"punpcklwd %%mm2, %%mm2 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t"\
"paddw %%mm1, %%mm5 \n\t"\
"paddw %%mm1, %%mm4 \n\t"\
"punpckhwd %%mm0, %%mm0 \n\t"\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t"\
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
YSCALEYUV2PACKEDX
YSCALEYUV2RGBX
YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
"psraw $3, %%mm1 \n\t"
"psraw $3, %%mm7 \n\t"
"packuswb %%mm7, %%mm1 \n\t"
WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
YSCALEYUV2PACKEDX_END
} else {
YSCALEYUV2PACKEDX
YSCALEYUV2RGBX
"pcmpeqd %%mm7, %%mm7 \n\t"
WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
YSCALEYUV2PACKEDX_END
}
}
#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
#define REAL_WRITERGB16(dst, dstw, index) \
"pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
"pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
"pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
"psrlq $3, %%mm2 \n\t"\
\
"movq %%mm2, %%mm1 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm5, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm4 \n\t"\
"punpckhbw %%mm5, %%mm1 \n\t"\
\
"psllq $3, %%mm3 \n\t"\
"psllq $3, %%mm4 \n\t"\
\
"por %%mm3, %%mm2 \n\t"\
"por %%mm4, %%mm1 \n\t"\
\
MOVNTQ(%%mm2, (dst, index, 2))\
MOVNTQ(%%mm1, 8(dst, index, 2))\
\
"add $8, "#index" \n\t"\
"cmp "#dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
#define YSCALEYUV2RGB(index, c) \
REAL_YSCALEYUV2RGB_UV(index, c) \
REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
REAL_YSCALEYUV2RGB_COEFF(c)
static inline void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc,
int chrFilterSize, const int16_t **alpSrc,
uint8_t *dest, long dstW, long dstY)
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
#define REAL_YSCALEYUV2PACKED1(index, c) \
"xor "#index", "#index" \n\t"\
".p2align 4 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
"movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
"psraw $7, %%mm3 \n\t" \
"psraw $7, %%mm4 \n\t" \
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
"psraw $7, %%mm1 \n\t" \
"psraw $7, %%mm7 \n\t" \
#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
#define REAL_YSCALEYUV2RGB1(index, c) \
"xor "#index", "#index" \n\t"\
".p2align 4 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
"movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
"psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
"psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
"psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
"pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
"pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
"pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
"psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
"pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
"pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
"paddw %%mm3, %%mm4 \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"movq %%mm5, %%mm6 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
"punpcklwd %%mm2, %%mm2 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t"\
"paddw %%mm1, %%mm5 \n\t"\
"paddw %%mm1, %%mm4 \n\t"\
"punpckhwd %%mm0, %%mm0 \n\t"\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t"\
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
#define REAL_YSCALEYUV2PACKED1b(index, c) \
"xor "#index", "#index" \n\t"\
".p2align 4 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
"movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
"psrlw $8, %%mm3 \n\t" \
"psrlw $8, %%mm4 \n\t" \
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
"psraw $7, %%mm1 \n\t" \
"psraw $7, %%mm7 \n\t"
#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
// do vertical chrominance interpolation
#define REAL_YSCALEYUV2RGB1b(index, c) \
"xor "#index", "#index" \n\t"\
".p2align 4 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
"movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
"psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
"psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
"psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
"psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
"pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
"pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
"pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
"psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
"pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
"pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
"paddw %%mm3, %%mm4 \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"movq %%mm5, %%mm6 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
"punpcklwd %%mm2, %%mm2 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t"\
"paddw %%mm1, %%mm5 \n\t"\
"paddw %%mm1, %%mm4 \n\t"\
"punpckhwd %%mm0, %%mm0 \n\t"\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t"\
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
"movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
"movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
"psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
"psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
"packuswb %%mm1, %%mm7 \n\t"
#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
YSCALEYUV2PACKEDX_ACCURATE
YSCALEYUV2RGBX
"pxor %%mm7, %%mm7 \n\t"
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
"paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
"paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif
WRITERGB16(%4, %5, %%REGa)
YSCALEYUV2PACKEDX_END
}
#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
"movq "#b", "#q2" \n\t" /* B */\
"movq "#r", "#t" \n\t" /* R */\
"punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
"punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
"punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
"punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
"movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
"movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
"punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
"punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
"punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
"punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
\
MOVNTQ( q0, (dst, index, 4))\
MOVNTQ( b, 8(dst, index, 4))\
MOVNTQ( q2, 16(dst, index, 4))\
MOVNTQ( q3, 24(dst, index, 4))\
\
"add $8, "#index" \n\t"\
"cmp "#dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
static inline void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc,
int chrFilterSize, const int16_t **alpSrc,
uint8_t *dest, long dstW, long dstY)
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
#define REAL_WRITERGB16(dst, dstw, index) \
"pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
"pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
"pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
"psrlq $3, %%mm2 \n\t"\
\
"movq %%mm2, %%mm1 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm5, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm4 \n\t"\
"punpckhbw %%mm5, %%mm1 \n\t"\
\
"psllq $3, %%mm3 \n\t"\
"psllq $3, %%mm4 \n\t"\
\
"por %%mm3, %%mm2 \n\t"\
"por %%mm4, %%mm1 \n\t"\
\
MOVNTQ(%%mm2, (dst, index, 2))\
MOVNTQ(%%mm1, 8(dst, index, 2))\
\
"add $8, "#index" \n\t"\
"cmp "#dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
YSCALEYUV2PACKEDX
YSCALEYUV2RGBX
"pxor %%mm7, %%mm7 \n\t"
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
"paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
"paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
WRITERGB16(%4, %5, %%REGa)
YSCALEYUV2PACKEDX_END
}
#define REAL_WRITERGB15(dst, dstw, index) \
"pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
......@@ -700,6 +618,50 @@
" jb 1b \n\t"
#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
/**
 * Produce one output line through the WRITERGB15 store macro, using the
 * YSCALEYUV2PACKEDX_ACCURATE front end.
 *
 * The whole body is a single asm statement assembled from macros:
 * YSCALEYUV2PACKEDX_ACCURATE opens it, YSCALEYUV2RGBX follows, and
 * YSCALEYUV2PACKEDX_END closes it.  Those macro bodies are not fully visible
 * here; `dummy` and `dstW_reg` are presumably consumed by the operand list
 * inside YSCALEYUV2PACKEDX_END -- confirm against its definition.
 *
 * The filter/source parameters and dstY are not referenced directly in this
 * body.
 *
 * @param c     scaler context
 * @param dest  destination line
 * @param dstW  output width, copied into dstW_reg as an x86_reg
 */
static inline void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc,
int chrFilterSize, const int16_t **alpSrc,
uint8_t *dest, long dstW, long dstY)
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
YSCALEYUV2PACKEDX_ACCURATE
YSCALEYUV2RGBX
/* clear mm7 before the store macro runs */
"pxor %%mm7, %%mm7 \n\t"
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
/* add the per-channel dither values (unsigned saturating byte add),
 * addressed relative to asm operand %0 */
"paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
"paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
"paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif
WRITERGB15(%4, %5, %%REGa)
YSCALEYUV2PACKEDX_END
}
/**
 * Produce one output line through the WRITERGB15 store macro, using the
 * YSCALEYUV2PACKEDX front end.
 *
 * The whole body is a single asm statement assembled from macros:
 * YSCALEYUV2PACKEDX opens it, YSCALEYUV2RGBX follows, and
 * YSCALEYUV2PACKEDX_END closes it.  Those macro bodies are not fully visible
 * here; `dummy` and `dstW_reg` are presumably consumed by the operand list
 * inside YSCALEYUV2PACKEDX_END -- confirm against its definition.
 *
 * The filter/source parameters and dstY are not referenced directly in this
 * body.
 *
 * @param c     scaler context
 * @param dest  destination line
 * @param dstW  output width, copied into dstW_reg as an x86_reg
 */
static inline void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc,
int chrFilterSize, const int16_t **alpSrc,
uint8_t *dest, long dstW, long dstY)
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
YSCALEYUV2PACKEDX
YSCALEYUV2RGBX
/* clear mm7 before the store macro runs */
"pxor %%mm7, %%mm7 \n\t"
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
/* add the per-channel dither values (unsigned saturating byte add),
 * addressed relative to asm operand %0 */
"paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
"paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
"paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
WRITERGB15(%4, %5, %%REGa)
YSCALEYUV2PACKEDX_END
}
#define WRITEBGR24MMX(dst, dstw, index) \
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
"movq %%mm2, %%mm1 \n\t" /* B */\
......@@ -809,132 +771,11 @@
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#endif
#define REAL_WRITEYUY2(dst, dstw, index) \
"packuswb %%mm3, %%mm3 \n\t"\
"packuswb %%mm4, %%mm4 \n\t"\
"packuswb %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm4, %%mm3 \n\t"\
"movq %%mm1, %%mm7 \n\t"\
"punpcklbw %%mm3, %%mm1 \n\t"\
"punpckhbw %%mm3, %%mm7 \n\t"\
\
MOVNTQ(%%mm1, (dst, index, 2))\
MOVNTQ(%%mm7, 8(dst, index, 2))\
\
"add $8, "#index" \n\t"\
"cmp "#dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
/**
 * Emit one output line per destination plane by invoking the
 * YSCALEYUV2YV12X_ACCURATE asm macro once for each plane that is present.
 *
 *  - chroma: only when uDest is non-NULL (vDest is not checked separately);
 *    U uses source offset "0", V uses AV_STRINGIFY(VOF), width chrDstW.
 *  - alpha:  only when CONFIG_SWSCALE_ALPHA is enabled and aDest is non-NULL,
 *    width dstW.
 *  - luma:   always, width dstW.
 *
 * The filter/source pointer and size parameters are not referenced directly
 * in this body.
 */
static inline void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
/* chroma pair, gated on uDest alone */
if (uDest) {
YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
}
/* optional alpha plane */
if (CONFIG_SWSCALE_ALPHA && aDest) {
YSCALEYUV2YV12X_ACCURATE( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
}
/* luma is unconditional */
YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
}
/**
 * Emit one output line per destination plane by invoking the
 * YSCALEYUV2YV12X asm macro once for each plane that is present.
 *
 *  - chroma: only when uDest is non-NULL (vDest is not checked separately);
 *    U uses source offset "0", V uses AV_STRINGIFY(VOF), width chrDstW.
 *  - alpha:  only when CONFIG_SWSCALE_ALPHA is enabled and aDest is non-NULL,
 *    width dstW.
 *  - luma:   always, width dstW.
 *
 * The filter/source pointer and size parameters are not referenced directly
 * in this body.
 */
static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
/* chroma pair, gated on uDest alone */
if (uDest) {
YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
}
/* optional alpha plane */
if (CONFIG_SWSCALE_ALPHA && aDest) {
YSCALEYUV2YV12X( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
}
/* luma is unconditional */
YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
}
/**
 * Convert one line of each present plane with the YSCALEYUV2YV121_ACCURATE
 * asm loop.
 *
 * Planes are visited in the order V, U, luma, alpha (index 3 down to 0);
 * a plane whose destination pointer is NULL is skipped.  The asm receives
 * the source and destination pointers advanced by the plane width, plus the
 * negated width as its third operand, and clobbers REG_a.
 *
 * @param c        scaler context (not referenced in this body)
 * @param lumSrc   luma source line
 * @param chrSrc   chroma source line; V is read at chrSrc + VOFW
 * @param alpSrc   alpha source line
 * @param dest     luma destination, or NULL to skip
 * @param uDest    U destination, or NULL to skip
 * @param vDest    V destination, or NULL to skip
 * @param aDest    alpha destination, or NULL to skip
 * @param dstW     width used for the luma and alpha planes
 * @param chrDstW  width used for the chroma planes
 */
static inline void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
                                       uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
    long p= 4;
    /* Keep the int16_t element type of the sources; a uint8_t* table made
     * every initialiser an incompatible pointer conversion.  The pointer
     * values handed to the asm are unchanged. */
    const int16_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
    uint8_t *dst[4]= {aDest, dest, uDest, vDest};
    x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};

    while(p--) {
        if (dst[p]) {
            __asm__ volatile(
                YSCALEYUV2YV121_ACCURATE
                :: "r" (src[p]), "r" (dst[p] + counter[p]),
                   "g" (-counter[p])
                : "%"REG_a
            );
        }
    }
}
/**
 * Convert one line of each present plane with the YSCALEYUV2YV121 asm loop.
 *
 * Planes are visited in the order V, U, luma, alpha (index 3 down to 0);
 * a plane whose destination pointer is NULL is skipped.  The asm receives
 * the source and destination pointers advanced by the plane width, plus the
 * negated width as its third operand, and clobbers REG_a.
 *
 * @param c        scaler context (not referenced in this body)
 * @param lumSrc   luma source line
 * @param chrSrc   chroma source line; V is read at chrSrc + VOFW
 * @param alpSrc   alpha source line
 * @param dest     luma destination, or NULL to skip
 * @param uDest    U destination, or NULL to skip
 * @param vDest    V destination, or NULL to skip
 * @param aDest    alpha destination, or NULL to skip
 * @param dstW     width used for the luma and alpha planes
 * @param chrDstW  width used for the chroma planes
 */
static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
    long p= 4;
    /* Keep the int16_t element type of the sources; a uint8_t* table made
     * every initialiser an incompatible pointer conversion.  The pointer
     * values handed to the asm are unchanged. */
    const int16_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
    uint8_t *dst[4]= {aDest, dest, uDest, vDest};
    x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};

    while(p--) {
        if (dst[p]) {
            __asm__ volatile(
                YSCALEYUV2YV121
                :: "r" (src[p]), "r" (dst[p] + counter[p]),
                   "g" (-counter[p])
                : "%"REG_a
            );
        }
    }
}
/**
 * vertical scale YV12 to RGB
 *
 * Each branch is a single asm statement assembled from macros
 * (YSCALEYUV2PACKEDX_ACCURATE ... YSCALEYUV2PACKEDX_END) and ends in the
 * WRITEBGR32 store macro.
 *
 * - Alpha branch (CONFIG_SWSCALE_ALPHA enabled and c->alpPixBuf set): mm2, mm4
 *   and mm5 are spilled to the U_TEMP / V_TEMP / Y_TEMP slots addressed from
 *   asm operand %0, YSCALEYUV2PACKEDX_ACCURATE_YA is run with
 *   ALP_MMX_FILTER_OFFSET, mm5 is reloaded from Y_TEMP, and mm1/mm7 are
 *   shifted right by 3 and packed into mm1, which is passed to WRITEBGR32 as
 *   the alpha register.
 * - Otherwise: mm7 is set to all ones and passed as the alpha register.
 *
 * `dummy` and `dstW_reg` are presumably consumed by the operand list inside
 * YSCALEYUV2PACKEDX_END -- not visible here, confirm against its definition.
 */
static inline void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
YSCALEYUV2PACKEDX_ACCURATE
YSCALEYUV2RGBX
/* spill mm2/mm4/mm5 before the alpha pass runs */
"movq %%mm2, "U_TEMP"(%0) \n\t"
"movq %%mm4, "V_TEMP"(%0) \n\t"
"movq %%mm5, "Y_TEMP"(%0) \n\t"
YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
"movq "Y_TEMP"(%0), %%mm5 \n\t"
/* scale the alpha words down and pack them to bytes in mm1 */
"psraw $3, %%mm1 \n\t"
"psraw $3, %%mm7 \n\t"
"packuswb %%mm7, %%mm1 \n\t"
WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
YSCALEYUV2PACKEDX_END
} else {
YSCALEYUV2PACKEDX_ACCURATE
YSCALEYUV2RGBX
/* no alpha source: all-ones alpha register */
"pcmpeqd %%mm7, %%mm7 \n\t"
WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
YSCALEYUV2PACKEDX_END
}
}
static inline void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
static inline void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc,
int chrFilterSize, const int16_t **alpSrc,
uint8_t *dest, long dstW, long dstY)
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
......@@ -945,8 +786,6 @@ static inline void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilte
"lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
"add %4, %%"REG_c" \n\t"
WRITEBGR24(%%REGc, %5, %%REGa)
:: "r" (&c->redDither),
"m" (dummy), "m" (dummy), "m" (dummy),
"r" (dest), "m" (dstW_reg)
......@@ -954,59 +793,56 @@ static inline void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilte
);
}
static inline void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
static inline void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc,
int chrFilterSize, const int16_t **alpSrc,
uint8_t *dest, long dstW, long dstY)
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
YSCALEYUV2PACKEDX_ACCURATE
YSCALEYUV2PACKEDX
YSCALEYUV2RGBX
"pxor %%mm7, %%mm7 \n\t"
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
"paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
"paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif
WRITERGB15(%4, %5, %%REGa)
YSCALEYUV2PACKEDX_END
"lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
"add %4, %%"REG_c" \n\t"
WRITEBGR24(%%REGc, %5, %%REGa)
:: "r" (&c->redDither),
"m" (dummy), "m" (dummy), "m" (dummy),
"r" (dest), "m" (dstW_reg)
: "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
);
}
/**
 * Produce one output line through the WRITERGB16 store macro, using the
 * YSCALEYUV2PACKEDX_ACCURATE front end.
 *
 * The whole body is a single asm statement assembled from macros:
 * YSCALEYUV2PACKEDX_ACCURATE opens it, YSCALEYUV2RGBX follows, and
 * YSCALEYUV2PACKEDX_END closes it.  `dummy` and `dstW_reg` are presumably
 * consumed by the operand list inside YSCALEYUV2PACKEDX_END -- not visible
 * here, confirm against its definition.
 *
 * The filter/source parameters and dstY are not referenced directly in this
 * body.
 */
static inline void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
YSCALEYUV2PACKEDX_ACCURATE
YSCALEYUV2RGBX
/* clear mm7 before the store macro runs */
"pxor %%mm7, %%mm7 \n\t"
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
/* add the per-channel dither values (unsigned saturating byte add),
 * addressed relative to asm operand %0 */
"paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
"paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
"paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif
WRITERGB16(%4, %5, %%REGa)
YSCALEYUV2PACKEDX_END
}
#define REAL_WRITEYUY2(dst, dstw, index) \
"packuswb %%mm3, %%mm3 \n\t"\
"packuswb %%mm4, %%mm4 \n\t"\
"packuswb %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm4, %%mm3 \n\t"\
"movq %%mm1, %%mm7 \n\t"\
"punpcklbw %%mm3, %%mm1 \n\t"\
"punpckhbw %%mm3, %%mm7 \n\t"\
\
MOVNTQ(%%mm1, (dst, index, 2))\
MOVNTQ(%%mm7, 8(dst, index, 2))\
\
"add $8, "#index" \n\t"\
"cmp "#dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
static inline void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
static inline void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc,
int chrFilterSize, const int16_t **alpSrc,
uint8_t *dest, long dstW, long dstY)
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
YSCALEYUV2PACKEDX_ACCURATE
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
"psraw $3, %%mm3 \n\t"
"psraw $3, %%mm4 \n\t"
"psraw $3, %%mm1 \n\t"
......@@ -1015,117 +851,108 @@ static inline void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFil
YSCALEYUV2PACKEDX_END
}
static inline void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
static inline void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc,
int chrFilterSize, const int16_t **alpSrc,
uint8_t *dest, long dstW, long dstY)
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
YSCALEYUV2PACKEDX
YSCALEYUV2RGBX
YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
"psraw $3, %%mm3 \n\t"
"psraw $3, %%mm4 \n\t"
"psraw $3, %%mm1 \n\t"
"psraw $3, %%mm7 \n\t"
"packuswb %%mm7, %%mm1 \n\t"
WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
YSCALEYUV2PACKEDX_END
} else {
YSCALEYUV2PACKEDX
YSCALEYUV2RGBX
"pcmpeqd %%mm7, %%mm7 \n\t"
WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
YSCALEYUV2PACKEDX_END
}
}
/**
 * Produce one output line through the WRITEBGR24 store macro, using the
 * YSCALEYUV2PACKEDX front end.
 *
 * Unlike the variants that end in YSCALEYUV2PACKEDX_END, this function spells
 * out the asm operand and clobber lists itself:
 *   %0      address of c->redDither
 *   %1..%3  `dummy` memory operands (placeholders for the operand numbering)
 *   %4      dest
 *   %5      dstW_reg
 * REG_a, REG_c, REG_d and REG_S are declared clobbered.
 *
 * The filter/source parameters and dstY are not referenced directly in this
 * body.
 */
static inline void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
YSCALEYUV2PACKEDX
YSCALEYUV2RGBX
"pxor %%mm7, %%mm7 \n\t"
/* REG_c = dest + 3 * REG_a; REG_c is then used as the store address */
"lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
"add %4, %%"REG_c" \n\t"
WRITEBGR24(%%REGc, %5, %%REGa)
:: "r" (&c->redDither),
"m" (dummy), "m" (dummy), "m" (dummy),
"r" (dest), "m" (dstW_reg)
: "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
);
}
static inline void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
YSCALEYUV2PACKEDX
YSCALEYUV2RGBX
"pxor %%mm7, %%mm7 \n\t"
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
"paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
"paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
WRITERGB15(%4, %5, %%REGa)
WRITEYUY2(%4, %5, %%REGa)
YSCALEYUV2PACKEDX_END
}
static inline void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
YSCALEYUV2PACKEDX
YSCALEYUV2RGBX
"pxor %%mm7, %%mm7 \n\t"
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
"paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
"paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
/*
 * Chroma stage of the two-line blending YUV->RGB loop (inline-asm text).
 *
 * index: register used as loop counter / byte offset; it is zeroed here.
 * c:     base register for the *_OFFSET / *_COEFF context fields.
 *
 * Emits the local label "1:" (loop head) and reads asm operands %2 and %3
 * (named uvbuf0/uvbuf1 by the callers visible in this file), each at offset 0
 * and at offset VOF. For both planes it computes
 *     (line1 >> 4) + pmulhw(line0 - line1, weight)
 * with the weight loaded from CHR_MMX_FILTER_OFFSET+8(c), subtracts
 * U_OFFSET / V_OFFSET and pre-multiplies the green contributions.
 *
 * On exit: mm2 = offset-corrected U, mm3 = ug, mm4 = vg,
 *          mm5 = offset-corrected V; mm0 is clobbered.
 *
 * The last body line intentionally has no line continuation, so the macro
 * cannot absorb whatever source line follows it.
 */
#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax] */\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax] */\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+VOF] */\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+VOF] */\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax] */\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+VOF] - uvbuf1[eax+VOF] */\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* uvalpha1 weight */\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1 >> 16 */\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+VOF] - uvbuf1[eax+VOF])uvalpha1 >> 16 */\
    "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >> 4 */\
    "psraw $4, %%mm4 \n\t" /* uvbuf1[eax+VOF] >> 4 */\
    "paddw %%mm2, %%mm3 \n\t" /* blended U: (uvbuf1 >> 4) + weighted difference */\
    "paddw %%mm5, %%mm4 \n\t" /* blended V: (uvbuf1 >> 4) + weighted difference */\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8 */\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8 */\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8 */\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8 */\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */
WRITERGB16(%4, %5, %%REGa)
YSCALEYUV2PACKEDX_END
}
/*
 * Luma (or alpha) stage of the two-line blending loop (inline-asm text).
 *
 * index:  register holding the loop offset; memory is addressed at index*2.
 * c:      base register for the context; the weight is read from
 *         LUM_MMX_FILTER_OFFSET+8(c).
 * b1, b2: operands/registers giving the bases of the two input lines.
 *
 * Loads two quadwords (offsets 0 and 8) from each line and computes
 *     (b2 >> 4) + pmulhw(b1 - b2, weight)
 * for both.
 *
 * On exit: mm1 = blended first quadword, mm7 = blended second quadword;
 *          mm0 and mm6 are clobbered.
 *
 * The last body line intentionally has no line continuation, so the macro
 * cannot absorb whatever source line follows it.
 */
#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
    "movq ("#b1", "#index", 2), %%mm0 \n\t" /* buf0[eax] */\
    "movq ("#b2", "#index", 2), %%mm1 \n\t" /* buf1[eax] */\
    "movq 8("#b1", "#index", 2), %%mm6 \n\t" /* buf0[eax+4] */\
    "movq 8("#b2", "#index", 2), %%mm7 \n\t" /* buf1[eax+4] */\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax] */\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax+4] - buf1[eax+4] */\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1 >> 16 */\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax+4] - buf1[eax+4])yalpha1 >> 16 */\
    "psraw $4, %%mm1 \n\t" /* buf1[eax] >> 4 */\
    "psraw $4, %%mm7 \n\t" /* buf1[eax+4] >> 4 */\
    "paddw %%mm0, %%mm1 \n\t" /* (buf1[eax] >> 4) + weighted difference */\
    "paddw %%mm6, %%mm7 \n\t" /* (buf1[eax+4] >> 4) + weighted difference */
static inline void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
/*
 * Colour-matrix stage shared by the YUV->RGB loops (inline-asm text).
 *
 * c: base register for the context fields UB_COEFF, VR_COEFF, Y_OFFSET and
 *    Y_COEFF.
 *
 * Expects on entry: mm1/mm7 = two quadwords of luma, mm2 = offset-corrected U,
 * mm3 = ug, mm4 = vg, mm5 = offset-corrected V (the register layout left by
 * the preceding chroma and luma stages).
 *
 * Scales the chroma terms, removes the luma offset, scales luma, spreads each
 * chroma word over two adjacent pixels (punpcklwd/punpckhwd of a register
 * with itself), adds luma and packs the results to unsigned bytes with
 * saturation.
 *
 * On exit: mm2, mm4 and mm5 hold the packed results of the B, G and R sums
 * respectively; mm0, mm1, mm3, mm6 and mm7 are clobbered or left holding
 * intermediates.
 *
 * The last body line intentionally has no line continuation, so the macro
 * cannot absorb whatever source line follows it.
 */
#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t" /* ug + vg */\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t" /* duplicate the low chroma words */\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t" /* duplicate the high chroma words */\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"
YSCALEYUV2PACKEDX
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
/* Expansion wrapper: lets the arguments be macro-expanded before REAL_YSCALEYUV2RGB_YA stringifies them with '#'. */
#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
"psraw $3, %%mm3 \n\t"
"psraw $3, %%mm4 \n\t"
"psraw $3, %%mm1 \n\t"
"psraw $3, %%mm7 \n\t"
WRITEYUY2(%4, %5, %%REGa)
YSCALEYUV2PACKEDX_END
}
/*
 * Full two-line blending YUV->RGB loop body (inline-asm text), built by
 * chaining three stages in this order:
 *   1. REAL_YSCALEYUV2RGB_UV   - zeroes `index`, opens label "1:", blends chroma;
 *   2. REAL_YSCALEYUV2RGB_YA   - blends luma read through asm operands %0 and %1;
 *   3. REAL_YSCALEYUV2RGB_COEFF - applies the coefficients and packs to bytes.
 * The stages communicate only through MMX registers, so the order must not
 * be changed. `index` and `c` are passed straight to the REAL_ macros, which
 * stringify them.
 */
#define YSCALEYUV2RGB(index, c) \
REAL_YSCALEYUV2RGB_UV(index, c) \
REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
REAL_YSCALEYUV2RGB_COEFF(c)
/**
* vertical bilinear scale YV12 to RGB
*/
static inline void RENAME(yuv2rgb32_2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
static inline void RENAME(yuv2rgb32_2)(SwsContext *c, const uint16_t *buf0,
const uint16_t *buf1, const uint16_t *uvbuf0,
const uint16_t *uvbuf1, const uint16_t *abuf0,
const uint16_t *abuf1, uint8_t *dest,
int dstW, int yalpha, int uvalpha, int y)
{
if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
#if ARCH_X86_64
......@@ -1136,10 +963,9 @@ static inline void RENAME(yuv2rgb32_2)(SwsContext *c, const uint16_t *buf0, cons
"psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
"packuswb %%mm7, %%mm1 \n\t"
WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
"a" (&c->redDither)
,"r" (abuf0), "r" (abuf1)
"a" (&c->redDither),
"r" (abuf0), "r" (abuf1)
: "%r8"
);
#else
......@@ -1163,7 +989,6 @@ static inline void RENAME(yuv2rgb32_2)(SwsContext *c, const uint16_t *buf0, cons
WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
"a" (&c->redDither)
);
......@@ -1178,15 +1003,17 @@ static inline void RENAME(yuv2rgb32_2)(SwsContext *c, const uint16_t *buf0, cons
WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
"a" (&c->redDither)
);
}
}
static inline void RENAME(yuv2bgr24_2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
static inline void RENAME(yuv2bgr24_2)(SwsContext *c, const uint16_t *buf0,
const uint16_t *buf1, const uint16_t *uvbuf0,
const uint16_t *uvbuf1, const uint16_t *abuf0,
const uint16_t *abuf1, uint8_t *dest,
int dstW, int yalpha, int uvalpha, int y)
{
//Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
__asm__ volatile(
......@@ -1203,8 +1030,11 @@ static inline void RENAME(yuv2bgr24_2)(SwsContext *c, const uint16_t *buf0, cons
);
}
static inline void RENAME(yuv2rgb555_2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
static inline void RENAME(yuv2rgb555_2)(SwsContext *c, const uint16_t *buf0,
const uint16_t *buf1, const uint16_t *uvbuf0,
const uint16_t *uvbuf1, const uint16_t *abuf0,
const uint16_t *abuf1, uint8_t *dest,
int dstW, int yalpha, int uvalpha, int y)
{
//Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
__asm__ volatile(
......@@ -1219,18 +1049,19 @@ static inline void RENAME(yuv2rgb555_2)(SwsContext *c, const uint16_t *buf0, con
"paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
"paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
WRITERGB15(%%REGb, 8280(%5), %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
"a" (&c->redDither)
);
}
static inline void RENAME(yuv2rgb565_2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
static inline void RENAME(yuv2rgb565_2)(SwsContext *c, const uint16_t *buf0,
const uint16_t *buf1, const uint16_t *uvbuf0,
const uint16_t *uvbuf1, const uint16_t *abuf0,
const uint16_t *abuf1, uint8_t *dest,
int dstW, int yalpha, int uvalpha, int y)
{
//Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
__asm__ volatile(
......@@ -1245,7 +1076,6 @@ static inline void RENAME(yuv2rgb565_2)(SwsContext *c, const uint16_t *buf0, con
"paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
"paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
WRITERGB16(%%REGb, 8280(%5), %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
......@@ -1254,8 +1084,49 @@ static inline void RENAME(yuv2rgb565_2)(SwsContext *c, const uint16_t *buf0, con
);
}
static inline void RENAME(yuv2yuyv422_2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
/*
 * Two-line blending loop head for packed YUV output (inline-asm text).
 *
 * index: register used as loop counter / byte offset; it is zeroed here.
 * c:     base register for the context fields.
 *
 * Before the loop, the weights stored at CHR_MMX_FILTER_OFFSET+8(c) and
 * LUM_MMX_FILTER_OFFSET+8(c) are shifted right by 3 and written back to the
 * same locations, i.e. this macro modifies the context memory in place.
 *
 * Inside the loop (label "1:") it reads asm operands %2/%3 (chroma lines, at
 * offset 0 and offset VOF) and %0/%1 (luma lines, at index*2 and index*2+8)
 * and computes for each
 *     (line1 >> 7) + pmulhw(line0 - line1, weight).
 *
 * On exit: mm1/mm7 = blended luma, mm3 = blended U, mm4 = blended V;
 *          mm0, mm2, mm5 and mm6 are clobbered.
 *
 * The last body line intentionally has no line continuation; with one, the
 * `#define YSCALEYUV2PACKED` line below would be spliced into this macro.
 */
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
    "psraw $3, %%mm0 \n\t"\
    "psraw $3, %%mm1 \n\t"\
    "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t" /* store the rescaled chroma weight back */\
    "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t" /* store the rescaled luma weight back */\
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax] */\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax] */\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+VOF] */\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+VOF] */\
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax] */\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+VOF] - uvbuf1[eax+VOF] */\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1 >> 16 */\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+VOF] - uvbuf1[eax+VOF])uvalpha1 >> 16 */\
    "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >> 7 */\
    "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+VOF] >> 7 */\
    "paddw %%mm2, %%mm3 \n\t" /* blended U */\
    "paddw %%mm5, %%mm4 \n\t" /* blended V */\
    "movq (%0, "#index", 2), %%mm0 \n\t" /* buf0[eax] */\
    "movq (%1, "#index", 2), %%mm1 \n\t" /* buf1[eax] */\
    "movq 8(%0, "#index", 2), %%mm6 \n\t" /* buf0[eax+4] */\
    "movq 8(%1, "#index", 2), %%mm7 \n\t" /* buf1[eax+4] */\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax] */\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax+4] - buf1[eax+4] */\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1 >> 16 */\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax+4] - buf1[eax+4])yalpha1 >> 16 */\
    "psraw $7, %%mm1 \n\t" /* buf1[eax] >> 7 */\
    "psraw $7, %%mm7 \n\t" /* buf1[eax+4] >> 7 */\
    "paddw %%mm0, %%mm1 \n\t" /* blended luma, first quadword */\
    "paddw %%mm6, %%mm7 \n\t" /* blended luma, second quadword */

/* Expansion wrapper: arguments are macro-expanded before being stringified. */
#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
static inline void RENAME(yuv2yuyv422_2)(SwsContext *c, const uint16_t *buf0,
const uint16_t *buf1, const uint16_t *uvbuf0,
const uint16_t *uvbuf1, const uint16_t *abuf0,
const uint16_t *abuf1, uint8_t *dest,
int dstW, int yalpha, int uvalpha, int y)
{
//Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
__asm__ volatile(
......@@ -1271,11 +1142,125 @@ static inline void RENAME(yuv2yuyv422_2)(SwsContext *c, const uint16_t *buf0, co
);
}
/*
 * Single-line YUV->RGB loop body (inline-asm text): no blending between
 * lines, only asm operand %2 (chroma, at offset 0 and offset VOF) and
 * operand %0 (luma, at index*2 and index*2+8) are read.
 *
 * index: register used as loop counter / byte offset; it is zeroed here.
 * c:     base register for the *_OFFSET / *_COEFF context fields.
 *
 * Emits the local label "1:" (loop head), shifts every input right by 4,
 * removes the U/V/Y offsets, applies the coefficients and packs the sums to
 * unsigned bytes with saturation.
 *
 * On exit: mm2, mm4 and mm5 hold the packed results of the B, G and R sums
 * respectively; mm0, mm1, mm3, mm6 and mm7 are clobbered or left holding
 * intermediates.
 *
 * The last body line intentionally has no line continuation; with one, the
 * `#define YSCALEYUV2RGB1` line below would be spliced into this macro.
 */
#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax] */\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+VOF] */\
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >> 4 */\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+VOF] >> 4 */\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8 */\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8 */\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8 */\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8 */\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /* buf0[eax] */\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /* buf0[eax+4] */\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] >> 4 */\
    "psraw $4, %%mm7 \n\t" /* buf0[eax+4] >> 4 */\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"

/* Expansion wrapper: arguments are macro-expanded before being stringified. */
#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
/*
 * Single-luma-line YUV->RGB loop body with vertical chrominance
 * interpolation (inline-asm text).
 *
 * index: register used as loop counter / byte offset; it is zeroed here.
 * c:     base register for the *_OFFSET / *_COEFF context fields.
 *
 * Chroma is read from asm operands %2 and %3 (at offset 0 and offset VOF);
 * the two lines are added and logically shifted right by 5, i.e. averaged
 * and scaled like the >>4 of YSCALEYUV2RGB1. As the FIXME notes, the 16-bit
 * sum may overflow. Luma is read from operand %0 only and shifted right by 4.
 * The rest is the same offset/coefficient/pack sequence as YSCALEYUV2RGB1.
 *
 * On exit: mm2, mm4 and mm5 hold the packed results of the B, G and R sums
 * respectively; mm0, mm1, mm3, mm6 and mm7 are clobbered or left holding
 * intermediates.
 *
 * The last body line intentionally has no line continuation; with one, the
 * `#define YSCALEYUV2RGB1b` line below would be spliced into this macro.
 */
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax] */\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax] */\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+VOF] */\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+VOF] */\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax] */\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+VOF] + uvbuf1[eax+VOF] */\
    "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
    "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8 */\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8 */\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8 */\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8 */\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /* buf0[eax] */\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /* buf0[eax+4] */\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] >> 4 */\
    "psraw $4, %%mm7 \n\t" /* buf0[eax+4] >> 4 */\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"

/* Expansion wrapper: arguments are macro-expanded before being stringified. */
#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
/*
 * Alpha load for the single-line RGB32 path (inline-asm text).
 *
 * index: register holding the loop offset; memory is addressed at index*2.
 *
 * Reads two quadwords from asm operand %1 (abuf0 in the callers visible in
 * this file), shifts each 16-bit value right by 7 and packs them to unsigned
 * bytes with saturation.
 *
 * On exit: mm7 = 8 packed alpha bytes; mm1 is clobbered.
 */
#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
"movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
"movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
"psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
"psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
"packuswb %%mm1, %%mm7 \n\t" /* mm7 = saturated bytes of mm7 (low half) and mm1 (high half) */
/* Expansion wrapper: the argument is macro-expanded before being stringified. */
#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
/**
* YV12 to RGB without scaling or interpolating
*/
static inline void RENAME(yuv2rgb32_1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
static inline void RENAME(yuv2rgb32_1)(SwsContext *c, const uint16_t *buf0,
const uint16_t *uvbuf0, const uint16_t *uvbuf1,
const uint16_t *abuf0, uint8_t *dest,
int dstW, int uvalpha, enum PixelFormat dstFormat,
int flags, int y)
{
const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
......@@ -1290,7 +1275,6 @@ static inline void RENAME(yuv2rgb32_1)(SwsContext *c, const uint16_t *buf0, cons
WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
"a" (&c->redDither)
);
......@@ -1304,7 +1288,6 @@ static inline void RENAME(yuv2rgb32_1)(SwsContext *c, const uint16_t *buf0, cons
WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
"a" (&c->redDither)
);
......@@ -1320,7 +1303,6 @@ static inline void RENAME(yuv2rgb32_1)(SwsContext *c, const uint16_t *buf0, cons
WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
"a" (&c->redDither)
);
......@@ -1334,7 +1316,6 @@ static inline void RENAME(yuv2rgb32_1)(SwsContext *c, const uint16_t *buf0, cons
WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
"a" (&c->redDither)
);
......@@ -1342,8 +1323,11 @@ static inline void RENAME(yuv2rgb32_1)(SwsContext *c, const uint16_t *buf0, cons
}
}
static inline void RENAME(yuv2bgr24_1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
static inline void RENAME(yuv2bgr24_1)(SwsContext *c, const uint16_t *buf0,
const uint16_t *uvbuf0, const uint16_t *uvbuf1,
const uint16_t *abuf0, uint8_t *dest,
int dstW, int uvalpha, enum PixelFormat dstFormat,
int flags, int y)
{
const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
......@@ -1357,7 +1341,6 @@ static inline void RENAME(yuv2bgr24_1)(SwsContext *c, const uint16_t *buf0, cons
WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
"a" (&c->redDither)
);
......@@ -1371,15 +1354,17 @@ static inline void RENAME(yuv2bgr24_1)(SwsContext *c, const uint16_t *buf0, cons
WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
"a" (&c->redDither)
);
}
}
static inline void RENAME(yuv2rgb555_1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
static inline void RENAME(yuv2rgb555_1)(SwsContext *c, const uint16_t *buf0,
const uint16_t *uvbuf0, const uint16_t *uvbuf1,
const uint16_t *abuf0, uint8_t *dest,
int dstW, int uvalpha, enum PixelFormat dstFormat,
int flags, int y)
{
const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
......@@ -1399,7 +1384,6 @@ static inline void RENAME(yuv2rgb555_1)(SwsContext *c, const uint16_t *buf0, con
WRITERGB15(%%REGb, 8280(%5), %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
"a" (&c->redDither)
);
......@@ -1419,15 +1403,17 @@ static inline void RENAME(yuv2rgb555_1)(SwsContext *c, const uint16_t *buf0, con
WRITERGB15(%%REGb, 8280(%5), %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
"a" (&c->redDither)
);
}
}
static inline void RENAME(yuv2rgb565_1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
static inline void RENAME(yuv2rgb565_1)(SwsContext *c, const uint16_t *buf0,
const uint16_t *uvbuf0, const uint16_t *uvbuf1,
const uint16_t *abuf0, uint8_t *dest,
int dstW, int uvalpha, enum PixelFormat dstFormat,
int flags, int y)
{
const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
......@@ -1444,11 +1430,9 @@ static inline void RENAME(yuv2rgb565_1)(SwsContext *c, const uint16_t *buf0, con
"paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
"paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
WRITERGB16(%%REGb, 8280(%5), %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
"a" (&c->redDither)
);
......@@ -1465,19 +1449,53 @@ static inline void RENAME(yuv2rgb565_1)(SwsContext *c, const uint16_t *buf0, con
"paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
"paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
WRITERGB16(%%REGb, 8280(%5), %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
"a" (&c->redDither)
);
}
}
static inline void RENAME(yuv2yuyv422_1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
/*
 * Single-line loop head for packed YUV output (inline-asm text): no blending,
 * only asm operand %2 (chroma, at offset 0 and offset VOF) and operand %0
 * (luma, at index*2 and index*2+8) are read.
 *
 * index: register used as loop counter / byte offset; it is zeroed here.
 * c:     accepted for symmetry with the other loop macros; not used in the
 *        body.
 *
 * Emits the local label "1:" (loop head) and shifts every input right by 7
 * (arithmetic).
 *
 * On exit: mm1/mm7 = luma, mm3 = U, mm4 = V.
 *
 * The last body line intentionally has no line continuation; with one, the
 * `#define YSCALEYUV2PACKED1` line below would be spliced into this macro.
 */
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax] */\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+VOF] */\
    "psraw $7, %%mm3 \n\t" \
    "psraw $7, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /* buf0[eax] */\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /* buf0[eax+4] */\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t"

/* Expansion wrapper: arguments are macro-expanded before being stringified. */
#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
/*
 * Single-luma-line loop head for packed YUV output that combines two chroma
 * lines (inline-asm text).
 *
 * index: register used as loop counter / byte offset; it is zeroed here.
 * c:     not used in the body.
 *
 * Emits the local label "1:" (loop head). Chroma is read from asm operands
 * %2 and %3 (at offset 0 and offset VOF); the two lines are added and
 * logically shifted right by 8, i.e. averaged and scaled like the >>7 applied
 * to luma. Luma is read from operand %0 only and arithmetically shifted
 * right by 7.
 *
 * On exit: mm1/mm7 = luma, mm3 = U, mm4 = V; mm2 and mm5 are clobbered.
 */
#define REAL_YSCALEYUV2PACKED1b(index, c) \
"xor "#index", "#index" \n\t"\
".p2align 4 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
"movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+VOF]*/\
"movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+VOF]*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+VOF] + uvbuf1[eax+VOF]*/\
"psrlw $8, %%mm3 \n\t" \
"psrlw $8, %%mm4 \n\t" \
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\
"psraw $7, %%mm1 \n\t" \
"psraw $7, %%mm7 \n\t"
/* Expansion wrapper: arguments are macro-expanded before being stringified. */
#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
static inline void RENAME(yuv2yuyv422_1)(SwsContext *c, const uint16_t *buf0,
const uint16_t *uvbuf0, const uint16_t *uvbuf1,
const uint16_t *abuf0, uint8_t *dest,
int dstW, int uvalpha, enum PixelFormat dstFormat,
int flags, int y)
{
const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
......@@ -1490,7 +1508,6 @@ static inline void RENAME(yuv2yuyv422_1)(SwsContext *c, const uint16_t *buf0, co
WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
"a" (&c->redDither)
);
......@@ -1503,7 +1520,6 @@ static inline void RENAME(yuv2yuyv422_1)(SwsContext *c, const uint16_t *buf0, co
WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
"a" (&c->redDither)
);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment