Commit 986f0d86 authored by Michael Niedermayer

Commits that could not be pulled earlier due to bugs.

    commit 93681fbd
    Author: Ronald S. Bultje <rsbultje@gmail.com>
    Date:   Thu May 26 11:32:32 2011 -0400

        swscale: fix compile on ppc.

    commit e758573a
    Author: Ronald S. Bultje <rsbultje@gmail.com>
    Date:   Thu May 26 10:36:47 2011 -0400

        swscale: fix compile on x86-32.

    commit 0f4eb8b0
    Author: Ronald S. Bultje <rsbultje@gmail.com>
    Date:   Thu May 26 09:17:52 2011 -0400

        swscale: remove VOF/VOFW.

    commit b4a224c5
    Author: Ronald S. Bultje <rsbultje@gmail.com>
    Date:   Wed May 25 14:30:09 2011 -0400

        swscale: split chroma buffers into separate U/V planes.

        Preparatory step to implement support for sizes > VOFW.
parent ea535ed5
...@@ -86,9 +86,11 @@ altivec_packIntArrayToCharArray(int *val, uint8_t* dest, int dstW) ...@@ -86,9 +86,11 @@ altivec_packIntArrayToCharArray(int *val, uint8_t* dest, int dstW)
} }
static inline void static inline void
yuv2yuvX_altivec_real(const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, yuv2yuvX_altivec_real(const int16_t *lumFilter, const int16_t **lumSrc,
const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, int lumFilterSize, const int16_t *chrFilter,
uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW) const int16_t **chrUSrc, const int16_t **chrVSrc,
int chrFilterSize, uint8_t *dest, uint8_t *uDest,
uint8_t *vDest, int dstW, int chrDstW)
{ {
const vector signed int vini = {(1 << 18), (1 << 18), (1 << 18), (1 << 18)}; const vector signed int vini = {(1 << 18), (1 << 18), (1 << 18), (1 << 18)};
register int i, j; register int i, j;
...@@ -159,22 +161,22 @@ yuv2yuvX_altivec_real(const int16_t *lumFilter, const int16_t **lumSrc, int lumF ...@@ -159,22 +161,22 @@ yuv2yuvX_altivec_real(const int16_t *lumFilter, const int16_t **lumSrc, int lumF
vChrFilter = vec_perm(vChrFilter, vChrFilter, perm0); vChrFilter = vec_perm(vChrFilter, vChrFilter, perm0);
vChrFilter = vec_splat(vChrFilter, 0); // chrFilter[j] is loaded 8 times in vChrFilter vChrFilter = vec_splat(vChrFilter, 0); // chrFilter[j] is loaded 8 times in vChrFilter
perm = vec_lvsl(0, chrSrc[j]); perm = vec_lvsl(0, chrUSrc[j]);
l1 = vec_ld(0, chrSrc[j]); l1 = vec_ld(0, chrUSrc[j]);
l1_V = vec_ld(VOFW << 1, chrSrc[j]); l1_V = vec_ld(0, chrVSrc[j]);
for (i = 0; i < (chrDstW - 7); i+=8) { for (i = 0; i < (chrDstW - 7); i+=8) {
int offset = i << 2; int offset = i << 2;
vector signed short l2 = vec_ld((i << 1) + 16, chrSrc[j]); vector signed short l2 = vec_ld((i << 1) + 16, chrUSrc[j]);
vector signed short l2_V = vec_ld(((i + VOFW) << 1) + 16, chrSrc[j]); vector signed short l2_V = vec_ld((i << 1) + 16, chrVSrc[j]);
vector signed int v1 = vec_ld(offset, u); vector signed int v1 = vec_ld(offset, u);
vector signed int v2 = vec_ld(offset + 16, u); vector signed int v2 = vec_ld(offset + 16, u);
vector signed int v1_V = vec_ld(offset, v); vector signed int v1_V = vec_ld(offset, v);
vector signed int v2_V = vec_ld(offset + 16, v); vector signed int v2_V = vec_ld(offset + 16, v);
vector signed short ls = vec_perm(l1, l2, perm); // chrSrc[j][i] ... chrSrc[j][i+7] vector signed short ls = vec_perm(l1, l2, perm); // chrUSrc[j][i] ... chrUSrc[j][i+7]
vector signed short ls_V = vec_perm(l1_V, l2_V, perm); // chrSrc[j][i+VOFW] ... chrSrc[j][i+2055] vector signed short ls_V = vec_perm(l1_V, l2_V, perm); // chrVSrc[j][i] ... chrVSrc[j][i]
vector signed int i1 = vec_mule(vChrFilter, ls); vector signed int i1 = vec_mule(vChrFilter, ls);
vector signed int i2 = vec_mulo(vChrFilter, ls); vector signed int i2 = vec_mulo(vChrFilter, ls);
...@@ -182,9 +184,9 @@ yuv2yuvX_altivec_real(const int16_t *lumFilter, const int16_t **lumSrc, int lumF ...@@ -182,9 +184,9 @@ yuv2yuvX_altivec_real(const int16_t *lumFilter, const int16_t **lumSrc, int lumF
vector signed int i2_V = vec_mulo(vChrFilter, ls_V); vector signed int i2_V = vec_mulo(vChrFilter, ls_V);
vector signed int vf1 = vec_mergeh(i1, i2); vector signed int vf1 = vec_mergeh(i1, i2);
vector signed int vf2 = vec_mergel(i1, i2); // chrSrc[j][i] * chrFilter[j] ... chrSrc[j][i+7] * chrFilter[j] vector signed int vf2 = vec_mergel(i1, i2); // chrUSrc[j][i] * chrFilter[j] ... chrUSrc[j][i+7] * chrFilter[j]
vector signed int vf1_V = vec_mergeh(i1_V, i2_V); vector signed int vf1_V = vec_mergeh(i1_V, i2_V);
vector signed int vf2_V = vec_mergel(i1_V, i2_V); // chrSrc[j][i] * chrFilter[j] ... chrSrc[j][i+7] * chrFilter[j] vector signed int vf2_V = vec_mergel(i1_V, i2_V); // chrVSrc[j][i] * chrFilter[j] ... chrVSrc[j][i+7] * chrFilter[j]
vector signed int vo1 = vec_add(v1, vf1); vector signed int vo1 = vec_add(v1, vf1);
vector signed int vo2 = vec_add(v2, vf2); vector signed int vo2 = vec_add(v2, vf2);
...@@ -200,8 +202,8 @@ yuv2yuvX_altivec_real(const int16_t *lumFilter, const int16_t **lumSrc, int lumF ...@@ -200,8 +202,8 @@ yuv2yuvX_altivec_real(const int16_t *lumFilter, const int16_t **lumSrc, int lumF
l1_V = l2_V; l1_V = l2_V;
} }
for ( ; i < chrDstW; i++) { for ( ; i < chrDstW; i++) {
u[i] += chrSrc[j][i] * chrFilter[j]; u[i] += chrUSrc[j][i] * chrFilter[j];
v[i] += chrSrc[j][i + VOFW] * chrFilter[j]; v[i] += chrVSrc[j][i] * chrFilter[j];
} }
} }
altivec_packIntArrayToCharArray(u, uDest, chrDstW); altivec_packIntArrayToCharArray(u, uDest, chrDstW);
......
...@@ -24,21 +24,28 @@ ...@@ -24,21 +24,28 @@
#endif #endif
#if COMPILE_TEMPLATE_ALTIVEC #if COMPILE_TEMPLATE_ALTIVEC
static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc, const int16_t **lumSrc, int lumFilterSize,
uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW) const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc, int chrFilterSize,
const int16_t **alpSrc,
uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
uint8_t *aDest, long dstW, long chrDstW)
{ {
yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize, yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
chrFilter, chrSrc, chrFilterSize, chrFilter, chrUSrc, chrVSrc, chrFilterSize,
dest, uDest, vDest, dstW, chrDstW); dest, uDest, vDest, dstW, chrDstW);
} }
/** /**
* vertical scale YV12 to RGB * vertical scale YV12 to RGB
*/ */
static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter,
const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **lumSrc, int lumFilterSize,
const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY) const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc, int chrFilterSize,
const int16_t **alpSrc, uint8_t *dest,
long dstW, long dstY)
{ {
/* The following list of supported dstFormat values should /* The following list of supported dstFormat values should
match what's found in the body of ff_yuv2packedX_altivec() */ match what's found in the body of ff_yuv2packedX_altivec() */
...@@ -47,11 +54,11 @@ static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, ...@@ -47,11 +54,11 @@ static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter,
c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 || c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB)) c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB))
ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize, ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
chrFilter, chrSrc, chrFilterSize, chrFilter, chrUSrc, chrVSrc, chrFilterSize,
dest, dstW, dstY); dest, dstW, dstY);
else else
yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize, yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
chrFilter, chrSrc, chrFilterSize, chrFilter, chrUSrc, chrVSrc, chrFilterSize,
alpSrc, dest, dstW, dstY); alpSrc, dest, dstW, dstY);
} }
#endif #endif
......
...@@ -778,10 +778,11 @@ void ff_yuv2rgb_init_tables_altivec(SwsContext *c, const int inv_table[4], int b ...@@ -778,10 +778,11 @@ void ff_yuv2rgb_init_tables_altivec(SwsContext *c, const int inv_table[4], int b
void void
ff_yuv2packedX_altivec(SwsContext *c, ff_yuv2packedX_altivec(SwsContext *c, const int16_t *lumFilter,
const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc,
uint8_t *dest, int dstW, int dstY) const int16_t **chrVSrc, int chrFilterSize,
uint8_t *dest, int dstW, int dstY)
{ {
int i,j; int i,j;
vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V; vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
...@@ -816,9 +817,9 @@ ff_yuv2packedX_altivec(SwsContext *c, ...@@ -816,9 +817,9 @@ ff_yuv2packedX_altivec(SwsContext *c,
V = RND; V = RND;
/* extract 8 coeffs from U,V */ /* extract 8 coeffs from U,V */
for (j=0; j<chrFilterSize; j++) { for (j=0; j<chrFilterSize; j++) {
X = vec_ld (0, &chrSrc[j][i/2]); X = vec_ld (0, &chrUSrc[j][i/2]);
U = vec_mradds (X, CCoeffs[j], U); U = vec_mradds (X, CCoeffs[j], U);
X = vec_ld (0, &chrSrc[j][i/2+VOFW]); X = vec_ld (0, &chrVSrc[j][i/2]);
V = vec_mradds (X, CCoeffs[j], V); V = vec_mradds (X, CCoeffs[j], V);
} }
...@@ -894,9 +895,9 @@ ff_yuv2packedX_altivec(SwsContext *c, ...@@ -894,9 +895,9 @@ ff_yuv2packedX_altivec(SwsContext *c,
V = RND; V = RND;
/* extract 8 coeffs from U,V */ /* extract 8 coeffs from U,V */
for (j=0; j<chrFilterSize; j++) { for (j=0; j<chrFilterSize; j++) {
X = vec_ld (0, &chrSrc[j][i/2]); X = vec_ld (0, &chrUSrc[j][i/2]);
U = vec_mradds (X, CCoeffs[j], U); U = vec_mradds (X, CCoeffs[j], U);
X = vec_ld (0, &chrSrc[j][i/2+VOFW]); X = vec_ld (0, &chrVSrc[j][i/2]);
V = vec_mradds (X, CCoeffs[j], V); V = vec_mradds (X, CCoeffs[j], V);
} }
......
...@@ -301,7 +301,8 @@ uint16_t dither_scale[15][16]={ ...@@ -301,7 +301,8 @@ uint16_t dither_scale[15][16]={
}; };
static av_always_inline void yuv2yuvX16inC_template(const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, static av_always_inline void yuv2yuvX16inC_template(const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc, int chrFilterSize,
const int16_t **alpSrc, uint16_t *dest, uint16_t *uDest, uint16_t *vDest, uint16_t *aDest, const int16_t **alpSrc, uint16_t *dest, uint16_t *uDest, uint16_t *vDest, uint16_t *aDest,
int dstW, int chrDstW, int big_endian, int output_bits) int dstW, int chrDstW, int big_endian, int output_bits)
{ {
...@@ -340,8 +341,8 @@ static av_always_inline void yuv2yuvX16inC_template(const int16_t *lumFilter, co ...@@ -340,8 +341,8 @@ static av_always_inline void yuv2yuvX16inC_template(const int16_t *lumFilter, co
int j; int j;
for (j = 0; j < chrFilterSize; j++) { for (j = 0; j < chrFilterSize; j++) {
u += chrSrc[j][i ] * chrFilter[j]; u += chrUSrc[j][i] * chrFilter[j];
v += chrSrc[j][i + VOFW] * chrFilter[j]; v += chrVSrc[j][i] * chrFilter[j];
} }
output_pixel(&uDest[i], u); output_pixel(&uDest[i], u);
...@@ -362,28 +363,50 @@ static av_always_inline void yuv2yuvX16inC_template(const int16_t *lumFilter, co ...@@ -362,28 +363,50 @@ static av_always_inline void yuv2yuvX16inC_template(const int16_t *lumFilter, co
} }
} }
/*
 * yuv2NBPS(bits, BE_LE, is_be): define a static function named
 * yuv2yuvX<bits><BE_LE>_c (name built by token pasting) that forwards all of
 * its arguments unchanged to yuv2yuvX16inC_template(), appending is_be and
 * bits as that template's trailing big_endian / output_bits arguments.
 *
 *   bits   - pasted into the function name and passed as output_bits
 *   BE_LE  - suffix pasted into the function name (BE or LE below)
 *   is_be  - passed as big_endian (1 for the BE variants, 0 for LE below)
 */
#define yuv2NBPS(bits, BE_LE, is_be) \
static void yuv2yuvX ## bits ## BE_LE ## _c(const int16_t *lumFilter, \
const int16_t **lumSrc, int lumFilterSize, \
const int16_t *chrFilter, const int16_t **chrUSrc, \
const int16_t **chrVSrc, \
int chrFilterSize, const int16_t **alpSrc, \
uint16_t *dest, uint16_t *uDest, uint16_t *vDest, \
uint16_t *aDest, int dstW, int chrDstW) \
{ \
yuv2yuvX16inC_template(lumFilter, lumSrc, lumFilterSize, \
chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
alpSrc, \
dest, uDest, vDest, aDest, \
dstW, chrDstW, is_be, bits); \
}
/* Instantiate the 9-, 10- and 16-bit wrappers, one BE and one LE each. */
yuv2NBPS( 9, BE, 1);
yuv2NBPS( 9, LE, 0);
yuv2NBPS(10, BE, 1);
yuv2NBPS(10, LE, 0);
yuv2NBPS(16, BE, 1);
yuv2NBPS(16, LE, 0);
static inline void yuv2yuvX16inC(const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, static inline void yuv2yuvX16inC(const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize,
const int16_t **alpSrc, uint16_t *dest, uint16_t *uDest, uint16_t *vDest, uint16_t *aDest, int dstW, int chrDstW, const int16_t **alpSrc, uint16_t *dest, uint16_t *uDest, uint16_t *vDest, uint16_t *aDest, int dstW, int chrDstW,
enum PixelFormat dstFormat) enum PixelFormat dstFormat)
{ {
if (isNBPS(dstFormat)) { if (isNBPS(dstFormat)) {
const int depth = av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1+1; const int depth = av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1+1;
yuv2yuvX16inC_template(lumFilter, lumSrc, lumFilterSize, yuv2yuvX16inC_template(lumFilter, lumSrc, lumFilterSize,
chrFilter, chrSrc, chrFilterSize, chrFilter, chrUSrc, chrVSrc, chrFilterSize,
alpSrc, alpSrc,
dest, uDest, vDest, aDest, dest, uDest, vDest, aDest,
dstW, chrDstW, isBE(dstFormat), depth); dstW, chrDstW, isBE(dstFormat), depth);
} else { } else {
if (isBE(dstFormat)) { if (isBE(dstFormat)) {
yuv2yuvX16inC_template(lumFilter, lumSrc, lumFilterSize, yuv2yuvX16inC_template(lumFilter, lumSrc, lumFilterSize,
chrFilter, chrSrc, chrFilterSize, chrFilter, chrUSrc, chrVSrc, chrFilterSize,
alpSrc, alpSrc,
dest, uDest, vDest, aDest, dest, uDest, vDest, aDest,
dstW, chrDstW, 1, 16); dstW, chrDstW, 1, 16);
} else { } else {
yuv2yuvX16inC_template(lumFilter, lumSrc, lumFilterSize, yuv2yuvX16inC_template(lumFilter, lumSrc, lumFilterSize,
chrFilter, chrSrc, chrFilterSize, chrFilter, chrUSrc, chrVSrc, chrFilterSize,
alpSrc, alpSrc,
dest, uDest, vDest, aDest, dest, uDest, vDest, aDest,
dstW, chrDstW, 0, 16); dstW, chrDstW, 0, 16);
...@@ -392,7 +415,8 @@ static inline void yuv2yuvX16inC(const int16_t *lumFilter, const int16_t **lumSr ...@@ -392,7 +415,8 @@ static inline void yuv2yuvX16inC(const int16_t *lumFilter, const int16_t **lumSr
} }
static inline void yuv2yuvXinC(const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, static inline void yuv2yuvXinC(const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc, int chrFilterSize,
const int16_t **alpSrc, uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, int dstW, int chrDstW) const int16_t **alpSrc, uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, int dstW, int chrDstW)
{ {
//FIXME Optimize (just quickly written not optimized..) //FIXME Optimize (just quickly written not optimized..)
...@@ -412,8 +436,8 @@ static inline void yuv2yuvXinC(const int16_t *lumFilter, const int16_t **lumSrc, ...@@ -412,8 +436,8 @@ static inline void yuv2yuvXinC(const int16_t *lumFilter, const int16_t **lumSrc,
int v=1<<18; int v=1<<18;
int j; int j;
for (j=0; j<chrFilterSize; j++) { for (j=0; j<chrFilterSize; j++) {
u += chrSrc[j][i] * chrFilter[j]; u += chrUSrc[j][i] * chrFilter[j];
v += chrSrc[j][i + VOFW] * chrFilter[j]; v += chrVSrc[j][i] * chrFilter[j];
} }
uDest[i]= av_clip_uint8(u>>19); uDest[i]= av_clip_uint8(u>>19);
...@@ -433,7 +457,8 @@ static inline void yuv2yuvXinC(const int16_t *lumFilter, const int16_t **lumSrc, ...@@ -433,7 +457,8 @@ static inline void yuv2yuvXinC(const int16_t *lumFilter, const int16_t **lumSrc,
} }
static inline void yuv2nv12XinC(const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, static inline void yuv2nv12XinC(const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc, int chrFilterSize,
uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat) uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
{ {
//FIXME Optimize (just quickly written not optimized..) //FIXME Optimize (just quickly written not optimized..)
...@@ -456,8 +481,8 @@ static inline void yuv2nv12XinC(const int16_t *lumFilter, const int16_t **lumSrc ...@@ -456,8 +481,8 @@ static inline void yuv2nv12XinC(const int16_t *lumFilter, const int16_t **lumSrc
int v=1<<18; int v=1<<18;
int j; int j;
for (j=0; j<chrFilterSize; j++) { for (j=0; j<chrFilterSize; j++) {
u += chrSrc[j][i] * chrFilter[j]; u += chrUSrc[j][i] * chrFilter[j];
v += chrSrc[j][i + VOFW] * chrFilter[j]; v += chrVSrc[j][i] * chrFilter[j];
} }
uDest[2*i]= av_clip_uint8(u>>19); uDest[2*i]= av_clip_uint8(u>>19);
...@@ -469,8 +494,8 @@ static inline void yuv2nv12XinC(const int16_t *lumFilter, const int16_t **lumSrc ...@@ -469,8 +494,8 @@ static inline void yuv2nv12XinC(const int16_t *lumFilter, const int16_t **lumSrc
int v=1<<18; int v=1<<18;
int j; int j;
for (j=0; j<chrFilterSize; j++) { for (j=0; j<chrFilterSize; j++) {
u += chrSrc[j][i] * chrFilter[j]; u += chrUSrc[j][i] * chrFilter[j];
v += chrSrc[j][i + VOFW] * chrFilter[j]; v += chrVSrc[j][i] * chrFilter[j];
} }
uDest[2*i]= av_clip_uint8(v>>19); uDest[2*i]= av_clip_uint8(v>>19);
...@@ -494,8 +519,8 @@ static inline void yuv2nv12XinC(const int16_t *lumFilter, const int16_t **lumSrc ...@@ -494,8 +519,8 @@ static inline void yuv2nv12XinC(const int16_t *lumFilter, const int16_t **lumSrc
Y2 += lumSrc[j][i2+1] * lumFilter[j];\ Y2 += lumSrc[j][i2+1] * lumFilter[j];\
}\ }\
for (j=0; j<chrFilterSize; j++) {\ for (j=0; j<chrFilterSize; j++) {\
U += chrSrc[j][i] * chrFilter[j];\ U += chrUSrc[j][i] * chrFilter[j];\
V += chrSrc[j][i+VOFW] * chrFilter[j];\ V += chrVSrc[j][i] * chrFilter[j];\
}\ }\
Y1>>=19;\ Y1>>=19;\
Y2>>=19;\ Y2>>=19;\
...@@ -542,8 +567,8 @@ static inline void yuv2nv12XinC(const int16_t *lumFilter, const int16_t **lumSrc ...@@ -542,8 +567,8 @@ static inline void yuv2nv12XinC(const int16_t *lumFilter, const int16_t **lumSrc
Y += lumSrc[j][i ] * lumFilter[j];\ Y += lumSrc[j][i ] * lumFilter[j];\
}\ }\
for (j=0; j<chrFilterSize; j++) {\ for (j=0; j<chrFilterSize; j++) {\
U += chrSrc[j][i ] * chrFilter[j];\ U += chrUSrc[j][i] * chrFilter[j];\
V += chrSrc[j][i+VOFW] * chrFilter[j];\ V += chrVSrc[j][i] * chrFilter[j];\
}\ }\
Y >>=10;\ Y >>=10;\
U >>=10;\ U >>=10;\
...@@ -608,8 +633,8 @@ static inline void yuv2nv12XinC(const int16_t *lumFilter, const int16_t **lumSrc ...@@ -608,8 +633,8 @@ static inline void yuv2nv12XinC(const int16_t *lumFilter, const int16_t **lumSrc
const int i2= 2*i; \ const int i2= 2*i; \
int Y1= (buf0[i2 ]*yalpha1+buf1[i2 ]*yalpha)>>19; \ int Y1= (buf0[i2 ]*yalpha1+buf1[i2 ]*yalpha)>>19; \
int Y2= (buf0[i2+1]*yalpha1+buf1[i2+1]*yalpha)>>19; \ int Y2= (buf0[i2+1]*yalpha1+buf1[i2+1]*yalpha)>>19; \
int U= (uvbuf0[i ]*uvalpha1+uvbuf1[i ]*uvalpha)>>19; \ int U= (ubuf0[i]*uvalpha1+ubuf1[i]*uvalpha)>>19; \
int V= (uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19; \ int V= (vbuf0[i]*uvalpha1+vbuf1[i]*uvalpha)>>19; \
type av_unused *r, *b, *g; \ type av_unused *r, *b, *g; \
int av_unused A1, A2; \ int av_unused A1, A2; \
if (alpha) {\ if (alpha) {\
...@@ -634,8 +659,8 @@ static inline void yuv2nv12XinC(const int16_t *lumFilter, const int16_t **lumSrc ...@@ -634,8 +659,8 @@ static inline void yuv2nv12XinC(const int16_t *lumFilter, const int16_t **lumSrc
const int i2= 2*i;\ const int i2= 2*i;\
int Y1= buf0[i2 ]>>7;\ int Y1= buf0[i2 ]>>7;\
int Y2= buf0[i2+1]>>7;\ int Y2= buf0[i2+1]>>7;\
int U= (uvbuf1[i ])>>7;\ int U= (ubuf1[i])>>7;\
int V= (uvbuf1[i+VOFW])>>7;\ int V= (vbuf1[i])>>7;\
type av_unused *r, *b, *g;\ type av_unused *r, *b, *g;\
int av_unused A1, A2;\ int av_unused A1, A2;\
if (alpha) {\ if (alpha) {\
...@@ -660,8 +685,8 @@ static inline void yuv2nv12XinC(const int16_t *lumFilter, const int16_t **lumSrc ...@@ -660,8 +685,8 @@ static inline void yuv2nv12XinC(const int16_t *lumFilter, const int16_t **lumSrc
const int i2= 2*i;\ const int i2= 2*i;\
int Y1= buf0[i2 ]>>7;\ int Y1= buf0[i2 ]>>7;\
int Y2= buf0[i2+1]>>7;\ int Y2= buf0[i2+1]>>7;\
int U= (uvbuf0[i ] + uvbuf1[i ])>>8;\ int U= (ubuf0[i] + ubuf1[i])>>8;\
int V= (uvbuf0[i+VOFW] + uvbuf1[i+VOFW])>>8;\ int V= (vbuf0[i] + vbuf1[i])>>8;\
type av_unused *r, *b, *g;\ type av_unused *r, *b, *g;\
int av_unused A1, A2;\ int av_unused A1, A2;\
if (alpha) {\ if (alpha) {\
...@@ -943,16 +968,20 @@ static inline void yuv2nv12XinC(const int16_t *lumFilter, const int16_t **lumSrc ...@@ -943,16 +968,20 @@ static inline void yuv2nv12XinC(const int16_t *lumFilter, const int16_t **lumSrc
break;\ break;\
} }
static inline void yuv2packedXinC(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, static inline void yuv2packedXinC(SwsContext *c, const int16_t *lumFilter,
const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc, int chrFilterSize,
const int16_t **alpSrc, uint8_t *dest, int dstW, int y) const int16_t **alpSrc, uint8_t *dest, int dstW, int y)
{ {
int i; int i;
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGBX_C, YSCALE_YUV_2_PACKEDX_C(void,0), YSCALE_YUV_2_GRAY16_C, YSCALE_YUV_2_MONOX_C) YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGBX_C, YSCALE_YUV_2_PACKEDX_C(void,0), YSCALE_YUV_2_GRAY16_C, YSCALE_YUV_2_MONOX_C)
} }
static inline void yuv2rgbXinC_full(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, static inline void yuv2rgbXinC_full(SwsContext *c, const int16_t *lumFilter,
const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc, int chrFilterSize,
const int16_t **alpSrc, uint8_t *dest, int dstW, int y) const int16_t **alpSrc, uint8_t *dest, int dstW, int y)
{ {
int i; int i;
......
...@@ -35,10 +35,6 @@ ...@@ -35,10 +35,6 @@
#define MAX_FILTER_SIZE 256 #define MAX_FILTER_SIZE 256
#define VOFW 21504
#define VOF (VOFW*2)
#if HAVE_BIGENDIAN #if HAVE_BIGENDIAN
#define ALT32_CORR (-1) #define ALT32_CORR (-1)
#else #else
...@@ -108,7 +104,8 @@ typedef struct SwsContext { ...@@ -108,7 +104,8 @@ typedef struct SwsContext {
*/ */
//@{ //@{
int16_t **lumPixBuf; ///< Ring buffer for scaled horizontal luma plane lines to be fed to the vertical scaler. int16_t **lumPixBuf; ///< Ring buffer for scaled horizontal luma plane lines to be fed to the vertical scaler.
int16_t **chrPixBuf; ///< Ring buffer for scaled horizontal chroma plane lines to be fed to the vertical scaler. int16_t **chrUPixBuf; ///< Ring buffer for scaled horizontal chroma plane lines to be fed to the vertical scaler.
int16_t **chrVPixBuf; ///< Ring buffer for scaled horizontal chroma plane lines to be fed to the vertical scaler.
int16_t **alpPixBuf; ///< Ring buffer for scaled horizontal alpha plane lines to be fed to the vertical scaler. int16_t **alpPixBuf; ///< Ring buffer for scaled horizontal alpha plane lines to be fed to the vertical scaler.
int vLumBufSize; ///< Number of vertical luma/alpha lines allocated in the ring buffer. int vLumBufSize; ///< Number of vertical luma/alpha lines allocated in the ring buffer.
int vChrBufSize; ///< Number of vertical chroma lines allocated in the ring buffer. int vChrBufSize; ///< Number of vertical chroma lines allocated in the ring buffer.
...@@ -196,6 +193,7 @@ typedef struct SwsContext { ...@@ -196,6 +193,7 @@ typedef struct SwsContext {
#define V_TEMP "11*8+4*4*256*2+32" #define V_TEMP "11*8+4*4*256*2+32"
#define Y_TEMP "11*8+4*4*256*2+40" #define Y_TEMP "11*8+4*4*256*2+40"
#define ALP_MMX_FILTER_OFFSET "11*8+4*4*256*2+48" #define ALP_MMX_FILTER_OFFSET "11*8+4*4*256*2+48"
#define UV_OFF "11*8+4*4*256*3+48"
DECLARE_ALIGNED(8, uint64_t, redDither); DECLARE_ALIGNED(8, uint64_t, redDither);
DECLARE_ALIGNED(8, uint64_t, greenDither); DECLARE_ALIGNED(8, uint64_t, greenDither);
...@@ -218,6 +216,7 @@ typedef struct SwsContext { ...@@ -218,6 +216,7 @@ typedef struct SwsContext {
DECLARE_ALIGNED(8, uint64_t, v_temp); DECLARE_ALIGNED(8, uint64_t, v_temp);
DECLARE_ALIGNED(8, uint64_t, y_temp); DECLARE_ALIGNED(8, uint64_t, y_temp);
int32_t alpMmxFilter[4*MAX_FILTER_SIZE]; int32_t alpMmxFilter[4*MAX_FILTER_SIZE];
DECLARE_ALIGNED(8, ptrdiff_t, uv_off); ///< offset (in pixels) between u and v planes
#if HAVE_ALTIVEC #if HAVE_ALTIVEC
vector signed short CY; vector signed short CY;
...@@ -251,36 +250,42 @@ typedef struct SwsContext { ...@@ -251,36 +250,42 @@ typedef struct SwsContext {
/* function pointers for swScale() */ /* function pointers for swScale() */
void (*yuv2nv12X )(struct SwsContext *c, void (*yuv2nv12X )(struct SwsContext *c,
const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc, int chrFilterSize,
uint8_t *dest, uint8_t *uDest, uint8_t *dest, uint8_t *uDest,
int dstW, int chrDstW, int dstFormat); int dstW, int chrDstW, int dstFormat);
void (*yuv2yuv1 )(struct SwsContext *c, void (*yuv2yuv1 )(struct SwsContext *c,
const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc, const int16_t *lumSrc, const int16_t *chrUSrc,
const int16_t *chrVSrc, const int16_t *alpSrc,
uint8_t *dest, uint8_t *dest,
uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest,
long dstW, long chrDstW); long dstW, long chrDstW);
void (*yuv2yuvX )(struct SwsContext *c, void (*yuv2yuvX )(struct SwsContext *c,
const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc, int chrFilterSize,
const int16_t **alpSrc, const int16_t **alpSrc,
uint8_t *dest, uint8_t *dest,
uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest,
long dstW, long chrDstW); long dstW, long chrDstW);
void (*yuv2packed1)(struct SwsContext *c, void (*yuv2packed1)(struct SwsContext *c,
const uint16_t *buf0, const uint16_t *buf0,
const uint16_t *uvbuf0, const uint16_t *uvbuf1, const uint16_t *ubuf0, const uint16_t *ubuf1,
const uint16_t *vbuf0, const uint16_t *vbuf1,
const uint16_t *abuf0, const uint16_t *abuf0,
uint8_t *dest, uint8_t *dest,
int dstW, int uvalpha, int dstFormat, int flags, int y); int dstW, int uvalpha, int dstFormat, int flags, int y);
void (*yuv2packed2)(struct SwsContext *c, void (*yuv2packed2)(struct SwsContext *c,
const uint16_t *buf0, const uint16_t *buf1, const uint16_t *buf0, const uint16_t *buf1,
const uint16_t *uvbuf0, const uint16_t *uvbuf1, const uint16_t *ubuf0, const uint16_t *ubuf1,
const uint16_t *vbuf0, const uint16_t *vbuf1,
const uint16_t *abuf0, const uint16_t *abuf1, const uint16_t *abuf0, const uint16_t *abuf1,
uint8_t *dest, uint8_t *dest,
int dstW, int yalpha, int uvalpha, int y); int dstW, int yalpha, int uvalpha, int y);
void (*yuv2packedX)(struct SwsContext *c, void (*yuv2packedX)(struct SwsContext *c,
const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc, int chrFilterSize,
const int16_t **alpSrc, uint8_t *dest, const int16_t **alpSrc, uint8_t *dest,
long dstW, long dstY); long dstW, long dstY);
...@@ -295,7 +300,7 @@ typedef struct SwsContext { ...@@ -295,7 +300,7 @@ typedef struct SwsContext {
int16_t *dst, long dstWidth, int16_t *dst, long dstWidth,
const uint8_t *src, int srcW, int xInc); const uint8_t *src, int srcW, int xInc);
void (*hcscale_fast)(struct SwsContext *c, void (*hcscale_fast)(struct SwsContext *c,
int16_t *dst, long dstWidth, int16_t *dst1, int16_t *dst2, long dstWidth,
const uint8_t *src1, const uint8_t *src2, const uint8_t *src1, const uint8_t *src2,
int srcW, int xInc); int srcW, int xInc);
...@@ -308,7 +313,7 @@ typedef struct SwsContext { ...@@ -308,7 +313,7 @@ typedef struct SwsContext {
long filterSize, int shift); long filterSize, int shift);
void (*lumConvertRange)(int16_t *dst, int width); ///< Color range conversion function for luma plane if needed. void (*lumConvertRange)(int16_t *dst, int width); ///< Color range conversion function for luma plane if needed.
void (*chrConvertRange)(int16_t *dst, int width); ///< Color range conversion function for chroma planes if needed. void (*chrConvertRange)(int16_t *dst1, int16_t *dst2, int width); ///< Color range conversion function for chroma planes if needed.
int lumSrcOffset; ///< Offset given to luma src pointers passed to horizontal input functions. int lumSrcOffset; ///< Offset given to luma src pointers passed to horizontal input functions.
int chrSrcOffset; ///< Offset given to chroma src pointers passed to horizontal input functions. int chrSrcOffset; ///< Offset given to chroma src pointers passed to horizontal input functions.
...@@ -332,9 +337,10 @@ SwsFunc ff_yuv2rgb_init_mlib(SwsContext *c); ...@@ -332,9 +337,10 @@ SwsFunc ff_yuv2rgb_init_mlib(SwsContext *c);
SwsFunc ff_yuv2rgb_init_altivec(SwsContext *c); SwsFunc ff_yuv2rgb_init_altivec(SwsContext *c);
SwsFunc ff_yuv2rgb_get_func_ptr_bfin(SwsContext *c); SwsFunc ff_yuv2rgb_get_func_ptr_bfin(SwsContext *c);
void ff_bfin_get_unscaled_swscale(SwsContext *c); void ff_bfin_get_unscaled_swscale(SwsContext *c);
void ff_yuv2packedX_altivec(SwsContext *c, void ff_yuv2packedX_altivec(SwsContext *c, const int16_t *lumFilter,
const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc, int chrFilterSize,
uint8_t *dest, int dstW, int dstY); uint8_t *dest, int dstW, int dstY);
const char *sws_format_name(enum PixelFormat format); const char *sws_format_name(enum PixelFormat format);
......
...@@ -20,29 +20,32 @@ ...@@ -20,29 +20,32 @@
static inline void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter, static inline void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize, const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc, const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc,
int chrFilterSize, const int16_t **alpSrc, int chrFilterSize, const int16_t **alpSrc,
uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
uint8_t *aDest, long dstW, long chrDstW) uint8_t *aDest, long dstW, long chrDstW)
{ {
yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize, yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
chrFilter, chrSrc, chrFilterSize, chrFilter, chrUSrc, chrVSrc, chrFilterSize,
alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW); alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
} }
static inline void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter, static inline void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize, const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc, const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc,
int chrFilterSize, uint8_t *dest, uint8_t *uDest, int chrFilterSize, uint8_t *dest, uint8_t *uDest,
int dstW, int chrDstW, enum PixelFormat dstFormat) int dstW, int chrDstW, enum PixelFormat dstFormat)
{ {
yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize, yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
chrFilter, chrSrc, chrFilterSize, chrFilter, chrUSrc, chrVSrc, chrFilterSize,
dest, uDest, dstW, chrDstW, dstFormat); dest, uDest, dstW, chrDstW, dstFormat);
} }
static inline void yuv2yuv1_c(SwsContext *c, const int16_t *lumSrc, static inline void yuv2yuv1_c(SwsContext *c, const int16_t *lumSrc,
const int16_t *chrSrc, const int16_t *alpSrc, const int16_t *chrUSrc, const int16_t *chrVSrc,
const int16_t *alpSrc,
uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
uint8_t *aDest, long dstW, long chrDstW) uint8_t *aDest, long dstW, long chrDstW)
{ {
...@@ -54,8 +57,8 @@ static inline void yuv2yuv1_c(SwsContext *c, const int16_t *lumSrc, ...@@ -54,8 +57,8 @@ static inline void yuv2yuv1_c(SwsContext *c, const int16_t *lumSrc,
if (uDest) if (uDest)
for (i=0; i<chrDstW; i++) { for (i=0; i<chrDstW; i++) {
int u=(chrSrc[i ]+64)>>7; int u=(chrUSrc[i]+64)>>7;
int v=(chrSrc[i + VOFW]+64)>>7; int v=(chrVSrc[i]+64)>>7;
uDest[i]= av_clip_uint8(u); uDest[i]= av_clip_uint8(u);
vDest[i]= av_clip_uint8(v); vDest[i]= av_clip_uint8(v);
} }
...@@ -73,12 +76,13 @@ static inline void yuv2yuv1_c(SwsContext *c, const int16_t *lumSrc, ...@@ -73,12 +76,13 @@ static inline void yuv2yuv1_c(SwsContext *c, const int16_t *lumSrc,
*/ */
static inline void yuv2packedX_c(SwsContext *c, const int16_t *lumFilter, static inline void yuv2packedX_c(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize, const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc, const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc,
int chrFilterSize, const int16_t **alpSrc, int chrFilterSize, const int16_t **alpSrc,
uint8_t *dest, long dstW, long dstY) uint8_t *dest, long dstW, long dstY)
{ {
yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize, yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
chrFilter, chrSrc, chrFilterSize, chrFilter, chrUSrc, chrVSrc, chrFilterSize,
alpSrc, dest, dstW, dstY); alpSrc, dest, dstW, dstY);
} }
...@@ -86,8 +90,9 @@ static inline void yuv2packedX_c(SwsContext *c, const int16_t *lumFilter, ...@@ -86,8 +90,9 @@ static inline void yuv2packedX_c(SwsContext *c, const int16_t *lumFilter,
* vertical bilinear scale YV12 to RGB * vertical bilinear scale YV12 to RGB
*/ */
static inline void yuv2packed2_c(SwsContext *c, const uint16_t *buf0, static inline void yuv2packed2_c(SwsContext *c, const uint16_t *buf0,
const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *buf1, const uint16_t *ubuf0,
const uint16_t *uvbuf1, const uint16_t *abuf0, const uint16_t *ubuf1, const uint16_t *vbuf0,
const uint16_t *vbuf1, const uint16_t *abuf0,
const uint16_t *abuf1, uint8_t *dest, int dstW, const uint16_t *abuf1, uint8_t *dest, int dstW,
int yalpha, int uvalpha, int y) int yalpha, int uvalpha, int y)
{ {
...@@ -102,7 +107,8 @@ static inline void yuv2packed2_c(SwsContext *c, const uint16_t *buf0, ...@@ -102,7 +107,8 @@ static inline void yuv2packed2_c(SwsContext *c, const uint16_t *buf0,
* YV12 to RGB without scaling or interpolating * YV12 to RGB without scaling or interpolating
*/ */
static inline void yuv2packed1_c(SwsContext *c, const uint16_t *buf0, static inline void yuv2packed1_c(SwsContext *c, const uint16_t *buf0,
const uint16_t *uvbuf0, const uint16_t *uvbuf1, const uint16_t *ubuf0, const uint16_t *ubuf1,
const uint16_t *vbuf0, const uint16_t *vbuf1,
const uint16_t *abuf0, uint8_t *dest, int dstW, const uint16_t *abuf0, uint8_t *dest, int dstW,
int uvalpha, enum PixelFormat dstFormat, int uvalpha, enum PixelFormat dstFormat,
int flags, int y) int flags, int y)
...@@ -373,20 +379,20 @@ static inline void hScale16X_c(int16_t *dst, int dstW, const uint16_t *src, int ...@@ -373,20 +379,20 @@ static inline void hScale16X_c(int16_t *dst, int dstW, const uint16_t *src, int
//FIXME all pal and rgb srcFormats could do this convertion as well //FIXME all pal and rgb srcFormats could do this convertion as well
//FIXME all scalers more complex than bilinear could do half of this transform //FIXME all scalers more complex than bilinear could do half of this transform
static void chrRangeToJpeg_c(int16_t *dst, int width) static void chrRangeToJpeg_c(int16_t *dstU, int16_t *dstV, int width)
{ {
int i; int i;
for (i = 0; i < width; i++) { for (i = 0; i < width; i++) {
dst[i ] = (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264 dstU[i] = (FFMIN(dstU[i],30775)*4663 - 9289992)>>12; //-264
dst[i+VOFW] = (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264 dstV[i] = (FFMIN(dstV[i],30775)*4663 - 9289992)>>12; //-264
} }
} }
static void chrRangeFromJpeg_c(int16_t *dst, int width) static void chrRangeFromJpeg_c(int16_t *dstU, int16_t *dstV, int width)
{ {
int i; int i;
for (i = 0; i < width; i++) { for (i = 0; i < width; i++) {
dst[i ] = (dst[i ]*1799 + 4081085)>>11; //1469 dstU[i] = (dstU[i]*1799 + 4081085)>>11; //1469
dst[i+VOFW] = (dst[i+VOFW]*1799 + 4081085)>>11; //1469 dstV[i] = (dstV[i]*1799 + 4081085)>>11; //1469
} }
} }
static void lumRangeToJpeg_c(int16_t *dst, int width) static void lumRangeToJpeg_c(int16_t *dst, int width)
...@@ -446,7 +452,7 @@ static inline void hyscale_c(SwsContext *c, uint16_t *dst, long dstWidth, ...@@ -446,7 +452,7 @@ static inline void hyscale_c(SwsContext *c, uint16_t *dst, long dstWidth,
convertRange(dst, dstWidth); convertRange(dst, dstWidth);
} }
static inline void hcscale_fast_c(SwsContext *c, int16_t *dst, static inline void hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2,
long dstWidth, const uint8_t *src1, long dstWidth, const uint8_t *src1,
const uint8_t *src2, int srcW, int xInc) const uint8_t *src2, int srcW, int xInc)
{ {
...@@ -455,17 +461,13 @@ static inline void hcscale_fast_c(SwsContext *c, int16_t *dst, ...@@ -455,17 +461,13 @@ static inline void hcscale_fast_c(SwsContext *c, int16_t *dst,
for (i=0;i<dstWidth;i++) { for (i=0;i<dstWidth;i++) {
register unsigned int xx=xpos>>16; register unsigned int xx=xpos>>16;
register unsigned int xalpha=(xpos&0xFFFF)>>9; register unsigned int xalpha=(xpos&0xFFFF)>>9;
dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha); dst1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha); dst2[i]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
/* slower
dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
*/
xpos+=xInc; xpos+=xInc;
} }
} }
inline static void hcscale_c(SwsContext *c, uint16_t *dst, long dstWidth, inline static void hcscale_c(SwsContext *c, uint16_t *dst1, uint16_t *dst2, long dstWidth,
const uint8_t *src1, const uint8_t *src2, const uint8_t *src1, const uint8_t *src2,
int srcW, int xInc, const int16_t *hChrFilter, int srcW, int xInc, const int16_t *hChrFilter,
const int16_t *hChrFilterPos, int hChrFilterSize, const int16_t *hChrFilterPos, int hChrFilterSize,
...@@ -484,17 +486,17 @@ inline static void hcscale_c(SwsContext *c, uint16_t *dst, long dstWidth, ...@@ -484,17 +486,17 @@ inline static void hcscale_c(SwsContext *c, uint16_t *dst, long dstWidth,
if (c->hScale16) { if (c->hScale16) {
int shift= isAnyRGB(c->srcFormat) || c->srcFormat==PIX_FMT_PAL8 ? 13 : av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1; int shift= isAnyRGB(c->srcFormat) || c->srcFormat==PIX_FMT_PAL8 ? 13 : av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
c->hScale16(dst , dstWidth, (uint16_t*)src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize, shift); c->hScale16(dst1, dstWidth, (uint16_t*)src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize, shift);
c->hScale16(dst+VOFW, dstWidth, (uint16_t*)src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize, shift); c->hScale16(dst2, dstWidth, (uint16_t*)src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize, shift);
} else if (!c->hcscale_fast) { } else if (!c->hcscale_fast) {
c->hScale(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); c->hScale(dst1, dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); c->hScale(dst2, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
} else { // fast bilinear upscale / crap downscale } else { // fast bilinear upscale / crap downscale
c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc); c->hcscale_fast(c, dst1, dst2, dstWidth, src1, src2, srcW, xInc);
} }
if (c->chrConvertRange) if (c->chrConvertRange)
c->chrConvertRange(dst, dstWidth); c->chrConvertRange(dst1, dst2, dstWidth);
} }
#define DEBUG_SWSCALE_BUFFERS 0 #define DEBUG_SWSCALE_BUFFERS 0
...@@ -534,7 +536,8 @@ static int swScale_c(SwsContext *c, const uint8_t* src[], int srcStride[], ...@@ -534,7 +536,8 @@ static int swScale_c(SwsContext *c, const uint8_t* src[], int srcStride[],
const int hLumFilterSize= c->hLumFilterSize; const int hLumFilterSize= c->hLumFilterSize;
const int hChrFilterSize= c->hChrFilterSize; const int hChrFilterSize= c->hChrFilterSize;
int16_t **lumPixBuf= c->lumPixBuf; int16_t **lumPixBuf= c->lumPixBuf;
int16_t **chrPixBuf= c->chrPixBuf; int16_t **chrUPixBuf= c->chrUPixBuf;
int16_t **chrVPixBuf= c->chrVPixBuf;
int16_t **alpPixBuf= c->alpPixBuf; int16_t **alpPixBuf= c->alpPixBuf;
const int vLumBufSize= c->vLumBufSize; const int vLumBufSize= c->vLumBufSize;
const int vChrBufSize= c->vChrBufSize; const int vChrBufSize= c->vChrBufSize;
...@@ -662,10 +665,10 @@ static int swScale_c(SwsContext *c, const uint8_t* src[], int srcStride[], ...@@ -662,10 +665,10 @@ static int swScale_c(SwsContext *c, const uint8_t* src[], int srcStride[],
//FIXME replace parameters through context struct (some at least) //FIXME replace parameters through context struct (some at least)
if (c->needs_hcscale) if (c->needs_hcscale)
hcscale_c(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc, hcscale_c(c, chrUPixBuf[chrBufIndex], chrVPixBuf[chrBufIndex],
hChrFilter, hChrFilterPos, hChrFilterSize, chrDstW, src1, src2, chrSrcW, chrXInc,
formatConvBuffer, hChrFilter, hChrFilterPos, hChrFilterSize,
pal); formatConvBuffer, pal);
lastInChrBuf++; lastInChrBuf++;
DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n", DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
chrBufIndex, lastInChrBuf); chrBufIndex, lastInChrBuf);
...@@ -681,47 +684,54 @@ static int swScale_c(SwsContext *c, const uint8_t* src[], int srcStride[], ...@@ -681,47 +684,54 @@ static int swScale_c(SwsContext *c, const uint8_t* src[], int srcStride[],
#endif #endif
if (dstY < dstH-2) { if (dstY < dstH-2) {
const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; const int16_t **chrUSrcPtr= (const int16_t **) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
const int16_t **chrVSrcPtr= (const int16_t **) chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL; const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) { if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
const int chrSkipMask= (1<<c->chrDstVSubSample)-1; const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
c->yuv2nv12X(c, c->yuv2nv12X(c,
vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, vChrFilter+chrDstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
dest, uDest, dstW, chrDstW, dstFormat); dest, uDest, dstW, chrDstW, dstFormat);
} else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
const int chrSkipMask= (1<<c->chrDstVSubSample)-1; const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
if (is16BPS(dstFormat) || isNBPS(dstFormat)) { if (is16BPS(dstFormat) || isNBPS(dstFormat)) {
yuv2yuvX16inC( yuv2yuvX16inC(vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, vChrFilter+chrDstY*vChrFilterSize, chrUSrcPtr,
vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, chrVSrcPtr, vChrFilterSize,
alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW, alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest,
(uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
dstFormat); dstFormat);
} else if (vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12 } else if (vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
const int16_t *lumBuf = lumSrcPtr[0]; const int16_t *lumBuf = lumSrcPtr[0];
const int16_t *chrBuf= chrSrcPtr[0]; const int16_t *chrUBuf= chrUSrcPtr[0];
const int16_t *chrVBuf= chrVSrcPtr[0];
const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL; const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW); c->yuv2yuv1(c, lumBuf, chrUBuf, chrVBuf, alpBuf, dest,
uDest, vDest, aDest, dstW, chrDstW);
} else { //General YV12 } else { //General YV12
c->yuv2yuvX(c, c->yuv2yuvX(c,
vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, vChrFilter+chrDstY*vChrFilterSize, chrUSrcPtr,
chrVSrcPtr, vChrFilterSize,
alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW); alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
} }
} else { } else {
assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2); assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2); assert(chrUSrcPtr + vChrFilterSize - 1 < chrUPixBuf + vChrBufSize*2);
if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
int chrAlpha= vChrFilter[2*dstY+1]; int chrAlpha= vChrFilter[2*dstY+1];
if(flags & SWS_FULL_CHR_H_INT) { if(flags & SWS_FULL_CHR_H_INT) {
yuv2rgbXinC_full(c, //FIXME write a packed1_full function yuv2rgbXinC_full(c, //FIXME write a packed1_full function
vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, vChrFilter+dstY*vChrFilterSize, chrUSrcPtr,
chrVSrcPtr, vChrFilterSize,
alpSrcPtr, dest, dstW, dstY); alpSrcPtr, dest, dstW, dstY);
} else { } else {
c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1), c->yuv2packed1(c, *lumSrcPtr, *chrUSrcPtr, *(chrUSrcPtr+1),
*chrVSrcPtr, *(chrVSrcPtr+1),
alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *alpSrcPtr : NULL,
dest, dstW, chrAlpha, dstFormat, flags, dstY); dest, dstW, chrAlpha, dstFormat, flags, dstY);
} }
...@@ -735,10 +745,11 @@ static int swScale_c(SwsContext *c, const uint8_t* src[], int srcStride[], ...@@ -735,10 +745,11 @@ static int swScale_c(SwsContext *c, const uint8_t* src[], int srcStride[],
if(flags & SWS_FULL_CHR_H_INT) { if(flags & SWS_FULL_CHR_H_INT) {
yuv2rgbXinC_full(c, //FIXME write a packed2_full function yuv2rgbXinC_full(c, //FIXME write a packed2_full function
vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, vChrFilter+dstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
alpSrcPtr, dest, dstW, dstY); alpSrcPtr, dest, dstW, dstY);
} else { } else {
c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1), c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrUSrcPtr, *(chrUSrcPtr+1),
*chrVSrcPtr, *(chrVSrcPtr+1),
alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL, alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
dest, dstW, lumAlpha, chrAlpha, dstY); dest, dstW, lumAlpha, chrAlpha, dstY);
} }
...@@ -746,26 +757,27 @@ static int swScale_c(SwsContext *c, const uint8_t* src[], int srcStride[], ...@@ -746,26 +757,27 @@ static int swScale_c(SwsContext *c, const uint8_t* src[], int srcStride[],
if(flags & SWS_FULL_CHR_H_INT) { if(flags & SWS_FULL_CHR_H_INT) {
yuv2rgbXinC_full(c, yuv2rgbXinC_full(c,
vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, vChrFilter+dstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
alpSrcPtr, dest, dstW, dstY); alpSrcPtr, dest, dstW, dstY);
} else { } else {
c->yuv2packedX(c, c->yuv2packedX(c,
vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, vChrFilter+dstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
alpSrcPtr, dest, dstW, dstY); alpSrcPtr, dest, dstW, dstY);
} }
} }
} }
} else { // hmm looks like we can't use MMX here without overwriting this array's tail } else { // hmm looks like we can't use MMX here without overwriting this array's tail
const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; const int16_t **chrUSrcPtr= (const int16_t **)chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
const int16_t **chrVSrcPtr= (const int16_t **)chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL; const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) { if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
const int chrSkipMask= (1<<c->chrDstVSubSample)-1; const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
yuv2nv12XinC( yuv2nv12XinC(
vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, vChrFilter+chrDstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
dest, uDest, dstW, chrDstW, dstFormat); dest, uDest, dstW, chrDstW, dstFormat);
} else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12
const int chrSkipMask= (1<<c->chrDstVSubSample)-1; const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
...@@ -773,27 +785,27 @@ static int swScale_c(SwsContext *c, const uint8_t* src[], int srcStride[], ...@@ -773,27 +785,27 @@ static int swScale_c(SwsContext *c, const uint8_t* src[], int srcStride[],
if (is16BPS(dstFormat) || isNBPS(dstFormat)) { if (is16BPS(dstFormat) || isNBPS(dstFormat)) {
yuv2yuvX16inC( yuv2yuvX16inC(
vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, vChrFilter+chrDstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW, alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
dstFormat); dstFormat);
} else { } else {
yuv2yuvXinC( yuv2yuvXinC(
vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, vChrFilter+chrDstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW); alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
} }
} else { } else {
assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2); assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2); assert(chrUSrcPtr + vChrFilterSize - 1 < chrUPixBuf + vChrBufSize*2);
if(flags & SWS_FULL_CHR_H_INT) { if(flags & SWS_FULL_CHR_H_INT) {
yuv2rgbXinC_full(c, yuv2rgbXinC_full(c,
vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, vChrFilter+dstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
alpSrcPtr, dest, dstW, dstY); alpSrcPtr, dest, dstW, dstY);
} else { } else {
yuv2packedXinC(c, yuv2packedXinC(c,
vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, vChrFilter+dstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
alpSrcPtr, dest, dstW, dstY); alpSrcPtr, dest, dstW, dstY);
} }
} }
......
...@@ -753,6 +753,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter) ...@@ -753,6 +753,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
int srcH= c->srcH; int srcH= c->srcH;
int dstW= c->dstW; int dstW= c->dstW;
int dstH= c->dstH; int dstH= c->dstH;
int dst_stride = FFALIGN(dstW * sizeof(int16_t)+66, 16), dst_stride_px = dst_stride >> 1;
int flags, cpu_flags; int flags, cpu_flags;
enum PixelFormat srcFormat= c->srcFormat; enum PixelFormat srcFormat= c->srcFormat;
enum PixelFormat dstFormat= c->dstFormat; enum PixelFormat dstFormat= c->dstFormat;
...@@ -794,10 +795,6 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter) ...@@ -794,10 +795,6 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
srcW, srcH, dstW, dstH); srcW, srcH, dstW, dstH);
return AVERROR(EINVAL); return AVERROR(EINVAL);
} }
if(srcW > VOFW || dstW > VOFW) {
av_log(NULL, AV_LOG_ERROR, "swScaler: Compile-time maximum width is "AV_STRINGIFY(VOFW)" change VOF/VOFW and recompile\n");
return AVERROR(EINVAL);
}
FF_ALLOC_OR_GOTO(c, c->formatConvBuffer, FFALIGN(srcW*2+78, 16) * 2, fail); FF_ALLOC_OR_GOTO(c, c->formatConvBuffer, FFALIGN(srcW*2+78, 16) * 2, fail);
if (!dstFilter) dstFilter= &dummyFilter; if (!dstFilter) dstFilter= &dummyFilter;
...@@ -1001,29 +998,31 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter) ...@@ -1001,29 +998,31 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
// allocate pixbufs (we use dynamic allocation because otherwise we would need to // allocate pixbufs (we use dynamic allocation because otherwise we would need to
// allocate several megabytes to handle all possible cases) // allocate several megabytes to handle all possible cases)
FF_ALLOC_OR_GOTO(c, c->lumPixBuf, c->vLumBufSize*2*sizeof(int16_t*), fail); FF_ALLOC_OR_GOTO(c, c->lumPixBuf, c->vLumBufSize*2*sizeof(int16_t*), fail);
FF_ALLOC_OR_GOTO(c, c->chrPixBuf, c->vChrBufSize*2*sizeof(int16_t*), fail); FF_ALLOC_OR_GOTO(c, c->chrUPixBuf, c->vChrBufSize*2*sizeof(int16_t*), fail);
FF_ALLOC_OR_GOTO(c, c->chrVPixBuf, c->vChrBufSize*2*sizeof(int16_t*), fail);
if (CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat) && isALPHA(c->dstFormat)) if (CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat) && isALPHA(c->dstFormat))
FF_ALLOCZ_OR_GOTO(c, c->alpPixBuf, c->vLumBufSize*2*sizeof(int16_t*), fail); FF_ALLOCZ_OR_GOTO(c, c->alpPixBuf, c->vLumBufSize*2*sizeof(int16_t*), fail);
//Note we need at least one pixel more at the end because of the MMX code (just in case someone wanna replace the 4000/8000) //Note we need at least one pixel more at the end because of the MMX code (just in case someone wanna replace the 4000/8000)
/* align at 16 bytes for AltiVec */ /* align at 16 bytes for AltiVec */
for (i=0; i<c->vLumBufSize; i++) { for (i=0; i<c->vLumBufSize; i++) {
FF_ALLOCZ_OR_GOTO(c, c->lumPixBuf[i+c->vLumBufSize], VOF+1, fail); FF_ALLOCZ_OR_GOTO(c, c->lumPixBuf[i+c->vLumBufSize], dst_stride+1, fail);
c->lumPixBuf[i] = c->lumPixBuf[i+c->vLumBufSize]; c->lumPixBuf[i] = c->lumPixBuf[i+c->vLumBufSize];
} }
c->uv_off = dst_stride_px;
for (i=0; i<c->vChrBufSize; i++) { for (i=0; i<c->vChrBufSize; i++) {
FF_ALLOC_OR_GOTO(c, c->chrPixBuf[i+c->vChrBufSize], (VOF+1)*2, fail); FF_ALLOC_OR_GOTO(c, c->chrUPixBuf[i+c->vChrBufSize], dst_stride*2+1, fail);
c->chrPixBuf[i] = c->chrPixBuf[i+c->vChrBufSize]; c->chrUPixBuf[i] = c->chrUPixBuf[i+c->vChrBufSize];
c->chrVPixBuf[i] = c->chrVPixBuf[i+c->vChrBufSize] = c->chrUPixBuf[i] + dst_stride_px;
} }
if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
for (i=0; i<c->vLumBufSize; i++) { for (i=0; i<c->vLumBufSize; i++) {
FF_ALLOCZ_OR_GOTO(c, c->alpPixBuf[i+c->vLumBufSize], VOF+1, fail); FF_ALLOCZ_OR_GOTO(c, c->alpPixBuf[i+c->vLumBufSize], dst_stride+1, fail);
c->alpPixBuf[i] = c->alpPixBuf[i+c->vLumBufSize]; c->alpPixBuf[i] = c->alpPixBuf[i+c->vLumBufSize];
} }
//try to avoid drawing green stuff between the right end and the stride end //try to avoid drawing green stuff between the right end and the stride end
for (i=0; i<c->vChrBufSize; i++) memset(c->chrPixBuf[i], 64, (VOF+1)*2); for (i=0; i<c->vChrBufSize; i++)
memset(c->chrUPixBuf[i], 64, dst_stride*2+1);
assert(2*VOFW == VOF);
assert(c->chrDstH <= dstH); assert(c->chrDstH <= dstH);
...@@ -1481,10 +1480,11 @@ void sws_freeContext(SwsContext *c) ...@@ -1481,10 +1480,11 @@ void sws_freeContext(SwsContext *c)
av_freep(&c->lumPixBuf); av_freep(&c->lumPixBuf);
} }
if (c->chrPixBuf) { if (c->chrUPixBuf) {
for (i=0; i<c->vChrBufSize; i++) for (i=0; i<c->vChrBufSize; i++)
av_freep(&c->chrPixBuf[i]); av_freep(&c->chrUPixBuf[i]);
av_freep(&c->chrPixBuf); av_freep(&c->chrUPixBuf);
av_freep(&c->chrVPixBuf);
} }
if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
......
...@@ -37,9 +37,8 @@ ...@@ -37,9 +37,8 @@
#endif #endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b) #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
#define YSCALEYUV2YV12X(x, offset, dest, width) \ #define YSCALEYUV2YV12X(offset, dest, end, pos) \
__asm__ volatile(\ __asm__ volatile(\
"xor %%"REG_a", %%"REG_a" \n\t"\
"movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\ "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
"movq %%mm3, %%mm4 \n\t"\ "movq %%mm3, %%mm4 \n\t"\
"lea " offset "(%0), %%"REG_d" \n\t"\ "lea " offset "(%0), %%"REG_d" \n\t"\
...@@ -47,8 +46,8 @@ ...@@ -47,8 +46,8 @@
".p2align 4 \n\t" /* FIXME Unroll? */\ ".p2align 4 \n\t" /* FIXME Unroll? */\
"1: \n\t"\ "1: \n\t"\
"movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\ "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
"movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\ "movq (%%"REG_S", %3, 2), %%mm2 \n\t" /* srcData */\
"movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\ "movq 8(%%"REG_S", %3, 2), %%mm5 \n\t" /* srcData */\
"add $16, %%"REG_d" \n\t"\ "add $16, %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\ "mov (%%"REG_d"), %%"REG_S" \n\t"\
"test %%"REG_S", %%"REG_S" \n\t"\ "test %%"REG_S", %%"REG_S" \n\t"\
...@@ -61,40 +60,40 @@ ...@@ -61,40 +60,40 @@
"psraw $3, %%mm4 \n\t"\ "psraw $3, %%mm4 \n\t"\
"packuswb %%mm4, %%mm3 \n\t"\ "packuswb %%mm4, %%mm3 \n\t"\
MOVNTQ(%%mm3, (%1, %%REGa))\ MOVNTQ(%%mm3, (%1, %%REGa))\
"add $8, %%"REG_a" \n\t"\ "add $8, %3 \n\t"\
"cmp %2, %%"REG_a" \n\t"\ "cmp %2, %3 \n\t"\
"movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\ "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
"movq %%mm3, %%mm4 \n\t"\ "movq %%mm3, %%mm4 \n\t"\
"lea " offset "(%0), %%"REG_d" \n\t"\ "lea " offset "(%0), %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\ "mov (%%"REG_d"), %%"REG_S" \n\t"\
"jb 1b \n\t"\ "jb 1b \n\t"\
:: "r" (&c->redDither),\ :: "r" (&c->redDither),\
"r" (dest), "g" ((x86_reg)width)\ "r" (dest), "g" ((x86_reg)(end)), "r"((x86_reg)(pos))\
: "%"REG_a, "%"REG_d, "%"REG_S\ : "%"REG_d, "%"REG_S\
); );
static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize, const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc, const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc,
int chrFilterSize, const int16_t **alpSrc, int chrFilterSize, const int16_t **alpSrc,
uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
uint8_t *aDest, long dstW, long chrDstW) uint8_t *aDest, long dstW, long chrDstW)
{ {
if (uDest) { if (uDest) {
YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW) YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW) YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, vDest, chrDstW + c->uv_off, c->uv_off)
} }
if (CONFIG_SWSCALE_ALPHA && aDest) { if (CONFIG_SWSCALE_ALPHA && aDest) {
YSCALEYUV2YV12X( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW) YSCALEYUV2YV12X(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
} }
YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW) YSCALEYUV2YV12X(LUM_MMX_FILTER_OFFSET, dest, dstW, 0)
} }
#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \ #define YSCALEYUV2YV12X_ACCURATE(offset, dest, end, pos) \
__asm__ volatile(\ __asm__ volatile(\
"lea " offset "(%0), %%"REG_d" \n\t"\ "lea " offset "(%0), %%"REG_d" \n\t"\
"xor %%"REG_a", %%"REG_a" \n\t"\
"pxor %%mm4, %%mm4 \n\t"\ "pxor %%mm4, %%mm4 \n\t"\
"pxor %%mm5, %%mm5 \n\t"\ "pxor %%mm5, %%mm5 \n\t"\
"pxor %%mm6, %%mm6 \n\t"\ "pxor %%mm6, %%mm6 \n\t"\
...@@ -102,10 +101,10 @@ static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, ...@@ -102,10 +101,10 @@ static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
"mov (%%"REG_d"), %%"REG_S" \n\t"\ "mov (%%"REG_d"), %%"REG_S" \n\t"\
".p2align 4 \n\t"\ ".p2align 4 \n\t"\
"1: \n\t"\ "1: \n\t"\
"movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\ "movq (%%"REG_S", %3, 2), %%mm0 \n\t" /* srcData */\
"movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\ "movq 8(%%"REG_S", %3, 2), %%mm2 \n\t" /* srcData */\
"mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\ "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
"movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\ "movq (%%"REG_S", %3, 2), %%mm1 \n\t" /* srcData */\
"movq %%mm0, %%mm3 \n\t"\ "movq %%mm0, %%mm3 \n\t"\
"punpcklwd %%mm1, %%mm0 \n\t"\ "punpcklwd %%mm1, %%mm0 \n\t"\
"punpckhwd %%mm1, %%mm3 \n\t"\ "punpckhwd %%mm1, %%mm3 \n\t"\
...@@ -114,7 +113,7 @@ static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, ...@@ -114,7 +113,7 @@ static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
"pmaddwd %%mm1, %%mm3 \n\t"\ "pmaddwd %%mm1, %%mm3 \n\t"\
"paddd %%mm0, %%mm4 \n\t"\ "paddd %%mm0, %%mm4 \n\t"\
"paddd %%mm3, %%mm5 \n\t"\ "paddd %%mm3, %%mm5 \n\t"\
"movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\ "movq 8(%%"REG_S", %3, 2), %%mm3 \n\t" /* srcData */\
"mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\ "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
"add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\ "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
"test %%"REG_S", %%"REG_S" \n\t"\ "test %%"REG_S", %%"REG_S" \n\t"\
...@@ -139,8 +138,8 @@ static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, ...@@ -139,8 +138,8 @@ static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
"psraw $3, %%mm6 \n\t"\ "psraw $3, %%mm6 \n\t"\
"packuswb %%mm6, %%mm4 \n\t"\ "packuswb %%mm6, %%mm4 \n\t"\
MOVNTQ(%%mm4, (%1, %%REGa))\ MOVNTQ(%%mm4, (%1, %%REGa))\
"add $8, %%"REG_a" \n\t"\ "add $8, %3 \n\t"\
"cmp %2, %%"REG_a" \n\t"\ "cmp %2, %3 \n\t"\
"lea " offset "(%0), %%"REG_d" \n\t"\ "lea " offset "(%0), %%"REG_d" \n\t"\
"pxor %%mm4, %%mm4 \n\t"\ "pxor %%mm4, %%mm4 \n\t"\
"pxor %%mm5, %%mm5 \n\t"\ "pxor %%mm5, %%mm5 \n\t"\
...@@ -149,26 +148,27 @@ static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, ...@@ -149,26 +148,27 @@ static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter,
"mov (%%"REG_d"), %%"REG_S" \n\t"\ "mov (%%"REG_d"), %%"REG_S" \n\t"\
"jb 1b \n\t"\ "jb 1b \n\t"\
:: "r" (&c->redDither),\ :: "r" (&c->redDither),\
"r" (dest), "g" ((x86_reg)width)\ "r" (dest), "g" ((x86_reg)(end)), "r"((x86_reg)(pos))\
: "%"REG_a, "%"REG_d, "%"REG_S\ : "%"REG_a, "%"REG_d, "%"REG_S\
); );
static inline void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter, static inline void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize, const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc, const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc,
int chrFilterSize, const int16_t **alpSrc, int chrFilterSize, const int16_t **alpSrc,
uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
uint8_t *aDest, long dstW, long chrDstW) uint8_t *aDest, long dstW, long chrDstW)
{ {
if (uDest) { if (uDest) {
YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW) YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0)
YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW) YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, vDest, chrDstW + c->uv_off, c->uv_off)
} }
if (CONFIG_SWSCALE_ALPHA && aDest) { if (CONFIG_SWSCALE_ALPHA && aDest) {
YSCALEYUV2YV12X_ACCURATE( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW) YSCALEYUV2YV12X_ACCURATE(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0)
} }
YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW) YSCALEYUV2YV12X_ACCURATE(LUM_MMX_FILTER_OFFSET, dest, dstW, 0)
} }
#define YSCALEYUV2YV121 \ #define YSCALEYUV2YV121 \
...@@ -185,12 +185,13 @@ static inline void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter, ...@@ -185,12 +185,13 @@ static inline void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter,
"jnc 1b \n\t" "jnc 1b \n\t"
static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc,
const int16_t *chrSrc, const int16_t *alpSrc, const int16_t *chrUSrc, const int16_t *chrVSrc,
const int16_t *alpSrc,
uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
uint8_t *aDest, long dstW, long chrDstW) uint8_t *aDest, long dstW, long chrDstW)
{ {
long p= 4; long p= 4;
const int16_t *src[4]= { alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW }; const int16_t *src[4]= { alpSrc + dstW, lumSrc + dstW, chrUSrc + chrDstW, chrVSrc + chrDstW };
uint8_t *dst[4]= { aDest, dest, uDest, vDest }; uint8_t *dst[4]= { aDest, dest, uDest, vDest };
x86_reg counter[4]= { dstW, dstW, chrDstW, chrDstW }; x86_reg counter[4]= { dstW, dstW, chrDstW, chrDstW };
...@@ -225,12 +226,13 @@ static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, ...@@ -225,12 +226,13 @@ static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc,
"jnc 1b \n\t" "jnc 1b \n\t"
static inline void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc, static inline void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc,
const int16_t *chrSrc, const int16_t *alpSrc, const int16_t *chrUSrc, const int16_t *chrVSrc,
const int16_t *alpSrc,
uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
uint8_t *aDest, long dstW, long chrDstW) uint8_t *aDest, long dstW, long chrDstW)
{ {
long p= 4; long p= 4;
const int16_t *src[4]= { alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW }; const int16_t *src[4]= { alpSrc + dstW, lumSrc + dstW, chrUSrc + chrDstW, chrVSrc + chrDstW };
uint8_t *dst[4]= { aDest, dest, uDest, vDest }; uint8_t *dst[4]= { aDest, dest, uDest, vDest };
x86_reg counter[4]= { dstW, dstW, chrDstW, chrDstW }; x86_reg counter[4]= { dstW, dstW, chrDstW, chrDstW };
...@@ -260,7 +262,8 @@ static inline void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc, ...@@ -260,7 +262,8 @@ static inline void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc,
"2: \n\t"\ "2: \n\t"\
"movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\ "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
"movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\ "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
"movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\ "add %6, %%"REG_S" \n\t" \
"movq (%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
"add $16, %%"REG_d" \n\t"\ "add $16, %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\ "mov (%%"REG_d"), %%"REG_S" \n\t"\
"pmulhw %%mm0, %%mm2 \n\t"\ "pmulhw %%mm0, %%mm2 \n\t"\
...@@ -296,7 +299,7 @@ static inline void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc, ...@@ -296,7 +299,7 @@ static inline void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc,
#define YSCALEYUV2PACKEDX_END \ #define YSCALEYUV2PACKEDX_END \
:: "r" (&c->redDither), \ :: "r" (&c->redDither), \
"m" (dummy), "m" (dummy), "m" (dummy),\ "m" (dummy), "m" (dummy), "m" (dummy),\
"r" (dest), "m" (dstW_reg) \ "r" (dest), "m" (dstW_reg), "m"(uv_off) \
: "%"REG_a, "%"REG_d, "%"REG_S \ : "%"REG_a, "%"REG_d, "%"REG_S \
); );
...@@ -315,7 +318,8 @@ static inline void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc, ...@@ -315,7 +318,8 @@ static inline void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc,
".p2align 4 \n\t"\ ".p2align 4 \n\t"\
"2: \n\t"\ "2: \n\t"\
"movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\ "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
"movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\ "add %6, %%"REG_S" \n\t" \
"movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
"mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\ "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
"movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\ "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
"movq %%mm0, %%mm3 \n\t"\ "movq %%mm0, %%mm3 \n\t"\
...@@ -326,7 +330,8 @@ static inline void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc, ...@@ -326,7 +330,8 @@ static inline void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc,
"pmaddwd %%mm1, %%mm3 \n\t"\ "pmaddwd %%mm1, %%mm3 \n\t"\
"paddd %%mm0, %%mm4 \n\t"\ "paddd %%mm0, %%mm4 \n\t"\
"paddd %%mm3, %%mm5 \n\t"\ "paddd %%mm3, %%mm5 \n\t"\
"movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\ "add %6, %%"REG_S" \n\t" \
"movq (%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
"mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\ "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
"add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\ "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
"test %%"REG_S", %%"REG_S" \n\t"\ "test %%"REG_S", %%"REG_S" \n\t"\
...@@ -461,12 +466,14 @@ static inline void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc, ...@@ -461,12 +466,14 @@ static inline void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc,
static inline void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter, static inline void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize, const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc, const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc,
int chrFilterSize, const int16_t **alpSrc, int chrFilterSize, const int16_t **alpSrc,
uint8_t *dest, long dstW, long dstY) uint8_t *dest, long dstW, long dstY)
{ {
x86_reg dummy=0; x86_reg dummy=0;
x86_reg dstW_reg = dstW; x86_reg dstW_reg = dstW;
x86_reg uv_off = c->uv_off << 1;
if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
YSCALEYUV2PACKEDX_ACCURATE YSCALEYUV2PACKEDX_ACCURATE
...@@ -492,12 +499,14 @@ static inline void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilte ...@@ -492,12 +499,14 @@ static inline void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilte
static inline void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter, static inline void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize, const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc, const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc,
int chrFilterSize, const int16_t **alpSrc, int chrFilterSize, const int16_t **alpSrc,
uint8_t *dest, long dstW, long dstY) uint8_t *dest, long dstW, long dstY)
{ {
x86_reg dummy=0; x86_reg dummy=0;
x86_reg dstW_reg = dstW; x86_reg dstW_reg = dstW;
x86_reg uv_off = c->uv_off << 1;
if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
YSCALEYUV2PACKEDX YSCALEYUV2PACKEDX
...@@ -547,12 +556,14 @@ static inline void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter, ...@@ -547,12 +556,14 @@ static inline void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
static inline void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter, static inline void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize, const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc, const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc,
int chrFilterSize, const int16_t **alpSrc, int chrFilterSize, const int16_t **alpSrc,
uint8_t *dest, long dstW, long dstY) uint8_t *dest, long dstW, long dstY)
{ {
x86_reg dummy=0; x86_reg dummy=0;
x86_reg dstW_reg = dstW; x86_reg dstW_reg = dstW;
x86_reg uv_off = c->uv_off << 1;
YSCALEYUV2PACKEDX_ACCURATE YSCALEYUV2PACKEDX_ACCURATE
YSCALEYUV2RGBX YSCALEYUV2RGBX
...@@ -569,12 +580,14 @@ static inline void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilt ...@@ -569,12 +580,14 @@ static inline void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilt
static inline void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter, static inline void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize, const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc, const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc,
int chrFilterSize, const int16_t **alpSrc, int chrFilterSize, const int16_t **alpSrc,
uint8_t *dest, long dstW, long dstY) uint8_t *dest, long dstW, long dstY)
{ {
x86_reg dummy=0; x86_reg dummy=0;
x86_reg dstW_reg = dstW; x86_reg dstW_reg = dstW;
x86_reg uv_off = c->uv_off << 1;
YSCALEYUV2PACKEDX YSCALEYUV2PACKEDX
YSCALEYUV2RGBX YSCALEYUV2RGBX
...@@ -620,12 +633,14 @@ static inline void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter, ...@@ -620,12 +633,14 @@ static inline void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
static inline void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter, static inline void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize, const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc, const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc,
int chrFilterSize, const int16_t **alpSrc, int chrFilterSize, const int16_t **alpSrc,
uint8_t *dest, long dstW, long dstY) uint8_t *dest, long dstW, long dstY)
{ {
x86_reg dummy=0; x86_reg dummy=0;
x86_reg dstW_reg = dstW; x86_reg dstW_reg = dstW;
x86_reg uv_off = c->uv_off << 1;
YSCALEYUV2PACKEDX_ACCURATE YSCALEYUV2PACKEDX_ACCURATE
YSCALEYUV2RGBX YSCALEYUV2RGBX
...@@ -642,12 +657,14 @@ static inline void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilt ...@@ -642,12 +657,14 @@ static inline void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilt
static inline void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter, static inline void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize, const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc, const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc,
int chrFilterSize, const int16_t **alpSrc, int chrFilterSize, const int16_t **alpSrc,
uint8_t *dest, long dstW, long dstY) uint8_t *dest, long dstW, long dstY)
{ {
x86_reg dummy=0; x86_reg dummy=0;
x86_reg dstW_reg = dstW; x86_reg dstW_reg = dstW;
x86_reg uv_off = c->uv_off << 1;
YSCALEYUV2PACKEDX YSCALEYUV2PACKEDX
YSCALEYUV2RGBX YSCALEYUV2RGBX
...@@ -773,12 +790,14 @@ static inline void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter, ...@@ -773,12 +790,14 @@ static inline void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
static inline void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter, static inline void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize, const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc, const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc,
int chrFilterSize, const int16_t **alpSrc, int chrFilterSize, const int16_t **alpSrc,
uint8_t *dest, long dstW, long dstY) uint8_t *dest, long dstW, long dstY)
{ {
x86_reg dummy=0; x86_reg dummy=0;
x86_reg dstW_reg = dstW; x86_reg dstW_reg = dstW;
x86_reg uv_off = c->uv_off << 1;
YSCALEYUV2PACKEDX_ACCURATE YSCALEYUV2PACKEDX_ACCURATE
YSCALEYUV2RGBX YSCALEYUV2RGBX
...@@ -788,19 +807,21 @@ static inline void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilte ...@@ -788,19 +807,21 @@ static inline void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilte
WRITEBGR24(%%REGc, %5, %%REGa) WRITEBGR24(%%REGc, %5, %%REGa)
:: "r" (&c->redDither), :: "r" (&c->redDither),
"m" (dummy), "m" (dummy), "m" (dummy), "m" (dummy), "m" (dummy), "m" (dummy),
"r" (dest), "m" (dstW_reg) "r" (dest), "m" (dstW_reg), "m"(uv_off)
: "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
); );
} }
static inline void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter, static inline void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize, const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc, const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc,
int chrFilterSize, const int16_t **alpSrc, int chrFilterSize, const int16_t **alpSrc,
uint8_t *dest, long dstW, long dstY) uint8_t *dest, long dstW, long dstY)
{ {
x86_reg dummy=0; x86_reg dummy=0;
x86_reg dstW_reg = dstW; x86_reg dstW_reg = dstW;
x86_reg uv_off = c->uv_off << 1;
YSCALEYUV2PACKEDX YSCALEYUV2PACKEDX
YSCALEYUV2RGBX YSCALEYUV2RGBX
...@@ -810,7 +831,7 @@ static inline void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter, ...@@ -810,7 +831,7 @@ static inline void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
WRITEBGR24(%%REGc, %5, %%REGa) WRITEBGR24(%%REGc, %5, %%REGa)
:: "r" (&c->redDither), :: "r" (&c->redDither),
"m" (dummy), "m" (dummy), "m" (dummy), "m" (dummy), "m" (dummy), "m" (dummy),
"r" (dest), "m" (dstW_reg) "r" (dest), "m" (dstW_reg), "m"(uv_off)
: "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
); );
} }
...@@ -832,15 +853,16 @@ static inline void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter, ...@@ -832,15 +853,16 @@ static inline void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
" jb 1b \n\t" " jb 1b \n\t"
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index) #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
static inline void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter, static inline void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize, const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc, const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc,
int chrFilterSize, const int16_t **alpSrc, int chrFilterSize, const int16_t **alpSrc,
uint8_t *dest, long dstW, long dstY) uint8_t *dest, long dstW, long dstY)
{ {
x86_reg dummy=0; x86_reg dummy=0;
x86_reg dstW_reg = dstW; x86_reg dstW_reg = dstW;
x86_reg uv_off = c->uv_off << 1;
YSCALEYUV2PACKEDX_ACCURATE YSCALEYUV2PACKEDX_ACCURATE
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
...@@ -854,12 +876,14 @@ static inline void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFil ...@@ -854,12 +876,14 @@ static inline void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFil
static inline void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter, static inline void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize, const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrSrc, const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc,
int chrFilterSize, const int16_t **alpSrc, int chrFilterSize, const int16_t **alpSrc,
uint8_t *dest, long dstW, long dstY) uint8_t *dest, long dstW, long dstY)
{ {
x86_reg dummy=0; x86_reg dummy=0;
x86_reg dstW_reg = dstW; x86_reg dstW_reg = dstW;
x86_reg uv_off = c->uv_off << 1;
YSCALEYUV2PACKEDX YSCALEYUV2PACKEDX
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
...@@ -871,14 +895,16 @@ static inline void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter ...@@ -871,14 +895,16 @@ static inline void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter
YSCALEYUV2PACKEDX_END YSCALEYUV2PACKEDX_END
} }
#define REAL_YSCALEYUV2RGB_UV(index, c) \ #define REAL_YSCALEYUV2RGB_UV(index, c, uv_off) \
"xor "#index", "#index" \n\t"\ "xor "#index", "#index" \n\t"\
".p2align 4 \n\t"\ ".p2align 4 \n\t"\
"1: \n\t"\ "1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
"movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ "add "#uv_off", "#index" \n\t" \
"movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
"sub "#uv_off", "#index" \n\t" \
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
"psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
...@@ -941,8 +967,8 @@ static inline void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter ...@@ -941,8 +967,8 @@ static inline void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter
#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
#define YSCALEYUV2RGB(index, c) \ #define YSCALEYUV2RGB(index, c, uv_off) \
REAL_YSCALEYUV2RGB_UV(index, c) \ REAL_YSCALEYUV2RGB_UV(index, c, uv_off) \
REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \ REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
REAL_YSCALEYUV2RGB_COEFF(c) REAL_YSCALEYUV2RGB_COEFF(c)
...@@ -950,23 +976,26 @@ static inline void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter ...@@ -950,23 +976,26 @@ static inline void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter
* vertical bilinear scale YV12 to RGB * vertical bilinear scale YV12 to RGB
*/ */
static inline void RENAME(yuv2rgb32_2)(SwsContext *c, const uint16_t *buf0, static inline void RENAME(yuv2rgb32_2)(SwsContext *c, const uint16_t *buf0,
const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *buf1, const uint16_t *ubuf0,
const uint16_t *uvbuf1, const uint16_t *abuf0, const uint16_t *ubuf1, const uint16_t *vbuf0,
const uint16_t *vbuf1, const uint16_t *abuf0,
const uint16_t *abuf1, uint8_t *dest, const uint16_t *abuf1, uint8_t *dest,
int dstW, int yalpha, int uvalpha, int y) int dstW, int yalpha, int uvalpha, int y)
{ {
x86_reg uv_off = c->uv_off << 1;
if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
#if ARCH_X86_64 #if ARCH_X86_64
__asm__ volatile( __asm__ volatile(
YSCALEYUV2RGB(%%r8, %5) YSCALEYUV2RGB(%%r8, %5, %8)
YSCALEYUV2RGB_YA(%%r8, %5, %6, %7) YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
"psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
"psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
"packuswb %%mm7, %%mm1 \n\t" "packuswb %%mm7, %%mm1 \n\t"
WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest), :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),
"a" (&c->redDither), "a" (&c->redDither),
"r" (abuf0), "r" (abuf1) "r" (abuf0), "r" (abuf1), "m"(uv_off)
: "%r8" : "%r8"
); );
#else #else
...@@ -976,7 +1005,7 @@ static inline void RENAME(yuv2rgb32_2)(SwsContext *c, const uint16_t *buf0, ...@@ -976,7 +1005,7 @@ static inline void RENAME(yuv2rgb32_2)(SwsContext *c, const uint16_t *buf0,
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t" "mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t" "push %%"REG_BP" \n\t"
YSCALEYUV2RGB(%%REGBP, %5) YSCALEYUV2RGB(%%REGBP, %5, %6)
"push %0 \n\t" "push %0 \n\t"
"push %1 \n\t" "push %1 \n\t"
"mov "U_TEMP"(%5), %0 \n\t" "mov "U_TEMP"(%5), %0 \n\t"
...@@ -990,8 +1019,8 @@ static inline void RENAME(yuv2rgb32_2)(SwsContext *c, const uint16_t *buf0, ...@@ -990,8 +1019,8 @@ static inline void RENAME(yuv2rgb32_2)(SwsContext *c, const uint16_t *buf0,
WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
"pop %%"REG_BP" \n\t" "pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither) "a" (&c->redDither), "m"(uv_off)
); );
#endif #endif
} else { } else {
...@@ -999,50 +1028,56 @@ static inline void RENAME(yuv2rgb32_2)(SwsContext *c, const uint16_t *buf0, ...@@ -999,50 +1028,56 @@ static inline void RENAME(yuv2rgb32_2)(SwsContext *c, const uint16_t *buf0,
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t" "mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t" "push %%"REG_BP" \n\t"
YSCALEYUV2RGB(%%REGBP, %5) YSCALEYUV2RGB(%%REGBP, %5, %6)
"pcmpeqd %%mm7, %%mm7 \n\t" "pcmpeqd %%mm7, %%mm7 \n\t"
WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
"pop %%"REG_BP" \n\t" "pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither) "a" (&c->redDither), "m"(uv_off)
); );
} }
} }
static inline void RENAME(yuv2bgr24_2)(SwsContext *c, const uint16_t *buf0, static inline void RENAME(yuv2bgr24_2)(SwsContext *c, const uint16_t *buf0,
const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *buf1, const uint16_t *ubuf0,
const uint16_t *uvbuf1, const uint16_t *abuf0, const uint16_t *ubuf1, const uint16_t *vbuf0,
const uint16_t *vbuf1, const uint16_t *abuf0,
const uint16_t *abuf1, uint8_t *dest, const uint16_t *abuf1, uint8_t *dest,
int dstW, int yalpha, int uvalpha, int y) int dstW, int yalpha, int uvalpha, int y)
{ {
x86_reg uv_off = c->uv_off << 1;
//Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
__asm__ volatile( __asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t" "mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t" "push %%"REG_BP" \n\t"
YSCALEYUV2RGB(%%REGBP, %5) YSCALEYUV2RGB(%%REGBP, %5, %6)
"pxor %%mm7, %%mm7 \n\t" "pxor %%mm7, %%mm7 \n\t"
WRITEBGR24(%%REGb, 8280(%5), %%REGBP) WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
"pop %%"REG_BP" \n\t" "pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither) "a" (&c->redDither), "m"(uv_off)
); );
} }
static inline void RENAME(yuv2rgb555_2)(SwsContext *c, const uint16_t *buf0, static inline void RENAME(yuv2rgb555_2)(SwsContext *c, const uint16_t *buf0,
const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *buf1, const uint16_t *ubuf0,
const uint16_t *uvbuf1, const uint16_t *abuf0, const uint16_t *ubuf1, const uint16_t *vbuf0,
const uint16_t *vbuf1, const uint16_t *abuf0,
const uint16_t *abuf1, uint8_t *dest, const uint16_t *abuf1, uint8_t *dest,
int dstW, int yalpha, int uvalpha, int y) int dstW, int yalpha, int uvalpha, int y)
{ {
x86_reg uv_off = c->uv_off << 1;
//Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
__asm__ volatile( __asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t" "mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t" "push %%"REG_BP" \n\t"
YSCALEYUV2RGB(%%REGBP, %5) YSCALEYUV2RGB(%%REGBP, %5, %6)
"pxor %%mm7, %%mm7 \n\t" "pxor %%mm7, %%mm7 \n\t"
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP #ifdef DITHER1XBPP
...@@ -1053,23 +1088,26 @@ static inline void RENAME(yuv2rgb555_2)(SwsContext *c, const uint16_t *buf0, ...@@ -1053,23 +1088,26 @@ static inline void RENAME(yuv2rgb555_2)(SwsContext *c, const uint16_t *buf0,
WRITERGB15(%%REGb, 8280(%5), %%REGBP) WRITERGB15(%%REGb, 8280(%5), %%REGBP)
"pop %%"REG_BP" \n\t" "pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither) "a" (&c->redDither), "m"(uv_off)
); );
} }
static inline void RENAME(yuv2rgb565_2)(SwsContext *c, const uint16_t *buf0, static inline void RENAME(yuv2rgb565_2)(SwsContext *c, const uint16_t *buf0,
const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *buf1, const uint16_t *ubuf0,
const uint16_t *uvbuf1, const uint16_t *abuf0, const uint16_t *ubuf1, const uint16_t *vbuf0,
const uint16_t *vbuf1, const uint16_t *abuf0,
const uint16_t *abuf1, uint8_t *dest, const uint16_t *abuf1, uint8_t *dest,
int dstW, int yalpha, int uvalpha, int y) int dstW, int yalpha, int uvalpha, int y)
{ {
x86_reg uv_off = c->uv_off << 1;
//Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
__asm__ volatile( __asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t" "mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t" "push %%"REG_BP" \n\t"
YSCALEYUV2RGB(%%REGBP, %5) YSCALEYUV2RGB(%%REGBP, %5, %6)
"pxor %%mm7, %%mm7 \n\t" "pxor %%mm7, %%mm7 \n\t"
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP #ifdef DITHER1XBPP
...@@ -1080,12 +1118,12 @@ static inline void RENAME(yuv2rgb565_2)(SwsContext *c, const uint16_t *buf0, ...@@ -1080,12 +1118,12 @@ static inline void RENAME(yuv2rgb565_2)(SwsContext *c, const uint16_t *buf0,
WRITERGB16(%%REGb, 8280(%5), %%REGBP) WRITERGB16(%%REGb, 8280(%5), %%REGBP)
"pop %%"REG_BP" \n\t" "pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither) "a" (&c->redDither), "m"(uv_off)
); );
} }
#define REAL_YSCALEYUV2PACKED(index, c) \ #define REAL_YSCALEYUV2PACKED(index, c, uv_off) \
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
"movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\ "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
"psraw $3, %%mm0 \n\t"\ "psraw $3, %%mm0 \n\t"\
...@@ -1097,8 +1135,10 @@ static inline void RENAME(yuv2rgb565_2)(SwsContext *c, const uint16_t *buf0, ...@@ -1097,8 +1135,10 @@ static inline void RENAME(yuv2rgb565_2)(SwsContext *c, const uint16_t *buf0,
"1: \n\t"\ "1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
"movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ "add "#uv_off", "#index" \n\t" \
"movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
"sub "#uv_off", "#index" \n\t" \
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
"psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
...@@ -1121,34 +1161,39 @@ static inline void RENAME(yuv2rgb565_2)(SwsContext *c, const uint16_t *buf0, ...@@ -1121,34 +1161,39 @@ static inline void RENAME(yuv2rgb565_2)(SwsContext *c, const uint16_t *buf0,
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
"paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c) #define YSCALEYUV2PACKED(index, c, uv_off) REAL_YSCALEYUV2PACKED(index, c, uv_off)
/*
 * yuv2yuyv422_2: packed-output writer built from the YSCALEYUV2PACKED and
 * WRITEYUY2 asm macros.
 *
 * NOTE(review): this span is a side-by-side diff rendering; most lines carry
 * the pre-change text followed by the post-change text. The notes below
 * describe the post-change (right-hand) column.
 *
 * Inline-asm operands, in order:
 *   %0 = buf0 ("c"), %1 = buf1 ("d"), %2 = ubuf0 ("S"), %3 = ubuf1 ("D"),
 *   %4 = dest ("m"), %5 = &c->redDither ("a"), %6 = uv_off ("m").
 *
 * vbuf0/vbuf1 are accepted but are not asm operands: the second chroma plane
 * is read through %2/%3 after the macro temporarily adds uv_off (%6) to the
 * index register. yalpha, uvalpha, abuf0, abuf1, dstW and y are not
 * referenced directly in this body.
 */
static inline void RENAME(yuv2yuyv422_2)(SwsContext *c, const uint16_t *buf0, static inline void RENAME(yuv2yuyv422_2)(SwsContext *c, const uint16_t *buf0,
const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *buf1, const uint16_t *ubuf0,
const uint16_t *uvbuf1, const uint16_t *abuf0, const uint16_t *ubuf1, const uint16_t *vbuf0,
const uint16_t *vbuf1, const uint16_t *abuf0,
const uint16_t *abuf1, uint8_t *dest, const uint16_t *abuf1, uint8_t *dest,
int dstW, int yalpha, int uvalpha, int y) int dstW, int yalpha, int uvalpha, int y)
{ {
/* Offset handed to the asm as %6; presumably the << 1 turns a count of 16-bit samples into bytes -- TODO confirm against where c->uv_off is set. */
x86_reg uv_off = c->uv_off << 1;
//Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
__asm__ volatile( __asm__ volatile(
/* Save REG_b into the context slot at ESP_OFFSET(%5), then reuse REG_b to hold dest (%4). */
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t" "mov %4, %%"REG_b" \n\t"
/* REG_BP is passed to the macros below as the index register, so it is preserved on the stack around them. */
"push %%"REG_BP" \n\t" "push %%"REG_BP" \n\t"
YSCALEYUV2PACKED(%%REGBP, %5) YSCALEYUV2PACKED(%%REGBP, %5, %6)
WRITEYUY2(%%REGb, 8280(%5), %%REGBP) WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
/* Restore REG_BP and the saved REG_b before leaving the asm block. */
"pop %%"REG_BP" \n\t" "pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither) "a" (&c->redDither), "m"(uv_off)
); );
} }
#define REAL_YSCALEYUV2RGB1(index, c) \ #define REAL_YSCALEYUV2RGB1(index, c, uv_off) \
"xor "#index", "#index" \n\t"\ "xor "#index", "#index" \n\t"\
".p2align 4 \n\t"\ ".p2align 4 \n\t"\
"1: \n\t"\ "1: \n\t"\
"movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\ "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
"movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ "add "#uv_off", "#index" \n\t" \
"movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
"sub "#uv_off", "#index" \n\t" \
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
"psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
"psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
...@@ -1190,17 +1235,19 @@ static inline void RENAME(yuv2yuyv422_2)(SwsContext *c, const uint16_t *buf0, ...@@ -1190,17 +1235,19 @@ static inline void RENAME(yuv2yuyv422_2)(SwsContext *c, const uint16_t *buf0,
"packuswb %%mm6, %%mm5 \n\t"\ "packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\ "packuswb %%mm3, %%mm4 \n\t"\
#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c) #define YSCALEYUV2RGB1(index, c, uv_off) REAL_YSCALEYUV2RGB1(index, c, uv_off)
// do vertical chrominance interpolation // do vertical chrominance interpolation
#define REAL_YSCALEYUV2RGB1b(index, c) \ #define REAL_YSCALEYUV2RGB1b(index, c, uv_off) \
"xor "#index", "#index" \n\t"\ "xor "#index", "#index" \n\t"\
".p2align 4 \n\t"\ ".p2align 4 \n\t"\
"1: \n\t"\ "1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
"movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ "add "#uv_off", "#index" \n\t" \
"movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
"sub "#uv_off", "#index" \n\t" \
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\ "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
"psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\ "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
...@@ -1244,7 +1291,7 @@ static inline void RENAME(yuv2yuyv422_2)(SwsContext *c, const uint16_t *buf0, ...@@ -1244,7 +1291,7 @@ static inline void RENAME(yuv2yuyv422_2)(SwsContext *c, const uint16_t *buf0,
"packuswb %%mm6, %%mm5 \n\t"\ "packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\ "packuswb %%mm3, %%mm4 \n\t"\
#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c) #define YSCALEYUV2RGB1b(index, c, uv_off) REAL_YSCALEYUV2RGB1b(index, c, uv_off)
#define REAL_YSCALEYUV2RGB1_ALPHA(index) \ #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
"movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\ "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
...@@ -1258,11 +1305,13 @@ static inline void RENAME(yuv2yuyv422_2)(SwsContext *c, const uint16_t *buf0, ...@@ -1258,11 +1305,13 @@ static inline void RENAME(yuv2yuyv422_2)(SwsContext *c, const uint16_t *buf0,
* YV12 to RGB without scaling or interpolating * YV12 to RGB without scaling or interpolating
*/ */
static inline void RENAME(yuv2rgb32_1)(SwsContext *c, const uint16_t *buf0, static inline void RENAME(yuv2rgb32_1)(SwsContext *c, const uint16_t *buf0,
const uint16_t *uvbuf0, const uint16_t *uvbuf1, const uint16_t *ubuf0, const uint16_t *ubuf1,
const uint16_t *vbuf0, const uint16_t *vbuf1,
const uint16_t *abuf0, uint8_t *dest, const uint16_t *abuf0, uint8_t *dest,
int dstW, int uvalpha, enum PixelFormat dstFormat, int dstW, int uvalpha, enum PixelFormat dstFormat,
int flags, int y) int flags, int y)
{ {
x86_reg uv_off = c->uv_off << 1;
const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
...@@ -1271,26 +1320,26 @@ static inline void RENAME(yuv2rgb32_1)(SwsContext *c, const uint16_t *buf0, ...@@ -1271,26 +1320,26 @@ static inline void RENAME(yuv2rgb32_1)(SwsContext *c, const uint16_t *buf0,
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t" "mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t" "push %%"REG_BP" \n\t"
YSCALEYUV2RGB1(%%REGBP, %5) YSCALEYUV2RGB1(%%REGBP, %5, %6)
YSCALEYUV2RGB1_ALPHA(%%REGBP) YSCALEYUV2RGB1_ALPHA(%%REGBP)
WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
"pop %%"REG_BP" \n\t" "pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither) "a" (&c->redDither), "m"(uv_off)
); );
} else { } else {
__asm__ volatile( __asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t" "mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t" "push %%"REG_BP" \n\t"
YSCALEYUV2RGB1(%%REGBP, %5) YSCALEYUV2RGB1(%%REGBP, %5, %6)
"pcmpeqd %%mm7, %%mm7 \n\t" "pcmpeqd %%mm7, %%mm7 \n\t"
WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
"pop %%"REG_BP" \n\t" "pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither) "a" (&c->redDither), "m"(uv_off)
); );
} }
} else { } else {
...@@ -1299,37 +1348,39 @@ static inline void RENAME(yuv2rgb32_1)(SwsContext *c, const uint16_t *buf0, ...@@ -1299,37 +1348,39 @@ static inline void RENAME(yuv2rgb32_1)(SwsContext *c, const uint16_t *buf0,
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t" "mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t" "push %%"REG_BP" \n\t"
YSCALEYUV2RGB1b(%%REGBP, %5) YSCALEYUV2RGB1b(%%REGBP, %5, %6)
YSCALEYUV2RGB1_ALPHA(%%REGBP) YSCALEYUV2RGB1_ALPHA(%%REGBP)
WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
"pop %%"REG_BP" \n\t" "pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither) "a" (&c->redDither), "m"(uv_off)
); );
} else { } else {
__asm__ volatile( __asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t" "mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t" "push %%"REG_BP" \n\t"
YSCALEYUV2RGB1b(%%REGBP, %5) YSCALEYUV2RGB1b(%%REGBP, %5, %6)
"pcmpeqd %%mm7, %%mm7 \n\t" "pcmpeqd %%mm7, %%mm7 \n\t"
WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
"pop %%"REG_BP" \n\t" "pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither) "a" (&c->redDither), "m"(uv_off)
); );
} }
} }
} }
static inline void RENAME(yuv2bgr24_1)(SwsContext *c, const uint16_t *buf0, static inline void RENAME(yuv2bgr24_1)(SwsContext *c, const uint16_t *buf0,
const uint16_t *uvbuf0, const uint16_t *uvbuf1, const uint16_t *ubuf0, const uint16_t *ubuf1,
const uint16_t *vbuf0, const uint16_t *vbuf1,
const uint16_t *abuf0, uint8_t *dest, const uint16_t *abuf0, uint8_t *dest,
int dstW, int uvalpha, enum PixelFormat dstFormat, int dstW, int uvalpha, enum PixelFormat dstFormat,
int flags, int y) int flags, int y)
{ {
x86_reg uv_off = c->uv_off << 1;
const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
...@@ -1337,36 +1388,38 @@ static inline void RENAME(yuv2bgr24_1)(SwsContext *c, const uint16_t *buf0, ...@@ -1337,36 +1388,38 @@ static inline void RENAME(yuv2bgr24_1)(SwsContext *c, const uint16_t *buf0,
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t" "mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t" "push %%"REG_BP" \n\t"
YSCALEYUV2RGB1(%%REGBP, %5) YSCALEYUV2RGB1(%%REGBP, %5, %6)
"pxor %%mm7, %%mm7 \n\t" "pxor %%mm7, %%mm7 \n\t"
WRITEBGR24(%%REGb, 8280(%5), %%REGBP) WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
"pop %%"REG_BP" \n\t" "pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither) "a" (&c->redDither), "m"(uv_off)
); );
} else { } else {
__asm__ volatile( __asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t" "mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t" "push %%"REG_BP" \n\t"
YSCALEYUV2RGB1b(%%REGBP, %5) YSCALEYUV2RGB1b(%%REGBP, %5, %6)
"pxor %%mm7, %%mm7 \n\t" "pxor %%mm7, %%mm7 \n\t"
WRITEBGR24(%%REGb, 8280(%5), %%REGBP) WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
"pop %%"REG_BP" \n\t" "pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither) "a" (&c->redDither), "m"(uv_off)
); );
} }
} }
static inline void RENAME(yuv2rgb555_1)(SwsContext *c, const uint16_t *buf0, static inline void RENAME(yuv2rgb555_1)(SwsContext *c, const uint16_t *buf0,
const uint16_t *uvbuf0, const uint16_t *uvbuf1, const uint16_t *ubuf0, const uint16_t *ubuf1,
const uint16_t *vbuf0, const uint16_t *vbuf1,
const uint16_t *abuf0, uint8_t *dest, const uint16_t *abuf0, uint8_t *dest,
int dstW, int uvalpha, enum PixelFormat dstFormat, int dstW, int uvalpha, enum PixelFormat dstFormat,
int flags, int y) int flags, int y)
{ {
x86_reg uv_off = c->uv_off << 1;
const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
...@@ -1374,7 +1427,7 @@ static inline void RENAME(yuv2rgb555_1)(SwsContext *c, const uint16_t *buf0, ...@@ -1374,7 +1427,7 @@ static inline void RENAME(yuv2rgb555_1)(SwsContext *c, const uint16_t *buf0,
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t" "mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t" "push %%"REG_BP" \n\t"
YSCALEYUV2RGB1(%%REGBP, %5) YSCALEYUV2RGB1(%%REGBP, %5, %6)
"pxor %%mm7, %%mm7 \n\t" "pxor %%mm7, %%mm7 \n\t"
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP #ifdef DITHER1XBPP
...@@ -1385,15 +1438,15 @@ static inline void RENAME(yuv2rgb555_1)(SwsContext *c, const uint16_t *buf0, ...@@ -1385,15 +1438,15 @@ static inline void RENAME(yuv2rgb555_1)(SwsContext *c, const uint16_t *buf0,
WRITERGB15(%%REGb, 8280(%5), %%REGBP) WRITERGB15(%%REGb, 8280(%5), %%REGBP)
"pop %%"REG_BP" \n\t" "pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither) "a" (&c->redDither), "m"(uv_off)
); );
} else { } else {
__asm__ volatile( __asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t" "mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t" "push %%"REG_BP" \n\t"
YSCALEYUV2RGB1b(%%REGBP, %5) YSCALEYUV2RGB1b(%%REGBP, %5, %6)
"pxor %%mm7, %%mm7 \n\t" "pxor %%mm7, %%mm7 \n\t"
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP #ifdef DITHER1XBPP
...@@ -1404,18 +1457,20 @@ static inline void RENAME(yuv2rgb555_1)(SwsContext *c, const uint16_t *buf0, ...@@ -1404,18 +1457,20 @@ static inline void RENAME(yuv2rgb555_1)(SwsContext *c, const uint16_t *buf0,
WRITERGB15(%%REGb, 8280(%5), %%REGBP) WRITERGB15(%%REGb, 8280(%5), %%REGBP)
"pop %%"REG_BP" \n\t" "pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither) "a" (&c->redDither), "m"(uv_off)
); );
} }
} }
static inline void RENAME(yuv2rgb565_1)(SwsContext *c, const uint16_t *buf0, static inline void RENAME(yuv2rgb565_1)(SwsContext *c, const uint16_t *buf0,
const uint16_t *uvbuf0, const uint16_t *uvbuf1, const uint16_t *ubuf0, const uint16_t *ubuf1,
const uint16_t *vbuf0, const uint16_t *vbuf1,
const uint16_t *abuf0, uint8_t *dest, const uint16_t *abuf0, uint8_t *dest,
int dstW, int uvalpha, enum PixelFormat dstFormat, int dstW, int uvalpha, enum PixelFormat dstFormat,
int flags, int y) int flags, int y)
{ {
x86_reg uv_off = c->uv_off << 1;
const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
...@@ -1423,7 +1478,7 @@ static inline void RENAME(yuv2rgb565_1)(SwsContext *c, const uint16_t *buf0, ...@@ -1423,7 +1478,7 @@ static inline void RENAME(yuv2rgb565_1)(SwsContext *c, const uint16_t *buf0,
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t" "mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t" "push %%"REG_BP" \n\t"
YSCALEYUV2RGB1(%%REGBP, %5) YSCALEYUV2RGB1(%%REGBP, %5, %6)
"pxor %%mm7, %%mm7 \n\t" "pxor %%mm7, %%mm7 \n\t"
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP #ifdef DITHER1XBPP
...@@ -1434,15 +1489,15 @@ static inline void RENAME(yuv2rgb565_1)(SwsContext *c, const uint16_t *buf0, ...@@ -1434,15 +1489,15 @@ static inline void RENAME(yuv2rgb565_1)(SwsContext *c, const uint16_t *buf0,
WRITERGB16(%%REGb, 8280(%5), %%REGBP) WRITERGB16(%%REGb, 8280(%5), %%REGBP)
"pop %%"REG_BP" \n\t" "pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither) "a" (&c->redDither), "m"(uv_off)
); );
} else { } else {
__asm__ volatile( __asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t" "mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t" "push %%"REG_BP" \n\t"
YSCALEYUV2RGB1b(%%REGBP, %5) YSCALEYUV2RGB1b(%%REGBP, %5, %6)
"pxor %%mm7, %%mm7 \n\t" "pxor %%mm7, %%mm7 \n\t"
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP #ifdef DITHER1XBPP
...@@ -1453,18 +1508,20 @@ static inline void RENAME(yuv2rgb565_1)(SwsContext *c, const uint16_t *buf0, ...@@ -1453,18 +1508,20 @@ static inline void RENAME(yuv2rgb565_1)(SwsContext *c, const uint16_t *buf0,
WRITERGB16(%%REGb, 8280(%5), %%REGBP) WRITERGB16(%%REGb, 8280(%5), %%REGBP)
"pop %%"REG_BP" \n\t" "pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither) "a" (&c->redDither), "m"(uv_off)
); );
} }
} }
#define REAL_YSCALEYUV2PACKED1(index, c) \ #define REAL_YSCALEYUV2PACKED1(index, c, uv_off) \
"xor "#index", "#index" \n\t"\ "xor "#index", "#index" \n\t"\
".p2align 4 \n\t"\ ".p2align 4 \n\t"\
"1: \n\t"\ "1: \n\t"\
"movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\ "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
"movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ "add "#uv_off", "#index" \n\t" \
"movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
"sub "#uv_off", "#index" \n\t" \
"psraw $7, %%mm3 \n\t" \ "psraw $7, %%mm3 \n\t" \
"psraw $7, %%mm4 \n\t" \ "psraw $7, %%mm4 \n\t" \
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
...@@ -1472,16 +1529,18 @@ static inline void RENAME(yuv2rgb565_1)(SwsContext *c, const uint16_t *buf0, ...@@ -1472,16 +1529,18 @@ static inline void RENAME(yuv2rgb565_1)(SwsContext *c, const uint16_t *buf0,
"psraw $7, %%mm1 \n\t" \ "psraw $7, %%mm1 \n\t" \
"psraw $7, %%mm7 \n\t" \ "psraw $7, %%mm7 \n\t" \
#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c) #define YSCALEYUV2PACKED1(index, c, uv_off) REAL_YSCALEYUV2PACKED1(index, c, uv_off)
#define REAL_YSCALEYUV2PACKED1b(index, c) \ #define REAL_YSCALEYUV2PACKED1b(index, c, uv_off) \
"xor "#index", "#index" \n\t"\ "xor "#index", "#index" \n\t"\
".p2align 4 \n\t"\ ".p2align 4 \n\t"\
"1: \n\t"\ "1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
"movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ "add "#uv_off", "#index" \n\t" \
"movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
"sub "#uv_off", "#index" \n\t" \
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\ "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
"psrlw $8, %%mm3 \n\t" \ "psrlw $8, %%mm3 \n\t" \
...@@ -1490,14 +1549,16 @@ static inline void RENAME(yuv2rgb565_1)(SwsContext *c, const uint16_t *buf0, ...@@ -1490,14 +1549,16 @@ static inline void RENAME(yuv2rgb565_1)(SwsContext *c, const uint16_t *buf0,
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
"psraw $7, %%mm1 \n\t" \ "psraw $7, %%mm1 \n\t" \
"psraw $7, %%mm7 \n\t" "psraw $7, %%mm7 \n\t"
#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c) #define YSCALEYUV2PACKED1b(index, c, uv_off) REAL_YSCALEYUV2PACKED1b(index, c, uv_off)
static inline void RENAME(yuv2yuyv422_1)(SwsContext *c, const uint16_t *buf0, static inline void RENAME(yuv2yuyv422_1)(SwsContext *c, const uint16_t *buf0,
const uint16_t *uvbuf0, const uint16_t *uvbuf1, const uint16_t *ubuf0, const uint16_t *ubuf1,
const uint16_t *vbuf0, const uint16_t *vbuf1,
const uint16_t *abuf0, uint8_t *dest, const uint16_t *abuf0, uint8_t *dest,
int dstW, int uvalpha, enum PixelFormat dstFormat, int dstW, int uvalpha, enum PixelFormat dstFormat,
int flags, int y) int flags, int y)
{ {
x86_reg uv_off = c->uv_off << 1;
const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
...@@ -1505,24 +1566,24 @@ static inline void RENAME(yuv2yuyv422_1)(SwsContext *c, const uint16_t *buf0, ...@@ -1505,24 +1566,24 @@ static inline void RENAME(yuv2yuyv422_1)(SwsContext *c, const uint16_t *buf0,
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t" "mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t" "push %%"REG_BP" \n\t"
YSCALEYUV2PACKED1(%%REGBP, %5) YSCALEYUV2PACKED1(%%REGBP, %5, %6)
WRITEYUY2(%%REGb, 8280(%5), %%REGBP) WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
"pop %%"REG_BP" \n\t" "pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither) "a" (&c->redDither), "m"(uv_off)
); );
} else { } else {
__asm__ volatile( __asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t" "mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t" "push %%"REG_BP" \n\t"
YSCALEYUV2PACKED1b(%%REGBP, %5) YSCALEYUV2PACKED1b(%%REGBP, %5, %6)
WRITEYUY2(%%REGb, 8280(%5), %%REGBP) WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
"pop %%"REG_BP" \n\t" "pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither) "a" (&c->redDither), "m"(uv_off)
); );
} }
} }
...@@ -2229,7 +2290,7 @@ static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst, ...@@ -2229,7 +2290,7 @@ static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
dst[i] = src[srcW-1]*128; dst[i] = src[srcW-1]*128;
} }
static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst, static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2,
long dstWidth, const uint8_t *src1, long dstWidth, const uint8_t *src1,
const uint8_t *src2, int srcW, int xInc) const uint8_t *src2, int srcW, int xInc)
{ {
...@@ -2244,7 +2305,7 @@ static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst, ...@@ -2244,7 +2305,7 @@ static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
__asm__ volatile( __asm__ volatile(
#if defined(PIC) #if defined(PIC)
"mov %%"REG_b", %6 \n\t" "mov %%"REG_b", %7 \n\t"
#endif #endif
"pxor %%mm7, %%mm7 \n\t" "pxor %%mm7, %%mm7 \n\t"
"mov %0, %%"REG_c" \n\t" "mov %0, %%"REG_c" \n\t"
...@@ -2262,8 +2323,7 @@ static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst, ...@@ -2262,8 +2323,7 @@ static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
CALL_MMX2_FILTER_CODE CALL_MMX2_FILTER_CODE
"xor %%"REG_a", %%"REG_a" \n\t" // i "xor %%"REG_a", %%"REG_a" \n\t" // i
"mov %5, %%"REG_c" \n\t" // src "mov %5, %%"REG_c" \n\t" // src
"mov %1, %%"REG_D" \n\t" // buf1 "mov %6, %%"REG_D" \n\t" // buf2
"add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
PREFETCH" (%%"REG_c") \n\t" PREFETCH" (%%"REG_c") \n\t"
PREFETCH" 32(%%"REG_c") \n\t" PREFETCH" 32(%%"REG_c") \n\t"
PREFETCH" 64(%%"REG_c") \n\t" PREFETCH" 64(%%"REG_c") \n\t"
...@@ -2274,10 +2334,10 @@ static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst, ...@@ -2274,10 +2334,10 @@ static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
CALL_MMX2_FILTER_CODE CALL_MMX2_FILTER_CODE
#if defined(PIC) #if defined(PIC)
"mov %6, %%"REG_b" \n\t" "mov %7, %%"REG_b" \n\t"
#endif #endif
:: "m" (src1), "m" (dst), "m" (filter), "m" (filterPos), :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
"m" (mmx2FilterCode), "m" (src2) "m" (mmx2FilterCode), "m" (src2), "m"(dst2)
#if defined(PIC) #if defined(PIC)
,"m" (ebxsave) ,"m" (ebxsave)
#endif #endif
...@@ -2288,8 +2348,8 @@ static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst, ...@@ -2288,8 +2348,8 @@ static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
); );
for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) { for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
dst[i] = src1[srcW-1]*128; dst1[i] = src1[srcW-1]*128;
dst[i+VOFW] = src2[srcW-1]*128; dst2[i] = src2[srcW-1]*128;
} }
} }
#endif /* COMPILE_TEMPLATE_MMX2 */ #endif /* COMPILE_TEMPLATE_MMX2 */
...@@ -2301,7 +2361,8 @@ static void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int ...@@ -2301,7 +2361,8 @@ static void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int
const int dstH= c->dstH; const int dstH= c->dstH;
const int flags= c->flags; const int flags= c->flags;
int16_t **lumPixBuf= c->lumPixBuf; int16_t **lumPixBuf= c->lumPixBuf;
int16_t **chrPixBuf= c->chrPixBuf; int16_t **chrUPixBuf= c->chrUPixBuf;
int16_t **chrVPixBuf= c->chrVPixBuf;
int16_t **alpPixBuf= c->alpPixBuf; int16_t **alpPixBuf= c->alpPixBuf;
const int vLumBufSize= c->vLumBufSize; const int vLumBufSize= c->vLumBufSize;
const int vChrBufSize= c->vChrBufSize; const int vChrBufSize= c->vChrBufSize;
...@@ -2326,7 +2387,8 @@ static void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int ...@@ -2326,7 +2387,8 @@ static void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int
c->redDither= ff_dither8[(dstY+1)&1]; c->redDither= ff_dither8[(dstY+1)&1];
if (dstY < dstH - 2) { if (dstY < dstH - 2) {
const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; const int16_t **chrUSrcPtr= (const int16_t **) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
const int16_t **chrVSrcPtr= (const int16_t **) chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL; const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
int i; int i;
if (flags & SWS_ACCURATE_RND) { if (flags & SWS_ACCURATE_RND) {
...@@ -2345,29 +2407,26 @@ static void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int ...@@ -2345,29 +2407,26 @@ static void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int
} }
} }
for (i=0; i<vChrFilterSize; i+=2) { for (i=0; i<vChrFilterSize; i+=2) {
*(const void**)&chrMmxFilter[s*i ]= chrSrcPtr[i ]; *(const void**)&chrMmxFilter[s*i ]= chrUSrcPtr[i ];
*(const void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrSrcPtr[i+(vChrFilterSize>1)]; *(const void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrUSrcPtr[i+(vChrFilterSize>1)];
chrMmxFilter[s*i+APCK_COEF/4 ]= chrMmxFilter[s*i+APCK_COEF/4 ]=
chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ] chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
+ (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0); + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
} }
} else { } else {
for (i=0; i<vLumFilterSize; i++) { for (i=0; i<vLumFilterSize; i++) {
lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i]; *(const void**)&lumMmxFilter[4*i+0]= lumSrcPtr[i];
lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
lumMmxFilter[4*i+2]= lumMmxFilter[4*i+2]=
lumMmxFilter[4*i+3]= lumMmxFilter[4*i+3]=
((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001; ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
if (CONFIG_SWSCALE_ALPHA && alpPixBuf) { if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i]; *(const void**)&alpMmxFilter[4*i+0]= alpSrcPtr[i];
alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32;
alpMmxFilter[4*i+2]= alpMmxFilter[4*i+2]=
alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2]; alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
} }
} }
for (i=0; i<vChrFilterSize; i++) { for (i=0; i<vChrFilterSize; i++) {
chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i]; *(const void**)&chrMmxFilter[4*i+0]= chrUSrcPtr[i];
chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
chrMmxFilter[4*i+2]= chrMmxFilter[4*i+2]=
chrMmxFilter[4*i+3]= chrMmxFilter[4*i+3]=
((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001; ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
...@@ -2492,7 +2551,6 @@ static void RENAME(sws_init_swScale)(SwsContext *c) ...@@ -2492,7 +2551,6 @@ static void RENAME(sws_init_swScale)(SwsContext *c)
} }
} }
#endif /* !COMPILE_TEMPLATE_MMX2 */ #endif /* !COMPILE_TEMPLATE_MMX2 */
if(isAnyRGB(c->srcFormat)) if(isAnyRGB(c->srcFormat))
c->hScale16= RENAME(hScale16); c->hScale16= RENAME(hScale16);
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment