Commit 73d33554 authored by Michael Niedermayer

more speed

Originally committed as revision 2438 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc
parent 2d83f323
...@@ -2603,6 +2603,7 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri ...@@ -2603,6 +2603,7 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
after watching a black picture for 5 hours*/ after watching a black picture for 5 hours*/
static uint64_t *yHistogram= NULL; static uint64_t *yHistogram= NULL;
int black=0, white=255; // blackest black and whitest white in the picture int black=0, white=255; // blackest black and whitest white in the picture
int QPCorrecture= 256;
/* Temporary buffers for handling the last row(s) */ /* Temporary buffers for handling the last row(s) */
static uint8_t *tempDst= NULL; static uint8_t *tempDst= NULL;
...@@ -2693,6 +2694,9 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri ...@@ -2693,6 +2694,9 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
packedYOffset= 0; packedYOffset= 0;
} }
if(mode & LEVEL_FIX) QPCorrecture= packedYScale &0xFFFF;
else QPCorrecture= 256;
/* copy first row of 8x8 blocks */ /* copy first row of 8x8 blocks */
for(x=0; x<width; x+=BLOCK_SIZE) for(x=0; x<width; x+=BLOCK_SIZE)
blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX); blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX);
...@@ -2702,7 +2706,11 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri ...@@ -2702,7 +2706,11 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
//1% speedup if these are here instead of the inner loop //1% speedup if these are here instead of the inner loop
uint8_t *srcBlock= &(src[y*srcStride]); uint8_t *srcBlock= &(src[y*srcStride]);
uint8_t *dstBlock= &(dst[y*dstStride]); uint8_t *dstBlock= &(dst[y*dstStride]);
#ifdef ARCH_X86
int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride];
int QPDelta= isColor ? 1<<(32-3) : 1<<(32-4);
int QPFrac= QPDelta;
#endif
/* can we mess with a 8x16 block from srcBlock/dstBlock downwards, if not /* can we mess with a 8x16 block from srcBlock/dstBlock downwards, if not
than use a temporary buffer */ than use a temporary buffer */
if(y+15 >= height) if(y+15 >= height)
...@@ -2734,16 +2742,26 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri ...@@ -2734,16 +2742,26 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
for(x=0; x<width; x+=BLOCK_SIZE) for(x=0; x<width; x+=BLOCK_SIZE)
{ {
const int stride= dstStride; const int stride= dstStride;
int QP; #ifdef ARCH_X86
if(isColor) int QP= *QPptr;
{ asm volatile(
QP=QPs[(y>>3)*QPStride + (x>>3)]; "addl %2, %1 \n\t"
} "sbbl %%eax, %%eax \n\t"
else "shll $2, %%eax \n\t"
"subl %%eax, %0 \n\t"
: "+r" (QPptr), "+m" (QPFrac)
: "r" (QPDelta)
: "%eax"
);
#else
int QP= isColor ?
QPs[(y>>3)*QPStride + (x>>3)]:
QPs[(y>>4)*QPStride + (x>>4)];
#endif
if(!isColor)
{ {
QP= QPs[(y>>4)*QPStride + (x>>4)]; QP= (QP* QPCorrecture)>>8;
if(mode & LEVEL_FIX) QP= (QP* (packedYScale &0xFFFF))>>8; yHistogram[ srcBlock[srcStride*4 + 4] ]++;
yHistogram[ srcBlock[srcStride*5] ]++;
} }
#ifdef HAVE_MMX #ifdef HAVE_MMX
asm volatile( asm volatile(
...@@ -2761,10 +2779,38 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri ...@@ -2761,10 +2779,38 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
#endif #endif
#ifdef HAVE_MMX2 #ifdef HAVE_MMX2
/*
prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
*/
/*
prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
*/
asm(
"movl %4, %%eax \n\t"
"shrl $2, %%eax \n\t"
"andl $6, %%eax \n\t"
"addl $5, %%eax \n\t"
"movl %%eax, %%ebx \n\t"
"imul %1, %%eax \n\t"
"imul %3, %%ebx \n\t"
"prefetchnta 32(%%eax, %0) \n\t"
"prefetcht0 32(%%ebx, %2) \n\t"
"addl %1, %%eax \n\t"
"addl %3, %%ebx \n\t"
"prefetchnta 32(%%eax, %0) \n\t"
"prefetcht0 32(%%ebx, %2) \n\t"
:: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
"m" (x)
: "%eax", "%ebx"
);
#elif defined(HAVE_3DNOW) #elif defined(HAVE_3DNOW)
//FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ... //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
/* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
......
...@@ -2603,6 +2603,7 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri ...@@ -2603,6 +2603,7 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
after watching a black picture for 5 hours*/ after watching a black picture for 5 hours*/
static uint64_t *yHistogram= NULL; static uint64_t *yHistogram= NULL;
int black=0, white=255; // blackest black and whitest white in the picture int black=0, white=255; // blackest black and whitest white in the picture
int QPCorrecture= 256;
/* Temporary buffers for handling the last row(s) */ /* Temporary buffers for handling the last row(s) */
static uint8_t *tempDst= NULL; static uint8_t *tempDst= NULL;
...@@ -2693,6 +2694,9 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri ...@@ -2693,6 +2694,9 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
packedYOffset= 0; packedYOffset= 0;
} }
if(mode & LEVEL_FIX) QPCorrecture= packedYScale &0xFFFF;
else QPCorrecture= 256;
/* copy first row of 8x8 blocks */ /* copy first row of 8x8 blocks */
for(x=0; x<width; x+=BLOCK_SIZE) for(x=0; x<width; x+=BLOCK_SIZE)
blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX); blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX);
...@@ -2702,7 +2706,11 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri ...@@ -2702,7 +2706,11 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
//1% speedup if these are here instead of the inner loop //1% speedup if these are here instead of the inner loop
uint8_t *srcBlock= &(src[y*srcStride]); uint8_t *srcBlock= &(src[y*srcStride]);
uint8_t *dstBlock= &(dst[y*dstStride]); uint8_t *dstBlock= &(dst[y*dstStride]);
#ifdef ARCH_X86
int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride];
int QPDelta= isColor ? 1<<(32-3) : 1<<(32-4);
int QPFrac= QPDelta;
#endif
/* can we mess with a 8x16 block from srcBlock/dstBlock downwards, if not /* can we mess with a 8x16 block from srcBlock/dstBlock downwards, if not
than use a temporary buffer */ than use a temporary buffer */
if(y+15 >= height) if(y+15 >= height)
...@@ -2734,16 +2742,26 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri ...@@ -2734,16 +2742,26 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
for(x=0; x<width; x+=BLOCK_SIZE) for(x=0; x<width; x+=BLOCK_SIZE)
{ {
const int stride= dstStride; const int stride= dstStride;
int QP; #ifdef ARCH_X86
if(isColor) int QP= *QPptr;
{ asm volatile(
QP=QPs[(y>>3)*QPStride + (x>>3)]; "addl %2, %1 \n\t"
} "sbbl %%eax, %%eax \n\t"
else "shll $2, %%eax \n\t"
"subl %%eax, %0 \n\t"
: "+r" (QPptr), "+m" (QPFrac)
: "r" (QPDelta)
: "%eax"
);
#else
int QP= isColor ?
QPs[(y>>3)*QPStride + (x>>3)]:
QPs[(y>>4)*QPStride + (x>>4)];
#endif
if(!isColor)
{ {
QP= QPs[(y>>4)*QPStride + (x>>4)]; QP= (QP* QPCorrecture)>>8;
if(mode & LEVEL_FIX) QP= (QP* (packedYScale &0xFFFF))>>8; yHistogram[ srcBlock[srcStride*4 + 4] ]++;
yHistogram[ srcBlock[srcStride*5] ]++;
} }
#ifdef HAVE_MMX #ifdef HAVE_MMX
asm volatile( asm volatile(
...@@ -2761,10 +2779,38 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri ...@@ -2761,10 +2779,38 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
#endif #endif
#ifdef HAVE_MMX2 #ifdef HAVE_MMX2
/*
prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
*/
/*
prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
*/
asm(
"movl %4, %%eax \n\t"
"shrl $2, %%eax \n\t"
"andl $6, %%eax \n\t"
"addl $5, %%eax \n\t"
"movl %%eax, %%ebx \n\t"
"imul %1, %%eax \n\t"
"imul %3, %%ebx \n\t"
"prefetchnta 32(%%eax, %0) \n\t"
"prefetcht0 32(%%ebx, %2) \n\t"
"addl %1, %%eax \n\t"
"addl %3, %%ebx \n\t"
"prefetchnta 32(%%eax, %0) \n\t"
"prefetcht0 32(%%ebx, %2) \n\t"
:: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
"m" (x)
: "%eax", "%ebx"
);
#elif defined(HAVE_3DNOW) #elif defined(HAVE_3DNOW)
//FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ... //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
/* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment