Commit 6264b622 authored by Tucker DiNapoli's avatar Tucker DiNapoli Committed by Michael Niedermayer

postproc: Replaced inline asm for prefetching with prefetch functions

Prefetching functions are defined in postprocess_template using the
RENAME macro so that prefetching is used when available. For x86
targets inline asm is used and the functions are non-empty only for
cpus where prefetching is available. For non x86 targets the gcc bultin
prefetch is used if it is available, otherwise no prefetching is done.
Signed-off-by: 's avatarMichael Niedermayer <michaelni@gmx.at>
parent 748d4816
......@@ -168,37 +168,6 @@ static const char * const replaceTable[]=
NULL //End Marker
};
#if ARCH_X86 && HAVE_INLINE_ASM
static inline void prefetchnta(const void *p)
{
__asm__ volatile( "prefetchnta (%0)\n\t"
: : "r" (p)
);
}
static inline void prefetcht0(const void *p)
{
__asm__ volatile( "prefetcht0 (%0)\n\t"
: : "r" (p)
);
}
static inline void prefetcht1(const void *p)
{
__asm__ volatile( "prefetcht1 (%0)\n\t"
: : "r" (p)
);
}
static inline void prefetcht2(const void *p)
{
__asm__ volatile( "prefetcht2 (%0)\n\t"
: : "r" (p)
);
}
#endif
/* The horizontal functions exist only in C because the MMX
* code is faster with vertical filters and transposing. */
......
......@@ -3242,6 +3242,69 @@ static inline void RENAME(duplicate)(uint8_t src[], int stride)
#endif
}
#if ARCH_X86 && TEMPLATE_PP_MMXEXT
static inline void RENAME(prefetchnta)(const void *p)
{
__asm__ volatile( "prefetchnta (%0)\n\t"
: : "r" (p)
);
}
static inline void RENAME(prefetcht0)(const void *p)
{
__asm__ volatile( "prefetcht0 (%0)\n\t"
: : "r" (p)
);
}
static inline void RENAME(prefetcht1)(const void *p)
{
__asm__ volatile( "prefetcht1 (%0)\n\t"
: : "r" (p)
);
}
static inline void RENAME(prefetcht2)(const void *p)
{
__asm__ volatile( "prefetcht2 (%0)\n\t"
: : "r" (p)
);
}
#elif !ARCH_X86 && AV_GCC_VERSION_AT_LEAST(3,2)
static inline void RENAME(prefetchnta)(const void *p)
{
__builtin_prefetch(p,0,0);
}
static inline void RENAME(prefetcht0)(const void *p)
{
__builtin_prefetch(p,0,1);
}
static inline void RENAME(prefetcht1)(const void *p)
{
__builtin_prefetch(p,0,2);
}
static inline void RENAME(prefetcht2)(const void *p)
{
__builtin_prefetch(p,0,3);
}
#else
static inline void RENAME(prefetchnta)(const void *p)
{
return;
}
static inline void RENAME(prefetcht0)(const void *p)
{
return;
}
static inline void RENAME(prefetcht1)(const void *p)
{
return;
}
static inline void RENAME(prefetcht2)(const void *p)
{
return;
}
#endif
/**
* Filter array of bytes (Y or U or V values)
*/
......@@ -3368,34 +3431,10 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[
// finish 1 block before the next otherwise we might have a problem
// with the L1 Cache of the P4 ... or only a few blocks at a time or something
for(x=0; x<width; x+=BLOCK_SIZE){
#if TEMPLATE_PP_MMXEXT && HAVE_6REGS
/*
prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
*/
__asm__(
"mov %4, %%"REG_a" \n\t"
"shr $2, %%"REG_a" \n\t"
"and $6, %%"REG_a" \n\t"
"add %5, %%"REG_a" \n\t"
"mov %%"REG_a", %%"REG_d" \n\t"
"imul %1, %%"REG_a" \n\t"
"imul %3, %%"REG_d" \n\t"
"prefetchnta 32(%%"REG_a", %0) \n\t"
"prefetcht0 32(%%"REG_d", %2) \n\t"
"add %1, %%"REG_a" \n\t"
"add %3, %%"REG_d" \n\t"
"prefetchnta 32(%%"REG_a", %0) \n\t"
"prefetcht0 32(%%"REG_d", %2) \n\t"
:: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride),
"g" ((x86_reg)x), "g" ((x86_reg)copyAhead)
: "%"REG_a, "%"REG_d
);
#endif
RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead)*srcStride + 32);
RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead+1)*srcStride + 32);
RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead)*dstStride + 32);
RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead+1)*dstStride + 32);
RENAME(blockCopy)(dstBlock + dstStride*8, dstStride,
srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
......@@ -3474,33 +3513,10 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[
uint8_t *dstBlockStart = dstBlock;
const uint8_t *srcBlockStart = srcBlock;
for(; x < endx; x+=BLOCK_SIZE){
#if TEMPLATE_PP_MMXEXT && HAVE_6REGS
/*
prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
*/
__asm__(
"mov %4, %%"REG_a" \n\t"
"shr $2, %%"REG_a" \n\t"
"and $6, %%"REG_a" \n\t"
"add %5, %%"REG_a" \n\t"
"mov %%"REG_a", %%"REG_d" \n\t"
"imul %1, %%"REG_a" \n\t"
"imul %3, %%"REG_d" \n\t"
"prefetchnta 32(%%"REG_a", %0) \n\t"
"prefetcht0 32(%%"REG_d", %2) \n\t"
"add %1, %%"REG_a" \n\t"
"add %3, %%"REG_d" \n\t"
"prefetchnta 32(%%"REG_a", %0) \n\t"
"prefetcht0 32(%%"REG_d", %2) \n\t"
:: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride),
"g" ((x86_reg)x), "g" ((x86_reg)copyAhead)
: "%"REG_a, "%"REG_d
);
#endif
RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead)*srcStride + 32);
RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead+1)*srcStride + 32);
RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead)*dstStride + 32);
RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead+1)*dstStride + 32);
RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment