Commit 9007f514 authored by Romain Dolbeau's avatar Romain Dolbeau Committed by Michael Niedermayer

better hadamard8_diff16 in AltiVec, and more patch by (Romain Dolbeau <dolbeau at irisa dot fr>)

Originally committed as revision 3038 to svn://svn.ffmpeg.org/ffmpeg/trunk
parent 2750b827
......@@ -516,6 +516,7 @@ fi
# Add processor-specific flags
TUNECPU="generic"
POWERPCMODE="32bits"
if test $tune != "generic"; then
case $tune in
601|ppc601|PowerPC601)
......@@ -561,11 +562,12 @@ if test $tune != "generic"; then
TUNECPU=ppc7400
;;
G5|g5|970|ppc970|PowerPC970|power4*|Power4*)
CFLAGS="$CFLAGS -mcpu=970 -mtune=970 -mpowerpc64 -force_cpusubtype_ALL "
CFLAGS="$CFLAGS -mcpu=970 -mtune=970 -mpowerpc-gfxopt -mpowerpc64"
if test $altivec = "no"; then
echo "WARNING: tuning for PPC970 but altivec disabled !";
fi
TUNECPU=ppc970
POWERPCMODE="64bits"
;;
*)
echo "WARNING: unknown CPU "$tune", ignored"
......@@ -1027,6 +1029,11 @@ elif test "$cpu" = "sparc64" ; then
elif test "$cpu" = "powerpc" ; then
echo "TARGET_ARCH_POWERPC=yes" >> config.mak
echo "#define ARCH_POWERPC 1" >> $TMPH
if test $POWERPCMODE = "32bits"; then
echo "#define POWERPC_MODE_32BITS 1" >> $TMPH
else
echo "#define POWERPC_MODE_64BITS 1" >> $TMPH
fi
if test "$powerpc_perf" = "yes"; then
echo "#define POWERPC_PERFORMANCE_REPORT 1" >> $TMPH
fi
......
......@@ -17,7 +17,7 @@ The firsts are always available, always active, but they're not very accurate :
The PMC are much more useful : not only they can report cycle-accurate timing, but they can also be used to monitor many other parameters, such as the number of AltiVec stalls for every kind of instructions, or instruction cache misses. The downside is that not all processors support the PMC (all G3, all G4 and the 970 do support them), and they're inactive by default - you need to activate them with a dedicated tool. Also, the number of available PMC depend on the procesor : the various 604 have 2, the various 75x (aka. G3) have 4, anbd the various 74xx (aka G4) have 6.
*WARNING*: The powerpc 970 is not very well documented, and it seems its PMC registers are 64bits wide. The current implementation in FFMpeg assume the register are 32bits wide, and will *not* work on a powerpc 970 (aka G5).
*WARNING*: The powerpc 970 is not very well documented, and its PMC registers are 64bits wide. To properly notify the code, you *must* tune for the 970 (using --tune=970), or the code will assume 32bits registers.
II - Enabling FFmpeg PowerPC performance support
......
......@@ -1306,42 +1306,43 @@ int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t
POWERPC_PERF_DECLARE(altivec_hadamard8_diff8x8_num, 1);
int sum;
POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
{
const_vector unsigned char vzero = (const_vector unsigned char)vec_splat_u8(0);
register const_vector unsigned char vzero = (const_vector unsigned char)vec_splat_u8(0);
register vector signed short temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
#ifdef CONFIG_DARWIN
const_vector signed short vprod1 = (const_vector signed short)( 1,-1, 1,-1, 1,-1, 1,-1);
const_vector signed short vprod2 = (const_vector signed short)( 1, 1,-1,-1, 1, 1,-1,-1);
const_vector signed short vprod3 = (const_vector signed short)( 1, 1, 1, 1,-1,-1,-1,-1);
const_vector unsigned char perm1 = (const_vector unsigned char)
{
register const_vector signed short vprod1 = (const_vector signed short)( 1,-1, 1,-1, 1,-1, 1,-1);
register const_vector signed short vprod2 = (const_vector signed short)( 1, 1,-1,-1, 1, 1,-1,-1);
register const_vector signed short vprod3 = (const_vector signed short)( 1, 1, 1, 1,-1,-1,-1,-1);
register const_vector unsigned char perm1 = (const_vector unsigned char)
(0x02, 0x03, 0x00, 0x01,
0x06, 0x07, 0x04, 0x05,
0x0A, 0x0B, 0x08, 0x09,
0x0E, 0x0F, 0x0C, 0x0D);
const_vector unsigned char perm2 = (const_vector unsigned char)
register const_vector unsigned char perm2 = (const_vector unsigned char)
(0x04, 0x05, 0x06, 0x07,
0x00, 0x01, 0x02, 0x03,
0x0C, 0x0D, 0x0E, 0x0F,
0x08, 0x09, 0x0A, 0x0B);
const_vector unsigned char perm3 = (const_vector unsigned char)
register const_vector unsigned char perm3 = (const_vector unsigned char)
(0x08, 0x09, 0x0A, 0x0B,
0x0C, 0x0D, 0x0E, 0x0F,
0x00, 0x01, 0x02, 0x03,
0x04, 0x05, 0x06, 0x07);
#else
const_vector signed short vprod1 = (const_vector signed short){ 1,-1, 1,-1, 1,-1, 1,-1};
const_vector signed short vprod2 = (const_vector signed short){ 1, 1,-1,-1, 1, 1,-1,-1};
const_vector signed short vprod3 = (const_vector signed short){ 1, 1, 1, 1,-1,-1,-1,-1};
const_vector unsigned char perm1 = (const_vector unsigned char)
register const_vector signed short vprod1 = (const_vector signed short){ 1,-1, 1,-1, 1,-1, 1,-1};
register const_vector signed short vprod2 = (const_vector signed short){ 1, 1,-1,-1, 1, 1,-1,-1};
register const_vector signed short vprod3 = (const_vector signed short){ 1, 1, 1, 1,-1,-1,-1,-1};
register const_vector unsigned char perm1 = (const_vector unsigned char)
{0x02, 0x03, 0x00, 0x01,
0x06, 0x07, 0x04, 0x05,
0x0A, 0x0B, 0x08, 0x09,
0x0E, 0x0F, 0x0C, 0x0D};
const_vector unsigned char perm2 = (const_vector unsigned char)
register const_vector unsigned char perm2 = (const_vector unsigned char)
{0x04, 0x05, 0x06, 0x07,
0x00, 0x01, 0x02, 0x03,
0x0C, 0x0D, 0x0E, 0x0F,
0x08, 0x09, 0x0A, 0x0B};
const_vector unsigned char perm3 = (const_vector unsigned char)
register const_vector unsigned char perm3 = (const_vector unsigned char)
{0x08, 0x09, 0x0A, 0x0B,
0x0C, 0x0D, 0x0E, 0x0F,
0x00, 0x01, 0x02, 0x03,
......@@ -1350,8 +1351,8 @@ POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
#define ONEITERBUTTERFLY(i, res) \
{ \
vector unsigned char src1, src2, srcO; \
vector unsigned char dst1, dst2, dstO; \
register vector unsigned char src1, src2, srcO; \
register vector unsigned char dst1, dst2, dstO; \
src1 = vec_ld(stride * i, src); \
if ((((stride * i) + (unsigned long)src) & 0x0000000F) > 8) \
src2 = vec_ld((stride * i) + 16, src); \
......@@ -1362,20 +1363,19 @@ POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
/* promote the unsigned chars to signed shorts */ \
/* we're in the 8x8 function, we only care for the first 8 */ \
vector signed short srcV = \
register vector signed short srcV = \
(vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \
vector signed short dstV = \
register vector signed short dstV = \
(vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \
/* substractions inside the first butterfly */ \
vector signed short but0 = vec_sub(srcV, dstV); \
vector signed short op1 = vec_perm(but0, but0, perm1); \
vector signed short but1 = vec_mladd(but0, vprod1, op1); \
vector signed short op2 = vec_perm(but1, but1, perm2); \
vector signed short but2 = vec_mladd(but1, vprod2, op2); \
vector signed short op3 = vec_perm(but2, but2, perm3); \
register vector signed short but0 = vec_sub(srcV, dstV); \
register vector signed short op1 = vec_perm(but0, but0, perm1); \
register vector signed short but1 = vec_mladd(but0, vprod1, op1); \
register vector signed short op2 = vec_perm(but1, but1, perm2); \
register vector signed short but2 = vec_mladd(but1, vprod2, op2); \
register vector signed short op3 = vec_perm(but2, but2, perm3); \
res = vec_mladd(but2, vprod3, op3); \
}
vector signed short temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
ONEITERBUTTERFLY(0, temp0);
ONEITERBUTTERFLY(1, temp1);
ONEITERBUTTERFLY(2, temp2);
......@@ -1384,53 +1384,275 @@ POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
ONEITERBUTTERFLY(5, temp5);
ONEITERBUTTERFLY(6, temp6);
ONEITERBUTTERFLY(7, temp7);
}
#undef ONEITERBUTTERFLY
{
vector signed int vsum;
vector signed short line0 = vec_add(temp0, temp1);
vector signed short line1 = vec_sub(temp0, temp1);
vector signed short line2 = vec_add(temp2, temp3);
vector signed short line3 = vec_sub(temp2, temp3);
vector signed short line4 = vec_add(temp4, temp5);
vector signed short line5 = vec_sub(temp4, temp5);
vector signed short line6 = vec_add(temp6, temp7);
vector signed short line7 = vec_sub(temp6, temp7);
{
register vector signed int vsum;
register vector signed short line0 = vec_add(temp0, temp1);
register vector signed short line1 = vec_sub(temp0, temp1);
register vector signed short line2 = vec_add(temp2, temp3);
register vector signed short line3 = vec_sub(temp2, temp3);
register vector signed short line4 = vec_add(temp4, temp5);
register vector signed short line5 = vec_sub(temp4, temp5);
register vector signed short line6 = vec_add(temp6, temp7);
register vector signed short line7 = vec_sub(temp6, temp7);
register vector signed short line0B = vec_add(line0, line2);
register vector signed short line2B = vec_sub(line0, line2);
register vector signed short line1B = vec_add(line1, line3);
register vector signed short line3B = vec_sub(line1, line3);
register vector signed short line4B = vec_add(line4, line6);
register vector signed short line6B = vec_sub(line4, line6);
register vector signed short line5B = vec_add(line5, line7);
register vector signed short line7B = vec_sub(line5, line7);
register vector signed short line0C = vec_add(line0B, line4B);
register vector signed short line4C = vec_sub(line0B, line4B);
register vector signed short line1C = vec_add(line1B, line5B);
register vector signed short line5C = vec_sub(line1B, line5B);
register vector signed short line2C = vec_add(line2B, line6B);
register vector signed short line6C = vec_sub(line2B, line6B);
register vector signed short line3C = vec_add(line3B, line7B);
register vector signed short line7C = vec_sub(line3B, line7B);
vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
vsum = vec_sum4s(vec_abs(line1C), vsum);
vsum = vec_sum4s(vec_abs(line2C), vsum);
vsum = vec_sum4s(vec_abs(line3C), vsum);
vsum = vec_sum4s(vec_abs(line4C), vsum);
vsum = vec_sum4s(vec_abs(line5C), vsum);
vsum = vec_sum4s(vec_abs(line6C), vsum);
vsum = vec_sum4s(vec_abs(line7C), vsum);
vsum = vec_sums(vsum, (vector signed int)vzero);
vsum = vec_splat(vsum, 3);
vec_ste(vsum, 0, &sum);
}
POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1);
return sum;
}
/*
16x8 works with 16 elements ; it allows to avoid replicating
loads, and give the compiler more rooms for scheduling.
It's only used from inside hadamard8_diff16_altivec.
Unfortunately, it seems gcc-3.3 is a bit dumb, and
the compiled code has a LOT of spill code, it seems
gcc (unlike xlc) cannot keep everything in registers
by itself. The following code include hand-made
registers allocation. It's not clean, but on
a 7450 the resulting code is much faster (best case
fall from 700+ cycles to 550).
xlc doesn't add spill code, but it doesn't know how to
schedule for the 7450, and its code isn't much faster than
gcc-3.3 on the 7450 (but uses 25% less instructions...)
On the 970, the hand-made RA is still a win (arount 690
vs. around 780), but xlc goes to around 660 on the
regular C code...
*/
static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h) {
int sum;
register vector signed short
temp0 asm ("v0"),
temp1 asm ("v1"),
temp2 asm ("v2"),
temp3 asm ("v3"),
temp4 asm ("v4"),
temp5 asm ("v5"),
temp6 asm ("v6"),
temp7 asm ("v7");
register vector signed short
temp0S asm ("v8"),
temp1S asm ("v9"),
temp2S asm ("v10"),
temp3S asm ("v11"),
temp4S asm ("v12"),
temp5S asm ("v13"),
temp6S asm ("v14"),
temp7S asm ("v15");
register const_vector unsigned char vzero asm ("v31")= (const_vector unsigned char)vec_splat_u8(0);
{
#ifdef CONFIG_DARWIN
register const_vector signed short vprod1 asm ("v16")= (const_vector signed short)( 1,-1, 1,-1, 1,-1, 1,-1);
register const_vector signed short vprod2 asm ("v17")= (const_vector signed short)( 1, 1,-1,-1, 1, 1,-1,-1);
register const_vector signed short vprod3 asm ("v18")= (const_vector signed short)( 1, 1, 1, 1,-1,-1,-1,-1);
register const_vector unsigned char perm1 asm ("v19")= (const_vector unsigned char)
(0x02, 0x03, 0x00, 0x01,
0x06, 0x07, 0x04, 0x05,
0x0A, 0x0B, 0x08, 0x09,
0x0E, 0x0F, 0x0C, 0x0D);
register const_vector unsigned char perm2 asm ("v20")= (const_vector unsigned char)
(0x04, 0x05, 0x06, 0x07,
0x00, 0x01, 0x02, 0x03,
0x0C, 0x0D, 0x0E, 0x0F,
0x08, 0x09, 0x0A, 0x0B);
register const_vector unsigned char perm3 asm ("v21")= (const_vector unsigned char)
(0x08, 0x09, 0x0A, 0x0B,
0x0C, 0x0D, 0x0E, 0x0F,
0x00, 0x01, 0x02, 0x03,
0x04, 0x05, 0x06, 0x07);
#else
register const_vector signed short vprod1 = (const_vector signed short){ 1,-1, 1,-1, 1,-1, 1,-1};
register const_vector signed short vprod2 = (const_vector signed short){ 1, 1,-1,-1, 1, 1,-1,-1};
register const_vector signed short vprod3 = (const_vector signed short){ 1, 1, 1, 1,-1,-1,-1,-1};
register const_vector unsigned char perm1 = (const_vector unsigned char)
{0x02, 0x03, 0x00, 0x01,
0x06, 0x07, 0x04, 0x05,
0x0A, 0x0B, 0x08, 0x09,
0x0E, 0x0F, 0x0C, 0x0D};
register const_vector unsigned char perm2 = (const_vector unsigned char)
{0x04, 0x05, 0x06, 0x07,
0x00, 0x01, 0x02, 0x03,
0x0C, 0x0D, 0x0E, 0x0F,
0x08, 0x09, 0x0A, 0x0B};
register const_vector unsigned char perm3 = (const_vector unsigned char)
{0x08, 0x09, 0x0A, 0x0B,
0x0C, 0x0D, 0x0E, 0x0F,
0x00, 0x01, 0x02, 0x03,
0x04, 0x05, 0x06, 0x07};
#endif
#define ONEITERBUTTERFLY(i, res1, res2) \
{ \
register vector unsigned char src1 asm ("v22"), src2 asm ("v23"); \
register vector unsigned char dst1 asm ("v24"), dst2 asm ("v25"); \
src1 = vec_ld(stride * i, src); \
src2 = vec_ld((stride * i) + 16, src); \
register vector unsigned char srcO asm ("v22") = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
dst1 = vec_ld(stride * i, dst); \
dst2 = vec_ld((stride * i) + 16, dst); \
register vector unsigned char dstO asm ("v23") = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
/* promote the unsigned chars to signed shorts */ \
register vector signed short srcV asm ("v24") = \
(vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \
register vector signed short dstV asm ("v25") = \
(vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \
register vector signed short srcW asm ("v26") = \
(vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)srcO); \
register vector signed short dstW asm ("v27") = \
(vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)dstO); \
/* substractions inside the first butterfly */ \
register vector signed short but0 asm ("v28") = vec_sub(srcV, dstV); \
register vector signed short but0S asm ("v29") = vec_sub(srcW, dstW); \
register vector signed short op1 asm ("v30") = vec_perm(but0, but0, perm1); \
register vector signed short but1 asm ("v22") = vec_mladd(but0, vprod1, op1); \
register vector signed short op1S asm ("v23") = vec_perm(but0S, but0S, perm1); \
register vector signed short but1S asm ("v24") = vec_mladd(but0S, vprod1, op1S); \
register vector signed short op2 asm ("v25") = vec_perm(but1, but1, perm2); \
register vector signed short but2 asm ("v26") = vec_mladd(but1, vprod2, op2); \
register vector signed short op2S asm ("v27") = vec_perm(but1S, but1S, perm2); \
register vector signed short but2S asm ("v28") = vec_mladd(but1S, vprod2, op2S); \
register vector signed short op3 asm ("v29") = vec_perm(but2, but2, perm3); \
res1 = vec_mladd(but2, vprod3, op3); \
register vector signed short op3S asm ("v30") = vec_perm(but2S, but2S, perm3); \
res2 = vec_mladd(but2S, vprod3, op3S); \
}
ONEITERBUTTERFLY(0, temp0, temp0S);
ONEITERBUTTERFLY(1, temp1, temp1S);
ONEITERBUTTERFLY(2, temp2, temp2S);
ONEITERBUTTERFLY(3, temp3, temp3S);
ONEITERBUTTERFLY(4, temp4, temp4S);
ONEITERBUTTERFLY(5, temp5, temp5S);
ONEITERBUTTERFLY(6, temp6, temp6S);
ONEITERBUTTERFLY(7, temp7, temp7S);
}
#undef ONEITERBUTTERFLY
{
register vector signed int vsum;
register vector signed short line0 = vec_add(temp0, temp1);
register vector signed short line1 = vec_sub(temp0, temp1);
register vector signed short line2 = vec_add(temp2, temp3);
register vector signed short line3 = vec_sub(temp2, temp3);
register vector signed short line4 = vec_add(temp4, temp5);
register vector signed short line5 = vec_sub(temp4, temp5);
register vector signed short line6 = vec_add(temp6, temp7);
register vector signed short line7 = vec_sub(temp6, temp7);
vector signed short line0B = vec_add(line0, line2);
vector signed short line2B = vec_sub(line0, line2);
vector signed short line1B = vec_add(line1, line3);
vector signed short line3B = vec_sub(line1, line3);
vector signed short line4B = vec_add(line4, line6);
vector signed short line6B = vec_sub(line4, line6);
vector signed short line5B = vec_add(line5, line7);
vector signed short line7B = vec_sub(line5, line7);
register vector signed short line0B = vec_add(line0, line2);
register vector signed short line2B = vec_sub(line0, line2);
register vector signed short line1B = vec_add(line1, line3);
register vector signed short line3B = vec_sub(line1, line3);
register vector signed short line4B = vec_add(line4, line6);
register vector signed short line6B = vec_sub(line4, line6);
register vector signed short line5B = vec_add(line5, line7);
register vector signed short line7B = vec_sub(line5, line7);
vector signed short line0C = vec_add(line0B, line4B);
vector signed short line4C = vec_sub(line0B, line4B);
vector signed short line1C = vec_add(line1B, line5B);
vector signed short line5C = vec_sub(line1B, line5B);
vector signed short line2C = vec_add(line2B, line6B);
vector signed short line6C = vec_sub(line2B, line6B);
vector signed short line3C = vec_add(line3B, line7B);
vector signed short line7C = vec_sub(line3B, line7B);
register vector signed short line0C = vec_add(line0B, line4B);
register vector signed short line4C = vec_sub(line0B, line4B);
register vector signed short line1C = vec_add(line1B, line5B);
register vector signed short line5C = vec_sub(line1B, line5B);
register vector signed short line2C = vec_add(line2B, line6B);
register vector signed short line6C = vec_sub(line2B, line6B);
register vector signed short line3C = vec_add(line3B, line7B);
register vector signed short line7C = vec_sub(line3B, line7B);
vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
vsum = vec_sum4s(vec_abs(line1C), vsum);
vsum = vec_sum4s(vec_abs(line2C), vsum);
vsum = vec_sum4s(vec_abs(line3C), vsum);
vsum = vec_sum4s(vec_abs(line4C), vsum);
vsum = vec_sum4s(vec_abs(line5C), vsum);
vsum = vec_sum4s(vec_abs(line6C), vsum);
vsum = vec_sum4s(vec_abs(line7C), vsum);
vsum = vec_sums(vsum, (vector signed int)vzero);
vsum = vec_splat(vsum, 3);
vec_ste(vsum, 0, &sum);
}
vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
vsum = vec_sum4s(vec_abs(line1C), vsum);
vsum = vec_sum4s(vec_abs(line2C), vsum);
vsum = vec_sum4s(vec_abs(line3C), vsum);
vsum = vec_sum4s(vec_abs(line4C), vsum);
vsum = vec_sum4s(vec_abs(line5C), vsum);
vsum = vec_sum4s(vec_abs(line6C), vsum);
vsum = vec_sum4s(vec_abs(line7C), vsum);
register vector signed short line0S = vec_add(temp0S, temp1S);
register vector signed short line1S = vec_sub(temp0S, temp1S);
register vector signed short line2S = vec_add(temp2S, temp3S);
register vector signed short line3S = vec_sub(temp2S, temp3S);
register vector signed short line4S = vec_add(temp4S, temp5S);
register vector signed short line5S = vec_sub(temp4S, temp5S);
register vector signed short line6S = vec_add(temp6S, temp7S);
register vector signed short line7S = vec_sub(temp6S, temp7S);
register vector signed short line0BS = vec_add(line0S, line2S);
register vector signed short line2BS = vec_sub(line0S, line2S);
register vector signed short line1BS = vec_add(line1S, line3S);
register vector signed short line3BS = vec_sub(line1S, line3S);
register vector signed short line4BS = vec_add(line4S, line6S);
register vector signed short line6BS = vec_sub(line4S, line6S);
register vector signed short line5BS = vec_add(line5S, line7S);
register vector signed short line7BS = vec_sub(line5S, line7S);
register vector signed short line0CS = vec_add(line0BS, line4BS);
register vector signed short line4CS = vec_sub(line0BS, line4BS);
register vector signed short line1CS = vec_add(line1BS, line5BS);
register vector signed short line5CS = vec_sub(line1BS, line5BS);
register vector signed short line2CS = vec_add(line2BS, line6BS);
register vector signed short line6CS = vec_sub(line2BS, line6BS);
register vector signed short line3CS = vec_add(line3BS, line7BS);
register vector signed short line7CS = vec_sub(line3BS, line7BS);
vsum = vec_sum4s(vec_abs(line0CS), vsum);
vsum = vec_sum4s(vec_abs(line1CS), vsum);
vsum = vec_sum4s(vec_abs(line2CS), vsum);
vsum = vec_sum4s(vec_abs(line3CS), vsum);
vsum = vec_sum4s(vec_abs(line4CS), vsum);
vsum = vec_sum4s(vec_abs(line5CS), vsum);
vsum = vec_sum4s(vec_abs(line6CS), vsum);
vsum = vec_sum4s(vec_abs(line7CS), vsum);
vsum = vec_sums(vsum, (vector signed int)vzero);
vsum = vec_splat(vsum, 3);
vec_ste(vsum, 0, &sum);
}
POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1);
return sum;
}
int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
POWERPC_PERF_DECLARE(altivec_hadamard8_diff16_num, 1);
int score;
POWERPC_PERF_START_COUNT(altivec_hadamard8_diff16_num, 1);
score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
if (h==16) {
dst += 8*stride;
src += 8*stride;
score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
}
POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff16_num, 1);
return score;
}
int has_altivec(void)
{
#ifdef CONFIG_DARWIN
......
......@@ -47,6 +47,7 @@ extern void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels
extern void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h);
extern void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h);
extern int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h);
extern int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h);
extern void gmc1_altivec(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder);
......
......@@ -61,6 +61,7 @@ static unsigned char* perfname[] = {
"put_pixels16_xy2_altivec",
"put_no_rnd_pixels16_xy2_altivec",
"hadamard8_diff8x8_altivec",
"hadamard8_diff16_altivec",
"clear_blocks_dcbz32_ppc",
"clear_blocks_dcbz128_ppc"
};
......@@ -226,12 +227,6 @@ long check_dcbzl_effect(void)
}
#endif
#ifdef HAVE_ALTIVEC
// can't put that in dsputil_altivec.c,
// has WARPER8_16_SQ declare the function "static" ...
WARPER8_16_SQ(hadamard8_diff8x8_altivec, hadamard8_diff16_altivec)
#endif
void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
{
// Common optimizations whether Altivec is available or not
......
......@@ -51,6 +51,7 @@ enum powerpc_perf_index {
altivec_put_pixels16_xy2_num,
altivec_put_no_rnd_pixels16_xy2_num,
altivec_hadamard8_diff8x8_num,
altivec_hadamard8_diff16_num,
powerpc_clear_blocks_dcbz32,
powerpc_clear_blocks_dcbz128,
powerpc_perf_total
......@@ -64,6 +65,8 @@ enum powerpc_data_index {
};
extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total];
#ifndef POWERPC_MODE_64BITS
#define POWERP_PMC_DATATYPE unsigned long
#define POWERPC_GET_PMC1(a) asm volatile("mfspr %0, 937" : "=r" (a))
#define POWERPC_GET_PMC2(a) asm volatile("mfspr %0, 938" : "=r" (a))
#if (POWERPC_NUM_PMC_ENABLED > 2)
......@@ -80,7 +83,30 @@ extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][
#define POWERPC_GET_PMC5(a) do {} while (0)
#define POWERPC_GET_PMC6(a) do {} while (0)
#endif
#define POWERPC_PERF_DECLARE(a, cond) unsigned long pmc_start[POWERPC_NUM_PMC_ENABLED], pmc_stop[POWERPC_NUM_PMC_ENABLED], pmc_loop_index;
#else /* POWERPC_MODE_64BITS */
#define POWERP_PMC_DATATYPE unsigned long long
#define POWERPC_GET_PMC1(a) asm volatile("mfspr %0, 771" : "=r" (a))
#define POWERPC_GET_PMC2(a) asm volatile("mfspr %0, 772" : "=r" (a))
#if (POWERPC_NUM_PMC_ENABLED > 2)
#define POWERPC_GET_PMC3(a) asm volatile("mfspr %0, 773" : "=r" (a))
#define POWERPC_GET_PMC4(a) asm volatile("mfspr %0, 774" : "=r" (a))
#else
#define POWERPC_GET_PMC3(a) do {} while (0)
#define POWERPC_GET_PMC4(a) do {} while (0)
#endif
#if (POWERPC_NUM_PMC_ENABLED > 4)
#define POWERPC_GET_PMC5(a) asm volatile("mfspr %0, 775" : "=r" (a))
#define POWERPC_GET_PMC6(a) asm volatile("mfspr %0, 776" : "=r" (a))
#else
#define POWERPC_GET_PMC5(a) do {} while (0)
#define POWERPC_GET_PMC6(a) do {} while (0)
#endif
#endif /* POWERPC_MODE_64BITS */
#define POWERPC_PERF_DECLARE(a, cond) \
POWERP_PMC_DATATYPE \
pmc_start[POWERPC_NUM_PMC_ENABLED], \
pmc_stop[POWERPC_NUM_PMC_ENABLED], \
pmc_loop_index;
#define POWERPC_PERF_START_COUNT(a, cond) do { \
POWERPC_GET_PMC6(pmc_start[5]); \
POWERPC_GET_PMC5(pmc_start[4]); \
......@@ -102,9 +128,9 @@ extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][
pmc_loop_index < POWERPC_NUM_PMC_ENABLED; \
pmc_loop_index++) \
{ \
if (pmc_stop[pmc_loop_index] >= pmc_start[pmc_loop_index]) \
{ \
unsigned long diff = \
if (pmc_stop[pmc_loop_index] >= pmc_start[pmc_loop_index]) \
{ \
POWERP_PMC_DATATYPE diff = \
pmc_stop[pmc_loop_index] - pmc_start[pmc_loop_index]; \
if (diff < perfdata[pmc_loop_index][a][powerpc_data_min]) \
perfdata[pmc_loop_index][a][powerpc_data_min] = diff; \
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment