Commit e3905ce0 authored by Diego Biurrun

cosmetics: Reformat PPC code in libavcodec according to style guidelines.

This includes indentation changes, comment reformatting, consistent brace
placement and some prettyprinting.

Originally committed as revision 14316 to svn://svn.ffmpeg.org/ffmpeg/trunk
parent 41f5c62f
@@ -60,33 +60,33 @@ int mm_support(void)
unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total];
/* list below must match enum in dsputil_ppc.h */
static unsigned char* perfname[] = {
    "ff_fft_calc_altivec",
    "gmc1_altivec",
    "dct_unquantize_h263_altivec",
    "fdct_altivec",
    "idct_add_altivec",
    "idct_put_altivec",
    "put_pixels16_altivec",
    "avg_pixels16_altivec",
    "avg_pixels8_altivec",
    "put_pixels8_xy2_altivec",
    "put_no_rnd_pixels8_xy2_altivec",
    "put_pixels16_xy2_altivec",
    "put_no_rnd_pixels16_xy2_altivec",
    "hadamard8_diff8x8_altivec",
    "hadamard8_diff16_altivec",
    "avg_pixels8_xy2_altivec",
    "clear_blocks_dcbz32_ppc",
    "clear_blocks_dcbz128_ppc",
    "put_h264_chroma_mc8_altivec",
    "avg_h264_chroma_mc8_altivec",
    "put_h264_qpel16_h_lowpass_altivec",
    "avg_h264_qpel16_h_lowpass_altivec",
    "put_h264_qpel16_v_lowpass_altivec",
    "avg_h264_qpel16_v_lowpass_altivec",
    "put_h264_qpel16_hv_lowpass_altivec",
    "avg_h264_qpel16_hv_lowpass_altivec",
    ""
};
#include <stdio.h>
#endif
@@ -94,51 +94,44 @@ static unsigned char* perfname[] = {
#ifdef CONFIG_POWERPC_PERF
void powerpc_display_perf_report(void)
{
    int i, j;
    av_log(NULL, AV_LOG_INFO, "PowerPC performance report\n Values are from the PMC registers, and represent whatever the registers are set to record.\n");
    for(i = 0 ; i < powerpc_perf_total ; i++) {
        for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++) {
            if (perfdata[j][i][powerpc_data_num] != (unsigned long long)0)
                av_log(NULL, AV_LOG_INFO,
                       " Function \"%s\" (pmc%d):\n\tmin: %"PRIu64"\n\tmax: %"PRIu64"\n\tavg: %1.2lf (%"PRIu64")\n",
                       perfname[i],
                       j+1,
                       perfdata[j][i][powerpc_data_min],
                       perfdata[j][i][powerpc_data_max],
                       (double)perfdata[j][i][powerpc_data_sum] /
                       (double)perfdata[j][i][powerpc_data_num],
                       perfdata[j][i][powerpc_data_num]);
        }
    }
}
#endif /* CONFIG_POWERPC_PERF */
/* ***** WARNING ***** WARNING ***** WARNING ***** */
/*
clear_blocks_dcbz32_ppc will not work properly on PowerPC processors with a
cache line size not equal to 32 bytes. Fortunately, all processors used by
Apple up to at least the 7450 (aka second-generation G4) use a 32-byte cache
line.
This is due to the use of the 'dcbz' instruction. It simply clears a single
cache line to zero, so you need to know the cache line size to use it!
It's absurd, but it's fast...

update 24/06/2003: Apple released the G5 (PPC970) yesterday. Cache line size:
128 bytes. Oops.
The semantics of dcbz were changed; it always clears 32 bytes. So the function
below will work, but will be slow. So I fixed check_dcbzl_effect to use dcbzl,
which is defined to clear one full cache line (as dcbz did before). That way we
can still distinguish the two cases and use dcbz (32 bytes) or dcbzl (one cache
line) as required.

see <http://developer.apple.com/technotes/tn/tn2087.html>
and <http://developer.apple.com/technotes/tn/tn2086.html>
*/
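The practical consequence of the note above is that the cache line size detected at run time decides which clearing routine gets used. The dispatch itself is not part of this excerpt; a minimal sketch of the idea, assuming check_dcbzl_effect() returns the number of bytes zeroed by one dcbzl and that the DSPContext has a clear_blocks function pointer:

    long linesize = check_dcbzl_effect();
    if (linesize == 32)
        c->clear_blocks = clear_blocks_dcbz32_ppc;   /* dcbz clears 32 bytes per iteration */
    else if (linesize == 128)
        c->clear_blocks = clear_blocks_dcbz128_ppc;  /* dcbzl clears a full 128-byte line */
    /* any other result: keep the default memset-based implementation */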
void clear_blocks_dcbz32_ppc(DCTELEM *blocks)
{
@@ -148,21 +141,21 @@ POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz32, 1);
POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz32, 1);
#if 1
    if (misal) {
        ((unsigned long*)blocks)[0] = 0L;
        ((unsigned long*)blocks)[1] = 0L;
        ((unsigned long*)blocks)[2] = 0L;
        ((unsigned long*)blocks)[3] = 0L;
        i += 16;
    }
    for ( ; i < sizeof(DCTELEM)*6*64-31 ; i += 32) {
        asm volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory");
    }
    if (misal) {
        ((unsigned long*)blocks)[188] = 0L;
        ((unsigned long*)blocks)[189] = 0L;
        ((unsigned long*)blocks)[190] = 0L;
        ((unsigned long*)blocks)[191] = 0L;
        i += 16;
    }
#else
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
@@ -180,16 +173,16 @@ POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz128, 1);
    register int i = 0;
POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz128, 1);
#if 1
    if (misal) {
        // we could probably also optimize this case,
        // but there's not much point as the machines
        // aren't available yet (2003-06-26)
        memset(blocks, 0, sizeof(DCTELEM)*6*64);
    }
    else
        for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) {
            asm volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory");
        }
#else
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
#endif
@@ -198,7 +191,7 @@ POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz128, 1);
#else
void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
#endif
@@ -210,34 +203,32 @@ void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
 knows about dcbzl ... */
long check_dcbzl_effect(void)
{
    register char *fakedata = av_malloc(1024);
    register char *fakedata_middle;
    register long zero = 0;
    register long i = 0;
    long count = 0;

    if (!fakedata) {
        return 0L;
    }

    fakedata_middle = (fakedata + 512);

    memset(fakedata, 0xFF, 1024);

    /* below the constraint "b" seems to mean "Address base register"
       in gcc-3.3 / RS/6000 speaks. seems to avoid using r0, so.... */
    asm volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero));

    for (i = 0; i < 1024 ; i ++) {
        if (fakedata[i] == (char)0)
            count++;
    }

    av_free(fakedata);

    return count;
}
#else
long check_dcbzl_effect(void)
@@ -286,36 +277,31 @@ void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
#ifdef CONFIG_ENCODERS
        if (avctx->dct_algo == FF_DCT_AUTO ||
            avctx->dct_algo == FF_DCT_ALTIVEC) {
            c->fdct = fdct_altivec;
        }
#endif //CONFIG_ENCODERS

        if (avctx->lowres==0) {
            if ((avctx->idct_algo == FF_IDCT_AUTO) ||
                (avctx->idct_algo == FF_IDCT_ALTIVEC)) {
                c->idct_put = idct_put_altivec;
                c->idct_add = idct_add_altivec;
                c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
            }
        }

#ifdef CONFIG_POWERPC_PERF
        {
            int i, j;
            for (i = 0 ; i < powerpc_perf_total ; i++) {
                for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++) {
                    perfdata[j][i][powerpc_data_min] = 0xFFFFFFFFFFFFFFFFULL;
                    perfdata[j][i][powerpc_data_max] = 0x0000000000000000ULL;
                    perfdata[j][i][powerpc_data_sum] = 0x0000000000000000ULL;
                    perfdata[j][i][powerpc_data_num] = 0x0000000000000000ULL;
                }
            }
        }
#endif /* CONFIG_POWERPC_PERF */
    }
@@ -31,40 +31,40 @@ void powerpc_display_perf_report(void);
/* if you add to the enum below, also add to the perfname array
   in dsputil_ppc.c */
enum powerpc_perf_index {
    altivec_fft_num = 0,
    altivec_gmc1_num,
    altivec_dct_unquantize_h263_num,
    altivec_fdct,
    altivec_idct_add_num,
    altivec_idct_put_num,
    altivec_put_pixels16_num,
    altivec_avg_pixels16_num,
    altivec_avg_pixels8_num,
    altivec_put_pixels8_xy2_num,
    altivec_put_no_rnd_pixels8_xy2_num,
    altivec_put_pixels16_xy2_num,
    altivec_put_no_rnd_pixels16_xy2_num,
    altivec_hadamard8_diff8x8_num,
    altivec_hadamard8_diff16_num,
    altivec_avg_pixels8_xy2_num,
    powerpc_clear_blocks_dcbz32,
    powerpc_clear_blocks_dcbz128,
    altivec_put_h264_chroma_mc8_num,
    altivec_avg_h264_chroma_mc8_num,
    altivec_put_h264_qpel16_h_lowpass_num,
    altivec_avg_h264_qpel16_h_lowpass_num,
    altivec_put_h264_qpel16_v_lowpass_num,
    altivec_avg_h264_qpel16_v_lowpass_num,
    altivec_put_h264_qpel16_hv_lowpass_num,
    altivec_avg_h264_qpel16_hv_lowpass_num,
    powerpc_perf_total
};
enum powerpc_data_index {
    powerpc_data_min = 0,
    powerpc_data_max,
    powerpc_data_sum,
    powerpc_data_num,
    powerpc_data_total
};
extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total];
@@ -105,45 +105,42 @@ extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][
#define POWERPC_GET_PMC6(a) do {} while (0)
#endif
#endif /* HAVE_PPC64 */
#define POWERPC_PERF_DECLARE(a, cond)       \
    POWERP_PMC_DATATYPE                     \
        pmc_start[POWERPC_NUM_PMC_ENABLED], \
        pmc_stop[POWERPC_NUM_PMC_ENABLED],  \
        pmc_loop_index;
#define POWERPC_PERF_START_COUNT(a, cond) do { \
    POWERPC_GET_PMC6(pmc_start[5]);            \
    POWERPC_GET_PMC5(pmc_start[4]);            \
    POWERPC_GET_PMC4(pmc_start[3]);            \
    POWERPC_GET_PMC3(pmc_start[2]);            \
    POWERPC_GET_PMC2(pmc_start[1]);            \
    POWERPC_GET_PMC1(pmc_start[0]);            \
} while (0)
#define POWERPC_PERF_STOP_COUNT(a, cond) do {                             \
    POWERPC_GET_PMC1(pmc_stop[0]);                                        \
    POWERPC_GET_PMC2(pmc_stop[1]);                                        \
    POWERPC_GET_PMC3(pmc_stop[2]);                                        \
    POWERPC_GET_PMC4(pmc_stop[3]);                                        \
    POWERPC_GET_PMC5(pmc_stop[4]);                                        \
    POWERPC_GET_PMC6(pmc_stop[5]);                                        \
    if (cond) {                                                           \
        for(pmc_loop_index = 0;                                           \
            pmc_loop_index < POWERPC_NUM_PMC_ENABLED;                     \
            pmc_loop_index++) {                                           \
            if (pmc_stop[pmc_loop_index] >= pmc_start[pmc_loop_index]) {  \
                POWERP_PMC_DATATYPE diff =                                \
                    pmc_stop[pmc_loop_index] - pmc_start[pmc_loop_index]; \
                if (diff < perfdata[pmc_loop_index][a][powerpc_data_min]) \
                    perfdata[pmc_loop_index][a][powerpc_data_min] = diff; \
                if (diff > perfdata[pmc_loop_index][a][powerpc_data_max]) \
                    perfdata[pmc_loop_index][a][powerpc_data_max] = diff; \
                perfdata[pmc_loop_index][a][powerpc_data_sum] += diff;    \
                perfdata[pmc_loop_index][a][powerpc_data_num] ++;         \
            }                                                             \
        }                                                                 \
    }                                                                     \
} while (0)
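These macros are used in pairs around the code being measured; the pattern, as it appears in the AltiVec functions later in this commit, is roughly:

    POWERPC_PERF_DECLARE(altivec_fft_num, 1);      /* declares pmc_start/pmc_stop/pmc_loop_index */
    POWERPC_PERF_START_COUNT(altivec_fft_num, 1);  /* samples the performance monitor counters */
    /* ... code under measurement ... */
    POWERPC_PERF_STOP_COUNT(altivec_fft_num, 1);   /* samples again and updates min/max/sum/num */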
#else /* CONFIG_POWERPC_PERF */
// those are needed to avoid empty statements.
@@ -33,21 +33,21 @@
/* butterfly op */
#define BF(pre, pim, qre, qim, pre1, pim1, qre1, qim1) \
{\
    FFTSample ax, ay, bx, by;\
    bx=pre1;\
    by=pim1;\
    ax=qre1;\
    ay=qim1;\
    pre = (bx + ax);\
    pim = (by + ay);\
    qre = (bx - ax);\
    qim = (by - ay);\
}
#define MUL16(a,b) ((a) * (b))
#define CMUL(pre, pim, are, aim, bre, bim) \
{\
    pre = (MUL16(are, bre) - MUL16(aim, bim));\
    pim = (MUL16(are, bim) + MUL16(bre, aim));\
}
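For reference, CMUL is an ordinary complex multiplication and BF is the radix-2 FFT butterfly; a scalar sketch of what the two macros compute (an illustration, not code from this file):

    /* CMUL: p = a * b for complex a = are + i*aim and b = bre + i*bim */
    pre = are * bre - aim * bim;
    pim = are * bim + aim * bre;

    /* BF: butterfly of p1 = (pre1, pim1) and q1 = (qre1, qim1) */
    pre = pre1 + qre1;  pim = pim1 + qim1;   /* p = p1 + q1 */
    qre = pre1 - qre1;  qim = pim1 - qim1;   /* q = p1 - q1 */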
@@ -85,14 +85,11 @@ POWERPC_PERF_START_COUNT(altivec_fft_num, s->nbits >= 6);
    c1 = vcii(p,p,n,n);

    if (s->inverse) {
        c2 = vcii(p,p,n,p);
    } else {
        c2 = vcii(p,p,p,n);
    }

    j = (np >> 2);
    do {
@@ -36,16 +36,16 @@ void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int str
{
POWERPC_PERF_DECLARE(altivec_gmc1_num, GMC1_PERF_COND);
    const DECLARE_ALIGNED_16(unsigned short, rounder_a[8]) =
        {rounder, rounder, rounder, rounder,
         rounder, rounder, rounder, rounder};
    const DECLARE_ALIGNED_16(unsigned short, ABCD[8]) =
        {
            (16-x16)*(16-y16), /* A */
            (   x16)*(16-y16), /* B */
            (16-x16)*(   y16), /* C */
            (   x16)*(   y16), /* D */
            0, 0, 0, 0         /* padding */
        };
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcsr8 = (const vector unsigned short)vec_splat_u16(8);
    register vector unsigned char dstv, dstv2, src_0, src_1, srcvA, srcvB, srcvC, srcvD;
@@ -74,73 +74,67 @@ POWERPC_PERF_START_COUNT(altivec_gmc1_num, GMC1_PERF_COND);
    src_1 = vec_ld(16, src);
    srcvA = vec_perm(src_0, src_1, vec_lvsl(0, src));

    if (src_really_odd != 0x0000000F) {
        // if src & 0xF == 0xF, then (src+1) is properly aligned
        // on the second vector.
        srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src));
    } else {
        srcvB = src_1;
    }
    srcvA = vec_mergeh(vczero, srcvA);
    srcvB = vec_mergeh(vczero, srcvB);

    for(i=0; i<h; i++) {
        dst_odd = (unsigned long)dst & 0x0000000F;
        src_really_odd = (((unsigned long)src) + stride) & 0x0000000F;

        dstv = vec_ld(0, dst);

        // we'll be able to pick up our 9 char elements
        // at src + stride from those 32 bytes
        // then reuse the resulting 2 vectors srcvC and srcvD
        // as the next srcvA and srcvB
        src_0 = vec_ld(stride + 0, src);
        src_1 = vec_ld(stride + 16, src);
        srcvC = vec_perm(src_0, src_1, vec_lvsl(stride + 0, src));

        if (src_really_odd != 0x0000000F) {
            // if src & 0xF == 0xF, then (src+1) is properly aligned
            // on the second vector.
            srcvD = vec_perm(src_0, src_1, vec_lvsl(stride + 1, src));
        } else {
            srcvD = src_1;
        }

        srcvC = vec_mergeh(vczero, srcvC);
        srcvD = vec_mergeh(vczero, srcvD);

        // OK, now we (finally) do the math :-)
        // those four instructions replace 32 int muls & 32 int adds.
        // isn't AltiVec nice ?
        tempA = vec_mladd((vector unsigned short)srcvA, Av, rounderV);
        tempB = vec_mladd((vector unsigned short)srcvB, Bv, tempA);
        tempC = vec_mladd((vector unsigned short)srcvC, Cv, tempB);
        tempD = vec_mladd((vector unsigned short)srcvD, Dv, tempC);

        srcvA = srcvC;
        srcvB = srcvD;

        tempD = vec_sr(tempD, vcsr8);

        dstv2 = vec_pack(tempD, (vector unsigned short)vczero);

        if (dst_odd) {
            dstv2 = vec_perm(dstv, dstv2, vcprm(0,1,s0,s1));
        } else {
            dstv2 = vec_perm(dstv, dstv2, vcprm(s0,s1,2,3));
        }

        vec_st(dstv2, 0, dst);

        dst += stride;
        src += stride;
    }

POWERPC_PERF_STOP_COUNT(altivec_gmc1_num, GMC1_PERF_COND);
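The four vec_mladd calls in the loop above are the vector form of the usual gmc1 bilinear blend. Per destination pixel, the scalar arithmetic is (a sketch for illustration, using the A..D weights defined at the top of the function; they sum to 256, hence the final shift by 8 via vcsr8):

    /* A = (16-x16)*(16-y16), B = x16*(16-y16), C = (16-x16)*y16, D = x16*y16 */
    dst[x] = (A * src[x]          + B * src[x + 1] +
              C * src[x + stride] + D * src[x + stride + 1] +
              rounder) >> 8;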
@@ -196,7 +196,7 @@ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride
    const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7);
    LOAD_ZERO;
    const vec_s16_t v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
    const vec_u16_t v6us  = vec_splat_u16(6);
    register int loadSecond     = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
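A small reading aid for the constant above: vec_splat_s16 can only materialize immediates in the range -16..15, so 28 has to be assembled from splattable values:

    v28ss = (1 << 5) - 4 = 32 - 4 = 28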
@@ -392,8 +392,8 @@ static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
#define avg_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) avg_pixels16_l2(d,s1,s2,ds,s1s,16,h)
*/

H264_MC(put_, 16, altivec)
H264_MC(avg_, 16, altivec)

/****************************************************************************
@@ -685,9 +685,9 @@ static inline void write16x4(uint8_t *dst, int dst_stride,
    r15 = vec_mergel(r3, r7);  /*3,7,11,15 set 1*/              \
                                                                \
    /*Third merge*/                                             \
    r0 = vec_mergeh(r8,  r12); /*0,2,4,6,8,10,12,14 set 0*/     \
    r1 = vec_mergel(r8,  r12); /*0,2,4,6,8,10,12,14 set 1*/     \
    r2 = vec_mergeh(r9,  r13); /*0,2,4,6,8,10,12,14 set 2*/     \
    r4 = vec_mergeh(r10, r14); /*1,3,5,7,9,11,13,15 set 0*/     \
    r5 = vec_mergel(r10, r14); /*1,3,5,7,9,11,13,15 set 1*/     \
    r6 = vec_mergeh(r11, r15); /*1,3,5,7,9,11,13,15 set 2*/     \
@@ -206,489 +206,489 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
/* this code assumes stride % 16 == 0 */
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
    register int i;

    LOAD_ZERO;
    const vec_u8_t permM2 = vec_lvsl(-2, src);
    const vec_u8_t permM1 = vec_lvsl(-1, src);
    const vec_u8_t permP0 = vec_lvsl(+0, src);
    const vec_u8_t permP1 = vec_lvsl(+1, src);
    const vec_u8_t permP2 = vec_lvsl(+2, src);
    const vec_u8_t permP3 = vec_lvsl(+3, src);
    const vec_s16_t v5ss = vec_splat_s16(5);
    const vec_u16_t v5us = vec_splat_u16(5);
    const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,
              srcP2A, srcP2B, srcP3A, srcP3B,
              srcM1A, srcM1B, srcM2A, srcM2B,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
              pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
              psumA, psumB, sumA, sumB;

    vec_u8_t sum, vdst, fsum;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);

    for (i = 0 ; i < 16 ; i ++) {
        vec_u8_t srcR1 = vec_ld(-2, src);
        vec_u8_t srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8_t srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8_t srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8_t srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8_t srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
}
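What the loop body above computes, per output pixel, is the standard H.264 six-tap half-pel luma filter. In scalar form (an illustration, not code from this file; clip_0_255 is shorthand for the 0..255 saturation that vec_packsu performs):

    out[x] = clip_0_255((src[x-2] - 5*src[x-1] + 20*src[x] +
                         20*src[x+1] - 5*src[x+2] + src[x+3] + 16) >> 5);

For the avg_ variants, OP_U8_ALTIVEC additionally averages this result with the pixels already in dst.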
/* this code assumes stride % 16 == 0 */
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);

    register int i;

    LOAD_ZERO;
    const vec_u8_t perm = vec_lvsl(0, src);
    const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16_t v5us = vec_splat_u16(5);
    const vec_s16_t v5ss = vec_splat_s16(5);
    const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8_t srcM2a = vec_ld(0, srcbis);
    const vec_u8_t srcM2b = vec_ld(16, srcbis);
    const vec_u8_t srcM2 = vec_perm(srcM2a, srcM2b, perm);
    //srcbis += srcStride;
    const vec_u8_t srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8_t srcM1b = vec_ld(16, srcbis);
    const vec_u8_t srcM1 = vec_perm(srcM1a, srcM1b, perm);
    //srcbis += srcStride;
    const vec_u8_t srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8_t srcP0b = vec_ld(16, srcbis);
    const vec_u8_t srcP0 = vec_perm(srcP0a, srcP0b, perm);
    //srcbis += srcStride;
    const vec_u8_t srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8_t srcP1b = vec_ld(16, srcbis);
    const vec_u8_t srcP1 = vec_perm(srcP1a, srcP1b, perm);
    //srcbis += srcStride;
    const vec_u8_t srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8_t srcP2b = vec_ld(16, srcbis);
    const vec_u8_t srcP2 = vec_perm(srcP2a, srcP2b, perm);
    //srcbis += srcStride;

    vec_s16_t srcM2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
    vec_s16_t srcM2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
    vec_s16_t srcM1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
    vec_s16_t srcM1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
    vec_s16_t srcP0ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
    vec_s16_t srcP0ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
    vec_s16_t srcP1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
    vec_s16_t srcP1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
    vec_s16_t srcP2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
    vec_s16_t srcP2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP2);

    vec_s16_t pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
              psumA, psumB, sumA, sumB,
              srcP3ssA, srcP3ssB,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8_t sum, vdst, fsum, srcP3a, srcP3b, srcP3;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);

    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
        //srcbis += srcStride;

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }

    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
}
/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
    register int i;
    LOAD_ZERO;
    const vec_u8_t permM2 = vec_lvsl(-2, src);
    const vec_u8_t permM1 = vec_lvsl(-1, src);
    const vec_u8_t permP0 = vec_lvsl(+0, src);
    const vec_u8_t permP1 = vec_lvsl(+1, src);
    const vec_u8_t permP2 = vec_lvsl(+2, src);
    const vec_u8_t permP3 = vec_lvsl(+3, src);
    const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32_t v10ui = vec_splat_u32(10);
    const vec_s16_t v5ss = vec_splat_s16(5);
    const vec_s16_t v1ss = vec_splat_s16(1);
    const vec_s32_t v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32_t v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,
              srcP2A, srcP2B, srcP3A, srcP3B,
              srcM1A, srcM1B, srcM2A, srcM2B,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
              pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8_t mperm = (const vec_u8_t)
        AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
            0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F);
    int16_t *tmpbis = tmp;

    vec_s16_t tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
              tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
              tmpP2ssA, tmpP2ssB;

    vec_s32_t pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
              pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
              pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
              ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8_t fsum, sumv, sum, vdst;
    vec_s16_t ssume, ssumo;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8_t srcR1 = vec_ld(-2, src);
        vec_u8_t srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8_t srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8_t srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8_t srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8_t srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }

    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    for (i = 0 ; i < 16 ; i++) {
        const vec_s16_t tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16_t tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16_t sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16_t sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16_t sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16_t sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16_t sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16_t sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ae = vec_sra((vec_s32_t)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32_t)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
}
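The second loop above applies the same six-tap filter vertically to the 16-bit intermediates that the first pass stored in tmp, with the rounding of both passes folded into a single add of 512 and an arithmetic shift by 10. Roughly, per output pixel (illustration only, with S standing for tmpStride):

    out[x] = clip_0_255((tmp[x-2*S] - 5*tmp[x-S] + 20*tmp[x] +
                         20*tmp[x+S] - 5*tmp[x+2*S] + tmp[x+3*S] + 512) >> 10);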
@@ -22,7 +22,6 @@
 * NOTE: This code is based on GPL code from the libmpeg2 project. The
 * author, Michel Lespinasses, has given explicit permission to release
 * under LGPL as part of ffmpeg.
 */

/*
@@ -46,8 +46,7 @@ void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src,
    vector signed short zeros, sumhv, sumlv;

    s = src;
    for(i=0;i<4;i++) {
        /*
           The vec_madds later on does an implicit >>15 on the result.
           Since FILTER_BITS is 8, and we have 15 bits of magnitude in
...@@ -86,13 +85,11 @@ void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src, ...@@ -86,13 +85,11 @@ void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src,
/* Do our altivec resampling on 16 pixels at once. */ /* Do our altivec resampling on 16 pixels at once. */
while(dst_width>=16) { while(dst_width>=16) {
/* /* Read 16 (potentially unaligned) bytes from each of
Read 16 (potentially unaligned) bytes from each of
4 lines into 4 vectors, and split them into shorts. 4 lines into 4 vectors, and split them into shorts.
Interleave the multipy/accumulate for the resample Interleave the multipy/accumulate for the resample
filter with the loads to hide the 3 cycle latency filter with the loads to hide the 3 cycle latency
the vec_madds have. the vec_madds have. */
*/
tv = (vector unsigned char *) &s[0 * wrap]; tv = (vector unsigned char *) &s[0 * wrap];
tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[i * wrap])); tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[i * wrap]));
srchv[0].v = (vector signed short) vec_mergeh(zero, tmp); srchv[0].v = (vector signed short) vec_mergeh(zero, tmp);
...@@ -121,10 +118,8 @@ void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src, ...@@ -121,10 +118,8 @@ void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src,
sumhv = vec_madds(srchv[3].v, fv[3].v, sumhv); sumhv = vec_madds(srchv[3].v, fv[3].v, sumhv);
sumlv = vec_madds(srclv[3].v, fv[3].v, sumlv); sumlv = vec_madds(srclv[3].v, fv[3].v, sumlv);
/* /* Pack the results into our destination vector,
Pack the results into our destination vector, and do an aligned write of that back to memory. */
and do an aligned write of that back to memory.
*/
dstv = vec_packsu(sumhv, sumlv) ; dstv = vec_packsu(sumhv, sumlv) ;
vec_st(dstv, 0, (vector unsigned char *) dst); vec_st(dstv, 0, (vector unsigned char *) dst);
...@@ -133,10 +128,8 @@ void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src, ...@@ -133,10 +128,8 @@ void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src,
dst_width-=16; dst_width-=16;
} }
/* /* If there are any leftover pixels, resample them
If there are any leftover pixels, resample them with the slow scalar method. */
with the slow scalar method.
*/
while(dst_width>0) { while(dst_width>0) {
sum = s[0 * wrap] * filter[0] + sum = s[0 * wrap] * filter[0] +
s[1 * wrap] * filter[1] + s[1 * wrap] * filter[1] +
......
@@ -38,7 +38,7 @@ static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
    vector signed short vpix2, vdiff, vpix1l,vpix1h;
    union { vector signed int vscore;
            int32_t score[4];
          } u;
    u.vscore = vec_splat_s32(0);
//
//XXX lazy way, fix it later
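For context, ssd_int8_vs_int16_altivec accumulates four 32-bit partial sums in u.vscore and reads them back through u.score[]; the quantity it computes is the sum of squared differences between the int8_t and int16_t blocks. A scalar reference (editor's sketch, not part of the commit; the element-count parameter is assumed to be called size):

#include <stdint.h>

static int ssd_int8_vs_int16_scalar(const int8_t *pix1, const int16_t *pix2,
                                    int size)
{
    int score = 0, i;
    for (i = 0; i < size; i++) {
        int d = pix1[i] - pix2[i];   /* widen to int before squaring */
        score += d * d;              /* sum of squared differences */
    }
    return score;
}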
...
@@ -25,14 +25,14 @@
#if defined(ARCH_POWERPC_405)
/* signed 16x16 -> 32 multiply add accumulate */
#define MAC16(rt, ra, rb) \
    asm ("maclhw %0, %2, %3" : "=r" (rt) : "0" (rt), "r" (ra), "r" (rb));

/* signed 16x16 -> 32 multiply */
#define MUL16(ra, rb) \
    ({ int __rt; \
       asm ("mullhw %0, %1, %2" : "=r" (__rt) : "r" (ra), "r" (rb)); \
       __rt; })
#endif

#endif /* FFMPEG_PPC_MATHOPS_H */
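The PPC405-only macros above wrap the maclhw/mullhw instructions, which multiply the low signed 16-bit halves of their operands. A plain-C sketch of their intended semantics (editor's illustration, not part of the commit):

#include <stdint.h>

static inline int MAC16_c(int rt, int ra, int rb)
{
    return rt + (int16_t)ra * (int16_t)rb;   /* 16x16 -> 32 multiply-accumulate */
}

static inline int MUL16_c(int ra, int rb)
{
    return (int16_t)ra * (int16_t)rb;        /* 16x16 -> 32 multiply */
}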
@@ -41,15 +41,15 @@ do { \
// transposes a matrix consisting of four vectors with four elements each
#define TRANSPOSE4(a,b,c,d) \
do { \
    __typeof__(a) _trans_ach = vec_mergeh(a, c); \
    __typeof__(a) _trans_acl = vec_mergel(a, c); \
    __typeof__(a) _trans_bdh = vec_mergeh(b, d); \
    __typeof__(a) _trans_bdl = vec_mergel(b, d); \
                                                 \
    a = vec_mergeh(_trans_ach, _trans_bdh);      \
    b = vec_mergel(_trans_ach, _trans_bdh);      \
    c = vec_mergeh(_trans_acl, _trans_bdl);      \
    d = vec_mergel(_trans_acl, _trans_bdl);      \
} while (0)
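Tracing TRANSPOSE4 element by element shows why two rounds of merges give a 4x4 transpose (editor's note, derived from the macro above, with rows a = {a0,a1,a2,a3} through d = {d0,d1,d2,d3}):

/* _trans_ach = vec_mergeh(a, c) = {a0, c0, a1, c1}
 * _trans_acl = vec_mergel(a, c) = {a2, c2, a3, c3}
 * _trans_bdh = vec_mergeh(b, d) = {b0, d0, b1, d1}
 * _trans_bdl = vec_mergel(b, d) = {b2, d2, b3, d3}
 *
 * a = vec_mergeh(_trans_ach, _trans_bdh) = {a0, b0, c0, d0}   // first column
 * b = vec_mergel(_trans_ach, _trans_bdh) = {a1, b1, c1, d1}
 * c = vec_mergeh(_trans_acl, _trans_bdl) = {a2, b2, c2, d2}
 * d = vec_mergel(_trans_acl, _trans_bdl) = {a3, b3, c3, d3}   // fourth column */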
@@ -58,19 +58,19 @@ do { \
// target address is four-byte aligned (which should be always).
#define LOAD4(vec, address) \
{ \
    __typeof__(vec)* _load_addr = (__typeof__(vec)*)(address); \
    vector unsigned char _perm_vec = vec_lvsl(0,(address));    \
    vec = vec_ld(0, _load_addr);                               \
    vec = vec_perm(vec, vec, _perm_vec);                       \
    vec = vec_splat(vec, 0);                                   \
}
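LOAD4 fetches the 16-byte quadword containing a 4-byte-aligned scalar, rotates it so the requested element lands in lane 0 (vec_lvsl + vec_perm), and splats that lane across the whole vector. A usage sketch (editor's illustration with invented variable names, not taken from the diff):

int bias = 42;                 /* any 4-byte-aligned int */
vector signed int vbias;
LOAD4(vbias, &bias);           /* vbias == {42, 42, 42, 42} */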
#define FOUROF(a) AVV(a,a,a,a)

int dct_quantize_altivec(MpegEncContext* s,
                         DCTELEM* data, int n,
                         int qscale, int* overflow)
{
    int lastNonZero;
    vector float row0, row1, row2, row3, row4, row5, row6, row7;
@@ -137,10 +137,8 @@ int dct_quantize_altivec(MpegEncContext* s,
    int whichPass, whichHalf;

    for(whichPass = 1; whichPass<=2; whichPass++) {
        for(whichHalf = 1; whichHalf<=2; whichHalf++) {
            vector float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
            vector float tmp10, tmp11, tmp12, tmp13;
            vector float z1, z2, z3, z4, z5;
@@ -235,8 +233,7 @@ int dct_quantize_altivec(MpegEncContext* s,
                SWAP(row7, alt7);
            }

            if (whichPass == 1) {
                // transpose the data for the second pass
                // First, block transpose the upper right with lower left.
@@ -261,8 +258,7 @@ int dct_quantize_altivec(MpegEncContext* s,
        const vector signed int* qmat;
        vector float bias, negBias;

        if (s->mb_intra) {
            vector signed int baseVector;

            // We must cache element 0 in the intra case
@@ -272,9 +268,7 @@ int dct_quantize_altivec(MpegEncContext* s,
            qmat = (vector signed int*)s->q_intra_matrix[qscale];
            biasAddr = &(s->intra_quant_bias);
        } else {
            qmat = (vector signed int*)s->q_inter_matrix[qscale];
            biasAddr = &(s->inter_quant_bias);
        }
@@ -439,8 +433,7 @@ int dct_quantize_altivec(MpegEncContext* s,
    // and handle it using the vector unit if we can. This is the permute used
    // by the altivec idct, so it is common when using the altivec dct.
    if ((lastNonZero > 0) && (s->dsp.idct_permutation_type == FF_TRANSPOSE_IDCT_PERM)) {
        TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7);
    }
@@ -456,10 +449,8 @@ int dct_quantize_altivec(MpegEncContext* s,
    }

    // special handling of block[0]
    if (s->mb_intra) {
        if (!s->h263_aic) {
            if (n < 4)
                oldBaseValue /= s->y_dc_scale;
            else
@@ -474,8 +465,7 @@ int dct_quantize_altivec(MpegEncContext* s,
    // need to permute the "no" permutation case.
    if ((lastNonZero > 0) &&
        (s->dsp.idct_permutation_type != FF_TRANSPOSE_IDCT_PERM) &&
        (s->dsp.idct_permutation_type != FF_NO_IDCT_PERM)) {
        ff_block_permute(data, s->dsp.idct_permutation,
                         s->intra_scantable.scantable, lastNonZero);
    }
@@ -483,10 +473,8 @@ int dct_quantize_altivec(MpegEncContext* s,
    return lastNonZero;
}
/* AltiVec version of dct_unquantize_h263
   this code assumes `block' is 16 bytes-aligned */
void dct_unquantize_h263_altivec(MpegEncContext *s,
                                 DCTELEM *block, int n, int qscale)
{
@@ -517,82 +505,81 @@ POWERPC_PERF_START_COUNT(altivec_dct_unquantize_h263_num, 1);
    }

    {
        register const vector signed short vczero = (const vector signed short)vec_splat_s16(0);
        DECLARE_ALIGNED_16(short, qmul8[]) =
            {
                qmul, qmul, qmul, qmul,
                qmul, qmul, qmul, qmul
            };
        DECLARE_ALIGNED_16(short, qadd8[]) =
            {
                qadd, qadd, qadd, qadd,
                qadd, qadd, qadd, qadd
            };
        DECLARE_ALIGNED_16(short, nqadd8[]) =
            {
                -qadd, -qadd, -qadd, -qadd,
                -qadd, -qadd, -qadd, -qadd
            };
        register vector signed short blockv, qmulv, qaddv, nqaddv, temp1;
        register vector bool short blockv_null, blockv_neg;
        register short backup_0 = block[0];
        register int j = 0;

        qmulv = vec_ld(0, qmul8);
        qaddv = vec_ld(0, qadd8);
        nqaddv = vec_ld(0, nqadd8);

#if 0 // block *is* 16 bytes-aligned, it seems.
        // first make sure block[j] is 16 bytes-aligned
        for(j = 0; (j <= nCoeffs) && ((((unsigned long)block) + (j << 1)) & 0x0000000F) ; j++) {
            level = block[j];
            if (level) {
                if (level < 0) {
                    level = level * qmul - qadd;
                } else {
                    level = level * qmul + qadd;
                }
                block[j] = level;
            }
        }
#endif

        // vectorize all the 16 bytes-aligned blocks
        // of 8 elements
        for(; (j + 7) <= nCoeffs ; j+=8) {
            blockv = vec_ld(j << 1, block);
            blockv_neg = vec_cmplt(blockv, vczero);
            blockv_null = vec_cmpeq(blockv, vczero);
            // choose between +qadd or -qadd as the third operand
            temp1 = vec_sel(qaddv, nqaddv, blockv_neg);
            // multiply & add (block{i,i+7} * qmul [+-] qadd)
            temp1 = vec_mladd(blockv, qmulv, temp1);
            // put 0 where block[{i,i+7} used to have 0
            blockv = vec_sel(temp1, blockv, blockv_null);
            vec_st(blockv, j << 1, block);
        }

        // if nCoeffs isn't a multiple of 8, finish the job
        // using good old scalar units.
        // (we could do it using a truncated vector,
        // but I'm not sure it's worth the hassle)
        for(; j <= nCoeffs ; j++) {
            level = block[j];
            if (level) {
                if (level < 0) {
                    level = level * qmul - qadd;
                } else {
                    level = level * qmul + qadd;
                }
                block[j] = level;
            }
        }

        if (i == 1) {
            // cheat. this avoid special-casing the first iteration
            block[0] = backup_0;
        }
    }
    POWERPC_PERF_STOP_COUNT(altivec_dct_unquantize_h263_num, nCoeffs == 63);
}
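The vec_sel/vec_mladd loop above applies, eight coefficients at a time, exactly the rule spelled out by the scalar tail loop. As a compact reference (editor's sketch, not part of the commit):

static inline int h263_dequant_coeff(int level, int qmul, int qadd)
{
    if (level == 0)
        return 0;                            /* zeros preserved via blockv_null */
    return level < 0 ? level * qmul - qadd   /* nqaddv selected by blockv_neg */
                     : level * qmul + qadd;  /* qaddv otherwise */
}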
@@ -605,11 +592,9 @@ void MPV_common_init_altivec(MpegEncContext *s)
{
    if ((mm_flags & MM_ALTIVEC) == 0) return;

    if (s->avctx->lowres==0) {
        if ((s->avctx->idct_algo == FF_IDCT_AUTO) ||
            (s->avctx->idct_algo == FF_IDCT_ALTIVEC)) {
            s->dsp.idct_put = idct_put_altivec;
            s->dsp.idct_add = idct_add_altivec;
            s->dsp.idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
@@ -618,15 +603,13 @@ void MPV_common_init_altivec(MpegEncContext *s)
    // Test to make sure that the dct required alignments are met.
    if ((((long)(s->q_intra_matrix) & 0x0f) != 0) ||
        (((long)(s->q_inter_matrix) & 0x0f) != 0)) {
        av_log(s->avctx, AV_LOG_INFO, "Internal Error: q-matrix blocks must be 16-byte aligned "
               "to use AltiVec DCT. Reverting to non-AltiVec version.\n");
        return;
    }

    if (((long)(s->intra_scantable.inverse) & 0x0f) != 0) {
        av_log(s->avctx, AV_LOG_INFO, "Internal Error: scan table blocks must be 16-byte aligned "
               "to use AltiVec DCT. Reverting to non-AltiVec version.\n");
        return;
@@ -634,8 +617,7 @@ void MPV_common_init_altivec(MpegEncContext *s)
    if ((s->avctx->dct_algo == FF_DCT_AUTO) ||
        (s->avctx->dct_algo == FF_DCT_ALTIVEC)) {
#if 0 /* seems to cause trouble under some circumstances */
        s->dct_quantize = dct_quantize_altivec;
#endif
...
@@ -379,8 +379,7 @@ void ff_snow_vertical_compose97i_altivec(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2,
    v4=(vector signed int *)b4;
    v5=(vector signed int *)b5;

    for (i=0; i< w4;i++) {
#if 0
        b4[i] -= (3*(b3[i] + b5[i])+4)>>3;
@@ -782,8 +781,8 @@ void ff_snow_inner_add_yblock_altivec(uint8_t *obmc, const int obmc_stride,
void snow_init_altivec(DSPContext* c, AVCodecContext *avctx)
{
#if 0
    c->horizontal_compose97i = ff_snow_horizontal_compose97i_altivec;
    c->vertical_compose97i = ff_snow_vertical_compose97i_altivec;
    c->inner_add_yblock = ff_snow_inner_add_yblock_altivec;
#endif
}