Commit 08571377 authored by Luca Barbato

Add C/B == 0 cases, 2% slower on CELL but should address Issue299 eventually

Originally committed as revision 11306 to svn://svn.ffmpeg.org/ffmpeg/trunk
parent 1e0f3468
...@@ -52,11 +52,12 @@ ...@@ -52,11 +52,12 @@
src += stride; src += stride;
#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \ #define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc2uc);\ \
vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc0uc);\
vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc1uc);\
\ \
psum = vec_mladd(vA, vsrc0ssH, v32ss);\ psum = vec_mladd(vA, vsrc0ssH, v32ss);\
psum = vec_mladd(vB, vsrc1ssH, psum);\ psum = vec_mladd(vE, vsrc1ssH, psum);\
psum = vec_mladd(vC, vsrc2ssH, psum);\
psum = vec_sr(psum, v6us);\ psum = vec_sr(psum, v6us);\
\ \
vdst = vec_ld(0, dst);\ vdst = vec_ld(0, dst);\
...@@ -66,9 +67,6 @@ ...@@ -66,9 +67,6 @@
OP_U8_ALTIVEC(fsum, vfdst, vdst);\ OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\ \
vec_st(fsum, 0, dst);\ vec_st(fsum, 0, dst);\
\
vsrc0ssH = vsrc1ssH;\
vsrc1ssH = vsrc2ssH;\
\ \
dst += stride;\ dst += stride;\
src += stride; src += stride;
...@@ -155,23 +153,48 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, ...@@ -155,23 +153,48 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
} }
} }
} else { } else {
const vec_s16_t vE = vec_add(vB, vC);
if (ABCD[2]) { // y == 0 B == 0
if (!loadSecond) {// -> !reallyBadAlign if (!loadSecond) {// -> !reallyBadAlign
for (i = 0 ; i < h ; i++) { for (i = 0 ; i < h ; i++) {
vsrcCuc = vec_ld(stride + 0, src); vsrcCuc = vec_ld(stride + 0, src);
vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
CHROMA_MC8_ALTIVEC_CORE_SIMPLE CHROMA_MC8_ALTIVEC_CORE_SIMPLE
vsrc0uc = vsrc1uc;
} }
} else { } else {
vec_u8_t vsrcDuc; vec_u8_t vsrcDuc;
for (i = 0 ; i < h ; i++) { for (i = 0 ; i < h ; i++) {
vsrcCuc = vec_ld(stride + 0, src); vsrcCuc = vec_ld(0, src);
vsrcDuc = vec_ld(stride + 16, src); vsrcDuc = vec_ld(15, src);
vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
CHROMA_MC8_ALTIVEC_CORE_SIMPLE
vsrc0uc = vsrc1uc;
}
}
} else { // x == 0 C == 0
if (!loadSecond) {// -> !reallyBadAlign
for (i = 0 ; i < h ; i++) {
CHROMA_MC8_ALTIVEC_CORE_SIMPLE
vsrcCuc = vec_ld(0, src);
vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
}
} else {
vec_u8_t vsrcDuc;
for (i = 0 ; i < h ; i++) {
CHROMA_MC8_ALTIVEC_CORE_SIMPLE CHROMA_MC8_ALTIVEC_CORE_SIMPLE
vsrcCuc = vec_ld(0, src);
vsrcDuc = vec_ld(15, src);
vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
} }
} }
}
} }
POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1); POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1);
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment