Commit d2a7314f authored by James Almer, committed by Clément Bœsch

vp9/x86: add ff_vp9_loop_filter_[vh]_16_16_sse2().

Similar performance gains as with the SSSE3 version.
Signed-off-by: James Almer <jamrial@gmail.com>
parent 3ca7085a
@@ -177,10 +177,17 @@ itxfm_func(idct, idct, 32, avx);
 #undef itxfm_func
 #undef itxfm_funcs
 
-void ff_vp9_loop_filter_v_16_16_ssse3(uint8_t *dst, ptrdiff_t stride, int E, int I, int H);
-void ff_vp9_loop_filter_v_16_16_avx  (uint8_t *dst, ptrdiff_t stride, int E, int I, int H);
-void ff_vp9_loop_filter_h_16_16_ssse3(uint8_t *dst, ptrdiff_t stride, int E, int I, int H);
-void ff_vp9_loop_filter_h_16_16_avx  (uint8_t *dst, ptrdiff_t stride, int E, int I, int H);
+#define lpf_funcs(size1, size2, opt) \
+void ff_vp9_loop_filter_v_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \
+                                                    int E, int I, int H); \
+void ff_vp9_loop_filter_h_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \
+                                                    int E, int I, int H)
+
+lpf_funcs(16, 16, sse2);
+lpf_funcs(16, 16, ssse3);
+lpf_funcs(16, 16, avx);
+
+#undef lpf_funcs
 
 #endif /* HAVE_YASM */
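For reference, each lpf_funcs() invocation above expands (via ## token pasting) into one vertical and one horizontal prototype; lpf_funcs(16, 16, sse2), for example, yields:

    void ff_vp9_loop_filter_v_16_16_sse2(uint8_t *dst, ptrdiff_t stride,
                                         int E, int I, int H);
    void ff_vp9_loop_filter_h_16_16_sse2(uint8_t *dst, ptrdiff_t stride,
                                         int E, int I, int H);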
@@ -230,6 +237,10 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
         init_fpel(2, 1, 16, avg, sse2);
         init_fpel(1, 1, 32, avg, sse2);
         init_fpel(0, 1, 64, avg, sse2);
+        if (ARCH_X86_64) {
+            dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_sse2;
+            dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_sse2;
+        }
     }
 
     if (EXTERNAL_SSSE3(cpu_flags)) {
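The new pointers are installed inside the EXTERNAL_SSE2() block, which runs before the SSSE3/AVX blocks, so a later block that also sets loop_filter_16[] simply overwrites them and the fastest supported version wins. The ARCH_X86_64 guard is needed because the assembly requests 16 XMM registers (see the cglobal line further down), which 32-bit x86 does not provide. A rough sketch of the resulting selection order, assuming the SSSE3 block wires the same pointers (as the earlier SSSE3 commit did); this is an illustration, not a verbatim copy of the init file:

    if (EXTERNAL_SSE2(cpu_flags)) {
        if (ARCH_X86_64) {
            dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_sse2;
            dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_sse2;
        }
    }
    if (EXTERNAL_SSSE3(cpu_flags)) {
        if (ARCH_X86_64) {
            /* overwrites the SSE2 pointers when SSSE3 is available */
            dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_ssse3;
            dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_ssse3;
        }
    }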
@@ -284,11 +284,11 @@ SECTION .text
 %endif
 
     ; calc fm mask
+%if cpuflag(ssse3)
     pxor                m0, m0
-    movd                m2, Id
-    movd                m3, Ed
-    pshufb              m2, m0                  ; I I I I ...
-    pshufb              m3, m0                  ; E E E E ...
+%endif
+    SPLATB_REG          m2, I, m0               ; I I I I ...
+    SPLATB_REG          m3, E, m0               ; E E E E ...
     mova                m0, [pb_80]
     pxor                m2, m0
     pxor                m3, m0
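The reason for switching to SPLATB_REG: pshufb is an SSSE3 instruction, so broadcasting the I/E/H threshold bytes needs a different instruction sequence in the SSE2 build, and the zeroed m0 register is only required as the pshufb shuffle mask, hence the new %if cpuflag(ssse3) guard around the pxor. The following C-intrinsics sketch illustrates the two splat strategies; it is an illustration of the idea, not the actual asm macro, and splat_byte_ssse3/splat_byte_sse2 are made-up names:

    #include <emmintrin.h>   /* SSE2  */
    #include <tmmintrin.h>   /* SSSE3 */

    /* SSSE3: pshufb with an all-zero mask copies byte 0 into every lane. */
    static __m128i splat_byte_ssse3(int v)
    {
        __m128i x    = _mm_cvtsi32_si128(v);     /* movd        */
        __m128i zero = _mm_setzero_si128();      /* pxor m0, m0 */
        return _mm_shuffle_epi8(x, zero);        /* pshufb      */
    }

    /* SSE2 has no pshufb; one common fallback widens the byte and uses
     * 16- and 64-bit shuffles instead. */
    static __m128i splat_byte_sse2(int v)
    {
        __m128i x = _mm_cvtsi32_si128(v);        /* movd        */
        x = _mm_unpacklo_epi8(x, x);             /* punpcklbw   */
        x = _mm_shufflelo_epi16(x, 0);           /* pshuflw     */
        return _mm_unpacklo_epi64(x, x);         /* punpcklqdq  */
    }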
@@ -340,9 +340,10 @@ SECTION .text
     ABSSUB_CMP          m1, m9, m11, m6, m4, m5, m8 ; abs(p2 - p0) <= 1
     pand                m2, m1
     ABSSUB              m4, m10, m11, m5            ; abs(p1 - p0)
+%if cpuflag(ssse3)
     pxor                m0, m0
-    movd                m7, Hd
-    pshufb              m7, m0                      ; H H H H ...
+%endif
+    SPLATB_REG          m7, H, m0                   ; H H H H ...
     pxor                m7, m8
     pxor                m4, m8
     pcmpgtb             m0, m4, m7                  ; abs(p1 - p0) > H (1/2 hev condition)
@@ -665,6 +666,7 @@ cglobal vp9_loop_filter_h_16_16, 5,8,16, 256, dst, stride, E, I, H, mstride, dst
     RET
 %endmacro
 
+LPF_16_16_VH sse2
 LPF_16_16_VH ssse3
 LPF_16_16_VH avx