Commit 92d47550 authored by James Almer, committed by Anton Khirnov

vp9lpf/x86: add an SSE2 version of vp9_loop_filter_[vh]_88_16

Similar gains as the ssse3 version once again

Additional improvements by Clément Bœsch <u@pkh.me>.
Signed-off-by: James Almer <jamrial@gmail.com>
Signed-off-by: Anton Khirnov <anton@khirnov.net>
parent 6bea4781
...@@ -226,6 +226,7 @@ void ff_vp9_loop_filter_h_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stri ...@@ -226,6 +226,7 @@ void ff_vp9_loop_filter_h_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stri
lpf_funcs(16, 16, sse2); lpf_funcs(16, 16, sse2);
lpf_funcs(16, 16, ssse3); lpf_funcs(16, 16, ssse3);
lpf_funcs(16, 16, avx); lpf_funcs(16, 16, avx);
lpf_funcs(88, 16, sse2);
lpf_funcs(88, 16, ssse3); lpf_funcs(88, 16, ssse3);
lpf_funcs(88, 16, avx); lpf_funcs(88, 16, avx);
...@@ -293,6 +294,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp) ...@@ -293,6 +294,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
init_fpel(1, 1, 32, avg, sse2); init_fpel(1, 1, 32, avg, sse2);
init_fpel(0, 1, 64, avg, sse2); init_fpel(0, 1, 64, avg, sse2);
if (ARCH_X86_64) { if (ARCH_X86_64) {
dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_sse2;
dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_sse2;
dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_sse2; dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_sse2;
dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_sse2; dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_sse2;
} }
......
...@@ -292,6 +292,17 @@ SECTION .text ...@@ -292,6 +292,17 @@ SECTION .text
%define Q7 dst2q + strideq %define Q7 dst2q + strideq
%endmacro %endmacro
; Broadcast the two lowest bytes of %1 across the register, 8 copies each:
; ..............AB -> AAAAAAAABBBBBBBB
;   %1 = register holding the two source bytes in its lowest positions;
;        overwritten in place with the result
;   %2 = byte-shuffle control operand, consumed only by the SSSE3+ path
;        (optional, defaults to [mask_mix])
%macro SPLATB_MIX 1-2 [mask_mix]
%if cpuflag(ssse3)
; One shuffle does the whole job. NOTE(review): mask_mix is defined outside
; this view -- presumably it picks byte 0 for one half and byte 1 for the
; other; confirm against its definition.
pshufb %1, %2
%else
; pshufb is an SSSE3 instruction, so emulate it here: interleaving the
; register with itself doubles each byte's run length at every step
; (1 -> 2 -> 4 -> 8 copies). %2 is ignored on this path.
punpcklbw %1, %1
punpcklwd %1, %1
punpckldq %1, %1
%endif
%endmacro
%macro LOOPFILTER 2 ; %1=v/h %2=size1 %macro LOOPFILTER 2 ; %1=v/h %2=size1
lea mstrideq, [strideq] lea mstrideq, [strideq]
neg mstrideq neg mstrideq
...@@ -382,11 +393,13 @@ SECTION .text ...@@ -382,11 +393,13 @@ SECTION .text
SPLATB_REG m2, I, m0 ; I I I I ... SPLATB_REG m2, I, m0 ; I I I I ...
SPLATB_REG m3, E, m0 ; E E E E ... SPLATB_REG m3, E, m0 ; E E E E ...
%elif %2 == 88 %elif %2 == 88
%if cpuflag(ssse3)
mova m0, [mask_mix] mova m0, [mask_mix]
%endif
movd m2, Id movd m2, Id
movd m3, Ed movd m3, Ed
pshufb m2, m0 SPLATB_MIX m2, m0
pshufb m3, m0 SPLATB_MIX m3, m0
%endif %endif
mova m0, [pb_80] mova m0, [pb_80]
pxor m2, m0 pxor m2, m0
...@@ -446,7 +459,7 @@ SECTION .text ...@@ -446,7 +459,7 @@ SECTION .text
SPLATB_REG m7, H, m0 ; H H H H ... SPLATB_REG m7, H, m0 ; H H H H ...
%else %else
movd m7, Hd movd m7, Hd
pshufb m7, [mask_mix] SPLATB_MIX m7
%endif %endif
pxor m7, m8 pxor m7, m8
pxor m4, m8 pxor m4, m8
...@@ -727,6 +740,7 @@ LPF_16_16_VH sse2 ...@@ -727,6 +740,7 @@ LPF_16_16_VH sse2
LPF_16_16_VH ssse3 LPF_16_16_VH ssse3
LPF_16_16_VH avx LPF_16_16_VH avx
LPF_88_16_VH sse2
LPF_88_16_VH ssse3 LPF_88_16_VH ssse3
LPF_88_16_VH avx LPF_88_16_VH avx
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment