Commit c5dd73b8 authored by Clément Bœsch

x86/vp9lpf: add ff_vp9_loop_filter_h_{48,84}_16_{sse2,ssse3,avx}().

5.40s → 5.30s overall decode time with -threads 1 on ped1080p.webm
(i7 920, ssse3)
parent 6dc9d2cf
...@@ -187,6 +187,12 @@ void ff_vp9_loop_filter_h_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stri ...@@ -187,6 +187,12 @@ void ff_vp9_loop_filter_h_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stri
lpf_funcs(16, 16, sse2); lpf_funcs(16, 16, sse2);
lpf_funcs(16, 16, ssse3); lpf_funcs(16, 16, ssse3);
lpf_funcs(16, 16, avx); lpf_funcs(16, 16, avx);
lpf_funcs(84, 16, sse2);
lpf_funcs(84, 16, ssse3);
lpf_funcs(84, 16, avx);
lpf_funcs(48, 16, sse2);
lpf_funcs(48, 16, ssse3);
lpf_funcs(48, 16, avx);
lpf_funcs(88, 16, sse2); lpf_funcs(88, 16, sse2);
lpf_funcs(88, 16, ssse3); lpf_funcs(88, 16, ssse3);
lpf_funcs(88, 16, avx); lpf_funcs(88, 16, avx);
...@@ -224,6 +230,19 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp) ...@@ -224,6 +230,19 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
init_subpel2(idx, 0, 1, v, type, opt); \ init_subpel2(idx, 0, 1, v, type, opt); \
init_subpel2(idx, 1, 0, h, type, opt) init_subpel2(idx, 1, 0, h, type, opt)
/*
 * init_lpf(opt): install the loop filter functions for one instruction-set
 * suffix `opt` (pasted into the symbol names via ##opt).
 *
 * Assigns both entries of dsp->loop_filter_16 (index 0 = h, index 1 = v, going
 * by the function names) and the [0][1], [1][0] and [1][1] rows of
 * dsp->loop_filter_mix2 with the 48, 84 and 88 variants respectively.
 * Nothing is assigned unless ARCH_X86_64 is true. The [0][0] row of
 * loop_filter_mix2 is not touched by this macro.
 *
 * Wrapped in do { ... } while (0) so it behaves as a single statement.
 */
#define init_lpf(opt) do { \
if (ARCH_X86_64) { \
dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_##opt; \
dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_##opt; \
dsp->loop_filter_mix2[0][1][0] = ff_vp9_loop_filter_h_48_16_##opt; \
dsp->loop_filter_mix2[0][1][1] = ff_vp9_loop_filter_v_48_16_##opt; \
dsp->loop_filter_mix2[1][0][0] = ff_vp9_loop_filter_h_84_16_##opt; \
dsp->loop_filter_mix2[1][0][1] = ff_vp9_loop_filter_v_84_16_##opt; \
dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_##opt; \
dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_##opt; \
} \
} while (0)
if (EXTERNAL_MMX(cpu_flags)) { if (EXTERNAL_MMX(cpu_flags)) {
init_fpel(4, 0, 4, put, mmx); init_fpel(4, 0, 4, put, mmx);
init_fpel(3, 0, 8, put, mmx); init_fpel(3, 0, 8, put, mmx);
...@@ -248,12 +267,7 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp) ...@@ -248,12 +267,7 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
init_fpel(2, 1, 16, avg, sse2); init_fpel(2, 1, 16, avg, sse2);
init_fpel(1, 1, 32, avg, sse2); init_fpel(1, 1, 32, avg, sse2);
init_fpel(0, 1, 64, avg, sse2); init_fpel(0, 1, 64, avg, sse2);
if (ARCH_X86_64) { init_lpf(sse2);
dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_sse2;
dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_sse2;
dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_sse2;
dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_sse2;
}
} }
if (EXTERNAL_SSSE3(cpu_flags)) { if (EXTERNAL_SSSE3(cpu_flags)) {
...@@ -276,11 +290,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp) ...@@ -276,11 +290,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
dsp->itxfm_add[TX_32X32][ADST_DCT] = dsp->itxfm_add[TX_32X32][ADST_DCT] =
dsp->itxfm_add[TX_32X32][DCT_ADST] = dsp->itxfm_add[TX_32X32][DCT_ADST] =
dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_ssse3; dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_ssse3;
dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_ssse3;
dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_ssse3;
dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_ssse3;
dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_ssse3;
} }
init_lpf(ssse3);
} }
if (EXTERNAL_AVX(cpu_flags)) { if (EXTERNAL_AVX(cpu_flags)) {
...@@ -297,11 +308,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp) ...@@ -297,11 +308,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
dsp->itxfm_add[TX_32X32][ADST_DCT] = dsp->itxfm_add[TX_32X32][ADST_DCT] =
dsp->itxfm_add[TX_32X32][DCT_ADST] = dsp->itxfm_add[TX_32X32][DCT_ADST] =
dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx; dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx;
dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_avx;
dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_avx;
dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_avx;
dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_avx;
} }
init_lpf(avx);
} }
#undef init_fpel #undef init_fpel
......
...@@ -44,6 +44,11 @@ pw_8: times 8 dw 8 ...@@ -44,6 +44,11 @@ pw_8: times 8 dw 8
mask_mix: times 8 db 0 mask_mix: times 8 db 0
times 8 db 1 times 8 db 1
; 16-byte byte masks for the mixed-width (84 / 48) loop filter variants.
; The loop filter macro ANDs the matching constant into the flat8in mask
; (pand m2, [mask_mix%2]) when its size argument is 84 or 48.
;   mask_mix84: first 8 bytes 0xff, last 8 bytes 0x00
;   mask_mix48: first 8 bytes 0x00, last 8 bytes 0xff
mask_mix84: times 8 db 0xff
times 8 db 0x00
mask_mix48: times 8 db 0x00
times 8 db 0xff
SECTION .text SECTION .text
; %1 = abs(%2-%3) ; %1 = abs(%2-%3)
...@@ -324,7 +329,7 @@ SECTION .text ...@@ -324,7 +329,7 @@ SECTION .text
neg mstride3q neg mstride3q
%ifidn %1, h %ifidn %1, h
%if %2 == 88 %if %2 > 16
%define movx movh %define movx movh
lea dstq, [dstq + 8*strideq - 4] lea dstq, [dstq + 8*strideq - 4]
%else %else
...@@ -372,7 +377,7 @@ SECTION .text ...@@ -372,7 +377,7 @@ SECTION .text
%define Q6 rsp + 224 %define Q6 rsp + 224
%define Q7 rsp + 240 %define Q7 rsp + 240
%if %2 != 88 %if %2 == 16
TRANSPOSE16x16B 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, [rsp] TRANSPOSE16x16B 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, [rsp]
mova [P7], m0 mova [P7], m0
mova [P6], m1 mova [P6], m1
...@@ -389,7 +394,7 @@ SECTION .text ...@@ -389,7 +394,7 @@ SECTION .text
mova [Q1], m9 mova [Q1], m9
mova [Q2], m10 mova [Q2], m10
mova [Q3], m11 mova [Q3], m11
%if %2 != 88 %if %2 == 16
mova [Q4], m12 mova [Q4], m12
mova [Q5], m13 mova [Q5], m13
mova [Q6], m14 mova [Q6], m14
...@@ -404,7 +409,7 @@ SECTION .text ...@@ -404,7 +409,7 @@ SECTION .text
%endif %endif
SPLATB_REG m2, I, m0 ; I I I I ... SPLATB_REG m2, I, m0 ; I I I I ...
SPLATB_REG m3, E, m0 ; E E E E ... SPLATB_REG m3, E, m0 ; E E E E ...
%elif %2 == 88 %else
%if cpuflag(ssse3) %if cpuflag(ssse3)
mova m0, [mask_mix] mova m0, [mask_mix]
%endif %endif
...@@ -462,7 +467,7 @@ SECTION .text ...@@ -462,7 +467,7 @@ SECTION .text
ABSSUB_CMP m1, m9, m11, m6, m4, m5, m8 ; abs(p2 - p0) <= 1 ABSSUB_CMP m1, m9, m11, m6, m4, m5, m8 ; abs(p2 - p0) <= 1
pand m2, m1 pand m2, m1
ABSSUB m4, m10, m11, m5 ; abs(p1 - p0) ABSSUB m4, m10, m11, m5 ; abs(p1 - p0)
%if %2 != 88 %if %2 == 16
%if cpuflag(ssse3) %if cpuflag(ssse3)
pxor m0, m0 pxor m0, m0
%endif %endif
...@@ -490,8 +495,11 @@ SECTION .text ...@@ -490,8 +495,11 @@ SECTION .text
pand m2, m1 pand m2, m1
ABSSUB_CMP m1, m15, m12, m6, m4, m5, m8 ; abs(q3 - q0) <= 1 ABSSUB_CMP m1, m15, m12, m6, m4, m5, m8 ; abs(q3 - q0) <= 1
pand m2, m1 ; flat8in final value pand m2, m1 ; flat8in final value
%if %2 == 84 || %2 == 48
pand m2, [mask_mix%2]
%endif
%if %2 != 88 %if %2 == 16
; (m0: hev, m2: flat8in, m3: fm, m6: pb_81, m9..15: p2 p1 p0 q0 q1 q2 q3) ; (m0: hev, m2: flat8in, m3: fm, m6: pb_81, m9..15: p2 p1 p0 q0 q1 q2 q3)
; calc flat8out mask ; calc flat8out mask
mova m8, [P7] mova m8, [P7]
...@@ -584,7 +592,7 @@ SECTION .text ...@@ -584,7 +592,7 @@ SECTION .text
; ([m1: flat8out], m2: flat8in, m3: fm, m10..13: p1 p0 q0 q1) ; ([m1: flat8out], m2: flat8in, m3: fm, m10..13: p1 p0 q0 q1)
; filter6() ; filter6()
pxor m0, m0 pxor m0, m0
%if %2 == 88 %if %2 > 16
pand m3, m2 pand m3, m2
%else %else
pand m2, m3 ; mask(fm) & mask(in) pand m2, m3 ; mask(fm) & mask(in)
...@@ -622,7 +630,7 @@ SECTION .text ...@@ -622,7 +630,7 @@ SECTION .text
; q5 +5 -p2 -q4 +q5 +q7 . q5 . . ; q5 +5 -p2 -q4 +q5 +q7 . q5 . .
; q6 +6 -p1 -q5 +q6 +q7 . q6 . . ; q6 +6 -p1 -q5 +q6 +q7 . q6 . .
%if %2 != 88 %if %2 == 16
pand m1, m2 ; mask(out) & (mask(fm) & mask(in)) pand m1, m2 ; mask(out) & (mask(fm) & mask(in))
mova m2, [P7] mova m2, [P7]
mova m3, [P6] mova m3, [P6]
...@@ -645,7 +653,7 @@ SECTION .text ...@@ -645,7 +653,7 @@ SECTION .text
%endif %endif
%ifidn %1, h %ifidn %1, h
%if %2 != 88 %if %2 == 16
mova m0, [P7] mova m0, [P7]
mova m1, [P6] mova m1, [P6]
mova m2, [P5] mova m2, [P5]
...@@ -753,28 +761,23 @@ SECTION .text ...@@ -753,28 +761,23 @@ SECTION .text
RET RET
%endmacro %endmacro
%macro LPF_16_16_VH 1 %macro LPF_16_VH 2
INIT_XMM %1 INIT_XMM %2
cglobal vp9_loop_filter_v_16_16, 5,10,16, dst, stride, E, I, H, mstride, dst1, dst2, stride3, mstride3 cglobal vp9_loop_filter_v_%1_16, 5,10,16, dst, stride, E, I, H, mstride, dst1, dst2, stride3, mstride3
LOOPFILTER v, 16 LOOPFILTER v, %1
cglobal vp9_loop_filter_h_16_16, 5,10,16, 256, dst, stride, E, I, H, mstride, dst1, dst2, stride3, mstride3 cglobal vp9_loop_filter_h_%1_16, 5,10,16, 256, dst, stride, E, I, H, mstride, dst1, dst2, stride3, mstride3
LOOPFILTER h, 16 LOOPFILTER h, %1
%endmacro %endmacro
%macro LPF_88_16_VH 1 %macro LPF_16_VH_ALL_OPTS 1
INIT_XMM %1 LPF_16_VH %1, sse2
cglobal vp9_loop_filter_v_88_16, 5,10,16, dst, stride, E, I, H, mstride, dst1, dst2, stride3, mstride3 LPF_16_VH %1, ssse3
LOOPFILTER v, 88 LPF_16_VH %1, avx
cglobal vp9_loop_filter_h_88_16, 5,10,16, 256, dst, stride, E, I, H, mstride, dst1, dst2, stride3, mstride3
LOOPFILTER h, 88
%endmacro %endmacro
LPF_16_16_VH sse2 LPF_16_VH_ALL_OPTS 16
LPF_16_16_VH ssse3 LPF_16_VH_ALL_OPTS 48
LPF_16_16_VH avx LPF_16_VH_ALL_OPTS 84
LPF_16_VH_ALL_OPTS 88
LPF_88_16_VH sse2
LPF_88_16_VH ssse3
LPF_88_16_VH avx
%endif ; x86-64 %endif ; x86-64
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment