Commit 222c46c5 authored by Clément Bœsch

x86/vp9lpf: add ff_vp9_loop_filter_[vh]_88_16_{ssse3,avx}.

9680 decicycles in loop_filter_v_88_16_c, 4193765 runs, 539 skips
9233 decicycles in loop_filter_h_88_16_c, 4193751 runs, 553 skips

1929 decicycles in ff_vp9_loop_filter_v_88_16_ssse3, 4194118 runs, 186 skips
2738 decicycles in ff_vp9_loop_filter_h_88_16_ssse3, 4193861 runs, 443 skips

5.978 → 5.417 overall decode time on ped1080p.webm (-threads 1)

Adding SSE2 support should be relatively trivial (just a matter of
replacing the pshufb [mask_mix] with something else); patches welcome.
parent 2a9c5079
...@@ -187,6 +187,8 @@ void ff_vp9_loop_filter_h_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stri ...@@ -187,6 +187,8 @@ void ff_vp9_loop_filter_h_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stri
lpf_funcs(16, 16, sse2); lpf_funcs(16, 16, sse2);
lpf_funcs(16, 16, ssse3); lpf_funcs(16, 16, ssse3);
lpf_funcs(16, 16, avx); lpf_funcs(16, 16, avx);
lpf_funcs(88, 16, ssse3);
lpf_funcs(88, 16, avx);
#undef lpf_funcs #undef lpf_funcs
...@@ -271,6 +273,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp) ...@@ -271,6 +273,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
dsp->itxfm_add[TX_32X32][ADST_DCT] = dsp->itxfm_add[TX_32X32][ADST_DCT] =
dsp->itxfm_add[TX_32X32][DCT_ADST] = dsp->itxfm_add[TX_32X32][DCT_ADST] =
dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_ssse3; dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_ssse3;
dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_ssse3;
dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_ssse3;
dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_ssse3; dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_ssse3;
dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_ssse3; dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_ssse3;
} }
...@@ -290,6 +294,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp) ...@@ -290,6 +294,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
dsp->itxfm_add[TX_32X32][ADST_DCT] = dsp->itxfm_add[TX_32X32][ADST_DCT] =
dsp->itxfm_add[TX_32X32][DCT_ADST] = dsp->itxfm_add[TX_32X32][DCT_ADST] =
dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx; dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx;
dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_avx;
dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_avx;
dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_avx; dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_avx;
dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_avx; dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_avx;
} }
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment