Commit 9f10cff6 authored by Martin Storsjö's avatar Martin Storsjö

aarch64: Add NEON optimizations for 10 and 12 bit vp9 loop filter

This work is sponsored by, and copyright, Google.

This is similar to the arm version, but due to the larger registers
on aarch64, we can do 8 pixels at a time for all filter sizes.

Examples of runtimes vs the 32 bit version, on a Cortex A53:
                                             ARM AArch64
vp9_loop_filter_h_4_8_10bpp_neon:          213.2   172.6
vp9_loop_filter_h_8_8_10bpp_neon:          281.2   244.2
vp9_loop_filter_h_16_8_10bpp_neon:         657.0   444.5
vp9_loop_filter_h_16_16_10bpp_neon:       1280.4   877.7
vp9_loop_filter_mix2_h_44_16_10bpp_neon:   397.7   358.0
vp9_loop_filter_mix2_h_48_16_10bpp_neon:   465.7   429.0
vp9_loop_filter_mix2_h_84_16_10bpp_neon:   465.7   428.0
vp9_loop_filter_mix2_h_88_16_10bpp_neon:   533.7   499.0
vp9_loop_filter_mix2_v_44_16_10bpp_neon:   271.5   244.0
vp9_loop_filter_mix2_v_48_16_10bpp_neon:   330.0   305.0
vp9_loop_filter_mix2_v_84_16_10bpp_neon:   329.0   306.0
vp9_loop_filter_mix2_v_88_16_10bpp_neon:   386.0   365.0
vp9_loop_filter_v_4_8_10bpp_neon:          150.0   115.2
vp9_loop_filter_v_8_8_10bpp_neon:          209.0   175.5
vp9_loop_filter_v_16_8_10bpp_neon:         492.7   345.2
vp9_loop_filter_v_16_16_10bpp_neon:        951.0   682.7

This is significantly faster than the ARM version in almost
all cases except for the mix2 functions.

Based on START_TIMER/STOP_TIMER wrapping around a few individual
functions, the speedup vs C code is around 2-3x.
Signed-off-by: 's avatarMartin Storsjö <martin@martin.st>
parent ceb36b81
...@@ -44,6 +44,7 @@ NEON-OBJS-$(CONFIG_DCA_DECODER) += aarch64/synth_filter_neon.o ...@@ -44,6 +44,7 @@ NEON-OBJS-$(CONFIG_DCA_DECODER) += aarch64/synth_filter_neon.o
NEON-OBJS-$(CONFIG_VORBIS_DECODER) += aarch64/vorbisdsp_neon.o NEON-OBJS-$(CONFIG_VORBIS_DECODER) += aarch64/vorbisdsp_neon.o
NEON-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9itxfm_16bpp_neon.o \ NEON-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9itxfm_16bpp_neon.o \
aarch64/vp9itxfm_neon.o \ aarch64/vp9itxfm_neon.o \
aarch64/vp9lpf_16bpp_neon.o \
aarch64/vp9lpf_neon.o \ aarch64/vp9lpf_neon.o \
aarch64/vp9mc_16bpp_neon.o \ aarch64/vp9mc_16bpp_neon.o \
aarch64/vp9mc_neon.o aarch64/vp9mc_neon.o
...@@ -203,8 +203,70 @@ static av_cold void vp9dsp_itxfm_init_aarch64(VP9DSPContext *dsp) ...@@ -203,8 +203,70 @@ static av_cold void vp9dsp_itxfm_init_aarch64(VP9DSPContext *dsp)
} }
} }
#define define_loop_filter(dir, wd, size, bpp) \
void ff_vp9_loop_filter_##dir##_##wd##_##size##_##bpp##_neon(uint8_t *dst, ptrdiff_t stride, int E, int I, int H)
#define define_loop_filters(wd, size, bpp) \
define_loop_filter(h, wd, size, bpp); \
define_loop_filter(v, wd, size, bpp)
define_loop_filters(4, 8, BPP);
define_loop_filters(8, 8, BPP);
define_loop_filters(16, 8, BPP);
define_loop_filters(16, 16, BPP);
define_loop_filters(44, 16, BPP);
define_loop_filters(48, 16, BPP);
define_loop_filters(84, 16, BPP);
define_loop_filters(88, 16, BPP);
static av_cold void vp9dsp_loopfilter_init_aarch64(VP9DSPContext *dsp)
{
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags)) {
#define init_lpf_func_8(idx1, idx2, dir, wd, bpp) \
dsp->loop_filter_8[idx1][idx2] = ff_vp9_loop_filter_##dir##_##wd##_8_##bpp##_neon
#define init_lpf_func_16(idx, dir, bpp) \
dsp->loop_filter_16[idx] = ff_vp9_loop_filter_##dir##_16_16_##bpp##_neon
#define init_lpf_func_mix2(idx1, idx2, idx3, dir, wd, bpp) \
dsp->loop_filter_mix2[idx1][idx2][idx3] = ff_vp9_loop_filter_##dir##_##wd##_16_##bpp##_neon
#define init_lpf_funcs_8_wd(idx, wd, bpp) \
init_lpf_func_8(idx, 0, h, wd, bpp); \
init_lpf_func_8(idx, 1, v, wd, bpp)
#define init_lpf_funcs_16(bpp) \
init_lpf_func_16(0, h, bpp); \
init_lpf_func_16(1, v, bpp)
#define init_lpf_funcs_mix2_wd(idx1, idx2, wd, bpp) \
init_lpf_func_mix2(idx1, idx2, 0, h, wd, bpp); \
init_lpf_func_mix2(idx1, idx2, 1, v, wd, bpp)
#define init_lpf_funcs_8(bpp) \
init_lpf_funcs_8_wd(0, 4, bpp); \
init_lpf_funcs_8_wd(1, 8, bpp); \
init_lpf_funcs_8_wd(2, 16, bpp)
#define init_lpf_funcs_mix2(bpp) \
init_lpf_funcs_mix2_wd(0, 0, 44, bpp); \
init_lpf_funcs_mix2_wd(0, 1, 48, bpp); \
init_lpf_funcs_mix2_wd(1, 0, 84, bpp); \
init_lpf_funcs_mix2_wd(1, 1, 88, bpp)
init_lpf_funcs_8(BPP);
init_lpf_funcs_16(BPP);
init_lpf_funcs_mix2(BPP);
}
}
av_cold void INIT_FUNC(VP9DSPContext *dsp) av_cold void INIT_FUNC(VP9DSPContext *dsp)
{ {
vp9dsp_mc_init_aarch64(dsp); vp9dsp_mc_init_aarch64(dsp);
vp9dsp_loopfilter_init_aarch64(dsp);
vp9dsp_itxfm_init_aarch64(dsp); vp9dsp_itxfm_init_aarch64(dsp);
} }
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment