Commit 28a8b541 authored by Janne Grunau's avatar Janne Grunau

h264/aarch64: add intra loop filter neon asm

Add my neon asm from x264 relicensed under the LGPL 2.1 or later. Ported
(x264 uses nv12 chroma) and optimized.

Cycle count for checkasm --bench on a Snapdragon 820e:
h264_h_loop_filter_luma_intra_8bpp_c: 60.0
h264_h_loop_filter_luma_intra_8bpp_neon: 54.2
h264_v_loop_filter_luma_intra_8bpp_c: 148.3
h264_v_loop_filter_luma_intra_8bpp_neon: 73.8
h264_h_loop_filter_chroma_intra_8bpp_c: 27.8
h264_h_loop_filter_chroma_intra_8bpp_neon: 21.4
h264_h_loop_filter_chroma_mbaff_intra_8bpp_c: 15.8
h264_h_loop_filter_chroma_mbaff_intra_8bpp_neon: 15.7
h264_v_loop_filter_chroma_intra_8bpp_c: 45.8
h264_v_loop_filter_chroma_intra_8bpp_neon: 17.3
parent 846c3d6a
...@@ -29,10 +29,20 @@ void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha, ...@@ -29,10 +29,20 @@ void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
int beta, int8_t *tc0); int beta, int8_t *tc0);
void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha, void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
int beta, int8_t *tc0); int beta, int8_t *tc0);
void ff_h264_v_loop_filter_luma_intra_neon(uint8_t *pix, int stride, int alpha,
int beta);
void ff_h264_h_loop_filter_luma_intra_neon(uint8_t *pix, int stride, int alpha,
int beta);
void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
int beta, int8_t *tc0); int beta, int8_t *tc0);
void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
int beta, int8_t *tc0); int beta, int8_t *tc0);
void ff_h264_v_loop_filter_chroma_intra_neon(uint8_t *pix, int stride,
int alpha, int beta);
void ff_h264_h_loop_filter_chroma_intra_neon(uint8_t *pix, int stride,
int alpha, int beta);
void ff_h264_h_loop_filter_chroma_mbaff_intra_neon(uint8_t *pix, int stride,
int alpha, int beta);
void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height, void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height,
int log2_den, int weight, int offset); int log2_den, int weight, int offset);
...@@ -77,8 +87,14 @@ av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth, ...@@ -77,8 +87,14 @@ av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
if (have_neon(cpu_flags) && bit_depth == 8) { if (have_neon(cpu_flags) && bit_depth == 8) {
c->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_neon; c->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_neon;
c->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_neon; c->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_neon;
c->h264_v_loop_filter_luma_intra= ff_h264_v_loop_filter_luma_intra_neon;
c->h264_h_loop_filter_luma_intra= ff_h264_h_loop_filter_luma_intra_neon;
c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon; c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon; c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
c->h264_v_loop_filter_chroma_intra = ff_h264_v_loop_filter_chroma_intra_neon;
c->h264_h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma_intra_neon;
c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_mbaff_intra_neon;
c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon; c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon;
c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon; c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon;
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment