Commit 52c9b0a6 authored by Martin Storsjö

aarch64: vp8: Port vp8_luma_dc_wht and vp8_idct_dc_add4uv from arm version

                     Cortex A53    A72    A73
vp8_luma_dc_wht_c:        115.7   75.7   90.7
vp8_luma_dc_wht_neon:      60.7   41.2   45.7
vp8_idct_dc_add4uv_c:     376.1  262.9  282.5
vp8_idct_dc_add4uv_neon:   52.0   29.0   37.0
Signed-off-by: Martin Storsjö <martin@martin.st>
parent c513fcd7
...@@ -28,6 +28,7 @@ void ff_vp8_luma_dc_wht_neon(int16_t block[4][4][16], int16_t dc[16]); ...@@ -28,6 +28,7 @@ void ff_vp8_luma_dc_wht_neon(int16_t block[4][4][16], int16_t dc[16]);
void ff_vp8_idct_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride); void ff_vp8_idct_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
void ff_vp8_idct_dc_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride); void ff_vp8_idct_dc_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
void ff_vp8_idct_dc_add4y_neon(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride); void ff_vp8_idct_dc_add4y_neon(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
void ff_vp8_idct_dc_add4uv_neon(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
VP8_LF(neon); VP8_LF(neon);
...@@ -55,10 +56,12 @@ av_cold void ff_vp8dsp_init_aarch64(VP8DSPContext *dsp) ...@@ -55,10 +56,12 @@ av_cold void ff_vp8dsp_init_aarch64(VP8DSPContext *dsp)
{ {
if (!have_neon(av_get_cpu_flags())) if (!have_neon(av_get_cpu_flags()))
return; return;
dsp->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_neon;
dsp->vp8_idct_add = ff_vp8_idct_add_neon; dsp->vp8_idct_add = ff_vp8_idct_add_neon;
dsp->vp8_idct_dc_add = ff_vp8_idct_dc_add_neon; dsp->vp8_idct_dc_add = ff_vp8_idct_dc_add_neon;
dsp->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_neon; dsp->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_neon;
dsp->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_neon;
dsp->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_neon; dsp->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_neon;
dsp->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_neon; dsp->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_neon;
......
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
* Copyright (c) 2010 Rob Clark <rob@ti.com> * Copyright (c) 2010 Rob Clark <rob@ti.com>
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com> * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
* Copyright (c) 2018 Magnus Röös <mla2.roos@gmail.com> * Copyright (c) 2018 Magnus Röös <mla2.roos@gmail.com>
* Copyright (c) 2019 Martin Storsjo <martin@martin.st>
* *
* This file is part of Libav. * This file is part of Libav.
* *
...@@ -25,6 +26,62 @@ ...@@ -25,6 +26,62 @@
#include "libavutil/aarch64/asm.S" #include "libavutil/aarch64/asm.S"
#include "neon.S" #include "neon.S"
// void ff_vp8_luma_dc_wht_neon(int16_t block[4][4][16], int16_t dc[16])
//
// Runs a two-pass 4x4 add/subtract butterfly over the 16 values in dc[],
// adds 3 and arithmetic-shifts right by 3, then scatters the 16 results to
// the first int16 of each of 16 consecutive 16-coefficient blocks. dc[] is
// overwritten with zeros.
//
// In:    x0 = block (written at byte offsets 0, 32, 64, ..., 480)
//        x1 = dc    (16 x int16, read and then cleared)
// Clobb: x0, x1, x3, v0-v7, v16, v30 (plus whatever transpose_4x4H uses
//        beyond the registers passed to it - macro body not visible here)
function ff_vp8_luma_dc_wht_neon, export=1
// v0..v3 = the four consecutive groups of 4 halfwords of dc[] (32 bytes).
ld1 {v0.4h - v3.4h}, [x1]
// v30 = 0, used below to clear dc[].
movi v30.8h, #0
// First pass: butterfly element-wise across v0..v3. The two stores that
// zero dc[] are interleaved with the arithmetic; the input is already in
// registers at this point.
add v4.4h, v0.4h, v3.4h
add v6.4h, v1.4h, v2.4h
// Clear dc[0..7]; x1 advances by 16 bytes.
st1 {v30.8h}, [x1], #16
sub v7.4h, v1.4h, v2.4h
sub v5.4h, v0.4h, v3.4h
// Clear dc[8..15].
st1 {v30.8h}, [x1]
add v0.4h, v4.4h, v6.4h
add v1.4h, v5.4h, v7.4h
sub v2.4h, v4.4h, v6.4h
sub v3.4h, v5.4h, v7.4h
// Rounding bias applied before the final >> 3.
movi v16.4h, #3
// Transpose the 4x4 matrix held in v0..v3 so the second pass works along
// the other dimension. NOTE(review): v4..v7 appear to be passed as scratch
// registers; the macro is presumably defined in neon.S - confirm.
transpose_4x4H v0, v1, v2, v3, v4, v5, v6, v7
// v0 enters every output of the butterfly below with a positive sign
// (through v4 and v5), so biasing v0 alone adds 3 to all 16 results.
add v0.4h, v0.4h, v16.4h
// Second pass: same butterfly as above.
add v4.4h, v0.4h, v3.4h
add v6.4h, v1.4h, v2.4h
sub v7.4h, v1.4h, v2.4h
sub v5.4h, v0.4h, v3.4h
add v0.4h, v4.4h, v6.4h
add v1.4h, v5.4h, v7.4h
sub v2.4h, v4.4h, v6.4h
sub v3.4h, v5.4h, v7.4h
// Signed (arithmetic) shift right by 3 completes (x + 3) >> 3.
sshr v0.4h, v0.4h, #3
sshr v1.4h, v1.4h, #3
sshr v2.4h, v2.4h, #3
sshr v3.4h, v3.4h, #3
// Output stride: 32 bytes = 16 int16, i.e. one innermost [16] array of block.
mov x3, #32
// Scatter one halfword per store, post-incrementing x0 by 32 each time.
// The k-th store (k = 0..15) writes lane (k / 4) of register v(k % 4).
st1 {v0.h}[0], [x0], x3
st1 {v1.h}[0], [x0], x3
st1 {v2.h}[0], [x0], x3
st1 {v3.h}[0], [x0], x3
st1 {v0.h}[1], [x0], x3
st1 {v1.h}[1], [x0], x3
st1 {v2.h}[1], [x0], x3
st1 {v3.h}[1], [x0], x3
st1 {v0.h}[2], [x0], x3
st1 {v1.h}[2], [x0], x3
st1 {v2.h}[2], [x0], x3
st1 {v3.h}[2], [x0], x3
st1 {v0.h}[3], [x0], x3
st1 {v1.h}[3], [x0], x3
st1 {v2.h}[3], [x0], x3
st1 {v3.h}[3], [x0], x3
ret
endfunc
function ff_vp8_idct_add_neon, export=1 function ff_vp8_idct_add_neon, export=1
ld1 {v0.8b - v3.8b}, [x1] ld1 {v0.8b - v3.8b}, [x1]
mov w4, #20091 mov w4, #20091
...@@ -102,6 +159,58 @@ function ff_vp8_idct_add_neon, export=1 ...@@ -102,6 +159,58 @@ function ff_vp8_idct_add_neon, export=1
ret ret
endfunc endfunc
// void ff_vp8_idct_dc_add4uv_neon(uint8_t *dst, int16_t block[4][16],
//                                 ptrdiff_t stride)
//
// Adds the rounded DC value of each of four coefficient blocks to one 4x4
// quadrant of an 8x8 region of 8-bit pixels at dst, saturating to 0..255.
// The first coefficient of each block is overwritten with zero.
//
// Quadrant layout: block 0 -> rows 0-3, columns 0-3; block 1 -> rows 0-3,
// columns 4-7; block 2 -> rows 4-7, columns 0-3; block 3 -> rows 4-7,
// columns 4-7.
//
// In:    x0 = dst    (8 rows of 8 bytes, rows `stride` bytes apart)
//        x1 = block  (4 blocks of 16 x int16; only element 0 of each is used)
//        x2 = stride (bytes between rows of dst)
// Clobb: x0, x1, x3, v0-v7, v16-v27
function ff_vp8_idct_dc_add4uv_neon, export=1
// v0 = 0: source of the zero halfword written back over each DC.
movi v0.4h, #0
// Byte distance between consecutive blocks: 16 coefficients * 2 bytes.
mov x3, #32
// For each of the four blocks: broadcast its first coefficient to four
// halfword lanes, then clear that coefficient in memory and step x1 to the
// next block.
ld1r {v16.4h}, [x1]
st1 {v0.h}[0], [x1], x3
ld1r {v17.4h}, [x1]
st1 {v0.h}[0], [x1], x3
ld1r {v18.4h}, [x1]
st1 {v0.h}[0], [x1], x3
ld1r {v19.4h}, [x1]
st1 {v0.h}[0], [x1], x3
// Pack pairs: v16 = { dc0 x4, dc1 x4 } (rows 0-3),
//             v18 = { dc2 x4, dc3 x4 } (rows 4-7).
ins v16.d[1], v17.d[0]
ins v18.d[1], v19.d[0]
// x3 is reused as the store pointer; x0 is advanced by the loads below.
mov x3, x0
// srshr is a rounding shift, so each lane becomes (dc + 4) >> 3.
srshr v16.8h, v16.8h, #3 // dc >>= 3
// From here the row loads, widening adds, saturating narrows and row stores
// are interleaved (NOTE(review): presumably to hide load/ALU latency - the
// order is kept exactly as written). v0 is reused first as a pixel row and
// then as a 16-bit sum.
ld1 {v0.8b}, [x0], x2
srshr v18.8h, v18.8h, #3
ld1 {v1.8b}, [x0], x2
// uaddw zero-extends the 8 pixel bytes to 16 bits and adds them to the
// per-column DC lanes. Rows 0-3 use v16.
uaddw v20.8h, v16.8h, v0.8b
ld1 {v2.8b}, [x0], x2
uaddw v0.8h, v16.8h, v1.8b
ld1 {v3.8b}, [x0], x2
uaddw v22.8h, v16.8h, v2.8b
ld1 {v4.8b}, [x0], x2
uaddw v2.8h, v16.8h, v3.8b
ld1 {v5.8b}, [x0], x2
// Rows 4-7 use v18.
uaddw v24.8h, v18.8h, v4.8b
ld1 {v6.8b}, [x0], x2
uaddw v4.8h, v18.8h, v5.8b
ld1 {v7.8b}, [x0], x2
uaddw v26.8h, v18.8h, v6.8b
// sqxtun narrows each signed 16-bit sum to an unsigned byte with saturation
// (negative -> 0, above 255 -> 255). Results for rows 0-7 land in v20-v27.
sqxtun v20.8b, v20.8h
uaddw v6.8h, v18.8h, v7.8b
sqxtun v21.8b, v0.8h
sqxtun v22.8b, v22.8h
// Write the rows back in order through x3, one `stride` apart.
st1 {v20.8b}, [x3], x2
sqxtun v23.8b, v2.8h
st1 {v21.8b}, [x3], x2
sqxtun v24.8b, v24.8h
st1 {v22.8b}, [x3], x2
sqxtun v25.8b, v4.8h
st1 {v23.8b}, [x3], x2
sqxtun v26.8b, v26.8h
st1 {v24.8b}, [x3], x2
sqxtun v27.8b, v6.8h
st1 {v25.8b}, [x3], x2
st1 {v26.8b}, [x3], x2
st1 {v27.8b}, [x3], x2
ret
endfunc
function ff_vp8_idct_dc_add4y_neon, export=1 function ff_vp8_idct_dc_add4y_neon, export=1
movi v0.16b, #0 movi v0.16b, #0
mov x3, #32 mov x3, #32
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment