Commit d7595de0 authored by Janne Grunau

aarch64: vp9: use alternative returns in the core loop filter function

Since aarch64 has enough free general-purpose registers, use them to
branch to the appropriate storage code. 1-2 cycles faster for the
functions using loop_filter 8/16, ... on a cortex-a53. Mixed results
(up to 2 cycles faster/slower) on a cortex-a57.
parent e17567a8
...@@ -410,15 +410,19 @@ ...@@ -410,15 +410,19 @@
.endif .endif
// If no pixels needed flat8in nor flat8out, jump to a // If no pixels needed flat8in nor flat8out, jump to a
// writeout of the inner 4 pixels // writeout of the inner 4 pixels
cbz x5, 7f cbnz x5, 1f
br x14
1:
mov x5, v7.d[0] mov x5, v7.d[0]
.ifc \sz, .16b .ifc \sz, .16b
mov x6, v7.d[1] mov x6, v7.d[1]
orr x5, x5, x6 orr x5, x5, x6
.endif .endif
// If no pixels need flat8out, jump to a writeout of the inner 6 pixels // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
cbz x5, 8f cbnz x5, 1f
br x15
1:
// flat8out // flat8out
// This writes all outputs into v2-v17 (skipping v6 and v16). // This writes all outputs into v2-v17 (skipping v6 and v16).
// If this part is skipped, the output is read from v21-v26 (which is the input // If this part is skipped, the output is read from v21-v26 (which is the input
...@@ -549,35 +553,24 @@ endfunc ...@@ -549,35 +553,24 @@ endfunc
function vp9_loop_filter_8 function vp9_loop_filter_8
loop_filter 8, .8b, 0, v16, v17, v18, v19, v28, v29, v30, v31 loop_filter 8, .8b, 0, v16, v17, v18, v19, v28, v29, v30, v31
mov x5, #0
ret ret
6: 6:
mov x5, #6 br x13
ret
9: 9:
br x10 br x10
endfunc endfunc
function vp9_loop_filter_8_16b_mix function vp9_loop_filter_8_16b_mix
loop_filter 8, .16b, 88, v16, v17, v18, v19, v28, v29, v30, v31 loop_filter 8, .16b, 88, v16, v17, v18, v19, v28, v29, v30, v31
mov x5, #0
ret ret
6: 6:
mov x5, #6 br x13
ret
9: 9:
br x10 br x10
endfunc endfunc
function vp9_loop_filter_16 function vp9_loop_filter_16
loop_filter 16, .8b, 0, v8, v9, v10, v11, v12, v13, v14, v15 loop_filter 16, .8b, 0, v8, v9, v10, v11, v12, v13, v14, v15
mov x5, #0
ret
7:
mov x5, #7
ret
8:
mov x5, #8
ret ret
9: 9:
ldp d8, d9, [sp], 0x10 ldp d8, d9, [sp], 0x10
...@@ -589,13 +582,6 @@ endfunc ...@@ -589,13 +582,6 @@ endfunc
function vp9_loop_filter_16_16b function vp9_loop_filter_16_16b
loop_filter 16, .16b, 0, v8, v9, v10, v11, v12, v13, v14, v15 loop_filter 16, .16b, 0, v8, v9, v10, v11, v12, v13, v14, v15
mov x5, #0
ret
7:
mov x5, #7
ret
8:
mov x5, #8
ret ret
9: 9:
ldp d8, d9, [sp], 0x10 ldp d8, d9, [sp], 0x10
...@@ -614,11 +600,14 @@ endfunc ...@@ -614,11 +600,14 @@ endfunc
.endm .endm
.macro loop_filter_8 .macro loop_filter_8
// calculate alternative 'return' targets
adr x13, 6f
bl vp9_loop_filter_8 bl vp9_loop_filter_8
cbnz x5, 6f
.endm .endm
.macro loop_filter_8_16b_mix mix .macro loop_filter_8_16b_mix mix
// calculate alternative 'return' targets
adr x13, 6f
.if \mix == 48 .if \mix == 48
mov x11, #0xffffffff00000000 mov x11, #0xffffffff00000000
.elseif \mix == 84 .elseif \mix == 84
...@@ -627,21 +616,20 @@ endfunc ...@@ -627,21 +616,20 @@ endfunc
mov x11, #0xffffffffffffffff mov x11, #0xffffffffffffffff
.endif .endif
bl vp9_loop_filter_8_16b_mix bl vp9_loop_filter_8_16b_mix
cbnz x5, 6f
.endm .endm
.macro loop_filter_16 .macro loop_filter_16
// calculate alternative 'return' targets
adr x14, 7f
adr x15, 8f
bl vp9_loop_filter_16 bl vp9_loop_filter_16
cmp x5, 7
b.gt 8f
b.eq 7f
.endm .endm
.macro loop_filter_16_16b .macro loop_filter_16_16b
// calculate alternative 'return' targets
adr x14, 7f
adr x15, 8f
bl vp9_loop_filter_16_16b bl vp9_loop_filter_16_16b
cmp x5, 7
b.gt 8f
b.eq 7f
.endm .endm
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment