Commit 0f2705e6 authored by Martin Storsjö

aarch64: vp9itxfm16: Make the larger core transforms standalone functions

This work is sponsored by, and copyright, Google.

This reduces the code size of libavcodec/aarch64/vp9itxfm_16bpp_neon.o from
26288 to 21512 bytes.

This gives a small slowdown of a couple of tens of cycles, but makes
it more feasible to add more optimized versions of these transforms.

Before:
vp9_inv_dct_dct_16x16_sub4_add_10_neon:    1887.4
vp9_inv_dct_dct_16x16_sub16_add_10_neon:   2801.5
vp9_inv_dct_dct_32x32_sub4_add_10_neon:    9691.4
vp9_inv_dct_dct_32x32_sub32_add_10_neon:  16154.9

After:
vp9_inv_dct_dct_16x16_sub4_add_10_neon:    1899.5
vp9_inv_dct_dct_16x16_sub16_add_10_neon:   2827.2
vp9_inv_dct_dct_32x32_sub4_add_10_neon:    9714.7
vp9_inv_dct_dct_32x32_sub32_add_10_neon:  16175.9
Signed-off-by: Martin Storsjö <martin@martin.st>
parent 0ea60320
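The mechanical change is the same throughout the patch: each transform that used to be a `.macro`, expanded inline at every use site, becomes a standalone `function`/`endfunc` pair (FFmpeg's asm.S helpers), and each user reaches it with `bl` instead of an inline expansion. Because `bl` overwrites the link register x30, the calling functions first stash their own return address in the scratch register x14 and return with `br x14`. A minimal sketch of the pattern, with hypothetical names (illustrative only, not code from the patch):

// Before: one copy of the body per expansion site.
.macro transform
        // ... transform body ...
.endm

// After: a single shared copy, ending in an ordinary return.
function transform
        // ... transform body ...
        ret
endfunc

function some_pass_neon
        mov     x14, x30        // keep our return address; bl clobbers x30
        bl      transform       // one call instead of an inline expansion
        br      x14             // return through the saved address
endfunc

This trades a bl/ret round trip per transform (the couple of tens of cycles noted above) for keeping a single copy of each large transform body, which is where the roughly 4.7 KB of object-size reduction comes from.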
@@ -710,7 +710,7 @@ function idct16x16_dc_add_neon
         ret
 endfunc
 
-.macro idct16
+function idct16
         dmbutterfly0    v16, v24, v16, v24, v4, v5, v6, v7, v8, v9 // v16 = t0a, v24 = t1a
         dmbutterfly     v20, v28, v0.s[2], v0.s[3], v4, v5, v6, v7 // v20 = t2a, v28 = t3a
         dmbutterfly     v18, v30, v1.s[0], v1.s[1], v4, v5, v6, v7 // v18 = t4a, v30 = t7a
@@ -753,9 +753,10 @@ endfunc
         butterfly_4s    v19, v28, v5, v28  // v19 = out[3],  v28 = out[12]
         butterfly_4s    v20, v27, v6, v27  // v20 = out[4],  v27 = out[11]
         butterfly_4s    v21, v26, v26, v9  // v21 = out[5],  v26 = out[10]
-.endm
+        ret
+endfunc
 
-.macro iadst16
+function iadst16
         ld1             {v0.8h,v1.8h}, [x11]
         sxtl            v2.4s, v1.4h
         sxtl2           v3.4s, v1.8h
@@ -830,7 +831,8 @@ endfunc
         mov             v16.16b, v2.16b
         mov             v30.16b, v4.16b
-.endm
+        ret
+endfunc
 
 // Helper macros; we can't use these expressions directly within
 // e.g. .irp due to the extra concatenation \(). Therefore wrap
@@ -857,12 +859,14 @@ endfunc
 // x9 = input stride
 .macro itxfm16_1d_funcs txfm
 function \txfm\()16_1d_4x16_pass1_neon
+        mov             x14, x30
+
         movi            v4.4s, #0
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         load_clear      \i, x2, x9
 .endr
 
-        \txfm\()16
+        bl              \txfm\()16
 
 // Do four 4x4 transposes. Originally, v16-v31 contain the
 // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
@@ -878,7 +882,7 @@ function \txfm\()16_1d_4x16_pass1_neon
 .irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
         store           \i, x0, #16
 .endr
-        ret
+        br              x14
 1:
 // Special case: For the last input column (x1 == 12),
 // which would be stored as the last row in the temp buffer,
@@ -906,7 +910,7 @@ function \txfm\()16_1d_4x16_pass1_neon
         mov             v29.16b, v17.16b
         mov             v30.16b, v18.16b
         mov             v31.16b, v19.16b
-        ret
+        br              x14
 endfunc
 
 // Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
@@ -917,6 +921,8 @@ endfunc
 // x3 = slice offset
 // x9 = temp buffer stride
 function \txfm\()16_1d_4x16_pass2_neon
+        mov             x14, x30
+
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
         load            \i, x2, x9
 .endr
@@ -928,7 +934,7 @@ function \txfm\()16_1d_4x16_pass2_neon
         add             x3, x0, x1
         lsl             x1, x1, #1
 
-        \txfm\()16
+        bl              \txfm\()16
 
         dup             v8.8h, w13
 .macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7
@@ -983,7 +989,7 @@ function \txfm\()16_1d_4x16_pass2_neon
         load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
 .purgem load_add_store
 
-        ret
+        br              x14
 endfunc
 .endm
@@ -1158,7 +1164,7 @@ function idct32x32_dc_add_neon
         ret
 endfunc
 
-.macro idct32_odd
+function idct32_odd
         dmbutterfly     v16, v31, v10.s[0], v10.s[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
         dmbutterfly     v24, v23, v10.s[2], v10.s[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
         dmbutterfly     v20, v27, v11.s[0], v11.s[1], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
@@ -1209,7 +1215,8 @@ endfunc
         dmbutterfly0    v26, v21, v26, v21, v4, v5, v6, v7, v8, v9 // v26 = t26a, v21 = t21a
         dmbutterfly0    v25, v22, v25, v22, v4, v5, v6, v7, v8, v9 // v25 = t25,  v22 = t22
         dmbutterfly0    v24, v23, v24, v23, v4, v5, v6, v7, v8, v9 // v24 = t24a, v23 = t23a
-.endm
+        ret
+endfunc
 
 // Do an 32-point IDCT of a 4x32 slice out of a 32x32 matrix.
 // The 32-point IDCT can be decomposed into two 16-point IDCTs;
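For context on the decomposition the comment above begins to describe: the even-indexed input coefficients go through a regular 16-point IDCT (the idct16 function above), the odd-indexed ones through idct32_odd, and the two halves are then recombined with butterflies. Schematically, with e[] the even-half output and o[] the odd-half output (indexing simplified here; the code's register layout differs):

    out[n]      = e[n] + o[n]
    out[31 - n] = e[n] - o[n]        for n = 0, 1, ..., 15

This is why the pass functions below call bl idct16 and then bl idct32_odd on the same 4x32 slice before combining the results.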
@@ -1221,6 +1228,8 @@ endfunc
 // x2 = src
 // x9 = double input stride
 function idct32_1d_4x32_pass1_neon
+        mov             x14, x30
+
         movi            v4.4s, #0
 
 // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
@@ -1229,7 +1238,7 @@ function idct32_1d_4x32_pass1_neon
         st1             {v4.4s}, [x2], x9
 .endr
 
-        idct16
+        bl              idct16
 
 // Do four 4x4 transposes. Originally, v16-v31 contain the
 // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
@@ -1280,7 +1289,7 @@ function idct32_1d_4x32_pass1_neon
         st1             {v4.4s}, [x2], x9
 .endr
 
-        idct32_odd
+        bl              idct32_odd
 
         transpose_4x4s  v31, v30, v29, v28, v4, v5, v6, v7
         transpose_4x4s  v27, v26, v25, v24, v4, v5, v6, v7
@@ -1330,7 +1339,7 @@ function idct32_1d_4x32_pass1_neon
         store_rev       v29.4s, v25.4s, v21.4s, v17.4s, v29.16b, v25.16b
         store_rev       v28.4s, v24.4s, v20.4s, v16.4s, v28.16b, v24.16b
 .purgem store_rev
-        ret
+        br              x14
 endfunc
 
 // This is mostly the same as 4x32_pass1, but without the transpose,
@@ -1342,13 +1351,15 @@ endfunc
 // x7 = negative double temp buffer stride
 // x9 = double temp buffer stride
 function idct32_1d_4x32_pass2_neon
+        mov             x14, x30
+
 // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         ld1             {v\i\().4s}, [x2], x9
 .endr
         sub             x2, x2, x9, lsl #4
 
-        idct16
+        bl              idct16
 
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         st1             {v\i\().4s}, [x2], x9
@@ -1364,7 +1375,7 @@ function idct32_1d_4x32_pass2_neon
         sub             x2, x2, x9, lsl #4
         sub             x2, x2, #128
 
-        idct32_odd
+        bl              idct32_odd
 
 .macro load_acc_store a, b, c, d, neg=0
 .if \neg == 0
@@ -1420,7 +1431,7 @@ function idct32_1d_4x32_pass2_neon
         load_acc_store  v24.4s, v25.4s, v26.4s, v27.4s, 1
         load_acc_store  v28.4s, v29.4s, v30.4s, v31.4s, 1
 .purgem load_acc_store
-        ret
+        br              x14
 endfunc
 
 const min_eob_idct_idct_32, align=4
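A note on the load/store/load_clear helpers invoked inside the .irp blocks above: as the "Helper macros" comment earlier in the diff says, the expressions with the extra \() concatenation are wrapped in one-line macros rather than written directly in the .irp body. A sketch of their likely shape, reconstructed from how they are invoked (the actual definitions sit outside the shown hunks and may differ):

.macro load i, src, inc
        ld1             {v\i\().4s}, [\src], \inc       // load one 4x32-bit row
.endm

.irp i, 16, 17, 18, 19
        load            \i, x2, x9      // expands to: ld1 {v16.4s}, [x2], x9  etc.
.endr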