Commit 388f6e67, authored by Martin Storsjö, committed by Michael Niedermayer

arm: vp9itxfm: Skip empty slices in the first pass of idct_idct 16x16 and 32x32

This work is sponsored by, and copyright, Google.

Previously all subpartitions except the eob=1 (DC) case ran with
the same runtime:

                                     Cortex A7       A8       A9      A53
vp9_inv_dct_dct_16x16_sub16_add_neon:   3188.1   2435.4   2499.0   1969.0
vp9_inv_dct_dct_32x32_sub32_add_neon:  18531.7  16582.3  14207.6  12000.3

By skipping individual 4x16 or 4x32 pixel slices in the first pass,
we reduce the runtime of these functions like this:

vp9_inv_dct_dct_16x16_sub1_add_neon:     274.6    189.5    211.7    235.8
vp9_inv_dct_dct_16x16_sub2_add_neon:    2064.0   1534.8   1719.4   1248.7
vp9_inv_dct_dct_16x16_sub4_add_neon:    2135.0   1477.2   1736.3   1249.5
vp9_inv_dct_dct_16x16_sub8_add_neon:    2446.7   1828.7   1993.6   1494.7
vp9_inv_dct_dct_16x16_sub12_add_neon:   2832.4   2118.3   2266.5   1735.1
vp9_inv_dct_dct_16x16_sub16_add_neon:   3211.7   2475.3   2523.5   1983.1
vp9_inv_dct_dct_32x32_sub1_add_neon:     756.2    456.7    862.0    553.9
vp9_inv_dct_dct_32x32_sub2_add_neon:   10682.2   8190.4   8539.2   6762.5
vp9_inv_dct_dct_32x32_sub4_add_neon:   10813.5   8014.9   8518.3   6762.8
vp9_inv_dct_dct_32x32_sub8_add_neon:   11859.6   9313.0   9347.4   7514.5
vp9_inv_dct_dct_32x32_sub12_add_neon:  12946.6  10752.4  10192.2   8280.2
vp9_inv_dct_dct_32x32_sub16_add_neon:  14074.6  11946.5  11001.4   9008.6
vp9_inv_dct_dct_32x32_sub20_add_neon:  15269.9  13662.7  11816.1   9762.6
vp9_inv_dct_dct_32x32_sub24_add_neon:  16327.9  14940.1  12626.7  10516.0
vp9_inv_dct_dct_32x32_sub28_add_neon:  17462.7  15776.1  13446.2  11264.7
vp9_inv_dct_dct_32x32_sub32_add_neon:  18575.5  17157.0  14249.3  12015.1

I.e. in general a very minor overhead for the full subpartition case due
to the additional loads and cmps, but a significant speedup for the cases
when we only need to process a small part of the actual input data.

In common VP9 content in a few inspected clips, 70-90% of the non-dc-only
16x16 and 32x32 IDCTs only have nonzero coefficients in the upper left
8x8 or 16x16 subpartitions respectively.

This is cherrypicked from libav commit
9c8bc74c.
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
parent ecd343aa
...@@ -659,9 +659,8 @@ endfunc ...@@ -659,9 +659,8 @@ endfunc
@ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
@ transpose into a horizontal 16x4 slice and store. @ transpose into a horizontal 16x4 slice and store.
@ r0 = dst (temp buffer) @ r0 = dst (temp buffer)
@ r1 = unused @ r1 = slice offset
@ r2 = src @ r2 = src
@ r3 = slice offset
function \txfm\()16_1d_4x16_pass1_neon function \txfm\()16_1d_4x16_pass1_neon
mov r12, #32 mov r12, #32
vmov.s16 q2, #0 vmov.s16 q2, #0
...@@ -678,14 +677,14 @@ function \txfm\()16_1d_4x16_pass1_neon ...@@ -678,14 +677,14 @@ function \txfm\()16_1d_4x16_pass1_neon
transpose16_q_4x_4x4 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 transpose16_q_4x_4x4 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
@ Store the transposed 4x4 blocks horizontally. @ Store the transposed 4x4 blocks horizontally.
cmp r3, #12 cmp r1, #12
beq 1f beq 1f
.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31 .irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
vst1.16 {d\i}, [r0,:64]! vst1.16 {d\i}, [r0,:64]!
.endr .endr
bx lr bx lr
1: 1:
@ Special case: For the last input column (r3 == 12), @ Special case: For the last input column (r1 == 12),
@ which would be stored as the last row in the temp buffer, @ which would be stored as the last row in the temp buffer,
@ don't store the first 4x4 block, but keep it in registers @ don't store the first 4x4 block, but keep it in registers
@ for the first slice of the second pass (where it is the @ for the first slice of the second pass (where it is the
...@@ -781,15 +780,22 @@ endfunc ...@@ -781,15 +780,22 @@ endfunc
itxfm16_1d_funcs idct itxfm16_1d_funcs idct
itxfm16_1d_funcs iadst itxfm16_1d_funcs iadst
@ This is the minimum eob value for each subpartition, in increments of 4
const min_eob_idct_idct_16, align=4
.short 0, 10, 38, 89
endconst
.macro itxfm_func16x16 txfm1, txfm2 .macro itxfm_func16x16 txfm1, txfm2
function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1 function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
.ifc \txfm1\()_\txfm2,idct_idct .ifc \txfm1\()_\txfm2,idct_idct
cmp r3, #1 cmp r3, #1
beq idct16x16_dc_add_neon beq idct16x16_dc_add_neon
.endif .endif
push {r4-r7,lr} push {r4-r8,lr}
.ifnc \txfm1\()_\txfm2,idct_idct .ifnc \txfm1\()_\txfm2,idct_idct
vpush {q4-q7} vpush {q4-q7}
.else
movrel r8, min_eob_idct_idct_16 + 2
.endif .endif
@ Align the stack, allocate a temp buffer @ Align the stack, allocate a temp buffer
...@@ -810,10 +816,36 @@ A and r7, sp, #15 ...@@ -810,10 +816,36 @@ A and r7, sp, #15
.irp i, 0, 4, 8, 12 .irp i, 0, 4, 8, 12
add r0, sp, #(\i*32) add r0, sp, #(\i*32)
.ifc \txfm1\()_\txfm2,idct_idct
.if \i > 0
ldrh_post r1, r8, #2
cmp r3, r1
it le
movle r1, #(16 - \i)/4
ble 1f
.endif
.endif
mov r1, #\i
add r2, r6, #(\i*2) add r2, r6, #(\i*2)
mov r3, #\i
bl \txfm1\()16_1d_4x16_pass1_neon bl \txfm1\()16_1d_4x16_pass1_neon
.endr .endr
.ifc \txfm1\()_\txfm2,idct_idct
b 3f
1:
@ For all-zero slices in pass 1, set d28-d31 to zero, for the in-register
@ passthrough of coefficients to pass 2 and clear the end of the temp buffer
vmov.i16 q14, #0
vmov.i16 q15, #0
2:
subs r1, r1, #1
.rept 4
vst1.16 {q14-q15}, [r0,:128]!
.endr
bne 2b
3:
.endif
.ifc \txfm1\()_\txfm2,iadst_idct .ifc \txfm1\()_\txfm2,iadst_idct
movrel r12, idct_coeffs movrel r12, idct_coeffs
vld1.16 {q0-q1}, [r12,:128] vld1.16 {q0-q1}, [r12,:128]
...@@ -830,7 +862,7 @@ A and r7, sp, #15 ...@@ -830,7 +862,7 @@ A and r7, sp, #15
.ifnc \txfm1\()_\txfm2,idct_idct .ifnc \txfm1\()_\txfm2,idct_idct
vpop {q4-q7} vpop {q4-q7}
.endif .endif
pop {r4-r7,pc} pop {r4-r8,pc}
endfunc endfunc
.endm .endm
...@@ -1110,11 +1142,16 @@ function idct32_1d_4x32_pass2_neon ...@@ -1110,11 +1142,16 @@ function idct32_1d_4x32_pass2_neon
bx lr bx lr
endfunc endfunc
const min_eob_idct_idct_32, align=4
.short 0, 9, 34, 70, 135, 240, 336, 448
endconst
function ff_vp9_idct_idct_32x32_add_neon, export=1 function ff_vp9_idct_idct_32x32_add_neon, export=1
cmp r3, #1 cmp r3, #1
beq idct32x32_dc_add_neon beq idct32x32_dc_add_neon
push {r4-r7,lr} push {r4-r8,lr}
vpush {q4-q7} vpush {q4-q7}
movrel r8, min_eob_idct_idct_32 + 2
@ Align the stack, allocate a temp buffer @ Align the stack, allocate a temp buffer
T mov r7, sp T mov r7, sp
...@@ -1129,9 +1166,29 @@ A and r7, sp, #15 ...@@ -1129,9 +1166,29 @@ A and r7, sp, #15
.irp i, 0, 4, 8, 12, 16, 20, 24, 28 .irp i, 0, 4, 8, 12, 16, 20, 24, 28
add r0, sp, #(\i*64) add r0, sp, #(\i*64)
.if \i > 0
ldrh_post r1, r8, #2
cmp r3, r1
it le
movle r1, #(32 - \i)/2
ble 1f
.endif
add r2, r6, #(\i*2) add r2, r6, #(\i*2)
bl idct32_1d_4x32_pass1_neon bl idct32_1d_4x32_pass1_neon
.endr .endr
b 3f
1:
@ Write zeros to the temp buffer for pass 2
vmov.i16 q14, #0
vmov.i16 q15, #0
2:
subs r1, r1, #1
.rept 4
vst1.16 {q14-q15}, [r0,:128]!
.endr
bne 2b
3:
.irp i, 0, 4, 8, 12, 16, 20, 24, 28 .irp i, 0, 4, 8, 12, 16, 20, 24, 28
add r0, r4, #(\i) add r0, r4, #(\i)
mov r1, r5 mov r1, r5
...@@ -1141,5 +1198,5 @@ A and r7, sp, #15 ...@@ -1141,5 +1198,5 @@ A and r7, sp, #15
add sp, sp, r7 add sp, sp, r7
vpop {q4-q7} vpop {q4-q7}
pop {r4-r7,pc} pop {r4-r8,pc}
endfunc endfunc
...@@ -334,8 +334,10 @@ static void check_itxfm(void) ...@@ -334,8 +334,10 @@ static void check_itxfm(void)
// skip testing sub-IDCTs for WHT or ADST since they don't // skip testing sub-IDCTs for WHT or ADST since they don't
// implement it in any of the SIMD functions. If they do, // implement it in any of the SIMD functions. If they do,
// consider changing this to ensure we have complete test // consider changing this to ensure we have complete test
// coverage // coverage. Test sub=1 for dc-only, then 2, 4, 8, 12, etc,
for (sub = (txtp == 0 && tx < 4) ? 1 : sz; sub <= sz; sub <<= 1) { // since the arm version can distinguish them at that level.
for (sub = (txtp == 0 && tx < 4) ? 1 : sz; sub <= sz;
sub < 4 ? (sub <<= 1) : (sub += 4)) {
if (check_func(dsp.itxfm_add[tx][txtp], if (check_func(dsp.itxfm_add[tx][txtp],
"vp9_inv_%s_%dx%d_sub%d_add_%d", "vp9_inv_%s_%dx%d_sub%d_add_%d",
tx == 4 ? "wht_wht" : txtp_types[txtp], tx == 4 ? "wht_wht" : txtp_types[txtp],
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment