Commit a76bf8cf authored by Martin Storsjö

arm: vp9itxfm: Optimize 16x16 and 32x32 idct dc by unrolling

This work is sponsored by, and copyright, Google.

Before:                            Cortex A7      A8      A9     A53
vp9_inv_dct_dct_16x16_sub1_add_neon:   273.0   189.5   211.7   235.8
vp9_inv_dct_dct_32x32_sub1_add_neon:   752.0   459.2   862.2   553.9
After:
vp9_inv_dct_dct_16x16_sub1_add_neon:   226.5   145.0   225.1   171.8
vp9_inv_dct_dct_32x32_sub1_add_neon:   721.2   415.7   727.6   475.0
Signed-off-by: Martin Storsjö <martin@martin.st>
parent 388e0d25
...@@ -542,16 +542,23 @@ function idct16x16_dc_add_neon ...@@ -542,16 +542,23 @@ function idct16x16_dc_add_neon
vrshr.s16 q8, q8, #6 vrshr.s16 q8, q8, #6
mov r3, r0
mov r12, #16 mov r12, #16
1: 1:
@ Loop to add the constant from q8 into all 16x16 outputs @ Loop to add the constant from q8 into all 16x16 outputs
vld1.8 {q3}, [r0,:128] subs r12, r12, #2
vaddw.u8 q10, q8, d6 vld1.8 {q2}, [r0,:128], r1
vaddw.u8 q11, q8, d7 vaddw.u8 q10, q8, d4
vqmovun.s16 d6, q10 vld1.8 {q3}, [r0,:128], r1
vqmovun.s16 d7, q11 vaddw.u8 q11, q8, d5
vst1.8 {q3}, [r0,:128], r1 vaddw.u8 q12, q8, d6
subs r12, r12, #1 vaddw.u8 q13, q8, d7
vqmovun.s16 d4, q10
vqmovun.s16 d5, q11
vqmovun.s16 d6, q12
vst1.8 {q2}, [r3,:128], r1
vqmovun.s16 d7, q13
vst1.8 {q3}, [r3,:128], r1
bne 1b bne 1b
bx lr bx lr
...@@ -1147,20 +1154,31 @@ function idct32x32_dc_add_neon ...@@ -1147,20 +1154,31 @@ function idct32x32_dc_add_neon
vrshr.s16 q8, q8, #6 vrshr.s16 q8, q8, #6
mov r3, r0
mov r12, #32 mov r12, #32
1: 1:
@ Loop to add the constant from q8 into all 32x32 outputs @ Loop to add the constant from q8 into all 32x32 outputs
vld1.8 {q2-q3}, [r0,:128] subs r12, r12, #2
vaddw.u8 q10, q8, d4 vld1.8 {q0-q1}, [r0,:128], r1
vaddw.u8 q11, q8, d5 vaddw.u8 q9, q8, d0
vaddw.u8 q12, q8, d6 vaddw.u8 q10, q8, d1
vaddw.u8 q13, q8, d7 vld1.8 {q2-q3}, [r0,:128], r1
vqmovun.s16 d4, q10 vaddw.u8 q11, q8, d2
vqmovun.s16 d5, q11 vaddw.u8 q12, q8, d3
vqmovun.s16 d6, q12 vaddw.u8 q13, q8, d4
vqmovun.s16 d7, q13 vaddw.u8 q14, q8, d5
vst1.8 {q2-q3}, [r0,:128], r1 vaddw.u8 q15, q8, d6
subs r12, r12, #1 vqmovun.s16 d0, q9
vaddw.u8 q9, q8, d7
vqmovun.s16 d1, q10
vqmovun.s16 d2, q11
vqmovun.s16 d3, q12
vqmovun.s16 d4, q13
vqmovun.s16 d5, q14
vst1.8 {q0-q1}, [r3,:128], r1
vqmovun.s16 d6, q15
vqmovun.s16 d7, q9
vst1.8 {q2-q3}, [r3,:128], r1
bne 1b bne 1b
bx lr bx lr
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment