Commit a76bf8cf authored by Martin Storsjö

arm: vp9itxfm: Optimize 16x16 and 32x32 idct dc by unrolling

This work is sponsored by, and copyright, Google.

Before:                            Cortex A7      A8      A9     A53
vp9_inv_dct_dct_16x16_sub1_add_neon:   273.0   189.5   211.7   235.8
vp9_inv_dct_dct_32x32_sub1_add_neon:   752.0   459.2   862.2   553.9
After:
vp9_inv_dct_dct_16x16_sub1_add_neon:   226.5   145.0   225.1   171.8
vp9_inv_dct_dct_32x32_sub1_add_neon:   721.2   415.7   727.6   475.0
Signed-off-by: Martin Storsjö <martin@martin.st>
parent 388e0d25
...@@ -542,16 +542,23 @@ function idct16x16_dc_add_neon ...@@ -542,16 +542,23 @@ function idct16x16_dc_add_neon
vrshr.s16 q8, q8, #6 vrshr.s16 q8, q8, #6
mov r3, r0
mov r12, #16 mov r12, #16
1: 1:
@ Loop to add the constant from q8 into all 16x16 outputs @ Loop to add the constant from q8 into all 16x16 outputs
vld1.8 {q3}, [r0,:128] subs r12, r12, #2
vaddw.u8 q10, q8, d6 vld1.8 {q2}, [r0,:128], r1
vaddw.u8 q11, q8, d7 vaddw.u8 q10, q8, d4
vqmovun.s16 d6, q10 vld1.8 {q3}, [r0,:128], r1
vqmovun.s16 d7, q11 vaddw.u8 q11, q8, d5
vst1.8 {q3}, [r0,:128], r1 vaddw.u8 q12, q8, d6
subs r12, r12, #1 vaddw.u8 q13, q8, d7
vqmovun.s16 d4, q10
vqmovun.s16 d5, q11
vqmovun.s16 d6, q12
vst1.8 {q2}, [r3,:128], r1
vqmovun.s16 d7, q13
vst1.8 {q3}, [r3,:128], r1
bne 1b bne 1b
bx lr bx lr
...@@ -1147,20 +1154,31 @@ function idct32x32_dc_add_neon ...@@ -1147,20 +1154,31 @@ function idct32x32_dc_add_neon
vrshr.s16 q8, q8, #6 vrshr.s16 q8, q8, #6
mov r3, r0
mov r12, #32 mov r12, #32
1: 1:
@ Loop to add the constant from q8 into all 32x32 outputs @ Loop to add the constant from q8 into all 32x32 outputs
vld1.8 {q2-q3}, [r0,:128] subs r12, r12, #2
vaddw.u8 q10, q8, d4 vld1.8 {q0-q1}, [r0,:128], r1
vaddw.u8 q11, q8, d5 vaddw.u8 q9, q8, d0
vaddw.u8 q12, q8, d6 vaddw.u8 q10, q8, d1
vaddw.u8 q13, q8, d7 vld1.8 {q2-q3}, [r0,:128], r1
vqmovun.s16 d4, q10 vaddw.u8 q11, q8, d2
vqmovun.s16 d5, q11 vaddw.u8 q12, q8, d3
vqmovun.s16 d6, q12 vaddw.u8 q13, q8, d4
vqmovun.s16 d7, q13 vaddw.u8 q14, q8, d5
vst1.8 {q2-q3}, [r0,:128], r1 vaddw.u8 q15, q8, d6
subs r12, r12, #1 vqmovun.s16 d0, q9
vaddw.u8 q9, q8, d7
vqmovun.s16 d1, q10
vqmovun.s16 d2, q11
vqmovun.s16 d3, q12
vqmovun.s16 d4, q13
vqmovun.s16 d5, q14
vst1.8 {q0-q1}, [r3,:128], r1
vqmovun.s16 d6, q15
vqmovun.s16 d7, q9
vst1.8 {q2-q3}, [r3,:128], r1
bne 1b bne 1b
bx lr bx lr
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment