Commit 1846ddf0 authored by Mans Rullgard

ARM: fix overreads in neon h264 chroma mc

The loops were reading ahead one line, which could end up outside the
buffer for reference blocks at the edge of the picture.  Removing
this readahead has no measurable performance impact.
Signed-off-by: Mans Rullgard <mans@mansr.com>
parent 2f41eaa9
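
The pattern being fixed is easiest to see in scalar form. Below is a rough C sketch of the 8-pixel-wide bilinear chroma interpolation these loops implement (illustration only: the function name, signature and weight names are invented here, and the rounding shown is the H.264 variant; the rv40 flavour rounds differently). It makes explicit why h output rows only ever need source rows 0..h, whereas the old software-pipelined loops also touched the line after that.

#include <stddef.h>
#include <stdint.h>

/* Rough C model of the 8-wide chroma MC inner loop (not the NEON code).
 * a, b, c, d are the four bilinear weights, summing to 64; h is even,
 * since the assembly processes two output rows per iteration. */
static void chroma_mc8_model(uint8_t *dst, const uint8_t *src,
                             ptrdiff_t stride, int h,
                             int a, int b, int c, int d)
{
    for (int y = 0; y < h; y += 2) {
        /* Two output rows need source rows y, y+1 and y+2, so h output
         * rows read exactly rows 0..h of the source.  The pre-fix NEON
         * loops also loaded the next iteration's source line here, which
         * on the last iteration is row h+1: one line past the block, and
         * potentially outside the picture buffer for references at the
         * bottom edge.  The fix reloads inside the loop instead. */
        const uint8_t *r0 = src + (y + 0) * stride;
        const uint8_t *r1 = src + (y + 1) * stride;
        const uint8_t *r2 = src + (y + 2) * stride;

        for (int x = 0; x < 8; x++) {
            dst[x]          = (a * r0[x] + b * r0[x + 1] +
                               c * r1[x] + d * r1[x + 1] + 32) >> 6;
            dst[stride + x] = (a * r1[x] + b * r1[x + 1] +
                               c * r2[x] + d * r2[x + 1] + 32) >> 6;
        }
        dst += 2 * stride;
    }
}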
@@ -51,24 +51,20 @@ T       cmp             r7,  #0
         beq             2f
-        add             r5,  r1,  r2
         vdup.8          d0,  r4
-        lsl             r4,  r2,  #1
         vdup.8          d1,  r12
-        vld1.8          {d4, d5}, [r1], r4
+        vld1.8          {d4, d5}, [r1], r2
         vdup.8          d2,  r6
-        vld1.8          {d6, d7}, [r5], r4
         vdup.8          d3,  r7
         vext.8          d5,  d4,  d5,  #1
-        vext.8          d7,  d6,  d7,  #1
-1:      pld             [r5]
+1:      vld1.8          {d6, d7}, [r1], r2
         vmull.u8        q8,  d4,  d0
         vmlal.u8        q8,  d5,  d1
-        vld1.8          {d4, d5}, [r1], r4
+        vext.8          d7,  d6,  d7,  #1
+        vld1.8          {d4, d5}, [r1], r2
         vmlal.u8        q8,  d6,  d2
-        pld             [r1]
         vext.8          d5,  d4,  d5,  #1
         vmlal.u8        q8,  d7,  d3
         vmull.u8        q9,  d6,  d0
@@ -76,8 +72,7 @@ T       cmp             r7,  #0
         vmlal.u8        q9,  d7,  d1
         vmlal.u8        q9,  d4,  d2
         vmlal.u8        q9,  d5,  d3
-        vld1.8          {d6, d7}, [r5], r4
-        pld             [r1]
+        pld             [r1, r2]
   .ifc \codec,h264
         vrshrn.u16      d16, q8,  #6
         vrshrn.u16      d17, q9,  #6
@@ -92,7 +87,6 @@ T       cmp             r7,  #0
         vld1.8          {d21}, [lr,:64], r2
         vrhadd.u8       q8,  q8,  q10
   .endif
-        vext.8          d7,  d6,  d7,  #1
         vst1.8          {d16}, [r0,:64], r2
         vst1.8          {d17}, [r0,:64], r2
         bgt             1b
@@ -106,18 +100,15 @@ T       cmp             r7,  #0
         beq             4f
-        add             r5,  r1,  r2
-        lsl             r4,  r2,  #1
-        vld1.8          {d4}, [r1], r4
-        vld1.8          {d6}, [r5], r4
-3:      pld             [r5]
+        vld1.8          {d4}, [r1], r2
+3:      vld1.8          {d6}, [r1], r2
         vmull.u8        q8,  d4,  d0
         vmlal.u8        q8,  d6,  d1
-        vld1.8          {d4}, [r1], r4
+        vld1.8          {d4}, [r1], r2
         vmull.u8        q9,  d6,  d0
         vmlal.u8        q9,  d4,  d1
-        vld1.8          {d6}, [r5], r4
+        pld             [r1]
   .ifc \codec,h264
         vrshrn.u16      d16, q8,  #6
         vrshrn.u16      d17, q9,  #6
@@ -127,13 +118,13 @@ T       cmp             r7,  #0
         vshrn.u16       d16, q8,  #6
         vshrn.u16       d17, q9,  #6
   .endif
+        pld             [r1, r2]
   .ifc \type,avg
         vld1.8          {d20}, [lr,:64], r2
         vld1.8          {d21}, [lr,:64], r2
         vrhadd.u8       q8,  q8,  q10
   .endif
         subs            r3,  r3,  #2
-        pld             [r1]
         vst1.8          {d16}, [r0,:64], r2
         vst1.8          {d17}, [r0,:64], r2
         bgt             3b
@@ -144,16 +135,13 @@ T       cmp             r7,  #0
         vld1.8          {d6, d7}, [r1], r2
         vext.8          d5,  d4,  d5,  #1
         vext.8          d7,  d6,  d7,  #1
-
-5:      pld             [r1]
+        pld             [r1]
         subs            r3,  r3,  #2
         vmull.u8        q8,  d4,  d0
         vmlal.u8        q8,  d5,  d1
-        vld1.8          {d4, d5}, [r1], r2
         vmull.u8        q9,  d6,  d0
         vmlal.u8        q9,  d7,  d1
-        pld             [r1]
-        vext.8          d5,  d4,  d5,  #1
+        pld             [r1, r2]
   .ifc \codec,h264
         vrshrn.u16      d16, q8,  #6
         vrshrn.u16      d17, q9,  #6
@@ -168,11 +156,9 @@ T       cmp             r7,  #0
         vld1.8          {d21}, [lr,:64], r2
         vrhadd.u8       q8,  q8,  q10
   .endif
-        vld1.8          {d6, d7}, [r1], r2
-        vext.8          d7,  d6,  d7,  #1
         vst1.8          {d16}, [r0,:64], r2
         vst1.8          {d17}, [r0,:64], r2
-        bgt             5b
+        bgt             4b
         pop             {r4-r7, pc}
 endfunc
@@ -209,33 +195,29 @@ T       cmp             r7,  #0
         beq             2f
-        add             r5,  r1,  r2
         vdup.8          d0,  r4
-        lsl             r4,  r2,  #1
         vdup.8          d1,  r12
-        vld1.8          {d4}, [r1], r4
+        vld1.8          {d4}, [r1], r2
         vdup.8          d2,  r6
-        vld1.8          {d6}, [r5], r4
         vdup.8          d3,  r7
         vext.8          d5,  d4,  d5,  #1
-        vext.8          d7,  d6,  d7,  #1
         vtrn.32         d4,  d5
-        vtrn.32         d6,  d7
         vtrn.32         d0,  d1
         vtrn.32         d2,  d3
-1:      pld             [r5]
+1:      vld1.8          {d6}, [r1], r2
+        vext.8          d7,  d6,  d7,  #1
+        vtrn.32         d6,  d7
         vmull.u8        q8,  d4,  d0
         vmlal.u8        q8,  d6,  d2
-        vld1.8          {d4}, [r1], r4
+        vld1.8          {d4}, [r1], r2
         vext.8          d5,  d4,  d5,  #1
         vtrn.32         d4,  d5
-        pld             [r1]
         vmull.u8        q9,  d6,  d0
         vmlal.u8        q9,  d4,  d2
-        vld1.8          {d6}, [r5], r4
         vadd.i16        d16, d16, d17
         vadd.i16        d17, d18, d19
   .ifc \codec,h264
@@ -245,14 +227,12 @@ T       cmp             r7,  #0
         vshrn.u16       d16, q8,  #6
   .endif
         subs            r3,  r3,  #2
-        pld             [r1]
+        pld             [r1, r2]
   .ifc \type,avg
         vld1.32         {d20[0]}, [lr,:32], r2
         vld1.32         {d20[1]}, [lr,:32], r2
         vrhadd.u8       d16, d16, d20
   .endif
-        vext.8          d7,  d6,  d7,  #1
-        vtrn.32         d6,  d7
         vst1.32         {d16[0]}, [r0,:32], r2
         vst1.32         {d16[1]}, [r0,:32], r2
         bgt             1b
@@ -268,18 +248,15 @@ T       cmp             r7,  #0
         beq             4f
         vext.32         d1,  d0,  d1,  #1
-        add             r5,  r1,  r2
-        lsl             r4,  r2,  #1
-        vld1.32         {d4[0]}, [r1], r4
-        vld1.32         {d4[1]}, [r5], r4
-3:      pld             [r5]
+        vld1.32         {d4[0]}, [r1], r2
+3:      vld1.32         {d4[1]}, [r1], r2
         vmull.u8        q8,  d4,  d0
-        vld1.32         {d4[0]}, [r1], r4
+        vld1.32         {d4[0]}, [r1], r2
         vmull.u8        q9,  d4,  d1
-        vld1.32         {d4[1]}, [r5], r4
         vadd.i16        d16, d16, d17
         vadd.i16        d17, d18, d19
+        pld             [r1]
   .ifc \codec,h264
         vrshrn.u16      d16, q8,  #6
   .else
@@ -292,7 +269,7 @@ T       cmp             r7,  #0
         vrhadd.u8       d16, d16, d20
   .endif
         subs            r3,  r3,  #2
-        pld             [r1]
+        pld             [r1, r2]
         vst1.32         {d16[0]}, [r0,:32], r2
         vst1.32         {d16[1]}, [r0,:32], r2
         bgt             3b
@@ -305,13 +282,9 @@ T       cmp             r7,  #0
         vext.8          d7,  d6,  d7,  #1
         vtrn.32         d4,  d5
         vtrn.32         d6,  d7
-
-5:      vmull.u8        q8,  d4,  d0
+        vmull.u8        q8,  d4,  d0
         vmull.u8        q9,  d6,  d0
         subs            r3,  r3,  #2
-        vld1.8          {d4}, [r1], r2
-        vext.8          d5,  d4,  d5,  #1
-        vtrn.32         d4,  d5
         vadd.i16        d16, d16, d17
         vadd.i16        d17, d18, d19
         pld             [r1]
@@ -326,13 +299,10 @@ T       cmp             r7,  #0
         vld1.32         {d20[1]}, [lr,:32], r2
         vrhadd.u8       d16, d16, d20
   .endif
-        vld1.8          {d6}, [r1], r2
-        vext.8          d7,  d6,  d7,  #1
-        vtrn.32         d6,  d7
         pld             [r1]
         vst1.32         {d16[0]}, [r0,:32], r2
         vst1.32         {d16[1]}, [r0,:32], r2
-        bgt             5b
+        bgt             4b
         pop             {r4-r7, pc}
 endfunc