Commit f23d26a6 authored by Janne Grunau's avatar Janne Grunau

h264: avoid using uninitialized memory in NEON chroma mc

Adapt commit 982b596e for the arm and
aarch64 NEON asm. 5-10% faster on Cortex-A9.
parent ccda51b1
...@@ -95,9 +95,10 @@ function ff_\type\()_\codec\()_chroma_mc8_neon, export=1 ...@@ -95,9 +95,10 @@ function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
b.gt 1b b.gt 1b
ret ret
2: tst w6, w6 2: adds w12, w12, w6
add w12, w12, w6
dup v0.8B, w4 dup v0.8B, w4
b.eq 5f
tst w6, w6
dup v1.8B, w12 dup v1.8B, w12
b.eq 4f b.eq 4f
...@@ -161,6 +162,33 @@ function ff_\type\()_\codec\()_chroma_mc8_neon, export=1 ...@@ -161,6 +162,33 @@ function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
st1 {v17.8B}, [x0], x2 st1 {v17.8B}, [x0], x2
b.gt 4b b.gt 4b
ret ret
5: ld1 {v4.8B}, [x1], x2
ld1 {v5.8B}, [x1], x2
prfm pldl1strm, [x1]
subs w3, w3, #2
umull v16.8H, v4.8B, v0.8B
umull v17.8H, v5.8B, v0.8B
prfm pldl1strm, [x1, x2]
.ifc \codec,h264
rshrn v16.8B, v16.8H, #6
rshrn v17.8B, v17.8H, #6
.else
add v16.8H, v16.8H, v22.8H
add v17.8H, v17.8H, v22.8H
shrn v16.8B, v16.8H, #6
shrn v17.8B, v17.8H, #6
.endif
.ifc \type,avg
ld1 {v20.8B}, [x8], x2
ld1 {v21.8B}, [x8], x2
urhadd v16.8B, v16.8B, v20.8B
urhadd v17.8B, v17.8B, v21.8B
.endif
st1 {v16.8B}, [x0], x2
st1 {v17.8B}, [x0], x2
b.gt 5b
ret
endfunc endfunc
.endm .endm
...@@ -238,9 +266,10 @@ function ff_\type\()_\codec\()_chroma_mc4_neon, export=1 ...@@ -238,9 +266,10 @@ function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
b.gt 1b b.gt 1b
ret ret
2: tst w6, w6 2: adds w12, w12, w6
add w12, w12, w6
dup v30.8B, w4 dup v30.8B, w4
b.eq 5f
tst w6, w6
dup v31.8B, w12 dup v31.8B, w12
trn1 v0.2S, v30.2S, v31.2S trn1 v0.2S, v30.2S, v31.2S
trn2 v1.2S, v30.2S, v31.2S trn2 v1.2S, v30.2S, v31.2S
...@@ -303,6 +332,28 @@ function ff_\type\()_\codec\()_chroma_mc4_neon, export=1 ...@@ -303,6 +332,28 @@ function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
st1 {v16.S}[1], [x0], x2 st1 {v16.S}[1], [x0], x2
b.gt 4b b.gt 4b
ret ret
5: ld1 {v4.S}[0], [x1], x2
ld1 {v4.S}[1], [x1], x2
umull v18.8H, v4.8B, v30.8B
subs w3, w3, #2
prfm pldl1strm, [x1]
.ifc \codec,h264
rshrn v16.8B, v18.8H, #6
.else
add v18.8H, v18.8H, v22.8H
shrn v16.8B, v18.8H, #6
.endif
.ifc \type,avg
ld1 {v20.S}[0], [x8], x2
ld1 {v20.S}[1], [x8], x2
urhadd v16.8B, v16.8B, v20.8B
.endif
prfm pldl1strm, [x1]
st1 {v16.S}[0], [x0], x2
st1 {v16.S}[1], [x0], x2
b.gt 5b
ret
endfunc endfunc
.endm .endm
......
...@@ -96,9 +96,10 @@ T cmp r7, #0 ...@@ -96,9 +96,10 @@ T cmp r7, #0
pop {r4-r7, pc} pop {r4-r7, pc}
2: tst r6, r6 2: adds r12, r12, r6
add r12, r12, r6
vdup.8 d0, r4 vdup.8 d0, r4
beq 5f
tst r6, r6
vdup.8 d1, r12 vdup.8 d1, r12
beq 4f beq 4f
...@@ -163,6 +164,33 @@ T cmp r7, #0 ...@@ -163,6 +164,33 @@ T cmp r7, #0
vst1.8 {d17}, [r0,:64], r2 vst1.8 {d17}, [r0,:64], r2
bgt 4b bgt 4b
pop {r4-r7, pc}
5: vld1.8 {d4}, [r1], r2
vld1.8 {d5}, [r1], r2
pld [r1]
subs r3, r3, #2
vmull.u8 q8, d4, d0
vmull.u8 q9, d5, d0
pld [r1, r2]
.ifc \codec,h264
vrshrn.u16 d16, q8, #6
vrshrn.u16 d17, q9, #6
.else
vadd.u16 q8, q8, q11
vadd.u16 q9, q9, q11
vshrn.u16 d16, q8, #6
vshrn.u16 d17, q9, #6
.endif
.ifc \type,avg
vld1.8 {d20}, [lr,:64], r2
vld1.8 {d21}, [lr,:64], r2
vrhadd.u8 q8, q8, q10
.endif
vst1.8 {d16}, [r0,:64], r2
vst1.8 {d17}, [r0,:64], r2
bgt 5b
pop {r4-r7, pc} pop {r4-r7, pc}
endfunc endfunc
.endm .endm
...@@ -245,9 +273,10 @@ T cmp r7, #0 ...@@ -245,9 +273,10 @@ T cmp r7, #0
pop {r4-r7, pc} pop {r4-r7, pc}
2: tst r6, r6 2: adds r12, r12, r6
add r12, r12, r6
vdup.8 d0, r4 vdup.8 d0, r4
beq 5f
tst r6, r6
vdup.8 d1, r12 vdup.8 d1, r12
vtrn.32 d0, d1 vtrn.32 d0, d1
...@@ -310,6 +339,29 @@ T cmp r7, #0 ...@@ -310,6 +339,29 @@ T cmp r7, #0
vst1.32 {d16[1]}, [r0,:32], r2 vst1.32 {d16[1]}, [r0,:32], r2
bgt 4b bgt 4b
pop {r4-r7, pc}
5: vld1.32 {d4[0]}, [r1], r2
vld1.32 {d4[1]}, [r1], r2
vmull.u8 q8, d4, d0
subs r3, r3, #2
pld [r1]
.ifc \codec,h264
vrshrn.u16 d16, q8, #6
.else
vadd.u16 q8, q8, q11
vshrn.u16 d16, q8, #6
.endif
.ifc \type,avg
vld1.32 {d20[0]}, [lr,:32], r2
vld1.32 {d20[1]}, [lr,:32], r2
vrhadd.u8 d16, d16, d20
.endif
pld [r1]
vst1.32 {d16[0]}, [r0,:32], r2
vst1.32 {d16[1]}, [r0,:32], r2
bgt 5b
pop {r4-r7, pc} pop {r4-r7, pc}
endfunc endfunc
.endm .endm
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment