Commit c3a17fff authored by Sebastian Pop, committed by Michael Niedermayer

swscale/aarch64: use multiply accumulate and shift-right narrow

This patch rewrites the innermost loop of ff_yuv2planeX_8_neon to avoid the zips and
horizontal adds by using multiply-accumulate instructions instead. It also uses ld1r
to load one coefficient and replicate it across all lanes of the vector, and it
improves the clipping code by dropping the standalone shift-right instructions and
folding the shift into the saturating shift-right-narrow instructions.
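
For context, the loop being vectorized computes, for each output pixel, a dot product
of the filter coefficients with the corresponding source rows, then scales the result
back by 19 fractional bits and clips it to 8 bits. A minimal C sketch of that
computation (modeled on libswscale's generic C path; the dither seeding is simplified
to a plain rounding term here, and the helper names are illustrative):

```c
#include <stdint.h>

/* Illustrative clip helper (FFmpeg uses av_clip_uint8). */
static uint8_t clip_uint8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* Sketch of the scalar computation: val[i] = sum_j src[j][i] * filter[j],
 * then scale back by 19 fractional bits and clip to 8 bits. */
static void yuv2planeX_sketch(const int16_t *filter, int filterSize,
                              const int16_t **src, uint8_t *dest, int dstW)
{
    for (int i = 0; i < dstW; i++) {
        int val = 1 << 18;                /* rounding term; real code seeds with dither */
        for (int j = 0; j < filterSize; j++)
            val += src[j][i] * filter[j]; /* 16x16 -> 32-bit multiply-accumulate */
        dest[i] = clip_uint8(val >> 19);
    }
}
```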

I see an 8% speedup on an m6g instance with Neoverse-N1 CPUs:
$ ffmpeg -nostats -f lavfi -i testsrc2=4k:d=2 -vf bench=start,scale=1024x1024,bench=stop -f null -
before: t:0.014015 avg:0.014096 max:0.015018 min:0.013971
after:  t:0.012985 avg:0.013013 max:0.013996 min:0.012818

Tested with `make check` on aarch64-linux.
Signed-off-by: Sebastian Pop <spop@amazon.com>
Reviewed-by: Clément Bœsch <u@pkh.me>
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
parent ebbc976a
@@ -38,29 +38,21 @@ function ff_yuv2planeX_8_neon, export=1
         add                 x12, x12, x7, lsl #1            // &src[j+1][i]
         ld1                 {v5.8H}, [x11]                  // read 8x16-bit @ src[j  ][i + {0..7}]: A,B,C,D,E,F,G,H
         ld1                 {v6.8H}, [x12]                  // read 8x16-bit @ src[j+1][i + {0..7}]: I,J,K,L,M,N,O,P
-        ldr                 w11, [x10], #4                  // read 2x16-bit coeffs (X, Y) at (filter[j], filter[j+1])
-        zip1                v16.8H, v5.8H, v6.8H            // A,I,B,J,C,K,D,L
-        zip2                v17.8H, v5.8H, v6.8H            // E,M,F,N,F,O,H,P
-        dup                 v7.4S, w11                      // X,Y,X,Y,X,Y,X,Y
-        smull               v18.4S, v16.4H, v7.4H           // A.X I.Y B.X J.Y
-        smull               v20.4S, v17.4H, v7.4H           // E.X M.Y F.X N.Y
-        smull2              v19.4S, v16.8H, v7.8H           // C.X K.Y D.X L.Y
-        smull2              v21.4S, v17.8H, v7.8H           // G.X O.Y H.X P.Y
-        addp                v16.4S, v18.4S, v19.4S          // A.X+I.Y B.X+J.Y C.X+K.Y D.X+L.Y
-        addp                v17.4S, v20.4S, v21.4S          // E.X+M.Y F.X+N.Y F.X+O.Y H.X+P.Y
-        add                 v3.4S, v3.4S, v16.4S            // update val accumulator for part 1
-        add                 v4.4S, v4.4S, v17.4S            // update val accumulator for part 2
+        ld1r                {v7.8H}, [x10], #2              // read 1x16-bit coeff X at filter[j  ] and duplicate across lanes
+        ld1r                {v8.8H}, [x10], #2              // read 1x16-bit coeff Y at filter[j+1] and duplicate across lanes
+        smlal               v3.4S, v5.4H, v7.4H             // val0 += {A,B,C,D} * X
+        smlal2              v4.4S, v5.8H, v7.8H             // val1 += {E,F,G,H} * X
+        smlal               v3.4S, v6.4H, v8.4H             // val0 += {I,J,K,L} * Y
+        smlal2              v4.4S, v6.8H, v8.8H             // val1 += {M,N,O,P} * Y
         subs                w8, w8, #2                      // tmpfilterSize -= 2
         b.gt                3b                              // loop until filterSize consumed
-        sshr                v3.4S, v3.4S, #19               // val>>19 (part 1)
-        sshr                v4.4S, v4.4S, #19               // val>>19 (part 2)
-        sqxtun              v3.4H, v3.4S                    // clip16(val>>19) (part 1)
-        sqxtun              v4.4H, v4.4S                    // clip16(val>>19) (part 2)
-        mov                 v3.D[1], v4.D[0]                // merge part 1 and part 2
-        uqxtn               v3.8B, v3.8H                    // clip8(val>>19)
-        st1                 {v3.1D}, [x3], #8               // write to destination
-        add                 x7, x7, #8                      // i += 8
+        sqshrun             v3.4h, v3.4s, #16               // clip16(val0>>16)
+        sqshrun2            v3.8h, v4.4s, #16               // clip16(val1>>16)
+        uqshrn              v3.8b, v3.8h, #3                // clip8(val>>19)
+        st1                 {v3.8b}, [x3], #8               // write to destination
         subs                w4, w4, #8                      // dstW -= 8
+        add                 x7, x7, #8                      // i += 8
         b.gt                2b                              // loop until width consumed
         ret
 endfunc
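
For readers more comfortable with intrinsics than assembly, here is a hypothetical C
rendering of the rewritten inner loop for one batch of 8 pixels; it is a sketch, not
code from the patch. vld1q_dup_s16 corresponds to ld1r, vmlal_s16/vmlal_high_s16 to
smlal/smlal2, and the vqshrun/vqshrn calls to the sqshrun/sqshrun2/uqshrn narrowing
sequence. The accumulators are seeded with zero here instead of the dither values the
real function uses, and filterSize is assumed to be even, as in the assembly.

```c
#include <arm_neon.h>
#include <stdint.h>

/* Hypothetical sketch of the new inner loop for 8 output pixels. */
static void filter8_sketch(const int16_t **src, const int16_t *filter,
                           int filterSize, int i, uint8_t *dest)
{
    int32x4_t val0 = vdupq_n_s32(0);                  /* pixels 0..3 (real code: dither) */
    int32x4_t val1 = vdupq_n_s32(0);                  /* pixels 4..7 (real code: dither) */

    for (int j = 0; j < filterSize; j += 2) {
        int16x8_t s0 = vld1q_s16(&src[j][i]);         /* ld1  {v5.8H}: A..H        */
        int16x8_t s1 = vld1q_s16(&src[j + 1][i]);     /* ld1  {v6.8H}: I..P        */
        int16x8_t cx = vld1q_dup_s16(&filter[j]);     /* ld1r {v7.8H}: broadcast X */
        int16x8_t cy = vld1q_dup_s16(&filter[j + 1]); /* ld1r {v8.8H}: broadcast Y */

        val0 = vmlal_s16(val0, vget_low_s16(s0), vget_low_s16(cx));  /* smlal  */
        val1 = vmlal_high_s16(val1, s0, cx);                         /* smlal2 */
        val0 = vmlal_s16(val0, vget_low_s16(s1), vget_low_s16(cy));  /* smlal  */
        val1 = vmlal_high_s16(val1, s1, cy);                         /* smlal2 */
    }

    /* Split the >>19 into a saturating >>16 narrow (32->16) followed by a
     * saturating >>3 narrow (16->8), so no standalone shifts are needed. */
    uint16x8_t v16 = vqshrun_high_n_s32(vqshrun_n_s32(val0, 16), val1, 16);
    vst1_u8(dest, vqshrn_n_u16(v16, 3));              /* uqshrn + st1 {v3.8b} */
}
```

Broadcasting each coefficient once and accumulating directly into val0/val1 eliminates
the zip/addp shuffle traffic of the old sequence, and splitting the 19-bit scale as
16 + 3 lets both shifts ride along with the two narrowing instructions.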