Commit e4a27e2f, authored and committed by Clément Bœsch

lavc/arm: fix lack of precision in ff_ps_stereo_interpolate_neon

The code originally pre-multiplied the steps by 2, causing the running sums
of the h factors to drift due to the lack of precision. This quickly
leads to an inaccuracy > 0.01.

I tried various approaches, such as multiplying by 2.0 (instead of adding
the value to itself), without success.

I'm unable to benchmark the impact of this change; feel free to compare.

This commit fixes the incoming aacpsdsp tests.

Following is an alternative simplified function (matching the incoming
AArch64 code) that may be used:

@ ff_ps_stereo_interpolate_neon (ARM32 NEON, simplified variant)
@
@ Mixes two streams of float pairs in place using four running factors
@ that are advanced by a fixed step before every pair is processed.
@
@ Inputs, as used by the code below:
@   r0    pointer to the first stream; 2 floats are read and overwritten per
@         iteration (the :64 hint asserts 64-bit alignment)
@   r1    pointer to the second stream; same access pattern as r0
@   r2    pointer to 4 floats {a, b, c, d}: initial values of the factors
@   r3    pointer to 4 floats: the per-iteration step of each factor
@   [sp]  iteration count (first stack argument)
@ NOTE(review): presumably this is
@   void ff_ps_stereo_interpolate_neon(float (*l)[2], float (*r)[2],
@                                      float h[2][4], float h_step[2][4],
@                                      int len)
@ -- confirm against the C prototype.
@
@ Per iteration, with x = pair at [r0] and y = pair at [r1]:
@   {a,b,c,d} += step
@   [r0] <- a*x + c*y        (both floats of the pair)
@   [r1] <- b*x + d*y
@
@ Clobbers: q0-q3, q8, q9, r12, flags; r0 and r1 are advanced past the data.
@ The loop tests the counter after the body, so the body always runs at
@ least once.
function ff_ps_stereo_interpolate_neon, export=1
        @ q0 = factors {a, b, c, d}, q1 = their steps
        vld1.32         {q0}, [r2]
        vld1.32         {q1}, [r3]
        @ r12 = iteration count
        ldr             r12, [sp]
        @ Duplicate factors and steps so each register can be zipped with
        @ its own copy.
        vmov.f32        q8, q0
        vmov.f32        q9, q1
        @ Zipping a register with its copy doubles every lane:
        @   q8 = {a, a, b, b}   q0 = {c, c, d, d}
        @ and the steps in q9/q1 get the same layout.
        vzip.32         q8, q0
        vzip.32         q9, q1
1:
        @ d4 = x (pair from the first stream), d6 = y (pair from the second);
        @ no post-increment here, the pointers advance on the stores.
        vld1.32         {d4}, [r0,:64]
        vld1.32         {d6}, [r1,:64]
        @ Advance the running factors by exactly one un-scaled step.
        vadd.f32        q8, q8, q9
        vadd.f32        q0, q0, q1
        @ Replicate each pair into the upper half:
        @   q2 = {x0, x1, x0, x1}   q3 = {y0, y1, y0, y1}
        vmov.f32        d5, d4
        vmov.f32        d7, d6
        @ q2 = {a*x0 + c*y0, a*x1 + c*y1, b*x0 + d*y0, b*x1 + d*y1}
        vmul.f32        q2, q2, q8
        vmla.f32        q2, q3, q0
        @ Low half goes back to the first stream, high half to the second;
        @ both pointers step to the next pair.
        vst1.32         {d4}, [r0,:64]!
        vst1.32         {d5}, [r1,:64]!
        @ Count down and loop while iterations remain.
        subs            r12, r12, #1
        bgt             1b
        bx              lr
endfunc
parent d2ef9e6e
@@ -232,12 +232,11 @@ endfunc
 function ff_ps_stereo_interpolate_neon, export=1
 vld1.32 {q0}, [r2]
 vld1.32 {q14}, [r3]
-vadd.f32 q15, q14, q14
 mov r2, r0
 mov r3, r1
 ldr r12, [sp]
 vadd.f32 q1, q0, q14
-vadd.f32 q0, q0, q15
+vadd.f32 q0, q1, q14
 vld1.32 {q2}, [r0,:64]!
 vld1.32 {q3}, [r1,:64]!
 subs r12, r12, #1
@@ -251,8 +250,10 @@ function ff_ps_stereo_interpolate_neon, export=1
 vmla.f32 d17, d7, d1[0]
 vmla.f32 d18, d6, d3[1]
 vmla.f32 d19, d7, d1[1]
-vadd.f32 q1, q1, q15
-vadd.f32 q0, q0, q15
+vadd.f32 q1, q1, q14
+vadd.f32 q0, q0, q14
+vadd.f32 q1, q1, q14
+vadd.f32 q0, q0, q14
 vld1.32 {q2}, [r0,:64]!
 vld1.32 {q3}, [r1,:64]!
 vst1.32 {q8}, [r2,:64]!
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment