Commit 645489cf authored by James Almer

x86/dcadsp: optimize lfe_fir0_float_fma3 on x86_32

About 10% faster.
Signed-off-by: James Almer <jamrial@gmail.com>
parent d915b6e5
@@ -24,7 +24,7 @@
 SECTION .text
 
 %define sizeof_float 4
-%define FMA3_OFFSET (8 * cpuflag(fma3) * ARCH_X86_64)
+%define FMA3_OFFSET (8 * cpuflag(fma3))
 
 %macro LFE_FIR0_FLOAT 0
 cglobal lfe_fir0_float, 4, 6, 12 + cpuflag(fma3)*4, samples, lfe, coeff, nblocks, cnt1, cnt2
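
Note on the define change: FMA3_OFFSET no longer collapses to 0 on x86_32, so the loop footer's "sub cnt2d, 8 + FMA3_OFFSET" (see the last hunk) now advances 16 bytes of output per iteration on 32-bit FMA3 builds instead of 8, matching the full-register movaps stores that replace the old 8-byte movlps stores below. A minimal C sketch of the arithmetic; HAVE_FMA3 and IS_X86_64 are hypothetical stand-ins for cpuflag(fma3) and ARCH_X86_64, not FFmpeg symbols:

#define HAVE_FMA3 1  /* hypothetical stand-in for cpuflag(fma3) */
#define IS_X86_64 0  /* hypothetical stand-in for ARCH_X86_64; x86_32 build */

#define FMA3_OFFSET_OLD (8 * HAVE_FMA3 * IS_X86_64)  /* 0 on x86_32      */
#define FMA3_OFFSET_NEW (8 * HAVE_FMA3)              /* 8 on either arch */

/* loop footer "sub cnt2d, 8 + FMA3_OFFSET": */
enum { STEP_OLD = 8 + FMA3_OFFSET_OLD,    /* was:  8 bytes of output per iteration */
       STEP_NEW = 8 + FMA3_OFFSET_NEW };  /* now: 16 bytes of output per iteration */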
@@ -101,11 +101,19 @@ cglobal lfe_fir0_float, 4, 6, 12 + cpuflag(fma3)*4, samples, lfe, coeff, nblocks
 %endif
 %else ; ARCH_X86_32
 %if cpuflag(fma3)
     mulps    m0, m7, [coeffq+cnt1q*8    ]
-    movaps   m1,     [coeffq+cnt1q*8+16 ]
-    mulps    m2, m7, [coeffq+cnt1q*8+32 ]
-    fmaddps  m0, m6, m1, m0
-    fmaddps  m2, m6, [coeffq+cnt1q*8+48 ], m2
+    mulps    m1, m7, [coeffq+cnt1q*8+32 ]
+    mulps    m2, m7, [coeffq+cnt1q*8+64 ]
+    mulps    m3, m7, [coeffq+cnt1q*8+96 ]
+    fmaddps  m0, m6, [coeffq+cnt1q*8+16 ], m0
+    fmaddps  m1, m6, [coeffq+cnt1q*8+48 ], m1
+    fmaddps  m2, m6, [coeffq+cnt1q*8+80 ], m2
+    fmaddps  m3, m6, [coeffq+cnt1q*8+112], m3
+    haddps   m0, m1
+    haddps   m2, m3
+    haddps   m0, m2
+    movaps   [samplesq+cnt1q], m0
 %else
     mulps    m0, m7, [coeffq+cnt1q*8   ]
     mulps    m1, m6, [coeffq+cnt1q*8+16]
@@ -113,13 +121,14 @@ cglobal lfe_fir0_float, 4, 6, 12 + cpuflag(fma3)*4, samples, lfe, coeff, nblocks
     mulps    m3, m6, [coeffq+cnt1q*8+48]
     addps    m0, m1
     addps    m2, m3
-%endif
     unpckhps m3, m0, m2
     unpcklps m0, m2
     addps    m3, m0
     movhlps  m2, m3
     addps    m2, m3
     movlps   [samplesq+cnt1q], m2
+%endif
 %endif; ARCH
 %if ARCH_X86_64
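
For reference: the rewritten FMA3 path above replaces the old two-accumulator mul/fmadd sequence with four independent accumulators folded by a three-step haddps cascade, so each pass finishes four output floats with one aligned store, where the old code fell through to the shared unpck/movhlps epilogue and produced two. A minimal intrinsics sketch of the pattern (SSE3 + FMA3, compile with e.g. -msse3 -mfma); x and y stand in for the sample vectors held in m7/m6, and the function name and float-indexed coeff layout are illustrative, not FFmpeg API:

#include <immintrin.h>

static void fir4_hadd(float *out, const float *coeff, __m128 x, __m128 y)
{
    /* four independent partial-product accumulators, as in m0..m3 above */
    __m128 a0 = _mm_mul_ps(x, _mm_loadu_ps(coeff +  0));  /* byte offset   0 */
    __m128 a1 = _mm_mul_ps(x, _mm_loadu_ps(coeff +  8));  /* byte offset  32 */
    __m128 a2 = _mm_mul_ps(x, _mm_loadu_ps(coeff + 16));  /* byte offset  64 */
    __m128 a3 = _mm_mul_ps(x, _mm_loadu_ps(coeff + 24));  /* byte offset  96 */
    a0 = _mm_fmadd_ps(y, _mm_loadu_ps(coeff +  4), a0);   /* byte offset  16 */
    a1 = _mm_fmadd_ps(y, _mm_loadu_ps(coeff + 12), a1);   /* byte offset  48 */
    a2 = _mm_fmadd_ps(y, _mm_loadu_ps(coeff + 20), a2);   /* byte offset  80 */
    a3 = _mm_fmadd_ps(y, _mm_loadu_ps(coeff + 28), a3);   /* byte offset 112 */
    /* haddps m0,m1 / haddps m2,m3 / haddps m0,m2: after three horizontal
     * adds, lane i holds the complete sum of accumulator i */
    __m128 r = _mm_hadd_ps(_mm_hadd_ps(a0, a1), _mm_hadd_ps(a2, a3));
    _mm_storeu_ps(out, r);                                /* the movaps store */
}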
@@ -154,10 +163,19 @@ cglobal lfe_fir0_float, 4, 6, 12 + cpuflag(fma3)*4, samples, lfe, coeff, nblocks
 %endif
 %else ; ARCH_X86_32
 %if cpuflag(fma3)
     mulps    m0, m5, [coeffq+cnt1q*8    ]
-    mulps    m2, m5, [coeffq+cnt1q*8+32 ]
-    fmaddps  m0, m4, m1, m0
-    fmaddps  m2, m4, [coeffq+cnt1q*8+48 ], m2
+    mulps    m1, m5, [coeffq+cnt1q*8+32 ]
+    mulps    m2, m5, [coeffq+cnt1q*8+64 ]
+    mulps    m3, m5, [coeffq+cnt1q*8+96 ]
+    fmaddps  m0, m4, [coeffq+cnt1q*8+16 ], m0
+    fmaddps  m1, m4, [coeffq+cnt1q*8+48 ], m1
+    fmaddps  m2, m4, [coeffq+cnt1q*8+80 ], m2
+    fmaddps  m3, m4, [coeffq+cnt1q*8+112], m3
+    haddps   m1, m0
+    haddps   m3, m2
+    haddps   m3, m1
+    movaps   [samplesq+cnt2q], m3
 %else
     mulps    m0, m5, [coeffq+cnt1q*8   ]
     mulps    m1, m4, [coeffq+cnt1q*8+16]
@@ -165,13 +183,14 @@ cglobal lfe_fir0_float, 4, 6, 12 + cpuflag(fma3)*4, samples, lfe, coeff, nblocks
     mulps    m3, m4, [coeffq+cnt1q*8+48]
     addps    m0, m1
     addps    m2, m3
-%endif
     unpckhps m3, m2, m0
     unpcklps m2, m0
     addps    m3, m2
     movhlps  m0, m3
     addps    m0, m3
     movlps   [samplesq+cnt2q], m0
+%endif
 %endif; ARCH
     sub      cnt2d, 8 + FMA3_OFFSET
...
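
One more detail in the second block: its haddps cascade runs with the operand pairs swapped relative to the first (haddps m1, m0 / haddps m3, m2 / haddps m3, m1), so the four sums land in the opposite lane order before the store through the downward-counting cnt2q, presumably the order the mirrored half of the filter needs. In terms of the sketch above (same <immintrin.h>, same illustrative names):

/* Same accumulators as fir4_hadd, every haddps operand pair swapped;
 * result lanes come out as [sum(a3), sum(a2), sum(a1), sum(a0)]. */
static __m128 reduce4_reversed(__m128 a0, __m128 a1, __m128 a2, __m128 a3)
{
    return _mm_hadd_ps(_mm_hadd_ps(a3, a2), _mm_hadd_ps(a1, a0));
}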