Commit a1ac12bd authored by James Almer's avatar James Almer Committed by Michael Niedermayer

x86/dcadsp: add ff_dca_lfe_fir0_fma3

~10% faster than the SSE version.
Signed-off-by: 's avatarJames Almer <jamrial@gmail.com>
Signed-off-by: 's avatarMichael Niedermayer <michaelni@gmx.at>
parent 3c728cee
......@@ -132,10 +132,15 @@ DECODE_HF
mulps va, %2
mulps vb, %2
%if %0 == 3
%if cpuflag(fma3)
fmaddps va, m4, %3, va
fmaddps vb, m0, %3, vb
%else
mulps m4, %3
mulps m0, %3
addps va, m4
addps vb, m0
%endif
%endif
; va = va1 va2 va3 va4
; vb = vb1 vb2 vb3 vb4
......@@ -198,6 +203,10 @@ cglobal dca_lfe_fir%1, 3,3,6-%1, out, in, cf0
INIT_XMM sse
DCA_LFE_FIR 0
DCA_LFE_FIR 1
%if HAVE_FMA3_EXTERNAL
INIT_XMM fma3
DCA_LFE_FIR 0
%endif
%macro SETZERO 1
%if cpuflag(sse2) && notcpuflag(avx)
......
......@@ -34,6 +34,7 @@ void ff_decode_hf_sse4(float dst[DCA_SUBBANDS][8], const int vq_num[DCA_SUBBANDS
int scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end);
void ff_dca_lfe_fir0_sse(float *out, const float *in, const float *coefs);
void ff_dca_lfe_fir1_sse(float *out, const float *in, const float *coefs);
void ff_dca_lfe_fir0_fma3(float *out, const float *in, const float *coefs);
av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
{
......@@ -54,6 +55,10 @@ av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
if (EXTERNAL_SSE4(cpu_flags)) {
s->decode_hf = ff_decode_hf_sse4;
}
if (EXTERNAL_FMA3(cpu_flags)) {
s->lfe_fir[0] = ff_dca_lfe_fir0_fma3;
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment