Commit b5a0971f authored by James Almer's avatar James Almer

x86/aacps: add ff_ps_stereo_interpolate_ipdopd_sse3()

About 2x faster than the c version.
Signed-off-by: 's avatarJames Almer <jamrial@gmail.com>
parent 3385989b
......@@ -117,6 +117,57 @@ align 16
.ret:
REP_RET
;***************************************************************************
;void ps_stereo_interpolate_ipdopd_sse3(float (*l)[2], float (*r)[2],
; float h[2][4], float h_step[2][4],
; int len);
;***************************************************************************
INIT_XMM sse3
cglobal ps_stereo_interpolate_ipdopd, 5, 5, 10, l, r, h, h_step, n
cmp nd, 0
jle .ret
movaps m0, [hq]
movaps m1, [hq+mmsize]
%if ARCH_X86_64
movaps m8, [h_stepq]
movaps m9, [h_stepq+mmsize]
%define H_STEP0 m8
%define H_STEP1 m9
%else
%define H_STEP0 [h_stepq]
%define H_STEP1 [h_stepq+mmsize]
%endif
shl nd, 3
add lq, nq
add rq, nq
neg nq
align 16
.loop:
addps m0, H_STEP0
addps m1, H_STEP1
movddup m2, [lq+nq]
movddup m3, [rq+nq]
shufps m4, m2, m2, q2301
shufps m5, m3, m3, q2301
unpcklps m6, m0, m0
unpckhps m7, m0, m0
mulps m2, m6
mulps m3, m7
unpcklps m6, m1, m1
unpckhps m7, m1, m1
mulps m4, m6
mulps m5, m7
addps m2, m3
addsubps m4, m5
addsubps m2, m4
movsd [lq+nq], m2
movhps [rq+nq], m2
add nq, 8
jl .loop
.ret:
REP_RET
;*******************************************************************
;void ff_ps_hybrid_analysis_<opt>(float (*out)[2], float (*in)[2],
; const float (*filter)[8][2],
......
......@@ -37,6 +37,9 @@ void ff_ps_hybrid_analysis_sse3(float (*out)[2], float (*in)[2],
void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
float h[2][4], float h_step[2][4],
int len);
void ff_ps_stereo_interpolate_ipdopd_sse3(float (*l)[2], float (*r)[2],
float h[2][4], float h_step[2][4],
int len);
av_cold void ff_psdsp_init_x86(PSDSPContext *s)
{
......@@ -50,6 +53,7 @@ av_cold void ff_psdsp_init_x86(PSDSPContext *s)
if (EXTERNAL_SSE3(cpu_flags)) {
s->add_squares = ff_ps_add_squares_sse3;
s->stereo_interpolate[0] = ff_ps_stereo_interpolate_sse3;
s->stereo_interpolate[1] = ff_ps_stereo_interpolate_ipdopd_sse3;
s->hybrid_analysis = ff_ps_hybrid_analysis_sse3;
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment