Commit 1a69224f authored by James Almer, committed by Michael Niedermayer

x86/swr: add ff_resample_{common, linear}_float_fma

Signed-off-by: James Almer <jamrial@gmail.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
parent a441a243
...@@ -179,17 +179,16 @@ cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \ ...@@ -179,17 +179,16 @@ cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \
pmaddwd m1, [filterq+min_filter_count_x4q*1] pmaddwd m1, [filterq+min_filter_count_x4q*1]
paddd m0, m1 paddd m0, m1
%else ; float/double %else ; float/double
%if cpuflag(fma4) || cpuflag(fma3)
fmaddp%4 m0, m1, [filterq+min_filter_count_x4q*1], m0
%else
mulp%4 m1, m1, [filterq+min_filter_count_x4q*1] mulp%4 m1, m1, [filterq+min_filter_count_x4q*1]
addp%4 m0, m0, m1 addp%4 m0, m0, m1
%endif ; cpuflag
%endif %endif
add min_filter_count_x4q, mmsize add min_filter_count_x4q, mmsize
js .inner_loop js .inner_loop
%if cpuflag(avx)
vextractf128 xm1, m0, 0x1
addps xm0, xm1
%endif
%ifidn %1, int16 %ifidn %1, int16
%if mmsize == 16 %if mmsize == 16
pshufd m1, m0, q0032 pshufd m1, m0, q0032
...@@ -206,6 +205,10 @@ cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \ ...@@ -206,6 +205,10 @@ cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \
movd [dstq], m0 movd [dstq], m0
%else ; float/double %else ; float/double
; horizontal sum & store ; horizontal sum & store
%if mmsize == 32
vextractf128 xm1, m0, 0x1
addps xm0, xm1
%endif
movhlps xm1, xm0 movhlps xm1, xm0
%ifidn %1, float %ifidn %1, float
addps xm0, xm1 addps xm0, xm1
...@@ -429,21 +432,19 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \ ...@@ -429,21 +432,19 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
paddd m2, m3 paddd m2, m3
paddd m0, m1 paddd m0, m1
%else ; float/double %else ; float/double
%if cpuflag(fma4) || cpuflag(fma3)
fmaddp%4 m2, m1, [filter2q+min_filter_count_x4q*1], m2
fmaddp%4 m0, m1, [filter1q+min_filter_count_x4q*1], m0
%else
mulp%4 m3, m1, [filter2q+min_filter_count_x4q*1] mulp%4 m3, m1, [filter2q+min_filter_count_x4q*1]
mulp%4 m1, m1, [filter1q+min_filter_count_x4q*1] mulp%4 m1, m1, [filter1q+min_filter_count_x4q*1]
addp%4 m2, m2, m3 addp%4 m2, m2, m3
addp%4 m0, m0, m1 addp%4 m0, m0, m1
%endif ; cpuflag
%endif %endif
add min_filter_count_x4q, mmsize add min_filter_count_x4q, mmsize
js .inner_loop js .inner_loop
%if cpuflag(avx)
vextractf128 xm1, m0, 0x1
vextractf128 xm3, m2, 0x1
addps xm0, xm1
addps xm2, xm3
%endif
%ifidn %1, int16 %ifidn %1, int16
%if mmsize == 16 %if mmsize == 16
pshufd m3, m2, q0032 pshufd m3, m2, q0032
...@@ -479,12 +480,22 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \ ...@@ -479,12 +480,22 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
; - unix64: eax=r6[filter1], edx=r2[todo] ; - unix64: eax=r6[filter1], edx=r2[todo]
%else ; float/double %else ; float/double
; val += (v2 - val) * (FELEML) frac / c->src_incr; ; val += (v2 - val) * (FELEML) frac / c->src_incr;
%if mmsize == 32
vextractf128 xm1, m0, 0x1
vextractf128 xm3, m2, 0x1
addps xm0, xm1
addps xm2, xm3
%endif
cvtsi2s%4 xm1, fracd cvtsi2s%4 xm1, fracd
subp%4 xm2, xm0 subp%4 xm2, xm0
mulp%4 xm1, xm4 mulp%4 xm1, xm4
shufp%4 xm1, xm1, q0000 shufp%4 xm1, xm1, q0000
%if cpuflag(fma4) || cpuflag(fma3)
fmaddp%4 xm0, xm2, xm1, xm0
%else
mulp%4 xm2, xm1 mulp%4 xm2, xm1
addp%4 xm0, xm2 addp%4 xm0, xm2
%endif ; cpuflag
; horizontal sum & store ; horizontal sum & store
movhlps xm1, xm0 movhlps xm1, xm0
...@@ -564,6 +575,14 @@ RESAMPLE_FNS float, 4, 2, s, pf_1 ...@@ -564,6 +575,14 @@ RESAMPLE_FNS float, 4, 2, s, pf_1
INIT_YMM avx INIT_YMM avx
RESAMPLE_FNS float, 4, 2, s, pf_1 RESAMPLE_FNS float, 4, 2, s, pf_1
%endif %endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
RESAMPLE_FNS float, 4, 2, s, pf_1
%endif
%if HAVE_FMA4_EXTERNAL
INIT_XMM fma4
RESAMPLE_FNS float, 4, 2, s, pf_1
%endif
%if ARCH_X86_32 %if ARCH_X86_32
INIT_MMX mmxext INIT_MMX mmxext
......
...@@ -27,30 +27,19 @@ ...@@ -27,30 +27,19 @@
#include "libswresample/resample.h" #include "libswresample/resample.h"
int ff_resample_common_int16_mmxext(ResampleContext *c, uint8_t *dst, #define RESAMPLE_FUNCS(type, opt) \
const uint8_t *src, int sz, int upd); int ff_resample_common_##type##_##opt(ResampleContext *c, uint8_t *dst, \
int ff_resample_linear_int16_mmxext(ResampleContext *c, uint8_t *dst, const uint8_t *src, int sz, int upd); \
const uint8_t *src, int sz, int upd); int ff_resample_linear_##type##_##opt(ResampleContext *c, uint8_t *dst, \
const uint8_t *src, int sz, int upd)
int ff_resample_common_int16_sse2(ResampleContext *c, uint8_t *dst, RESAMPLE_FUNCS(int16, mmxext);
const uint8_t *src, int sz, int upd); RESAMPLE_FUNCS(int16, sse2);
int ff_resample_linear_int16_sse2(ResampleContext *c, uint8_t *dst, RESAMPLE_FUNCS(float, sse);
const uint8_t *src, int sz, int upd); RESAMPLE_FUNCS(float, avx);
RESAMPLE_FUNCS(float, fma3);
int ff_resample_common_float_sse(ResampleContext *c, uint8_t *dst, RESAMPLE_FUNCS(float, fma4);
const uint8_t *src, int sz, int upd); RESAMPLE_FUNCS(double, sse2);
int ff_resample_linear_float_sse(ResampleContext *c, uint8_t *dst,
const uint8_t *src, int sz, int upd);
int ff_resample_common_float_avx(ResampleContext *c, uint8_t *dst,
const uint8_t *src, int sz, int upd);
int ff_resample_linear_float_avx(ResampleContext *c, uint8_t *dst,
const uint8_t *src, int sz, int upd);
int ff_resample_common_double_sse2(ResampleContext *c, uint8_t *dst,
const uint8_t *src, int sz, int upd);
int ff_resample_linear_double_sse2(ResampleContext *c, uint8_t *dst,
const uint8_t *src, int sz, int upd);
void swresample_dsp_x86_init(ResampleContext *c) void swresample_dsp_x86_init(ResampleContext *c)
{ {
...@@ -76,4 +65,12 @@ void swresample_dsp_x86_init(ResampleContext *c) ...@@ -76,4 +65,12 @@ void swresample_dsp_x86_init(ResampleContext *c)
c->dsp.resample_common[FNIDX(FLTP)] = ff_resample_common_float_avx; c->dsp.resample_common[FNIDX(FLTP)] = ff_resample_common_float_avx;
c->dsp.resample_linear[FNIDX(FLTP)] = ff_resample_linear_float_avx; c->dsp.resample_linear[FNIDX(FLTP)] = ff_resample_linear_float_avx;
} }
if (HAVE_FMA3_EXTERNAL && mm_flags & AV_CPU_FLAG_FMA3) {
c->dsp.resample_common[FNIDX(FLTP)] = ff_resample_common_float_fma3;
c->dsp.resample_linear[FNIDX(FLTP)] = ff_resample_linear_float_fma3;
}
if (HAVE_FMA4_EXTERNAL && mm_flags & AV_CPU_FLAG_FMA4) {
c->dsp.resample_common[FNIDX(FLTP)] = ff_resample_common_float_fma4;
c->dsp.resample_linear[FNIDX(FLTP)] = ff_resample_linear_float_fma4;
}
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment