Commit f1d80bc6 authored by James Almer

x86/float_dsp: add ff_vector_fmul_reverse_avx2

~20% faster than AVX.
Signed-off-by: James Almer <jamrial@gmail.com>
parent 5b441d29
...@@ -22,6 +22,9 @@ ...@@ -22,6 +22,9 @@
%include "x86util.asm" %include "x86util.asm"
SECTION_RODATA 32
pd_reverse: dd 7, 6, 5, 4, 3, 2, 1, 0
SECTION .text SECTION .text
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
...@@ -359,10 +362,16 @@ VECTOR_FMUL_ADD ...@@ -359,10 +362,16 @@ VECTOR_FMUL_ADD
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%macro VECTOR_FMUL_REVERSE 0 %macro VECTOR_FMUL_REVERSE 0
cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
%if cpuflag(avx2)
mova m2, [pd_reverse]
%endif
lea lenq, [lend*4 - 2*mmsize] lea lenq, [lend*4 - 2*mmsize]
ALIGN 16 ALIGN 16
.loop: .loop:
%if cpuflag(avx) %if cpuflag(avx2)
vpermd m0, m2, [src1q]
vpermd m1, m2, [src1q+mmsize]
%elif cpuflag(avx)
vmovaps xmm0, [src1q + 16] vmovaps xmm0, [src1q + 16]
vinsertf128 m0, m0, [src1q], 1 vinsertf128 m0, m0, [src1q], 1
vshufps m0, m0, m0, q0123 vshufps m0, m0, m0, q0123
...@@ -391,6 +400,10 @@ VECTOR_FMUL_REVERSE ...@@ -391,6 +400,10 @@ VECTOR_FMUL_REVERSE
INIT_YMM avx INIT_YMM avx
VECTOR_FMUL_REVERSE VECTOR_FMUL_REVERSE
%endif %endif
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
VECTOR_FMUL_REVERSE
%endif
; float scalarproduct_float_sse(const float *v1, const float *v2, int len) ; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
INIT_XMM sse INIT_XMM sse
......
...@@ -67,6 +67,8 @@ void ff_vector_fmul_reverse_sse(float *dst, const float *src0, ...@@ -67,6 +67,8 @@ void ff_vector_fmul_reverse_sse(float *dst, const float *src0,
const float *src1, int len); const float *src1, int len);
void ff_vector_fmul_reverse_avx(float *dst, const float *src0, void ff_vector_fmul_reverse_avx(float *dst, const float *src0,
const float *src1, int len); const float *src1, int len);
void ff_vector_fmul_reverse_avx2(float *dst, const float *src0,
const float *src1, int len);
float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order); float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
...@@ -101,6 +103,9 @@ av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp) ...@@ -101,6 +103,9 @@ av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
fdsp->vector_fmul_add = ff_vector_fmul_add_avx; fdsp->vector_fmul_add = ff_vector_fmul_add_avx;
fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx; fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx;
} }
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx2;
}
if (EXTERNAL_FMA3_FAST(cpu_flags)) { if (EXTERNAL_FMA3_FAST(cpu_flags)) {
fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_fma3; fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_fma3;
fdsp->vector_fmul_add = ff_vector_fmul_add_fma3; fdsp->vector_fmul_add = ff_vector_fmul_add_fma3;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment