Commit 5cc6d524 authored by Justin Ruggles

lavr: replace the SSE version of ff_conv_fltp_to_flt_6ch() with SSE4 and AVX

The current SSE version is slower than the MMX version on Athlon64 and Sandy
Bridge, but the SSE4 and AVX versions are faster on Sandy Bridge.
parent 0b45334a
...@@ -54,26 +54,24 @@ cglobal conv_fltp_to_flt_6ch, 2,8,7, dst, src, src1, src2, src3, src4, src5, len ...@@ -54,26 +54,24 @@ cglobal conv_fltp_to_flt_6ch, 2,8,7, dst, src, src1, src2, src3, src4, src5, len
mova m3, [srcq+src3q] mova m3, [srcq+src3q]
mova m4, [srcq+src4q] mova m4, [srcq+src4q]
mova m5, [srcq+src5q] mova m5, [srcq+src5q]
%if cpuflag(sse) %if cpuflag(sse4)
SBUTTERFLYPS 0, 1, 6 SBUTTERFLYPS 0, 1, 6
SBUTTERFLYPS 2, 3, 6 SBUTTERFLYPS 2, 3, 6
SBUTTERFLYPS 4, 5, 6 SBUTTERFLYPS 4, 5, 6
movaps m6, m4 blendps m6, m4, m0, 1100b
shufps m4, m0, q3210
movlhps m0, m2 movlhps m0, m2
movhlps m6, m2 movhlps m4, m2
movaps [dstq ], m0 blendps m2, m5, m1, 1100b
movaps [dstq+16], m4
movaps [dstq+32], m6
movaps m6, m5
shufps m5, m1, q3210
movlhps m1, m3 movlhps m1, m3
movhlps m6, m3 movhlps m5, m3
movaps [dstq ], m0
movaps [dstq+16], m6
movaps [dstq+32], m4
movaps [dstq+48], m1 movaps [dstq+48], m1
movaps [dstq+64], m5 movaps [dstq+64], m2
movaps [dstq+80], m6 movaps [dstq+80], m5
%else ; mmx %else ; mmx
SBUTTERFLY dq, 0, 1, 6 SBUTTERFLY dq, 0, 1, 6
SBUTTERFLY dq, 2, 3, 6 SBUTTERFLY dq, 2, 3, 6
...@@ -100,5 +98,9 @@ cglobal conv_fltp_to_flt_6ch, 2,8,7, dst, src, src1, src2, src3, src4, src5, len ...@@ -100,5 +98,9 @@ cglobal conv_fltp_to_flt_6ch, 2,8,7, dst, src, src1, src2, src3, src4, src5, len
INIT_MMX mmx INIT_MMX mmx
CONV_FLTP_TO_FLT_6CH CONV_FLTP_TO_FLT_6CH
INIT_XMM sse INIT_XMM sse4
CONV_FLTP_TO_FLT_6CH
%if HAVE_AVX
INIT_XMM avx
CONV_FLTP_TO_FLT_6CH CONV_FLTP_TO_FLT_6CH
%endif
...@@ -22,8 +22,9 @@ ...@@ -22,8 +22,9 @@
#include "libavutil/cpu.h" #include "libavutil/cpu.h"
#include "libavresample/audio_convert.h" #include "libavresample/audio_convert.h"
extern void ff_conv_fltp_to_flt_6ch_mmx(float *dst, float *const *src, int len); extern void ff_conv_fltp_to_flt_6ch_mmx (float *dst, float *const *src, int len);
extern void ff_conv_fltp_to_flt_6ch_sse(float *dst, float *const *src, int len); extern void ff_conv_fltp_to_flt_6ch_sse4(float *dst, float *const *src, int len);
extern void ff_conv_fltp_to_flt_6ch_avx (float *dst, float *const *src, int len);
av_cold void ff_audio_convert_init_x86(AudioConvert *ac) av_cold void ff_audio_convert_init_x86(AudioConvert *ac)
{ {
...@@ -34,9 +35,13 @@ av_cold void ff_audio_convert_init_x86(AudioConvert *ac) ...@@ -34,9 +35,13 @@ av_cold void ff_audio_convert_init_x86(AudioConvert *ac)
ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_FLTP, ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_FLTP,
6, 1, 4, "MMX", ff_conv_fltp_to_flt_6ch_mmx); 6, 1, 4, "MMX", ff_conv_fltp_to_flt_6ch_mmx);
} }
if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE) { if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE) {
ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_FLTP, ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_FLTP,
6, 16, 4, "SSE", ff_conv_fltp_to_flt_6ch_sse); 6, 16, 4, "SSE4", ff_conv_fltp_to_flt_6ch_sse4);
}
if (mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) {
ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_FLTP,
6, 16, 4, "AVX", ff_conv_fltp_to_flt_6ch_avx);
} }
#endif #endif
} }
...@@ -42,10 +42,9 @@ ...@@ -42,10 +42,9 @@
%endmacro %endmacro
%macro SBUTTERFLYPS 3 %macro SBUTTERFLYPS 3
movaps m%3, m%1 unpcklps m%3, m%1, m%2
unpcklps m%1, m%2 unpckhps m%1, m%1, m%2
unpckhps m%3, m%2 SWAP %1, %3, %2
SWAP %2, %3
%endmacro %endmacro
%macro TRANSPOSE4x4B 5 %macro TRANSPOSE4x4B 5
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment