Commit faa1471f authored by Ronald S. Bultje, committed by Michael Niedermayer

swr: rewrite resample_common/linear_float_sse/avx in yasm.

Linear interpolation goes from 63 (llvm) or 58 (gcc) to 48 (yasm)
cycles/sample on 64bit, or from 66 (llvm/gcc) to 52 (yasm) cycles/
sample on 32bit. Bon-linear goes from 43 (llvm) or 38 (gcc) to
32 (yasm) cycles/sample on 64bit, or from 46 (llvm) or 44 (gcc) to
38 (yasm) cycles/sample on 32bit (all testing on OSX 10.9.2, llvm
5.1 and gcc 4.8/9).
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
parent a348f4be
......@@ -4460,8 +4460,7 @@ EOF
check_inline_asm inline_asm_direct_symbol_refs '"movl '$extern_prefix'test, %eax"' ||
check_inline_asm inline_asm_direct_symbol_refs '"movl '$extern_prefix'test(%rip), %eax"'
# check whether binutils is new enough to compile AVX/SSSE3/MMXEXT
enabled avx && check_inline_asm avx_inline '"vextractf128 $1, %ymm0, %xmm1"'
# check whether binutils is new enough to compile SSSE3/MMXEXT
enabled ssse3 && check_inline_asm ssse3_inline '"pabsw %xmm0, %xmm0"'
enabled mmxext && check_inline_asm mmxext_inline '"pmaxub %mm0, %mm1"'
......
......@@ -43,9 +43,7 @@
# define RENAME(N) N ## _double_sse2
# endif
#elif defined(TEMPLATE_RESAMPLE_FLT) \
|| defined(TEMPLATE_RESAMPLE_FLT_SSE) \
|| defined(TEMPLATE_RESAMPLE_FLT_AVX)
#elif defined(TEMPLATE_RESAMPLE_FLT)
# define FILTER_SHIFT 0
# define DELEM float
......@@ -56,14 +54,6 @@
# if defined(TEMPLATE_RESAMPLE_FLT)
# define RENAME(N) N ## _float
# elif defined(TEMPLATE_RESAMPLE_FLT_SSE)
# define COMMON_CORE COMMON_CORE_FLT_SSE
# define LINEAR_CORE LINEAR_CORE_FLT_SSE
# define RENAME(N) N ## _float_sse
# elif defined(TEMPLATE_RESAMPLE_FLT_AVX)
# define COMMON_CORE COMMON_CORE_FLT_AVX
# define LINEAR_CORE LINEAR_CORE_FLT_AVX
# define RENAME(N) N ## _float_avx
# endif
#elif defined(TEMPLATE_RESAMPLE_S32)
......
YASM-OBJS += x86/swresample_x86.o\
x86/audio_convert.o\
x86/rematrix.o\
x86/resample.o\
OBJS += x86/resample_x86_dsp.o\
......
This diff is collapsed.
......@@ -132,124 +132,6 @@ __asm__ volatile(\
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3")\
);
/*
 * SSE inner loop for the float "common" (single-phase) resampling core.
 *
 * Computes dst[dst_index] = sum over c->filter_length samples of
 * src[sample_index + i] * filter[i].  The loop counter %0 starts at the
 * negative byte length (-4 * filter_length) and counts up toward zero
 * ("add $16" = 4 floats per iteration, loop while sign flag set), so the
 * input operands bias their base pointers by -len to compensate.
 * After the loop, the 4 partial sums in xmm0 are reduced horizontally
 * (movhlps folds the high pair onto the low pair, shufps $1 brings lane 1
 * down) and the scalar result is stored with movss to dst+dst_index.
 *
 * Expects in scope: c (ResampleContext*), src, filter, dst, sample_index,
 * dst_index.  movups is used for src since it may be unaligned.
 * NOTE(review): there is no scalar tail loop — assumes filter_length is a
 * multiple of 4 floats; confirm against the filter allocation code.
 */
#define COMMON_CORE_FLT_SSE \
x86_reg len= -4*c->filter_length;\
__asm__ volatile(\
"xorps %%xmm0, %%xmm0 \n\t"\
"1: \n\t"\
"movups (%1, %0), %%xmm1 \n\t"\
"mulps (%2, %0), %%xmm1 \n\t"\
"addps %%xmm1, %%xmm0 \n\t"\
"add $16, %0 \n\t"\
" js 1b \n\t"\
"movhlps %%xmm0, %%xmm1 \n\t"\
"addps %%xmm1, %%xmm0 \n\t"\
"movss %%xmm0, %%xmm1 \n\t"\
"shufps $1, %%xmm0, %%xmm0 \n\t"\
"addps %%xmm1, %%xmm0 \n\t"\
"movss %%xmm0, (%3) \n\t"\
: "+r" (len)\
: "r" (((uint8_t*)(src+sample_index))-len),\
"r" (((uint8_t*)filter)-len),\
"r" (dst+dst_index)\
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1")\
);
/*
 * SSE inner loop for the float "linear" (two-phase) resampling core.
 *
 * Computes two dot products of the same source window against two
 * adjacent filter phases in one pass:
 *   val = sum src[sample_index + i] * filter[i]
 *   v2  = sum src[sample_index + i] * filter[c->filter_alloc + i]
 * The caller interpolates linearly between val and v2.
 *
 * Same negative-byte-counter trick as COMMON_CORE_FLT_SSE: %0 runs from
 * -4*filter_length up to 0 in 16-byte (4-float) steps, with the three
 * base pointers (%3 src, %4 filter phase 0, %5 filter phase 1) biased by
 * -len.  xmm0/xmm2 hold the two vector accumulators; each is reduced
 * horizontally (movhlps + shufps $1) and the scalar sums are stored to
 * the "val" and "v2" memory outputs.
 * NOTE(review): no scalar tail — assumes filter_length is a multiple of
 * 4 floats; confirm against the filter allocation code.
 */
#define LINEAR_CORE_FLT_SSE \
x86_reg len= -4*c->filter_length;\
__asm__ volatile(\
"xorps %%xmm0, %%xmm0 \n\t"\
"xorps %%xmm2, %%xmm2 \n\t"\
"1: \n\t"\
"movups (%3, %0), %%xmm1 \n\t"\
"movaps %%xmm1, %%xmm3 \n\t"\
"mulps (%4, %0), %%xmm1 \n\t"\
"mulps (%5, %0), %%xmm3 \n\t"\
"addps %%xmm1, %%xmm0 \n\t"\
"addps %%xmm3, %%xmm2 \n\t"\
"add $16, %0 \n\t"\
" js 1b \n\t"\
"movhlps %%xmm0, %%xmm1 \n\t"\
"movhlps %%xmm2, %%xmm3 \n\t"\
"addps %%xmm1, %%xmm0 \n\t"\
"addps %%xmm3, %%xmm2 \n\t"\
"movss %%xmm0, %%xmm1 \n\t"\
"movss %%xmm2, %%xmm3 \n\t"\
"shufps $1, %%xmm0, %%xmm0 \n\t"\
"shufps $1, %%xmm2, %%xmm2 \n\t"\
"addps %%xmm1, %%xmm0 \n\t"\
"addps %%xmm3, %%xmm2 \n\t"\
"movss %%xmm0, %1 \n\t"\
"movss %%xmm2, %2 \n\t"\
: "+r" (len),\
"=m" (val),\
"=m" (v2)\
: "r" (((uint8_t*)(src+sample_index))-len),\
"r" (((uint8_t*)filter)-len),\
"r" (((uint8_t*)(filter+c->filter_alloc))-len)\
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3")\
);
/*
 * AVX inner loop for the float "common" (single-phase) resampling core.
 *
 * Same computation as COMMON_CORE_FLT_SSE — the dot product of
 * src[sample_index..] with filter[] stored to dst[dst_index] — but
 * processes 8 floats per iteration in ymm registers ("add $32").
 * Counter %0 runs from -4*filter_length bytes up to 0, with base
 * pointers biased by -len.  The horizontal reduction first folds the
 * high 128-bit lane onto the low one with vextractf128, then proceeds
 * like the SSE version (vmovhlps, vshufps $1, final vaddss), and the
 * scalar sum is stored with vmovss.
 * NOTE(review): no scalar tail — assumes filter_length is a multiple of
 * 8 floats here; confirm against the filter allocation code.
 */
#define COMMON_CORE_FLT_AVX \
x86_reg len= -4*c->filter_length;\
__asm__ volatile(\
"vxorps %%ymm0, %%ymm0, %%ymm0 \n\t"\
"1: \n\t"\
"vmovups (%1, %0), %%ymm1 \n\t"\
"vmulps (%2, %0), %%ymm1, %%ymm1 \n\t"\
"vaddps %%ymm1, %%ymm0, %%ymm0 \n\t"\
"add $32, %0 \n\t"\
" js 1b \n\t"\
"vextractf128 $1, %%ymm0, %%xmm1 \n\t"\
"vaddps %%xmm1, %%xmm0, %%xmm0 \n\t"\
"vmovhlps %%xmm0, %%xmm1, %%xmm1 \n\t"\
"vaddps %%xmm1, %%xmm0, %%xmm0 \n\t"\
"vshufps $1, %%xmm0, %%xmm0, %%xmm1 \n\t"\
"vaddss %%xmm1, %%xmm0, %%xmm0 \n\t"\
"vmovss %%xmm0, (%3) \n\t"\
: "+r" (len)\
: "r" (((uint8_t*)(src+sample_index))-len),\
"r" (((uint8_t*)filter)-len),\
"r" (dst+dst_index)\
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1")\
);
/*
 * AVX inner loop for the float "linear" (two-phase) resampling core.
 *
 * AVX counterpart of LINEAR_CORE_FLT_SSE: computes, in one pass over the
 * source window, the two dot products
 *   val = sum src[sample_index + i] * filter[i]
 *   v2  = sum src[sample_index + i] * filter[c->filter_alloc + i]
 * which the caller linearly interpolates between.  Processes 8 floats
 * per iteration ("add $32") with ymm0/ymm2 as the two accumulators;
 * each source load (vmovups into ymm1) feeds both multiplies via
 * three-operand AVX forms.  Reduction: vextractf128 folds the high lane,
 * then vmovhlps/vshufps $1/vaddss as in the SSE path, and the scalar
 * sums go to the "val" and "v2" memory outputs.
 * NOTE(review): no scalar tail — assumes filter_length is a multiple of
 * 8 floats here; confirm against the filter allocation code.
 */
#define LINEAR_CORE_FLT_AVX \
x86_reg len= -4*c->filter_length;\
__asm__ volatile(\
"vxorps %%ymm0, %%ymm0, %%ymm0 \n\t"\
"vxorps %%ymm2, %%ymm2, %%ymm2 \n\t"\
"1: \n\t"\
"vmovups (%3, %0), %%ymm1 \n\t"\
"vmulps (%5, %0), %%ymm1, %%ymm3 \n\t"\
"vmulps (%4, %0), %%ymm1, %%ymm1 \n\t"\
"vaddps %%ymm1, %%ymm0, %%ymm0 \n\t"\
"vaddps %%ymm3, %%ymm2, %%ymm2 \n\t"\
"add $32, %0 \n\t"\
" js 1b \n\t"\
"vextractf128 $1, %%ymm0, %%xmm1 \n\t"\
"vextractf128 $1, %%ymm2, %%xmm3 \n\t"\
"vaddps %%xmm1, %%xmm0, %%xmm0 \n\t"\
"vaddps %%xmm3, %%xmm2, %%xmm2 \n\t"\
"vmovhlps %%xmm0, %%xmm1, %%xmm1 \n\t"\
"vmovhlps %%xmm2, %%xmm3, %%xmm3 \n\t"\
"vaddps %%xmm1, %%xmm0, %%xmm0 \n\t"\
"vaddps %%xmm3, %%xmm2, %%xmm2 \n\t"\
"vshufps $1, %%xmm0, %%xmm0, %%xmm1 \n\t"\
"vshufps $1, %%xmm2, %%xmm2, %%xmm3 \n\t"\
"vaddss %%xmm1, %%xmm0, %%xmm0 \n\t"\
"vaddss %%xmm3, %%xmm2, %%xmm2 \n\t"\
"vmovss %%xmm0, %1 \n\t"\
"vmovss %%xmm2, %2 \n\t"\
: "+r" (len),\
"=m" (val),\
"=m" (v2)\
: "r" (((uint8_t*)(src+sample_index))-len),\
"r" (((uint8_t*)filter)-len),\
"r" (((uint8_t*)(filter+c->filter_alloc))-len)\
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3")\
);
#define COMMON_CORE_DBL_SSE2 \
x86_reg len= -8*c->filter_length;\
__asm__ volatile(\
......
......@@ -50,12 +50,6 @@ int swri_resample_linear_double_sse2(ResampleContext *c, double *dst, const do
#undef TEMPLATE_RESAMPLE_S16_MMX2
#endif
#if HAVE_SSE_INLINE
#define TEMPLATE_RESAMPLE_FLT_SSE
#include "libswresample/resample_template.c"
#undef TEMPLATE_RESAMPLE_FLT_SSE
#endif
#if HAVE_SSE2_INLINE
#define TEMPLATE_RESAMPLE_S16_SSE2
#include "libswresample/resample_template.c"
......@@ -66,16 +60,20 @@ int swri_resample_linear_double_sse2(ResampleContext *c, double *dst, const do
#undef TEMPLATE_RESAMPLE_DBL_SSE2
#endif
#if HAVE_AVX_INLINE
#define TEMPLATE_RESAMPLE_FLT_AVX
#include "libswresample/resample_template.c"
#undef TEMPLATE_RESAMPLE_FLT_AVX
#endif
#undef DO_RESAMPLE_ONE
#endif // HAVE_MMXEXT_INLINE
int ff_resample_common_float_sse(ResampleContext *c, uint8_t *dst,
const uint8_t *src, int sz, int upd);
int ff_resample_linear_float_sse(ResampleContext *c, uint8_t *dst,
const uint8_t *src, int sz, int upd);
int ff_resample_common_float_avx(ResampleContext *c, uint8_t *dst,
const uint8_t *src, int sz, int upd);
int ff_resample_linear_float_avx(ResampleContext *c, uint8_t *dst,
const uint8_t *src, int sz, int upd);
void swresample_dsp_x86_init(ResampleContext *c)
{
int av_unused mm_flags = av_get_cpu_flags();
......@@ -85,9 +83,9 @@ void swresample_dsp_x86_init(ResampleContext *c)
c->dsp.resample_common[FNIDX(S16P)] = (resample_fn) swri_resample_common_int16_mmx2;
c->dsp.resample_linear[FNIDX(S16P)] = (resample_fn) swri_resample_linear_int16_mmx2;
}
if (HAVE_SSE_INLINE && mm_flags & AV_CPU_FLAG_SSE) {
c->dsp.resample_common[FNIDX(FLTP)] = (resample_fn) swri_resample_common_float_sse;
c->dsp.resample_linear[FNIDX(FLTP)] = (resample_fn) swri_resample_linear_float_sse;
if (HAVE_SSE_EXTERNAL && mm_flags & AV_CPU_FLAG_SSE) {
c->dsp.resample_common[FNIDX(FLTP)] = ff_resample_common_float_sse;
c->dsp.resample_linear[FNIDX(FLTP)] = ff_resample_linear_float_sse;
}
if (HAVE_SSE2_INLINE && mm_flags & AV_CPU_FLAG_SSE2) {
c->dsp.resample_common[FNIDX(S16P)] = (resample_fn) swri_resample_common_int16_sse2;
......@@ -95,8 +93,8 @@ void swresample_dsp_x86_init(ResampleContext *c)
c->dsp.resample_common[FNIDX(DBLP)] = (resample_fn) swri_resample_common_double_sse2;
c->dsp.resample_linear[FNIDX(DBLP)] = (resample_fn) swri_resample_linear_double_sse2;
}
if (HAVE_AVX_INLINE && mm_flags & AV_CPU_FLAG_AVX) {
c->dsp.resample_common[FNIDX(FLTP)] = (resample_fn) swri_resample_common_float_avx;
c->dsp.resample_linear[FNIDX(FLTP)] = (resample_fn) swri_resample_linear_float_avx;
if (HAVE_AVX_EXTERNAL && mm_flags & AV_CPU_FLAG_AVX) {
c->dsp.resample_common[FNIDX(FLTP)] = ff_resample_common_float_avx;
c->dsp.resample_linear[FNIDX(FLTP)] = ff_resample_linear_float_avx;
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment