Commit 847bb638, authored by Ronald S. Bultje, committed by Michael Niedermayer

swr: convert resample_common/linear_int16_mmx2/sse2 to yasm.

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
parent e5c806fd
...@@ -44,17 +44,15 @@ ...@@ -44,17 +44,15 @@
#elif defined(TEMPLATE_RESAMPLE_FLT) #elif defined(TEMPLATE_RESAMPLE_FLT)
# define RENAME(N) N ## _float
# define FILTER_SHIFT 0 # define FILTER_SHIFT 0
# define DELEM float # define DELEM float
# define FELEM float # define FELEM float
# define FELEM2 float # define FELEM2 float
# define OUT(d, v) d = v # define OUT(d, v) d = v
# if defined(TEMPLATE_RESAMPLE_FLT)
# define RENAME(N) N ## _float
# endif
#elif defined(TEMPLATE_RESAMPLE_S32) #elif defined(TEMPLATE_RESAMPLE_S32)
# define RENAME(N) N ## _int32 # define RENAME(N) N ## _int32
# define FILTER_SHIFT 30 # define FILTER_SHIFT 30
# define DELEM int32_t # define DELEM int32_t
...@@ -65,10 +63,9 @@ ...@@ -65,10 +63,9 @@
# define OUT(d, v) v = (v + (1<<(FILTER_SHIFT-1)))>>FILTER_SHIFT;\ # define OUT(d, v) v = (v + (1<<(FILTER_SHIFT-1)))>>FILTER_SHIFT;\
d = (uint64_t)(v + 0x80000000) > 0xFFFFFFFF ? (v>>63) ^ 0x7FFFFFFF : v d = (uint64_t)(v + 0x80000000) > 0xFFFFFFFF ? (v>>63) ^ 0x7FFFFFFF : v
#elif defined(TEMPLATE_RESAMPLE_S16) \ #elif defined(TEMPLATE_RESAMPLE_S16)
|| defined(TEMPLATE_RESAMPLE_S16_MMX2) \
|| defined(TEMPLATE_RESAMPLE_S16_SSE2)
# define RENAME(N) N ## _int16
# define FILTER_SHIFT 15 # define FILTER_SHIFT 15
# define DELEM int16_t # define DELEM int16_t
# define FELEM int16_t # define FELEM int16_t
...@@ -79,18 +76,6 @@ ...@@ -79,18 +76,6 @@
# define OUT(d, v) v = (v + (1<<(FILTER_SHIFT-1)))>>FILTER_SHIFT;\ # define OUT(d, v) v = (v + (1<<(FILTER_SHIFT-1)))>>FILTER_SHIFT;\
d = (unsigned)(v + 32768) > 65535 ? (v>>31) ^ 32767 : v d = (unsigned)(v + 32768) > 65535 ? (v>>31) ^ 32767 : v
# if defined(TEMPLATE_RESAMPLE_S16)
# define RENAME(N) N ## _int16
# elif defined(TEMPLATE_RESAMPLE_S16_MMX2)
# define COMMON_CORE COMMON_CORE_INT16_MMX2
# define LINEAR_CORE LINEAR_CORE_INT16_MMX2
# define RENAME(N) N ## _int16_mmx2
# elif defined(TEMPLATE_RESAMPLE_S16_SSE2)
# define COMMON_CORE COMMON_CORE_INT16_SSE2
# define LINEAR_CORE LINEAR_CORE_INT16_SSE2
# define RENAME(N) N ## _int16_sse2
# endif
#endif #endif
#if DO_RESAMPLE_ONE #if DO_RESAMPLE_ONE
......
This diff is collapsed.
...@@ -22,116 +22,6 @@ ...@@ -22,116 +22,6 @@
#include "libavutil/cpu.h" #include "libavutil/cpu.h"
#include "libswresample/swresample_internal.h" #include "libswresample/swresample_internal.h"
/*
 * Initial accumulator value for the int16 COMMON_CORE asm kernels.
 * Only the lowest 32-bit lane is non-zero (0x4000 == 1 << 14). The kernels
 * load this constant into their accumulator register and apply
 * `psrad $15` after summing, so the constant acts as the rounding term for
 * that 15-bit right shift. Declared via DECLARE_ALIGNED(16, ...); the SSE2
 * kernel reads it with movdqa.
 */
DECLARE_ALIGNED(16, const uint64_t, ff_resample_int16_rounder)[2] = { 0x0000000000004000ULL, 0x0000000000000000ULL};
/*
 * COMMON_CORE_INT16_MMX2: inline-asm dot product of source samples with one
 * filter row, using MMX registers, storing the rounded result to
 * dst + dst_index.
 *
 * Expands to a declaration of `len` followed by one asm statement, and
 * reads `c`, `src`, `sample_index`, `filter`, `dst` and `dst_index` from the
 * scope it is expanded in.
 *
 * Operands: %0 = len, a negative byte offset starting at
 * -2 * c->filter_length and stepped by +8 per iteration; %1 and %2 are the
 * source and filter byte pointers pre-biased by -len, so (%1,%0)/(%2,%0)
 * begin at src + sample_index and filter; %3 = dst + dst_index.
 *
 * Flow: mm0 is seeded with ff_resample_int16_rounder; each iteration does a
 * pmaddwd of 8 bytes of source against 8 bytes of filter and adds the result
 * into mm0. The loop body runs before the `js` test, so it executes at
 * least once and repeats while len is still negative. Afterwards
 * pshufw $0x0E + paddd fold the upper dword into the lower, psrad $15
 * shifts, packssdw saturates to 16 bits, and movd stores.
 *
 * NOTE(review): presumably filter_length is expected to be a multiple of 4
 * elements so the 8-byte steps land exactly on 0 - confirm against the code
 * that sets filter_length.
 * NOTE(review): movd writes 32 bits at %3; if dst holds int16_t elements the
 * following element is written too - confirm callers tolerate this.
 * NOTE(review): no "memory" clobber and no mm0/mm1 clobbers are listed in
 * the visible constraint lists.
 */
#define COMMON_CORE_INT16_MMX2 \
x86_reg len= -2*c->filter_length;\
__asm__ volatile(\
"movq "MANGLE(ff_resample_int16_rounder)", %%mm0 \n\t"\
"1: \n\t"\
"movq (%1, %0), %%mm1 \n\t"\
"pmaddwd (%2, %0), %%mm1 \n\t"\
"paddd %%mm1, %%mm0 \n\t"\
"add $8, %0 \n\t"\
" js 1b \n\t"\
"pshufw $0x0E, %%mm0, %%mm1 \n\t"\
"paddd %%mm1, %%mm0 \n\t"\
"psrad $15, %%mm0 \n\t"\
"packssdw %%mm0, %%mm0 \n\t"\
"movd %%mm0, (%3) \n\t"\
: "+r" (len)\
: "r" (((uint8_t*)(src+sample_index))-len),\
"r" (((uint8_t*)filter)-len),\
"r" (dst+dst_index)\
NAMED_CONSTRAINTS_ARRAY_ADD(ff_resample_int16_rounder)\
);
/*
 * LINEAR_CORE_INT16_MMX2: inline-asm pair of dot products of the same source
 * samples against two filter rows, using MMX registers. The two raw 32-bit
 * sums are returned in `val` and `v2`.
 *
 * Expands to a declaration of `len` followed by one asm statement, and
 * reads `c`, `src`, `sample_index` and `filter` and writes `val` and `v2`
 * in the scope it is expanded in.
 *
 * Operands: %0 = len, a negative byte offset starting at
 * -2 * c->filter_length and stepped by +8 per iteration; %1 = val and
 * %2 = v2 (outputs); %3 = source, %4 = filter and
 * %5 = filter + c->filter_alloc, each as a byte pointer pre-biased by -len.
 *
 * Flow: mm0 and mm2 are zeroed; each iteration loads 8 bytes of source once,
 * copies it, and pmaddwd-accumulates it against the first filter row into
 * mm0 and against the second row into mm2. The loop body runs at least once
 * and repeats while len is negative. pshufw $0x0E + paddd then fold the
 * upper dword of each accumulator into the lower, and movd copies the low
 * dwords out. No rounding constant, shift or saturation is applied inside
 * this macro.
 *
 * NOTE(review): presumably filter_length is expected to be a multiple of 4
 * elements so the 8-byte steps land exactly on 0 - confirm.
 * NOTE(review): no mm0-mm3 clobbers are listed in the visible constraints.
 */
#define LINEAR_CORE_INT16_MMX2 \
x86_reg len= -2*c->filter_length;\
__asm__ volatile(\
"pxor %%mm0, %%mm0 \n\t"\
"pxor %%mm2, %%mm2 \n\t"\
"1: \n\t"\
"movq (%3, %0), %%mm1 \n\t"\
"movq %%mm1, %%mm3 \n\t"\
"pmaddwd (%4, %0), %%mm1 \n\t"\
"pmaddwd (%5, %0), %%mm3 \n\t"\
"paddd %%mm1, %%mm0 \n\t"\
"paddd %%mm3, %%mm2 \n\t"\
"add $8, %0 \n\t"\
" js 1b \n\t"\
"pshufw $0x0E, %%mm0, %%mm1 \n\t"\
"pshufw $0x0E, %%mm2, %%mm3 \n\t"\
"paddd %%mm1, %%mm0 \n\t"\
"paddd %%mm3, %%mm2 \n\t"\
"movd %%mm0, %1 \n\t"\
"movd %%mm2, %2 \n\t"\
: "+r" (len),\
"=r" (val),\
"=r" (v2)\
: "r" (((uint8_t*)(src+sample_index))-len),\
"r" (((uint8_t*)filter)-len),\
"r" (((uint8_t*)(filter+c->filter_alloc))-len)\
);
/*
 * COMMON_CORE_INT16_SSE2: inline-asm dot product of source samples with one
 * filter row, using XMM registers, storing the rounded result to
 * dst + dst_index.
 *
 * Expands to a declaration of `len` followed by one asm statement, and
 * reads `c`, `src`, `sample_index`, `filter`, `dst` and `dst_index` from the
 * scope it is expanded in.
 *
 * Operands: %0 = len, a negative byte offset starting at
 * -2 * c->filter_length and stepped by +16 per iteration; %1 and %2 are the
 * source and filter byte pointers pre-biased by -len; %3 = dst + dst_index.
 *
 * Flow: xmm0 is seeded with ff_resample_int16_rounder via movdqa; each
 * iteration loads 16 bytes of source with movdqu, multiplies them against
 * the filter with pmaddwd using a memory operand, and adds the result into
 * xmm0. The loop body runs at least once and repeats while len is negative.
 * pshufd $0x0E + paddd then pshufd $0x01 + paddd reduce the four dword
 * lanes into lane 0; psrad $15 shifts, packssdw saturates to 16 bits, and
 * movd stores. xmm0 and xmm1 are passed to XMM_CLOBBERS_ONLY.
 *
 * NOTE(review): presumably filter_length is expected to be a multiple of 8
 * elements and the filter row 16-byte aligned (pmaddwd reads it directly
 * from memory) - confirm against the filter allocation code.
 * NOTE(review): movd writes 32 bits at %3; if dst holds int16_t elements the
 * following element is written too - confirm callers tolerate this.
 */
#define COMMON_CORE_INT16_SSE2 \
x86_reg len= -2*c->filter_length;\
__asm__ volatile(\
"movdqa "MANGLE(ff_resample_int16_rounder)", %%xmm0 \n\t"\
"1: \n\t"\
"movdqu (%1, %0), %%xmm1 \n\t"\
"pmaddwd (%2, %0), %%xmm1 \n\t"\
"paddd %%xmm1, %%xmm0 \n\t"\
"add $16, %0 \n\t"\
" js 1b \n\t"\
"pshufd $0x0E, %%xmm0, %%xmm1 \n\t"\
"paddd %%xmm1, %%xmm0 \n\t"\
"pshufd $0x01, %%xmm0, %%xmm1 \n\t"\
"paddd %%xmm1, %%xmm0 \n\t"\
"psrad $15, %%xmm0 \n\t"\
"packssdw %%xmm0, %%xmm0 \n\t"\
"movd %%xmm0, (%3) \n\t"\
: "+r" (len)\
: "r" (((uint8_t*)(src+sample_index))-len),\
"r" (((uint8_t*)filter)-len),\
"r" (dst+dst_index)\
NAMED_CONSTRAINTS_ARRAY_ADD(ff_resample_int16_rounder)\
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1")\
);
/*
 * LINEAR_CORE_INT16_SSE2: inline-asm pair of dot products of the same source
 * samples against two filter rows, using XMM registers. The two raw 32-bit
 * sums are returned in `val` and `v2`.
 *
 * Expands to a declaration of `len` followed by one asm statement, and
 * reads `c`, `src`, `sample_index` and `filter` and writes `val` and `v2`
 * in the scope it is expanded in.
 *
 * Operands: %0 = len, a negative byte offset starting at
 * -2 * c->filter_length and stepped by +16 per iteration; %1 = val and
 * %2 = v2 (outputs); %3 = source, %4 = filter and
 * %5 = filter + c->filter_alloc, each as a byte pointer pre-biased by -len.
 *
 * Flow: xmm0 and xmm2 are zeroed; each iteration loads 16 bytes of source
 * once with movdqu, copies it, and pmaddwd-accumulates it against the first
 * filter row into xmm0 and against the second row into xmm2. The loop body
 * runs at least once and repeats while len is negative. Two pshufd/paddd
 * steps ($0x0E then $0x01) reduce the four dword lanes of each accumulator
 * into lane 0, and movd copies those out. No rounding constant, shift or
 * saturation is applied inside this macro. xmm0-xmm3 are passed to
 * XMM_CLOBBERS_ONLY.
 *
 * NOTE(review): presumably filter_length is expected to be a multiple of 8
 * elements and both filter rows 16-byte aligned (pmaddwd reads them directly
 * from memory) - confirm against the filter allocation code.
 */
#define LINEAR_CORE_INT16_SSE2 \
x86_reg len= -2*c->filter_length;\
__asm__ volatile(\
"pxor %%xmm0, %%xmm0 \n\t"\
"pxor %%xmm2, %%xmm2 \n\t"\
"1: \n\t"\
"movdqu (%3, %0), %%xmm1 \n\t"\
"movdqa %%xmm1, %%xmm3 \n\t"\
"pmaddwd (%4, %0), %%xmm1 \n\t"\
"pmaddwd (%5, %0), %%xmm3 \n\t"\
"paddd %%xmm1, %%xmm0 \n\t"\
"paddd %%xmm3, %%xmm2 \n\t"\
"add $16, %0 \n\t"\
" js 1b \n\t"\
"pshufd $0x0E, %%xmm0, %%xmm1 \n\t"\
"pshufd $0x0E, %%xmm2, %%xmm3 \n\t"\
"paddd %%xmm1, %%xmm0 \n\t"\
"paddd %%xmm3, %%xmm2 \n\t"\
"pshufd $0x01, %%xmm0, %%xmm1 \n\t"\
"pshufd $0x01, %%xmm2, %%xmm3 \n\t"\
"paddd %%xmm1, %%xmm0 \n\t"\
"paddd %%xmm3, %%xmm2 \n\t"\
"movd %%xmm0, %1 \n\t"\
"movd %%xmm2, %2 \n\t"\
: "+r" (len),\
"=r" (val),\
"=r" (v2)\
: "r" (((uint8_t*)(src+sample_index))-len),\
"r" (((uint8_t*)filter)-len),\
"r" (((uint8_t*)(filter+c->filter_alloc))-len)\
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3")\
);
#define COMMON_CORE_DBL_SSE2 \ #define COMMON_CORE_DBL_SSE2 \
x86_reg len= -8*c->filter_length;\ x86_reg len= -8*c->filter_length;\
__asm__ volatile(\ __asm__ volatile(\
......
...@@ -27,34 +27,14 @@ ...@@ -27,34 +27,14 @@
#include "libswresample/resample.h" #include "libswresample/resample.h"
int swri_resample_common_int16_mmx2 (ResampleContext *c, int16_t *dst, const int16_t *src, int n, int update_ctx);
int swri_resample_linear_int16_mmx2 (ResampleContext *c, int16_t *dst, const int16_t *src, int n, int update_ctx);
int swri_resample_common_int16_sse2 (ResampleContext *c, int16_t *dst, const int16_t *src, int n, int update_ctx);
int swri_resample_linear_int16_sse2 (ResampleContext *c, int16_t *dst, const int16_t *src, int n, int update_ctx);
int swri_resample_common_float_sse (ResampleContext *c, float *dst, const float *src, int n, int update_ctx);
int swri_resample_linear_float_sse (ResampleContext *c, float *dst, const float *src, int n, int update_ctx);
int swri_resample_common_float_avx (ResampleContext *c, float *dst, const float *src, int n, int update_ctx);
int swri_resample_linear_float_avx (ResampleContext *c, float *dst, const float *src, int n, int update_ctx);
int swri_resample_common_double_sse2(ResampleContext *c, double *dst, const double *src, int n, int update_ctx); int swri_resample_common_double_sse2(ResampleContext *c, double *dst, const double *src, int n, int update_ctx);
int swri_resample_linear_double_sse2(ResampleContext *c, double *dst, const double *src, int n, int update_ctx); int swri_resample_linear_double_sse2(ResampleContext *c, double *dst, const double *src, int n, int update_ctx);
#if HAVE_MMXEXT_INLINE #if HAVE_SSE2_INLINE
#define DO_RESAMPLE_ONE 0 #define DO_RESAMPLE_ONE 0
#include "resample_mmx.h" #include "resample_mmx.h"
#if ARCH_X86_32
#define TEMPLATE_RESAMPLE_S16_MMX2
#include "libswresample/resample_template.c"
#undef TEMPLATE_RESAMPLE_S16_MMX2
#endif
#if HAVE_SSE2_INLINE
#define TEMPLATE_RESAMPLE_S16_SSE2
#include "libswresample/resample_template.c"
#undef TEMPLATE_RESAMPLE_S16_SSE2
#define TEMPLATE_RESAMPLE_DBL_SSE2 #define TEMPLATE_RESAMPLE_DBL_SSE2
#include "libswresample/resample_template.c" #include "libswresample/resample_template.c"
#undef TEMPLATE_RESAMPLE_DBL_SSE2 #undef TEMPLATE_RESAMPLE_DBL_SSE2
...@@ -62,7 +42,15 @@ int swri_resample_linear_double_sse2(ResampleContext *c, double *dst, const do ...@@ -62,7 +42,15 @@ int swri_resample_linear_double_sse2(ResampleContext *c, double *dst, const do
#undef DO_RESAMPLE_ONE #undef DO_RESAMPLE_ONE
#endif // HAVE_MMXEXT_INLINE int ff_resample_common_int16_mmxext(ResampleContext *c, uint8_t *dst,
const uint8_t *src, int sz, int upd);
int ff_resample_linear_int16_mmxext(ResampleContext *c, uint8_t *dst,
const uint8_t *src, int sz, int upd);
int ff_resample_common_int16_sse2(ResampleContext *c, uint8_t *dst,
const uint8_t *src, int sz, int upd);
int ff_resample_linear_int16_sse2(ResampleContext *c, uint8_t *dst,
const uint8_t *src, int sz, int upd);
int ff_resample_common_float_sse(ResampleContext *c, uint8_t *dst, int ff_resample_common_float_sse(ResampleContext *c, uint8_t *dst,
const uint8_t *src, int sz, int upd); const uint8_t *src, int sz, int upd);
...@@ -79,17 +67,19 @@ void swresample_dsp_x86_init(ResampleContext *c) ...@@ -79,17 +67,19 @@ void swresample_dsp_x86_init(ResampleContext *c)
int av_unused mm_flags = av_get_cpu_flags(); int av_unused mm_flags = av_get_cpu_flags();
#define FNIDX(fmt) (AV_SAMPLE_FMT_##fmt - AV_SAMPLE_FMT_S16P) #define FNIDX(fmt) (AV_SAMPLE_FMT_##fmt - AV_SAMPLE_FMT_S16P)
if (ARCH_X86_32 && HAVE_MMXEXT_INLINE && mm_flags & AV_CPU_FLAG_MMX2) { if (ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL && mm_flags & AV_CPU_FLAG_MMX2) {
c->dsp.resample_common[FNIDX(S16P)] = (resample_fn) swri_resample_common_int16_mmx2; c->dsp.resample_common[FNIDX(S16P)] = ff_resample_common_int16_mmxext;
c->dsp.resample_linear[FNIDX(S16P)] = (resample_fn) swri_resample_linear_int16_mmx2; c->dsp.resample_linear[FNIDX(S16P)] = ff_resample_linear_int16_mmxext;
} }
if (HAVE_SSE_EXTERNAL && mm_flags & AV_CPU_FLAG_SSE) { if (HAVE_SSE_EXTERNAL && mm_flags & AV_CPU_FLAG_SSE) {
c->dsp.resample_common[FNIDX(FLTP)] = ff_resample_common_float_sse; c->dsp.resample_common[FNIDX(FLTP)] = ff_resample_common_float_sse;
c->dsp.resample_linear[FNIDX(FLTP)] = ff_resample_linear_float_sse; c->dsp.resample_linear[FNIDX(FLTP)] = ff_resample_linear_float_sse;
} }
if (HAVE_SSE2_EXTERNAL && mm_flags & AV_CPU_FLAG_SSE2) {
c->dsp.resample_common[FNIDX(S16P)] = ff_resample_common_int16_sse2;
c->dsp.resample_linear[FNIDX(S16P)] = ff_resample_linear_int16_sse2;
}
if (HAVE_SSE2_INLINE && mm_flags & AV_CPU_FLAG_SSE2) { if (HAVE_SSE2_INLINE && mm_flags & AV_CPU_FLAG_SSE2) {
c->dsp.resample_common[FNIDX(S16P)] = (resample_fn) swri_resample_common_int16_sse2;
c->dsp.resample_linear[FNIDX(S16P)] = (resample_fn) swri_resample_linear_int16_sse2;
c->dsp.resample_common[FNIDX(DBLP)] = (resample_fn) swri_resample_common_double_sse2; c->dsp.resample_common[FNIDX(DBLP)] = (resample_fn) swri_resample_common_double_sse2;
c->dsp.resample_linear[FNIDX(DBLP)] = (resample_fn) swri_resample_linear_double_sse2; c->dsp.resample_linear[FNIDX(DBLP)] = (resample_fn) swri_resample_linear_double_sse2;
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment