Commit 4e8e2624 authored by Justin Ruggles

fmtconvert: port int32_to_float_fmul_scalar() x86 inline asm to yasm

parent 185142a5
...@@ -1055,14 +1055,6 @@ emu_edge mmx
; int32_t max, unsigned int len)
;-----------------------------------------------------------------------------
%macro SPLATD_MMX 1
punpckldq %1, %1
%endmacro
%macro SPLATD_SSE2 1
pshufd %1, %1, 0
%endmacro
%macro VECTOR_CLIP_INT32 4
cglobal vector_clip_int32_%1, 5,5,%2, dst, src, min, max, len
%ifidn %1, sse2
...
...@@ -24,6 +24,52 @@
SECTION_TEXT
;---------------------------------------------------------------------------------
; void int32_to_float_fmul_scalar(float *dst, const int *src, float mul, int len);
;---------------------------------------------------------------------------------
%macro INT32_TO_FLOAT_FMUL_SCALAR 2
%ifdef ARCH_X86_64
cglobal int32_to_float_fmul_scalar_%1, 3,3,%2, dst, src, len
%else
cglobal int32_to_float_fmul_scalar_%1, 4,4,%2, dst, src, mul, len
    movss   m0, mulm
%endif
    SPLATD  m0
    shl     lenq, 2
    add     srcq, lenq
    add     dstq, lenq
    neg     lenq
.loop:
%ifidn %1, sse2
    cvtdq2ps m1, [srcq+lenq   ]
    cvtdq2ps m2, [srcq+lenq+16]
%else
    cvtpi2ps m1, [srcq+lenq   ]
    cvtpi2ps m3, [srcq+lenq+ 8]
    cvtpi2ps m2, [srcq+lenq+16]
    cvtpi2ps m4, [srcq+lenq+24]
    movlhps  m1, m3
    movlhps  m2, m4
%endif
    mulps   m1, m0
    mulps   m2, m0
    mova    [dstq+lenq   ], m1
    mova    [dstq+lenq+16], m2
    add     lenq, 32
    jl .loop
    REP_RET
%endmacro
INIT_XMM
%define SPLATD SPLATD_SSE
%define movdqa movaps
INT32_TO_FLOAT_FMUL_SCALAR sse, 5
%undef movdqa
%define SPLATD SPLATD_SSE2
INT32_TO_FLOAT_FMUL_SCALAR sse2, 3
%undef SPLATD
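For reference, both SIMD versions implement a simple convert-and-scale over the buffer; the C fallback in libavcodec/fmtconvert.c is essentially the sketch below (shown for context only, not part of this commit). Note the SIMD loops above additionally require 16-byte-aligned buffers and a len that is a multiple of 8, since they store 32 bytes per iteration.

static void int32_to_float_fmul_scalar_c(float *dst, const int *src,
                                         float mul, int len)
{
    int i;
    /* convert each 32-bit int sample to float and scale by mul */
    for (i = 0; i < len; i++)
        dst[i] = src[i] * mul;
}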
;------------------------------------------------------------------------------
; void ff_float_to_int16(int16_t *dst, const float *src, long len);
;------------------------------------------------------------------------------
...
...@@ -26,52 +26,11 @@
#include "libavutil/x86_cpu.h"
#include "libavcodec/fmtconvert.h"
static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
{
    x86_reg i = -4*len;
    __asm__ volatile(
        "movss     %3, %%xmm4          \n"
        "shufps    $0, %%xmm4, %%xmm4  \n"
        "1:                            \n"
        "cvtpi2ps    (%2,%0), %%xmm0   \n"
        "cvtpi2ps   8(%2,%0), %%xmm1   \n"
        "cvtpi2ps  16(%2,%0), %%xmm2   \n"
        "cvtpi2ps  24(%2,%0), %%xmm3   \n"
        "movlhps   %%xmm1, %%xmm0      \n"
        "movlhps   %%xmm3, %%xmm2      \n"
        "mulps     %%xmm4, %%xmm0      \n"
        "mulps     %%xmm4, %%xmm2      \n"
        "movaps    %%xmm0,   (%1,%0)   \n"
        "movaps    %%xmm2, 16(%1,%0)   \n"
        "add       $32, %0             \n"
        "jl        1b                  \n"
        :"+r"(i)
        :"r"(dst+len), "r"(src+len), "m"(mul)
    );
}

static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
{
    x86_reg i = -4*len;
    __asm__ volatile(
        "movss     %3, %%xmm4          \n"
        "shufps    $0, %%xmm4, %%xmm4  \n"
        "1:                            \n"
        "cvtdq2ps    (%2,%0), %%xmm0   \n"
        "cvtdq2ps  16(%2,%0), %%xmm1   \n"
        "mulps     %%xmm4, %%xmm0      \n"
        "mulps     %%xmm4, %%xmm1      \n"
        "movaps    %%xmm0,   (%1,%0)   \n"
        "movaps    %%xmm1, 16(%1,%0)   \n"
        "add       $32, %0             \n"
        "jl        1b                  \n"
        :"+r"(i)
        :"r"(dst+len), "r"(src+len), "m"(mul)
    );
}
#if HAVE_YASM
void ff_int32_to_float_fmul_scalar_sse (float *dst, const int *src, float mul, int len);
void ff_int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len);
void ff_float_to_int16_3dnow(int16_t *dst, const float *src, long len);
void ff_float_to_int16_sse  (int16_t *dst, const float *src, long len);
void ff_float_to_int16_sse2 (int16_t *dst, const float *src, long len);
...@@ -204,8 +163,8 @@ void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
{
    int mm_flags = av_get_cpu_flags();
    if (mm_flags & AV_CPU_FLAG_MMX) {
#if HAVE_YASM
    if (mm_flags & AV_CPU_FLAG_MMX) {
        c->float_interleave = float_interleave_mmx;
        if (HAVE_AMD3DNOW && mm_flags & AV_CPU_FLAG_3DNOW) {
...@@ -219,21 +178,17 @@ void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
                c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
            }
        }
#endif
        if (HAVE_SSE && mm_flags & AV_CPU_FLAG_SSE) {
            c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
            c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse;
#if HAVE_YASM
            c->float_to_int16            = ff_float_to_int16_sse;
            c->float_to_int16_interleave = float_to_int16_interleave_sse;
            c->float_interleave          = float_interleave_sse;
#endif
        }
        if (HAVE_SSE && mm_flags & AV_CPU_FLAG_SSE2) {
            c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
            c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse2;
#if HAVE_YASM
            c->float_to_int16            = ff_float_to_int16_sse2;
            c->float_to_int16_interleave = float_to_int16_interleave_sse2;
#endif
        }
    }
#endif
}
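For context, a hypothetical caller of the dispatched function pointer might look like the sketch below (illustrative only, not part of this commit; convert_samples is an invented name, and the buffers must meet the SIMD versions' 16-byte-alignment and len-multiple-of-8 requirements):

#include "libavcodec/fmtconvert.h"

static void convert_samples(FmtConvertContext *c, AVCodecContext *avctx,
                            float *dst, const int *src, float mul, int len)
{
    ff_fmt_convert_init(c, avctx);   /* selects C, SSE, or SSE2 at runtime */
    c->int32_to_float_fmul_scalar(dst, src, mul, len);
}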
...@@ -536,6 +536,18 @@
%endif
%endmacro
%macro SPLATD_MMX 1
    punpckldq  %1, %1
%endmacro
%macro SPLATD_SSE 1
    shufps  %1, %1, 0
%endmacro
%macro SPLATD_SSE2 1
    pshufd  %1, %1, 0
%endmacro
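These macros broadcast the low 32-bit element of a register to all lanes. As an illustration only (not part of this commit), the SSE and SSE2 variants correspond roughly to these intrinsics:

#include <xmmintrin.h>   /* SSE:  _mm_shuffle_ps */
#include <emmintrin.h>   /* SSE2: _mm_shuffle_epi32 */

/* SPLATD_SSE: replicate lane 0 of a float vector across all four lanes */
static inline __m128 splatd_sse(__m128 v)
{
    return _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0));
}

/* SPLATD_SSE2: the same broadcast on an integer vector (pshufd) */
static inline __m128i splatd_sse2(__m128i v)
{
    return _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 0, 0, 0));
}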
%macro CLIPW 3 ;(dst, min, max)
    pmaxsw  %1, %2
    pminsw  %1, %3
...