Commit 7b052672 authored by James Almer, committed by Michael Niedermayer

x86/dsputil: port clear_block functions to yasm

Signed-off-by: James Almer <jamrial@gmail.com>
Reviewed-by: Christophe Gisquet <christophe.gisquet@gmail.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
parent afaa39b4
......@@ -513,3 +513,63 @@ BSWAP32_BUF
INIT_XMM ssse3
BSWAP32_BUF
;----------------------------------------
; void ff_clear_block(int16_t *blocks);
;----------------------------------------
; Zero-fills the memory at blocks with straight-line aligned stores.
; A total of mmsize*8*%2 bytes is written. ZERO must be %define'd to the
; register-clearing instruction of the active instruction set before the
; macro is instantiated.
;
; %1 = number of xmm registers used
; %2 = number of groups of eight unrolled stores
%macro CLEAR_BLOCK 2
cglobal clear_block, 1, 1, %1, blocks
    ZERO  m0, m0                        ; m0 = 0, the value stored below
%assign %%offset 0
%rep 8*%2
    mova  [blocksq+%%offset], m0
%assign %%offset %%offset+mmsize        ; advance by one register width
%endrep
    RET
%endmacro
; MMX version: registers are 8 bytes wide (mmsize == 8 under INIT_MMX), so two
; groups of eight stores are needed: 2 * 8 * 8 = 128 bytes. No xmm registers.
INIT_MMX mmx
%define ZERO pxor
CLEAR_BLOCK 0, 2
; SSE version: registers are 16 bytes wide (mmsize == 16 under INIT_XMM), so a
; single group of eight stores covers the same 128 bytes using one xmm register.
; xorps is used to clear the register because pxor on xmm registers is SSE2.
INIT_XMM sse
%define ZERO xorps
CLEAR_BLOCK 1, 1
;-----------------------------------------
; void ff_clear_blocks(int16_t *blocks);
;-----------------------------------------
; Zero-fills 768 bytes (six consecutive 128-byte blocks) starting at blocks,
; using aligned stores of mmsize bytes each.
;
; blocksq is first advanced to the end of the area and lenq runs from -768 up
; to 0, so the "add" that advances the offset also produces the sign flag that
; terminates the loop; no separate compare is needed.
;
; ZERO must be %define'd to the register-clearing instruction of the active
; instruction set before the macro is instantiated.
;
; %1 = number of xmm registers used
%macro CLEAR_BLOCKS 1
cglobal clear_blocks, 1, 2, %1, blocks, len
    add   blocksq, 768                  ; blocksq = one past the end of the area
    mov      lenq, -768                 ; lenq = negative byte offset from the end
    ZERO       m0, m0                   ; m0 = 0, the value stored below
.loop:
    mova  [blocksq+lenq+mmsize*0], m0
    mova  [blocksq+lenq+mmsize*1], m0
    mova  [blocksq+lenq+mmsize*2], m0
    mova  [blocksq+lenq+mmsize*3], m0
    mova  [blocksq+lenq+mmsize*4], m0
    mova  [blocksq+lenq+mmsize*5], m0
    mova  [blocksq+lenq+mmsize*6], m0
    mova  [blocksq+lenq+mmsize*7], m0
    add      lenq, mmsize*8             ; eight registers written per iteration
    js .loop                            ; continue while the offset is still negative
    RET
%endmacro
; MMX version: 8-byte registers (mmsize == 8 under INIT_MMX), 64 bytes cleared
; per loop iteration. No xmm registers are used.
INIT_MMX mmx
%define ZERO pxor
CLEAR_BLOCKS 0
; SSE version: 16-byte registers (mmsize == 16 under INIT_XMM), 128 bytes
; cleared per loop iteration, using one xmm register.
; xorps is used to clear the register because pxor on xmm registers is SSE2.
INIT_XMM sse
%define ZERO xorps
CLEAR_BLOCKS 1
......@@ -534,8 +534,6 @@ static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
if (!high_bit_depth) {
c->clear_block = ff_clear_block_mmx;
c->clear_blocks = ff_clear_blocks_mmx;
c->draw_edges = ff_draw_edges_mmx;
}
......@@ -547,6 +545,10 @@ static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
#endif /* HAVE_MMX_INLINE */
#if HAVE_MMX_EXTERNAL
if (!high_bit_depth) {
c->clear_block = ff_clear_block_mmx;
c->clear_blocks = ff_clear_blocks_mmx;
}
c->vector_clip_int32 = ff_vector_clip_int32_mmx;
#endif /* HAVE_MMX_EXTERNAL */
}
......@@ -585,7 +587,10 @@ static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
{
#if HAVE_SSE_INLINE
c->vector_clipf = ff_vector_clipf_sse;
#endif /* HAVE_SSE_INLINE */
#if HAVE_YASM
#if HAVE_SSE_EXTERNAL
/* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
if (CONFIG_XVMC && avctx->hwaccel && avctx->hwaccel->decode_mb)
return;
......@@ -594,9 +599,7 @@ static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
c->clear_block = ff_clear_block_sse;
c->clear_blocks = ff_clear_blocks_sse;
}
#endif /* HAVE_SSE_INLINE */
#if HAVE_YASM
#endif
#if HAVE_INLINE_ASM && CONFIG_VIDEODSP
c->gmc = ff_gmc_sse;
#endif
......
......@@ -172,61 +172,6 @@ void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
} while (--i);
}
/*
 * CLEAR_BLOCKS(name, n): define "void name(int16_t *blocks)" which zeroes
 * n bytes starting at blocks using MMX 8-byte stores.
 *
 * The asm receives a pointer to the END of the area (blocks + n bytes) and
 * walks a negative offset in REG_a from -n up to 0, writing 32 bytes per
 * iteration; the loop exits as soon as the offset is no longer negative.
 * n must therefore be a positive multiple of 32.
 *
 * The "memory" clobber is required because the asm stores through the
 * pointer: without it the compiler is not told that the block contents
 * change and may keep stale values cached or move accesses across the asm.
 *
 * No emms is executed here and mm7 is overwritten.
 */
#define CLEAR_BLOCKS(name, n)                           \
void name(int16_t *blocks)                              \
{                                                       \
    __asm__ volatile (                                  \
        "pxor %%mm7, %%mm7 \n\t"                        \
        "mov $-"#n", %%"REG_a" \n\t"                    \
        "1: \n\t"                                       \
        "movq %%mm7, (%0, %%"REG_a") \n\t"              \
        "movq %%mm7, 8(%0, %%"REG_a") \n\t"             \
        "movq %%mm7, 16(%0, %%"REG_a") \n\t"            \
        "movq %%mm7, 24(%0, %%"REG_a") \n\t"            \
        "add $32, %%"REG_a" \n\t"                       \
        "js 1b \n\t"                                    \
        :: "r"(((uint8_t *) blocks) + n)                \
        : "%"REG_a, "memory");                          \
}
/* Six 128-byte blocks. */
CLEAR_BLOCKS(ff_clear_blocks_mmx, 768)
/* A single 128-byte block. */
CLEAR_BLOCKS(ff_clear_block_mmx, 128)
/*
 * Zero one 128-byte block using eight 16-byte SSE stores (offsets 0..112).
 *
 * movaps is an aligned store, so block must be 16-byte aligned.
 * The "memory" clobber tells the compiler that the asm writes to memory.
 * NOTE(review): xmm0 is overwritten but not listed as a clobber - confirm
 * this is acceptable for the compilers in use.
 */
void ff_clear_block_sse(int16_t *block)
{
__asm__ volatile (
/* xmm0 = 0, the value stored below */
"xorps %%xmm0, %%xmm0 \n"
"movaps %%xmm0, (%0) \n"
"movaps %%xmm0, 16(%0) \n"
"movaps %%xmm0, 32(%0) \n"
"movaps %%xmm0, 48(%0) \n"
"movaps %%xmm0, 64(%0) \n"
"movaps %%xmm0, 80(%0) \n"
"movaps %%xmm0, 96(%0) \n"
"movaps %%xmm0, 112(%0) \n"
:: "r" (block)
: "memory");
}
/*
 * Zero six consecutive 128-byte blocks (768 bytes) starting at blocks,
 * using 16-byte SSE stores.
 *
 * The asm receives a pointer to the END of the area and walks a negative
 * offset in REG_a from -768 up to 0, writing 128 bytes per iteration; the
 * "add" that advances the offset also sets the sign flag tested by "js".
 *
 * movaps is an aligned store, so blocks must be 16-byte aligned.
 *
 * The "memory" clobber is required because the asm stores through the
 * pointer: without it the compiler is not told that the block contents
 * change and may keep stale values cached or move accesses across the asm.
 */
void ff_clear_blocks_sse(int16_t *blocks)
{
    __asm__ volatile (
        "xorps %%xmm0, %%xmm0 \n"
        "mov $-768, %%"REG_a" \n"
        "1: \n"
        "movaps %%xmm0, (%0, %%"REG_a") \n"
        "movaps %%xmm0, 16(%0, %%"REG_a") \n"
        "movaps %%xmm0, 32(%0, %%"REG_a") \n"
        "movaps %%xmm0, 48(%0, %%"REG_a") \n"
        "movaps %%xmm0, 64(%0, %%"REG_a") \n"
        "movaps %%xmm0, 80(%0, %%"REG_a") \n"
        "movaps %%xmm0, 96(%0, %%"REG_a") \n"
        "movaps %%xmm0, 112(%0, %%"REG_a") \n"
        "add $128, %%"REG_a" \n"
        "js 1b \n"
        :: "r"(((uint8_t *) blocks) + 128 * 6)
        : "%"REG_a, "memory");
}
void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
{
x86_reg i = 0;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment