Commit 610e00b3 authored by Daniel Kang, committed by Diego Biurrun

x86: h264: Convert 8-bit QPEL inline assembly to YASM

Signed-off-by: Diego Biurrun <diego@biurrun.de>
parent ad01ba6c
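
In this conversion, static C functions wrapping __asm__ volatile blocks are removed and replaced by external YASM symbols with an ff_ prefix, which are declared in C and assigned to the DSPContext function pointers under the *_EXTERNAL config guards. A minimal sketch of that pattern (illustrative only; example_init_sse2 is a hypothetical helper, the real declarations and assignments appear in the hunks below):

#include "libavcodec/dsputil.h"   /* for DSPContext, as included by the real code */

/* YASM implementation, replacing the former inline-asm put_pixels16_sse2() */
void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                          int line_size, int h);

static void example_init_sse2(DSPContext *c)
{
#if HAVE_SSE2_EXTERNAL
    c->put_pixels_tab[0][0] = ff_put_pixels16_sse2;
#endif
}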
@@ -51,7 +51,8 @@ YASM-OBJS-$(CONFIG_H264DSP) += x86/h264_deblock.o \
x86/h264_weight_10bit.o
YASM-OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred.o \
x86/h264_intrapred_10bit.o
YASM-OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel_10bit.o
YASM-OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel_8bit.o \
x86/h264_qpel_10bit.o
YASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o
YASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o
YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
@@ -1354,3 +1354,234 @@ BSWAP32_BUF
INIT_XMM ssse3
BSWAP32_BUF
%macro op_avgh 3
movh %3, %2
pavgb %1, %3
movh %2, %1
%endmacro
%macro op_avg 2
pavgb %1, %2
mova %2, %1
%endmacro
%macro op_puth 2-3
movh %2, %1
%endmacro
%macro op_put 2
mova %2, %1
%endmacro
; void pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
%macro PIXELS4_L2 1
%define OP op_%1h
cglobal %1_pixels4_l2, 6,6
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
test r5d, 1
je .loop
movd m0, [r1]
movd m1, [r2]
add r1, r4
add r2, 4
pavgb m0, m1
OP m0, [r0], m3
add r0, r3
dec r5d
.loop:
mova m0, [r1]
mova m1, [r1+r4]
lea r1, [r1+2*r4]
pavgb m0, [r2]
pavgb m1, [r2+4]
OP m0, [r0], m3
OP m1, [r0+r3], m3
lea r0, [r0+2*r3]
mova m0, [r1]
mova m1, [r1+r4]
lea r1, [r1+2*r4]
pavgb m0, [r2+8]
pavgb m1, [r2+12]
OP m0, [r0], m3
OP m1, [r0+r3], m3
lea r0, [r0+2*r3]
add r2, 16
sub r5d, 4
jne .loop
REP_RET
%endmacro
INIT_MMX mmxext
PIXELS4_L2 put
PIXELS4_L2 avg
; void pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
%macro PIXELS8_L2 1
%define OP op_%1
cglobal %1_pixels8_l2, 6,6
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
test r5d, 1
je .loop
mova m0, [r1]
mova m1, [r2]
add r1, r4
add r2, 8
pavgb m0, m1
OP m0, [r0]
add r0, r3
dec r5d
.loop:
mova m0, [r1]
mova m1, [r1+r4]
lea r1, [r1+2*r4]
pavgb m0, [r2]
pavgb m1, [r2+8]
OP m0, [r0]
OP m1, [r0+r3]
lea r0, [r0+2*r3]
mova m0, [r1]
mova m1, [r1+r4]
lea r1, [r1+2*r4]
pavgb m0, [r2+16]
pavgb m1, [r2+24]
OP m0, [r0]
OP m1, [r0+r3]
lea r0, [r0+2*r3]
add r2, 32
sub r5d, 4
jne .loop
REP_RET
%endmacro
INIT_MMX mmxext
PIXELS8_L2 put
PIXELS8_L2 avg
; void pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
%macro PIXELS16_L2 1
%define OP op_%1
cglobal %1_pixels16_l2, 6,6
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
test r5d, 1
je .loop
mova m0, [r1]
mova m1, [r1+8]
pavgb m0, [r2]
pavgb m1, [r2+8]
add r1, r4
add r2, 16
OP m0, [r0]
OP m1, [r0+8]
add r0, r3
dec r5d
.loop:
mova m0, [r1]
mova m1, [r1+8]
add r1, r4
pavgb m0, [r2]
pavgb m1, [r2+8]
OP m0, [r0]
OP m1, [r0+8]
add r0, r3
mova m0, [r1]
mova m1, [r1+8]
add r1, r4
pavgb m0, [r2+16]
pavgb m1, [r2+24]
OP m0, [r0]
OP m1, [r0+8]
add r0, r3
add r2, 32
sub r5d, 2
jne .loop
REP_RET
%endmacro
INIT_MMX mmxext
PIXELS16_L2 put
PIXELS16_L2 avg
INIT_MMX mmxext
; void pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)
%macro PIXELS48 2
%if %2 == 4
%define OP movh
%else
%define OP mova
%endif
cglobal %1_pixels%2, 4,5
movsxdifnidn r2, r2d
lea r4, [r2*3]
.loop:
OP m0, [r1]
OP m1, [r1+r2]
OP m2, [r1+r2*2]
OP m3, [r1+r4]
lea r1, [r1+r2*4]
%ifidn %1, avg
pavgb m0, [r0]
pavgb m1, [r0+r2]
pavgb m2, [r0+r2*2]
pavgb m3, [r0+r4]
%endif
OP [r0], m0
OP [r0+r2], m1
OP [r0+r2*2], m2
OP [r0+r4], m3
sub r3d, 4
lea r0, [r0+r2*4]
jne .loop
RET
%endmacro
PIXELS48 put, 4
PIXELS48 avg, 4
PIXELS48 put, 8
PIXELS48 avg, 8
INIT_XMM sse2
; void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
cglobal put_pixels16, 4,5,4
movsxdifnidn r2, r2d
lea r4, [r2*3]
.loop:
movu m0, [r1]
movu m1, [r1+r2]
movu m2, [r1+r2*2]
movu m3, [r1+r4]
lea r1, [r1+r2*4]
mova [r0], m0
mova [r0+r2], m1
mova [r0+r2*2], m2
mova [r0+r4], m3
sub r3d, 4
lea r0, [r0+r2*4]
jnz .loop
REP_RET
; void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
cglobal avg_pixels16, 4,5,4
movsxdifnidn r2, r2d
lea r4, [r2*3]
.loop:
movu m0, [r1]
movu m1, [r1+r2]
movu m2, [r1+r2*2]
movu m3, [r1+r4]
lea r1, [r1+r2*4]
pavgb m0, [r0]
pavgb m1, [r0+r2]
pavgb m2, [r0+r2*2]
pavgb m3, [r0+r4]
mova [r0], m0
mova [r0+r2], m1
mova [r0+r2*2], m2
mova [r0+r4], m3
sub r3d, 4
lea r0, [r0+r2*4]
jnz .loop
REP_RET
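
For reference, the pixels4/8/16_l2 kernels above write the byte-wise rounded average of src1 (read with src1Stride) and src2 (read as a packed buffer that advances by the block width each row) into dst; the avg variants additionally average that result with the existing dst contents. A plain-C sketch of the 4-wide put case (not part of the commit; the function name is hypothetical):

#include <stdint.h>

static void put_pixels4_l2_ref(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
                               int dstStride, int src1Stride, int h)
{
    int i, j;
    for (i = 0; i < h; i++) {
        for (j = 0; j < 4; j++)
            dst[j] = (src1[j] + src2[j] + 1) >> 1; /* pavgb: average, rounding up */
        dst  += dstStride;
        src1 += src1Stride;
        src2 += 4; /* packed 4-byte-wide rows, hence "add r2, 4" (16 per 4 rows) */
    }
}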
@@ -56,57 +56,6 @@ static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_
}
#ifndef SKIP_FOR_3DNOW
static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
__asm__ volatile(
"testl $1, %0 \n\t"
" jz 1f \n\t"
"movd (%1), %%mm0 \n\t"
"movd (%2), %%mm1 \n\t"
"add %4, %1 \n\t"
"add $4, %2 \n\t"
PAVGB" %%mm1, %%mm0 \n\t"
"movd %%mm0, (%3) \n\t"
"add %5, %3 \n\t"
"decl %0 \n\t"
"1: \n\t"
"movd (%1), %%mm0 \n\t"
"add %4, %1 \n\t"
"movd (%1), %%mm1 \n\t"
"movd (%2), %%mm2 \n\t"
"movd 4(%2), %%mm3 \n\t"
"add %4, %1 \n\t"
PAVGB" %%mm2, %%mm0 \n\t"
PAVGB" %%mm3, %%mm1 \n\t"
"movd %%mm0, (%3) \n\t"
"add %5, %3 \n\t"
"movd %%mm1, (%3) \n\t"
"add %5, %3 \n\t"
"movd (%1), %%mm0 \n\t"
"add %4, %1 \n\t"
"movd (%1), %%mm1 \n\t"
"movd 8(%2), %%mm2 \n\t"
"movd 12(%2), %%mm3 \n\t"
"add %4, %1 \n\t"
PAVGB" %%mm2, %%mm0 \n\t"
PAVGB" %%mm3, %%mm1 \n\t"
"movd %%mm0, (%3) \n\t"
"add %5, %3 \n\t"
"movd %%mm1, (%3) \n\t"
"add %5, %3 \n\t"
"add $16, %2 \n\t"
"subl $4, %0 \n\t"
"jnz 1b \n\t"
#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
:"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
:"memory");
}
static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
__asm__ volatile(
@@ -227,58 +176,6 @@ static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src
:"memory");*/
}
static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
__asm__ volatile(
"testl $1, %0 \n\t"
" jz 1f \n\t"
"movd (%1), %%mm0 \n\t"
"movd (%2), %%mm1 \n\t"
"add %4, %1 \n\t"
"add $4, %2 \n\t"
PAVGB" %%mm1, %%mm0 \n\t"
PAVGB" (%3), %%mm0 \n\t"
"movd %%mm0, (%3) \n\t"
"add %5, %3 \n\t"
"decl %0 \n\t"
"1: \n\t"
"movd (%1), %%mm0 \n\t"
"add %4, %1 \n\t"
"movd (%1), %%mm1 \n\t"
"add %4, %1 \n\t"
PAVGB" (%2), %%mm0 \n\t"
PAVGB" 4(%2), %%mm1 \n\t"
PAVGB" (%3), %%mm0 \n\t"
"movd %%mm0, (%3) \n\t"
"add %5, %3 \n\t"
PAVGB" (%3), %%mm1 \n\t"
"movd %%mm1, (%3) \n\t"
"add %5, %3 \n\t"
"movd (%1), %%mm0 \n\t"
"add %4, %1 \n\t"
"movd (%1), %%mm1 \n\t"
"add %4, %1 \n\t"
PAVGB" 8(%2), %%mm0 \n\t"
PAVGB" 12(%2), %%mm1 \n\t"
PAVGB" (%3), %%mm0 \n\t"
"movd %%mm0, (%3) \n\t"
"add %5, %3 \n\t"
PAVGB" (%3), %%mm1 \n\t"
"movd %%mm1, (%3) \n\t"
"add %5, %3 \n\t"
"add $16, %2 \n\t"
"subl $4, %0 \n\t"
"jnz 1b \n\t"
#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
:"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
:"memory");
}
static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
__asm__ volatile(
@@ -876,33 +773,6 @@ static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line
:"%"REG_a, "memory");
}
#ifndef SKIP_FOR_3DNOW
static void DEF(avg_pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
do {
__asm__ volatile(
"movd (%1), %%mm0 \n\t"
"movd (%1, %2), %%mm1 \n\t"
"movd (%1, %2, 2), %%mm2 \n\t"
"movd (%1, %3), %%mm3 \n\t"
PAVGB" (%0), %%mm0 \n\t"
PAVGB" (%0, %2), %%mm1 \n\t"
PAVGB" (%0, %2, 2), %%mm2 \n\t"
PAVGB" (%0, %3), %%mm3 \n\t"
"movd %%mm0, (%1) \n\t"
"movd %%mm1, (%1, %2) \n\t"
"movd %%mm2, (%1, %2, 2) \n\t"
"movd %%mm3, (%1, %3) \n\t"
::"S"(pixels), "D"(block),
"r" ((x86_reg)line_size), "r"((x86_reg)3L*line_size)
:"memory");
block += 4*line_size;
pixels += 4*line_size;
h -= 4;
} while(h > 0);
}
#endif /* SKIP_FOR_3DNOW */
//FIXME the following could be optimized too ...
static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h);
@@ -366,33 +366,6 @@ void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
} while (--i);
}
static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels,
int line_size, int h)
{
__asm__ volatile (
"lea (%3, %3), %%"REG_a" \n\t"
".p2align 3 \n\t"
"1: \n\t"
"movd (%1 ), %%mm0 \n\t"
"movd (%1, %3), %%mm1 \n\t"
"movd %%mm0, (%2) \n\t"
"movd %%mm1, (%2, %3) \n\t"
"add %%"REG_a", %1 \n\t"
"add %%"REG_a", %2 \n\t"
"movd (%1 ), %%mm0 \n\t"
"movd (%1, %3), %%mm1 \n\t"
"movd %%mm0, (%2) \n\t"
"movd %%mm1, (%2, %3) \n\t"
"add %%"REG_a", %1 \n\t"
"add %%"REG_a", %2 \n\t"
"subl $4, %0 \n\t"
"jnz 1b \n\t"
: "+g"(h), "+r"(pixels), "+r"(block)
: "r"((x86_reg)line_size)
: "%"REG_a, "memory"
);
}
static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
int line_size, int h)
{
@@ -455,56 +428,6 @@ static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
);
}
static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
int line_size, int h)
{
__asm__ volatile (
"1: \n\t"
"movdqu (%1 ), %%xmm0 \n\t"
"movdqu (%1, %3 ), %%xmm1 \n\t"
"movdqu (%1, %3, 2), %%xmm2 \n\t"
"movdqu (%1, %4 ), %%xmm3 \n\t"
"lea (%1, %3, 4), %1 \n\t"
"movdqa %%xmm0, (%2) \n\t"
"movdqa %%xmm1, (%2, %3) \n\t"
"movdqa %%xmm2, (%2, %3, 2) \n\t"
"movdqa %%xmm3, (%2, %4) \n\t"
"subl $4, %0 \n\t"
"lea (%2, %3, 4), %2 \n\t"
"jnz 1b \n\t"
: "+g"(h), "+r"(pixels), "+r"(block)
: "r"((x86_reg)line_size), "r"((x86_reg)3L * line_size)
: "memory"
);
}
static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
int line_size, int h)
{
__asm__ volatile (
"1: \n\t"
"movdqu (%1 ), %%xmm0 \n\t"
"movdqu (%1, %3 ), %%xmm1 \n\t"
"movdqu (%1, %3, 2), %%xmm2 \n\t"
"movdqu (%1, %4 ), %%xmm3 \n\t"
"lea (%1, %3, 4), %1 \n\t"
"pavgb (%2 ), %%xmm0 \n\t"
"pavgb (%2, %3 ), %%xmm1 \n\t"
"pavgb (%2, %3, 2), %%xmm2 \n\t"
"pavgb (%2, %4), %%xmm3 \n\t"
"movdqa %%xmm0, (%2) \n\t"
"movdqa %%xmm1, (%2, %3) \n\t"
"movdqa %%xmm2, (%2, %3, 2) \n\t"
"movdqa %%xmm3, (%2, %4) \n\t"
"subl $4, %0 \n\t"
"lea (%2, %3, 4), %2 \n\t"
"jnz 1b \n\t"
: "+g"(h), "+r"(pixels), "+r"(block)
: "r"((x86_reg)line_size), "r"((x86_reg)3L * line_size)
: "memory"
);
}
#define CLEAR_BLOCKS(name, n) \
static void name(DCTELEM *blocks) \
{ \
@@ -2381,27 +2304,23 @@ static void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
}
#endif /* HAVE_INLINE_ASM */
#if HAVE_MMXEXT_EXTERNAL
if (CONFIG_H264QPEL) {
#if HAVE_INLINE_ASM
SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, );
#endif /* HAVE_INLINE_ASM */
if (!high_bit_depth) {
#if HAVE_INLINE_ASM
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmxext, );
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmxext, );
#endif /* HAVE_INLINE_ASM */
} else if (bit_depth == 10) {
#if HAVE_YASM
#if !ARCH_X86_64
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
@@ -2410,18 +2329,14 @@ static void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
#endif
SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_);
SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_);
#endif /* HAVE_YASM */
}
#if HAVE_INLINE_ASM
SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmxext, );
#endif /* HAVE_INLINE_ASM */
}
#if HAVE_YASM
if (!high_bit_depth && CONFIG_H264CHROMA) {
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_mmxext;
c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmxext;
@@ -2447,7 +2362,7 @@ static void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
} else {
c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
}
#endif /* HAVE_YASM */
#endif /* HAVE_MMXEXT_EXTERNAL */
}
static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
@@ -2546,17 +2461,16 @@ static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
int mm_flags)
{
#if HAVE_SSE2_EXTERNAL
const int bit_depth = avctx->bits_per_raw_sample;
#if HAVE_INLINE_ASM
const int high_bit_depth = bit_depth > 8;
if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
// these functions are slower than mmx on AMD, but faster on Intel
if (!high_bit_depth) {
c->put_pixels_tab[0][0] = put_pixels16_sse2;
c->put_no_rnd_pixels_tab[0][0] = put_pixels16_sse2;
c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
c->put_pixels_tab[0][0] = ff_put_pixels16_sse2;
c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
if (CONFIG_H264QPEL)
H264_QPEL_FUNCS(0, 0, sse2);
}
@@ -2583,9 +2497,7 @@ static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
c->idct = ff_idct_xvid_sse2;
c->idct_permutation_type = FF_SSE2_IDCT_PERM;
}
#endif /* HAVE_INLINE_ASM */
#if HAVE_YASM
if (bit_depth == 10) {
if (CONFIG_H264QPEL) {
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
@@ -2615,16 +2527,16 @@ static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
c->apply_window_int16 = ff_apply_window_int16_round_sse2;
}
c->bswap_buf = ff_bswap32_buf_sse2;
#endif /* HAVE_YASM */
#endif /* HAVE_SSE2_EXTERNAL */
}
static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
int mm_flags)
{
#if HAVE_SSSE3_EXTERNAL
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
const int bit_depth = avctx->bits_per_raw_sample;
#if HAVE_SSSE3_INLINE
if (!high_bit_depth && CONFIG_H264QPEL) {
H264_QPEL_FUNCS(1, 0, ssse3);
H264_QPEL_FUNCS(1, 1, ssse3);
@@ -2639,9 +2551,6 @@ static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
H264_QPEL_FUNCS(3, 2, ssse3);
H264_QPEL_FUNCS(3, 3, ssse3);
}
#endif /* HAVE_SSSE3_INLINE */
#if HAVE_SSSE3_EXTERNAL
if (bit_depth == 10 && CONFIG_H264QPEL) {
H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
@@ -19,1019 +19,229 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegvideo.h"
#include "dsputil_mmx.h"
#if HAVE_INLINE_ASM
/***********************************/
/* motion compensation */
#define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q)\
"mov"#q" "#C", "#T" \n\t"\
"mov"#d" (%0), "#F" \n\t"\
"paddw "#D", "#T" \n\t"\
"psllw $2, "#T" \n\t"\
"psubw "#B", "#T" \n\t"\
"psubw "#E", "#T" \n\t"\
"punpcklbw "#Z", "#F" \n\t"\
"pmullw "MANGLE(ff_pw_5)", "#T"\n\t"\
"paddw "MANGLE(ff_pw_16)", "#A"\n\t"\
"add %2, %0 \n\t"\
"paddw "#F", "#A" \n\t"\
"paddw "#A", "#T" \n\t"\
"psraw $5, "#T" \n\t"\
"packuswb "#T", "#T" \n\t"\
OP(T, (%1), A, d)\
"add %3, %1 \n\t"
#define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q)\
"mov"#q" "#C", "#T" \n\t"\
"mov"#d" (%0), "#F" \n\t"\
"paddw "#D", "#T" \n\t"\
"psllw $2, "#T" \n\t"\
"paddw "MANGLE(ff_pw_16)", "#A"\n\t"\
"psubw "#B", "#T" \n\t"\
"psubw "#E", "#T" \n\t"\
"punpcklbw "#Z", "#F" \n\t"\
"pmullw "MANGLE(ff_pw_5)", "#T"\n\t"\
"paddw "#F", "#A" \n\t"\
"add %2, %0 \n\t"\
"paddw "#A", "#T" \n\t"\
"mov"#q" "#T", "#OF"(%1) \n\t"
#define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q)
#define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q)
#define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa)
#define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa)
#if HAVE_YASM
void ff_put_pixels4_mmxext(uint8_t *block, const uint8_t *pixels, int line_size, int h);
void ff_avg_pixels4_mmxext(uint8_t *block, const uint8_t *pixels, int line_size, int h);
void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, int line_size, int h);
void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, int line_size, int h);
static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
int line_size, int h)
{
ff_put_pixels8_mmxext(block, pixels, line_size, h);
ff_put_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
}
static void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
int line_size, int h)
{
ff_avg_pixels8_mmxext(block, pixels, line_size, h);
ff_avg_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
}
void ff_put_pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_avg_pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
int line_size, int h);
void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
int line_size, int h);
#define ff_put_pixels8_l2_sse2 ff_put_pixels8_l2_mmxext
#define ff_avg_pixels8_l2_sse2 ff_avg_pixels8_l2_mmxext
#define ff_put_pixels16_l2_sse2 ff_put_pixels16_l2_mmxext
#define ff_avg_pixels16_l2_sse2 ff_avg_pixels16_l2_mmxext
#define DEF_QPEL(OPNAME)\
void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_ssse3(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_l2_mmxext(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_mmxext(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_ssse3(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);\
void ff_ ## OPNAME ## _h264_qpel4_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_op_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h);\
void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_sse2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h);\
void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_v_mmxext(uint8_t *src, int16_t *tmp, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_h_mmxext(int16_t *tmp, uint8_t *dst, int dstStride);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_mmxext(uint8_t *src, int16_t *tmp, int srcStride, int size);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_sse2(uint8_t *src, int16_t *tmp, int srcStride, int size);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_mmxext(uint8_t *dst, int16_t *tmp, int dstStride, int unused, int h);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size);\
void ff_ ## OPNAME ## _pixels4_l2_shift5_mmxext(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h);\
void ff_ ## OPNAME ## _pixels8_l2_shift5_mmxext(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h);
DEF_QPEL(avg)
DEF_QPEL(put)
#define QPEL_H264(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
int h=4;\
\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movq "MANGLE(ff_pw_5) ", %%mm4\n\t"\
"movq "MANGLE(ff_pw_16)", %%mm5\n\t"\
"1: \n\t"\
"movd -1(%0), %%mm1 \n\t"\
"movd (%0), %%mm2 \n\t"\
"movd 1(%0), %%mm3 \n\t"\
"movd 2(%0), %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"paddw %%mm0, %%mm1 \n\t"\
"paddw %%mm3, %%mm2 \n\t"\
"movd -2(%0), %%mm0 \n\t"\
"movd 3(%0), %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"paddw %%mm3, %%mm0 \n\t"\
"psllw $2, %%mm2 \n\t"\
"psubw %%mm1, %%mm2 \n\t"\
"pmullw %%mm4, %%mm2 \n\t"\
"paddw %%mm5, %%mm0 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"psraw $5, %%mm0 \n\t"\
"packuswb %%mm0, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm6, d)\
"add %3, %0 \n\t"\
"add %4, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(src), "+c"(dst), "+g"(h)\
: "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
: "memory"\
);\
}\
static av_noinline void OPNAME ## h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
int h=4;\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movq %0, %%mm4 \n\t"\
"movq %1, %%mm5 \n\t"\
:: "m"(ff_pw_5), "m"(ff_pw_16)\
);\
do{\
__asm__ volatile(\
"movd -1(%0), %%mm1 \n\t"\
"movd (%0), %%mm2 \n\t"\
"movd 1(%0), %%mm3 \n\t"\
"movd 2(%0), %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"paddw %%mm0, %%mm1 \n\t"\
"paddw %%mm3, %%mm2 \n\t"\
"movd -2(%0), %%mm0 \n\t"\
"movd 3(%0), %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"paddw %%mm3, %%mm0 \n\t"\
"psllw $2, %%mm2 \n\t"\
"psubw %%mm1, %%mm2 \n\t"\
"pmullw %%mm4, %%mm2 \n\t"\
"paddw %%mm5, %%mm0 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"movd (%2), %%mm3 \n\t"\
"psraw $5, %%mm0 \n\t"\
"packuswb %%mm0, %%mm0 \n\t"\
PAVGB" %%mm3, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm6, d)\
"add %4, %0 \n\t"\
"add %4, %1 \n\t"\
"add %3, %2 \n\t"\
: "+a"(src), "+c"(dst), "+d"(src2)\
: "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
: "memory"\
);\
}while(--h);\
}\
static av_noinline void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
src -= 2*srcStride;\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movd (%0), %%mm0 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm1 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm2 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm3 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm4 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
\
: "+a"(src), "+c"(dst)\
: "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
: "memory"\
);\
}\
static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
int h=4;\
static av_always_inline void ff_ ## OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
int w=3;\
src -= 2*srcStride+2;\
while(w--){\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movd (%0), %%mm0 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm1 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm2 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm3 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm4 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\
\
: "+a"(src)\
: "c"(tmp), "S"((x86_reg)srcStride)\
: "memory"\
);\
ff_ ## OPNAME ## h264_qpel4_hv_lowpass_v_mmxext(src, tmp, srcStride);\
tmp += 4;\
src += 4 - 9*srcStride;\
src += 4;\
}\
tmp -= 3*4;\
__asm__ volatile(\
"1: \n\t"\
"movq (%0), %%mm0 \n\t"\
"paddw 10(%0), %%mm0 \n\t"\
"movq 2(%0), %%mm1 \n\t"\
"paddw 8(%0), %%mm1 \n\t"\
"movq 4(%0), %%mm2 \n\t"\
"paddw 6(%0), %%mm2 \n\t"\
"psubw %%mm1, %%mm0 \n\t"/*a-b (abccba)*/\
"psraw $2, %%mm0 \n\t"/*(a-b)/4 */\
"psubw %%mm1, %%mm0 \n\t"/*(a-b)/4-b */\
"paddsw %%mm2, %%mm0 \n\t"\
"psraw $2, %%mm0 \n\t"/*((a-b)/4-b+c)/4 */\
"paddw %%mm2, %%mm0 \n\t"/*(a-5*b+20*c)/16 */\
"psraw $6, %%mm0 \n\t"\
"packuswb %%mm0, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm7, d)\
"add $24, %0 \n\t"\
"add %3, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(tmp), "+c"(dst), "+g"(h)\
: "S"((x86_reg)dstStride)\
: "memory"\
);\
}\
\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
int h=8;\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movq "MANGLE(ff_pw_5)", %%mm6\n\t"\
"1: \n\t"\
"movq (%0), %%mm0 \n\t"\
"movq 1(%0), %%mm2 \n\t"\
"movq %%mm0, %%mm1 \n\t"\
"movq %%mm2, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpckhbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm3 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm3, %%mm1 \n\t"\
"psllw $2, %%mm0 \n\t"\
"psllw $2, %%mm1 \n\t"\
"movq -1(%0), %%mm2 \n\t"\
"movq 2(%0), %%mm4 \n\t"\
"movq %%mm2, %%mm3 \n\t"\
"movq %%mm4, %%mm5 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
"punpckhbw %%mm7, %%mm5 \n\t"\
"paddw %%mm4, %%mm2 \n\t"\
"paddw %%mm3, %%mm5 \n\t"\
"psubw %%mm2, %%mm0 \n\t"\
"psubw %%mm5, %%mm1 \n\t"\
"pmullw %%mm6, %%mm0 \n\t"\
"pmullw %%mm6, %%mm1 \n\t"\
"movd -2(%0), %%mm2 \n\t"\
"movd 7(%0), %%mm5 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm5 \n\t"\
"paddw %%mm3, %%mm2 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
"movq "MANGLE(ff_pw_16)", %%mm5\n\t"\
"paddw %%mm5, %%mm2 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm4, %%mm1 \n\t"\
"psraw $5, %%mm0 \n\t"\
"psraw $5, %%mm1 \n\t"\
"packuswb %%mm1, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm5, q)\
"add %3, %0 \n\t"\
"add %4, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(src), "+c"(dst), "+g"(h)\
: "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
: "memory"\
);\
ff_ ## OPNAME ## h264_qpel4_hv_lowpass_h_mmxext(tmp, dst, dstStride);\
}\
\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
int h=8;\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movq "MANGLE(ff_pw_5)", %%mm6\n\t"\
"1: \n\t"\
"movq (%0), %%mm0 \n\t"\
"movq 1(%0), %%mm2 \n\t"\
"movq %%mm0, %%mm1 \n\t"\
"movq %%mm2, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpckhbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm3 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm3, %%mm1 \n\t"\
"psllw $2, %%mm0 \n\t"\
"psllw $2, %%mm1 \n\t"\
"movq -1(%0), %%mm2 \n\t"\
"movq 2(%0), %%mm4 \n\t"\
"movq %%mm2, %%mm3 \n\t"\
"movq %%mm4, %%mm5 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
"punpckhbw %%mm7, %%mm5 \n\t"\
"paddw %%mm4, %%mm2 \n\t"\
"paddw %%mm3, %%mm5 \n\t"\
"psubw %%mm2, %%mm0 \n\t"\
"psubw %%mm5, %%mm1 \n\t"\
"pmullw %%mm6, %%mm0 \n\t"\
"pmullw %%mm6, %%mm1 \n\t"\
"movd -2(%0), %%mm2 \n\t"\
"movd 7(%0), %%mm5 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm5 \n\t"\
"paddw %%mm3, %%mm2 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
"movq "MANGLE(ff_pw_16)", %%mm5\n\t"\
"paddw %%mm5, %%mm2 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm4, %%mm1 \n\t"\
"psraw $5, %%mm0 \n\t"\
"psraw $5, %%mm1 \n\t"\
"movq (%2), %%mm4 \n\t"\
"packuswb %%mm1, %%mm0 \n\t"\
PAVGB" %%mm4, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm5, q)\
"add %5, %0 \n\t"\
"add %5, %1 \n\t"\
"add %4, %2 \n\t"\
"decl %3 \n\t"\
"jg 1b \n\t"\
: "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
: "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
: "memory"\
);\
}\
\
static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
int w= 2;\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
src -= 2*srcStride;\
\
while(w--){\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movd (%0), %%mm0 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm1 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm2 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm3 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm4 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
"cmpl $16, %4 \n\t"\
"jne 2f \n\t"\
QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
"2: \n\t"\
\
: "+a"(src), "+c"(dst)\
: "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "rm"(h)\
: "memory"\
);\
src += 4-(h+5)*srcStride;\
dst += 4-h*dstStride;\
}\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_op_mmxext(dst, src, dstStride, srcStride, h);\
src += 4;\
dst += 4;\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_op_mmxext(dst, src, dstStride, srcStride, h);\
}\
static av_always_inline void OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\
int w = (size+8)>>2;\
src -= 2*srcStride+2;\
while(w--){\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movd (%0), %%mm0 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm1 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm2 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm3 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm4 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48)\
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48)\
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*48)\
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\
QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\
QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\
"cmpl $16, %3 \n\t"\
"jne 2f \n\t"\
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 8*48)\
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 9*48)\
QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\
QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\
"2: \n\t"\
: "+a"(src)\
: "c"(tmp), "S"((x86_reg)srcStride), "rm"(size)\
: "memory"\
);\
ff_ ## OPNAME ## h264_qpel8or16_hv1_lowpass_op_mmxext(src, tmp, srcStride, size);\
tmp += 4;\
src += 4 - (size+5)*srcStride;\
src += 4;\
}\
}\
static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
int w = size>>4;\
do{\
int h = size;\
__asm__ volatile(\
"1: \n\t"\
"movq (%0), %%mm0 \n\t"\
"movq 8(%0), %%mm3 \n\t"\
"movq 2(%0), %%mm1 \n\t"\
"movq 10(%0), %%mm4 \n\t"\
"paddw %%mm4, %%mm0 \n\t"\
"paddw %%mm3, %%mm1 \n\t"\
"paddw 18(%0), %%mm3 \n\t"\
"paddw 16(%0), %%mm4 \n\t"\
"movq 4(%0), %%mm2 \n\t"\
"movq 12(%0), %%mm5 \n\t"\
"paddw 6(%0), %%mm2 \n\t"\
"paddw 14(%0), %%mm5 \n\t"\
"psubw %%mm1, %%mm0 \n\t"\
"psubw %%mm4, %%mm3 \n\t"\
"psraw $2, %%mm0 \n\t"\
"psraw $2, %%mm3 \n\t"\
"psubw %%mm1, %%mm0 \n\t"\
"psubw %%mm4, %%mm3 \n\t"\
"paddsw %%mm2, %%mm0 \n\t"\
"paddsw %%mm5, %%mm3 \n\t"\
"psraw $2, %%mm0 \n\t"\
"psraw $2, %%mm3 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm5, %%mm3 \n\t"\
"psraw $6, %%mm0 \n\t"\
"psraw $6, %%mm3 \n\t"\
"packuswb %%mm3, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm7, q)\
"add $48, %0 \n\t"\
"add %3, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(tmp), "+c"(dst), "+g"(h)\
: "S"((x86_reg)dstStride)\
: "memory"\
);\
tmp += 8 - size*24;\
dst += 8 - size*dstStride;\
ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_op_mmxext(dst, tmp, dstStride, 0, size);\
tmp += 8;\
dst += 8;\
}while(w--);\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
src += 8*srcStride;\
dst += 8*dstStride;\
OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\
\
static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
src += 8*dstStride;\
dst += 8*dstStride;\
src2 += 8*src2Stride;\
OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
}\
\
static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\
OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
ff_put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\
ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\
}\
\
static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
__asm__ volatile(\
"movq (%1), %%mm0 \n\t"\
"movq 24(%1), %%mm1 \n\t"\
"psraw $5, %%mm0 \n\t"\
"psraw $5, %%mm1 \n\t"\
"packuswb %%mm0, %%mm0 \n\t"\
"packuswb %%mm1, %%mm1 \n\t"\
PAVGB" (%0), %%mm0 \n\t"\
PAVGB" (%0,%3), %%mm1 \n\t"\
OP(%%mm0, (%2), %%mm4, d)\
OP(%%mm1, (%2,%4), %%mm5, d)\
"lea (%0,%3,2), %0 \n\t"\
"lea (%2,%4,2), %2 \n\t"\
"movq 48(%1), %%mm0 \n\t"\
"movq 72(%1), %%mm1 \n\t"\
"psraw $5, %%mm0 \n\t"\
"psraw $5, %%mm1 \n\t"\
"packuswb %%mm0, %%mm0 \n\t"\
"packuswb %%mm1, %%mm1 \n\t"\
PAVGB" (%0), %%mm0 \n\t"\
PAVGB" (%0,%3), %%mm1 \n\t"\
OP(%%mm0, (%2), %%mm4, d)\
OP(%%mm1, (%2,%4), %%mm5, d)\
:"+a"(src8), "+c"(src16), "+d"(dst)\
:"S"((x86_reg)src8Stride), "D"((x86_reg)dstStride)\
:"memory");\
}\
static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
static av_always_inline void ff_ ## OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
do{\
__asm__ volatile(\
"movq (%1), %%mm0 \n\t"\
"movq 8(%1), %%mm1 \n\t"\
"movq 48(%1), %%mm2 \n\t"\
"movq 8+48(%1), %%mm3 \n\t"\
"psraw $5, %%mm0 \n\t"\
"psraw $5, %%mm1 \n\t"\
"psraw $5, %%mm2 \n\t"\
"psraw $5, %%mm3 \n\t"\
"packuswb %%mm1, %%mm0 \n\t"\
"packuswb %%mm3, %%mm2 \n\t"\
PAVGB" (%0), %%mm0 \n\t"\
PAVGB" (%0,%3), %%mm2 \n\t"\
OP(%%mm0, (%2), %%mm5, q)\
OP(%%mm2, (%2,%4), %%mm5, q)\
::"a"(src8), "c"(src16), "d"(dst),\
"r"((x86_reg)src8Stride), "r"((x86_reg)dstStride)\
:"memory");\
src8 += 2L*src8Stride;\
src16 += 48;\
dst += 2L*dstStride;\
}while(h-=2);\
}\
static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\
OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\
ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
}\
#if ARCH_X86_64
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
int h=16;\
__asm__ volatile(\
"pxor %%xmm15, %%xmm15 \n\t"\
"movdqa %6, %%xmm14 \n\t"\
"movdqa %7, %%xmm13 \n\t"\
"1: \n\t"\
"lddqu 6(%0), %%xmm1 \n\t"\
"lddqu -2(%0), %%xmm7 \n\t"\
"movdqa %%xmm1, %%xmm0 \n\t"\
"punpckhbw %%xmm15, %%xmm1 \n\t"\
"punpcklbw %%xmm15, %%xmm0 \n\t"\
"punpcklbw %%xmm15, %%xmm7 \n\t"\
"movdqa %%xmm1, %%xmm2 \n\t"\
"movdqa %%xmm0, %%xmm6 \n\t"\
"movdqa %%xmm1, %%xmm3 \n\t"\
"movdqa %%xmm0, %%xmm8 \n\t"\
"movdqa %%xmm1, %%xmm4 \n\t"\
"movdqa %%xmm0, %%xmm9 \n\t"\
"movdqa %%xmm0, %%xmm12 \n\t"\
"movdqa %%xmm1, %%xmm11 \n\t"\
"palignr $10,%%xmm0, %%xmm11\n\t"\
"palignr $10,%%xmm7, %%xmm12\n\t"\
"palignr $2, %%xmm0, %%xmm4 \n\t"\
"palignr $2, %%xmm7, %%xmm9 \n\t"\
"palignr $4, %%xmm0, %%xmm3 \n\t"\
"palignr $4, %%xmm7, %%xmm8 \n\t"\
"palignr $6, %%xmm0, %%xmm2 \n\t"\
"palignr $6, %%xmm7, %%xmm6 \n\t"\
"paddw %%xmm0 ,%%xmm11 \n\t"\
"palignr $8, %%xmm0, %%xmm1 \n\t"\
"palignr $8, %%xmm7, %%xmm0 \n\t"\
"paddw %%xmm12,%%xmm7 \n\t"\
"paddw %%xmm3, %%xmm2 \n\t"\
"paddw %%xmm8, %%xmm6 \n\t"\
"paddw %%xmm4, %%xmm1 \n\t"\
"paddw %%xmm9, %%xmm0 \n\t"\
"psllw $2, %%xmm2 \n\t"\
"psllw $2, %%xmm6 \n\t"\
"psubw %%xmm1, %%xmm2 \n\t"\
"psubw %%xmm0, %%xmm6 \n\t"\
"paddw %%xmm13,%%xmm11 \n\t"\
"paddw %%xmm13,%%xmm7 \n\t"\
"pmullw %%xmm14,%%xmm2 \n\t"\
"pmullw %%xmm14,%%xmm6 \n\t"\
"lddqu (%2), %%xmm3 \n\t"\
"paddw %%xmm11,%%xmm2 \n\t"\
"paddw %%xmm7, %%xmm6 \n\t"\
"psraw $5, %%xmm2 \n\t"\
"psraw $5, %%xmm6 \n\t"\
"packuswb %%xmm2,%%xmm6 \n\t"\
"pavgb %%xmm3, %%xmm6 \n\t"\
OP(%%xmm6, (%1), %%xmm4, dqa)\
"add %5, %0 \n\t"\
"add %5, %1 \n\t"\
"add %4, %2 \n\t"\
"decl %3 \n\t"\
"jg 1b \n\t"\
: "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
: "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
"m"(ff_pw_5), "m"(ff_pw_16)\
: XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , \
"%xmm4" , "%xmm5" , "%xmm6" , "%xmm7" , \
"%xmm8" , "%xmm9" , "%xmm10", "%xmm11", \
"%xmm12", "%xmm13", "%xmm14", "%xmm15",)\
"memory"\
);\
}
void ff_avg_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);
void ff_put_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);
#else // ARCH_X86_64
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
src += 8*dstStride;\
dst += 8*dstStride;\
src2 += 8*src2Stride;\
OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
}
#endif // ARCH_X86_64
#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
int h=8;\
__asm__ volatile(\
"pxor %%xmm7, %%xmm7 \n\t"\
"movdqa "MANGLE(ff_pw_5)", %%xmm6\n\t"\
"1: \n\t"\
"lddqu -2(%0), %%xmm1 \n\t"\
"movdqa %%xmm1, %%xmm0 \n\t"\
"punpckhbw %%xmm7, %%xmm1 \n\t"\
"punpcklbw %%xmm7, %%xmm0 \n\t"\
"movdqa %%xmm1, %%xmm2 \n\t"\
"movdqa %%xmm1, %%xmm3 \n\t"\
"movdqa %%xmm1, %%xmm4 \n\t"\
"movdqa %%xmm1, %%xmm5 \n\t"\
"palignr $2, %%xmm0, %%xmm4 \n\t"\
"palignr $4, %%xmm0, %%xmm3 \n\t"\
"palignr $6, %%xmm0, %%xmm2 \n\t"\
"palignr $8, %%xmm0, %%xmm1 \n\t"\
"palignr $10,%%xmm0, %%xmm5 \n\t"\
"paddw %%xmm5, %%xmm0 \n\t"\
"paddw %%xmm3, %%xmm2 \n\t"\
"paddw %%xmm4, %%xmm1 \n\t"\
"psllw $2, %%xmm2 \n\t"\
"movq (%2), %%xmm3 \n\t"\
"psubw %%xmm1, %%xmm2 \n\t"\
"paddw "MANGLE(ff_pw_16)", %%xmm0\n\t"\
"pmullw %%xmm6, %%xmm2 \n\t"\
"paddw %%xmm0, %%xmm2 \n\t"\
"psraw $5, %%xmm2 \n\t"\
"packuswb %%xmm2, %%xmm2 \n\t"\
"pavgb %%xmm3, %%xmm2 \n\t"\
OP(%%xmm2, (%1), %%xmm4, q)\
"add %5, %0 \n\t"\
"add %5, %1 \n\t"\
"add %4, %2 \n\t"\
"decl %3 \n\t"\
"jg 1b \n\t"\
: "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
: "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
: XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
"%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
"memory"\
);\
}\
QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
int h=8;\
__asm__ volatile(\
"pxor %%xmm7, %%xmm7 \n\t"\
"movdqa "MANGLE(ff_pw_5)", %%xmm6\n\t"\
"1: \n\t"\
"lddqu -2(%0), %%xmm1 \n\t"\
"movdqa %%xmm1, %%xmm0 \n\t"\
"punpckhbw %%xmm7, %%xmm1 \n\t"\
"punpcklbw %%xmm7, %%xmm0 \n\t"\
"movdqa %%xmm1, %%xmm2 \n\t"\
"movdqa %%xmm1, %%xmm3 \n\t"\
"movdqa %%xmm1, %%xmm4 \n\t"\
"movdqa %%xmm1, %%xmm5 \n\t"\
"palignr $2, %%xmm0, %%xmm4 \n\t"\
"palignr $4, %%xmm0, %%xmm3 \n\t"\
"palignr $6, %%xmm0, %%xmm2 \n\t"\
"palignr $8, %%xmm0, %%xmm1 \n\t"\
"palignr $10,%%xmm0, %%xmm5 \n\t"\
"paddw %%xmm5, %%xmm0 \n\t"\
"paddw %%xmm3, %%xmm2 \n\t"\
"paddw %%xmm4, %%xmm1 \n\t"\
"psllw $2, %%xmm2 \n\t"\
"psubw %%xmm1, %%xmm2 \n\t"\
"paddw "MANGLE(ff_pw_16)", %%xmm0\n\t"\
"pmullw %%xmm6, %%xmm2 \n\t"\
"paddw %%xmm0, %%xmm2 \n\t"\
"psraw $5, %%xmm2 \n\t"\
"packuswb %%xmm2, %%xmm2 \n\t"\
OP(%%xmm2, (%1), %%xmm4, q)\
"add %3, %0 \n\t"\
"add %4, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(src), "+c"(dst), "+g"(h)\
: "D"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
: XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
"%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
"memory"\
);\
}\
static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
src += 8*srcStride;\
dst += 8*dstStride;\
OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\
#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
src -= 2*srcStride;\
\
__asm__ volatile(\
"pxor %%xmm7, %%xmm7 \n\t"\
"movq (%0), %%xmm0 \n\t"\
"add %2, %0 \n\t"\
"movq (%0), %%xmm1 \n\t"\
"add %2, %0 \n\t"\
"movq (%0), %%xmm2 \n\t"\
"add %2, %0 \n\t"\
"movq (%0), %%xmm3 \n\t"\
"add %2, %0 \n\t"\
"movq (%0), %%xmm4 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%xmm7, %%xmm0 \n\t"\
"punpcklbw %%xmm7, %%xmm1 \n\t"\
"punpcklbw %%xmm7, %%xmm2 \n\t"\
"punpcklbw %%xmm7, %%xmm3 \n\t"\
"punpcklbw %%xmm7, %%xmm4 \n\t"\
QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
"cmpl $16, %4 \n\t"\
"jne 2f \n\t"\
QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
"2: \n\t"\
\
: "+a"(src), "+c"(dst)\
: "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "rm"(h)\
: XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
"%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
"memory"\
);\
}\
static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}
static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){
static av_always_inline void ff_put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){
int w = (size+8)>>3;
src -= 2*srcStride+2;
while(w--){
__asm__ volatile(
"pxor %%xmm7, %%xmm7 \n\t"
"movq (%0), %%xmm0 \n\t"
"add %2, %0 \n\t"
"movq (%0), %%xmm1 \n\t"
"add %2, %0 \n\t"
"movq (%0), %%xmm2 \n\t"
"add %2, %0 \n\t"
"movq (%0), %%xmm3 \n\t"
"add %2, %0 \n\t"
"movq (%0), %%xmm4 \n\t"
"add %2, %0 \n\t"
"punpcklbw %%xmm7, %%xmm0 \n\t"
"punpcklbw %%xmm7, %%xmm1 \n\t"
"punpcklbw %%xmm7, %%xmm2 \n\t"
"punpcklbw %%xmm7, %%xmm3 \n\t"
"punpcklbw %%xmm7, %%xmm4 \n\t"
QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48)
QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48)
QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48)
QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48)
QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48)
QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48)
QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48)
QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48)
"cmpl $16, %3 \n\t"
"jne 2f \n\t"
QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 8*48)
QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 9*48)
QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48)
QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48)
QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48)
QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48)
QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48)
QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48)
"2: \n\t"
: "+a"(src)
: "c"(tmp), "S"((x86_reg)srcStride), "rm"(size)
: XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",)
"memory"
);
ff_put_h264_qpel8or16_hv1_lowpass_op_sse2(src, tmp, srcStride, size);
tmp += 8;
src += 8 - (size+5)*srcStride;
src += 8;
}
}
#define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\
static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
int h = size;\
if(size == 16){\
__asm__ volatile(\
"1: \n\t"\
"movdqa 32(%0), %%xmm4 \n\t"\
"movdqa 16(%0), %%xmm5 \n\t"\
"movdqa (%0), %%xmm7 \n\t"\
"movdqa %%xmm4, %%xmm3 \n\t"\
"movdqa %%xmm4, %%xmm2 \n\t"\
"movdqa %%xmm4, %%xmm1 \n\t"\
"movdqa %%xmm4, %%xmm0 \n\t"\
"palignr $10, %%xmm5, %%xmm0 \n\t"\
"palignr $8, %%xmm5, %%xmm1 \n\t"\
"palignr $6, %%xmm5, %%xmm2 \n\t"\
"palignr $4, %%xmm5, %%xmm3 \n\t"\
"palignr $2, %%xmm5, %%xmm4 \n\t"\
"paddw %%xmm5, %%xmm0 \n\t"\
"paddw %%xmm4, %%xmm1 \n\t"\
"paddw %%xmm3, %%xmm2 \n\t"\
"movdqa %%xmm5, %%xmm6 \n\t"\
"movdqa %%xmm5, %%xmm4 \n\t"\
"movdqa %%xmm5, %%xmm3 \n\t"\
"palignr $8, %%xmm7, %%xmm4 \n\t"\
"palignr $2, %%xmm7, %%xmm6 \n\t"\
"palignr $10, %%xmm7, %%xmm3 \n\t"\
"paddw %%xmm6, %%xmm4 \n\t"\
"movdqa %%xmm5, %%xmm6 \n\t"\
"palignr $6, %%xmm7, %%xmm5 \n\t"\
"palignr $4, %%xmm7, %%xmm6 \n\t"\
"paddw %%xmm7, %%xmm3 \n\t"\
"paddw %%xmm6, %%xmm5 \n\t"\
\
"psubw %%xmm1, %%xmm0 \n\t"\
"psubw %%xmm4, %%xmm3 \n\t"\
"psraw $2, %%xmm0 \n\t"\
"psraw $2, %%xmm3 \n\t"\
"psubw %%xmm1, %%xmm0 \n\t"\
"psubw %%xmm4, %%xmm3 \n\t"\
"paddw %%xmm2, %%xmm0 \n\t"\
"paddw %%xmm5, %%xmm3 \n\t"\
"psraw $2, %%xmm0 \n\t"\
"psraw $2, %%xmm3 \n\t"\
"paddw %%xmm2, %%xmm0 \n\t"\
"paddw %%xmm5, %%xmm3 \n\t"\
"psraw $6, %%xmm0 \n\t"\
"psraw $6, %%xmm3 \n\t"\
"packuswb %%xmm0, %%xmm3 \n\t"\
OP(%%xmm3, (%1), %%xmm7, dqa)\
"add $48, %0 \n\t"\
"add %3, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(tmp), "+c"(dst), "+g"(h)\
: "S"((x86_reg)dstStride)\
: XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
"%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
"memory"\
);\
}else{\
__asm__ volatile(\
"1: \n\t"\
"movdqa 16(%0), %%xmm1 \n\t"\
"movdqa (%0), %%xmm0 \n\t"\
"movdqa %%xmm1, %%xmm2 \n\t"\
"movdqa %%xmm1, %%xmm3 \n\t"\
"movdqa %%xmm1, %%xmm4 \n\t"\
"movdqa %%xmm1, %%xmm5 \n\t"\
"palignr $10, %%xmm0, %%xmm5 \n\t"\
"palignr $8, %%xmm0, %%xmm4 \n\t"\
"palignr $6, %%xmm0, %%xmm3 \n\t"\
"palignr $4, %%xmm0, %%xmm2 \n\t"\
"palignr $2, %%xmm0, %%xmm1 \n\t"\
"paddw %%xmm5, %%xmm0 \n\t"\
"paddw %%xmm4, %%xmm1 \n\t"\
"paddw %%xmm3, %%xmm2 \n\t"\
"psubw %%xmm1, %%xmm0 \n\t"\
"psraw $2, %%xmm0 \n\t"\
"psubw %%xmm1, %%xmm0 \n\t"\
"paddw %%xmm2, %%xmm0 \n\t"\
"psraw $2, %%xmm0 \n\t"\
"paddw %%xmm2, %%xmm0 \n\t"\
"psraw $6, %%xmm0 \n\t"\
"packuswb %%xmm0, %%xmm0 \n\t"\
OP(%%xmm0, (%1), %%xmm7, q)\
"add $48, %0 \n\t"\
"add %3, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(tmp), "+c"(dst), "+g"(h)\
: "S"((x86_reg)dstStride)\
: XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
"%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
"memory"\
);\
}\
}
#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
ff_put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
}\
static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
}\
#define put_pixels8_l2_sse2 put_pixels8_l2_mmxext
#define avg_pixels8_l2_sse2 avg_pixels8_l2_mmxext
#define put_pixels16_l2_sse2 put_pixels16_l2_mmxext
#define avg_pixels16_l2_sse2 avg_pixels16_l2_mmxext
#define put_pixels8_l2_ssse3 put_pixels8_l2_mmxext
#define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmxext
#define put_pixels16_l2_ssse3 put_pixels16_l2_mmxext
#define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmxext
#define ff_put_h264_qpel8_h_lowpass_l2_sse2 ff_put_h264_qpel8_h_lowpass_l2_mmxext
#define ff_avg_h264_qpel8_h_lowpass_l2_sse2 ff_avg_h264_qpel8_h_lowpass_l2_mmxext
#define ff_put_h264_qpel16_h_lowpass_l2_sse2 ff_put_h264_qpel16_h_lowpass_l2_mmxext
#define ff_avg_h264_qpel16_h_lowpass_l2_sse2 ff_avg_h264_qpel16_h_lowpass_l2_mmxext
#define put_pixels8_l2_shift5_sse2 put_pixels8_l2_shift5_mmxext
#define avg_pixels8_l2_shift5_sse2 avg_pixels8_l2_shift5_mmxext
#define put_pixels16_l2_shift5_sse2 put_pixels16_l2_shift5_mmxext
#define avg_pixels16_l2_shift5_sse2 avg_pixels16_l2_shift5_mmxext
#define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmxext
#define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmxext
#define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmxext
#define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmxext
#define ff_put_h264_qpel8_v_lowpass_ssse3 ff_put_h264_qpel8_v_lowpass_sse2
#define ff_avg_h264_qpel8_v_lowpass_ssse3 ff_avg_h264_qpel8_v_lowpass_sse2
#define ff_put_h264_qpel16_v_lowpass_ssse3 ff_put_h264_qpel16_v_lowpass_sse2
#define ff_avg_h264_qpel16_v_lowpass_ssse3 ff_avg_h264_qpel16_v_lowpass_sse2
#define put_h264_qpel8_h_lowpass_l2_sse2 put_h264_qpel8_h_lowpass_l2_mmxext
#define avg_h264_qpel8_h_lowpass_l2_sse2 avg_h264_qpel8_h_lowpass_l2_mmxext
#define put_h264_qpel16_h_lowpass_l2_sse2 put_h264_qpel16_h_lowpass_l2_mmxext
#define avg_h264_qpel16_h_lowpass_l2_sse2 avg_h264_qpel16_h_lowpass_l2_mmxext
#define put_h264_qpel8_v_lowpass_ssse3 put_h264_qpel8_v_lowpass_sse2
#define avg_h264_qpel8_v_lowpass_ssse3 avg_h264_qpel8_v_lowpass_sse2
#define put_h264_qpel16_v_lowpass_ssse3 put_h264_qpel16_v_lowpass_sse2
#define avg_h264_qpel16_v_lowpass_ssse3 avg_h264_qpel16_v_lowpass_sse2
#define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmxext
#define avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmxext
#define ff_put_h264_qpel8or16_hv2_lowpass_sse2 ff_put_h264_qpel8or16_hv2_lowpass_mmxext
#define ff_avg_h264_qpel8or16_hv2_lowpass_sse2 ff_avg_h264_qpel8or16_hv2_lowpass_mmxext
#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \
H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
......@@ -1040,77 +250,77 @@ H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\
static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
put_pixels16_sse2(dst, src, stride, 16);
ff_put_pixels16_sse2(dst, src, stride, 16);
}
static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
avg_pixels16_sse2(dst, src, stride, 16);
ff_avg_pixels16_sse2(dst, src, stride, 16);
}
#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmxext
#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmxext
#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
ff_ ## OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
}\
#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
}\
#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
}\
#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\
OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
......@@ -1118,8 +328,8 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
assert(((int)temp & 7) == 0);\
put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
......@@ -1127,8 +337,8 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
assert(((int)temp & 7) == 0);\
put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
......@@ -1136,8 +346,8 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
assert(((int)temp & 7) == 0);\
put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
......@@ -1145,8 +355,8 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
assert(((int)temp & 7) == 0);\
put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
}\
#define H264_MC_4816(MMX)\
......@@ -1171,25 +381,18 @@ QPEL_H264_V_XMM(put_, PUT_OP, sse2)
QPEL_H264_V_XMM(avg_,AVG_MMXEXT_OP, sse2)
QPEL_H264_HV_XMM(put_, PUT_OP, sse2)
QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, sse2)
#if HAVE_SSSE3_INLINE
QPEL_H264_H_XMM(put_, PUT_OP, ssse3)
QPEL_H264_H_XMM(avg_,AVG_MMXEXT_OP, ssse3)
QPEL_H264_HV2_XMM(put_, PUT_OP, ssse3)
QPEL_H264_HV2_XMM(avg_,AVG_MMXEXT_OP, ssse3)
QPEL_H264_HV_XMM(put_, PUT_OP, ssse3)
QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, ssse3)
#endif
#undef PAVGB
H264_MC_4816(mmxext)
H264_MC_816(H264_MC_V, sse2)
H264_MC_816(H264_MC_HV, sse2)
#if HAVE_SSSE3_INLINE
H264_MC_816(H264_MC_H, ssse3)
H264_MC_816(H264_MC_HV, ssse3)
#endif
#endif /* HAVE_INLINE_ASM */
//10bit
#define LUMA_MC_OP(OP, NUM, DEPTH, TYPE, OPT) \
......@@ -1285,3 +488,5 @@ QPEL16_OP(mc33, MMX)
#if ARCH_X86_32 && HAVE_YASM && CONFIG_H264QPEL // ARCH_X86_64 implies SSE2+
QPEL16(mmxext)
#endif
#endif /* HAVE_YASM */
;*****************************************************************************
;* MMX/SSE2/SSSE3-optimized H.264 QPEL code
;*****************************************************************************
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2012 Daniel Kang
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA 32
cextern pw_16
cextern pw_5
cextern pb_0
SECTION .text
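; Store helpers used by all functions below: %1 is the result register, %2 the
; destination memory operand, %3 an optional scratch register (used by
; op_avgh). The put variants store the result as-is; the avg variants first
; average it with the existing destination pixels (pavgb). The *h variants
; move only the low half of the register (movh), the others a full mmx/xmm
; register (mova).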
%macro op_avgh 3
movh %3, %2
pavgb %1, %3
movh %2, %1
%endmacro
%macro op_avg 2-3
pavgb %1, %2
mova %2, %1
%endmacro
%macro op_puth 2-3
movh %2, %1
%endmacro
%macro op_put 2-3
mova %2, %1
%endmacro
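; Horizontal 6-tap H.264 luma lowpass filter used by the qpel*_h_lowpass*
; functions below, one row per loop iteration:
; dst[x] = clip8(((src[x-2]+src[x+3]) - 5*(src[x-1]+src[x+2])
;                 + 20*(src[x]+src[x+1]) + 16) >> 5)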
%macro QPEL4_H_LOWPASS_OP 1
cglobal %1_h264_qpel4_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
pxor m7, m7
mova m4, [pw_5]
mova m5, [pw_16]
mov r4d, 4
.loop:
movh m1, [r1-1]
movh m2, [r1+0]
movh m3, [r1+1]
movh m0, [r1+2]
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m0, m7
paddw m1, m0
paddw m2, m3
movh m0, [r1-2]
movh m3, [r1+3]
punpcklbw m0, m7
punpcklbw m3, m7
paddw m0, m3
psllw m2, 2
psubw m2, m1
pmullw m2, m4
paddw m0, m5
paddw m0, m2
psraw m0, 5
packuswb m0, m0
op_%1h m0, [r0], m6
add r0, r2
add r1, r3
dec r4d
jg .loop
REP_RET
%endmacro
INIT_MMX mmxext
QPEL4_H_LOWPASS_OP put
QPEL4_H_LOWPASS_OP avg
%macro QPEL8_H_LOWPASS_OP 1
cglobal %1_h264_qpel8_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
mov r4d, 8
pxor m7, m7
mova m6, [pw_5]
.loop:
mova m0, [r1]
mova m2, [r1+1]
mova m1, m0
mova m3, m2
punpcklbw m0, m7
punpckhbw m1, m7
punpcklbw m2, m7
punpckhbw m3, m7
paddw m0, m2
paddw m1, m3
psllw m0, 2
psllw m1, 2
mova m2, [r1-1]
mova m4, [r1+2]
mova m3, m2
mova m5, m4
punpcklbw m2, m7
punpckhbw m3, m7
punpcklbw m4, m7
punpckhbw m5, m7
paddw m2, m4
paddw m5, m3
psubw m0, m2
psubw m1, m5
pmullw m0, m6
pmullw m1, m6
movd m2, [r1-2]
movd m5, [r1+7]
punpcklbw m2, m7
punpcklbw m5, m7
paddw m2, m3
paddw m4, m5
mova m5, [pw_16]
paddw m2, m5
paddw m4, m5
paddw m0, m2
paddw m1, m4
psraw m0, 5
psraw m1, 5
packuswb m0, m1
op_%1 m0, [r0], m4
add r0, r2
add r1, r3
dec r4d
jg .loop
REP_RET
%endmacro
INIT_MMX mmxext
QPEL8_H_LOWPASS_OP put
QPEL8_H_LOWPASS_OP avg
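; SSSE3 version: the source bytes for a row are fetched with a single
; unaligned 16-byte load and the shifted tap vectors are built with palignr
; instead of separate loads.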
%macro QPEL8_H_LOWPASS_OP_XMM 1
cglobal %1_h264_qpel8_h_lowpass, 4,5,7 ; dst, src, dstStride, srcStride
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
mov r4d, 8
pxor m7, m7
mova m6, [pw_5]
.loop:
movu m1, [r1-2]
mova m0, m1
punpckhbw m1, m7
punpcklbw m0, m7
mova m2, m1
mova m3, m1
mova m4, m1
mova m5, m1
palignr m4, m0, 2
palignr m3, m0, 4
palignr m2, m0, 6
palignr m1, m0, 8
palignr m5, m0, 10
paddw m0, m5
paddw m2, m3
paddw m1, m4
psllw m2, 2
psubw m2, m1
paddw m0, [pw_16]
pmullw m2, m6
paddw m2, m0
psraw m2, 5
packuswb m2, m2
op_%1h m2, [r0], m4
add r1, r3
add r0, r2
dec r4d
jne .loop
REP_RET
%endmacro
INIT_XMM ssse3
QPEL8_H_LOWPASS_OP_XMM put
QPEL8_H_LOWPASS_OP_XMM avg
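; *_l2 variants: same horizontal filter, but the filtered row is additionally
; averaged (pavgb) with a row from a second source (src2) before the final
; put/avg store; the callers use this to blend two half-pel predictions.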
%macro QPEL4_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel4_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, src2Stride
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
pxor m7, m7
mova m4, [pw_5]
mova m5, [pw_16]
mov r5d, 4
.loop:
movh m1, [r1-1]
movh m2, [r1+0]
movh m3, [r1+1]
movh m0, [r1+2]
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m0, m7
paddw m1, m0
paddw m2, m3
movh m0, [r1-2]
movh m3, [r1+3]
punpcklbw m0, m7
punpcklbw m3, m7
paddw m0, m3
psllw m2, 2
psubw m2, m1
pmullw m2, m4
paddw m0, m5
paddw m0, m2
movh m3, [r2]
psraw m0, 5
packuswb m0, m0
pavgb m0, m3
op_%1h m0, [r0], m6
add r0, r3
add r1, r3
add r2, r4
dec r5d
jg .loop
REP_RET
%endmacro
INIT_MMX mmxext
QPEL4_H_LOWPASS_L2_OP put
QPEL4_H_LOWPASS_L2_OP avg
%macro QPEL8_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel8_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, src2Stride
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
mov r5d, 8
pxor m7, m7
mova m6, [pw_5]
.loop:
mova m0, [r1]
mova m2, [r1+1]
mova m1, m0
mova m3, m2
punpcklbw m0, m7
punpckhbw m1, m7
punpcklbw m2, m7
punpckhbw m3, m7
paddw m0, m2
paddw m1, m3
psllw m0, 2
psllw m1, 2
mova m2, [r1-1]
mova m4, [r1+2]
mova m3, m2
mova m5, m4
punpcklbw m2, m7
punpckhbw m3, m7
punpcklbw m4, m7
punpckhbw m5, m7
paddw m2, m4
paddw m5, m3
psubw m0, m2
psubw m1, m5
pmullw m0, m6
pmullw m1, m6
movd m2, [r1-2]
movd m5, [r1+7]
punpcklbw m2, m7
punpcklbw m5, m7
paddw m2, m3
paddw m4, m5
mova m5, [pw_16]
paddw m2, m5
paddw m4, m5
paddw m0, m2
paddw m1, m4
psraw m0, 5
psraw m1, 5
mova m4, [r2]
packuswb m0, m1
pavgb m0, m4
op_%1 m0, [r0], m4
add r0, r3
add r1, r3
add r2, r4
dec r5d
jg .loop
REP_RET
%endmacro
INIT_MMX mmxext
QPEL8_H_LOWPASS_L2_OP put
QPEL8_H_LOWPASS_L2_OP avg
%macro QPEL8_H_LOWPASS_L2_OP_XMM 1
cglobal %1_h264_qpel8_h_lowpass_l2, 5,6,7 ; dst, src, src2, dstStride, src2Stride
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
mov r5d, 8
pxor m7, m7
mova m6, [pw_5]
.loop:
lddqu m1, [r1-2]
mova m0, m1
punpckhbw m1, m7
punpcklbw m0, m7
mova m2, m1
mova m3, m1
mova m4, m1
mova m5, m1
palignr m4, m0, 2
palignr m3, m0, 4
palignr m2, m0, 6
palignr m1, m0, 8
palignr m5, m0, 10
paddw m0, m5
paddw m2, m3
paddw m1, m4
psllw m2, 2
movh m3, [r2]
psubw m2, m1
paddw m0, [pw_16]
pmullw m2, m6
paddw m2, m0
psraw m2, 5
packuswb m2, m2
pavgb m2, m3
op_%1h m2, [r0], m4
add r1, r3
add r0, r3
add r2, r4
dec r5d
jg .loop
REP_RET
%endmacro
INIT_XMM ssse3
QPEL8_H_LOWPASS_L2_OP_XMM put
QPEL8_H_LOWPASS_L2_OP_XMM avg
; All functions that call this are required to have function arguments of
; dst, src, dstStride, srcStride
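; One output row of the vertical 6-tap filter: m0-m4 hold the previous five
; source rows as words and m7 is zero; the next row is loaded into m5,
; ((m0+m5) - 5*(m1+m4) + 20*(m2+m3) + 16) >> 5 is computed, packed and written
; with the put/avg store helper, and the register window is rotated with SWAP.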
%macro FILT_V 1
mova m6, m2
movh m5, [r1]
paddw m6, m3
psllw m6, 2
psubw m6, m1
psubw m6, m4
punpcklbw m5, m7
pmullw m6, [pw_5]
paddw m0, [pw_16]
add r1, r3
paddw m0, m5
paddw m6, m0
psraw m6, 5
packuswb m6, m6
op_%1h m6, [r0], m0 ; 1
add r0, r2
SWAP 0, 1, 2, 3, 4, 5
%endmacro
%macro QPEL4_V_LOWPASS_OP 1
cglobal %1_h264_qpel4_v_lowpass, 4,4 ; dst, src, dstStride, srcStride
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
sub r1, r3
sub r1, r3
pxor m7, m7
movh m0, [r1]
movh m1, [r1+r3]
lea r1, [r1+2*r3]
movh m2, [r1]
movh m3, [r1+r3]
lea r1, [r1+2*r3]
movh m4, [r1]
add r1, r3
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m4, m7
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
RET
%endmacro
INIT_MMX mmxext
QPEL4_V_LOWPASS_OP put
QPEL4_V_LOWPASS_OP avg
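; The sse2 version rewinds src by two rows itself; the mmxext version (named
; with an _op suffix) expects its caller to have done so already.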
%macro QPEL8OR16_V_LOWPASS_OP 1
%if cpuflag(sse2)
cglobal %1_h264_qpel8or16_v_lowpass, 5,5,7 ; dst, src, dstStride, srcStride, h
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
sub r1, r3
sub r1, r3
%else
cglobal %1_h264_qpel8or16_v_lowpass_op, 5,5,7 ; dst, src, dstStride, srcStride, h
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
%endif
pxor m7, m7
movh m0, [r1]
movh m1, [r1+r3]
lea r1, [r1+2*r3]
movh m2, [r1]
movh m3, [r1+r3]
lea r1, [r1+2*r3]
movh m4, [r1]
add r1, r3
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m4, m7
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
cmp r4d, 16
jne .end
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
.end:
REP_RET
%endmacro
INIT_MMX mmxext
QPEL8OR16_V_LOWPASS_OP put
QPEL8OR16_V_LOWPASS_OP avg
INIT_XMM sse2
QPEL8OR16_V_LOWPASS_OP put
QPEL8OR16_V_LOWPASS_OP avg
; All functions that use this are required to have args:
; src, tmp, srcStride
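; One row of the first (vertical) pass of the 2-D filter: as in FILT_V, m0-m4
; hold the previous five source rows and the next row is loaded into m5, but
; the unscaled word result (m0+m5) - 5*(m1+m4) + 20*(m2+m3) + 16 is stored to
; the intermediate buffer at [r1+%1] instead of being shifted and packed.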
%macro FILT_HV 1 ; offset
mova m6, m2
movh m5, [r0]
paddw m6, m3
psllw m6, 2
paddw m0, [pw_16]
psubw m6, m1
psubw m6, m4
punpcklbw m5, m7
pmullw m6, [pw_5]
paddw m0, m5
add r0, r2
paddw m6, m0
mova [r1+%1], m6
SWAP 0, 1, 2, 3, 4, 5
%endmacro
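; 2-D (hv) filter for 4x4 blocks, split into two entry points: a vertical pass
; that writes 16-bit rows 24 bytes apart into tmp, and a horizontal pass that
; filters tmp and stores the final 8-bit pixels.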
%macro QPEL4_HV1_LOWPASS_OP 1
cglobal %1_h264_qpel4_hv_lowpass_v, 3,3 ; src, tmp, srcStride
movsxdifnidn r2, r2d
pxor m7, m7
movh m0, [r0]
movh m1, [r0+r2]
lea r0, [r0+2*r2]
movh m2, [r0]
movh m3, [r0+r2]
lea r0, [r0+2*r2]
movh m4, [r0]
add r0, r2
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m4, m7
FILT_HV 0*24
FILT_HV 1*24
FILT_HV 2*24
FILT_HV 3*24
RET
cglobal %1_h264_qpel4_hv_lowpass_h, 3,4 ; tmp, dst, dstStride
movsxdifnidn r2, r2d
mov r3d, 4
.loop:
mova m0, [r0]
paddw m0, [r0+10]
mova m1, [r0+2]
paddw m1, [r0+8]
mova m2, [r0+4]
paddw m2, [r0+6]
psubw m0, m1
psraw m0, 2
psubw m0, m1
paddsw m0, m2
psraw m0, 2
paddw m0, m2
psraw m0, 6
packuswb m0, m0
op_%1h m0, [r1], m7
add r0, 24
add r1, r2
dec r3d
jnz .loop
REP_RET
%endmacro
INIT_MMX mmxext
QPEL4_HV1_LOWPASS_OP put
QPEL4_HV1_LOWPASS_OP avg
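; Vertical first pass of the 2-D filter for 8- and 16-row blocks: 16-bit rows
; are written 48 bytes apart into tmp; 8 or 16 rows are processed depending on
; the size argument.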
%macro QPEL8OR16_HV1_LOWPASS_OP 1
cglobal %1_h264_qpel8or16_hv1_lowpass_op, 4,4,7 ; src, tmp, srcStride, size
movsxdifnidn r2, r2d
pxor m7, m7
movh m0, [r0]
movh m1, [r0+r2]
lea r0, [r0+2*r2]
movh m2, [r0]
movh m3, [r0+r2]
lea r0, [r0+2*r2]
movh m4, [r0]
add r0, r2
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m4, m7
FILT_HV 0*48
FILT_HV 1*48
FILT_HV 2*48
FILT_HV 3*48
FILT_HV 4*48
FILT_HV 5*48
FILT_HV 6*48
FILT_HV 7*48
cmp r3d, 16
jne .end
FILT_HV 8*48
FILT_HV 9*48
FILT_HV 10*48
FILT_HV 11*48
FILT_HV 12*48
FILT_HV 13*48
FILT_HV 14*48
FILT_HV 15*48
.end:
REP_RET
%endmacro
INIT_MMX mmxext
QPEL8OR16_HV1_LOWPASS_OP put
QPEL8OR16_HV1_LOWPASS_OP avg
INIT_XMM sse2
QPEL8OR16_HV1_LOWPASS_OP put
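; Second (horizontal) pass of the 2-D filter: reads the 16-bit rows written by
; the hv1 pass (48 bytes apart) and applies the same 6-tap kernel, replacing
; the multiplies with a staged shift sequence (>>2, >>2, >>6) so the values
; stay within 16-bit arithmetic.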
%macro QPEL8OR16_HV2_LOWPASS_OP 1
; the unused argument (tmpStride) is kept so the argument list matches the ssse3 version
cglobal %1_h264_qpel8or16_hv2_lowpass_op, 5,5 ; dst, tmp, dstStride, unused, h
movsxdifnidn r2, r2d
.loop:
mova m0, [r1]
mova m3, [r1+8]
mova m1, [r1+2]
mova m4, [r1+10]
paddw m0, m4
paddw m1, m3
paddw m3, [r1+18]
paddw m4, [r1+16]
mova m2, [r1+4]
mova m5, [r1+12]
paddw m2, [r1+6]
paddw m5, [r1+14]
psubw m0, m1
psubw m3, m4
psraw m0, 2
psraw m3, 2
psubw m0, m1
psubw m3, m4
paddsw m0, m2
paddsw m3, m5
psraw m0, 2
psraw m3, 2
paddw m0, m2
paddw m3, m5
psraw m0, 6
psraw m3, 6
packuswb m0, m3
op_%1 m0, [r0], m7
add r1, 48
add r0, r2
dec r4d
jne .loop
REP_RET
%endmacro
INIT_MMX mmxext
QPEL8OR16_HV2_LOWPASS_OP put
QPEL8OR16_HV2_LOWPASS_OP avg
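; SSSE3 version of the second pass: the word taps are built with palignr from
; the intermediate rows; 16-wide blocks are filtered a full row (two 8-pixel
; halves) per iteration, 8-wide blocks one 8-pixel row per iteration.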
%macro QPEL8OR16_HV2_LOWPASS_OP_XMM 1
cglobal %1_h264_qpel8or16_hv2_lowpass, 5,5,7 ; dst, tmp, dstStride, tmpStride, size
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
cmp r4d, 16
je .op16
.loop8:
mova m1, [r1+16]
mova m0, [r1]
mova m2, m1
mova m3, m1
mova m4, m1
mova m5, m1
palignr m5, m0, 10
palignr m4, m0, 8
palignr m3, m0, 6
palignr m2, m0, 4
palignr m1, m0, 2
paddw m0, m5
paddw m1, m4
paddw m2, m3
psubw m0, m1
psraw m0, 2
psubw m0, m1
paddw m0, m2
psraw m0, 2
paddw m0, m2
psraw m0, 6
packuswb m0, m0
op_%1h m0, [r0], m7
add r1, 48
add r0, r2
dec r4d
jne .loop8
jmp .done
.op16:
mova m4, [r1+32]
mova m5, [r1+16]
mova m7, [r1]
mova m3, m4
mova m2, m4
mova m1, m4
mova m0, m4
palignr m0, m5, 10
palignr m1, m5, 8
palignr m2, m5, 6
palignr m3, m5, 4
palignr m4, m5, 2
paddw m0, m5
paddw m1, m4
paddw m2, m3
mova m6, m5
mova m4, m5
mova m3, m5
palignr m4, m7, 8
palignr m6, m7, 2
palignr m3, m7, 10
paddw m4, m6
mova m6, m5
palignr m5, m7, 6
palignr m6, m7, 4
paddw m3, m7
paddw m5, m6
psubw m0, m1
psubw m3, m4
psraw m0, 2
psraw m3, 2
psubw m0, m1
psubw m3, m4
paddw m0, m2
paddw m3, m5
psraw m0, 2
psraw m3, 2
paddw m0, m2
paddw m3, m5
psraw m0, 6
psraw m3, 6
packuswb m3, m0
op_%1 m3, [r0], m7
add r1, 48
add r0, r2
dec r4d
jne .op16
.done:
REP_RET
%endmacro
INIT_XMM ssse3
QPEL8OR16_HV2_LOWPASS_OP_XMM put
QPEL8OR16_HV2_LOWPASS_OP_XMM avg
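; Average the 16-bit intermediates from the vertical pass (shifted down by 5
; and clipped to 8 bits) with an 8-bit prediction from src8; the C callers use
; this for the mc12/mc32 positions.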
%macro PIXELS4_L2_SHIFT5 1
cglobal %1_pixels4_l2_shift5,6,6 ; dst, src16, src8, dstStride, src8Stride, h
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
mova m0, [r1]
mova m1, [r1+24]
psraw m0, 5
psraw m1, 5
packuswb m0, m0
packuswb m1, m1
pavgb m0, [r2]
pavgb m1, [r2+r4]
op_%1h m0, [r0], m4
op_%1h m1, [r0+r3], m5
lea r2, [r2+r4*2]
lea r0, [r0+r3*2]
mova m0, [r1+48]
mova m1, [r1+72]
psraw m0, 5
psraw m1, 5
packuswb m0, m0
packuswb m1, m1
pavgb m0, [r2]
pavgb m1, [r2+r4]
op_%1h m0, [r0], m4
op_%1h m1, [r0+r3], m5
RET
%endmacro
INIT_MMX mmxext
PIXELS4_L2_SHIFT5 put
PIXELS4_L2_SHIFT5 avg
%macro PIXELS8_L2_SHIFT5 1
cglobal %1_pixels8_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
.loop:
mova m0, [r1]
mova m1, [r1+8]
mova m2, [r1+48]
mova m3, [r1+48+8]
psraw m0, 5
psraw m1, 5
psraw m2, 5
psraw m3, 5
packuswb m0, m1
packuswb m2, m3
pavgb m0, [r2]
pavgb m2, [r2+r4]
op_%1 m0, [r0], m4
op_%1 m2, [r0+r3], m5
lea r2, [r2+2*r4]
add r1, 48*2
lea r0, [r0+2*r3]
sub r5d, 2
jne .loop
REP_RET
%endmacro
INIT_MMX mmxext
PIXELS8_L2_SHIFT5 put
PIXELS8_L2_SHIFT5 avg
%if ARCH_X86_64
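; x86-64 only: with 16 XMM registers available, a full 16-pixel row is
; filtered per iteration (two interleaved copies of the 8-pixel SSSE3 path)
; and averaged with a row from src2.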
%macro QPEL16_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel16_h_lowpass_l2, 5, 6, 16 ; dst, src, src2, dstStride, src2Stride
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
mov r5d, 16
pxor m15, m15
mova m14, [pw_5]
mova m13, [pw_16]
.loop:
lddqu m1, [r1+6]
lddqu m7, [r1-2]
mova m0, m1
punpckhbw m1, m15
punpcklbw m0, m15
punpcklbw m7, m15
mova m2, m1
mova m6, m0
mova m3, m1
mova m8, m0
mova m4, m1
mova m9, m0
mova m12, m0
mova m11, m1
palignr m11, m0, 10
palignr m12, m7, 10
palignr m4, m0, 2
palignr m9, m7, 2
palignr m3, m0, 4
palignr m8, m7, 4
palignr m2, m0, 6
palignr m6, m7, 6
paddw m11, m0
palignr m1, m0, 8
palignr m0, m7, 8
paddw m7, m12
paddw m2, m3
paddw m6, m8
paddw m1, m4
paddw m0, m9
psllw m2, 2
psllw m6, 2
psubw m2, m1
psubw m6, m0
paddw m11, m13
paddw m7, m13
pmullw m2, m14
pmullw m6, m14
lddqu m3, [r2]
paddw m2, m11
paddw m6, m7
psraw m2, 5
psraw m6, 5
packuswb m6, m2
pavgb m6, m3
op_%1 m6, [r0], m11
add r1, r3
add r0, r3
add r2, r4
dec r5d
jg .loop
REP_RET
%endmacro
INIT_XMM ssse3
QPEL16_H_LOWPASS_L2_OP put
QPEL16_H_LOWPASS_L2_OP avg
%endif