Commit 33c752be authored by James Almer

x86/me_cmp: port mmxext vsad functions to yasm

Also add mmxext versions of vsad8 and vsad_intra8, and sse2 versions of
vsad16 and vsad_intra16.
Since vsad8 and vsad16 are not bitexact, they are accordingly marked as
approximate.
Reviewed-by: Michael Niedermayer <michaelni@gmx.at>
Signed-off-by: James Almer <jamrial@gmail.com>
parent 5c073bbb
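
For orientation before the diff: a minimal scalar sketch in C of what the ported kernels compute. This mirrors the behaviour of the asm added below; it is not FFmpeg's bitexact C reference in me_cmp.c, and the helper names (vsad_intra_ref, vsad_ref) are made up for illustration. vsad_intra sums the absolute vertical differences inside one block; vsad does the same on the residual pix1 - pix2.

#include <stdlib.h>
#include <stdint.h>

/* Hypothetical reference helper: sum of |p[x] - p[x + stride]|
 * over w columns and h-1 row pairs. */
static int vsad_intra_ref(const uint8_t *pix, int stride, int w, int h)
{
    int sum = 0;
    for (int y = 0; y < h - 1; y++) {
        for (int x = 0; x < w; x++)
            sum += abs(pix[x] - pix[x + stride]);
        pix += stride;
    }
    return sum;
}

/* Hypothetical reference helper: same vertical SAD, taken on the
 * residual pix1 - pix2 instead of the pixels themselves. */
static int vsad_ref(const uint8_t *pix1, const uint8_t *pix2,
                    int stride, int w, int h)
{
    int sum = 0;
    for (int y = 0; y < h - 1; y++) {
        for (int x = 0; x < w; x++)
            sum += abs((pix1[x] - pix2[x]) -
                       (pix1[x + stride] - pix2[x + stride]));
        pix1 += stride;
        pix2 += stride;
    }
    return sum;
}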
@@ -26,6 +26,7 @@
SECTION_RODATA
cextern pb_1
cextern pb_80
SECTION .text
@@ -772,3 +773,163 @@ SAD_APPROX_XY2 8
SAD_APPROX_XY2 16
INIT_XMM sse2
SAD_APPROX_XY2 16

;-------------------------------------------------------------------------------
;int ff_vsad_intra(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
;                  int line_size, int h);
;-------------------------------------------------------------------------------
; %1 = 8/16
%macro VSAD_INTRA 1
cglobal vsad_intra%1, 5, 5, 3, v, pix1, pix2, lsize, h
    mova      m0, [pix1q]
%if %1 == mmsize
    mova      m2, [pix1q+lsizeq]
    psadbw    m0, m2
%else
    mova      m2, [pix1q+lsizeq]
    mova      m3, [pix1q+8]
    mova      m4, [pix1q+lsizeq+8]
    psadbw    m0, m2
    psadbw    m3, m4
    paddw     m0, m3
%endif
    sub       hd, 2

.loop:
    lea    pix1q, [pix1q + 2*lsizeq]
%if %1 == mmsize
    mova      m1, [pix1q]
    psadbw    m2, m1
    paddw     m0, m2
    mova      m2, [pix1q+lsizeq]
    psadbw    m1, m2
    paddw     m0, m1
%else
    mova      m1, [pix1q]
    mova      m3, [pix1q+8]
    psadbw    m2, m1
    psadbw    m4, m3
    paddw     m0, m2
    paddw     m0, m4
    mova      m2, [pix1q+lsizeq]
    mova      m4, [pix1q+lsizeq+8]
    psadbw    m1, m2
    psadbw    m3, m4
    paddw     m0, m1
    paddw     m0, m3
%endif
    sub       hd, 2
    jg .loop
%if mmsize == 16
    pshufd    m1, m0, 0xe
    paddd     m0, m1
%endif
    movd     eax, m0
    RET
%endmacro

INIT_MMX mmxext
VSAD_INTRA 8
VSAD_INTRA 16
INIT_XMM sse2
VSAD_INTRA 16

;-------------------------------------------------------------------------------
;int ff_vsad_approx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
;                   int line_size, int h);
;-------------------------------------------------------------------------------
; %1 = 8/16
%macro VSAD_APPROX 1
cglobal vsad%1_approx, 5, 5, 5, v, pix1, pix2, lsize, h
    mova   m1, [pb_80]
    mova   m0, [pix1q]
%if %1 == mmsize ; vsad8_mmxext, vsad16_sse2
    mova   m4, [pix1q+lsizeq]
%if mmsize == 16
    movu   m3, [pix2q]
    movu   m2, [pix2q+lsizeq]
    psubb  m0, m3
    psubb  m4, m2
%else
    psubb  m0, [pix2q]
    psubb  m4, [pix2q+lsizeq]
%endif
    pxor   m0, m1
    pxor   m4, m1
    psadbw m0, m4
%else ; vsad16_mmxext
    mova   m3, [pix1q+8]
    psubb  m0, [pix2q]
    psubb  m3, [pix2q+8]
    pxor   m0, m1
    pxor   m3, m1
    mova   m4, [pix1q+lsizeq]
    mova   m5, [pix1q+lsizeq+8]
    psubb  m4, [pix2q+lsizeq]
    psubb  m5, [pix2q+lsizeq+8]
    pxor   m4, m1
    pxor   m5, m1
    psadbw m0, m4
    psadbw m3, m5
    paddw  m0, m3
%endif
    sub    hd, 2

.loop:
    lea pix1q, [pix1q + 2*lsizeq]
    lea pix2q, [pix2q + 2*lsizeq]
    mova   m2, [pix1q]
%if %1 == mmsize ; vsad8_mmxext, vsad16_sse2
%if mmsize == 16
    movu   m3, [pix2q]
    psubb  m2, m3
%else
    psubb  m2, [pix2q]
%endif
    pxor   m2, m1
    psadbw m4, m2
    paddw  m0, m4
    mova   m4, [pix1q+lsizeq]
    movu   m3, [pix2q+lsizeq]
    psubb  m4, m3
    pxor   m4, m1
    psadbw m2, m4
    paddw  m0, m2
%else ; vsad16_mmxext
    mova   m3, [pix1q+8]
    psubb  m2, [pix2q]
    psubb  m3, [pix2q+8]
    pxor   m2, m1
    pxor   m3, m1
    psadbw m4, m2
    psadbw m5, m3
    paddw  m0, m4
    paddw  m0, m5
    mova   m4, [pix1q+lsizeq]
    mova   m5, [pix1q+lsizeq+8]
    psubb  m4, [pix2q+lsizeq]
    psubb  m5, [pix2q+lsizeq+8]
    pxor   m4, m1
    pxor   m5, m1
    psadbw m2, m4
    psadbw m3, m5
    paddw  m0, m2
    paddw  m0, m3
%endif
    sub    hd, 2
    jg .loop
%if mmsize == 16
    pshufd m1, m0, 0xe
    paddd  m0, m1
%endif
    movd   eax, m0
    RET
%endmacro

INIT_MMX mmxext
VSAD_APPROX 8
VSAD_APPROX 16
INIT_XMM sse2
VSAD_APPROX 16
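
Why the new vsad8/vsad16 kernels are only approximate: psubb computes the per-byte residual modulo 256, and the pxor with pb_80 re-biases that wrapped value so psadbw can take the absolute difference of two residual rows. The per-byte model below is a hedged sketch of that step (the helper is hypothetical, not part of the patch); it is exact only while the true residual fits in [-128, 127], which is why these functions are only registered when CODEC_FLAG_BITEXACT is not set.

#include <stdlib.h>
#include <stdint.h>

/* Hypothetical per-lane model of the VSAD_APPROX inner step:
 * a1/b1 come from pix1 (rows y and y+1), a2/b2 from pix2. */
static inline int vsad_approx_lane(uint8_t a1, uint8_t a2,
                                   uint8_t b1, uint8_t b2)
{
    uint8_t d0 = (uint8_t)(a1 - a2) ^ 0x80;  /* psubb + pxor with pb_80 */
    uint8_t d1 = (uint8_t)(b1 - b2) ^ 0x80;
    return abs(d0 - d1);                     /* one lane of psadbw      */
}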
@@ -65,6 +65,18 @@ int ff_sad16_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                               int stride, int h);
int ff_sad16_approx_xy2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                             int stride, int h);
int ff_vsad_intra8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                          int line_size, int h);
int ff_vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                           int line_size, int h);
int ff_vsad_intra16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         int line_size, int h);
int ff_vsad8_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                           int line_size, int h);
int ff_vsad16_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                            int line_size, int h);
int ff_vsad16_approx_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                          int line_size, int h);

#define hadamard_func(cpu) \
int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, \
@@ -177,49 +189,6 @@ static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
}
#undef SUM

static int vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
                               int line_size, int h)
{
    int tmp;

    av_assert2((((int) pix) & 7) == 0);
    av_assert2((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1)      \
    "movq (%0), " #out0 "\n"           \
    "movq 8(%0), " #out1 "\n"          \
    "add %2, %0\n"                     \
    "psadbw " #out0 ", " #in0 "\n"     \
    "psadbw " #out1 ", " #in1 "\n"     \
    "paddw " #in1 ", " #in0 "\n"       \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %3, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pxor %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq 8(%0), %%mm1\n"
        "add %2, %0\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)

        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movd %%mm6, %1\n"
        : "+r" (pix), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp;
}
#undef SUM

static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                      int line_size, int h)
{
@@ -301,68 +270,6 @@ static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
}
#undef SUM

static int vsad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         int line_size, int h)
{
    int tmp;

    av_assert2((((int) pix1) & 7) == 0);
    av_assert2((((int) pix2) & 7) == 0);
    av_assert2((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1)      \
    "movq (%0), " #out0 "\n"           \
    "movq (%1), %%mm2\n"               \
    "movq 8(%0), " #out1 "\n"          \
    "movq 8(%1), %%mm3\n"              \
    "add %3, %0\n"                     \
    "add %3, %1\n"                     \
    "psubb %%mm2, " #out0 "\n"         \
    "psubb %%mm3, " #out1 "\n"         \
    "pxor %%mm7, " #out0 "\n"          \
    "pxor %%mm7, " #out1 "\n"          \
    "psadbw " #out0 ", " #in0 "\n"     \
    "psadbw " #out1 ", " #in1 "\n"     \
    "paddw " #in1 ", " #in0 "\n"       \
    "paddw " #in0 ", %%mm6\n "

    __asm__ volatile (
        "movl %4, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pcmpeqw %%mm7, %%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq (%1), %%mm2\n"
        "movq 8(%0), %%mm1\n"
        "movq 8(%1), %%mm3\n"
        "add %3, %0\n"
        "add %3, %1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)

        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movd %%mm6, %2\n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp;
}
#undef SUM

DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
    0x0000000000000000ULL,
    0x0001000100010001ULL,
@@ -667,14 +574,6 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
        }
    }

    if (INLINE_MMXEXT(cpu_flags)) {
        c->vsad[4] = vsad_intra16_mmxext;

        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->vsad[0] = vsad16_mmxext;
        }
    }

#endif /* HAVE_INLINE_ASM */

    if (EXTERNAL_MMX(cpu_flags)) {
@@ -704,9 +603,15 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
        c->pix_abs[1][1] = ff_sad8_x2_mmxext;
        c->pix_abs[1][2] = ff_sad8_y2_mmxext;

        c->vsad[4] = ff_vsad_intra16_mmxext;
        c->vsad[5] = ff_vsad_intra8_mmxext;

        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->pix_abs[0][3] = ff_sad16_approx_xy2_mmxext;
            c->pix_abs[1][3] = ff_sad8_approx_xy2_mmxext;

            c->vsad[0] = ff_vsad16_approx_mmxext;
            c->vsad[1] = ff_vsad8_approx_mmxext;
        }
    }
@@ -724,8 +629,10 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
        c->pix_abs[0][1] = ff_sad16_x2_sse2;
        c->pix_abs[0][2] = ff_sad16_y2_sse2;
        c->vsad[4]       = ff_vsad_intra16_sse2;

        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->pix_abs[0][3] = ff_sad16_approx_xy2_sse2;
            c->vsad[0]       = ff_vsad16_approx_sse2;
        }
    }
}