Commit 05de4d30, authored by James Almer and committed by Michael Niedermayer

x86/dsputilenc: implement XOP version of pix_sum16

SSE2: 137 cycles
XOP:   87 cycles
Signed-off-by: James Almer <jamrial@gmail.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
parent 232959f1
...@@ -446,13 +446,24 @@ cglobal diff_pixels, 4, 5, 5 ...@@ -446,13 +446,24 @@ cglobal diff_pixels, 4, 5, 5
; int ff_pix_sum16_mmx(uint8_t *pix, int line_size) ; int ff_pix_sum16_mmx(uint8_t *pix, int line_size)
; %1 = number of xmm registers used ; %1 = number of xmm registers used
; %2 = number of loops ; %2 = number of loops
%macro PIX_SUM16 2 ; %3 = number of GPRs used
cglobal pix_sum16, 2, 3, %1 %macro PIX_SUM16 4
cglobal pix_sum16, 2, %3, %1
movsxdifnidn r1, r1d movsxdifnidn r1, r1d
mov r2, %2 mov r2, %2
%if cpuflag(xop)
lea r3, [r1*3]
%else
pxor m5, m5 pxor m5, m5
%endif
pxor m4, m4 pxor m4, m4
.loop: .loop:
%if cpuflag(xop)
vphaddubq m0, [r0]
vphaddubq m1, [r0+r1]
vphaddubq m2, [r0+r1*2]
vphaddubq m3, [r0+r3]
%else
mova m0, [r0] mova m0, [r0]
%if mmsize == 8 %if mmsize == 8
mova m1, [r0+8] mova m1, [r0+8]
...@@ -463,6 +474,7 @@ cglobal pix_sum16, 2, 3, %1 ...@@ -463,6 +474,7 @@ cglobal pix_sum16, 2, 3, %1
punpcklbw m0, m5 punpcklbw m0, m5
punpckhbw m3, m1, m5 punpckhbw m3, m1, m5
punpcklbw m1, m5 punpcklbw m1, m5
%endif ; cpuflag(xop)
paddw m1, m0 paddw m1, m0
paddw m3, m2 paddw m3, m2
paddw m3, m1 paddw m3, m1
...@@ -470,19 +482,26 @@ cglobal pix_sum16, 2, 3, %1 ...@@ -470,19 +482,26 @@ cglobal pix_sum16, 2, 3, %1
%if mmsize == 8 %if mmsize == 8
add r0, r1 add r0, r1
%else %else
lea r0, [r0+r1*2] lea r0, [r0+r1*%4]
%endif %endif
dec r2 dec r2
jne .loop jne .loop
%if cpuflag(xop)
pshufd m0, m4, q0032
paddd m4, m0
%else
HADDW m4, m5 HADDW m4, m5
%endif
movd eax, m4 movd eax, m4
RET RET
%endmacro %endmacro
INIT_MMX mmx INIT_MMX mmx
PIX_SUM16 0, 16 PIX_SUM16 0, 16, 3, 0
INIT_XMM sse2 INIT_XMM sse2
PIX_SUM16 6, 8 PIX_SUM16 6, 8, 3, 2
INIT_XMM xop
PIX_SUM16 5, 4, 4, 4
; int ff_pix_norm1_mmx(uint8_t *pix, int line_size) ; int ff_pix_norm1_mmx(uint8_t *pix, int line_size)
; %1 = number of xmm registers used ; %1 = number of xmm registers used
......
...@@ -39,6 +39,7 @@ void ff_diff_pixels_sse2(int16_t *block, const uint8_t *s1, const uint8_t *s2, ...@@ -39,6 +39,7 @@ void ff_diff_pixels_sse2(int16_t *block, const uint8_t *s1, const uint8_t *s2,
int stride); int stride);
int ff_pix_sum16_mmx(uint8_t *pix, int line_size); int ff_pix_sum16_mmx(uint8_t *pix, int line_size);
int ff_pix_sum16_sse2(uint8_t *pix, int line_size); int ff_pix_sum16_sse2(uint8_t *pix, int line_size);
int ff_pix_sum16_xop(uint8_t *pix, int line_size);
int ff_pix_norm1_mmx(uint8_t *pix, int line_size); int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
int ff_pix_norm1_sse2(uint8_t *pix, int line_size); int ff_pix_norm1_sse2(uint8_t *pix, int line_size);
int ff_sum_abs_dctelem_mmx(int16_t *block); int ff_sum_abs_dctelem_mmx(int16_t *block);
...@@ -925,5 +926,9 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx, ...@@ -925,5 +926,9 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
#endif #endif
} }
if (EXTERNAL_XOP(cpu_flags)) {
c->pix_sum = ff_pix_sum16_xop;
}
ff_dsputil_init_pix_mmx(c, avctx); ff_dsputil_init_pix_mmx(c, avctx);
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment