Commit 561bfc85 authored by James Almer, committed by Michael Niedermayer

x86/dsputilenc: implement SSE2 versions of pix_{sum16, norm1}

Signed-off-by: James Almer <jamrial@gmail.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
parent d2282718
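
For reference, the two kernels have simple scalar semantics: pix_sum16 returns the sum of all pixels in a 16x16 block, and pix_norm1 the sum of their squares. A minimal C sketch of those semantics (mine, not part of the commit; FFmpeg's own C fallbacks compute the same values):

#include <stdint.h>

/* sum of a 16x16 block of 8-bit pixels; line_size is the row stride */
static int pix_sum16_ref(const uint8_t *pix, int line_size)
{
    int s = 0;
    for (int i = 0; i < 16; i++) {
        for (int j = 0; j < 16; j++)
            s += pix[j];
        pix += line_size;
    }
    return s;
}

/* sum of squared pixels of the same 16x16 block */
static int pix_norm1_ref(const uint8_t *pix, int line_size)
{
    int s = 0;
    for (int i = 0; i < 16; i++) {
        for (int j = 0; j < 16; j++)
            s += pix[j] * pix[j];
        pix += line_size;
    }
    return s;
}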
@@ -23,6 +23,10 @@
 
 %include "libavutil/x86/x86util.asm"
 
+SECTION_RODATA
+
+cextern pw_1
+
 SECTION .text
 
 %macro DIFF_PIXELS_1 4
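
The new RODATA import exists because the SSE2 paths below finish with the shared HADDW macro, which (on non-XOP CPUs) begins its horizontal sum by multiplying against pw_1 with pmaddwd, collapsing eight 16-bit lanes into four pairwise 32-bit sums. A rough intrinsics illustration of that first step (my sketch, not the macro itself):

#include <emmintrin.h>

/* what HADDW's widening step does: pmaddwd against packed words of 1
 * adds adjacent 16-bit lanes into 32-bit lanes */
static __m128i haddw_widen_step(__m128i v)
{
    return _mm_madd_epi16(v, _mm_set1_epi16(1)); /* the pw_1 constant */
}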
@@ -439,73 +443,92 @@ cglobal diff_pixels, 4, 5, 5
     jne .loop
     RET
 
-INIT_MMX mmx
 ; int ff_pix_sum16_mmx(uint8_t *pix, int line_size)
-cglobal pix_sum16, 2, 3
+; %1 = number of xmm registers used
+; %2 = number of loops
+%macro PIX_SUM16 2
+cglobal pix_sum16, 2, 3, %1
     movsxdifnidn r1, r1d
-    mov         r2, r1
-    neg         r2
-    shl         r2, 4
-    sub         r0, r2
-    pxor        m7, m7
-    pxor        m6, m6
+    mov         r2, %2
+    pxor        m5, m5
+    pxor        m4, m4
 .loop:
-    mova        m0, [r0+r2+0]
-    mova        m1, [r0+r2+0]
-    mova        m2, [r0+r2+8]
-    mova        m3, [r0+r2+8]
-    punpcklbw   m0, m7
-    punpckhbw   m1, m7
-    punpcklbw   m2, m7
-    punpckhbw   m3, m7
+    mova        m0, [r0]
+%if mmsize == 8
+    mova        m1, [r0+8]
+%else
+    mova        m1, [r0+r1]
+%endif
+    punpckhbw   m2, m0, m5
+    punpcklbw   m0, m5
+    punpckhbw   m3, m1, m5
+    punpcklbw   m1, m5
     paddw       m1, m0
     paddw       m3, m2
     paddw       m3, m1
-    paddw       m6, m3
-    add         r2, r1
-    js .loop
-    mova        m5, m6
-    psrlq       m6, 32
-    paddw       m6, m5
-    mova        m5, m6
-    psrlq       m6, 16
-    paddw       m6, m5
-    movd       eax, m6
+    paddw       m4, m3
+%if mmsize == 8
+    add         r0, r1
+%else
+    lea         r0, [r0+r1*2]
+%endif
+    dec         r2
+    jne .loop
+    HADDW       m4, m5
+    movd       eax, m4
     and        eax, 0xffff
     RET
+%endmacro
 
 INIT_MMX mmx
+PIX_SUM16 0, 16
+INIT_XMM sse2
+PIX_SUM16 6, 8
+
 ; int ff_pix_norm1_mmx(uint8_t *pix, int line_size)
-cglobal pix_norm1, 2, 4
+; %1 = number of xmm registers used
+; %2 = number of loops
+%macro PIX_NORM1 2
+cglobal pix_norm1, 2, 3, %1
     movsxdifnidn r1, r1d
-    mov         r2, 16
+    mov         r2, %2
     pxor        m0, m0
-    pxor        m7, m7
+    pxor        m5, m5
 .loop:
     mova        m2, [r0+0]
+%if mmsize == 8
     mova        m3, [r0+8]
-    mova        m1, m2
-    punpckhbw   m1, m0
+%else
+    mova        m3, [r0+r1]
+%endif
+    punpckhbw   m1, m2, m0
     punpcklbw   m2, m0
-    mova        m4, m3
-    punpckhbw   m3, m0
-    punpcklbw   m4, m0
+    punpckhbw   m4, m3, m0
+    punpcklbw   m3, m0
     pmaddwd     m1, m1
     pmaddwd     m2, m2
     pmaddwd     m3, m3
     pmaddwd     m4, m4
     paddd       m2, m1
     paddd       m4, m3
-    paddd       m7, m2
+    paddd       m5, m2
+    paddd       m5, m4
+%if mmsize == 8
     add         r0, r1
-    paddd       m7, m4
+%else
+    lea         r0, [r0+r1*2]
+%endif
     dec         r2
     jne .loop
-    mova        m1, m7
-    psrlq       m7, 32
-    paddd       m1, m7
-    movd        eax, m1
+    HADDD       m5, m1
+    movd        eax, m5
     RET
+%endmacro
+
+INIT_MMX mmx
+PIX_NORM1 0, 16
+INIT_XMM sse2
+PIX_NORM1 6, 8
 
 ;-----------------------------------------------
 ;int ff_sum_abs_dctelem(int16_t *block)
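
The restructuring is the same for both kernels: the MMX instantiation keeps 16 iterations over one row split into two 8-byte halves, while the SSE2 instantiation loads a full 16-byte row per mova, processes two rows per iteration, and therefore loops only 8 times; the hand-rolled shift/add reductions at the end are replaced by the shared HADDW/HADDD macros. An approximate intrinsics rendering of the SSE2 pix_sum16 loop (my sketch; the function name is hypothetical, and the asm additionally assumes 16-byte-aligned rows where this uses unaligned loads):

#include <emmintrin.h>
#include <stdint.h>

/* two 16-pixel rows per iteration, eight iterations for a 16x16 block;
 * 16-bit lane accumulation is safe: each lane sums at most 32 bytes */
static int pix_sum16_sse2_ref(const uint8_t *pix, int line_size)
{
    __m128i zero = _mm_setzero_si128();
    __m128i acc  = _mm_setzero_si128();
    for (int i = 0; i < 8; i++) {
        __m128i r0 = _mm_loadu_si128((const __m128i *)pix);               /* mova m0, [r0]    */
        __m128i r1 = _mm_loadu_si128((const __m128i *)(pix + line_size)); /* mova m1, [r0+r1] */
        acc = _mm_add_epi16(acc, _mm_unpacklo_epi8(r0, zero)); /* punpcklbw + paddw */
        acc = _mm_add_epi16(acc, _mm_unpackhi_epi8(r0, zero)); /* punpckhbw + paddw */
        acc = _mm_add_epi16(acc, _mm_unpacklo_epi8(r1, zero));
        acc = _mm_add_epi16(acc, _mm_unpackhi_epi8(r1, zero));
        pix += 2 * line_size;                                  /* lea r0, [r0+r1*2] */
    }
    /* HADDW m4, m5: widen via pmaddwd against pw_1, then sum the dwords */
    acc = _mm_madd_epi16(acc, _mm_set1_epi16(1));
    acc = _mm_add_epi32(acc, _mm_srli_si128(acc, 8));
    acc = _mm_add_epi32(acc, _mm_srli_si128(acc, 4));
    return _mm_cvtsi128_si32(acc) & 0xffff;                    /* and eax, 0xffff */
}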
@@ -38,7 +38,9 @@ void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
 void ff_diff_pixels_sse2(int16_t *block, const uint8_t *s1, const uint8_t *s2,
                          int stride);
 int ff_pix_sum16_mmx(uint8_t *pix, int line_size);
+int ff_pix_sum16_sse2(uint8_t *pix, int line_size);
 int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
+int ff_pix_norm1_sse2(uint8_t *pix, int line_size);
 int ff_sum_abs_dctelem_mmx(int16_t *block);
 int ff_sum_abs_dctelem_mmxext(int16_t *block);
 int ff_sum_abs_dctelem_sse2(int16_t *block);
@@ -906,6 +908,8 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
         c->sse[0] = ff_sse16_sse2;
         c->sum_abs_dctelem = ff_sum_abs_dctelem_sse2;
         c->diff_pixels = ff_diff_pixels_sse2;
+        c->pix_sum = ff_pix_sum16_sse2;
+        c->pix_norm1 = ff_pix_norm1_sse2;
 
 #if HAVE_ALIGNED_STACK
         c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
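
Callers never reach the new symbols directly; they go through the DSPContext function pointers wired up in this hunk. A hypothetical usage sketch (the include path and helper name are mine, not from the commit):

#include <stdint.h>
#include "dsputil.h"

/* after ff_dsputilenc_init_mmx() has run on an SSE2-capable CPU, both
 * calls below dispatch to the new SSE2 kernels through these pointers */
static void block_stats(DSPContext *c, uint8_t *pix, int line_size,
                        int *sum, int *norm)
{
    *sum  = c->pix_sum(pix, line_size);   /* sum of the 16x16 block */
    *norm = c->pix_norm1(pix, line_size); /* sum of squared pixels  */
}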
@@ -288,7 +288,12 @@
     paddd    %1, %2
 %endif
 %if notcpuflag(xop) || sizeof%1 != 16
+%if cpuflag(mmxext)
     PSHUFLW  %2, %1, q0032
+%else ; mmx
+    mova     %2, %1
+    psrlq    %2, 32
+%endif
     paddd    %1, %2
 %endif
 %undef %1
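
The x86util.asm change is what lets the plain-MMX instantiations above reuse HADDD: the PSHUFLW helper needs at least MMXEXT (it assembles to pshufw on MMX-sized registers), so on plain MMX the macro now falls back to a register copy plus a 64-bit right shift. What that fallback computes, as a scalar sketch (mine, for illustration):

#include <stdint.h>

/* the mmx fallback of HADDD on a 64-bit register holding two packed
 * 32-bit lanes {lo, hi}: copy, shift the copy down 32 bits, add lanes;
 * the low lane then holds lo + hi, which movd reads out */
static uint32_t haddd_mmx_ref(uint64_t m)
{
    uint64_t t = m >> 32;     /* mova %2, %1 ; psrlq %2, 32 */
    return (uint32_t)(m + t); /* paddd %1, %2 (low lane)    */
}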