Commit 9722a6a3, authored by Christophe Gisquet, committed by Michael Niedermayer

x86: hpeldsp: implement SSE2 put_pixels16_xy2

This is obviously equivalent to the avg version, without the avg.

3223(mmx) -> 2006(sse2)
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
parent f0aca50e
...@@ -551,11 +551,11 @@ AVG_APPROX_PIXELS8_XY2 ...@@ -551,11 +551,11 @@ AVG_APPROX_PIXELS8_XY2
; void ff_avg_pixels16_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) ; void ff_avg_pixels16_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro AVG_PIXELS_XY2 0 %macro SET_PIXELS_XY2 1
%if cpuflag(sse2) %if cpuflag(sse2)
cglobal avg_pixels16_xy2, 4,5,8 cglobal %1_pixels16_xy2, 4,5,8
%else %else
cglobal avg_pixels8_xy2, 4,5 cglobal %1_pixels8_xy2, 4,5
%endif %endif
pxor m7, m7 pxor m7, m7
mova m6, [pw_2] mova m6, [pw_2]
...@@ -588,9 +588,13 @@ cglobal avg_pixels8_xy2, 4,5 ...@@ -588,9 +588,13 @@ cglobal avg_pixels8_xy2, 4,5
paddusw m5, m1 paddusw m5, m1
psrlw m4, 2 psrlw m4, 2
psrlw m5, 2 psrlw m5, 2
%ifidn %1, avg
mova m3, [r0+r4] mova m3, [r0+r4]
packuswb m4, m5 packuswb m4, m5
PAVGB m4, m3 PAVGB m4, m3
%else
packuswb m4, m5
%endif
mova [r0+r4], m4 mova [r0+r4], m4
add r4, r2 add r4, r2
...@@ -610,9 +614,13 @@ cglobal avg_pixels8_xy2, 4,5 ...@@ -610,9 +614,13 @@ cglobal avg_pixels8_xy2, 4,5
paddusw m1, m5 paddusw m1, m5
psrlw m0, 2 psrlw m0, 2
psrlw m1, 2 psrlw m1, 2
%ifidn %1, avg
mova m3, [r0+r4] mova m3, [r0+r4]
packuswb m0, m1 packuswb m0, m1
PAVGB m0, m3 PAVGB m0, m3
%else
packuswb m0, m1
%endif
mova [r0+r4], m0 mova [r0+r4], m0
add r4, r2 add r4, r2
sub r3d, 2 sub r3d, 2
...@@ -621,8 +629,9 @@ cglobal avg_pixels8_xy2, 4,5 ...@@ -621,8 +629,9 @@ cglobal avg_pixels8_xy2, 4,5
%endmacro %endmacro
INIT_MMX mmxext INIT_MMX mmxext
AVG_PIXELS_XY2 SET_PIXELS_XY2 avg
INIT_MMX 3dnow INIT_MMX 3dnow
AVG_PIXELS_XY2 SET_PIXELS_XY2 avg
INIT_XMM sse2 INIT_XMM sse2
AVG_PIXELS_XY2 SET_PIXELS_XY2 put
SET_PIXELS_XY2 avg
...@@ -48,6 +48,8 @@ void ff_put_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels, ...@@ -48,6 +48,8 @@ void ff_put_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h); ptrdiff_t line_size, int h);
void ff_avg_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels, void ff_avg_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h); ptrdiff_t line_size, int h);
void ff_put_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels, void ff_avg_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h); ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels, void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
...@@ -296,6 +298,7 @@ static void hpeldsp_init_sse2(HpelDSPContext *c, int flags, int cpu_flags) ...@@ -296,6 +298,7 @@ static void hpeldsp_init_sse2(HpelDSPContext *c, int flags, int cpu_flags)
c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2; c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
c->put_pixels_tab[0][1] = ff_put_pixels16_x2_sse2; c->put_pixels_tab[0][1] = ff_put_pixels16_x2_sse2;
c->put_pixels_tab[0][2] = ff_put_pixels16_y2_sse2; c->put_pixels_tab[0][2] = ff_put_pixels16_y2_sse2;
c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_sse2;
c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2; c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_sse2; c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_sse2;
c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_sse2; c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_sse2;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment