Commit e25be471 authored by Ronald S. Bultje

vp8: convert idct/mc x86 assembly to use cpuflags().

parent 8249a23f
...@@ -29,16 +29,16 @@ ...@@ -29,16 +29,16 @@
/* /*
* MC functions * MC functions
*/ */
extern void ff_put_vp8_epel4_h4_mmxext(uint8_t *dst, ptrdiff_t dststride, extern void ff_put_vp8_epel4_h4_mmx2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride, uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my); int height, int mx, int my);
extern void ff_put_vp8_epel4_h6_mmxext(uint8_t *dst, ptrdiff_t dststride, extern void ff_put_vp8_epel4_h6_mmx2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride, uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my); int height, int mx, int my);
extern void ff_put_vp8_epel4_v4_mmxext(uint8_t *dst, ptrdiff_t dststride, extern void ff_put_vp8_epel4_v4_mmx2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride, uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my); int height, int mx, int my);
extern void ff_put_vp8_epel4_v6_mmxext(uint8_t *dst, ptrdiff_t dststride, extern void ff_put_vp8_epel4_v6_mmx2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride, uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my); int height, int mx, int my);
...@@ -80,7 +80,7 @@ extern void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride, ...@@ -80,7 +80,7 @@ extern void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride, uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my); int height, int mx, int my);
extern void ff_put_vp8_bilinear4_h_mmxext(uint8_t *dst, ptrdiff_t dststride, extern void ff_put_vp8_bilinear4_h_mmx2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride, uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my); int height, int mx, int my);
extern void ff_put_vp8_bilinear8_h_sse2 (uint8_t *dst, ptrdiff_t dststride, extern void ff_put_vp8_bilinear8_h_sse2 (uint8_t *dst, ptrdiff_t dststride,
...@@ -93,7 +93,7 @@ extern void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, ptrdiff_t dststride, ...@@ -93,7 +93,7 @@ extern void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride, uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my); int height, int mx, int my);
extern void ff_put_vp8_bilinear4_v_mmxext(uint8_t *dst, ptrdiff_t dststride, extern void ff_put_vp8_bilinear4_v_mmx2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride, uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my); int height, int mx, int my);
extern void ff_put_vp8_bilinear8_v_sse2 (uint8_t *dst, ptrdiff_t dststride, extern void ff_put_vp8_bilinear8_v_sse2 (uint8_t *dst, ptrdiff_t dststride,
...@@ -139,16 +139,16 @@ static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \ ...@@ -139,16 +139,16 @@ static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
} }
#if ARCH_X86_32 #if ARCH_X86_32
TAP_W8 (mmxext, epel, h4) TAP_W8 (mmx2, epel, h4)
TAP_W8 (mmxext, epel, h6) TAP_W8 (mmx2, epel, h6)
TAP_W16(mmxext, epel, h6) TAP_W16(mmx2, epel, h6)
TAP_W8 (mmxext, epel, v4) TAP_W8 (mmx2, epel, v4)
TAP_W8 (mmxext, epel, v6) TAP_W8 (mmx2, epel, v6)
TAP_W16(mmxext, epel, v6) TAP_W16(mmx2, epel, v6)
TAP_W8 (mmxext, bilinear, h) TAP_W8 (mmx2, bilinear, h)
TAP_W16(mmxext, bilinear, h) TAP_W16(mmx2, bilinear, h)
TAP_W8 (mmxext, bilinear, v) TAP_W8 (mmx2, bilinear, v)
TAP_W16(mmxext, bilinear, v) TAP_W16(mmx2, bilinear, v)
#endif #endif
TAP_W16(sse2, epel, h6) TAP_W16(sse2, epel, h6)
...@@ -177,13 +177,13 @@ static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT ...@@ -177,13 +177,13 @@ static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT
#if ARCH_X86_32 #if ARCH_X86_32
#define HVTAPMMX(x, y) \ #define HVTAPMMX(x, y) \
HVTAP(mmxext, 8, x, y, 4, 8) \ HVTAP(mmx2, 8, x, y, 4, 8) \
HVTAP(mmxext, 8, x, y, 8, 16) HVTAP(mmx2, 8, x, y, 8, 16)
HVTAP(mmxext, 8, 6, 6, 16, 16) HVTAP(mmx2, 8, 6, 6, 16, 16)
#else #else
#define HVTAPMMX(x, y) \ #define HVTAPMMX(x, y) \
HVTAP(mmxext, 8, x, y, 4, 8) HVTAP(mmx2, 8, x, y, 4, 8)
#endif #endif
HVTAPMMX(4, 4) HVTAPMMX(4, 4)
...@@ -218,10 +218,10 @@ static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \ ...@@ -218,10 +218,10 @@ static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
dst, dststride, tmp, SIZE, height, mx, my); \ dst, dststride, tmp, SIZE, height, mx, my); \
} }
HVBILIN(mmxext, 8, 4, 8) HVBILIN(mmx2, 8, 4, 8)
#if ARCH_X86_32 #if ARCH_X86_32
HVBILIN(mmxext, 8, 8, 16) HVBILIN(mmx2, 8, 8, 16)
HVBILIN(mmxext, 8, 16, 16) HVBILIN(mmx2, 8, 16, 16)
#endif #endif
HVBILIN(sse2, 8, 8, 16) HVBILIN(sse2, 8, 8, 16)
HVBILIN(sse2, 8, 16, 16) HVBILIN(sse2, 8, 16, 16)
...@@ -283,7 +283,7 @@ extern void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \ ...@@ -283,7 +283,7 @@ extern void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \
int e, int i, int hvt); int e, int i, int hvt);
DECLARE_LOOP_FILTER(mmx) DECLARE_LOOP_FILTER(mmx)
DECLARE_LOOP_FILTER(mmxext) DECLARE_LOOP_FILTER(mmx2)
DECLARE_LOOP_FILTER(sse2) DECLARE_LOOP_FILTER(sse2)
DECLARE_LOOP_FILTER(ssse3) DECLARE_LOOP_FILTER(ssse3)
DECLARE_LOOP_FILTER(sse4) DECLARE_LOOP_FILTER(sse4)
...@@ -351,26 +351,26 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c) ...@@ -351,26 +351,26 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
/* note that 4-tap width=16 functions are missing because w=16 /* note that 4-tap width=16 functions are missing because w=16
* is only used for luma, and luma is always a copy or sixtap. */ * is only used for luma, and luma is always a copy or sixtap. */
if (mm_flags & AV_CPU_FLAG_MMX2) { if (mm_flags & AV_CPU_FLAG_MMX2) {
VP8_MC_FUNC(2, 4, mmxext); VP8_MC_FUNC(2, 4, mmx2);
VP8_BILINEAR_MC_FUNC(2, 4, mmxext); VP8_BILINEAR_MC_FUNC(2, 4, mmx2);
#if ARCH_X86_32 #if ARCH_X86_32
VP8_LUMA_MC_FUNC(0, 16, mmxext); VP8_LUMA_MC_FUNC(0, 16, mmx2);
VP8_MC_FUNC(1, 8, mmxext); VP8_MC_FUNC(1, 8, mmx2);
VP8_BILINEAR_MC_FUNC(0, 16, mmxext); VP8_BILINEAR_MC_FUNC(0, 16, mmx2);
VP8_BILINEAR_MC_FUNC(1, 8, mmxext); VP8_BILINEAR_MC_FUNC(1, 8, mmx2);
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmxext; c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx2;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmxext; c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx2;
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmxext; c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmx2;
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmxext; c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmx2;
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmxext; c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmx2;
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmxext; c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmx2;
c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmxext; c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmx2;
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmxext; c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmx2;
c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmxext; c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmx2;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmxext; c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmx2;
#endif #endif
} }
......
...@@ -173,8 +173,8 @@ SECTION .text ...@@ -173,8 +173,8 @@ SECTION .text
; int height, int mx, int my); ; int height, int mx, int my);
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%macro FILTER_SSSE3 3 %macro FILTER_SSSE3 1
cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2 cglobal put_vp8_epel%1_h6, 6, 6, 8
lea r5d, [r5*3] lea r5d, [r5*3]
mova m3, [filter_h6_shuf2] mova m3, [filter_h6_shuf2]
mova m4, [filter_h6_shuf3] mova m4, [filter_h6_shuf3]
...@@ -189,7 +189,7 @@ cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2 ...@@ -189,7 +189,7 @@ cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2
movu m0, [r2-2] movu m0, [r2-2]
mova m1, m0 mova m1, m0
mova m2, m0 mova m2, m0
%ifidn %1, 4 %if mmsize == 8
; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the ; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
; shuffle with a memory operand ; shuffle with a memory operand
punpcklbw m0, [r2+3] punpcklbw m0, [r2+3]
...@@ -215,7 +215,7 @@ cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2 ...@@ -215,7 +215,7 @@ cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2
jg .nextrow jg .nextrow
REP_RET REP_RET
cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3 cglobal put_vp8_epel%1_h4, 6, 6, 7
shl r5d, 4 shl r5d, 4
mova m2, [pw_64] mova m2, [pw_64]
mova m3, [filter_h2_shuf] mova m3, [filter_h2_shuf]
...@@ -246,7 +246,7 @@ cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3 ...@@ -246,7 +246,7 @@ cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3
jg .nextrow jg .nextrow
REP_RET REP_RET
cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2 cglobal put_vp8_epel%1_v4, 7, 7, 8
shl r6d, 4 shl r6d, 4
%ifdef PIC %ifdef PIC
lea r11, [fourtap_filter_hb_m] lea r11, [fourtap_filter_hb_m]
...@@ -285,7 +285,7 @@ cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2 ...@@ -285,7 +285,7 @@ cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2
jg .nextrow jg .nextrow
REP_RET REP_RET
cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2 cglobal put_vp8_epel%1_v6, 7, 7, 8
lea r6d, [r6*3] lea r6d, [r6*3]
%ifdef PIC %ifdef PIC
lea r11, [sixtap_filter_hb_m] lea r11, [sixtap_filter_hb_m]
...@@ -333,13 +333,14 @@ cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2 ...@@ -333,13 +333,14 @@ cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2
REP_RET REP_RET
%endmacro %endmacro
INIT_MMX INIT_MMX ssse3
FILTER_SSSE3 4, 0, 0 FILTER_SSSE3 4
INIT_XMM INIT_XMM ssse3
FILTER_SSSE3 8, 8, 7 FILTER_SSSE3 8
; 4x4 block, H-only 4-tap filter ; 4x4 block, H-only 4-tap filter
cglobal put_vp8_epel4_h4_mmxext, 6, 6 INIT_MMX mmx2
cglobal put_vp8_epel4_h4, 6, 6
shl r5d, 4 shl r5d, 4
%ifdef PIC %ifdef PIC
lea r11, [fourtap_filter_hw_m] lea r11, [fourtap_filter_hw_m]
...@@ -386,7 +387,8 @@ cglobal put_vp8_epel4_h4_mmxext, 6, 6 ...@@ -386,7 +387,8 @@ cglobal put_vp8_epel4_h4_mmxext, 6, 6
REP_RET REP_RET
; 4x4 block, H-only 6-tap filter ; 4x4 block, H-only 6-tap filter
cglobal put_vp8_epel4_h6_mmxext, 6, 6 INIT_MMX mmx2
cglobal put_vp8_epel4_h6, 6, 6
lea r5d, [r5*3] lea r5d, [r5*3]
%ifdef PIC %ifdef PIC
lea r11, [sixtap_filter_hw_m] lea r11, [sixtap_filter_hw_m]
...@@ -442,8 +444,8 @@ cglobal put_vp8_epel4_h6_mmxext, 6, 6 ...@@ -442,8 +444,8 @@ cglobal put_vp8_epel4_h6_mmxext, 6, 6
jg .nextrow jg .nextrow
REP_RET REP_RET
INIT_XMM INIT_XMM sse2
cglobal put_vp8_epel8_h4_sse2, 6, 6, 10 cglobal put_vp8_epel8_h4, 6, 6, 10
shl r5d, 5 shl r5d, 5
%ifdef PIC %ifdef PIC
lea r11, [fourtap_filter_v_m] lea r11, [fourtap_filter_v_m]
...@@ -490,7 +492,8 @@ cglobal put_vp8_epel8_h4_sse2, 6, 6, 10 ...@@ -490,7 +492,8 @@ cglobal put_vp8_epel8_h4_sse2, 6, 6, 10
jg .nextrow jg .nextrow
REP_RET REP_RET
cglobal put_vp8_epel8_h6_sse2, 6, 6, 14 INIT_XMM sse2
cglobal put_vp8_epel8_h6, 6, 6, 14
lea r5d, [r5*3] lea r5d, [r5*3]
shl r5d, 4 shl r5d, 4
%ifdef PIC %ifdef PIC
...@@ -552,9 +555,9 @@ cglobal put_vp8_epel8_h6_sse2, 6, 6, 14 ...@@ -552,9 +555,9 @@ cglobal put_vp8_epel8_h6_sse2, 6, 6, 14
jg .nextrow jg .nextrow
REP_RET REP_RET
%macro FILTER_V 3 %macro FILTER_V 1
; 4x4 block, V-only 4-tap filter ; 4x4 block, V-only 4-tap filter
cglobal put_vp8_epel%2_v4_%1, 7, 7, %3 cglobal put_vp8_epel%1_v4, 7, 7, 8
shl r6d, 5 shl r6d, 5
%ifdef PIC %ifdef PIC
lea r11, [fourtap_filter_v_m] lea r11, [fourtap_filter_v_m]
...@@ -607,7 +610,7 @@ cglobal put_vp8_epel%2_v4_%1, 7, 7, %3 ...@@ -607,7 +610,7 @@ cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
; 4x4 block, V-only 6-tap filter ; 4x4 block, V-only 6-tap filter
cglobal put_vp8_epel%2_v6_%1, 7, 7, %3 cglobal put_vp8_epel%1_v6, 7, 7, 8
shl r6d, 4 shl r6d, 4
lea r6, [r6*3] lea r6, [r6*3]
%ifdef PIC %ifdef PIC
...@@ -671,13 +674,13 @@ cglobal put_vp8_epel%2_v6_%1, 7, 7, %3 ...@@ -671,13 +674,13 @@ cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
REP_RET REP_RET
%endmacro %endmacro
INIT_MMX INIT_MMX mmx2
FILTER_V mmxext, 4, 0 FILTER_V 4
INIT_XMM INIT_XMM sse2
FILTER_V sse2, 8, 8 FILTER_V 8
%macro FILTER_BILINEAR 3 %macro FILTER_BILINEAR 1
cglobal put_vp8_bilinear%2_v_%1, 7,7,%3 cglobal put_vp8_bilinear%1_v, 7, 7, 7
mov r5d, 8*16 mov r5d, 8*16
shl r6d, 4 shl r6d, 4
sub r5d, r6d sub r5d, r6d
...@@ -705,7 +708,7 @@ cglobal put_vp8_bilinear%2_v_%1, 7,7,%3 ...@@ -705,7 +708,7 @@ cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
psraw m2, 2 psraw m2, 2
pavgw m0, m6 pavgw m0, m6
pavgw m2, m6 pavgw m2, m6
%ifidn %1, mmxext %if mmsize == 8
packuswb m0, m0 packuswb m0, m0
packuswb m2, m2 packuswb m2, m2
movh [r0+r1*0], m0 movh [r0+r1*0], m0
...@@ -722,7 +725,7 @@ cglobal put_vp8_bilinear%2_v_%1, 7,7,%3 ...@@ -722,7 +725,7 @@ cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
jg .nextrow jg .nextrow
REP_RET REP_RET
cglobal put_vp8_bilinear%2_h_%1, 7,7,%3 cglobal put_vp8_bilinear%1_h, 7, 7, 7
mov r6d, 8*16 mov r6d, 8*16
shl r5d, 4 shl r5d, 4
sub r6d, r5d sub r6d, r5d
...@@ -751,7 +754,7 @@ cglobal put_vp8_bilinear%2_h_%1, 7,7,%3 ...@@ -751,7 +754,7 @@ cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
psraw m2, 2 psraw m2, 2
pavgw m0, m6 pavgw m0, m6
pavgw m2, m6 pavgw m2, m6
%ifidn %1, mmxext %if mmsize == 8
packuswb m0, m0 packuswb m0, m0
packuswb m2, m2 packuswb m2, m2
movh [r0+r1*0], m0 movh [r0+r1*0], m0
...@@ -769,13 +772,13 @@ cglobal put_vp8_bilinear%2_h_%1, 7,7,%3 ...@@ -769,13 +772,13 @@ cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
REP_RET REP_RET
%endmacro %endmacro
INIT_MMX INIT_MMX mmx2
FILTER_BILINEAR mmxext, 4, 0 FILTER_BILINEAR 4
INIT_XMM INIT_XMM sse2
FILTER_BILINEAR sse2, 8, 7 FILTER_BILINEAR 8
%macro FILTER_BILINEAR_SSSE3 1 %macro FILTER_BILINEAR_SSSE3 1
cglobal put_vp8_bilinear%1_v_ssse3, 7,7 cglobal put_vp8_bilinear%1_v, 7, 7, 5
shl r6d, 4 shl r6d, 4
%ifdef PIC %ifdef PIC
lea r11, [bilinear_filter_vb_m] lea r11, [bilinear_filter_vb_m]
...@@ -811,7 +814,7 @@ cglobal put_vp8_bilinear%1_v_ssse3, 7,7 ...@@ -811,7 +814,7 @@ cglobal put_vp8_bilinear%1_v_ssse3, 7,7
jg .nextrow jg .nextrow
REP_RET REP_RET
cglobal put_vp8_bilinear%1_h_ssse3, 7,7 cglobal put_vp8_bilinear%1_h, 7, 7, 5
shl r5d, 4 shl r5d, 4
%ifdef PIC %ifdef PIC
lea r11, [bilinear_filter_vb_m] lea r11, [bilinear_filter_vb_m]
...@@ -848,12 +851,13 @@ cglobal put_vp8_bilinear%1_h_ssse3, 7,7 ...@@ -848,12 +851,13 @@ cglobal put_vp8_bilinear%1_h_ssse3, 7,7
REP_RET REP_RET
%endmacro %endmacro
INIT_MMX INIT_MMX ssse3
FILTER_BILINEAR_SSSE3 4 FILTER_BILINEAR_SSSE3 4
INIT_XMM INIT_XMM ssse3
FILTER_BILINEAR_SSSE3 8 FILTER_BILINEAR_SSSE3 8
cglobal put_vp8_pixels8_mmx, 5,5 INIT_MMX mmx
cglobal put_vp8_pixels8, 5,5
.nextrow: .nextrow:
movq mm0, [r2+r3*0] movq mm0, [r2+r3*0]
movq mm1, [r2+r3*1] movq mm1, [r2+r3*1]
...@@ -866,7 +870,8 @@ cglobal put_vp8_pixels8_mmx, 5,5 ...@@ -866,7 +870,8 @@ cglobal put_vp8_pixels8_mmx, 5,5
REP_RET REP_RET
%if ARCH_X86_32 %if ARCH_X86_32
cglobal put_vp8_pixels16_mmx, 5,5 INIT_MMX mmx
cglobal put_vp8_pixels16, 5,5
.nextrow: .nextrow:
movq mm0, [r2+r3*0+0] movq mm0, [r2+r3*0+0]
movq mm1, [r2+r3*0+8] movq mm1, [r2+r3*0+8]
...@@ -883,7 +888,8 @@ cglobal put_vp8_pixels16_mmx, 5,5 ...@@ -883,7 +888,8 @@ cglobal put_vp8_pixels16_mmx, 5,5
REP_RET REP_RET
%endif %endif
cglobal put_vp8_pixels16_sse, 5,5,2 INIT_XMM sse
cglobal put_vp8_pixels16, 5,5,2
.nextrow: .nextrow:
movups xmm0, [r2+r3*0] movups xmm0, [r2+r3*0]
movups xmm1, [r2+r3*1] movups xmm1, [r2+r3*1]
...@@ -918,8 +924,8 @@ cglobal put_vp8_pixels16_sse, 5,5,2 ...@@ -918,8 +924,8 @@ cglobal put_vp8_pixels16_sse, 5,5,2
%4 [r1+r2+%3], m5 %4 [r1+r2+%3], m5
%endmacro %endmacro
INIT_MMX INIT_MMX mmx
cglobal vp8_idct_dc_add_mmx, 3, 3 cglobal vp8_idct_dc_add, 3, 3
; load data ; load data
movd m0, [r1] movd m0, [r1]
...@@ -941,8 +947,8 @@ cglobal vp8_idct_dc_add_mmx, 3, 3 ...@@ -941,8 +947,8 @@ cglobal vp8_idct_dc_add_mmx, 3, 3
ADD_DC m0, m1, 0, movh ADD_DC m0, m1, 0, movh
RET RET
INIT_XMM INIT_XMM sse4
cglobal vp8_idct_dc_add_sse4, 3, 3, 6 cglobal vp8_idct_dc_add, 3, 3, 6
; load data ; load data
movd m0, [r1] movd m0, [r1]
pxor m1, m1 pxor m1, m1
...@@ -976,8 +982,8 @@ cglobal vp8_idct_dc_add_sse4, 3, 3, 6 ...@@ -976,8 +982,8 @@ cglobal vp8_idct_dc_add_sse4, 3, 3, 6
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%if ARCH_X86_32 %if ARCH_X86_32
INIT_MMX INIT_MMX mmx
cglobal vp8_idct_dc_add4y_mmx, 3, 3 cglobal vp8_idct_dc_add4y, 3, 3
; load data ; load data
movd m0, [r1+32*0] ; A movd m0, [r1+32*0] ; A
movd m1, [r1+32*2] ; C movd m1, [r1+32*2] ; C
...@@ -1012,8 +1018,8 @@ cglobal vp8_idct_dc_add4y_mmx, 3, 3 ...@@ -1012,8 +1018,8 @@ cglobal vp8_idct_dc_add4y_mmx, 3, 3
RET RET
%endif %endif
INIT_XMM INIT_XMM sse2
cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6 cglobal vp8_idct_dc_add4y, 3, 3, 6
; load data ; load data
movd m0, [r1+32*0] ; A movd m0, [r1+32*0] ; A
movd m1, [r1+32*2] ; C movd m1, [r1+32*2] ; C
...@@ -1046,8 +1052,8 @@ cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6 ...@@ -1046,8 +1052,8 @@ cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6
; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride); ; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
INIT_MMX INIT_MMX mmx
cglobal vp8_idct_dc_add4uv_mmx, 3, 3 cglobal vp8_idct_dc_add4uv, 3, 3
; load data ; load data
movd m0, [r1+32*0] ; A movd m0, [r1+32*0] ; A
movd m1, [r1+32*2] ; C movd m1, [r1+32*2] ; C
...@@ -1118,9 +1124,8 @@ cglobal vp8_idct_dc_add4uv_mmx, 3, 3 ...@@ -1118,9 +1124,8 @@ cglobal vp8_idct_dc_add4uv_mmx, 3, 3
SWAP %4, %3 SWAP %4, %3
%endmacro %endmacro
INIT_MMX %macro VP8_IDCT_ADD 0
%macro VP8_IDCT_ADD 1 cglobal vp8_idct_add, 3, 3
cglobal vp8_idct_add_%1, 3, 3
; load block data ; load block data
movq m0, [r1+ 0] movq m0, [r1+ 0]
movq m1, [r1+ 8] movq m1, [r1+ 8]
...@@ -1128,7 +1133,7 @@ cglobal vp8_idct_add_%1, 3, 3 ...@@ -1128,7 +1133,7 @@ cglobal vp8_idct_add_%1, 3, 3
movq m3, [r1+24] movq m3, [r1+24]
movq m6, [pw_20091] movq m6, [pw_20091]
movq m7, [pw_17734] movq m7, [pw_17734]
%ifidn %1, sse %if cpuflag(sse)
xorps xmm0, xmm0 xorps xmm0, xmm0
movaps [r1+ 0], xmm0 movaps [r1+ 0], xmm0
movaps [r1+16], xmm0 movaps [r1+16], xmm0
...@@ -1157,9 +1162,11 @@ cglobal vp8_idct_add_%1, 3, 3 ...@@ -1157,9 +1162,11 @@ cglobal vp8_idct_add_%1, 3, 3
%endmacro %endmacro
%if ARCH_X86_32 %if ARCH_X86_32
VP8_IDCT_ADD mmx INIT_MMX mmx
VP8_IDCT_ADD
%endif %endif
VP8_IDCT_ADD sse INIT_MMX sse
VP8_IDCT_ADD
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16]) ; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16])
...@@ -1192,13 +1199,13 @@ VP8_IDCT_ADD sse ...@@ -1192,13 +1199,13 @@ VP8_IDCT_ADD sse
SWAP %1, %4, %3 SWAP %1, %4, %3
%endmacro %endmacro
%macro VP8_DC_WHT 1 %macro VP8_DC_WHT 0
cglobal vp8_luma_dc_wht_%1, 2,3 cglobal vp8_luma_dc_wht, 2, 3
movq m0, [r1] movq m0, [r1]
movq m1, [r1+8] movq m1, [r1+8]
movq m2, [r1+16] movq m2, [r1+16]
movq m3, [r1+24] movq m3, [r1+24]
%ifidn %1, sse %if cpuflag(sse)
xorps xmm0, xmm0 xorps xmm0, xmm0
movaps [r1+ 0], xmm0 movaps [r1+ 0], xmm0
movaps [r1+16], xmm0 movaps [r1+16], xmm0
...@@ -1222,11 +1229,12 @@ cglobal vp8_luma_dc_wht_%1, 2,3 ...@@ -1222,11 +1229,12 @@ cglobal vp8_luma_dc_wht_%1, 2,3
RET RET
%endmacro %endmacro
INIT_MMX
%if ARCH_X86_32 %if ARCH_X86_32
VP8_DC_WHT mmx INIT_MMX mmx
VP8_DC_WHT
%endif %endif
VP8_DC_WHT sse INIT_MMX sse
VP8_DC_WHT
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim); ; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment