Commit e25be471 authored by Ronald S. Bultje

vp8: convert idct/mc x86 assembly to use cpuflags().

parent 8249a23f
...@@ -29,16 +29,16 @@ ...@@ -29,16 +29,16 @@
/* /*
* MC functions * MC functions
*/ */
extern void ff_put_vp8_epel4_h4_mmxext(uint8_t *dst, ptrdiff_t dststride, extern void ff_put_vp8_epel4_h4_mmx2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride, uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my); int height, int mx, int my);
extern void ff_put_vp8_epel4_h6_mmxext(uint8_t *dst, ptrdiff_t dststride, extern void ff_put_vp8_epel4_h6_mmx2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride, uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my); int height, int mx, int my);
extern void ff_put_vp8_epel4_v4_mmxext(uint8_t *dst, ptrdiff_t dststride, extern void ff_put_vp8_epel4_v4_mmx2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride, uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my); int height, int mx, int my);
extern void ff_put_vp8_epel4_v6_mmxext(uint8_t *dst, ptrdiff_t dststride, extern void ff_put_vp8_epel4_v6_mmx2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride, uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my); int height, int mx, int my);
...@@ -80,7 +80,7 @@ extern void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride, ...@@ -80,7 +80,7 @@ extern void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride, uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my); int height, int mx, int my);
extern void ff_put_vp8_bilinear4_h_mmxext(uint8_t *dst, ptrdiff_t dststride, extern void ff_put_vp8_bilinear4_h_mmx2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride, uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my); int height, int mx, int my);
extern void ff_put_vp8_bilinear8_h_sse2 (uint8_t *dst, ptrdiff_t dststride, extern void ff_put_vp8_bilinear8_h_sse2 (uint8_t *dst, ptrdiff_t dststride,
...@@ -93,7 +93,7 @@ extern void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, ptrdiff_t dststride, ...@@ -93,7 +93,7 @@ extern void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride, uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my); int height, int mx, int my);
extern void ff_put_vp8_bilinear4_v_mmxext(uint8_t *dst, ptrdiff_t dststride, extern void ff_put_vp8_bilinear4_v_mmx2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride, uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my); int height, int mx, int my);
extern void ff_put_vp8_bilinear8_v_sse2 (uint8_t *dst, ptrdiff_t dststride, extern void ff_put_vp8_bilinear8_v_sse2 (uint8_t *dst, ptrdiff_t dststride,
...@@ -139,27 +139,27 @@ static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \ ...@@ -139,27 +139,27 @@ static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
} }
#if ARCH_X86_32 #if ARCH_X86_32
TAP_W8 (mmxext, epel, h4) TAP_W8 (mmx2, epel, h4)
TAP_W8 (mmxext, epel, h6) TAP_W8 (mmx2, epel, h6)
TAP_W16(mmxext, epel, h6) TAP_W16(mmx2, epel, h6)
TAP_W8 (mmxext, epel, v4) TAP_W8 (mmx2, epel, v4)
TAP_W8 (mmxext, epel, v6) TAP_W8 (mmx2, epel, v6)
TAP_W16(mmxext, epel, v6) TAP_W16(mmx2, epel, v6)
TAP_W8 (mmxext, bilinear, h) TAP_W8 (mmx2, bilinear, h)
TAP_W16(mmxext, bilinear, h) TAP_W16(mmx2, bilinear, h)
TAP_W8 (mmxext, bilinear, v) TAP_W8 (mmx2, bilinear, v)
TAP_W16(mmxext, bilinear, v) TAP_W16(mmx2, bilinear, v)
#endif #endif
TAP_W16(sse2, epel, h6) TAP_W16(sse2, epel, h6)
TAP_W16(sse2, epel, v6) TAP_W16(sse2, epel, v6)
TAP_W16(sse2, bilinear, h) TAP_W16(sse2, bilinear, h)
TAP_W16(sse2, bilinear, v) TAP_W16(sse2, bilinear, v)
TAP_W16(ssse3, epel, h6) TAP_W16(ssse3, epel, h6)
TAP_W16(ssse3, epel, v6) TAP_W16(ssse3, epel, v6)
TAP_W16(ssse3, bilinear, h) TAP_W16(ssse3, bilinear, h)
TAP_W16(ssse3, bilinear, v) TAP_W16(ssse3, bilinear, v)
#define HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT) \ #define HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT) \
static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT( \ static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT( \
...@@ -177,13 +177,13 @@ static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT ...@@ -177,13 +177,13 @@ static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT
#if ARCH_X86_32 #if ARCH_X86_32
#define HVTAPMMX(x, y) \ #define HVTAPMMX(x, y) \
HVTAP(mmxext, 8, x, y, 4, 8) \ HVTAP(mmx2, 8, x, y, 4, 8) \
HVTAP(mmxext, 8, x, y, 8, 16) HVTAP(mmx2, 8, x, y, 8, 16)
HVTAP(mmxext, 8, 6, 6, 16, 16) HVTAP(mmx2, 8, 6, 6, 16, 16)
#else #else
#define HVTAPMMX(x, y) \ #define HVTAPMMX(x, y) \
HVTAP(mmxext, 8, x, y, 4, 8) HVTAP(mmx2, 8, x, y, 4, 8)
#endif #endif
HVTAPMMX(4, 4) HVTAPMMX(4, 4)
...@@ -218,16 +218,16 @@ static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \ ...@@ -218,16 +218,16 @@ static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
dst, dststride, tmp, SIZE, height, mx, my); \ dst, dststride, tmp, SIZE, height, mx, my); \
} }
HVBILIN(mmxext, 8, 4, 8) HVBILIN(mmx2, 8, 4, 8)
#if ARCH_X86_32 #if ARCH_X86_32
HVBILIN(mmxext, 8, 8, 16) HVBILIN(mmx2, 8, 8, 16)
HVBILIN(mmxext, 8, 16, 16) HVBILIN(mmx2, 8, 16, 16)
#endif #endif
HVBILIN(sse2, 8, 8, 16) HVBILIN(sse2, 8, 8, 16)
HVBILIN(sse2, 8, 16, 16) HVBILIN(sse2, 8, 16, 16)
HVBILIN(ssse3, 8, 4, 8) HVBILIN(ssse3, 8, 4, 8)
HVBILIN(ssse3, 8, 8, 16) HVBILIN(ssse3, 8, 8, 16)
HVBILIN(ssse3, 8, 16, 16) HVBILIN(ssse3, 8, 16, 16)
extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16], extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16],
ptrdiff_t stride); ptrdiff_t stride);
...@@ -283,7 +283,7 @@ extern void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \ ...@@ -283,7 +283,7 @@ extern void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \
int e, int i, int hvt); int e, int i, int hvt);
DECLARE_LOOP_FILTER(mmx) DECLARE_LOOP_FILTER(mmx)
DECLARE_LOOP_FILTER(mmxext) DECLARE_LOOP_FILTER(mmx2)
DECLARE_LOOP_FILTER(sse2) DECLARE_LOOP_FILTER(sse2)
DECLARE_LOOP_FILTER(ssse3) DECLARE_LOOP_FILTER(ssse3)
DECLARE_LOOP_FILTER(sse4) DECLARE_LOOP_FILTER(sse4)
...@@ -351,26 +351,26 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c) ...@@ -351,26 +351,26 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
/* note that 4-tap width=16 functions are missing because w=16 /* note that 4-tap width=16 functions are missing because w=16
* is only used for luma, and luma is always a copy or sixtap. */ * is only used for luma, and luma is always a copy or sixtap. */
if (mm_flags & AV_CPU_FLAG_MMX2) { if (mm_flags & AV_CPU_FLAG_MMX2) {
VP8_MC_FUNC(2, 4, mmxext); VP8_MC_FUNC(2, 4, mmx2);
VP8_BILINEAR_MC_FUNC(2, 4, mmxext); VP8_BILINEAR_MC_FUNC(2, 4, mmx2);
#if ARCH_X86_32 #if ARCH_X86_32
VP8_LUMA_MC_FUNC(0, 16, mmxext); VP8_LUMA_MC_FUNC(0, 16, mmx2);
VP8_MC_FUNC(1, 8, mmxext); VP8_MC_FUNC(1, 8, mmx2);
VP8_BILINEAR_MC_FUNC(0, 16, mmxext); VP8_BILINEAR_MC_FUNC(0, 16, mmx2);
VP8_BILINEAR_MC_FUNC(1, 8, mmxext); VP8_BILINEAR_MC_FUNC(1, 8, mmx2);
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmxext; c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx2;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmxext; c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx2;
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmxext; c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmx2;
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmxext; c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmx2;
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmxext; c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmx2;
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmxext; c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmx2;
c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmxext; c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmx2;
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmxext; c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmx2;
c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmxext; c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmx2;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmxext; c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmx2;
#endif #endif
} }
......
...@@ -173,8 +173,8 @@ SECTION .text ...@@ -173,8 +173,8 @@ SECTION .text
; int height, int mx, int my); ; int height, int mx, int my);
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%macro FILTER_SSSE3 3 %macro FILTER_SSSE3 1
cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2 cglobal put_vp8_epel%1_h6, 6, 6, 8
lea r5d, [r5*3] lea r5d, [r5*3]
mova m3, [filter_h6_shuf2] mova m3, [filter_h6_shuf2]
mova m4, [filter_h6_shuf3] mova m4, [filter_h6_shuf3]
...@@ -189,7 +189,7 @@ cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2 ...@@ -189,7 +189,7 @@ cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2
movu m0, [r2-2] movu m0, [r2-2]
mova m1, m0 mova m1, m0
mova m2, m0 mova m2, m0
%ifidn %1, 4 %if mmsize == 8
; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the ; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
; shuffle with a memory operand ; shuffle with a memory operand
punpcklbw m0, [r2+3] punpcklbw m0, [r2+3]
...@@ -215,7 +215,7 @@ cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2 ...@@ -215,7 +215,7 @@ cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2
jg .nextrow jg .nextrow
REP_RET REP_RET
cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3 cglobal put_vp8_epel%1_h4, 6, 6, 7
shl r5d, 4 shl r5d, 4
mova m2, [pw_64] mova m2, [pw_64]
mova m3, [filter_h2_shuf] mova m3, [filter_h2_shuf]
...@@ -246,7 +246,7 @@ cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3 ...@@ -246,7 +246,7 @@ cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3
jg .nextrow jg .nextrow
REP_RET REP_RET
cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2 cglobal put_vp8_epel%1_v4, 7, 7, 8
shl r6d, 4 shl r6d, 4
%ifdef PIC %ifdef PIC
lea r11, [fourtap_filter_hb_m] lea r11, [fourtap_filter_hb_m]
...@@ -285,7 +285,7 @@ cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2 ...@@ -285,7 +285,7 @@ cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2
jg .nextrow jg .nextrow
REP_RET REP_RET
cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2 cglobal put_vp8_epel%1_v6, 7, 7, 8
lea r6d, [r6*3] lea r6d, [r6*3]
%ifdef PIC %ifdef PIC
lea r11, [sixtap_filter_hb_m] lea r11, [sixtap_filter_hb_m]
...@@ -333,13 +333,14 @@ cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2 ...@@ -333,13 +333,14 @@ cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2
REP_RET REP_RET
%endmacro %endmacro
INIT_MMX INIT_MMX ssse3
FILTER_SSSE3 4, 0, 0 FILTER_SSSE3 4
INIT_XMM INIT_XMM ssse3
FILTER_SSSE3 8, 8, 7 FILTER_SSSE3 8
; 4x4 block, H-only 4-tap filter ; 4x4 block, H-only 4-tap filter
cglobal put_vp8_epel4_h4_mmxext, 6, 6 INIT_MMX mmx2
cglobal put_vp8_epel4_h4, 6, 6
shl r5d, 4 shl r5d, 4
%ifdef PIC %ifdef PIC
lea r11, [fourtap_filter_hw_m] lea r11, [fourtap_filter_hw_m]
...@@ -386,7 +387,8 @@ cglobal put_vp8_epel4_h4_mmxext, 6, 6 ...@@ -386,7 +387,8 @@ cglobal put_vp8_epel4_h4_mmxext, 6, 6
REP_RET REP_RET
; 4x4 block, H-only 6-tap filter ; 4x4 block, H-only 6-tap filter
cglobal put_vp8_epel4_h6_mmxext, 6, 6 INIT_MMX mmx2
cglobal put_vp8_epel4_h6, 6, 6
lea r5d, [r5*3] lea r5d, [r5*3]
%ifdef PIC %ifdef PIC
lea r11, [sixtap_filter_hw_m] lea r11, [sixtap_filter_hw_m]
...@@ -442,8 +444,8 @@ cglobal put_vp8_epel4_h6_mmxext, 6, 6 ...@@ -442,8 +444,8 @@ cglobal put_vp8_epel4_h6_mmxext, 6, 6
jg .nextrow jg .nextrow
REP_RET REP_RET
INIT_XMM INIT_XMM sse2
cglobal put_vp8_epel8_h4_sse2, 6, 6, 10 cglobal put_vp8_epel8_h4, 6, 6, 10
shl r5d, 5 shl r5d, 5
%ifdef PIC %ifdef PIC
lea r11, [fourtap_filter_v_m] lea r11, [fourtap_filter_v_m]
...@@ -490,7 +492,8 @@ cglobal put_vp8_epel8_h4_sse2, 6, 6, 10 ...@@ -490,7 +492,8 @@ cglobal put_vp8_epel8_h4_sse2, 6, 6, 10
jg .nextrow jg .nextrow
REP_RET REP_RET
cglobal put_vp8_epel8_h6_sse2, 6, 6, 14 INIT_XMM sse2
cglobal put_vp8_epel8_h6, 6, 6, 14
lea r5d, [r5*3] lea r5d, [r5*3]
shl r5d, 4 shl r5d, 4
%ifdef PIC %ifdef PIC
...@@ -552,9 +555,9 @@ cglobal put_vp8_epel8_h6_sse2, 6, 6, 14 ...@@ -552,9 +555,9 @@ cglobal put_vp8_epel8_h6_sse2, 6, 6, 14
jg .nextrow jg .nextrow
REP_RET REP_RET
%macro FILTER_V 3 %macro FILTER_V 1
; 4x4 block, V-only 4-tap filter ; 4x4 block, V-only 4-tap filter
cglobal put_vp8_epel%2_v4_%1, 7, 7, %3 cglobal put_vp8_epel%1_v4, 7, 7, 8
shl r6d, 5 shl r6d, 5
%ifdef PIC %ifdef PIC
lea r11, [fourtap_filter_v_m] lea r11, [fourtap_filter_v_m]
...@@ -607,7 +610,7 @@ cglobal put_vp8_epel%2_v4_%1, 7, 7, %3 ...@@ -607,7 +610,7 @@ cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
; 4x4 block, V-only 6-tap filter ; 4x4 block, V-only 6-tap filter
cglobal put_vp8_epel%2_v6_%1, 7, 7, %3 cglobal put_vp8_epel%1_v6, 7, 7, 8
shl r6d, 4 shl r6d, 4
lea r6, [r6*3] lea r6, [r6*3]
%ifdef PIC %ifdef PIC
...@@ -671,13 +674,13 @@ cglobal put_vp8_epel%2_v6_%1, 7, 7, %3 ...@@ -671,13 +674,13 @@ cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
REP_RET REP_RET
%endmacro %endmacro
INIT_MMX INIT_MMX mmx2
FILTER_V mmxext, 4, 0 FILTER_V 4
INIT_XMM INIT_XMM sse2
FILTER_V sse2, 8, 8 FILTER_V 8
%macro FILTER_BILINEAR 3 %macro FILTER_BILINEAR 1
cglobal put_vp8_bilinear%2_v_%1, 7,7,%3 cglobal put_vp8_bilinear%1_v, 7, 7, 7
mov r5d, 8*16 mov r5d, 8*16
shl r6d, 4 shl r6d, 4
sub r5d, r6d sub r5d, r6d
...@@ -705,7 +708,7 @@ cglobal put_vp8_bilinear%2_v_%1, 7,7,%3 ...@@ -705,7 +708,7 @@ cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
psraw m2, 2 psraw m2, 2
pavgw m0, m6 pavgw m0, m6
pavgw m2, m6 pavgw m2, m6
%ifidn %1, mmxext %if mmsize == 8
packuswb m0, m0 packuswb m0, m0
packuswb m2, m2 packuswb m2, m2
movh [r0+r1*0], m0 movh [r0+r1*0], m0
...@@ -722,7 +725,7 @@ cglobal put_vp8_bilinear%2_v_%1, 7,7,%3 ...@@ -722,7 +725,7 @@ cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
jg .nextrow jg .nextrow
REP_RET REP_RET
cglobal put_vp8_bilinear%2_h_%1, 7,7,%3 cglobal put_vp8_bilinear%1_h, 7, 7, 7
mov r6d, 8*16 mov r6d, 8*16
shl r5d, 4 shl r5d, 4
sub r6d, r5d sub r6d, r5d
...@@ -751,7 +754,7 @@ cglobal put_vp8_bilinear%2_h_%1, 7,7,%3 ...@@ -751,7 +754,7 @@ cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
psraw m2, 2 psraw m2, 2
pavgw m0, m6 pavgw m0, m6
pavgw m2, m6 pavgw m2, m6
%ifidn %1, mmxext %if mmsize == 8
packuswb m0, m0 packuswb m0, m0
packuswb m2, m2 packuswb m2, m2
movh [r0+r1*0], m0 movh [r0+r1*0], m0
...@@ -769,13 +772,13 @@ cglobal put_vp8_bilinear%2_h_%1, 7,7,%3 ...@@ -769,13 +772,13 @@ cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
REP_RET REP_RET
%endmacro %endmacro
INIT_MMX INIT_MMX mmx2
FILTER_BILINEAR mmxext, 4, 0 FILTER_BILINEAR 4
INIT_XMM INIT_XMM sse2
FILTER_BILINEAR sse2, 8, 7 FILTER_BILINEAR 8
%macro FILTER_BILINEAR_SSSE3 1 %macro FILTER_BILINEAR_SSSE3 1
cglobal put_vp8_bilinear%1_v_ssse3, 7,7 cglobal put_vp8_bilinear%1_v, 7, 7, 5
shl r6d, 4 shl r6d, 4
%ifdef PIC %ifdef PIC
lea r11, [bilinear_filter_vb_m] lea r11, [bilinear_filter_vb_m]
...@@ -811,7 +814,7 @@ cglobal put_vp8_bilinear%1_v_ssse3, 7,7 ...@@ -811,7 +814,7 @@ cglobal put_vp8_bilinear%1_v_ssse3, 7,7
jg .nextrow jg .nextrow
REP_RET REP_RET
cglobal put_vp8_bilinear%1_h_ssse3, 7,7 cglobal put_vp8_bilinear%1_h, 7, 7, 5
shl r5d, 4 shl r5d, 4
%ifdef PIC %ifdef PIC
lea r11, [bilinear_filter_vb_m] lea r11, [bilinear_filter_vb_m]
...@@ -848,12 +851,13 @@ cglobal put_vp8_bilinear%1_h_ssse3, 7,7 ...@@ -848,12 +851,13 @@ cglobal put_vp8_bilinear%1_h_ssse3, 7,7
REP_RET REP_RET
%endmacro %endmacro
INIT_MMX INIT_MMX ssse3
FILTER_BILINEAR_SSSE3 4 FILTER_BILINEAR_SSSE3 4
INIT_XMM INIT_XMM ssse3
FILTER_BILINEAR_SSSE3 8 FILTER_BILINEAR_SSSE3 8
cglobal put_vp8_pixels8_mmx, 5,5 INIT_MMX mmx
cglobal put_vp8_pixels8, 5,5
.nextrow: .nextrow:
movq mm0, [r2+r3*0] movq mm0, [r2+r3*0]
movq mm1, [r2+r3*1] movq mm1, [r2+r3*1]
...@@ -866,7 +870,8 @@ cglobal put_vp8_pixels8_mmx, 5,5 ...@@ -866,7 +870,8 @@ cglobal put_vp8_pixels8_mmx, 5,5
REP_RET REP_RET
%if ARCH_X86_32 %if ARCH_X86_32
cglobal put_vp8_pixels16_mmx, 5,5 INIT_MMX mmx
cglobal put_vp8_pixels16, 5,5
.nextrow: .nextrow:
movq mm0, [r2+r3*0+0] movq mm0, [r2+r3*0+0]
movq mm1, [r2+r3*0+8] movq mm1, [r2+r3*0+8]
...@@ -883,7 +888,8 @@ cglobal put_vp8_pixels16_mmx, 5,5 ...@@ -883,7 +888,8 @@ cglobal put_vp8_pixels16_mmx, 5,5
REP_RET REP_RET
%endif %endif
cglobal put_vp8_pixels16_sse, 5,5,2 INIT_XMM sse
cglobal put_vp8_pixels16, 5,5,2
.nextrow: .nextrow:
movups xmm0, [r2+r3*0] movups xmm0, [r2+r3*0]
movups xmm1, [r2+r3*1] movups xmm1, [r2+r3*1]
...@@ -918,8 +924,8 @@ cglobal put_vp8_pixels16_sse, 5,5,2 ...@@ -918,8 +924,8 @@ cglobal put_vp8_pixels16_sse, 5,5,2
%4 [r1+r2+%3], m5 %4 [r1+r2+%3], m5
%endmacro %endmacro
INIT_MMX INIT_MMX mmx
cglobal vp8_idct_dc_add_mmx, 3, 3 cglobal vp8_idct_dc_add, 3, 3
; load data ; load data
movd m0, [r1] movd m0, [r1]
...@@ -941,8 +947,8 @@ cglobal vp8_idct_dc_add_mmx, 3, 3 ...@@ -941,8 +947,8 @@ cglobal vp8_idct_dc_add_mmx, 3, 3
ADD_DC m0, m1, 0, movh ADD_DC m0, m1, 0, movh
RET RET
INIT_XMM INIT_XMM sse4
cglobal vp8_idct_dc_add_sse4, 3, 3, 6 cglobal vp8_idct_dc_add, 3, 3, 6
; load data ; load data
movd m0, [r1] movd m0, [r1]
pxor m1, m1 pxor m1, m1
...@@ -976,8 +982,8 @@ cglobal vp8_idct_dc_add_sse4, 3, 3, 6 ...@@ -976,8 +982,8 @@ cglobal vp8_idct_dc_add_sse4, 3, 3, 6
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%if ARCH_X86_32 %if ARCH_X86_32
INIT_MMX INIT_MMX mmx
cglobal vp8_idct_dc_add4y_mmx, 3, 3 cglobal vp8_idct_dc_add4y, 3, 3
; load data ; load data
movd m0, [r1+32*0] ; A movd m0, [r1+32*0] ; A
movd m1, [r1+32*2] ; C movd m1, [r1+32*2] ; C
...@@ -1012,8 +1018,8 @@ cglobal vp8_idct_dc_add4y_mmx, 3, 3 ...@@ -1012,8 +1018,8 @@ cglobal vp8_idct_dc_add4y_mmx, 3, 3
RET RET
%endif %endif
INIT_XMM INIT_XMM sse2
cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6 cglobal vp8_idct_dc_add4y, 3, 3, 6
; load data ; load data
movd m0, [r1+32*0] ; A movd m0, [r1+32*0] ; A
movd m1, [r1+32*2] ; C movd m1, [r1+32*2] ; C
...@@ -1046,8 +1052,8 @@ cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6 ...@@ -1046,8 +1052,8 @@ cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6
; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride); ; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
INIT_MMX INIT_MMX mmx
cglobal vp8_idct_dc_add4uv_mmx, 3, 3 cglobal vp8_idct_dc_add4uv, 3, 3
; load data ; load data
movd m0, [r1+32*0] ; A movd m0, [r1+32*0] ; A
movd m1, [r1+32*2] ; C movd m1, [r1+32*2] ; C
...@@ -1118,9 +1124,8 @@ cglobal vp8_idct_dc_add4uv_mmx, 3, 3 ...@@ -1118,9 +1124,8 @@ cglobal vp8_idct_dc_add4uv_mmx, 3, 3
SWAP %4, %3 SWAP %4, %3
%endmacro %endmacro
INIT_MMX %macro VP8_IDCT_ADD 0
%macro VP8_IDCT_ADD 1 cglobal vp8_idct_add, 3, 3
cglobal vp8_idct_add_%1, 3, 3
; load block data ; load block data
movq m0, [r1+ 0] movq m0, [r1+ 0]
movq m1, [r1+ 8] movq m1, [r1+ 8]
...@@ -1128,7 +1133,7 @@ cglobal vp8_idct_add_%1, 3, 3 ...@@ -1128,7 +1133,7 @@ cglobal vp8_idct_add_%1, 3, 3
movq m3, [r1+24] movq m3, [r1+24]
movq m6, [pw_20091] movq m6, [pw_20091]
movq m7, [pw_17734] movq m7, [pw_17734]
%ifidn %1, sse %if cpuflag(sse)
xorps xmm0, xmm0 xorps xmm0, xmm0
movaps [r1+ 0], xmm0 movaps [r1+ 0], xmm0
movaps [r1+16], xmm0 movaps [r1+16], xmm0
...@@ -1157,9 +1162,11 @@ cglobal vp8_idct_add_%1, 3, 3 ...@@ -1157,9 +1162,11 @@ cglobal vp8_idct_add_%1, 3, 3
%endmacro %endmacro
%if ARCH_X86_32 %if ARCH_X86_32
VP8_IDCT_ADD mmx INIT_MMX mmx
VP8_IDCT_ADD
%endif %endif
VP8_IDCT_ADD sse INIT_MMX sse
VP8_IDCT_ADD
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16]) ; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16])
...@@ -1192,13 +1199,13 @@ VP8_IDCT_ADD sse ...@@ -1192,13 +1199,13 @@ VP8_IDCT_ADD sse
SWAP %1, %4, %3 SWAP %1, %4, %3
%endmacro %endmacro
%macro VP8_DC_WHT 1 %macro VP8_DC_WHT 0
cglobal vp8_luma_dc_wht_%1, 2,3 cglobal vp8_luma_dc_wht, 2, 3
movq m0, [r1] movq m0, [r1]
movq m1, [r1+8] movq m1, [r1+8]
movq m2, [r1+16] movq m2, [r1+16]
movq m3, [r1+24] movq m3, [r1+24]
%ifidn %1, sse %if cpuflag(sse)
xorps xmm0, xmm0 xorps xmm0, xmm0
movaps [r1+ 0], xmm0 movaps [r1+ 0], xmm0
movaps [r1+16], xmm0 movaps [r1+16], xmm0
...@@ -1222,11 +1229,12 @@ cglobal vp8_luma_dc_wht_%1, 2,3 ...@@ -1222,11 +1229,12 @@ cglobal vp8_luma_dc_wht_%1, 2,3
RET RET
%endmacro %endmacro
INIT_MMX
%if ARCH_X86_32 %if ARCH_X86_32
VP8_DC_WHT mmx INIT_MMX mmx
VP8_DC_WHT
%endif %endif
VP8_DC_WHT sse INIT_MMX sse
VP8_DC_WHT
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim); ; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment