Commit e9e456d8 authored by Ronald S. Bultje's avatar Ronald S. Bultje

VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)

and chroma (width=8).

Originally committed as revision 24378 to svn://svn.ffmpeg.org/ffmpeg/trunk
parent 92bfd746
......@@ -50,12 +50,16 @@ DECLARE_ALIGNED(8, const uint64_t, ff_pw_3 ) = 0x0003000300030003ULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4 ) = {0x0004000400040004ULL, 0x0004000400040004ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8 ) = {0x0008000800080008ULL, 0x0008000800080008ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9 ) = {0x0009000900090009ULL, 0x0009000900090009ULL};
DECLARE_ALIGNED(8, const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18 ) = {0x0012001200120012ULL, 0x0012001200120012ULL};
DECLARE_ALIGNED(8, const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_27 ) = {0x001B001B001B001BULL, 0x001B001B001B001BULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
DECLARE_ALIGNED(8, const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_63 ) = {0x003F003F003F003FULL, 0x003F003F003F003FULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64 ) = {0x0040004000400040ULL, 0x0040004000400040ULL};
DECLARE_ALIGNED(8, const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
......
......@@ -73,8 +73,6 @@
"movq %%mm"#R1", "#OFF"(%1) \n\t" \
"add %2, %0 \n\t"
DECLARE_ALIGNED(16, const uint64_t, ff_pw_9) = 0x0009000900090009ULL;
/** Sacrifying mm6 allows to pipeline loads from src */
static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
const uint8_t *src, x86_reg stride,
......
......@@ -255,6 +255,32 @@ extern void ff_vp8_h_loop_filter8uv_inner_mmxext(uint8_t *dstU, uint8_t *dstV,
int s, int e, int i, int hvt);
extern void ff_vp8_h_loop_filter8uv_inner_sse2 (uint8_t *dstU, uint8_t *dstV,
int s, int e, int i, int hvt);
extern void ff_vp8_v_loop_filter16y_mbedge_mmx (uint8_t *dst, int stride,
int e, int i, int hvt);
extern void ff_vp8_v_loop_filter16y_mbedge_mmxext(uint8_t *dst, int stride,
int e, int i, int hvt);
extern void ff_vp8_v_loop_filter16y_mbedge_sse2 (uint8_t *dst, int stride,
int e, int i, int hvt);
extern void ff_vp8_h_loop_filter16y_mbedge_mmx (uint8_t *dst, int stride,
int e, int i, int hvt);
extern void ff_vp8_h_loop_filter16y_mbedge_mmxext(uint8_t *dst, int stride,
int e, int i, int hvt);
extern void ff_vp8_h_loop_filter16y_mbedge_sse2 (uint8_t *dst, int stride,
int e, int i, int hvt);
extern void ff_vp8_v_loop_filter8uv_mbedge_mmx (uint8_t *dstU, uint8_t *dstV,
int s, int e, int i, int hvt);
extern void ff_vp8_v_loop_filter8uv_mbedge_mmxext(uint8_t *dstU, uint8_t *dstV,
int s, int e, int i, int hvt);
extern void ff_vp8_v_loop_filter8uv_mbedge_sse2 (uint8_t *dstU, uint8_t *dstV,
int s, int e, int i, int hvt);
extern void ff_vp8_h_loop_filter8uv_mbedge_mmx (uint8_t *dstU, uint8_t *dstV,
int s, int e, int i, int hvt);
extern void ff_vp8_h_loop_filter8uv_mbedge_mmxext(uint8_t *dstU, uint8_t *dstV,
int s, int e, int i, int hvt);
extern void ff_vp8_h_loop_filter8uv_mbedge_sse2 (uint8_t *dstU, uint8_t *dstV,
int s, int e, int i, int hvt);
#endif
#define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
......@@ -301,6 +327,11 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmx;
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmx;
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmx;
c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmx;
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmx;
c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmx;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmx;
}
/* note that 4-tap width=16 functions are missing because w=16
......@@ -321,6 +352,11 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmxext;
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmxext;
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmxext;
c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmxext;
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmxext;
c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmxext;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmxext;
}
if (mm_flags & FF_MM_SSE) {
......@@ -339,11 +375,17 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2;
c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmxext;
c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmxext;
}
if (mm_flags & FF_MM_SSE2) {
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmxext;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmxext;
}
if (mm_flags & FF_MM_SSSE3) {
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment