Commit f2a30bd8 authored by Ronald S. Bultje

Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).

Originally committed as revision 24029 to svn://svn.ffmpeg.org/ffmpeg/trunk
parent ea28e81f
......@@ -63,12 +63,16 @@ DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
/*
 * Byte-pattern constants. Every 64-bit word below consists of one byte value
 * repeated in all eight byte positions; the hex suffix in each name matches
 * that byte value (ff_pb_3F -> 0x3F, and so on).
 *
 * The uint64_t entries are one such word, with 8 passed to DECLARE_ALIGNED.
 * The xmm_reg entries are initialised with two identical words, with 16
 * passed to DECLARE_ALIGNED.
 *
 * NOTE(review): presumably these serve as packed-byte operands for the SIMD
 * code -- confirm against the users of each symbol.
 */
DECLARE_ALIGNED(8, const uint64_t, ff_pb_1 ) = 0x0101010101010101ULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3 ) = {0x0303030303030303ULL, 0x0303030303030303ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_4 ) = {0x0404040404040404ULL, 0x0404040404040404ULL};
DECLARE_ALIGNED(8, const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80 ) = {0x8080808080808080ULL, 0x8080808080808080ULL};
DECLARE_ALIGNED(8, const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_F8 ) = {0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL};
DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE ) = {0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL};
/* Two-element double arrays holding the same value twice (1.0 and 2.0). */
DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
......
......@@ -222,6 +222,13 @@ extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride);
extern void ff_vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16]);
extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
/*
 * Prototypes for the VP8 simple loop filter: a "v" and an "h" entry point for
 * each of the mmx, mmxext and sse2 variants. All six share one signature,
 * (uint8_t *dst, int stride, int flim), and return nothing, so any of them
 * can be assigned to the same function-pointer slot.
 *
 * NOTE(review): the definitions are not visible here; flim is presumably the
 * filter limit/threshold -- confirm against the implementation.
 */
extern void ff_vp8_v_loop_filter_simple_mmx (uint8_t *dst, int stride, int flim);
extern void ff_vp8_v_loop_filter_simple_mmxext(uint8_t *dst, int stride, int flim);
extern void ff_vp8_v_loop_filter_simple_sse2 (uint8_t *dst, int stride, int flim);
extern void ff_vp8_h_loop_filter_simple_mmx (uint8_t *dst, int stride, int flim);
extern void ff_vp8_h_loop_filter_simple_mmxext(uint8_t *dst, int stride, int flim);
extern void ff_vp8_h_loop_filter_simple_sse2 (uint8_t *dst, int stride, int flim);
#endif
#define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
......@@ -260,6 +267,9 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_mmx;
c->put_vp8_epel_pixels_tab[1][0][0] =
c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx;
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx;
}
/* note that 4-tap width=16 functions are missing because w=16
......@@ -272,6 +282,9 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
VP8_BILINEAR_MC_FUNC(0, 16, mmxext);
VP8_BILINEAR_MC_FUNC(1, 8, mmxext);
VP8_BILINEAR_MC_FUNC(2, 4, mmxext);
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmxext;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmxext;
}
if (mm_flags & FF_MM_SSE) {
......@@ -284,6 +297,9 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
VP8_MC_FUNC(1, 8, sse2);
VP8_BILINEAR_MC_FUNC(0, 16, sse2);
VP8_BILINEAR_MC_FUNC(1, 8, sse2);
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
}
if (mm_flags & FF_MM_SSSE3) {
......
This diff is collapsed.
......@@ -37,6 +37,14 @@
SWAP %2, %4, %3
%endmacro
; TRANSPOSE4x4B %1, %2, %3, %4, %5
; Five-argument macro made of four SBUTTERFLY steps followed by one SWAP.
; %1-%4 are the operands being rearranged; %5 is handed unchanged to every
; SBUTTERFLY as its last argument.
; NOTE(review): going by the name, this transposes a 4x4 block of bytes held
; in %1-%4, with %5 as a scratch register. SBUTTERFLY and SWAP are defined
; outside this view -- confirm their semantics there.
%macro TRANSPOSE4x4B 5
; "bw" pass over the pairs (%1, %2) and (%3, %4)
SBUTTERFLY bw, %1, %2, %5
SBUTTERFLY bw, %3, %4, %5
; "wd" pass over the pairs (%1, %3) and (%2, %4)
SBUTTERFLY wd, %1, %3, %5
SBUTTERFLY wd, %2, %4, %5
; exchange %2 and %3 so the results end up in %1..%4 order (presumably; verify)
SWAP %2, %3
%endmacro
%macro TRANSPOSE4x4W 5
SBUTTERFLY wd, %1, %2, %5
SBUTTERFLY wd, %3, %4, %5
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment