Commit dd68d4db authored by Ronald S. Bultje

MMX, MMX2, SSE2 and SSSE3 optimizations for pred16x16/8x8_plane H264 intra
prediction (plus variants with different rounding for svq3/rv40). The speedup
(for SSSE3) is about 6-fold, and overall decoding is 3.6% faster with the
cathedral sample.

Originally committed as revision 25361 to svn://svn.ffmpeg.org/ffmpeg/trunk
parent 2f412421
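For reference, a scalar sketch of what these plane-prediction functions compute, reconstructed from the H.264 plane formula and FFmpeg's C reference as best recalled; the committed code is hand-written assembly, not this, and the names clip_u8, MODE_* and pred16x16_plane_ref are invented here. The svq3/rv40 branches show the "different rounding" mentioned above, which is why those codecs get separate entry points.

#include <stdint.h>

/* Pixel clipping helper (name invented for this sketch). */
static uint8_t clip_u8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

enum { MODE_H264, MODE_SVQ3, MODE_RV40 };   /* sketch-only labels */

static void pred16x16_plane_ref(uint8_t *src, int stride, int mode)
{
    const uint8_t *top = src - stride;      /* row above the block */
    int H = 0, V = 0, a, b, i, j, k;

    /* Weighted horizontal/vertical gradients over the top row and left
     * column (both include the top-left corner sample at k == 8). */
    for (k = 1; k <= 8; k++) {
        H += k * (top[7 + k] - top[7 - k]);
        V += k * (src[(7 + k) * stride - 1] - src[(7 - k) * stride - 1]);
    }

    if (mode == MODE_SVQ3) {            /* truncating divisions, H/V swapped */
        int t = (5 * (H / 4)) / 16;
        H = (5 * (V / 4)) / 16;
        V = t;
    } else if (mode == MODE_RV40) {     /* shift-based take on the 5/64 scale */
        H = (H + (H >> 2)) >> 4;
        V = (V + (V >> 2)) >> 4;
    } else {                            /* H.264: (5*x + 32) >> 6 */
        H = (5 * H + 32) >> 6;
        V = (5 * V + 32) >> 6;
    }

    /* Plane offset from the top-right and bottom-left neighbours, then a
     * linear ramp in both directions, clipped to 8 bits. */
    a = 16 * (top[15] + src[15 * stride - 1] + 1) - 7 * (H + V);
    for (j = 0; j < 16; j++) {
        b = a;
        for (i = 0; i < 16; i++) {
            src[i] = clip_u8(b >> 5);
            b += H;
        }
        a   += V;
        src += stride;
    }
}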
......@@ -48,6 +48,7 @@ DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8 ) = {0x0008000800080008ULL, 0x00080
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9 ) = {0x0009000900090009ULL, 0x0009000900090009ULL};
DECLARE_ALIGNED(8, const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17 ) = {0x0011001100110011ULL, 0x0011001100110011ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18 ) = {0x0012001200120012ULL, 0x0012001200120012ULL};
DECLARE_ALIGNED(8, const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_27 ) = {0x001B001B001B001BULL, 0x001B001B001B001BULL};
......
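The one constant added here is ff_pw_17; presumably it serves the 8x8 plane case, where the H.264 gradient scaling multiplies by 17 instead of the 5 used for 16x16. A minimal sketch of that step (helper name invented):

/* H.264 8x8 (chroma-size) plane scaling: b = (17*H + 16) >> 5, same for V.
 * A packed-word 17 is the natural operand for a pmullw-based version. */
static void pred8x8_plane_scale(int *H, int *V)
{
    *H = (17 * *H + 16) >> 5;
    *V = (17 * *V + 16) >> 5;
}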
......@@ -29,6 +29,18 @@ void ff_pred16x16_horizontal_ssse3 (uint8_t *src, int stride);
void ff_pred16x16_dc_mmxext (uint8_t *src, int stride);
void ff_pred16x16_dc_sse2 (uint8_t *src, int stride);
void ff_pred16x16_dc_ssse3 (uint8_t *src, int stride);
void ff_pred16x16_plane_h264_mmx (uint8_t *src, int stride);
void ff_pred16x16_plane_h264_mmx2 (uint8_t *src, int stride);
void ff_pred16x16_plane_h264_sse2 (uint8_t *src, int stride);
void ff_pred16x16_plane_h264_ssse3 (uint8_t *src, int stride);
void ff_pred16x16_plane_rv40_mmx (uint8_t *src, int stride);
void ff_pred16x16_plane_rv40_mmx2 (uint8_t *src, int stride);
void ff_pred16x16_plane_rv40_sse2 (uint8_t *src, int stride);
void ff_pred16x16_plane_rv40_ssse3 (uint8_t *src, int stride);
void ff_pred16x16_plane_svq3_mmx (uint8_t *src, int stride);
void ff_pred16x16_plane_svq3_mmx2 (uint8_t *src, int stride);
void ff_pred16x16_plane_svq3_sse2 (uint8_t *src, int stride);
void ff_pred16x16_plane_svq3_ssse3 (uint8_t *src, int stride);
void ff_pred16x16_tm_vp8_mmx (uint8_t *src, int stride);
void ff_pred16x16_tm_vp8_mmxext (uint8_t *src, int stride);
void ff_pred16x16_tm_vp8_sse2 (uint8_t *src, int stride);
......@@ -37,6 +49,10 @@ void ff_pred8x8_vertical_mmx (uint8_t *src, int stride);
void ff_pred8x8_horizontal_mmx (uint8_t *src, int stride);
void ff_pred8x8_horizontal_mmxext (uint8_t *src, int stride);
void ff_pred8x8_horizontal_ssse3 (uint8_t *src, int stride);
void ff_pred8x8_plane_mmx (uint8_t *src, int stride);
void ff_pred8x8_plane_mmx2 (uint8_t *src, int stride);
void ff_pred8x8_plane_sse2 (uint8_t *src, int stride);
void ff_pred8x8_plane_ssse3 (uint8_t *src, int stride);
void ff_pred8x8_tm_vp8_mmx (uint8_t *src, int stride);
void ff_pred8x8_tm_vp8_mmxext (uint8_t *src, int stride);
void ff_pred8x8_tm_vp8_sse2 (uint8_t *src, int stride);
......@@ -61,6 +77,15 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id)
            h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_tm_vp8_mmx;
            h->pred8x8  [PLANE_PRED8x8] = ff_pred8x8_tm_vp8_mmx;
            h->pred4x4  [TM_VP8_PRED  ] = ff_pred4x4_tm_vp8_mmx;
        } else {
            h->pred8x8  [PLANE_PRED8x8] = ff_pred8x8_plane_mmx;
            if (codec_id == CODEC_ID_SVQ3) {
                h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_mmx;
            } else if (codec_id == CODEC_ID_RV40) {
                h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_mmx;
            } else {
                h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_mmx;
            }
        }
    }
......@@ -75,6 +100,15 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id)
            h->pred8x8  [PLANE_PRED8x8] = ff_pred8x8_tm_vp8_mmxext;
            h->pred4x4  [TM_VP8_PRED  ] = ff_pred4x4_tm_vp8_mmxext;
            h->pred4x4  [VERT_PRED    ] = ff_pred4x4_vertical_vp8_mmxext;
        } else {
            h->pred8x8  [PLANE_PRED8x8] = ff_pred8x8_plane_mmx2;
            if (codec_id == CODEC_ID_SVQ3) {
                h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_mmx2;
            } else if (codec_id == CODEC_ID_RV40) {
                h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_mmx2;
            } else {
                h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_mmx2;
            }
        }
    }
......@@ -87,6 +121,15 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id)
        if (codec_id == CODEC_ID_VP8) {
            h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_tm_vp8_sse2;
            h->pred8x8  [PLANE_PRED8x8] = ff_pred8x8_tm_vp8_sse2;
        } else {
            h->pred8x8  [PLANE_PRED8x8] = ff_pred8x8_plane_sse2;
            if (codec_id == CODEC_ID_SVQ3) {
                h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_sse2;
            } else if (codec_id == CODEC_ID_RV40) {
                h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_sse2;
            } else {
                h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_sse2;
            }
        }
    }
......@@ -97,6 +140,15 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id)
        if (codec_id == CODEC_ID_VP8) {
            h->pred8x8  [PLANE_PRED8x8] = ff_pred8x8_tm_vp8_ssse3;
            h->pred4x4  [TM_VP8_PRED  ] = ff_pred4x4_tm_vp8_ssse3;
        } else {
            h->pred8x8  [PLANE_PRED8x8] = ff_pred8x8_plane_ssse3;
            if (codec_id == CODEC_ID_SVQ3) {
                h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_ssse3;
            } else if (codec_id == CODEC_ID_RV40) {
                h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_ssse3;
            } else {
                h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_ssse3;
            }
        }
    }
#endif
......
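For context on how these entry points are consumed, a minimal usage sketch (not code from this commit; the wrapper name and per-call init are purely illustrative): the decoder fills an H264PredContext once and then predicts through its function-pointer tables, so the init code above only has to swap in faster pointers.

#include <stdint.h>
#include "h264pred.h"   /* H264PredContext, PLANE_PRED8x8, ff_h264_pred_init() */

/* Illustrative only: a real decoder initialises the context once, not per call,
 * and dst must point into a picture whose above-row and left-column neighbours
 * are already reconstructed, since plane prediction reads them. */
static void predict_plane_16x16(uint8_t *dst, int stride, int codec_id)
{
    H264PredContext hpc;

    /* Sets up the C implementations; on x86 builds this should end up in the
     * ff_h264_pred_init_x86() patched above, which overrides entries with the
     * fastest MMX/MMX2/SSE2/SSSE3 versions the CPU supports. */
    ff_h264_pred_init(&hpc, codec_id);

    /* The indirect call lands in e.g. ff_pred16x16_plane_h264_ssse3, or the
     * _svq3/_rv40 variant depending on codec_id. */
    hpc.pred16x16[PLANE_PRED8x8](dst, stride);
}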