Commit 648b422e authored by gxw, committed by Michael Niedermayer

avcodec/mips: msa optimizations for vc1dsp

WMV3 decoding performance has improved from 3.66x to 5.23x, tested on 3A4000.
Reviewed-by: Shiyou Yin <yinshiyou-hf@loongson.cn>
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
parent af70c94c
...@@ -89,3 +89,4 @@ MMI-OBJS-$(CONFIG_WMV2DSP) += mips/wmv2dsp_mmi.o ...@@ -89,3 +89,4 @@ MMI-OBJS-$(CONFIG_WMV2DSP) += mips/wmv2dsp_mmi.o
MMI-OBJS-$(CONFIG_HEVC_DECODER) += mips/hevcdsp_mmi.o MMI-OBJS-$(CONFIG_HEVC_DECODER) += mips/hevcdsp_mmi.o
MMI-OBJS-$(CONFIG_VP3DSP) += mips/vp3dsp_idct_mmi.o MMI-OBJS-$(CONFIG_VP3DSP) += mips/vp3dsp_idct_mmi.o
MMI-OBJS-$(CONFIG_VP9_DECODER) += mips/vp9_mc_mmi.o MMI-OBJS-$(CONFIG_VP9_DECODER) += mips/vp9_mc_mmi.o
MSA-OBJS-$(CONFIG_VC1_DECODER) += mips/vc1dsp_msa.o
...@@ -23,6 +23,10 @@ ...@@ -23,6 +23,10 @@
#include "vc1dsp_mips.h" #include "vc1dsp_mips.h"
#include "config.h" #include "config.h"
/*
 * Install one pair of mspel functions into the context named `dsp`, which
 * must be in scope at the point of use:
 *   - OP##vc1_mspel_pixels_tab[1][X + 4*Y] gets ff_<OP>vc1_mspel_mc<X><Y><INSN>
 *   - OP##vc1_mspel_pixels_tab[0][X + 4*Y] gets the matching "_16" variant
 *
 * X and Y are token-pasted into the function names, so they have to be
 * literal digits. The body is wrapped in do { } while (0) so that both
 * assignments act as a single statement (e.g. under an unbraced if/else);
 * the caller supplies the terminating ';'.
 */
#define FN_ASSIGN(OP, X, Y, INSN)                                             \
    do {                                                                      \
        dsp->OP##vc1_mspel_pixels_tab[1][(X) + 4 * (Y)] =                     \
            ff_##OP##vc1_mspel_mc##X##Y##INSN;                                \
        dsp->OP##vc1_mspel_pixels_tab[0][(X) + 4 * (Y)] =                     \
            ff_##OP##vc1_mspel_mc##X##Y##_16##INSN;                           \
    } while (0)
#if HAVE_MMI #if HAVE_MMI
static av_cold void vc1dsp_init_mmi(VC1DSPContext *dsp) static av_cold void vc1dsp_init_mmi(VC1DSPContext *dsp)
{ {
...@@ -49,10 +53,6 @@ static av_cold void vc1dsp_init_mmi(VC1DSPContext *dsp) ...@@ -49,10 +53,6 @@ static av_cold void vc1dsp_init_mmi(VC1DSPContext *dsp)
dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_mmi; dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_mmi;
dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_mmi; dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_mmi;
#define FN_ASSIGN(OP, X, Y, INSN) \
dsp->OP##vc1_mspel_pixels_tab[1][X+4*Y] = ff_##OP##vc1_mspel_mc##X##Y##INSN; \
dsp->OP##vc1_mspel_pixels_tab[0][X+4*Y] = ff_##OP##vc1_mspel_mc##X##Y##_16##INSN
FN_ASSIGN(put_, 0, 0, _mmi); FN_ASSIGN(put_, 0, 0, _mmi);
FN_ASSIGN(put_, 0, 1, _mmi); FN_ASSIGN(put_, 0, 1, _mmi);
FN_ASSIGN(put_, 0, 2, _mmi); FN_ASSIGN(put_, 0, 2, _mmi);
...@@ -100,9 +100,31 @@ static av_cold void vc1dsp_init_mmi(VC1DSPContext *dsp) ...@@ -100,9 +100,31 @@ static av_cold void vc1dsp_init_mmi(VC1DSPContext *dsp)
} }
#endif /* HAVE_MMI */ #endif /* HAVE_MMI */
#if HAVE_MSA
/*
 * Point the VC-1 DSP context at the MSA implementations declared in
 * vc1dsp_mips.h: three inverse transforms plus the "put" mspel functions
 * for every (X, Y) combination with X and Y in 1..3. Entries of the
 * context that are not assigned here are left untouched.
 */
static av_cold void vc1dsp_init_msa(VC1DSPContext *dsp)
{
    /* put_ mspel functions, X = 1 */
    FN_ASSIGN(put_, 1, 1, _msa);
    FN_ASSIGN(put_, 1, 2, _msa);
    FN_ASSIGN(put_, 1, 3, _msa);

    /* put_ mspel functions, X = 2 */
    FN_ASSIGN(put_, 2, 1, _msa);
    FN_ASSIGN(put_, 2, 2, _msa);
    FN_ASSIGN(put_, 2, 3, _msa);

    /* put_ mspel functions, X = 3 */
    FN_ASSIGN(put_, 3, 1, _msa);
    FN_ASSIGN(put_, 3, 2, _msa);
    FN_ASSIGN(put_, 3, 3, _msa);

    /* inverse transforms */
    dsp->vc1_inv_trans_8x4 = ff_vc1_inv_trans_8x4_msa;
    dsp->vc1_inv_trans_4x8 = ff_vc1_inv_trans_4x8_msa;
    dsp->vc1_inv_trans_8x8 = ff_vc1_inv_trans_8x8_msa;
}
#endif /* HAVE_MSA */
av_cold void ff_vc1dsp_init_mips(VC1DSPContext *dsp) av_cold void ff_vc1dsp_init_mips(VC1DSPContext *dsp)
{ {
#if HAVE_MMI #if HAVE_MMI
vc1dsp_init_mmi(dsp); vc1dsp_init_mmi(dsp);
#endif /* HAVE_MMI */ #endif /* HAVE_MMI */
#if HAVE_MSA
vc1dsp_init_msa(dsp);
#endif /* HAVE_MSA */
} }
...@@ -191,4 +191,27 @@ void ff_avg_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */, ...@@ -191,4 +191,27 @@ void ff_avg_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
uint8_t *src /* align 1 */, uint8_t *src /* align 1 */,
ptrdiff_t stride, int h, int x, int y); ptrdiff_t stride, int h, int x, int y);
void ff_vc1_inv_trans_8x8_msa(int16_t block[64]);
void ff_vc1_inv_trans_8x4_msa(uint8_t *dest, ptrdiff_t linesize, int16_t *block);
void ff_vc1_inv_trans_4x8_msa(uint8_t *dest, ptrdiff_t linesize, int16_t *block);
/*
 * Declare ff_put_vc1_mspel_mc<hmode><vmode>_msa and its "_16" counterpart.
 * hmode and vmode are pasted into the function names.
 *
 * The expansion intentionally does NOT end in ';': the semicolon written
 * at each use site terminates the second prototype. Ending the macro in
 * ';' as well would leave an empty declaration at file scope, which
 * strict ISO C rejects and -Wpedantic warns about.
 */
#define FF_PUT_VC1_MSPEL_MC_MSA(hmode, vmode)                                 \
void ff_put_vc1_mspel_mc ## hmode ## vmode ## _msa(uint8_t *dst,              \
                                                   const uint8_t *src,        \
                                                   ptrdiff_t stride, int rnd);\
void ff_put_vc1_mspel_mc ## hmode ## vmode ## _16_msa(uint8_t *dst,           \
                                                      const uint8_t *src,     \
                                                      ptrdiff_t stride, int rnd)
FF_PUT_VC1_MSPEL_MC_MSA(1, 1);
FF_PUT_VC1_MSPEL_MC_MSA(1, 2);
FF_PUT_VC1_MSPEL_MC_MSA(1, 3);
FF_PUT_VC1_MSPEL_MC_MSA(2, 1);
FF_PUT_VC1_MSPEL_MC_MSA(2, 2);
FF_PUT_VC1_MSPEL_MC_MSA(2, 3);
FF_PUT_VC1_MSPEL_MC_MSA(3, 1);
FF_PUT_VC1_MSPEL_MC_MSA(3, 2);
FF_PUT_VC1_MSPEL_MC_MSA(3, 3);
#endif /* AVCODEC_MIPS_VC1DSP_MIPS_H */ #endif /* AVCODEC_MIPS_VC1DSP_MIPS_H */
This diff is collapsed.
...@@ -299,6 +299,7 @@ ...@@ -299,6 +299,7 @@
#define LD_SB4(...) LD_V4(v16i8, __VA_ARGS__) #define LD_SB4(...) LD_V4(v16i8, __VA_ARGS__)
#define LD_UH4(...) LD_V4(v8u16, __VA_ARGS__) #define LD_UH4(...) LD_V4(v8u16, __VA_ARGS__)
#define LD_SH4(...) LD_V4(v8i16, __VA_ARGS__) #define LD_SH4(...) LD_V4(v8i16, __VA_ARGS__)
#define LD_SW4(...) LD_V4(v4i32, __VA_ARGS__)
#define LD_V5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \ #define LD_V5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \
{ \ { \
...@@ -337,6 +338,7 @@ ...@@ -337,6 +338,7 @@
#define LD_SB8(...) LD_V8(v16i8, __VA_ARGS__) #define LD_SB8(...) LD_V8(v16i8, __VA_ARGS__)
#define LD_UH8(...) LD_V8(v8u16, __VA_ARGS__) #define LD_UH8(...) LD_V8(v8u16, __VA_ARGS__)
#define LD_SH8(...) LD_V8(v8i16, __VA_ARGS__) #define LD_SH8(...) LD_V8(v8i16, __VA_ARGS__)
#define LD_SW8(...) LD_V8(v4i32, __VA_ARGS__)
#define LD_V16(RTYPE, psrc, stride, \ #define LD_V16(RTYPE, psrc, stride, \
out0, out1, out2, out3, out4, out5, out6, out7, \ out0, out1, out2, out3, out4, out5, out6, out7, \
...@@ -1382,6 +1384,7 @@ ...@@ -1382,6 +1384,7 @@
out4, out5, out6, out7); \ out4, out5, out6, out7); \
} }
#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__) #define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
#define ILVR_B8_SW(...) ILVR_B8(v4i32, __VA_ARGS__)
/* Description : Interleave right half of halfword elements from vectors /* Description : Interleave right half of halfword elements from vectors
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment