Commit 6d191648 authored by Shiyou Yin, committed by Michael Niedermayer

avcodec/mips: [loongson] optimize put_hevc_qpel_hv_8 with mmi.

Optimize put_hevc_qpel_hv_8 with mmi in the case width=4/8/12/16/24/32/48/64.
This optimization improved HEVC decoding performance by 11% (1.81x to 2.01x, tested on Loongson 3A3000).
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
parent 8133921a
......@@ -25,6 +25,15 @@ static av_cold void hevc_dsp_init_mmi(HEVCDSPContext *c,
const int bit_depth)
{
if (8 == bit_depth) {
c->put_hevc_qpel[1][1][1] = ff_hevc_put_hevc_qpel_hv4_8_mmi;
c->put_hevc_qpel[3][1][1] = ff_hevc_put_hevc_qpel_hv8_8_mmi;
c->put_hevc_qpel[4][1][1] = ff_hevc_put_hevc_qpel_hv12_8_mmi;
c->put_hevc_qpel[5][1][1] = ff_hevc_put_hevc_qpel_hv16_8_mmi;
c->put_hevc_qpel[6][1][1] = ff_hevc_put_hevc_qpel_hv24_8_mmi;
c->put_hevc_qpel[7][1][1] = ff_hevc_put_hevc_qpel_hv32_8_mmi;
c->put_hevc_qpel[8][1][1] = ff_hevc_put_hevc_qpel_hv48_8_mmi;
c->put_hevc_qpel[9][1][1] = ff_hevc_put_hevc_qpel_hv64_8_mmi;
c->put_hevc_qpel_bi[3][0][0] = ff_hevc_put_hevc_pel_bi_pixels8_8_mmi;
c->put_hevc_qpel_bi[5][0][0] = ff_hevc_put_hevc_pel_bi_pixels16_8_mmi;
c->put_hevc_qpel_bi[6][0][0] = ff_hevc_put_hevc_pel_bi_pixels24_8_mmi;
......
......@@ -480,16 +480,33 @@ void ff_hevc_addblk_32x32_msa(uint8_t *dst, int16_t *pi16Coeffs,
void ff_hevc_idct_luma_4x4_msa(int16_t *pi16Coeffs);
/* Loongson optimization */
/*
 * L_BI_MC(PEL, DIR, WIDTH, TYPE): expands to the prototype of
 *   void ff_hevc_put_hevc_<PEL>_bi_<DIR><WIDTH>_8_<TYPE>(...)
 * The expansion has no trailing semicolon; each use site supplies it.
 * Unlike L_MC, the declared function writes to a uint8_t destination with
 * its own stride and takes an extra int16_t *src_16bit input.
 * NOTE(review): the "_8_" in the name presumably denotes 8-bit depth, and
 * src_16bit presumably carries the other prediction's intermediate samples
 * -- confirm against the function definitions.
 */
#define L_BI_MC(PEL, DIR, WIDTH, TYPE) \
void ff_hevc_put_hevc_##PEL##_bi_##DIR##WIDTH##_8_##TYPE(uint8_t *dst, \
ptrdiff_t dst_stride, \
uint8_t *src, \
ptrdiff_t src_stride, \
int16_t *src_16bit, \
int height, \
intptr_t mx, \
intptr_t my, \
int width)
/*
 * L_MC(PEL, DIR, WIDTH, TYPE): expands to the prototype of
 *   void ff_hevc_put_hevc_<PEL>_<DIR><WIDTH>_8_<TYPE>(...)
 * The expansion has no trailing semicolon; each use site supplies it.
 * The declared function takes an int16_t destination without a stride
 * argument, a uint8_t source with src_stride, the block height, the mx/my
 * values and the block width.
 * NOTE(review): mx/my are presumably the fractional motion-vector phases
 * that select the interpolation filter -- confirm against the definitions.
 */
#define L_MC(PEL, DIR, WIDTH, TYPE) \
void ff_hevc_put_hevc_##PEL##_##DIR##WIDTH##_8_##TYPE(int16_t *dst, \
uint8_t *src, \
ptrdiff_t src_stride, \
int height, \
intptr_t mx, \
intptr_t my, \
int width)
/*
 * Prototypes ff_hevc_put_hevc_qpel_hv<W>_8_mmi for
 * W = 4, 8, 12, 16, 24, 32, 48 and 64 -- one function per block width.
 */
L_MC(qpel, hv, 4, mmi);
L_MC(qpel, hv, 8, mmi);
L_MC(qpel, hv, 12, mmi);
L_MC(qpel, hv, 16, mmi);
L_MC(qpel, hv, 24, mmi);
L_MC(qpel, hv, 32, mmi);
L_MC(qpel, hv, 48, mmi);
L_MC(qpel, hv, 64, mmi);
/*
 * L_BI_MC(PEL, DIR, WIDTH, TYPE): expands to the prototype of
 *   void ff_hevc_put_hevc_<PEL>_bi_<DIR><WIDTH>_8_<TYPE>(...)
 * The expansion has no trailing semicolon; each use site supplies it.
 * The declared function writes to a uint8_t destination with its own
 * stride and takes an additional int16_t *src_16bit input alongside the
 * uint8_t source.
 * NOTE(review): src_16bit presumably carries the other prediction's
 * intermediate samples -- confirm against the function definitions.
 */
#define L_BI_MC(PEL, DIR, WIDTH, TYPE) \
void ff_hevc_put_hevc_##PEL##_bi_##DIR##WIDTH##_8_##TYPE(uint8_t *dst, \
ptrdiff_t dst_stride, \
uint8_t *src, \
ptrdiff_t src_stride, \
int16_t *src_16bit, \
int height, \
intptr_t mx, \
intptr_t my, \
int width)
L_BI_MC(pel, pixels, 8, mmi);
L_BI_MC(pel, pixels, 16, mmi);
......
This diff is collapsed.
......@@ -250,6 +250,15 @@
: "memory" \
);
/**
 * brief: Transpose 2X2 word packed data.
 * fr_i0, fr_i1: src
 * fr_o0, fr_o1: dst
 *
 * Expands to two inline-asm instruction strings: punpcklwd writing fr_o0,
 * then punpckhwd writing fr_o1, both reading fr_i0 and fr_i1.
 * The second instruction reads both sources after fr_o0 has already been
 * written, so fr_o0 must not name the same register as fr_i0 or fr_i1.
 * NOTE(review): presumably fr_o0 = {low word of fr_i0, low word of fr_i1}
 * and fr_o1 = {high word of fr_i0, high word of fr_i1} -- confirm the
 * operand order against the Loongson MMI instruction reference.
 */
#define TRANSPOSE_2W(fr_i0, fr_i1, fr_o0, fr_o1) \
"punpcklwd "#fr_o0", "#fr_i0", "#fr_i1" \n\t" \
"punpckhwd "#fr_o1", "#fr_i0", "#fr_i1" \n\t"
/**
* brief: Transpose 4X4 half word packaged data.
* fr_i0, fr_i1, fr_i2, fr_i3: src & dst
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment