Commit 0105ed55 authored by Kaustubh Raste, committed by Michael Niedermayer

avcodec/mips: Improve avc mc copy msa functions

Remove loops and unroll as block sizes are known.
Signed-off-by: Kaustubh Raste <kaustubh.raste@imgtec.com>
Reviewed-by: Manojkumar Bhosale <Manojkumar.Bhosale@imgtec.com>
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
parent e5a650e1
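The change is mechanical: the mc00 (pixel copy and pixel average) cases always operate on fixed 16x16, 8x8 or 4x4 blocks, so the calls into the generic width/height helpers with their run-time loops are replaced by straight-line vector loads and stores. A rough plain-C sketch of the idea for the 8x8 copy case, using illustrative helper names that are not part of FFmpeg (the committed code instead uses the MSA macros LD4/SD4 and LD_UB8/ST_UB8 shown in the diff below):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Before: a generic helper that loops over a run-time height
 * (the shape of the old copy_width8_msa call path). */
static void copy_width8(uint8_t *dst, const uint8_t *src,
                        ptrdiff_t stride, int height)
{
    int row;

    for (row = 0; row < height; row++) {
        memcpy(dst, src, 8);
        src += stride;
        dst += stride;
    }
}

/* After: qpel8 mc00 is always 8x8, so the eight rows are copied with
 * straight-line code, analogous to the LD4/SD4 pairs in the committed
 * MSA version, which move 64-bit words instead of calling memcpy. */
static void copy_8x8(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    memcpy(dst + 0 * stride, src + 0 * stride, 8);
    memcpy(dst + 1 * stride, src + 1 * stride, 8);
    memcpy(dst + 2 * stride, src + 2 * stride, 8);
    memcpy(dst + 3 * stride, src + 3 * stride, 8);
    memcpy(dst + 4 * stride, src + 4 * stride, 8);
    memcpy(dst + 5 * stride, src + 5 * stride, 8);
    memcpy(dst + 6 * stride, src + 6 * stride, 8);
    memcpy(dst + 7 * stride, src + 7 * stride, 8);
}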
 /*
- * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
+ * Copyright (c) 2015 - 2017 Parag Salasakar (Parag.Salasakar@imgtec.com)
  *
  * This file is part of FFmpeg.
  *
@@ -2966,31 +2966,100 @@ static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
 void ff_put_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
 {
-    copy_width16_msa(src, stride, dst, stride, 16);
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
+
+    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * stride);
+    LD_UB8(src, stride, src8, src9, src10, src11, src12, src13, src14, src15);
+
+    ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, stride);
+    dst += (8 * stride);
+    ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15, dst, stride);
 }
 
 void ff_put_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
 {
-    copy_width8_msa(src, stride, dst, stride, 8);
+    uint64_t src0, src1, src2, src3, src4, src5, src6, src7;
+
+    LD4(src, stride, src0, src1, src2, src3);
+    src += 4 * stride;
+    LD4(src, stride, src4, src5, src6, src7);
+
+    SD4(src0, src1, src2, src3, dst, stride);
+    dst += 4 * stride;
+    SD4(src4, src5, src6, src7, dst, stride);
 }
 
 void ff_avg_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride)
 {
-    avg_width16_msa(src, stride, dst, stride, 16);
+    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * stride);
+    LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+
+    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
+                dst2, dst3);
+    AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
+                dst6, dst7);
+    ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
+    dst += (8 * stride);
+
+    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+
+    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
+                dst2, dst3);
+    AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
+                dst6, dst7);
+    ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
 }
 
 void ff_avg_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
 {
-    avg_width8_msa(src, stride, dst, stride, 8);
+    uint64_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
+    v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
+    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
+
+    LD4(src, stride, tp0, tp1, tp2, tp3);
+    src += 4 * stride;
+    LD4(src, stride, tp4, tp5, tp6, tp7);
+    INSERT_D2_UB(tp0, tp1, src0);
+    INSERT_D2_UB(tp2, tp3, src1);
+    INSERT_D2_UB(tp4, tp5, src2);
+    INSERT_D2_UB(tp6, tp7, src3);
+
+    LD4(dst, stride, tp0, tp1, tp2, tp3);
+    LD4(dst + 4 * stride, stride, tp4, tp5, tp6, tp7);
+    INSERT_D2_UB(tp0, tp1, dst0);
+    INSERT_D2_UB(tp2, tp3, dst1);
+    INSERT_D2_UB(tp4, tp5, dst2);
+    INSERT_D2_UB(tp6, tp7, dst3);
+
+    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
+                dst2, dst3);
+    ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
 }
 
 void ff_avg_h264_qpel4_mc00_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
 {
-    avg_width4_msa(src, stride, dst, stride, 4);
+    uint32_t tp0, tp1, tp2, tp3;
+    v16u8 src0 = { 0 }, dst0 = { 0 };
+
+    LW4(src, stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
+    LW4(dst, stride, tp0, tp1, tp2, tp3);
+    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
+
+    dst0 = __msa_aver_u_b(src0, dst0);
+
+    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
 }
 
 void ff_put_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src,
......
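In the ff_avg_* functions above, AVER_UB4_UB and __msa_aver_u_b blend the freshly loaded source block with the pixels already in dst. Assuming the usual rounded-average semantics of the MSA aver_u.b operation, a scalar equivalent for the 8x8 case would look roughly like this (an illustrative sketch, not the committed code):

#include <stddef.h>
#include <stdint.h>

/* Per-byte rounded average of an 8x8 source block into dst; the scalar
 * counterpart of what the vectorized avg mc00 path computes. */
static void avg_8x8(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            dst[col] = (uint8_t)((src[col] + dst[col] + 1) >> 1);
        src += stride;
        dst += stride;
    }
}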