Commit 92fc0bfa authored by gxw, committed by Michael Niedermayer

avutil/mips: refactor msa SLDI_Bn_0 and SLDI_Bn macros.

Changing details as following:
1. The previous order of parameters is irregular and difficult to
   understand. Adjust the order of the parameters according to the
   rule: (RTYPE, input registers, input mask/input index/..., output registers).
   Most of the existing msa macros follow this rule.
2. Remove the redundant macro SLDI_Bn_0 and use SLDI_Bn instead.
Reviewed-by: Shiyou Yin <yinshiyou-hf@loongson.cn>
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
parent 77937a42
...@@ -618,7 +618,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride, ...@@ -618,7 +618,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride,
\ \
out0 = (v16u8) __msa_ilvr_b((v16i8) in1, (v16i8) in0); \ out0 = (v16u8) __msa_ilvr_b((v16i8) in1, (v16i8) in0); \
out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 2); \ out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 2); \
SLDI_B2_0_UB(out1, out2, out2, out3, 2); \ SLDI_B2_UB(zero_m, out1, zero_m, out2, 2, out2, out3); \
} }
#define AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res) \ #define AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res) \
...@@ -1023,7 +1023,8 @@ static void avc_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src, int32_t stride, ...@@ -1023,7 +1023,8 @@ static void avc_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src, int32_t stride,
ILVR_W2_SB(tmp2, tmp0, tmp3, tmp1, src6, src3); ILVR_W2_SB(tmp2, tmp0, tmp3, tmp1, src6, src3);
ILVL_W2_SB(tmp2, tmp0, tmp3, tmp1, src1, src5); ILVL_W2_SB(tmp2, tmp0, tmp3, tmp1, src1, src5);
SLDI_B4_0_SB(src6, src1, src3, src5, src0, src2, src4, src7, 8); SLDI_B4_SB(zeros, src6, zeros, src1, zeros, src3, zeros, src5,
8, src0, src2, src4, src7);
p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3); p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3);
p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2); p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2);
...@@ -1114,10 +1115,10 @@ static void avc_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src, int32_t stride, ...@@ -1114,10 +1115,10 @@ static void avc_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src, int32_t stride,
ILVRL_H2_SH(zeros, dst2_x, tmp2, tmp3); ILVRL_H2_SH(zeros, dst2_x, tmp2, tmp3);
ILVR_W2_UB(tmp2, tmp0, tmp3, tmp1, dst0, dst4); ILVR_W2_UB(tmp2, tmp0, tmp3, tmp1, dst0, dst4);
SLDI_B2_0_UB(dst0, dst4, dst1, dst5, 8); SLDI_B2_UB(zeros, dst0, zeros, dst4, 8, dst1, dst5);
dst2_x = (v16u8) __msa_ilvl_w((v4i32) tmp2, (v4i32) tmp0); dst2_x = (v16u8) __msa_ilvl_w((v4i32) tmp2, (v4i32) tmp0);
dst2_y = (v16u8) __msa_ilvl_w((v4i32) tmp3, (v4i32) tmp1); dst2_y = (v16u8) __msa_ilvl_w((v4i32) tmp3, (v4i32) tmp1);
SLDI_B2_0_UB(dst2_x, dst2_y, dst3_x, dst3_y, 8); SLDI_B2_UB(zeros, dst2_x, zeros, dst2_y, 8, dst3_x, dst3_y);
out0 = __msa_copy_u_w((v4i32) dst0, 0); out0 = __msa_copy_u_w((v4i32) dst0, 0);
out1 = __msa_copy_u_h((v8i16) dst0, 2); out1 = __msa_copy_u_h((v8i16) dst0, 2);
......
...@@ -790,8 +790,8 @@ void ff_put_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src, ...@@ -790,8 +790,8 @@ void ff_put_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src,
minus5b, res4, res5, res6, res7); minus5b, res4, res5, res6, res7);
DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b, DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
plus20b, res4, res5, res6, res7); plus20b, res4, res5, res6, res7);
SLDI_B2_SB(src1, src3, src0, src2, src0, src2, 2); SLDI_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6, 2,
SLDI_B2_SB(src5, src7, src4, src6, src4, src6, 2); src0, src2, src4, src6);
SRARI_H4_SH(res0, res1, res2, res3, 5); SRARI_H4_SH(res0, res1, res2, res3, 5);
SRARI_H4_SH(res4, res5, res6, res7, 5); SRARI_H4_SH(res4, res5, res6, res7, 5);
SAT_SH4_SH(res0, res1, res2, res3, 7); SAT_SH4_SH(res0, res1, res2, res3, 7);
...@@ -858,8 +858,8 @@ void ff_put_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src, ...@@ -858,8 +858,8 @@ void ff_put_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src,
minus5b, res4, res5, res6, res7); minus5b, res4, res5, res6, res7);
DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b, DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
plus20b, res4, res5, res6, res7); plus20b, res4, res5, res6, res7);
SLDI_B2_SB(src1, src3, src0, src2, src0, src2, 3); SLDI_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6, 3,
SLDI_B2_SB(src5, src7, src4, src6, src4, src6, 3); src0, src2, src4, src6);
SRARI_H4_SH(res0, res1, res2, res3, 5); SRARI_H4_SH(res0, res1, res2, res3, 5);
SRARI_H4_SH(res4, res5, res6, res7, 5); SRARI_H4_SH(res4, res5, res6, res7, 5);
SAT_SH4_SH(res0, res1, res2, res3, 7); SAT_SH4_SH(res0, res1, res2, res3, 7);
...@@ -911,10 +911,10 @@ void ff_put_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src, ...@@ -911,10 +911,10 @@ void ff_put_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src,
VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11); VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b, DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
res4, res5, res6, res7); res4, res5, res6, res7);
SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 2); SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 2,
SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 2); src0, src1, src2, src3);
SLDI_B2_SB(src4, src5, src4, src5, src4, src5, 2); SLDI_B4_SB(src4, src4, src5, src5, src6, src6, src7, src7, 2,
SLDI_B2_SB(src6, src7, src6, src7, src6, src7, 2); src4, src5, src6, src7);
PCKEV_D2_SB(src1, src0, src3, src2, src0, src1); PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
PCKEV_D2_SB(src5, src4, src7, src6, src4, src5); PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);
SRARI_H4_SH(res0, res1, res2, res3, 5); SRARI_H4_SH(res0, res1, res2, res3, 5);
...@@ -966,10 +966,10 @@ void ff_put_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src, ...@@ -966,10 +966,10 @@ void ff_put_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src,
VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11); VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b, DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
res4, res5, res6, res7); res4, res5, res6, res7);
SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 3); SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 3,
SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 3); src0, src1, src2, src3);
SLDI_B2_SB(src4, src5, src4, src5, src4, src5, 3); SLDI_B4_SB(src4, src4, src5, src5, src6, src6, src7, src7, 3,
SLDI_B2_SB(src6, src7, src6, src7, src6, src7, 3); src4, src5, src6, src7);
PCKEV_D2_SB(src1, src0, src3, src2, src0, src1); PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
PCKEV_D2_SB(src5, src4, src7, src6, src4, src5); PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);
SRARI_H4_SH(res0, res1, res2, res3, 5); SRARI_H4_SH(res0, res1, res2, res3, 5);
...@@ -1007,8 +1007,8 @@ void ff_put_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src, ...@@ -1007,8 +1007,8 @@ void ff_put_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src,
SRARI_H2_SH(res0, res1, 5); SRARI_H2_SH(res0, res1, 5);
SAT_SH2_SH(res0, res1, 7); SAT_SH2_SH(res0, res1, 7);
res = __msa_pckev_b((v16i8) res1, (v16i8) res0); res = __msa_pckev_b((v16i8) res1, (v16i8) res0);
SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 2); SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 2,
SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 2); src0, src1, src2, src3);
src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1); src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3); src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1); src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
...@@ -1038,8 +1038,8 @@ void ff_put_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src, ...@@ -1038,8 +1038,8 @@ void ff_put_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src,
SRARI_H2_SH(res0, res1, 5); SRARI_H2_SH(res0, res1, 5);
SAT_SH2_SH(res0, res1, 7); SAT_SH2_SH(res0, res1, 7);
res = __msa_pckev_b((v16i8) res1, (v16i8) res0); res = __msa_pckev_b((v16i8) res1, (v16i8) res0);
SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 3); SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 3,
SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 3); src0, src1, src2, src3);
src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1); src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3); src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1); src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
...@@ -3194,8 +3194,8 @@ void ff_avg_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src, ...@@ -3194,8 +3194,8 @@ void ff_avg_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src,
minus5b, res4, res5, res6, res7); minus5b, res4, res5, res6, res7);
DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b, DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
plus20b, res4, res5, res6, res7); plus20b, res4, res5, res6, res7);
SLDI_B2_SB(src1, src3, src0, src2, src0, src2, 2); SLDI_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6, 2,
SLDI_B2_SB(src5, src7, src4, src6, src4, src6, 2); src0, src2, src4, src6);
SRARI_H4_SH(res0, res1, res2, res3, 5); SRARI_H4_SH(res0, res1, res2, res3, 5);
SRARI_H4_SH(res4, res5, res6, res7, 5); SRARI_H4_SH(res4, res5, res6, res7, 5);
SAT_SH4_SH(res0, res1, res2, res3, 7); SAT_SH4_SH(res0, res1, res2, res3, 7);
...@@ -3266,8 +3266,8 @@ void ff_avg_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src, ...@@ -3266,8 +3266,8 @@ void ff_avg_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src,
minus5b, res4, res5, res6, res7); minus5b, res4, res5, res6, res7);
DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b, DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
plus20b, res4, res5, res6, res7); plus20b, res4, res5, res6, res7);
SLDI_B2_SB(src1, src3, src0, src2, src0, src2, 3); SLDI_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6, 3,
SLDI_B2_SB(src5, src7, src4, src6, src4, src6, 3); src0, src2, src4, src6);
SRARI_H4_SH(res0, res1, res2, res3, 5); SRARI_H4_SH(res0, res1, res2, res3, 5);
SRARI_H4_SH(res4, res5, res6, res7, 5); SRARI_H4_SH(res4, res5, res6, res7, 5);
SAT_SH4_SH(res0, res1, res2, res3, 7); SAT_SH4_SH(res0, res1, res2, res3, 7);
...@@ -3323,10 +3323,10 @@ void ff_avg_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src, ...@@ -3323,10 +3323,10 @@ void ff_avg_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src,
VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11); VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b, DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
res4, res5, res6, res7); res4, res5, res6, res7);
SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 2); SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 2,
SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 2); src0, src1, src2, src3);
SLDI_B2_SB(src4, src5, src4, src5, src4, src5, 2); SLDI_B4_SB(src4, src4, src5, src5, src6, src6, src7, src7, 2,
SLDI_B2_SB(src6, src7, src6, src7, src6, src7, 2); src4, src5, src6, src7);
PCKEV_D2_SB(src1, src0, src3, src2, src0, src1); PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
PCKEV_D2_SB(src5, src4, src7, src6, src4, src5); PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);
SRARI_H4_SH(res0, res1, res2, res3, 5); SRARI_H4_SH(res0, res1, res2, res3, 5);
...@@ -3388,10 +3388,10 @@ void ff_avg_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src, ...@@ -3388,10 +3388,10 @@ void ff_avg_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src,
VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11); VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b, DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
res4, res5, res6, res7); res4, res5, res6, res7);
SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 3); SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 3,
SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 3); src0, src1, src2, src3);
SLDI_B2_SB(src4, src5, src4, src5, src4, src5, 3); SLDI_B4_SB(src4, src4, src5, src5, src6, src6, src7, src7, 3,
SLDI_B2_SB(src6, src7, src6, src7, src6, src7, 3); src4, src5, src6, src7);
PCKEV_D2_SB(src1, src0, src3, src2, src0, src1); PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
PCKEV_D2_SB(src5, src4, src7, src6, src4, src5); PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);
SRARI_H4_SH(res0, res1, res2, res3, 5); SRARI_H4_SH(res0, res1, res2, res3, 5);
...@@ -3439,8 +3439,8 @@ void ff_avg_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src, ...@@ -3439,8 +3439,8 @@ void ff_avg_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src,
SRARI_H2_SH(out0, out1, 5); SRARI_H2_SH(out0, out1, 5);
SAT_SH2_SH(out0, out1, 7); SAT_SH2_SH(out0, out1, 7);
res = __msa_pckev_b((v16i8) out1, (v16i8) out0); res = __msa_pckev_b((v16i8) out1, (v16i8) out0);
SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 2); SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 2,
SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 2); src0, src1, src2, src3);
src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1); src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3); src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1); src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
...@@ -3475,8 +3475,8 @@ void ff_avg_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src, ...@@ -3475,8 +3475,8 @@ void ff_avg_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src,
SRARI_H2_SH(out0, out1, 5); SRARI_H2_SH(out0, out1, 5);
SAT_SH2_SH(out0, out1, 7); SAT_SH2_SH(out0, out1, 7);
res = __msa_pckev_b((v16i8) out1, (v16i8) out0); res = __msa_pckev_b((v16i8) out1, (v16i8) out0);
SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 3); SLDI_B4_SB(src0, src0, src1, src1, src2, src2, src3, src3, 3,
SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 3); src0, src1, src2, src3);
src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1); src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3); src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1); src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
......
...@@ -1357,6 +1357,7 @@ static void hevc_sao_edge_filter_0degree_8width_msa(uint8_t *dst, ...@@ -1357,6 +1357,7 @@ static void hevc_sao_edge_filter_0degree_8width_msa(uint8_t *dst,
v16u8 cmp_minus10, diff_minus10, diff_minus11; v16u8 cmp_minus10, diff_minus10, diff_minus11;
v16u8 src0, src1, dst0, src_minus10, src_minus11, src_plus10, src_plus11; v16u8 src0, src1, dst0, src_minus10, src_minus11, src_plus10, src_plus11;
v16i8 offset, sao_offset = LD_SB(sao_offset_val); v16i8 offset, sao_offset = LD_SB(sao_offset_val);
v16i8 zeros = { 0 };
sao_offset = __msa_pckev_b(sao_offset, sao_offset); sao_offset = __msa_pckev_b(sao_offset, sao_offset);
src -= 1; src -= 1;
...@@ -1367,8 +1368,8 @@ static void hevc_sao_edge_filter_0degree_8width_msa(uint8_t *dst, ...@@ -1367,8 +1368,8 @@ static void hevc_sao_edge_filter_0degree_8width_msa(uint8_t *dst,
for (height -= 2; height; height -= 2) { for (height -= 2; height; height -= 2) {
src += (src_stride << 1); src += (src_stride << 1);
SLDI_B2_0_UB(src_minus10, src_minus11, src0, src1, 1); SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 1, src0, src1);
SLDI_B2_0_UB(src_minus10, src_minus11, src_plus10, src_plus11, 2); SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_plus10, src_plus11);
PCKEV_D2_UB(src_minus11, src_minus10, src_plus11, src_plus10, PCKEV_D2_UB(src_minus11, src_minus10, src_plus11, src_plus10,
src_minus10, src_plus10); src_minus10, src_plus10);
...@@ -1404,8 +1405,8 @@ static void hevc_sao_edge_filter_0degree_8width_msa(uint8_t *dst, ...@@ -1404,8 +1405,8 @@ static void hevc_sao_edge_filter_0degree_8width_msa(uint8_t *dst,
dst += dst_stride; dst += dst_stride;
} }
SLDI_B2_0_UB(src_minus10, src_minus11, src0, src1, 1); SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 1, src0, src1);
SLDI_B2_0_UB(src_minus10, src_minus11, src_plus10, src_plus11, 2); SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_plus10, src_plus11);
PCKEV_D2_UB(src_minus11, src_minus10, src_plus11, src_plus10, src_minus10, PCKEV_D2_UB(src_minus11, src_minus10, src_plus11, src_plus10, src_minus10,
src_plus10); src_plus10);
...@@ -1473,14 +1474,12 @@ static void hevc_sao_edge_filter_0degree_16multiple_msa(uint8_t *dst, ...@@ -1473,14 +1474,12 @@ static void hevc_sao_edge_filter_0degree_16multiple_msa(uint8_t *dst,
dst_ptr = dst + v_cnt; dst_ptr = dst + v_cnt;
LD_UB4(src_minus1, src_stride, src10, src11, src12, src13); LD_UB4(src_minus1, src_stride, src10, src11, src12, src13);
SLDI_B2_SB(src10, src11, src_minus10, src_minus11, src_zero0, SLDI_B4_SB(src10, src_minus10, src11, src_minus11,
src_zero1, 1); src12, src_minus12, src13, src_minus13, 1,
SLDI_B2_SB(src12, src13, src_minus12, src_minus13, src_zero2, src_zero0, src_zero1, src_zero2, src_zero3);
src_zero3, 1); SLDI_B4_SB(src10, src_minus10, src11, src_minus11,
SLDI_B2_SB(src10, src11, src_minus10, src_minus11, src_plus10, src12, src_minus12, src13, src_minus13, 2,
src_plus11, 2); src_plus10, src_plus11, src_plus12, src_plus13);
SLDI_B2_SB(src12, src13, src_minus12, src_minus13, src_plus12,
src_plus13, 2);
cmp_minus10 = ((v16u8) src_zero0 == src_minus10); cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
cmp_plus10 = ((v16u8) src_zero0 == (v16u8) src_plus10); cmp_plus10 = ((v16u8) src_zero0 == (v16u8) src_plus10);
...@@ -1880,6 +1879,7 @@ static void hevc_sao_edge_filter_45degree_4width_msa(uint8_t *dst, ...@@ -1880,6 +1879,7 @@ static void hevc_sao_edge_filter_45degree_4width_msa(uint8_t *dst,
v16u8 src_minus11, src10, src11; v16u8 src_minus11, src10, src11;
v16i8 src_plus0, src_zero0, src_plus1, src_zero1, dst0; v16i8 src_plus0, src_zero0, src_plus1, src_zero1, dst0;
v8i16 offset_mask0, offset_mask1; v8i16 offset_mask0, offset_mask1;
v16i8 zeros = { 0 };
sao_offset = __msa_pckev_b(sao_offset, sao_offset); sao_offset = __msa_pckev_b(sao_offset, sao_offset);
...@@ -1892,8 +1892,8 @@ static void hevc_sao_edge_filter_45degree_4width_msa(uint8_t *dst, ...@@ -1892,8 +1892,8 @@ static void hevc_sao_edge_filter_45degree_4width_msa(uint8_t *dst,
for (height -= 2; height; height -= 2) { for (height -= 2; height; height -= 2) {
src_orig += (src_stride << 1); src_orig += (src_stride << 1);
SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1); SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
SLDI_B2_0_SB(src10, src11, src_plus0, src_plus1, 2); SLDI_B2_SB(zeros, src10, zeros, src11, 2, src_plus0, src_plus1);
ILVR_B2_UB(src_plus0, src_minus10, src_plus1, src_minus11, src_minus10, ILVR_B2_UB(src_plus0, src_minus10, src_plus1, src_minus11, src_minus10,
src_minus11); src_minus11);
...@@ -1938,8 +1938,8 @@ static void hevc_sao_edge_filter_45degree_4width_msa(uint8_t *dst, ...@@ -1938,8 +1938,8 @@ static void hevc_sao_edge_filter_45degree_4width_msa(uint8_t *dst,
dst += dst_stride; dst += dst_stride;
} }
SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1); SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
SLDI_B2_0_SB(src10, src11, src_plus0, src_plus1, 2); SLDI_B2_SB(zeros, src10, zeros, src11, 2, src_plus0, src_plus1);
ILVR_B2_UB(src_plus0, src_minus10, src_plus1, src_minus11, src_minus10, ILVR_B2_UB(src_plus0, src_minus10, src_plus1, src_minus11, src_minus10,
src_minus11); src_minus11);
...@@ -1992,6 +1992,7 @@ static void hevc_sao_edge_filter_45degree_8width_msa(uint8_t *dst, ...@@ -1992,6 +1992,7 @@ static void hevc_sao_edge_filter_45degree_8width_msa(uint8_t *dst,
v16u8 src_minus10, src10, src_minus11, src11; v16u8 src_minus10, src10, src_minus11, src11;
v16i8 src_zero0, src_plus10, src_zero1, src_plus11, dst0; v16i8 src_zero0, src_plus10, src_zero1, src_plus11, dst0;
v8i16 offset_mask0, offset_mask1; v8i16 offset_mask0, offset_mask1;
v16i8 zeros = { 0 };
sao_offset = __msa_pckev_b(sao_offset, sao_offset); sao_offset = __msa_pckev_b(sao_offset, sao_offset);
src_orig = src - 1; src_orig = src - 1;
...@@ -2003,8 +2004,8 @@ static void hevc_sao_edge_filter_45degree_8width_msa(uint8_t *dst, ...@@ -2003,8 +2004,8 @@ static void hevc_sao_edge_filter_45degree_8width_msa(uint8_t *dst,
for (height -= 2; height; height -= 2) { for (height -= 2; height; height -= 2) {
src_orig += (src_stride << 1); src_orig += (src_stride << 1);
SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1); SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
SLDI_B2_0_SB(src10, src11, src_plus10, src_plus11, 2); SLDI_B2_SB(zeros, src10, zeros, src11, 2, src_plus10, src_plus11);
ILVR_B2_UB(src_plus10, src_minus10, src_plus11, src_minus11, ILVR_B2_UB(src_plus10, src_minus10, src_plus11, src_minus11,
src_minus10, src_minus11); src_minus10, src_minus11);
...@@ -2048,8 +2049,8 @@ static void hevc_sao_edge_filter_45degree_8width_msa(uint8_t *dst, ...@@ -2048,8 +2049,8 @@ static void hevc_sao_edge_filter_45degree_8width_msa(uint8_t *dst,
dst += dst_stride; dst += dst_stride;
} }
SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1); SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
SLDI_B2_0_SB(src10, src11, src_plus10, src_plus11, 2); SLDI_B2_SB(zeros, src10, zeros, src11, 2, src_plus10, src_plus11);
ILVR_B2_UB(src_plus10, src_minus10, src_plus11, src_minus11, src_minus10, ILVR_B2_UB(src_plus10, src_minus10, src_plus11, src_minus11, src_minus10,
src_minus11); src_minus11);
ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0, ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
...@@ -2130,12 +2131,11 @@ static void hevc_sao_edge_filter_45degree_16multiple_msa(uint8_t *dst, ...@@ -2130,12 +2131,11 @@ static void hevc_sao_edge_filter_45degree_16multiple_msa(uint8_t *dst,
src_plus13 = LD_UB(src + 1 + v_cnt + (src_stride << 2)); src_plus13 = LD_UB(src + 1 + v_cnt + (src_stride << 2));
src_orig += 16; src_orig += 16;
SLDI_B2_SB(src10, src11, src_minus11, src_minus12, src_zero0, SLDI_B4_SB(src10, src_minus11, src11, src_minus12,
src_zero1, 1); src12, src_minus13, src13, src_minus14, 1,
SLDI_B2_SB(src12, src13, src_minus13, src_minus14, src_zero2, src_zero0, src_zero1, src_zero2, src_zero3);
src_zero3, 1); SLDI_B2_SB(src11, src_minus12, src12, src_minus13, 2, src_plus10,
SLDI_B2_SB(src11, src12, src_minus12, src_minus13, src_plus10, src_plus11);
src_plus11, 2);
src_plus12 = __msa_sldi_b((v16i8) src13, (v16i8) src_minus14, 2); src_plus12 = __msa_sldi_b((v16i8) src13, (v16i8) src_minus14, 2);
...@@ -2228,6 +2228,7 @@ static void hevc_sao_edge_filter_135degree_4width_msa(uint8_t *dst, ...@@ -2228,6 +2228,7 @@ static void hevc_sao_edge_filter_135degree_4width_msa(uint8_t *dst,
v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11; v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
v16u8 src_minus10, src10, src_minus11, src11; v16u8 src_minus10, src10, src_minus11, src11;
v8i16 offset_mask0, offset_mask1; v8i16 offset_mask0, offset_mask1;
v16i8 zeros = { 0 };
sao_offset = __msa_pckev_b(sao_offset, sao_offset); sao_offset = __msa_pckev_b(sao_offset, sao_offset);
src_orig = src - 1; src_orig = src - 1;
...@@ -2239,8 +2240,8 @@ static void hevc_sao_edge_filter_135degree_4width_msa(uint8_t *dst, ...@@ -2239,8 +2240,8 @@ static void hevc_sao_edge_filter_135degree_4width_msa(uint8_t *dst,
for (height -= 2; height; height -= 2) { for (height -= 2; height; height -= 2) {
src_orig += (src_stride << 1); src_orig += (src_stride << 1);
SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1); SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
SLDI_B2_0_UB(src_minus10, src_minus11, src_minus10, src_minus11, 2); SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_minus10, src_minus11);
ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10, ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
src_minus11); src_minus11);
...@@ -2286,8 +2287,8 @@ static void hevc_sao_edge_filter_135degree_4width_msa(uint8_t *dst, ...@@ -2286,8 +2287,8 @@ static void hevc_sao_edge_filter_135degree_4width_msa(uint8_t *dst,
dst += dst_stride; dst += dst_stride;
} }
SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1); SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
SLDI_B2_0_UB(src_minus10, src_minus11, src_minus10, src_minus11, 2); SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_minus10, src_minus11);
ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10, ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
src_minus11); src_minus11);
...@@ -2342,6 +2343,7 @@ static void hevc_sao_edge_filter_135degree_8width_msa(uint8_t *dst, ...@@ -2342,6 +2343,7 @@ static void hevc_sao_edge_filter_135degree_8width_msa(uint8_t *dst,
v16u8 src_minus10, src10, src_minus11, src11; v16u8 src_minus10, src10, src_minus11, src11;
v16i8 src_zero0, src_zero1, dst0; v16i8 src_zero0, src_zero1, dst0;
v8i16 offset_mask0, offset_mask1; v8i16 offset_mask0, offset_mask1;
v16i8 zeros = { 0 };
sao_offset = __msa_pckev_b(sao_offset, sao_offset); sao_offset = __msa_pckev_b(sao_offset, sao_offset);
src_orig = src - 1; src_orig = src - 1;
...@@ -2353,8 +2355,8 @@ static void hevc_sao_edge_filter_135degree_8width_msa(uint8_t *dst, ...@@ -2353,8 +2355,8 @@ static void hevc_sao_edge_filter_135degree_8width_msa(uint8_t *dst,
for (height -= 2; height; height -= 2) { for (height -= 2; height; height -= 2) {
src_orig += (src_stride << 1); src_orig += (src_stride << 1);
SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1); SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
SLDI_B2_0_UB(src_minus10, src_minus11, src_minus10, src_minus11, 2); SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_minus10, src_minus11);
ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10, ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
src_minus11); src_minus11);
ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0, ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
...@@ -2398,8 +2400,8 @@ static void hevc_sao_edge_filter_135degree_8width_msa(uint8_t *dst, ...@@ -2398,8 +2400,8 @@ static void hevc_sao_edge_filter_135degree_8width_msa(uint8_t *dst,
dst += dst_stride; dst += dst_stride;
} }
SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1); SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
SLDI_B2_0_UB(src_minus10, src_minus11, src_minus10, src_minus11, 2); SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_minus10, src_minus11);
ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10, ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
src_minus11); src_minus11);
ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0, ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
......
...@@ -998,7 +998,8 @@ static void hevc_intra_pred_angular_upper_4width_msa(const uint8_t *src_top, ...@@ -998,7 +998,8 @@ static void hevc_intra_pred_angular_upper_4width_msa(const uint8_t *src_top,
ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3); ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3);
ILVR_B4_SH(zero, top0, zero, top1, zero, top2, zero, top3, ILVR_B4_SH(zero, top0, zero, top1, zero, top2, zero, top3,
diff0, diff2, diff4, diff6); diff0, diff2, diff4, diff6);
SLDI_B4_0_SH(diff0, diff2, diff4, diff6, diff1, diff3, diff5, diff7, 2); SLDI_B4_SH(zero, diff0, zero, diff2, zero, diff4, zero, diff6, 2,
diff1, diff3, diff5, diff7);
ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2); ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2);
ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3); ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3);
MUL2(diff1, fact0, diff3, fact2, diff1, diff3); MUL2(diff1, fact0, diff3, fact2, diff1, diff3);
...@@ -1093,8 +1094,8 @@ static void hevc_intra_pred_angular_upper_8width_msa(const uint8_t *src_top, ...@@ -1093,8 +1094,8 @@ static void hevc_intra_pred_angular_upper_8width_msa(const uint8_t *src_top,
UNPCK_UB_SH(top2, diff4, diff5); UNPCK_UB_SH(top2, diff4, diff5);
UNPCK_UB_SH(top3, diff6, diff7); UNPCK_UB_SH(top3, diff6, diff7);
SLDI_B2_SH(diff1, diff3, diff0, diff2, diff1, diff3, 2); SLDI_B4_SH(diff1, diff0, diff3, diff2, diff5, diff4, diff7, diff6, 2,
SLDI_B2_SH(diff5, diff7, diff4, diff6, diff5, diff7, 2); diff1, diff3, diff5, diff7);
MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6, MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6,
diff1, diff3, diff5, diff7); diff1, diff3, diff5, diff7);
...@@ -1186,8 +1187,8 @@ static void hevc_intra_pred_angular_upper_16width_msa(const uint8_t *src_top, ...@@ -1186,8 +1187,8 @@ static void hevc_intra_pred_angular_upper_16width_msa(const uint8_t *src_top,
fact6 = __msa_fill_h(fact_val3); fact6 = __msa_fill_h(fact_val3);
fact7 = __msa_fill_h(32 - fact_val3); fact7 = __msa_fill_h(32 - fact_val3);
SLDI_B2_UB(top1, top3, top0, top2, top1, top3, 1); SLDI_B4_UB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
SLDI_B2_UB(top5, top7, top4, top6, top5, top7, 1); top1, top3, top5, top7);
UNPCK_UB_SH(top0, diff0, diff1); UNPCK_UB_SH(top0, diff0, diff1);
UNPCK_UB_SH(top1, diff2, diff3); UNPCK_UB_SH(top1, diff2, diff3);
UNPCK_UB_SH(top2, diff4, diff5); UNPCK_UB_SH(top2, diff4, diff5);
...@@ -1297,8 +1298,8 @@ static void hevc_intra_pred_angular_upper_32width_msa(const uint8_t *src_top, ...@@ -1297,8 +1298,8 @@ static void hevc_intra_pred_angular_upper_32width_msa(const uint8_t *src_top,
top2 = top1; top2 = top1;
top6 = top5; top6 = top5;
SLDI_B2_UB(top1, top3, top0, top2, top1, top3, 1); SLDI_B4_UB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
SLDI_B2_UB(top5, top7, top4, top6, top5, top7, 1); top1, top3, top5, top7);
UNPCK_UB_SH(top0, diff0, diff1); UNPCK_UB_SH(top0, diff0, diff1);
UNPCK_UB_SH(top1, diff2, diff3); UNPCK_UB_SH(top1, diff2, diff3);
UNPCK_UB_SH(top2, diff4, diff5); UNPCK_UB_SH(top2, diff4, diff5);
...@@ -1407,7 +1408,8 @@ static void hevc_intra_pred_angular_lower_4width_msa(const uint8_t *src_top, ...@@ -1407,7 +1408,8 @@ static void hevc_intra_pred_angular_lower_4width_msa(const uint8_t *src_top,
ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3); ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3);
ILVR_B4_SH(zero, top0, zero, top1, zero, top2, zero, top3, ILVR_B4_SH(zero, top0, zero, top1, zero, top2, zero, top3,
diff0, diff2, diff4, diff6); diff0, diff2, diff4, diff6);
SLDI_B4_0_SH(diff0, diff2, diff4, diff6, diff1, diff3, diff5, diff7, 2); SLDI_B4_SH(zero, diff0, zero, diff2, zero, diff4, zero, diff6, 2,
diff1, diff3, diff5, diff7);
ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2); ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2);
ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3); ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3);
MUL2(diff1, fact0, diff3, fact2, diff1, diff3); MUL2(diff1, fact0, diff3, fact2, diff1, diff3);
...@@ -1511,8 +1513,8 @@ static void hevc_intra_pred_angular_lower_8width_msa(const uint8_t *src_top, ...@@ -1511,8 +1513,8 @@ static void hevc_intra_pred_angular_lower_8width_msa(const uint8_t *src_top,
UNPCK_UB_SH(top1, diff2, diff3); UNPCK_UB_SH(top1, diff2, diff3);
UNPCK_UB_SH(top2, diff4, diff5); UNPCK_UB_SH(top2, diff4, diff5);
UNPCK_UB_SH(top3, diff6, diff7); UNPCK_UB_SH(top3, diff6, diff7);
SLDI_B2_SH(diff1, diff3, diff0, diff2, diff1, diff3, 2); SLDI_B4_SH(diff1, diff0, diff3, diff2, diff5, diff4, diff7, diff6, 2,
SLDI_B2_SH(diff5, diff7, diff4, diff6, diff5, diff7, 2); diff1, diff3, diff5, diff7);
MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6, MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6,
diff1, diff3, diff5, diff7); diff1, diff3, diff5, diff7);
...@@ -1606,8 +1608,8 @@ static void hevc_intra_pred_angular_lower_16width_msa(const uint8_t *src_top, ...@@ -1606,8 +1608,8 @@ static void hevc_intra_pred_angular_lower_16width_msa(const uint8_t *src_top,
fact6 = __msa_fill_h(fact_val3); fact6 = __msa_fill_h(fact_val3);
fact7 = __msa_fill_h(32 - fact_val3); fact7 = __msa_fill_h(32 - fact_val3);
SLDI_B2_SB(top1, top3, top0, top2, top1, top3, 1); SLDI_B4_SB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
SLDI_B2_SB(top5, top7, top4, top6, top5, top7, 1); top1, top3, top5, top7);
UNPCK_UB_SH(top0, diff0, diff1); UNPCK_UB_SH(top0, diff0, diff1);
UNPCK_UB_SH(top1, diff2, diff3); UNPCK_UB_SH(top1, diff2, diff3);
...@@ -1713,8 +1715,8 @@ static void hevc_intra_pred_angular_lower_32width_msa(const uint8_t *src_top, ...@@ -1713,8 +1715,8 @@ static void hevc_intra_pred_angular_lower_32width_msa(const uint8_t *src_top,
top2 = top1; top2 = top1;
top6 = top5; top6 = top5;
SLDI_B2_SB(top1, top3, top0, top2, top1, top3, 1); SLDI_B4_SB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
SLDI_B2_SB(top5, top7, top4, top6, top5, top7, 1); top1, top3, top5, top7);
UNPCK_UB_SH(top0, diff0, diff1); UNPCK_UB_SH(top0, diff0, diff1);
UNPCK_UB_SH(top1, diff2, diff3); UNPCK_UB_SH(top1, diff2, diff3);
......
...@@ -59,12 +59,13 @@ static void common_hz_bil_4w_msa(const uint8_t *src, int32_t src_stride, ...@@ -59,12 +59,13 @@ static void common_hz_bil_4w_msa(const uint8_t *src, int32_t src_stride,
uint8_t loop_cnt; uint8_t loop_cnt;
uint32_t out0, out1; uint32_t out0, out1;
v16u8 src0, src1, src0_sld1, src1_sld1, res0, res1; v16u8 src0, src1, src0_sld1, src1_sld1, res0, res1;
v16i8 zeros = { 0 };
for (loop_cnt = (height >> 1); loop_cnt--;) { for (loop_cnt = (height >> 1); loop_cnt--;) {
LD_UB2(src, src_stride, src0, src1); LD_UB2(src, src_stride, src0, src1);
src += (2 * src_stride); src += (2 * src_stride);
SLDI_B2_0_UB(src0, src1, src0_sld1, src1_sld1, 1); SLDI_B2_UB(zeros, src0, zeros, src1, 1, src0_sld1, src1_sld1);
AVER_UB2_UB(src0_sld1, src0, src1_sld1, src1, res0, res1); AVER_UB2_UB(src0_sld1, src0, src1_sld1, src1, res0, res1);
out0 = __msa_copy_u_w((v4i32) res0, 0); out0 = __msa_copy_u_w((v4i32) res0, 0);
...@@ -82,13 +83,14 @@ static void common_hz_bil_8w_msa(const uint8_t *src, int32_t src_stride, ...@@ -82,13 +83,14 @@ static void common_hz_bil_8w_msa(const uint8_t *src, int32_t src_stride,
{ {
uint8_t loop_cnt; uint8_t loop_cnt;
v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1; v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;
v16i8 zeros = { 0 };
for (loop_cnt = (height >> 2); loop_cnt--;) { for (loop_cnt = (height >> 2); loop_cnt--;) {
LD_SB4(src, src_stride, src0, src1, src2, src3); LD_SB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride); src += (4 * src_stride);
SLDI_B4_0_SB(src0, src1, src2, src3, SLDI_B4_SB(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1,
src0_sld1, src1_sld1, src2_sld1, src3_sld1, 1); src0_sld1, src1_sld1, src2_sld1, src3_sld1);
AVER_ST8x4_UB(src0, src0_sld1, src1, src1_sld1, AVER_ST8x4_UB(src0, src0_sld1, src1, src1_sld1,
src2, src2_sld1, src3, src3_sld1, dst, dst_stride); src2, src2_sld1, src3, src3_sld1, dst, dst_stride);
dst += (4 * dst_stride); dst += (4 * dst_stride);
...@@ -125,14 +127,15 @@ static void common_hz_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride, ...@@ -125,14 +127,15 @@ static void common_hz_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride,
v16i8 src0, src1, src2, src3, src4, src5, src6, src7; v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1; v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1;
v16i8 src4_sld1, src5_sld1, src6_sld1, src7_sld1; v16i8 src4_sld1, src5_sld1, src6_sld1, src7_sld1;
v16i8 zeros = { 0 };
LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
src += (8 * src_stride); src += (8 * src_stride);
SLDI_B4_0_SB(src0, src1, src2, src3, SLDI_B4_SB(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1,
src0_sld1, src1_sld1, src2_sld1, src3_sld1, 1); src0_sld1, src1_sld1, src2_sld1, src3_sld1);
SLDI_B4_0_SB(src4, src5, src6, src7, SLDI_B4_SB(zeros, src4, zeros, src5, zeros, src6, zeros, src7, 1,
src4_sld1, src5_sld1, src6_sld1, src7_sld1, 1); src4_sld1, src5_sld1, src6_sld1, src7_sld1);
AVE_ST8x4_UB(src0, src0_sld1, src1, src1_sld1, AVE_ST8x4_UB(src0, src0_sld1, src1, src1_sld1,
src2, src2_sld1, src3, src3_sld1, dst, dst_stride); src2, src2_sld1, src3, src3_sld1, dst, dst_stride);
...@@ -145,10 +148,11 @@ static void common_hz_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride, ...@@ -145,10 +148,11 @@ static void common_hz_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride) uint8_t *dst, int32_t dst_stride)
{ {
v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1; v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;
v16i8 zeros = { 0 };
LD_SB4(src, src_stride, src0, src1, src2, src3); LD_SB4(src, src_stride, src0, src1, src2, src3);
SLDI_B4_0_SB(src0, src1, src2, src3, SLDI_B4_SB(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1,
src0_sld1, src1_sld1, src2_sld1, src3_sld1, 1); src0_sld1, src1_sld1, src2_sld1, src3_sld1);
AVE_ST8x4_UB(src0, src0_sld1, src1, src1_sld1, AVE_ST8x4_UB(src0, src0_sld1, src1, src1_sld1,
src2, src2_sld1, src3, src3_sld1, dst, dst_stride); src2, src2_sld1, src3, src3_sld1, dst, dst_stride);
} }
...@@ -216,12 +220,13 @@ static void common_hz_bil_and_aver_dst_4w_msa(const uint8_t *src, ...@@ -216,12 +220,13 @@ static void common_hz_bil_and_aver_dst_4w_msa(const uint8_t *src,
v16u8 src0, src1, src0_sld1, src1_sld1, res0, res1; v16u8 src0, src1, src0_sld1, src1_sld1, res0, res1;
v16u8 tmp0 = { 0 }; v16u8 tmp0 = { 0 };
v16u8 tmp1 = { 0 }; v16u8 tmp1 = { 0 };
v16i8 zeros = { 0 };
for (loop_cnt = (height >> 1); loop_cnt--;) { for (loop_cnt = (height >> 1); loop_cnt--;) {
LD_UB2(src, src_stride, src0, src1); LD_UB2(src, src_stride, src0, src1);
src += (2 * src_stride); src += (2 * src_stride);
SLDI_B2_0_UB(src0, src1, src0_sld1, src1_sld1, 1); SLDI_B2_UB(zeros, src0, zeros, src1, 1, src0_sld1, src1_sld1);
dst0 = LW(dst); dst0 = LW(dst);
dst1 = LW(dst + dst_stride); dst1 = LW(dst + dst_stride);
...@@ -247,13 +252,14 @@ static void common_hz_bil_and_aver_dst_8w_msa(const uint8_t *src, ...@@ -247,13 +252,14 @@ static void common_hz_bil_and_aver_dst_8w_msa(const uint8_t *src,
{ {
uint8_t loop_cnt; uint8_t loop_cnt;
v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1; v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;
v16i8 zeros = { 0 };
for (loop_cnt = (height >> 2); loop_cnt--;) { for (loop_cnt = (height >> 2); loop_cnt--;) {
LD_SB4(src, src_stride, src0, src1, src2, src3); LD_SB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride); src += (4 * src_stride);
SLDI_B4_0_SB(src0, src1, src2, src3, SLDI_B4_SB(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1,
src0_sld1, src1_sld1, src2_sld1, src3_sld1, 1); src0_sld1, src1_sld1, src2_sld1, src3_sld1);
AVER_DST_ST8x4_UB(src0, src0_sld1, src1, src1_sld1, src2, src2_sld1, AVER_DST_ST8x4_UB(src0, src0_sld1, src1, src1_sld1, src2, src2_sld1,
src3, src3_sld1, dst, dst_stride); src3, src3_sld1, dst, dst_stride);
...@@ -529,6 +535,7 @@ static void common_hv_bil_4w_msa(const uint8_t *src, int32_t src_stride, ...@@ -529,6 +535,7 @@ static void common_hv_bil_4w_msa(const uint8_t *src, int32_t src_stride,
v16i8 src0, src1, src2, src0_sld1, src1_sld1, src2_sld1; v16i8 src0, src1, src2, src0_sld1, src1_sld1, src2_sld1;
v16u8 src0_r, src1_r, src2_r, res; v16u8 src0_r, src1_r, src2_r, res;
v8u16 add0, add1, add2, sum0, sum1; v8u16 add0, add1, add2, sum0, sum1;
v16i8 zeros = { 0 };
src0 = LD_SB(src); src0 = LD_SB(src);
src += src_stride; src += src_stride;
...@@ -537,7 +544,8 @@ static void common_hv_bil_4w_msa(const uint8_t *src, int32_t src_stride, ...@@ -537,7 +544,8 @@ static void common_hv_bil_4w_msa(const uint8_t *src, int32_t src_stride,
LD_SB2(src, src_stride, src1, src2); LD_SB2(src, src_stride, src1, src2);
src += (2 * src_stride); src += (2 * src_stride);
SLDI_B3_0_SB(src0, src1, src2, src0_sld1, src1_sld1, src2_sld1, 1); SLDI_B3_SB(zeros, src0, zeros, src1, zeros, src2, 1, src0_sld1,
src1_sld1, src2_sld1);
ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2,
src0_r, src1_r, src2_r); src0_r, src1_r, src2_r);
HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2); HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
...@@ -565,6 +573,7 @@ static void common_hv_bil_8w_msa(const uint8_t *src, int32_t src_stride, ...@@ -565,6 +573,7 @@ static void common_hv_bil_8w_msa(const uint8_t *src, int32_t src_stride,
v16u8 src0_r, src1_r, src2_r, src3_r, src4_r; v16u8 src0_r, src1_r, src2_r, src3_r, src4_r;
v8u16 add0, add1, add2, add3, add4; v8u16 add0, add1, add2, add3, add4;
v8u16 sum0, sum1, sum2, sum3; v8u16 sum0, sum1, sum2, sum3;
v16i8 zeros = { 0 };
src0 = LD_SB(src); src0 = LD_SB(src);
src += src_stride; src += src_stride;
...@@ -573,8 +582,9 @@ static void common_hv_bil_8w_msa(const uint8_t *src, int32_t src_stride, ...@@ -573,8 +582,9 @@ static void common_hv_bil_8w_msa(const uint8_t *src, int32_t src_stride,
LD_SB4(src, src_stride, src1, src2, src3, src4); LD_SB4(src, src_stride, src1, src2, src3, src4);
src += (4 * src_stride); src += (4 * src_stride);
SLDI_B3_0_SB(src0, src1, src2, src0_sld1, src1_sld1, src2_sld1, 1); SLDI_B3_SB(zeros, src0, zeros, src1, zeros, src2, 1, src0_sld1,
SLDI_B2_0_SB(src3, src4, src3_sld1, src4_sld1, 1); src1_sld1, src2_sld1);
SLDI_B2_SB(zeros, src3, zeros, src4, 1, src3_sld1, src4_sld1);
ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r, ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
src1_r, src2_r); src1_r, src2_r);
ILVR_B2_UB(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r); ILVR_B2_UB(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r);
...@@ -659,15 +669,17 @@ static void common_hv_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride, ...@@ -659,15 +669,17 @@ static void common_hv_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride,
v8u16 add0, add1, add2, add3, add4, add5, add6, add7, add8; v8u16 add0, add1, add2, add3, add4, add5, add6, add7, add8;
v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7; v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
v16i8 out0, out1; v16i8 out0, out1;
v16i8 zeros = { 0 };
LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
src += (8 * src_stride); src += (8 * src_stride);
src8 = LD_UB(src); src8 = LD_UB(src);
SLDI_B4_0_UB(src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, SLDI_B4_UB(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1,
src3_sld1, 1); src0_sld1, src1_sld1, src2_sld1, src3_sld1);
SLDI_B3_0_UB(src4, src5, src6, src4_sld1, src5_sld1, src6_sld1, 1); SLDI_B3_UB(zeros, src4, zeros, src5, zeros, src6, 1, src4_sld1,
SLDI_B2_0_UB(src7, src8, src7_sld1, src8_sld1, 1); src5_sld1, src6_sld1);
SLDI_B2_UB(zeros, src7, zeros, src8, 1, src7_sld1, src8_sld1);
ILVR_B4_UH(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src3_sld1, ILVR_B4_UH(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src3_sld1,
src3, src0_r, src1_r, src2_r, src3_r); src3, src0_r, src1_r, src2_r, src3_r);
ILVR_B3_UH(src4_sld1, src4, src5_sld1, src5, src6_sld1, src6, src4_r, ILVR_B3_UH(src4_sld1, src4, src5_sld1, src5, src6_sld1, src6, src4_r,
...@@ -703,13 +715,15 @@ static void common_hv_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride, ...@@ -703,13 +715,15 @@ static void common_hv_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
v8u16 add0, add1, add2, add3, add4; v8u16 add0, add1, add2, add3, add4;
v8u16 sum0, sum1, sum2, sum3; v8u16 sum0, sum1, sum2, sum3;
v16i8 out0, out1; v16i8 out0, out1;
v16i8 zeros = { 0 };
LD_SB4(src, src_stride, src0, src1, src2, src3); LD_SB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride); src += (4 * src_stride);
src4 = LD_SB(src); src4 = LD_SB(src);
SLDI_B3_0_SB(src0, src1, src2, src0_sld1, src1_sld1, src2_sld1, 1); SLDI_B3_SB(zeros, src0, zeros, src1, zeros, src2, 1, src0_sld1,
SLDI_B2_0_SB(src3, src4, src3_sld1, src4_sld1, 1); src1_sld1, src2_sld1);
SLDI_B2_SB(zeros, src3, zeros, src4, 1, src3_sld1, src4_sld1);
ILVR_B3_UH(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r, ILVR_B3_UH(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
src1_r, src2_r); src1_r, src2_r);
ILVR_B2_UH(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r); ILVR_B2_UH(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r);
...@@ -918,6 +932,7 @@ static void common_hv_bil_and_aver_dst_4w_msa(const uint8_t *src, ...@@ -918,6 +932,7 @@ static void common_hv_bil_and_aver_dst_4w_msa(const uint8_t *src,
v16u8 src0_r, src1_r, src2_r; v16u8 src0_r, src1_r, src2_r;
v8u16 add0, add1, add2, sum0, sum1; v8u16 add0, add1, add2, sum0, sum1;
v16u8 dst0, dst1, res0, res1; v16u8 dst0, dst1, res0, res1;
v16i8 zeros = { 0 };
src0 = LD_SB(src); src0 = LD_SB(src);
src += src_stride; src += src_stride;
...@@ -927,7 +942,8 @@ static void common_hv_bil_and_aver_dst_4w_msa(const uint8_t *src, ...@@ -927,7 +942,8 @@ static void common_hv_bil_and_aver_dst_4w_msa(const uint8_t *src,
src += (2 * src_stride); src += (2 * src_stride);
LD_UB2(dst, dst_stride, dst0, dst1); LD_UB2(dst, dst_stride, dst0, dst1);
SLDI_B3_0_SB(src0, src1, src2, src0_sld1, src1_sld1, src2_sld1, 1); SLDI_B3_SB(zeros, src0, zeros, src1, zeros, src2, 1, src0_sld1,
src1_sld1, src2_sld1);
ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r, ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
src1_r, src2_r); src1_r, src2_r);
HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2); HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
...@@ -959,6 +975,7 @@ static void common_hv_bil_and_aver_dst_8w_msa(const uint8_t *src, ...@@ -959,6 +975,7 @@ static void common_hv_bil_and_aver_dst_8w_msa(const uint8_t *src,
v16u8 src0_r, src1_r, src2_r, src3_r, src4_r; v16u8 src0_r, src1_r, src2_r, src3_r, src4_r;
v8u16 add0, add1, add2, add3, add4; v8u16 add0, add1, add2, add3, add4;
v8u16 sum0, sum1, sum2, sum3; v8u16 sum0, sum1, sum2, sum3;
v16i8 zeros = { 0 };
src0 = LD_SB(src); src0 = LD_SB(src);
src += src_stride; src += src_stride;
...@@ -968,8 +985,9 @@ static void common_hv_bil_and_aver_dst_8w_msa(const uint8_t *src, ...@@ -968,8 +985,9 @@ static void common_hv_bil_and_aver_dst_8w_msa(const uint8_t *src,
src += (4 * src_stride); src += (4 * src_stride);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
SLDI_B3_0_SB(src0, src1, src2, src0_sld1, src1_sld1, src2_sld1, 1); SLDI_B3_SB(zeros, src0, zeros, src1, zeros, src2, 1, src0_sld1,
SLDI_B2_0_SB(src3, src4, src3_sld1, src4_sld1, 1); src1_sld1, src2_sld1);
SLDI_B2_SB(zeros, src3, zeros, src4, 1, src3_sld1, src4_sld1);
ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r, ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
src1_r, src2_r); src1_r, src2_r);
ILVR_B2_UB(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r); ILVR_B2_UB(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r);
......
...@@ -87,8 +87,8 @@ static uint32_t sad_horiz_bilinear_filter_8width_msa(uint8_t *src, ...@@ -87,8 +87,8 @@ static uint32_t sad_horiz_bilinear_filter_8width_msa(uint8_t *src,
PCKEV_D2_UB(src1, src0, src3, src2, src0, src1); PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref4, ref5); PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref4, ref5);
SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); SLDI_B4_UB(ref0, ref0, ref1, ref1, ref2, ref2, ref3, ref3, 1,
SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); ref0, ref1, ref2, ref3);
PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
AVER_UB2_UB(ref4, ref0, ref5, ref1, comp0, comp1); AVER_UB2_UB(ref4, ref0, ref5, ref1, comp0, comp1);
sad += SAD_UB2_UH(src0, src1, comp0, comp1); sad += SAD_UB2_UH(src0, src1, comp0, comp1);
...@@ -100,8 +100,8 @@ static uint32_t sad_horiz_bilinear_filter_8width_msa(uint8_t *src, ...@@ -100,8 +100,8 @@ static uint32_t sad_horiz_bilinear_filter_8width_msa(uint8_t *src,
PCKEV_D2_UB(src1, src0, src3, src2, src0, src1); PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref4, ref5); PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref4, ref5);
SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); SLDI_B4_UB(ref0, ref0, ref1, ref1, ref2, ref2, ref3, ref3, 1,
SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); ref0, ref1, ref2, ref3);
PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
AVER_UB2_UB(ref4, ref0, ref5, ref1, comp0, comp1); AVER_UB2_UB(ref4, ref0, ref5, ref1, comp0, comp1);
sad += SAD_UB2_UH(src0, src1, comp0, comp1); sad += SAD_UB2_UH(src0, src1, comp0, comp1);
......
This diff is collapsed.
...@@ -1995,8 +1995,8 @@ static void common_hv_2ht_2vt_4x8_msa(uint8_t *src, int32_t src_stride, ...@@ -1995,8 +1995,8 @@ static void common_hv_2ht_2vt_4x8_msa(uint8_t *src, int32_t src_stride,
hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, 7); hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, 7);
hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, 7); hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, 7);
hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, 7); hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, 7);
SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1, SLDI_B3_UH(hz_out2, hz_out0, hz_out4, hz_out2, hz_out6, hz_out4, 8, hz_out1,
hz_out3, hz_out5, 8); hz_out3, hz_out5);
hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6); hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6);
ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
......
...@@ -249,6 +249,7 @@ static const int32_t sinpi_4_9 = 15212; ...@@ -249,6 +249,7 @@ static const int32_t sinpi_4_9 = 15212;
v8i16 c0_m, c1_m, c2_m, c3_m; \ v8i16 c0_m, c1_m, c2_m, c3_m; \
v8i16 step0_m, step1_m; \ v8i16 step0_m, step1_m; \
v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
v16i8 zeros = { 0 }; \
\ \
c0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \ c0_m = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \
c1_m = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \ c1_m = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \
...@@ -262,7 +263,7 @@ static const int32_t sinpi_4_9 = 15212; ...@@ -262,7 +263,7 @@ static const int32_t sinpi_4_9 = 15212;
SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, VP9_DCT_CONST_BITS); \ SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, VP9_DCT_CONST_BITS); \
\ \
PCKEV_H2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp0_m, tmp2_m); \ PCKEV_H2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp0_m, tmp2_m); \
SLDI_B2_0_SW(tmp0_m, tmp2_m, tmp1_m, tmp3_m, 8); \ SLDI_B2_SW(zeros, tmp0_m, zeros, tmp2_m, 8, tmp1_m, tmp3_m); \
BUTTERFLY_4((v8i16) tmp0_m, (v8i16) tmp1_m, \ BUTTERFLY_4((v8i16) tmp0_m, (v8i16) tmp1_m, \
(v8i16) tmp2_m, (v8i16) tmp3_m, \ (v8i16) tmp2_m, (v8i16) tmp3_m, \
out0, out1, out2, out3); \ out0, out1, out2, out3); \
......
...@@ -1673,6 +1673,7 @@ static void vp9_transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch, ...@@ -1673,6 +1673,7 @@ static void vp9_transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org; v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
v16i8 zeros = { 0 };
LD_UB8(input, in_pitch, LD_UB8(input, in_pitch,
p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org); p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org);
...@@ -1686,7 +1687,7 @@ static void vp9_transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch, ...@@ -1686,7 +1687,7 @@ static void vp9_transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7); ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4); ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4);
ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6); ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6);
SLDI_B4_0_UB(q0, q2, q4, q6, q1, q3, q5, q7, 8); SLDI_B4_UB(zeros, q0, zeros, q2, zeros, q4, zeros, q6, 8, q1, q3, q5, q7);
ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch); ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
output += (8 * out_pitch); output += (8 * out_pitch);
......
...@@ -795,7 +795,7 @@ static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride, ...@@ -795,7 +795,7 @@ static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride,
filt_hz1, filt_hz2, filt_hz3); filt_hz1, filt_hz2, filt_hz3);
hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3); filt_hz1, filt_hz2, filt_hz3);
SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8); SLDI_B2_SH(hz_out2, hz_out0, hz_out4, hz_out2, 8, hz_out1, hz_out3);
filt = LD_SH(filter_vert); filt = LD_SH(filter_vert);
SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
...@@ -1585,7 +1585,7 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src, ...@@ -1585,7 +1585,7 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src,
filt_hz1, filt_hz2, filt_hz3); filt_hz1, filt_hz2, filt_hz3);
hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3); filt_hz1, filt_hz2, filt_hz3);
SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8); SLDI_B2_SH(hz_out2, hz_out0, hz_out4, hz_out2, 8, hz_out1, hz_out3);
filt = LD_SH(filter_vert); filt = LD_SH(filter_vert);
SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
...@@ -2093,7 +2093,7 @@ void ff_put_bilin_64h_msa(uint8_t *dst, ptrdiff_t dst_stride, ...@@ -2093,7 +2093,7 @@ void ff_put_bilin_64h_msa(uint8_t *dst, ptrdiff_t dst_stride,
src4 = LD_SB(src + 32); src4 = LD_SB(src + 32);
src6 = LD_SB(src + 48); src6 = LD_SB(src + 48);
src7 = LD_SB(src + 56); src7 = LD_SB(src + 56);
SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8); SLDI_B3_SB(src2, src0, src4, src2, src6, src4, 8, src1, src3, src5);
src += src_stride; src += src_stride;
VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
...@@ -2544,8 +2544,8 @@ static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride, ...@@ -2544,8 +2544,8 @@ static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride,
hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, 7); hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, 7);
hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, 7); hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, 7);
hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, 7); hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, 7);
SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1, SLDI_B3_UH(hz_out2, hz_out0, hz_out4, hz_out2, hz_out6, hz_out4, 8, hz_out1,
hz_out3, hz_out5, 8); hz_out3, hz_out5);
hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6); hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6);
ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
...@@ -3146,7 +3146,7 @@ void ff_avg_bilin_64h_msa(uint8_t *dst, ptrdiff_t dst_stride, ...@@ -3146,7 +3146,7 @@ void ff_avg_bilin_64h_msa(uint8_t *dst, ptrdiff_t dst_stride,
for (loop_cnt = height; loop_cnt--;) { for (loop_cnt = height; loop_cnt--;) {
LD_SB4(src, 16, src0, src2, src4, src6); LD_SB4(src, 16, src0, src2, src4, src6);
src7 = LD_SB(src + 56); src7 = LD_SB(src + 56);
SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8); SLDI_B3_SB(src2, src0, src4, src2, src6, src4, 8, src1, src3, src5);
src += src_stride; src += src_stride;
VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
...@@ -3655,8 +3655,8 @@ static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(const uint8_t *src, ...@@ -3655,8 +3655,8 @@ static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(const uint8_t *src,
hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, 7); hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, 7);
hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, 7); hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, 7);
hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, 7); hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, 7);
SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1, SLDI_B3_UH(hz_out2, hz_out0, hz_out4, hz_out2, hz_out6, hz_out4, 8, hz_out1,
hz_out3, hz_out5, 8); hz_out3, hz_out5);
hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6); hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6);
LW4(dst, dst_stride, tp0, tp1, tp2, tp3); LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
......
...@@ -602,67 +602,48 @@ ...@@ -602,67 +602,48 @@
} }
#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__) #define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
/* Description : Immediate number of columns to slide with zero /* Description : Immediate number of columns to slide
Arguments : Inputs - in0, in1, slide_val Arguments : Inputs - s, d, slide_val
Outputs - out0, out1 Outputs - out
Return Type - as per RTYPE Return Type - as per RTYPE
Details : Byte elements from 'zero_m' vector are slide into 'in0' by Details : Byte elements from 'd' vector are slide into 's' by
number of elements specified by 'slide_val' number of elements specified by 'slide_val'
*/ */
#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) \ #define SLDI_B(RTYPE, d, s, slide_val, out) \
{ \ { \
v16i8 zero_m = { 0 }; \ out = (RTYPE) __msa_sldi_b((v16i8) d, (v16i8) s, slide_val); \
out0 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in0, slide_val); \
out1 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in1, slide_val); \
}
#define SLDI_B2_0_UB(...) SLDI_B2_0(v16u8, __VA_ARGS__)
#define SLDI_B2_0_SB(...) SLDI_B2_0(v16i8, __VA_ARGS__)
#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)
#define SLDI_B3_0(RTYPE, in0, in1, in2, out0, out1, out2, slide_val) \
{ \
v16i8 zero_m = { 0 }; \
SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \
out2 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in2, slide_val); \
}
#define SLDI_B3_0_UB(...) SLDI_B3_0(v16u8, __VA_ARGS__)
#define SLDI_B3_0_SB(...) SLDI_B3_0(v16i8, __VA_ARGS__)
#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, \
out0, out1, out2, out3, slide_val) \
{ \
SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \
SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val); \
} }
#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__)
#define SLDI_B4_0_SB(...) SLDI_B4_0(v16i8, __VA_ARGS__)
#define SLDI_B4_0_SH(...) SLDI_B4_0(v8i16, __VA_ARGS__)
/* Description : Immediate number of columns to slide #define SLDI_B2(RTYPE, d0, s0, d1, s1, slide_val, out0, out1) \
Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val { \
Outputs - out0, out1 SLDI_B(RTYPE, d0, s0, slide_val, out0) \
Return Type - as per RTYPE SLDI_B(RTYPE, d1, s1, slide_val, out1) \
Details : Byte elements from 'in0_0' vector are slide into 'in1_0' by
number of elements specified by 'slide_val'
*/
#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \
{ \
out0 = (RTYPE) __msa_sldi_b((v16i8) in0_0, (v16i8) in1_0, slide_val); \
out1 = (RTYPE) __msa_sldi_b((v16i8) in0_1, (v16i8) in1_1, slide_val); \
} }
#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__) #define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
#define SLDI_B2_SB(...) SLDI_B2(v16i8, __VA_ARGS__) #define SLDI_B2_SB(...) SLDI_B2(v16i8, __VA_ARGS__)
#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__) #define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)
#define SLDI_B2_SW(...) SLDI_B2(v4i32, __VA_ARGS__)
#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, \ #define SLDI_B3(RTYPE, d0, s0, d1, s1, d2, s2, slide_val, \
out0, out1, out2, slide_val) \ out0, out1, out2) \
{ \ { \
SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \ SLDI_B2(RTYPE, d0, s0, d1, s1, slide_val, out0, out1) \
out2 = (RTYPE) __msa_sldi_b((v16i8) in0_2, (v16i8) in1_2, slide_val); \ SLDI_B(RTYPE, d2, s2, slide_val, out2) \
} }
#define SLDI_B3_UB(...) SLDI_B3(v16u8, __VA_ARGS__)
#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__) #define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__) #define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
#define SLDI_B4(RTYPE, d0, s0, d1, s1, d2, s2, d3, s3, \
slide_val, out0, out1, out2, out3) \
{ \
SLDI_B2(RTYPE, d0, s0, d1, s1, slide_val, out0, out1) \
SLDI_B2(RTYPE, d2, s2, d3, s3, slide_val, out2, out3) \
}
#define SLDI_B4_UB(...) SLDI_B4(v16u8, __VA_ARGS__)
#define SLDI_B4_SB(...) SLDI_B4(v16i8, __VA_ARGS__)
#define SLDI_B4_SH(...) SLDI_B4(v8i16, __VA_ARGS__)
/* Description : Shuffle byte vector elements as per mask vector /* Description : Shuffle byte vector elements as per mask vector
Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 Arguments : Inputs - in0, in1, in2, in3, mask0, mask1
Outputs - out0, out1 Outputs - out0, out1
...@@ -2412,6 +2393,7 @@ ...@@ -2412,6 +2393,7 @@
{ \ { \
v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
v16i8 zeros = { 0 }; \
\ \
ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, \ ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, \
tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
...@@ -2419,8 +2401,8 @@ ...@@ -2419,8 +2401,8 @@
ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \ ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \
ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \ ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \
ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \ ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \
SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \ SLDI_B4(RTYPE, zeros, out0, zeros, out2, zeros, out4, zeros, out6, \
SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \ 8, out1, out3, out5, out7); \
} }
#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__) #define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
#define TRANSPOSE8x8_UB_UH(...) TRANSPOSE8x8_UB(v8u16, __VA_ARGS__) #define TRANSPOSE8x8_UB_UH(...) TRANSPOSE8x8_UB(v8u16, __VA_ARGS__)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment