Commit 153c6075 authored by Shiyou Yin, committed by Michael Niedermayer

avutil/mips: refactor msa load and store macros.

Replace STnxm_UB and LDnxm_SH with new macros ST_{H/W/D}{1/2/4/8}.
The old macros are difficult to use because they do not follow a consistent
parameter-passing convention. The changes are as follows:
1. remove LD4x4_SH.
2. replace ST2x4_UB with ST_H4.
3. replace ST4x2_UB with ST_W2.
4. replace ST4x4_UB with ST_W4.
5. replace ST4x8_UB with ST_W8.
6. replace ST6x4_UB with ST_W2 and ST_H2.
7. replace ST8x1_UB with ST_D1.
8. replace ST8x2_UB with ST_D2.
9. replace ST8x4_UB with ST_D4.
10. replace ST8x8_UB with ST_D8.
11. replace ST12x4_UB with ST_D4 and ST_W4.

Example of a new macro: ST_H4(in, idx0, idx1, idx2, idx3, pdst, stride)
ST_H4 stores four half-word elements from vector 'in' to pdst with the given stride.
About the macro name:
1) 'ST' means a store operation.
2) 'H/W/D' is the element type of the vector: half-word/word/double-word.
3) The number '1/2/4/8' is how many elements are stored.
About the macro parameters:
1) 'in0, in1, ...' are 128-bit vectors.
2) 'idx0, idx1, ...' are the indexes of the elements to store.
3) 'pdst' is the destination pointer.
4) 'stride' is the stride between consecutive stores.
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
parent 00ed04d6
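For illustration only, here is a minimal sketch of how an ST_W4-style macro could be built from MSA copy intrinsics. The names SKETCH_ST_W4, pdst_m and out*_m are hypothetical; the actual definitions live under libavutil/mips (generic_macros_msa.h) and may differ in detail, e.g. they use the library's own word-store helper instead of memcpy().

#include <stdint.h>
#include <string.h>
#include <msa.h>   /* MIPS MSA intrinsic types and __msa_copy_u_w() */

/* Hypothetical sketch: copy word elements idx0..idx3 out of the 128-bit
 * vector 'in' and store them to pdst, pdst + stride, pdst + 2 * stride
 * and pdst + 3 * stride.  memcpy() stands in for the unaligned
 * word-store helper used by the real macros. */
#define SKETCH_ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride)  \
{                                                               \
    uint8_t *pdst_m = (uint8_t *) (pdst);                       \
    uint32_t out0_m = __msa_copy_u_w((v4i32) (in), idx0);       \
    uint32_t out1_m = __msa_copy_u_w((v4i32) (in), idx1);       \
    uint32_t out2_m = __msa_copy_u_w((v4i32) (in), idx2);       \
    uint32_t out3_m = __msa_copy_u_w((v4i32) (in), idx3);       \
    memcpy(pdst_m,                  &out0_m, sizeof(out0_m));   \
    memcpy(pdst_m + 1 * (stride),   &out1_m, sizeof(out1_m));   \
    memcpy(pdst_m + 2 * (stride),   &out2_m, sizeof(out2_m));   \
    memcpy(pdst_m + 3 * (stride),   &out3_m, sizeof(out3_m));   \
}

The diff below then maps the old calls onto the new form, e.g. ST4x4_UB(in0, in0, 0, 1, 2, 3, src, stride) becomes ST_W4(in0, 0, 1, 2, 3, src, stride); the wider variants extend the same pattern, with ST_W8 taking two vectors plus eight word indexes and ST_D8 taking four vectors plus eight double-word indexes.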
@@ -86,10 +86,7 @@ static void h263_h_loop_filter_msa(uint8_t *src, int32_t stride, int32_t qscale)
     ILVR_B2_SH(in3, in0, in1, in2, temp0, temp1);
     in0 = (v16u8) __msa_ilvr_h(temp1, temp0);
     in3 = (v16u8) __msa_ilvl_h(temp1, temp0);
-    ST4x4_UB(in0, in0, 0, 1, 2, 3, src, stride);
-    src += 4 * stride;
-    ST4x4_UB(in3, in3, 0, 1, 2, 3, src, stride);
-    src += 4 * stride;
+    ST_W8(in0, in3, 0, 1, 2, 3, 0, 1, 2, 3, src, stride);
 }
 
 static void h263_v_loop_filter_msa(uint8_t *src, int32_t stride, int32_t qscale)
@@ -45,7 +45,7 @@ static void avc_wgt_4x2_msa(uint8_t *data, int32_t stride,
     tmp0 = __msa_srlr_h(tmp0, denom);
     tmp0 = (v8i16) __msa_sat_u_h((v8u16) tmp0, 7);
     src0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
-    ST4x2_UB(src0, data, stride);
+    ST_W2(src0, 0, 1, data, stride);
 }
 
 static void avc_wgt_4x4_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
@@ -71,7 +71,7 @@ static void avc_wgt_4x4_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
     tmp1 = __msa_srlr_h(tmp1, denom);
     SAT_UH2_SH(tmp0, tmp1, 7);
     src0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
-    ST4x4_UB(src0, src0, 0, 1, 2, 3, data, stride);
+    ST_W4(src0, 0, 1, 2, 3, data, stride);
 }
 
 static void avc_wgt_4x8_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
@@ -102,7 +102,7 @@ static void avc_wgt_4x8_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
     SRLR_H4_SH(tmp0, tmp1, tmp2, tmp3, denom);
     SAT_UH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
-    ST4x8_UB(src0, src1, data, stride);
+    ST_W8(src0, src1, 0, 1, 2, 3, 0, 1, 2, 3, data, stride);
 }
 
 static void avc_wgt_8x4_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
@@ -133,7 +133,7 @@ static void avc_wgt_8x4_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
     SRLR_H4_SH(tmp0, tmp1, tmp2, tmp3, denom);
     SAT_UH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
-    ST8x4_UB(src0, src1, data, stride);
+    ST_D4(src0, src1, 0, 1, 0, 1, data, stride);
 }
 
 static void avc_wgt_8x8_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
@@ -175,7 +175,7 @@ static void avc_wgt_8x8_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
     SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
     PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1,
                 src2, src3);
-    ST8x8_UB(src0, src1, src2, src3, data, stride);
+    ST_D8(src0, src1, src2, src3, 0, 1, 0, 1, 0, 1, 0, 1, data, stride);
 }
 
 static void avc_wgt_8x16_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
@@ -218,7 +218,7 @@ static void avc_wgt_8x16_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
         SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
         PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1,
                     src2, src3);
-        ST8x8_UB(src0, src1, src2, src3, data, stride);
+        ST_D8(src0, src1, src2, src3, 0, 1, 0, 1, 0, 1, 0, 1, data, stride);
         data += 8 * stride;
     }
 }
@@ -253,7 +253,7 @@ static void avc_biwgt_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
     tmp0 = __msa_maxi_s_h(tmp0, 0);
     tmp0 = __msa_min_s_h(max255, tmp0);
     dst0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
-    ST4x2_UB(dst0, dst, stride);
+    ST_W2(dst0, 0, 1, dst, stride);
 }
 
 static void avc_biwgt_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@@ -287,7 +287,7 @@ static void avc_biwgt_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
     tmp1 >>= denom;
     CLIP_SH2_0_255(tmp0, tmp1);
     dst0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
-    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
+    ST_W4(dst0, 0, 1, 2, 3, dst, stride);
 }
 
 static void avc_biwgt_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@@ -327,7 +327,7 @@ static void avc_biwgt_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
     SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
     CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
-    ST4x8_UB(dst0, dst1, dst, stride);
+    ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
 }
 
 static void avc_biwgt_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@@ -365,7 +365,7 @@ static void avc_biwgt_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
     SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
     CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
-    ST8x4_UB(dst0, dst1, dst, stride);
+    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
 }
 
 static void avc_biwgt_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@@ -417,7 +417,7 @@ static void avc_biwgt_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
     CLIP_SH4_0_255(tmp4, tmp5, tmp6, tmp7);
     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
     PCKEV_B2_UB(tmp5, tmp4, tmp7, tmp6, dst2, dst3);
-    ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
+    ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
 }
 
 static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride,
@@ -479,7 +479,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride,
         CLIP_SH4_0_255(temp4, temp5, temp6, temp7);
         PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
                     dst0, dst1, dst2, dst3);
-        ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
+        ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
         dst += 8 * stride;
     }
 }
@@ -955,18 +955,18 @@ static void avc_loopfilter_luma_intra_edge_ver_msa(uint8_t *data,
             ILVRL_H2_SH(tp3, tp2, tmp6, tmp7);
 
             src = data - 3;
-            ST4x4_UB(tmp3, tmp3, 0, 1, 2, 3, src, img_width);
-            ST2x4_UB(tmp2, 0, src + 4, img_width);
+            ST_W4(tmp3, 0, 1, 2, 3, src, img_width);
+            ST_H4(tmp2, 0, 1, 2, 3, src + 4, img_width);
             src += 4 * img_width;
-            ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, src, img_width);
-            ST2x4_UB(tmp2, 4, src + 4, img_width);
+            ST_W4(tmp4, 0, 1, 2, 3, src, img_width);
+            ST_H4(tmp2, 4, 5, 6, 7, src + 4, img_width);
             src += 4 * img_width;
-            ST4x4_UB(tmp6, tmp6, 0, 1, 2, 3, src, img_width);
-            ST2x4_UB(tmp5, 0, src + 4, img_width);
+            ST_W4(tmp6, 0, 1, 2, 3, src, img_width);
+            ST_H4(tmp5, 0, 1, 2, 3, src + 4, img_width);
             src += 4 * img_width;
-            ST4x4_UB(tmp7, tmp7, 0, 1, 2, 3, src, img_width);
-            ST2x4_UB(tmp5, 4, src + 4, img_width);
+            ST_W4(tmp7, 0, 1, 2, 3, src, img_width);
+            ST_H4(tmp5, 4, 5, 6, 7, src + 4, img_width);
         }
     }
 }
@@ -1274,9 +1274,9 @@ static void avc_loopfilter_cb_or_cr_intra_edge_ver_msa(uint8_t *data_cb_or_cr,
         tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_or_p0_org, (v16i8) p0_or_q0_org);
 
         data_cb_or_cr -= 1;
-        ST2x4_UB(tmp1, 0, data_cb_or_cr, img_width);
+        ST_H4(tmp1, 0, 1, 2, 3, data_cb_or_cr, img_width);
         data_cb_or_cr += 4 * img_width;
-        ST2x4_UB(tmp1, 4, data_cb_or_cr, img_width);
+        ST_H4(tmp1, 4, 5, 6, 7, data_cb_or_cr, img_width);
     }
 }
@@ -2110,9 +2110,9 @@ static void avc_loopfilter_cb_or_cr_inter_edge_ver_msa(uint8_t *data,
             q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
             tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_org, (v16i8) p0_org);
             src = data - 1;
-            ST2x4_UB(tmp1, 0, src, img_width);
+            ST_H4(tmp1, 0, 1, 2, 3, src, img_width);
             src += 4 * img_width;
-            ST2x4_UB(tmp1, 4, src, img_width);
+            ST_H4(tmp1, 4, 5, 6, 7, src, img_width);
         }
     }
 }
@@ -2136,7 +2136,7 @@ static void avc_h_loop_filter_chroma422_msa(uint8_t *src, int32_t stride,
         }
 
         AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res);
-        ST2x4_UB(res, 0, (src - 1), stride);
+        ST_H4(res, 0, 1, 2, 3, (src - 1), stride);
         src += (4 * stride);
     }
 }
@@ -237,9 +237,7 @@ static void avc_idct8_addblk_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
     CLIP_SH4_0_255(res4, res5, res6, res7);
     PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
                 dst0, dst1, dst2, dst3);
-    ST8x4_UB(dst0, dst1, dst, dst_stride);
-    dst += (4 * dst_stride);
-    ST8x4_UB(dst2, dst3, dst, dst_stride);
+    ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride)
 }
 
 static void avc_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src,
@@ -269,9 +267,7 @@ static void avc_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src,
     CLIP_SH4_0_255(dst4_r, dst5_r, dst6_r, dst7_r);
     PCKEV_B4_SB(dst1_r, dst0_r, dst3_r, dst2_r, dst5_r, dst4_r, dst7_r, dst6_r,
                 dst0, dst1, dst2, dst3);
-    ST8x4_UB(dst0, dst1, dst, dst_stride);
-    dst += (4 * dst_stride);
-    ST8x4_UB(dst2, dst3, dst, dst_stride);
+    ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride)
 }
 
 void ff_h264_idct_add_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
@@ -340,7 +336,7 @@ void ff_h264_idct4x4_addblk_dc_msa(uint8_t *dst, int16_t *src,
     ADD2(pred_r, input_dc, pred_l, input_dc, pred_r, pred_l);
     CLIP_SH2_0_255(pred_r, pred_l);
     out = __msa_pckev_b((v16i8) pred_l, (v16i8) pred_r);
-    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
 }
 
 void ff_h264_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src,
@@ -727,7 +727,7 @@ static void hevc_addblk_4x4_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
     ADD2(dst_r0, in0, dst_l0, in1, dst_r0, dst_l0);
     CLIP_SH2_0_255(dst_r0, dst_l0);
     dst_vec = (v4i32) __msa_pckev_b((v16i8) dst_l0, (v16i8) dst_r0);
-    ST4x4_UB(dst_vec, dst_vec, 0, 1, 2, 3, dst, stride);
+    ST_W4(dst_vec, 0, 1, 2, 3, dst, stride);
 }
 
 static void hevc_addblk_8x8_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
@@ -752,8 +752,7 @@ static void hevc_addblk_8x8_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
          dst_r0, dst_l0, dst_r1, dst_l1);
     CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
     PCKEV_B2_SH(dst_l0, dst_r0, dst_l1, dst_r1, dst_r0, dst_r1);
-    ST8x4_UB(dst_r0, dst_r1, dst, stride);
-    dst += (4 * stride);
+    ST_D4(dst_r0, dst_r1, 0, 1, 0, 1, dst, stride);
 
     LD4(temp_dst, stride, dst0, dst1, dst2, dst3);
     INSERT_D2_SD(dst0, dst1, dst_vec0);
@@ -764,7 +763,7 @@ static void hevc_addblk_8x8_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
          dst_r0, dst_l0, dst_r1, dst_l1);
     CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
     PCKEV_B2_SH(dst_l0, dst_r0, dst_l1, dst_r1, dst_r0, dst_r1);
-    ST8x4_UB(dst_r0, dst_r1, dst, stride);
+    ST_D4(dst_r0, dst_r1, 0, 1, 0, 1, dst + 4 * stride, stride);
 }
 
 static void hevc_addblk_16x16_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
@@ -199,11 +199,9 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
             dst_val0 = __msa_copy_u_d((v2i64) dst2, 0);
             dst_val1 = __msa_copy_u_d((v2i64) dst2, 1);
-            ST8x4_UB(dst0, dst1, p2, stride);
-            p2 += (4 * stride);
-            SD(dst_val0, p2);
-            p2 += stride;
-            SD(dst_val1, p2);
+            ST_D4(dst0, dst1, 0, 1, 0, 1, p2, stride);
+            SD(dst_val0, p2 + 4 * stride);
+            SD(dst_val1, p2 + 5 * stride);
 
             /* strong filter ends */
         } else if (flag0 == flag1) { /* weak only */
             /* weak filter */
@@ -288,7 +286,7 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
             dst1 = __msa_bmz_v(dst1, dst3, (v16u8) cmp3);
             p2 += stride;
-            ST8x4_UB(dst0, dst1, p2, stride);
+            ST_D4(dst0, dst1, 0, 1, 0, 1, p2, stride);
 
             /* weak filter ends */
         } else { /* strong + weak */
             /* strong filter */
@@ -442,11 +440,9 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
             dst_val0 = __msa_copy_u_d((v2i64) dst2, 0);
             dst_val1 = __msa_copy_u_d((v2i64) dst2, 1);
-            ST8x4_UB(dst0, dst1, p2, stride);
-            p2 += (4 * stride);
-            SD(dst_val0, p2);
-            p2 += stride;
-            SD(dst_val1, p2);
+            ST_D4(dst0, dst1, 0, 1, 0, 1, p2, stride);
+            SD(dst_val0, p2 + 4 * stride);
+            SD(dst_val1, p2 + 5 * stride);
         }
     }
 }
@@ -976,7 +972,7 @@ static void hevc_loopfilter_chroma_hor_msa(uint8_t *src, int32_t stride,
         temp1 = (v8i16) __msa_bmnz_v((v16u8) temp1, (v16u8) q0, (v16u8) tc_pos);
         temp0 = (v8i16) __msa_pckev_b((v16i8) temp1, (v16i8) temp0);
-        ST8x2_UB(temp0, p0_ptr, stride);
+        ST_D2(temp0, 0, 1, p0_ptr, stride);
     }
 }
@@ -1037,9 +1033,7 @@ static void hevc_loopfilter_chroma_ver_msa(uint8_t *src, int32_t stride,
         temp0 = (v8i16) __msa_ilvev_b((v16i8) temp1, (v16i8) temp0);
         src += 1;
-        ST2x4_UB(temp0, 0, src, stride);
-        src += (4 * stride);
-        ST2x4_UB(temp0, 4, src, stride);
+        ST_H8(temp0, 0, 1, 2, 3, 4, 5, 6, 7, src, stride);
     }
 }
@@ -1087,7 +1081,7 @@ static void hevc_sao_band_filter_4width_msa(uint8_t *dst, int32_t dst_stride,
         LD_UB4(src, src_stride, src0, src1, src2, src3);
 
         /* store results */
-        ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
+        ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
         dst += (4 * dst_stride);
     }
@@ -1102,7 +1096,7 @@ static void hevc_sao_band_filter_4width_msa(uint8_t *dst, int32_t dst_stride,
     dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
 
     /* store results */
-    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
+    ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
 }
 
 static void hevc_sao_band_filter_8width_msa(uint8_t *dst, int32_t dst_stride,
@@ -1153,7 +1147,7 @@ static void hevc_sao_band_filter_8width_msa(uint8_t *dst, int32_t dst_stride,
         XORI_B2_128_SB(dst0, dst1);
 
         /* store results */
-        ST8x4_UB(dst0, dst1, dst, dst_stride);
+        ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
         dst += dst_stride << 2;
     }
@@ -1173,7 +1167,7 @@ static void hevc_sao_band_filter_8width_msa(uint8_t *dst, int32_t dst_stride,
     XORI_B2_128_SB(dst0, dst1);
 
     /* store results */
-    ST8x4_UB(dst0, dst1, dst, dst_stride);
+    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
 }
 
 static void hevc_sao_band_filter_16multiple_msa(uint8_t *dst,
@@ -589,7 +589,7 @@ static void hevc_intra_pred_plane_4x4_msa(const uint8_t *src_top,
     PCKEV_D2_SH(res1, res0, res3, res2, res0, res1);
     SRARI_H2_SH(res0, res1, 3);
     src_vec0 = __msa_pckev_b((v16i8) res1, (v16i8) res0);
-    ST4x4_UB(src_vec0, src_vec0, 0, 1, 2, 3, dst, stride);
+    ST_W4(src_vec0, 0, 1, 2, 3, dst, stride);
 }
 
 static void hevc_intra_pred_plane_8x8_msa(const uint8_t *src_top,
@@ -656,7 +656,8 @@ static void hevc_intra_pred_plane_8x8_msa(const uint8_t *src_top,
     PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
                 src_vec0, src_vec1, src_vec2, src_vec3);
-    ST8x8_UB(src_vec0, src_vec1, src_vec2, src_vec3, dst, stride);
+    ST_D8(src_vec0, src_vec1, src_vec2, src_vec3, 0, 1, 0, 1,
+          0, 1, 0, 1, dst, stride);
 }
 
 static void hevc_intra_pred_plane_16x16_msa(const uint8_t *src_top,
@@ -1007,7 +1008,7 @@ static void hevc_intra_pred_angular_upper_4width_msa(const uint8_t *src_top,
     SRARI_H2_SH(diff1, diff3, 5);
     dst_val0 = __msa_pckev_b((v16i8) diff3, (v16i8) diff1);
-    ST4x4_UB(dst_val0, dst_val0, 0, 1, 2, 3, dst, stride);
+    ST_W4(dst_val0, 0, 1, 2, 3, dst, stride);
 }
 
 static void hevc_intra_pred_angular_upper_8width_msa(const uint8_t *src_top,
@@ -1104,7 +1105,7 @@ static void hevc_intra_pred_angular_upper_8width_msa(const uint8_t *src_top,
         SRARI_H4_SH(diff1, diff3, diff5, diff7, 5);
         PCKEV_B2_UB(diff3, diff1, diff7, diff5, dst_val0, dst_val1);
-        ST8x4_UB(dst_val0, dst_val1, dst, stride);
+        ST_D4(dst_val0, dst_val1, 0, 1, 0, 1, dst, stride);
         dst += (4 * stride);
     }
 }
@@ -1425,9 +1426,8 @@ static void hevc_intra_pred_angular_lower_4width_msa(const uint8_t *src_top,
     dst_val0 = __msa_pckev_b((v16i8) diff2, (v16i8) diff2);
     dst_val1 = __msa_pckod_b((v16i8) diff2, (v16i8) diff2);
-    ST4x2_UB(dst_val0, dst, stride);
-    dst += (2 * stride);
-    ST4x2_UB(dst_val1, dst, stride);
+    ST_W2(dst_val0, 0, 1, dst, stride);
+    ST_W2(dst_val1, 0, 1, dst + 2 * stride, stride);
 }
 
 static void hevc_intra_pred_angular_lower_8width_msa(const uint8_t *src_top,
@@ -1526,7 +1526,7 @@ static void hevc_intra_pred_angular_lower_8width_msa(const uint8_t *src_top,
                   dst_val0, dst_val1, dst_val2, dst_val3);
         ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);
         ILVRL_H2_SH(diff1, diff0, diff3, diff4);
-        ST4x8_UB(diff3, diff4, dst_org, stride);
+        ST_W8(diff3, diff4, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
         dst += 4;
     }
 }
@@ -1640,9 +1640,9 @@ static void hevc_intra_pred_angular_lower_16width_msa(const uint8_t *src_top,
         ILVL_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff2, diff3);
         ILVRL_H2_SH(diff1, diff0, diff4, diff5);
         ILVRL_H2_SH(diff3, diff2, diff6, diff7);
-        ST4x8_UB(diff4, diff5, dst_org, stride);
+        ST_W8(diff4, diff5, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
         dst_org += (8 * stride);
-        ST4x8_UB(diff6, diff7, dst_org, stride);
+        ST_W8(diff6, diff7, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
         dst += 4;
     }
 }
@@ -1746,23 +1746,14 @@ static void hevc_intra_pred_angular_lower_32width_msa(const uint8_t *src_top,
         ILVRL_B2_SH(dst_val2, dst_val0, diff0, diff1);
         ILVRL_B2_SH(dst_val3, dst_val1, diff2, diff3);
-        ST2x4_UB(diff0, 0, dst_org, stride);
-        dst_org += (4 * stride);
-        ST2x4_UB(diff0, 4, dst_org, stride);
-        dst_org += (4 * stride);
-        ST2x4_UB(diff1, 0, dst_org, stride);
-        dst_org += (4 * stride);
-        ST2x4_UB(diff1, 4, dst_org, stride);
-        dst_org += (4 * stride);
-        ST2x4_UB(diff2, 0, dst_org, stride);
-        dst_org += (4 * stride);
-        ST2x4_UB(diff2, 4, dst_org, stride);
-        dst_org += (4 * stride);
-        ST2x4_UB(diff3, 0, dst_org, stride);
-        dst_org += (4 * stride);
-        ST2x4_UB(diff3, 4, dst_org, stride);
-        dst_org += (4 * stride);
+        ST_H8(diff0, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride)
+        dst_org += (8 * stride);
+        ST_H8(diff1, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride)
+        dst_org += (8 * stride);
+        ST_H8(diff2, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride)
+        dst_org += (8 * stride);
+        ST_H8(diff3, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride)
+        dst_org += (8 * stride);
         dst += 2;
     }
@@ -49,7 +49,7 @@
     PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m);              \
     PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m);          \
     AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m);  \
-    ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride);                     \
+    ST_D4(tmp0_m, tmp1_m, 0, 1, 0, 1, pdst_m, stride);            \
 }
 
 static void common_hz_bil_4w_msa(const uint8_t *src, int32_t src_stride,
@@ -584,7 +584,7 @@ static void common_hv_bil_8w_msa(const uint8_t *src, int32_t src_stride,
              sum0, sum1, sum2, sum3);
         SRARI_H4_UH(sum0, sum1, sum2, sum3, 2);
         PCKEV_B2_SB(sum1, sum0, sum3, sum2, src0, src1);
-        ST8x4_UB(src0, src1, dst, dst_stride);
+        ST_D4(src0, src1, 0, 1, 0, 1, dst, dst_stride);
         dst += (4 * dst_stride);
         src0 = src4;
     }
@@ -689,9 +689,9 @@ static void common_hv_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride,
     SRA_4V(sum0, sum1, sum2, sum3, 2);
     SRA_4V(sum4, sum5, sum6, sum7, 2);
     PCKEV_B2_SB(sum1, sum0, sum3, sum2, out0, out1);
-    ST8x4_UB(out0, out1, dst, dst_stride);
+    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
     PCKEV_B2_SB(sum5, sum4, sum7, sum6, out0, out1);
-    ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
+    ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
 }
 
 static void common_hv_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
@@ -723,7 +723,7 @@ static void common_hv_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
     SRA_4V(sum0, sum1, sum2, sum3, 2);
     PCKEV_B2_SB(sum1, sum0, sum3, sum2, out0, out1);
-    ST8x4_UB(out0, out1, dst, dst_stride);
+    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
 }
 
 static void common_hv_bil_no_rnd_16x16_msa(const uint8_t *src,
@@ -47,7 +47,6 @@ static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type)
     v4i32 cnst8w = {8, 8, 8, 8};
     v4i32 cnst2048w = {2048, 2048, 2048, 2048};
    v4i32 cnst128w = {128, 128, 128, 128};
-    int nstride = stride;
 
     /* Extended input data */
     LD_SH8(input, 8, r0, r1, r2, r3, r4, r5, r6, r7);
@@ -386,20 +385,14 @@ static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type)
     VSHF_B2_SB(r2_l, r6_l, r3_l, r7_l, mask, mask, d6, d7);
 
     /* Final sequence of operations over-write original dst */
-    ST8x1_UB(d0, dst);
-    ST8x1_UB(d1, dst + nstride);
-    nstride += stride;
-    ST8x1_UB(d2, dst + nstride);
-    nstride += stride;
-    ST8x1_UB(d3, dst + nstride);
-    nstride += stride;
-    ST8x1_UB(d4, dst + nstride);
-    nstride += stride;
-    ST8x1_UB(d5, dst + nstride);
-    nstride += stride;
-    ST8x1_UB(d6, dst + nstride);
-    nstride += stride;
-    ST8x1_UB(d7, dst + nstride);
+    ST_D1(d0, 0, dst);
+    ST_D1(d1, 0, dst + stride);
+    ST_D1(d2, 0, dst + 2 * stride);
+    ST_D1(d3, 0, dst + 3 * stride);
+    ST_D1(d4, 0, dst + 4 * stride);
+    ST_D1(d5, 0, dst + 5 * stride);
+    ST_D1(d6, 0, dst + 6 * stride);
+    ST_D1(d7, 0, dst + 7 * stride);
 }
 
 void ff_vp3_idct_put_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
@@ -424,7 +417,6 @@ void ff_vp3_idct_dc_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
     v4i32 r0, r1, r2, r3, r4, r5, r6, r7;
     v16i8 mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0};
     v16i8 zero = {0};
-    int nstride = line_size;
 
     LD_SB8(dest, line_size, d0, d1, d2, d3, d4, d5, d6, d7);
     ILVR_B4_SW(zero, d0, zero, d1, zero, d2, zero, d3,
@@ -480,20 +472,14 @@ void ff_vp3_idct_dc_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
     VSHF_B2_SB(e6, r6, e7, r7, mask, mask, d6, d7);
 
     /* Final sequence of operations over-write original dst */
-    ST8x1_UB(d0, dest);
-    ST8x1_UB(d1, dest + nstride);
-    nstride += line_size;
-    ST8x1_UB(d2, dest + nstride);
-    nstride += line_size;
-    ST8x1_UB(d3, dest + nstride);
-    nstride += line_size;
-    ST8x1_UB(d4, dest + nstride);
-    nstride += line_size;
-    ST8x1_UB(d5, dest + nstride);
-    nstride += line_size;
-    ST8x1_UB(d6, dest + nstride);
-    nstride += line_size;
-    ST8x1_UB(d7, dest + nstride);
+    ST_D1(d0, 0, dest);
+    ST_D1(d1, 0, dest + line_size);
+    ST_D1(d2, 0, dest + 2 * line_size);
+    ST_D1(d3, 0, dest + 3 * line_size);
+    ST_D1(d4, 0, dest + 4 * line_size);
+    ST_D1(d5, 0, dest + 5 * line_size);
+    ST_D1(d6, 0, dest + 6 * line_size);
+    ST_D1(d7, 0, dest + 7 * line_size);
 
     block[0] = 0;
 }
@@ -537,8 +523,8 @@ void ff_vp3_v_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride,
     VSHF_B2_SB(f0, f1, g0, g1, mask, mask, d1, d2);
 
     /* Final move to first_pixel */
-    ST8x1_UB(d1, first_pixel + nstride);
-    ST8x1_UB(d2, first_pixel);
+    ST_D1(d1, 0, first_pixel + nstride);
+    ST_D1(d2, 0, first_pixel);
 }
 
 void ff_vp3_h_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride,
@@ -583,8 +569,8 @@ void ff_vp3_h_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride,
     g1 = CLIP_SW_0_255(g1);
     VSHF_B2_SB(f0, g0, f1, g1, mask, mask, d1, d2);
     /* Final move to first_pixel */
-    ST2x4_UB(d1, 0, first_pixel - 1, stride);
-    ST2x4_UB(d2, 0, first_pixel - 1 + 4 * stride, stride);
+    ST_H4(d1, 0, 1, 2, 3, first_pixel - 1, stride);
+    ST_H4(d2, 0, 1, 2, 3, first_pixel - 1 + 4 * stride, stride);
 }
 
 void ff_put_no_rnd_pixels_l2_msa(uint8_t *dst, const uint8_t *src1,
@@ -641,10 +627,8 @@ void ff_put_no_rnd_pixels_l2_msa(uint8_t *dst, const uint8_t *src1,
         f2 = (v4i32) __msa_and_v((v16u8)a3, (v16u8)b3);
         t3 = t3 + (v4u32)f2;
-        ST4x4_UB(t0, t0, 0, 1, 2, 3, dst, stride);
-        ST4x4_UB(t1, t1, 0, 1, 2, 3, dst + 4 * stride, stride);
-        ST4x4_UB(t2, t2, 0, 1, 2, 3, dst + 4, stride);
-        ST4x4_UB(t3, t3, 0, 1, 2, 3, dst + 4 + 4 * stride, stride);
+        ST_W8(t0, t1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
+        ST_W8(t2, t3, 0, 1, 2, 3, 0, 1, 2, 3, dst + 4, stride);
     } else {
         int i;
@@ -76,7 +76,8 @@ void ff_vp8_idct_add_msa(uint8_t *dst, int16_t input[16], ptrdiff_t stride)
     res2 = CLIP_SW_0_255(res2);
     res3 = CLIP_SW_0_255(res3);
     VSHF_B2_SB(res0, res1, res2, res3, mask, mask, dest0, dest1);
-    ST4x4_UB(dest0, dest1, 0, 1, 0, 1, dst, stride);
+    ST_W2(dest0, 0, 1, dst, stride);
+    ST_W2(dest1, 0, 1, dst + 2 * stride, stride);
 
     memset(input, 0, 4 * 4 * sizeof(*input));
 }
@@ -97,7 +98,7 @@ void ff_vp8_idct_dc_add_msa(uint8_t *dst, int16_t in_dc[16], ptrdiff_t stride)
     ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3);
     CLIP_SH4_0_255(res0, res1, res2, res3);
     VSHF_B2_SB(res0, res1, res2, res3, mask, mask, dest0, dest1);
-    ST4x4_UB(dest0, dest1, 0, 1, 0, 1, dst, stride);
+    ST_W2(dest0, 0, 1, dst, stride);
+    ST_W2(dest1, 0, 1, dst + 2 * stride, stride);
 
     in_dc[0] = 0;
 }
@@ -540,14 +540,8 @@ void ff_vp8_h_loop_filter_simple_msa(uint8_t *src, ptrdiff_t pitch,
     ILVRL_B2_SH(q0, p0, tmp1, tmp0);
 
     src -= 1;
-    ST2x4_UB(tmp1, 0, src, pitch);
-    src += 4 * pitch;
-    ST2x4_UB(tmp1, 4, src, pitch);
-    src += 4 * pitch;
-    ST2x4_UB(tmp0, 0, src, pitch);
-    src += 4 * pitch;
-    ST2x4_UB(tmp0, 4, src, pitch);
-    src += 4 * pitch;
+    ST_H8(tmp1, 0, 1, 2, 3, 4, 5, 6, 7, src, pitch)
+    ST_H8(tmp0, 0, 1, 2, 3, 4, 5, 6, 7, src + 8 * pitch, pitch)
 }
 
 void ff_vp8_v_loop_filter8uv_inner_msa(uint8_t *src_u, uint8_t *src_v,
@@ -596,7 +590,6 @@ void ff_vp8_h_loop_filter8uv_inner_msa(uint8_t *src_u, uint8_t *src_v,
                                        ptrdiff_t pitch, int b_limit_in,
                                        int limit_in, int thresh_in)
 {
-    uint8_t *temp_src_u, *temp_src_v;
     v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
     v16u8 mask, hev, flat, thresh, limit, b_limit;
     v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
@@ -623,15 +616,8 @@ void ff_vp8_h_loop_filter8uv_inner_msa(uint8_t *src_u, uint8_t *src_v,
     tmp1 = (v4i32) __msa_ilvl_b((v16i8) q1, (v16i8) q0);
     ILVRL_H2_SW(tmp1, tmp0, tmp4, tmp5);
 
-    temp_src_u = src_u - 2;
-    ST4x4_UB(tmp2, tmp2, 0, 1, 2, 3, temp_src_u, pitch);
-    temp_src_u += 4 * pitch;
-    ST4x4_UB(tmp3, tmp3, 0, 1, 2, 3, temp_src_u, pitch);
-
-    temp_src_v = src_v - 2;
-    ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, temp_src_v, pitch);
-    temp_src_v += 4 * pitch;
-    ST4x4_UB(tmp5, tmp5, 0, 1, 2, 3, temp_src_v, pitch);
+    ST_W8(tmp2, tmp3, 0, 1, 2, 3, 0, 1, 2, 3, src_u - 2, pitch);
+    ST_W8(tmp4, tmp5, 0, 1, 2, 3, 0, 1, 2, 3, src_v - 2, pitch);
 }
 
 void ff_vp8_v_loop_filter16_inner_msa(uint8_t *src, ptrdiff_t pitch,
@@ -684,7 +670,6 @@ void ff_vp8_h_loop_filter16_inner_msa(uint8_t *src, ptrdiff_t pitch,
     ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);
 
     src -= 2;
-    ST4x8_UB(tmp2, tmp3, src, pitch);
-    src += (8 * pitch);
-    ST4x8_UB(tmp4, tmp5, src, pitch);
+    ST_W8(tmp2, tmp3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch)
+    ST_W8(tmp4, tmp5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch)
 }
@@ -378,7 +378,8 @@ void ff_tm_4x4_msa(uint8_t *dst, ptrdiff_t dst_stride,
     IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
     SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
     PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
-    ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride);
+    ST_W2(tmp0, 0, 2, dst, dst_stride);
+    ST_W2(tmp1, 0, 2, dst + 2 * dst_stride, dst_stride);
 }
 
 void ff_tm_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride,
@@ -409,7 +410,7 @@ void ff_tm_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride,
         IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
         SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
         PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
-        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
         dst += (4 * dst_stride);
     }
 }