Commit 153c6075 authored by Shiyou Yin, committed by Michael Niedermayer

avutil/mips: refactor msa load and store macros.

Replace STnxm_UB and LDnxm_SH with new macros ST_{H/W/D}{1/2/4/8}.
The old macros are difficult to use because they do not follow the same parameter-passing rules.
Details of the changes are as follows:
1. remove LD4x4_SH.
2. replace ST2x4_UB with ST_H4.
3. replace ST4x2_UB with ST_W2.
4. replace ST4x4_UB with ST_W4.
5. replace ST4x8_UB with ST_W8.
6. replace ST6x4_UB with ST_W2 and ST_H2.
7. replace ST8x1_UB with ST_D1.
8. replace ST8x2_UB with ST_D2.
9. replace ST8x4_UB with ST_D4.
10. replace ST8x8_UB with ST_D8.
11. replace ST12x4_UB with ST_D4 and ST_W4.

Example of a new macro: ST_H4(in, idx0, idx1, idx2, idx3, pdst, stride)
ST_H4 stores four half-word elements of vector 'in' to pdst with the given stride.
About the macro name:
1) 'ST' means store operation.
2) 'H/W/D' means the type of vector element is 'half-word/word/double-word'.
3) The number '1/2/4/8' means how many elements will be stored.
About the macro parameters:
1) 'in0, in1...' are 128-bit vectors.
2) 'idx0, idx1...' are element indices.
3) 'pdst' is the destination pointer to store to.
4) 'stride' is the stride of each store operation.
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
parent 00ed04d6
......@@ -86,10 +86,7 @@ static void h263_h_loop_filter_msa(uint8_t *src, int32_t stride, int32_t qscale)
ILVR_B2_SH(in3, in0, in1, in2, temp0, temp1);
in0 = (v16u8) __msa_ilvr_h(temp1, temp0);
in3 = (v16u8) __msa_ilvl_h(temp1, temp0);
ST4x4_UB(in0, in0, 0, 1, 2, 3, src, stride);
src += 4 * stride;
ST4x4_UB(in3, in3, 0, 1, 2, 3, src, stride);
src += 4 * stride;
ST_W8(in0, in3, 0, 1, 2, 3, 0, 1, 2, 3, src, stride);
}
static void h263_v_loop_filter_msa(uint8_t *src, int32_t stride, int32_t qscale)
......
This diff is collapsed.
......@@ -45,7 +45,7 @@ static void avc_wgt_4x2_msa(uint8_t *data, int32_t stride,
tmp0 = __msa_srlr_h(tmp0, denom);
tmp0 = (v8i16) __msa_sat_u_h((v8u16) tmp0, 7);
src0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
ST4x2_UB(src0, data, stride);
ST_W2(src0, 0, 1, data, stride);
}
static void avc_wgt_4x4_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
......@@ -71,7 +71,7 @@ static void avc_wgt_4x4_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
tmp1 = __msa_srlr_h(tmp1, denom);
SAT_UH2_SH(tmp0, tmp1, 7);
src0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
ST4x4_UB(src0, src0, 0, 1, 2, 3, data, stride);
ST_W4(src0, 0, 1, 2, 3, data, stride);
}
static void avc_wgt_4x8_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
......@@ -102,7 +102,7 @@ static void avc_wgt_4x8_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
SRLR_H4_SH(tmp0, tmp1, tmp2, tmp3, denom);
SAT_UH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
ST4x8_UB(src0, src1, data, stride);
ST_W8(src0, src1, 0, 1, 2, 3, 0, 1, 2, 3, data, stride);
}
static void avc_wgt_8x4_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
......@@ -133,7 +133,7 @@ static void avc_wgt_8x4_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
SRLR_H4_SH(tmp0, tmp1, tmp2, tmp3, denom);
SAT_UH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
ST8x4_UB(src0, src1, data, stride);
ST_D4(src0, src1, 0, 1, 0, 1, data, stride);
}
static void avc_wgt_8x8_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
......@@ -175,7 +175,7 @@ static void avc_wgt_8x8_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1,
src2, src3);
ST8x8_UB(src0, src1, src2, src3, data, stride);
ST_D8(src0, src1, src2, src3, 0, 1, 0, 1, 0, 1, 0, 1, data, stride);
}
static void avc_wgt_8x16_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
......@@ -218,7 +218,7 @@ static void avc_wgt_8x16_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1,
src2, src3);
ST8x8_UB(src0, src1, src2, src3, data, stride);
ST_D8(src0, src1, src2, src3, 0, 1, 0, 1, 0, 1, 0, 1, data, stride);
data += 8 * stride;
}
}
......@@ -253,7 +253,7 @@ static void avc_biwgt_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
tmp0 = __msa_maxi_s_h(tmp0, 0);
tmp0 = __msa_min_s_h(max255, tmp0);
dst0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
ST4x2_UB(dst0, dst, stride);
ST_W2(dst0, 0, 1, dst, stride);
}
static void avc_biwgt_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
......@@ -287,7 +287,7 @@ static void avc_biwgt_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
tmp1 >>= denom;
CLIP_SH2_0_255(tmp0, tmp1);
dst0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
ST_W4(dst0, 0, 1, 2, 3, dst, stride);
}
static void avc_biwgt_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
......@@ -327,7 +327,7 @@ static void avc_biwgt_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
ST4x8_UB(dst0, dst1, dst, stride);
ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
}
static void avc_biwgt_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
......@@ -365,7 +365,7 @@ static void avc_biwgt_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
ST8x4_UB(dst0, dst1, dst, stride);
ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
}
static void avc_biwgt_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
......@@ -417,7 +417,7 @@ static void avc_biwgt_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
CLIP_SH4_0_255(tmp4, tmp5, tmp6, tmp7);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
PCKEV_B2_UB(tmp5, tmp4, tmp7, tmp6, dst2, dst3);
ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride,
......@@ -479,7 +479,7 @@ static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride,
CLIP_SH4_0_255(temp4, temp5, temp6, temp7);
PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
dst0, dst1, dst2, dst3);
ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
dst += 8 * stride;
}
}
......@@ -955,18 +955,18 @@ static void avc_loopfilter_luma_intra_edge_ver_msa(uint8_t *data,
ILVRL_H2_SH(tp3, tp2, tmp6, tmp7);
src = data - 3;
ST4x4_UB(tmp3, tmp3, 0, 1, 2, 3, src, img_width);
ST2x4_UB(tmp2, 0, src + 4, img_width);
ST_W4(tmp3, 0, 1, 2, 3, src, img_width);
ST_H4(tmp2, 0, 1, 2, 3, src + 4, img_width);
src += 4 * img_width;
ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, src, img_width);
ST2x4_UB(tmp2, 4, src + 4, img_width);
ST_W4(tmp4, 0, 1, 2, 3, src, img_width);
ST_H4(tmp2, 4, 5, 6, 7, src + 4, img_width);
src += 4 * img_width;
ST4x4_UB(tmp6, tmp6, 0, 1, 2, 3, src, img_width);
ST2x4_UB(tmp5, 0, src + 4, img_width);
ST_W4(tmp6, 0, 1, 2, 3, src, img_width);
ST_H4(tmp5, 0, 1, 2, 3, src + 4, img_width);
src += 4 * img_width;
ST4x4_UB(tmp7, tmp7, 0, 1, 2, 3, src, img_width);
ST2x4_UB(tmp5, 4, src + 4, img_width);
ST_W4(tmp7, 0, 1, 2, 3, src, img_width);
ST_H4(tmp5, 4, 5, 6, 7, src + 4, img_width);
}
}
}
......@@ -1274,9 +1274,9 @@ static void avc_loopfilter_cb_or_cr_intra_edge_ver_msa(uint8_t *data_cb_or_cr,
tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_or_p0_org, (v16i8) p0_or_q0_org);
data_cb_or_cr -= 1;
ST2x4_UB(tmp1, 0, data_cb_or_cr, img_width);
ST_H4(tmp1, 0, 1, 2, 3, data_cb_or_cr, img_width);
data_cb_or_cr += 4 * img_width;
ST2x4_UB(tmp1, 4, data_cb_or_cr, img_width);
ST_H4(tmp1, 4, 5, 6, 7, data_cb_or_cr, img_width);
}
}
......@@ -2110,9 +2110,9 @@ static void avc_loopfilter_cb_or_cr_inter_edge_ver_msa(uint8_t *data,
q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_org, (v16i8) p0_org);
src = data - 1;
ST2x4_UB(tmp1, 0, src, img_width);
ST_H4(tmp1, 0, 1, 2, 3, src, img_width);
src += 4 * img_width;
ST2x4_UB(tmp1, 4, src, img_width);
ST_H4(tmp1, 4, 5, 6, 7, src, img_width);
}
}
}
......@@ -2136,7 +2136,7 @@ static void avc_h_loop_filter_chroma422_msa(uint8_t *src, int32_t stride,
}
AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res);
ST2x4_UB(res, 0, (src - 1), stride);
ST_H4(res, 0, 1, 2, 3, (src - 1), stride);
src += (4 * stride);
}
}
......
......@@ -237,9 +237,7 @@ static void avc_idct8_addblk_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
CLIP_SH4_0_255(res4, res5, res6, res7);
PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
dst0, dst1, dst2, dst3);
ST8x4_UB(dst0, dst1, dst, dst_stride);
dst += (4 * dst_stride);
ST8x4_UB(dst2, dst3, dst, dst_stride);
ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride)
}
static void avc_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src,
......@@ -269,9 +267,7 @@ static void avc_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src,
CLIP_SH4_0_255(dst4_r, dst5_r, dst6_r, dst7_r);
PCKEV_B4_SB(dst1_r, dst0_r, dst3_r, dst2_r, dst5_r, dst4_r, dst7_r, dst6_r,
dst0, dst1, dst2, dst3);
ST8x4_UB(dst0, dst1, dst, dst_stride);
dst += (4 * dst_stride);
ST8x4_UB(dst2, dst3, dst, dst_stride);
ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride)
}
void ff_h264_idct_add_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
......@@ -340,7 +336,7 @@ void ff_h264_idct4x4_addblk_dc_msa(uint8_t *dst, int16_t *src,
ADD2(pred_r, input_dc, pred_l, input_dc, pred_r, pred_l);
CLIP_SH2_0_255(pred_r, pred_l);
out = __msa_pckev_b((v16i8) pred_l, (v16i8) pred_r);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}
void ff_h264_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src,
......
This diff is collapsed.
......@@ -727,7 +727,7 @@ static void hevc_addblk_4x4_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
ADD2(dst_r0, in0, dst_l0, in1, dst_r0, dst_l0);
CLIP_SH2_0_255(dst_r0, dst_l0);
dst_vec = (v4i32) __msa_pckev_b((v16i8) dst_l0, (v16i8) dst_r0);
ST4x4_UB(dst_vec, dst_vec, 0, 1, 2, 3, dst, stride);
ST_W4(dst_vec, 0, 1, 2, 3, dst, stride);
}
static void hevc_addblk_8x8_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
......@@ -752,8 +752,7 @@ static void hevc_addblk_8x8_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
dst_r0, dst_l0, dst_r1, dst_l1);
CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
PCKEV_B2_SH(dst_l0, dst_r0, dst_l1, dst_r1, dst_r0, dst_r1);
ST8x4_UB(dst_r0, dst_r1, dst, stride);
dst += (4 * stride);
ST_D4(dst_r0, dst_r1, 0, 1, 0, 1, dst, stride);
LD4(temp_dst, stride, dst0, dst1, dst2, dst3);
INSERT_D2_SD(dst0, dst1, dst_vec0);
......@@ -764,7 +763,7 @@ static void hevc_addblk_8x8_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
dst_r0, dst_l0, dst_r1, dst_l1);
CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
PCKEV_B2_SH(dst_l0, dst_r0, dst_l1, dst_r1, dst_r0, dst_r1);
ST8x4_UB(dst_r0, dst_r1, dst, stride);
ST_D4(dst_r0, dst_r1, 0, 1, 0, 1, dst + 4 * stride, stride);
}
static void hevc_addblk_16x16_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
......
......@@ -199,11 +199,9 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
dst_val0 = __msa_copy_u_d((v2i64) dst2, 0);
dst_val1 = __msa_copy_u_d((v2i64) dst2, 1);
ST8x4_UB(dst0, dst1, p2, stride);
p2 += (4 * stride);
SD(dst_val0, p2);
p2 += stride;
SD(dst_val1, p2);
ST_D4(dst0, dst1, 0, 1, 0, 1, p2, stride);
SD(dst_val0, p2 + 4 * stride);
SD(dst_val1, p2 + 5 * stride);
/* strong filter ends */
} else if (flag0 == flag1) { /* weak only */
/* weak filter */
......@@ -288,7 +286,7 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
dst1 = __msa_bmz_v(dst1, dst3, (v16u8) cmp3);
p2 += stride;
ST8x4_UB(dst0, dst1, p2, stride);
ST_D4(dst0, dst1, 0, 1, 0, 1, p2, stride);
/* weak filter ends */
} else { /* strong + weak */
/* strong filter */
......@@ -442,11 +440,9 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
dst_val0 = __msa_copy_u_d((v2i64) dst2, 0);
dst_val1 = __msa_copy_u_d((v2i64) dst2, 1);
ST8x4_UB(dst0, dst1, p2, stride);
p2 += (4 * stride);
SD(dst_val0, p2);
p2 += stride;
SD(dst_val1, p2);
ST_D4(dst0, dst1, 0, 1, 0, 1, p2, stride);
SD(dst_val0, p2 + 4 * stride);
SD(dst_val1, p2 + 5 * stride);
}
}
}
......@@ -976,7 +972,7 @@ static void hevc_loopfilter_chroma_hor_msa(uint8_t *src, int32_t stride,
temp1 = (v8i16) __msa_bmnz_v((v16u8) temp1, (v16u8) q0, (v16u8) tc_pos);
temp0 = (v8i16) __msa_pckev_b((v16i8) temp1, (v16i8) temp0);
ST8x2_UB(temp0, p0_ptr, stride);
ST_D2(temp0, 0, 1, p0_ptr, stride);
}
}
......@@ -1037,9 +1033,7 @@ static void hevc_loopfilter_chroma_ver_msa(uint8_t *src, int32_t stride,
temp0 = (v8i16) __msa_ilvev_b((v16i8) temp1, (v16i8) temp0);
src += 1;
ST2x4_UB(temp0, 0, src, stride);
src += (4 * stride);
ST2x4_UB(temp0, 4, src, stride);
ST_H8(temp0, 0, 1, 2, 3, 4, 5, 6, 7, src, stride);
}
}
......@@ -1087,7 +1081,7 @@ static void hevc_sao_band_filter_4width_msa(uint8_t *dst, int32_t dst_stride,
LD_UB4(src, src_stride, src0, src1, src2, src3);
/* store results */
ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
}
......@@ -1102,7 +1096,7 @@ static void hevc_sao_band_filter_4width_msa(uint8_t *dst, int32_t dst_stride,
dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
/* store results */
ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
}
static void hevc_sao_band_filter_8width_msa(uint8_t *dst, int32_t dst_stride,
......@@ -1153,7 +1147,7 @@ static void hevc_sao_band_filter_8width_msa(uint8_t *dst, int32_t dst_stride,
XORI_B2_128_SB(dst0, dst1);
/* store results */
ST8x4_UB(dst0, dst1, dst, dst_stride);
ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
dst += dst_stride << 2;
}
......@@ -1173,7 +1167,7 @@ static void hevc_sao_band_filter_8width_msa(uint8_t *dst, int32_t dst_stride,
XORI_B2_128_SB(dst0, dst1);
/* store results */
ST8x4_UB(dst0, dst1, dst, dst_stride);
ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
}
static void hevc_sao_band_filter_16multiple_msa(uint8_t *dst,
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -589,7 +589,7 @@ static void hevc_intra_pred_plane_4x4_msa(const uint8_t *src_top,
PCKEV_D2_SH(res1, res0, res3, res2, res0, res1);
SRARI_H2_SH(res0, res1, 3);
src_vec0 = __msa_pckev_b((v16i8) res1, (v16i8) res0);
ST4x4_UB(src_vec0, src_vec0, 0, 1, 2, 3, dst, stride);
ST_W4(src_vec0, 0, 1, 2, 3, dst, stride);
}
static void hevc_intra_pred_plane_8x8_msa(const uint8_t *src_top,
......@@ -656,7 +656,8 @@ static void hevc_intra_pred_plane_8x8_msa(const uint8_t *src_top,
PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
src_vec0, src_vec1, src_vec2, src_vec3);
ST8x8_UB(src_vec0, src_vec1, src_vec2, src_vec3, dst, stride);
ST_D8(src_vec0, src_vec1, src_vec2, src_vec3, 0, 1, 0, 1,
0, 1, 0, 1, dst, stride);
}
static void hevc_intra_pred_plane_16x16_msa(const uint8_t *src_top,
......@@ -1007,7 +1008,7 @@ static void hevc_intra_pred_angular_upper_4width_msa(const uint8_t *src_top,
SRARI_H2_SH(diff1, diff3, 5);
dst_val0 = __msa_pckev_b((v16i8) diff3, (v16i8) diff1);
ST4x4_UB(dst_val0, dst_val0, 0, 1, 2, 3, dst, stride);
ST_W4(dst_val0, 0, 1, 2, 3, dst, stride);
}
static void hevc_intra_pred_angular_upper_8width_msa(const uint8_t *src_top,
......@@ -1104,7 +1105,7 @@ static void hevc_intra_pred_angular_upper_8width_msa(const uint8_t *src_top,
SRARI_H4_SH(diff1, diff3, diff5, diff7, 5);
PCKEV_B2_UB(diff3, diff1, diff7, diff5, dst_val0, dst_val1);
ST8x4_UB(dst_val0, dst_val1, dst, stride);
ST_D4(dst_val0, dst_val1, 0, 1, 0, 1, dst, stride);
dst += (4 * stride);
}
}
......@@ -1425,9 +1426,8 @@ static void hevc_intra_pred_angular_lower_4width_msa(const uint8_t *src_top,
dst_val0 = __msa_pckev_b((v16i8) diff2, (v16i8) diff2);
dst_val1 = __msa_pckod_b((v16i8) diff2, (v16i8) diff2);
ST4x2_UB(dst_val0, dst, stride);
dst += (2 * stride);
ST4x2_UB(dst_val1, dst, stride);
ST_W2(dst_val0, 0, 1, dst, stride);
ST_W2(dst_val1, 0, 1, dst + 2 * stride, stride);
}
static void hevc_intra_pred_angular_lower_8width_msa(const uint8_t *src_top,
......@@ -1526,7 +1526,7 @@ static void hevc_intra_pred_angular_lower_8width_msa(const uint8_t *src_top,
dst_val0, dst_val1, dst_val2, dst_val3);
ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);
ILVRL_H2_SH(diff1, diff0, diff3, diff4);
ST4x8_UB(diff3, diff4, dst_org, stride);
ST_W8(diff3, diff4, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
dst += 4;
}
}
......@@ -1640,9 +1640,9 @@ static void hevc_intra_pred_angular_lower_16width_msa(const uint8_t *src_top,
ILVL_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff2, diff3);
ILVRL_H2_SH(diff1, diff0, diff4, diff5);
ILVRL_H2_SH(diff3, diff2, diff6, diff7);
ST4x8_UB(diff4, diff5, dst_org, stride);
ST_W8(diff4, diff5, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
dst_org += (8 * stride);
ST4x8_UB(diff6, diff7, dst_org, stride);
ST_W8(diff6, diff7, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
dst += 4;
}
}
......@@ -1746,23 +1746,14 @@ static void hevc_intra_pred_angular_lower_32width_msa(const uint8_t *src_top,
ILVRL_B2_SH(dst_val2, dst_val0, diff0, diff1);
ILVRL_B2_SH(dst_val3, dst_val1, diff2, diff3);
ST2x4_UB(diff0, 0, dst_org, stride);
dst_org += (4 * stride);
ST2x4_UB(diff0, 4, dst_org, stride);
dst_org += (4 * stride);
ST2x4_UB(diff1, 0, dst_org, stride);
dst_org += (4 * stride);
ST2x4_UB(diff1, 4, dst_org, stride);
dst_org += (4 * stride);
ST2x4_UB(diff2, 0, dst_org, stride);
dst_org += (4 * stride);
ST2x4_UB(diff2, 4, dst_org, stride);
dst_org += (4 * stride);
ST2x4_UB(diff3, 0, dst_org, stride);
dst_org += (4 * stride);
ST2x4_UB(diff3, 4, dst_org, stride);
dst_org += (4 * stride);
ST_H8(diff0, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride)
dst_org += (8 * stride);
ST_H8(diff1, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride)
dst_org += (8 * stride);
ST_H8(diff2, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride)
dst_org += (8 * stride);
ST_H8(diff3, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride)
dst_org += (8 * stride);
dst += 2;
}
......
......@@ -49,7 +49,7 @@
PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m); \
PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \
AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \
ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \
ST_D4(tmp0_m, tmp1_m, 0, 1, 0, 1, pdst_m, stride); \
}
static void common_hz_bil_4w_msa(const uint8_t *src, int32_t src_stride,
......@@ -584,7 +584,7 @@ static void common_hv_bil_8w_msa(const uint8_t *src, int32_t src_stride,
sum0, sum1, sum2, sum3);
SRARI_H4_UH(sum0, sum1, sum2, sum3, 2);
PCKEV_B2_SB(sum1, sum0, sum3, sum2, src0, src1);
ST8x4_UB(src0, src1, dst, dst_stride);
ST_D4(src0, src1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
src0 = src4;
}
......@@ -689,9 +689,9 @@ static void common_hv_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride,
SRA_4V(sum0, sum1, sum2, sum3, 2);
SRA_4V(sum4, sum5, sum6, sum7, 2);
PCKEV_B2_SB(sum1, sum0, sum3, sum2, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
PCKEV_B2_SB(sum5, sum4, sum7, sum6, out0, out1);
ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
}
static void common_hv_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
......@@ -723,7 +723,7 @@ static void common_hv_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
SRA_4V(sum0, sum1, sum2, sum3, 2);
PCKEV_B2_SB(sum1, sum0, sum3, sum2, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride);
ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
}
static void common_hv_bil_no_rnd_16x16_msa(const uint8_t *src,
......
This diff is collapsed.
......@@ -47,7 +47,6 @@ static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type)
v4i32 cnst8w = {8, 8, 8, 8};
v4i32 cnst2048w = {2048, 2048, 2048, 2048};
v4i32 cnst128w = {128, 128, 128, 128};
int nstride = stride;
/* Extended input data */
LD_SH8(input, 8, r0, r1, r2, r3, r4, r5, r6, r7);
......@@ -386,20 +385,14 @@ static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type)
VSHF_B2_SB(r2_l, r6_l, r3_l, r7_l, mask, mask, d6, d7);
/* Final sequence of operations over-write original dst */
ST8x1_UB(d0, dst);
ST8x1_UB(d1, dst + nstride);
nstride += stride;
ST8x1_UB(d2, dst + nstride);
nstride += stride;
ST8x1_UB(d3, dst + nstride);
nstride += stride;
ST8x1_UB(d4, dst + nstride);
nstride += stride;
ST8x1_UB(d5, dst + nstride);
nstride += stride;
ST8x1_UB(d6, dst + nstride);
nstride += stride;
ST8x1_UB(d7, dst + nstride);
ST_D1(d0, 0, dst);
ST_D1(d1, 0, dst + stride);
ST_D1(d2, 0, dst + 2 * stride);
ST_D1(d3, 0, dst + 3 * stride);
ST_D1(d4, 0, dst + 4 * stride);
ST_D1(d5, 0, dst + 5 * stride);
ST_D1(d6, 0, dst + 6 * stride);
ST_D1(d7, 0, dst + 7 * stride);
}
void ff_vp3_idct_put_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
......@@ -424,7 +417,6 @@ void ff_vp3_idct_dc_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
v4i32 r0, r1, r2, r3, r4, r5, r6, r7;
v16i8 mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0};
v16i8 zero = {0};
int nstride = line_size;
LD_SB8(dest, line_size, d0, d1, d2, d3, d4, d5, d6, d7);
ILVR_B4_SW(zero, d0, zero, d1, zero, d2, zero, d3,
......@@ -480,20 +472,14 @@ void ff_vp3_idct_dc_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
VSHF_B2_SB(e6, r6, e7, r7, mask, mask, d6, d7);
/* Final sequence of operations over-write original dst */
ST8x1_UB(d0, dest);
ST8x1_UB(d1, dest + nstride);
nstride += line_size;
ST8x1_UB(d2, dest + nstride);
nstride += line_size;
ST8x1_UB(d3, dest + nstride);
nstride += line_size;
ST8x1_UB(d4, dest + nstride);
nstride += line_size;
ST8x1_UB(d5, dest + nstride);
nstride += line_size;
ST8x1_UB(d6, dest + nstride);
nstride += line_size;
ST8x1_UB(d7, dest + nstride);
ST_D1(d0, 0, dest);
ST_D1(d1, 0, dest + line_size);
ST_D1(d2, 0, dest + 2 * line_size);
ST_D1(d3, 0, dest + 3 * line_size);
ST_D1(d4, 0, dest + 4 * line_size);
ST_D1(d5, 0, dest + 5 * line_size);
ST_D1(d6, 0, dest + 6 * line_size);
ST_D1(d7, 0, dest + 7 * line_size);
block[0] = 0;
}
......@@ -537,8 +523,8 @@ void ff_vp3_v_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride,
VSHF_B2_SB(f0, f1, g0, g1, mask, mask, d1, d2);
/* Final move to first_pixel */
ST8x1_UB(d1, first_pixel + nstride);
ST8x1_UB(d2, first_pixel);
ST_D1(d1, 0, first_pixel + nstride);
ST_D1(d2, 0, first_pixel);
}
void ff_vp3_h_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride,
......@@ -583,8 +569,8 @@ void ff_vp3_h_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride,
g1 = CLIP_SW_0_255(g1);
VSHF_B2_SB(f0, g0, f1, g1, mask, mask, d1, d2);
/* Final move to first_pixel */
ST2x4_UB(d1, 0, first_pixel - 1, stride);
ST2x4_UB(d2, 0, first_pixel - 1 + 4 * stride, stride);
ST_H4(d1, 0, 1, 2, 3, first_pixel - 1, stride);
ST_H4(d2, 0, 1, 2, 3, first_pixel - 1 + 4 * stride, stride);
}
void ff_put_no_rnd_pixels_l2_msa(uint8_t *dst, const uint8_t *src1,
......@@ -641,10 +627,8 @@ void ff_put_no_rnd_pixels_l2_msa(uint8_t *dst, const uint8_t *src1,
f2 = (v4i32) __msa_and_v((v16u8)a3, (v16u8)b3);
t3 = t3 + (v4u32)f2;
ST4x4_UB(t0, t0, 0, 1, 2, 3, dst, stride);
ST4x4_UB(t1, t1, 0, 1, 2, 3, dst + 4 * stride, stride);
ST4x4_UB(t2, t2, 0, 1, 2, 3, dst + 4, stride);
ST4x4_UB(t3, t3, 0, 1, 2, 3, dst + 4 + 4 * stride, stride);
ST_W8(t0, t1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
ST_W8(t2, t3, 0, 1, 2, 3, 0, 1, 2, 3, dst + 4, stride);
} else {
int i;
......
......@@ -76,7 +76,8 @@ void ff_vp8_idct_add_msa(uint8_t *dst, int16_t input[16], ptrdiff_t stride)
res2 = CLIP_SW_0_255(res2);
res3 = CLIP_SW_0_255(res3);
VSHF_B2_SB(res0, res1, res2, res3, mask, mask, dest0, dest1);
ST4x4_UB(dest0, dest1, 0, 1, 0, 1, dst, stride);
ST_W2(dest0, 0, 1, dst, stride);
ST_W2(dest1, 0, 1, dst + 2 * stride, stride);
memset(input, 0, 4 * 4 * sizeof(*input));
}
......@@ -97,7 +98,8 @@ void ff_vp8_idct_dc_add_msa(uint8_t *dst, int16_t in_dc[16], ptrdiff_t stride)
ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3);
CLIP_SH4_0_255(res0, res1, res2, res3);
VSHF_B2_SB(res0, res1, res2, res3, mask, mask, dest0, dest1);
ST4x4_UB(dest0, dest1, 0, 1, 0, 1, dst, stride);
ST_W2(dest0, 0, 1, dst, stride);
ST_W2(dest1, 0, 1, dst + 2 * stride, stride);
in_dc[0] = 0;
}
......
......@@ -540,14 +540,8 @@ void ff_vp8_h_loop_filter_simple_msa(uint8_t *src, ptrdiff_t pitch,
ILVRL_B2_SH(q0, p0, tmp1, tmp0);
src -= 1;
ST2x4_UB(tmp1, 0, src, pitch);
src += 4 * pitch;
ST2x4_UB(tmp1, 4, src, pitch);
src += 4 * pitch;
ST2x4_UB(tmp0, 0, src, pitch);
src += 4 * pitch;
ST2x4_UB(tmp0, 4, src, pitch);
src += 4 * pitch;
ST_H8(tmp1, 0, 1, 2, 3, 4, 5, 6, 7, src, pitch)
ST_H8(tmp0, 0, 1, 2, 3, 4, 5, 6, 7, src + 8 * pitch, pitch)
}
void ff_vp8_v_loop_filter8uv_inner_msa(uint8_t *src_u, uint8_t *src_v,
......@@ -596,7 +590,6 @@ void ff_vp8_h_loop_filter8uv_inner_msa(uint8_t *src_u, uint8_t *src_v,
ptrdiff_t pitch, int b_limit_in,
int limit_in, int thresh_in)
{
uint8_t *temp_src_u, *temp_src_v;
v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
v16u8 mask, hev, flat, thresh, limit, b_limit;
v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
......@@ -623,15 +616,8 @@ void ff_vp8_h_loop_filter8uv_inner_msa(uint8_t *src_u, uint8_t *src_v,
tmp1 = (v4i32) __msa_ilvl_b((v16i8) q1, (v16i8) q0);
ILVRL_H2_SW(tmp1, tmp0, tmp4, tmp5);
temp_src_u = src_u - 2;
ST4x4_UB(tmp2, tmp2, 0, 1, 2, 3, temp_src_u, pitch);
temp_src_u += 4 * pitch;
ST4x4_UB(tmp3, tmp3, 0, 1, 2, 3, temp_src_u, pitch);
temp_src_v = src_v - 2;
ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, temp_src_v, pitch);
temp_src_v += 4 * pitch;
ST4x4_UB(tmp5, tmp5, 0, 1, 2, 3, temp_src_v, pitch);
ST_W8(tmp2, tmp3, 0, 1, 2, 3, 0, 1, 2, 3, src_u - 2, pitch);
ST_W8(tmp4, tmp5, 0, 1, 2, 3, 0, 1, 2, 3, src_v - 2, pitch);
}
void ff_vp8_v_loop_filter16_inner_msa(uint8_t *src, ptrdiff_t pitch,
......@@ -684,7 +670,6 @@ void ff_vp8_h_loop_filter16_inner_msa(uint8_t *src, ptrdiff_t pitch,
ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);
src -= 2;
ST4x8_UB(tmp2, tmp3, src, pitch);
src += (8 * pitch);
ST4x8_UB(tmp4, tmp5, src, pitch);
ST_W8(tmp2, tmp3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch)
ST_W8(tmp4, tmp5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch)
}
This diff is collapsed.
This diff is collapsed.
......@@ -378,7 +378,8 @@ void ff_tm_4x4_msa(uint8_t *dst, ptrdiff_t dst_stride,
IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride);
ST_W2(tmp0, 0, 2, dst, dst_stride);
ST_W2(tmp1, 0, 2, dst + 2 * dst_stride, dst_stride);
}
void ff_tm_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride,
......@@ -409,7 +410,7 @@ void ff_tm_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride,
IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
ST8x4_UB(tmp0, tmp1, dst, dst_stride);
ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment