Commit 10b77fbf authored by Shivraj Patil, committed by Michael Niedermayer

avcodec/mips: Split uni mc optimizations to new file

This patch moves the HEVC uni mc code to the new file hevc_mc_uni_msa.c.
(There are five sub-modules of HEVC mc functions in total; keeping them all in a single file would make it huge (~750k) and hard to maintain, so the code is split across multiple files.)
This patch also adds a new HEVC header file, libavcodec/mips/hevc_macros_msa.h.
Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
parent 6f2c64fd
@@ -20,6 +20,7 @@ MIPSDSPR1-OBJS-$(CONFIG_AAC_ENCODER)      += mips/aaccoder_mips.o
MIPSFPU-OBJS-$(CONFIG_AAC_ENCODER)        += mips/iirfilter_mips.o
OBJS-$(CONFIG_HEVC_DECODER)               += mips/hevcdsp_init_mips.o
OBJS-$(CONFIG_H264DSP)                    += mips/h264dsp_init_mips.o
MSA-OBJS-$(CONFIG_HEVC_DECODER)           += mips/hevcdsp_msa.o           \
                                             mips/hevc_mc_uni_msa.o
MSA-OBJS-$(CONFIG_H264DSP)                += mips/h264dsp_msa.o
LOONGSON3-OBJS-$(CONFIG_H264DSP)          += mips/h264dsp_mmi.o
/*
* Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_MIPS_HEVC_MACROS_MSA_H
#define AVCODEC_MIPS_HEVC_MACROS_MSA_H
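/* Pack helpers used by the uni hv paths: HEVC_PCK_SW_SB2/SB4 take clipped
 * 32-bit results, keep the low halfword of each element and then the low
 * byte of each halfword, yielding the final 8-bit pixels ready for storing. */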
#define HEVC_PCK_SW_SB2(in0, in1, out) \
{ \
v8i16 tmp0_m; \
\
tmp0_m = __msa_pckev_h((v8i16) in0, (v8i16) in1); \
out = (v4i32) __msa_pckev_b((v16i8) tmp0_m, (v16i8) tmp0_m); \
}
#define HEVC_PCK_SW_SB4(in0, in1, in2, in3, out) \
{ \
v8i16 tmp0_m, tmp1_m; \
\
PCKEV_H2_SH(in0, in1, in2, in3, tmp0_m, tmp1_m); \
out = (v4i32) __msa_pckev_b((v16i8) tmp1_m, (v16i8) tmp0_m); \
}
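/* HEVC_FILT_8TAP evaluates an 8-tap filter as four halfword dot products:
 * each inN holds interleaved pairs of 16-bit taps and each filtN the matching
 * pair of coefficients; the products accumulate into 32-bit sums. */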
#define HEVC_FILT_8TAP(in0, in1, in2, in3, \
filt0, filt1, filt2, filt3) \
( { \
v4i32 out_m; \
\
out_m = __msa_dotp_s_w((v8i16) in0, (v8i16) filt0); \
out_m = __msa_dpadd_s_w(out_m, (v8i16) in1, (v8i16) filt1); \
DPADD_SH2_SW(in2, in3, filt2, filt3, out_m, out_m); \
out_m; \
} )
#endif /* AVCODEC_MIPS_HEVC_MACROS_MSA_H */
/*
* Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/mips/generic_macros_msa.h"
#include "libavcodec/mips/hevcdsp_mips.h"
#include "libavcodec/mips/hevc_macros_msa.h"
static void copy_width8_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
int32_t height)
{
int32_t cnt;
uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
if (0 == height % 12) {
for (cnt = (height / 12); cnt--;) {
LD_UB8(src, src_stride,
src0, src1, src2, src3, src4, src5, src6, src7);
src += (8 * src_stride);
out0 = __msa_copy_u_d((v2i64) src0, 0);
out1 = __msa_copy_u_d((v2i64) src1, 0);
out2 = __msa_copy_u_d((v2i64) src2, 0);
out3 = __msa_copy_u_d((v2i64) src3, 0);
out4 = __msa_copy_u_d((v2i64) src4, 0);
out5 = __msa_copy_u_d((v2i64) src5, 0);
out6 = __msa_copy_u_d((v2i64) src6, 0);
out7 = __msa_copy_u_d((v2i64) src7, 0);
SD4(out0, out1, out2, out3, dst, dst_stride);
dst += (4 * dst_stride);
SD4(out4, out5, out6, out7, dst, dst_stride);
dst += (4 * dst_stride);
LD_UB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
out0 = __msa_copy_u_d((v2i64) src0, 0);
out1 = __msa_copy_u_d((v2i64) src1, 0);
out2 = __msa_copy_u_d((v2i64) src2, 0);
out3 = __msa_copy_u_d((v2i64) src3, 0);
SD4(out0, out1, out2, out3, dst, dst_stride);
dst += (4 * dst_stride);
}
} else if (0 == height % 8) {
for (cnt = height >> 3; cnt--;) {
LD_UB8(src, src_stride,
src0, src1, src2, src3, src4, src5, src6, src7);
src += (8 * src_stride);
out0 = __msa_copy_u_d((v2i64) src0, 0);
out1 = __msa_copy_u_d((v2i64) src1, 0);
out2 = __msa_copy_u_d((v2i64) src2, 0);
out3 = __msa_copy_u_d((v2i64) src3, 0);
out4 = __msa_copy_u_d((v2i64) src4, 0);
out5 = __msa_copy_u_d((v2i64) src5, 0);
out6 = __msa_copy_u_d((v2i64) src6, 0);
out7 = __msa_copy_u_d((v2i64) src7, 0);
SD4(out0, out1, out2, out3, dst, dst_stride);
dst += (4 * dst_stride);
SD4(out4, out5, out6, out7, dst, dst_stride);
dst += (4 * dst_stride);
}
} else if (0 == height % 4) {
for (cnt = (height / 4); cnt--;) {
LD_UB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
out0 = __msa_copy_u_d((v2i64) src0, 0);
out1 = __msa_copy_u_d((v2i64) src1, 0);
out2 = __msa_copy_u_d((v2i64) src2, 0);
out3 = __msa_copy_u_d((v2i64) src3, 0);
SD4(out0, out1, out2, out3, dst, dst_stride);
dst += (4 * dst_stride);
}
} else if (0 == height % 2) {
for (cnt = (height / 2); cnt--;) {
LD_UB2(src, src_stride, src0, src1);
src += (2 * src_stride);
out0 = __msa_copy_u_d((v2i64) src0, 0);
out1 = __msa_copy_u_d((v2i64) src1, 0);
SD(out0, dst);
dst += dst_stride;
SD(out1, dst);
dst += dst_stride;
}
}
}
static void copy_width12_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
int32_t height)
{
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
src += (8 * src_stride);
ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
dst += (8 * dst_stride);
LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
}
static void copy_16multx8mult_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
int32_t height, int32_t width)
{
int32_t cnt, loop_cnt;
uint8_t *src_tmp, *dst_tmp;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
for (cnt = (width >> 4); cnt--;) {
src_tmp = src;
dst_tmp = dst;
for (loop_cnt = (height >> 3); loop_cnt--;) {
LD_UB8(src_tmp, src_stride,
src0, src1, src2, src3, src4, src5, src6, src7);
src_tmp += (8 * src_stride);
ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
dst_tmp, dst_stride);
dst_tmp += (8 * dst_stride);
}
src += 16;
dst += 16;
}
}
static void copy_width16_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
int32_t height)
{
int32_t cnt;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
if (0 == height % 12) {
for (cnt = (height / 12); cnt--;) {
LD_UB8(src, src_stride,
src0, src1, src2, src3, src4, src5, src6, src7);
src += (8 * src_stride);
ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
dst, dst_stride);
dst += (8 * dst_stride);
LD_UB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
ST_UB4(src0, src1, src2, src3, dst, dst_stride);
dst += (4 * dst_stride);
}
} else if (0 == height % 8) {
copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
} else if (0 == height % 4) {
for (cnt = (height >> 2); cnt--;) {
LD_UB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
ST_UB4(src0, src1, src2, src3, dst, dst_stride);
dst += (4 * dst_stride);
}
}
}
static void copy_width24_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
int32_t height)
{
copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
copy_width8_msa(src + 16, src_stride, dst + 16, dst_stride, height);
}
static void copy_width32_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
int32_t height)
{
int32_t cnt;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
if (0 == height % 12) {
for (cnt = (height / 12); cnt--;) {
LD_UB4(src, src_stride, src0, src1, src2, src3);
LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
src += (4 * src_stride);
ST_UB4(src0, src1, src2, src3, dst, dst_stride);
ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
dst += (4 * dst_stride);
LD_UB4(src, src_stride, src0, src1, src2, src3);
LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
src += (4 * src_stride);
ST_UB4(src0, src1, src2, src3, dst, dst_stride);
ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
dst += (4 * dst_stride);
LD_UB4(src, src_stride, src0, src1, src2, src3);
LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
src += (4 * src_stride);
ST_UB4(src0, src1, src2, src3, dst, dst_stride);
ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
dst += (4 * dst_stride);
}
} else if (0 == height % 8) {
copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32);
} else if (0 == height % 4) {
for (cnt = (height >> 2); cnt--;) {
LD_UB4(src, src_stride, src0, src1, src2, src3);
LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
src += (4 * src_stride);
ST_UB4(src0, src1, src2, src3, dst, dst_stride);
ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
dst += (4 * dst_stride);
}
}
}
static void copy_width48_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
int32_t height)
{
copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 48);
}
static void copy_width64_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
int32_t height)
{
copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64);
}
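/* Shuffle control masks for the horizontal filters: used with vshf.b to
 * gather, for every output pixel, overlapping pairs of neighbouring input
 * bytes so that byte-wise dot products can evaluate the 8-tap filter
 * (first row for 8-wide cases, the other two rows for 4-wide cases). */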
uint8_t mc_filt_mask_arr[16 * 3] = {
/* 8 width cases */
0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
/* 4 width cases */
0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
/* 4 width cases */
8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};
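/* FILT_8TAP_DPADD_S_H: 8-tap filtering in the byte domain; vec0..vec3 hold
 * shuffled pairs of source pixels, filt0..filt3 pairs of 8-bit coefficients,
 * and the four dot products are combined with a saturating halfword add. */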
#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, \
filt0, filt1, filt2, filt3) \
( { \
v8i16 tmp0, tmp1; \
\
tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0); \
tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1); \
tmp1 = __msa_dotp_s_h((v16i8) vec2, (v16i8) filt2); \
tmp1 = __msa_dpadd_s_h(tmp1, (v16i8) vec3, (v16i8) filt3); \
tmp0 = __msa_adds_s_h(tmp0, tmp1); \
\
tmp0; \
} )
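/* HORIZ_8TAP_{4,8}WID_4VECS_FILT: horizontal 8-tap filtering of four input
 * vectors at once; the 4-wide variant packs two rows per vector, the 8-wide
 * variant processes one row per vector. */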
#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \
mask0, mask1, mask2, mask3, \
filt0, filt1, filt2, filt3, \
out0, out1) \
{ \
v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
v8i16 res0_m, res1_m, res2_m, res3_m; \
\
VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \
DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m); \
VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \
DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m); \
VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \
DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m); \
VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \
DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m); \
ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1); \
}
#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \
mask0, mask1, mask2, mask3, \
filt0, filt1, filt2, filt3, \
out0, out1, out2, out3) \
{ \
v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m; \
\
VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \
VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \
DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
res0_m, res1_m, res2_m, res3_m); \
VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \
VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \
DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \
res4_m, res5_m, res6_m, res7_m); \
VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \
VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \
DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \
res0_m, res1_m, res2_m, res3_m); \
VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \
VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \
DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \
res4_m, res5_m, res6_m, res7_m); \
ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m, \
res7_m, out0, out1, out2, out3); \
}
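/* Horizontal (hz) 8-tap luma filters for the uni case.  Input pixels are
 * biased to the signed range with xori 128, filtered, shifted right by
 * rnd_val with rounding, saturated and packed back to unsigned bytes. */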
static void common_hz_8t_4x4_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, uint8_t rnd_val)
{
v16u8 mask0, mask1, mask2, mask3, out;
v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
v8i16 filt, out0, out1;
v8i16 rnd_vec;
mask0 = LD_UB(&mc_filt_mask_arr[16]);
src -= 3;
rnd_vec = __msa_fill_h(rnd_val);
/* rearranging filter */
filt = LD_SH(filter);
SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
mask1 = mask0 + 2;
mask2 = mask0 + 4;
mask3 = mask0 + 6;
LD_SB4(src, src_stride, src0, src1, src2, src3);
XORI_B4_128_SB(src0, src1, src2, src3);
HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
mask3, filt0, filt1, filt2, filt3, out0, out1);
SRAR_H2_SH(out0, out1, rnd_vec);
SAT_SH2_SH(out0, out1, 7);
out = PCKEV_XORI128_UB(out0, out1);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}
static void common_hz_8t_4x8_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, uint8_t rnd_val)
{
v16i8 filt0, filt1, filt2, filt3;
v16i8 src0, src1, src2, src3;
v16u8 mask0, mask1, mask2, mask3, out;
v8i16 filt, out0, out1, out2, out3;
v8i16 rnd_vec;
mask0 = LD_UB(&mc_filt_mask_arr[16]);
src -= 3;
rnd_vec = __msa_fill_h(rnd_val);
/* rearranging filter */
filt = LD_SH(filter);
SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
mask1 = mask0 + 2;
mask2 = mask0 + 4;
mask3 = mask0 + 6;
LD_SB4(src, src_stride, src0, src1, src2, src3);
XORI_B4_128_SB(src0, src1, src2, src3);
src += (4 * src_stride);
HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
mask3, filt0, filt1, filt2, filt3, out0, out1);
LD_SB4(src, src_stride, src0, src1, src2, src3);
XORI_B4_128_SB(src0, src1, src2, src3);
HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
mask3, filt0, filt1, filt2, filt3, out2, out3);
SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
SAT_SH4_SH(out0, out1, out2, out3, 7);
out = PCKEV_XORI128_UB(out0, out1);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
out = PCKEV_XORI128_UB(out2, out3);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}
static void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, uint8_t rnd_val)
{
v16u8 mask0, mask1, mask2, mask3, out;
v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
v8i16 filt, out0, out1, out2, out3;
v8i16 rnd_vec;
mask0 = LD_UB(&mc_filt_mask_arr[16]);
src -= 3;
rnd_vec = __msa_fill_h(rnd_val);
/* rearranging filter */
filt = LD_SH(filter);
SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
mask1 = mask0 + 2;
mask2 = mask0 + 4;
mask3 = mask0 + 6;
LD_SB4(src, src_stride, src0, src1, src2, src3);
XORI_B4_128_SB(src0, src1, src2, src3);
src += (4 * src_stride);
HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
mask3, filt0, filt1, filt2, filt3, out0, out1);
LD_SB4(src, src_stride, src0, src1, src2, src3);
XORI_B4_128_SB(src0, src1, src2, src3);
src += (4 * src_stride);
HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
mask3, filt0, filt1, filt2, filt3, out2, out3);
SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
SAT_SH4_SH(out0, out1, out2, out3, 7);
out = PCKEV_XORI128_UB(out0, out1);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
out = PCKEV_XORI128_UB(out2, out3);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
LD_SB4(src, src_stride, src0, src1, src2, src3);
XORI_B4_128_SB(src0, src1, src2, src3);
src += (4 * src_stride);
HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
mask3, filt0, filt1, filt2, filt3, out0, out1);
LD_SB4(src, src_stride, src0, src1, src2, src3);
XORI_B4_128_SB(src0, src1, src2, src3);
src += (4 * src_stride);
HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
mask3, filt0, filt1, filt2, filt3, out2, out3);
SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
SAT_SH4_SH(out0, out1, out2, out3, 7);
out = PCKEV_XORI128_UB(out0, out1);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
out = PCKEV_XORI128_UB(out2, out3);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}
static void common_hz_8t_4w_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, int32_t height, uint8_t rnd_val)
{
if (4 == height) {
common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
} else if (8 == height) {
common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
} else if (16 == height) {
common_hz_8t_4x16_msa(src, src_stride, dst, dst_stride, filter,
rnd_val);
}
}
static void common_hz_8t_8x4_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, uint8_t rnd_val)
{
v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
v8i16 filt, out0, out1, out2, out3;
v8i16 rnd_vec;
mask0 = LD_UB(&mc_filt_mask_arr[0]);
src -= 3;
rnd_vec = __msa_fill_h(rnd_val);
/* rearranging filter */
filt = LD_SH(filter);
SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
mask1 = mask0 + 2;
mask2 = mask0 + 4;
mask3 = mask0 + 6;
LD_SB4(src, src_stride, src0, src1, src2, src3);
XORI_B4_128_SB(src0, src1, src2, src3);
HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
mask3, filt0, filt1, filt2, filt3, out0, out1,
out2, out3);
SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
SAT_SH4_SH(out0, out1, out2, out3, 7);
tmp0 = PCKEV_XORI128_UB(out0, out1);
tmp1 = PCKEV_XORI128_UB(out2, out3);
ST8x4_UB(tmp0, tmp1, dst, dst_stride);
}
static void common_hz_8t_8x8mult_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, int32_t height,
uint8_t rnd_val)
{
uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
v8i16 filt, out0, out1, out2, out3;
v8i16 rnd_vec;
mask0 = LD_UB(&mc_filt_mask_arr[0]);
src -= 3;
rnd_vec = __msa_fill_h(rnd_val);
/* rearranging filter */
filt = LD_SH(filter);
SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
mask1 = mask0 + 2;
mask2 = mask0 + 4;
mask3 = mask0 + 6;
for (loop_cnt = (height >> 2); loop_cnt--;) {
LD_SB4(src, src_stride, src0, src1, src2, src3);
XORI_B4_128_SB(src0, src1, src2, src3);
src += (4 * src_stride);
HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
mask3, filt0, filt1, filt2, filt3, out0,
out1, out2, out3);
SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
SAT_SH4_SH(out0, out1, out2, out3, 7);
tmp0 = PCKEV_XORI128_UB(out0, out1);
tmp1 = PCKEV_XORI128_UB(out2, out3);
ST8x4_UB(tmp0, tmp1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
static void common_hz_8t_8w_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, int32_t height,
uint8_t rnd_val)
{
if (4 == height) {
common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
} else {
common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
height, rnd_val);
}
}
static void common_hz_8t_12w_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, int32_t height,
uint8_t rnd_val)
{
uint8_t *src1_ptr, *dst1;
uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
v8i16 filt, out0, out1, out2, out3;
v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask00, tmp0, tmp1;
v8i16 rnd_vec;
mask00 = LD_UB(&mc_filt_mask_arr[0]);
mask0 = LD_UB(&mc_filt_mask_arr[16]);
rnd_vec = __msa_fill_h(rnd_val);
src1_ptr = src - 3;
dst1 = dst;
dst = dst1 + 8;
src = src1_ptr + 8;
/* rearranging filter */
filt = LD_SH(filter);
SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
mask1 = mask00 + 2;
mask2 = mask00 + 4;
mask3 = mask00 + 6;
mask4 = mask0 + 2;
mask5 = mask0 + 4;
mask6 = mask0 + 6;
for (loop_cnt = (height >> 2); loop_cnt--;) {
/* 8 width */
LD_SB4(src1_ptr, src_stride, src0, src1, src2, src3);
XORI_B4_128_SB(src0, src1, src2, src3);
src1_ptr += (4 * src_stride);
HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask00, mask1, mask2,
mask3, filt0, filt1, filt2, filt3, out0,
out1, out2, out3);
SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
SAT_SH4_SH(out0, out1, out2, out3, 7);
tmp0 = PCKEV_XORI128_UB(out0, out1);
tmp1 = PCKEV_XORI128_UB(out2, out3);
ST8x4_UB(tmp0, tmp1, dst1, dst_stride);
dst1 += (4 * dst_stride);
/* 4 width */
LD_SB4(src, src_stride, src0, src1, src2, src3);
XORI_B4_128_SB(src0, src1, src2, src3);
src += (4 * src_stride);
HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask4, mask5,
mask6, filt0, filt1, filt2, filt3, out0,
out1);
SRAR_H2_SH(out0, out1, rnd_vec);
SAT_SH2_SH(out0, out1, 7);
tmp0 = PCKEV_XORI128_UB(out0, out1);
ST4x4_UB(tmp0, tmp0, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
}
}
static void common_hz_8t_16w_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, int32_t height,
uint8_t rnd_val)
{
uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
v16u8 mask0, mask1, mask2, mask3, out;
v8i16 filt, out0, out1, out2, out3;
v8i16 rnd_vec;
mask0 = LD_UB(&mc_filt_mask_arr[0]);
src -= 3;
rnd_vec = __msa_fill_h(rnd_val);
/* rearranging filter */
filt = LD_SH(filter);
SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
mask1 = mask0 + 2;
mask2 = mask0 + 4;
mask3 = mask0 + 6;
for (loop_cnt = (height >> 1); loop_cnt--;) {
LD_SB2(src, src_stride, src0, src2);
LD_SB2(src + 8, src_stride, src1, src3);
XORI_B4_128_SB(src0, src1, src2, src3);
src += (2 * src_stride);
HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
mask3, filt0, filt1, filt2, filt3, out0,
out1, out2, out3);
SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
SAT_SH4_SH(out0, out1, out2, out3, 7);
out = PCKEV_XORI128_UB(out0, out1);
ST_UB(out, dst);
dst += dst_stride;
out = PCKEV_XORI128_UB(out2, out3);
ST_UB(out, dst);
dst += dst_stride;
}
}
static void common_hz_8t_24w_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, int32_t height,
uint8_t rnd_val)
{
uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
v16i8 vec11;
v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9, out10;
v8i16 out11, filt;
v8i16 rnd_vec;
mask0 = LD_UB(&mc_filt_mask_arr[0]);
src -= 3;
rnd_vec = __msa_fill_h(rnd_val);
/* rearranging filter */
filt = LD_SH(filter);
SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
mask1 = mask0 + 2;
mask2 = mask0 + 4;
mask3 = mask0 + 6;
mask4 = mask0 + 8;
mask5 = mask0 + 10;
mask6 = mask0 + 12;
mask7 = mask0 + 14;
for (loop_cnt = (height >> 1); loop_cnt--;) {
LD_SB2(src, src_stride, src0, src2);
LD_SB2(src + 16, src_stride, src1, src3);
XORI_B4_128_SB(src0, src1, src2, src3);
src += (2 * src_stride);
VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec8);
VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec9);
VSHF_B2_SB(src0, src1, src2, src3, mask4, mask4, vec1, vec3);
DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt0, filt0, filt0, filt0, out0,
out8, out2, out9);
DOTP_SB2_SH(vec1, vec3, filt0, filt0, out1, out3);
VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec8);
VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec9);
VSHF_B2_SB(src0, src1, src2, src3, mask6, mask6, vec1, vec3);
DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt2, filt2, filt2, filt2, out4,
out10, out6, out11);
DOTP_SB2_SH(vec1, vec3, filt2, filt2, out5, out7);
VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec10);
VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec11);
VSHF_B2_SB(src0, src1, src2, src3, mask5, mask5, vec5, vec7);
DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt1, filt1, filt1, filt1,
out0, out8, out2, out9);
DPADD_SB2_SH(vec5, vec7, filt1, filt1, out1, out3);
VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec10);
VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec11);
VSHF_B2_SB(src0, src1, src2, src3, mask7, mask7, vec5, vec7);
DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt3, filt3, filt3, filt3,
out4, out10, out6, out11);
DPADD_SB2_SH(vec5, vec7, filt3, filt3, out5, out7);
ADDS_SH4_SH(out0, out4, out8, out10, out2, out6, out9, out11, out0,
out8, out2, out9);
ADDS_SH2_SH(out1, out5, out3, out7, out1, out3);
SRAR_H4_SH(out0, out8, out2, out9, rnd_vec);
SRAR_H2_SH(out1, out3, rnd_vec);
SAT_SH4_SH(out0, out8, out2, out9, 7);
SAT_SH2_SH(out1, out3, 7);
out = PCKEV_XORI128_UB(out8, out9);
ST8x2_UB(out, dst + 16, dst_stride);
out = PCKEV_XORI128_UB(out0, out1);
ST_UB(out, dst);
dst += dst_stride;
out = PCKEV_XORI128_UB(out2, out3);
ST_UB(out, dst);
dst += dst_stride;
}
}
static void common_hz_8t_32w_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, int32_t height,
uint8_t rnd_val)
{
uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
v16u8 mask0, mask1, mask2, mask3, out;
v8i16 filt, out0, out1, out2, out3;
v8i16 rnd_vec;
mask0 = LD_UB(&mc_filt_mask_arr[0]);
src -= 3;
rnd_vec = __msa_fill_h(rnd_val);
/* rearranging filter */
filt = LD_SH(filter);
SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
mask1 = mask0 + 2;
mask2 = mask0 + 4;
mask3 = mask0 + 6;
for (loop_cnt = (height >> 1); loop_cnt--;) {
src0 = LD_SB(src);
src2 = LD_SB(src + 16);
src3 = LD_SB(src + 24);
src1 = __msa_sldi_b(src2, src0, 8);
src += src_stride;
XORI_B4_128_SB(src0, src1, src2, src3);
HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
mask3, filt0, filt1, filt2, filt3, out0,
out1, out2, out3);
SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
SAT_SH4_SH(out0, out1, out2, out3, 7);
src0 = LD_SB(src);
src2 = LD_SB(src + 16);
src3 = LD_SB(src + 24);
src1 = __msa_sldi_b(src2, src0, 8);
src += src_stride;
out = PCKEV_XORI128_UB(out0, out1);
ST_UB(out, dst);
out = PCKEV_XORI128_UB(out2, out3);
ST_UB(out, dst + 16);
dst += dst_stride;
XORI_B4_128_SB(src0, src1, src2, src3);
HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
mask3, filt0, filt1, filt2, filt3, out0,
out1, out2, out3);
SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
SAT_SH4_SH(out0, out1, out2, out3, 7);
out = PCKEV_XORI128_UB(out0, out1);
ST_UB(out, dst);
out = PCKEV_XORI128_UB(out2, out3);
ST_UB(out, dst + 16);
dst += dst_stride;
}
}
static void common_hz_8t_48w_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, int32_t height,
uint8_t rnd_val)
{
uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3, vec0, vec1, vec2;
v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
v8i16 filt, out0, out1, out2, out3, out4, out5, out6;
v8i16 rnd_vec;
mask0 = LD_UB(&mc_filt_mask_arr[0]);
src -= 3;
rnd_vec = __msa_fill_h(rnd_val);
/* rearranging filter */
filt = LD_SH(filter);
SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
mask1 = mask0 + 2;
mask2 = mask0 + 4;
mask3 = mask0 + 6;
mask4 = mask0 + 8;
mask5 = mask0 + 10;
mask6 = mask0 + 12;
mask7 = mask0 + 14;
for (loop_cnt = height; loop_cnt--;) {
LD_SB3(src, 16, src0, src2, src3);
src1 = __msa_sldi_b(src2, src0, 8);
XORI_B4_128_SB(src0, src1, src2, src3);
VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask0, mask0, mask0,
vec0, vec1, vec2);
DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask1, mask1, mask1,
vec0, vec1, vec2);
DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
out2 = __msa_dpadd_s_h(out2, vec2, filt1);
VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask2, mask2, mask2,
vec0, vec1, vec2);
DOTP_SB3_SH(vec0, vec1, vec2, filt2, filt2, filt2, out3, out4, out5);
VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask3, mask3, mask3,
vec0, vec1, vec2);
DPADD_SB2_SH(vec0, vec1, filt3, filt3, out3, out4);
out5 = __msa_dpadd_s_h(out5, vec2, filt3);
ADDS_SH2_SH(out0, out3, out1, out4, out0, out1);
out2 = __msa_adds_s_h(out2, out5);
SRAR_H2_SH(out0, out1, rnd_vec);
out6 = __msa_srar_h(out2, rnd_vec);
SAT_SH3_SH(out0, out1, out6, 7);
out = PCKEV_XORI128_UB(out0, out1);
ST_UB(out, dst);
src1 = LD_SB(src + 40);
src += src_stride;
src1 = (v16i8) __msa_xori_b((v16u8) src1, 128);
VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask4, mask0, mask0,
vec0, vec1, vec2);
DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask5, mask1, mask1,
vec0, vec1, vec2);
DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
out2 = __msa_dpadd_s_h(out2, vec2, filt1);
VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask6, mask2, mask2,
vec0, vec1, vec2);
DOTP_SB3_SH(vec0, vec1, vec2, filt2, filt2, filt2, out3, out4, out5);
VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask7, mask3, mask3,
vec0, vec1, vec2);
DPADD_SB2_SH(vec0, vec1, filt3, filt3, out3, out4);
out5 = __msa_dpadd_s_h(out5, vec2, filt3);
ADDS_SH2_SH(out0, out3, out1, out4, out3, out4);
out5 = __msa_adds_s_h(out2, out5);
SRAR_H3_SH(out3, out4, out5, rnd_vec);
SAT_SH3_SH(out3, out4, out5, 7);
out = PCKEV_XORI128_UB(out6, out3);
ST_UB(out, dst + 16);
out = PCKEV_XORI128_UB(out4, out5);
ST_UB(out, dst + 32);
dst += dst_stride;
}
}
static void common_hz_8t_64w_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, int32_t height,
uint8_t rnd_val)
{
int32_t loop_cnt;
v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
v16u8 mask0, mask1, mask2, mask3, out;
v8i16 filt, out0, out1, out2, out3;
v8i16 rnd_vec;
mask0 = LD_UB(&mc_filt_mask_arr[0]);
src -= 3;
rnd_vec = __msa_fill_h(rnd_val);
/* rearranging filter */
filt = LD_SH(filter);
SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
mask1 = mask0 + 2;
mask2 = mask0 + 4;
mask3 = mask0 + 6;
for (loop_cnt = height; loop_cnt--;) {
src0 = LD_SB(src);
src2 = LD_SB(src + 16);
src3 = LD_SB(src + 24);
src1 = __msa_sldi_b(src2, src0, 8);
XORI_B4_128_SB(src0, src1, src2, src3);
HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
mask2, mask3, filt0, filt1, filt2, filt3,
out0, out1, out2, out3);
SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
SAT_SH4_SH(out0, out1, out2, out3, 7);
out = PCKEV_XORI128_UB(out0, out1);
ST_UB(out, dst);
out = PCKEV_XORI128_UB(out2, out3);
ST_UB(out, dst + 16);
src0 = LD_SB(src + 32);
src2 = LD_SB(src + 48);
src3 = LD_SB(src + 56);
src1 = __msa_sldi_b(src2, src0, 8);
src += src_stride;
XORI_B4_128_SB(src0, src1, src2, src3);
HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
mask2, mask3, filt0, filt1, filt2, filt3,
out0, out1, out2, out3);
SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
SAT_SH4_SH(out0, out1, out2, out3, 7);
out = PCKEV_XORI128_UB(out0, out1);
ST_UB(out, dst + 32);
out = PCKEV_XORI128_UB(out2, out3);
ST_UB(out, dst + 48);
dst += dst_stride;
}
}
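/* Vertical (vt) 8-tap luma filters for the uni case.  Consecutive rows are
 * byte-interleaved so that FILT_8TAP_DPADD_S_H can run the 8-tap filter as
 * dot products down the columns; the sliding window of interleaved rows is
 * carried in registers between loop iterations. */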
static void common_vt_8t_4w_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, int32_t height,
uint8_t rnd_val)
{
uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
v16i8 src10998, filt0, filt1, filt2, filt3;
v16u8 out;
v8i16 filt, out10, out32;
v8i16 rnd_vec;
src -= (3 * src_stride);
rnd_vec = __msa_fill_h(rnd_val);
filt = LD_SH(filter);
SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
src += (7 * src_stride);
ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
src54_r, src21_r);
ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
src4332, src6554);
XORI_B3_128_SB(src2110, src4332, src6554);
for (loop_cnt = (height >> 2); loop_cnt--;) {
LD_SB4(src, src_stride, src7, src8, src9, src10);
src += (4 * src_stride);
ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
src87_r, src98_r, src109_r);
ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
XORI_B2_128_SB(src8776, src10998);
out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
filt1, filt2, filt3);
out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
filt1, filt2, filt3);
SRAR_H2_SH(out10, out32, rnd_vec);
SAT_SH2_SH(out10, out32, 7);
out = PCKEV_XORI128_UB(out10, out32);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
src2110 = src6554;
src4332 = src8776;
src6554 = src10998;
src6 = src10;
}
}
static void common_vt_8t_8w_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, int32_t height,
uint8_t rnd_val)
{
uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
v16u8 tmp0, tmp1;
v8i16 filt, out0_r, out1_r, out2_r, out3_r;
v8i16 rnd_vec;
src -= (3 * src_stride);
rnd_vec = __msa_fill_h(rnd_val);
filt = LD_SH(filter);
SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
src += (7 * src_stride);
ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
src54_r, src21_r);
ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
for (loop_cnt = (height >> 2); loop_cnt--;) {
LD_SB4(src, src_stride, src7, src8, src9, src10);
XORI_B4_128_SB(src7, src8, src9, src10);
src += (4 * src_stride);
ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
src87_r, src98_r, src109_r);
out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
filt1, filt2, filt3);
out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
filt1, filt2, filt3);
out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
filt1, filt2, filt3);
out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
filt1, filt2, filt3);
SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
ST8x4_UB(tmp0, tmp1, dst, dst_stride);
dst += (4 * dst_stride);
src10_r = src54_r;
src32_r = src76_r;
src54_r = src98_r;
src21_r = src65_r;
src43_r = src87_r;
src65_r = src109_r;
src6 = src10;
}
}
static void common_vt_8t_12w_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, int32_t height,
uint8_t rnd_val)
{
int32_t loop_cnt;
uint32_t out2, out3;
uint64_t out0, out1;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, res0, res1;
v16i8 res2, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v8i16 vec01, vec23, vec45, vec67, tmp0, tmp1, tmp2;
v8i16 filt, filt0, filt1, filt2, filt3;
v8i16 rnd_vec;
v4i32 mask = { 2, 6, 2, 6 };
src -= (3 * src_stride);
rnd_vec = __msa_fill_h(rnd_val);
/* rearranging filter_y */
filt = LD_SH(filter);
SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
src += (7 * src_stride);
XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
/* 4 width */
VSHF_W2_SB(src0, src1, src1, src2, mask, mask, vec0, vec1);
VSHF_W2_SB(src2, src3, src3, src4, mask, mask, vec2, vec3);
VSHF_W2_SB(src4, src5, src5, src6, mask, mask, vec4, vec5);
for (loop_cnt = (height >> 1); loop_cnt--;) {
LD_SB2(src, src_stride, src7, src8);
XORI_B2_128_SB(src7, src8);
src += (2 * src_stride);
ILVR_B4_SH(src1, src0, src3, src2, src5, src4, src7, src6,
vec01, vec23, vec45, vec67);
tmp0 = FILT_8TAP_DPADD_S_H(vec01, vec23, vec45, vec67, filt0, filt1,
filt2, filt3);
ILVR_B4_SH(src2, src1, src4, src3, src6, src5, src8, src7, vec01, vec23,
vec45, vec67);
tmp1 = FILT_8TAP_DPADD_S_H(vec01, vec23, vec45, vec67, filt0, filt1,
filt2, filt3);
/* 4 width */
VSHF_W2_SB(src6, src7, src7, src8, mask, mask, vec6, vec7);
ILVR_B4_SH(vec1, vec0, vec3, vec2, vec5, vec4, vec7, vec6, vec01, vec23,
vec45, vec67);
tmp2 = FILT_8TAP_DPADD_S_H(vec01, vec23, vec45, vec67, filt0, filt1,
filt2, filt3);
SRAR_H3_SH(tmp0, tmp1, tmp2, rnd_vec);
SAT_SH3_SH(tmp0, tmp1, tmp2, 7);
PCKEV_B3_SB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, res0, res1, res2);
XORI_B3_128_SB(res0, res1, res2);
out0 = __msa_copy_u_d((v2i64) res0, 0);
out1 = __msa_copy_u_d((v2i64) res1, 0);
out2 = __msa_copy_u_w((v4i32) res2, 0);
out3 = __msa_copy_u_w((v4i32) res2, 1);
SD(out0, dst);
SW(out2, (dst + 8));
dst += dst_stride;
SD(out1, dst);
SW(out3, (dst + 8));
dst += dst_stride;
src0 = src2;
src1 = src3;
src2 = src4;
src3 = src5;
src4 = src6;
src5 = src7;
src6 = src8;
vec0 = vec2;
vec1 = vec3;
vec2 = vec4;
vec3 = vec5;
vec4 = vec6;
vec5 = vec7;
}
}
static void common_vt_8t_16w_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, int32_t height,
uint8_t rnd_val)
{
uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
v16i8 filt0, filt1, filt2, filt3;
v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
v16u8 tmp0, tmp1, tmp2, tmp3;
v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
v8i16 rnd_vec;
src -= (3 * src_stride);
rnd_vec = __msa_fill_h(rnd_val);
filt = LD_SH(filter);
SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
src += (7 * src_stride);
ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
src54_r, src21_r);
ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
src54_l, src21_l);
ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
for (loop_cnt = (height >> 2); loop_cnt--;) {
LD_SB4(src, src_stride, src7, src8, src9, src10);
XORI_B4_128_SB(src7, src8, src9, src10);
src += (4 * src_stride);
ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
src87_r, src98_r, src109_r);
ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
src87_l, src98_l, src109_l);
out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
filt1, filt2, filt3);
out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
filt1, filt2, filt3);
out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
filt1, filt2, filt3);
out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
filt1, filt2, filt3);
out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
filt1, filt2, filt3);
out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
filt1, filt2, filt3);
out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
filt1, filt2, filt3);
out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
filt1, filt2, filt3);
SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
SRAR_H4_SH(out0_l, out1_l, out2_l, out3_l, rnd_vec);
SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
out3_r, tmp0, tmp1, tmp2, tmp3);
XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
dst += (4 * dst_stride);
src10_r = src54_r;
src32_r = src76_r;
src54_r = src98_r;
src21_r = src65_r;
src43_r = src87_r;
src65_r = src109_r;
src10_l = src54_l;
src32_l = src76_l;
src54_l = src98_l;
src21_l = src65_l;
src43_l = src87_l;
src65_l = src109_l;
src6 = src10;
}
}
static void common_vt_8t_16w_mult_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, int32_t height,
uint8_t rnd_val, int32_t width)
{
uint8_t *src_tmp;
uint8_t *dst_tmp;
uint32_t loop_cnt, cnt;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
v16i8 filt0, filt1, filt2, filt3;
v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
v16u8 tmp0, tmp1, tmp2, tmp3;
v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
v8i16 rnd_vec;
src -= (3 * src_stride);
rnd_vec = __msa_fill_h(rnd_val);
filt = LD_SH(filter);
SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
for (cnt = (width >> 4); cnt--;) {
src_tmp = src;
dst_tmp = dst;
LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
src_tmp += (7 * src_stride);
ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
src32_r, src54_r, src21_r);
ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
src32_l, src54_l, src21_l);
ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
for (loop_cnt = (height >> 2); loop_cnt--;) {
LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
XORI_B4_128_SB(src7, src8, src9, src10);
src_tmp += (4 * src_stride);
ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
src87_r, src98_r, src109_r);
ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
src87_l, src98_l, src109_l);
out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
filt0, filt1, filt2, filt3);
out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
filt0, filt1, filt2, filt3);
out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
filt0, filt1, filt2, filt3);
out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
filt0, filt1, filt2, filt3);
out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l,
filt0, filt1, filt2, filt3);
out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l,
filt0, filt1, filt2, filt3);
out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l,
filt0, filt1, filt2, filt3);
out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l,
filt0, filt1, filt2, filt3);
SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
SRAR_H4_SH(out0_l, out1_l, out2_l, out3_l, rnd_vec);
SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
out3_r, tmp0, tmp1, tmp2, tmp3);
XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
dst_tmp += (4 * dst_stride);
src10_r = src54_r;
src32_r = src76_r;
src54_r = src98_r;
src21_r = src65_r;
src43_r = src87_r;
src65_r = src109_r;
src10_l = src54_l;
src32_l = src76_l;
src54_l = src98_l;
src21_l = src65_l;
src43_l = src87_l;
src65_l = src109_l;
src6 = src10;
}
src += 16;
dst += 16;
}
}
static void common_vt_8t_24w_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, int32_t height, uint8_t rnd_val)
{
common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
rnd_val, 16);
common_vt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride, filter,
height, rnd_val);
}
static void common_vt_8t_32w_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, int32_t height, uint8_t rnd_val)
{
common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
rnd_val, 32);
}
static void common_vt_8t_48w_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, int32_t height, uint8_t rnd_val)
{
common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
rnd_val, 48);
}
static void common_vt_8t_64w_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, int32_t height, uint8_t rnd_val)
{
common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
rnd_val, 64);
}
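/* Combined horizontal + vertical (hv) 8-tap filters for the uni case.  The
 * horizontal pass produces 16-bit intermediates; const_vec = 128 << 6
 * compensates for the -128 bias introduced by xori 128 (the HEVC 8-tap
 * filter taps sum to 64).  The vertical pass then filters those
 * intermediates, shifts right by 6 twice (once plain, once with rounding),
 * clips to [0, 255] and packs the result to bytes. */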
static void hevc_hv_uni_8t_4w_msa(uint8_t *src,
int32_t src_stride,
uint8_t *dst,
int32_t dst_stride,
const int8_t *filter_x,
const int8_t *filter_y,
int32_t height)
{
uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
v8i16 filt0, filt1, filt2, filt3;
v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
v16i8 mask1, mask2, mask3;
v8i16 filter_vec, const_vec;
v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
v4i32 dst0_r, dst1_r;
v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
v8i16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 };
src -= ((3 * src_stride) + 3);
filter_vec = LD_SH(filter_x);
SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
filter_vec = LD_SH(filter_y);
vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
mask1 = mask0 + 2;
mask2 = mask0 + 4;
mask3 = mask0 + 6;
const_vec = __msa_ldi_h(128);
const_vec <<= 6;
LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
src += (7 * src_stride);
XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
vec8, vec9, vec10, vec11);
VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
vec12, vec13, vec14, vec15);
dst30 = const_vec;
DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
dst30, dst30, dst30, dst30);
dst41 = const_vec;
DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
dst41, dst41, dst41, dst41);
dst52 = const_vec;
DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
dst52, dst52, dst52, dst52);
dst63 = const_vec;
DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
dst63, dst63, dst63, dst63);
ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52,
dst10_r, dst21_r, dst32_r);
dst43_r = __msa_ilvl_h(dst41, dst30);
dst54_r = __msa_ilvl_h(dst52, dst41);
dst65_r = __msa_ilvl_h(dst63, dst52);
dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
for (loop_cnt = height >> 1; loop_cnt--;) {
LD_SB2(src, src_stride, src7, src8);
src += 2 * src_stride;
XORI_B2_128_SB(src7, src8);
VSHF_B4_SB(src7, src8, mask0, mask1, mask2, mask3,
vec0, vec1, vec2, vec3);
dst87 = const_vec;
DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
dst87, dst87, dst87, dst87);
dst76_r = __msa_ilvr_h(dst87, dst66);
dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
filt_h0, filt_h1, filt_h2, filt_h3);
dst87_r = __msa_vshf_h(mask4, dst87, dst87);
dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
filt_h0, filt_h1, filt_h2, filt_h3);
dst0_r >>= 6;
dst1_r >>= 6;
SRARI_W2_SW(dst0_r, dst1_r, 6);
dst0_r = CLIP_SW_0_255(dst0_r);
dst1_r = CLIP_SW_0_255(dst1_r);
HEVC_PCK_SW_SB2(dst1_r, dst0_r, dst0_r);
ST4x2_UB(dst0_r, dst, dst_stride);
dst += (2 * dst_stride);
dst10_r = dst32_r;
dst32_r = dst54_r;
dst54_r = dst76_r;
dst21_r = dst43_r;
dst43_r = dst65_r;
dst65_r = dst87_r;
dst66 = (v8i16) __msa_splati_d((v2i64) dst87, 1);
}
}
static void hevc_hv_uni_8t_8multx2mult_msa(uint8_t *src,
int32_t src_stride,
uint8_t *dst,
int32_t dst_stride,
const int8_t *filter_x,
const int8_t *filter_y,
int32_t height, int32_t width)
{
uint32_t loop_cnt, cnt;
uint8_t *src_tmp;
uint8_t *dst_tmp;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
v8i16 filt0, filt1, filt2, filt3;
v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
v16i8 mask1, mask2, mask3;
v8i16 filter_vec, const_vec;
v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
src -= ((3 * src_stride) + 3);
const_vec = __msa_ldi_h(128);
const_vec <<= 6;
filter_vec = LD_SH(filter_x);
SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
filter_vec = LD_SH(filter_y);
vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
mask1 = mask0 + 2;
mask2 = mask0 + 4;
mask3 = mask0 + 6;
for (cnt = width >> 3; cnt--;) {
src_tmp = src;
dst_tmp = dst;
LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
src_tmp += (7 * src_stride);
XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
/* row 0 row 1 row 2 row 3 */
VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
vec0, vec1, vec2, vec3);
VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
vec4, vec5, vec6, vec7);
VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
vec8, vec9, vec10, vec11);
VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
vec12, vec13, vec14, vec15);
dst0 = const_vec;
DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
dst0, dst0, dst0, dst0);
dst1 = const_vec;
DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
dst1, dst1, dst1, dst1);
dst2 = const_vec;
DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
dst2, dst2, dst2, dst2);
dst3 = const_vec;
DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
dst3, dst3, dst3, dst3);
VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
vec0, vec1, vec2, vec3);
VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
vec4, vec5, vec6, vec7);
VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
vec8, vec9, vec10, vec11);
dst4 = const_vec;
DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
dst4, dst4, dst4, dst4);
dst5 = const_vec;
DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
dst5, dst5, dst5, dst5);
dst6 = const_vec;
DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
dst6, dst6, dst6, dst6);
ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
dst10_r, dst32_r, dst54_r, dst21_r);
ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
dst10_l, dst32_l, dst54_l, dst21_l);
ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
for (loop_cnt = height >> 1; loop_cnt--;) {
LD_SB2(src_tmp, src_stride, src7, src8);
XORI_B2_128_SB(src7, src8);
src_tmp += 2 * src_stride;
VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
vec0, vec1, vec2, vec3);
dst7 = const_vec;
DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
dst7, dst7, dst7, dst7);
ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
filt_h0, filt_h1, filt_h2, filt_h3);
dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
filt_h0, filt_h1, filt_h2, filt_h3);
dst0_r >>= 6;
dst0_l >>= 6;
VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
vec0, vec1, vec2, vec3);
dst8 = const_vec;
DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
dst8, dst8, dst8, dst8);
ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
filt_h0, filt_h1, filt_h2, filt_h3);
dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
filt_h0, filt_h1, filt_h2, filt_h3);
dst1_r >>= 6;
dst1_l >>= 6;
SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6);
dst0_r = CLIP_SW_0_255(dst0_r);
dst0_l = CLIP_SW_0_255(dst0_l);
dst1_r = CLIP_SW_0_255(dst1_r);
dst1_l = CLIP_SW_0_255(dst1_l);
HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
ST8x2_UB(dst0_r, dst_tmp, dst_stride);
dst_tmp += (2 * dst_stride);
dst10_r = dst32_r;
dst32_r = dst54_r;
dst54_r = dst76_r;
dst10_l = dst32_l;
dst32_l = dst54_l;
dst54_l = dst76_l;
dst21_r = dst43_r;
dst43_r = dst65_r;
dst65_r = dst87_r;
dst21_l = dst43_l;
dst43_l = dst65_l;
dst65_l = dst87_l;
dst6 = dst8;
}
src += 8;
dst += 8;
}
}
static void hevc_hv_uni_8t_8w_msa(uint8_t *src,
int32_t src_stride,
uint8_t *dst,
int32_t dst_stride,
const int8_t *filter_x,
const int8_t *filter_y,
int32_t height)
{
hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
filter_x, filter_y, height, 8);
}
static void hevc_hv_uni_8t_12w_msa(uint8_t *src,
int32_t src_stride,
uint8_t *dst,
int32_t dst_stride,
const int8_t *filter_x,
const int8_t *filter_y,
int32_t height)
{
hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
filter_x, filter_y, height, 8);
hevc_hv_uni_8t_4w_msa(src + 8, src_stride, dst + 8, dst_stride,
filter_x, filter_y, height);
}
static void hevc_hv_uni_8t_16w_msa(uint8_t *src,
int32_t src_stride,
uint8_t *dst,
int32_t dst_stride,
const int8_t *filter_x,
const int8_t *filter_y,
int32_t height)
{
hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
filter_x, filter_y, height, 16);
}
static void hevc_hv_uni_8t_24w_msa(uint8_t *src,
int32_t src_stride,
uint8_t *dst,
int32_t dst_stride,
const int8_t *filter_x,
const int8_t *filter_y,
int32_t height)
{
hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
filter_x, filter_y, height, 24);
}
static void hevc_hv_uni_8t_32w_msa(uint8_t *src,
int32_t src_stride,
uint8_t *dst,
int32_t dst_stride,
const int8_t *filter_x,
const int8_t *filter_y,
int32_t height)
{
hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
filter_x, filter_y, height, 32);
}
static void hevc_hv_uni_8t_48w_msa(uint8_t *src,
int32_t src_stride,
uint8_t *dst,
int32_t dst_stride,
const int8_t *filter_x,
const int8_t *filter_y,
int32_t height)
{
hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
filter_x, filter_y, height, 48);
}
static void hevc_hv_uni_8t_64w_msa(uint8_t *src,
int32_t src_stride,
uint8_t *dst,
int32_t dst_stride,
const int8_t *filter_x,
const int8_t *filter_y,
int32_t height)
{
hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
filter_x, filter_y, height, 64);
}
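/* Glue macros below generate the exported entry points.  UNI_MC_COPY maps
 * ff_hevc_put_hevc_uni_pel_pixels<W>_8_msa onto the plain copy kernels. */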
#define UNI_MC_COPY(WIDTH) \
void ff_hevc_put_hevc_uni_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \
ptrdiff_t dst_stride, \
uint8_t *src, \
ptrdiff_t src_stride, \
int height, \
intptr_t mx, \
intptr_t my, \
int width) \
{ \
copy_width##WIDTH##_msa(src, src_stride, dst, dst_stride, height); \
}
UNI_MC_COPY(8);
UNI_MC_COPY(12);
UNI_MC_COPY(16);
UNI_MC_COPY(24);
UNI_MC_COPY(32);
UNI_MC_COPY(48);
UNI_MC_COPY(64);
#undef UNI_MC_COPY
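/* UNI_MC maps ff_hevc_put_hevc_uni_qpel_{h,v}<W>_8_msa onto the common
 * horizontal/vertical kernels; mx or my selects the qpel filter table entry
 * and the final rounding shift for 8-bit output is 6. */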
#define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
void ff_hevc_put_hevc_uni_##PEL##_##DIR####WIDTH##_8_msa(uint8_t *dst, \
ptrdiff_t \
dst_stride, \
uint8_t *src, \
ptrdiff_t \
src_stride, \
int height, \
intptr_t mx, \
intptr_t my, \
int width) \
{ \
const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
\
common_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
filter, height, 6); \
}
UNI_MC(qpel, h, 4, 8, hz, mx);
UNI_MC(qpel, h, 8, 8, hz, mx);
UNI_MC(qpel, h, 12, 8, hz, mx);
UNI_MC(qpel, h, 16, 8, hz, mx);
UNI_MC(qpel, h, 24, 8, hz, mx);
UNI_MC(qpel, h, 32, 8, hz, mx);
UNI_MC(qpel, h, 48, 8, hz, mx);
UNI_MC(qpel, h, 64, 8, hz, mx);
UNI_MC(qpel, v, 4, 8, vt, my);
UNI_MC(qpel, v, 8, 8, vt, my);
UNI_MC(qpel, v, 12, 8, vt, my);
UNI_MC(qpel, v, 16, 8, vt, my);
UNI_MC(qpel, v, 24, 8, vt, my);
UNI_MC(qpel, v, 32, 8, vt, my);
UNI_MC(qpel, v, 48, 8, vt, my);
UNI_MC(qpel, v, 64, 8, vt, my);
#undef UNI_MC
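/* UNI_MC_HV() is the 2-D variant: mx and my each index the filter table and
 * the combined hevc_hv_uni_* kernels are called with both filters. */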
#define UNI_MC_HV(PEL, DIR, WIDTH, TAP, DIR1) \
void ff_hevc_put_hevc_uni_##PEL##_##DIR####WIDTH##_8_msa(uint8_t *dst, \
ptrdiff_t \
dst_stride, \
uint8_t *src, \
ptrdiff_t \
src_stride, \
int height, \
intptr_t mx, \
intptr_t my, \
int width) \
{ \
const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
\
hevc_##DIR1##_uni_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, \
dst_stride, filter_x, \
filter_y, height); \
}
UNI_MC_HV(qpel, hv, 4, 8, hv);
UNI_MC_HV(qpel, hv, 8, 8, hv);
UNI_MC_HV(qpel, hv, 12, 8, hv);
UNI_MC_HV(qpel, hv, 16, 8, hv);
UNI_MC_HV(qpel, hv, 24, 8, hv);
UNI_MC_HV(qpel, hv, 32, 8, hv);
UNI_MC_HV(qpel, hv, 48, 8, hv);
UNI_MC_HV(qpel, hv, 64, 8, hv);
#undef UNI_MC_HV
@@ -46,24 +46,6 @@
out; \
} )
#define HEVC_RND_W_CLIP_UNSIGNED_CHAR_W_VEC2(vec0_r, vec0_l, \
vec1_r, vec1_l, \
out0, out1) \
{ \
(vec0_r) = __msa_srari_w((vec0_r), 6); \
(vec0_l) = __msa_srari_w((vec0_l), 6); \
(vec1_r) = __msa_srari_w((vec1_r), 6); \
(vec1_l) = __msa_srari_w((vec1_l), 6); \
\
(vec0_r) = CLIP_UNSIGNED_CHAR_W((vec0_r)); \
(vec0_l) = CLIP_UNSIGNED_CHAR_W((vec0_l)); \
(vec1_r) = CLIP_UNSIGNED_CHAR_W((vec1_r)); \
(vec1_l) = CLIP_UNSIGNED_CHAR_W((vec1_l)); \
\
out0 = (v4i32) __msa_pckev_h((v8i16) (vec0_l), (v8i16) (vec0_r)); \
out1 = (v4i32) __msa_pckev_h((v8i16) (vec1_l), (v8i16) (vec1_r)); \
}
static void hevc_copy_4w_msa(uint8_t * __restrict src, int32_t src_stride,
int16_t * __restrict dst, int32_t dst_stride,
int32_t height)
@@ -2288,2404 +2270,88 @@ static void hevc_hv_8t_64w_msa(uint8_t * __restrict src, int32_t src_stride,
filter_x, filter_y, height, 64);
}
#define MC_COPY(WIDTH) \
void ff_hevc_put_hevc_pel_pixels##WIDTH##_8_msa(int16_t *dst, \
uint8_t *src, \
ptrdiff_t src_stride, \
int height, \
intptr_t mx, \
intptr_t my, \
int width) \
{ \
hevc_copy_##WIDTH##w_msa(src, src_stride, dst, MAX_PB_SIZE, height); \
}
MC_COPY(4);
MC_COPY(6);
MC_COPY(8);
MC_COPY(12);
MC_COPY(16);
MC_COPY(24);
MC_COPY(32);
MC_COPY(48);
MC_COPY(64);
#undef MC_COPY
#define MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
void ff_hevc_put_hevc_##PEL##_##DIR####WIDTH##_8_msa(int16_t *dst, \
uint8_t *src, \
ptrdiff_t src_stride, \
int height, \
intptr_t mx, \
intptr_t my, \
int width) \
{ \
const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
\
hevc_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, \
MAX_PB_SIZE, filter, height); \
}
MC(qpel, h, 4, 8, hz, mx);
MC(qpel, h, 8, 8, hz, mx);
MC(qpel, h, 12, 8, hz, mx);
MC(qpel, h, 16, 8, hz, mx);
MC(qpel, h, 24, 8, hz, mx);
MC(qpel, h, 32, 8, hz, mx);
MC(qpel, h, 48, 8, hz, mx);
MC(qpel, h, 64, 8, hz, mx);
MC(qpel, v, 4, 8, vt, my);
MC(qpel, v, 8, 8, vt, my);
MC(qpel, v, 12, 8, vt, my);
MC(qpel, v, 16, 8, vt, my);
MC(qpel, v, 24, 8, vt, my);
MC(qpel, v, 32, 8, vt, my);
MC(qpel, v, 48, 8, vt, my);
MC(qpel, v, 64, 8, vt, my);
#undef MC
#define MC_HV(PEL, DIR, WIDTH, TAP, DIR1) \
void ff_hevc_put_hevc_##PEL##_##DIR####WIDTH##_8_msa(int16_t *dst, \
uint8_t *src, \
ptrdiff_t src_stride, \
int height, \
intptr_t mx, \
intptr_t my, \
int width) \
{ \
const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
\
hevc_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, MAX_PB_SIZE, \
filter_x, filter_y, height); \
}
MC_HV(qpel, hv, 4, 8, hv);
MC_HV(qpel, hv, 8, 8, hv);
MC_HV(qpel, hv, 12, 8, hv);
MC_HV(qpel, hv, 16, 8, hv);
MC_HV(qpel, hv, 24, 8, hv);
MC_HV(qpel, hv, 32, 8, hv);
MC_HV(qpel, hv, 48, 8, hv);
MC_HV(qpel, hv, 64, 8, hv);
static void hevc_hv_uni_8t_4w_msa(uint8_t * __restrict src,
int32_t src_stride,
uint8_t * __restrict dst,
int32_t dst_stride,
const int8_t * __restrict filter_x,
const int8_t * __restrict filter_y,
int32_t height)
{
uint32_t loop_cnt;
uint32_t out0, out1;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
v8i16 filt0, filt1, filt2, filt3, filter_vec;
v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
v16i8 mask1, mask2, mask3;
v8u16 const_vec;
v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
v4i32 dst0_r, dst1_r;
v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
v16i8 tmp;
v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
v8i16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 };
src -= ((3 * src_stride) + 3);
filter_vec = LOAD_SH(filter_x);
filt0 = __msa_splati_h(filter_vec, 0);
filt1 = __msa_splati_h(filter_vec, 1);
filt2 = __msa_splati_h(filter_vec, 2);
filt3 = __msa_splati_h(filter_vec, 3);
filter_vec = LOAD_SH(filter_y);
tmp = __msa_clti_s_b((v16i8) filter_vec, 0);
filter_vec = (v8i16) __msa_ilvr_b(tmp, (v16i8) filter_vec);
filt_h0 = __msa_splati_w((v4i32) filter_vec, 0);
filt_h1 = __msa_splati_w((v4i32) filter_vec, 1);
filt_h2 = __msa_splati_w((v4i32) filter_vec, 2);
filt_h3 = __msa_splati_w((v4i32) filter_vec, 3);
mask1 = mask0 + 2;
mask2 = mask0 + 4;
mask3 = mask0 + 6;
const_vec = (v8u16) __msa_ldi_h(128);
const_vec <<= 6;
LOAD_7VECS_SB(src, src_stride,
src0, src1, src2, src3, src4, src5, src6);
src += (7 * src_stride);
XORI_B_7VECS_SB(src0, src1, src2, src3, src4, src5, src6,
src0, src1, src2, src3, src4, src5, src6, 128);
/* Row 0 Row 1 Row 2 Row 3 */
vec0 = __msa_vshf_b(mask0, src3, src0);
vec1 = __msa_vshf_b(mask1, src3, src0);
vec2 = __msa_vshf_b(mask2, src3, src0);
vec3 = __msa_vshf_b(mask3, src3, src0);
vec4 = __msa_vshf_b(mask0, src4, src1);
vec5 = __msa_vshf_b(mask1, src4, src1);
vec6 = __msa_vshf_b(mask2, src4, src1);
vec7 = __msa_vshf_b(mask3, src4, src1);
vec8 = __msa_vshf_b(mask0, src5, src2);
vec9 = __msa_vshf_b(mask1, src5, src2);
vec10 = __msa_vshf_b(mask2, src5, src2);
vec11 = __msa_vshf_b(mask3, src5, src2);
vec12 = __msa_vshf_b(mask0, src6, src3);
vec13 = __msa_vshf_b(mask1, src6, src3);
vec14 = __msa_vshf_b(mask2, src6, src3);
vec15 = __msa_vshf_b(mask3, src6, src3);
dst30 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
filt0, filt1, filt2, filt3, const_vec);
dst41 = HEVC_FILT_8TAP_DPADD_H(vec4, vec5, vec6, vec7,
filt0, filt1, filt2, filt3, const_vec);
dst52 = HEVC_FILT_8TAP_DPADD_H(vec8, vec9, vec10, vec11,
filt0, filt1, filt2, filt3, const_vec);
dst63 = HEVC_FILT_8TAP_DPADD_H(vec12, vec13, vec14, vec15,
filt0, filt1, filt2, filt3, const_vec);
dst10_r = __msa_ilvr_h(dst41, dst30);
dst21_r = __msa_ilvr_h(dst52, dst41);
dst32_r = __msa_ilvr_h(dst63, dst52);
dst43_r = __msa_ilvl_h(dst41, dst30);
dst54_r = __msa_ilvl_h(dst52, dst41);
dst65_r = __msa_ilvl_h(dst63, dst52);
dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
for (loop_cnt = height >> 1; loop_cnt--;) {
src7 = LOAD_SB(src);
src += src_stride;
src8 = LOAD_SB(src);
src += src_stride;
XORI_B_2VECS_SB(src7, src8, src7, src8, 128);
vec0 = __msa_vshf_b(mask0, src8, src7);
vec1 = __msa_vshf_b(mask1, src8, src7);
vec2 = __msa_vshf_b(mask2, src8, src7);
vec3 = __msa_vshf_b(mask3, src8, src7);
dst87 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
filt0, filt1, filt2, filt3, const_vec);
dst76_r = __msa_ilvr_h(dst87, dst66);
dst0_r = HEVC_FILT_8TAP_DPADD_W(dst10_r, dst32_r, dst54_r, dst76_r,
filt_h0, filt_h1, filt_h2, filt_h3);
dst87_r = __msa_vshf_h(mask4, dst87, dst87);
dst1_r = HEVC_FILT_8TAP_DPADD_W(dst21_r, dst43_r, dst65_r, dst87_r,
filt_h0, filt_h1, filt_h2, filt_h3);
dst0_r >>= 6;
dst1_r >>= 6;
dst0_r = __msa_srari_w(dst0_r, 6);
dst1_r = __msa_srari_w(dst1_r, 6);
dst0_r = CLIP_UNSIGNED_CHAR_W(dst0_r);
dst1_r = CLIP_UNSIGNED_CHAR_W(dst1_r);
dst0_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r);
dst0_r = (v4i32) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
out0 = __msa_copy_u_w(dst0_r, 0);
out1 = __msa_copy_u_w(dst0_r, 1);
STORE_WORD(dst, out0);
dst += dst_stride;
STORE_WORD(dst, out1);
dst += dst_stride;
dst10_r = dst32_r;
dst32_r = dst54_r;
dst54_r = dst76_r;
dst21_r = dst43_r;
dst43_r = dst65_r;
dst65_r = dst87_r;
dst66 = (v8i16) __msa_splati_d((v2i64) dst87, 1);
}
}
static void hevc_hv_uni_8t_8multx2mult_msa(uint8_t * __restrict src,
int32_t src_stride,
uint8_t * __restrict dst,
int32_t dst_stride,
const int8_t * __restrict filter_x,
const int8_t * __restrict filter_y,
int32_t height, int32_t width)
{
uint32_t loop_cnt, cnt;
uint64_t out0, out1;
uint8_t *src_tmp;
uint8_t *dst_tmp;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
v8i16 filt0, filt1, filt2, filt3, filter_vec;
v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
v16i8 mask1, mask2, mask3;
v8u16 const_vec;
v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
v16i8 tmp;
v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
src -= ((3 * src_stride) + 3);
const_vec = (v8u16) __msa_ldi_h(128);
const_vec <<= 6;
filter_vec = LOAD_SH(filter_x);
filt0 = __msa_splati_h(filter_vec, 0);
filt1 = __msa_splati_h(filter_vec, 1);
filt2 = __msa_splati_h(filter_vec, 2);
filt3 = __msa_splati_h(filter_vec, 3);
filter_vec = LOAD_SH(filter_y);
tmp = __msa_clti_s_b((v16i8) filter_vec, 0);
filter_vec = (v8i16) __msa_ilvr_b(tmp, (v16i8) filter_vec);
filt_h0 = __msa_splati_w((v4i32) filter_vec, 0);
filt_h1 = __msa_splati_w((v4i32) filter_vec, 1);
filt_h2 = __msa_splati_w((v4i32) filter_vec, 2);
filt_h3 = __msa_splati_w((v4i32) filter_vec, 3);
mask1 = mask0 + 2;
mask2 = mask0 + 4;
mask3 = mask0 + 6;
for (cnt = width >> 3; cnt--;) {
src_tmp = src;
dst_tmp = dst;
LOAD_7VECS_SB(src_tmp, src_stride,
src0, src1, src2, src3, src4, src5, src6);
src_tmp += (7 * src_stride);
XORI_B_7VECS_SB(src0, src1, src2, src3, src4, src5, src6,
src0, src1, src2, src3, src4, src5, src6, 128);
/* Row 0 Row 1 Row 2 Row 3 */
vec0 = __msa_vshf_b(mask0, src0, src0);
vec1 = __msa_vshf_b(mask1, src0, src0);
vec2 = __msa_vshf_b(mask2, src0, src0);
vec3 = __msa_vshf_b(mask3, src0, src0);
vec4 = __msa_vshf_b(mask0, src1, src1);
vec5 = __msa_vshf_b(mask1, src1, src1);
vec6 = __msa_vshf_b(mask2, src1, src1);
vec7 = __msa_vshf_b(mask3, src1, src1);
vec8 = __msa_vshf_b(mask0, src2, src2);
vec9 = __msa_vshf_b(mask1, src2, src2);
vec10 = __msa_vshf_b(mask2, src2, src2);
vec11 = __msa_vshf_b(mask3, src2, src2);
vec12 = __msa_vshf_b(mask0, src3, src3);
vec13 = __msa_vshf_b(mask1, src3, src3);
vec14 = __msa_vshf_b(mask2, src3, src3);
vec15 = __msa_vshf_b(mask3, src3, src3);
dst0 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
filt0, filt1, filt2, filt3, const_vec);
dst1 = HEVC_FILT_8TAP_DPADD_H(vec4, vec5, vec6, vec7,
filt0, filt1, filt2, filt3, const_vec);
dst2 = HEVC_FILT_8TAP_DPADD_H(vec8, vec9, vec10, vec11,
filt0, filt1, filt2, filt3, const_vec);
dst3 = HEVC_FILT_8TAP_DPADD_H(vec12, vec13, vec14, vec15,
filt0, filt1, filt2, filt3, const_vec);
vec0 = __msa_vshf_b(mask0, src4, src4);
vec1 = __msa_vshf_b(mask1, src4, src4);
vec2 = __msa_vshf_b(mask2, src4, src4);
vec3 = __msa_vshf_b(mask3, src4, src4);
vec4 = __msa_vshf_b(mask0, src5, src5);
vec5 = __msa_vshf_b(mask1, src5, src5);
vec6 = __msa_vshf_b(mask2, src5, src5);
vec7 = __msa_vshf_b(mask3, src5, src5);
vec8 = __msa_vshf_b(mask0, src6, src6);
vec9 = __msa_vshf_b(mask1, src6, src6);
vec10 = __msa_vshf_b(mask2, src6, src6);
vec11 = __msa_vshf_b(mask3, src6, src6);
dst4 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
filt0, filt1, filt2, filt3, const_vec);
dst5 = HEVC_FILT_8TAP_DPADD_H(vec4, vec5, vec6, vec7,
filt0, filt1, filt2, filt3, const_vec);
dst6 = HEVC_FILT_8TAP_DPADD_H(vec8, vec9, vec10, vec11,
filt0, filt1, filt2, filt3, const_vec);
ILVR_H_6VECS_SH(dst0, dst2, dst4, dst1, dst3, dst5,
dst1, dst3, dst5, dst2, dst4, dst6,
dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r);
ILVL_H_6VECS_SH(dst0, dst2, dst4, dst1, dst3, dst5,
dst1, dst3, dst5, dst2, dst4, dst6,
dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l);
for (loop_cnt = height >> 1; loop_cnt--;) {
src7 = LOAD_SB(src_tmp);
src_tmp += src_stride;
src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
vec0 = __msa_vshf_b(mask0, src7, src7);
vec1 = __msa_vshf_b(mask1, src7, src7);
vec2 = __msa_vshf_b(mask2, src7, src7);
vec3 = __msa_vshf_b(mask3, src7, src7);
dst7 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
filt0, filt1, filt2, filt3,
const_vec);
dst76_r = __msa_ilvr_h(dst7, dst6);
dst76_l = __msa_ilvl_h(dst7, dst6);
dst0_r = HEVC_FILT_8TAP_DPADD_W(dst10_r, dst32_r, dst54_r, dst76_r,
filt_h0, filt_h1, filt_h2, filt_h3);
dst0_l = HEVC_FILT_8TAP_DPADD_W(dst10_l, dst32_l, dst54_l, dst76_l,
filt_h0, filt_h1, filt_h2, filt_h3);
dst0_r >>= 6;
dst0_l >>= 6;
/* Row 8 */
src8 = LOAD_SB(src_tmp);
src_tmp += src_stride;
src8 = (v16i8) __msa_xori_b((v16u8) src8, 128);
vec0 = __msa_vshf_b(mask0, src8, src8);
vec1 = __msa_vshf_b(mask1, src8, src8);
vec2 = __msa_vshf_b(mask2, src8, src8);
vec3 = __msa_vshf_b(mask3, src8, src8);
dst8 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
filt0, filt1, filt2, filt3,
const_vec);
dst87_r = __msa_ilvr_h(dst8, dst7);
dst87_l = __msa_ilvl_h(dst8, dst7);
dst1_r = HEVC_FILT_8TAP_DPADD_W(dst21_r, dst43_r, dst65_r, dst87_r,
filt_h0, filt_h1, filt_h2, filt_h3);
dst1_l = HEVC_FILT_8TAP_DPADD_W(dst21_l, dst43_l, dst65_l, dst87_l,
filt_h0, filt_h1, filt_h2, filt_h3);
dst1_r >>= 6;
dst1_l >>= 6;
HEVC_RND_W_CLIP_UNSIGNED_CHAR_W_VEC2(dst0_r, dst0_l, dst1_r, dst1_l,
dst0_r, dst1_r);
dst0_r = (v4i32) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
out0 = __msa_copy_u_d((v2i64) dst0_r, 0);
out1 = __msa_copy_u_d((v2i64) dst0_r, 1);
STORE_DWORD(dst_tmp, out0);
dst_tmp += dst_stride;
STORE_DWORD(dst_tmp, out1);
dst_tmp += dst_stride;
dst10_r = dst32_r;
dst32_r = dst54_r;
dst54_r = dst76_r;
dst10_l = dst32_l;
dst32_l = dst54_l;
dst54_l = dst76_l;
dst21_r = dst43_r;
dst43_r = dst65_r;
dst65_r = dst87_r;
dst21_l = dst43_l;
dst43_l = dst65_l;
dst65_l = dst87_l;
dst6 = dst8;
}
src += 8;
dst += 8;
}
}
static void hevc_hv_uni_8t_8w_msa(uint8_t * __restrict src,
int32_t src_stride,
uint8_t * __restrict dst,
int32_t dst_stride,
const int8_t * __restrict filter_x,
const int8_t * __restrict filter_y,
int32_t height)
{
hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
filter_x, filter_y, height, 8);
}
static void hevc_hv_uni_8t_12w_msa(uint8_t * __restrict src,
int32_t src_stride,
uint8_t * __restrict dst,
int32_t dst_stride,
const int8_t * __restrict filter_x,
const int8_t * __restrict filter_y,
int32_t height)
{
hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
filter_x, filter_y, height, 8);
hevc_hv_uni_8t_4w_msa(src + 8, src_stride, dst + 8, dst_stride,
filter_x, filter_y, height);
}
static void hevc_hv_uni_8t_16w_msa(uint8_t * __restrict src,
int32_t src_stride,
uint8_t * __restrict dst,
int32_t dst_stride,
const int8_t * __restrict filter_x,
const int8_t * __restrict filter_y,
int32_t height)
{
hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
filter_x, filter_y, height, 16);
}
static void hevc_hv_uni_8t_24w_msa(uint8_t * __restrict src,
int32_t src_stride,
uint8_t * __restrict dst,
int32_t dst_stride,
const int8_t * __restrict filter_x,
const int8_t * __restrict filter_y,
int32_t height)
{
hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
filter_x, filter_y, height, 24);
}
static void hevc_hv_uni_8t_32w_msa(uint8_t * __restrict src,
int32_t src_stride,
uint8_t * __restrict dst,
int32_t dst_stride,
const int8_t * __restrict filter_x,
const int8_t * __restrict filter_y,
int32_t height)
{
hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
filter_x, filter_y, height, 32);
}
static void hevc_hv_uni_8t_48w_msa(uint8_t * __restrict src,
int32_t src_stride,
uint8_t * __restrict dst,
int32_t dst_stride,
const int8_t * __restrict filter_x,
const int8_t * __restrict filter_y,
int32_t height)
{
hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
filter_x, filter_y, height, 48);
}
static void hevc_hv_uni_8t_64w_msa(uint8_t * __restrict src,
int32_t src_stride,
uint8_t * __restrict dst,
int32_t dst_stride,
const int8_t * __restrict filter_x,
const int8_t * __restrict filter_y,
int32_t height)
{
hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
filter_x, filter_y, height, 64);
}
static uint8_t mc_filt_mask_arr[16 * 3] = {
/* 8 width cases */
0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
/* 4 width cases */
0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
/* 4 width cases */
8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};
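/* FILT_8TAP_DPADD_S_H evaluates one 8-tap filter as four byte dot products:
 * taps 0-3 and taps 4-7 are accumulated in separate halves which are then
 * combined with a saturating add. */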
#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, \
filt0, filt1, filt2, filt3) \
( { \
v8i16 tmp0, tmp1; \
\
tmp0 = __msa_dotp_s_h((v16i8) (vec0), (v16i8) (filt0)); \
tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) (vec1), (v16i8) (filt1)); \
tmp1 = __msa_dotp_s_h((v16i8) (vec2), (v16i8) (filt2)); \
tmp1 = __msa_dpadd_s_h(tmp1, (v16i8) (vec3), (v16i8) (filt3)); \
tmp0 = __msa_adds_s_h(tmp0, tmp1); \
\
tmp0; \
} )
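/* HORIZ_8TAP_4WID_4VECS_FILT filters four 4-pixel rows horizontally; the
 * masks (taken from the "4 width" rows of mc_filt_mask_arr) shuffle pixels
 * from two adjacent source rows into each vector, so two rows share one
 * dot-product chain. */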
#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \
mask0, mask1, mask2, mask3, \
filt0, filt1, filt2, filt3, \
out0, out1) \
{ \
v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
v8i16 res0_m, res1_m, res2_m, res3_m; \
\
vec0_m = __msa_vshf_b((v16i8) (mask0), (v16i8) (src1), (v16i8) (src0)); \
vec1_m = __msa_vshf_b((v16i8) (mask0), (v16i8) (src3), (v16i8) (src2)); \
\
res0_m = __msa_dotp_s_h(vec0_m, (v16i8) (filt0)); \
res1_m = __msa_dotp_s_h(vec1_m, (v16i8) (filt0)); \
\
vec2_m = __msa_vshf_b((v16i8) (mask1), (v16i8) (src1), (v16i8) (src0)); \
vec3_m = __msa_vshf_b((v16i8) (mask1), (v16i8) (src3), (v16i8) (src2)); \
\
res0_m = __msa_dpadd_s_h(res0_m, (filt1), vec2_m); \
res1_m = __msa_dpadd_s_h(res1_m, (filt1), vec3_m); \
\
vec4_m = __msa_vshf_b((v16i8) (mask2), (v16i8) (src1), (v16i8) (src0)); \
vec5_m = __msa_vshf_b((v16i8) (mask2), (v16i8) (src3), (v16i8) (src2)); \
\
res2_m = __msa_dotp_s_h((v16i8) (filt2), vec4_m); \
res3_m = __msa_dotp_s_h((v16i8) (filt2), vec5_m); \
\
vec6_m = __msa_vshf_b((v16i8) (mask3), (v16i8) (src1), (v16i8) (src0)); \
vec7_m = __msa_vshf_b((v16i8) (mask3), (v16i8) (src3), (v16i8) (src2)); \
\
res2_m = __msa_dpadd_s_h(res2_m, (v16i8) (filt3), vec6_m); \
res3_m = __msa_dpadd_s_h(res3_m, (v16i8) (filt3), vec7_m); \
\
out0 = __msa_adds_s_h(res0_m, res2_m); \
out1 = __msa_adds_s_h(res1_m, res3_m); \
}
#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \
mask0, mask1, mask2, mask3, \
filt0, filt1, filt2, filt3, \
out0, out1, out2, out3) \
{ \
v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m; \
\
vec0_m = __msa_vshf_b((v16i8) (mask0), (v16i8) (src0), (v16i8) (src0)); \
vec1_m = __msa_vshf_b((v16i8) (mask0), (v16i8) (src1), (v16i8) (src1)); \
vec2_m = __msa_vshf_b((v16i8) (mask0), (v16i8) (src2), (v16i8) (src2)); \
vec3_m = __msa_vshf_b((v16i8) (mask0), (v16i8) (src3), (v16i8) (src3)); \
\
res0_m = __msa_dotp_s_h(vec0_m, (v16i8) (filt0)); \
res1_m = __msa_dotp_s_h(vec1_m, (v16i8) (filt0)); \
res2_m = __msa_dotp_s_h(vec2_m, (v16i8) (filt0)); \
res3_m = __msa_dotp_s_h(vec3_m, (v16i8) (filt0)); \
\
vec0_m = __msa_vshf_b((v16i8) (mask2), (v16i8) (src0), (v16i8) (src0)); \
vec1_m = __msa_vshf_b((v16i8) (mask2), (v16i8) (src1), (v16i8) (src1)); \
vec2_m = __msa_vshf_b((v16i8) (mask2), (v16i8) (src2), (v16i8) (src2)); \
vec3_m = __msa_vshf_b((v16i8) (mask2), (v16i8) (src3), (v16i8) (src3)); \
\
res4_m = __msa_dotp_s_h(vec0_m, (v16i8) (filt2)); \
res5_m = __msa_dotp_s_h(vec1_m, (v16i8) (filt2)); \
res6_m = __msa_dotp_s_h(vec2_m, (v16i8) (filt2)); \
res7_m = __msa_dotp_s_h(vec3_m, (v16i8) (filt2)); \
\
vec4_m = __msa_vshf_b((v16i8) (mask1), (v16i8) (src0), (v16i8) (src0)); \
vec5_m = __msa_vshf_b((v16i8) (mask1), (v16i8) (src1), (v16i8) (src1)); \
vec6_m = __msa_vshf_b((v16i8) (mask1), (v16i8) (src2), (v16i8) (src2)); \
vec7_m = __msa_vshf_b((v16i8) (mask1), (v16i8) (src3), (v16i8) (src3)); \
\
res0_m = __msa_dpadd_s_h(res0_m, (v16i8) (filt1), vec4_m); \
res1_m = __msa_dpadd_s_h(res1_m, (v16i8) (filt1), vec5_m); \
res2_m = __msa_dpadd_s_h(res2_m, (v16i8) (filt1), vec6_m); \
res3_m = __msa_dpadd_s_h(res3_m, (v16i8) (filt1), vec7_m); \
\
vec4_m = __msa_vshf_b((v16i8) (mask3), (v16i8) (src0), (v16i8) (src0)); \
vec5_m = __msa_vshf_b((v16i8) (mask3), (v16i8) (src1), (v16i8) (src1)); \
vec6_m = __msa_vshf_b((v16i8) (mask3), (v16i8) (src2), (v16i8) (src2)); \
vec7_m = __msa_vshf_b((v16i8) (mask3), (v16i8) (src3), (v16i8) (src3)); \
\
res4_m = __msa_dpadd_s_h(res4_m, (v16i8) (filt3), vec4_m); \
res5_m = __msa_dpadd_s_h(res5_m, (v16i8) (filt3), vec5_m); \
res6_m = __msa_dpadd_s_h(res6_m, (v16i8) (filt3), vec6_m); \
res7_m = __msa_dpadd_s_h(res7_m, (v16i8) (filt3), vec7_m); \
\
out0 = __msa_adds_s_h(res0_m, res4_m); \
out1 = __msa_adds_s_h(res1_m, res5_m); \
out2 = __msa_adds_s_h(res2_m, res6_m); \
out3 = __msa_adds_s_h(res3_m, res7_m); \
}
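/* The common_hz_8t_* routines below are the plain 8-tap horizontal filters:
 * input bytes are XORed with 128 so signed dot products can be used, results
 * are rounded and saturated with SRAR_SATURATE_SIGNED_H using the caller's
 * rnd_val, then packed back to bytes (re-applying the 128 offset on store). */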
static void common_hz_8t_4x4_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, uint8_t rnd_val)
{
v16i8 filt0, filt1, filt2, filt3;
v16i8 src0, src1, src2, src3;
v16u8 mask0, mask1, mask2, mask3;
v8i16 filt, out0, out1;
v8u16 rnd_vec;
mask0 = LOAD_UB(&mc_filt_mask_arr[16]);
src -= 3;
/* rearranging filter */
filt = LOAD_SH(filter);
filt0 = (v16i8) __msa_splati_h(filt, 0);
filt1 = (v16i8) __msa_splati_h(filt, 1);
filt2 = (v16i8) __msa_splati_h(filt, 2);
filt3 = (v16i8) __msa_splati_h(filt, 3);
mask1 = mask0 + 2;
mask2 = mask0 + 4;
mask3 = mask0 + 6;
LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
mask3, filt0, filt1, filt2, filt3, out0, out1);
rnd_vec = (v8u16) __msa_fill_h(rnd_val);
out0 = SRAR_SATURATE_SIGNED_H(out0, rnd_vec, 7);
out1 = SRAR_SATURATE_SIGNED_H(out1, rnd_vec, 7);
PCKEV_2B_XORI128_STORE_4_BYTES_4(out0, out1, dst, dst_stride);
}
static void common_hz_8t_4x8_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, uint8_t rnd_val)
{
v16i8 filt0, filt1, filt2, filt3;
v16i8 src0, src1, src2, src3;
v16u8 mask0, mask1, mask2, mask3;
v8i16 filt, out0, out1, out2, out3;
v8u16 rnd_vec;
mask0 = LOAD_UB(&mc_filt_mask_arr[16]);
src -= 3;
/* rearranging filter */
filt = LOAD_SH(filter);
filt0 = (v16i8) __msa_splati_h(filt, 0);
filt1 = (v16i8) __msa_splati_h(filt, 1);
filt2 = (v16i8) __msa_splati_h(filt, 2);
filt3 = (v16i8) __msa_splati_h(filt, 3);
mask1 = mask0 + 2;
mask2 = mask0 + 4;
mask3 = mask0 + 6;
LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
mask3, filt0, filt1, filt2, filt3, out0, out1);
LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
mask3, filt0, filt1, filt2, filt3, out2, out3);
rnd_vec = (v8u16) __msa_fill_h(rnd_val);
out0 = SRAR_SATURATE_SIGNED_H(out0, rnd_vec, 7);
out1 = SRAR_SATURATE_SIGNED_H(out1, rnd_vec, 7);
out2 = SRAR_SATURATE_SIGNED_H(out2, rnd_vec, 7);
out3 = SRAR_SATURATE_SIGNED_H(out3, rnd_vec, 7);
PCKEV_2B_XORI128_STORE_4_BYTES_4(out0, out1, dst, dst_stride);
dst += (4 * dst_stride);
PCKEV_2B_XORI128_STORE_4_BYTES_4(out2, out3, dst, dst_stride);
}
static void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, uint8_t rnd_val)
{
v16i8 filt0, filt1, filt2, filt3;
v16i8 src0, src1, src2, src3;
v16u8 mask0, mask1, mask2, mask3;
v8i16 filt, out0, out1, out2, out3;
v8u16 rnd_vec;
mask0 = LOAD_UB(&mc_filt_mask_arr[16]);
src -= 3;
/* rearranging filter */
filt = LOAD_SH(filter);
filt0 = (v16i8) __msa_splati_h(filt, 0);
filt1 = (v16i8) __msa_splati_h(filt, 1);
filt2 = (v16i8) __msa_splati_h(filt, 2);
filt3 = (v16i8) __msa_splati_h(filt, 3);
mask1 = mask0 + 2;
mask2 = mask0 + 4;
mask3 = mask0 + 6;
LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
mask3, filt0, filt1, filt2, filt3, out0, out1);
LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
mask3, filt0, filt1, filt2, filt3, out2, out3);
rnd_vec = (v8u16) __msa_fill_h(rnd_val);
out0 = SRAR_SATURATE_SIGNED_H(out0, rnd_vec, 7);
out1 = SRAR_SATURATE_SIGNED_H(out1, rnd_vec, 7);
out2 = SRAR_SATURATE_SIGNED_H(out2, rnd_vec, 7);
out3 = SRAR_SATURATE_SIGNED_H(out3, rnd_vec, 7);
PCKEV_2B_XORI128_STORE_4_BYTES_4(out0, out1, dst, dst_stride);
dst += (4 * dst_stride);
PCKEV_2B_XORI128_STORE_4_BYTES_4(out2, out3, dst, dst_stride);
dst += (4 * dst_stride);
LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
mask3, filt0, filt1, filt2, filt3, out0, out1);
LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
mask3, filt0, filt1, filt2, filt3, out2, out3);
rnd_vec = (v8u16) __msa_fill_h(rnd_val);
out0 = SRAR_SATURATE_SIGNED_H(out0, rnd_vec, 7);
out1 = SRAR_SATURATE_SIGNED_H(out1, rnd_vec, 7);
out2 = SRAR_SATURATE_SIGNED_H(out2, rnd_vec, 7);
out3 = SRAR_SATURATE_SIGNED_H(out3, rnd_vec, 7);
PCKEV_2B_XORI128_STORE_4_BYTES_4(out0, out1, dst, dst_stride);
dst += (4 * dst_stride);
PCKEV_2B_XORI128_STORE_4_BYTES_4(out2, out3, dst, dst_stride);
}
static void common_hz_8t_4w_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, int32_t height,
uint8_t rnd_val)
{
if (4 == height) {
common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
} else if (8 == height) {
common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
} else if (16 == height) {
common_hz_8t_4x16_msa(src, src_stride, dst, dst_stride, filter,
rnd_val);
}
}
static void common_hz_8t_8x4_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, uint8_t rnd_val)
{
v16i8 filt0, filt1, filt2, filt3;
v16i8 src0, src1, src2, src3;
v16u8 mask0, mask1, mask2, mask3;
v8i16 filt, out0, out1, out2, out3;
v8u16 rnd_vec;
mask0 = LOAD_UB(&mc_filt_mask_arr[0]);
src -= 3;
/* rearranging filter */
filt = LOAD_SH(filter);
filt0 = (v16i8) __msa_splati_h(filt, 0);
filt1 = (v16i8) __msa_splati_h(filt, 1);
filt2 = (v16i8) __msa_splati_h(filt, 2);
filt3 = (v16i8) __msa_splati_h(filt, 3);
mask1 = mask0 + 2;
mask2 = mask0 + 4;
mask3 = mask0 + 6;
LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
mask3, filt0, filt1, filt2, filt3, out0, out1,
out2, out3);
rnd_vec = (v8u16) __msa_fill_h(rnd_val);
out0 = SRAR_SATURATE_SIGNED_H(out0, rnd_vec, 7);
out1 = SRAR_SATURATE_SIGNED_H(out1, rnd_vec, 7);
out2 = SRAR_SATURATE_SIGNED_H(out2, rnd_vec, 7);
out3 = SRAR_SATURATE_SIGNED_H(out3, rnd_vec, 7);
PCKEV_B_4_XORI128_STORE_8_BYTES_4(out0, out1, out2, out3, dst, dst_stride);
}
static void common_hz_8t_8x8mult_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, int32_t height,
uint8_t rnd_val)
{
uint32_t loop_cnt;
v16i8 filt0, filt1, filt2, filt3;
v16i8 src0, src1, src2, src3;
v16u8 mask0, mask1, mask2, mask3;
v8i16 filt, out0, out1, out2, out3;
v8u16 rnd_vec;
mask0 = LOAD_UB(&mc_filt_mask_arr[0]);
src -= 3;
/* rearranging filter */
filt = LOAD_SH(filter);
filt0 = (v16i8) __msa_splati_h(filt, 0);
filt1 = (v16i8) __msa_splati_h(filt, 1);
filt2 = (v16i8) __msa_splati_h(filt, 2);
filt3 = (v16i8) __msa_splati_h(filt, 3);
mask1 = mask0 + 2;
mask2 = mask0 + 4;
mask3 = mask0 + 6;
rnd_vec = (v8u16) __msa_fill_h(rnd_val);
for (loop_cnt = (height >> 2); loop_cnt--;) {
LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
mask3, filt0, filt1, filt2, filt3, out0,
out1, out2, out3);
out0 = SRAR_SATURATE_SIGNED_H(out0, rnd_vec, 7);
out1 = SRAR_SATURATE_SIGNED_H(out1, rnd_vec, 7);
out2 = SRAR_SATURATE_SIGNED_H(out2, rnd_vec, 7);
out3 = SRAR_SATURATE_SIGNED_H(out3, rnd_vec, 7);
PCKEV_B_4_XORI128_STORE_8_BYTES_4(out0, out1, out2, out3,
dst, dst_stride);
dst += (4 * dst_stride);
}
}
static void common_hz_8t_8w_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, int32_t height,
uint8_t rnd_val)
{
if (4 == height) {
common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
} else {
common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride,
filter, height, rnd_val);
}
}
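/* The 12-wide case below is handled as an 8-wide pass (mask00, 8-width masks)
 * plus a 4-wide pass offset by 8 pixels in both source and destination. */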
static void common_hz_8t_12w_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, int32_t height,
uint8_t rnd_val)
{
uint8_t *src1_ptr, *dst1;
uint32_t loop_cnt;
v16i8 filt0, filt1, filt2, filt3;
v16i8 src0, src1, src2, src3;
v8u16 rnd_vec;
v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask00;
v8i16 filt, out0, out1, out2, out3;
mask00 = LOAD_UB(&mc_filt_mask_arr[0]);
mask0 = LOAD_UB(&mc_filt_mask_arr[16]);
src1_ptr = src - 3;
dst1 = dst;
dst = dst1 + 8;
src = src1_ptr + 8;
/* rearranging filter */
filt = LOAD_SH(filter);
filt0 = (v16i8) __msa_splati_h(filt, 0);
filt1 = (v16i8) __msa_splati_h(filt, 1);
filt2 = (v16i8) __msa_splati_h(filt, 2);
filt3 = (v16i8) __msa_splati_h(filt, 3);
mask1 = mask00 + 2;
mask2 = mask00 + 4;
mask3 = mask00 + 6;
mask4 = mask0 + 2;
mask5 = mask0 + 4;
mask6 = mask0 + 6;
rnd_vec = (v8u16) __msa_fill_h(rnd_val);
for (loop_cnt = (height >> 2); loop_cnt--;) {
/* 8 width */
LOAD_4VECS_SB(src1_ptr, src_stride, src0, src1, src2, src3);
src1_ptr += (4 * src_stride);
XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask00, mask1, mask2,
mask3, filt0, filt1, filt2, filt3, out0,
out1, out2, out3);
out0 = SRAR_SATURATE_SIGNED_H(out0, rnd_vec, 7);
out1 = SRAR_SATURATE_SIGNED_H(out1, rnd_vec, 7);
out2 = SRAR_SATURATE_SIGNED_H(out2, rnd_vec, 7);
out3 = SRAR_SATURATE_SIGNED_H(out3, rnd_vec, 7);
PCKEV_B_4_XORI128_STORE_8_BYTES_4(out0, out1, out2, out3,
dst1, dst_stride);
dst1 += (4 * dst_stride);
/* 4 width */
LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask4, mask5,
mask6, filt0, filt1, filt2, filt3, out0,
out1);
out0 = SRAR_SATURATE_SIGNED_H(out0, rnd_vec, 7);
out1 = SRAR_SATURATE_SIGNED_H(out1, rnd_vec, 7);
PCKEV_2B_XORI128_STORE_4_BYTES_4(out0, out1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
static void common_hz_8t_16w_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, int32_t height,
uint8_t rnd_val)
{
uint32_t loop_cnt;
v16i8 src0, src1, src2, src3;
v16i8 filt0, filt1, filt2, filt3;
v16u8 mask0, mask1, mask2, mask3;
v8i16 filt, out0, out1, out2, out3;
v8u16 rnd_vec;
mask0 = LOAD_UB(&mc_filt_mask_arr[0]);
src -= 3;
/* rearranging filter */
filt = LOAD_SH(filter);
filt0 = (v16i8) __msa_splati_h(filt, 0);
filt1 = (v16i8) __msa_splati_h(filt, 1);
filt2 = (v16i8) __msa_splati_h(filt, 2);
filt3 = (v16i8) __msa_splati_h(filt, 3);
mask1 = mask0 + 2;
mask2 = mask0 + 4;
mask3 = mask0 + 6;
rnd_vec = (v8u16) __msa_fill_h(rnd_val);
for (loop_cnt = (height >> 1); loop_cnt--;) {
src0 = LOAD_SB(src);
src1 = LOAD_SB(src + 8);
src += src_stride;
src2 = LOAD_SB(src);
src3 = LOAD_SB(src + 8);
src += src_stride;
XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
mask3, filt0, filt1, filt2, filt3, out0,
out1, out2, out3);
out0 = SRAR_SATURATE_SIGNED_H(out0, rnd_vec, 7);
out1 = SRAR_SATURATE_SIGNED_H(out1, rnd_vec, 7);
out2 = SRAR_SATURATE_SIGNED_H(out2, rnd_vec, 7);
out3 = SRAR_SATURATE_SIGNED_H(out3, rnd_vec, 7);
PCKEV_B_XORI128_STORE_VEC(out1, out0, dst);
dst += dst_stride;
PCKEV_B_XORI128_STORE_VEC(out3, out2, dst);
dst += dst_stride;
}
}
static void common_hz_8t_24w_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, int32_t height,
uint8_t rnd_val)
{
uint32_t loop_cnt;
v16i8 src0, src1, src2, src3;
v16i8 filt0, filt1, filt2, filt3;
v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
v8i16 out0, out1, out2, out3, out4, out5;
v8i16 out6, out7, out8, out9, out10, out11;
v8i16 filt;
v8u16 rnd_vec;
mask0 = LOAD_SB(&mc_filt_mask_arr[0]);
src -= 3;
/* rearranging filter */
filt = LOAD_SH(filter);
filt0 = (v16i8) __msa_splati_h(filt, 0);
filt1 = (v16i8) __msa_splati_h(filt, 1);
filt2 = (v16i8) __msa_splati_h(filt, 2);
filt3 = (v16i8) __msa_splati_h(filt, 3);
mask1 = mask0 + 2;
mask2 = mask0 + 4;
mask3 = mask0 + 6;
mask4 = mask0 + 8;
mask5 = mask0 + 10;
mask6 = mask0 + 12;
mask7 = mask0 + 14;
rnd_vec = (v8u16) __msa_fill_h(rnd_val);
for (loop_cnt = (height >> 1); loop_cnt--;) {
src0 = LOAD_SB(src);
src1 = LOAD_SB(src + 16);
src += src_stride;
src2 = LOAD_SB(src);
src3 = LOAD_SB(src + 16);
src += src_stride;
XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
vec0 = __msa_vshf_b(mask0, src0, src0);
vec8 = __msa_vshf_b(mask0, src1, src1);
vec2 = __msa_vshf_b(mask0, src2, src2);
vec9 = __msa_vshf_b(mask0, src3, src3);
vec1 = __msa_vshf_b(mask4, src1, src0);
vec3 = __msa_vshf_b(mask4, src3, src2);
out0 = __msa_dotp_s_h(vec0, filt0);
out8 = __msa_dotp_s_h(vec8, filt0);
out2 = __msa_dotp_s_h(vec2, filt0);
out9 = __msa_dotp_s_h(vec9, filt0);
out1 = __msa_dotp_s_h(vec1, filt0);
out3 = __msa_dotp_s_h(vec3, filt0);
vec0 = __msa_vshf_b(mask2, src0, src0);
vec8 = __msa_vshf_b(mask2, src1, src1);
vec2 = __msa_vshf_b(mask2, src2, src2);
vec9 = __msa_vshf_b(mask2, src3, src3);
vec1 = __msa_vshf_b(mask6, src1, src0);
vec3 = __msa_vshf_b(mask6, src3, src2);
out4 = __msa_dotp_s_h(vec0, filt2);
out10 = __msa_dotp_s_h(vec8, filt2);
out6 = __msa_dotp_s_h(vec2, filt2);
out11 = __msa_dotp_s_h(vec9, filt2);
out5 = __msa_dotp_s_h(vec1, filt2);
out7 = __msa_dotp_s_h(vec3, filt2);
vec4 = __msa_vshf_b(mask1, src0, src0);
vec10 = __msa_vshf_b(mask1, src1, src1);
vec6 = __msa_vshf_b(mask1, src2, src2);
vec11 = __msa_vshf_b(mask1, src3, src3);
vec5 = __msa_vshf_b(mask5, src1, src0);
vec7 = __msa_vshf_b(mask5, src3, src2);
out0 = __msa_dpadd_s_h(out0, vec4, filt1);
out8 = __msa_dpadd_s_h(out8, vec10, filt1);
out2 = __msa_dpadd_s_h(out2, vec6, filt1);
out9 = __msa_dpadd_s_h(out9, vec11, filt1);
out1 = __msa_dpadd_s_h(out1, vec5, filt1);
out3 = __msa_dpadd_s_h(out3, vec7, filt1);
vec4 = __msa_vshf_b(mask3, src0, src0);
vec10 = __msa_vshf_b(mask3, src1, src1);
vec6 = __msa_vshf_b(mask3, src2, src2);
vec11 = __msa_vshf_b(mask3, src3, src3);
vec5 = __msa_vshf_b(mask7, src1, src0);
vec7 = __msa_vshf_b(mask7, src3, src2);
out4 = __msa_dpadd_s_h(out4, vec4, filt3);
out10 = __msa_dpadd_s_h(out10, vec10, filt3);
out6 = __msa_dpadd_s_h(out6, vec6, filt3);
out11 = __msa_dpadd_s_h(out11, vec11, filt3);
out5 = __msa_dpadd_s_h(out5, vec5, filt3);
out7 = __msa_dpadd_s_h(out7, vec7, filt3);
out0 = __msa_adds_s_h(out0, out4);
out8 = __msa_adds_s_h(out8, out10);
out2 = __msa_adds_s_h(out2, out6);
out9 = __msa_adds_s_h(out9, out11);
out1 = __msa_adds_s_h(out1, out5);
out3 = __msa_adds_s_h(out3, out7);
out0 = SRAR_SATURATE_SIGNED_H(out0, rnd_vec, 7);
out8 = SRAR_SATURATE_SIGNED_H(out8, rnd_vec, 7);
out2 = SRAR_SATURATE_SIGNED_H(out2, rnd_vec, 7);
out9 = SRAR_SATURATE_SIGNED_H(out9, rnd_vec, 7);
out1 = SRAR_SATURATE_SIGNED_H(out1, rnd_vec, 7);
out3 = SRAR_SATURATE_SIGNED_H(out3, rnd_vec, 7);
PCKEV_B_XORI128_STORE_8_BYTES_2(out8, out9, dst + 16, dst_stride);
PCKEV_B_XORI128_STORE_VEC(out1, out0, dst);
dst += dst_stride;
PCKEV_B_XORI128_STORE_VEC(out3, out2, dst);
dst += dst_stride;
}
}
static void common_hz_8t_32w_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, int32_t height,
uint8_t rnd_val)
{
uint32_t loop_cnt;
v16i8 src0, src1, src2, src3;
v16i8 filt0, filt1, filt2, filt3;
v16u8 mask0, mask1, mask2, mask3;
v8i16 filt, out0, out1, out2, out3;
v8u16 rnd_vec;
mask0 = LOAD_UB(&mc_filt_mask_arr[0]);
src -= 3;
/* rearranging filter */
filt = LOAD_SH(filter);
filt0 = (v16i8) __msa_splati_h(filt, 0);
filt1 = (v16i8) __msa_splati_h(filt, 1);
filt2 = (v16i8) __msa_splati_h(filt, 2);
filt3 = (v16i8) __msa_splati_h(filt, 3);
mask1 = mask0 + 2;
mask2 = mask0 + 4;
mask3 = mask0 + 6;
rnd_vec = (v8u16) __msa_fill_h(rnd_val);
for (loop_cnt = (height >> 1); loop_cnt--;) {
src0 = LOAD_SB(src);
src2 = LOAD_SB(src + 16);
src3 = LOAD_SB(src + 24);
src1 = __msa_sld_b(src2, src0, 8);
src += src_stride;
XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
mask3, filt0, filt1, filt2, filt3, out0,
out1, out2, out3);
out0 = SRAR_SATURATE_SIGNED_H(out0, rnd_vec, 7);
out1 = SRAR_SATURATE_SIGNED_H(out1, rnd_vec, 7);
out2 = SRAR_SATURATE_SIGNED_H(out2, rnd_vec, 7);
out3 = SRAR_SATURATE_SIGNED_H(out3, rnd_vec, 7);
src0 = LOAD_SB(src);
src2 = LOAD_SB(src + 16);
src3 = LOAD_SB(src + 24);
src1 = __msa_sld_b(src2, src0, 8);
src += src_stride;
PCKEV_B_XORI128_STORE_VEC(out1, out0, dst);
PCKEV_B_XORI128_STORE_VEC(out3, out2, (dst + 16));
dst += dst_stride;
XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
mask3, filt0, filt1, filt2, filt3, out0,
out1, out2, out3);
out0 = SRAR_SATURATE_SIGNED_H(out0, rnd_vec, 7);
out1 = SRAR_SATURATE_SIGNED_H(out1, rnd_vec, 7);
out2 = SRAR_SATURATE_SIGNED_H(out2, rnd_vec, 7);
out3 = SRAR_SATURATE_SIGNED_H(out3, rnd_vec, 7);
PCKEV_B_XORI128_STORE_VEC(out1, out0, dst);
PCKEV_B_XORI128_STORE_VEC(out3, out2, (dst + 16));
dst += dst_stride;
}
}
static void common_hz_8t_48w_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, int32_t height,
uint8_t rnd_val)
{
uint32_t loop_cnt;
v16i8 src0, src1, src2, src3;
v16i8 vec0, vec1, vec2;
v16i8 filt0, filt1, filt2, filt3;
v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
v8i16 filt, out0, out1, out2, out3, out4, out5, out6;
v8u16 rnd_vec;
mask0 = LOAD_SB(&mc_filt_mask_arr[0]);
/* rearranging filter */
filt = LOAD_SH(filter);
filt0 = (v16i8) __msa_splati_h(filt, 0);
filt1 = (v16i8) __msa_splati_h(filt, 1);
filt2 = (v16i8) __msa_splati_h(filt, 2);
filt3 = (v16i8) __msa_splati_h(filt, 3);
mask1 = mask0 + 2;
mask2 = mask0 + 4;
mask3 = mask0 + 6;
mask4 = mask0 + 8;
mask5 = mask0 + 10;
mask6 = mask0 + 12;
mask7 = mask0 + 14;
rnd_vec = (v8u16) __msa_fill_h(rnd_val);
src -= 3;
for (loop_cnt = height; loop_cnt--;) {
src0 = LOAD_SB(src);
src2 = LOAD_SB(src + 16);
src3 = LOAD_SB(src + 32);
src1 = __msa_sld_b(src2, src0, 8);
XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
vec0 = __msa_vshf_b(mask0, src0, src0);
vec1 = __msa_vshf_b(mask0, src1, src1);
vec2 = __msa_vshf_b(mask0, src2, src2);
out0 = __msa_dotp_s_h(vec0, filt0);
out1 = __msa_dotp_s_h(vec1, filt0);
out2 = __msa_dotp_s_h(vec2, filt0);
vec0 = __msa_vshf_b(mask1, src0, src0);
vec1 = __msa_vshf_b(mask1, src1, src1);
vec2 = __msa_vshf_b(mask1, src2, src2);
out0 = __msa_dpadd_s_h(out0, vec0, filt1);
out1 = __msa_dpadd_s_h(out1, vec1, filt1);
out2 = __msa_dpadd_s_h(out2, vec2, filt1);
vec0 = __msa_vshf_b(mask2, src0, src0);
vec1 = __msa_vshf_b(mask2, src1, src1);
vec2 = __msa_vshf_b(mask2, src2, src2);
out3 = __msa_dotp_s_h(vec0, filt2);
out4 = __msa_dotp_s_h(vec1, filt2);
out5 = __msa_dotp_s_h(vec2, filt2);
vec0 = __msa_vshf_b(mask3, src0, src0);
vec1 = __msa_vshf_b(mask3, src1, src1);
vec2 = __msa_vshf_b(mask3, src2, src2);
out3 = __msa_dpadd_s_h(out3, vec0, filt3);
out4 = __msa_dpadd_s_h(out4, vec1, filt3);
out5 = __msa_dpadd_s_h(out5, vec2, filt3);
out0 = __msa_adds_s_h(out0, out3);
out1 = __msa_adds_s_h(out1, out4);
out2 = __msa_adds_s_h(out2, out5);
out0 = SRAR_SATURATE_SIGNED_H(out0, rnd_vec, 7);
out1 = SRAR_SATURATE_SIGNED_H(out1, rnd_vec, 7);
out6 = SRAR_SATURATE_SIGNED_H(out2, rnd_vec, 7);
PCKEV_B_XORI128_STORE_VEC(out1, out0, dst);
src1 = LOAD_SB(src + 40);
src1 = (v16i8) __msa_xori_b((v16u8) src1, 128);
vec0 = __msa_vshf_b(mask4, src3, src2);
vec1 = __msa_vshf_b(mask0, src3, src3);
vec2 = __msa_vshf_b(mask0, src1, src1);
out0 = __msa_dotp_s_h(vec0, filt0);
out1 = __msa_dotp_s_h(vec1, filt0);
out2 = __msa_dotp_s_h(vec2, filt0);
vec0 = __msa_vshf_b(mask5, src3, src2);
vec1 = __msa_vshf_b(mask1, src3, src3);
vec2 = __msa_vshf_b(mask1, src1, src1);
out0 = __msa_dpadd_s_h(out0, vec0, filt1);
out1 = __msa_dpadd_s_h(out1, vec1, filt1);
out2 = __msa_dpadd_s_h(out2, vec2, filt1);
vec0 = __msa_vshf_b(mask6, src3, src2);
vec1 = __msa_vshf_b(mask2, src3, src3);
vec2 = __msa_vshf_b(mask2, src1, src1);
out3 = __msa_dotp_s_h(vec0, filt2);
out4 = __msa_dotp_s_h(vec1, filt2);
out5 = __msa_dotp_s_h(vec2, filt2);
vec0 = __msa_vshf_b(mask7, src3, src2);
vec1 = __msa_vshf_b(mask3, src3, src3);
vec2 = __msa_vshf_b(mask3, src1, src1);
out3 = __msa_dpadd_s_h(out3, vec0, filt3);
out4 = __msa_dpadd_s_h(out4, vec1, filt3);
out5 = __msa_dpadd_s_h(out5, vec2, filt3);
out3 = __msa_adds_s_h(out0, out3);
out4 = __msa_adds_s_h(out1, out4);
out5 = __msa_adds_s_h(out2, out5);
out3 = SRAR_SATURATE_SIGNED_H(out3, rnd_vec, 7);
out4 = SRAR_SATURATE_SIGNED_H(out4, rnd_vec, 7);
out5 = SRAR_SATURATE_SIGNED_H(out5, rnd_vec, 7);
PCKEV_B_XORI128_STORE_VEC(out3, out6, (dst + 16));
PCKEV_B_XORI128_STORE_VEC(out5, out4, (dst + 32));
src += src_stride;
dst += dst_stride;
}
}
static void common_hz_8t_64w_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, int32_t height,
uint8_t rnd_val)
{
int32_t loop_cnt;
v16i8 src0, src1, src2, src3;
v16i8 filt0, filt1, filt2, filt3;
v16u8 mask0, mask1, mask2, mask3;
v8i16 filt, out0, out1, out2, out3;
v8u16 rnd_vec;
mask0 = LOAD_UB(&mc_filt_mask_arr[0]);
src -= 3;
/* rearranging filter */
filt = LOAD_SH(filter);
filt0 = (v16i8) __msa_splati_h(filt, 0);
filt1 = (v16i8) __msa_splati_h(filt, 1);
filt2 = (v16i8) __msa_splati_h(filt, 2);
filt3 = (v16i8) __msa_splati_h(filt, 3);
mask1 = mask0 + 2;
mask2 = mask0 + 4;
mask3 = mask0 + 6;
rnd_vec = (v8u16) __msa_fill_h(rnd_val);
for (loop_cnt = height; loop_cnt--;) {
src0 = LOAD_SB(src);
src2 = LOAD_SB(src + 16);
src3 = LOAD_SB(src + 24);
src1 = __msa_sld_b(src2, src0, 8);
XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
mask2, mask3, filt0, filt1, filt2, filt3,
out0, out1, out2, out3);
out0 = SRAR_SATURATE_SIGNED_H(out0, rnd_vec, 7);
out1 = SRAR_SATURATE_SIGNED_H(out1, rnd_vec, 7);
out2 = SRAR_SATURATE_SIGNED_H(out2, rnd_vec, 7);
out3 = SRAR_SATURATE_SIGNED_H(out3, rnd_vec, 7);
PCKEV_B_XORI128_STORE_VEC(out1, out0, dst);
PCKEV_B_XORI128_STORE_VEC(out3, out2, dst + 16);
src0 = LOAD_SB(src + 32);
src2 = LOAD_SB(src + 48);
src3 = LOAD_SB(src + 56);
src1 = __msa_sld_b(src2, src0, 8);
XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
mask2, mask3, filt0, filt1, filt2, filt3,
out0, out1, out2, out3);
out0 = SRAR_SATURATE_SIGNED_H(out0, rnd_vec, 7);
out1 = SRAR_SATURATE_SIGNED_H(out1, rnd_vec, 7);
out2 = SRAR_SATURATE_SIGNED_H(out2, rnd_vec, 7);
out3 = SRAR_SATURATE_SIGNED_H(out3, rnd_vec, 7);
PCKEV_B_XORI128_STORE_VEC(out1, out0, dst + 32);
PCKEV_B_XORI128_STORE_VEC(out3, out2, dst + 48);
src += src_stride;
dst += dst_stride;
}
}
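/* The common_vt_8t_* routines are the vertical 8-tap filters: seven source
 * rows are preloaded and interleaved, each loop iteration filters the next
 * group of rows, and the interleaved history (src10_r, src32_r, ...) is
 * shifted down for the following iteration. */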
static void common_vt_8t_4w_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, int32_t height,
uint8_t rnd_val)
{
uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
v16i8 src2110, src4332, src6554, src8776, src10998;
v16i8 filt0, filt1, filt2, filt3;
v8i16 filt, out10, out32;
v8u16 rnd_vec;
src -= (3 * src_stride);
filt = LOAD_SH(filter);
filt0 = (v16i8) __msa_splati_h(filt, 0);
filt1 = (v16i8) __msa_splati_h(filt, 1);
filt2 = (v16i8) __msa_splati_h(filt, 2);
filt3 = (v16i8) __msa_splati_h(filt, 3);
LOAD_7VECS_SB(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
src += (7 * src_stride);
ILVR_B_6VECS_SB(src0, src2, src4, src1, src3, src5,
src1, src3, src5, src2, src4, src6,
src10_r, src32_r, src54_r, src21_r, src43_r, src65_r);
ILVR_D_3VECS_SB(src2110, src21_r, src10_r, src4332, src43_r, src32_r,
src6554, src65_r, src54_r);
XORI_B_3VECS_SB(src2110, src4332, src6554, src2110, src4332, src6554, 128);
rnd_vec = (v8u16) __msa_fill_h(rnd_val);
for (loop_cnt = (height >> 2); loop_cnt--;) {
LOAD_4VECS_SB(src, src_stride, src7, src8, src9, src10);
src += (4 * src_stride);
ILVR_B_4VECS_SB(src6, src7, src8, src9, src7, src8, src9, src10,
src76_r, src87_r, src98_r, src109_r);
ILVR_D_2VECS_SB(src8776, src87_r, src76_r, src10998, src109_r, src98_r);
XORI_B_2VECS_SB(src8776, src10998, src8776, src10998, 128);
out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776,
filt0, filt1, filt2, filt3);
out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998,
filt0, filt1, filt2, filt3);
out10 = SRAR_SATURATE_SIGNED_H(out10, rnd_vec, 7);
out32 = SRAR_SATURATE_SIGNED_H(out32, rnd_vec, 7);
PCKEV_2B_XORI128_STORE_4_BYTES_4(out10, out32, dst, dst_stride);
dst += (4 * dst_stride);
src2110 = src6554;
src4332 = src8776;
src6554 = src10998;
src6 = src10;
}
}
static void common_vt_8t_8w_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, int32_t height,
uint8_t rnd_val)
{
uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
v16i8 filt0, filt1, filt2, filt3;
v8i16 filt, out0_r, out1_r, out2_r, out3_r;
v8u16 rnd_vec;
src -= (3 * src_stride);
rnd_vec = (v8u16) __msa_fill_h(rnd_val);
filt = LOAD_SH(filter);
filt0 = (v16i8) __msa_splati_h(filt, 0);
filt1 = (v16i8) __msa_splati_h(filt, 1);
filt2 = (v16i8) __msa_splati_h(filt, 2);
filt3 = (v16i8) __msa_splati_h(filt, 3);
LOAD_7VECS_SB(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
src += (7 * src_stride);
XORI_B_7VECS_SB(src0, src1, src2, src3, src4, src5, src6,
src0, src1, src2, src3, src4, src5, src6, 128);
ILVR_B_6VECS_SB(src0, src2, src4, src1, src3, src5,
src1, src3, src5, src2, src4, src6,
src10_r, src32_r, src54_r, src21_r, src43_r, src65_r);
for (loop_cnt = (height >> 2); loop_cnt--;) {
LOAD_4VECS_SB(src, src_stride, src7, src8, src9, src10);
src += (4 * src_stride);
XORI_B_4VECS_SB(src7, src8, src9, src10, src7, src8, src9, src10, 128);
ILVR_B_4VECS_SB(src6, src7, src8, src9, src7, src8, src9, src10,
src76_r, src87_r, src98_r, src109_r);
out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
filt0, filt1, filt2, filt3);
out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
filt0, filt1, filt2, filt3);
out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
filt0, filt1, filt2, filt3);
out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
filt0, filt1, filt2, filt3);
out0_r = SRAR_SATURATE_SIGNED_H(out0_r, rnd_vec, 7);
out1_r = SRAR_SATURATE_SIGNED_H(out1_r, rnd_vec, 7);
out2_r = SRAR_SATURATE_SIGNED_H(out2_r, rnd_vec, 7);
out3_r = SRAR_SATURATE_SIGNED_H(out3_r, rnd_vec, 7);
PCKEV_B_4_XORI128_STORE_8_BYTES_4(out0_r, out1_r, out2_r, out3_r,
dst, dst_stride);
dst += (4 * dst_stride);
src10_r = src54_r;
src32_r = src76_r;
src54_r = src98_r;
src21_r = src65_r;
src43_r = src87_r;
src65_r = src109_r;
src6 = src10;
}
}
static void common_vt_8t_12w_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, int32_t height,
uint8_t rnd_val)
{
int32_t loop_cnt;
uint32_t out2, out3;
uint64_t out0, out1;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15, vec16;
v16i8 res0, res1, res2;
v8i16 vec01, vec23, vec45, vec67;
v8i16 tmp0, tmp1, tmp2;
v8i16 filt, filt0, filt1, filt2, filt3;
v8u16 rnd_vec;
v4i32 mask = { 2, 6, 2, 6 };
src -= (3 * src_stride);
LOAD_7VECS_UB(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
src += (7 * src_stride);
XORI_B_4VECS_SB(src0, src1, src2, src3, vec0, vec1, vec2, vec3, 128);
vec4 = (v16i8) __msa_xori_b((v16u8) src4, 128);
vec5 = (v16i8) __msa_xori_b((v16u8) src5, 128);
vec6 = (v16i8) __msa_xori_b((v16u8) src6, 128);
/* 4 width */
vec9 = (v16i8) __msa_vshf_w(mask, (v4i32) vec1, (v4i32) vec0);
vec10 = (v16i8) __msa_vshf_w(mask, (v4i32) vec2, (v4i32) vec1);
vec11 = (v16i8) __msa_vshf_w(mask, (v4i32) vec3, (v4i32) vec2);
vec12 = (v16i8) __msa_vshf_w(mask, (v4i32) vec4, (v4i32) vec3);
vec13 = (v16i8) __msa_vshf_w(mask, (v4i32) vec5, (v4i32) vec4);
vec14 = (v16i8) __msa_vshf_w(mask, (v4i32) vec6, (v4i32) vec5);
rnd_vec = (v8u16) __msa_fill_h(rnd_val);
/* rearranging filter_y */
filt = LOAD_SH(filter);
filt0 = (v8i16) __msa_splati_h(filt, 0);
filt1 = (v8i16) __msa_splati_h(filt, 1);
filt2 = (v8i16) __msa_splati_h(filt, 2);
filt3 = (v8i16) __msa_splati_h(filt, 3);
for (loop_cnt = (height >> 1); loop_cnt--;) {
LOAD_2VECS_UB(src, src_stride, src7, src8);
src += (2 * src_stride);
XORI_B_2VECS_SB(src7, src8, vec7, vec8, 128);
ILVR_B_4VECS_SH(vec0, vec2, vec4, vec6, vec1, vec3, vec5, vec7,
vec01, vec23, vec45, vec67);
tmp0 = FILT_8TAP_DPADD_S_H(vec01, vec23, vec45, vec67, filt0, filt1,
filt2, filt3);
ILVR_B_4VECS_SH(vec1, vec3, vec5, vec7, vec2, vec4, vec6, vec8,
vec01, vec23, vec45, vec67);
tmp1 = FILT_8TAP_DPADD_S_H(vec01, vec23, vec45, vec67, filt0, filt1,
filt2, filt3);
/* 4 width */
vec15 = (v16i8) __msa_vshf_w(mask, (v4i32) vec7, (v4i32) vec6);
vec16 = (v16i8) __msa_vshf_w(mask, (v4i32) vec8, (v4i32) vec7);
ILVR_B_4VECS_SH(vec9, vec11, vec13, vec15, vec10, vec12, vec14, vec16,
vec01, vec23, vec45, vec67);
tmp2 = FILT_8TAP_DPADD_S_H(vec01, vec23, vec45, vec67, filt0, filt1,
filt2, filt3);
tmp0 = SRAR_SATURATE_SIGNED_H(tmp0, rnd_vec, 7);
tmp1 = SRAR_SATURATE_SIGNED_H(tmp1, rnd_vec, 7);
tmp2 = SRAR_SATURATE_SIGNED_H(tmp2, rnd_vec, 7);
res0 = __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
res1 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp1);
res2 = __msa_pckev_b((v16i8) tmp2, (v16i8) tmp2);
XORI_B_3VECS_SB(res0, res1, res2, res0, res1, res2, 128);
out0 = __msa_copy_u_d((v2i64) res0, 0);
out1 = __msa_copy_u_d((v2i64) res1, 0);
out2 = __msa_copy_u_w((v4i32) res2, 0);
out3 = __msa_copy_u_w((v4i32) res2, 1);
STORE_DWORD(dst, out0);
STORE_WORD((dst + 8), out2);
dst += dst_stride;
STORE_DWORD(dst, out1);
STORE_WORD((dst + 8), out3);
dst += dst_stride;
vec0 = vec2;
vec1 = vec3;
vec2 = vec4;
vec3 = vec5;
vec4 = vec6;
vec5 = vec7;
vec6 = vec8;
vec9 = vec11;
vec10 = vec12;
vec11 = vec13;
vec12 = vec14;
vec13 = vec15;
vec14 = vec16;
}
}
static void common_vt_8t_16w_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, int32_t height,
uint8_t rnd_val)
{
uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
v16i8 filt0, filt1, filt2, filt3;
v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
v8i16 filt;
v8u16 rnd_vec;
v16u8 tmp0, tmp1, tmp2, tmp3;
src -= (3 * src_stride);
rnd_vec = (v8u16) __msa_fill_h(rnd_val);
filt = LOAD_SH(filter);
filt0 = (v16i8) __msa_splati_h(filt, 0);
filt1 = (v16i8) __msa_splati_h(filt, 1);
filt2 = (v16i8) __msa_splati_h(filt, 2);
filt3 = (v16i8) __msa_splati_h(filt, 3);
LOAD_7VECS_SB(src, src_stride,
src0, src1, src2, src3, src4, src5, src6);
src += (7 * src_stride);
XORI_B_7VECS_SB(src0, src1, src2, src3, src4, src5, src6,
src0, src1, src2, src3, src4, src5, src6, 128);
ILVR_B_6VECS_SB(src0, src2, src4, src1, src3, src5,
src1, src3, src5, src2, src4, src6,
src10_r, src32_r, src54_r, src21_r, src43_r, src65_r);
ILVL_B_6VECS_SB(src0, src2, src4, src1, src3, src5,
src1, src3, src5, src2, src4, src6,
src10_l, src32_l, src54_l, src21_l, src43_l, src65_l);
for (loop_cnt = (height >> 2); loop_cnt--;) {
LOAD_4VECS_SB(src, src_stride, src7, src8, src9, src10);
src += (4 * src_stride);
XORI_B_4VECS_SB(src7, src8, src9, src10,
src7, src8, src9, src10, 128);
ILVR_B_4VECS_SB(src6, src7, src8, src9, src7, src8, src9, src10,
src76_r, src87_r, src98_r, src109_r);
ILVL_B_4VECS_SB(src6, src7, src8, src9, src7, src8, src9, src10,
src76_l, src87_l, src98_l, src109_l);
out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
filt0, filt1, filt2, filt3);
out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
filt0, filt1, filt2, filt3);
out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
filt0, filt1, filt2, filt3);
out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
filt0, filt1, filt2, filt3);
out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l,
filt0, filt1, filt2, filt3);
out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l,
filt0, filt1, filt2, filt3);
out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l,
filt0, filt1, filt2, filt3);
out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l,
filt0, filt1, filt2, filt3);
out0_r = SRAR_SATURATE_SIGNED_H(out0_r, rnd_vec, 7);
out1_r = SRAR_SATURATE_SIGNED_H(out1_r, rnd_vec, 7);
out2_r = SRAR_SATURATE_SIGNED_H(out2_r, rnd_vec, 7);
out3_r = SRAR_SATURATE_SIGNED_H(out3_r, rnd_vec, 7);
out0_l = SRAR_SATURATE_SIGNED_H(out0_l, rnd_vec, 7);
out1_l = SRAR_SATURATE_SIGNED_H(out1_l, rnd_vec, 7);
out2_l = SRAR_SATURATE_SIGNED_H(out2_l, rnd_vec, 7);
out3_l = SRAR_SATURATE_SIGNED_H(out3_l, rnd_vec, 7);
PCKEV_B_4VECS_UB(out0_l, out1_l, out2_l, out3_l, out0_r, out1_r, out2_r,
out3_r, tmp0, tmp1, tmp2, tmp3);
XORI_B_4VECS_UB(tmp0, tmp1, tmp2, tmp3, tmp0, tmp1, tmp2, tmp3, 128);
STORE_4VECS_UB(dst, dst_stride, tmp0, tmp1, tmp2, tmp3);
dst += (4 * dst_stride);
src10_r = src54_r;
src32_r = src76_r;
src54_r = src98_r;
src21_r = src65_r;
src43_r = src87_r;
src65_r = src109_r;
src10_l = src54_l;
src32_l = src76_l;
src54_l = src98_l;
src21_l = src65_l;
src43_l = src87_l;
src65_l = src109_l;
src6 = src10;
}
}
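/* common_vt_8t_16w_mult_msa tiles the 16-wide vertical kernel across wider
 * blocks in 16-column strips; the 32/48/64-wide wrappers below pass the full
 * width through, while the 24-wide case combines one 16-column strip with the
 * 8-wide routine. */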
static void common_vt_8t_16w_mult_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, int32_t height,
uint8_t rnd_val, int32_t width)
{
uint8_t *src_tmp;
uint8_t *dst_tmp;
uint32_t loop_cnt, cnt;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
v16i8 filt0, filt1, filt2, filt3;
v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
v16u8 tmp0, tmp1, tmp2, tmp3;
v8u16 rnd_vec;
src -= (3 * src_stride);
rnd_vec = (v8u16) __msa_fill_h(rnd_val);
filt = LOAD_SH(filter);
filt0 = (v16i8) __msa_splati_h(filt, 0);
filt1 = (v16i8) __msa_splati_h(filt, 1);
filt2 = (v16i8) __msa_splati_h(filt, 2);
filt3 = (v16i8) __msa_splati_h(filt, 3);
for (cnt = (width >> 4); cnt--;) {
src_tmp = src;
dst_tmp = dst;
LOAD_7VECS_SB(src_tmp, src_stride,
src0, src1, src2, src3, src4, src5, src6);
src_tmp += (7 * src_stride);
XORI_B_7VECS_SB(src0, src1, src2, src3, src4, src5, src6,
src0, src1, src2, src3, src4, src5, src6, 128);
ILVR_B_6VECS_SB(src0, src2, src4, src1, src3, src5,
src1, src3, src5, src2, src4, src6,
src10_r, src32_r, src54_r, src21_r, src43_r, src65_r);
ILVL_B_6VECS_SB(src0, src2, src4, src1, src3, src5,
src1, src3, src5, src2, src4, src6,
src10_l, src32_l, src54_l, src21_l, src43_l, src65_l);
for (loop_cnt = (height >> 2); loop_cnt--;) {
LOAD_4VECS_SB(src_tmp, src_stride, src7, src8, src9, src10);
src_tmp += (4 * src_stride);
XORI_B_4VECS_SB(src7, src8, src9, src10,
src7, src8, src9, src10, 128);
ILVR_B_4VECS_SB(src6, src7, src8, src9, src7, src8, src9, src10,
src76_r, src87_r, src98_r, src109_r);
ILVL_B_4VECS_SB(src6, src7, src8, src9, src7, src8, src9, src10,
src76_l, src87_l, src98_l, src109_l);
out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
filt0, filt1, filt2, filt3);
out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
filt0, filt1, filt2, filt3);
out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
filt0, filt1, filt2, filt3);
out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
filt0, filt1, filt2, filt3);
out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l,
filt0, filt1, filt2, filt3);
out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l,
filt0, filt1, filt2, filt3);
out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l,
filt0, filt1, filt2, filt3);
out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l,
filt0, filt1, filt2, filt3);
out0_r = SRAR_SATURATE_SIGNED_H(out0_r, rnd_vec, 7);
out1_r = SRAR_SATURATE_SIGNED_H(out1_r, rnd_vec, 7);
out2_r = SRAR_SATURATE_SIGNED_H(out2_r, rnd_vec, 7);
out3_r = SRAR_SATURATE_SIGNED_H(out3_r, rnd_vec, 7);
out0_l = SRAR_SATURATE_SIGNED_H(out0_l, rnd_vec, 7);
out1_l = SRAR_SATURATE_SIGNED_H(out1_l, rnd_vec, 7);
out2_l = SRAR_SATURATE_SIGNED_H(out2_l, rnd_vec, 7);
out3_l = SRAR_SATURATE_SIGNED_H(out3_l, rnd_vec, 7);
PCKEV_B_4VECS_UB(out0_l, out1_l, out2_l, out3_l, out0_r, out1_r,
out2_r, out3_r, tmp0, tmp1, tmp2, tmp3);
XORI_B_4VECS_UB(tmp0, tmp1, tmp2, tmp3,
tmp0, tmp1, tmp2, tmp3, 128);
STORE_4VECS_UB(dst_tmp, dst_stride, tmp0, tmp1, tmp2, tmp3);
dst_tmp += (4 * dst_stride);
src10_r = src54_r;
src32_r = src76_r;
src54_r = src98_r;
src21_r = src65_r;
src43_r = src87_r;
src65_r = src109_r;
src10_l = src54_l;
src32_l = src76_l;
src54_l = src98_l;
src21_l = src65_l;
src43_l = src87_l;
src65_l = src109_l;
src6 = src10;
}
src += 16;
dst += 16;
}
}
static void common_vt_8t_24w_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, int32_t height,
uint8_t rnd_val)
{
common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter,
height, rnd_val, 16);
common_vt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride,
filter, height, rnd_val);
}
static void common_vt_8t_32w_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, int32_t height,
uint8_t rnd_val)
{
common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride,
filter, height, rnd_val, 32);
}
static void common_vt_8t_48w_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, int32_t height,
uint8_t rnd_val)
{
common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride,
filter, height, rnd_val, 48);
}
static void common_vt_8t_64w_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
const int8_t *filter, int32_t height,
uint8_t rnd_val)
{
common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride,
filter, height, rnd_val, 64);
}
static void copy_width8_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
int32_t height)
{
int32_t cnt;
uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
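/* Dispatch on height so every branch keeps its 8-byte stores fully
   unrolled: 12, 8, 4 and finally 2 rows per iteration */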
if (0 == height % 12) {
for (cnt = (height / 12); cnt--;) {
LOAD_8VECS_UB(src, src_stride,
src0, src1, src2, src3, src4, src5, src6, src7);
src += (8 * src_stride);
out0 = __msa_copy_u_d((v2i64) src0, 0);
out1 = __msa_copy_u_d((v2i64) src1, 0);
out2 = __msa_copy_u_d((v2i64) src2, 0);
out3 = __msa_copy_u_d((v2i64) src3, 0);
out4 = __msa_copy_u_d((v2i64) src4, 0);
out5 = __msa_copy_u_d((v2i64) src5, 0);
out6 = __msa_copy_u_d((v2i64) src6, 0);
out7 = __msa_copy_u_d((v2i64) src7, 0);
STORE_DWORD(dst, out0);
dst += dst_stride;
STORE_DWORD(dst, out1);
dst += dst_stride;
STORE_DWORD(dst, out2);
dst += dst_stride;
STORE_DWORD(dst, out3);
dst += dst_stride;
STORE_DWORD(dst, out4);
dst += dst_stride;
STORE_DWORD(dst, out5);
dst += dst_stride;
STORE_DWORD(dst, out6);
dst += dst_stride;
STORE_DWORD(dst, out7);
dst += dst_stride;
LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
out0 = __msa_copy_u_d((v2i64) src0, 0);
out1 = __msa_copy_u_d((v2i64) src1, 0);
out2 = __msa_copy_u_d((v2i64) src2, 0);
out3 = __msa_copy_u_d((v2i64) src3, 0);
STORE_DWORD(dst, out0);
dst += dst_stride;
STORE_DWORD(dst, out1);
dst += dst_stride;
STORE_DWORD(dst, out2);
dst += dst_stride;
STORE_DWORD(dst, out3);
dst += dst_stride;
}
} else if (0 == height % 8) {
for (cnt = height >> 3; cnt--;) {
LOAD_8VECS_UB(src, src_stride,
src0, src1, src2, src3, src4, src5, src6, src7);
src += (8 * src_stride);
out0 = __msa_copy_u_d((v2i64) src0, 0);
out1 = __msa_copy_u_d((v2i64) src1, 0);
out2 = __msa_copy_u_d((v2i64) src2, 0);
out3 = __msa_copy_u_d((v2i64) src3, 0);
out4 = __msa_copy_u_d((v2i64) src4, 0);
out5 = __msa_copy_u_d((v2i64) src5, 0);
out6 = __msa_copy_u_d((v2i64) src6, 0);
out7 = __msa_copy_u_d((v2i64) src7, 0);
STORE_DWORD(dst, out0);
dst += dst_stride;
STORE_DWORD(dst, out1);
dst += dst_stride;
STORE_DWORD(dst, out2);
dst += dst_stride;
STORE_DWORD(dst, out3);
dst += dst_stride;
STORE_DWORD(dst, out4);
dst += dst_stride;
STORE_DWORD(dst, out5);
dst += dst_stride;
STORE_DWORD(dst, out6);
dst += dst_stride;
STORE_DWORD(dst, out7);
dst += dst_stride;
}
} else if (0 == height % 4) {
for (cnt = (height / 4); cnt--;) {
LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
out0 = __msa_copy_u_d((v2i64) src0, 0);
out1 = __msa_copy_u_d((v2i64) src1, 0);
out2 = __msa_copy_u_d((v2i64) src2, 0);
out3 = __msa_copy_u_d((v2i64) src3, 0);
STORE_DWORD(dst, out0);
dst += dst_stride;
STORE_DWORD(dst, out1);
dst += dst_stride;
STORE_DWORD(dst, out2);
dst += dst_stride;
STORE_DWORD(dst, out3);
dst += dst_stride;
}
} else if (0 == height % 2) {
for (cnt = (height / 2); cnt--;) {
LOAD_2VECS_UB(src, src_stride, src0, src1);
src += (2 * src_stride);
out0 = __msa_copy_u_d((v2i64) src0, 0);
out1 = __msa_copy_u_d((v2i64) src1, 0);
STORE_DWORD(dst, out0);
dst += dst_stride;
STORE_DWORD(dst, out1);
dst += dst_stride;
}
}
}
static void copy_width12_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
int32_t height)
{
int32_t cnt;
uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
uint32_t out8, out9, out10, out11, out12, out13, out14, out15;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
for (cnt = 2; cnt--;) {
LOAD_8VECS_UB(src, src_stride,
src0, src1, src2, src3, src4, src5, src6, src7);
src += (8 * src_stride);
out0 = __msa_copy_u_d((v2i64) src0, 0);
out1 = __msa_copy_u_d((v2i64) src1, 0);
out2 = __msa_copy_u_d((v2i64) src2, 0);
out3 = __msa_copy_u_d((v2i64) src3, 0);
out4 = __msa_copy_u_d((v2i64) src4, 0);
out5 = __msa_copy_u_d((v2i64) src5, 0);
out6 = __msa_copy_u_d((v2i64) src6, 0);
out7 = __msa_copy_u_d((v2i64) src7, 0);
out8 = __msa_copy_u_w((v4i32) src0, 2);
out9 = __msa_copy_u_w((v4i32) src1, 2);
out10 = __msa_copy_u_w((v4i32) src2, 2);
out11 = __msa_copy_u_w((v4i32) src3, 2);
out12 = __msa_copy_u_w((v4i32) src4, 2);
out13 = __msa_copy_u_w((v4i32) src5, 2);
out14 = __msa_copy_u_w((v4i32) src6, 2);
out15 = __msa_copy_u_w((v4i32) src7, 2);
STORE_DWORD(dst, out0);
STORE_WORD(dst + 8, out8);
dst += dst_stride;
STORE_DWORD(dst, out1);
STORE_WORD(dst + 8, out9);
dst += dst_stride;
STORE_DWORD(dst, out2);
STORE_WORD(dst + 8, out10);
dst += dst_stride;
STORE_DWORD(dst, out3);
STORE_WORD(dst + 8, out11);
dst += dst_stride;
STORE_DWORD(dst, out4);
STORE_WORD(dst + 8, out12);
dst += dst_stride;
STORE_DWORD(dst, out5);
STORE_WORD(dst + 8, out13);
dst += dst_stride;
STORE_DWORD(dst, out6);
STORE_WORD(dst + 8, out14);
dst += dst_stride;
STORE_DWORD(dst, out7);
STORE_WORD(dst + 8, out15);
dst += dst_stride;
}
}
static void copy_16multx8mult_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
int32_t height, int32_t width)
{
int32_t cnt, loop_cnt;
uint8_t *src_tmp, *dst_tmp;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
for (cnt = (width >> 4); cnt--;) {
src_tmp = src;
dst_tmp = dst;
for (loop_cnt = (height >> 3); loop_cnt--;) {
LOAD_8VECS_UB(src_tmp, src_stride,
src0, src1, src2, src3, src4, src5, src6, src7);
src_tmp += (8 * src_stride);
STORE_8VECS_UB(dst_tmp, dst_stride,
src0, src1, src2, src3, src4, src5, src6, src7);
dst_tmp += (8 * dst_stride);
}
src += 16;
dst += 16;
}
}
static void copy_width16_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
int32_t height)
{
int32_t cnt;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
if (0 == height % 12) {
for (cnt = (height / 12); cnt--;) {
LOAD_8VECS_UB(src, src_stride,
src0, src1, src2, src3, src4, src5, src6, src7);
src += (8 * src_stride);
STORE_8VECS_UB(dst, dst_stride,
src0, src1, src2, src3, src4, src5, src6, src7);
dst += (8 * dst_stride);
LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
STORE_4VECS_UB(dst, dst_stride, src0, src1, src2, src3);
dst += (4 * dst_stride);
}
} else if (0 == height % 8) {
copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
} else if (0 == height % 4) {
for (cnt = (height >> 2); cnt--;) {
LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
STORE_4VECS_UB(dst, dst_stride, src0, src1, src2, src3);
dst += (4 * dst_stride);
}
}
}
static void copy_width24_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
int32_t height)
{
copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
copy_width8_msa(src + 16, src_stride, dst + 16, dst_stride, height);
}
static void copy_width32_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
int32_t height)
{
int32_t cnt;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
if (0 == height % 12) {
for (cnt = (height / 12); cnt--;) {
LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3);
LOAD_4VECS_UB(src + 16, src_stride, src4, src5, src6, src7);
src += (4 * src_stride);
STORE_4VECS_UB(dst, dst_stride, src0, src1, src2, src3);
STORE_4VECS_UB(dst + 16, dst_stride, src4, src5, src6, src7);
dst += (4 * dst_stride);
LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3);
LOAD_4VECS_UB(src + 16, src_stride, src4, src5, src6, src7);
src += (4 * src_stride);
STORE_4VECS_UB(dst, dst_stride, src0, src1, src2, src3);
STORE_4VECS_UB(dst + 16, dst_stride, src4, src5, src6, src7);
dst += (4 * dst_stride);
LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3);
LOAD_4VECS_UB(src + 16, src_stride, src4, src5, src6, src7);
src += (4 * src_stride);
STORE_4VECS_UB(dst, dst_stride, src0, src1, src2, src3);
STORE_4VECS_UB(dst + 16, dst_stride, src4, src5, src6, src7);
dst += (4 * dst_stride);
}
} else if (0 == height % 8) {
copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32);
} else if (0 == height % 4) {
for (cnt = (height >> 2); cnt--;) {
LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3);
LOAD_4VECS_UB(src + 16, src_stride, src4, src5, src6, src7);
src += (4 * src_stride);
STORE_4VECS_UB(dst, dst_stride, src0, src1, src2, src3);
STORE_4VECS_UB(dst + 16, dst_stride, src4, src5, src6, src7);
dst += (4 * dst_stride);
}
}
}
static void copy_width48_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
int32_t height)
{
copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 48);
}
static void copy_width64_msa(uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
int32_t height)
{
copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64);
}
#define MC_COPY(WIDTH) \
void ff_hevc_put_hevc_pel_pixels##WIDTH##_8_msa(int16_t *dst, \
uint8_t *src, \
ptrdiff_t src_stride, \
int height, \
intptr_t mx, \
intptr_t my, \
int width) \
{ \
hevc_copy_##WIDTH##w_msa(src, src_stride, dst, MAX_PB_SIZE, height); \
}
MC_COPY(4);
MC_COPY(6);
MC_COPY(8);
MC_COPY(12);
MC_COPY(16);
MC_COPY(24);
MC_COPY(32);
MC_COPY(48);
MC_COPY(64);
#undef MC_COPY
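/* For reference, a hand-written sketch of what one instantiation generates
 * (illustrative only, modulo whitespace):
 *
 * MC_COPY(8) becomes
 *
 *     void ff_hevc_put_hevc_pel_pixels8_8_msa(int16_t *dst, uint8_t *src,
 *                                             ptrdiff_t src_stride, int height,
 *                                             intptr_t mx, intptr_t my,
 *                                             int width)
 *     {
 *         hevc_copy_8w_msa(src, src_stride, dst, MAX_PB_SIZE, height);
 *     }
 */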
#define MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
void ff_hevc_put_hevc_##PEL##_##DIR####WIDTH##_8_msa(int16_t *dst, \
uint8_t *src, \
ptrdiff_t src_stride, \
int height, \
intptr_t mx, \
intptr_t my, \
int width) \
{ \
const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
\
hevc_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, \
MAX_PB_SIZE, filter, height); \
}
MC(qpel, h, 4, 8, hz, mx);
MC(qpel, h, 8, 8, hz, mx);
MC(qpel, h, 12, 8, hz, mx);
MC(qpel, h, 16, 8, hz, mx);
MC(qpel, h, 24, 8, hz, mx);
MC(qpel, h, 32, 8, hz, mx);
MC(qpel, h, 48, 8, hz, mx);
MC(qpel, h, 64, 8, hz, mx);
MC(qpel, v, 4, 8, vt, my);
MC(qpel, v, 8, 8, vt, my);
MC(qpel, v, 12, 8, vt, my);
MC(qpel, v, 16, 8, vt, my);
MC(qpel, v, 24, 8, vt, my);
MC(qpel, v, 32, 8, vt, my);
MC(qpel, v, 48, 8, vt, my);
MC(qpel, v, 64, 8, vt, my);
#undef MC
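/* For reference, a hand-written sketch of one expansion (illustrative only):
 *
 * MC(qpel, h, 4, 8, hz, mx) becomes
 *
 *     void ff_hevc_put_hevc_qpel_h4_8_msa(int16_t *dst, uint8_t *src,
 *                                         ptrdiff_t src_stride, int height,
 *                                         intptr_t mx, intptr_t my, int width)
 *     {
 *         const int8_t *filter = ff_hevc_qpel_filters[mx - 1];
 *
 *         hevc_hz_8t_4w_msa(src, src_stride, dst, MAX_PB_SIZE, filter, height);
 *     }
 */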
#define MC_HV(PEL, DIR, WIDTH, TAP, DIR1) \
void ff_hevc_put_hevc_##PEL##_##DIR####WIDTH##_8_msa(int16_t *dst, \
uint8_t *src, \
ptrdiff_t src_stride, \
int height, \
intptr_t mx, \
intptr_t my, \
int width) \
{ \
const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
\
hevc_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, MAX_PB_SIZE, \
filter_x, filter_y, height); \
}
MC_HV(qpel, hv, 4, 8, hv);
MC_HV(qpel, hv, 8, 8, hv);
MC_HV(qpel, hv, 12, 8, hv);
MC_HV(qpel, hv, 16, 8, hv);
MC_HV(qpel, hv, 24, 8, hv);
MC_HV(qpel, hv, 32, 8, hv);
MC_HV(qpel, hv, 48, 8, hv);
MC_HV(qpel, hv, 64, 8, hv);
#undef MC_HV
#define UNI_MC_COPY(WIDTH) \
void ff_hevc_put_hevc_uni_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \
ptrdiff_t dst_stride, \
uint8_t *src, \
ptrdiff_t src_stride, \
int height, \
intptr_t mx, \
intptr_t my, \
int width) \
{ \
copy_width##WIDTH##_msa(src, src_stride, dst, dst_stride, height); \
}
UNI_MC_COPY(8);
UNI_MC_COPY(12);
UNI_MC_COPY(16);
UNI_MC_COPY(24);
UNI_MC_COPY(32);
UNI_MC_COPY(48);
UNI_MC_COPY(64);
#undef UNI_MC_COPY
#define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
void ff_hevc_put_hevc_uni_##PEL##_##DIR####WIDTH##_8_msa(uint8_t *dst, \
ptrdiff_t \
dst_stride, \
uint8_t *src, \
ptrdiff_t \
src_stride, \
int height, \
intptr_t mx, \
intptr_t my, \
int width) \
{ \
const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
\
common_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
filter, height, 6); \
}
UNI_MC(qpel, h, 4, 8, hz, mx);
UNI_MC(qpel, h, 8, 8, hz, mx);
UNI_MC(qpel, h, 12, 8, hz, mx);
UNI_MC(qpel, h, 16, 8, hz, mx);
UNI_MC(qpel, h, 24, 8, hz, mx);
UNI_MC(qpel, h, 32, 8, hz, mx);
UNI_MC(qpel, h, 48, 8, hz, mx);
UNI_MC(qpel, h, 64, 8, hz, mx);
UNI_MC(qpel, v, 4, 8, vt, my);
UNI_MC(qpel, v, 8, 8, vt, my);
UNI_MC(qpel, v, 12, 8, vt, my);
UNI_MC(qpel, v, 16, 8, vt, my);
UNI_MC(qpel, v, 24, 8, vt, my);
UNI_MC(qpel, v, 32, 8, vt, my);
UNI_MC(qpel, v, 48, 8, vt, my);
UNI_MC(qpel, v, 64, 8, vt, my);
#undef UNI_MC
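/* For reference, a hand-written sketch of one expansion (illustrative only);
 * note the fixed rnd_val of 6 used for the 8-bit uni cases:
 *
 * UNI_MC(qpel, v, 24, 8, vt, my) becomes
 *
 *     void ff_hevc_put_hevc_uni_qpel_v24_8_msa(uint8_t *dst,
 *                                              ptrdiff_t dst_stride,
 *                                              uint8_t *src,
 *                                              ptrdiff_t src_stride,
 *                                              int height, intptr_t mx,
 *                                              intptr_t my, int width)
 *     {
 *         const int8_t *filter = ff_hevc_qpel_filters[my - 1];
 *
 *         common_vt_8t_24w_msa(src, src_stride, dst, dst_stride,
 *                              filter, height, 6);
 *     }
 */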
#define UNI_MC_HV(PEL, DIR, WIDTH, TAP, DIR1) \
void ff_hevc_put_hevc_uni_##PEL##_##DIR####WIDTH##_8_msa(uint8_t *dst, \
ptrdiff_t \
dst_stride, \
uint8_t *src, \
ptrdiff_t \
src_stride, \
int height, \
intptr_t mx, \
intptr_t my, \
int width) \
{ \
const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
\
hevc_##DIR1##_uni_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, \
dst_stride, filter_x, \
filter_y, height); \
}
UNI_MC_HV(qpel, hv, 4, 8, hv);
UNI_MC_HV(qpel, hv, 8, 8, hv);
UNI_MC_HV(qpel, hv, 12, 8, hv);
UNI_MC_HV(qpel, hv, 16, 8, hv);
UNI_MC_HV(qpel, hv, 24, 8, hv);
UNI_MC_HV(qpel, hv, 32, 8, hv);
UNI_MC_HV(qpel, hv, 48, 8, hv);
UNI_MC_HV(qpel, hv, 64, 8, hv);
#undef UNI_MC_HV
...@@ -278,6 +278,7 @@
out0 = LD_B(RTYPE, (psrc)); \
out1 = LD_B(RTYPE, (psrc) + stride); \
}
#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)
#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \
...@@ -349,6 +350,14 @@
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
#define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)
#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
pdst, stride) \
{ \
ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \
ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
}
#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
/* Description : Store vectors of 8 halfword elements with stride
Arguments : Inputs - in0, in1, stride
Outputs - pdst (destination pointer to store to)
...@@ -425,6 +434,26 @@
SH(out3_m, pblk_2x4_m + 3 * stride); \
}
/* Description : Store as 4x2 byte block to destination memory from input vector
Arguments : Inputs - in, pdst, stride
Return Type - unsigned byte
Details : Index 0 word element from input vector is copied and stored
on first line
Index 1 word element from input vector is copied and stored
on second line
*/
#define ST4x2_UB(in, pdst, stride) \
{ \
uint32_t out0_m, out1_m; \
uint8_t *pblk_4x2_m = (uint8_t *) (pdst); \
\
out0_m = __msa_copy_u_w((v4i32) in, 0); \
out1_m = __msa_copy_u_w((v4i32) in, 1); \
\
SW(out0_m, pblk_4x2_m); \
SW(out1_m, pblk_4x2_m + stride); \
}
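/* Illustrative use, assuming 'out' holds two 4-byte rows in word lanes 0
 * and 1:
 *
 *     ST4x2_UB(out, dst, dst_stride);   // row 0 -> dst, row 1 -> dst + stride
 */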
/* Description : Store as 4x4 byte block to destination memory from input vector
Arguments : Inputs - in0, in1, pdst, stride
Return Type - unsigned byte
...@@ -598,7 +627,18 @@
out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0); \
out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2); \
}
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
#define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)
#define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2, \
out0, out1, out2) \
{ \
VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1); \
out2 = (RTYPE) __msa_vshf_b((v16i8) mask2, (v16i8) in5, (v16i8) in4); \
}
#define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__)
#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, \
out0, out1, out2, out3) \
...@@ -608,6 +648,57 @@
}
#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
/* Description : Shuffle word vector elements as per mask vector
Arguments : Inputs - in0, in1, in2, in3, mask0, mask1
Outputs - out0, out1
Return Type - as per RTYPE
Details : Selective word elements from in0 & in1 are copied to out0 as
per control vector mask0
Selective word elements from in2 & in3 are copied to out1 as
per control vector mask1
*/
#define VSHF_W2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \
{ \
out0 = (RTYPE) __msa_vshf_w((v4i32) mask0, (v4i32) in1, (v4i32) in0); \
out1 = (RTYPE) __msa_vshf_w((v4i32) mask1, (v4i32) in3, (v4i32) in2); \
}
#define VSHF_W2_SB(...) VSHF_W2(v16i8, __VA_ARGS__)
/* Description : Dot product of byte vector elements
Arguments : Inputs - mult0, mult1
cnst0, cnst1
Outputs - out0, out1
Return Type - signed halfword
Details : Signed byte elements from mult0 are multiplied with
signed byte elements from cnst0 producing a result
twice the size of input i.e. signed halfword.
The multiplication results of adjacent odd-even element pairs
are then added together and stored in the output vector
(2 signed halfword results)
*/
#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
{ \
out0 = (RTYPE) __msa_dotp_s_h((v16i8) mult0, (v16i8) cnst0); \
out1 = (RTYPE) __msa_dotp_s_h((v16i8) mult1, (v16i8) cnst1); \
}
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
#define DOTP_SB3(RTYPE, mult0, mult1, mult2, cnst0, cnst1, cnst2, \
out0, out1, out2) \
{ \
DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
out2 = (RTYPE) __msa_dotp_s_h((v16i8) mult2, (v16i8) cnst2); \
}
#define DOTP_SB3_SH(...) DOTP_SB3(v8i16, __VA_ARGS__)
#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, \
cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) \
{ \
DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
}
#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
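/* Illustrative use, assuming vec0/vec1 hold shuffled source bytes and
 * filt0/filt1 hold replicated filter taps:
 *
 *     v8i16 sum0, sum1;
 *     DOTP_SB2_SH(vec0, vec1, filt0, filt1, sum0, sum1);
 *     // each halfword of sum0 is vec0[2i] * filt0[2i] + vec0[2i+1] * filt0[2i+1]
 */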
/* Description : Dot product & addition of byte vector elements
Arguments : Inputs - mult0, mult1
cnst0, cnst1
...@@ -701,6 +792,22 @@
CLIP_SH2_0_255(in2, in3); \
}
/* Description : Clips all signed word elements of input vector
between 0 & 255
Arguments : Inputs - in (input vector)
Outputs - out_m (output vector with clipped elements)
Return Type - signed word
*/
#define CLIP_SW_0_255(in) \
( { \
v4i32 max_m = __msa_ldi_w(255); \
v4i32 out_m; \
\
out_m = __msa_maxi_s_w((v4i32) in, 0); \
out_m = __msa_min_s_w((v4i32) max_m, (v4i32) out_m); \
out_m; \
} )
/* Description : Horizontal subtraction of unsigned byte vector elements
Arguments : Inputs - in0, in1
Outputs - out0, out1
...@@ -1021,6 +1128,37 @@
}
#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
/* Description : Saturate the halfword element values to the max
signed value of (sat_val + 1) bits
The element data width remains unchanged
Arguments : Inputs - in0, in1, in2, in3, sat_val
Outputs - in0, in1, in2, in3 (in place)
Return Type - signed halfword
Details : Each signed halfword element from 'in0' is saturated to the
value generated with a (sat_val + 1) bit range
Results are written in place to the original vectors
*/
#define SAT_SH2(RTYPE, in0, in1, sat_val) \
{ \
in0 = (RTYPE) __msa_sat_s_h((v8i16) in0, sat_val); \
in1 = (RTYPE) __msa_sat_s_h((v8i16) in1, sat_val); \
}
#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
#define SAT_SH3(RTYPE, in0, in1, in2, sat_val) \
{ \
SAT_SH2(RTYPE, in0, in1, sat_val) \
in2 = (RTYPE) __msa_sat_s_h((v8i16) in2, sat_val); \
}
#define SAT_SH3_SH(...) SAT_SH3(v8i16, __VA_ARGS__)
#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \
{ \
SAT_SH2(RTYPE, in0, in1, sat_val); \
SAT_SH2(RTYPE, in2, in3, sat_val); \
}
#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
/* Description : Indexed halfword element values are replicated to all
elements in output vector
Arguments : Inputs - in, idx0, idx1
...@@ -1043,6 +1181,7 @@
SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \
SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \
}
#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
/* Description : Indexed word element values are replicated to all
...@@ -1097,6 +1236,7 @@
out2 = (RTYPE) __msa_pckev_b((v16i8) in4, (v16i8) in5); \
}
#define PCKEV_B3_UB(...) PCKEV_B3(v16u8, __VA_ARGS__)
#define PCKEV_B3_SB(...) PCKEV_B3(v16i8, __VA_ARGS__)
#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3) \
...@@ -1123,6 +1263,7 @@
out0 = (RTYPE) __msa_pckev_h((v8i16) in0, (v8i16) in1); \
out1 = (RTYPE) __msa_pckev_h((v8i16) in2, (v8i16) in3); \
}
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)
#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
...@@ -1131,6 +1272,7 @@
PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
}
#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
#define PCKEV_H4_SW(...) PCKEV_H4(v4i32, __VA_ARGS__)
/* Description : Each byte element is logically xor'ed with immediate 128
...@@ -1212,6 +1354,7 @@
ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \
}
#define ADDS_SH4_UH(...) ADDS_SH4(v8u16, __VA_ARGS__)
#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
/* Description : Shift left all elements of vector (generic for all data types)
Arguments : Inputs - in0, in1, in2, in3, shift
...@@ -1266,6 +1409,64 @@
}
#define SRL_H4_UH(...) SRL_H4(v8u16, __VA_ARGS__)
/* Description : Shift right arithmetic rounded halfwords
Arguments : Inputs - in0, in1, shift
Outputs - in0, in1 (in place)
Return Type - as per RTYPE
Details : Each element of vector 'in0' is shifted right arithmetically by
the number of bits held in the respective element of vector 'shift'.
The last discarded bit is added to the shifted value for rounding
and the result is written in place to 'in0'
Here, 'shift' is a vector passed in
Similar for other pairs
*/
#define SRAR_H2(RTYPE, in0, in1, shift) \
{ \
in0 = (RTYPE) __msa_srar_h((v8i16) in0, (v8i16) shift); \
in1 = (RTYPE) __msa_srar_h((v8i16) in1, (v8i16) shift); \
}
#define SRAR_H2_UH(...) SRAR_H2(v8u16, __VA_ARGS__)
#define SRAR_H2_SH(...) SRAR_H2(v8i16, __VA_ARGS__)
#define SRAR_H3(RTYPE, in0, in1, in2, shift) \
{ \
SRAR_H2(RTYPE, in0, in1, shift) \
in2 = (RTYPE) __msa_srar_h((v8i16) in2, (v8i16) shift); \
}
#define SRAR_H3_SH(...) SRAR_H3(v8i16, __VA_ARGS__)
#define SRAR_H4(RTYPE, in0, in1, in2, in3, shift) \
{ \
SRAR_H2(RTYPE, in0, in1, shift) \
SRAR_H2(RTYPE, in2, in3, shift) \
}
#define SRAR_H4_UH(...) SRAR_H4(v8u16, __VA_ARGS__)
#define SRAR_H4_SH(...) SRAR_H4(v8i16, __VA_ARGS__)
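/* Illustrative use, assuming 'rnd_vec' was built with __msa_fill_h(6):
 *
 *     SRAR_H2_SH(out0, out1, rnd_vec);   // per element: (out + 32) >> 6
 */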
/* Description : Shift right arithmetic rounded (immediate)
Arguments : Inputs - in0, in1, shift
Outputs - in0, in1 (in place)
Return Type - as per RTYPE
Details : Each element of vector 'in0' is shifted right arithmetically by
the immediate value in 'shift'.
The last discarded bit is added to the shifted value for rounding
and the result is written in place to 'in0'
Similar for other pairs
*/
#define SRARI_W2(RTYPE, in0, in1, shift) \
{ \
in0 = (RTYPE) __msa_srari_w((v4i32) in0, shift); \
in1 = (RTYPE) __msa_srari_w((v4i32) in1, shift); \
}
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)
#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \
{ \
SRARI_W2(RTYPE, in0, in1, shift); \
SRARI_W2(RTYPE, in2, in3, shift); \
}
#define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
/* Description : Multiplication of pairs of vectors
Arguments : Inputs - in0, in1, in2, in3
Outputs - out0, out1
...@@ -1392,6 +1593,22 @@
out7 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
}
/* Description : Pack even elements of input vectors & xor with 128
Arguments : Inputs - in0, in1
Outputs - out_m
Return Type - unsigned byte
Details : Signed byte even elements from 'in0' and 'in1' are packed
together in one vector and the resulting vector is xor'ed with
128 to shift the range from signed to unsigned byte
*/
#define PCKEV_XORI128_UB(in0, in1) \
( { \
v16u8 out_m; \
out_m = (v16u8) __msa_pckev_b((v16i8) in1, (v16i8) in0); \
out_m = (v16u8) __msa_xori_b((v16u8) out_m, 128); \
out_m; \
} )
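/* Illustrative use, assuming out0_r/out0_l hold filtered halfwords already
 * back in the signed byte range (ST_UB assumed available from this header):
 *
 *     v16u8 pix = PCKEV_XORI128_UB(out0_r, out0_l);
 *     ST_UB(pix, dst);
 */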
/* Description : Pack even byte elements, extract 0 & 2 index words from pair
of results and store 4 words in destination memory as per
stride