Commit b87dc70c authored by Shivraj Patil's avatar Shivraj Patil Committed by Michael Niedermayer

avcodec/mips: MSA (MIPS-SIMD-Arch) optimizations for AVC chroma mc functions

s patch adds MSA (MIPS-SIMD-Arch) optimizations for AVC chroma mc functions in new file h264chroma_msa.c
Adds new generic macros (needed for this patch) in libavutil/mips/generic_macros_msa.h
Signed-off-by: 's avatarShivraj Patil <shivraj.patil@imgtec.com>
Signed-off-by: 's avatarMichael Niedermayer <michaelni@gmx.at>
parent fd004e10
......@@ -31,5 +31,6 @@ MSA-OBJS-$(CONFIG_HEVC_DECODER) += mips/hevcdsp_msa.o \
mips/hevc_lpf_sao_msa.o \
mips/hevcpred_msa.o
MSA-OBJS-$(CONFIG_H264DSP) += mips/h264dsp_msa.o
MSA-OBJS-$(CONFIG_H264CHROMA) += mips/h264chroma_msa.o
LOONGSON3-OBJS-$(CONFIG_H264DSP) += mips/h264dsp_mmi.o
LOONGSON3-OBJS-$(CONFIG_H264CHROMA) += mips/h264chroma_mmi.o
/*
* Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
* Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
*
* This file is part of FFmpeg.
*
......@@ -20,6 +21,23 @@
#include "h264chroma_mips.h"
#if HAVE_MSA
static av_cold void h264chroma_init_msa(H264ChromaContext *c, int bit_depth)
{
const int high_bit_depth = bit_depth > 8;
if (!high_bit_depth) {
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_msa;
c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_msa;
c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_msa;
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_msa;
c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_msa;
c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_msa;
}
}
#endif // #if HAVE_MSA
#if HAVE_LOONGSON3
static av_cold void h264chroma_init_mmi(H264ChromaContext *c, int bit_depth)
{
......@@ -36,6 +54,9 @@ static av_cold void h264chroma_init_mmi(H264ChromaContext *c, int bit_depth)
av_cold void ff_h264chroma_init_mips(H264ChromaContext *c, int bit_depth)
{
#if HAVE_MSA
h264chroma_init_msa(c, bit_depth);
#endif // #if HAVE_MSA
#if HAVE_LOONGSON3
h264chroma_init_mmi(c, bit_depth);
#endif /* HAVE_LOONGSON3 */
......
......@@ -22,6 +22,18 @@
#define H264_CHROMA_MIPS_H
#include "libavcodec/h264.h"
void ff_put_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src, int stride,
int height, int x, int y);
void ff_put_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src, int stride,
int height, int x, int y);
void ff_put_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src, int stride,
int height, int x, int y);
void ff_avg_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src, int stride,
int height, int x, int y);
void ff_avg_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src, int stride,
int height, int x, int y);
void ff_avg_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src, int stride,
int height, int x, int y);
void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
int h, int x, int y);
......
This diff is collapsed.
......@@ -747,6 +747,33 @@
SW(out15_m, pblk_12x8_m + 8); \
}
/* Description : average with rounding (in0 + in1 + 1) / 2.
Arguments : Inputs - in0, in1, in2, in3,
Outputs - out0, out1
Return Type - signed byte
Details : Each byte element from 'in0' vector is added with each byte
element from 'in1' vector. The addition of the elements plus 1
(for rounding) is done unsigned with full precision,
i.e. the result has one extra bit. Unsigned division by 2
(or logical shift right by one bit) is performed before writing
the result to vector 'out0'
Similar for the pair of 'in2' and 'in3'
*/
#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
out0 = (RTYPE) __msa_aver_u_b((v16u8) in0, (v16u8) in1); \
out1 = (RTYPE) __msa_aver_u_b((v16u8) in2, (v16u8) in3); \
}
#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)
#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3) \
{ \
AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \
AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3) \
}
#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
/* Description : Immediate number of columns to slide with zero
Arguments : Inputs - in0, in1, slide_val
Outputs - out0, out1
......@@ -859,6 +886,34 @@
}
#define VSHF_W2_SB(...) VSHF_W2(v16i8, __VA_ARGS__)
/* Description : Dot product of byte vector elements
Arguments : Inputs - mult0, mult1
cnst0, cnst1
Outputs - out0, out1
Return Type - unsigned halfword
Details : Unsigned byte elements from mult0 are multiplied with
unsigned byte elements from cnst0 producing a result
twice the size of input i.e. unsigned halfword.
Then this multiplication results of adjacent odd-even elements
are added together and stored to the out vector
(2 unsigned halfword results)
*/
#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
{ \
out0 = (RTYPE) __msa_dotp_u_h((v16u8) mult0, (v16u8) cnst0); \
out1 = (RTYPE) __msa_dotp_u_h((v16u8) mult1, (v16u8) cnst1); \
}
#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)
#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, \
cnst0, cnst1, cnst2, cnst3, \
out0, out1, out2, out3) \
{ \
DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
}
#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
/* Description : Dot product of byte vector elements
Arguments : Inputs - mult0, mult1
cnst0, cnst1
......@@ -1363,6 +1418,7 @@
out0 = (RTYPE) __msa_ilvr_d((v2i64) (in0), (v2i64) (in1)); \
out1 = (RTYPE) __msa_ilvr_d((v2i64) (in2), (v2i64) (in3)); \
}
#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment