Commit 6d5636ad authored by Pierre Edouard Lepere's avatar Pierre Edouard Lepere Committed by Diego Biurrun

hevc: x86: Add add_residual() SIMD optimizations

Initially written by Pierre Edouard Lepere <Pierre-Edouard.Lepere@insa-rennes.fr>,
extended by James Almer <jamrial@gmail.com>.
Signed-off-by: 's avatarAlexandra Hájková <alexandra@khirnov.net>
parent 043b0b9f
......@@ -42,7 +42,7 @@ typedef struct HEVCDSPContext {
void (*put_pcm)(uint8_t *dst, ptrdiff_t stride, int size,
GetBitContext *gb, int pcm_bit_depth);
void (*add_residual[4])(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
void (*add_residual[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void (*dequant)(int16_t *coeffs);
void (*transform_4x4_luma)(int16_t *coeffs);
......
......@@ -115,9 +115,10 @@ YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp.o
YASM-OBJS-$(CONFIG_APE_DECODER) += x86/apedsp.o
YASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o
YASM-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o
YASM-OBJS-$(CONFIG_HEVC_DECODER) += x86/hevc_deblock.o \
x86/hevc_mc.o \
x86/hevc_idct.o
YASM-OBJS-$(CONFIG_HEVC_DECODER) += x86/hevc_add_res.o \
x86/hevc_deblock.o \
x86/hevc_idct.o \
x86/hevc_mc.o
YASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o
YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
YASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp.o
......
This diff is collapsed.
......@@ -91,6 +91,25 @@ void ff_hevc_idct_32x32_10_ ## opt(int16_t *coeffs, int col_limit);
IDCT_FUNCS(sse2)
IDCT_FUNCS(avx)
void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_8_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_16_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_32_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_8_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_16_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_32_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_4_10_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_8_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_16_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_32_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_16_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual_32_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
#define GET_PIXELS(width, depth, cf) \
void ff_hevc_get_pixels_ ## width ## _ ## depth ## _ ## cf(int16_t *dst, ptrdiff_t dststride, \
uint8_t *src, ptrdiff_t srcstride, \
......@@ -278,17 +297,24 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->idct_dc[0] = ff_hevc_idct_4x4_dc_8_mmxext;
c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_mmxext;
c->add_residual[0] = ff_hevc_add_residual_4_8_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_sse2;
c->add_residual[1] = ff_hevc_add_residual_8_8_sse2;
c->add_residual[2] = ff_hevc_add_residual_16_8_sse2;
c->add_residual[3] = ff_hevc_add_residual_32_8_sse2;
c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_sse2;
c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_sse2;
c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_sse2;
c->idct[0] = ff_hevc_idct_4x4_8_sse2;
c->idct[1] = ff_hevc_idct_8x8_8_sse2;
SET_QPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);
SET_EPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);
......@@ -307,11 +333,19 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
if (EXTERNAL_AVX(cpu_flags)) {
c->idct[0] = ff_hevc_idct_4x4_8_avx;
c->idct[1] = ff_hevc_idct_8x8_8_avx;
c->add_residual[1] = ff_hevc_add_residual_8_8_avx;
c->add_residual[2] = ff_hevc_add_residual_16_8_avx;
c->add_residual[3] = ff_hevc_add_residual_32_8_avx;
}
if (EXTERNAL_AVX2(cpu_flags)) {
c->add_residual[3] = ff_hevc_add_residual_32_8_avx2;
}
} else if (bit_depth == 10) {
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->idct_dc[0] = ff_hevc_idct_4x4_dc_10_mmxext;
c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_mmxext;
c->add_residual[0] = ff_hevc_add_residual_4_10_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2;
......@@ -330,11 +364,19 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
SET_LUMA_FUNCS(put_unweighted_pred_avg, ff_hevc_put_unweighted_pred_avg, 10, sse2);
SET_CHROMA_FUNCS(put_unweighted_pred_chroma, ff_hevc_put_unweighted_pred, 10, sse2);
SET_CHROMA_FUNCS(put_unweighted_pred_avg_chroma, ff_hevc_put_unweighted_pred_avg, 10, sse2);
c->add_residual[1] = ff_hevc_add_residual_8_10_sse2;
c->add_residual[2] = ff_hevc_add_residual_16_10_sse2;
c->add_residual[3] = ff_hevc_add_residual_32_10_sse2;
}
if (EXTERNAL_AVX(cpu_flags)) {
c->idct[0] = ff_hevc_idct_4x4_10_avx;
c->idct[1] = ff_hevc_idct_8x8_10_avx;
}
if (EXTERNAL_AVX2(cpu_flags)) {
c->add_residual[2] = ff_hevc_add_residual_16_10_avx2;
c->add_residual[3] = ff_hevc_add_residual_32_10_avx2;
}
}
#if ARCH_X86_64
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment