Commit 76a99d46 authored by James Almer

x86/hevc_res_add: add ff_hevc_transform_add{8,16,32}_8_avx

~15% faster than sse2
Reviewed-by: Mickaël Raulet <mraulet@gmail.com>
Reviewed-by: Christophe Gisquet <christophe.gisquet@gmail.com>
Signed-off-by: James Almer <jamrial@gmail.com>
parent d2163f5e
...@@ -156,8 +156,8 @@ cglobal hevc_transform_add4_8, 3, 4, 6 ...@@ -156,8 +156,8 @@ cglobal hevc_transform_add4_8, 3, 4, 6
%endmacro %endmacro
INIT_XMM sse2 %macro TRANSFORM_ADD_8 0
; void ff_hevc_transform_add8_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride) ; void ff_hevc_transform_add8_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
cglobal hevc_transform_add8_8, 3, 4, 8 cglobal hevc_transform_add8_8, 3, 4, 8
lea r3, [r2*3] lea r3, [r2*3]
TR_ADD_SSE_8_8 TR_ADD_SSE_8_8
...@@ -167,7 +167,7 @@ cglobal hevc_transform_add8_8, 3, 4, 8 ...@@ -167,7 +167,7 @@ cglobal hevc_transform_add8_8, 3, 4, 8
RET RET
%if ARCH_X86_64 %if ARCH_X86_64
; void ff_hevc_transform_add16_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride) ; void ff_hevc_transform_add16_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
cglobal hevc_transform_add16_8, 3, 4, 12 cglobal hevc_transform_add16_8, 3, 4, 12
lea r3, [r2*3] lea r3, [r2*3]
TR_ADD_SSE_16_8 TR_ADD_SSE_16_8
...@@ -178,7 +178,7 @@ cglobal hevc_transform_add16_8, 3, 4, 12 ...@@ -178,7 +178,7 @@ cglobal hevc_transform_add16_8, 3, 4, 12
%endrep %endrep
RET RET
; void ff_hevc_transform_add16_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride) ; void ff_hevc_transform_add32_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
cglobal hevc_transform_add32_8, 3, 4, 12 cglobal hevc_transform_add32_8, 3, 4, 12
TR_ADD_SSE_32_8 TR_ADD_SSE_32_8
...@@ -190,6 +190,13 @@ cglobal hevc_transform_add32_8, 3, 4, 12 ...@@ -190,6 +190,13 @@ cglobal hevc_transform_add32_8, 3, 4, 12
RET RET
%endif ;ARCH_X86_64 %endif ;ARCH_X86_64
%endmacro
INIT_XMM sse2
TRANSFORM_ADD_8
INIT_XMM avx
TRANSFORM_ADD_8
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void ff_hevc_transform_add_10(pixel *dst, int16_t *block, int stride) ; void ff_hevc_transform_add_10(pixel *dst, int16_t *block, int stride)
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
......
...@@ -139,6 +139,10 @@ void ff_hevc_transform_add8_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stri ...@@ -139,6 +139,10 @@ void ff_hevc_transform_add8_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stri
void ff_hevc_transform_add16_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); void ff_hevc_transform_add16_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
void ff_hevc_transform_add32_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); void ff_hevc_transform_add32_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
void ff_hevc_transform_add8_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
void ff_hevc_transform_add16_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
void ff_hevc_transform_add32_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
void ff_hevc_transform_add4_10_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); void ff_hevc_transform_add4_10_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
void ff_hevc_transform_add8_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); void ff_hevc_transform_add8_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
void ff_hevc_transform_add16_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); void ff_hevc_transform_add16_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
......
...@@ -509,7 +509,11 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth) ...@@ -509,7 +509,11 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
if (ARCH_X86_64) { if (ARCH_X86_64) {
c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_avx; c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_avx;
c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_avx; c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_avx;
c->transform_add[2] = ff_hevc_transform_add16_8_avx;
c->transform_add[3] = ff_hevc_transform_add32_8_avx;
} }
c->transform_add[1] = ff_hevc_transform_add8_8_avx;
} }
if (EXTERNAL_AVX2(cpu_flags)) { if (EXTERNAL_AVX2(cpu_flags)) {
c->idct_dc[2] = ff_hevc_idct16x16_dc_8_avx2; c->idct_dc[2] = ff_hevc_idct16x16_dc_8_avx2;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment