Commit c3d2426c authored by James Almer

x86/hevc_res_add: add ff_hevc_transform_add32_8_avx2

~20% faster than AVX.
Reviewed-by: Michael Niedermayer <michaelni@gmx.at>
Signed-off-by: James Almer <jamrial@gmail.com>
parent 467a55a4
...@@ -89,8 +89,12 @@ cglobal hevc_transform_add4_8, 3, 4, 6 ...@@ -89,8 +89,12 @@ cglobal hevc_transform_add4_8, 3, 4, 6
%endmacro %endmacro
%macro TR_ADD_SSE_16_32_8 3 %macro TR_ADD_SSE_16_32_8 3
mova m2, [r1+%1 ] mova xm2, [r1+%1 ]
mova m6, [r1+%1+16] mova xm6, [r1+%1+16]
%if cpuflag(avx2)
vinserti128 m2, m2, [r1+%1+32], 1
vinserti128 m6, m6, [r1+%1+48], 1
%endif
%if cpuflag(avx) %if cpuflag(avx)
psubw m1, m0, m2 psubw m1, m0, m2
psubw m5, m0, m6 psubw m5, m0, m6
...@@ -103,8 +107,12 @@ cglobal hevc_transform_add4_8, 3, 4, 6 ...@@ -103,8 +107,12 @@ cglobal hevc_transform_add4_8, 3, 4, 6
packuswb m2, m6 packuswb m2, m6
packuswb m1, m5 packuswb m1, m5
mova m4, [r1+%1+32] mova xm4, [r1+%1+mmsize*2 ]
mova m6, [r1+%1+48] mova xm6, [r1+%1+mmsize*2+16]
%if cpuflag(avx2)
vinserti128 m4, m4, [r1+%1+96 ], 1
vinserti128 m6, m6, [r1+%1+112], 1
%endif
%if cpuflag(avx) %if cpuflag(avx)
psubw m3, m0, m4 psubw m3, m0, m4
psubw m5, m0, m6 psubw m5, m0, m6
...@@ -169,6 +177,21 @@ TRANSFORM_ADD_8 ...@@ -169,6 +177,21 @@ TRANSFORM_ADD_8
INIT_XMM avx INIT_XMM avx
TRANSFORM_ADD_8 TRANSFORM_ADD_8
; Build the following function for the avx2 cpuflag with m# mapped to ymm
; registers (x86inc INIT_YMM), which enables the %if cpuflag(avx2) paths of
; TR_ADD_SSE_16_32_8.
INIT_YMM avx2
; void ff_hevc_transform_add32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
;
; Adds a 32x32 block of 16-bit residual coefficients to 8-bit pixels at dst.
; Register roles (per the prototype above and x86inc argument numbering):
;   r0 = dst     - advanced by 4 rows between row groups
;   r1 = coeffs  - advanced by 256 bytes between row groups
;   r2 = stride  - unchanged
;   r3 = stride * 3 (scratch, used to address the 4th row of a group)
;   m0 = all zeros; the macro uses it as the minuend in "psubw mX, m0, mY"
; cglobal arguments: 3 function args, 4 general-purpose regs, 7 vector regs.
; NOTE(review): the store half of TR_ADD_SSE_16_32_8 is not visible here; it
; presumably writes one 32-pixel row to each of its 2nd and 3rd arguments -
; confirm against the full macro body.
cglobal hevc_transform_add32_8, 3, 4, 7
; m0 = 0 for the whole function
pxor m0, m0
; r3 = 3 * stride, so r0+r3 addresses the fourth row of the current group
lea r3, [r2*3]
; First group of four rows: coefficient offsets 0 and 128 from r1, i.e. each
; macro invocation consumes 128 bytes (2 rows * 32 coeffs * 2 bytes).
TR_ADD_SSE_16_32_8 0, r0, r0+r2
TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3
; Remaining 7 groups, fully unrolled at assembly time: 1 + 7 groups of 4 rows
; cover all 32 rows.
%rep 7
; step coeffs past the 4 rows just consumed (4 * 32 * sizeof(int16_t) = 256)
add r1, 256
; step dst down 4 rows
lea r0, [r0+r2*4]
TR_ADD_SSE_16_32_8 0, r0, r0+r2
TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3
%endrep
; x86inc epilogue/return macro paired with cglobal
RET
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void ff_hevc_transform_add_10(pixel *dst, int16_t *block, int stride) ; void ff_hevc_transform_add_10(pixel *dst, int16_t *block, int stride)
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
......
...@@ -143,6 +143,8 @@ void ff_hevc_transform_add8_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t strid ...@@ -143,6 +143,8 @@ void ff_hevc_transform_add8_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t strid
void ff_hevc_transform_add16_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); void ff_hevc_transform_add16_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
void ff_hevc_transform_add32_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); void ff_hevc_transform_add32_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
void ff_hevc_transform_add32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
void ff_hevc_transform_add4_10_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); void ff_hevc_transform_add4_10_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
void ff_hevc_transform_add8_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); void ff_hevc_transform_add8_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
void ff_hevc_transform_add16_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); void ff_hevc_transform_add16_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
......
...@@ -555,6 +555,8 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth) ...@@ -555,6 +555,8 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
if (EXTERNAL_AVX2(cpu_flags)) { if (EXTERNAL_AVX2(cpu_flags)) {
c->idct_dc[2] = ff_hevc_idct16x16_dc_8_avx2; c->idct_dc[2] = ff_hevc_idct16x16_dc_8_avx2;
c->idct_dc[3] = ff_hevc_idct32x32_dc_8_avx2; c->idct_dc[3] = ff_hevc_idct32x32_dc_8_avx2;
c->transform_add[3] = ff_hevc_transform_add32_8_avx2;
} }
} else if (bit_depth == 10) { } else if (bit_depth == 10) {
if (EXTERNAL_MMXEXT(cpu_flags)) { if (EXTERNAL_MMXEXT(cpu_flags)) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment