Commit 942e22c6, authored by plepere, committed by Michael Niedermayer

avcodec/x86/hevc: add avx2 dc idct

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
parent a30f1b15
...@@ -20,12 +20,12 @@ ...@@ -20,12 +20,12 @@
; */ ; */
%include "libavutil/x86/x86util.asm" %include "libavutil/x86/x86util.asm"
SECTION_RODATA SECTION_RODATA 32
max_pixels_10: times 8 dw ((1 << 10)-1) max_pixels_10: times 16 dw ((1 << 10)-1)
dc_add_10: times 4 dd ((1 << 14-10) + 1) dc_add_10: times 4 dd ((1 << 14-10) + 1)
SECTION .text SECTION_TEXT 32
;the idct_dc_add macros and functions were largely inspired by x264 project's code in the h264_idct.asm file ;the idct_dc_add macros and functions were largely inspired by x264 project's code in the h264_idct.asm file
...@@ -41,6 +41,18 @@ SECTION .text ...@@ -41,6 +41,18 @@ SECTION .text
packuswb m1, m1 packuswb m1, m1
%endmacro %endmacro
; DC_ADD_INIT_AVX2 dc_reg, stride_reg
; Converts the raw DC coefficient in %1 into two byte vectors, one holding the
; positive part of the rounded DC offset and one holding its negated part.
;   %1 : GPR whose low 16 bits hold the DC coefficient on entry.
;        On exit it holds 3 * %2 (the coefficient is gone).
;   %2 : GPR holding the stride; only read.
; Results: m0 = max(dc, 0) and m1 = max(-dc, 0), each saturated to 0..255 and
;          replicated into every byte.
; Clobbers: m0, m1, %1, flags.
; NOTE(review): m0/m1 are presumably consumed by DC_ADD_OP as the values to
; add/subtract; its body is only partly visible here - confirm.
%macro DC_ADD_INIT_AVX2 2
add %1w, ((1 << 14-8) + 1) ; rounding bias: NASM evaluates 14-8 before <<, so this adds 64 + 1
sar %1w, (15-8) ; signed shift right by 7 gives the per-pixel DC offset
movd xm0, %1d ; must happen before the lea below overwrites %1
vpbroadcastw m0, xm0 ;SPLATW: copy the low word of xm0 into every word of m0
lea %1, [%2*3] ; %1 = 3 * stride
pxor m1, m1 ; m1 = 0
psubw m1, m0 ; m1 = -dc in every word
packuswb m0, m0 ; unsigned saturation: negative words become 0, words above 255 become 255
packuswb m1, m1 ; same for the negated copy, so at most one of m0/m1 is non-zero
%endmacro
%macro DC_ADD_OP 4 %macro DC_ADD_OP 4
%1 m2, [%2 ] %1 m2, [%2 ]
%1 m3, [%2+%3 ] %1 m3, [%2+%3 ]
...@@ -112,6 +124,19 @@ cglobal hevc_idct16_dc_add_8, 3, 4, 0 ...@@ -112,6 +124,19 @@ cglobal hevc_idct16_dc_add_8, 3, 4, 0
DC_ADD_OP mova, r0, r2, r3 DC_ADD_OP mova, r0, r2, r3
RET RET
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
; void ff_hevc_idct32_dc_add_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
;
; Adds the rounded DC term taken from coeffs[0] to a block of 8-bit pixels.
;   r0 = dst     (advanced by 4 * stride between DC_ADD_OP invocations)
;   r1 = coeffs  (only the first 16-bit coefficient is read)
;   r2 = stride
;   r3 = scratch: sign-extended DC value, then 3 * stride after DC_ADD_INIT_AVX2
; DC_ADD_OP is invoked 8 times in total (once, then 7 more inside %rep).
; NOTE(review): DC_ADD_OP presumably handles 4 rows per invocation, which
; would give 32 rows overall; its body is only partly visible here - confirm.
; The rows are accessed with mova, so dst rows are expected to be aligned for
; full-register accesses.
cglobal hevc_idct32_dc_add_8, 3, 4, 6
    movsx             r3, word [r1]         ; r3 = (int16_t)coeffs[0]
    DC_ADD_INIT_AVX2  r3, r2                ; sets m0/m1 and leaves r3 = 3 * stride
    DC_ADD_OP         mova, r0, r2, r3      ; exactly 4 arguments: no trailing comma
%rep 7
    lea               r0, [r0+r2*4]         ; advance dst by 4 rows
    DC_ADD_OP         mova, r0, r2, r3
%endrep
    RET
%endif ;HAVE_AVX2_EXTERNAL
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void ff_hevc_idct_dc_add_10(pixel *dst, int16_t *block, int stride) ; void ff_hevc_idct_dc_add_10(pixel *dst, int16_t *block, int stride)
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
...@@ -178,3 +203,23 @@ IDCT8_DC_ADD ...@@ -178,3 +203,23 @@ IDCT8_DC_ADD
INIT_XMM avx INIT_XMM avx
IDCT8_DC_ADD IDCT8_DC_ADD
%endif %endif
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
; AVX2 DC-only add for 10-bit content; called from C as
; ff_hevc_idct16_dc_add_10_avx2(dst, coeffs, stride).
;   r0 = dst     (advanced by 4 * stride between IDCT_DC_ADD_OP_10 invocations)
;   r1 = coeffs on entry; reused for the DC value, then for 3 * stride
;   r2 = stride
;   m0 = rounded DC offset replicated into every word
;   m6 = max_pixels_10 (every word is (1 << 10) - 1)
; NOTE(review): m6 is presumably the upper clip bound used inside
; IDCT_DC_ADD_OP_10, whose body is not visible here - confirm.
cglobal hevc_idct16_dc_add_10,3,4,7
    ; Scalar part: round and scale the first coefficient.
    mov               r1w, [r1]
    add               r1w, ((1 << 4) + 1)
    sar               r1w, 5

    ; Vector constants. r1 is rewritten only after its low word reached xm0.
    movd              xm0, r1d
    vpbroadcastw      m0, xm0
    mova              m6, [max_pixels_10]
    lea               r1, [r2*3]            ; r1 = 3 * stride

    ; Four invocations in total, stepping dst by 4 * stride in between.
    IDCT_DC_ADD_OP_10 r0, r2, r1
%rep 3
    lea               r0, [r0+r2*4]
    IDCT_DC_ADD_OP_10 r0, r2, r1
%endrep
    RET
%endif ;HAVE_AVX2_EXTERNAL
...@@ -133,6 +133,8 @@ idct_dc_proto(8, 8,mmxext); ...@@ -133,6 +133,8 @@ idct_dc_proto(8, 8,mmxext);
idct_dc_proto(16,8, sse2); idct_dc_proto(16,8, sse2);
idct_dc_proto(32,8, sse2); idct_dc_proto(32,8, sse2);
idct_dc_proto(32,8, avx2);
idct_dc_proto(4, 10,mmxext); idct_dc_proto(4, 10,mmxext);
idct_dc_proto(8, 10, sse2); idct_dc_proto(8, 10, sse2);
...@@ -142,6 +144,10 @@ idct_dc_proto(8, 10, avx); ...@@ -142,6 +144,10 @@ idct_dc_proto(8, 10, avx);
idct_dc_proto(16,10, avx); idct_dc_proto(16,10, avx);
idct_dc_proto(32,10, avx); idct_dc_proto(32,10, avx);
idct_dc_proto(16,10, avx2);
idct_dc_proto(32,10, avx2);
......
...@@ -92,6 +92,17 @@ void ff_hevc_idct32_dc_add_10_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t strid ...@@ -92,6 +92,17 @@ void ff_hevc_idct32_dc_add_10_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t strid
} }
#endif //HAVE_AVX_EXTERNAL #endif //HAVE_AVX_EXTERNAL
#if HAVE_AVX2_EXTERNAL
/*
 * Builds the larger DC add out of four calls to the AVX2 16-wide routine,
 * one per quadrant, in the order: top-left, top-right, bottom-left,
 * bottom-right. The right-hand quadrants start 32 bytes into the row and the
 * bottom quadrants start 16 rows down. Every call receives the same coeffs
 * and stride.
 * NOTE(review): the 32-byte step presumably equals 16 two-byte samples for
 * 10-bit content - confirm.
 */
void ff_hevc_idct32_dc_add_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
{
    int quadrant;

    for (quadrant = 0; quadrant < 4; quadrant++) {
        /* bit 1 selects the lower half, bit 0 selects the right half */
        uint8_t *tile = dst + (quadrant >> 1) * 16 * stride + (quadrant & 1) * 32;

        ff_hevc_idct16_dc_add_10_avx2(tile, coeffs, stride);
    }
}
#endif //HAVE_AVX2_EXTERNAL
#define mc_rep_func(name, bitd, step, W, opt) \ #define mc_rep_func(name, bitd, step, W, opt) \
void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *_dst, ptrdiff_t dststride, \ void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *_dst, ptrdiff_t dststride, \
uint8_t *_src, ptrdiff_t _srcstride, int height, \ uint8_t *_src, ptrdiff_t _srcstride, int height, \
...@@ -438,6 +449,9 @@ void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth) ...@@ -438,6 +449,9 @@ void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth)
QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 8, sse4); QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 8, sse4);
QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 8, sse4); QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 8, sse4);
} }
if (EXTERNAL_AVX2(mm_flags)) {
c->transform_dc_add[3] = ff_hevc_idct32_dc_add_8_avx2;
}
} else if (bit_depth == 10) { } else if (bit_depth == 10) {
if (EXTERNAL_MMXEXT(mm_flags)) { if (EXTERNAL_MMXEXT(mm_flags)) {
c->transform_dc_add[0] = ff_hevc_idct4_dc_add_10_mmxext; c->transform_dc_add[0] = ff_hevc_idct4_dc_add_10_mmxext;
...@@ -473,6 +487,10 @@ void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth) ...@@ -473,6 +487,10 @@ void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth)
c->transform_dc_add[2] = ff_hevc_idct16_dc_add_10_avx; c->transform_dc_add[2] = ff_hevc_idct16_dc_add_10_avx;
c->transform_dc_add[3] = ff_hevc_idct32_dc_add_10_avx; c->transform_dc_add[3] = ff_hevc_idct32_dc_add_10_avx;
} }
if (EXTERNAL_AVX2(mm_flags)) {
c->transform_dc_add[2] = ff_hevc_idct16_dc_add_10_avx2;
c->transform_dc_add[3] = ff_hevc_idct32_dc_add_10_avx2;
}
} }
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment