Commit 3d653598 authored by Clément Bœsch

Merge commit '6d5636ad'

* commit '6d5636ad':
  hevc: x86: Add add_residual() SIMD optimizations

See a6af4bf6

This merge is cosmetic only (renames, space shuffling, etc.).

The functional changes in the ASM are *not* merged:
- unrolling with %rep is kept
- ADD_RES_MMX_4_8 is left untouched: this needs investigation
Merged-by: Clément Bœsch <u@pkh.me>
parents 40ac2260 6d5636ad
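
For context: the add_residual() functions touched by this merge add the inverse-transformed residual back onto the predicted pixels and clamp the result to the valid pixel range; the SIMD versions below do this 4, 8, 16 or 32 pixels at a time. A minimal scalar sketch of the 8-bit case (illustrative only, not FFmpeg's actual C template; it assumes a contiguous size x size residual buffer and a row-major dst with the given stride):

#include <stddef.h>
#include <stdint.h>

/* Hypothetical scalar reference for the 8-bit add_residual operation:
 * dst[x] = clip_uint8(dst[x] + res[x]) over a size x size block. */
static void add_residual_8_c(uint8_t *dst, const int16_t *res,
                             ptrdiff_t stride, int size)
{
    for (int y = 0; y < size; y++) {
        for (int x = 0; x < size; x++) {
            int v = dst[x] + res[x];
            dst[x] = v < 0 ? 0 : v > 255 ? 255 : v; /* clamp to the 8-bit range */
        }
        dst += stride; /* next row of the destination picture */
        res += size;   /* residual coefficients are stored contiguously */
    }
}

The 10-bit variants clamp against 1023 instead, which is what the CLIPW operations against max_pixels_10 implement in the assembly below.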
@@ -97,6 +97,7 @@ Stuff that didn't reach the codebase:
 - VAAPI VP8 decode hwaccel (currently under review: http://ffmpeg.org/pipermail/ffmpeg-devel/2017-February/thread.html#207348)
 - Removal of the custom atomic API (5cc0057f49, see http://ffmpeg.org/pipermail/ffmpeg-devel/2017-March/209003.html)
 - Use the new bitstream filter for extracting extradata (see 8e2ea69135 and 096a8effa3)
+- ADD_RES_MMX_4_8 in libavcodec/x86/hevc_add_res.asm probably needs updating (see 589880710)
 Collateral damage that needs work locally:
 ------------------------------------------
...
@@ -46,7 +46,7 @@ typedef struct HEVCDSPContext {
     void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
                     struct GetBitContext *gb, int pcm_bit_depth);
-    void (*add_residual[4])(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride);
+    void (*add_residual[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride);
     void (*dequant)(int16_t *coeffs, int16_t log2_size);
...
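
The four add_residual[] entries above correspond to the 4x4, 8x8, 16x16 and 32x32 transform sizes (the init-table changes further down assign one function per size). A hedged sketch of how such a table is indexed, with an illustrative 4x4 implementation and call site (not a copy of the decoder, which derives the index from the transform block's log2 size):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef void (*add_residual_fn)(uint8_t *dst, int16_t *res, ptrdiff_t stride);

/* Illustrative 4x4 slot; the other slots would hold the 8/16/32 variants. */
static void add_res_4x4(uint8_t *dst, int16_t *res, ptrdiff_t stride)
{
    for (int y = 0; y < 4; y++, dst += stride, res += 4)
        for (int x = 0; x < 4; x++) {
            int v = dst[x] + res[x];
            dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
}

int main(void)
{
    add_residual_fn add_residual[4] = { add_res_4x4, NULL, NULL, NULL };
    uint8_t dst[4 * 4] = { 0 };
    int16_t res[4 * 4] = { 100 };
    int log2_size = 2;                        /* a 4x4 transform block */

    add_residual[log2_size - 2](dst, res, 4); /* entry 0 -> the 4x4 variant */
    printf("dst[0] = %d\n", dst[0]);          /* prints 100 */
    return 0;
}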
-; /*
+; *****************************************************************************
 ; * Provide SIMD optimizations for add_residual functions for HEVC decoding
 ; * Copyright (c) 2014 Pierre-Edouard LEPERE
 ; *
@@ -17,7 +17,8 @@
 ; * You should have received a copy of the GNU Lesser General Public
 ; * License along with FFmpeg; if not, write to the Free Software
 ; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-; */
+; ******************************************************************************
 %include "libavutil/x86/x86util.asm"
 SECTION .text
@@ -25,9 +26,8 @@ SECTION .text
 cextern pw_1023
 %define max_pixels_10 pw_1023
-;the tr_add macros and functions were largely inspired by x264 project's code in the h264_idct.asm file
-%macro TR_ADD_MMX_4_8 0
+; the add_res macros and functions were largely inspired by h264_idct.asm from the x264 project
+%macro ADD_RES_MMX_4_8 0
     mova m2, [r1]
     mova m4, [r1+8]
     pxor m3, m3
@@ -39,27 +39,27 @@ cextern pw_1023
     packuswb m4, m4
     packuswb m5, m5
-    movh m0, [r0 ]
-    movh m1, [r0+r2 ]
+    movh m0, [r0]
+    movh m1, [r0+r2]
     paddusb m0, m2
     paddusb m1, m4
     psubusb m0, m3
     psubusb m1, m5
-    movh [r0 ], m0
-    movh [r0+r2 ], m1
+    movh [r0], m0
+    movh [r0+r2], m1
 %endmacro
 INIT_MMX mmxext
-; void ff_hevc_tranform_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-cglobal hevc_add_residual4_8, 3, 4, 6
-    TR_ADD_MMX_4_8
+; void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride)
+cglobal hevc_add_residual_4_8, 3, 4, 6
+    ADD_RES_MMX_4_8
     add r1, 16
     lea r0, [r0+r2*2]
-    TR_ADD_MMX_4_8
+    ADD_RES_MMX_4_8
     RET
-%macro TR_ADD_SSE_8_8 0
+%macro ADD_RES_SSE_8_8 0
     pxor m3, m3
     mova m4, [r1]
     mova m6, [r1+16]
@@ -74,22 +74,22 @@ cglobal hevc_add_residual4_8, 3, 4, 6
     packuswb m6, m2
     packuswb m7, m3
-    movq m0, [r0 ]
-    movq m1, [r0+r2 ]
+    movq m0, [r0]
+    movq m1, [r0+r2]
     movhps m0, [r0+r2*2]
-    movhps m1, [r0+r3 ]
+    movhps m1, [r0+r3]
     paddusb m0, m4
     paddusb m1, m6
     psubusb m0, m5
     psubusb m1, m7
-    movq [r0 ], m0
-    movq [r0+r2 ], m1
+    movq [r0], m0
+    movq [r0+r2], m1
     movhps [r0+2*r2], m0
-    movhps [r0+r3 ], m1
+    movhps [r0+r3], m1
 %endmacro
-%macro TR_ADD_SSE_16_32_8 3
-    mova xm2, [r1+%1 ]
+%macro ADD_RES_SSE_16_32_8 3
+    mova xm2, [r1+%1]
     mova xm6, [r1+%1+16]
 %if cpuflag(avx2)
     vinserti128 m2, m2, [r1+%1+32], 1
@@ -107,7 +107,7 @@ cglobal hevc_add_residual4_8, 3, 4, 6
     packuswb m2, m6
     packuswb m1, m5
-    mova xm4, [r1+%1+mmsize*2 ]
+    mova xm4, [r1+%1+mmsize*2]
     mova xm6, [r1+%1+mmsize*2+16]
 %if cpuflag(avx2)
     vinserti128 m4, m4, [r1+%1+96 ], 1
@@ -135,39 +135,39 @@ cglobal hevc_add_residual4_8, 3, 4, 6
 %macro TRANSFORM_ADD_8 0
-; void ff_hevc_add_residual8_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-cglobal hevc_add_residual8_8, 3, 4, 8
+; void ff_hevc_add_residual_8_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
+cglobal hevc_add_residual_8_8, 3, 4, 8
     lea r3, [r2*3]
-    TR_ADD_SSE_8_8
+    ADD_RES_SSE_8_8
     add r1, 64
     lea r0, [r0+r2*4]
-    TR_ADD_SSE_8_8
+    ADD_RES_SSE_8_8
     RET
-; void ff_hevc_add_residual16_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-cglobal hevc_add_residual16_8, 3, 4, 7
+; void ff_hevc_add_residual_16_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
+cglobal hevc_add_residual_16_8, 3, 4, 7
     pxor m0, m0
     lea r3, [r2*3]
-    TR_ADD_SSE_16_32_8 0, r0, r0+r2
-    TR_ADD_SSE_16_32_8 64, r0+r2*2, r0+r3
+    ADD_RES_SSE_16_32_8 0, r0, r0+r2
+    ADD_RES_SSE_16_32_8 64, r0+r2*2, r0+r3
 %rep 3
     add r1, 128
     lea r0, [r0+r2*4]
-    TR_ADD_SSE_16_32_8 0, r0, r0+r2
-    TR_ADD_SSE_16_32_8 64, r0+r2*2, r0+r3
+    ADD_RES_SSE_16_32_8 0, r0, r0+r2
+    ADD_RES_SSE_16_32_8 64, r0+r2*2, r0+r3
 %endrep
     RET
-; void ff_hevc_add_residual32_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-cglobal hevc_add_residual32_8, 3, 4, 7
+; void ff_hevc_add_residual_32_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
+cglobal hevc_add_residual_32_8, 3, 4, 7
     pxor m0, m0
-    TR_ADD_SSE_16_32_8 0, r0, r0+16
-    TR_ADD_SSE_16_32_8 64, r0+r2, r0+r2+16
+    ADD_RES_SSE_16_32_8 0, r0, r0+16
+    ADD_RES_SSE_16_32_8 64, r0+r2, r0+r2+16
 %rep 15
     add r1, 128
     lea r0, [r0+r2*2]
-    TR_ADD_SSE_16_32_8 0, r0, r0+16
-    TR_ADD_SSE_16_32_8 64, r0+r2, r0+r2+16
+    ADD_RES_SSE_16_32_8 0, r0, r0+16
+    ADD_RES_SSE_16_32_8 64, r0+r2, r0+r2+16
 %endrep
     RET
 %endmacro
@@ -179,80 +179,77 @@ TRANSFORM_ADD_8
 %if HAVE_AVX2_EXTERNAL
 INIT_YMM avx2
-; void ff_hevc_add_residual32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-cglobal hevc_add_residual32_8, 3, 4, 7
+; void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride)
+cglobal hevc_add_residual_32_8, 3, 4, 7
     pxor m0, m0
     lea r3, [r2*3]
-    TR_ADD_SSE_16_32_8 0, r0, r0+r2
-    TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3
+    ADD_RES_SSE_16_32_8 0, r0, r0+r2
+    ADD_RES_SSE_16_32_8 128, r0+r2*2, r0+r3
 %rep 7
     add r1, 256
     lea r0, [r0+r2*4]
-    TR_ADD_SSE_16_32_8 0, r0, r0+r2
-    TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3
+    ADD_RES_SSE_16_32_8 0, r0, r0+r2
+    ADD_RES_SSE_16_32_8 128, r0+r2*2, r0+r3
 %endrep
     RET
 %endif
-;-----------------------------------------------------------------------------
-; void ff_hevc_add_residual_10(pixel *dst, int16_t *block, int stride)
-;-----------------------------------------------------------------------------
-%macro TR_ADD_SSE_8_10 4
+%macro ADD_RES_SSE_8_10 4
     mova m0, [%4]
     mova m1, [%4+16]
     mova m2, [%4+32]
     mova m3, [%4+48]
-    paddw m0, [%1+0 ]
-    paddw m1, [%1+%2 ]
+    paddw m0, [%1+0]
+    paddw m1, [%1+%2]
     paddw m2, [%1+%2*2]
-    paddw m3, [%1+%3 ]
+    paddw m3, [%1+%3]
     CLIPW m0, m4, m5
     CLIPW m1, m4, m5
     CLIPW m2, m4, m5
     CLIPW m3, m4, m5
-    mova [%1+0 ], m0
-    mova [%1+%2 ], m1
+    mova [%1+0], m0
+    mova [%1+%2], m1
     mova [%1+%2*2], m2
-    mova [%1+%3 ], m3
+    mova [%1+%3], m3
 %endmacro
-%macro TR_ADD_MMX4_10 3
-    mova m0, [%1+0 ]
-    mova m1, [%1+%2 ]
+%macro ADD_RES_MMX_4_10 3
+    mova m0, [%1+0]
+    mova m1, [%1+%2]
     paddw m0, [%3]
     paddw m1, [%3+8]
     CLIPW m0, m2, m3
     CLIPW m1, m2, m3
-    mova [%1+0 ], m0
-    mova [%1+%2 ], m1
+    mova [%1+0], m0
+    mova [%1+%2], m1
 %endmacro
-%macro TRANS_ADD_SSE_16_10 3
+%macro ADD_RES_SSE_16_10 3
     mova m0, [%3]
     mova m1, [%3+16]
     mova m2, [%3+32]
     mova m3, [%3+48]
-    paddw m0, [%1 ]
-    paddw m1, [%1+16 ]
-    paddw m2, [%1+%2 ]
+    paddw m0, [%1]
+    paddw m1, [%1+16]
+    paddw m2, [%1+%2]
     paddw m3, [%1+%2+16]
     CLIPW m0, m4, m5
     CLIPW m1, m4, m5
     CLIPW m2, m4, m5
     CLIPW m3, m4, m5
-    mova [%1 ], m0
-    mova [%1+16 ], m1
-    mova [%1+%2 ], m2
+    mova [%1], m0
+    mova [%1+16], m1
+    mova [%1+%2], m2
     mova [%1+%2+16], m3
 %endmacro
-%macro TRANS_ADD_SSE_32_10 2
+%macro ADD_RES_SSE_32_10 2
     mova m0, [%2]
     mova m1, [%2+16]
     mova m2, [%2+32]
     mova m3, [%2+48]
-    paddw m0, [%1 ]
+    paddw m0, [%1]
     paddw m1, [%1+16]
     paddw m2, [%1+32]
     paddw m3, [%1+48]
@@ -260,129 +257,125 @@ cglobal hevc_add_residual32_8, 3, 4, 7
     CLIPW m1, m4, m5
     CLIPW m2, m4, m5
     CLIPW m3, m4, m5
-    mova [%1 ], m0
+    mova [%1], m0
     mova [%1+16], m1
     mova [%1+32], m2
     mova [%1+48], m3
 %endmacro
-%macro TRANS_ADD16_AVX2 4
+%macro ADD_RES_AVX2_16_10 4
     mova m0, [%4]
     mova m1, [%4+32]
     mova m2, [%4+64]
     mova m3, [%4+96]
-    paddw m0, [%1+0 ]
-    paddw m1, [%1+%2 ]
+    paddw m0, [%1+0]
+    paddw m1, [%1+%2]
     paddw m2, [%1+%2*2]
-    paddw m3, [%1+%3 ]
+    paddw m3, [%1+%3]
     CLIPW m0, m4, m5
     CLIPW m1, m4, m5
     CLIPW m2, m4, m5
     CLIPW m3, m4, m5
-    mova [%1+0 ], m0
-    mova [%1+%2 ], m1
+    mova [%1+0], m0
+    mova [%1+%2], m1
     mova [%1+%2*2], m2
-    mova [%1+%3 ], m3
+    mova [%1+%3], m3
 %endmacro
-%macro TRANS_ADD32_AVX2 3
+%macro ADD_RES_AVX2_32_10 3
     mova m0, [%3]
    mova m1, [%3+32]
     mova m2, [%3+64]
     mova m3, [%3+96]
-    paddw m0, [%1 ]
-    paddw m1, [%1+32 ]
-    paddw m2, [%1+%2 ]
+    paddw m0, [%1]
+    paddw m1, [%1+32]
+    paddw m2, [%1+%2]
     paddw m3, [%1+%2+32]
     CLIPW m0, m4, m5
     CLIPW m1, m4, m5
     CLIPW m2, m4, m5
     CLIPW m3, m4, m5
-    mova [%1 ], m0
-    mova [%1+32 ], m1
-    mova [%1+%2 ], m2
+    mova [%1], m0
+    mova [%1+32], m1
+    mova [%1+%2], m2
     mova [%1+%2+32], m3
 %endmacro
+; void ff_hevc_add_residual_<4|8|16|32>_10(pixel *dst, int16_t *block, ptrdiff_t stride)
 INIT_MMX mmxext
-cglobal hevc_add_residual4_10,3,4, 6
+cglobal hevc_add_residual_4_10, 3, 4, 6
     pxor m2, m2
     mova m3, [max_pixels_10]
-    TR_ADD_MMX4_10 r0, r2, r1
+    ADD_RES_MMX_4_10 r0, r2, r1
     add r1, 16
     lea r0, [r0+2*r2]
-    TR_ADD_MMX4_10 r0, r2, r1
+    ADD_RES_MMX_4_10 r0, r2, r1
     RET
-;-----------------------------------------------------------------------------
-; void ff_hevc_add_residual_10(pixel *dst, int16_t *block, int stride)
-;-----------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal hevc_add_residual8_10,3,4,6
+cglobal hevc_add_residual_8_10, 3, 4, 6
     pxor m4, m4
     mova m5, [max_pixels_10]
     lea r3, [r2*3]
-    TR_ADD_SSE_8_10 r0, r2, r3, r1
+    ADD_RES_SSE_8_10 r0, r2, r3, r1
     lea r0, [r0+r2*4]
     add r1, 64
-    TR_ADD_SSE_8_10 r0, r2, r3, r1
+    ADD_RES_SSE_8_10 r0, r2, r3, r1
     RET
-cglobal hevc_add_residual16_10,3,4,6
+cglobal hevc_add_residual_16_10, 3, 4, 6
     pxor m4, m4
     mova m5, [max_pixels_10]
-    TRANS_ADD_SSE_16_10 r0, r2, r1
+    ADD_RES_SSE_16_10 r0, r2, r1
 %rep 7
     lea r0, [r0+r2*2]
     add r1, 64
-    TRANS_ADD_SSE_16_10 r0, r2, r1
+    ADD_RES_SSE_16_10 r0, r2, r1
 %endrep
     RET
-cglobal hevc_add_residual32_10,3,4,6
+cglobal hevc_add_residual_32_10, 3, 4, 6
     pxor m4, m4
     mova m5, [max_pixels_10]
-    TRANS_ADD_SSE_32_10 r0, r1
+    ADD_RES_SSE_32_10 r0, r1
 %rep 31
     lea r0, [r0+r2]
     add r1, 64
-    TRANS_ADD_SSE_32_10 r0, r1
+    ADD_RES_SSE_32_10 r0, r1
 %endrep
     RET
 %if HAVE_AVX2_EXTERNAL
 INIT_YMM avx2
-cglobal hevc_add_residual16_10,3,4,6
+cglobal hevc_add_residual_16_10, 3, 4, 6
     pxor m4, m4
     mova m5, [max_pixels_10]
     lea r3, [r2*3]
-    TRANS_ADD16_AVX2 r0, r2, r3, r1
+    ADD_RES_AVX2_16_10 r0, r2, r3, r1
 %rep 3
     lea r0, [r0+r2*4]
     add r1, 128
-    TRANS_ADD16_AVX2 r0, r2, r3, r1
+    ADD_RES_AVX2_16_10 r0, r2, r3, r1
 %endrep
     RET
-cglobal hevc_add_residual32_10,3,4,6
+cglobal hevc_add_residual_32_10, 3, 4, 6
     pxor m4, m4
     mova m5, [max_pixels_10]
-    TRANS_ADD32_AVX2 r0, r2, r1
+    ADD_RES_AVX2_32_10 r0, r2, r1
 %rep 15
     lea r0, [r0+r2*2]
     add r1, 128
-    TRANS_ADD32_AVX2 r0, r2, r1
+    ADD_RES_AVX2_32_10 r0, r2, r1
 %endrep
     RET
-%endif ;HAVE_AVX_EXTERNAL
+%endif ;HAVE_AVX2_EXTERNAL
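
For readers unfamiliar with the packed-byte trick used by the 8-bit macros above (ADD_RES_MMX_4_8, ADD_RES_SSE_8_8, ADD_RES_SSE_16_32_8): the signed 16-bit residual is split into its positive part and the magnitude of its negative part, both packed to unsigned bytes with saturation, and then applied to the pixels with saturating byte add/subtract (paddusb/psubusb). A hedged SSE2 intrinsics sketch of one 16-pixel row, purely illustrative and not part of the patch:

#include <emmintrin.h> /* SSE2 intrinsics */
#include <stdint.h>

/* Illustrative: add one row of 16 signed 16-bit residuals to 16 8-bit pixels. */
static void add_residual_row16_sse2(uint8_t *dst, const int16_t *res)
{
    const __m128i zero = _mm_setzero_si128();
    __m128i r_lo = _mm_loadu_si128((const __m128i *)(res + 0));
    __m128i r_hi = _mm_loadu_si128((const __m128i *)(res + 8));

    /* packuswb clamps negative words to 0, so this keeps the positive part... */
    __m128i pos = _mm_packus_epi16(r_lo, r_hi);
    /* ...and negating first keeps the magnitude of the negative part. */
    __m128i neg = _mm_packus_epi16(_mm_sub_epi16(zero, r_lo),
                                   _mm_sub_epi16(zero, r_hi));

    __m128i px = _mm_loadu_si128((const __m128i *)dst);
    px = _mm_adds_epu8(px, pos); /* paddusb: saturating add of the positive part */
    px = _mm_subs_epu8(px, neg); /* psubusb: saturating subtract of the negative part */
    _mm_storeu_si128((__m128i *)dst, px);
}

Each pixel's residual is either positive or negative, so only one of the two saturating operations changes it; the assembly macros apply the same idea while interleaving loads and packs to cover several rows per iteration.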
@@ -236,23 +236,24 @@ WEIGHTING_PROTOTYPES(12, sse4);
 ///////////////////////////////////////////////////////////////////////////////
 // TRANSFORM_ADD
 ///////////////////////////////////////////////////////////////////////////////
-void ff_hevc_add_residual4_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_add_residual8_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_add_residual16_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_add_residual32_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_add_residual8_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_add_residual16_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_add_residual32_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_add_residual32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_add_residual4_10_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_add_residual8_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_add_residual16_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_add_residual32_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_add_residual16_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_add_residual32_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_8_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_16_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_32_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_8_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_16_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_32_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_4_10_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_8_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_16_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_32_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_16_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+void ff_hevc_add_residual_32_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
 #endif // AVCODEC_X86_HEVCDSP_H
@@ -713,7 +713,8 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
         if (EXTERNAL_MMXEXT(cpu_flags)) {
             c->idct_dc[0] = ff_hevc_idct_4x4_dc_8_mmxext;
             c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_mmxext;
-            c->add_residual[0] = ff_hevc_add_residual4_8_mmxext;
+            c->add_residual[0] = ff_hevc_add_residual_4_8_mmxext;
         }
         if (EXTERNAL_SSE2(cpu_flags)) {
             c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
@@ -734,9 +735,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             c->idct[0] = ff_hevc_idct_4x4_8_sse2;
             c->idct[1] = ff_hevc_idct_8x8_8_sse2;
-            c->add_residual[1] = ff_hevc_add_residual8_8_sse2;
-            c->add_residual[2] = ff_hevc_add_residual16_8_sse2;
-            c->add_residual[3] = ff_hevc_add_residual32_8_sse2;
+            c->add_residual[1] = ff_hevc_add_residual_8_8_sse2;
+            c->add_residual[2] = ff_hevc_add_residual_16_8_sse2;
+            c->add_residual[3] = ff_hevc_add_residual_32_8_sse2;
         }
         if (EXTERNAL_SSSE3(cpu_flags)) {
             if(ARCH_X86_64) {
@@ -772,9 +773,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             c->idct[0] = ff_hevc_idct_4x4_8_avx;
            c->idct[1] = ff_hevc_idct_8x8_8_avx;
-            c->add_residual[1] = ff_hevc_add_residual8_8_avx;
-            c->add_residual[2] = ff_hevc_add_residual16_8_avx;
-            c->add_residual[3] = ff_hevc_add_residual32_8_avx;
+            c->add_residual[1] = ff_hevc_add_residual_8_8_avx;
+            c->add_residual[2] = ff_hevc_add_residual_16_8_avx;
+            c->add_residual[3] = ff_hevc_add_residual_32_8_avx;
         }
         if (EXTERNAL_AVX2(cpu_flags)) {
             c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_8_avx2;
@@ -874,11 +875,11 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_8_avx2;
             c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_8_avx2;
-            c->add_residual[3] = ff_hevc_add_residual32_8_avx2;
+            c->add_residual[3] = ff_hevc_add_residual_32_8_avx2;
         }
     } else if (bit_depth == 10) {
         if (EXTERNAL_MMXEXT(cpu_flags)) {
-            c->add_residual[0] = ff_hevc_add_residual4_10_mmxext;
+            c->add_residual[0] = ff_hevc_add_residual_4_10_mmxext;
            c->idct_dc[0] = ff_hevc_idct_4x4_dc_10_mmxext;
             c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_mmxext;
         }
@@ -902,9 +903,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             c->idct[0] = ff_hevc_idct_4x4_10_sse2;
             c->idct[1] = ff_hevc_idct_8x8_10_sse2;
-            c->add_residual[1] = ff_hevc_add_residual8_10_sse2;
-            c->add_residual[2] = ff_hevc_add_residual16_10_sse2;
-            c->add_residual[3] = ff_hevc_add_residual32_10_sse2;
+            c->add_residual[1] = ff_hevc_add_residual_8_10_sse2;
+            c->add_residual[2] = ff_hevc_add_residual_16_10_sse2;
+            c->add_residual[3] = ff_hevc_add_residual_32_10_sse2;
         }
         if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
@@ -1090,9 +1091,8 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             SAO_BAND_INIT(10, avx2);
             SAO_EDGE_INIT(10, avx2);
-            c->add_residual[2] = ff_hevc_add_residual16_10_avx2;
-            c->add_residual[3] = ff_hevc_add_residual32_10_avx2;
+            c->add_residual[2] = ff_hevc_add_residual_16_10_avx2;
+            c->add_residual[3] = ff_hevc_add_residual_32_10_avx2;
         }
     } else if (bit_depth == 12) {
         if (EXTERNAL_MMXEXT(cpu_flags)) {
...