Commit a0d1300f authored by Pierre Edouard Lepere, committed by Michael Niedermayer

x86: hevc_mc: add AVX2 optimizations

before
33304 decicycles in luma_bi_1, 523066 runs, 1222 skips
38138 decicycles in luma_bi_2, 523427 runs, 861 skips
13490 decicycles in luma_uni, 516138 runs, 8150 skips
after
20185 decicycles in luma_bi_1, 519970 runs, 4318 skips
24620 decicycles in luma_bi_2, 521024 runs, 3264 skips
10397 decicycles in luma_uni, 515715 runs, 8573 skips
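
That works out to roughly a 1.65x speedup for luma_bi_1, 1.55x for luma_bi_2 and 1.3x for luma_uni.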

Conflicts:
	libavcodec/x86/hevc_mc.asm
	libavcodec/x86/hevcdsp_init.c
Reviewed-by: James Almer <jamrial@gmail.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
parent f9681664
@@ -20,19 +20,20 @@
; */
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
pw_8: times 8 dw (1 << 9)
pw_10: times 8 dw (1 << 11)
pw_12: times 8 dw (1 << 13)
pw_bi_8: times 8 dw (1 << 8)
pw_bi_10: times 8 dw (1 << 10)
pw_bi_12: times 8 dw (1 << 12)
max_pixels_10: times 8 dw ((1 << 10)-1)
max_pixels_12: times 8 dw ((1 << 12)-1)
zero: times 4 dd 0
one_per_32: times 4 dd 1
SECTION .text
SECTION_RODATA 32
pw_8: times 16 dw (1 << 9)
pw_10: times 16 dw (1 << 11)
pw_12: times 16 dw (1 << 13)
pw_bi_8: times 16 dw (1 << 8)
pw_bi_10: times 16 dw (1 << 10)
pw_bi_12: times 16 dw (1 << 12)
max_pixels_8: times 16 dw ((1 << 8)-1)
max_pixels_10: times 16 dw ((1 << 10)-1)
max_pixels_12: times 16 dw ((1 << 12)-1)
zero: times 8 dd 0
one_per_32: times 8 dd 1
SECTION_TEXT 32
%macro EPEL_TABLE 4
hevc_epel_filters_%4_%1 times %2 d%3 -2, 58
times %2 d%3 10, -2
@@ -51,6 +52,8 @@ hevc_epel_filters_%4_%1 times %2 d%3 -2, 58
%endmacro
EPEL_TABLE 8,16, b, avx2
EPEL_TABLE 10, 8, w, avx2
EPEL_TABLE 8, 8, b, sse4
EPEL_TABLE 10, 4, w, sse4
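; Expanded, the first rows of EPEL_TABLE 8,16, b, avx2 above read:
;     hevc_epel_filters_avx2_8: times 16 db -2, 58
;                               times 16 db 10, -2
; i.e. each pair of filter taps is splatted across a full 32-byte ymm register
; (the remaining rows of the macro are elided by the diff).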
@@ -75,10 +78,15 @@ QPEL_TABLE 8, 8, b, sse4
QPEL_TABLE 10, 4, w, sse4
QPEL_TABLE 12, 4, w, sse4
QPEL_TABLE 8,16, b, avx2
QPEL_TABLE 10, 8, w, avx2
%define MAX_PB_SIZE 64
%define hevc_qpel_filters_sse4_14 hevc_qpel_filters_sse4_10
%define hevc_qpel_filters_avx2_14 hevc_qpel_filters_avx2_10
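; Note: the 14-bit "filters" alias the 10-bit tables because the hv paths run
; their second pass on the 14-bit intermediate of the first pass, using the
; same word-sized coefficients (see the EPEL_COMPUTE 14, ... call sites below).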
%if ARCH_X86_64
%macro SIMPLE_BILOAD 4 ;width, tab, r1, r2
@@ -87,11 +95,22 @@ QPEL_TABLE 12, 4, w, sse4
%elif %1 <= 8
movdqa %3, [%2] ; load data from source2
%elif %1 <= 12
%if cpuflag(avx2)
mova %3, [%2]
%else
movdqa %3, [%2] ; load data from source2
movq %4, [%2+16] ; load data from source2
%endif ;avx
%elif %1 <= 16
%if cpuflag(avx2)
movu %3, [%2]
%else
movdqa %3, [%2] ; load data from source2
movdqa %4, [%2+16] ; load data from source2
%endif ; avx
%else ; %1 = 32
movu %3, [%2]
movu %4, [%2+32]
%endif
%endmacro
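; src2 here is the 16-bit bi-prediction intermediate (int16_t, MAX_PB_SIZE
; stride), so the width tests above count pixels while the loads count bytes:
; 8 pixels fit one 16-byte xmm load, 16 pixels one 32-byte ymm load on avx2.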
@@ -100,71 +119,108 @@ QPEL_TABLE 12, 4, w, sse4
movd %4, [%3] ; load data from source
%elif %1 == 4 || (%2 == 8 && %1 <= 8)
movq %4, [%3] ; load data from source
%elif notcpuflag(avx)
movu %4, [%3] ; load data from source
%elif %1 <= 8 || (%2 == 8 && %1 <= 16)
movdqu %4, [%3]
%else
movdqu %4, [%3] ; load data from source
movu %4, [%3]
%endif
%endmacro
%macro SIMPLE_8LOAD 5 ;width, bitd, tab, r1, r2
%if %1 == 2 || (%2 == 8 && %1 <= 4)
movq %4, [%3] ; load data from source2
%elif %1 == 4 || (%2 == 8 && %1 <= 8)
movdqa %4, [%3] ; load data from source2
%elif %1 <= 12
movdqa %4, [%3] ; load data from source2
movq %5, [%3+16] ; load data from source2
%else
movdqa %4, [%3] ; load data from source2
movdqa %5, [%3+16] ; load data from source2
%endif
%endmacro
%macro EPEL_FILTER 2-4 ; bit depth, filter index
%if cpuflag(avx2)
%assign %%offset 32
%ifdef PIC
lea rfilterq, [hevc_epel_filters_avx2_%1]
%else
%define rfilterq hevc_epel_filters_avx2_%1
%endif
%else
%assign %%offset 16
%ifdef PIC
lea rfilterq, [hevc_epel_filters_sse4_%1]
%else
%define rfilterq hevc_epel_filters_sse4_%1
%endif
%endif ;cpuflag(avx2)
sub %2q, 1
%if cpuflag(avx2)
shl %2q, 6 ; multiply by 64
%else
shl %2q, 5 ; multiply by 32
movdqa %3, [rfilterq + %2q] ; get 2 first values of filters
movdqa %4, [rfilterq + %2q+16] ; get 2 last values of filters
%endif
%if %0 == 2
mova m14, [rfilterq + %2q] ; get 2 first values of filters
mova m15, [rfilterq + %2q+%%offset] ; get 2 last values of filters
%else
mova %3, [rfilterq + %2q] ; get 2 first values of filters
mova %4, [rfilterq + %2q+%%offset] ; get 2 last values of filters
%endif
%endmacro
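; Worked example, assuming the avx2 path: with mx == 3, (3-1) << 6 = 128 selects
; the third 64-byte table entry, whose two 32-byte halves land in m14/m15 (or
; %3/%4).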
%macro EPEL_HV_FILTER 1
%if cpuflag(avx2)
%assign %%offset 32
%assign %%shift 6
%define %%table hevc_epel_filters_avx2_%1
%else
%assign %%offset 16
%assign %%shift 5
%define %%table hevc_epel_filters_sse4_%1
%endif
%ifdef PIC
lea rfilterq, [hevc_epel_filters_sse4_%1]
lea rfilterq, [%%table]
%else
%define rfilterq hevc_epel_filters_sse4_%1
%define rfilterq %%table
%endif
sub mxq, 1
sub myq, 1
shl mxq, 5 ; multiply by 32
shl myq, 5 ; multiply by 32
movdqa m14, [rfilterq + mxq] ; get 2 first values of filters
movdqa m15, [rfilterq + mxq+16] ; get 2 last values of filters
shl mxq, %%shift ; multiply by 32 (sse4) or 64 (avx2)
shl myq, %%shift ; multiply by 32 (sse4) or 64 (avx2)
mova m14, [rfilterq + mxq] ; get 2 first values of filters
mova m15, [rfilterq + mxq+%%offset] ; get 2 last values of filters
lea r3srcq, [srcstrideq*3]
%if cpuflag(avx2)
%define %%table hevc_epel_filters_avx2_10
%else
%define %%table hevc_epel_filters_sse4_10
%endif
%ifdef PIC
lea rfilterq, [hevc_epel_filters_sse4_10]
lea rfilterq, [%%table]
%else
%define rfilterq hevc_epel_filters_sse4_10
%define rfilterq %%table
%endif
movdqa m12, [rfilterq + myq] ; get 2 first values of filters
movdqa m13, [rfilterq + myq+16] ; get 2 last values of filters
mova m12, [rfilterq + myq] ; get 2 first values of filters
mova m13, [rfilterq + myq+%%offset] ; get 2 last values of filters
%endmacro
%macro QPEL_FILTER 2
%if cpuflag(avx2)
%assign %%offset 32
%assign %%shift 7
%define %%table hevc_qpel_filters_avx2_%1
%else
%assign %%offset 16
%assign %%shift 6
%define %%table hevc_qpel_filters_sse4_%1
%endif
%ifdef PIC
lea rfilterq, [hevc_qpel_filters_sse4_%1]
lea rfilterq, [%%table]
%else
%define rfilterq hevc_qpel_filters_sse4_%1
%define rfilterq %%table
%endif
lea %2q, [%2q*8-8]
movdqa m12, [rfilterq + %2q*8] ; get 4 first values of filters
movdqa m13, [rfilterq + %2q*8 + 16] ; get 4 first values of filters
movdqa m14, [rfilterq + %2q*8 + 32] ; get 4 first values of filters
movdqa m15, [rfilterq + %2q*8 + 48] ; get 4 first values of filters
sub %2q, 1
shl %2q, %%shift ; multiply by 64 (sse4) or 128 (avx2)
mova m12, [rfilterq + %2q] ; get first 4 filter values
mova m13, [rfilterq + %2q + %%offset] ; get next 4 filter values
mova m14, [rfilterq + %2q + 2*%%offset] ; get next 4 filter values
mova m15, [rfilterq + %2q + 3*%%offset] ; get last 4 filter values
%endmacro
%macro EPEL_LOAD 4
@@ -191,19 +247,18 @@ QPEL_TABLE 12, 4, w, sse4
%%load m2, [rfilterq+2*%3q]
%%load m3, [rfilterq+r3srcq]
%endif
%if %1 == 8
%if %4 > 8
SBUTTERFLY bw, 0, 1, 10
SBUTTERFLY bw, 2, 3, 10
SBUTTERFLY bw, 0, 1, 7
SBUTTERFLY bw, 2, 3, 7
%else
punpcklbw m0, m1
punpcklbw m2, m3
%endif
%else
%if %4 > 4
SBUTTERFLY wd, 0, 1, 10
SBUTTERFLY wd, 2, 3, 10
SBUTTERFLY wd, 0, 1, 7
SBUTTERFLY wd, 2, 3, 7
%else
punpcklwd m0, m1
punpcklwd m2, m3
@@ -220,7 +275,7 @@ QPEL_TABLE 12, 4, w, sse4
%elif %3 == 8
%define %%load movq
%else
%define %%load movdqu
%define %%load movu
%endif
%else
%if %3 == 2
......@@ -228,7 +283,7 @@ QPEL_TABLE 12, 4, w, sse4
%elif %3 == 4
%define %%load movq
%else
%define %%load movdqu
%define %%load movu
%endif
%endif
%%load m0, [%2-3*%%stride] ;load data from source
@@ -247,10 +302,10 @@ QPEL_TABLE 12, 4, w, sse4
SBUTTERFLY wd, 4, 5, %4
SBUTTERFLY wd, 6, 7, %4
%else
punpcklwd m0, m1
punpcklwd m2, m3
punpcklwd m4, m5
punpcklwd m6, m7
punpcklbw m0, m1
punpcklbw m2, m3
punpcklbw m4, m5
punpcklbw m6, m7
%endif
%else
%if %3 > 4
@@ -259,10 +314,10 @@ QPEL_TABLE 12, 4, w, sse4
SBUTTERFLY dq, 4, 5, %4
SBUTTERFLY dq, 6, 7, %4
%else
punpckldq m0, m1
punpckldq m2, m3
punpckldq m4, m5
punpckldq m6, m7
punpcklwd m0, m1
punpcklwd m2, m3
punpcklwd m4, m5
punpcklwd m6, m7
%endif
%endif
%endmacro
@@ -270,14 +325,14 @@ QPEL_TABLE 12, 4, w, sse4
%macro QPEL_V_LOAD 5
lea %5q, [%2]
sub %5q, r3srcq
movdqu m0, [%5q ] ;load x- 3*srcstride
movdqu m1, [%5q+ %3q ] ;load x- 2*srcstride
movdqu m2, [%5q+ 2*%3q ] ;load x-srcstride
movdqu m3, [%2 ] ;load x
movdqu m4, [%2+ %3q] ;load x+stride
movdqu m5, [%2+ 2*%3q] ;load x+2*stride
movdqu m6, [%2+r3srcq] ;load x+3*stride
movdqu m7, [%2+ 4*%3q] ;load x+4*stride
movu m0, [%5q ] ;load x- 3*srcstride
movu m1, [%5q+ %3q ] ;load x- 2*srcstride
movu m2, [%5q+ 2*%3q ] ;load x-srcstride
movu m3, [%2 ] ;load x
movu m4, [%2+ %3q] ;load x+stride
movu m5, [%2+ 2*%3q] ;load x+2*stride
movu m6, [%2+r3srcq] ;load x+3*stride
movu m7, [%2+ 4*%3q] ;load x+4*stride
%if %1 == 8
%if %4 > 8
SBUTTERFLY bw, 0, 1, 8
@@ -347,8 +402,17 @@ QPEL_TABLE 12, 4, w, sse4
movq [%1+16], %3
%endmacro
%macro PEL_10STORE16 3
%if cpuflag(avx2)
movu [%1], %2
%else
PEL_10STORE8 %1, %2, %3
movdqa [%1+16], %3
%endif
%endmacro
%macro PEL_10STORE32 3
PEL_10STORE16 %1, %2, %3
movu [%1+32], %3
%endmacro
%macro PEL_8STORE2 3
@@ -370,7 +434,14 @@ QPEL_TABLE 12, 4, w, sse4
movd [%1+8], %2
%endmacro
%macro PEL_8STORE16 3
movdqa [%1], %2
%if cpuflag(avx2)
movdqu [%1], %2
%else
mova [%1], %2
%endif ; avx
%endmacro
%macro PEL_8STORE32 3
movu [%1], %2
%endmacro
%macro LOOP_END 3
@@ -381,65 +452,109 @@ QPEL_TABLE 12, 4, w, sse4
%endmacro
%macro MC_PIXEL_COMPUTE 2 ;width, bitdepth
%macro MC_PIXEL_COMPUTE 2-3 ;width, bitdepth
%if %2 == 8
%if cpuflag(avx2) && %0 ==3
%if %1 > 16
vextracti128 xm1, m0, 1
pmovzxbw m1, xm1
psllw m1, 14-%2
%endif
pmovzxbw m0, xm0
%else ; not avx
%if %1 > 8
punpckhbw m1, m0, m2
psllw m1, 14-%2
punpckhbw m1, m0, m2
psllw m1, 14-%2
%endif
punpcklbw m0, m2
punpcklbw m0, m2
%endif
psllw m0, 14-%2
%endif ;avx
psllw m0, 14-%2
%endmacro
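; Note: with a ymm destination, pmovzxbw reads 16 bytes from its xmm source and
; widens them into 16 words spanning both 128-bit lanes, which is why the avx2
; path above only needs the extra vextracti128 step for widths above 16.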
%macro EPEL_COMPUTE 4 ; bitdepth, width, filter1, filter2
%macro EPEL_COMPUTE 4-8 ; bitdepth, width, filter1, filter2, HV/m0, m2, m1, m3
%if %0 == 8
%define %%reg0 %5
%define %%reg2 %6
%define %%reg1 %7
%define %%reg3 %8
%else
%define %%reg0 m0
%define %%reg2 m2
%define %%reg1 m1
%define %%reg3 m3
%endif
%if %1 == 8
pmaddubsw m0, %3 ;x1*c1+x2*c2
pmaddubsw m2, %4 ;x3*c3+x4*c4
paddw m0, m2
%if cpuflag(avx2) && (%0 == 5)
%if %2 > 16
vextracti128 xm10, m0, 1
vinserti128 m10, m1, xm10, 0
%endif
vinserti128 m0, m0, xm1, 1
mova m1, m10
%if %2 > 16
vextracti128 xm10, m2, 1
vinserti128 m10, m3, xm10, 0
%endif
vinserti128 m2, m2, xm3, 1
mova m3, m10
%endif
pmaddubsw %%reg0, %3 ;x1*c1+x2*c2
pmaddubsw %%reg2, %4 ;x3*c3+x4*c4
paddw %%reg0, %%reg2
%if %2 > 8
pmaddubsw m1, %3
pmaddubsw m3, %4
paddw m1, m3
pmaddubsw %%reg1, %3
pmaddubsw %%reg3, %4
paddw %%reg1, %%reg3
%endif
%else
pmaddwd m0, %3
pmaddwd m2, %4
paddd m0, m2
pmaddwd %%reg0, %3
pmaddwd %%reg2, %4
paddd %%reg0, %%reg2
%if %2 > 4
pmaddwd m1, %3
pmaddwd m3, %4
paddd m1, m3
pmaddwd %%reg1, %3
pmaddwd %%reg3, %4
paddd %%reg1, %%reg3
%if %1 != 8
psrad %%reg1, %1-8
%endif
%endif
%if %1 != 8
psrad m0, %1-8
psrad m1, %1-8
psrad %%reg0, %1-8
%endif
packssdw m0, m1
packssdw %%reg0, %%reg1
%endif
%endmacro
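; The vextracti128/vinserti128 sequence above regroups SBUTTERFLY's per-lane
; results: with the 128-bit lanes of m0 = {A0|A1} and m1 = {B0|B1}, it leaves
; m0 = {A0|B0} and m1 = {A1|B1}, so each ymm feeds pmaddubsw with its pixels in
; linear order (as best read from the code; QPEL_COMPUTE below repeats the same
; regroup for four register pairs).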
%macro QPEL_HV_COMPUTE 4 ; width, bitdepth, filter idx
%if cpuflag(avx2)
%assign %%offset 32
%define %%table hevc_qpel_filters_avx2_%2
%else
%assign %%offset 16
%define %%table hevc_qpel_filters_sse4_%2
%endif
%ifdef PIC
lea rfilterq, [hevc_qpel_filters_sse4_%2]
lea rfilterq, [%%table]
%else
%define rfilterq hevc_qpel_filters_sse4_%2
%define rfilterq %%table
%endif
%if %2 == 8
pmaddubsw m0, [rfilterq + %3q*8 ] ;x1*c1+x2*c2
pmaddubsw m2, [rfilterq + %3q*8+16] ;x3*c3+x4*c4
pmaddubsw m4, [rfilterq + %3q*8+32] ;x5*c5+x6*c6
pmaddubsw m6, [rfilterq + %3q*8+48] ;x7*c7+x8*c8
pmaddubsw m2, [rfilterq + %3q*8+%%offset] ;x3*c3+x4*c4
pmaddubsw m4, [rfilterq + %3q*8+2*%%offset] ;x5*c5+x6*c6
pmaddubsw m6, [rfilterq + %3q*8+3*%%offset] ;x7*c7+x8*c8
paddw m0, m2
paddw m4, m6
paddw m0, m4
%else
pmaddwd m0, [rfilterq + %3q*8 ]
pmaddwd m2, [rfilterq + %3q*8+16]
pmaddwd m4, [rfilterq + %3q*8+32]
pmaddwd m6, [rfilterq + %3q*8+48]
pmaddwd m2, [rfilterq + %3q*8+%%offset]
pmaddwd m4, [rfilterq + %3q*8+2*%%offset]
pmaddwd m6, [rfilterq + %3q*8+3*%%offset]
paddd m0, m2
paddd m4, m6
paddd m0, m4
@@ -448,9 +563,9 @@ QPEL_TABLE 12, 4, w, sse4
%endif
%if %1 > 4
pmaddwd m1, [rfilterq + %3q*8 ]
pmaddwd m3, [rfilterq + %3q*8+16]
pmaddwd m5, [rfilterq + %3q*8+32]
pmaddwd m7, [rfilterq + %3q*8+48]
pmaddwd m3, [rfilterq + %3q*8+%%offset]
pmaddwd m5, [rfilterq + %3q*8+2*%%offset]
pmaddwd m7, [rfilterq + %3q*8+3*%%offset]
paddd m1, m3
paddd m5, m7
paddd m1, m5
@@ -462,8 +577,32 @@ QPEL_TABLE 12, 4, w, sse4
%endif
%endmacro
%macro QPEL_COMPUTE 2 ; width, bitdepth
%macro QPEL_COMPUTE 2-3 ; width, bitdepth
%if %2 == 8
%if cpuflag(avx2) && (%0 == 3)
vextracti128 xm10, m0, 1
vinserti128 m10, m1, xm10, 0
vinserti128 m0, m0, xm1, 1
mova m1, m10
vextracti128 xm10, m2, 1
vinserti128 m10, m3, xm10, 0
vinserti128 m2, m2, xm3, 1
mova m3, m10
vextracti128 xm10, m4, 1
vinserti128 m10, m5, xm10, 0
vinserti128 m4, m4, xm5, 1
mova m5, m10
vextracti128 xm10, m6, 1
vinserti128 m10, m7, xm10, 0
vinserti128 m6, m6, xm7, 1
mova m7, m10
%endif
pmaddubsw m0, m12 ;x1*c1+x2*c2
pmaddubsw m2, m13 ;x3*c3+x4*c4
pmaddubsw m4, m14 ;x5*c5+x6*c6
@@ -506,12 +645,16 @@ QPEL_TABLE 12, 4, w, sse4
%endif
%endmacro
%macro BI_COMPUTE 7 ; width, bitd, src1l, src1h, scr2l, scr2h, pw
%macro BI_COMPUTE 7-8 ; width, bitd, src1l, src1h, scr2l, scr2h, pw
paddsw %3, %5
%if %1 > 8
paddsw %4, %6
%endif
UNI_COMPUTE %1, %2, %3, %4, %7
%if %0 == 8 && cpuflag(avx2) && (%2 == 8)
vpermq %3, %3, 216
vpermq %4, %4, 216
%endif
%endmacro
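; 216 = 0xD8 = 0b11011000, i.e. vpermq reorders the 64-bit qwords as {0,2,1,3};
; this undoes the per-128-bit-lane interleave that a ymm pack instruction (see
; UNI_COMPUTE below) leaves behind, so the result is stored in linear order.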
%macro UNI_COMPUTE 5
@@ -524,14 +667,14 @@ QPEL_TABLE 12, 4, w, sse4
%else
pminsw %3, [max_pixels_%2]
pmaxsw %3, [zero]
%if %1 > 8
%if (%1 > 8 && notcpuflag(avx)) || %1 > 16
pminsw %4, [max_pixels_%2]
pmaxsw %4, [zero]
%endif
%endif
%endmacro
INIT_XMM sse4 ; adds ff_ and _sse4 to function name
; ******************************
; void put_hevc_mc_pixels(int16_t *dst, ptrdiff_t dststride,
; uint8_t *_src, ptrdiff_t _srcstride,
@@ -539,15 +682,23 @@ INIT_XMM sse4 ; adds ff_ and _sse4 to functio
; ******************************
%macro HEVC_PUT_HEVC_PEL_PIXELS 2
HEVC_PEL_PIXELS %1, %2
HEVC_UNI_PEL_PIXELS %1, %2
HEVC_BI_PEL_PIXELS %1, %2
%endmacro
%macro HEVC_PEL_PIXELS 2
cglobal hevc_put_hevc_pel_pixels%1_%2, 4, 4, 3, dst, src, srcstride,height
pxor m2, m2
.loop
SIMPLE_LOAD %1, %2, srcq, m0
MC_PIXEL_COMPUTE %1, %2
MC_PIXEL_COMPUTE %1, %2, 1
PEL_10STORE%1 dstq, m0, m1
LOOP_END dst, src, srcstride
RET
%endmacro
%macro HEVC_UNI_PEL_PIXELS 2
cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 2, dst, dststride, src, srcstride,height
.loop
SIMPLE_LOAD %1, %2, srcq, m0
@@ -557,15 +708,17 @@ cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 2, dst, dststride, src, srcstri
dec heightd ; cmp height
jnz .loop ; height loop
RET
%endmacro
%macro HEVC_BI_PEL_PIXELS 2
cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstride, src2, height
pxor m2, m2
movdqa m5, [pw_bi_%2]
.loop
SIMPLE_LOAD %1, %2, srcq, m0
SIMPLE_BILOAD %1, src2q, m3, m4
MC_PIXEL_COMPUTE %1, %2
BI_COMPUTE %1, %2, m0, m1, m3, m4, m5
MC_PIXEL_COMPUTE %1, %2, 1
BI_COMPUTE %1, %2, m0, m1, m3, m4, m5, 1
PEL_%2STORE%1 dstq, m0, m1
add dstq, dststrideq ; dst += dststride
add srcq, srcstrideq ; src += srcstride
@@ -573,7 +726,6 @@ cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstrid
dec heightd ; cmp height
jnz .loop ; height loop
RET
%endmacro
@@ -591,7 +743,7 @@ cglobal hevc_put_hevc_epel_h%1_%2, 5, 6, 11, dst, src, srcstride, height, mx, rf
EPEL_FILTER %2, mx, m4, m5
.loop
EPEL_LOAD %2, srcq-%%stride, %%stride, %1
EPEL_COMPUTE %2, %1, m4, m5
EPEL_COMPUTE %2, %1, m4, m5, 1
PEL_10STORE%1 dstq, m0, m1
LOOP_END dst, src, srcstride
RET
@@ -616,9 +768,9 @@ cglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, 11, dst, dststride, src, srcstride,
EPEL_FILTER %2, mx, m4, m5
.loop
EPEL_LOAD %2, srcq-%%stride, %%stride, %1
EPEL_COMPUTE %2, %1, m4, m5
EPEL_COMPUTE %2, %1, m4, m5, 1
SIMPLE_BILOAD %1, src2q, m2, m3
BI_COMPUTE %1, %2, m0, m1, m2, m3, m6
BI_COMPUTE %1, %2, m0, m1, m2, m3, m6, 1
PEL_%2STORE%1 dstq, m0, m1
add dstq, dststrideq ; dst += dststride
add srcq, srcstrideq ; src += srcstride
@@ -640,7 +792,7 @@ cglobal hevc_put_hevc_epel_v%1_%2, 6, 7, 11, dst, src, srcstride, height, r3src,
EPEL_FILTER %2, my, m4, m5
.loop
EPEL_LOAD %2, srcq, srcstride, %1
EPEL_COMPUTE %2, %1, m4, m5
EPEL_COMPUTE %2, %1, m4, m5, 1
PEL_10STORE%1 dstq, m0, m1
LOOP_END dst, src, srcstride
RET
@@ -669,9 +821,9 @@ cglobal hevc_put_hevc_bi_epel_v%1_%2, 8, 9, 11, dst, dststride, src, srcstride,
EPEL_FILTER %2, my, m4, m5
.loop
EPEL_LOAD %2, srcq, srcstride, %1
EPEL_COMPUTE %2, %1, m4, m5
EPEL_COMPUTE %2, %1, m4, m5, 1
SIMPLE_BILOAD %1, src2q, m2, m3
BI_COMPUTE %1, %2, m0, m1, m2, m3, m6
BI_COMPUTE %1, %2, m0, m1, m2, m3, m6, 1
PEL_%2STORE%1 dstq, m0, m1
add dstq, dststrideq ; dst += dststride
add srcq, srcstrideq ; src += srcstride
@@ -695,19 +847,31 @@ cglobal hevc_put_hevc_epel_hv%1_%2, 6, 8, 16 , dst, src, srcstride, height, mx,
EPEL_HV_FILTER %2
EPEL_LOAD %2, srcq-%%stride, %%stride, %1
EPEL_COMPUTE %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
SWAP m8, m1
%endif
SWAP m4, m0
add srcq, srcstrideq
EPEL_LOAD %2, srcq-%%stride, %%stride, %1
EPEL_COMPUTE %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
SWAP m9, m1
%endif
SWAP m5, m0
add srcq, srcstrideq
EPEL_LOAD %2, srcq-%%stride, %%stride, %1
EPEL_COMPUTE %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
SWAP m10, m1
%endif
SWAP m6, m0
add srcq, srcstrideq
.loop
EPEL_LOAD %2, srcq-%%stride, %%stride, %1
EPEL_COMPUTE %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
SWAP m11, m1
%endif
SWAP m7, m0
punpcklwd m0, m4, m5
punpcklwd m2, m6, m7
@@ -716,10 +880,31 @@ cglobal hevc_put_hevc_epel_hv%1_%2, 6, 8, 16 , dst, src, srcstride, height, mx,
punpckhwd m3, m6, m7
%endif
EPEL_COMPUTE 14, %1, m12, m13
%if (%1 > 8 && (%2 == 8))
punpcklwd m4, m8, m9
punpcklwd m2, m10, m11
punpckhwd m8, m8, m9
punpckhwd m3, m10, m11
EPEL_COMPUTE 14, %1, m12, m13, m4, m2, m8, m3
%if cpuflag(avx2)
vinserti128 m2, m0, xm4, 1
vextracti128 xm3, m0, 1
vinserti128 m3, m4, xm3, 0
PEL_10STORE%1 dstq, m2, m3
%else
PEL_10STORE%1 dstq, m0, m4
%endif
%else
PEL_10STORE%1 dstq, m0, m1
%endif
movdqa m4, m5
movdqa m5, m6
movdqa m6, m7
%if (%1 > 8 && (%2 == 8))
mova m8, m9
mova m9, m10
mova m10, m11
%endif
LOOP_END dst, src, srcstride
RET
@@ -729,20 +914,32 @@ cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 9, 16 , dst, dststride, src, srcstrid
EPEL_HV_FILTER %2
EPEL_LOAD %2, srcq-%%stride, %%stride, %1
EPEL_COMPUTE %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
SWAP m8, m1
%endif
SWAP m4, m0
add srcq, srcstrideq
EPEL_LOAD %2, srcq-%%stride, %%stride, %1
EPEL_COMPUTE %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
SWAP m9, m1
%endif
SWAP m5, m0
add srcq, srcstrideq
EPEL_LOAD %2, srcq-%%stride, %%stride, %1
EPEL_COMPUTE %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
SWAP m10, m1
%endif
SWAP m6, m0
add srcq, srcstrideq
.loop
EPEL_LOAD %2, srcq-%%stride, %%stride, %1
EPEL_COMPUTE %2, %1, m14, m15
SWAP m7, m0
%if (%1 > 8 && (%2 == 8))
SWAP m11, m1
%endif
mova m7, m0
punpcklwd m0, m4, m5
punpcklwd m2, m6, m7
%if %1 > 4
@@ -750,37 +947,62 @@ cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 9, 16 , dst, dststride, src, srcstrid
punpckhwd m3, m6, m7
%endif
EPEL_COMPUTE 14, %1, m12, m13
%if (%1 > 8 && (%2 == 8))
punpcklwd m4, m8, m9
punpcklwd m2, m10, m11
punpckhwd m8, m8, m9
punpckhwd m3, m10, m11
EPEL_COMPUTE 14, %1, m12, m13, m4, m2, m8, m3
UNI_COMPUTE %1, %2, m0, m4, [pw_%2]
%else
UNI_COMPUTE %1, %2, m0, m1, [pw_%2]
%endif
PEL_%2STORE%1 dstq, m0, m1
movdqa m4, m5
movdqa m5, m6
movdqa m6, m7
mova m4, m5
mova m5, m6
mova m6, m7
%if (%1 > 8 && (%2 == 8))
mova m8, m9
mova m9, m10
mova m10, m11
%endif
add dstq, dststrideq ; dst += dststride
add srcq, srcstrideq ; src += srcstride
dec heightd ; cmp height
jnz .loop ; height loop
RET
cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter
%assign %%stride ((%2 + 7)/8)
sub srcq, srcstrideq
EPEL_HV_FILTER %2
EPEL_LOAD %2, srcq-%%stride, %%stride, %1
EPEL_COMPUTE %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
SWAP m8, m1
%endif
SWAP m4, m0
add srcq, srcstrideq
EPEL_LOAD %2, srcq-%%stride, %%stride, %1
EPEL_COMPUTE %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
SWAP m9, m1
%endif
SWAP m5, m0
add srcq, srcstrideq
EPEL_LOAD %2, srcq-%%stride, %%stride, %1
EPEL_COMPUTE %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
SWAP m10, m1
%endif
SWAP m6, m0
add srcq, srcstrideq
.loop
EPEL_LOAD %2, srcq-%%stride, %%stride, %1
EPEL_COMPUTE %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
SWAP m11, m1
%endif
SWAP m7, m0
punpcklwd m0, m4, m5
punpcklwd m2, m6, m7
@@ -789,12 +1011,34 @@ cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride
punpckhwd m3, m6, m7
%endif
EPEL_COMPUTE 14, %1, m12, m13
%if (%1 > 8 && (%2 == 8))
punpcklwd m4, m8, m9
punpcklwd m2, m10, m11
punpckhwd m8, m8, m9
punpckhwd m3, m10, m11
EPEL_COMPUTE 14, %1, m12, m13, m4, m2, m8, m3
SIMPLE_BILOAD %1, src2q, m8, m3
%if cpuflag(avx2)
vinserti128 m1, m8, xm3, 1
vextracti128 xm8, m8, 1
vinserti128 m2, m3, xm8, 0
BI_COMPUTE %1, %2, m0, m4, m1, m2, [pw_bi_%2]
%else
BI_COMPUTE %1, %2, m0, m4, m8, m3, [pw_bi_%2]
%endif
%else
SIMPLE_BILOAD %1, src2q, m8, m9
BI_COMPUTE %1, %2, m0, m1, m8, m9, [pw_bi_%2]
PEL_%2STORE%1 dstq, m0, m1
movdqa m4, m5
movdqa m5, m6
movdqa m6, m7
%endif
PEL_%2STORE%1 dstq, m0, m4
mova m4, m5
mova m5, m6
mova m6, m7
%if (%1 > 8 && (%2 == 8))
mova m8, m9
mova m9, m10
mova m10, m11
%endif
add dstq, dststrideq ; dst += dststride
add srcq, srcstrideq ; src += srcstride
add src2q, 2*MAX_PB_SIZE ; src += srcstride
@@ -814,7 +1058,7 @@ cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 16, dst, src, srcstride, height, mx, rf
QPEL_FILTER %2, mx
.loop
QPEL_H_LOAD %2, srcq, %1, 10
QPEL_COMPUTE %1, %2
QPEL_COMPUTE %1, %2, 1
%if %2 > 8
packssdw m0, m1
%endif
@@ -823,7 +1067,7 @@
RET
cglobal hevc_put_hevc_uni_qpel_h%1_%2, 6, 7, 16 , dst, dststride, src, srcstride, height, mx, rfilter
movdqa m9, [pw_%2]
mova m9, [pw_%2]
QPEL_FILTER %2, mx
.loop
QPEL_H_LOAD %2, srcq, %1, 10
@@ -844,12 +1088,12 @@ cglobal hevc_put_hevc_bi_qpel_h%1_%2, 7, 8, 16 , dst, dststride, src, srcstride,
QPEL_FILTER %2, mx
.loop
QPEL_H_LOAD %2, srcq, %1, 10
QPEL_COMPUTE %1, %2
QPEL_COMPUTE %1, %2, 1
%if %2 > 8
packssdw m0, m1
%endif
SIMPLE_BILOAD %1, src2q, m10, m11
BI_COMPUTE %1, %2, m0, m1, m10, m11, m9
BI_COMPUTE %1, %2, m0, m1, m10, m11, m9, 1
PEL_%2STORE%1 dstq, m0, m1
add dstq, dststrideq ; dst += dststride
add srcq, srcstrideq ; src += srcstride
@@ -870,7 +1114,7 @@ cglobal hevc_put_hevc_qpel_v%1_%2, 6, 8, 16, dst, src, srcstride, height, r3src,
QPEL_FILTER %2, my
.loop
QPEL_V_LOAD %2, srcq, srcstride, %1, r7
QPEL_COMPUTE %1, %2
QPEL_COMPUTE %1, %2, 1
%if %2 > 8
packssdw m0, m1
%endif
@@ -901,13 +1145,13 @@ cglobal hevc_put_hevc_bi_qpel_v%1_%2, 8, 10, 16, dst, dststride, src, srcstride,
lea r3srcq, [srcstrideq*3]
QPEL_FILTER %2, my
.loop
SIMPLE_BILOAD %1, src2q, m10, m11
QPEL_V_LOAD %2, srcq, srcstride, %1, r9
QPEL_COMPUTE %1, %2
QPEL_COMPUTE %1, %2, 1
%if %2 > 8
packssdw m0, m1
%endif
BI_COMPUTE %1, %2, m0, m1, m10, m11, m9
SIMPLE_BILOAD %1, src2q, m10, m11
BI_COMPUTE %1, %2, m0, m1, m10, m11, m9, 1
PEL_%2STORE%1 dstq, m0, m1
add dstq, dststrideq ; dst += dststride
add srcq, srcstrideq ; src += srcstride
@@ -925,8 +1169,15 @@ cglobal hevc_put_hevc_bi_qpel_v%1_%2, 8, 10, 16, dst, dststride, src, srcstride,
; ******************************
%macro HEVC_PUT_HEVC_QPEL_HV 2
cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 8, 16, dst, src, srcstride, height, mx, my, r3src, rfilter
lea mxq, [mxq*8-8]
lea myq, [myq*8-8]
%if cpuflag(avx2)
%assign %%shift 4
%else
%assign %%shift 3
%endif
sub mxq, 1
sub myq, 1
shl mxq, %%shift ; multiply by 8 (sse4) or 16 (avx2)
shl myq, %%shift ; multiply by 8 (sse4) or 16 (avx2)
lea r3srcq, [srcstrideq*3]
sub srcq, r3srcq
QPEL_H_LOAD %2, srcq, %1, 15
@@ -994,8 +1245,15 @@ cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 8, 16, dst, src, srcstride, height, mx, m
RET
cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 16 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
lea mxq, [mxq*8-8]
lea myq, [myq*8-8]
%if cpuflag(avx2)
%assign %%shift 4
%else
%assign %%shift 3
%endif
sub mxq, 1
sub myq, 1
shl mxq, %%shift ; multiply by 8 (sse4) or 16 (avx2)
shl myq, %%shift ; multiply by 8 (sse4) or 16 (avx2)
lea r3srcq, [srcstrideq*3]
sub srcq, r3srcq
QPEL_H_LOAD %2, srcq, %1, 15
@@ -1053,13 +1311,13 @@ cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 16 , dst, dststride, src, srcstrid
movq m13, m14
movq m14, m15
%else
movdqa m8, m9
movdqa m9, m10
movdqa m10, m11
movdqa m11, m12
movdqa m12, m13
movdqa m13, m14
movdqa m14, m15
mova m8, m9
mova m9, m10
mova m10, m11
mova m11, m12
mova m12, m13
mova m13, m14
mova m14, m15
%endif
add dstq, dststrideq ; dst += dststride
add srcq, srcstrideq ; src += srcstride
@@ -1068,8 +1326,15 @@ cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 16 , dst, dststride, src, srcstrid
RET
cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter
lea mxq, [mxq*8-8]
lea myq, [myq*8-8]
%if cpuflag(avx2)
%assign %%shift 4
%else
%assign %%shift 3
%endif
sub mxq, 1
sub myq, 1
shl mxq, %%shift ; multiply by 8 (sse4) or 16 (avx2)
shl myq, %%shift ; multiply by 8 (sse4) or 16 (avx2)
lea r3srcq, [srcstrideq*3]
sub srcq, r3srcq
QPEL_H_LOAD %2, srcq, %1, 15
@@ -1286,6 +1551,8 @@ cglobal hevc_put_hevc_bi_w%1_%2, 5, 7, 10, dst, dststride, src, srcstride, src2,
RET
%endmacro
INIT_XMM sse4 ; adds ff_ and _sse4 to function name
WEIGHTING_FUNCS 2, 8
WEIGHTING_FUNCS 4, 8
WEIGHTING_FUNCS 6, 8
@@ -1340,6 +1607,7 @@ HEVC_PUT_HEVC_EPEL_HV 2, 8
HEVC_PUT_HEVC_EPEL_HV 4, 8
HEVC_PUT_HEVC_EPEL_HV 6, 8
HEVC_PUT_HEVC_EPEL_HV 8, 8
HEVC_PUT_HEVC_EPEL_HV 16, 8
HEVC_PUT_HEVC_EPEL_HV 2, 10
HEVC_PUT_HEVC_EPEL_HV 4, 10
@@ -1377,4 +1645,23 @@ HEVC_PUT_HEVC_QPEL_HV 4, 12
HEVC_PUT_HEVC_QPEL_HV 6, 12
HEVC_PUT_HEVC_QPEL_HV 8, 12
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2 ; adds ff_ and _avx2 to function name & enables 256b registers : m0 for 256b, xm0 for 128b. cpuflag(avx2) = 1 / notcpuflag(avx) = 0
HEVC_PUT_HEVC_PEL_PIXELS 32, 8
HEVC_PUT_HEVC_PEL_PIXELS 16, 10
HEVC_PUT_HEVC_EPEL 32, 8
HEVC_PUT_HEVC_EPEL 16, 10
HEVC_PUT_HEVC_EPEL_HV 16, 10
HEVC_PUT_HEVC_EPEL_HV 32, 8
HEVC_PUT_HEVC_QPEL 32, 8
HEVC_PUT_HEVC_QPEL 16, 10
HEVC_PUT_HEVC_QPEL_HV 16, 10
%endif ;AVX2
%endif ; ARCH_X86_64
@@ -96,6 +96,40 @@ void ff_hevc_put_hevc_bi_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dstst
EPEL_PROTOTYPES(pel_pixels , 8, sse4);
EPEL_PROTOTYPES(pel_pixels , 10, sse4);
EPEL_PROTOTYPES(pel_pixels , 12, sse4);
void ff_hevc_put_hevc_pel_pixels16_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_pel_pixels24_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_pel_pixels32_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_pel_pixels48_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_pel_pixels64_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_pel_pixels16_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_pel_pixels24_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_pel_pixels32_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_pel_pixels48_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_pel_pixels64_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_uni_pel_pixels32_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_uni_pel_pixels48_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_uni_pel_pixels64_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
void ff_hevc_put_hevc_uni_pel_pixels96_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); //used for 10bit
void ff_hevc_put_hevc_uni_pel_pixels128_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);//used for 10bit
void ff_hevc_put_hevc_bi_pel_pixels16_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels24_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels32_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels48_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels64_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels16_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels24_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels32_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels48_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_hevc_bi_pel_pixels64_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
///////////////////////////////////////////////////////////////////////////////
// EPEL
///////////////////////////////////////////////////////////////////////////////
@@ -111,6 +145,42 @@ EPEL_PROTOTYPES(epel_hv , 8, sse4);
EPEL_PROTOTYPES(epel_hv , 10, sse4);
EPEL_PROTOTYPES(epel_hv , 12, sse4);
PEL_PROTOTYPE(epel_h16, 8, avx2);
PEL_PROTOTYPE(epel_h24, 8, avx2);
PEL_PROTOTYPE(epel_h32, 8, avx2);
PEL_PROTOTYPE(epel_h48, 8, avx2);
PEL_PROTOTYPE(epel_h64, 8, avx2);
PEL_PROTOTYPE(epel_h16,10, avx2);
PEL_PROTOTYPE(epel_h24,10, avx2);
PEL_PROTOTYPE(epel_h32,10, avx2);
PEL_PROTOTYPE(epel_h48,10, avx2);
PEL_PROTOTYPE(epel_h64,10, avx2);
PEL_PROTOTYPE(epel_v16, 8, avx2);
PEL_PROTOTYPE(epel_v24, 8, avx2);
PEL_PROTOTYPE(epel_v32, 8, avx2);
PEL_PROTOTYPE(epel_v48, 8, avx2);
PEL_PROTOTYPE(epel_v64, 8, avx2);
PEL_PROTOTYPE(epel_v16,10, avx2);
PEL_PROTOTYPE(epel_v24,10, avx2);
PEL_PROTOTYPE(epel_v32,10, avx2);
PEL_PROTOTYPE(epel_v48,10, avx2);
PEL_PROTOTYPE(epel_v64,10, avx2);
PEL_PROTOTYPE(epel_hv16, 8, avx2);
PEL_PROTOTYPE(epel_hv24, 8, avx2);
PEL_PROTOTYPE(epel_hv32, 8, avx2);
PEL_PROTOTYPE(epel_hv48, 8, avx2);
PEL_PROTOTYPE(epel_hv64, 8, avx2);
PEL_PROTOTYPE(epel_hv16,10, avx2);
PEL_PROTOTYPE(epel_hv24,10, avx2);
PEL_PROTOTYPE(epel_hv32,10, avx2);
PEL_PROTOTYPE(epel_hv48,10, avx2);
PEL_PROTOTYPE(epel_hv64,10, avx2);
///////////////////////////////////////////////////////////////////////////////
// QPEL
///////////////////////////////////////////////////////////////////////////////
@@ -126,6 +196,41 @@ QPEL_PROTOTYPES(qpel_hv, 8, sse4);
QPEL_PROTOTYPES(qpel_hv, 10, sse4);
QPEL_PROTOTYPES(qpel_hv, 12, sse4);
PEL_PROTOTYPE(qpel_h16, 8, avx2);
PEL_PROTOTYPE(qpel_h24, 8, avx2);
PEL_PROTOTYPE(qpel_h32, 8, avx2);
PEL_PROTOTYPE(qpel_h48, 8, avx2);
PEL_PROTOTYPE(qpel_h64, 8, avx2);
PEL_PROTOTYPE(qpel_h16,10, avx2);
PEL_PROTOTYPE(qpel_h24,10, avx2);
PEL_PROTOTYPE(qpel_h32,10, avx2);
PEL_PROTOTYPE(qpel_h48,10, avx2);
PEL_PROTOTYPE(qpel_h64,10, avx2);
PEL_PROTOTYPE(qpel_v16, 8, avx2);
PEL_PROTOTYPE(qpel_v24, 8, avx2);
PEL_PROTOTYPE(qpel_v32, 8, avx2);
PEL_PROTOTYPE(qpel_v48, 8, avx2);
PEL_PROTOTYPE(qpel_v64, 8, avx2);
PEL_PROTOTYPE(qpel_v16,10, avx2);
PEL_PROTOTYPE(qpel_v24,10, avx2);
PEL_PROTOTYPE(qpel_v32,10, avx2);
PEL_PROTOTYPE(qpel_v48,10, avx2);
PEL_PROTOTYPE(qpel_v64,10, avx2);
PEL_PROTOTYPE(qpel_hv16, 8, avx2);
PEL_PROTOTYPE(qpel_hv24, 8, avx2);
PEL_PROTOTYPE(qpel_hv32, 8, avx2);
PEL_PROTOTYPE(qpel_hv48, 8, avx2);
PEL_PROTOTYPE(qpel_hv64, 8, avx2);
PEL_PROTOTYPE(qpel_hv16,10, avx2);
PEL_PROTOTYPE(qpel_hv24,10, avx2);
PEL_PROTOTYPE(qpel_hv32,10, avx2);
PEL_PROTOTYPE(qpel_hv48,10, avx2);
PEL_PROTOTYPE(qpel_hv64,10, avx2);
WEIGHTING_PROTOTYPES(8, sse4);
WEIGHTING_PROTOTYPES(10, sse4);
@@ -165,6 +165,149 @@ void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dsts
#if ARCH_X86_64 && HAVE_SSE4_EXTERNAL
#define mc_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
void ff_hevc_put_hevc_##name##width1##_10_##opt1(int16_t *dst, uint8_t *src, ptrdiff_t _srcstride, \
int height, intptr_t mx, intptr_t my, int width) \
\
{ \
ff_hevc_put_hevc_##name##width2##_10_##opt1(dst, src, _srcstride, height, mx, my, width); \
ff_hevc_put_hevc_##name##width3##_10_##opt2(dst+ width2, src+ width4, _srcstride, height, mx, my, width); \
}
#define mc_bi_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
void ff_hevc_put_hevc_bi_##name##width1##_10_##opt1(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
ptrdiff_t _srcstride, int16_t *src2, \
int height, intptr_t mx, intptr_t my, int width) \
{ \
ff_hevc_put_hevc_bi_##name##width2##_10_##opt1(dst, dststride, src, _srcstride, src2, \
height, mx, my, width); \
ff_hevc_put_hevc_bi_##name##width3##_10_##opt2(dst+width4, dststride, src+width4, _srcstride, src2+width2,\
height, mx, my, width); \
}
#define mc_uni_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
void ff_hevc_put_hevc_uni_##name##width1##_10_##opt1(uint8_t *dst, ptrdiff_t dststride, \
uint8_t *src, ptrdiff_t _srcstride, int height, \
intptr_t mx, intptr_t my, int width) \
{ \
ff_hevc_put_hevc_uni_##name##width2##_10_##opt1(dst, dststride, src, _srcstride, \
height, mx, my, width); \
ff_hevc_put_hevc_uni_##name##width3##_10_##opt2(dst+width4, dststride, src+width4, _srcstride, \
height, mx, my, width); \
}
#define mc_rep_mixs_10(name, width1, width2, width3, opt1, opt2, width4) \
mc_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4); \
mc_bi_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4); \
mc_uni_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4)
#define mc_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
void ff_hevc_put_hevc_##name##width1##_8_##opt1(int16_t *dst, uint8_t *src, ptrdiff_t _srcstride, \
int height, intptr_t mx, intptr_t my, int width) \
\
{ \
ff_hevc_put_hevc_##name##width2##_8_##opt1(dst, src, _srcstride, height, mx, my, width); \
ff_hevc_put_hevc_##name##width3##_8_##opt2(dst+ width2, src+ width2, _srcstride, height, mx, my, width); \
}
#define mc_bi_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
void ff_hevc_put_hevc_bi_##name##width1##_8_##opt1(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
ptrdiff_t _srcstride, int16_t* src2, \
int height, intptr_t mx, intptr_t my, int width) \
{ \
ff_hevc_put_hevc_bi_##name##width2##_8_##opt1(dst, dststride, src, _srcstride, \
src2, height, mx, my, width); \
ff_hevc_put_hevc_bi_##name##width3##_8_##opt2(dst+width2, dststride, src+width2, _srcstride, \
src2+width2, height, mx, my, width); \
}
#define mc_uni_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
void ff_hevc_put_hevc_uni_##name##width1##_8_##opt1(uint8_t *dst, ptrdiff_t dststride, \
uint8_t *src, ptrdiff_t _srcstride, int height, \
intptr_t mx, intptr_t my, int width) \
{ \
ff_hevc_put_hevc_uni_##name##width2##_8_##opt1(dst, dststride, src, _srcstride, \
height, mx, my, width); \
ff_hevc_put_hevc_uni_##name##width3##_8_##opt2(dst+width2, dststride, src+width2, _srcstride, \
height, mx, my, width); \
}
#define mc_rep_mixs_8(name, width1, width2, width3, opt1, opt2) \
mc_rep_mix_8(name, width1, width2, width3, opt1, opt2); \
mc_bi_rep_mix_8(name, width1, width2, width3, opt1, opt2); \
mc_uni_rep_mix_8(name, width1, width2, width3, opt1, opt2)
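/* As an illustration, mc_rep_mixs_8(pel_pixels, 48, 32, 16, avx2, sse4) below
 * expands (among others) to:
 *
 * void ff_hevc_put_hevc_pel_pixels48_8_avx2(int16_t *dst, uint8_t *src, ptrdiff_t _srcstride,
 *                                           int height, intptr_t mx, intptr_t my, int width)
 * {
 *     ff_hevc_put_hevc_pel_pixels32_8_avx2(dst, src, _srcstride, height, mx, my, width);
 *     ff_hevc_put_hevc_pel_pixels16_8_sse4(dst + 32, src + 32, _srcstride, height, mx, my, width);
 * }
 *
 * i.e. a 48-wide block is one 32-wide avx2 call plus one 16-wide sse4 call at
 * offset 32. */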
#if HAVE_AVX2_EXTERNAL
mc_rep_mixs_8(pel_pixels, 48, 32, 16, avx2, sse4);
mc_rep_mixs_8(epel_hv, 48, 32, 16, avx2, sse4);
mc_rep_mixs_8(epel_h , 48, 32, 16, avx2, sse4);
mc_rep_mixs_8(epel_v , 48, 32, 16, avx2, sse4);
mc_rep_mix_10(pel_pixels, 24, 16, 8, avx2, sse4, 32);
mc_bi_rep_mix_10(pel_pixels,24, 16, 8, avx2, sse4, 32);
mc_rep_mixs_10(epel_hv, 24, 16, 8, avx2, sse4, 32);
mc_rep_mixs_10(epel_h , 24, 16, 8, avx2, sse4, 32);
mc_rep_mixs_10(epel_v , 24, 16, 8, avx2, sse4, 32);
mc_rep_mixs_10(qpel_h , 24, 16, 8, avx2, sse4, 32);
mc_rep_mixs_10(qpel_v , 24, 16, 8, avx2, sse4, 32);
mc_rep_mixs_10(qpel_hv, 24, 16, 8, avx2, sse4, 32);
mc_rep_uni_func(pel_pixels, 8, 64, 128, avx2);//used for 10bit
mc_rep_uni_func(pel_pixels, 8, 32, 96, avx2); //used for 10bit
mc_rep_funcs(pel_pixels, 8, 32, 64, avx2);
mc_rep_func(pel_pixels, 10, 16, 32, avx2);
mc_rep_func(pel_pixels, 10, 16, 48, avx2);
mc_rep_func(pel_pixels, 10, 32, 64, avx2);
mc_rep_bi_func(pel_pixels, 10, 16, 32, avx2);
mc_rep_bi_func(pel_pixels, 10, 16, 48, avx2);
mc_rep_bi_func(pel_pixels, 10, 32, 64, avx2);
mc_rep_funcs(epel_h, 8, 32, 64, avx2);
mc_rep_funcs(epel_v, 8, 32, 64, avx2);
mc_rep_funcs(epel_h, 10, 16, 32, avx2);
mc_rep_funcs(epel_h, 10, 16, 48, avx2);
mc_rep_funcs(epel_h, 10, 32, 64, avx2);
mc_rep_funcs(epel_v, 10, 16, 32, avx2);
mc_rep_funcs(epel_v, 10, 16, 48, avx2);
mc_rep_funcs(epel_v, 10, 32, 64, avx2);
mc_rep_funcs(epel_hv, 8, 32, 64, avx2);
mc_rep_funcs(epel_hv, 10, 16, 32, avx2);
mc_rep_funcs(epel_hv, 10, 16, 48, avx2);
mc_rep_funcs(epel_hv, 10, 32, 64, avx2);
mc_rep_funcs(qpel_h, 8, 32, 64, avx2);
mc_rep_mixs_8(qpel_h , 48, 32, 16, avx2, sse4);
mc_rep_funcs(qpel_v, 8, 32, 64, avx2);
mc_rep_mixs_8(qpel_v, 48, 32, 16, avx2, sse4);
mc_rep_funcs(qpel_h, 10, 16, 32, avx2);
mc_rep_funcs(qpel_h, 10, 16, 48, avx2);
mc_rep_funcs(qpel_h, 10, 32, 64, avx2);
mc_rep_funcs(qpel_v, 10, 16, 32, avx2);
mc_rep_funcs(qpel_v, 10, 16, 48, avx2);
mc_rep_funcs(qpel_v, 10, 32, 64, avx2);
mc_rep_funcs(qpel_hv, 10, 16, 32, avx2);
mc_rep_funcs(qpel_hv, 10, 16, 48, avx2);
mc_rep_funcs(qpel_hv, 10, 32, 64, avx2);
#endif //AVX2
mc_rep_funcs(pel_pixels, 8, 16, 64, sse4);
mc_rep_funcs(pel_pixels, 8, 16, 48, sse4);
mc_rep_funcs(pel_pixels, 8, 16, 32, sse4);
@@ -218,7 +361,6 @@ mc_rep_funcs(epel_hv, 8, 8, 64, sse4);
mc_rep_funcs(epel_hv, 8, 8, 48, sse4);
mc_rep_funcs(epel_hv, 8, 8, 32, sse4);
mc_rep_funcs(epel_hv, 8, 8, 24, sse4);
mc_rep_funcs(epel_hv, 8, 8, 16, sse4);
mc_rep_funcs2(epel_hv,8, 8, 4, 12, sse4);
mc_rep_funcs(epel_hv,10, 8, 64, sse4);
mc_rep_funcs(epel_hv,10, 8, 48, sse4);
@@ -619,6 +761,89 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
c->idct_dc[3] = ff_hevc_idct32x32_dc_8_avx2;
if (ARCH_X86_64) {
SAO_BAND_INIT(8, avx2);
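/* The put_hevc_*[idx][v][h] tables are indexed by block-width index
 * ([5]..[9] -> 16/24/32/48/64 pixels here) and by whether a vertical and/or
 * horizontal filter applies; [..][0][0] is the plain pel_pixels copy. */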
c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_8_avx2;
c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_8_avx2;
c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_8_avx2;
c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_8_avx2;
c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_8_avx2;
c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_8_avx2;
c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_8_avx2;
c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_8_avx2;
c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_8_avx2;
c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_8_avx2;
c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_8_avx2;
c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_8_avx2;
c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_8_avx2;
c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_8_avx2;
c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_8_avx2;
c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_8_avx2;
c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_8_avx2;
c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_8_avx2;
c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_8_avx2;
c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_8_avx2;
c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_8_avx2;
c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_avx2;
c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_8_avx2;
c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_8_avx2;
c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_8_avx2;
c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_8_avx2;
c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_8_avx2;
c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_8_avx2;
c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_8_avx2;
c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_8_avx2;
c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_8_avx2;
c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_8_avx2;
c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_8_avx2;
c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_8_avx2;
c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_8_avx2;
c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_8_avx2;
c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_8_avx2;
c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_8_avx2;
c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_8_avx2;
c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_8_avx2;
c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_8_avx2;
c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_8_avx2;
c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_8_avx2;
c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_8_avx2;
c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_8_avx2;
}
c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_8_avx2;
c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_8_avx2;
@@ -685,6 +910,149 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_10_avx2;
c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_10_avx2;
c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_10_avx2;
c->put_hevc_epel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
c->put_hevc_epel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
c->put_hevc_qpel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
c->put_hevc_qpel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
c->put_hevc_epel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
c->put_hevc_epel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
c->put_hevc_qpel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
c->put_hevc_qpel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
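/* A pel copy is bit-depth agnostic, so the 10-bit uni path above reuses the
 * 8-bit avx2 functions with the width doubled in bytes (e.g. 48 10-bit pixels
 * = 96 bytes), which is what the 96/128-wide prototypes are for. */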
c->put_hevc_epel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
c->put_hevc_epel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
c->put_hevc_qpel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
c->put_hevc_qpel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
c->put_hevc_epel[5][0][1] = ff_hevc_put_hevc_epel_h16_10_avx2;
c->put_hevc_epel[6][0][1] = ff_hevc_put_hevc_epel_h24_10_avx2;
c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_10_avx2;
c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_10_avx2;
c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_10_avx2;
c->put_hevc_epel_uni[5][0][1] = ff_hevc_put_hevc_uni_epel_h16_10_avx2;
c->put_hevc_epel_uni[6][0][1] = ff_hevc_put_hevc_uni_epel_h24_10_avx2;
c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_10_avx2;
c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_10_avx2;
c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_10_avx2;
c->put_hevc_epel_bi[5][0][1] = ff_hevc_put_hevc_bi_epel_h16_10_avx2;
c->put_hevc_epel_bi[6][0][1] = ff_hevc_put_hevc_bi_epel_h24_10_avx2;
c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_10_avx2;
c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_10_avx2;
c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_10_avx2;
c->put_hevc_epel[5][1][0] = ff_hevc_put_hevc_epel_v16_10_avx2;
c->put_hevc_epel[6][1][0] = ff_hevc_put_hevc_epel_v24_10_avx2;
c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_10_avx2;
c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_10_avx2;
c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_10_avx2;
c->put_hevc_epel_uni[5][1][0] = ff_hevc_put_hevc_uni_epel_v16_10_avx2;
c->put_hevc_epel_uni[6][1][0] = ff_hevc_put_hevc_uni_epel_v24_10_avx2;
c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_10_avx2;
c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_10_avx2;
c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_10_avx2;
c->put_hevc_epel_bi[5][1][0] = ff_hevc_put_hevc_bi_epel_v16_10_avx2;
c->put_hevc_epel_bi[6][1][0] = ff_hevc_put_hevc_bi_epel_v24_10_avx2;
c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_10_avx2;
c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_10_avx2;
c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_10_avx2;
c->put_hevc_epel[5][1][1] = ff_hevc_put_hevc_epel_hv16_10_avx2;
c->put_hevc_epel[6][1][1] = ff_hevc_put_hevc_epel_hv24_10_avx2;
c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_10_avx2;
c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_10_avx2;
c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_10_avx2;
c->put_hevc_epel_uni[5][1][1] = ff_hevc_put_hevc_uni_epel_hv16_10_avx2;
c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_10_avx2;
c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_10_avx2;
c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_10_avx2;
c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_10_avx2;
c->put_hevc_epel_bi[5][1][1] = ff_hevc_put_hevc_bi_epel_hv16_10_avx2;
c->put_hevc_epel_bi[6][1][1] = ff_hevc_put_hevc_bi_epel_hv24_10_avx2;
c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_10_avx2;
c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_10_avx2;
c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_10_avx2;
c->put_hevc_qpel[5][0][1] = ff_hevc_put_hevc_qpel_h16_10_avx2;
c->put_hevc_qpel[6][0][1] = ff_hevc_put_hevc_qpel_h24_10_avx2;
c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_10_avx2;
c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_10_avx2;
c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_10_avx2;
c->put_hevc_qpel_uni[5][0][1] = ff_hevc_put_hevc_uni_qpel_h16_10_avx2;
c->put_hevc_qpel_uni[6][0][1] = ff_hevc_put_hevc_uni_qpel_h24_10_avx2;
c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_10_avx2;
c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_10_avx2;
c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_10_avx2;
c->put_hevc_qpel_bi[5][0][1] = ff_hevc_put_hevc_bi_qpel_h16_10_avx2;
c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_hevc_bi_qpel_h24_10_avx2;
c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_10_avx2;
c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_10_avx2;
c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_10_avx2;
c->put_hevc_qpel[5][1][0] = ff_hevc_put_hevc_qpel_v16_10_avx2;
c->put_hevc_qpel[6][1][0] = ff_hevc_put_hevc_qpel_v24_10_avx2;
c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_10_avx2;
c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_10_avx2;
c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_10_avx2;
c->put_hevc_qpel_uni[5][1][0] = ff_hevc_put_hevc_uni_qpel_v16_10_avx2;
c->put_hevc_qpel_uni[6][1][0] = ff_hevc_put_hevc_uni_qpel_v24_10_avx2;
c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_10_avx2;
c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_10_avx2;
c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_10_avx2;
c->put_hevc_qpel_bi[5][1][0] = ff_hevc_put_hevc_bi_qpel_v16_10_avx2;
c->put_hevc_qpel_bi[6][1][0] = ff_hevc_put_hevc_bi_qpel_v24_10_avx2;
c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_10_avx2;
c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_10_avx2;
c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_10_avx2;
c->put_hevc_qpel[5][1][1] = ff_hevc_put_hevc_qpel_hv16_10_avx2;
c->put_hevc_qpel[6][1][1] = ff_hevc_put_hevc_qpel_hv24_10_avx2;
c->put_hevc_qpel[7][1][1] = ff_hevc_put_hevc_qpel_hv32_10_avx2;
c->put_hevc_qpel[8][1][1] = ff_hevc_put_hevc_qpel_hv48_10_avx2;
c->put_hevc_qpel[9][1][1] = ff_hevc_put_hevc_qpel_hv64_10_avx2;
c->put_hevc_qpel_uni[5][1][1] = ff_hevc_put_hevc_uni_qpel_hv16_10_avx2;
c->put_hevc_qpel_uni[6][1][1] = ff_hevc_put_hevc_uni_qpel_hv24_10_avx2;
c->put_hevc_qpel_uni[7][1][1] = ff_hevc_put_hevc_uni_qpel_hv32_10_avx2;
c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_10_avx2;
c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_10_avx2;
c->put_hevc_qpel_bi[5][1][1] = ff_hevc_put_hevc_bi_qpel_hv16_10_avx2;
c->put_hevc_qpel_bi[6][1][1] = ff_hevc_put_hevc_bi_qpel_hv24_10_avx2;
c->put_hevc_qpel_bi[7][1][1] = ff_hevc_put_hevc_bi_qpel_hv32_10_avx2;
c->put_hevc_qpel_bi[8][1][1] = ff_hevc_put_hevc_bi_qpel_hv48_10_avx2;
c->put_hevc_qpel_bi[9][1][1] = ff_hevc_put_hevc_bi_qpel_hv64_10_avx2;
}
c->transform_add[2] = ff_hevc_transform_add16_10_avx2;