Commit 406fbd24, authored by Daniel Kang, committed by Ronald S. Bultje

H.264: Add optimizations to predict x86 assembly.

Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
parent 505345ed
...@@ -2611,12 +2611,11 @@ cglobal pred4x4_down_left_mmxext, 3,3 ...@@ -2611,12 +2611,11 @@ cglobal pred4x4_down_left_mmxext, 3,3
punpckldq m1, [r1] punpckldq m1, [r1]
movq m2, m1 movq m2, m1
movq m3, m1 movq m3, m1
movq m4, m1
psllq m1, 8 psllq m1, 8
pxor m2, m1 pxor m2, m1
psrlq m2, 8 psrlq m2, 8
pxor m3, m2 pxor m2, m3
PRED4x4_LOWPASS m0, m1, m3, m4, m5 PRED4x4_LOWPASS m0, m1, m2, m3, m4
lea r1, [r0+r2*2] lea r1, [r0+r2*2]
psrlq m0, 8 psrlq m0, 8
movd [r0+r2*1], m0 movd [r0+r2*1], m0
......
...@@ -27,8 +27,6 @@ ...@@ -27,8 +27,6 @@
SECTION_RODATA SECTION_RODATA
SECTION .text
cextern pw_16 cextern pw_16
cextern pw_8 cextern pw_8
cextern pw_4 cextern pw_4
...@@ -42,6 +40,8 @@ pw_512: times 8 dw 512 ...@@ -42,6 +40,8 @@ pw_512: times 8 dw 512
pd_17: times 4 dd 17 pd_17: times 4 dd 17
pd_16: times 4 dd 16 pd_16: times 4 dd 16
SECTION .text
; dest, left, right, src ; dest, left, right, src
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
%macro PRED4x4_LOWPASS 4 %macro PRED4x4_LOWPASS 4
...@@ -64,13 +64,11 @@ cglobal pred4x4_down_right_10_%1, 3,3 ...@@ -64,13 +64,11 @@ cglobal pred4x4_down_right_10_%1, 3,3
movq m3, [r0] movq m3, [r0]
punpckhdq m1, m2 punpckhdq m1, m2
PALIGNR m3, m1, 10, m1 PALIGNR m3, m1, 10, m1
mova m1, m3
movhps m4, [r1+r2*1-8] movhps m4, [r1+r2*1-8]
PALIGNR m3, m4, 14, m4 PALIGNR m0, m3, m4, 14, m4
mova m2, m3
movhps m4, [r1+r2*2-8] movhps m4, [r1+r2*2-8]
PALIGNR m3, m4, 14, m4 PALIGNR m2, m0, m4, 14, m4
PRED4x4_LOWPASS m0, m3, m1, m2 PRED4x4_LOWPASS m0, m2, m3, m0
movq [r1+r2*2], m0 movq [r1+r2*2], m0
psrldq m0, 2 psrldq m0, 2
movq [r1+r2*1], m0 movq [r1+r2*1], m0
...@@ -104,22 +102,20 @@ cglobal pred4x4_vertical_right_10_%1, 3,3,6 ...@@ -104,22 +102,20 @@ cglobal pred4x4_vertical_right_10_%1, 3,3,6
pavgw m5, m0 pavgw m5, m0
movhps m1, [r0+r2*1-8] movhps m1, [r0+r2*1-8]
PALIGNR m0, m1, 14, m1 ; ....t3t2t1t0ltl0 PALIGNR m0, m1, 14, m1 ; ....t3t2t1t0ltl0
mova m1, m0
movhps m2, [r0+r2*2-8] movhps m2, [r0+r2*2-8]
PALIGNR m0, m2, 14, m2 ; ..t3t2t1t0ltl0l1 PALIGNR m1, m0, m2, 14, m2 ; ..t3t2t1t0ltl0l1
mova m2, m0
movhps m3, [r1+r2*1-8] movhps m3, [r1+r2*1-8]
PALIGNR m0, m3, 14, m3 ; t3t2t1t0ltl0l1l2 PALIGNR m2, m1, m3, 14, m3 ; t3t2t1t0ltl0l1l2
PRED4x4_LOWPASS m3, m1, m0, m2 PRED4x4_LOWPASS m1, m0, m2, m1
pslldq m1, m3, 12 pslldq m0, m1, 12
psrldq m3, 4 psrldq m1, 4
movq [r0+r2*1], m5 movq [r0+r2*1], m5
movq [r0+r2*2], m3 movq [r0+r2*2], m1
PALIGNR m5, m1, 14, m2 PALIGNR m5, m0, 14, m2
pslldq m1, 2 pslldq m0, 2
movq [r1+r2*1], m5 movq [r1+r2*1], m5
PALIGNR m3, m1, 14, m1 PALIGNR m1, m0, 14, m0
movq [r1+r2*2], m3 movq [r1+r2*2], m1
RET RET
%endmacro %endmacro
...@@ -152,9 +148,9 @@ cglobal pred4x4_horizontal_down_10_%1, 3,3 ...@@ -152,9 +148,9 @@ cglobal pred4x4_horizontal_down_10_%1, 3,3
punpckhdq m1, m2 ; l0 l1 l2 l3 punpckhdq m1, m2 ; l0 l1 l2 l3
punpckhqdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3 punpckhqdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
psrldq m0, m1, 4 ; .. .. t2 t1 t0 lt l0 l1 psrldq m0, m1, 4 ; .. .. t2 t1 t0 lt l0 l1
psrldq m2, m1, 2 ; .. t2 t1 t0 lt l0 l1 l2 psrldq m3, m1, 2 ; .. t2 t1 t0 lt l0 l1 l2
pavgw m5, m1, m2 pavgw m5, m1, m3
PRED4x4_LOWPASS m3, m1, m0, m2 PRED4x4_LOWPASS m3, m1, m0, m3
punpcklwd m5, m3 punpcklwd m5, m3
psrldq m3, 8 psrldq m3, 8
PALIGNR m3, m5, 12, m4 PALIGNR m3, m5, 12, m4
...@@ -220,17 +216,15 @@ cglobal pred4x4_dc_10_mmxext, 3,3 ...@@ -220,17 +216,15 @@ cglobal pred4x4_dc_10_mmxext, 3,3
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void pred4x4_down_left(pixel *src, const pixel *topright, int stride) ; void pred4x4_down_left(pixel *src, const pixel *topright, int stride)
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
;TODO: more AVX here
%macro PRED4x4_DL 1 %macro PRED4x4_DL 1
cglobal pred4x4_down_left_10_%1, 3,3 cglobal pred4x4_down_left_10_%1, 3,3
sub r0, r2 sub r0, r2
movq m1, [r0] movq m0, [r0]
movhps m1, [r1] movhps m0, [r1]
pslldq m5, m1, 2 psrldq m2, m0, 2
pxor m2, m5, m1 pslldq m3, m0, 2
psrldq m2, 2 pshufhw m2, m2, 10100100b
pxor m3, m1, m2 PRED4x4_LOWPASS m0, m3, m2, m0
PRED4x4_LOWPASS m0, m5, m3, m1
lea r1, [r0+r2*2] lea r1, [r0+r2*2]
movhps [r1+r2*2], m0 movhps [r1+r2*2], m0
psrldq m0, 2 psrldq m0, 2
...@@ -257,10 +251,10 @@ cglobal pred4x4_vertical_left_10_%1, 3,3 ...@@ -257,10 +251,10 @@ cglobal pred4x4_vertical_left_10_%1, 3,3
sub r0, r2 sub r0, r2
movu m1, [r0] movu m1, [r0]
movhps m1, [r1] movhps m1, [r1]
psrldq m3, m1, 2 psrldq m0, m1, 2
psrldq m2, m1, 4 psrldq m2, m1, 4
pavgw m4, m3, m1 pavgw m4, m0, m1
PRED4x4_LOWPASS m0, m1, m2, m3 PRED4x4_LOWPASS m0, m1, m2, m0
lea r1, [r0+r2*2] lea r1, [r0+r2*2]
movq [r0+r2*1], m4 movq [r0+r2*1], m4
movq [r0+r2*2], m0 movq [r0+r2*2], m0
...@@ -298,13 +292,13 @@ cglobal pred4x4_horizontal_up_10_mmxext, 3,3 ...@@ -298,13 +292,13 @@ cglobal pred4x4_horizontal_up_10_mmxext, 3,3
pavgw m2, m0 pavgw m2, m0
pshufw m5, m0, 11111110b pshufw m5, m0, 11111110b
PRED4x4_LOWPASS m3, m0, m5, m1 PRED4x4_LOWPASS m1, m0, m5, m1
movq m6, m2 movq m6, m2
punpcklwd m6, m3 punpcklwd m6, m1
movq [r0+r2*1], m6 movq [r0+r2*1], m6
psrlq m2, 16 psrlq m2, 16
psrlq m3, 16 psrlq m1, 16
punpcklwd m2, m3 punpcklwd m2, m1
movq [r0+r2*2], m2 movq [r0+r2*2], m2
psrlq m2, 32 psrlq m2, 32
movd [r1+r2*1], m2 movd [r1+r2*1], m2
...@@ -333,7 +327,7 @@ cglobal pred8x8_vertical_10_sse2, 2,2 ...@@ -333,7 +327,7 @@ cglobal pred8x8_vertical_10_sse2, 2,2
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
INIT_XMM INIT_XMM
cglobal pred8x8_horizontal_10_sse2, 2,3 cglobal pred8x8_horizontal_10_sse2, 2,3
mov r2, 4 mov r2d, 4
.loop: .loop:
movq m0, [r0+r1*0-8] movq m0, [r0+r1*0-8]
movq m1, [r0+r1*1-8] movq m1, [r0+r1*1-8]
...@@ -344,7 +338,7 @@ cglobal pred8x8_horizontal_10_sse2, 2,3 ...@@ -344,7 +338,7 @@ cglobal pred8x8_horizontal_10_sse2, 2,3
mova [r0+r1*0], m0 mova [r0+r1*0], m0
mova [r0+r1*1], m1 mova [r0+r1*1], m1
lea r0, [r0+r1*2] lea r0, [r0+r1*2]
dec r2 dec r2d
jg .loop jg .loop
REP_RET REP_RET
...@@ -362,53 +356,53 @@ cglobal pred8x8_horizontal_10_sse2, 2,3 ...@@ -362,53 +356,53 @@ cglobal pred8x8_horizontal_10_sse2, 2,3
%endmacro %endmacro
%macro PRED8x8_DC 2 %macro PRED8x8_DC 2
cglobal pred8x8_dc_10_%1, 2,4 cglobal pred8x8_dc_10_%1, 2,6
%ifdef ARCH_X86_64
%define t0 r10
%else
%define t0 r0m
%endif
sub r0, r1 sub r0, r1
pxor m4, m4 pxor m4, m4
movq m0, [r0+0] movq m0, [r0+0]
movq m1, [r0+8] movq m1, [r0+8]
HADDW m0, m2 %if mmsize==16
mov t0, r0 punpcklwd m0, m1
HADDW m1, m2 movhlps m1, m0
paddw m0, m1
%else
pshufw m2, m0, 00001110b
pshufw m3, m1, 00001110b
paddw m0, m2
paddw m1, m3
punpcklwd m0, m1
%endif
%2 m2, m0, 00001110b
paddw m0, m2
lea r5, [r1*3]
lea r4, [r0+r1*4]
movzx r2d, word [r0+r1*1-2] movzx r2d, word [r0+r1*1-2]
movzx r3d, word [r0+r1*2-2] movzx r3d, word [r0+r1*2-2]
lea r0, [r0+r1*2]
add r2d, r3d add r2d, r3d
movzx r3d, word [r0+r1*1-2] movzx r3d, word [r0+r5*1-2]
add r2d, r3d add r2d, r3d
movzx r3d, word [r0+r1*2-2] movzx r3d, word [r4-2]
add r2d, r3d add r2d, r3d
lea r0, [r0+r1*2]
movd m2, r2d ; s2 movd m2, r2d ; s2
movzx r2d, word [r0+r1*1-2] movzx r2d, word [r4+r1*1-2]
movzx r3d, word [r0+r1*2-2] movzx r3d, word [r4+r1*2-2]
lea r0, [r0+r1*2]
add r2d, r3d add r2d, r3d
movzx r3d, word [r0+r1*1-2] movzx r3d, word [r4+r5*1-2]
add r2d, r3d add r2d, r3d
movzx r3d, word [r0+r1*2-2] movzx r3d, word [r4+r1*4-2]
add r2d, r3d add r2d, r3d
movd m3, r2d ; s3 movd m3, r2d ; s3
punpcklwd m0, m1
mov r0, t0
punpcklwd m2, m3 punpcklwd m2, m3
punpckldq m0, m2 ; s0, s1, s2, s3 punpckldq m0, m2 ; s0, s1, s2, s3
%2 m3, m0, 11110110b ; s2, s1, s3, s3 %2 m3, m0, 11110110b ; s2, s1, s3, s3
lea r2, [r1+r1*2]
%2 m0, m0, 01110100b ; s0, s1, s3, s1 %2 m0, m0, 01110100b ; s0, s1, s3, s1
paddw m0, m3 paddw m0, m3
lea r3, [r0+r1*4]
psrlw m0, 2 psrlw m0, 2
pavgw m0, m4 ; s0+s2, s1, s3, s1+s3 pavgw m0, m4 ; s0+s2, s1, s3, s1+s3
%ifidn %1, sse2 %if mmsize==16
punpcklwd m0, m0 punpcklwd m0, m0
pshufd m3, m0, 11111010b pshufd m3, m0, 11111010b
punpckldq m0, m0 punpckldq m0, m0
...@@ -421,12 +415,12 @@ cglobal pred8x8_dc_10_%1, 2,4 ...@@ -421,12 +415,12 @@ cglobal pred8x8_dc_10_%1, 2,4
%endif %endif
MOV8 r0+r1*1, m1, m2 MOV8 r0+r1*1, m1, m2
MOV8 r0+r1*2, m1, m2 MOV8 r0+r1*2, m1, m2
MOV8 r0+r2*1, m1, m2 MOV8 r0+r5*1, m1, m2
MOV8 r0+r1*4, m1, m2 MOV8 r0+r1*4, m1, m2
MOV8 r3+r1*1, m3, m4 MOV8 r4+r1*1, m3, m4
MOV8 r3+r1*2, m3, m4 MOV8 r4+r1*2, m3, m4
MOV8 r3+r2*1, m3, m4 MOV8 r4+r5*1, m3, m4
MOV8 r3+r1*4, m3, m4 MOV8 r4+r1*4, m3, m4
RET RET
%endmacro %endmacro
...@@ -438,39 +432,29 @@ PRED8x8_DC sse2 , pshuflw ...@@ -438,39 +432,29 @@ PRED8x8_DC sse2 , pshuflw
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void pred8x8_top_dc(pixel *src, int stride) ; void pred8x8_top_dc(pixel *src, int stride)
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%macro PRED8x8_TOP_DC 2 INIT_XMM
cglobal pred8x8_top_dc_10_%1, 2,4 cglobal pred8x8_top_dc_10_sse2, 2,4
sub r0, r1 sub r0, r1
movq m0, [r0+0] mova m0, [r0]
movq m1, [r0+8] pshuflw m1, m0, 0x4e
HADDW m0, m2 pshufhw m1, m1, 0x4e
HADDW m1, m3 paddw m0, m1
lea r2, [r1+r1*2] pshuflw m1, m0, 0xb1
paddw m0, [pw_2] pshufhw m1, m1, 0xb1
paddw m1, [pw_2] paddw m0, m1
lea r2, [r1*3]
lea r3, [r0+r1*4] lea r3, [r0+r1*4]
paddw m0, [pw_2]
psrlw m0, 2 psrlw m0, 2
psrlw m1, 2 mova [r0+r1*1], m0
%2 m0, m0, 0 mova [r0+r1*2], m0
%2 m1, m1, 0 mova [r0+r2*1], m0
%ifidn %1, sse2 mova [r0+r1*4], m0
punpcklqdq m0, m1 mova [r3+r1*1], m0
%endif mova [r3+r1*2], m0
MOV8 r0+r1*1, m0, m1 mova [r3+r2*1], m0
MOV8 r0+r1*2, m0, m1 mova [r3+r1*4], m0
MOV8 r0+r2*1, m0, m1
MOV8 r0+r1*4, m0, m1
MOV8 r3+r1*1, m0, m1
MOV8 r3+r1*2, m0, m1
MOV8 r3+r2*1, m0, m1
MOV8 r3+r1*4, m0, m1
RET RET
%endmacro
INIT_MMX
PRED8x8_TOP_DC mmxext, pshufw
INIT_XMM
PRED8x8_TOP_DC sse2 , pshuflw
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void pred8x8_plane(pixel *src, int stride) ; void pred8x8_plane(pixel *src, int stride)
...@@ -478,7 +462,7 @@ PRED8x8_TOP_DC sse2 , pshuflw ...@@ -478,7 +462,7 @@ PRED8x8_TOP_DC sse2 , pshuflw
INIT_XMM INIT_XMM
cglobal pred8x8_plane_10_sse2, 2,7,7 cglobal pred8x8_plane_10_sse2, 2,7,7
sub r0, r1 sub r0, r1
lea r2, [r1+r1*2] lea r2, [r1*3]
lea r3, [r0+r1*4] lea r3, [r0+r1*4]
mova m2, [r0] mova m2, [r0]
pmaddwd m2, [pw_m32101234] pmaddwd m2, [pw_m32101234]
...@@ -500,7 +484,7 @@ cglobal pred8x8_plane_10_sse2, 2,7,7 ...@@ -500,7 +484,7 @@ cglobal pred8x8_plane_10_sse2, 2,7,7
movzx r5d, word [r3+r2*1-2] ; src[6*stride-1] movzx r5d, word [r3+r2*1-2] ; src[6*stride-1]
movzx r6d, word [r0+r1*1-2] ; src[0*stride-1] movzx r6d, word [r0+r1*1-2] ; src[0*stride-1]
sub r5d, r6d sub r5d, r6d
lea r5d, [r5+r5*2] lea r5d, [r5*3]
add r4d, r5d add r4d, r5d
movzx r6d, word [r3+r1*4-2] ; src[7*stride-1] movzx r6d, word [r3+r1*4-2] ; src[7*stride-1]
movzx r5d, word [r0+r1*0-2] ; src[ -stride-1] movzx r5d, word [r0+r1*0-2] ; src[ -stride-1]
...@@ -540,8 +524,8 @@ cglobal pred8x8_plane_10_sse2, 2,7,7 ...@@ -540,8 +524,8 @@ cglobal pred8x8_plane_10_sse2, 2,7,7
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%macro PRED8x8L_128_DC 1 %macro PRED8x8L_128_DC 1
cglobal pred8x8l_128_dc_10_%1, 4,4 cglobal pred8x8l_128_dc_10_%1, 4,4
mova m0, [pw_512] mova m0, [pw_512] ; (1<<(BIT_DEPTH-1))
lea r1, [r3+r3*2] lea r1, [r3*3]
lea r2, [r0+r3*4] lea r2, [r0+r3*4]
MOV8 r0+r3*0, m0, m0 MOV8 r0+r3*0, m0, m0
MOV8 r0+r3*1, m0, m0 MOV8 r0+r3*1, m0, m0
...@@ -565,37 +549,17 @@ PRED8x8L_128_DC sse2 ...@@ -565,37 +549,17 @@ PRED8x8L_128_DC sse2
%macro PRED8x8L_TOP_DC 1 %macro PRED8x8L_TOP_DC 1
cglobal pred8x8l_top_dc_10_%1, 4,4,6 cglobal pred8x8l_top_dc_10_%1, 4,4,6
sub r0, r3 sub r0, r3
pxor m7, m7 mova m0, [r0]
mova m0, [r0-16] shr r1d, 14
mova m3, [r0] shr r2d, 13
mova m1, [r0+16] neg r1
mova m2, m3 pslldq m1, m0, 2
mova m4, m3 psrldq m2, m0, 2
PALIGNR m2, m0, 14, m0 pinsrw m1, [r0+r1], 0
PALIGNR m1, m4, 2, m4 pinsrw m2, [r0+r2+14], 7
test r1, r1 ; top_left lea r1, [r3*3]
jz .fix_lt_2
test r2, r2 ; top_right
jz .fix_tr_1
jmp .body
.fix_lt_2:
mova m5, m3
pxor m5, m2
pslldq m5, 14
psrldq m5, 14
pxor m2, m5
test r2, r2 ; top_right
jnz .body
.fix_tr_1:
mova m5, m3
pxor m5, m1
psrldq m5, 14
pslldq m5, 14
pxor m1, m5
.body
lea r1, [r3+r3*2]
lea r2, [r0+r3*4] lea r2, [r0+r3*4]
PRED4x4_LOWPASS m0, m2, m1, m3 PRED4x4_LOWPASS m0, m2, m1, m0
HADDW m0, m1 HADDW m0, m1
paddw m0, [pw_4] paddw m0, [pw_4]
psrlw m0, 3 psrlw m0, 3
...@@ -612,110 +576,70 @@ cglobal pred8x8l_top_dc_10_%1, 4,4,6 ...@@ -612,110 +576,70 @@ cglobal pred8x8l_top_dc_10_%1, 4,4,6
%endmacro %endmacro
INIT_XMM INIT_XMM
%define PALIGNR PALIGNR_MMX
PRED8x8L_TOP_DC sse2 PRED8x8L_TOP_DC sse2
%define PALIGNR PALIGNR_SSSE3 %ifdef HAVE_AVX
PRED8x8L_TOP_DC ssse3 INIT_AVX
PRED8x8L_TOP_DC avx
%endif
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
;void pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride) ;void pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride)
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
;TODO: see if scalar is faster ;TODO: see if scalar is faster
%macro PRED8x8L_DC 1 %macro PRED8x8L_DC 1
cglobal pred8x8l_dc_10_%1, 4,5,8 cglobal pred8x8l_dc_10_%1, 4,6,6
sub r0, r3 sub r0, r3
lea r4, [r0+r3*2] lea r4, [r0+r3*4]
mova m0, [r0+r3*1-16] lea r5, [r3*3]
punpckhwd m0, [r0+r3*0-16] mova m0, [r0+r3*2-16]
mova m1, [r4+r3*1-16] punpckhwd m0, [r0+r3*1-16]
punpckhwd m1, [r0+r3*2-16] mova m1, [r4+r3*0-16]
mov r4, r0 punpckhwd m1, [r0+r5*1-16]
punpckhdq m1, m0 punpckhdq m1, m0
lea r0, [r0+r3*4] mova m2, [r4+r3*2-16]
mova m2, [r0+r3*1-16] punpckhwd m2, [r4+r3*1-16]
punpckhwd m2, [r0+r3*0-16] mova m3, [r4+r3*4-16]
lea r0, [r0+r3*2] punpckhwd m3, [r4+r5*1-16]
mova m3, [r0+r3*1-16]
punpckhwd m3, [r0+r3*0-16]
punpckhdq m3, m2 punpckhdq m3, m2
punpckhqdq m3, m1 punpckhqdq m3, m1
lea r0, [r0+r3*2] mova m0, [r0]
mova m0, [r0+r3*0-16] shr r1d, 14
mova m1, [r4] shr r2d, 13
mov r0, r4 neg r1
mova m4, m3 pslldq m1, m0, 2
mova m2, m3 psrldq m2, m0, 2
PALIGNR m4, m0, 14, m0 pinsrw m1, [r0+r1], 0
PALIGNR m1, m2, 2, m2 pinsrw m2, [r0+r2+14], 7
test r1, r1 not r1
jnz .do_left and r1, r3
.fix_lt_1: pslldq m4, m3, 2
mova m5, m3 psrldq m5, m3, 2
pxor m5, m4 pshuflw m4, m4, 11100101b
psrldq m5, 14 pinsrw m5, [r0+r1-2], 7
pslldq m5, 12 PRED4x4_LOWPASS m3, m4, m5, m3
pxor m1, m5 PRED4x4_LOWPASS m0, m2, m1, m0
jmp .do_left paddw m0, m3
.fix_lt_2: HADDW m0, m1
mova m5, m3 paddw m0, [pw_8]
pxor m5, m2 psrlw m0, 4
pslldq m5, 14 SPLATW m0, m0
psrldq m5, 14 mova [r0+r3*1], m0
pxor m2, m5 mova [r0+r3*2], m0
test r2, r2 mova [r0+r5*1], m0
jnz .body mova [r0+r3*4], m0
.fix_tr_1: mova [r4+r3*1], m0
mova m5, m3 mova [r4+r3*2], m0
pxor m5, m1 mova [r4+r5*1], m0
psrldq m5, 14 mova [r4+r3*4], m0
pslldq m5, 14
pxor m1, m5
jmp .body
.do_left:
mova m0, m4
PRED4x4_LOWPASS m2, m1, m4, m3
mova m4, m0
mova m7, m2
PRED4x4_LOWPASS m1, m3, m0, m4
pslldq m1, 14
PALIGNR m7, m1, 14, m3
mova m0, [r0-16]
mova m3, [r0]
mova m1, [r0+16]
mova m2, m3
mova m4, m3
PALIGNR m2, m0, 14, m0
PALIGNR m1, m4, 2, m4
test r1, r1
jz .fix_lt_2
test r2, r2
jz .fix_tr_1
.body
lea r1, [r3+r3*2]
PRED4x4_LOWPASS m6, m2, m1, m3
HADDW m7, m0
HADDW m6, m0
lea r2, [r0+r3*4]
paddw m7, [pw_8]
paddw m7, m6
psrlw m7, 4
SPLATW m7, m7
mova [r0+r3*1], m7
mova [r0+r3*2], m7
mova [r0+r1*1], m7
mova [r0+r3*4], m7
mova [r2+r3*1], m7
mova [r2+r3*2], m7
mova [r2+r1*1], m7
mova [r2+r3*4], m7
RET RET
%endmacro %endmacro
INIT_XMM INIT_XMM
%define PALIGNR PALIGNR_MMX
PRED8x8L_DC sse2 PRED8x8L_DC sse2
%define PALIGNR PALIGNR_SSSE3 %ifdef HAVE_AVX
PRED8x8L_DC ssse3 INIT_AVX
PRED8x8L_DC avx
%endif
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void pred8x8l_vertical(pixel *src, int has_topleft, int has_topright, int stride) ; void pred8x8l_vertical(pixel *src, int has_topleft, int has_topright, int stride)
...@@ -723,36 +647,17 @@ PRED8x8L_DC ssse3 ...@@ -723,36 +647,17 @@ PRED8x8L_DC ssse3
%macro PRED8x8L_VERTICAL 1 %macro PRED8x8L_VERTICAL 1
cglobal pred8x8l_vertical_10_%1, 4,4,6 cglobal pred8x8l_vertical_10_%1, 4,4,6
sub r0, r3 sub r0, r3
mova m0, [r0-16] mova m0, [r0]
mova m3, [r0] shr r1d, 14
mova m1, [r0+16] shr r2d, 13
mova m2, m3 neg r1
mova m4, m3 pslldq m1, m0, 2
PALIGNR m2, m0, 14, m0 psrldq m2, m0, 2
PALIGNR m1, m4, 2, m4 pinsrw m1, [r0+r1], 0
test r1, r1 ; top_left pinsrw m2, [r0+r2+14], 7
jz .fix_lt_2 lea r1, [r3*3]
test r2, r2 ; top_right
jz .fix_tr_1
jmp .body
.fix_lt_2:
mova m5, m3
pxor m5, m2
pslldq m5, 14
psrldq m5, 14
pxor m2, m5
test r2, r2 ; top_right
jnz .body
.fix_tr_1:
mova m5, m3
pxor m5, m1
psrldq m5, 14
pslldq m5, 14
pxor m1, m5
.body
lea r1, [r3+r3*2]
lea r2, [r0+r3*4] lea r2, [r0+r3*4]
PRED4x4_LOWPASS m0, m2, m1, m3 PRED4x4_LOWPASS m0, m2, m1, m0
mova [r0+r3*1], m0 mova [r0+r3*1], m0
mova [r0+r3*2], m0 mova [r0+r3*2], m0
mova [r0+r1*1], m0 mova [r0+r1*1], m0
...@@ -765,70 +670,56 @@ cglobal pred8x8l_vertical_10_%1, 4,4,6 ...@@ -765,70 +670,56 @@ cglobal pred8x8l_vertical_10_%1, 4,4,6
%endmacro %endmacro
INIT_XMM INIT_XMM
%define PALIGNR PALIGNR_MMX
PRED8x8L_VERTICAL sse2 PRED8x8L_VERTICAL sse2
%define PALIGNR PALIGNR_SSSE3 %ifdef HAVE_AVX
PRED8x8L_VERTICAL ssse3 INIT_AVX
PRED8x8L_VERTICAL avx
%endif
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, int stride) ; void pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, int stride)
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%macro PRED8x8L_HORIZONTAL 1 %macro PRED8x8L_HORIZONTAL 1
cglobal pred8x8l_horizontal_10_%1, 4,4,8 cglobal pred8x8l_horizontal_10_%1, 4,4,5
sub r0, r3 mova m0, [r0-16]
lea r2, [r0+r3*2] shr r1d, 14
mova m0, [r0+r3*1-16] dec r1
test r1, r1 and r1, r3
lea r1, [r0+r3] sub r1, r3
cmovnz r1, r0 punpckhwd m0, [r0+r1-16]
punpckhwd m0, [r1+r3*0-16] mova m1, [r0+r3*2-16]
mova m1, [r2+r3*1-16] punpckhwd m1, [r0+r3*1-16]
punpckhwd m1, [r0+r3*2-16] lea r2, [r0+r3*4]
mov r2, r0 lea r1, [r3*3]
punpckhdq m1, m0 punpckhdq m1, m0
lea r0, [r0+r3*4] mova m2, [r2+r3*0-16]
mova m2, [r0+r3*1-16] punpckhwd m2, [r0+r1-16]
punpckhwd m2, [r0+r3*0-16] mova m3, [r2+r3*2-16]
lea r0, [r0+r3*2] punpckhwd m3, [r2+r3*1-16]
mova m3, [r0+r3*1-16]
punpckhwd m3, [r0+r3*0-16]
punpckhdq m3, m2 punpckhdq m3, m2
punpckhqdq m3, m1 punpckhqdq m3, m1
lea r0, [r0+r3*2] PALIGNR m4, m3, [r2+r1-16], 14, m0
mova m0, [r0+r3*0-16] pslldq m0, m4, 2
mova m1, [r1+r3*0-16] pshuflw m0, m0, 11100101b
mov r0, r2 PRED4x4_LOWPASS m4, m3, m0, m4
mova m4, m3 punpckhwd m3, m4, m4
mova m2, m3 punpcklwd m4, m4
PALIGNR m4, m0, 14, m0
PALIGNR m1, m2, 2, m2
mova m0, m4
PRED4x4_LOWPASS m2, m1, m4, m3
mova m4, m0
mova m7, m2
PRED4x4_LOWPASS m1, m3, m0, m4
pslldq m1, 14
PALIGNR m7, m1, 14, m3
lea r1, [r3+r3*2]
punpckhwd m3, m7, m7
punpcklwd m7, m7
pshufd m0, m3, 0xff pshufd m0, m3, 0xff
pshufd m1, m3, 0xaa pshufd m1, m3, 0xaa
lea r2, [r0+r3*4]
pshufd m2, m3, 0x55 pshufd m2, m3, 0x55
pshufd m3, m3, 0x00 pshufd m3, m3, 0x00
pshufd m4, m7, 0xff mova [r0+r3*0], m0
pshufd m5, m7, 0xaa mova [r0+r3*1], m1
pshufd m6, m7, 0x55 mova [r0+r3*2], m2
pshufd m7, m7, 0x00 mova [r0+r1*1], m3
mova [r0+r3*1], m0 pshufd m0, m4, 0xff
mova [r0+r3*2], m1 pshufd m1, m4, 0xaa
mova [r0+r1*1], m2 pshufd m2, m4, 0x55
mova [r0+r3*4], m3 pshufd m3, m4, 0x00
mova [r2+r3*1], m4 mova [r2+r3*0], m0
mova [r2+r3*2], m5 mova [r2+r3*1], m1
mova [r2+r1*1], m6 mova [r2+r3*2], m2
mova [r2+r3*4], m7 mova [r2+r1*1], m3
RET RET
%endmacro %endmacro
...@@ -837,116 +728,68 @@ INIT_XMM ...@@ -837,116 +728,68 @@ INIT_XMM
PRED8x8L_HORIZONTAL sse2 PRED8x8L_HORIZONTAL sse2
%define PALIGNR PALIGNR_SSSE3 %define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL ssse3 PRED8x8L_HORIZONTAL ssse3
%ifdef HAVE_AVX
INIT_AVX
PRED8x8L_HORIZONTAL avx
%endif
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
;void pred8x8l_down_left(pixel *src, int has_topleft, int has_topright, int stride) ;void pred8x8l_down_left(pixel *src, int has_topleft, int has_topright, int stride)
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%macro PRED8x8L_DOWN_LEFT 1 %macro PRED8x8L_DOWN_LEFT 1
cglobal pred8x8l_down_left_10_%1, 4,4,8 cglobal pred8x8l_down_left_10_%1, 4,4,7
sub r0, r3 sub r0, r3
mova m0, [r0-16]
mova m3, [r0] mova m3, [r0]
shr r1d, 14
neg r1
shr r2d, 13
pslldq m1, m3, 2
psrldq m2, m3, 2
pinsrw m1, [r0+r1], 0
pinsrw m2, [r0+r2+14], 7
PRED4x4_LOWPASS m6, m2, m1, m3
jz .fix_tr ; flags from shr r2d
mova m1, [r0+16] mova m1, [r0+16]
mova m2, m3 psrldq m5, m1, 2
mova m4, m3 PALIGNR m2, m1, m3, 14, m3
PALIGNR m2, m0, 14, m0 pshufhw m5, m5, 10100100b
PALIGNR m1, m4, 2, m4 PRED4x4_LOWPASS m1, m2, m5, m1
test r1, r1
jz .fix_lt_2
test r2, r2
jz .fix_tr_1
jmp .do_top
.fix_lt_2:
mova m5, m3
pxor m5, m2
pslldq m5, 14
psrldq m5, 14
pxor m2, m5
test r2, r2
jnz .do_top
.fix_tr_1:
mova m5, m3
pxor m5, m1
psrldq m5, 14
pslldq m5, 14
pxor m1, m5
jmp .do_top
.fix_tr_2:
punpckhwd m3, m3
pshufd m1, m3, 0xFF
jmp .do_topright
.do_top:
PRED4x4_LOWPASS m4, m2, m1, m3
mova m7, m4
test r2, r2
jz .fix_tr_2
mova m0, [r0+16]
mova m5, m0
mova m2, m0
mova m4, m0
psrldq m5, 14
PALIGNR m2, m3, 14, m3
PALIGNR m5, m4, 2, m4
PRED4x4_LOWPASS m1, m2, m5, m0
.do_topright: .do_topright:
lea r1, [r3+r3*2] lea r1, [r3*3]
mova m6, m1 psrldq m5, m1, 14
psrldq m1, 14
mova m4, m1
lea r2, [r0+r3*4] lea r2, [r0+r3*4]
mova m2, m6 PALIGNR m2, m1, m6, 2, m0
PALIGNR m2, m7, 2, m0 PALIGNR m3, m1, m6, 14, m0
mova m3, m6 PALIGNR m5, m1, 2, m0
PALIGNR m3, m7, 14, m0 pslldq m4, m6, 2
PALIGNR m4, m6, 2, m0 PRED4x4_LOWPASS m6, m4, m2, m6
mova m5, m7 PRED4x4_LOWPASS m1, m3, m5, m1
mova m1, m7
mova m7, m6
pslldq m1, 2
PRED4x4_LOWPASS m0, m1, m2, m5
PRED4x4_LOWPASS m1, m3, m4, m7
mova [r2+r3*4], m1 mova [r2+r3*4], m1
mova m2, m0 PALIGNR m1, m6, 14, m2
pslldq m1, 2 pslldq m6, 2
psrldq m2, 14
pslldq m0, 2
por m1, m2
mova [r2+r1*1], m1 mova [r2+r1*1], m1
mova m2, m0 PALIGNR m1, m6, 14, m2
pslldq m1, 2 pslldq m6, 2
psrldq m2, 14
pslldq m0, 2
por m1, m2
mova [r2+r3*2], m1 mova [r2+r3*2], m1
mova m2, m0 PALIGNR m1, m6, 14, m2
pslldq m1, 2 pslldq m6, 2
psrldq m2, 14
pslldq m0, 2
por m1, m2
mova [r2+r3*1], m1 mova [r2+r3*1], m1
mova m2, m0 PALIGNR m1, m6, 14, m2
pslldq m1, 2 pslldq m6, 2
psrldq m2, 14
pslldq m0, 2
por m1, m2
mova [r0+r3*4], m1 mova [r0+r3*4], m1
mova m2, m0 PALIGNR m1, m6, 14, m2
pslldq m1, 2 pslldq m6, 2
psrldq m2, 14
pslldq m0, 2
por m1, m2
mova [r0+r1*1], m1 mova [r0+r1*1], m1
mova m2, m0 PALIGNR m1, m6, 14, m2
pslldq m1, 2 pslldq m6, 2
psrldq m2, 14
pslldq m0, 2
por m1, m2
mova [r0+r3*2], m1 mova [r0+r3*2], m1
pslldq m1, 2 PALIGNR m1, m6, 14, m6
psrldq m0, 14
por m1, m0
mova [r0+r3*1], m1 mova [r0+r3*1], m1
RET RET
.fix_tr:
punpckhwd m3, m3
pshufd m1, m3, 0xFF
jmp .do_topright
%endmacro %endmacro
INIT_XMM INIT_XMM
...@@ -954,139 +797,73 @@ INIT_XMM ...@@ -954,139 +797,73 @@ INIT_XMM
PRED8x8L_DOWN_LEFT sse2 PRED8x8L_DOWN_LEFT sse2
%define PALIGNR PALIGNR_SSSE3 %define PALIGNR PALIGNR_SSSE3
PRED8x8L_DOWN_LEFT ssse3 PRED8x8L_DOWN_LEFT ssse3
%ifdef HAVE_AVX
INIT_AVX
PRED8x8L_DOWN_LEFT avx
%endif
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
;void pred8x8l_down_right_mxext(pixel *src, int has_topleft, int has_topright, int stride) ;void pred8x8l_down_right(pixel *src, int has_topleft, int has_topright, int stride)
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%macro PRED8x8L_DOWN_RIGHT 1 %macro PRED8x8L_DOWN_RIGHT 1
; standard forbids this when has_topleft is false
; no need to check
cglobal pred8x8l_down_right_10_%1, 4,5,8 cglobal pred8x8l_down_right_10_%1, 4,5,8
sub r0, r3 sub r0, r3
lea r4, [r0+r3*2] lea r4, [r0+r3*4]
lea r1, [r3*3]
mova m0, [r0+r3*1-16] mova m0, [r0+r3*1-16]
punpckhwd m0, [r0+r3*0-16] punpckhwd m0, [r0+r3*0-16]
mova m1, [r4+r3*1-16] mova m1, [r0+r1*1-16]
punpckhwd m1, [r0+r3*2-16] punpckhwd m1, [r0+r3*2-16]
mov r4, r0
punpckhdq m1, m0 punpckhdq m1, m0
lea r0, [r0+r3*4] mova m2, [r4+r3*1-16]
mova m2, [r0+r3*1-16] punpckhwd m2, [r4+r3*0-16]
punpckhwd m2, [r0+r3*0-16] mova m3, [r4+r1*1-16]
lea r0, [r0+r3*2] punpckhwd m3, [r4+r3*2-16]
mova m3, [r0+r3*1-16]
punpckhwd m3, [r0+r3*0-16]
punpckhdq m3, m2 punpckhdq m3, m2
punpckhqdq m3, m1 punpckhqdq m3, m1
lea r0, [r0+r3*2] mova m0, [r4+r3*4-16]
mova m0, [r0+r3*0-16] mova m1, [r0]
mova m1, [r4] PALIGNR m4, m3, m0, 14, m0
mov r0, r4 PALIGNR m1, m3, 2, m2
mova m4, m3 pslldq m0, m4, 2
mova m2, m3 pshuflw m0, m0, 11100101b
PALIGNR m4, m0, 14, m0 PRED4x4_LOWPASS m6, m1, m4, m3
PALIGNR m1, m2, 2, m2 PRED4x4_LOWPASS m4, m3, m0, m4
test r1, r1 ; top_left
jz .fix_lt_1
.do_left:
mova m0, m4
PRED4x4_LOWPASS m2, m1, m4, m3
mova m4, m0
mova m7, m2
mova m6, m2
PRED4x4_LOWPASS m1, m3, m0, m4
pslldq m1, 14
PALIGNR m7, m1, 14, m3
mova m0, [r0-16]
mova m3, [r0] mova m3, [r0]
mova m1, [r0+16] shr r2d, 13
mova m2, m3 pslldq m1, m3, 2
mova m4, m3 psrldq m2, m3, 2
PALIGNR m2, m0, 14, m0 pinsrw m1, [r0-2], 0
PALIGNR m1, m4, 2, m4 pinsrw m2, [r0+r2+14], 7
test r1, r1 ; top_left PRED4x4_LOWPASS m3, m2, m1, m3
jz .fix_lt_2 PALIGNR m2, m3, m6, 2, m0
test r2, r2 ; top_right PALIGNR m5, m3, m6, 14, m0
jz .fix_tr_1 psrldq m7, m3, 2
.do_top: PRED4x4_LOWPASS m6, m4, m2, m6
PRED4x4_LOWPASS m4, m2, m1, m3 PRED4x4_LOWPASS m3, m5, m7, m3
mova m5, m4 mova [r4+r3*4], m6
jmp .body PALIGNR m3, m6, 14, m2
.fix_lt_1: pslldq m6, 2
mova m5, m3 mova [r0+r3*1], m3
pxor m5, m4 PALIGNR m3, m6, 14, m2
psrldq m5, 14 pslldq m6, 2
pslldq m5, 12 mova [r0+r3*2], m3
pxor m1, m5 PALIGNR m3, m6, 14, m2
jmp .do_left pslldq m6, 2
.fix_lt_2: mova [r0+r1*1], m3
mova m5, m3 PALIGNR m3, m6, 14, m2
pxor m5, m2 pslldq m6, 2
pslldq m5, 14 mova [r0+r3*4], m3
psrldq m5, 14 PALIGNR m3, m6, 14, m2
pxor m2, m5 pslldq m6, 2
test r2, r2 ; top_right mova [r4+r3*1], m3
jnz .do_top PALIGNR m3, m6, 14, m2
.fix_tr_1: pslldq m6, 2
mova m5, m3 mova [r4+r3*2], m3
pxor m5, m1 PALIGNR m3, m6, 14, m6
psrldq m5, 14 mova [r4+r1*1], m3
pslldq m5, 14
pxor m1, m5
jmp .do_top
.body
lea r1, [r3+r3*2]
mova m1, m7
mova m7, m5
mova m5, m6
mova m2, m7
lea r2, [r0+r3*4]
PALIGNR m2, m6, 2, m0
mova m3, m7
PALIGNR m3, m6, 14, m0
mova m4, m7
psrldq m4, 2
PRED4x4_LOWPASS m0, m1, m2, m5
PRED4x4_LOWPASS m1, m3, m4, m7
mova [r2+r3*4], m0
mova m2, m1
psrldq m0, 2
pslldq m2, 14
psrldq m1, 2
por m0, m2
mova [r2+r1*1], m0
mova m2, m1
psrldq m0, 2
pslldq m2, 14
psrldq m1, 2
por m0, m2
mova [r2+r3*2], m0
mova m2, m1
psrldq m0, 2
pslldq m2, 14
psrldq m1, 2
por m0, m2
mova [r2+r3*1], m0
mova m2, m1
psrldq m0, 2
pslldq m2, 14
psrldq m1, 2
por m0, m2
mova [r0+r3*4], m0
mova m2, m1
psrldq m0, 2
pslldq m2, 14
psrldq m1, 2
por m0, m2
mova [r0+r1*1], m0
mova m2, m1
psrldq m0, 2
pslldq m2, 14
psrldq m1, 2
por m0, m2
mova [r0+r3*2], m0
psrldq m0, 2
pslldq m1, 14
por m0, m1
mova [r0+r3*1], m0
RET RET
%endmacro %endmacro
...@@ -1095,114 +872,69 @@ INIT_XMM ...@@ -1095,114 +872,69 @@ INIT_XMM
PRED8x8L_DOWN_RIGHT sse2 PRED8x8L_DOWN_RIGHT sse2
%define PALIGNR PALIGNR_SSSE3 %define PALIGNR PALIGNR_SSSE3
PRED8x8L_DOWN_RIGHT ssse3 PRED8x8L_DOWN_RIGHT ssse3
%ifdef HAVE_AVX
INIT_AVX
PRED8x8L_DOWN_RIGHT avx
%endif
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void pred8x8l_vertical_right(pixel *src, int has_topleft, int has_topright, int stride) ; void pred8x8l_vertical_right(pixel *src, int has_topleft, int has_topright, int stride)
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%macro PRED8x8L_VERTICAL_RIGHT 1 %macro PRED8x8L_VERTICAL_RIGHT 1
cglobal pred8x8l_vertical_right_10_%1, 4,5,8 ; likewise with 8x8l_down_right
cglobal pred8x8l_vertical_right_10_%1, 4,5,7
sub r0, r3 sub r0, r3
lea r4, [r0+r3*2] lea r4, [r0+r3*4]
lea r1, [r3*3]
mova m0, [r0+r3*1-16] mova m0, [r0+r3*1-16]
punpckhwd m0, [r0+r3*0-16] punpckhwd m0, [r0+r3*0-16]
mova m1, [r4+r3*1-16] mova m1, [r0+r1*1-16]
punpckhwd m1, [r0+r3*2-16] punpckhwd m1, [r0+r3*2-16]
mov r4, r0
punpckhdq m1, m0 punpckhdq m1, m0
lea r0, [r0+r3*4] mova m2, [r4+r3*1-16]
mova m2, [r0+r3*1-16] punpckhwd m2, [r4+r3*0-16]
punpckhwd m2, [r0+r3*0-16] mova m3, [r4+r1*1-16]
lea r0, [r0+r3*2] punpckhwd m3, [r4+r3*2-16]
mova m3, [r0+r3*1-16]
punpckhwd m3, [r0+r3*0-16]
punpckhdq m3, m2 punpckhdq m3, m2
punpckhqdq m3, m1 punpckhqdq m3, m1
lea r0, [r0+r3*2] mova m0, [r4+r3*4-16]
mova m0, [r0+r3*0-16] mova m1, [r0]
mova m1, [r4] PALIGNR m4, m3, m0, 14, m0
mov r0, r4 PALIGNR m1, m3, 2, m2
mova m4, m3 PRED4x4_LOWPASS m3, m1, m4, m3
mova m2, m3 mova m2, [r0]
PALIGNR m4, m0, 14, m0 shr r2d, 13
PALIGNR m1, m2, 2, m2 pslldq m1, m2, 2
test r1, r1 psrldq m5, m2, 2
jz .fix_lt_1 pinsrw m1, [r0-2], 0
jmp .do_left pinsrw m5, [r0+r2+14], 7
.fix_lt_1: PRED4x4_LOWPASS m2, m5, m1, m2
mova m5, m3 PALIGNR m6, m2, m3, 12, m1
pxor m5, m4 PALIGNR m5, m2, m3, 14, m0
psrldq m5, 14 PRED4x4_LOWPASS m0, m6, m2, m5
pslldq m5, 12 pavgw m2, m5
pxor m1, m5
jmp .do_left
.fix_lt_2:
mova m5, m3
pxor m5, m2
pslldq m5, 14
psrldq m5, 14
pxor m2, m5
test r2, r2
jnz .do_top
.fix_tr_1:
mova m5, m3
pxor m5, m1
psrldq m5, 14
pslldq m5, 14
pxor m1, m5
jmp .do_top
.do_left:
mova m0, m4
PRED4x4_LOWPASS m2, m1, m4, m3
mova m7, m2
mova m0, [r0-16]
mova m3, [r0]
mova m1, [r0+16]
mova m2, m3
mova m4, m3
PALIGNR m2, m0, 14, m0
PALIGNR m1, m4, 2, m4
test r1, r1
jz .fix_lt_2
test r2, r2
jz .fix_tr_1
.do_top
PRED4x4_LOWPASS m6, m2, m1, m3
lea r1, [r3+r3*2]
mova m2, m6
mova m3, m6
PALIGNR m3, m7, 14, m0
PALIGNR m6, m7, 12, m1
mova m4, m3
pavgw m3, m2
lea r2, [r0+r3*4]
PRED4x4_LOWPASS m0, m6, m2, m4
mova [r0+r3*1], m3
mova [r0+r3*2], m0 mova [r0+r3*2], m0
mova m5, m0 mova [r0+r3*1], m2
mova m6, m3 pslldq m6, m3, 4
mova m1, m7 pslldq m1, m3, 2
mova m2, m1 PRED4x4_LOWPASS m1, m3, m6, m1
pslldq m2, 2 PALIGNR m2, m1, 14, m4
mova m3, m1 mova [r0+r1*1], m2
pslldq m3, 4 pslldq m1, 2
PRED4x4_LOWPASS m0, m1, m3, m2 PALIGNR m0, m1, 14, m3
PALIGNR m6, m0, 14, m2 mova [r0+r3*4], m0
mova [r0+r1*1], m6 pslldq m1, 2
pslldq m0, 2 PALIGNR m2, m1, 14, m4
PALIGNR m5, m0, 14, m1 mova [r4+r3*1], m2
mova [r0+r3*4], m5 pslldq m1, 2
pslldq m0, 2 PALIGNR m0, m1, 14, m3
PALIGNR m6, m0, 14, m2 mova [r4+r3*2], m0
mova [r2+r3*1], m6 pslldq m1, 2
pslldq m0, 2 PALIGNR m2, m1, 14, m4
PALIGNR m5, m0, 14, m1 mova [r4+r1*1], m2
mova [r2+r3*2], m5 pslldq m1, 2
pslldq m0, 2 PALIGNR m0, m1, 14, m1
PALIGNR m6, m0, 14, m2 mova [r4+r3*4], m0
mova [r2+r1*1], m6
pslldq m0, 2
PALIGNR m5, m0, 14, m1
mova [r2+r3*4], m5
RET RET
%endmacro %endmacro
...@@ -1211,84 +943,60 @@ INIT_XMM ...@@ -1211,84 +943,60 @@ INIT_XMM
PRED8x8L_VERTICAL_RIGHT sse2 PRED8x8L_VERTICAL_RIGHT sse2
%define PALIGNR PALIGNR_SSSE3 %define PALIGNR PALIGNR_SSSE3
PRED8x8L_VERTICAL_RIGHT ssse3 PRED8x8L_VERTICAL_RIGHT ssse3
%ifdef HAVE_AVX
INIT_AVX
PRED8x8L_VERTICAL_RIGHT avx
%endif
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void pred8x8l_horizontal_up(pixel *src, int has_topleft, int has_topright, int stride) ; void pred8x8l_horizontal_up(pixel *src, int has_topleft, int has_topright, int stride)
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%macro PRED8x8L_HORIZONTAL_UP 1 %macro PRED8x8L_HORIZONTAL_UP 1
cglobal pred8x8l_horizontal_up_10_%1, 4,4,8 cglobal pred8x8l_horizontal_up_10_%1, 4,4,6
sub r0, r3
lea r2, [r0+r3*2]
mova m0, [r0+r3*1-16]
test r1, r1
lea r1, [r0+r3]
cmovnz r1, r0
punpckhwd m0, [r1+r3*0-16]
mova m1, [r2+r3*1-16]
punpckhwd m1, [r0+r3*2-16]
mov r2, r0
punpckhdq m1, m0
lea r0, [r0+r3*4]
mova m2, [r0+r3*1-16]
punpckhwd m2, [r0+r3*0-16]
lea r0, [r0+r3*2]
mova m3, [r0+r3*1-16]
punpckhwd m3, [r0+r3*0-16]
punpckhdq m3, m2
punpckhqdq m3, m1
lea r0, [r0+r3*2]
mova m0, [r0+r3*0-16] mova m0, [r0+r3*0-16]
mova m1, [r1+r3*0-16] punpckhwd m0, [r0+r3*1-16]
mov r0, r2 shr r1d, 14
mova m4, m3 dec r1
mova m2, m3 and r1, r3
PALIGNR m4, m0, 14, m0 sub r1, r3
PALIGNR m1, m2, 2, m2 mova m4, [r0+r1*1-16]
mova m0, m4 lea r1, [r3*3]
PRED4x4_LOWPASS m2, m1, m4, m3
mova m4, m0
mova m7, m2
PRED4x4_LOWPASS m1, m3, m0, m4
pslldq m1, 14
PALIGNR m7, m1, 14, m3
lea r1, [r3+r3*2]
pshufd m0, m7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
pslldq m7, 14 ; l7 .. .. .. .. .. .. ..
mova m2, m0
pslld m0, 16
psrld m2, 16
por m2, m0 ; l7 l6 l5 l4 l3 l2 l1 l0
mova m3, m2
mova m4, m2
mova m5, m2
psrldq m2, 2
psrldq m3, 4
lea r2, [r0+r3*4] lea r2, [r0+r3*4]
por m2, m7 ; l7 l7 l6 l5 l4 l3 l2 l1 mova m1, [r0+r3*2-16]
punpckhwd m7, m7 punpckhwd m1, [r0+r1*1-16]
por m3, m7 ; l7 l7 l7 l6 l5 l4 l3 l2 punpckhdq m0, m1
pavgw m4, m2 mova m2, [r2+r3*0-16]
PRED4x4_LOWPASS m1, m3, m5, m2 punpckhwd m2, [r2+r3*1-16]
mova m5, m4 mova m3, [r2+r3*2-16]
punpcklwd m4, m1 ; p4 p3 p2 p1 punpckhwd m3, [r2+r1*1-16]
punpckhwd m5, m1 ; p8 p7 p6 p5 punpckhdq m2, m3
mova m6, m5 punpckhqdq m0, m2
mova m7, m5 PALIGNR m1, m0, m4, 14, m4
mova m0, m5 psrldq m2, m0, 2
PALIGNR m5, m4, 4, m1 pshufhw m2, m2, 10100100b
pshufd m1, m6, 11111001b PRED4x4_LOWPASS m0, m1, m2, m0
PALIGNR m6, m4, 8, m2 psrldq m1, m0, 2
pshufd m2, m7, 11111110b psrldq m2, m0, 4
PALIGNR m7, m4, 12, m3 pshufhw m1, m1, 10100100b
pshufd m3, m0, 11111111b pshufhw m2, m2, 01010100b
mova [r0+r3*1], m4 pavgw m4, m0, m1
mova [r0+r3*2], m5 PRED4x4_LOWPASS m1, m2, m0, m1
mova [r0+r1*1], m6 punpckhwd m5, m4, m1
mova [r0+r3*4], m7 punpcklwd m4, m1
mova [r2+r3*0], m5
mova [r0+r3*0], m4
pshufd m0, m5, 11111001b
pshufd m1, m5, 11111110b
pshufd m2, m5, 11111111b
mova [r2+r3*1], m0 mova [r2+r3*1], m0
mova [r2+r3*2], m1 mova [r2+r3*2], m1
mova [r2+r1*1], m2 mova [r2+r1*1], m2
mova [r2+r3*4], m3 PALIGNR m2, m5, m4, 4, m0
PALIGNR m3, m5, m4, 8, m1
PALIGNR m5, m5, m4, 12, m4
mova [r0+r3*1], m2
mova [r0+r3*2], m3
mova [r0+r1*1], m5
RET RET
%endmacro %endmacro
...@@ -1297,7 +1005,10 @@ INIT_XMM ...@@ -1297,7 +1005,10 @@ INIT_XMM
PRED8x8L_HORIZONTAL_UP sse2 PRED8x8L_HORIZONTAL_UP sse2
%define PALIGNR PALIGNR_SSSE3 %define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL_UP ssse3 PRED8x8L_HORIZONTAL_UP ssse3
%ifdef HAVE_AVX
INIT_AVX
PRED8x8L_HORIZONTAL_UP avx
%endif
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
...@@ -1315,7 +1026,7 @@ PRED8x8L_HORIZONTAL_UP ssse3 ...@@ -1315,7 +1026,7 @@ PRED8x8L_HORIZONTAL_UP ssse3
%macro PRED16x16_VERTICAL 1 %macro PRED16x16_VERTICAL 1
cglobal pred16x16_vertical_10_%1, 2,3 cglobal pred16x16_vertical_10_%1, 2,3
sub r0, r1 sub r0, r1
mov r2, 8 mov r2d, 8
mova m0, [r0+ 0] mova m0, [r0+ 0]
mova m1, [r0+mmsize] mova m1, [r0+mmsize]
%if mmsize==8 %if mmsize==8
...@@ -1326,7 +1037,7 @@ cglobal pred16x16_vertical_10_%1, 2,3 ...@@ -1326,7 +1037,7 @@ cglobal pred16x16_vertical_10_%1, 2,3
MOV16 r0+r1*1, m0, m1, m2, m3 MOV16 r0+r1*1, m0, m1, m2, m3
MOV16 r0+r1*2, m0, m1, m2, m3 MOV16 r0+r1*2, m0, m1, m2, m3
lea r0, [r0+r1*2] lea r0, [r0+r1*2]
dec r2 dec r2d
jg .loop jg .loop
REP_RET REP_RET
%endmacro %endmacro
...@@ -1341,7 +1052,7 @@ PRED16x16_VERTICAL sse2 ...@@ -1341,7 +1052,7 @@ PRED16x16_VERTICAL sse2
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%macro PRED16x16_HORIZONTAL 1 %macro PRED16x16_HORIZONTAL 1
cglobal pred16x16_horizontal_10_%1, 2,3 cglobal pred16x16_horizontal_10_%1, 2,3
mov r2, 8 mov r2d, 8
.vloop: .vloop:
movd m0, [r0+r1*0-4] movd m0, [r0+r1*0-4]
movd m1, [r0+r1*1-4] movd m1, [r0+r1*1-4]
...@@ -1350,7 +1061,7 @@ cglobal pred16x16_horizontal_10_%1, 2,3 ...@@ -1350,7 +1061,7 @@ cglobal pred16x16_horizontal_10_%1, 2,3
MOV16 r0+r1*0, m0, m0, m0, m0 MOV16 r0+r1*0, m0, m0, m0, m0
MOV16 r0+r1*1, m1, m1, m1, m1 MOV16 r0+r1*1, m1, m1, m1, m1
lea r0, [r0+r1*2] lea r0, [r0+r1*2]
dec r2 dec r2d
jg .vloop jg .vloop
REP_RET REP_RET
%endmacro %endmacro
...@@ -1364,8 +1075,8 @@ PRED16x16_HORIZONTAL sse2 ...@@ -1364,8 +1075,8 @@ PRED16x16_HORIZONTAL sse2
; void pred16x16_dc(pixel *src, int stride) ; void pred16x16_dc(pixel *src, int stride)
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%macro PRED16x16_DC 1 %macro PRED16x16_DC 1
cglobal pred16x16_dc_10_%1, 2,7 cglobal pred16x16_dc_10_%1, 2,6
mov r4, r0 mov r5, r0
sub r0, r1 sub r0, r1
mova m0, [r0+0] mova m0, [r0+0]
paddw m0, [r0+mmsize] paddw m0, [r0+mmsize]
...@@ -1375,17 +1086,17 @@ cglobal pred16x16_dc_10_%1, 2,7 ...@@ -1375,17 +1086,17 @@ cglobal pred16x16_dc_10_%1, 2,7
%endif %endif
HADDW m0, m2 HADDW m0, m2
sub r0, 2 lea r0, [r0+r1-2]
movzx r3d, word [r0+r1*1] movzx r3d, word [r0]
movzx r5d, word [r0+r1*2] movzx r4d, word [r0+r1]
%rep 7 %rep 7
lea r0, [r0+r1*2] lea r0, [r0+r1*2]
movzx r2d, word [r0+r1*1] movzx r2d, word [r0]
add r3d, r2d add r3d, r2d
movzx r2d, word [r0+r1*2] movzx r2d, word [r0+r1]
add r5d, r2d add r4d, r2d
%endrep %endrep
lea r3d, [r3+r5+16] lea r3d, [r3+r4+16]
movd m1, r3d movd m1, r3d
paddw m0, m1 paddw m0, m1
...@@ -1393,9 +1104,9 @@ cglobal pred16x16_dc_10_%1, 2,7 ...@@ -1393,9 +1104,9 @@ cglobal pred16x16_dc_10_%1, 2,7
SPLATW m0, m0 SPLATW m0, m0
mov r3d, 8 mov r3d, 8
.loop: .loop:
MOV16 r4+r1*0, m0, m0, m0, m0 MOV16 r5+r1*0, m0, m0, m0, m0
MOV16 r4+r1*1, m0, m0, m0, m0 MOV16 r5+r1*1, m0, m0, m0, m0
lea r4, [r4+r1*2] lea r5, [r5+r1*2]
dec r3d dec r3d
jg .loop jg .loop
REP_RET REP_RET
...@@ -1442,29 +1153,29 @@ PRED16x16_TOP_DC sse2 ...@@ -1442,29 +1153,29 @@ PRED16x16_TOP_DC sse2
; void pred16x16_left_dc(pixel *src, int stride) ; void pred16x16_left_dc(pixel *src, int stride)
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%macro PRED16x16_LEFT_DC 1 %macro PRED16x16_LEFT_DC 1
cglobal pred16x16_left_dc_10_%1, 2,7 cglobal pred16x16_left_dc_10_%1, 2,6
mov r4, r0 mov r5, r0
sub r0, 2 sub r0, 2
movzx r5d, word [r0+r1*0] movzx r3d, word [r0]
movzx r6d, word [r0+r1*1] movzx r4d, word [r0+r1]
%rep 7 %rep 7
lea r0, [r0+r1*2] lea r0, [r0+r1*2]
movzx r2d, word [r0+r1*0] movzx r2d, word [r0]
movzx r3d, word [r0+r1*1] add r3d, r2d
add r5d, r2d movzx r2d, word [r0+r1]
add r6d, r3d add r4d, r2d
%endrep %endrep
lea r2d, [r5+r6+8] lea r3d, [r3+r4+8]
shr r2d, 4 shr r3d, 4
movd m0, r2d movd m0, r3d
SPLATW m0, m0 SPLATW m0, m0
mov r3d, 8 mov r3d, 8
.loop: .loop:
MOV16 r4+r1*0, m0, m0, m0, m0 MOV16 r5+r1*0, m0, m0, m0, m0
MOV16 r4+r1*1, m0, m0, m0, m0 MOV16 r5+r1*1, m0, m0, m0, m0
lea r4, [r4+r1*2] lea r5, [r5+r1*2]
dec r3d dec r3d
jg .loop jg .loop
REP_RET REP_RET
......
...@@ -45,7 +45,6 @@ void ff_pred8x8_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, int stride); ...@@ -45,7 +45,6 @@ void ff_pred8x8_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, int stride);
PRED8x8(dc, 10, mmxext) PRED8x8(dc, 10, mmxext)
PRED8x8(dc, 10, sse2) PRED8x8(dc, 10, sse2)
PRED8x8(top_dc, 10, mmxext)
PRED8x8(top_dc, 10, sse2) PRED8x8(top_dc, 10, sse2)
PRED8x8(plane, 10, sse2) PRED8x8(plane, 10, sse2)
PRED8x8(vertical, 10, sse2) PRED8x8(vertical, 10, sse2)
...@@ -55,23 +54,28 @@ PRED8x8(horizontal, 10, sse2) ...@@ -55,23 +54,28 @@ PRED8x8(horizontal, 10, sse2)
void ff_pred8x8l_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, int has_topleft, int has_topright, int stride); void ff_pred8x8l_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, int has_topleft, int has_topright, int stride);
PRED8x8L(dc, 10, sse2) PRED8x8L(dc, 10, sse2)
PRED8x8L(dc, 10, ssse3) PRED8x8L(dc, 10, avx)
PRED8x8L(128_dc, 10, mmxext) PRED8x8L(128_dc, 10, mmxext)
PRED8x8L(128_dc, 10, sse2) PRED8x8L(128_dc, 10, sse2)
PRED8x8L(top_dc, 10, sse2) PRED8x8L(top_dc, 10, sse2)
PRED8x8L(top_dc, 10, ssse3) PRED8x8L(top_dc, 10, avx)
PRED8x8L(vertical, 10, sse2) PRED8x8L(vertical, 10, sse2)
PRED8x8L(vertical, 10, ssse3) PRED8x8L(vertical, 10, avx)
PRED8x8L(horizontal, 10, sse2) PRED8x8L(horizontal, 10, sse2)
PRED8x8L(horizontal, 10, ssse3) PRED8x8L(horizontal, 10, ssse3)
PRED8x8L(horizontal, 10, avx)
PRED8x8L(down_left, 10, sse2) PRED8x8L(down_left, 10, sse2)
PRED8x8L(down_left, 10, ssse3) PRED8x8L(down_left, 10, ssse3)
PRED8x8L(down_left, 10, avx)
PRED8x8L(down_right, 10, sse2) PRED8x8L(down_right, 10, sse2)
PRED8x8L(down_right, 10, ssse3) PRED8x8L(down_right, 10, ssse3)
PRED8x8L(down_right, 10, avx)
PRED8x8L(vertical_right, 10, sse2) PRED8x8L(vertical_right, 10, sse2)
PRED8x8L(vertical_right, 10, ssse3) PRED8x8L(vertical_right, 10, ssse3)
PRED8x8L(vertical_right, 10, avx)
PRED8x8L(horizontal_up, 10, sse2) PRED8x8L(horizontal_up, 10, sse2)
PRED8x8L(horizontal_up, 10, ssse3) PRED8x8L(horizontal_up, 10, ssse3)
PRED8x8L(horizontal_up, 10, avx)
#define PRED16x16(TYPE, DEPTH, OPT)\ #define PRED16x16(TYPE, DEPTH, OPT)\
void ff_pred16x16_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, int stride); void ff_pred16x16_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, int stride);
...@@ -298,7 +302,6 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth ...@@ -298,7 +302,6 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth
h->pred4x4[HOR_UP_PRED ] = ff_pred4x4_horizontal_up_10_mmxext; h->pred4x4[HOR_UP_PRED ] = ff_pred4x4_horizontal_up_10_mmxext;
h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_mmxext; h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_mmxext;
h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_10_mmxext;
h->pred8x8l[DC_128_PRED ] = ff_pred8x8l_128_dc_10_mmxext; h->pred8x8l[DC_128_PRED ] = ff_pred8x8l_128_dc_10_mmxext;
...@@ -344,18 +347,28 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth ...@@ -344,18 +347,28 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth
h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_ssse3; h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_ssse3;
h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_ssse3; h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_ssse3;
h->pred8x8l[VERT_PRED ] = ff_pred8x8l_vertical_10_ssse3;
h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_ssse3; h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_ssse3;
h->pred8x8l[DC_PRED ] = ff_pred8x8l_dc_10_ssse3;
h->pred8x8l[TOP_DC_PRED ] = ff_pred8x8l_top_dc_10_ssse3;
h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_ssse3; h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_ssse3;
h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_ssse3;
h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_ssse3;
h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_ssse3;
} }
#if HAVE_AVX #if HAVE_AVX
if (mm_flags & AV_CPU_FLAG_AVX) { if (mm_flags & AV_CPU_FLAG_AVX) {
h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_avx; h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_avx;
h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_avx; h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_avx;
h->pred4x4[VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_10_avx;
h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_avx; h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_avx;
h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_avx; h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_avx;
h->pred8x8l[VERT_PRED ] = ff_pred8x8l_vertical_10_avx;
h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_avx;
h->pred8x8l[DC_PRED ] = ff_pred8x8l_dc_10_avx;
h->pred8x8l[TOP_DC_PRED ] = ff_pred8x8l_top_dc_10_avx;
h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_avx;
h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_avx;
h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_avx;
h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_avx;
} }
#endif /* HAVE_AVX */ #endif /* HAVE_AVX */
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment