Commit 8ad77b65 authored by Jason Garrett-Glaser

Update x86 H.264 deblock asm

Includes AVX versions from x264.
parent b6675279
;*****************************************************************************
-;* MMX/SSE2-optimized H.264 deblocking code
+;* MMX/SSE2/AVX-optimized H.264 deblocking code
;*****************************************************************************
-;* Copyright (C) 2005-2008 x264 project
+;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*          Oskar Arvidsson <oskar@irock.se>
;*
;* This file is part of Libav.
;*
@@ -26,96 +27,135 @@
%include "x86inc.asm"
%include "x86util.asm"

-SECTION_RODATA
+SECTION .text

cextern pb_0
cextern pb_1
cextern pb_3
cextern pb_A1

-SECTION .text
; expands to [base],...,[base+7*stride]
%define PASS8ROWS(base, base3, stride, stride3) \
    [base], [base+stride], [base+stride*2], [base3], \
    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
+%define PASS8ROWS(base, base3, stride, stride3, offset) \
+    PASS8ROWS(base+offset, base3+offset, stride, stride3)
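PASS8ROWS just names the eight row addresses base+i*stride for i=0..7 (base3 and stride3 are the precomputed base+3*stride and 3*stride that keep x86 addressing within scale limits); the new 5-argument form shifts the whole window by a byte offset. A rough C model, illustrative only:

    #include <stdint.h>
    #include <stddef.h>

    /* Illustrative model of PASS8ROWS: collect the 8 row addresses
     * [base + i*stride + offset]. */
    static void pass8rows_ref(const uint8_t *rows[8], const uint8_t *base,
                              ptrdiff_t stride, ptrdiff_t offset)
    {
        for (int i = 0; i < 8; i++)
            rows[i] = base + i * stride + offset;
    }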
-; in: 8 rows of 4 bytes in %1..%8
+; in: 8 rows of 4 bytes in %4..%11
; out: 4 rows of 8 bytes in m0..m3
-%macro TRANSPOSE4x8_LOAD 8
-    movd       m0, %1
-    movd       m2, %2
-    movd       m1, %3
-    movd       m3, %4
-    punpcklbw  m0, m2
-    punpcklbw  m1, m3
-    movq       m2, m0
-    punpcklwd  m0, m1
-    punpckhwd  m2, m1
-    movd       m4, %5
-    movd       m6, %6
-    movd       m5, %7
-    movd       m7, %8
-    punpcklbw  m4, m6
-    punpcklbw  m5, m7
-    movq       m6, m4
-    punpcklwd  m4, m5
-    punpckhwd  m6, m5
-    movq       m1, m0
-    movq       m3, m2
-    punpckldq  m0, m4
-    punpckhdq  m1, m4
-    punpckldq  m2, m6
-    punpckhdq  m3, m6
+%macro TRANSPOSE4x8_LOAD 11
+    movh       m0, %4
+    movh       m2, %5
+    movh       m1, %6
+    movh       m3, %7
+    punpckl%1  m0, m2
+    punpckl%1  m1, m3
+    mova       m2, m0
+    punpckl%2  m0, m1
+    punpckh%2  m2, m1
+    movh       m4, %8
+    movh       m6, %9
+    movh       m5, %10
+    movh       m7, %11
+    punpckl%1  m4, m6
+    punpckl%1  m5, m7
+    mova       m6, m4
+    punpckl%2  m4, m5
+    punpckh%2  m6, m5
+    punpckh%3  m1, m0, m4
+    punpckh%3  m3, m2, m6
+    punpckl%3  m0, m4
+    punpckl%3  m2, m6
%endmacro
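A scalar C sketch of what the load-transpose computes: eight rows of narrow pixels come in, four registers of transposed rows come out (the generalized macro parameterizes the punpck granularities so the same body serves byte and word pixels). Illustrative only:

    #include <stdint.h>

    /* Scalar model of TRANSPOSE4x8_LOAD for the byte case (bw, wd, dq):
     * read 8 rows of 4 bytes and produce 4 rows of 8 bytes, i.e. a
     * transpose of the 8x4 tile. The asm does this with three rounds of
     * punpckl/punpckh interleaves instead of scalar loops. */
    static void transpose4x8_load_ref(uint8_t out[4][8],
                                      const uint8_t *const rows[8])
    {
        for (int r = 0; r < 8; r++)
            for (int c = 0; c < 4; c++)
                out[c][r] = rows[r][c];
    }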
; in: 4 rows of 8 bytes in m0..m3
; out: 8 rows of 4 bytes in %1..%8
-%macro TRANSPOSE8x4_STORE 8
-    movq       m4, m0
-    movq       m5, m1
-    movq       m6, m2
-    punpckhdq  m4, m4
-    punpckhdq  m5, m5
-    punpckhdq  m6, m6
+%macro TRANSPOSE8x4B_STORE 8
+    punpckhdq  m4, m0, m0
+    punpckhdq  m5, m1, m1
+    punpckhdq  m6, m2, m2
    punpcklbw  m0, m1
    punpcklbw  m2, m3
-    movq       m1, m0
-    punpcklwd  m0, m2
-    punpckhwd  m1, m2
-    movd       %1, m0
-    punpckhdq  m0, m0
-    movd       %2, m0
-    movd       %3, m1
+    punpcklwd  m1, m0, m2
+    punpckhwd  m0, m2
+    movh       %1, m1
    punpckhdq  m1, m1
-    movd       %4, m1
+    movh       %2, m1
+    movh       %3, m0
+    punpckhdq  m0, m0
+    movh       %4, m0
    punpckhdq  m3, m3
    punpcklbw  m4, m5
    punpcklbw  m6, m3
-    movq       m5, m4
-    punpcklwd  m4, m6
-    punpckhwd  m5, m6
-    movd       %5, m4
-    punpckhdq  m4, m4
-    movd       %6, m4
-    movd       %7, m5
+    punpcklwd  m5, m4, m6
+    punpckhwd  m4, m6
+    movh       %5, m5
    punpckhdq  m5, m5
-    movd       %8, m5
+    movh       %6, m5
+    movh       %7, m4
+    punpckhdq  m4, m4
+    movh       %8, m4
%endmacro
+%macro TRANSPOSE4x8B_LOAD 8
+    TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8
+%endmacro
+
+%macro TRANSPOSE4x8W_LOAD 8
+%if mmsize==16
+    TRANSPOSE4x8_LOAD wd, dq, qdq, %1, %2, %3, %4, %5, %6, %7, %8
+%else
+    SWAP  1, 4, 2, 3
+    mova  m0, [t5]
+    mova  m1, [t5+r1]
+    mova  m2, [t5+r1*2]
+    mova  m3, [t5+t6]
+    TRANSPOSE4x4W 0, 1, 2, 3, 4
+%endif
+%endmacro
+%macro TRANSPOSE8x2W_STORE 8
+    punpckhwd  m0, m1, m2
+    punpcklwd  m1, m2
+%if mmsize==8
+    movd       %3, m0
+    movd       %1, m1
+    psrlq      m1, 32
+    psrlq      m0, 32
+    movd       %2, m1
+    movd       %4, m0
+%else
+    movd       %5, m0
+    movd       %1, m1
+    psrldq     m1, 4
+    psrldq     m0, 4
+    movd       %2, m1
+    movd       %6, m0
+    psrldq     m1, 4
+    psrldq     m0, 4
+    movd       %3, m1
+    movd       %7, m0
+    psrldq     m1, 4
+    psrldq     m0, 4
+    movd       %4, m1
+    movd       %8, m0
+%endif
+%endmacro
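What the new word-store macro computes, as a scalar sketch; the destination layout is whatever the eight %1..%8 operands address, and the version below assumes two words per destination row:

    #include <stdint.h>

    /* Scalar model of TRANSPOSE8x2W_STORE: two registers holding 8 words
     * each (two transposed rows) go back out as 8 two-word columns, one
     * per destination operand. dst[i] stands in for %1..%8. */
    static void transpose8x2w_store_ref(uint16_t *dst[8],
                                        const uint16_t row0[8],
                                        const uint16_t row1[8])
    {
        for (int i = 0; i < 8; i++) {
            dst[i][0] = row0[i];
            dst[i][1] = row1[i];
        }
    }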
%macro SBUTTERFLY3 4
-    movq       %4, %2
+    punpckh%1  %4, %2, %3
    punpckl%1  %2, %3
-    punpckh%1  %4, %3
%endmacro
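All of these transpose helpers bottom out in punpck interleaves; a scalar model of the byte form, for orientation (SBUTTERFLY3 is the variant that keeps its first source intact):

    #include <stdint.h>

    /* Scalar model of punpcklbw (low interleave) on 8-byte operands;
     * the punpckh form does the same with the high halves. */
    static void punpcklbw_ref(uint8_t dst[8],
                              const uint8_t a[8], const uint8_t b[8])
    {
        for (int i = 0; i < 4; i++) {
            dst[2*i]     = a[i];
            dst[2*i + 1] = b[i];
        }
    }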
; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
%macro TRANSPOSE6x8_MEM 9
+    RESET_MM_PERMUTATION
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
@@ -123,30 +163,32 @@ SECTION .text
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
-    SBUTTERFLY3 bw, m0, m1, m7
-    SBUTTERFLY3 bw, m2, m3, m1
-    SBUTTERFLY3 bw, m4, m5, m3
-    movq  [%9+0x10], m1
-    SBUTTERFLY3 bw, m6, %8, m5
-    SBUTTERFLY3 wd, m0, m2, m1
-    SBUTTERFLY3 wd, m4, m6, m2
+    SBUTTERFLY bw, 0, 1, 7
+    SBUTTERFLY bw, 2, 3, 7
+    SBUTTERFLY bw, 4, 5, 7
+    movq  [%9+0x10], m3
+    SBUTTERFLY3 bw, m6, %8, m7
+    SBUTTERFLY wd, 0, 2, 3
+    SBUTTERFLY wd, 4, 6, 3
    punpckhdq m0, m4
    movq  [%9+0x00], m0
-    SBUTTERFLY3 wd, m7, [%9+0x10], m6
-    SBUTTERFLY3 wd, m3, m5, m4
-    SBUTTERFLY3 dq, m7, m3, m0
-    SBUTTERFLY3 dq, m1, m2, m5
-    punpckldq m6, m4
-    movq  [%9+0x10], m1
-    movq  [%9+0x20], m5
-    movq  [%9+0x30], m7
-    movq  [%9+0x40], m0
-    movq  [%9+0x50], m6
+    SBUTTERFLY3 wd, m1, [%9+0x10], m3
+    SBUTTERFLY wd, 5, 7, 0
+    SBUTTERFLY dq, 1, 5, 0
+    SBUTTERFLY dq, 2, 6, 0
+    punpckldq m3, m7
+    movq  [%9+0x10], m2
+    movq  [%9+0x20], m6
+    movq  [%9+0x30], m1
+    movq  [%9+0x40], m5
+    movq  [%9+0x50], m3
+    RESET_MM_PERMUTATION
%endmacro
; in: 8 rows of 8 in %1..%8
; out: 8 rows of 8 in %9..%16
%macro TRANSPOSE8x8_MEM 16
+    RESET_MM_PERMUTATION
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
@@ -154,38 +196,44 @@ SECTION .text
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
-    SBUTTERFLY3 bw, m0, m1, m7
-    SBUTTERFLY3 bw, m2, m3, m1
-    SBUTTERFLY3 bw, m4, m5, m3
-    SBUTTERFLY3 bw, m6, %8, m5
-    movq  %9,  m3
-    SBUTTERFLY3 wd, m0, m2, m3
-    SBUTTERFLY3 wd, m4, m6, m2
-    SBUTTERFLY3 wd, m7, m1, m6
-    movq  %11, m2
-    movq  m2, %9
-    SBUTTERFLY3 wd, m2, m5, m1
-    SBUTTERFLY3 dq, m0, m4, m5
-    SBUTTERFLY3 dq, m7, m2, m4
+    SBUTTERFLY bw, 0, 1, 7
+    SBUTTERFLY bw, 2, 3, 7
+    SBUTTERFLY bw, 4, 5, 7
+    SBUTTERFLY3 bw, m6, %8, m7
+    movq  %9,  m5
+    SBUTTERFLY wd, 0, 2, 5
+    SBUTTERFLY wd, 4, 6, 5
+    SBUTTERFLY wd, 1, 3, 5
+    movq  %11, m6
+    movq  m6, %9
+    SBUTTERFLY wd, 6, 7, 5
+    SBUTTERFLY dq, 0, 4, 5
+    SBUTTERFLY dq, 1, 6, 5
    movq  %9,  m0
-    movq  %10, m5
-    movq  %13, m7
-    movq  %14, m4
-    SBUTTERFLY3 dq, m3, %11, m0
-    SBUTTERFLY3 dq, m6, m1, m5
-    movq  %11, m3
+    movq  %10, m4
+    movq  %13, m1
+    movq  %14, m6
+    SBUTTERFLY3 dq, m2, %11, m0
+    SBUTTERFLY dq, 3, 7, 4
+    movq  %11, m2
    movq  %12, m0
-    movq  %15, m6
-    movq  %16, m5
+    movq  %15, m3
+    movq  %16, m7
+    RESET_MM_PERMUTATION
%endmacro
; out: %4 = |%1-%2|>%3
; clobbers: %5
%macro DIFF_GT 5
+%if avx_enabled == 0
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1
    psubusb %4, %2
+%else
+    psubusb %5, %2, %1
+    psubusb %4, %1, %2
+%endif
    por     %4, %5
    psubusb %4, %3
%endmacro
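The threshold test works entirely in unsigned saturating arithmetic; a scalar sketch of one byte lane (DIFF_GT2 below follows the same pattern with an extra pcmpeqb to turn the result into a proper 0/255 mask):

    #include <stdint.h>

    /* Scalar model of DIFF_GT on one byte lane: |a-b| via two saturating
     * subtractions OR'd together, then a saturating subtract of the
     * threshold. The result is nonzero exactly when |a-b| > thresh. */
    static uint8_t diff_gt_ref(uint8_t a, uint8_t b, uint8_t thresh)
    {
        uint8_t d0 = a > b ? a - b : 0;         /* psubusb %4, %2    */
        uint8_t d1 = b > a ? b - a : 0;         /* psubusb %5, %1    */
        uint8_t ad = d0 | d1;                   /* por: |a-b|        */
        return ad > thresh ? ad - thresh : 0;   /* psubusb by thresh */
    }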
@@ -193,32 +241,28 @@ SECTION .text
; out: %4 = |%1-%2|>%3
; clobbers: %5
%macro DIFF_GT2 5
+%ifdef ARCH_X86_64
+    psubusb %5, %2, %1
+    psubusb %4, %1, %2
+%else
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1
    psubusb %4, %2
+%endif
    psubusb %5, %3
    psubusb %4, %3
    pcmpeqb %4, %5
%endmacro
-%macro SPLATW 1
-%ifidn m0, xmm0
-    pshuflw %1, %1, 0
-    punpcklqdq %1, %1
-%else
-    pshufw  %1, %1, 0
-%endif
-%endmacro
; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
; out: m5=beta-1, m7=mask, %3=alpha-1
; clobbers: m4,m6
%macro LOAD_MASK 2-3
    movd     m4, %1
    movd     m5, %2
-    SPLATW   m4
-    SPLATW   m5
+    SPLATW   m4, m4
+    SPLATW   m5, m5
    packuswb m4, m4  ; 16x alpha-1
    packuswb m5, m5  ; 16x beta-1
%if %0>2
@@ -237,8 +281,7 @@ SECTION .text
; out: m1=p0' m2=q0'
; clobbers: m0,3-6
%macro DEBLOCK_P0_Q0 0
-    mova    m5, m1
-    pxor    m5, m2      ; p0^q0
+    pxor    m5, m1, m2  ; p0^q0
    pand    m5, [pb_1]  ; (p0^q0)&1
    pcmpeqb m4, m4
    pxor    m3, m4
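For orientation, a scalar reference for the p0/q0 update this macro performs branchlessly; this is the standard H.264 normal-filter step (not lifted from the asm itself, so treat it as a sketch):

    #include <stdint.h>

    /* Scalar reference for DEBLOCK_P0_Q0:
     *   delta = clip(((q0-p0)*4 + (p1-q1) + 4) >> 3, -tc, tc)
     *   p0'   = clip_u8(p0 + delta), q0' = clip_u8(q0 - delta)
     * The asm reaches the same values with pavgb/psubusb tricks. */
    static void deblock_p0_q0_ref(uint8_t *p0, uint8_t *q0,
                                  uint8_t p1, uint8_t q1, int tc)
    {
        int delta = (((*q0 - *p0) << 2) + (p1 - q1) + 4) >> 3;
        if (delta < -tc) delta = -tc;
        if (delta >  tc) delta =  tc;
        int np = *p0 + delta, nq = *q0 - delta;
        *p0 = np < 0 ? 0 : np > 255 ? 255 : np;
        *q0 = nq < 0 ? 0 : nq > 255 ? 255 : nq;
    }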
@@ -264,14 +307,12 @@ SECTION .text
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
; clobbers: q2, tmp, tc0
%macro LUMA_Q1 6
-    mova    %6, m1
-    pavgb   %6, m2
+    pavgb   %6, m1, m2
    pavgb   %2, %6      ; avg(p2,avg(p0,q0))
    pxor    %6, %3
    pand    %6, [pb_1]  ; (p2^avg(p0,q0))&1
    psubusb %2, %6      ; (p2+((p0+q0+1)>>1))>>1
-    mova    %6, %1
-    psubusb %6, %5
+    psubusb %6, %1, %5
    paddusb %5, %1
    pmaxub  %2, %6
    pminub  %2, %5
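A scalar reference matching the macro's own in/out comment; pavgb supplies the rounded averages and the pxor/pand [pb_1] step removes the rounding bias before the pmaxub/pminub clamp:

    #include <stdint.h>

    /* Scalar model of LUMA_Q1:
     *   q1' = clip((q2 + ((p0+q0+1)>>1)) >> 1, q1-tc0, q1+tc0) */
    static uint8_t luma_q1_ref(uint8_t p0, uint8_t q0,
                               uint8_t q1, uint8_t q2, int tc0)
    {
        int v  = (q2 + ((p0 + q0 + 1) >> 1)) >> 1;
        int lo = q1 - tc0, hi = q1 + tc0;
        return v < lo ? lo : v > hi ? hi : v;
    }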
@@ -280,10 +321,10 @@ SECTION .text
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
-; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
-INIT_XMM
-cglobal x264_deblock_v_luma_sse2, 5,5,10
+%macro DEBLOCK_LUMA 1
+cglobal deblock_v_luma_%1, 5,5,10
    movd    m8, [r4]  ; tc0
    lea     r4, [r1*3]
    dec     r2d       ; alpha-1
@@ -307,8 +348,7 @@ cglobal x264_deblock_v_luma_sse2, 5,5,10
    movdqa  m3, [r4]  ; p2
    DIFF_GT2 m1, m3, m5, m6, m7  ; |p2-p0| > beta-1
    pand    m6, m9
-    mova    m7, m8
-    psubb   m7, m6
+    psubb   m7, m8, m6
    pand    m6, m8
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
@@ -326,10 +366,10 @@ cglobal x264_deblock_v_luma_sse2, 5,5,10
    RET

;-----------------------------------------------------------------------------
-; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX
-cglobal x264_deblock_h_luma_sse2, 5,7
+cglobal deblock_h_luma_%1, 5,7
    movsxd r10, r1d
    lea    r11, [r10+r10*2]
    lea    r6,  [r0-4]
@@ -350,13 +390,13 @@ cglobal x264_deblock_h_luma_sse2, 5,7
    ; vertical filter
    ; alpha, beta, tc0 are still in r2d, r3d, r4
-    ; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
+    ; don't backup r6, r5, r10, r11 because deblock_v_luma_sse2 doesn't use them
    lea    r0, [pix_tmp+0x30]
    mov    r1d, 0x10
%ifdef WIN64
    mov    [rsp+0x20], r4
%endif
-    call   x264_deblock_v_luma_sse2
+    call   deblock_v_luma_%1

    ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
    add    r6, 2
@@ -365,7 +405,7 @@ cglobal x264_deblock_h_luma_sse2, 5,7
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
-    TRANSPOSE8x4_STORE  PASS8ROWS(r6, r5, r10, r11)
+    TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11)

    shl    r10, 3
    sub    r6,  r10
@@ -375,7 +415,7 @@ cglobal x264_deblock_h_luma_sse2, 5,7
    movq   m1, [pix_tmp+0x20]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
-    TRANSPOSE8x4_STORE  PASS8ROWS(r6, r5, r10, r11)
+    TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11)

%ifdef WIN64
    add    rsp, 0x98
@@ -383,14 +423,20 @@ cglobal x264_deblock_h_luma_sse2, 5,7
    add    rsp, 0x68
%endif
    RET
+%endmacro
+
+INIT_XMM
+DEBLOCK_LUMA sse2
+INIT_AVX
+DEBLOCK_LUMA avx
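The INIT_XMM/INIT_AVX pairing is x86inc's instruction-set templating: the same macro body is assembled twice, once with SSE2 encodings and once with VEX encodings, yielding two symbols. The C-preprocessor analogue, declarations only and purely illustrative:

    #include <stdint.h>

    /* Hypothetical sketch of the "one body, many ISAs" idea in C: the
     * suffix is the only thing that changes between instantiations. */
    #define DECLARE_DEBLOCK_LUMA(opt)                                   \
        void ff_deblock_v_luma_##opt(uint8_t *pix, int stride,          \
                                     int alpha, int beta, int8_t *tc0); \
        void ff_deblock_h_luma_##opt(uint8_t *pix, int stride,          \
                                     int alpha, int beta, int8_t *tc0);

    DECLARE_DEBLOCK_LUMA(sse2)
    DECLARE_DEBLOCK_LUMA(avx)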
%else

%macro DEBLOCK_LUMA 3
;-----------------------------------------------------------------------------
-; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
-cglobal x264_deblock_%2_luma_%1, 5,5
+cglobal deblock_%2_luma_%1, 5,5
    lea    r4, [r1*3]
    dec    r2  ; alpha-1
    neg    r4
@@ -419,8 +465,7 @@ cglobal x264_deblock_%2_luma_%1, 5,5
    DIFF_GT2 m1, m3, m5, m6, m7  ; |p2-p0| > beta-1
    pand   m6, m4
    pand   m4, [esp+%3]  ; tc
-    mova   m7, m4
-    psubb  m7, m6
+    psubb  m7, m4, m6
    pand   m6, m4
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
@@ -441,10 +486,10 @@ cglobal x264_deblock_%2_luma_%1, 5,5
    RET

;-----------------------------------------------------------------------------
-; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX
-cglobal x264_deblock_h_luma_%1, 0,5
+cglobal deblock_h_luma_%1, 0,5
    mov    r0, r0mp
    mov    r3, r1m
    lea    r4, [r3*3]
@@ -467,11 +512,11 @@ cglobal x264_deblock_h_luma_%1, 0,5
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   dword r0
-    call   x264_deblock_%2_luma_%1
+    call   deblock_%2_luma_%1
%ifidn %2, v8
    add    dword [esp   ], 8  ; pix_tmp+0x38
    add    dword [esp+16], 2  ; tc0+2
-    call   x264_deblock_%2_luma_%1
+    call   deblock_%2_luma_%1
%endif
    ADD    esp, 20
@@ -484,7 +529,7 @@ cglobal x264_deblock_h_luma_%1, 0,5
    movq   m1, [pix_tmp+0x20]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
-    TRANSPOSE8x4_STORE  PASS8ROWS(r0, r1, r3, r4)
+    TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)

    lea    r0, [r0+r3*8]
    lea    r1, [r1+r3*8]
@@ -492,7 +537,7 @@ cglobal x264_deblock_h_luma_%1, 0,5
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
-    TRANSPOSE8x4_STORE  PASS8ROWS(r0, r1, r3, r4)
+    TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)

    ADD    esp, pad
    RET
@@ -502,22 +547,34 @@ INIT_MMX
DEBLOCK_LUMA mmxext, v8, 8
INIT_XMM
DEBLOCK_LUMA sse2, v, 16
+INIT_AVX
+DEBLOCK_LUMA avx, v, 16

%endif ; ARCH
%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
+%ifdef ARCH_X86_64
+    pavgb  t0, p2, p1
+    pavgb  t1, p0, q0
+%else
    mova   t0, p2
    mova   t1, p0
    pavgb  t0, p1
    pavgb  t1, q0
+%endif
    pavgb  t0, t1  ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
    mova   t5, t1
+%ifdef ARCH_X86_64
+    paddb  t2, p2, p1
+    paddb  t3, p0, q0
+%else
    mova   t2, p2
    mova   t3, p0
    paddb  t2, p1
    paddb  t3, q0
+%endif
    paddb  t2, t3
    mova   t3, t2
    mova   t4, t2
@@ -527,10 +584,15 @@ DEBLOCK_LUMA sse2, v, 16
    pand   t2, mpb_1
    psubb  t0, t2  ; p1' = (p2+p1+p0+q0+2)/4;
+%ifdef ARCH_X86_64
+    pavgb  t1, p2, q1
+    psubb  t2, p2, q1
+%else
    mova   t1, p2
    mova   t2, p2
    pavgb  t1, q1
    psubb  t2, q1
+%endif
    paddb  t3, t3
    psubb  t3, t2  ; p2+2*p1+2*p0+2*q0+q1
    pand   t2, mpb_1
@@ -543,10 +605,8 @@ DEBLOCK_LUMA sse2, v, 16
    pand   t3, mpb_1
    psubb  t1, t3  ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8
-    mova   t3, p0
-    mova   t2, p0
-    pxor   t3, q1
-    pavgb  t2, q1
+    pxor   t3, p0, q1
+    pavgb  t2, p0, q1
    pand   t3, mpb_1
    psubb  t2, t3
    pavgb  t2, p1  ; p0'b = (2*p1+p0+q0+2)/4
@@ -560,9 +620,8 @@ DEBLOCK_LUMA sse2, v, 16
    mova   %1, t1  ; store p0
    mova   t1, %4  ; p3
-    mova   t2, t1
+    paddb  t2, t1, p2
    pavgb  t1, p2
-    paddb  t2, p2
    pavgb  t1, t0  ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
    paddb  t2, t2
    paddb  t2, t4  ; 2*p3+3*p2+p1+p0+q0
@@ -624,9 +683,9 @@ DEBLOCK_LUMA sse2, v, 16
%endif

;-----------------------------------------------------------------------------
-; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
+; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
+cglobal deblock_%2_luma_intra_%1, 4,6,16
%ifndef ARCH_X86_64
    sub    esp, 0x60
%endif
@@ -686,9 +745,9 @@ cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
INIT_MMX
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
-; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
+; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal x264_deblock_h_luma_intra_%1, 4,7
+cglobal deblock_h_luma_intra_%1, 4,7
    movsxd r10, r1d
    lea    r11, [r10*3]
    lea    r6,  [r0-4]
@@ -704,7 +763,7 @@ cglobal x264_deblock_h_luma_intra_%1, 4,7
    lea    r0, [pix_tmp+0x40]
    mov    r1, 0x10
-    call   x264_deblock_v_luma_intra_%1
+    call   deblock_v_luma_intra_%1

    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    lea    r5, [r6+r11]
@@ -717,7 +776,7 @@ cglobal x264_deblock_h_luma_intra_%1, 4,7
    add    rsp, 0x88
    RET
%else
-cglobal x264_deblock_h_luma_intra_%1, 2,4
+cglobal deblock_h_luma_intra_%1, 2,4
    lea    r3, [r1*3]
    sub    r0, 4
    lea    r2, [r0+r3]
@@ -736,10 +795,10 @@ cglobal x264_deblock_h_luma_intra_%1, 2,4
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   r0
-    call   x264_deblock_%2_luma_intra_%1
+    call   deblock_%2_luma_intra_%1
%ifidn %2, v8
    add    dword [rsp], 8  ; pix_tmp+8
-    call   x264_deblock_%2_luma_intra_%1
+    call   deblock_%2_luma_intra_%1
%endif
    ADD    esp, 16
@@ -760,13 +819,13 @@ cglobal x264_deblock_h_luma_intra_%1, 2,4
INIT_XMM
DEBLOCK_LUMA_INTRA sse2, v
+INIT_AVX
+DEBLOCK_LUMA_INTRA avx , v
%ifndef ARCH_X86_64
INIT_MMX
DEBLOCK_LUMA_INTRA mmxext, v8
%endif

INIT_MMX

%macro CHROMA_V_START 0
@@ -790,23 +849,23 @@ INIT_MMX
%define t6 r6

;-----------------------------------------------------------------------------
-; void x264_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void ff_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
-cglobal x264_deblock_v_chroma_mmxext, 5,6
+cglobal deblock_v_chroma_mmxext, 5,6
    CHROMA_V_START
    movq  m0, [t5]
    movq  m1, [t5+r1]
    movq  m2, [r0]
    movq  m3, [r0+r1]
-    call  x264_chroma_inter_body_mmxext
+    call  ff_chroma_inter_body_mmxext
    movq  [t5+r1], m1
    movq  [r0], m2
    RET
;-----------------------------------------------------------------------------
-; void x264_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void ff_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
-cglobal x264_deblock_h_chroma_mmxext, 5,7
+cglobal deblock_h_chroma_mmxext, 5,7
%ifdef ARCH_X86_64
    %define buf0 [rsp-24]
    %define buf1 [rsp-16]
@@ -815,17 +874,17 @@ cglobal x264_deblock_h_chroma_mmxext, 5,7
    %define buf1 r2m
%endif
    CHROMA_H_START
-    TRANSPOSE4x8_LOAD   PASS8ROWS(t5, r0, r1, t6)
+    TRANSPOSE4x8_LOAD   bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
    movq  buf0, m0
    movq  buf1, m3
-    call  x264_chroma_inter_body_mmxext
+    call  ff_chroma_inter_body_mmxext
    movq  m0, buf0
    movq  m3, buf1
-    TRANSPOSE8x4_STORE  PASS8ROWS(t5, r0, r1, t6)
+    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
    RET

ALIGN 16
-x264_chroma_inter_body_mmxext:
+ff_chroma_inter_body_mmxext:
    LOAD_MASK r2d, r3d
    movd  m6, [r4]  ; tc0
    punpcklbw m6, m6
@@ -850,31 +909,31 @@ x264_chroma_inter_body_mmxext:
%define t6 r5

;-----------------------------------------------------------------------------
-; void x264_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
+; void ff_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal x264_deblock_v_chroma_intra_mmxext, 4,5
+cglobal deblock_v_chroma_intra_mmxext, 4,5
    CHROMA_V_START
    movq  m0, [t5]
    movq  m1, [t5+r1]
    movq  m2, [r0]
    movq  m3, [r0+r1]
-    call  x264_chroma_intra_body_mmxext
+    call  ff_chroma_intra_body_mmxext
    movq  [t5+r1], m1
    movq  [r0], m2
    RET

;-----------------------------------------------------------------------------
-; void x264_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
+; void ff_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal x264_deblock_h_chroma_intra_mmxext, 4,6
+cglobal deblock_h_chroma_intra_mmxext, 4,6
    CHROMA_H_START
-    TRANSPOSE4x8_LOAD   PASS8ROWS(t5, r0, r1, t6)
-    call  x264_chroma_intra_body_mmxext
-    TRANSPOSE8x4_STORE  PASS8ROWS(t5, r0, r1, t6)
+    TRANSPOSE4x8_LOAD   bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
+    call  ff_chroma_intra_body_mmxext
+    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
    RET

ALIGN 16
-x264_chroma_intra_body_mmxext:
+ff_chroma_intra_body_mmxext:
    LOAD_MASK r2d, r3d
    movq  m5, m1
    movq  m6, m2
......
@@ -219,10 +219,10 @@ static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40]
}

#define LF_FUNC(DIR, TYPE, OPT) \
-void ff_x264_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
+void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
                                                  int alpha, int beta, int8_t *tc0);
#define LF_IFUNC(DIR, TYPE, OPT) \
-void ff_x264_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
+void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
                                                  int alpha, int beta);

LF_FUNC (h, chroma, mmxext)
@@ -234,18 +234,18 @@ LF_FUNC (h, luma, mmxext)
LF_IFUNC(h, luma_intra, mmxext)
#if HAVE_YASM && ARCH_X86_32
LF_FUNC (v8, luma, mmxext)
-static void ff_x264_deblock_v_luma_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+static void ff_deblock_v_luma_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    if((tc0[0] & tc0[1]) >= 0)
-        ff_x264_deblock_v8_luma_mmxext(pix+0, stride, alpha, beta, tc0);
+        ff_deblock_v8_luma_mmxext(pix+0, stride, alpha, beta, tc0);
    if((tc0[2] & tc0[3]) >= 0)
-        ff_x264_deblock_v8_luma_mmxext(pix+8, stride, alpha, beta, tc0+2);
+        ff_deblock_v8_luma_mmxext(pix+8, stride, alpha, beta, tc0+2);
}
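The (tc0[0] & tc0[1]) >= 0 guard leans on tc0 entries being -1 for edges that must not be filtered: the AND of the two promoted bytes is negative only when both are negative, so each 8-pixel half is skipped only when both of its tc0 values are -1. An equivalent, more explicit form (helper name is hypothetical):

    #include <stdint.h>

    /* Equivalent form of the guard above: filter the half unless both
     * tc0 entries are -1 (the "skip this edge" marker). */
    static int half_needs_filtering(const int8_t *tc0)
    {
        return tc0[0] >= 0 || tc0[1] >= 0;
    }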
LF_IFUNC(v8, luma_intra, mmxext)
-static void ff_x264_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta)
+static void ff_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta)
{
-    ff_x264_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta);
-    ff_x264_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta);
+    ff_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta);
+    ff_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta);
}
#endif
@@ -253,6 +253,10 @@ LF_FUNC (h, luma, sse2)
LF_IFUNC(h, luma_intra, sse2)
LF_FUNC (v, luma, sse2)
LF_IFUNC(v, luma_intra, sse2)
+LF_FUNC (h, luma, avx)
+LF_IFUNC(h, luma_intra, avx)
+LF_FUNC (v, luma, avx)
+LF_IFUNC(v, luma_intra, avx)
/***********************************/
/* weighted prediction */
@@ -314,15 +318,15 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
        c->h264_idct_add8      = ff_h264_idct_add8_mmx2;
        c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2;

-        c->h264_v_loop_filter_chroma= ff_x264_deblock_v_chroma_mmxext;
-        c->h264_h_loop_filter_chroma= ff_x264_deblock_h_chroma_mmxext;
-        c->h264_v_loop_filter_chroma_intra= ff_x264_deblock_v_chroma_intra_mmxext;
-        c->h264_h_loop_filter_chroma_intra= ff_x264_deblock_h_chroma_intra_mmxext;
+        c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_mmxext;
+        c->h264_h_loop_filter_chroma= ff_deblock_h_chroma_mmxext;
+        c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_mmxext;
+        c->h264_h_loop_filter_chroma_intra= ff_deblock_h_chroma_intra_mmxext;
#if ARCH_X86_32
-        c->h264_v_loop_filter_luma= ff_x264_deblock_v_luma_mmxext;
-        c->h264_h_loop_filter_luma= ff_x264_deblock_h_luma_mmxext;
-        c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext;
-        c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext;
+        c->h264_v_loop_filter_luma= ff_deblock_v_luma_mmxext;
+        c->h264_h_loop_filter_luma= ff_deblock_h_luma_mmxext;
+        c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_mmxext;
+        c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_mmxext;
#endif
        c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
        c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
@@ -360,10 +364,10 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
            c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2;

#if HAVE_ALIGNED_STACK
-            c->h264_v_loop_filter_luma = ff_x264_deblock_v_luma_sse2;
-            c->h264_h_loop_filter_luma = ff_x264_deblock_h_luma_sse2;
-            c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_sse2;
-            c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_sse2;
+            c->h264_v_loop_filter_luma = ff_deblock_v_luma_sse2;
+            c->h264_h_loop_filter_luma = ff_deblock_h_luma_sse2;
+            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_sse2;
+            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_sse2;
#endif

            c->h264_idct_add16 = ff_h264_idct_add16_sse2;
@@ -377,6 +381,14 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
            c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3;
            c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3;
        }
+        if (mm_flags&AV_CPU_FLAG_AVX) {
+#if HAVE_ALIGNED_STACK
+            c->h264_v_loop_filter_luma = ff_deblock_v_luma_avx;
+            c->h264_h_loop_filter_luma = ff_deblock_h_luma_avx;
+            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_avx;
+            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_avx;
+#endif
+        }
    }
}
#endif
......
@@ -24,16 +24,20 @@
;******************************************************************************
%macro SBUTTERFLY 4
+%if avx_enabled == 0
    mova      m%4, m%2
    punpckl%1 m%2, m%3
    punpckh%1 m%4, m%3
+%else
+    punpckh%1 m%4, m%2, m%3
+    punpckl%1 m%2, m%3
+%endif
    SWAP %3, %4
%endmacro
%macro SBUTTERFLY2 4
-    mova      m%4, m%2
-    punpckh%1 m%2, m%3
-    punpckl%1 m%4, m%3
+    punpckl%1 m%4, m%2, m%3
+    punpckh%1 m%2, m%2, m%3
    SWAP %2, %4, %3
%endmacro
@@ -444,3 +448,12 @@
%macro PMINUB_MMXEXT 3 ; dst, src, ignored
    pminub %1, %2
%endmacro
+
+%macro SPLATW 2-3 0
+%if mmsize == 16
+    pshuflw    %1, %2, (%3)*0x55
+    punpcklqdq %1, %1
+%else
+    pshufw     %1, %2, (%3)*0x55
+%endif
+%endmacro
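SPLATW now takes an explicit source and an optional word index; the imm8 (%3)*0x55 replicates the chosen 2-bit lane selector into all four fields of the pshuflw/pshufw immediate. As a scalar model for the 16-byte case:

    #include <stdint.h>

    /* Scalar model of SPLATW: broadcast word `lane` of src into every
     * word lane of dst. On SSE2, pshuflw with imm8 = lane*0x55 fills the
     * low four words and punpcklqdq mirrors them into the high half. */
    static void splatw_ref(uint16_t dst[8], const uint16_t src[8], int lane)
    {
        for (int i = 0; i < 8; i++)
            dst[i] = src[lane];
    }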