Commit 8ad77b65 authored by Jason Garrett-Glaser

Update x86 H.264 deblock asm

Includes AVX versions from x264.
parent b6675279
;*****************************************************************************
;* MMX/SSE2-optimized H.264 deblocking code
;* MMX/SSE2/AVX-optimized H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2008 x264 project
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Jason Garrett-Glaser <darkshikari@gmail.com>
;* Oskar Arvidsson <oskar@irock.se>
;*
;* This file is part of Libav.
;*
......@@ -26,96 +27,135 @@
%include "x86inc.asm"
%include "x86util.asm"
SECTION_RODATA
SECTION .text
cextern pb_0
cextern pb_1
cextern pb_3
cextern pb_A1
SECTION .text
; Expands to eight bracketed memory operands, one per row:
;   [base], [base+stride], [base+stride*2], [base3],
;   [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
; This is [base],...,[base+7*stride] when the caller passes
; base3 = base+3*stride and stride3 = 3*stride.
%define PASS8ROWS(base, base3, stride, stride3) \
[base], [base+stride], [base+stride*2], [base3], \
[base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
; in: 8 rows of 4 bytes in %1..%8
; Five-argument form: forwards to the four-argument PASS8ROWS with `offset`
; added to both base pointers, so every one of the eight row operands is
; displaced by `offset`.
%define PASS8ROWS(base, base3, stride, stride3, offset) \
PASS8ROWS(base+offset, base3+offset, stride, stride3)
; TRANSPOSE4x8_LOAD, 8-parameter (MMX, byte-only) overload.
; in:  8 rows of 4 bytes in %1..%8
; out: 4 rows of 8 bytes in m0..m3
; clobbers: m4-m7
; Kept alongside the 11-parameter overload below so that call sites passing
; only the eight row operands keep assembling.
%macro TRANSPOSE4x8_LOAD 8
    movd       m0, %1
    movd       m2, %2
    movd       m1, %3
    movd       m3, %4
    punpcklbw  m0, m2               ; rows 0/1 interleaved by byte
    punpcklbw  m1, m3               ; rows 2/3 interleaved by byte
    movq       m2, m0
    punpcklwd  m0, m1               ; rows 0-3, columns 0-1
    punpckhwd  m2, m1               ; rows 0-3, columns 2-3

    movd       m4, %5
    movd       m6, %6
    movd       m5, %7
    movd       m7, %8
    punpcklbw  m4, m6               ; rows 4/5 interleaved by byte
    punpcklbw  m5, m7               ; rows 6/7 interleaved by byte
    movq       m6, m4
    punpcklwd  m4, m5               ; rows 4-7, columns 0-1
    punpckhwd  m6, m5               ; rows 4-7, columns 2-3

    movq       m1, m0
    movq       m3, m2
    punpckldq  m0, m4               ; column 0 of all 8 rows
    punpckhdq  m1, m4               ; column 1
    punpckldq  m2, m6               ; column 2
    punpckhdq  m3, m6               ; column 3
%endmacro

; TRANSPOSE4x8_LOAD, 11-parameter (size-generic) overload.
; in:  %1, %2, %3 = punpck size suffixes for the three interleave stages
;                   (callers in this file pass bw, wd, dq for bytes and
;                   wd, dq, qdq for words)
;      8 rows of 4 elements in %4..%11
; out: 4 rows of 8 elements in m0..m3
; clobbers: m4-m7
; movh/mova are the size-generic move aliases, and the three-operand punpck
; forms rely on the instruction wrappers -- both presumably provided by
; x86inc.asm (confirm against the included version).
%macro TRANSPOSE4x8_LOAD 11
    movh       m0, %4
    movh       m2, %5
    movh       m1, %6
    movh       m3, %7
    punpckl%1  m0, m2               ; rows 0/1 interleaved
    punpckl%1  m1, m3               ; rows 2/3 interleaved
    mova       m2, m0
    punpckl%2  m0, m1               ; rows 0-3, columns 0-1
    punpckh%2  m2, m1               ; rows 0-3, columns 2-3

    movh       m4, %8
    movh       m6, %9
    movh       m5, %10
    movh       m7, %11
    punpckl%1  m4, m6               ; rows 4/5 interleaved
    punpckl%1  m5, m7               ; rows 6/7 interleaved
    mova       m6, m4
    punpckl%2  m4, m5               ; rows 4-7, columns 0-1
    punpckh%2  m6, m5               ; rows 4-7, columns 2-3

    ; The high halves are computed first: they read m0/m2 before the low
    ; halves overwrite those registers in place.
    punpckh%3  m1, m0, m4           ; column 1
    punpckh%3  m3, m2, m6           ; column 3
    punpckl%3  m0, m4               ; column 0
    punpckl%3  m2, m6               ; column 2
%endmacro
; TRANSPOSE8x4_STORE: legacy name, two-operand MMX form.
; in:  4 rows of 8 bytes in m0..m3
; out: 8 rows of 4 bytes in %1..%8
; clobbers: m0-m6
; Kept so that call sites still using the pre-rename name keep assembling.
%macro TRANSPOSE8x4_STORE 8
    movq       m4, m0
    movq       m5, m1
    movq       m6, m2
    punpckhdq  m4, m4               ; high 4 bytes of row 0
    punpckhdq  m5, m5               ; high 4 bytes of row 1
    punpckhdq  m6, m6               ; high 4 bytes of row 2

    punpcklbw  m0, m1
    punpcklbw  m2, m3
    movq       m1, m0
    punpcklwd  m0, m2               ; output rows 0-1
    punpckhwd  m1, m2               ; output rows 2-3
    movd       %1, m0
    punpckhdq  m0, m0
    movd       %2, m0
    movd       %3, m1
    punpckhdq  m1, m1
    movd       %4, m1

    punpckhdq  m3, m3               ; high 4 bytes of row 3
    punpcklbw  m4, m5
    punpcklbw  m6, m3
    movq       m5, m4
    punpcklwd  m4, m6               ; output rows 4-5
    punpckhwd  m5, m6               ; output rows 6-7
    movd       %5, m4
    punpckhdq  m4, m4
    movd       %6, m4
    movd       %7, m5
    punpckhdq  m5, m5
    movd       %8, m5
%endmacro

; TRANSPOSE8x4B_STORE: byte transpose-and-store using three-operand forms.
; in:  4 rows of 8 bytes in m0..m3
; out: 8 rows of 4 bytes in %1..%8
; clobbers: m0-m6
; The three-operand punpck forms and movh are presumably supplied by
; x86inc.asm (confirm against the included version).
%macro TRANSPOSE8x4B_STORE 8
    punpckhdq  m4, m0, m0           ; high 4 bytes of row 0
    punpckhdq  m5, m1, m1           ; high 4 bytes of row 1
    punpckhdq  m6, m2, m2           ; high 4 bytes of row 2

    punpcklbw  m0, m1
    punpcklbw  m2, m3
    punpcklwd  m1, m0, m2           ; output rows 0-1
    punpckhwd  m0, m2               ; output rows 2-3
    movh       %1, m1
    punpckhdq  m1, m1
    movh       %2, m1
    movh       %3, m0
    punpckhdq  m0, m0
    movh       %4, m0

    punpckhdq  m3, m3               ; high 4 bytes of row 3
    punpcklbw  m4, m5
    punpcklbw  m6, m3
    punpcklwd  m5, m4, m6           ; output rows 4-5
    punpckhwd  m4, m6               ; output rows 6-7
    movh       %5, m5
    punpckhdq  m5, m5
    movh       %6, m5
    movh       %7, m4
    punpckhdq  m4, m4
    movh       %8, m4
%endmacro
; Byte variant of the transpose-load: forwards the eight row operands %1..%8
; to the 11-parameter TRANSPOSE4x8_LOAD with the interleave suffixes
; bw, wd, dq.
%macro TRANSPOSE4x8B_LOAD 8
TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8
%endmacro
; Word variant of the transpose-load.
; mmsize==16: forwards the eight row operands %1..%8 to the 11-parameter
;             TRANSPOSE4x8_LOAD with the interleave suffixes wd, dq, qdq.
; otherwise:  the macro arguments are NOT used; four rows are loaded from
;             the hard-coded addresses [t5], [t5+r1], [t5+r1*2], [t5+t6]
;             into m0..m3 and handed to TRANSPOSE4x4W with m4 as scratch.
;             t5/t6 must therefore be defined by the caller.
%macro TRANSPOSE4x8W_LOAD 8
%if mmsize==16
TRANSPOSE4x8_LOAD wd, dq, qdq, %1, %2, %3, %4, %5, %6, %7, %8
%else
; NOTE(review): SWAP is defined outside this view; presumably it permutes
; the m1..m4 register names before the loads -- confirm intent.
SWAP 1, 4, 2, 3
mova m0, [t5]
mova m1, [t5+r1]
mova m2, [t5+r1*2]
mova m3, [t5+t6]
TRANSPOSE4x4W 0, 1, 2, 3, 4
%endif
%endmacro
; Interleaves the words of m1 and m2 and stores one dword (a pair
; {m1 word i, m2 word i}) per destination operand.
; in:  m1, m2
; out: mmsize==8  -> pairs 0..3 written to %1..%4 (%5..%8 are unused)
;      otherwise  -> pairs 0..7 written to %1..%8
; clobbers: m0, m1
%macro TRANSPOSE8x2W_STORE 8
punpckhwd m0, m1, m2 ; m0 = pairs from the high halves of m1/m2
punpcklwd m1, m2 ; m1 = pairs from the low halves of m1/m2
%if mmsize==8
movd %3, m0 ; pair 2
movd %1, m1 ; pair 0
psrlq m1, 32 ; bring the next pair into the low dword
psrlq m0, 32
movd %2, m1 ; pair 1
movd %4, m0 ; pair 3
%else
movd %5, m0 ; pair 4
movd %1, m1 ; pair 0
psrldq m1, 4 ; shift right by one dword to expose the next pair
psrldq m0, 4
movd %2, m1 ; pair 1
movd %6, m0 ; pair 5
psrldq m1, 4
psrldq m0, 4
movd %3, m1 ; pair 2
movd %7, m0 ; pair 6
psrldq m1, 4
psrldq m0, 4
movd %4, m1 ; pair 3
movd %8, m0 ; pair 7
%endif
%endmacro
; Interleave %2 with %3 at granularity %1 (bw / wd / dq at the call sites):
;   %4 = punpckh%1(%2, %3)
;   %2 = punpckl%1(%2, %3)
; %2 and %4 are full register operands (e.g. m0); %3 may be a register or a
; memory operand (call sites pass operands such as [%9+0x10]).
; The high half is produced first because it must read %2 before the
; in-place low unpack overwrites it. The three-operand punpckh form relies
; on the instruction wrappers presumably supplied by x86inc.asm.
%macro SBUTTERFLY3 4
    punpckh%1  %4, %2, %3
    punpckl%1  %2, %3
%endmacro
; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
%macro TRANSPOSE6x8_MEM 9
RESET_MM_PERMUTATION
movq m0, %1
movq m1, %2
movq m2, %3
......@@ -123,30 +163,32 @@ SECTION .text
movq m4, %5
movq m5, %6
movq m6, %7
SBUTTERFLY3 bw, m0, m1, m7
SBUTTERFLY3 bw, m2, m3, m1
SBUTTERFLY3 bw, m4, m5, m3
movq [%9+0x10], m1
SBUTTERFLY3 bw, m6, %8, m5
SBUTTERFLY3 wd, m0, m2, m1
SBUTTERFLY3 wd, m4, m6, m2
SBUTTERFLY bw, 0, 1, 7
SBUTTERFLY bw, 2, 3, 7
SBUTTERFLY bw, 4, 5, 7
movq [%9+0x10], m3
SBUTTERFLY3 bw, m6, %8, m7
SBUTTERFLY wd, 0, 2, 3
SBUTTERFLY wd, 4, 6, 3
punpckhdq m0, m4
movq [%9+0x00], m0
SBUTTERFLY3 wd, m7, [%9+0x10], m6
SBUTTERFLY3 wd, m3, m5, m4
SBUTTERFLY3 dq, m7, m3, m0
SBUTTERFLY3 dq, m1, m2, m5
punpckldq m6, m4
movq [%9+0x10], m1
movq [%9+0x20], m5
movq [%9+0x30], m7
movq [%9+0x40], m0
movq [%9+0x50], m6
SBUTTERFLY3 wd, m1, [%9+0x10], m3
SBUTTERFLY wd, 5, 7, 0
SBUTTERFLY dq, 1, 5, 0
SBUTTERFLY dq, 2, 6, 0
punpckldq m3, m7
movq [%9+0x10], m2
movq [%9+0x20], m6
movq [%9+0x30], m1
movq [%9+0x40], m5
movq [%9+0x50], m3
RESET_MM_PERMUTATION
%endmacro
; in: 8 rows of 8 in %1..%8
; out: 8 rows of 8 in %9..%16
%macro TRANSPOSE8x8_MEM 16
RESET_MM_PERMUTATION
movq m0, %1
movq m1, %2
movq m2, %3
......@@ -154,38 +196,44 @@ SECTION .text
movq m4, %5
movq m5, %6
movq m6, %7
SBUTTERFLY3 bw, m0, m1, m7
SBUTTERFLY3 bw, m2, m3, m1
SBUTTERFLY3 bw, m4, m5, m3
SBUTTERFLY3 bw, m6, %8, m5
movq %9, m3
SBUTTERFLY3 wd, m0, m2, m3
SBUTTERFLY3 wd, m4, m6, m2
SBUTTERFLY3 wd, m7, m1, m6
movq %11, m2
movq m2, %9
SBUTTERFLY3 wd, m2, m5, m1
SBUTTERFLY3 dq, m0, m4, m5
SBUTTERFLY3 dq, m7, m2, m4
SBUTTERFLY bw, 0, 1, 7
SBUTTERFLY bw, 2, 3, 7
SBUTTERFLY bw, 4, 5, 7
SBUTTERFLY3 bw, m6, %8, m7
movq %9, m5
SBUTTERFLY wd, 0, 2, 5
SBUTTERFLY wd, 4, 6, 5
SBUTTERFLY wd, 1, 3, 5
movq %11, m6
movq m6, %9
SBUTTERFLY wd, 6, 7, 5
SBUTTERFLY dq, 0, 4, 5
SBUTTERFLY dq, 1, 6, 5
movq %9, m0
movq %10, m5
movq %13, m7
movq %14, m4
SBUTTERFLY3 dq, m3, %11, m0
SBUTTERFLY3 dq, m6, m1, m5
movq %11, m3
movq %10, m4
movq %13, m1
movq %14, m6
SBUTTERFLY3 dq, m2, %11, m0
SBUTTERFLY dq, 3, 7, 4
movq %11, m2
movq %12, m0
movq %15, m6
movq %16, m5
movq %15, m3
movq %16, m7
RESET_MM_PERMUTATION
%endmacro
; out: %4 = |%1-%2|>%3
;      (per byte: %4 = saturate(|%1-%2| - %3), i.e. nonzero exactly where
;       the absolute difference exceeds %3 -- not a 0x00/0xFF mask)
; clobbers: %5
%macro DIFF_GT 5
%if avx_enabled == 0
mova %5, %2
mova %4, %1
psubusb %5, %1 ; %5 = max(%2-%1, 0)
psubusb %4, %2 ; %4 = max(%1-%2, 0)
%else
psubusb %5, %2, %1 ; %5 = max(%2-%1, 0), three-operand form, no copy needed
psubusb %4, %1, %2 ; %4 = max(%1-%2, 0)
%endif
por %4, %5 ; at most one of the two is nonzero -> |%1-%2|
psubusb %4, %3 ; saturating subtract of the threshold
%endmacro
......@@ -193,32 +241,28 @@ SECTION .text
; out: %4 = per-byte mask for the test |%1-%2|>%3, with INVERTED polarity
;      relative to DIFF_GT: 0x00 where |%1-%2| > %3,
;                           0xFF where |%1-%2| <= %3
; clobbers: %5
%macro DIFF_GT2 5
%ifdef ARCH_X86_64
psubusb %5, %2, %1 ; %5 = max(%2-%1, 0)
psubusb %4, %1, %2 ; %4 = max(%1-%2, 0)
%else
mova %5, %2
mova %4, %1
psubusb %5, %1 ; %5 = max(%2-%1, 0)
psubusb %4, %2 ; %4 = max(%1-%2, 0)
%endif
psubusb %5, %3 ; subtract the threshold from each one-sided difference
psubusb %4, %3
pcmpeqb %4, %5 ; equal only when both are 0, i.e. |%1-%2| <= %3
%endmacro
; One-operand SPLATW: broadcasts word 0 of %1 to every word of %1, in place.
; The %ifidn test selects the XMM path when m0 is currently mapped to xmm0.
%macro SPLATW 1
%ifidn m0, xmm0
pshuflw %1, %1, 0 ; word 0 -> all four words of the low qword
punpcklqdq %1, %1 ; duplicate the low qword into the high qword
%else
pshufw %1, %1, 0 ; word 0 -> all four words of the MMX register
%endif
%endmacro
; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
; out: m5=beta-1, m7=mask, %3=alpha-1
; clobbers: m4,m6
%macro LOAD_MASK 2-3
movd m4, %1
movd m5, %2
SPLATW m4
SPLATW m5
SPLATW m4, m4
SPLATW m5, m5
packuswb m4, m4 ; 16x alpha-1
packuswb m5, m5 ; 16x beta-1
%if %0>2
......@@ -237,8 +281,7 @@ SECTION .text
; out: m1=p0' m2=q0'
; clobbers: m0,3-6
%macro DEBLOCK_P0_Q0 0
mova m5, m1
pxor m5, m2 ; p0^q0
pxor m5, m1, m2 ; p0^q0
pand m5, [pb_1] ; (p0^q0)&1
pcmpeqb m4, m4
pxor m3, m4
......@@ -264,14 +307,12 @@ SECTION .text
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
; clobbers: q2, tmp, tc0
%macro LUMA_Q1 6
mova %6, m1
pavgb %6, m2
pavgb %6, m1, m2
pavgb %2, %6 ; avg(p2,avg(p0,q0))
pxor %6, %3
pand %6, [pb_1] ; (p2^avg(p0,q0))&1
psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
mova %6, %1
psubusb %6, %5
psubusb %6, %1, %5
paddusb %5, %1
pmaxub %2, %6
pminub %2, %5
......@@ -280,10 +321,10 @@ SECTION .text
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_XMM
cglobal x264_deblock_v_luma_sse2, 5,5,10
%macro DEBLOCK_LUMA 1
cglobal deblock_v_luma_%1, 5,5,10
movd m8, [r4] ; tc0
lea r4, [r1*3]
dec r2d ; alpha-1
......@@ -307,8 +348,7 @@ cglobal x264_deblock_v_luma_sse2, 5,5,10
movdqa m3, [r4] ; p2
DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
pand m6, m9
mova m7, m8
psubb m7, m6
psubb m7, m8, m6
pand m6, m8
LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
......@@ -326,10 +366,10 @@ cglobal x264_deblock_v_luma_sse2, 5,5,10
RET
;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal x264_deblock_h_luma_sse2, 5,7
cglobal deblock_h_luma_%1, 5,7
movsxd r10, r1d
lea r11, [r10+r10*2]
lea r6, [r0-4]
......@@ -350,13 +390,13 @@ cglobal x264_deblock_h_luma_sse2, 5,7
; vertical filter
; alpha, beta, tc0 are still in r2d, r3d, r4
; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
; don't backup r6, r5, r10, r11 because deblock_v_luma_sse2 doesn't use them
lea r0, [pix_tmp+0x30]
mov r1d, 0x10
%ifdef WIN64
mov [rsp+0x20], r4
%endif
call x264_deblock_v_luma_sse2
call deblock_v_luma_%1
; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
add r6, 2
......@@ -365,7 +405,7 @@ cglobal x264_deblock_h_luma_sse2, 5,7
movq m1, [pix_tmp+0x28]
movq m2, [pix_tmp+0x38]
movq m3, [pix_tmp+0x48]
TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11)
TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11)
shl r10, 3
sub r6, r10
......@@ -375,7 +415,7 @@ cglobal x264_deblock_h_luma_sse2, 5,7
movq m1, [pix_tmp+0x20]
movq m2, [pix_tmp+0x30]
movq m3, [pix_tmp+0x40]
TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11)
TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11)
%ifdef WIN64
add rsp, 0x98
......@@ -383,14 +423,20 @@ cglobal x264_deblock_h_luma_sse2, 5,7
add rsp, 0x68
%endif
RET
%endmacro
INIT_XMM
DEBLOCK_LUMA sse2
INIT_AVX
DEBLOCK_LUMA avx
%else
%macro DEBLOCK_LUMA 3
;-----------------------------------------------------------------------------
; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal x264_deblock_%2_luma_%1, 5,5
cglobal deblock_%2_luma_%1, 5,5
lea r4, [r1*3]
dec r2 ; alpha-1
neg r4
......@@ -419,8 +465,7 @@ cglobal x264_deblock_%2_luma_%1, 5,5
DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
pand m6, m4
pand m4, [esp+%3] ; tc
mova m7, m4
psubb m7, m6
psubb m7, m4, m6
pand m6, m4
LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
......@@ -441,10 +486,10 @@ cglobal x264_deblock_%2_luma_%1, 5,5
RET
;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal x264_deblock_h_luma_%1, 0,5
cglobal deblock_h_luma_%1, 0,5
mov r0, r0mp
mov r3, r1m
lea r4, [r3*3]
......@@ -467,11 +512,11 @@ cglobal x264_deblock_h_luma_%1, 0,5
PUSH dword r2m
PUSH dword 16
PUSH dword r0
call x264_deblock_%2_luma_%1
call deblock_%2_luma_%1
%ifidn %2, v8
add dword [esp ], 8 ; pix_tmp+0x38
add dword [esp+16], 2 ; tc0+2
call x264_deblock_%2_luma_%1
call deblock_%2_luma_%1
%endif
ADD esp, 20
......@@ -484,7 +529,7 @@ cglobal x264_deblock_h_luma_%1, 0,5
movq m1, [pix_tmp+0x20]
movq m2, [pix_tmp+0x30]
movq m3, [pix_tmp+0x40]
TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4)
TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)
lea r0, [r0+r3*8]
lea r1, [r1+r3*8]
......@@ -492,7 +537,7 @@ cglobal x264_deblock_h_luma_%1, 0,5
movq m1, [pix_tmp+0x28]
movq m2, [pix_tmp+0x38]
movq m3, [pix_tmp+0x48]
TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4)
TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)
ADD esp, pad
RET
......@@ -502,22 +547,34 @@ INIT_MMX
DEBLOCK_LUMA mmxext, v8, 8
INIT_XMM
DEBLOCK_LUMA sse2, v, 16
INIT_AVX
DEBLOCK_LUMA avx, v, 16
%endif ; ARCH
%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
%ifdef ARCH_X86_64
pavgb t0, p2, p1
pavgb t1, p0, q0
%else
mova t0, p2
mova t1, p0
pavgb t0, p1
pavgb t1, q0
%endif
pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
mova t5, t1
%ifdef ARCH_X86_64
paddb t2, p2, p1
paddb t3, p0, q0
%else
mova t2, p2
mova t3, p0
paddb t2, p1
paddb t3, q0
%endif
paddb t2, t3
mova t3, t2
mova t4, t2
......@@ -527,10 +584,15 @@ DEBLOCK_LUMA sse2, v, 16
pand t2, mpb_1
psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;
%ifdef ARCH_X86_64
pavgb t1, p2, q1
psubb t2, p2, q1
%else
mova t1, p2
mova t2, p2
pavgb t1, q1
psubb t2, q1
%endif
paddb t3, t3
psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
pand t2, mpb_1
......@@ -543,10 +605,8 @@ DEBLOCK_LUMA sse2, v, 16
pand t3, mpb_1
psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8
mova t3, p0
mova t2, p0
pxor t3, q1
pavgb t2, q1
pxor t3, p0, q1
pavgb t2, p0, q1
pand t3, mpb_1
psubb t2, t3
pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4
......@@ -560,9 +620,8 @@ DEBLOCK_LUMA sse2, v, 16
mova %1, t1 ; store p0
mova t1, %4 ; p3
mova t2, t1
paddb t2, t1, p2
pavgb t1, p2
paddb t2, p2
pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
paddb t2, t2
paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
......@@ -624,9 +683,9 @@ DEBLOCK_LUMA sse2, v, 16
%endif
;-----------------------------------------------------------------------------
; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
cglobal deblock_%2_luma_intra_%1, 4,6,16
%ifndef ARCH_X86_64
sub esp, 0x60
%endif
......@@ -686,9 +745,9 @@ cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
INIT_MMX
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal x264_deblock_h_luma_intra_%1, 4,7
cglobal deblock_h_luma_intra_%1, 4,7
movsxd r10, r1d
lea r11, [r10*3]
lea r6, [r0-4]
......@@ -704,7 +763,7 @@ cglobal x264_deblock_h_luma_intra_%1, 4,7
lea r0, [pix_tmp+0x40]
mov r1, 0x10
call x264_deblock_v_luma_intra_%1
call deblock_v_luma_intra_%1
; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
lea r5, [r6+r11]
......@@ -717,7 +776,7 @@ cglobal x264_deblock_h_luma_intra_%1, 4,7
add rsp, 0x88
RET
%else
cglobal x264_deblock_h_luma_intra_%1, 2,4
cglobal deblock_h_luma_intra_%1, 2,4
lea r3, [r1*3]
sub r0, 4
lea r2, [r0+r3]
......@@ -736,10 +795,10 @@ cglobal x264_deblock_h_luma_intra_%1, 2,4
PUSH dword r2m
PUSH dword 16
PUSH r0
call x264_deblock_%2_luma_intra_%1
call deblock_%2_luma_intra_%1
%ifidn %2, v8
add dword [rsp], 8 ; pix_tmp+8
call x264_deblock_%2_luma_intra_%1
call deblock_%2_luma_intra_%1
%endif
ADD esp, 16
......@@ -760,13 +819,13 @@ cglobal x264_deblock_h_luma_intra_%1, 2,4
INIT_XMM
DEBLOCK_LUMA_INTRA sse2, v
INIT_AVX
DEBLOCK_LUMA_INTRA avx , v
%ifndef ARCH_X86_64
INIT_MMX
DEBLOCK_LUMA_INTRA mmxext, v8
%endif
INIT_MMX
%macro CHROMA_V_START 0
......@@ -790,23 +849,23 @@ INIT_MMX
%define t6 r6
;-----------------------------------------------------------------------------
; void x264_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
; void ff_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal x264_deblock_v_chroma_mmxext, 5,6
cglobal deblock_v_chroma_mmxext, 5,6
CHROMA_V_START
movq m0, [t5]
movq m1, [t5+r1]
movq m2, [r0]
movq m3, [r0+r1]
call x264_chroma_inter_body_mmxext
call ff_chroma_inter_body_mmxext
movq [t5+r1], m1
movq [r0], m2
RET
;-----------------------------------------------------------------------------
; void x264_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
; void ff_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal x264_deblock_h_chroma_mmxext, 5,7
cglobal deblock_h_chroma_mmxext, 5,7
%ifdef ARCH_X86_64
%define buf0 [rsp-24]
%define buf1 [rsp-16]
......@@ -815,17 +874,17 @@ cglobal x264_deblock_h_chroma_mmxext, 5,7
%define buf1 r2m
%endif
CHROMA_H_START
TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6)
TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
movq buf0, m0
movq buf1, m3
call x264_chroma_inter_body_mmxext
call ff_chroma_inter_body_mmxext
movq m0, buf0
movq m3, buf1
TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6)
TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
RET
ALIGN 16
x264_chroma_inter_body_mmxext:
ff_chroma_inter_body_mmxext:
LOAD_MASK r2d, r3d
movd m6, [r4] ; tc0
punpcklbw m6, m6
......@@ -850,31 +909,31 @@ x264_chroma_inter_body_mmxext:
%define t6 r5
;-----------------------------------------------------------------------------
; void x264_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
; void ff_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal x264_deblock_v_chroma_intra_mmxext, 4,5
cglobal deblock_v_chroma_intra_mmxext, 4,5
CHROMA_V_START
movq m0, [t5]
movq m1, [t5+r1]
movq m2, [r0]
movq m3, [r0+r1]
call x264_chroma_intra_body_mmxext
call ff_chroma_intra_body_mmxext
movq [t5+r1], m1
movq [r0], m2
RET
;-----------------------------------------------------------------------------
; void x264_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
; void ff_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal x264_deblock_h_chroma_intra_mmxext, 4,6
cglobal deblock_h_chroma_intra_mmxext, 4,6
CHROMA_H_START
TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6)
call x264_chroma_intra_body_mmxext
TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6)
TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
call ff_chroma_intra_body_mmxext
TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
RET
ALIGN 16
x264_chroma_intra_body_mmxext:
ff_chroma_intra_body_mmxext:
LOAD_MASK r2d, r3d
movq m5, m1
movq m6, m2
......
......@@ -219,11 +219,11 @@ static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40]
}
#define LF_FUNC(DIR, TYPE, OPT) \
void ff_x264_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
int alpha, int beta, int8_t *tc0);
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
int alpha, int beta, int8_t *tc0);
#define LF_IFUNC(DIR, TYPE, OPT) \
void ff_x264_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
int alpha, int beta);
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
int alpha, int beta);
LF_FUNC (h, chroma, mmxext)
LF_IFUNC(h, chroma_intra, mmxext)
......@@ -234,18 +234,18 @@ LF_FUNC (h, luma, mmxext)
LF_IFUNC(h, luma_intra, mmxext)
#if HAVE_YASM && ARCH_X86_32
LF_FUNC (v8, luma, mmxext)
static void ff_x264_deblock_v_luma_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
static void ff_deblock_v_luma_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
if((tc0[0] & tc0[1]) >= 0)
ff_x264_deblock_v8_luma_mmxext(pix+0, stride, alpha, beta, tc0);
ff_deblock_v8_luma_mmxext(pix+0, stride, alpha, beta, tc0);
if((tc0[2] & tc0[3]) >= 0)
ff_x264_deblock_v8_luma_mmxext(pix+8, stride, alpha, beta, tc0+2);
ff_deblock_v8_luma_mmxext(pix+8, stride, alpha, beta, tc0+2);
}
LF_IFUNC(v8, luma_intra, mmxext)
static void ff_x264_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta)
static void ff_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta)
{
ff_x264_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta);
ff_x264_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta);
ff_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta);
ff_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta);
}
#endif
......@@ -253,6 +253,10 @@ LF_FUNC (h, luma, sse2)
LF_IFUNC(h, luma_intra, sse2)
LF_FUNC (v, luma, sse2)
LF_IFUNC(v, luma_intra, sse2)
LF_FUNC (h, luma, avx)
LF_IFUNC(h, luma_intra, avx)
LF_FUNC (v, luma, avx)
LF_IFUNC(v, luma_intra, avx)
/***********************************/
/* weighted prediction */
......@@ -314,15 +318,15 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
c->h264_idct_add8 = ff_h264_idct_add8_mmx2;
c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2;
c->h264_v_loop_filter_chroma= ff_x264_deblock_v_chroma_mmxext;
c->h264_h_loop_filter_chroma= ff_x264_deblock_h_chroma_mmxext;
c->h264_v_loop_filter_chroma_intra= ff_x264_deblock_v_chroma_intra_mmxext;
c->h264_h_loop_filter_chroma_intra= ff_x264_deblock_h_chroma_intra_mmxext;
c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_mmxext;
c->h264_h_loop_filter_chroma= ff_deblock_h_chroma_mmxext;
c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_mmxext;
c->h264_h_loop_filter_chroma_intra= ff_deblock_h_chroma_intra_mmxext;
#if ARCH_X86_32
c->h264_v_loop_filter_luma= ff_x264_deblock_v_luma_mmxext;
c->h264_h_loop_filter_luma= ff_x264_deblock_h_luma_mmxext;
c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext;
c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext;
c->h264_v_loop_filter_luma= ff_deblock_v_luma_mmxext;
c->h264_h_loop_filter_luma= ff_deblock_h_luma_mmxext;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_mmxext;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_mmxext;
#endif
c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
......@@ -360,10 +364,10 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2;
#if HAVE_ALIGNED_STACK
c->h264_v_loop_filter_luma = ff_x264_deblock_v_luma_sse2;
c->h264_h_loop_filter_luma = ff_x264_deblock_h_luma_sse2;
c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_sse2;
c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_sse2;
c->h264_v_loop_filter_luma = ff_deblock_v_luma_sse2;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_sse2;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_sse2;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_sse2;
#endif
c->h264_idct_add16 = ff_h264_idct_add16_sse2;
......@@ -377,6 +381,14 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3;
c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3;
}
if (mm_flags&AV_CPU_FLAG_AVX) {
#if HAVE_ALIGNED_STACK
c->h264_v_loop_filter_luma = ff_deblock_v_luma_avx;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_avx;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_avx;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_avx;
#endif
}
}
}
#endif
......
......@@ -24,16 +24,20 @@
;******************************************************************************
; SBUTTERFLY size, a, b, tmp  (a, b, tmp are register NUMBERS, not names)
; Interleaves m<a> with m<b> at granularity %1:
;   m%2 = punpckl%1(m%2, m%3)
;   m%4 = punpckh%1(m%2, m%3)
; followed by SWAP %3, %4.
; NOTE(review): SWAP is defined outside this view; presumably it exchanges
; the register names so the high half is subsequently addressed as m%3 and
; the old m%3 becomes the scratch m%4 -- confirm against x86inc.asm.
%macro SBUTTERFLY 4
%if avx_enabled == 0
mova m%4, m%2 ; copy first: the two-operand unpack destroys its destination
punpckl%1 m%2, m%3
punpckh%1 m%4, m%3
%else
punpckh%1 m%4, m%2, m%3 ; high half first: it reads m%2 before it is overwritten
punpckl%1 m%2, m%3
%endif
SWAP %3, %4
%endmacro
; SBUTTERFLY2 size, a, b, tmp  (a, b, tmp are register NUMBERS, not names)
; Interleaves m<a> with m<b> at granularity %1:
;   m%4 = punpckl%1(m%2, m%3)
;   m%2 = punpckh%1(m%2, m%3)
; followed by SWAP %2, %4, %3 (SWAP is defined outside this view; it is
; presumably a name-only permutation from x86inc.asm).
; The low half must be produced first: it reads m%2 before the high unpack
; overwrites m%2 in place.
%macro SBUTTERFLY2 4
    punpckl%1  m%4, m%2, m%3
    punpckh%1  m%2, m%2, m%3
    SWAP %2, %4, %3
%endmacro
......@@ -444,3 +448,12 @@
; Unsigned byte minimum via the native pminub instruction: %1 = min(%1, %2).
; The third parameter is accepted and ignored.
%macro PMINUB_MMXEXT 3 ; dst, src, ignored
pminub %1, %2
%endmacro
; SPLATW dst, src [, index = 0]
; Broadcasts word number `index` of src (taken from the low four words) to
; every word of dst.
; (%3)*0x55 replicates the index into all four 2-bit selector fields of the
; shuffle immediate, so this is only meaningful for index 0..3.
%macro SPLATW 2-3 0
%if mmsize == 16
pshuflw %1, %2, (%3)*0x55 ; selected word -> all four words of the low qword
punpcklqdq %1, %1 ; duplicate the low qword into the high qword
%else
pshufw %1, %2, (%3)*0x55 ; selected word -> all four words of the register
%endif
%endmacro
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment