Commit 3f87f39c authored by John Adcock, committed by Jason Garrett-Glaser

Update x264 asm code to latest to add support for 64-bit Windows.

Use the new x86inc features to support 64-bit Windows on all non-x264 nasm
assembly code as well.
Patch by John Adcock, dscaler.johnad AT googlemail DOT com.
Win64 changes originally by Anton Mitrofanov.
x86util changes mostly by Holger Lubitz.

Originally committed as revision 19580 to svn://svn.ffmpeg.org/ffmpeg/trunk
parent d8c2f8f7
@@ -457,7 +457,7 @@ section .text
 ; On x86_32, this function does the register saving and restoring for all of fft.
 ; The others pass args in registers and don't spill anything.
-cglobal fft_dispatch%3%2, 2,5,0, z, nbits
+cglobal fft_dispatch%3%2, 2,5,8, z, nbits
     lea r2, [dispatch_tab%3%2 GLOBAL]
     mov r2, [r2 + (nbitsq-2)*gprsize]
     call r2
...
@@ -278,7 +278,7 @@ SECTION .text
 ; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
 INIT_XMM
-cglobal x264_deblock_v_luma_sse2
+cglobal x264_deblock_v_luma_sse2, 5,5,10
     movd m8, [r4] ; tc0
     lea  r4, [r1*3]
     dec  r2d ; alpha-1
@@ -318,54 +318,66 @@ cglobal x264_deblock_v_luma_sse2
     DEBLOCK_P0_Q0
     mova [r4+2*r1], m1
     mova [r0], m2
-    ret
+    RET
 ;-----------------------------------------------------------------------------
 ; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
 INIT_MMX
-cglobal x264_deblock_h_luma_sse2
-    movsxd r10, esi
+cglobal x264_deblock_h_luma_sse2, 5,7
+    movsxd r10, r1d
     lea    r11, [r10+r10*2]
-    lea    rax, [r0-4]
-    lea    r9,  [r0-4+r11]
+    lea    r6,  [r0-4]
+    lea    r5,  [r0-4+r11]
+%ifdef WIN64
+    sub    rsp, 0x98
+    %define pix_tmp rsp+0x30
+%else
     sub    rsp, 0x68
     %define pix_tmp rsp
+%endif
     ; transpose 6x16 -> tmp space
-    TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp
-    lea    rax, [rax+r10*8]
-    lea    r9,  [r9 +r10*8]
-    TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp+8
+    TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp
+    lea    r6, [r6+r10*8]
+    lea    r5, [r5+r10*8]
+    TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp+8
     ; vertical filter
     ; alpha, beta, tc0 are still in r2d, r3d, r4
-    ; don't backup rax, r9, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
+    ; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
     lea    r0, [pix_tmp+0x30]
-    mov    esi, 0x10
+    mov    r1d, 0x10
+%ifdef WIN64
+    mov    [rsp+0x20], r4
+%endif
     call   x264_deblock_v_luma_sse2
     ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
-    add    rax, 2
-    add    r9,  2
+    add    r6, 2
+    add    r5, 2
     movq   m0, [pix_tmp+0x18]
     movq   m1, [pix_tmp+0x28]
     movq   m2, [pix_tmp+0x38]
     movq   m3, [pix_tmp+0x48]
-    TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)
+    TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11)
     shl    r10, 3
-    sub    rax, r10
-    sub    r9,  r10
+    sub    r6, r10
+    sub    r5, r10
     shr    r10, 3
     movq   m0, [pix_tmp+0x10]
     movq   m1, [pix_tmp+0x20]
     movq   m2, [pix_tmp+0x30]
     movq   m3, [pix_tmp+0x40]
-    TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)
+    TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11)
+%ifdef WIN64
+    add    rsp, 0x98
+%else
     add    rsp, 0x68
-    ret
+%endif
+    RET
 %else
@@ -388,7 +400,7 @@ cglobal x264_deblock_%2_luma_%1, 5,5
     mova m3, [r0+r1] ; q1
     LOAD_MASK r2, r3
-    mov  r3, r4m
+    mov  r3, r4mp
     movd m4, [r3] ; tc0
     punpcklbw m4, m4
     punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
@@ -428,7 +440,7 @@ cglobal x264_deblock_%2_luma_%1, 5,5
 ;-----------------------------------------------------------------------------
 INIT_MMX
 cglobal x264_deblock_h_luma_%1, 0,5
-    mov  r0, r0m
+    mov  r0, r0mp
     mov  r3, r1m
     lea  r4, [r3*3]
     sub  r0, 4
@@ -459,7 +471,7 @@ cglobal x264_deblock_h_luma_%1, 0,5
     ADD  esp, 20
     ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
-    mov  r0, r0m
+    mov  r0, r0mp
     sub  r0, 2
     lea  r1, [r0+r4]
@@ -607,7 +619,7 @@ DEBLOCK_LUMA sse2, v, 16
 ;-----------------------------------------------------------------------------
 ; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal x264_deblock_%2_luma_intra_%1, 4,6
+cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
 %ifndef ARCH_X86_64
     sub  esp, 0x60
 %endif
@@ -669,34 +681,34 @@ INIT_MMX
 ;-----------------------------------------------------------------------------
 ; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal x264_deblock_h_luma_intra_%1
+cglobal x264_deblock_h_luma_intra_%1, 4,7
     movsxd r10, r1d
     lea    r11, [r10*3]
-    lea    rax, [r0-4]
-    lea    r9,  [r0-4+r11]
+    lea    r6,  [r0-4]
+    lea    r5,  [r0-4+r11]
     sub    rsp, 0x88
     %define pix_tmp rsp
     ; transpose 8x16 -> tmp space
-    TRANSPOSE8x8_MEM PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
-    lea    rax, [rax+r10*8]
-    lea    r9,  [r9+r10*8]
-    TRANSPOSE8x8_MEM PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
+    TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
+    lea    r6, [r6+r10*8]
+    lea    r5, [r5+r10*8]
+    TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
     lea    r0, [pix_tmp+0x40]
     mov    r1, 0x10
     call   x264_deblock_v_luma_intra_%1
     ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
-    lea    r9, [rax+r11]
-    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11)
+    lea    r5, [r6+r11]
+    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
     shl    r10, 3
-    sub    rax, r10
-    sub    r9,  r10
+    sub    r6, r10
+    sub    r5, r10
     shr    r10, 3
-    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11)
+    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
     add    rsp, 0x88
-    ret
+    RET
 %else
 cglobal x264_deblock_h_luma_intra_%1, 2,4
     lea  r3, [r1*3]
@@ -725,7 +737,7 @@ cglobal x264_deblock_h_luma_intra_%1, 2,4
     ADD  esp, 16
     mov  r1, r1m
-    mov  r0, r0m
+    mov  r0, r0mp
     lea  r3, [r1*3]
     sub  r0, 4
     lea  r2, [r0+r3]
...
@@ -31,15 +31,8 @@ pw_32: times 8 dw 32
 SECTION .text
-%macro IDCT4_1D 6
-    SUMSUB_BA   m%3, m%1
-    SUMSUBD2_AB m%2, m%4, m%6, m%5
-    SUMSUB_BADC m%2, m%3, m%5, m%1
-    SWAP %1, %2, %5, %4, %3
-%endmacro
 INIT_XMM
-cglobal x264_add8x4_idct_sse2, 3,3
+cglobal x264_add8x4_idct_sse2, 3,3,8
     movq m0, [r1+ 0]
     movq m1, [r1+ 8]
     movq m2, [r1+16]
...
@@ -20,6 +20,14 @@
 ;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;*****************************************************************************
+%ifdef ARCH_X86_64
+    %ifidn __OUTPUT_FORMAT__,win32
+        %define WIN64
+    %else
+        %define UNIX64
+    %endif
+%endif
 ; FIXME: All of the 64bit asm functions that take a stride as an argument
 ; via register, assume that the high dword of that register is filled with 0.
 ; This is true in practice (since we never do any 64bit arithmetic on strides,
@@ -28,68 +36,39 @@
 ; Name of the .rodata section.
 ; Kludge: Something on OS X fails to align .rodata even given an align attribute,
 ; so use a different read-only section.
-%macro SECTION_RODATA 0
+%macro SECTION_RODATA 0-1 16
     %ifidn __OUTPUT_FORMAT__,macho64
-        SECTION .text align=16
+        SECTION .text align=%1
     %elifidn __OUTPUT_FORMAT__,macho
-        SECTION .text align=16
+        SECTION .text align=%1
         fakegot:
     %else
-        SECTION .rodata align=16
+        SECTION .rodata align=%1
     %endif
 %endmacro
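A hedged usage sketch of the now-parameterised macro (the constant below is hypothetical and not part of this commit): called with no argument it still aligns to 16 bytes, while an explicit argument overrides the alignment.

    SECTION_RODATA 8              ; only 8-byte alignment requested
    example_pw_4: times 4 dw 4    ; hypothetical 8-byte constant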
-; PIC support macros. All these macros are totally harmless when PIC is
-; not defined but can ruin everything if misused in PIC mode. On x86_32, shared
-; objects cannot directly access global variables by address, they need to
-; go through the GOT (global offset table). Most OSes do not care about it
-; and let you load non-shared .so objects (Linux, Win32...). However, OS X
-; requires PIC code in its .dylib objects.
-;
-; - GLOBAL should be used as a suffix for global addressing, eg.
-;     picgetgot ebx
+; PIC support macros.
+; x86_64 can't fit 64bit address literals in most instruction types,
+; so shared objects (under the assumption that they might be anywhere
+; in memory) must use an address mode that does fit.
+; So all accesses to global variables must use this macro, e.g.
 ;     mov eax, [foo GLOBAL]
 ; instead of
 ;     mov eax, [foo]
 ;
-; - picgetgot computes the GOT address into the given register in PIC
-;   mode, otherwise does nothing. You need to do this before using GLOBAL.
-;   Before in both execution order and compiled code order (so GLOBAL knows
-;   which register the GOT is in).
+; x86_32 doesn't require PIC.
+; Some distros prefer shared objects to be PIC, but nothing breaks if
+; the code contains a few textrels, so we'll skip that complexity.
-%ifndef PIC
-    %define GLOBAL
-    %macro picgetgot 1
-    %endmacro
-%elifdef ARCH_X86_64
-    %define PIC64
+%ifdef WIN64
+    %define PIC
+%elifndef ARCH_X86_64
+    %undef PIC
+%endif
+%ifdef PIC
     %define GLOBAL wrt rip
-    %macro picgetgot 1
-    %endmacro
 %else
-    %define PIC32
-    %ifidn __OUTPUT_FORMAT__,macho
-        ; There is no real global offset table on OS X, but we still
-        ; need to reference our variables by offset.
-        %macro picgetgot 1
-            call %%getgot
-            %%getgot:
-            pop %1
-            add %1, $$ - %%getgot
-            %undef GLOBAL
-            %define GLOBAL + %1 - fakegot
-        %endmacro
-    %else ; elf
-        extern _GLOBAL_OFFSET_TABLE_
-        %macro picgetgot 1
-            call %%getgot
-            %%getgot:
-            pop %1
-            add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%getgot wrt ..gotpc
-            %undef GLOBAL
-            %define GLOBAL + %1 wrt ..gotoff
-        %endmacro
-    %endif
+    %define GLOBAL
 %endif
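A minimal sketch of the GLOBAL convention (hypothetical names, assuming x86inc.asm is included): the same source line becomes a RIP-relative load where PIC is defined and a plain absolute load on x86_32.

    SECTION_RODATA
    example_pw_1: times 8 dw 1         ; hypothetical 16-byte constant

    SECTION .text
    INIT_XMM
    cglobal example_load_one, 0,0
        mova m0, [example_pw_1 GLOBAL] ; RIP-relative on x86_64, absolute on x86_32
        RET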
 ; Macros to eliminate most code duplication between x86_32 and x86_64:
@@ -99,14 +78,14 @@
 ; PROLOGUE:
 ; %1 = number of arguments. loads them from stack if needed.
-; %2 = number of registers used, not including PIC. pushes callee-saved regs if needed.
-; %3 = whether global constants are used in this function. inits x86_32 PIC if needed.
+; %2 = number of registers used. pushes callee-saved regs if needed.
+; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
 ; %4 = list of names to define to registers
 ; PROLOGUE can also be invoked by adding the same options to cglobal
 ; e.g.
-; cglobal foo, 2,3,0, dst, src, tmp
-; declares a function (foo), taking two args (dst and src), one local variable (tmp), and not using globals
+; cglobal foo, 2,3, dst, src, tmp
+; declares a function (foo), taking two args (dst and src) and one local variable (tmp)
 ; TODO Some functions can use some args directly from the stack. If they're the
 ; last args then you can just not declare them, but if they're in the middle
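For illustration, a hedged sketch of the declaration form documented above (hypothetical function, assuming x86inc.asm is included): two arguments, two registers, eight xmm registers, with dstq/srcq becoming aliases for r0/r1; on Win64 the same line also makes the prologue save xmm6/xmm7.

    INIT_XMM
    cglobal example_add_row, 2,2,8, dst, src
        mova  m0, [dstq]               ; assumes 16-byte-aligned pointers
        mova  m7, [srcq]
        paddw m0, m7
        mova  [dstq], m0
        RET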
@@ -119,12 +98,25 @@
 ; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
 ; which are slow when a normal ret follows a branch.
+; registers:
+; rN and rNq are the native-size register holding function argument N
+; rNd, rNw, rNb are dword, word, and byte size
+; rNm is the original location of arg N (a register or on the stack), dword
+; rNmp is native size
 %macro DECLARE_REG 6
     %define r%1q %2
     %define r%1d %3
     %define r%1w %4
     %define r%1b %5
     %define r%1m %6
+    %ifid %6 ; i.e. it's a register
+        %define r%1mp %2
+    %elifdef ARCH_X86_64 ; memory
+        %define r%1mp qword %6
+    %else
+        %define r%1mp dword %6
+    %endif
     %define r%1 %2
 %endmacro
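This is what the deblocking change earlier in the commit relies on: mov r3, r4mp reloads argument 4 at its native size, so a pointer argument that lives on the stack is not truncated to 32 bits on x86_64. A minimal hypothetical sketch:

    INIT_MMX
    cglobal example_use_arg4, 5,5
        mov  r3, r4mp              ; pointer-sized reload of the fifth argument
        movd m0, [r3]
        RET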
@@ -150,6 +142,29 @@ DECLARE_REG_SIZE si, sil
 DECLARE_REG_SIZE di, dil
 DECLARE_REG_SIZE bp, bpl
+; t# defines for when per-arch register allocation is more complex than just function arguments
+%macro DECLARE_REG_TMP 1-*
+    %assign %%i 0
+    %rep %0
+        CAT_XDEFINE t, %%i, r%1
+        %assign %%i %%i+1
+        %rotate 1
+    %endrep
+%endmacro
+%macro DECLARE_REG_TMP_SIZE 0-*
+    %rep %0
+        %define t%1q t%1 %+ q
+        %define t%1d t%1 %+ d
+        %define t%1w t%1 %+ w
+        %define t%1b t%1 %+ b
+        %rotate 1
+    %endrep
+%endmacro
+DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7
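A hypothetical sketch of the t# aliases (not from this commit): a shared body can be written against t0/t1 while each architecture or call site decides which physical argument registers they map to.

    DECLARE_REG_TMP 2,0            ; t0 = r2, t1 = r0
    cglobal example_tmp_regs, 3,3
        mov t0, t1                 ; expands to: mov r2, r0
        RET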
 %ifdef ARCH_X86_64
     %define gprsize 8
 %else
@@ -224,8 +239,7 @@ DECLARE_REG_SIZE bp, bpl
     %assign n_arg_names %%i
 %endmacro
-%ifdef ARCH_X86_64 ;==========================================================
-%ifidn __OUTPUT_FORMAT__,win32
+%ifdef WIN64 ; Windows x64 ;=================================================
 DECLARE_REG 0, rcx, ecx, cx, cl, ecx
 DECLARE_REG 1, rdx, edx, dx, dl, edx
@@ -239,11 +253,75 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56]
 %macro LOAD_IF_USED 2 ; reg_id, number_of_args
     %if %1 < %2
-        mov r%1, [rsp + 8 + %1*8]
+        mov r%1, [rsp + stack_offset + 8 + %1*8]
+    %endif
+%endmacro
+%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
+    ASSERT %2 >= %1
+    %assign regs_used %2
+    ASSERT regs_used <= 7
+    %if %0 > 2
+        %assign xmm_regs_used %3
+    %else
+        %assign xmm_regs_used 0
+    %endif
+    ASSERT xmm_regs_used <= 16
+    %if regs_used > 4
+        push r4
+        push r5
+        %assign stack_offset stack_offset+16
+    %endif
+    %if xmm_regs_used > 6
+        sub rsp, (xmm_regs_used-6)*16+16
+        %assign stack_offset stack_offset+(xmm_regs_used-6)*16+16
+        %assign %%i xmm_regs_used
+        %rep (xmm_regs_used-6)
+            %assign %%i %%i-1
+            movdqa [rsp + (%%i-6)*16+8], xmm %+ %%i
+        %endrep
+    %endif
+    LOAD_IF_USED 4, %1
+    LOAD_IF_USED 5, %1
+    LOAD_IF_USED 6, %1
+    DEFINE_ARGS %4
+%endmacro
+%macro RESTORE_XMM_INTERNAL 1
+    %if xmm_regs_used > 6
+        %assign %%i xmm_regs_used
+        %rep (xmm_regs_used-6)
+            %assign %%i %%i-1
+            movdqa xmm %+ %%i, [%1 + (%%i-6)*16+8]
+        %endrep
+        add %1, (xmm_regs_used-6)*16+16
+    %endif
+%endmacro
+%macro RESTORE_XMM 1
+    RESTORE_XMM_INTERNAL %1
+    %assign stack_offset stack_offset-(xmm_regs_used-6)*16+16
+    %assign xmm_regs_used 0
+%endmacro
+%macro RET 0
+    RESTORE_XMM_INTERNAL rsp
+    %if regs_used > 4
+        pop r5
+        pop r4
     %endif
+    ret
 %endmacro
-%else ;=======================================================================
+%macro REP_RET 0
+    %if regs_used > 4 || xmm_regs_used > 6
+        RET
+    %else
+        rep ret
+    %endif
+%endmacro
+%elifdef ARCH_X86_64 ; *nix x64 ;=============================================
 DECLARE_REG 0, rdi, edi, di, dil, edi
 DECLARE_REG 1, rsi, esi, si, sil, esi
@@ -261,16 +339,9 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 8]
     %endif
 %endmacro
-%endif ; !WIN64
-%macro PROLOGUE 2-4+ 0 ; #args, #regs, pic, arg_names...
+%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
     ASSERT %2 >= %1
     ASSERT %2 <= 7
-    %assign stack_offset 0
-    %ifidn __OUTPUT_FORMAT__,win32
-        LOAD_IF_USED 4, %1
-        LOAD_IF_USED 5, %1
-    %endif
     LOAD_IF_USED 6, %1
     DEFINE_ARGS %4
 %endmacro
@@ -315,15 +386,9 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
     %endif
 %endmacro
-%macro PROLOGUE 2-4+ 0 ; #args, #regs, pic, arg_names...
+%macro PROLOGUE 2-4+ ; #args, #regs, arg_names...
     ASSERT %2 >= %1
-    %assign stack_offset 0
     %assign regs_used %2
-    %ifdef PIC
-        %if %3
-            %assign regs_used regs_used+1
-        %endif
-    %endif
     ASSERT regs_used <= 7
     PUSH_IF_USED 3
     PUSH_IF_USED 4
@@ -336,9 +401,6 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
     LOAD_IF_USED 4, %1
     LOAD_IF_USED 5, %1
     LOAD_IF_USED 6, %1
-    %if %3
-        picgetgot r%2
-    %endif
     DEFINE_ARGS %4
 %endmacro
@@ -382,6 +444,7 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
     align function_align
     %1:
     RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
+    %assign stack_offset 0
     %if %0 > 1
         PROLOGUE %2
     %endif
@@ -389,11 +452,9 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
 %macro cextern 1
     %ifdef PREFIX
-        extern _%1
-        %define %1 _%1
-    %else
-        extern %1
+        %xdefine %1 _%1
     %endif
+    extern %1
 %endmacro
 ; This is needed for ELF, otherwise the GNU linker assumes the stack is
@@ -523,6 +584,7 @@ INIT_MMX
     %assign %%i 0
     %rep num_mmregs
         CAT_XDEFINE m, %%i, %1_m %+ %%i
+        CAT_XDEFINE n, m %+ %%i, %%i
         %assign %%i %%i+1
     %endrep
 %endmacro
@@ -534,7 +596,30 @@ INIT_MMX
     %endif
 %endmacro
-; substitutions which are functionally identical but reduce code size
+;Substitutions that reduce instruction size but are functionally equivalent
 %define movdqa movaps
 %define movdqu movups
+%macro add 2
+    %ifnum %2
+        %if %2==128
+            sub %1, -128
+        %else
+            add %1, %2
+        %endif
+    %else
+        add %1, %2
+    %endif
+%endmacro
+%macro sub 2
+    %ifnum %2
+        %if %2==128
+            add %1, -128
+        %else
+            sub %1, %2
+        %endif
+    %else
+        sub %1, %2
+    %endif
+%endmacro
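The point of wrapping add/sub is instruction size: x86 sign-extends 8-bit immediates, so -128 fits in an imm8 encoding while +128 needs a full 32-bit immediate. A hypothetical call site:

    add r0, 128                    ; assembled as "sub r0, -128", 3 bytes shorter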
@@ -93,7 +93,7 @@
     SBUTTERFLY qdq, %4, %8, %2
     SWAP %2, %5
     SWAP %4, %7
-    %if 0<11
+    %if %0<11
         movdqa m%5, %10
     %endif
 %endif
@@ -165,28 +165,203 @@
     palignr %1, %2, %3
 %endmacro
-%macro SUMSUB_BA 2
+%macro DEINTB 5 ; mask, reg1, mask, reg2, optional src to fill masks from
+    %ifnum %5
+        mova m%1, m%5
+        mova m%3, m%5
+    %else
+        mova m%1, %5
+        mova m%3, m%1
+    %endif
+    pand  m%1, m%2 ; dst .. y6 .. y4
+    pand  m%3, m%4 ; src .. y6 .. y4
+    psrlw m%2, 8   ; dst .. y7 .. y5
+    psrlw m%4, 8   ; src .. y7 .. y5
+%endmacro
+%macro SUMSUB_BA 2-3
+    %if %0==2
     paddw %1, %2
     paddw %2, %2
     psubw %2, %1
+    %else
+        mova  %3, %1
+        paddw %1, %2
+        psubw %2, %3
+    %endif
 %endmacro
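For reference, the two-operand form above is an in-place butterfly: after "paddw %1, %2" the first register holds a+b, and "paddw %2, %2" followed by "psubw %2, %1" leaves 2b-(a+b) = b-a in the second, so no temporary is needed. The optional third operand lets callers that already have a scratch register compute the same (a+b, b-a) pair with a plain copy instead of the doubling trick.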
-%macro SUMSUB_BADC 4
+%macro SUMSUB_BADC 4-5
+    %if %0==5
+        SUMSUB_BA %1, %2, %5
+        SUMSUB_BA %3, %4, %5
+    %else
     paddw %1, %2
     paddw %3, %4
     paddw %2, %2
     paddw %4, %4
     psubw %2, %1
     psubw %4, %3
+    %endif
 %endmacro
-%macro HADAMARD8_1D 8
-    SUMSUB_BADC %1, %5, %2, %6
-    SUMSUB_BADC %3, %7, %4, %8
+%macro HADAMARD4_V 4+
+    SUMSUB_BADC %1, %2, %3, %4
     SUMSUB_BADC %1, %3, %2, %4
-    SUMSUB_BADC %5, %7, %6, %8
+%endmacro
+%macro HADAMARD8_V 8+
     SUMSUB_BADC %1, %2, %3, %4
     SUMSUB_BADC %5, %6, %7, %8
+    SUMSUB_BADC %1, %3, %2, %4
+    SUMSUB_BADC %5, %7, %6, %8
+    SUMSUB_BADC %1, %5, %2, %6
+    SUMSUB_BADC %3, %7, %4, %8
 %endmacro
+%macro TRANS_SSE2 5-6
+; TRANSPOSE2x2
+; %1: transpose width (d/q) - use SBUTTERFLY qdq for dq
+; %2: ord/unord (for compat with sse4, unused)
+; %3/%4: source regs
+; %5/%6: tmp regs
+    %ifidn %1, d
+        %define mask [mask_10 GLOBAL]
+        %define shift 16
+    %elifidn %1, q
+        %define mask [mask_1100 GLOBAL]
+        %define shift 32
+    %endif
+    %if %0==6 ; less dependency if we have two tmp
+        mova   m%5, mask  ; ff00
+        mova   m%6, m%4   ; x5x4
+        psll%1 m%4, shift ; x4..
+        pand   m%6, m%5   ; x5..
+        pandn  m%5, m%3   ; ..x0
+        psrl%1 m%3, shift ; ..x1
+        por    m%4, m%5   ; x4x0
+        por    m%3, m%6   ; x5x1
+    %else ; more dependency, one insn less. sometimes faster, sometimes not
+        mova   m%5, m%4   ; x5x4
+        psll%1 m%4, shift ; x4..
+        pxor   m%4, m%3   ; (x4^x1)x0
+        pand   m%4, mask  ; (x4^x1)..
+        pxor   m%3, m%4   ; x4x0
+        psrl%1 m%4, shift ; ..(x1^x4)
+        pxor   m%5, m%4   ; x5x1
+        SWAP   %4, %3, %5
+    %endif
+%endmacro
+%macro TRANS_SSE4 5-6 ; see above
+    %ifidn %1, d
+        mova    m%5, m%3
+        %ifidn %2, ord
+            psrl%1 m%3, 16
+        %endif
+        pblendw m%3, m%4, 10101010b
+        psll%1  m%4, 16
+        %ifidn %2, ord
+            pblendw m%4, m%5, 01010101b
+        %else
+            psrl%1 m%5, 16
+            por    m%4, m%5
+        %endif
+    %elifidn %1, q
+        mova   m%5, m%3
+        shufps m%3, m%4, 10001000b
+        shufps m%5, m%4, 11011101b
+        SWAP   %4, %5
+    %endif
+%endmacro
+%macro HADAMARD 5-6
+; %1=distance in words (0 for vertical pass, 1/2/4 for horizontal passes)
+; %2=sumsub/max/amax (sum and diff / maximum / maximum of absolutes)
+; %3/%4: regs
+; %5(%6): tmpregs
+    %if %1!=0 ; have to reorder stuff for horizontal op
+        %ifidn %2, sumsub
+            %define ORDER ord
+            ; sumsub needs order because a-b != b-a unless a=b
+        %else
+            %define ORDER unord
+            ; if we just max, order doesn't matter (allows pblendw+or in sse4)
+        %endif
+        %if %1==1
+            TRANS d, ORDER, %3, %4, %5, %6
+        %elif %1==2
+            %if mmsize==8
+                SBUTTERFLY dq, %3, %4, %5
+            %else
+                TRANS q, ORDER, %3, %4, %5, %6
+            %endif
+        %elif %1==4
+            SBUTTERFLY qdq, %3, %4, %5
+        %endif
+    %endif
+    %ifidn %2, sumsub
+        SUMSUB_BA m%3, m%4, m%5
+    %else
+        %ifidn %2, amax
+            %if %0==6
+                ABS2 m%3, m%4, m%5, m%6
+            %else
+                ABS1 m%3, m%5
+                ABS1 m%4, m%5
+            %endif
+        %endif
+        pmaxsw m%3, m%4
+    %endif
+%endmacro
+%macro HADAMARD2_2D 6-7 sumsub
+    HADAMARD 0, sumsub, %1, %2, %5
+    HADAMARD 0, sumsub, %3, %4, %5
+    SBUTTERFLY %6, %1, %2, %5
+    %ifnum %7
+        HADAMARD 0, amax, %1, %2, %5, %7
+    %else
+        HADAMARD 0, %7, %1, %2, %5
+    %endif
+    SBUTTERFLY %6, %3, %4, %5
+    %ifnum %7
+        HADAMARD 0, amax, %3, %4, %5, %7
+    %else
+        HADAMARD 0, %7, %3, %4, %5
+    %endif
+%endmacro
+%macro HADAMARD4_2D 5-6 sumsub
+    HADAMARD2_2D %1, %2, %3, %4, %5, wd
+    HADAMARD2_2D %1, %3, %2, %4, %5, dq, %6
+    SWAP %2, %3
+%endmacro
+%macro HADAMARD4_2D_SSE 5-6 sumsub
+    HADAMARD  0, sumsub, %1, %2, %5 ; 1st V row 0 + 1
+    HADAMARD  0, sumsub, %3, %4, %5 ; 1st V row 2 + 3
+    SBUTTERFLY   wd, %1, %2, %5     ; %1: m0 1+0 %2: m1 1+0
+    SBUTTERFLY   wd, %3, %4, %5     ; %3: m0 3+2 %4: m1 3+2
+    HADAMARD2_2D %1, %3, %2, %4, %5, dq
+    SBUTTERFLY  qdq, %1, %2, %5
+    HADAMARD  0, %6, %1, %2, %5     ; 2nd H m1/m0 row 0+1
+    SBUTTERFLY  qdq, %3, %4, %5
+    HADAMARD  0, %6, %3, %4, %5     ; 2nd H m1/m0 row 2+3
+%endmacro
+%macro HADAMARD8_2D 9-10 sumsub
+    HADAMARD2_2D %1, %2, %3, %4, %9, wd
+    HADAMARD2_2D %5, %6, %7, %8, %9, wd
+    HADAMARD2_2D %1, %3, %2, %4, %9, dq
+    HADAMARD2_2D %5, %7, %6, %8, %9, dq
+    HADAMARD2_2D %1, %5, %3, %7, %9, qdq, %10
+    HADAMARD2_2D %2, %6, %4, %8, %9, qdq, %10
+    %ifnidn %10, amax
+        SWAP %2, %5
+        SWAP %4, %7
+    %endif
 %endmacro
 %macro SUMSUB2_AB 3
@@ -197,13 +372,49 @@
     psubw %3, %2
 %endmacro
+%macro SUMSUB2_BA 3
+    mova  m%3, m%1
+    paddw m%1, m%2
+    paddw m%1, m%2
+    psubw m%2, m%3
+    psubw m%2, m%3
+%endmacro
 %macro SUMSUBD2_AB 4
     mova  %4, %1
     mova  %3, %2
     psraw %2, 1
-    psraw %4, 1
-    paddw %1, %2
-    psubw %4, %3
+    psraw %1, 1
+    paddw %2, %4
+    psubw %1, %3
 %endmacro
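Worked through, the reordered SUMSUBD2_AB now returns its results in %1 and %2 rather than %1 and %4: with inputs a in %1 and b in %2 it produces %2 = a + (b>>1) and %1 = (a>>1) - b, which is the odd-part butterfly expected by the IDCT4_1D macro added just below.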
+%macro DCT4_1D 5
+    %ifnum %5
+        SUMSUB_BADC m%4, m%1, m%3, m%2; m%5
+        SUMSUB_BA   m%3, m%4, m%5
+        SUMSUB2_AB  m%1, m%2, m%5
+        SWAP %1, %3, %4, %5, %2
+    %else
+        SUMSUB_BADC m%4, m%1, m%3, m%2
+        SUMSUB_BA   m%3, m%4
+        mova [%5], m%2
+        SUMSUB2_AB  m%1, [%5], m%2
+        SWAP %1, %3, %4, %2
+    %endif
+%endmacro
+%macro IDCT4_1D 5-6
+    %ifnum %5
+        SUMSUBD2_AB m%2, m%4, m%6, m%5
+        SUMSUB_BA   m%3, m%1, m%6
+        SUMSUB_BADC m%4, m%3, m%2, m%1, m%6
+    %else
+        SUMSUBD2_AB m%2, m%4, [%5], [%5+16]
+        SUMSUB_BA   m%3, m%1
+        SUMSUB_BADC m%4, m%3, m%2, m%1
+    %endif
+    SWAP %1, %4, %3
+%endmacro
 %macro LOAD_DIFF 5
@@ -222,17 +433,81 @@
     %endif
 %endmacro
-%macro LOAD_DIFF_8x4P 6-8 r0,r2 ; 4x dest, 2x temp, 2x pointer
-    LOAD_DIFF %1, %5, none, [%7],      [%8]
-    LOAD_DIFF %2, %6, none, [%7+r1],   [%8+r3]
-    LOAD_DIFF %3, %5, none, [%7+2*r1], [%8+2*r3]
-    LOAD_DIFF %4, %6, none, [%7+r4],   [%8+r5]
+%macro LOAD_DIFF8x4_SSE2 8
+    LOAD_DIFF m%1, m%5, m%6, [%7+%1*FENC_STRIDE], [%8+%1*FDEC_STRIDE]
+    LOAD_DIFF m%2, m%5, m%6, [%7+%2*FENC_STRIDE], [%8+%2*FDEC_STRIDE]
+    LOAD_DIFF m%3, m%5, m%6, [%7+%3*FENC_STRIDE], [%8+%3*FDEC_STRIDE]
+    LOAD_DIFF m%4, m%5, m%6, [%7+%4*FENC_STRIDE], [%8+%4*FDEC_STRIDE]
 %endmacro
-%macro STORE_DIFF 4
+%macro LOAD_DIFF8x4_SSSE3 8 ; 4x dst, 1x tmp, 1x mul, 2x ptr
+    movh      m%2, [%8+%1*FDEC_STRIDE]
+    movh      m%1, [%7+%1*FENC_STRIDE]
+    punpcklbw m%1, m%2
+    movh      m%3, [%8+%2*FDEC_STRIDE]
+    movh      m%2, [%7+%2*FENC_STRIDE]
+    punpcklbw m%2, m%3
+    movh      m%4, [%8+%3*FDEC_STRIDE]
+    movh      m%3, [%7+%3*FENC_STRIDE]
+    punpcklbw m%3, m%4
+    movh      m%5, [%8+%4*FDEC_STRIDE]
+    movh      m%4, [%7+%4*FENC_STRIDE]
+    punpcklbw m%4, m%5
+    pmaddubsw m%1, m%6
+    pmaddubsw m%2, m%6
+    pmaddubsw m%3, m%6
+    pmaddubsw m%4, m%6
+%endmacro
+%macro STORE_DCT 6
+    movq   [%5+%6+ 0], m%1
+    movq   [%5+%6+ 8], m%2
+    movq   [%5+%6+16], m%3
+    movq   [%5+%6+24], m%4
+    movhps [%5+%6+32], m%1
+    movhps [%5+%6+40], m%2
+    movhps [%5+%6+48], m%3
+    movhps [%5+%6+56], m%4
+%endmacro
+%macro STORE_IDCT 4
+    movhps [r0-4*FDEC_STRIDE], %1
+    movh   [r0-3*FDEC_STRIDE], %1
+    movhps [r0-2*FDEC_STRIDE], %2
+    movh   [r0-1*FDEC_STRIDE], %2
+    movhps [r0+0*FDEC_STRIDE], %3
+    movh   [r0+1*FDEC_STRIDE], %3
+    movhps [r0+2*FDEC_STRIDE], %4
+    movh   [r0+3*FDEC_STRIDE], %4
+%endmacro
+%macro LOAD_DIFF_8x4P 7-10 r0,r2,0 ; 4x dest, 2x temp, 2x pointer, increment?
+    LOAD_DIFF m%1, m%5, m%7, [%8],      [%9]
+    LOAD_DIFF m%2, m%6, m%7, [%8+r1],   [%9+r3]
+    LOAD_DIFF m%3, m%5, m%7, [%8+2*r1], [%9+2*r3]
+    LOAD_DIFF m%4, m%6, m%7, [%8+r4],   [%9+r5]
+    %if %10
+        lea %8, [%8+4*r1]
+        lea %9, [%9+4*r3]
+    %endif
+%endmacro
+%macro DIFFx2 6-7
+    movh      %3, %5
+    punpcklbw %3, %4
     psraw     %1, 6
+    paddsw    %1, %3
+    movh      %3, %6
+    punpcklbw %3, %4
+    psraw     %2, 6
+    paddsw    %2, %3
+    packuswb  %2, %1
+%endmacro
+%macro STORE_DIFF 4
     movh      %2, %4
     punpcklbw %2, %3
+    psraw     %1, 6
     paddsw    %1, %2
     packuswb  %1, %1
     movh      %4, %1
...