Commit bbe4a6db authored by Henrik Gramner, committed by Derek Buitenhuis

x86inc: Utilize the shadow space on 64-bit Windows

Store XMM6 and XMM7 in the shadow space in functions that
clobber them. This way we don't have to adjust the stack
pointer as often, reducing the number of instructions as
well as code size.
Signed-off-by: Derek Buitenhuis <derek.buitenhuis@gmail.com>
parent 3fb78e99
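Background for the patch: the Win64 calling convention requires the caller to reserve 32 bytes of "shadow space" directly above the return address, and the callee may use that space freely. Since rsp+8 is 16-byte aligned at function entry, XMM6 and XMM7 fit there with aligned stores and no stack-pointer adjustment. A minimal standalone sketch of the idea, not part of the patch (hypothetical label, plain NASM, assuming no GPR pushes have been made):

hypothetical_win64_func:
    movaps [rsp + 8],  xmm6    ; shadow space bytes  8..23 (16-byte aligned at entry)
    movaps [rsp + 24], xmm7    ; shadow space bytes 24..39
    ; ... body may clobber xmm6/xmm7 freely ...
    movaps xmm6, [rsp + 8]
    movaps xmm7, [rsp + 24]
    ret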
@@ -667,13 +667,13 @@ cglobal imdct_calc, 3,5,3
     push r1
     push r0
 %else
-    sub  rsp, 8
+    sub  rsp, 8+32*WIN64 ; allocate win64 shadow space
 %endif
     call r4
 %if ARCH_X86_32
     add  esp, 12
 %else
-    add  rsp, 8
+    add  rsp, 8+32*WIN64
 %endif
     POP  r1
     POP  r3
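x86inc defines WIN64 as 1 on 64-bit Windows and 0 elsewhere, so the expression above folds to a constant at assembly time. A hedged sketch of the idiom with a hypothetical callee name:

; 8+32*WIN64 assembles to 40 on Win64 and to 8 elsewhere: the original
; 8-byte adjustment (stack alignment) plus the 32-byte shadow space the
; callee is now entitled to use
    sub  rsp, 8+32*WIN64
    call some_x86inc_func     ; hypothetical callee that may spill xmm6/xmm7
                              ; into our shadow space
    add  rsp, 8+32*WIN64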
...
@@ -331,16 +331,14 @@ cglobal deblock_v_luma_8, 5,5,10
 ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
 INIT_MMX cpuname
-cglobal deblock_h_luma_8, 5,9
+cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64
     movsxd r7, r1d
     lea    r8, [r7+r7*2]
     lea    r6, [r0-4]
     lea    r5, [r0-4+r8]
 %if WIN64
-    sub    rsp, 0x98
-    %define pix_tmp rsp+0x30
+    %define pix_tmp rsp+0x30 ; shadow space + r4
 %else
-    sub    rsp, 0x68
     %define pix_tmp rsp
 %endif
@@ -379,11 +377,6 @@ cglobal deblock_h_luma_8, 5,9
     movq   m3, [pix_tmp+0x40]
     TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)
-%if WIN64
-    add    rsp, 0x98
-%else
-    add    rsp, 0x68
-%endif
     RET
 %endmacro
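The two hunks above drop the explicit sub/add rsp pair in favour of cglobal's optional stack-size argument (name, number of arguments, GPRs, XMM registers, stack bytes), letting the prologue and RET handle allocation, alignment, and the Win64 shadow space in one place. A hedged usage sketch with a hypothetical function name:

; hypothetical function: 5 args, 9 GPRs, no XMM spill bookkeeping,
; 0x60 bytes of scratch space plus 16 extra bytes on Win64
cglobal example_deblock, 5,9,0,0x60+16*WIN64
    mov  [rsp], r0    ; scratch space is addressed from rsp
    ; ... body ...
    RET               ; frees the stack; no matching "add rsp" needed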
@@ -704,13 +697,16 @@ INIT_MMX cpuname
 ;-----------------------------------------------------------------------------
 ; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal deblock_h_luma_intra_8, 4,9
+cglobal deblock_h_luma_intra_8, 4,9,0,0x80
     movsxd r7, r1d
     lea    r8, [r7*3]
     lea    r6, [r0-4]
     lea    r5, [r0-4+r8]
-    sub    rsp, 0x88
+%if WIN64
+    %define pix_tmp rsp+0x20 ; shadow space
+%else
     %define pix_tmp rsp
+%endif
     ; transpose 8x16 -> tmp space
     TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
@@ -730,7 +726,6 @@ cglobal deblock_h_luma_intra_8, 4,9
     sub    r5, r7
     shr    r7, 3
     TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
-    add    rsp, 0x88
     RET
 %else
 cglobal deblock_h_luma_intra_8, 2,4,8,0x80
...
@@ -334,14 +334,18 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
             %if stack_size < 0
                 %assign stack_size -stack_size
             %endif
-            %if mmsize != 8
-                %assign xmm_regs_used %2
+            %assign stack_size_padded stack_size
+            %if WIN64
+                %assign stack_size_padded stack_size_padded + 32 ; reserve 32 bytes for shadow space
+                %if mmsize != 8
+                    %assign xmm_regs_used %2
+                    %if xmm_regs_used > 8
+                        %assign stack_size_padded stack_size_padded + (xmm_regs_used-8)*16
+                    %endif
+                %endif
             %endif
             %if mmsize <= 16 && HAVE_ALIGNED_STACK
-                %assign stack_size_padded stack_size + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1))
-                %if xmm_regs_used > 6
-                    %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16
-                %endif
+                %assign stack_size_padded stack_size_padded + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1))
                 SUB rsp, stack_size_padded
             %else
                 %assign %%reg_num (regs_used - 1)
@@ -351,14 +355,6 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
                 ; stack in a single instruction (i.e. mov rsp, rstk or mov
                 ; rsp, [rsp+stack_size_padded])
                 mov  rstk, rsp
-                %assign stack_size_padded stack_size
-                %if xmm_regs_used > 6
-                    %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16
-                    %if mmsize == 32 && xmm_regs_used & 1
-                        ; re-align to 32 bytes
-                        %assign stack_size_padded (stack_size_padded + 16)
-                    %endif
-                %endif
                 %if %1 < 0 ; need to store rsp on stack
                     sub  rsp, gprsize+stack_size_padded
                     and  rsp, ~(%%stack_alignment-1)
@@ -370,9 +366,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
                     %xdefine rstkm rstk
                 %endif
             %endif
-            %if xmm_regs_used > 6
-                WIN64_PUSH_XMM
-            %endif
+            WIN64_PUSH_XMM
         %endif
     %endif
 %endmacro
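To make the new arithmetic concrete, here is a hedged worked example of the WIN64 aligned-stack path above, assuming a hypothetical function that declares stack_size = 0x40, uses 10 XMM registers, and has pushed no GPRs (so stack_offset is 0 when ALLOC_STACK runs):

;   stack_size_padded = 0x40          ; requested stack space
;                     + 32            ; win64 shadow space reservation
;                     + (10-8)*16     ; spill slots for xmm8 and xmm9
;                     + 16 - 8 - 0    ; %%stack_alignment - gprsize - (stack_offset & 15)
;                     = 0x88
; ALLOC_STACK and WIN64_PUSH_XMM then emit roughly:
    sub    rsp, 0x88                  ; stack_offset becomes 0x88
    movaps [rsp + 0x88 + 8],  xmm6    ; rstk + stack_offset + 8  -> incoming shadow space
    movaps [rsp + 0x88 + 24], xmm7    ; rstk + stack_offset + 24
    movaps [rsp + 0x40 + 32], xmm8    ; (8-8)*16 + stack_size + 32
    movaps [rsp + 0x40 + 48], xmm9    ; (9-8)*16 + stack_size + 32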
@@ -433,40 +427,55 @@ DECLARE_REG 14, R15, 120
 %endmacro

 %macro WIN64_PUSH_XMM 0
-    %assign %%i xmm_regs_used
-    %rep (xmm_regs_used-6)
-        %assign %%i %%i-1
-        movaps [rsp + (%%i-6)*16 + stack_size + (~stack_offset&8)], xmm %+ %%i
-    %endrep
+    ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
+    %if xmm_regs_used > 6
+        movaps [rstk + stack_offset + 8], xmm6
+    %endif
+    %if xmm_regs_used > 7
+        movaps [rstk + stack_offset + 24], xmm7
+    %endif
+    %if xmm_regs_used > 8
+        %assign %%i 8
+        %rep xmm_regs_used-8
+            movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
+            %assign %%i %%i+1
+        %endrep
+    %endif
 %endmacro

 %macro WIN64_SPILL_XMM 1
     %assign xmm_regs_used %1
     ASSERT xmm_regs_used <= 16
-    %if xmm_regs_used > 6
-        SUB rsp, (xmm_regs_used-6)*16+16
-        WIN64_PUSH_XMM
+    %if xmm_regs_used > 8
+        %assign stack_size_padded (xmm_regs_used-8)*16 + (~stack_offset&8) + 32
+        SUB rsp, stack_size_padded
     %endif
+    WIN64_PUSH_XMM
 %endmacro

 %macro WIN64_RESTORE_XMM_INTERNAL 1
-    %if xmm_regs_used > 6
+    %assign %%pad_size 0
+    %if xmm_regs_used > 8
         %assign %%i xmm_regs_used
-        %rep (xmm_regs_used-6)
+        %rep xmm_regs_used-8
             %assign %%i %%i-1
-            movaps xmm %+ %%i, [%1 + (%%i-6)*16+stack_size+(~stack_offset&8)]
+            movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32]
         %endrep
-        %if stack_size_padded == 0
-            add %1, (xmm_regs_used-6)*16+16
-        %endif
     %endif
     %if stack_size_padded > 0
         %if stack_size > 0 && (mmsize == 32 || HAVE_ALIGNED_STACK == 0)
             mov rsp, rstkm
         %else
             add %1, stack_size_padded
+            %assign %%pad_size stack_size_padded
         %endif
     %endif
+    %if xmm_regs_used > 7
+        movaps xmm7, [%1 + stack_offset - %%pad_size + 24]
+    %endif
+    %if xmm_regs_used > 6
+        movaps xmm6, [%1 + stack_offset - %%pad_size + 8]
+    %endif
 %endmacro

 %macro WIN64_RESTORE_XMM 1
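For functions that use WIN64_SPILL_XMM directly with no other stack allocation (so stack_size == 0) and no prior pushes, a hedged sketch of roughly what the new prologue and epilogue expand to for 9 XMM registers:

; WIN64_SPILL_XMM 9: allocate only what xmm8 needs; xmm6/xmm7 use the shadow space
    sub    rsp, 0x38            ; (9-8)*16 + (~stack_offset & 8) + 32 = 56
    movaps [rsp + 0x40], xmm6   ; rstk + stack_offset + 8  = entry rsp + 8
    movaps [rsp + 0x50], xmm7   ; rstk + stack_offset + 24 = entry rsp + 24
    movaps [rsp + 0x20], xmm8   ; regular spill slot above the 32 reserved bytes

; ... function body ...

; WIN64_RESTORE_XMM (epilogue)
    movaps xmm8, [rsp + 0x20]
    add    rsp, 0x38
    movaps xmm7, [rsp + 24]     ; back in the shadow space above the return address
    movaps xmm6, [rsp + 8]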
@@ -683,12 +692,12 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
     %endif
     align function_align
     %2:
-    RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
-    %xdefine rstk rsp
-    %assign stack_offset 0
-    %assign stack_size 0
-    %assign stack_size_padded 0
-    %assign xmm_regs_used 0
+    RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer
+    %xdefine rstk rsp    ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
+    %assign stack_offset 0      ; stack pointer offset relative to the return address
+    %assign stack_size 0        ; amount of stack space that can be freely used inside a function
+    %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
+    %assign xmm_regs_used 0     ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
     %ifnidn %3, ""
         PROLOGUE %3
     %endif
...