Commit 729f90e2 authored by Henrik Gramner's avatar Henrik Gramner Committed by Justin Ruggles

x86inc improvements for 64-bit

Add support for all x86-64 registers
Prefer caller-saved register over callee-saved on WIN64
Support up to 15 function arguments

Also (by Ronald S. Bultje)
Fix up our asm to work with new x86inc.asm.
Signed-off-by: 's avatarRonald S. Bultje <rsbultje@gmail.com>
Signed-off-by: 's avatarJustin Ruggles <justin.ruggles@gmail.com>
parent e1ce7568
......@@ -497,9 +497,9 @@ cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
%macro EMU_EDGE_FUNC 0
%if ARCH_X86_64
%define w_reg r10
cglobal emu_edge_core, 6, 7, 1
mov r11, r5 ; save block_h
%define w_reg r7
cglobal emu_edge_core, 6, 9, 1
mov r8, r5 ; save block_h
%else
%define w_reg r6
cglobal emu_edge_core, 2, 7, 0
......@@ -536,7 +536,7 @@ cglobal emu_edge_core, 2, 7, 0
sub r0, w_reg
%if ARCH_X86_64
mov r3, r0 ; backup of buf+block_h*linesize
mov r5, r11
mov r5, r8
%else
mov r0m, r0 ; backup of buf+block_h*linesize
mov r5, r5m
......@@ -550,7 +550,7 @@ cglobal emu_edge_core, 2, 7, 0
; FIXME we can do a if size == 1 here if that makes any speed difference, test me
sar w_reg, 1
sal w_reg, 6
; r0=buf+block_h*linesize,r10(64)/r6(32)=start_x offset for funcs
; r0=buf+block_h*linesize,r7(64)/r6(32)=start_x offset for funcs
; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h
%ifdef PIC
lea rax, [.emuedge_extend_left_2]
......@@ -560,7 +560,7 @@ cglobal emu_edge_core, 2, 7, 0
%endif
call w_reg
; now r3(64)/r0(32)=buf,r2=linesize,r11/r5=block_h,r6/r3=val, r10/r6=end_x, r1=block_w
; now r3(64)/r0(32)=buf,r2=linesize,r8/r5=block_h,r6/r3=val, r7/r6=end_x, r1=block_w
.right_extend:
%if ARCH_X86_32
mov r0, r0m
......@@ -591,7 +591,7 @@ cglobal emu_edge_core, 2, 7, 0
%define vall al
%define valh ah
%define valw ax
%define valw2 r10w
%define valw2 r7w
%define valw3 r3w
%if WIN64
%define valw4 r4w
......@@ -618,7 +618,7 @@ cglobal emu_edge_core, 2, 7, 0
; - else if (%2 & 8) fills 8 bytes into mm0
; - if (%2 & 7 == 4) fills the last 4 bytes into rax
; - else if (%2 & 4) fills 4 bytes into mm0-1
; - if (%2 & 3 == 3) fills 2 bytes into r10/r3, and 1 into eax
; - if (%2 & 3 == 3) fills 2 bytes into r7/r3, and 1 into eax
; (note that we're using r3 for body/bottom because it's a shorter
; opcode, and then the loop fits in 128 bytes)
; - else fills remaining bytes into rax
......@@ -848,7 +848,7 @@ ALIGN 64
%endrep
%endmacro ; LEFT_EXTEND
; r3/r0=buf+block_h*linesize, r2=linesize, r11/r5=block_h, r0/r6=end_x, r6/r3=val
; r3/r0=buf+block_h*linesize, r2=linesize, r8/r5=block_h, r0/r6=end_x, r6/r3=val
%macro RIGHT_EXTEND 0
%assign %%n 2
%rep 11
......@@ -858,7 +858,7 @@ ALIGN 64
sub r3, r2 ; dst -= linesize
READ_V_PIXEL %%n, [r3+w_reg-1] ; read pixels
WRITE_V_PIXEL %%n, r3+r4-%%n ; write pixels
dec r11
dec r8
%else ; ARCH_X86_32
sub r0, r2 ; dst -= linesize
READ_V_PIXEL %%n, [r0+w_reg-1] ; read pixels
......@@ -937,11 +937,11 @@ ALIGN 64
%macro SLOW_V_EXTEND 0
.slow_v_extend_loop:
; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r11(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r10(64)/r6(32)=w=end_x-start_x
; r8(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r7(64)/r6(32)=w=end_x-start_x
%if ARCH_X86_64
push r11 ; save old value of block_h
push r8 ; save old value of block_h
test r3, r3
%define cnt_reg r11
%define cnt_reg r8
jz .do_body_copy ; if (!start_y) goto do_body_copy
V_COPY_ROW top, r3
%else
......@@ -955,7 +955,7 @@ ALIGN 64
V_COPY_ROW body, r4
%if ARCH_X86_64
pop r11 ; restore old value of block_h
pop r8 ; restore old value of block_h
%define cnt_reg r3
%endif
test r5, r5
......@@ -974,7 +974,7 @@ ALIGN 64
%macro SLOW_LEFT_EXTEND 0
.slow_left_extend_loop:
; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r10/r6=start_x
; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r7/r6=start_x
mov r4, 8
sub r0, linesize
READ_V_PIXEL 8, [r0+w_reg]
......@@ -1002,11 +1002,11 @@ ALIGN 64
%macro SLOW_RIGHT_EXTEND 0
.slow_right_extend_loop:
; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r11(64)/r5(32)=block_h,
; r10(64)/r6(32)=end_x,r6/r3=val,r1=cntr
; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r8(64)/r5(32)=block_h,
; r7(64)/r6(32)=end_x,r6/r3=val,r1=cntr
%if ARCH_X86_64
%define buf_reg r3
%define bh_reg r11
%define bh_reg r8
%else
%define buf_reg r0
%define bh_reg r5
......
......@@ -749,14 +749,11 @@ INIT_XMM
%endmacro
%macro DECL_IMDCT 2
cglobal imdct_half%1, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *input
cglobal imdct_half%1, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input
%if ARCH_X86_64
%define rrevtab r10
%define rtcos r11
%define rtsin r12
push r12
push r13
push r14
%define rrevtab r7
%define rtcos r8
%define rtsin r9
%else
%define rrevtab r6
%define rtsin r6
......@@ -798,12 +795,12 @@ cglobal imdct_half%1, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *
%if ARCH_X86_64
movzx r5, word [rrevtab+r4-4]
movzx r6, word [rrevtab+r4-2]
movzx r13, word [rrevtab+r3]
movzx r14, word [rrevtab+r3+2]
movzx r10, word [rrevtab+r3]
movzx r11, word [rrevtab+r3+2]
movlps [r1+r5 *8], xmm0
movhps [r1+r6 *8], xmm0
movlps [r1+r13*8], xmm1
movhps [r1+r14*8], xmm1
movlps [r1+r10*8], xmm1
movhps [r1+r11*8], xmm1
add r4, 4
%else
mov r6, [esp]
......@@ -839,11 +836,7 @@ cglobal imdct_half%1, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *
mov r1, -mmsize
sub r1, r0
%2 r0, r1, r6, rtcos, rtsin
%if ARCH_X86_64
pop r14
pop r13
pop r12
%else
%if ARCH_X86_64 == 0
add esp, 12
%endif
%ifidn avx_enabled, 1
......
......@@ -179,9 +179,8 @@ FLOAT_TO_INT16_INTERLEAVE2 sse2
%macro FLOAT_TO_INT16_INTERLEAVE6 1
; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
cglobal float_to_int16_interleave6_%1, 2,8,0, dst, src, src1, src2, src3, src4, src5, len
%if ARCH_X86_64
%define lend r10d
mov lend, r2d
%else
%define lend dword r2m
......@@ -240,9 +239,8 @@ FLOAT_TO_INT16_INTERLEAVE6 3dn2
;-----------------------------------------------------------------------------
%macro FLOAT_INTERLEAVE6 2
cglobal float_interleave6_%1, 2,7,%2, dst, src, src1, src2, src3, src4, src5
cglobal float_interleave6_%1, 2,8,%2, dst, src, src1, src2, src3, src4, src5, len
%if ARCH_X86_64
%define lend r10d
mov lend, r2d
%else
%define lend dword r2m
......
......@@ -91,9 +91,22 @@ SECTION .text
%endmacro
%macro chroma_mc8_mmx_func 3
%ifidn %2, rv40
%ifdef PIC
%define rnd_1d_rv40 r8
%define rnd_2d_rv40 r8
%define extra_regs 2
%else ; no-PIC
%define rnd_1d_rv40 rnd_rv40_1d_tbl
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%define extra_regs 1
%endif ; PIC
%else
%define extra_regs 0
%endif ; rv40
; put/avg_h264_chroma_mc8_mmx_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/,
; int stride, int h, int mx, int my)
cglobal %1_%2_chroma_mc8_%3, 6, 7, 0
cglobal %1_%2_chroma_mc8_%3, 6, 7 + extra_regs, 0
%if ARCH_X86_64
movsxd r2, r2d
%endif
......@@ -106,19 +119,12 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 0
.at_least_one_non_zero
%ifidn %2, rv40
%ifdef PIC
%define rnd_1d_rv40 r11
%define rnd_2d_rv40 r11
%else ; no-PIC
%define rnd_1d_rv40 rnd_rv40_1d_tbl
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%endif
%if ARCH_X86_64
mov r10, r5
and r10, 6 ; &~1 for mx/my=[0,7]
lea r10, [r10*4+r4]
sar r10d, 1
%define rnd_bias r10
mov r7, r5
and r7, 6 ; &~1 for mx/my=[0,7]
lea r7, [r7*4+r4]
sar r7d, 1
%define rnd_bias r7
%define dest_reg r0
%else ; x86-32
mov r0, r5
......@@ -145,7 +151,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 0
%ifidn %2, rv40
%ifdef PIC
lea r11, [rnd_rv40_1d_tbl]
lea r8, [rnd_rv40_1d_tbl]
%endif
%if ARCH_X86_64 == 0
mov r5, r0m
......@@ -196,7 +202,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 0
movd m6, r5d ; y
%ifidn %2, rv40
%ifdef PIC
lea r11, [rnd_rv40_2d_tbl]
lea r8, [rnd_rv40_2d_tbl]
%endif
%if ARCH_X86_64 == 0
mov r5, r0m
......@@ -278,7 +284,13 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 0
%endmacro
%macro chroma_mc4_mmx_func 3
cglobal %1_%2_chroma_mc4_%3, 6, 6, 0
%define extra_regs 0
%ifidn %2, rv40
%ifdef PIC
%define extra_regs 1
%endif ; PIC
%endif ; rv40
cglobal %1_%2_chroma_mc4_%3, 6, 6 + extra_regs, 0
%if ARCH_X86_64
movsxd r2, r2d
%endif
......@@ -296,8 +308,8 @@ cglobal %1_%2_chroma_mc4_%3, 6, 6, 0
%ifidn %2, rv40
%ifdef PIC
lea r11, [rnd_rv40_2d_tbl]
%define rnd_2d_rv40 r11
lea r6, [rnd_rv40_2d_tbl]
%define rnd_2d_rv40 r6
%else
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%endif
......
......@@ -328,11 +328,11 @@ cglobal deblock_v_luma_8_%1, 5,5,10
; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal deblock_h_luma_8_%1, 5,7
movsxd r10, r1d
lea r11, [r10+r10*2]
cglobal deblock_h_luma_8_%1, 5,9
movsxd r7, r1d
lea r8, [r7+r7*2]
lea r6, [r0-4]
lea r5, [r0-4+r11]
lea r5, [r0-4+r8]
%if WIN64
sub rsp, 0x98
%define pix_tmp rsp+0x30
......@@ -342,14 +342,14 @@ cglobal deblock_h_luma_8_%1, 5,7
%endif
; transpose 6x16 -> tmp space
TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp
lea r6, [r6+r10*8]
lea r5, [r5+r10*8]
TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp+8
TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp
lea r6, [r6+r7*8]
lea r5, [r5+r7*8]
TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp+8
; vertical filter
; alpha, beta, tc0 are still in r2d, r3d, r4
; don't backup r6, r5, r10, r11 because deblock_v_luma_sse2 doesn't use them
; don't backup r6, r5, r7, r8 because deblock_v_luma_sse2 doesn't use them
lea r0, [pix_tmp+0x30]
mov r1d, 0x10
%if WIN64
......@@ -364,17 +364,17 @@ cglobal deblock_h_luma_8_%1, 5,7
movq m1, [pix_tmp+0x28]
movq m2, [pix_tmp+0x38]
movq m3, [pix_tmp+0x48]
TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11)
TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)
shl r10, 3
sub r6, r10
sub r5, r10
shr r10, 3
shl r7, 3
sub r6, r7
sub r5, r7
shr r7, 3
movq m0, [pix_tmp+0x10]
movq m1, [pix_tmp+0x20]
movq m2, [pix_tmp+0x30]
movq m3, [pix_tmp+0x40]
TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11)
TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)
%if WIN64
add rsp, 0x98
......@@ -705,32 +705,32 @@ INIT_MMX
;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_8_%1, 4,7
movsxd r10, r1d
lea r11, [r10*3]
cglobal deblock_h_luma_intra_8_%1, 4,9
movsxd r7, r1d
lea r8, [r7*3]
lea r6, [r0-4]
lea r5, [r0-4+r11]
lea r5, [r0-4+r8]
sub rsp, 0x88
%define pix_tmp rsp
; transpose 8x16 -> tmp space
TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
lea r6, [r6+r10*8]
lea r5, [r5+r10*8]
TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
lea r6, [r6+r7*8]
lea r5, [r5+r7*8]
TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
lea r0, [pix_tmp+0x40]
mov r1, 0x10
call deblock_v_luma_intra_8_%1
; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
lea r5, [r6+r11]
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
shl r10, 3
sub r6, r10
sub r5, r10
shr r10, 3
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
lea r5, [r6+r8]
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
shl r7, 3
sub r6, r7
sub r5, r7
shr r7, 3
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
add rsp, 0x88
RET
%else
......
This diff is collapsed.
......@@ -29,24 +29,6 @@ SECTION_RODATA
pw_pixel_max: times 8 dw ((1 << 10)-1)
pd_32: times 4 dd 32
scan8_mem: db 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
db 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
db 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
db 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
db 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
db 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
db 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
db 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
db 4+11*8, 5+11*8, 4+12*8, 5+12*8
db 6+11*8, 7+11*8, 6+12*8, 7+12*8
db 4+13*8, 5+13*8, 4+14*8, 5+14*8
db 6+13*8, 7+13*8, 6+14*8, 7+14*8
%ifdef PIC
%define scan8 r11
%else
%define scan8 scan8_mem
%endif
SECTION .text
......@@ -315,9 +297,9 @@ IDCT_ADD16INTRA_10 avx
; h264_idct_add8(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
%macro IDCT_ADD8 1
cglobal h264_idct_add8_10_%1,5,7,7
cglobal h264_idct_add8_10_%1,5,8,7
%if ARCH_X86_64
mov r10, r0
mov r7, r0
%endif
add r2, 1024
mov r0, [r0]
......@@ -325,7 +307,7 @@ cglobal h264_idct_add8_10_%1,5,7,7
ADD16_OP_INTRA %1, 18, 4+ 7*8
add r2, 1024-128*2
%if ARCH_X86_64
mov r0, [r10+gprsize]
mov r0, [r7+gprsize]
%else
mov r0, r0m
mov r0, [r0+gprsize]
......
......@@ -289,7 +289,7 @@ cglobal pred16x16_tm_vp8_sse2, 2,6,6
;-----------------------------------------------------------------------------
%macro H264_PRED16x16_PLANE 3
cglobal pred16x16_plane_%3_%1, 2, 7, %2
cglobal pred16x16_plane_%3_%1, 2, 9, %2
mov r2, r1 ; +stride
neg r1 ; -stride
......@@ -349,7 +349,7 @@ cglobal pred16x16_plane_%3_%1, 2, 7, %2
add r4, r2
%if ARCH_X86_64
%define e_reg r11
%define e_reg r8
%else
%define e_reg r0
%endif
......@@ -370,8 +370,8 @@ cglobal pred16x16_plane_%3_%1, 2, 7, %2
movzx e_reg, byte [r3 ]
%if ARCH_X86_64
movzx r10, byte [r4+r2 ]
sub r10, e_reg
movzx r7, byte [r4+r2 ]
sub r7, e_reg
%else
movzx r6, byte [r4+r2 ]
sub r6, e_reg
......@@ -386,7 +386,7 @@ cglobal pred16x16_plane_%3_%1, 2, 7, %2
movzx r6, byte [r3 ]
sub r6, r4
%if ARCH_X86_64
lea r6, [r10+r6*2]
lea r6, [r7+r6*2]
lea r5, [r5+r6*2]
add r5, r6
%else
......@@ -396,9 +396,9 @@ cglobal pred16x16_plane_%3_%1, 2, 7, %2
movzx r4, byte [e_reg ]
%if ARCH_X86_64
movzx r10, byte [r3 +r2 ]
sub r10, r4
sub r5, r10
movzx r7, byte [r3 +r2 ]
sub r7, r4
sub r5, r7
%else
movzx r6, byte [r3 +r2 ]
sub r6, r4
......@@ -410,7 +410,7 @@ cglobal pred16x16_plane_%3_%1, 2, 7, %2
movzx r6, byte [r3 +r2*2]
sub r6, r4
%if ARCH_X86_64
add r6, r10
add r6, r7
%endif
lea r5, [r5+r6*8]
......@@ -588,7 +588,7 @@ H264_PRED16x16_PLANE ssse3, 8, svq3
;-----------------------------------------------------------------------------
%macro H264_PRED8x8_PLANE 2
cglobal pred8x8_plane_%1, 2, 7, %2
cglobal pred8x8_plane_%1, 2, 9, %2
mov r2, r1 ; +stride
neg r1 ; -stride
......@@ -642,7 +642,7 @@ cglobal pred8x8_plane_%1, 2, 7, %2
add r4, r2
%if ARCH_X86_64
%define e_reg r11
%define e_reg r8
%else
%define e_reg r0
%endif
......@@ -653,9 +653,9 @@ cglobal pred8x8_plane_%1, 2, 7, %2
movzx e_reg, byte [r3 ]
%if ARCH_X86_64
movzx r10, byte [r4+r2 ]
sub r10, e_reg
sub r5, r10
movzx r7, byte [r4+r2 ]
sub r7, e_reg
sub r5, r7
%else
movzx r6, byte [r4+r2 ]
sub r6, e_reg
......@@ -667,7 +667,7 @@ cglobal pred8x8_plane_%1, 2, 7, %2
movzx r6, byte [r4+r2*2 ]
sub r6, e_reg
%if ARCH_X86_64
add r6, r10
add r6, r7
%endif
lea r5, [r5+r6*4]
......
......@@ -121,8 +121,8 @@ MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8
%endmacro
%macro MCAxA_OP 8
cglobal %2_h264_qpel%5_%3_10_%1, %6,%7,%8
%if ARCH_X86_32
cglobal %2_h264_qpel%5_%3_10_%1, %6,%7,%8
call stub_%2_h264_qpel%4_%3_10_%1
mov r0, r0m
mov r1, r1m
......@@ -141,17 +141,19 @@ cglobal %2_h264_qpel%5_%3_10_%1, %6,%7,%8
call stub_%2_h264_qpel%4_%3_10_%1
RET
%else ; ARCH_X86_64
mov r10, r0
mov r11, r1
cglobal %2_h264_qpel%5_%3_10_%1, %6,%7 + 2,%8
mov r%7, r0
%assign p1 %7+1
mov r %+ p1, r1
call stub_%2_h264_qpel%4_%3_10_%1
lea r0, [r10+%4*2]
lea r1, [r11+%4*2]
lea r0, [r%7+%4*2]
lea r1, [r %+ p1+%4*2]
call stub_%2_h264_qpel%4_%3_10_%1
lea r0, [r10+r2*%4]
lea r1, [r11+r2*%4]
lea r0, [r%7+r2*%4]
lea r1, [r %+ p1+r2*%4]
call stub_%2_h264_qpel%4_%3_10_%1
lea r0, [r10+r2*%4+%4*2]
lea r1, [r11+r2*%4+%4*2]
lea r0, [r%7+r2*%4+%4*2]
lea r1, [r %+ p1+r2*%4+%4*2]
%if UNIX64 == 0 ; fall through to function
call stub_%2_h264_qpel%4_%3_10_%1
RET
......
......@@ -127,7 +127,7 @@ WEIGHT_FUNC_HALF_MM 8, 8, sse2
%macro BIWEIGHT_SETUP 0
%if ARCH_X86_64
%define off_regd r11d
%define off_regd r7d
%else
%define off_regd r3d
%endif
......@@ -175,7 +175,7 @@ WEIGHT_FUNC_HALF_MM 8, 8, sse2
%endmacro
INIT_MMX
cglobal h264_biweight_16_mmx2, 7, 7, 0
cglobal h264_biweight_16_mmx2, 7, 8, 0
BIWEIGHT_SETUP
movifnidn r3d, r3m
.nextrow
......@@ -194,7 +194,7 @@ cglobal h264_biweight_16_mmx2, 7, 7, 0
REP_RET
%macro BIWEIGHT_FUNC_MM 3
cglobal h264_biweight_%1_%3, 7, 7, %2
cglobal h264_biweight_%1_%3, 7, 8, %2
BIWEIGHT_SETUP
movifnidn r3d, r3m
.nextrow
......@@ -215,7 +215,7 @@ INIT_XMM
BIWEIGHT_FUNC_MM 16, 8, sse2
%macro BIWEIGHT_FUNC_HALF_MM 3
cglobal h264_biweight_%1_%3, 7, 7, %2
cglobal h264_biweight_%1_%3, 7, 8, %2
BIWEIGHT_SETUP
movifnidn r3d, r3m
sar r3, 1
......@@ -245,7 +245,7 @@ BIWEIGHT_FUNC_HALF_MM 8, 8, sse2
%macro BIWEIGHT_SSSE3_SETUP 0
%if ARCH_X86_64
%define off_regd r11d
%define off_regd r7d
%else
%define off_regd r3d
%endif
......@@ -277,7 +277,7 @@ BIWEIGHT_FUNC_HALF_MM 8, 8, sse2
%endmacro
INIT_XMM
cglobal h264_biweight_16_ssse3, 7, 7, 8
cglobal h264_biweight_16_ssse3, 7, 8, 8
BIWEIGHT_SSSE3_SETUP
movifnidn r3d, r3m
......@@ -296,7 +296,7 @@ cglobal h264_biweight_16_ssse3, 7, 7, 8
REP_RET
INIT_XMM
cglobal h264_biweight_8_ssse3, 7, 7, 8
cglobal h264_biweight_8_ssse3, 7, 8, 8
BIWEIGHT_SSSE3_SETUP
movifnidn r3d, r3m
sar r3, 1
......
This diff is collapsed.
......@@ -62,11 +62,11 @@ SECTION .text
%define cntr_reg fltsizeq
%define movsx mov
%else
%define cntr_reg r11
%define cntr_reg r7
%define movsx movsxd
%endif
cglobal yuv2planeX_%1, %3, 7, %2, filter, fltsize, src, dst, w, dither, offset
cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
%if %1 == 8 || %1 == 9 || %1 == 10
pxor m6, m6
%endif ; %1 == 8/9/10
......
......@@ -53,7 +53,7 @@ SECTION .text
%ifnidn %3, X
cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, src, filter, fltpos, pos1
%else
cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize
cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize
%endif
%if ARCH_X86_64
movsxd wq, wd
......@@ -245,10 +245,9 @@ cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, srcmem, filter, fltpos, fltsiz
%define dlt 0
%endif ; %4 ==/!= X4
%if ARCH_X86_64
push r12
%define srcq r11
%define pos1q r10
%define srcendq r12
%define srcq r8
%define pos1q r7
%define srcendq r9
movsxd fltsizeq, fltsized ; filterSize
lea srcendq, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4]
%else ; x86-32
......@@ -388,16 +387,7 @@ cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, srcmem, filter, fltpos, fltsiz
add wq, 2
%endif ; %3 ==/!= X
jl .loop
%ifnidn %3, X
REP_RET
%else ; %3 == X
%if ARCH_X86_64
pop r12
RET
%else ; x86-32
REP_RET
%endif ; x86-32/64
%endif ; %3 ==/!= X
%endmacro
; SCALE_FUNCS source_width, intermediate_nbits, n_xmm
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment