Commit 3b15a6d7 authored by Ronald S. Bultje

config.asm: change %ifdef directives to %if directives.

This allows combining multiple conditionals in a single statement.
parent 08628b6a
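
For context, the point of the conversion is that yasm's %if takes an arbitrary expression, whereas %ifdef can only test whether a single symbol is defined. A minimal sketch of the kind of check this enables, using symbols that appear in the diff below (the combined condition itself is illustrative, not taken from this commit):

; %ifdef tests one symbol per directive, so combined conditions need nesting
%ifdef ARCH_X86_64
%ifdef HAVE_AVX
INIT_YMM
%endif
%endif

; with every symbol always defined to 0 or 1 (see the configure hunk below),
; a single %if expression can combine them
%if ARCH_X86_64 && HAVE_AVX
INIT_YMM
%endif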
@@ -541,7 +541,8 @@ print_config_mak(){
 }
 print_config_asm(){
-enabled $1 && echo "%define $2"
+enabled $1 && v=1 || v=0
+echo "%define $2 $v"
 }
 print_config(){
...
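
The configure hunk above is what makes the directive change work: print_config_asm previously emitted a %define only for enabled features, while it now always emits the symbol with an explicit 0/1 value. A hypothetical config.asm excerpt (feature names taken from the diff, values chosen purely for illustration):

; old output: the symbol exists only when the feature is enabled,
; so only %ifdef HAVE_MMX2 can test it
%define HAVE_MMX2

; new output: every feature is defined with a value, so %if HAVE_MMX2
; works and disabled features are explicit
%define HAVE_MMX2 1
%define HAVE_AMD3DNOW 0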
@@ -69,12 +69,12 @@ cglobal ac3_exponent_min_%1, 3,4,2, exp, reuse_blks, expn, offset
 %define LOOP_ALIGN
 INIT_MMX
 AC3_EXPONENT_MIN mmx
-%ifdef HAVE_MMX2
+%if HAVE_MMX2
 %define PMINUB PMINUB_MMXEXT
 %define LOOP_ALIGN ALIGN 16
 AC3_EXPONENT_MIN mmxext
 %endif
-%ifdef HAVE_SSE
+%if HAVE_SSE
 INIT_XMM
 AC3_EXPONENT_MIN sse2
 %endif
@@ -367,7 +367,7 @@ cglobal ac3_compute_mantissa_size_sse2, 1,2,4, mant_cnt, sum
 pabsd %1, %1
 %endmacro
-%ifdef HAVE_AMD3DNOW
+%if HAVE_AMD3DNOW
 INIT_MMX
 cglobal ac3_extract_exponents_3dnow, 3,3,0, exp, coef, len
 add expq, lenq
@@ -439,11 +439,11 @@ cglobal ac3_extract_exponents_%1, 3,3,5, exp, coef, len
 REP_RET
 %endmacro
-%ifdef HAVE_SSE
+%if HAVE_SSE
 INIT_XMM
 %define PABSD PABSD_MMX
 AC3_EXTRACT_EXPONENTS sse2
-%ifdef HAVE_SSSE3
+%if HAVE_SSSE3
 %define PABSD PABSD_SSSE3
 AC3_EXTRACT_EXPONENTS ssse3
 %endif
...
@@ -211,7 +211,7 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
 INIT_YMM
 SECTION_TEXT
-%ifdef HAVE_AVX
+%if HAVE_AVX
 ; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
 cglobal dct32_float_avx, 2,3,8, out, in, tmp
 ; pass 1
@@ -289,7 +289,7 @@ INIT_XMM
 %define BUTTERFLY BUTTERFLY_SSE
 %define BUTTERFLY0 BUTTERFLY0_SSE
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 %define SPILL SWAP
 %define UNSPILL SWAP
...
@@ -138,7 +138,7 @@ align 16
 %endif
 %define t0 [v1q + orderq]
 %define t1 [v1q + orderq + mmsize]
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 mova m8, t0
 mova m9, t1
 %define t0 m8
@@ -474,7 +474,7 @@ cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
 movss xmm1, xmm0
 shufps xmm0, xmm0, 1
 addss xmm0, xmm1
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
 movd r0m, xmm0
 fld dword r0m
 %endif
@@ -498,7 +498,7 @@ cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
 ; function implementations. Fast are fixed-width, slow is variable-width
 %macro EMU_EDGE_FUNC 0
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 %define w_reg r10
 cglobal emu_edge_core, 6, 7, 1
 mov r11, r5 ; save block_h
@@ -513,14 +513,14 @@ cglobal emu_edge_core, 2, 7, 0
 mov w_reg, r7m
 sub w_reg, r6m ; w = start_x - end_x
 sub r5, r4
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 sub r4, r3
 %else
 sub r4, dword r3m
 %endif
 cmp w_reg, 22
 jg .slow_v_extend_loop
-%ifdef ARCH_X86_32
+%if ARCH_X86_32
 mov r2, r2m ; linesize
 %endif
 sal w_reg, 7 ; w * 128
@@ -536,7 +536,7 @@ cglobal emu_edge_core, 2, 7, 0
 ; horizontal extend (left/right)
 mov w_reg, r6m ; start_x
 sub r0, w_reg
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 mov r3, r0 ; backup of buf+block_h*linesize
 mov r5, r11
 %else
@@ -564,7 +564,7 @@ cglobal emu_edge_core, 2, 7, 0
 ; now r3(64)/r0(32)=buf,r2=linesize,r11/r5=block_h,r6/r3=val, r10/r6=end_x, r1=block_w
 .right_extend:
-%ifdef ARCH_X86_32
+%if ARCH_X86_32
 mov r0, r0m
 mov r5, r5m
 %endif
@@ -589,13 +589,13 @@ cglobal emu_edge_core, 2, 7, 0
 .h_extend_end:
 RET
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 %define vall al
 %define valh ah
 %define valw ax
 %define valw2 r10w
 %define valw3 r3w
-%ifdef WIN64
+%if WIN64
 %define valw4 r4w
 %else ; unix64
 %define valw4 r3w
@@ -643,7 +643,7 @@ cglobal emu_edge_core, 2, 7, 0
 %endrep ; %2/16
 %endif
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 %if (%2-%%src_off) == 8
 mov rax, [r1+%%src_off]
 %assign %%src_off %%src_off+8
@@ -692,7 +692,7 @@ cglobal emu_edge_core, 2, 7, 0
 %endrep ; %2/16
 %endif
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 %if (%2-%%dst_off) == 8
 mov [r0+%%dst_off], rax
 %assign %%dst_off %%dst_off+8
@@ -740,7 +740,7 @@ cglobal emu_edge_core, 2, 7, 0
 ALIGN 128
 .emuedge_v_extend_ %+ %%n:
 ; extend pixels above body
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 test r3 , r3 ; if (!start_y)
 jz .emuedge_copy_body_ %+ %%n %+ _loop ; goto body
 %else ; ARCH_X86_32
@@ -751,7 +751,7 @@ ALIGN 128
 .emuedge_extend_top_ %+ %%n %+ _loop: ; do {
 WRITE_NUM_BYTES top, %%n ; write bytes
 add r0 , r2 ; dst += linesize
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 dec r3d
 %else ; ARCH_X86_32
 dec dword r3m
@@ -779,7 +779,7 @@ ALIGN 128
 jnz .emuedge_extend_bottom_ %+ %%n %+ _loop ; } while (--block_h)
 .emuedge_v_extend_end_ %+ %%n:
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 ret
 %else ; ARCH_X86_32
 rep ret
@@ -841,7 +841,7 @@ ALIGN 64
 WRITE_V_PIXEL %%n, r0 ; write pixels
 dec r5
 jnz .emuedge_extend_left_ %+ %%n ; } while (--block_h)
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 ret
 %else ; ARCH_X86_32
 rep ret
@@ -856,7 +856,7 @@ ALIGN 64
 %rep 11
 ALIGN 64
 .emuedge_extend_right_ %+ %%n: ; do {
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 sub r3, r2 ; dst -= linesize
 READ_V_PIXEL %%n, [r3+w_reg-1] ; read pixels
 WRITE_V_PIXEL %%n, r3+r4-%%n ; write pixels
@@ -868,7 +868,7 @@ ALIGN 64
 dec r5
 %endif ; ARCH_X86_64/32
 jnz .emuedge_extend_right_ %+ %%n ; } while (--block_h)
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 ret
 %else ; ARCH_X86_32
 rep ret
@@ -876,7 +876,7 @@ ALIGN 64
 %assign %%n %%n+2
 %endrep
-%ifdef ARCH_X86_32
+%if ARCH_X86_32
 %define stack_offset 0x10
 %endif
 %endmacro ; RIGHT_EXTEND
@@ -916,7 +916,7 @@ ALIGN 64
 V_COPY_NPX %1, mm0, movq, 8, 0xFFFFFFF8
 %else ; sse
 V_COPY_NPX %1, xmm0, movups, 16, 0xFFFFFFF0
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 %define linesize r2
 V_COPY_NPX %1, rax , mov, 8
 %else ; ARCH_X86_32
@@ -940,7 +940,7 @@ ALIGN 64
 .slow_v_extend_loop:
 ; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
 ; r11(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r10(64)/r6(32)=w=end_x-start_x
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 push r11 ; save old value of block_h
 test r3, r3
 %define cnt_reg r11
@@ -956,18 +956,18 @@ ALIGN 64
 .do_body_copy:
 V_COPY_ROW body, r4
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 pop r11 ; restore old value of block_h
 %define cnt_reg r3
 %endif
 test r5, r5
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 jz .v_extend_end
 %else
 jz .skip_bottom_extend
 %endif
 V_COPY_ROW bottom, r5
-%ifdef ARCH_X86_32
+%if ARCH_X86_32
 .skip_bottom_extend:
 mov r2, r2m
 %endif
@@ -996,7 +996,7 @@ ALIGN 64
 .left_extend_loop_end:
 dec r5
 jnz .slow_left_extend_loop
-%ifdef ARCH_X86_32
+%if ARCH_X86_32
 mov r2, r2m
 %endif
 jmp .right_extend
@@ -1006,7 +1006,7 @@ ALIGN 64
 .slow_right_extend_loop:
 ; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r11(64)/r5(32)=block_h,
 ; r10(64)/r6(32)=end_x,r6/r3=val,r1=cntr
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 %define buf_reg r3
 %define bh_reg r11
 %else
@@ -1047,7 +1047,7 @@ SLOW_RIGHT_EXTEND
 %endmacro
 emu_edge sse
-%ifdef ARCH_X86_32
+%if ARCH_X86_32
 emu_edge mmx
 %endif
@@ -1138,7 +1138,7 @@ VECTOR_CLIP_INT32 6, 1, 0, 0
 %macro BUTTERFLIES_FLOAT_INTERLEAVE 0
 cglobal butterflies_float_interleave, 4,4,3, dst, src0, src1, len
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 movsxd lenq, lend
 %endif
 test lenq, lenq
...
@@ -245,7 +245,7 @@ hadamard8x8_diff_%1:
 lea r0, [r3*3]
 DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize
 HADAMARD8
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
 %else
 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [rsp+gprsize], [rsp+mmsize+gprsize]
@@ -270,7 +270,7 @@ HADAMARD8_DIFF_MMX mmx2
 INIT_XMM
 %define ABS2 ABS2_MMX2
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 %define ABS_SUM_8x8 ABS_SUM_8x8_64
 %else
 %define ABS_SUM_8x8 ABS_SUM_8x8_32
...
@@ -30,7 +30,7 @@
 %include "x86inc.asm"
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 %define pointer resq
 %else
 %define pointer resd
@@ -73,7 +73,7 @@ cextern cos_ %+ i
 %assign i i<<1
 %endrep
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 %define pointer dq
 %else
 %define pointer dd
@@ -299,7 +299,7 @@ IF%1 mova Z(1), m5
 INIT_YMM
-%ifdef HAVE_AVX
+%if HAVE_AVX
 align 16
 fft8_avx:
 mova m0, Z(0)
@@ -534,7 +534,7 @@ DEFINE_ARGS z, w, n, o1, o3
 INIT_YMM
-%ifdef HAVE_AVX
+%if HAVE_AVX
 %macro INTERL_AVX 5
 vunpckhps %3, %2, %1
 vunpcklps %2, %2, %1
@@ -638,7 +638,7 @@ cglobal fft_dispatch%3%2, 2,5,8, z, nbits
 RET
 %endmacro ; DECL_FFT
-%ifdef HAVE_AVX
+%if HAVE_AVX
 INIT_YMM
 DECL_FFT 6, _avx
 DECL_FFT 6, _avx, _interleave
@@ -750,7 +750,7 @@ INIT_XMM
 %macro DECL_IMDCT 2
 cglobal imdct_half%1, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *input
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 %define rrevtab r10
 %define rtcos r11
 %define rtsin r12
@@ -769,24 +769,24 @@ cglobal imdct_half%1, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *
 mov rtsin, [r0+FFTContext.tsin]
 add rtcos, r3
 add rtsin, r3
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
 push rtcos
 push rtsin
 %endif
 shr r3, 1
 mov rrevtab, [r0+FFTContext.revtab]
 add rrevtab, r3
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
 push rrevtab
 %endif
 sub r3, 4
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 xor r4, r4
 sub r4, r3
 %endif
 .pre:
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
 ;unspill
 xor r4, r4
 sub r4, r3
@@ -795,7 +795,7 @@ cglobal imdct_half%1, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *
 %endif
 PREROTATER r4, r3, r2, rtcos, rtsin
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 movzx r5, word [rrevtab+r4-4]
 movzx r6, word [rrevtab+r4-2]
 movzx r13, word [rrevtab+r3]
@@ -829,7 +829,7 @@ cglobal imdct_half%1, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *
 mov r0d, [r5+FFTContext.mdctsize]
 add r6, r0
 shr r0, 1
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
 %define rtcos r2
 %define rtsin r3
 mov rtcos, [esp+8]
@@ -839,7 +839,7 @@ cglobal imdct_half%1, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample *
 mov r1, -mmsize
 sub r1, r0
 %2 r0, r1, r6, rtcos, rtsin
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 pop r14
 pop r13
 pop r12
@@ -856,6 +856,6 @@ DECL_IMDCT _sse, POSROTATESHUF
 INIT_YMM
-%ifdef HAVE_AVX
+%if HAVE_AVX
 DECL_IMDCT _avx, POSROTATESHUF_AVX
 %endif
@@ -28,14 +28,14 @@ SECTION_TEXT
 ; void int32_to_float_fmul_scalar(float *dst, const int *src, float mul, int len);
 ;---------------------------------------------------------------------------------
 %macro INT32_TO_FLOAT_FMUL_SCALAR 2
-%ifdef UNIX64
+%if UNIX64
 cglobal int32_to_float_fmul_scalar_%1, 3,3,%2, dst, src, len
 %else
 cglobal int32_to_float_fmul_scalar_%1, 4,4,%2, dst, src, mul, len
 %endif
-%ifdef WIN64
+%if WIN64
 SWAP 0, 2
-%elifdef ARCH_X86_32
+%elif ARCH_X86_32
 movss m0, mulm
 %endif
 SPLATD m0
@@ -180,7 +180,7 @@ FLOAT_TO_INT16_INTERLEAVE2 sse2
 %macro FLOAT_TO_INT16_INTERLEAVE6 1
 ; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
 cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 %define lend r10d
 mov lend, r2d
 %else
@@ -241,7 +241,7 @@ FLOAT_TO_INT16_INTERLEAVE6 3dn2
 %macro FLOAT_INTERLEAVE6 2
 cglobal float_interleave6_%1, 2,7,%2, dst, src, src1, src2, src3, src4, src5
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 %define lend r10d
 mov lend, r2d
 %else
...
@@ -94,7 +94,7 @@ SECTION .text
 ; put/avg_h264_chroma_mc8_mmx_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/,
 ; int stride, int h, int mx, int my)
 cglobal %1_%2_chroma_mc8_%3, 6, 7, 0
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 movsxd r2, r2d
 %endif
 mov r6d, r5d
@@ -113,7 +113,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 0
 %define rnd_1d_rv40 rnd_rv40_1d_tbl
 %define rnd_2d_rv40 rnd_rv40_2d_tbl
 %endif
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 mov r10, r5
 and r10, 6 ; &~1 for mx/my=[0,7]
 lea r10, [r10*4+r4]
@@ -147,7 +147,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 0
 %ifdef PIC
 lea r11, [rnd_rv40_1d_tbl]
 %endif
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
 mov r5, r0m
 %endif
 %endif
@@ -198,7 +198,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 0
 %ifdef PIC
 lea r11, [rnd_rv40_2d_tbl]
 %endif
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
 mov r5, r0m
 %endif
 %endif
@@ -279,7 +279,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 0
 %macro chroma_mc4_mmx_func 3
 cglobal %1_%2_chroma_mc4_%3, 6, 6, 0
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 movsxd r2, r2d
 %endif
 pxor m7, m7
@@ -364,7 +364,7 @@ cglobal %1_%2_chroma_mc4_%3, 6, 6, 0
 %macro chroma_mc2_mmx_func 3
 cglobal %1_%2_chroma_mc2_%3, 6, 7, 0
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 movsxd r2, r2d
 %endif
@@ -452,7 +452,7 @@ chroma_mc4_mmx_func avg, rv40, 3dnow
 %macro chroma_mc8_ssse3_func 3
 cglobal %1_%2_chroma_mc8_%3, 6, 7, 8
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 movsxd r2, r2d
 %endif
 mov r6d, r5d
@@ -600,7 +600,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8
 %macro chroma_mc4_ssse3_func 3
 cglobal %1_%2_chroma_mc4_%3, 6, 7, 0
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 movsxd r2, r2d
 %endif
 mov r6, r4
...
@@ -252,7 +252,7 @@ cglobal %1_h264_chroma_mc2_10_%2, 6,7
 %define CHROMAMC_AVG NOTHING
 INIT_XMM
 CHROMA_MC8 put, sse2
-%ifdef HAVE_AVX
+%if HAVE_AVX
 INIT_AVX
 CHROMA_MC8 put, avx
 %endif
@@ -264,7 +264,7 @@ CHROMA_MC2 put, mmxext
 %define PAVG pavgw
 INIT_XMM
 CHROMA_MC8 avg, sse2
-%ifdef HAVE_AVX
+%if HAVE_AVX
 INIT_AVX
 CHROMA_MC8 avg, avx
 %endif
...
@@ -200,7 +200,7 @@ cextern pb_A1
 ; out: %4 = |%1-%2|>%3
 ; clobbers: %5
 %macro DIFF_GT2 5
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 psubusb %5, %2, %1
 psubusb %4, %1, %2
 %else
@@ -278,7 +278,7 @@ cextern pb_A1
 mova %4, %2
 %endmacro
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 ;-----------------------------------------------------------------------------
 ; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
@@ -333,7 +333,7 @@ cglobal deblock_h_luma_8_%1, 5,7
 lea r11, [r10+r10*2]
 lea r6, [r0-4]
 lea r5, [r0-4+r11]
-%ifdef WIN64
+%if WIN64
 sub rsp, 0x98
 %define pix_tmp rsp+0x30
 %else
@@ -352,7 +352,7 @@ cglobal deblock_h_luma_8_%1, 5,7
 ; don't backup r6, r5, r10, r11 because deblock_v_luma_sse2 doesn't use them
 lea r0, [pix_tmp+0x30]
 mov r1d, 0x10
-%ifdef WIN64
+%if WIN64
 mov [rsp+0x20], r4
 %endif
 call deblock_v_luma_8_%1
@@ -376,7 +376,7 @@ cglobal deblock_h_luma_8_%1, 5,7
 movq m3, [pix_tmp+0x40]
 TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11)
-%ifdef WIN64
+%if WIN64
 add rsp, 0x98
 %else
 add rsp, 0x68
@@ -513,7 +513,7 @@ DEBLOCK_LUMA avx, v, 16
 %macro LUMA_INTRA_P012 4 ; p0..p3 in memory
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 pavgb t0, p2, p1
 pavgb t1, p0, q0
 %else
@@ -524,7 +524,7 @@ DEBLOCK_LUMA avx, v, 16
 %endif
 pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
 mova t5, t1
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 paddb t2, p2, p1
 paddb t3, p0, q0
 %else
@@ -542,7 +542,7 @@ DEBLOCK_LUMA avx, v, 16
 pand t2, mpb_1
 psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 pavgb t1, p2, q1
 psubb t2, p2, q1
 %else
@@ -617,7 +617,7 @@ DEBLOCK_LUMA avx, v, 16
 %define t1 m5
 %define t2 m6
 %define t3 m7
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 %define p2 m8
 %define q2 m9
 %define t4 m10
@@ -644,7 +644,7 @@ DEBLOCK_LUMA avx, v, 16
 ; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
 cglobal deblock_%2_luma_intra_8_%1, 4,6,16
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
 sub esp, 0x60
 %endif
 lea r4, [r1*4]
@@ -659,7 +659,7 @@ cglobal deblock_%2_luma_intra_8_%1, 4,6,16
 mova p0, [r4+r5]
 mova q0, [r0]
 mova q1, [r0+r1]
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 pxor mpb_0, mpb_0
 mova mpb_1, [pb_1]
 LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
@@ -695,13 +695,13 @@ cglobal deblock_%2_luma_intra_8_%1, 4,6,16
 LUMA_INTRA_SWAP_PQ
 LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
 .end:
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
 add esp, 0x60
 %endif
 RET
 INIT_MMX
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 ;-----------------------------------------------------------------------------
 ; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
@@ -779,7 +779,7 @@ INIT_XMM
 DEBLOCK_LUMA_INTRA sse2, v
 INIT_AVX
 DEBLOCK_LUMA_INTRA avx , v
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
 INIT_MMX
 DEBLOCK_LUMA_INTRA mmxext, v8
 %endif
@@ -824,7 +824,7 @@ cglobal deblock_v_chroma_8_mmxext, 5,6
 ; void ff_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
 cglobal deblock_h_chroma_8_mmxext, 5,7
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 %define buf0 [rsp-24]
 %define buf1 [rsp-16]
 %else
...
@@ -302,7 +302,7 @@ cglobal deblock_h_luma_10_%1, 5,6,8*(mmsize/16)
 %endmacro
 INIT_XMM
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 ; in: m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2
 ; m12=alpha, m13=beta
 ; out: m0=p1', m3=q1', m1=p0', m2=q0'
@@ -435,7 +435,7 @@ DEBLOCK_LUMA_64 avx
 ; %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0
 ; %8=mask1p %9=2 %10=p0' %11=p1' %12=p2'
 %macro LUMA_INTRA_P012 12 ; p0..p3 in memory
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 paddw t0, %3, %2
 mova t2, %4
 paddw t2, %3
@@ -501,7 +501,7 @@ DEBLOCK_LUMA_64 avx
 LOAD_AB t0, t1, r2d, r3d
 mova %1, t0
 LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 mova %2, t0 ; mask0
 psrlw t3, %1, 2
 %else
@@ -598,7 +598,7 @@ DEBLOCK_LUMA_64 avx
 %endif
 %endmacro
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 ;-----------------------------------------------------------------------------
 ; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
@@ -792,7 +792,7 @@ cglobal deblock_h_luma_intra_10_%1, 4,7,8*(mmsize/16)
 RET
 %endmacro
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
 INIT_MMX
 DEBLOCK_LUMA mmxext
 DEBLOCK_LUMA_INTRA mmxext
@@ -907,7 +907,7 @@ cglobal deblock_v_chroma_intra_10_%1, 4,6-(mmsize/16),8*(mmsize/16)
 %endif
 %endmacro
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
 INIT_MMX
 DEBLOCK_CHROMA mmxext
 %endif
...
@@ -198,14 +198,14 @@ cglobal h264_idct8_add_8_mmx, 3, 4, 0
 ; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
 %macro IDCT8_ADD_SSE 4
 IDCT8_1D_FULL %2
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
 %else
 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16]
 %endif
 paddw m0, [pw_32]
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
 mova [%2 ], m0
 mova [%2+16], m4
 IDCT8_1D [%2], [%2+ 16]
@@ -225,7 +225,7 @@ cglobal h264_idct8_add_8_mmx, 3, 4, 0
 STORE_DIFF m1, m6, m7, [%1+%3 ]
 STORE_DIFF m2, m6, m7, [%1+%3*2]
 STORE_DIFF m3, m6, m7, [%1+%4 ]
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
 mova m0, [%2 ]
 mova m1, [%2+16]
 %else
@@ -371,7 +371,7 @@ cglobal h264_idct_add16_8_mmx2, 5, 7, 0
 test r6, r6
 jz .no_dc
 DC_ADD_MMX2_INIT r2, r3, r6
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 %define dst_reg r10
 %define dst_regd r10d
 %else
@@ -381,7 +381,7 @@ cglobal h264_idct_add16_8_mmx2, 5, 7, 0
 mov dst_regd, dword [r1+r5*4]
 lea dst_reg, [r0+dst_reg]
 DC_ADD_MMX2_OP movh, dst_reg, r3, r6
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
 mov r1, r1m
 %endif
 inc r5
@@ -448,7 +448,7 @@ cglobal h264_idct_add16intra_8_mmx2, 5, 7, 0
 test r6, r6
 jz .skipblock
 DC_ADD_MMX2_INIT r2, r3, r6
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 %define dst_reg r10
 %define dst_regd r10d
 %else
@@ -458,7 +458,7 @@ cglobal h264_idct_add16intra_8_mmx2, 5, 7, 0
 mov dst_regd, dword [r1+r5*4]
 add dst_reg, r0
 DC_ADD_MMX2_OP movh, dst_reg, r3, r6
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
 mov r1, r1m
 %endif
 .skipblock
@@ -489,7 +489,7 @@ cglobal h264_idct8_add4_8_mmx2, 5, 7, 0
 test r6, r6
 jz .no_dc
 DC_ADD_MMX2_INIT r2, r3, r6
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 %define dst_reg r10
 %define dst_regd r10d
 %else
@@ -501,7 +501,7 @@ cglobal h264_idct8_add4_8_mmx2, 5, 7, 0
 DC_ADD_MMX2_OP mova, dst_reg, r3, r6
 lea dst_reg, [dst_reg+r3*4]
 DC_ADD_MMX2_OP mova, dst_reg, r3, r6
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
 mov r1, r1m
 %endif
 add r5, 4
@@ -550,7 +550,7 @@ cglobal h264_idct8_add4_8_sse2, 5, 7, 10
 jz .no_dc
 INIT_MMX
 DC_ADD_MMX2_INIT r2, r3, r6
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 %define dst_reg r10
 %define dst_regd r10d
 %else
@@ -562,7 +562,7 @@ INIT_MMX
 DC_ADD_MMX2_OP mova, dst_reg, r3, r6
 lea dst_reg, [dst_reg+r3*4]
 DC_ADD_MMX2_OP mova, dst_reg, r3, r6
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
 mov r1, r1m
 %endif
 add r5, 4
@@ -575,7 +575,7 @@ INIT_XMM
 mov dst_regd, dword [r1+r5*4]
 add dst_reg, r0
 IDCT8_ADD_SSE dst_reg, r2, r3, r6
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
 mov r1, r1m
 %endif
 .skipblock
@@ -593,7 +593,7 @@ h264_idct_add8_mmx_plane:
 or r6w, word [r2]
 test r6, r6
 jz .skipblock
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 mov r0d, dword [r1+r5*4]
 add r0, [r10]
 %else
@@ -617,13 +617,13 @@ cglobal h264_idct_add8_8_mmx, 5, 7, 0
 %ifdef PIC
 lea r11, [scan8_mem]
 %endif
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 mov r10, r0
 %endif
 call h264_idct_add8_mmx_plane
 mov r5, 32
 add r2, 384
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 add r10, gprsize
 %else
 add r0mp, gprsize
@@ -637,7 +637,7 @@ h264_idct_add8_mmx2_plane
 movzx r6, byte [r4+r6]
 test r6, r6
 jz .try_dc
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 mov r0d, dword [r1+r5*4]
 add r0, [r10]
 %else
@@ -656,7 +656,7 @@ h264_idct_add8_mmx2_plane
 test r6, r6
 jz .skipblock
 DC_ADD_MMX2_INIT r2, r3, r6
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 mov r0d, dword [r1+r5*4]
 add r0, [r10]
 %else
@@ -677,7 +677,7 @@ h264_idct_add8_mmx2_plane
 cglobal h264_idct_add8_8_mmx2, 5, 7, 0
 mov r5, 16
 add r2, 512
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 mov r10, r0
 %endif
 %ifdef PIC
@@ -686,7 +686,7 @@ cglobal h264_idct_add8_8_mmx2, 5, 7, 0
 call h264_idct_add8_mmx2_plane
 mov r5, 32
 add r2, 384
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 add r10, gprsize
 %else
 add r0mp, gprsize
@@ -738,7 +738,7 @@ x264_add8x4_idct_sse2:
 test r0, r0
 jz .cycle%1end
 mov r0d, dword [r1+%1*8]
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 add r0, r10
 %else
 add r0, r0m
@@ -753,7 +753,7 @@ x264_add8x4_idct_sse2:
 ; ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset,
 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
 cglobal h264_idct_add16_8_sse2, 5, 5, 8
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 mov r10, r0
 %endif
 ; unrolling of the loop leads to an average performance gain of
@@ -773,7 +773,7 @@ cglobal h264_idct_add16_8_sse2, 5, 5, 8
 test r0, r0
 jz .try%1dc
 mov r0d, dword [r1+%1*8]
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 add r0, r10
 %else
 add r0, r0m
@@ -785,7 +785,7 @@ cglobal h264_idct_add16_8_sse2, 5, 5, 8
 or r0w, word [r2+32]
 jz .cycle%1end
 mov r0d, dword [r1+%1*8]
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 add r0, r10
 %else
 add r0, r0m
@@ -800,7 +800,7 @@ cglobal h264_idct_add16_8_sse2, 5, 5, 8
 ; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset,
 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
 cglobal h264_idct_add16intra_8_sse2, 5, 7, 8
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 mov r10, r0
 %endif
 add16intra_sse2_cycle 0, 0xc
@@ -817,7 +817,7 @@ cglobal h264_idct_add16intra_8_sse2, 5, 7, 8
 movzx r0, word [r4+%2]
 test r0, r0
 jz .try%1dc
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 mov r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
 add r0, [r10]
 %else
@@ -831,7 +831,7 @@ cglobal h264_idct_add16intra_8_sse2, 5, 7, 8
 movsx r0, word [r2 ]
 or r0w, word [r2+32]
 jz .cycle%1end
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 mov r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
 add r0, [r10]
 %else
@@ -852,12 +852,12 @@ cglobal h264_idct_add16intra_8_sse2, 5, 7, 8
 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
 cglobal h264_idct_add8_8_sse2, 5, 7, 8
 add r2, 512
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 mov r10, r0
 %endif
 add8_sse2_cycle 0, 0x34
 add8_sse2_cycle 1, 0x3c
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 add r10, gprsize
 %else
 add r0mp, gprsize
@@ -977,11 +977,11 @@ cglobal h264_luma_dc_dequant_idct_%1, 3,4,%2
 WALSH4_1D 0,1,2,3,4
 ; shift, tmp, output, qmul
-%ifdef WIN64
+%if WIN64
 DECLARE_REG_TMP 0,3,1,2
 ; we can't avoid this, because r0 is the shift register (ecx) on win64
 xchg r0, t2
-%elifdef ARCH_X86_64
+%elif ARCH_X86_64
 DECLARE_REG_TMP 3,1,0,2
 %else
 DECLARE_REG_TMP 1,3,0,2
...
@@ -98,7 +98,7 @@ cglobal h264_idct_add_10_%1, 3,3
 INIT_XMM
 IDCT_ADD_10 sse2
-%ifdef HAVE_AVX
+%if HAVE_AVX
 INIT_AVX
 IDCT_ADD_10 avx
 %endif
@@ -128,7 +128,7 @@ add4x4_idct_%1:
 INIT_XMM
 ALIGN 16
 ADD4x4IDCT sse2
-%ifdef HAVE_AVX
+%if HAVE_AVX
 INIT_AVX
 ALIGN 16
 ADD4x4IDCT avx
@@ -168,7 +168,7 @@ cglobal h264_idct_add16_10_%1, 5,6
 INIT_XMM
 IDCT_ADD16_10 sse2
-%ifdef HAVE_AVX
+%if HAVE_AVX
 INIT_AVX
 IDCT_ADD16_10 avx
 %endif
@@ -234,7 +234,7 @@ cglobal h264_idct8_dc_add_10_%1,3,3,7
 INIT_XMM
 IDCT8_DC_ADD sse2
-%ifdef HAVE_AVX
+%if HAVE_AVX
 INIT_AVX
 IDCT8_DC_ADD avx
 %endif
@@ -305,7 +305,7 @@ cglobal h264_idct_add16intra_10_%1,5,7,8
 INIT_XMM
 IDCT_ADD16INTRA_10 sse2
-%ifdef HAVE_AVX
+%if HAVE_AVX
 INIT_AVX
 IDCT_ADD16INTRA_10 avx
 %endif
@@ -316,7 +316,7 @@ IDCT_ADD16INTRA_10 avx
 ;-----------------------------------------------------------------------------
 %macro IDCT_ADD8 1
 cglobal h264_idct_add8_10_%1,5,7
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 mov r10, r0
 %endif
 add r2, 1024
@@ -324,7 +324,7 @@ cglobal h264_idct_add8_10_%1,5,7
 ADD16_OP_INTRA %1, 16, 4+ 6*8
 ADD16_OP_INTRA %1, 18, 4+ 7*8
 add r2, 1024-128*2
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 mov r0, [r10+gprsize]
 %else
 mov r0, r0m
@@ -342,7 +342,7 @@ cglobal h264_idct_add8_10_%1,5,7
 INIT_XMM
 IDCT_ADD8 sse2
-%ifdef HAVE_AVX
+%if HAVE_AVX
 INIT_AVX
 IDCT_ADD8 avx
 %endif
@@ -411,7 +411,7 @@ IDCT_ADD8 avx
 ; %1=int16_t *block, %2=int16_t *dstblock
 %macro IDCT8_ADD_SSE_START 2
 IDCT8_1D_FULL %1
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 TRANSPOSE4x4D 0,1,2,3,8
 mova [%2 ], m0
 TRANSPOSE4x4D 4,5,6,7,8
@@ -452,7 +452,7 @@ IDCT_ADD8 avx
 %macro IDCT8_ADD 1
 cglobal h264_idct8_add_10_%1, 3,4,16
-%ifndef UNIX64
+%if UNIX64 == 0
 %assign pad 16-gprsize-(stack_offset&15)
 sub rsp, pad
 call h264_idct8_add1_10_%1
@@ -467,7 +467,7 @@ h264_idct8_add1_10_%1:
 sub rsp, pad
 add dword [r1], 32
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 IDCT8_ADD_SSE_START r1, rsp
 SWAP 1, 9
 SWAP 2, 10
@@ -519,7 +519,7 @@ h264_idct8_add1_10_%1:
 INIT_XMM
 IDCT8_ADD sse2
-%ifdef HAVE_AVX
+%if HAVE_AVX
 INIT_AVX
 IDCT8_ADD avx
 %endif
@@ -559,7 +559,7 @@ cglobal h264_idct8_add4_10_%1, 0,7,16
 INIT_XMM
 IDCT8_ADD4 sse2
-%ifdef HAVE_AVX
+%if HAVE_AVX
 INIT_AVX
 IDCT8_ADD4 avx
 %endif
@@ -348,7 +348,7 @@ cglobal pred16x16_plane_%3_%1, 2, 7, %2
 lea r3, [r0+r2*4-1]
 add r4, r2
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 %define e_reg r11
 %else
 %define e_reg r0
@@ -369,7 +369,7 @@ cglobal pred16x16_plane_%3_%1, 2, 7, %2
 lea r5, [r5+r6*4]
 movzx e_reg, byte [r3 ]
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 movzx r10, byte [r4+r2 ]
 sub r10, e_reg
 %else
@@ -385,7 +385,7 @@ cglobal pred16x16_plane_%3_%1, 2, 7, %2
 movzx r4, byte [e_reg+r2 ]
 movzx r6, byte [r3 ]
 sub r6, r4
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 lea r6, [r10+r6*2]
 lea r5, [r5+r6*2]
 add r5, r6
@@ -395,7 +395,7 @@ cglobal pred16x16_plane_%3_%1, 2, 7, %2
 %endif
 movzx r4, byte [e_reg ]
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 movzx r10, byte [r3 +r2 ]
 sub r10, r4
 sub r5, r10
@@ -409,7 +409,7 @@ cglobal pred16x16_plane_%3_%1, 2, 7, %2
 movzx r4, byte [e_reg+r1 ]
 movzx r6, byte [r3 +r2*2]
 sub r6, r4
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 add r6, r10
 %endif
 lea r5, [r5+r6*8]
@@ -420,7 +420,7 @@ cglobal pred16x16_plane_%3_%1, 2, 7, %2
 lea r5, [r5+r6*4]
 add r5, r6 ; sum of V coefficients
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
 mov r0, r0m
 %endif
@@ -641,7 +641,7 @@ cglobal pred8x8_plane_%1, 2, 7, %2
 lea r3, [r0 -1]
 add r4, r2
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 %define e_reg r11
 %else
 %define e_reg r0
@@ -652,7 +652,7 @@ cglobal pred8x8_plane_%1, 2, 7, %2
 sub r5, e_reg
 movzx e_reg, byte [r3 ]
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 movzx r10, byte [r4+r2 ]
 sub r10, e_reg
 sub r5, r10
@@ -666,7 +666,7 @@ cglobal pred8x8_plane_%1, 2, 7, %2
 movzx e_reg, byte [r3+r1 ]
 movzx r6, byte [r4+r2*2 ]
 sub r6, e_reg
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 add r6, r10
 %endif
 lea r5, [r5+r6*4]
@@ -680,7 +680,7 @@ cglobal pred8x8_plane_%1, 2, 7, %2
 lea r5, [r5+r6*8]
 sar r5, 5
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
 mov r0, r0m
 %endif
...
@@ -84,7 +84,7 @@ INIT_XMM
 PRED4x4_DR sse2
 %define PALIGNR PALIGNR_SSSE3
 PRED4x4_DR ssse3
-%ifdef HAVE_AVX
+%if HAVE_AVX
 INIT_AVX
 PRED4x4_DR avx
 %endif
@@ -124,7 +124,7 @@ INIT_XMM
 PRED4x4_VR sse2
 %define PALIGNR PALIGNR_SSSE3
 PRED4x4_VR ssse3
-%ifdef HAVE_AVX
+%if HAVE_AVX
 INIT_AVX
 PRED4x4_VR avx
 %endif
@@ -167,7 +167,7 @@ INIT_XMM
 PRED4x4_HD sse2
 %define PALIGNR PALIGNR_SSSE3
 PRED4x4_HD ssse3
-%ifdef HAVE_AVX
+%if HAVE_AVX
 INIT_AVX
 PRED4x4_HD avx
 %endif
@@ -238,7 +238,7 @@ cglobal pred4x4_down_left_10_%1, 3,3
 INIT_XMM
 PRED4x4_DL sse2
-%ifdef HAVE_AVX
+%if HAVE_AVX
 INIT_AVX
 PRED4x4_DL avx
 %endif
@@ -267,7 +267,7 @@ cglobal pred4x4_vertical_left_10_%1, 3,3
 INIT_XMM
 PRED4x4_VL sse2
-%ifdef HAVE_AVX
+%if HAVE_AVX
 INIT_AVX
 PRED4x4_VL avx
 %endif
@@ -577,7 +577,7 @@ cglobal pred8x8l_top_dc_10_%1, 4,4,6
 INIT_XMM
 PRED8x8L_TOP_DC sse2
-%ifdef HAVE_AVX
+%if HAVE_AVX
 INIT_AVX
 PRED8x8L_TOP_DC avx
 %endif
@@ -636,7 +636,7 @@ cglobal pred8x8l_dc_10_%1, 4,6,6
 INIT_XMM
 PRED8x8L_DC sse2
-%ifdef HAVE_AVX
+%if HAVE_AVX
 INIT_AVX
 PRED8x8L_DC avx
 %endif
@@ -671,7 +671,7 @@ cglobal pred8x8l_vertical_10_%1, 4,4,6
 INIT_XMM
 PRED8x8L_VERTICAL sse2
-%ifdef HAVE_AVX
+%if HAVE_AVX
 INIT_AVX
 PRED8x8L_VERTICAL avx
 %endif
@@ -728,7 +728,7 @@ INIT_XMM
 PRED8x8L_HORIZONTAL sse2
 %define PALIGNR PALIGNR_SSSE3
 PRED8x8L_HORIZONTAL ssse3
-%ifdef HAVE_AVX
+%if HAVE_AVX
 INIT_AVX
 PRED8x8L_HORIZONTAL avx
 %endif
@@ -797,7 +797,7 @@ INIT_XMM
 PRED8x8L_DOWN_LEFT sse2
 %define PALIGNR PALIGNR_SSSE3
 PRED8x8L_DOWN_LEFT ssse3
-%ifdef HAVE_AVX
+%if HAVE_AVX
 INIT_AVX
 PRED8x8L_DOWN_LEFT avx
 %endif
@@ -872,7 +872,7 @@ INIT_XMM
 PRED8x8L_DOWN_RIGHT sse2
 %define PALIGNR PALIGNR_SSSE3
 PRED8x8L_DOWN_RIGHT ssse3
-%ifdef HAVE_AVX
+%if HAVE_AVX
 INIT_AVX
 PRED8x8L_DOWN_RIGHT avx
 %endif
@@ -943,7 +943,7 @@ INIT_XMM
 PRED8x8L_VERTICAL_RIGHT sse2
 %define PALIGNR PALIGNR_SSSE3
 PRED8x8L_VERTICAL_RIGHT ssse3
-%ifdef HAVE_AVX
+%if HAVE_AVX
 INIT_AVX
 PRED8x8L_VERTICAL_RIGHT avx
 %endif
@@ -1005,7 +1005,7 @@ INIT_XMM
 PRED8x8L_HORIZONTAL_UP sse2
 %define PALIGNR PALIGNR_SSSE3
 PRED8x8L_HORIZONTAL_UP ssse3
-%ifdef HAVE_AVX
+%if HAVE_AVX
 INIT_AVX
 PRED8x8L_HORIZONTAL_UP avx
 %endif
...
@@ -111,7 +111,7 @@ INIT_XMM
 %endmacro
 %macro MCAxA 8
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 %ifnidn %1,mmxext
 MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8
 %endif
@@ -122,7 +122,7 @@ MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8
 %macro MCAxA_OP 8
 cglobal %2_h264_qpel%5_%3_10_%1, %6,%7,%8
-%ifdef ARCH_X86_32
+%if ARCH_X86_32
 call stub_%2_h264_qpel%4_%3_10_%1
 mov r0, r0m
 mov r1, r1m
@@ -152,7 +152,7 @@ cglobal %2_h264_qpel%5_%3_10_%1, %6,%7,%8
 call stub_%2_h264_qpel%4_%3_10_%1
 lea r0, [r10+r2*%4+%4*2]
 lea r1, [r11+r2*%4+%4*2]
-%ifndef UNIX64 ; fall through to function
+%if UNIX64 == 0 ; fall through to function
 call stub_%2_h264_qpel%4_%3_10_%1
 RET
 %endif
@@ -165,7 +165,7 @@ cglobal %2_h264_qpel%5_%3_10_%1, %6,%7,%8
 MCAxA %1, %2, %3, %4, i, %5,%6,%7
 cglobal %2_h264_qpel%4_%3_10_%1, %5,%6,%7
-%ifndef UNIX64 ; no prologue or epilogue for UNIX64
+%if UNIX64 == 0 ; no prologue or epilogue for UNIX64
 call stub_%2_h264_qpel%4_%3_10_%1
 RET
 %endif
...
@@ -126,7 +126,7 @@ INIT_XMM
 WEIGHT_FUNC_HALF_MM 8, 8, sse2
 %macro BIWEIGHT_SETUP 0
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 %define off_regd r11d
 %else
 %define off_regd r3d
@@ -244,7 +244,7 @@ INIT_XMM
 BIWEIGHT_FUNC_HALF_MM 8, 8, sse2
 %macro BIWEIGHT_SSSE3_SETUP 0
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 %define off_regd r11d
 %else
 %define off_regd r3d
...
@@ -152,7 +152,7 @@ WEIGHT_FUNC_HALF_MM sse4
 ; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int height,
 ; int log2_denom, int weightd, int weights, int offset);
 ;-----------------------------------------------------------------------------
-%ifdef ARCH_X86_32
+%if ARCH_X86_32
 DECLARE_REG_TMP 3
 %else
 DECLARE_REG_TMP 10
...
@@ -219,13 +219,13 @@ cglobal imdct36_float, 4,4,9, out, buf, in, win
 subps m5, m0, m3
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 SWAP m5, m8
 %endif
 mulps m7, m2, [ps_val1]
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 mulps m5, m8, [ps_val2]
 %else
 mulps m5, m5, [ps_val2]
@@ -235,7 +235,7 @@ cglobal imdct36_float, 4,4,9, out, buf, in, win
 mulps m5, m6, [ps_val1]
 subps m7, m7, m5
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 SWAP m5, m8
 %else
 subps m5, m0, m3
@@ -376,7 +376,7 @@ DEFINE_IMDCT
 INIT_XMM sse
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
 %define SPILL SWAP
 %define UNSPILL SWAP
 %define SPILLED(x) m %+ x
...
...@@ -32,7 +32,7 @@ ...@@ -32,7 +32,7 @@
%define W6sh2 8867 ; W6 = 35468 = 8867<<2 %define W6sh2 8867 ; W6 = 35468 = 8867<<2
%define W7sh2 4520 ; W7 = 18081 = 4520<<2 + 1 %define W7sh2 4520 ; W7 = 18081 = 4520<<2 + 1
%ifdef ARCH_X86_64 %if ARCH_X86_64
SECTION_RODATA SECTION_RODATA
......
...@@ -106,7 +106,7 @@ SECTION .text ...@@ -106,7 +106,7 @@ SECTION .text
INIT_MMX INIT_MMX
cglobal vp3_v_loop_filter_mmx2, 3, 4 cglobal vp3_v_loop_filter_mmx2, 3, 4
%ifdef ARCH_X86_64 %if ARCH_X86_64
movsxd r1, r1d movsxd r1, r1d
%endif %endif
mov r3, r1 mov r3, r1
...@@ -123,7 +123,7 @@ cglobal vp3_v_loop_filter_mmx2, 3, 4 ...@@ -123,7 +123,7 @@ cglobal vp3_v_loop_filter_mmx2, 3, 4
RET RET
cglobal vp3_h_loop_filter_mmx2, 3, 4 cglobal vp3_h_loop_filter_mmx2, 3, 4
%ifdef ARCH_X86_64 %if ARCH_X86_64
movsxd r1, r1d movsxd r1, r1d
%endif %endif
lea r3, [r1*3] lea r3, [r1*3]
...@@ -510,7 +510,7 @@ cglobal vp3_h_loop_filter_mmx2, 3, 4 ...@@ -510,7 +510,7 @@ cglobal vp3_h_loop_filter_mmx2, 3, 4
%define SHIFT(x) %define SHIFT(x)
%define ADD(x) %define ADD(x)
VP3_1D_IDCT_SSE2 VP3_1D_IDCT_SSE2
%ifdef ARCH_X86_64 %if ARCH_X86_64
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else %else
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%1], [%1+16] TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%1], [%1+16]
...@@ -530,7 +530,7 @@ cglobal vp3_idct_%1, 1, 1, %2 ...@@ -530,7 +530,7 @@ cglobal vp3_idct_%1, 1, 1, %2
cglobal vp3_idct_put_%1, 3, %3, %2 cglobal vp3_idct_put_%1, 3, %3, %2
VP3_IDCT_%1 r2 VP3_IDCT_%1 r2
%ifdef ARCH_X86_64 %if ARCH_X86_64
mov r3, r2 mov r3, r2
mov r2, r1 mov r2, r1
mov r1, r0 mov r1, r0
...@@ -540,7 +540,7 @@ cglobal vp3_idct_put_%1, 3, %3, %2 ...@@ -540,7 +540,7 @@ cglobal vp3_idct_put_%1, 3, %3, %2
mov r1m, r0 mov r1m, r0
mov r2m, r1 mov r2m, r1
%endif %endif
%ifdef WIN64 %if WIN64
call put_signed_pixels_clamped_mmx call put_signed_pixels_clamped_mmx
RET RET
%else %else
...@@ -549,7 +549,7 @@ cglobal vp3_idct_put_%1, 3, %3, %2 ...@@ -549,7 +549,7 @@ cglobal vp3_idct_put_%1, 3, %3, %2
cglobal vp3_idct_add_%1, 3, %3, %2 cglobal vp3_idct_add_%1, 3, %3, %2
VP3_IDCT_%1 r2 VP3_IDCT_%1 r2
%ifdef ARCH_X86_64 %if ARCH_X86_64
mov r3, r2 mov r3, r2
mov r2, r1 mov r2, r1
mov r1, r0 mov r1, r0
...@@ -559,7 +559,7 @@ cglobal vp3_idct_add_%1, 3, %3, %2 ...@@ -559,7 +559,7 @@ cglobal vp3_idct_add_%1, 3, %3, %2
mov r1m, r0 mov r1m, r0
mov r2m, r1 mov r2m, r1
%endif %endif
%ifdef WIN64 %if WIN64
call add_pixels_clamped_mmx call add_pixels_clamped_mmx
RET RET
%else %else
...@@ -567,7 +567,7 @@ cglobal vp3_idct_add_%1, 3, %3, %2 ...@@ -567,7 +567,7 @@ cglobal vp3_idct_add_%1, 3, %3, %2
%endif %endif
%endmacro %endmacro
%ifdef ARCH_X86_64 %if ARCH_X86_64
%define REGS 4 %define REGS 4
%else %else
%define REGS 3 %define REGS 3
...@@ -599,7 +599,7 @@ vp3_idct_funcs sse2, 9, REGS ...@@ -599,7 +599,7 @@ vp3_idct_funcs sse2, 9, REGS
INIT_MMX INIT_MMX
cglobal vp3_idct_dc_add_mmx2, 3, 4 cglobal vp3_idct_dc_add_mmx2, 3, 4
%ifdef ARCH_X86_64 %if ARCH_X86_64
movsxd r1, r1d movsxd r1, r1d
%endif %endif
lea r3, [r1*3] lea r3, [r1*3]
......
...@@ -127,7 +127,7 @@ cglobal vp6_filter_diag4_%1, 5, 7, %2 ...@@ -127,7 +127,7 @@ cglobal vp6_filter_diag4_%1, 5, 7, %2
sub rsp, 8*15 sub rsp, 8*15
movq m6, [pw_64] movq m6, [pw_64]
%endif %endif
%ifdef ARCH_X86_64 %if ARCH_X86_64
movsxd r2, r2d movsxd r2, r2d
%endif %endif
......
...@@ -35,11 +35,13 @@ ...@@ -35,11 +35,13 @@
%define program_name ff %define program_name ff
%ifdef ARCH_X86_64 %define UNIX64 0
%define WIN64 0
%if ARCH_X86_64
%ifidn __OUTPUT_FORMAT__,win32 %ifidn __OUTPUT_FORMAT__,win32
%define WIN64 %define WIN64 1
%else %else
%define UNIX64 %define UNIX64 1
%endif %endif
%endif %endif
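Because UNIX64, WIN64 and the ARCH_*/HAVE_* symbols now always carry a numeric 0/1 value, the tests above are plain NASM `%if` expressions, which the preprocessor can negate, compare against numbers, and join with `&&` or `||`. A minimal sketch of that usage with illustrative values (in a real build the values are provided by the build configuration and the block above, not defined by hand):

%define ARCH_X86_64 1        ; illustrative; normally provided by the build configuration
%define HAVE_AVX    1

%if ARCH_X86_64 && HAVE_AVX  ; several symbols tested in one expression
    ; assemble the 64-bit AVX path
%elif ARCH_X86_64 == 0
    ; 32-bit fallback path
%endif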
...@@ -79,9 +81,9 @@ ...@@ -79,9 +81,9 @@
%endif %endif
%endmacro %endmacro
%ifdef WIN64 %if WIN64
%define PIC %define PIC
%elifndef ARCH_X86_64 %elif !ARCH_X86_64
; x86_32 doesn't require PIC. ; x86_32 doesn't require PIC.
; Some distros prefer shared objects to be PIC, but nothing breaks if ; Some distros prefer shared objects to be PIC, but nothing breaks if
; the code contains a few textrels, so we'll skip that complexity. ; the code contains a few textrels, so we'll skip that complexity.
...@@ -132,7 +134,7 @@ ...@@ -132,7 +134,7 @@
%define r%1m %6 %define r%1m %6
%ifid %6 ; i.e. it's a register %ifid %6 ; i.e. it's a register
%define r%1mp %2 %define r%1mp %2
%elifdef ARCH_X86_64 ; memory %elif ARCH_X86_64 ; memory
%define r%1mp qword %6 %define r%1mp qword %6
%else %else
%define r%1mp dword %6 %define r%1mp dword %6
...@@ -149,7 +151,7 @@ ...@@ -149,7 +151,7 @@
%define e%1w %1 %define e%1w %1
%define r%1b %2 %define r%1b %2
%define e%1b %2 %define e%1b %2
%ifndef ARCH_X86_64 %if ARCH_X86_64 == 0
%define r%1 e%1 %define r%1 e%1
%endif %endif
%endmacro %endmacro
...@@ -185,7 +187,7 @@ DECLARE_REG_SIZE bp, bpl ...@@ -185,7 +187,7 @@ DECLARE_REG_SIZE bp, bpl
DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9 DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9
%ifdef ARCH_X86_64 %if ARCH_X86_64
%define gprsize 8 %define gprsize 8
%else %else
%define gprsize 4 %define gprsize 4
...@@ -261,7 +263,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9 ...@@ -261,7 +263,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9
%assign n_arg_names %%i %assign n_arg_names %%i
%endmacro %endmacro
%ifdef WIN64 ; Windows x64 ;================================================= %if WIN64 ; Windows x64 ;=================================================
DECLARE_REG 0, rcx, ecx, cx, cl, ecx DECLARE_REG 0, rcx, ecx, cx, cl, ecx
DECLARE_REG 1, rdx, edx, dx, dl, edx DECLARE_REG 1, rdx, edx, dx, dl, edx
...@@ -346,7 +348,7 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56] ...@@ -346,7 +348,7 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56]
%endif %endif
%endmacro %endmacro
%elifdef ARCH_X86_64 ; *nix x64 ;============================================= %elif ARCH_X86_64 ; *nix x64 ;=============================================
DECLARE_REG 0, rdi, edi, di, dil, edi DECLARE_REG 0, rdi, edi, di, dil, edi
DECLARE_REG 1, rsi, esi, si, sil, esi DECLARE_REG 1, rsi, esi, si, sil, esi
...@@ -447,7 +449,7 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28] ...@@ -447,7 +449,7 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
%endif ;====================================================================== %endif ;======================================================================
%ifndef WIN64 %if WIN64 == 0
%macro WIN64_SPILL_XMM 1 %macro WIN64_SPILL_XMM 1
%endmacro %endmacro
%macro WIN64_RESTORE_XMM 1 %macro WIN64_RESTORE_XMM 1
...@@ -617,7 +619,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits ...@@ -617,7 +619,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
%define RESET_MM_PERMUTATION INIT_XMM %1 %define RESET_MM_PERMUTATION INIT_XMM %1
%define mmsize 16 %define mmsize 16
%define num_mmregs 8 %define num_mmregs 8
%ifdef ARCH_X86_64 %if ARCH_X86_64
%define num_mmregs 16 %define num_mmregs 16
%endif %endif
%define mova movdqa %define mova movdqa
...@@ -646,7 +648,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits ...@@ -646,7 +648,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
%define RESET_MM_PERMUTATION INIT_YMM %1 %define RESET_MM_PERMUTATION INIT_YMM %1
%define mmsize 32 %define mmsize 32
%define num_mmregs 8 %define num_mmregs 8
%ifdef ARCH_X86_64 %if ARCH_X86_64
%define num_mmregs 16 %define num_mmregs 16
%endif %endif
%define mova vmovaps %define mova vmovaps
......
...@@ -95,7 +95,7 @@ ...@@ -95,7 +95,7 @@
%endmacro %endmacro
%macro TRANSPOSE8x8W 9-11 %macro TRANSPOSE8x8W 9-11
%ifdef ARCH_X86_64 %if ARCH_X86_64
SBUTTERFLY wd, %1, %2, %9 SBUTTERFLY wd, %1, %2, %9
SBUTTERFLY wd, %3, %4, %9 SBUTTERFLY wd, %3, %4, %9
SBUTTERFLY wd, %5, %6, %9 SBUTTERFLY wd, %5, %6, %9
......
...@@ -64,7 +64,7 @@ SECTION .text ...@@ -64,7 +64,7 @@ SECTION .text
; split the loop in an aligned and unaligned case ; split the loop in an aligned and unaligned case
%macro YUYV_TO_Y_FN 2-3 %macro YUYV_TO_Y_FN 2-3
cglobal %2ToY, 3, 3, %1, dst, src, w cglobal %2ToY, 3, 3, %1, dst, src, w
%ifdef ARCH_X86_64 %if ARCH_X86_64
movsxd wq, wd movsxd wq, wd
%endif %endif
add dstq, wq add dstq, wq
...@@ -134,7 +134,7 @@ cglobal %2ToY, 3, 3, %1, dst, src, w ...@@ -134,7 +134,7 @@ cglobal %2ToY, 3, 3, %1, dst, src, w
; split the loop in an aligned and unaligned case ; split the loop in an aligned and unaligned case
%macro YUYV_TO_UV_FN 2-3 %macro YUYV_TO_UV_FN 2-3
cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
%ifdef ARCH_X86_64 %if ARCH_X86_64
movsxd wq, dword r4m movsxd wq, dword r4m
%else ; x86-32 %else ; x86-32
mov wq, r4m mov wq, r4m
...@@ -189,7 +189,7 @@ cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w ...@@ -189,7 +189,7 @@ cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
; %2 = nv12 or nv21 ; %2 = nv12 or nv21
%macro NVXX_TO_UV_FN 2 %macro NVXX_TO_UV_FN 2
cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
%ifdef ARCH_X86_64 %if ARCH_X86_64
movsxd wq, dword r4m movsxd wq, dword r4m
%else ; x86-32 %else ; x86-32
mov wq, r4m mov wq, r4m
...@@ -215,7 +215,7 @@ cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w ...@@ -215,7 +215,7 @@ cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
%endif ; mmsize == 8/16 %endif ; mmsize == 8/16
%endmacro %endmacro
%ifdef ARCH_X86_32 %if ARCH_X86_32
INIT_MMX mmx INIT_MMX mmx
YUYV_TO_Y_FN 0, yuyv YUYV_TO_Y_FN 0, yuyv
YUYV_TO_Y_FN 0, uyvy YUYV_TO_Y_FN 0, uyvy
......
...@@ -58,7 +58,7 @@ SECTION .text ...@@ -58,7 +58,7 @@ SECTION .text
%macro yuv2planeX_fn 3 %macro yuv2planeX_fn 3
%ifdef ARCH_X86_32 %if ARCH_X86_32
%define cntr_reg r1 %define cntr_reg r1
%define movsx mov %define movsx mov
%else %else
...@@ -72,7 +72,7 @@ cglobal yuv2planeX_%1, %3, 7, %2 ...@@ -72,7 +72,7 @@ cglobal yuv2planeX_%1, %3, 7, %2
%endif ; %1 == 8/9/10 %endif ; %1 == 8/9/10
%if %1 == 8 %if %1 == 8
%ifdef ARCH_X86_32 %if ARCH_X86_32
%assign pad 0x2c - (stack_offset & 15) %assign pad 0x2c - (stack_offset & 15)
SUB rsp, pad SUB rsp, pad
%define m_dith m7 %define m_dith m7
...@@ -91,7 +91,7 @@ cglobal yuv2planeX_%1, %3, 7, %2 ...@@ -91,7 +91,7 @@ cglobal yuv2planeX_%1, %3, 7, %2
.no_rot: .no_rot:
%if mmsize == 16 %if mmsize == 16
punpcklbw m_dith, m6 punpcklbw m_dith, m6
%ifdef ARCH_X86_64 %if ARCH_X86_64
punpcklwd m8, m_dith, m6 punpcklwd m8, m_dith, m6
pslld m8, 12 pslld m8, 12
%else ; x86-32 %else ; x86-32
...@@ -100,7 +100,7 @@ cglobal yuv2planeX_%1, %3, 7, %2 ...@@ -100,7 +100,7 @@ cglobal yuv2planeX_%1, %3, 7, %2
%endif ; x86-32/64 %endif ; x86-32/64
punpckhwd m_dith, m6 punpckhwd m_dith, m6
pslld m_dith, 12 pslld m_dith, 12
%ifdef ARCH_X86_32 %if ARCH_X86_32
mova [rsp+ 0], m5 mova [rsp+ 0], m5
mova [rsp+16], m_dith mova [rsp+16], m_dith
%endif %endif
...@@ -135,7 +135,7 @@ cglobal yuv2planeX_%1, %3, 7, %2 ...@@ -135,7 +135,7 @@ cglobal yuv2planeX_%1, %3, 7, %2
%endif ; %1 == 8 %endif ; %1 == 8
%if %1 == 8 %if %1 == 8
%ifdef ARCH_X86_32 %if ARCH_X86_32
mova m2, [rsp+mmsize*(0+%%i)] mova m2, [rsp+mmsize*(0+%%i)]
mova m1, [rsp+mmsize*(1+%%i)] mova m1, [rsp+mmsize*(1+%%i)]
%else ; x86-64 %else ; x86-64
...@@ -233,7 +233,7 @@ cglobal yuv2planeX_%1, %3, 7, %2 ...@@ -233,7 +233,7 @@ cglobal yuv2planeX_%1, %3, 7, %2
jg .pixelloop jg .pixelloop
%if %1 == 8 %if %1 == 8
%ifdef ARCH_X86_32 %if ARCH_X86_32
ADD rsp, pad ADD rsp, pad
RET RET
%else ; x86-64 %else ; x86-64
...@@ -245,7 +245,7 @@ cglobal yuv2planeX_%1, %3, 7, %2 ...@@ -245,7 +245,7 @@ cglobal yuv2planeX_%1, %3, 7, %2
%endmacro %endmacro
%define PALIGNR PALIGNR_MMX %define PALIGNR PALIGNR_MMX
%ifdef ARCH_X86_32 %if ARCH_X86_32
INIT_MMX mmx2 INIT_MMX mmx2
yuv2planeX_fn 8, 0, 7 yuv2planeX_fn 8, 0, 7
yuv2planeX_fn 9, 0, 5 yuv2planeX_fn 9, 0, 5
...@@ -382,7 +382,7 @@ cglobal yuv2plane1_%1, %3, %3, %2 ...@@ -382,7 +382,7 @@ cglobal yuv2plane1_%1, %3, %3, %2
REP_RET REP_RET
%endmacro %endmacro
%ifdef ARCH_X86_32 %if ARCH_X86_32
INIT_MMX mmx INIT_MMX mmx
yuv2plane1_fn 8, 0, 5 yuv2plane1_fn 8, 0, 5
yuv2plane1_fn 16, 0, 3 yuv2plane1_fn 16, 0, 3
......
...@@ -51,7 +51,7 @@ SECTION .text ...@@ -51,7 +51,7 @@ SECTION .text
; SCALE_FUNC source_width, intermediate_nbits, filtersize, filtersuffix, opt, n_args, n_xmm ; SCALE_FUNC source_width, intermediate_nbits, filtersize, filtersuffix, opt, n_args, n_xmm
%macro SCALE_FUNC 7 %macro SCALE_FUNC 7
cglobal hscale%1to%2_%4_%5, %6, 7, %7 cglobal hscale%1to%2_%4_%5, %6, 7, %7
%ifdef ARCH_X86_64 %if ARCH_X86_64
movsxd r2, r2d movsxd r2, r2d
%endif ; x86-64 %endif ; x86-64
%if %2 == 19 %if %2 == 19
...@@ -237,7 +237,7 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7 ...@@ -237,7 +237,7 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7
%else ; %4 == X || %4 == X8 %else ; %4 == X || %4 == X8
%define r6sub 0 %define r6sub 0
%endif ; %4 ==/!= X4 %endif ; %4 ==/!= X4
%ifdef ARCH_X86_64 %if ARCH_X86_64
push r12 push r12
movsxd r6, r6d ; filterSize movsxd r6, r6d ; filterSize
lea r12, [r3+(r6-r6sub)*srcmul] ; &src[filterSize&~4] lea r12, [r3+(r6-r6sub)*srcmul] ; &src[filterSize&~4]
...@@ -384,7 +384,7 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7 ...@@ -384,7 +384,7 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7
%ifnidn %3, X %ifnidn %3, X
REP_RET REP_RET
%else ; %3 == X %else ; %3 == X
%ifdef ARCH_X86_64 %if ARCH_X86_64
pop r12 pop r12
RET RET
%else ; x86-32 %else ; x86-32
...@@ -419,7 +419,7 @@ SCALE_FUNCS 10, 19, %1, %3 ...@@ -419,7 +419,7 @@ SCALE_FUNCS 10, 19, %1, %3
SCALE_FUNCS 16, 19, %1, %4 SCALE_FUNCS 16, 19, %1, %4
%endmacro %endmacro
%ifdef ARCH_X86_32 %if ARCH_X86_32
INIT_MMX INIT_MMX
SCALE_FUNCS2 mmx, 0, 0, 0 SCALE_FUNCS2 mmx, 0, 0, 0
%endif %endif
......
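In the swscale files above, the same numeric symbols also gate entire template instantiations, so that MMX-only variants are assembled only for 32-bit builds. A minimal sketch of that pattern; SCALE_TEMPLATE and example_%1 are hypothetical names for illustration, not functions from the tree:

%macro SCALE_TEMPLATE 1
cglobal example_%1, 3, 3, 2
    ; ... body ...
    RET
%endmacro

%if ARCH_X86_32              ; instantiate the MMX variant only on 32-bit builds
INIT_MMX
SCALE_TEMPLATE mmx
%endif
INIT_XMM
SCALE_TEMPLATE sse2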