Commit 2fc7c818 authored by Michael Niedermayer

Merge remote-tracking branch 'qatar/master'

* qatar/master:
  x86: fix build with nasm 2.08
  x86: use nop cpu directives only if supported
  x86: fix rNmp macros with nasm
  build: add trailing / to yasm/nasm -I flags
  x86: use 32-bit source registers with movd instruction
  x86: add colons after labels

Conflicts:
	Makefile
	libavutil/x86/x86inc.asm
Merged-by: Michael Niedermayer <michaelni@gmx.at>
parents 2da5a5ce edd82267
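
Note: most of the hunks below make one mechanical change, adding a trailing colon to labels. yasm accepts a bare label on a line by itself, but nasm warns about it ("label alone on a line without a colon might be in error", the orphan-labels warning), while the colon form is parsed identically by both assemblers. A minimal sketch of the portable spelling (illustrative fragment, not taken from this diff):

        mov     ecx, 16
    .loop:                  ; the colon makes this unambiguous to nasm
        dec     ecx
        jnz     .loop

The pre-merge spelling ".loop" without the colon assembles under yasm but triggers nasm's warning on every such label.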
@@ -29,7 +29,8 @@ CFLAGS += $(ECFLAGS)
 CCFLAGS = $(CPPFLAGS) $(CFLAGS)
 ASFLAGS := $(CPPFLAGS) $(ASFLAGS)
 CXXFLAGS += $(CPPFLAGS) $(CFLAGS)
-YASMFLAGS += $(IFLAGS) -I$(SRC_PATH)/libavutil/x86/ -Pconfig.asm
+YASMFLAGS += $(IFLAGS:%=%/) -I$(SRC_PATH)/libavutil/x86/ -Pconfig.asm
 HOSTCCFLAGS = $(IFLAGS) $(HOSTCFLAGS)
 LDFLAGS := $(ALLFFLIBS:%=-Llib%) $(LDFLAGS)
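
The Makefile hunk supports the nasm include handling: the substitution reference $(IFLAGS:%=%/) appends a slash to every word of $(IFLAGS), so each -I path ends with a path separator. nasm is documented to prepend the -I argument to include file names verbatim, so without the trailing separator %include lookups would not resolve. A small GNU Make sketch (variable values invented for illustration):

    IFLAGS := -Ifoo -Ibar
    $(info $(IFLAGS:%=%/))    # prints "-Ifoo/ -Ibar/"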
@@ -1220,6 +1220,7 @@ HAVE_LIST="
     closesocket
     cmov
     cpuid
+    cpunop
     dcbzl
     dev_bktr_ioctl_bt848_h
     dev_bktr_ioctl_meteor_h

@@ -3229,6 +3230,7 @@ EOF
             die "yasm not found, use --disable-yasm for a crippled build"
         check_yasm "vextractf128 xmm0, ymm0, 0" || disable avx
         check_yasm "vfmaddps ymm0, ymm1, ymm2, ymm3" || disable fma4
+        check_yasm "CPU amdnop" && enable cpunop
     fi

     case "$cpu" in
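
The new probe follows configure's usual pattern: try to assemble a snippet and enable the feature only if the assembler accepts it. yasm understands the non-standard "CPU amdnop" directive while nasm does not, so cpunop (and with it HAVE_CPUNOP in the generated config files) ends up enabled only for yasm builds. An illustrative reimplementation of the idea, not configure's actual code:

    check_yasm(){   # succeeds iff the configured assembler accepts the snippet
        echo "$1" > probe.asm && $yasmexe -o /dev/null probe.asm 2>/dev/null
    }
    check_yasm "CPU amdnop" && enable cpunop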
@@ -39,7 +39,7 @@ cglobal deinterlace_line_mmx, 7,7,7, dst, lum_m4, lum_m3, lum_m2, lum_m1
 %endif
     pxor    mm7, mm7
     movq    mm6, [pw_4]
-.nextrow
+.nextrow:
     movd    mm0, [lum_m4q]
     movd    mm1, [lum_m3q]
     movd    mm2, [lum_m2q]
@@ -1143,7 +1143,7 @@ VECTOR_CLIP_INT32 6, 1, 0, 0
 cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
     lea       lenq, [lend*4 - 2*mmsize]
 ALIGN 16
-.loop
+.loop:
 %if cpuflag(avx)
     vmovaps   xmm0, [src1q + 16]
     vinsertf128 m0, m0, [src1q], 1

@@ -1182,7 +1182,7 @@ VECTOR_FMUL_REVERSE
 cglobal vector_fmul_add, 5,5,2, dst, src0, src1, src2, len
     lea       lenq, [lend*4 - 2*mmsize]
 ALIGN 16
-.loop
+.loop:
     mova        m0, [src0q + lenq]
     mova        m1, [src0q + lenq + mmsize]
     mulps       m0, m0, [src1q + lenq]

@@ -1313,7 +1313,7 @@ cglobal bswap32_buf, 3,4,5
     add         r0, 4
     dec         r2
     jnz .loop2
-.end
+.end:
     RET
 ; %1 = aligned/unaligned
@@ -184,7 +184,7 @@ cglobal hadamard8_diff16_%1, 5, 6, %2
     call hadamard8x8_diff_%1
     add r5d, eax
-.done
+.done:
     mov eax, r5d
 %ifndef m8
     ADD rsp, pad

@@ -288,7 +288,7 @@ cglobal sse16_sse2, 5, 5, 8
     pxor m0, m0         ; mm0 = 0
     pxor m7, m7         ; mm7 holds the sum
-.next2lines ; FIXME why are these unaligned movs? pix1[] is aligned
+.next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned
     movu m1, [r1   ]    ; mm1 = pix1[0][0-15]
     movu m2, [r2   ]    ; mm2 = pix2[0][0-15]
     movu m3, [r1+r3]    ; mm3 = pix1[1][0-15]
@@ -608,7 +608,7 @@ cglobal fft_calc, 2,5,8
     add    rcx, 3
     shl    r2, cl
     sub    r4, r2
-.loop
+.loop:
 %if mmsize == 8
     PSWAPD m0, [r4 + r2 + 4]
     mova [r4 + r2 + 4], m0
@@ -404,7 +404,7 @@ cglobal float_interleave2_%1, 3,4,%2, dst, src, len, src1
     mov   src1q, [srcq+gprsize]
     mov    srcq, [srcq        ]
     sub   src1q, srcq
-.loop
+.loop:
     MOVPS    m0, [srcq             ]
     MOVPS    m1, [srcq+src1q       ]
     MOVPS    m3, [srcq       +mmsize]
@@ -69,7 +69,7 @@ SECTION .text
 %macro mv0_pixels_mc8 0
     lea           r4, [r2*2 ]
-.next4rows
+.next4rows:
     movq         mm0, [r1   ]
     movq         mm1, [r1+r2]
     add           r1, r4

@@ -117,7 +117,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7 + extra_regs, 0
     mv0_pixels_mc8
     REP_RET
-.at_least_one_non_zero
+.at_least_one_non_zero:
 %ifidn %2, rv40
 %if ARCH_X86_64
     mov           r7, r5

@@ -145,7 +145,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7 + extra_regs, 0
     test         r4d, r4d
     mov           r6, r2        ; dxy = x ? 1 : stride
     jne .both_non_zero
-.my_is_zero
+.my_is_zero:
     ; mx == 0 XOR my == 0 - 1 dimensional filter only
     or           r4d, r5d       ; x + y

@@ -166,7 +166,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7 + extra_regs, 0
     pxor          m7, m7
     psubw         m4, m5        ; mm4 = A = 8-x
-.next1drow
+.next1drow:
     movq          m0, [r1   ]   ; mm0 = src[0..7]
     movq          m2, [r1+r6]   ; mm1 = src[1..8]

@@ -197,7 +197,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7 + extra_regs, 0
     jne .next1drow
     REP_RET
-.both_non_zero ; general case, bilinear
+.both_non_zero: ; general case, bilinear
     movd          m4, r4d       ; x
     movd          m6, r5d       ; y
 %ifidn %2, rv40

@@ -232,7 +232,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7 + extra_regs, 0
     movq          m0, [r1  ]    ; mm0 = src[0..7]
     movq          m1, [r1+1]    ; mm1 = src[1..8]
-.next2drow
+.next2drow:
     add           r1, r2
     movq          m2, m0

@@ -330,7 +330,7 @@ cglobal %1_%2_chroma_mc4_%3, 6, 6 + extra_regs, 0
     pmullw        m6, m2
     paddw         m6, m0
-.next2rows
+.next2rows:
     movd          m0, [r1  ]
     movd          m1, [r1+1]
     add           r1, r2

@@ -397,7 +397,7 @@ cglobal %1_%2_chroma_mc2_%3, 6, 7, 0
     punpcklbw     m2, m7
     pshufw        m2, m2, 0x94  ; mm0 = src[0,1,1,2]
-.nextrow
+.nextrow:
     add           r1, r2
     movq          m1, m2
     pmaddwd       m1, m5        ; mm1 = A * src[0,1] + B * src[1,2]

@@ -474,7 +474,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8
     mv0_pixels_mc8
     REP_RET
-.at_least_one_non_zero
+.at_least_one_non_zero:
     test         r5d, r5d
     je .my_is_zero
     test         r4d, r4d

@@ -501,7 +501,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8
     movlhps       m7, m7
     movlhps       m6, m6
-.next2rows
+.next2rows:
     movq          m1, [r1+r2*1  ]
     movq          m2, [r1+r2*1+1]
     movq          m3, [r1+r2*2  ]

@@ -535,7 +535,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8
     jg .next2rows
     REP_RET
-.my_is_zero
+.my_is_zero:
     mov          r5d, r4d
     shl          r4d, 8
     add           r4, 8

@@ -545,7 +545,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8
     pshuflw       m7, m7, 0
     movlhps       m7, m7
-.next2xrows
+.next2xrows:
     movq          m0, [r1   ]
     movq          m1, [r1 +1]
     movq          m2, [r1+r2 ]

@@ -572,7 +572,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8
     jg .next2xrows
     REP_RET
-.mx_is_zero
+.mx_is_zero:
     mov          r4d, r5d
     shl          r5d, 8
     add           r5, 8

@@ -582,7 +582,7 @@ cglobal %1_%2_chroma_mc8_%3, 6, 7, 8
     pshuflw       m7, m7, 0
     movlhps       m7, m7
-.next2yrows
+.next2yrows:
     movq          m0, [r1    ]
     movq          m1, [r1+r2 ]
     movdqa        m2, m1

@@ -632,7 +632,7 @@ cglobal %1_%2_chroma_mc4_%3, 6, 7, 0
     punpcklbw     m0, [r1+1]
     pshufw        m6, m6, 0
-.next2rows
+.next2rows:
     movd          m1, [r1+r2*1  ]
     movd          m3, [r1+r2*2  ]
     punpcklbw     m1, [r1+r2*1+1]
@@ -38,7 +38,7 @@ SECTION .text
 %macro MV0_PIXELS_MC8 0
     lea           r4, [r2*3   ]
     lea           r5, [r2*4   ]
-.next4rows
+.next4rows:
     movu          m0, [r1     ]
     movu          m1, [r1+r2  ]
     CHROMAMC_AVG  m0, [r0     ]

@@ -72,14 +72,14 @@ cglobal %1_h264_chroma_mc8_10, 6,7,8
     MV0_PIXELS_MC8
     REP_RET
-.at_least_one_non_zero
+.at_least_one_non_zero:
     mov          r6d, 2
     test         r5d, r5d
     je .x_interpolation
     mov           r6, r2        ; dxy = x ? 1 : stride
     test         r4d, r4d
     jne .xy_interpolation
-.x_interpolation
+.x_interpolation:
     ; mx == 0 XOR my == 0 - 1 dimensional filter only
     or           r4d, r5d       ; x + y
     movd          m5, r4d

@@ -88,7 +88,7 @@ cglobal %1_h264_chroma_mc8_10, 6,7,8
     SPLATW        m5, m5        ; mm5 = B = x
     psubw         m4, m5        ; mm4 = A = 8-x
-.next1drow
+.next1drow:
     movu          m0, [r1   ]   ; mm0 = src[0..7]
     movu          m2, [r1+r6]   ; mm2 = src[1..8]

@@ -107,7 +107,7 @@ cglobal %1_h264_chroma_mc8_10, 6,7,8
     jne .next1drow
     REP_RET
-.xy_interpolation ; general case, bilinear
+.xy_interpolation: ; general case, bilinear
     movd          m4, r4m       ; x
     movd          m6, r5m       ; y

@@ -125,7 +125,7 @@ cglobal %1_h264_chroma_mc8_10, 6,7,8
     movu          m0, [r1  ]    ; mm0 = src[0..7]
     movu          m1, [r1+2]    ; mm1 = src[1..8]
-.next2drow
+.next2drow:
     add           r1, r2
     pmullw        m2, m0, m4

@@ -192,7 +192,7 @@ cglobal %1_h264_chroma_mc4_10, 6,6,7
     pmullw        m6, m2
     paddw         m6, m0
-.next2rows
+.next2rows:
     MC4_OP m0, m6
     MC4_OP m6, m0
     sub          r3d, 2

@@ -221,7 +221,7 @@ cglobal %1_h264_chroma_mc2_10, 6,7
     pxor          m7, m7
     pshufw        m2, [r1], 0x94    ; mm0 = src[0,1,1,2]
-.nextrow
+.nextrow:
     add           r1, r2
     movq          m1, m2
     pmaddwd       m1, m5            ; mm1 = A * src[0,1] + B * src[1,2]
@@ -625,7 +625,7 @@ cglobal deblock_v_luma_intra_10, 4,7,16
     shl          r2d, 2
     shl          r3d, 2
     LOAD_AB       aa, bb, r2d, r3d
-.loop
+.loop:
     mova          p2, [r4+r1]
     mova          p1, [r4+2*r1]
     mova          p0, [r4+r5]

@@ -676,7 +676,7 @@ cglobal deblock_h_luma_intra_10, 4,7,16
     mova          m0, [pw_2]
     shl          r2d, 2
     shl          r3d, 2
-.loop
+.loop:
     movu          q3, [r0-8]
     movu          q2, [r0+r1-8]
     movu          q1, [r0+r1*2-8]
@@ -308,7 +308,7 @@ cglobal h264_idct_add16_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block, str
 %ifdef PIC
     lea      picregq, [scan8_mem]
 %endif
-.nextblock
+.nextblock:
     movzx         r6, byte [scan8+r5]
     movzx         r6, byte [r4+r6]
     test          r6, r6

@@ -316,7 +316,7 @@ cglobal h264_idct_add16_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block, str
     mov          r6d, dword [r1+r5*4]
     lea           r6, [r0+r6]
     IDCT4_ADD     r6, r2, r3
-.skipblock
+.skipblock:
     inc           r5
     add           r2, 32
     cmp           r5, 16

@@ -333,7 +333,7 @@ cglobal h264_idct8_add4_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block, str
 %ifdef PIC
     lea      picregq, [scan8_mem]
 %endif
-.nextblock
+.nextblock:
     movzx         r6, byte [scan8+r5]
     movzx         r6, byte [r4+r6]
     test          r6, r6

@@ -347,7 +347,7 @@ cglobal h264_idct8_add4_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block, str
     mov          r6d, dword [r1+r5*4]
     lea           r6, [r0+r6+4]
     IDCT8_ADD_MMX_END r6 , rsp+8, r3
-.skipblock
+.skipblock:
     add           r5, 4
     add           r2, 128
     cmp           r5, 16

@@ -362,7 +362,7 @@ cglobal h264_idct_add16_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, s
 %ifdef PIC
     lea      picregq, [scan8_mem]
 %endif
-.nextblock
+.nextblock:
     movzx         r6, byte [scan8+r5]
     movzx         r6, byte [r4+r6]
     test          r6, r6

@@ -388,11 +388,11 @@ cglobal h264_idct_add16_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, s
     cmp           r5, 16
     jl .nextblock
     REP_RET
-.no_dc
+.no_dc:
     mov          r6d, dword [r1+r5*4]
     add           r6, r0
     IDCT4_ADD     r6, r2, r3
-.skipblock
+.skipblock:
     inc           r5
     add           r2, 32
     cmp           r5, 16

@@ -406,7 +406,7 @@ cglobal h264_idct_add16intra_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block
 %ifdef PIC
     lea      picregq, [scan8_mem]
 %endif
-.nextblock
+.nextblock:
     movzx         r6, byte [scan8+r5]
     movzx         r6, byte [r4+r6]
     or           r6w, word [r2]

@@ -415,7 +415,7 @@ cglobal h264_idct_add16intra_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block
     mov          r6d, dword [r1+r5*4]
     add           r6, r0
     IDCT4_ADD     r6, r2, r3
-.skipblock
+.skipblock:
     inc           r5
     add           r2, 32
     cmp           r5, 16

@@ -429,7 +429,7 @@ cglobal h264_idct_add16intra_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, blo
 %ifdef PIC
     lea      picregq, [scan8_mem]
 %endif
-.nextblock
+.nextblock:
     movzx         r6, byte [scan8+r5]
     movzx         r6, byte [r4+r6]
     test          r6, r6

@@ -442,7 +442,7 @@ cglobal h264_idct_add16intra_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, blo
     cmp           r5, 16
     jl .nextblock
     REP_RET
-.try_dc
+.try_dc:
     movsx         r6, word [r2]
     test          r6, r6
     jz .skipblock

@@ -457,7 +457,7 @@ cglobal h264_idct_add16intra_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, blo
 %if ARCH_X86_64 == 0
     mov           r1, r1m
 %endif
-.skipblock
+.skipblock:
     inc           r5
     add           r2, 32
     cmp           r5, 16

@@ -474,7 +474,7 @@ cglobal h264_idct8_add4_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, s
 %ifdef PIC
     lea      picregq, [scan8_mem]
 %endif
-.nextblock
+.nextblock:
     movzx         r6, byte [scan8+r5]
     movzx         r6, byte [r4+r6]
     test          r6, r6

@@ -504,7 +504,7 @@ cglobal h264_idct8_add4_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, s
     ADD          rsp, pad
     RET
-.no_dc
+.no_dc:
     mov          r6d, dword [r1+r5*4]
     add           r6, r0
     add word [r2], 32

@@ -514,7 +514,7 @@ cglobal h264_idct8_add4_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, s
     mov          r6d, dword [r1+r5*4]
     lea           r6, [r0+r6+4]
     IDCT8_ADD_MMX_END r6 , rsp+8, r3
-.skipblock
+.skipblock:
     add           r5, 4
     add           r2, 128
     cmp           r5, 16

@@ -531,7 +531,7 @@ cglobal h264_idct8_add4_8_sse2, 5, 8 + npicregs, 10, dst1, block_offset, block,
 %ifdef PIC
     lea      picregq, [scan8_mem]
 %endif
-.nextblock
+.nextblock:
     movzx         r6, byte [scan8+r5]
     movzx         r6, byte [r4+r6]
     test          r6, r6

@@ -560,7 +560,7 @@ INIT_MMX
     cmp           r5, 16
     jl .nextblock
     REP_RET
-.no_dc
+.no_dc:
 INIT_XMM
     mov        dst2d, dword [r1+r5*4]
     add        dst2q, r0

@@ -568,7 +568,7 @@ INIT_XMM
 %if ARCH_X86_64 == 0
     mov           r1, r1m
 %endif
-.skipblock
+.skipblock:
     add           r5, 4
     add           r2, 128
     cmp           r5, 16

@@ -577,7 +577,7 @@ INIT_XMM
 INIT_MMX
 h264_idct_add8_mmx_plane:
-.nextblock
+.nextblock:
     movzx         r6, byte [scan8+r5]
     movzx         r6, byte [r4+r6]
     or           r6w, word [r2]

@@ -592,7 +592,7 @@ h264_idct_add8_mmx_plane:
     add           r0, dword [r1+r5*4]
 %endif
     IDCT4_ADD     r0, r2, r3
-.skipblock
+.skipblock:
     inc           r5
     add           r2, 32
     test          r5, 3

@@ -621,8 +621,8 @@ cglobal h264_idct_add8_8_mmx, 5, 8 + npicregs, 0, dst1, block_offset, block, str
     call h264_idct_add8_mmx_plane
     RET

-h264_idct_add8_mmx2_plane
-.nextblock
+h264_idct_add8_mmx2_plane:
+.nextblock:
     movzx         r6, byte [scan8+r5]
     movzx         r6, byte [r4+r6]
     test          r6, r6

@@ -641,7 +641,7 @@ h264_idct_add8_mmx2_plane
     test          r5, 3
     jnz .nextblock
     rep ret
-.try_dc
+.try_dc:
     movsx         r6, word [r2]
     test          r6, r6
     jz .skipblock

@@ -655,7 +655,7 @@ h264_idct_add8_mmx2_plane
     add           r0, dword [r1+r5*4]
 %endif
     DC_ADD_MMX2_OP movh, r0, r3, r6
-.skipblock
+.skipblock:
     inc           r5
     add           r2, 32
     test          r5, 3

@@ -734,7 +734,7 @@ h264_add8x4_idct_sse2:
     add           r0, r0m
 %endif
     call h264_add8x4_idct_sse2
-.cycle%1end
+.cycle%1end:
 %if %1 < 7
     add           r2, 64
 %endif

@@ -770,7 +770,7 @@ cglobal h264_idct_add16_8_sse2, 5, 5 + ARCH_X86_64, 8
 %endif
     call h264_add8x4_idct_sse2
     jmp .cycle%1end
-.try%1dc
+.try%1dc:
     movsx         r0, word [r2   ]
     or           r0w, word [r2+32]
     jz .cycle%1end

@@ -781,7 +781,7 @@ cglobal h264_idct_add16_8_sse2, 5, 5 + ARCH_X86_64, 8
     add           r0, r0m
 %endif
     call h264_idct_dc_add8_mmx2
-.cycle%1end
+.cycle%1end:
 %if %1 < 7
     add           r2, 64
 %endif

@@ -817,7 +817,7 @@ cglobal h264_idct_add16intra_8_sse2, 5, 7 + ARCH_X86_64, 8
 %endif
     call h264_add8x4_idct_sse2
     jmp .cycle%1end
-.try%1dc
+.try%1dc:
     movsx         r0, word [r2   ]
     or           r0w, word [r2+32]
     jz .cycle%1end

@@ -830,7 +830,7 @@ cglobal h264_idct_add16intra_8_sse2, 5, 7 + ARCH_X86_64, 8
     add           r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
 %endif
     call h264_idct_dc_add8_mmx2
-.cycle%1end
+.cycle%1end:
 %if %1 == 1
     add           r2, 384+64
 %elif %1 < 3
@@ -225,7 +225,7 @@ IDCT8_DC_ADD
 ; h264_idct_add16intra(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
 ;-----------------------------------------------------------------------------
 %macro AC 1
-.ac%1
+.ac%1:
     mov  r5d, [r1+(%1+0)*4]
     call add4x4_idct %+ SUFFIX
     mov  r5d, [r1+(%1+1)*4]
@@ -484,7 +484,7 @@ cglobal pred16x16_plane_%1, 2,9,7
 %endif
     mov           r4, 8
-.loop
+.loop:
     mova          m3, m0        ; b[0..7]
     mova          m4, m2        ; b[8..15]
     psraw         m3, 5

@@ -680,7 +680,7 @@ cglobal pred8x8_plane, 2,9,7
     mov           r4, 4
 ALIGN 16
-.loop
+.loop:
 %if mmsize == 16
     mova          m3, m0        ; b[0..7]
     paddw         m0, m1

@@ -1045,7 +1045,7 @@ cglobal pred8x8l_top_dc_%1, 4,4
     psrlq        mm5, 56
     psllq        mm5, 56
     pxor         mm1, mm5
-.body
+.body:
     PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
     psadbw       mm7, mm0
     paddw        mm7, [pw_4]

@@ -1141,7 +1141,7 @@ cglobal pred8x8l_dc_%1, 4,5
     jz .fix_lt_2
     test          r2, r2
     jz .fix_tr_1
-.body
+.body:
     lea           r1, [r0+r3*2]
     PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
     pxor         mm0, mm0

@@ -1276,7 +1276,7 @@ cglobal pred8x8l_vertical_%1, 4,4
     psrlq        mm5, 56
     psllq        mm5, 56
     pxor         mm1, mm5
-.body
+.body:
     PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
 %rep 3
     movq [r0+r3*1], mm0

@@ -1576,7 +1576,7 @@ cglobal pred8x8l_down_right_mmxext, 4,5
     psllq        mm5, 56
     pxor         mm1, mm5
     jmp .do_top
-.body
+.body:
     lea           r1, [r0+r3*2]
     movq         mm1, mm7
     movq         mm7, mm5

@@ -1822,7 +1822,7 @@ cglobal pred8x8l_vertical_right_mmxext, 4,5
     jz .fix_lt_2
     test          r2, r2
     jz .fix_tr_1
-.do_top
+.do_top:
     PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
     lea           r1, [r0+r3*2]
     movq         mm2, mm6

@@ -1931,7 +1931,7 @@ cglobal pred8x8l_vertical_right_%1, 4,5,7
     jz .fix_lt_2
     test          r2, r2
     jz .fix_tr_1
-.do_top
+.do_top:
     PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
     lea           r1, [r0+r3*2]
     movq2dq     xmm4, mm6
@@ -264,7 +264,7 @@ cglobal_mc %1, %2, mc20, %3, 3,4,9
 %else
     %define p16 [pw_16]
 %endif
-.nextrow
+.nextrow:
 %if %0 == 4
     movu          m2, [r1-4]
     movu          m3, [r1-2]

@@ -330,7 +330,7 @@ MC_CACHE MC30
 %macro MC10 3-4
 cglobal_mc %1, %2, mc10, %3, 3,5,9
     mov           r4, r1
-.body
+.body:
     mov          r3d, %3
     mova          m1, [pw_pixel_max]
 %if num_mmregs > 8

@@ -339,7 +339,7 @@ cglobal_mc %1, %2, mc10, %3, 3,5,9
 %else
     %define p16 [pw_16]
 %endif
-.nextrow
+.nextrow:
 %if %0 == 4
     movu          m2, [r1-4]
     movu          m3, [r1-2]

@@ -446,7 +446,7 @@ MC MC02
 %macro MC01 3
 cglobal_mc %1, %2, mc01, %3, 3,5,8
     mov           r4, r1
-.body
+.body:
     PRELOAD_V
     sub           r4, r2

@@ -535,7 +535,7 @@ SWAP 0,1,2,3,4,5
 ; this REALLY needs x86_64
 cglobal_mc %1, %2, mc11, %3, 3,6,8
     mov           r4, r1
-.body
+.body:
     PRELOAD_V
     sub           r0, r2

@@ -778,7 +778,7 @@ cglobal_mc %1, %2, mc12, %3, 3,7,12
     call put_hv%3_10_%1
     xor          r4d, r4d
-.body
+.body:
     mov          r3d, %3
     pxor          m0, m0
     mova          m7, [pw_pixel_max]

@@ -837,7 +837,7 @@ put_h%2_10_%1:
     mov          r3d, %2
     xor          r4d, r4d
     mova          m6, [pad20]
-.nextrow
+.nextrow:
     movu          m2, [r5-4]
     movu          m3, [r5-2]
     movu          m4, [r5+0]

@@ -864,7 +864,7 @@ H_NRD sse2 , 8
 %macro MC21 3
 cglobal_mc %1, %2, mc21, %3, 3,7,12
     mov           r5, r1
-.body
+.body:
 %define PAD mmsize*8*3*2        ; SIZE*16*4*sizeof(pixel)
     mov           r6, rsp       ; backup stack pointer
     and          rsp, ~(mmsize-1) ; align stack
@@ -73,7 +73,7 @@ SECTION .text
 INIT_MMX
 cglobal h264_weight_16_mmx2, 6, 6, 0
     WEIGHT_SETUP
-.nextrow
+.nextrow:
     WEIGHT_OP 0,  4
     mova     [r0  ], m0
     WEIGHT_OP 8, 12

@@ -86,7 +86,7 @@ cglobal h264_weight_16_mmx2, 6, 6, 0
 %macro WEIGHT_FUNC_MM 3
 cglobal h264_weight_%1_%3, 6, 6, %2
     WEIGHT_SETUP
-.nextrow
+.nextrow:
     WEIGHT_OP 0, mmsize/2
     mova    [r0], m0
     add      r0, r1

@@ -105,7 +105,7 @@ cglobal h264_weight_%1_%3, 6, 6, %2
     WEIGHT_SETUP
     sar     r2d, 1
     lea      r3, [r1*2]
-.nextrow
+.nextrow:
     WEIGHT_OP 0, r1
     movh   [r0], m0
 %if mmsize == 16

@@ -178,7 +178,7 @@ INIT_MMX
 cglobal h264_biweight_16_mmx2, 7, 8, 0
     BIWEIGHT_SETUP
     movifnidn r3d, r3m
-.nextrow
+.nextrow:
     BIWEIGHT_STEPA 0, 1, 0
     BIWEIGHT_STEPA 1, 2, 4
     BIWEIGHT_STEPB

@@ -197,7 +197,7 @@ cglobal h264_biweight_16_mmx2, 7, 8, 0
 cglobal h264_biweight_%1_%3, 7, 8, %2
     BIWEIGHT_SETUP
     movifnidn r3d, r3m
-.nextrow
+.nextrow:
     BIWEIGHT_STEPA 0, 1, 0
     BIWEIGHT_STEPA 1, 2, mmsize/2
     BIWEIGHT_STEPB

@@ -220,7 +220,7 @@ cglobal h264_biweight_%1_%3, 7, 8, %2
     movifnidn r3d, r3m
     sar      r3, 1
     lea      r4, [r2*2]
-.nextrow
+.nextrow:
     BIWEIGHT_STEPA 0, 1, 0
     BIWEIGHT_STEPA 1, 2, r2
     BIWEIGHT_STEPB

@@ -288,7 +288,7 @@ cglobal h264_biweight_16_ssse3, 7, 8, 8
     BIWEIGHT_SSSE3_SETUP
     movifnidn r3d, r3m
-.nextrow
+.nextrow:
     movh     m0, [r0]
     movh     m2, [r0+8]
     movh     m3, [r1+8]

@@ -309,7 +309,7 @@ cglobal h264_biweight_8_ssse3, 7, 8, 8
     sar      r3, 1
     lea      r4, [r2*2]
-.nextrow
+.nextrow:
     movh     m0, [r0]
     movh     m1, [r1]
     movh     m2, [r0+r2]
@@ -40,7 +40,7 @@ SECTION .text
 ;                          int weight, int offset);
 ;-----------------------------------------------------------------------------
 %macro WEIGHT_PROLOGUE 0
-.prologue
+.prologue:
     PROLOGUE 0,6,8
     movifnidn  r0, r0mp
     movifnidn r1d, r1m

@@ -93,7 +93,7 @@ SECTION .text
 cglobal h264_weight_16_10_%1
     WEIGHT_PROLOGUE
     WEIGHT_SETUP %1
-.nextrow
+.nextrow:
     WEIGHT_OP %1,  0
     mova   [r0  ], m5
     WEIGHT_OP %1, 16

@@ -113,7 +113,7 @@ WEIGHT_FUNC_DBL sse4
 cglobal h264_weight_8_10_%1
     WEIGHT_PROLOGUE
     WEIGHT_SETUP %1
-.nextrow
+.nextrow:
     WEIGHT_OP %1, 0
     mova   [r0], m5
     add      r0, r1

@@ -133,7 +133,7 @@ cglobal h264_weight_4_10_%1
     sar     r2d, 1
     WEIGHT_SETUP %1
     lea      r3, [r1*2]
-.nextrow
+.nextrow:
     WEIGHT_OP %1, 0, r1
     movh    [r0], m5
     movhps [r0+r1], m5

@@ -159,7 +159,7 @@ DECLARE_REG_TMP 7
 %endif

 %macro BIWEIGHT_PROLOGUE 0
-.prologue
+.prologue:
     PROLOGUE 0,8,8
     movifnidn  r0, r0mp
     movifnidn  r1, r1mp

@@ -221,7 +221,7 @@ DECLARE_REG_TMP 7
 cglobal h264_biweight_16_10_%1
     BIWEIGHT_PROLOGUE
     BIWEIGHT_SETUP %1
-.nextrow
+.nextrow:
     BIWEIGHT  %1,  0
     mova   [r0  ], m0
     BIWEIGHT  %1, 16

@@ -241,7 +241,7 @@ BIWEIGHT_FUNC_DBL sse4
 cglobal h264_biweight_8_10_%1
     BIWEIGHT_PROLOGUE
     BIWEIGHT_SETUP %1
-.nextrow
+.nextrow:
     BIWEIGHT %1, 0
     mova  [r0], m0
     add     r0, r2

@@ -261,7 +261,7 @@ cglobal h264_biweight_4_10_%1
     BIWEIGHT_SETUP %1
     sar     r3d, 1
     lea      r4, [r2*2]
-.nextrow
+.nextrow:
     BIWEIGHT %1, 0, r2
     movh   [r0  ], m0
     movhps [r0+r2], m0
@@ -139,7 +139,7 @@ cglobal vp6_filter_diag4, 5, 7, 8
     mov           r3, rsp
     mov           r6, 11
-.nextrow
+.nextrow:
     DIAG4         r1, -1, 0, 1, 2, r3
     add           r3, 8
     add           r1, r2

@@ -151,7 +151,7 @@ cglobal vp6_filter_diag4, 5, 7, 8
     lea           r3, [rsp+8]
     mov           r6, 8
-.nextcol
+.nextcol:
     DIAG4         r3, -8, 0, 8, 16, r0
     add           r3, 8
     add           r0, r2
@@ -189,7 +189,7 @@ cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, h
     mova          m6, [sixtap_filter_hb+mxq*8-32]
     mova          m7, [sixtap_filter_hb+mxq*8-16]
-.nextrow
+.nextrow:
     movu          m0, [srcq-2]
     mova          m1, m0
     mova          m2, m0

@@ -229,7 +229,7 @@ cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, h
     mova          m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes
     mova          m6, [fourtap_filter_hb+mxq]
-.nextrow
+.nextrow:
     movu          m0, [srcq-1]
     mova          m1, m0
     pshufb        m0, m3

@@ -264,7 +264,7 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr
     movh          m2, [srcq+2*srcstrideq]
     add         srcq, srcstrideq
-.nextrow
+.nextrow:
     movh          m3, [srcq+2*srcstrideq] ; read new row
     mova          m4, m0
     mova          m0, m1

@@ -304,7 +304,7 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr
     movh          m3, [srcq]
     movh          m4, [srcq+srcstrideq]
-.nextrow
+.nextrow:
     movh          m5, [srcq+2*srcstrideq] ; read new row
     mova          m6, m0
     punpcklbw     m6, m5

@@ -350,7 +350,7 @@ cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, he
     movq         mm7, [pw_64]
     pxor         mm6, mm6
-.nextrow
+.nextrow:
     movq         mm1, [srcq-1]  ; (ABCDEFGH) load 8 horizontal pixels

     ; first set of 2 pixels

@@ -399,7 +399,7 @@ cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, he
     movq         mm7, [pw_64]
     pxor         mm3, mm3
-.nextrow
+.nextrow:
     movq         mm1, [srcq-2]  ; (ABCDEFGH) load 8 horizontal pixels

     ; first set of 2 pixels

@@ -459,7 +459,7 @@ cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, h
     mova          m8, [mxq+32]
     mova          m9, [mxq+48]
 %endif
-.nextrow
+.nextrow:
     movq          m0, [srcq-1]
     movq          m1, [srcq-0]
     movq          m2, [srcq+1]

@@ -510,7 +510,7 @@ cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, h
     mova         m12, [mxq+64]
     mova         m13, [mxq+80]
 %endif
-.nextrow
+.nextrow:
     movq          m0, [srcq-2]
     movq          m1, [srcq-1]
     movq          m2, [srcq-0]

@@ -577,7 +577,7 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr
     punpcklbw     m1, m7
     punpcklbw     m2, m7
-.nextrow
+.nextrow:
     ; first calculate negative taps (to prevent losing positive overflows)
     movh          m4, [srcq+2*srcstrideq] ; read new row
     punpcklbw     m4, m7

@@ -635,7 +635,7 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr
     punpcklbw     m3, m7
     punpcklbw     m4, m7
-.nextrow
+.nextrow:
     ; first calculate negative taps (to prevent losing positive overflows)
     mova          m5, m1
     pmullw        m5, [myq+16]

@@ -689,7 +689,7 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, p
     mova          m5, [bilinear_filter_vw+myq-1*16]
     neg          myq
     mova          m4, [bilinear_filter_vw+myq+7*16]
-.nextrow
+.nextrow:
     movh          m0, [srcq+srcstrideq*0]
     movh          m1, [srcq+srcstrideq*1]
     movh          m3, [srcq+srcstrideq*2]

@@ -733,7 +733,7 @@ cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride
     mova          m5, [bilinear_filter_vw+mxq-1*16]
     neg          mxq
     mova          m4, [bilinear_filter_vw+mxq+7*16]
-.nextrow
+.nextrow:
     movh          m0, [srcq+srcstrideq*0+0]
     movh          m1, [srcq+srcstrideq*0+1]
     movh          m2, [srcq+srcstrideq*1+0]

@@ -783,7 +783,7 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, p
 %endif
     pxor          m4, m4
     mova          m3, [bilinear_filter_vb+myq-16]
-.nextrow
+.nextrow:
     movh          m0, [srcq+srcstrideq*0]
     movh          m1, [srcq+srcstrideq*1]
     movh          m2, [srcq+srcstrideq*2]

@@ -820,7 +820,7 @@ cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride
     pxor          m4, m4
     mova          m2, [filter_h2_shuf]
     mova          m3, [bilinear_filter_vb+mxq-16]
-.nextrow
+.nextrow:
     movu          m0, [srcq+srcstrideq*0]
     movu          m1, [srcq+srcstrideq*1]
     pshufb        m0, m2

@@ -1488,7 +1488,7 @@ cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr
 %endif
 %if mmsize == 8 ; mmx / mmxext
-.next8px
+.next8px:
 %endif
 %ifidn %1, v
     ; read 4 half/full rows of pixels
@@ -361,7 +361,7 @@ cglobal mix_%1_to_%2_%3_flt, 3,in_channels+2,needed_mmregs+matrix_elements_mm, s
     mov        src0q, [src0q]
     add        src0q, lenq
     neg         lenq
-.loop
+.loop:
     ; for x86-32 with 7-8 channels we do not have enough gp registers for all src
     ; pointers, so we have to load some of them from the stack each time
 %define copy_src_from_stack ARCH_X86_32 && in_channels >= 7 && %%i >= 5
@@ -32,7 +32,7 @@ SECTION .text
 cglobal vector_fmul, 4,4,2, dst, src0, src1, len
     lea       lenq, [lend*4 - 2*mmsize]
 ALIGN 16
-.loop
+.loop:
     mova        m0, [src0q + lenq]
     mova        m1, [src0q + lenq + mmsize]
     mulps       m0, m0, [src1q + lenq]

@@ -74,7 +74,7 @@ cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len
 %endif
 %endif
     lea       lenq, [lend*4-2*mmsize]
-.loop
+.loop:
     mulps       m1, m0, [srcq+lenq       ]
     mulps       m2, m0, [srcq+lenq+mmsize]
     addps       m1, m1, [dstq+lenq       ]
@@ -110,12 +110,14 @@
     default rel
 %endif

+%macro CPUNOP 1
+    %if HAVE_CPUNOP
+        CPU %1
+    %endif
+%endmacro
+
 ; Always use long nops (reduces 0x90 spam in disassembly on x86_32)
-; Not supported by NASM (except via smartalign package + ALIGNMODE k8,
-; however that fails when used together with the -M option)
-%ifdef __YASM_VER__
-CPU amdnop
-%endif
+CPUNOP amdnop

 ; Macros to eliminate most code duplication between x86_32 and x86_64:
 ; Currently this works only for leaf functions which load all their arguments
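
With this wrapper the nop-padding directive degrades gracefully: when configure detected cpunop (yasm), CPUNOP expands to a real CPU directive; under nasm, HAVE_CPUNOP is 0 and the macro expands to nothing. Usage sketch, assuming config.asm provides HAVE_CPUNOP as set up by the configure hunk above (it is preincluded via the Makefile's -Pconfig.asm):

    CPUNOP amdnop    ; emits "CPU amdnop" if HAVE_CPUNOP is 1, nothing otherwise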
@@ -522,22 +524,8 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
 ; Applies any symbol mangling needed for C linkage, and sets up a define such that
 ; subsequent uses of the function name automatically refer to the mangled version.
 ; Appends cpuflags to the function name if cpuflags has been specified.
-%macro cglobal 1-2+ ; name, [PROLOGUE args]
-    %if %0 == 1
-        ; HACK: work around %+ broken with empty SUFFIX for nasm 2.09.10
-        %ifndef cpuname
-            cglobal_internal %1
-        %else
-            cglobal_internal %1 %+ SUFFIX
-        %endif
-    %else
-        ; HACK: work around %+ broken with empty SUFFIX for nasm 2.09.10
-        %ifndef cpuname
-            cglobal_internal %1, %2
-        %else
-            cglobal_internal %1 %+ SUFFIX, %2
-        %endif
-    %endif
+%macro cglobal 1-2+ "" ; name, [PROLOGUE args]
+    cglobal_internal %1 %+ SUFFIX, %2
 %endmacro
 %macro cglobal_internal 1-2+
     %ifndef cglobaled_%1

@@ -555,7 +543,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
 %1:
     RESET_MM_PERMUTATION        ; not really needed, but makes disassembly somewhat nicer
     %assign stack_offset 0
-    %if %0 > 1
+    %ifnidn %2, ""
         PROLOGUE %2
     %endif
 %endmacro
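
The simplification works because %2 now has a default value: with `%macro cglobal 1-2+ ""` an invocation without PROLOGUE arguments still passes an (empty) second parameter, so the single `cglobal_internal %1 %+ SUFFIX, %2` line covers every case, the old nasm-2.09.10 branches on cpuname are no longer needed, and cglobal_internal can test `%ifnidn %2, ""` instead of `%if %0 > 1` to decide whether to run PROLOGUE. Both call styles collapse onto one path; a sketch with invented function names:

    cglobal foo              ; %2 defaults to "", PROLOGUE is skipped
    cglobal bar, 3,3,4, dst  ; %2 is "3,3,4, dst", PROLOGUE runs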
@@ -622,9 +610,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
 ; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu.
 ; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
 %macro INIT_CPUFLAGS 0-2
-    %ifdef __YASM_VER__
-        CPU amdnop
-    %endif
+    CPUNOP amdnop
     %if %0 >= 1
         %xdefine cpuname %1
         %assign cpuflags cpuflags_%1

@@ -648,7 +634,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
         %endif
         %ifdef __YASM_VER__
             %if notcpuflag(mmx2)
-                CPU basicnop
+                CPUNOP basicnop
             %endif
         %endif
     %else
@@ -826,18 +812,13 @@ INIT_XMM
 ; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
 %macro call 1
-    ; HACK: work around %+ broken with empty SUFFIX for nasm 2.09.10
-    %ifndef cpuname
-        call_internal %1, %1
-    %else
-        call_internal %1, %1 %+ SUFFIX
-    %endif
+    call_internal %1 %+ SUFFIX, %1
 %endmacro

 %macro call_internal 2
-    %xdefine %%i %1
-    %ifndef cglobaled_%1
-        %ifdef cglobaled_%2
-            %xdefine %%i %2
+    %xdefine %%i %2
+    %ifndef cglobaled_%2
+        %ifdef cglobaled_%1
+            %xdefine %%i %1
         %endif
     %endif
     call %%i
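
call keeps the same fallback semantics but no longer needs the cpuname branch: call_internal always receives the suffixed name as %1 and the plain name as %2, defaults to the plain name, and switches to the suffixed one only when that symbol was declared with cglobal and the plain one was not. A sketch with an invented helper, inside an INIT_XMM sse block where SUFFIX is _sse:

    call some_helper
    ; expands to:  call_internal some_helper_sse, some_helper
    ; resolves to some_helper_sse iff "cglobal some_helper_sse" was seen
    ; and plain some_helper was not; otherwise calls some_helper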