Commit a1878a88 authored by Ronald S. Bultje, committed by Derek Buitenhuis

vp3: don't use calls to inline asm in yasm code.

Mixing yasm and inline asm is a bad idea, since if either yasm or inline
asm is not supported by your toolchain, all of the asm stops working.
Thus, it is better to use one or the other alone.
Signed-off-by: Derek Buitenhuis <derek.buitenhuis@gmail.com>
parent 79195ce5
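
For context, the coupling the commit message describes can be sketched in C. This is a minimal illustration, not the project's actual init code: the HAVE_YASM and HAVE_INLINE_ASM macros mirror the configure flags FFmpeg used at the time, while the function names and wiring below are made up for the example.

    #include <stdio.h>

    /* Illustrative configure results: yasm available, inline asm not. */
    #define HAVE_YASM       1
    #define HAVE_INLINE_ASM 0

    static void idct_put_c(void)   { puts("C fallback"); }

    #if HAVE_YASM && HAVE_INLINE_ASM
    /* Before this commit, the yasm IDCT wrappers tail-called inline-asm
     * helpers (put_signed_pixels_clamped_mmx, add_pixels_clamped_mmx),
     * so enabling them required BOTH backends to be present. */
    static void idct_put_mmx(void) { puts("MMX version"); }
    #endif

    int main(void)
    {
        void (*idct_put)(void) = idct_put_c;
    #if HAVE_YASM && HAVE_INLINE_ASM
        idct_put = idct_put_mmx; /* unreachable if either backend is missing */
    #endif
        idct_put();              /* prints "C fallback" with the flags above */
        return 0;
    }

Once the yasm code is self-contained, the guard can collapse to a plain HAVE_YASM check, and the MMX/SSE2 IDCT functions survive on toolchains without inline-asm support.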
@@ -38,13 +38,11 @@
 cextern pb_1
 cextern pb_3
 cextern pb_7
 cextern pb_1F
+cextern pb_80
 cextern pb_81
 cextern pw_8

-cextern put_signed_pixels_clamped_mmx
-cextern add_pixels_clamped_mmx
-
 SECTION .text

 ; this is off by one or two for some cases when filter_limit is greater than 63
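
The new cextern pb_80 (the constant 0x80 repeated in every byte) is what lets the put path drop its call into inline-asm territory: packsswb saturates the 16-bit IDCT output to signed bytes, and paddb with pb_80 then adds the +128 bias (with byte wraparound) that maps -128..127 onto the unsigned pixel range 0..255, which is what put_signed_pixels_clamped_mmx used to do. A scalar C sketch of that store sequence, with an illustrative function name:

    #include <stdint.h>
    #include <stdio.h>

    /* Scalar model of the SIMD store sequence in the new vp3_idct_put_%1:
     * packsswb = saturate to a signed byte, paddb pb_80 = add 128 mod 256. */
    static uint8_t put_signed_clamped(int16_t v)
    {
        int8_t s = v < -128 ? -128 : v > 127 ? 127 : (int8_t)v; /* packsswb */
        return (uint8_t)(s + 0x80);                    /* paddb with pb_80 */
    }

    int main(void)
    {
        printf("%u %u %u\n",
               put_signed_clamped(300),  /* clamps to 127, biases to 255 */
               put_signed_clamped(-200), /* clamps to -128, biases to 0  */
               put_signed_clamped(5));   /* stays 5, biases to 133       */
        return 0;
    }

With the bias folded into the yasm code, the two cextern declarations for the inline-asm helpers can go away, as the hunk above shows.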
@@ -523,56 +521,96 @@ cglobal vp3_h_loop_filter_mmx2, 3, 4
     PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
 %endmacro

-%macro vp3_idct_funcs 3
-cglobal vp3_idct_put_%1, 3, %3, %2
+%macro vp3_idct_funcs 1
+cglobal vp3_idct_put_%1, 3, 4, 9
     VP3_IDCT_%1   r2
-%if ARCH_X86_64
-    mov           r3, r2
-    mov           r2, r1
-    mov           r1, r0
-    mov           r0, r3
-%else
-    mov           r0m, r2
-    mov           r1m, r0
-    mov           r2m, r1
-%endif
-%if WIN64
-    call          put_signed_pixels_clamped_mmx
-    RET
-%else
-    jmp           put_signed_pixels_clamped_mmx
-%endif
+    movsxdifnidn  r1, r1d
+    mova          m4, [pb_80]
+    lea           r3, [r1*3]
+%assign %%i 0
+%rep 16/mmsize
+    mova          m0, [r2+mmsize*0+%%i]
+    mova          m1, [r2+mmsize*2+%%i]
+    mova          m2, [r2+mmsize*4+%%i]
+    mova          m3, [r2+mmsize*6+%%i]
+    packsswb      m0, [r2+mmsize*1+%%i]
+    packsswb      m1, [r2+mmsize*3+%%i]
+    packsswb      m2, [r2+mmsize*5+%%i]
+    packsswb      m3, [r2+mmsize*7+%%i]
+    paddb         m0, m4
+    paddb         m1, m4
+    paddb         m2, m4
+    paddb         m3, m4
+    movq   [r0     ], m0
+%if mmsize == 8
+    movq   [r0+r1  ], m1
+    movq   [r0+r1*2], m2
+    movq   [r0+r3  ], m3
+%else
+    movhps [r0+r1  ], m0
+    movq   [r0+r1*2], m1
+    movhps [r0+r3  ], m1
+%endif
+%if %%i == 0
+    lea           r0, [r0+r1*4]
+%endif
+%if mmsize == 16
+    movq   [r0     ], m2
+    movhps [r0+r1  ], m2
+    movq   [r0+r1*2], m3
+    movhps [r0+r3  ], m3
+%endif
+%assign %%i %%i+64
+%endrep
+    RET

-cglobal vp3_idct_add_%1, 3, %3, %2
+cglobal vp3_idct_add_%1, 3, 4, 9
     VP3_IDCT_%1   r2
-%if ARCH_X86_64
-    mov           r3, r2
-    mov           r2, r1
-    mov           r1, r0
-    mov           r0, r3
-%else
-    mov           r0m, r2
-    mov           r1m, r0
-    mov           r2m, r1
-%endif
-%if WIN64
-    call          add_pixels_clamped_mmx
-    RET
-%else
-    jmp           add_pixels_clamped_mmx
-%endif
+    mov           r3, 4
+    pxor          m4, m4
+    movsxdifnidn  r1, r1d
+.loop:
+    movq          m0, [r0]
+    movq          m1, [r0+r1]
+%if mmsize == 8
+    mova          m2, m0
+    mova          m3, m1
+%endif
+    punpcklbw     m0, m4
+    punpcklbw     m1, m4
+%if mmsize == 8
+    punpckhbw     m2, m4
+    punpckhbw     m3, m4
+%endif
+    paddsw        m0, [r2+ 0]
+    paddsw        m1, [r2+16]
+%if mmsize == 8
+    paddsw        m2, [r2+ 8]
+    paddsw        m3, [r2+24]
+    packuswb      m0, m2
+    packuswb      m1, m3
+%else ; mmsize == 16
+    packuswb      m0, m1
+%endif
+    movq   [r0   ], m0
+%if mmsize == 8
+    movq   [r0+r1], m1
+%else ; mmsize == 16
+    movhps [r0+r1], m0
+%endif
+    lea           r0, [r0+r1*2]
+    add           r2, 32
+    dec           r3
+    jg .loop
+    RET
 %endmacro

-%if ARCH_X86_64
-%define REGS 4
-%else
-%define REGS 3
-%endif
 INIT_MMX
-vp3_idct_funcs mmx, 0, REGS
+vp3_idct_funcs mmx
 INIT_XMM
-vp3_idct_funcs sse2, 9, REGS
-%undef REGS
+vp3_idct_funcs sse2

 %macro DC_ADD 0
     movq          m2, [r0    ]
...
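
The rewritten vp3_idct_add_%1 inlines add_pixels_clamped the same way: r3 counts four iterations of two rows each, destination bytes are widened against a zeroed register (punpcklbw), the IDCT residual is added with signed saturation (paddsw), and packuswb clamps the sums back to 0..255. (In x86inc's cglobal line, "3, 4, 9" declares 3 arguments, 4 general-purpose registers, and 9 XMM registers.) A scalar C sketch of the per-pixel operation, with an illustrative function name:

    #include <stdint.h>
    #include <stdio.h>

    /* Scalar model of the new add loop: widen, add residual, clamp.
     * paddsw saturates at 16 bits and packuswb clamps to 0..255; for
     * pixel + residual sums, the final clamp yields the same result. */
    static uint8_t add_clamped(uint8_t pixel, int16_t residual)
    {
        int sum = pixel + residual;  /* punpcklbw + paddsw */
        if (sum < 0)   sum = 0;      /* packuswb lower clamp */
        if (sum > 255) sum = 255;    /* packuswb upper clamp */
        return (uint8_t)sum;
    }

    int main(void)
    {
        printf("%u %u\n",
               add_clamped(250, 100),  /* 350 -> 255 */
               add_clamped(10, -50));  /* -40 -> 0   */
        return 0;
    }

Both variants come from the same macro body: INIT_MMX and INIT_XMM set mmsize to 8 or 16, which picks the register width and selects the %if mmsize branches above.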