Commit 8476ca3b authored by Ronald S. Bultje

vp8: convert idct x86 assembly to use named arguments.

parent 21ffc78f
...@@ -906,10 +906,10 @@ cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height ...@@ -906,10 +906,10 @@ cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%macro ADD_DC 4 %macro ADD_DC 4
%4 m2, [r0+%3] %4 m2, [dst1q+%3]
%4 m3, [r0+r2+%3] %4 m3, [dst1q+strideq+%3]
%4 m4, [r1+%3] %4 m4, [dst2q+%3]
%4 m5, [r1+r2+%3] %4 m5, [dst2q+strideq+%3]
paddusb m2, %1 paddusb m2, %1
paddusb m3, %1 paddusb m3, %1
paddusb m4, %1 paddusb m4, %1
...@@ -918,22 +918,22 @@ cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height ...@@ -918,22 +918,22 @@ cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
psubusb m3, %2 psubusb m3, %2
psubusb m4, %2 psubusb m4, %2
psubusb m5, %2 psubusb m5, %2
%4 [r0+%3], m2 %4 [dst1q+%3], m2
%4 [r0+r2+%3], m3 %4 [dst1q+strideq+%3], m3
%4 [r1+%3], m4 %4 [dst2q+%3], m4
%4 [r1+r2+%3], m5 %4 [dst2q+strideq+%3], m5
%endmacro %endmacro
INIT_MMX mmx INIT_MMX mmx
cglobal vp8_idct_dc_add, 3, 3 cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride
; load data ; load data
movd m0, [r1] movd m0, [blockq]
; calculate DC ; calculate DC
paddw m0, [pw_4] paddw m0, [pw_4]
pxor m1, m1 pxor m1, m1
psraw m0, 3 psraw m0, 3
movd [r1], m1 movd [blockq], m1
psubw m1, m0 psubw m1, m0
packuswb m0, m0 packuswb m0, m0
packuswb m1, m1 packuswb m1, m1
...@@ -943,24 +943,26 @@ cglobal vp8_idct_dc_add, 3, 3 ...@@ -943,24 +943,26 @@ cglobal vp8_idct_dc_add, 3, 3
punpcklwd m1, m1 punpcklwd m1, m1
; add DC ; add DC
lea r1, [r0+r2*2] DEFINE_ARGS dst1, dst2, stride
lea dst2q, [dst1q+strideq*2]
ADD_DC m0, m1, 0, movh ADD_DC m0, m1, 0, movh
RET RET
INIT_XMM sse4 INIT_XMM sse4
cglobal vp8_idct_dc_add, 3, 3, 6 cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
; load data ; load data
movd m0, [r1] movd m0, [blockq]
pxor m1, m1 pxor m1, m1
; calculate DC ; calculate DC
paddw m0, [pw_4] paddw m0, [pw_4]
movd [r1], m1 movd [blockq], m1
lea r1, [r0+r2*2] DEFINE_ARGS dst1, dst2, stride
movd m2, [r0] lea dst2q, [dst1q+strideq*2]
movd m3, [r0+r2] movd m2, [dst1q]
movd m4, [r1] movd m3, [dst1q+strideq]
movd m5, [r1+r2] movd m4, [dst2q]
movd m5, [dst2q+strideq]
psraw m0, 3 psraw m0, 3
pshuflw m0, m0, 0 pshuflw m0, m0, 0
punpcklqdq m0, m0 punpcklqdq m0, m0
...@@ -971,10 +973,10 @@ cglobal vp8_idct_dc_add, 3, 3, 6 ...@@ -971,10 +973,10 @@ cglobal vp8_idct_dc_add, 3, 3, 6
paddw m2, m0 paddw m2, m0
paddw m4, m0 paddw m4, m0
packuswb m2, m4 packuswb m2, m4
movd [r0], m2 movd [dst1q], m2
pextrd [r0+r2], m2, 1 pextrd [dst1q+strideq], m2, 1
pextrd [r1], m2, 2 pextrd [dst2q], m2, 2
pextrd [r1+r2], m2, 3 pextrd [dst2q+strideq], m2, 3
RET RET
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
...@@ -983,21 +985,21 @@ cglobal vp8_idct_dc_add, 3, 3, 6 ...@@ -983,21 +985,21 @@ cglobal vp8_idct_dc_add, 3, 3, 6
%if ARCH_X86_32 %if ARCH_X86_32
INIT_MMX mmx INIT_MMX mmx
cglobal vp8_idct_dc_add4y, 3, 3 cglobal vp8_idct_dc_add4y, 3, 3, 0, dst, block, stride
; load data ; load data
movd m0, [r1+32*0] ; A movd m0, [blockq+32*0] ; A
movd m1, [r1+32*2] ; C movd m1, [blockq+32*2] ; C
punpcklwd m0, [r1+32*1] ; A B punpcklwd m0, [blockq+32*1] ; A B
punpcklwd m1, [r1+32*3] ; C D punpcklwd m1, [blockq+32*3] ; C D
punpckldq m0, m1 ; A B C D punpckldq m0, m1 ; A B C D
pxor m6, m6 pxor m6, m6
; calculate DC ; calculate DC
paddw m0, [pw_4] paddw m0, [pw_4]
movd [r1+32*0], m6 movd [blockq+32*0], m6
movd [r1+32*1], m6 movd [blockq+32*1], m6
movd [r1+32*2], m6 movd [blockq+32*2], m6
movd [r1+32*3], m6 movd [blockq+32*3], m6
psraw m0, 3 psraw m0, 3
psubw m6, m0 psubw m6, m0
packuswb m0, m0 packuswb m0, m0
...@@ -1012,28 +1014,29 @@ cglobal vp8_idct_dc_add4y, 3, 3 ...@@ -1012,28 +1014,29 @@ cglobal vp8_idct_dc_add4y, 3, 3
punpckhbw m7, m7 ; CCCCDDDD punpckhbw m7, m7 ; CCCCDDDD
; add DC ; add DC
lea r1, [r0+r2*2] DEFINE_ARGS dst1, dst2, stride
lea dst2q, [dst1q+strideq*2]
ADD_DC m0, m6, 0, mova ADD_DC m0, m6, 0, mova
ADD_DC m1, m7, 8, mova ADD_DC m1, m7, 8, mova
RET RET
%endif %endif
INIT_XMM sse2 INIT_XMM sse2
cglobal vp8_idct_dc_add4y, 3, 3, 6 cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride
; load data ; load data
movd m0, [r1+32*0] ; A movd m0, [blockq+32*0] ; A
movd m1, [r1+32*2] ; C movd m1, [blockq+32*2] ; C
punpcklwd m0, [r1+32*1] ; A B punpcklwd m0, [blockq+32*1] ; A B
punpcklwd m1, [r1+32*3] ; C D punpcklwd m1, [blockq+32*3] ; C D
punpckldq m0, m1 ; A B C D punpckldq m0, m1 ; A B C D
pxor m1, m1 pxor m1, m1
; calculate DC ; calculate DC
paddw m0, [pw_4] paddw m0, [pw_4]
movd [r1+32*0], m1 movd [blockq+32*0], m1
movd [r1+32*1], m1 movd [blockq+32*1], m1
movd [r1+32*2], m1 movd [blockq+32*2], m1
movd [r1+32*3], m1 movd [blockq+32*3], m1
psraw m0, 3 psraw m0, 3
psubw m1, m0 psubw m1, m0
packuswb m0, m0 packuswb m0, m0
...@@ -1044,7 +1047,8 @@ cglobal vp8_idct_dc_add4y, 3, 3, 6 ...@@ -1044,7 +1047,8 @@ cglobal vp8_idct_dc_add4y, 3, 3, 6
punpcklbw m1, m1 punpcklbw m1, m1
; add DC ; add DC
lea r1, [r0+r2*2] DEFINE_ARGS dst1, dst2, stride
lea dst2q, [dst1q+strideq*2]
ADD_DC m0, m1, 0, mova ADD_DC m0, m1, 0, mova
RET RET
...@@ -1053,21 +1057,21 @@ cglobal vp8_idct_dc_add4y, 3, 3, 6 ...@@ -1053,21 +1057,21 @@ cglobal vp8_idct_dc_add4y, 3, 3, 6
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
INIT_MMX mmx INIT_MMX mmx
cglobal vp8_idct_dc_add4uv, 3, 3 cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride
; load data ; load data
movd m0, [r1+32*0] ; A movd m0, [blockq+32*0] ; A
movd m1, [r1+32*2] ; C movd m1, [blockq+32*2] ; C
punpcklwd m0, [r1+32*1] ; A B punpcklwd m0, [blockq+32*1] ; A B
punpcklwd m1, [r1+32*3] ; C D punpcklwd m1, [blockq+32*3] ; C D
punpckldq m0, m1 ; A B C D punpckldq m0, m1 ; A B C D
pxor m6, m6 pxor m6, m6
; calculate DC ; calculate DC
paddw m0, [pw_4] paddw m0, [pw_4]
movd [r1+32*0], m6 movd [blockq+32*0], m6
movd [r1+32*1], m6 movd [blockq+32*1], m6
movd [r1+32*2], m6 movd [blockq+32*2], m6
movd [r1+32*3], m6 movd [blockq+32*3], m6
psraw m0, 3 psraw m0, 3
psubw m6, m0 psubw m6, m0
packuswb m0, m0 packuswb m0, m0
...@@ -1082,10 +1086,11 @@ cglobal vp8_idct_dc_add4uv, 3, 3 ...@@ -1082,10 +1086,11 @@ cglobal vp8_idct_dc_add4uv, 3, 3
punpckhbw m7, m7 ; CCCCDDDD punpckhbw m7, m7 ; CCCCDDDD
; add DC ; add DC
lea r1, [r0+r2*2] DEFINE_ARGS dst1, dst2, stride
lea dst2q, [dst1q+strideq*2]
ADD_DC m0, m6, 0, mova ADD_DC m0, m6, 0, mova
lea r0, [r0+r2*4] lea dst1q, [dst1q+strideq*4]
lea r1, [r1+r2*4] lea dst2q, [dst2q+strideq*4]
ADD_DC m1, m7, 0, mova ADD_DC m1, m7, 0, mova
RET RET
...@@ -1125,24 +1130,24 @@ cglobal vp8_idct_dc_add4uv, 3, 3 ...@@ -1125,24 +1130,24 @@ cglobal vp8_idct_dc_add4uv, 3, 3
%endmacro %endmacro
%macro VP8_IDCT_ADD 0 %macro VP8_IDCT_ADD 0
cglobal vp8_idct_add, 3, 3 cglobal vp8_idct_add, 3, 3, 0, dst, block, stride
; load block data ; load block data
movq m0, [r1+ 0] movq m0, [blockq+ 0]
movq m1, [r1+ 8] movq m1, [blockq+ 8]
movq m2, [r1+16] movq m2, [blockq+16]
movq m3, [r1+24] movq m3, [blockq+24]
movq m6, [pw_20091] movq m6, [pw_20091]
movq m7, [pw_17734] movq m7, [pw_17734]
%if cpuflag(sse) %if cpuflag(sse)
xorps xmm0, xmm0 xorps xmm0, xmm0
movaps [r1+ 0], xmm0 movaps [blockq+ 0], xmm0
movaps [r1+16], xmm0 movaps [blockq+16], xmm0
%else %else
pxor m4, m4 pxor m4, m4
movq [r1+ 0], m4 movq [blockq+ 0], m4
movq [r1+ 8], m4 movq [blockq+ 8], m4
movq [r1+16], m4 movq [blockq+16], m4
movq [r1+24], m4 movq [blockq+24], m4
%endif %endif
; actual IDCT ; actual IDCT
...@@ -1154,9 +1159,10 @@ cglobal vp8_idct_add, 3, 3 ...@@ -1154,9 +1159,10 @@ cglobal vp8_idct_add, 3, 3
; store ; store
pxor m4, m4 pxor m4, m4
lea r1, [r0+2*r2] DEFINE_ARGS dst1, dst2, stride
STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2 lea dst2q, [dst1q+2*strideq]
STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2 STORE_DIFFx2 m0, m1, m6, m7, m4, 3, dst1q, strideq
STORE_DIFFx2 m2, m3, m6, m7, m4, 3, dst2q, strideq
RET RET
%endmacro %endmacro
...@@ -1173,24 +1179,24 @@ VP8_IDCT_ADD ...@@ -1173,24 +1179,24 @@ VP8_IDCT_ADD
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%macro SCATTER_WHT 3 %macro SCATTER_WHT 3
movd r1d, m%1 movd dc1d, m%1
movd r2d, m%2 movd dc2d, m%2
mov [r0+2*16*(0+%3)], r1w mov [blockq+2*16*(0+%3)], dc1w
mov [r0+2*16*(1+%3)], r2w mov [blockq+2*16*(1+%3)], dc2w
shr r1d, 16 shr dc1d, 16
shr r2d, 16 shr dc2d, 16
psrlq m%1, 32 psrlq m%1, 32
psrlq m%2, 32 psrlq m%2, 32
mov [r0+2*16*(4+%3)], r1w mov [blockq+2*16*(4+%3)], dc1w
mov [r0+2*16*(5+%3)], r2w mov [blockq+2*16*(5+%3)], dc2w
movd r1d, m%1 movd dc1d, m%1
movd r2d, m%2 movd dc2d, m%2
mov [r0+2*16*(8+%3)], r1w mov [blockq+2*16*(8+%3)], dc1w
mov [r0+2*16*(9+%3)], r2w mov [blockq+2*16*(9+%3)], dc2w
shr r1d, 16 shr dc1d, 16
shr r2d, 16 shr dc2d, 16
mov [r0+2*16*(12+%3)], r1w mov [blockq+2*16*(12+%3)], dc1w
mov [r0+2*16*(13+%3)], r2w mov [blockq+2*16*(13+%3)], dc2w
%endmacro %endmacro
%macro HADAMARD4_1D 4 %macro HADAMARD4_1D 4
...@@ -1200,21 +1206,21 @@ VP8_IDCT_ADD ...@@ -1200,21 +1206,21 @@ VP8_IDCT_ADD
%endmacro %endmacro
%macro VP8_DC_WHT 0 %macro VP8_DC_WHT 0
cglobal vp8_luma_dc_wht, 2, 3 cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2
movq m0, [r1] movq m0, [dc1q]
movq m1, [r1+8] movq m1, [dc1q+8]
movq m2, [r1+16] movq m2, [dc1q+16]
movq m3, [r1+24] movq m3, [dc1q+24]
%if cpuflag(sse) %if cpuflag(sse)
xorps xmm0, xmm0 xorps xmm0, xmm0
movaps [r1+ 0], xmm0 movaps [dc1q+ 0], xmm0
movaps [r1+16], xmm0 movaps [dc1q+16], xmm0
%else %else
pxor m4, m4 pxor m4, m4
movq [r1+ 0], m4 movq [dc1q+ 0], m4
movq [r1+ 8], m4 movq [dc1q+ 8], m4
movq [r1+16], m4 movq [dc1q+16], m4
movq [r1+24], m4 movq [dc1q+24], m4
%endif %endif
HADAMARD4_1D 0, 1, 2, 3 HADAMARD4_1D 0, 1, 2, 3
TRANSPOSE4x4W 0, 1, 2, 3, 4 TRANSPOSE4x4W 0, 1, 2, 3, 4
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment