Commit e5786383 authored by Ronald S. Bultje

vp9: use registers for constant loading where possible.

parent 408bb855
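The pattern applied throughout the diff below: rounding and masking constants such as pd_8192 and pd_3fff, which the macros used to reference as memory operands at every use, become optional macro parameters. On x86-64 the callers load each constant into a spare xmm register once (via the new PRELOAD macro) and pass that register; on x86-32 the memory reference is still passed. A rough scalar C analogy of hoisting the repeated constant load (illustrative only, names invented):

```c
#include <stdint.h>

/* Before: the constant is re-read from memory at every use.
 * After: it is "preloaded" once and reused, which is what the assembly
 * change does with xmm registers on x86-64. */
static void round_shift_old(int32_t *v, int n, const int32_t *pd_8192)
{
    for (int i = 0; i < n; i++)
        v[i] = (v[i] + *pd_8192) >> 14;   /* memory operand every iteration */
}

static void round_shift_new(int32_t *v, int n, const int32_t *pd_8192)
{
    const int32_t rnd = *pd_8192;         /* load once, keep in a register */
    for (int i = 0; i < n; i++)
        v[i] = (v[i] + rnd) >> 14;
}
```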
@@ -345,9 +345,9 @@ IADST4_FN iadst, IADST4, iadst, IADST4
 ;
 ; dst1 = src1 * coef1 + src2 * coef2 + rnd >> 14
 ; dst2 = src1 * coef2 - src2 * coef1 + rnd >> 14
-%macro SUMSUB_MUL 6 ; src/dst 1-2, tmp1-2, coef1-2
-    pand m%3, m%1, [pd_3fff]
-    pand m%4, m%2, [pd_3fff]
+%macro SUMSUB_MUL 6-8 [pd_8192], [pd_3fff] ; src/dst 1-2, tmp1-2, coef1-2, rnd, mask
+    pand m%3, m%1, %8
+    pand m%4, m%2, %8
     psrad m%1, 14
     psrad m%2, 14
     packssdw m%4, m%2
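The dst1/dst2 comments above describe a 14-bit fixed-point butterfly, and the macro body additionally splits each 32-bit input into its low 14 bits (the pd_3fff mask) and the remainder (psrad 14) so each pmaddwd product stays within range for 12-bit content. A scalar model of that arithmetic, as I read the macro (not FFmpeg code):

```c
#include <stdint.h>

/* Scalar model of SUMSUB_MUL:
 *   dst1 = (src1*coef1 + src2*coef2 + 8192) >> 14
 *   dst2 = (src1*coef2 - src2*coef1 + 8192) >> 14
 * computed via a 14-bit split: x == (x >> 14) * 16384 + (x & 0x3fff),
 * so the high parts contribute directly and only the low parts need the
 * rounded shift. */
static void sumsub_mul(int32_t *a, int32_t *b, int32_t coef1, int32_t coef2)
{
    int32_t hi1 = *a >> 14, lo1 = *a & 0x3fff;
    int32_t hi2 = *b >> 14, lo2 = *b & 0x3fff;

    int32_t d1 = hi1 * coef1 + hi2 * coef2
               + ((lo1 * coef1 + lo2 * coef2 + 8192) >> 14);
    int32_t d2 = hi1 * coef2 - hi2 * coef1
               + ((lo1 * coef2 - lo2 * coef1 + 8192) >> 14);

    *a = d1;
    *b = d2;
}
```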
@@ -358,20 +358,20 @@ IADST4_FN iadst, IADST4, iadst, IADST4
     pmaddwd m%1, m%2, [pw_%6_%5]
     pmaddwd m%4, [pw_m%5_%6]
     pmaddwd m%2, [pw_m%5_%6]
-    paddd m%3, [pd_8192]
-    paddd m%4, [pd_8192]
+    paddd m%3, %7
+    paddd m%4, %7
     psrad m%3, 14
     psrad m%4, 14
     paddd m%1, m%3
     paddd m%2, m%4
 %endmacro
-%macro IDCT4_12BPP_1D 0-6 0, 1, 2, 3, 4, 5
-    SUMSUB_MUL %1, %3, %5, %6, 11585, 11585
-    SUMSUB_MUL %2, %4, %5, %6, 15137, 6270
-    SUMSUB_BA d, %2, %1, %5
-    SUMSUB_BA d, %4, %3, %5
-    SWAP %2, %4, %1
+%macro IDCT4_12BPP_1D 0-8 [pd_8192], [pd_3fff], 0, 1, 2, 3, 4, 5 ; rnd, mask, in/out0-3, tmp0-1
+    SUMSUB_MUL %3, %5, %7, %8, 11585, 11585, %1, %2
+    SUMSUB_MUL %4, %6, %7, %8, 15137, 6270, %1, %2
+    SUMSUB_BA d, %4, %3, %7
+    SUMSUB_BA d, %6, %5, %7
+    SWAP %4, %6, %3
 %endmacro
 %macro STORE_4x4 6 ; tmp1-2, reg1-2, min, max
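For orientation, IDCT4_12BPP_1D strings two of those butterflies together (11585/11585 on the even inputs, 15137/6270 on the odd ones) and finishes with two add/sub stages. A plain scalar sketch of the 4-point column transform using the same constants (a reference-style sketch, not FFmpeg's code):

```c
#include <stdint.h>

#define ROUND14(x) (((x) + 8192) >> 14)

/* Scalar sketch of a 4-point VP9 inverse DCT column with the coefficients
 * used by IDCT4_12BPP_1D (11585, 15137, 6270). */
static void idct4_col(const int32_t in[4], int32_t out[4])
{
    int32_t t0 = ROUND14(in[0] * 11585 + in[2] * 11585); /* even butterfly */
    int32_t t1 = ROUND14(in[0] * 11585 - in[2] * 11585);
    int32_t t2 = ROUND14(in[1] * 6270  - in[3] * 15137); /* odd butterfly  */
    int32_t t3 = ROUND14(in[1] * 15137 + in[3] * 6270);

    out[0] = t0 + t3;                                    /* final add/sub  */
    out[1] = t1 + t2;
    out[2] = t1 - t2;
    out[3] = t0 - t3;
}
```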
@@ -433,10 +433,12 @@ cglobal vp9_idct_idct_4x4_add_12, 4, 4, 6, dst, stride, block, eob
     mova m1, [blockq+1*16]
     mova m2, [blockq+2*16]
     mova m3, [blockq+3*16]
-    IDCT4_12BPP_1D
+    mova m6, [pd_8192]
+    mova m7, [pd_3fff]
+    IDCT4_12BPP_1D m6, m7
     TRANSPOSE4x4D 0, 1, 2, 3, 4
-    IDCT4_12BPP_1D
+    IDCT4_12BPP_1D m6, m7
     pxor m4, m4
     ZERO_BLOCK blockq, 16, 4, m4
@@ -445,7 +447,8 @@ cglobal vp9_idct_idct_4x4_add_12, 4, 4, 6, dst, stride, block, eob
     DEFINE_ARGS dst, stride, stride3
     lea stride3q, [strideq*3]
     mova m5, [pw_4095]
-    ROUND_AND_STORE_4x4 0, 1, 2, 3, m4, m5, [pd_8], 4
+    mova m6, [pd_8]
+    ROUND_AND_STORE_4x4 0, 1, 2, 3, m4, m5, m6, 4
     RET
 %macro SCRATCH 3-4
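The writeout above feeds ROUND_AND_STORE_4x4 a rounding constant of 8, a shift of 4 and a [0, 4095] clamp; my assumed per-sample behaviour for the 12-bit path (the parameters come from the call, the helper itself is hypothetical):

```c
#include <stdint.h>

/* Assumed behaviour of one ROUND_AND_STORE_4x4 sample in the 12-bit 4x4
 * path: round the residual coefficient, add it to the destination pixel
 * and clamp to the 12-bit range. */
static uint16_t add_residual_12(uint16_t pixel, int32_t coef)
{
    int32_t v = pixel + ((coef + 8) >> 4);  /* rnd = [pd_8], shift = 4 */
    if (v < 0)    v = 0;                    /* min = m4 (zero)         */
    if (v > 4095) v = 4095;                 /* max = m5 = [pw_4095]    */
    return (uint16_t)v;
}
```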
@@ -473,21 +476,32 @@ cglobal vp9_idct_idct_4x4_add_12, 4, 4, 6, dst, stride, block, eob
 %endif
 %endmacro
+%macro PRELOAD 2-3
+%if ARCH_X86_64
+    mova m%1, [%2]
+%if %0 == 3
+%define reg_%3 m%1
+%endif
+%elif %0 == 3
+%define reg_%3 [%2]
+%endif
+%endmacro
 ; out0 = 5283 * in0 + 13377 * in1 + 15212 * in2 + 9929 * in3 + rnd >> 14
 ; out1 = 9929 * in0 + 13377 * in1 - 5283 * in2 - 15212 * in3 + rnd >> 14
 ; out2 = 13377 * in0 - 13377 * in2 + 13377 * in3 + rnd >> 14
 ; out3 = 15212 * in0 - 13377 * in1 + 9929 * in2 - 5283 * in3 + rnd >> 14
-%macro IADST4_12BPP_1D 0
-    pand m4, m0, [pd_3fff]
-    pand m5, m1, [pd_3fff]
+%macro IADST4_12BPP_1D 0-2 [pd_8192], [pd_3fff] ; rnd, mask
+    pand m4, m0, %2
+    pand m5, m1, %2
     psrad m0, 14
     psrad m1, 14
     packssdw m5, m1
     packssdw m4, m0
     punpckhwd m1, m4, m5
     punpcklwd m4, m5
-    pand m5, m2, [pd_3fff]
-    pand m6, m3, [pd_3fff]
+    pand m5, m2, %2
+    pand m6, m3, %2
     psrad m2, 14
     psrad m3, 14
     packssdw m6, m3
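The new PRELOAD macro above is what lets callers later in the diff write reg_rnd/reg_mask regardless of target: on x86-64 the constant is loaded into a spare xmm register and reg_NAME resolves to that register, while on x86-32, which only has eight xmm registers, reg_NAME stays a plain memory reference. A hypothetical C/SSE2 analogy of that dispatch (all names invented for illustration):

```c
#include <emmintrin.h>   /* SSE2 intrinsics */
#include <stdint.h>

static const int32_t pd_8192_c[4] = { 8192, 8192, 8192, 8192 };

/* Hypothetical analogy of "PRELOAD 10, pd_8192, rnd": on x86-64 keep the
 * constant in a register-resident variable, on x86-32 re-read it from
 * memory at every use. */
#if defined(__x86_64__) || defined(_M_X64)
static __m128i reg_rnd;
#  define PRELOAD_RND() (reg_rnd = _mm_loadu_si128((const __m128i *)pd_8192_c))
#  define REG_RND       reg_rnd
#else
#  define PRELOAD_RND() ((void)0)
#  define REG_RND       _mm_loadu_si128((const __m128i *)pd_8192_c)
#endif

/* usage sketch: PRELOAD_RND(); acc = _mm_add_epi32(acc, REG_RND); */
```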
@@ -501,29 +515,35 @@ cglobal vp9_idct_idct_4x4_add_12, 4, 4, 6, dst, stride, block, eob
     ; m4/5 have the low bits of 0,1,2,3
     ; m0/2/6/7 are free
-    pmaddwd m7, reg_b, [pw_15212_9929]
-    pmaddwd m6, m4, [pw_5283_13377]
-    pmaddwd m2, m3, [pw_15212_9929]
-    pmaddwd m0, reg_a, [pw_5283_13377]
+    mova m2, [pw_15212_9929]
+    mova m0, [pw_5283_13377]
+    pmaddwd m7, m2, reg_b
+    pmaddwd m6, m4, m0
+    pmaddwd m2, m3
+    pmaddwd m0, reg_a
     paddd m6, m7
     paddd m0, m2
-    pmaddwd m7, reg_b, [pw_m13377_13377]
-    pmaddwd m2, m4, [pw_13377_0]
-    pmaddwd m1, m3, [pw_m13377_13377]
-    pmaddwd m5, reg_a, [pw_13377_0]
+    mova m1, [pw_m13377_13377]
+    mova m5, [pw_13377_0]
+    pmaddwd m7, m1, reg_b
+    pmaddwd m2, m4, m5
+    pmaddwd m1, m3
+    pmaddwd m5, reg_a
     paddd m2, m7
     paddd m1, m5
-    paddd m6, [pd_8192]
-    paddd m2, [pd_8192]
+    paddd m6, %1
+    paddd m2, %1
     psrad m6, 14
     psrad m2, 14
     paddd m0, m6 ; t0
     paddd m2, m1 ; t2
-    pmaddwd m1, reg_b, [pw_m5283_m15212]
-    pmaddwd m6, m4, [pw_9929_13377]
-    pmaddwd m7, m3, [pw_m5283_m15212]
-    pmaddwd m5, reg_a, [pw_9929_13377]
+    mova m7, [pw_m5283_m15212]
+    mova m5, [pw_9929_13377]
+    pmaddwd m1, m7, reg_b
+    pmaddwd m6, m4, m5
+    pmaddwd m7, m3
+    pmaddwd m5, reg_a
     paddd m6, m1
     paddd m7, m5
     UNSCRATCH 5, 9, rsp+1*mmsize, b
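The mova/pmaddwd groups above compute the four IADST4 dot products documented in the out0..out3 comments before the macro (coefficients 5283, 9929, 13377, 15212). Transcribed into scalar C straight from those comments (the SIMD additionally uses the same 14-bit split as SUMSUB_MUL):

```c
#include <stdint.h>

#define ROUND14(x) (((x) + 8192) >> 14)

/* Scalar transcription of the IADST4 output formulas quoted above
 * IADST4_12BPP_1D. */
static void iadst4_col(const int32_t in[4], int32_t out[4])
{
    out[0] = ROUND14( 5283 * in[0] + 13377 * in[1] + 15212 * in[2] +  9929 * in[3]);
    out[1] = ROUND14( 9929 * in[0] + 13377 * in[1] -  5283 * in[2] - 15212 * in[3]);
    out[2] = ROUND14(13377 * in[0]                 - 13377 * in[2] + 13377 * in[3]);
    out[3] = ROUND14(15212 * in[0] - 13377 * in[1] +  9929 * in[2] -  5283 * in[3]);
}
```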
@@ -534,8 +554,8 @@ cglobal vp9_idct_idct_4x4_add_12, 4, 4, 6, dst, stride, block, eob
     pmaddwd m1, [pw_15212_m13377]
     paddd m4, m5
     paddd m3, m1
-    paddd m6, [pd_8192]
-    paddd m4, [pd_8192]
+    paddd m6, %1
+    paddd m4, %1
     psrad m6, 14
     psrad m4, 14
     paddd m7, m6 ; t1
@@ -545,15 +565,17 @@ cglobal vp9_idct_idct_4x4_add_12, 4, 4, 6, dst, stride, block, eob
 %endmacro
 %macro IADST4_12BPP_FN 4
-cglobal vp9_%1_%3_4x4_add_12, 3, 3, 10, 2 * ARCH_X86_32 * mmsize, dst, stride, block, eob
+cglobal vp9_%1_%3_4x4_add_12, 3, 3, 12, 2 * ARCH_X86_32 * mmsize, dst, stride, block, eob
     mova m0, [blockq+0*16]
     mova m1, [blockq+1*16]
     mova m2, [blockq+2*16]
     mova m3, [blockq+3*16]
-    %2_12BPP_1D
+    PRELOAD 10, pd_8192, rnd
+    PRELOAD 11, pd_3fff, mask
+    %2_12BPP_1D reg_rnd, reg_mask
     TRANSPOSE4x4D 0, 1, 2, 3, 4
-    %4_12BPP_1D
+    %4_12BPP_1D reg_rnd, reg_mask
     pxor m4, m4
     ZERO_BLOCK blockq, 16, 4, m4
@@ -562,7 +584,8 @@ cglobal vp9_%1_%3_4x4_add_12, 3, 3, 10, 2 * ARCH_X86_32 * mmsize, dst, stride, b
     DEFINE_ARGS dst, stride, stride3
     lea stride3q, [strideq*3]
     mova m5, [pw_4095]
-    ROUND_AND_STORE_4x4 0, 1, 2, 3, m4, m5, [pd_8], 4
+    mova m6, [pd_8]
+    ROUND_AND_STORE_4x4 0, 1, 2, 3, m4, m5, m6, 4
     RET
 %endmacro
@@ -573,30 +596,30 @@ IADST4_12BPP_FN iadst, IADST4, iadst, IADST4
 ; the following line has not been executed at the end of this macro:
 ; UNSCRATCH 6, 8, rsp+%3*mmsize
-%macro IDCT8_1D 1-3 2 * mmsize, 17 ; src, src_stride, stack_offset
-    mova m0, [%1+0*%2]
-    mova m2, [%1+2*%2]
-    mova m4, [%1+4*%2]
-    mova m6, [%1+6*%2]
-    IDCT4_12BPP_1D 0, 2, 4, 6, 1, 3 ; m0/2/4/6 have t0/1/2/3
-    SCRATCH 4, 8, rsp+(%3+0)*mmsize
-    SCRATCH 6, 9, rsp+(%3+1)*mmsize
-    mova m1, [%1+1*%2]
-    mova m3, [%1+3*%2]
-    mova m5, [%1+5*%2]
-    mova m7, [%1+7*%2]
-    SUMSUB_MUL 1, 7, 4, 6, 16069, 3196 ; m1=t7a, m7=t4a
-    SUMSUB_MUL 5, 3, 4, 6, 9102, 13623 ; m5=t6a, m3=t5a
+%macro IDCT8_1D 1-5 [pd_8192], [pd_3fff], 2 * mmsize, 17 ; src, rnd, mask, src_stride, stack_offset
+    mova m0, [%1+0*%4]
+    mova m2, [%1+2*%4]
+    mova m4, [%1+4*%4]
+    mova m6, [%1+6*%4]
+    IDCT4_12BPP_1D %2, %3, 0, 2, 4, 6, 1, 3 ; m0/2/4/6 have t0/1/2/3
+    SCRATCH 4, 8, rsp+(%5+0)*mmsize
+    SCRATCH 6, 9, rsp+(%5+1)*mmsize
+    mova m1, [%1+1*%4]
+    mova m3, [%1+3*%4]
+    mova m5, [%1+5*%4]
+    mova m7, [%1+7*%4]
+    SUMSUB_MUL 1, 7, 4, 6, 16069, 3196, %2, %3 ; m1=t7a, m7=t4a
+    SUMSUB_MUL 5, 3, 4, 6, 9102, 13623, %2, %3 ; m5=t6a, m3=t5a
     SUMSUB_BA d, 3, 7, 4 ; m3=t4, m7=t5a
     SUMSUB_BA d, 5, 1, 4 ; m5=t7, m1=t6a
-    SUMSUB_MUL 1, 7, 4, 6, 11585, 11585 ; m1=t6, m7=t5
+    SUMSUB_MUL 1, 7, 4, 6, 11585, 11585, %2, %3 ; m1=t6, m7=t5
     SUMSUB_BA d, 5, 0, 4 ; m5=out0, m0=out7
     SUMSUB_BA d, 1, 2, 4 ; m1=out1, m2=out6
-    UNSCRATCH 4, 8, rsp+(%3+0)*mmsize
-    UNSCRATCH 6, 9, rsp+(%3+1)*mmsize
-    SCRATCH 2, 8, rsp+(%3+0)*mmsize
+    UNSCRATCH 4, 8, rsp+(%5+0)*mmsize
+    UNSCRATCH 6, 9, rsp+(%5+1)*mmsize
+    SCRATCH 2, 8, rsp+(%5+0)*mmsize
     SUMSUB_BA d, 7, 4, 2 ; m7=out2, m4=out5
     SUMSUB_BA d, 3, 6, 2 ; m3=out3, m6=out4
     SWAP 0, 5, 4, 6, 2, 7
 %endmacro
@@ -613,23 +636,12 @@ IADST4_12BPP_FN iadst, IADST4, iadst, IADST4
     mova [%6+%7*1], m%2
 %endmacro
-%macro PRELOAD 2-3
-%if ARCH_X86_64
-    mova m%1, [%2]
-%if %0 == 3
-%define reg_%3 m%1
-%endif
-%elif %0 == 3
-%define reg_%3 [%2]
-%endif
-%endmacro
 ; FIXME we can use the intermediate storage (rsp[0-15]) on x86-32 for temp
 ; storage also instead of allocating two more stack spaces. This doesn't
 ; matter much but it's something...
 INIT_XMM sse2
-cglobal vp9_idct_idct_8x8_add_10, 4, 6 + ARCH_X86_64, 10, \
-        17 * mmsize + 2 * ARCH_X86_32 * mmsize, \
+cglobal vp9_idct_idct_8x8_add_10, 4, 6 + ARCH_X86_64, 14, \
+        16 * mmsize + 3 * ARCH_X86_32 * mmsize, \
         dst, stride, block, eob
     mova m0, [pw_1023]
     cmp eobd, 1
@@ -654,7 +666,7 @@ cglobal vp9_idct_idct_8x8_add_10, 4, 6 + ARCH_X86_64, 10, \
     RET
 .idctfull:
-    mova [rsp+16*mmsize], m0
+    SCRATCH 0, 12, rsp+16*mmsize, max
     DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
 %if ARCH_X86_64
     mov dstbakq, dstq
@@ -669,8 +681,11 @@ cglobal vp9_idct_idct_8x8_add_10, 4, 6 + ARCH_X86_64, 10, \
     mov skipd, 2
     sub skipd, cntd
     mov ptrq, rsp
+    PRELOAD 10, pd_8192, rnd
+    PRELOAD 11, pd_3fff, mask
+    PRELOAD 13, pd_16, srnd
 .loop_1:
-    IDCT8_1D blockq
+    IDCT8_1D blockq, reg_rnd, reg_mask
     TRANSPOSE4x4D 0, 1, 2, 3, 6
     mova [ptrq+ 0*mmsize], m0
@@ -709,14 +724,15 @@ cglobal vp9_idct_idct_8x8_add_10, 4, 6 + ARCH_X86_64, 10, \
     mov cntd, 2
     mov ptrq, rsp
 .loop_2:
-    IDCT8_1D ptrq
+    IDCT8_1D ptrq, reg_rnd, reg_mask
     pxor m6, m6
-    PRELOAD 9, rsp+16*mmsize, max
-    ROUND_AND_STORE_4x4 0, 1, 2, 3, m6, reg_max, [pd_16], 5
+    ROUND_AND_STORE_4x4 0, 1, 2, 3, m6, reg_max, reg_srnd, 5
     lea dstq, [dstq+strideq*4]
     UNSCRATCH 0, 8, rsp+17*mmsize
-    ROUND_AND_STORE_4x4 4, 5, 0, 7, m6, reg_max, [pd_16], 5
+    UNSCRATCH 1, 12, rsp+16*mmsize, max
+    UNSCRATCH 2, 13, pd_16, srnd
+    ROUND_AND_STORE_4x4 4, 5, 0, 7, m6, m1, m2, 5
     add ptrq, 16
 %if ARCH_X86_64
     lea dstq, [dstbakq+8]
@@ -763,8 +779,8 @@ cglobal vp9_idct_idct_8x8_add_10, 4, 6 + ARCH_X86_64, 10, \
 %endmacro
 INIT_XMM sse2
-cglobal vp9_idct_idct_8x8_add_12, 4, 6 + ARCH_X86_64, 10, \
-        17 * mmsize + 2 * ARCH_X86_32 * mmsize, \
+cglobal vp9_idct_idct_8x8_add_12, 4, 6 + ARCH_X86_64, 14, \
+        16 * mmsize + 3 * ARCH_X86_32 * mmsize, \
         dst, stride, block, eob
     mova m0, [pw_4095]
     cmp eobd, 1
@@ -791,9 +807,9 @@ cglobal vp9_idct_idct_8x8_add_12, 4, 6 + ARCH_X86_64, 10, \
 ;
 ; dst1[hi]:dst3[lo] = src1 * coef1 + src2 * coef2
 ; dst2[hi]:dst4[lo] = src1 * coef2 - src2 * coef1
-%macro SUMSUB_MUL_D 6 ; src/dst 1-2, dst3-4, coef1-2
-    pand m%3, m%1, [pd_3fff]
-    pand m%4, m%2, [pd_3fff]
+%macro SUMSUB_MUL_D 6-7 [pd_3fff] ; src/dst 1-2, dst3-4, coef1-2, mask
+    pand m%3, m%1, %7
+    pand m%4, m%2, %7
     psrad m%1, 14
     psrad m%2, 14
     packssdw m%4, m%2
@@ -808,11 +824,11 @@ cglobal vp9_idct_idct_8x8_add_12, 4, 6 + ARCH_X86_64, 10, \
 ; dst1 = src2[hi]:src4[lo] + src1[hi]:src3[lo] + rnd >> 14
 ; dst2 = src2[hi]:src4[lo] - src1[hi]:src3[lo] + rnd >> 14
-%macro SUMSUB_PACK_D 5 ; src/dst 1-2, src3-4, tmp
+%macro SUMSUB_PACK_D 5-6 [pd_8192] ; src/dst 1-2, src3-4, tmp, rnd
     SUMSUB_BA d, %1, %2, %5
     SUMSUB_BA d, %3, %4, %5
-    paddd m%3, [pd_8192]
-    paddd m%4, [pd_8192]
+    paddd m%3, %6
+    paddd m%4, %6
     psrad m%3, 14
     psrad m%4, 14
     paddd m%1, m%3
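SUMSUB_MUL_D and SUMSUB_PACK_D split the job of SUMSUB_MUL: the _D variant leaves each dot product as an unrounded high/low pair (its value is hi + ((lo + 8192) >> 14)), and _PACK_D adds or subtracts two such pairs before the single rounding shift, so the intermediate sum loses no precision. A scalar sketch of that pairing as I read the two macros:

```c
#include <stdint.h>

/* One dot product kept unrounded: its value is hi + ((lo + 8192) >> 14). */
typedef struct { int32_t hi, lo; } mul_d;

/* Sketch of one of the two products SUMSUB_MUL_D produces. */
static mul_d sumsub_mul_d(int32_t src1, int32_t src2, int32_t coef1, int32_t coef2)
{
    mul_d r;
    r.hi = (src1 >> 14)    * coef1 + (src2 >> 14)    * coef2;
    r.lo = (src1 & 0x3fff) * coef1 + (src2 & 0x3fff) * coef2;
    return r;
}

/* Sketch of SUMSUB_PACK_D: combine two unrounded products (dst1 = b + a,
 * dst2 = b - a, per the comments above) and round only once. */
static void sumsub_pack_d(mul_d a, mul_d b, int32_t *dst1, int32_t *dst2)
{
    *dst1 = (b.hi + a.hi) + ((b.lo + a.lo + 8192) >> 14);
    *dst2 = (b.hi - a.hi) + ((b.lo - a.lo + 8192) >> 14);
}
```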
@@ -830,17 +846,17 @@ cglobal vp9_idct_idct_8x8_add_12, 4, 6 + ARCH_X86_64, 10, \
 ; the following line has not been executed at the end of this macro:
 ; UNSCRATCH 6, 8, rsp+17*mmsize
-%macro IADST8_1D 1 ; src
+%macro IADST8_1D 1-3 [pd_8192], [pd_3fff] ; src, rnd, mask
     mova m0, [%1+ 0*mmsize]
     mova m3, [%1+ 6*mmsize]
     mova m4, [%1+ 8*mmsize]
     mova m7, [%1+14*mmsize]
-    SUMSUB_MUL_D 7, 0, 1, 2, 16305, 1606 ; m7/1=t0a, m0/2=t1a
-    SUMSUB_MUL_D 3, 4, 5, 6, 10394, 12665 ; m3/5=t4a, m4/6=t5a
+    SUMSUB_MUL_D 7, 0, 1, 2, 16305, 1606, %3 ; m7/1=t0a, m0/2=t1a
+    SUMSUB_MUL_D 3, 4, 5, 6, 10394, 12665, %3 ; m3/5=t4a, m4/6=t5a
     SCRATCH 0, 8, rsp+17*mmsize
-    SUMSUB_PACK_D 3, 7, 5, 1, 0 ; m3=t0, m7=t4
+    SUMSUB_PACK_D 3, 7, 5, 1, 0, %2 ; m3=t0, m7=t4
     UNSCRATCH 0, 8, rsp+17*mmsize
-    SUMSUB_PACK_D 4, 0, 6, 2, 1 ; m4=t1, m0=t5
+    SUMSUB_PACK_D 4, 0, 6, 2, 1, %2 ; m4=t1, m0=t5
     SCRATCH 3, 8, rsp+17*mmsize
     SCRATCH 4, 9, rsp+18*mmsize
@@ -851,26 +867,26 @@ cglobal vp9_idct_idct_8x8_add_12, 4, 6 + ARCH_X86_64, 10, \
     mova m2, [%1+ 4*mmsize]
     mova m5, [%1+10*mmsize]
     mova m6, [%1+12*mmsize]
-    SUMSUB_MUL_D 5, 2, 3, 4, 14449, 7723 ; m5/8=t2a, m2/9=t3a
-    SUMSUB_MUL_D 1, 6, 7, 0, 4756, 15679 ; m1/10=t6a, m6/11=t7a
+    SUMSUB_MUL_D 5, 2, 3, 4, 14449, 7723, %3 ; m5/8=t2a, m2/9=t3a
+    SUMSUB_MUL_D 1, 6, 7, 0, 4756, 15679, %3 ; m1/10=t6a, m6/11=t7a
     SCRATCH 2, 12, rsp+21*mmsize
-    SUMSUB_PACK_D 1, 5, 7, 3, 2 ; m1=t2, m5=t6
+    SUMSUB_PACK_D 1, 5, 7, 3, 2, %2 ; m1=t2, m5=t6
     UNSCRATCH 2, 12, rsp+21*mmsize
-    SUMSUB_PACK_D 6, 2, 0, 4, 3 ; m6=t3, m2=t7
+    SUMSUB_PACK_D 6, 2, 0, 4, 3, %2 ; m6=t3, m2=t7
     UNSCRATCH 7, 10, rsp+19*mmsize
     UNSCRATCH 0, 11, rsp+20*mmsize
     SCRATCH 1, 10, rsp+19*mmsize
     SCRATCH 6, 11, rsp+20*mmsize
-    SUMSUB_MUL_D 7, 0, 3, 4, 15137, 6270 ; m7/8=t4a, m0/9=t5a
-    SUMSUB_MUL_D 2, 5, 1, 6, 6270, 15137 ; m2/10=t7a, m5/11=t6a
+    SUMSUB_MUL_D 7, 0, 3, 4, 15137, 6270, %3 ; m7/8=t4a, m0/9=t5a
+    SUMSUB_MUL_D 2, 5, 1, 6, 6270, 15137, %3 ; m2/10=t7a, m5/11=t6a
     SCRATCH 2, 12, rsp+21*mmsize
-    SUMSUB_PACK_D 5, 7, 6, 3, 2 ; m5=-out1, m7=t6
+    SUMSUB_PACK_D 5, 7, 6, 3, 2, %2 ; m5=-out1, m7=t6
     UNSCRATCH 2, 12, rsp+21*mmsize
     NEGD m5 ; m5=out1
-    SUMSUB_PACK_D 2, 0, 1, 4, 3 ; m2=out6, m0=t7
-    SUMSUB_MUL 7, 0, 3, 4, 11585, 11585 ; m7=out2, m0=-out5
+    SUMSUB_PACK_D 2, 0, 1, 4, 3, %2 ; m2=out6, m0=t7
+    SUMSUB_MUL 7, 0, 3, 4, 11585, 11585, %2, %3 ; m7=out2, m0=-out5
     NEGD m0 ; m0=out5
     UNSCRATCH 3, 8, rsp+17*mmsize
@@ -883,7 +899,7 @@ cglobal vp9_idct_idct_8x8_add_12, 4, 6 + ARCH_X86_64, 10, \
     SUMSUB_BA d, 1, 3, 2 ; m1=out0, m3=t2
     SUMSUB_BA d, 6, 4, 2 ; m6=-out7, m4=t3
     NEGD m6 ; m6=out7
-    SUMSUB_MUL 3, 4, 2, 0, 11585, 11585 ; m3=-out3, m4=out4
+    SUMSUB_MUL 3, 4, 2, 0, 11585, 11585, %2, %3 ; m3=-out3, m4=out4
     NEGD m3 ; m3=out3
     UNSCRATCH 0, 9, rsp+18*mmsize
@@ -899,7 +915,7 @@ cglobal vp9_%1_%3_8x8_add_10, 4, 6 + ARCH_X86_64, 16, \
     mova m0, [pw_1023]
 .body:
-    mova [rsp+16*mmsize], m0
+    SCRATCH 0, 13, rsp+16*mmsize, max
     DEFINE_ARGS dst, stride, block, cnt, ptr, skip, dstbak
 %if ARCH_X86_64
     mov dstbakq, dstq
@@ -914,8 +930,10 @@ cglobal vp9_%1_%3_8x8_add_10, 4, 6 + ARCH_X86_64, 16, \
     mov skipd, 2
     sub skipd, cntd
     mov ptrq, rsp
+    PRELOAD 14, pd_8192, rnd
+    PRELOAD 15, pd_3fff, mask
 .loop_1:
-    %2_1D blockq
+    %2_1D blockq, reg_rnd, reg_mask
     TRANSPOSE4x4D 0, 1, 2, 3, 6
     mova [ptrq+ 0*mmsize], m0
@@ -954,14 +972,16 @@ cglobal vp9_%1_%3_8x8_add_10, 4, 6 + ARCH_X86_64, 16, \
     mov cntd, 2
     mov ptrq, rsp
 .loop_2:
-    %4_1D ptrq
+    %4_1D ptrq, reg_rnd, reg_mask
     pxor m6, m6
-    PRELOAD 9, rsp+16*mmsize, max
-    ROUND_AND_STORE_4x4 0, 1, 2, 3, m6, reg_max, [pd_16], 5
+    PRELOAD 9, pd_16, srnd
+    ROUND_AND_STORE_4x4 0, 1, 2, 3, m6, reg_max, reg_srnd, 5
     lea dstq, [dstq+strideq*4]
     UNSCRATCH 0, 8, rsp+17*mmsize
-    ROUND_AND_STORE_4x4 4, 5, 0, 7, m6, reg_max, [pd_16], 5
+    UNSCRATCH 1, 13, rsp+16*mmsize, max
+    UNSCRATCH 2, 9, pd_16, srnd
+    ROUND_AND_STORE_4x4 4, 5, 0, 7, m6, m1, m2, 5
     add ptrq, 16
 %if ARCH_X86_64
     lea dstq, [dstbakq+8]
@@ -989,7 +1009,7 @@ IADST8_FN iadst, IADST8, idct, IDCT8, col
 IADST8_FN iadst, IADST8, iadst, IADST8, default
 %macro IDCT16_1D 1-4 4 * mmsize, 65, 67 ; src, src_stride, stack_offset, mm32bit_stack_offset
-    IDCT8_1D %1, %2 * 2, %4 ; m0-3=t0-3a, m4-5/m8|r67/m7=t4-7
+    IDCT8_1D %1, [pd_8192], [pd_3fff], %2 * 2, %4 ; m0-3=t0-3a, m4-5/m8|r67/m7=t4-7
     ; SCRATCH 6, 8, rsp+(%4+0)*mmsize ; t6
     SCRATCH 0, 15, rsp+(%4+7)*mmsize ; t0a
     SCRATCH 1, 14, rsp+(%4+6)*mmsize ; t1a
@@ -1186,7 +1206,9 @@ cglobal vp9_idct_idct_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
     ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6
     lea dstq, [dstq+strideq*4]
     mova m0, [rsp+65*mmsize]
-    ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, [rsp+64*mmsize], [pd_32], 6
+    mova m1, [rsp+64*mmsize]
+    mova m2, [pd_32]
+    ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6
 %if ARCH_X86_64
     DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst
@@ -1194,10 +1216,10 @@ cglobal vp9_idct_idct_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
     mov dstq, dstm
 %endif
     UNSCRATCH 0, 8, rsp+67*mmsize
-    UNSCRATCH 1, 9, rsp+68*mmsize
-    UNSCRATCH 2, 10, rsp+69*mmsize
+    UNSCRATCH 4, 9, rsp+68*mmsize
+    UNSCRATCH 5, 10, rsp+69*mmsize
     UNSCRATCH 3, 11, rsp+70*mmsize
-    ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6
+    ROUND_AND_STORE_4x4 0, 4, 5, 3, m7, m1, m2, 6
 %if ARCH_X86_64
     DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
     lea dstq, [dstbakq+stride3q*4]
@@ -1208,7 +1230,7 @@ cglobal vp9_idct_idct_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
     UNSCRATCH 5, 13, rsp+72*mmsize
     UNSCRATCH 6, 14, rsp+73*mmsize
     UNSCRATCH 0, 15, rsp+74*mmsize
-    ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, [rsp+64*mmsize], [pd_32], 6
+    ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6
     add ptrq, mmsize
 %if ARCH_X86_64
@@ -1501,7 +1523,9 @@ cglobal vp9_%1_%4_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
     ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6
     lea dstq, [dstq+strideq*4]
     mova m0, [rsp+65*mmsize]
-    ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, [rsp+64*mmsize], [pd_32], 6
+    mova m1, [rsp+64*mmsize]
+    mova m2, [pd_32]
+    ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6
 %if ARCH_X86_64
     DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst
@@ -1509,10 +1533,10 @@ cglobal vp9_%1_%4_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
     mov dstq, dstm
 %endif
     UNSCRATCH 0, 8, rsp+(%6+0)*mmsize
-    UNSCRATCH 1, 9, rsp+(%6+1)*mmsize
-    UNSCRATCH 2, 10, rsp+(%6+2)*mmsize
+    UNSCRATCH 4, 9, rsp+(%6+1)*mmsize
+    UNSCRATCH 5, 10, rsp+(%6+2)*mmsize
     UNSCRATCH 3, 11, rsp+(%6+3)*mmsize
-    ROUND_AND_STORE_4x4 0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6
+    ROUND_AND_STORE_4x4 0, 4, 5, 3, m7, m1, m2, 6
 %if ARCH_X86_64
     DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
     lea dstq, [dstbakq+stride3q*4]
@@ -1523,7 +1547,7 @@ cglobal vp9_%1_%4_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
     UNSCRATCH 5, 13, rsp+(%6+5)*mmsize
     UNSCRATCH 6, 14, rsp+(%6+6)*mmsize
     UNSCRATCH 0, 15, rsp+(%6+7)*mmsize
-    ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, [rsp+64*mmsize], [pd_32], 6
+    ROUND_AND_STORE_4x4 4, 5, 6, 0, m7, m1, m2, 6
     add ptrq, mmsize
 %if ARCH_X86_64