Commit 0810608e authored by Christophe Gisquet, committed by Michael Niedermayer

x86: hevc_mc: better register allocation

The xmm register count was incorrect, and manually loading the GPR arguments
furthermore allows a noticeable reduction in the number of registers needed
(see the sketch below).

The modified functions are used in weighted prediction, so only a few
samples such as WP_* exhibit a change. For those samples on Win64
(some widths omitted because they occur too rarely):

WP_A_Toshiba_3.bit, ff_hevc_put_hevc_uni_w
width:   16    32
before: 2194  3872
after:  2119  3767

WP_B_Toshiba_3.bit, ff_hevc_put_hevc_bi_w
width:   16    32    64
before: 2819  4960  9396
after:  2617  4788  9150
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
parent ec33f59f
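
For readers unfamiliar with x86inc.asm's calling-convention helpers, here is a
minimal sketch of the pattern this patch switches to, mirroring the new uni_w
prologue in the diff below (illustration only; the loop body is elided, and the
4-GPR declaration shown is the WIN64/x86_32 branch, while a 6-GPR variant covers
the other ABIs). Rather than having cglobal load every argument into a
general-purpose register, only the pointer/stride arguments are loaded
automatically, and the remaining scalars are fetched on demand through their
'm'-suffixed aliases:

%macro WEIGHTING_FUNCS 2 ; %1: block width, %2: bit depth ("bitd")
; Sketch: 4 arguments loaded into GPRs, 5 GPRs in total, 7 XMM registers,
; instead of the previous 8 loaded arguments, 10 GPRs and 11 XMM registers.
cglobal hevc_put_hevc_uni_w%1_%2, 4, 5, 7, dst, dststride, src, srcstride, height, denom, wx, ox
    mov             r4d, denomm    ; arguments past the first 4 are read on demand
    movd             m2, wxm       ; through their 'm'-suffixed aliases (stack slot
    mov             r4d, heightm   ; or register, depending on the ABI)
    ; ... weighting loop as in the diff below ...
%endmacro

Reusing a single scratch GPR (r4d here) for the shift, ox and height values is
what lets the prologue request only 5 GPRs instead of 10.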
@@ -1103,19 +1103,32 @@ cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 9, 11, 16, dst, dststride, src, srcstride
 %endmacro
 
 %macro WEIGHTING_FUNCS 2
-cglobal hevc_put_hevc_uni_w%1_%2, 8, 10, 11, dst, dststride, src, srcstride, height, denom, wx, ox, shift
-    lea          shiftd, [denomd+14-%2]          ; shift = 14 - bitd + denom
-    shl             oxd, %2-8                    ; ox << (bitd - 8)
-    movd             m2, wxd        ; WX
-    movd             m3, oxd        ; OX
-    movd             m4, shiftd     ; shift
+%if WIN64 || ARCH_X86_32
+cglobal hevc_put_hevc_uni_w%1_%2, 4, 5, 7, dst, dststride, src, srcstride, height, denom, wx, ox
+    mov             r4d, denomm
+%define SHIFT  r4d
+%else
+cglobal hevc_put_hevc_uni_w%1_%2, 6, 6, 7, dst, dststride, src, srcstride, height, denom, wx, ox
+%define SHIFT  denomd
+%endif
+    lea           SHIFT, [SHIFT+14-%2]           ; shift = 14 - bitd + denom
+    movd             m2, wxm        ; WX
+    movd             m4, SHIFT      ; shift
     punpcklwd        m2, m2
-    pshufd           m3, m3, 0
-    pshufd           m2, m2, 0
-    sub          shiftd, 1
-    movd             m6, shiftd
+    dec           SHIFT
     movdqu           m5, [one_per_32]
+    movd             m6, SHIFT
+    pshufd           m2, m2, 0
+    mov           SHIFT, oxm
     pslld            m5, m6
+%if %2 != 8
+    shl           SHIFT, %2-8       ; ox << (bitd - 8)
+%endif
+    movd             m3, SHIFT      ; OX
+    pshufd           m3, m3, 0
+%if WIN64 || ARCH_X86_32
+    mov           SHIFT, heightm
+%endif
 .loop
     SIMPLE_LOAD      %1, 10, srcq, m0
     pmulhw           m6, m0, m2
@@ -1141,38 +1154,42 @@ cglobal hevc_put_hevc_uni_w%1_%2, 8, 10, 11, dst, dststride, src, srcstride, hei
     jnz               .loop                      ; height loop
     RET
 
-cglobal hevc_put_hevc_bi_w%1_%2, 12, 14, 14, dst, dststride, src, srcstride, src2, src2stride, height, denom, wx0, wx1, ox0, ox1, shift, temp
-    shl            ox0d, %2-8                    ; ox << (bitd - 8)
-    shl            ox1d, %2-8                    ; ox << (bitd - 8)
-    lea          shiftd, [denomd+14-%2]          ; shift = 14 - bitd + denom
-    movd             m2, wx0d       ; WX0
-    movd             m3, wx1d       ; WX1
+cglobal hevc_put_hevc_bi_w%1_%2, 6, 7, 10, dst, dststride, src, srcstride, src2, src2stride, height, denom, wx0, wx1, ox0, ox1
+    mov             r6d, denomm
+    movd             m2, wx0m       ; WX0
+    lea             r6d, [r6d+14-%2]             ; shift = 14 - bitd + denom
+    movd             m3, wx1m       ; WX1
+    movd             m0, r6d        ; shift
     punpcklwd        m2, m2
+    inc             r6d
     punpcklwd        m3, m3
+    movd             m5, r6d        ; shift+1
     pshufd           m2, m2, 0
+    mov             r6d, ox0m
     pshufd           m3, m3, 0
-    add            ox0d, ox1d
-    add            ox0d, 1
-    movd             m4, ox0d       ; offset
+    add             r6d, ox1m
+%if %2 != 8
+    shl             r6d, %2-8       ; ox << (bitd - 8)
+%endif
+    inc             r6d
+    movd             m4, r6d        ; offset
     pshufd           m4, m4, 0
-    movd             m5, shiftd     ; shift
-    pslld            m4, m5
-    add          shiftd, 1
-    movd             m5, shiftd     ; shift
+    mov             r6d, heightm
+    pslld            m4, m0
 .loop
     SIMPLE_LOAD      %1, 10, srcq,  m0
-    SIMPLE_LOAD      %1, 10, src2q, m10
+    SIMPLE_LOAD      %1, 10, src2q, m8
     pmulhw           m6, m0, m3
     pmullw           m0, m3
-    pmulhw           m7, m10, m2
-    pmullw          m10, m2
+    pmulhw           m7, m8, m2
+    pmullw           m8, m2
     punpckhwd        m1, m0, m6
     punpcklwd        m0, m6
-    punpckhwd       m11, m10, m7
-    punpcklwd       m10, m7
-    paddd            m0, m10
-    paddd            m1, m11
+    punpckhwd        m9, m8, m7
+    punpcklwd        m8, m7
+    paddd            m0, m8
+    paddd            m1, m9
     paddd            m0, m4
     paddd            m1, m4
     psrad            m0, m5
@@ -1187,7 +1204,7 @@ cglobal hevc_put_hevc_bi_w%1_%2, 12, 14, 14, dst, dststride, src, srcstride, src
     lea            dstq, [dstq+dststrideq]       ; dst += dststride
     lea            srcq, [srcq+2*srcstrideq]     ; src += srcstride
     lea           src2q, [src2q+2*src2strideq]   ; src2 += srcstride
-    dec          heightd                         ; cmp height
+    dec             r6d                          ; cmp height
     jnz               .loop                      ; height loop
     RET
 %endmacro