Commit ac8ad8d0 authored by James Almer

x86/sbrdsp: sign extend start and end gprs in ff_sbr_hf_gen_sse

Tested-by: Michael Niedermayer <michael@niedermayer.cc>
Signed-off-by: James Almer <jamrial@gmail.com>
parent de54a37c
...@@ -149,19 +149,19 @@ cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E ...@@ -149,19 +149,19 @@ cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
; start and end 6th and 7th args on stack ; start and end 6th and 7th args on stack
mov r2d, Sm mov r2d, Sm
mov r3d, Em mov r3d, Em
%define start r2q DEFINE_ARGS X_high, X_low, start, end
%define end r3q
%else %else
; BW does not actually occupy a register, so shift by 1 ; BW does not actually occupy a register, so shift by 1
%define start BWq DEFINE_ARGS X_high, X_low, alpha0, alpha1, start, end
%define end Sq movsxd startq, startd
movsxd endq, endd
%endif %endif
sub start, end ; neg num of loops sub startq, endq ; neg num of loops
lea X_highq, [X_highq + end*2*4] lea X_highq, [X_highq + endq*2*4]
lea X_lowq, [X_lowq + end*2*4 - 2*2*4] lea X_lowq, [X_lowq + endq*2*4 - 2*2*4]
shl start, 3 ; offset from num loops shl startq, 3 ; offset from num loops
mova m0, [X_lowq + start] mova m0, [X_lowq + startq]
shufps m3, m3, q1111 shufps m3, m3, q1111
shufps m4, m4, q1111 shufps m4, m4, q1111
xorps m3, [ps_mask] xorps m3, [ps_mask]
...@@ -169,7 +169,7 @@ cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E ...@@ -169,7 +169,7 @@ cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
shufps m2, m2, q0000 shufps m2, m2, q0000
xorps m4, [ps_mask] xorps m4, [ps_mask]
.loop2: .loop2:
movu m7, [X_lowq + start + 8] ; BbCc movu m7, [X_lowq + startq + 8] ; BbCc
mova m6, m0 mova m6, m0
mova m5, m7 mova m5, m7
shufps m0, m0, q2301 ; aAbB shufps m0, m0, q2301 ; aAbB
...@@ -179,12 +179,12 @@ cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E ...@@ -179,12 +179,12 @@ cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
mulps m6, m2 mulps m6, m2
mulps m5, m1 mulps m5, m1
addps m7, m0 addps m7, m0
mova m0, [X_lowq + start +16] ; CcDd mova m0, [X_lowq + startq + 16] ; CcDd
addps m7, m0 addps m7, m0
addps m6, m5 addps m6, m5
addps m7, m6 addps m7, m6
mova [X_highq + start], m7 mova [X_highq + startq], m7
add start, 16 add startq, 16
jnz .loop2 jnz .loop2
RET RET
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment