Commit a5bfa66d authored by Christophe Gisquet, committed by Mans Rullgard

x86: fft: replace call to memcpy by a loop

The function call was a mess to handle, and memcpy cannot make
the assumptions we do in the new code.

Tested on an IMC sample: 430c -> 370c.
Signed-off-by: Mans Rullgard <mans@mansr.com>
parent 75d339e0
...@@ -615,8 +615,6 @@ cglobal fft_calc, 2,5,8 ...@@ -615,8 +615,6 @@ cglobal fft_calc, 2,5,8
.end: .end:
REP_RET REP_RET
cextern_naked memcpy
cglobal fft_permute, 2,7,1 cglobal fft_permute, 2,7,1
mov r4, [r0 + FFTContext.revtab] mov r4, [r0 + FFTContext.revtab]
mov r5, [r0 + FFTContext.tmpbuf] mov r5, [r0 + FFTContext.tmpbuf]
...@@ -637,29 +635,18 @@ cglobal fft_permute, 2,7,1 ...@@ -637,29 +635,18 @@ cglobal fft_permute, 2,7,1
cmp r0, r2 cmp r0, r2
jl .loop jl .loop
shl r2, 3 shl r2, 3
%if ARCH_X86_64 add r1, r2
mov r0, r1 add r5, r2
mov r1, r5 neg r2
%endif ; nbits >= 2 (FFT4) and sizeof(FFTComplex)=8 => at least 32B
%if WIN64 .loopcopy:
sub rsp, 8 movaps xmm0, [r5 + r2]
call memcpy movaps xmm1, [r5 + r2 + 16]
add rsp, 8 movaps [r1 + r2], xmm0
RET movaps [r1 + r2 + 16], xmm1
%elif ARCH_X86_64 add r2, 32
%ifdef PIC jl .loopcopy
jmp memcpy wrt ..plt REP_RET
%else
jmp memcpy
%endif
%else
push r2
push r5
push r1
call memcpy
add esp, 12
RET
%endif
cglobal imdct_calc, 3,5,3 cglobal imdct_calc, 3,5,3
mov r3d, [r0 + FFTContext.mdctsize] mov r3d, [r0 + FFTContext.mdctsize]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment