Commit b7c16a3f authored by James Almer

Merge commit '681a86ab'

* commit '681a86ab':
  x86: fft: Port to cpuflags
Merged-by: James Almer <jamrial@gmail.com>
parents 11f5ffd3 681a86ab
...@@ -191,6 +191,23 @@ SECTION .text ...@@ -191,6 +191,23 @@ SECTION .text
addps %2, %2, %5 ; {i0,i1,i2,i3} addps %2, %2, %5 ; {i0,i1,i2,i3}
%endmacro %endmacro
; INTERL: interleave the single-precision elements of two registers and store
; the result through an addressing macro. The instruction sequence is chosen
; at expansion time from the active cpuflag.
;   arg1 - register whose elements land in the odd result slots; not written here
;   arg2 - register whose elements land in the even result slots;
;          overwritten with the low-half interleave
;   arg3 - overwritten with the high-half interleave (its incoming value is unused)
;   arg4 - name of a macro that takes an index and yields a memory operand
;   arg5 - index handed to arg4
; Expands to nothing when neither the avx nor the sse cpuflag is set.
%macro INTERL 5
%if cpuflag(avx)
; AVX path: unpack within each 128-bit lane, then store the four 128-bit
; pieces separately. The high unpack is issued first because the low unpack
; overwrites arg2, which is an input to both.
vunpckhps %3, %2, %1
vunpcklps %2, %2, %1
; Lane 0 of each result goes to index arg5, lane 1 to index arg5 + 1.
; "arg4 %+ H" pastes an H onto the macro name given as arg4.
; NOTE(review): the definition of that H-suffixed macro is not visible here -
; confirm which addresses it produces relative to arg4.
vextractf128 %4(%5), %2, 0
vextractf128 %4 %+ H(%5), %3, 0
vextractf128 %4(%5 + 1), %2, 1
vextractf128 %4 %+ H(%5 + 1), %3, 1
%elif cpuflag(sse)
; SSE path: the unpack instructions are destructive, so arg2 is copied into
; arg3 first; each copy is then interleaved with arg1.
mova %3, %2
unpcklps %2, %1
unpckhps %3, %1
; Low interleave is stored at index arg5, high interleave at index arg5 + 1.
mova %4(%5), %2
mova %4(%5+1), %3
%endif
%endmacro
; scheduled for cpu-bound sizes ; scheduled for cpu-bound sizes
%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim %macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
IF%1 mova m4, Z(4) IF%1 mova m4, Z(4)
...@@ -541,17 +558,6 @@ DEFINE_ARGS zc, w, n, o1, o3 ...@@ -541,17 +558,6 @@ DEFINE_ARGS zc, w, n, o1, o3
INIT_YMM avx INIT_YMM avx
%if HAVE_AVX_EXTERNAL %if HAVE_AVX_EXTERNAL
; INTERL_AVX: interleave the single-precision elements of arg2 and arg1 within
; each 128-bit lane and store the four resulting 128-bit pieces.
;   arg1 - register whose elements land in the odd result slots; not written here
;   arg2 - register whose elements land in the even result slots;
;          overwritten with the low-half interleave
;   arg3 - overwritten with the high-half interleave
;   arg4 - name of a macro that takes an index and yields a memory operand
;   arg5 - index handed to arg4
%macro INTERL_AVX 5
; High unpack first: the low unpack below overwrites arg2, an input to both.
vunpckhps %3, %2, %1
vunpcklps %2, %2, %1
; Lane 0 of each result goes to index arg5, lane 1 to index arg5 + 1.
; "arg4 %+ H" pastes an H onto the macro name given as arg4.
; NOTE(review): the H-suffixed macro is defined outside this view - confirm
; the addresses it produces.
vextractf128 %4(%5), %2, 0
vextractf128 %4 %+ H(%5), %3, 0
vextractf128 %4(%5 + 1), %2, 1
vextractf128 %4 %+ H(%5 + 1), %3, 1
%endmacro
%define INTERL INTERL_AVX
DECL_PASS pass_avx, PASS_BIG 1 DECL_PASS pass_avx, PASS_BIG 1
DECL_PASS pass_interleave_avx, PASS_BIG 0 DECL_PASS pass_interleave_avx, PASS_BIG 0
...@@ -566,16 +572,6 @@ cglobal fft_calc, 2,5,8 ...@@ -566,16 +572,6 @@ cglobal fft_calc, 2,5,8
INIT_XMM sse INIT_XMM sse
; INTERL_SSE: interleave the single-precision elements of arg2 and arg1 and
; store the low and high halves of the result at consecutive indices.
;   arg1 - register whose elements land in the odd result slots; not written here
;   arg2 - register whose elements land in the even result slots;
;          overwritten with the low-half interleave
;   arg3 - overwritten with the high-half interleave
;   arg4 - name of a macro that takes an index and yields a memory operand
;   arg5 - index handed to arg4
%macro INTERL_SSE 5
; The unpack instructions are destructive, so keep a second copy of arg2.
mova %3, %2
unpcklps %2, %1
unpckhps %3, %1
; Low interleave at index arg5, high interleave at index arg5 + 1.
mova %4(%5), %2
mova %4(%5+1), %3
%endmacro
%define INTERL INTERL_SSE
DECL_PASS pass_sse, PASS_BIG 1 DECL_PASS pass_sse, PASS_BIG 1
DECL_PASS pass_interleave_sse, PASS_BIG 0 DECL_PASS pass_interleave_sse, PASS_BIG 0
...@@ -861,16 +857,30 @@ INIT_XMM sse ...@@ -861,16 +857,30 @@ INIT_XMM sse
%endmacro %endmacro
; CMUL: multiply-and-combine step used by the post-rotation code; the body is
; selected at expansion time from the active cpuflag. Both variants clobber
; m6 and m7. The meaning of the six arguments differs between the variants.
;
; sse variant (arg4 is not referenced):
;   arg1       - offset added to both table pointers
;   arg2, arg3 - registers updated in place
;   arg5, arg6 - base addresses of the two factor tables
;   with a = [arg5+arg1] and b = [arg6+arg1]:
;     arg2 = arg2*b - arg3*a
;     arg3 = arg3*b + arg2*a      (using the value arg2 had on entry)
;
; 3dnow variant:
;   arg1       - base address of the data
;   arg2       - offset; doubled for the data, used as-is for the tables
;   arg3, arg4 - output registers
;   arg5, arg6 - base addresses of the two factor tables
;   with x = [arg1+arg2*2], y = [arg1+arg2*2+8],
;        a = [arg5+arg2]  and b = [arg6+arg2]:
;     arg3 = y*b - x*a
;     arg4 = x*b + y*a
;
; Expands to nothing when neither cpuflag is set.
%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
%if cpuflag(sse)
    ; NOTE(review): the three-operand forms presumably rely on the project's
    ; instruction wrappers when assembling for plain SSE - confirm.
    mulps      m6, %3, [%5+%1]
    mulps      m7, %2, [%5+%1]
    mulps      %2, %2, [%6+%1]
    mulps      %3, %3, [%6+%1]
    subps      %2, %2, m6
    addps      %3, %3, m7
%elif cpuflag(3dnow)
    ; Load both inputs, keep a second copy of each, then form the four
    ; products and combine them.
    mova       m6, [%1+%2*2]
    mova       %3, [%1+%2*2+8]
    mova       %4, m6
    mova       m7, %3
    pfmul      m6, [%5+%2]
    pfmul      %3, [%6+%2]
    pfmul      %4, [%6+%2]
    pfmul      m7, [%5+%2]
    pfsub      %3, m6
    pfadd      %4, m7
%endif
%endmacro
%macro POSROTATESHUF_AVX 5 ;j, k, z+n8, tcos+n8, tsin+n8 %macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post: .post:
%if cpuflag(avx)
vmovaps ymm1, [%3+%1*2] vmovaps ymm1, [%3+%1*2]
vmovaps ymm0, [%3+%1*2+0x20] vmovaps ymm0, [%3+%1*2+0x20]
vmovaps ymm3, [%3+%2*2] vmovaps ymm3, [%3+%2*2]
...@@ -899,10 +909,7 @@ INIT_XMM sse ...@@ -899,10 +909,7 @@ INIT_XMM sse
sub %2, 0x20 sub %2, 0x20
add %1, 0x20 add %1, 0x20
jl .post jl .post
%endmacro %elif cpuflag(sse)
%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post:
movaps xmm1, [%3+%1*2] movaps xmm1, [%3+%1*2]
movaps xmm0, [%3+%1*2+0x10] movaps xmm0, [%3+%1*2+0x10]
CMUL %1, xmm0, xmm1, %3, %4, %5 CMUL %1, xmm0, xmm1, %3, %4, %5
...@@ -924,25 +931,9 @@ INIT_XMM sse ...@@ -924,25 +931,9 @@ INIT_XMM sse
sub %2, 0x10 sub %2, 0x10
add %1, 0x10 add %1, 0x10
jl .post jl .post
%endmacro %elif cpuflag(3dnow)
CMUL %3, %1, m0, m1, %4, %5
%macro CMUL_3DNOW 6 CMUL %3, %2, m2, m3, %4, %5
mova m6, [%1+%2*2]
mova %3, [%1+%2*2+8]
mova %4, m6
mova m7, %3
pfmul m6, [%5+%2]
pfmul %3, [%6+%2]
pfmul %4, [%6+%2]
pfmul m7, [%5+%2]
pfsub %3, m6
pfadd %4, m7
%endmacro
%macro POSROTATESHUF_3DNOW 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post:
CMUL_3DNOW %3, %1, m0, m1, %4, %5
CMUL_3DNOW %3, %2, m2, m3, %4, %5
movd [%3+%1*2+ 0], m0 movd [%3+%1*2+ 0], m0
movd [%3+%2*2+12], m1 movd [%3+%2*2+12], m1
movd [%3+%2*2+ 0], m2 movd [%3+%2*2+ 0], m2
...@@ -958,9 +949,10 @@ INIT_XMM sse ...@@ -958,9 +949,10 @@ INIT_XMM sse
sub %2, 8 sub %2, 8
add %1, 8 add %1, 8
jl .post jl .post
%endif
%endmacro %endmacro
%macro DECL_IMDCT 1 %macro DECL_IMDCT 0
cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input
%if ARCH_X86_64 %if ARCH_X86_64
%define rrevtab r7 %define rrevtab r7
...@@ -1066,7 +1058,7 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *i ...@@ -1066,7 +1058,7 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *i
neg r0 neg r0
mov r1, -mmsize mov r1, -mmsize
sub r1, r0 sub r1, r0
%1 r0, r1, r6, rtcos, rtsin POSROTATESHUF r0, r1, r6, rtcos, rtsin
%if ARCH_X86_64 == 0 %if ARCH_X86_64 == 0
add esp, 12 add esp, 12
%endif %endif
...@@ -1076,18 +1068,18 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *i ...@@ -1076,18 +1068,18 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *i
RET RET
%endmacro %endmacro
DECL_IMDCT POSROTATESHUF DECL_IMDCT
%if ARCH_X86_32 %if ARCH_X86_32
INIT_MMX 3dnow INIT_MMX 3dnow
DECL_IMDCT POSROTATESHUF_3DNOW DECL_IMDCT
INIT_MMX 3dnowext INIT_MMX 3dnowext
DECL_IMDCT POSROTATESHUF_3DNOW DECL_IMDCT
%endif %endif
INIT_YMM avx INIT_YMM avx
%if HAVE_AVX_EXTERNAL %if HAVE_AVX_EXTERNAL
DECL_IMDCT POSROTATESHUF_AVX DECL_IMDCT
%endif %endif
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment