Commit 5d0ddd1a authored by Loren Merritt

split-radix FFT

The C implementation is 1.9x faster than the previous C implementation (on various x86 CPUs); the SSE implementation is 1.6x faster than the previous SSE implementation.

Originally committed as revision 14698 to svn://svn.ffmpeg.org/ffmpeg/trunk
parent bafad220
......@@ -388,6 +388,8 @@ OBJS += i386/fdct_mmx.o \
i386/simple_idct_mmx.o \
i386/idct_mmx_xvid.o \
i386/idct_sse2_xvid.o \
OBJS-$(HAVE_YASM) += i386/fft_mmx.o \
i386/fft_sse.o \
i386/fft_3dn.o \
i386/fft_3dn2.o \
......
......@@ -639,6 +639,8 @@ typedef struct FFTContext {
uint16_t *revtab;
FFTComplex *exptab;
FFTComplex *exptab1; /* only used by SSE code */
FFTComplex *tmp_buf;
void (*fft_permute)(struct FFTContext *s, FFTComplex *z);
void (*fft_calc)(struct FFTContext *s, FFTComplex *z);
void (*imdct_calc)(struct MDCTContext *s, FFTSample *output,
const FFTSample *input, FFTSample *tmp);
......@@ -647,13 +649,18 @@ typedef struct FFTContext {
} FFTContext;
int ff_fft_init(FFTContext *s, int nbits, int inverse);
void ff_fft_permute(FFTContext *s, FFTComplex *z);
void ff_fft_permute_c(FFTContext *s, FFTComplex *z);
void ff_fft_permute_sse(FFTContext *s, FFTComplex *z);
void ff_fft_calc_c(FFTContext *s, FFTComplex *z);
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
void ff_fft_calc_3dn(FFTContext *s, FFTComplex *z);
void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z);
void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z);
/**
 * Run the permutation routine installed in the context.
 *
 * Forwards to the s->fft_permute function pointer, handing it the context
 * itself and the array z.
 */
static inline void ff_fft_permute(FFTContext *s, FFTComplex *z)
{
    (*s->fft_permute)(s, z);
}
static inline void ff_fft_calc(FFTContext *s, FFTComplex *z)
{
s->fft_calc(s, z);
......
This diff is collapsed.
/*
* FFT/MDCT transform with 3DNow! optimizations
* Copyright (c) 2006 Zuxy MENG Jie, Loren Merritt
* Based on fft_sse.c copyright (c) 2002 Fabrice Bellard.
* Copyright (c) 2008 Loren Merritt
*
* This file is part of FFmpeg.
*
......@@ -20,109 +19,5 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
/*
 * Sign masks for a pair of 32-bit lanes, 8-byte aligned so that one of them
 * can be loaded into an MMX register with a single movq.  p1m1 has bit 31
 * set in element 1 only, m1p1 in element 0 only (p = keep sign, m = flip).
 *
 * The bit is written as 1U << 31: shifting a signed 1 into the sign bit of
 * a 32-bit int is undefined behaviour, whereas the unsigned shift is well
 * defined and converts to the same bit pattern.
 */
static const int p1m1[2] __attribute__((aligned(8))) =
    { 0, 1U << 31 };
static const int m1p1[2] __attribute__((aligned(8))) =
    { 1U << 31, 0 };
/**
 * Compute the FFT of z in place using 3DNow! inline assembly.
 *
 * The transform covers 1 << s->nbits values.  s->inverse selects the sign
 * mask used by the first stage and s->exptab1 supplies the complex
 * coefficients used by the remaining stages.
 *
 * NOTE(review): the asm addresses z in 8-byte units, so FFTComplex is
 * presumably a pair of 32-bit floats -- confirm against its declaration.
 * NOTE(review): none of the asm statements list clobbers (mm registers,
 * "memory") although they overwrite z -- confirm this is intentional.
 */
void ff_fft_calc_3dn(FFTContext *s, FFTComplex *z)
{
int ln = s->nbits;
long j;
x86_reg i;
long nblocks, nloops;
FFTComplex *p, *cptr;
/* Issue femms, then load the first-stage sign mask into mm7:
 * m1p1 when s->inverse is set, p1m1 otherwise. */
asm volatile(
/* FEMMS is not a must here but recommended by AMD */
"femms \n\t"
"movq %0, %%mm7 \n\t"
::"m"(*(s->inverse ? m1p1 : p1m1))
);
/* i is a byte offset that counts down from 8 * (1 << ln) to 0; every
 * iteration of the loop below handles one 32-byte group of z. */
i = 8 << ln;
asm volatile(
"1: \n\t"
"sub $32, %0 \n\t"
/* load the four 8-byte values of the group (offsets 0, 16, 8, 24) */
"movq (%0,%1), %%mm0 \n\t"
"movq 16(%0,%1), %%mm1 \n\t"
"movq 8(%0,%1), %%mm2 \n\t"
"movq 24(%0,%1), %%mm3 \n\t"
"movq %%mm0, %%mm4 \n\t"
"movq %%mm1, %%mm5 \n\t"
/* mm0 = v0 + v1, mm1 = v2 + v3, mm4 = v0 - v1, mm5 = v2 - v3 */
"pfadd %%mm2, %%mm0 \n\t"
"pfadd %%mm3, %%mm1 \n\t"
"pfsub %%mm2, %%mm4 \n\t"
"pfsub %%mm3, %%mm5 \n\t"
"movq %%mm0, %%mm2 \n\t"
/* exchange the two 32-bit halves of mm5, going through mm6 */
"punpckldq %%mm5, %%mm6 \n\t"
"punpckhdq %%mm6, %%mm5 \n\t"
"movq %%mm4, %%mm3 \n\t"
/* toggle bit 31 of the lane selected by the mm7 mask */
"pxor %%mm7, %%mm5 \n\t"
/* second set of sums and differences, combining the two pairs */
"pfadd %%mm1, %%mm0 \n\t"
"pfadd %%mm5, %%mm4 \n\t"
"pfsub %%mm1, %%mm2 \n\t"
"pfsub %%mm5, %%mm3 \n\t"
/* write the four results back over the inputs */
"movq %%mm0, (%0,%1) \n\t"
"movq %%mm4, 8(%0,%1) \n\t"
"movq %%mm2, 16(%0,%1) \n\t"
"movq %%mm3, 24(%0,%1) \n\t"
/* repeat while the offset left by the sub above is still positive */
"jg 1b \n\t"
:"+r"(i)
:"r"(z)
);
/* pass 2 .. ln-1 */
/* NOTE(review): ln-3 is negative when s->nbits < 3; presumably such sizes
 * never reach this function -- confirm. */
nblocks = 1 << (ln-3);
nloops = 1 << 2;
cptr = s->exptab1;
do {
p = z;
j = nblocks;
do {
/* byte offset counting down over one half-block of nloops values */
i = nloops*8;
/* %1 = first half (p), %2 = second half (p + nloops), %3 = coefficients;
 * 16 bytes of each half are handled per iteration and the coefficient
 * table is indexed at twice the data offset. */
asm volatile(
"1: \n\t"
"sub $16, %0 \n\t"
"movq (%1,%0), %%mm0 \n\t"
"movq 8(%1,%0), %%mm1 \n\t"
"movq (%2,%0), %%mm2 \n\t"
"movq 8(%2,%0), %%mm3 \n\t"
"movq %%mm2, %%mm4 \n\t"
"movq %%mm3, %%mm5 \n\t"
/* mm2/mm3 = low 32 bits duplicated, mm4/mm5 = high 32 bits duplicated */
"punpckldq %%mm2, %%mm2 \n\t"
"punpckldq %%mm3, %%mm3 \n\t"
"punpckhdq %%mm4, %%mm4 \n\t"
"punpckhdq %%mm5, %%mm5 \n\t"
"pfmul (%3,%0,2), %%mm2 \n\t" // cre*re cim*re
"pfmul 8(%3,%0,2), %%mm3 \n\t"
"pfmul 16(%3,%0,2), %%mm4 \n\t" // -cim*im cre*im
"pfmul 24(%3,%0,2), %%mm5 \n\t"
"pfadd %%mm2, %%mm4 \n\t" // cre*re-cim*im cim*re+cre*im
"pfadd %%mm3, %%mm5 \n\t"
"movq %%mm0, %%mm2 \n\t"
"movq %%mm1, %%mm3 \n\t"
/* first half + product goes back to %1, first half - product to %2 */
"pfadd %%mm4, %%mm0 \n\t"
"pfadd %%mm5, %%mm1 \n\t"
"pfsub %%mm4, %%mm2 \n\t"
"pfsub %%mm5, %%mm3 \n\t"
"movq %%mm0, (%1,%0) \n\t"
"movq %%mm1, 8(%1,%0) \n\t"
"movq %%mm2, (%2,%0) \n\t"
"movq %%mm3, 8(%2,%0) \n\t"
"jg 1b \n\t"
:"+r"(i)
:"r"(p), "r"(p + nloops), "r"(cptr)
);
/* advance to the next block of 2*nloops values */
p += nloops*2;
} while (--j);
/* step past the coefficients used by this pass */
cptr += nloops*2;
/* each pass halves the number of blocks and doubles their size */
nblocks >>= 1;
nloops <<= 1;
} while (nblocks != 0);
/* femms once more before returning */
asm volatile("femms");
}
#define EMULATE_3DNOWEXT
#include "fft_3dn2.c"
......@@ -23,105 +23,26 @@
#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
/*
 * Sign mask for a pair of 32-bit lanes with bit 31 set in element 1 only;
 * 8-byte aligned so it can be loaded with a single movq.  Written as
 * 1U << 31 because shifting a signed 1 into the sign bit of a 32-bit int is
 * undefined behaviour.
 */
static const int p1m1[2] __attribute__((aligned(8))) =
    { 0, 1U << 31 };

/*
 * When the including file defines EMULATE_3DNOWEXT, every *_3dn2 entry
 * point named in this file is renamed to its *_3dn counterpart.
 */
#ifdef EMULATE_3DNOWEXT
#define ff_fft_calc_3dn2 ff_fft_calc_3dn
#define ff_fft_dispatch_3dn2 ff_fft_dispatch_3dn
#define ff_fft_dispatch_interleave_3dn2 ff_fft_dispatch_interleave_3dn
#define ff_imdct_calc_3dn2 ff_imdct_calc_3dn
#define ff_imdct_half_3dn2 ff_imdct_half_3dn
#endif

/* Counterpart of p1m1 with bit 31 set in element 0 only. */
static const int m1p1[2] __attribute__((aligned(8))) =
    { 1U << 31, 0 };
void ff_fft_dispatch_3dn2(FFTComplex *z, int nbits);
void ff_fft_dispatch_interleave_3dn2(FFTComplex *z, int nbits);
/**
 * Compute the FFT of z in place for a transform of 1 << s->nbits values,
 * using extended 3DNow! instructions (pswapd, pfpnacc).
 *
 * NOTE(review): this listing comes from a commit diff rendered without +/-
 * markers.  The statements from the first declarations down to the end of
 * the "pass 2 .. ln-1" loops appear to be the pre-commit body, and the short
 * tail that starts at the declaration of n appears to be the post-commit
 * body (i is declared twice) -- confirm against the repository before
 * editing either part.
 */
void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z)
{
int ln = s->nbits;
long j;
x86_reg i;
long nblocks, nloops;
FFTComplex *p, *cptr;
/* Issue femms, then load the first-stage sign mask into mm7:
 * m1p1 when s->inverse is set, p1m1 otherwise. */
asm volatile(
/* FEMMS is not a must here but recommended by AMD */
"femms \n\t"
"movq %0, %%mm7 \n\t"
::"m"(*(s->inverse ? m1p1 : p1m1))
);
/* i is a byte offset that counts down from 8 * (1 << ln) to 0; every
 * iteration of the loop below handles one 32-byte group of z. */
i = 8 << ln;
asm volatile(
"1: \n\t"
"sub $32, %0 \n\t"
/* load the four 8-byte values of the group (offsets 0, 16, 8, 24) */
"movq (%0,%1), %%mm0 \n\t"
"movq 16(%0,%1), %%mm1 \n\t"
"movq 8(%0,%1), %%mm2 \n\t"
"movq 24(%0,%1), %%mm3 \n\t"
"movq %%mm0, %%mm4 \n\t"
"movq %%mm1, %%mm5 \n\t"
/* mm0 = v0 + v1, mm1 = v2 + v3, mm4 = v0 - v1, mm5 = v2 - v3 */
"pfadd %%mm2, %%mm0 \n\t"
"pfadd %%mm3, %%mm1 \n\t"
"pfsub %%mm2, %%mm4 \n\t"
"pfsub %%mm3, %%mm5 \n\t"
"movq %%mm0, %%mm2 \n\t"
/* exchange the two 32-bit halves of mm5 */
"pswapd %%mm5, %%mm5 \n\t"
"movq %%mm4, %%mm3 \n\t"
/* toggle bit 31 of the lane selected by the mm7 mask */
"pxor %%mm7, %%mm5 \n\t"
/* second set of sums and differences, combining the two pairs */
"pfadd %%mm1, %%mm0 \n\t"
"pfadd %%mm5, %%mm4 \n\t"
"pfsub %%mm1, %%mm2 \n\t"
"pfsub %%mm5, %%mm3 \n\t"
/* write the four results back over the inputs */
"movq %%mm0, (%0,%1) \n\t"
"movq %%mm4, 8(%0,%1) \n\t"
"movq %%mm2, 16(%0,%1) \n\t"
"movq %%mm3, 24(%0,%1) \n\t"
/* repeat while the offset left by the sub above is still positive */
"jg 1b \n\t"
:"+r"(i)
:"r"(z)
);
/* pass 2 .. ln-1 */
/* NOTE(review): ln-3 is negative when s->nbits < 3; presumably such sizes
 * never reach this code -- confirm. */
nblocks = 1 << (ln-3);
nloops = 1 << 2;
cptr = s->exptab1;
do {
p = z;
j = nblocks;
do {
/* byte offset counting down over one half-block of nloops values */
i = nloops*8;
/* %1 = first half (p), %2 = second half (p + nloops), %3 = coefficients,
 * indexed at twice the data offset; mm7 is reused as scratch here. */
asm volatile(
"1: \n\t"
"sub $16, %0 \n\t"
"movq (%1,%0), %%mm0 \n\t"
"movq 8(%1,%0), %%mm1 \n\t"
"movq (%2,%0), %%mm2 \n\t"
"movq 8(%2,%0), %%mm3 \n\t"
"movq (%3,%0,2), %%mm4 \n\t"
"movq 8(%3,%0,2), %%mm5 \n\t"
"pswapd %%mm4, %%mm6 \n\t" // no need for cptr[2] & cptr[3]
"pswapd %%mm5, %%mm7 \n\t"
"pfmul %%mm2, %%mm4 \n\t" // cre*re cim*im
"pfmul %%mm3, %%mm5 \n\t"
"pfmul %%mm2, %%mm6 \n\t" // cim*re cre*im
"pfmul %%mm3, %%mm7 \n\t"
"pfpnacc %%mm6, %%mm4 \n\t" // cre*re-cim*im cim*re+cre*im
"pfpnacc %%mm7, %%mm5 \n\t"
"movq %%mm0, %%mm2 \n\t"
"movq %%mm1, %%mm3 \n\t"
/* first half + product goes back to %1, first half - product to %2 */
"pfadd %%mm4, %%mm0 \n\t"
"pfadd %%mm5, %%mm1 \n\t"
"pfsub %%mm4, %%mm2 \n\t"
"pfsub %%mm5, %%mm3 \n\t"
"movq %%mm0, (%1,%0) \n\t"
"movq %%mm1, 8(%1,%0) \n\t"
"movq %%mm2, (%2,%0) \n\t"
"movq %%mm3, 8(%2,%0) \n\t"
"jg 1b \n\t"
:"+r"(i)
:"r"(p), "r"(p + nloops), "r"(cptr)
);
/* advance to the next block of 2*nloops values */
p += nloops*2;
} while (--j);
/* step past the coefficients used by this pass */
cptr += nloops*2;
/* each pass halves the number of blocks and doubles their size */
nblocks >>= 1;
nloops <<= 1;
} while (nblocks != 0);
/* NOTE(review): what follows appears to be the post-commit body. */
/* number of values in the transform */
int n = 1<<s->nbits;
int i;
/* the transform is carried out by the dispatch routine declared above */
ff_fft_dispatch_interleave_3dn2(z, s->nbits);
asm volatile("femms");
/* for transforms of at most 8 values, exchange z[i].im with z[i+1].re for
 * every even i.  NOTE(review): presumably the dispatch routine leaves small
 * transforms in a different layout than large ones -- confirm. */
if(n <= 8)
for(i=0; i<n; i+=2)
FFSWAP(FFTSample, z[i].im, z[i+1].re);
}
static void imdct_3dn2(MDCTContext *s, const FFTSample *input, FFTSample *tmp)
......@@ -162,7 +83,7 @@ static void imdct_3dn2(MDCTContext *s, const FFTSample *input, FFTSample *tmp)
);
}
ff_fft_calc(&s->fft, z);
ff_fft_calc_3dn2(&s->fft, z);
/* post rotation + reordering */
for(k = 0; k < n4; k++) {
......
This diff is collapsed.
......@@ -22,124 +22,55 @@
#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
/*
 * Sign masks for four 32-bit lanes, 16-byte aligned.  A lane with bit 31 set
 * has its sign flipped when the table is XORed into a vector of floats; the
 * names spell the pattern lane by lane (p1 = unchanged, m1 = flipped).
 *
 * The bit is written as 1U << 31: shifting a signed 1 into the sign bit of
 * a 32-bit int is undefined behaviour, whereas the unsigned shift is well
 * defined and converts to the same bit pattern.
 */
static const int p1p1p1m1[4] __attribute__((aligned(16))) =
    { 0, 0, 0, 1U << 31 };
static const int p1p1m1p1[4] __attribute__((aligned(16))) =
    { 0, 0, 1U << 31, 0 };
static const int p1p1m1m1[4] __attribute__((aligned(16))) =
    { 0, 0, 1U << 31, 1U << 31 };
static const int p1m1p1m1[4] __attribute__((aligned(16))) =
    { 0, 1U << 31, 0, 1U << 31 };
static const int m1m1m1m1[4] __attribute__((aligned(16))) =
    { 1U << 31, 1U << 31, 1U << 31, 1U << 31 };
#if 0
/* Debug helper: print the four floats held in a, prefixed by the label str. */
static void print_v4sf(const char *str, __m128 a)
{
    const float *lane = (const float *)&a;

    printf("%s: %f %f %f %f\n", str, lane[0], lane[1], lane[2], lane[3]);
}
#endif
void ff_fft_dispatch_sse(FFTComplex *z, int nbits);
void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits);
/* XXX: handle reverse case */
/**
 * Compute the FFT of z in place for a transform of 1 << s->nbits values,
 * using SSE inline assembly.
 *
 * NOTE(review): this listing comes from a commit diff rendered without +/-
 * markers, so two versions of the body are interleaved.  The declaration of
 * n, the call to ff_fft_dispatch_interleave_sse() and the "if(n <= 16)"
 * block appear to belong to the post-commit body; the register setup, the
 * pass 0/1 loop and the "pass 2 .. ln-1" loops appear to be the pre-commit
 * body, whose tail follows a stray closing brace -- confirm against the
 * repository before editing either part.
 */
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
{
int ln = s->nbits;
x86_reg i;
long j;
long nblocks, nloops;
FFTComplex *p, *cptr;
/* number of values in the transform */
int n = 1 << s->nbits;
/* xmm4 = p1p1m1m1; xmm5 = p1p1m1p1 when s->inverse is set, p1p1p1m1
 * otherwise.  Both stay live for the pass 0/1 loop below. */
asm volatile(
"movaps %0, %%xmm4 \n\t"
"movaps %1, %%xmm5 \n\t"
::"m"(*p1p1m1m1),
"m"(*(s->inverse ? p1p1m1p1 : p1p1p1m1))
);
/* the transform is carried out by the dispatch routine declared above */
ff_fft_dispatch_interleave_sse(z, s->nbits);
/* i is a byte offset that counts down from 8 * (1 << ln) to 0; every
 * iteration of the loop below handles one 32-byte group of z. */
i = 8 << ln;
asm volatile(
"1: \n\t"
"sub $32, %0 \n\t"
/* do the pass 0 butterfly */
"movaps (%0,%1), %%xmm0 \n\t"
"movaps %%xmm0, %%xmm1 \n\t"
"shufps $0x4E, %%xmm0, %%xmm0 \n\t"
"xorps %%xmm4, %%xmm1 \n\t"
"addps %%xmm1, %%xmm0 \n\t"
"movaps 16(%0,%1), %%xmm2 \n\t"
"movaps %%xmm2, %%xmm3 \n\t"
"shufps $0x4E, %%xmm2, %%xmm2 \n\t"
"xorps %%xmm4, %%xmm3 \n\t"
"addps %%xmm3, %%xmm2 \n\t"
/* multiply third by -i */
/* by toggling the sign bit */
"shufps $0xB4, %%xmm2, %%xmm2 \n\t"
"xorps %%xmm5, %%xmm2 \n\t"
/* do the pass 1 butterfly */
"movaps %%xmm0, %%xmm1 \n\t"
"addps %%xmm2, %%xmm0 \n\t"
"subps %%xmm2, %%xmm1 \n\t"
"movaps %%xmm0, (%0,%1) \n\t"
"movaps %%xmm1, 16(%0,%1) \n\t"
/* repeat while the offset left by the sub above is still positive */
"jg 1b \n\t"
:"+r"(i)
:"r"(z)
);
/* pass 2 .. ln-1 */
/* For transforms of at most 16 values: walk z from start to end in 32-byte
 * steps (negative offset from z+n counting up to 0) and interleave each
 * pair of 16-byte vectors with unpcklps/unpckhps. */
if(n <= 16) {
x86_reg i = -8*n;
asm volatile(
"1: \n"
"movaps (%0,%1), %%xmm0 \n"
"movaps %%xmm0, %%xmm1 \n"
"unpcklps 16(%0,%1), %%xmm0 \n"
"unpckhps 16(%0,%1), %%xmm1 \n"
"movaps %%xmm0, (%0,%1) \n"
"movaps %%xmm1, 16(%0,%1) \n"
"add $32, %0 \n"
"jl 1b \n"
:"+r"(i)
:"r"(z+n)
:"memory"
);
}
}
/* NOTE(review): the statements below sit after a closing brace and appear
 * to be the remainder of the pre-commit body. */
/* NOTE(review): ln-3 is negative when s->nbits < 3; presumably such sizes
 * never reach this code -- confirm. */
nblocks = 1 << (ln-3);
nloops = 1 << 2;
cptr = s->exptab1;
do {
p = z;
j = nblocks;
do {
/* byte offset counting down over one half-block of nloops values */
i = nloops*8;
/* %1 = first half (p), %2 = second half (p + nloops), %3 = coefficients,
 * indexed at twice the data offset; 32 bytes of each half per iteration. */
asm volatile(
"1: \n\t"
"sub $32, %0 \n\t"
"movaps (%2,%0), %%xmm1 \n\t"
"movaps (%1,%0), %%xmm0 \n\t"
"movaps 16(%2,%0), %%xmm5 \n\t"
"movaps 16(%1,%0), %%xmm4 \n\t"
"movaps %%xmm1, %%xmm2 \n\t"
"movaps %%xmm5, %%xmm6 \n\t"
/* 0xA0 duplicates lanes 0 and 2, 0xF5 duplicates lanes 1 and 3 */
"shufps $0xA0, %%xmm1, %%xmm1 \n\t"
"shufps $0xF5, %%xmm2, %%xmm2 \n\t"
"shufps $0xA0, %%xmm5, %%xmm5 \n\t"
"shufps $0xF5, %%xmm6, %%xmm6 \n\t"
"mulps (%3,%0,2), %%xmm1 \n\t" // cre*re cim*re
"mulps 16(%3,%0,2), %%xmm2 \n\t" // -cim*im cre*im
"mulps 32(%3,%0,2), %%xmm5 \n\t" // cre*re cim*re
"mulps 48(%3,%0,2), %%xmm6 \n\t" // -cim*im cre*im
"addps %%xmm2, %%xmm1 \n\t"
"addps %%xmm6, %%xmm5 \n\t"
"movaps %%xmm0, %%xmm3 \n\t"
"movaps %%xmm4, %%xmm7 \n\t"
/* first half + product goes back to %1, first half - product to %2 */
"addps %%xmm1, %%xmm0 \n\t"
"subps %%xmm1, %%xmm3 \n\t"
"addps %%xmm5, %%xmm4 \n\t"
"subps %%xmm5, %%xmm7 \n\t"
"movaps %%xmm0, (%1,%0) \n\t"
"movaps %%xmm3, (%2,%0) \n\t"
"movaps %%xmm4, 16(%1,%0) \n\t"
"movaps %%xmm7, 16(%2,%0) \n\t"
"jg 1b \n\t"
:"+r"(i)
:"r"(p), "r"(p + nloops), "r"(cptr)
);
/* advance to the next block of 2*nloops values */
p += nloops*2;
} while (--j);
/* step past the coefficients used by this pass */
cptr += nloops*2;
/* each pass halves the number of blocks and doubles their size */
nblocks >>= 1;
nloops <<= 1;
} while (nblocks != 0);
/**
 * Reorder the 1 << s->nbits entries of z according to s->revtab.
 *
 * Entry i of z is written to s->tmp_buf[s->revtab[i]]; once every entry has
 * been placed, tmp_buf is copied back over z.
 *
 * NOTE(review): s->tmp_buf must hold at least n entries and z is read with
 * movaps, so both are presumably allocated and 16-byte aligned by the
 * caller or by ff_fft_init() -- confirm.
 */
void ff_fft_permute_sse(FFTContext *s, FFTComplex *z)
{
int n = 1 << s->nbits;
int i;
/* two entries per iteration: one 16-byte load covers z[i] and z[i+1] */
for(i=0; i<n; i+=2) {
/* low 8 bytes go to tmp_buf[revtab[i]], high 8 bytes to
 * tmp_buf[revtab[i+1]].
 * NOTE(review): the input operand names only z[i] although 16 bytes are
 * read, and xmm0 is not listed as clobbered -- confirm this is safe. */
asm volatile(
"movaps %2, %%xmm0 \n"
"movlps %%xmm0, %0 \n"
"movhps %%xmm0, %1 \n"
:"=m"(s->tmp_buf[s->revtab[i]]),
"=m"(s->tmp_buf[s->revtab[i+1]])
:"m"(z[i])
);
}
/* copy the permuted data back into the caller's array */
memcpy(z, s->tmp_buf, n*sizeof(FFTComplex));
}
static void imdct_sse(MDCTContext *s, const FFTSample *input, FFTSample *tmp)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment