Commit 9d35fa52 authored by Vitor Sessak, committed by Reinhard Tartler

Add AVX FFT implementation.

Signed-off-by: Reinhard Tartler <siretart@tauware.de>
parent 13dfce3d
......@@ -5,7 +5,7 @@ releases are sorted from youngest to oldest.
version <next>:
- Lots of deprecated API cruft removed
- fft and imdct optimizations for AVX (Sandy Bridge) processors
version 0.7_beta1:
......
......@@ -223,9 +223,9 @@ typedef struct {
float sf[120]; ///< scalefactors
int sf_idx[128]; ///< scalefactor indices (used by encoder)
uint8_t zeroes[128]; ///< band is not coded (used by encoder)
DECLARE_ALIGNED(16, float, coeffs)[1024]; ///< coefficients for IMDCT
DECLARE_ALIGNED(16, float, saved)[1024]; ///< overlap
DECLARE_ALIGNED(16, float, ret)[2048]; ///< PCM output
DECLARE_ALIGNED(32, float, coeffs)[1024]; ///< coefficients for IMDCT
DECLARE_ALIGNED(32, float, saved)[1024]; ///< overlap
DECLARE_ALIGNED(32, float, ret)[2048]; ///< PCM output
DECLARE_ALIGNED(16, int16_t, ltp_state)[3072]; ///< time signal for LTP
PredictorState predictor_state[MAX_PREDICTORS];
} SingleChannelElement;
......@@ -272,7 +272,7 @@ typedef struct {
* @defgroup temporary aligned temporary buffers (We do not want to have these on the stack.)
* @{
*/
DECLARE_ALIGNED(16, float, buf_mdct)[1024];
DECLARE_ALIGNED(32, float, buf_mdct)[1024];
/** @} */
/**
......@@ -296,7 +296,7 @@ typedef struct {
int sf_offset; ///< offset into pow2sf_tab as appropriate for dsp.float_to_int16
/** @} */
DECLARE_ALIGNED(16, float, temp)[128];
DECLARE_ALIGNED(32, float, temp)[128];
enum OCStatus output_configured;
} AACContext;
......
......@@ -64,7 +64,7 @@ typedef struct AACEncContext {
int last_frame;
float lambda;
DECLARE_ALIGNED(16, int, qcoefs)[96]; ///< quantized coefficients
DECLARE_ALIGNED(16, float, scoefs)[1024]; ///< scaled coefficients
DECLARE_ALIGNED(32, float, scoefs)[1024]; ///< scaled coefficients
} AACEncContext;
#endif /* AVCODEC_AACENC_H */
......@@ -200,11 +200,11 @@ typedef struct {
///@defgroup arrays aligned arrays
DECLARE_ALIGNED(16, int, fixed_coeffs)[AC3_MAX_CHANNELS][AC3_MAX_COEFS]; ///> fixed-point transform coefficients
DECLARE_ALIGNED(16, float, transform_coeffs)[AC3_MAX_CHANNELS][AC3_MAX_COEFS]; ///< transform coefficients
DECLARE_ALIGNED(16, float, delay)[AC3_MAX_CHANNELS][AC3_BLOCK_SIZE]; ///< delay - added to the next block
DECLARE_ALIGNED(16, float, window)[AC3_BLOCK_SIZE]; ///< window coefficients
DECLARE_ALIGNED(16, float, tmp_output)[AC3_BLOCK_SIZE]; ///< temporary storage for output before windowing
DECLARE_ALIGNED(16, float, output)[AC3_MAX_CHANNELS][AC3_BLOCK_SIZE]; ///< output after imdct transform and windowing
DECLARE_ALIGNED(32, float, transform_coeffs)[AC3_MAX_CHANNELS][AC3_MAX_COEFS]; ///< transform coefficients
DECLARE_ALIGNED(32, float, delay)[AC3_MAX_CHANNELS][AC3_BLOCK_SIZE]; ///< delay - added to the next block
DECLARE_ALIGNED(32, float, window)[AC3_BLOCK_SIZE]; ///< window coefficients
DECLARE_ALIGNED(32, float, tmp_output)[AC3_BLOCK_SIZE]; ///< temporary storage for output before windowing
DECLARE_ALIGNED(32, float, output)[AC3_MAX_CHANNELS][AC3_BLOCK_SIZE]; ///< output after imdct transform and windowing
///@}
} AC3DecodeContext;
......
......@@ -201,7 +201,7 @@ typedef struct AC3EncodeContext {
uint8_t exp_strategy[AC3_MAX_CHANNELS][AC3_MAX_BLOCKS]; ///< exponent strategies
DECLARE_ALIGNED(16, SampleType, windowed_samples)[AC3_WINDOW_SIZE];
DECLARE_ALIGNED(32, SampleType, windowed_samples)[AC3_WINDOW_SIZE];
} AC3EncodeContext;
typedef struct AC3Mant {
......
......@@ -60,11 +60,11 @@ typedef struct {
int log2_block_count[AT1_QMF_BANDS]; ///< log2 number of blocks in a band
int num_bfus; ///< number of Block Floating Units
float* spectrum[2];
DECLARE_ALIGNED(16, float, spec1)[AT1_SU_SAMPLES]; ///< mdct buffer
DECLARE_ALIGNED(16, float, spec2)[AT1_SU_SAMPLES]; ///< mdct buffer
DECLARE_ALIGNED(16, float, fst_qmf_delay)[46]; ///< delay line for the 1st stacked QMF filter
DECLARE_ALIGNED(16, float, snd_qmf_delay)[46]; ///< delay line for the 2nd stacked QMF filter
DECLARE_ALIGNED(16, float, last_qmf_delay)[256+23]; ///< delay line for the last stacked QMF filter
DECLARE_ALIGNED(32, float, spec1)[AT1_SU_SAMPLES]; ///< mdct buffer
DECLARE_ALIGNED(32, float, spec2)[AT1_SU_SAMPLES]; ///< mdct buffer
DECLARE_ALIGNED(32, float, fst_qmf_delay)[46]; ///< delay line for the 1st stacked QMF filter
DECLARE_ALIGNED(32, float, snd_qmf_delay)[46]; ///< delay line for the 2nd stacked QMF filter
DECLARE_ALIGNED(32, float, last_qmf_delay)[256+23]; ///< delay line for the last stacked QMF filter
} AT1SUCtx;
/**
......@@ -72,13 +72,13 @@ typedef struct {
*/
typedef struct {
AT1SUCtx SUs[AT1_MAX_CHANNELS]; ///< channel sound unit
DECLARE_ALIGNED(16, float, spec)[AT1_SU_SAMPLES]; ///< the mdct spectrum buffer
DECLARE_ALIGNED(32, float, spec)[AT1_SU_SAMPLES]; ///< the mdct spectrum buffer
DECLARE_ALIGNED(16, float, low)[256];
DECLARE_ALIGNED(16, float, mid)[256];
DECLARE_ALIGNED(16, float, high)[512];
DECLARE_ALIGNED(32, float, low)[256];
DECLARE_ALIGNED(32, float, mid)[256];
DECLARE_ALIGNED(32, float, high)[512];
float* bands[3];
DECLARE_ALIGNED(16, float, out_samples)[AT1_MAX_CHANNELS][AT1_SU_SAMPLES];
DECLARE_ALIGNED(32, float, out_samples)[AT1_MAX_CHANNELS][AT1_SU_SAMPLES];
FFTContext mdct_ctx[3];
int channels;
DSPContext dsp;
......
......@@ -74,8 +74,8 @@ typedef struct {
int gcBlkSwitch;
gain_block gainBlock[2];
DECLARE_ALIGNED(16, float, spectrum)[1024];
DECLARE_ALIGNED(16, float, IMDCT_buf)[1024];
DECLARE_ALIGNED(32, float, spectrum)[1024];
DECLARE_ALIGNED(32, float, IMDCT_buf)[1024];
float delayBuf1[46]; ///<qmf delay buffers
float delayBuf2[46];
......@@ -122,7 +122,7 @@ typedef struct {
FFTContext mdct_ctx;
} ATRAC3Context;
static DECLARE_ALIGNED(16, float,mdct_window)[512];
static DECLARE_ALIGNED(32, float, mdct_window)[512];
static VLC spectral_coeff_tab[7];
static float gain_tab1[16];
static float gain_tab2[31];
......
......@@ -55,7 +55,7 @@ typedef struct {
int num_bands;
unsigned int *bands;
float root;
DECLARE_ALIGNED(16, FFTSample, coeffs)[BINK_BLOCK_MAX_SIZE];
DECLARE_ALIGNED(32, FFTSample, coeffs)[BINK_BLOCK_MAX_SIZE];
DECLARE_ALIGNED(16, short, previous)[BINK_BLOCK_MAX_SIZE / 16]; ///< coeffs from previous audio block
float *coeffs_ptr[MAX_CHANNELS]; ///< pointers to the coeffs arrays for float_to_int16_interleave
union {
......
......@@ -153,7 +153,7 @@ typedef struct cook {
/* data buffers */
uint8_t* decoded_bytes_buffer;
DECLARE_ALIGNED(16, float,mono_mdct_output)[2048];
DECLARE_ALIGNED(32, float, mono_mdct_output)[2048];
float decode_buffer_1[1024];
float decode_buffer_2[1024];
float decode_buffer_0[1060]; /* static allocation for joint decode */
......
......@@ -321,16 +321,16 @@ typedef struct {
/* Subband samples history (for ADPCM) */
float subband_samples_hist[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS][4];
DECLARE_ALIGNED(16, float, subband_fir_hist)[DCA_PRIM_CHANNELS_MAX][512];
DECLARE_ALIGNED(16, float, subband_fir_noidea)[DCA_PRIM_CHANNELS_MAX][32];
DECLARE_ALIGNED(32, float, subband_fir_hist)[DCA_PRIM_CHANNELS_MAX][512];
DECLARE_ALIGNED(32, float, subband_fir_noidea)[DCA_PRIM_CHANNELS_MAX][32];
int hist_index[DCA_PRIM_CHANNELS_MAX];
DECLARE_ALIGNED(16, float, raXin)[32];
DECLARE_ALIGNED(32, float, raXin)[32];
int output; ///< type of output
float scale_bias; ///< output scale
DECLARE_ALIGNED(16, float, subband_samples)[DCA_BLOCKS_MAX][DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS][8];
DECLARE_ALIGNED(16, float, samples)[(DCA_PRIM_CHANNELS_MAX+1)*256];
DECLARE_ALIGNED(32, float, subband_samples)[DCA_BLOCKS_MAX][DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS][8];
DECLARE_ALIGNED(32, float, samples)[(DCA_PRIM_CHANNELS_MAX+1)*256];
const float *samples_chanptr[DCA_PRIM_CHANNELS_MAX+1];
uint8_t dca_buffer[DCA_MAX_FRAME_SIZE + DCA_MAX_EXSS_HEADER_SIZE + DCA_BUFFER_PADDING_SIZE];
......
......@@ -93,6 +93,44 @@ av_cold void ff_init_ff_cos_tabs(int index)
#endif
}
/**
 * Reordering of the 16 positions inside one 16-element block.
 *
 * fft_perm_avx() stores i + avx_tab[k] into the reverse table for every
 * block (starting at index i) for which is_second_half_of_fft32() returns
 * nonzero; entry k is the destination offset of the k-th element of the block.
 */
static const int avx_tab[] = {
0, 4, 1, 5, 8, 12, 9, 13, 2, 6, 3, 7, 10, 14, 11, 15
};
/**
 * Tell whether index i ends up at offset 16 or above inside its terminal
 * sub-block once a transform of size n has been split repeatedly.
 *
 * While n is larger than 32 the range [0, n) is divided into one part of
 * size n/2 followed by two parts of size n/4; i is rebased to the start of
 * the part that contains it and n shrinks to that part's size.
 *
 * @param i index inside the transform, 0 <= i < n
 * @param n current transform size
 * @return nonzero if the rebased index is >= 16, zero otherwise
 */
static int is_second_half_of_fft32(int i, int n)
{
    while (n > 32) {
        const int half           = n / 2;
        const int three_quarters = 3 * n / 4;

        if (i < half) {
            /* first part: same origin, half the size */
            n = half;
        } else if (i < three_quarters) {
            /* second part: starts at n/2, a quarter of the size */
            i -= half;
            n /= 4;
        } else {
            /* third part: starts at 3n/4, a quarter of the size */
            i -= three_quarters;
            n /= 4;
        }
    }

    return i >= 16;
}
/**
 * Fill s->revtab with the permutation used for FF_FFT_PERM_AVX.
 *
 * The 1 << s->nbits indices are processed in blocks of 16. For each element
 * the slot in revtab is derived from split_radix_permutation(); the value
 * stored there depends on the block:
 *  - blocks for which is_second_half_of_fft32() is nonzero use the avx_tab
 *    reordering relative to the block start;
 *  - all other blocks keep the upper bits of the index and rearrange the
 *    low three bits.
 *
 * @param s FFT context; nbits and inverse are read, revtab is written
 */
static av_cold void fft_perm_avx(FFTContext *s)
{
    const int n    = 1 << s->nbits;
    const int mask = n - 1;
    int base, k;

    for (base = 0; base < n; base += 16) {
        /* the decision is taken once per 16-element block */
        const int second_half = is_second_half_of_fft32(base, n);

        for (k = 0; k < 16; k++) {
            const int src  = base + k;
            const int slot = -split_radix_permutation(src, n, s->inverse) & mask;
            int target;

            if (second_half)
                target = base + avx_tab[k];
            else
                target = (src & ~7) | ((src >> 1) & 3) | ((src << 2) & 4);

            s->revtab[slot] = target;
        }
    }
}
av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
{
int i, j, n;
......@@ -132,11 +170,16 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
for(j=4; j<=nbits; j++) {
ff_init_ff_cos_tabs(j);
}
for(i=0; i<n; i++) {
int j = i;
if (s->fft_permutation == FF_FFT_PERM_SWAP_LSBS)
j = (j&~3) | ((j>>1)&1) | ((j<<1)&2);
s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] = j;
if (s->fft_permutation == FF_FFT_PERM_AVX) {
fft_perm_avx(s);
} else {
for(i=0; i<n; i++) {
int j = i;
if (s->fft_permutation == FF_FFT_PERM_SWAP_LSBS)
j = (j&~3) | ((j>>1)&1) | ((j<<1)&2);
s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] = j;
}
}
return 0;
......
......@@ -85,6 +85,7 @@ struct FFTContext {
int fft_permutation;
#define FF_FFT_PERM_DEFAULT 0
#define FF_FFT_PERM_SWAP_LSBS 1
#define FF_FFT_PERM_AVX 2
int mdct_permutation;
#define FF_MDCT_PERM_NONE 0
#define FF_MDCT_PERM_INTERLEAVE 1
......@@ -97,7 +98,7 @@ struct FFTContext {
#endif
#define COSTABLE(size) \
COSTABLE_CONST DECLARE_ALIGNED(16, FFTSample, FFT_NAME(ff_cos_##size))[size/2]
COSTABLE_CONST DECLARE_ALIGNED(32, FFTSample, FFT_NAME(ff_cos_##size))[size/2]
extern COSTABLE(16);
extern COSTABLE(32);
......
......@@ -88,7 +88,7 @@ typedef struct {
DSPContext dsp;
FFTContext fft;
DECLARE_ALIGNED(16, FFTComplex, samples)[COEFFS/2];
DECLARE_ALIGNED(32, FFTComplex, samples)[COEFFS/2];
float *out_samples;
} IMCContext;
......
......@@ -47,7 +47,7 @@
typedef struct NellyMoserDecodeContext {
AVCodecContext* avctx;
DECLARE_ALIGNED(16, float,float_buf)[NELLY_SAMPLES];
DECLARE_ALIGNED(32, float, float_buf)[NELLY_SAMPLES];
float state[128];
AVLFG random_state;
GetBitContext gb;
......@@ -55,7 +55,7 @@ typedef struct NellyMoserDecodeContext {
DSPContext dsp;
FFTContext imdct_ctx;
FmtConvertContext fmt_conv;
DECLARE_ALIGNED(16, float,imdct_out)[NELLY_BUF_LEN * 2];
DECLARE_ALIGNED(32, float, imdct_out)[NELLY_BUF_LEN * 2];
} NellyMoserDecodeContext;
static void overlap_and_window(NellyMoserDecodeContext *s, float *state, float *audio, float *a_in)
......
......@@ -55,9 +55,9 @@ typedef struct NellyMoserEncodeContext {
int have_saved;
DSPContext dsp;
FFTContext mdct_ctx;
DECLARE_ALIGNED(16, float, mdct_out)[NELLY_SAMPLES];
DECLARE_ALIGNED(16, float, in_buff)[NELLY_SAMPLES];
DECLARE_ALIGNED(16, float, buf)[2][3 * NELLY_BUF_LEN]; ///< sample buffer
DECLARE_ALIGNED(32, float, mdct_out)[NELLY_SAMPLES];
DECLARE_ALIGNED(32, float, in_buff)[NELLY_SAMPLES];
DECLARE_ALIGNED(32, float, buf)[2][3 * NELLY_BUF_LEN]; ///< sample buffer
float (*opt )[NELLY_BANDS];
uint8_t (*path)[NELLY_BANDS];
} NellyMoserEncodeContext;
......
......@@ -120,7 +120,7 @@ typedef struct {
} FFTCoefficient;
typedef struct {
DECLARE_ALIGNED(16, QDM2Complex, complex)[MPA_MAX_CHANNELS][256];
DECLARE_ALIGNED(32, QDM2Complex, complex)[MPA_MAX_CHANNELS][256];
} QDM2FFT;
/**
......
......@@ -113,15 +113,15 @@ typedef struct WMACodecContext {
uint8_t ms_stereo; ///< true if mid/side stereo mode
uint8_t channel_coded[MAX_CHANNELS]; ///< true if channel is coded
int exponents_bsize[MAX_CHANNELS]; ///< log2 ratio frame/exp. length
DECLARE_ALIGNED(16, float, exponents)[MAX_CHANNELS][BLOCK_MAX_SIZE];
DECLARE_ALIGNED(32, float, exponents)[MAX_CHANNELS][BLOCK_MAX_SIZE];
float max_exponent[MAX_CHANNELS];
WMACoef coefs1[MAX_CHANNELS][BLOCK_MAX_SIZE];
DECLARE_ALIGNED(16, float, coefs)[MAX_CHANNELS][BLOCK_MAX_SIZE];
DECLARE_ALIGNED(16, FFTSample, output)[BLOCK_MAX_SIZE * 2];
DECLARE_ALIGNED(32, float, coefs)[MAX_CHANNELS][BLOCK_MAX_SIZE];
DECLARE_ALIGNED(32, FFTSample, output)[BLOCK_MAX_SIZE * 2];
FFTContext mdct_ctx[BLOCK_NB_SIZES];
float *windows[BLOCK_NB_SIZES];
/* output buffer for one frame and the last for IMDCT windowing */
DECLARE_ALIGNED(16, float, frame_out)[MAX_CHANNELS][BLOCK_MAX_SIZE * 2];
DECLARE_ALIGNED(32, float, frame_out)[MAX_CHANNELS][BLOCK_MAX_SIZE * 2];
/* last frame info */
uint8_t last_superframe[MAX_CODED_SUPERFRAME_SIZE + 4]; /* padding added */
int last_bitoffset;
......
......@@ -145,7 +145,7 @@ typedef struct {
uint8_t table_idx; ///< index in sf_offsets for the scale factor reference block
float* coeffs; ///< pointer to the subframe decode buffer
uint16_t num_vec_coeffs; ///< number of vector coded coefficients
DECLARE_ALIGNED(16, float, out)[WMAPRO_BLOCK_MAX_SIZE + WMAPRO_BLOCK_MAX_SIZE / 2]; ///< output buffer
DECLARE_ALIGNED(32, float, out)[WMAPRO_BLOCK_MAX_SIZE + WMAPRO_BLOCK_MAX_SIZE / 2]; ///< output buffer
} WMAProChannelCtx;
/**
......@@ -170,7 +170,7 @@ typedef struct WMAProDecodeCtx {
FF_INPUT_BUFFER_PADDING_SIZE];///< compressed frame data
PutBitContext pb; ///< context for filling the frame_data buffer
FFTContext mdct_ctx[WMAPRO_BLOCK_SIZES]; ///< MDCT context per block size
DECLARE_ALIGNED(16, float, tmp)[WMAPRO_BLOCK_MAX_SIZE]; ///< IMDCT output buffer
DECLARE_ALIGNED(32, float, tmp)[WMAPRO_BLOCK_MAX_SIZE]; ///< IMDCT output buffer
float* windows[WMAPRO_BLOCK_SIZES]; ///< windows for the different block sizes
/* frame size dependent frame information (set during initialization) */
......
......@@ -275,11 +275,11 @@ typedef struct {
///< by postfilter
float denoise_filter_cache[MAX_FRAMESIZE];
int denoise_filter_cache_size; ///< samples in #denoise_filter_cache
DECLARE_ALIGNED(16, float, tilted_lpcs_pf)[0x80];
DECLARE_ALIGNED(32, float, tilted_lpcs_pf)[0x80];
///< aligned buffer for LPC tilting
DECLARE_ALIGNED(16, float, denoise_coeffs_pf)[0x80];
DECLARE_ALIGNED(32, float, denoise_coeffs_pf)[0x80];
///< aligned buffer for denoise coefficients
DECLARE_ALIGNED(16, float, synth_filter_out_buf)[0x80 + MAX_LSPS_ALIGN16];
DECLARE_ALIGNED(32, float, synth_filter_out_buf)[0x80 + MAX_LSPS_ALIGN16];
///< aligned buffer for postfilter speech
///< synthesis
/**
......
......@@ -25,7 +25,14 @@ av_cold void ff_fft_init_mmx(FFTContext *s)
{
#if HAVE_YASM
int has_vectors = av_get_cpu_flags();
if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) {
if (has_vectors & AV_CPU_FLAG_AVX && HAVE_AVX && s->nbits >= 5) {
/* AVX for SB */
s->imdct_calc = ff_imdct_calc_sse;
s->imdct_half = ff_imdct_half_avx;
s->fft_permute = ff_fft_permute_sse;
s->fft_calc = ff_fft_calc_avx;
s->fft_permutation = FF_FFT_PERM_AVX;
} else if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) {
/* SSE for P3/P4/K8 */
s->imdct_calc = ff_imdct_calc_sse;
s->imdct_half = ff_imdct_half_sse;
......
......@@ -22,6 +22,7 @@
#include "libavcodec/fft.h"
void ff_fft_permute_sse(FFTContext *s, FFTComplex *z);
void ff_fft_calc_avx(FFTContext *s, FFTComplex *z);
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
void ff_fft_calc_3dn(FFTContext *s, FFTComplex *z);
void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z);
......@@ -32,6 +33,7 @@ void ff_imdct_calc_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input
void ff_imdct_half_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_dct32_float_sse(FFTSample *out, const FFTSample *in);
#endif
This diff is collapsed.
......@@ -28,6 +28,12 @@ DECLARE_ASM_CONST(16, int, ff_m1m1m1m1)[4] =
void ff_fft_dispatch_sse(FFTComplex *z, int nbits);
void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits);
void ff_fft_dispatch_interleave_avx(FFTComplex *z, int nbits);
/**
 * AVX entry point for the FFT: forwards the buffer to the interleaving AVX
 * dispatcher together with the log2 transform size stored in the context.
 *
 * @param s FFT context; only nbits is read here
 * @param z complex buffer handed to ff_fft_dispatch_interleave_avx()
 */
void ff_fft_calc_avx(FFTContext *s, FFTComplex *z)
{
    const int log2_size = s->nbits;

    ff_fft_dispatch_interleave_avx(z, log2_size);
}
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
{
......@@ -77,7 +83,7 @@ void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
long n = s->mdct_size;
long n4 = n >> 2;
ff_imdct_half_sse(s, output+n4, input);
s->imdct_half(s, output + n4, input);
j = -n;
k = n-16;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment