Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
F
ffmpeg.wasm-core
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Linshizhi
ffmpeg.wasm-core
Commits
9d35fa52
Commit
9d35fa52
authored
Apr 25, 2011
by
Vitor Sessak
Committed by
Reinhard Tartler
Apr 26, 2011
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Add AVX FFT implementation.
Signed-off-by:
Reinhard Tartler
<
siretart@tauware.de
>
parent
13dfce3d
Show whitespace changes
Inline
Side-by-side
Showing
23 changed files
with
450 additions
and
207 deletions
+450
-207
Changelog
Changelog
+1
-1
aac.h
libavcodec/aac.h
+5
-5
aacenc.h
libavcodec/aacenc.h
+1
-1
ac3dec.h
libavcodec/ac3dec.h
+5
-5
ac3enc.c
libavcodec/ac3enc.c
+1
-1
atrac1.c
libavcodec/atrac1.c
+10
-10
atrac3.c
libavcodec/atrac3.c
+3
-3
binkaudio.c
libavcodec/binkaudio.c
+1
-1
cook.c
libavcodec/cook.c
+1
-1
dca.c
libavcodec/dca.c
+5
-5
fft.c
libavcodec/fft.c
+48
-5
fft.h
libavcodec/fft.h
+2
-1
imc.c
libavcodec/imc.c
+1
-1
nellymoserdec.c
libavcodec/nellymoserdec.c
+2
-2
nellymoserenc.c
libavcodec/nellymoserenc.c
+3
-3
qdm2.c
libavcodec/qdm2.c
+1
-1
wma.h
libavcodec/wma.h
+4
-4
wmaprodec.c
libavcodec/wmaprodec.c
+2
-2
wmavoice.c
libavcodec/wmavoice.c
+3
-3
fft.c
libavcodec/x86/fft.c
+8
-1
fft.h
libavcodec/x86/fft.h
+2
-0
fft_mmx.asm
libavcodec/x86/fft_mmx.asm
+334
-150
fft_sse.c
libavcodec/x86/fft_sse.c
+7
-1
No files found.
Changelog
View file @
9d35fa52
...
@@ -5,7 +5,7 @@ releases are sorted from youngest to oldest.
...
@@ -5,7 +5,7 @@ releases are sorted from youngest to oldest.
version <next>:
version <next>:
- Lots of deprecated API cruft removed
- Lots of deprecated API cruft removed
- fft and imdct optimizations for AVX (Sandy Bridge) processors
version 0.7_beta1:
version 0.7_beta1:
...
...
libavcodec/aac.h
View file @
9d35fa52
...
@@ -223,9 +223,9 @@ typedef struct {
...
@@ -223,9 +223,9 @@ typedef struct {
float
sf
[
120
];
///< scalefactors
float
sf
[
120
];
///< scalefactors
int
sf_idx
[
128
];
///< scalefactor indices (used by encoder)
int
sf_idx
[
128
];
///< scalefactor indices (used by encoder)
uint8_t
zeroes
[
128
];
///< band is not coded (used by encoder)
uint8_t
zeroes
[
128
];
///< band is not coded (used by encoder)
DECLARE_ALIGNED
(
16
,
float
,
coeffs
)[
1024
];
///< coefficients for IMDCT
DECLARE_ALIGNED
(
32
,
float
,
coeffs
)[
1024
];
///< coefficients for IMDCT
DECLARE_ALIGNED
(
16
,
float
,
saved
)[
1024
];
///< overlap
DECLARE_ALIGNED
(
32
,
float
,
saved
)[
1024
];
///< overlap
DECLARE_ALIGNED
(
16
,
float
,
ret
)[
2048
];
///< PCM output
DECLARE_ALIGNED
(
32
,
float
,
ret
)[
2048
];
///< PCM output
DECLARE_ALIGNED
(
16
,
int16_t
,
ltp_state
)[
3072
];
///< time signal for LTP
DECLARE_ALIGNED
(
16
,
int16_t
,
ltp_state
)[
3072
];
///< time signal for LTP
PredictorState
predictor_state
[
MAX_PREDICTORS
];
PredictorState
predictor_state
[
MAX_PREDICTORS
];
}
SingleChannelElement
;
}
SingleChannelElement
;
...
@@ -272,7 +272,7 @@ typedef struct {
...
@@ -272,7 +272,7 @@ typedef struct {
* @defgroup temporary aligned temporary buffers (We do not want to have these on the stack.)
* @defgroup temporary aligned temporary buffers (We do not want to have these on the stack.)
* @{
* @{
*/
*/
DECLARE_ALIGNED
(
16
,
float
,
buf_mdct
)[
1024
];
DECLARE_ALIGNED
(
32
,
float
,
buf_mdct
)[
1024
];
/** @} */
/** @} */
/**
/**
...
@@ -296,7 +296,7 @@ typedef struct {
...
@@ -296,7 +296,7 @@ typedef struct {
int
sf_offset
;
///< offset into pow2sf_tab as appropriate for dsp.float_to_int16
int
sf_offset
;
///< offset into pow2sf_tab as appropriate for dsp.float_to_int16
/** @} */
/** @} */
DECLARE_ALIGNED
(
16
,
float
,
temp
)[
128
];
DECLARE_ALIGNED
(
32
,
float
,
temp
)[
128
];
enum
OCStatus
output_configured
;
enum
OCStatus
output_configured
;
}
AACContext
;
}
AACContext
;
...
...
libavcodec/aacenc.h
View file @
9d35fa52
...
@@ -64,7 +64,7 @@ typedef struct AACEncContext {
...
@@ -64,7 +64,7 @@ typedef struct AACEncContext {
int
last_frame
;
int
last_frame
;
float
lambda
;
float
lambda
;
DECLARE_ALIGNED
(
16
,
int
,
qcoefs
)[
96
];
///< quantized coefficients
DECLARE_ALIGNED
(
16
,
int
,
qcoefs
)[
96
];
///< quantized coefficients
DECLARE_ALIGNED
(
16
,
float
,
scoefs
)[
1024
];
///< scaled coefficients
DECLARE_ALIGNED
(
32
,
float
,
scoefs
)[
1024
];
///< scaled coefficients
}
AACEncContext
;
}
AACEncContext
;
#endif
/* AVCODEC_AACENC_H */
#endif
/* AVCODEC_AACENC_H */
libavcodec/ac3dec.h
View file @
9d35fa52
...
@@ -200,11 +200,11 @@ typedef struct {
...
@@ -200,11 +200,11 @@ typedef struct {
///@defgroup arrays aligned arrays
///@defgroup arrays aligned arrays
DECLARE_ALIGNED
(
16
,
int
,
fixed_coeffs
)[
AC3_MAX_CHANNELS
][
AC3_MAX_COEFS
];
///> fixed-point transform coefficients
DECLARE_ALIGNED
(
16
,
int
,
fixed_coeffs
)[
AC3_MAX_CHANNELS
][
AC3_MAX_COEFS
];
///> fixed-point transform coefficients
DECLARE_ALIGNED
(
16
,
float
,
transform_coeffs
)[
AC3_MAX_CHANNELS
][
AC3_MAX_COEFS
];
///< transform coefficients
DECLARE_ALIGNED
(
32
,
float
,
transform_coeffs
)[
AC3_MAX_CHANNELS
][
AC3_MAX_COEFS
];
///< transform coefficients
DECLARE_ALIGNED
(
16
,
float
,
delay
)[
AC3_MAX_CHANNELS
][
AC3_BLOCK_SIZE
];
///< delay - added to the next block
DECLARE_ALIGNED
(
32
,
float
,
delay
)[
AC3_MAX_CHANNELS
][
AC3_BLOCK_SIZE
];
///< delay - added to the next block
DECLARE_ALIGNED
(
16
,
float
,
window
)[
AC3_BLOCK_SIZE
];
///< window coefficients
DECLARE_ALIGNED
(
32
,
float
,
window
)[
AC3_BLOCK_SIZE
];
///< window coefficients
DECLARE_ALIGNED
(
16
,
float
,
tmp_output
)[
AC3_BLOCK_SIZE
];
///< temporary storage for output before windowing
DECLARE_ALIGNED
(
32
,
float
,
tmp_output
)[
AC3_BLOCK_SIZE
];
///< temporary storage for output before windowing
DECLARE_ALIGNED
(
16
,
float
,
output
)[
AC3_MAX_CHANNELS
][
AC3_BLOCK_SIZE
];
///< output after imdct transform and windowing
DECLARE_ALIGNED
(
32
,
float
,
output
)[
AC3_MAX_CHANNELS
][
AC3_BLOCK_SIZE
];
///< output after imdct transform and windowing
///@}
///@}
}
AC3DecodeContext
;
}
AC3DecodeContext
;
...
...
libavcodec/ac3enc.c
View file @
9d35fa52
...
@@ -201,7 +201,7 @@ typedef struct AC3EncodeContext {
...
@@ -201,7 +201,7 @@ typedef struct AC3EncodeContext {
uint8_t
exp_strategy
[
AC3_MAX_CHANNELS
][
AC3_MAX_BLOCKS
];
///< exponent strategies
uint8_t
exp_strategy
[
AC3_MAX_CHANNELS
][
AC3_MAX_BLOCKS
];
///< exponent strategies
DECLARE_ALIGNED
(
16
,
SampleType
,
windowed_samples
)[
AC3_WINDOW_SIZE
];
DECLARE_ALIGNED
(
32
,
SampleType
,
windowed_samples
)[
AC3_WINDOW_SIZE
];
}
AC3EncodeContext
;
}
AC3EncodeContext
;
typedef
struct
AC3Mant
{
typedef
struct
AC3Mant
{
...
...
libavcodec/atrac1.c
View file @
9d35fa52
...
@@ -60,11 +60,11 @@ typedef struct {
...
@@ -60,11 +60,11 @@ typedef struct {
int
log2_block_count
[
AT1_QMF_BANDS
];
///< log2 number of blocks in a band
int
log2_block_count
[
AT1_QMF_BANDS
];
///< log2 number of blocks in a band
int
num_bfus
;
///< number of Block Floating Units
int
num_bfus
;
///< number of Block Floating Units
float
*
spectrum
[
2
];
float
*
spectrum
[
2
];
DECLARE_ALIGNED
(
16
,
float
,
spec1
)[
AT1_SU_SAMPLES
];
///< mdct buffer
DECLARE_ALIGNED
(
32
,
float
,
spec1
)[
AT1_SU_SAMPLES
];
///< mdct buffer
DECLARE_ALIGNED
(
16
,
float
,
spec2
)[
AT1_SU_SAMPLES
];
///< mdct buffer
DECLARE_ALIGNED
(
32
,
float
,
spec2
)[
AT1_SU_SAMPLES
];
///< mdct buffer
DECLARE_ALIGNED
(
16
,
float
,
fst_qmf_delay
)[
46
];
///< delay line for the 1st stacked QMF filter
DECLARE_ALIGNED
(
32
,
float
,
fst_qmf_delay
)[
46
];
///< delay line for the 1st stacked QMF filter
DECLARE_ALIGNED
(
16
,
float
,
snd_qmf_delay
)[
46
];
///< delay line for the 2nd stacked QMF filter
DECLARE_ALIGNED
(
32
,
float
,
snd_qmf_delay
)[
46
];
///< delay line for the 2nd stacked QMF filter
DECLARE_ALIGNED
(
16
,
float
,
last_qmf_delay
)[
256
+
23
];
///< delay line for the last stacked QMF filter
DECLARE_ALIGNED
(
32
,
float
,
last_qmf_delay
)[
256
+
23
];
///< delay line for the last stacked QMF filter
}
AT1SUCtx
;
}
AT1SUCtx
;
/**
/**
...
@@ -72,13 +72,13 @@ typedef struct {
...
@@ -72,13 +72,13 @@ typedef struct {
*/
*/
typedef
struct
{
typedef
struct
{
AT1SUCtx
SUs
[
AT1_MAX_CHANNELS
];
///< channel sound unit
AT1SUCtx
SUs
[
AT1_MAX_CHANNELS
];
///< channel sound unit
DECLARE_ALIGNED
(
16
,
float
,
spec
)[
AT1_SU_SAMPLES
];
///< the mdct spectrum buffer
DECLARE_ALIGNED
(
32
,
float
,
spec
)[
AT1_SU_SAMPLES
];
///< the mdct spectrum buffer
DECLARE_ALIGNED
(
16
,
float
,
low
)[
256
];
DECLARE_ALIGNED
(
32
,
float
,
low
)[
256
];
DECLARE_ALIGNED
(
16
,
float
,
mid
)[
256
];
DECLARE_ALIGNED
(
32
,
float
,
mid
)[
256
];
DECLARE_ALIGNED
(
16
,
float
,
high
)[
512
];
DECLARE_ALIGNED
(
32
,
float
,
high
)[
512
];
float
*
bands
[
3
];
float
*
bands
[
3
];
DECLARE_ALIGNED
(
16
,
float
,
out_samples
)[
AT1_MAX_CHANNELS
][
AT1_SU_SAMPLES
];
DECLARE_ALIGNED
(
32
,
float
,
out_samples
)[
AT1_MAX_CHANNELS
][
AT1_SU_SAMPLES
];
FFTContext
mdct_ctx
[
3
];
FFTContext
mdct_ctx
[
3
];
int
channels
;
int
channels
;
DSPContext
dsp
;
DSPContext
dsp
;
...
...
libavcodec/atrac3.c
View file @
9d35fa52
...
@@ -74,8 +74,8 @@ typedef struct {
...
@@ -74,8 +74,8 @@ typedef struct {
int
gcBlkSwitch
;
int
gcBlkSwitch
;
gain_block
gainBlock
[
2
];
gain_block
gainBlock
[
2
];
DECLARE_ALIGNED
(
16
,
float
,
spectrum
)[
1024
];
DECLARE_ALIGNED
(
32
,
float
,
spectrum
)[
1024
];
DECLARE_ALIGNED
(
16
,
float
,
IMDCT_buf
)[
1024
];
DECLARE_ALIGNED
(
32
,
float
,
IMDCT_buf
)[
1024
];
float
delayBuf1
[
46
];
///<qmf delay buffers
float
delayBuf1
[
46
];
///<qmf delay buffers
float
delayBuf2
[
46
];
float
delayBuf2
[
46
];
...
@@ -122,7 +122,7 @@ typedef struct {
...
@@ -122,7 +122,7 @@ typedef struct {
FFTContext
mdct_ctx
;
FFTContext
mdct_ctx
;
}
ATRAC3Context
;
}
ATRAC3Context
;
static
DECLARE_ALIGNED
(
16
,
float
,
mdct_window
)[
512
];
static
DECLARE_ALIGNED
(
32
,
float
,
mdct_window
)[
512
];
static
VLC
spectral_coeff_tab
[
7
];
static
VLC
spectral_coeff_tab
[
7
];
static
float
gain_tab1
[
16
];
static
float
gain_tab1
[
16
];
static
float
gain_tab2
[
31
];
static
float
gain_tab2
[
31
];
...
...
libavcodec/binkaudio.c
View file @
9d35fa52
...
@@ -55,7 +55,7 @@ typedef struct {
...
@@ -55,7 +55,7 @@ typedef struct {
int
num_bands
;
int
num_bands
;
unsigned
int
*
bands
;
unsigned
int
*
bands
;
float
root
;
float
root
;
DECLARE_ALIGNED
(
16
,
FFTSample
,
coeffs
)[
BINK_BLOCK_MAX_SIZE
];
DECLARE_ALIGNED
(
32
,
FFTSample
,
coeffs
)[
BINK_BLOCK_MAX_SIZE
];
DECLARE_ALIGNED
(
16
,
short
,
previous
)[
BINK_BLOCK_MAX_SIZE
/
16
];
///< coeffs from previous audio block
DECLARE_ALIGNED
(
16
,
short
,
previous
)[
BINK_BLOCK_MAX_SIZE
/
16
];
///< coeffs from previous audio block
float
*
coeffs_ptr
[
MAX_CHANNELS
];
///< pointers to the coeffs arrays for float_to_int16_interleave
float
*
coeffs_ptr
[
MAX_CHANNELS
];
///< pointers to the coeffs arrays for float_to_int16_interleave
union
{
union
{
...
...
libavcodec/cook.c
View file @
9d35fa52
...
@@ -153,7 +153,7 @@ typedef struct cook {
...
@@ -153,7 +153,7 @@ typedef struct cook {
/* data buffers */
/* data buffers */
uint8_t
*
decoded_bytes_buffer
;
uint8_t
*
decoded_bytes_buffer
;
DECLARE_ALIGNED
(
16
,
float
,
mono_mdct_output
)[
2048
];
DECLARE_ALIGNED
(
32
,
float
,
mono_mdct_output
)[
2048
];
float
decode_buffer_1
[
1024
];
float
decode_buffer_1
[
1024
];
float
decode_buffer_2
[
1024
];
float
decode_buffer_2
[
1024
];
float
decode_buffer_0
[
1060
];
/* static allocation for joint decode */
float
decode_buffer_0
[
1060
];
/* static allocation for joint decode */
...
...
libavcodec/dca.c
View file @
9d35fa52
...
@@ -321,16 +321,16 @@ typedef struct {
...
@@ -321,16 +321,16 @@ typedef struct {
/* Subband samples history (for ADPCM) */
/* Subband samples history (for ADPCM) */
float
subband_samples_hist
[
DCA_PRIM_CHANNELS_MAX
][
DCA_SUBBANDS
][
4
];
float
subband_samples_hist
[
DCA_PRIM_CHANNELS_MAX
][
DCA_SUBBANDS
][
4
];
DECLARE_ALIGNED
(
16
,
float
,
subband_fir_hist
)[
DCA_PRIM_CHANNELS_MAX
][
512
];
DECLARE_ALIGNED
(
32
,
float
,
subband_fir_hist
)[
DCA_PRIM_CHANNELS_MAX
][
512
];
DECLARE_ALIGNED
(
16
,
float
,
subband_fir_noidea
)[
DCA_PRIM_CHANNELS_MAX
][
32
];
DECLARE_ALIGNED
(
32
,
float
,
subband_fir_noidea
)[
DCA_PRIM_CHANNELS_MAX
][
32
];
int
hist_index
[
DCA_PRIM_CHANNELS_MAX
];
int
hist_index
[
DCA_PRIM_CHANNELS_MAX
];
DECLARE_ALIGNED
(
16
,
float
,
raXin
)[
32
];
DECLARE_ALIGNED
(
32
,
float
,
raXin
)[
32
];
int
output
;
///< type of output
int
output
;
///< type of output
float
scale_bias
;
///< output scale
float
scale_bias
;
///< output scale
DECLARE_ALIGNED
(
16
,
float
,
subband_samples
)[
DCA_BLOCKS_MAX
][
DCA_PRIM_CHANNELS_MAX
][
DCA_SUBBANDS
][
8
];
DECLARE_ALIGNED
(
32
,
float
,
subband_samples
)[
DCA_BLOCKS_MAX
][
DCA_PRIM_CHANNELS_MAX
][
DCA_SUBBANDS
][
8
];
DECLARE_ALIGNED
(
16
,
float
,
samples
)[(
DCA_PRIM_CHANNELS_MAX
+
1
)
*
256
];
DECLARE_ALIGNED
(
32
,
float
,
samples
)[(
DCA_PRIM_CHANNELS_MAX
+
1
)
*
256
];
const
float
*
samples_chanptr
[
DCA_PRIM_CHANNELS_MAX
+
1
];
const
float
*
samples_chanptr
[
DCA_PRIM_CHANNELS_MAX
+
1
];
uint8_t
dca_buffer
[
DCA_MAX_FRAME_SIZE
+
DCA_MAX_EXSS_HEADER_SIZE
+
DCA_BUFFER_PADDING_SIZE
];
uint8_t
dca_buffer
[
DCA_MAX_FRAME_SIZE
+
DCA_MAX_EXSS_HEADER_SIZE
+
DCA_BUFFER_PADDING_SIZE
];
...
...
libavcodec/fft.c
View file @
9d35fa52
...
@@ -93,6 +93,44 @@ av_cold void ff_init_ff_cos_tabs(int index)
...
@@ -93,6 +93,44 @@ av_cold void ff_init_ff_cos_tabs(int index)
#endif
#endif
}
}
/*
 * Per-group output slots used by fft_perm_avx(): for a 16-entry group
 * starting at index i for which is_second_half_of_fft32(i, n) is non-zero,
 * entry k of the group is stored in revtab as i + avx_tab[k].
 * The table is a permutation of 0..15.
 */
static const int avx_tab[] = {
     0,  4,  1,  5,
     8, 12,  9, 13,
     2,  6,  3,  7,
    10, 14, 11, 15
};
/**
 * Tell whether index i, within a transform of size n, ends up at position
 * 16 or above of the sub-transform of size <= 32 that contains it.
 *
 * A size-n transform (n > 32) is treated as three consecutive parts: the
 * first n/2 entries, then two parts of n/4 entries each. The index is
 * rebased into whichever part holds it and the size is reduced accordingly,
 * until the size is 32 or less.
 *
 * @param i index to classify
 * @param n size of the transform containing i
 * @return non-zero if the rebased index is >= 16, zero otherwise
 */
static int is_second_half_of_fft32(int i, int n)
{
    while (n > 32) {
        if (i < n / 2) {
            /* index lies in the leading half-size part */
            n /= 2;
            continue;
        }
        /* index lies in one of the two trailing quarter-size parts */
        i -= (i < 3 * n / 4) ? n / 2 : 3 * n / 4;
        n /= 4;
    }
    return i >= 16;
}
/**
 * Fill s->revtab with the permutation selected by FF_FFT_PERM_AVX.
 *
 * The 1 << s->nbits indices are processed in groups of 16. For every index
 * src in a group, the revtab slot is chosen by
 * -split_radix_permutation(src, n, s->inverse) & (n - 1); the value stored
 * there depends on the group:
 *  - groups for which is_second_half_of_fft32() is non-zero store
 *    group_start + avx_tab[k];
 *  - all other groups store src with its three low bits rearranged
 *    (bits 2..1 move down to 1..0, bit 0 moves up to bit 2).
 *
 * @param s FFT context; nbits and inverse are read, revtab is written
 */
static av_cold void fft_perm_avx(FFTContext *s)
{
    int n = 1 << s->nbits;
    int base;

    for (base = 0; base < n; base += 16) {
        /* the classification is the same for all 16 entries of a group */
        int use_tab = is_second_half_of_fft32(base, n);
        int k;

        for (k = 0; k < 16; k++) {
            int src = base + k;
            int dst;

            if (use_tab)
                dst = base + avx_tab[k];
            else
                dst = (src & ~7) | ((src >> 1) & 3) | ((src << 2) & 4);

            s->revtab[-split_radix_permutation(src, n, s->inverse) & (n - 1)] = dst;
        }
    }
}
av_cold
int
ff_fft_init
(
FFTContext
*
s
,
int
nbits
,
int
inverse
)
av_cold
int
ff_fft_init
(
FFTContext
*
s
,
int
nbits
,
int
inverse
)
{
{
int
i
,
j
,
n
;
int
i
,
j
,
n
;
...
@@ -132,12 +170,17 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
...
@@ -132,12 +170,17 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
for
(
j
=
4
;
j
<=
nbits
;
j
++
)
{
for
(
j
=
4
;
j
<=
nbits
;
j
++
)
{
ff_init_ff_cos_tabs
(
j
);
ff_init_ff_cos_tabs
(
j
);
}
}
if
(
s
->
fft_permutation
==
FF_FFT_PERM_AVX
)
{
fft_perm_avx
(
s
);
}
else
{
for
(
i
=
0
;
i
<
n
;
i
++
)
{
for
(
i
=
0
;
i
<
n
;
i
++
)
{
int
j
=
i
;
int
j
=
i
;
if
(
s
->
fft_permutation
==
FF_FFT_PERM_SWAP_LSBS
)
if
(
s
->
fft_permutation
==
FF_FFT_PERM_SWAP_LSBS
)
j
=
(
j
&~
3
)
|
((
j
>>
1
)
&
1
)
|
((
j
<<
1
)
&
2
);
j
=
(
j
&~
3
)
|
((
j
>>
1
)
&
1
)
|
((
j
<<
1
)
&
2
);
s
->
revtab
[
-
split_radix_permutation
(
i
,
n
,
s
->
inverse
)
&
(
n
-
1
)]
=
j
;
s
->
revtab
[
-
split_radix_permutation
(
i
,
n
,
s
->
inverse
)
&
(
n
-
1
)]
=
j
;
}
}
}
return
0
;
return
0
;
fail:
fail:
...
...
libavcodec/fft.h
View file @
9d35fa52
...
@@ -85,6 +85,7 @@ struct FFTContext {
...
@@ -85,6 +85,7 @@ struct FFTContext {
int
fft_permutation
;
int
fft_permutation
;
#define FF_FFT_PERM_DEFAULT 0
#define FF_FFT_PERM_DEFAULT 0
#define FF_FFT_PERM_SWAP_LSBS 1
#define FF_FFT_PERM_SWAP_LSBS 1
#define FF_FFT_PERM_AVX 2
int
mdct_permutation
;
int
mdct_permutation
;
#define FF_MDCT_PERM_NONE 0
#define FF_MDCT_PERM_NONE 0
#define FF_MDCT_PERM_INTERLEAVE 1
#define FF_MDCT_PERM_INTERLEAVE 1
...
@@ -97,7 +98,7 @@ struct FFTContext {
...
@@ -97,7 +98,7 @@ struct FFTContext {
#endif
#endif
#define COSTABLE(size) \
#define COSTABLE(size) \
COSTABLE_CONST DECLARE_ALIGNED(
16
, FFTSample, FFT_NAME(ff_cos_##size))[size/2]
COSTABLE_CONST DECLARE_ALIGNED(
32
, FFTSample, FFT_NAME(ff_cos_##size))[size/2]
extern
COSTABLE
(
16
);
extern
COSTABLE
(
16
);
extern
COSTABLE
(
32
);
extern
COSTABLE
(
32
);
...
...
libavcodec/imc.c
View file @
9d35fa52
...
@@ -88,7 +88,7 @@ typedef struct {
...
@@ -88,7 +88,7 @@ typedef struct {
DSPContext
dsp
;
DSPContext
dsp
;
FFTContext
fft
;
FFTContext
fft
;
DECLARE_ALIGNED
(
16
,
FFTComplex
,
samples
)[
COEFFS
/
2
];
DECLARE_ALIGNED
(
32
,
FFTComplex
,
samples
)[
COEFFS
/
2
];
float
*
out_samples
;
float
*
out_samples
;
}
IMCContext
;
}
IMCContext
;
...
...
libavcodec/nellymoserdec.c
View file @
9d35fa52
...
@@ -47,7 +47,7 @@
...
@@ -47,7 +47,7 @@
typedef
struct
NellyMoserDecodeContext
{
typedef
struct
NellyMoserDecodeContext
{
AVCodecContext
*
avctx
;
AVCodecContext
*
avctx
;
DECLARE_ALIGNED
(
16
,
float
,
float_buf
)[
NELLY_SAMPLES
];
DECLARE_ALIGNED
(
32
,
float
,
float_buf
)[
NELLY_SAMPLES
];
float
state
[
128
];
float
state
[
128
];
AVLFG
random_state
;
AVLFG
random_state
;
GetBitContext
gb
;
GetBitContext
gb
;
...
@@ -55,7 +55,7 @@ typedef struct NellyMoserDecodeContext {
...
@@ -55,7 +55,7 @@ typedef struct NellyMoserDecodeContext {
DSPContext
dsp
;
DSPContext
dsp
;
FFTContext
imdct_ctx
;
FFTContext
imdct_ctx
;
FmtConvertContext
fmt_conv
;
FmtConvertContext
fmt_conv
;
DECLARE_ALIGNED
(
16
,
float
,
imdct_out
)[
NELLY_BUF_LEN
*
2
];
DECLARE_ALIGNED
(
32
,
float
,
imdct_out
)[
NELLY_BUF_LEN
*
2
];
}
NellyMoserDecodeContext
;
}
NellyMoserDecodeContext
;
static
void
overlap_and_window
(
NellyMoserDecodeContext
*
s
,
float
*
state
,
float
*
audio
,
float
*
a_in
)
static
void
overlap_and_window
(
NellyMoserDecodeContext
*
s
,
float
*
state
,
float
*
audio
,
float
*
a_in
)
...
...
libavcodec/nellymoserenc.c
View file @
9d35fa52
...
@@ -55,9 +55,9 @@ typedef struct NellyMoserEncodeContext {
...
@@ -55,9 +55,9 @@ typedef struct NellyMoserEncodeContext {
int
have_saved
;
int
have_saved
;
DSPContext
dsp
;
DSPContext
dsp
;
FFTContext
mdct_ctx
;
FFTContext
mdct_ctx
;
DECLARE_ALIGNED
(
16
,
float
,
mdct_out
)[
NELLY_SAMPLES
];
DECLARE_ALIGNED
(
32
,
float
,
mdct_out
)[
NELLY_SAMPLES
];
DECLARE_ALIGNED
(
16
,
float
,
in_buff
)[
NELLY_SAMPLES
];
DECLARE_ALIGNED
(
32
,
float
,
in_buff
)[
NELLY_SAMPLES
];
DECLARE_ALIGNED
(
16
,
float
,
buf
)[
2
][
3
*
NELLY_BUF_LEN
];
///< sample buffer
DECLARE_ALIGNED
(
32
,
float
,
buf
)[
2
][
3
*
NELLY_BUF_LEN
];
///< sample buffer
float
(
*
opt
)[
NELLY_BANDS
];
float
(
*
opt
)[
NELLY_BANDS
];
uint8_t
(
*
path
)[
NELLY_BANDS
];
uint8_t
(
*
path
)[
NELLY_BANDS
];
}
NellyMoserEncodeContext
;
}
NellyMoserEncodeContext
;
...
...
libavcodec/qdm2.c
View file @
9d35fa52
...
@@ -120,7 +120,7 @@ typedef struct {
...
@@ -120,7 +120,7 @@ typedef struct {
}
FFTCoefficient
;
}
FFTCoefficient
;
typedef
struct
{
typedef
struct
{
DECLARE_ALIGNED
(
16
,
QDM2Complex
,
complex
)[
MPA_MAX_CHANNELS
][
256
];
DECLARE_ALIGNED
(
32
,
QDM2Complex
,
complex
)[
MPA_MAX_CHANNELS
][
256
];
}
QDM2FFT
;
}
QDM2FFT
;
/**
/**
...
...
libavcodec/wma.h
View file @
9d35fa52
...
@@ -113,15 +113,15 @@ typedef struct WMACodecContext {
...
@@ -113,15 +113,15 @@ typedef struct WMACodecContext {
uint8_t
ms_stereo
;
///< true if mid/side stereo mode
uint8_t
ms_stereo
;
///< true if mid/side stereo mode
uint8_t
channel_coded
[
MAX_CHANNELS
];
///< true if channel is coded
uint8_t
channel_coded
[
MAX_CHANNELS
];
///< true if channel is coded
int
exponents_bsize
[
MAX_CHANNELS
];
///< log2 ratio frame/exp. length
int
exponents_bsize
[
MAX_CHANNELS
];
///< log2 ratio frame/exp. length
DECLARE_ALIGNED
(
16
,
float
,
exponents
)[
MAX_CHANNELS
][
BLOCK_MAX_SIZE
];
DECLARE_ALIGNED
(
32
,
float
,
exponents
)[
MAX_CHANNELS
][
BLOCK_MAX_SIZE
];
float
max_exponent
[
MAX_CHANNELS
];
float
max_exponent
[
MAX_CHANNELS
];
WMACoef
coefs1
[
MAX_CHANNELS
][
BLOCK_MAX_SIZE
];
WMACoef
coefs1
[
MAX_CHANNELS
][
BLOCK_MAX_SIZE
];
DECLARE_ALIGNED
(
16
,
float
,
coefs
)[
MAX_CHANNELS
][
BLOCK_MAX_SIZE
];
DECLARE_ALIGNED
(
32
,
float
,
coefs
)[
MAX_CHANNELS
][
BLOCK_MAX_SIZE
];
DECLARE_ALIGNED
(
16
,
FFTSample
,
output
)[
BLOCK_MAX_SIZE
*
2
];
DECLARE_ALIGNED
(
32
,
FFTSample
,
output
)[
BLOCK_MAX_SIZE
*
2
];
FFTContext
mdct_ctx
[
BLOCK_NB_SIZES
];
FFTContext
mdct_ctx
[
BLOCK_NB_SIZES
];
float
*
windows
[
BLOCK_NB_SIZES
];
float
*
windows
[
BLOCK_NB_SIZES
];
/* output buffer for one frame and the last for IMDCT windowing */
/* output buffer for one frame and the last for IMDCT windowing */
DECLARE_ALIGNED
(
16
,
float
,
frame_out
)[
MAX_CHANNELS
][
BLOCK_MAX_SIZE
*
2
];
DECLARE_ALIGNED
(
32
,
float
,
frame_out
)[
MAX_CHANNELS
][
BLOCK_MAX_SIZE
*
2
];
/* last frame info */
/* last frame info */
uint8_t
last_superframe
[
MAX_CODED_SUPERFRAME_SIZE
+
4
];
/* padding added */
uint8_t
last_superframe
[
MAX_CODED_SUPERFRAME_SIZE
+
4
];
/* padding added */
int
last_bitoffset
;
int
last_bitoffset
;
...
...
libavcodec/wmaprodec.c
View file @
9d35fa52
...
@@ -145,7 +145,7 @@ typedef struct {
...
@@ -145,7 +145,7 @@ typedef struct {
uint8_t
table_idx
;
///< index in sf_offsets for the scale factor reference block
uint8_t
table_idx
;
///< index in sf_offsets for the scale factor reference block
float
*
coeffs
;
///< pointer to the subframe decode buffer
float
*
coeffs
;
///< pointer to the subframe decode buffer
uint16_t
num_vec_coeffs
;
///< number of vector coded coefficients
uint16_t
num_vec_coeffs
;
///< number of vector coded coefficients
DECLARE_ALIGNED
(
16
,
float
,
out
)[
WMAPRO_BLOCK_MAX_SIZE
+
WMAPRO_BLOCK_MAX_SIZE
/
2
];
///< output buffer
DECLARE_ALIGNED
(
32
,
float
,
out
)[
WMAPRO_BLOCK_MAX_SIZE
+
WMAPRO_BLOCK_MAX_SIZE
/
2
];
///< output buffer
}
WMAProChannelCtx
;
}
WMAProChannelCtx
;
/**
/**
...
@@ -170,7 +170,7 @@ typedef struct WMAProDecodeCtx {
...
@@ -170,7 +170,7 @@ typedef struct WMAProDecodeCtx {
FF_INPUT_BUFFER_PADDING_SIZE
];
///< compressed frame data
FF_INPUT_BUFFER_PADDING_SIZE
];
///< compressed frame data
PutBitContext
pb
;
///< context for filling the frame_data buffer
PutBitContext
pb
;
///< context for filling the frame_data buffer
FFTContext
mdct_ctx
[
WMAPRO_BLOCK_SIZES
];
///< MDCT context per block size
FFTContext
mdct_ctx
[
WMAPRO_BLOCK_SIZES
];
///< MDCT context per block size
DECLARE_ALIGNED
(
16
,
float
,
tmp
)[
WMAPRO_BLOCK_MAX_SIZE
];
///< IMDCT output buffer
DECLARE_ALIGNED
(
32
,
float
,
tmp
)[
WMAPRO_BLOCK_MAX_SIZE
];
///< IMDCT output buffer
float
*
windows
[
WMAPRO_BLOCK_SIZES
];
///< windows for the different block sizes
float
*
windows
[
WMAPRO_BLOCK_SIZES
];
///< windows for the different block sizes
/* frame size dependent frame information (set during initialization) */
/* frame size dependent frame information (set during initialization) */
...
...
libavcodec/wmavoice.c
View file @
9d35fa52
...
@@ -275,11 +275,11 @@ typedef struct {
...
@@ -275,11 +275,11 @@ typedef struct {
///< by postfilter
///< by postfilter
float
denoise_filter_cache
[
MAX_FRAMESIZE
];
float
denoise_filter_cache
[
MAX_FRAMESIZE
];
int
denoise_filter_cache_size
;
///< samples in #denoise_filter_cache
int
denoise_filter_cache_size
;
///< samples in #denoise_filter_cache
DECLARE_ALIGNED
(
16
,
float
,
tilted_lpcs_pf
)[
0x80
];
DECLARE_ALIGNED
(
32
,
float
,
tilted_lpcs_pf
)[
0x80
];
///< aligned buffer for LPC tilting
///< aligned buffer for LPC tilting
DECLARE_ALIGNED
(
16
,
float
,
denoise_coeffs_pf
)[
0x80
];
DECLARE_ALIGNED
(
32
,
float
,
denoise_coeffs_pf
)[
0x80
];
///< aligned buffer for denoise coefficients
///< aligned buffer for denoise coefficients
DECLARE_ALIGNED
(
16
,
float
,
synth_filter_out_buf
)[
0x80
+
MAX_LSPS_ALIGN16
];
DECLARE_ALIGNED
(
32
,
float
,
synth_filter_out_buf
)[
0x80
+
MAX_LSPS_ALIGN16
];
///< aligned buffer for postfilter speech
///< aligned buffer for postfilter speech
///< synthesis
///< synthesis
/**
/**
...
...
libavcodec/x86/fft.c
View file @
9d35fa52
...
@@ -25,7 +25,14 @@ av_cold void ff_fft_init_mmx(FFTContext *s)
...
@@ -25,7 +25,14 @@ av_cold void ff_fft_init_mmx(FFTContext *s)
{
{
#if HAVE_YASM
#if HAVE_YASM
int
has_vectors
=
av_get_cpu_flags
();
int
has_vectors
=
av_get_cpu_flags
();
if
(
has_vectors
&
AV_CPU_FLAG_SSE
&&
HAVE_SSE
)
{
if
(
has_vectors
&
AV_CPU_FLAG_AVX
&&
HAVE_AVX
&&
s
->
nbits
>=
5
)
{
/* AVX for SB */
s
->
imdct_calc
=
ff_imdct_calc_sse
;
s
->
imdct_half
=
ff_imdct_half_avx
;
s
->
fft_permute
=
ff_fft_permute_sse
;
s
->
fft_calc
=
ff_fft_calc_avx
;
s
->
fft_permutation
=
FF_FFT_PERM_AVX
;
}
else
if
(
has_vectors
&
AV_CPU_FLAG_SSE
&&
HAVE_SSE
)
{
/* SSE for P3/P4/K8 */
/* SSE for P3/P4/K8 */
s
->
imdct_calc
=
ff_imdct_calc_sse
;
s
->
imdct_calc
=
ff_imdct_calc_sse
;
s
->
imdct_half
=
ff_imdct_half_sse
;
s
->
imdct_half
=
ff_imdct_half_sse
;
...
...
libavcodec/x86/fft.h
View file @
9d35fa52
...
@@ -22,6 +22,7 @@
...
@@ -22,6 +22,7 @@
#include "libavcodec/fft.h"
#include "libavcodec/fft.h"
void
ff_fft_permute_sse
(
FFTContext
*
s
,
FFTComplex
*
z
);
void
ff_fft_permute_sse
(
FFTContext
*
s
,
FFTComplex
*
z
);
void
ff_fft_calc_avx
(
FFTContext
*
s
,
FFTComplex
*
z
);
void
ff_fft_calc_sse
(
FFTContext
*
s
,
FFTComplex
*
z
);
void
ff_fft_calc_sse
(
FFTContext
*
s
,
FFTComplex
*
z
);
void
ff_fft_calc_3dn
(
FFTContext
*
s
,
FFTComplex
*
z
);
void
ff_fft_calc_3dn
(
FFTContext
*
s
,
FFTComplex
*
z
);
void
ff_fft_calc_3dn2
(
FFTContext
*
s
,
FFTComplex
*
z
);
void
ff_fft_calc_3dn2
(
FFTContext
*
s
,
FFTComplex
*
z
);
...
@@ -32,6 +33,7 @@ void ff_imdct_calc_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input
...
@@ -32,6 +33,7 @@ void ff_imdct_calc_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input
void
ff_imdct_half_3dn2
(
FFTContext
*
s
,
FFTSample
*
output
,
const
FFTSample
*
input
);
void
ff_imdct_half_3dn2
(
FFTContext
*
s
,
FFTSample
*
output
,
const
FFTSample
*
input
);
void
ff_imdct_calc_sse
(
FFTContext
*
s
,
FFTSample
*
output
,
const
FFTSample
*
input
);
void
ff_imdct_calc_sse
(
FFTContext
*
s
,
FFTSample
*
output
,
const
FFTSample
*
input
);
void
ff_imdct_half_sse
(
FFTContext
*
s
,
FFTSample
*
output
,
const
FFTSample
*
input
);
void
ff_imdct_half_sse
(
FFTContext
*
s
,
FFTSample
*
output
,
const
FFTSample
*
input
);
void
ff_imdct_half_avx
(
FFTContext
*
s
,
FFTSample
*
output
,
const
FFTSample
*
input
);
void
ff_dct32_float_sse
(
FFTSample
*
out
,
const
FFTSample
*
in
);
void
ff_dct32_float_sse
(
FFTSample
*
out
,
const
FFTSample
*
in
);
#endif
#endif
libavcodec/x86/fft_mmx.asm
View file @
9d35fa52
;******************************************************************************
;******************************************************************************
;* FFT transform with SSE/3DNow optimizations
;* FFT transform with SSE/3DNow optimizations
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2011 Vitor Sessak
;*
;*
;* This algorithm (though not any of the implementation details) is
;* This algorithm (though not any of the implementation details) is
;* based on libdjbfft by D. J. Bernstein.
;* based on libdjbfft by D. J. Bernstein.
...
@@ -49,9 +50,21 @@ endstruc
...
@@ -49,9 +50,21 @@ endstruc
SECTION_RODATA
SECTION_RODATA
%define
M_SQRT1_2
0
.
70710678118654752440
%define
M_SQRT1_2
0
.
70710678118654752440
ps_root2
:
times
4
dd
M_SQRT1_2
%define
M_COS_PI_1_8
0
.
923879532511287
ps_root2mppm
:
dd
-
M_SQRT1_2
,
M_SQRT1_2
,
M_SQRT1_2
,
-
M_SQRT1_2
%define
M_COS_PI_3_8
0
.
38268343236509
ps_p1p1m1p1
:
dd
0
,
0
,
1
<<
31
,
0
align
32
ps_cos16_1
:
dd
1
.
0
,
M_COS_PI_1_8
,
M_SQRT1_2
,
M_COS_PI_3_8
,
1
.
0
,
M_COS_PI_1_8
,
M_SQRT1_2
,
M_COS_PI_3_8
ps_cos16_2
:
dd
0
,
M_COS_PI_3_8
,
M_SQRT1_2
,
M_COS_PI_1_8
,
0
,
-
M_COS_PI_3_8
,
-
M_SQRT1_2
,
-
M_COS_PI_1_8
ps_root2
:
times
8
dd
M_SQRT1_2
ps_root2mppm
:
dd
-
M_SQRT1_2
,
M_SQRT1_2
,
M_SQRT1_2
,
-
M_SQRT1_2
,
-
M_SQRT1_2
,
M_SQRT1_2
,
M_SQRT1_2
,
-
M_SQRT1_2
ps_p1p1m1p1
:
dd
0
,
0
,
1
<<
31
,
0
,
0
,
0
,
1
<<
31
,
0
perm1
:
dd
0x00
,
0x02
,
0x03
,
0x01
,
0x03
,
0x00
,
0x02
,
0x01
perm2
:
dd
0x00
,
0x01
,
0x02
,
0x03
,
0x01
,
0x00
,
0x02
,
0x03
ps_p1p1m1p1root2
:
dd
1
.
0
,
1
.
0
,
-
1
.
0
,
1
.
0
,
M_SQRT1_2
,
M_SQRT1_2
,
M_SQRT1_2
,
M_SQRT1_2
ps_m1m1p1m1p1m1m1m1
:
dd
1
<<
31
,
1
<<
31
,
0
,
1
<<
31
,
0
,
1
<<
31
,
1
<<
31
,
1
<<
31
ps_m1p1
:
dd
1
<<
31
,
0
ps_m1p1
:
dd
1
<<
31
,
0
%assign
i
16
%assign
i
16
...
@@ -96,51 +109,80 @@ section .text align=16
...
@@ -96,51 +109,80 @@ section .text align=16
SWAP
%3
,
%6
SWAP
%3
,
%6
%endmacro
%endmacro
; in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
; %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
; %3, %4, %5 tmp
; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
; %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
%macro
T8_AVX
5
vsubps
%5
,
%1
,
%2
; v = %1 - %2
vaddps
%3
,
%1
,
%2
; w = %1 + %2
vmulps
%2
,
%5
,
[
ps_p1p1m1p1root2
]
; v *= vals1
vpermilps
%2
,
%2
,
[perm1]
vblendps
%1
,
%2
,
%3
,
0x33
; q = {w1,w2,v4,v2,w5,w6,v7,v6}
vshufps
%5
,
%3
,
%2
,
0x4e
; r = {w3,w4,v1,v3,w7,w8,v8,v5}
vsubps
%4
,
%5
,
%1
; s = r - q
vaddps
%1
,
%5
,
%1
; u = r + q
vpermilps
%1
,
%1
,
[perm2]
; k = {u1,u2,u3,u4,u6,u5,u7,u8}
vshufps
%5
,
%4
,
%1
,
0xbb
vshufps
%3
,
%4
,
%1
,
0xee
vperm2f128
%3
,
%3
,
%5
,
0x13
vxorps
%4
,
%4
,
[
ps_m1m1p1m1p1m1m1m1
]
; s *= {1,1,-1,-1,1,-1,-1,-1}
vshufps
%2
,
%1
,
%4
,
0xdd
vshufps
%1
,
%1
,
%4
,
0x88
vperm2f128
%4
,
%2
,
%1
,
0x02
; v = {k1,k3,s1,s3,k2,k4,s2,s4}
vperm2f128
%1
,
%1
,
%2
,
0x13
; w = {k6,k8,s6,s8,k5,k7,s5,s7}
vsubps
%5
,
%1
,
%3
vblendps
%1
,
%5
,
%1
,
0x55
; w -= {0,s7,0,k7,0,s8,0,k8}
vsubps
%2
,
%4
,
%1
; %2 = v - w
vaddps
%1
,
%4
,
%1
; %1 = v + w
%endmacro
; In SSE mode do one fft4 transforms
; in: %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
; in: %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
;
; In AVX mode do two fft4 transforms
; in: %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
%macro
T4_SSE
3
%macro
T4_SSE
3
mova
%3
,
%1
subps
%3
,
%1
,
%2
; {t3,t4,-t8,t7}
addps
%1
,
%2
; {t1,t2,t6,t5}
addps
%1
,
%1
,
%2
; {t1,t2,t6,t5}
subps
%3
,
%2
; {t3,t4,-t8,t7}
xorps
%3
,
%3
,
[
ps_p1p1m1p1
]
xorps
%3
,
[
ps_p1p1m1p1
]
shufps
%2
,
%1
,
%3
,
0xbe
; {t6,t5,t7,t8}
mova
%2
,
%1
shufps
%1
,
%1
,
%3
,
0x44
; {t1,t2,t3,t4}
shufps
%1
,
%3
,
0x44
; {t1,t2,t3,t4}
subps
%3
,
%1
,
%2
; {r2,i2,r3,i3}
shufps
%2
,
%3
,
0xbe
; {t6,t5,t7,t8}
addps
%1
,
%1
,
%2
; {r0,i0,r1,i1}
mova
%3
,
%1
shufps
%2
,
%1
,
%3
,
0xdd
; {i0,i1,i2,i3}
addps
%1
,
%2
; {r0,i0,r1,i1}
shufps
%1
,
%1
,
%3
,
0x88
; {r0,r1,r2,r3}
subps
%3
,
%2
; {r2,i2,r3,i3}
mova
%2
,
%1
shufps
%1
,
%3
,
0x88
; {r0,r1,r2,r3}
shufps
%2
,
%3
,
0xdd
; {i0,i1,i2,i3}
%endmacro
%endmacro
; In SSE mode do one FFT8
; in: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
; in: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %1={r4,r5,r6,r7} %2={i4,i5,i6,i7}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %1={r4,r5,r6,r7} %2={i4,i5,i6,i7}
;
; In AVX mode do two FFT8
; in: %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
; %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
; %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
%macro
T8_SSE
6
%macro
T8_SSE
6
mova
%6
,
%3
addps
%6
,
%3
,
%4
; {t1,t2,t3,t4}
subps
%3
,
%4
; {r5,i5,r7,i7}
subps
%3
,
%3
,
%4
; {r5,i5,r7,i7}
addps
%6
,
%4
; {t1,t2,t3,t4}
shufps
%4
,
%3
,
%3
,
0xb1
; {i5,r5,i7,r7}
mova
%4
,
%3
mulps
%3
,
%3
,
[
ps_root2mppm
]
; {-r5,i5,r7,-i7}
shufps
%4
,
%4
,
0xb1
; {i5,r5,i7,r7}
mulps
%4
,
%4
,
[
ps_root2
]
mulps
%3
,
[
ps_root2mppm
]
; {-r5,i5,r7,-i7}
addps
%3
,
%3
,
%4
; {t8,t7,ta,t9}
mulps
%4
,
[
ps_root2
]
shufps
%4
,
%6
,
%3
,
0x9c
; {t1,t4,t7,ta}
addps
%3
,
%4
; {t8,t7,ta,t9}
shufps
%6
,
%6
,
%3
,
0x36
; {t3,t2,t9,t8}
mova
%4
,
%6
subps
%3
,
%6
,
%4
; {t6,t5,tc,tb}
shufps
%6
,
%3
,
0x36
; {t3,t2,t9,t8}
addps
%6
,
%6
,
%4
; {t1,t2,t9,ta}
shufps
%4
,
%3
,
0x9c
; {t1,t4,t7,ta}
shufps
%5
,
%6
,
%3
,
0x8d
; {t2,ta,t6,tc}
mova
%3
,
%6
shufps
%6
,
%6
,
%3
,
0xd8
; {t1,t9,t5,tb}
addps
%6
,
%4
; {t1,t2,t9,ta}
subps
%3
,
%1
,
%6
; {r4,r5,r6,r7}
subps
%3
,
%4
; {t6,t5,tc,tb}
addps
%1
,
%1
,
%6
; {r0,r1,r2,r3}
mova
%4
,
%6
subps
%4
,
%2
,
%5
; {i4,i5,i6,i7}
shufps
%6
,
%3
,
0xd8
; {t1,t9,t5,tb}
addps
%2
,
%2
,
%5
; {i0,i1,i2,i3}
shufps
%4
,
%3
,
0x8d
; {t2,ta,t6,tc}
mova
%3
,
%1
mova
%5
,
%2
addps
%1
,
%6
; {r0,r1,r2,r3}
addps
%2
,
%4
; {i0,i1,i2,i3}
subps
%3
,
%6
; {r4,r5,r6,r7}
subps
%5
,
%4
; {i4,i5,i6,i7}
SWAP
%4
,
%5
%endmacro
%endmacro
; scheduled for cpu-bound sizes
; scheduled for cpu-bound sizes
...
@@ -148,52 +190,44 @@ section .text align=16
...
@@ -148,52 +190,44 @@ section .text align=16
IF%1
mova
m4
,
Z
(
4
)
IF%1
mova
m4
,
Z
(
4
)
IF%1
mova
m5
,
Z
(
5
)
IF%1
mova
m5
,
Z
(
5
)
mova
m0
,
%2
; wre
mova
m0
,
%2
; wre
mova
m2
,
m4
mova
m1
,
%3
; wim
mova
m1
,
%3
; wim
mova
m3
,
m5
mulps
m2
,
m4
,
m0
; r2*wre
mulps
m2
,
m0
; r2*wre
IF%1
mova
m6
,
Z2
(
6
)
IF%1
mova
m6
,
Z2
(
6
)
mulps
m3
,
m1
; i2*wim
mulps
m3
,
m
5
,
m
1
; i2*wim
IF%1
mova
m7
,
Z2
(
7
)
IF%1
mova
m7
,
Z2
(
7
)
mulps
m4
,
m1
; r2*wim
mulps
m4
,
m4
,
m1
; r2*wim
mulps
m5
,
m0
; i2*wre
mulps
m5
,
m5
,
m0
; i2*wre
addps
m2
,
m3
; r2*wre + i2*wim
addps
m2
,
m2
,
m3
; r2*wre + i2*wim
mova
m3
,
m1
mulps
m3
,
m1
,
m7
; i3*wim
mulps
m1
,
m6
; r3*wim
subps
m5
,
m5
,
m4
; i2*wre - r2*wim
subps
m5
,
m4
; i2*wre - r2*wim
mulps
m1
,
m1
,
m6
; r3*wim
mova
m4
,
m0
mulps
m4
,
m0
,
m6
; r3*wre
mulps
m3
,
m7
; i3*wim
mulps
m0
,
m0
,
m7
; i3*wre
mulps
m4
,
m6
; r3*wre
subps
m4
,
m4
,
m3
; r3*wre - i3*wim
mulps
m0
,
m7
; i3*wre
subps
m4
,
m3
; r3*wre - i3*wim
mova
m3
,
Z
(
0
)
mova
m3
,
Z
(
0
)
addps
m0
,
m1
; i3*wre + r3*wim
addps
m0
,
m0
,
m1
; i3*wre + r3*wim
mova
m1
,
m4
subps
m1
,
m4
,
m2
; t3
addps
m4
,
m2
; t5
addps
m4
,
m4
,
m2
; t5
subps
m1
,
m2
; t3
subps
m3
,
m3
,
m4
; r2
subps
m3
,
m4
; r2
addps
m4
,
m4
,
Z
(
0
)
; r0
addps
m4
,
Z
(
0
)
; r0
mova
m6
,
Z
(
2
)
mova
m6
,
Z
(
2
)
mova
Z
(
4
),
m3
mova
Z
(
4
),
m3
mova
Z
(
0
),
m4
mova
Z
(
0
),
m4
mova
m3
,
m5
subps
m3
,
m5
,
m0
; t4
subps
m5
,
m0
; t4
subps
m4
,
m6
,
m3
; r3
mova
m4
,
m6
addps
m3
,
m3
,
m6
; r1
subps
m6
,
m5
; r3
mova
Z2
(
6
),
m4
addps
m5
,
m4
; r1
mova
Z
(
2
),
m3
mova
Z2
(
6
),
m6
mova
Z
(
2
),
m5
mova
m2
,
Z
(
3
)
mova
m2
,
Z
(
3
)
addps
m3
,
m0
; t6
addps
m3
,
m
5
,
m
0
; t6
subps
m2
,
m1
; i3
subps
m2
,
m
2
,
m
1
; i3
mova
m7
,
Z
(
1
)
mova
m7
,
Z
(
1
)
addps
m1
,
Z
(
3
)
; i1
addps
m1
,
m1
,
Z
(
3
)
; i1
mova
Z2
(
7
),
m2
mova
Z2
(
7
),
m2
mova
Z
(
3
),
m1
mova
Z
(
3
),
m1
mova
m4
,
m7
subps
m4
,
m7
,
m3
; i2
subps
m7
,
m3
; i2
addps
m3
,
m3
,
m7
; i0
addps
m3
,
m4
; i0
mova
Z
(
5
),
m4
mova
Z
(
5
),
m7
mova
Z
(
1
),
m3
mova
Z
(
1
),
m3
%endmacro
%endmacro
...
@@ -201,77 +235,55 @@ IF%1 mova m7, Z2(7)
...
@@ -201,77 +235,55 @@ IF%1 mova m7, Z2(7)
%macro
PASS_BIG
1
; (!interleave)
%macro
PASS_BIG
1
; (!interleave)
mova
m4
,
Z
(
4
)
; r2
mova
m4
,
Z
(
4
)
; r2
mova
m5
,
Z
(
5
)
; i2
mova
m5
,
Z
(
5
)
; i2
mova
m2
,
m4
mova
m0
,
[wq]
; wre
mova
m0
,
[wq]
; wre
mova
m3
,
m5
mova
m1
,
[
wq
+
o1q
]
; wim
mova
m1
,
[
wq
+
o1q
]
; wim
mulps
m2
,
m0
; r2*wre
mulps
m2
,
m
4
,
m
0
; r2*wre
mova
m6
,
Z2
(
6
)
; r3
mova
m6
,
Z2
(
6
)
; r3
mulps
m3
,
m1
; i2*wim
mulps
m3
,
m
5
,
m
1
; i2*wim
mova
m7
,
Z2
(
7
)
; i3
mova
m7
,
Z2
(
7
)
; i3
mulps
m4
,
m1
; r2*wim
mulps
m4
,
m4
,
m1
; r2*wim
mulps
m5
,
m0
; i2*wre
mulps
m5
,
m5
,
m0
; i2*wre
addps
m2
,
m3
; r2*wre + i2*wim
addps
m2
,
m2
,
m3
; r2*wre + i2*wim
mova
m3
,
m1
mulps
m3
,
m1
,
m7
; i3*wim
mulps
m1
,
m6
; r3*wim
mulps
m1
,
m1
,
m6
; r3*wim
subps
m5
,
m4
; i2*wre - r2*wim
subps
m5
,
m5
,
m4
; i2*wre - r2*wim
mova
m4
,
m0
mulps
m4
,
m0
,
m6
; r3*wre
mulps
m3
,
m7
; i3*wim
mulps
m0
,
m0
,
m7
; i3*wre
mulps
m4
,
m6
; r3*wre
subps
m4
,
m4
,
m3
; r3*wre - i3*wim
mulps
m0
,
m7
; i3*wre
subps
m4
,
m3
; r3*wre - i3*wim
mova
m3
,
Z
(
0
)
mova
m3
,
Z
(
0
)
addps
m0
,
m1
; i3*wre + r3*wim
addps
m0
,
m0
,
m1
; i3*wre + r3*wim
mova
m1
,
m4
subps
m1
,
m4
,
m2
; t3
addps
m4
,
m2
; t5
addps
m4
,
m4
,
m2
; t5
subps
m1
,
m2
; t3
subps
m3
,
m3
,
m4
; r2
subps
m3
,
m4
; r2
addps
m4
,
m4
,
Z
(
0
)
; r0
addps
m4
,
Z
(
0
)
; r0
mova
m6
,
Z
(
2
)
mova
m6
,
Z
(
2
)
mova
Z
(
4
),
m3
mova
Z
(
4
),
m3
mova
Z
(
0
),
m4
mova
Z
(
0
),
m4
mova
m3
,
m5
subps
m3
,
m5
,
m0
; t4
subps
m5
,
m0
; t4
subps
m4
,
m6
,
m3
; r3
mova
m4
,
m6
addps
m3
,
m3
,
m6
; r1
subps
m6
,
m5
; r3
IF%1
mova
Z2
(
6
),
m4
addps
m5
,
m4
; r1
IF%1
mova
Z
(
2
),
m3
IF%1
mova
Z2
(
6
),
m6
IF%1
mova
Z
(
2
),
m5
mova
m2
,
Z
(
3
)
mova
m2
,
Z
(
3
)
addps
m
3
,
m0
; t6
addps
m
5
,
m5
,
m0
; t6
subps
m2
,
m1
; i3
subps
m2
,
m
2
,
m
1
; i3
mova
m7
,
Z
(
1
)
mova
m7
,
Z
(
1
)
addps
m1
,
Z
(
3
)
; i1
addps
m1
,
m1
,
Z
(
3
)
; i1
IF%1
mova
Z2
(
7
),
m2
IF%1
mova
Z2
(
7
),
m2
IF%1
mova
Z
(
3
),
m1
IF%1
mova
Z
(
3
),
m1
mova
m4
,
m7
subps
m6
,
m7
,
m5
; i2
subps
m7
,
m3
; i2
addps
m5
,
m5
,
m7
; i0
addps
m3
,
m4
; i0
IF%1
mova
Z
(
5
),
m6
IF%1
mova
Z
(
5
),
m7
IF%1
mova
Z
(
1
),
m5
IF%1
mova
Z
(
1
),
m3
%if
%1
==
0
%if
%1
==
0
mova
m4
,
m5
; r1
INTERL
m1
,
m3
,
m7
,
Z
,
2
mova
m0
,
m6
; r3
INTERL
m2
,
m4
,
m0
,
Z2
,
6
unpcklps
m5
,
m1
unpckhps
m4
,
m1
unpcklps
m6
,
m2
unpckhps
m0
,
m2
mova
m1
,
Z
(
0
)
mova
m1
,
Z
(
0
)
mova
m2
,
Z
(
4
)
mova
m2
,
Z
(
4
)
mova
Z
(
2
),
m5
mova
Z
(
3
),
m4
INTERL
m5
,
m1
,
m3
,
Z
,
0
mova
Z2
(
6
),
m6
INTERL
m6
,
m2
,
m7
,
Z
,
4
mova
Z2
(
7
),
m0
mova
m5
,
m1
; r0
mova
m4
,
m2
; r2
unpcklps
m1
,
m3
unpckhps
m5
,
m3
unpcklps
m2
,
m7
unpckhps
m4
,
m7
mova
Z
(
0
),
m1
mova
Z
(
1
),
m5
mova
Z
(
4
),
m2
mova
Z
(
5
),
m4
%endif
%endif
%endmacro
%endmacro
...
@@ -281,13 +293,106 @@ IF%1 mova Z(1), m3
...
@@ -281,13 +293,106 @@ IF%1 mova Z(1), m3
punpckhdq
%3
,
%2
punpckhdq
%3
,
%2
%endmacro
%endmacro
INIT_XMM
%define
mova
movaps
%define
Z
(
x
)
[
r0
+
mmsize
*
x
]
%define
Z
(
x
)
[
r0
+
mmsize
*
x
]
%define
Z2
(
x
)
[
r0
+
mmsize
*
x
]
%define
Z2
(
x
)
[
r0
+
mmsize
*
x
]
%define
ZH
(
x
)
[
r0
+
mmsize
*
x
+
mmsize
/
2
]
INIT_YMM
align
16
fft8_avx
:
mova
m0
,
Z
(
0
)
mova
m1
,
Z
(
1
)
T8_AVX
m0
,
m1
,
m2
,
m3
,
m4
mova
Z
(
0
),
m0
mova
Z
(
1
),
m1
ret
align
16
fft16_avx
:
mova
m2
,
Z
(
2
)
mova
m3
,
Z
(
3
)
T4_SSE
m2
,
m3
,
m7
mova
m0
,
Z
(
0
)
mova
m1
,
Z
(
1
)
T8_AVX
m0
,
m1
,
m4
,
m5
,
m7
mova
m4
,
[
ps_cos16_1
]
mova
m5
,
[
ps_cos16_2
]
vmulps
m6
,
m2
,
m4
vmulps
m7
,
m3
,
m5
vaddps
m7
,
m7
,
m6
vmulps
m2
,
m2
,
m5
vmulps
m3
,
m3
,
m4
vsubps
m3
,
m3
,
m2
vblendps
m2
,
m7
,
m3
,
0xf0
vperm2f128
m3
,
m7
,
m3
,
0x21
vaddps
m4
,
m2
,
m3
vsubps
m2
,
m3
,
m2
vperm2f128
m2
,
m2
,
m2
,
0x01
vsubps
m3
,
m1
,
m2
vaddps
m1
,
m1
,
m2
vsubps
m5
,
m0
,
m4
vaddps
m0
,
m0
,
m4
vextractf128
Z
(
0
),
m0
,
0
vextractf128
ZH
(
0
),
m1
,
0
vextractf128
Z
(
1
),
m0
,
1
vextractf128
ZH
(
1
),
m1
,
1
vextractf128
Z
(
2
),
m5
,
0
vextractf128
ZH
(
2
),
m3
,
0
vextractf128
Z
(
3
),
m5
,
1
vextractf128
ZH
(
3
),
m3
,
1
ret
align
16
fft32_avx
:
call
fft16_avx
mova
m0
,
Z
(
4
)
mova
m1
,
Z
(
5
)
T4_SSE
m0
,
m1
,
m4
mova
m2
,
Z
(
6
)
mova
m3
,
Z
(
7
)
T8_SSE
m0
,
m1
,
m2
,
m3
,
m4
,
m6
; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11}
; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15}
vperm2f128
m4
,
m0
,
m2
,
0x20
vperm2f128
m5
,
m1
,
m3
,
0x20
vperm2f128
m6
,
m0
,
m2
,
0x31
vperm2f128
m7
,
m1
,
m3
,
0x31
PASS_SMALL
0
,
[
cos_32
]
,
[
cos_32
+
32
]
ret
fft32_interleave_avx
:
call
fft32_avx
mov
r2d
,
32
.
deint_loop
:
mova
m2
,
Z
(
0
)
mova
m3
,
Z
(
1
)
vunpcklps
m0
,
m2
,
m3
vunpckhps
m1
,
m2
,
m3
vextractf128
Z
(
0
),
m0
,
0
vextractf128
ZH
(
0
),
m1
,
0
vextractf128
Z
(
1
),
m0
,
1
vextractf128
ZH
(
1
),
m1
,
1
add
r0
,
mmsize
*
2
sub
r2d
,
mmsize
/
4
jg
.
deint_loop
ret
INIT_XMM
%define
movdqa
movaps
align
16
align
16
fft4_avx
:
fft4_sse
:
fft4_sse
:
mova
m0
,
Z
(
0
)
mova
m0
,
Z
(
0
)
mova
m1
,
Z
(
1
)
mova
m1
,
Z
(
1
)
...
@@ -406,6 +511,8 @@ FFT48_3DN _3dn
...
@@ -406,6 +511,8 @@ FFT48_3DN _3dn
%define
Z
(
x
)
[
zq
+
o1q
*
(
x
&
6
)
+
mmsize
*
(
x
&
1
)
]
%define
Z
(
x
)
[
zq
+
o1q
*
(
x
&
6
)
+
mmsize
*
(
x
&
1
)
]
%define
Z2
(
x
)
[
zq
+
o3q
+
mmsize
*
(
x
&
1
)
]
%define
Z2
(
x
)
[
zq
+
o3q
+
mmsize
*
(
x
&
1
)
]
%define
ZH
(
x
)
[
zq
+
o1q
*
(
x
&
6
)
+
mmsize
*
(
x
&
1
)
+
mmsize
/
2
]
%define
Z2H
(
x
)
[
zq
+
o3q
+
mmsize
*
(
x
&
1
)
+
mmsize
/
2
]
%macro
DECL_PASS
2
+
; name, payload
%macro
DECL_PASS
2
+
; name, payload
align
16
align
16
...
@@ -423,8 +530,34 @@ DEFINE_ARGS z, w, n, o1, o3
...
@@ -423,8 +530,34 @@ DEFINE_ARGS z, w, n, o1, o3
rep
ret
rep
ret
%endmacro
%endmacro
INIT_YMM
%macro
INTERL_AVX
5
vunpckhps
%3
,
%2
,
%1
vunpcklps
%2
,
%2
,
%1
vextractf128
%4
(
%5
),
%2
,
0
vextractf128
%4
%
+
H
(
%5
),
%3
,
0
vextractf128
%4
(
%5
+
1
),
%2
,
1
vextractf128
%4
%
+
H
(
%5
+
1
),
%3
,
1
%endmacro
%define
INTERL
INTERL_AVX
DECL_PASS
pass_avx
,
PASS_BIG
1
DECL_PASS
pass_interleave_avx
,
PASS_BIG
0
INIT_XMM
INIT_XMM
%define
mova
movaps
%macro
INTERL_SSE
5
mova
%3
,
%2
unpcklps
%2
,
%1
unpckhps
%3
,
%1
mova
%4
(
%5
),
%2
mova
%4
(
%5
+
1
),
%3
%endmacro
%define
INTERL
INTERL_SSE
DECL_PASS
pass_sse
,
PASS_BIG
1
DECL_PASS
pass_sse
,
PASS_BIG
1
DECL_PASS
pass_interleave_sse
,
PASS_BIG
0
DECL_PASS
pass_interleave_sse
,
PASS_BIG
0
...
@@ -457,9 +590,12 @@ DECL_PASS pass_interleave_3dn, PASS_BIG 0
...
@@ -457,9 +590,12 @@ DECL_PASS pass_interleave_3dn, PASS_BIG 0
%macro
DECL_FFT
2
-
3
; nbits, cpu, suffix
%macro
DECL_FFT
2
-
3
; nbits, cpu, suffix
%xdefine
list_of_fft
fft4
%2
SECTION_REL
,
fft8
%2
SECTION_REL
%xdefine
list_of_fft
fft4
%2
SECTION_REL
,
fft8
%2
SECTION_REL
%if
%1
=
=
5
%if
%1
>
=
5
%xdefine
list_of_fft
list_of_fft
,
fft16
%2
SECTION_REL
%xdefine
list_of_fft
list_of_fft
,
fft16
%2
SECTION_REL
%endif
%endif
%if
%1
>=
6
%xdefine
list_of_fft
list_of_fft
,
fft32
%3%2
SECTION_REL
%endif
%assign
n
1
<<
%1
%assign
n
1
<<
%1
%rep
17
-
%1
%rep
17
-
%1
...
@@ -492,9 +628,14 @@ section .text
...
@@ -492,9 +628,14 @@ section .text
; The others pass args in registers and don't spill anything.
; The others pass args in registers and don't spill anything.
cglobal
fft_dispatch
%3%2
,
2
,
5
,
8
,
z
,
nbits
cglobal
fft_dispatch
%3%2
,
2
,
5
,
8
,
z
,
nbits
FFT_DISPATCH
%3%2
,
nbits
FFT_DISPATCH
%3%2
,
nbits
%ifidn
%2
,
_avx
vzeroupper
%endif
RET
RET
%endmacro
; DECL_FFT
%endmacro
; DECL_FFT
DECL_FFT
6
,
_avx
DECL_FFT
6
,
_avx
,
_interleave
DECL_FFT
5
,
_sse
DECL_FFT
5
,
_sse
DECL_FFT
5
,
_sse
,
_interleave
DECL_FFT
5
,
_sse
,
_interleave
DECL_FFT
4
,
_3dn
DECL_FFT
4
,
_3dn
...
@@ -533,21 +674,53 @@ INIT_XMM
...
@@ -533,21 +674,53 @@ INIT_XMM
%endmacro
%endmacro
%macro
CMUL
6
;j, xmm0, xmm1, 3, 4, 5
%macro
CMUL
6
;j, xmm0, xmm1, 3, 4, 5
movaps
xmm6
,
[
%4
+
%1
*
2
]
mulps
m6
,
%3
,
[
%5
+
%1
]
movaps
%2
,
[
%4
+
%1
*
2
+
0x10
]
mulps
m7
,
%2
,
[
%5
+
%1
]
movaps
%3
,
xmm6
mulps
%2
,
%2
,
[
%6
+
%1
]
movaps
xmm7
,
%2
mulps
%3
,
%3
,
[
%6
+
%1
]
mulps
xmm6
,
[
%5
+
%1
]
subps
%2
,
%2
,
m6
mulps
%2
,
[
%6
+
%1
]
addps
%3
,
%3
,
m7
mulps
%3
,
[
%6
+
%1
]
%endmacro
mulps
xmm7
,
[
%5
+
%1
]
subps
%2
,
xmm6
%macro
POSROTATESHUF_AVX
5
;j, k, z+n8, tcos+n8, tsin+n8
addps
%3
,
xmm7
.
post
:
vmovaps
ymm1
,
[
%3
+
%1
*
2
]
vmovaps
ymm0
,
[
%3
+
%1
*
2
+
0x20
]
vmovaps
ymm3
,
[
%3
+
%2
*
2
]
vmovaps
ymm2
,
[
%3
+
%2
*
2
+
0x20
]
CMUL
%1
,
ymm0
,
ymm1
,
%3
,
%4
,
%5
CMUL
%2
,
ymm2
,
ymm3
,
%3
,
%4
,
%5
vshufps
ymm1
,
ymm1
,
ymm1
,
0x1b
vshufps
ymm3
,
ymm3
,
ymm3
,
0x1b
vperm2f128
ymm1
,
ymm1
,
ymm1
,
0x01
vperm2f128
ymm3
,
ymm3
,
ymm3
,
0x01
vunpcklps
ymm6
,
ymm2
,
ymm1
vunpckhps
ymm4
,
ymm2
,
ymm1
vunpcklps
ymm7
,
ymm0
,
ymm3
vunpckhps
ymm5
,
ymm0
,
ymm3
vextractf128
[
%3
+
%1
*
2
]
,
ymm7
,
0
vextractf128
[
%3
+
%1
*
2
+
0x10
]
,
ymm5
,
0
vextractf128
[
%3
+
%1
*
2
+
0x20
]
,
ymm7
,
1
vextractf128
[
%3
+
%1
*
2
+
0x30
]
,
ymm5
,
1
vextractf128
[
%3
+
%2
*
2
]
,
ymm6
,
0
vextractf128
[
%3
+
%2
*
2
+
0x10
]
,
ymm4
,
0
vextractf128
[
%3
+
%2
*
2
+
0x20
]
,
ymm6
,
1
vextractf128
[
%3
+
%2
*
2
+
0x30
]
,
ymm4
,
1
sub
%2
,
0x20
add
%1
,
0x20
jl
.
post
%endmacro
%endmacro
%macro
POSROTATESHUF
5
;j, k, z+n8, tcos+n8, tsin+n8
%macro
POSROTATESHUF
5
;j, k, z+n8, tcos+n8, tsin+n8
.
post
:
.
post
:
movaps
xmm1
,
[
%3
+
%1
*
2
]
movaps
xmm0
,
[
%3
+
%1
*
2
+
0x10
]
CMUL
%1
,
xmm0
,
xmm1
,
%3
,
%4
,
%5
CMUL
%1
,
xmm0
,
xmm1
,
%3
,
%4
,
%5
movaps
xmm5
,
[
%3
+
%2
*
2
]
movaps
xmm4
,
[
%3
+
%2
*
2
+
0x10
]
CMUL
%2
,
xmm4
,
xmm5
,
%3
,
%4
,
%5
CMUL
%2
,
xmm4
,
xmm5
,
%3
,
%4
,
%5
shufps
xmm1
,
xmm1
,
0x1b
shufps
xmm1
,
xmm1
,
0x1b
shufps
xmm5
,
xmm5
,
0x1b
shufps
xmm5
,
xmm5
,
0x1b
...
@@ -566,7 +739,8 @@ INIT_XMM
...
@@ -566,7 +739,8 @@ INIT_XMM
jl
.
post
jl
.
post
%endmacro
%endmacro
cglobal
imdct_half_sse
,
3
,
7
,
8
; FFTContext *s, FFTSample *output, const FFTSample *input
%macro
DECL_IMDCT
2
cglobal
imdct_half
%1
,
3
,
7
,
8
; FFTContext *s, FFTSample *output, const FFTSample *input
%ifdef
ARCH_X86_64
%ifdef
ARCH_X86_64
%define
rrevtab
r10
%define
rrevtab
r10
%define
rtcos
r11
%define
rtcos
r11
...
@@ -641,7 +815,7 @@ cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample
...
@@ -641,7 +815,7 @@ cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample
mov
r0
,
r1
mov
r0
,
r1
mov
r1d
,
[
r5
+
FFTContext
.
nbits
]
mov
r1d
,
[
r5
+
FFTContext
.
nbits
]
FFT_DISPATCH
_sse
,
r1
FFT_DISPATCH
%1
,
r1
mov
r0d
,
[
r5
+
FFTContext
.
mdctsize
]
mov
r0d
,
[
r5
+
FFTContext
.
mdctsize
]
add
r6
,
r0
add
r6
,
r0
...
@@ -653,14 +827,24 @@ cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample
...
@@ -653,14 +827,24 @@ cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample
mov
rtsin
,
[
esp
+
4
]
mov
rtsin
,
[
esp
+
4
]
%endif
%endif
neg
r0
neg
r0
mov
r1
,
-
16
mov
r1
,
-
mmsize
sub
r1
,
r0
sub
r1
,
r0
POSROTATESHUF
r0
,
r1
,
r6
,
rtcos
,
rtsin
%2
r0
,
r1
,
r6
,
rtcos
,
rtsin
%ifdef
ARCH_X86_64
%ifdef
ARCH_X86_64
pop
r14
pop
r14
pop
r13
pop
r13
pop
r12
pop
r12
%else
%else
add
esp
,
12
add
esp
,
12
%endif
%ifidn
avx_enabled
,
1
vzeroupper
%endif
%endif
RET
RET
%endmacro
DECL_IMDCT
_sse
,
POSROTATESHUF
INIT_YMM
DECL_IMDCT
_avx
,
POSROTATESHUF_AVX
libavcodec/x86/fft_sse.c
View file @
9d35fa52
...
@@ -28,6 +28,12 @@ DECLARE_ASM_CONST(16, int, ff_m1m1m1m1)[4] =
...
@@ -28,6 +28,12 @@ DECLARE_ASM_CONST(16, int, ff_m1m1m1m1)[4] =
void
ff_fft_dispatch_sse
(
FFTComplex
*
z
,
int
nbits
);
void
ff_fft_dispatch_sse
(
FFTComplex
*
z
,
int
nbits
);
void
ff_fft_dispatch_interleave_sse
(
FFTComplex
*
z
,
int
nbits
);
void
ff_fft_dispatch_interleave_sse
(
FFTComplex
*
z
,
int
nbits
);
void
ff_fft_dispatch_interleave_avx
(
FFTComplex
*
z
,
int
nbits
);
void
ff_fft_calc_avx
(
FFTContext
*
s
,
FFTComplex
*
z
)
{
ff_fft_dispatch_interleave_avx
(
z
,
s
->
nbits
);
}
void
ff_fft_calc_sse
(
FFTContext
*
s
,
FFTComplex
*
z
)
void
ff_fft_calc_sse
(
FFTContext
*
s
,
FFTComplex
*
z
)
{
{
...
@@ -77,7 +83,7 @@ void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
...
@@ -77,7 +83,7 @@ void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
long
n
=
s
->
mdct_size
;
long
n
=
s
->
mdct_size
;
long
n4
=
n
>>
2
;
long
n4
=
n
>>
2
;
ff_imdct_half_sse
(
s
,
output
+
n4
,
input
);
s
->
imdct_half
(
s
,
output
+
n4
,
input
);
j
=
-
n
;
j
=
-
n
;
k
=
n
-
16
;
k
=
n
-
16
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment