Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
F
ffmpeg.wasm-core
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Linshizhi
ffmpeg.wasm-core
Commits
5ff01259
Commit
5ff01259
authored
Apr 20, 2012
by
Kieran Kunhya
Committed by
Justin Ruggles
May 21, 2012
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Convert vector_fmul range of functions to YASM and add AVX versions
Signed-off-by:
Justin Ruggles
<
justin.ruggles@gmail.com
>
parent
afeb3590
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
149 additions
and
158 deletions
+149
-158
aacsbrdata.h
libavcodec/aacsbrdata.h
+2
-2
aactab.c
libavcodec/aactab.c
+2
-2
aactab.h
libavcodec/aactab.h
+2
-2
dsputil.h
libavcodec/dsputil.h
+1
-1
ra288.c
libavcodec/ra288.c
+7
-7
ra288.h
libavcodec/ra288.h
+4
-4
sbr.h
libavcodec/sbr.h
+2
-2
sinewin.h
libavcodec/sinewin.h
+1
-1
dsputil_mmx.c
libavcodec/x86/dsputil_mmx.c
+23
-137
dsputil_yasm.asm
libavcodec/x86/dsputil_yasm.asm
+105
-0
No files found.
libavcodec/aacsbrdata.h
View file @
5ff01259
...
...
@@ -267,8 +267,8 @@ static const int8_t sbr_offset[6][16] = {
};
///< window coefficients for analysis/synthesis QMF banks
static
DECLARE_ALIGNED
(
16
,
float
,
sbr_qmf_window_ds
)[
320
];
static
DECLARE_ALIGNED
(
16
,
float
,
sbr_qmf_window_us
)[
640
]
=
{
static
DECLARE_ALIGNED
(
32
,
float
,
sbr_qmf_window_ds
)[
320
];
static
DECLARE_ALIGNED
(
32
,
float
,
sbr_qmf_window_us
)[
640
]
=
{
0
.
0000000000
,
-
0
.
00055252
86
,
-
0
.
00056176
92
,
-
0
.
0004
947518
,
-
0
.
0004
875227
,
-
0
.
0004
893791
,
-
0
.
0005040714
,
-
0
.
0005226564
,
-
0
.
0005466565
,
-
0
.
0005677
802
,
-
0
.
0005
870930
,
-
0
.
0006132747
,
...
...
libavcodec/aactab.c
View file @
5ff01259
...
...
@@ -33,8 +33,8 @@
#include <stdint.h>
DECLARE_ALIGNED
(
16
,
float
,
ff_aac_kbd_long_1024
)[
1024
];
DECLARE_ALIGNED
(
16
,
float
,
ff_aac_kbd_short_128
)[
128
];
DECLARE_ALIGNED
(
32
,
float
,
ff_aac_kbd_long_1024
)[
1024
];
DECLARE_ALIGNED
(
32
,
float
,
ff_aac_kbd_short_128
)[
128
];
const
uint8_t
ff_aac_num_swb_1024
[]
=
{
41
,
41
,
47
,
49
,
49
,
51
,
47
,
47
,
43
,
43
,
43
,
40
,
40
...
...
libavcodec/aactab.h
View file @
5ff01259
...
...
@@ -44,8 +44,8 @@
/* @name window coefficients
* @{
*/
DECLARE_ALIGNED
(
16
,
extern
float
,
ff_aac_kbd_long_1024
)[
1024
];
DECLARE_ALIGNED
(
16
,
extern
float
,
ff_aac_kbd_short_128
)[
128
];
DECLARE_ALIGNED
(
32
,
extern
float
,
ff_aac_kbd_long_1024
)[
1024
];
DECLARE_ALIGNED
(
32
,
extern
float
,
ff_aac_kbd_short_128
)[
128
];
// @}
/* @name number of scalefactor window bands for long and short transform windows respectively
...
...
libavcodec/dsputil.h
View file @
5ff01259
...
...
@@ -398,7 +398,7 @@ typedef struct DSPContext {
/* assume len is a multiple of 4, and arrays are 16-byte aligned */
void
(
*
vorbis_inverse_coupling
)(
float
*
mag
,
float
*
ang
,
int
blocksize
);
void
(
*
ac3_downmix
)(
float
(
*
samples
)[
256
],
float
(
*
matrix
)[
2
],
int
out_ch
,
int
in_ch
,
int
len
);
/* assume len is a multiple of 8, and arrays are 16-byte aligned */
/* assume len is a multiple of 16, and arrays are 32-byte aligned */
void
(
*
vector_fmul
)(
float
*
dst
,
const
float
*
src0
,
const
float
*
src1
,
int
len
);
void
(
*
vector_fmul_reverse
)(
float
*
dst
,
const
float
*
src0
,
const
float
*
src1
,
int
len
);
/* assume len is a multiple of 8, and src arrays are 16-byte aligned */
...
...
libavcodec/ra288.c
View file @
5ff01259
...
...
@@ -38,8 +38,8 @@
typedef
struct
{
AVFrame
frame
;
DSPContext
dsp
;
DECLARE_ALIGNED
(
16
,
float
,
sp_lpc
)[
FFALIGN
(
36
,
8
)];
///< LPC coefficients for speech data (spec: A)
DECLARE_ALIGNED
(
16
,
float
,
gain_lpc
)[
FFALIGN
(
10
,
8
)];
///< LPC coefficients for gain (spec: GB)
DECLARE_ALIGNED
(
32
,
float
,
sp_lpc
)[
FFALIGN
(
36
,
16
)];
///< LPC coefficients for speech data (spec: A)
DECLARE_ALIGNED
(
32
,
float
,
gain_lpc
)[
FFALIGN
(
10
,
16
)];
///< LPC coefficients for gain (spec: GB)
/** speech data history (spec: SB).
* Its first 70 coefficients are updated only at backward filtering.
...
...
@@ -133,11 +133,11 @@ static void do_hybrid_window(RA288Context *ractx,
int
i
;
float
buffer1
[
MAX_BACKWARD_FILTER_ORDER
+
1
];
float
buffer2
[
MAX_BACKWARD_FILTER_ORDER
+
1
];
LOCAL_ALIGNED
_16
(
float
,
work
,
[
FFALIGN
(
MAX_BACKWARD_FILTER_ORDER
+
MAX_BACKWARD_FILTER_LEN
+
MAX_BACKWARD_FILTER_NONREC
,
8
)]);
LOCAL_ALIGNED
(
32
,
float
,
work
,
[
FFALIGN
(
MAX_BACKWARD_FILTER_ORDER
+
MAX_BACKWARD_FILTER_LEN
+
MAX_BACKWARD_FILTER_NONREC
,
16
)]);
ractx
->
dsp
.
vector_fmul
(
work
,
window
,
hist
,
FFALIGN
(
order
+
n
+
non_rec
,
8
));
ractx
->
dsp
.
vector_fmul
(
work
,
window
,
hist
,
FFALIGN
(
order
+
n
+
non_rec
,
16
));
convolve
(
buffer1
,
work
+
order
,
n
,
order
);
convolve
(
buffer2
,
work
+
order
+
n
,
non_rec
,
order
);
...
...
@@ -164,7 +164,7 @@ static void backward_filter(RA288Context *ractx,
do_hybrid_window
(
ractx
,
order
,
n
,
non_rec
,
temp
,
hist
,
rec
,
window
);
if
(
!
compute_lpc_coefs
(
temp
,
order
,
lpc
,
0
,
1
,
1
))
ractx
->
dsp
.
vector_fmul
(
lpc
,
lpc
,
tab
,
FFALIGN
(
order
,
8
));
ractx
->
dsp
.
vector_fmul
(
lpc
,
lpc
,
tab
,
FFALIGN
(
order
,
16
));
memmove
(
hist
,
hist
+
n
,
move_size
*
sizeof
(
*
hist
));
}
...
...
libavcodec/ra288.h
View file @
5ff01259
...
...
@@ -97,7 +97,7 @@ static const int16_t codetable[128][5]={
{
3746
,
-
606
,
53
,
-
269
,
-
3301
},
{
606
,
2018
,
-
1316
,
4064
,
398
}
};
DECLARE_ALIGNED
(
16
,
static
const
float
,
syn_window
)[
FFALIGN
(
111
,
8
)]
=
{
DECLARE_ALIGNED
(
32
,
static
const
float
,
syn_window
)[
FFALIGN
(
111
,
16
)]
=
{
0
.
576690972
,
0
.
580838025
,
0
.
585013986
,
0
.
589219987
,
0
.
59345597
,
0
.
597723007
,
0
.
602020264
,
0
.
606384277
,
0
.
610748291
,
0
.
615142822
,
0
.
619598389
,
0
.
624084473
,
0
.
628570557
,
0
.
633117676
,
0
.
637695313
,
0
.
642272949
,
0
.
646911621
,
0
.
651580811
,
...
...
@@ -119,7 +119,7 @@ DECLARE_ALIGNED(16, static const float, syn_window)[FFALIGN(111, 8)]={
0
.
142852783
,
0
.
0954284668
,
0
.
04776000
98
};
DECLARE_ALIGNED
(
16
,
static
const
float
,
gain_window
)[
FFALIGN
(
38
,
8
)]
=
{
DECLARE_ALIGNED
(
32
,
static
const
float
,
gain_window
)[
FFALIGN
(
38
,
16
)]
=
{
0
.
505699992
,
0
.
524200022
,
0
.
54339999
,
0
.
563300014
,
0
.
583953857
,
0
.
60534668
,
0
.
627502441
,
0
.
650482178
,
0
.
674316406
,
0
.
699005127
,
0
.
724578857
,
0
.
75112915
,
0
.
778625488
,
0
.
807128906
,
0
.
836669922
,
0
.
86730957
,
0
.
899078369
,
0
.
932006836
,
...
...
@@ -130,7 +130,7 @@ DECLARE_ALIGNED(16, static const float, gain_window)[FFALIGN(38, 8)]={
};
/** synthesis bandwidth broadening table */
DECLARE_ALIGNED
(
16
,
static
const
float
,
syn_bw_tab
)[
FFALIGN
(
36
,
8
)]
=
{
DECLARE_ALIGNED
(
32
,
static
const
float
,
syn_bw_tab
)[
FFALIGN
(
36
,
16
)]
=
{
0
.
98828125
,
0
.
976699829
,
0
.
965254128
,
0
.
953942537
,
0
.
942763507
,
0
.
931715488
,
0
.
920796931
,
0
.
910006344
,
0
.
899342179
,
0
.
888803005
,
0
.
878387332
,
0
.
868093729
,
0
.
857920766
,
0
.
847867012
,
0
.
837931097
,
0
.
828111589
,
0
.
818407178
,
0
.
808816493
,
...
...
@@ -140,7 +140,7 @@ DECLARE_ALIGNED(16, static const float, syn_bw_tab)[FFALIGN(36, 8)] = {
};
/** gain bandwidth broadening table */
DECLARE_ALIGNED
(
16
,
static
const
float
,
gain_bw_tab
)[
FFALIGN
(
10
,
8
)]
=
{
DECLARE_ALIGNED
(
32
,
static
const
float
,
gain_bw_tab
)[
FFALIGN
(
10
,
16
)]
=
{
0
.
90625
,
0
.
821289063
,
0
.
74432373
,
0
.
674499512
,
0
.
61126709
,
0
.
553955078
,
0
.
50201416
,
0
.
454956055
,
0
.
41229248
,
0
.
373657227
};
...
...
libavcodec/sbr.h
View file @
5ff01259
...
...
@@ -78,8 +78,8 @@ typedef struct {
* @name State variables
* @{
*/
DECLARE_ALIGNED
(
16
,
float
,
synthesis_filterbank_samples
)[
SBR_SYNTHESIS_BUF_SIZE
];
DECLARE_ALIGNED
(
16
,
float
,
analysis_filterbank_samples
)
[
1312
];
DECLARE_ALIGNED
(
32
,
float
,
synthesis_filterbank_samples
)[
SBR_SYNTHESIS_BUF_SIZE
];
DECLARE_ALIGNED
(
32
,
float
,
analysis_filterbank_samples
)
[
1312
];
int
synthesis_filterbank_samples_offset
;
///l_APrev and l_A
int
e_a
[
2
];
...
...
libavcodec/sinewin.h
View file @
5ff01259
...
...
@@ -31,7 +31,7 @@
#endif
#define SINETABLE(size) \
SINETABLE_CONST DECLARE_ALIGNED(
16
, float, ff_sine_##size)[size]
SINETABLE_CONST DECLARE_ALIGNED(
32
, float, ff_sine_##size)[size]
/**
* Generate a sine window.
...
...
libavcodec/x86/dsputil_mmx.c
View file @
5ff01259
...
...
@@ -2348,135 +2348,6 @@ static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2],
}
}
/**
 * Multiply two float vectors element by element using 3DNow!:
 * dst[k] = src0[k] * src1[k].
 *
 * The loop starts at the last group of four floats and moves backwards by
 * 16 bytes per pass until the byte offset becomes negative. The body runs
 * once before the first test, so len is expected to be a positive multiple
 * of 4.
 *
 * @param dst  destination vector
 * @param src0 first factor
 * @param src1 second factor
 * @param len  number of floats to process
 */
static void vector_fmul_3dnow(float *dst, const float *src0,
                              const float *src1, int len)
{
    /* byte offset of the final four-float group */
    x86_reg off = (len - 4) * 4;

    __asm__ volatile (
        "1:                                \n\t"
        "movq    (%[a], %[off]), %%mm0     \n\t"
        "movq   8(%[a], %[off]), %%mm1     \n\t"
        "pfmul   (%[b], %[off]), %%mm0     \n\t"
        "pfmul  8(%[b], %[off]), %%mm1     \n\t"
        "movq   %%mm0,  (%[out], %[off])   \n\t"
        "movq   %%mm1, 8(%[out], %[off])   \n\t"
        "sub    $16, %[off]                \n\t"
        "jge    1b                         \n\t"
        /* leave MMX/3DNow! state once the loop is done */
        "femms                             \n\t"
        : [off] "+r"(off)
        : [out] "r"(dst), [a] "r"(src0), [b] "r"(src1)
        : "memory"
    );
}
/**
 * Multiply two float vectors element by element using SSE:
 * dst[k] = src0[k] * src1[k].
 *
 * Eight floats (two XMM registers, 32 bytes) are handled per pass, walking
 * from the end of the arrays towards the start. All accesses use movaps or
 * a memory-operand mulps, so every pointer must be 16-byte aligned. The
 * body runs once before the first test, so len is expected to be a positive
 * multiple of 8.
 *
 * @param dst  destination vector
 * @param src0 first factor
 * @param src1 second factor
 * @param len  number of floats to process
 */
static void vector_fmul_sse(float *dst, const float *src0,
                            const float *src1, int len)
{
    /* byte offset of the final eight-float group */
    x86_reg off = (len - 8) * 4;

    __asm__ volatile (
        "1:                                   \n\t"
        "movaps    (%[a], %[off]), %%xmm0     \n\t"
        "movaps  16(%[a], %[off]), %%xmm1     \n\t"
        "mulps     (%[b], %[off]), %%xmm0     \n\t"
        "mulps   16(%[b], %[off]), %%xmm1     \n\t"
        "movaps  %%xmm0,   (%[out], %[off])   \n\t"
        "movaps  %%xmm1, 16(%[out], %[off])   \n\t"
        "sub     $32, %[off]                  \n\t"
        "jge     1b                           \n\t"
        : [off] "+r"(off)
        : [out] "r"(dst), [a] "r"(src0), [b] "r"(src1)
        : "memory"
    );
}
/**
 * Multiply src0 by src1 read in reverse order, using 3DNow! extensions:
 * dst[k] = src0[k] * src1[len - 1 - k].
 *
 * src1 is read forwards 16 bytes at a time, with pswapd swapping the two
 * floats of each quadword, while dst/src0 are walked backwards from the last
 * group of four floats. The body runs once before the first test, so len is
 * expected to be a positive multiple of 4.
 *
 * @param dst  destination vector
 * @param src0 factor read in natural order
 * @param src1 factor read in reversed order
 * @param len  number of floats to process
 */
static void vector_fmul_reverse_3dnow2(float *dst, const float *src0,
                                       const float *src1, int len)
{
    /* byte offset of the final four-float group in dst/src0 */
    x86_reg i = len * 4 - 16;

    __asm__ volatile (
        "1:                         \n\t"
        "pswapd  8(%1), %%mm0       \n\t"
        "pswapd   (%1), %%mm1       \n\t"
        "pfmul   (%3, %0), %%mm0    \n\t"
        "pfmul  8(%3, %0), %%mm1    \n\t"
        "movq   %%mm0,  (%2, %0)    \n\t"
        "movq   %%mm1, 8(%2, %0)    \n\t"
        "add    $16, %1             \n\t"
        "sub    $16, %0             \n\t"
        "jge    1b                  \n\t"
        : "+r"(i), "+r"(src1)
        : "r"(dst), "r"(src0)
        /* the asm stores through dst, which is passed only as an input
         * register; the clobber tells the compiler memory was modified */
        : "memory"
    );
    /* leave MMX/3DNow! state */
    __asm__ volatile ("femms");
}
/**
 * Multiply src0 by src1 read in reverse order, using SSE:
 * dst[k] = src0[k] * src1[len - 1 - k].
 *
 * src1 is read forwards 32 bytes at a time and each XMM register is
 * reversed with shufps $0x1b, while dst/src0 are walked backwards from the
 * last group of eight floats. movaps and the memory-operand mulps require
 * 16-byte aligned pointers. The body runs once before the first test, so
 * len is expected to be a positive multiple of 8.
 *
 * @param dst  destination vector
 * @param src0 factor read in natural order
 * @param src1 factor read in reversed order
 * @param len  number of floats to process
 */
static void vector_fmul_reverse_sse(float *dst, const float *src0,
                                    const float *src1, int len)
{
    /* byte offset of the final eight-float group in dst/src0 */
    x86_reg i = len * 4 - 32;

    __asm__ volatile (
        "1:                              \n\t"
        "movaps  16(%1), %%xmm0          \n\t"
        "movaps    (%1), %%xmm1          \n\t"
        "shufps  $0x1b, %%xmm0, %%xmm0   \n\t"
        "shufps  $0x1b, %%xmm1, %%xmm1   \n\t"
        "mulps     (%3, %0), %%xmm0      \n\t"
        "mulps   16(%3, %0), %%xmm1      \n\t"
        "movaps  %%xmm0,   (%2, %0)      \n\t"
        "movaps  %%xmm1, 16(%2, %0)      \n\t"
        "add     $32, %1                 \n\t"
        "sub     $32, %0                 \n\t"
        "jge     1b                      \n\t"
        : "+r"(i), "+r"(src1)
        : "r"(dst), "r"(src0)
        /* the asm stores through dst, which is passed only as an input
         * register; the clobber tells the compiler memory was modified */
        : "memory"
    );
}
/**
 * Multiply-add over float vectors using 3DNow!:
 * dst[k] = src0[k] * src1[k] + src2[k].
 *
 * Four floats (16 bytes) are handled per pass, walking from the end of the
 * arrays towards the start. The body runs once before the first test, so
 * len is expected to be a positive multiple of 4.
 *
 * @param dst  destination vector
 * @param src0 first factor
 * @param src1 second factor
 * @param src2 addend
 * @param len  number of floats to process
 */
static void vector_fmul_add_3dnow(float *dst, const float *src0,
                                  const float *src1, const float *src2,
                                  int len)
{
    /* byte offset of the final four-float group */
    x86_reg off = (len - 4) * 4;

    __asm__ volatile (
        "1:                                \n\t"
        "movq    (%[a], %[off]), %%mm0     \n\t"
        "movq   8(%[a], %[off]), %%mm1     \n\t"
        "pfmul   (%[b], %[off]), %%mm0     \n\t"
        "pfmul  8(%[b], %[off]), %%mm1     \n\t"
        "pfadd   (%[c], %[off]), %%mm0     \n\t"
        "pfadd  8(%[c], %[off]), %%mm1     \n\t"
        "movq   %%mm0,  (%[out], %[off])   \n\t"
        "movq   %%mm1, 8(%[out], %[off])   \n\t"
        "sub    $16, %[off]                \n\t"
        "jge    1b                         \n\t"
        : [off] "+r"(off)
        : [out] "r"(dst), [a] "r"(src0), [b] "r"(src1), [c] "r"(src2)
        : "memory"
    );
    /* leave MMX/3DNow! state */
    __asm__ volatile ("femms");
}
/**
 * Multiply-add over float vectors using SSE:
 * dst[k] = src0[k] * src1[k] + src2[k].
 *
 * Eight floats (two XMM registers, 32 bytes) are handled per pass, walking
 * from the end of the arrays towards the start. movaps and the
 * memory-operand mulps/addps require 16-byte aligned pointers. The body
 * runs once before the first test, so len is expected to be a positive
 * multiple of 8.
 *
 * @param dst  destination vector
 * @param src0 first factor
 * @param src1 second factor
 * @param src2 addend
 * @param len  number of floats to process
 */
static void vector_fmul_add_sse(float *dst, const float *src0,
                                const float *src1, const float *src2,
                                int len)
{
    /* byte offset of the final eight-float group */
    x86_reg off = (len - 8) * 4;

    __asm__ volatile (
        "1:                                   \n\t"
        "movaps    (%[a], %[off]), %%xmm0     \n\t"
        "movaps  16(%[a], %[off]), %%xmm1     \n\t"
        "mulps     (%[b], %[off]), %%xmm0     \n\t"
        "mulps   16(%[b], %[off]), %%xmm1     \n\t"
        "addps     (%[c], %[off]), %%xmm0     \n\t"
        "addps   16(%[c], %[off]), %%xmm1     \n\t"
        "movaps  %%xmm0,   (%[out], %[off])   \n\t"
        "movaps  %%xmm1, 16(%[out], %[off])   \n\t"
        "sub     $32, %[off]                  \n\t"
        "jge     1b                           \n\t"
        : [off] "+r"(off)
        : [out] "r"(dst), [a] "r"(src0), [b] "r"(src1), [c] "r"(src2)
        : "memory"
    );
}
#if HAVE_6REGS
static
void
vector_fmul_window_3dnow2
(
float
*
dst
,
const
float
*
src0
,
const
float
*
src1
,
const
float
*
win
,
...
...
@@ -2631,6 +2502,21 @@ int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
float
ff_scalarproduct_float_sse
(
const
float
*
v1
,
const
float
*
v2
,
int
order
);
void
ff_vector_fmul_sse
(
float
*
dst
,
const
float
*
src0
,
const
float
*
src1
,
int
len
);
void
ff_vector_fmul_avx
(
float
*
dst
,
const
float
*
src0
,
const
float
*
src1
,
int
len
);
void
ff_vector_fmul_reverse_sse
(
float
*
dst
,
const
float
*
src0
,
const
float
*
src1
,
int
len
);
void
ff_vector_fmul_reverse_avx
(
float
*
dst
,
const
float
*
src0
,
const
float
*
src1
,
int
len
);
void
ff_vector_fmul_add_sse
(
float
*
dst
,
const
float
*
src0
,
const
float
*
src1
,
const
float
*
src2
,
int
len
);
void
ff_vector_fmul_add_avx
(
float
*
dst
,
const
float
*
src0
,
const
float
*
src1
,
const
float
*
src2
,
int
len
);
void
ff_vector_clip_int32_mmx
(
int32_t
*
dst
,
const
int32_t
*
src
,
int32_t
min
,
int32_t
max
,
unsigned
int
len
);
void
ff_vector_clip_int32_sse2
(
int32_t
*
dst
,
const
int32_t
*
src
,
...
...
@@ -2918,8 +2804,6 @@ static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
#endif
c
->
vorbis_inverse_coupling
=
vorbis_inverse_coupling_3dnow
;
c
->
vector_fmul
=
vector_fmul_3dnow
;
c
->
vector_fmul_add
=
vector_fmul_add_3dnow
;
#if HAVE_7REGS
c
->
add_hfyu_median_prediction
=
add_hfyu_median_prediction_cmov
;
...
...
@@ -2929,7 +2813,6 @@ static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
static
void
dsputil_init_3dnow2
(
DSPContext
*
c
,
AVCodecContext
*
avctx
,
int
mm_flags
)
{
c
->
vector_fmul_reverse
=
vector_fmul_reverse_3dnow2
;
#if HAVE_6REGS
c
->
vector_fmul_window
=
vector_fmul_window_3dnow2
;
#endif
...
...
@@ -2949,11 +2832,11 @@ static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
c
->
vorbis_inverse_coupling
=
vorbis_inverse_coupling_sse
;
c
->
ac3_downmix
=
ac3_downmix_sse
;
c
->
vector_fmul
=
vector_fmul_sse
;
c
->
vector_fmul
_reverse
=
vector_fmul_reverse
_sse
;
if
(
!
(
mm_flags
&
AV_CPU_FLAG_3DNOW
))
c
->
vector_fmul_add
=
vector_fmul_add_sse
;
#if HAVE_YASM
c
->
vector_fmul
=
ff_vector_fmul
_sse
;
c
->
vector_fmul_reverse
=
ff_vector_fmul_reverse_sse
;
c
->
vector_fmul_add
=
ff_vector_fmul_add_sse
;
#endif
#if HAVE_6REGS
c
->
vector_fmul_window
=
vector_fmul_window_sse
;
...
...
@@ -3112,6 +2995,9 @@ static void dsputil_init_avx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
}
}
c
->
butterflies_float_interleave
=
ff_butterflies_float_interleave_avx
;
c
->
vector_fmul
=
ff_vector_fmul_avx
;
c
->
vector_fmul_reverse
=
ff_vector_fmul_reverse_avx
;
c
->
vector_fmul_add
=
ff_vector_fmul_add_avx
;
#endif
}
...
...
libavcodec/x86/dsputil_yasm.asm
View file @
5ff01259
...
...
@@ -1129,6 +1129,111 @@ VECTOR_CLIP_INT32 11, 1, 1, 0
VECTOR_CLIP_INT32
6
,
1
,
0
,
0
%endif
;-----------------------------------------------------------------------------
; void vector_fmul(float *dst, const float *src0, const float *src1, int len)
;
; dst[k] = src0[k] * src1[k]
; Two vector registers (2*mmsize bytes) are processed per iteration, walking
; from the end of the arrays towards the start.
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL 0
cglobal vector_fmul, 4,4,2, dst, src0, src1, len
    ; lenq = byte offset of the last 2*mmsize-byte group
    lea       lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop
    mova      m0, [src0q + lenq]
    mova      m1, [src0q + lenq + mmsize]
    mulps     m0, m0, [src1q + lenq]
    mulps     m1, m1, [src1q + lenq + mmsize]
    mova      [dstq + lenq], m0
    mova      [dstq + lenq + mmsize], m1

    ; step back one group; stop once the offset goes negative
    sub       lenq, 2*mmsize
    jge       .loop
%if mmsize == 32
    ; 32-byte (YMM) build: clear the upper register halves before returning
    vzeroupper
    RET
%else
    REP_RET
%endif
%endmacro

; instantiate the SSE (XMM) and AVX (YMM) versions
INIT_XMM sse
VECTOR_FMUL
INIT_YMM avx
VECTOR_FMUL
;-----------------------------------------------------------------------------
; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
;                          int len)
;
; dst[k] = src0[k] * src1[len - 1 - k]
; src1 is read forwards and reversed in registers, while dst/src0 are walked
; backwards, 2*mmsize bytes per iteration.
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL_REVERSE 0
cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
    ; lenq = byte offset of the last 2*mmsize-byte group in dst/src0
    lea       lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop
%if cpuflag(avx)
    ; load the two 16-byte halves swapped, then reverse the floats inside
    ; each 128-bit lane, giving a fully reversed 32-byte vector
    vmovaps     xmm0, [src1q + 16]
    vinsertf128 m0, m0, [src1q], 1
    vshufps     m0, m0, m0, q0123
    vmovaps     xmm1, [src1q + mmsize + 16]
    vinsertf128 m1, m1, [src1q + mmsize], 1
    vshufps     m1, m1, m1, q0123
%else
    mova        m0, [src1q]
    mova        m1, [src1q + mmsize]
    shufps      m0, m0, q0123
    shufps      m1, m1, q0123
%endif
    ; the first (reversed) src1 vector pairs with the upper half of the
    ; current dst/src0 group, the second with the lower half
    mulps       m0, m0, [src0q + lenq + mmsize]
    mulps       m1, m1, [src0q + lenq]
    mova        [dstq + lenq + mmsize], m0
    mova        [dstq + lenq], m1

    add         src1q, 2*mmsize
    sub         lenq,  2*mmsize
    jge         .loop
%if mmsize == 32
    ; 32-byte (YMM) build: clear the upper register halves before returning
    vzeroupper
    RET
%else
    REP_RET
%endif
%endmacro

; instantiate the SSE (XMM) and AVX (YMM) versions
INIT_XMM sse
VECTOR_FMUL_REVERSE
INIT_YMM avx
VECTOR_FMUL_REVERSE
;-----------------------------------------------------------------------------
; void vector_fmul_add(float *dst, const float *src0, const float *src1,
;                      const float *src2, int len)
;
; dst[k] = src0[k] * src1[k] + src2[k]
; Two vector registers (2*mmsize bytes) are processed per iteration, walking
; from the end of the arrays towards the start.
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL_ADD 0
cglobal vector_fmul_add, 5,5,2, dst, src0, src1, src2, len
    ; lenq = byte offset of the last 2*mmsize-byte group
    lea       lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop
    mova      m0, [src0q + lenq]
    mova      m1, [src0q + lenq + mmsize]
    mulps     m0, m0, [src1q + lenq]
    mulps     m1, m1, [src1q + lenq + mmsize]
    addps     m0, m0, [src2q + lenq]
    addps     m1, m1, [src2q + lenq + mmsize]
    mova      [dstq + lenq], m0
    mova      [dstq + lenq + mmsize], m1

    ; step back one group; stop once the offset goes negative
    sub       lenq, 2*mmsize
    jge       .loop
%if mmsize == 32
    ; 32-byte (YMM) build: clear the upper register halves before returning
    vzeroupper
    RET
%else
    REP_RET
%endif
%endmacro

; instantiate the SSE (XMM) and AVX (YMM) versions
INIT_XMM sse
VECTOR_FMUL_ADD
INIT_YMM avx
VECTOR_FMUL_ADD
;-----------------------------------------------------------------------------
; void ff_butterflies_float_interleave(float *dst, const float *src0,
; const float *src1, int len);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment