Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
F
ffmpeg.wasm-core
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Linshizhi
ffmpeg.wasm-core
Commits
82992604
Commit
82992604
authored
Jun 23, 2012
by
Mans Rullgard
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
x86: fft: convert sse inline asm to yasm
parent
8123e090
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
129 additions
and
121 deletions
+129
-121
Makefile
libavcodec/x86/Makefile
+0
-1
fft_mmx.asm
libavcodec/x86/fft_mmx.asm
+129
-10
fft_sse.c
libavcodec/x86/fft_sse.c
+0
-110
No files found.
libavcodec/x86/Makefile
View file @
82992604
...
...
@@ -39,7 +39,6 @@ YASM-OBJS-$(CONFIG_DCT) += x86/dct32_sse.o
YASM-OBJS-$(CONFIG_ENCODERS)
+=
x86/dsputilenc_yasm.o
YASM-OBJS-FFT-$(HAVE_AMD3DNOW)
+=
x86/fft_3dn.o
YASM-OBJS-FFT-$(HAVE_AMD3DNOWEXT)
+=
x86/fft_3dn2.o
YASM-OBJS-FFT-$(HAVE_SSE)
+=
x86/fft_sse.o
YASM-OBJS-$(CONFIG_FFT)
+=
x86/fft_mmx.o
\
$(YASM-OBJS-FFT-yes)
YASM-OBJS-$(CONFIG_H264CHROMA)
+=
x86/h264_chromamc.o
\
...
...
libavcodec/x86/fft_mmx.asm
View file @
82992604
...
...
@@ -45,6 +45,10 @@ struc FFTContext
.
mdctbits
:
resd
1
.
tcos
:
pointer
1
.
tsin
:
pointer
1
.
fftperm
:
pointer
1
.
fftcalc
:
pointer
1
.
imdctcalc
:
pointer
1
.
imdcthalf
:
pointer
1
endstruc
SECTION_RODATA
...
...
@@ -65,6 +69,7 @@ perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
; Dword index table: 0,1,2,3 followed by 1,0,2,3.
perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
; Four sign factors (+1, +1, -1, +1) followed by four copies of M_SQRT1_2.
ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
; Eight dwords that are either 0 or have only bit 31 set.
ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
; Four dwords with only bit 31 set. imdct_calc loads this into xmm2 and
; xorps-es it into a vector, which flips the sign bit of all four lanes.
ps_m1m1m1m1: times 4 dd 1<<31
; Two dwords: bit 31 set, then 0.
ps_m1p1: dd 1<<31, 0
%assign
i
16
...
...
@@ -532,6 +537,16 @@ DEFINE_ARGS z, w, n, o1, o3
rep
ret
%endmacro
; FFT_DISPATCH table_suffix, index_reg
; Calls the routine stored in slot (index - 2) of dispatch_tab<table_suffix>.
;   %1  text appended to "dispatch_tab" to pick the table
;   %2  register name; its qword form (%2q) supplies the index
; r2 is overwritten with the call target. Under PIC, r3 is overwritten with
; the section start ($$), which is added to the loaded entry before the call
; (presumably the table then holds section-relative offsets -- confirm
; against the dispatch_tab definition).
%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
    lea r2, [dispatch_tab%1]
    mov r2, [r2 + (%2q-2)*gprsize]      ; entries are gprsize bytes, index is biased by 2
%ifdef PIC
    lea r3, [$$]
    add r2, r3
%endif
    call r2
%endmacro ; FFT_DISPATCH
INIT_YMM
avx
%if
HAVE_AVX
...
...
@@ -548,6 +563,14 @@ INIT_YMM avx
DECL_PASS
pass_avx
,
PASS_BIG
1
DECL_PASS
pass_interleave_avx
,
PASS_BIG
0
; fft_calc (AVX build of the function)
; In:  r0 = FFTContext pointer (only .nbits is read)
;      r1 = data pointer
; Rearranges the arguments to (data, nbits) and calls the "_interleave"
; dispatch table for the current SUFFIX. No post-processing is done here.
cglobal fft_calc, 2,5,8
    mov     r3d, [r0 + FFTContext.nbits]
    mov     r0, r1                      ; r0 = data pointer for the dispatched routine
    mov     r1, r3                      ; r1 = nbits, used as the table index
    FFT_DISPATCH _interleave %+ SUFFIX, r1
    REP_RET
%endif
INIT_XMM
sse
...
...
@@ -565,6 +588,112 @@ INIT_XMM sse
DECL_PASS
pass_sse
,
PASS_BIG
1
DECL_PASS
pass_interleave_sse
,
PASS_BIG
0
;-----------------------------------------------------------------------
; fft_calc (SSE build of the function)
; In:  r0 = FFTContext pointer (only .nbits is read)
;      r1 = data pointer z
; Calls the "_interleave" dispatch table, then, for nbits <= 4 only,
; walks the buffer in 32-byte steps and interleaves each pair of 16-byte
; vectors with unpcklps/unpckhps.
; FFT_DISPATCH clobbers GPRs, so z and nbits are kept on the stack
; across the call.
;-----------------------------------------------------------------------
cglobal fft_calc, 2,5,8
    mov     r3d, [r0 + FFTContext.nbits]
    PUSH    r1                          ; save z
    PUSH    r3                          ; save nbits
    mov     r0, r1
    mov     r1, r3
    FFT_DISPATCH _interleave %+ SUFFIX, r1
    POP     rcx                         ; rcx = nbits (shift count must be in cl)
    POP     r4                          ; r4  = z
    cmp     rcx, 4
    jg      .end                        ; larger sizes need no fix-up here
    mov     r2, -1
    add     rcx, 3
    shl     r2, cl                      ; r2 = -(1 << (nbits + 3)), negative byte offset
    sub     r4, r2                      ; r4 = end of buffer, so r4 + r2 = z
.loop:
    movaps  xmm0, [r4 + r2]
    movaps  xmm1, xmm0
    unpcklps xmm0, [r4 + r2 + 16]
    unpckhps xmm1, [r4 + r2 + 16]
    movaps  [r4 + r2],      xmm0
    movaps  [r4 + r2 + 16], xmm1
    add     r2, 32
    jl      .loop                       ; run until the offset reaches zero
.end:
    REP_RET
cextern_naked memcpy

;-----------------------------------------------------------------------
; fft_permute
; In:  r0 = FFTContext pointer (.revtab, .tmpbuf and .nbits are read)
;      r1 = data pointer z
; For i = 0, 2, 4, ... < (1 << nbits): loads the 16 bytes at z + 8*i and
; stores the low 8 bytes to tmpbuf + 8*revtab[i] and the high 8 bytes to
; tmpbuf + 8*revtab[i+1] (revtab entries are 16-bit). Finally copies
; (1 << nbits) * 8 bytes from tmpbuf back to z with memcpy.
; NOTE(review): the memcpy reference is not PLT-qualified; confirm this is
; acceptable for PIC shared-object builds.
;-----------------------------------------------------------------------
cglobal fft_permute, 2,7,1
    mov     r4,  [r0 + FFTContext.revtab]
    mov     r5,  [r0 + FFTContext.tmpbuf]
    mov     ecx, [r0 + FFTContext.nbits]
    mov     r2, 1
    shl     r2, cl                      ; r2 = n = 1 << nbits
    xor     r0, r0                      ; r0 = i = 0
%if ARCH_X86_32
    mov     r1, r1m                     ; reload z from its argument slot (presumably r1
                                        ; aliases ecx on x86_32 -- confirm against x86inc)
%endif
.loop:
    movaps  xmm0, [r1 + 8*r0]           ; two consecutive 8-byte elements
    movzx   r6, word [r4 + 2*r0]        ; revtab[i]
    movzx   r3, word [r4 + 2*r0 + 2]    ; revtab[i + 1]
    movlps  [r5 + 8*r6], xmm0
    movhps  [r5 + 8*r3], xmm0
    add     r0, 2
    cmp     r0, r2
    jl      .loop
    shl     r2, 3                       ; element count -> byte count
%if ARCH_X86_64
    mov     r0, r1                      ; memcpy dst = z
    mov     r1, r5                      ; memcpy src = tmpbuf (r2 already holds the size)
%else
    push    r2                          ; size
    push    r5                          ; src
    push    r1                          ; dst
%endif
%if ARCH_X86_64 && WIN64 == 0
    jmp     memcpy                      ; tail call
%else
%if WIN64
    ; Win64 callers must provide 32 bytes of shadow space and a 16-byte
    ; aligned stack at the call. The extra 8 bytes assume rsp is 8 mod 16
    ; here, i.e. the cglobal prologue pushed an even number of registers
    ; -- TODO confirm against the x86inc version in use.
    sub     rsp, 8+32
%endif
    call    memcpy
%if WIN64
    add     rsp, 8+32
%endif
%if ARCH_X86_32
    add     esp, 12                     ; drop the three pushed arguments
%endif
    REP_RET
%endif
;-----------------------------------------------------------------------
; imdct_calc
; In:  r0 = FFTContext pointer (.mdctsize and .imdcthalf are read)
;      r1 = output pointer
;      r2 = input pointer
; Calls the context's imdcthalf routine with (context, output + mdctsize
; bytes, input), then fills the outer parts of the output from that
; middle part: every 16-byte vector is lane-reversed (shufps 0x1b), and
; the vectors written below the middle part also get all four sign bits
; flipped.
;-----------------------------------------------------------------------
cglobal imdct_calc, 3,5,3
    mov     r3d, [r0 + FFTContext.mdctsize]
    mov     r4,  [r0 + FFTContext.imdcthalf]
    add     r1,  r3                     ; r1 = output + mdctsize bytes
    PUSH    r3                          ; keep both across the call
    PUSH    r1
%if ARCH_X86_32
    push    r2                          ; stack arguments: input,
    push    r1                          ;   adjusted output,
    push    r0                          ;   context
%else
    ; 8 bytes balance the two PUSHes above for call-site alignment. On
    ; Win64 the callee also owns 32 bytes of shadow space above its return
    ; address, which must not overlap the saved r1/r3.
    sub     rsp, 8+32*WIN64
%endif
    call    r4
%if ARCH_X86_32
    add     esp, 12
%else
    add     rsp, 8+32*WIN64
%endif
    POP     r1
    POP     r3
    lea     r0, [r1 + 2*r3]             ; r0 = r1 + 2 * mdctsize bytes
    mov     r2, r3
    sub     r3, 16                      ; r3 = descending offset, starts at mdctsize - 16
    neg     r2                          ; r2 = ascending offset, starts at -mdctsize
    movaps  xmm2, [ps_m1m1m1m1]         ; sign-bit mask for all four lanes
.loop:
    movaps  xmm0, [r1 + r3]
    movaps  xmm1, [r0 + r2]
    shufps  xmm0, xmm0, 0x1b            ; reverse lane order
    shufps  xmm1, xmm1, 0x1b
    xorps   xmm0, xmm2                  ; negate
    movaps  [r0 + r3], xmm1
    movaps  [r1 + r2], xmm0
    sub     r3, 16
    add     r2, 16
    jl      .loop                       ; until the ascending offset reaches zero
    REP_RET
INIT_MMX
3
dnow
%define
mulps
pfmul
%define
addps
pfadd
...
...
@@ -582,16 +711,6 @@ DECL_PASS pass_interleave_3dnow, PASS_BIG 0
%define
SECTION_REL
%endif
; FFT_DISPATCH table_suffix, index_reg
; Calls the routine stored in slot (index - 2) of dispatch_tab<table_suffix>.
;   %1  text appended to "dispatch_tab" to pick the table
;   %2  register name; its qword form (%2q) supplies the index
; r2 is overwritten with the call target; under PIC, r3 is overwritten with
; the section start ($$), which is added to the loaded entry.
; NOTE(review): a macro with this name and parameter count also appears
; earlier in this listing -- confirm only one definition is meant to remain.
%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
    lea r2, [dispatch_tab%1]
    mov r2, [r2 + (%2q-2)*gprsize]
%ifdef PIC
    lea r3, [$$]
    add r2, r3
%endif
    call r2
%endmacro ; FFT_DISPATCH
%macro
DECL_FFT
1
-
2
; nbits, suffix
%ifidn
%0
,
1
%xdefine
fullsuffix
SUFFIX
...
...
libavcodec/x86/fft_sse.c
deleted
100644 → 0
View file @
8123e090
/*
* FFT/MDCT transform with SSE optimizations
* Copyright (c) 2008 Loren Merritt
*
* This file is part of Libav.
*
* Libav is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* Libav is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with Libav; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "fft.h"
#include "config.h"
/* Four words with only bit 31 set. ff_imdct_calc_sse() loads this into xmm7
 * and xorps-es it into a vector to flip the sign of all four lanes.
 * The leading 16 is presumably the required alignment -- confirm against
 * the DECLARE_ASM_CONST definition. */
DECLARE_ASM_CONST(16, unsigned int, ff_m1m1m1m1)[4] =
    { 1U << 31, 1U << 31, 1U << 31, 1U << 31 };
/* Dispatch entry points defined outside this file (presumably in the
 * accompanying assembly -- confirm). Each takes the data buffer and nbits;
 * callers in this file compute the transform size as 1 << nbits. */
void ff_fft_dispatch_sse(FFTComplex *z, int nbits);
void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits);
void ff_fft_dispatch_interleave_avx(FFTComplex *z, int nbits);
#if HAVE_AVX
/**
 * AVX FFT entry point: forwards the buffer and the context's nbits to the
 * interleaving AVX dispatcher.
 *
 * @param s  FFT context; only nbits is read
 * @param z  data buffer, transformed in place by the dispatcher
 */
void ff_fft_calc_avx(FFTContext *s, FFTComplex *z)
{
    const int nbits = s->nbits;

    ff_fft_dispatch_interleave_avx(z, nbits);
}
#endif
/**
 * SSE FFT entry point.
 *
 * Runs the interleaving SSE dispatcher and then, for sizes of at most 16
 * elements, walks the buffer in 32-byte steps and interleaves each pair of
 * 16-byte vectors with unpcklps/unpckhps.
 *
 * @param s  FFT context; only nbits is read
 * @param z  data buffer, modified in place
 */
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
{
    int n = 1 << s->nbits;

    ff_fft_dispatch_interleave_sse(z, s->nbits);

    if (n <= 16) {
        /* Negative byte offset from the end of the buffer; counts up to 0. */
        x86_reg i = -8 * n;
        __asm__ volatile (
            "1: \n"
            "movaps     (%0,%1), %%xmm0 \n"
            "movaps      %%xmm0, %%xmm1 \n"
            "unpcklps 16(%0,%1), %%xmm0 \n"
            "unpckhps 16(%0,%1), %%xmm1 \n"
            "movaps      %%xmm0,   (%0,%1) \n"
            "movaps      %%xmm1, 16(%0,%1) \n"
            "add $32, %0 \n"
            "jl 1b \n"
            : "+r"(i)
            : "r"(z + n)
            /* xmm0/xmm1 are overwritten above, so they are declared like
             * the clobbers in ff_imdct_calc_sse(). XMM_CLOBBERS is assumed
             * to come from x86_cpu.h next to XMM_CLOBBERS_ONLY -- confirm. */
            : XMM_CLOBBERS("%xmm0", "%xmm1",) "memory"
        );
    }
}
/**
 * Reorder the buffer according to the context's revtab.
 *
 * Elements are handled two at a time: z[i] and z[i + 1] are loaded as one
 * 16-byte vector, then stored to tmp_buf[revtab[i]] and
 * tmp_buf[revtab[i + 1]]. The result is copied back over z.
 *
 * @param s  FFT context; nbits, revtab and tmp_buf are used
 * @param z  data buffer, permuted in place
 */
void ff_fft_permute_sse(FFTContext *s, FFTComplex *z)
{
    int n = 1 << s->nbits;
    int i;

    for (i = 0; i < n; i += 2) {
        __asm__ volatile (
            "movaps %2, %%xmm0 \n"
            "movlps %%xmm0, %0 \n"
            "movhps %%xmm0, %1 \n"
            : "=m"(s->tmp_buf[s->revtab[i]]),
              "=m"(s->tmp_buf[s->revtab[i + 1]])
            : "m"(z[i])
            /* xmm0 is overwritten, so declare it as a clobber. */
            XMM_CLOBBERS_ONLY("%xmm0")
        );
    }
    memcpy(z, s->tmp_buf, n * sizeof(FFTComplex));
}
/**
 * Full inverse MDCT built on the context's imdct_half().
 *
 * imdct_half() writes its result starting at output + n/4. The loop then
 * fills the outer parts of the output from that middle part: each 16-byte
 * vector is lane-reversed, and the vectors written below the middle part
 * also get their signs flipped.
 *
 * @param s       FFT context; mdct_size and imdct_half are used
 * @param output  destination buffer
 * @param input   source coefficients, passed through to imdct_half()
 */
void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
{
    x86_reg j, k;
    long n  = s->mdct_size;
    long n4 = n >> 2;

    s->imdct_half(s, output + n4, input);

    j = -n;         /* ascending byte offset; the loop ends when it reaches 0 */
    k = n - 16;     /* descending byte offset */
    __asm__ volatile (
        "movaps "MANGLE(ff_m1m1m1m1)", %%xmm7 \n"
        "1: \n"
        "movaps       (%2,%1), %%xmm0 \n"
        "movaps       (%3,%0), %%xmm1 \n"
        "shufps $0x1b, %%xmm0, %%xmm0 \n"
        "shufps $0x1b, %%xmm1, %%xmm1 \n"
        "xorps         %%xmm7, %%xmm0 \n"
        "movaps        %%xmm1, (%3,%1) \n"
        "movaps        %%xmm0, (%2,%0) \n"
        "sub $16, %1 \n"
        "add $16, %0 \n"
        "jl 1b \n"
        : "+r"(j), "+r"(k)
        : "r"(output + n4), "r"(output + n4 * 3)
        /* The asm stores to output through register operands only, so the
         * compiler must be told memory changes. XMM_CLOBBERS is assumed to
         * come from x86_cpu.h next to XMM_CLOBBERS_ONLY -- confirm. */
        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm7",) "memory"
    );
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment