Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
F
ffmpeg.wasm-core
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Linshizhi
ffmpeg.wasm-core
Commits
e229df94
Commit
e229df94
authored
Jun 19, 2017
by
James Almer
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
x86/aacpsdsp: add ff_ps_hybrid_synthesis_deint_{sse,sse4}
About 2x faster than the c version.
parent
3c5a53cd
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
140 additions
and
6 deletions
+140
-6
aacpsdsp.asm
libavcodec/x86/aacpsdsp.asm
+123
-0
aacpsdsp_init.c
libavcodec/x86/aacpsdsp_init.c
+8
-0
x86util.asm
libavutil/x86/x86util.asm
+9
-6
No files found.
libavcodec/x86/aacpsdsp.asm
View file @
e229df94
...
...
@@ -166,6 +166,129 @@ align 16
jl
.
loop
REP_RET
;*******************************************************************
;void ff_ps_hybrid_synthesis_deint_<opt>(float out[2][38][64],
;                                        float (*in)[32][2],
;                                        int i, int len)
;
; For every column k in [i, 64) and every row n in [0, 32):
;     out[0][n][k] = in[k][n][0]
;     out[1][n][k] = in[k][n][1]
;
; The len argument is never read: its register is overwritten with
; 32 << 3 = 256, which is both sizeof(in[0]) (32*2 floats) and
; sizeof(out[0][0]) (64 floats), so lenq serves as the row stride for
; the loads and for the stores. Each inner loop starts tmpd at 256 and
; subtracts mmsize per pass while handling two rows n, n+1 per pass,
; i.e. 32 rows in total.
;
; Columns are consumed 4, 2 or 1 at a time. The remaining count
; (64 - i) is tested so that a single column and/or a column pair are
; peeled off first - presumably so the movaps stores of the 4-column
; loop land on 16-byte boundaries (assumes in and out themselves are
; 16-byte aligned; confirm against the callers).
;*******************************************************************
%macro HYBRID_SYNTHESIS_DEINT 0
cglobal ps_hybrid_synthesis_deint, 3, 7, 5, out, in, i, len, out0, out1, tmp
%if cpuflag(sse4)
%define MOVH movsd
%else
%define MOVH movlps
%endif
    movsxdifnidn        iq, id
    mov               tmpd, id
    shl               tmpd, 8                   ; i * 256 = byte offset of in[i]
    lea               outq, [outq+iq*4]         ; outq = &out[0][0][i]
    add                inq, tmpq                ; inq  = &in[i]
    mov               lend, 32 << 3             ; row stride in bytes (see header)
    neg                 id
    add                 id, 64                  ; id = 64 - i = columns left

    test                id, 1
    jne .col1                                   ; odd count: peel one column
    test                id, 2
    jne .col2                                   ; count % 4 == 2: peel a pair

; ---- four columns per outer pass ----
align 16
.col4:
    lea              out1q, [outq+38*64*4]      ; same position inside out[1]
    mov              out0q, outq
    mov               tmpd, lend

.col4_rows:
    ; m0..m3 = in[k+0..3][n..n+1][0..1], one register per column
    movaps              m0, [inq]
    movaps              m1, [inq+lenq]
    movaps              m2, [inq+lenq*2]
    movaps              m3, [inq+3*32*2*4]
    add                inq, mmsize
    ; presumably a 4x4 float transpose: one register per (row, component)
    TRANSPOSE4x4PS       0, 1, 2, 3, 4
    movaps         [out0q], m0                  ; out[0][n  ][k..k+3]
    movaps         [out1q], m1                  ; out[1][n  ][k..k+3]
    movaps    [out0q+lenq], m2                  ; out[0][n+1][k..k+3]
    movaps    [out1q+lenq], m3                  ; out[1][n+1][k..k+3]
    lea              out0q, [out0q+lenq*2]
    lea              out1q, [out1q+lenq*2]
    sub               tmpd, mmsize
    jg .col4_rows

    ; the inner loop moved inq by one input row; skip the other three
    add                inq, 3*32*2*4
    add               outq, 16
    sub                 id, 4
    jg .col4
    RET

; ---- two columns, then continue with groups of four ----
align 16
.col2:
    lea              out1q, [outq+38*64*4]
    mov              out0q, outq
    mov               tmpd, lend

.col2_rows:
    movaps              m0, [inq]               ; in[k  ][n..n+1][0..1]
    movaps              m1, [inq+lenq]          ; in[k+1][n..n+1][0..1]
    add                inq, mmsize
    ; SBUTTERFLYPS is defined outside this view; together with
    ; SBUTTERFLYPD it is expected to leave
    ;   m0 = [0] components of rows n, n+1 and m1 = the [1] components,
    ; each as (k, k+1) pairs - confirm against x86util.asm
    SBUTTERFLYPS         0, 1, 2
    SBUTTERFLYPD         0, 1, 2
    MOVH           [out0q], m0                  ; low 8 bytes  -> row n
    MOVH           [out1q], m1
    movhps    [out0q+lenq], m0                  ; high 8 bytes -> row n+1
    movhps    [out1q+lenq], m1
    lea              out0q, [out0q+lenq*2]
    lea              out1q, [out1q+lenq*2]
    sub               tmpd, mmsize
    jg .col2_rows

    add                inq, lenq                ; skip the second input row
    add               outq, 8
    sub                 id, 2
    jg .col4
    RET

; ---- one column, then a pair and/or groups of four ----
align 16
.col1:
    lea              out1q, [outq+38*64*4]
    mov              out0q, outq
    mov               tmpd, lend

.col1_rows:
    movaps              m0, [inq]               ; in[k][n][0], [n][1], [n+1][0], [n+1][1]
    add                inq, mmsize
    movss          [out0q], m0                  ; lane 0 -> out[0][n][k]
%if cpuflag(sse4)
    extractps      [out1q], m0, 1               ; lane 1 -> out[1][n  ][k]
    extractps [out0q+lenq], m0, 2               ; lane 2 -> out[0][n+1][k]
    extractps [out1q+lenq], m0, 3               ; lane 3 -> out[1][n+1][k]
%else
    movhlps             m1, m0                  ; lane 2 into the low lane of m1
    movss     [out0q+lenq], m1                  ; out[0][n+1][k]
    shufps              m0, m0, 0xb1            ; swap lanes 0<->1 and 2<->3
    movss          [out1q], m0                  ; former lane 1 -> out[1][n][k]
    movhlps             m1, m0                  ; former lane 3 into the low lane
    movss     [out1q+lenq], m1                  ; out[1][n+1][k]
%endif
    lea              out0q, [out0q+lenq*2]
    lea              out1q, [out1q+lenq*2]
    sub               tmpd, mmsize
    jg .col1_rows

    ; inq already points at the next input row, nothing to skip
    add               outq, 4
    sub                 id, 1
    test                id, 2
    jne .col2
    cmp                 id, 4
    jge .col4
    RET
%endmacro

INIT_XMM sse
HYBRID_SYNTHESIS_DEINT
INIT_XMM sse4
HYBRID_SYNTHESIS_DEINT
;*******************************************************************
;void ff_ps_hybrid_analysis_<opt>(float (*out)[2], float (*in)[2],
; const float (*filter)[8][2],
...
...
libavcodec/x86/aacpsdsp_init.c
View file @
e229df94
...
...
@@ -40,6 +40,10 @@ void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
void
ff_ps_stereo_interpolate_ipdopd_sse3
(
float
(
*
l
)[
2
],
float
(
*
r
)[
2
],
float
h
[
2
][
4
],
float
h_step
[
2
][
4
],
int
len
);
/* Assembly implementations of PSDSPContext.hybrid_synthesis_deint,
 * generated by the HYBRID_SYNTHESIS_DEINT macro in aacpsdsp.asm. */
void ff_ps_hybrid_synthesis_deint_sse(float out[2][38][64],
                                      float (*in)[32][2],
                                      int i, int len);
void ff_ps_hybrid_synthesis_deint_sse4(float out[2][38][64],
                                       float (*in)[32][2],
                                       int i, int len);
av_cold
void
ff_psdsp_init_x86
(
PSDSPContext
*
s
)
{
...
...
@@ -48,6 +52,7 @@ av_cold void ff_psdsp_init_x86(PSDSPContext *s)
if
(
EXTERNAL_SSE
(
cpu_flags
))
{
s
->
add_squares
=
ff_ps_add_squares_sse
;
s
->
mul_pair_single
=
ff_ps_mul_pair_single_sse
;
s
->
hybrid_synthesis_deint
=
ff_ps_hybrid_synthesis_deint_sse
;
s
->
hybrid_analysis
=
ff_ps_hybrid_analysis_sse
;
}
if
(
EXTERNAL_SSE3
(
cpu_flags
))
{
...
...
@@ -56,4 +61,7 @@ av_cold void ff_psdsp_init_x86(PSDSPContext *s)
s
->
stereo_interpolate
[
1
]
=
ff_ps_stereo_interpolate_ipdopd_sse3
;
s
->
hybrid_analysis
=
ff_ps_hybrid_analysis_sse3
;
}
if
(
EXTERNAL_SSE4
(
cpu_flags
))
{
s
->
hybrid_synthesis_deint
=
ff_ps_hybrid_synthesis_deint_sse4
;
}
}
libavutil/x86/x86util.asm
View file @
e229df94
...
...
@@ -71,6 +71,12 @@
SWAP
%1
,
%3
,
%2
%endmacro
; SBUTTERFLYPD a, b, tmp
; Regroups the 64-bit halves of two registers:
;   m<a> <- { low  half of a, low  half of b }
;   m<b> <- { high half of a, high half of b }
; The result for <a> is built in m<tmp> and then renamed with SWAP, so
; afterwards the name <tmp> refers to the register holding the original,
; unmodified a. The two moves must stay in this order: the first one
; reads b before the second one overwrites its low half.
; Uses the three-operand instruction forms (assumed to be provided by
; x86inc's emulation for non-AVX targets - confirm).
%macro SBUTTERFLYPD 3
    movlhps m%3, m%1, m%2
    movhlps m%2, m%2, m%1
    SWAP %1, %3
%endmacro
%macro
TRANSPOSE4x4B
5
SBUTTERFLY
bw
,
%1
,
%2
,
%5
SBUTTERFLY
bw
,
%3
,
%4
,
%5
...
...
@@ -117,12 +123,9 @@
%macro
TRANSPOSE4x4PS
5
SBUTTERFLYPS
%1
,
%2
,
%5
SBUTTERFLYPS
%3
,
%4
,
%5
movlhps
m%5
,
m%1
,
m%3
movhlps
m%3
,
m%1
SWAP
%5
,
%1
movlhps
m%5
,
m%2
,
m%4
movhlps
m%4
,
m%2
SWAP
%5
,
%2
,
%3
SBUTTERFLYPD
%1
,
%3
,
%5
SBUTTERFLYPD
%2
,
%4
,
%5
SWAP
%2
,
%3
%endmacro
%macro
TRANSPOSE8x4D
9
-
11
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment