Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
F
ffmpeg.wasm-core
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Linshizhi
ffmpeg.wasm-core
Commits
847bb638
Commit
847bb638
authored
Jun 29, 2014
by
Ronald S. Bultje
Committed by
Michael Niedermayer
Jun 30, 2014
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
swr: convert resample_common/linear_int16_mmx2/sse2 to yasm.
Signed-off-by:
Michael Niedermayer
<
michaelni@gmx.at
>
parent
e5c806fd
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
187 additions
and
215 deletions
+187
-215
resample_template.c
libswresample/resample_template.c
+4
-19
resample.asm
libswresample/x86/resample.asm
+166
-59
resample_mmx.h
libswresample/x86/resample_mmx.h
+0
-110
resample_x86_dsp.c
libswresample/x86/resample_x86_dsp.c
+17
-27
No files found.
libswresample/resample_template.c
View file @
847bb638
...
...
@@ -44,17 +44,15 @@
#elif defined(TEMPLATE_RESAMPLE_FLT)
# define RENAME(N) N ## _float
# define FILTER_SHIFT 0
# define DELEM float
# define FELEM float
# define FELEM2 float
# define OUT(d, v) d = v
# if defined(TEMPLATE_RESAMPLE_FLT)
# define RENAME(N) N ## _float
# endif
#elif defined(TEMPLATE_RESAMPLE_S32)
# define RENAME(N) N ## _int32
# define FILTER_SHIFT 30
# define DELEM int32_t
...
...
@@ -65,10 +63,9 @@
# define OUT(d, v) v = (v + (1<<(FILTER_SHIFT-1)))>>FILTER_SHIFT;\
d = (uint64_t)(v + 0x80000000) > 0xFFFFFFFF ? (v>>63) ^ 0x7FFFFFFF : v
#elif defined(TEMPLATE_RESAMPLE_S16) \
|| defined(TEMPLATE_RESAMPLE_S16_MMX2) \
|| defined(TEMPLATE_RESAMPLE_S16_SSE2)
#elif defined(TEMPLATE_RESAMPLE_S16)
# define RENAME(N) N ## _int16
# define FILTER_SHIFT 15
# define DELEM int16_t
# define FELEM int16_t
...
...
@@ -79,18 +76,6 @@
# define OUT(d, v) v = (v + (1<<(FILTER_SHIFT-1)))>>FILTER_SHIFT;\
d = (unsigned)(v + 32768) > 65535 ? (v>>31) ^ 32767 : v
# if defined(TEMPLATE_RESAMPLE_S16)
# define RENAME(N) N ## _int16
# elif defined(TEMPLATE_RESAMPLE_S16_MMX2)
# define COMMON_CORE COMMON_CORE_INT16_MMX2
# define LINEAR_CORE LINEAR_CORE_INT16_MMX2
# define RENAME(N) N ## _int16_mmx2
# elif defined(TEMPLATE_RESAMPLE_S16_SSE2)
# define COMMON_CORE COMMON_CORE_INT16_SSE2
# define LINEAR_CORE LINEAR_CORE_INT16_SSE2
# define RENAME(N) N ## _int16_sse2
# endif
#endif
#if DO_RESAMPLE_ONE
...
...
libswresample/x86/resample.asm
View file @
847bb638
;******************************************************************************
;* Copyright (c) 2012 Michael Niedermayer
;* Copyright (c) 2014 James Almer <jamrial <at> gmail.com>
;* Copyright (c) 2014 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
...
...
@@ -49,14 +50,15 @@ endstruc
SECTION_RODATA
pf_1
:
dd
1
.
0
pd_0x4000
:
dd
0x4000
SECTION
.
text
%macro
RESAMPLE_F
LOAT_FNS
0
; int resample_common_
float(ResampleContext *ctx, flo
at *dst,
;
const flo
at *src, int size, int update_ctx)
%macro
RESAMPLE_F
NS
3
; format [float or int16], bps, log2_bps
; int resample_common_
$format(ResampleContext *ctx, $form
at *dst,
;
const $form
at *src, int size, int update_ctx)
%if
ARCH_X86_64
; unix64 and win64
cglobal
resample_common_
float
,
0
,
15
,
2
,
ctx
,
dst
,
src
,
phase_shift
,
index
,
frac
,
\
cglobal
resample_common_
%1
,
0
,
15
,
2
,
ctx
,
dst
,
src
,
phase_shift
,
index
,
frac
,
\
dst_incr_mod
,
size
,
min_filter_count_x4
,
\
min_filter_len_x4
,
dst_incr_div
,
src_incr
,
\
phase_mask
,
dst_end
,
filter_bank
...
...
@@ -85,8 +87,8 @@ cglobal resample_common_float, 0, 15, 2, ctx, dst, src, phase_shift, index, frac
mov
ctx_stackq
,
ctxq
mov
min_filter_len_x4d
,
[
ctxq
+
ResampleContext
.
filter_length
]
mov
dst_incr_divd
,
[
ctxq
+
ResampleContext
.
dst_incr_div
]
shl
min_filter_len_x4d
,
2
lea
dst_endq
,
[
dstq
+
sizeq
*
4
]
shl
min_filter_len_x4d
,
%3
lea
dst_endq
,
[
dstq
+
sizeq
*
%2
]
%if
UNIX64
mov
ecx
,
[
ctxq
+
ResampleContext
.
phase_shift
]
...
...
@@ -109,7 +111,7 @@ cglobal resample_common_float, 0, 15, 2, ctx, dst, src, phase_shift, index, frac
sub
srcq
,
min_filter_len_x4q
mov
src_stackq
,
srcq
%else
; x86-32
cglobal
resample_common_
float
,
1
,
7
,
2
,
ctx
,
phase_shift
,
dst
,
frac
,
\
cglobal
resample_common_
%1
,
1
,
7
,
2
,
ctx
,
phase_shift
,
dst
,
frac
,
\
index
,
min_filter_length_x4
,
filter_bank
; push temp variables to stack
...
...
@@ -119,7 +121,7 @@ cglobal resample_common_float, 1, 7, 2, ctx, phase_shift, dst, frac, \
mov
dstq
,
r1mp
mov
r3
,
r3mp
lea
r3
,
[
dstq
+
r3
*
4
]
lea
r3
,
[
dstq
+
r3
*
%2
]
PUSH
dword
[
ctxq
+
ResampleContext
.
dst_incr_div
]
PUSH
dword
[
ctxq
+
ResampleContext
.
dst_incr_mod
]
PUSH
dword
[
ctxq
+
ResampleContext
.
filter_alloc
]
...
...
@@ -128,7 +130,7 @@ cglobal resample_common_float, 1, 7, 2, ctx, phase_shift, dst, frac, \
PUSH
dword
[
ctxq
+
ResampleContext
.
src_incr
]
mov
min_filter_length_x4d
,
[
ctxq
+
ResampleContext
.
filter_length
]
mov
indexd
,
[
ctxq
+
ResampleContext
.
index
]
shl
min_filter_length_x4d
,
2
shl
min_filter_length_x4d
,
%3
mov
fracd
,
[
ctxq
+
ResampleContext
.
frac
]
neg
min_filter_length_x4q
mov
filter_bankq
,
[
ctxq
+
ResampleContext
.
filter_bank
]
...
...
@@ -157,19 +159,28 @@ cglobal resample_common_float, 1, 7, 2, ctx, phase_shift, dst, frac, \
imul
filterd
,
indexd
%if
ARCH_X86_64
mov
min_filter_count_x4q
,
min_filter_len_x4q
lea
filterq
,
[
filter_bankq
+
filterq
*
4
]
lea
filterq
,
[
filter_bankq
+
filterq
*
%2
]
%else
; x86-32
mov
min_filter_count_x4q
,
filter_bankq
lea
filterq
,
[
min_filter_count_x4q
+
filterq
*
4
]
lea
filterq
,
[
min_filter_count_x4q
+
filterq
*
%2
]
mov
min_filter_count_x4q
,
min_filter_length_x4q
%endif
%ifidn
%1
,
float
xorps
m0
,
m0
,
m0
%else
; int16
movd
m0
,
[
pd_0x4000
]
%endif
align
16
.
inner_loop
:
movups
m1
,
[
srcq
+
min_filter_count_x4q
*
1
]
movu
m1
,
[
srcq
+
min_filter_count_x4q
*
1
]
%ifidn
%1
,
float
mulps
m1
,
m1
,
[
filterq
+
min_filter_count_x4q
*
1
]
addps
m0
,
m0
,
m1
%else
; int16
pmaddwd
m1
,
[
filterq
+
min_filter_count_x4q
*
1
]
paddd
m0
,
m1
%endif
add
min_filter_count_x4q
,
mmsize
js
.
inner_loop
...
...
@@ -179,6 +190,7 @@ cglobal resample_common_float, 1, 7, 2, ctx, phase_shift, dst, frac, \
%endif
; horizontal sum & store
%ifidn
%1
,
float
movhlps
xm1
,
xm0
addps
xm0
,
xm1
shufps
xm1
,
xm0
,
xm0
,
q0001
...
...
@@ -186,6 +198,21 @@ cglobal resample_common_float, 1, 7, 2, ctx, phase_shift, dst, frac, \
addps
xm0
,
xm1
add
indexd
,
dst_incr_divd
movss
[dstq],
xm0
%else
; int16
%if
mmsize
==
16
pshufd
m1
,
m0
,
q0032
paddd
m0
,
m1
pshufd
m1
,
m0
,
q0001
%else
; mmsize == 8
pshufw
m1
,
m0
,
q0032
%endif
paddd
m0
,
m1
psrad
m0
,
15
add
fracd
,
dst_incr_modd
packssdw
m0
,
m0
add
indexd
,
dst_incr_divd
movd
[dstq],
m0
%endif
cmp
fracd
,
src_incrd
jl
.
skip
sub
fracd
,
src_incrd
...
...
@@ -205,10 +232,10 @@ cglobal resample_common_float, 1, 7, 2, ctx, phase_shift, dst, frac, \
.
skip
:
mov
index_incrd
,
indexd
add
dstq
,
4
add
dstq
,
%2
and
indexd
,
phase_maskd
sar
index_incrd
,
phase_shiftb
lea
srcq
,
[
srcq
+
index_incrq
*
4
]
lea
srcq
,
[
srcq
+
index_incrq
*
%2
]
cmp
dstq
,
dst_endq
jne
.
loop
...
...
@@ -228,7 +255,7 @@ cglobal resample_common_float, 1, 7, 2, ctx, phase_shift, dst, frac, \
mov
[
ctxq
+
ResampleContext
.
frac
]
,
fracd
sub
rax
,
src_stackq
mov
[
ctxq
+
ResampleContext
.
index
]
,
indexd
shr
rax
,
2
shr
rax
,
%3
.
skip_store
:
%if
ARCH_X86_32
...
...
@@ -236,13 +263,24 @@ cglobal resample_common_float, 1, 7, 2, ctx, phase_shift, dst, frac, \
%endif
RET
; int resample_linear_
flo
at(ResampleContext *ctx, float *dst,
; int resample_linear_
$form
at(ResampleContext *ctx, float *dst,
; const float *src, int size, int update_ctx)
%if
ARCH_X86_64
; unix64 and win64
cglobal
resample_linear_float
,
0
,
15
,
5
,
ctx
,
dst
,
src
,
phase_shift
,
index
,
frac
,
\
dst_incr_mod
,
size
,
min_filter_count_x4
,
\
%if
UNIX64
cglobal
resample_linear_
%1
,
0
,
15
,
5
,
ctx
,
dst
,
phase_mask
,
phase_shift
,
index
,
frac
,
\
size
,
dst_incr_mod
,
min_filter_count_x4
,
\
min_filter_len_x4
,
dst_incr_div
,
src_incr
,
\
phase_mask
,
dst_end
,
filter_bank
src
,
dst_end
,
filter_bank
mov
srcq
,
r2mp
%else
; win64
cglobal
resample_linear_
%1
,
0
,
15
,
5
,
ctx
,
phase_mask
,
src
,
phase_shift
,
index
,
frac
,
\
size
,
dst_incr_mod
,
min_filter_count_x4
,
\
min_filter_len_x4
,
dst_incr_div
,
src_incr
,
\
dst
,
dst_end
,
filter_bank
mov
dstq
,
r1mp
%endif
; use red-zone for variable storage
%define
ctx_stackq
[
rsp
-
0x8
]
...
...
@@ -269,27 +307,31 @@ cglobal resample_linear_float, 0, 15, 5, ctx, dst, src, phase_shift, index, frac
mov
ctx_stackq
,
ctxq
mov
phase_mask_stackd
,
phase_maskd
mov
min_filter_len_x4d
,
[
ctxq
+
ResampleContext
.
filter_length
]
%ifidn
%1
,
float
cvtsi2ss
xm0
,
src_incrd
movss
xm4
,
[
pf_1
]
divss
xm4
,
xm0
%else
; int16
movd
m4
,
[
pd_0x4000
]
%endif
mov
dst_incr_divd
,
[
ctxq
+
ResampleContext
.
dst_incr_div
]
shl
min_filter_len_x4d
,
2
lea
dst_endq
,
[
dstq
+
sizeq
*
4
]
shl
min_filter_len_x4d
,
%3
lea
dst_endq
,
[
dstq
+
sizeq
*
%2
]
%if
UNIX64
mov
ecx
,
[
ctxq
+
ResampleContext
.
phase_shift
]
mov
edi
,
[
ctxq
+
ResampleContext
.
filter_alloc
]
DEFINE_ARGS
filter_alloc
,
dst
,
src
,
phase_shift
,
index
,
frac
,
dst_incr_mod
,
\
filter1
,
min_filter_count_x4
,
min_filter_len_x4
,
dst_incr_div
,
\
src_incr
,
filter2
,
dst_end
,
filter_bank
DEFINE_ARGS
filter_alloc
,
dst
,
filter2
,
phase_shift
,
index
,
frac
,
filter1
,
\
dst_incr_mod
,
min_filter_count_x4
,
min_filter_len_x4
,
\
dst_incr_div
,
src_incr
,
src
,
dst_end
,
filter_bank
%elif
WIN64
mov
R9d
,
[
ctxq
+
ResampleContext
.
filter_alloc
]
mov
ecx
,
[
ctxq
+
ResampleContext
.
phase_shift
]
DEFINE_ARGS
phase_shift
,
dst
,
src
,
filter_alloc
,
index
,
frac
,
dst_incr_mod
,
\
filter1
,
min_filter_count_x4
,
min_filter_len_x4
,
dst_incr_div
,
\
src_incr
,
filter2
,
dst_end
,
filter_bank
DEFINE_ARGS
phase_shift
,
filter2
,
src
,
filter_alloc
,
index
,
frac
,
filter1
,
\
dst_incr_mod
,
min_filter_count_x4
,
min_filter_len_x4
,
\
dst_incr_div
,
src_incr
,
dst
,
dst_end
,
filter_bank
%endif
neg
min_filter_len_x4q
...
...
@@ -297,8 +339,8 @@ cglobal resample_linear_float, 0, 15, 5, ctx, dst, src, phase_shift, index, frac
sub
srcq
,
min_filter_len_x4q
mov
src_stackq
,
srcq
%else
; x86-32
cglobal
resample_linear_
float
,
1
,
7
,
5
,
ctx
,
filter1
,
dst
,
frac
,
\
index
,
min_filter_length_x4
,
filter_bank
cglobal
resample_linear_
%1
,
1
,
7
,
5
,
ctx
,
min_filter_length_x4
,
filter2
,
\
frac
,
index
,
dst
,
filter_bank
; push temp variables to stack
%define
ctx_stackq
r0mp
...
...
@@ -307,23 +349,27 @@ cglobal resample_linear_float, 1, 7, 5, ctx, filter1, dst, frac, \
mov
dstq
,
r1mp
mov
r3
,
r3mp
lea
r3
,
[
dstq
+
r3
*
4
]
lea
r3
,
[
dstq
+
r3
*
%2
]
PUSH
dword
[
ctxq
+
ResampleContext
.
dst_incr_div
]
PUSH
r3
mov
r3
,
dword
[
ctxq
+
ResampleContext
.
filter_alloc
]
PUSH
dword
[
ctxq
+
ResampleContext
.
dst_incr_mod
]
PUSH
r3
shl
r3
,
2
shl
r3
,
%3
PUSH
r3
mov
r3
,
dword
[
ctxq
+
ResampleContext
.
src_incr
]
PUSH
dword
[
ctxq
+
ResampleContext
.
phase_mask
]
PUSH
r3d
%ifidn
%1
,
float
cvtsi2ss
xm0
,
r3d
movss
xm4
,
[
pf_1
]
divss
xm4
,
xm0
%else
; int16
movd
m4
,
[
pd_0x4000
]
%endif
mov
min_filter_length_x4d
,
[
ctxq
+
ResampleContext
.
filter_length
]
mov
indexd
,
[
ctxq
+
ResampleContext
.
index
]
shl
min_filter_length_x4d
,
2
shl
min_filter_length_x4d
,
%3
mov
fracd
,
[
ctxq
+
ResampleContext
.
frac
]
neg
min_filter_length_x4q
mov
filter_bankq
,
[
ctxq
+
ResampleContext
.
filter_bank
]
...
...
@@ -333,7 +379,7 @@ cglobal resample_linear_float, 1, 7, 5, ctx, filter1, dst, frac, \
PUSH
filter_bankq
PUSH
dword
[
ctxq
+
ResampleContext
.
phase_shift
]
DEFINE_ARGS
src
,
filter1
,
dst
,
frac
,
index
,
min_filter_count_x4
,
filter2
DEFINE_ARGS
filter1
,
min_filter_count_x4
,
filter2
,
frac
,
index
,
dst
,
src
%define
phase_shift_stackd
dword
[
rsp
+
0x0
]
%define
filter_bankq
dword
[
rsp
+
0x4
]
...
...
@@ -354,25 +400,37 @@ cglobal resample_linear_float, 1, 7, 5, ctx, filter1, dst, frac, \
imul
filter1d
,
indexd
%if
ARCH_X86_64
mov
min_filter_count_x4q
,
min_filter_len_x4q
lea
filter1q
,
[
filter_bankq
+
filter1q
*
4
]
lea
filter2q
,
[
filter1q
+
filter_allocq
*
4
]
lea
filter1q
,
[
filter_bankq
+
filter1q
*
%2
]
lea
filter2q
,
[
filter1q
+
filter_allocq
*
%2
]
%else
; x86-32
mov
min_filter_count_x4q
,
filter_bankq
lea
filter1q
,
[
min_filter_count_x4q
+
filter1q
*
4
]
lea
filter1q
,
[
min_filter_count_x4q
+
filter1q
*
%2
]
mov
min_filter_count_x4q
,
min_filter_length_x4q
mov
filter2q
,
filter1q
add
filter2q
,
filter_alloc_x4q
%endif
%ifidn
%1
,
float
xorps
m0
,
m0
,
m0
xorps
m2
,
m2
,
m2
%else
; int16
mova
m0
,
m4
mova
m2
,
m4
%endif
align
16
.
inner_loop
:
movups
m1
,
[
srcq
+
min_filter_count_x4q
*
1
]
movu
m1
,
[
srcq
+
min_filter_count_x4q
*
1
]
%ifidn
%1
,
float
mulps
m3
,
m1
,
[
filter2q
+
min_filter_count_x4q
*
1
]
mulps
m1
,
m1
,
[
filter1q
+
min_filter_count_x4q
*
1
]
addps
m2
,
m2
,
m3
addps
m0
,
m0
,
m1
%else
; int16
pmaddwd
m3
,
m1
,
[
filter2q
+
min_filter_count_x4q
*
1
]
pmaddwd
m1
,
[
filter1q
+
min_filter_count_x4q
*
1
]
paddd
m2
,
m3
paddd
m0
,
m1
%endif
add
min_filter_count_x4q
,
mmsize
js
.
inner_loop
...
...
@@ -383,6 +441,7 @@ cglobal resample_linear_float, 1, 7, 5, ctx, filter1, dst, frac, \
addps
xm2
,
xm3
%endif
%ifidn
%1
,
float
; val += (v2 - val) * (FELEML) frac / c->src_incr;
cvtsi2ss
xm1
,
fracd
subps
xm2
,
xm0
...
...
@@ -399,21 +458,55 @@ cglobal resample_linear_float, 1, 7, 5, ctx, filter1, dst, frac, \
addps
xm0
,
xm1
add
indexd
,
dst_incr_divd
movss
[dstq],
xm0
%else
; int16
%if
mmsize
==
16
pshufd
m3
,
m2
,
q0032
pshufd
m1
,
m0
,
q0032
paddd
m2
,
m3
paddd
m0
,
m1
pshufd
m3
,
m2
,
q0001
pshufd
m1
,
m0
,
q0001
%else
; mmsize == 8
pshufw
m3
,
m2
,
q0032
pshufw
m1
,
m0
,
q0032
%endif
paddd
m2
,
m3
paddd
m0
,
m1
psubd
m2
,
m0
; This is probably a really bad idea on atom and other machines with a
; long transfer latency between GPRs and XMMs (atom). However, it does
; make the clip a lot simpler...
movd
eax
,
m2
add
indexd
,
dst_incr_divd
imul
fracd
idiv
src_incrd
movd
m1
,
eax
add
fracd
,
dst_incr_modd
paddd
m0
,
m1
psrad
m0
,
15
packssdw
m0
,
m0
movd
[dstq],
m0
; note that for imul/idiv, I need to move filter to edx/eax for each:
; - 32bit: eax=r0[filter1], edx=r2[filter2]
; - win64: eax=r6[filter1], edx=r1[todo]
; - unix64: eax=r6[filter1], edx=r2[todo]
%endif
cmp
fracd
,
src_incrd
jl
.
skip
sub
fracd
,
src_incrd
inc
indexd
%if
UNIX64
DEFINE_ARGS
filter_alloc
,
dst
,
src
,
phase_shift
,
index
,
frac
,
dst_incr_mod
,
\
index_incr
,
min_filter_count_x4
,
min_filter_len_x4
,
dst_incr_div
,
\
src_incr
,
filter2
,
dst_end
,
filter_bank
DEFINE_ARGS
filter_alloc
,
dst
,
filter2
,
phase_shift
,
index
,
frac
,
index_incr
,
\
dst_incr_mod
,
min_filter_count_x4
,
min_filter_len_x4
,
\
dst_incr_div
,
src_incr
,
src
,
dst_end
,
filter_bank
%elif
WIN64
DEFINE_ARGS
phase_shift
,
dst
,
src
,
filter_alloc
,
index
,
frac
,
dst_incr_mod
,
\
index_incr
,
min_filter_count_x4
,
min_filter_len_x4
,
dst_incr_div
,
\
src_incr
,
filter2
,
dst_end
,
filter_bank
DEFINE_ARGS
phase_shift
,
filter2
,
src
,
filter_alloc
,
index
,
frac
,
index_incr
,
\
dst_incr_mod
,
min_filter_count_x4
,
min_filter_len_x4
,
\
dst_incr_div
,
src_incr
,
dst
,
dst_end
,
filter_bank
%else
; x86-32
DEFINE_ARGS
src
,
phase_shift
,
dst
,
frac
,
index
,
index_incr
DEFINE_ARGS
filter1
,
phase_shift
,
index_incr
,
frac
,
index
,
dst
,
src
%endif
.
skip
:
...
...
@@ -421,17 +514,23 @@ cglobal resample_linear_float, 1, 7, 5, ctx, filter1, dst, frac, \
mov
phase_shiftd
,
phase_shift_stackd
%endif
mov
index_incrd
,
indexd
add
dstq
,
4
add
dstq
,
%2
and
indexd
,
phase_mask_stackd
sar
index_incrd
,
phase_shiftb
lea
srcq
,
[
srcq
+
index_incrq
*
4
]
lea
srcq
,
[
srcq
+
index_incrq
*
%2
]
cmp
dstq
,
dst_endq
jne
.
loop
%if
ARCH_X86_64
DEFINE_ARGS
ctx
,
dst
,
src
,
phase_shift
,
index
,
frac
%if
UNIX64
DEFINE_ARGS
ctx
,
dst
,
filter2
,
phase_shift
,
index
,
frac
,
index_incr
,
\
dst_incr_mod
,
min_filter_count_x4
,
min_filter_len_x4
,
\
dst_incr_div
,
src_incr
,
src
,
dst_end
,
filter_bank
%elif
WIN64
DEFINE_ARGS
ctx
,
filter2
,
src
,
phase_shift
,
index
,
frac
,
index_incr
,
\
dst_incr_mod
,
min_filter_count_x4
,
min_filter_len_x4
,
\
dst_incr_div
,
src_incr
,
dst
,
dst_end
,
filter_bank
%else
; x86-32
DEFINE_ARGS
src
,
ctx
,
update_context
,
frac
,
index
DEFINE_ARGS
filter1
,
ctx
,
update_context
,
frac
,
index
,
dst
,
src
%endif
cmp
dword
update_context_stackd
,
0
...
...
@@ -444,7 +543,7 @@ cglobal resample_linear_float, 1, 7, 5, ctx, filter1, dst, frac, \
mov
[
ctxq
+
ResampleContext
.
frac
]
,
fracd
sub
rax
,
src_stackq
mov
[
ctxq
+
ResampleContext
.
index
]
,
indexd
shr
rax
,
2
shr
rax
,
%3
.
skip_store
:
%if
ARCH_X86_32
...
...
@@ -454,9 +553,17 @@ cglobal resample_linear_float, 1, 7, 5, ctx, filter1, dst, frac, \
%endmacro
INIT_XMM
sse
RESAMPLE_F
LOAT_FNS
RESAMPLE_F
NS
float
,
4
,
2
%if
HAVE_AVX_EXTERNAL
INIT_YMM
avx
RESAMPLE_F
LOAT_FNS
RESAMPLE_F
NS
float
,
4
,
2
%endif
%if
ARCH_X86_32
INIT_MMX
mmxext
RESAMPLE_FNS
int16
,
2
,
1
%endif
INIT_XMM
sse2
RESAMPLE_FNS
int16
,
2
,
1
libswresample/x86/resample_mmx.h
View file @
847bb638
...
...
@@ -22,116 +22,6 @@
#include "libavutil/cpu.h"
#include "libswresample/swresample_internal.h"
DECLARE_ALIGNED
(
16
,
const
uint64_t
,
ff_resample_int16_rounder
)[
2
]
=
{
0x0000000000004000ULL
,
0x0000000000000000ULL
};
#define COMMON_CORE_INT16_MMX2 \
x86_reg len= -2*c->filter_length;\
__asm__ volatile(\
"movq "MANGLE(ff_resample_int16_rounder)", %%mm0 \n\t"\
"1: \n\t"\
"movq (%1, %0), %%mm1 \n\t"\
"pmaddwd (%2, %0), %%mm1 \n\t"\
"paddd %%mm1, %%mm0 \n\t"\
"add $8, %0 \n\t"\
" js 1b \n\t"\
"pshufw $0x0E, %%mm0, %%mm1 \n\t"\
"paddd %%mm1, %%mm0 \n\t"\
"psrad $15, %%mm0 \n\t"\
"packssdw %%mm0, %%mm0 \n\t"\
"movd %%mm0, (%3) \n\t"\
: "+r" (len)\
: "r" (((uint8_t*)(src+sample_index))-len),\
"r" (((uint8_t*)filter)-len),\
"r" (dst+dst_index)\
NAMED_CONSTRAINTS_ARRAY_ADD(ff_resample_int16_rounder)\
);
#define LINEAR_CORE_INT16_MMX2 \
x86_reg len= -2*c->filter_length;\
__asm__ volatile(\
"pxor %%mm0, %%mm0 \n\t"\
"pxor %%mm2, %%mm2 \n\t"\
"1: \n\t"\
"movq (%3, %0), %%mm1 \n\t"\
"movq %%mm1, %%mm3 \n\t"\
"pmaddwd (%4, %0), %%mm1 \n\t"\
"pmaddwd (%5, %0), %%mm3 \n\t"\
"paddd %%mm1, %%mm0 \n\t"\
"paddd %%mm3, %%mm2 \n\t"\
"add $8, %0 \n\t"\
" js 1b \n\t"\
"pshufw $0x0E, %%mm0, %%mm1 \n\t"\
"pshufw $0x0E, %%mm2, %%mm3 \n\t"\
"paddd %%mm1, %%mm0 \n\t"\
"paddd %%mm3, %%mm2 \n\t"\
"movd %%mm0, %1 \n\t"\
"movd %%mm2, %2 \n\t"\
: "+r" (len),\
"=r" (val),\
"=r" (v2)\
: "r" (((uint8_t*)(src+sample_index))-len),\
"r" (((uint8_t*)filter)-len),\
"r" (((uint8_t*)(filter+c->filter_alloc))-len)\
);
#define COMMON_CORE_INT16_SSE2 \
x86_reg len= -2*c->filter_length;\
__asm__ volatile(\
"movdqa "MANGLE(ff_resample_int16_rounder)", %%xmm0 \n\t"\
"1: \n\t"\
"movdqu (%1, %0), %%xmm1 \n\t"\
"pmaddwd (%2, %0), %%xmm1 \n\t"\
"paddd %%xmm1, %%xmm0 \n\t"\
"add $16, %0 \n\t"\
" js 1b \n\t"\
"pshufd $0x0E, %%xmm0, %%xmm1 \n\t"\
"paddd %%xmm1, %%xmm0 \n\t"\
"pshufd $0x01, %%xmm0, %%xmm1 \n\t"\
"paddd %%xmm1, %%xmm0 \n\t"\
"psrad $15, %%xmm0 \n\t"\
"packssdw %%xmm0, %%xmm0 \n\t"\
"movd %%xmm0, (%3) \n\t"\
: "+r" (len)\
: "r" (((uint8_t*)(src+sample_index))-len),\
"r" (((uint8_t*)filter)-len),\
"r" (dst+dst_index)\
NAMED_CONSTRAINTS_ARRAY_ADD(ff_resample_int16_rounder)\
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1")\
);
#define LINEAR_CORE_INT16_SSE2 \
x86_reg len= -2*c->filter_length;\
__asm__ volatile(\
"pxor %%xmm0, %%xmm0 \n\t"\
"pxor %%xmm2, %%xmm2 \n\t"\
"1: \n\t"\
"movdqu (%3, %0), %%xmm1 \n\t"\
"movdqa %%xmm1, %%xmm3 \n\t"\
"pmaddwd (%4, %0), %%xmm1 \n\t"\
"pmaddwd (%5, %0), %%xmm3 \n\t"\
"paddd %%xmm1, %%xmm0 \n\t"\
"paddd %%xmm3, %%xmm2 \n\t"\
"add $16, %0 \n\t"\
" js 1b \n\t"\
"pshufd $0x0E, %%xmm0, %%xmm1 \n\t"\
"pshufd $0x0E, %%xmm2, %%xmm3 \n\t"\
"paddd %%xmm1, %%xmm0 \n\t"\
"paddd %%xmm3, %%xmm2 \n\t"\
"pshufd $0x01, %%xmm0, %%xmm1 \n\t"\
"pshufd $0x01, %%xmm2, %%xmm3 \n\t"\
"paddd %%xmm1, %%xmm0 \n\t"\
"paddd %%xmm3, %%xmm2 \n\t"\
"movd %%xmm0, %1 \n\t"\
"movd %%xmm2, %2 \n\t"\
: "+r" (len),\
"=r" (val),\
"=r" (v2)\
: "r" (((uint8_t*)(src+sample_index))-len),\
"r" (((uint8_t*)filter)-len),\
"r" (((uint8_t*)(filter+c->filter_alloc))-len)\
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3")\
);
#define COMMON_CORE_DBL_SSE2 \
x86_reg len= -8*c->filter_length;\
__asm__ volatile(\
...
...
libswresample/x86/resample_x86_dsp.c
View file @
847bb638
...
...
@@ -27,34 +27,14 @@
#include "libswresample/resample.h"
int
swri_resample_common_int16_mmx2
(
ResampleContext
*
c
,
int16_t
*
dst
,
const
int16_t
*
src
,
int
n
,
int
update_ctx
);
int
swri_resample_linear_int16_mmx2
(
ResampleContext
*
c
,
int16_t
*
dst
,
const
int16_t
*
src
,
int
n
,
int
update_ctx
);
int
swri_resample_common_int16_sse2
(
ResampleContext
*
c
,
int16_t
*
dst
,
const
int16_t
*
src
,
int
n
,
int
update_ctx
);
int
swri_resample_linear_int16_sse2
(
ResampleContext
*
c
,
int16_t
*
dst
,
const
int16_t
*
src
,
int
n
,
int
update_ctx
);
int
swri_resample_common_float_sse
(
ResampleContext
*
c
,
float
*
dst
,
const
float
*
src
,
int
n
,
int
update_ctx
);
int
swri_resample_linear_float_sse
(
ResampleContext
*
c
,
float
*
dst
,
const
float
*
src
,
int
n
,
int
update_ctx
);
int
swri_resample_common_float_avx
(
ResampleContext
*
c
,
float
*
dst
,
const
float
*
src
,
int
n
,
int
update_ctx
);
int
swri_resample_linear_float_avx
(
ResampleContext
*
c
,
float
*
dst
,
const
float
*
src
,
int
n
,
int
update_ctx
);
int
swri_resample_common_double_sse2
(
ResampleContext
*
c
,
double
*
dst
,
const
double
*
src
,
int
n
,
int
update_ctx
);
int
swri_resample_linear_double_sse2
(
ResampleContext
*
c
,
double
*
dst
,
const
double
*
src
,
int
n
,
int
update_ctx
);
#if HAVE_MMXEXT_INLINE
#if HAVE_SSE2_INLINE
#define DO_RESAMPLE_ONE 0
#include "resample_mmx.h"
#if ARCH_X86_32
#define TEMPLATE_RESAMPLE_S16_MMX2
#include "libswresample/resample_template.c"
#undef TEMPLATE_RESAMPLE_S16_MMX2
#endif
#if HAVE_SSE2_INLINE
#define TEMPLATE_RESAMPLE_S16_SSE2
#include "libswresample/resample_template.c"
#undef TEMPLATE_RESAMPLE_S16_SSE2
#define TEMPLATE_RESAMPLE_DBL_SSE2
#include "libswresample/resample_template.c"
#undef TEMPLATE_RESAMPLE_DBL_SSE2
...
...
@@ -62,7 +42,15 @@ int swri_resample_linear_double_sse2(ResampleContext *c, double *dst, const do
#undef DO_RESAMPLE_ONE
#endif // HAVE_MMXEXT_INLINE
int
ff_resample_common_int16_mmxext
(
ResampleContext
*
c
,
uint8_t
*
dst
,
const
uint8_t
*
src
,
int
sz
,
int
upd
);
int
ff_resample_linear_int16_mmxext
(
ResampleContext
*
c
,
uint8_t
*
dst
,
const
uint8_t
*
src
,
int
sz
,
int
upd
);
int
ff_resample_common_int16_sse2
(
ResampleContext
*
c
,
uint8_t
*
dst
,
const
uint8_t
*
src
,
int
sz
,
int
upd
);
int
ff_resample_linear_int16_sse2
(
ResampleContext
*
c
,
uint8_t
*
dst
,
const
uint8_t
*
src
,
int
sz
,
int
upd
);
int
ff_resample_common_float_sse
(
ResampleContext
*
c
,
uint8_t
*
dst
,
const
uint8_t
*
src
,
int
sz
,
int
upd
);
...
...
@@ -79,17 +67,19 @@ void swresample_dsp_x86_init(ResampleContext *c)
int
av_unused
mm_flags
=
av_get_cpu_flags
();
#define FNIDX(fmt) (AV_SAMPLE_FMT_##fmt - AV_SAMPLE_FMT_S16P)
if
(
ARCH_X86_32
&&
HAVE_MMXEXT_
INLINE
&&
mm_flags
&
AV_CPU_FLAG_MMX2
)
{
c
->
dsp
.
resample_common
[
FNIDX
(
S16P
)]
=
(
resample_fn
)
swri_resample_common_int16_mmx2
;
c
->
dsp
.
resample_linear
[
FNIDX
(
S16P
)]
=
(
resample_fn
)
swri_resample_linear_int16_mmx2
;
if
(
ARCH_X86_32
&&
HAVE_MMXEXT_
EXTERNAL
&&
mm_flags
&
AV_CPU_FLAG_MMX2
)
{
c
->
dsp
.
resample_common
[
FNIDX
(
S16P
)]
=
ff_resample_common_int16_mmxext
;
c
->
dsp
.
resample_linear
[
FNIDX
(
S16P
)]
=
ff_resample_linear_int16_mmxext
;
}
if
(
HAVE_SSE_EXTERNAL
&&
mm_flags
&
AV_CPU_FLAG_SSE
)
{
c
->
dsp
.
resample_common
[
FNIDX
(
FLTP
)]
=
ff_resample_common_float_sse
;
c
->
dsp
.
resample_linear
[
FNIDX
(
FLTP
)]
=
ff_resample_linear_float_sse
;
}
if
(
HAVE_SSE2_EXTERNAL
&&
mm_flags
&
AV_CPU_FLAG_SSE2
)
{
c
->
dsp
.
resample_common
[
FNIDX
(
S16P
)]
=
ff_resample_common_int16_sse2
;
c
->
dsp
.
resample_linear
[
FNIDX
(
S16P
)]
=
ff_resample_linear_int16_sse2
;
}
if
(
HAVE_SSE2_INLINE
&&
mm_flags
&
AV_CPU_FLAG_SSE2
)
{
c
->
dsp
.
resample_common
[
FNIDX
(
S16P
)]
=
(
resample_fn
)
swri_resample_common_int16_sse2
;
c
->
dsp
.
resample_linear
[
FNIDX
(
S16P
)]
=
(
resample_fn
)
swri_resample_linear_int16_sse2
;
c
->
dsp
.
resample_common
[
FNIDX
(
DBLP
)]
=
(
resample_fn
)
swri_resample_common_double_sse2
;
c
->
dsp
.
resample_linear
[
FNIDX
(
DBLP
)]
=
(
resample_fn
)
swri_resample_linear_double_sse2
;
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment