Commit 1338fb79 authored Oct 08, 2015 by Ronald S. Bultje
vp9: add 10/12bpp sse2 SIMD version for idct_idct_16x16.
parent cb054d06
Showing 2 changed files with 223 additions and 16 deletions:

  libavcodec/x86/vp9dsp_init_16bpp_template.c   +2    -0
  libavcodec/x86/vp9itxfm_16bpp.asm             +221  -16
libavcodec/x86/vp9dsp_init_16bpp_template.c
@@ -135,6 +135,7 @@ decl_itxfm_func(idct, iadst, 4, BPC, sse2);
 decl_itxfm_func(iadst, idct,  4, BPC, sse2);
 decl_itxfm_func(iadst, iadst, 4, BPC, sse2);
 decl_itxfm_funcs(8, BPC, sse2);
+decl_itxfm_func(idct,  idct, 16, BPC, sse2);
 #endif /* HAVE_YASM */
 
 av_cold void INIT_FUNC(VP9DSPContext *dsp, int bitexact)
@@ -206,6 +207,7 @@ av_cold void INIT_FUNC(VP9DSPContext *dsp, int bitexact)
         init_itx_funcs(TX_4X4, 4, 12, sse2);
 #endif
         init_itx_funcs(TX_8X8, 8, BPC, sse2);
+        init_itx_func(TX_16X16, DCT_DCT, idct, idct, 16, BPC, sse2);
     }
 
     if (EXTERNAL_SSSE3(cpu_flags)) {
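The two added lines above are the whole glue layer: the first declares the SSE2 16x16 idct/idct symbol for this bit-depth build, the second stores it in the DSP context's inverse-transform table for the TX_16X16 / DCT_DCT slot. A minimal C sketch of the consumer side, assuming FFmpeg's usual itxfm_add table layout and function signature; the struct, enum values and function names below are illustrative stand-ins, not copied from vp9dsp.h:

    /* Hedged sketch of how the entry registered by init_itx_func() above is
     * reached at decode time. The table shape and the TX_16X16/DCT_DCT
     * indices mirror FFmpeg's vp9dsp convention; types and names here are
     * illustrative only. */
    #include <stdint.h>
    #include <stddef.h>

    typedef void (*vp9_itxfm_add_fn)(uint8_t *dst, ptrdiff_t stride,
                                     int16_t *block, int eob);

    enum { TX_4X4_E, TX_8X8_E, TX_16X16_E, TX_32X32_E, N_TXFM_SIZES_E };
    enum { DCT_DCT_E, ADST_DCT_E, DCT_ADST_E, ADST_ADST_E, N_TXFM_TYPES_E };

    struct vp9_dsp_sketch {
        vp9_itxfm_add_fn itxfm_add[N_TXFM_SIZES_E][N_TXFM_TYPES_E];
    };

    static void add_16x16_residual(const struct vp9_dsp_sketch *dsp,
                                   uint8_t *dst, ptrdiff_t stride,
                                   int16_t *coeffs, int eob)
    {
        /* With this commit, a 10/12bpp SSE2 build fills this slot with the
         * new vp9_idct_idct_16x16_add_{10,12} kernel instead of the C
         * fallback. */
        dsp->itxfm_add[TX_16X16_E][DCT_DCT_E](dst, stride, coeffs, eob);
    }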
libavcodec/x86/vp9itxfm_16bpp.asm
@@ -32,6 +32,7 @@ cextern pw_4095
 cextern pw_m1
 cextern pd_1
 cextern pd_16
 cextern pd_32
 cextern pd_8192
 
+pd_8: times 4 dd 8
@@ -530,19 +531,19 @@ IADST4_12BPP_FN iadst, IADST4, idct, IDCT4
 IADST4_12BPP_FN iadst, IADST4, iadst, IADST4
 
 ; the following line has not been executed at the end of this macro:
-; UNSCRATCH            6, 8, rsp+17*mmsize
-%macro IDCT8_1D 1 ; src
-    mova                m0, [%1 +  0 * mmsize]
-    mova                m2, [%1 +  4 * mmsize]
-    mova                m4, [%1 +  8 * mmsize]
-    mova                m6, [%1 + 12 * mmsize]
+; UNSCRATCH            6, 8, rsp+%3*mmsize
+%macro IDCT8_1D 1-3 2 * mmsize, 17 ; src, src_stride, stack_offset
+    mova                m0, [%1 + 0 * %2]
+    mova                m2, [%1 + 2 * %2]
+    mova                m4, [%1 + 4 * %2]
+    mova                m6, [%1 + 6 * %2]
     IDCT4_12BPP_1D       0, 2, 4, 6, 1, 3     ; m0/2/4/6 have t0/1/2/3
-    SCRATCH              4, 8, rsp+17*mmsize
-    SCRATCH              6, 9, rsp+18*mmsize
-    mova                m1, [%1 +  2 * mmsize]
-    mova                m3, [%1 +  6 * mmsize]
-    mova                m5, [%1 + 10 * mmsize]
-    mova                m7, [%1 + 14 * mmsize]
+    SCRATCH              4, 8, rsp+(%3+0)*mmsize
+    SCRATCH              6, 9, rsp+(%3+1)*mmsize
+    mova                m1, [%1 + 1 * %2]
+    mova                m3, [%1 + 3 * %2]
+    mova                m5, [%1 + 5 * %2]
+    mova                m7, [%1 + 7 * %2]
     SUMSUB_MUL           1, 7, 4, 6, 16069,  3196  ; m1=t7a, m7=t4a
     SUMSUB_MUL           5, 3, 4, 6,  9102, 13623  ; m5=t6a, m3=t5a
     SUMSUB_BA         d,  3, 7, 4                  ; m3=t4, m7=t5a
@@ -550,9 +551,9 @@ IADST4_12BPP_FN iadst, IADST4, iadst, IADST4
     SUMSUB_MUL           1, 7, 4, 6, 11585, 11585  ; m1=t6, m7=t5
     SUMSUB_BA         d,  5, 0, 4                  ; m5=out0, m0=out7
     SUMSUB_BA         d,  1, 2, 4                  ; m1=out1, m2=out6
-    UNSCRATCH            4, 8, rsp+17*mmsize
-    UNSCRATCH            6, 9, rsp+18*mmsize
-    SCRATCH              2, 8, rsp+17*mmsize
+    UNSCRATCH            4, 8, rsp+(%3+0)*mmsize
+    UNSCRATCH            6, 9, rsp+(%3+1)*mmsize
+    SCRATCH              2, 8, rsp+(%3+0)*mmsize
     SUMSUB_BA         d,  7, 4, 2                  ; m7=out2, m4=out5
     SUMSUB_BA         d,  3, 6, 2                  ; m3=out3, m6=out4
     SWAP                 0, 5, 4, 6, 2, 7
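The constant pairs passed to SUMSUB_MUL in this macro (16069/3196, 9102/13623, 11585/11585) are VP9's 14-bit fixed-point cosines, i.e. round(16384 * cos(k*pi/64)). As a reading aid, here is a scalar C sketch of the rotation one SUMSUB_MUL call appears to perform per 32-bit lane, assuming the usual pd_8192 rounding bias and 14-bit shift; the real macro is defined earlier in this file (outside this diff) and works on packed dwords in XMM registers:

    #include <stdint.h>

    /* Hedged scalar model of "SUMSUB_MUL a, b, tmp0, tmp1, c0, c1":
     *   a' = (a*c0 + b*c1 + 8192) >> 14
     *   b' = (a*c1 - b*c0 + 8192) >> 14
     * e.g. c0 = 16069, c1 = 3196 maps (in1, in7) to (t7a, t4a) in IDCT8_1D. */
    static void sumsub_mul_scalar(int32_t *a, int32_t *b, int32_t c0, int32_t c1)
    {
        int64_t x = *a, y = *b;                     /* widen before multiplying */
        *a = (int32_t)((x * c0 + y * c1 + 8192) >> 14);
        *b = (int32_t)((x * c1 - y * c0 + 8192) >> 14);
    }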
@@ -772,7 +773,7 @@ cglobal vp9_idct_idct_8x8_add_12, 4, 6 + ARCH_X86_64, 10, \
 
 ; the following line has not been executed at the end of this macro:
 ; UNSCRATCH            6, 8, rsp+17*mmsize
-%macro IADST8_1D 1
+%macro IADST8_1D 1 ; src
     mova                m0, [%1 +  0 * mmsize]
     mova                m3, [%1 +  6 * mmsize]
     mova                m4, [%1 +  8 * mmsize]
@@ -904,3 +905,207 @@ INIT_XMM sse2
 IADST8_FN idct,  IDCT8,  iadst, IADST8
 IADST8_FN iadst, IADST8, idct,  IDCT8
 IADST8_FN iadst, IADST8, iadst, IADST8
+
+%macro IDCT16_1D 1 ; src
+    IDCT8_1D            %1, 8 * mmsize, 67 ; m0-3=t0-3a, m4-5/m8|r67/m7=t4-7
+    ; SCRATCH            6, 8, rsp+67*mmsize ; t6
+    SCRATCH              0, 15, rsp+74*mmsize   ; t0a
+    SCRATCH              1, 14, rsp+73*mmsize   ; t1a
+    SCRATCH              2, 13, rsp+72*mmsize   ; t2a
+    SCRATCH              3, 12, rsp+71*mmsize   ; t3a
+    SCRATCH              4, 11, rsp+70*mmsize   ; t4
+    mova [rsp+65*mmsize], m5                    ; t5
+    mova [rsp+66*mmsize], m7                    ; t7
+
+    mova                m0, [%1 +  1 * 4 * mmsize] ; in1
+    mova                m3, [%1 +  7 * 4 * mmsize] ; in7
+    mova                m4, [%1 +  9 * 4 * mmsize] ; in9
+    mova                m7, [%1 + 15 * 4 * mmsize] ; in15
+
+    SUMSUB_MUL           0, 7, 1, 2, 16305,  1606  ; m0=t15a, m7=t8a
+    SUMSUB_MUL           4, 3, 1, 2, 10394, 12665  ; m4=t14a, m3=t9a
+    SUMSUB_BA         d,  3, 7, 1                  ; m3=t8, m7=t9
+    SUMSUB_BA         d,  4, 0, 1                  ; m4=t15,m0=t14
+    SUMSUB_MUL           0, 7, 1, 2, 15137,  6270  ; m0=t14a, m7=t9a
+
+    mova                m1, [%1 +  3 * 4 * mmsize] ; in3
+    mova                m2, [%1 +  5 * 4 * mmsize] ; in5
+    mova                m5, [%1 + 11 * 4 * mmsize] ; in11
+    mova                m6, [%1 + 13 * 4 * mmsize] ; in13
+
+    SCRATCH              0,  9, rsp+68*mmsize
+    SCRATCH              7, 10, rsp+69*mmsize
+
+    SUMSUB_MUL           2, 5, 0, 7, 14449,  7723  ; m2=t13a, m5=t10a
+    SUMSUB_MUL           6, 1, 0, 7,  4756, 15679  ; m6=t12a, m1=t11a
+    SUMSUB_BA         d,  5, 1, 0                  ; m5=t11,m1=t10
+    SUMSUB_BA         d,  2, 6, 0                  ; m2=t12,m6=t13
+    NEGD                m1                         ; m1=-t10
+    SUMSUB_MUL           1, 6, 0, 7, 15137,  6270  ; m1=t13a, m6=t10a
+
+    UNSCRATCH            7, 10, rsp+69*mmsize
+    SUMSUB_BA         d,  5, 3, 0                  ; m5=t8a, m3=t11a
+    SUMSUB_BA         d,  6, 7, 0                  ; m6=t9, m7=t10
+    SUMSUB_BA         d,  2, 4, 0                  ; m2=t15a,m4=t12a
+    SCRATCH              5, 10, rsp+69*mmsize
+    SUMSUB_MUL           4, 3, 0, 5, 11585, 11585  ; m4=t12, m3=t11
+    UNSCRATCH            0,  9, rsp+68*mmsize
+    SUMSUB_BA         d,  1, 0, 5                  ; m1=t14, m0=t13
+    SCRATCH              6,  9, rsp+68*mmsize
+    SUMSUB_MUL           0, 7, 6, 5, 11585, 11585  ; m0=t13a,m7=t10a
+
+    ; order: 15|r74,14|r73,13|r72,12|r71,11|r70,r65,8|r67,r66,10|r69,9|r68,7,3,4,0,1,2
+    ; free: 6,5
+
+    UNSCRATCH            5, 15, rsp+74*mmsize
+    SUMSUB_BA         d,  2, 5, 6                  ; m2=out0, m5=out15
+    SCRATCH              5, 15, rsp+74*mmsize
+    UNSCRATCH            5, 14, rsp+73*mmsize
+    SUMSUB_BA         d,  1, 5, 6                  ; m1=out1, m5=out14
+    SCRATCH              5, 14, rsp+73*mmsize
+    UNSCRATCH            5, 13, rsp+72*mmsize
+    SUMSUB_BA         d,  0, 5, 6                  ; m0=out2, m5=out13
+    SCRATCH              5, 13, rsp+72*mmsize
+    UNSCRATCH            5, 12, rsp+71*mmsize
+    SUMSUB_BA         d,  4, 5, 6                  ; m4=out3, m5=out12
+    SCRATCH              5, 12, rsp+71*mmsize
+    UNSCRATCH            5, 11, rsp+70*mmsize
+    SUMSUB_BA         d,  3, 5, 6                  ; m3=out4, m5=out11
+    SCRATCH              4, 11, rsp+70*mmsize
+    mova                m4, [rsp+65*mmsize]
+    SUMSUB_BA         d,  7, 4, 6                  ; m7=out5, m4=out10
+    mova [rsp+65*mmsize], m5
+    UNSCRATCH            5,  8, rsp+67*mmsize
+    UNSCRATCH            6,  9, rsp+68*mmsize
+    SCRATCH              2,  8, rsp+67*mmsize
+    SCRATCH              1,  9, rsp+68*mmsize
+    UNSCRATCH            1, 10, rsp+69*mmsize
+    SCRATCH              0, 10, rsp+69*mmsize
+    mova                m0, [rsp+66*mmsize]
+    SUMSUB_BA         d,  6, 5, 2                  ; m6=out6, m5=out9
+    SUMSUB_BA         d,  1, 0, 2                  ; m1=out7, m0=out8
+
+    SWAP                 0, 3, 1, 7, 2, 6, 4
+
+    ; output order: 8-11|r67-70=out0-3
+    ;               0-6,r65=out4-11
+    ;               12-15|r71-74=out12-15
+%endmacro
+
+INIT_XMM sse2
+cglobal vp9_idct_idct_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
+                                    67 * mmsize + ARCH_X86_32 * 8 * mmsize, \
+                                    dst, stride, block, eob
+    mova                m0, [pw_1023]
+    cmp               eobd, 1
+    jg .idctfull
+
+    ; dc-only
+
+.idctfull:
+    mova [rsp+64*mmsize], m0
+    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+%if ARCH_X86_64
+    mov            dstbakq, dstq
+%endif
+    lea           stride3q, [strideq*3]
+    mov               cntd, 4
+    mov               ptrq, rsp
+.loop_1:
+    IDCT16_1D       blockq
+
+    ; order: 2,1,0,11,3,7,9,10,6,8,4,5,12,13,r65,15
+    TRANSPOSE4x4D        0, 1, 2, 3, 7
+    mova  [ptrq+ 1*mmsize], m0
+    mova  [ptrq+ 5*mmsize], m1
+    mova  [ptrq+ 9*mmsize], m2
+    mova  [ptrq+13*mmsize], m3
+    mova                m7, [rsp+65*mmsize]
+    TRANSPOSE4x4D        4, 5, 6, 7, 0
+    mova  [ptrq+ 2*mmsize], m4
+    mova  [ptrq+ 6*mmsize], m5
+    mova  [ptrq+10*mmsize], m6
+    mova  [ptrq+14*mmsize], m7
+    UNSCRATCH            0,  8, rsp+67*mmsize
+    UNSCRATCH            1,  9, rsp+68*mmsize
+    UNSCRATCH            2, 10, rsp+69*mmsize
+    UNSCRATCH            3, 11, rsp+70*mmsize
+    TRANSPOSE4x4D        0, 1, 2, 3, 7
+    mova  [ptrq+ 0*mmsize], m0
+    mova  [ptrq+ 4*mmsize], m1
+    mova  [ptrq+ 8*mmsize], m2
+    mova  [ptrq+12*mmsize], m3
+    UNSCRATCH            4, 12, rsp+71*mmsize
+    UNSCRATCH            5, 13, rsp+72*mmsize
+    UNSCRATCH            6, 14, rsp+73*mmsize
+    UNSCRATCH            7, 15, rsp+74*mmsize
+    TRANSPOSE4x4D        4, 5, 6, 7, 0
+    mova  [ptrq+ 3*mmsize], m4
+    mova  [ptrq+ 7*mmsize], m5
+    mova  [ptrq+11*mmsize], m6
+    mova  [ptrq+15*mmsize], m7
+    add               ptrq, 16 * mmsize
+    add             blockq, mmsize
+    dec               cntd
+    jg .loop_1
+
+    mov               cntd, 4
+    mov               ptrq, rsp
+.loop_2:
+    IDCT16_1D         ptrq
+
+    pxor                m7, m7
+    lea               dstq, [dstq+strideq*4]
+    ROUND_AND_STORE_4x4  0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6
+    lea               dstq, [dstq+strideq*4]
+    mova                m0, [rsp+65*mmsize]
+    ROUND_AND_STORE_4x4  4, 5, 6, 0, m7, [rsp+64*mmsize], [pd_32], 6
+%if ARCH_X86_64
+    DEFINE_ARGS dstbak, stride, block, cnt, ptr, stride3, dst
+%else
+    mov               dstq, dstm
+%endif
+    UNSCRATCH            0,  8, rsp+67*mmsize
+    UNSCRATCH            1,  9, rsp+68*mmsize
+    UNSCRATCH            2, 10, rsp+69*mmsize
+    UNSCRATCH            3, 11, rsp+70*mmsize
+    ROUND_AND_STORE_4x4  0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6
+%if ARCH_X86_64
+    DEFINE_ARGS dst, stride, block, cnt, ptr, stride3, dstbak
+    lea               dstq, [dstbakq+stride3q*4]
+%else
+    lea               dstq, [dstq+stride3q*4]
+%endif
+    UNSCRATCH            4, 12, rsp+71*mmsize
+    UNSCRATCH            5, 13, rsp+72*mmsize
+    UNSCRATCH            6, 14, rsp+73*mmsize
+    UNSCRATCH            0, 15, rsp+74*mmsize
+    ROUND_AND_STORE_4x4  4, 5, 6, 0, m7, [rsp+64*mmsize], [pd_32], 6
+
+    add               ptrq, mmsize
+%if ARCH_X86_64
+    add            dstbakq, 8
+    mov               dstq, dstbakq
+%else
+    add         dword dstm, 8
+    mov               dstq, dstm
+%endif
+    dec               cntd
+    jg .loop_2
+
+    ; m7 is still zero
+    ZERO_BLOCK blockq-4*mmsize, 64, 16, m7
+    RET
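The function above follows the standard two-pass separable structure: .loop_1 runs IDCT16_1D on the input coefficients and writes transposed intermediates to the stack buffer, and .loop_2 runs IDCT16_1D on that buffer and then rounds, clips and adds the result to the 10-bit (or, below, 12-bit) pixels via ROUND_AND_STORE_4x4 with [pd_32] and a shift of 6. A hedged scalar outline of that structure; idct16_1d is a placeholder parameter standing in for the IDCT16_1D macro, the stride is in pixels, and pixel_max corresponds to pw_1023/pw_4095:

    #include <stdint.h>
    #include <stddef.h>

    /* Hedged outline of the two-pass flow in vp9_idct_idct_16x16_add_10/12.
     * idct16_1d() stands in for the IDCT16_1D macro; the (x + 32) >> 6
     * rounding and the pixel_max clamp mirror ROUND_AND_STORE_4x4 above. */
    static void idct16x16_add_outline(uint16_t *dst, ptrdiff_t stride,
                                      const int32_t *coeffs, int pixel_max,
                                      void (*idct16_1d)(const int32_t *in,
                                                        int in_stride,
                                                        int32_t *out))
    {
        int32_t tmp[16 * 16];

        /* pass 1 (.loop_1): 1-D transform down each column of the
         * coefficient block, result stored transposed in tmp */
        for (int i = 0; i < 16; i++) {
            int32_t line[16];
            idct16_1d(coeffs + i, 16, line);       /* column i */
            for (int r = 0; r < 16; r++)
                tmp[i * 16 + r] = line[r];         /* transpose on store */
        }
        /* pass 2 (.loop_2): 1-D transform across each row of the pass-1
         * result, then round, clip and add to the destination pixels */
        for (int i = 0; i < 16; i++) {
            int32_t line[16];
            idct16_1d(tmp + i, 16, line);          /* row i of pass-1 output */
            for (int c = 0; c < 16; c++) {
                int32_t v = dst[i * stride + c] + ((line[c] + 32) >> 6);
                dst[i * stride + c] = v < 0 ? 0 : v > pixel_max ? pixel_max : v;
            }
        }
    }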
+
+INIT_XMM sse2
+cglobal vp9_idct_idct_16x16_add_12, 4, 6 + ARCH_X86_64, 16, \
+                                    67 * mmsize + ARCH_X86_32 * 8 * mmsize, \
+                                    dst, stride, block, eob
+    mova                m0, [pw_4095]
+    cmp               eobd, 1
+    jg mangle(private_prefix %+ _ %+ vp9_idct_idct_16x16_add_10 %+ SUFFIX).idctfull
+
+    ; dc-only
+
+    jmp mangle(private_prefix %+ _ %+ vp9_idct_idct_16x16_add_10 %+ SUFFIX).idctfull
+    RET