Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
F
ffmpeg.wasm-core
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Linshizhi
ffmpeg.wasm-core
Commits
6b579cf5
Commit
6b579cf5
authored
Oct 06, 2015
by
Ronald S. Bultje
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
vp9: add 10bpp simd (mmxext/ssse3) for idct_idct_4x4.
parent
1c3be325
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
159 additions
and
49 deletions
+159
-49
constants.c
libavcodec/x86/constants.c
+2
-0
constants.h
libavcodec/x86/constants.h
+1
-0
vp9dsp_init_16bpp_template.c
libavcodec/x86/vp9dsp_init_16bpp_template.c
+12
-0
vp9itxfm.asm
libavcodec/x86/vp9itxfm.asm
+1
-49
vp9itxfm_16bpp.asm
libavcodec/x86/vp9itxfm_16bpp.asm
+96
-0
vp9itxfm_template.asm
libavcodec/x86/vp9itxfm_template.asm
+47
-0
No files found.
libavcodec/x86/constants.c
View file @
6b579cf5
...
...
@@ -85,6 +85,8 @@ DECLARE_ALIGNED(32, const ymm_reg, ff_pd_16) = { 0x0000001000000010ULL, 0x000
0x0000001000000010ULL
,
0x0000001000000010ULL
};
/* Four 64-bit initializers; each packs the value 32 (0x20) into both of its
 * 32-bit halves. Declared through DECLARE_ALIGNED with an alignment argument
 * of 32. */
DECLARE_ALIGNED(32, const ymm_reg, ff_pd_32) = {
    0x0000002000000020ULL, 0x0000002000000020ULL,
    0x0000002000000020ULL, 0x0000002000000020ULL
};
/* Four 64-bit initializers; each packs the value 8192 (0x2000) into both of
 * its 32-bit halves. Declared through DECLARE_ALIGNED with an alignment
 * argument of 32. */
DECLARE_ALIGNED(32, const ymm_reg, ff_pd_8192) = {
    0x0000200000002000ULL, 0x0000200000002000ULL,
    0x0000200000002000ULL, 0x0000200000002000ULL
};
/* Four 64-bit initializers; each packs the value 65535 (0xffff) into both of
 * its 32-bit halves. Declared through DECLARE_ALIGNED with an alignment
 * argument of 32. */
DECLARE_ALIGNED(32, const ymm_reg, ff_pd_65535) = {
    0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL,
    0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL
};
...
...
libavcodec/x86/constants.h
View file @
6b579cf5
...
...
@@ -65,6 +65,7 @@ extern const xmm_reg ff_ps_neg;
/* Declarations only (extern): ymm_reg-typed constant tables whose
 * definitions are provided elsewhere. */
extern const ymm_reg ff_pd_1;
extern const ymm_reg ff_pd_16;
extern const ymm_reg ff_pd_32;
extern const ymm_reg ff_pd_8192;
extern const ymm_reg ff_pd_65535;
# if ARCH_X86_64
...
...
libavcodec/x86/vp9dsp_init_16bpp_template.c
View file @
6b579cf5
...
...
@@ -125,6 +125,10 @@ lpf_mix2_wrappers_set(BPC, avx);
/* NOTE(review): decl_ipred_fns / decl_itxfm_func are project macros defined
 * outside this view; presumably they declare prototypes for the assembly
 * implementations -- confirm against their definitions. */
decl_ipred_fns(tm, BPC, mmxext, sse2);

decl_itxfm_func(iwht, iwht, 4, BPC, mmxext);
/* The 4x4 idct/idct declarations are only emitted when BPC is 10. */
#if BPC == 10
decl_itxfm_func(idct, idct, 4, BPC, mmxext);
decl_itxfm_func(idct, idct, 4, BPC, ssse3);
#endif
#endif
/* HAVE_YASM */
av_cold
void
INIT_FUNC
(
VP9DSPContext
*
dsp
,
int
bitexact
)
...
...
@@ -170,6 +174,9 @@ av_cold void INIT_FUNC(VP9DSPContext *dsp, int bitexact)
init_ipred_func
(
tm
,
TM_VP8
,
4
,
BPC
,
mmxext
);
if
(
!
bitexact
)
{
init_itx_func_one
(
4
/* lossless */
,
iwht
,
iwht
,
4
,
BPC
,
mmxext
);
#if BPC == 10
init_itx_func
(
TX_4X4
,
DCT_DCT
,
idct
,
idct
,
4
,
10
,
mmxext
);
#endif
}
}
...
...
@@ -182,6 +189,11 @@ av_cold void INIT_FUNC(VP9DSPContext *dsp, int bitexact)
if
(
EXTERNAL_SSSE3
(
cpu_flags
))
{
init_lpf_funcs
(
BPC
,
ssse3
);
#if BPC == 10
if
(
!
bitexact
)
{
init_itx_func
(
TX_4X4
,
DCT_DCT
,
idct
,
idct
,
4
,
10
,
ssse3
);
}
#endif
}
if
(
EXTERNAL_AVX
(
cpu_flags
))
{
...
...
libavcodec/x86/vp9itxfm.asm
View file @
6b579cf5
...
...
@@ -71,8 +71,6 @@ pw_13377x2: times 8 dw 13377*2
; Word-pair coefficient tables: each pair is repeated 4 times.
pw_m13377_13377: times 4 dw -13377, 13377
pw_13377_0:      times 4 dw 13377, 0
; Four dwords, each holding 8192.
pd_8192:         times 4 dd 8192

; Constants imported through cextern (defined outside this file).
cextern pw_8
cextern pw_16
cextern pw_32
...
...
@@ -80,38 +78,10 @@ cextern pw_512
; Constants imported through cextern (defined outside this file).
cextern pw_1024
cextern pw_2048
cextern pw_m1
; NOTE(review): a local "pd_8192: times 4 dd 8192" definition also appears
; earlier in this view; presumably only one of the two belongs to the final
; file -- confirm which side of the diff each line comes from.
cextern pd_8192

SECTION .text
; (a*x + b*y + round) >> shift, with the shift fixed at 14.
;
; Two packed multiply-add steps that share one source register, each followed
; by adding the rounding term and an arithmetic right shift:
;   m%1 = (pmaddwd(m%2, %4) + %3) >> 14
;   m%2 = (pmaddwd(m%2, %5) + %3) >> 14
;
; Args: %1 = dst1, %2 = dst2/src (overwritten), %3 = round,
;       %4 = coefs1, %5 = coefs2
%macro VP9_MULSUB_2W_2X 5 ; dst1, dst2/src, round, coefs1, coefs2
    ; m%1 must be produced first: the second pmaddwd overwrites the source m%2
    pmaddwd            m%1, m%2, %4
    pmaddwd            m%2, %5
    paddd              m%1, %3
    paddd              m%2, %3
    psrad              m%1, 14
    psrad              m%2, 14
%endmacro
; Applies VP9_MULSUB_2W_2X twice with the same rounding term and the same two
; coefficient tables, then packs the dword results back to words with signed
; saturation.
;
; The coefficient tables are selected by name from the coef1/coef2 arguments:
; [pw_m<coef1>_<coef2>] and [pw_<coef2>_<coef1>].
;
; Args: %1 = dst1, %2 = dst2 (also the source of the second 2X step),
;       %3 = coef1, %4 = coef2, %5 = rnd, %6 = tmp1/src, %7 = tmp2
%macro VP9_MULSUB_2W_4X 7 ; dst1, dst2, coef1, coef2, rnd, tmp1/src, tmp2
    VP9_MULSUB_2W_2X    %7, %6, %5, [pw_m%3_%4], [pw_%4_%3]
    VP9_MULSUB_2W_2X    %1, %2, %5, [pw_m%3_%4], [pw_%4_%3]
    ; dst1 is packed with tmp2, dst2 with tmp1
    packssdw           m%1, m%7
    packssdw           m%2, m%6
%endmacro
; Interleaves the words of two registers and feeds the result to
; VP9_MULSUB_2W_4X.
;
; 7-argument form (in place): the sources are dst2 (m%2) and dst1 (m%1); the
;   high-half interleave goes to tmp1 (m%6) and the low-half interleave
;   overwrites m%2.
; 8/9-argument form: the sources are given separately as src1 (m%3) and
;   src2 (m%4); the high-half interleave goes to tmp1 (m%8) and the low-half
;   interleave to dst2 (m%2).
%macro VP9_UNPACK_MULSUB_2W_4X 7-9 ; dst1, dst2, (src1, src2,) coef1, coef2, rnd, tmp1, tmp2
%if %0 == 7
    punpckhwd          m%6, m%2, m%1
    punpcklwd          m%2, m%1
    VP9_MULSUB_2W_4X    %1, %2, %3, %4, %5, %6, %7
%else
    punpckhwd          m%8, m%4, m%3
    punpcklwd          m%2, m%4, m%3
    VP9_MULSUB_2W_4X    %1, %2, %5, %6, %7, %8, %9
%endif
%endmacro
%macro
VP9_UNPACK_MULSUB_2D_4X
6
; dst1 [src1], dst2 [src2], dst3, dst4, mul1, mul2
punpckhwd
m%4
,
m%2
,
m%1
punpcklwd
m%2
,
m%1
...
...
@@ -191,24 +161,6 @@ cglobal vp9_iwht_iwht_4x4_add, 3, 3, 0, dst, stride, block, eob
; void vp9_idct_idct_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
;-------------------------------------------------------------------------------------------
; Final butterfly stage of the 4-point IDCT. Expects t0..t3 in m2, m0, m1, m3
; respectively (as left by VP9_IDCT4_1D) and uses m4 as the scratch argument
; of SUMSUB_BA. The closing SWAP renumbers the registers so the outputs end up
; in m0..m3 in order.
%macro VP9_IDCT4_1D_FINALIZE 0
    SUMSUB_BA            w, 3, 2, 4     ; m3=t3+t0, m2=-t3+t0
    SUMSUB_BA            w, 1, 0, 4     ; m1=t2+t1, m0=-t2+t1
    SWAP                 0, 3, 2        ; 3102 -> 0123
%endmacro
; One 1-D pass of the 4-point IDCT over m0..m3.
;
; Register expectations set up by the caller:
;   m7 = rounding term passed to the MULSUB helpers
;   m6 = pmulhrsw multiplier for the even part (ssse3 build only)
;   m4, m5 = scratch
%macro VP9_IDCT4_1D 0
%if cpuflag(ssse3)
    ; even part via sum/difference followed by a rounded high multiply
    SUMSUB_BA            w, 2, 0, 4     ; m2=IN(0)+IN(2) m0=IN(0)-IN(2)
    pmulhrsw            m2, m6          ; m2=t0
    pmulhrsw            m0, m6          ; m0=t1
%else ; <= sse2
    ; even part via the generic multiply-add helper (destinations are m0, m2)
    VP9_UNPACK_MULSUB_2W_4X 0, 2, 11585, 11585, m7, 4, 5 ; m0=t1, m2=t0
%endif
    ; odd part
    VP9_UNPACK_MULSUB_2W_4X 1, 3, 15137, 6270, m7, 4, 5  ; m1=t2, m3=t3
    VP9_IDCT4_1D_FINALIZE
%endmacro
; 2x2 top left corner
%macro
VP9_IDCT4_2x2_1D
0
pmulhrsw
m0
,
m5
; m0=t1
...
...
libavcodec/x86/vp9itxfm_16bpp.asm
View file @
6b579cf5
...
...
@@ -25,8 +25,18 @@
SECTION_RODATA

; Constants imported through cextern (defined outside this file).
cextern pw_8
cextern pw_1023
cextern pw_2048
cextern pw_4095
cextern pd_8192

; FIXME these should probably be shared between 8bpp and 10/12bpp
; Word coefficient tables; pairs are repeated 4 times, single values 8 times.
pw_m11585_11585: times 4 dw -11585, 11585
pw_11585_11585:  times 8 dw 11585
pw_m15137_6270:  times 4 dw -15137, 6270
pw_6270_15137:   times 4 dw 6270, 15137
pw_11585x2:      times 8 dw 11585*2

SECTION .text
...
...
@@ -118,3 +128,89 @@ INIT_MMX mmxext
; Instantiate IWHT4_FN (defined outside this view) with arguments 10, 1023 and
; 12, 4095. NOTE(review): the arguments look like bit depth and the matching
; maximum pixel value (2^depth - 1) -- confirm against the macro definition.
IWHT4_FN 10, 1023
INIT_MMX mmxext
IWHT4_FN 12, 4095
; 4x4 coefficients are 5+depth+sign bits, so for 10bpp, everything still fits
; in 15+1 words without additional effort, since the coefficients are 15bpp.
;
; Emits vp9_idct_idct_4x4_add_10(dst, stride, block, eob) for the currently
; selected instruction set. Two paths:
;   - eob <= 1: only the first coefficient is read and its result is used for
;     every output position (dc-only).
;   - eob  > 1: full two-pass 4x4 IDCT (.idctfull).
; In both paths m4 is zeroed and m5 is loaded with pw_1023 before they are
; passed to VP9_STORE_2X.
; NOTE(review): VP9_STORE_2X and ZERO_BLOCK are defined outside this view;
; presumably the former adds the residual to dst and clamps to [m4, m5], and
; the latter clears the coefficient block -- confirm against their definitions.
%macro IDCT4_10_FN 0
cglobal vp9_idct_idct_4x4_add_10, 4, 4, 8, dst, stride, block, eob
    cmp               eobd, 1
    jg .idctfull

    ; dc-only
%if cpuflag(ssse3)
    ; two rounded high multiplies by 11585*2, one per transform pass
    movd                m0, [blockq]
    mova                m5, [pw_11585x2]
    pmulhrsw            m0, m5
    pmulhrsw            m0, m5
%else
    ; scalar version: the eob register is reused under the name "coef"
    DEFINE_ARGS dst, stride, block, coef
    mov              coefd, dword [blockq]
    imul             coefd, 11585
    add              coefd, 8192
    sar              coefd, 14
    imul             coefd, 11585
    ; the final (x + 8) >> 4 is folded into the second rounding shift
    add              coefd, (8 << 14) + 8192
    sar              coefd, 14 + 4
    movd                m0, coefd
%endif
    pshufw              m0, m0, 0           ; broadcast the low word
    pxor                m4, m4
    mova                m5, [pw_1023]
    movh          [blockq], m4              ; clear the consumed coefficient
%if cpuflag(ssse3)
    pmulhrsw            m0, [pw_2048]       ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
%endif
    VP9_STORE_2X         0, 0, 6, 7, 4, 5
    lea               dstq, [dstq+2*strideq]
    VP9_STORE_2X         0, 0, 6, 7, 4, 5
    RET

.idctfull:
    ; load each 16-byte row in two 8-byte halves, packing dwords to words
    ; with signed saturation
    mova                m0, [blockq+0*16+0]
    mova                m1, [blockq+1*16+0]
    packssdw            m0, [blockq+0*16+8]
    packssdw            m1, [blockq+1*16+8]
    mova                m2, [blockq+2*16+0]
    mova                m3, [blockq+3*16+0]
    packssdw            m2, [blockq+2*16+8]
    packssdw            m3, [blockq+3*16+8]

%if cpuflag(ssse3)
    mova                m6, [pw_11585x2]
%endif
    mova                m7, [pd_8192]       ; rounding
    VP9_IDCT4_1D
    TRANSPOSE4x4W        0, 1, 2, 3, 4
    VP9_IDCT4_1D

    pxor                m4, m4
    ZERO_BLOCK      blockq, 16, 4, m4

    ; final (x + 8) >> 4 rounding of the four result registers
%if cpuflag(ssse3)
    mova                m5, [pw_2048]
    pmulhrsw            m0, m5
    pmulhrsw            m1, m5
    pmulhrsw            m2, m5
    pmulhrsw            m3, m5
%else
    mova                m5, [pw_8]
    paddw               m0, m5
    paddw               m1, m5
    paddw               m2, m5
    paddw               m3, m5
    psraw               m0, 4
    psraw               m1, 4
    psraw               m2, 4
    psraw               m3, 4
%endif
    mova                m5, [pw_1023]
    VP9_STORE_2X         0, 1, 6, 7, 4, 5
    lea               dstq, [dstq+2*strideq]
    VP9_STORE_2X         2, 3, 6, 7, 4, 5
    RET
%endmacro
; Emit the 10bpp 4x4 idct/idct function twice: once for mmxext, once for ssse3.
INIT_MMX mmxext
IDCT4_10_FN
INIT_MMX ssse3
IDCT4_10_FN
libavcodec/x86/vp9itxfm_template.asm
View file @
6b579cf5
...
...
@@ -35,3 +35,50 @@
paddw
m3
,
m2
SWAP
3
,
2
,
1
%endmacro
; (a*x + b*y + round) >> shift, with the shift fixed at 14.
;
; Two packed multiply-add steps that share one source register, each followed
; by adding the rounding term and an arithmetic right shift:
;   m%1 = (pmaddwd(m%2, %4) + %3) >> 14
;   m%2 = (pmaddwd(m%2, %5) + %3) >> 14
;
; Args: %1 = dst1, %2 = dst2/src (overwritten), %3 = round,
;       %4 = coefs1, %5 = coefs2
%macro VP9_MULSUB_2W_2X 5 ; dst1, dst2/src, round, coefs1, coefs2
    ; m%1 must be produced first: the second pmaddwd overwrites the source m%2
    pmaddwd            m%1, m%2, %4
    pmaddwd            m%2, %5
    paddd              m%1, %3
    paddd              m%2, %3
    psrad              m%1, 14
    psrad              m%2, 14
%endmacro
; Applies VP9_MULSUB_2W_2X twice with the same rounding term and the same two
; coefficient tables, then packs the dword results back to words with signed
; saturation.
;
; The coefficient tables are selected by name from the coef1/coef2 arguments:
; [pw_m<coef1>_<coef2>] and [pw_<coef2>_<coef1>].
;
; Args: %1 = dst1, %2 = dst2 (also the source of the second 2X step),
;       %3 = coef1, %4 = coef2, %5 = rnd, %6 = tmp1/src, %7 = tmp2
%macro VP9_MULSUB_2W_4X 7 ; dst1, dst2, coef1, coef2, rnd, tmp1/src, tmp2
    VP9_MULSUB_2W_2X    %7, %6, %5, [pw_m%3_%4], [pw_%4_%3]
    VP9_MULSUB_2W_2X    %1, %2, %5, [pw_m%3_%4], [pw_%4_%3]
    ; dst1 is packed with tmp2, dst2 with tmp1
    packssdw           m%1, m%7
    packssdw           m%2, m%6
%endmacro
; Interleaves the words of two registers and feeds the result to
; VP9_MULSUB_2W_4X.
;
; 7-argument form (in place): the sources are dst2 (m%2) and dst1 (m%1); the
;   high-half interleave goes to tmp1 (m%6) and the low-half interleave
;   overwrites m%2.
; 8/9-argument form: the sources are given separately as src1 (m%3) and
;   src2 (m%4); the high-half interleave goes to tmp1 (m%8) and the low-half
;   interleave to dst2 (m%2).
%macro VP9_UNPACK_MULSUB_2W_4X 7-9 ; dst1, dst2, (src1, src2,) coef1, coef2, rnd, tmp1, tmp2
%if %0 == 7
    punpckhwd          m%6, m%2, m%1
    punpcklwd          m%2, m%1
    VP9_MULSUB_2W_4X    %1, %2, %3, %4, %5, %6, %7
%else
    punpckhwd          m%8, m%4, m%3
    punpcklwd          m%2, m%4, m%3
    VP9_MULSUB_2W_4X    %1, %2, %5, %6, %7, %8, %9
%endif
%endmacro
; Final butterfly stage of the 4-point IDCT. Expects t0..t3 in m2, m0, m1, m3
; respectively (as left by VP9_IDCT4_1D) and uses m4 as the scratch argument
; of SUMSUB_BA. The closing SWAP renumbers the registers so the outputs end up
; in m0..m3 in order.
%macro VP9_IDCT4_1D_FINALIZE 0
    SUMSUB_BA            w, 3, 2, 4     ; m3=t3+t0, m2=-t3+t0
    SUMSUB_BA            w, 1, 0, 4     ; m1=t2+t1, m0=-t2+t1
    SWAP                 0, 3, 2        ; 3102 -> 0123
%endmacro
; One 1-D pass of the 4-point IDCT over m0..m3.
;
; Register expectations set up by the caller:
;   m7 = rounding term passed to the MULSUB helpers
;   m6 = pmulhrsw multiplier for the even part (ssse3 build only)
;   m4, m5 = scratch
%macro VP9_IDCT4_1D 0
%if cpuflag(ssse3)
    ; even part via sum/difference followed by a rounded high multiply
    SUMSUB_BA            w, 2, 0, 4     ; m2=IN(0)+IN(2) m0=IN(0)-IN(2)
    pmulhrsw            m2, m6          ; m2=t0
    pmulhrsw            m0, m6          ; m0=t1
%else ; <= sse2
    ; even part via the generic multiply-add helper (destinations are m0, m2)
    VP9_UNPACK_MULSUB_2W_4X 0, 2, 11585, 11585, m7, 4, 5 ; m0=t1, m2=t0
%endif
    ; odd part
    VP9_UNPACK_MULSUB_2W_4X 1, 3, 15137, 6270, m7, 4, 5  ; m1=t2, m3=t3
    VP9_IDCT4_1D_FINALIZE
%endmacro
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment