Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
F
ffmpeg.wasm-core
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Linshizhi
ffmpeg.wasm-core
Commits
0ed21bdc
Commit
0ed21bdc
authored
Feb 05, 2014
by
Clément Bœsch
Committed by
Anton Khirnov
Oct 04, 2016
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
vp9lpf/x86: add ff_vp9_loop_filter_[vh]_44_16_{sse2,ssse3,avx}.
Signed-off-by:
Anton Khirnov
<
anton@khirnov.net
>
parent
f2e3d706
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
99 additions
and
27 deletions
+99
-27
vp9dsp_init.c
libavcodec/x86/vp9dsp_init.c
+5
-0
vp9lpf.asm
libavcodec/x86/vp9lpf.asm
+94
-27
No files found.
libavcodec/x86/vp9dsp_init.c
View file @
0ed21bdc
...
@@ -226,6 +226,9 @@ void ff_vp9_loop_filter_h_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stri
...
@@ -226,6 +226,9 @@ void ff_vp9_loop_filter_h_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stri
lpf_funcs
(
16
,
16
,
sse2
);
lpf_funcs
(
16
,
16
,
sse2
);
lpf_funcs
(
16
,
16
,
ssse3
);
lpf_funcs
(
16
,
16
,
ssse3
);
lpf_funcs
(
16
,
16
,
avx
);
lpf_funcs
(
16
,
16
,
avx
);
lpf_funcs
(
44
,
16
,
sse2
);
lpf_funcs
(
44
,
16
,
ssse3
);
lpf_funcs
(
44
,
16
,
avx
);
lpf_funcs
(
84
,
16
,
sse2
);
lpf_funcs
(
84
,
16
,
sse2
);
lpf_funcs
(
84
,
16
,
ssse3
);
lpf_funcs
(
84
,
16
,
ssse3
);
lpf_funcs
(
84
,
16
,
avx
);
lpf_funcs
(
84
,
16
,
avx
);
...
@@ -279,6 +282,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
...
@@ -279,6 +282,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
if (ARCH_X86_64) { \
if (ARCH_X86_64) { \
dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_##opt; \
dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_##opt; \
dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_##opt; \
dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_##opt; \
dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_##opt; \
dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_##opt; \
dsp->loop_filter_mix2[0][1][0] = ff_vp9_loop_filter_h_48_16_##opt; \
dsp->loop_filter_mix2[0][1][0] = ff_vp9_loop_filter_h_48_16_##opt; \
dsp->loop_filter_mix2[0][1][1] = ff_vp9_loop_filter_v_48_16_##opt; \
dsp->loop_filter_mix2[0][1][1] = ff_vp9_loop_filter_v_48_16_##opt; \
dsp->loop_filter_mix2[1][0][0] = ff_vp9_loop_filter_h_84_16_##opt; \
dsp->loop_filter_mix2[1][0][0] = ff_vp9_loop_filter_h_84_16_##opt; \
...
...
libavcodec/x86/vp9lpf.asm
View file @
0ed21bdc
...
@@ -278,23 +278,23 @@ SECTION .text
...
@@ -278,23 +278,23 @@ SECTION .text
SWAP
%12
,
%15
SWAP
%12
,
%15
%endmacro
%endmacro
%macro
DEFINE_REAL_P7_TO_Q7
0
%macro
DEFINE_REAL_P7_TO_Q7
0
-
1
0
%define
P7
dst1q
+
2
*
mstrideq
%define
P7
dst1q
+
2
*
mstrideq
+
%1
%define
P6
dst1q
+
mstrideq
%define
P6
dst1q
+
mstrideq
+
%1
%define
P5
dst1q
%define
P5
dst1q
+
%1
%define
P4
dst1q
+
strideq
%define
P4
dst1q
+
strideq
+
%1
%define
P3
dstq
+
4
*
mstrideq
%define
P3
dstq
+
4
*
mstrideq
+
%1
%define
P2
dstq
+
mstride3q
%define
P2
dstq
+
mstride3q
+
%1
%define
P1
dstq
+
2
*
mstrideq
%define
P1
dstq
+
2
*
mstrideq
+
%1
%define
P0
dstq
+
mstrideq
%define
P0
dstq
+
mstrideq
+
%1
%define
Q0
dstq
%define
Q0
dstq
+
%1
%define
Q1
dstq
+
strideq
%define
Q1
dstq
+
strideq
+
%1
%define
Q2
dstq
+
2
*
strideq
%define
Q2
dstq
+
2
*
strideq
+
%1
%define
Q3
dstq
+
stride3q
%define
Q3
dstq
+
stride3q
+
%1
%define
Q4
dstq
+
4
*
strideq
%define
Q4
dstq
+
4
*
strideq
+
%1
%define
Q5
dst2q
+
mstrideq
%define
Q5
dst2q
+
mstrideq
+
%1
%define
Q6
dst2q
%define
Q6
dst2q
+
%1
%define
Q7
dst2q
+
strideq
%define
Q7
dst2q
+
strideq
+
%1
%endmacro
%endmacro
; ..............AB -> AAAAAAAABBBBBBBB
; ..............AB -> AAAAAAAABBBBBBBB
...
@@ -450,8 +450,9 @@ SECTION .text
...
@@ -450,8 +450,9 @@ SECTION .text
pand
m3
,
m5
; fm final value
pand
m3
,
m5
; fm final value
; (m3: fm, m8..15: p3 p2 p1 p0 q0 q1 q2 q3)
; (m3: fm, m8..15: p3 p2 p1 p0 q0 q1 q2 q3)
; calc flat8in and hev masks
; calc flat8in
(if not 44_16)
and hev masks
mova
m6
,
[
pb_81
]
; [1 1 1 1 ...] ^ 0x80
mova
m6
,
[
pb_81
]
; [1 1 1 1 ...] ^ 0x80
%if
%2
!
=
44
ABSSUB_CMP
m2
,
m8
,
m11
,
m6
,
m4
,
m5
; abs(p3 - p0) <= 1
ABSSUB_CMP
m2
,
m8
,
m11
,
m6
,
m4
,
m5
; abs(p3 - p0) <= 1
mova
m8
,
[
pb_80
]
mova
m8
,
[
pb_80
]
ABSSUB_CMP
m1
,
m9
,
m11
,
m6
,
m4
,
m5
,
m8
; abs(p2 - p0) <= 1
ABSSUB_CMP
m1
,
m9
,
m11
,
m6
,
m4
,
m5
,
m8
; abs(p2 - p0) <= 1
...
@@ -484,6 +485,19 @@ SECTION .text
...
@@ -484,6 +485,19 @@ SECTION .text
%if
%2
==
84
||
%2
==
48
%if
%2
==
84
||
%2
==
48
pand
m2
,
[
mask_mix
%2
]
pand
m2
,
[
mask_mix
%2
]
%endif
%endif
%else
mova
m6
,
[
pb_80
]
movd
m7
,
Hd
SPLATB_MIX
m7
pxor
m7
,
m6
ABSSUB
m4
,
m10
,
m11
,
m1
; abs(p1 - p0)
pxor
m4
,
m6
pcmpgtb
m0
,
m4
,
m7
; abs(p1 - p0) > H (1/2 hev condition)
ABSSUB
m4
,
m13
,
m12
,
m1
; abs(q1 - q0)
pxor
m4
,
m6
pcmpgtb
m5
,
m4
,
m7
; abs(q1 - q0) > H (2/2 hev condition)
por
m0
,
m5
; hev final value
%endif
%if
%2
==
16
%if
%2
==
16
; (m0: hev, m2: flat8in, m3: fm, m6: pb_81, m9..15: p2 p1 p0 q0 q1 q2 q3)
; (m0: hev, m2: flat8in, m3: fm, m6: pb_81, m9..15: p2 p1 p0 q0 q1 q2 q3)
...
@@ -525,9 +539,11 @@ SECTION .text
...
@@ -525,9 +539,11 @@ SECTION .text
; f2: fm & ~f14 & ~f6 & hev => fm & ~(out & in) & ~(~out & in) & hev => fm & ~in & hev
; f2: fm & ~f14 & ~f6 & hev => fm & ~(out & in) & ~(~out & in) & hev => fm & ~in & hev
; f4: fm & ~f14 & ~f6 & ~f2 => fm & ~(out & in) & ~(~out & in) & ~(~in & hev) => fm & ~in & ~hev
; f4: fm & ~f14 & ~f6 & ~f2 => fm & ~(out & in) & ~(~out & in) & ~(~in & hev) => fm & ~in & ~hev
; (m0: hev, [m1: flat8out],
m2: flat8in
, m3: fm, m8..15: p5 p4 p1 p0 q0 q1 q6 q7)
; (m0: hev, [m1: flat8out],
[m2: flat8in]
, m3: fm, m8..15: p5 p4 p1 p0 q0 q1 q6 q7)
; filter2()
; filter2()
mova
m6
,
[
pb_80
]
%if
%2
!
=
44
mova
m6
,
[
pb_80
]
; already in m6 if 44_16
%endif
pxor
m15
,
m12
,
m6
; q0 ^ 0x80
pxor
m15
,
m12
,
m6
; q0 ^ 0x80
pxor
m14
,
m11
,
m6
; p0 ^ 0x80
pxor
m14
,
m11
,
m6
; p0 ^ 0x80
psubsb
m15
,
m14
; (signed) q0 - p0
psubsb
m15
,
m14
; (signed) q0 - p0
...
@@ -543,12 +559,16 @@ SECTION .text
...
@@ -543,12 +559,16 @@ SECTION .text
SRSHIFT3B_2X
m6
,
m4
,
m14
,
m7
; f1 and f2 sign byte shift by 3
SRSHIFT3B_2X
m6
,
m4
,
m14
,
m7
; f1 and f2 sign byte shift by 3
SIGN_SUB
m7
,
m12
,
m6
,
m5
,
m9
; m7 = q0 - f1
SIGN_SUB
m7
,
m12
,
m6
,
m5
,
m9
; m7 = q0 - f1
SIGN_ADD
m8
,
m11
,
m4
,
m5
,
m9
; m8 = p0 + f2
SIGN_ADD
m8
,
m11
,
m4
,
m5
,
m9
; m8 = p0 + f2
%if
%2
!
=
44
pandn
m6
,
m2
,
m3
; ~mask(in) & mask(fm)
pandn
m6
,
m2
,
m3
; ~mask(in) & mask(fm)
pand
m6
,
m0
; (~mask(in) & mask(fm)) & mask(hev)
pand
m6
,
m0
; (~mask(in) & mask(fm)) & mask(hev)
%else
pand
m6
,
m3
,
m0
%endif
MASK_APPLY
m7
,
m12
,
m6
,
m5
; m7 = filter2(q0) & mask / we write it in filter4()
MASK_APPLY
m7
,
m12
,
m6
,
m5
; m7 = filter2(q0) & mask / we write it in filter4()
MASK_APPLY
m8
,
m11
,
m6
,
m5
; m8 = filter2(p0) & mask / we write it in filter4()
MASK_APPLY
m8
,
m11
,
m6
,
m5
; m8 = filter2(p0) & mask / we write it in filter4()
; (m0: hev, [m1: flat8out],
m2: flat8in
, m3: fm, m7..m8: q0' p0', m10..13: p1 p0 q0 q1, m14: pb_10, m15: q0-p0)
; (m0: hev, [m1: flat8out],
[m2: flat8in]
, m3: fm, m7..m8: q0' p0', m10..13: p1 p0 q0 q1, m14: pb_10, m15: q0-p0)
; filter4()
; filter4()
mova
m4
,
m15
mova
m4
,
m15
paddsb
m15
,
m4
; 2 * (q0 - p0)
paddsb
m15
,
m4
; 2 * (q0 - p0)
...
@@ -556,14 +576,22 @@ SECTION .text
...
@@ -556,14 +576,22 @@ SECTION .text
paddsb
m6
,
m15
,
[
pb_4
]
; m6: f1 = clip(f + 4, 127)
paddsb
m6
,
m15
,
[
pb_4
]
; m6: f1 = clip(f + 4, 127)
paddsb
m15
,
[
pb_3
]
; m15: f2 = clip(f + 3, 127)
paddsb
m15
,
[
pb_3
]
; m15: f2 = clip(f + 3, 127)
SRSHIFT3B_2X
m6
,
m15
,
m14
,
m9
; f1 and f2 sign byte shift by 3
SRSHIFT3B_2X
m6
,
m15
,
m14
,
m9
; f1 and f2 sign byte shift by 3
%if
%2
!
=
44
%define
p0tmp
m7
%define
q0tmp
m9
pandn
m5
,
m2
,
m3
; ~mask(in) & mask(fm)
pandn
m5
,
m2
,
m3
; ~mask(in) & mask(fm)
pandn
m0
,
m5
; ~mask(hev) & (~mask(in) & mask(fm))
pandn
m0
,
m5
; ~mask(hev) & (~mask(in) & mask(fm))
SIGN_SUB
m9
,
m12
,
m6
,
m4
,
m14
; q0 - f1
%else
MASK_APPLY
m9
,
m7
,
m0
,
m5
; m9 = filter4(q0) & mask
%define
p0tmp
m1
mova
[Q0],
m9
%define
q0tmp
m2
SIGN_ADD
m7
,
m11
,
m15
,
m4
,
m14
; p0 + f2
pandn
m0
,
m3
MASK_APPLY
m7
,
m8
,
m0
,
m5
; m7 = filter4(p0) & mask
%endif
mova
[P0],
m7
SIGN_SUB
q0tmp
,
m12
,
m6
,
m4
,
m14
; q0 - f1
MASK_APPLY
q0tmp
,
m7
,
m0
,
m5
; filter4(q0) & mask
mova
[Q0],
q0tmp
SIGN_ADD
p0tmp
,
m11
,
m15
,
m4
,
m14
; p0 + f2
MASK_APPLY
p0tmp
,
m8
,
m0
,
m5
; filter4(p0) & mask
mova
[P0],
p0tmp
paddb
m6
,
[
pb_80
]
;
paddb
m6
,
[
pb_80
]
;
pxor
m8
,
m8
; f=(f1+1)>>1
pxor
m8
,
m8
; f=(f1+1)>>1
pavgb
m6
,
m8
;
pavgb
m6
,
m8
;
...
@@ -577,6 +605,7 @@ SECTION .text
...
@@ -577,6 +605,7 @@ SECTION .text
; ([m1: flat8out], m2: flat8in, m3: fm, m10..13: p1 p0 q0 q1)
; ([m1: flat8out], m2: flat8in, m3: fm, m10..13: p1 p0 q0 q1)
; filter6()
; filter6()
%if
%2
!
=
44
pxor
m0
,
m0
pxor
m0
,
m0
%if
%2
>
16
%if
%2
>
16
pand
m3
,
m2
pand
m3
,
m2
...
@@ -594,6 +623,7 @@ SECTION .text
...
@@ -594,6 +623,7 @@ SECTION .text
FILTER_UPDATE
m6
,
m7
,
m4
,
m5
,
[Q0],
m14
,
m11
,
m12
,
m9
,
3
,
m3
; [q0] -p3 -p0 +q0 +q3
FILTER_UPDATE
m6
,
m7
,
m4
,
m5
,
[Q0],
m14
,
m11
,
m12
,
m9
,
3
,
m3
; [q0] -p3 -p0 +q0 +q3
FILTER_UPDATE
m4
,
m5
,
m6
,
m7
,
[Q1],
m15
,
m12
,
m13
,
m9
,
3
,
m3
; [q1] -p2 -q0 +q1 +q3
FILTER_UPDATE
m4
,
m5
,
m6
,
m7
,
[Q1],
m15
,
m12
,
m13
,
m9
,
3
,
m3
; [q1] -p2 -q0 +q1 +q3
FILTER_UPDATE
m6
,
m7
,
m4
,
m5
,
[Q2],
m10
,
m13
,
m8
,
m9
,
3
,
m3
,
m8
; [q2] -p1 -q1 +q2 +q3
FILTER_UPDATE
m6
,
m7
,
m4
,
m5
,
[Q2],
m10
,
m13
,
m8
,
m9
,
3
,
m3
,
m8
; [q2] -p1 -q1 +q2 +q3
%endif
; (m0: 0, [m1: flat8out], m2: fm & flat8in, m8..15: q2 q3 p1 p0 q0 q1 p3 p2)
; (m0: 0, [m1: flat8out], m2: fm & flat8in, m8..15: q2 q3 p1 p0 q0 q1 p3 p2)
; filter14()
; filter14()
...
@@ -674,6 +704,42 @@ SECTION .text
...
@@ -674,6 +704,42 @@ SECTION .text
movu
[Q5],
m13
movu
[Q5],
m13
movu
[Q6],
m14
movu
[Q6],
m14
movu
[Q7],
m15
movu
[Q7],
m15
%elif
%2
==
44
SWAP
0
,
7
; m0 = p1
SWAP
3
,
4
; m3 = q1
DEFINE_REAL_P7_TO_Q7
2
SBUTTERFLY
bw
,
0
,
1
,
8
SBUTTERFLY
bw
,
2
,
3
,
8
SBUTTERFLY
wd
,
0
,
2
,
8
SBUTTERFLY
wd
,
1
,
3
,
8
SBUTTERFLY
dq
,
0
,
4
,
8
SBUTTERFLY
dq
,
1
,
5
,
8
SBUTTERFLY
dq
,
2
,
6
,
8
SBUTTERFLY
dq
,
3
,
7
,
8
movd
[P7],
m0
punpckhqdq
m0
,
m8
movd
[P6],
m0
movd
[Q0],
m1
punpckhqdq
m1
,
m9
movd
[Q1],
m1
movd
[P3],
m2
punpckhqdq
m2
,
m10
movd
[P2],
m2
movd
[Q4],
m3
punpckhqdq
m3
,
m11
movd
[Q5],
m3
movd
[P5],
m4
punpckhqdq
m4
,
m12
movd
[P4],
m4
movd
[Q2],
m5
punpckhqdq
m5
,
m13
movd
[Q3],
m5
movd
[P1],
m6
punpckhqdq
m6
,
m14
movd
[P0],
m6
movd
[Q6],
m7
punpckhqdq
m7
,
m8
movd
[Q7],
m7
%else
%else
; the following code do a transpose of 8 full lines to 16 half
; the following code do a transpose of 8 full lines to 16 half
; lines (high part). It is inlined to avoid the need of a staging area
; lines (high part). It is inlined to avoid the need of a staging area
...
@@ -743,6 +809,7 @@ LPF_16_VH %1, avx
...
@@ -743,6 +809,7 @@ LPF_16_VH %1, avx
%endmacro
%endmacro
LPF_16_VH_ALL_OPTS
16
LPF_16_VH_ALL_OPTS
16
LPF_16_VH_ALL_OPTS
44
LPF_16_VH_ALL_OPTS
48
LPF_16_VH_ALL_OPTS
48
LPF_16_VH_ALL_OPTS
84
LPF_16_VH_ALL_OPTS
84
LPF_16_VH_ALL_OPTS
88
LPF_16_VH_ALL_OPTS
88
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment