Linshizhi / ffmpeg.wasm-core · Commits · bee330e3

Commit bee330e3 authored Mar 05, 2012 by Ronald S. Bultje

vp8: convert inner loopfilter x86 assembly to use named arguments.

Parent: ffae713a

Showing 1 changed file with 164 additions and 187 deletions:
libavcodec/x86/vp8dsp.asm (+164, -187)
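The change is mechanical: instead of hand-maintained %define register aliases (dst_reg, stride_reg, ...), each cglobal declaration now names its arguments, and the body uses the q-suffixed aliases that x86inc.asm generates; where the macro retargets registers mid-function it uses DEFINE_ARGS rather than redefining rN by hand. A minimal sketch of the idiom, using a hypothetical foo function rather than the real loop filter entry points:

; before: registers aliased by hand
cglobal foo, 2, 2, 8
%define dst_reg    r0
%define stride_reg r1
    movu             m0, [dst_reg+stride_reg]
    RET

; after: cglobal declares the argument names; dstq/strideq are provided automatically
cglobal foo, 2, 2, 8, dst, stride
    movu             m0, [dstq+strideq]
    RET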
@@ -1654,174 +1654,151 @@ SIMPLE_LOOPFILTER h, 5
 ;                                            int flimE, int flimI, int hev_thr);
 ;-----------------------------------------------------------------------------

-%macro INNER_LOOPFILTER 3
-%if %3 == 8 ; chroma
-cglobal vp8_%1_loop_filter8uv_inner, 6, %2, 13
-%define dst8_reg    r1
-%define mstride_reg r2
-%define E_reg       r3
-%define I_reg       r4
-%define hev_thr_reg r5
+%macro INNER_LOOPFILTER 2
+%if %2 == 8 ; chroma
+cglobal vp8_%1_loop_filter8uv_inner, 6, 6, 13, dst, dst8, stride, flimE, flimI, hevthr
 %else ; luma
-cglobal vp8_%1_loop_filter16y_inner, 5, %2, 13
-%define mstride_reg r1
-%define E_reg       r2
-%define I_reg       r3
-%define hev_thr_reg r4
-%ifdef m8 ; x86-64, sse2
-%define dst8_reg    r4
-%elif mmsize == 16 ; x86-32, sse2
-%define dst8_reg    r5
-%else ; x86-32, mmx/mmxext
-%define cnt_reg     r5
-%endif
-%endif
-%define dst_reg     r0
-%define stride_reg  E_reg
-%define dst2_reg    I_reg
-%ifndef m8
-%define stack_reg   hev_thr_reg
+cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, dst, stride, flimE, flimI, hevthr
 %endif

 %if cpuflag(ssse3)
     pxor             m7, m7
 %endif

-%ifndef m8 ; mmx/mmxext or sse2 on x86-32
-    ; splat function arguments
-    SPLATB_REG       m0, E_reg, m7       ; E
-    SPLATB_REG       m1, I_reg, m7       ; I
-    SPLATB_REG       m2, hev_thr_reg, m7 ; hev_thresh
-
-    ; align stack
-    mov       stack_reg, rsp             ; backup stack pointer
-    and             rsp, ~(mmsize-1)     ; align stack
-%ifidn %1, v
-    sub             rsp, mmsize * 4      ; stack layout: [0]=E, [1]=I, [2]=hev_thr
-                                         ;               [3]=hev() result
-%else ; h
-    sub             rsp, mmsize * 5      ; extra storage space for transposes
+%ifndef m8   ; stack layout: [0]=E, [1]=I, [2]=hev_thr
+%ifidn %1, v ;               [3]=hev() result
+%assign pad 16 + mmsize * 4 - gprsize - (stack_offset & 15)
+%else ; h    ; extra storage space for transposes
+%assign pad 16 + mmsize * 5 - gprsize - (stack_offset & 15)
 %endif

-%define flim_E   [rsp]
-%define flim_I   [rsp+mmsize]
-%define hev_thr  [rsp+mmsize*2]
-%define mask_res [rsp+mmsize*3]
-%define p0backup [rsp+mmsize*3]
-%define q0backup [rsp+mmsize*4]
+    ; splat function arguments
+    SPLATB_REG       m0, flimEq, m7     ; E
+    SPLATB_REG       m1, flimIq, m7     ; I
+    SPLATB_REG       m2, hevthrq, m7    ; hev_thresh
+
+    SUB             rsp, pad
+
+%define m_flimE    [rsp]
+%define m_flimI    [rsp+mmsize]
+%define m_hevthr   [rsp+mmsize*2]
+%define m_maskres  [rsp+mmsize*3]
+%define m_p0backup [rsp+mmsize*3]
+%define m_q0backup [rsp+mmsize*4]

-    mova         flim_E, m0
-    mova         flim_I, m1
-    mova        hev_thr, m2
-%else ; sse2 on x86-64
-%define flim_E   m9
-%define flim_I   m10
-%define hev_thr  m11
-%define mask_res m12
-%define p0backup m12
-%define q0backup m8
+    mova        m_flimE, m0
+    mova        m_flimI, m1
+    mova       m_hevthr, m2
+%else
+%define m_flimE    m9
+%define m_flimI    m10
+%define m_hevthr   m11
+%define m_maskres  m12
+%define m_p0backup m12
+%define m_q0backup m8

     ; splat function arguments
-    SPLATB_REG   flim_E, E_reg, m7       ; E
-    SPLATB_REG   flim_I, I_reg, m7       ; I
-    SPLATB_REG  hev_thr, hev_thr_reg, m7 ; hev_thresh
+    SPLATB_REG  m_flimE, flimEq, m7     ; E
+    SPLATB_REG  m_flimI, flimIq, m7     ; I
+    SPLATB_REG m_hevthr, hevthrq, m7    ; hev_thresh
 %endif

-%if mmsize == 8 && %3 == 16 ; mmx/mmxext
-    mov         cnt_reg, 2
+%if %2 == 8 ; chroma
+    DEFINE_ARGS dst1, dst8, mstride, stride, dst2
+%elif mmsize == 8
+    DEFINE_ARGS dst1, mstride, stride, dst2, cntr
+    mov           cntrq, 2
+%else
+    DEFINE_ARGS dst1, mstride, stride, dst2, dst8
 %endif
-    mov      stride_reg, mstride_reg
-    neg     mstride_reg
+    mov         strideq, mstrideq
+    neg        mstrideq
 %ifidn %1, h
-    lea         dst_reg, [dst_reg + stride_reg*4-4]
-%if %3 == 8
-    lea        dst8_reg, [dst8_reg+ stride_reg*4-4]
+    lea           dst1q, [dst1q+ strideq*4-4]
+%if %2 == 8 ; chroma
+    lea           dst8q, [dst8q+ strideq*4-4]
 %endif
 %endif

 %if mmsize == 8
-.next8px
+.next8px:
 %endif
     ; read
-    lea        dst2_reg, [dst_reg + stride_reg]
+    lea           dst2q, [dst1q+ strideq]
 %ifidn %1, v
-%if %3 == 8 && mmsize == 16
+%if %2 == 8 && mmsize == 16
 %define movrow movh
 %else
 %define movrow mova
 %endif
-    movrow           m0, [dst_reg +mstride_reg*4] ; p3
-    movrow           m1, [dst2_reg+mstride_reg*4] ; p2
-    movrow           m2, [dst_reg +mstride_reg*2] ; p1
-    movrow           m5, [dst2_reg]               ; q1
-    movrow           m6, [dst2_reg+ stride_reg]   ; q2
-    movrow           m7, [dst2_reg+ stride_reg*2] ; q3
-%if mmsize == 16 && %3 == 8
-    movhps           m0, [dst8_reg+mstride_reg*4]
-    movhps           m2, [dst8_reg+mstride_reg*2]
-    add        dst8_reg, stride_reg
-    movhps           m1, [dst8_reg+mstride_reg*4]
-    movhps           m5, [dst8_reg]
-    movhps           m6, [dst8_reg+ stride_reg]
-    movhps           m7, [dst8_reg+ stride_reg*2]
-    add        dst8_reg, mstride_reg
+    movrow           m0, [dst1q+mstrideq*4] ; p3
+    movrow           m1, [dst2q+mstrideq*4] ; p2
+    movrow           m2, [dst1q+mstrideq*2] ; p1
+    movrow           m5, [dst2q]            ; q1
+    movrow           m6, [dst2q+ strideq*1] ; q2
+    movrow           m7, [dst2q+ strideq*2] ; q3
+%if mmsize == 16 && %2 == 8
+    movhps           m0, [dst8q+mstrideq*4]
+    movhps           m2, [dst8q+mstrideq*2]
+    add           dst8q, strideq
+    movhps           m1, [dst8q+mstrideq*4]
+    movhps           m5, [dst8q]
+    movhps           m6, [dst8q+ strideq]
+    movhps           m7, [dst8q+ strideq*2]
+    add           dst8q, mstrideq
 %endif
 %elif mmsize == 8 ; mmx/mmxext (h)
     ; read 8 rows of 8px each
-    movu             m0, [dst_reg +mstride_reg*4]
-    movu             m1, [dst2_reg+mstride_reg*4]
-    movu             m2, [dst_reg +mstride_reg*2]
-    movu             m3, [dst_reg +mstride_reg]
-    movu             m4, [dst_reg]
-    movu             m5, [dst2_reg]
-    movu             m6, [dst2_reg+ stride_reg]
+    movu             m0, [dst1q+mstrideq*4]
+    movu             m1, [dst2q+mstrideq*4]
+    movu             m2, [dst1q+mstrideq*2]
+    movu             m3, [dst1q+mstrideq]
+    movu             m4, [dst1q]
+    movu             m5, [dst2q]
+    movu             m6, [dst2q+ strideq]

     ; 8x8 transpose
     TRANSPOSE4x4B     0, 1, 2, 3, 7
-    mova       q0backup, m1
-    movu             m7, [dst2_reg+ stride_reg*2]
+    mova     m_q0backup, m1
+    movu             m7, [dst2q+ strideq*2]
     TRANSPOSE4x4B     4, 5, 6, 7, 1
     SBUTTERFLY       dq, 0, 4, 1     ; p3/p2
     SBUTTERFLY       dq, 2, 6, 1     ; q0/q1
     SBUTTERFLY       dq, 3, 7, 1     ; q2/q3
-    mova             m1, q0backup
-    mova       q0backup, m2          ; store q0
+    mova             m1, m_q0backup
+    mova     m_q0backup, m2          ; store q0
     SBUTTERFLY       dq, 1, 5, 2     ; p1/p0
-    mova       p0backup, m5          ; store p0
+    mova     m_p0backup, m5          ; store p0
     SWAP              1, 4
     SWAP              2, 4
     SWAP              6, 3
     SWAP              5, 3
 %else ; sse2 (h)
-%if %3 == 16
-    lea        dst8_reg, [dst_reg + stride_reg*8]
+%if %2 == 16
+    lea           dst8q, [dst1q+ strideq*8]
 %endif

     ; read 16 rows of 8px each, interleave
-    movh             m0, [dst_reg +mstride_reg*4]
-    movh             m1, [dst8_reg+mstride_reg*4]
-    movh             m2, [dst_reg +mstride_reg*2]
-    movh             m5, [dst8_reg+mstride_reg*2]
-    movh             m3, [dst_reg +mstride_reg]
-    movh             m6, [dst8_reg+mstride_reg]
-    movh             m4, [dst_reg]
-    movh             m7, [dst8_reg]
+    movh             m0, [dst1q+mstrideq*4]
+    movh             m1, [dst8q+mstrideq*4]
+    movh             m2, [dst1q+mstrideq*2]
+    movh             m5, [dst8q+mstrideq*2]
+    movh             m3, [dst1q+mstrideq]
+    movh             m6, [dst8q+mstrideq]
+    movh             m4, [dst1q]
+    movh             m7, [dst8q]
     punpcklbw        m0, m1          ; A/I
     punpcklbw        m2, m5          ; C/K
     punpcklbw        m3, m6          ; D/L
     punpcklbw        m4, m7          ; E/M

-    add        dst8_reg, stride_reg
-    movh             m1, [dst2_reg+mstride_reg*4]
-    movh             m6, [dst8_reg+mstride_reg*4]
-    movh             m5, [dst2_reg]
-    movh             m7, [dst8_reg]
+    add           dst8q, strideq
+    movh             m1, [dst2q+mstrideq*4]
+    movh             m6, [dst8q+mstrideq*4]
+    movh             m5, [dst2q]
+    movh             m7, [dst8q]
     punpcklbw        m1, m6          ; B/J
     punpcklbw        m5, m7          ; F/N
-    movh             m6, [dst2_reg+ stride_reg]
-    movh             m7, [dst8_reg+ stride_reg]
+    movh             m6, [dst2q+ strideq]
+    movh             m7, [dst8q+ strideq]
     punpcklbw        m6, m7          ; G/O

     ; 8x16 transpose
@@ -1829,10 +1806,10 @@ cglobal vp8_%1_loop_filter16y_inner, 5, %2, 13
 %ifdef m8
     SWAP              1, 8
 %else
-    mova       q0backup, m1
+    mova     m_q0backup, m1
 %endif
-    movh             m7, [dst2_reg+ stride_reg*2]
-    movh             m1, [dst8_reg+ stride_reg*2]
+    movh             m7, [dst2q+ strideq*2]
+    movh             m1, [dst8q+ strideq*2]
     punpcklbw        m7, m1          ; H/P
     TRANSPOSE4x4B     4, 5, 6, 7, 1
     SBUTTERFLY       dq, 0, 4, 1     ; p3/p2
@@ -1842,14 +1819,14 @@ cglobal vp8_%1_loop_filter16y_inner, 5, %2, 13
     SWAP              1, 8
     SWAP              2, 8
 %else
-    mova             m1, q0backup
-    mova       q0backup, m2          ; store q0
+    mova             m1, m_q0backup
+    mova     m_q0backup, m2          ; store q0
 %endif
     SBUTTERFLY       dq, 1, 5, 2     ; p1/p0
 %ifdef m12
     SWAP              5, 12
 %else
-    mova       p0backup, m5          ; store p0
+    mova     m_p0backup, m5          ; store p0
 %endif
     SWAP              1, 4
     SWAP              2, 4
@@ -1883,7 +1860,7 @@ cglobal vp8_%1_loop_filter16y_inner, 5, %2, 13
     por              m6, m4          ; abs(q2-q1)
 %if notcpuflag(mmx2)
-    mova             m4, flim_I
+    mova             m4, m_flimI
     pxor             m3, m3
     psubusb          m0, m4
     psubusb          m1, m4
@@ -1905,14 +1882,14 @@ cglobal vp8_%1_loop_filter16y_inner, 5, %2, 13
     ; normal_limit and high_edge_variance for p1-p0, q1-q0
     SWAP              7, 3           ; now m7 is zero
 %ifidn %1, v
-    movrow           m3, [dst_reg +mstride_reg] ; p0
-%if mmsize == 16 && %3 == 8
-    movhps           m3, [dst8_reg+mstride_reg]
+    movrow           m3, [dst1q+mstrideq] ; p0
+%if mmsize == 16 && %2 == 8
+    movhps           m3, [dst8q+mstrideq]
 %endif
 %elifdef m12
     SWAP              3, 12
 %else
-    mova             m3, p0backup
+    mova             m3, m_p0backup
 %endif
     mova             m1, m2
@@ -1925,11 +1902,11 @@ cglobal vp8_%1_loop_filter16y_inner, 5, %2, 13
 %if notcpuflag(mmx2)
     mova             m6, m1
     psubusb          m1, m4
-    psubusb          m6, hev_thr
+    psubusb          m6, m_hevthr
     pcmpeqb          m1, m7          ; abs(p1-p0) <= I
     pcmpeqb          m6, m7          ; abs(p1-p0) <= hev_thresh
     pand             m0, m1
-    mova       mask_res, m6
+    mova      m_maskres, m6
 %else ; mmxext/sse2
     pmaxub           m0, m1          ; max_I
     SWAP              1, 4           ; max_hev_thresh
@@ -1937,14 +1914,14 @@ cglobal vp8_%1_loop_filter16y_inner, 5, %2, 13
     SWAP              6, 4           ; now m6 is I
 %ifidn %1, v
-    movrow           m4, [dst_reg]   ; q0
-%if mmsize == 16 && %3 == 8
-    movhps           m4, [dst8_reg]
+    movrow           m4, [dst1q]     ; q0
+%if mmsize == 16 && %2 == 8
+    movhps           m4, [dst8q]
 %endif
 %elifdef m8
     SWAP              4, 8
 %else
-    mova             m4, q0backup
+    mova             m4, m_q0backup
 %endif
     mova             m1, m4
     SWAP              1, 4
@@ -1956,26 +1933,26 @@ cglobal vp8_%1_loop_filter16y_inner, 5, %2, 13
 %if notcpuflag(mmx2)
     mova             m7, m1
     psubusb          m1, m6
-    psubusb          m7, hev_thr
+    psubusb          m7, m_hevthr
     pxor             m6, m6
     pcmpeqb          m1, m6          ; abs(q1-q0) <= I
     pcmpeqb          m7, m6          ; abs(q1-q0) <= hev_thresh
-    mova             m6, mask_res
+    mova             m6, m_maskres
     pand             m0, m1          ; abs([pq][321]-[pq][210]) <= I
     pand             m6, m7
 %else ; mmxext/sse2
     pxor             m7, m7
     pmaxub           m0, m1
     pmaxub           m6, m1
-    psubusb          m0, flim_I
-    psubusb          m6, hev_thr
+    psubusb          m0, m_flimI
+    psubusb          m6, m_hevthr
     pcmpeqb          m0, m7          ; max(abs(..)) <= I
     pcmpeqb          m6, m7          ; !(max(abs..) > thresh)
 %endif
 %ifdef m12
     SWAP              6, 12
 %else
-    mova       mask_res, m6          ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
+    mova      m_maskres, m6          ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
 %endif

     ; simple_limit
@@ -1999,28 +1976,28 @@ cglobal vp8_%1_loop_filter16y_inner, 5, %2, 13
     pand             m7, [pb_FE]
     psrlq            m7, 1           ; abs(q1-p1)/2
     paddusb          m7, m1          ; abs(q0-p0)*2+abs(q1-p1)/2
-    psubusb          m7, flim_E
+    psubusb          m7, m_flimE
     pcmpeqb          m7, m6          ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
     pand             m0, m7          ; normal_limit result

     ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
 %ifdef m8 ; x86-64 && sse2
     mova             m8, [pb_80]
-%define pb_80_var m8
+%define m_pb_80 m8
 %else ; x86-32 or mmx/mmxext
-%define pb_80_var [pb_80]
+%define m_pb_80 [pb_80]
 %endif
     mova             m1, m4
     mova             m7, m3
-    pxor             m1, pb_80_var
-    pxor             m7, pb_80_var
+    pxor             m1, m_pb_80
+    pxor             m7, m_pb_80
     psubsb           m1, m7          ; (signed) q0-p0
     mova             m6, m2
     mova             m7, m5
-    pxor             m6, pb_80_var
-    pxor             m7, pb_80_var
+    pxor             m6, m_pb_80
+    pxor             m7, m_pb_80
     psubsb           m6, m7          ; (signed) p1-q1
-    mova             m7, mask_res
+    mova             m7, m_maskres
     pandn            m7, m6
     paddsb           m7, m1
     paddsb           m7, m1
@@ -2059,7 +2036,7 @@ cglobal vp8_%1_loop_filter16y_inner, 5, %2, 13
 %ifdef m12
     SWAP              6, 12
 %else
-    mova             m6, mask_res
+    mova             m6, m_maskres
 %endif
 %if notcpuflag(mmx2)
     mova             m7, [pb_1]
@@ -2087,81 +2064,81 @@ cglobal vp8_%1_loop_filter16y_inner, 5, %2, 13
     ; store
 %ifidn %1, v
-    movrow [dst_reg +mstride_reg*2], m2
-    movrow [dst_reg +mstride_reg  ], m3
-    movrow [dst_reg               ], m4
-    movrow [dst_reg + stride_reg  ], m5
-%if mmsize == 16 && %3 == 8
-    movhps [dst8_reg+mstride_reg*2], m2
-    movhps [dst8_reg+mstride_reg  ], m3
-    movhps [dst8_reg              ], m4
-    movhps [dst8_reg+ stride_reg  ], m5
+    movrow [dst1q+mstrideq*2], m2
+    movrow [dst1q+mstrideq  ], m3
+    movrow [dst1q           ], m4
+    movrow [dst1q+ strideq  ], m5
+%if mmsize == 16 && %2 == 8
+    movhps [dst8q+mstrideq*2], m2
+    movhps [dst8q+mstrideq  ], m3
+    movhps [dst8q           ], m4
+    movhps [dst8q+ strideq  ], m5
 %endif
 %else ; h
-    add         dst_reg, 2
-    add        dst2_reg, 2
+    add           dst1q, 2
+    add           dst2q, 2

     ; 4x8/16 transpose
     TRANSPOSE4x4B     2, 3, 4, 5, 6
 %if mmsize == 8 ; mmx/mmxext (h)
-    WRITE_4x2D        2, 3, 4, 5, dst_reg, dst2_reg, mstride_reg, stride_reg
+    WRITE_4x2D        2, 3, 4, 5, dst1q, dst2q, mstrideq, strideq
 %else ; sse2 (h)
-    lea        dst8_reg, [dst8_reg+mstride_reg+2]
-    WRITE_4x4D        2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %3
+    lea           dst8q, [dst8q+mstrideq+2]
+    WRITE_4x4D        2, 3, 4, 5, dst1q, dst2q, dst8q, mstrideq, strideq, %2
 %endif
 %endif

 %if mmsize == 8
-%if %3 == 8 ; chroma
+%if %2 == 8 ; chroma
 %ifidn %1, h
-    sub         dst_reg, 2
+    sub           dst1q, 2
 %endif
-    cmp         dst_reg, dst8_reg
-    mov         dst_reg, dst8_reg
+    cmp           dst1q, dst8q
+    mov           dst1q, dst8q
     jnz .next8px
 %else
 %ifidn %1, h
-    lea         dst_reg, [dst_reg + stride_reg*8-2]
+    lea           dst1q, [dst1q+ strideq*8-2]
 %else ; v
-    add         dst_reg, 8
+    add           dst1q, 8
 %endif
-    dec         cnt_reg
+    dec           cntrq
     jg .next8px
 %endif
 %endif

 %ifndef m8 ; sse2 on x86-32 or mmx/mmxext
-    mov             rsp, stack_reg   ; restore stack pointer
+    ADD             rsp, pad
 %endif
     RET
 %endmacro

 %if ARCH_X86_32
 INIT_MMX mmx
-INNER_LOOPFILTER v, 6, 16
-INNER_LOOPFILTER h, 6, 16
-INNER_LOOPFILTER v, 6, 8
-INNER_LOOPFILTER h, 6, 8
+INNER_LOOPFILTER v, 16
+INNER_LOOPFILTER h, 16
+INNER_LOOPFILTER v, 8
+INNER_LOOPFILTER h, 8

 INIT_MMX mmx2
-INNER_LOOPFILTER v, 6, 16
-INNER_LOOPFILTER h, 6, 16
-INNER_LOOPFILTER v, 6, 8
-INNER_LOOPFILTER h, 6, 8
+INNER_LOOPFILTER v, 16
+INNER_LOOPFILTER h, 16
+INNER_LOOPFILTER v, 8
+INNER_LOOPFILTER h, 8
 %endif

 INIT_XMM sse2
-INNER_LOOPFILTER v, 5, 16
-INNER_LOOPFILTER h, 5 + ARCH_X86_32, 16
-INNER_LOOPFILTER v, 6, 8
-INNER_LOOPFILTER h, 6, 8
+INNER_LOOPFILTER v, 16
+INNER_LOOPFILTER h, 16
+INNER_LOOPFILTER v, 8
+INNER_LOOPFILTER h, 8

 INIT_XMM ssse3
-INNER_LOOPFILTER v, 5, 16
-INNER_LOOPFILTER h, 5 + ARCH_X86_32, 16
-INNER_LOOPFILTER v, 6, 8
-INNER_LOOPFILTER h, 6, 8
+INNER_LOOPFILTER v, 16
+INNER_LOOPFILTER h, 16
+INNER_LOOPFILTER v, 8
+INNER_LOOPFILTER h, 8

 ;-----------------------------------------------------------------------------
 ; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride,