ffmpeg.wasm-core / Commits

Commit 28170f1a authored Mar 03, 2012 by Ronald S. Bultje

    vp8: convert loopfilter x86 assembly to use cpuflags().

parent e25be471

Showing 1 changed file with 158 additions and 201 deletions:

libavcodec/x86/vp8dsp.asm  (+158, -201)
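The patch collapses the per-ISA macro copies (WRITE_8W_SSE2/WRITE_8W_SSE4, SPLATB_REG_MMX/_MMXEXT/_SSE2/_SSSE3) into single macros that branch on cpuflag(), and moves the instruction-set name out of the macro parameters and into INIT_MMX/INIT_XMM. A minimal sketch of that pattern, assuming the x264asm layer (x86inc.asm) is on the include path; splat_byte, SPLAT_BYTE_FN and the dst/val argument names are illustrative, not from the patch:

    %include "x86inc.asm"   ; x264asm layer: INIT_*, cglobal, cpuflag(), RET
                            ; (include path depends on the project layout)

    ; One body, assembled once per instruction set: cpuflag() picks the best
    ; byte-splat sequence at assembly time, and cglobal derives the
    ; _mmx2/_sse2/_ssse3 suffix from the active INIT_* mode.
    %macro SPLAT_BYTE_FN 0
    cglobal splat_byte, 2, 2, 2, dst, val  ; dst assumed mmsize-aligned
        movd          m0, vald
    %if cpuflag(ssse3)
        pxor          m1, m1               ; all-zero pshufb mask: broadcast byte 0
        pshufb        m0, m1
    %elif cpuflag(sse2)
        punpcklbw     m0, m0
        pshuflw       m0, m0, 0x0
        punpcklqdq    m0, m0
    %else ; mmx2
        punpcklbw     m0, m0
        pshufw        m0, m0, 0x0
    %endif
        mova      [dstq], m0
        RET
    %endmacro

    INIT_MMX mmx2
    SPLAT_BYTE_FN          ; emits splat_byte_mmx2
    INIT_XMM sse2
    SPLAT_BYTE_FN          ; emits splat_byte_sse2
    INIT_XMM ssse3
    SPLAT_BYTE_FN          ; emits splat_byte_ssse3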
@@ -1422,7 +1422,17 @@ VP8_DC_WHT
     add            %4, %5
 %endmacro

-%macro WRITE_8W_SSE2 5
+%macro WRITE_8W 5
+%if cpuflag(sse4)
+    pextrw    [%3+%4*4], %1, 0
+    pextrw    [%2+%4*4], %1, 1
+    pextrw    [%3+%4*2], %1, 2
+    pextrw    [%3+%4  ], %1, 3
+    pextrw    [%3     ], %1, 4
+    pextrw    [%2     ], %1, 5
+    pextrw    [%2+%5  ], %1, 6
+    pextrw    [%2+%5*2], %1, 7
+%else
     movd          %2d, %1
     psrldq         %1, 4
     mov     [%3+%4*4], %2w
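One detail worth keeping in mind when reading the new %if cpuflag(sse4) branch above: pextrw with a memory destination requires SSE4.1, so the %else branch keeps the old SSE2 path that bounces each word through a general-purpose register. A standalone sketch of the two store paths under cpuflag(); store2w and its dst/stride/src names are hypothetical, not from the patch:

    %macro STORE2W_FN 0
    cglobal store2w, 3, 4, 1, dst, stride, src  ; src assumed 16-byte aligned
        mova          m0, [srcq]
    %if cpuflag(sse4)
        pextrw     [dstq        ], m0, 0   ; SSE4.1: pextrw may target memory
        pextrw     [dstq+strideq], m0, 1
    %else
        movd         r3d, m0               ; pre-SSE4.1: extract via a GPR
        mov       [dstq], r3w
        shr          r3d, 16
        mov [dstq+strideq], r3w
    %endif
        RET
    %endmacro

    INIT_XMM sse2
    STORE2W_FN             ; emits store2w_sse2 (GPR fallback)
    INIT_XMM sse4
    STORE2W_FN             ; emits store2w_sse4 (pextrw-to-memory)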
@@ -1448,67 +1458,51 @@ VP8_DC_WHT
     mov     [%3+%5  ], %2w
     shr            %2, 16
     mov     [%3+%5*2], %2w
+%endif
 %endmacro

-%macro WRITE_8W_SSE4 5
-    pextrw    [%3+%4*4], %1, 0
-    pextrw    [%2+%4*4], %1, 1
-    pextrw    [%3+%4*2], %1, 2
-    pextrw    [%3+%4  ], %1, 3
-    pextrw    [%3     ], %1, 4
-    pextrw    [%2     ], %1, 5
-    pextrw    [%2+%5  ], %1, 6
-    pextrw    [%2+%5*2], %1, 7
-%endmacro
-
-%macro SPLATB_REG_MMX 2-3
+%macro SPLATB_REG 2-3
+%if cpuflag(ssse3)
+    movd           %1, %2d
+    pshufb         %1, %3
+%elif cpuflag(sse2)
+    movd           %1, %2d
+    punpcklbw      %1, %1
+    pshuflw        %1, %1, 0x0
+    punpcklqdq     %1, %1
+%elif cpuflag(mmx2)
     movd           %1, %2d
     punpcklbw      %1, %1
-    punpcklwd      %1, %1
-    punpckldq      %1, %1
-%endmacro
-
-%macro SPLATB_REG_MMXEXT 2-3
-    movd           %1, %2d
-    punpcklbw      %1, %1
     pshufw         %1, %1, 0x0
-%endmacro
-
-%macro SPLATB_REG_SSE2 2-3
+%else
     movd           %1, %2d
     punpcklbw      %1, %1
-    pshuflw        %1, %1, 0x0
-    punpcklqdq     %1, %1
-%endmacro
-
-%macro SPLATB_REG_SSSE3 3
-    movd           %1, %2d
-    pshufb         %1, %3
+    punpcklwd      %1, %1
+    punpckldq      %1, %1
+%endif
 %endmacro

-%macro SIMPLE_LOOPFILTER 4
-cglobal vp8_%2_loop_filter_simple_%1, 3, %3, %4
+%macro SIMPLE_LOOPFILTER 2
+cglobal vp8_%1_loop_filter_simple, 3, %2, 8
 %if mmsize == 8 ; mmx/mmxext
     mov            r3, 2
 %endif
-%ifnidn %1, sse2
-%if mmsize == 16
+%if cpuflag(ssse3)
     pxor           m0, m0
 %endif
-%endif
     SPLATB_REG     m7, r2, m0       ; splat "flim" into register

     ; set up indexes to address 4 rows
     mov            r2, r1
     neg            r1
-%ifidn %2, h
+%ifidn %1, h
     lea            r0, [r0+4*r2-2]
 %endif

 %if mmsize == 8 ; mmx / mmxext
 .next8px
 %endif
-%ifidn %2, v
+%ifidn %1, v
     ; read 4 half/full rows of pixels
     mova           m0, [r0+r1*2]    ; p1
     mova           m1, [r0+r1]      ; p0
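The single %if/%elif chain above works because x86inc's cpuflags are cumulative, with each flag implying the ones below it (my summary of the assumed mechanism, not text from the patch):

    ; Assumed cpuflags ordering (higher implies lower):
    ;   mmx < mmx2 < sse < sse2 < ssse3 < sse4
    ; Under INIT_XMM ssse3, cpuflag(sse2) is also true, so the first
    ; matching branch wins:
    ;   %if cpuflag(ssse3)   -> pshufb splat            (ssse3 and sse4 builds)
    ;   %elif cpuflag(sse2)  -> pshuflw/punpcklqdq      (plain sse2 builds)
    ;   %elif cpuflag(mmx2)  -> pshufw                  (mmx2 builds)
    ;   %else                -> punpcklbw/wd/dq ladder  (plain mmx builds)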
@@ -1589,7 +1583,7 @@ cglobal vp8_%2_loop_filter_simple_%1, 3, %3, %4
     psubusb        m6, m3           ; p0+f2

     ; store
-%ifidn %2, v
+%ifidn %1, v
     mova         [r0], m4
     mova      [r0+r1], m6
 %else ; h
@@ -1597,12 +1591,12 @@ cglobal vp8_%2_loop_filter_simple_%1, 3, %3, %4
     SBUTTERFLY     bw, 6, 4, 0

 %if mmsize == 16 ; sse2
-%ifidn %1, sse4
+%if cpuflag(sse4)
     inc            r4
 %endif
     WRITE_8W       m6, r4, r0, r1, r2
     lea            r4, [r3+r1+1]
-%ifidn %1, sse4
+%if cpuflag(sse4)
     inc            r3
 %endif
     WRITE_8W       m4, r3, r4, r1, r2
@@ -1613,7 +1607,7 @@ cglobal vp8_%2_loop_filter_simple_%1, 3, %3, %4
 %if mmsize == 8 ; mmx/mmxext
     ; next 8 pixels
-%ifidn %2, v
+%ifidn %1, v
     add            r0, 8            ; advance 8 cols = pixels
 %else ; h
     lea            r0, [r0+r2*8-1]  ; advance 8 rows = lines
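A note on the loop structure these v/h conditionals sit in (mine, not from the patch):

    ; mmsize == 8 means the MMX flavours filter 8 pixels per pass, so the
    ; simple filter loads a count of 2 (mov r3, 2 earlier) and loops back to
    ; .next8px once, advancing 8 columns (v) or 8 rows (h) between passes;
    ; the XMM flavours cover all 16 pixels of the macroblock edge in one pass.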
@@ -1627,41 +1621,38 @@ cglobal vp8_%2_loop_filter_simple_%1, 3, %3, %4
 %endmacro

 %if ARCH_X86_32
-INIT_MMX
-%define SPLATB_REG SPLATB_REG_MMX
-SIMPLE_LOOPFILTER mmx,    v, 4, 0
-SIMPLE_LOOPFILTER mmx,    h, 5, 0
-%define SPLATB_REG SPLATB_REG_MMXEXT
-SIMPLE_LOOPFILTER mmxext, v, 4, 0
-SIMPLE_LOOPFILTER mmxext, h, 5, 0
+INIT_MMX mmx
+SIMPLE_LOOPFILTER v, 4
+SIMPLE_LOOPFILTER h, 5
+INIT_MMX mmx2
+SIMPLE_LOOPFILTER v, 4
+SIMPLE_LOOPFILTER h, 5
 %endif

-INIT_XMM
-%define SPLATB_REG SPLATB_REG_SSE2
-%define WRITE_8W   WRITE_8W_SSE2
-SIMPLE_LOOPFILTER sse2,  v, 3, 8
-SIMPLE_LOOPFILTER sse2,  h, 5, 8
-%define SPLATB_REG SPLATB_REG_SSSE3
-SIMPLE_LOOPFILTER ssse3, v, 3, 8
-SIMPLE_LOOPFILTER ssse3, h, 5, 8
-%define WRITE_8W   WRITE_8W_SSE4
-SIMPLE_LOOPFILTER sse4,  h, 5, 8
+INIT_XMM sse2
+SIMPLE_LOOPFILTER v, 3
+SIMPLE_LOOPFILTER h, 5
+INIT_XMM ssse3
+SIMPLE_LOOPFILTER v, 3
+SIMPLE_LOOPFILTER h, 5
+INIT_XMM sse4
+SIMPLE_LOOPFILTER h, 5

 ;-----------------------------------------------------------------------------
 ; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
 ;                                            int flimE, int flimI, int hev_thr);
 ;-----------------------------------------------------------------------------

-%macro INNER_LOOPFILTER 5
-%if %4 == 8 ; chroma
-cglobal vp8_%2_loop_filter8uv_inner_%1, 6, %3, %5
+%macro INNER_LOOPFILTER 3
+%if %3 == 8 ; chroma
+cglobal vp8_%1_loop_filter8uv_inner, 6, %2, 13
 %define dst8_reg    r1
 %define mstride_reg r2
 %define E_reg       r3
 %define I_reg       r4
 %define hev_thr_reg r5
 %else ; luma
-cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
+cglobal vp8_%1_loop_filter16y_inner, 5, %2, 13
 %define mstride_reg r1
 %define E_reg       r2
 %define I_reg       r3
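A sketch of what the new instantiation style expands to, assuming stock x86inc cglobal naming (demo_fn is a placeholder; bodies elided):

    INIT_XMM sse2
    cglobal demo_fn, 0, 0, 0   ; defines demo_fn_sse2 (plus any configured prefix)
        RET
    INIT_XMM ssse3
    cglobal demo_fn, 0, 0, 0   ; defines demo_fn_ssse3
        RET

So SIMPLE_LOOPFILTER v, 3 under INIT_XMM sse2 yields the same vp8_v_loop_filter_simple_sse2 symbol the old code spelled out by hand as vp8_%2_loop_filter_simple_%1.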
@@ -1681,11 +1672,9 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
 %define stack_reg   hev_thr_reg
 %endif

-%ifnidn %1, sse2
-%if mmsize == 16
+%if cpuflag(ssse3)
     pxor           m7, m7
 %endif
-%endif

 %ifndef m8        ; mmx/mmxext or sse2 on x86-32
     ; splat function arguments
@@ -1696,7 +1685,7 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
     ; align stack
     mov     stack_reg, rsp           ; backup stack pointer
     and           rsp, ~(mmsize-1)   ; align stack
-%ifidn %2, v
+%ifidn %1, v
     sub           rsp, mmsize * 4    ; stack layout: [0]=E, [1]=I, [2]=hev_thr
                                      ;               [3]=hev() result
 %else ; h
@@ -1729,14 +1718,14 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
     SPLATB_REG    hev_thr, hev_thr_reg, m7 ; hev_thresh
 %endif

-%if mmsize == 8 && %4 == 16 ; mmx/mmxext
+%if mmsize == 8 && %3 == 16 ; mmx/mmxext
     mov       cnt_reg, 2
 %endif
     mov    stride_reg, mstride_reg
     neg   mstride_reg
-%ifidn %2, h
+%ifidn %1, h
     lea       dst_reg, [dst_reg + stride_reg*4-4]
-%if %4 == 8
+%if %3 == 8
     lea      dst8_reg, [dst8_reg+ stride_reg*4-4]
 %endif
 %endif
@@ -1746,8 +1735,8 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
 %endif
     ; read
     lea      dst2_reg, [dst_reg + stride_reg]
-%ifidn %2, v
-%if %4 == 8 && mmsize == 16
+%ifidn %1, v
+%if %3 == 8 && mmsize == 16
 %define movrow movh
 %else
 %define movrow mova
@@ -1758,7 +1747,7 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
     movrow         m5, [dst2_reg]               ; q1
     movrow         m6, [dst2_reg+ stride_reg]   ; q2
     movrow         m7, [dst2_reg+ stride_reg*2] ; q3
-%if mmsize == 16 && %4 == 8
+%if mmsize == 16 && %3 == 8
     movhps         m0, [dst8_reg+ mstride_reg*4]
     movhps         m2, [dst8_reg+ mstride_reg*2]
     add      dst8_reg, stride_reg
@@ -1795,7 +1784,7 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
     SWAP            6, 3
     SWAP            5, 3
 %else ; sse2 (h)
-%if %4 == 16
+%if %3 == 16
     lea      dst8_reg, [dst_reg + stride_reg*8]
 %endif
@@ -1882,7 +1871,7 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
     psubusb        m6, m5           ; q2-q1
     por            m6, m4           ; abs(q2-q1)

-%ifidn %1, mmx
+%if notcpuflag(mmx2)
     mova           m4, flim_I
     pxor           m3, m3
     psubusb        m0, m4
@@ -1904,9 +1893,9 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
     ; normal_limit and high_edge_variance for p1-p0, q1-q0
     SWAP            7, 3           ; now m7 is zero
-%ifidn %2, v
+%ifidn %1, v
     movrow         m3, [dst_reg +mstride_reg] ; p0
-%if mmsize == 16 && %4 == 8
+%if mmsize == 16 && %3 == 8
     movhps         m3, [dst8_reg+mstride_reg]
 %endif
 %elifdef m12
@@ -1922,7 +1911,7 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
     psubusb        m1, m3           ; p1-p0
     psubusb        m6, m2           ; p0-p1
     por            m1, m6           ; abs(p1-p0)
-%ifidn %1, mmx
+%if notcpuflag(mmx2)
     mova           m6, m1
     psubusb        m1, m4
     psubusb        m6, hev_thr
@@ -1936,9 +1925,9 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
 %endif
     SWAP            6, 4           ; now m6 is I
-%ifidn %2, v
+%ifidn %1, v
     movrow         m4, [dst_reg]   ; q0
-%if mmsize == 16 && %4 == 8
+%if mmsize == 16 && %3 == 8
     movhps         m4, [dst8_reg]
 %endif
 %elifdef m8
@@ -1953,7 +1942,7 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
     psubusb        m1, m5           ; q0-q1
     psubusb        m7, m4           ; q1-q0
     por            m1, m7           ; abs(q1-q0)
-%ifidn %1, mmx
+%if notcpuflag(mmx2)
     mova           m7, m1
     psubusb        m1, m6
     psubusb        m7, hev_thr
@@ -2061,14 +2050,14 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
 %else
     mova           m6, mask_res
 %endif
-%ifidn %1, mmx
+%if notcpuflag(mmx2)
     mova           m7, [pb_1]
 %else ; mmxext/sse2
     pxor           m7, m7
 %endif
     pand           m0, m6
     pand           m1, m6
-%ifidn %1, mmx
+%if notcpuflag(mmx2)
     paddusb        m0, m7
     pand           m1, [pb_FE]
     pandn          m7, m0
@@ -2086,12 +2075,12 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
     paddusb        m2, m0           ; p1+a

     ; store
-%ifidn %2, v
+%ifidn %1, v
     movrow [dst_reg +mstride_reg*2], m2
     movrow [dst_reg +mstride_reg  ], m3
     movrow [dst_reg               ], m4
     movrow [dst_reg + stride_reg  ], m5
-%if mmsize == 16 && %4 == 8
+%if mmsize == 16 && %3 == 8
     movhps [dst8_reg+mstride_reg*2], m2
     movhps [dst8_reg+mstride_reg  ], m3
     movhps [dst8_reg              ], m4
@@ -2108,20 +2097,20 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
     WRITE_4x2D      2, 3, 4, 5, dst_reg, dst2_reg, mstride_reg, stride_reg
 %else ; sse2 (h)
     lea      dst8_reg, [dst8_reg+mstride_reg+2]
-    WRITE_4x4D      2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4
+    WRITE_4x4D      2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %3
 %endif
 %endif

 %if mmsize == 8
-%if %4 == 8 ; chroma
-%ifidn %2, h
+%if %3 == 8 ; chroma
+%ifidn %1, h
     sub       dst_reg, 2
 %endif
     cmp       dst_reg, dst8_reg
     mov       dst_reg, dst8_reg
     jnz .next8px
 %else
-%ifidn %2, h
+%ifidn %1, h
     lea       dst_reg, [dst_reg + stride_reg*8-2]
 %else ; v
     add       dst_reg, 8
@@ -2138,56 +2127,46 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
 %endmacro

 %if ARCH_X86_32
-INIT_MMX
-%define SPLATB_REG SPLATB_REG_MMX
-INNER_LOOPFILTER mmx,    v, 6, 16, 0
-INNER_LOOPFILTER mmx,    h, 6, 16, 0
-INNER_LOOPFILTER mmx,    v, 6,  8, 0
-INNER_LOOPFILTER mmx,    h, 6,  8, 0
-%define SPLATB_REG SPLATB_REG_MMXEXT
-INNER_LOOPFILTER mmxext, v, 6, 16, 0
-INNER_LOOPFILTER mmxext, h, 6, 16, 0
-INNER_LOOPFILTER mmxext, v, 6,  8, 0
-INNER_LOOPFILTER mmxext, h, 6,  8, 0
+INIT_MMX mmx
+INNER_LOOPFILTER v, 6, 16
+INNER_LOOPFILTER h, 6, 16
+INNER_LOOPFILTER v, 6,  8
+INNER_LOOPFILTER h, 6,  8
+INIT_MMX mmx2
+INNER_LOOPFILTER v, 6, 16
+INNER_LOOPFILTER h, 6, 16
+INNER_LOOPFILTER v, 6,  8
+INNER_LOOPFILTER h, 6,  8
 %endif

-INIT_XMM
-%define SPLATB_REG SPLATB_REG_SSE2
-INNER_LOOPFILTER sse2,  v, 5, 16, 13
-%ifdef m8
-INNER_LOOPFILTER sse2,  h, 5, 16, 13
-%else
-INNER_LOOPFILTER sse2,  h, 6, 16, 13
-%endif
-INNER_LOOPFILTER sse2,  v, 6,  8, 13
-INNER_LOOPFILTER sse2,  h, 6,  8, 13
-
-%define SPLATB_REG SPLATB_REG_SSSE3
-INNER_LOOPFILTER ssse3, v, 5, 16, 13
-%ifdef m8
-INNER_LOOPFILTER ssse3, h, 5, 16, 13
-%else
-INNER_LOOPFILTER ssse3, h, 6, 16, 13
-%endif
-INNER_LOOPFILTER ssse3, v, 6,  8, 13
-INNER_LOOPFILTER ssse3, h, 6,  8, 13
+INIT_XMM sse2
+INNER_LOOPFILTER v, 5, 16
+INNER_LOOPFILTER h, 5 + ARCH_X86_32, 16
+INNER_LOOPFILTER v, 6,  8
+INNER_LOOPFILTER h, 6,  8
+INIT_XMM ssse3
+INNER_LOOPFILTER v, 5, 16
+INNER_LOOPFILTER h, 5 + ARCH_X86_32, 16
+INNER_LOOPFILTER v, 6,  8
+INNER_LOOPFILTER h, 6,  8

 ;-----------------------------------------------------------------------------
 ; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
 ;                                             int flimE, int flimI, int hev_thr);
 ;-----------------------------------------------------------------------------

-%macro MBEDGE_LOOPFILTER 5
-%if %4 == 8 ; chroma
-cglobal vp8_%2_loop_filter8uv_mbedge_%1, 6, %3, %5
+%macro MBEDGE_LOOPFILTER 3
+%if %3 == 8 ; chroma
+cglobal vp8_%1_loop_filter8uv_mbedge, 6, %2, 15
 %define dst8_reg    r1
 %define mstride_reg r2
 %define E_reg       r3
 %define I_reg       r4
 %define hev_thr_reg r5
 %else ; luma
-cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
+cglobal vp8_%1_loop_filter16y_mbedge, 5, %2, 15
 %define mstride_reg r1
 %define E_reg       r2
 %define I_reg       r3
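A hedged reading of the dropped last parameter: the XMM register count that used to arrive as %5 (0 for the MMX instantiations) is now hardcoded, which appears safe because cglobal only acts on that count where XMM registers exist:

    ; Old: INNER_LOOPFILTER/MBEDGE_LOOPFILTER received the register count
    ;      as %5 (0 for MMX builds, 13/15 for XMM builds).
    ; New: the count is fixed in the macro body:
    ;        cglobal vp8_%1_loop_filter8uv_inner,  6, %2, 13
    ;        cglobal vp8_%1_loop_filter16y_mbedge, 5, %2, 15
    ;      cglobal is understood to use it only for XMM-specific bookkeeping
    ;      (e.g. WIN64 callee-save spills of xmm6+), so a fixed 13/15 is
    ;      harmless under INIT_MMX.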
@@ -2207,14 +2186,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
 %define stack_reg   hev_thr_reg
 %endif

-%define ssse3_or_higher 0
-%ifnidn %1, sse2
-%if mmsize == 16
-%define ssse3_or_higher 1
-%endif
-%endif
-
-%if ssse3_or_higher
+%if cpuflag(ssse3)
     pxor           m7, m7
 %endif

@@ -2275,14 +2247,14 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     SPLATB_REG    hev_thr, hev_thr_reg, m7 ; hev_thresh
 %endif

-%if mmsize == 8 && %4 == 16 ; mmx/mmxext
+%if mmsize == 8 && %3 == 16 ; mmx/mmxext
     mov       cnt_reg, 2
 %endif
     mov    stride_reg, mstride_reg
     neg   mstride_reg
-%ifidn %2, h
+%ifidn %1, h
     lea       dst_reg, [dst_reg + stride_reg*4-4]
-%if %4 == 8
+%if %3 == 8
     lea      dst8_reg, [dst8_reg+ stride_reg*4-4]
 %endif
 %endif
@@ -2292,8 +2264,8 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
 %endif
     ; read
     lea      dst2_reg, [dst_reg + stride_reg]
-%ifidn %2, v
-%if %4 == 8 && mmsize == 16
+%ifidn %1, v
+%if %3 == 8 && mmsize == 16
 %define movrow movh
 %else
 %define movrow mova
@@ -2304,7 +2276,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     movrow         m5, [dst2_reg]               ; q1
     movrow         m6, [dst2_reg+ stride_reg]   ; q2
     movrow         m7, [dst2_reg+ stride_reg*2] ; q3
-%if mmsize == 16 && %4 == 8
+%if mmsize == 16 && %3 == 8
     movhps         m0, [dst8_reg+ mstride_reg*4]
     movhps         m2, [dst8_reg+ mstride_reg*2]
     add      dst8_reg, stride_reg
@@ -2341,7 +2313,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     SWAP            6, 3
     SWAP            5, 3
 %else ; sse2 (h)
-%if %4 == 16
+%if %3 == 16
     lea      dst8_reg, [dst_reg + stride_reg*8]
 %endif
@@ -2430,7 +2402,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     psubusb        m6, m5           ; q2-q1
     por            m6, m4           ; abs(q2-q1)

-%ifidn %1, mmx
+%if notcpuflag(mmx2)
     mova           m4, flim_I
     pxor           m3, m3
     psubusb        m0, m4
@@ -2452,9 +2424,9 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     ; normal_limit and high_edge_variance for p1-p0, q1-q0
     SWAP            7, 3           ; now m7 is zero
-%ifidn %2, v
+%ifidn %1, v
     movrow         m3, [dst_reg +mstride_reg] ; p0
-%if mmsize == 16 && %4 == 8
+%if mmsize == 16 && %3 == 8
     movhps         m3, [dst8_reg+mstride_reg]
 %endif
 %elifdef m12
@@ -2470,7 +2442,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     psubusb        m1, m3           ; p1-p0
     psubusb        m6, m2           ; p0-p1
     por            m1, m6           ; abs(p1-p0)
-%ifidn %1, mmx
+%if notcpuflag(mmx2)
     mova           m6, m1
     psubusb        m1, m4
     psubusb        m6, hev_thr
@@ -2484,9 +2456,9 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
 %endif
     SWAP            6, 4           ; now m6 is I
-%ifidn %2, v
+%ifidn %1, v
     movrow         m4, [dst_reg]   ; q0
-%if mmsize == 16 && %4 == 8
+%if mmsize == 16 && %3 == 8
     movhps         m4, [dst8_reg]
 %endif
 %elifdef m8
@@ -2501,7 +2473,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     psubusb        m1, m5           ; q0-q1
     psubusb        m7, m4           ; q1-q0
     por            m1, m7           ; abs(q1-q0)
-%ifidn %1, mmx
+%if notcpuflag(mmx2)
     mova           m7, m1
     psubusb        m1, m6
     psubusb        m7, hev_thr
@@ -2613,7 +2585,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     paddusb        m4, m1           ; q0-f1

     ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w)
-%if ssse3_or_higher
+%if cpuflag(ssse3)
     mova           m7, [pb_1]
 %else
     mova           m7, [pw_63]
@@ -2626,7 +2598,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     pxor           m0, m0
     mova           m6, m1
     pcmpgtb        m0, m1           ; which are negative
-%if ssse3_or_higher
+%if cpuflag(ssse3)
     punpcklbw      m6, m7           ; interleave with "1" for rounding
     punpckhbw      m1, m7
 %else
@@ -2634,7 +2606,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     punpckhbw      m1, m0
 %endif
     mova     lim_sign, m0
-%if ssse3_or_higher
+%if cpuflag(ssse3)
     mova           m7, [pb_27_63]
 %ifndef m8
     mova      lim_res, m1
@@ -2667,7 +2639,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     psubb          m1, m6
     pand           m1, m0           ; -a0
     pandn          m0, m6           ; +a0
-%if ssse3_or_higher
+%if cpuflag(ssse3)
     mova           m6, [pb_18_63]   ; pipelining
 %endif
     psubusb        m3, m1
@@ -2675,7 +2647,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     paddusb        m3, m0           ; p0+a0
     psubusb        m4, m0           ; q0-a0
-%if ssse3_or_higher
+%if cpuflag(ssse3)
     SWAP            6, 7
 %ifdef m10
     SWAP            1, 10
@@ -2707,7 +2679,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     psubb          m1, m6
     pand           m1, m0           ; -a1
     pandn          m0, m6           ; +a1
-%if ssse3_or_higher
+%if cpuflag(ssse3)
     mova           m6, [pb_9_63]
 %endif
     psubusb        m2, m1
@@ -2715,7 +2687,7 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     paddusb        m2, m0           ; p1+a1
     psubusb        m5, m0           ; q1-a1
-%if ssse3_or_higher
+%if cpuflag(ssse3)
     SWAP            6, 7
 %ifdef m10
     SWAP            1, 10
@@ -2765,14 +2737,14 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     psubusb        m6, m7           ; q1-a1

     ; store
-%ifidn %2, v
+%ifidn %1, v
     movrow [dst2_reg+mstride_reg*4], m1
     movrow [dst_reg +mstride_reg*2], m2
     movrow [dst_reg +mstride_reg  ], m3
     movrow [dst_reg               ], m4
     movrow [dst2_reg              ], m5
     movrow [dst2_reg+ stride_reg  ], m6
-%if mmsize == 16 && %4 == 8
+%if mmsize == 16 && %3 == 8
     add      dst8_reg, mstride_reg
     movhps [dst8_reg+mstride_reg*2], m1
     movhps [dst8_reg+mstride_reg  ], m2
@@ -2796,14 +2768,14 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
     WRITE_2x4W     m5, m6, dst2_reg, dst_reg, mstride_reg, stride_reg
 %else ; sse2 (h)
     lea      dst8_reg, [dst8_reg+mstride_reg+1]
-    WRITE_4x4D      1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4
+    WRITE_4x4D      1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %3
     lea       dst_reg, [dst2_reg+mstride_reg+4]
     lea      dst8_reg, [dst8_reg+mstride_reg+4]
-%ifidn %1, sse4
+%if cpuflag(sse4)
     add      dst2_reg, 4
 %endif
     WRITE_8W       m5, dst2_reg, dst_reg,  mstride_reg, stride_reg
-%ifidn %1, sse4
+%if cpuflag(sse4)
     lea      dst2_reg, [dst8_reg+ stride_reg]
 %endif
     WRITE_8W       m6, dst2_reg, dst8_reg, mstride_reg, stride_reg
@@ -2811,15 +2783,15 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
 %endif

 %if mmsize == 8
-%if %4 == 8 ; chroma
-%ifidn %2, h
+%if %3 == 8 ; chroma
+%ifidn %1, h
     sub       dst_reg, 5
 %endif
     cmp       dst_reg, dst8_reg
     mov       dst_reg, dst8_reg
     jnz .next8px
 %else
-%ifidn %2, h
+%ifidn %1, h
     lea       dst_reg, [dst_reg + stride_reg*8-5]
 %else ; v
     add       dst_reg, 8
@@ -2836,46 +2808,31 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
 %endmacro

 %if ARCH_X86_32
-INIT_MMX
-%define SPLATB_REG SPLATB_REG_MMX
-MBEDGE_LOOPFILTER mmx,    v, 6, 16, 0
-MBEDGE_LOOPFILTER mmx,    h, 6, 16, 0
-MBEDGE_LOOPFILTER mmx,    v, 6,  8, 0
-MBEDGE_LOOPFILTER mmx,    h, 6,  8, 0
-%define SPLATB_REG SPLATB_REG_MMXEXT
-MBEDGE_LOOPFILTER mmxext, v, 6, 16, 0
-MBEDGE_LOOPFILTER mmxext, h, 6, 16, 0
-MBEDGE_LOOPFILTER mmxext, v, 6,  8, 0
-MBEDGE_LOOPFILTER mmxext, h, 6,  8, 0
+INIT_MMX mmx
+MBEDGE_LOOPFILTER v, 6, 16
+MBEDGE_LOOPFILTER h, 6, 16
+MBEDGE_LOOPFILTER v, 6,  8
+MBEDGE_LOOPFILTER h, 6,  8
+INIT_MMX mmx2
+MBEDGE_LOOPFILTER v, 6, 16
+MBEDGE_LOOPFILTER h, 6, 16
+MBEDGE_LOOPFILTER v, 6,  8
+MBEDGE_LOOPFILTER h, 6,  8
 %endif

-INIT_XMM
-%define SPLATB_REG SPLATB_REG_SSE2
-%define WRITE_8W   WRITE_8W_SSE2
-MBEDGE_LOOPFILTER sse2,  v, 5, 16, 15
-%ifdef m8
-MBEDGE_LOOPFILTER sse2,  h, 5, 16, 15
-%else
-MBEDGE_LOOPFILTER sse2,  h, 6, 16, 15
-%endif
-MBEDGE_LOOPFILTER sse2,  v, 6,  8, 15
-MBEDGE_LOOPFILTER sse2,  h, 6,  8, 15
-
-%define SPLATB_REG SPLATB_REG_SSSE3
-MBEDGE_LOOPFILTER ssse3, v, 5, 16, 15
-%ifdef m8
-MBEDGE_LOOPFILTER ssse3, h, 5, 16, 15
-%else
-MBEDGE_LOOPFILTER ssse3, h, 6, 16, 15
-%endif
-MBEDGE_LOOPFILTER ssse3, v, 6,  8, 15
-MBEDGE_LOOPFILTER ssse3, h, 6,  8, 15
-
-%define WRITE_8W   WRITE_8W_SSE4
-%ifdef m8
-MBEDGE_LOOPFILTER sse4,  h, 5, 16, 15
-%else
-MBEDGE_LOOPFILTER sse4,  h, 6, 16, 15
-%endif
-MBEDGE_LOOPFILTER sse4,  h, 6,  8, 15
+INIT_XMM sse2
+MBEDGE_LOOPFILTER v, 5, 16
+MBEDGE_LOOPFILTER h, 5 + ARCH_X86_32, 16
+MBEDGE_LOOPFILTER v, 6,  8
+MBEDGE_LOOPFILTER h, 6,  8
+INIT_XMM ssse3
+MBEDGE_LOOPFILTER v, 5, 16
+MBEDGE_LOOPFILTER h, 5 + ARCH_X86_32, 16
+MBEDGE_LOOPFILTER v, 6,  8
+MBEDGE_LOOPFILTER h, 6,  8
+INIT_XMM sse4
+MBEDGE_LOOPFILTER h, 5 + ARCH_X86_32, 16
+MBEDGE_LOOPFILTER h, 6,  8