ffmpeg.wasm-core (Linshizhi)

Commit 8ad77b65, authored May 10, 2011 by Jason Garrett-Glaser
Update x86 H.264 deblock asm
Includes AVX versions from x264.
parent b6675279
Showing 3 changed files with 277 additions and 193 deletions:

libavcodec/x86/h264_deblock.asm   +227  -168
libavcodec/x86/h264dsp_mmx.c       +34   -22
libavcodec/x86/x86util.asm         +16    -3
libavcodec/x86/h264_deblock.asm
 ;*****************************************************************************
-;* MMX/SSE2-optimized H.264 deblocking code
+;* MMX/SSE2/AVX-optimized H.264 deblocking code
 ;*****************************************************************************
-;* Copyright (C) 2005-2008 x264 project
+;* Copyright (C) 2005-2011 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*          Oskar Arvidsson <oskar@irock.se>
 ;*
 ;* This file is part of Libav.
 ;*
...
@@ -26,96 +27,135 @@
 %include "x86inc.asm"
 %include "x86util.asm"
 
-SECTION_RODATA
+SECTION .text
 
 cextern pb_0
 cextern pb_1
 cextern pb_3
 cextern pb_A1
 
-SECTION .text
-
 ; expands to [base],...,[base+7*stride]
 %define PASS8ROWS(base, base3, stride, stride3) \
     [base], [base+stride], [base+stride*2], [base3], \
     [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
 
-; in: 8 rows of 4 bytes in %1..%8
+%define PASS8ROWS(base, base3, stride, stride3, offset) \
+    PASS8ROWS(base+offset, base3+offset, stride, stride3)
+
+; in: 8 rows of 4 bytes in %4..%11
 ; out: 4 rows of 8 bytes in m0..m3
-%macro TRANSPOSE4x8_LOAD 8
-    movd       m0, %1
-    movd       m2, %2
-    movd       m1, %3
-    movd       m3, %4
-    punpcklbw  m0, m2
-    punpcklbw  m1, m3
-    movq       m2, m0
-    punpcklwd  m0, m1
-    punpckhwd  m2, m1
-    movd       m4, %5
-    movd       m6, %6
-    movd       m5, %7
-    movd       m7, %8
-    punpcklbw  m4, m6
-    punpcklbw  m5, m7
-    movq       m6, m4
-    punpcklwd  m4, m5
-    punpckhwd  m6, m5
-    movq       m1, m0
-    movq       m3, m2
-    punpckldq  m0, m4
-    punpckhdq  m1, m4
-    punpckldq  m2, m6
-    punpckhdq  m3, m6
+%macro TRANSPOSE4x8_LOAD 11
+    movh       m0, %4
+    movh       m2, %5
+    movh       m1, %6
+    movh       m3, %7
+    punpckl%1  m0, m2
+    punpckl%1  m1, m3
+    mova       m2, m0
+    punpckl%2  m0, m1
+    punpckh%2  m2, m1
+    movh       m4, %8
+    movh       m6, %9
+    movh       m5, %10
+    movh       m7, %11
+    punpckl%1  m4, m6
+    punpckl%1  m5, m7
+    mova       m6, m4
+    punpckl%2  m4, m5
+    punpckh%2  m6, m5
+    punpckh%3  m1, m0, m4
+    punpckh%3  m3, m2, m6
+    punpckl%3  m0, m4
+    punpckl%3  m2, m6
 %endmacro
 
 ; in: 4 rows of 8 bytes in m0..m3
 ; out: 8 rows of 4 bytes in %1..%8
-%macro TRANSPOSE8x4_STORE 8
-    movq       m4, m0
-    movq       m5, m1
-    movq       m6, m2
-    punpckhdq  m4, m4
-    punpckhdq  m5, m5
-    punpckhdq  m6, m6
+%macro TRANSPOSE8x4B_STORE 8
+    punpckhdq  m4, m0, m0
+    punpckhdq  m5, m1, m1
+    punpckhdq  m6, m2, m2
     punpcklbw  m0, m1
     punpcklbw  m2, m3
-    movq       m1, m0
-    punpcklwd  m0, m2
-    punpckhwd  m1, m2
-    movd       %1, m0
-    punpckhdq  m0, m0
-    movd       %2, m0
-    movd       %3, m1
+    punpcklwd  m1, m0, m2
+    punpckhwd  m0, m2
+    movh       %1, m1
     punpckhdq  m1, m1
-    movd       %4, m1
+    movh       %2, m1
+    movh       %3, m0
+    punpckhdq  m0, m0
+    movh       %4, m0
     punpckhdq  m3, m3
    punpcklbw  m4, m5
     punpcklbw  m6, m3
-    movq       m5, m4
-    punpcklwd  m4, m6
-    punpckhwd  m5, m6
-    movd       %5, m4
-    punpckhdq  m4, m4
-    movd       %6, m4
-    movd       %7, m5
+    punpcklwd  m5, m4, m6
+    punpckhwd  m4, m6
+    movh       %5, m5
     punpckhdq  m5, m5
-    movd       %8, m5
+    movh       %6, m5
+    movh       %7, m4
+    punpckhdq  m4, m4
+    movh       %8, m4
 %endmacro
 
+%macro TRANSPOSE4x8B_LOAD 8
+    TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8
+%endmacro
+
+%macro TRANSPOSE4x8W_LOAD 8
+%if mmsize == 16
+    TRANSPOSE4x8_LOAD wd, dq, qdq, %1, %2, %3, %4, %5, %6, %7, %8
+%else
+    SWAP 1, 4, 2, 3
+    mova m0, [t5]
+    mova m1, [t5+r1]
+    mova m2, [t5+r1*2]
+    mova m3, [t5+t6]
+    TRANSPOSE4x4W 0, 1, 2, 3, 4
+%endif
+%endmacro
+
+%macro TRANSPOSE8x2W_STORE 8
+    punpckhwd  m0, m1, m2
+    punpcklwd  m1, m2
+%if mmsize == 8
+    movd       %3, m0
+    movd       %1, m1
+    psrlq      m1, 32
+    psrlq      m0, 32
+    movd       %2, m1
+    movd       %4, m0
+%else
+    movd       %5, m0
+    movd       %1, m1
+    psrldq     m1, 4
+    psrldq     m0, 4
+    movd       %2, m1
+    movd       %6, m0
+    psrldq     m1, 4
+    psrldq     m0, 4
+    movd       %3, m1
+    movd       %7, m0
+    psrldq     m1, 4
+    psrldq     m0, 4
+    movd       %4, m1
+    movd       %8, m0
+%endif
+%endmacro
+
 %macro SBUTTERFLY3 4
-    movq       %4, %2
+    punpckh%1  %4, %2, %3
     punpckl%1  %2, %3
-    punpckh%1  %4, %3
 %endmacro
 
 ; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
 ; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
 %macro TRANSPOSE6x8_MEM 9
+    RESET_MM_PERMUTATION
     movq  m0, %1
     movq  m1, %2
     movq  m2, %3
...
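Note: the recurring pattern in this hunk is the switch to the non-destructive three-operand forms that x86inc emits under INIT_AVX (e.g. `punpckh%3 m1, m0, m4` replacing a `movq m1, m0` copy plus `punpckhdq m1, m4`); the transposes themselves are unchanged. As a hedged scalar sketch (names here are illustrative, not from the source), the whole interleave tree of TRANSPOSE4x8_LOAD just computes an ordinary transpose:

    /* Scalar model of TRANSPOSE4x8_LOAD's net result: 8 rows of 4 bytes in,
     * 4 rows of 8 bytes out, out[r][c] = in[c][r]. The punpckl/punpckh
     * stages in the macro build exactly this, a register at a time. */
    static void transpose8x4(const unsigned char in[8][4], unsigned char out[4][8])
    {
        for (int r = 0; r < 4; r++)
            for (int c = 0; c < 8; c++)
                out[r][c] = in[c][r];
    }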
@@ -123,30 +163,32 @@ SECTION .text
     movq  m4, %5
     movq  m5, %6
     movq  m6, %7
-    SBUTTERFLY3 bw, m0, m1, m7
-    SBUTTERFLY3 bw, m2, m3, m1
-    SBUTTERFLY3 bw, m4, m5, m3
-    movq  [%9+0x10], m1
-    SBUTTERFLY3 bw, m6, %8, m5
-    SBUTTERFLY3 wd, m0, m2, m1
-    SBUTTERFLY3 wd, m4, m6, m2
+    SBUTTERFLY bw, 0, 1, 7
+    SBUTTERFLY bw, 2, 3, 7
+    SBUTTERFLY bw, 4, 5, 7
+    movq  [%9+0x10], m3
+    SBUTTERFLY3 bw, m6, %8, m7
+    SBUTTERFLY wd, 0, 2, 3
+    SBUTTERFLY wd, 4, 6, 3
     punpckhdq m0, m4
     movq  [%9+0x00], m0
-    SBUTTERFLY3 wd, m7, [%9+0x10], m6
-    SBUTTERFLY3 wd, m3, m5, m4
-    SBUTTERFLY3 dq, m7, m3, m0
-    SBUTTERFLY3 dq, m1, m2, m5
-    punpckldq m6, m4
-    movq  [%9+0x10], m1
-    movq  [%9+0x20], m5
-    movq  [%9+0x30], m7
-    movq  [%9+0x40], m0
-    movq  [%9+0x50], m6
+    SBUTTERFLY3 wd, m1, [%9+0x10], m3
+    SBUTTERFLY wd, 5, 7, 0
+    SBUTTERFLY dq, 1, 5, 0
+    SBUTTERFLY dq, 2, 6, 0
+    punpckldq m3, m7
+    movq  [%9+0x10], m2
+    movq  [%9+0x20], m6
+    movq  [%9+0x30], m1
+    movq  [%9+0x40], m5
+    movq  [%9+0x50], m3
+    RESET_MM_PERMUTATION
 %endmacro
 
 ; in: 8 rows of 8 in %1..%8
 ; out: 8 rows of 8 in %9..%16
 %macro TRANSPOSE8x8_MEM 16
+    RESET_MM_PERMUTATION
     movq  m0, %1
     movq  m1, %2
     movq  m2, %3
...
@@ -154,38 +196,44 @@ SECTION .text
     movq  m4, %5
     movq  m5, %6
     movq  m6, %7
-    SBUTTERFLY3 bw, m0, m1, m7
-    SBUTTERFLY3 bw, m2, m3, m1
-    SBUTTERFLY3 bw, m4, m5, m3
-    SBUTTERFLY3 bw, m6, %8, m5
-    movq  %9,  m3
-    SBUTTERFLY3 wd, m0, m2, m3
-    SBUTTERFLY3 wd, m4, m6, m2
-    SBUTTERFLY3 wd, m7, m1, m6
-    movq  %11, m2
-    movq  m2,  %9
-    SBUTTERFLY3 wd, m2, m5, m1
-    SBUTTERFLY3 dq, m0, m4, m5
-    SBUTTERFLY3 dq, m7, m2, m4
+    SBUTTERFLY bw, 0, 1, 7
+    SBUTTERFLY bw, 2, 3, 7
+    SBUTTERFLY bw, 4, 5, 7
+    SBUTTERFLY3 bw, m6, %8, m7
+    movq  %9,  m5
+    SBUTTERFLY wd, 0, 2, 5
+    SBUTTERFLY wd, 4, 6, 5
+    SBUTTERFLY wd, 1, 3, 5
+    movq  %11, m6
+    movq  m6,  %9
+    SBUTTERFLY wd, 6, 7, 5
+    SBUTTERFLY dq, 0, 4, 5
+    SBUTTERFLY dq, 1, 6, 5
     movq  %9,  m0
-    movq  %10, m5
-    movq  %13, m7
-    movq  %14, m4
-    SBUTTERFLY3 dq, m3, %11, m0
-    SBUTTERFLY3 dq, m6, m1, m5
-    movq  %11, m3
+    movq  %10, m4
+    movq  %13, m1
+    movq  %14, m6
+    SBUTTERFLY3 dq, m2, %11, m0
+    SBUTTERFLY dq, 3, 7, 4
+    movq  %11, m2
     movq  %12, m0
-    movq  %15, m6
-    movq  %16, m5
+    movq  %15, m3
+    movq  %16, m7
+    RESET_MM_PERMUTATION
 %endmacro
 
 ; out: %4 = |%1-%2|>%3
 ; clobbers: %5
 %macro DIFF_GT 5
+%if avx_enabled == 0
     mova    %5, %2
     mova    %4, %1
     psubusb %5, %1
     psubusb %4, %2
+%else
+    psubusb %5, %2, %1
+    psubusb %4, %1, %2
+%endif
     por     %4, %5
     psubusb %4, %3
 %endmacro
...
@@ -193,32 +241,28 @@ SECTION .text
 ; out: %4 = |%1-%2|>%3
 ; clobbers: %5
 %macro DIFF_GT2 5
+%ifdef ARCH_X86_64
+    psubusb %5, %2, %1
+    psubusb %4, %1, %2
+%else
     mova    %5, %2
     mova    %4, %1
     psubusb %5, %1
     psubusb %4, %2
+%endif
     psubusb %5, %3
     psubusb %4, %3
     pcmpeqb %4, %5
 %endmacro
 
-%macro SPLATW 1
-%ifidn m0, xmm0
-    pshuflw  %1, %1, 0
-    punpcklqdq %1, %1
-%else
-    pshufw   %1, %1, 0
-%endif
-%endmacro
-
 ; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
 ; out: m5=beta-1, m7=mask, %3=alpha-1
 ; clobbers: m4,m6
 %macro LOAD_MASK 2-3
     movd     m4, %1
     movd     m5, %2
-    SPLATW   m4
-    SPLATW   m5
+    SPLATW   m4, m4
+    SPLATW   m5, m5
     packuswb m4, m4  ; 16x alpha-1
     packuswb m5, m5  ; 16x beta-1
 %if %0 > 2
...
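A hedged scalar model of what DIFF_GT computes per byte lane (helper names are illustrative): psubusb is an unsigned saturating subtract, so OR-ing the two one-sided differences yields |a-b|, and a final saturating subtract of the threshold leaves a nonzero byte exactly when |a-b| > t:

    #include <stdint.h>

    static uint8_t subus(uint8_t a, uint8_t b)       /* psubusb: max(a-b, 0) */
    {
        return a > b ? (uint8_t)(a - b) : 0;
    }

    static int diff_gt(uint8_t a, uint8_t b, uint8_t t)
    {
        uint8_t absdiff = subus(a, b) | subus(b, a); /* one side is 0, so this is |a-b| */
        return subus(absdiff, t) != 0;               /* nonzero iff |a-b| > t */
    }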
@@ -237,8 +281,7 @@ SECTION .text
 ; out: m1=p0' m2=q0'
 ; clobbers: m0,3-6
 %macro DEBLOCK_P0_Q0 0
-    mova    m5, m1
-    pxor    m5, m2     ; p0^q0
+    pxor    m5, m1, m2 ; p0^q0
     pand    m5, [pb_1] ; (p0^q0)&1
     pcmpeqb m4, m4
     pxor    m3, m4
...
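The `(p0^q0)&1` term isolated here is the usual pavgb rounding correction: pavgb computes the round-up average (a+b+1)>>1, and subtracting the low bit of a^b (the carry parity) turns it into the round-down average. A small hedged C model of the identity:

    #include <stdint.h>

    static uint8_t avg_up(uint8_t a, uint8_t b)   /* pavgb: (a+b+1)>>1 */
    {
        return (uint8_t)((a + b + 1) >> 1);
    }

    static uint8_t avg_down(uint8_t a, uint8_t b) /* (a+b)>>1 */
    {
        return (uint8_t)(avg_up(a, b) - ((a ^ b) & 1));
    }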
@@ -264,14 +307,12 @@ SECTION .text
 ; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
 ; clobbers: q2, tmp, tc0
 %macro LUMA_Q1 6
-    mova    %6, m1
-    pavgb   %6, m2
+    pavgb   %6, m1, m2
     pavgb   %2, %6     ; avg(p2,avg(p0,q0))
     pxor    %6, %3
     pand    %6, [pb_1] ; (p2^avg(p0,q0))&1
     psubusb %2, %6     ; (p2+((p0+q0+1)>>1))>>1
-    mova    %6, %1
-    psubusb %6, %5
+    psubusb %6, %1, %5
     paddusb %5, %1
     pmaxub  %2, %6
     pminub  %2, %5
...
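The tail of LUMA_Q1 builds the clip bounds with saturating arithmetic: q1-tc0 via psubusb, q1+tc0 via paddusb, then pmaxub/pminub clamp the candidate value. A hedged scalar sketch (addus/subus are illustrative helpers for the saturating ops):

    #include <stdint.h>

    static uint8_t subus(uint8_t a, uint8_t b) { return a > b ? (uint8_t)(a - b) : 0; }
    static uint8_t addus(uint8_t a, uint8_t b) { int s = a + b; return s > 255 ? 255 : (uint8_t)s; }

    /* clip(x, q1-tc0, q1+tc0), as done by psubusb/paddusb + pmaxub/pminub */
    static uint8_t clip_q1(uint8_t x, uint8_t q1, uint8_t tc0)
    {
        uint8_t lo = subus(q1, tc0), hi = addus(q1, tc0);
        x = x > lo ? x : lo;    /* pmaxub */
        return x < hi ? x : hi; /* pminub */
    }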
@@ -280,10 +321,10 @@ SECTION .text
 %ifdef ARCH_X86_64
 ;-----------------------------------------------------------------------------
-; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
-INIT_XMM
-cglobal x264_deblock_v_luma_sse2, 5,5,10
+%macro DEBLOCK_LUMA 1
+cglobal deblock_v_luma_%1, 5,5,10
     movd   m8, [r4] ; tc0
     lea    r4, [r1*3]
     dec    r2d      ; alpha-1
...
@@ -307,8 +348,7 @@ cglobal x264_deblock_v_luma_sse2, 5,5,10
     movdqa m3, [r4] ; p2
     DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
     pand   m6, m9
-    mova   m7, m8
-    psubb  m7, m6
+    psubb  m7, m8, m6
     pand   m6, m8
     LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
...
@@ -326,10 +366,10 @@ cglobal x264_deblock_v_luma_sse2, 5,5,10
     RET
 ;-----------------------------------------------------------------------------
-; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
 INIT_MMX
-cglobal x264_deblock_h_luma_sse2, 5,7
+cglobal deblock_h_luma_%1, 5,7
     movsxd r10, r1d
     lea    r11, [r10+r10*2]
     lea    r6,  [r0-4]
...
@@ -350,13 +390,13 @@ cglobal x264_deblock_h_luma_sse2, 5,7
     ; vertical filter
     ; alpha, beta, tc0 are still in r2d, r3d, r4
-    ; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
+    ; don't backup r6, r5, r10, r11 because deblock_v_luma_sse2 doesn't use them
     lea    r0, [pix_tmp+0x30]
     mov    r1d, 0x10
 %ifdef WIN64
     mov    [rsp+0x20], r4
 %endif
-    call   x264_deblock_v_luma_sse2
+    call   deblock_v_luma_%1
     ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
     add    r6, 2
...
@@ -365,7 +405,7 @@ cglobal x264_deblock_h_luma_sse2, 5,7
     movq   m1, [pix_tmp+0x28]
     movq   m2, [pix_tmp+0x38]
     movq   m3, [pix_tmp+0x48]
-    TRANSPOSE8x4_STORE  PASS8ROWS(r6, r5, r10, r11)
+    TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11)
     shl    r10, 3
     sub    r6,  r10
...
@@ -375,7 +415,7 @@ cglobal x264_deblock_h_luma_sse2, 5,7
     movq   m1, [pix_tmp+0x20]
     movq   m2, [pix_tmp+0x30]
     movq   m3, [pix_tmp+0x40]
-    TRANSPOSE8x4_STORE  PASS8ROWS(r6, r5, r10, r11)
+    TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11)
 %ifdef WIN64
     add    rsp, 0x98
...
@@ -383,14 +423,20 @@ cglobal x264_deblock_h_luma_sse2, 5,7
     add    rsp, 0x68
 %endif
     RET
+%endmacro
+
+INIT_XMM
+DEBLOCK_LUMA sse2
+INIT_AVX
+DEBLOCK_LUMA avx
 
 %else
 
 %macro DEBLOCK_LUMA 3
 ;-----------------------------------------------------------------------------
-; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
-cglobal x264_deblock_%2_luma_%1, 5,5
+cglobal deblock_%2_luma_%1, 5,5
     lea    r4, [r1*3]
     dec    r2  ; alpha-1
     neg    r4
...
@@ -419,8 +465,7 @@ cglobal x264_deblock_%2_luma_%1, 5,5
     DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
     pand   m6, m4
     pand   m4, [esp+%3] ; tc
-    mova   m7, m4
-    psubb  m7, m6
+    psubb  m7, m4, m6
     pand   m6, m4
     LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
...
@@ -441,10 +486,10 @@ cglobal x264_deblock_%2_luma_%1, 5,5
     RET
 ;-----------------------------------------------------------------------------
-; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
 INIT_MMX
-cglobal x264_deblock_h_luma_%1, 0,5
+cglobal deblock_h_luma_%1, 0,5
     mov    r0, r0mp
     mov    r3, r1m
     lea    r4, [r3*3]
...
@@ -467,11 +512,11 @@ cglobal x264_deblock_h_luma_%1, 0,5
     PUSH   dword r2m
     PUSH   dword 16
     PUSH   dword r0
-    call   x264_deblock_%2_luma_%1
+    call   deblock_%2_luma_%1
 %ifidn %2, v8
     add    dword [esp   ], 8 ; pix_tmp+0x38
     add    dword [esp+16], 2 ; tc0+2
-    call   x264_deblock_%2_luma_%1
+    call   deblock_%2_luma_%1
 %endif
     ADD    esp, 20
...
@@ -484,7 +529,7 @@ cglobal x264_deblock_h_luma_%1, 0,5
     movq   m1, [pix_tmp+0x20]
     movq   m2, [pix_tmp+0x30]
     movq   m3, [pix_tmp+0x40]
-    TRANSPOSE8x4_STORE  PASS8ROWS(r0, r1, r3, r4)
+    TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)
     lea    r0, [r0+r3*8]
     lea    r1, [r1+r3*8]
...
@@ -492,7 +537,7 @@ cglobal x264_deblock_h_luma_%1, 0,5
     movq   m1, [pix_tmp+0x28]
     movq   m2, [pix_tmp+0x38]
     movq   m3, [pix_tmp+0x48]
-    TRANSPOSE8x4_STORE  PASS8ROWS(r0, r1, r3, r4)
+    TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)
     ADD    esp, pad
     RET
...
@@ -502,22 +547,34 @@ INIT_MMX
 DEBLOCK_LUMA mmxext, v8, 8
 INIT_XMM
 DEBLOCK_LUMA sse2, v, 16
+INIT_AVX
+DEBLOCK_LUMA avx, v, 16
 
 %endif ; ARCH
 
 %macro LUMA_INTRA_P012 4 ; p0..p3 in memory
+%ifdef ARCH_X86_64
+    pavgb t0, p2, p1
+    pavgb t1, p0, q0
+%else
     mova  t0, p2
     mova  t1, p0
     pavgb t0, p1
     pavgb t1, q0
+%endif
     pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
     mova  t5, t1
+%ifdef ARCH_X86_64
+    paddb t2, p2, p1
+    paddb t3, p0, q0
+%else
     mova  t2, p2
     mova  t3, p0
     paddb t2, p1
     paddb t3, q0
+%endif
     paddb t2, t3
     mova  t3, t2
     mova  t4, t2
...
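Per the inline comments, the pavgb/paddb ladder in LUMA_INTRA_P012 assembles the intra strong-filter taps, e.g. p1' = (p2+p1+p0+q0+2)/4; the mpb_1 masks that follow appear to cancel the extra rounding the nested round-up averages introduce. The target value, as a hedged scalar reference:

    #include <stdint.h>

    /* Reference for the value the macro builds: p1' = (p2+p1+p0+q0+2) >> 2 */
    static uint8_t p1_prime(uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0)
    {
        return (uint8_t)((p2 + p1 + p0 + q0 + 2) >> 2);
    }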
@@ -527,10 +584,15 @@ DEBLOCK_LUMA sse2, v, 16
     pand  t2, mpb_1
     psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;
+%ifdef ARCH_X86_64
+    pavgb t1, p2, q1
+    psubb t2, p2, q1
+%else
     mova  t1, p2
     mova  t2, p2
     pavgb t1, q1
     psubb t2, q1
+%endif
     paddb t3, t3
     psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
     pand  t2, mpb_1
...
@@ -543,10 +605,8 @@ DEBLOCK_LUMA sse2, v, 16
     pand  t3, mpb_1
     psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8
-    mova  t3, p0
-    mova  t2, p0
-    pxor  t3, q1
-    pavgb t2, q1
+    pxor  t3, p0, q1
+    pavgb t2, p0, q1
     pand  t3, mpb_1
     psubb t2, t3
     pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4
...
@@ -560,9 +620,8 @@ DEBLOCK_LUMA sse2, v, 16
     mova  %1, t1 ; store p0
     mova  t1, %4 ; p3
-    mova  t2, t1
+    paddb t2, t1, p2
     pavgb t1, p2
-    paddb t2, p2
     pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
     paddb t2, t2
     paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
...
@@ -624,9 +683,9 @@ DEBLOCK_LUMA sse2, v, 16
 %endif
 ;-----------------------------------------------------------------------------
-; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
+; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
+cglobal deblock_%2_luma_intra_%1, 4,6,16
 %ifndef ARCH_X86_64
     sub    esp, 0x60
 %endif
...
@@ -686,9 +745,9 @@ cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
 INIT_MMX
 %ifdef ARCH_X86_64
 ;-----------------------------------------------------------------------------
-; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
+; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal x264_deblock_h_luma_intra_%1, 4,7
+cglobal deblock_h_luma_intra_%1, 4,7
     movsxd r10, r1d
     lea    r11, [r10*3]
     lea    r6,  [r0-4]
...
@@ -704,7 +763,7 @@ cglobal x264_deblock_h_luma_intra_%1, 4,7
     lea    r0, [pix_tmp+0x40]
     mov    r1, 0x10
-    call   x264_deblock_v_luma_intra_%1
+    call   deblock_v_luma_intra_%1
     ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
     lea    r5, [r6+r11]
...
@@ -717,7 +776,7 @@ cglobal x264_deblock_h_luma_intra_%1, 4,7
     add    rsp, 0x88
     RET
 %else
-cglobal x264_deblock_h_luma_intra_%1, 2,4
+cglobal deblock_h_luma_intra_%1, 2,4
     lea    r3, [r1*3]
     sub    r0, 4
     lea    r2, [r0+r3]
...
@@ -736,10 +795,10 @@ cglobal x264_deblock_h_luma_intra_%1, 2,4
     PUSH   dword r2m
     PUSH   dword 16
     PUSH   r0
-    call   x264_deblock_%2_luma_intra_%1
+    call   deblock_%2_luma_intra_%1
 %ifidn %2, v8
     add    dword [rsp], 8 ; pix_tmp+8
-    call   x264_deblock_%2_luma_intra_%1
+    call   deblock_%2_luma_intra_%1
 %endif
     ADD    esp, 16
...
@@ -760,13 +819,13 @@ cglobal x264_deblock_h_luma_intra_%1, 2,4
 INIT_XMM
 DEBLOCK_LUMA_INTRA sse2, v
+INIT_AVX
+DEBLOCK_LUMA_INTRA avx, v
 %ifndef ARCH_X86_64
 INIT_MMX
 DEBLOCK_LUMA_INTRA mmxext, v8
 %endif
 
 INIT_MMX
 
 %macro CHROMA_V_START 0
...
@@ -790,23 +849,23 @@ INIT_MMX
 %define t6 r6
 ;-----------------------------------------------------------------------------
-; void x264_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void ff_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
-cglobal x264_deblock_v_chroma_mmxext, 5,6
+cglobal deblock_v_chroma_mmxext, 5,6
     CHROMA_V_START
     movq  m0, [t5]
     movq  m1, [t5+r1]
     movq  m2, [r0]
     movq  m3, [r0+r1]
-    call  x264_chroma_inter_body_mmxext
+    call  ff_chroma_inter_body_mmxext
     movq  [t5+r1], m1
     movq  [r0], m2
     RET
 ;-----------------------------------------------------------------------------
-; void x264_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void ff_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
-cglobal x264_deblock_h_chroma_mmxext, 5,7
+cglobal deblock_h_chroma_mmxext, 5,7
 %ifdef ARCH_X86_64
     %define buf0 [rsp-24]
     %define buf1 [rsp-16]
...
@@ -815,17 +874,17 @@ cglobal x264_deblock_h_chroma_mmxext, 5,7
     %define buf1 r2m
 %endif
     CHROMA_H_START
-    TRANSPOSE4x8_LOAD   PASS8ROWS(t5, r0, r1, t6)
+    TRANSPOSE4x8_LOAD   bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
     movq  buf0, m0
     movq  buf1, m3
-    call  x264_chroma_inter_body_mmxext
+    call  ff_chroma_inter_body_mmxext
     movq  m0, buf0
     movq  m3, buf1
-    TRANSPOSE8x4_STORE  PASS8ROWS(t5, r0, r1, t6)
+    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
     RET
 
 ALIGN 16
-x264_chroma_inter_body_mmxext:
+ff_chroma_inter_body_mmxext:
     LOAD_MASK  r2d, r3d
     movd       m6, [r4] ; tc0
     punpcklbw  m6, m6
...
@@ -850,31 +909,31 @@ x264_chroma_inter_body_mmxext:
 %define t6 r5
 ;-----------------------------------------------------------------------------
-; void x264_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
+; void ff_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal x264_deblock_v_chroma_intra_mmxext, 4,5
+cglobal deblock_v_chroma_intra_mmxext, 4,5
     CHROMA_V_START
     movq  m0, [t5]
     movq  m1, [t5+r1]
     movq  m2, [r0]
     movq  m3, [r0+r1]
-    call  x264_chroma_intra_body_mmxext
+    call  ff_chroma_intra_body_mmxext
     movq  [t5+r1], m1
     movq  [r0], m2
     RET
 ;-----------------------------------------------------------------------------
-; void x264_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
+; void ff_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal x264_deblock_h_chroma_intra_mmxext, 4,6
+cglobal deblock_h_chroma_intra_mmxext, 4,6
     CHROMA_H_START
-    TRANSPOSE4x8_LOAD   PASS8ROWS(t5, r0, r1, t6)
-    call  x264_chroma_intra_body_mmxext
-    TRANSPOSE8x4_STORE  PASS8ROWS(t5, r0, r1, t6)
+    TRANSPOSE4x8_LOAD   bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
+    call  ff_chroma_intra_body_mmxext
+    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
     RET
 
 ALIGN 16
-x264_chroma_intra_body_mmxext:
+ff_chroma_intra_body_mmxext:
     LOAD_MASK r2d, r3d
     movq m5, m1
     movq m6, m2
...
libavcodec/x86/h264dsp_mmx.c
...
@@ -219,11 +219,11 @@ static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40]
 }
 
 #define LF_FUNC(DIR, TYPE, OPT) \
-void ff_x264_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
-                                                       int alpha, int beta, int8_t *tc0);
+void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
+                                                  int alpha, int beta, int8_t *tc0);
 #define LF_IFUNC(DIR, TYPE, OPT) \
-void ff_x264_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
-                                                       int alpha, int beta);
+void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
+                                                  int alpha, int beta);
 
 LF_FUNC (h, chroma,       mmxext)
 LF_IFUNC(h, chroma_intra, mmxext)
...
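For reference, the ## token pasting means one LF_FUNC invocation expands to one prototype under the new naming; e.g. LF_FUNC(v, luma, sse2) now declares:

    void ff_deblock_v_luma_sse2(uint8_t *pix, int stride,
                                int alpha, int beta, int8_t *tc0);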
@@ -234,18 +234,18 @@ LF_FUNC (h, luma, mmxext)
 LF_IFUNC(h,  luma_intra, mmxext)
 #if HAVE_YASM && ARCH_X86_32
 LF_FUNC (v8, luma,       mmxext)
-static void ff_x264_deblock_v_luma_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+static void ff_deblock_v_luma_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
 {
     if((tc0[0] & tc0[1]) >= 0)
-        ff_x264_deblock_v8_luma_mmxext(pix+0, stride, alpha, beta, tc0);
+        ff_deblock_v8_luma_mmxext(pix+0, stride, alpha, beta, tc0);
     if((tc0[2] & tc0[3]) >= 0)
-        ff_x264_deblock_v8_luma_mmxext(pix+8, stride, alpha, beta, tc0+2);
+        ff_deblock_v8_luma_mmxext(pix+8, stride, alpha, beta, tc0+2);
 }
 LF_IFUNC(v8, luma_intra, mmxext)
-static void ff_x264_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta)
+static void ff_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta)
 {
-    ff_x264_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta);
-    ff_x264_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta);
+    ff_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta);
+    ff_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta);
 }
 #endif
...
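The `(tc0[0] & tc0[1]) >= 0` tests above rely on two's-complement sign bits: the AND of two int8_t values is negative only when both are negative, and a negative tc0 marks a 4-pixel segment as skipped. So each 8-pixel half is filtered unless both of its segments are skipped. A hedged sketch of the predicate:

    #include <stdint.h>

    /* True unless both 4-pixel segments are marked skip (tc0 < 0). */
    static int half_needs_filter(const int8_t *tc0)
    {
        return (tc0[0] & tc0[1]) >= 0;
    }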
@@ -253,6 +253,10 @@ LF_FUNC (h, luma, sse2)
 LF_IFUNC(h, luma_intra, sse2)
 LF_FUNC (v, luma,       sse2)
 LF_IFUNC(v, luma_intra, sse2)
+LF_FUNC (h, luma,       avx)
+LF_IFUNC(h, luma_intra, avx)
+LF_FUNC (v, luma,       avx)
+LF_IFUNC(v, luma_intra, avx)
 
 /***********************************/
 /* weighted prediction */
...
@@ -314,15 +318,15 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
             c->h264_idct_add8       = ff_h264_idct_add8_mmx2;
             c->h264_idct_add16intra = ff_h264_idct_add16intra_mmx2;
 
-            c->h264_v_loop_filter_chroma       = ff_x264_deblock_v_chroma_mmxext;
-            c->h264_h_loop_filter_chroma       = ff_x264_deblock_h_chroma_mmxext;
-            c->h264_v_loop_filter_chroma_intra = ff_x264_deblock_v_chroma_intra_mmxext;
-            c->h264_h_loop_filter_chroma_intra = ff_x264_deblock_h_chroma_intra_mmxext;
+            c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_mmxext;
+            c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma_mmxext;
+            c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_mmxext;
+            c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_mmxext;
 #if ARCH_X86_32
-            c->h264_v_loop_filter_luma       = ff_x264_deblock_v_luma_mmxext;
-            c->h264_h_loop_filter_luma       = ff_x264_deblock_h_luma_mmxext;
-            c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext;
-            c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext;
+            c->h264_v_loop_filter_luma       = ff_deblock_v_luma_mmxext;
+            c->h264_h_loop_filter_luma       = ff_deblock_h_luma_mmxext;
+            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_mmxext;
+            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_mmxext;
 #endif
 
             c->weight_h264_pixels_tab[0] = ff_h264_weight_16x16_mmx2;
             c->weight_h264_pixels_tab[1] = ff_h264_weight_16x8_mmx2;
...
@@ -360,10 +364,10 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
                 c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_sse2;
 
 #if HAVE_ALIGNED_STACK
-                c->h264_v_loop_filter_luma       = ff_x264_deblock_v_luma_sse2;
-                c->h264_h_loop_filter_luma       = ff_x264_deblock_h_luma_sse2;
-                c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_sse2;
-                c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_sse2;
+                c->h264_v_loop_filter_luma       = ff_deblock_v_luma_sse2;
+                c->h264_h_loop_filter_luma       = ff_deblock_h_luma_sse2;
+                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_sse2;
+                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_sse2;
 #endif
 
                 c->h264_idct_add16 = ff_h264_idct_add16_sse2;
...
@@ -377,6 +381,14 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
                     c->biweight_h264_pixels_tab[3] = ff_h264_biweight_8x8_ssse3;
                     c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_ssse3;
                 }
+                if (mm_flags & AV_CPU_FLAG_AVX) {
+#if HAVE_ALIGNED_STACK
+                    c->h264_v_loop_filter_luma       = ff_deblock_v_luma_avx;
+                    c->h264_h_loop_filter_luma       = ff_deblock_h_luma_avx;
+                    c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_avx;
+                    c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_avx;
+#endif
+                }
             }
         }
     }
 #endif
...
libavcodec/x86/x86util.asm
...
@@ -24,16 +24,20 @@
 ;******************************************************************************
 
 %macro SBUTTERFLY 4
+%if avx_enabled == 0
     mova      m%4, m%2
     punpckl%1 m%2, m%3
     punpckh%1 m%4, m%3
+%else
+    punpckh%1 m%4, m%2, m%3
+    punpckl%1 m%2, m%3
+%endif
     SWAP %3, %4
 %endmacro
 
 %macro SBUTTERFLY2 4
-    mova      m%4, m%2
-    punpckh%1 m%2, m%3
-    punpckl%1 m%4, m%3
+    punpckl%1 m%4, m%2, m%3
+    punpckh%1 m%2, m%2, m%3
     SWAP %2, %4, %3
 %endmacro
...
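Both paths of SBUTTERFLY produce the same interleave; the avx_enabled branch just uses the non-destructive three-operand encoding to avoid the mova copy. A hedged scalar model of the wd (16-bit) case, with the post-SWAP register meanings:

    #include <stdint.h>

    /* After SBUTTERFLY wd: lo = interleave of the low halves, hi = of the high halves. */
    static void sbutterfly_wd(const uint16_t a[8], const uint16_t b[8],
                              uint16_t lo[8], uint16_t hi[8])
    {
        for (int i = 0; i < 4; i++) {
            lo[2*i] = a[i];     lo[2*i+1] = b[i];     /* punpcklwd */
            hi[2*i] = a[i+4];   hi[2*i+1] = b[i+4];   /* punpckhwd */
        }
    }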
@@ -444,3 +448,12 @@
 %macro PMINUB_MMXEXT 3 ; dst, src, ignored
     pminub %1, %2
 %endmacro
+
+%macro SPLATW 2-3 0
+%if mmsize == 16
+    pshuflw    %1, %2, (%3)*0x55
+    punpcklqdq %1, %1
+%else
+    pshufw     %1, %2, (%3)*0x55
+%endif
+%endmacro
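The `(%3)*0x55` immediate works because 0x55 is 0b01010101: multiplying by a lane index 0-3 replicates that index into all four 2-bit shuffle fields (0x00, 0x55, 0xAA, 0xFF), so pshufw/pshuflw broadcast word %3 of the source. A hedged scalar model:

    #include <stdint.h>

    /* Broadcast word `lane` (0..3) of src to all lanes, like SPLATW's pshufw. */
    static void splatw(const uint16_t src[4], uint16_t dst[4], int lane)
    {
        uint8_t imm = (uint8_t)(lane * 0x55); /* all four 2-bit fields = lane */
        for (int i = 0; i < 4; i++)
            dst[i] = src[(imm >> (2 * i)) & 3];
    }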