Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
F
ffmpeg.wasm-core
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Linshizhi
ffmpeg.wasm-core
Commits
a5bbb124
Commit
a5bbb124
authored
Jul 28, 2012
by
Ronald S. Bultje
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
h264_loopfilter: port x86 simd to cpuflags.
parent
23565c26
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
120 additions
and
121 deletions
+120
-121
h264_deblock.asm
libavcodec/x86/h264_deblock.asm
+52
-52
h264_deblock_10bit.asm
libavcodec/x86/h264_deblock_10bit.asm
+38
-39
h264dsp_mmx.c
libavcodec/x86/h264dsp_mmx.c
+30
-30
No files found.
libavcodec/x86/h264_deblock.asm
View file @
a5bbb124
...
@@ -282,8 +282,8 @@ cextern pb_A1
...
@@ -282,8 +282,8 @@ cextern pb_A1
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
%macro
DEBLOCK_LUMA
1
%macro
DEBLOCK_LUMA
0
cglobal
deblock_v_luma_8
_
%1
,
5
,
5
,
10
cglobal
deblock_v_luma_8
,
5
,
5
,
10
movd
m8
,
[r4]
; tc0
movd
m8
,
[r4]
; tc0
lea
r4
,
[
r1
*
3
]
lea
r4
,
[
r1
*
3
]
dec
r2d
; alpha-1
dec
r2d
; alpha-1
...
@@ -327,8 +327,8 @@ cglobal deblock_v_luma_8_%1, 5,5,10
...
@@ -327,8 +327,8 @@ cglobal deblock_v_luma_8_%1, 5,5,10
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
INIT_MMX
INIT_MMX
cpuname
cglobal
deblock_h_luma_8
_
%1
,
5
,
9
cglobal
deblock_h_luma_8
,
5
,
9
movsxd
r7
,
r1d
movsxd
r7
,
r1d
lea
r8
,
[
r7
+
r7
*
2
]
lea
r8
,
[
r7
+
r7
*
2
]
lea
r6
,
[
r0
-
4
]
lea
r6
,
[
r0
-
4
]
...
@@ -355,7 +355,7 @@ cglobal deblock_h_luma_8_%1, 5,9
...
@@ -355,7 +355,7 @@ cglobal deblock_h_luma_8_%1, 5,9
%if
WIN64
%if
WIN64
mov
[
rsp
+
0x20
]
,
r4
mov
[
rsp
+
0x20
]
,
r4
%endif
%endif
call
deblock_v_luma_8
_
%1
call
deblock_v_luma_8
; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
add
r6
,
2
add
r6
,
2
...
@@ -384,24 +384,24 @@ cglobal deblock_h_luma_8_%1, 5,9
...
@@ -384,24 +384,24 @@ cglobal deblock_h_luma_8_%1, 5,9
RET
RET
%endmacro
%endmacro
INIT_XMM
INIT_XMM
sse2
DEBLOCK_LUMA
sse2
DEBLOCK_LUMA
INIT_
AVX
INIT_
XMM
avx
DEBLOCK_LUMA
avx
DEBLOCK_LUMA
%else
%else
%macro
DEBLOCK_LUMA
3
%macro
DEBLOCK_LUMA
2
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
cglobal
deblock_
%
2
_luma_8_
%1
,
5
,
5
cglobal
deblock_
%
1
_luma_8
,
5
,
5
lea
r4
,
[
r1
*
3
]
lea
r4
,
[
r1
*
3
]
dec
r2
; alpha-1
dec
r2
; alpha-1
neg
r4
neg
r4
dec
r3
; beta-1
dec
r3
; beta-1
add
r4
,
r0
; pix-3*stride
add
r4
,
r0
; pix-3*stride
%
assign
pad
2
*
%
3
+
12
-
(
stack_offset
&
15
)
%
assign
pad
2
*
%
2
+
12
-
(
stack_offset
&
15
)
SUB
esp
,
pad
SUB
esp
,
pad
mova
m0
,
[
r4
+
r1
]
; p1
mova
m0
,
[
r4
+
r1
]
; p1
...
@@ -415,7 +415,7 @@ cglobal deblock_%2_luma_8_%1, 5,5
...
@@ -415,7 +415,7 @@ cglobal deblock_%2_luma_8_%1, 5,5
movd
m4
,
[r3]
; tc0
movd
m4
,
[r3]
; tc0
punpcklbw
m4
,
m4
punpcklbw
m4
,
m4
punpcklbw
m4
,
m4
; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
punpcklbw
m4
,
m4
; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
mova
[
esp
+
%
3
]
,
m4
; tc
mova
[
esp
+
%
2
]
,
m4
; tc
pcmpgtb
m4
,
m3
pcmpgtb
m4
,
m3
mova
m3
,
[r4]
; p2
mova
m3
,
[r4]
; p2
pand
m4
,
m7
pand
m4
,
m7
...
@@ -423,7 +423,7 @@ cglobal deblock_%2_luma_8_%1, 5,5
...
@@ -423,7 +423,7 @@ cglobal deblock_%2_luma_8_%1, 5,5
DIFF_GT2
m1
,
m3
,
m5
,
m6
,
m7
; |p2-p0| > beta-1
DIFF_GT2
m1
,
m3
,
m5
,
m6
,
m7
; |p2-p0| > beta-1
pand
m6
,
m4
pand
m6
,
m4
pand
m4
,
[
esp
+
%
3
]
; tc
pand
m4
,
[
esp
+
%
2
]
; tc
psubb
m7
,
m4
,
m6
psubb
m7
,
m4
,
m6
pand
m6
,
m4
pand
m6
,
m4
LUMA_Q1
m0
,
m3
,
[r4],
[
r4
+
r1
]
,
m6
,
m4
LUMA_Q1
m0
,
m3
,
[r4],
[
r4
+
r1
]
,
m6
,
m4
...
@@ -431,7 +431,7 @@ cglobal deblock_%2_luma_8_%1, 5,5
...
@@ -431,7 +431,7 @@ cglobal deblock_%2_luma_8_%1, 5,5
mova
m4
,
[
r0
+
2
*
r1
]
; q2
mova
m4
,
[
r0
+
2
*
r1
]
; q2
DIFF_GT2
m2
,
m4
,
m5
,
m6
,
m3
; |q2-q0| > beta-1
DIFF_GT2
m2
,
m4
,
m5
,
m6
,
m3
; |q2-q0| > beta-1
pand
m6
,
[esp]
; mask
pand
m6
,
[esp]
; mask
mova
m5
,
[
esp
+
%
3
]
; tc
mova
m5
,
[
esp
+
%
2
]
; tc
psubb
m7
,
m6
psubb
m7
,
m6
pand
m5
,
m6
pand
m5
,
m6
mova
m3
,
[
r0
+
r1
]
mova
m3
,
[
r0
+
r1
]
...
@@ -446,8 +446,8 @@ cglobal deblock_%2_luma_8_%1, 5,5
...
@@ -446,8 +446,8 @@ cglobal deblock_%2_luma_8_%1, 5,5
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
INIT_MMX
INIT_MMX
cpuname
cglobal
deblock_h_luma_8
_
%1
,
0
,
5
cglobal
deblock_h_luma_8
,
0
,
5
mov
r0
,
r0mp
mov
r0
,
r0mp
mov
r3
,
r1m
mov
r3
,
r1m
lea
r4
,
[
r3
*
3
]
lea
r4
,
[
r3
*
3
]
...
@@ -470,11 +470,11 @@ cglobal deblock_h_luma_8_%1, 0,5
...
@@ -470,11 +470,11 @@ cglobal deblock_h_luma_8_%1, 0,5
PUSH
dword
r2m
PUSH
dword
r2m
PUSH
dword
16
PUSH
dword
16
PUSH
dword
r0
PUSH
dword
r0
call
deblock_
%
2
_luma_8_
%1
call
deblock_
%
1
_luma_8
%ifidn
%
2
,
v8
%ifidn
%
1
,
v8
add
dword
[
esp
]
,
8
; pix_tmp+0x38
add
dword
[
esp
]
,
8
; pix_tmp+0x38
add
dword
[
esp
+
16
]
,
2
; tc0+2
add
dword
[
esp
+
16
]
,
2
; tc0+2
call
deblock_
%
2
_luma_8_
%1
call
deblock_
%
1
_luma_8
%endif
%endif
ADD
esp
,
20
ADD
esp
,
20
...
@@ -501,12 +501,12 @@ cglobal deblock_h_luma_8_%1, 0,5
...
@@ -501,12 +501,12 @@ cglobal deblock_h_luma_8_%1, 0,5
RET
RET
%endmacro
; DEBLOCK_LUMA
%endmacro
; DEBLOCK_LUMA
INIT_MMX
INIT_MMX
mmx2
DEBLOCK_LUMA
mmxext
,
v8
,
8
DEBLOCK_LUMA
v8
,
8
INIT_XMM
INIT_XMM
sse2
DEBLOCK_LUMA
sse2
,
v
,
16
DEBLOCK_LUMA
v
,
16
INIT_
AVX
INIT_
XMM
avx
DEBLOCK_LUMA
avx
,
v
,
16
DEBLOCK_LUMA
v
,
16
%endif
; ARCH
%endif
; ARCH
...
@@ -608,7 +608,7 @@ DEBLOCK_LUMA avx, v, 16
...
@@ -608,7 +608,7 @@ DEBLOCK_LUMA avx, v, 16
%
define
mask1p
mask1q
%
define
mask1p
mask1q
%endmacro
%endmacro
%macro
DEBLOCK_LUMA_INTRA
2
%macro
DEBLOCK_LUMA_INTRA
1
%
define
p1
m0
%
define
p1
m0
%
define
p0
m1
%
define
p0
m1
%
define
q0
m2
%
define
q0
m2
...
@@ -643,7 +643,7 @@ DEBLOCK_LUMA avx, v, 16
...
@@ -643,7 +643,7 @@ DEBLOCK_LUMA avx, v, 16
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
cglobal
deblock_
%
2
_luma_intra_8_
%1
,
4
,
6
,
16
cglobal
deblock_
%
1
_luma_intra_8
,
4
,
6
,
16
%if
ARCH_X86_64
==
0
%if
ARCH_X86_64
==
0
sub
esp
,
0x60
sub
esp
,
0x60
%endif
%endif
...
@@ -700,12 +700,12 @@ cglobal deblock_%2_luma_intra_8_%1, 4,6,16
...
@@ -700,12 +700,12 @@ cglobal deblock_%2_luma_intra_8_%1, 4,6,16
%endif
%endif
RET
RET
INIT_MMX
INIT_MMX
cpuname
%if
ARCH_X86_64
%if
ARCH_X86_64
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
cglobal
deblock_h_luma_intra_8
_
%1
,
4
,
9
cglobal
deblock_h_luma_intra_8
,
4
,
9
movsxd
r7
,
r1d
movsxd
r7
,
r1d
lea
r8
,
[
r7
*
3
]
lea
r8
,
[
r7
*
3
]
lea
r6
,
[
r0
-
4
]
lea
r6
,
[
r0
-
4
]
...
@@ -721,7 +721,7 @@ cglobal deblock_h_luma_intra_8_%1, 4,9
...
@@ -721,7 +721,7 @@ cglobal deblock_h_luma_intra_8_%1, 4,9
lea
r0
,
[
pix_tmp
+
0x40
]
lea
r0
,
[
pix_tmp
+
0x40
]
mov
r1
,
0x10
mov
r1
,
0x10
call
deblock_v_luma_intra_8
_
%1
call
deblock_v_luma_intra_8
; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
lea
r5
,
[
r6
+
r8
]
lea
r5
,
[
r6
+
r8
]
...
@@ -734,7 +734,7 @@ cglobal deblock_h_luma_intra_8_%1, 4,9
...
@@ -734,7 +734,7 @@ cglobal deblock_h_luma_intra_8_%1, 4,9
add
rsp
,
0x88
add
rsp
,
0x88
RET
RET
%else
%else
cglobal
deblock_h_luma_intra_8
_
%1
,
2
,
4
cglobal
deblock_h_luma_intra_8
,
2
,
4
lea
r3
,
[
r1
*
3
]
lea
r3
,
[
r1
*
3
]
sub
r0
,
4
sub
r0
,
4
lea
r2
,
[
r0
+
r3
]
lea
r2
,
[
r0
+
r3
]
...
@@ -753,10 +753,10 @@ cglobal deblock_h_luma_intra_8_%1, 2,4
...
@@ -753,10 +753,10 @@ cglobal deblock_h_luma_intra_8_%1, 2,4
PUSH
dword
r2m
PUSH
dword
r2m
PUSH
dword
16
PUSH
dword
16
PUSH
r0
PUSH
r0
call
deblock_
%
2
_luma_intra_8_
%1
call
deblock_
%
1
_luma_intra_8
%ifidn
%
2
,
v8
%ifidn
%
1
,
v8
add
dword
[rsp],
8
; pix_tmp+8
add
dword
[rsp],
8
; pix_tmp+8
call
deblock_
%
2
_luma_intra_8_
%1
call
deblock_
%
1
_luma_intra_8
%endif
%endif
ADD
esp
,
16
ADD
esp
,
16
...
@@ -775,16 +775,16 @@ cglobal deblock_h_luma_intra_8_%1, 2,4
...
@@ -775,16 +775,16 @@ cglobal deblock_h_luma_intra_8_%1, 2,4
%endif
; ARCH_X86_64
%endif
; ARCH_X86_64
%endmacro
; DEBLOCK_LUMA_INTRA
%endmacro
; DEBLOCK_LUMA_INTRA
INIT_XMM
INIT_XMM
sse2
DEBLOCK_LUMA_INTRA
sse2
,
v
DEBLOCK_LUMA_INTRA
v
INIT_
AVX
INIT_
XMM
avx
DEBLOCK_LUMA_INTRA
avx
,
v
DEBLOCK_LUMA_INTRA
v
%if
ARCH_X86_64
==
0
%if
ARCH_X86_64
==
0
INIT_MMX
INIT_MMX
mmx2
DEBLOCK_LUMA_INTRA
mmxext
,
v8
DEBLOCK_LUMA_INTRA
v8
%endif
%endif
INIT_MMX
INIT_MMX
mmx2
%macro
CHROMA_V_START
0
%macro
CHROMA_V_START
0
dec
r2d
; alpha-1
dec
r2d
; alpha-1
...
@@ -809,13 +809,13 @@ INIT_MMX
...
@@ -809,13 +809,13 @@ INIT_MMX
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; void ff_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
; void ff_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
cglobal
deblock_v_chroma_8
_mmxext
,
5
,
6
cglobal
deblock_v_chroma_8
,
5
,
6
CHROMA_V_START
CHROMA_V_START
movq
m0
,
[t5]
movq
m0
,
[t5]
movq
m1
,
[
t5
+
r1
]
movq
m1
,
[
t5
+
r1
]
movq
m2
,
[r0]
movq
m2
,
[r0]
movq
m3
,
[
r0
+
r1
]
movq
m3
,
[
r0
+
r1
]
call
ff_chroma_inter_body_mmx
ext
call
ff_chroma_inter_body_mmx
2
movq
[
t5
+
r1
]
,
m1
movq
[
t5
+
r1
]
,
m1
movq
[r0],
m2
movq
[r0],
m2
RET
RET
...
@@ -823,7 +823,7 @@ cglobal deblock_v_chroma_8_mmxext, 5,6
...
@@ -823,7 +823,7 @@ cglobal deblock_v_chroma_8_mmxext, 5,6
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; void ff_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
; void ff_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
cglobal
deblock_h_chroma_8
_mmxext
,
5
,
7
cglobal
deblock_h_chroma_8
,
5
,
7
%if
UNIX64
%if
UNIX64
%
define
buf0
[
rsp
-
24
]
%
define
buf0
[
rsp
-
24
]
%
define
buf1
[
rsp
-
16
]
%
define
buf1
[
rsp
-
16
]
...
@@ -839,7 +839,7 @@ cglobal deblock_h_chroma_8_mmxext, 5,7
...
@@ -839,7 +839,7 @@ cglobal deblock_h_chroma_8_mmxext, 5,7
TRANSPOSE4x8_LOAD
bw
,
wd
,
dq
,
PASS8ROWS
(
t5
,
r0
,
r1
,
t6
)
TRANSPOSE4x8_LOAD
bw
,
wd
,
dq
,
PASS8ROWS
(
t5
,
r0
,
r1
,
t6
)
movq
buf0
,
m0
movq
buf0
,
m0
movq
buf1
,
m3
movq
buf1
,
m3
call
ff_chroma_inter_body_mmx
ext
call
ff_chroma_inter_body_mmx
2
movq
m0
,
buf0
movq
m0
,
buf0
movq
m3
,
buf1
movq
m3
,
buf1
TRANSPOSE8x4B_STORE
PASS8ROWS
(
t5
,
r0
,
r1
,
t6
)
TRANSPOSE8x4B_STORE
PASS8ROWS
(
t5
,
r0
,
r1
,
t6
)
...
@@ -849,7 +849,7 @@ cglobal deblock_h_chroma_8_mmxext, 5,7
...
@@ -849,7 +849,7 @@ cglobal deblock_h_chroma_8_mmxext, 5,7
RET
RET
ALIGN
16
ALIGN
16
ff_chroma_inter_body_mmx
ext
:
ff_chroma_inter_body_mmx
2
:
LOAD_MASK
r2d
,
r3d
LOAD_MASK
r2d
,
r3d
movd
m6
,
[r4]
; tc0
movd
m6
,
[r4]
; tc0
punpcklbw
m6
,
m6
punpcklbw
m6
,
m6
...
@@ -876,13 +876,13 @@ ff_chroma_inter_body_mmxext:
...
@@ -876,13 +876,13 @@ ff_chroma_inter_body_mmxext:
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; void ff_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
; void ff_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
cglobal
deblock_v_chroma_intra_8
_mmxext
,
4
,
5
cglobal
deblock_v_chroma_intra_8
,
4
,
5
CHROMA_V_START
CHROMA_V_START
movq
m0
,
[t5]
movq
m0
,
[t5]
movq
m1
,
[
t5
+
r1
]
movq
m1
,
[
t5
+
r1
]
movq
m2
,
[r0]
movq
m2
,
[r0]
movq
m3
,
[
r0
+
r1
]
movq
m3
,
[
r0
+
r1
]
call
ff_chroma_intra_body_mmx
ext
call
ff_chroma_intra_body_mmx
2
movq
[
t5
+
r1
]
,
m1
movq
[
t5
+
r1
]
,
m1
movq
[r0],
m2
movq
[r0],
m2
RET
RET
...
@@ -890,15 +890,15 @@ cglobal deblock_v_chroma_intra_8_mmxext, 4,5
...
@@ -890,15 +890,15 @@ cglobal deblock_v_chroma_intra_8_mmxext, 4,5
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; void ff_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
; void ff_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
cglobal
deblock_h_chroma_intra_8
_mmxext
,
4
,
6
cglobal
deblock_h_chroma_intra_8
,
4
,
6
CHROMA_H_START
CHROMA_H_START
TRANSPOSE4x8_LOAD
bw
,
wd
,
dq
,
PASS8ROWS
(
t5
,
r0
,
r1
,
t6
)
TRANSPOSE4x8_LOAD
bw
,
wd
,
dq
,
PASS8ROWS
(
t5
,
r0
,
r1
,
t6
)
call
ff_chroma_intra_body_mmx
ext
call
ff_chroma_intra_body_mmx
2
TRANSPOSE8x4B_STORE
PASS8ROWS
(
t5
,
r0
,
r1
,
t6
)
TRANSPOSE8x4B_STORE
PASS8ROWS
(
t5
,
r0
,
r1
,
t6
)
RET
RET
ALIGN
16
ALIGN
16
ff_chroma_intra_body_mmx
ext
:
ff_chroma_intra_body_mmx
2
:
LOAD_MASK
r2d
,
r3d
LOAD_MASK
r2d
,
r3d
movq
m5
,
m1
movq
m5
,
m1
movq
m6
,
m2
movq
m6
,
m2
...
...
libavcodec/x86/h264_deblock_10bit.asm
View file @
a5bbb124
...
@@ -151,11 +151,11 @@ cextern pw_4
...
@@ -151,11 +151,11 @@ cextern pw_4
%endif
%endif
%endmacro
%endmacro
%macro
DEBLOCK_LUMA
1
%macro
DEBLOCK_LUMA
0
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; void deblock_v_luma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
; void deblock_v_luma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
cglobal
deblock_v_luma_10
_
%1
,
5
,
5
,
8
*
(
mmsize
/
16
)
cglobal
deblock_v_luma_10
,
5
,
5
,
8
*
(
mmsize
/
16
)
%
assign
pad
5
*
mmsize
+
12
-
(
stack_offset
&
15
)
%
assign
pad
5
*
mmsize
+
12
-
(
stack_offset
&
15
)
%
define
tcm
[rsp]
%
define
tcm
[rsp]
%
define
ms1
[
rsp
+
mmsize
]
%
define
ms1
[
rsp
+
mmsize
]
...
@@ -210,7 +210,7 @@ cglobal deblock_v_luma_10_%1, 5,5,8*(mmsize/16)
...
@@ -210,7 +210,7 @@ cglobal deblock_v_luma_10_%1, 5,5,8*(mmsize/16)
ADD
rsp
,
pad
ADD
rsp
,
pad
RET
RET
cglobal
deblock_h_luma_10
_
%1
,
5
,
6
,
8
*
(
mmsize
/
16
)
cglobal
deblock_h_luma_10
,
5
,
6
,
8
*
(
mmsize
/
16
)
%
assign
pad
7
*
mmsize
+
12
-
(
stack_offset
&
15
)
%
assign
pad
7
*
mmsize
+
12
-
(
stack_offset
&
15
)
%
define
tcm
[rsp]
%
define
tcm
[rsp]
%
define
ms1
[
rsp
+
mmsize
]
%
define
ms1
[
rsp
+
mmsize
]
...
@@ -301,7 +301,6 @@ cglobal deblock_h_luma_10_%1, 5,6,8*(mmsize/16)
...
@@ -301,7 +301,6 @@ cglobal deblock_h_luma_10_%1, 5,6,8*(mmsize/16)
RET
RET
%endmacro
%endmacro
INIT_XMM
%if
ARCH_X86_64
%if
ARCH_X86_64
; in: m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2
; in: m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2
; m12=alpha, m13=beta
; m12=alpha, m13=beta
...
@@ -339,8 +338,8 @@ INIT_XMM
...
@@ -339,8 +338,8 @@ INIT_XMM
SWAP
3
,
9
SWAP
3
,
9
%endmacro
%endmacro
%macro
DEBLOCK_LUMA_64
1
%macro
DEBLOCK_LUMA_64
0
cglobal
deblock_v_luma_10
_
%1
,
5
,
5
,
15
cglobal
deblock_v_luma_10
,
5
,
5
,
15
%
define
p2
m8
%
define
p2
m8
%
define
p1
m0
%
define
p1
m0
%
define
p0
m1
%
define
p0
m1
...
@@ -377,7 +376,7 @@ cglobal deblock_v_luma_10_%1, 5,5,15
...
@@ -377,7 +376,7 @@ cglobal deblock_v_luma_10_%1, 5,5,15
jg
.
loop
jg
.
loop
REP_RET
REP_RET
cglobal
deblock_h_luma_10
_
%1
,
5
,
7
,
15
cglobal
deblock_h_luma_10
,
5
,
7
,
15
shl
r2d
,
2
shl
r2d
,
2
shl
r3d
,
2
shl
r3d
,
2
LOAD_AB
m12
,
m13
,
r2
,
r3
LOAD_AB
m12
,
m13
,
r2
,
r3
...
@@ -417,10 +416,10 @@ cglobal deblock_h_luma_10_%1, 5,7,15
...
@@ -417,10 +416,10 @@ cglobal deblock_h_luma_10_%1, 5,7,15
REP_RET
REP_RET
%endmacro
%endmacro
INIT_XMM
INIT_XMM
sse2
DEBLOCK_LUMA_64
sse2
DEBLOCK_LUMA_64
INIT_
AVX
INIT_
XMM
avx
DEBLOCK_LUMA_64
avx
DEBLOCK_LUMA_64
%endif
%endif
%macro
SWAPMOVA
2
%macro
SWAPMOVA
2
...
@@ -602,8 +601,8 @@ DEBLOCK_LUMA_64 avx
...
@@ -602,8 +601,8 @@ DEBLOCK_LUMA_64 avx
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
%macro
DEBLOCK_LUMA_INTRA_64
1
%macro
DEBLOCK_LUMA_INTRA_64
0
cglobal
deblock_v_luma_intra_10
_
%1
,
4
,
7
,
16
cglobal
deblock_v_luma_intra_10
,
4
,
7
,
16
%
define
t0
m1
%
define
t0
m1
%
define
t1
m2
%
define
t1
m2
%
define
t2
m4
%
define
t2
m4
...
@@ -653,7 +652,7 @@ cglobal deblock_v_luma_intra_10_%1, 4,7,16
...
@@ -653,7 +652,7 @@ cglobal deblock_v_luma_intra_10_%1, 4,7,16
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
cglobal
deblock_h_luma_intra_10
_
%1
,
4
,
7
,
16
cglobal
deblock_h_luma_intra_10
,
4
,
7
,
16
%
define
t0
m15
%
define
t0
m15
%
define
t1
m14
%
define
t1
m14
%
define
t2
m2
%
define
t2
m2
...
@@ -712,18 +711,18 @@ cglobal deblock_h_luma_intra_10_%1, 4,7,16
...
@@ -712,18 +711,18 @@ cglobal deblock_h_luma_intra_10_%1, 4,7,16
RET
RET
%endmacro
%endmacro
INIT_XMM
INIT_XMM
sse2
DEBLOCK_LUMA_INTRA_64
sse2
DEBLOCK_LUMA_INTRA_64
INIT_
AVX
INIT_
XMM
avx
DEBLOCK_LUMA_INTRA_64
avx
DEBLOCK_LUMA_INTRA_64
%endif
%endif
%macro
DEBLOCK_LUMA_INTRA
1
%macro
DEBLOCK_LUMA_INTRA
0
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
cglobal
deblock_v_luma_intra_10
_
%1
,
4
,
7
,
8
*
(
mmsize
/
16
)
cglobal
deblock_v_luma_intra_10
,
4
,
7
,
8
*
(
mmsize
/
16
)
LUMA_INTRA_INIT
3
LUMA_INTRA_INIT
3
lea
r4
,
[
r1
*
4
]
lea
r4
,
[
r1
*
4
]
lea
r5
,
[
r1
*
3
]
lea
r5
,
[
r1
*
3
]
...
@@ -751,7 +750,7 @@ cglobal deblock_v_luma_intra_10_%1, 4,7,8*(mmsize/16)
...
@@ -751,7 +750,7 @@ cglobal deblock_v_luma_intra_10_%1, 4,7,8*(mmsize/16)
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
cglobal
deblock_h_luma_intra_10
_
%1
,
4
,
7
,
8
*
(
mmsize
/
16
)
cglobal
deblock_h_luma_intra_10
,
4
,
7
,
8
*
(
mmsize
/
16
)
LUMA_INTRA_INIT
8
LUMA_INTRA_INIT
8
%if
mmsize
==
8
%if
mmsize
==
8
lea
r4
,
[
r1
*
3
]
lea
r4
,
[
r1
*
3
]
...
@@ -793,15 +792,15 @@ cglobal deblock_h_luma_intra_10_%1, 4,7,8*(mmsize/16)
...
@@ -793,15 +792,15 @@ cglobal deblock_h_luma_intra_10_%1, 4,7,8*(mmsize/16)
%endmacro
%endmacro
%if
ARCH_X86_64
==
0
%if
ARCH_X86_64
==
0
INIT_MMX
INIT_MMX
mmx2
DEBLOCK_LUMA
mmxext
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
mmxext
DEBLOCK_LUMA_INTRA
INIT_XMM
INIT_XMM
sse2
DEBLOCK_LUMA
sse2
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
sse2
DEBLOCK_LUMA_INTRA
INIT_
AVX
INIT_
XMM
avx
DEBLOCK_LUMA
avx
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
avx
DEBLOCK_LUMA_INTRA
%endif
%endif
; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
...
@@ -843,11 +842,11 @@ DEBLOCK_LUMA_INTRA avx
...
@@ -843,11 +842,11 @@ DEBLOCK_LUMA_INTRA avx
psraw
%1
,
6
psraw
%1
,
6
%endmacro
%endmacro
%macro
DEBLOCK_CHROMA
1
%macro
DEBLOCK_CHROMA
0
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; void deblock_v_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
; void deblock_v_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
cglobal
deblock_v_chroma_10
_
%1
,
5
,
7
-
(
mmsize
/
16
),
8
*
(
mmsize
/
16
)
cglobal
deblock_v_chroma_10
,
5
,
7
-
(
mmsize
/
16
),
8
*
(
mmsize
/
16
)
mov
r5
,
r0
mov
r5
,
r0
sub
r0
,
r1
sub
r0
,
r1
sub
r0
,
r1
sub
r0
,
r1
...
@@ -881,7 +880,7 @@ cglobal deblock_v_chroma_10_%1, 5,7-(mmsize/16),8*(mmsize/16)
...
@@ -881,7 +880,7 @@ cglobal deblock_v_chroma_10_%1, 5,7-(mmsize/16),8*(mmsize/16)
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; void deblock_v_chroma_intra( uint16_t *pix, int stride, int alpha, int beta )
; void deblock_v_chroma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
cglobal
deblock_v_chroma_intra_10
_
%1
,
4
,
6
-
(
mmsize
/
16
),
8
*
(
mmsize
/
16
)
cglobal
deblock_v_chroma_intra_10
,
4
,
6
-
(
mmsize
/
16
),
8
*
(
mmsize
/
16
)
mov
r4
,
r0
mov
r4
,
r0
sub
r0
,
r1
sub
r0
,
r1
sub
r0
,
r1
sub
r0
,
r1
...
@@ -908,10 +907,10 @@ cglobal deblock_v_chroma_intra_10_%1, 4,6-(mmsize/16),8*(mmsize/16)
...
@@ -908,10 +907,10 @@ cglobal deblock_v_chroma_intra_10_%1, 4,6-(mmsize/16),8*(mmsize/16)
%endmacro
%endmacro
%if
ARCH_X86_64
==
0
%if
ARCH_X86_64
==
0
INIT_MMX
INIT_MMX
mmx2
DEBLOCK_CHROMA
mmxext
DEBLOCK_CHROMA
%endif
%endif
INIT_XMM
INIT_XMM
sse2
DEBLOCK_CHROMA
sse2
DEBLOCK_CHROMA
INIT_
AVX
INIT_
XMM
avx
DEBLOCK_CHROMA
avx
DEBLOCK_CHROMA
libavcodec/x86/h264dsp_mmx.c
View file @
a5bbb124
...
@@ -249,12 +249,12 @@ void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *pix, in
...
@@ -249,12 +249,12 @@ void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *pix, in
int alpha, int beta);
int alpha, int beta);
#define LF_FUNCS(type, depth)\
#define LF_FUNCS(type, depth)\
LF_FUNC (h, chroma, depth, mmx
ext
)\
LF_FUNC (h, chroma, depth, mmx
2
)\
LF_IFUNC(h, chroma_intra, depth, mmx
ext
)\
LF_IFUNC(h, chroma_intra, depth, mmx
2
)\
LF_FUNC (v, chroma, depth, mmx
ext
)\
LF_FUNC (v, chroma, depth, mmx
2
)\
LF_IFUNC(v, chroma_intra, depth, mmx
ext
)\
LF_IFUNC(v, chroma_intra, depth, mmx
2
)\
LF_FUNC (h, luma, depth, mmx
ext
)\
LF_FUNC (h, luma, depth, mmx
2
)\
LF_IFUNC(h, luma_intra, depth, mmx
ext
)\
LF_IFUNC(h, luma_intra, depth, mmx
2
)\
LF_FUNC (h, luma, depth, sse2)\
LF_FUNC (h, luma, depth, sse2)\
LF_IFUNC(h, luma_intra, depth, sse2)\
LF_IFUNC(h, luma_intra, depth, sse2)\
LF_FUNC (v, luma, depth, sse2)\
LF_FUNC (v, luma, depth, sse2)\
...
@@ -276,24 +276,24 @@ LF_FUNCS( uint8_t, 8)
...
@@ -276,24 +276,24 @@ LF_FUNCS( uint8_t, 8)
LF_FUNCS
(
uint16_t
,
10
)
LF_FUNCS
(
uint16_t
,
10
)
#if ARCH_X86_32
#if ARCH_X86_32
LF_FUNC
(
v8
,
luma
,
8
,
mmx
ext
)
LF_FUNC
(
v8
,
luma
,
8
,
mmx
2
)
static
void
ff_deblock_v_luma_8_mmx
ext
(
uint8_t
*
pix
,
int
stride
,
int
alpha
,
int
beta
,
int8_t
*
tc0
)
static
void
ff_deblock_v_luma_8_mmx
2
(
uint8_t
*
pix
,
int
stride
,
int
alpha
,
int
beta
,
int8_t
*
tc0
)
{
{
if
((
tc0
[
0
]
&
tc0
[
1
])
>=
0
)
if
((
tc0
[
0
]
&
tc0
[
1
])
>=
0
)
ff_deblock_v8_luma_8_mmx
ext
(
pix
+
0
,
stride
,
alpha
,
beta
,
tc0
);
ff_deblock_v8_luma_8_mmx
2
(
pix
+
0
,
stride
,
alpha
,
beta
,
tc0
);
if
((
tc0
[
2
]
&
tc0
[
3
])
>=
0
)
if
((
tc0
[
2
]
&
tc0
[
3
])
>=
0
)
ff_deblock_v8_luma_8_mmx
ext
(
pix
+
8
,
stride
,
alpha
,
beta
,
tc0
+
2
);
ff_deblock_v8_luma_8_mmx
2
(
pix
+
8
,
stride
,
alpha
,
beta
,
tc0
+
2
);
}
}
LF_IFUNC
(
v8
,
luma_intra
,
8
,
mmx
ext
)
LF_IFUNC
(
v8
,
luma_intra
,
8
,
mmx
2
)
static
void
ff_deblock_v_luma_intra_8_mmx
ext
(
uint8_t
*
pix
,
int
stride
,
int
alpha
,
int
beta
)
static
void
ff_deblock_v_luma_intra_8_mmx
2
(
uint8_t
*
pix
,
int
stride
,
int
alpha
,
int
beta
)
{
{
ff_deblock_v8_luma_intra_8_mmx
ext
(
pix
+
0
,
stride
,
alpha
,
beta
);
ff_deblock_v8_luma_intra_8_mmx
2
(
pix
+
0
,
stride
,
alpha
,
beta
);
ff_deblock_v8_luma_intra_8_mmx
ext
(
pix
+
8
,
stride
,
alpha
,
beta
);
ff_deblock_v8_luma_intra_8_mmx
2
(
pix
+
8
,
stride
,
alpha
,
beta
);
}
}
#endif
/* ARCH_X86_32 */
#endif
/* ARCH_X86_32 */
LF_FUNC
(
v
,
luma
,
10
,
mmx
ext
)
LF_FUNC
(
v
,
luma
,
10
,
mmx
2
)
LF_IFUNC
(
v
,
luma_intra
,
10
,
mmx
ext
)
LF_IFUNC
(
v
,
luma_intra
,
10
,
mmx
2
)
/***********************************/
/***********************************/
/* weighted prediction */
/* weighted prediction */
...
@@ -373,17 +373,17 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
...
@@ -373,17 +373,17 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
c
->
h264_idct_add8
=
ff_h264_idct_add8_8_mmx2
;
c
->
h264_idct_add8
=
ff_h264_idct_add8_8_mmx2
;
c
->
h264_idct_add16intra
=
ff_h264_idct_add16intra_8_mmx2
;
c
->
h264_idct_add16intra
=
ff_h264_idct_add16intra_8_mmx2
;
c
->
h264_v_loop_filter_chroma
=
ff_deblock_v_chroma_8_mmx
ext
;
c
->
h264_v_loop_filter_chroma
=
ff_deblock_v_chroma_8_mmx
2
;
c
->
h264_v_loop_filter_chroma_intra
=
ff_deblock_v_chroma_intra_8_mmx
ext
;
c
->
h264_v_loop_filter_chroma_intra
=
ff_deblock_v_chroma_intra_8_mmx
2
;
if
(
chroma_format_idc
==
1
)
{
if
(
chroma_format_idc
==
1
)
{
c
->
h264_h_loop_filter_chroma
=
ff_deblock_h_chroma_8_mmx
ext
;
c
->
h264_h_loop_filter_chroma
=
ff_deblock_h_chroma_8_mmx
2
;
c
->
h264_h_loop_filter_chroma_intra
=
ff_deblock_h_chroma_intra_8_mmx
ext
;
c
->
h264_h_loop_filter_chroma_intra
=
ff_deblock_h_chroma_intra_8_mmx
2
;
}
}
#if ARCH_X86_32
#if ARCH_X86_32
c
->
h264_v_loop_filter_luma
=
ff_deblock_v_luma_8_mmx
ext
;
c
->
h264_v_loop_filter_luma
=
ff_deblock_v_luma_8_mmx
2
;
c
->
h264_h_loop_filter_luma
=
ff_deblock_h_luma_8_mmx
ext
;
c
->
h264_h_loop_filter_luma
=
ff_deblock_h_luma_8_mmx
2
;
c
->
h264_v_loop_filter_luma_intra
=
ff_deblock_v_luma_intra_8_mmx
ext
;
c
->
h264_v_loop_filter_luma_intra
=
ff_deblock_v_luma_intra_8_mmx
2
;
c
->
h264_h_loop_filter_luma_intra
=
ff_deblock_h_luma_intra_8_mmx
ext
;
c
->
h264_h_loop_filter_luma_intra
=
ff_deblock_h_luma_intra_8_mmx
2
;
#endif
#endif
c
->
weight_h264_pixels_tab
[
0
]
=
ff_h264_weight_16_mmx2
;
c
->
weight_h264_pixels_tab
[
0
]
=
ff_h264_weight_16_mmx2
;
c
->
weight_h264_pixels_tab
[
1
]
=
ff_h264_weight_8_mmx2
;
c
->
weight_h264_pixels_tab
[
1
]
=
ff_h264_weight_8_mmx2
;
...
@@ -436,12 +436,12 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
...
@@ -436,12 +436,12 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
if
(
mm_flags
&
AV_CPU_FLAG_MMX
)
{
if
(
mm_flags
&
AV_CPU_FLAG_MMX
)
{
if
(
mm_flags
&
AV_CPU_FLAG_MMX2
)
{
if
(
mm_flags
&
AV_CPU_FLAG_MMX2
)
{
#if ARCH_X86_32
#if ARCH_X86_32
c
->
h264_v_loop_filter_chroma
=
ff_deblock_v_chroma_10_mmx
ext
;
c
->
h264_v_loop_filter_chroma
=
ff_deblock_v_chroma_10_mmx
2
;
c
->
h264_v_loop_filter_chroma_intra
=
ff_deblock_v_chroma_intra_10_mmx
ext
;
c
->
h264_v_loop_filter_chroma_intra
=
ff_deblock_v_chroma_intra_10_mmx
2
;
c
->
h264_v_loop_filter_luma
=
ff_deblock_v_luma_10_mmx
ext
;
c
->
h264_v_loop_filter_luma
=
ff_deblock_v_luma_10_mmx
2
;
c
->
h264_h_loop_filter_luma
=
ff_deblock_h_luma_10_mmx
ext
;
c
->
h264_h_loop_filter_luma
=
ff_deblock_h_luma_10_mmx
2
;
c
->
h264_v_loop_filter_luma_intra
=
ff_deblock_v_luma_intra_10_mmx
ext
;
c
->
h264_v_loop_filter_luma_intra
=
ff_deblock_v_luma_intra_10_mmx
2
;
c
->
h264_h_loop_filter_luma_intra
=
ff_deblock_h_luma_intra_10_mmx
ext
;
c
->
h264_h_loop_filter_luma_intra
=
ff_deblock_h_luma_intra_10_mmx
2
;
#endif
#endif
c
->
h264_idct_dc_add
=
ff_h264_idct_dc_add_10_mmx2
;
c
->
h264_idct_dc_add
=
ff_h264_idct_dc_add_10_mmx2
;
if
(
mm_flags
&
AV_CPU_FLAG_SSE2
)
{
if
(
mm_flags
&
AV_CPU_FLAG_SSE2
)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment