ffmpeg.wasm-core / commit 9f3d6ca4

Commit 9f3d6ca4, authored May 10, 2011 by Jason Garrett-Glaser
parent 8ad77b65

    Port x86 10-bit H.264 deblock asm from x264

Showing 6 changed files with 909 additions and 65 deletions (+909, -65)
libavcodec/x86/Makefile                  +1   -0
libavcodec/x86/dsputil_mmx.c             +1   -0
libavcodec/x86/h264_deblock.asm          +17  -17
libavcodec/x86/h264_deblock_10bit.asm    +804 -0
libavcodec/x86/h264dsp_mmx.c             +81  -48
libavcodec/x86/x86util.asm               +5   -0
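Before the per-file diffs, one note on the math being ported: the new 10-bit code applies the same H.264 loop-filter arithmetic as the 8-bit paths, only on 16-bit pixels. As a rough scalar sketch of the normal (inter) p0/q0 update that the DEBLOCK_P0_Q0 macro in the new file vectorizes (helper name and layout are illustrative, not part of the commit):

#include <stdint.h>
#include "libavutil/common.h"   /* av_clip() */

/* Scalar sketch of the clipped-delta filter: the SIMD macro computes
 * delta = ((q0 - p0)*4 + (p1 - q1) + 4) >> 3, clips it to [-tc, tc],
 * then clamps p0+delta and q0-delta to [0, pixel_max], where
 * pixel_max = (1 << 10) - 1 for 10-bit (cf. pw_pixel_max below). */
static void deblock_p0_q0_scalar(uint16_t *p0, uint16_t *q0,
                                 int p1, int q1, int tc, int pixel_max)
{
    int delta = av_clip(((*q0 - *p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc);
    *p0 = av_clip(*p0 + delta, 0, pixel_max);
    *q0 = av_clip(*q0 - delta, 0, pixel_max);
}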
libavcodec/x86/Makefile

@@ -9,6 +9,7 @@ YASM-OBJS-$(CONFIG_FFT) += x86/fft_mmx.o \
 MMX-OBJS-$(CONFIG_H264DSP)             += x86/h264dsp_mmx.o
 YASM-OBJS-$(CONFIG_H264DSP)            += x86/h264_deblock.o       \
+                                          x86/h264_deblock_10bit.o \
                                           x86/h264_weight.o        \
                                           x86/h264_idct.o          \
libavcodec/x86/dsputil_mmx.c

@@ -43,6 +43,7 @@ DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
 { 0x8000000080000000ULL, 0x8000000080000000ULL };

 DECLARE_ALIGNED(8,  const uint64_t, ff_pw_1 ) = 0x0001000100010001ULL;
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_2 ) = { 0x0002000200020002ULL, 0x0002000200020002ULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_3 ) = { 0x0003000300030003ULL, 0x0003000300030003ULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_4 ) = { 0x0004000400040004ULL, 0x0004000400040004ULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_5 ) = { 0x0005000500050005ULL, 0x0005000500050005ULL };
libavcodec/x86/h264_deblock.asm

@@ -324,7 +324,7 @@ cextern pb_A1
 ; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
 %macro DEBLOCK_LUMA 1
-cglobal deblock_v_luma_%1, 5,5,10
+cglobal deblock_v_luma_8_%1, 5,5,10
     movd    m8, [r4] ; tc0
     lea     r4, [r1*3]
     dec     r2d        ; alpha-1

@@ -369,7 +369,7 @@ cglobal deblock_v_luma_%1, 5,5,10
 ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
 INIT_MMX
-cglobal deblock_h_luma_%1, 5,7
+cglobal deblock_h_luma_8_%1, 5,7
     movsxd r10, r1d
     lea    r11, [r10+r10*2]
     lea    r6,  [r0-4]

@@ -396,7 +396,7 @@ cglobal deblock_h_luma_%1, 5,7
 %ifdef WIN64
     mov    [rsp+0x20], r4
 %endif
-    call   deblock_v_luma_%1
+    call   deblock_v_luma_8_%1
 ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
     add    r6, 2

@@ -436,7 +436,7 @@ DEBLOCK_LUMA avx
 ;-----------------------------------------------------------------------------
 ; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
-cglobal deblock_%2_luma_%1, 5,5
+cglobal deblock_%2_luma_8_%1, 5,5
     lea     r4, [r1*3]
     dec     r2     ; alpha-1
     neg     r4

@@ -489,7 +489,7 @@ cglobal deblock_%2_luma_%1, 5,5
 ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
 INIT_MMX
-cglobal deblock_h_luma_%1, 0,5
+cglobal deblock_h_luma_8_%1, 0,5
     mov    r0, r0mp
     mov    r3, r1m
     lea    r4, [r3*3]

@@ -512,11 +512,11 @@ cglobal deblock_h_luma_%1, 0,5
     PUSH   dword r2m
     PUSH   dword 16
     PUSH   dword r0
-    call   deblock_%2_luma_%1
+    call   deblock_%2_luma_8_%1
 %ifidn %2, v8
     add    dword [esp   ], 8 ; pix_tmp+0x38
     add    dword [esp+16], 2 ; tc0+2
-    call   deblock_%2_luma_%1
+    call   deblock_%2_luma_8_%1
 %endif
     ADD    esp, 20

@@ -685,7 +685,7 @@ DEBLOCK_LUMA avx, v, 16
 ;-----------------------------------------------------------------------------
 ; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal deblock_%2_luma_intra_%1, 4,6,16
+cglobal deblock_%2_luma_intra_8_%1, 4,6,16
 %ifndef ARCH_X86_64
     sub     esp, 0x60
 %endif

@@ -747,7 +747,7 @@ INIT_MMX
 ;-----------------------------------------------------------------------------
 ; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal deblock_h_luma_intra_%1, 4,7
+cglobal deblock_h_luma_intra_8_%1, 4,7
     movsxd r10, r1d
     lea    r11, [r10*3]
     lea    r6,  [r0-4]

@@ -763,7 +763,7 @@ cglobal deblock_h_luma_intra_%1, 4,7
     lea    r0,  [pix_tmp+0x40]
     mov    r1,  0x10
-    call   deblock_v_luma_intra_%1
+    call   deblock_v_luma_intra_8_%1
 ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
     lea    r5, [r6+r11]

@@ -776,7 +776,7 @@ cglobal deblock_h_luma_intra_%1, 4,7
     add    rsp, 0x88
     RET
 %else
-cglobal deblock_h_luma_intra_%1, 2,4
+cglobal deblock_h_luma_intra_8_%1, 2,4
     lea    r3,  [r1*3]
     sub    r0,  4
     lea    r2,  [r0+r3]

@@ -795,10 +795,10 @@ cglobal deblock_h_luma_intra_%1, 2,4
     PUSH   dword r2m
     PUSH   dword 16
     PUSH   r0
-    call   deblock_%2_luma_intra_%1
+    call   deblock_%2_luma_intra_8_%1
 %ifidn %2, v8
     add    dword [rsp], 8 ; pix_tmp+8
-    call   deblock_%2_luma_intra_%1
+    call   deblock_%2_luma_intra_8_%1
 %endif
     ADD    esp, 16

@@ -851,7 +851,7 @@ INIT_MMX
 ;-----------------------------------------------------------------------------
 ; void ff_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
-cglobal deblock_v_chroma_mmxext, 5,6
+cglobal deblock_v_chroma_8_mmxext, 5,6
     CHROMA_V_START
     movq  m0, [t5]
     movq  m1, [t5+r1]

@@ -865,7 +865,7 @@ cglobal deblock_v_chroma_mmxext, 5,6
 ;-----------------------------------------------------------------------------
 ; void ff_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
-cglobal deblock_h_chroma_mmxext, 5,7
+cglobal deblock_h_chroma_8_mmxext, 5,7
 %ifdef ARCH_X86_64
     %define buf0 [rsp-24]
     %define buf1 [rsp-16]

@@ -911,7 +911,7 @@ ff_chroma_inter_body_mmxext:
 ;-----------------------------------------------------------------------------
 ; void ff_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal deblock_v_chroma_intra_mmxext, 4,5
+cglobal deblock_v_chroma_intra_8_mmxext, 4,5
     CHROMA_V_START
     movq  m0, [t5]
     movq  m1, [t5+r1]

@@ -925,7 +925,7 @@ cglobal deblock_v_chroma_intra_mmxext, 4,5
 ;-----------------------------------------------------------------------------
 ; void ff_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal deblock_h_chroma_intra_mmxext, 4,6
+cglobal deblock_h_chroma_intra_8_mmxext, 4,6
     CHROMA_H_START
     TRANSPOSE4x8_LOAD  bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
     call ff_chroma_intra_body_mmxext
libavcodec/x86/h264_deblock_10bit.asm (new file, mode 100644)

;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Oskar Arvidsson <oskar@irock.se>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

pw_pixel_max: times 8 dw ((1 << 10)-1)

SECTION .text

cextern pw_2
cextern pw_4

; out: %4 = |%1-%2|-%3
; clobbers: %5
%macro ABS_SUB 5
    psubusw %5, %2, %1
    psubusw %4, %1, %2
    por     %4, %5
    psubw   %4, %3
%endmacro

; out: %4 = |%1-%2|<%3
%macro DIFF_LT 5
    psubusw %4, %2, %1
    psubusw %5, %1, %2
    por     %5, %4 ; |%1-%2|
    pxor    %4, %4
    psubw   %5, %3 ; |%1-%2|-%3
    pcmpgtw %4, %5 ; 0 > |%1-%2|-%3
%endmacro

%macro LOAD_AB 4
    movd    %1, %3
    movd    %2, %4
    SPLATW  %1, %1
    SPLATW  %2, %2
%endmacro

; in: %2=tc reg
; out: %1=splatted tc
%macro LOAD_TC 2
    movd        %1, [%2]
    punpcklbw   %1, %1
%if mmsize == 8
    pshufw      %1, %1, 0
%else
    pshuflw     %1, %1, 01010000b
    pshufd      %1, %1, 01010000b
%endif
    psraw       %1, 6
%endmacro

; in: %1=p1, %2=p0, %3=q0, %4=q1
;     %5=alpha, %6=beta, %7-%9=tmp
; out: %7=mask
%macro LOAD_MASK 9
    ABS_SUB     %2, %3, %5, %8, %7 ; |p0-q0| - alpha
    ABS_SUB     %1, %2, %6, %9, %7 ; |p1-p0| - beta
    pand        %8, %9
    ABS_SUB     %3, %4, %6, %9, %7 ; |q1-q0| - beta
    pxor        %7, %7
    pand        %8, %9
    pcmpgtw     %7, %8
%endmacro

; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; out: %1=p0', m2=q0'
%macro DEBLOCK_P0_Q0 7
    psubw   %3, %4
    pxor    %7, %7
    paddw   %3, [pw_4]
    psubw   %7, %5
    psubw   %6, %2, %1
    psllw   %6, 2
    paddw   %3, %6
    psraw   %3, 3
    mova    %6, [pw_pixel_max]
    CLIPW   %3, %7, %5
    pxor    %7, %7
    paddw   %1, %3
    psubw   %2, %3
    CLIPW   %1, %7, %6
    CLIPW   %2, %7, %6
%endmacro

; in: %1=x2, %2=x1, %3=p0, %4=q0 %5=mask&tc, %6=tmp
%macro LUMA_Q1 6
    pavgw   %6, %3, %4      ; (p0+q0+1)>>1
    paddw   %1, %6
    pxor    %6, %6
    psraw   %1, 1
    psubw   %6, %5
    psubw   %1, %2
    CLIPW   %1, %6, %5
    paddw   %1, %2
%endmacro

%macro LUMA_DEBLOCK_ONE 3
    DIFF_LT     m5, %1, bm, m4, m6
    pxor        m6, m6
    mova        %3, m4
    pcmpgtw     m6, tcm
    pand        m4, tcm
    pandn       m6, m7
    pand        m4, m6
    LUMA_Q1 m5, %2, m1, m2, m4, m6
%endmacro

%macro LUMA_H_STORE 2
%if mmsize == 8
    movq        [r0-4], m0
    movq        [r0+r1-4], m1
    movq        [r0+r1*2-4], m2
    movq        [r0+%2-4], m3
%else
    movq        [r0-4], m0
    movhps      [r0+r1-4], m0
    movq        [r0+r1*2-4], m1
    movhps      [%1-4], m1
    movq        [%1+r1-4], m2
    movhps      [%1+r1*2-4], m2
    movq        [%1+%2-4], m3
    movhps      [%1+r1*4-4], m3
%endif
%endmacro

%macro DEBLOCK_LUMA 1
;-----------------------------------------------------------------------------
; void deblock_v_luma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_10_%1, 5,5,8*(mmsize/16)
    %assign pad 5*mmsize+12-(stack_offset&15)
    %define tcm [rsp]
    %define ms1 [rsp+mmsize]
    %define ms2 [rsp+mmsize*2]
    %define am  [rsp+mmsize*3]
    %define bm  [rsp+mmsize*4]
    SUB        rsp, pad
    shl        r2d, 2
    shl        r3d, 2
    LOAD_AB     m4, m5, r2, r3
    mov         r3, 32/mmsize
    mov         r2, r0
    sub         r0, r1
    mova        am, m4
    sub         r0, r1
    mova        bm, m5
    sub         r0, r1
.loop:
    mova        m0, [r0+r1]
    mova        m1, [r0+r1*2]
    mova        m2, [r2]
    mova        m3, [r2+r1]

    LOAD_MASK   m0, m1, m2, m3, am, bm, m7, m4, m6
    LOAD_TC     m6, r4
    mova       tcm, m6

    mova        m5, [r0]
    LUMA_DEBLOCK_ONE m1, m0, ms1
    mova   [r0+r1], m5

    mova        m5, [r2+r1*2]
    LUMA_DEBLOCK_ONE m2, m3, ms2
    mova   [r2+r1], m5

    pxor        m5, m5
    mova        m6, tcm
    pcmpgtw     m5, tcm
    psubw       m6, ms1
    pandn       m5, m7
    psubw       m6, ms2
    pand        m5, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
    mova [r0+r1*2], m1
    mova      [r2], m2

    add         r0, mmsize
    add         r2, mmsize
    add         r4, mmsize/8
    dec         r3
    jg .loop
    ADD        rsp, pad
    RET

cglobal deblock_h_luma_10_%1, 5,6,8*(mmsize/16)
    %assign pad 7*mmsize+12-(stack_offset&15)
    %define tcm [rsp]
    %define ms1 [rsp+mmsize]
    %define ms2 [rsp+mmsize*2]
    %define p1m [rsp+mmsize*3]
    %define p2m [rsp+mmsize*4]
    %define am  [rsp+mmsize*5]
    %define bm  [rsp+mmsize*6]
    SUB        rsp, pad
    shl        r2d, 2
    shl        r3d, 2
    LOAD_AB     m4, m5, r2, r3
    mov         r3, r1
    mova        am, m4
    add         r3, r1
    mov         r5, 32/mmsize
    mova        bm, m5
    add         r3, r1
%if mmsize == 16
    mov         r2, r0
    add         r2, r3
%endif
.loop:
%if mmsize == 8
    movq        m2, [r0-8]     ; y q2 q1 q0
    movq        m7, [r0+0]
    movq        m5, [r0+r1-8]
    movq        m3, [r0+r1+0]
    movq        m0, [r0+r1*2-8]
    movq        m6, [r0+r1*2+0]
    movq        m1, [r0+r3-8]
    TRANSPOSE4x4W 2, 5, 0, 1, 4
    SWAP         2, 7
    movq        m7, [r0+r3]
    TRANSPOSE4x4W 2, 3, 6, 7, 4
%else
    movu        m5, [r0-8]     ; y q2 q1 q0 p0 p1 p2 x
    movu        m0, [r0+r1-8]
    movu        m2, [r0+r1*2-8]
    movu        m3, [r2-8]
    TRANSPOSE4x4W 5, 0, 2, 3, 6
    mova       tcm, m3

    movu        m4, [r2+r1-8]
    movu        m1, [r2+r1*2-8]
    movu        m3, [r2+r3-8]
    movu        m7, [r2+r1*4-8]
    TRANSPOSE4x4W 4, 1, 3, 7, 6

    mova        m6, tcm
    punpcklqdq  m6, m7
    punpckhqdq  m5, m4
    SBUTTERFLY qdq, 0, 1, 7
    SBUTTERFLY qdq, 2, 3, 7
%endif

    mova       p2m, m6
    LOAD_MASK   m0, m1, m2, m3, am, bm, m7, m4, m6
    LOAD_TC     m6, r4
    mova       tcm, m6

    LUMA_DEBLOCK_ONE m1, m0, ms1
    mova       p1m, m5

    mova        m5, p2m
    LUMA_DEBLOCK_ONE m2, m3, ms2
    mova       p2m, m5

    pxor        m5, m5
    mova        m6, tcm
    pcmpgtw     m5, tcm
    psubw       m6, ms1
    pandn       m5, m7
    psubw       m6, ms2
    pand        m5, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6

    mova        m0, p1m
    mova        m3, p2m
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    LUMA_H_STORE r2, r3

    add         r4, mmsize/8
    lea         r0, [r0+r1*(mmsize/2)]
    lea         r2, [r2+r1*(mmsize/2)]
    dec         r5
    jg .loop
    ADD        rsp, pad
    RET
%endmacro

INIT_XMM
%ifdef ARCH_X86_64
; in:  m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2
;      m12=alpha, m13=beta
; out: m0=p1', m3=q1', m1=p0', m2=q0'
; clobbers: m4, m5, m6, m7, m10, m11, m14
%macro DEBLOCK_LUMA_INTER_SSE2 0
    LOAD_MASK   m0, m1, m2, m3, m12, m13, m7, m4, m6
    LOAD_TC     m6, r4
    DIFF_LT     m8, m1, m13, m10, m4
    DIFF_LT     m9, m2, m13, m11, m4
    pand        m6, m7

    mova       m14, m6
    pxor        m4, m4
    pcmpgtw     m6, m4
    pand        m6, m14

    mova        m5, m10
    pand        m5, m6
    LUMA_Q1 m8, m0, m1, m2, m5, m4

    mova        m5, m11
    pand        m5, m6
    LUMA_Q1 m9, m3, m1, m2, m5, m4

    pxor        m4, m4
    psubw       m6, m10
    pcmpgtw     m4, m14
    pandn       m4, m7
    psubw       m6, m11
    pand        m4, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m4, m5, m6

    SWAP         0, 8
    SWAP         3, 9
%endmacro

%macro DEBLOCK_LUMA_64 1
cglobal deblock_v_luma_10_%1, 5,5,15
    %define p2 m8
    %define p1 m0
    %define p0 m1
    %define q0 m2
    %define q1 m3
    %define q2 m9
    %define mask0 m7
    %define mask1 m10
    %define mask2 m11
    shl        r2d, 2
    shl        r3d, 2
    LOAD_AB    m12, m13, r2, r3
    mov         r2, r0
    sub         r0, r1
    sub         r0, r1
    sub         r0, r1
    mov         r3, 2
.loop:
    mova        p2, [r0]
    mova        p1, [r0+r1]
    mova        p0, [r0+r1*2]
    mova        q0, [r2]
    mova        q1, [r2+r1]
    mova        q2, [r2+r1*2]
    DEBLOCK_LUMA_INTER_SSE2
    mova   [r0+r1], p1
    mova [r0+r1*2], p0
    mova      [r2], q0
    mova   [r2+r1], q1
    add         r0, mmsize
    add         r2, mmsize
    add         r4, 2
    dec         r3
    jg .loop
    REP_RET

cglobal deblock_h_luma_10_%1, 5,7,15
    shl        r2d, 2
    shl        r3d, 2
    LOAD_AB    m12, m13, r2, r3
    mov         r2, r1
    add         r2, r1
    add         r2, r1
    mov         r5, r0
    add         r5, r2
    mov         r6, 2
.loop:
    movu        m8, [r0-8]     ; y q2 q1 q0 p0 p1 p2 x
    movu        m0, [r0+r1-8]
    movu        m2, [r0+r1*2-8]
    movu        m9, [r5-8]
    movu        m5, [r5+r1-8]
    movu        m1, [r5+r1*2-8]
    movu        m3, [r5+r2-8]
    movu        m7, [r5+r1*4-8]

    TRANSPOSE4x4W 8, 0, 2, 9, 10
    TRANSPOSE4x4W 5, 1, 3, 7, 10

    punpckhqdq  m8, m5
    SBUTTERFLY qdq, 0, 1, 10
    SBUTTERFLY qdq, 2, 3, 10
    punpcklqdq  m9, m7

    DEBLOCK_LUMA_INTER_SSE2

    TRANSPOSE4x4W 0, 1, 2, 3, 4
    LUMA_H_STORE r5, r2
    add         r4, 2
    lea         r0, [r0+r1*8]
    lea         r5, [r5+r1*8]
    dec         r6
    jg .loop
    REP_RET
%endmacro

INIT_XMM
DEBLOCK_LUMA_64 sse2
INIT_AVX
DEBLOCK_LUMA_64 avx
%endif

%macro SWAPMOVA 2
%ifid %1
    SWAP %1, %2
%else
    mova %1, %2
%endif
%endmacro

; in: t0-t2: tmp registers
;     %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0
;     %8=mask1p %9=2 %10=p0' %11=p1' %12=p2'
%macro LUMA_INTRA_P012 12 ; p0..p3 in memory
%ifdef ARCH_X86_64
    paddw     t0, %3, %2
    mova      t2, %4
    paddw     t2, %3
%else
    mova      t0, %3
    mova      t2, %4
    paddw     t0, %2
    paddw     t2, %3
%endif
    paddw     t0, %1
    paddw     t2, t2
    paddw     t0, %5
    paddw     t2, %9
    paddw     t0, %9    ; (p2 + p1 + p0 + q0 + 2)
    paddw     t2, t0    ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4)

    psrlw     t2, 3
    psrlw     t1, t0, 2
    psubw     t2, %3
    psubw     t1, %2
    pand      t2, %8
    pand      t1, %8
    paddw     t2, %3
    paddw     t1, %2
    SWAPMOVA %11, t1

    psubw     t1, t0, %3
    paddw     t0, t0
    psubw     t1, %5
    psubw     t0, %3
    paddw     t1, %6
    paddw     t1, %2
    paddw     t0, %6
    psrlw     t1, 2     ; (2*p1 + p0 + q1 + 2)/4
    psrlw     t0, 3     ; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3

    pxor      t0, t1
    pxor      t1, %1
    pand      t0, %8
    pand      t1, %7
    pxor      t0, t1
    pxor      t0, %1
    SWAPMOVA %10, t0
    SWAPMOVA %12, t2
%endmacro

%macro LUMA_INTRA_INIT 1
    %xdefine pad %1*mmsize+((gprsize*3) % mmsize)-(stack_offset&15)
    %define t0 m4
    %define t1 m5
    %define t2 m6
    %define t3 m7
    %assign i 4
%rep %1
    CAT_XDEFINE t, i, [rsp+mmsize*(i-4)]
    %assign i i+1
%endrep
    SUB    rsp, pad
%endmacro

; in: %1-%3=tmp, %4=p2, %5=q2
%macro LUMA_INTRA_INTER 5
    LOAD_AB t0, t1, r2d, r3d
    mova    %1, t0
    LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3
%ifdef ARCH_X86_64
    mova    %2, t0        ; mask0
    psrlw   t3, %1, 2
%else
    mova    t3, %1
    mova    %2, t0        ; mask0
    psrlw   t3, 2
%endif
    paddw   t3, [pw_2]    ; alpha/4+2
    DIFF_LT m1, m2, t3, t2, t0 ; t2 = |p0-q0| < alpha/4+2
    pand    t2, %2
    mova    t3, %5        ; q2
    mova    %1, t2        ; mask1
    DIFF_LT t3, m2, t1, t2, t0 ; t2 = |q2-q0| < beta
    pand    t2, %1
    mova    t3, %4        ; p2
    mova    %3, t2        ; mask1q
    DIFF_LT t3, m1, t1, t2, t0 ; t2 = |p2-p0| < beta
    pand    t2, %1
    mova    %1, t2        ; mask1p
%endmacro

%macro LUMA_H_INTRA_LOAD 0
%if mmsize == 8
    movu     t0, [r0-8]
    movu     t1, [r0+r1-8]
    movu     m0, [r0+r1*2-8]
    movu     m1, [r0+r4-8]
    TRANSPOSE4x4W 4, 5, 0, 1, 2
    mova     t4, t0        ; p3
    mova     t5, t1        ; p2

    movu     m2, [r0]
    movu     m3, [r0+r1]
    movu     t0, [r0+r1*2]
    movu     t1, [r0+r4]
    TRANSPOSE4x4W 2, 3, 4, 5, 6
    mova     t6, t0        ; q2
    mova     t7, t1        ; q3
%else
    movu     t0, [r0-8]
    movu     t1, [r0+r1-8]
    movu     m0, [r0+r1*2-8]
    movu     m1, [r0+r5-8]
    movu     m2, [r4-8]
    movu     m3, [r4+r1-8]
    movu     t2, [r4+r1*2-8]
    movu     t3, [r4+r5-8]
    TRANSPOSE8x8W 4, 5, 0, 1, 2, 3, 6, 7, t4, t5
    mova     t4, t0        ; p3
    mova     t5, t1        ; p2
    mova     t6, t2        ; q2
    mova     t7, t3        ; q3
%endif
%endmacro

; in: %1=q3 %2=q2' %3=q1' %4=q0' %5=p0' %6=p1' %7=p2' %8=p3 %9=tmp
%macro LUMA_H_INTRA_STORE 9
%if mmsize == 8
    TRANSPOSE4x4W %1, %2, %3, %4, %9
    movq       [r0-8], m%1
    movq       [r0+r1-8], m%2
    movq       [r0+r1*2-8], m%3
    movq       [r0+r4-8], m%4
    movq       m%1, %8
    TRANSPOSE4x4W %5, %6, %7, %1, %9
    movq       [r0], m%5
    movq       [r0+r1], m%6
    movq       [r0+r1*2], m%7
    movq       [r0+r4], m%1
%else
    TRANSPOSE2x4x4W %1, %2, %3, %4, %9
    movq       [r0-8], m%1
    movq       [r0+r1-8], m%2
    movq       [r0+r1*2-8], m%3
    movq       [r0+r5-8], m%4
    movhps     [r4-8], m%1
    movhps     [r4+r1-8], m%2
    movhps     [r4+r1*2-8], m%3
    movhps     [r4+r5-8], m%4
%ifnum %8
    SWAP       %1, %8
%else
    mova       m%1, %8
%endif
    TRANSPOSE2x4x4W %5, %6, %7, %1, %9
    movq       [r0], m%5
    movq       [r0+r1], m%6
    movq       [r0+r1*2], m%7
    movq       [r0+r5], m%1
    movhps     [r4], m%5
    movhps     [r4+r1], m%6
    movhps     [r4+r1*2], m%7
    movhps     [r4+r5], m%1
%endif
%endmacro

%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA_INTRA_64 1
cglobal deblock_v_luma_intra_10_%1, 4,7,16
    %define t0 m1
    %define t1 m2
    %define t2 m4
    %define p2 m8
    %define p1 m9
    %define p0 m10
    %define q0 m11
    %define q1 m12
    %define q2 m13
    %define aa m5
    %define bb m14
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    neg     r4
    add     r4, r0     ; pix-4*stride
    mov     r6, 2
    mova    m0, [pw_2]
    shl    r2d, 2
    shl    r3d, 2
    LOAD_AB aa, bb, r2d, r3d
.loop
    mova    p2, [r4+r1]
    mova    p1, [r4+2*r1]
    mova    p0, [r4+r5]
    mova    q0, [r0]
    mova    q1, [r0+r1]
    mova    q2, [r0+2*r1]

    LOAD_MASK p1, p0, q0, q1, aa, bb, m3, t0, t1
    mova    t2, aa
    psrlw   t2, 2
    paddw   t2, m0 ; alpha/4+2
    DIFF_LT p0, q0, t2, m6, t0 ; m6 = |p0-q0| < alpha/4+2
    DIFF_LT p2, p0, bb, t1, t0 ; m7 = |p2-p0| < beta
    DIFF_LT q2, q0, bb, m7, t0 ; t1 = |q2-q0| < beta
    pand    m6, m3
    pand    m7, m6
    pand    m6, t1
    LUMA_INTRA_P012 p0, p1, p2, [r4], q0, q1, m3, m6, m0, [r4+r5], [r4+2*r1], [r4+r1]
    LUMA_INTRA_P012 q0, q1, q2, [r0+r5], p0, p1, m3, m7, m0, [r0], [r0+r1], [r0+2*r1]
    add     r0, mmsize
    add     r4, mmsize
    dec     r6
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_10_%1, 4,7,16
    %define t0 m15
    %define t1 m14
    %define t2 m2
    %define q3 m5
    %define q2 m8
    %define q1 m9
    %define q0 m10
    %define p0 m11
    %define p1 m12
    %define p2 m13
    %define p3 m4
    %define spill [rsp]
    %assign pad 24-(stack_offset&15)
    SUB     rsp, pad
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    add     r4, r0     ; pix+4*stride
    mov     r6, 2
    mova    m0, [pw_2]
    shl    r2d, 2
    shl    r3d, 2
.loop
    movu    q3, [r0-8]
    movu    q2, [r0+r1-8]
    movu    q1, [r0+r1*2-8]
    movu    q0, [r0+r5-8]
    movu    p0, [r4-8]
    movu    p1, [r4+r1-8]
    movu    p2, [r4+r1*2-8]
    movu    p3, [r4+r5-8]
    TRANSPOSE8x8W 5, 8, 9, 10, 11, 12, 13, 4, 1

    LOAD_AB m1, m2, r2d, r3d
    LOAD_MASK q1, q0, p0, p1, m1, m2, m3, t0, t1
    psrlw   m1, 2
    paddw   m1, m0 ; alpha/4+2
    DIFF_LT p0, q0, m1, m6, t0 ; m6 = |p0-q0| < alpha/4+2
    DIFF_LT q2, q0, m2, t1, t0 ; t1 = |q2-q0| < beta
    DIFF_LT p0, p2, m2, m7, t0 ; m7 = |p2-p0| < beta
    pand    m6, m3
    pand    m7, m6
    pand    m6, t1

    mova spill, q3
    LUMA_INTRA_P012 q0, q1, q2, q3, p0, p1, m3, m6, m0, m5, m1, q2
    LUMA_INTRA_P012 p0, p1, p2, p3, q0, q1, m3, m7, m0, p0, m6, p2
    mova    m7, spill

    LUMA_H_INTRA_STORE 7, 8, 1, 5, 11, 6, 13, 4, 14

    lea     r0, [r0+r1*8]
    lea     r4, [r4+r1*8]
    dec     r6
    jg .loop
    ADD    rsp, pad
    RET
%endmacro

INIT_XMM
DEBLOCK_LUMA_INTRA_64 sse2
INIT_AVX
DEBLOCK_LUMA_INTRA_64 avx

%endif

%macro DEBLOCK_LUMA_INTRA 1
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_intra_10_%1, 4,7,8*(mmsize/16)
    LUMA_INTRA_INIT 3
    lea     r4, [r1*4]
    lea     r5, [r1*3]
    neg     r4
    add     r4, r0
    mov     r6, 32/mmsize
    shl    r2d, 2
    shl    r3d, 2
.loop:
    mova    m0, [r4+r1*2] ; p1
    mova    m1, [r4+r5]   ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LUMA_INTRA_INTER t4, t5, t6, [r4+r1], [r0+r1*2]
    LUMA_INTRA_P012 m1, m0, t3, [r4], m2, m3, t5, t4, [pw_2], [r4+r5], [r4+2*r1], [r4+r1]
    mova    t3, [r0+r1*2] ; q2
    LUMA_INTRA_P012 m2, m3, t3, [r0+r5], m1, m0, t5, t6, [pw_2], [r0], [r0+r1], [r0+2*r1]
    add     r0, mmsize
    add     r4, mmsize
    dec     r6
    jg .loop
    ADD    rsp, pad
    RET

;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_10_%1, 4,7,8*(mmsize/16)
    LUMA_INTRA_INIT 8
%if mmsize == 8
    lea     r4, [r1*3]
    mov     r5, 32/mmsize
%else
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    add     r4, r0     ; pix+4*stride
    mov     r6, 32/mmsize
%endif
    shl    r2d, 2
    shl    r3d, 2
.loop:
    LUMA_H_INTRA_LOAD
    LUMA_INTRA_INTER t8, t9, t10, t5, t6

    LUMA_INTRA_P012 m1, m0, t3, t4, m2, m3, t9, t8, [pw_2], t8, t5, t11
    mova    t3, t6 ; q2
    LUMA_INTRA_P012 m2, m3, t3, t7, m1, m0, t9, t10, [pw_2], m4, t6, m5

    mova    m2, t4
    mova    m0, t11
    mova    m1, t5
    mova    m3, t8
    mova    m6, t6

    LUMA_H_INTRA_STORE 2, 0, 1, 3, 4, 6, 5, t7, 7

    lea     r0, [r0+r1*(mmsize/2)]
%if mmsize == 8
    dec     r5
%else
    lea     r4, [r4+r1*(mmsize/2)]
    dec     r6
%endif
    jg .loop
    ADD    rsp, pad
    RET
%endmacro

%ifndef ARCH_X86_64
INIT_MMX
DEBLOCK_LUMA mmxext
DEBLOCK_LUMA_INTRA mmxext
INIT_XMM
DEBLOCK_LUMA sse2
DEBLOCK_LUMA_INTRA sse2
INIT_AVX
DEBLOCK_LUMA avx
DEBLOCK_LUMA_INTRA avx
%endif
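The LUMA_INTRA_P012 macro above carries its arithmetic in its comments; as a cross-check, here is a scalar rendering of the p-side strong intra filter it implements. The helper name and the boolean mask parameters are mine (the asm keeps mask0/mask1p as SIMD bitmasks and blends with pxor/pand); the formulas are taken verbatim from the assembly comments:

#include <stdint.h>

/* mask0 = filtering enabled for this edge, mask1p = strong-filter condition
 * (|p0-q0| < alpha/4+2 and |p2-p0| < beta). The q side is symmetric.
 * Returns the new p0; writes new p1/p2 only when the strong filter fires. */
static int luma_intra_p0(int p3, int p2, int p1, int p0, int q0, int q1,
                         int mask0, int mask1p, int *np1, int *np2)
{
    *np1 = p1;
    *np2 = p2;
    if (!mask0)
        return p0;                                      /* untouched */
    if (mask1p) {
        *np1 = (p2 + p1 + p0 + q0 + 2) >> 2;            /* (p2+p1+p0+q0+2)/4 */
        *np2 = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3;
        return (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3;
    }
    return (2*p1 + p0 + q1 + 2) >> 2;                   /* weak fallback, p0 only */
}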
libavcodec/x86/h264dsp_mmx.c

@@ -218,45 +218,49 @@ static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40]
     );
 }

-#define LF_FUNC(DIR, TYPE, OPT) \
-void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
-                                                  int alpha, int beta, int8_t *tc0);
-#define LF_IFUNC(DIR, TYPE, OPT) \
-void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
-                                                  int alpha, int beta);
-
-LF_FUNC (h,  chroma,       mmxext)
-LF_IFUNC(h,  chroma_intra, mmxext)
-LF_FUNC (v,  chroma,       mmxext)
-LF_IFUNC(v,  chroma_intra, mmxext)
-LF_FUNC (h,  luma,         mmxext)
-LF_IFUNC(h,  luma_intra,   mmxext)
-#if HAVE_YASM && ARCH_X86_32
-LF_FUNC (v8, luma,         mmxext)
-static void ff_deblock_v_luma_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
+#define LF_FUNC(DIR, TYPE, DEPTH, OPT) \
+void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *pix, int stride, \
+                                                                int alpha, int beta, int8_t *tc0);
+#define LF_IFUNC(DIR, TYPE, DEPTH, OPT) \
+void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *pix, int stride, \
+                                                                int alpha, int beta);
+
+#define LF_FUNCS(type, depth)\
+LF_FUNC (h,  chroma,       depth, mmxext)\
+LF_IFUNC(h,  chroma_intra, depth, mmxext)\
+LF_FUNC (v,  chroma,       depth, mmxext)\
+LF_IFUNC(v,  chroma_intra, depth, mmxext)\
+LF_FUNC (h,  luma,         depth, mmxext)\
+LF_IFUNC(h,  luma_intra,   depth, mmxext)\
+LF_FUNC (h,  luma,         depth, sse2)\
+LF_IFUNC(h,  luma_intra,   depth, sse2)\
+LF_FUNC (v,  luma,         depth, sse2)\
+LF_IFUNC(v,  luma_intra,   depth, sse2)\
+LF_FUNC (h,  luma,         depth, avx)\
+LF_IFUNC(h,  luma_intra,   depth, avx)\
+LF_FUNC (v,  luma,         depth, avx)\
+LF_IFUNC(v,  luma_intra,   depth, avx)
+
+LF_FUNCS( uint8_t,  8)
+LF_FUNCS(uint16_t, 10)
+
+#if HAVE_YASM && ARCH_X86_32
+LF_FUNC (v8, luma, 8, mmxext)
+static void ff_deblock_v_luma_8_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
 {
     if((tc0[0] & tc0[1]) >= 0)
-        ff_deblock_v8_luma_mmxext(pix+0, stride, alpha, beta, tc0);
+        ff_deblock_v8_luma_8_mmxext(pix+0, stride, alpha, beta, tc0);
     if((tc0[2] & tc0[3]) >= 0)
-        ff_deblock_v8_luma_mmxext(pix+8, stride, alpha, beta, tc0+2);
+        ff_deblock_v8_luma_8_mmxext(pix+8, stride, alpha, beta, tc0+2);
 }
-LF_IFUNC(v8, luma_intra, mmxext)
-static void ff_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta)
+LF_IFUNC(v8, luma_intra, 8, mmxext)
+static void ff_deblock_v_luma_intra_8_mmxext(uint8_t *pix, int stride, int alpha, int beta)
 {
-    ff_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta);
-    ff_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta);
+    ff_deblock_v8_luma_intra_8_mmxext(pix+0, stride, alpha, beta);
+    ff_deblock_v8_luma_intra_8_mmxext(pix+8, stride, alpha, beta);
 }
 #endif
-LF_FUNC (h,  luma,       sse2)
-LF_IFUNC(h,  luma_intra, sse2)
-LF_FUNC (v,  luma,       sse2)
-LF_IFUNC(v,  luma_intra, sse2)
-LF_FUNC (h,  luma,       avx)
-LF_IFUNC(h,  luma_intra, avx)
-LF_FUNC (v,  luma,       avx)
-LF_IFUNC(v,  luma_intra, avx)
+LF_FUNC (v,  luma,       10, mmxext)
+LF_IFUNC(v,  luma_intra, 10, mmxext)

 /***********************************/
 /* weighted prediction */

@@ -318,15 +322,15 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
             c->h264_idct_add8       = ff_h264_idct_add8_mmx2;
             c->h264_idct_add16intra = ff_h264_idct_add16intra_mmx2;

-            c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_mmxext;
-            c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma_mmxext;
-            c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_mmxext;
-            c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_mmxext;
+            c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_8_mmxext;
+            c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma_8_mmxext;
+            c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_mmxext;
+            c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_mmxext;
 #if ARCH_X86_32
-            c->h264_v_loop_filter_luma       = ff_deblock_v_luma_mmxext;
-            c->h264_h_loop_filter_luma       = ff_deblock_h_luma_mmxext;
-            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_mmxext;
-            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_mmxext;
+            c->h264_v_loop_filter_luma       = ff_deblock_v_luma_8_mmxext;
+            c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_mmxext;
+            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmxext;
+            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext;
 #endif
             c->weight_h264_pixels_tab[0] = ff_h264_weight_16x16_mmx2;
             c->weight_h264_pixels_tab[1] = ff_h264_weight_16x8_mmx2;

@@ -364,10 +368,10 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
                 c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_sse2;
 #if HAVE_ALIGNED_STACK
-                c->h264_v_loop_filter_luma       = ff_deblock_v_luma_sse2;
-                c->h264_h_loop_filter_luma       = ff_deblock_h_luma_sse2;
-                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_sse2;
-                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_sse2;
+                c->h264_v_loop_filter_luma       = ff_deblock_v_luma_8_sse2;
+                c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_sse2;
+                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2;
+                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2;
 #endif
                 c->h264_idct_add16 = ff_h264_idct_add16_sse2;

@@ -383,10 +387,39 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
             }
             if (mm_flags&AV_CPU_FLAG_AVX) {
 #if HAVE_ALIGNED_STACK
-                c->h264_v_loop_filter_luma       = ff_deblock_v_luma_avx;
-                c->h264_h_loop_filter_luma       = ff_deblock_h_luma_avx;
-                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_avx;
-                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_avx;
+                c->h264_v_loop_filter_luma       = ff_deblock_v_luma_8_avx;
+                c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_avx;
+                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx;
+                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx;
 #endif
             }
         }
     }
 #endif
+    } else if (bit_depth == 10) {
+#if HAVE_YASM
+        if (mm_flags & AV_CPU_FLAG_MMX) {
+            if (mm_flags & AV_CPU_FLAG_MMX2) {
+#if ARCH_X86_32
+                c->h264_v_loop_filter_luma       = ff_deblock_v_luma_10_mmxext;
+                c->h264_h_loop_filter_luma       = ff_deblock_h_luma_10_mmxext;
+                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmxext;
+                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_mmxext;
+#endif
+                if (mm_flags & AV_CPU_FLAG_SSE2) {
+#if HAVE_ALIGNED_STACK
+                    c->h264_v_loop_filter_luma       = ff_deblock_v_luma_10_sse2;
+                    c->h264_h_loop_filter_luma       = ff_deblock_h_luma_10_sse2;
+                    c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_sse2;
+                    c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2;
+#endif
+                }
+                if (mm_flags & AV_CPU_FLAG_AVX) {
+#if HAVE_ALIGNED_STACK
+                    c->h264_v_loop_filter_luma       = ff_deblock_v_luma_10_avx;
+                    c->h264_h_loop_filter_luma       = ff_deblock_h_luma_10_avx;
+                    c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_avx;
+                    c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_avx;
+#endif
+                }
+            }
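To see what the reworked macros produce, a single expansion, derived mechanically from the LF_FUNC definition above (shown for illustration only):

/* LF_FUNC(v, luma, 10, mmxext) expands to the prototype of the new
 * 10-bit symbol exported by h264_deblock_10bit.asm: */
void ff_deblock_v_luma_10_mmxext(uint8_t *pix, int stride,
                                 int alpha, int beta, int8_t *tc0);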
libavcodec/x86/x86util.asm

@@ -457,3 +457,8 @@
     pshufw %1, %2, (%3)*0x55
 %endif
 %endmacro
+
+%macro CLIPW 3 ;(dst, min, max)
+    pmaxsw %1, %2
+    pminsw %1, %3
+%endmacro
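CLIPW clamps each packed signed word in %1 to the range [%2, %3] using pmaxsw followed by pminsw; per element it behaves like this scalar sketch (not part of the commit):

static inline int16_t clipw(int16_t v, int16_t min, int16_t max)
{
    return v < min ? min : v > max ? max : v;  /* pmaxsw, then pminsw */
}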