Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
F
ffmpeg.wasm-core
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Linshizhi
ffmpeg.wasm-core
Commits
9f3d6ca4
Commit
9f3d6ca4
authored
May 10, 2011
by
Jason Garrett-Glaser
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Port x86 10-bit H.264 deblock asm from x264
parent
8ad77b65
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
909 additions
and
65 deletions
+909
-65
Makefile
libavcodec/x86/Makefile
+1
-0
dsputil_mmx.c
libavcodec/x86/dsputil_mmx.c
+1
-0
h264_deblock.asm
libavcodec/x86/h264_deblock.asm
+17
-17
h264_deblock_10bit.asm
libavcodec/x86/h264_deblock_10bit.asm
+804
-0
h264dsp_mmx.c
libavcodec/x86/h264dsp_mmx.c
+81
-48
x86util.asm
libavcodec/x86/x86util.asm
+5
-0
No files found.
libavcodec/x86/Makefile
View file @
9f3d6ca4
...
...
@@ -9,6 +9,7 @@ YASM-OBJS-$(CONFIG_FFT) += x86/fft_mmx.o \
MMX-OBJS-$(CONFIG_H264DSP)
+=
x86/h264dsp_mmx.o
YASM-OBJS-$(CONFIG_H264DSP)
+=
x86/h264_deblock.o
\
x86/h264_deblock_10bit.o
\
x86/h264_weight.o
\
x86/h264_idct.o
\
...
...
libavcodec/x86/dsputil_mmx.c
View file @
9f3d6ca4
...
...
@@ -43,6 +43,7 @@ DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
{
0x8000000080000000ULL
,
0x8000000080000000ULL
};
DECLARE_ALIGNED
(
8
,
const
uint64_t
,
ff_pw_1
)
=
0x0001000100010001ULL
;
DECLARE_ALIGNED
(
16
,
const
xmm_reg
,
ff_pw_2
)
=
{
0x0002000200020002ULL
,
0x0002000200020002ULL
};
DECLARE_ALIGNED
(
16
,
const
xmm_reg
,
ff_pw_3
)
=
{
0x0003000300030003ULL
,
0x0003000300030003ULL
};
DECLARE_ALIGNED
(
16
,
const
xmm_reg
,
ff_pw_4
)
=
{
0x0004000400040004ULL
,
0x0004000400040004ULL
};
DECLARE_ALIGNED
(
16
,
const
xmm_reg
,
ff_pw_5
)
=
{
0x0005000500050005ULL
,
0x0005000500050005ULL
};
...
...
libavcodec/x86/h264_deblock.asm
View file @
9f3d6ca4
...
...
@@ -324,7 +324,7 @@ cextern pb_A1
; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
%macro
DEBLOCK_LUMA
1
cglobal
deblock_v_luma_
%1
,
5
,
5
,
10
cglobal
deblock_v_luma_
8_
%1
,
5
,
5
,
10
movd
m8
,
[r4]
; tc0
lea
r4
,
[
r1
*
3
]
dec
r2d
; alpha-1
...
...
@@ -369,7 +369,7 @@ cglobal deblock_v_luma_%1, 5,5,10
; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal
deblock_h_luma_
%1
,
5
,
7
cglobal
deblock_h_luma_
8_
%1
,
5
,
7
movsxd
r10
,
r1d
lea
r11
,
[
r10
+
r10
*
2
]
lea
r6
,
[
r0
-
4
]
...
...
@@ -396,7 +396,7 @@ cglobal deblock_h_luma_%1, 5,7
%ifdef
WIN64
mov
[
rsp
+
0x20
]
,
r4
%endif
call
deblock_v_luma_
%1
call
deblock_v_luma_
8_
%1
; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
add
r6
,
2
...
...
@@ -436,7 +436,7 @@ DEBLOCK_LUMA avx
;-----------------------------------------------------------------------------
; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal
deblock_
%2
_luma_
%1
,
5
,
5
cglobal
deblock_
%2
_luma_
8_
%1
,
5
,
5
lea
r4
,
[
r1
*
3
]
dec
r2
; alpha-1
neg
r4
...
...
@@ -489,7 +489,7 @@ cglobal deblock_%2_luma_%1, 5,5
; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal
deblock_h_luma_
%1
,
0
,
5
cglobal
deblock_h_luma_
8_
%1
,
0
,
5
mov
r0
,
r0mp
mov
r3
,
r1m
lea
r4
,
[
r3
*
3
]
...
...
@@ -512,11 +512,11 @@ cglobal deblock_h_luma_%1, 0,5
PUSH
dword
r2m
PUSH
dword
16
PUSH
dword
r0
call
deblock_
%2
_luma_
%1
call
deblock_
%2
_luma_
8_
%1
%ifidn
%2
,
v8
add
dword
[
esp
]
,
8
; pix_tmp+0x38
add
dword
[
esp
+
16
]
,
2
; tc0+2
call
deblock_
%2
_luma_
%1
call
deblock_
%2
_luma_
8_
%1
%endif
ADD
esp
,
20
...
...
@@ -685,7 +685,7 @@ DEBLOCK_LUMA avx, v, 16
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal
deblock_
%2
_luma_intra_
%1
,
4
,
6
,
16
cglobal
deblock_
%2
_luma_intra_
8_
%1
,
4
,
6
,
16
%ifndef
ARCH_X86_64
sub
esp
,
0x60
%endif
...
...
@@ -747,7 +747,7 @@ INIT_MMX
;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal
deblock_h_luma_intra_
%1
,
4
,
7
cglobal
deblock_h_luma_intra_
8_
%1
,
4
,
7
movsxd
r10
,
r1d
lea
r11
,
[
r10
*
3
]
lea
r6
,
[
r0
-
4
]
...
...
@@ -763,7 +763,7 @@ cglobal deblock_h_luma_intra_%1, 4,7
lea
r0
,
[
pix_tmp
+
0x40
]
mov
r1
,
0x10
call
deblock_v_luma_intra_
%1
call
deblock_v_luma_intra_
8_
%1
; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
lea
r5
,
[
r6
+
r11
]
...
...
@@ -776,7 +776,7 @@ cglobal deblock_h_luma_intra_%1, 4,7
add
rsp
,
0x88
RET
%else
cglobal
deblock_h_luma_intra_
%1
,
2
,
4
cglobal
deblock_h_luma_intra_
8_
%1
,
2
,
4
lea
r3
,
[
r1
*
3
]
sub
r0
,
4
lea
r2
,
[
r0
+
r3
]
...
...
@@ -795,10 +795,10 @@ cglobal deblock_h_luma_intra_%1, 2,4
PUSH
dword
r2m
PUSH
dword
16
PUSH
r0
call
deblock_
%2
_luma_intra_
%1
call
deblock_
%2
_luma_intra_
8_
%1
%ifidn
%2
,
v8
add
dword
[rsp],
8
; pix_tmp+8
call
deblock_
%2
_luma_intra_
%1
call
deblock_
%2
_luma_intra_
8_
%1
%endif
ADD
esp
,
16
...
...
@@ -851,7 +851,7 @@ INIT_MMX
;-----------------------------------------------------------------------------
; void ff_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal
deblock_v_chroma_mmxext
,
5
,
6
cglobal
deblock_v_chroma_
8_
mmxext
,
5
,
6
CHROMA_V_START
movq
m0
,
[t5]
movq
m1
,
[
t5
+
r1
]
...
...
@@ -865,7 +865,7 @@ cglobal deblock_v_chroma_mmxext, 5,6
;-----------------------------------------------------------------------------
; void ff_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal
deblock_h_chroma_mmxext
,
5
,
7
cglobal
deblock_h_chroma_
8_
mmxext
,
5
,
7
%ifdef
ARCH_X86_64
%
define
buf0
[
rsp
-
24
]
%
define
buf1
[
rsp
-
16
]
...
...
@@ -911,7 +911,7 @@ ff_chroma_inter_body_mmxext:
;-----------------------------------------------------------------------------
; void ff_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal
deblock_v_chroma_intra_mmxext
,
4
,
5
cglobal
deblock_v_chroma_intra_
8_
mmxext
,
4
,
5
CHROMA_V_START
movq
m0
,
[t5]
movq
m1
,
[
t5
+
r1
]
...
...
@@ -925,7 +925,7 @@ cglobal deblock_v_chroma_intra_mmxext, 4,5
;-----------------------------------------------------------------------------
; void ff_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal
deblock_h_chroma_intra_mmxext
,
4
,
6
cglobal
deblock_h_chroma_intra_
8_
mmxext
,
4
,
6
CHROMA_H_START
TRANSPOSE4x8_LOAD
bw
,
wd
,
dq
,
PASS8ROWS
(
t5
,
r0
,
r1
,
t6
)
call
ff_chroma_intra_body_mmxext
...
...
libavcodec/x86/h264_deblock_10bit.asm
0 → 100644
View file @
9f3d6ca4
This diff is collapsed.
Click to expand it.
libavcodec/x86/h264dsp_mmx.c
View file @
9f3d6ca4
...
...
@@ -218,45 +218,49 @@ static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40]
);
}
#define LF_FUNC(DIR, TYPE, OPT) \
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
int alpha, int beta, int8_t *tc0);
#define LF_IFUNC(DIR, TYPE, OPT) \
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
int alpha, int beta);
LF_FUNC
(
h
,
chroma
,
mmxext
)
LF_IFUNC
(
h
,
chroma_intra
,
mmxext
)
LF_FUNC
(
v
,
chroma
,
mmxext
)
LF_IFUNC
(
v
,
chroma_intra
,
mmxext
)
LF_FUNC
(
h
,
luma
,
mmxext
)
LF_IFUNC
(
h
,
luma_intra
,
mmxext
)
#if HAVE_YASM && ARCH_X86_32
LF_FUNC
(
v8
,
luma
,
mmxext
)
static
void
ff_deblock_v_luma_mmxext
(
uint8_t
*
pix
,
int
stride
,
int
alpha
,
int
beta
,
int8_t
*
tc0
)
#define LF_FUNC(DIR, TYPE, DEPTH, OPT) \
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *pix, int stride, \
int alpha, int beta, int8_t *tc0);
#define LF_IFUNC(DIR, TYPE, DEPTH, OPT) \
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *pix, int stride, \
int alpha, int beta);
#define LF_FUNCS(type, depth)\
LF_FUNC (h, chroma, depth, mmxext)\
LF_IFUNC(h, chroma_intra, depth, mmxext)\
LF_FUNC (v, chroma, depth, mmxext)\
LF_IFUNC(v, chroma_intra, depth, mmxext)\
LF_FUNC (h, luma, depth, mmxext)\
LF_IFUNC(h, luma_intra, depth, mmxext)\
LF_FUNC (h, luma, depth, sse2)\
LF_IFUNC(h, luma_intra, depth, sse2)\
LF_FUNC (v, luma, depth, sse2)\
LF_IFUNC(v, luma_intra, depth, sse2)\
LF_FUNC (h, luma, depth, avx)\
LF_IFUNC(h, luma_intra, depth, avx)\
LF_FUNC (v, luma, depth, avx)\
LF_IFUNC(v, luma_intra, depth, avx)
LF_FUNCS
(
uint8_t
,
8
)
LF_FUNCS
(
uint16_t
,
10
)
LF_FUNC
(
v8
,
luma
,
8
,
mmxext
)
static
void
ff_deblock_v_luma_8_mmxext
(
uint8_t
*
pix
,
int
stride
,
int
alpha
,
int
beta
,
int8_t
*
tc0
)
{
if
((
tc0
[
0
]
&
tc0
[
1
])
>=
0
)
ff_deblock_v8_luma_mmxext
(
pix
+
0
,
stride
,
alpha
,
beta
,
tc0
);
ff_deblock_v8_luma_
8_
mmxext
(
pix
+
0
,
stride
,
alpha
,
beta
,
tc0
);
if
((
tc0
[
2
]
&
tc0
[
3
])
>=
0
)
ff_deblock_v8_luma_mmxext
(
pix
+
8
,
stride
,
alpha
,
beta
,
tc0
+
2
);
ff_deblock_v8_luma_
8_
mmxext
(
pix
+
8
,
stride
,
alpha
,
beta
,
tc0
+
2
);
}
LF_IFUNC
(
v8
,
luma_intra
,
mmxext
)
static
void
ff_deblock_v_luma_intra_mmxext
(
uint8_t
*
pix
,
int
stride
,
int
alpha
,
int
beta
)
LF_IFUNC
(
v8
,
luma_intra
,
8
,
mmxext
)
static
void
ff_deblock_v_luma_intra_
8_
mmxext
(
uint8_t
*
pix
,
int
stride
,
int
alpha
,
int
beta
)
{
ff_deblock_v8_luma_intra_mmxext
(
pix
+
0
,
stride
,
alpha
,
beta
);
ff_deblock_v8_luma_intra_mmxext
(
pix
+
8
,
stride
,
alpha
,
beta
);
ff_deblock_v8_luma_intra_
8_
mmxext
(
pix
+
0
,
stride
,
alpha
,
beta
);
ff_deblock_v8_luma_intra_
8_
mmxext
(
pix
+
8
,
stride
,
alpha
,
beta
);
}
#endif
LF_FUNC
(
h
,
luma
,
sse2
)
LF_IFUNC
(
h
,
luma_intra
,
sse2
)
LF_FUNC
(
v
,
luma
,
sse2
)
LF_IFUNC
(
v
,
luma_intra
,
sse2
)
LF_FUNC
(
h
,
luma
,
avx
)
LF_IFUNC
(
h
,
luma_intra
,
avx
)
LF_FUNC
(
v
,
luma
,
avx
)
LF_IFUNC
(
v
,
luma_intra
,
avx
)
LF_FUNC
(
v
,
luma
,
10
,
mmxext
)
LF_IFUNC
(
v
,
luma_intra
,
10
,
mmxext
)
/***********************************/
/* weighted prediction */
...
...
@@ -318,15 +322,15 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
c
->
h264_idct_add8
=
ff_h264_idct_add8_mmx2
;
c
->
h264_idct_add16intra
=
ff_h264_idct_add16intra_mmx2
;
c
->
h264_v_loop_filter_chroma
=
ff_deblock_v_chroma_mmxext
;
c
->
h264_h_loop_filter_chroma
=
ff_deblock_h_chroma_mmxext
;
c
->
h264_v_loop_filter_chroma_intra
=
ff_deblock_v_chroma_intra_mmxext
;
c
->
h264_h_loop_filter_chroma_intra
=
ff_deblock_h_chroma_intra_mmxext
;
c
->
h264_v_loop_filter_chroma
=
ff_deblock_v_chroma_
8_
mmxext
;
c
->
h264_h_loop_filter_chroma
=
ff_deblock_h_chroma_
8_
mmxext
;
c
->
h264_v_loop_filter_chroma_intra
=
ff_deblock_v_chroma_intra_
8_
mmxext
;
c
->
h264_h_loop_filter_chroma_intra
=
ff_deblock_h_chroma_intra_
8_
mmxext
;
#if ARCH_X86_32
c
->
h264_v_loop_filter_luma
=
ff_deblock_v_luma_mmxext
;
c
->
h264_h_loop_filter_luma
=
ff_deblock_h_luma_mmxext
;
c
->
h264_v_loop_filter_luma_intra
=
ff_deblock_v_luma_intra_mmxext
;
c
->
h264_h_loop_filter_luma_intra
=
ff_deblock_h_luma_intra_mmxext
;
c
->
h264_v_loop_filter_luma
=
ff_deblock_v_luma_
8_
mmxext
;
c
->
h264_h_loop_filter_luma
=
ff_deblock_h_luma_
8_
mmxext
;
c
->
h264_v_loop_filter_luma_intra
=
ff_deblock_v_luma_intra_
8_
mmxext
;
c
->
h264_h_loop_filter_luma_intra
=
ff_deblock_h_luma_intra_
8_
mmxext
;
#endif
c
->
weight_h264_pixels_tab
[
0
]
=
ff_h264_weight_16x16_mmx2
;
c
->
weight_h264_pixels_tab
[
1
]
=
ff_h264_weight_16x8_mmx2
;
...
...
@@ -364,10 +368,10 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
c
->
biweight_h264_pixels_tab
[
4
]
=
ff_h264_biweight_8x4_sse2
;
#if HAVE_ALIGNED_STACK
c
->
h264_v_loop_filter_luma
=
ff_deblock_v_luma_sse2
;
c
->
h264_h_loop_filter_luma
=
ff_deblock_h_luma_sse2
;
c
->
h264_v_loop_filter_luma_intra
=
ff_deblock_v_luma_intra_sse2
;
c
->
h264_h_loop_filter_luma_intra
=
ff_deblock_h_luma_intra_sse2
;
c
->
h264_v_loop_filter_luma
=
ff_deblock_v_luma_
8_
sse2
;
c
->
h264_h_loop_filter_luma
=
ff_deblock_h_luma_
8_
sse2
;
c
->
h264_v_loop_filter_luma_intra
=
ff_deblock_v_luma_intra_
8_
sse2
;
c
->
h264_h_loop_filter_luma_intra
=
ff_deblock_h_luma_intra_
8_
sse2
;
#endif
c
->
h264_idct_add16
=
ff_h264_idct_add16_sse2
;
...
...
@@ -383,10 +387,39 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
}
if
(
mm_flags
&
AV_CPU_FLAG_AVX
)
{
#if HAVE_ALIGNED_STACK
c
->
h264_v_loop_filter_luma
=
ff_deblock_v_luma_avx
;
c
->
h264_h_loop_filter_luma
=
ff_deblock_h_luma_avx
;
c
->
h264_v_loop_filter_luma_intra
=
ff_deblock_v_luma_intra_avx
;
c
->
h264_h_loop_filter_luma_intra
=
ff_deblock_h_luma_intra_avx
;
c
->
h264_v_loop_filter_luma
=
ff_deblock_v_luma_8_avx
;
c
->
h264_h_loop_filter_luma
=
ff_deblock_h_luma_8_avx
;
c
->
h264_v_loop_filter_luma_intra
=
ff_deblock_v_luma_intra_8_avx
;
c
->
h264_h_loop_filter_luma_intra
=
ff_deblock_h_luma_intra_8_avx
;
#endif
}
}
}
#endif
}
else
if
(
bit_depth
==
10
)
{
#if HAVE_YASM
if
(
mm_flags
&
AV_CPU_FLAG_MMX
)
{
if
(
mm_flags
&
AV_CPU_FLAG_MMX2
)
{
#if ARCH_X86_32
c
->
h264_v_loop_filter_luma
=
ff_deblock_v_luma_10_mmxext
;
c
->
h264_h_loop_filter_luma
=
ff_deblock_h_luma_10_mmxext
;
c
->
h264_v_loop_filter_luma_intra
=
ff_deblock_v_luma_intra_10_mmxext
;
c
->
h264_h_loop_filter_luma_intra
=
ff_deblock_h_luma_intra_10_mmxext
;
#endif
if
(
mm_flags
&
AV_CPU_FLAG_SSE2
)
{
#if HAVE_ALIGNED_STACK
c
->
h264_v_loop_filter_luma
=
ff_deblock_v_luma_10_sse2
;
c
->
h264_h_loop_filter_luma
=
ff_deblock_h_luma_10_sse2
;
c
->
h264_v_loop_filter_luma_intra
=
ff_deblock_v_luma_intra_10_sse2
;
c
->
h264_h_loop_filter_luma_intra
=
ff_deblock_h_luma_intra_10_sse2
;
#endif
}
if
(
mm_flags
&
AV_CPU_FLAG_AVX
)
{
#if HAVE_ALIGNED_STACK
c
->
h264_v_loop_filter_luma
=
ff_deblock_v_luma_10_avx
;
c
->
h264_h_loop_filter_luma
=
ff_deblock_h_luma_10_avx
;
c
->
h264_v_loop_filter_luma_intra
=
ff_deblock_v_luma_intra_10_avx
;
c
->
h264_h_loop_filter_luma_intra
=
ff_deblock_h_luma_intra_10_avx
;
#endif
}
}
...
...
libavcodec/x86/x86util.asm
View file @
9f3d6ca4
...
...
@@ -457,3 +457,8 @@
pshufw
%1
,
%2
,
(
%3
)
*
0x55
%endif
%endmacro
%macro
CLIPW
3
;(dst, min, max)
pmaxsw
%1
,
%2
pminsw
%1
,
%3
%endmacro
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment