Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
F
ffmpeg.wasm-core
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Linshizhi
ffmpeg.wasm-core
Commits
e25be471
Commit
e25be471
authored
Mar 02, 2012
by
Ronald S. Bultje
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
vp8: convert idct/mc x86 assembly to use cpuflags().
parent
8249a23f
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
124 additions
and
116 deletions
+124
-116
vp8dsp-init.c
libavcodec/x86/vp8dsp-init.c
+56
-56
vp8dsp.asm
libavcodec/x86/vp8dsp.asm
+68
-60
No files found.
libavcodec/x86/vp8dsp-init.c
View file @
e25be471
...
@@ -29,16 +29,16 @@
...
@@ -29,16 +29,16 @@
/*
/*
* MC functions
* MC functions
*/
*/
extern
void
ff_put_vp8_epel4_h4_mmx
ext
(
uint8_t
*
dst
,
ptrdiff_t
dststride
,
extern
void
ff_put_vp8_epel4_h4_mmx
2
(
uint8_t
*
dst
,
ptrdiff_t
dststride
,
uint8_t
*
src
,
ptrdiff_t
srcstride
,
uint8_t
*
src
,
ptrdiff_t
srcstride
,
int
height
,
int
mx
,
int
my
);
int
height
,
int
mx
,
int
my
);
extern
void
ff_put_vp8_epel4_h6_mmx
ext
(
uint8_t
*
dst
,
ptrdiff_t
dststride
,
extern
void
ff_put_vp8_epel4_h6_mmx
2
(
uint8_t
*
dst
,
ptrdiff_t
dststride
,
uint8_t
*
src
,
ptrdiff_t
srcstride
,
uint8_t
*
src
,
ptrdiff_t
srcstride
,
int
height
,
int
mx
,
int
my
);
int
height
,
int
mx
,
int
my
);
extern
void
ff_put_vp8_epel4_v4_mmx
ext
(
uint8_t
*
dst
,
ptrdiff_t
dststride
,
extern
void
ff_put_vp8_epel4_v4_mmx
2
(
uint8_t
*
dst
,
ptrdiff_t
dststride
,
uint8_t
*
src
,
ptrdiff_t
srcstride
,
uint8_t
*
src
,
ptrdiff_t
srcstride
,
int
height
,
int
mx
,
int
my
);
int
height
,
int
mx
,
int
my
);
extern
void
ff_put_vp8_epel4_v6_mmx
ext
(
uint8_t
*
dst
,
ptrdiff_t
dststride
,
extern
void
ff_put_vp8_epel4_v6_mmx
2
(
uint8_t
*
dst
,
ptrdiff_t
dststride
,
uint8_t
*
src
,
ptrdiff_t
srcstride
,
uint8_t
*
src
,
ptrdiff_t
srcstride
,
int
height
,
int
mx
,
int
my
);
int
height
,
int
mx
,
int
my
);
...
@@ -80,7 +80,7 @@ extern void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
...
@@ -80,7 +80,7 @@ extern void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t
*
src
,
ptrdiff_t
srcstride
,
uint8_t
*
src
,
ptrdiff_t
srcstride
,
int
height
,
int
mx
,
int
my
);
int
height
,
int
mx
,
int
my
);
extern
void
ff_put_vp8_bilinear4_h_mmx
ext
(
uint8_t
*
dst
,
ptrdiff_t
dststride
,
extern
void
ff_put_vp8_bilinear4_h_mmx
2
(
uint8_t
*
dst
,
ptrdiff_t
dststride
,
uint8_t
*
src
,
ptrdiff_t
srcstride
,
uint8_t
*
src
,
ptrdiff_t
srcstride
,
int
height
,
int
mx
,
int
my
);
int
height
,
int
mx
,
int
my
);
extern
void
ff_put_vp8_bilinear8_h_sse2
(
uint8_t
*
dst
,
ptrdiff_t
dststride
,
extern
void
ff_put_vp8_bilinear8_h_sse2
(
uint8_t
*
dst
,
ptrdiff_t
dststride
,
...
@@ -93,7 +93,7 @@ extern void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, ptrdiff_t dststride,
...
@@ -93,7 +93,7 @@ extern void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t
*
src
,
ptrdiff_t
srcstride
,
uint8_t
*
src
,
ptrdiff_t
srcstride
,
int
height
,
int
mx
,
int
my
);
int
height
,
int
mx
,
int
my
);
extern
void
ff_put_vp8_bilinear4_v_mmx
ext
(
uint8_t
*
dst
,
ptrdiff_t
dststride
,
extern
void
ff_put_vp8_bilinear4_v_mmx
2
(
uint8_t
*
dst
,
ptrdiff_t
dststride
,
uint8_t
*
src
,
ptrdiff_t
srcstride
,
uint8_t
*
src
,
ptrdiff_t
srcstride
,
int
height
,
int
mx
,
int
my
);
int
height
,
int
mx
,
int
my
);
extern
void
ff_put_vp8_bilinear8_v_sse2
(
uint8_t
*
dst
,
ptrdiff_t
dststride
,
extern
void
ff_put_vp8_bilinear8_v_sse2
(
uint8_t
*
dst
,
ptrdiff_t
dststride
,
...
@@ -139,27 +139,27 @@ static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
...
@@ -139,27 +139,27 @@ static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
}
}
#if ARCH_X86_32
#if ARCH_X86_32
TAP_W8
(
mmx
ext
,
epel
,
h4
)
TAP_W8
(
mmx
2
,
epel
,
h4
)
TAP_W8
(
mmx
ext
,
epel
,
h6
)
TAP_W8
(
mmx
2
,
epel
,
h6
)
TAP_W16
(
mmx
ext
,
epel
,
h6
)
TAP_W16
(
mmx
2
,
epel
,
h6
)
TAP_W8
(
mmx
ext
,
epel
,
v4
)
TAP_W8
(
mmx
2
,
epel
,
v4
)
TAP_W8
(
mmx
ext
,
epel
,
v6
)
TAP_W8
(
mmx
2
,
epel
,
v6
)
TAP_W16
(
mmx
ext
,
epel
,
v6
)
TAP_W16
(
mmx
2
,
epel
,
v6
)
TAP_W8
(
mmx
ext
,
bilinear
,
h
)
TAP_W8
(
mmx
2
,
bilinear
,
h
)
TAP_W16
(
mmx
ext
,
bilinear
,
h
)
TAP_W16
(
mmx
2
,
bilinear
,
h
)
TAP_W8
(
mmx
ext
,
bilinear
,
v
)
TAP_W8
(
mmx
2
,
bilinear
,
v
)
TAP_W16
(
mmx
ext
,
bilinear
,
v
)
TAP_W16
(
mmx
2
,
bilinear
,
v
)
#endif
#endif
TAP_W16
(
sse2
,
epel
,
h6
)
TAP_W16
(
sse2
,
epel
,
h6
)
TAP_W16
(
sse2
,
epel
,
v6
)
TAP_W16
(
sse2
,
epel
,
v6
)
TAP_W16
(
sse2
,
bilinear
,
h
)
TAP_W16
(
sse2
,
bilinear
,
h
)
TAP_W16
(
sse2
,
bilinear
,
v
)
TAP_W16
(
sse2
,
bilinear
,
v
)
TAP_W16
(
ssse3
,
epel
,
h6
)
TAP_W16
(
ssse3
,
epel
,
h6
)
TAP_W16
(
ssse3
,
epel
,
v6
)
TAP_W16
(
ssse3
,
epel
,
v6
)
TAP_W16
(
ssse3
,
bilinear
,
h
)
TAP_W16
(
ssse3
,
bilinear
,
h
)
TAP_W16
(
ssse3
,
bilinear
,
v
)
TAP_W16
(
ssse3
,
bilinear
,
v
)
#define HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT) \
#define HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT) \
static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT( \
static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT( \
...
@@ -177,13 +177,13 @@ static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT
...
@@ -177,13 +177,13 @@ static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT
#if ARCH_X86_32
#if ARCH_X86_32
#define HVTAPMMX(x, y) \
#define HVTAPMMX(x, y) \
HVTAP(mmx
ext
, 8, x, y, 4, 8) \
HVTAP(mmx
2
, 8, x, y, 4, 8) \
HVTAP(mmx
ext
, 8, x, y, 8, 16)
HVTAP(mmx
2
, 8, x, y, 8, 16)
HVTAP
(
mmx
ext
,
8
,
6
,
6
,
16
,
16
)
HVTAP
(
mmx
2
,
8
,
6
,
6
,
16
,
16
)
#else
#else
#define HVTAPMMX(x, y) \
#define HVTAPMMX(x, y) \
HVTAP(mmx
ext
, 8, x, y, 4, 8)
HVTAP(mmx
2
, 8, x, y, 4, 8)
#endif
#endif
HVTAPMMX
(
4
,
4
)
HVTAPMMX
(
4
,
4
)
...
@@ -218,16 +218,16 @@ static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
...
@@ -218,16 +218,16 @@ static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
dst, dststride, tmp, SIZE, height, mx, my); \
dst, dststride, tmp, SIZE, height, mx, my); \
}
}
HVBILIN
(
mmx
ext
,
8
,
4
,
8
)
HVBILIN
(
mmx
2
,
8
,
4
,
8
)
#if ARCH_X86_32
#if ARCH_X86_32
HVBILIN
(
mmx
ext
,
8
,
8
,
16
)
HVBILIN
(
mmx
2
,
8
,
8
,
16
)
HVBILIN
(
mmx
ext
,
8
,
16
,
16
)
HVBILIN
(
mmx
2
,
8
,
16
,
16
)
#endif
#endif
HVBILIN
(
sse2
,
8
,
8
,
16
)
HVBILIN
(
sse2
,
8
,
8
,
16
)
HVBILIN
(
sse2
,
8
,
16
,
16
)
HVBILIN
(
sse2
,
8
,
16
,
16
)
HVBILIN
(
ssse3
,
8
,
4
,
8
)
HVBILIN
(
ssse3
,
8
,
4
,
8
)
HVBILIN
(
ssse3
,
8
,
8
,
16
)
HVBILIN
(
ssse3
,
8
,
8
,
16
)
HVBILIN
(
ssse3
,
8
,
16
,
16
)
HVBILIN
(
ssse3
,
8
,
16
,
16
)
extern
void
ff_vp8_idct_dc_add_mmx
(
uint8_t
*
dst
,
DCTELEM
block
[
16
],
extern
void
ff_vp8_idct_dc_add_mmx
(
uint8_t
*
dst
,
DCTELEM
block
[
16
],
ptrdiff_t
stride
);
ptrdiff_t
stride
);
...
@@ -283,7 +283,7 @@ extern void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \
...
@@ -283,7 +283,7 @@ extern void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \
int e, int i, int hvt);
int e, int i, int hvt);
DECLARE_LOOP_FILTER
(
mmx
)
DECLARE_LOOP_FILTER
(
mmx
)
DECLARE_LOOP_FILTER
(
mmx
ext
)
DECLARE_LOOP_FILTER
(
mmx
2
)
DECLARE_LOOP_FILTER
(
sse2
)
DECLARE_LOOP_FILTER
(
sse2
)
DECLARE_LOOP_FILTER
(
ssse3
)
DECLARE_LOOP_FILTER
(
ssse3
)
DECLARE_LOOP_FILTER
(
sse4
)
DECLARE_LOOP_FILTER
(
sse4
)
...
@@ -351,26 +351,26 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
...
@@ -351,26 +351,26 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
/* note that 4-tap width=16 functions are missing because w=16
/* note that 4-tap width=16 functions are missing because w=16
* is only used for luma, and luma is always a copy or sixtap. */
* is only used for luma, and luma is always a copy or sixtap. */
if
(
mm_flags
&
AV_CPU_FLAG_MMX2
)
{
if
(
mm_flags
&
AV_CPU_FLAG_MMX2
)
{
VP8_MC_FUNC
(
2
,
4
,
mmx
ext
);
VP8_MC_FUNC
(
2
,
4
,
mmx
2
);
VP8_BILINEAR_MC_FUNC
(
2
,
4
,
mmx
ext
);
VP8_BILINEAR_MC_FUNC
(
2
,
4
,
mmx
2
);
#if ARCH_X86_32
#if ARCH_X86_32
VP8_LUMA_MC_FUNC
(
0
,
16
,
mmx
ext
);
VP8_LUMA_MC_FUNC
(
0
,
16
,
mmx
2
);
VP8_MC_FUNC
(
1
,
8
,
mmx
ext
);
VP8_MC_FUNC
(
1
,
8
,
mmx
2
);
VP8_BILINEAR_MC_FUNC
(
0
,
16
,
mmx
ext
);
VP8_BILINEAR_MC_FUNC
(
0
,
16
,
mmx
2
);
VP8_BILINEAR_MC_FUNC
(
1
,
8
,
mmx
ext
);
VP8_BILINEAR_MC_FUNC
(
1
,
8
,
mmx
2
);
c
->
vp8_v_loop_filter_simple
=
ff_vp8_v_loop_filter_simple_mmx
ext
;
c
->
vp8_v_loop_filter_simple
=
ff_vp8_v_loop_filter_simple_mmx
2
;
c
->
vp8_h_loop_filter_simple
=
ff_vp8_h_loop_filter_simple_mmx
ext
;
c
->
vp8_h_loop_filter_simple
=
ff_vp8_h_loop_filter_simple_mmx
2
;
c
->
vp8_v_loop_filter16y_inner
=
ff_vp8_v_loop_filter16y_inner_mmx
ext
;
c
->
vp8_v_loop_filter16y_inner
=
ff_vp8_v_loop_filter16y_inner_mmx
2
;
c
->
vp8_h_loop_filter16y_inner
=
ff_vp8_h_loop_filter16y_inner_mmx
ext
;
c
->
vp8_h_loop_filter16y_inner
=
ff_vp8_h_loop_filter16y_inner_mmx
2
;
c
->
vp8_v_loop_filter8uv_inner
=
ff_vp8_v_loop_filter8uv_inner_mmx
ext
;
c
->
vp8_v_loop_filter8uv_inner
=
ff_vp8_v_loop_filter8uv_inner_mmx
2
;
c
->
vp8_h_loop_filter8uv_inner
=
ff_vp8_h_loop_filter8uv_inner_mmx
ext
;
c
->
vp8_h_loop_filter8uv_inner
=
ff_vp8_h_loop_filter8uv_inner_mmx
2
;
c
->
vp8_v_loop_filter16y
=
ff_vp8_v_loop_filter16y_mbedge_mmx
ext
;
c
->
vp8_v_loop_filter16y
=
ff_vp8_v_loop_filter16y_mbedge_mmx
2
;
c
->
vp8_h_loop_filter16y
=
ff_vp8_h_loop_filter16y_mbedge_mmx
ext
;
c
->
vp8_h_loop_filter16y
=
ff_vp8_h_loop_filter16y_mbedge_mmx
2
;
c
->
vp8_v_loop_filter8uv
=
ff_vp8_v_loop_filter8uv_mbedge_mmx
ext
;
c
->
vp8_v_loop_filter8uv
=
ff_vp8_v_loop_filter8uv_mbedge_mmx
2
;
c
->
vp8_h_loop_filter8uv
=
ff_vp8_h_loop_filter8uv_mbedge_mmx
ext
;
c
->
vp8_h_loop_filter8uv
=
ff_vp8_h_loop_filter8uv_mbedge_mmx
2
;
#endif
#endif
}
}
...
...
libavcodec/x86/vp8dsp.asm
View file @
e25be471
...
@@ -173,8 +173,8 @@ SECTION .text
...
@@ -173,8 +173,8 @@ SECTION .text
; int height, int mx, int my);
; int height, int mx, int my);
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
%macro
FILTER_SSSE3
3
%macro
FILTER_SSSE3
1
cglobal
put_vp8_epel
%1
_h6
_ssse3
,
6
,
6
,
%2
cglobal
put_vp8_epel
%1
_h6
,
6
,
6
,
8
lea
r5d
,
[
r5
*
3
]
lea
r5d
,
[
r5
*
3
]
mova
m3
,
[
filter_h6_shuf2
]
mova
m3
,
[
filter_h6_shuf2
]
mova
m4
,
[
filter_h6_shuf3
]
mova
m4
,
[
filter_h6_shuf3
]
...
@@ -189,7 +189,7 @@ cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2
...
@@ -189,7 +189,7 @@ cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2
movu
m0
,
[
r2
-
2
]
movu
m0
,
[
r2
-
2
]
mova
m1
,
m0
mova
m1
,
m0
mova
m2
,
m0
mova
m2
,
m0
%if
idn
%1
,
4
%if
mmsize
==
8
; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
; shuffle with a memory operand
; shuffle with a memory operand
punpcklbw
m0
,
[
r2
+
3
]
punpcklbw
m0
,
[
r2
+
3
]
...
@@ -215,7 +215,7 @@ cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2
...
@@ -215,7 +215,7 @@ cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2
jg
.
nextrow
jg
.
nextrow
REP_RET
REP_RET
cglobal
put_vp8_epel
%1
_h4
_ssse3
,
6
,
6
,
%3
cglobal
put_vp8_epel
%1
_h4
,
6
,
6
,
7
shl
r5d
,
4
shl
r5d
,
4
mova
m2
,
[
pw_64
]
mova
m2
,
[
pw_64
]
mova
m3
,
[
filter_h2_shuf
]
mova
m3
,
[
filter_h2_shuf
]
...
@@ -246,7 +246,7 @@ cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3
...
@@ -246,7 +246,7 @@ cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3
jg
.
nextrow
jg
.
nextrow
REP_RET
REP_RET
cglobal
put_vp8_epel
%1
_v4
_ssse3
,
7
,
7
,
%2
cglobal
put_vp8_epel
%1
_v4
,
7
,
7
,
8
shl
r6d
,
4
shl
r6d
,
4
%ifdef
PIC
%ifdef
PIC
lea
r11
,
[
fourtap_filter_hb_m
]
lea
r11
,
[
fourtap_filter_hb_m
]
...
@@ -285,7 +285,7 @@ cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2
...
@@ -285,7 +285,7 @@ cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2
jg
.
nextrow
jg
.
nextrow
REP_RET
REP_RET
cglobal
put_vp8_epel
%1
_v6
_ssse3
,
7
,
7
,
%2
cglobal
put_vp8_epel
%1
_v6
,
7
,
7
,
8
lea
r6d
,
[
r6
*
3
]
lea
r6d
,
[
r6
*
3
]
%ifdef
PIC
%ifdef
PIC
lea
r11
,
[
sixtap_filter_hb_m
]
lea
r11
,
[
sixtap_filter_hb_m
]
...
@@ -333,13 +333,14 @@ cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2
...
@@ -333,13 +333,14 @@ cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2
REP_RET
REP_RET
%endmacro
%endmacro
INIT_MMX
INIT_MMX
ssse3
FILTER_SSSE3
4
,
0
,
0
FILTER_SSSE3
4
INIT_XMM
INIT_XMM
ssse3
FILTER_SSSE3
8
,
8
,
7
FILTER_SSSE3
8
; 4x4 block, H-only 4-tap filter
; 4x4 block, H-only 4-tap filter
cglobal
put_vp8_epel4_h4_mmxext
,
6
,
6
INIT_MMX
mmx2
cglobal
put_vp8_epel4_h4
,
6
,
6
shl
r5d
,
4
shl
r5d
,
4
%ifdef
PIC
%ifdef
PIC
lea
r11
,
[
fourtap_filter_hw_m
]
lea
r11
,
[
fourtap_filter_hw_m
]
...
@@ -386,7 +387,8 @@ cglobal put_vp8_epel4_h4_mmxext, 6, 6
...
@@ -386,7 +387,8 @@ cglobal put_vp8_epel4_h4_mmxext, 6, 6
REP_RET
REP_RET
; 4x4 block, H-only 6-tap filter
; 4x4 block, H-only 6-tap filter
cglobal
put_vp8_epel4_h6_mmxext
,
6
,
6
INIT_MMX
mmx2
cglobal
put_vp8_epel4_h6
,
6
,
6
lea
r5d
,
[
r5
*
3
]
lea
r5d
,
[
r5
*
3
]
%ifdef
PIC
%ifdef
PIC
lea
r11
,
[
sixtap_filter_hw_m
]
lea
r11
,
[
sixtap_filter_hw_m
]
...
@@ -442,8 +444,8 @@ cglobal put_vp8_epel4_h6_mmxext, 6, 6
...
@@ -442,8 +444,8 @@ cglobal put_vp8_epel4_h6_mmxext, 6, 6
jg
.
nextrow
jg
.
nextrow
REP_RET
REP_RET
INIT_XMM
INIT_XMM
sse2
cglobal
put_vp8_epel8_h4
_sse2
,
6
,
6
,
10
cglobal
put_vp8_epel8_h4
,
6
,
6
,
10
shl
r5d
,
5
shl
r5d
,
5
%ifdef
PIC
%ifdef
PIC
lea
r11
,
[
fourtap_filter_v_m
]
lea
r11
,
[
fourtap_filter_v_m
]
...
@@ -490,7 +492,8 @@ cglobal put_vp8_epel8_h4_sse2, 6, 6, 10
...
@@ -490,7 +492,8 @@ cglobal put_vp8_epel8_h4_sse2, 6, 6, 10
jg
.
nextrow
jg
.
nextrow
REP_RET
REP_RET
cglobal
put_vp8_epel8_h6_sse2
,
6
,
6
,
14
INIT_XMM
sse2
cglobal
put_vp8_epel8_h6
,
6
,
6
,
14
lea
r5d
,
[
r5
*
3
]
lea
r5d
,
[
r5
*
3
]
shl
r5d
,
4
shl
r5d
,
4
%ifdef
PIC
%ifdef
PIC
...
@@ -552,9 +555,9 @@ cglobal put_vp8_epel8_h6_sse2, 6, 6, 14
...
@@ -552,9 +555,9 @@ cglobal put_vp8_epel8_h6_sse2, 6, 6, 14
jg
.
nextrow
jg
.
nextrow
REP_RET
REP_RET
%macro
FILTER_V
3
%macro
FILTER_V
1
; 4x4 block, V-only 4-tap filter
; 4x4 block, V-only 4-tap filter
cglobal
put_vp8_epel
%
2
_v4_
%1
,
7
,
7
,
%3
cglobal
put_vp8_epel
%
1
_v4
,
7
,
7
,
8
shl
r6d
,
5
shl
r6d
,
5
%ifdef
PIC
%ifdef
PIC
lea
r11
,
[
fourtap_filter_v_m
]
lea
r11
,
[
fourtap_filter_v_m
]
...
@@ -607,7 +610,7 @@ cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
...
@@ -607,7 +610,7 @@ cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
; 4x4 block, V-only 6-tap filter
; 4x4 block, V-only 6-tap filter
cglobal
put_vp8_epel
%
2
_v6_
%1
,
7
,
7
,
%3
cglobal
put_vp8_epel
%
1
_v6
,
7
,
7
,
8
shl
r6d
,
4
shl
r6d
,
4
lea
r6
,
[
r6
*
3
]
lea
r6
,
[
r6
*
3
]
%ifdef
PIC
%ifdef
PIC
...
@@ -671,13 +674,13 @@ cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
...
@@ -671,13 +674,13 @@ cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
REP_RET
REP_RET
%endmacro
%endmacro
INIT_MMX
INIT_MMX
mmx2
FILTER_V
mmxext
,
4
,
0
FILTER_V
4
INIT_XMM
INIT_XMM
sse2
FILTER_V
sse2
,
8
,
8
FILTER_V
8
%macro
FILTER_BILINEAR
3
%macro
FILTER_BILINEAR
1
cglobal
put_vp8_bilinear
%
2
_v_
%1
,
7
,
7
,
%3
cglobal
put_vp8_bilinear
%
1
_v
,
7
,
7
,
7
mov
r5d
,
8
*
16
mov
r5d
,
8
*
16
shl
r6d
,
4
shl
r6d
,
4
sub
r5d
,
r6d
sub
r5d
,
r6d
...
@@ -705,7 +708,7 @@ cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
...
@@ -705,7 +708,7 @@ cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
psraw
m2
,
2
psraw
m2
,
2
pavgw
m0
,
m6
pavgw
m0
,
m6
pavgw
m2
,
m6
pavgw
m2
,
m6
%if
idn
%1
,
mmxext
%if
mmsize
==
8
packuswb
m0
,
m0
packuswb
m0
,
m0
packuswb
m2
,
m2
packuswb
m2
,
m2
movh
[
r0
+
r1
*
0
]
,
m0
movh
[
r0
+
r1
*
0
]
,
m0
...
@@ -722,7 +725,7 @@ cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
...
@@ -722,7 +725,7 @@ cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
jg
.
nextrow
jg
.
nextrow
REP_RET
REP_RET
cglobal
put_vp8_bilinear
%
2
_h_
%1
,
7
,
7
,
%3
cglobal
put_vp8_bilinear
%
1
_h
,
7
,
7
,
7
mov
r6d
,
8
*
16
mov
r6d
,
8
*
16
shl
r5d
,
4
shl
r5d
,
4
sub
r6d
,
r5d
sub
r6d
,
r5d
...
@@ -751,7 +754,7 @@ cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
...
@@ -751,7 +754,7 @@ cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
psraw
m2
,
2
psraw
m2
,
2
pavgw
m0
,
m6
pavgw
m0
,
m6
pavgw
m2
,
m6
pavgw
m2
,
m6
%if
idn
%1
,
mmxext
%if
mmsize
==
8
packuswb
m0
,
m0
packuswb
m0
,
m0
packuswb
m2
,
m2
packuswb
m2
,
m2
movh
[
r0
+
r1
*
0
]
,
m0
movh
[
r0
+
r1
*
0
]
,
m0
...
@@ -769,13 +772,13 @@ cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
...
@@ -769,13 +772,13 @@ cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
REP_RET
REP_RET
%endmacro
%endmacro
INIT_MMX
INIT_MMX
mmx2
FILTER_BILINEAR
mmxext
,
4
,
0
FILTER_BILINEAR
4
INIT_XMM
INIT_XMM
sse2
FILTER_BILINEAR
sse2
,
8
,
7
FILTER_BILINEAR
8
%macro
FILTER_BILINEAR_SSSE3
1
%macro
FILTER_BILINEAR_SSSE3
1
cglobal
put_vp8_bilinear
%1
_v
_ssse3
,
7
,
7
cglobal
put_vp8_bilinear
%1
_v
,
7
,
7
,
5
shl
r6d
,
4
shl
r6d
,
4
%ifdef
PIC
%ifdef
PIC
lea
r11
,
[
bilinear_filter_vb_m
]
lea
r11
,
[
bilinear_filter_vb_m
]
...
@@ -811,7 +814,7 @@ cglobal put_vp8_bilinear%1_v_ssse3, 7,7
...
@@ -811,7 +814,7 @@ cglobal put_vp8_bilinear%1_v_ssse3, 7,7
jg
.
nextrow
jg
.
nextrow
REP_RET
REP_RET
cglobal
put_vp8_bilinear
%1
_h
_ssse3
,
7
,
7
cglobal
put_vp8_bilinear
%1
_h
,
7
,
7
,
5
shl
r5d
,
4
shl
r5d
,
4
%ifdef
PIC
%ifdef
PIC
lea
r11
,
[
bilinear_filter_vb_m
]
lea
r11
,
[
bilinear_filter_vb_m
]
...
@@ -848,12 +851,13 @@ cglobal put_vp8_bilinear%1_h_ssse3, 7,7
...
@@ -848,12 +851,13 @@ cglobal put_vp8_bilinear%1_h_ssse3, 7,7
REP_RET
REP_RET
%endmacro
%endmacro
INIT_MMX
INIT_MMX
ssse3
FILTER_BILINEAR_SSSE3
4
FILTER_BILINEAR_SSSE3
4
INIT_XMM
INIT_XMM
ssse3
FILTER_BILINEAR_SSSE3
8
FILTER_BILINEAR_SSSE3
8
cglobal
put_vp8_pixels8_mmx
,
5
,
5
INIT_MMX
mmx
cglobal
put_vp8_pixels8
,
5
,
5
.
nextrow
:
.
nextrow
:
movq
mm0
,
[
r2
+
r3
*
0
]
movq
mm0
,
[
r2
+
r3
*
0
]
movq
mm1
,
[
r2
+
r3
*
1
]
movq
mm1
,
[
r2
+
r3
*
1
]
...
@@ -866,7 +870,8 @@ cglobal put_vp8_pixels8_mmx, 5,5
...
@@ -866,7 +870,8 @@ cglobal put_vp8_pixels8_mmx, 5,5
REP_RET
REP_RET
%if
ARCH_X86_32
%if
ARCH_X86_32
cglobal
put_vp8_pixels16_mmx
,
5
,
5
INIT_MMX
mmx
cglobal
put_vp8_pixels16
,
5
,
5
.
nextrow
:
.
nextrow
:
movq
mm0
,
[
r2
+
r3
*
0
+
0
]
movq
mm0
,
[
r2
+
r3
*
0
+
0
]
movq
mm1
,
[
r2
+
r3
*
0
+
8
]
movq
mm1
,
[
r2
+
r3
*
0
+
8
]
...
@@ -883,7 +888,8 @@ cglobal put_vp8_pixels16_mmx, 5,5
...
@@ -883,7 +888,8 @@ cglobal put_vp8_pixels16_mmx, 5,5
REP_RET
REP_RET
%endif
%endif
cglobal
put_vp8_pixels16_sse
,
5
,
5
,
2
INIT_XMM
sse
cglobal
put_vp8_pixels16
,
5
,
5
,
2
.
nextrow
:
.
nextrow
:
movups
xmm0
,
[
r2
+
r3
*
0
]
movups
xmm0
,
[
r2
+
r3
*
0
]
movups
xmm1
,
[
r2
+
r3
*
1
]
movups
xmm1
,
[
r2
+
r3
*
1
]
...
@@ -918,8 +924,8 @@ cglobal put_vp8_pixels16_sse, 5,5,2
...
@@ -918,8 +924,8 @@ cglobal put_vp8_pixels16_sse, 5,5,2
%4
[
r1
+
r2
+
%3
]
,
m5
%4
[
r1
+
r2
+
%3
]
,
m5
%endmacro
%endmacro
INIT_MMX
INIT_MMX
mmx
cglobal
vp8_idct_dc_add
_mmx
,
3
,
3
cglobal
vp8_idct_dc_add
,
3
,
3
; load data
; load data
movd
m0
,
[r1]
movd
m0
,
[r1]
...
@@ -941,8 +947,8 @@ cglobal vp8_idct_dc_add_mmx, 3, 3
...
@@ -941,8 +947,8 @@ cglobal vp8_idct_dc_add_mmx, 3, 3
ADD_DC
m0
,
m1
,
0
,
movh
ADD_DC
m0
,
m1
,
0
,
movh
RET
RET
INIT_XMM
INIT_XMM
sse4
cglobal
vp8_idct_dc_add
_sse4
,
3
,
3
,
6
cglobal
vp8_idct_dc_add
,
3
,
3
,
6
; load data
; load data
movd
m0
,
[r1]
movd
m0
,
[r1]
pxor
m1
,
m1
pxor
m1
,
m1
...
@@ -976,8 +982,8 @@ cglobal vp8_idct_dc_add_sse4, 3, 3, 6
...
@@ -976,8 +982,8 @@ cglobal vp8_idct_dc_add_sse4, 3, 3, 6
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
%if
ARCH_X86_32
%if
ARCH_X86_32
INIT_MMX
INIT_MMX
mmx
cglobal
vp8_idct_dc_add4y
_mmx
,
3
,
3
cglobal
vp8_idct_dc_add4y
,
3
,
3
; load data
; load data
movd
m0
,
[
r1
+
32
*
0
]
; A
movd
m0
,
[
r1
+
32
*
0
]
; A
movd
m1
,
[
r1
+
32
*
2
]
; C
movd
m1
,
[
r1
+
32
*
2
]
; C
...
@@ -1012,8 +1018,8 @@ cglobal vp8_idct_dc_add4y_mmx, 3, 3
...
@@ -1012,8 +1018,8 @@ cglobal vp8_idct_dc_add4y_mmx, 3, 3
RET
RET
%endif
%endif
INIT_XMM
INIT_XMM
sse2
cglobal
vp8_idct_dc_add4y
_sse2
,
3
,
3
,
6
cglobal
vp8_idct_dc_add4y
,
3
,
3
,
6
; load data
; load data
movd
m0
,
[
r1
+
32
*
0
]
; A
movd
m0
,
[
r1
+
32
*
0
]
; A
movd
m1
,
[
r1
+
32
*
2
]
; C
movd
m1
,
[
r1
+
32
*
2
]
; C
...
@@ -1046,8 +1052,8 @@ cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6
...
@@ -1046,8 +1052,8 @@ cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6
; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
INIT_MMX
INIT_MMX
mmx
cglobal
vp8_idct_dc_add4uv
_mmx
,
3
,
3
cglobal
vp8_idct_dc_add4uv
,
3
,
3
; load data
; load data
movd
m0
,
[
r1
+
32
*
0
]
; A
movd
m0
,
[
r1
+
32
*
0
]
; A
movd
m1
,
[
r1
+
32
*
2
]
; C
movd
m1
,
[
r1
+
32
*
2
]
; C
...
@@ -1118,9 +1124,8 @@ cglobal vp8_idct_dc_add4uv_mmx, 3, 3
...
@@ -1118,9 +1124,8 @@ cglobal vp8_idct_dc_add4uv_mmx, 3, 3
SWAP
%4
,
%3
SWAP
%4
,
%3
%endmacro
%endmacro
INIT_MMX
%macro
VP8_IDCT_ADD
0
%macro
VP8_IDCT_ADD
1
cglobal
vp8_idct_add
,
3
,
3
cglobal
vp8_idct_add_
%1
,
3
,
3
; load block data
; load block data
movq
m0
,
[
r1
+
0
]
movq
m0
,
[
r1
+
0
]
movq
m1
,
[
r1
+
8
]
movq
m1
,
[
r1
+
8
]
...
@@ -1128,7 +1133,7 @@ cglobal vp8_idct_add_%1, 3, 3
...
@@ -1128,7 +1133,7 @@ cglobal vp8_idct_add_%1, 3, 3
movq
m3
,
[
r1
+
24
]
movq
m3
,
[
r1
+
24
]
movq
m6
,
[
pw_20091
]
movq
m6
,
[
pw_20091
]
movq
m7
,
[
pw_17734
]
movq
m7
,
[
pw_17734
]
%if
idn
%1
,
sse
%if
cpuflag
(
sse
)
xorps
xmm0
,
xmm0
xorps
xmm0
,
xmm0
movaps
[
r1
+
0
]
,
xmm0
movaps
[
r1
+
0
]
,
xmm0
movaps
[
r1
+
16
]
,
xmm0
movaps
[
r1
+
16
]
,
xmm0
...
@@ -1157,9 +1162,11 @@ cglobal vp8_idct_add_%1, 3, 3
...
@@ -1157,9 +1162,11 @@ cglobal vp8_idct_add_%1, 3, 3
%endmacro
%endmacro
%if
ARCH_X86_32
%if
ARCH_X86_32
VP8_IDCT_ADD
mmx
INIT_MMX
mmx
VP8_IDCT_ADD
%endif
%endif
VP8_IDCT_ADD
sse
INIT_MMX
sse
VP8_IDCT_ADD
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16])
; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16])
...
@@ -1192,13 +1199,13 @@ VP8_IDCT_ADD sse
...
@@ -1192,13 +1199,13 @@ VP8_IDCT_ADD sse
SWAP
%1
,
%4
,
%3
SWAP
%1
,
%4
,
%3
%endmacro
%endmacro
%macro
VP8_DC_WHT
1
%macro
VP8_DC_WHT
0
cglobal
vp8_luma_dc_wht
_
%1
,
2
,
3
cglobal
vp8_luma_dc_wht
,
2
,
3
movq
m0
,
[r1]
movq
m0
,
[r1]
movq
m1
,
[
r1
+
8
]
movq
m1
,
[
r1
+
8
]
movq
m2
,
[
r1
+
16
]
movq
m2
,
[
r1
+
16
]
movq
m3
,
[
r1
+
24
]
movq
m3
,
[
r1
+
24
]
%if
idn
%1
,
sse
%if
cpuflag
(
sse
)
xorps
xmm0
,
xmm0
xorps
xmm0
,
xmm0
movaps
[
r1
+
0
]
,
xmm0
movaps
[
r1
+
0
]
,
xmm0
movaps
[
r1
+
16
]
,
xmm0
movaps
[
r1
+
16
]
,
xmm0
...
@@ -1222,11 +1229,12 @@ cglobal vp8_luma_dc_wht_%1, 2,3
...
@@ -1222,11 +1229,12 @@ cglobal vp8_luma_dc_wht_%1, 2,3
RET
RET
%endmacro
%endmacro
INIT_MMX
%if
ARCH_X86_32
%if
ARCH_X86_32
VP8_DC_WHT
mmx
INIT_MMX
mmx
VP8_DC_WHT
%endif
%endif
VP8_DC_WHT
sse
INIT_MMX
sse
VP8_DC_WHT
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment