Commit 344d5190 authored Sep 16, 2015 by Ronald S. Bultje

vp9: add subpel MC SIMD for 10/12bpp.
parent 77f35967
Showing 9 changed files with 708 additions and 165 deletions.

    libavcodec/x86/Makefile                        +4    -1
    libavcodec/x86/vp9dsp_init.c                  +43  -154
    libavcodec/x86/vp9dsp_init.h                 +109    -1
    libavcodec/x86/vp9dsp_init_10bpp.c            +25    -0
    libavcodec/x86/vp9dsp_init_12bpp.c            +25    -0
    libavcodec/x86/vp9dsp_init_16bpp.c             +1    -1
    libavcodec/x86/vp9dsp_init_16bpp_template.c   +62    -0
    libavcodec/x86/vp9mc.asm                      +18    -8
    libavcodec/x86/vp9mc_16bpp.asm               +421    -0
libavcodec/x86/Makefile

@@ -63,6 +63,8 @@ OBJS-$(CONFIG_VC1_DECODER)             += x86/vc1dsp_init.o
 OBJS-$(CONFIG_VORBIS_DECODER)          += x86/vorbisdsp_init.o
 OBJS-$(CONFIG_VP6_DECODER)             += x86/vp6dsp_init.o
 OBJS-$(CONFIG_VP9_DECODER)             += x86/vp9dsp_init.o            \
+                                          x86/vp9dsp_init_10bpp.o      \
+                                          x86/vp9dsp_init_12bpp.o      \
                                           x86/vp9dsp_init_16bpp.o
 OBJS-$(CONFIG_WEBP_DECODER)            += x86/vp8dsp_init.o
@@ -157,5 +159,6 @@ YASM-OBJS-$(CONFIG_VP6_DECODER)        += x86/vp6dsp.o
 YASM-OBJS-$(CONFIG_VP9_DECODER)        += x86/vp9intrapred.o           \
                                           x86/vp9itxfm.o               \
                                           x86/vp9lpf.o                 \
-                                          x86/vp9mc.o
+                                          x86/vp9mc.o                  \
+                                          x86/vp9mc_16bpp.o
 YASM-OBJS-$(CONFIG_WEBP_DECODER)       += x86/vp8dsp.o
libavcodec/x86/vp9dsp_init.c

@@ -44,144 +44,52 @@ decl_fpel_func(put, 64, , avx);
 decl_fpel_func(avg, 32, _8, avx2);
 decl_fpel_func(avg, 64, _8, avx2);
 
-#define mc_func(avg, sz, dir, opt, type, f_sz) \
-void ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
-                                                 const uint8_t *src, ptrdiff_t src_stride, \
-                                                 int h, const type (*filter)[f_sz])
-#define mc_funcs(sz, opt, type, fsz) \
-mc_func(put, sz, h, opt, type, fsz); \
-mc_func(avg, sz, h, opt, type, fsz); \
-mc_func(put, sz, v, opt, type, fsz); \
-mc_func(avg, sz, v, opt, type, fsz)
-
-mc_funcs(4, mmxext, int16_t, 8);
-mc_funcs(8, sse2, int16_t, 8);
-mc_funcs(4, ssse3, int8_t, 32);
-mc_funcs(8, ssse3, int8_t, 32);
+decl_mc_funcs(4, mmxext, int16_t, 8, 8);
+decl_mc_funcs(8, sse2, int16_t, 8, 8);
+decl_mc_funcs(4, ssse3, int8_t, 32, 8);
+decl_mc_funcs(8, ssse3, int8_t, 32, 8);
 #if ARCH_X86_64
-mc_funcs(16, ssse3, int8_t, 32);
-mc_funcs(32, avx2, int8_t, 32);
+decl_mc_funcs(16, ssse3, int8_t, 32, 8);
+decl_mc_funcs(32, avx2, int8_t, 32, 8);
 #endif
 
-#undef mc_funcs
-#undef mc_func
-
-#define mc_rep_func(avg, sz, hsz, dir, opt, type, f_sz) \
-static av_always_inline void \
-ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
-                                            const uint8_t *src, ptrdiff_t src_stride, \
-                                            int h, const type (*filter)[f_sz]) \
-{ \
-    ff_vp9_##avg##_8tap_1d_##dir##_##hsz##_##opt(dst, dst_stride, src, \
-                                                 src_stride, h, filter); \
-    ff_vp9_##avg##_8tap_1d_##dir##_##hsz##_##opt(dst + hsz, dst_stride, src + hsz, \
-                                                 src_stride, h, filter); \
-}
-
-#define mc_rep_funcs(sz, hsz, opt, type, fsz) \
-mc_rep_func(put, sz, hsz, h, opt, type, fsz); \
-mc_rep_func(avg, sz, hsz, h, opt, type, fsz); \
-mc_rep_func(put, sz, hsz, v, opt, type, fsz); \
-mc_rep_func(avg, sz, hsz, v, opt, type, fsz)
-
-mc_rep_funcs(16, 8, sse2, int16_t, 8);
+mc_rep_funcs(16, 8, 8, sse2, int16_t, 8, 8);
 #if ARCH_X86_32
-mc_rep_funcs(16, 8, ssse3, int8_t, 32);
+mc_rep_funcs(16, 8, 8, ssse3, int8_t, 32, 8);
 #endif
-mc_rep_funcs(32, 16, sse2, int16_t, 8);
-mc_rep_funcs(32, 16, ssse3, int8_t, 32);
-mc_rep_funcs(64, 32, sse2, int16_t, 8);
-mc_rep_funcs(64, 32, ssse3, int8_t, 32);
+mc_rep_funcs(32, 16, 16, sse2, int16_t, 8, 8);
+mc_rep_funcs(32, 16, 16, ssse3, int8_t, 32, 8);
+mc_rep_funcs(64, 32, 32, sse2, int16_t, 8, 8);
+mc_rep_funcs(64, 32, 32, ssse3, int8_t, 32, 8);
 #if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
-mc_rep_funcs(64, 32, avx2, int8_t, 32);
+mc_rep_funcs(64, 32, 32, avx2, int8_t, 32, 8);
 #endif
 
-#undef mc_rep_funcs
-#undef mc_rep_func
-
 extern const int8_t ff_filters_ssse3[3][15][4][32];
 extern const int16_t ff_filters_sse2[3][15][8][8];
 
-#define filter_8tap_2d_fn(op, sz, f, f_opt, fname, align, opt) \
-static void op##_8tap_##fname##_##sz##hv_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
-                                               const uint8_t *src, ptrdiff_t src_stride, \
-                                               int h, int mx, int my) \
-{ \
-    LOCAL_ALIGNED_##align(uint8_t, temp, [71 * 64]); \
-    ff_vp9_put_8tap_1d_h_##sz##_##opt(temp, 64, src - 3 * src_stride, src_stride, \
-                                      h + 7, ff_filters_##f_opt[f][mx - 1]); \
-    ff_vp9_##op##_8tap_1d_v_##sz##_##opt(dst, dst_stride, temp + 3 * 64, 64, \
-                                         h, ff_filters_##f_opt[f][my - 1]); \
-}
-
-#define filters_8tap_2d_fn(op, sz, align, opt, f_opt) \
-filter_8tap_2d_fn(op, sz, FILTER_8TAP_REGULAR, f_opt, regular, align, opt) \
-filter_8tap_2d_fn(op, sz, FILTER_8TAP_SHARP, f_opt, sharp, align, opt) \
-filter_8tap_2d_fn(op, sz, FILTER_8TAP_SMOOTH, f_opt, smooth, align, opt)
-
-#define filters_8tap_2d_fn2(op, align, opt4, opt8, f_opt) \
-filters_8tap_2d_fn(op, 64, align, opt8, f_opt) \
-filters_8tap_2d_fn(op, 32, align, opt8, f_opt) \
-filters_8tap_2d_fn(op, 16, align, opt8, f_opt) \
-filters_8tap_2d_fn(op, 8, align, opt8, f_opt) \
-filters_8tap_2d_fn(op, 4, align, opt4, f_opt)
-
-filters_8tap_2d_fn2(put, 16, mmxext, sse2, sse2)
-filters_8tap_2d_fn2(avg, 16, mmxext, sse2, sse2)
-filters_8tap_2d_fn2(put, 16, ssse3, ssse3, ssse3)
-filters_8tap_2d_fn2(avg, 16, ssse3, ssse3, ssse3)
+filters_8tap_2d_fn2(put, 16, 8, 1, mmxext, sse2, sse2)
+filters_8tap_2d_fn2(avg, 16, 8, 1, mmxext, sse2, sse2)
+filters_8tap_2d_fn2(put, 16, 8, 1, ssse3, ssse3, ssse3)
+filters_8tap_2d_fn2(avg, 16, 8, 1, ssse3, ssse3, ssse3)
 #if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
-filters_8tap_2d_fn(put, 64, 32, avx2, ssse3)
-filters_8tap_2d_fn(put, 32, 32, avx2, ssse3)
-filters_8tap_2d_fn(avg, 64, 32, avx2, ssse3)
-filters_8tap_2d_fn(avg, 32, 32, avx2, ssse3)
+filters_8tap_2d_fn(put, 64, 32, 8, 1, avx2, ssse3)
+filters_8tap_2d_fn(put, 32, 32, 8, 1, avx2, ssse3)
+filters_8tap_2d_fn(avg, 64, 32, 8, 1, avx2, ssse3)
+filters_8tap_2d_fn(avg, 32, 32, 8, 1, avx2, ssse3)
 #endif
 
-#undef filters_8tap_2d_fn2
-#undef filters_8tap_2d_fn
-#undef filter_8tap_2d_fn
-
-#define filter_8tap_1d_fn(op, sz, f, f_opt, fname, dir, dvar, opt) \
-static void op##_8tap_##fname##_##sz##dir##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
-                                                  const uint8_t *src, ptrdiff_t src_stride, \
-                                                  int h, int mx, int my) \
-{ \
-    ff_vp9_##op##_8tap_1d_##dir##_##sz##_##opt(dst, dst_stride, src, src_stride, \
-                                               h, ff_filters_##f_opt[f][dvar - 1]); \
-}
-
-#define filters_8tap_1d_fn(op, sz, dir, dvar, opt, f_opt) \
-filter_8tap_1d_fn(op, sz, FILTER_8TAP_REGULAR, f_opt, regular, dir, dvar, opt) \
-filter_8tap_1d_fn(op, sz, FILTER_8TAP_SHARP, f_opt, sharp, dir, dvar, opt) \
-filter_8tap_1d_fn(op, sz, FILTER_8TAP_SMOOTH, f_opt, smooth, dir, dvar, opt)
-
-#define filters_8tap_1d_fn2(op, sz, opt, f_opt) \
-filters_8tap_1d_fn(op, sz, h, mx, opt, f_opt) \
-filters_8tap_1d_fn(op, sz, v, my, opt, f_opt)
-
-#define filters_8tap_1d_fn3(op, opt4, opt8, f_opt) \
-filters_8tap_1d_fn2(op, 64, opt8, f_opt) \
-filters_8tap_1d_fn2(op, 32, opt8, f_opt) \
-filters_8tap_1d_fn2(op, 16, opt8, f_opt) \
-filters_8tap_1d_fn2(op, 8, opt8, f_opt) \
-filters_8tap_1d_fn2(op, 4, opt4, f_opt)
-
-filters_8tap_1d_fn3(put, mmxext, sse2, sse2)
-filters_8tap_1d_fn3(avg, mmxext, sse2, sse2)
-filters_8tap_1d_fn3(put, ssse3, ssse3, ssse3)
-filters_8tap_1d_fn3(avg, ssse3, ssse3, ssse3)
+filters_8tap_1d_fn3(put, 8, mmxext, sse2, sse2)
+filters_8tap_1d_fn3(avg, 8, mmxext, sse2, sse2)
+filters_8tap_1d_fn3(put, 8, ssse3, ssse3, ssse3)
+filters_8tap_1d_fn3(avg, 8, ssse3, ssse3, ssse3)
 #if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
-filters_8tap_1d_fn2(put, 64, avx2, ssse3)
-filters_8tap_1d_fn2(put, 32, avx2, ssse3)
-filters_8tap_1d_fn2(avg, 64, avx2, ssse3)
-filters_8tap_1d_fn2(avg, 32, avx2, ssse3)
+filters_8tap_1d_fn2(put, 64, 8, avx2, ssse3)
+filters_8tap_1d_fn2(put, 32, 8, avx2, ssse3)
+filters_8tap_1d_fn2(avg, 64, 8, avx2, ssse3)
+filters_8tap_1d_fn2(avg, 32, 8, avx2, ssse3)
 #endif
 
-#undef filters_8tap_1d_fn
-#undef filters_8tap_1d_fn2
-#undef filters_8tap_1d_fn3
-#undef filter_8tap_1d_fn
-
 #define itxfm_func(typea, typeb, size, opt) \
 void ff_vp9_##typea##_##typeb##_##size##x##size##_add_##opt(uint8_t *dst, ptrdiff_t stride, \
                                                             int16_t *block, int eob)
@@ -306,36 +214,17 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
 {
 #if HAVE_YASM
     int cpu_flags;
 
-    if (bpp != 8) {
-        ff_vp9dsp_init_16bpp_x86(dsp, bpp);
+    if (bpp == 10) {
+        ff_vp9dsp_init_10bpp_x86(dsp);
+        return;
+    } else if (bpp == 12) {
+        ff_vp9dsp_init_12bpp_x86(dsp);
         return;
     }
 
     cpu_flags = av_get_cpu_flags();
 
-#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, opt) \
-    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = type##_8tap_smooth_##sz##dir##_##opt; \
-    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type##_8tap_regular_##sz##dir##_##opt; \
-    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][idxh][idxv] = type##_8tap_sharp_##sz##dir##_##opt
-
-#define init_subpel2(idx1, idx2, sz, type, opt) \
-    init_subpel1(idx1, idx2, 1, 1, sz, hv, type, opt); \
-    init_subpel1(idx1, idx2, 0, 1, sz, v,  type, opt); \
-    init_subpel1(idx1, idx2, 1, 0, sz, h,  type, opt)
-
-#define init_subpel3_32_64(idx, type, opt) \
-    init_subpel2(0, idx, 64, type, opt); \
-    init_subpel2(1, idx, 32, type, opt)
-
-#define init_subpel3_8to64(idx, type, opt) \
-    init_subpel3_32_64(idx, type, opt); \
-    init_subpel2(2, idx, 16, type, opt); \
-    init_subpel2(3, idx, 8, type, opt)
-
-#define init_subpel3(idx, type, opt) \
-    init_subpel3_8to64(idx, type, opt); \
-    init_subpel2(4, idx, 4, type, opt)
-
 #define init_lpf(opt) do { \
     dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_##opt; \
     dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_##opt; \
@@ -390,8 +279,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
     }
 
     if (EXTERNAL_MMXEXT(cpu_flags)) {
-        init_subpel2(4, 0, 4, put, mmxext);
-        init_subpel2(4, 1, 4, avg, mmxext);
+        init_subpel2(4, 0, 4, put, 8, mmxext);
+        init_subpel2(4, 1, 4, avg, 8, mmxext);
         init_fpel_func(4, 1, 4, avg, _8, mmxext);
         init_fpel_func(3, 1, 8, avg, _8, mmxext);
         dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_mmxext;
@@ -409,8 +298,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
     }
 
    if (EXTERNAL_SSE2(cpu_flags)) {
-        init_subpel3_8to64(0, put, sse2);
-        init_subpel3_8to64(1, avg, sse2);
+        init_subpel3_8to64(0, put, 8, sse2);
+        init_subpel3_8to64(1, avg, 8, sse2);
         init_fpel_func(2, 1, 16, avg, _8, sse2);
         init_fpel_func(1, 1, 32, avg, _8, sse2);
         init_fpel_func(0, 1, 64, avg, _8, sse2);
@@ -439,8 +328,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
     }
 
     if (EXTERNAL_SSSE3(cpu_flags)) {
-        init_subpel3(0, put, ssse3);
-        init_subpel3(1, avg, ssse3);
+        init_subpel3(0, put, 8, ssse3);
+        init_subpel3(1, avg, 8, ssse3);
         dsp->itxfm_add[TX_4X4][DCT_DCT]  = ff_vp9_idct_idct_4x4_add_ssse3;
         dsp->itxfm_add[TX_4X4][ADST_DCT] = ff_vp9_idct_iadst_4x4_add_ssse3;
         dsp->itxfm_add[TX_4X4][DCT_ADST] = ff_vp9_iadst_idct_4x4_add_ssse3;
@@ -493,8 +382,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
         init_fpel_func(0, 1, 64, avg, _8, avx2);
         if (ARCH_X86_64) {
 #if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
-            init_subpel3_32_64(0, put, avx2);
-            init_subpel3_32_64(1, avg, avx2);
+            init_subpel3_32_64(0, put, 8, avx2);
+            init_subpel3_32_64(1, avg, 8, avx2);
 #endif
         }
         init_dc_ipred(32, avx2);
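
Note (not part of the commit, added for orientation): the hv (2D) subpel functions generated by filter_8tap_2d_fn are simply a composition of the two 1D kernels through an on-stack temp buffer. The horizontal pass filters h + 7 rows starting 3 rows above the block, since the vertical 8-tap needs 3 rows of context above and 4 below, and the vertical pass then reads out of the temp. A minimal C sketch of that shape, with the 1D kernels passed as function pointers because the real names are macro-pasted:

    #include <stddef.h>
    #include <stdint.h>

    /* Hypothetical stand-in for the macro-pasted
     * ff_vp9_*_8tap_1d_{h,v}_* symbols. */
    typedef void (*vp9_mc_1d_fn)(uint8_t *dst, ptrdiff_t dst_stride,
                                 const uint8_t *src, ptrdiff_t src_stride,
                                 int h, const int16_t (*filter)[8]);

    static void mc_8tap_hv(vp9_mc_1d_fn put_h, vp9_mc_1d_fn op_v,
                           const int16_t (*fh)[8], const int16_t (*fv)[8],
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const uint8_t *src, ptrdiff_t src_stride, int h)
    {
        /* 64-wide scratch, h + 7 rows: 3 above and 4 below the block give
         * the vertical 8-tap its full support (same layout as the
         * LOCAL_ALIGNED temp in the macro). */
        uint8_t temp[71 * 64];

        /* pass 1: horizontal filter, starting 3 rows above the block */
        put_h(temp, 64, src - 3 * src_stride, src_stride, h + 7, fh);
        /* pass 2: vertical filter from the temp, skipping the lead-in rows */
        op_v(dst, dst_stride, temp + 3 * 64, 64, h, fv);
    }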
libavcodec/x86/vp9dsp_init.h

@@ -28,12 +28,120 @@ void ff_vp9_##avg##sz##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
                                    const uint8_t *src, ptrdiff_t src_stride, \
                                    int h, int mx, int my)
 
+#define decl_mc_func(avg, sz, dir, opt, type, f_sz, bpp) \
+void ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+                                                         const uint8_t *src, ptrdiff_t src_stride, \
+                                                         int h, const type (*filter)[f_sz])
+
+#define decl_mc_funcs(sz, opt, type, fsz, bpp) \
+decl_mc_func(put, sz, h, opt, type, fsz, bpp); \
+decl_mc_func(avg, sz, h, opt, type, fsz, bpp); \
+decl_mc_func(put, sz, v, opt, type, fsz, bpp); \
+decl_mc_func(avg, sz, v, opt, type, fsz, bpp)
+
+#define mc_rep_func(avg, sz, hsz, hszb, dir, opt, type, f_sz, bpp) \
+static av_always_inline void \
+ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+                                                    const uint8_t *src, ptrdiff_t src_stride, \
+                                                    int h, const type (*filter)[f_sz]) \
+{ \
+    ff_vp9_##avg##_8tap_1d_##dir##_##hsz##_##bpp##_##opt(dst, dst_stride, src, \
+                                                         src_stride, h, filter); \
+    ff_vp9_##avg##_8tap_1d_##dir##_##hsz##_##bpp##_##opt(dst + hszb, dst_stride, src + hszb, \
+                                                         src_stride, h, filter); \
+}
+
+#define mc_rep_funcs(sz, hsz, hszb, opt, type, fsz, bpp) \
+mc_rep_func(put, sz, hsz, hszb, h, opt, type, fsz, bpp); \
+mc_rep_func(avg, sz, hsz, hszb, h, opt, type, fsz, bpp); \
+mc_rep_func(put, sz, hsz, hszb, v, opt, type, fsz, bpp); \
+mc_rep_func(avg, sz, hsz, hszb, v, opt, type, fsz, bpp)
+
+#define filter_8tap_1d_fn(op, sz, f, f_opt, fname, dir, dvar, bpp, opt) \
+static void op##_8tap_##fname##_##sz##dir##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+                                                          const uint8_t *src, ptrdiff_t src_stride, \
+                                                          int h, int mx, int my) \
+{ \
+    ff_vp9_##op##_8tap_1d_##dir##_##sz##_##bpp##_##opt(dst, dst_stride, src, src_stride, \
+                                                       h, ff_filters_##f_opt[f][dvar - 1]); \
+}
+
+#define filters_8tap_1d_fn(op, sz, dir, dvar, bpp, opt, f_opt) \
+filter_8tap_1d_fn(op, sz, FILTER_8TAP_REGULAR, f_opt, regular, dir, dvar, bpp, opt) \
+filter_8tap_1d_fn(op, sz, FILTER_8TAP_SHARP, f_opt, sharp, dir, dvar, bpp, opt) \
+filter_8tap_1d_fn(op, sz, FILTER_8TAP_SMOOTH, f_opt, smooth, dir, dvar, bpp, opt)
+
+#define filters_8tap_1d_fn2(op, sz, bpp, opt, f_opt) \
+filters_8tap_1d_fn(op, sz, h, mx, bpp, opt, f_opt) \
+filters_8tap_1d_fn(op, sz, v, my, bpp, opt, f_opt)
+
+#define filters_8tap_1d_fn3(op, bpp, opt4, opt8, f_opt) \
+filters_8tap_1d_fn2(op, 64, bpp, opt8, f_opt) \
+filters_8tap_1d_fn2(op, 32, bpp, opt8, f_opt) \
+filters_8tap_1d_fn2(op, 16, bpp, opt8, f_opt) \
+filters_8tap_1d_fn2(op, 8, bpp, opt8, f_opt) \
+filters_8tap_1d_fn2(op, 4, bpp, opt4, f_opt)
+
+#define filter_8tap_2d_fn(op, sz, f, f_opt, fname, align, bpp, bytes, opt) \
+static void op##_8tap_##fname##_##sz##hv_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+                                                       const uint8_t *src, ptrdiff_t src_stride, \
+                                                       int h, int mx, int my) \
+{ \
+    LOCAL_ALIGNED_##align(uint8_t, temp, [71 * 64 * bytes]); \
+    ff_vp9_put_8tap_1d_h_##sz##_##bpp##_##opt(temp, 64 * bytes, src - 3 * src_stride, \
+                                              src_stride, h + 7, \
+                                              ff_filters_##f_opt[f][mx - 1]); \
+    ff_vp9_##op##_8tap_1d_v_##sz##_##bpp##_##opt(dst, dst_stride, temp + 3 * bytes * 64, \
+                                                 64 * bytes, h, \
+                                                 ff_filters_##f_opt[f][my - 1]); \
+}
+
+#define filters_8tap_2d_fn(op, sz, align, bpp, bytes, opt, f_opt) \
+filter_8tap_2d_fn(op, sz, FILTER_8TAP_REGULAR, f_opt, regular, align, bpp, bytes, opt) \
+filter_8tap_2d_fn(op, sz, FILTER_8TAP_SHARP, f_opt, sharp, align, bpp, bytes, opt) \
+filter_8tap_2d_fn(op, sz, FILTER_8TAP_SMOOTH, f_opt, smooth, align, bpp, bytes, opt)
+
+#define filters_8tap_2d_fn2(op, align, bpp, bytes, opt4, opt8, f_opt) \
+filters_8tap_2d_fn(op, 64, align, bpp, bytes, opt8, f_opt) \
+filters_8tap_2d_fn(op, 32, align, bpp, bytes, opt8, f_opt) \
+filters_8tap_2d_fn(op, 16, align, bpp, bytes, opt8, f_opt) \
+filters_8tap_2d_fn(op, 8, align, bpp, bytes, opt8, f_opt) \
+filters_8tap_2d_fn(op, 4, align, bpp, bytes, opt4, f_opt)
+
 #define init_fpel_func(idx1, idx2, sz, type, bpp, opt) \
     dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
     dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
     dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] = \
     dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = ff_vp9_##type##sz##bpp##_##opt
 
-void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp, int bpp);
+#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, bpp, opt) \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = \
+        type##_8tap_smooth_##sz##dir##_##bpp##_##opt; \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = \
+        type##_8tap_regular_##sz##dir##_##bpp##_##opt; \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][idxh][idxv] = \
+        type##_8tap_sharp_##sz##dir##_##bpp##_##opt
+
+#define init_subpel2(idx1, idx2, sz, type, bpp, opt) \
+    init_subpel1(idx1, idx2, 1, 1, sz, hv, type, bpp, opt); \
+    init_subpel1(idx1, idx2, 0, 1, sz, v,  type, bpp, opt); \
+    init_subpel1(idx1, idx2, 1, 0, sz, h,  type, bpp, opt)
+
+#define init_subpel3_32_64(idx, type, bpp, opt) \
+    init_subpel2(0, idx, 64, type, bpp, opt); \
+    init_subpel2(1, idx, 32, type, bpp, opt)
+
+#define init_subpel3_8to64(idx, type, bpp, opt) \
+    init_subpel3_32_64(idx, type, bpp, opt); \
+    init_subpel2(2, idx, 16, type, bpp, opt); \
+    init_subpel2(3, idx, 8, type, bpp, opt)
+
+#define init_subpel3(idx, type, bpp, opt) \
+    init_subpel3_8to64(idx, type, bpp, opt); \
+    init_subpel2(4, idx, 4, type, bpp, opt)
+
+void ff_vp9dsp_init_10bpp_x86(VP9DSPContext *dsp);
+void ff_vp9dsp_init_12bpp_x86(VP9DSPContext *dsp);
+void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp);
 
 #endif /* AVCODEC_X86_VP9DSP_INIT_H */
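
A note on the new hsz/hszb pair in mc_rep_func (not part of the commit): it is what lets one splitting macro serve every bit depth. hsz is the half block width in pixels (it selects the narrower function to call twice), while hszb is the half width in bytes (it advances the dst/src pointers, which stay uint8_t * regardless of depth). At 8 bpp the two coincide; at 10/12 bpp hszb = 2 * hsz. As an illustration, mc_rep_funcs(16, 8, 16, sse2, int16_t, 16, BPC) in the 10 bpp build expands the put/h case to roughly this (modulo whitespace):

    static av_always_inline void
    ff_vp9_put_8tap_1d_h_16_10_sse2(uint8_t *dst, ptrdiff_t dst_stride,
                                    const uint8_t *src, ptrdiff_t src_stride,
                                    int h, const int16_t (*filter)[16])
    {
        /* left 8-pixel half */
        ff_vp9_put_8tap_1d_h_8_10_sse2(dst, dst_stride, src,
                                       src_stride, h, filter);
        /* right 8-pixel half: 8 pixels are 16 bytes at 10 bpp, so hszb = 16 */
        ff_vp9_put_8tap_1d_h_8_10_sse2(dst + 16, dst_stride, src + 16,
                                       src_stride, h, filter);
    }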
libavcodec/x86/vp9dsp_init_10bpp.c (new file, mode 100644)

/*
* VP9 SIMD optimizations
*
* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#define BPC 10
#define INIT_FUNC ff_vp9dsp_init_10bpp_x86
#include "vp9dsp_init_16bpp_template.c"
libavcodec/x86/vp9dsp_init_12bpp.c (new file, mode 100644)

/*
* VP9 SIMD optimizations
*
* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#define BPC 12
#define INIT_FUNC ff_vp9dsp_init_12bpp_x86
#include "vp9dsp_init_16bpp_template.c"
libavcodec/x86/vp9dsp_init_16bpp.c

@@ -48,7 +48,7 @@ decl_fpel_func(avg, 128, _16, avx2);
 #endif /* HAVE_YASM */
 
-av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp, int bpp)
+av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
 {
 #if HAVE_YASM
     int cpu_flags = av_get_cpu_flags();
libavcodec/x86/vp9dsp_init_16bpp_template.c (new file, mode 100644)

/*
* VP9 SIMD optimizations
*
* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/vp9dsp.h"
#include "libavcodec/x86/vp9dsp_init.h"
#if HAVE_YASM
extern const int16_t ff_filters_16bpp[3][15][4][16];

decl_mc_funcs(4, sse2, int16_t, 16, BPC);
decl_mc_funcs(8, sse2, int16_t, 16, BPC);

mc_rep_funcs(16, 8, 16, sse2, int16_t, 16, BPC);
mc_rep_funcs(32, 16, 32, sse2, int16_t, 16, BPC);
mc_rep_funcs(64, 32, 64, sse2, int16_t, 16, BPC);

filters_8tap_2d_fn2(put, 16, BPC, 2, sse2, sse2, 16bpp)
filters_8tap_2d_fn2(avg, 16, BPC, 2, sse2, sse2, 16bpp)

filters_8tap_1d_fn3(put, BPC, sse2, sse2, 16bpp)
filters_8tap_1d_fn3(avg, BPC, sse2, sse2, 16bpp)

#endif /* HAVE_YASM */

av_cold void INIT_FUNC(VP9DSPContext *dsp)
{
#if HAVE_YASM
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_SSE2(cpu_flags)) {
        init_subpel3(0, put, BPC, sse2);
        init_subpel3(1, avg, BPC, sse2);
    }
#endif /* HAVE_YASM */

    ff_vp9dsp_init_16bpp_x86(dsp);
}
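
The two wrapper files above use FFmpeg's usual compile-the-template-twice pattern: each translation unit defines BPC and INIT_FUNC and then #includes the template, so one body yields a 10 bpp and a 12 bpp instantiation with distinct symbol names. The same idea in miniature (file names and symbols below are invented for illustration; the pixel_max values match the pw_1023/pw_4095 clamp constants in the new assembly):

    /* clamp_template.c -- include with BPC and FUNC_NAME defined */
    int FUNC_NAME(int v)
    {
        const int pixel_max = (1 << BPC) - 1; /* 1023 at 10 bpp, 4095 at 12 bpp */
        return v < 0 ? 0 : v > pixel_max ? pixel_max : v;
    }

    /* clamp_10bpp.c */
    #define BPC       10
    #define FUNC_NAME clamp_10bpp
    #include "clamp_template.c"

    /* clamp_12bpp.c */
    #define BPC       12
    #define FUNC_NAME clamp_12bpp
    #include "clamp_template.c"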
libavcodec/x86/vp9mc.asm

@@ -45,6 +45,13 @@ times 8 dw %7
 times 8 dw %8
 %endmacro
 
+%macro F8_16BPP_TAPS 8
+times 8 dw %1, %2
+times 8 dw %3, %4
+times 8 dw %5, %6
+times 8 dw %7, %8
+%endmacro
+
 %macro FILTER 1
 const filters_%1
 
 ; smooth
 F8_TAPS -3, -1, 32, 64, 38, 1, -3, 0
@@ -102,12 +109,15 @@ FILTER ssse3
 %define F8_TAPS F8_SSE2_TAPS
 ; int16_t ff_filters_sse2[3][15][8][8]
 FILTER sse2
+%define F8_TAPS F8_16BPP_TAPS
+; int16_t ff_filters_16bpp[3][15][4][16]
+FILTER 16bpp
 
 SECTION .text
 
 %macro filter_sse2_h_fn 1
 %assign %%px mmsize/2
-cglobal vp9_%1_8tap_1d_h_ %+ %%px, 6, 6, 15, dst, dstride, src, sstride, h, filtery
+cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 15, dst, dstride, src, sstride, h, filtery
     pxor        m5, m5
     mova        m6, [pw_64]
     mova        m7, [filteryq+ 0]
@@ -192,7 +202,7 @@ filter_sse2_h_fn avg
 %macro filter_h_fn 1
 %assign %%px mmsize/2
-cglobal vp9_%1_8tap_1d_h_ %+ %%px, 6, 6, 11, dst, dstride, src, sstride, h, filtery
+cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 11, dst, dstride, src, sstride, h, filtery
     mova        m6, [pw_256]
     mova        m7, [filteryq+ 0]
 %if ARCH_X86_64 && mmsize > 8
@@ -253,7 +263,7 @@ filter_h_fn avg
 %if ARCH_X86_64
 %macro filter_hx2_fn 1
 %assign %%px mmsize
-cglobal vp9_%1_8tap_1d_h_ %+ %%px, 6, 6, 14, dst, dstride, src, sstride, h, filtery
+cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 14, dst, dstride, src, sstride, h, filtery
     mova       m13, [pw_256]
     mova        m8, [filteryq+ 0]
     mova        m9, [filteryq+32]
@@ -315,9 +325,9 @@ filter_hx2_fn avg
 %macro filter_sse2_v_fn 1
 %assign %%px mmsize/2
 %if ARCH_X86_64
-cglobal vp9_%1_8tap_1d_v_ %+ %%px, 6, 8, 15, dst, dstride, src, sstride, h, filtery, src4, sstride3
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 15, dst, dstride, src, sstride, h, filtery, src4, sstride3
 %else
-cglobal vp9_%1_8tap_1d_v_ %+ %%px, 4, 7, 15, dst, dstride, src, sstride, filtery, src4, sstride3
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 15, dst, dstride, src, sstride, filtery, src4, sstride3
     mov   filteryq, r5mp
 %define hd r4mp
 %endif
@@ -413,9 +423,9 @@ filter_sse2_v_fn avg
 %macro filter_v_fn 1
 %assign %%px mmsize/2
 %if ARCH_X86_64
-cglobal vp9_%1_8tap_1d_v_ %+ %%px, 6, 8, 11, dst, dstride, src, sstride, h, filtery, src4, sstride3
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 11, dst, dstride, src, sstride, h, filtery, src4, sstride3
 %else
-cglobal vp9_%1_8tap_1d_v_ %+ %%px, 4, 7, 11, dst, dstride, src, sstride, filtery, src4, sstride3
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 11, dst, dstride, src, sstride, filtery, src4, sstride3
     mov   filteryq, r5mp
 %define hd r4mp
 %endif
@@ -487,7 +497,7 @@ filter_v_fn avg
 %macro filter_vx2_fn 1
 %assign %%px mmsize
-cglobal vp9_%1_8tap_1d_v_ %+ %%px, 6, 8, 14, dst, dstride, src, sstride, h, filtery, src4, sstride3
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 14, dst, dstride, src, sstride, h, filtery, src4, sstride3
     mova       m13, [pw_256]
     lea  sstride3q, [sstrideq*3]
     lea      src4q, [srcq+sstrideq]
libavcodec/x86/vp9mc_16bpp.asm (new file, mode 100644)

;******************************************************************************
;* VP9 MC SIMD optimizations
;*
;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

pw_4095: times 16 dw 0xfff
pd_64:   times  8 dd 64

cextern pw_1023

SECTION .text

%macro filter_h4_fn 1-2 12
cglobal vp9_%1_8tap_1d_h_4_10, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    mova        m5, [pw_1023]
.body:
%if notcpuflag(sse4) && ARCH_X86_64
    pxor       m11, m11
%endif
    mova        m6, [pd_64]
    mova        m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+32]
    mova        m9, [filteryq+64]
    mova       m10, [filteryq+96]
%endif
.loop:
    movh        m0, [srcq-6]
    movh        m1, [srcq-4]
    movh        m2, [srcq-2]
    movh        m3, [srcq+0]
    movh        m4, [srcq+2]
    punpcklwd   m0, m1
    punpcklwd   m2, m3
    pmaddwd     m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m2, m8
%else
    pmaddwd     m2, [filteryq+32]
%endif
    movu        m1, [srcq+4]
    movu        m3, [srcq+6]
    paddd       m0, m2
    movu        m2, [srcq+8]
    add       srcq, sstrideq
    punpcklwd   m4, m1
    punpcklwd   m3, m2
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m4, m9
    pmaddwd     m3, m10
%else
    pmaddwd     m4, [filteryq+64]
    pmaddwd     m3, [filteryq+96]
%endif
    paddd       m0, m4
    paddd       m0, m3
    paddd       m0, m6
    psrad       m0, 7
%if cpuflag(sse4)
    packusdw    m0, m0
%else
    packssdw    m0, m0
%endif
%ifidn %1, avg
    movh        m1, [dstq]
%endif
    pminsw      m0, m5
%if notcpuflag(sse4)
%if ARCH_X86_64
    pmaxsw      m0, m11
%else
    pxor        m2, m2
    pmaxsw      m0, m2
%endif
%endif
%ifidn %1, avg
    pavgw       m0, m1
%endif
    movh    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET

cglobal vp9_%1_8tap_1d_h_4_12, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    mova        m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_4_10 %+ SUFFIX).body
%endmacro

INIT_XMM sse2
filter_h4_fn put
filter_h4_fn avg

%macro filter_h_fn 1-2 12
%assign %%px mmsize/2
cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _10, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    mova        m5, [pw_1023]
.body:
%if notcpuflag(sse4) && ARCH_X86_64
    pxor       m11, m11
%endif
    mova        m6, [pd_64]
    mova        m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+32]
    mova        m9, [filteryq+64]
    mova       m10, [filteryq+96]
%endif
.loop:
    movu        m0, [srcq-6]
    movu        m1, [srcq-4]
    movu        m2, [srcq-2]
    movu        m3, [srcq+0]
    movu        m4, [srcq+2]
    pmaddwd     m0, m7
    pmaddwd     m1, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m2, m8
    pmaddwd     m3, m8
    pmaddwd     m4, m9
%else
    pmaddwd     m2, [filteryq+32]
    pmaddwd     m3, [filteryq+32]
    pmaddwd     m4, [filteryq+64]
%endif
    paddd       m0, m2
    paddd       m1, m3
    paddd       m0, m4
    movu        m2, [srcq+4]
    movu        m3, [srcq+6]
    movu        m4, [srcq+8]
    add       srcq, sstrideq
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m2, m9
    pmaddwd     m3, m10
    pmaddwd     m4, m10
%else
    pmaddwd     m2, [filteryq+64]
    pmaddwd     m3, [filteryq+96]
    pmaddwd     m4, [filteryq+96]
%endif
    paddd       m1, m2
    paddd       m0, m3
    paddd       m1, m4
    paddd       m0, m6
    paddd       m1, m6
    psrad       m0, 7
    psrad       m1, 7
%if cpuflag(sse4)
    packusdw    m0, m0
    packusdw    m1, m1
%else
    packssdw    m0, m0
    packssdw    m1, m1
%endif
    punpcklwd   m0, m1
    pminsw      m0, m5
%if notcpuflag(sse4)
%if ARCH_X86_64
    pmaxsw      m0, m11
%else
    pxor        m2, m2
    pmaxsw      m0, m2
%endif
%endif
%ifidn %1, avg
    pavgw       m0, [dstq]
%endif
    mova    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET

cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _12, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    mova        m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_ %+ %%px %+ _10 %+ SUFFIX).body
%endmacro

INIT_XMM sse2
filter_h_fn put
filter_h_fn avg

%macro filter_v4_fn 1-2 12
%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_4_10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_4_10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
    mov   filteryq, r5mp
%define hd r4mp
%endif
    mova        m5, [pw_1023]
.body:
%if notcpuflag(sse4) && ARCH_X86_64
    pxor       m11, m11
%endif
    mova        m6, [pd_64]
    lea  sstride3q, [sstrideq*3]
    lea      src4q, [srcq+sstrideq]
    sub       srcq, sstride3q
    mova        m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+32]
    mova        m9, [filteryq+64]
    mova       m10, [filteryq+96]
%endif
.loop:
    ; FIXME maybe reuse loads from previous rows, or just
    ; more generally unroll this to prevent multiple loads of
    ; the same data?
    movh        m0, [srcq]
    movh        m1, [srcq+sstrideq]
    movh        m2, [srcq+sstrideq*2]
    movh        m3, [srcq+sstride3q]
    add       srcq, sstrideq
    movh        m4, [src4q]
    punpcklwd   m0, m1
    punpcklwd   m2, m3
    pmaddwd     m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m2, m8
%else
    pmaddwd     m2, [filteryq+32]
%endif
    movh        m1, [src4q+sstrideq]
    movh        m3, [src4q+sstrideq*2]
    paddd       m0, m2
    movh        m2, [src4q+sstride3q]
    add      src4q, sstrideq
    punpcklwd   m4, m1
    punpcklwd   m3, m2
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m4, m9
    pmaddwd     m3, m10
%else
    pmaddwd     m4, [filteryq+64]
    pmaddwd     m3, [filteryq+96]
%endif
    paddd       m0, m4
    paddd       m0, m3
    paddd       m0, m6
    psrad       m0, 7
%if cpuflag(sse4)
    packusdw    m0, m0
%else
    packssdw    m0, m0
%endif
%ifidn %1, avg
    movh        m1, [dstq]
%endif
    pminsw      m0, m5
%if notcpuflag(sse4)
%if ARCH_X86_64
    pmaxsw      m0, m11
%else
    pxor        m2, m2
    pmaxsw      m0, m2
%endif
%endif
%ifidn %1, avg
    pavgw       m0, m1
%endif
    movh    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET

%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_4_12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_4_12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
    mov   filteryq, r5mp
%endif
    mova        m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_4_10 %+ SUFFIX).body
%endmacro

INIT_XMM sse2
filter_v4_fn put
filter_v4_fn avg

%macro filter_v_fn 1-2 13
%assign %%px mmsize/2
%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
    mov   filteryq, r5mp
%define hd r4mp
%endif
    mova        m5, [pw_1023]
.body:
%if notcpuflag(sse4) && ARCH_X86_64
    pxor       m12, m12
%endif
%if ARCH_X86_64
    mova       m11, [pd_64]
%endif
    lea  sstride3q, [sstrideq*3]
    lea      src4q, [srcq+sstrideq]
    sub       srcq, sstride3q
    mova        m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+32]
    mova        m9, [filteryq+64]
    mova       m10, [filteryq+96]
%endif
.loop:
    ; FIXME maybe reuse loads from previous rows, or just
    ; more generally unroll this to prevent multiple loads of
    ; the same data?
    movu        m0, [srcq]
    movu        m1, [srcq+sstrideq]
    movu        m2, [srcq+sstrideq*2]
    movu        m3, [srcq+sstride3q]
    add       srcq, sstrideq
    movu        m4, [src4q]
    SBUTTERFLY  wd, 0, 1, 6
    SBUTTERFLY  wd, 2, 3, 6
    pmaddwd     m0, m7
    pmaddwd     m1, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m2, m8
    pmaddwd     m3, m8
%else
    pmaddwd     m2, [filteryq+32]
    pmaddwd     m3, [filteryq+32]
%endif
    paddd       m0, m2
    paddd       m1, m3
    movu        m2, [src4q+sstrideq]
    movu        m3, [src4q+sstrideq*2]
    SBUTTERFLY  wd, 4, 2, 6
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m4, m9
    pmaddwd     m2, m9
%else
    pmaddwd     m4, [filteryq+64]
    pmaddwd     m2, [filteryq+64]
%endif
    paddd       m0, m4
    paddd       m1, m2
    movu        m4, [src4q+sstride3q]
    add      src4q, sstrideq
    SBUTTERFLY  wd, 3, 4, 6
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m3, m10
    pmaddwd     m4, m10
%else
    pmaddwd     m3, [filteryq+96]
    pmaddwd     m4, [filteryq+96]
%endif
    paddd       m0, m3
    paddd       m1, m4
%if ARCH_X86_64
    paddd       m0, m11
    paddd       m1, m11
%else
    paddd       m0, [pd_64]
    paddd       m1, [pd_64]
%endif
    psrad       m0, 7
    psrad       m1, 7
%if cpuflag(sse4)
    packusdw    m0, m1
%else
    packssdw    m0, m1
%endif
    pminsw      m0, m5
%if notcpuflag(sse4)
%if ARCH_X86_64
    pmaxsw      m0, m12
%else
    pxor        m2, m2
    pmaxsw      m0, m2
%endif
%endif
%ifidn %1, avg
    pavgw       m0, [dstq]
%endif
    mova    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET

%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
    mov   filteryq, r5mp
%endif
    mova        m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_ %+ %%px %+ _10 %+ SUFFIX).body
%endmacro

INIT_XMM sse2
filter_v_fn put
filter_v_fn avg