Commit 26ece7a5 authored by Ronald S. Bultje

vp9: 16bpp tm/dc/h/v intra pred simd (mostly sse2) functions.

parent db7786e8
......@@ -158,6 +158,7 @@ YASM-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp.o
YASM-OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp.o
YASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp.o
YASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9intrapred.o \
x86/vp9intrapred_16bpp.o \
x86/vp9itxfm.o \
x86/vp9lpf.o \
x86/vp9lpf_16bpp.o \
......
......@@ -81,3 +81,7 @@ DECLARE_ALIGNED(16, const xmm_reg, ff_ps_neg) = { 0x8000000080000000ULL, 0x800
/* Four 64-bit initializer words, each packing the 32-bit value 1 into both
 * its upper and lower half. Declared via DECLARE_ALIGNED with an alignment
 * argument of 32. */
DECLARE_ALIGNED(32, const ymm_reg, ff_pd_1) = { 0x0000000100000001ULL, 0x0000000100000001ULL,
0x0000000100000001ULL, 0x0000000100000001ULL };
/* Four 64-bit initializer words, each packing the 32-bit value 16 (0x10) into
 * both its upper and lower half. Declared via DECLARE_ALIGNED with an
 * alignment argument of 32. */
DECLARE_ALIGNED(32, const ymm_reg, ff_pd_16) = { 0x0000001000000010ULL, 0x0000001000000010ULL,
0x0000001000000010ULL, 0x0000001000000010ULL };
/* Four 64-bit initializer words, each packing the 32-bit value 32 (0x20) into
 * both its upper and lower half. Declared via DECLARE_ALIGNED with an
 * alignment argument of 32. */
DECLARE_ALIGNED(32, const ymm_reg, ff_pd_32) = { 0x0000002000000020ULL, 0x0000002000000020ULL,
0x0000002000000020ULL, 0x0000002000000020ULL };
......@@ -63,5 +63,7 @@ extern const uint64_t ff_pb_FC;
/* Read-only constant objects shared between translation units; this header
 * only declares them.
 * NOTE(review): the ff_pd_N objects presumably hold N replicated as packed
 * 32-bit values -- confirm against their definitions. */
extern const xmm_reg ff_ps_neg;
extern const ymm_reg ff_pd_1;
extern const ymm_reg ff_pd_16;
extern const ymm_reg ff_pd_32;
#endif /* AVCODEC_X86_CONSTANTS_H */
......@@ -24,14 +24,11 @@
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
pd_32: times 4 dd 32
SECTION .text
cextern pw_1023
%define pw_pixel_max pw_1023
cextern pd_32
;-----------------------------------------------------------------------------
; void ff_h264_idct_add_10(pixel *dst, int16_t *block, int stride)
......
......@@ -34,11 +34,11 @@ cextern pw_8
cextern pw_4
cextern pw_2
cextern pw_1
cextern pd_16
pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4
pw_m3: times 8 dw -3
pd_17: times 4 dd 17
pd_16: times 4 dd 16
SECTION .text
......
......@@ -41,6 +41,18 @@ decl_mc_func(avg, sz, h, opt, type, fsz, bpp); \
decl_mc_func(put, sz, v, opt, type, fsz, bpp); \
decl_mc_func(avg, sz, v, opt, type, fsz, bpp)
/*
 * Produce the prototype of a single intra-prediction routine.
 *
 * The four arguments are pasted into the symbol name
 * ff_vp9_ipred_<kind>_<n>x<n>_<depth>_<isa>. The generated function returns
 * void and takes (uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
 * const uint8_t *a). No trailing semicolon is emitted; the caller adds it.
 */
#define decl_ipred_fn(kind, n, depth, isa) \
void ff_vp9_ipred_##kind##_##n##x##n##_##depth##_##isa(uint8_t *dst, ptrdiff_t stride, \
                                                        const uint8_t *l, const uint8_t *a)
/*
 * Declare one predictor family at all four square sizes by invoking
 * decl_ipred_fn once per size. The 4x4 prototype carries the isa4 suffix;
 * the 8x8, 16x16 and 32x32 prototypes carry the isa_wide suffix.
 * The final declaration is left without a semicolon for the caller to add.
 */
#define decl_ipred_fns(kind, depth, isa4, isa_wide) \
    decl_ipred_fn(kind,  4, depth, isa4);     \
    decl_ipred_fn(kind,  8, depth, isa_wide); \
    decl_ipred_fn(kind, 16, depth, isa_wide); \
    decl_ipred_fn(kind, 32, depth, isa_wide)
#define mc_rep_func(avg, sz, hsz, hszb, dir, opt, type, f_sz, bpp) \
static av_always_inline void \
ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
......@@ -142,6 +154,17 @@ filters_8tap_2d_fn(op, 4, align, bpp, bytes, opt4, f_opt)
init_subpel3_8to64(idx, type, bpp, opt); \
init_subpel2(4, idx, 4, type, bpp, opt)
/* Glue three tokens into a single one: <head><bpp><tail>.
 * init_ipred_func passes its bpp argument through this helper instead of
 * pasting it directly -- presumably so that a macro argument such as BPC is
 * expanded before the paste happens. */
#define cat(head, bpp, tail) head##bpp##tail
/*
 * Store one predictor into the intra_pred table of the `dsp` variable that
 * must be in scope at the point of use.
 *
 * Slot written : dsp->intra_pred[TX_<n>X<n>][<mode>_PRED]
 * Value stored : ff_vp9_ipred_<kind>_<n>x<n>_<depth>_<isa>
 *
 * The symbol name is assembled through cat(), so `depth` is not a direct
 * operand of ## in this macro. Expands to a bare assignment expression with
 * no trailing semicolon.
 */
#define init_ipred_func(kind, mode, n, depth, isa) \
    dsp->intra_pred[TX_##n##X##n][mode##_PRED] =   \
        cat(ff_vp9_ipred_##kind##_##n##x##n##_, depth, _##isa)
/*
 * Install one predictor family for the three larger block sizes by invoking
 * init_ipred_func with sizes 8, 16 and 32, in that order. All three share the
 * same kind, mode, depth and isa arguments; the 4x4 entry is not touched.
 * Expands to three statements, the last one without a trailing semicolon.
 */
#define init_8_16_32_ipred_funcs(kind, mode, depth, isa) \
    init_ipred_func(kind, mode,  8, depth, isa); \
    init_ipred_func(kind, mode, 16, depth, isa); \
    init_ipred_func(kind, mode, 32, depth, isa)
/* x86 initialisation entry points; each receives the VP9DSPContext to operate
 * on.
 * NOTE(review): the 10/12/16 in the names presumably denotes bits per pixel --
 * confirm against the callers. */
void ff_vp9dsp_init_10bpp_x86(VP9DSPContext *dsp);
void ff_vp9dsp_init_12bpp_x86(VP9DSPContext *dsp);
void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp);
......
......@@ -46,6 +46,11 @@ decl_fpel_func(avg, 32, _16, avx2);
decl_fpel_func(avg, 64, _16, avx2);
decl_fpel_func(avg, 128, _16, avx2);
/* Prototypes for the `_16_` intra predictors. In each line the first
 * instruction-set suffix names the 4x4 function and the second names the
 * 8x8, 16x16 and 32x32 functions (see decl_ipred_fns). */
decl_ipred_fns(v, 16, mmx, sse);
decl_ipred_fns(h, 16, mmxext, sse2);
decl_ipred_fns(dc, 16, mmxext, sse2);
decl_ipred_fns(dc_top, 16, mmxext, sse2);
decl_ipred_fns(dc_left, 16, mmxext, sse2);
#endif /* HAVE_YASM */
av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
......@@ -55,10 +60,15 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
if (EXTERNAL_MMX(cpu_flags)) {
init_fpel_func(4, 0, 8, put, , mmx);
init_ipred_func(v, VERT, 4, 16, mmx);
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
init_fpel_func(4, 1, 8, avg, _16, mmxext);
init_ipred_func(h, HOR, 4, 16, mmxext);
init_ipred_func(dc, DC, 4, 16, mmxext);
init_ipred_func(dc_top, TOP_DC, 4, 16, mmxext);
init_ipred_func(dc_left, LEFT_DC, 4, 16, mmxext);
}
if (EXTERNAL_SSE(cpu_flags)) {
......@@ -66,6 +76,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
init_fpel_func(2, 0, 32, put, , sse);
init_fpel_func(1, 0, 64, put, , sse);
init_fpel_func(0, 0, 128, put, , sse);
init_8_16_32_ipred_funcs(v, VERT, 16, sse);
}
if (EXTERNAL_SSE2(cpu_flags)) {
......@@ -73,6 +84,10 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
init_fpel_func(2, 1, 32, avg, _16, sse2);
init_fpel_func(1, 1, 64, avg, _16, sse2);
init_fpel_func(0, 1, 128, avg, _16, sse2);
init_8_16_32_ipred_funcs(h, HOR, 16, sse2);
init_8_16_32_ipred_funcs(dc, DC, 16, sse2);
init_8_16_32_ipred_funcs(dc_top, TOP_DC, 16, sse2);
init_8_16_32_ipred_funcs(dc_left, LEFT_DC, 16, sse2);
}
if (EXTERNAL_AVX_FAST(cpu_flags)) {
......
......@@ -121,6 +121,8 @@ lpf_mix2_wrappers(8, 8, bpp, opt); \
lpf_mix2_wrappers_set(BPC, sse2);
lpf_mix2_wrappers_set(BPC, ssse3);
lpf_mix2_wrappers_set(BPC, avx);
/* Prototypes for the tm predictors, with BPC supplying the bpp part of the
 * symbol name: 4x4 uses the mmxext suffix, 8x8/16x16/32x32 use sse2. */
decl_ipred_fns(tm, BPC, mmxext, sse2);
#endif /* HAVE_YASM */
av_cold void INIT_FUNC(VP9DSPContext *dsp)
......@@ -153,10 +155,15 @@ av_cold void INIT_FUNC(VP9DSPContext *dsp)
init_lpf_mix2_func(1, 0, 1, v, 8, 4, bpp, opt); \
init_lpf_mix2_func(1, 1, 1, v, 8, 8, bpp, opt)
if (EXTERNAL_MMXEXT(cpu_flags)) {
init_ipred_func(tm, TM_VP8, 4, BPC, mmxext);
}
if (EXTERNAL_SSE2(cpu_flags)) {
init_subpel3(0, put, BPC, sse2);
init_subpel3(1, avg, BPC, sse2);
init_lpf_funcs(BPC, sse2);
init_8_16_32_ipred_funcs(tm, TM_VP8, BPC, sse2);
}
if (EXTERNAL_SSSE3(cpu_flags)) {
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment