Commit 26ece7a5 authored by Ronald S. Bultje

vp9: 16bpp tm/dc/h/v intra pred simd (mostly sse2) functions.

parent db7786e8
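This commit adds x86 SIMD versions of the vertical (v), horizontal (h), DC (dc, dc_top, dc_left) and TrueMotion (tm) intra predictors for the 16 bpp (10- and 12-bit) VP9 decoding path. The 4x4 sizes get mmx/mmxext versions, since one row of four 16-bit pixels fits a single 64-bit mmx register; the 8x8, 16x16 and 32x32 sizes use sse/sse2. As orientation for what the dc predictor computes, here is a minimal C model of the 4x4 case (the function name and the pixel-unit stride are illustrative only; the real functions take uint8_t pointers and a byte stride):

#include <stddef.h>
#include <stdint.h>

/* Minimal C model of the 4x4 dc predictor at 16 bpp: average the four
 * "left" and four "above" edge pixels with rounding, then fill the
 * block with that value. Unlike the SIMD functions in this commit,
 * which take uint8_t pointers and a stride in bytes, this sketch uses
 * uint16_t pixels and a stride in pixels. */
static void dc_4x4_16(uint16_t *dst, ptrdiff_t stride,
                      const uint16_t *left, const uint16_t *above)
{
    int sum = 4;                         /* rounding bias for the >> 3 */
    for (int i = 0; i < 4; i++)
        sum += left[i] + above[i];
    uint16_t dc = (uint16_t)(sum >> 3);  /* 8 samples -> >> log2(8) */
    for (int y = 0; y < 4; y++, dst += stride)
        for (int x = 0; x < 4; x++)
            dst[x] = dc;
}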
libavcodec/x86/Makefile
@@ -158,6 +158,7 @@ YASM-OBJS-$(CONFIG_VC1_DECODER)        += x86/vc1dsp.o
 YASM-OBJS-$(CONFIG_VORBIS_DECODER)     += x86/vorbisdsp.o
 YASM-OBJS-$(CONFIG_VP6_DECODER)        += x86/vp6dsp.o
 YASM-OBJS-$(CONFIG_VP9_DECODER)        += x86/vp9intrapred.o            \
+                                          x86/vp9intrapred_16bpp.o      \
                                           x86/vp9itxfm.o                \
                                           x86/vp9lpf.o                  \
                                           x86/vp9lpf_16bpp.o            \
libavcodec/x86/constants.c
@@ -81,3 +81,7 @@ DECLARE_ALIGNED(16, const xmm_reg,  ff_ps_neg) = { 0x8000000080000000ULL, 0x800
 DECLARE_ALIGNED(32, const ymm_reg,  ff_pd_1)   = { 0x0000000100000001ULL, 0x0000000100000001ULL,
                                                    0x0000000100000001ULL, 0x0000000100000001ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pd_16)  = { 0x0000001000000010ULL, 0x0000001000000010ULL,
+                                                   0x0000001000000010ULL, 0x0000001000000010ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pd_32)  = { 0x0000002000000020ULL, 0x0000002000000020ULL,
+                                                   0x0000002000000020ULL, 0x0000002000000020ULL };
libavcodec/x86/constants.h
@@ -63,5 +63,7 @@ extern const uint64_t ff_pb_FC;
 extern const xmm_reg  ff_ps_neg;
 extern const ymm_reg  ff_pd_1;
+extern const ymm_reg  ff_pd_16;
+extern const ymm_reg  ff_pd_32;
 
 #endif /* AVCODEC_X86_CONSTANTS_H */
libavcodec/x86/h264_idct_10bit.asm
@@ -24,14 +24,11 @@
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA
-
-pd_32:        times 4 dd 32
-
 SECTION .text
 
 cextern pw_1023
 %define pw_pixel_max pw_1023
+cextern pd_32
 
 ;-----------------------------------------------------------------------------
 ; void ff_h264_idct_add_10(pixel *dst, int16_t *block, int stride)
libavcodec/x86/h264_intrapred_10bit.asm
@@ -34,11 +34,11 @@ cextern pw_8
 cextern pw_4
 cextern pw_2
 cextern pw_1
+cextern pd_16
 
 pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4
 pw_m3:        times 8 dw -3
 pd_17:        times 4 dd 17
-pd_16:        times 4 dd 16
 
 SECTION .text
libavcodec/x86/vp9dsp_init.h
@@ -41,6 +41,18 @@ decl_mc_func(avg, sz, h, opt, type, fsz, bpp); \
 decl_mc_func(put, sz, v, opt, type, fsz, bpp); \
 decl_mc_func(avg, sz, v, opt, type, fsz, bpp)
 
+#define decl_ipred_fn(type, sz, bpp, opt) \
+void ff_vp9_ipred_##type##_##sz##x##sz##_##bpp##_##opt(uint8_t *dst, \
+                                                       ptrdiff_t stride, \
+                                                       const uint8_t *l, \
+                                                       const uint8_t *a)
+
+#define decl_ipred_fns(type, bpp, opt4, opt8_16_32) \
+decl_ipred_fn(type, 4,  bpp, opt4); \
+decl_ipred_fn(type, 8,  bpp, opt8_16_32); \
+decl_ipred_fn(type, 16, bpp, opt8_16_32); \
+decl_ipred_fn(type, 32, bpp, opt8_16_32)
+
 #define mc_rep_func(avg, sz, hsz, hszb, dir, opt, type, f_sz, bpp) \
 static av_always_inline void \
 ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
@@ -142,6 +154,17 @@ filters_8tap_2d_fn(op, 4, align, bpp, bytes, opt4, f_opt)
 init_subpel3_8to64(idx, type, bpp, opt); \
 init_subpel2(4, idx, 4, type, bpp, opt)
 
+#define cat(a, bpp, b) a##bpp##b
+
+#define init_ipred_func(type, enum, sz, bpp, opt) \
+    dsp->intra_pred[TX_##sz##X##sz][enum##_PRED] = \
+        cat(ff_vp9_ipred_##type##_##sz##x##sz##_, bpp, _##opt)
+
+#define init_8_16_32_ipred_funcs(type, enum, bpp, opt) \
+    init_ipred_func(type, enum, 8,  bpp, opt); \
+    init_ipred_func(type, enum, 16, bpp, opt); \
+    init_ipred_func(type, enum, 32, bpp, opt)
+
 void ff_vp9dsp_init_10bpp_x86(VP9DSPContext *dsp);
 void ff_vp9dsp_init_12bpp_x86(VP9DSPContext *dsp);
 void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp);
libavcodec/x86/vp9dsp_init_16bpp.c
@@ -46,6 +46,11 @@ decl_fpel_func(avg, 32, _16, avx2);
 decl_fpel_func(avg, 64, _16, avx2);
 decl_fpel_func(avg, 128, _16, avx2);
 
+decl_ipred_fns(v,       16, mmx,    sse);
+decl_ipred_fns(h,       16, mmxext, sse2);
+decl_ipred_fns(dc,      16, mmxext, sse2);
+decl_ipred_fns(dc_top,  16, mmxext, sse2);
+decl_ipred_fns(dc_left, 16, mmxext, sse2);
+
 #endif /* HAVE_YASM */
 
 av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
@@ -55,10 +60,15 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
     if (EXTERNAL_MMX(cpu_flags)) {
         init_fpel_func(4, 0, 8, put, , mmx);
+        init_ipred_func(v, VERT, 4, 16, mmx);
     }
 
     if (EXTERNAL_MMXEXT(cpu_flags)) {
         init_fpel_func(4, 1, 8, avg, _16, mmxext);
+        init_ipred_func(h,       HOR,     4, 16, mmxext);
+        init_ipred_func(dc,      DC,      4, 16, mmxext);
+        init_ipred_func(dc_top,  TOP_DC,  4, 16, mmxext);
+        init_ipred_func(dc_left, LEFT_DC, 4, 16, mmxext);
     }
 
     if (EXTERNAL_SSE(cpu_flags)) {
@@ -66,6 +76,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
         init_fpel_func(2, 0, 32, put, , sse);
         init_fpel_func(1, 0, 64, put, , sse);
         init_fpel_func(0, 0, 128, put, , sse);
+        init_8_16_32_ipred_funcs(v, VERT, 16, sse);
     }
 
     if (EXTERNAL_SSE2(cpu_flags)) {
@@ -73,6 +84,10 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
         init_fpel_func(2, 1, 32, avg, _16, sse2);
         init_fpel_func(1, 1, 64, avg, _16, sse2);
         init_fpel_func(0, 1, 128, avg, _16, sse2);
+        init_8_16_32_ipred_funcs(h,       HOR,     16, sse2);
+        init_8_16_32_ipred_funcs(dc,      DC,      16, sse2);
+        init_8_16_32_ipred_funcs(dc_top,  TOP_DC,  16, sse2);
+        init_8_16_32_ipred_funcs(dc_left, LEFT_DC, 16, sse2);
     }
 
     if (EXTERNAL_AVX_FAST(cpu_flags)) {
libavcodec/x86/vp9dsp_init_16bpp_template.c
@@ -121,6 +121,8 @@ lpf_mix2_wrappers(8, 8, bpp, opt); \
 lpf_mix2_wrappers_set(BPC, sse2);
 lpf_mix2_wrappers_set(BPC, ssse3);
 lpf_mix2_wrappers_set(BPC, avx);
 
+decl_ipred_fns(tm, BPC, mmxext, sse2);
+
 #endif /* HAVE_YASM */
 
 av_cold void INIT_FUNC(VP9DSPContext *dsp)
@@ -153,10 +155,15 @@ av_cold void INIT_FUNC(VP9DSPContext *dsp)
     init_lpf_mix2_func(1, 0, 1, v, 8, 4, bpp, opt); \
     init_lpf_mix2_func(1, 1, 1, v, 8, 8, bpp, opt)
 
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        init_ipred_func(tm, TM_VP8, 4, BPC, mmxext);
+    }
+
     if (EXTERNAL_SSE2(cpu_flags)) {
         init_subpel3(0, put, BPC, sse2);
         init_subpel3(1, avg, BPC, sse2);
         init_lpf_funcs(BPC, sse2);
+        init_8_16_32_ipred_funcs(tm, TM_VP8, BPC, sse2);
     }
 
     if (EXTERNAL_SSSE3(cpu_flags)) {

libavcodec/x86/vp9intrapred_16bpp.asm (new file)
;******************************************************************************
;* VP9 Intra prediction SIMD optimizations
;*
;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com>
;* Copyright (c) 2015 Henrik Gramner <henrik gramner com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA 32
pd_2: times 8 dd 2
pd_4: times 8 dd 4
pd_8: times 8 dd 8
cextern pw_1
cextern pw_1023
cextern pw_4095
cextern pd_16
cextern pd_32
SECTION .text
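; All predictors take (uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
; const uint8_t *a), with 16-bit pixels and a byte stride: a 4x4 row is
; 8 bytes (one mmx register), an 8x8 row is 16 bytes (one xmm register).

; Vertical prediction: load the "above" row once and store it into every
; row of the block.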
INIT_MMX mmx
cglobal vp9_ipred_v_4x4_16, 2, 4, 1, dst, stride, l, a
movifnidn aq, amp
mova m0, [aq]
DEFINE_ARGS dst, stride, stride3
lea stride3q, [strideq*3]
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m0
mova [dstq+strideq*2], m0
mova [dstq+stride3q ], m0
RET
INIT_XMM sse
cglobal vp9_ipred_v_8x8_16, 2, 4, 1, dst, stride, l, a
movifnidn aq, amp
mova m0, [aq]
DEFINE_ARGS dst, stride, stride3
lea stride3q, [strideq*3]
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m0
mova [dstq+strideq*2], m0
mova [dstq+stride3q ], m0
lea dstq, [dstq+strideq*4]
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m0
mova [dstq+strideq*2], m0
mova [dstq+stride3q ], m0
RET
INIT_XMM sse
cglobal vp9_ipred_v_16x16_16, 2, 4, 2, dst, stride, l, a
movifnidn aq, amp
mova m0, [aq]
mova m1, [aq+mmsize]
DEFINE_ARGS dst, stride, stride3, cnt
lea stride3q, [strideq*3]
mov cntd, 4
.loop:
mova [dstq+strideq*0+ 0], m0
mova [dstq+strideq*0+16], m1
mova [dstq+strideq*1+ 0], m0
mova [dstq+strideq*1+16], m1
mova [dstq+strideq*2+ 0], m0
mova [dstq+strideq*2+16], m1
mova [dstq+stride3q + 0], m0
mova [dstq+stride3q +16], m1
lea dstq, [dstq+strideq*4]
dec cntd
jg .loop
RET
INIT_XMM sse
cglobal vp9_ipred_v_32x32_16, 2, 4, 4, dst, stride, l, a
movifnidn aq, amp
mova m0, [aq+mmsize*0]
mova m1, [aq+mmsize*1]
mova m2, [aq+mmsize*2]
mova m3, [aq+mmsize*3]
DEFINE_ARGS dst, stride, cnt
mov cntd, 16
.loop:
mova [dstq+strideq*0+ 0], m0
mova [dstq+strideq*0+16], m1
mova [dstq+strideq*0+32], m2
mova [dstq+strideq*0+48], m3
mova [dstq+strideq*1+ 0], m0
mova [dstq+strideq*1+16], m1
mova [dstq+strideq*1+32], m2
mova [dstq+strideq*1+48], m3
lea dstq, [dstq+strideq*2]
dec cntd
jg .loop
RET
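; Horizontal prediction: broadcast each pixel of the "left" edge across
; its own row; pshufw (mmxext) or punpcklwd+pshufd (sse2) replicate one
; 16-bit word over a full register.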
INIT_MMX mmxext
cglobal vp9_ipred_h_4x4_16, 3, 3, 4, dst, stride, l, a
mova m3, [lq]
DEFINE_ARGS dst, stride, stride3
lea stride3q, [strideq*3]
pshufw m0, m3, q3333
pshufw m1, m3, q2222
pshufw m2, m3, q1111
pshufw m3, m3, q0000
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
mova [dstq+strideq*2], m2
mova [dstq+stride3q ], m3
RET
INIT_XMM sse2
cglobal vp9_ipred_h_8x8_16, 3, 3, 4, dst, stride, l, a
mova m2, [lq]
DEFINE_ARGS dst, stride, stride3
lea stride3q, [strideq*3]
punpckhwd m3, m2, m2
pshufd m0, m3, q3333
pshufd m1, m3, q2222
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
pshufd m0, m3, q1111
pshufd m1, m3, q0000
mova [dstq+strideq*2], m0
mova [dstq+stride3q ], m1
lea dstq, [dstq+strideq*4]
punpcklwd m2, m2
pshufd m0, m2, q3333
pshufd m1, m2, q2222
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
pshufd m0, m2, q1111
pshufd m1, m2, q0000
mova [dstq+strideq*2], m0
mova [dstq+stride3q ], m1
RET
INIT_XMM sse2
cglobal vp9_ipred_h_16x16_16, 3, 5, 4, dst, stride, l, stride3, cnt
mov cntd, 3
lea stride3q, [strideq*3]
.loop:
movh m3, [lq+cntq*8]
punpcklwd m3, m3
pshufd m0, m3, q3333
pshufd m1, m3, q2222
pshufd m2, m3, q1111
pshufd m3, m3, q0000
mova [dstq+strideq*0+ 0], m0
mova [dstq+strideq*0+16], m0
mova [dstq+strideq*1+ 0], m1
mova [dstq+strideq*1+16], m1
mova [dstq+strideq*2+ 0], m2
mova [dstq+strideq*2+16], m2
mova [dstq+stride3q + 0], m3
mova [dstq+stride3q +16], m3
lea dstq, [dstq+strideq*4]
dec cntd
jge .loop
RET
INIT_XMM sse2
cglobal vp9_ipred_h_32x32_16, 3, 5, 4, dst, stride, l, stride3, cnt
mov cntd, 7
lea stride3q, [strideq*3]
.loop:
movh m3, [lq+cntq*8]
punpcklwd m3, m3
pshufd m0, m3, q3333
pshufd m1, m3, q2222
pshufd m2, m3, q1111
pshufd m3, m3, q0000
mova [dstq+strideq*0+ 0], m0
mova [dstq+strideq*0+16], m0
mova [dstq+strideq*0+32], m0
mova [dstq+strideq*0+48], m0
mova [dstq+strideq*1+ 0], m1
mova [dstq+strideq*1+16], m1
mova [dstq+strideq*1+32], m1
mova [dstq+strideq*1+48], m1
mova [dstq+strideq*2+ 0], m2
mova [dstq+strideq*2+16], m2
mova [dstq+strideq*2+32], m2
mova [dstq+strideq*2+48], m2
mova [dstq+stride3q + 0], m3
mova [dstq+stride3q +16], m3
mova [dstq+stride3q +32], m3
mova [dstq+stride3q +48], m3
lea dstq, [dstq+strideq*4]
dec cntd
jge .loop
RET
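; DC prediction: sum the "left" and "above" edges (pmaddwd against pw_1
; adds adjacent words), fold the partial sums, add a rounding bias of
; half the divisor, shift right by log2(2*size) and broadcast the
; average to the whole block.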
INIT_MMX mmxext
cglobal vp9_ipred_dc_4x4_16, 4, 4, 2, dst, stride, l, a
mova m0, [lq]
paddw m0, [aq]
DEFINE_ARGS dst, stride, stride3
lea stride3q, [strideq*3]
pmaddwd m0, [pw_1]
pshufw m1, m0, q3232
paddd m0, [pd_4]
paddd m0, m1
psrad m0, 3
pshufw m0, m0, q0000
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m0
mova [dstq+strideq*2], m0
mova [dstq+stride3q ], m0
RET
INIT_XMM sse2
cglobal vp9_ipred_dc_8x8_16, 4, 4, 2, dst, stride, l, a
mova m0, [lq]
paddw m0, [aq]
DEFINE_ARGS dst, stride, stride3
lea stride3q, [strideq*3]
pmaddwd m0, [pw_1]
pshufd m1, m0, q3232
paddd m0, m1
pshufd m1, m0, q1111
paddd m0, [pd_8]
paddd m0, m1
psrad m0, 4
pshuflw m0, m0, q0000
punpcklqdq m0, m0
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m0
mova [dstq+strideq*2], m0
mova [dstq+stride3q ], m0
lea dstq, [dstq+strideq*4]
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m0
mova [dstq+strideq*2], m0
mova [dstq+stride3q ], m0
RET
INIT_XMM sse2
cglobal vp9_ipred_dc_16x16_16, 4, 4, 2, dst, stride, l, a
mova m0, [lq]
paddw m0, [lq+mmsize]
paddw m0, [aq]
paddw m0, [aq+mmsize]
DEFINE_ARGS dst, stride, stride3, cnt
lea stride3q, [strideq*3]
mov cntd, 4
pmaddwd m0, [pw_1]
pshufd m1, m0, q3232
paddd m0, m1
pshufd m1, m0, q1111
paddd m0, [pd_16]
paddd m0, m1
psrad m0, 5
pshuflw m0, m0, q0000
punpcklqdq m0, m0
.loop:
mova [dstq+strideq*0+ 0], m0
mova [dstq+strideq*0+16], m0
mova [dstq+strideq*1+ 0], m0
mova [dstq+strideq*1+16], m0
mova [dstq+strideq*2+ 0], m0
mova [dstq+strideq*2+16], m0
mova [dstq+stride3q + 0], m0
mova [dstq+stride3q +16], m0
lea dstq, [dstq+strideq*4]
dec cntd
jg .loop
RET
INIT_XMM sse2
cglobal vp9_ipred_dc_32x32_16, 4, 4, 2, dst, stride, l, a
mova m0, [lq+mmsize*0]
paddw m0, [lq+mmsize*1]
paddw m0, [lq+mmsize*2]
paddw m0, [lq+mmsize*3]
paddw m0, [aq+mmsize*0]
paddw m0, [aq+mmsize*1]
paddw m0, [aq+mmsize*2]
paddw m0, [aq+mmsize*3]
DEFINE_ARGS dst, stride, stride3, cnt
lea stride3q, [strideq*3]
mov cntd, 16
pmaddwd m0, [pw_1]
pshufd m1, m0, q3232
paddd m0, m1
pshufd m1, m0, q1111
paddd m0, [pd_32]
paddd m0, m1
psrad m0, 6
pshuflw m0, m0, q0000
punpcklqdq m0, m0
.loop:
mova [dstq+strideq*0+ 0], m0
mova [dstq+strideq*0+16], m0
mova [dstq+strideq*0+32], m0
mova [dstq+strideq*0+48], m0
mova [dstq+strideq*1+ 0], m0
mova [dstq+strideq*1+16], m0
mova [dstq+strideq*1+32], m0
mova [dstq+strideq*1+48], m0
lea dstq, [dstq+strideq*2]
dec cntd
jg .loop
RET
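; dc_top/dc_left: same scheme, but averaging only one edge. %2 picks the
; "above" or "left" pointer, and the rounding bias and shift drop by one
; bit since only half as many samples are summed.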
%macro DC_1D_FNS 2
INIT_MMX mmxext
cglobal vp9_ipred_dc_%1_4x4_16, 4, 4, 2, dst, stride, l, a
mova m0, [%2]
DEFINE_ARGS dst, stride, stride3
lea stride3q, [strideq*3]
pmaddwd m0, [pw_1]
pshufw m1, m0, q3232
paddd m0, [pd_2]
paddd m0, m1
psrad m0, 2
pshufw m0, m0, q0000
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m0
mova [dstq+strideq*2], m0
mova [dstq+stride3q ], m0
RET
INIT_XMM sse2
cglobal vp9_ipred_dc_%1_8x8_16, 4, 4, 2, dst, stride, l, a
mova m0, [%2]
DEFINE_ARGS dst, stride, stride3
lea stride3q, [strideq*3]
pmaddwd m0, [pw_1]
pshufd m1, m0, q3232
paddd m0, m1
pshufd m1, m0, q1111
paddd m0, [pd_4]
paddd m0, m1
psrad m0, 3
pshuflw m0, m0, q0000
punpcklqdq m0, m0
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m0
mova [dstq+strideq*2], m0
mova [dstq+stride3q ], m0
lea dstq, [dstq+strideq*4]
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m0
mova [dstq+strideq*2], m0
mova [dstq+stride3q ], m0
RET
INIT_XMM sse2
cglobal vp9_ipred_dc_%1_16x16_16, 4, 4, 2, dst, stride, l, a
mova m0, [%2]
paddw m0, [%2+mmsize]
DEFINE_ARGS dst, stride, stride3, cnt
lea stride3q, [strideq*3]
mov cntd, 4
pmaddwd m0, [pw_1]
pshufd m1, m0, q3232
paddd m0, m1
pshufd m1, m0, q1111
paddd m0, [pd_8]
paddd m0, m1
psrad m0, 4
pshuflw m0, m0, q0000
punpcklqdq m0, m0
.loop:
mova [dstq+strideq*0+ 0], m0
mova [dstq+strideq*0+16], m0
mova [dstq+strideq*1+ 0], m0
mova [dstq+strideq*1+16], m0
mova [dstq+strideq*2+ 0], m0
mova [dstq+strideq*2+16], m0
mova [dstq+stride3q + 0], m0
mova [dstq+stride3q +16], m0
lea dstq, [dstq+strideq*4]
dec cntd
jg .loop
RET
INIT_XMM sse2
cglobal vp9_ipred_dc_%1_32x32_16, 4, 4, 2, dst, stride, l, a
mova m0, [%2+mmsize*0]
paddw m0, [%2+mmsize*1]
paddw m0, [%2+mmsize*2]
paddw m0, [%2+mmsize*3]
DEFINE_ARGS dst, stride, cnt
mov cntd, 16
pmaddwd m0, [pw_1]
pshufd m1, m0, q3232
paddd m0, m1
pshufd m1, m0, q1111
paddd m0, [pd_16]
paddd m0, m1
psrad m0, 5
pshuflw m0, m0, q0000
punpcklqdq m0, m0
.loop:
mova [dstq+strideq*0+ 0], m0
mova [dstq+strideq*0+16], m0
mova [dstq+strideq*0+32], m0
mova [dstq+strideq*0+48], m0
mova [dstq+strideq*1+ 0], m0
mova [dstq+strideq*1+16], m0
mova [dstq+strideq*1+32], m0
mova [dstq+strideq*1+48], m0
lea dstq, [dstq+strideq*2]
dec cntd
jg .loop
RET
%endmacro
DC_1D_FNS top, aq
DC_1D_FNS left, lq
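; TrueMotion prediction: pred[y][x] = clip(left[y] + above[x] - topleft),
; clipped to [0, pixel_max]. The 10- and 12-bit versions differ only in
; the clip ceiling (pw_1023 vs pw_4095), so the _12 entry points load
; their constant and jump into the shared _10 body.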
INIT_MMX mmxext
cglobal vp9_ipred_tm_4x4_10, 4, 4, 6, dst, stride, l, a
mova m5, [pw_1023]
.body:
mova m4, [aq]
mova m3, [lq]
movd m0, [aq-4]
pshufw m0, m0, q1111
psubw m4, m0
DEFINE_ARGS dst, stride, stride3
lea stride3q, [strideq*3]
pshufw m0, m3, q3333
pshufw m1, m3, q2222
pshufw m2, m3, q1111
pshufw m3, m3, q0000
paddw m0, m4
paddw m1, m4
paddw m2, m4
paddw m3, m4
pxor m4, m4
pmaxsw m0, m4
pmaxsw m1, m4
pmaxsw m2, m4
pmaxsw m3, m4
pminsw m0, m5
pminsw m1, m5
pminsw m2, m5
pminsw m3, m5
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
mova [dstq+strideq*2], m2
mova [dstq+stride3q ], m3
RET
cglobal vp9_ipred_tm_4x4_12, 4, 4, 6, dst, stride, l, a
mova m5, [pw_4095]
jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_4x4_10 %+ SUFFIX).body
INIT_XMM sse2
cglobal vp9_ipred_tm_8x8_10, 4, 5, 7, dst, stride, l, a
mova m4, [pw_1023]
.body:
pxor m6, m6
mova m5, [aq]
movd m0, [aq-4]
pshuflw m0, m0, q1111
punpcklqdq m0, m0
psubw m5, m0
DEFINE_ARGS dst, stride, l, stride3, cnt
lea stride3q, [strideq*3]
mov cntd, 1
.loop:
movh m3, [lq+cntq*8]
punpcklwd m3, m3
pshufd m0, m3, q3333
pshufd m1, m3, q2222
pshufd m2, m3, q1111
pshufd m3, m3, q0000
paddw m0, m5
paddw m1, m5
paddw m2, m5
paddw m3, m5
pmaxsw m0, m6
pmaxsw m1, m6
pmaxsw m2, m6
pmaxsw m3, m6
pminsw m0, m4
pminsw m1, m4
pminsw m2, m4
pminsw m3, m4
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
mova [dstq+strideq*2], m2
mova [dstq+stride3q ], m3
lea dstq, [dstq+strideq*4]
dec cntd
jge .loop
RET
cglobal vp9_ipred_tm_8x8_12, 4, 5, 7, dst, stride, l, a
mova m4, [pw_4095]
jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_8x8_10 %+ SUFFIX).body
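; 16x16: two rows per loop iteration; the adds and clips for the two
; rows are interleaved to break up dependency chains.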
INIT_XMM sse2
cglobal vp9_ipred_tm_16x16_10, 4, 4, 8, dst, stride, l, a
mova m7, [pw_1023]
.body:
pxor m6, m6
mova m4, [aq]
mova m5, [aq+mmsize]
movd m0, [aq-4]
pshuflw m0, m0, q1111
punpcklqdq m0, m0
psubw m4, m0
psubw m5, m0
DEFINE_ARGS dst, stride, l, cnt
mov cntd, 7
.loop:
movd m3, [lq+cntq*4]
punpcklwd m3, m3
pshufd m2, m3, q1111
pshufd m3, m3, q0000
paddw m0, m2, m4
paddw m2, m5
paddw m1, m3, m4
paddw m3, m5
pmaxsw m0, m6
pmaxsw m2, m6
pmaxsw m1, m6
pmaxsw m3, m6
pminsw m0, m7
pminsw m2, m7
pminsw m1, m7
pminsw m3, m7
mova [dstq+strideq*0+ 0], m0
mova [dstq+strideq*0+16], m2
mova [dstq+strideq*1+ 0], m1
mova [dstq+strideq*1+16], m3
lea dstq, [dstq+strideq*2]
dec cntd
jge .loop
RET
cglobal vp9_ipred_tm_16x16_12, 4, 4, 8, dst, stride, l, a
mova m7, [pw_4095]
jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_16x16_10 %+ SUFFIX).body
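; 32x32: on x86-64 the clip bounds stay resident in m8/m9; x86-32 only
; has eight xmm registers, so the bounds are spilled to a 32-byte stack
; buffer instead.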
INIT_XMM sse2
cglobal vp9_ipred_tm_32x32_10, 4, 4, 10, 32 * ARCH_X86_32, dst, stride, l, a
mova m0, [pw_1023]
.body:
pxor m1, m1
%if ARCH_X86_64
SWAP 0, 8
SWAP 1, 9
%define reg_min m9
%define reg_max m8
%else
mova [rsp+ 0], m0
mova [rsp+16], m1
%define reg_min [rsp+16]
%define reg_max [rsp+ 0]
%endif
mova m4, [aq+mmsize*0]
mova m5, [aq+mmsize*1]
mova m6, [aq+mmsize*2]
mova m7, [aq+mmsize*3]
movd m0, [aq-4]
pshuflw m0, m0, q1111
punpcklqdq m0, m0
psubw m4, m0
psubw m5, m0
psubw m6, m0
psubw m7, m0
DEFINE_ARGS dst, stride, l, cnt
mov cntd, 31
.loop:
pinsrw m3, [lq+cntq*2], 0
punpcklwd m3, m3
pshufd m3, m3, q0000
paddw m0, m3, m4
paddw m1, m3, m5
paddw m2, m3, m6
paddw m3, m7
pmaxsw m0, reg_min
pmaxsw m1, reg_min
pmaxsw m2, reg_min
pmaxsw m3, reg_min
pminsw m0, reg_max
pminsw m1, reg_max
pminsw m2, reg_max
pminsw m3, reg_max
mova [dstq+strideq*0+ 0], m0
mova [dstq+strideq*0+16], m1
mova [dstq+strideq*0+32], m2
mova [dstq+strideq*0+48], m3
add dstq, strideq
dec cntd
jge .loop
RET
cglobal vp9_ipred_tm_32x32_12, 4, 4, 10, 32 * ARCH_X86_32, dst, stride, l, a
mova m0, [pw_4095]
jmp mangle(private_prefix %+ _ %+ vp9_ipred_tm_32x32_10 %+ SUFFIX).body