Commit 35a5d971 authored by Ilia Valiakhmetov, committed by Ronald S. Bultje

avcodec/vp9: add 64-bit ipred_dr_32x32_16 avx2 implementation

vp9_diag_downright_32x32_12bpp_c: 429.7
vp9_diag_downright_32x32_12bpp_sse2: 158.9
vp9_diag_downright_32x32_12bpp_ssse3: 144.6
vp9_diag_downright_32x32_12bpp_avx: 141.0
vp9_diag_downright_32x32_12bpp_avx2: 73.8

Almost 50% faster than the avx implementation
Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
parent 0daa1cf0
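
For reference, diagonal down-right prediction smooths the left, top-left and above edge samples with the same (a + 2*b + c + 2) >> 2 filter that the assembly's LOWPASS macro applies, then copies each smoothed sample along its down-right diagonal, so each row of the block is the row above it shifted right by one pixel, pulling in one more filtered left-edge sample. The AVX2 routine below keeps the whole 63-sample smoothed diagonal in ymm registers and generates rows with vpalignr shifts instead of per-row copies. A hypothetical scalar sketch of the same idea follows; the function name, the edge ordering (left[0] as the topmost left sample, top[-1] as the corner) and the pixel-unit stride are assumptions for illustration, not FFmpeg's actual C reference or argument layout.

#include <stddef.h>
#include <stdint.h>

#define B 32
#define LP3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)

static void diag_downright_32x32_sketch(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *left,
                                        const uint16_t *top)
{
    uint16_t e[2 * B + 1]; /* raw edge: left bottom-to-top, corner, above */
    uint16_t f[2 * B - 1]; /* 3-tap smoothed edge ("diagonal") */
    int i, r, c;

    for (i = 0; i < B; i++) {
        e[i]         = left[B - 1 - i]; /* left edge, bottom to top */
        e[B + 1 + i] = top[i];          /* above edge, left to right */
    }
    e[B] = top[-1];                     /* top-left corner sample */

    for (i = 1; i < 2 * B; i++)
        f[i - 1] = LP3(e[i - 1], e[i], e[i + 1]);

    /* each pixel takes the smoothed sample lying on its down-right
     * diagonal; moving one row down shifts the row one pixel right */
    for (r = 0; r < B; r++)
        for (c = 0; c < B; c++)
            dst[r * stride + c] = f[B - 1 - r + c];
}
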
@@ -52,8 +52,9 @@ decl_ipred_fns(dc, 16, mmxext, sse2);
decl_ipred_fns(dc_top, 16, mmxext, sse2);
decl_ipred_fns(dc_left, 16, mmxext, sse2);
decl_ipred_fn(dl, 16, 16, avx2);
decl_ipred_fn(dr, 16, 16, avx2);
decl_ipred_fn(dl, 32, 16, avx2);
decl_ipred_fn(dr, 16, 16, avx2);
decl_ipred_fn(dr, 32, 16, avx2);
#define decl_ipred_dir_funcs(type) \
decl_ipred_fns(type, 16, sse2, sse2); \
@@ -137,8 +138,9 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
init_fpel_func(1, 1, 64, avg, _16, avx2);
init_fpel_func(0, 1, 128, avg, _16, avx2);
init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2);
init_ipred_func(dr, DIAG_DOWN_RIGHT, 16, 16, avx2);
init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2);
init_ipred_func(dr, DIAG_DOWN_RIGHT, 16, 16, avx2);
init_ipred_func(dr, DIAG_DOWN_RIGHT, 32, 16, avx2);
}
#endif /* HAVE_X86ASM */
@@ -1221,8 +1221,109 @@ cglobal vp9_ipred_dr_16x16_16, 4, 5, 6, dst, stride, l, a
mova [dstq+strideq*0], m4 ; 0
mova [dst3q+strideq*4], m5 ; 7
RET
%endif
%if ARCH_X86_64
cglobal vp9_ipred_dr_32x32_16, 4, 7, 10, dst, stride, l, a
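; build the 3-tap-filtered edge (left, top-left corner, above) in m0-m3,
; then emit each output row as a one-pixel vpalignr shift of that diagonal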
mova m0, [lq+mmsize*0+0] ; l[0-15]
mova m1, [lq+mmsize*1+0] ; l[16-31]
movu m2, [aq+mmsize*0-2] ; *abcdefghijklmno
mova m3, [aq+mmsize*0+0] ; abcdefghijklmnop
mova m4, [aq+mmsize*1+0] ; qrstuvwxyz012345
vperm2i128 m5, m0, m1, q0201 ; lmnopqrstuvwxyz0
vpalignr m6, m5, m0, 2 ; mnopqrstuvwxyz01
vpalignr m7, m5, m0, 4 ; nopqrstuvwxyz012
LOWPASS 0, 6, 7 ; L[0-15]
vperm2i128 m7, m1, m2, q0201 ; stuvwxyz*abcdefg
vpalignr m5, m7, m1, 2 ; lmnopqrstuvwxyz*
vpalignr m6, m7, m1, 4 ; mnopqrstuvwxyz*a
LOWPASS 1, 5, 6 ; L[16-31]#
vperm2i128 m5, m3, m4, q0201 ; ijklmnopqrstuvwx
vpalignr m6, m5, m3, 2 ; bcdefghijklmnopq
LOWPASS 2, 3, 6 ; A[0-15]
movu m3, [aq+mmsize*1-2] ; pqrstuvwxyz01234
vperm2i128 m6, m4, m4, q2001 ; yz012345........
vpalignr m7, m6, m4, 2 ; rstuvwxyz012345.
LOWPASS 3, 4, 7 ; A[16-31].
vperm2i128 m4, m1, m2, q0201 ; TUVWXYZ#ABCDEFGH
vperm2i128 m5, m0, m1, q0201 ; L[7-15]L[16-23]
vperm2i128 m8, m2, m3, q0201 ; IJKLMNOPQRSTUVWX
DEFINE_ARGS dst8, stride, stride3, stride7, stride5, dst24, cnt
lea stride3q, [strideq*3]
lea stride5q, [stride3q+strideq*2]
lea stride7q, [strideq*4+stride3q]
lea dst24q, [dst8q+stride3q*8]
lea dst8q, [dst8q+strideq*8]
mov cntd, 2
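; 2 iterations x 16 rows: 8 rows via dst24q and 8 via dst8q per pass,
; each row stored as two 16-pixel halves at offsets +0 and +32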
.loop:
mova [dst24q+stride7q+0 ], m0 ; 31 23 15 7
mova [dst24q+stride7q+32], m1
mova [dst8q+stride7q+0], m1
mova [dst8q+stride7q+32], m2
vpalignr m6, m4, m1, 2
vpalignr m7, m5, m0, 2
vpalignr m9, m8, m2, 2
mova [dst24q+stride3q*2+0], m7 ; 30 22 14 6
mova [dst24q+stride3q*2+32], m6
mova [dst8q+stride3q*2+0], m6
mova [dst8q+stride3q*2+32], m9
vpalignr m6, m4, m1, 4
vpalignr m7, m5, m0, 4
vpalignr m9, m8, m2, 4
mova [dst24q+stride5q+0], m7 ; 29 21 13 5
mova [dst24q+stride5q+32], m6
mova [dst8q+stride5q+0], m6
mova [dst8q+stride5q+32], m9
vpalignr m6, m4, m1, 6
vpalignr m7, m5, m0, 6
vpalignr m9, m8, m2, 6
mova [dst24q+strideq*4+0 ], m7 ; 28 20 12 4
mova [dst24q+strideq*4+32], m6
mova [dst8q+strideq*4+0], m6
mova [dst8q+strideq*4+32], m9
vpalignr m6, m4, m1, 8
vpalignr m7, m5, m0, 8
vpalignr m9, m8, m2, 8
mova [dst24q+stride3q+0 ], m7 ; 27 19 11 3
mova [dst24q+stride3q+32], m6
mova [dst8q+stride3q+0], m6
mova [dst8q+stride3q+32], m9
vpalignr m6, m4, m1, 10
vpalignr m7, m5, m0, 10
vpalignr m9, m8, m2, 10
mova [dst24q+strideq*2+0 ], m7 ; 26 18 10 2
mova [dst24q+strideq*2+32], m6
mova [dst8q+strideq*2+0], m6
mova [dst8q+strideq*2+32], m9
vpalignr m6, m4, m1, 12
vpalignr m7, m5, m0, 12
vpalignr m9, m8, m2, 12
mova [dst24q+strideq+0 ], m7 ; 25 17 9 1
mova [dst24q+strideq+32], m6
mova [dst8q+strideq+0], m6
mova [dst8q+strideq+32], m9
vpalignr m6, m4, m1, 14
vpalignr m7, m5, m0, 14
vpalignr m9, m8, m2, 14
mova [dst24q+strideq*0+0 ], m7 ; 24 16 8 0
mova [dst24q+strideq*0+32], m6
mova [dst8q+strideq*0+0], m6
mova [dst8q+strideq*0+32], m9
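; slide the diagonal window 8 pixels (one 128-bit lane) toward the above
; edge for the next group of rows (16-23 and 0-7)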
mova m0, m5
mova m5, m1
mova m1, m4
mova m4, m2
mova m2, m8
mova m8, m3
sub dst24q, stride7q
sub dst24q, strideq
sub dst8q, stride7q
sub dst8q, strideq
dec cntd
jg .loop
RET
%endif
%endif
%macro VL_FUNCS 1 ; stack_mem_for_32x32_32bit_function
cglobal vp9_ipred_vl_4x4_16, 2, 4, 3, dst, stride, l, a