Commit 5eb4f95b authored by Mirage Abeysekara, committed by Ronald S. Bultje

h264pred: added AVX2 implementation for tm_vp8 16x16.

checkasm --bench results with 5000 runs

pred16x16_tm_vp8_c: 302.8
pred16x16_tm_vp8_mmx: 101.4
pred16x16_tm_vp8_mmxext: 95.5
pred16x16_tm_vp8_sse2: 95.1
pred16x16_tm_vp8_avx2: 38.2
Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
parent f3cd2302
......@@ -268,6 +268,43 @@ cglobal pred16x16_tm_vp8_8, 2,6,6
jg .loop
REP_RET
%if HAVE_AVX2_EXTERNAL
;-----------------------------------------------------------------------------
; pred16x16_tm_vp8_8, AVX2 version (2 arguments: dst, stride)
;
; Every output byte is  left[y] + top[x] - topleft,  computed on 16-bit words
; and packed back to bytes with unsigned saturation.
;
; Register roles:
;   m0      = top[x] - topleft, 16 words (loop invariant)
;   m1..m4  = per-row working registers
;   dstq    = row just above the four rows written by the current pass
;   cntd    = passes remaining (4 passes x 4 rows = 16 rows)
;
; The even rows of each pass are stored with movdqa, so those destination
; addresses must be 16-byte aligned.
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal pred16x16_tm_vp8_8, 2, 4, 5, dst, stride, stride3, cnt
    sub                     dstq, strideq               ; step back to the row above the block
    lea                 stride3q, [strideq*3]
    mov                     cntd, 4
    vpbroadcastb             xm1, [dstq-1]              ; top-left pixel replicated to 16 bytes
    pmovzxbw                  m0, [dstq]                ; 16 top pixels widened to words
    pmovzxbw                  m1, xm1                   ; top-left widened to 16 words
    psubw                     m0, m1                    ; m0 = top[x] - topleft

.loop:
    ; rows 1 and 2 of this pass: broadcast the left-neighbour byte, widen, add m0
    vpbroadcastb             xm1, [dstq+strideq*1-1]
    vpbroadcastb             xm2, [dstq+strideq*2-1]
    pmovzxbw                  m1, xm1
    pmovzxbw                  m2, xm2
    paddw                     m1, m0
    paddw                     m2, m0
    vpackuswb                 m1, m1, m2                ; per-lane pack: qwords = r1.lo, r2.lo, r1.hi, r2.hi
    vpermq                    m1, m1, q3120             ; reorder to r1.lo, r1.hi, r2.lo, r2.hi

    ; rows 3 and 4 of this pass, same recipe
    vpbroadcastb             xm3, [dstq+stride3q-1]
    vpbroadcastb             xm4, [dstq+strideq*4-1]
    pmovzxbw                  m3, xm3
    pmovzxbw                  m4, xm4
    paddw                     m3, m0
    paddw                     m4, m0
    vpackuswb                 m3, m3, m4
    vpermq                    m3, m3, q3120

    ; write the four finished rows: low half = odd row, high half = even row
    movdqa      [dstq+strideq*1], xm1
    vextracti128 [dstq+strideq*2], m1, 1
    movdqa     [dstq+stride3q*1], xm3
    vextracti128 [dstq+strideq*4], m3, 1

    lea                     dstq, [dstq+strideq*4]      ; advance four rows
    dec                     cntd
    jg .loop
    REP_RET
%endif
;-----------------------------------------------------------------------------
; void ff_pred16x16_plane_*_8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
......
......@@ -127,6 +127,7 @@ PRED16x16(plane_svq3, 8, ssse3)
/* 16x16 tm_vp8 predictor, 8-bit depth: one entry per instruction-set suffix.
 * NOTE(review): PRED16x16 is defined outside this hunk; presumably each line
 * declares the matching ff_pred16x16_tm_vp8_8_<suffix> symbol -- confirm. */
PRED16x16(tm_vp8, 8, mmx)
PRED16x16(tm_vp8, 8, mmxext)
PRED16x16(tm_vp8, 8, sse2)
PRED16x16(tm_vp8, 8, avx2)
PRED8x8(top_dc, 8, mmxext)
PRED8x8(dc_rv40, 8, mmxext)
......@@ -323,6 +324,12 @@ av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id,
}
}
}
if(EXTERNAL_AVX2(cpu_flags)){
if (codec_id == AV_CODEC_ID_VP8) {
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_avx2;
}
}
} else if (bit_depth == 10) {
if (EXTERNAL_MMXEXT(cpu_flags)) {
h->pred4x4[DC_PRED ] = ff_pred4x4_dc_10_mmxext;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment