Commit 4353c350, authored by Martin Vignali

avcodec/x86/lossless_videodsp : add avx2 version for add_left_pred

parent cfbcea1c
...@@ -114,40 +114,54 @@ MEDIAN_PRED ...@@ -114,40 +114,54 @@ MEDIAN_PRED
add dstq, wq add dstq, wq
neg wq neg wq
%%.loop: %%.loop:
pshufb xm0, xm5
%if %2 %if %2
mova m1, [srcq+wq] mova m1, [srcq+wq]
%else %else
movu m1, [srcq+wq] movu m1, [srcq+wq]
%endif %endif
mova m2, m1 psllw m2, m1, 8
psllw m1, 8
paddb m1, m2 paddb m1, m2
mova m2, m1 pshufb m2, m1, m3
pshufb m1, m3
paddb m1, m2 paddb m1, m2
pshufb m0, m5 pshufb m2, m1, m4
mova m2, m1
pshufb m1, m4
paddb m1, m2 paddb m1, m2
%if mmsize == 16 %if mmsize >= 16
mova m2, m1 pshufb m2, m1, m6
pshufb m1, m6
paddb m1, m2 paddb m1, m2
%endif %endif
paddb m0, m1 paddb xm0, xm1
%if %1 %if %1
mova [dstq+wq], m0 mova [dstq+wq], xm0
%else %else
movq [dstq+wq], m0 movq [dstq+wq], xm0
movhps [dstq+wq+8], m0 movhps [dstq+wq+8], xm0
%endif
%if mmsize == 32
; ymm build only: the xmm code above stored the first 16 output bytes; this
; section produces and stores the second 16 bytes of the 32-byte iteration.
vextracti128 xm2, m1, 1 ; xm2 = upper 128-bit lane of m1
pshufb xm0, xm5 ; shuffle the first lane's result with xm5 (loaded from pb_15) - presumably replicates its last byte into every byte; confirm pb_15 contents
paddb xm0, xm2 ; add that carried value to every byte of the upper lane
; store the result at offset +16 (single aligned store when %1 is set, two 8-byte halves otherwise)
%if %1
mova [dstq+wq+16], xm0
%else;
movq [dstq+wq+16], xm0
movhps [dstq+wq+16+8], xm0
%endif
%endif %endif
add wq, mmsize add wq, mmsize
jl %%.loop jl %%.loop
%if mmsize == 32
; Return the last output byte (dstq was advanced by w before the loop, so
; dstq-1 is the final element). A zero-extending byte load yields the same
; eax as a dword load followed by "and eax, 0xff", without reading the
; three bytes that lie beyond dstq.
movzx eax, byte [dstq - 1]
%else;
mov eax, mmsize-1 mov eax, mmsize-1
sub eax, wd sub eax, wd
movd m1, eax movd m1, eax
pshufb m0, m1 pshufb m0, m1
movd eax, m0 movd eax, m0
%endif
RET RET
%endmacro %endmacro
...@@ -166,15 +180,15 @@ cglobal add_left_pred, 3,3,7, dst, src, w, left ...@@ -166,15 +180,15 @@ cglobal add_left_pred, 3,3,7, dst, src, w, left
%macro ADD_LEFT_PRED_UNALIGNED 0 %macro ADD_LEFT_PRED_UNALIGNED 0
cglobal add_left_pred_unaligned, 3,3,7, dst, src, w, left cglobal add_left_pred_unaligned, 3,3,7, dst, src, w, left
mova m5, [pb_15] mova xm5, [pb_15]
mova m6, [pb_zzzzzzzz77777777] VBROADCASTI128 m6, [pb_zzzzzzzz77777777]
mova m4, [pb_zzzz3333zzzzbbbb] VBROADCASTI128 m4, [pb_zzzz3333zzzzbbbb]
mova m3, [pb_zz11zz55zz99zzdd] VBROADCASTI128 m3, [pb_zz11zz55zz99zzdd]
movd m0, leftm movd xm0, leftm
pslldq m0, 15 pslldq xm0, 15
test srcq, 15 test srcq, mmsize - 1
jnz .src_unaligned jnz .src_unaligned
test dstq, 15 test dstq, mmsize - 1
jnz .dst_unaligned jnz .dst_unaligned
ADD_LEFT_LOOP 1, 1 ADD_LEFT_LOOP 1, 1
.dst_unaligned: .dst_unaligned:
...@@ -186,6 +200,11 @@ cglobal add_left_pred_unaligned, 3,3,7, dst, src, w, left ...@@ -186,6 +200,11 @@ cglobal add_left_pred_unaligned, 3,3,7, dst, src, w, left
INIT_XMM ssse3 INIT_XMM ssse3
ADD_LEFT_PRED_UNALIGNED ADD_LEFT_PRED_UNALIGNED
; Emit an AVX2 variant of the function by expanding the same macro again after
; INIT_YMM avx2; only assembled when HAVE_AVX2_EXTERNAL is set by the build.
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
ADD_LEFT_PRED_UNALIGNED
%endif
;------------------------------------------------------------------------------ ;------------------------------------------------------------------------------
; void ff_add_bytes(uint8_t *dst, uint8_t *src, ptrdiff_t w); ; void ff_add_bytes(uint8_t *dst, uint8_t *src, ptrdiff_t w);
;------------------------------------------------------------------------------ ;------------------------------------------------------------------------------
......
...@@ -38,6 +38,8 @@ int ff_add_left_pred_ssse3(uint8_t *dst, const uint8_t *src, ...@@ -38,6 +38,8 @@ int ff_add_left_pred_ssse3(uint8_t *dst, const uint8_t *src,
ptrdiff_t w, int left); ptrdiff_t w, int left);
int ff_add_left_pred_unaligned_ssse3(uint8_t *dst, const uint8_t *src, int ff_add_left_pred_unaligned_ssse3(uint8_t *dst, const uint8_t *src,
ptrdiff_t w, int left); ptrdiff_t w, int left);
int ff_add_left_pred_unaligned_avx2(uint8_t *dst, const uint8_t *src,
ptrdiff_t w, int left);
int ff_add_left_pred_int16_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc); int ff_add_left_pred_int16_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
int ff_add_left_pred_int16_sse4(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc); int ff_add_left_pred_int16_sse4(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
...@@ -118,5 +120,6 @@ void ff_llviddsp_init_x86(LLVidDSPContext *c) ...@@ -118,5 +120,6 @@ void ff_llviddsp_init_x86(LLVidDSPContext *c)
} }
if (EXTERNAL_AVX2_FAST(cpu_flags)) { if (EXTERNAL_AVX2_FAST(cpu_flags)) {
c->add_bytes = ff_add_bytes_avx2; c->add_bytes = ff_add_bytes_avx2;
c->add_left_pred = ff_add_left_pred_unaligned_avx2;
} }
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment