Commit e9930883, authored by Martin Vignali, committed by James Darnley

libavcodec/bswapdsp : add AVX2 func for bswap_buf (swap uint32_t)

parent 9b0510a8
...@@ -35,14 +35,18 @@ SECTION .text ...@@ -35,14 +35,18 @@ SECTION .text
mov r3d, r2d mov r3d, r2d
sar r2d, 3 sar r2d, 3
jz .left4_%1 jz .left4_%1
%if cpuflag(avx2)
sar r2d, 1
jz .left8_%1
%endif
.loop8_%1: .loop8_%1:
mov%1 m0, [r1 + 0] mov%1 m0, [r1 + 0]
mov%1 m1, [r1 + 16] mov%1 m1, [r1 + mmsize]
%if cpuflag(ssse3) %if cpuflag(ssse3)||cpuflag(avx2)
pshufb m0, m2 pshufb m0, m2
pshufb m1, m2 pshufb m1, m2
mov%1 [r0 + 0], m0 mov%1 [r0 + 0], m0
mov%1 [r0 + 16], m1 mov%1 [r0 + mmsize], m1
%else %else
pshuflw m0, m0, 10110001b pshuflw m0, m0, 10110001b
pshuflw m1, m1, 10110001b pshuflw m1, m1, 10110001b
...@@ -59,18 +63,29 @@ SECTION .text ...@@ -59,18 +63,29 @@ SECTION .text
mov%1 [r0 + 0], m2 mov%1 [r0 + 0], m2
mov%1 [r0 + 16], m3 mov%1 [r0 + 16], m3
%endif %endif
add r0, 32 add r0, mmsize*2
add r1, 32 add r1, mmsize*2
dec r2d dec r2d
jnz .loop8_%1 jnz .loop8_%1
%if cpuflag(avx2)
.left8_%1:
mov r2d, r3d
test r3d, 8
jz .left4_%1
mov%1 m0, [r1]
pshufb m0, m2
mov%1 [r0 + 0], m0
add r1, mmsize
add r0, mmsize
%endif
.left4_%1: .left4_%1:
mov r2d, r3d mov r2d, r3d
test r3d, 4 test r3d, 4
jz .left jz .left
mov%1 m0, [r1] mov%1 xm0, [r1]
%if cpuflag(ssse3) %if cpuflag(ssse3)
pshufb m0, m2 pshufb xm0, xm2
mov%1 [r0], m0 mov%1 [r0], xm0
%else %else
pshuflw m0, m0, 10110001b pshuflw m0, m0, 10110001b
pshufhw m0, m0, 10110001b pshufhw m0, m0, 10110001b
...@@ -86,16 +101,20 @@ SECTION .text ...@@ -86,16 +101,20 @@ SECTION .text
; void ff_bswap_buf(uint32_t *dst, const uint32_t *src, int w); ; void ff_bswap_buf(uint32_t *dst, const uint32_t *src, int w);
%macro BSWAP32_BUF 0 %macro BSWAP32_BUF 0
%if cpuflag(ssse3) %if cpuflag(ssse3)||cpuflag(avx2)
cglobal bswap32_buf, 3,4,3 cglobal bswap32_buf, 3,4,3
mov r3, r1 mov r3, r1
%if cpuflag(avx2)
vbroadcasti128 m2, [pb_bswap32]
%else
mova m2, [pb_bswap32] mova m2, [pb_bswap32]
%endif
%else %else
cglobal bswap32_buf, 3,4,5 cglobal bswap32_buf, 3,4,5
mov r3, r1 mov r3, r1
%endif %endif
or r3, r0 or r3, r0
test r3, 15 test r3, mmsize - 1
jz .start_align jz .start_align
BSWAP_LOOPS u BSWAP_LOOPS u
jmp .left jmp .left
...@@ -105,9 +124,9 @@ cglobal bswap32_buf, 3,4,5 ...@@ -105,9 +124,9 @@ cglobal bswap32_buf, 3,4,5
%if cpuflag(ssse3) %if cpuflag(ssse3)
test r2d, 2 test r2d, 2
jz .left1 jz .left1
movq m0, [r1] movq xm0, [r1]
pshufb m0, m2 pshufb xm0, xm2
movq [r0], m0 movq [r0], xm0
add r1, 8 add r1, 8
add r0, 8 add r0, 8
.left1: .left1:
...@@ -137,3 +156,6 @@ BSWAP32_BUF ...@@ -137,3 +156,6 @@ BSWAP32_BUF
INIT_XMM ssse3 INIT_XMM ssse3
BSWAP32_BUF BSWAP32_BUF
INIT_YMM avx2
BSWAP32_BUF
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w); void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w); void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
void ff_bswap32_buf_avx2(uint32_t *dst, const uint32_t *src, int w);
av_cold void ff_bswapdsp_init_x86(BswapDSPContext *c) av_cold void ff_bswapdsp_init_x86(BswapDSPContext *c)
{ {
...@@ -34,4 +35,6 @@ av_cold void ff_bswapdsp_init_x86(BswapDSPContext *c) ...@@ -34,4 +35,6 @@ av_cold void ff_bswapdsp_init_x86(BswapDSPContext *c)
c->bswap_buf = ff_bswap32_buf_sse2; c->bswap_buf = ff_bswap32_buf_sse2;
if (EXTERNAL_SSSE3(cpu_flags)) if (EXTERNAL_SSSE3(cpu_flags))
c->bswap_buf = ff_bswap32_buf_ssse3; c->bswap_buf = ff_bswap32_buf_ssse3;
if (EXTERNAL_AVX2_FAST(cpu_flags))
c->bswap_buf = ff_bswap32_buf_avx2;
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment