Commit d0f56ca0, authored by James Almer, committed by Michael Niedermayer

x86/hevc_deblock: improve 8bit transpose store macros

Up to four fewer instructions, depending on function and instruction set.
Signed-off-by: James Almer <jamrial@gmail.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
parent 2e6fdcb7
......@@ -77,16 +77,10 @@ INIT_XMM sse2
; in: 4 rows of 8 words in m0..m3
; out: 8 rows of 4 bytes in %1..%8
; Narrows the word rows to unsigned-saturated bytes, interleaves them so that
; every dword of the result holds one 4-byte output row, then writes the rows
; to %1..%8 one dword at a time with movd.
; NOTE(review): this listing appears to be a diff with its +/- markers
; stripped, so two alternative pack/interleave sequences and two alternative
; store tails appear back to back -- confirm against the real file before
; assembling.
%macro TRANSPOSE8x4B_STORE 8
; Sequence A: pack each row against itself, interleave bytes then words.
; Output rows 0-3 end up in m0 and output rows 4-7 in m6.
packuswb m0, m0
packuswb m1, m1
packuswb m2, m2
packuswb m3, m3
punpcklbw m0, m1
punpcklbw m2, m3
punpckhwd m6, m0, m2
punpcklwd m0, m2
; Sequence B: pack two rows per register (m0 with m2, m1 with m3), then two
; SBUTTERFLY steps at byte and word granularity.
; SBUTTERFLY is defined outside this listing; presumably it interleaves the
; low/high halves of its two register arguments using the third as scratch --
; verify. The stores below read this sequence's result from m0 and m1.
packuswb m0, m2
packuswb m1, m3
SBUTTERFLY bw, 0, 1, 2
SBUTTERFLY wd, 0, 1, 2
; Store the low dword, then rotate the next row into the low dword:
; 0x39 selects source dwords 1,2,3,0.
movd %1, m0
pshufd m0, m0, 0x39
; Hunk boundary: the lines between the two hunks (including the stores to %2
; and %3) are not shown in this listing.
......@@ -96,13 +90,13 @@ INIT_XMM sse2
pshufd m0, m0, 0x39
movd %4, m0
; Store tail for sequence A: rows for %5..%8 are taken from m6.
movd %5, m6
pshufd m6, m6, 0x39
movd %6, m6
pshufd m6, m6, 0x39
movd %7, m6
pshufd m6, m6, 0x39
movd %8, m6
; Store tail for sequence B: rows for %5..%8 are taken from m1.
movd %5, m1
pshufd m1, m1, 0x39
movd %6, m1
pshufd m1, m1, 0x39
movd %7, m1
pshufd m1, m1, 0x39
movd %8, m1
%endmacro
; in: 8 rows of 4 words in %4..%11
......@@ -204,40 +198,20 @@ INIT_XMM sse2
; in: 8 rows of 8 words in m0..m7
; out: 8 rows of 8 bytes in %1..%8
; Narrows the word rows to unsigned-saturated bytes, interleaves them so that
; every 64-bit half of a register holds one 8-byte output row, then writes
; the rows to %1..%8 with movq (low half) and movhps (high half).
; NOTE(review): this listing appears to be a diff with its +/- markers
; stripped, so two alternative pack/interleave sequences and two alternative
; store tails appear back to back -- confirm against the real file before
; assembling.
%macro TRANSPOSE8x8B_STORE 8
; Sequence A: pack each row against itself, then interleave bytes, words and
; dwords by hand. Uses m8..m11 as extra registers; the rows are stored from
; m0, m10, m11 and m8.
packuswb m0, m0
packuswb m1, m1
packuswb m2, m2
packuswb m3, m3
packuswb m4, m4
packuswb m5, m5
packuswb m6, m6
packuswb m7, m7
punpcklbw m0, m1
punpcklbw m2, m3
punpckhwd m8, m0, m2
punpcklwd m0, m2
punpcklbw m4, m5
punpcklbw m6, m7
punpckhwd m9, m4, m6
punpcklwd m4, m6
; Sequence B: pack two rows per register (mN with mN+4), then hand the four
; packed registers to TRANSPOSE2x4x4B with m4 as its fifth argument.
; The rows are stored from m0, m1, m2 and m3.
packuswb m0, m4
packuswb m1, m5
packuswb m2, m6
packuswb m3, m7
TRANSPOSE2x4x4B 0, 1, 2, 3, 4
; Remaining dword interleave of sequence A; the trailing numbers name the
; output rows each destination register holds.
punpckhdq m10, m0, m4; 2, 3
punpckldq m0, m4; 0, 1
punpckldq m11, m8, m9; 4, 5
punpckhdq m8, m9; 6, 7
; Rows 0 and 1 are stored from m0 in both sequences.
movq %1, m0
movhps %2, m0
; Store tail for sequence A.
movq %3, m10
movhps %4, m10
movq %5, m11
movhps %6, m11
movq %7, m8
movhps %8, m8
; Store tail for sequence B.
movq %3, m1
movhps %4, m1
movq %5, m2
movhps %6, m2
movq %7, m3
movhps %8, m3
%endmacro
; in: 8 rows of 8 words in %1..%8
......
......@@ -69,6 +69,15 @@
SWAP %2, %3
%endmacro
; Byte-granularity shuffle network over four registers.
; %1..%4: register numbers to operate on; %5: extra register number that is
; forwarded as the last argument of every SBUTTERFLY call.
; Runs three rounds of two SBUTTERFLY calls each, at byte->word (bw),
; word->dword (wd) and dword->qword (dq) granularity.
; NOTE(review): SBUTTERFLY is defined outside this listing; presumably it
; interleaves low/high halves and renames registers, so the order of the
; calls below should be treated as significant -- verify before reordering.
%macro TRANSPOSE2x4x4B 5
; round 1: pair up %1/%2 and %3/%4 at byte granularity
SBUTTERFLY bw, %1, %2, %5
SBUTTERFLY bw, %3, %4, %5
; round 2: pair up %1/%3 and %2/%4 at word granularity
SBUTTERFLY wd, %1, %3, %5
SBUTTERFLY wd, %2, %4, %5
; round 3: pair up %1/%2 and %3/%4 at dword granularity
SBUTTERFLY dq, %1, %2, %5
SBUTTERFLY dq, %3, %4, %5
%endmacro
%macro TRANSPOSE2x4x4W 5
SBUTTERFLY wd, %1, %2, %5
SBUTTERFLY wd, %3, %4, %5
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment