Commit 422b2362, authored by Loren Merritt, committed by Reinhard Tartler

dct32_sse: eliminate some spills

125->104 cycles on penryn (x86_64 only)
parent 165c7c42
...@@ -20,7 +20,7 @@ ...@@ -20,7 +20,7 @@
;****************************************************************************** ;******************************************************************************
%include "x86inc.asm" %include "x86inc.asm"
%include "config.asm" %include "x86util.asm"
SECTION_RODATA 32 SECTION_RODATA 32
...@@ -37,8 +37,9 @@ ps_cos_vec: dd 0.500603, 0.505471, 0.515447, 0.531043 ...@@ -37,8 +37,9 @@ ps_cos_vec: dd 0.500603, 0.505471, 0.515447, 0.531043
dd 1.000000, 1.000000, 1.306563, 0.541196 dd 1.000000, 1.000000, 1.306563, 0.541196
dd 1.000000, 0.707107, 1.000000, -0.707107 dd 1.000000, 0.707107, 1.000000, -0.707107
dd 1.000000, 0.707107, 1.000000, -0.707107 dd 1.000000, 0.707107, 1.000000, -0.707107
dd 0.707107, 0.707107, 0.707107, 0.707107
align 32
ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000 ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
%macro BUTTERFLY_SSE 4 %macro BUTTERFLY_SSE 4
...@@ -77,6 +78,18 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000 ...@@ -77,6 +78,18 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
BUTTERFLY0 %1, %2, %3, %4, 0xb1 BUTTERFLY0 %1, %2, %3, %4, 0xb1
%endmacro %endmacro
; BUTTERFLY3V a, b, c, d, tmp   (all arguments are XMM register numbers)
; Two whole-register butterflies, one on (a, b) and one on (c, d):
;   a <- a + b      b <- (a - b) * [ps_cos_vec+192]
;   c <- c + d      d <- (d - c) * [ps_cos_vec+192]
; Note that the second difference is taken in the opposite order to the first.
; tmp is clobbered. [ps_cos_vec+192] is presumably the row of four 0.707107
; values at the end of ps_cos_vec -- confirm the offset against the full table.
%macro BUTTERFLY3V 5
; tmp = a, then a = a + b and tmp = a_old - b
movaps m%5, m%1
addps m%1, m%2
subps m%5, m%2
; x86inc SWAP renames registers at assembly time: from here on %2 names the
; difference and %5 names the register that still holds the old b (now scratch)
SWAP %2, %5
mulps m%2, [ps_cos_vec+192]
; tmp = c (overwrites the old b), then c = c + d and d = d - c_old
movaps m%5, m%3
addps m%3, m%4
subps m%4, m%5
mulps m%4, [ps_cos_vec+192]
%endmacro
%macro PASS6_AND_PERMUTE 0 %macro PASS6_AND_PERMUTE 0
mov tmpd, [outq+4] mov tmpd, [outq+4]
movss m7, [outq+72] movss m7, [outq+72]
...@@ -269,9 +282,131 @@ INIT_XMM ...@@ -269,9 +282,131 @@ INIT_XMM
%define BUTTERFLY BUTTERFLY_SSE %define BUTTERFLY BUTTERFLY_SSE
%define BUTTERFLY0 BUTTERFLY0_SSE %define BUTTERFLY0 BUTTERFLY0_SSE
%ifdef ARCH_X86_64
; x86_64 branch: SPILL n, slot and UNSPILL n, slot both expand to SWAP n, slot,
; i.e. an x86inc register rename between m<n> and m<slot> rather than a memory
; store/load. The slot numbers used by the callers are 8..15.
%define SPILL SWAP
%define UNSPILL SWAP
; PASS5 (x86_64 variant), no arguments.
; Gathers the eight working vectors into m8-m15 purely by register renaming,
; then processes them as two groups of four (m8-m11 and m12-m15): each group is
; transposed, run through BUTTERFLY3V and followed by extra additions.
; m0 is used as the scratch register for both groups.
%macro PASS5 0
nop ; FIXME code alignment
; Rename the values still held under low register names (m5, m4, m6, m7, m0)
; into the m8-m15 range; SWAP emits no instructions.
SWAP 5, 8
SWAP 4, 12
SWAP 6, 14
SWAP 7, 13
SWAP 0, 15
; PERMUTE is not defined in this view; presumably the x86inc macro that renames
; registers according to the given pairs -- confirm its argument order there.
PERMUTE 9,10, 10,12, 11,14, 12,9, 13,11, 14,13
; First group: m8-m11
TRANSPOSE4x4PS 8, 9, 10, 11, 0
BUTTERFLY3V 8, 9, 10, 11, 0
addps m10, m11
; Second group: m12-m15, followed by the additions that combine its outputs
TRANSPOSE4x4PS 12, 13, 14, 15, 0
BUTTERFLY3V 12, 13, 14, 15, 0
addps m14, m15
addps m12, m14
addps m14, m13
addps m13, m15
%endmacro
; PASS6 (x86_64 variant), no arguments.
; Performs the final scalar additions on the eight vectors held in m8-m15 and
; writes the results to the 32-float buffer at outq. Uses all of m0-m15.
; Stores are made at byte offsets 0x00, 0x10, ..., 0x70 first, then 0x08, 0x18,
; ..., 0x78, then 0x04, 0x0c, ..., 0x74 in the final loop.
%macro PASS6 0
SWAP 9, 12
SWAP 11, 14
; For k = 0..6: store element 0 of m(8+k) at out+0x10*k and copy element 1 of
; m(8+k) into the low lane of m(k). pshuflw with 0xe moves source words 2-3
; (dword 1) into dword 0 and copies the upper 64 bits through unchanged.
movss [outq+0x00], m8
pshuflw m0, m8, 0xe
movss [outq+0x10], m9
pshuflw m1, m9, 0xe
movss [outq+0x20], m10
pshuflw m2, m10, 0xe
movss [outq+0x30], m11
pshuflw m3, m11, 0xe
movss [outq+0x40], m12
pshuflw m4, m12, 0xe
movss [outq+0x50], m13
pshuflw m5, m13, 0xe
movss [outq+0x60], m14
pshuflw m6, m14, 0xe
; m15 is stored in full (16 bytes, out+0x70..0x7f). Offset 0x78 is rewritten by
; the movss from m7 below and 0x74 by the last iteration of the final loop;
; 0x7c is not written again in this macro, so this store supplies its value.
movaps [outq+0x70], m15
pshuflw m7, m15, 0xe
; out+0x08+0x10*k = elem1[k] + elem1[k+1] for k = 0..6, out+0x78 = elem1[7].
; Each addss reads the next register before that register is itself updated.
addss m0, m1
addss m1, m2
movss [outq+0x08], m0
addss m2, m3
movss [outq+0x18], m1
addss m3, m4
movss [outq+0x28], m2
addss m4, m5
movss [outq+0x38], m3
addss m5, m6
movss [outq+0x48], m4
addss m6, m7
movss [outq+0x58], m5
movss [outq+0x68], m6
movss [outq+0x78], m7
; PERMUTE is not defined in this view; presumably x86inc register renaming by
; pairs -- confirm there before relying on which value each name holds below.
PERMUTE 1,8, 3,9, 5,10, 7,11, 9,12, 11,13, 13,14, 8,1, 10,3, 12,5, 14,7
; Split the upper half of the register named m1: the low 64 bits of m0 receive
; elements 2-3 of m1, then element 3 of m1 is moved into its own low lane
; (pshufd imm 3 selects source element 3 for lane 0 and element 0 elsewhere).
movhlps m0, m1
pshufd m1, m1, 3
; Multi-argument SWAP shifts the even and the odd register names along so that
; m0/m1 refer to the next pair on the following step.
SWAP 0, 2, 4, 6, 8, 10, 12, 14
SWAP 1, 3, 5, 7, 9, 11, 13, 15
; Same split for the remaining seven pairs, additionally accumulating the
; extracted element into the register currently named m15.
%rep 7
movhlps m0, m1
pshufd m1, m1, 3
addss m15, m1
SWAP 0, 2, 4, 6, 8, 10, 12, 14
SWAP 1, 3, 5, 7, 9, 11, 13, 15
%endrep
; Final loop, 15 iterations with i = 4, 12, ..., 0x74: add the low lanes of the
; current m0/m1 pair, store the sum at out+i, then shift all sixteen register
; names so the next iteration works on the next adjacent pair.
%assign i 4
%rep 15
addss m0, m1
movss [outq+i], m0
SWAP 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
%assign i i+8
%endrep
%endmacro
%else ; ARCH_X86_32
; SPILL reg, slot (x86_32 variant): store m<reg> to the output buffer.
; Slots are numbered 8..15 and map to byte offsets 0, 16, ..., 112 from outq
; (offset = (slot-8)*16). movaps is used, so outq must be 16-byte aligned.
%macro SPILL 2 ; xmm#, mempos
movaps [outq+(%2-8)*16], m%1
%endmacro
; UNSPILL reg, slot (x86_32 variant): reload m<reg> from the output buffer.
; Uses the same slot numbering as SPILL: slot 8..15 -> outq + (slot-8)*16.
%macro UNSPILL 2
movaps m%1, [outq+(%2-8)*16]
%endmacro
; In the x86_32 branch PASS6 is an alias for the PASS6_AND_PERMUTE macro.
%define PASS6 PASS6_AND_PERMUTE
; PASS5 (x86_32 variant), no arguments.
; Applies BUTTERFLY3 to eight vectors in turn, using m2 (coefficients loaded
; from ps_cos_vec+160) and m3 as shared operands. Values that do not fit in
; m0-m7 are reloaded from and written back to the out buffer through
; UNSPILL/SPILL, where slot s lives at outq+(s-8)*16.
; BUTTERFLY3 is defined outside this view; its fourth argument is presumably a
; scratch register -- confirm against its definition.
%macro PASS5 0
movaps m2, [ps_cos_vec+160]
; shufps 0xcc selects elements 0,3,0,3 of m3; what m3 holds on entry is set by
; the code preceding PASS5 and is not visible in this macro
shufps m3, m3, 0xcc
BUTTERFLY3 m5, m3, m2, m1
SPILL 5, 8
UNSPILL 1, 9
BUTTERFLY3 m1, m3, m2, m5
SPILL 1, 14
BUTTERFLY3 m4, m3, m2, m5
SPILL 4, 12
BUTTERFLY3 m7, m3, m2, m5
SPILL 7, 13
UNSPILL 5, 10
BUTTERFLY3 m5, m3, m2, m7
SPILL 5, 10
UNSPILL 4, 11
BUTTERFLY3 m4, m3, m2, m7
SPILL 4, 11
BUTTERFLY3 m6, m3, m2, m7
SPILL 6, 9
BUTTERFLY3 m0, m3, m2, m7
SPILL 0, 15
%endmacro
%endif
INIT_XMM INIT_XMM
; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in) ; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
cglobal dct32_float_sse, 2,3,8, out, in, tmp cglobal dct32_float_sse, 2,3,16, out, in, tmp
; pass 1 ; pass 1
movaps m0, [inq+0] movaps m0, [inq+0]
...@@ -287,8 +422,8 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp ...@@ -287,8 +422,8 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
; pass 2 ; pass 2
movaps m2, [ps_cos_vec+64] movaps m2, [ps_cos_vec+64]
BUTTERFLY m1, m4, m2, m3 BUTTERFLY m1, m4, m2, m3
movaps [outq+48], m1 SPILL 1, 11
movaps [outq+ 0], m4 SPILL 4, 8
; pass 1 ; pass 1
movaps m1, [inq+16] movaps m1, [inq+16]
...@@ -313,17 +448,17 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp ...@@ -313,17 +448,17 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
movaps m2, [ps_cos_vec+96] movaps m2, [ps_cos_vec+96]
shufps m1, m1, 0x1b shufps m1, m1, 0x1b
BUTTERFLY m0, m1, m2, m3 BUTTERFLY m0, m1, m2, m3
movaps [outq+112], m0 SPILL 0, 15
movaps [outq+ 96], m1 SPILL 1, 14
movaps m0, [outq+0] UNSPILL 0, 8
shufps m5, m5, 0x1b shufps m5, m5, 0x1b
BUTTERFLY m0, m5, m2, m3 BUTTERFLY m0, m5, m2, m3
movaps m1, [outq+48] UNSPILL 1, 11
shufps m6, m6, 0x1b shufps m6, m6, 0x1b
BUTTERFLY m1, m6, m2, m3 BUTTERFLY m1, m6, m2, m3
movaps [outq+48], m1 SPILL 1, 11
shufps m4, m4, 0x1b shufps m4, m4, 0x1b
BUTTERFLY m7, m4, m2, m3 BUTTERFLY m7, m4, m2, m3
...@@ -335,57 +470,25 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp ...@@ -335,57 +470,25 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp
BUTTERFLY2 m5, m3, m2, m1 BUTTERFLY2 m5, m3, m2, m1
BUTTERFLY2 m0, m3, m2, m1 BUTTERFLY2 m0, m3, m2, m1
movaps [outq+16], m0 SPILL 0, 9
BUTTERFLY2 m6, m3, m2, m1 BUTTERFLY2 m6, m3, m2, m1
movaps [outq+32], m6 SPILL 6, 10
movaps m0, [outq+48] UNSPILL 0, 11
BUTTERFLY2 m0, m3, m2, m1 BUTTERFLY2 m0, m3, m2, m1
movaps [outq+48], m0 SPILL 0, 11
BUTTERFLY2 m4, m3, m2, m1 BUTTERFLY2 m4, m3, m2, m1
BUTTERFLY2 m7, m3, m2, m1 BUTTERFLY2 m7, m3, m2, m1
movaps m6, [outq+96] UNSPILL 6, 14
BUTTERFLY2 m6, m3, m2, m1 BUTTERFLY2 m6, m3, m2, m1
movaps m0, [outq+112] UNSPILL 0, 15
BUTTERFLY2 m0, m3, m2, m1 BUTTERFLY2 m0, m3, m2, m1
; pass 5 PASS5
movaps m2, [ps_cos_vec+160] PASS6
shufps m3, m3, 0xcc
BUTTERFLY3 m5, m3, m2, m1
movaps [outq+0], m5
movaps m1, [outq+16]
BUTTERFLY3 m1, m3, m2, m5
movaps [outq+96], m1
BUTTERFLY3 m4, m3, m2, m5
movaps [outq+64], m4
BUTTERFLY3 m7, m3, m2, m5
movaps [outq+80], m7
movaps m5, [outq+32]
BUTTERFLY3 m5, m3, m2, m7
movaps [outq+32], m5
movaps m4, [outq+48]
BUTTERFLY3 m4, m3, m2, m7
movaps [outq+48], m4
BUTTERFLY3 m6, m3, m2, m7
movaps [outq+16], m6
BUTTERFLY3 m0, m3, m2, m7
movaps [outq+112], m0
; pass 6, no SIMD...
PASS6_AND_PERMUTE
RET RET
...@@ -95,13 +95,6 @@ FLOAT_TO_INT16_INTERLEAVE6 3dn2 ...@@ -95,13 +95,6 @@ FLOAT_TO_INT16_INTERLEAVE6 3dn2
; void ff_float_interleave6(float *dst, const float **src, unsigned int len); ; void ff_float_interleave6(float *dst, const float **src, unsigned int len);
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; BUTTERFLYPS a, b, tmp   (register numbers)
; Interleaves the floats of a and b:
;   a <- { a0, b0, a1, b1 }      b <- { a2, b2, a3, b3 }
; tmp is clobbered; after the SWAP it names the register holding the old b.
%macro BUTTERFLYPS 3
; keep a copy of a, since unpcklps overwrites it
movaps m%3, m%1
unpcklps m%1, m%2
unpckhps m%3, m%2
; rename so that the high interleave is reachable as %2
SWAP %2, %3
%endmacro
%macro FLOAT_INTERLEAVE6 2 %macro FLOAT_INTERLEAVE6 2
cglobal float_interleave6_%1, 2,7,%2, dst, src, src1, src2, src3, src4, src5 cglobal float_interleave6_%1, 2,7,%2, dst, src, src1, src2, src3, src4, src5
%ifdef ARCH_X86_64 %ifdef ARCH_X86_64
...@@ -130,9 +123,9 @@ cglobal float_interleave6_%1, 2,7,%2, dst, src, src1, src2, src3, src4, src5 ...@@ -130,9 +123,9 @@ cglobal float_interleave6_%1, 2,7,%2, dst, src, src1, src2, src3, src4, src5
movaps m4, [srcq+src4q] movaps m4, [srcq+src4q]
movaps m5, [srcq+src5q] movaps m5, [srcq+src5q]
BUTTERFLYPS 0, 1, 6 SBUTTERFLYPS 0, 1, 6
BUTTERFLYPS 2, 3, 6 SBUTTERFLYPS 2, 3, 6
BUTTERFLYPS 4, 5, 6 SBUTTERFLYPS 4, 5, 6
movaps m6, m4 movaps m6, m4
shufps m4, m0, 0xe4 shufps m4, m0, 0xe4
......
...@@ -41,6 +41,13 @@ ...@@ -41,6 +41,13 @@
SWAP %2, %4, %3 SWAP %2, %4, %3
%endmacro %endmacro
; SBUTTERFLYPS a, b, tmp   (register numbers)
; Single-precision interleave of two registers:
;   a <- { a0, b0, a1, b1 }      b <- { a2, b2, a3, b3 }
; tmp is clobbered; after the SWAP it names the register holding the old b.
%macro SBUTTERFLYPS 3
; keep a copy of a, since unpcklps overwrites it
movaps m%3, m%1
unpcklps m%1, m%2
unpckhps m%3, m%2
; rename so that the high interleave is reachable as %2
SWAP %2, %3
%endmacro
%macro TRANSPOSE4x4B 5 %macro TRANSPOSE4x4B 5
SBUTTERFLY bw, %1, %2, %5 SBUTTERFLY bw, %1, %2, %5
SBUTTERFLY bw, %3, %4, %5 SBUTTERFLY bw, %3, %4, %5
...@@ -74,6 +81,19 @@ ...@@ -74,6 +81,19 @@
SWAP %2, %3 SWAP %2, %3
%endmacro %endmacro
; identical behavior to TRANSPOSE4x4D, but using SSE1 float ops
; TRANSPOSE4x4PS a, b, c, d, tmp   (register numbers)
; Treats a..d as the rows of a 4x4 float matrix and transposes it in place:
;   a <- { a0, b0, c0, d0 }   b <- { a1, b1, c1, d1 }
;   c <- { a2, b2, c2, d2 }   d <- { a3, b3, c3, d3 }
; tmp is clobbered.
%macro TRANSPOSE4x4PS 5
; After the two interleaves:
;   %1 = { a0, b0, a1, b1 }   %2 = { a2, b2, a3, b3 }
;   %3 = { c0, d0, c1, d1 }   %4 = { c2, d2, c3, d3 }
SBUTTERFLYPS %1, %2, %5
SBUTTERFLYPS %3, %4, %5
; Combine 64-bit halves: %1 = { a0, b0, c0, d0 }, %3 = { a1, b1, c1, d1 }
movaps m%5, m%1
movlhps m%1, m%3
movhlps m%3, m%5
; Likewise: %2 = { a2, b2, c2, d2 }, %4 = { a3, b3, c3, d3 }
movaps m%5, m%2
movlhps m%2, m%4
movhlps m%4, m%5
; Rename so the results are in row order under %1, %2, %3, %4
SWAP %2, %3
%endmacro
%macro TRANSPOSE8x8W 9-11 %macro TRANSPOSE8x8W 9-11
%ifdef ARCH_X86_64 %ifdef ARCH_X86_64
SBUTTERFLY wd, %1, %2, %9 SBUTTERFLY wd, %1, %2, %9
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment