Commit decd5193, authored by Christophe Gisquet, committed by Michael Niedermayer

x86: xvid_idct: merged idct_put SSE2 versions

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
parent 8200575d
...@@ -292,13 +292,13 @@ SECTION .text ...@@ -292,13 +292,13 @@ SECTION .text
%define TAN3 xmm13 %define TAN3 xmm13
%define TAN1 xmm14 %define TAN1 xmm14
%else %else
%define ROW0 [r0 + 0*16] %define ROW0 [BLOCK + 0*16]
%define REG0 xmm4 %define REG0 xmm4
%define ROW2 [r0 + 2*16] %define ROW2 [BLOCK + 2*16]
%define REG2 xmm4 %define REG2 xmm4
%define ROW4 [r0 + 4*16] %define ROW4 [BLOCK + 4*16]
%define REG4 xmm6 %define REG4 xmm6
%define ROW6 [r0 + 6*16] %define ROW6 [BLOCK + 6*16]
%define REG6 xmm6 %define REG6 xmm6
%define XMMS xmm2 %define XMMS xmm2
%define SREG2 xmm7 %define SREG2 xmm7
...@@ -369,8 +369,71 @@ SECTION .text ...@@ -369,8 +369,71 @@ SECTION .text
movdqa TAN1, [tan1] movdqa TAN1, [tan1]
%endmacro %endmacro
; FIRST_HALF: finish the four column-pass results held in TAN3, xmm3, REG0
; and xmm5 (the values the normal variant stores at 16-byte slots 1, 2, 5
; and 6 of the block).
;   %1 = base address of the coefficient block
;   %2 = output type: 0 = normal (write words back to the block),
;        1 = put (write saturated bytes to dest); 2 = add is not implemented
; For %2 != 0 this macro also defines DEST and sets r3q = 3*stride; the
; put path of SECOND_HALF uses both, plus the packed xmm5 left behind here.
%macro FIRST_HALF 2 ; %1=dct %2=type(normal,put,add)
; Final arithmetic right shift by 6 on each 16-bit lane.
psraw xmm5, 6
psraw REG0, 6
psraw TAN3, 6
psraw xmm3, 6
; dct coeffs must still be written for AC prediction
%if %2 == 0
; Normal variant: store the four results to slots 1, 2, 5 and 6 of the block.
movdqa [%1+1*16], TAN3
movdqa [%1+2*16], xmm3
movdqa [%1+5*16], REG0
movdqa [%1+6*16], xmm5
%else
; Must now load args as gprs are no longer used for masks
; DEST is set to where address of dest was loaded
%if ARCH_X86_32
%xdefine DEST r2q ; BLOCK is r0, stride r1
; Load dest and stride from their argument slots (destm/stridem).
movifnidn DEST, destm
movifnidn strideq, stridem
%else
%xdefine DEST r0q
%endif
; r3q = 3*stride, used as an offset by the stores in SECOND_HALF.
lea r3q, [3*strideq]
%if %2 == 1
; Put variant: pack words to unsigned-saturated bytes.
; TAN3 = [slot 1 | slot 2], xmm5 = [slot 6 | slot 5] (low | high qword).
packuswb TAN3, xmm3
packuswb xmm5, REG0
; Low qword goes to dest line 1, high qword to dest line 2.
movq [DEST + strideq], TAN3
movhps [DEST + 2*strideq], TAN3
; xmm5 is not stored here; SECOND_HALF writes it after advancing DEST.
; REG0 and TAN3 are now available (and likely used in second half)
%else
; Any other type (i.e. 2 = add) has no implementation yet.
%warning Unimplemented
%endif
%endif
%endmacro
; SECOND_HALF: finish the remaining four column-pass results and emit them.
;   %1      = base address of the coefficient block
;   %2      = output type: 0 = normal (write words back to the block),
;             1 = put (write saturated bytes to dest); 2 = add is not
;             implemented
;   %3..%6  = xmm registers holding the results for slots 0, 7, 3 and 4
;             respectively (see the stores in the %2 == 0 branch)
; The put path expects DEST, r3q (= 3*stride) and the packed xmm5 to have been
; prepared by FIRST_HALF; it advances DEST by 4*stride.
%macro SECOND_HALF 6 ; %1=dct %2=type(normal,put,add) 3-6: xmms
; Final arithmetic right shift by 6 on each 16-bit lane.
psraw %3, 6
psraw %4, 6
psraw %5, 6
psraw %6, 6
; dct coeffs must still be written for AC prediction
%if %2 == 0
; Normal variant: store the four results to slots 0, 3, 4 and 7 of the block.
movdqa [%1+0*16], %3
movdqa [%1+3*16], %5
movdqa [%1+4*16], %6
movdqa [%1+7*16], %4
%elif %2 == 1
; Put variant: pack words to unsigned-saturated bytes.
; %3 = [slot 0 | slot 3], %6 = [slot 4 | slot 7] (low | high qword).
packuswb %3, %5
packuswb %6, %4
; address of dest may have been loaded
; Dest lines 0 and 3.
movq [DEST], %3
movhps [DEST + r3q], %3
; Advance DEST to line 4, then write lines 4 and 7.
lea DEST, [DEST + 4*strideq]
movq [DEST], %6
movhps [DEST + r3q], %6
; and now write remainder of first half
; xmm5 low qword is slot 6, high qword is slot 5 (packed in FIRST_HALF).
movq [DEST + 2*strideq], xmm5
movhps [DEST + strideq], xmm5
%elif %2 == 2
; Add variant has no implementation yet.
%warning Unimplemented
%endif
%endmacro
; IDCT pass on columns. ; IDCT pass on columns.
%macro iLLM_PASS 1 ;dct %macro iLLM_PASS 2 ; %1=dct %2=type(normal,add,put)
movdqa xmm1, TAN3 movdqa xmm1, TAN3
movdqa xmm3, TAN1 movdqa xmm3, TAN1
pmulhw TAN3, xmm4 pmulhw TAN3, xmm4
...@@ -407,7 +470,7 @@ SECTION .text ...@@ -407,7 +470,7 @@ SECTION .text
psubsw xmm5, REG6 psubsw xmm5, REG6
MOV32 ROW0, REG0 MOV32 ROW0, REG0
MOV32 ROW4, REG4 MOV32 ROW4, REG4
MOV32 TAN1, [r0] MOV32 TAN1, [BLOCK]
movdqa XMMS, REG0 movdqa XMMS, REG0
psubsw REG0, REG4 psubsw REG0, REG4
paddsw REG4, XMMS paddsw REG4, XMMS
...@@ -423,33 +486,22 @@ SECTION .text ...@@ -423,33 +486,22 @@ SECTION .text
movdqa XMMS, REG0 movdqa XMMS, REG0
psubsw REG0, xmm3 psubsw REG0, xmm3
paddsw xmm3, XMMS paddsw xmm3, XMMS
MOV32 [r0], TAN1 MOV32 [BLOCK], TAN1
psraw xmm5, 6
psraw REG0, 6 FIRST_HALF %1, %2
psraw TAN3, 6
psraw xmm3, 6
movdqa [%1+1*16], TAN3
movdqa [%1+2*16], xmm3
movdqa [%1+5*16], REG0
movdqa [%1+6*16], xmm5
movdqa xmm0, xmm7 movdqa xmm0, xmm7
movdqa xmm4, REG4 movdqa xmm4, REG4
psubsw xmm7, xmm1 psubsw xmm7, xmm1
psubsw REG4, TAN1 psubsw REG4, TAN1
paddsw xmm1, xmm0 paddsw xmm1, xmm0
paddsw TAN1, xmm4 paddsw TAN1, xmm4
psraw xmm1, 6
psraw xmm7, 6 SECOND_HALF %1, %2, xmm1, xmm7, TAN1, REG4
psraw TAN1, 6
psraw REG4, 6
movdqa [%1+0*16], xmm1
movdqa [%1+3*16], TAN1
movdqa [%1+4*16], REG4
movdqa [%1+7*16], xmm7
%endmacro %endmacro
; IDCT pass on columns, assuming rows 4-7 are zero ; IDCT pass on columns, assuming rows 4-7 are zero
%macro iLLM_PASS_SPARSE 1 ;dct %macro iLLM_PASS_SPARSE 2 ; %1=dct %2=type(normal,put,add)
pmulhw TAN3, xmm4 pmulhw TAN3, xmm4
paddsw TAN3, xmm4 paddsw TAN3, xmm4
movdqa xmm3, xmm6 movdqa xmm3, xmm6
...@@ -475,7 +527,7 @@ SECTION .text ...@@ -475,7 +527,7 @@ SECTION .text
movdqa xmm6, REG0 movdqa xmm6, REG0
psubsw xmm6, SREG2 psubsw xmm6, SREG2
paddsw SREG2, REG0 paddsw SREG2, REG0
MOV32 TAN1, [r0] MOV32 TAN1, [BLOCK]
movdqa XMMS, REG0 movdqa XMMS, REG0
psubsw REG0, xmm5 psubsw REG0, xmm5
paddsw xmm5, XMMS paddsw xmm5, XMMS
...@@ -485,70 +537,92 @@ SECTION .text ...@@ -485,70 +537,92 @@ SECTION .text
movdqa XMMS, REG0 movdqa XMMS, REG0
psubsw REG0, xmm3 psubsw REG0, xmm3
paddsw xmm3, XMMS paddsw xmm3, XMMS
MOV32 [r0], TAN1 MOV32 [BLOCK], TAN1
psraw xmm5, 6
psraw REG0, 6 FIRST_HALF %1, %2
psraw TAN3, 6
psraw xmm3, 6
movdqa [%1+1*16], TAN3
movdqa [%1+2*16], xmm3
movdqa [%1+5*16], REG0
movdqa [%1+6*16], xmm5
movdqa xmm0, SREG2 movdqa xmm0, SREG2
movdqa xmm4, xmm6 movdqa xmm4, xmm6
psubsw SREG2, xmm1 psubsw SREG2, xmm1
psubsw xmm6, TAN1 psubsw xmm6, TAN1
paddsw xmm1, xmm0 paddsw xmm1, xmm0
paddsw TAN1, xmm4 paddsw TAN1, xmm4
psraw xmm1, 6
psraw SREG2, 6 SECOND_HALF %1, %2, xmm1, SREG2, TAN1, xmm6
psraw TAN1, 6
psraw xmm6, 6
movdqa [%1+0*16], xmm1
movdqa [%1+3*16], TAN1
movdqa [%1+4*16], xmm6
movdqa [%1+7*16], SREG2
%endmacro %endmacro
INIT_XMM sse2 %macro IDCT_SSE2 1 ; 0=normal 1=put 2=add
cglobal xvid_idct, 1, 5, 8+7*ARCH_X86_64, block %if %1 == 0 || ARCH_X86_32
%define GPR0 r1d
%define GPR1 r2d
%define GPR2 r3d
%define GPR3 r4d
%define NUM_GPRS 5
%else
%define GPR0 r3d
%define GPR1 r4d
%define GPR2 r5d
%define GPR3 r6d
%define NUM_GPRS 7
%endif
%if %1 == 0
cglobal xvid_idct, 1, NUM_GPRS, 8+7*ARCH_X86_64, block
%xdefine BLOCK blockq
%else
%if %1 == 1
cglobal xvid_idct_put, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block
%else
cglobal xvid_idct_add, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block
%endif
%if ARCH_X86_64
%xdefine BLOCK blockq
%else
mov r0q, blockm
%xdefine BLOCK r0q
%endif
%endif
movq mm0, [pb_127] movq mm0, [pb_127]
iMTX_MULT r0 + 0*16, iTab1, PUT_EVEN, ROW0, 0*16 iMTX_MULT BLOCK + 0*16, iTab1, PUT_EVEN, ROW0, 0*16
iMTX_MULT r0 + 1*16, iTab2, PUT_ODD, ROW1, 1*16 iMTX_MULT BLOCK + 1*16, iTab2, PUT_ODD, ROW1, 1*16
iMTX_MULT r0 + 2*16, iTab3, PUT_EVEN, ROW2, 2*16 iMTX_MULT BLOCK + 2*16, iTab3, PUT_EVEN, ROW2, 2*16
TEST_TWO_ROWS r0 + 3*16, r0 + 4*16, r1d, r2d, CLEAR_ODD, ROW3, CLEAR_EVEN, ROW4 ; a, c TEST_TWO_ROWS BLOCK + 3*16, BLOCK + 4*16, GPR0, GPR1, CLEAR_ODD, ROW3, CLEAR_EVEN, ROW4 ; a, c
JZ r1d, col1 JZ GPR0, col1
iMTX_MULT r0 + 3*16, iTab4, PUT_ODD, ROW3, 3*16 iMTX_MULT BLOCK + 3*16, iTab4, PUT_ODD, ROW3, 3*16
.col1: .col1:
TEST_TWO_ROWS r0 + 5*16, r0 + 6*16, r1d, r3d, CLEAR_ODD, ROW5, CLEAR_EVEN, ROW6 ; a, d TEST_TWO_ROWS BLOCK + 5*16, BLOCK + 6*16, GPR0, GPR2, CLEAR_ODD, ROW5, CLEAR_EVEN, ROW6 ; a, d
TEST_ONE_ROW r0 + 7*16, r4d, CLEAR_ODD, ROW7 ; esi TEST_ONE_ROW BLOCK + 7*16, GPR3, CLEAR_ODD, ROW7 ; esi
iLLM_HEAD iLLM_HEAD
JNZ r2d, 2 JNZ GPR1, 2
JNZ r1d, 3 JNZ GPR0, 3
JNZ r3d, 4 JNZ GPR2, 4
JNZ r4d, 5 JNZ GPR3, 5
iLLM_PASS_SPARSE r0 iLLM_PASS_SPARSE BLOCK, %1
jmp .6 jmp .6
.2: .2:
iMTX_MULT r0 + 4*16, iTab1, PUT_EVEN, ROW4 iMTX_MULT BLOCK + 4*16, iTab1, PUT_EVEN, ROW4
.3: .3:
iMTX_MULT r0 + 5*16, iTab4, PUT_ODD, ROW5, 4*16 iMTX_MULT BLOCK + 5*16, iTab4, PUT_ODD, ROW5, 4*16
JZ r3d, col2 JZ GPR2, col2
.4: .4:
iMTX_MULT r0 + 6*16, iTab3, PUT_EVEN, ROW6, 5*16 iMTX_MULT BLOCK + 6*16, iTab3, PUT_EVEN, ROW6, 5*16
.col2: .col2:
JZ r4d, col3 JZ GPR3, col3
.5: .5:
iMTX_MULT r0 + 7*16, iTab2, PUT_ODD, ROW7, 5*16 iMTX_MULT BLOCK + 7*16, iTab2, PUT_ODD, ROW7, 5*16
.col3: .col3:
%if ARCH_X86_32 %if ARCH_X86_32
iLLM_HEAD iLLM_HEAD
%endif %endif
iLLM_PASS r0 iLLM_PASS BLOCK, %1
.6: .6:
RET RET
%endmacro
INIT_XMM sse2
IDCT_SSE2 0
IDCT_SSE2 1
%if ARCH_X86_32 %if ARCH_X86_32
......
...@@ -26,11 +26,7 @@ ...@@ -26,11 +26,7 @@
#include "idctdsp.h" #include "idctdsp.h"
#include "xvididct.h" #include "xvididct.h"
static void xvid_idct_sse2_put(uint8_t *dest, int line_size, short *block) void ff_xvid_idct_put_sse2(uint8_t *dest, int line_size, short *block);
{
ff_xvid_idct_sse2(block);
ff_put_pixels_clamped(block, dest, line_size);
}
static void xvid_idct_sse2_add(uint8_t *dest, int line_size, short *block) static void xvid_idct_sse2_add(uint8_t *dest, int line_size, short *block)
{ {
...@@ -91,7 +87,7 @@ av_cold void ff_xvid_idct_init_x86(IDCTDSPContext *c, AVCodecContext *avctx, ...@@ -91,7 +87,7 @@ av_cold void ff_xvid_idct_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
#endif #endif
if (EXTERNAL_SSE2(cpu_flags)) { if (EXTERNAL_SSE2(cpu_flags)) {
c->idct_put = xvid_idct_sse2_put; c->idct_put = ff_xvid_idct_put_sse2;
c->idct_add = xvid_idct_sse2_add; c->idct_add = xvid_idct_sse2_add;
c->idct = ff_xvid_idct_sse2; c->idct = ff_xvid_idct_sse2;
c->perm_type = FF_IDCT_PERM_SSE2; c->perm_type = FF_IDCT_PERM_SSE2;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment