Commit decd5193, authored by Christophe Gisquet, committed by Michael Niedermayer

x86: xvid_idct: merged idct_put SSE2 versions

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
parent 8200575d
......@@ -292,13 +292,13 @@ SECTION .text
%define TAN3 xmm13
%define TAN1 xmm14
%else
%define ROW0 [r0 + 0*16]
%define ROW0 [BLOCK + 0*16]
%define REG0 xmm4
%define ROW2 [r0 + 2*16]
%define ROW2 [BLOCK + 2*16]
%define REG2 xmm4
%define ROW4 [r0 + 4*16]
%define ROW4 [BLOCK + 4*16]
%define REG4 xmm6
%define ROW6 [r0 + 6*16]
%define ROW6 [BLOCK + 6*16]
%define REG6 xmm6
%define XMMS xmm2
%define SREG2 xmm7
......@@ -369,8 +369,71 @@ SECTION .text
movdqa TAN1, [tan1]
%endmacro
; FIRST_HALF: shift and output rows 1, 2, 5 and 6 of the column pass.
;   %1 = base address of the coefficient block
;   %2 = output type: 0 = normal, 1 = put, 2 = add (same numbering as IDCT_SSE2)
; On entry the rows are held in TAN3 (row 1), xmm3 (row 2), REG0 (row 5) and
; xmm5 (row 6), as shown by the stores in the %2 == 0 branch.
%macro FIRST_HALF 2 ; %1=dct %2=type(0=normal,1=put,2=add)
; Arithmetic right shift of every 16-bit lane by 6
psraw xmm5, 6
psraw REG0, 6
psraw TAN3, 6
psraw xmm3, 6
; dct coeffs must still be written for AC prediction
%if %2 == 0
movdqa [%1+1*16], TAN3
movdqa [%1+2*16], xmm3
movdqa [%1+5*16], REG0
movdqa [%1+6*16], xmm5
%else
; Must now load args as gprs are no longer used for masks
; DEST is set to where address of dest was loaded
%if ARCH_X86_32
%xdefine DEST r2q ; BLOCK is r0, stride r1
movifnidn DEST, destm
movifnidn strideq, stridem
%else
%xdefine DEST r0q
%endif
; r3q = 3*stride; SECOND_HALF reuses it to address rows 3 and 7
; NOTE(review): strideq is used at full register width while the C prototype
; in this patch declares line_size as int - confirm the upper bits are valid
; on x86-64.
lea r3q, [3*strideq]
%if %2 == 1
; Saturate to unsigned bytes: TAN3 = row 1 (low 8 bytes) | row 2 (high 8 bytes)
packuswb TAN3, xmm3
; xmm5 = row 6 (low 8 bytes) | row 5 (high 8 bytes); stored by SECOND_HALF
packuswb xmm5, REG0
movq [DEST + strideq], TAN3
movhps [DEST + 2*strideq], TAN3
; REG0 and TAN3 are now available (and likely used in second half)
%else
; Any other type (i.e. 2 = add) only raises an assembly-time warning
%warning Unimplemented
%endif
%endif
%endmacro
; SECOND_HALF: shift and output rows 0, 3, 4 and 7 of the column pass.
;   %1 = base address of the coefficient block
;   %2 = output type: 0 = normal, 1 = put, 2 = add (same numbering as IDCT_SSE2)
;   %3 = row 0, %4 = row 7, %5 = row 3, %6 = row 4 (xmm registers), as shown
;        by the stores in the %2 == 0 branch
; For type 1 this depends on state left by FIRST_HALF: DEST, r3q (3*stride)
; and xmm5 holding the packed rows 6 (low half) and 5 (high half).
%macro SECOND_HALF 6 ; %1=dct %2=type(0=normal,1=put,2=add) 3-6: xmms
; Arithmetic right shift of every 16-bit lane by 6 (done for every type)
psraw %3, 6
psraw %4, 6
psraw %5, 6
psraw %6, 6
; dct coeffs must still be written for AC prediction
%if %2 == 0
movdqa [%1+0*16], %3
movdqa [%1+3*16], %5
movdqa [%1+4*16], %6
movdqa [%1+7*16], %4
%elif %2 == 1
; Saturate to unsigned bytes:
; %3 = row 0 (low 8 bytes) | row 3 (high), %6 = row 4 (low) | row 7 (high)
packuswb %3, %5
packuswb %6, %4
; address of dest may have been loaded
movq [DEST], %3
movhps [DEST + r3q], %3
; Advance DEST by four rows so rows 4-7 are reachable with the same offsets
lea DEST, [DEST + 4*strideq]
movq [DEST], %6
movhps [DEST + r3q], %6
; and now write remainder of first half
; (xmm5 low half -> row 6, high half -> row 5, counted from the original DEST)
movq [DEST + 2*strideq], xmm5
movhps [DEST + strideq], xmm5
%elif %2 == 2
; The add variant only raises an assembly-time warning
%warning Unimplemented
%endif
%endmacro
; IDCT pass on columns.
%macro iLLM_PASS 1 ;dct
%macro iLLM_PASS 2 ; %1=dct %2=type(normal,add,put)
movdqa xmm1, TAN3
movdqa xmm3, TAN1
pmulhw TAN3, xmm4
......@@ -407,7 +470,7 @@ SECTION .text
psubsw xmm5, REG6
MOV32 ROW0, REG0
MOV32 ROW4, REG4
MOV32 TAN1, [r0]
MOV32 TAN1, [BLOCK]
movdqa XMMS, REG0
psubsw REG0, REG4
paddsw REG4, XMMS
......@@ -423,33 +486,22 @@ SECTION .text
movdqa XMMS, REG0
psubsw REG0, xmm3
paddsw xmm3, XMMS
MOV32 [r0], TAN1
psraw xmm5, 6
psraw REG0, 6
psraw TAN3, 6
psraw xmm3, 6
movdqa [%1+1*16], TAN3
movdqa [%1+2*16], xmm3
movdqa [%1+5*16], REG0
movdqa [%1+6*16], xmm5
MOV32 [BLOCK], TAN1
FIRST_HALF %1, %2
movdqa xmm0, xmm7
movdqa xmm4, REG4
psubsw xmm7, xmm1
psubsw REG4, TAN1
paddsw xmm1, xmm0
paddsw TAN1, xmm4
psraw xmm1, 6
psraw xmm7, 6
psraw TAN1, 6
psraw REG4, 6
movdqa [%1+0*16], xmm1
movdqa [%1+3*16], TAN1
movdqa [%1+4*16], REG4
movdqa [%1+7*16], xmm7
SECOND_HALF %1, %2, xmm1, xmm7, TAN1, REG4
%endmacro
; IDCT pass on columns, assuming rows 4-7 are zero
%macro iLLM_PASS_SPARSE 1 ;dct
%macro iLLM_PASS_SPARSE 2 ; %1=dct %2=type(normal,put,add)
pmulhw TAN3, xmm4
paddsw TAN3, xmm4
movdqa xmm3, xmm6
......@@ -475,7 +527,7 @@ SECTION .text
movdqa xmm6, REG0
psubsw xmm6, SREG2
paddsw SREG2, REG0
MOV32 TAN1, [r0]
MOV32 TAN1, [BLOCK]
movdqa XMMS, REG0
psubsw REG0, xmm5
paddsw xmm5, XMMS
......@@ -485,70 +537,92 @@ SECTION .text
movdqa XMMS, REG0
psubsw REG0, xmm3
paddsw xmm3, XMMS
MOV32 [r0], TAN1
psraw xmm5, 6
psraw REG0, 6
psraw TAN3, 6
psraw xmm3, 6
movdqa [%1+1*16], TAN3
movdqa [%1+2*16], xmm3
movdqa [%1+5*16], REG0
movdqa [%1+6*16], xmm5
MOV32 [BLOCK], TAN1
FIRST_HALF %1, %2
movdqa xmm0, SREG2
movdqa xmm4, xmm6
psubsw SREG2, xmm1
psubsw xmm6, TAN1
paddsw xmm1, xmm0
paddsw TAN1, xmm4
psraw xmm1, 6
psraw SREG2, 6
psraw TAN1, 6
psraw xmm6, 6
movdqa [%1+0*16], xmm1
movdqa [%1+3*16], TAN1
movdqa [%1+4*16], xmm6
movdqa [%1+7*16], SREG2
SECOND_HALF %1, %2, xmm1, SREG2, TAN1, xmm6
%endmacro
INIT_XMM sse2
cglobal xvid_idct, 1, 5, 8+7*ARCH_X86_64, block
; IDCT_SSE2: emits one entry point of the IDCT. %1 selects which one is
; declared: 0 -> xvid_idct, 1 -> xvid_idct_put, 2 -> xvid_idct_add.
; %1 is also forwarded to iLLM_PASS / iLLM_PASS_SPARSE as the output type.
%macro IDCT_SSE2 1 ; 0=normal 1=put 2=add
; GPR0-GPR3 name the scratch registers handed to TEST_TWO_ROWS / TEST_ONE_ROW
; and tested by JZ / JNZ below. The put/add variants on x86-64 use r3-r6
; instead of r1-r4 - presumably so the dest/stride/block argument registers
; stay untouched (TODO confirm against x86inc register mapping).
%if %1 == 0 || ARCH_X86_32
%define GPR0 r1d
%define GPR1 r2d
%define GPR2 r3d
%define GPR3 r4d
%define NUM_GPRS 5
%else
%define GPR0 r3d
%define GPR1 r4d
%define GPR2 r5d
%define GPR3 r6d
%define NUM_GPRS 7
%endif
; BLOCK aliases whichever register holds the coefficient block pointer
%if %1 == 0
cglobal xvid_idct, 1, NUM_GPRS, 8+7*ARCH_X86_64, block
%xdefine BLOCK blockq
%else
%if %1 == 1
cglobal xvid_idct_put, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block
%else
cglobal xvid_idct_add, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block
%endif
%if ARCH_X86_64
%xdefine BLOCK blockq
%else
; x86-32: fetch the block pointer from its argument slot into r0
mov r0q, blockm
%xdefine BLOCK r0q
%endif
%endif
movq mm0, [pb_127]
iMTX_MULT r0 + 0*16, iTab1, PUT_EVEN, ROW0, 0*16
iMTX_MULT r0 + 1*16, iTab2, PUT_ODD, ROW1, 1*16
iMTX_MULT r0 + 2*16, iTab3, PUT_EVEN, ROW2, 2*16
iMTX_MULT BLOCK + 0*16, iTab1, PUT_EVEN, ROW0, 0*16
iMTX_MULT BLOCK + 1*16, iTab2, PUT_ODD, ROW1, 1*16
iMTX_MULT BLOCK + 2*16, iTab3, PUT_EVEN, ROW2, 2*16
TEST_TWO_ROWS r0 + 3*16, r0 + 4*16, r1d, r2d, CLEAR_ODD, ROW3, CLEAR_EVEN, ROW4 ; a, c
JZ r1d, col1
iMTX_MULT r0 + 3*16, iTab4, PUT_ODD, ROW3, 3*16
TEST_TWO_ROWS BLOCK + 3*16, BLOCK + 4*16, GPR0, GPR1, CLEAR_ODD, ROW3, CLEAR_EVEN, ROW4 ; a, c
JZ GPR0, col1
iMTX_MULT BLOCK + 3*16, iTab4, PUT_ODD, ROW3, 3*16
.col1:
TEST_TWO_ROWS r0 + 5*16, r0 + 6*16, r1d, r3d, CLEAR_ODD, ROW5, CLEAR_EVEN, ROW6 ; a, d
TEST_ONE_ROW r0 + 7*16, r4d, CLEAR_ODD, ROW7 ; esi
TEST_TWO_ROWS BLOCK + 5*16, BLOCK + 6*16, GPR0, GPR2, CLEAR_ODD, ROW5, CLEAR_EVEN, ROW6 ; a, d
TEST_ONE_ROW BLOCK + 7*16, GPR3, CLEAR_ODD, ROW7 ; esi
iLLM_HEAD
JNZ r2d, 2
JNZ r1d, 3
JNZ r3d, 4
JNZ r4d, 5
iLLM_PASS_SPARSE r0
JNZ GPR1, 2
JNZ GPR0, 3
JNZ GPR2, 4
JNZ GPR3, 5
iLLM_PASS_SPARSE BLOCK, %1
jmp .6
.2:
iMTX_MULT r0 + 4*16, iTab1, PUT_EVEN, ROW4
iMTX_MULT BLOCK + 4*16, iTab1, PUT_EVEN, ROW4
.3:
iMTX_MULT r0 + 5*16, iTab4, PUT_ODD, ROW5, 4*16
JZ r3d, col2
iMTX_MULT BLOCK + 5*16, iTab4, PUT_ODD, ROW5, 4*16
JZ GPR2, col2
.4:
iMTX_MULT r0 + 6*16, iTab3, PUT_EVEN, ROW6, 5*16
iMTX_MULT BLOCK + 6*16, iTab3, PUT_EVEN, ROW6, 5*16
.col2:
JZ r4d, col3
JZ GPR3, col3
.5:
iMTX_MULT r0 + 7*16, iTab2, PUT_ODD, ROW7, 5*16
iMTX_MULT BLOCK + 7*16, iTab2, PUT_ODD, ROW7, 5*16
.col3:
%if ARCH_X86_32
iLLM_HEAD
%endif
iLLM_PASS r0
iLLM_PASS BLOCK, %1
.6:
RET
%endmacro
INIT_XMM sse2
IDCT_SSE2 0
IDCT_SSE2 1
%if ARCH_X86_32
......
......@@ -26,11 +26,7 @@
#include "idctdsp.h"
#include "xvididct.h"
/* Two-step "put" wrapper: runs ff_xvid_idct_sse2() on block, then passes
 * block, dest and line_size to ff_put_pixels_clamped(). */
static void xvid_idct_sse2_put(uint8_t *dest, int line_size, short *block)
{
ff_xvid_idct_sse2(block);
ff_put_pixels_clamped(block, dest, line_size);
}
void ff_xvid_idct_put_sse2(uint8_t *dest, int line_size, short *block);
static void xvid_idct_sse2_add(uint8_t *dest, int line_size, short *block)
{
......@@ -91,7 +87,7 @@ av_cold void ff_xvid_idct_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
#endif
if (EXTERNAL_SSE2(cpu_flags)) {
c->idct_put = xvid_idct_sse2_put;
c->idct_put = ff_xvid_idct_put_sse2;
c->idct_add = xvid_idct_sse2_add;
c->idct = ff_xvid_idct_sse2;
c->perm_type = FF_IDCT_PERM_SSE2;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment