Commit 8ad77b65, authored by Jason Garrett-Glaser

Update x86 H.264 deblock asm

Includes AVX versions from x264.
parent b6675279
This diff is collapsed.
...@@ -219,10 +219,10 @@ static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40] ...@@ -219,10 +219,10 @@ static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40]
} }
/* LF_FUNC(DIR, TYPE, OPT): declares the prototype of a deblocking routine whose
 * symbol name is built by token-pasting DIR, TYPE and OPT. The declared function
 * takes (pix, stride, alpha, beta, tc0) and returns void. This commit changes the
 * symbol prefix from ff_x264_deblock_ to ff_deblock_.
 * NOTE(review): the definitions are presumably in the accompanying asm - confirm. */
#define LF_FUNC(DIR, TYPE, OPT) \ #define LF_FUNC(DIR, TYPE, OPT) \
void ff_x264_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \ void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
int alpha, int beta, int8_t *tc0); int alpha, int beta, int8_t *tc0);
/* LF_IFUNC(DIR, TYPE, OPT): same naming scheme as LF_FUNC, but the declared
 * prototype has no tc0 argument: (pix, stride, alpha, beta). It is used below
 * for the *_intra routines. */
#define LF_IFUNC(DIR, TYPE, OPT) \ #define LF_IFUNC(DIR, TYPE, OPT) \
void ff_x264_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \ void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
int alpha, int beta); int alpha, int beta);
LF_FUNC (h, chroma, mmxext) LF_FUNC (h, chroma, mmxext)
...@@ -234,18 +234,18 @@ LF_FUNC (h, luma, mmxext) ...@@ -234,18 +234,18 @@ LF_FUNC (h, luma, mmxext)
LF_IFUNC(h, luma_intra, mmxext) LF_IFUNC(h, luma_intra, mmxext)
#if HAVE_YASM && ARCH_X86_32 #if HAVE_YASM && ARCH_X86_32
LF_FUNC (v8, luma, mmxext) LF_FUNC (v8, luma, mmxext)
/* Wrapper that builds the full v_luma filter out of two calls to the v8 routine:
 * one at pix+0 using tc0[0..1] and one at pix+8 using tc0[2..3].
 * Each call is skipped when BOTH of its tc0 entries are negative: the bitwise AND
 * of two signed values is negative only if both have the sign bit set, so
 * "(a & b) >= 0" means "at least one is non-negative".
 * NOTE(review): a negative tc0 presumably marks a segment that needs no
 * filtering - confirm against the asm. */
static void ff_x264_deblock_v_luma_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) static void ff_deblock_v_luma_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{ {
if((tc0[0] & tc0[1]) >= 0) if((tc0[0] & tc0[1]) >= 0)
ff_x264_deblock_v8_luma_mmxext(pix+0, stride, alpha, beta, tc0); ff_deblock_v8_luma_mmxext(pix+0, stride, alpha, beta, tc0);
if((tc0[2] & tc0[3]) >= 0) if((tc0[2] & tc0[3]) >= 0)
ff_x264_deblock_v8_luma_mmxext(pix+8, stride, alpha, beta, tc0+2); ff_deblock_v8_luma_mmxext(pix+8, stride, alpha, beta, tc0+2);
} }
LF_IFUNC(v8, luma_intra, mmxext) LF_IFUNC(v8, luma_intra, mmxext)
/* Wrapper that builds the full v_luma_intra filter out of two unconditional calls
 * to the v8 intra routine, at pix+0 and pix+8, with the same stride/alpha/beta.
 * Unlike the non-intra wrapper there is no tc0 array and no skip test. */
static void ff_x264_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta) static void ff_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta)
{ {
ff_x264_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta); ff_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta);
ff_x264_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta); ff_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta);
} }
#endif #endif
...@@ -253,6 +253,10 @@ LF_FUNC (h, luma, sse2) ...@@ -253,6 +253,10 @@ LF_FUNC (h, luma, sse2)
LF_IFUNC(h, luma_intra, sse2) LF_IFUNC(h, luma_intra, sse2)
LF_FUNC (v, luma, sse2) LF_FUNC (v, luma, sse2)
LF_IFUNC(v, luma_intra, sse2) LF_IFUNC(v, luma_intra, sse2)
LF_FUNC (h, luma, avx)
LF_IFUNC(h, luma_intra, avx)
LF_FUNC (v, luma, avx)
LF_IFUNC(v, luma_intra, avx)
/***********************************/ /***********************************/
/* weighted prediction */ /* weighted prediction */
...@@ -314,15 +318,15 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth) ...@@ -314,15 +318,15 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
c->h264_idct_add8 = ff_h264_idct_add8_mmx2; c->h264_idct_add8 = ff_h264_idct_add8_mmx2;
c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2; c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2;
c->h264_v_loop_filter_chroma= ff_x264_deblock_v_chroma_mmxext; c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_mmxext;
c->h264_h_loop_filter_chroma= ff_x264_deblock_h_chroma_mmxext; c->h264_h_loop_filter_chroma= ff_deblock_h_chroma_mmxext;
c->h264_v_loop_filter_chroma_intra= ff_x264_deblock_v_chroma_intra_mmxext; c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_mmxext;
c->h264_h_loop_filter_chroma_intra= ff_x264_deblock_h_chroma_intra_mmxext; c->h264_h_loop_filter_chroma_intra= ff_deblock_h_chroma_intra_mmxext;
#if ARCH_X86_32 #if ARCH_X86_32
c->h264_v_loop_filter_luma= ff_x264_deblock_v_luma_mmxext; c->h264_v_loop_filter_luma= ff_deblock_v_luma_mmxext;
c->h264_h_loop_filter_luma= ff_x264_deblock_h_luma_mmxext; c->h264_h_loop_filter_luma= ff_deblock_h_luma_mmxext;
c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext; c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_mmxext;
c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext; c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_mmxext;
#endif #endif
c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2; c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2; c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
...@@ -360,10 +364,10 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth) ...@@ -360,10 +364,10 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2; c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2;
#if HAVE_ALIGNED_STACK #if HAVE_ALIGNED_STACK
c->h264_v_loop_filter_luma = ff_x264_deblock_v_luma_sse2; c->h264_v_loop_filter_luma = ff_deblock_v_luma_sse2;
c->h264_h_loop_filter_luma = ff_x264_deblock_h_luma_sse2; c->h264_h_loop_filter_luma = ff_deblock_h_luma_sse2;
c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_sse2; c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_sse2;
c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_sse2; c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_sse2;
#endif #endif
c->h264_idct_add16 = ff_h264_idct_add16_sse2; c->h264_idct_add16 = ff_h264_idct_add16_sse2;
...@@ -377,6 +381,14 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth) ...@@ -377,6 +381,14 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3; c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3;
c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3; c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3;
} }
if (mm_flags&AV_CPU_FLAG_AVX) {
#if HAVE_ALIGNED_STACK
c->h264_v_loop_filter_luma = ff_deblock_v_luma_avx;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_avx;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_avx;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_avx;
#endif
}
} }
} }
#endif #endif
......
...@@ -24,16 +24,20 @@ ...@@ -24,16 +24,20 @@
;****************************************************************************** ;******************************************************************************
; SBUTTERFLY unit, a, b, tmp
;   %1 = suffix appended to punpckl/punpckh, %2/%3 = input register numbers,
;   %4 = scratch register number.
;   Low-half interleave of m%2 and m%3 is left in m%2; the high-half interleave
;   is built in m%4, and the final SWAP exchanges the names %3 and %4.
;   When avx_enabled is non-zero, the three-operand punpckh writes m%4 directly,
;   so the mova copy used on the other path is not needed.
%macro SBUTTERFLY 4 %macro SBUTTERFLY 4
%if avx_enabled == 0
mova m%4, m%2 mova m%4, m%2
punpckl%1 m%2, m%3 punpckl%1 m%2, m%3
punpckh%1 m%4, m%3 punpckh%1 m%4, m%3
%else
punpckh%1 m%4, m%2, m%3
punpckl%1 m%2, m%3
%endif
SWAP %3, %4 SWAP %3, %4
%endmacro %endmacro
; SBUTTERFLY2 unit, a, b, tmp
;   Variant of SBUTTERFLY: the low-half interleave goes to m%4 and the high-half
;   interleave to m%2, followed by a three-way SWAP of %2, %4, %3.
;   The new side of this diff uses three-operand punpckl/punpckh in place of the
;   old mova + two-operand sequence.
;   NOTE(review): the three-operand forms presumably rely on the assembler
;   macro layer to emulate them on non-AVX targets - confirm.
%macro SBUTTERFLY2 4 %macro SBUTTERFLY2 4
mova m%4, m%2 punpckl%1 m%4, m%2, m%3
punpckh%1 m%2, m%3 punpckh%1 m%2, m%2, m%3
punpckl%1 m%4, m%3
SWAP %2, %4, %3 SWAP %2, %4, %3
%endmacro %endmacro
...@@ -444,3 +448,12 @@ ...@@ -444,3 +448,12 @@
; PMINUB_MMXEXT dst, src, ignored
;   Thin wrapper that emits a single pminub %1, %2. The third argument is
;   accepted but never used.
%macro PMINUB_MMXEXT 3 ; dst, src, ignored %macro PMINUB_MMXEXT 3 ; dst, src, ignored
pminub %1, %2 pminub %1, %2
%endmacro %endmacro
; SPLATW dst, src [, idx]
;   Broadcasts word number idx of src into every word of dst.
;   idx is optional and defaults to 0.
;   The shuffle immediate is idx*0x55: 0x55 is 01010101b, so multiplying by an
;   idx in 0..3 copies that 2-bit index into all four selector fields.
%macro SPLATW 2-3 0
%if mmsize == 16
    ; 16-byte registers: pshuflw only shuffles the low four words, so the low
    ; quadword is then duplicated into the high quadword with punpcklqdq.
pshuflw %1, %2, (%3)*0x55
punpcklqdq %1, %1
%else
    ; other register sizes: a single pshufw fills all four words.
pshufw %1, %2, (%3)*0x55
%endif
%endmacro
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment