Commit 8ad77b65 authored by Jason Garrett-Glaser's avatar Jason Garrett-Glaser

Update x86 H.264 deblock asm

Includes AVX versions from x264.
parent b6675279
This diff is collapsed.
......@@ -219,11 +219,11 @@ static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40]
}
#define LF_FUNC(DIR, TYPE, OPT) \
void ff_x264_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
int alpha, int beta, int8_t *tc0);
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
int alpha, int beta, int8_t *tc0);
#define LF_IFUNC(DIR, TYPE, OPT) \
void ff_x264_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
int alpha, int beta);
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
int alpha, int beta);
LF_FUNC (h, chroma, mmxext)
LF_IFUNC(h, chroma_intra, mmxext)
......@@ -234,18 +234,18 @@ LF_FUNC (h, luma, mmxext)
LF_IFUNC(h, luma_intra, mmxext)
#if HAVE_YASM && ARCH_X86_32
LF_FUNC (v8, luma, mmxext)
static void ff_x264_deblock_v_luma_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
static void ff_deblock_v_luma_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
if((tc0[0] & tc0[1]) >= 0)
ff_x264_deblock_v8_luma_mmxext(pix+0, stride, alpha, beta, tc0);
ff_deblock_v8_luma_mmxext(pix+0, stride, alpha, beta, tc0);
if((tc0[2] & tc0[3]) >= 0)
ff_x264_deblock_v8_luma_mmxext(pix+8, stride, alpha, beta, tc0+2);
ff_deblock_v8_luma_mmxext(pix+8, stride, alpha, beta, tc0+2);
}
LF_IFUNC(v8, luma_intra, mmxext)
static void ff_x264_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta)
static void ff_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta)
{
ff_x264_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta);
ff_x264_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta);
ff_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta);
ff_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta);
}
#endif
......@@ -253,6 +253,10 @@ LF_FUNC (h, luma, sse2)
LF_IFUNC(h, luma_intra, sse2)
LF_FUNC (v, luma, sse2)
LF_IFUNC(v, luma_intra, sse2)
LF_FUNC (h, luma, avx)
LF_IFUNC(h, luma_intra, avx)
LF_FUNC (v, luma, avx)
LF_IFUNC(v, luma_intra, avx)
/***********************************/
/* weighted prediction */
......@@ -314,15 +318,15 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
c->h264_idct_add8 = ff_h264_idct_add8_mmx2;
c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2;
c->h264_v_loop_filter_chroma= ff_x264_deblock_v_chroma_mmxext;
c->h264_h_loop_filter_chroma= ff_x264_deblock_h_chroma_mmxext;
c->h264_v_loop_filter_chroma_intra= ff_x264_deblock_v_chroma_intra_mmxext;
c->h264_h_loop_filter_chroma_intra= ff_x264_deblock_h_chroma_intra_mmxext;
c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_mmxext;
c->h264_h_loop_filter_chroma= ff_deblock_h_chroma_mmxext;
c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_mmxext;
c->h264_h_loop_filter_chroma_intra= ff_deblock_h_chroma_intra_mmxext;
#if ARCH_X86_32
c->h264_v_loop_filter_luma= ff_x264_deblock_v_luma_mmxext;
c->h264_h_loop_filter_luma= ff_x264_deblock_h_luma_mmxext;
c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext;
c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext;
c->h264_v_loop_filter_luma= ff_deblock_v_luma_mmxext;
c->h264_h_loop_filter_luma= ff_deblock_h_luma_mmxext;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_mmxext;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_mmxext;
#endif
c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
......@@ -360,10 +364,10 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2;
#if HAVE_ALIGNED_STACK
c->h264_v_loop_filter_luma = ff_x264_deblock_v_luma_sse2;
c->h264_h_loop_filter_luma = ff_x264_deblock_h_luma_sse2;
c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_sse2;
c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_sse2;
c->h264_v_loop_filter_luma = ff_deblock_v_luma_sse2;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_sse2;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_sse2;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_sse2;
#endif
c->h264_idct_add16 = ff_h264_idct_add16_sse2;
......@@ -377,6 +381,14 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3;
c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3;
}
if (mm_flags&AV_CPU_FLAG_AVX) {
#if HAVE_ALIGNED_STACK
c->h264_v_loop_filter_luma = ff_deblock_v_luma_avx;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_avx;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_avx;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_avx;
#endif
}
}
}
#endif
......
......@@ -24,16 +24,20 @@
;******************************************************************************
%macro SBUTTERFLY 4
%if avx_enabled == 0
mova m%4, m%2
punpckl%1 m%2, m%3
punpckh%1 m%4, m%3
%else
punpckh%1 m%4, m%2, m%3
punpckl%1 m%2, m%3
%endif
SWAP %3, %4
%endmacro
%macro SBUTTERFLY2 4
mova m%4, m%2
punpckh%1 m%2, m%3
punpckl%1 m%4, m%3
punpckl%1 m%4, m%2, m%3
punpckh%1 m%2, m%2, m%3
SWAP %2, %4, %3
%endmacro
......@@ -444,3 +448,12 @@
%macro PMINUB_MMXEXT 3 ; dst, src, ignored
pminub %1, %2
%endmacro
%macro SPLATW 2-3 0
%if mmsize == 16
pshuflw %1, %2, (%3)*0x55
punpcklqdq %1, %1
%else
pshufw %1, %2, (%3)*0x55
%endif
%endmacro
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment