Commit ac096fc8 authored by James Darnley's avatar James Darnley

avcodec/h264: add avx 8-bit 4:2:0 chroma h deblock/loop filter

~1.14x faster (93 vs. 81 cycles) compared with mmxext function
parent 5c567588
......@@ -1059,6 +1059,54 @@ ff_chroma_intra_body_mmxext:
paddb m2, m6
ret
%macro LOAD_8_ROWS 8
movd m0, %1
movd m1, %2
movd m2, %3
movd m3, %4
movd m4, %5
movd m5, %6
movd m6, %7
movd m7, %8
%endmacro
%macro STORE_8_ROWS 8
movd %1, m0
movd %2, m1
movd %3, m2
movd %4, m3
movd %5, m4
movd %6, m5
movd %7, m6
movd %8, m7
%endmacro
%macro TRANSPOSE_8x4B_XMM 0
punpcklbw m0, m1
punpcklbw m2, m3
punpcklbw m4, m5
punpcklbw m6, m7
punpcklwd m0, m2
punpcklwd m4, m6
punpckhdq m2, m0, m4
punpckldq m0, m4
MOVHL m1, m0
MOVHL m3, m2
%endmacro
%macro TRANSPOSE_4x8B_XMM 0
punpcklbw m0, m1
punpcklbw m2, m3
punpckhwd m4, m0, m2
punpcklwd m0, m2
MOVHL m6, m4
MOVHL m2, m0
pshufd m1, m0, 1
pshufd m3, m2, 1
pshufd m5, m4, 1
pshufd m7, m6, 1
%endmacro
%macro CHROMA_INTER_BODY_XMM 1
LOAD_MASK alpha_d, beta_d
movd m6, [tc0_q]
......@@ -1078,6 +1126,15 @@ ff_chroma_intra_body_mmxext:
sub %1, stride_q
%endmacro
%macro CHROMA_H_START_XMM 2
movsxdifnidn stride_q, stride_d
dec alpha_d
dec beta_d
lea %2, [3*stride_q]
mov %1, pix_q
add %1, %2
%endmacro
%macro DEBLOCK_CHROMA_XMM 1
INIT_XMM %1
......@@ -1093,6 +1150,19 @@ cglobal deblock_v_chroma_8, 5, 6, 8, pix_, stride_, alpha_, beta_, tc0_
movq [pix_q], m2
RET
cglobal deblock_h_chroma_8, 5, 7, 8, 0-16, pix_, stride_, alpha_, beta_, tc0_
CHROMA_H_START_XMM r5, r6
LOAD_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
TRANSPOSE_8x4B_XMM
movq [rsp], m0
movq [rsp + 8], m3
CHROMA_INTER_BODY_XMM 1
movq m0, [rsp]
movq m3, [rsp + 8]
TRANSPOSE_4x8B_XMM
STORE_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
RET
%endmacro ; DEBLOCK_CHROMA_XMM
DEBLOCK_CHROMA_XMM avx
......
......@@ -319,6 +319,9 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
#endif
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_8_avx;
if (chroma_format_idc <= 1) {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_8_avx;
}
}
} else if (bit_depth == 10) {
if (EXTERNAL_MMXEXT(cpu_flags)) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment