avcodec/h264: add avx 8-bit h264_idct_add

Haswell: - 1.11x faster (522±0.4 vs. 469±1.8 decicycles) compared with mmxext Skylake-U: - 1.21x faster (671±5.5 vs. 555±1.4 decicycles) compared with mmxext

avcodec/h264: add avx 8-bit h264_idct_add
Haswell: - 1.11x faster (522±0.4 vs. 469±1.8 decicycles) compared with mmxext Skylake-U: - 1.21x faster (671±5.5 vs. 555±1.4 decicycles) compared with mmxext
f61d454c · James Darnley · b5325c67 · f61d454c · f61d454c
Commit f61d454c authored Mar 16, 2017 by James Darnley
Show whitespace changes
Inline Side-by-side

Showing with 35 additions and 1 deletion

h264_idct.asm libavcodec/x86/h264_idct.asm +32 -1

h264dsp_init.c libavcodec/x86/h264dsp_init.c +3 -0

No files found.
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -65,7 +65,15 @@ SECTION .text

    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    mova         m6, [pw_32]
+    %if mmsize == 8
        TRANSPOSE4x4W 0, 1, 2, 3, 4
+    %else
+        punpcklwd m0, m1
+        punpcklwd m2, m3
+        SBUTTERFLY dq, 0, 2, 4
+        MOVHL m1, m0
+        MOVHL m3, m2
+    %endif
    paddw        m0, m6
    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    pxor         m7, m7
@@ -1131,3 +1139,26 @@ INIT_MMX mmx
 IDCT_DC_DEQUANT 0
 INIT_MMX sse2
 IDCT_DC_DEQUANT 7
+
+INIT_XMM avx
+
+; %unmacro STORE_DIFFx2 8 ; remove macro from x86util.asm but yasm doesn't have this yet
+%macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride
+    movd       %3, [%7]
+    movd       %4, [%7+%8]
+    psraw      %1, %6
+    psraw      %2, %6
+    punpcklbw  %3, %5
+    punpcklbw  %4, %5
+    paddw      %3, %1
+    paddw      %4, %2
+    packuswb   %3, %5
+    packuswb   %4, %5
+    movd     [%7], %3
+    movd  [%7+%8], %4
+%endmacro
+
+cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_
+    movsxdifnidn stride_q, stride_d
+    IDCT4_ADD    dst_q, block_q, stride_q
+RET
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -32,6 +32,7 @@ void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst,    \
                                                       int stride);

 IDCT_ADD_FUNC(, 8, mmx)
+IDCT_ADD_FUNC(, 8, avx)
 IDCT_ADD_FUNC(, 10, sse2)
 IDCT_ADD_FUNC(_dc, 8, mmxext)
 IDCT_ADD_FUNC(_dc, 10, mmxext)
@@ -337,6 +338,8 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
                c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma422_8_avx;
                c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_avx;
            }
+
+            c->h264_idct_add        = ff_h264_idct_add_8_avx;
        }
    } else if (bit_depth == 10) {
        if (EXTERNAL_MMXEXT(cpu_flags)) {