avcodec/hevc: new idct + asm

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>

avcodec/hevc: new idct + asm
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
92cccb7b · plepere · Michael Niedermayer · fa0d0fb4 · 92cccb7b · 92cccb7b
Commit 92cccb7b authored Jun 13, 2014 by plepere Committed by Michael Niedermayer Jun 17, 2014
8 changed files
--- a/libavcodec/hevc_cabac.c
+++ b/libavcodec/hevc_cabac.c
@@ -1388,8 +1388,21 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
            s->hevcdsp.transform_skip(dst, coeffs, stride);
        else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2)
            s->hevcdsp.transform_4x4_luma_add(dst, coeffs, stride);
-        else
-            s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride);
+        else {
+            int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
+            if (max_xy == 0)
+                s->hevcdsp.transform_dc_add[log2_trafo_size-2](dst, coeffs, stride);
+            else {
+                int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4;
+                if (max_xy < 4)
+                    col_limit = FFMIN(4, col_limit);
+                else if (max_xy < 8)
+                    col_limit = FFMIN(8, col_limit);
+                else if (max_xy < 12)
+                    col_limit = FFMIN(24, col_limit);
+                s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride, col_limit);
+            }
+        }
    }
 }


--- a/libavcodec/hevcdsp.c
+++ b/libavcodec/hevcdsp.c
@@ -202,6 +202,11 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
    hevcdsp->transform_add[2]       = FUNC(transform_16x16_add, depth);     \
    hevcdsp->transform_add[3]       = FUNC(transform_32x32_add, depth);     \
                                                                            \
+    hevcdsp->transform_dc_add[0]    = FUNC(transform_4x4_dc_add, depth);    \
+    hevcdsp->transform_dc_add[1]    = FUNC(transform_8x8_dc_add, depth);    \
+    hevcdsp->transform_dc_add[2]    = FUNC(transform_16x16_dc_add, depth);  \
+    hevcdsp->transform_dc_add[3]    = FUNC(transform_32x32_dc_add, depth);  \
+                                                                            \
    hevcdsp->sao_band_filter[0] = FUNC(sao_band_filter_0, depth);           \
    hevcdsp->sao_band_filter[1] = FUNC(sao_band_filter_1, depth);           \
    hevcdsp->sao_band_filter[2] = FUNC(sao_band_filter_2, depth);           \

--- a/libavcodec/hevcdsp.h
+++ b/libavcodec/hevcdsp.h
@@ -50,7 +50,9 @@ typedef struct HEVCDSPContext {
    void (*transform_skip)(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
    void (*transform_4x4_luma_add)(uint8_t *dst, int16_t *coeffs,
                                   ptrdiff_t stride);
-    void (*transform_add[4])(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+    void (*transform_add[4])(uint8_t *dst, int16_t *coeffs, ptrdiff_t _stride, int col_limit);
+
+    void (*transform_dc_add[4])(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);

    void (*sao_band_filter[4])(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                               struct SAOParams *sao, int *borders,

--- a/libavcodec/hevcdsp_template.c
+++ b/libavcodec/hevcdsp_template.c
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -92,7 +92,8 @@ YASM-OBJS-$(CONFIG_H264QPEL)           += x86/h264_qpel_8bit.o          \
                                          x86/fpel.o                    \
                                          x86/qpel.o
 YASM-OBJS-$(CONFIG_HEVC_DECODER)       += x86/hevc_mc.o                 \
-                                          x86/hevc_deblock.o
+                                          x86/hevc_deblock.o            \
+                                          x86/hevc_idct.o
 YASM-OBJS-$(CONFIG_HPELDSP)            += x86/fpel.o                    \
                                          x86/hpeldsp.o
 YASM-OBJS-$(CONFIG_HUFFYUVDSP)         += x86/huffyuvdsp.o

--- a/libavcodec/x86/hevc_idct.asm
+++ b/libavcodec/x86/hevc_idct.asm
+; /*
+; * Provide SSE & MMX idct functions for HEVC decoding
+; * Copyright (c) 2014 Pierre-Edouard LEPERE
+; *
+; * This file is part of FFmpeg.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+; */
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+max_pixels_10:          times 8  dw ((1 << 10)-1)
+dc_add_10:              times 4 dd ((1 << 14-10) + 1)
+
+
+SECTION .text
+
+;the idct_dc_add macros and functions were largely inspired by x264 project's code in the h264_idct.asm file
+
+; DC_ADD_INIT dc_reg, stride_reg
+;   in:  dc_reg     = DC coefficient, already sign-extended by the caller (movsx)
+;        stride_reg = line stride in bytes
+;   out: m0 = max( dc', 0) and m1 = max(-dc', 0), saturated to unsigned bytes and
+;             replicated in every lane, where dc' = (dc + 65) >> 7
+;        dc_reg = 3 * stride
+; The rounding add and the shift operate on the 32-bit register: a 16-bit add
+; wraps for coefficients above 32767-65 and would flip the sign of the offset.
+%macro DC_ADD_INIT 2
+    add              %1d, ((1 << 14-8) + 1)
+    sar              %1d, (15-8)
+    movd              m0, %1d
+    lea               %1, [%2*3]
+    SPLATW            m0, m0, 0
+    pxor              m1, m1
+    psubw             m1, m0
+    packuswb          m0, m0
+    packuswb          m1, m1
+%endmacro
+
+%macro DC_ADD_OP 4 ; args: load/store instruction, base pointer, stride, 3*stride; uses m0/m1 set up by DC_ADD_INIT
+    %1                m2, [%2     ] ; load four consecutive rows
+    %1                m3, [%2+%3  ]
+    %1                m4, [%2+%3*2]
+    %1                m5, [%2+%4  ]
+    paddusb           m2, m0 ; add m0 to every byte, saturating at 255
+    paddusb           m3, m0
+    paddusb           m4, m0
+    paddusb           m5, m0
+    psubusb           m2, m1 ; subtract m1 from every byte, saturating at 0
+    psubusb           m3, m1
+    psubusb           m4, m1
+    psubusb           m5, m1
+    %1         [%2     ], m2 ; store the four rows back in place
+    %1         [%2+%3  ], m3
+    %1         [%2+%3*2], m4
+    %1         [%2+%4  ], m5
+%endmacro
+
+INIT_MMX mmxext
+; void ff_hevc_idct4_dc_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+%if ARCH_X86_64
+cglobal hevc_idct4_dc_add_8, 3, 4, 0 ; 4x4 DC-only add, 8-bit, x86-64 build: r0 = dst, r1 = coeffs, r2 = stride
+    movsx             r3, word [r1] ; r3 = coeffs[0], sign-extended
+    DC_ADD_INIT       r3, r2 ; m0/m1 = positive/negative part of the rounded DC as packed bytes, r3 = 3*stride
+    DC_ADD_OP       movh, r0, r2, r3 ; update 4 rows; movh is a 4-byte access under INIT_MMX
+    RET
+
+; void ff_hevc_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_idct8_dc_add_8, 3, 4, 0 ; 8x8 DC-only add, 8-bit, x86-64 build: r0 = dst, r1 = coeffs, r2 = stride
+    movsx             r3, word [r1] ; r3 = coeffs[0], sign-extended
+    DC_ADD_INIT       r3, r2 ; m0/m1 = positive/negative part of the rounded DC as packed bytes, r3 = 3*stride
+    DC_ADD_OP       mova, r0, r2, r3 ; rows 0-3; mova is an 8-byte access under INIT_MMX
+    lea               r0, [r0+r2*4] ; advance dst by four rows
+    DC_ADD_OP       mova, r0, r2, r3 ; rows 4-7
+    RET
+%else
+; void ff_hevc_idct4_dc_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_idct4_dc_add_8, 2, 3, 0 ; 4x4 DC-only add, 8-bit, x86-32 build: only dst (r0) and coeffs (r1) are loaded
+    movsx             r2, word [r1] ; r2 = coeffs[0], sign-extended
+    mov               r1, r2m ; r1 = stride, read from the third argument's memory slot
+    DC_ADD_INIT       r2, r1 ; m0/m1 = positive/negative part of the rounded DC as packed bytes, r2 = 3*stride
+    DC_ADD_OP       movh, r0, r1, r2 ; update 4 rows; movh is a 4-byte access under INIT_MMX
+    RET
+
+; void ff_hevc_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_idct8_dc_add_8, 2, 3, 0 ; 8x8 DC-only add, 8-bit, x86-32 build: only dst (r0) and coeffs (r1) are loaded
+    movsx             r2, word [r1] ; r2 = coeffs[0], sign-extended
+    mov               r1, r2m ; r1 = stride, read from the third argument's memory slot
+    DC_ADD_INIT       r2, r1 ; m0/m1 = positive/negative part of the rounded DC as packed bytes, r2 = 3*stride
+    DC_ADD_OP       mova, r0, r1, r2 ; rows 0-3; mova is an 8-byte access under INIT_MMX
+    lea               r0, [r0+r1*4] ; advance dst by four rows
+    DC_ADD_OP       mova, r0, r1, r2 ; rows 4-7
+    RET
+%endif
+
+
+INIT_XMM sse2
+; void ff_hevc_idct16_dc_add_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_idct16_dc_add_8, 3, 4, 0 ; 16x16 DC-only add, 8-bit: r0 = dst, r1 = coeffs, r2 = stride
+    movsx             r3, word [r1] ; r3 = coeffs[0], sign-extended
+    DC_ADD_INIT       r3, r2 ; m0/m1 = positive/negative part of the rounded DC as packed bytes, r3 = 3*stride
+    DC_ADD_OP       mova, r0, r2, r3 ; rows 0-3; mova is an aligned 16-byte access under INIT_XMM
+    lea               r0, [r0+r2*4] ; advance dst by four rows
+    DC_ADD_OP       mova, r0, r2, r3 ; rows 4-7
+    lea               r0, [r0+r2*4]
+    DC_ADD_OP       mova, r0, r2, r3 ; rows 8-11
+    lea               r0, [r0+r2*4]
+    DC_ADD_OP       mova, r0, r2, r3 ; rows 12-15
+    RET
+
+;-----------------------------------------------------------------------------
+; void ff_hevc_idct4_dc_add_10(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+;-----------------------------------------------------------------------------
+%macro IDCT_DC_ADD_OP_10 3 ; args: base pointer, stride, 3*stride; expects m0 = DC in every word, m6 = upper clip bound
+    pxor              m5, m5 ; m5 = 0, the lower clip bound
+%if avx_enabled
+    paddw             m1, m0, [%1+0   ] ; three-operand form: row + DC without a separate load
+    paddw             m2, m0, [%1+%2  ]
+    paddw             m3, m0, [%1+%2*2]
+    paddw             m4, m0, [%1+%3  ]
+%else
+    mova              m1, [%1+0   ] ; load four rows of 16-bit samples
+    mova              m2, [%1+%2  ]
+    mova              m3, [%1+%2*2]
+    mova              m4, [%1+%3  ]
+    paddw             m1, m0 ; add the DC to every sample
+    paddw             m2, m0
+    paddw             m3, m0
+    paddw             m4, m0
+%endif
+    CLIPW             m1, m5, m6 ; clamp every word to [m5, m6]
+    CLIPW             m2, m5, m6
+    CLIPW             m3, m5, m6
+    CLIPW             m4, m5, m6
+    mova       [%1+0   ], m1 ; store the four rows back in place
+    mova       [%1+%2  ], m2
+    mova       [%1+%2*2], m3
+    mova       [%1+%3  ], m4
+%endmacro
+
+INIT_MMX mmxext
+; void ff_hevc_idct4_dc_add_10_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+; 4x4 DC-only add for 10-bit samples: dc' = (coeffs[0] + 17) >> 5 is added to
+; every sample and the result is clamped to [0, 1023].
+; The DC is sign-extended and rounded in 32 bits; a 16-bit add wraps for
+; coefficients above 32767-17 and would flip the sign of the offset.
+cglobal hevc_idct4_dc_add_10,3,3
+    movsx            r1d, word [r1]
+    add              r1d, ((1 << 4) + 1)
+    sar              r1d, 5
+    movd              m0, r1d
+    lea               r1, [r2*3]
+    SPLATW            m0, m0, 0
+    mova              m6, [max_pixels_10]
+    IDCT_DC_ADD_OP_10 r0, r2, r1
+    RET
+
+;-----------------------------------------------------------------------------
+; void ff_hevc_idct8_dc_add_10(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+;
+; 8x8 DC-only add for 10-bit samples: dc' = (coeffs[0] + 17) >> 5 is added to
+; every sample and the result is clamped to [0, 1023].
+; The DC is sign-extended and rounded in 32 bits; a 16-bit add wraps for
+; coefficients above 32767-17 and would flip the sign of the offset.
+;-----------------------------------------------------------------------------
+%macro IDCT8_DC_ADD 0
+cglobal hevc_idct8_dc_add_10,3,4,7
+    movsx            r1d, word [r1]
+    add              r1d, ((1 << 4) + 1)
+    sar              r1d, 5
+    movd              m0, r1d
+    lea               r1, [r2*3]
+    SPLATW            m0, m0, 0
+    mova              m6, [max_pixels_10]
+    IDCT_DC_ADD_OP_10 r0, r2, r1
+    lea               r0, [r0+r2*4]
+    IDCT_DC_ADD_OP_10 r0, r2, r1
+    RET
+%endmacro
+
+INIT_XMM sse2
+IDCT8_DC_ADD ; emits the SSE2 build of hevc_idct8_dc_add_10
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx
+IDCT8_DC_ADD ; emits the AVX build, which takes the three-operand paddw path in IDCT_DC_ADD_OP_10
+%endif
--- a/libavcodec/x86/hevcdsp.h
+++ b/libavcodec/x86/hevcdsp.h
@@ -28,6 +28,10 @@
 #include <stddef.h>
 #include <stdint.h>

+
+/* Declare the x86 DC-only IDCT+add routine ff_hevc_idct<size>_dc_add_<bitd>_<opt>(). */
+#define idct_dc_proto(size, bitd, opt)                                          \
+    void ff_hevc_idct ## size ## _dc_add_ ## bitd ## _ ## opt(uint8_t *dst,     \
+                                                              int16_t *coeffs,  \
+                                                              ptrdiff_t stride)
+
 #define PEL_LINK(dst, idx1, idx2, idx3, name, D, opt) \
 dst[idx1][idx2][idx3] = ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt; \
 dst ## _bi[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt; \
@@ -119,5 +123,26 @@ QPEL_PROTOTYPES(qpel_hv, 10, sse4);
 WEIGHTING_PROTOTYPES(8, sse4);
 WEIGHTING_PROTOTYPES(10, sse4);

+///////////////////////////////////////////////////////////////////////////////
+// IDCT
+///////////////////////////////////////////////////////////////////////////////
+
+
+idct_dc_proto(4, 8,mmxext); /* 8-bit variants */
+idct_dc_proto(8, 8,mmxext);
+idct_dc_proto(16,8,  sse2);
+idct_dc_proto(32,8,  sse2); /* C wrapper in hevcdsp_init.c: four 16x16 calls */
+
+
+idct_dc_proto(4, 10,mmxext); /* 10-bit variants */
+idct_dc_proto(8, 10,  sse2);
+idct_dc_proto(16,10,  sse2); /* C wrapper in hevcdsp_init.c: four 8x8 calls */
+idct_dc_proto(32,10,  sse2); /* C wrapper in hevcdsp_init.c: four 16x16 calls */
+idct_dc_proto(8, 10,   avx);
+idct_dc_proto(16,10,   avx); /* C wrapper in hevcdsp_init.c: four 8x8 calls */
+idct_dc_proto(32,10,   avx); /* C wrapper in hevcdsp_init.c: four 16x16 calls */
+
+
+

 #endif // AVCODEC_X86_HEVCDSP_H
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -49,6 +49,48 @@ LFC_FUNCS(uint8_t,  10)
 LFL_FUNCS(uint8_t,   8)
 LFL_FUNCS(uint8_t,  10)

+#if HAVE_SSE2_EXTERNAL
+/* 32x32 DC-only add, 8-bit: run the 16x16 SSE2 kernel on each quadrant.
+ * Every call is handed the same coeffs pointer, so all quadrants get the same DC. */
+void ff_hevc_idct32_dc_add_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+{
+    uint8_t *const lower = dst + 16 * stride; /* row 16: start of the bottom half */
+
+    ff_hevc_idct16_dc_add_8_sse2(dst,        coeffs, stride);
+    ff_hevc_idct16_dc_add_8_sse2(dst   + 16, coeffs, stride);
+    ff_hevc_idct16_dc_add_8_sse2(lower,      coeffs, stride);
+    ff_hevc_idct16_dc_add_8_sse2(lower + 16, coeffs, stride);
+}
+
+/* 16x16 DC-only add, 10-bit: run the 8x8 SSE2 kernel on each quadrant.
+ * The horizontal step is 16 bytes, i.e. 8 samples of 2 bytes each. */
+void ff_hevc_idct16_dc_add_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+{
+    uint8_t *const lower = dst + 8 * stride; /* row 8: start of the bottom half */
+
+    ff_hevc_idct8_dc_add_10_sse2(dst,        coeffs, stride);
+    ff_hevc_idct8_dc_add_10_sse2(dst   + 16, coeffs, stride);
+    ff_hevc_idct8_dc_add_10_sse2(lower,      coeffs, stride);
+    ff_hevc_idct8_dc_add_10_sse2(lower + 16, coeffs, stride);
+}
+
+/* 32x32 DC-only add, 10-bit: run the 16x16 SSE2 routine on each quadrant.
+ * The horizontal step is 32 bytes, i.e. 16 samples of 2 bytes each. */
+void ff_hevc_idct32_dc_add_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+{
+    uint8_t *const lower = dst + 16 * stride; /* row 16: start of the bottom half */
+
+    ff_hevc_idct16_dc_add_10_sse2(dst,        coeffs, stride);
+    ff_hevc_idct16_dc_add_10_sse2(dst   + 32, coeffs, stride);
+    ff_hevc_idct16_dc_add_10_sse2(lower,      coeffs, stride);
+    ff_hevc_idct16_dc_add_10_sse2(lower + 32, coeffs, stride);
+}
+#endif //HAVE_SSE2_EXTERNAL
+#if HAVE_AVX_EXTERNAL
+/* 16x16 DC-only add, 10-bit: run the 8x8 AVX kernel on each quadrant.
+ * The horizontal step is 16 bytes, i.e. 8 samples of 2 bytes each. */
+void ff_hevc_idct16_dc_add_10_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+{
+    uint8_t *const lower = dst + 8 * stride; /* row 8: start of the bottom half */
+
+    ff_hevc_idct8_dc_add_10_avx(dst,        coeffs, stride);
+    ff_hevc_idct8_dc_add_10_avx(dst   + 16, coeffs, stride);
+    ff_hevc_idct8_dc_add_10_avx(lower,      coeffs, stride);
+    ff_hevc_idct8_dc_add_10_avx(lower + 16, coeffs, stride);
+}
+
+/* 32x32 DC-only add, 10-bit: run the 16x16 AVX routine on each quadrant.
+ * The horizontal step is 32 bytes, i.e. 16 samples of 2 bytes each. */
+void ff_hevc_idct32_dc_add_10_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+{
+    uint8_t *const lower = dst + 16 * stride; /* row 16: start of the bottom half */
+
+    ff_hevc_idct16_dc_add_10_avx(dst,        coeffs, stride);
+    ff_hevc_idct16_dc_add_10_avx(dst   + 32, coeffs, stride);
+    ff_hevc_idct16_dc_add_10_avx(lower,      coeffs, stride);
+    ff_hevc_idct16_dc_add_10_avx(lower + 32, coeffs, stride);
+}
+#endif //HAVE_AVX_EXTERNAL

 #define mc_rep_func(name, bitd, step, W, opt) \
 void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *_dst, ptrdiff_t dststride,                            \
@@ -368,9 +410,17 @@ void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth)
    int mm_flags = av_get_cpu_flags();

    if (bit_depth == 8) {
+        if (EXTERNAL_MMXEXT(mm_flags)) {
+                c->transform_dc_add[0]    =  ff_hevc_idct4_dc_add_8_mmxext;
+                c->transform_dc_add[1]    =  ff_hevc_idct8_dc_add_8_mmxext;
+
+        }
        if (EXTERNAL_SSE2(mm_flags)) {
                    c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
                    c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_sse2;
+
+                    c->transform_dc_add[2]    =  ff_hevc_idct16_dc_add_8_sse2;
+                    c->transform_dc_add[3]    =  ff_hevc_idct32_dc_add_8_sse2;
        }
        if (EXTERNAL_SSSE3(mm_flags) && ARCH_X86_64) {
                    c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
@@ -387,13 +437,21 @@ void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth)
            QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h,     8, sse4);
            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     8, sse4);
            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    8, sse4);
-
        }
    } else if (bit_depth == 10) {
+        if (EXTERNAL_MMXEXT(mm_flags)) {
+                c->transform_dc_add[0]    =  ff_hevc_idct4_dc_add_10_mmxext;
+
+        }
        if (EXTERNAL_SSE2(mm_flags)) {
                    c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2;
                    c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_sse2;
-        }
+
+
+                    c->transform_dc_add[1]    =  ff_hevc_idct8_dc_add_10_sse2;
+                    c->transform_dc_add[2]    =  ff_hevc_idct16_dc_add_10_sse2;
+                    c->transform_dc_add[3]    =  ff_hevc_idct32_dc_add_10_sse2;
+                }
        if (EXTERNAL_SSSE3(mm_flags) && ARCH_X86_64) {
                    c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
                    c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
@@ -410,5 +468,11 @@ void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth)
            QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     10, sse4);
            QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    10, sse4);
        }
+        if (EXTERNAL_AVX(mm_flags)) {
+            c->transform_dc_add[1]    =  ff_hevc_idct8_dc_add_10_avx;
+            c->transform_dc_add[2]    =  ff_hevc_idct16_dc_add_10_avx;
+            c->transform_dc_add[3]    =  ff_hevc_idct32_dc_add_10_avx;
+        }
+
    }
 }