yadif: x86 assembly for 9 to 14-bit samples

These smaller samples do not need to be unpacked to double words, allowing the code to process more pixels per iteration (still 2 in MMX, but 6 in SSE2). It also avoids emulating the missing double-word instructions on older instruction sets.
Like the previous code for 16-bit samples, this has been tested on an Athlon64 and a Core2Quad.
Athlon64:
1809275 decicycles in C, 32718 runs, 50 skips
911675 decicycles in mmx, 32727 runs, 41 skips, 2.0x faster
495284 decicycles in sse2, 32747 runs, 21 skips, 3.7x faster
Core2Quad:
921363 decicycles in C, 32756 runs, 12 skips
486537 decicycles in mmx, 32764 runs, 4 skips, 1.9x faster
293296 decicycles in sse2, 32759 runs, 9 skips, 3.1x faster
284910 decicycles in ssse3, 32759 runs, 9 skips, 3.2x faster
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>

yadif: x86 assembly for 9 to 14-bit samples
These smaller samples do not need to be unpacked to double words, allowing the code to process more pixels per iteration (still 2 in MMX, but 6 in SSE2). It also avoids emulating the missing double-word instructions on older instruction sets.
Like the previous code for 16-bit samples, this has been tested on an Athlon64 and a Core2Quad.
Athlon64:
1809275 decicycles in C, 32718 runs, 50 skips
911675 decicycles in mmx, 32727 runs, 41 skips, 2.0x faster
495284 decicycles in sse2, 32747 runs, 21 skips, 3.7x faster
Core2Quad:
921363 decicycles in C, 32756 runs, 12 skips
486537 decicycles in mmx, 32764 runs, 4 skips, 1.9x faster
293296 decicycles in sse2, 32759 runs, 9 skips, 3.1x faster
284910 decicycles in ssse3, 32759 runs, 9 skips, 3.2x faster
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
0a5814c9 · James Darnley · Michael Niedermayer · 17e7b495 · 0a5814c9 · 0a5814c9
Commit 0a5814c9 authored Mar 16, 2013 by James Darnley Committed by Michael Niedermayer Mar 16, 2013
Show whitespace changes
Inline Side-by-side

Showing with 305 additions and 2 deletions

Makefile libavfilter/x86/Makefile +1 -1

vf_yadif_init.c libavfilter/x86/vf_yadif_init.c +20 -1

yadif-10.asm libavfilter/x86/yadif-10.asm +284 -0

No files found.
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -5,4 +5,4 @@ OBJS-$(CONFIG_YADIF_FILTER)                  += x86/vf_yadif_init.o
 YASM-OBJS-$(CONFIG_HQDN3D_FILTER)            += x86/vf_hqdn3d.o
 YASM-OBJS-$(CONFIG_VOLUME_FILTER)            += x86/af_volume.o
-YASM-OBJS-$(CONFIG_YADIF_FILTER)             += x86/vf_yadif.o x86/yadif-16.o
+YASM-OBJS-$(CONFIG_YADIF_FILTER)             += x86/vf_yadif.o x86/yadif-16.o x86/yadif-10.o
--- a/libavfilter/x86/vf_yadif_init.c
+++ b/libavfilter/x86/vf_yadif_init.c
@@ -49,6 +49,16 @@ void ff_yadif_filter_line_16bit_sse4(void *dst, void *prev, void *cur,
                                     void *next, int w, int prefs,
                                     int mrefs, int parity, int mode);
+void ff_yadif_filter_line_10bit_mmxext(void *dst, void *prev, void *cur,
+                                       void *next, int w, int prefs,
+                                       int mrefs, int parity, int mode); /* MMXEXT variant for 9- to 14-bit samples; only selected under ARCH_X86_32 */
+void ff_yadif_filter_line_10bit_sse2(void *dst, void *prev, void *cur,
+                                     void *next, int w, int prefs,
+                                     int mrefs, int parity, int mode); /* SSE2 variant for 9- to 14-bit samples */
+void ff_yadif_filter_line_10bit_ssse3(void *dst, void *prev, void *cur,
+                                      void *next, int w, int prefs,
+                                      int mrefs, int parity, int mode); /* SSSE3 variant for 9- to 14-bit samples */
 av_cold void ff_yadif_init_x86(YADIFContext *yadif)
 {
    int cpu_flags = av_get_cpu_flags();
@@ -56,7 +66,7 @@ av_cold void ff_yadif_init_x86(YADIFContext *yadif)
                                  : yadif->csp->comp[0].depth_minus1 + 1;
 #if HAVE_YASM
-    if (bit_depth > 8) {
+    if (bit_depth >= 15) {
 #if ARCH_X86_32
        if (EXTERNAL_MMXEXT(cpu_flags))
            yadif->filter_line = ff_yadif_filter_line_16bit_mmxext;
@@ -67,6 +77,15 @@ av_cold void ff_yadif_init_x86(YADIFContext *yadif)
            yadif->filter_line = ff_yadif_filter_line_16bit_ssse3;
        if (EXTERNAL_SSE4(cpu_flags))
            yadif->filter_line = ff_yadif_filter_line_16bit_sse4;
+    } else if ( bit_depth >= 9 && bit_depth <= 14) {
+#if ARCH_X86_32
+        if (EXTERNAL_MMXEXT(cpu_flags))
+            yadif->filter_line = ff_yadif_filter_line_10bit_mmxext;
+#endif /* ARCH_X86_32 */
+        if (EXTERNAL_SSE2(cpu_flags))
+            yadif->filter_line = ff_yadif_filter_line_10bit_sse2;
+        if (EXTERNAL_SSSE3(cpu_flags))
+            yadif->filter_line = ff_yadif_filter_line_10bit_ssse3;
    } else {
 #if ARCH_X86_32
    if (EXTERNAL_MMXEXT(cpu_flags))

--- a/libavfilter/x86/yadif-10.asm
+++ b/libavfilter/x86/yadif-10.asm
+;*****************************************************************************
+;* x86-optimized functions for yadif filter
+;*
+;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
+;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com>
+;* Copyright (c) 2011-2013 James Darnley <james.darnley@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License along
+;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;******************************************************************************
+%include "libavutil/x86/x86util.asm"
+SECTION_RODATA
+pw_1: times 8 dw 1 ; eight 16-bit words, each equal to 1; read as a per-word constant by CHECK and FILTER
+SECTION .text
+%macro PABS 2 ; per-word absolute value of the first operand (signed); the second operand is a scratch register
+%if cpuflag(ssse3)
+    pabsw %1, %1 ; native instruction; the scratch register is left untouched on this path
+%else
+    pxor    %2, %2 ; scratch = 0
+    pcmpgtw %2, %1 ; scratch = all-ones in the lanes where 0 > value, i.e. the negative lanes
+    pxor    %1, %2 ; flip every bit of the negative lanes
+    psubw   %1, %2 ; subtracting -1 adds 1, which completes the negation of those lanes
+%endif
+%endmacro
+%macro PMAXUW 2 ; first operand = per-word unsigned maximum of both operands
+%if cpuflag(sse4)
+    pmaxuw %1, %2
+%else
+    psubusw %1, %2 ; saturating subtract: a - b where a > b, otherwise 0
+    paddusw %1, %2 ; adding b back yields a where a > b, otherwise b
+%endif
+%endmacro
+%macro CHECK 2 ; args: sample offsets added to [curq+t1] and [curq+t0]; out: m2 = summed abs differences, m5 = truncated averages; clobbers m3, m4
+    movu      m2, [curq+t1+%1*2] ; offsets are scaled by 2 to turn them into byte offsets
+    movu      m3, [curq+t0+%2*2]
+    mova      m4, m2
+    mova      m5, m2
+    pxor      m4, m3
+    pavgw     m5, m3 ; rounded-up average (a + b + 1) >> 1
+    pand      m4, [pw_1] ; low bit of a ^ b is set exactly when a + b is odd
+    psubusw   m5, m4 ; undo the rounding: m5 = (a + b) >> 1
+%if mmsize == 16
+    psrldq    m5, 2 ; shift the register down by one word, so lane i now holds former lane i+1
+%else
+    psrlq     m5, 16 ; the same one-word shift for a 64-bit register
+%endif
+    mova      m4, m2
+    psubusw   m2, m3
+    psubusw   m3, m4 ; one of the two saturated differences is 0, so the max below is |a - b|
+    PMAXUW    m2, m3
+    mova      m3, m2
+    mova      m4, m2
+%if mmsize == 16
+    psrldq    m3, 2 ; |a - b| taken from one lane higher
+    psrldq    m4, 4 ; |a - b| taken from two lanes higher
+%else
+    psrlq     m3, 16
+    psrlq     m4, 32
+%endif
+    paddw     m2, m3
+    paddw     m2, m4 ; m2 = sum of three adjacent absolute differences
+%endmacro
+%macro CHECK1 0 ; in: m0 = best score, m1 = best pixels, m2/m5 = candidate score/pixels; keeps the lower score and matching pixels; mask left in m6
+    mova    m3, m0
+    pcmpgtw m3, m2 ; mask = all-ones where the old score m0 is greater than the candidate m2 (signed compare)
+    pminsw  m0, m2 ; m0 = lower of the two scores
+    mova    m6, m3 ; keep the mask; CHECK2 reads it from m6
+    pand    m5, m3 ; candidate pixels where the mask is set
+    pandn   m3, m1 ; previous pixels where the mask is clear
+    por     m3, m5
+    mova    m1, m3 ; m1 = merged pixels
+%endmacro
+; %macro CHECK2 0
+;     paddw   m6, [pw_1]
+;     psllw   m6, 14
+;     paddsw  m2, m6
+;     mova    m3, m0
+;     pcmpgtw m3, m2
+;     pminsw  m0, m2
+;     pand    m5, m3
+;     pandn   m3, m1
+;     por     m3, m5
+;     mova    m1, m3
+; %endmacro
+; This version of CHECK2 is required for 14-bit samples.  The left-shift trick
+; in the old code is not large enough to correctly select pixels or scores.
+%macro CHECK2 0 ; like CHECK1, but a lane is only updated when the mask already in m6 is set for it as well
+    mova    m3, m0 ; keep a copy of the old scores
+    pcmpgtw m0, m2 ; mask = old score greater than candidate score (signed compare)
+    pand    m0, m6 ; restrict the mask to the lanes selected by m6
+    mova    m6, m0
+    pand    m5, m6 ; candidate pixels in the selected lanes
+    pand    m2, m0 ; candidate scores in the selected lanes
+    pandn   m6, m1 ; previous pixels in the other lanes
+    pandn   m0, m3 ; previous scores in the other lanes
+    por     m6, m5
+    por     m0, m2 ; m0 = merged scores
+    mova    m1, m6 ; m1 = merged pixels
+%endmacro
+%macro LOAD 2 ; args: vector register number, memory operand
+    movu      m%1, %2 ; unaligned load; samples are used as stored, with no unpacking
+%endmacro
+; FILTER label_suffix, row_a, row_b
+;   Loop that filters one line of 16-bit samples.
+;   label_suffix makes the .loop/.end labels unique per expansion.
+;   row_a and row_b are the addresses of the two rows that are averaged
+;   into d; the caller passes prevq/curq or curq/nextq.
+;   Expects dstq, prevq, curq, nextq, t0, t1 and the named arguments w and
+;   mode to be set up by the enclosing function, plus four scratch slots
+;   at [rsp+0], [rsp+16], [rsp+32] and [rsp+48].
+;   Each iteration stores mmsize bytes but advances by only mmsize-4 bytes
+;   (mmsize/2-2 samples), because the lane shifts in CHECK leave the top
+;   two lanes without a full result; the next iteration rewrites them.
+;   Uses m0-m7.
+%macro FILTER 3
+.loop%1:
+    ; m0 = c = cur[t1], m1 = e = cur[t0]
+    LOAD          0, [curq+t1]
+    LOAD          1, [curq+t0]
+    LOAD          2, [%2]
+    LOAD          3, [%3]
+    mova         m4, m3
+    paddw        m3, m2
+    psraw        m3, 1                  ; d = (row_a + row_b) >> 1
+    mova   [rsp+ 0], m0                 ; spill c
+    mova   [rsp+16], m3                 ; spill d
+    mova   [rsp+32], m1                 ; spill e
+    psubw        m2, m4
+    ; m2 = |row_a - row_b|, halved a few lines below
+    PABS         m2, m4
+    LOAD          3, [prevq+t1]
+    LOAD          4, [prevq+t0]
+    psubw        m3, m0
+    psubw        m4, m1
+    PABS         m3, m5
+    PABS         m4, m5
+    paddw        m3, m4                 ; |prev[t1] - c| + |prev[t0] - e|
+    psrlw        m2, 1
+    psrlw        m3, 1
+    pmaxsw       m2, m3
+    LOAD          3, [nextq+t1]
+    LOAD          4, [nextq+t0]
+    psubw        m3, m0
+    psubw        m4, m1
+    PABS         m3, m5
+    PABS         m4, m5
+    paddw        m3, m4                 ; |next[t1] - c| + |next[t0] - e|
+    psrlw        m3, 1
+    pmaxsw       m2, m3
+    mova   [rsp+48], m2                 ; spill diff = max of the three halved differences
+    paddw        m1, m0
+    paddw        m0, m0
+    psubw        m0, m1                 ; 2c - (c + e) = c - e
+    psrlw        m1, 1                  ; m1 = (c + e) >> 1, the starting value of the output
+    PABS         m0, m2
+    ; absolute difference of the two rows, starting one sample to the left
+    movu         m2, [curq+t1-1*2]
+    movu         m3, [curq+t0-1*2]
+    mova         m4, m2
+    psubusw      m2, m3
+    psubusw      m3, m4
+    PMAXUW       m2, m3
+%if mmsize == 16
+    mova         m3, m2
+    psrldq       m3, 4                  ; same differences taken from two lanes higher
+%else
+    mova         m3, m2
+    psrlq        m3, 32
+%endif
+    paddw        m0, m2
+    paddw        m0, m3
+    psubw        m0, [pw_1]             ; starting score = |c - e| + both neighbours - 1
+    ; Try four other offset pairs. Each CHECK2 can only replace the result
+    ; in lanes where the CHECK1 just before it did.
+    CHECK -2, 0
+    CHECK1
+    CHECK -3, 1
+    CHECK2
+    CHECK 0, -2
+    CHECK1
+    CHECK 1, -3
+    CHECK2
+    mova         m6, [rsp+48]           ; m6 = diff
+    cmp DWORD modem, 2
+    jge .end%1                          ; mode >= 2 skips the extra check below
+    ; b = average of row_a/row_b at 2*t1, f = the same at 2*t0
+    LOAD          2, [%2+t1*2]
+    LOAD          4, [%3+t1*2]
+    LOAD          3, [%2+t0*2]
+    LOAD          5, [%3+t0*2]
+    paddw        m2, m4
+    paddw        m3, m5
+    psrlw        m2, 1                  ; b
+    psrlw        m3, 1                  ; f
+    mova         m4, [rsp+ 0]           ; c
+    mova         m5, [rsp+16]           ; d
+    mova         m7, [rsp+32]           ; e
+    psubw        m2, m4                 ; b - c
+    psubw        m3, m7                 ; f - e
+    mova         m0, m5
+    psubw        m5, m4                 ; d - c
+    psubw        m0, m7                 ; d - e
+    mova         m4, m2
+    pminsw       m2, m3                 ; min(b - c, f - e)
+    pmaxsw       m3, m4                 ; max(b - c, f - e)
+    pmaxsw       m2, m5
+    pminsw       m3, m5
+    pmaxsw       m2, m0                 ; m2 = max(d - e, d - c, min(b - c, f - e))
+    pminsw       m3, m0                 ; m3 = min(d - e, d - c, max(b - c, f - e))
+    pxor         m4, m4
+    pmaxsw       m6, m3
+    psubw        m4, m2                 ; -m2
+    pmaxsw       m6, m4                 ; diff = max(diff, m3, -m2)
+.end%1:
+    mova         m2, [rsp+16]
+    mova         m3, m2
+    psubw        m2, m6                 ; d - diff
+    paddw        m3, m6                 ; d + diff
+    pmaxsw       m1, m2
+    pminsw       m1, m3                 ; clamp the result to [d - diff, d + diff]
+    movu     [dstq], m1
+    add        dstq, mmsize-4
+    add       prevq, mmsize-4
+    add        curq, mmsize-4
+    add       nextq, mmsize-4
+    sub    DWORD wm, mmsize/2-2         ; samples completed this iteration
+    jg .loop%1
+%endmacro
+%macro YADIF 0 ; emits one yadif_filter_line_10bit function for whatever INIT_* setting is active at the point of expansion
+%if ARCH_X86_32 ; NOTE(review): the numbers after the name presumably follow x86inc cglobal order (args loaded, GPRs, vector regs, stack bytes) - confirm
+cglobal yadif_filter_line_10bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
+                                              prefs, mrefs, parity, mode
+%else
+cglobal yadif_filter_line_10bit, 4, 7, 8, 80, dst, prev, cur, next, w, \
+                                              prefs, mrefs, parity, mode
+%endif
+    cmp      DWORD wm, 0
+    jle .ret ; nothing to do when the width is zero or negative
+%if ARCH_X86_32
+    mov            r4, r5mp ; r4 = prefs (sixth argument)
+    mov            r5, r6mp ; r5 = mrefs (seventh argument)
+    DECLARE_REG_TMP 4,5 ; t0 = r4 (prefs), t1 = r5 (mrefs)
+%else
+    movsxd         r5, DWORD r5m ; r5 = prefs, sign-extended to 64 bits
+    movsxd         r6, DWORD r6m ; r6 = mrefs, sign-extended to 64 bits
+    DECLARE_REG_TMP 5,6 ; t0 = r5 (prefs), t1 = r6 (mrefs)
+%endif
+    cmp DWORD paritym, 0
+    je .parity0 ; parity != 0 falls through and passes prev/cur as the averaged rows
+    FILTER 1, prevq, curq
+    jmp .ret
+.parity0: ; parity == 0 passes cur/next as the averaged rows
+    FILTER 0, curq, nextq
+.ret:
+    RET
+%endmacro
+INIT_XMM ssse3 ; XMM build in which cpuflag(ssse3) holds, so PABS uses pabsw
+YADIF
+INIT_XMM sse2 ; XMM build using the non-SSSE3 PABS fallback
+YADIF
+%if ARCH_X86_32 ; the MMX-register build is only assembled for 32-bit x86
+INIT_MMX mmxext
+YADIF
+%endif