Merge commit '0e730494'

* commit '0e730494': avfilter: x86: Port gradfun filter optimizations to yasm Conflicts: libavfilter/x86/vf_gradfun_init.c Merged-by: Michael Niedermayer <michaelni@gmx.at>

Merge commit '0e730494'
* commit '0e730494': avfilter: x86: Port gradfun filter optimizations to yasm Conflicts: libavfilter/x86/vf_gradfun_init.c Merged-by: Michael Niedermayer <michaelni@gmx.at>
1ea28ffc · Michael Niedermayer · 04916953 · 0e730494 · 1ea28ffc · 1ea28ffc
Commit 1ea28ffc authored Oct 24, 2013 by Michael Niedermayer
Showing with 207 additions and 218 deletions

Makefile libavfilter/x86/Makefile +2 -1

vf_gradfun.asm libavfilter/x86/vf_gradfun.asm +110 -0

vf_gradfun.c libavfilter/x86/vf_gradfun.c +0 -217

vf_gradfun_init.c libavfilter/x86/vf_gradfun_init.c +95 -0

No files found.
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
-OBJS-$(CONFIG_GRADFUN_FILTER)                += x86/vf_gradfun.o
+OBJS-$(CONFIG_GRADFUN_FILTER)                += x86/vf_gradfun_init.o
 OBJS-$(CONFIG_HQDN3D_FILTER)                 += x86/vf_hqdn3d_init.o
 OBJS-$(CONFIG_PULLUP_FILTER)                 += x86/vf_pullup_init.o
 OBJS-$(CONFIG_SPP_FILTER)                    += x86/vf_spp.o
 OBJS-$(CONFIG_VOLUME_FILTER)                 += x86/af_volume_init.o
 OBJS-$(CONFIG_YADIF_FILTER)                  += x86/vf_yadif_init.o

+YASM-OBJS-$(CONFIG_GRADFUN_FILTER)           += x86/vf_gradfun.o
 YASM-OBJS-$(CONFIG_HQDN3D_FILTER)            += x86/vf_hqdn3d.o
 YASM-OBJS-$(CONFIG_PULLUP_FILTER)            += x86/vf_pullup.o
 YASM-OBJS-$(CONFIG_VOLUME_FILTER)            += x86/af_volume.o

--- a/libavfilter/x86/vf_gradfun.asm
+++ b/libavfilter/x86/vf_gradfun.asm
+;******************************************************************************
+;* x86-optimized functions for gradfun filter
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_7f: times 8 dw 0x7F
+pw_ff: times 8 dw 0xFF
+
+SECTION .text
+
+%macro FILTER_LINE 1
+    movh       m0, [r2+r0]
+    movh       m1, [r3+r0]
+    punpcklbw  m0, m7
+    punpcklwd  m1, m1
+    psllw      m0, 7
+    psubw      m1, m0
+    PABSW      m2, m1
+    pmulhuw    m2, m5
+    psubw      m2, m6
+    pminsw     m2, m7
+    pmullw     m2, m2
+    psllw      m1, 2
+    paddw      m0, %1
+    pmulhw     m1, m2
+    paddw      m0, m1
+    psraw      m0, 7
+    packuswb   m0, m0
+    movh  [r1+r0], m0
+%endmacro
+
+INIT_MMX mmxext
+cglobal gradfun_filter_line, 6, 6
+    movh      m5, r4d
+    pxor      m7, m7
+    pshufw    m5, m5,0
+    mova      m6, [pw_7f]
+    mova      m3, [r5]
+    mova      m4, [r5+8]
+.loop:
+    FILTER_LINE m3
+    add       r0, 4
+    jge .end
+    FILTER_LINE m4
+    add       r0, 4
+    jl .loop
+.end:
+    REP_RET
+
+INIT_XMM ssse3
+cglobal gradfun_filter_line, 6, 6, 8
+    movd       m5, r4d
+    pxor       m7, m7
+    pshuflw    m5, m5, 0
+    mova       m6, [pw_7f]
+    punpcklqdq m5, m5
+    mova       m4, [r5]
+.loop:
+    FILTER_LINE m4
+    add        r0, 8
+    jl .loop
+    REP_RET
+
+%macro BLUR_LINE 1
+cglobal gradfun_blur_line_%1, 6, 6, 8
+    mova        m7, [pw_ff]
+.loop:
+    %1          m0, [r4+r0]
+    %1          m1, [r5+r0]
+    mova        m2, m0
+    mova        m3, m1
+    psrlw       m0, 8
+    psrlw       m1, 8
+    pand        m2, m7
+    pand        m3, m7
+    paddw       m0, m1
+    paddw       m2, m3
+    paddw       m0, m2
+    paddw       m0, [r2+r0]
+    mova        m1, [r1+r0]
+    mova   [r1+r0], m0
+    psubw       m0, m1
+    mova   [r3+r0], m0
+    add         r0, 16
+    jl .loop
+    REP_RET
+%endmacro
+
+INIT_XMM sse2
+BLUR_LINE movdqa
+BLUR_LINE movdqu
--- a/libavfilter/x86/vf_gradfun.c
+++ b/libavfilter/x86/vf_gradfun.c
-/*
- * Copyright (C) 2009 Loren Merritt <lorenm@u.washington.edu>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/mem.h"
-#include "libavutil/x86/asm.h"
-#include "libavfilter/gradfun.h"
-
-#if HAVE_INLINE_ASM
-
-DECLARE_ALIGNED(16, static const uint16_t, pw_7f)[8] = {0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F};
-DECLARE_ALIGNED(16, static const uint16_t, pw_ff)[8] = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF};
-
-#if HAVE_MMXEXT_INLINE
-static void gradfun_filter_line_mmxext(uint8_t *dst, const uint8_t *src, const uint16_t *dc,
-                                       int width, int thresh,
-                                       const uint16_t *dithers)
-{
-    intptr_t x;
-    if (width & 3) {
-        x = width & ~3;
-        ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2, width - x, thresh, dithers);
-        width = x;
-    }
-    x = -width;
-    __asm__ volatile(
-        "movd          %4, %%mm5 \n"
-        "pxor       %%mm7, %%mm7 \n"
-        "pshufw $0, %%mm5, %%mm5 \n"
-        "movq          %6, %%mm6 \n"
-        "movq          (%5), %%mm3 \n"
-        "movq         8(%5), %%mm4 \n"
-
-        "1: \n"
-        "movd     (%2,%0), %%mm0 \n"
-        "movd     (%3,%0), %%mm1 \n"
-        "punpcklbw  %%mm7, %%mm0 \n"
-        "punpcklwd  %%mm1, %%mm1 \n"
-        "psllw         $7, %%mm0 \n"
-        "pxor       %%mm2, %%mm2 \n"
-        "psubw      %%mm0, %%mm1 \n" // delta = dc - pix
-        "psubw      %%mm1, %%mm2 \n"
-        "pmaxsw     %%mm1, %%mm2 \n"
-        "pmulhuw    %%mm5, %%mm2 \n" // m = abs(delta) * thresh >> 16
-        "psubw      %%mm6, %%mm2 \n"
-        "pminsw     %%mm7, %%mm2 \n" // m = -max(0, 127-m)
-        "pmullw     %%mm2, %%mm2 \n"
-        "paddw      %%mm3, %%mm0 \n" // pix += dither
-        "psllw         $2, %%mm1 \n" // m = m*m*delta >> 14
-        "pmulhw     %%mm2, %%mm1 \n"
-        "paddw      %%mm1, %%mm0 \n" // pix += m
-        "psraw         $7, %%mm0 \n"
-        "packuswb   %%mm0, %%mm0 \n"
-        "movd       %%mm0, (%1,%0) \n" // dst = clip(pix>>7)
-        "add           $4, %0 \n"
-        "jnl 2f \n"
-
-        "movd     (%2,%0), %%mm0 \n"
-        "movd     (%3,%0), %%mm1 \n"
-        "punpcklbw  %%mm7, %%mm0 \n"
-        "punpcklwd  %%mm1, %%mm1 \n"
-        "psllw         $7, %%mm0 \n"
-        "pxor       %%mm2, %%mm2 \n"
-        "psubw      %%mm0, %%mm1 \n" // delta = dc - pix
-        "psubw      %%mm1, %%mm2 \n"
-        "pmaxsw     %%mm1, %%mm2 \n"
-        "pmulhuw    %%mm5, %%mm2 \n" // m = abs(delta) * thresh >> 16
-        "psubw      %%mm6, %%mm2 \n"
-        "pminsw     %%mm7, %%mm2 \n" // m = -max(0, 127-m)
-        "pmullw     %%mm2, %%mm2 \n"
-        "paddw      %%mm4, %%mm0 \n" // pix += dither
-        "psllw         $2, %%mm1 \n" // m = m*m*delta >> 14
-        "pmulhw     %%mm2, %%mm1 \n"
-        "paddw      %%mm1, %%mm0 \n" // pix += m
-        "psraw         $7, %%mm0 \n"
-        "packuswb   %%mm0, %%mm0 \n"
-        "movd       %%mm0, (%1,%0) \n" // dst = clip(pix>>7)
-        "add           $4, %0 \n"
-        "jl 1b \n"
-
-        "2: \n"
-        "emms \n"
-        :"+r"(x)
-        :"r"(dst+width), "r"(src+width), "r"(dc+width/2),
-         "rm"(thresh), "r"(dithers), "m"(*pw_7f)
-        :"memory"
-    );
-}
-#endif
-
-#if HAVE_SSSE3_INLINE
-static void gradfun_filter_line_ssse3(uint8_t *dst, const uint8_t *src, const uint16_t *dc, int width, int thresh, const uint16_t *dithers)
-{
-    intptr_t x;
-    if (width & 7) {
-        // could be 10% faster if I somehow eliminated this
-        x = width & ~7;
-        ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2, width - x, thresh, dithers);
-        width = x;
-    }
-    x = -width;
-    __asm__ volatile(
-        "movd           %4, %%xmm5 \n"
-        "pxor       %%xmm7, %%xmm7 \n"
-        "pshuflw $0,%%xmm5, %%xmm5 \n"
-        "movdqa         %6, %%xmm6 \n"
-        "punpcklqdq %%xmm5, %%xmm5 \n"
-        "movdqa         %5, %%xmm4 \n"
-        "1: \n"
-        "movq      (%2,%0), %%xmm0 \n"
-        "movq      (%3,%0), %%xmm1 \n"
-        "punpcklbw  %%xmm7, %%xmm0 \n"
-        "punpcklwd  %%xmm1, %%xmm1 \n"
-        "psllw          $7, %%xmm0 \n"
-        "psubw      %%xmm0, %%xmm1 \n" // delta = dc - pix
-        "pabsw      %%xmm1, %%xmm2 \n"
-        "pmulhuw    %%xmm5, %%xmm2 \n" // m = abs(delta) * thresh >> 16
-        "psubw      %%xmm6, %%xmm2 \n"
-        "pminsw     %%xmm7, %%xmm2 \n" // m = -max(0, 127-m)
-        "pmullw     %%xmm2, %%xmm2 \n"
-        "psllw          $2, %%xmm1 \n"
-        "paddw      %%xmm4, %%xmm0 \n" // pix += dither
-        "pmulhw     %%xmm2, %%xmm1 \n" // m = m*m*delta >> 14
-        "paddw      %%xmm1, %%xmm0 \n" // pix += m
-        "psraw          $7, %%xmm0 \n"
-        "packuswb   %%xmm0, %%xmm0 \n"
-        "movq       %%xmm0, (%1,%0) \n" // dst = clip(pix>>7)
-        "add            $8, %0 \n"
-        "jl 1b \n"
-        :"+&r"(x)
-        :"r"(dst+width), "r"(src+width), "r"(dc+width/2),
-         "rm"(thresh), "m"(*dithers), "m"(*pw_7f)
-        :"memory"
-    );
-}
-#endif /* HAVE_SSSE3_INLINE */
-
-#if HAVE_SSE2_INLINE
-static void gradfun_blur_line_sse2(uint16_t *dc, uint16_t *buf, const uint16_t *buf1, const uint8_t *src, int src_linesize, int width)
-{
-#define BLURV(load)\
-    intptr_t x = -2*width;\
-    __asm__ volatile(\
-        "movdqa %6, %%xmm7 \n"\
-        "1: \n"\
-        load"   (%4,%0), %%xmm0 \n"\
-        load"   (%5,%0), %%xmm1 \n"\
-        "movdqa  %%xmm0, %%xmm2 \n"\
-        "movdqa  %%xmm1, %%xmm3 \n"\
-        "psrlw       $8, %%xmm0 \n"\
-        "psrlw       $8, %%xmm1 \n"\
-        "pand    %%xmm7, %%xmm2 \n"\
-        "pand    %%xmm7, %%xmm3 \n"\
-        "paddw   %%xmm1, %%xmm0 \n"\
-        "paddw   %%xmm3, %%xmm2 \n"\
-        "paddw   %%xmm2, %%xmm0 \n"\
-        "paddw  (%2,%0), %%xmm0 \n"\
-        "movdqa (%1,%0), %%xmm1 \n"\
-        "movdqa  %%xmm0, (%1,%0) \n"\
-        "psubw   %%xmm1, %%xmm0 \n"\
-        "movdqa  %%xmm0, (%3,%0) \n"\
-        "add        $16, %0 \n"\
-        "jl 1b \n"\
-        :"+&r"(x)\
-        :"r"(buf+width),\
-         "r"(buf1+width),\
-         "r"(dc+width),\
-         "r"(src+width*2),\
-         "r"(src+width*2+src_linesize),\
-         "m"(*pw_ff)\
-        :"memory"\
-    );
-    if (((intptr_t) src | src_linesize) & 15) {
-        BLURV("movdqu");
-    } else {
-        BLURV("movdqa");
-    }
-}
-#endif /* HAVE_SSE2_INLINE */
-
-#endif /* HAVE_INLINE_ASM */
-
-av_cold void ff_gradfun_init_x86(GradFunContext *gf)
-{
-#if HAVE_MMXEXT_INLINE
-    int cpu_flags = av_get_cpu_flags();
-
-    if (cpu_flags & AV_CPU_FLAG_MMXEXT)
-        gf->filter_line = gradfun_filter_line_mmxext;
-#endif
-#if HAVE_SSSE3_INLINE
-    if (cpu_flags & AV_CPU_FLAG_SSSE3)
-        gf->filter_line = gradfun_filter_line_ssse3;
-#endif
-#if HAVE_SSE2_INLINE
-    if (cpu_flags & AV_CPU_FLAG_SSE2)
-        gf->blur_line = gradfun_blur_line_sse2;
-#endif
-}
--- a/libavfilter/x86/vf_gradfun_init.c
+++ b/libavfilter/x86/vf_gradfun_init.c
+/*
+ * Copyright (C) 2009 Loren Merritt <lorenm@u.washington.edu>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/gradfun.h"
+
+#if HAVE_YASM
+void ff_gradfun_filter_line_mmxext(intptr_t x, uint8_t *dst, const uint8_t *src,
+                                   const uint16_t *dc, int thresh,
+                                   const uint16_t *dithers);
+static void gradfun_filter_line_mmxext(uint8_t *dst, uint8_t *src, uint16_t *dc,
+                                       int width, int thresh,
+                                       const uint16_t *dithers)
+{
+    intptr_t x;
+    if (width & 3) {
+        x = width & ~3;
+        ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2, width - x, thresh, dithers);
+        width = x;
+    }
+    x = -width;
+    ff_gradfun_filter_line_mmxext(x, dst + width, src + width, dc + width/2,
+                                  thresh, dithers);
+}
+
+void ff_gradfun_filter_line_ssse3(intptr_t x, uint8_t *dst, const uint8_t *src,
+                                  const uint16_t *dc, int thresh,
+                                  const uint16_t *dithers);
+static void gradfun_filter_line_ssse3(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers)
+{
+    intptr_t x;
+    if (width & 7) {
+        // could be 10% faster if I somehow eliminated this
+        x = width & ~7;
+        ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2, width - x, thresh, dithers);
+        width = x;
+    }
+    x = -width;
+    ff_gradfun_filter_line_ssse3(x, dst + width, src + width, dc + width/2,
+                                 thresh, dithers);
+}
+
+void ff_gradfun_blur_line_movdqa_sse2(intptr_t x, uint16_t *buf, uint16_t *buf1, uint16_t *dc, uint8_t *src1, uint8_t *src2);
+void ff_gradfun_blur_line_movdqu_sse2(intptr_t x, uint16_t *buf, uint16_t *buf1, uint16_t *dc, uint8_t *src1, uint8_t *src2);
+static void gradfun_blur_line_sse2(uint16_t *dc, uint16_t *buf, const uint16_t *buf1, const uint8_t *src, int src_linesize, int width)
+{
+    intptr_t x = -2*width;
+    if (((intptr_t) src | src_linesize) & 15) {
+        ff_gradfun_blur_line_movdqu_sse2(x, buf + width, buf1 + width,
+                                         dc + width, src + width * 2,
+                                         src + width * 2 + src_linesize);
+    } else {
+        ff_gradfun_blur_line_movdqa_sse2(x, buf + width, buf1 + width,
+                                         dc + width, src + width * 2,
+                                         src + width * 2 + src_linesize);
+    }
+}
+#endif /* HAVE_YASM */
+
+av_cold void ff_gradfun_init_x86(GradFunContext *gf)
+{
+#if HAVE_YASM
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_MMXEXT(cpu_flags))
+        gf->filter_line = gradfun_filter_line_mmxext;
+    if (EXTERNAL_SSSE3(cpu_flags))
+        gf->filter_line = gradfun_filter_line_ssse3;
+
+    if (EXTERNAL_SSE2(cpu_flags))
+        gf->blur_line = gradfun_blur_line_sse2;
+#endif /* HAVE_YASM */
+}