Commit b2da63db authored by Michael Niedermayer

Merge commit '245b76a1'

* commit '245b76a1':
  x86: dsputil: Split inline assembly from init code

Conflicts:
	libavcodec/x86/dsputil_mmx.c

Note, the author attribution is left in place and not removed
as it is in the merged commit.
Merged-by: Michael Niedermayer <michaelni@gmx.at>
parents eda9d97b 245b76a1
...@@ -34,7 +34,8 @@ OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp_init.o ...@@ -34,7 +34,8 @@ OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp_init.o
OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp_init.o OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp_init.o
OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
MMX-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil_mmx.o \ MMX-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil_init.o \
x86/dsputil_mmx.o \
x86/fdct.o \ x86/fdct.o \
x86/fpel_mmx.o \ x86/fpel_mmx.o \
x86/idct_mmx_xvid.o \ x86/idct_mmx_xvid.o \
......
/*
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/simple_idct.h"
#include "dsputil_mmx.h"
#include "idct_xvid.h"
/*
 * Prototypes for the MMXEXT helpers called by the QPEL_OP() wrappers further
 * down in this file. Only the declarations are visible here; the definitions
 * live elsewhere.
 *
 * *_pixels{8,16}_l2_*: one destination and two sources. src1 has its own
 * stride argument, src2 has none.
 * NOTE(review): presumably src2 is expected to be packed at the block width
 * and the two sources are averaged -- confirm against the implementation.
 */
void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1,
uint8_t *src2, int dstStride,
int src1Stride, int h);
void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
/* Horizontal lowpass filters: the number of rows to process is passed as h. */
void ff_put_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride, int h);
void ff_avg_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride, int h);
void ff_put_no_rnd_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride,
int h);
void ff_put_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride, int h);
void ff_avg_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride, int h);
void ff_put_no_rnd_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride,
int h);
/* Vertical lowpass filters: unlike the horizontal ones they take no row
 * count argument. */
void ff_put_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride);
void ff_avg_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride);
void ff_put_no_rnd_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride);
void ff_put_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride);
void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride);
void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride);
/* QPEL_OP(put_no_rnd_, ...) pastes together the names
 * ff_put_no_rnd_pixels{8,16}_mmxext for its mc00 wrappers; these aliases
 * route those names to the ordinary put versions. */
#define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext
#define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext
/*
 * Prototypes for the remaining external routines that the dsputil_init_*()
 * functions in this file install into DSPContext. Only the declarations are
 * visible here.
 */
/* H.263 loop filters, installed by dsputil_init_mmx(). */
void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale);
void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale);
/* 16-bit scalar product, with and without the combined multiply-add. */
int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
int order);
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
int order);
int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
const int16_t *v3,
int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
const int16_t *v3,
int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
const int16_t *v3,
int order, int mul);
/* 16-bit windowing. The init functions pick the plain variants when
 * CODEC_FLAG_BITEXACT is set and the _round variants otherwise (MMXEXT and
 * SSE2); SSSE3 only has plain and Atom-specific variants. */
void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
/* 32-bit byte swapping of a buffer of w words. */
void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
/* HuffYUV prediction helpers. */
void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
const uint8_t *diff, int w,
int *left, int *left_top);
int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
int w, int left);
int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
int w, int left);
/* int32 vector clipping; the _int_sse2 variant is the one selected for
 * CPUs flagged AV_CPU_FLAG_ATOM in dsputil_init_sse2(). */
void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src,
int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src,
int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src,
int32_t min, int32_t max, unsigned int len);
#if HAVE_YASM
PIXELS16(static, ff_avg, , , _mmxext)
PIXELS16(static, ff_put, , , _mmxext)
#define QPEL_OP(OPNAME, RND, MMX) \
static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
ff_ ## OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \
} \
\
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[8]; \
uint8_t * const half = (uint8_t*)temp; \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
stride, 8); \
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
stride, stride, 8); \
} \
\
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
ff_ ## OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \
stride, 8); \
} \
\
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[8]; \
uint8_t * const half = (uint8_t*)temp; \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
stride, 8); \
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \
stride, 8); \
} \
\
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[8]; \
uint8_t * const half = (uint8_t*)temp; \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
8, stride); \
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
stride, stride, 8); \
} \
\
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, \
stride, stride); \
} \
\
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[8]; \
uint8_t * const half = (uint8_t*)temp; \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
8, stride); \
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,\
stride, 8); \
} \
\
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t * const halfH = ((uint8_t*)half) + 64; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
stride, 9); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t * const halfH = ((uint8_t*)half) + 64; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
stride, 9); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t * const halfH = ((uint8_t*)half) + 64; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
stride, 9); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t * const halfH = ((uint8_t*)half) + 64; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
stride, 9); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t * const halfH = ((uint8_t*)half) + 64; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t * const halfH = ((uint8_t*)half) + 64; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t * const halfH = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, \
8, stride, 9); \
ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
stride, 8); \
} \
\
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t * const halfH = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
stride, 9); \
ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
stride, 8); \
} \
\
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[9]; \
uint8_t * const halfH = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
stride, 8); \
} \
\
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
ff_ ## OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \
} \
\
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[32]; \
uint8_t * const half = (uint8_t*)temp; \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
stride, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
stride, 16); \
} \
\
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
ff_ ## OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \
stride, stride, 16);\
} \
\
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[32]; \
uint8_t * const half = (uint8_t*)temp; \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
stride, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \
stride, stride, 16); \
} \
\
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[32]; \
uint8_t * const half = (uint8_t*)temp; \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
stride); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
stride, 16); \
} \
\
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, \
stride, stride); \
} \
\
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[32]; \
uint8_t * const half = (uint8_t*)temp; \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
stride); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, \
stride, stride, 16); \
} \
\
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half) + 256; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
stride, 17); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half) + 256; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
stride, 17); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half) + 256; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
stride, 17); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half) + 256; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
stride, 17); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half) + 256; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half) + 256; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
stride, 17); \
ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
stride, 16); \
} \
\
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
stride, 17); \
ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
stride, 16); \
} \
\
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
stride, 16); \
}
QPEL_OP(put_, _, mmxext)
QPEL_OP(avg_, _, mmxext)
QPEL_OP(put_no_rnd_, _no_rnd_, mmxext)
#endif /* HAVE_YASM */
/*
 * SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX)
 *
 * Fill all 16 slots of c->PFX_pixels_tab[IDX] with the functions named
 * <PREFIX><PFX><SIZE>_mcXY_<CPU>.  Slot n receives mcXY with n = X + 4 * Y
 * (slot 1 is mc10, slot 4 is mc01, slot 15 is mc33).
 *
 * The expansion refers to a variable named `c`, so it can only be used where
 * such a variable is in scope.  PREFIX may be left empty, as every use in
 * this file does.  Wrapped in do { } while (0) so it behaves as a single
 * statement.
 */
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
do { \
c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
} while (0)
/**
 * Install the baseline MMX routines into the DSP context.
 *
 * @param c        DSP context whose function pointers are overwritten
 * @param avctx    codec context; only bits_per_raw_sample is read
 * @param mm_flags CPU feature flags (not consulted by this function)
 */
static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
                                     int mm_flags)
{
#if HAVE_MMX_INLINE
    /* Installed regardless of sample depth. */
    c->add_bytes                 = ff_add_bytes_mmx;
    c->add_pixels_clamped        = ff_add_pixels_clamped_mmx;
    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
    c->put_pixels_clamped        = ff_put_pixels_clamped_mmx;

#if CONFIG_VIDEODSP && (ARCH_X86_32 || !HAVE_YASM)
    c->gmc = ff_gmc_mmx;
#endif

    /* Block clearing and edge drawing are only replaced when the samples
     * are at most 8 bits wide. */
    if (avctx->bits_per_raw_sample <= 8) {
        c->draw_edges   = ff_draw_edges_mmx;
        c->clear_blocks = ff_clear_blocks_mmx;
        c->clear_block  = ff_clear_block_mmx;
    }
#endif /* HAVE_MMX_INLINE */

#if HAVE_MMX_EXTERNAL
    c->vector_clip_int32 = ff_vector_clip_int32_mmx;

    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        c->h263_h_loop_filter = ff_h263_h_loop_filter_mmx;
        c->h263_v_loop_filter = ff_h263_v_loop_filter_mmx;
    }
#endif /* HAVE_MMX_EXTERNAL */
}
/**
 * Install the MMXEXT routines into the DSP context.
 *
 * @param c        DSP context whose function pointers are overwritten
 * @param avctx    codec context; only the flags field is read
 * @param mm_flags CPU feature flags, checked for AV_CPU_FLAG_3DNOW
 */
static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
                                        int mm_flags)
{
#if HAVE_MMXEXT_EXTERNAL
    /* Table index 0 holds the 16x16 functions, index 1 the 8x8 ones. */
    SET_QPEL_FUNCS(put_qpel,        0, 16, mmxext, );
    SET_QPEL_FUNCS(put_qpel,        1,  8, mmxext, );
    SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
    SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, mmxext, );
    SET_QPEL_FUNCS(avg_qpel,        0, 16, mmxext, );
    SET_QPEL_FUNCS(avg_qpel,        1,  8, mmxext, );

    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
    c->scalarproduct_int16          = ff_scalarproduct_int16_mmxext;

    /* Bit-exact mode gets the plain window function, otherwise the
     * _round variant is used. */
    c->apply_window_int16 = (avctx->flags & CODEC_FLAG_BITEXACT)
                          ? ff_apply_window_int16_mmxext
                          : ff_apply_window_int16_round_mmxext;

    /* Left alone on 3DNow! CPUs: slower than the cmov version on AMD. */
    if ((mm_flags & AV_CPU_FLAG_3DNOW) == 0)
        c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;
#endif /* HAVE_MMXEXT_EXTERNAL */
}
/**
 * Install the SSE routines into the DSP context.
 *
 * @param c        DSP context whose function pointers are overwritten
 * @param avctx    codec context; bits_per_raw_sample and xvmc_acceleration
 *                 are read
 * @param mm_flags CPU feature flags (not consulted by this function)
 */
static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
                                     int mm_flags)
{
#if HAVE_SSE_INLINE
    c->vector_clipf = ff_vector_clipf_sse;

    /* The SSE block clearers are used only for samples of at most 8 bits,
     * and not when XvMC acceleration above level 1 is active:
     * XvMCCreateBlocks() may not allocate 16-byte aligned blocks. */
    if (avctx->bits_per_raw_sample <= 8 &&
        !(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
        c->clear_blocks = ff_clear_blocks_sse;
        c->clear_block  = ff_clear_block_sse;
    }
#endif /* HAVE_SSE_INLINE */

#if HAVE_YASM && HAVE_INLINE_ASM && CONFIG_VIDEODSP
    c->gmc = ff_gmc_sse;
#endif /* HAVE_YASM && HAVE_INLINE_ASM && CONFIG_VIDEODSP */
}
/**
 * Install the SSE2 routines into the DSP context.
 *
 * @param c        DSP context whose function pointers are overwritten
 * @param avctx    codec context; bits_per_raw_sample, idct_algo and flags
 *                 are read
 * @param mm_flags CPU feature flags, checked for AV_CPU_FLAG_ATOM and
 *                 AV_CPU_FLAG_SSE2SLOW
 */
static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
                                      int mm_flags)
{
#if HAVE_SSE2_INLINE
    /* The Xvid SSE2 IDCT is only selected on explicit request and for
     * samples of at most 8 bits. */
    if (avctx->idct_algo == FF_IDCT_XVIDMMX &&
        avctx->bits_per_raw_sample <= 8) {
        c->idct_permutation_type = FF_SSE2_IDCT_PERM;
        c->idct                  = ff_idct_xvid_sse2;
        c->idct_add              = ff_idct_xvid_sse2_add;
        c->idct_put              = ff_idct_xvid_sse2_put;
    }
#endif /* HAVE_SSE2_INLINE */

#if HAVE_SSE2_EXTERNAL
    c->bswap_buf                    = ff_bswap32_buf_sse2;
    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
    c->scalarproduct_int16          = ff_scalarproduct_int16_sse2;

    /* CPUs flagged as Atom get the _int_ variant of the clipper. */
    c->vector_clip_int32 = (mm_flags & AV_CPU_FLAG_ATOM)
                         ? ff_vector_clip_int32_int_sse2
                         : ff_vector_clip_int32_sse2;

    /* Outside bit-exact mode the _round variant is installed, unless SSE2
     * is flagged slow, in which case the previous pointer is kept. */
    if (avctx->flags & CODEC_FLAG_BITEXACT)
        c->apply_window_int16 = ff_apply_window_int16_sse2;
    else if ((mm_flags & AV_CPU_FLAG_SSE2SLOW) == 0)
        c->apply_window_int16 = ff_apply_window_int16_round_sse2;
#endif /* HAVE_SSE2_EXTERNAL */
}
/**
 * Install the SSSE3 routines into the DSP context.
 *
 * @param c        DSP context whose function pointers are overwritten
 * @param avctx    codec context (not read by this function)
 * @param mm_flags CPU feature flags, checked for SSE4, ATOM, SSE42 and 3DNOW
 */
static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
                                       int mm_flags)
{
#if HAVE_SSSE3_EXTERNAL
    c->bswap_buf = ff_bswap32_buf_ssse3;

    /* The "_sse4" routine is not really SSE4; the SSE4 flag is used because
     * that routine is just slow on Conroe. */
    c->add_hfyu_left_prediction = (mm_flags & AV_CPU_FLAG_SSE4)
                                ? ff_add_hfyu_left_prediction_sse4
                                : ff_add_hfyu_left_prediction_ssse3;

    c->apply_window_int16 = (mm_flags & AV_CPU_FLAG_ATOM)
                          ? ff_apply_window_int16_ssse3_atom
                          : ff_apply_window_int16_ssse3;

    /* cachesplit: skipped when either SSE4.2 or 3DNow! is flagged */
    if ((mm_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW)) == 0)
        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
#endif /* HAVE_SSSE3_EXTERNAL */
}
/**
 * Install the SSE4 routines into the DSP context.
 *
 * @param c        DSP context; only vector_clip_int32 is overwritten
 * @param avctx    codec context (not read by this function)
 * @param mm_flags CPU feature flags (not consulted by this function)
 */
static av_cold void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
                                      int mm_flags)
{
    /* Both parameters exist only to match the other dsputil_init_*()
     * helpers in this file. */
    (void)avctx;
    (void)mm_flags;

#if HAVE_SSE4_EXTERNAL
    c->vector_clip_int32 = ff_vector_clip_int32_sse4;
#endif /* HAVE_SSE4_EXTERNAL */
}
/**
 * Entry point of the x86 DSP initialisation.
 *
 * Queries the CPU flags once, picks an inline-asm IDCT when MMX is
 * available, then runs the per-extension init helpers in ascending order
 * (MMX, MMXEXT, SSE, SSE2, SSSE3, SSE4) so that a later helper may override
 * pointers set by an earlier one. Finally hands over to the encoder-side
 * init when encoders are compiled in.
 *
 * @param c     DSP context whose function pointers are overwritten
 * @param avctx codec context; idct_algo, lowres and bits_per_raw_sample are
 *              read here and the context is passed on to the helpers
 */
av_cold void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
{
    /* Per-extension helpers, listed in the order they must run. */
    static const struct {
        int flag;
        void (*init)(DSPContext *c, AVCodecContext *avctx, int mm_flags);
    } stages[] = {
        { AV_CPU_FLAG_MMX,    dsputil_init_mmx    },
        { AV_CPU_FLAG_MMXEXT, dsputil_init_mmxext },
        { AV_CPU_FLAG_SSE,    dsputil_init_sse    },
        { AV_CPU_FLAG_SSE2,   dsputil_init_sse2   },
        { AV_CPU_FLAG_SSSE3,  dsputil_init_ssse3  },
        { AV_CPU_FLAG_SSE4,   dsputil_init_sse4   },
    };
    const int cpu_flags = av_get_cpu_flags();
    unsigned  i;

#if HAVE_7REGS && HAVE_INLINE_ASM
    if (cpu_flags & AV_CPU_FLAG_CMOV)
        c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_cmov;
#endif

#if HAVE_INLINE_ASM
    /* IDCT selection: only with MMX, full resolution and samples of at
     * most 8 bits. */
    if ((cpu_flags & AV_CPU_FLAG_MMX) &&
        !avctx->lowres && avctx->bits_per_raw_sample <= 8) {
        const int algo = avctx->idct_algo;

        if (algo == FF_IDCT_AUTO || algo == FF_IDCT_SIMPLEMMX) {
            c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
            c->idct                  = ff_simple_idct_mmx;
            c->idct_add              = ff_simple_idct_add_mmx;
            c->idct_put              = ff_simple_idct_put_mmx;
        } else if (algo == FF_IDCT_XVIDMMX) {
            /* Best available Xvid IDCT; only the SSE2 one also sets a
             * permutation type. */
            if (!(cpu_flags & (AV_CPU_FLAG_SSE2 | AV_CPU_FLAG_MMXEXT))) {
                c->idct     = ff_idct_xvid_mmx;
                c->idct_add = ff_idct_xvid_mmx_add;
                c->idct_put = ff_idct_xvid_mmx_put;
            } else if (!(cpu_flags & AV_CPU_FLAG_SSE2)) {
                c->idct     = ff_idct_xvid_mmxext;
                c->idct_add = ff_idct_xvid_mmxext_add;
                c->idct_put = ff_idct_xvid_mmxext_put;
            } else {
                c->idct_permutation_type = FF_SSE2_IDCT_PERM;
                c->idct                  = ff_idct_xvid_sse2;
                c->idct_add              = ff_idct_xvid_sse2_add;
                c->idct_put              = ff_idct_xvid_sse2_put;
            }
        }
    }
#endif /* HAVE_INLINE_ASM */

    for (i = 0; i < sizeof(stages) / sizeof(stages[0]); i++) {
        if (cpu_flags & stages[i].flag)
            stages[i].init(c, avctx, cpu_flags);
    }

    if (CONFIG_ENCODERS)
        ff_dsputilenc_init_mmx(c, avctx);
}
...@@ -22,70 +22,17 @@ ...@@ -22,70 +22,17 @@
* MMX optimization by Nick Kurshev <nickols_k@mail.ru> * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
*/ */
#include "libavutil/attributes.h" #include "config.h"
#include "libavutil/avassert.h"
#include "libavutil/cpu.h" #include "libavutil/cpu.h"
#include "libavutil/x86/asm.h" #include "libavutil/x86/asm.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/h264dsp.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/simple_idct.h"
#include "libavcodec/videodsp.h" #include "libavcodec/videodsp.h"
#include "constants.h" #include "constants.h"
#include "dsputil_mmx.h" #include "dsputil_mmx.h"
#include "idct_xvid.h"
#include "diracdsp_mmx.h" #include "diracdsp_mmx.h"
//#undef NDEBUG
//#include <assert.h>
void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1,
uint8_t *src2, int dstStride,
int src1Stride, int h);
void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_put_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride, int h);
void ff_avg_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride, int h);
void ff_put_no_rnd_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride,
int h);
void ff_put_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride, int h);
void ff_avg_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride, int h);
void ff_put_no_rnd_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride,
int h);
void ff_put_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride);
void ff_avg_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride);
void ff_put_no_rnd_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride);
void ff_put_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride);
void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride);
void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride);
#define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext
#define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext
#if HAVE_INLINE_ASM #if HAVE_INLINE_ASM
/***********************************/
/* standard MMX */
void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
int line_size) int line_size)
{ {
...@@ -221,7 +168,7 @@ void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, ...@@ -221,7 +168,7 @@ void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
} }
#define CLEAR_BLOCKS(name, n) \ #define CLEAR_BLOCKS(name, n) \
static void name(int16_t *blocks) \ void name(int16_t *blocks) \
{ \ { \
__asm__ volatile ( \ __asm__ volatile ( \
"pxor %%mm7, %%mm7 \n\t" \ "pxor %%mm7, %%mm7 \n\t" \
...@@ -238,10 +185,10 @@ static void name(int16_t *blocks) \ ...@@ -238,10 +185,10 @@ static void name(int16_t *blocks) \
: "%"REG_a \ : "%"REG_a \
); \ ); \
} }
CLEAR_BLOCKS(clear_blocks_mmx, 6) CLEAR_BLOCKS(ff_clear_blocks_mmx, 6)
CLEAR_BLOCKS(clear_block_mmx, 1) CLEAR_BLOCKS(ff_clear_block_mmx, 1)
static void clear_block_sse(int16_t *block) void ff_clear_block_sse(int16_t *block)
{ {
__asm__ volatile ( __asm__ volatile (
"xorps %%xmm0, %%xmm0 \n" "xorps %%xmm0, %%xmm0 \n"
...@@ -258,7 +205,7 @@ static void clear_block_sse(int16_t *block) ...@@ -258,7 +205,7 @@ static void clear_block_sse(int16_t *block)
); );
} }
static void clear_blocks_sse(int16_t *blocks) void ff_clear_blocks_sse(int16_t *blocks)
{ {
__asm__ volatile ( __asm__ volatile (
"xorps %%xmm0, %%xmm0 \n" "xorps %%xmm0, %%xmm0 \n"
...@@ -280,7 +227,7 @@ static void clear_blocks_sse(int16_t *blocks) ...@@ -280,7 +227,7 @@ static void clear_blocks_sse(int16_t *blocks)
); );
} }
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w) void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
{ {
x86_reg i = 0; x86_reg i = 0;
__asm__ volatile ( __asm__ volatile (
...@@ -306,9 +253,9 @@ static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w) ...@@ -306,9 +253,9 @@ static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
} }
#if HAVE_7REGS #if HAVE_7REGS
static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top, void ff_add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
const uint8_t *diff, int w, const uint8_t *diff, int w,
int *left, int *left_top) int *left, int *left_top)
{ {
x86_reg w2 = -w; x86_reg w2 = -w;
x86_reg x; x86_reg x;
...@@ -345,8 +292,8 @@ static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top, ...@@ -345,8 +292,8 @@ static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
/* Draw the edges of width 'w' of an image of size width, height /* Draw the edges of width 'w' of an image of size width, height
* this MMX version can only handle w == 8 || w == 16. */ * this MMX version can only handle w == 8 || w == 16. */
static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, void ff_draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
int w, int h, int sides) int w, int h, int sides)
{ {
uint8_t *ptr, *last_line; uint8_t *ptr, *last_line;
int i; int i;
...@@ -457,402 +404,7 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, ...@@ -457,402 +404,7 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
} }
} }
} }
#endif /* HAVE_INLINE_ASM */
#if HAVE_YASM
PIXELS16(static, ff_avg, , , _mmxext)
PIXELS16(static, ff_put, , , _mmxext)
/*
 * QPEL_OP(OPNAME, RND, MMX) generates two families of static functions,
 *     OPNAME qpel8_mcXY_MMX  and  OPNAME qpel16_mcXY_MMX   (X, Y in 0..3),
 * i.e. 32 functions per instantiation, each with the signature
 *     (uint8_t *dst, uint8_t *src, ptrdiff_t stride).
 * NOTE(review): X and Y presumably are the horizontal and vertical
 * quarter-sample offsets -- confirm against the callers.
 *
 *   OPNAME  pasted into the generated function names and into the helper
 *           that performs the final store into dst
 *           (ff_<OPNAME>pixels*_, ff_<OPNAME>mpeg4_qpel*_lowpass_).
 *   RND     pasted after "ff_put" to select the helpers used for the
 *           intermediate passes into temporaries.
 *   MMX     suffix appended to every generated and called name.
 *
 * The mc00 variants forward directly to ff_<OPNAME>pixels{8,16}_<MMX>.
 * All other variants are built from the *_h_lowpass_, *_v_lowpass_ and
 * pixels*_l2_ helpers, which are not defined in this macro.
 *
 * Temporaries live on the stack as uint64_t arrays accessed through uint8_t
 * pointers. In the variants that use both halfH and halfHV, halfHV occupies
 * the start of the array (64 bytes for qpel8, 256 bytes for qpel16) and
 * halfH starts right after it; halfH holds 9 rows of 8 bytes (qpel8) or
 * 17 rows of 16 bytes (qpel16).
 */
#define QPEL_OP(OPNAME, RND, MMX) \
static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
ff_ ## OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \
} \
\
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[8]; \
uint8_t * const half = (uint8_t*)temp; \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
stride, 8); \
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
stride, stride, 8); \
} \
\
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
ff_ ## OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \
stride, 8); \
} \
\
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[8]; \
uint8_t * const half = (uint8_t*)temp; \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
stride, 8); \
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \
stride, 8); \
} \
\
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[8]; \
uint8_t * const half = (uint8_t*)temp; \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
8, stride); \
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
stride, stride, 8); \
} \
\
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, \
stride, stride); \
} \
\
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[8]; \
uint8_t * const half = (uint8_t*)temp; \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
8, stride); \
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,\
stride, 8); \
} \
\
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t * const halfH = ((uint8_t*)half) + 64; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
stride, 9); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t * const halfH = ((uint8_t*)half) + 64; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
stride, 9); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t * const halfH = ((uint8_t*)half) + 64; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
stride, 9); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t * const halfH = ((uint8_t*)half) + 64; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
stride, 9); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t * const halfH = ((uint8_t*)half) + 64; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t * const halfH = ((uint8_t*)half) + 64; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t * const halfH = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, \
8, stride, 9); \
ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
stride, 8); \
} \
\
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t * const halfH = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
stride, 9); \
ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
stride, 8); \
} \
\
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[9]; \
uint8_t * const halfH = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
stride, 8); \
} \
\
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
ff_ ## OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \
} \
\
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[32]; \
uint8_t * const half = (uint8_t*)temp; \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
stride, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
stride, 16); \
} \
\
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
ff_ ## OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \
stride, stride, 16);\
} \
\
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[32]; \
uint8_t * const half = (uint8_t*)temp; \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
stride, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \
stride, stride, 16); \
} \
\
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[32]; \
uint8_t * const half = (uint8_t*)temp; \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
stride); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
stride, 16); \
} \
\
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, \
stride, stride); \
} \
\
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[32]; \
uint8_t * const half = (uint8_t*)temp; \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
stride); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, \
stride, stride, 16); \
} \
\
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half) + 256; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
stride, 17); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half) + 256; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
stride, 17); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half) + 256; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
stride, 17); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half) + 256; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
stride, 17); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half) + 256; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half) + 256; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
stride, 17); \
ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
stride, 16); \
} \
\
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
stride, 17); \
ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
stride, 16); \
} \
\
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
stride, 16); \
}
/* Generate the put_, avg_ and put_no_rnd_ qpel8/qpel16 function families
 * with the mmxext suffix. put_ and avg_ both pass "_" as RND, so their
 * intermediate passes go through the ff_put_* helpers; put_no_rnd_ passes
 * "_no_rnd_" and therefore uses the ff_put_no_rnd_* helpers throughout. */
QPEL_OP(put_, _, mmxext)
QPEL_OP(avg_, _, mmxext)
QPEL_OP(put_no_rnd_, _no_rnd_, mmxext)
#endif /* HAVE_YASM */
#if HAVE_INLINE_ASM
typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src, typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
ptrdiff_t linesize, int block_w, int block_h, ptrdiff_t linesize, int block_w, int block_h,
int src_x, int src_y, int w, int h); int src_x, int src_y, int w, int h);
...@@ -986,28 +538,28 @@ static av_always_inline void gmc(uint8_t *dst, uint8_t *src, ...@@ -986,28 +538,28 @@ static av_always_inline void gmc(uint8_t *dst, uint8_t *src,
#if CONFIG_VIDEODSP #if CONFIG_VIDEODSP
#if HAVE_YASM #if HAVE_YASM
#if ARCH_X86_32 #if ARCH_X86_32
static void gmc_mmx(uint8_t *dst, uint8_t *src, void ff_gmc_mmx(uint8_t *dst, uint8_t *src,
int stride, int h, int ox, int oy, int stride, int h, int ox, int oy,
int dxx, int dxy, int dyx, int dyy, int dxx, int dxy, int dyx, int dyy,
int shift, int r, int width, int height) int shift, int r, int width, int height)
{ {
gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
width, height, &ff_emulated_edge_mc_8); width, height, &ff_emulated_edge_mc_8);
} }
#endif #endif
static void gmc_sse(uint8_t *dst, uint8_t *src, void ff_gmc_sse(uint8_t *dst, uint8_t *src,
int stride, int h, int ox, int oy, int stride, int h, int ox, int oy,
int dxx, int dxy, int dyx, int dyy, int dxx, int dxy, int dyx, int dyy,
int shift, int r, int width, int height) int shift, int r, int width, int height)
{ {
gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
width, height, &ff_emulated_edge_mc_8); width, height, &ff_emulated_edge_mc_8);
} }
#else #else
static void gmc_mmx(uint8_t *dst, uint8_t *src, void ff_gmc_mmx(uint8_t *dst, uint8_t *src,
int stride, int h, int ox, int oy, int stride, int h, int ox, int oy,
int dxx, int dxy, int dyx, int dyy, int dxx, int dxy, int dyx, int dyy,
int shift, int r, int width, int height) int shift, int r, int width, int height)
{ {
gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
width, height, &ff_emulated_edge_mc_8); width, height, &ff_emulated_edge_mc_8);
...@@ -1042,6 +594,7 @@ void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[ ...@@ -1042,6 +594,7 @@ void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[
} }
#if HAVE_MMX_INLINE #if HAVE_MMX_INLINE
PIXELS16(static, ff_avg, , , _mmxext)
DIRAC_PIXOP(put, ff_put, mmx) DIRAC_PIXOP(put, ff_put, mmx)
DIRAC_PIXOP(avg, ff_avg, mmx) DIRAC_PIXOP(avg, ff_avg, mmx)
#endif #endif
...@@ -1084,8 +637,8 @@ void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, ...@@ -1084,8 +637,8 @@ void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride,
#endif #endif
#endif #endif
static void vector_clipf_sse(float *dst, const float *src, void ff_vector_clipf_sse(float *dst, const float *src,
float min, float max, int len) float min, float max, int len)
{ {
x86_reg i = (len - 16) * 4; x86_reg i = (len - 16) * 4;
__asm__ volatile ( __asm__ volatile (
...@@ -1119,274 +672,3 @@ static void vector_clipf_sse(float *dst, const float *src, ...@@ -1119,274 +672,3 @@ static void vector_clipf_sse(float *dst, const float *src,
} }
#endif /* HAVE_INLINE_ASM */ #endif /* HAVE_INLINE_ASM */
/* Prototypes for routines that are not defined in this part of the file.
 * The init functions further down install them into DSPContext under the
 * HAVE_*_EXTERNAL guards.
 * NOTE(review): presumably these are implemented in external (yasm)
 * assembly -- confirm against the build files. */

/* H.263 loop filters. */
void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale);
void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale);
/* int16 scalar product, with and without the multiply-add variant. */
int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
int order);
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
int order);
int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
const int16_t *v3,
int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
const int16_t *v3,
int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
const int16_t *v3,
int order, int mul);
/* int16 windowing; the non-"round" variants are the ones selected when
 * CODEC_FLAG_BITEXACT is set (see the init functions). */
void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
/* 32-bit byte swapping of a buffer. */
void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
/* HuffYUV prediction helpers. */
void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
const uint8_t *diff, int w,
int *left, int *left_top);
int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
int w, int left);
int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
int w, int left);
/* int32 vector clipping; the _int_sse2 variant is the one selected when
 * AV_CPU_FLAG_ATOM is set (see dsputil_init_sse2). */
void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src,
int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src,
int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src,
int32_t min, int32_t max, unsigned int len);
/*
 * Fill all 16 entries of c->PFX_pixels_tab[IDX] with the functions named
 * PREFIX PFX SIZE _mcXY_ CPU (tokens pasted together). The mcXY variant is
 * stored at index X + 4 * Y, e.g. [1] = mc10, [4] = mc01, [15] = mc33.
 *
 * Requires a variable named `c` in the expanding scope. PREFIX may be left
 * empty, as it is in every use within dsputil_init_mmxext().
 * Wrapped in do { } while (0) so the expansion acts as a single statement.
 */
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
do { \
c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
} while (0)
/**
 * Install the MMX-level routines into the DSP context.
 *
 * @param c        DSP context whose function pointers are updated
 * @param avctx    codec context; only bits_per_raw_sample is consulted
 * @param mm_flags CPU capability flags (not read by this function)
 */
static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
                                     int mm_flags)
{
#if HAVE_MMX_INLINE
    /* These are installed regardless of the sample bit depth. */
    c->add_bytes                 = add_bytes_mmx;
    c->add_pixels_clamped        = ff_add_pixels_clamped_mmx;
    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
    c->put_pixels_clamped        = ff_put_pixels_clamped_mmx;

#if CONFIG_VIDEODSP && (ARCH_X86_32 || !HAVE_YASM)
    c->gmc = gmc_mmx;
#endif

    /* Block clearing and edge drawing are only taken over when
     * bits_per_raw_sample does not exceed 8. */
    if (avctx->bits_per_raw_sample <= 8) {
        c->draw_edges   = draw_edges_mmx;
        c->clear_blocks = clear_blocks_mmx;
        c->clear_block  = clear_block_mmx;
    }
#endif /* HAVE_MMX_INLINE */

#if HAVE_MMX_EXTERNAL
    c->vector_clip_int32 = ff_vector_clip_int32_mmx;

    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        c->h263_h_loop_filter = ff_h263_h_loop_filter_mmx;
        c->h263_v_loop_filter = ff_h263_v_loop_filter_mmx;
    }
#endif /* HAVE_MMX_EXTERNAL */
}
/**
 * Install the MMXEXT-level routines into the DSP context.
 *
 * @param c        DSP context whose function pointers are updated
 * @param avctx    codec context; only flags is consulted
 * @param mm_flags CPU capability flags; AV_CPU_FLAG_3DNOW is checked
 */
static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
                                        int mm_flags)
{
#if HAVE_MMXEXT_EXTERNAL
    /* Quarter-pel tables: index 0 takes the 16-wide functions, index 1 the
     * 8-wide ones. */
    SET_QPEL_FUNCS(put_qpel,        0, 16, mmxext, );
    SET_QPEL_FUNCS(put_qpel,        1,  8, mmxext, );
    SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
    SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, mmxext, );
    SET_QPEL_FUNCS(avg_qpel,        0, 16, mmxext, );
    SET_QPEL_FUNCS(avg_qpel,        1,  8, mmxext, );

    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
    c->scalarproduct_int16          = ff_scalarproduct_int16_mmxext;

    /* The non-rounding window function is used when CODEC_FLAG_BITEXACT is
     * set, the rounding one otherwise. */
    c->apply_window_int16 = (avctx->flags & CODEC_FLAG_BITEXACT)
                            ? ff_apply_window_int16_mmxext
                            : ff_apply_window_int16_round_mmxext;

    /* slower than cmov version on AMD */
    if (!(mm_flags & AV_CPU_FLAG_3DNOW))
        c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;
#endif /* HAVE_MMXEXT_EXTERNAL */
}
/**
 * Install the SSE-level routines into the DSP context.
 *
 * @param c        DSP context whose function pointers are updated
 * @param avctx    codec context; bits_per_raw_sample and xvmc_acceleration
 *                 are consulted
 * @param mm_flags CPU capability flags (not read by this function)
 */
static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
                                     int mm_flags)
{
#if HAVE_SSE_INLINE
    c->vector_clipf = vector_clipf_sse;

    /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks, so the
     * SSE block clearers are left out when the XvMC decoder is configured
     * and xvmc_acceleration is greater than 1. */
    if (avctx->bits_per_raw_sample <= 8 &&
        (!CONFIG_MPEG_XVMC_DECODER || avctx->xvmc_acceleration <= 1)) {
        c->clear_blocks = clear_blocks_sse;
        c->clear_block  = clear_block_sse;
    }
#endif /* HAVE_SSE_INLINE */

#if HAVE_YASM && HAVE_INLINE_ASM && CONFIG_VIDEODSP
    c->gmc = gmc_sse;
#endif /* HAVE_YASM && HAVE_INLINE_ASM && CONFIG_VIDEODSP */
}
/**
 * Install the SSE2-level routines into the DSP context.
 *
 * @param c        DSP context whose function pointers are updated
 * @param avctx    codec context; bits_per_raw_sample, idct_algo and flags
 *                 are consulted
 * @param mm_flags CPU capability flags; AV_CPU_FLAG_ATOM and
 *                 AV_CPU_FLAG_SSE2SLOW are checked
 */
static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
                                      int mm_flags)
{
#if HAVE_SSE2_INLINE
    /* The XviD SSE2 IDCT is only selected on explicit request and when
     * bits_per_raw_sample does not exceed 8. */
    if (avctx->bits_per_raw_sample <= 8 &&
        avctx->idct_algo == FF_IDCT_XVIDMMX) {
        c->idct_permutation_type = FF_SSE2_IDCT_PERM;
        c->idct                  = ff_idct_xvid_sse2;
        c->idct_add              = ff_idct_xvid_sse2_add;
        c->idct_put              = ff_idct_xvid_sse2_put;
    }
#endif /* HAVE_SSE2_INLINE */

#if HAVE_SSE2_EXTERNAL
    c->bswap_buf                    = ff_bswap32_buf_sse2;
    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
    c->scalarproduct_int16          = ff_scalarproduct_int16_sse2;

    c->vector_clip_int32 = (mm_flags & AV_CPU_FLAG_ATOM)
                           ? ff_vector_clip_int32_int_sse2
                           : ff_vector_clip_int32_sse2;

    /* Without CODEC_FLAG_BITEXACT the rounding variant is installed unless
     * the CPU is flagged SSE2SLOW; in that case apply_window_int16 keeps
     * whatever value it had on entry. */
    if (avctx->flags & CODEC_FLAG_BITEXACT)
        c->apply_window_int16 = ff_apply_window_int16_sse2;
    else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW))
        c->apply_window_int16 = ff_apply_window_int16_round_sse2;
#endif /* HAVE_SSE2_EXTERNAL */
}
/**
 * Install the SSSE3-level routines into the DSP context.
 *
 * @param c        DSP context whose function pointers are updated
 * @param avctx    codec context (not read by this function)
 * @param mm_flags CPU capability flags; SSE4, ATOM, SSE42 and 3DNOW bits
 *                 are checked
 */
static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
                                       int mm_flags)
{
#if HAVE_SSSE3_EXTERNAL
    c->bswap_buf = ff_bswap32_buf_ssse3;

    /* not really sse4, just slow on Conroe */
    c->add_hfyu_left_prediction = (mm_flags & AV_CPU_FLAG_SSE4)
                                  ? ff_add_hfyu_left_prediction_sse4
                                  : ff_add_hfyu_left_prediction_ssse3;

    c->apply_window_int16 = (mm_flags & AV_CPU_FLAG_ATOM)
                            ? ff_apply_window_int16_ssse3_atom
                            : ff_apply_window_int16_ssse3;

    /* cachesplit */
    if (!(mm_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW)))
        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
#endif /* HAVE_SSSE3_EXTERNAL */
}
/**
 * Install the SSE4-level routines into the DSP context.
 *
 * @param c        DSP context whose vector_clip_int32 pointer is updated
 * @param avctx    codec context (not read by this function)
 * @param mm_flags CPU capability flags (not read by this function)
 */
static av_cold void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
                                      int mm_flags)
{
#if HAVE_SSE4_EXTERNAL
    c->vector_clip_int32 = ff_vector_clip_int32_sse4;
#endif /* HAVE_SSE4_EXTERNAL */
}
/**
 * Entry point for x86 DSP initialisation.
 *
 * Queries the CPU flags once and then runs the per-instruction-set init
 * helpers in ascending order (MMX, MMXEXT, SSE, SSE2, SSSE3, SSE4), so a
 * later helper may overwrite pointers set by an earlier one. Finally the
 * encoder-side init is invoked when CONFIG_ENCODERS is enabled.
 *
 * @param c     DSP context whose function pointers are updated
 * @param avctx codec context; idct_algo, lowres and bits_per_raw_sample
 *              are consulted here, and it is passed on to the helpers
 */
av_cold void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
{
    const int cpu_flags = av_get_cpu_flags();

#if HAVE_7REGS && HAVE_INLINE_ASM
    if (cpu_flags & AV_CPU_FLAG_CMOV)
        c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
#endif

    if (cpu_flags & AV_CPU_FLAG_MMX) {
#if HAVE_INLINE_ASM
        const int algo = avctx->idct_algo;
        /* The IDCT is only replaced at full resolution and when
         * bits_per_raw_sample does not exceed 8. */
        const int idct_usable = !avctx->lowres &&
                                avctx->bits_per_raw_sample <= 8;

        if (idct_usable &&
            (algo == FF_IDCT_AUTO || algo == FF_IDCT_SIMPLEMMX)) {
            c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
            c->idct                  = ff_simple_idct_mmx;
            c->idct_add              = ff_simple_idct_add_mmx;
            c->idct_put              = ff_simple_idct_put_mmx;
        } else if (idct_usable && algo == FF_IDCT_XVIDMMX) {
            /* Pick the most capable XviD IDCT; only the SSE2 variant also
             * sets the permutation type. */
            if (cpu_flags & AV_CPU_FLAG_SSE2) {
                c->idct_permutation_type = FF_SSE2_IDCT_PERM;
                c->idct                  = ff_idct_xvid_sse2;
                c->idct_add              = ff_idct_xvid_sse2_add;
                c->idct_put              = ff_idct_xvid_sse2_put;
            } else if (cpu_flags & AV_CPU_FLAG_MMXEXT) {
                c->idct     = ff_idct_xvid_mmxext;
                c->idct_add = ff_idct_xvid_mmxext_add;
                c->idct_put = ff_idct_xvid_mmxext_put;
            } else {
                c->idct     = ff_idct_xvid_mmx;
                c->idct_add = ff_idct_xvid_mmx_add;
                c->idct_put = ff_idct_xvid_mmx_put;
            }
        }
#endif /* HAVE_INLINE_ASM */

        dsputil_init_mmx(c, avctx, cpu_flags);
    }

    if (cpu_flags & AV_CPU_FLAG_MMXEXT)
        dsputil_init_mmxext(c, avctx, cpu_flags);

    if (cpu_flags & AV_CPU_FLAG_SSE)
        dsputil_init_sse(c, avctx, cpu_flags);

    if (cpu_flags & AV_CPU_FLAG_SSE2)
        dsputil_init_sse2(c, avctx, cpu_flags);

    if (cpu_flags & AV_CPU_FLAG_SSSE3)
        dsputil_init_ssse3(c, avctx, cpu_flags);

    if (cpu_flags & AV_CPU_FLAG_SSE4)
        dsputil_init_sse4(c, avctx, cpu_flags);

    if (CONFIG_ENCODERS)
        ff_dsputilenc_init_mmx(c, avctx);
}
...@@ -110,6 +110,32 @@ void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_s ...@@ -110,6 +110,32 @@ void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_s
void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_size); void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_size);
void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_size); void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_size);
/* Declarations of the ff_-prefixed helpers so they can be referenced from
 * outside the file that defines them.
 * NOTE(review): presumably consumed by x86/dsputil_init.c, which this commit
 * adds to the build -- confirm. */

/* Block clearing. */
void ff_clear_block_mmx(int16_t *block);
void ff_clear_block_sse(int16_t *block);
void ff_clear_blocks_mmx(int16_t *blocks);
void ff_clear_blocks_sse(int16_t *blocks);
void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w);
void ff_add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
const uint8_t *diff, int w,
int *left, int *left_top);
void ff_draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
int w, int h, int sides);
/* The two gmc variants share one signature. */
void ff_gmc_mmx(uint8_t *dst, uint8_t *src,
int stride, int h, int ox, int oy,
int dxx, int dxy, int dyx, int dyy,
int shift, int r, int width, int height);
void ff_gmc_sse(uint8_t *dst, uint8_t *src,
int stride, int h, int ox, int oy,
int dxx, int dxy, int dyx, int dyy,
int shift, int r, int width, int height);
void ff_vector_clipf_sse(float *dst, const float *src,
float min, float max, int len);
void ff_avg_pixels8_mmx(uint8_t *block, const uint8_t *pixels, void ff_avg_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h); ptrdiff_t line_size, int h);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment