Commit e3f530fe authored by Ronald S. Bultje's avatar Ronald S. Bultje

prores: idct sse2/sse4 optimizations.

~3.0-3.5x as fast as original C version, 1.6x as fast overall.
parent 6aa3cac6
...@@ -545,8 +545,8 @@ static int decode_slice(ProresContext *ctx, int pic_num, int slice_num, ...@@ -545,8 +545,8 @@ static int decode_slice(ProresContext *ctx, int pic_num, int slice_num,
if (ctx->qmat_changed || sf != ctx->prev_slice_sf) { if (ctx->qmat_changed || sf != ctx->prev_slice_sf) {
ctx->prev_slice_sf = sf; ctx->prev_slice_sf = sf;
for (i = 0; i < 64; i++) { for (i = 0; i < 64; i++) {
ctx->qmat_luma_scaled[i] = ctx->qmat_luma[i] * sf; ctx->qmat_luma_scaled[ctx->dsp.idct_permutation[i]] = ctx->qmat_luma[i] * sf;
ctx->qmat_chroma_scaled[i] = ctx->qmat_chroma[i] * sf; ctx->qmat_chroma_scaled[ctx->dsp.idct_permutation[i]] = ctx->qmat_chroma[i] * sf;
} }
} }
......
...@@ -56,6 +56,8 @@ void ff_proresdsp_init(ProresDSPContext *dsp) ...@@ -56,6 +56,8 @@ void ff_proresdsp_init(ProresDSPContext *dsp)
dsp->idct_put = prores_idct_put_c; dsp->idct_put = prores_idct_put_c;
dsp->idct_permutation_type = FF_NO_IDCT_PERM; dsp->idct_permutation_type = FF_NO_IDCT_PERM;
if (HAVE_MMX) ff_proresdsp_x86_init(dsp);
ff_init_scantable_permutation(dsp->idct_permutation, ff_init_scantable_permutation(dsp->idct_permutation,
dsp->idct_permutation_type); dsp->idct_permutation_type);
} }
...@@ -35,4 +35,6 @@ typedef struct { ...@@ -35,4 +35,6 @@ typedef struct {
void ff_proresdsp_init(ProresDSPContext *dsp); void ff_proresdsp_init(ProresDSPContext *dsp);
void ff_proresdsp_x86_init(ProresDSPContext *dsp);
#endif /* AVCODEC_PRORESDSP_H */ #endif /* AVCODEC_PRORESDSP_H */
...@@ -33,6 +33,8 @@ MMX-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_mmx.o ...@@ -33,6 +33,8 @@ MMX-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_mmx.o
YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_yasm.o YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_yasm.o
MMX-OBJS-$(CONFIG_GPL) += x86/idct_mmx.o MMX-OBJS-$(CONFIG_GPL) += x86/idct_mmx.o
MMX-OBJS-$(CONFIG_LPC) += x86/lpc_mmx.o MMX-OBJS-$(CONFIG_LPC) += x86/lpc_mmx.o
YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
MMX-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp-init.o
MMX-OBJS-$(CONFIG_DWT) += x86/snowdsp_mmx.o MMX-OBJS-$(CONFIG_DWT) += x86/snowdsp_mmx.o
MMX-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_mmx.o MMX-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_mmx.o
YASM-OBJS-$(CONFIG_VP3_DECODER) += x86/vp3dsp.o YASM-OBJS-$(CONFIG_VP3_DECODER) += x86/vp3dsp.o
......
...@@ -64,6 +64,8 @@ DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64 ) = {0x0040004000400040ULL, 0x00400 ...@@ -64,6 +64,8 @@ DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64 ) = {0x0040004000400040ULL, 0x00400
DECLARE_ALIGNED(8, const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL; DECLARE_ALIGNED(8, const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL; DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL; DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = {0x0200020002000200ULL, 0x0200020002000200ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019)= {0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0 ) = {0x0000000000000000ULL, 0x0000000000000000ULL}; DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0 ) = {0x0000000000000000ULL, 0x0000000000000000ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1 ) = {0x0101010101010101ULL, 0x0101010101010101ULL}; DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1 ) = {0x0101010101010101ULL, 0x0101010101010101ULL};
......
/*
* Apple ProRes compatible decoder
*
* Copyright (c) 2010-2011 Maxim Poliakovski
*
* This file is part of Libav.
*
* Libav is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* Libav is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with Libav; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavcodec/proresdsp.h"
void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
DCTELEM *block);
void ff_prores_idct_put_10_sse4(uint16_t *dst, int linesize,
DCTELEM *block);
void ff_prores_idct_put_10_avx (uint16_t *dst, int linesize,
DCTELEM *block);
void ff_proresdsp_x86_init(ProresDSPContext *dsp)
{
#if ARCH_X86_64
int flags = av_get_cpu_flags();
if (flags & AV_CPU_FLAG_SSE2) {
dsp->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
dsp->idct_put = ff_prores_idct_put_10_sse2;
}
if (flags & AV_CPU_FLAG_SSE4) {
dsp->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
dsp->idct_put = ff_prores_idct_put_10_sse4;
}
#if HAVE_AVX
if (flags & AV_CPU_FLAG_AVX) {
dsp->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
dsp->idct_put = ff_prores_idct_put_10_avx;
}
#endif
#endif
}
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment