Commit 356ee8d7 authored by Mans Rullgard's avatar Mans Rullgard Committed by Diego Biurrun

x86: clean up ff_dsputil_init_mmx()

This splits ff_dsputil_init_mmx() into multiple functions, one for
each MMX/SSE level, somewhat simplifying the nested conditions.
Signed-off-by: 's avatarMans Rullgard <mans@mansr.com>
Signed-off-by: 's avatarDiego Biurrun <diego@biurrun.de>
parent 3faa141d
......@@ -2336,110 +2336,55 @@ extern void ff_butterflies_float_interleave_sse(float *dst, const float *src0,
extern void ff_butterflies_float_interleave_avx(float *dst, const float *src0,
const float *src1, int len);
void ff_dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
int mm_flags = av_get_cpu_flags();
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
const int bit_depth = avctx->bits_per_raw_sample;
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU
if (avctx->dsp_mask) {
if (avctx->dsp_mask & AV_CPU_FLAG_FORCE)
mm_flags |= (avctx->dsp_mask & 0xffff);
else
mm_flags &= ~(avctx->dsp_mask & 0xffff);
}
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU
#if 0
av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
if (mm_flags & AV_CPU_FLAG_MMX)
av_log(avctx, AV_LOG_INFO, " mmx");
if (mm_flags & AV_CPU_FLAG_MMX2)
av_log(avctx, AV_LOG_INFO, " mmx2");
if (mm_flags & AV_CPU_FLAG_3DNOW)
av_log(avctx, AV_LOG_INFO, " 3dnow");
if (mm_flags & AV_CPU_FLAG_SSE)
av_log(avctx, AV_LOG_INFO, " sse");
if (mm_flags & AV_CPU_FLAG_SSE2)
av_log(avctx, AV_LOG_INFO, " sse2");
av_log(avctx, AV_LOG_INFO, "\n");
#endif
#define H264_QPEL_FUNCS(x, y, CPU) \
c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU; \
c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU; \
c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU; \
c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU
if (mm_flags & AV_CPU_FLAG_MMX) {
const int idct_algo= avctx->idct_algo;
#define H264_QPEL_FUNCS_10(x, y, CPU) \
c->put_h264_qpel_pixels_tab[0][x+y*4] = ff_put_h264_qpel16_mc##x##y##_10_##CPU; \
c->put_h264_qpel_pixels_tab[1][x+y*4] = ff_put_h264_qpel8_mc##x##y##_10_##CPU; \
c->avg_h264_qpel_pixels_tab[0][x+y*4] = ff_avg_h264_qpel16_mc##x##y##_10_##CPU; \
c->avg_h264_qpel_pixels_tab[1][x+y*4] = ff_avg_h264_qpel8_mc##x##y##_10_##CPU;
if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
c->idct_put= ff_simple_idct_put_mmx;
c->idct_add= ff_simple_idct_add_mmx;
c->idct = ff_simple_idct_mmx;
c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
#if CONFIG_GPL
}else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
if(mm_flags & AV_CPU_FLAG_MMX2){
c->idct_put= ff_libmpeg2mmx2_idct_put;
c->idct_add= ff_libmpeg2mmx2_idct_add;
c->idct = ff_mmxext_idct;
}else{
c->idct_put= ff_libmpeg2mmx_idct_put;
c->idct_add= ff_libmpeg2mmx_idct_add;
c->idct = ff_mmx_idct;
}
c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
#endif
}else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER) &&
idct_algo==FF_IDCT_VP3 && HAVE_YASM){
if(mm_flags & AV_CPU_FLAG_SSE2){
c->idct_put= ff_vp3_idct_put_sse2;
c->idct_add= ff_vp3_idct_add_sse2;
c->idct = ff_vp3_idct_sse2;
c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
}else{
c->idct_put= ff_vp3_idct_put_mmx;
c->idct_add= ff_vp3_idct_add_mmx;
c->idct = ff_vp3_idct_mmx;
c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
}
}else if(idct_algo==FF_IDCT_CAVS){
c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
}else if(idct_algo==FF_IDCT_XVIDMMX){
if(mm_flags & AV_CPU_FLAG_SSE2){
c->idct_put= ff_idct_xvid_sse2_put;
c->idct_add= ff_idct_xvid_sse2_add;
c->idct = ff_idct_xvid_sse2;
c->idct_permutation_type= FF_SSE2_IDCT_PERM;
}else if(mm_flags & AV_CPU_FLAG_MMX2){
c->idct_put= ff_idct_xvid_mmx2_put;
c->idct_add= ff_idct_xvid_mmx2_add;
c->idct = ff_idct_xvid_mmx2;
}else{
c->idct_put= ff_idct_xvid_mmx_put;
c->idct_add= ff_idct_xvid_mmx_add;
c->idct = ff_idct_xvid_mmx;
}
}
}
static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
{
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
if (!high_bit_depth) {
c->clear_block = clear_block_mmx;
c->clear_blocks = clear_blocks_mmx;
if ((mm_flags & AV_CPU_FLAG_SSE) &&
!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)){
/* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
c->clear_block = clear_block_sse;
c->clear_blocks = clear_blocks_sse;
}
}
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU
c->draw_edges = draw_edges_mmx;
if (!high_bit_depth) {
SET_HPEL_FUNCS(put, 0, 16, mmx);
SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
SET_HPEL_FUNCS(avg, 0, 16, mmx);
......@@ -2458,26 +2403,30 @@ void ff_dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
c->emulated_edge_mc = emulated_edge_mc_mmx;
#endif
c->add_bytes= add_bytes_mmx;
if (!high_bit_depth)
c->draw_edges = draw_edges_mmx;
c->add_bytes = add_bytes_mmx;
if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
c->h263_v_loop_filter= h263_v_loop_filter_mmx;
c->h263_h_loop_filter= h263_h_loop_filter_mmx;
c->h263_v_loop_filter = h263_v_loop_filter_mmx;
c->h263_h_loop_filter = h263_h_loop_filter_mmx;
}
#if HAVE_YASM
if (!high_bit_depth && CONFIG_H264CHROMA) {
c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_mmx_rnd;
c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_mmx;
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_mmx_rnd;
c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
}
c->vector_clip_int32 = ff_vector_clip_int32_mmx;
#endif
if (mm_flags & AV_CPU_FLAG_MMX2) {
}
static void dsputil_init_mmx2(DSPContext *c, AVCodecContext *avctx,
int mm_flags)
{
const int bit_depth = avctx->bits_per_raw_sample;
const int high_bit_depth = bit_depth > 8;
c->prefetch = prefetch_mmx2;
if (!high_bit_depth) {
......@@ -2496,19 +2445,20 @@ void ff_dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
}
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
if (!high_bit_depth) {
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
}
if (CONFIG_VP3_DECODER && HAVE_YASM) {
c->vp3_v_loop_filter= ff_vp3_v_loop_filter_mmx2;
c->vp3_h_loop_filter= ff_vp3_h_loop_filter_mmx2;
c->vp3_v_loop_filter = ff_vp3_v_loop_filter_mmx2;
c->vp3_h_loop_filter = ff_vp3_h_loop_filter_mmx2;
}
}
if (CONFIG_VP3_DECODER && HAVE_YASM) {
......@@ -2521,24 +2471,6 @@ void ff_dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2;
}
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU
SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2, );
SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2, );
SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2, );
......@@ -2553,8 +2485,7 @@ void ff_dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2, );
SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2, );
SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2, );
}
else if (bit_depth == 10) {
} else if (bit_depth == 10) {
#if HAVE_YASM
#if !ARCH_X86_64
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
......@@ -2574,25 +2505,36 @@ void ff_dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
#if HAVE_YASM
if (!high_bit_depth && CONFIG_H264CHROMA) {
c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_mmx2_rnd;
c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_mmx2;
c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_mmx2;
c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_mmx2;
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_mmx2_rnd;
c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmx2;
c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmx2;
c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmx2;
}
if (bit_depth == 10 && CONFIG_H264CHROMA) {
c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_10_mmxext;
c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_10_mmxext;
c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_10_mmxext;
c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_10_mmxext;
c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmxext;
c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmxext;
c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmxext;
c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmxext;
}
c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2;
c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
if (avctx->flags & CODEC_FLAG_BITEXACT) {
c->apply_window_int16 = ff_apply_window_int16_mmxext_ba;
} else {
c->apply_window_int16 = ff_apply_window_int16_mmxext;
}
#endif
#if HAVE_7REGS
if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW))
c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
#endif
} else if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW)) {
}
static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
int mm_flags)
{
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
c->prefetch = prefetch_3dnow;
if (!high_bit_depth) {
......@@ -2610,11 +2552,12 @@ void ff_dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
if (!(avctx->flags & CODEC_FLAG_BITEXACT)){
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
}
......@@ -2649,20 +2592,72 @@ void ff_dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
#if HAVE_YASM
if (!high_bit_depth && CONFIG_H264CHROMA) {
c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_3dnow_rnd;
c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_3dnow;
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_3dnow_rnd;
c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
}
#endif
c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
c->vector_fmul = vector_fmul_3dnow;
c->vector_fmul_add = vector_fmul_add_3dnow;
#if HAVE_7REGS
c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
#endif
}
static void dsputil_init_3dnow2(DSPContext *c, AVCodecContext *avctx,
int mm_flags)
{
c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
#if HAVE_6REGS
c->vector_fmul_window = vector_fmul_window_3dnow2;
#endif
}
static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
{
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
if (!high_bit_depth) {
if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)){
/* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
c->clear_block = clear_block_sse;
c->clear_blocks = clear_blocks_sse;
}
}
c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
c->ac3_downmix = ac3_downmix_sse;
c->vector_fmul = vector_fmul_sse;
c->vector_fmul_reverse = vector_fmul_reverse_sse;
if (!(mm_flags & AV_CPU_FLAG_3DNOW))
c->vector_fmul_add = vector_fmul_add_sse;
#if HAVE_6REGS
c->vector_fmul_window = vector_fmul_window_sse;
#endif
c->vector_clipf = vector_clipf_sse;
#define H264_QPEL_FUNCS(x, y, CPU)\
c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\
c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\
c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\
c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU;
if((mm_flags & AV_CPU_FLAG_SSE2) && !(mm_flags & AV_CPU_FLAG_3DNOW)){
#if HAVE_YASM
c->scalarproduct_float = ff_scalarproduct_float_sse;
c->butterflies_float_interleave = ff_butterflies_float_interleave_sse;
if (!high_bit_depth)
c->emulated_edge_mc = emulated_edge_mc_sse;
c->gmc = gmc_sse;
#endif
}
static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
int mm_flags)
{
const int bit_depth = avctx->bits_per_raw_sample;
const int high_bit_depth = bit_depth > 8;
if (mm_flags & AV_CPU_FLAG_3DNOW) {
// these functions are slower than mmx on AMD, but faster on Intel
if (!high_bit_depth) {
c->put_pixels_tab[0][0] = put_pixels16_sse2;
......@@ -2671,7 +2666,7 @@ void ff_dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
H264_QPEL_FUNCS(0, 0, sse2);
}
}
if(mm_flags & AV_CPU_FLAG_SSE2){
if (!high_bit_depth) {
H264_QPEL_FUNCS(0, 1, sse2);
H264_QPEL_FUNCS(0, 2, sse2);
......@@ -2686,30 +2681,46 @@ void ff_dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
H264_QPEL_FUNCS(3, 2, sse2);
H264_QPEL_FUNCS(3, 3, sse2);
}
#if HAVE_YASM
#define H264_QPEL_FUNCS_10(x, y, CPU)\
c->put_h264_qpel_pixels_tab[0][x+y*4] = ff_put_h264_qpel16_mc##x##y##_10_##CPU;\
c->put_h264_qpel_pixels_tab[1][x+y*4] = ff_put_h264_qpel8_mc##x##y##_10_##CPU;\
c->avg_h264_qpel_pixels_tab[0][x+y*4] = ff_avg_h264_qpel16_mc##x##y##_10_##CPU;\
c->avg_h264_qpel_pixels_tab[1][x+y*4] = ff_avg_h264_qpel8_mc##x##y##_10_##CPU;
if (bit_depth == 10) {
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_sse2, ff_);
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_sse2, ff_);
H264_QPEL_FUNCS_10(1, 0, sse2_cache64)
H264_QPEL_FUNCS_10(2, 0, sse2_cache64)
H264_QPEL_FUNCS_10(3, 0, sse2_cache64)
H264_QPEL_FUNCS_10(1, 0, sse2_cache64);
H264_QPEL_FUNCS_10(2, 0, sse2_cache64);
H264_QPEL_FUNCS_10(3, 0, sse2_cache64);
if (CONFIG_H264CHROMA) {
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
}
}
#endif
c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
if (mm_flags & AV_CPU_FLAG_ATOM) {
c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
} else {
c->vector_clip_int32 = ff_vector_clip_int32_sse2;
}
if (avctx->flags & CODEC_FLAG_BITEXACT) {
c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
} else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
c->apply_window_int16 = ff_apply_window_int16_sse2;
}
c->bswap_buf = ff_bswap32_buf_sse2;
#endif
}
static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
int mm_flags)
{
#if HAVE_SSSE3
if(mm_flags & AV_CPU_FLAG_SSSE3){
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
const int bit_depth = avctx->bits_per_raw_sample;
if (!high_bit_depth) {
H264_QPEL_FUNCS(1, 0, ssse3);
H264_QPEL_FUNCS(1, 1, ssse3);
......@@ -2726,86 +2737,20 @@ void ff_dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
}
#if HAVE_YASM
else if (bit_depth == 10) {
H264_QPEL_FUNCS_10(1, 0, ssse3_cache64)
H264_QPEL_FUNCS_10(2, 0, ssse3_cache64)
H264_QPEL_FUNCS_10(3, 0, ssse3_cache64)
H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
H264_QPEL_FUNCS_10(3, 0, ssse3_cache64);
}
if (!high_bit_depth && CONFIG_H264CHROMA) {
c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_ssse3_rnd;
c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_ssse3_rnd;
c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_ssse3;
c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_ssse3;
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_ssse3_rnd;
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_ssse3_rnd;
c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_ssse3;
c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_ssse3;
}
c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
#endif
}
#endif
if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW)) {
c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
c->vector_fmul = vector_fmul_3dnow;
}
if (HAVE_AMD3DNOWEXT && (mm_flags & AV_CPU_FLAG_3DNOWEXT)) {
c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
#if HAVE_6REGS
c->vector_fmul_window = vector_fmul_window_3dnow2;
#endif
}
if(mm_flags & AV_CPU_FLAG_MMX2){
#if HAVE_YASM
c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
if (avctx->flags & CODEC_FLAG_BITEXACT) {
c->apply_window_int16 = ff_apply_window_int16_mmxext_ba;
} else {
c->apply_window_int16 = ff_apply_window_int16_mmxext;
}
#endif
}
if(mm_flags & AV_CPU_FLAG_SSE){
c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
c->ac3_downmix = ac3_downmix_sse;
c->vector_fmul = vector_fmul_sse;
c->vector_fmul_reverse = vector_fmul_reverse_sse;
c->vector_fmul_add = vector_fmul_add_sse;
#if HAVE_6REGS
c->vector_fmul_window = vector_fmul_window_sse;
#endif
c->vector_clipf = vector_clipf_sse;
#if HAVE_YASM
c->scalarproduct_float = ff_scalarproduct_float_sse;
c->butterflies_float_interleave = ff_butterflies_float_interleave_sse;
if (!high_bit_depth)
c->emulated_edge_mc = emulated_edge_mc_sse;
c->gmc = gmc_sse;
#endif
}
if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW))
c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse
if(mm_flags & AV_CPU_FLAG_SSE2){
#if HAVE_YASM
c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
if (mm_flags & AV_CPU_FLAG_ATOM) {
c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
} else {
c->vector_clip_int32 = ff_vector_clip_int32_sse2;
}
if (avctx->flags & CODEC_FLAG_BITEXACT) {
c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
} else {
if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
c->apply_window_int16 = ff_apply_window_int16_sse2;
}
}
c->bswap_buf = ff_bswap32_buf_sse2;
#endif
}
if (mm_flags & AV_CPU_FLAG_SSSE3) {
#if HAVE_YASM
if (mm_flags & AV_CPU_FLAG_ATOM) {
c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
} else {
......@@ -2816,22 +2761,28 @@ void ff_dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
}
c->bswap_buf = ff_bswap32_buf_ssse3;
#endif
}
#endif
}
if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE) {
static void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
int mm_flags)
{
#if HAVE_YASM
c->vector_clip_int32 = ff_vector_clip_int32_sse4;
#endif
}
}
static void dsputil_init_avx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
{
#if HAVE_AVX && HAVE_YASM
if (mm_flags & AV_CPU_FLAG_AVX) {
const int bit_depth = avctx->bits_per_raw_sample;
if (bit_depth == 10) {
//AVX implies !cache64.
//TODO: Port cache(32|64) detection from x264.
H264_QPEL_FUNCS_10(1, 0, sse2)
H264_QPEL_FUNCS_10(2, 0, sse2)
H264_QPEL_FUNCS_10(3, 0, sse2)
// AVX implies !cache64.
// TODO: Port cache(32|64) detection from x264.
H264_QPEL_FUNCS_10(1, 0, sse2);
H264_QPEL_FUNCS_10(2, 0, sse2);
H264_QPEL_FUNCS_10(3, 0, sse2);
if (CONFIG_H264CHROMA) {
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
......@@ -2839,9 +2790,116 @@ void ff_dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
}
}
c->butterflies_float_interleave = ff_butterflies_float_interleave_avx;
#endif
}
void ff_dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
int mm_flags = av_get_cpu_flags();
if (avctx->dsp_mask) {
if (avctx->dsp_mask & AV_CPU_FLAG_FORCE)
mm_flags |= (avctx->dsp_mask & 0xffff);
else
mm_flags &= ~(avctx->dsp_mask & 0xffff);
}
#if 0
av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
if (mm_flags & AV_CPU_FLAG_MMX)
av_log(avctx, AV_LOG_INFO, " mmx");
if (mm_flags & AV_CPU_FLAG_MMX2)
av_log(avctx, AV_LOG_INFO, " mmx2");
if (mm_flags & AV_CPU_FLAG_3DNOW)
av_log(avctx, AV_LOG_INFO, " 3dnow");
if (mm_flags & AV_CPU_FLAG_SSE)
av_log(avctx, AV_LOG_INFO, " sse");
if (mm_flags & AV_CPU_FLAG_SSE2)
av_log(avctx, AV_LOG_INFO, " sse2");
av_log(avctx, AV_LOG_INFO, "\n");
#endif
if (mm_flags & AV_CPU_FLAG_MMX) {
const int idct_algo= avctx->idct_algo;
if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
c->idct_put= ff_simple_idct_put_mmx;
c->idct_add= ff_simple_idct_add_mmx;
c->idct = ff_simple_idct_mmx;
c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
#if CONFIG_GPL
}else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
if(mm_flags & AV_CPU_FLAG_MMX2){
c->idct_put= ff_libmpeg2mmx2_idct_put;
c->idct_add= ff_libmpeg2mmx2_idct_add;
c->idct = ff_mmxext_idct;
}else{
c->idct_put= ff_libmpeg2mmx_idct_put;
c->idct_add= ff_libmpeg2mmx_idct_add;
c->idct = ff_mmx_idct;
}
c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
#endif
}else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER) &&
idct_algo==FF_IDCT_VP3 && HAVE_YASM){
if(mm_flags & AV_CPU_FLAG_SSE2){
c->idct_put= ff_vp3_idct_put_sse2;
c->idct_add= ff_vp3_idct_add_sse2;
c->idct = ff_vp3_idct_sse2;
c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
}else{
c->idct_put= ff_vp3_idct_put_mmx;
c->idct_add= ff_vp3_idct_add_mmx;
c->idct = ff_vp3_idct_mmx;
c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
}
}else if(idct_algo==FF_IDCT_CAVS){
c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
}else if(idct_algo==FF_IDCT_XVIDMMX){
if(mm_flags & AV_CPU_FLAG_SSE2){
c->idct_put= ff_idct_xvid_sse2_put;
c->idct_add= ff_idct_xvid_sse2_add;
c->idct = ff_idct_xvid_sse2;
c->idct_permutation_type= FF_SSE2_IDCT_PERM;
}else if(mm_flags & AV_CPU_FLAG_MMX2){
c->idct_put= ff_idct_xvid_mmx2_put;
c->idct_add= ff_idct_xvid_mmx2_add;
c->idct = ff_idct_xvid_mmx2;
}else{
c->idct_put= ff_idct_xvid_mmx_put;
c->idct_add= ff_idct_xvid_mmx_add;
c->idct = ff_idct_xvid_mmx;
}
}
}
dsputil_init_mmx(c, avctx, mm_flags);
}
if (mm_flags & AV_CPU_FLAG_MMX2)
dsputil_init_mmx2(c, avctx, mm_flags);
if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW))
dsputil_init_3dnow(c, avctx, mm_flags);
if (HAVE_AMD3DNOWEXT && (mm_flags & AV_CPU_FLAG_3DNOWEXT))
dsputil_init_3dnow2(c, avctx, mm_flags);
if (HAVE_SSE && (mm_flags & AV_CPU_FLAG_SSE))
dsputil_init_sse(c, avctx, mm_flags);
if (mm_flags & AV_CPU_FLAG_SSE2)
dsputil_init_sse2(c, avctx, mm_flags);
if (mm_flags & AV_CPU_FLAG_SSSE3)
dsputil_init_ssse3(c, avctx, mm_flags);
if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE)
dsputil_init_sse4(c, avctx, mm_flags);
if (mm_flags & AV_CPU_FLAG_AVX)
dsputil_init_avx(c, avctx, mm_flags);
if (CONFIG_ENCODERS)
ff_dsputilenc_init_mmx(c, avctx);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment