Commit fd8b90f5 authored by Ronald S. Bultje

vp9: fix overflow in 8x8 topleft 32x32 idct ssse3 version.

Also disable the mmx/iwht optimization when the bitexact flag is set.
With synthetically coded coefficients (i.e. those that lead to a
residual well outside the [-255,255] range), our optimizations will
overflow. It doesn't make sense to fix the overflows, since they can
only occur on synthetic input, not on real fwht-generated input. Thus,
add a bitexact flag that disables this optimization.
parent 4bb9dbe4
...@@ -360,7 +360,7 @@ static int update_size(AVCodecContext *ctx, int w, int h, enum AVPixelFormat fmt ...@@ -360,7 +360,7 @@ static int update_size(AVCodecContext *ctx, int w, int h, enum AVPixelFormat fmt
av_freep(&s->block_base); av_freep(&s->block_base);
if (s->bpp != s->last_bpp) { if (s->bpp != s->last_bpp) {
ff_vp9dsp_init(&s->dsp, s->bpp); ff_vp9dsp_init(&s->dsp, s->bpp, ctx->flags & AV_CODEC_FLAG_BITEXACT);
ff_videodsp_init(&s->vdsp, s->bpp); ff_videodsp_init(&s->vdsp, s->bpp);
s->last_bpp = s->bpp; s->last_bpp = s->bpp;
} }
......
...@@ -25,7 +25,7 @@ ...@@ -25,7 +25,7 @@
#include "libavutil/common.h" #include "libavutil/common.h"
#include "vp9dsp.h" #include "vp9dsp.h"
av_cold void ff_vp9dsp_init(VP9DSPContext *dsp, int bpp) av_cold void ff_vp9dsp_init(VP9DSPContext *dsp, int bpp, int bitexact)
{ {
if (bpp == 8) { if (bpp == 8) {
ff_vp9dsp_init_8(dsp); ff_vp9dsp_init_8(dsp);
...@@ -36,6 +36,6 @@ av_cold void ff_vp9dsp_init(VP9DSPContext *dsp, int bpp) ...@@ -36,6 +36,6 @@ av_cold void ff_vp9dsp_init(VP9DSPContext *dsp, int bpp)
ff_vp9dsp_init_12(dsp); ff_vp9dsp_init_12(dsp);
} }
if (ARCH_X86) ff_vp9dsp_init_x86(dsp, bpp); if (ARCH_X86) ff_vp9dsp_init_x86(dsp, bpp, bitexact);
if (ARCH_MIPS) ff_vp9dsp_init_mips(dsp, bpp); if (ARCH_MIPS) ff_vp9dsp_init_mips(dsp, bpp);
} }
...@@ -120,13 +120,13 @@ typedef struct VP9DSPContext { ...@@ -120,13 +120,13 @@ typedef struct VP9DSPContext {
vp9_scaled_mc_func smc[5][4][2]; vp9_scaled_mc_func smc[5][4][2];
} VP9DSPContext; } VP9DSPContext;
void ff_vp9dsp_init(VP9DSPContext *dsp, int bpp); void ff_vp9dsp_init(VP9DSPContext *dsp, int bpp, int bitexact);
void ff_vp9dsp_init_8(VP9DSPContext *dsp); void ff_vp9dsp_init_8(VP9DSPContext *dsp);
void ff_vp9dsp_init_10(VP9DSPContext *dsp); void ff_vp9dsp_init_10(VP9DSPContext *dsp);
void ff_vp9dsp_init_12(VP9DSPContext *dsp); void ff_vp9dsp_init_12(VP9DSPContext *dsp);
void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp); void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact);
void ff_vp9dsp_init_mips(VP9DSPContext *dsp, int bpp); void ff_vp9dsp_init_mips(VP9DSPContext *dsp, int bpp);
#endif /* AVCODEC_VP9DSP_H */ #endif /* AVCODEC_VP9DSP_H */
...@@ -307,7 +307,7 @@ ipred_func(32, tm, avx2); ...@@ -307,7 +307,7 @@ ipred_func(32, tm, avx2);
#endif /* HAVE_YASM */ #endif /* HAVE_YASM */
av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp) av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
{ {
#if HAVE_YASM #if HAVE_YASM
int cpu_flags; int cpu_flags;
...@@ -388,10 +388,12 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp) ...@@ -388,10 +388,12 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp)
if (EXTERNAL_MMX(cpu_flags)) { if (EXTERNAL_MMX(cpu_flags)) {
init_fpel(4, 0, 4, put, mmx); init_fpel(4, 0, 4, put, mmx);
init_fpel(3, 0, 8, put, mmx); init_fpel(3, 0, 8, put, mmx);
if (!bitexact) {
dsp->itxfm_add[4 /* lossless */][DCT_DCT] = dsp->itxfm_add[4 /* lossless */][DCT_DCT] =
dsp->itxfm_add[4 /* lossless */][ADST_DCT] = dsp->itxfm_add[4 /* lossless */][ADST_DCT] =
dsp->itxfm_add[4 /* lossless */][DCT_ADST] = dsp->itxfm_add[4 /* lossless */][DCT_ADST] =
dsp->itxfm_add[4 /* lossless */][ADST_ADST] = ff_vp9_iwht_iwht_4x4_add_mmx; dsp->itxfm_add[4 /* lossless */][ADST_ADST] = ff_vp9_iwht_iwht_4x4_add_mmx;
}
init_ipred(8, mmx, v, VERT); init_ipred(8, mmx, v, VERT);
} }
......
...@@ -1127,10 +1127,14 @@ IADST8_FN iadst, IADST8, iadst, IADST8, avx, 16 ...@@ -1127,10 +1127,14 @@ IADST8_FN iadst, IADST8, iadst, IADST8, avx, 16
pmulhrsw m7, m4, [pw_16069x2] ; t6-7 pmulhrsw m7, m4, [pw_16069x2] ; t6-7
pmulhrsw m4, [pw_3196x2] ; t4-5 pmulhrsw m4, [pw_3196x2] ; t4-5
%if 0 ; overflows :(
paddw m6, m7, m4 paddw m6, m7, m4
psubw m5, m7, m4 psubw m5, m7, m4
pmulhrsw m5, [pw_11585x2] ; t5 pmulhrsw m5, [pw_11585x2] ; t5
pmulhrsw m6, [pw_11585x2] ; t6 pmulhrsw m6, [pw_11585x2] ; t6
%else
VP9_UNPACK_MULSUB_2W_4X 5, 6, 7, 4, 11585, 11585, [pd_8192], 0, 1 ; t5, t6
%endif
psubw m0, m3, m7 psubw m0, m3, m7
paddw m7, m3 paddw m7, m3
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment