Commit 827d43bb authored by Jason Garrett-Glaser

VP8: move zeroing of luma DC block into the WHT

Lets us do the zeroing in asm instead of C.
Also makes it consistent with the way the regular iDCT code does it.

Originally committed as revision 24668 to svn://svn.ffmpeg.org/ffmpeg/trunk
parent 42907c6a
...@@ -117,6 +117,7 @@ typedef struct { ...@@ -117,6 +117,7 @@ typedef struct {
*/ */
DECLARE_ALIGNED(16, uint8_t, non_zero_count_cache)[6][4]; DECLARE_ALIGNED(16, uint8_t, non_zero_count_cache)[6][4];
DECLARE_ALIGNED(16, DCTELEM, block)[6][4][16]; DECLARE_ALIGNED(16, DCTELEM, block)[6][4][16];
DECLARE_ALIGNED(16, DCTELEM, block_dc)[16];
uint8_t intra4x4_pred_mode_mb[16]; uint8_t intra4x4_pred_mode_mb[16];
int chroma_pred_mode; ///< 8x8c pred mode of the current macroblock int chroma_pred_mode; ///< 8x8c pred mode of the current macroblock
...@@ -864,22 +865,19 @@ static av_always_inline ...@@ -864,22 +865,19 @@ static av_always_inline
void decode_mb_coeffs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb, void decode_mb_coeffs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
uint8_t t_nnz[9], uint8_t l_nnz[9]) uint8_t t_nnz[9], uint8_t l_nnz[9])
{ {
LOCAL_ALIGNED_16(DCTELEM, dc,[16]);
int i, x, y, luma_start = 0, luma_ctx = 3; int i, x, y, luma_start = 0, luma_ctx = 3;
int nnz_pred, nnz, nnz_total = 0; int nnz_pred, nnz, nnz_total = 0;
int segment = s->segment; int segment = s->segment;
if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) { if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
AV_ZERO128(dc);
AV_ZERO128(dc+8);
nnz_pred = t_nnz[8] + l_nnz[8]; nnz_pred = t_nnz[8] + l_nnz[8];
// decode DC values and do hadamard // decode DC values and do hadamard
nnz = decode_block_coeffs(c, dc, s->prob->token[1], 0, nnz_pred, nnz = decode_block_coeffs(c, s->block_dc, s->prob->token[1], 0, nnz_pred,
s->qmat[segment].luma_dc_qmul); s->qmat[segment].luma_dc_qmul);
l_nnz[8] = t_nnz[8] = !!nnz; l_nnz[8] = t_nnz[8] = !!nnz;
nnz_total += nnz; nnz_total += nnz;
s->vp8dsp.vp8_luma_dc_wht(s->block, dc); s->vp8dsp.vp8_luma_dc_wht(s->block, s->block_dc);
luma_start = 1; luma_start = 1;
luma_ctx = 0; luma_ctx = 0;
} }
......
...@@ -46,6 +46,10 @@ static void vp8_luma_dc_wht_c(DCTELEM block[4][4][16], DCTELEM dc[16]) ...@@ -46,6 +46,10 @@ static void vp8_luma_dc_wht_c(DCTELEM block[4][4][16], DCTELEM dc[16])
t1 = dc[i*4+1] + dc[i*4+2]; t1 = dc[i*4+1] + dc[i*4+2];
t2 = dc[i*4+1] - dc[i*4+2]; t2 = dc[i*4+1] - dc[i*4+2];
t3 = dc[i*4+0] - dc[i*4+3] + 3; // rounding t3 = dc[i*4+0] - dc[i*4+3] + 3; // rounding
dc[i*4+0] = 0;
dc[i*4+1] = 0;
dc[i*4+2] = 0;
dc[i*4+3] = 0;
*block[i][0] = (t0 + t1) >> 3; *block[i][0] = (t0 + t1) >> 3;
*block[i][1] = (t3 + t2) >> 3; *block[i][1] = (t3 + t2) >> 3;
......
...@@ -224,6 +224,7 @@ extern void ff_vp8_idct_dc_add4y_mmx(uint8_t *dst, DCTELEM block[4][16], int str ...@@ -224,6 +224,7 @@ extern void ff_vp8_idct_dc_add4y_mmx(uint8_t *dst, DCTELEM block[4][16], int str
extern void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, DCTELEM block[4][16], int stride); extern void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, DCTELEM block[4][16], int stride);
extern void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, DCTELEM block[2][16], int stride); extern void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, DCTELEM block[2][16], int stride);
extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]); extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]);
extern void ff_vp8_luma_dc_wht_sse(DCTELEM block[4][4][16], DCTELEM dc[16]);
extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride); extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
extern void ff_vp8_idct_add_sse(uint8_t *dst, DCTELEM block[16], int stride); extern void ff_vp8_idct_add_sse(uint8_t *dst, DCTELEM block[16], int stride);
...@@ -335,6 +336,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c) ...@@ -335,6 +336,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
if (mm_flags & FF_MM_SSE) { if (mm_flags & FF_MM_SSE) {
c->vp8_idct_add = ff_vp8_idct_add_sse; c->vp8_idct_add = ff_vp8_idct_add_sse;
c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_sse;
c->put_vp8_epel_pixels_tab[0][0][0] = c->put_vp8_epel_pixels_tab[0][0][0] =
c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse; c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
} }
......
...@@ -1186,12 +1186,23 @@ VP8_IDCT_ADD sse ...@@ -1186,12 +1186,23 @@ VP8_IDCT_ADD sse
SWAP %1, %4, %3 SWAP %1, %4, %3
%endmacro %endmacro
INIT_MMX %macro VP8_DC_WHT 1
cglobal vp8_luma_dc_wht_mmx, 2,3 cglobal vp8_luma_dc_wht_%1, 2,3
movq m0, [r1] movq m0, [r1]
movq m1, [r1+8] movq m1, [r1+8]
movq m2, [r1+16] movq m2, [r1+16]
movq m3, [r1+24] movq m3, [r1+24]
%ifidn %1, sse
xorps xmm0, xmm0
movaps [r1+ 0], xmm0
movaps [r1+16], xmm0
%else
pxor m4, m4
movq [r1+ 0], m4
movq [r1+ 8], m4
movq [r1+16], m4
movq [r1+24], m4
%endif
HADAMARD4_1D 0, 1, 2, 3 HADAMARD4_1D 0, 1, 2, 3
TRANSPOSE4x4W 0, 1, 2, 3, 4 TRANSPOSE4x4W 0, 1, 2, 3, 4
paddw m0, [pw_3] paddw m0, [pw_3]
...@@ -1203,6 +1214,11 @@ cglobal vp8_luma_dc_wht_mmx, 2,3 ...@@ -1203,6 +1214,11 @@ cglobal vp8_luma_dc_wht_mmx, 2,3
SCATTER_WHT 0, 1, 0 SCATTER_WHT 0, 1, 0
SCATTER_WHT 2, 3, 2 SCATTER_WHT 2, 3, 2
RET RET
%endmacro
INIT_MMX
VP8_DC_WHT mmx
VP8_DC_WHT sse
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim); ; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment