Commit c4211046 authored by Jason Garrett-Glaser

Smarter VP8 prefetching

Prefetch all refs (including altref), but only if they've been used so far this
frame.
~2.5% faster overall.

TODO: Do something even smarter, like using how often each ref has been used
so far, so that a couple blocks of a rarely-used ref don't force us to prefetch
it.

Originally committed as revision 24444 to svn://svn.ffmpeg.org/ffmpeg/trunk
parent 5b9eb687
...@@ -123,6 +123,7 @@ typedef struct { ...@@ -123,6 +123,7 @@ typedef struct {
int mbskip_enabled; int mbskip_enabled;
int sign_bias[4]; ///< one state [0, 1] per ref frame type int sign_bias[4]; ///< one state [0, 1] per ref frame type
int ref_count[3];
/** /**
* Base parameters for segmentation, i.e. per-macroblock parameters. * Base parameters for segmentation, i.e. per-macroblock parameters.
...@@ -733,6 +734,7 @@ static void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, ...@@ -733,6 +734,7 @@ static void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
VP56_FRAME_GOLDEN2 /* altref */ : VP56_FRAME_GOLDEN; VP56_FRAME_GOLDEN2 /* altref */ : VP56_FRAME_GOLDEN;
else else
mb->ref_frame = VP56_FRAME_PREVIOUS; mb->ref_frame = VP56_FRAME_PREVIOUS;
s->ref_count[mb->ref_frame-1]++;
// motion vectors, 16.3 // motion vectors, 16.3
find_near_mvs(s, mb, mb_x, mb_y, near, &best, cnt); find_near_mvs(s, mb, mb_x, mb_y, near, &best, cnt);
...@@ -1081,15 +1083,19 @@ static inline void vp8_mc_part(VP8Context *s, uint8_t *dst[3], ...@@ -1081,15 +1083,19 @@ static inline void vp8_mc_part(VP8Context *s, uint8_t *dst[3],
/* Fetch pixels for estimated mv 4 macroblocks ahead. /* Fetch pixels for estimated mv 4 macroblocks ahead.
* Optimized for 64-byte cache lines. Inspired by ffh264 prefetch_motion. */ * Optimized for 64-byte cache lines. Inspired by ffh264 prefetch_motion. */
static inline void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, int x_off, int y_off, int ref) static inline void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, int ref)
{ {
int mx = mb->mv.x + x_off + 8; /* Don't prefetch refs that haven't been used yet this frame. */
int my = mb->mv.y + y_off; if (s->ref_count[ref-1]) {
uint8_t **src= s->framep[ref]->data; int x_off = mb_x << 4, y_off = mb_y << 4;
int off= mx + (my + (mb_x&3)*4)*s->linesize + 64; int mx = mb->mv.x + x_off + 8;
s->dsp.prefetch(src[0]+off, s->linesize, 4); int my = mb->mv.y + y_off;
off= (mx>>1) + ((my>>1) + (mb_x&7))*s->uvlinesize + 64; uint8_t **src= s->framep[ref]->data;
s->dsp.prefetch(src[1]+off, src[2]-src[1], 2); int off= mx + (my + (mb_x&3)*4)*s->linesize + 64;
s->dsp.prefetch(src[0]+off, s->linesize, 4);
off= (mx>>1) + ((my>>1) + (mb_x&7))*s->uvlinesize + 64;
s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
}
} }
/** /**
...@@ -1103,8 +1109,6 @@ static void inter_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb, ...@@ -1103,8 +1109,6 @@ static void inter_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
AVFrame *ref = s->framep[mb->ref_frame]; AVFrame *ref = s->framep[mb->ref_frame];
VP56mv *bmv = mb->bmv; VP56mv *bmv = mb->bmv;
prefetch_motion(s, mb, mb_x, mb_y, x_off, y_off, VP56_FRAME_PREVIOUS);
if (mb->mode < VP8_MVMODE_SPLIT) { if (mb->mode < VP8_MVMODE_SPLIT) {
vp8_mc_part(s, dst, ref, x_off, y_off, vp8_mc_part(s, dst, ref, x_off, y_off,
0, 0, 16, 16, width, height, &mb->mv); 0, 0, 16, 16, width, height, &mb->mv);
...@@ -1179,8 +1183,6 @@ static void inter_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb, ...@@ -1179,8 +1183,6 @@ static void inter_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
8, 8, 8, 8, width, height, &bmv[3]); 8, 8, 8, 8, width, height, &bmv[3]);
break; break;
} }
prefetch_motion(s, mb, mb_x, mb_y, x_off, y_off, VP56_FRAME_GOLDEN);
} }
static void idct_mb(VP8Context *s, uint8_t *y_dst, uint8_t *u_dst, uint8_t *v_dst, static void idct_mb(VP8Context *s, uint8_t *y_dst, uint8_t *u_dst, uint8_t *v_dst,
...@@ -1458,6 +1460,7 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size, ...@@ -1458,6 +1460,7 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
// top edge of 127 for intra prediction // top edge of 127 for intra prediction
memset(s->top_border, 127, (s->mb_width+1)*sizeof(*s->top_border)); memset(s->top_border, 127, (s->mb_width+1)*sizeof(*s->top_border));
memset(s->ref_count, 0, sizeof(s->ref_count));
for (mb_y = 0; mb_y < s->mb_height; mb_y++) { for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
VP56RangeCoder *c = &s->coeff_partition[mb_y & (s->num_coeff_partitions-1)]; VP56RangeCoder *c = &s->coeff_partition[mb_y & (s->num_coeff_partitions-1)];
...@@ -1490,6 +1493,8 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size, ...@@ -1490,6 +1493,8 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
decode_mb_mode(s, mb, mb_x, mb_y, intra4x4_mb, segment_mb); decode_mb_mode(s, mb, mb_x, mb_y, intra4x4_mb, segment_mb);
prefetch_motion(s, mb, mb_x, mb_y, VP56_FRAME_PREVIOUS);
if (!mb->skip) if (!mb->skip)
decode_mb_coeffs(s, c, mb, s->top_nnz[mb_x], s->left_nnz); decode_mb_coeffs(s, c, mb, s->top_nnz[mb_x], s->left_nnz);
else { else {
...@@ -1502,6 +1507,8 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size, ...@@ -1502,6 +1507,8 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
else else
inter_predict(s, dst, mb, mb_x, mb_y); inter_predict(s, dst, mb, mb_x, mb_y);
prefetch_motion(s, mb, mb_x, mb_y, VP56_FRAME_GOLDEN);
if (!mb->skip) { if (!mb->skip) {
idct_mb(s, dst[0], dst[1], dst[2], mb); idct_mb(s, dst[0], dst[1], dst[2], mb);
} else { } else {
...@@ -1518,6 +1525,8 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size, ...@@ -1518,6 +1525,8 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
if (s->deblock_filter) if (s->deblock_filter)
filter_level_for_mb(s, mb, &s->filter_strength[mb_x]); filter_level_for_mb(s, mb, &s->filter_strength[mb_x]);
prefetch_motion(s, mb, mb_x, mb_y, VP56_FRAME_GOLDEN2);
dst[0] += 16; dst[0] += 16;
dst[1] += 8; dst[1] += 8;
dst[2] += 8; dst[2] += 8;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment