Commit 951455c1 authored by Daniel Kang, committed by Luca Barbato

vp8: implement sliced threading

Testing gives 25-30% gain on HD clips with two threads and
up to 50% gain with eight threads.

Sliced threading uses more memory than single or frame threading.

Frame threading and single threading keep the previous memory
layout.
Signed-off-by: Luca Barbato <lu_zero@gentoo.org>
parent 17343e39
This diff is collapsed.
......@@ -4,6 +4,7 @@
* Copyright (C) 2010 David Conrad
* Copyright (C) 2010 Ronald S. Bultje
* Copyright (C) 2010 Jason Garrett-Glaser
* Copyright (C) 2012 Daniel Kang
*
* This file is part of Libav.
*
......@@ -88,10 +89,40 @@ typedef struct {
} VP8Macroblock;
/**
* Per-thread state for the VP8 decoder: synchronization primitives,
* macroblock-position bookkeeping and per-macroblock scratch buffers.
* NOTE(review): presumably one instance exists per sliced-threading job;
* confirm against the allocation code in vp8.c.
*/
typedef struct {
// NOTE(review): lock/cond presumably protect and signal changes to
// thread_mb_pos / wait_mb_pos across threads -- confirm in vp8.c.
pthread_mutex_t lock;
pthread_cond_t cond;
// NOTE(review): presumably the index of this thread/job -- confirm.
int thread_nr;
// Macroblock position packed into one int: row in the high 16 bits,
// column in the low 16 bits.
int thread_mb_pos; // (mb_y << 16) | (mb_x & 0xFFFF)
// NOTE(review): presumably uses the same packed format as thread_mb_pos
// -- confirm.
int wait_mb_pos; // What the current thread is waiting on.
// NOTE(review): presumably a scratch buffer for edge emulation; its size
// and ownership are not visible in this header -- confirm.
uint8_t *edge_emu_buffer;
/**
* For coeff decode, we need to know whether the above block had non-zero
* coefficients. This means for each macroblock, we need data for 4 luma
* blocks, 2 u blocks, 2 v blocks, and the luma dc block, for a total of 9
* per macroblock. We keep the last row in top_nnz.
*
* NOTE(review): top_nnz is not a field of this struct; left_nnz presumably
* holds the same 9 entries for the macroblock to the left -- confirm.
*/
DECLARE_ALIGNED(8, uint8_t, left_nnz)[9];
/**
* This is the index plus one of the last non-zero coeff
* for each of the blocks in the current macroblock.
* So, 0 -> no coeffs
* 1 -> dc-only (special transform)
* 2+-> full transform
*
* The [6][4] shape matches the first two dimensions of block below.
*/
DECLARE_ALIGNED(16, uint8_t, non_zero_count_cache)[6][4];
// Coefficient storage: 6 groups of 4 blocks, 16 coefficients per block.
DECLARE_ALIGNED(16, DCTELEM, block)[6][4][16];
// 16 coefficients. NOTE(review): presumably the luma dc block mentioned
// in the left_nnz comment -- confirm.
DECLARE_ALIGNED(16, DCTELEM, block_dc)[16];
// NOTE(review): presumably loop-filter strengths used by this thread;
// allocation and element count are not visible here -- confirm.
VP8FilterStrength *filter_strength;
} VP8ThreadData;
// NOTE(review): presumably the upper bound on the number of decoding
// threads/jobs (and thus VP8ThreadData instances) -- confirm in vp8.c.
#define MAX_THREADS 8
typedef struct {
VP8ThreadData *thread_data;
AVCodecContext *avctx;
AVFrame *framep[4];
AVFrame *next_framep[4];
uint8_t *edge_emu_buffer;
AVFrame *curframe;
AVFrame *prev_frame;
uint16_t mb_width; /* number of horizontal MB */
uint16_t mb_height; /* number of vertical MB */
......@@ -128,7 +159,6 @@ typedef struct {
} filter;
VP8Macroblock *macroblocks;
VP8FilterStrength *filter_strength;
uint8_t *intra4x4_pred_mode_top;
uint8_t intra4x4_pred_mode_left[4];
......@@ -169,32 +199,10 @@ typedef struct {
int8_t ref[4];
} lf_delta;
/**
* Cache of the top row needed for intra prediction
* 16 for luma, 8 for each chroma plane
*/
uint8_t (*top_border)[16+8+8];
/**
* For coeff decode, we need to know whether the above block had non-zero
* coefficients. This means for each macroblock, we need data for 4 luma
* blocks, 2 u blocks, 2 v blocks, and the luma dc block, for a total of 9
* per macroblock. We keep the last row in top_nnz.
*/
uint8_t (*top_nnz)[9];
DECLARE_ALIGNED(8, uint8_t, left_nnz)[9];
/**
* This is the index plus one of the last non-zero coeff
* for each of the blocks in the current macroblock.
* So, 0 -> no coeffs
* 1 -> dc-only (special transform)
* 2+-> full transform
*/
DECLARE_ALIGNED(16, uint8_t, non_zero_count_cache)[6][4];
VP56RangeCoder c; ///< header context, includes mb modes and motion vectors
DECLARE_ALIGNED(16, DCTELEM, block)[6][4][16];
DECLARE_ALIGNED(16, DCTELEM, block_dc)[16];
/**
* These are all of the updatable probabilities for binary decisions.
......@@ -247,6 +255,13 @@ typedef struct {
uint8_t *segmentation_maps[5];
int num_maps_to_be_freed;
int maps_are_invalid;
int num_jobs;
/**
* This describes the macroblock memory layout.
* 0 -> Only width+height*2+1 macroblocks allocated (frame/single thread).
* 1 -> Macroblocks for entire frame allocated (sliced thread).
*/
int mb_layout;
} VP8Context;
#endif /* AVCODEC_VP8_H */
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment