Commit c988f975 authored by Michael Niedermayer's avatar Michael Niedermayer

Rearchitecting the stitched-up goose, part 1

Run the loop filter per row instead of per MB. This should also make it
much easier to switch to per-frame filtering, and to run it in a
separate thread in the future if some volunteer wants to try.
Overall decoding speedup of 1.7% (single thread on pentium dual / cathedral sample)
This change also allows some optimizations to be tried that would not have
been possible before.

Originally committed as revision 21270 to svn://svn.ffmpeg.org/ffmpeg/trunk
parent 00c4127e
This diff is collapsed.
......@@ -300,7 +300,7 @@ typedef struct H264Context{
* is 64 if not available.
*/
DECLARE_ALIGNED_8(uint8_t, non_zero_count_cache[6*8]);
uint8_t (*non_zero_count)[16];
uint8_t (*non_zero_count)[32];
/**
* Motion vector cache.
......@@ -423,6 +423,7 @@ typedef struct H264Context{
*/
unsigned int ref_count[2]; ///< counts frames or fields, depending on current mb mode
unsigned int list_count;
uint8_t *list_counts; ///< Array of list_count per MB specifying the slice type
Picture *short_ref[32];
Picture *long_ref[32];
Picture default_ref_list[2][32]; ///< base reference list for all slices of a coded picture
......@@ -736,8 +737,8 @@ static void fill_caches(H264Context *h, int mb_type, int for_deblock){
top_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
//FIXME deblocking could skip the intra and nnz parts.
if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
return;
// if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
// return;
/* Wow, what a mess, why didn't they simplify the interlacing & intra
* stuff, I can't imagine that these complex rules are worth it. */
......@@ -793,20 +794,33 @@ static void fill_caches(H264Context *h, int mb_type, int for_deblock){
left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0;
left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0;
if(MB_MBAFF && !IS_INTRA(mb_type)){
if(!IS_INTRA(mb_type)){
int list;
for(list=0; list<h->list_count; list++){
//These values where changed for ease of performing MC, we need to change them back
//FIXME maybe we can make MC and loop filter use the same values or prevent
//the MC code from changing ref_cache and rather use a temporary array.
if(USES_LIST(mb_type,list)){
int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
int8_t *ref;
int y, b_xy;
if(!USES_LIST(mb_type, list)){
fill_rectangle( h->mv_cache[list][scan8[0]], 4, 4, 8, pack16to32(0,0), 4);
*(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
*(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
ref += h->b8_stride;
*(uint32_t*)&h->ref_cache[list][scan8[ 2]] =
*(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
*(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
*(uint32_t*)&h->ref_cache[list][scan8[10]] = ((LIST_NOT_USED)&0xFF)*0x01010101;
continue;
}
ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
*(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
*(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
ref += h->b8_stride;
*(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
*(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
for(y=0; y<4; y++){
*(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y]= *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride];
*(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y]= *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride];
}
}
}
}else{
......@@ -1196,6 +1210,23 @@ static inline void write_back_non_zero_count(H264Context *h){
h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
//FIXME sort better how things are stored in non_zero_count
h->non_zero_count[mb_xy][13]= h->non_zero_count_cache[6+8*1];
h->non_zero_count[mb_xy][14]= h->non_zero_count_cache[6+8*2];
h->non_zero_count[mb_xy][15]= h->non_zero_count_cache[6+8*3];
h->non_zero_count[mb_xy][16]= h->non_zero_count_cache[5+8*1];
h->non_zero_count[mb_xy][17]= h->non_zero_count_cache[5+8*2];
h->non_zero_count[mb_xy][18]= h->non_zero_count_cache[5+8*3];
h->non_zero_count[mb_xy][19]= h->non_zero_count_cache[4+8*1];
h->non_zero_count[mb_xy][20]= h->non_zero_count_cache[4+8*2];
h->non_zero_count[mb_xy][21]= h->non_zero_count_cache[4+8*3];
h->non_zero_count[mb_xy][22]= h->non_zero_count_cache[1+8*1];
h->non_zero_count[mb_xy][23]= h->non_zero_count_cache[1+8*4];
}
static inline void write_back_motion(H264Context *h, int mb_type){
......@@ -1271,7 +1302,7 @@ static void decode_mb_skip(H264Context *h){
const int mb_xy= h->mb_xy;
int mb_type=0;
memset(h->non_zero_count[mb_xy], 0, 16);
memset(h->non_zero_count[mb_xy], 0, 32);
memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
if(MB_FIELD)
......
......@@ -1392,7 +1392,7 @@ decode_intra_mb:
// In deblocking, the quantizer is 0
s->current_picture.qscale_table[mb_xy]= 0;
// All coeffs are present
memset(h->non_zero_count[mb_xy], 16, 16);
memset(h->non_zero_count[mb_xy], 16, 32);
s->current_picture.mb_type[mb_xy]= mb_type;
h->last_qscale_diff = 0;
return 0;
......
......@@ -620,7 +620,7 @@ decode_intra_mb:
// In deblocking, the quantizer is 0
s->current_picture.qscale_table[mb_xy]= 0;
// All coeffs are present
memset(h->non_zero_count[mb_xy], 16, 16);
memset(h->non_zero_count[mb_xy], 16, 32);
s->current_picture.mb_type[mb_xy]= mb_type;
return 0;
......
......@@ -620,7 +620,7 @@ static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, u
// Do not use s->qscale as luma quantizer because it has not the same
// value in IPCM macroblocks.
qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
//tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
//tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp[0], s->current_picture.qscale_table[mbn_xy]);
tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
//{ int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
if( dir == 0 ) {
......@@ -650,6 +650,7 @@ void ff_h264_filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint
const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
int first_vertical_edge_done = 0;
av_unused int dir;
int list;
//for sufficiently low qp, filtering wouldn't do anything
//this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
......@@ -663,6 +664,35 @@ void ff_h264_filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint
}
}
h->non_zero_count_cache[7+8*1]=h->non_zero_count[mb_xy][0];
h->non_zero_count_cache[7+8*2]=h->non_zero_count[mb_xy][1];
h->non_zero_count_cache[7+8*3]=h->non_zero_count[mb_xy][2];
h->non_zero_count_cache[7+8*4]=h->non_zero_count[mb_xy][3];
h->non_zero_count_cache[4+8*4]=h->non_zero_count[mb_xy][4];
h->non_zero_count_cache[5+8*4]=h->non_zero_count[mb_xy][5];
h->non_zero_count_cache[6+8*4]=h->non_zero_count[mb_xy][6];
h->non_zero_count_cache[1+8*2]=h->non_zero_count[mb_xy][9];
h->non_zero_count_cache[2+8*2]=h->non_zero_count[mb_xy][8];
h->non_zero_count_cache[2+8*1]=h->non_zero_count[mb_xy][7];
h->non_zero_count_cache[1+8*5]=h->non_zero_count[mb_xy][12];
h->non_zero_count_cache[2+8*5]=h->non_zero_count[mb_xy][11];
h->non_zero_count_cache[2+8*4]=h->non_zero_count[mb_xy][10];
h->non_zero_count_cache[6+8*1]=h->non_zero_count[mb_xy][13];
h->non_zero_count_cache[6+8*2]=h->non_zero_count[mb_xy][14];
h->non_zero_count_cache[6+8*3]=h->non_zero_count[mb_xy][15];
h->non_zero_count_cache[5+8*1]=h->non_zero_count[mb_xy][16];
h->non_zero_count_cache[5+8*2]=h->non_zero_count[mb_xy][17];
h->non_zero_count_cache[5+8*3]=h->non_zero_count[mb_xy][18];
h->non_zero_count_cache[4+8*1]=h->non_zero_count[mb_xy][19];
h->non_zero_count_cache[4+8*2]=h->non_zero_count[mb_xy][20];
h->non_zero_count_cache[4+8*3]=h->non_zero_count[mb_xy][21];
h->non_zero_count_cache[1+8*1]=h->non_zero_count[mb_xy][22];
h->non_zero_count_cache[1+8*4]=h->non_zero_count[mb_xy][23];
// CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
if(!h->pps.cabac && h->pps.transform_8x8_mode){
int top_type, left_type[2];
......@@ -687,16 +717,16 @@ void ff_h264_filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint
if(IS_8x8DCT(mb_type)){
h->non_zero_count_cache[scan8[0 ]]= h->non_zero_count_cache[scan8[1 ]]=
h->non_zero_count_cache[scan8[2 ]]= h->non_zero_count_cache[scan8[3 ]]= h->cbp & 1;
h->non_zero_count_cache[scan8[2 ]]= h->non_zero_count_cache[scan8[3 ]]= h->cbp_table[mb_xy] & 1;
h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp & 2;
h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp_table[mb_xy] & 2;
h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp & 4;
h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp_table[mb_xy] & 4;
h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp & 8;
h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp_table[mb_xy] & 8;
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment