Commit 3e20143e authored by Loren Merritt's avatar Loren Merritt

mmx implementation of deblocking strength decision.

2-3% faster h264.

Originally committed as revision 6113 to svn://svn.ffmpeg.org/ffmpeg/trunk
parent 001299bf
......@@ -4111,6 +4111,7 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
c->h264_loop_filter_strength= NULL;
c->h263_h_loop_filter= h263_h_loop_filter_c;
c->h263_v_loop_filter= h263_v_loop_filter_c;
......
......@@ -305,6 +305,9 @@ typedef struct DSPContext {
void (*h264_h_loop_filter_chroma)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
void (*h264_v_loop_filter_chroma_intra)(uint8_t *pix, int stride, int alpha, int beta);
void (*h264_h_loop_filter_chroma_intra)(uint8_t *pix, int stride, int alpha, int beta);
// h264_loop_filter_strength: simd only. the C version is inlined in h264.c
void (*h264_loop_filter_strength)(int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
int bidir, int edges, int step, int mask_mv0, int mask_mv1);
void (*h263_v_loop_filter)(uint8_t *src, int stride, int qscale);
void (*h263_h_loop_filter)(uint8_t *src, int stride, int qscale);
......
......@@ -409,6 +409,7 @@ static VLC run7_vlc;
static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
static always_inline uint32_t pack16to32(int a, int b){
#ifdef WORDS_BIGENDIAN
......@@ -3879,7 +3880,7 @@ static void hl_decode_mb(H264Context *h){
tprintf("call filter_mb\n");
backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
filter_mb(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
}
}
}
......@@ -6694,7 +6695,7 @@ decode_intra_mb:
}
static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) {
static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
int i, d;
const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
const int alpha = alpha_table[index_a];
......@@ -6755,7 +6756,7 @@ static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int bS[4]
}
}
}
static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) {
static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
int i;
const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
const int alpha = alpha_table[index_a];
......@@ -6771,7 +6772,7 @@ static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int bS[4
}
}
static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int bS[8], int qp[2] ) {
static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
int i;
for( i = 0; i < 16; i++, pix += stride) {
int index_a;
......@@ -6869,7 +6870,7 @@ static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int
}
}
}
static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int bS[8], int qp[2] ) {
static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
int i;
for( i = 0; i < 8; i++, pix += stride) {
int index_a;
......@@ -6922,7 +6923,7 @@ static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, in
}
}
static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) {
static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
int i, d;
const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
const int alpha = alpha_table[index_a];
......@@ -6982,7 +6983,7 @@ static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int bS[4]
}
}
static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) {
static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
int i;
const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
const int alpha = alpha_table[index_a];
......@@ -6998,6 +6999,108 @@ static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int bS[4
}
}
static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
MpegEncContext * const s = &h->s;
int mb_xy, mb_type;
int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
if(mb_x==0 || mb_y==0 || !s->dsp.h264_loop_filter_strength) {
filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
return;
}
assert(!FRAME_MBAFF);
mb_xy = mb_x + mb_y*s->mb_stride;
mb_type = s->current_picture.mb_type[mb_xy];
qp = s->current_picture.qscale_table[mb_xy];
qp0 = s->current_picture.qscale_table[mb_xy-1];
qp1 = s->current_picture.qscale_table[h->top_mb_xy];
qpc = get_chroma_qp( h->pps.chroma_qp_index_offset, qp );
qpc0 = get_chroma_qp( h->pps.chroma_qp_index_offset, qp0 );
qpc1 = get_chroma_qp( h->pps.chroma_qp_index_offset, qp1 );
qp0 = (qp + qp0 + 1) >> 1;
qp1 = (qp + qp1 + 1) >> 1;
qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX(0, h->pps.chroma_qp_index_offset);
if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh)
return;
qpc0 = (qpc + qpc0 + 1) >> 1;
qpc1 = (qpc + qpc1 + 1) >> 1;
if( IS_INTRA(mb_type) ) {
int16_t bS4[4] = {4,4,4,4};
int16_t bS3[4] = {3,3,3,3};
if( IS_8x8DCT(mb_type) ) {
filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bS4, qp1 );
filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
} else {
filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bS4, qp1 );
filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
}
filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bS4, qpc1 );
filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bS4, qpc1 );
filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
return;
} else {
DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
== (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
(mb_type & MB_TYPE_16x8) ? 1 : 0;
int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
&& (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
? 3 : 0;
int step = IS_8x8DCT(mb_type) ? 2 : 1;
s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
(h->slice_type == B_TYPE), edges, step, mask_edge0, mask_edge1 );
if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
bSv[0][0] = 0x0004000400040004ULL;
if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
bSv[1][0] = 0x0004000400040004ULL;
#define FILTER(hv,dir,edge)\
if(bSv[dir][edge]) {\
filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
if(!(edge&1)) {\
filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
}\
}
if( edges == 1 ) {
FILTER(v,0,0);
FILTER(h,1,0);
} else if( IS_8x8DCT(mb_type) ) {
FILTER(v,0,0);
FILTER(v,0,2);
FILTER(h,1,0);
FILTER(h,1,2);
} else {
FILTER(v,0,0);
FILTER(v,0,1);
FILTER(v,0,2);
FILTER(v,0,3);
FILTER(h,1,0);
FILTER(h,1,1);
FILTER(h,1,2);
FILTER(h,1,3);
}
#undef FILTER
}
}
static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
MpegEncContext * const s = &h->s;
const int mb_xy= mb_x + mb_y*s->mb_stride;
......@@ -7035,7 +7138,7 @@ static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8
*/
const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
int bS[8];
int16_t bS[8];
int qp[2];
int chroma_qp[2];
int mb_qp, mbn0_qp, mbn1_qp;
......@@ -7114,7 +7217,7 @@ static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8
int mbn_xy = mb_xy - 2 * s->mb_stride;
int qp, chroma_qp;
int i, j;
int bS[4];
int16_t bS[4];
for(j=0; j<2; j++, mbn_xy += s->mb_stride){
if( IS_INTRA(mb_type) ||
......@@ -7150,7 +7253,7 @@ static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8
/* mbn_xy: neighbor macroblock */
const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
const int mbn_type = s->current_picture.mb_type[mbn_xy];
int bS[4];
int16_t bS[4];
int qp;
if( (edge&1) && IS_8x8DCT(mb_type) )
......
......@@ -53,6 +53,9 @@ static const uint64_t ff_pw_32 attribute_used __attribute__ ((aligned(8))) = 0x0
static const uint64_t ff_pw_64 attribute_used __attribute__ ((aligned(8))) = 0x0040004000400040ULL;
static const uint64_t ff_pw_15 attribute_used __attribute__ ((aligned(8))) = 0x000F000F000F000FULL;
static const uint64_t ff_pb_1 attribute_used __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
static const uint64_t ff_pb_3 attribute_used __attribute__ ((aligned(8))) = 0x0303030303030303ULL;
static const uint64_t ff_pb_7 attribute_used __attribute__ ((aligned(8))) = 0x0707070707070707ULL;
static const uint64_t ff_pb_3F attribute_used __attribute__ ((aligned(8))) = 0x3F3F3F3F3F3F3F3FULL;
static const uint64_t ff_pb_FC attribute_used __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL;
......@@ -3282,6 +3285,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2;
c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
......
......@@ -561,6 +561,101 @@ static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int a
transpose4x4(pix-2+4*stride, trans+4, stride, 8);
}
static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
int bidir, int edges, int step, int mask_mv0, int mask_mv1 ) {
int dir;
asm volatile(
"pxor %%mm7, %%mm7 \n\t"
"movq %0, %%mm6 \n\t"
"movq %1, %%mm5 \n\t"
"movq %2, %%mm4 \n\t"
::"m"(ff_pb_1), "m"(ff_pb_3), "m"(ff_pb_7)
);
// could do a special case for dir==0 && edges==1, but it only reduces the
// average filter time by 1.2%
for( dir=1; dir>=0; dir-- ) {
const int d_idx = dir ? -8 : -1;
const int mask_mv = dir ? mask_mv1 : mask_mv0;
const uint64_t mask_dir = dir ? 0 : 0xffffffffffffffffULL;
int b_idx, edge, l;
for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) {
asm volatile(
"pand %0, %%mm0 \n\t"
::"m"(mask_dir)
);
if(!(mask_mv & edge)) {
asm volatile("pxor %%mm0, %%mm0 \n\t":);
for( l = bidir; l >= 0; l-- ) {
asm volatile(
"movd %0, %%mm1 \n\t"
"punpckldq %1, %%mm1 \n\t"
"movq %%mm1, %%mm2 \n\t"
"psrlw $7, %%mm2 \n\t"
"pand %%mm6, %%mm2 \n\t"
"por %%mm2, %%mm1 \n\t" // ref_cache with -2 mapped to -1
"punpckldq %%mm1, %%mm2 \n\t"
"pcmpeqb %%mm2, %%mm1 \n\t"
"paddb %%mm6, %%mm1 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t" // ref[b] != ref[bn]
"por %%mm1, %%mm0 \n\t"
"movq %2, %%mm1 \n\t"
"movq %3, %%mm2 \n\t"
"psubw %4, %%mm1 \n\t"
"psubw %5, %%mm2 \n\t"
"packsswb %%mm2, %%mm1 \n\t"
"paddb %%mm5, %%mm1 \n\t"
"pminub %%mm4, %%mm1 \n\t"
"pcmpeqb %%mm4, %%mm1 \n\t" // abs(mv[b] - mv[bn]) >= limit
"por %%mm1, %%mm0 \n\t"
::"m"(ref[l][b_idx]),
"m"(ref[l][b_idx+d_idx]),
"m"(mv[l][b_idx][0]),
"m"(mv[l][b_idx+2][0]),
"m"(mv[l][b_idx+d_idx][0]),
"m"(mv[l][b_idx+d_idx+2][0])
);
}
}
asm volatile(
"movd %0, %%mm1 \n\t"
"por %1, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"pcmpgtw %%mm7, %%mm1 \n\t" // nnz[b] || nnz[bn]
::"m"(nnz[b_idx]),
"m"(nnz[b_idx+d_idx])
);
asm volatile(
"pcmpeqw %%mm7, %%mm0 \n\t"
"pcmpeqw %%mm7, %%mm0 \n\t"
"psrlw $15, %%mm0 \n\t" // nonzero -> 1
"psrlw $14, %%mm1 \n\t"
"movq %%mm0, %%mm2 \n\t"
"por %%mm1, %%mm2 \n\t"
"psrlw $1, %%mm1 \n\t"
"pandn %%mm2, %%mm1 \n\t"
"movq %%mm1, %0 \n\t"
:"=m"(*bS[dir][edge])
::"memory"
);
}
edges = 4;
step = 1;
}
asm volatile(
"movq (%0), %%mm0 \n\t"
"movq 8(%0), %%mm1 \n\t"
"movq 16(%0), %%mm2 \n\t"
"movq 24(%0), %%mm3 \n\t"
TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4)
"movq %%mm0, (%0) \n\t"
"movq %%mm3, 8(%0) \n\t"
"movq %%mm4, 16(%0) \n\t"
"movq %%mm2, 24(%0) \n\t"
::"r"(bS[0])
:"memory"
);
}
/***********************************/
/* motion compensation */
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment