Commit 703c8195 authored by Loren Merritt

mmx implementation of 3-point GMC. (5x faster than C)

Originally committed as revision 5265 to svn://svn.ffmpeg.org/ffmpeg/trunk
parent 841f65f2
...@@ -1144,7 +1144,7 @@ static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y ...@@ -1144,7 +1144,7 @@ static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y
} }
} }
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height) int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{ {
int y, vx, vy; int y, vx, vy;
...@@ -3865,7 +3865,7 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx) ...@@ -3865,7 +3865,7 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
c->add_pixels8 = add_pixels8_c; c->add_pixels8 = add_pixels8_c;
c->add_pixels4 = add_pixels4_c; c->add_pixels4 = add_pixels4_c;
c->gmc1 = gmc1_c; c->gmc1 = gmc1_c;
c->gmc = gmc_c; c->gmc = ff_gmc_c;
c->clear_blocks = clear_blocks_c; c->clear_blocks = clear_blocks_c;
c->pix_sum = pix_sum_c; c->pix_sum = pix_sum_c;
c->pix_norm1 = pix_norm1_c; c->pix_norm1 = pix_norm1_c;
......
...@@ -82,6 +82,9 @@ void ff_shrink22(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, i ...@@ -82,6 +82,9 @@ void ff_shrink22(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, i
void ff_shrink44(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height); void ff_shrink44(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
void ff_shrink88(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height); void ff_shrink88(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height);
/* minimum alignment rules ;) /* minimum alignment rules ;)
if u notice errors in the align stuff, need more alignment for some asm code for some cpu if u notice errors in the align stuff, need more alignment for some asm code for some cpu
or need to use a function with less aligned data then send a mail to the ffmpeg-dev list, ... or need to use a function with less aligned data then send a mail to the ffmpeg-dev list, ...
......
...@@ -2403,6 +2403,126 @@ static void just_return() { return; } ...@@ -2403,6 +2403,126 @@ static void just_return() { return; }
c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\ c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
c->avg_ ## postfix1 = avg_ ## postfix2; c->avg_ ## postfix1 = avg_ ## postfix2;
/**
 * MMX version of the gmc routine for a block that is 8 pixels wide and h rows high.
 *
 * Every output pixel is a weighted sum of a 2x2 source neighbourhood:
 *   (src[0,0]*(s-dx)*(s-dy) + src[1,0]*dx*(s-dy) +
 *    src[0,1]*(s-dx)*dy     + src[1,1]*dx*dy + r) >> (2*shift)
 * with s = 1<<shift. The block is processed as two 4-pixel-wide column
 * strips, four pixels per asm iteration.
 *
 * The MMX path is only taken when the integer source position
 * (ox>>(16+shift), oy>>(16+shift)) is the same at all four block corners and
 * the low 4 bits of dxx/dxy/dyx/dyy are all zero; otherwise the call is
 * forwarded unchanged to ff_gmc_c().
 *
 * @param dst     destination; 4 bytes are stored at dst[x + y*stride]
 * @param src     source plane; offset internally by ix + iy*stride
 * @param stride  row step in bytes, used for both dst and src
 * @param h       number of rows
 * @param ox,oy   fixed-point source position of the first sample
 * @param dxx,dyx per-column increments of the x / y source coordinate
 * @param dxy,dyy per-row increments of the x / y source coordinate
 * @param shift   log2 of the weight scale s
 * @param r       rounding constant added before the final right shift
 * @param width,height  bounds used for the edge test and passed to
 *                ff_emulated_edge_mc()
 *
 * No emms is issued here; presumably the caller takes care of that --
 * TODO confirm.
 */
static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){
    const int w = 8;
    const int s = 1<<shift;
    // integer part of the source position
    const int ix = ox>>(16+shift);
    const int iy = oy>>(16+shift);
    // drop the low 4 bits so the running coordinates fit 16-bit MMX lanes
    const int oxs = ox>>4;
    const int oys = oy>>4;
    const int dxxs = dxx>>4;
    const int dxys = dxy>>4;
    const int dyxs = dyx>>4;
    const int dyys = dyy>>4;
    const uint16_t r4[4] = {r,r,r,r};
    const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
    const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
    const uint64_t shift2 = 2*shift;
    // scratch area for ff_emulated_edge_mc(); variable-length, lives on the stack
    uint8_t edge_buf[(h+1)*stride];
    int x, y;

    // coordinate drift from the first sample to the far column / far row,
    // relative to a pure one-sample-per-pixel step
    const int dxw = (dxx-(1<<(16+shift)))*(w-1);
    const int dyh = (dyy-(1<<(16+shift)))*(h-1);
    const int dxh = dxy*(h-1);
    const int dyw = dyx*(w-1);

    if( // non-constant fullpel offset (3% of blocks)
        (((ox^(ox+dxw)) | (ox^(ox+dxh)) | (ox^(ox+dxw+dxh)) |
          (oy^(oy+dyw)) | (oy^(oy+dyh)) | (oy^(oy+dyw+dyh))) >> (16+shift))
        // uses more than 16 bits of subpel mv (only at huge resolution)
        || ((dxx|dxy|dyx|dyy)&15) )
    {
        //FIXME could still use mmx for some of the rows
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
        return;
    }

    // The kernel reads a (w+1) x (h+1) source window; if that window is not
    // fully inside the picture, build it in edge_buf instead.
    if( (unsigned)ix >= width-w ||
        (unsigned)iy >= height-h )
    {
        ff_emulated_edge_mc(edge_buf, src+ix+iy*stride, stride, w+1, h+1, ix, iy, width, height);
        src = edge_buf;
    }
    else
        src += ix + iy*stride;

    for(x=0; x<w; x+=4){
        // Running x/y coordinates for the four columns of this strip, started
        // one row step early because the row loop adds the step before use.
        uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
                            oxs - dxys + dxxs*(x+1),
                            oxs - dxys + dxxs*(x+2),
                            oxs - dxys + dxxs*(x+3) };
        uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
                            oys - dyys + dyxs*(x+1),
                            oys - dyys + dyxs*(x+2),
                            oys - dyys + dyxs*(x+3) };

        // mm6 = s in all four words, mm7 = 0. Both are read by the asm
        // statements in the row loop below without being declared as
        // clobbers or operands, so this relies on the compiler leaving the
        // MMX registers alone in between.
        // The operand must be a register ("r"): "g" would also permit an
        // immediate, which movd cannot encode.
        asm volatile(
            "movd %0, %%mm6 \n\t"
            "pxor %%mm7, %%mm7 \n\t"
            "punpcklwd %%mm6, %%mm6 \n\t"
            "punpcklwd %%mm6, %%mm6 \n\t"
            :: "r"(s)
        );

        for(y=0; y<h; y++){
            // Advance the coordinates by one row, store them back, and leave
            // the top 4 bits of each 16-bit lane in mm4 (dx) / mm5 (dy).
            // NOTE(review): the shift by 12 is fixed, so the weights are
            // always 4 bits wide independent of `shift` -- confirm this is
            // what is intended for shift < 4.
            asm volatile(
                "movq %0, %%mm4 \n\t"
                "movq %1, %%mm5 \n\t"
                "paddw %2, %%mm4 \n\t"
                "paddw %3, %%mm5 \n\t"
                "movq %%mm4, %0 \n\t"
                "movq %%mm5, %1 \n\t"
                "psrlw $12, %%mm4 \n\t"
                "psrlw $12, %%mm5 \n\t"
                : "+m"(*dx4), "+m"(*dy4)
                : "m"(*dxy4), "m"(*dyy4)
            );

            // Form the four weights, multiply by the four neighbouring source
            // pixels (bytes widened to words via mm7 = 0), add r, shift right
            // by 2*shift, pack to bytes and store 4 pixels.
            asm volatile(
                "movq %%mm6, %%mm2 \n\t"
                "movq %%mm6, %%mm1 \n\t"
                "psubw %%mm4, %%mm2 \n\t"
                "psubw %%mm5, %%mm1 \n\t"
                "movq %%mm2, %%mm0 \n\t"
                "movq %%mm4, %%mm3 \n\t"
                "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
                "pmullw %%mm5, %%mm3 \n\t" // dx*dy
                "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
                "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)

                "movd %4, %%mm5 \n\t"
                "movd %3, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
                "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy

                "movd %2, %%mm5 \n\t"
                "movd %1, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
                "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
                "paddw %%mm3, %%mm2 \n\t"
                "paddw %%mm1, %%mm0 \n\t"
                "paddw %%mm2, %%mm0 \n\t"

                "paddw %5, %%mm0 \n\t"
                "psrlw %6, %%mm0 \n\t"
                "packuswb %%mm0, %%mm0 \n\t"
                "movd %%mm0, %0 \n\t"

                : "=m"(dst[x+y*stride])
                : "m"(src[0]), "m"(src[1]),
                  "m"(src[stride]), "m"(src[stride+1]),
                  "m"(*r4), "m"(shift2)
            );
            src += stride;
        }
        // rewind to the top row and move 4 pixels right for the next strip
        src += 4-h*stride;
    }
}
static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){ static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
long i=0; long i=0;
...@@ -2725,6 +2845,8 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) ...@@ -2725,6 +2845,8 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx; c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx; c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;
c->gmc= gmc_mmx;
c->add_bytes= add_bytes_mmx; c->add_bytes= add_bytes_mmx;
#ifdef CONFIG_ENCODERS #ifdef CONFIG_ENCODERS
c->diff_bytes= diff_bytes_mmx; c->diff_bytes= diff_bytes_mmx;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment