Commit d2bb7db1 authored by Loren Merritt

sort H.264 mmx dsp functions into their own file

Originally committed as revision 4338 to svn://svn.ffmpeg.org/ffmpeg/trunk
parent c26ae41d
......@@ -50,7 +50,7 @@ static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*
"paddw %3, %%mm4\n\t"
"psubw %%mm7, %%mm4\n\t" /* mm4 = A = xy - (8x+8y) + 64 */
"pxor %%mm7, %%mm7\n\t"
: "=m" (DD) : "rm" (x), "rm" (y), "m" (sixtyfour));
: "=m" (DD) : "rm" (x), "rm" (y), "m" (ff_pw_64));
asm volatile("movq %%mm4, %0" : "=m" (AA));
......@@ -134,7 +134,7 @@ static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*
"packuswb %%mm3, %%mm2\n\t"
H264_CHROMA_OP(%0, %%mm2)
"movq %%mm2, %0\n\t"
: "=m" (dst[0]) : "m" (thirtytwo));
: "=m" (dst[0]) : "m" (ff_pw_32));
dst+= stride;
}
}
This diff is collapsed.
This diff is collapsed.
......@@ -598,90 +598,3 @@ declare_idct (ff_mmxext_idct, mmxext_table,
declare_idct (ff_mmx_idct, mmx_table,
mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid)
/* in/out: mma=mma+mmb, mmb=mmb-mma */
/*
 * Butterfly on packed 16-bit words, emitted as inline-asm text:
 *   a <- a + b
 *   b <- b - a      (using the value of a from before the macro)
 * The difference is formed without a scratch register: b is doubled, then
 * the already-computed sum is subtracted, i.e. 2b - (a + b) = b - a.
 * Arguments are pasted in as register names via stringification.
 */
#define SUMSUB_BA( a, b ) \
"paddw "#b", "#a" \n\t"\
"paddw "#b", "#b" \n\t"\
"psubw "#a", "#b" \n\t"
/*
 * Two independent sum/difference butterflies with their instructions
 * interleaved: the same three steps as SUMSUB_BA applied to the pair (a, b)
 * and to the pair (c, d).
 *   a <- a + b,  b <- b - a      (old a)
 *   c <- c + d,  d <- d - c      (old c)
 */
#define SUMSUB_BADC( a, b, c, d ) \
"paddw "#b", "#a" \n\t"\
"paddw "#d", "#c" \n\t"\
"paddw "#b", "#b" \n\t"\
"paddw "#d", "#d" \n\t"\
"psubw "#a", "#b" \n\t"\
"psubw "#c", "#d" \n\t"
/*
 * Butterfly with halved operands on packed 16-bit words; t is a scratch
 * register that is overwritten with the original value of b.
 *   b <- a + (b >> 1)
 *   a <- (a >> 1) - b      (using the value of b from before the macro)
 * The shifts are arithmetic (psraw), so the sign is preserved.
 */
#define SUMSUBD2_AB( a, b, t ) \
"movq "#b", "#t" \n\t"\
"psraw $1 , "#b" \n\t"\
"paddw "#a", "#b" \n\t"\
"psraw $1 , "#a" \n\t"\
"psubw "#t", "#a" \n\t"
/*
 * One 1-D pass of the 4-point transform over four registers, built from
 * the butterflies above:
 *   1. SUMSUB_BA   on (s02, d02)   - plain sum / difference
 *   2. SUMSUBD2_AB on (s13, d13)   - sum / difference with halved terms
 *   3. SUMSUB_BADC combining the two intermediate pairs into the outputs
 * All four argument registers are updated in place and t is clobbered as
 * scratch. Note that after step 3 the results no longer sit in the
 * registers their parameter names suggest; see the comments at each call
 * site for the resulting register assignment.
 */
#define IDCT4_1D( s02, s13, d02, d13, t ) \
SUMSUB_BA ( s02, d02 )\
SUMSUBD2_AB( s13, d13, t )\
SUMSUB_BADC( d13, s02, s13, d02 )
/*
 * Interleave the elements of a and b.
 *   n selects the punpckl/punpckh element size suffix (the callers below
 *     pass wd for words and dq for doublewords).
 *   a <- low-half elements of a and b interleaved
 *   t <- high-half elements of a and b interleaved (t's old value is lost)
 *   b is left unchanged.
 * The letter comments show the element layout for the word case, with
 * a = "abcd" and b = "efgh".
 */
#define SBUTTERFLY( a, b, t, n ) \
"movq "#a", "#t" \n\t" /* abcd */\
"punpckl"#n" "#b", "#a" \n\t" /* aebf */\
"punpckh"#n" "#b", "#t" \n\t" /* cgdh */
/*
 * Transpose a 4x4 matrix of 16-bit words held in registers a, b, c, d
 * (one row per register), using t as an extra register.
 * Two word-level interleaves are followed by two doubleword-level ones.
 * Registers are reused as scratch along the way, so the transposed rows do
 * NOT come back in a, b, c, d: per the trailing comments they end up in
 * a, d, t, c (in that row order), and b holds no result.
 */
#define TRANSPOSE4( a, b, c, d, t ) \
SBUTTERFLY( a, b, t, wd ) /* a=aebf t=cgdh */\
SBUTTERFLY( c, d, b, wd ) /* c=imjn b=kolp */\
SBUTTERFLY( a, c, d, dq ) /* a=aeim d=bfjn */\
SBUTTERFLY( t, b, c, dq ) /* t=cgko c=dhlp */
/*
 * Add four 16-bit residuals to four destination bytes and store them back.
 *   p - register with the residual words; arithmetically shifted right by 6
 *       first, then overwritten with the packed result
 *   t - scratch register, receives the four bytes loaded from memory
 *   z - register interleaved with the loaded bytes and used as the upper
 *       half when packing; the caller in this file zeroes it with pxor, in
 *       which case the bytes are zero-extended to words
 * The memory address is hard-coded as asm operand %0, so the enclosing asm
 * statement must bind operand 0 to a register holding the destination
 * pointer. The addition saturates as signed words (paddsw) and the pack
 * saturates to unsigned bytes (packuswb).
 */
#define STORE_DIFF_4P( p, t, z ) \
"psraw $6, "#p" \n\t"\
"movd (%0), "#t" \n\t"\
"punpcklbw "#z", "#t" \n\t"\
"paddsw "#t", "#p" \n\t"\
"packuswb "#z", "#p" \n\t"\
"movd "#p", (%0) \n\t"
/* Four packed 16-bit words, each equal to 32 (0x0020); 8-byte aligned so it
 * can be used as a movq memory operand. attribute_used keeps the symbol even
 * though it is referenced only from inline asm. */
static const uint64_t ff_pw_32 attribute_used __attribute__ ((aligned(8))) = 0x0020002000200020ULL;
/**
 * Inverse-transform a 4x4 block of 16-bit coefficients and add the result
 * to a 4x4 area of 8-bit samples.
 *
 * Processing: load the 16 coefficients, run IDCT4_1D, transpose, add the
 * bias of 32, run IDCT4_1D again, then for each of the four rows shift
 * right by 6, add to the destination bytes and store with saturation to
 * the 0..255 range (see STORE_DIFF_4P).
 *
 * @param dst    top-left destination sample; 4 bytes are read and written
 *               on each of 4 rows
 * @param block  16 coefficients (32 bytes), read only by this function
 * @param stride distance in bytes between consecutive destination rows
 *
 * The work is split across three asm statements that pass values to each
 * other in mm0-mm7; no MMX register is declared as clobbered and no emms
 * is executed here, so MMX state is left live on return.
 * NOTE(review): callers are presumably responsible for emms - confirm.
 */
void ff_h264_idct_add_mmx2(uint8_t *dst, int16_t *block, int stride)
{
    /* Load dct coeffs */
    asm volatile(
        "movq (%0), %%mm0 \n\t"
        "movq 8(%0), %%mm1 \n\t"
        "movq 16(%0), %%mm2 \n\t"
        "movq 24(%0), %%mm3 \n\t"
        :: "r"(block)
        /* block is only passed as a pointer in a register, so tell the
         * compiler that memory is read here; otherwise earlier stores to
         * block[] could legally be moved past this statement. */
        : "memory" );

    asm volatile(
        /* mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13 */
        IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 )

        "movq %0, %%mm6 \n\t"
        /* in: 1,4,0,2 out: 1,2,3,0 */
        TRANSPOSE4( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 )

        /* Add 32 to every word of mm3 before the second pass. mm3 enters
         * the first butterfly of that pass and so reaches all four output
         * rows; together with the >>6 in STORE_DIFF_4P this rounds the
         * result instead of truncating it. */
        "paddw %%mm6, %%mm3 \n\t"

        /* mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13 */
        IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 )

        /* mm7 = 0, used as the zero register by STORE_DIFF_4P below */
        "pxor %%mm7, %%mm7 \n\t"
        :: "m"(ff_pw_32));

    /* Add the four result rows to dst, advancing by stride between rows.
     * mm1 is free after the second pass and serves as scratch. */
    asm volatile(
        STORE_DIFF_4P( %%mm0, %%mm1, %%mm7)
        "add %1, %0 \n\t"
        STORE_DIFF_4P( %%mm2, %%mm1, %%mm7)
        "add %1, %0 \n\t"
        STORE_DIFF_4P( %%mm3, %%mm1, %%mm7)
        "add %1, %0 \n\t"
        STORE_DIFF_4P( %%mm4, %%mm1, %%mm7)
        : "+r"(dst)
        /* widened to long so the operand is as wide as the pointer
         * register it is added to */
        : "r" ((long)stride)
        /* the destination is read and written through (%0) only, which
         * the compiler cannot see without a memory clobber */
        : "memory"
    );
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment