Commit 4af7bcc1 authored by Arpi

MMX/MMXEXT iDCT support, using external functions currently defined in libmpeg2

Gives an average 13-20% MPEG decoding speedup on x86 systems.


Originally committed as revision 30 to svn://svn.ffmpeg.org/ffmpeg/trunk
parent 2d6d0c1d
......@@ -21,6 +21,7 @@
#include "avcodec.h"
#include "dsputil.h"
void (*ff_idct)(DCTELEM *block);
void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
......@@ -363,6 +364,7 @@ void dsputil_init(void)
squareTbl[i] = (i - 256) * (i - 256);
}
ff_idct = j_rev_dct;
get_pixels = get_pixels_c;
put_pixels_clamped = put_pixels_clamped_c;
add_pixels_clamped = add_pixels_clamped_c;
......
......@@ -25,6 +25,7 @@ void dsputil_init(void);
/* pixel ops : interface with DCT */
extern void (*ff_idct)(DCTELEM *block);
extern void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
extern void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
extern void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
......
......@@ -29,6 +29,16 @@ int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h);
int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h);
int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h);
#ifdef USE_MMX_IDCT
/* external functions, defined in libmpeg2 */
/* Both take the coefficient block as their only argument, so either one can be
   stored in the ff_idct function pointer (void (*)(DCTELEM *block)). */
void mmx_idct(DCTELEM *block);
void mmxext_idct(DCTELEM *block);
/* this should be in dsputil.h? -- A'rpi */
/* The scan tables are declared non-const here because dsputil_init_mmx()
   rewrites their entries in place when USE_MMX_IDCT is defined. */
extern UINT8 ff_alternate_horizontal_scan[64];
extern UINT8 ff_alternate_vertical_scan[64];
extern UINT8 zigzag_direct[64];
#endif
/* pixel operations */
static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001;
static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002;
......@@ -1039,5 +1049,23 @@ void dsputil_init_mmx(void)
sub_pixels_tab[1] = sub_pixels_x2_3dnow;
sub_pixels_tab[2] = sub_pixels_y2_3dnow;
}
#ifdef USE_MMX_IDCT
/* use MMX / MMXEXT iDCT code from libmpeg2 */
//printf("LIBAVCODEC: Using MMX%s iDCT code\n",(mm_flags & MM_MMXEXT)?"EXT":"");
/* Select the MMXEXT routine when mm_flags has MM_MMXEXT set, otherwise the
   plain MMX routine. */
ff_idct = (mm_flags & MM_MMXEXT) ? mmxext_idct : mmx_idct;
/* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
/* Every entry j of the three 64-entry scan tables is remapped as follows:
     - bits 3-5 (mask 0x38) are kept unchanged,
     - bits 1-2 (mask 6) are shifted down to bits 0-1,
     - bit 0 is moved up to bit 2,
   i.e. the low three bits of each index are rotated right by one position.
   NOTE(review): the tables are overwritten in place, so executing this block
   a second time would apply the permutation again -- confirm that it runs
   only once per process. */
{ int i,j;
for (i = 0; i < 64; i++) {
j = zigzag_direct[i];
zigzag_direct[i] = (j & 0x38) | ((j & 6) >> 1) | ((j & 1) << 2);
j = ff_alternate_horizontal_scan[i];
ff_alternate_horizontal_scan[i] = (j & 0x38) | ((j & 6) >> 1) | ((j & 1) << 2);
j = ff_alternate_vertical_scan[i];
ff_alternate_vertical_scan[i] = (j & 0x38) | ((j & 6) >> 1) | ((j & 1) << 2);
}
}
#endif
}
}
......@@ -331,7 +331,8 @@ static const UINT8 mbMotionVectorTable[17][2] = {
{ 0xc, 10 },
};
const UINT8 zigzag_direct[64] = {
//const
UINT8 zigzag_direct[64] = {
0, 1, 8, 16, 9, 2, 3, 10,
17, 24, 32, 25, 18, 11, 4, 5,
12, 19, 26, 33, 40, 48, 41, 34,
......
......@@ -634,7 +634,7 @@ static inline void put_dct(MpegEncContext *s,
{
if (!s->mpeg2)
s->dct_unquantize(s, block, i, s->qscale);
j_rev_dct (block);
ff_idct (block);
put_pixels_clamped(block, dest, line_size);
}
......@@ -645,7 +645,7 @@ static inline void add_dct(MpegEncContext *s,
if (s->block_last_index[i] >= 0) {
if (!s->mpeg2)
s->dct_unquantize(s, block, i, s->qscale);
j_rev_dct (block);
ff_idct (block);
add_pixels_clamped(block, dest, line_size);
}
}
......
......@@ -179,7 +179,8 @@ typedef struct MpegEncContext {
DCTELEM *block, int n, int qscale);
} MpegEncContext;
extern const UINT8 zigzag_direct[64];
//const
extern UINT8 zigzag_direct[64];
int MPV_common_init(MpegEncContext *s);
void MPV_common_end(MpegEncContext *s);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment