* use DSPContext so that each codec can use its own local (sub)set of CPU extensions

Originally committed as revision 1194 to svn://svn.ffmpeg.org/ffmpeg/trunk

* use DSPContext so that each codec can use its own local (sub)set of CPU extensions
Originally committed as revision 1194 to svn://svn.ffmpeg.org/ffmpeg/trunk
eb4b3dd3 · Zdenek Kabelac · fb602cd1 · eb4b3dd3 · eb4b3dd3 · eb4b3dd3
Commit eb4b3dd3 authored Nov 11, 2002 by Zdenek Kabelac
13 changed files
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -20,7 +20,7 @@
 */
 #include "avcodec.h"
 #include "dsputil.h"
-
+/*
 void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
 void (*diff_pixels)(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride);
 void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
@@ -41,7 +41,7 @@ op_pixels_abs_func pix_abs8x8;
 op_pixels_abs_func pix_abs8x8_x2;
 op_pixels_abs_func pix_abs8x8_y2;
 op_pixels_abs_func pix_abs8x8_xy2;
-
+*/
 int ff_bit_exact=0;

 UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
@@ -84,7 +84,7 @@ const UINT8 ff_alternate_vertical_scan[64] = {
 };

 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
-UINT32 inverse[256]={
+const UINT32 inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757, 
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154, 
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709, 
@@ -119,7 +119,7 @@ UINT32 inverse[256]={
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
 };

-int pix_sum_c(UINT8 * pix, int line_size)
+static int pix_sum_c(UINT8 * pix, int line_size)
 {
    int s, i, j;

@@ -141,7 +141,7 @@ int pix_sum_c(UINT8 * pix, int line_size)
    return s;
 }

-int pix_norm1_c(UINT8 * pix, int line_size)
+static int pix_norm1_c(UINT8 * pix, int line_size)
 {
    int s, i, j;
    UINT32 *sq = squareTbl + 256;
@@ -165,7 +165,7 @@ int pix_norm1_c(UINT8 * pix, int line_size)
 }


-void get_pixels_c(DCTELEM *restrict block, const UINT8 *pixels, int line_size)
+static void get_pixels_c(DCTELEM *restrict block, const UINT8 *pixels, int line_size)
 {
    int i;

@@ -184,8 +184,8 @@ void get_pixels_c(DCTELEM *restrict block, const UINT8 *pixels, int line_size)
    }
 }

-void diff_pixels_c(DCTELEM *restrict block, const UINT8 *s1, const UINT8 *s2,
-		   int stride){
+static void diff_pixels_c(DCTELEM *restrict block, const UINT8 *s1,
+			  const UINT8 *s2, int stride){
    int i;

    /* read the pixels */
@@ -205,8 +205,8 @@ void diff_pixels_c(DCTELEM *restrict block, const UINT8 *s1, const UINT8 *s2,
 }


-void put_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
-                          int line_size)
+static void put_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
+				 int line_size)
 {
    int i;
    UINT8 *cm = cropTbl + MAX_NEG_CROP;
@@ -227,7 +227,7 @@ void put_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
    }
 }

-void add_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
+static void add_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
                          int line_size)
 {
    int i;
@@ -1353,7 +1353,7 @@ QPEL_MC(0, avg_       , _       , op_avg)
 #undef op_put
 #undef op_put_no_rnd

-int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
+static int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
 {
    int s, i;

@@ -1381,7 +1381,7 @@ int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
    return s;
 }

-int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
+static int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
 {
    int s, i;

@@ -1409,7 +1409,7 @@ int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
    return s;
 }

-int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
+static int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
 {
    int s, i;
    UINT8 *pix3 = pix2 + line_size;
@@ -1439,7 +1439,7 @@ int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
    return s;
 }

-int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
+static int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
 {
    int s, i;
    UINT8 *pix3 = pix2 + line_size;
@@ -1469,7 +1469,7 @@ int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
    return s;
 }

-int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
+static int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
 {
    int s, i;

@@ -1489,7 +1489,7 @@ int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
    return s;
 }

-int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
+static int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
 {
    int s, i;

@@ -1509,7 +1509,7 @@ int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
    return s;
 }

-int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
+static int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
 {
    int s, i;
    UINT8 *pix3 = pix2 + line_size;
@@ -1531,7 +1531,7 @@ int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
    return s;
 }

-int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
+static int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
 {
    int s, i;
    UINT8 *pix3 = pix2 + line_size;
@@ -1574,12 +1574,12 @@ void ff_block_permute(INT16 *block, UINT8 *permutation, const UINT8 *scantable,
    }
 }

-void clear_blocks_c(DCTELEM *blocks)
+static void clear_blocks_c(DCTELEM *blocks) /* zero-fills 6*64 DCTELEMs starting at blocks; installed as c->clear_blocks by dsputil_init */
 {
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
 }

-void dsputil_init(void)
+void dsputil_init(DSPContext* c, unsigned mask)
 {
    int i;

@@ -1593,42 +1593,82 @@ void dsputil_init(void)
        squareTbl[i] = (i - 256) * (i - 256);
    }

-    get_pixels = get_pixels_c;
-    diff_pixels = diff_pixels_c;
-    put_pixels_clamped = put_pixels_clamped_c;
-    add_pixels_clamped = add_pixels_clamped_c;
-    ff_gmc1= gmc1_c;
-    ff_gmc= gmc_c;
-    clear_blocks= clear_blocks_c;
-    pix_sum= pix_sum_c;
-    pix_norm1= pix_norm1_c;
-
-    pix_abs16x16     = pix_abs16x16_c;
-    pix_abs16x16_x2  = pix_abs16x16_x2_c;
-    pix_abs16x16_y2  = pix_abs16x16_y2_c;
-    pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
-    pix_abs8x8     = pix_abs8x8_c;
-    pix_abs8x8_x2  = pix_abs8x8_x2_c;
-    pix_abs8x8_y2  = pix_abs8x8_y2_c;
-    pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
+    c->get_pixels = get_pixels_c;
+    c->diff_pixels = diff_pixels_c;
+    c->put_pixels_clamped = put_pixels_clamped_c;
+    c->add_pixels_clamped = add_pixels_clamped_c;
+    c->gmc1 = gmc1_c;
+    c->gmc = gmc_c;
+    c->clear_blocks = clear_blocks_c;
+    c->pix_sum = pix_sum_c;
+    c->pix_norm1 = pix_norm1_c;
+
+    c->pix_abs16x16     = pix_abs16x16_c;
+    c->pix_abs16x16_x2  = pix_abs16x16_x2_c;
+    c->pix_abs16x16_y2  = pix_abs16x16_y2_c;
+    c->pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
+    c->pix_abs8x8     = pix_abs8x8_c;
+    c->pix_abs8x8_x2  = pix_abs8x8_x2_c;
+    c->pix_abs8x8_y2  = pix_abs8x8_y2_c;
+    c->pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
+
+    c->put_pixels_tab[0][0] = put_pixels16;
+    c->put_pixels_tab[0][1] = put_pixels16_x2;
+    c->put_pixels_tab[0][2] = put_pixels16_y2;
+    c->put_pixels_tab[0][3] = put_pixels16_xy2;
+
+    c->put_no_rnd_pixels_tab[0][0] = put_pixels16;
+    c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2;
+    c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2;
+    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2;
+
+    c->avg_pixels_tab[0][0] = avg_pixels16;
+    c->avg_pixels_tab[0][1] = avg_pixels16_x2;
+    c->avg_pixels_tab[0][2] = avg_pixels16_y2;
+    c->avg_pixels_tab[0][3] = avg_pixels16_xy2;
+
+    c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16;
+    c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2;
+    c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2;
+    c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2;
+
+    c->put_pixels_tab[1][0] = put_pixels8;
+    c->put_pixels_tab[1][1] = put_pixels8_x2;
+    c->put_pixels_tab[1][2] = put_pixels8_y2;
+    c->put_pixels_tab[1][3] = put_pixels8_xy2;
+
+    c->put_no_rnd_pixels_tab[1][0] = put_pixels8;
+    c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2;
+    c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2;
+    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2;
+
+    c->avg_pixels_tab[1][0] = avg_pixels8;
+    c->avg_pixels_tab[1][1] = avg_pixels8_x2;
+    c->avg_pixels_tab[1][2] = avg_pixels8_y2;
+    c->avg_pixels_tab[1][3] = avg_pixels8_xy2;
+
+    c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8;
+    c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2;
+    c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2;
+    c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2;

 #ifdef HAVE_MMX
-    dsputil_init_mmx();
+    dsputil_init_mmx(c, mask);
 #endif
 #ifdef ARCH_ARMV4L
-    dsputil_init_armv4l();
+    dsputil_init_armv4l(c, mask);
 #endif
 #ifdef HAVE_MLIB
-    dsputil_init_mlib();
+    dsputil_init_mlib(c, mask);
 #endif
 #ifdef ARCH_ALPHA
-    dsputil_init_alpha();
+    dsputil_init_alpha(c, mask);
 #endif
 #ifdef ARCH_POWERPC
-    dsputil_init_ppc();
+    dsputil_init_ppc(c, mask);
 #endif
 #ifdef HAVE_MMI
-    dsputil_init_mmi();
+    dsputil_init_mmi(c, mask);
 #endif

    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
@@ -1639,7 +1679,8 @@ void avcodec_set_bit_exact(void)
 {
    ff_bit_exact=1;
 #ifdef HAVE_MMX
-    dsputil_set_bit_exact_mmx();
+#warning FIXME - set_bit_exact
+//    dsputil_set_bit_exact_mmx();
 #endif
 }


--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -45,10 +45,9 @@ extern const UINT8 ff_zigzag_direct[64];
 extern UINT32 squareTbl[512];
 extern UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];

-void dsputil_init(void);

 /* minimum alignment rules ;)
-if u notice errors in the align stuff, need more alignment for some asm code for some cpu 
+if u notice errors in the align stuff, need more alignment for some asm code for some cpu
 or need to use a function with less aligned data then send a mail to the ffmpeg-dev list, ...

 !warning these alignments might not match reallity, (missing attribute((align)) stuff somewhere possible)
@@ -57,39 +56,20 @@ i (michael) didnt check them, these are just the alignents which i think could b
 !future video codecs might need functions with less strict alignment
 */

-/* pixel ops : interface with DCT */
-extern void (*get_pixels)(DCTELEM *block/*align 16*/, const UINT8 *pixels/*align 8*/, int line_size);
-extern void (*diff_pixels)(DCTELEM *block/*align 16*/, const UINT8 *s1/*align 8*/, const UINT8 *s2/*align 8*/, int stride);
-extern void (*put_pixels_clamped)(const DCTELEM *block/*align 16*/, UINT8 *pixels/*align 8*/, int line_size);
-extern void (*add_pixels_clamped)(const DCTELEM *block/*align 16*/, UINT8 *pixels/*align 8*/, int line_size);
-extern void (*ff_gmc1)(UINT8 *dst/*align 8*/, UINT8 *src/*align 1*/, int srcStride, int h, int x16, int y16, int rounder);
-extern void (*ff_gmc )(UINT8 *dst/*align 8*/, UINT8 *src/*align 1*/, int stride, int h, int ox, int oy, 
-                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height);
-extern void (*clear_blocks)(DCTELEM *blocks/*align 16*/);
-extern int (*pix_sum)(UINT8 * pix, int line_size);
-extern int (*pix_norm1)(UINT8 * pix, int line_size);
-
-
-
+/*
 void get_pixels_c(DCTELEM *block, const UINT8 *pixels, int line_size);
 void diff_pixels_c(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride);
 void put_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size);
 void add_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size);
 void clear_blocks_c(DCTELEM *blocks);
+*/

 /* add and put pixel (decoding) */
 // blocksizes for op_pixels_func are 8x4,8x8 16x8 16x16
 typedef void (*op_pixels_func)(UINT8 *block/*align width (8 or 16)*/, const UINT8 *pixels/*align 1*/, int line_size, int h);
 typedef void (*qpel_mc_func)(UINT8 *dst/*align width (8 or 16)*/, UINT8 *src/*align 1*/, int stride);

-extern op_pixels_func put_pixels_tab[2][4];
-extern op_pixels_func avg_pixels_tab[2][4];
-extern op_pixels_func put_no_rnd_pixels_tab[2][4];
-extern op_pixels_func avg_no_rnd_pixels_tab[2][4];
-extern qpel_mc_func put_qpel_pixels_tab[2][16];
-extern qpel_mc_func avg_qpel_pixels_tab[2][16];
-extern qpel_mc_func put_no_rnd_qpel_pixels_tab[2][16];
-extern qpel_mc_func avg_no_rnd_qpel_pixels_tab[2][16];
+

 #define CALL_2X_PIXELS(a, b, n)\
 static void a(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
@@ -100,20 +80,46 @@ static void a(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 /* motion estimation */

 typedef int (*op_pixels_abs_func)(UINT8 *blk1/*align width (8 or 16)*/, UINT8 *blk2/*align 1*/, int line_size);
-
-extern op_pixels_abs_func pix_abs16x16;
-extern op_pixels_abs_func pix_abs16x16_x2;
-extern op_pixels_abs_func pix_abs16x16_y2;
-extern op_pixels_abs_func pix_abs16x16_xy2;
-extern op_pixels_abs_func pix_abs8x8;
-extern op_pixels_abs_func pix_abs8x8_x2;
-extern op_pixels_abs_func pix_abs8x8_y2;
-extern op_pixels_abs_func pix_abs8x8_xy2;
-
+/*
 int pix_abs16x16_c(UINT8 *blk1, UINT8 *blk2, int lx);
 int pix_abs16x16_x2_c(UINT8 *blk1, UINT8 *blk2, int lx);
 int pix_abs16x16_y2_c(UINT8 *blk1, UINT8 *blk2, int lx);
 int pix_abs16x16_xy2_c(UINT8 *blk1, UINT8 *blk2, int lx);
+*/
+typedef struct DSPContext { /* table of DSP function pointers; filled in by dsputil_init() */
+    /* pixel ops : interface with DCT */
+    void (*get_pixels)(DCTELEM *block/*align 16*/, const UINT8 *pixels/*align 8*/, int line_size);
+    void (*diff_pixels)(DCTELEM *block/*align 16*/, const UINT8 *s1/*align 8*/, const UINT8 *s2/*align 8*/, int stride);
+    void (*put_pixels_clamped)(const DCTELEM *block/*align 16*/, UINT8 *pixels/*align 8*/, int line_size);
+    void (*add_pixels_clamped)(const DCTELEM *block/*align 16*/, UINT8 *pixels/*align 8*/, int line_size);
+    void (*gmc1)(UINT8 *dst/*align 8*/, UINT8 *src/*align 1*/, int srcStride, int h, int x16, int y16, int rounder);
+    void (*gmc )(UINT8 *dst/*align 8*/, UINT8 *src/*align 1*/, int stride, int h, int ox, int oy,
+		    int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height);
+    void (*clear_blocks)(DCTELEM *blocks/*align 16*/);
+    int (*pix_sum)(UINT8 * pix, int line_size);
+    int (*pix_norm1)(UINT8 * pix, int line_size);
+
+    /* maybe create an array for 16/8 functions */
+    op_pixels_func put_pixels_tab[2][4]; /* dsputil_init stores the *16 routines in [0][i] and the *8 routines in [1][i]; i: 0 = plain, 1 = _x2, 2 = _y2, 3 = _xy2 */
+    op_pixels_func avg_pixels_tab[2][4]; /* same indexing as put_pixels_tab */
+    op_pixels_func put_no_rnd_pixels_tab[2][4]; /* same indexing; dsputil_init reuses put_pixels16/put_pixels8 for entry [n][0] */
+    op_pixels_func avg_no_rnd_pixels_tab[2][4]; /* same indexing as put_pixels_tab */
+    qpel_mc_func put_qpel_pixels_tab[2][16]; /* NOTE(review): the qpel tables are not assigned in the visible part of dsputil_init -- confirm where they are filled */
+    qpel_mc_func avg_qpel_pixels_tab[2][16];
+    qpel_mc_func put_no_rnd_qpel_pixels_tab[2][16];
+    qpel_mc_func avg_no_rnd_qpel_pixels_tab[2][16];
+
+    op_pixels_abs_func pix_abs16x16; /* pix_abs* entries default to the matching pix_abs*_c routines in dsputil_init */
+    op_pixels_abs_func pix_abs16x16_x2;
+    op_pixels_abs_func pix_abs16x16_y2;
+    op_pixels_abs_func pix_abs16x16_xy2;
+    op_pixels_abs_func pix_abs8x8;
+    op_pixels_abs_func pix_abs8x8_x2;
+    op_pixels_abs_func pix_abs8x8_y2;
+    op_pixels_abs_func pix_abs8x8_xy2;
+} DSPContext;
+
+void dsputil_init(DSPContext* p, unsigned mask);

 /**
 * permute block according to permuatation.
@@ -121,8 +127,12 @@ int pix_abs16x16_xy2_c(UINT8 *blk1, UINT8 *blk2, int lx);
 */
 void ff_block_permute(INT16 *block, UINT8 *permutation, const UINT8 *scantable, int last);

+#define emms_c()
+
 #if defined(HAVE_MMX)

+#undef emms_c /* discard the empty default defined above; #undef takes a bare macro name, no parameter list */
+
 #define MM_MMX    0x0001 /* standard MMX */
 #define MM_3DNOW  0x0004 /* AMD 3DNOW */
 #define MM_MMXEXT 0x0002 /* SSE integer functions or AMD MMX ext */
@@ -132,6 +142,8 @@ void ff_block_permute(INT16 *block, UINT8 *permutation, const UINT8 *scantable,
 extern int mm_flags;

 int mm_support(void);
+void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size);
+void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size);

 static inline void emms(void)
 {
@@ -146,54 +158,44 @@ static inline void emms(void)

 #define __align8 __attribute__ ((aligned (8)))

-void dsputil_init_mmx(void);
-void dsputil_set_bit_exact_mmx(void);
+void dsputil_init_mmx(DSPContext* c, unsigned mask);
+void dsputil_set_bit_exact_mmx(DSPContext* c, unsigned mask);

 #elif defined(ARCH_ARMV4L)

-#define emms_c()
-
 /* This is to use 4 bytes read to the IDCT pointers for some 'zero'
   line ptimizations */
 #define __align8 __attribute__ ((aligned (4)))

-void dsputil_init_armv4l(void);   
+void dsputil_init_armv4l(DSPContext* c, unsigned mask);

 #elif defined(HAVE_MLIB)
- 
-#define emms_c()

 /* SPARC/VIS IDCT needs 8-byte aligned DCT blocks */
 #define __align8 __attribute__ ((aligned (8)))

-void dsputil_init_mlib(void);   
+void dsputil_init_mlib(DSPContext* c, unsigned mask);

 #elif defined(ARCH_ALPHA)

-#define emms_c()
 #define __align8 __attribute__ ((aligned (8)))

-void dsputil_init_alpha(void);
+void dsputil_init_alpha(DSPContext* c, unsigned mask);

 #elif defined(ARCH_POWERPC)

-#define emms_c()
 #define __align8 __attribute__ ((aligned (16)))

-void dsputil_init_ppc(void);
+void dsputil_init_ppc(DSPContext* c, unsigned mask);

 #elif defined(HAVE_MMI)

-#define emms_c()
-
 #define __align8 __attribute__ ((aligned (16)))

-void dsputil_init_mmi(void);   
+void dsputil_init_mmi(DSPContext* c, unsigned mask);

 #else

-#define emms_c()
-
 #define __align8

 #endif
@@ -263,9 +265,9 @@ typedef struct MDCTContext {
 } MDCTContext;

 int ff_mdct_init(MDCTContext *s, int nbits, int inverse);
-void ff_imdct_calc(MDCTContext *s, FFTSample *output, 
+void ff_imdct_calc(MDCTContext *s, FFTSample *output,
                const FFTSample *input, FFTSample *tmp);
-void ff_mdct_calc(MDCTContext *s, FFTSample *out, 
+void ff_mdct_calc(MDCTContext *s, FFTSample *out,
               const FFTSample *input, FFTSample *tmp);
 void ff_mdct_end(MDCTContext *s);


--- a/libavcodec/dv.c
+++ b/libavcodec/dv.c
@@ -114,6 +114,7 @@ static int dvvideo_decode_init(AVCodecContext *avctx)
    /* XXX: fix it */
    memset(&s2, 0, sizeof(MpegEncContext));
    s2.avctx = avctx;
+    dsputil_init(&s2.dsp, avctx->dsp_mask);
    if (DCT_common_init(&s2) < 0)
       return -1;


--- a/libavcodec/error_resilience.c
+++ b/libavcodec/error_resilience.c
@@ -331,7 +331,7 @@ static void guess_mv(MpegEncContext *s){
                s->mv_type = MV_TYPE_16X16;
                s->mb_skiped=0;

-                clear_blocks(s->block[0]);
+		s->dsp.clear_blocks(s->block[0]);

                s->mb_x= mb_x;
                s->mb_y= mb_y;
@@ -458,7 +458,7 @@ int score_sum=0;
                    s->mv_type = MV_TYPE_16X16;
                    s->mb_skiped=0;

-                    clear_blocks(s->block[0]);
+		    s->dsp.clear_blocks(s->block[0]);

                    s->mb_x= mb_x;
                    s->mb_y= mb_y;
@@ -559,8 +559,8 @@ static int is_intra_more_likely(MpegEncContext *s){
                UINT8 *mb_ptr     = s->current_picture[0] + mb_x*16 + mb_y*16*s->linesize;
                UINT8 *last_mb_ptr= s->last_picture   [0] + mb_x*16 + mb_y*16*s->linesize;
    
-                is_intra_likely += pix_abs16x16(last_mb_ptr, mb_ptr                    , s->linesize);
-                is_intra_likely -= pix_abs16x16(last_mb_ptr, last_mb_ptr+s->linesize*16, s->linesize);
+		is_intra_likely += s->dsp.pix_abs16x16(last_mb_ptr, mb_ptr                    , s->linesize);
+                is_intra_likely -= s->dsp.pix_abs16x16(last_mb_ptr, last_mb_ptr+s->linesize*16, s->linesize);
            }else{
                if(s->mbintra_table[i]) //HACK (this is allways inited but we should use mb_type[])
                   is_intra_likely++;
@@ -738,7 +738,7 @@ void ff_error_resilience(MpegEncContext *s){
                s->mv[0][0][1] = s->motion_val[ mb_x*2+1 + (mb_y*2+1)*s->block_wrap[0] ][1];
            }
        
-            clear_blocks(s->block[0]);
+	    s->dsp.clear_blocks(s->block[0]);

            s->mb_x= mb_x;
            s->mb_y= mb_y;
@@ -778,8 +778,8 @@ void ff_error_resilience(MpegEncContext *s){
                    s->mv[1][0][0]= 0;
                    s->mv[1][0][1]= 0;
                }
-                                
-                clear_blocks(s->block[0]);
+
+                s->dsp.clear_blocks(s->block[0]);
                s->mb_x= mb_x;
                s->mb_y= mb_y;
                MPV_decode_mb(s, s->block);

--- a/libavcodec/h263.c
+++ b/libavcodec/h263.c
@@ -538,7 +538,7 @@ void mpeg4_encode_mb(MpegEncContext * s,
                        if(s->coded_order[i+1].pict_type!=B_TYPE) break;

                        b_pic= s->coded_order[i+1].picture[0] + offset;
-                        diff= pix_abs16x16(p_pic, b_pic, s->linesize);
+			diff= s->dsp.pix_abs16x16(p_pic, b_pic, s->linesize);
                        if(diff>s->qscale*70){ //FIXME check that 70 is optimal
                            s->mb_skiped=0;
                            break;

--- a/libavcodec/h263dec.c
+++ b/libavcodec/h263dec.c
@@ -195,7 +195,7 @@ static int decode_slice(MpegEncContext *s){
            }

            /* DCT & quantize */
-            clear_blocks(s->block[0]);
+	    s->dsp.clear_blocks(s->block[0]);
            
            s->mv_dir = MV_DIR_FORWARD;
            s->mv_type = MV_TYPE_16X16;

--- a/libavcodec/i386/dsputil_mmx.c
+++ b/libavcodec/i386/dsputil_mmx.c
--- a/libavcodec/motion_est.c
+++ b/libavcodec/motion_est.c
--- a/libavcodec/mpeg12.c
+++ b/libavcodec/mpeg12.c
@@ -1623,7 +1623,7 @@ static int mpeg_decode_slice(AVCodecContext *avctx,
    s->mb_incr= 1;

    for(;;) {
-        clear_blocks(s->block[0]);
+	s->dsp.clear_blocks(s->block[0]);
        
        ret = mpeg_decode_mb(s, s->block);
        dprintf("ret=%d\n", ret);

--- a/libavcodec/mpegvideo.c
+++ b/libavcodec/mpegvideo.c
--- a/libavcodec/mpegvideo.h
+++ b/libavcodec/mpegvideo.h
@@ -221,6 +221,7 @@ typedef struct MpegEncContext {
    int unrestricted_mv;
    int h263_long_vectors; /* use horrible h263v1 long vector mode */

+    DSPContext dsp;             /* pointers for accelerated dsp functions */
    int f_code; /* forward MV resolution */
    int b_code; /* backward MV resolution for B Frames (mpeg4) */
    INT16 (*motion_val)[2];            /* used for MV prediction (4MV per MB) */

--- a/libavcodec/rv10.c
+++ b/libavcodec/rv10.c
@@ -447,7 +447,7 @@ static int rv10_decode_packet(AVCodecContext *avctx,
        printf("**mb x=%d y=%d\n", s->mb_x, s->mb_y);
 #endif
        
-        clear_blocks(s->block[0]);
+	s->dsp.clear_blocks(s->block[0]);
        s->mv_dir = MV_DIR_FORWARD;
        s->mv_type = MV_TYPE_16X16; 
        if (ff_h263_decode_mb(s, s->block) == SLICE_ERROR) {

--- a/libavcodec/svq1.c
+++ b/libavcodec/svq1.c
@@ -804,7 +804,7 @@ static void svq1_skip_block (uint8_t *current, uint8_t *previous, int pitch, int
  }
 }

-static int svq1_motion_inter_block (bit_buffer_t *bitbuf,
+static int svq1_motion_inter_block (MpegEncContext *s, bit_buffer_t *bitbuf,
 			       uint8_t *current, uint8_t *previous, int pitch,
 			       svq1_pmv_t *motion, int x, int y) {
  uint8_t    *src;
@@ -839,12 +839,12 @@ static int svq1_motion_inter_block (bit_buffer_t *bitbuf,
  src = &previous[(x + (mv.x >> 1)) + (y + (mv.y >> 1))*pitch];
  dst = current;

-  put_pixels_tab[0][((mv.y & 1) << 1) | (mv.x & 1)](dst,src,pitch,16);
+  s->dsp.put_pixels_tab[0][((mv.y & 1) << 1) | (mv.x & 1)](dst,src,pitch,16);

  return 0;
 }

-static int svq1_motion_inter_4v_block (bit_buffer_t *bitbuf,
+static int svq1_motion_inter_4v_block (MpegEncContext *s, bit_buffer_t *bitbuf,
 				  uint8_t *current, uint8_t *previous, int pitch,
 				  svq1_pmv_t *motion,int x, int y) {
  uint8_t    *src;
@@ -906,7 +906,7 @@ static int svq1_motion_inter_4v_block (bit_buffer_t *bitbuf,
    src = &previous[(x + (pmv[i]->x >> 1)) + (y + (pmv[i]->y >> 1))*pitch];
    dst = current;

-    put_pixels_tab[1][((pmv[i]->y & 1) << 1) | (pmv[i]->x & 1)](dst,src,pitch,8);
+    s->dsp.put_pixels_tab[1][((pmv[i]->y & 1) << 1) | (pmv[i]->x & 1)](dst,src,pitch,8);

    /* select next block */
    if (i & 1) {
@@ -921,7 +921,7 @@ static int svq1_motion_inter_4v_block (bit_buffer_t *bitbuf,
  return 0;
 }

-static int svq1_decode_delta_block (bit_buffer_t *bitbuf,
+static int svq1_decode_delta_block (MpegEncContext *s, bit_buffer_t *bitbuf,
 			uint8_t *current, uint8_t *previous, int pitch,
 			svq1_pmv_t *motion, int x, int y) {
  uint32_t bit_cache;
@@ -951,7 +951,7 @@ static int svq1_decode_delta_block (bit_buffer_t *bitbuf,
    break;

  case SVQ1_BLOCK_INTER:
-    result = svq1_motion_inter_block (bitbuf, current, previous, pitch, motion, x, y);
+    result = svq1_motion_inter_block (s, bitbuf, current, previous, pitch, motion, x, y);

    if (result != 0)
    {
@@ -964,7 +964,7 @@ static int svq1_decode_delta_block (bit_buffer_t *bitbuf,
    break;

  case SVQ1_BLOCK_INTER_4V:
-    result = svq1_motion_inter_4v_block (bitbuf, current, previous, pitch, motion, x, y);
+    result = svq1_motion_inter_4v_block (s, bitbuf, current, previous, pitch, motion, x, y);

    if (result != 0)
    {
@@ -1142,8 +1142,8 @@ static int svq1_decode_frame(AVCodecContext *avctx,

      for (y=0; y < height; y+=16) {
 	for (x=0; x < width; x+=16) {
-	  result = svq1_decode_delta_block (&s->gb, &current[x], previous,
-				       linesize, pmv, x, y);
+	  result = svq1_decode_delta_block (s, &s->gb, &current[x], previous,
+					    linesize, pmv, x, y);
 	  if (result != 0)
 	  {
 #ifdef DEBUG_SVQ1