ps2 optimizations update patch by (Leon van Stuivenberg <leonvs at iae dot nl>)

Originally committed as revision 996 to svn://svn.ffmpeg.org/ffmpeg/trunk

ps2 optimizations update patch by (Leon van Stuivenberg <leonvs at iae dot nl>)
Originally committed as revision 996 to svn://svn.ffmpeg.org/ffmpeg/trunk
5917d17c · Leon van Stuivenberg · Michael Niedermayer · a46a3ce4 · 5917d17c · 5917d17c
Commit 5917d17c, authored Oct 03, 2002 by Leon van Stuivenberg; committed by Michael Niedermayer on Oct 03, 2002.
7 changed files
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -77,7 +77,7 @@ OBJS += ppc/dsputil_ppc.o
 endif
 ifeq ($(TARGET_MMI),yes)
-OBJS += ps2/dsputil_mmi.o ps2/idct_mmi.o
+OBJS += ps2/dsputil_mmi.o ps2/idct_mmi.o ps2/mpegvideo_mmi.o
 endif
 ifeq ($(TARGET_ALTIVEC),yes)

--- a/libavcodec/mpegvideo.c
+++ b/libavcodec/mpegvideo.c
@@ -216,6 +216,9 @@ int MPV_common_init(MpegEncContext *s)
 #ifdef HAVE_MLIB
    MPV_common_init_mlib(s);
 #endif
+#ifdef HAVE_MMI
+    MPV_common_init_mmi(s);
+#endif
    /* load & permutate scantables

--- a/libavcodec/mpegvideo.h
+++ b/libavcodec/mpegvideo.h
@@ -503,6 +503,9 @@ void MPV_common_init_axp(MpegEncContext *s);
 #ifdef HAVE_MLIB
 void MPV_common_init_mlib(MpegEncContext *s);
 #endif
+#ifdef HAVE_MMI
+void MPV_common_init_mmi(MpegEncContext *s);
+#endif
 extern void (*draw_edges)(UINT8 *buf, int wrap, int width, int height, int w);
 void ff_conceal_past_errors(MpegEncContext *s, int conceal_all);
 void ff_copy_bits(PutBitContext *pb, UINT8 *src, int length);

--- a/libavcodec/ps2/dsputil_mmi.c
+++ b/libavcodec/ps2/dsputil_mmi.c
@@ -20,96 +20,113 @@
 */
 #include "../dsputil.h"
-void ff_mmi_idct(DCTELEM * block);
 #include "mmi.h"
+/* the provided 'as' in binutils 2.9EE doesn't support
+the EE's mips3 instructions properly */
+#define AS_BUGGY
 static void clear_blocks_mmi(DCTELEM * blocks)
 {
-    /* $4 = blocks */
    int i;
    for (i = 0; i < 6; i++) {
-        sq($0, 0, $4);
+        asm volatile(
-        sq($0, 16, $4);
+        "sq     $0, 0(%0)       \n\t"
-        sq($0, 32, $4);
+        "sq     $0, 16(%0)      \n\t"
-        sq($0, 48, $4);
+        "sq     $0, 32(%0)      \n\t"
-        sq($0, 64, $4);
+        "sq     $0, 48(%0)      \n\t"
-        sq($0, 80, $4);
+        "sq     $0, 64(%0)      \n\t"
-        sq($0, 96, $4);
+        "sq     $0, 80(%0)      \n\t"
-        sq($0, 112, $4);
+        "sq     $0, 96(%0)      \n\t"
-        __asm__ __volatile__("addi $4, $4, 128");
+        "sq     $0, 112(%0)     \n\t" :: "r" (blocks) : "memory" );
+        blocks += 64;
    }
 }
-static void put_pixels_clamped_mmi(const DCTELEM * block, UINT8 * pixels,
+static void get_pixels_mmi(DCTELEM *block, const UINT8 *pixels, int line_size)
-				   int line_size)
 {
-    /* $4 = block, $5 = pixels, $6 = line_size */
+    int i;
-    __asm__ __volatile__("li $11, 255":::"$11");
+    for(i=0;i<8;i++) {
-    lq($4, 0, $12);
+#ifdef AS_BUGGY
-    pcpyld($11, $11, $11);
+        ld3(5, 0, 8);
-    pcpyh($11, $11);
+        asm volatile(
+        "add    %1, %1, %2      \n\t"
-#define PUT(rs) \
+        "pextlb $8, $0, $8      \n\t"
-    ppacb($0, $##rs, $##rs); \
+        "sq     $8, 0(%0)       \n\t" :: "r" (block), "r" (pixels), "r" (line_size) : "$8", "memory" );
-    sd3(rs, 0, 5); \
+#else
-    __asm__ __volatile__ ("add $5, $5, $6");
+        asm volatile(
+        "ld     $8, 0(%1)       \n\t"
-    pminh($12, $11, $12);
+        "add    %1, %1, %2      \n\t"
-    pmaxh($12, $0, $12);
+        "pextlb $8, $0, $8      \n\t"
-    lq($4, 16, $13);
+        "sq     $8, 0(%0)       \n\t" :: "r" (block), "r" (pixels), "r" (line_size) : "$8", "memory" );
-    PUT(12);
+#endif
+        block += 8;
-    pminh($13, $11, $13);
+    }
-    pmaxh($13, $0, $13);
+}
-    lq($4, 32, $12);
-    PUT(13);
-    pminh($12, $11, $12);
-    pmaxh($12, $0, $12);
-    lq($4, 48, $13);
-    PUT(12);
-    pminh($13, $11, $13);
-    pmaxh($13, $0, $13);
-    lq($4, 64, $12);
-    PUT(13);
-    pminh($12, $11, $12);
-    pmaxh($12, $0, $12);
-    lq($4, 80, $13);
-    PUT(12);
-    pminh($13, $11, $13);
-    pmaxh($13, $0, $13);
-    lq($4, 96, $12);
-    PUT(13);
-    pminh($12, $11, $12);
-    pmaxh($12, $0, $12);
-    lq($4, 112, $13);
-    PUT(12);
-    pminh($13, $11, $13);
+static void put_pixels8_mmi(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-    pmaxh($13, $0, $13);
+{
-    PUT(13);
+    int i;
+    for(i=0; i<h; i++) {
+#ifdef AS_BUGGY
+        ldr3(5, 0, 8);
+        ldl3(5, 7, 8);
+        asm volatile ( "add $5, $5, $6 \n\t" );
+        sd3(8, 0, 4);
+        asm volatile ( "add $4, $4, $6 \n\t" );
+#else
+        asm volatile(
+        "ldr    $8, 0(%1)       \n\t"
+        "ldl    $8, 7(%1)       \n\t"
+        "add    %1, %1, %2      \n\t"
+        "sd     $8, 0(%0)       \n\t"
+        "add    %0, %0, %2      \n\t" :: "r" (block), "r" (pixels), "r" (line_size) : "$8", "memory" );
+#endif
+    }
 }
-/* todo
-static void add_pixels_clamped_mmi(const DCTELEM * block, UINT8 * pixels,
+static void put_pixels16_mmi(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-				   int line_size)
 {
+    int i;
+    for(i=0; i<h; i++) {
+#ifdef AS_BUGGY
+        ldr3(5, 0, 8);
+        ldl3(5, 7, 8);
+        ldr3(5, 8, 9);
+        ldl3(5, 15, 9);
+        asm volatile ( "add $5, $5, $6 \n\t" );
+        pcpyld($9, $8, $8);
+        sq($8, 0, $4);
+        asm volatile ( "add $4, $4, $6 \n\t" );
+#else
+        asm volatile (
+        "ldr    $8, 0(%1)       \n\t"
+        "ldl    $8, 7(%1)       \n\t"
+        "ldr    $9, 8(%1)       \n\t"
+        "ldl    $9, 15(%1)      \n\t"
+        "add    %1, %1, %2      \n\t"
+        "pcpyld $8, $9, $8      \n\t"
+        "sq     $8, 0(%0)       \n\t"
+        "add    %0, %0, %2      \n\t" :: "r" (block), "r" (pixels), "r" (line_size) : "$8", "$9", "memory" );
+#endif
+    }
 }
-*/
 void dsputil_init_mmi(void)
 {
-    put_pixels_clamped = put_pixels_clamped_mmi;
-    //add_pixels_clamped = add_pixels_clamped_mmi;
    clear_blocks = clear_blocks_mmi;
-    ff_idct = ff_mmi_idct;
+    put_pixels_tab[1][0] = put_pixels8_mmi;
+    put_no_rnd_pixels_tab[1][0] = put_pixels8_mmi;
+    put_pixels_tab[0][0] = put_pixels16_mmi;
+    put_no_rnd_pixels_tab[0][0] = put_pixels16_mmi;
+    get_pixels = get_pixels_mmi;
 }
--- a/libavcodec/ps2/idct_mmi.c
+++ b/libavcodec/ps2/idct_mmi.c
--- a/libavcodec/ps2/mmi.h
+++ b/libavcodec/ps2/mmi.h
@@ -48,6 +48,20 @@
 #define	sq(reg, off, base)	\
 	__asm__ __volatile__ ("sq " #reg ", %0("#base ")" : : "i" (off) )
+/*
+#define	ld(base, off, reg)	\
+	__asm__ __volatile__ ("ld " #reg ", " #off "("#base ")")
+*/
+/*
+ * Hand-encoded doubleword loads, emitted as raw instruction words with
+ * ".word" instead of a mnemonic (same operand meaning as the commented-out
+ * "ld reg, off(base)" form).
+ *
+ *   base  number of the base address register (e.g. 5 for $5)
+ *   off   byte offset added to the base register
+ *   reg   number of the destination register (e.g. 8 for $8)
+ *
+ * Each field is parenthesized and masked to its width in the instruction
+ * word (5 bits for the register numbers, 16 bits for the offset), so that an
+ * expression argument or a negative offset cannot spill into the
+ * neighbouring fields or the opcode.
+ *
+ * NOTE(review): these statements declare no clobbers although the emitted
+ * instruction writes register 'reg'; callers have to account for that.
+ */
+#define	ld3(base, off, reg)	\
+	__asm__ __volatile__ (".word %0" : : "i" ( 0xdc000000 | (((base) & 0x1f) << 21) | (((reg) & 0x1f) << 16) | ((off) & 0xffff)))
+#define	ldr3(base, off, reg)	\
+	__asm__ __volatile__ (".word %0" : : "i" ( 0x6c000000 | (((base) & 0x1f) << 21) | (((reg) & 0x1f) << 16) | ((off) & 0xffff)))
+#define	ldl3(base, off, reg)	\
+	__asm__ __volatile__ (".word %0" : : "i" ( 0x68000000 | (((base) & 0x1f) << 21) | (((reg) & 0x1f) << 16) | ((off) & 0xffff)))
 /*
 #define	sd(reg, off, base)	\
 	__asm__ __volatile__ ("sd " #reg ", " #off "("#base ")")
@@ -116,5 +130,23 @@
 #define	pminh(rs, rt, rd) \
 	__asm__ __volatile__ ("pminh " #rd ", " #rs ", " #rt )
+#define	pinteh(rs, rt, rd) \
+	__asm__ __volatile__ ("pinteh  " #rd ", " #rs ", " #rt )
+#define	paddh(rs, rt, rd) \
+	__asm__ __volatile__ ("paddh  " #rd ", " #rs ", " #rt )
+#define	psubh(rs, rt, rd) \
+	__asm__ __volatile__ ("psubh  " #rd ", " #rs ", " #rt )
+#define	psrah(rt, sa, rd) \
+	__asm__ __volatile__ ("psrah  " #rd ", " #rt ", %0" : : "i"(sa) )
+#define	pmfhl_uw(rd) \
+	__asm__ __volatile__ ("pmfhl.uw  " #rd)
+#define	pextlb(rs, rt, rd) \
+	__asm__ __volatile__ ("pextlb  " #rd ", " #rs ", " #rt )
 #endif
--- a/libavcodec/ps2/mpegvideo_mmi.c
+++ b/libavcodec/ps2/mpegvideo_mmi.c
+/*
+ * Copyright (c) 2000,2001 Fabrice Bellard.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * MMI optimization by Leon van Stuivenberg <leonvs@iae.nl>
+ */
+#include "../dsputil.h"
+#include "../mpegvideo.h"
+#include "../avcodec.h"
+void ff_mmi_idct_put(UINT8 *dest, int line_size, DCTELEM *block);
+void ff_mmi_idct_add(UINT8 *dest, int line_size, DCTELEM *block);
+/**
+ * H.263 inverse quantization of one coefficient block using MMI
+ * (128-bit) instructions.
+ *
+ * With qmul = 2 * qscale and qadd = (qscale - 1) | 1, every coefficient
+ * visited by the loop is replaced by
+ *     level * qmul + qadd   if level > 0
+ *     level * qmul - qadd   if level < 0
+ *     0                     if level == 0
+ * The loop handles 8 coefficients per iteration and runs until the counter
+ * (initially nCoeffs) drops below zero, so the processed range is rounded up
+ * to a multiple of 8 coefficients.
+ *
+ * For intra blocks all 64 coefficients are processed and the DC value is
+ * computed separately in C (scaled by y_dc_scale / c_dc_scale, or copied
+ * unchanged with qadd forced to 0 when s->h263_aic is set) and written back
+ * to block[0] after the vector loop.
+ *
+ * @param s      codec context; mb_intra, h263_aic, y_dc_scale, c_dc_scale,
+ *               block_last_index[] and intra_scantable.raster_end[] are read
+ * @param block  coefficients, dequantized in place; accessed with lq/sq, so
+ *               presumably must be 16-byte aligned -- TODO confirm
+ * @param n      block index; indices below 4 use y_dc_scale, the others
+ *               c_dc_scale
+ * @param qscale quantizer scale
+ */
+static void dct_unquantize_h263_mmi(MpegEncContext *s,
+                                  DCTELEM *block, int n, int qscale)
+{
+    int level=0, qmul, qadd;
+    int nCoeffs;
+    assert(s->block_last_index[n]>=0);
+    qadd = (qscale - 1) | 1;
+    qmul = qscale << 1;
+    if (s->mb_intra) {
+        if (!s->h263_aic) {
+            if (n < 4)
+                level = block[0] * s->y_dc_scale;
+            else
+                level = block[0] * s->c_dc_scale;
+        }else {
+            qadd = 0;
+            level = block[0];
+        }
+        nCoeffs= 63; //does not always use zigzag table
+    } else {
+        nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
+    }
+    /* nCoeffs is the loop counter and is decremented inside the asm, so it
+       has to be a read-write ("+r") operand: input-only operands must not
+       be modified by the template. */
+    asm volatile(
+        "add    $14, $0, %3     \n\t"   //r14 = block
+        "pcpyld $8, %1, %1      \n\t"
+        "pcpyh  $8, $8          \n\t"   //r8 = qmul
+        "pcpyld $9, %2, %2      \n\t"
+        "pcpyh  $9, $9          \n\t"   //r9 = qadd
+        ".p2align 2             \n\t"
+        "1:                     \n\t"
+        "lq     $10, 0($14)     \n\t"   //r10 = level
+        "addi   $14, $14, 16    \n\t"   //block+=8
+        "addi   %0, %0, -8      \n\t"
+        "pcgth  $11, $0, $10    \n\t"   //r11 = level < 0 ? -1 : 0
+        "pcgth  $12, $10, $0    \n\t"   //r12 = level > 0 ? -1 : 0
+        "por    $12, $11, $12   \n\t"   //r12 = level != 0 ? -1 : 0
+        "pmulth $10, $10, $8    \n\t"
+        "paddh  $13, $9, $11    \n\t"
+        "pxor   $13, $13, $11   \n\t"   //r13 = level < 0 ? -qadd : qadd
+        "pmfhl.uw $11           \n\t"
+        "pinteh $10, $11, $10   \n\t"   //r10 = level * qmul
+        "paddh  $10, $10, $13   \n\t"
+        "pand   $10, $10, $12   \n\t"   //keep zero coefficients at zero
+        "sq     $10, -16($14)   \n\t"
+        "bgez   %0, 1b          \n\t"
+        : "+r" (nCoeffs)
+        : "r" (qmul), "r" (qadd), "r" (block)
+        : "$8", "$9", "$10", "$11", "$12", "$13", "$14", "memory" );
+    /* the vector loop also rewrote block[0]; restore the intra DC value */
+    if(s->mb_intra)
+        block[0]= level;
+}
+/**
+ * Install the MMI implementations into an MpegEncContext.
+ *
+ * When the requested IDCT algorithm is FF_IDCT_AUTO, idct_put / idct_add are
+ * pointed at the MMI IDCT and idct_permutation[] is filled with the matching
+ * coefficient order: bits 3..5 of the index are kept, while the low three
+ * bits are rotated right by one (bits 1..2 move to bits 0..1, bit 0 moves to
+ * bit 2). The H.263 dequantizer is replaced unconditionally.
+ *
+ * @param s context whose function pointers and permutation table are set
+ */
+void MPV_common_init_mmi(MpegEncContext *s)
+{
+    if (s->avctx->idct_algo == FF_IDCT_AUTO) {
+        int pos;
+
+        s->idct_put = ff_mmi_idct_put;
+        s->idct_add = ff_mmi_idct_add;
+
+        for (pos = 0; pos < 64; pos++) {
+            const int row_bits   = pos & 0x38;
+            const int upper_bits = (pos & 6) >> 1;
+            const int low_bit    = (pos & 1) << 2;
+
+            s->idct_permutation[pos] = row_bits | upper_bits | low_bit;
+        }
+    }
+
+    s->dct_unquantize_h263 = dct_unquantize_h263_mmi;
+}