Commit 5917d17c authored by Leon van Stuivenberg, committed by Michael Niedermayer

ps2 optimizations update patch by (Leon van Stuivenberg <leonvs at iae dot nl>)

Originally committed as revision 996 to svn://svn.ffmpeg.org/ffmpeg/trunk
parent a46a3ce4
......@@ -77,7 +77,7 @@ OBJS += ppc/dsputil_ppc.o
endif
ifeq ($(TARGET_MMI),yes)
OBJS += ps2/dsputil_mmi.o ps2/idct_mmi.o
OBJS += ps2/dsputil_mmi.o ps2/idct_mmi.o ps2/mpegvideo_mmi.o
endif
ifeq ($(TARGET_ALTIVEC),yes)
......
......@@ -216,6 +216,9 @@ int MPV_common_init(MpegEncContext *s)
#ifdef HAVE_MLIB
MPV_common_init_mlib(s);
#endif
#ifdef HAVE_MMI
MPV_common_init_mmi(s);
#endif
/* load & permutate scantables
......
......@@ -503,6 +503,9 @@ void MPV_common_init_axp(MpegEncContext *s);
#ifdef HAVE_MLIB
void MPV_common_init_mlib(MpegEncContext *s);
#endif
#ifdef HAVE_MMI
void MPV_common_init_mmi(MpegEncContext *s);
#endif
extern void (*draw_edges)(UINT8 *buf, int wrap, int width, int height, int w);
void ff_conceal_past_errors(MpegEncContext *s, int conceal_all);
void ff_copy_bits(PutBitContext *pb, UINT8 *src, int length);
......
......@@ -20,96 +20,113 @@
*/
#include "../dsputil.h"
void ff_mmi_idct(DCTELEM * block);
#include "mmi.h"
/* the provided 'as' in binutils 2.9EE doesn't support
the EE's mips3 instructions properly */
#define AS_BUGGY
static void clear_blocks_mmi(DCTELEM * blocks)
{
/* $4 = blocks */
int i;
for (i = 0; i < 6; i++) {
sq($0, 0, $4);
sq($0, 16, $4);
sq($0, 32, $4);
sq($0, 48, $4);
sq($0, 64, $4);
sq($0, 80, $4);
sq($0, 96, $4);
sq($0, 112, $4);
__asm__ __volatile__("addi $4, $4, 128");
asm volatile(
"sq $0, 0(%0) \n\t"
"sq $0, 16(%0) \n\t"
"sq $0, 32(%0) \n\t"
"sq $0, 48(%0) \n\t"
"sq $0, 64(%0) \n\t"
"sq $0, 80(%0) \n\t"
"sq $0, 96(%0) \n\t"
"sq $0, 112(%0) \n\t" :: "r" (blocks) : "memory" );
blocks += 64;
}
}
static void put_pixels_clamped_mmi(const DCTELEM * block, UINT8 * pixels,
int line_size)
static void get_pixels_mmi(DCTELEM *block, const UINT8 *pixels, int line_size)
{
/* $4 = block, $5 = pixels, $6 = line_size */
__asm__ __volatile__("li $11, 255":::"$11");
lq($4, 0, $12);
pcpyld($11, $11, $11);
pcpyh($11, $11);
#define PUT(rs) \
ppacb($0, $##rs, $##rs); \
sd3(rs, 0, 5); \
__asm__ __volatile__ ("add $5, $5, $6");
pminh($12, $11, $12);
pmaxh($12, $0, $12);
lq($4, 16, $13);
PUT(12);
pminh($13, $11, $13);
pmaxh($13, $0, $13);
lq($4, 32, $12);
PUT(13);
pminh($12, $11, $12);
pmaxh($12, $0, $12);
lq($4, 48, $13);
PUT(12);
pminh($13, $11, $13);
pmaxh($13, $0, $13);
lq($4, 64, $12);
PUT(13);
pminh($12, $11, $12);
pmaxh($12, $0, $12);
lq($4, 80, $13);
PUT(12);
pminh($13, $11, $13);
pmaxh($13, $0, $13);
lq($4, 96, $12);
PUT(13);
int i;
for(i=0;i<8;i++) {
#ifdef AS_BUGGY
ld3(5, 0, 8);
asm volatile(
"add %1, %1, %2 \n\t"
"pextlb $8, $0, $8 \n\t"
"sq $8, 0(%0) \n\t" :: "r" (block), "r" (pixels), "r" (line_size) : "$8", "memory" );
#else
asm volatile(
"ld $8, 0(%1) \n\t"
"add %1, %1, %2 \n\t"
"pextlb $8, $0, $8 \n\t"
"sq $8, 0(%0) \n\t" :: "r" (block), "r" (pixels), "r" (line_size) : "$8", "memory" );
#endif
block += 8;
}
}
pminh($12, $11, $12);
pmaxh($12, $0, $12);
lq($4, 112, $13);
PUT(12);
pminh($13, $11, $13);
pmaxh($13, $0, $13);
PUT(13);
/**
 * Copy an 8-byte-wide block of h rows from pixels to block.
 *
 * Every row is fetched as one doubleword with an ldr/ldl pair (offsets 0
 * and 7) and written with a single sd; both pointers are then advanced by
 * line_size.
 *
 * @param block     destination rows (NOTE(review): stored with sd, so
 *                  presumably has to be 8-byte aligned - confirm)
 * @param pixels    source rows
 * @param line_size byte stride applied to both pointers after each row
 * @param h         number of rows to copy
 */
static void put_pixels8_mmi(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    int i;
    for(i=0; i<h; i++) {
#ifdef AS_BUGGY
        /* Hand-encoded path: the register numbers are literal, so this only
           works while block/pixels/line_size are still held in $4/$5/$6
           (the convention the other routines in this file document as
           "$4 = block, $5 = pixels, $6 = line_size").
           NOTE(review): none of these statements tells the compiler that
           $4, $5 or $8 are modified - confirm the generated code. */
        ldr3(5, 0, 8);
        ldl3(5, 7, 8);
        asm volatile ( "add $5, $5, $6 \n\t" );
        sd3(8, 0, 4);
        asm volatile ( "add $4, $4, $6 \n\t" );
#else
        /* block and pixels are incremented inside the asm, so they are
           read-write ("+r") operands; declaring them input-only would let
           the compiler assume their registers are unchanged. */
        asm volatile(
            "ldr $8, 0(%1) \n\t"
            "ldl $8, 7(%1) \n\t"
            "add %1, %1, %2 \n\t"
            "sd $8, 0(%0) \n\t"
            "add %0, %0, %2 \n\t"
            : "+r" (block), "+r" (pixels)
            : "r" (line_size)
            : "$8", "memory" );
#endif
    }
}
/* todo
static void add_pixels_clamped_mmi(const DCTELEM * block, UINT8 * pixels,
int line_size)
/**
 * Copy a 16-byte-wide block of h rows from pixels to block.
 *
 * Every row is fetched as two doublewords (ldr/ldl pairs at offsets 0/7 and
 * 8/15), merged into one quadword with pcpyld and written with a single sq;
 * both pointers are then advanced by line_size.
 *
 * @param block     destination rows (NOTE(review): stored with sq, so
 *                  presumably has to be 16-byte aligned - confirm)
 * @param pixels    source rows
 * @param line_size byte stride applied to both pointers after each row
 * @param h         number of rows to copy
 */
static void put_pixels16_mmi(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    int i;
    for(i=0; i<h; i++) {
#ifdef AS_BUGGY
        /* Hand-encoded path: the register numbers are literal, so this only
           works while block/pixels/line_size are still held in $4/$5/$6.
           NOTE(review): none of these statements tells the compiler that
           $4, $5, $8 or $9 are modified - confirm the generated code. */
        ldr3(5, 0, 8);
        ldl3(5, 7, 8);
        ldr3(5, 8, 9);
        ldl3(5, 15, 9);
        asm volatile ( "add $5, $5, $6 \n\t" );
        pcpyld($9, $8, $8);
        sq($8, 0, $4);
        asm volatile ( "add $4, $4, $6 \n\t" );
#else
        /* block and pixels are incremented inside the asm, so they are
           read-write ("+r") operands; declaring them input-only would let
           the compiler assume their registers are unchanged. */
        asm volatile (
            "ldr $8, 0(%1) \n\t"
            "ldl $8, 7(%1) \n\t"
            "ldr $9, 8(%1) \n\t"
            "ldl $9, 15(%1) \n\t"
            "add %1, %1, %2 \n\t"
            "pcpyld $8, $9, $8 \n\t"
            "sq $8, 0(%0) \n\t"
            "add %0, %0, %2 \n\t"
            : "+r" (block), "+r" (pixels)
            : "r" (line_size)
            : "$8", "$9", "memory" );
#endif
    }
}
*/
void dsputil_init_mmi(void)
{
put_pixels_clamped = put_pixels_clamped_mmi;
//add_pixels_clamped = add_pixels_clamped_mmi;
clear_blocks = clear_blocks_mmi;
ff_idct = ff_mmi_idct;
put_pixels_tab[1][0] = put_pixels8_mmi;
put_no_rnd_pixels_tab[1][0] = put_pixels8_mmi;
put_pixels_tab[0][0] = put_pixels16_mmi;
put_no_rnd_pixels_tab[0][0] = put_pixels16_mmi;
get_pixels = get_pixels_mmi;
}
This diff is collapsed.
......@@ -48,6 +48,20 @@
/* sq(reg, off, base): emits the single instruction "sq reg, off(base)".
   reg and base are pasted into the instruction text verbatim (e.g. $0, $4),
   off is passed as an immediate ("i") operand.  No outputs or clobbers are
   declared, so the compiler is not told about the memory that is written. */
#define sq(reg, off, base) \
__asm__ __volatile__ ("sq " #reg ", %0("#base ")" : : "i" (off) )
/*
#define ld(base, off, reg) \
__asm__ __volatile__ ("ld " #reg ", " #off "("#base ")")
*/
/* Hand-assembled ld / ldr / ldl, emitted as a raw ".word" so that they can
   be used with an assembler that does not handle these instructions itself.
   base and reg are plain register NUMBERS (5, not $5); off is the byte
   offset.  The word is built as
       opcode | base << 21 | reg << 16 | (off & 0xffff)
   Every argument is parenthesized and off is masked to its 16-bit field so
   that an expression argument or a negative offset cannot spill into the
   opcode/base/reg bits.  No outputs or clobbers are declared: the compiler
   is not told that register "reg" is overwritten. */
#define ld3(base, off, reg) \
        __asm__ __volatile__ (".word %0" : : "i" ( 0xdc000000 | ((base)<<21) | ((reg)<<16) | ((off) & 0xffff)))
#define ldr3(base, off, reg) \
        __asm__ __volatile__ (".word %0" : : "i" ( 0x6c000000 | ((base)<<21) | ((reg)<<16) | ((off) & 0xffff)))
#define ldl3(base, off, reg) \
        __asm__ __volatile__ (".word %0" : : "i" ( 0x68000000 | ((base)<<21) | ((reg)<<16) | ((off) & 0xffff)))
/*
#define sd(reg, off, base) \
__asm__ __volatile__ ("sd " #reg ", " #off "("#base ")")
......@@ -116,5 +130,23 @@
#define pminh(rs, rt, rd) \
__asm__ __volatile__ ("pminh " #rd ", " #rs ", " #rt )
#define pinteh(rs, rt, rd) \
__asm__ __volatile__ ("pinteh " #rd ", " #rs ", " #rt )
#define paddh(rs, rt, rd) \
__asm__ __volatile__ ("paddh " #rd ", " #rs ", " #rt )
#define psubh(rs, rt, rd) \
__asm__ __volatile__ ("psubh " #rd ", " #rs ", " #rt )
#define psrah(rt, sa, rd) \
__asm__ __volatile__ ("psrah " #rd ", " #rt ", %0" : : "i"(sa) )
#define pmfhl_uw(rd) \
__asm__ __volatile__ ("pmfhl.uw " #rd)
#define pextlb(rs, rt, rd) \
__asm__ __volatile__ ("pextlb " #rd ", " #rs ", " #rt )
#endif
/*
* Copyright (c) 2000,2001 Fabrice Bellard.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* MMI optimization by Leon van Stuivenberg <leonvs@iae.nl>
*/
#include "../dsputil.h"
#include "../mpegvideo.h"
#include "../avcodec.h"
void ff_mmi_idct_put(UINT8 *dest, int line_size, DCTELEM *block);
void ff_mmi_idct_add(UINT8 *dest, int line_size, DCTELEM *block);
/**
 * H.263-style inverse quantization of one block, eight coefficients per
 * loop iteration.
 *
 * For every coefficient the asm loop computes
 *     level == 0 ? 0 : level * qmul + (level < 0 ? -qadd : qadd)
 * with qmul = 2 * qscale and qadd = (qscale - 1) | 1 (qadd is forced to 0
 * for intra blocks when s->h263_aic is set).
 *
 * For intra blocks the DC value is computed separately in C before the loop
 * and written back to block[0] afterwards, overriding whatever the vector
 * loop stored there.
 *
 * @param s      codec context; reads block_last_index[n], mb_intra,
 *               h263_aic, y_dc_scale, c_dc_scale and
 *               intra_scantable.raster_end
 * @param block  coefficients, modified in place (NOTE(review): accessed with
 *               lq/sq, so presumably has to be 16-byte aligned - confirm)
 * @param n      block index; n < 4 selects y_dc_scale, otherwise c_dc_scale
 * @param qscale quantizer scale
 */
static void dct_unquantize_h263_mmi(MpegEncContext *s,
                                    DCTELEM *block, int n, int qscale)
{
    int level=0, qmul, qadd;
    int nCoeffs;

    assert(s->block_last_index[n]>=0);

    qadd = (qscale - 1) | 1;
    qmul = qscale << 1;

    if (s->mb_intra) {
        if (!s->h263_aic) {
            if (n < 4)
                level = block[0] * s->y_dc_scale;
            else
                level = block[0] * s->c_dc_scale;
        }else {
            qadd = 0;
            level = block[0];
        }
        nCoeffs= 63; //does not always use zigzag table
    } else {
        nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
    }

    /* Operands: %0 = nCoeffs (read-write loop counter), %1 = qmul,
       %2 = qadd, %3 = block.
       nCoeffs is decremented inside the loop, so it has to be a "+r"
       operand rather than a plain input.  qmul, qadd and block are all
       consumed before the first write to %0. */
    asm volatile(
        "add $14, $0, %3 \n\t"
        "pcpyld $8, %1, %1 \n\t"
        "pcpyh $8, $8 \n\t" //r8 = qmul
        "pcpyld $9, %2, %2 \n\t"
        "pcpyh $9, $9 \n\t" //r9 = qadd
        ".p2align 2 \n\t"
        "1: \n\t"
        "lq $10, 0($14) \n\t" //r10 = level
        "addi $14, $14, 16 \n\t" //block+=8
        "addi %0, %0, -8 \n\t"
        "pcgth $11, $0, $10 \n\t" //r11 = level < 0 ? -1 : 0
        "pcgth $12, $10, $0 \n\t" //r12 = level > 0 ? -1 : 0
        "por $12, $11, $12 \n\t" //r12 = level != 0 ? -1 : 0
        "pmulth $10, $10, $8 \n\t"
        "paddh $13, $9, $11 \n\t"
        "pxor $13, $13, $11 \n\t" //r13 = level < 0 ? -qadd : qadd
        "pmfhl.uw $11 \n\t"
        "pinteh $10, $11, $10 \n\t" //r10 = level * qmul
        "paddh $10, $10, $13 \n\t"
        "pand $10, $10, $12 \n\t" //zero coefficients stay zero
        "sq $10, -16($14) \n\t"
        "bgez %0, 1b \n\t"
        : "+r" (nCoeffs)
        : "r"(qmul), "r" (qadd), "r" (block)
        : "$8", "$9", "$10", "$11", "$12", "$13", "$14", "memory" );

    if(s->mb_intra)
        block[0]= level;
}
/**
 * Install the MMI routines into an MpegEncContext.
 *
 * The H.263 dequantizer is always replaced.  The IDCT put/add callbacks and
 * the matching coefficient permutation are installed only when the codec
 * context asks for automatic IDCT selection (FF_IDCT_AUTO).
 *
 * @param s context to patch; s->avctx->idct_algo is read
 */
void MPV_common_init_mmi(MpegEncContext *s)
{
    const int idct_algo = s->avctx->idct_algo;
    int pos;

    s->dct_unquantize_h263 = dct_unquantize_h263_mmi;

    if (idct_algo != FF_IDCT_AUTO)
        return;

    s->idct_put = ff_mmi_idct_put;
    s->idct_add = ff_mmi_idct_add;

    /* Bits 3-5 of the index are kept; the low three bits are rotated right
       by one (bits 1-2 move down, bit 0 moves up to bit 2). */
    for (pos = 0; pos < 64; pos++) {
        const int kept    = pos & 0x38;
        const int shifted = (pos & 6) >> 1;
        const int wrapped = (pos & 1) << 2;

        s->idct_permutation[pos] = kept | shifted | wrapped;
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment