Commit 053dea12 authored by Aurelien Jacobs, committed by Michael Niedermayer

adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64

patch by (Aurelien Jacobs <aurel at gnuage dot org>)

Originally committed as revision 3578 to svn://svn.ffmpeg.org/ffmpeg/trunk
parent 3ba1438d
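The pattern that repeats through this patch: inline assembly that hardwired 32-bit register names (%%eax, %%ebx, ...) is rewritten against macros that expand to the 64-bit names on x86_64, so one asm template serves both targets. A minimal sketch of the idiom follows; the zero8() helper is hypothetical (not part of the commit) and assumes MMX is available:

#include <stdint.h>

/* register-name macros as introduced below in mmx.h / cputest.c */
#ifdef ARCH_X86_64
#  define REG_a "rax"
#else
#  define REG_a "eax"
#endif

/* hypothetical helper: clear 8*n bytes (n > 0) with the same
 * negative-index loop shape the SAD routines in this patch use */
static inline void zero8(uint8_t *p, long n)
{
    long i = -8*n;                            /* counts up toward zero */
    __asm__ volatile(
        "pxor %%mm7, %%mm7             \n\t"  /* mm7 = 0 */
        "1:                            \n\t"
        "movq %%mm7, (%1, %%"REG_a")   \n\t"  /* store 8 zero bytes */
        "add $8, %%"REG_a"             \n\t"  /* suffix-less: width follows the register */
        " js 1b                        \n\t"
        : "+a" (i)
        : "r" (p + 8*n)                       /* base biased past the end */
        : "memory");
    __asm__ volatile("emms");                 /* leave the FPU/MMX state clean */
}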
......@@ -106,6 +106,14 @@ case "$cpu" in
i386|i486|i586|i686|i86pc|BePC)
cpu="x86"
;;
x86_64)
if [ "`$cc -dumpmachine | grep x86_64 | cut -d- -f1`" = "x86_64" -a \
-z "`echo $CFLAGS | grep -- -m32`" ]; then
cpu="x86_64"
else
cpu="x86"
fi
;;
# armv4l is a subset of armv5tel
armv4l|armv5tel)
cpu="armv4l"
......@@ -500,7 +508,7 @@ fi
# compute mmx state
if test $mmx = "default"; then
if test $cpu = "x86"; then
if test $cpu = "x86" -o $cpu = "x86_64"; then
mmx="yes"
else
mmx="no"
......@@ -827,6 +835,7 @@ done
# test gcc version to see if vector builtins can be used
# currently only used on i386 for MMX builtins
cat > $TMPC << EOF
#include <xmmintrin.h>
int main(void) {
#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 2)
return 0;
......@@ -985,7 +994,7 @@ echo "CPU $cpu ($tune)"
echo "Big Endian $bigendian"
echo "inttypes.h $inttypes"
echo "broken inttypes.h $emu_fast_int"
if test $cpu = "x86"; then
if test $cpu = "x86" -o $cpu = "x86_64"; then
echo "MMX enabled $mmx"
echo "Vector Builtins $builtin_vector"
fi
......@@ -1074,6 +1083,9 @@ echo "TARGET_OS=$TARGET_OS" >> config.mak
if test "$cpu" = "x86" ; then
echo "TARGET_ARCH_X86=yes" >> config.mak
echo "#define ARCH_X86 1" >> $TMPH
elif test "$cpu" = "x86_64" ; then
echo "TARGET_ARCH_X86_64=yes" >> config.mak
echo "#define ARCH_X86_64 1" >> $TMPH
elif test "$cpu" = "armv4l" ; then
echo "TARGET_ARCH_ARMV4L=yes" >> config.mak
echo "#define ARCH_ARMV4L 1" >> $TMPH
......
......@@ -10,17 +10,23 @@
#include <byteswap.h>
#else
#ifdef ARCH_X86
static inline unsigned short ByteSwap16(unsigned short x)
#ifdef ARCH_X86_64
# define LEGACY_REGS "=Q"
#else
# define LEGACY_REGS "=q"
#endif
#if defined(ARCH_X86) || defined(ARCH_X86_64)
static inline uint16_t ByteSwap16(uint16_t x)
{
__asm("xchgb %b0,%h0" :
"=q" (x) :
LEGACY_REGS (x) :
"0" (x));
return x;
}
#define bswap_16(x) ByteSwap16(x)
static inline unsigned int ByteSwap32(unsigned int x)
static inline uint32_t ByteSwap32(uint32_t x)
{
#if __CPU__ > 386
__asm("bswap %0":
......@@ -29,21 +35,28 @@ static inline unsigned int ByteSwap32(unsigned int x)
__asm("xchgb %b0,%h0\n"
" rorl $16,%0\n"
" xchgb %b0,%h0":
"=q" (x) :
LEGACY_REGS (x) :
#endif
"0" (x));
return x;
}
#define bswap_32(x) ByteSwap32(x)
static inline unsigned long long int ByteSwap64(unsigned long long int x)
static inline uint64_t ByteSwap64(uint64_t x)
{
#ifdef ARCH_X86_64
__asm("bswap %0":
"=r" (x) :
"0" (x));
return x;
#else
register union { __extension__ uint64_t __ll;
uint32_t __l[2]; } __x;
asm("xchgl %0,%1":
"=r"(__x.__l[0]),"=r"(__x.__l[1]):
"0"(bswap_32((unsigned long)x)),"1"(bswap_32((unsigned long)(x>>32))));
"0"(bswap_32((uint32_t)x)),"1"(bswap_32((uint32_t)(x>>32))));
return __x.__ll;
#endif
}
#define bswap_64(x) ByteSwap64(x)
......
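Why LEGACY_REGS: the %h0 operand names a high-byte register (ah/bh/ch/dh), which exists only for the four legacy registers. On i386 the "q" constraint already restricts to those; on x86_64 "q" means any register, so the stricter "Q" constraint is needed to keep %h0 valid. A quick self-check of the macros (a sketch assuming the header above is in scope; the constants are arbitrary):

#include <assert.h>
#include <stdint.h>

/* assumes bswap_16/32/64 from the header above */
int main(void)
{
    assert(bswap_16((uint16_t)0x1234) == 0x3412);
    assert(bswap_32((uint32_t)0x12345678) == 0x78563412);
    assert(bswap_64(0x0123456789abcdefULL) == 0xefcdab8967452301ULL);
    return 0;
}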
......@@ -254,7 +254,7 @@ inline void dprintf(const char* fmt,...) {}
extern const uint32_t inverse[256];
#ifdef ARCH_X86
#if defined(ARCH_X86) || defined(ARCH_X86_64)
# define FASTDIV(a,b) \
({\
int ret,dmy;\
......@@ -271,7 +271,7 @@ extern const uint32_t inverse[256];
# define FASTDIV(a,b) ((a)/(b))
#endif
#ifdef ARCH_X86
#if defined(ARCH_X86) || defined(ARCH_X86_64)
// avoid +32 for shift optimization (gcc should do that ...)
static inline int32_t NEG_SSR32( int32_t a, int8_t s){
asm ("sarl %1, %0\n\t"
......@@ -390,7 +390,7 @@ typedef struct RL_VLC_ELEM {
#endif
/* used to avoid misaligned exceptions on some archs (alpha, ...) */
#ifdef ARCH_X86
#if defined(ARCH_X86) || defined(ARCH_X86_64)
# define unaligned32(a) (*(const uint32_t*)(a))
#else
# ifdef __GNUC__
......@@ -460,7 +460,7 @@ static inline void put_bits(PutBitContext *s, int n, unsigned int value)
static inline void put_bits(PutBitContext *s, int n, unsigned int value)
{
# ifdef ALIGNED_BITSTREAM_WRITER
# ifdef ARCH_X86
# if defined(ARCH_X86) || defined(ARCH_X86_64)
asm volatile(
"movl %0, %%ecx \n\t"
"xorl %%eax, %%eax \n\t"
......@@ -491,7 +491,7 @@ static inline void put_bits(PutBitContext *s, int n, unsigned int value)
s->index= index;
# endif
# else //ALIGNED_BITSTREAM_WRITER
# ifdef ARCH_X86
# if defined(ARCH_X86) || defined(ARCH_X86_64)
asm volatile(
"movl $7, %%ecx \n\t"
"andl %0, %%ecx \n\t"
......@@ -738,7 +738,7 @@ static inline int get_bits_count(GetBitContext *s){
name##_bit_count-= 32;\
}\
#ifdef ARCH_X86
#if defined(ARCH_X86) || defined(ARCH_X86_64)
# define SKIP_CACHE(name, gb, num)\
asm(\
"shldl %2, %1, %0 \n\t"\
......@@ -1218,7 +1218,7 @@ static inline int ff_get_fourcc(const char *s){
#define MKBETAG(a,b,c,d) (d | (c << 8) | (b << 16) | (a << 24))
#ifdef ARCH_X86
#if defined(ARCH_X86) || defined(ARCH_X86_64)
#define MASK_ABS(mask, level)\
asm volatile(\
"cdq \n\t"\
......@@ -1252,7 +1252,7 @@ if((y)<(x)){\
}
#endif
#ifdef ARCH_X86
#if defined(ARCH_X86) || defined(ARCH_X86_64)
static inline long long rdtsc(void)
{
long long l;
......
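FASTDIV on x86 replaces the division with one mull against a reciprocal from the inverse[] table, keeping the high 32 bits of the 64-bit product. A portable sketch of the same computation (assuming inverse[b] holds the 2^32-scaled reciprocal of b, as in libavcodec's table):

#include <stdint.h>

/* sketch: what the FASTDIV asm computes, in plain C */
static inline uint32_t fastdiv_c(uint32_t a, uint32_t b, const uint32_t *inverse)
{
    return (uint32_t)(((uint64_t)a * inverse[b]) >> 32);  /* == a / b for b in table range */
}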
......@@ -4,12 +4,20 @@
#include <stdlib.h>
#include "../dsputil.h"
#ifdef ARCH_X86_64
# define REG_b "rbx"
# define REG_S "rsi"
#else
# define REG_b "ebx"
# define REG_S "esi"
#endif
/* ebx saving is necessary for PIC. gcc seems unable to see it alone */
#define cpuid(index,eax,ebx,ecx,edx)\
__asm __volatile\
("movl %%ebx, %%esi\n\t"\
("mov %%"REG_b", %%"REG_S"\n\t"\
"cpuid\n\t"\
"xchgl %%ebx, %%esi"\
"xchg %%"REG_b", %%"REG_S\
: "=a" (eax), "=S" (ebx),\
"=c" (ecx), "=d" (edx)\
: "0" (index));
......@@ -24,7 +32,7 @@ int mm_support(void)
/* See if CPUID instruction is supported ... */
/* ... Get copies of EFLAGS into eax and ecx */
"pushf\n\t"
"popl %0\n\t"
"pop %0\n\t"
"movl %0, %1\n\t"
/* ... Toggle the ID bit in one copy and store */
......@@ -35,7 +43,7 @@ int mm_support(void)
/* ... Get the (hopefully modified) EFLAGS */
"pushf\n\t"
"popl %0\n\t"
"pop %0\n\t"
: "=a" (eax), "=c" (ecx)
:
: "cc"
......
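Two separate fixes meet in this file: cpuid must not clobber ebx/rbx behind gcc's back (ebx is the PIC register on i386, hence the save/restore through REG_S), and popl becomes suffix-less pop because 64-bit mode only pops EFLAGS as a 64-bit value. A hedged usage sketch of the cpuid macro (leaf 0 returns the vendor string in ebx, edx, ecx):

#include <stdio.h>
#include <string.h>

/* assumes the cpuid() macro from the file above is in scope */
int main(void)
{
    int eax, ebx, ecx, edx;
    char vendor[13];
    cpuid(0, eax, ebx, ecx, edx);    /* leaf 0: max leaf in eax, vendor id */
    memcpy(vendor + 0, &ebx, 4);
    memcpy(vendor + 4, &edx, 4);
    memcpy(vendor + 8, &ecx, 4);
    vendor[12] = '\0';
    printf("max leaf %d, vendor '%s'\n", eax, vendor);
    return 0;
}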
......@@ -47,13 +47,13 @@ static const int16_t ocos_4_16[4] ATTR_ALIGN(8) = {
23170, 23170, 23170, 23170, //cos * (2<<15) + 0.5
};
static const long long fdct_one_corr ATTR_ALIGN(8) = 0x0001000100010001LL;
static const int64_t fdct_one_corr ATTR_ALIGN(8) = 0x0001000100010001LL;
static const long fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW };
static const int32_t fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW };
struct
{
const long fdct_r_row_sse2[4] ATTR_ALIGN(16);
const int32_t fdct_r_row_sse2[4] ATTR_ALIGN(16);
} fdct_r_row_sse2 ATTR_ALIGN(16)=
{{
RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW
......
......@@ -5,6 +5,12 @@
#ifndef AVCODEC_I386MMX_H
#define AVCODEC_I386MMX_H
#ifdef ARCH_X86_64
# define REG_a "rax"
#else
# define REG_a "eax"
#endif
/*
* The type of a value that fits in an MMX register (note that long
* long constant values MUST be suffixed by LL and unsigned long long
......
......@@ -20,6 +20,7 @@
* mostly by Michael Niedermayer <michaelni@gmx.at>
*/
#include "../dsputil.h"
#include "mmx.h"
static const __attribute__ ((aligned(8))) uint64_t round_tab[3]={
0x0000000000000000ULL,
......@@ -31,19 +32,19 @@ static attribute_used __attribute__ ((aligned(8))) uint64_t bone= 0x010101010101
static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
int len= -(stride*h);
long len= -(stride*h);
asm volatile(
".balign 16 \n\t"
"1: \n\t"
"movq (%1, %%eax), %%mm0 \n\t"
"movq (%2, %%eax), %%mm2 \n\t"
"movq (%2, %%eax), %%mm4 \n\t"
"addl %3, %%eax \n\t"
"movq (%1, %%"REG_a"), %%mm0 \n\t"
"movq (%2, %%"REG_a"), %%mm2 \n\t"
"movq (%2, %%"REG_a"), %%mm4 \n\t"
"add %3, %%"REG_a" \n\t"
"psubusb %%mm0, %%mm2 \n\t"
"psubusb %%mm4, %%mm0 \n\t"
"movq (%1, %%eax), %%mm1 \n\t"
"movq (%2, %%eax), %%mm3 \n\t"
"movq (%2, %%eax), %%mm5 \n\t"
"movq (%1, %%"REG_a"), %%mm1 \n\t"
"movq (%2, %%"REG_a"), %%mm3 \n\t"
"movq (%2, %%"REG_a"), %%mm5 \n\t"
"psubusb %%mm1, %%mm3 \n\t"
"psubusb %%mm5, %%mm1 \n\t"
"por %%mm2, %%mm0 \n\t"
......@@ -58,116 +59,116 @@ static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
"paddw %%mm3, %%mm2 \n\t"
"paddw %%mm2, %%mm0 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"addl %3, %%eax \n\t"
"add %3, %%"REG_a" \n\t"
" js 1b \n\t"
: "+a" (len)
: "r" (blk1 - len), "r" (blk2 - len), "r" (stride)
: "r" (blk1 - len), "r" (blk2 - len), "r" ((long)stride)
);
}
static inline void sad8_1_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
int len= -(stride*h);
long len= -(stride*h);
asm volatile(
".balign 16 \n\t"
"1: \n\t"
"movq (%1, %%eax), %%mm0 \n\t"
"movq (%2, %%eax), %%mm2 \n\t"
"movq (%1, %%"REG_a"), %%mm0 \n\t"
"movq (%2, %%"REG_a"), %%mm2 \n\t"
"psadbw %%mm2, %%mm0 \n\t"
"addl %3, %%eax \n\t"
"movq (%1, %%eax), %%mm1 \n\t"
"movq (%2, %%eax), %%mm3 \n\t"
"add %3, %%"REG_a" \n\t"
"movq (%1, %%"REG_a"), %%mm1 \n\t"
"movq (%2, %%"REG_a"), %%mm3 \n\t"
"psadbw %%mm1, %%mm3 \n\t"
"paddw %%mm3, %%mm0 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"addl %3, %%eax \n\t"
"add %3, %%"REG_a" \n\t"
" js 1b \n\t"
: "+a" (len)
: "r" (blk1 - len), "r" (blk2 - len), "r" (stride)
: "r" (blk1 - len), "r" (blk2 - len), "r" ((long)stride)
);
}
static inline void sad8_2_mmx2(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
{
int len= -(stride*h);
long len= -(stride*h);
asm volatile(
".balign 16 \n\t"
"1: \n\t"
"movq (%1, %%eax), %%mm0 \n\t"
"movq (%2, %%eax), %%mm2 \n\t"
"movq (%1, %%"REG_a"), %%mm0 \n\t"
"movq (%2, %%"REG_a"), %%mm2 \n\t"
"pavgb %%mm2, %%mm0 \n\t"
"movq (%3, %%eax), %%mm2 \n\t"
"movq (%3, %%"REG_a"), %%mm2 \n\t"
"psadbw %%mm2, %%mm0 \n\t"
"addl %4, %%eax \n\t"
"movq (%1, %%eax), %%mm1 \n\t"
"movq (%2, %%eax), %%mm3 \n\t"
"add %4, %%"REG_a" \n\t"
"movq (%1, %%"REG_a"), %%mm1 \n\t"
"movq (%2, %%"REG_a"), %%mm3 \n\t"
"pavgb %%mm1, %%mm3 \n\t"
"movq (%3, %%eax), %%mm1 \n\t"
"movq (%3, %%"REG_a"), %%mm1 \n\t"
"psadbw %%mm1, %%mm3 \n\t"
"paddw %%mm3, %%mm0 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"addl %4, %%eax \n\t"
"add %4, %%"REG_a" \n\t"
" js 1b \n\t"
: "+a" (len)
: "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" (stride)
: "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((long)stride)
);
}
static inline void sad8_4_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{ //FIXME reuse src
int len= -(stride*h);
long len= -(stride*h);
asm volatile(
".balign 16 \n\t"
"movq "MANGLE(bone)", %%mm5 \n\t"
"1: \n\t"
"movq (%1, %%eax), %%mm0 \n\t"
"movq (%2, %%eax), %%mm2 \n\t"
"movq 1(%1, %%eax), %%mm1 \n\t"
"movq 1(%2, %%eax), %%mm3 \n\t"
"movq (%1, %%"REG_a"), %%mm0 \n\t"
"movq (%2, %%"REG_a"), %%mm2 \n\t"
"movq 1(%1, %%"REG_a"), %%mm1 \n\t"
"movq 1(%2, %%"REG_a"), %%mm3 \n\t"
"pavgb %%mm2, %%mm0 \n\t"
"pavgb %%mm1, %%mm3 \n\t"
"psubusb %%mm5, %%mm3 \n\t"
"pavgb %%mm3, %%mm0 \n\t"
"movq (%3, %%eax), %%mm2 \n\t"
"movq (%3, %%"REG_a"), %%mm2 \n\t"
"psadbw %%mm2, %%mm0 \n\t"
"addl %4, %%eax \n\t"
"movq (%1, %%eax), %%mm1 \n\t"
"movq (%2, %%eax), %%mm3 \n\t"
"movq 1(%1, %%eax), %%mm2 \n\t"
"movq 1(%2, %%eax), %%mm4 \n\t"
"add %4, %%"REG_a" \n\t"
"movq (%1, %%"REG_a"), %%mm1 \n\t"
"movq (%2, %%"REG_a"), %%mm3 \n\t"
"movq 1(%1, %%"REG_a"), %%mm2 \n\t"
"movq 1(%2, %%"REG_a"), %%mm4 \n\t"
"pavgb %%mm3, %%mm1 \n\t"
"pavgb %%mm4, %%mm2 \n\t"
"psubusb %%mm5, %%mm2 \n\t"
"pavgb %%mm1, %%mm2 \n\t"
"movq (%3, %%eax), %%mm1 \n\t"
"movq (%3, %%"REG_a"), %%mm1 \n\t"
"psadbw %%mm1, %%mm2 \n\t"
"paddw %%mm2, %%mm0 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"addl %4, %%eax \n\t"
"add %4, %%"REG_a" \n\t"
" js 1b \n\t"
: "+a" (len)
: "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), "r" (stride)
: "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), "r" ((long)stride)
);
}
static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
{
int len= -(stride*h);
long len= -(stride*h);
asm volatile(
".balign 16 \n\t"
"1: \n\t"
"movq (%1, %%eax), %%mm0 \n\t"
"movq (%2, %%eax), %%mm1 \n\t"
"movq (%1, %%eax), %%mm2 \n\t"
"movq (%2, %%eax), %%mm3 \n\t"
"movq (%1, %%"REG_a"), %%mm0 \n\t"
"movq (%2, %%"REG_a"), %%mm1 \n\t"
"movq (%1, %%"REG_a"), %%mm2 \n\t"
"movq (%2, %%"REG_a"), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"punpckhbw %%mm7, %%mm2 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
"paddw %%mm0, %%mm1 \n\t"
"paddw %%mm2, %%mm3 \n\t"
"movq (%3, %%eax), %%mm4 \n\t"
"movq (%3, %%eax), %%mm2 \n\t"
"movq (%3, %%"REG_a"), %%mm4 \n\t"
"movq (%3, %%"REG_a"), %%mm2 \n\t"
"paddw %%mm5, %%mm1 \n\t"
"paddw %%mm5, %%mm3 \n\t"
"psrlw $1, %%mm1 \n\t"
......@@ -181,21 +182,21 @@ static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int
"punpckhbw %%mm7, %%mm1 \n\t"
"paddw %%mm1, %%mm0 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"addl %4, %%eax \n\t"
"add %4, %%"REG_a" \n\t"
" js 1b \n\t"
: "+a" (len)
: "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" (stride)
: "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((long)stride)
);
}
static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
int len= -(stride*h);
long len= -(stride*h);
asm volatile(
".balign 16 \n\t"
"1: \n\t"
"movq (%1, %%eax), %%mm0 \n\t"
"movq (%2, %%eax), %%mm1 \n\t"
"movq (%1, %%"REG_a"), %%mm0 \n\t"
"movq (%2, %%"REG_a"), %%mm1 \n\t"
"movq %%mm0, %%mm4 \n\t"
"movq %%mm1, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
......@@ -204,8 +205,8 @@ static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
"punpckhbw %%mm7, %%mm2 \n\t"
"paddw %%mm1, %%mm0 \n\t"
"paddw %%mm2, %%mm4 \n\t"
"movq 1(%1, %%eax), %%mm2 \n\t"
"movq 1(%2, %%eax), %%mm3 \n\t"
"movq 1(%1, %%"REG_a"), %%mm2 \n\t"
"movq 1(%2, %%"REG_a"), %%mm3 \n\t"
"movq %%mm2, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t"
......@@ -216,8 +217,8 @@ static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
"punpckhbw %%mm7, %%mm4 \n\t"
"paddw %%mm3, %%mm2 \n\t"
"paddw %%mm4, %%mm1 \n\t"
"movq (%3, %%eax), %%mm3 \n\t"
"movq (%3, %%eax), %%mm4 \n\t"
"movq (%3, %%"REG_a"), %%mm3 \n\t"
"movq (%3, %%"REG_a"), %%mm4 \n\t"
"paddw %%mm5, %%mm2 \n\t"
"paddw %%mm5, %%mm1 \n\t"
"psrlw $2, %%mm2 \n\t"
......@@ -231,10 +232,10 @@ static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
"punpckhbw %%mm7, %%mm2 \n\t"
"paddw %%mm2, %%mm0 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"addl %4, %%eax \n\t"
"add %4, %%"REG_a" \n\t"
" js 1b \n\t"
: "+a" (len)
: "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" (stride)
: "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" ((long)stride)
);
}
......
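The recurring change above: len moves from int to long and stride is cast to (long), because both are added straight into the index register and used in (base, index) addressing; on x86_64 a 32-bit value would leave the upper half of the 64-bit register stale. The loop shape in portable C (a sketch, not part of the commit; stride and h are assumed positive):

#include <stdint.h>

/* C equivalent of the negative-index loop the SAD routines use */
static int sum8(const uint8_t *blk, int stride, int h)
{
    long len = -(long)stride * h;    /* index must be register-width */
    const uint8_t *base = blk - len; /* biased so base[len] == blk[0] */
    int sum = 0;
    while (len) {
        for (int i = 0; i < 8; i++)
            sum += base[len + i];    /* mirrors "(%1, %%"REG_a")" */
        len += stride;               /* mirrors "add %4, %%"REG_a"" */
    }
    return sum;
}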
......@@ -36,7 +36,8 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
DCTELEM *block, int n,
int qscale, int *overflow)
{
int level=0, last_non_zero_p1, q; //=0 because gcc says uninitialized ...
long last_non_zero_p1;
int level=0, q; //=0 because gcc says uninitialized ...
const uint16_t *qmat, *bias;
__align8 int16_t temp_block[64];
......@@ -90,18 +91,18 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
if(s->out_format == FMT_H263 && s->mpeg_quant==0){
asm volatile(
"movd %%eax, %%mm3 \n\t" // last_non_zero_p1
"movd %%"REG_a", %%mm3 \n\t" // last_non_zero_p1
SPREADW(%%mm3)
"pxor %%mm7, %%mm7 \n\t" // 0
"pxor %%mm4, %%mm4 \n\t" // 0
"movq (%2), %%mm5 \n\t" // qmat[0]
"pxor %%mm6, %%mm6 \n\t"
"psubw (%3), %%mm6 \n\t" // -bias[0]
"movl $-128, %%eax \n\t"
"mov $-128, %%"REG_a" \n\t"
".balign 16 \n\t"
"1: \n\t"
"pxor %%mm1, %%mm1 \n\t" // 0
"movq (%1, %%eax), %%mm0 \n\t" // block[i]
"movq (%1, %%"REG_a"), %%mm0 \n\t" // block[i]
"pcmpgtw %%mm0, %%mm1 \n\t" // block[i] <= 0 ? 0xFF : 0x00
"pxor %%mm1, %%mm0 \n\t"
"psubw %%mm1, %%mm0 \n\t" // ABS(block[i])
......@@ -110,13 +111,13 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
"por %%mm0, %%mm4 \n\t"
"pxor %%mm1, %%mm0 \n\t"
"psubw %%mm1, %%mm0 \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
"movq %%mm0, (%5, %%eax) \n\t"
"movq %%mm0, (%5, %%"REG_a") \n\t"
"pcmpeqw %%mm7, %%mm0 \n\t" // out==0 ? 0xFF : 0x00
"movq (%4, %%eax), %%mm1 \n\t"
"movq %%mm7, (%1, %%eax) \n\t" // 0
"movq (%4, %%"REG_a"), %%mm1 \n\t"
"movq %%mm7, (%1, %%"REG_a") \n\t" // 0
"pandn %%mm1, %%mm0 \n\t"
PMAXW(%%mm0, %%mm3)
"addl $8, %%eax \n\t"
"add $8, %%"REG_a" \n\t"
" js 1b \n\t"
"movq %%mm3, %%mm0 \n\t"
"psrlq $32, %%mm3 \n\t"
......@@ -124,8 +125,8 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
"movq %%mm3, %%mm0 \n\t"
"psrlq $16, %%mm3 \n\t"
PMAXW(%%mm0, %%mm3)
"movd %%mm3, %%eax \n\t"
"movzbl %%al, %%eax \n\t" // last_non_zero_p1
"movd %%mm3, %%"REG_a" \n\t"
"movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1
: "+a" (last_non_zero_p1)
: "r" (block+64), "r" (qmat), "r" (bias),
"r" (inv_zigzag_direct16+64), "r" (temp_block+64)
......@@ -142,32 +143,32 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
);
}else{ // FMT_H263
asm volatile(
"movd %%eax, %%mm3 \n\t" // last_non_zero_p1
"movd %%"REG_a", %%mm3 \n\t" // last_non_zero_p1
SPREADW(%%mm3)
"pxor %%mm7, %%mm7 \n\t" // 0
"pxor %%mm4, %%mm4 \n\t" // 0
"movl $-128, %%eax \n\t"
"mov $-128, %%"REG_a" \n\t"
".balign 16 \n\t"
"1: \n\t"
"pxor %%mm1, %%mm1 \n\t" // 0
"movq (%1, %%eax), %%mm0 \n\t" // block[i]
"movq (%1, %%"REG_a"), %%mm0 \n\t" // block[i]
"pcmpgtw %%mm0, %%mm1 \n\t" // block[i] <= 0 ? 0xFF : 0x00
"pxor %%mm1, %%mm0 \n\t"
"psubw %%mm1, %%mm0 \n\t" // ABS(block[i])
"movq (%3, %%eax), %%mm6 \n\t" // bias[0]
"movq (%3, %%"REG_a"), %%mm6 \n\t" // bias[0]
"paddusw %%mm6, %%mm0 \n\t" // ABS(block[i]) + bias[0]
"movq (%2, %%eax), %%mm5 \n\t" // qmat[i]
"movq (%2, %%"REG_a"), %%mm5 \n\t" // qmat[i]
"pmulhw %%mm5, %%mm0 \n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16
"por %%mm0, %%mm4 \n\t"
"pxor %%mm1, %%mm0 \n\t"
"psubw %%mm1, %%mm0 \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
"movq %%mm0, (%5, %%eax) \n\t"
"movq %%mm0, (%5, %%"REG_a") \n\t"
"pcmpeqw %%mm7, %%mm0 \n\t" // out==0 ? 0xFF : 0x00
"movq (%4, %%eax), %%mm1 \n\t"
"movq %%mm7, (%1, %%eax) \n\t" // 0
"movq (%4, %%"REG_a"), %%mm1 \n\t"
"movq %%mm7, (%1, %%"REG_a") \n\t" // 0
"pandn %%mm1, %%mm0 \n\t"
PMAXW(%%mm0, %%mm3)
"addl $8, %%eax \n\t"
"add $8, %%"REG_a" \n\t"
" js 1b \n\t"
"movq %%mm3, %%mm0 \n\t"
"psrlq $32, %%mm3 \n\t"
......@@ -175,8 +176,8 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
"movq %%mm3, %%mm0 \n\t"
"psrlq $16, %%mm3 \n\t"
PMAXW(%%mm0, %%mm3)
"movd %%mm3, %%eax \n\t"
"movzbl %%al, %%eax \n\t" // last_non_zero_p1
"movd %%mm3, %%"REG_a" \n\t"
"movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1
: "+a" (last_non_zero_p1)
: "r" (block+64), "r" (qmat+64), "r" (bias+64),
"r" (inv_zigzag_direct16+64), "r" (temp_block+64)
......
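Beyond the register renaming, the notable change here is dropping explicit operand-size suffixes (movzbl becomes movzb, addl becomes add, movl becomes mov) so the assembler infers the width from the register operand and the same template assembles for both targets. A sketch, compilable only on x86/x86_64 like the surrounding code and assuming REG_a from mmx.h:

/* suffix-less movzb assembles as movzbl on x86, movzbq on x86_64 */
static inline long low_byte(long x)
{
    __asm__("movzb %%al, %%"REG_a : "+a" (x));
    return x;
}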
......@@ -119,7 +119,7 @@ try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
# define always_inline inline
#endif
#ifdef ARCH_X86
#if defined(ARCH_X86) || defined(ARCH_X86_64)
static uint64_t __attribute__((aligned(8))) attribute_used w05= 0x0005000500050005LL;
static uint64_t __attribute__((aligned(8))) attribute_used w04= 0x0004000400040004LL;
static uint64_t __attribute__((aligned(8))) attribute_used w20= 0x0020002000200020LL;
......@@ -172,7 +172,7 @@ static char *replaceTable[]=
};
#ifdef ARCH_X86
#if defined(ARCH_X86) || defined(ARCH_X86_64)
static inline void prefetchnta(void *p)
{
asm volatile( "prefetchnta (%0)\n\t"
......@@ -597,7 +597,7 @@ static always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPC
#endif //HAVE_ALTIVEC
#endif //ARCH_POWERPC
#ifdef ARCH_X86
#if defined(ARCH_X86) || defined(ARCH_X86_64)
#if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
#define COMPILE_MMX
......@@ -616,13 +616,11 @@ static always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPC
#undef HAVE_MMX2
#undef HAVE_3DNOW
#undef HAVE_ALTIVEC
#undef ARCH_X86
#ifdef COMPILE_C
#undef HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#undef ARCH_X86
#define RENAME(a) a ## _C
#include "postprocess_template.c"
#endif
......@@ -643,7 +641,6 @@ static always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPC
#define HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#define ARCH_X86
#define RENAME(a) a ## _MMX
#include "postprocess_template.c"
#endif
......@@ -654,7 +651,6 @@ static always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPC
#define HAVE_MMX
#define HAVE_MMX2
#undef HAVE_3DNOW
#define ARCH_X86
#define RENAME(a) a ## _MMX2
#include "postprocess_template.c"
#endif
......@@ -665,7 +661,6 @@ static always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPC
#define HAVE_MMX
#undef HAVE_MMX2
#define HAVE_3DNOW
#define ARCH_X86
#define RENAME(a) a ## _3DNow
#include "postprocess_template.c"
#endif
......@@ -683,7 +678,7 @@ static inline void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int
// the difference wouldn't be measurable here but it's much better because
// someone might exchange the cpu without restarting mplayer ;)
#ifdef RUNTIME_CPUDETECT
#ifdef ARCH_X86
#if defined(ARCH_X86) || defined(ARCH_X86_64)
// ordered by speed, fastest first
if(c->cpuCaps & PP_CPU_CAPS_MMX2)
postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
......
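postprocess.c compiles its template several times, once per instruction set, by redefining RENAME before each #include; since ARCH_X86 is no longer undefined and redefined per variant, those lines are simply dropped. A minimal sketch of the multi-compile pattern (hypothetical doit/have_mmx names; the real code includes postprocess_template.c and dispatches on cpuCaps):

#include <stdio.h>

#define RENAME(a) a ## _C
static void RENAME(doit)(void) { puts("C version"); }   /* -> doit_C */
#undef RENAME

#define RENAME(a) a ## _MMX
static void RENAME(doit)(void) { puts("MMX version"); } /* -> doit_MMX */
#undef RENAME

int main(void)
{
    int have_mmx = 0;            /* would come from runtime CPU detection */
    if (have_mmx) doit_MMX();
    else          doit_C();
    return 0;
}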
......@@ -716,7 +716,7 @@ static inline int msmpeg4_pred_dc(MpegEncContext * s, int n,
necessitate to modify mpegvideo.c. The problem comes from the
fact they decided to store the quantized DC (which would lead
to problems if Q could vary !) */
#if defined ARCH_X86 && !defined PIC
#if (defined(ARCH_X86) || defined(ARCH_X86_64)) && !defined PIC
asm volatile(
"movl %3, %%eax \n\t"
"shrl $1, %%eax \n\t"
......