Commit 6ad1fa5a authored by Bernhard Rosenkränzer's avatar Bernhard Rosenkränzer Committed by Michael Niedermayer

Better ARM support for mplayer/ffmpeg, ported from atty fork

While playing with some new hardware, I found it's running a forked mplayer
 -- and it looks like they're following the GPL.

 The maintainer's page is here: http://atty.jp/?Zaurus/mplayer
 Unfortunately it's mostly in Japanese, so it's hard to figure out any
  details.

  Their code looks quite interesting (at least to those of us w/ ARM CPUs).

  The patches I've attached are the patches from atty.jp with a couple of
  modifications by myself:
  - ported to current CVS
  - reverted their change of removing SNOW support from ffmpeg
  - cleaned up their bswap mess
  - removed DOS-style linebreaks from various files

patch by (Bernhard Rosenkraenzer: bero, arklinux org)

Originally committed as revision 4311 to svn://svn.ffmpeg.org/ffmpeg/trunk
parent c66a4434
......@@ -316,8 +316,11 @@ endif
# armv4l specific stuff
ifeq ($(TARGET_ARCH_ARMV4L),yes)
ASM_OBJS += armv4l/jrevdct_arm.o armv4l/simple_idct_arm.o
ASM_OBJS += armv4l/jrevdct_arm.o armv4l/simple_idct_arm.o armv4l/dsputil_arm_s.o
OBJS += armv4l/dsputil_arm.o armv4l/mpegvideo_arm.o
ifeq ($(TARGET_IWMMXT),yes)
OBJS += armv4l/dsputil_iwmmxt.o armv4l/mpegvideo_iwmmxt.o
endif
endif
# sun mediaLib specific stuff
......@@ -327,6 +330,12 @@ OBJS += mlib/dsputil_mlib.o
CFLAGS += $(MLIB_INC)
endif
# Intel IPP specific stuff
# currently only works when libavcodec is used in mplayer
ifeq ($(HAVE_IPP),yes)
CFLAGS += $(IPP_INC)
endif
# alpha specific stuff
ifeq ($(TARGET_ARCH_ALPHA),yes)
OBJS += alpha/dsputil_alpha.o alpha/mpegvideo_alpha.o \
......
......@@ -18,6 +18,13 @@
*/
#include "../dsputil.h"
#ifdef HAVE_IPP
#include "ipp.h"
#endif
#ifdef HAVE_IWMMXT
extern void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx);
#endif
extern void j_rev_dct_ARM(DCTELEM *data);
extern void simple_idct_ARM(DCTELEM *data);
......@@ -26,6 +33,146 @@ extern void simple_idct_ARM(DCTELEM *data);
static void (*ff_put_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
static void (*ff_add_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
void put_pixels8_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
void put_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
void put_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
void put_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
void put_no_rnd_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
void put_no_rnd_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
void put_no_rnd_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
void put_pixels16_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
/* Runs put_pixels8_x2_arm on the left (columns 0-7) and then the right
 * (columns 8-15) half of a 16-column-wide block, with the same line_size
 * and h for both halves. */
static void put_pixels16_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    int col;

    for (col = 0; col < 16; col += 8)
        put_pixels8_x2_arm(block + col, pixels + col, line_size, h);
}
/* Runs put_pixels8_y2_arm on the left (columns 0-7) and then the right
 * (columns 8-15) half of a 16-column-wide block, with the same line_size
 * and h for both halves. */
static void put_pixels16_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    int col;

    for (col = 0; col < 16; col += 8)
        put_pixels8_y2_arm(block + col, pixels + col, line_size, h);
}
/* Runs put_pixels8_xy2_arm on the left (columns 0-7) and then the right
 * (columns 8-15) half of a 16-column-wide block, with the same line_size
 * and h for both halves. */
static void put_pixels16_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    int col;

    for (col = 0; col < 16; col += 8)
        put_pixels8_xy2_arm(block + col, pixels + col, line_size, h);
}
/* Runs put_no_rnd_pixels8_x2_arm on the left (columns 0-7) and then the
 * right (columns 8-15) half of a 16-column-wide block, with the same
 * line_size and h for both halves. */
static void put_no_rnd_pixels16_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    int col;

    for (col = 0; col < 16; col += 8)
        put_no_rnd_pixels8_x2_arm(block + col, pixels + col, line_size, h);
}
/* Runs put_no_rnd_pixels8_y2_arm on the left (columns 0-7) and then the
 * right (columns 8-15) half of a 16-column-wide block, with the same
 * line_size and h for both halves. */
static void put_no_rnd_pixels16_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    int col;

    for (col = 0; col < 16; col += 8)
        put_no_rnd_pixels8_y2_arm(block + col, pixels + col, line_size, h);
}
/* Runs put_no_rnd_pixels8_xy2_arm on the left (columns 0-7) and then the
 * right (columns 8-15) half of a 16-column-wide block, with the same
 * line_size and h for both halves. */
static void put_no_rnd_pixels16_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    int col;

    for (col = 0; col < 16; col += 8)
        put_no_rnd_pixels8_xy2_arm(block + col, pixels + col, line_size, h);
}
/**
 * Add 8 rows of 8 signed 16-bit coefficients to 8 rows of 8 destination
 * bytes and store the clamped byte results back into dest.
 *
 * @param block     64 consecutive coefficients (8 per row, 16 bytes per row)
 * @param dest      first destination row; each row is read and written as
 *                  two 32-bit words (dest and dest + 4)
 * @param line_size byte distance between consecutive destination rows
 *
 * The low byte of each loaded word is paired with the lower-indexed
 * coefficient, i.e. the code is written for little-endian byte order.
 *
 * Clamping: a sum is considered out of range when bit 8 (0x100) is set.
 * It is then replaced by the top byte of the inverted coefficient, which
 * is 0xFF for a non-negative coefficient and 0x00 for a negative one.
 * NOTE(review): sums whose bit 8 happens to be clear although they lie
 * outside 0..255 (e.g. 512 or -300) are not detected by this test.
 *
 * The asm advances both pointers, so they are declared as read-write
 * operands ("+r"); GCC requires that input-only operands are not modified.
 */
static void add_pixels_clamped_ARM(short *block, unsigned char *dest, int line_size)
{
    asm volatile (
        "mov r10, #8 \n\t"                 /* r10 = rows left */
        "1: \n\t"
        /* load dest[0..3] */
        "ldr r4, [%1] \n\t"
        /* block[0] and block[1] */
        "ldrsh r5, [%0] \n\t"
        "ldrsh r7, [%0, #2] \n\t"
        "and r6, r4, #0xFF \n\t"
        "and r8, r4, #0xFF00 \n\t"
        "add r6, r5, r6 \n\t"
        "add r8, r7, r8, lsr #8 \n\t"
        "mvn r5, r5 \n\t"                  /* ~coeff: top byte is the clamp value */
        "mvn r7, r7 \n\t"
        "tst r6, #0x100 \n\t"
        "movne r6, r5, lsr #24 \n\t"
        "tst r8, #0x100 \n\t"
        "movne r8, r7, lsr #24 \n\t"
        "mov r9, r6 \n\t"
        "ldrsh r5, [%0, #4] \n\t"          /* moved from [A] */
        "orr r9, r9, r8, lsl #8 \n\t"
        /* block[2] and block[3] */
        /* [A] */
        "ldrsh r7, [%0, #6] \n\t"
        "and r6, r4, #0xFF0000 \n\t"
        "and r8, r4, #0xFF000000 \n\t"
        "add r6, r5, r6, lsr #16 \n\t"
        "add r8, r7, r8, lsr #24 \n\t"
        "mvn r5, r5 \n\t"
        "mvn r7, r7 \n\t"
        "tst r6, #0x100 \n\t"
        "movne r6, r5, lsr #24 \n\t"
        "tst r8, #0x100 \n\t"
        "movne r8, r7, lsr #24 \n\t"
        "orr r9, r9, r6, lsl #16 \n\t"
        "ldr r4, [%1, #4] \n\t"            /* moved from [B] */
        "orr r9, r9, r8, lsl #24 \n\t"
        /* store dest[0..3] */
        "ldrsh r5, [%0, #8] \n\t"          /* moved from [C] */
        "str r9, [%1] \n\t"
        /* load dest[4..7] */
        /* [B] */
        /* block[4] and block[5] */
        /* [C] */
        "ldrsh r7, [%0, #10] \n\t"
        "and r6, r4, #0xFF \n\t"
        "and r8, r4, #0xFF00 \n\t"
        "add r6, r5, r6 \n\t"
        "add r8, r7, r8, lsr #8 \n\t"
        "mvn r5, r5 \n\t"
        "mvn r7, r7 \n\t"
        "tst r6, #0x100 \n\t"
        "movne r6, r5, lsr #24 \n\t"
        "tst r8, #0x100 \n\t"
        "movne r8, r7, lsr #24 \n\t"
        "mov r9, r6 \n\t"
        "ldrsh r5, [%0, #12] \n\t"         /* moved from [D] */
        "orr r9, r9, r8, lsl #8 \n\t"
        /* block[6] and block[7] */
        /* [D] */
        "ldrsh r7, [%0, #14] \n\t"
        "and r6, r4, #0xFF0000 \n\t"
        "and r8, r4, #0xFF000000 \n\t"
        "add r6, r5, r6, lsr #16 \n\t"
        "add r8, r7, r8, lsr #24 \n\t"
        "mvn r5, r5 \n\t"
        "mvn r7, r7 \n\t"
        "tst r6, #0x100 \n\t"
        "movne r6, r5, lsr #24 \n\t"
        "tst r8, #0x100 \n\t"
        "movne r8, r7, lsr #24 \n\t"
        "orr r9, r9, r6, lsl #16 \n\t"
        "add %0, %0, #16 \n\t"             /* moved from [E]: next coefficient row */
        "orr r9, r9, r8, lsl #24 \n\t"
        "subs r10, r10, #1 \n\t"           /* moved from [F]: sets Z for the bne below */
        /* store dest[4..7] */
        "str r9, [%1, #4] \n\t"
        /* [E] */
        /* [F] */
        "add %1, %1, %2 \n\t"              /* next destination row (does not touch flags) */
        "bne 1b \n\t"
        : "+r"(block),
          "+r"(dest)
        : "r"(line_size)
        : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc", "memory" );
}
/* XXX: those functions should be suppressed ASAP when all IDCTs are
converted */
static void j_rev_dct_ARM_put(uint8_t *dest, int line_size, DCTELEM *block)
......@@ -48,6 +195,34 @@ static void simple_idct_ARM_add(uint8_t *dest, int line_size, DCTELEM *block)
simple_idct_ARM (block);
ff_add_pixels_clamped(block, dest, line_size);
}
/* Hands block to ippiDCT8x8Inv_Video_16s_C1I when HAVE_IPP is defined.
 * Without HAVE_IPP the function body is empty and block is left untouched. */
static void simple_idct_ipp(DCTELEM *block)
{
#ifndef HAVE_IPP
    (void)block;
#else
    ippiDCT8x8Inv_Video_16s_C1I(block);
#endif
}
/* Hands block, dest and line_size to ippiDCT8x8Inv_Video_16s8u_C1R when
 * HAVE_IPP is defined. Without HAVE_IPP the function does nothing. */
static void simple_idct_ipp_put(uint8_t *dest, int line_size, DCTELEM *block)
{
#ifndef HAVE_IPP
    (void)dest;
    (void)line_size;
    (void)block;
#else
    ippiDCT8x8Inv_Video_16s8u_C1R(block, dest, line_size);
#endif
}
#ifdef HAVE_IWMMXT
void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size);
#endif
/* With HAVE_IPP: transforms block in place via ippiDCT8x8Inv_Video_16s_C1I,
 * then adds it to dest using add_pixels_clamped_iwmmxt (HAVE_IWMMXT) or
 * add_pixels_clamped_ARM (otherwise).
 * Without HAVE_IPP the function does nothing. */
static void simple_idct_ipp_add(uint8_t *dest, int line_size, DCTELEM *block)
{
#ifndef HAVE_IPP
    (void)dest;
    (void)line_size;
    (void)block;
#else
    ippiDCT8x8Inv_Video_16s_C1I(block);
#  ifndef HAVE_IWMMXT
    add_pixels_clamped_ARM(block, dest, line_size);
#  else
    add_pixels_clamped_iwmmxt(block, dest, line_size);
#  endif
#endif
}
void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx)
{
......@@ -56,7 +231,11 @@ void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx)
ff_put_pixels_clamped = c->put_pixels_clamped;
ff_add_pixels_clamped = c->add_pixels_clamped;
#ifdef HAVE_IPP
if(idct_algo==FF_IDCT_ARM){
#else
if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_ARM){
#endif
c->idct_put= j_rev_dct_ARM_put;
c->idct_add= j_rev_dct_ARM_add;
c->idct = j_rev_dct_ARM;
......@@ -66,5 +245,37 @@ void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx)
c->idct_add= simple_idct_ARM_add;
c->idct = simple_idct_ARM;
c->idct_permutation_type= FF_NO_IDCT_PERM;
#ifdef HAVE_IPP
} else if (idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_IPP){
#else
} else if (idct_algo==FF_IDCT_IPP){
#endif
c->idct_put= simple_idct_ipp_put;
c->idct_add= simple_idct_ipp_add;
c->idct = simple_idct_ipp;
c->idct_permutation_type= FF_NO_IDCT_PERM;
}
/* c->put_pixels_tab[0][0] = put_pixels16_arm; */ // NG!
c->put_pixels_tab[0][1] = put_pixels16_x2_arm; //OK!
c->put_pixels_tab[0][2] = put_pixels16_y2_arm; //OK!
/* c->put_pixels_tab[0][3] = put_pixels16_xy2_arm; /\* NG *\/ */
/* c->put_no_rnd_pixels_tab[0][0] = put_pixels16_arm; // ? (rest of this note was mis-encoded non-ASCII text and is unreadable) */
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_arm; // OK
c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_arm; //OK
/* c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_arm; //NG */
c->put_pixels_tab[1][0] = put_pixels8_arm; //OK
c->put_pixels_tab[1][1] = put_pixels8_x2_arm; //OK
/* c->put_pixels_tab[1][2] = put_pixels8_y2_arm; //NG */
/* c->put_pixels_tab[1][3] = put_pixels8_xy2_arm; //NG */
c->put_no_rnd_pixels_tab[1][0] = put_pixels8_arm;//OK
c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_arm; //OK
c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_arm; //OK
/* c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_arm;//NG */
#if 1
#ifdef HAVE_IWMMXT
dsputil_init_iwmmxt(c, avctx);
#endif
#endif
}
This diff is collapsed.
/*
* iWMMXt optimized DSP utils
* Copyright (c) 2004 AGAWA Koji
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include "../dsputil.h"
/*
 * dsputil_iwmmxt_rnd.h is included twice with different DEF / SET_RND /
 * WAVG2B settings (the header itself is not visible here; presumably it
 * uses these macros to generate the *_iwmmxt pixel functions).
 *
 * First pass: names of the form x_no_rnd_y_iwmmxt, SET_RND broadcasts the
 * halfword 1 into the given register, averaging instruction "wavg2b".
 */
#define DEF(x, y) x ## _no_rnd_ ## y ##_iwmmxt
#define SET_RND(regd) __asm__ __volatile__ ("mov r12, #1 \n\t tbcsth " #regd ", r12":::"r12");
#define WAVG2B "wavg2b"
#include "dsputil_iwmmxt_rnd.h"
#undef DEF
#undef SET_RND
#undef WAVG2B
/*
 * Second pass: names of the form x_y_iwmmxt, SET_RND broadcasts the
 * halfword 2, averaging instruction "wavg2br".
 */
#define DEF(x, y) x ## _ ## y ##_iwmmxt
#define SET_RND(regd) __asm__ __volatile__ ("mov r12, #2 \n\t tbcsth " #regd ", r12":::"r12");
#define WAVG2B "wavg2br"
#include "dsputil_iwmmxt_rnd.h"
#undef DEF
#undef SET_RND
/* The macro defined above is WAVG2B (not WAVG2BR); undefine that one so it
 * does not leak into the rest of the file. */
#undef WAVG2B
// need scheduling
/*
 * OP(AVG): body shared by the two 8-pixel-wide "y2" functions below.
 *
 * Expects block, pixels, line_size and h in scope. pixels is rounded down
 * to an 8-byte boundary and its low three bits are put into wcgr1, so each
 * source row is fetched as two aligned 64-bit loads and realigned with
 * walignr1. Every output row is AVG of two consecutive source rows.
 *
 * Two rows are produced per loop iteration and h is decremented by 2 until
 * it reaches zero, so h must be a positive multiple of 2.
 *
 * "subs" changes the condition flags, hence the "cc" clobber.
 * NOTE(review): the wr0-wr6 / wcgr1 registers used here are not listed as
 * clobbers - confirm that the compiler never allocates them.
 */
#define OP(AVG) \
    asm volatile ( \
        /* alignment */ \
        "and r12, %[pixels], #7 \n\t" \
        "bic %[pixels], %[pixels], #7 \n\t" \
        "tmcr wcgr1, r12 \n\t" \
        \
        /* first source row -> wr4 */ \
        "wldrd wr0, [%[pixels]] \n\t" \
        "wldrd wr1, [%[pixels], #8] \n\t" \
        "add %[pixels], %[pixels], %[line_size] \n\t" \
        "walignr1 wr4, wr0, wr1 \n\t" \
        \
        "1: \n\t" \
        \
        /* next source row -> wr5, output AVG(wr4, wr5) */ \
        "wldrd wr2, [%[pixels]] \n\t" \
        "wldrd wr3, [%[pixels], #8] \n\t" \
        "add %[pixels], %[pixels], %[line_size] \n\t" \
        "pld [%[pixels]] \n\t" \
        "walignr1 wr5, wr2, wr3 \n\t" \
        AVG " wr6, wr4, wr5 \n\t" \
        "wstrd wr6, [%[block]] \n\t" \
        "add %[block], %[block], %[line_size] \n\t" \
        \
        /* next source row -> wr4, output AVG(wr4, wr5) */ \
        "wldrd wr0, [%[pixels]] \n\t" \
        "wldrd wr1, [%[pixels], #8] \n\t" \
        "add %[pixels], %[pixels], %[line_size] \n\t" \
        "walignr1 wr4, wr0, wr1 \n\t" \
        "pld [%[pixels]] \n\t" \
        AVG " wr6, wr4, wr5 \n\t" \
        "wstrd wr6, [%[block]] \n\t" \
        "add %[block], %[block], %[line_size] \n\t" \
        \
        "subs %[h], %[h], #2 \n\t" \
        "bne 1b \n\t" \
        : [block]"+r"(block), [pixels]"+r"(pixels), [h]"+r"(h) \
        : [line_size]"r"(line_size) \
        : "cc", "memory", "r12");

/* Writes h rows of 8 bytes to block; each row is the "wavg2br" average of
 * two vertically adjacent source rows starting at pixels. */
void put_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
    OP("wavg2br");
}

/* Same as put_pixels8_y2_iwmmxt but using the "wavg2b" averaging
 * instruction. */
void put_no_rnd_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
    OP("wavg2b");
}
#undef OP
/*
 * Adds 16-bit coefficients from block to the bytes at pixels and stores the
 * result back as bytes, using iWMMXt instructions.
 *
 * Each loop iteration handles two rows (pixels and pixels2 = pixels +
 * line_size): 8 bytes are loaded from each row, 32 bytes are consumed from
 * block, and both pointers then advance by 2 * line_size. The loop runs
 * four times (r12 = 4), i.e. eight rows and 128 bytes of block in total.
 *
 * block     - coefficient array, read 32 bytes per iteration
 * pixels    - first row to update; rows are line_size bytes apart
 * line_size - byte distance between consecutive rows
 *
 * NOTE(review): wr0-wr15 are used but not listed as clobbers - confirm the
 * compiler never allocates them.
 */
void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size)
{
uint8_t *pixels2 = pixels + line_size;
__asm__ __volatile__ (
"mov r12, #4 \n\t" /* r12 = iterations left (two rows each) */
"1: \n\t"
"pld [%[pixels], %[line_size2]] \n\t" /* prefetch the rows used by the next iteration */
"pld [%[pixels2], %[line_size2]] \n\t"
"wldrd wr4, [%[pixels]] \n\t" /* 8 bytes of the first row */
"wldrd wr5, [%[pixels2]] \n\t" /* 8 bytes of the second row */
"pld [%[block], #32] \n\t"
"wunpckelub wr6, wr4 \n\t" /* unpack low/high bytes to halfwords, interleaved with coefficient loads */
"wldrd wr0, [%[block]] \n\t"
"wunpckehub wr7, wr4 \n\t"
"wldrd wr1, [%[block], #8] \n\t"
"wunpckelub wr8, wr5 \n\t"
"wldrd wr2, [%[block], #16] \n\t"
"wunpckehub wr9, wr5 \n\t"
"wldrd wr3, [%[block], #24] \n\t"
"add %[block], %[block], #32 \n\t"
"waddhss wr10, wr0, wr6 \n\t" /* halfword add with signed saturation */
"waddhss wr11, wr1, wr7 \n\t"
"waddhss wr12, wr2, wr8 \n\t"
"waddhss wr13, wr3, wr9 \n\t"
"wpackhus wr14, wr10, wr11 \n\t" /* pack halfwords to bytes with unsigned saturation */
"wpackhus wr15, wr12, wr13 \n\t"
"wstrd wr14, [%[pixels]] \n\t"
"add %[pixels], %[pixels], %[line_size2] \n\t"
"subs r12, r12, #1 \n\t" /* sets the flags tested by the bne below */
"wstrd wr15, [%[pixels2]] \n\t"
"add %[pixels2], %[pixels2], %[line_size2] \n\t"
"bne 1b \n\t"
: [block]"+r"(block), [pixels]"+r"(pixels), [pixels2]"+r"(pixels2)
: [line_size2]"r"(line_size << 1)
: "cc", "memory", "r12");
}
/* Placeholder with the pixel-operation signature: ignores every argument
 * and has no effect. */
static void nop(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    (void)block;
    (void)pixels;
    (void)line_size;
    (void)h;
}
/*
 * Installs the iWMMXt implementations into the DSPContext: the
 * add_pixels_clamped hook plus every [0..1][0..3] entry of the put,
 * put_no_rnd, avg and avg_no_rnd pixel tables.
 *
 * c     - context whose function pointers are overwritten
 * avctx - not used by this function
 *
 * Index [0][*] receives the pixels16 functions and [1][*] the pixels8
 * functions; the second index selects the plain, x2, y2 and xy2 variants.
 */
void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx)
{
c->add_pixels_clamped = add_pixels_clamped_iwmmxt;
/* put, 16 wide */
c->put_pixels_tab[0][0] = put_pixels16_iwmmxt;
c->put_pixels_tab[0][1] = put_pixels16_x2_iwmmxt;
c->put_pixels_tab[0][2] = put_pixels16_y2_iwmmxt;
c->put_pixels_tab[0][3] = put_pixels16_xy2_iwmmxt;
/* put_no_rnd, 16 wide; entry [0] reuses the plain put function */
c->put_no_rnd_pixels_tab[0][0] = put_pixels16_iwmmxt;
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_iwmmxt;
c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_iwmmxt;
c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_iwmmxt;
/* put, 8 wide */
c->put_pixels_tab[1][0] = put_pixels8_iwmmxt;
c->put_pixels_tab[1][1] = put_pixels8_x2_iwmmxt;
c->put_pixels_tab[1][2] = put_pixels8_y2_iwmmxt;
c->put_pixels_tab[1][3] = put_pixels8_xy2_iwmmxt;
/* put_no_rnd, 8 wide; entry [0] reuses the plain put function */
c->put_no_rnd_pixels_tab[1][0] = put_pixels8_iwmmxt;
c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_iwmmxt;
c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_iwmmxt;
c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_iwmmxt;
/* avg, 16 wide */
c->avg_pixels_tab[0][0] = avg_pixels16_iwmmxt;
c->avg_pixels_tab[0][1] = avg_pixels16_x2_iwmmxt;
c->avg_pixels_tab[0][2] = avg_pixels16_y2_iwmmxt;
c->avg_pixels_tab[0][3] = avg_pixels16_xy2_iwmmxt;
/* avg_no_rnd, 16 wide; entry [0] reuses the plain avg function */
c->avg_no_rnd_pixels_tab[0][0] = avg_pixels16_iwmmxt;
c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_iwmmxt;
c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_iwmmxt;
c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_iwmmxt;
/* avg, 8 wide */
c->avg_pixels_tab[1][0] = avg_pixels8_iwmmxt;
c->avg_pixels_tab[1][1] = avg_pixels8_x2_iwmmxt;
c->avg_pixels_tab[1][2] = avg_pixels8_y2_iwmmxt;
c->avg_pixels_tab[1][3] = avg_pixels8_xy2_iwmmxt;
/* avg_no_rnd, 8 wide.
 * NOTE(review): unlike the three other *_no_rnd [x][0] entries above, this
 * one uses a dedicated no_rnd function - confirm this is intentional. */
c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_iwmmxt;
c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_iwmmxt;
c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_iwmmxt;
c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_iwmmxt;
}
This diff is collapsed.
......@@ -21,6 +21,13 @@
#include "../mpegvideo.h"
#include "../avcodec.h"
#ifdef HAVE_IWMMXT
extern void MPV_common_init_iwmmxt(MpegEncContext *s);
#endif
/* ARM hook for MpegEncContext setup: forwards to MPV_common_init_iwmmxt
 * when HAVE_IWMMXT is defined and does nothing otherwise. */
void MPV_common_init_armv4l(MpegEncContext *s)
{
#if defined(HAVE_IWMMXT)
    MPV_common_init_iwmmxt(s);
#else
    (void)s;
#endif
}
#include "../dsputil.h"
#include "../mpegvideo.h"
#include "../avcodec.h"
/**
 * iWMMXt H.263 intra dequantizer, operating in place on block.
 *
 * For every processed coefficient x (16-bit wrapping arithmetic):
 *   x > 0  ->  x * qmul + qadd
 *   x < 0  ->  x * qmul - qadd
 *   x == 0 ->  0
 * with qmul = 2 * qscale and qadd = (qscale - 1) | 1, or qadd = 0 when
 * s->h263_aic is set. Afterwards block[0] is overwritten with the DC value:
 * block[0] scaled by s->y_dc_scale (n < 4) or s->c_dc_scale (n >= 4), or the
 * unscaled block[0] when s->h263_aic is set.
 *
 * Coefficients are handled in groups of 8 (16 bytes); (nCoeffs + 8) / 8
 * groups are processed, where nCoeffs is 63 with s->ac_pred and otherwise
 * taken from s->inter_scantable.raster_end[].
 *
 * @param s      codec context (read only here)
 * @param block  coefficients, modified in place
 * @param n      block index; selects luma/chroma DC scale and the
 *               block_last_index entry
 * @param qscale quantizer scale
 */
static void dct_unquantize_h263_intra_iwmmxt(MpegEncContext *s,
                                             DCTELEM *block, int n, int qscale)
{
    int level, qmul, qadd;
    int nCoeffs;
    int groups;                     /* loop counter: 8-coefficient groups left */
    DCTELEM *block_orig = block;    /* the asm advances block */

    assert(s->block_last_index[n]>=0);

    qmul = qscale << 1;

    if (!s->h263_aic) {
        if (n < 4)
            level = block[0] * s->y_dc_scale;
        else
            level = block[0] * s->c_dc_scale;
        qadd = (qscale - 1) | 1;
    }else{
        qadd = 0;
        level = block[0];
    }
    if(s->ac_pred)
        nCoeffs=63;
    else
        nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];

    groups = (nCoeffs + 8) / 8;

    __asm__ __volatile__ (
        "tbcsth wr6, %[qmul] \n\t"        /* wr6 = qmul in every halfword */
        "tbcsth wr5, %[qadd] \n\t"        /* wr5 = qadd in every halfword */
        "wzero wr7 \n\t"
        "wzero wr4 \n\t"                  /* wr4 = 0, used by the sign compares */
        "wsubh wr7, wr5, wr7 \n\t"        /* wr7 = qadd - 0 */
        "1: \n\t"
        "wldrd wr2, [%[block]] \n\t"      /* coefficients 0-3 of the group */
        "wldrd wr3, [%[block], #8] \n\t"  /* coefficients 4-7 of the group */
        "wmulsl wr0, wr6, wr2 \n\t"       /* x * qmul (low 16 bits) */
        "wmulsl wr1, wr6, wr3 \n\t"
        "wcmpgtsh wr2, wr4, wr2 \n\t"     /* block[i] < 0 ? -1 : 0 */
        /* The second compare must test wr3: wr2 already holds the mask of
         * coefficients 0-3 at this point, not coefficient data. */
        "wcmpgtsh wr3, wr4, wr3 \n\t"     /* block[i] < 0 ? -1 : 0 */
        "wxor wr0, wr2, wr0 \n\t"         /* invert the products of negative inputs */
        "wxor wr1, wr3, wr1 \n\t"
        "waddh wr0, wr7, wr0 \n\t"        /* + qadd */
        "waddh wr1, wr7, wr1 \n\t"
        "wxor wr2, wr0, wr2 \n\t"         /* undo the inversion: final nonzero result */
        "wxor wr3, wr1, wr3 \n\t"
        "wcmpeqh wr0, wr7, wr0 \n\t"      /* block[i] == 0 ? -1 : 0 */
        "wcmpeqh wr1, wr7, wr1 \n\t"      /* block[i] == 0 ? -1 : 0 */
        "wandn wr0, wr2, wr0 \n\t"        /* force zero inputs back to zero */
        "wandn wr1, wr3, wr1 \n\t"
        "wstrd wr0, [%[block]] \n\t"
        "wstrd wr1, [%[block], #8] \n\t"
        "add %[block], %[block], #16 \n\t"
        "subs %[i], %[i], #1 \n\t"
        "bne 1b \n\t"
        /* Both the pointer and the counter are modified, so both are
         * read-write operands; "subs" changes the flags, hence "cc". */
        :[block]"+r"(block), [i]"+r"(groups)
        :[qmul]"r"(qmul), [qadd]"r"(qadd)
        :"cc", "memory");

    /* The loop also rewrote element 0; replace it with the DC value. */
    block_orig[0] = level;
}
/* Disabled (#if 0), never compiled: inter-block variant that would compute
 * nCoeffs the same way as the intra function and pass the block to
 * ippiQuantInvInter_Compact_H263_16s_I. */
#if 0
static void dct_unquantize_h263_inter_iwmmxt(MpegEncContext *s,
DCTELEM *block, int n, int qscale)
{
int nCoeffs;
assert(s->block_last_index[n]>=0);
if(s->ac_pred)
nCoeffs=63;
else
nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
ippiQuantInvInter_Compact_H263_16s_I(block, nCoeffs+1, qscale);
}
#endif
/* Installs the iWMMXt H.263 intra dequantizer into s. The matching inter
 * hook is compiled out (#if 0), so s->dct_unquantize_h263_inter is left
 * unchanged. */
void MPV_common_init_iwmmxt(MpegEncContext *s)
{
s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_iwmmxt;
#if 0
s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_iwmmxt;
#endif
}
......@@ -1180,6 +1180,7 @@ typedef struct AVCodecContext {
#define FF_IDCT_SIMPLEARM 10
#define FF_IDCT_H264 11
#define FF_IDCT_VP3 12
#define FF_IDCT_IPP 13
/* Misspelled original name of FF_IDCT_IPP; kept only so existing users of
 * the old spelling keep compiling. */
#define FP_IDCT_IPP FF_IDCT_IPP
/**
* slice count.
......
......@@ -94,10 +94,23 @@ static always_inline uint16_t bswap_16(uint16_t x){
return (x>>8) | (x<<8);
}
/*
 * bswap_32: reverse the byte order of a 32-bit value.
 */
#ifdef ARCH_ARM
/* ARM version using a four-instruction sequence:
 *   t = x ^ ror(x, 16);  t &= ~0x00FF0000;  x = ror(x, 8);  x ^= t >> 8
 * t is pure scratch: it is written by the first instruction before it is
 * ever read, so it is a write-only early-clobber output ("=&r") rather
 * than a read-write operand, which would read it uninitialized. */
static always_inline uint32_t bswap_32(uint32_t x){
    uint32_t t;
    __asm__ (
        "eor %1, %0, %0, ror #16 \n\t"
        "bic %1, %1, #0xFF0000 \n\t"
        "mov %0, %0, ror #8 \n\t"
        "eor %0, %0, %1, lsr #8 \n\t"
        : "+r"(x), "=&r"(t));
    return x;
}
#else
/* Generic version: swap the bytes inside each 16-bit half, then swap the
 * two halves. */
static always_inline uint32_t bswap_32(uint32_t x){
    x= ((x<<8)&0xFF00FF00) | ((x>>8)&0x00FF00FF);
    return (x>>16) | (x<<16);
}
#endif
static inline uint64_t bswap_64(uint64_t x)
{
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment