Commit 6ad1fa5a authored by Bernhard Rosenkränzer's avatar Bernhard Rosenkränzer Committed by Michael Niedermayer

Better ARM support for mplayer/ffmpeg, ported from atty fork

While playing with some new hardware, I found it's running a forked mplayer
 -- and it looks like they're following the GPL.

 The maintainer's page is here: http://atty.jp/?Zaurus/mplayer
 Unfortunately it's mostly in Japanese, so it's hard to figure out any
  details.

  Their code looks quite interesting (at least to those of us w/ ARM CPUs).

  The patches I've attached are the patches from atty.jp with a couple of
  modifications by myself:
  - ported to current CVS
  - reverted their change of removing SNOW support from ffmpeg
  - cleaned up their bswap mess
  - removed DOS-style linebreaks from various files

patch by (Bernhard Rosenkraenzer: bero, arklinux org)

Originally committed as revision 4311 to svn://svn.ffmpeg.org/ffmpeg/trunk
parent c66a4434
......@@ -316,8 +316,11 @@ endif
# armv4l specific stuff
ifeq ($(TARGET_ARCH_ARMV4L),yes)
ASM_OBJS += armv4l/jrevdct_arm.o armv4l/simple_idct_arm.o
ASM_OBJS += armv4l/jrevdct_arm.o armv4l/simple_idct_arm.o armv4l/dsputil_arm_s.o
OBJS += armv4l/dsputil_arm.o armv4l/mpegvideo_arm.o
ifeq ($(TARGET_IWMMXT),yes)
OBJS += armv4l/dsputil_iwmmxt.o armv4l/mpegvideo_iwmmxt.o
endif
endif
# sun mediaLib specific stuff
......@@ -327,6 +330,12 @@ OBJS += mlib/dsputil_mlib.o
CFLAGS += $(MLIB_INC)
endif
# Intel IPP specific stuff
# currently only works when libavcodec is used in mplayer
ifeq ($(HAVE_IPP),yes)
CFLAGS += $(IPP_INC)
endif
# alpha specific stuff
ifeq ($(TARGET_ARCH_ALPHA),yes)
OBJS += alpha/dsputil_alpha.o alpha/mpegvideo_alpha.o \
......
......@@ -18,6 +18,13 @@
*/
#include "../dsputil.h"
#ifdef HAVE_IPP
#include "ipp.h"
#endif
#ifdef HAVE_IWMMXT
extern void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx);
#endif
extern void j_rev_dct_ARM(DCTELEM *data);
extern void simple_idct_ARM(DCTELEM *data);
......@@ -26,6 +33,146 @@ extern void simple_idct_ARM(DCTELEM *data);
static void (*ff_put_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
static void (*ff_add_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
/* Prototypes of the put-pixel routines implemented in the ARM assembly source.
 * All share the signature (block, pixels, line_size, h).
 * NOTE(review): the assembly comments state that block must be word aligned
 * while pixels may be unaligned -- confirm callers honour this. */
void put_pixels8_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
void put_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
void put_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
void put_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
void put_no_rnd_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
void put_no_rnd_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
void put_no_rnd_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
void put_pixels16_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
/* 16-pixel-wide variant of put_pixels8_x2_arm(): the left and right 8-pixel
 * halves are forwarded to the 8-pixel routine one after the other. */
static void put_pixels16_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    int half;

    for (half = 0; half < 2; half++)
        put_pixels8_x2_arm(&block[8 * half], &pixels[8 * half], line_size, h);
}
/* 16-pixel-wide variant of put_pixels8_y2_arm(): the left and right 8-pixel
 * halves are forwarded to the 8-pixel routine one after the other. */
static void put_pixels16_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    int half;

    for (half = 0; half < 2; half++)
        put_pixels8_y2_arm(&block[8 * half], &pixels[8 * half], line_size, h);
}
/* 16-pixel-wide variant of put_pixels8_xy2_arm(): the left and right 8-pixel
 * halves are forwarded to the 8-pixel routine one after the other. */
static void put_pixels16_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    int half;

    for (half = 0; half < 2; half++)
        put_pixels8_xy2_arm(&block[8 * half], &pixels[8 * half], line_size, h);
}
/* 16-pixel-wide variant of put_no_rnd_pixels8_x2_arm(): the left and right
 * 8-pixel halves are forwarded to the 8-pixel routine one after the other. */
static void put_no_rnd_pixels16_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    int half;

    for (half = 0; half < 2; half++)
        put_no_rnd_pixels8_x2_arm(&block[8 * half], &pixels[8 * half], line_size, h);
}
/* 16-pixel-wide variant of put_no_rnd_pixels8_y2_arm(): the left and right
 * 8-pixel halves are forwarded to the 8-pixel routine one after the other. */
static void put_no_rnd_pixels16_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    int half;

    for (half = 0; half < 2; half++)
        put_no_rnd_pixels8_y2_arm(&block[8 * half], &pixels[8 * half], line_size, h);
}
/* 16-pixel-wide variant of put_no_rnd_pixels8_xy2_arm(): the left and right
 * 8-pixel halves are forwarded to the 8-pixel routine one after the other. */
static void put_no_rnd_pixels16_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    int half;

    for (half = 0; half < 2; half++)
        put_no_rnd_pixels8_xy2_arm(&block[8 * half], &pixels[8 * half], line_size, h);
}
static void add_pixels_clamped_ARM(short *block, unsigned char *dest, int line_size)
{
asm volatile (
"mov r10, #8 \n\t"
"1: \n\t"
/* load dest */
"ldr r4, [%1] \n\t"
/* block[0] and block[1]*/
"ldrsh r5, [%0] \n\t"
"ldrsh r7, [%0, #2] \n\t"
"and r6, r4, #0xFF \n\t"
"and r8, r4, #0xFF00 \n\t"
"add r6, r5, r6 \n\t"
"add r8, r7, r8, lsr #8 \n\t"
"mvn r5, r5 \n\t"
"mvn r7, r7 \n\t"
"tst r6, #0x100 \n\t"
"movne r6, r5, lsr #24 \n\t"
"tst r8, #0x100 \n\t"
"movne r8, r7, lsr #24 \n\t"
"mov r9, r6 \n\t"
"ldrsh r5, [%0, #4] \n\t" /* moved form [A] */
"orr r9, r9, r8, lsl #8 \n\t"
/* block[2] and block[3] */
/* [A] */
"ldrsh r7, [%0, #6] \n\t"
"and r6, r4, #0xFF0000 \n\t"
"and r8, r4, #0xFF000000 \n\t"
"add r6, r5, r6, lsr #16 \n\t"
"add r8, r7, r8, lsr #24 \n\t"
"mvn r5, r5 \n\t"
"mvn r7, r7 \n\t"
"tst r6, #0x100 \n\t"
"movne r6, r5, lsr #24 \n\t"
"tst r8, #0x100 \n\t"
"movne r8, r7, lsr #24 \n\t"
"orr r9, r9, r6, lsl #16 \n\t"
"ldr r4, [%1, #4] \n\t" /* moved form [B] */
"orr r9, r9, r8, lsl #24 \n\t"
/* store dest */
"ldrsh r5, [%0, #8] \n\t" /* moved form [C] */
"str r9, [%1] \n\t"
/* load dest */
/* [B] */
/* block[4] and block[5] */
/* [C] */
"ldrsh r7, [%0, #10] \n\t"
"and r6, r4, #0xFF \n\t"
"and r8, r4, #0xFF00 \n\t"
"add r6, r5, r6 \n\t"
"add r8, r7, r8, lsr #8 \n\t"
"mvn r5, r5 \n\t"
"mvn r7, r7 \n\t"
"tst r6, #0x100 \n\t"
"movne r6, r5, lsr #24 \n\t"
"tst r8, #0x100 \n\t"
"movne r8, r7, lsr #24 \n\t"
"mov r9, r6 \n\t"
"ldrsh r5, [%0, #12] \n\t" /* moved from [D] */
"orr r9, r9, r8, lsl #8 \n\t"
/* block[6] and block[7] */
/* [D] */
"ldrsh r7, [%0, #14] \n\t"
"and r6, r4, #0xFF0000 \n\t"
"and r8, r4, #0xFF000000 \n\t"
"add r6, r5, r6, lsr #16 \n\t"
"add r8, r7, r8, lsr #24 \n\t"
"mvn r5, r5 \n\t"
"mvn r7, r7 \n\t"
"tst r6, #0x100 \n\t"
"movne r6, r5, lsr #24 \n\t"
"tst r8, #0x100 \n\t"
"movne r8, r7, lsr #24 \n\t"
"orr r9, r9, r6, lsl #16 \n\t"
"add %0, %0, #16 \n\t" /* moved from [E] */
"orr r9, r9, r8, lsl #24 \n\t"
"subs r10, r10, #1 \n\t" /* moved from [F] */
/* store dest */
"str r9, [%1, #4] \n\t"
/* [E] */
/* [F] */
"add %1, %1, %2 \n\t"
"bne 1b \n\t"
:
: "r"(block),
"r"(dest),
"r"(line_size)
: "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc", "memory" );
}
/* XXX: those functions should be suppressed ASAP when all IDCTs are
converted */
static void j_rev_dct_ARM_put(uint8_t *dest, int line_size, DCTELEM *block)
......@@ -48,6 +195,34 @@ static void simple_idct_ARM_add(uint8_t *dest, int line_size, DCTELEM *block)
simple_idct_ARM (block);
ff_add_pixels_clamped(block, dest, line_size);
}
/* IDCT entry point backed by Intel IPP: hands block to
 * ippiDCT8x8Inv_Video_16s_C1I().  Built without HAVE_IPP the body is empty
 * and block is left untouched. */
static void simple_idct_ipp(DCTELEM *block)
{
#if defined(HAVE_IPP)
    ippiDCT8x8Inv_Video_16s_C1I(block);
#endif
}
/* IDCT-and-store entry point backed by Intel IPP: forwards block, dest and
 * line_size to ippiDCT8x8Inv_Video_16s8u_C1R().  Built without HAVE_IPP the
 * body is empty and dest is left untouched. */
static void simple_idct_ipp_put(uint8_t *dest, int line_size, DCTELEM *block)
{
#if defined(HAVE_IPP)
    ippiDCT8x8Inv_Video_16s8u_C1R(block, dest, line_size);
#endif
}
#ifdef HAVE_IWMMXT
void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size);
#endif
/* IDCT-and-add entry point backed by Intel IPP: transforms block in place
 * with ippiDCT8x8Inv_Video_16s_C1I(), then adds it to dest with the iWMMXt
 * clamped-add routine when HAVE_IWMMXT is defined, or the plain ARM one
 * otherwise.  Built without HAVE_IPP the body is empty. */
static void simple_idct_ipp_add(uint8_t *dest, int line_size, DCTELEM *block)
{
#if defined(HAVE_IPP)
    ippiDCT8x8Inv_Video_16s_C1I(block);
#  if defined(HAVE_IWMMXT)
    add_pixels_clamped_iwmmxt(block, dest, line_size);
#  else
    add_pixels_clamped_ARM(block, dest, line_size);
#  endif
#endif
}
void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx)
{
......@@ -56,7 +231,11 @@ void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx)
ff_put_pixels_clamped = c->put_pixels_clamped;
ff_add_pixels_clamped = c->add_pixels_clamped;
#ifdef HAVE_IPP
if(idct_algo==FF_IDCT_ARM){
#else
if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_ARM){
#endif
c->idct_put= j_rev_dct_ARM_put;
c->idct_add= j_rev_dct_ARM_add;
c->idct = j_rev_dct_ARM;
......@@ -66,5 +245,37 @@ void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx)
c->idct_add= simple_idct_ARM_add;
c->idct = simple_idct_ARM;
c->idct_permutation_type= FF_NO_IDCT_PERM;
#ifdef HAVE_IPP
} else if (idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_IPP){
#else
} else if (idct_algo==FF_IDCT_IPP){
#endif
c->idct_put= simple_idct_ipp_put;
c->idct_add= simple_idct_ipp_add;
c->idct = simple_idct_ipp;
c->idct_permutation_type= FF_NO_IDCT_PERM;
}
/* c->put_pixels_tab[0][0] = put_pixels16_arm; */ // NG!
c->put_pixels_tab[0][1] = put_pixels16_x2_arm; //OK!
c->put_pixels_tab[0][2] = put_pixels16_y2_arm; //OK!
/* c->put_pixels_tab[0][3] = put_pixels16_xy2_arm; /\* NG *\/ */
/* c->put_no_rnd_pixels_tab[0][0] = put_pixels16_arm; // ? (not used) */
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_arm; // OK
c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_arm; //OK
/* c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_arm; //NG */
c->put_pixels_tab[1][0] = put_pixels8_arm; //OK
c->put_pixels_tab[1][1] = put_pixels8_x2_arm; //OK
/* c->put_pixels_tab[1][2] = put_pixels8_y2_arm; //NG */
/* c->put_pixels_tab[1][3] = put_pixels8_xy2_arm; //NG */
c->put_no_rnd_pixels_tab[1][0] = put_pixels8_arm;//OK
c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_arm; //OK
c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_arm; //OK
/* c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_arm;//NG */
#if 1
#ifdef HAVE_IWMMXT
dsputil_init_iwmmxt(c, avctx);
#endif
#endif
}
@
@ ARMv4L optimized DSP utils
@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
@
@ This library is free software; you can redistribute it and/or
@ modify it under the terms of the GNU Lesser General Public
@ License as published by the Free Software Foundation; either
@ version 2 of the License, or (at your option) any later version.
@
@ This library is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
@ Lesser General Public License for more details.
@
@ You should have received a copy of the GNU Lesser General Public
@ License along with this library; if not, write to the Free Software
@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
@
@ ADJ_ALIGN_QUADWORD_D shift, Rd0..Rd3, Rn0..Rn4
@ Builds four words in Rd0..Rd3 from the five source words Rn0..Rn4:
@   Rdk = (Rnk >> (shift * 8)) | (Rn(k+1) << (32 - shift * 8))
@ Only the Rd registers are written.
@ NOTE(review): presumably extracts 16 bytes starting at byte offset `shift`
@ of a little-endian word sequence -- confirm endianness assumption.
.macro ADJ_ALIGN_QUADWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4
mov \Rd0, \Rn0, lsr #(\shift * 8)
mov \Rd1, \Rn1, lsr #(\shift * 8)
mov \Rd2, \Rn2, lsr #(\shift * 8)
mov \Rd3, \Rn3, lsr #(\shift * 8)
orr \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8)
orr \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8)
orr \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8)
orr \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8)
.endm
@ ADJ_ALIGN_DOUBLEWORD shift, R0, R1, R2
@ In-place two-word version of the alignment fix-up:
@   R0 = (R0 >> (shift * 8)) | (R1 << (32 - shift * 8))
@   R1 = (R1 >> (shift * 8)) | (R2 << (32 - shift * 8))
@ R0 and R1 are overwritten; R2 is only read.
.macro ADJ_ALIGN_DOUBLEWORD shift, R0, R1, R2
mov \R0, \R0, lsr #(\shift * 8)
orr \R0, \R0, \R1, lsl #(32 - \shift * 8)
mov \R1, \R1, lsr #(\shift * 8)
orr \R1, \R1, \R2, lsl #(32 - \shift * 8)
.endm
@ ADJ_ALIGN_DOUBLEWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2
@ Two-word alignment fix-up with separate destination registers:
@   Rdst0 = (Rsrc0 >> (shift * 8)) | (Rsrc1 << (32 - shift * 8))
@   Rdst1 = (Rsrc1 >> (shift * 8)) | (Rsrc2 << (32 - shift * 8))
@ Both mov instructions are issued before the orr instructions, so Rdst1 may
@ not alias Rsrc1 unless the caller accounts for that ordering.
.macro ADJ_ALIGN_DOUBLEWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2
mov \Rdst0, \Rsrc0, lsr #(\shift * 8)
mov \Rdst1, \Rsrc1, lsr #(\shift * 8)
orr \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8))
orr \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8))
.endm
@ RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
@ Byte-wise average of two register pairs, four packed bytes per register,
@ computed without unpacking:
.macro RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
@ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1)
@ Rmask = 0xFEFEFEFE
@ Rn is destroyed (overwritten with Rn | Rm); Rm is preserved
eor \Rd0, \Rn0, \Rm0
eor \Rd1, \Rn1, \Rm1
orr \Rn0, \Rn0, \Rm0
orr \Rn1, \Rn1, \Rm1
and \Rd0, \Rd0, \Rmask
and \Rd1, \Rd1, \Rmask
sub \Rd0, \Rn0, \Rd0, lsr #1
sub \Rd1, \Rn1, \Rd1, lsr #1
.endm
@ NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
@ Byte-wise average of two register pairs, four packed bytes per register,
@ computed without unpacking:
.macro NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
@ Rd = (Rn & Rm) + (((Rn ^ Rm) & ~0x01010101) >> 1)
@ Rmask = 0xFEFEFEFE
@ Rn is destroyed (overwritten with Rn & Rm); Rm is preserved
eor \Rd0, \Rn0, \Rm0
eor \Rd1, \Rn1, \Rm1
and \Rn0, \Rn0, \Rm0
and \Rn1, \Rn1, \Rm1
and \Rd0, \Rd0, \Rmask
and \Rd1, \Rd1, \Rmask
add \Rd0, \Rn0, \Rd0, lsr #1
add \Rd1, \Rn1, \Rd1, lsr #1
.endm
@ ----------------------------------------------------------------
@ put_pixels16_arm: copy h rows of 16 bytes from pixels (r1) to block (r0),
@ advancing both pointers by line_size (r2) after every row.
@ The low two bits of pixels pick one of four loops via the table at 5:.
@ pixels is rounded down to a word boundary; for offsets 1..3 five words are
@ loaded per row and recombined with ADJ_ALIGN_QUADWORD_D.
@ NOTE(review): on ARM gas .align takes a power-of-two exponent, so ".align 8"
@ presumably requests 256-byte alignment -- confirm this is intended.
.align 8
.global put_pixels16_arm
put_pixels16_arm:
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
@ block = word aligned, pixels = unaligned
pld [r1]
stmfd sp!, {r4-r11, lr} @ R14 is also called LR
adr r5, 5f
@ r4 = byte offset of pixels inside its word; Z flag set when already aligned
ands r4, r1, #3
bic r1, r1, #3
add r5, r5, r4, lsl #2
@ unaligned: jump through the table to loop 2, 3 or 4; aligned: fall through
ldrne pc, [r5]
1:
@ offset 0: straight four-word copy
ldmia r1, {r4-r7}
add r1, r1, r2
stmia r0, {r4-r7}
pld [r1]
subs r3, r3, #1
add r0, r0, r2
bne 1b
ldmfd sp!, {r4-r11, pc}
.align 8
2:
@ offset 1
ldmia r1, {r4-r8}
add r1, r1, r2
ADJ_ALIGN_QUADWORD_D 1, r9, r10, r11, r12, r4, r5, r6, r7, r8
pld [r1]
subs r3, r3, #1
stmia r0, {r9-r12}
add r0, r0, r2
bne 2b
ldmfd sp!, {r4-r11, pc}
.align 8
3:
@ offset 2
ldmia r1, {r4-r8}
add r1, r1, r2
ADJ_ALIGN_QUADWORD_D 2, r9, r10, r11, r12, r4, r5, r6, r7, r8
pld [r1]
subs r3, r3, #1
stmia r0, {r9-r12}
add r0, r0, r2
bne 3b
ldmfd sp!, {r4-r11, pc}
.align 8
4:
@ offset 3
ldmia r1, {r4-r8}
add r1, r1, r2
ADJ_ALIGN_QUADWORD_D 3, r9, r10, r11, r12, r4, r5, r6, r7, r8
pld [r1]
subs r3, r3, #1
stmia r0, {r9-r12}
add r0, r0, r2
bne 4b
ldmfd sp!, {r4-r11,pc}
.align 8
5:
@ jump table indexed by (pixels & 3); entry 0 is never used as a target
.word 1b
.word 2b
.word 3b
.word 4b
@ ----------------------------------------------------------------
@ put_pixels8_arm: copy h rows of 8 bytes from pixels (r1) to block (r0),
@ advancing both pointers by line_size (r2) after every row.
@ The low two bits of pixels pick one of four loops via the table at 5:.
@ pixels is rounded down to a word boundary; for offsets 1..3 three words are
@ loaded per row and recombined in place with ADJ_ALIGN_DOUBLEWORD.
@ r12 is used as scratch for the third word.
.align 8
.global put_pixels8_arm
put_pixels8_arm:
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
@ block = word aligned, pixels = unaligned
pld [r1]
stmfd sp!, {r4-r5,lr} @ R14 is also called LR
adr r5, 5f
@ r4 = byte offset of pixels inside its word; Z flag set when already aligned
ands r4, r1, #3
bic r1, r1, #3
add r5, r5, r4, lsl #2
@ unaligned: jump through the table to loop 2, 3 or 4; aligned: fall through
ldrne pc, [r5]
1:
@ offset 0: straight two-word copy
ldmia r1, {r4-r5}
add r1, r1, r2
subs r3, r3, #1
pld [r1]
stmia r0, {r4-r5}
add r0, r0, r2
bne 1b
ldmfd sp!, {r4-r5,pc}
.align 8
2:
@ offset 1
ldmia r1, {r4-r5, r12}
add r1, r1, r2
ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r12
pld [r1]
subs r3, r3, #1
stmia r0, {r4-r5}
add r0, r0, r2
bne 2b
ldmfd sp!, {r4-r5,pc}
.align 8
3:
@ offset 2
ldmia r1, {r4-r5, r12}
add r1, r1, r2
ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r12
pld [r1]
subs r3, r3, #1
stmia r0, {r4-r5}
add r0, r0, r2
bne 3b
ldmfd sp!, {r4-r5,pc}
.align 8
4:
@ offset 3
ldmia r1, {r4-r5, r12}
add r1, r1, r2
ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r12
pld [r1]
subs r3, r3, #1
stmia r0, {r4-r5}
add r0, r0, r2
bne 4b
ldmfd sp!, {r4-r5,pc}
.align 8
5:
@ jump table indexed by (pixels & 3); entry 0 is never used as a target
.word 1b
.word 2b
.word 3b
.word 4b
@ ----------------------------------------------------------------
@ put_pixels8_x2_arm: for h rows, store to block (r0) the RND_AVG32 of the
@ 8 bytes at pixels (r1) and the 8 bytes starting one byte further on.
@ Both pointers advance by line_size (r2) per row.
@ The low two bits of pixels pick one of four loops.  The table at 5: doubles
@ as constant storage: its first word is the 0xFEFEFEFE mask (loaded into r12),
@ which is possible because the aligned case falls through instead of jumping.
.align 8
.global put_pixels8_x2_arm
put_pixels8_x2_arm:
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
@ block = word aligned, pixels = unaligned
pld [r1]
stmfd sp!, {r4-r10,lr} @ R14 is also called LR
adr r5, 5f
@ r4 = byte offset of pixels inside its word; Z flag set when already aligned
ands r4, r1, #3
@ r12 = averaging mask
ldr r12, [r5]
add r5, r5, r4, lsl #2
bic r1, r1, #3
ldrne pc, [r5]
1:
@ offset 0: average the words as loaded with the same data shifted by one byte
ldmia r1, {r4-r5, r10}
add r1, r1, r2
ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
pld [r1]
RND_AVG32 r8, r9, r4, r5, r6, r7, r12
subs r3, r3, #1
stmia r0, {r8-r9}
add r0, r0, r2
bne 1b
ldmfd sp!, {r4-r10,pc}
.align 8
2:
@ offset 1: average the data shifted by one and by two bytes
ldmia r1, {r4-r5, r10}
add r1, r1, r2
ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10
pld [r1]
RND_AVG32 r4, r5, r6, r7, r8, r9, r12
subs r3, r3, #1
stmia r0, {r4-r5}
add r0, r0, r2
bne 2b
ldmfd sp!, {r4-r10,pc}
.align 8
3:
@ offset 2: average the data shifted by two and by three bytes
ldmia r1, {r4-r5, r10}
add r1, r1, r2
ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10
ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10
pld [r1]
RND_AVG32 r4, r5, r6, r7, r8, r9, r12
subs r3, r3, #1
stmia r0, {r4-r5}
add r0, r0, r2
bne 3b
ldmfd sp!, {r4-r10,pc}
.align 8
4:
@ offset 3: the "one byte further" data is exactly the second and third
@ loaded words (r5, r10), so only one shift is needed
ldmia r1, {r4-r5, r10}
add r1, r1, r2
ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10
pld [r1]
RND_AVG32 r8, r9, r6, r7, r5, r10, r12
subs r3, r3, #1
stmia r0, {r8-r9}
add r0, r0, r2
bne 4b
ldmfd sp!, {r4-r10,pc} @@ update PC with LR content.
.align 8
5:
@ word 0: averaging mask; words 1..3: jump targets for offsets 1..3
.word 0xFEFEFEFE
.word 2b
.word 3b
.word 4b
@ put_no_rnd_pixels8_x2_arm: same structure as the rounding x2 routine, but
@ each row is combined with NO_RND_AVG32: for h rows, store to block (r0) the
@ NO_RND_AVG32 of the 8 bytes at pixels (r1) and the 8 bytes starting one
@ byte further on.  Both pointers advance by line_size (r2) per row.
@ The first word of the table at 5: is the 0xFEFEFEFE mask (loaded into r12);
@ the remaining words are jump targets for source offsets 1..3.
.align 8
.global put_no_rnd_pixels8_x2_arm
put_no_rnd_pixels8_x2_arm:
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
@ block = word aligned, pixels = unaligned
pld [r1]
stmfd sp!, {r4-r10,lr} @ R14 is also called LR
adr r5, 5f
@ r4 = byte offset of pixels inside its word; Z flag set when already aligned
ands r4, r1, #3
@ r12 = averaging mask
ldr r12, [r5]
add r5, r5, r4, lsl #2
bic r1, r1, #3
ldrne pc, [r5]
1:
@ offset 0
ldmia r1, {r4-r5, r10}
add r1, r1, r2
ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
pld [r1]
NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
subs r3, r3, #1
stmia r0, {r8-r9}
add r0, r0, r2
bne 1b
ldmfd sp!, {r4-r10,pc}
.align 8
2:
@ offset 1
ldmia r1, {r4-r5, r10}
add r1, r1, r2
ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10
pld [r1]
NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
subs r3, r3, #1
stmia r0, {r4-r5}
add r0, r0, r2
bne 2b
ldmfd sp!, {r4-r10,pc}
.align 8
3:
@ offset 2
ldmia r1, {r4-r5, r10}
add r1, r1, r2
ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10
ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10
pld [r1]
NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
subs r3, r3, #1
stmia r0, {r4-r5}
add r0, r0, r2
bne 3b
ldmfd sp!, {r4-r10,pc}
.align 8
4:
@ offset 3: the second and third loaded words (r5, r10) already hold the
@ "one byte further" data
ldmia r1, {r4-r5, r10}
add r1, r1, r2
ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10
pld [r1]
NO_RND_AVG32 r8, r9, r6, r7, r5, r10, r12
subs r3, r3, #1
stmia r0, {r8-r9}
add r0, r0, r2
bne 4b
ldmfd sp!, {r4-r10,pc} @@ update PC with LR content.
.align 8
5:
@ word 0: averaging mask; words 1..3: jump targets for offsets 1..3
.word 0xFEFEFEFE
.word 2b
.word 3b
.word 4b
@ ----------------------------------------------------------------
@ put_pixels8_y2_arm: each output row stored to block (r0) is the RND_AVG32
@ of two consecutive 8-byte source rows read from pixels (r1).
@ h (r3) is halved on entry and every loop iteration (label 6:) emits two
@ output rows, keeping the most recently loaded row in registers as the upper
@ row of the next pair.
@ r12 holds the 0xFEFEFEFE mask read from the first word of the table at 5:;
@ the remaining table words are jump targets for source offsets 1..3.
.align 8
.global put_pixels8_y2_arm
put_pixels8_y2_arm:
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
@ block = word aligned, pixels = unaligned
pld [r1]
stmfd sp!, {r4-r11,lr} @ R14 is also called LR
adr r5, 5f
@ r4 = byte offset of pixels inside its word; Z flag set when already aligned
ands r4, r1, #3
@ loop counter = h / 2 (mov without "s" leaves the flags from ands intact)
mov r3, r3, lsr #1
ldr r12, [r5]
add r5, r5, r4, lsl #2
bic r1, r1, #3
ldrne pc, [r5]
1:
@ offset 0: preload the first row into r4-r5
ldmia r1, {r4-r5}
add r1, r1, r2
6: ldmia r1, {r6-r7}
add r1, r1, r2
pld [r1]
RND_AVG32 r8, r9, r4, r5, r6, r7, r12
ldmia r1, {r4-r5}
add r1, r1, r2
stmia r0, {r8-r9}
add r0, r0, r2
pld [r1]
RND_AVG32 r8, r9, r6, r7, r4, r5, r12
subs r3, r3, #1
stmia r0, {r8-r9}
add r0, r0, r2
bne 6b
ldmfd sp!, {r4-r11,pc}
.align 8
2:
@ offset 1: preload and realign the first row into r4-r5
ldmia r1, {r4-r6}
add r1, r1, r2
pld [r1]
ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
6: ldmia r1, {r7-r9}
add r1, r1, r2
pld [r1]
ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9
RND_AVG32 r10, r11, r4, r5, r7, r8, r12
stmia r0, {r10-r11}
add r0, r0, r2
ldmia r1, {r4-r6}
add r1, r1, r2
pld [r1]
ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
subs r3, r3, #1
RND_AVG32 r10, r11, r7, r8, r4, r5, r12
stmia r0, {r10-r11}
add r0, r0, r2
bne 6b
ldmfd sp!, {r4-r11,pc}
.align 8
3:
@ offset 2
ldmia r1, {r4-r6}
add r1, r1, r2
pld [r1]
ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
6: ldmia r1, {r7-r9}
add r1, r1, r2
pld [r1]
ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9
RND_AVG32 r10, r11, r4, r5, r7, r8, r12
stmia r0, {r10-r11}
add r0, r0, r2
ldmia r1, {r4-r6}
add r1, r1, r2
pld [r1]
ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
subs r3, r3, #1
RND_AVG32 r10, r11, r7, r8, r4, r5, r12
stmia r0, {r10-r11}
add r0, r0, r2
bne 6b
ldmfd sp!, {r4-r11,pc}
.align 8
4:
@ offset 3
ldmia r1, {r4-r6}
add r1, r1, r2
pld [r1]
ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
6: ldmia r1, {r7-r9}
add r1, r1, r2
pld [r1]
ADJ_ALIGN_DOUBLEWORD 3, r7, r8, r9
RND_AVG32 r10, r11, r4, r5, r7, r8, r12
stmia r0, {r10-r11}
add r0, r0, r2
ldmia r1, {r4-r6}
add r1, r1, r2
pld [r1]
ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
subs r3, r3, #1
RND_AVG32 r10, r11, r7, r8, r4, r5, r12
stmia r0, {r10-r11}
add r0, r0, r2
bne 6b
ldmfd sp!, {r4-r11,pc}
.align 8
5:
@ word 0: averaging mask; words 1..3: jump targets for offsets 1..3
.word 0xFEFEFEFE
.word 2b
.word 3b
.word 4b
@ put_no_rnd_pixels8_y2_arm: each output row stored to block (r0) is the
@ NO_RND_AVG32 of two consecutive 8-byte source rows read from pixels (r1).
@ h (r3) is halved on entry and every loop iteration (label 6:) emits two
@ output rows, keeping the most recently loaded row in registers as the upper
@ row of the next pair.
@ r12 holds the 0xFEFEFEFE mask read from the first word of the table at 5:;
@ the remaining table words are jump targets for source offsets 1..3.
.align 8
.global put_no_rnd_pixels8_y2_arm
put_no_rnd_pixels8_y2_arm:
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
@ block = word aligned, pixels = unaligned
pld [r1]
stmfd sp!, {r4-r11,lr} @ R14 is also called LR
adr r5, 5f
@ r4 = byte offset of pixels inside its word; Z flag set when already aligned
ands r4, r1, #3
@ loop counter = h / 2 (mov without "s" leaves the flags from ands intact)
mov r3, r3, lsr #1
ldr r12, [r5]
add r5, r5, r4, lsl #2
bic r1, r1, #3
ldrne pc, [r5]
1:
@ offset 0: preload the first row into r4-r5
ldmia r1, {r4-r5}
add r1, r1, r2
6: ldmia r1, {r6-r7}
add r1, r1, r2
pld [r1]
NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
ldmia r1, {r4-r5}
add r1, r1, r2
stmia r0, {r8-r9}
add r0, r0, r2
pld [r1]
NO_RND_AVG32 r8, r9, r6, r7, r4, r5, r12
subs r3, r3, #1
stmia r0, {r8-r9}
add r0, r0, r2
bne 6b
ldmfd sp!, {r4-r11,pc}
.align 8
2:
@ offset 1: preload and realign the first row into r4-r5
ldmia r1, {r4-r6}
add r1, r1, r2
pld [r1]
ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
6: ldmia r1, {r7-r9}
add r1, r1, r2
pld [r1]
ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9
NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
stmia r0, {r10-r11}
add r0, r0, r2
ldmia r1, {r4-r6}
add r1, r1, r2
pld [r1]
ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
subs r3, r3, #1
NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
stmia r0, {r10-r11}
add r0, r0, r2
bne 6b
ldmfd sp!, {r4-r11,pc}
.align 8
3:
@ offset 2
ldmia r1, {r4-r6}
add r1, r1, r2
pld [r1]
ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
6: ldmia r1, {r7-r9}
add r1, r1, r2
pld [r1]
ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9
NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
stmia r0, {r10-r11}
add r0, r0, r2
ldmia r1, {r4-r6}
add r1, r1, r2
pld [r1]
ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
subs r3, r3, #1
NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
stmia r0, {r10-r11}
add r0, r0, r2
bne 6b
ldmfd sp!, {r4-r11,pc}
.align 8
4:
@ offset 3
ldmia r1, {r4-r6}
add r1, r1, r2
pld [r1]
ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
6: ldmia r1, {r7-r9}
add r1, r1, r2
pld [r1]
ADJ_ALIGN_DOUBLEWORD 3, r7, r8, r9
NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
stmia r0, {r10-r11}
add r0, r0, r2
ldmia r1, {r4-r6}
add r1, r1, r2
pld [r1]
ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
subs r3, r3, #1
NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
stmia r0, {r10-r11}
add r0, r0, r2
bne 6b
ldmfd sp!, {r4-r11,pc}
.align 8
5:
@ word 0: averaging mask; words 1..3: jump targets for offsets 1..3
.word 0xFEFEFEFE
.word 2b
.word 3b
.word 4b
@ ----------------------------------------------------------------
@ RND_XY2_IT align, rnd
@ Loads one source row at r1 (then advances r1 by r2) and, for the 8 bytes a
@ of that row and the 8 bytes b starting one byte further on, produces:
@   r8,  r9  = (a & 0x03030303) + (b & 0x03030303), plus a rounding constant
@              when bit 0 of r3 is clear (0x02020202 if rnd == 1, otherwise
@              0x01010101)
@   r10, r11 = ((a >> 2) & (0xFCFCFCFC >> 2)) + ((b >> 2) & (0xFCFCFCFC >> 2))
@ `align` is the byte offset (0..3) of the source inside its word.
@ r12 must point at the constant table of the calling function (label 5:).
@ Registers r4-r11 and r14 are overwritten.
.macro RND_XY2_IT align, rnd
@ l1= (a & 0x03030303) + (b & 0x03030303) ?(+ 0x02020202)
@ h1= ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2)
.if \align == 0
ldmia r1, {r6-r8}
.elseif \align == 3
ldmia r1, {r5-r7}
.else
ldmia r1, {r8-r10}
.endif
add r1, r1, r2
pld [r1]
@ after this step r4,r5 and r6,r7 hold the two horizontally adjacent 8-byte
@ groups
.if \align == 0
ADJ_ALIGN_DOUBLEWORD_D 1, r4, r5, r6, r7, r8
.elseif \align == 1
ADJ_ALIGN_DOUBLEWORD_D 1, r4, r5, r8, r9, r10
ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r8, r9, r10
.elseif \align == 2
ADJ_ALIGN_DOUBLEWORD_D 2, r4, r5, r8, r9, r10
ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r8, r9, r10
.elseif \align == 3
ADJ_ALIGN_DOUBLEWORD_D 3, r4, r5, r5, r6, r7
.endif
ldr r14, [r12, #0] @ 0x03030303
@ Z flag = (r3 is even); selects whether the rounding constant is added below
tst r3, #1
and r8, r4, r14
and r9, r5, r14
and r10, r6, r14
and r11, r7, r14
.if \rnd == 1
ldreq r14, [r12, #16] @ 0x02020202
.else
ldreq r14, [r12, #28] @ 0x01010101
.endif
add r8, r8, r10
add r9, r9, r11
addeq r8, r8, r14
addeq r9, r9, r14
ldr r14, [r12, #20] @ 0xFCFCFCFC >> 2
and r4, r14, r4, lsr #2
and r5, r14, r5, lsr #2
and r6, r14, r6, lsr #2
and r7, r14, r7, lsr #2
add r10, r4, r6
add r11, r5, r7
.endm
@ RND_XY2_EXPAND align, rnd
@ Complete loop body plus function epilogue for one source alignment.
@ The partial sums of the previous row (r8-r11 from RND_XY2_IT) are pushed on
@ the stack, the next row is processed, and the two sets of sums are combined:
@   out = (((low_prev + low_cur) >> 2) & 0x0F0F0F0F) + (high_prev + high_cur)
@ One 8-byte output row is stored to r0 per iteration; r3 counts rows.
@ Ends by restoring r4-r11 and returning.
@ NOTE(review): the RND_XY2_IT before the loop and the first one inside it see
@ the same r3, so for the first output row the rounding constant appears to be
@ added either twice or not at all -- confirm against the C reference.
.macro RND_XY2_EXPAND align, rnd
RND_XY2_IT \align, \rnd
6: stmfd sp!, {r8-r11}
RND_XY2_IT \align, \rnd
@ r4-r7 = sums of the previous row, r8-r11 = sums of the current row
ldmfd sp!, {r4-r7}
add r4, r4, r8
add r5, r5, r9
add r6, r6, r10
add r7, r7, r11
ldr r14, [r12, #24] @ 0x0F0F0F0F
and r4, r14, r4, lsr #2
and r5, r14, r5, lsr #2
add r4, r4, r6
add r5, r5, r7
subs r3, r3, #1
stmia r0, {r4-r5}
add r0, r0, r2
bne 6b
ldmfd sp!, {r4-r11,pc}
.endm
@ put_pixels8_xy2_arm: entry point that expands RND_XY2_EXPAND with rnd = 1
@ once per source byte offset (0..3) and dispatches on (pixels & 3).
@ r12 is set to the table at 5:, which holds both the jump targets (words
@ 1..3) and the constants that RND_XY2_IT / RND_XY2_EXPAND read at fixed
@ offsets (#0, #16, #20, #24, #28).
.align 8
.global put_pixels8_xy2_arm
put_pixels8_xy2_arm:
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
@ block = word aligned, pixels = unaligned
pld [r1]
stmfd sp!, {r4-r11,lr} @ R14 is also called LR
adrl r12, 5f
@ r4 = byte offset of pixels inside its word; Z flag set when already aligned
ands r4, r1, #3
add r5, r12, r4, lsl #2
bic r1, r1, #3
@ unaligned: jump through the table; aligned: fall through to 1:
ldrne pc, [r5]
1:
RND_XY2_EXPAND 0, 1
.align 8
2:
RND_XY2_EXPAND 1, 1
.align 8
3:
RND_XY2_EXPAND 2, 1
.align 8
4:
RND_XY2_EXPAND 3, 1
5:
@ offset #0: low-bit mask (slot 0 is free because offset 0 never jumps)
.word 0x03030303
.word 2b
.word 3b
.word 4b
@ offsets #16, #20, #24, #28
.word 0x02020202
.word 0xFCFCFCFC >> 2
.word 0x0F0F0F0F
.word 0x01010101
@ put_no_rnd_pixels8_xy2_arm: entry point that expands RND_XY2_EXPAND with
@ rnd = 0 once per source byte offset (0..3) and dispatches on (pixels & 3).
@ r12 is set to the table at 5:, which holds both the jump targets (words
@ 1..3) and the constants that RND_XY2_IT / RND_XY2_EXPAND read at fixed
@ offsets (#0, #16, #20, #24, #28).
.align 8
.global put_no_rnd_pixels8_xy2_arm
put_no_rnd_pixels8_xy2_arm:
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
@ block = word aligned, pixels = unaligned
pld [r1]
stmfd sp!, {r4-r11,lr} @ R14 is also called LR
adrl r12, 5f
@ r4 = byte offset of pixels inside its word; Z flag set when already aligned
ands r4, r1, #3
add r5, r12, r4, lsl #2
bic r1, r1, #3
@ unaligned: jump through the table; aligned: fall through to 1:
ldrne pc, [r5]
1:
RND_XY2_EXPAND 0, 0
.align 8
2:
RND_XY2_EXPAND 1, 0
.align 8
3:
RND_XY2_EXPAND 2, 0
.align 8
4:
RND_XY2_EXPAND 3, 0
5:
@ offset #0: low-bit mask (slot 0 is free because offset 0 never jumps)
.word 0x03030303
.word 2b
.word 3b
.word 4b
@ offsets #16, #20, #24, #28
.word 0x02020202
.word 0xFCFCFCFC >> 2
.word 0x0F0F0F0F
.word 0x01010101
/*
* iWMMXt optimized DSP utils
* Copyright (c) 2004 AGAWA Koji
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include "../dsputil.h"
/*
 * Instantiate the function templates in dsputil_iwmmxt_rnd.h twice.
 *
 * First pass:  names of the form <op>_no_rnd_<name>_iwmmxt, averaging with
 *              "wavg2b" and SET_RND broadcasting the constant 1.
 * Second pass: names of the form <op>_<name>_iwmmxt, averaging with
 *              "wavg2br" and SET_RND broadcasting the constant 2.
 *
 * Every helper macro is undefined again after each pass so that nothing
 * leaks into the rest of this translation unit.
 */
#define DEF(x, y) x ## _no_rnd_ ## y ##_iwmmxt
#define SET_RND(regd) __asm__ __volatile__ ("mov r12, #1 \n\t tbcsth " #regd ", r12":::"r12");
#define WAVG2B "wavg2b"
#include "dsputil_iwmmxt_rnd.h"
#undef DEF
#undef SET_RND
#undef WAVG2B
#define DEF(x, y) x ## _ ## y ##_iwmmxt
#define SET_RND(regd) __asm__ __volatile__ ("mov r12, #2 \n\t tbcsth " #regd ", r12":::"r12");
#define WAVG2B "wavg2br"
#include "dsputil_iwmmxt_rnd.h"
#undef DEF
#undef SET_RND
/* The macro defined above is WAVG2B (its value is "wavg2br"). */
#undef WAVG2B
// need scheduling
#define OP(AVG) \
asm volatile ( \
/* alignment */ \
"and r12, %[pixels], #7 \n\t" \
"bic %[pixels], %[pixels], #7 \n\t" \
"tmcr wcgr1, r12 \n\t" \
\
"wldrd wr0, [%[pixels]] \n\t" \
"wldrd wr1, [%[pixels], #8] \n\t" \
"add %[pixels], %[pixels], %[line_size] \n\t" \
"walignr1 wr4, wr0, wr1 \n\t" \
\
"1: \n\t" \
\
"wldrd wr2, [%[pixels]] \n\t" \
"wldrd wr3, [%[pixels], #8] \n\t" \
"add %[pixels], %[pixels], %[line_size] \n\t" \
"pld [%[pixels]] \n\t" \
"walignr1 wr5, wr2, wr3 \n\t" \
AVG " wr6, wr4, wr5 \n\t" \
"wstrd wr6, [%[block]] \n\t" \
"add %[block], %[block], %[line_size] \n\t" \
\
"wldrd wr0, [%[pixels]] \n\t" \
"wldrd wr1, [%[pixels], #8] \n\t" \
"add %[pixels], %[pixels], %[line_size] \n\t" \
"walignr1 wr4, wr0, wr1 \n\t" \
"pld [%[pixels]] \n\t" \
AVG " wr6, wr4, wr5 \n\t" \
"wstrd wr6, [%[block]] \n\t" \
"add %[block], %[block], %[line_size] \n\t" \
\
"subs %[h], %[h], #2 \n\t" \
"bne 1b \n\t" \
: [block]"+r"(block), [pixels]"+r"(pixels), [h]"+r"(h) \
: [line_size]"r"(line_size) \
: "memory", "r12");
void put_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
OP("wavg2br");
}
void put_no_rnd_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
OP("wavg2b");
}
#undef OP
/*
 * Add an 8x8 block of 16-bit coefficients to 8x8 pixels in place (iWMMXt).
 *
 * The loop runs four times and handles two rows per iteration: pixels tracks
 * the even rows and pixels2 (= pixels + line_size) the odd rows, both stepping
 * by 2 * line_size.  block advances by 32 bytes (16 coefficients) per
 * iteration.  Rows are accessed with 8-byte wldrd/wstrd.
 *
 * NOTE(review): waddhss / wpackhus are presumably the signed-saturating
 * halfword add and unsigned-saturating pack that provide the clamping --
 * confirm against the iWMMXt instruction reference.
 */
void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size)
{
uint8_t *pixels2 = pixels + line_size;
__asm__ __volatile__ (
"mov r12, #4 \n\t" /* iteration counter: 4 x 2 rows */
"1: \n\t"
"pld [%[pixels], %[line_size2]] \n\t"
"pld [%[pixels2], %[line_size2]] \n\t"
/* load one even and one odd pixel row */
"wldrd wr4, [%[pixels]] \n\t"
"wldrd wr5, [%[pixels2]] \n\t"
"pld [%[block], #32] \n\t"
/* widen pixel bytes to halfwords, interleaved with the coefficient loads */
"wunpckelub wr6, wr4 \n\t"
"wldrd wr0, [%[block]] \n\t"
"wunpckehub wr7, wr4 \n\t"
"wldrd wr1, [%[block], #8] \n\t"
"wunpckelub wr8, wr5 \n\t"
"wldrd wr2, [%[block], #16] \n\t"
"wunpckehub wr9, wr5 \n\t"
"wldrd wr3, [%[block], #24] \n\t"
"add %[block], %[block], #32 \n\t"
/* coefficient + pixel, then pack back to bytes */
"waddhss wr10, wr0, wr6 \n\t"
"waddhss wr11, wr1, wr7 \n\t"
"waddhss wr12, wr2, wr8 \n\t"
"waddhss wr13, wr3, wr9 \n\t"
"wpackhus wr14, wr10, wr11 \n\t"
"wpackhus wr15, wr12, wr13 \n\t"
"wstrd wr14, [%[pixels]] \n\t"
"add %[pixels], %[pixels], %[line_size2] \n\t"
"subs r12, r12, #1 \n\t"
"wstrd wr15, [%[pixels2]] \n\t"
"add %[pixels2], %[pixels2], %[line_size2] \n\t"
"bne 1b \n\t"
: [block]"+r"(block), [pixels]"+r"(pixels), [pixels2]"+r"(pixels2)
: [line_size2]"r"(line_size << 1)
: "cc", "memory", "r12");
}
/* Do-nothing function with the (block, pixels, line_size, h) pixel-routine
 * signature; every argument is ignored. */
static void nop(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    (void)block;
    (void)pixels;
    (void)line_size;
    (void)h;
}
/*
 * Install the iWMMXt implementations into the DSP context: the clamped add
 * and all put / put_no_rnd / avg / avg_no_rnd pixel tables.  The first table
 * index selects the 16-wide ([0]) or 8-wide ([1]) routines, the second the
 * plain, x2, y2 or xy2 variant.
 *
 * avctx is not used.
 *
 * NOTE(review): avg_no_rnd_pixels_tab[0][0] is given avg_pixels16_iwmmxt
 * while avg_no_rnd_pixels_tab[1][0] is given avg_no_rnd_pixels8_iwmmxt --
 * confirm which of the two is intended.
 */
void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx)
{
c->add_pixels_clamped = add_pixels_clamped_iwmmxt;
/* put, 16 wide */
c->put_pixels_tab[0][0] = put_pixels16_iwmmxt;
c->put_pixels_tab[0][1] = put_pixels16_x2_iwmmxt;
c->put_pixels_tab[0][2] = put_pixels16_y2_iwmmxt;
c->put_pixels_tab[0][3] = put_pixels16_xy2_iwmmxt;
/* put_no_rnd, 16 wide; entry 0 reuses the plain put routine */
c->put_no_rnd_pixels_tab[0][0] = put_pixels16_iwmmxt;
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_iwmmxt;
c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_iwmmxt;
c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_iwmmxt;
/* put, 8 wide */
c->put_pixels_tab[1][0] = put_pixels8_iwmmxt;
c->put_pixels_tab[1][1] = put_pixels8_x2_iwmmxt;
c->put_pixels_tab[1][2] = put_pixels8_y2_iwmmxt;
c->put_pixels_tab[1][3] = put_pixels8_xy2_iwmmxt;
/* put_no_rnd, 8 wide; entry 0 reuses the plain put routine */
c->put_no_rnd_pixels_tab[1][0] = put_pixels8_iwmmxt;
c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_iwmmxt;
c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_iwmmxt;
c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_iwmmxt;
/* avg, 16 wide */
c->avg_pixels_tab[0][0] = avg_pixels16_iwmmxt;
c->avg_pixels_tab[0][1] = avg_pixels16_x2_iwmmxt;
c->avg_pixels_tab[0][2] = avg_pixels16_y2_iwmmxt;
c->avg_pixels_tab[0][3] = avg_pixels16_xy2_iwmmxt;
/* avg_no_rnd, 16 wide */
c->avg_no_rnd_pixels_tab[0][0] = avg_pixels16_iwmmxt;
c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_iwmmxt;
c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_iwmmxt;
c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_iwmmxt;
/* avg, 8 wide */
c->avg_pixels_tab[1][0] = avg_pixels8_iwmmxt;
c->avg_pixels_tab[1][1] = avg_pixels8_x2_iwmmxt;
c->avg_pixels_tab[1][2] = avg_pixels8_y2_iwmmxt;
c->avg_pixels_tab[1][3] = avg_pixels8_xy2_iwmmxt;
/* avg_no_rnd, 8 wide */
c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_iwmmxt;
c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_iwmmxt;
c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_iwmmxt;
c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_iwmmxt;
}
/*
 * Template: copy h rows of 8 bytes from pixels to block.
 *
 * The low three bits of pixels go to wcgr1 and pixels is rounded down to an
 * 8-byte boundary; each row is read as two 8-byte loads and realigned with
 * walignr1.  Two rows are handled per iteration: %[pixels]/%[block] walk the
 * even rows, r4/r5 the odd rows, all stepping by 2 * line_size.  h is
 * decremented by 2, so the loop only terminates for a positive even h.
 *
 * line_size is copied into a local because the asm doubles it in place.
 * "cc" is listed as clobbered because the loop uses subs/bne.
 */
void DEF(put, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
    int stride = line_size;
    __asm__ __volatile__ (
        "and r12, %[pixels], #7 \n\t"
        "bic %[pixels], %[pixels], #7 \n\t"
        "tmcr wcgr1, r12 \n\t"
        "add r4, %[pixels], %[line_size] \n\t"
        "add r5, %[block], %[line_size] \n\t"
        "mov %[line_size], %[line_size], lsl #1 \n\t"
        "1: \n\t"
        "wldrd wr0, [%[pixels]] \n\t"
        "subs %[h], %[h], #2 \n\t"
        "wldrd wr1, [%[pixels], #8] \n\t"
        "add %[pixels], %[pixels], %[line_size] \n\t"
        "wldrd wr3, [r4] \n\t"
        "pld [%[pixels]] \n\t"
        "pld [%[pixels], #32] \n\t"
        "wldrd wr4, [r4, #8] \n\t"
        "add r4, r4, %[line_size] \n\t"
        "walignr1 wr8, wr0, wr1 \n\t"
        "pld [r4] \n\t"
        "pld [r4, #32] \n\t"
        "walignr1 wr10, wr3, wr4 \n\t"
        "wstrd wr8, [%[block]] \n\t"
        "add %[block], %[block], %[line_size] \n\t"
        "wstrd wr10, [r5] \n\t"
        "add r5, r5, %[line_size] \n\t"
        "bne 1b \n\t"
        : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
        :
        : "cc", "memory", "r4", "r5", "r12");
}
/*
 * Template: combine h rows of 8 bytes from pixels with the existing contents
 * of block using the WAVG2B instruction, storing the result back to block.
 *
 * The low three bits of pixels go to wcgr1 and pixels is rounded down to an
 * 8-byte boundary; each source row is read as two 8-byte loads and realigned
 * with walignr1.  Two rows are handled per iteration: %[pixels]/%[block] walk
 * the even rows, r4/r5 the odd rows, all stepping by 2 * line_size.  h is
 * decremented by 2, so the loop only terminates for a positive even h.
 *
 * line_size is copied into a local because the asm doubles it in place.
 * "cc" is listed as clobbered because the loop uses subs/bne.
 */
void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
    int stride = line_size;
    __asm__ __volatile__ (
        "and r12, %[pixels], #7 \n\t"
        "bic %[pixels], %[pixels], #7 \n\t"
        "tmcr wcgr1, r12 \n\t"
        "add r4, %[pixels], %[line_size] \n\t"
        "add r5, %[block], %[line_size] \n\t"
        "mov %[line_size], %[line_size], lsl #1 \n\t"
        "1: \n\t"
        "wldrd wr0, [%[pixels]] \n\t"
        "subs %[h], %[h], #2 \n\t"
        "wldrd wr1, [%[pixels], #8] \n\t"
        "add %[pixels], %[pixels], %[line_size] \n\t"
        "wldrd wr3, [r4] \n\t"
        "pld [%[pixels]] \n\t"
        "pld [%[pixels], #32] \n\t"
        "wldrd wr4, [r4, #8] \n\t"
        "add r4, r4, %[line_size] \n\t"
        "walignr1 wr8, wr0, wr1 \n\t"
        /* wr0 / wr2 are reused for the current destination rows */
        "wldrd wr0, [%[block]] \n\t"
        "wldrd wr2, [r5] \n\t"
        "pld [r4] \n\t"
        "pld [r4, #32] \n\t"
        "walignr1 wr10, wr3, wr4 \n\t"
        WAVG2B" wr8, wr8, wr0 \n\t"
        WAVG2B" wr10, wr10, wr2 \n\t"
        "wstrd wr8, [%[block]] \n\t"
        "add %[block], %[block], %[line_size] \n\t"
        "wstrd wr10, [r5] \n\t"
        "pld [%[block]] \n\t"
        "pld [%[block], #32] \n\t"
        "add r5, r5, %[line_size] \n\t"
        "pld [r5] \n\t"
        "pld [r5, #32] \n\t"
        "bne 1b \n\t"
        : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
        :
        : "cc", "memory", "r4", "r5", "r12");
}
/*
 * Template: copy h rows of 16 bytes from pixels to block.
 *
 * The low three bits of pixels go to wcgr1 and pixels is rounded down to an
 * 8-byte boundary; each row is read as three 8-byte loads and realigned into
 * two 8-byte results with walignr1.  Two rows are handled per iteration:
 * %[pixels]/%[block] walk the even rows, r4/r5 the odd rows, all stepping by
 * 2 * line_size.  h is decremented by 2, so the loop only terminates for a
 * positive even h.
 *
 * line_size is copied into a local because the asm doubles it in place.
 * "cc" is listed as clobbered because the loop uses subs/bne.
 */
void DEF(put, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
    int stride = line_size;
    __asm__ __volatile__ (
        "and r12, %[pixels], #7 \n\t"
        "bic %[pixels], %[pixels], #7 \n\t"
        "tmcr wcgr1, r12 \n\t"
        "add r4, %[pixels], %[line_size] \n\t"
        "add r5, %[block], %[line_size] \n\t"
        "mov %[line_size], %[line_size], lsl #1 \n\t"
        "1: \n\t"
        "wldrd wr0, [%[pixels]] \n\t"
        "wldrd wr1, [%[pixels], #8] \n\t"
        "subs %[h], %[h], #2 \n\t"
        "wldrd wr2, [%[pixels], #16] \n\t"
        "add %[pixels], %[pixels], %[line_size] \n\t"
        "wldrd wr3, [r4] \n\t"
        "pld [%[pixels]] \n\t"
        "pld [%[pixels], #32] \n\t"
        "walignr1 wr8, wr0, wr1 \n\t"
        "wldrd wr4, [r4, #8] \n\t"
        "walignr1 wr9, wr1, wr2 \n\t"
        "wldrd wr5, [r4, #16] \n\t"
        "add r4, r4, %[line_size] \n\t"
        "pld [r4] \n\t"
        "pld [r4, #32] \n\t"
        "walignr1 wr10, wr3, wr4 \n\t"
        "wstrd wr8, [%[block]] \n\t"
        "walignr1 wr11, wr4, wr5 \n\t"
        "wstrd wr9, [%[block], #8] \n\t"
        "add %[block], %[block], %[line_size] \n\t"
        "wstrd wr10, [r5] \n\t"
        "wstrd wr11, [r5, #8] \n\t"
        "add r5, r5, %[line_size] \n\t"
        "bne 1b \n\t"
        : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
        :
        : "cc", "memory", "r4", "r5", "r12");
}
/**
 * Average a 16-byte-wide source block into the destination, two rows per
 * loop pass.
 *
 * @param block     destination rows, read and written as two 64-bit words
 *                  per row (presumably must be 8-byte aligned - confirm)
 * @param pixels    source rows, any byte alignment
 * @param line_size distance in bytes between consecutive rows
 * @param h         row count; must be a positive even number because the
 *                  loop subtracts 2 and exits on zero
 *
 * WAVG2B is a macro supplied by the including file (presumably selecting the
 * rounding or non-rounding byte average - confirm).
 */
void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
    int stride = line_size;
    __asm__ __volatile__ (
        "pld [%[pixels]] \n\t"
        "pld [%[pixels], #32] \n\t"
        "pld [%[block]] \n\t"
        "pld [%[block], #32] \n\t"
        /* r12 = byte misalignment of the source, wcgr1 = extract offset */
        "and r12, %[pixels], #7 \n\t"
        "bic %[pixels], %[pixels], #7 \n\t"
        "tmcr wcgr1, r12 \n\t"
        /* r4/r5 walk the odd rows, pixels/block the even rows */
        "add r4, %[pixels], %[line_size]\n\t"
        "add r5, %[block], %[line_size] \n\t"
        "mov %[line_size], %[line_size], lsl #1 \n\t"
        "1: \n\t"
        "wldrd wr0, [%[pixels]] \n\t"
        "wldrd wr1, [%[pixels], #8] \n\t"
        "subs %[h], %[h], #2 \n\t"
        "wldrd wr2, [%[pixels], #16] \n\t"
        "add %[pixels], %[pixels], %[line_size] \n\t"
        "wldrd wr3, [r4] \n\t"
        "pld [%[pixels]] \n\t"
        "pld [%[pixels], #32] \n\t"
        "walignr1 wr8, wr0, wr1 \n\t"
        "wldrd wr4, [r4, #8] \n\t"
        "walignr1 wr9, wr1, wr2 \n\t"
        "wldrd wr5, [r4, #16] \n\t"
        "add r4, r4, %[line_size] \n\t"
        /* wr0..wr3 are reused for the current destination contents */
        "wldrd wr0, [%[block]] \n\t"
        "pld [r4] \n\t"
        "wldrd wr1, [%[block], #8] \n\t"
        "pld [r4, #32] \n\t"
        "wldrd wr2, [r5] \n\t"
        "walignr1 wr10, wr3, wr4 \n\t"
        "wldrd wr3, [r5, #8] \n\t"
        WAVG2B" wr8, wr8, wr0 \n\t"
        WAVG2B" wr9, wr9, wr1 \n\t"
        WAVG2B" wr10, wr10, wr2 \n\t"
        "wstrd wr8, [%[block]] \n\t"
        "walignr1 wr11, wr4, wr5 \n\t"
        WAVG2B" wr11, wr11, wr3 \n\t"
        "wstrd wr9, [%[block], #8] \n\t"
        "add %[block], %[block], %[line_size] \n\t"
        "wstrd wr10, [r5] \n\t"
        "pld [%[block]] \n\t"
        "pld [%[block], #32] \n\t"
        "wstrd wr11, [r5, #8] \n\t"
        "add r5, r5, %[line_size] \n\t"
        "pld [r5] \n\t"
        "pld [r5, #32] \n\t"
        /* flags still come from the subs above */
        "bne 1b \n\t"
        : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
        :
        /* "cc": subs rewrites the condition flags */
        : "cc", "memory", "r4", "r5", "r12");
}
/**
 * Horizontal half-pel put, 8 bytes wide: each output byte is the WAVG2B
 * average of source bytes at x and x+1. Two rows are produced per loop pass.
 *
 * @param block     destination rows, written as one 64-bit word per row
 *                  (presumably must be 8-byte aligned - confirm)
 * @param pixels    source rows, any byte alignment
 * @param line_size distance in bytes between consecutive rows
 * @param h         row count; must be a positive even number because the
 *                  loop subtracts 2 and exits on zero
 *
 * WAVG2B and SET_RND are macros supplied by the including file.
 */
void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
    int stride = line_size;
    /* NOTE(review): wr15 is not read by the asm below, so this call looks
     * vestigial - confirm against the SET_RND definition before removing. */
    SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
    __asm__ __volatile__(
        "pld [%[pixels]] \n\t"
        "pld [%[pixels], #32] \n\t"
        /* wcgr1 = source misalignment (column x), wcgr2 = that + 1 (x+1) */
        "and r12, %[pixels], #7 \n\t"
        "bic %[pixels], %[pixels], #7 \n\t"
        "tmcr wcgr1, r12 \n\t"
        "add r12, r12, #1 \n\t"
        "add r4, %[pixels], %[line_size]\n\t"
        "tmcr wcgr2, r12 \n\t"
        "add r5, %[block], %[line_size] \n\t"
        "mov %[line_size], %[line_size], lsl #1 \n\t"
        "1: \n\t"
        "wldrd wr10, [%[pixels]] \n\t"
        /* offset+1 == 8: the x+1 bytes are exactly the second loaded word */
        "cmp r12, #8 \n\t"
        "wldrd wr11, [%[pixels], #8] \n\t"
        "add %[pixels], %[pixels], %[line_size] \n\t"
        "wldrd wr13, [r4] \n\t"
        "pld [%[pixels]] \n\t"
        "wldrd wr14, [r4, #8] \n\t"
        "pld [%[pixels], #32] \n\t"
        "add r4, r4, %[line_size] \n\t"
        "walignr1 wr0, wr10, wr11 \n\t"
        "pld [r4] \n\t"
        "pld [r4, #32] \n\t"
        "walignr1 wr2, wr13, wr14 \n\t"
        "wmoveq wr4, wr11 \n\t"
        "wmoveq wr6, wr14 \n\t"
        "walignr2ne wr4, wr10, wr11 \n\t"
        "walignr2ne wr6, wr13, wr14 \n\t"
        WAVG2B" wr0, wr0, wr4 \n\t"
        WAVG2B" wr2, wr2, wr6 \n\t"
        "wstrd wr0, [%[block]] \n\t"
        "subs %[h], %[h], #2 \n\t"
        "wstrd wr2, [r5] \n\t"
        "add %[block], %[block], %[line_size] \n\t"
        "add r5, r5, %[line_size] \n\t"
        "bne 1b \n\t"
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
        :
        /* "cc": cmp and subs rewrite the condition flags */
        : "cc", "r4", "r5", "r12", "memory");
}
/**
 * Horizontal half-pel put, 16 bytes wide: each output byte is the WAVG2B
 * average of source bytes at x and x+1. Two rows are produced per loop pass.
 *
 * @param block     destination rows, written as two 64-bit words per row
 *                  (presumably must be 8-byte aligned - confirm)
 * @param pixels    source rows, any byte alignment
 * @param line_size distance in bytes between consecutive rows
 * @param h         row count; must be a positive even number because the
 *                  loop subtracts 2 and exits on zero
 *
 * WAVG2B and SET_RND are macros supplied by the including file.
 */
void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
    int stride = line_size;
    /* NOTE(review): wr15 is overwritten by a load inside the loop and never
     * used as a rounder here, so this call looks vestigial - confirm against
     * the SET_RND definition before removing. */
    SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
    __asm__ __volatile__(
        "pld [%[pixels]] \n\t"
        "pld [%[pixels], #32] \n\t"
        /* wcgr1 = source misalignment (column x), wcgr2 = that + 1 (x+1) */
        "and r12, %[pixels], #7 \n\t"
        "bic %[pixels], %[pixels], #7 \n\t"
        "tmcr wcgr1, r12 \n\t"
        "add r12, r12, #1 \n\t"
        "add r4, %[pixels], %[line_size]\n\t"
        "tmcr wcgr2, r12 \n\t"
        "add r5, %[block], %[line_size] \n\t"
        "mov %[line_size], %[line_size], lsl #1 \n\t"
        "1: \n\t"
        "wldrd wr10, [%[pixels]] \n\t"
        /* offset+1 == 8: the x+1 bytes are exactly the following words */
        "cmp r12, #8 \n\t"
        "wldrd wr11, [%[pixels], #8] \n\t"
        "wldrd wr12, [%[pixels], #16] \n\t"
        "add %[pixels], %[pixels], %[line_size] \n\t"
        "wldrd wr13, [r4] \n\t"
        "pld [%[pixels]] \n\t"
        "wldrd wr14, [r4, #8] \n\t"
        "pld [%[pixels], #32] \n\t"
        "wldrd wr15, [r4, #16] \n\t"
        "add r4, r4, %[line_size] \n\t"
        /* wr0..wr3: bytes at x for both rows */
        "walignr1 wr0, wr10, wr11 \n\t"
        "pld [r4] \n\t"
        "pld [r4, #32] \n\t"
        "walignr1 wr1, wr11, wr12 \n\t"
        "walignr1 wr2, wr13, wr14 \n\t"
        "walignr1 wr3, wr14, wr15 \n\t"
        /* wr4..wr7: bytes at x+1 for both rows */
        "wmoveq wr4, wr11 \n\t"
        "wmoveq wr5, wr12 \n\t"
        "wmoveq wr6, wr14 \n\t"
        "wmoveq wr7, wr15 \n\t"
        "walignr2ne wr4, wr10, wr11 \n\t"
        "walignr2ne wr5, wr11, wr12 \n\t"
        "walignr2ne wr6, wr13, wr14 \n\t"
        "walignr2ne wr7, wr14, wr15 \n\t"
        WAVG2B" wr0, wr0, wr4 \n\t"
        WAVG2B" wr1, wr1, wr5 \n\t"
        "wstrd wr0, [%[block]] \n\t"
        WAVG2B" wr2, wr2, wr6 \n\t"
        "wstrd wr1, [%[block], #8] \n\t"
        WAVG2B" wr3, wr3, wr7 \n\t"
        "add %[block], %[block], %[line_size] \n\t"
        "wstrd wr2, [r5] \n\t"
        "subs %[h], %[h], #2 \n\t"
        "wstrd wr3, [r5, #8] \n\t"
        "add r5, r5, %[line_size] \n\t"
        "bne 1b \n\t"
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
        :
        /* "cc": cmp and subs rewrite the condition flags */
        : "cc", "r4", "r5", "r12", "memory");
}
/**
 * Horizontal half-pel avg, 8 bytes wide: the WAVG2B average of source bytes
 * at x and x+1 is averaged (again with WAVG2B) into the existing destination.
 * Two rows are produced per loop pass.
 *
 * @param block     destination rows, read and written as one 64-bit word per
 *                  row (presumably must be 8-byte aligned - confirm)
 * @param pixels    source rows, any byte alignment
 * @param line_size distance in bytes between consecutive rows
 * @param h         row count; must be a positive even number because the
 *                  loop subtracts 2 and exits on zero
 *
 * WAVG2B and SET_RND are macros supplied by the including file.
 */
void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
    int stride = line_size;
    /* NOTE(review): wr15 is not read by the asm below, so this call looks
     * vestigial - confirm against the SET_RND definition before removing. */
    SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
    __asm__ __volatile__(
        "pld [%[pixels]] \n\t"
        "pld [%[pixels], #32] \n\t"
        "pld [%[block]] \n\t"
        "pld [%[block], #32] \n\t"
        /* wcgr1 = source misalignment (column x), wcgr2 = that + 1 (x+1) */
        "and r12, %[pixels], #7 \n\t"
        "bic %[pixels], %[pixels], #7 \n\t"
        "tmcr wcgr1, r12 \n\t"
        "add r12, r12, #1 \n\t"
        "add r4, %[pixels], %[line_size]\n\t"
        "tmcr wcgr2, r12 \n\t"
        "add r5, %[block], %[line_size] \n\t"
        "mov %[line_size], %[line_size], lsl #1 \n\t"
        "pld [r5] \n\t"
        "pld [r5, #32] \n\t"
        "1: \n\t"
        "wldrd wr10, [%[pixels]] \n\t"
        /* offset+1 == 8: the x+1 bytes are exactly the second loaded word */
        "cmp r12, #8 \n\t"
        "wldrd wr11, [%[pixels], #8] \n\t"
        "add %[pixels], %[pixels], %[line_size] \n\t"
        "wldrd wr13, [r4] \n\t"
        "pld [%[pixels]] \n\t"
        "wldrd wr14, [r4, #8] \n\t"
        "pld [%[pixels], #32] \n\t"
        "add r4, r4, %[line_size] \n\t"
        "walignr1 wr0, wr10, wr11 \n\t"
        "pld [r4] \n\t"
        "pld [r4, #32] \n\t"
        "walignr1 wr2, wr13, wr14 \n\t"
        "wmoveq wr4, wr11 \n\t"
        "wmoveq wr6, wr14 \n\t"
        "walignr2ne wr4, wr10, wr11 \n\t"
        /* wr10/wr12 are reused for the current destination contents */
        "wldrd wr10, [%[block]] \n\t"
        "walignr2ne wr6, wr13, wr14 \n\t"
        "wldrd wr12, [r5] \n\t"
        WAVG2B" wr0, wr0, wr4 \n\t"
        WAVG2B" wr2, wr2, wr6 \n\t"
        WAVG2B" wr0, wr0, wr10 \n\t"
        WAVG2B" wr2, wr2, wr12 \n\t"
        "wstrd wr0, [%[block]] \n\t"
        "subs %[h], %[h], #2 \n\t"
        "wstrd wr2, [r5] \n\t"
        "add %[block], %[block], %[line_size] \n\t"
        "add r5, r5, %[line_size] \n\t"
        "pld [%[block]] \n\t"
        "pld [%[block], #32] \n\t"
        "pld [r5] \n\t"
        "pld [r5, #32] \n\t"
        "bne 1b \n\t"
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
        :
        /* "cc": cmp and subs rewrite the condition flags */
        : "cc", "r4", "r5", "r12", "memory");
}
/**
 * Horizontal half-pel avg, 16 bytes wide: the WAVG2B average of source bytes
 * at x and x+1 is averaged (again with WAVG2B) into the existing destination.
 * Two rows are produced per loop pass.
 *
 * @param block     destination rows, read and written as two 64-bit words
 *                  per row (presumably must be 8-byte aligned - confirm)
 * @param pixels    source rows, any byte alignment
 * @param line_size distance in bytes between consecutive rows
 * @param h         row count; must be a positive even number because the
 *                  loop subtracts 2 and exits on zero
 *
 * WAVG2B and SET_RND are macros supplied by the including file.
 */
void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
    int stride = line_size;
    /* NOTE(review): wr15 is overwritten by a load inside the loop and never
     * used as a rounder here, so this call looks vestigial - confirm against
     * the SET_RND definition before removing. */
    SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
    __asm__ __volatile__(
        "pld [%[pixels]] \n\t"
        "pld [%[pixels], #32] \n\t"
        "pld [%[block]] \n\t"
        "pld [%[block], #32] \n\t"
        /* wcgr1 = source misalignment (column x), wcgr2 = that + 1 (x+1) */
        "and r12, %[pixels], #7 \n\t"
        "bic %[pixels], %[pixels], #7 \n\t"
        "tmcr wcgr1, r12 \n\t"
        "add r12, r12, #1 \n\t"
        "add r4, %[pixels], %[line_size]\n\t"
        "tmcr wcgr2, r12 \n\t"
        "add r5, %[block], %[line_size] \n\t"
        "mov %[line_size], %[line_size], lsl #1 \n\t"
        "pld [r5] \n\t"
        "pld [r5, #32] \n\t"
        "1: \n\t"
        "wldrd wr10, [%[pixels]] \n\t"
        /* offset+1 == 8: the x+1 bytes are exactly the following words */
        "cmp r12, #8 \n\t"
        "wldrd wr11, [%[pixels], #8] \n\t"
        "wldrd wr12, [%[pixels], #16] \n\t"
        "add %[pixels], %[pixels], %[line_size] \n\t"
        "wldrd wr13, [r4] \n\t"
        "pld [%[pixels]] \n\t"
        "wldrd wr14, [r4, #8] \n\t"
        "pld [%[pixels], #32] \n\t"
        "wldrd wr15, [r4, #16] \n\t"
        "add r4, r4, %[line_size] \n\t"
        /* wr0..wr3: bytes at x for both rows */
        "walignr1 wr0, wr10, wr11 \n\t"
        "pld [r4] \n\t"
        "pld [r4, #32] \n\t"
        "walignr1 wr1, wr11, wr12 \n\t"
        "walignr1 wr2, wr13, wr14 \n\t"
        "walignr1 wr3, wr14, wr15 \n\t"
        /* wr4..wr7: bytes at x+1 for both rows */
        "wmoveq wr4, wr11 \n\t"
        "wmoveq wr5, wr12 \n\t"
        "wmoveq wr6, wr14 \n\t"
        "wmoveq wr7, wr15 \n\t"
        "walignr2ne wr4, wr10, wr11 \n\t"
        "walignr2ne wr5, wr11, wr12 \n\t"
        "walignr2ne wr6, wr13, wr14 \n\t"
        "walignr2ne wr7, wr14, wr15 \n\t"
        /* wr10..wr13 are reused for the current destination contents */
        "wldrd wr10, [%[block]] \n\t"
        WAVG2B" wr0, wr0, wr4 \n\t"
        "wldrd wr11, [%[block], #8] \n\t"
        WAVG2B" wr1, wr1, wr5 \n\t"
        "wldrd wr12, [r5] \n\t"
        WAVG2B" wr2, wr2, wr6 \n\t"
        "wldrd wr13, [r5, #8] \n\t"
        WAVG2B" wr3, wr3, wr7 \n\t"
        WAVG2B" wr0, wr0, wr10 \n\t"
        WAVG2B" wr1, wr1, wr11 \n\t"
        WAVG2B" wr2, wr2, wr12 \n\t"
        WAVG2B" wr3, wr3, wr13 \n\t"
        "wstrd wr0, [%[block]] \n\t"
        "subs %[h], %[h], #2 \n\t"
        "wstrd wr1, [%[block], #8] \n\t"
        "add %[block], %[block], %[line_size] \n\t"
        "wstrd wr2, [r5] \n\t"
        "pld [%[block]] \n\t"
        "wstrd wr3, [r5, #8] \n\t"
        "add r5, r5, %[line_size] \n\t"
        "pld [%[block], #32] \n\t"
        "pld [r5] \n\t"
        "pld [r5, #32] \n\t"
        "bne 1b \n\t"
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
        :
        /* "cc": cmp and subs rewrite the condition flags */
        : "cc", "r4", "r5", "r12", "memory");
}
/*
 * Vertical half-pel avg, 8 bytes wide: each output row is the WAVG2B average
 * of two consecutive source rows, averaged again (WAVG2B) with the existing
 * destination row. The loop produces two rows per pass and decrements h by 2,
 * exiting on zero, so h must be a positive even number; h+1 source rows are
 * read in total. WAVG2B is a macro supplied by the including file.
 */
void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
int stride = line_size;
// wr0 holds the aligned previous source row
// wr4 holds the aligned current source row
__asm__ __volatile__(
"pld [%[pixels]] \n\t"
"pld [%[pixels], #32] \n\t"
/* r12 = byte misalignment of the source, wcgr1 = extract offset */
"and r12, %[pixels], #7 \n\t"
"tmcr wcgr1, r12 \n\t"
"bic %[pixels], %[pixels], #7 \n\t"
/* prime wr0 with source row 0 */
"wldrd wr10, [%[pixels]] \n\t"
"wldrd wr11, [%[pixels], #8] \n\t"
"pld [%[block]] \n\t"
"add %[pixels], %[pixels], %[line_size] \n\t"
"walignr1 wr0, wr10, wr11 \n\t"
"pld [%[pixels]] \n\t"
"pld [%[pixels], #32] \n\t"
"1: \n\t"
/* first output row of the pass: new source row goes to wr4 */
"wldrd wr10, [%[pixels]] \n\t"
"wldrd wr11, [%[pixels], #8] \n\t"
"add %[pixels], %[pixels], %[line_size] \n\t"
"pld [%[pixels]] \n\t"
"pld [%[pixels], #32] \n\t"
"walignr1 wr4, wr10, wr11 \n\t"
/* wr10 is reused for the current destination contents */
"wldrd wr10, [%[block]] \n\t"
WAVG2B" wr8, wr0, wr4 \n\t"
WAVG2B" wr8, wr8, wr10 \n\t"
"wstrd wr8, [%[block]] \n\t"
"add %[block], %[block], %[line_size] \n\t"
/* second output row of the pass: new source row goes to wr0 */
"wldrd wr10, [%[pixels]] \n\t"
"wldrd wr11, [%[pixels], #8] \n\t"
"pld [%[block]] \n\t"
"add %[pixels], %[pixels], %[line_size] \n\t"
"pld [%[pixels]] \n\t"
"pld [%[pixels], #32] \n\t"
"walignr1 wr0, wr10, wr11 \n\t"
"wldrd wr10, [%[block]] \n\t"
WAVG2B" wr8, wr0, wr4 \n\t"
WAVG2B" wr8, wr8, wr10 \n\t"
"wstrd wr8, [%[block]] \n\t"
"add %[block], %[block], %[line_size] \n\t"
"subs %[h], %[h], #2 \n\t"
"pld [%[block]] \n\t"
"bne 1b \n\t"
: [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
:
: "cc", "memory", "r12");
}
/**
 * Vertical half-pel put, 16 bytes wide: each output row is the WAVG2B
 * average of two consecutive source rows. Two rows are produced per loop
 * pass; h+1 source rows are read in total.
 *
 * @param block     destination rows, written as two 64-bit words per row
 *                  (presumably must be 8-byte aligned - confirm)
 * @param pixels    source rows, any byte alignment
 * @param line_size distance in bytes between consecutive rows
 * @param h         row count; must be a positive even number because the
 *                  loop subtracts 2 and exits on zero
 *
 * WAVG2B is a macro supplied by the including file.
 */
void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
    int stride = line_size;
    // [wr0 wr1] hold the aligned previous source row
    // [wr4 wr5] hold the aligned current source row
    __asm__ __volatile__(
        "pld [%[pixels]] \n\t"
        "pld [%[pixels], #32] \n\t"
        /* r12 = byte misalignment of the source, wcgr1 = extract offset */
        "and r12, %[pixels], #7 \n\t"
        "tmcr wcgr1, r12 \n\t"
        "bic %[pixels], %[pixels], #7 \n\t"
        /* prime wr0/wr1 with source row 0 */
        "wldrd wr10, [%[pixels]] \n\t"
        "wldrd wr11, [%[pixels], #8] \n\t"
        "wldrd wr12, [%[pixels], #16] \n\t"
        "add %[pixels], %[pixels], %[line_size] \n\t"
        "pld [%[pixels]] \n\t"
        "pld [%[pixels], #32] \n\t"
        "walignr1 wr0, wr10, wr11 \n\t"
        "walignr1 wr1, wr11, wr12 \n\t"
        "1: \n\t"
        /* first output row of the pass: new source row goes to wr4/wr5 */
        "wldrd wr10, [%[pixels]] \n\t"
        "wldrd wr11, [%[pixels], #8] \n\t"
        "wldrd wr12, [%[pixels], #16] \n\t"
        "add %[pixels], %[pixels], %[line_size] \n\t"
        "pld [%[pixels]] \n\t"
        "pld [%[pixels], #32] \n\t"
        "walignr1 wr4, wr10, wr11 \n\t"
        "walignr1 wr5, wr11, wr12 \n\t"
        WAVG2B" wr8, wr0, wr4 \n\t"
        WAVG2B" wr9, wr1, wr5 \n\t"
        "wstrd wr8, [%[block]] \n\t"
        "wstrd wr9, [%[block], #8] \n\t"
        "add %[block], %[block], %[line_size] \n\t"
        /* second output row of the pass: new source row goes to wr0/wr1 */
        "wldrd wr10, [%[pixels]] \n\t"
        "wldrd wr11, [%[pixels], #8] \n\t"
        "wldrd wr12, [%[pixels], #16] \n\t"
        "add %[pixels], %[pixels], %[line_size] \n\t"
        "pld [%[pixels]] \n\t"
        "pld [%[pixels], #32] \n\t"
        "walignr1 wr0, wr10, wr11 \n\t"
        "walignr1 wr1, wr11, wr12 \n\t"
        WAVG2B" wr8, wr0, wr4 \n\t"
        WAVG2B" wr9, wr1, wr5 \n\t"
        "wstrd wr8, [%[block]] \n\t"
        "wstrd wr9, [%[block], #8] \n\t"
        "add %[block], %[block], %[line_size] \n\t"
        "subs %[h], %[h], #2 \n\t"
        "bne 1b \n\t"
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
        :
        /* "cc": subs rewrites the condition flags. r4/r5 are not touched by
         * this asm, so they are no longer listed as clobbered. */
        : "cc", "r12", "memory");
}
/**
 * Vertical half-pel avg, 16 bytes wide: each output row is the WAVG2B
 * average of two consecutive source rows, averaged again (WAVG2B) with the
 * existing destination row. Two rows are produced per loop pass; h+1 source
 * rows are read in total.
 *
 * @param block     destination rows, read and written as two 64-bit words
 *                  per row (presumably must be 8-byte aligned - confirm)
 * @param pixels    source rows, any byte alignment
 * @param line_size distance in bytes between consecutive rows
 * @param h         row count; must be a positive even number because the
 *                  loop subtracts 2 and exits on zero
 *
 * WAVG2B is a macro supplied by the including file.
 */
void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
    int stride = line_size;
    // [wr0 wr1] hold the aligned previous source row
    // [wr4 wr5] hold the aligned current source row
    __asm__ __volatile__(
        "pld [%[pixels]] \n\t"
        "pld [%[pixels], #32] \n\t"
        /* r12 = byte misalignment of the source, wcgr1 = extract offset */
        "and r12, %[pixels], #7 \n\t"
        "tmcr wcgr1, r12 \n\t"
        "bic %[pixels], %[pixels], #7 \n\t"
        /* prime wr0/wr1 with source row 0 */
        "wldrd wr10, [%[pixels]] \n\t"
        "wldrd wr11, [%[pixels], #8] \n\t"
        "pld [%[block]] \n\t"
        "wldrd wr12, [%[pixels], #16] \n\t"
        "add %[pixels], %[pixels], %[line_size] \n\t"
        "pld [%[pixels]] \n\t"
        "pld [%[pixels], #32] \n\t"
        "walignr1 wr0, wr10, wr11 \n\t"
        "walignr1 wr1, wr11, wr12 \n\t"
        "1: \n\t"
        /* first output row of the pass: new source row goes to wr4/wr5 */
        "wldrd wr10, [%[pixels]] \n\t"
        "wldrd wr11, [%[pixels], #8] \n\t"
        "wldrd wr12, [%[pixels], #16] \n\t"
        "add %[pixels], %[pixels], %[line_size] \n\t"
        "pld [%[pixels]] \n\t"
        "pld [%[pixels], #32] \n\t"
        "walignr1 wr4, wr10, wr11 \n\t"
        "walignr1 wr5, wr11, wr12 \n\t"
        /* wr10/wr11 are reused for the current destination contents */
        "wldrd wr10, [%[block]] \n\t"
        "wldrd wr11, [%[block], #8] \n\t"
        WAVG2B" wr8, wr0, wr4 \n\t"
        WAVG2B" wr9, wr1, wr5 \n\t"
        WAVG2B" wr8, wr8, wr10 \n\t"
        WAVG2B" wr9, wr9, wr11 \n\t"
        "wstrd wr8, [%[block]] \n\t"
        "wstrd wr9, [%[block], #8] \n\t"
        "add %[block], %[block], %[line_size] \n\t"
        /* second output row of the pass: new source row goes to wr0/wr1 */
        "wldrd wr10, [%[pixels]] \n\t"
        "wldrd wr11, [%[pixels], #8] \n\t"
        "pld [%[block]] \n\t"
        "wldrd wr12, [%[pixels], #16] \n\t"
        "add %[pixels], %[pixels], %[line_size] \n\t"
        "pld [%[pixels]] \n\t"
        "pld [%[pixels], #32] \n\t"
        "walignr1 wr0, wr10, wr11 \n\t"
        "walignr1 wr1, wr11, wr12 \n\t"
        "wldrd wr10, [%[block]] \n\t"
        "wldrd wr11, [%[block], #8] \n\t"
        WAVG2B" wr8, wr0, wr4 \n\t"
        WAVG2B" wr9, wr1, wr5 \n\t"
        WAVG2B" wr8, wr8, wr10 \n\t"
        WAVG2B" wr9, wr9, wr11 \n\t"
        "wstrd wr8, [%[block]] \n\t"
        "wstrd wr9, [%[block], #8] \n\t"
        "add %[block], %[block], %[line_size] \n\t"
        "subs %[h], %[h], #2 \n\t"
        "pld [%[block]] \n\t"
        "bne 1b \n\t"
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
        :
        /* "cc": subs rewrites the condition flags. r4/r5 are not touched by
         * this asm, so they are no longer listed as clobbered. */
        : "cc", "r12", "memory");
}
/**
 * Diagonal (x+y) half-pel put, 8 bytes wide. For every source row the bytes
 * at x and x+1 are widened to 16 bit and added; each output row is
 * (sum of previous row + sum of current row + wr15) >> 2, packed back to
 * bytes with unsigned saturation. Two rows are produced per loop pass and
 * h+1 source rows are read.
 *
 * @param block     destination rows, written as one 64-bit word per row
 *                  (presumably must be 8-byte aligned - confirm)
 * @param pixels    source rows, any byte alignment
 * @param line_size distance in bytes between consecutive rows
 * @param h         row count; must be a positive even number because the
 *                  loop subtracts 2 and exits on zero
 *
 * SET_RND is a macro supplied by the including file; the asm below relies on
 * it having left the rounding constant in wr15.
 */
void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
    // [wr0 wr1] 16-bit x/x+1 sums of the previous line
    // [wr4 wr5] 16-bit x/x+1 sums of the current line
    SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
    __asm__ __volatile__(
        "pld [%[pixels]] \n\t"
        "mov r12, #2 \n\t"
        "pld [%[pixels], #32] \n\t"
        "tmcr wcgr0, r12 \n\t" /* for shift value */
        /* wcgr1 = source misalignment (column x) */
        "and r12, %[pixels], #7 \n\t"
        "bic %[pixels], %[pixels], #7 \n\t"
        "tmcr wcgr1, r12 \n\t"
        // prime [wr0 wr1] with the sums of source row 0
        "wldrd wr12, [%[pixels]] \n\t"
        "add r12, r12, #1 \n\t"
        "wldrd wr13, [%[pixels], #8] \n\t"
        /* wcgr2 = misalignment + 1 (column x+1) */
        "tmcr wcgr2, r12 \n\t"
        "add %[pixels], %[pixels], %[line_size] \n\t"
        /* offset+1 == 8: the x+1 bytes are exactly the second loaded word */
        "cmp r12, #8 \n\t"
        "pld [%[pixels]] \n\t"
        "pld [%[pixels], #32] \n\t"
        "walignr1 wr2, wr12, wr13 \n\t"
        "wmoveq wr10, wr13 \n\t"
        "walignr2ne wr10, wr12, wr13 \n\t"
        "wunpckelub wr0, wr2 \n\t"
        "wunpckehub wr1, wr2 \n\t"
        "wunpckelub wr8, wr10 \n\t"
        "wunpckehub wr9, wr10 \n\t"
        "waddhus wr0, wr0, wr8 \n\t"
        "waddhus wr1, wr1, wr9 \n\t"
        "1: \n\t"
        // first output row of the pass: new row sums go to [wr4 wr5]
        "wldrd wr12, [%[pixels]] \n\t"
        /* re-establish the flags; the subs at the loop end overwrote them */
        "cmp r12, #8 \n\t"
        "wldrd wr13, [%[pixels], #8] \n\t"
        "add %[pixels], %[pixels], %[line_size] \n\t"
        "walignr1 wr6, wr12, wr13 \n\t"
        "pld [%[pixels]] \n\t"
        "pld [%[pixels], #32] \n\t"
        "wmoveq wr10, wr13 \n\t"
        "walignr2ne wr10, wr12, wr13 \n\t"
        "wunpckelub wr4, wr6 \n\t"
        "wunpckehub wr5, wr6 \n\t"
        "wunpckelub wr8, wr10 \n\t"
        "wunpckehub wr9, wr10 \n\t"
        "waddhus wr4, wr4, wr8 \n\t"
        "waddhus wr5, wr5, wr9 \n\t"
        "waddhus wr8, wr0, wr4 \n\t"
        "waddhus wr9, wr1, wr5 \n\t"
        "waddhus wr8, wr8, wr15 \n\t"
        "waddhus wr9, wr9, wr15 \n\t"
        "wsrlhg wr8, wr8, wcgr0 \n\t"
        "wsrlhg wr9, wr9, wcgr0 \n\t"
        "wpackhus wr8, wr8, wr9 \n\t"
        "wstrd wr8, [%[block]] \n\t"
        "add %[block], %[block], %[line_size] \n\t"
        // second output row of the pass: new row sums go to [wr0 wr1]
        "wldrd wr12, [%[pixels]] \n\t"
        "wldrd wr13, [%[pixels], #8] \n\t"
        "add %[pixels], %[pixels], %[line_size] \n\t"
        "walignr1 wr2, wr12, wr13 \n\t"
        "pld [%[pixels]] \n\t"
        "pld [%[pixels], #32] \n\t"
        /* flags from the cmp at the loop top are still valid here */
        "wmoveq wr10, wr13 \n\t"
        "walignr2ne wr10, wr12, wr13 \n\t"
        "wunpckelub wr0, wr2 \n\t"
        "wunpckehub wr1, wr2 \n\t"
        "wunpckelub wr8, wr10 \n\t"
        "wunpckehub wr9, wr10 \n\t"
        "waddhus wr0, wr0, wr8 \n\t"
        "waddhus wr1, wr1, wr9 \n\t"
        "waddhus wr8, wr0, wr4 \n\t"
        "waddhus wr9, wr1, wr5 \n\t"
        "waddhus wr8, wr8, wr15 \n\t"
        "waddhus wr9, wr9, wr15 \n\t"
        "wsrlhg wr8, wr8, wcgr0 \n\t"
        "wsrlhg wr9, wr9, wcgr0 \n\t"
        "wpackhus wr8, wr8, wr9 \n\t"
        "subs %[h], %[h], #2 \n\t"
        "wstrd wr8, [%[block]] \n\t"
        "add %[block], %[block], %[line_size] \n\t"
        "bne 1b \n\t"
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
        : [line_size]"r"(line_size)
        /* "cc": cmp and subs rewrite the condition flags */
        : "cc", "r12", "memory");
}
/**
 * Diagonal (x+y) half-pel put, 16 bytes wide. For every source row the bytes
 * at x and x+1 are widened to 16 bit and added; each output row is
 * (sum of previous row + sum of current row + wr15) >> 2, packed back to
 * bytes with unsigned saturation. Two rows are produced per loop pass and
 * h+1 source rows are read.
 *
 * @param block     destination rows, written as two 64-bit words per row
 *                  (presumably must be 8-byte aligned - confirm)
 * @param pixels    source rows, any byte alignment
 * @param line_size distance in bytes between consecutive rows
 * @param h         row count; must be a positive even number because the
 *                  loop subtracts 2 and exits on zero
 *
 * SET_RND is a macro supplied by the including file; the asm below relies on
 * it having left the rounding constant in wr15.
 */
void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
    // [wr0 wr1 wr2 wr3] 16-bit x/x+1 sums of the previous line
    // [wr4 wr5 wr6 wr7] 16-bit x/x+1 sums of the current line
    SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
    __asm__ __volatile__(
        "pld [%[pixels]] \n\t"
        "mov r12, #2 \n\t"
        "pld [%[pixels], #32] \n\t"
        "tmcr wcgr0, r12 \n\t" /* for shift value */
        /* alignment: wcgr1 = offset of column x, wcgr2 = offset of x+1 */
        "and r12, %[pixels], #7 \n\t"
        "bic %[pixels], %[pixels], #7 \n\t"
        "tmcr wcgr1, r12 \n\t"
        "add r12, r12, #1 \n\t"
        "tmcr wcgr2, r12 \n\t"
        // prime [wr0 wr1 wr2 wr3] with the sums of source row 0
        "wldrd wr12, [%[pixels]] \n\t"
        /* offset+1 == 8: the x+1 bytes are exactly the following words */
        "cmp r12, #8 \n\t"
        "wldrd wr13, [%[pixels], #8] \n\t"
        "wldrd wr14, [%[pixels], #16] \n\t"
        "add %[pixels], %[pixels], %[line_size] \n\t"
        "pld [%[pixels]] \n\t"
        "walignr1 wr2, wr12, wr13 \n\t"
        "pld [%[pixels], #32] \n\t"
        "walignr1 wr3, wr13, wr14 \n\t"
        "wmoveq wr10, wr13 \n\t"
        "wmoveq wr11, wr14 \n\t"
        "walignr2ne wr10, wr12, wr13 \n\t"
        "walignr2ne wr11, wr13, wr14 \n\t"
        "wunpckelub wr0, wr2 \n\t"
        "wunpckehub wr1, wr2 \n\t"
        "wunpckelub wr2, wr3 \n\t"
        "wunpckehub wr3, wr3 \n\t"
        "wunpckelub wr8, wr10 \n\t"
        "wunpckehub wr9, wr10 \n\t"
        "wunpckelub wr10, wr11 \n\t"
        "wunpckehub wr11, wr11 \n\t"
        "waddhus wr0, wr0, wr8 \n\t"
        "waddhus wr1, wr1, wr9 \n\t"
        "waddhus wr2, wr2, wr10 \n\t"
        "waddhus wr3, wr3, wr11 \n\t"
        "1: \n\t"
        // first output row of the pass: new row sums go to [wr4 wr5 wr6 wr7]
        "wldrd wr12, [%[pixels]] \n\t"
        /* re-establish the flags; the subs at the loop end overwrote them */
        "cmp r12, #8 \n\t"
        "wldrd wr13, [%[pixels], #8] \n\t"
        "wldrd wr14, [%[pixels], #16] \n\t"
        "add %[pixels], %[pixels], %[line_size] \n\t"
        "walignr1 wr6, wr12, wr13 \n\t"
        "pld [%[pixels]] \n\t"
        "pld [%[pixels], #32] \n\t"
        "walignr1 wr7, wr13, wr14 \n\t"
        "wmoveq wr10, wr13 \n\t"
        "wmoveq wr11, wr14 \n\t"
        "walignr2ne wr10, wr12, wr13 \n\t"
        "walignr2ne wr11, wr13, wr14 \n\t"
        "wunpckelub wr4, wr6 \n\t"
        "wunpckehub wr5, wr6 \n\t"
        "wunpckelub wr6, wr7 \n\t"
        "wunpckehub wr7, wr7 \n\t"
        "wunpckelub wr8, wr10 \n\t"
        "wunpckehub wr9, wr10 \n\t"
        "wunpckelub wr10, wr11 \n\t"
        "wunpckehub wr11, wr11 \n\t"
        "waddhus wr4, wr4, wr8 \n\t"
        "waddhus wr5, wr5, wr9 \n\t"
        "waddhus wr6, wr6, wr10 \n\t"
        "waddhus wr7, wr7, wr11 \n\t"
        "waddhus wr8, wr0, wr4 \n\t"
        "waddhus wr9, wr1, wr5 \n\t"
        "waddhus wr10, wr2, wr6 \n\t"
        "waddhus wr11, wr3, wr7 \n\t"
        "waddhus wr8, wr8, wr15 \n\t"
        "waddhus wr9, wr9, wr15 \n\t"
        "waddhus wr10, wr10, wr15 \n\t"
        "waddhus wr11, wr11, wr15 \n\t"
        "wsrlhg wr8, wr8, wcgr0 \n\t"
        "wsrlhg wr9, wr9, wcgr0 \n\t"
        "wsrlhg wr10, wr10, wcgr0 \n\t"
        "wsrlhg wr11, wr11, wcgr0 \n\t"
        "wpackhus wr8, wr8, wr9 \n\t"
        "wpackhus wr9, wr10, wr11 \n\t"
        "wstrd wr8, [%[block]] \n\t"
        "wstrd wr9, [%[block], #8] \n\t"
        "add %[block], %[block], %[line_size] \n\t"
        // second output row of the pass: new row sums go to [wr0 wr1 wr2 wr3]
        "wldrd wr12, [%[pixels]] \n\t"
        "wldrd wr13, [%[pixels], #8] \n\t"
        "wldrd wr14, [%[pixels], #16] \n\t"
        "add %[pixels], %[pixels], %[line_size] \n\t"
        "walignr1 wr2, wr12, wr13 \n\t"
        "pld [%[pixels]] \n\t"
        "pld [%[pixels], #32] \n\t"
        "walignr1 wr3, wr13, wr14 \n\t"
        /* flags from the cmp at the loop top are still valid here */
        "wmoveq wr10, wr13 \n\t"
        "wmoveq wr11, wr14 \n\t"
        "walignr2ne wr10, wr12, wr13 \n\t"
        "walignr2ne wr11, wr13, wr14 \n\t"
        "wunpckelub wr0, wr2 \n\t"
        "wunpckehub wr1, wr2 \n\t"
        "wunpckelub wr2, wr3 \n\t"
        "wunpckehub wr3, wr3 \n\t"
        "wunpckelub wr8, wr10 \n\t"
        "wunpckehub wr9, wr10 \n\t"
        "wunpckelub wr10, wr11 \n\t"
        "wunpckehub wr11, wr11 \n\t"
        "waddhus wr0, wr0, wr8 \n\t"
        "waddhus wr1, wr1, wr9 \n\t"
        "waddhus wr2, wr2, wr10 \n\t"
        "waddhus wr3, wr3, wr11 \n\t"
        "waddhus wr8, wr0, wr4 \n\t"
        "waddhus wr9, wr1, wr5 \n\t"
        "waddhus wr10, wr2, wr6 \n\t"
        "waddhus wr11, wr3, wr7 \n\t"
        "waddhus wr8, wr8, wr15 \n\t"
        "waddhus wr9, wr9, wr15 \n\t"
        "waddhus wr10, wr10, wr15 \n\t"
        "waddhus wr11, wr11, wr15 \n\t"
        "wsrlhg wr8, wr8, wcgr0 \n\t"
        "wsrlhg wr9, wr9, wcgr0 \n\t"
        "wsrlhg wr10, wr10, wcgr0 \n\t"
        "wsrlhg wr11, wr11, wcgr0 \n\t"
        "wpackhus wr8, wr8, wr9 \n\t"
        "wpackhus wr9, wr10, wr11 \n\t"
        "wstrd wr8, [%[block]] \n\t"
        "wstrd wr9, [%[block], #8] \n\t"
        "add %[block], %[block], %[line_size] \n\t"
        "subs %[h], %[h], #2 \n\t"
        "bne 1b \n\t"
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
        : [line_size]"r"(line_size)
        /* "cc": cmp and subs rewrite the condition flags */
        : "cc", "r12", "memory");
}
/**
 * Diagonal (x+y) half-pel avg, 8 bytes wide. For every source row the bytes
 * at x and x+1 are widened to 16 bit and added; the interpolated row is
 * (sum of previous row + sum of current row + wr15) >> 2, packed back to
 * bytes with unsigned saturation and then averaged (WAVG2B) with the
 * existing destination row. Two rows are produced per loop pass and h+1
 * source rows are read.
 *
 * @param block     destination rows, read and written as one 64-bit word per
 *                  row (presumably must be 8-byte aligned - confirm)
 * @param pixels    source rows, any byte alignment
 * @param line_size distance in bytes between consecutive rows
 * @param h         row count; must be a positive even number because the
 *                  loop subtracts 2 and exits on zero
 *
 * SET_RND and WAVG2B are macros supplied by the including file; the asm
 * below relies on SET_RND having left the rounding constant in wr15.
 */
void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
    // [wr0 wr1] 16-bit x/x+1 sums of the previous line
    // [wr4 wr5] 16-bit x/x+1 sums of the current line
    SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
    __asm__ __volatile__(
        "pld [%[block]] \n\t"
        "pld [%[block], #32] \n\t"
        "pld [%[pixels]] \n\t"
        "mov r12, #2 \n\t"
        "pld [%[pixels], #32] \n\t"
        "tmcr wcgr0, r12 \n\t" /* for shift value */
        /* wcgr1 = source misalignment (column x) */
        "and r12, %[pixels], #7 \n\t"
        "bic %[pixels], %[pixels], #7 \n\t"
        "tmcr wcgr1, r12 \n\t"
        // prime [wr0 wr1] with the sums of source row 0
        "wldrd wr12, [%[pixels]] \n\t"
        "add r12, r12, #1 \n\t"
        "wldrd wr13, [%[pixels], #8] \n\t"
        /* wcgr2 = misalignment + 1 (column x+1) */
        "tmcr wcgr2, r12 \n\t"
        "add %[pixels], %[pixels], %[line_size] \n\t"
        /* offset+1 == 8: the x+1 bytes are exactly the second loaded word */
        "cmp r12, #8 \n\t"
        "pld [%[pixels]] \n\t"
        "pld [%[pixels], #32] \n\t"
        "walignr1 wr2, wr12, wr13 \n\t"
        "wmoveq wr10, wr13 \n\t"
        "walignr2ne wr10, wr12, wr13 \n\t"
        "wunpckelub wr0, wr2 \n\t"
        "wunpckehub wr1, wr2 \n\t"
        "wunpckelub wr8, wr10 \n\t"
        "wunpckehub wr9, wr10 \n\t"
        "waddhus wr0, wr0, wr8 \n\t"
        "waddhus wr1, wr1, wr9 \n\t"
        "1: \n\t"
        // first output row of the pass: new row sums go to [wr4 wr5]
        "wldrd wr12, [%[pixels]] \n\t"
        /* re-establish the flags; the subs at the loop end overwrote them */
        "cmp r12, #8 \n\t"
        "wldrd wr13, [%[pixels], #8] \n\t"
        "add %[pixels], %[pixels], %[line_size] \n\t"
        "walignr1 wr6, wr12, wr13 \n\t"
        "pld [%[pixels]] \n\t"
        "pld [%[pixels], #32] \n\t"
        "wmoveq wr10, wr13 \n\t"
        "walignr2ne wr10, wr12, wr13 \n\t"
        "wunpckelub wr4, wr6 \n\t"
        "wunpckehub wr5, wr6 \n\t"
        "wunpckelub wr8, wr10 \n\t"
        "wunpckehub wr9, wr10 \n\t"
        "waddhus wr4, wr4, wr8 \n\t"
        "waddhus wr5, wr5, wr9 \n\t"
        "waddhus wr8, wr0, wr4 \n\t"
        "waddhus wr9, wr1, wr5 \n\t"
        "waddhus wr8, wr8, wr15 \n\t"
        "waddhus wr9, wr9, wr15 \n\t"
        /* wr12 is reused for the current destination contents */
        "wldrd wr12, [%[block]] \n\t"
        "wsrlhg wr8, wr8, wcgr0 \n\t"
        "wsrlhg wr9, wr9, wcgr0 \n\t"
        "wpackhus wr8, wr8, wr9 \n\t"
        WAVG2B" wr8, wr8, wr12 \n\t"
        "wstrd wr8, [%[block]] \n\t"
        "add %[block], %[block], %[line_size] \n\t"
        // second output row of the pass: new row sums go to [wr0 wr1]
        "wldrd wr12, [%[pixels]] \n\t"
        "pld [%[block]] \n\t"
        "pld [%[block], #32] \n\t"
        "wldrd wr13, [%[pixels], #8] \n\t"
        "add %[pixels], %[pixels], %[line_size] \n\t"
        "walignr1 wr2, wr12, wr13 \n\t"
        "pld [%[pixels]] \n\t"
        "pld [%[pixels], #32] \n\t"
        /* flags from the cmp at the loop top are still valid here */
        "wmoveq wr10, wr13 \n\t"
        "walignr2ne wr10, wr12, wr13 \n\t"
        "wunpckelub wr0, wr2 \n\t"
        "wunpckehub wr1, wr2 \n\t"
        "wunpckelub wr8, wr10 \n\t"
        "wunpckehub wr9, wr10 \n\t"
        "waddhus wr0, wr0, wr8 \n\t"
        "waddhus wr1, wr1, wr9 \n\t"
        "waddhus wr8, wr0, wr4 \n\t"
        "waddhus wr9, wr1, wr5 \n\t"
        "waddhus wr8, wr8, wr15 \n\t"
        "waddhus wr9, wr9, wr15 \n\t"
        "wldrd wr12, [%[block]] \n\t"
        "wsrlhg wr8, wr8, wcgr0 \n\t"
        "wsrlhg wr9, wr9, wcgr0 \n\t"
        "wpackhus wr8, wr8, wr9 \n\t"
        "subs %[h], %[h], #2 \n\t"
        WAVG2B" wr8, wr8, wr12 \n\t"
        "wstrd wr8, [%[block]] \n\t"
        "add %[block], %[block], %[line_size] \n\t"
        "pld [%[block]] \n\t"
        "pld [%[block], #32] \n\t"
        "bne 1b \n\t"
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
        : [line_size]"r"(line_size)
        /* "cc": cmp and subs rewrite the condition flags */
        : "cc", "r12", "memory");
}
/**
 * Diagonal (x+y) half-pel avg, 16 bytes wide. For every source row the bytes
 * at x and x+1 are widened to 16 bit and added; the interpolated row is
 * (sum of previous row + sum of current row + wr15) >> 2, packed back to
 * bytes with unsigned saturation and then averaged (WAVG2B) with the
 * existing destination row. Two rows are produced per loop pass and h+1
 * source rows are read.
 *
 * @param block     destination rows, read and written as two 64-bit words
 *                  per row (presumably must be 8-byte aligned - confirm)
 * @param pixels    source rows, any byte alignment
 * @param line_size distance in bytes between consecutive rows
 * @param h         row count; must be a positive even number because the
 *                  loop subtracts 2 and exits on zero
 *
 * SET_RND and WAVG2B are macros supplied by the including file; the asm
 * below relies on SET_RND having left the rounding constant in wr15.
 */
void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
    // [wr0 wr1 wr2 wr3] 16-bit x/x+1 sums of the previous line
    // [wr4 wr5 wr6 wr7] 16-bit x/x+1 sums of the current line
    SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
    __asm__ __volatile__(
        "pld [%[block]] \n\t"
        "pld [%[block], #32] \n\t"
        "pld [%[pixels]] \n\t"
        "mov r12, #2 \n\t"
        "pld [%[pixels], #32] \n\t"
        "tmcr wcgr0, r12 \n\t" /* for shift value */
        /* alignment: wcgr1 = offset of column x, wcgr2 = offset of x+1 */
        "and r12, %[pixels], #7 \n\t"
        "bic %[pixels], %[pixels], #7 \n\t"
        "tmcr wcgr1, r12 \n\t"
        "add r12, r12, #1 \n\t"
        "tmcr wcgr2, r12 \n\t"
        // prime [wr0 wr1 wr2 wr3] with the sums of source row 0
        "wldrd wr12, [%[pixels]] \n\t"
        /* offset+1 == 8: the x+1 bytes are exactly the following words */
        "cmp r12, #8 \n\t"
        "wldrd wr13, [%[pixels], #8] \n\t"
        "wldrd wr14, [%[pixels], #16] \n\t"
        "add %[pixels], %[pixels], %[line_size] \n\t"
        "pld [%[pixels]] \n\t"
        "walignr1 wr2, wr12, wr13 \n\t"
        "pld [%[pixels], #32] \n\t"
        "walignr1 wr3, wr13, wr14 \n\t"
        "wmoveq wr10, wr13 \n\t"
        "wmoveq wr11, wr14 \n\t"
        "walignr2ne wr10, wr12, wr13 \n\t"
        "walignr2ne wr11, wr13, wr14 \n\t"
        "wunpckelub wr0, wr2 \n\t"
        "wunpckehub wr1, wr2 \n\t"
        "wunpckelub wr2, wr3 \n\t"
        "wunpckehub wr3, wr3 \n\t"
        "wunpckelub wr8, wr10 \n\t"
        "wunpckehub wr9, wr10 \n\t"
        "wunpckelub wr10, wr11 \n\t"
        "wunpckehub wr11, wr11 \n\t"
        "waddhus wr0, wr0, wr8 \n\t"
        "waddhus wr1, wr1, wr9 \n\t"
        "waddhus wr2, wr2, wr10 \n\t"
        "waddhus wr3, wr3, wr11 \n\t"
        "1: \n\t"
        // first output row of the pass: new row sums go to [wr4 wr5 wr6 wr7]
        "wldrd wr12, [%[pixels]] \n\t"
        /* re-establish the flags; the subs at the loop end overwrote them */
        "cmp r12, #8 \n\t"
        "wldrd wr13, [%[pixels], #8] \n\t"
        "wldrd wr14, [%[pixels], #16] \n\t"
        "add %[pixels], %[pixels], %[line_size] \n\t"
        "walignr1 wr6, wr12, wr13 \n\t"
        "pld [%[pixels]] \n\t"
        "pld [%[pixels], #32] \n\t"
        "walignr1 wr7, wr13, wr14 \n\t"
        "wmoveq wr10, wr13 \n\t"
        "wmoveq wr11, wr14 \n\t"
        "walignr2ne wr10, wr12, wr13 \n\t"
        "walignr2ne wr11, wr13, wr14 \n\t"
        "wunpckelub wr4, wr6 \n\t"
        "wunpckehub wr5, wr6 \n\t"
        "wunpckelub wr6, wr7 \n\t"
        "wunpckehub wr7, wr7 \n\t"
        "wunpckelub wr8, wr10 \n\t"
        "wunpckehub wr9, wr10 \n\t"
        "wunpckelub wr10, wr11 \n\t"
        "wunpckehub wr11, wr11 \n\t"
        "waddhus wr4, wr4, wr8 \n\t"
        "waddhus wr5, wr5, wr9 \n\t"
        "waddhus wr6, wr6, wr10 \n\t"
        "waddhus wr7, wr7, wr11 \n\t"
        "waddhus wr8, wr0, wr4 \n\t"
        "waddhus wr9, wr1, wr5 \n\t"
        "waddhus wr10, wr2, wr6 \n\t"
        "waddhus wr11, wr3, wr7 \n\t"
        "waddhus wr8, wr8, wr15 \n\t"
        "waddhus wr9, wr9, wr15 \n\t"
        "waddhus wr10, wr10, wr15 \n\t"
        "waddhus wr11, wr11, wr15 \n\t"
        "wsrlhg wr8, wr8, wcgr0 \n\t"
        "wsrlhg wr9, wr9, wcgr0 \n\t"
        /* wr12/wr13 are reused for the current destination contents */
        "wldrd wr12, [%[block]] \n\t"
        "wldrd wr13, [%[block], #8] \n\t"
        "wsrlhg wr10, wr10, wcgr0 \n\t"
        "wsrlhg wr11, wr11, wcgr0 \n\t"
        "wpackhus wr8, wr8, wr9 \n\t"
        "wpackhus wr9, wr10, wr11 \n\t"
        WAVG2B" wr8, wr8, wr12 \n\t"
        WAVG2B" wr9, wr9, wr13 \n\t"
        "wstrd wr8, [%[block]] \n\t"
        "wstrd wr9, [%[block], #8] \n\t"
        "add %[block], %[block], %[line_size] \n\t"
        // second output row of the pass: new row sums go to [wr0 wr1 wr2 wr3]
        "wldrd wr12, [%[pixels]] \n\t"
        "pld [%[block]] \n\t"
        "wldrd wr13, [%[pixels], #8] \n\t"
        "pld [%[block], #32] \n\t"
        "wldrd wr14, [%[pixels], #16] \n\t"
        "add %[pixels], %[pixels], %[line_size] \n\t"
        "walignr1 wr2, wr12, wr13 \n\t"
        "pld [%[pixels]] \n\t"
        "pld [%[pixels], #32] \n\t"
        "walignr1 wr3, wr13, wr14 \n\t"
        /* flags from the cmp at the loop top are still valid here */
        "wmoveq wr10, wr13 \n\t"
        "wmoveq wr11, wr14 \n\t"
        "walignr2ne wr10, wr12, wr13 \n\t"
        "walignr2ne wr11, wr13, wr14 \n\t"
        "wunpckelub wr0, wr2 \n\t"
        "wunpckehub wr1, wr2 \n\t"
        "wunpckelub wr2, wr3 \n\t"
        "wunpckehub wr3, wr3 \n\t"
        "wunpckelub wr8, wr10 \n\t"
        "wunpckehub wr9, wr10 \n\t"
        "wunpckelub wr10, wr11 \n\t"
        "wunpckehub wr11, wr11 \n\t"
        "waddhus wr0, wr0, wr8 \n\t"
        "waddhus wr1, wr1, wr9 \n\t"
        "waddhus wr2, wr2, wr10 \n\t"
        "waddhus wr3, wr3, wr11 \n\t"
        "waddhus wr8, wr0, wr4 \n\t"
        "waddhus wr9, wr1, wr5 \n\t"
        "waddhus wr10, wr2, wr6 \n\t"
        "waddhus wr11, wr3, wr7 \n\t"
        "waddhus wr8, wr8, wr15 \n\t"
        "waddhus wr9, wr9, wr15 \n\t"
        "waddhus wr10, wr10, wr15 \n\t"
        "waddhus wr11, wr11, wr15 \n\t"
        "wsrlhg wr8, wr8, wcgr0 \n\t"
        "wsrlhg wr9, wr9, wcgr0 \n\t"
        "wldrd wr12, [%[block]] \n\t"
        "wldrd wr13, [%[block], #8] \n\t"
        "wsrlhg wr10, wr10, wcgr0 \n\t"
        "wsrlhg wr11, wr11, wcgr0 \n\t"
        "wpackhus wr8, wr8, wr9 \n\t"
        "wpackhus wr9, wr10, wr11 \n\t"
        WAVG2B" wr8, wr8, wr12 \n\t"
        WAVG2B" wr9, wr9, wr13 \n\t"
        "wstrd wr8, [%[block]] \n\t"
        "wstrd wr9, [%[block], #8] \n\t"
        "add %[block], %[block], %[line_size] \n\t"
        "subs %[h], %[h], #2 \n\t"
        "pld [%[block]] \n\t"
        "pld [%[block], #32] \n\t"
        "bne 1b \n\t"
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
        : [line_size]"r"(line_size)
        /* "cc": cmp and subs rewrite the condition flags */
        : "cc", "r12", "memory");
}
......@@ -21,6 +21,13 @@
#include "../mpegvideo.h"
#include "../avcodec.h"
#ifdef HAVE_IWMMXT
extern void MPV_common_init_iwmmxt(MpegEncContext *s);
#endif
/**
 * ARMv4L-specific MpegEncContext setup hook.
 *
 * When the build defines HAVE_IWMMXT this forwards to
 * MPV_common_init_iwmmxt(); otherwise it does nothing.
 */
void MPV_common_init_armv4l(MpegEncContext *s)
{
#ifdef HAVE_IWMMXT
    MPV_common_init_iwmmxt(s);
#endif
}
#include "../dsputil.h"
#include "../mpegvideo.h"
#include "../avcodec.h"
/**
 * H.263 intra dequantisation using iwMMXt, processing eight 16-bit
 * coefficients per loop pass in place.
 *
 * For every coefficient c handled by the loop:
 *   c > 0  ->  c * qmul + qadd
 *   c < 0  ->  c * qmul - qadd
 *   c == 0 ->  0
 * with qmul = 2 * qscale. The DC coefficient block[0] is overwritten after
 * the loop with the separately computed value `level`.
 *
 * @param s      codec context; supplies h263_aic, ac_pred, the DC scales,
 *               block_last_index[] and inter_scantable.raster_end[]
 * @param block  coefficient block, dequantised in place
 * @param n      block index; n < 4 uses y_dc_scale, otherwise c_dc_scale
 * @param qscale quantiser scale
 */
static void dct_unquantize_h263_intra_iwmmxt(MpegEncContext *s,
                                             DCTELEM *block, int n, int qscale)
{
    int level, qmul, qadd;
    int nCoeffs;
    int i;
    DCTELEM *block_orig = block;

    assert(s->block_last_index[n]>=0);

    qmul = qscale << 1;

    if (!s->h263_aic) {
        if (n < 4)
            level = block[0] * s->y_dc_scale;
        else
            level = block[0] * s->c_dc_scale;
        qadd = (qscale - 1) | 1;
    }else{
        qadd = 0;
        level = block[0];
    }
    if(s->ac_pred)
        nCoeffs=63;
    else
        nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];

    /* Number of 8-coefficient groups; always >= 1 because nCoeffs >= 0, which
     * the subs/bne loop below depends on. */
    i = (nCoeffs + 8) / 8;

    __asm__ __volatile__ (
        /* wr6 = qmul and wr5 = qadd broadcast to all four 16-bit lanes */
        "tbcsth wr6, %[qmul] \n\t"
        "tbcsth wr5, %[qadd] \n\t"
        "wzero wr7 \n\t"
        "wzero wr4 \n\t"               /* wr4 = 0, comparison reference */
        "wsubh wr7, wr5, wr7 \n\t"     /* wr7 = qadd - 0 = qadd */
        "1: \n\t"
        "wldrd wr2, [%[block]] \n\t"
        "wldrd wr3, [%[block], #8] \n\t"
        "wmulsl wr0, wr6, wr2 \n\t"    /* low 16 bits of c * qmul */
        "wmulsl wr1, wr6, wr3 \n\t"
        /* per-lane sign mask: 0 > c ? -1 : 0.  The second compare must read
         * wr3; reading wr2 here would reuse the first half's mask for the
         * second four coefficients. */
        "wcmpgtsh wr2, wr4, wr2 \n\t"
        "wcmpgtsh wr3, wr4, wr3 \n\t"
        /* ((product ^ mask) + qadd) ^ mask adds qadd to non-negative lanes
         * and subtracts it from negative ones */
        "wxor wr0, wr2, wr0 \n\t"
        "wxor wr1, wr3, wr1 \n\t"
        "waddh wr0, wr7, wr0 \n\t"
        "waddh wr1, wr7, wr1 \n\t"
        "wxor wr2, wr0, wr2 \n\t"
        "wxor wr3, wr1, wr3 \n\t"
        /* lanes whose intermediate equals qadd had a zero product: clear them */
        "wcmpeqh wr0, wr7, wr0 \n\t"
        "wcmpeqh wr1, wr7, wr1 \n\t"
        "wandn wr0, wr2, wr0 \n\t"
        "wandn wr1, wr3, wr1 \n\t"
        "wstrd wr0, [%[block]] \n\t"
        "wstrd wr1, [%[block], #8] \n\t"
        "add %[block], %[block], #16 \n\t"
        "subs %[i], %[i], #1 \n\t"
        "bne 1b \n\t"
        /* the counter is modified by subs, so it must be an in/out operand */
        :[block]"+r"(block), [i]"+r"(i)
        :[qmul]"r"(qmul), [qadd]"r"(qadd)
        /* "cc": subs rewrites the condition flags */
        :"cc", "memory");

    block_orig[0] = level;
}
/*
 * Disabled (compiled out with #if 0): inter-block H.263 dequantiser that
 * would hand nCoeffs+1 coefficients to ippiQuantInvInter_Compact_H263_16s_I().
 * NOTE(review): that routine is not declared in this file - presumably it
 * comes from the Intel IPP headers; confirm before enabling.
 */
#if 0
static void dct_unquantize_h263_inter_iwmmxt(MpegEncContext *s,
DCTELEM *block, int n, int qscale)
{
int nCoeffs;
assert(s->block_last_index[n]>=0);
/* with AC prediction every coefficient (index 0..63) is processed */
if(s->ac_pred)
nCoeffs=63;
else
nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
ippiQuantInvInter_Compact_H263_16s_I(block, nCoeffs+1, qscale);
}
#endif
/**
 * Install the iwMMXt dequantisers into the codec context.
 *
 * Only the H.263 intra dequantiser is installed; the inter variant stays
 * compiled out.
 */
void MPV_common_init_iwmmxt(MpegEncContext *s)
{
    s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_iwmmxt;
#if 0
    s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_iwmmxt;
#endif
}
......@@ -1180,6 +1180,7 @@ typedef struct AVCodecContext {
#define FF_IDCT_SIMPLEARM 10
#define FF_IDCT_H264 11
#define FF_IDCT_VP3 12
#define FF_IDCT_IPP 13
/* Misspelled legacy name ("FP_" instead of the "FF_" prefix used by every
 * other IDCT selector); kept as an alias so existing users still compile. */
#define FP_IDCT_IPP FF_IDCT_IPP
/**
* slice count.
......
......@@ -94,10 +94,23 @@ static always_inline uint16_t bswap_16(uint16_t x){
return (x>>8) | (x<<8);
}
#ifdef ARCH_ARM
/**
 * Reverse the byte order of a 32-bit value (ARM inline-asm version).
 *
 * The four-instruction sequence uses one scratch register:
 *   t = x ^ ror(x, 16);  t &= ~0x00FF0000;  x = ror(x, 8);  x ^= t >> 8;
 *
 * @param x value to swap
 * @return x with its four bytes in reverse order
 */
static always_inline uint32_t bswap_32(uint32_t x){
    uint32_t t;
    __asm__ (
        "eor %1, %0, %0, ror #16 \n\t"
        "bic %1, %1, #0xFF0000 \n\t"
        "mov %0, %0, ror #8 \n\t"
        "eor %0, %0, %1, lsr #8 \n\t"
        /* t is pure scratch: declare it as an early-clobber output ("=&r")
         * rather than "+r", which would read the uninitialised variable.
         * Early-clobber because %1 is written before the last read of %0. */
        : "+r"(x), "=&r"(t));
    return x;
}
#else
/**
 * Reverse the byte order of a 32-bit value (portable C version).
 *
 * @param x value to swap
 * @return x with its four bytes in reverse order
 */
static always_inline uint32_t bswap_32(uint32_t x){
    /* swap the bytes inside each 16-bit half, then swap the halves */
    x= ((x<<8)&0xFF00FF00) | ((x>>8)&0x00FF00FF);
    return (x>>16) | (x<<16);
}
#endif
static inline uint64_t bswap_64(uint64_t x)
{
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment