Commit 8986fddc authored by Mans Rullgard

ARM: allow building in Thumb2 mode

Signed-off-by: Mans Rullgard <mans@mansr.com>
parent 9cd7b854
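A note on the mechanics of this change: in Thumb-2's unified syntax a conditionally executed instruction must be preceded by an IT instruction naming the condition, and several ARM addressing modes (post-indexed loads/stores with a register offset, the ldrd/strd register forms, register-specified shifts in the second operand) have no Thumb-2 encoding. Hence the it/itt/ite markers sprinkled through the assembly below and the new load/store macros in asm.S. The markers are added unguarded, so presumably the assembler simply discards them when producing ARM code. A minimal illustration, not taken from the patch (registers arbitrary):

    @ ARM state: any instruction can carry a condition code directly
    cmp     r0, #0
    addeq   r1, r1, #1

    @ Thumb-2, unified syntax: the condition must be announced first
    cmp     r0, #0
    it      eq
    addeq   r1, r1, #1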
...@@ -967,6 +967,7 @@ CONFIG_LIST=" ...@@ -967,6 +967,7 @@ CONFIG_LIST="
static static
swscale swscale
swscale_alpha swscale_alpha
thumb
vaapi vaapi
vdpau vdpau
version3 version3
...@@ -2607,7 +2608,7 @@ if enabled alpha; then ...@@ -2607,7 +2608,7 @@ if enabled alpha; then
elif enabled arm; then elif enabled arm; then
check_cflags -marm enabled thumb && check_cflags -mthumb || check_cflags -marm
nogas=die nogas=die
if check_cpp_condition stddef.h "defined __ARM_PCS_VFP"; then if check_cpp_condition stddef.h "defined __ARM_PCS_VFP"; then
......
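Assuming configure turns CONFIG_LIST entries into --enable-X/--disable-X switches as it does for the neighbouring ones, Thumb-2 output would be requested with something like ./configure --enable-thumb, which makes the changed line above test -mthumb; with the option off, the old check_cflags -marm path is kept.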
...@@ -114,12 +114,15 @@ static inline float *VMUL4S(float *dst, const float *v, unsigned idx, ...@@ -114,12 +114,15 @@ static inline float *VMUL4S(float *dst, const float *v, unsigned idx,
"vmov d1, %2, %3 \n\t" "vmov d1, %2, %3 \n\t"
"lsls %6, %6, #1 \n\t" "lsls %6, %6, #1 \n\t"
"and %0, %5, #1<<31 \n\t" "and %0, %5, #1<<31 \n\t"
"it cs \n\t"
"lslcs %5, %5, #1 \n\t" "lslcs %5, %5, #1 \n\t"
"lsls %6, %6, #1 \n\t" "lsls %6, %6, #1 \n\t"
"and %1, %5, #1<<31 \n\t" "and %1, %5, #1<<31 \n\t"
"it cs \n\t"
"lslcs %5, %5, #1 \n\t" "lslcs %5, %5, #1 \n\t"
"lsls %6, %6, #1 \n\t" "lsls %6, %6, #1 \n\t"
"and %2, %5, #1<<31 \n\t" "and %2, %5, #1<<31 \n\t"
"it cs \n\t"
"lslcs %5, %5, #1 \n\t" "lslcs %5, %5, #1 \n\t"
"vmov d4, %0, %1 \n\t" "vmov d4, %0, %1 \n\t"
"and %3, %5, #1<<31 \n\t" "and %3, %5, #1<<31 \n\t"
......
...@@ -27,6 +27,7 @@ function ff_ac3_update_bap_counts_arm, export=1 ...@@ -27,6 +27,7 @@ function ff_ac3_update_bap_counts_arm, export=1
lsl r3, lr, #1 lsl r3, lr, #1
ldrh r12, [r0, r3] ldrh r12, [r0, r3]
subs r2, r2, #1 subs r2, r2, #1
it gt
ldrbgt lr, [r1], #1 ldrbgt lr, [r1], #1
add r12, r12, #1 add r12, r12, #1
strh r12, [r0, r3] strh r12, [r0, r3]
......
...@@ -42,9 +42,11 @@ function ff_ac3_bit_alloc_calc_bap_armv6, export=1 ...@@ -42,9 +42,11 @@ function ff_ac3_bit_alloc_calc_bap_armv6, export=1
mov r11, r10 mov r11, r10
ldrb r10, [r4], #1 @ band_start_tab[band++] ldrb r10, [r4], #1 @ band_start_tab[band++]
subs r9, r9, r5 @ - floor subs r9, r9, r5 @ - floor
it lt
movlt r9, #0 movlt r9, #0
cmp r10, r3 @ - end cmp r10, r3 @ - end
and r9, r9, r8 @ & 0x1fe0 and r9, r9, r8 @ & 0x1fe0
ite gt
subgt r8, r3, r11 subgt r8, r3, r11
suble r8, r10, r11 suble r8, r10, r11
add r9, r9, r5 @ + floor => m add r9, r9, r5 @ + floor => m
......
...@@ -41,6 +41,7 @@ endfunc ...@@ -41,6 +41,7 @@ endfunc
function ff_ac3_exponent_min_neon, export=1 function ff_ac3_exponent_min_neon, export=1
cmp r1, #0 cmp r1, #0
it eq
bxeq lr bxeq lr
push {lr} push {lr}
mov r12, #256 mov r12, #256
......
...@@ -24,9 +24,18 @@ ...@@ -24,9 +24,18 @@
# define ELF # define ELF
#else #else
# define ELF @ # define ELF @
#endif
#if CONFIG_THUMB
# define A @
# define T
#else
# define A
# define T @
#endif #endif
.syntax unified .syntax unified
T .thumb
.macro require8 val=1 .macro require8 val=1
ELF .eabi_attribute 24, \val ELF .eabi_attribute 24, \val
...@@ -82,6 +91,90 @@ ELF .size \name, . - \name ...@@ -82,6 +91,90 @@ ELF .size \name, . - \name
#endif #endif
.endm .endm
.macro ldr_pre rt, rn, rm:vararg
A ldr \rt, [\rn, \rm]!
T add \rn, \rn, \rm
T ldr \rt, [\rn]
.endm
.macro ldr_post rt, rn, rm:vararg
A ldr \rt, [\rn], \rm
T ldr \rt, [\rn]
T add \rn, \rn, \rm
.endm
.macro ldrd_reg rt, rt2, rn, rm
A ldrd \rt, \rt2, [\rn, \rm]
T add \rt, \rn, \rm
T ldrd \rt, \rt2, [\rt]
.endm
.macro ldrd_post rt, rt2, rn, rm
A ldrd \rt, \rt2, [\rn], \rm
T ldrd \rt, \rt2, [\rn]
T add \rn, \rn, \rm
.endm
.macro ldrh_pre rt, rn, rm
A ldrh \rt, [\rn, \rm]!
T add \rn, \rn, \rm
T ldrh \rt, [\rn]
.endm
.macro ldrh_dpre rt, rn, rm
A ldrh \rt, [\rn, -\rm]!
T sub \rn, \rn, \rm
T ldrh \rt, [\rn]
.endm
.macro ldrh_post rt, rn, rm
A ldrh \rt, [\rn], \rm
T ldrh \rt, [\rn]
T add \rn, \rn, \rm
.endm
.macro str_post rt, rn, rm:vararg
A str \rt, [\rn], \rm
T str \rt, [\rn]
T add \rn, \rn, \rm
.endm
.macro strb_post rt, rn, rm:vararg
A strb \rt, [\rn], \rm
T strb \rt, [\rn]
T add \rn, \rn, \rm
.endm
.macro strd_post rt, rt2, rn, rm
A strd \rt, \rt2, [\rn], \rm
T strd \rt, \rt2, [\rn]
T add \rn, \rn, \rm
.endm
.macro strh_pre rt, rn, rm
A strh \rt, [\rn, \rm]!
T add \rn, \rn, \rm
T strh \rt, [\rn]
.endm
.macro strh_dpre rt, rn, rm
A strh \rt, [\rn, -\rm]!
T sub \rn, \rn, \rm
T strh \rt, [\rn]
.endm
.macro strh_post rt, rn, rm
A strh \rt, [\rn], \rm
T strh \rt, [\rn]
T add \rn, \rn, \rm
.endm
.macro strh_dpost rt, rn, rm
A strh \rt, [\rn], -\rm
T strh \rt, [\rn]
T sub \rn, \rn, \rm
.endm
#if HAVE_VFP_ARGS #if HAVE_VFP_ARGS
.eabi_attribute 28, 1 .eabi_attribute 28, 1
# define VFP # define VFP
......
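The A/T prefixes defined above let one source line carry both encodings: with CONFIG_THUMB unset, A expands to nothing and T to the comment character, so only the ARM variant assembles, and vice versa in a Thumb build. As a sketch of what the ldr_post macro used later in this patch produces (hypothetical registers):

    @ ARM build: post-indexed addressing with a register offset exists
    ldr     r4, [r1], r2

    @ Thumb build: no such encoding, so the macro splits it in two
    ldr     r4, [r1]
    add     r1, r1, r2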
...@@ -27,6 +27,7 @@ function ff_dca_lfe_fir_neon, export=1 ...@@ -27,6 +27,7 @@ function ff_dca_lfe_fir_neon, export=1
add r5, r2, #256*4-16 @ cf1 add r5, r2, #256*4-16 @ cf1
sub r1, r1, #12 sub r1, r1, #12
cmp r3, #32 cmp r3, #32
ite eq
moveq r6, #256/32 moveq r6, #256/32
movne r6, #256/64 movne r6, #256/64
NOVFP vldr s0, [sp, #16] @ scale NOVFP vldr s0, [sp, #16] @ scale
......
...@@ -554,10 +554,12 @@ endfunc ...@@ -554,10 +554,12 @@ endfunc
and r9, r5, r14 and r9, r5, r14
and r10, r6, r14 and r10, r6, r14
and r11, r7, r14 and r11, r7, r14
it eq
andeq r14, r14, r14, \rnd #1 andeq r14, r14, r14, \rnd #1
add r8, r8, r10 add r8, r8, r10
add r9, r9, r11 add r9, r9, r11
ldr r12, =0xfcfcfcfc >> 2 ldr r12, =0xfcfcfcfc >> 2
itt eq
addeq r8, r8, r14 addeq r8, r8, r14
addeq r9, r9, r14 addeq r9, r9, r14
and r4, r12, r4, lsr #2 and r4, r12, r4, lsr #2
...@@ -638,8 +640,10 @@ function ff_add_pixels_clamped_arm, export=1 ...@@ -638,8 +640,10 @@ function ff_add_pixels_clamped_arm, export=1
mvn r5, r5 mvn r5, r5
mvn r7, r7 mvn r7, r7
tst r6, #0x100 tst r6, #0x100
it ne
movne r6, r5, lsr #24 movne r6, r5, lsr #24
tst r8, #0x100 tst r8, #0x100
it ne
movne r8, r7, lsr #24 movne r8, r7, lsr #24
mov r9, r6 mov r9, r6
ldrsh r5, [r0, #4] /* moved form [A] */ ldrsh r5, [r0, #4] /* moved form [A] */
...@@ -654,8 +658,10 @@ function ff_add_pixels_clamped_arm, export=1 ...@@ -654,8 +658,10 @@ function ff_add_pixels_clamped_arm, export=1
mvn r5, r5 mvn r5, r5
mvn r7, r7 mvn r7, r7
tst r6, #0x100 tst r6, #0x100
it ne
movne r6, r5, lsr #24 movne r6, r5, lsr #24
tst r8, #0x100 tst r8, #0x100
it ne
movne r8, r7, lsr #24 movne r8, r7, lsr #24
orr r9, r9, r6, lsl #16 orr r9, r9, r6, lsl #16
ldr r4, [r1, #4] /* moved form [B] */ ldr r4, [r1, #4] /* moved form [B] */
...@@ -676,8 +682,10 @@ function ff_add_pixels_clamped_arm, export=1 ...@@ -676,8 +682,10 @@ function ff_add_pixels_clamped_arm, export=1
mvn r5, r5 mvn r5, r5
mvn r7, r7 mvn r7, r7
tst r6, #0x100 tst r6, #0x100
it ne
movne r6, r5, lsr #24 movne r6, r5, lsr #24
tst r8, #0x100 tst r8, #0x100
it ne
movne r8, r7, lsr #24 movne r8, r7, lsr #24
mov r9, r6 mov r9, r6
ldrsh r5, [r0, #12] /* moved from [D] */ ldrsh r5, [r0, #12] /* moved from [D] */
...@@ -692,8 +700,10 @@ function ff_add_pixels_clamped_arm, export=1 ...@@ -692,8 +700,10 @@ function ff_add_pixels_clamped_arm, export=1
mvn r5, r5 mvn r5, r5
mvn r7, r7 mvn r7, r7
tst r6, #0x100 tst r6, #0x100
it ne
movne r6, r5, lsr #24 movne r6, r5, lsr #24
tst r8, #0x100 tst r8, #0x100
it ne
movne r8, r7, lsr #24 movne r8, r7, lsr #24
orr r9, r9, r6, lsl #16 orr r9, r9, r6, lsl #16
add r0, r0, #16 /* moved from [E] */ add r0, r0, #16 /* moved from [E] */
......
...@@ -47,16 +47,16 @@ function ff_put_pixels16_armv6, export=1 ...@@ -47,16 +47,16 @@ function ff_put_pixels16_armv6, export=1
ldr r5, [r1, #4] ldr r5, [r1, #4]
ldr r6, [r1, #8] ldr r6, [r1, #8]
ldr r7, [r1, #12] ldr r7, [r1, #12]
ldr r4, [r1], r2 ldr_post r4, r1, r2
strd r6, r7, [r0, #8] strd r6, r7, [r0, #8]
ldr r9, [r1, #4] ldr r9, [r1, #4]
strd r4, r5, [r0], r2 strd_post r4, r5, r0, r2
ldr r10, [r1, #8] ldr r10, [r1, #8]
ldr r11, [r1, #12] ldr r11, [r1, #12]
ldr r8, [r1], r2 ldr_post r8, r1, r2
strd r10, r11, [r0, #8] strd r10, r11, [r0, #8]
subs r3, r3, #2 subs r3, r3, #2
strd r8, r9, [r0], r2 strd_post r8, r9, r0, r2
bne 1b bne 1b
pop {r4-r11} pop {r4-r11}
...@@ -67,12 +67,12 @@ function ff_put_pixels8_armv6, export=1 ...@@ -67,12 +67,12 @@ function ff_put_pixels8_armv6, export=1
push {r4-r7} push {r4-r7}
1: 1:
ldr r5, [r1, #4] ldr r5, [r1, #4]
ldr r4, [r1], r2 ldr_post r4, r1, r2
ldr r7, [r1, #4] ldr r7, [r1, #4]
strd r4, r5, [r0], r2 strd_post r4, r5, r0, r2
ldr r6, [r1], r2 ldr_post r6, r1, r2
subs r3, r3, #2 subs r3, r3, #2
strd r6, r7, [r0], r2 strd_post r6, r7, r0, r2
bne 1b bne 1b
pop {r4-r7} pop {r4-r7}
...@@ -90,7 +90,7 @@ function ff_put_pixels8_x2_armv6, export=1 ...@@ -90,7 +90,7 @@ function ff_put_pixels8_x2_armv6, export=1
ldr r5, [r1, #4] ldr r5, [r1, #4]
ldr r7, [r1, #5] ldr r7, [r1, #5]
lsr r6, r4, #8 lsr r6, r4, #8
ldr r8, [r1, r2]! ldr_pre r8, r1, r2
orr r6, r6, r5, lsl #24 orr r6, r6, r5, lsl #24
ldr r9, [r1, #4] ldr r9, [r1, #4]
ldr r11, [r1, #5] ldr r11, [r1, #5]
...@@ -112,9 +112,9 @@ function ff_put_pixels8_x2_armv6, export=1 ...@@ -112,9 +112,9 @@ function ff_put_pixels8_x2_armv6, export=1
uhadd8 r9, r9, r11 uhadd8 r9, r9, r11
and r6, r6, r12 and r6, r6, r12
uadd8 r8, r8, r14 uadd8 r8, r8, r14
strd r4, r5, [r0], r2 strd_post r4, r5, r0, r2
uadd8 r9, r9, r6 uadd8 r9, r9, r6
strd r8, r9, [r0], r2 strd_post r8, r9, r0, r2
bne 1b bne 1b
pop {r4-r11, pc} pop {r4-r11, pc}
...@@ -127,7 +127,7 @@ function ff_put_pixels8_y2_armv6, export=1 ...@@ -127,7 +127,7 @@ function ff_put_pixels8_y2_armv6, export=1
orr r12, r12, r12, lsl #16 orr r12, r12, r12, lsl #16
ldr r4, [r1] ldr r4, [r1]
ldr r5, [r1, #4] ldr r5, [r1, #4]
ldr r6, [r1, r2]! ldr_pre r6, r1, r2
ldr r7, [r1, #4] ldr r7, [r1, #4]
1: 1:
subs r3, r3, #2 subs r3, r3, #2
...@@ -136,7 +136,7 @@ function ff_put_pixels8_y2_armv6, export=1 ...@@ -136,7 +136,7 @@ function ff_put_pixels8_y2_armv6, export=1
uhadd8 r9, r5, r7 uhadd8 r9, r5, r7
eor r11, r5, r7 eor r11, r5, r7
and r10, r10, r12 and r10, r10, r12
ldr r4, [r1, r2]! ldr_pre r4, r1, r2
uadd8 r8, r8, r10 uadd8 r8, r8, r10
and r11, r11, r12 and r11, r11, r12
uadd8 r9, r9, r11 uadd8 r9, r9, r11
...@@ -148,11 +148,11 @@ function ff_put_pixels8_y2_armv6, export=1 ...@@ -148,11 +148,11 @@ function ff_put_pixels8_y2_armv6, export=1
eor r7, r5, r7 eor r7, r5, r7
uadd8 r10, r10, r6 uadd8 r10, r10, r6
and r7, r7, r12 and r7, r7, r12
ldr r6, [r1, r2]! ldr_pre r6, r1, r2
uadd8 r11, r11, r7 uadd8 r11, r11, r7
strd r8, r9, [r0], r2 strd_post r8, r9, r0, r2
ldr r7, [r1, #4] ldr r7, [r1, #4]
strd r10, r11, [r0], r2 strd_post r10, r11, r0, r2
bne 1b bne 1b
pop {r4-r11} pop {r4-r11}
...@@ -166,7 +166,7 @@ function ff_put_pixels8_x2_no_rnd_armv6, export=1 ...@@ -166,7 +166,7 @@ function ff_put_pixels8_x2_no_rnd_armv6, export=1
ldr r4, [r1] ldr r4, [r1]
ldr r5, [r1, #4] ldr r5, [r1, #4]
ldr r7, [r1, #5] ldr r7, [r1, #5]
ldr r8, [r1, r2]! ldr_pre r8, r1, r2
ldr r9, [r1, #4] ldr r9, [r1, #4]
ldr r14, [r1, #5] ldr r14, [r1, #5]
add r1, r1, r2 add r1, r1, r2
...@@ -191,16 +191,16 @@ function ff_put_pixels8_y2_no_rnd_armv6, export=1 ...@@ -191,16 +191,16 @@ function ff_put_pixels8_y2_no_rnd_armv6, export=1
push {r4-r9, lr} push {r4-r9, lr}
ldr r4, [r1] ldr r4, [r1]
ldr r5, [r1, #4] ldr r5, [r1, #4]
ldr r6, [r1, r2]! ldr_pre r6, r1, r2
ldr r7, [r1, #4] ldr r7, [r1, #4]
1: 1:
subs r3, r3, #2 subs r3, r3, #2
uhadd8 r8, r4, r6 uhadd8 r8, r4, r6
ldr r4, [r1, r2]! ldr_pre r4, r1, r2
uhadd8 r9, r5, r7 uhadd8 r9, r5, r7
ldr r5, [r1, #4] ldr r5, [r1, #4]
uhadd8 r12, r4, r6 uhadd8 r12, r4, r6
ldr r6, [r1, r2]! ldr_pre r6, r1, r2
uhadd8 r14, r5, r7 uhadd8 r14, r5, r7
ldr r7, [r1, #4] ldr r7, [r1, #4]
stm r0, {r8,r9} stm r0, {r8,r9}
...@@ -220,44 +220,44 @@ function ff_avg_pixels8_armv6, export=1 ...@@ -220,44 +220,44 @@ function ff_avg_pixels8_armv6, export=1
orr lr, lr, lr, lsl #16 orr lr, lr, lr, lsl #16
ldrd r4, r5, [r0] ldrd r4, r5, [r0]
ldr r10, [r1, #4] ldr r10, [r1, #4]
ldr r9, [r1], r2 ldr_post r9, r1, r2
subs r3, r3, #2 subs r3, r3, #2
1: 1:
pld [r1, r2] pld [r1, r2]
eor r8, r4, r9 eor r8, r4, r9
uhadd8 r4, r4, r9 uhadd8 r4, r4, r9
eor r12, r5, r10 eor r12, r5, r10
ldrd r6, r7, [r0, r2] ldrd_reg r6, r7, r0, r2
uhadd8 r5, r5, r10 uhadd8 r5, r5, r10
and r8, r8, lr and r8, r8, lr
ldr r10, [r1, #4] ldr r10, [r1, #4]
and r12, r12, lr and r12, r12, lr
uadd8 r4, r4, r8 uadd8 r4, r4, r8
ldr r9, [r1], r2 ldr_post r9, r1, r2
eor r8, r6, r9 eor r8, r6, r9
uadd8 r5, r5, r12 uadd8 r5, r5, r12
pld [r1, r2, lsl #1] pld [r1, r2, lsl #1]
eor r12, r7, r10 eor r12, r7, r10
uhadd8 r6, r6, r9 uhadd8 r6, r6, r9
strd r4, r5, [r0], r2 strd_post r4, r5, r0, r2
uhadd8 r7, r7, r10 uhadd8 r7, r7, r10
beq 2f beq 2f
and r8, r8, lr and r8, r8, lr
ldrd r4, r5, [r0, r2] ldrd_reg r4, r5, r0, r2
uadd8 r6, r6, r8 uadd8 r6, r6, r8
ldr r10, [r1, #4] ldr r10, [r1, #4]
and r12, r12, lr and r12, r12, lr
subs r3, r3, #2 subs r3, r3, #2
uadd8 r7, r7, r12 uadd8 r7, r7, r12
ldr r9, [r1], r2 ldr_post r9, r1, r2
strd r6, r7, [r0], r2 strd_post r6, r7, r0, r2
b 1b b 1b
2: 2:
and r8, r8, lr and r8, r8, lr
and r12, r12, lr and r12, r12, lr
uadd8 r6, r6, r8 uadd8 r6, r6, r8
uadd8 r7, r7, r12 uadd8 r7, r7, r12
strd r6, r7, [r0], r2 strd_post r6, r7, r0, r2
pop {r4-r10, pc} pop {r4-r10, pc}
endfunc endfunc
...@@ -284,7 +284,7 @@ function ff_add_pixels_clamped_armv6, export=1 ...@@ -284,7 +284,7 @@ function ff_add_pixels_clamped_armv6, export=1
orr r6, r8, r5, lsl #8 orr r6, r8, r5, lsl #8
orr r7, r4, lr, lsl #8 orr r7, r4, lr, lsl #8
subs r3, r3, #1 subs r3, r3, #1
strd r6, r7, [r1], r2 strd_post r6, r7, r1, r2
bgt 1b bgt 1b
pop {r4-r8,pc} pop {r4-r8,pc}
endfunc endfunc
...@@ -294,7 +294,7 @@ function ff_get_pixels_armv6, export=1 ...@@ -294,7 +294,7 @@ function ff_get_pixels_armv6, export=1
push {r4-r8, lr} push {r4-r8, lr}
mov lr, #8 mov lr, #8
1: 1:
ldrd r4, r5, [r1], r2 ldrd_post r4, r5, r1, r2
subs lr, lr, #1 subs lr, lr, #1
uxtb16 r6, r4 uxtb16 r6, r4
uxtb16 r4, r4, ror #8 uxtb16 r4, r4, ror #8
...@@ -317,8 +317,8 @@ function ff_diff_pixels_armv6, export=1 ...@@ -317,8 +317,8 @@ function ff_diff_pixels_armv6, export=1
push {r4-r9, lr} push {r4-r9, lr}
mov lr, #8 mov lr, #8
1: 1:
ldrd r4, r5, [r1], r3 ldrd_post r4, r5, r1, r3
ldrd r6, r7, [r2], r3 ldrd_post r6, r7, r2, r3
uxtb16 r8, r4 uxtb16 r8, r4
uxtb16 r4, r4, ror #8 uxtb16 r4, r4, ror #8
uxtb16 r9, r6 uxtb16 r9, r6
...@@ -492,19 +492,19 @@ function ff_pix_abs8_armv6, export=1 ...@@ -492,19 +492,19 @@ function ff_pix_abs8_armv6, export=1
push {r4-r9, lr} push {r4-r9, lr}
mov r0, #0 mov r0, #0
mov lr, #0 mov lr, #0
ldrd r4, r5, [r1], r3 ldrd_post r4, r5, r1, r3
1: 1:
subs r12, r12, #2 subs r12, r12, #2
ldr r7, [r2, #4] ldr r7, [r2, #4]
ldr r6, [r2], r3 ldr_post r6, r2, r3
ldrd r8, r9, [r1], r3 ldrd_post r8, r9, r1, r3
usada8 r0, r4, r6, r0 usada8 r0, r4, r6, r0
pld [r2, r3] pld [r2, r3]
usada8 lr, r5, r7, lr usada8 lr, r5, r7, lr
ldr r7, [r2, #4] ldr r7, [r2, #4]
ldr r6, [r2], r3 ldr_post r6, r2, r3
beq 2f beq 2f
ldrd r4, r5, [r1], r3 ldrd_post r4, r5, r1, r3
usada8 r0, r8, r6, r0 usada8 r0, r8, r6, r0
pld [r2, r3] pld [r2, r3]
usada8 lr, r9, r7, lr usada8 lr, r9, r7, lr
...@@ -613,7 +613,7 @@ function ff_pix_sum_armv6, export=1 ...@@ -613,7 +613,7 @@ function ff_pix_sum_armv6, export=1
ldr r7, [r0, #12] ldr r7, [r0, #12]
usada8 r2, r6, lr, r2 usada8 r2, r6, lr, r2
beq 2f beq 2f
ldr r4, [r0, r1]! ldr_pre r4, r0, r1
usada8 r3, r7, lr, r3 usada8 r3, r7, lr, r3
bgt 1b bgt 1b
2: 2:
......
...@@ -531,6 +531,7 @@ function ff_vorbis_inverse_coupling_neon, export=1 ...@@ -531,6 +531,7 @@ function ff_vorbis_inverse_coupling_neon, export=1
2: vst1.32 {d2-d3}, [r3, :128]! 2: vst1.32 {d2-d3}, [r3, :128]!
vst1.32 {d0-d1}, [r12,:128]! vst1.32 {d0-d1}, [r12,:128]!
it lt
bxlt lr bxlt lr
3: vld1.32 {d2-d3}, [r1,:128] 3: vld1.32 {d2-d3}, [r1,:128]
...@@ -575,6 +576,7 @@ NOVFP vdup.32 q8, r2 ...@@ -575,6 +576,7 @@ NOVFP vdup.32 q8, r2
2: vst1.32 {q2},[r0,:128]! 2: vst1.32 {q2},[r0,:128]!
vst1.32 {q3},[r0,:128]! vst1.32 {q3},[r0,:128]!
ands len, len, #15 ands len, len, #15
it eq
bxeq lr bxeq lr
3: vld1.32 {q0},[r1,:128]! 3: vld1.32 {q0},[r1,:128]!
vmul.f32 q0, q0, q8 vmul.f32 q0, q0, q8
...@@ -638,6 +640,7 @@ NOVFP ldr r3, [sp] ...@@ -638,6 +640,7 @@ NOVFP ldr r3, [sp]
2: vst1.32 {q8},[r0,:128]! 2: vst1.32 {q8},[r0,:128]!
vst1.32 {q9},[r0,:128]! vst1.32 {q9},[r0,:128]!
ands r3, r3, #7 ands r3, r3, #7
it eq
popeq {pc} popeq {pc}
3: vld1.32 {q0},[r1,:128]! 3: vld1.32 {q0},[r1,:128]!
ldr r12, [r2], #4 ldr r12, [r2], #4
......
...@@ -55,18 +55,23 @@ function ff_vector_fmul_vfp, export=1 ...@@ -55,18 +55,23 @@ function ff_vector_fmul_vfp, export=1
1: 1:
subs r3, r3, #16 subs r3, r3, #16
vmul.f32 s12, s4, s12 vmul.f32 s12, s4, s12
itttt ge
vldmiage r1!, {s16-s19} vldmiage r1!, {s16-s19}
vldmiage r2!, {s24-s27} vldmiage r2!, {s24-s27}
vldmiage r1!, {s20-s23} vldmiage r1!, {s20-s23}
vldmiage r2!, {s28-s31} vldmiage r2!, {s28-s31}
it ge
vmulge.f32 s24, s16, s24 vmulge.f32 s24, s16, s24
vstmia r0!, {s8-s11} vstmia r0!, {s8-s11}
vstmia r0!, {s12-s15} vstmia r0!, {s12-s15}
it ge
vmulge.f32 s28, s20, s28 vmulge.f32 s28, s20, s28
itttt gt
vldmiagt r1!, {s0-s3} vldmiagt r1!, {s0-s3}
vldmiagt r2!, {s8-s11} vldmiagt r2!, {s8-s11}
vldmiagt r1!, {s4-s7} vldmiagt r1!, {s4-s7}
vldmiagt r2!, {s12-s15} vldmiagt r2!, {s12-s15}
ittt ge
vmulge.f32 s8, s0, s8 vmulge.f32 s8, s0, s8
vstmiage r0!, {s24-s27} vstmiage r0!, {s24-s27}
vstmiage r0!, {s28-s31} vstmiage r0!, {s28-s31}
...@@ -97,33 +102,49 @@ function ff_vector_fmul_reverse_vfp, export=1 ...@@ -97,33 +102,49 @@ function ff_vector_fmul_reverse_vfp, export=1
vmul.f32 s11, s0, s11 vmul.f32 s11, s0, s11
1: 1:
subs r3, r3, #16 subs r3, r3, #16
it ge
vldmdbge r2!, {s16-s19} vldmdbge r2!, {s16-s19}
vmul.f32 s12, s7, s12 vmul.f32 s12, s7, s12
it ge
vldmiage r1!, {s24-s27} vldmiage r1!, {s24-s27}
vmul.f32 s13, s6, s13 vmul.f32 s13, s6, s13
it ge
vldmdbge r2!, {s20-s23} vldmdbge r2!, {s20-s23}
vmul.f32 s14, s5, s14 vmul.f32 s14, s5, s14
it ge
vldmiage r1!, {s28-s31} vldmiage r1!, {s28-s31}
vmul.f32 s15, s4, s15 vmul.f32 s15, s4, s15
it ge
vmulge.f32 s24, s19, s24 vmulge.f32 s24, s19, s24
it gt
vldmdbgt r2!, {s0-s3} vldmdbgt r2!, {s0-s3}
it ge
vmulge.f32 s25, s18, s25 vmulge.f32 s25, s18, s25
vstmia r0!, {s8-s13} vstmia r0!, {s8-s13}
it ge
vmulge.f32 s26, s17, s26 vmulge.f32 s26, s17, s26
it gt
vldmiagt r1!, {s8-s11} vldmiagt r1!, {s8-s11}
itt ge
vmulge.f32 s27, s16, s27 vmulge.f32 s27, s16, s27
vmulge.f32 s28, s23, s28 vmulge.f32 s28, s23, s28
it gt
vldmdbgt r2!, {s4-s7} vldmdbgt r2!, {s4-s7}
it ge
vmulge.f32 s29, s22, s29 vmulge.f32 s29, s22, s29
vstmia r0!, {s14-s15} vstmia r0!, {s14-s15}
ittt ge
vmulge.f32 s30, s21, s30 vmulge.f32 s30, s21, s30
vmulge.f32 s31, s20, s31 vmulge.f32 s31, s20, s31
vmulge.f32 s8, s3, s8 vmulge.f32 s8, s3, s8
it gt
vldmiagt r1!, {s12-s15} vldmiagt r1!, {s12-s15}
itttt ge
vmulge.f32 s9, s2, s9 vmulge.f32 s9, s2, s9
vmulge.f32 s10, s1, s10 vmulge.f32 s10, s1, s10
vstmiage r0!, {s24-s27} vstmiage r0!, {s24-s27}
vmulge.f32 s11, s0, s11 vmulge.f32 s11, s0, s11
it ge
vstmiage r0!, {s28-s31} vstmiage r0!, {s28-s31}
bgt 1b bgt 1b
......
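One IT instruction can predicate at most four following instructions, each 't' repeating the stated condition and each 'e' inverting it, which is why the VFP loops above need a mix of it/itt/ittt/itttt markers rather than a single one. For example, from the block above:

    itttt   ge
    vldmiage r1!, {s16-s19}
    vldmiage r2!, {s24-s27}
    vldmiage r1!, {s20-s23}
    vldmiage r2!, {s28-s31}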
...@@ -71,6 +71,7 @@ endfunc ...@@ -71,6 +71,7 @@ endfunc
function ff_float_to_int16_interleave_neon, export=1 function ff_float_to_int16_interleave_neon, export=1
cmp r3, #2 cmp r3, #2
itt lt
ldrlt r1, [r1] ldrlt r1, [r1]
blt ff_float_to_int16_neon blt ff_float_to_int16_neon
bne 4f bne 4f
...@@ -196,6 +197,7 @@ function ff_float_to_int16_interleave_neon, export=1 ...@@ -196,6 +197,7 @@ function ff_float_to_int16_interleave_neon, export=1
vst1.64 {d3}, [r8], ip vst1.64 {d3}, [r8], ip
vst1.64 {d7}, [r8], ip vst1.64 {d7}, [r8], ip
subs r3, r3, #4 subs r3, r3, #4
it eq
popeq {r4-r8,pc} popeq {r4-r8,pc}
cmp r3, #4 cmp r3, #4
add r0, r0, #8 add r0, r0, #8
...@@ -305,6 +307,7 @@ function ff_float_to_int16_interleave_neon, export=1 ...@@ -305,6 +307,7 @@ function ff_float_to_int16_interleave_neon, export=1
vst1.32 {d23[1]}, [r8], ip vst1.32 {d23[1]}, [r8], ip
8: subs r3, r3, #2 8: subs r3, r3, #2
add r0, r0, #4 add r0, r0, #4
it eq
popeq {r4-r8,pc} popeq {r4-r8,pc}
@ 1 channel @ 1 channel
...@@ -354,6 +357,7 @@ function ff_float_to_int16_interleave_neon, export=1 ...@@ -354,6 +357,7 @@ function ff_float_to_int16_interleave_neon, export=1
vst1.16 {d2[3]}, [r5,:16], ip vst1.16 {d2[3]}, [r5,:16], ip
vst1.16 {d3[1]}, [r5,:16], ip vst1.16 {d3[1]}, [r5,:16], ip
vst1.16 {d3[3]}, [r5,:16], ip vst1.16 {d3[3]}, [r5,:16], ip
it eq
popeq {r4-r8,pc} popeq {r4-r8,pc}
vld1.64 {d0-d1}, [r4,:128]! vld1.64 {d0-d1}, [r4,:128]!
vcvt.s32.f32 q0, q0, #16 vcvt.s32.f32 q0, q0, #16
......
...@@ -46,6 +46,7 @@ function ff_float_to_int16_vfp, export=1 ...@@ -46,6 +46,7 @@ function ff_float_to_int16_vfp, export=1
vmov r5, r6, s2, s3 vmov r5, r6, s2, s3
vmov r7, r8, s4, s5 vmov r7, r8, s4, s5
vmov ip, lr, s6, s7 vmov ip, lr, s6, s7
it gt
vldmiagt r1!, {s16-s23} vldmiagt r1!, {s16-s23}
ssat r4, #16, r4 ssat r4, #16, r4
ssat r3, #16, r3 ssat r3, #16, r3
...@@ -53,10 +54,12 @@ function ff_float_to_int16_vfp, export=1 ...@@ -53,10 +54,12 @@ function ff_float_to_int16_vfp, export=1
ssat r5, #16, r5 ssat r5, #16, r5
pkhbt r3, r3, r4, lsl #16 pkhbt r3, r3, r4, lsl #16
pkhbt r4, r5, r6, lsl #16 pkhbt r4, r5, r6, lsl #16
itttt gt
vcvtgt.s32.f32 s0, s16 vcvtgt.s32.f32 s0, s16
vcvtgt.s32.f32 s1, s17 vcvtgt.s32.f32 s1, s17
vcvtgt.s32.f32 s2, s18 vcvtgt.s32.f32 s2, s18
vcvtgt.s32.f32 s3, s19 vcvtgt.s32.f32 s3, s19
itttt gt
vcvtgt.s32.f32 s4, s20 vcvtgt.s32.f32 s4, s20
vcvtgt.s32.f32 s5, s21 vcvtgt.s32.f32 s5, s21
vcvtgt.s32.f32 s6, s22 vcvtgt.s32.f32 s6, s22
......
...@@ -106,10 +106,12 @@ function ff_h264_idct_add16_neon, export=1 ...@@ -106,10 +106,12 @@ function ff_h264_idct_add16_neon, export=1
blt 2f blt 2f
ldrsh lr, [r1] ldrsh lr, [r1]
add r0, r0, r4 add r0, r0, r4
it ne
movne lr, #0 movne lr, #0
cmp lr, #0 cmp lr, #0
adrne lr, ff_h264_idct_dc_add_neon ite ne
adreq lr, ff_h264_idct_add_neon adrne lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB
adreq lr, ff_h264_idct_add_neon + CONFIG_THUMB
blx lr blx lr
2: subs ip, ip, #1 2: subs ip, ip, #1
add r1, r1, #32 add r1, r1, #32
...@@ -132,8 +134,9 @@ function ff_h264_idct_add16intra_neon, export=1 ...@@ -132,8 +134,9 @@ function ff_h264_idct_add16intra_neon, export=1
add r0, r0, r4 add r0, r0, r4
cmp r8, #0 cmp r8, #0
ldrsh r8, [r1] ldrsh r8, [r1]
adrne lr, ff_h264_idct_add_neon iteet ne
adreq lr, ff_h264_idct_dc_add_neon adrne lr, ff_h264_idct_add_neon + CONFIG_THUMB
adreq lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB
cmpeq r8, #0 cmpeq r8, #0
blxne lr blxne lr
subs ip, ip, #1 subs ip, ip, #1
...@@ -159,12 +162,14 @@ function ff_h264_idct_add8_neon, export=1 ...@@ -159,12 +162,14 @@ function ff_h264_idct_add8_neon, export=1
add r1, r3, r12, lsl #5 add r1, r3, r12, lsl #5
cmp r8, #0 cmp r8, #0
ldrsh r8, [r1] ldrsh r8, [r1]
adrne lr, ff_h264_idct_add_neon iteet ne
adreq lr, ff_h264_idct_dc_add_neon adrne lr, ff_h264_idct_add_neon + CONFIG_THUMB
adreq lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB
cmpeq r8, #0 cmpeq r8, #0
blxne lr blxne lr
add r12, r12, #1 add r12, r12, #1
cmp r12, #4 cmp r12, #4
itt eq
moveq r12, #16 moveq r12, #16
moveq r4, r9 moveq r4, r9
cmp r12, #20 cmp r12, #20
...@@ -365,10 +370,12 @@ function ff_h264_idct8_add4_neon, export=1 ...@@ -365,10 +370,12 @@ function ff_h264_idct8_add4_neon, export=1
blt 2f blt 2f
ldrsh lr, [r1] ldrsh lr, [r1]
add r0, r0, r4 add r0, r0, r4
it ne
movne lr, #0 movne lr, #0
cmp lr, #0 cmp lr, #0
adrne lr, ff_h264_idct8_dc_add_neon ite ne
adreq lr, ff_h264_idct8_add_neon adrne lr, ff_h264_idct8_dc_add_neon + CONFIG_THUMB
adreq lr, ff_h264_idct8_add_neon + CONFIG_THUMB
blx lr blx lr
2: subs r12, r12, #4 2: subs r12, r12, #4
add r1, r1, #128 add r1, r1, #128
......
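The + CONFIG_THUMB added to the adr targets above is presumably for interworking: blx lr switches instruction set according to bit 0 of lr, and CONFIG_THUMB evaluates to 1 only in a Thumb build, so the extra term sets the Thumb bit on the callee address exactly when the callees are themselves Thumb code:

    ite     ne
    adrne   lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB
    adreq   lr, ff_h264_idct_add_neon + CONFIG_THUMB
    blx     lr      @ bit 0 of lr selects ARM or Thumb state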
...@@ -64,11 +64,14 @@ static inline av_const int mid_pred(int a, int b, int c) ...@@ -64,11 +64,14 @@ static inline av_const int mid_pred(int a, int b, int c)
__asm__ ( __asm__ (
"mov %0, %2 \n\t" "mov %0, %2 \n\t"
"cmp %1, %2 \n\t" "cmp %1, %2 \n\t"
"itt gt \n\t"
"movgt %0, %1 \n\t" "movgt %0, %1 \n\t"
"movgt %1, %2 \n\t" "movgt %1, %2 \n\t"
"cmp %1, %3 \n\t" "cmp %1, %3 \n\t"
"it le \n\t"
"movle %1, %3 \n\t" "movle %1, %3 \n\t"
"cmp %0, %1 \n\t" "cmp %0, %1 \n\t"
"it gt \n\t"
"movgt %0, %1 \n\t" "movgt %0, %1 \n\t"
: "=&r"(m), "+r"(a) : "=&r"(m), "+r"(a)
: "r"(b), "r"(c) : "r"(b), "r"(c)
......
...@@ -191,7 +191,9 @@ function ff_mdct_calc_neon, export=1 ...@@ -191,7 +191,9 @@ function ff_mdct_calc_neon, export=1
vadd.f32 d17, d17, d3 @ in2u+in1d -I vadd.f32 d17, d17, d3 @ in2u+in1d -I
1: 1:
vmul.f32 d7, d0, d21 @ I*s vmul.f32 d7, d0, d21 @ I*s
ldr r10, [r3, lr, lsr #1] A ldr r10, [r3, lr, lsr #1]
T lsr r10, lr, #1
T ldr r10, [r3, r10]
vmul.f32 d6, d1, d20 @ -R*c vmul.f32 d6, d1, d20 @ -R*c
ldr r6, [r3, #4]! ldr r6, [r3, #4]!
vmul.f32 d4, d1, d21 @ -R*s vmul.f32 d4, d1, d21 @ -R*s
......
...@@ -75,7 +75,7 @@ function ff_mpadsp_apply_window_fixed_armv6, export=1 ...@@ -75,7 +75,7 @@ function ff_mpadsp_apply_window_fixed_armv6, export=1
sum8 r8, r9, r1, r0, r10, r11, r12, lr sum8 r8, r9, r1, r0, r10, r11, r12, lr
sum8 r8, r9, r1, r2, r10, r11, r12, lr, rsb, 32 sum8 r8, r9, r1, r2, r10, r11, r12, lr, rsb, 32
round r10, r8, r9 round r10, r8, r9
strh r10, [r3], r4 strh_post r10, r3, r4
mov lr, #15 mov lr, #15
1: 1:
...@@ -127,10 +127,10 @@ function ff_mpadsp_apply_window_fixed_armv6, export=1 ...@@ -127,10 +127,10 @@ function ff_mpadsp_apply_window_fixed_armv6, export=1
round r10, r8, r9 round r10, r8, r9
adds r8, r8, r4 adds r8, r8, r4
adc r9, r9, r7 adc r9, r9, r7
strh r10, [r3], r12 strh_post r10, r3, r12
round r11, r8, r9 round r11, r8, r9
subs lr, lr, #1 subs lr, lr, #1
strh r11, [r5], -r12 strh_dpost r11, r5, r12
bgt 1b bgt 1b
sum8 r8, r9, r1, r0, r10, r11, r12, lr, rsb, 33 sum8 r8, r9, r1, r0, r10, r11, r12, lr, rsb, 33
......
...@@ -38,15 +38,21 @@ ...@@ -38,15 +38,21 @@
.macro dequant_t dst, src, mul, add, tmp .macro dequant_t dst, src, mul, add, tmp
rsbs \tmp, ip, \src, asr #16 rsbs \tmp, ip, \src, asr #16
it gt
addgt \tmp, \add, #0 addgt \tmp, \add, #0
it lt
rsblt \tmp, \add, #0 rsblt \tmp, \add, #0
it ne
smlatbne \dst, \src, \mul, \tmp smlatbne \dst, \src, \mul, \tmp
.endm .endm
.macro dequant_b dst, src, mul, add, tmp .macro dequant_b dst, src, mul, add, tmp
rsbs \tmp, ip, \src, lsl #16 rsbs \tmp, ip, \src, lsl #16
it gt
addgt \tmp, \add, #0 addgt \tmp, \add, #0
it lt
rsblt \tmp, \add, #0 rsblt \tmp, \add, #0
it ne
smlabbne \dst, \src, \mul, \tmp smlabbne \dst, \src, \mul, \tmp
.endm .endm
...@@ -80,21 +86,27 @@ function ff_dct_unquantize_h263_armv5te, export=1 ...@@ -80,21 +86,27 @@ function ff_dct_unquantize_h263_armv5te, export=1
strh lr, [r0], #2 strh lr, [r0], #2
subs r3, r3, #8 subs r3, r3, #8
it gt
ldrdgt r4, [r0, #0] /* load data early to avoid load/use pipeline stall */ ldrdgt r4, [r0, #0] /* load data early to avoid load/use pipeline stall */
bgt 1b bgt 1b
adds r3, r3, #2 adds r3, r3, #2
it le
pople {r4-r9,pc} pople {r4-r9,pc}
2: 2:
ldrsh r9, [r0, #0] ldrsh r9, [r0, #0]
ldrsh lr, [r0, #2] ldrsh lr, [r0, #2]
mov r8, r2 mov r8, r2
cmp r9, #0 cmp r9, #0
it lt
rsblt r8, r2, #0 rsblt r8, r2, #0
it ne
smlabbne r9, r9, r1, r8 smlabbne r9, r9, r1, r8
mov r8, r2 mov r8, r2
cmp lr, #0 cmp lr, #0
it lt
rsblt r8, r2, #0 rsblt r8, r2, #0
it ne
smlabbne lr, lr, r1, r8 smlabbne lr, lr, r1, r8
strh r9, [r0], #2 strh r9, [r0], #2
strh lr, [r0], #2 strh lr, [r0], #2
......
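Because each IT block fixes a single base condition, a run of conditional instructions with differing, non-complementary conditions, like the dequant_t macro above, needs one IT per instruction rather than one combined block:

    rsbs    \tmp, ip, \src, asr #16
    it      gt
    addgt   \tmp, \add, #0
    it      lt
    rsblt   \tmp, \add, #0
    it      ne
    smlatbne \dst, \src, \mul, \tmp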
...@@ -57,6 +57,7 @@ function ff_dct_unquantize_h263_neon, export=1 ...@@ -57,6 +57,7 @@ function ff_dct_unquantize_h263_neon, export=1
subs r3, r3, #16 subs r3, r3, #16
vst1.16 {q0}, [r1,:128]! vst1.16 {q0}, [r1,:128]!
vst1.16 {q8}, [r1,:128]! vst1.16 {q8}, [r1,:128]!
it le
bxle lr bxle lr
cmp r3, #8 cmp r3, #8
bgt 1b bgt 1b
...@@ -78,6 +79,7 @@ function ff_dct_unquantize_h263_intra_neon, export=1 ...@@ -78,6 +79,7 @@ function ff_dct_unquantize_h263_intra_neon, export=1
ldr r6, [r0, #AC_PRED] ldr r6, [r0, #AC_PRED]
add lr, r0, #INTER_SCANTAB_RASTER_END add lr, r0, #INTER_SCANTAB_RASTER_END
cmp r6, #0 cmp r6, #0
it ne
movne r12, #63 movne r12, #63
bne 1f bne 1f
ldr r12, [r12, r2, lsl #2] ldr r12, [r12, r2, lsl #2]
...@@ -86,9 +88,11 @@ function ff_dct_unquantize_h263_intra_neon, export=1 ...@@ -86,9 +88,11 @@ function ff_dct_unquantize_h263_intra_neon, export=1
ldrsh r4, [r1] ldrsh r4, [r1]
cmp r5, #0 cmp r5, #0
mov r5, r1 mov r5, r1
it ne
movne r2, #0 movne r2, #0
bne 2f bne 2f
cmp r2, #4 cmp r2, #4
it ge
addge r0, r0, #4 addge r0, r0, #4
sub r2, r3, #1 sub r2, r3, #1
ldr r6, [r0, #Y_DC_SCALE] ldr r6, [r0, #Y_DC_SCALE]
......
...@@ -137,6 +137,7 @@ function ff_rdft_calc_neon, export=1 ...@@ -137,6 +137,7 @@ function ff_rdft_calc_neon, export=1
vst1.32 {d22}, [r5,:64] vst1.32 {d22}, [r5,:64]
cmp r6, #0 cmp r6, #0
it eq
popeq {r4-r8,pc} popeq {r4-r8,pc}
vmul.f32 d22, d22, d18 vmul.f32 d22, d22, d18
......
...@@ -121,11 +121,13 @@ __b_evaluation: ...@@ -121,11 +121,13 @@ __b_evaluation:
ldr r11, [r12, #offW7] @ R11=W7 ldr r11, [r12, #offW7] @ R11=W7
mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
teq r2, #0 @ if null avoid muls teq r2, #0 @ if null avoid muls
mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) itttt ne
mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
rsbne r2, r2, #0 @ R2=-ROWr16[3] rsbne r2, r2, #0 @ R2=-ROWr16[3]
mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
it ne
mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
@@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3], @@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
...@@ -148,19 +150,23 @@ __b_evaluation: ...@@ -148,19 +150,23 @@ __b_evaluation:
@@ MAC16(b3, -W1, row[7]); @@ MAC16(b3, -W1, row[7]);
@@ MAC16(b1, -W5, row[7]); @@ MAC16(b1, -W5, row[7]);
mov r3, r3, asr #16 @ R3=ROWr16[5] mov r3, r3, asr #16 @ R3=ROWr16[5]
teq r3, #0 @ if null avoid muls teq r3, #0 @ if null avoid muls
it ne
mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5]=b0 mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5]=b0
mov r4, r4, asr #16 @ R4=ROWr16[7] mov r4, r4, asr #16 @ R4=ROWr16[7]
itttt ne
mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5]=b2 mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5]=b2
mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5]=b3 mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5]=b3
rsbne r3, r3, #0 @ R3=-ROWr16[5] rsbne r3, r3, #0 @ R3=-ROWr16[5]
mlane r1, r8, r3, r1 @ R7-=W1*ROWr16[5]=b1 mlane r1, r8, r3, r1 @ R7-=W1*ROWr16[5]=b1
@@ R3 is free now @@ R3 is free now
teq r4, #0 @ if null avoid muls teq r4, #0 @ if null avoid muls
itttt ne
mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7]=b0 mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7]=b0
mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7]=b2 mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7]=b2
rsbne r4, r4, #0 @ R4=-ROWr16[7] rsbne r4, r4, #0 @ R4=-ROWr16[7]
mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7]=b3 mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7]=b3
it ne
mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7]=b1 mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7]=b1
@@ R4 is free now @@ R4 is free now
__end_b_evaluation: __end_b_evaluation:
...@@ -204,16 +210,19 @@ __a_evaluation: ...@@ -204,16 +210,19 @@ __a_evaluation:
@@ a2 -= W4*row[4] @@ a2 -= W4*row[4]
@@ a3 += W4*row[4] @@ a3 += W4*row[4]
ldrsh r11, [r14, #8] @ R11=ROWr16[4] ldrsh r11, [r14, #8] @ R11=ROWr16[4]
teq r11, #0 @ if null avoid muls teq r11, #0 @ if null avoid muls
it ne
mulne r11, r9, r11 @ R11=W4*ROWr16[4] mulne r11, r9, r11 @ R11=W4*ROWr16[4]
@@ R9 is free now @@ R9 is free now
ldrsh r9, [r14, #12] @ R9=ROWr16[6] ldrsh r9, [r14, #12] @ R9=ROWr16[6]
itttt ne
addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0) addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0)
subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1) subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1)
subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2) subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2)
addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3) addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3)
@@ W6 alone is no more useful, save W2*ROWr16[6] in it instead @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead
teq r9, #0 @ if null avoid muls teq r9, #0 @ if null avoid muls
itttt ne
mulne r11, r10, r9 @ R11=W6*ROWr16[6] mulne r11, r10, r9 @ R11=W6*ROWr16[6]
addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0) addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0)
mulne r10, r8, r9 @ R10=W2*ROWr16[6] mulne r10, r8, r9 @ R10=W2*ROWr16[6]
...@@ -222,6 +231,7 @@ __a_evaluation: ...@@ -222,6 +231,7 @@ __a_evaluation:
@@ a1 -= W2*row[6]; @@ a1 -= W2*row[6];
@@ a2 += W2*row[6]; @@ a2 += W2*row[6];
subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3) subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3)
itt ne
subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1) subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1)
addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2) addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2)
...@@ -323,10 +333,12 @@ __b_evaluation2: ...@@ -323,10 +333,12 @@ __b_evaluation2:
ldrsh r2, [r14, #48] ldrsh r2, [r14, #48]
mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
teq r2, #0 @ if 0, then avoid muls teq r2, #0 @ if 0, then avoid muls
itttt ne
mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
rsbne r2, r2, #0 @ R2=-ROWr16[3] rsbne r2, r2, #0 @ R2=-ROWr16[3]
mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
it ne
mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
@@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free), @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
...@@ -342,18 +354,22 @@ __b_evaluation2: ...@@ -342,18 +354,22 @@ __b_evaluation2:
@@ MAC16(b1, -W5, col[7x8]); @@ MAC16(b1, -W5, col[7x8]);
ldrsh r3, [r14, #80] @ R3=COLr16[5x8] ldrsh r3, [r14, #80] @ R3=COLr16[5x8]
teq r3, #0 @ if 0 then avoid muls teq r3, #0 @ if 0 then avoid muls
itttt ne
mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5x8]=b0 mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5x8]=b0
mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5x8]=b2 mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5x8]=b2
mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5x8]=b3 mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5x8]=b3
rsbne r3, r3, #0 @ R3=-ROWr16[5x8] rsbne r3, r3, #0 @ R3=-ROWr16[5x8]
ldrsh r4, [r14, #112] @ R4=COLr16[7x8] ldrsh r4, [r14, #112] @ R4=COLr16[7x8]
it ne
mlane r1, r8, r3, r1 @ R7-=W1*ROWr16[5x8]=b1 mlane r1, r8, r3, r1 @ R7-=W1*ROWr16[5x8]=b1
@@ R3 is free now @@ R3 is free now
teq r4, #0 @ if 0 then avoid muls teq r4, #0 @ if 0 then avoid muls
itttt ne
mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7x8]=b0 mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7x8]=b0
mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7x8]=b2 mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7x8]=b2
rsbne r4, r4, #0 @ R4=-ROWr16[7x8] rsbne r4, r4, #0 @ R4=-ROWr16[7x8]
mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7x8]=b3 mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7x8]=b3
it ne
mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7x8]=b1 mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7x8]=b1
@@ R4 is free now @@ R4 is free now
__end_b_evaluation2: __end_b_evaluation2:
...@@ -390,15 +406,18 @@ __a_evaluation2: ...@@ -390,15 +406,18 @@ __a_evaluation2:
@@ a3 += W4*row[4] @@ a3 += W4*row[4]
ldrsh r11, [r14, #64] @ R11=ROWr16[4] ldrsh r11, [r14, #64] @ R11=ROWr16[4]
teq r11, #0 @ if null avoid muls teq r11, #0 @ if null avoid muls
itttt ne
mulne r11, r9, r11 @ R11=W4*ROWr16[4] mulne r11, r9, r11 @ R11=W4*ROWr16[4]
@@ R9 is free now @@ R9 is free now
addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0) addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0)
subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1) subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1)
subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2) subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2)
ldrsh r9, [r14, #96] @ R9=ROWr16[6] ldrsh r9, [r14, #96] @ R9=ROWr16[6]
it ne
addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3) addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3)
@@ W6 alone is no more useful, save W2*ROWr16[6] in it instead @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead
teq r9, #0 @ if null avoid muls teq r9, #0 @ if null avoid muls
itttt ne
mulne r11, r10, r9 @ R11=W6*ROWr16[6] mulne r11, r10, r9 @ R11=W6*ROWr16[6]
addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0) addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0)
mulne r10, r8, r9 @ R10=W2*ROWr16[6] mulne r10, r8, r9 @ R10=W2*ROWr16[6]
...@@ -407,6 +426,7 @@ __a_evaluation2: ...@@ -407,6 +426,7 @@ __a_evaluation2:
@@ a1 -= W2*row[6]; @@ a1 -= W2*row[6];
@@ a2 += W2*row[6]; @@ a2 += W2*row[6];
subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3) subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3)
itt ne
subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1) subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1)
addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2) addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2)
__end_a_evaluation2: __end_a_evaluation2:
......
...@@ -49,6 +49,7 @@ function idct_row_armv5te ...@@ -49,6 +49,7 @@ function idct_row_armv5te
ldrd v1, [a1, #8] ldrd v1, [a1, #8]
ldrd a3, [a1] /* a3 = row[1:0], a4 = row[3:2] */ ldrd a3, [a1] /* a3 = row[1:0], a4 = row[3:2] */
orrs v1, v1, v2 orrs v1, v1, v2
itt eq
cmpeq v1, a4 cmpeq v1, a4
cmpeq v1, a3, lsr #16 cmpeq v1, a3, lsr #16
beq row_dc_only beq row_dc_only
...@@ -269,6 +270,7 @@ function idct_col_armv5te ...@@ -269,6 +270,7 @@ function idct_col_armv5te
ldmfd sp!, {a3, a4} ldmfd sp!, {a3, a4}
adds a2, a3, v1 adds a2, a3, v1
mov a2, a2, lsr #20 mov a2, a2, lsr #20
it mi
orrmi a2, a2, #0xf000 orrmi a2, a2, #0xf000
add ip, a4, v2 add ip, a4, v2
mov ip, ip, asr #20 mov ip, ip, asr #20
...@@ -276,6 +278,7 @@ function idct_col_armv5te ...@@ -276,6 +278,7 @@ function idct_col_armv5te
str a2, [a1] str a2, [a1]
subs a3, a3, v1 subs a3, a3, v1
mov a2, a3, lsr #20 mov a2, a3, lsr #20
it mi
orrmi a2, a2, #0xf000 orrmi a2, a2, #0xf000
sub a4, a4, v2 sub a4, a4, v2
mov a4, a4, asr #20 mov a4, a4, asr #20
...@@ -285,6 +288,7 @@ function idct_col_armv5te ...@@ -285,6 +288,7 @@ function idct_col_armv5te
subs a2, a3, v3 subs a2, a3, v3
mov a2, a2, lsr #20 mov a2, a2, lsr #20
it mi
orrmi a2, a2, #0xf000 orrmi a2, a2, #0xf000
sub ip, a4, v4 sub ip, a4, v4
mov ip, ip, asr #20 mov ip, ip, asr #20
...@@ -292,6 +296,7 @@ function idct_col_armv5te ...@@ -292,6 +296,7 @@ function idct_col_armv5te
str a2, [a1, #(16*1)] str a2, [a1, #(16*1)]
adds a3, a3, v3 adds a3, a3, v3
mov a2, a3, lsr #20 mov a2, a3, lsr #20
it mi
orrmi a2, a2, #0xf000 orrmi a2, a2, #0xf000
add a4, a4, v4 add a4, a4, v4
mov a4, a4, asr #20 mov a4, a4, asr #20
...@@ -301,6 +306,7 @@ function idct_col_armv5te ...@@ -301,6 +306,7 @@ function idct_col_armv5te
adds a2, a3, v5 adds a2, a3, v5
mov a2, a2, lsr #20 mov a2, a2, lsr #20
it mi
orrmi a2, a2, #0xf000 orrmi a2, a2, #0xf000
add ip, a4, v6 add ip, a4, v6
mov ip, ip, asr #20 mov ip, ip, asr #20
...@@ -308,6 +314,7 @@ function idct_col_armv5te ...@@ -308,6 +314,7 @@ function idct_col_armv5te
str a2, [a1, #(16*2)] str a2, [a1, #(16*2)]
subs a3, a3, v5 subs a3, a3, v5
mov a2, a3, lsr #20 mov a2, a3, lsr #20
it mi
orrmi a2, a2, #0xf000 orrmi a2, a2, #0xf000
sub a4, a4, v6 sub a4, a4, v6
mov a4, a4, asr #20 mov a4, a4, asr #20
...@@ -317,6 +324,7 @@ function idct_col_armv5te ...@@ -317,6 +324,7 @@ function idct_col_armv5te
adds a2, a3, v7 adds a2, a3, v7
mov a2, a2, lsr #20 mov a2, a2, lsr #20
it mi
orrmi a2, a2, #0xf000 orrmi a2, a2, #0xf000
add ip, a4, fp add ip, a4, fp
mov ip, ip, asr #20 mov ip, ip, asr #20
...@@ -324,6 +332,7 @@ function idct_col_armv5te ...@@ -324,6 +332,7 @@ function idct_col_armv5te
str a2, [a1, #(16*3)] str a2, [a1, #(16*3)]
subs a3, a3, v7 subs a3, a3, v7
mov a2, a3, lsr #20 mov a2, a3, lsr #20
it mi
orrmi a2, a2, #0xf000 orrmi a2, a2, #0xf000
sub a4, a4, fp sub a4, a4, fp
mov a4, a4, asr #20 mov a4, a4, asr #20
...@@ -335,15 +344,19 @@ endfunc ...@@ -335,15 +344,19 @@ endfunc
.macro clip dst, src:vararg .macro clip dst, src:vararg
movs \dst, \src movs \dst, \src
it mi
movmi \dst, #0 movmi \dst, #0
cmp \dst, #255 cmp \dst, #255
it gt
movgt \dst, #255 movgt \dst, #255
.endm .endm
.macro aclip dst, src:vararg .macro aclip dst, src:vararg
adds \dst, \src adds \dst, \src
it mi
movmi \dst, #0 movmi \dst, #0
cmp \dst, #255 cmp \dst, #255
it gt
movgt \dst, #255 movgt \dst, #255
.endm .endm
...@@ -370,35 +383,35 @@ function idct_col_put_armv5te ...@@ -370,35 +383,35 @@ function idct_col_put_armv5te
orr a2, a3, a4, lsl #8 orr a2, a3, a4, lsl #8
rsb v2, lr, lr, lsl #3 rsb v2, lr, lr, lsl #3
ldmfd sp!, {a3, a4} ldmfd sp!, {a3, a4}
strh a2, [v2, v1]! strh_pre a2, v2, v1
sub a2, a3, v3 sub a2, a3, v3
clip a2, a2, asr #20 clip a2, a2, asr #20
sub ip, a4, v4 sub ip, a4, v4
clip ip, ip, asr #20 clip ip, ip, asr #20
orr a2, a2, ip, lsl #8 orr a2, a2, ip, lsl #8
strh a2, [v1, lr]! strh_pre a2, v1, lr
add a3, a3, v3 add a3, a3, v3
clip a2, a3, asr #20 clip a2, a3, asr #20
add a4, a4, v4 add a4, a4, v4
clip a4, a4, asr #20 clip a4, a4, asr #20
orr a2, a2, a4, lsl #8 orr a2, a2, a4, lsl #8
ldmfd sp!, {a3, a4} ldmfd sp!, {a3, a4}
strh a2, [v2, -lr]! strh_dpre a2, v2, lr
add a2, a3, v5 add a2, a3, v5
clip a2, a2, asr #20 clip a2, a2, asr #20
add ip, a4, v6 add ip, a4, v6
clip ip, ip, asr #20 clip ip, ip, asr #20
orr a2, a2, ip, lsl #8 orr a2, a2, ip, lsl #8
strh a2, [v1, lr]! strh_pre a2, v1, lr
sub a3, a3, v5 sub a3, a3, v5
clip a2, a3, asr #20 clip a2, a3, asr #20
sub a4, a4, v6 sub a4, a4, v6
clip a4, a4, asr #20 clip a4, a4, asr #20
orr a2, a2, a4, lsl #8 orr a2, a2, a4, lsl #8
ldmfd sp!, {a3, a4} ldmfd sp!, {a3, a4}
strh a2, [v2, -lr]! strh_dpre a2, v2, lr
add a2, a3, v7 add a2, a3, v7
clip a2, a2, asr #20 clip a2, a2, asr #20
...@@ -411,7 +424,7 @@ function idct_col_put_armv5te ...@@ -411,7 +424,7 @@ function idct_col_put_armv5te
sub a4, a4, fp sub a4, a4, fp
clip a4, a4, asr #20 clip a4, a4, asr #20
orr a2, a2, a4, lsl #8 orr a2, a2, a4, lsl #8
strh a2, [v2, -lr] strh_dpre a2, v2, lr
ldr pc, [sp], #4 ldr pc, [sp], #4
endfunc endfunc
...@@ -436,7 +449,7 @@ function idct_col_add_armv5te ...@@ -436,7 +449,7 @@ function idct_col_add_armv5te
ldr v1, [sp, #32] ldr v1, [sp, #32]
sub a4, a4, v2 sub a4, a4, v2
rsb v2, v1, v1, lsl #3 rsb v2, v1, v1, lsl #3
ldrh ip, [v2, lr]! ldrh_pre ip, v2, lr
strh a2, [lr] strh a2, [lr]
and a2, ip, #255 and a2, ip, #255
aclip a3, a2, a3, asr #20 aclip a3, a2, a3, asr #20
...@@ -448,7 +461,7 @@ function idct_col_add_armv5te ...@@ -448,7 +461,7 @@ function idct_col_add_armv5te
strh a2, [v2] strh a2, [v2]
ldmfd sp!, {a3, a4} ldmfd sp!, {a3, a4}
ldrh ip, [lr, v1]! ldrh_pre ip, lr, v1
sub a2, a3, v3 sub a2, a3, v3
add a3, a3, v3 add a3, a3, v3
and v3, ip, #255 and v3, ip, #255
...@@ -458,7 +471,7 @@ function idct_col_add_armv5te ...@@ -458,7 +471,7 @@ function idct_col_add_armv5te
aclip v3, v3, ip, lsr #8 aclip v3, v3, ip, lsr #8
orr a2, a2, v3, lsl #8 orr a2, a2, v3, lsl #8
add a4, a4, v4 add a4, a4, v4
ldrh ip, [v2, -v1]! ldrh_dpre ip, v2, v1
strh a2, [lr] strh a2, [lr]
and a2, ip, #255 and a2, ip, #255
aclip a3, a2, a3, asr #20 aclip a3, a2, a3, asr #20
...@@ -468,7 +481,7 @@ function idct_col_add_armv5te ...@@ -468,7 +481,7 @@ function idct_col_add_armv5te
strh a2, [v2] strh a2, [v2]
ldmfd sp!, {a3, a4} ldmfd sp!, {a3, a4}
ldrh ip, [lr, v1]! ldrh_pre ip, lr, v1
add a2, a3, v5 add a2, a3, v5
sub a3, a3, v5 sub a3, a3, v5
and v3, ip, #255 and v3, ip, #255
...@@ -478,7 +491,7 @@ function idct_col_add_armv5te ...@@ -478,7 +491,7 @@ function idct_col_add_armv5te
aclip v3, v3, ip, lsr #8 aclip v3, v3, ip, lsr #8
orr a2, a2, v3, lsl #8 orr a2, a2, v3, lsl #8
sub a4, a4, v6 sub a4, a4, v6
ldrh ip, [v2, -v1]! ldrh_dpre ip, v2, v1
strh a2, [lr] strh a2, [lr]
and a2, ip, #255 and a2, ip, #255
aclip a3, a2, a3, asr #20 aclip a3, a2, a3, asr #20
...@@ -488,7 +501,7 @@ function idct_col_add_armv5te ...@@ -488,7 +501,7 @@ function idct_col_add_armv5te
strh a2, [v2] strh a2, [v2]
ldmfd sp!, {a3, a4} ldmfd sp!, {a3, a4}
ldrh ip, [lr, v1]! ldrh_pre ip, lr, v1
add a2, a3, v7 add a2, a3, v7
sub a3, a3, v7 sub a3, a3, v7
and v3, ip, #255 and v3, ip, #255
...@@ -498,7 +511,7 @@ function idct_col_add_armv5te ...@@ -498,7 +511,7 @@ function idct_col_add_armv5te
aclip v3, v3, ip, lsr #8 aclip v3, v3, ip, lsr #8
orr a2, a2, v3, lsl #8 orr a2, a2, v3, lsl #8
sub a4, a4, fp sub a4, a4, fp
ldrh ip, [v2, -v1]! ldrh_dpre ip, v2, v1
strh a2, [lr] strh a2, [lr]
and a2, ip, #255 and a2, ip, #255
aclip a3, a2, a3, asr #20 aclip a3, a2, a3, asr #20
......
...@@ -200,6 +200,7 @@ function idct_row_armv6 ...@@ -200,6 +200,7 @@ function idct_row_armv6
ldr r3, [r0, #8] /* r3 = row[3,1] */ ldr r3, [r0, #8] /* r3 = row[3,1] */
ldr r2, [r0] /* r2 = row[2,0] */ ldr r2, [r0] /* r2 = row[2,0] */
orrs lr, lr, ip orrs lr, lr, ip
itt eq
cmpeq lr, r3 cmpeq lr, r3
cmpeq lr, r2, lsr #16 cmpeq lr, r2, lsr #16
beq 1f beq 1f
...@@ -282,14 +283,14 @@ function idct_col_put_armv6 ...@@ -282,14 +283,14 @@ function idct_col_put_armv6
pop {r1, r2} pop {r1, r2}
idct_finish_shift_sat COL_SHIFT idct_finish_shift_sat COL_SHIFT
strb r4, [r1], r2 strb_post r4, r1, r2
strb r5, [r1], r2 strb_post r5, r1, r2
strb r6, [r1], r2 strb_post r6, r1, r2
strb r7, [r1], r2 strb_post r7, r1, r2
strb r11,[r1], r2 strb_post r11,r1, r2
strb r10,[r1], r2 strb_post r10,r1, r2
strb r9, [r1], r2 strb_post r9, r1, r2
strb r8, [r1], r2 strb_post r8, r1, r2
sub r1, r1, r2, lsl #3 sub r1, r1, r2, lsl #3
...@@ -318,16 +319,16 @@ function idct_col_add_armv6 ...@@ -318,16 +319,16 @@ function idct_col_add_armv6
add ip, r3, ip, asr #COL_SHIFT add ip, r3, ip, asr #COL_SHIFT
usat ip, #8, ip usat ip, #8, ip
add r4, r7, r4, asr #COL_SHIFT add r4, r7, r4, asr #COL_SHIFT
strb ip, [r1], r2 strb_post ip, r1, r2
ldrb ip, [r1, r2] ldrb ip, [r1, r2]
usat r4, #8, r4 usat r4, #8, r4
ldrb r11,[r1, r2, lsl #2] ldrb r11,[r1, r2, lsl #2]
add r5, ip, r5, asr #COL_SHIFT add r5, ip, r5, asr #COL_SHIFT
usat r5, #8, r5 usat r5, #8, r5
strb r4, [r1], r2 strb_post r4, r1, r2
ldrb r3, [r1, r2] ldrb r3, [r1, r2]
ldrb ip, [r1, r2, lsl #2] ldrb ip, [r1, r2, lsl #2]
strb r5, [r1], r2 strb_post r5, r1, r2
ldrb r7, [r1, r2] ldrb r7, [r1, r2]
ldrb r4, [r1, r2, lsl #2] ldrb r4, [r1, r2, lsl #2]
add r6, r3, r6, asr #COL_SHIFT add r6, r3, r6, asr #COL_SHIFT
...@@ -340,11 +341,11 @@ function idct_col_add_armv6 ...@@ -340,11 +341,11 @@ function idct_col_add_armv6
usat r8, #8, r8 usat r8, #8, r8
add lr, r4, lr, asr #COL_SHIFT add lr, r4, lr, asr #COL_SHIFT
usat lr, #8, lr usat lr, #8, lr
strb r6, [r1], r2 strb_post r6, r1, r2
strb r10,[r1], r2 strb_post r10,r1, r2
strb r9, [r1], r2 strb_post r9, r1, r2
strb r8, [r1], r2 strb_post r8, r1, r2
strb lr, [r1], r2 strb_post lr, r1, r2
sub r1, r1, r2, lsl #3 sub r1, r1, r2, lsl #3
......
...@@ -71,7 +71,7 @@ function idct_row4_pld_neon ...@@ -71,7 +71,7 @@ function idct_row4_pld_neon
add r3, r0, r1, lsl #2 add r3, r0, r1, lsl #2
pld [r0, r1] pld [r0, r1]
pld [r0, r1, lsl #1] pld [r0, r1, lsl #1]
pld [r3, -r1] A pld [r3, -r1]
pld [r3] pld [r3]
pld [r3, r1] pld [r3, r1]
add r3, r3, r1, lsl #1 add r3, r3, r1, lsl #1
...@@ -164,6 +164,7 @@ function idct_col4_neon ...@@ -164,6 +164,7 @@ function idct_col4_neon
orrs r4, r4, r5 orrs r4, r4, r5
idct_col4_top idct_col4_top
it eq
addeq r2, r2, #16 addeq r2, r2, #16
beq 1f beq 1f
...@@ -176,6 +177,7 @@ function idct_col4_neon ...@@ -176,6 +177,7 @@ function idct_col4_neon
1: orrs r6, r6, r7 1: orrs r6, r6, r7
ldrd r4, [r2, #16] ldrd r4, [r2, #16]
it eq
addeq r2, r2, #16 addeq r2, r2, #16
beq 2f beq 2f
...@@ -187,6 +189,7 @@ function idct_col4_neon ...@@ -187,6 +189,7 @@ function idct_col4_neon
2: orrs r4, r4, r5 2: orrs r4, r4, r5
ldrd r4, [r2, #16] ldrd r4, [r2, #16]
it eq
addeq r2, r2, #16 addeq r2, r2, #16
beq 3f beq 3f
...@@ -199,6 +202,7 @@ function idct_col4_neon ...@@ -199,6 +202,7 @@ function idct_col4_neon
vadd.i32 q13, q13, q8 vadd.i32 q13, q13, q8
3: orrs r4, r4, r5 3: orrs r4, r4, r5
it eq
addeq r2, r2, #16 addeq r2, r2, #16
beq 4f beq 4f
......
...@@ -100,9 +100,11 @@ NOVFP vldr s0, [sp, #12*4] @ scale ...@@ -100,9 +100,11 @@ NOVFP vldr s0, [sp, #12*4] @ scale
vst1.32 {q9}, [r2,:128] vst1.32 {q9}, [r2,:128]
subs r1, r1, #1 subs r1, r1, #1
it eq
popeq {r4-r11,pc} popeq {r4-r11,pc}
cmp r4, #0 cmp r4, #0
itt eq
subeq r8, r8, #512*4 subeq r8, r8, #512*4
subeq r9, r9, #512*4 subeq r9, r9, #512*4
sub r5, r5, #512*4 sub r5, r5, #512*4
......
...@@ -21,6 +21,14 @@ ...@@ -21,6 +21,14 @@
#ifndef AVCODEC_ARM_VP56_ARITH_H #ifndef AVCODEC_ARM_VP56_ARITH_H
#define AVCODEC_ARM_VP56_ARITH_H #define AVCODEC_ARM_VP56_ARITH_H
#if CONFIG_THUMB
# define A(x)
# define T(x) x
#else
# define A(x) x
# define T(x)
#endif
#if HAVE_ARMV6 && HAVE_INLINE_ASM #if HAVE_ARMV6 && HAVE_INLINE_ASM
#define vp56_rac_get_prob vp56_rac_get_prob_armv6 #define vp56_rac_get_prob vp56_rac_get_prob_armv6
...@@ -32,15 +40,21 @@ static inline int vp56_rac_get_prob_armv6(VP56RangeCoder *c, int pr) ...@@ -32,15 +40,21 @@ static inline int vp56_rac_get_prob_armv6(VP56RangeCoder *c, int pr)
unsigned bit; unsigned bit;
__asm__ ("adds %3, %3, %0 \n" __asm__ ("adds %3, %3, %0 \n"
"itt cs \n"
"cmpcs %7, %4 \n" "cmpcs %7, %4 \n"
"ldrcsh %2, [%4], #2 \n" A("ldrcsh %2, [%4], #2 \n")
T("ldrhcs %2, [%4], #2 \n")
"rsb %0, %6, #256 \n" "rsb %0, %6, #256 \n"
"smlabb %0, %5, %6, %0 \n" "smlabb %0, %5, %6, %0 \n"
T("itttt cs \n")
"rev16cs %2, %2 \n" "rev16cs %2, %2 \n"
"orrcs %1, %1, %2, lsl %3 \n" T("lslcs %2, %2, %3 \n")
T("orrcs %1, %1, %2 \n")
A("orrcs %1, %1, %2, lsl %3 \n")
"subcs %3, %3, #16 \n" "subcs %3, %3, #16 \n"
"lsr %0, %0, #8 \n" "lsr %0, %0, #8 \n"
"cmp %1, %0, lsl #16 \n" "cmp %1, %0, lsl #16 \n"
"ittte ge \n"
"subge %1, %1, %0, lsl #16 \n" "subge %1, %1, %0, lsl #16 \n"
"subge %0, %5, %0 \n" "subge %0, %5, %0 \n"
"movge %2, #1 \n" "movge %2, #1 \n"
...@@ -64,12 +78,17 @@ static inline int vp56_rac_get_prob_branchy_armv6(VP56RangeCoder *c, int pr) ...@@ -64,12 +78,17 @@ static inline int vp56_rac_get_prob_branchy_armv6(VP56RangeCoder *c, int pr)
unsigned tmp; unsigned tmp;
__asm__ ("adds %3, %3, %0 \n" __asm__ ("adds %3, %3, %0 \n"
"itt cs \n"
"cmpcs %7, %4 \n" "cmpcs %7, %4 \n"
"ldrcsh %2, [%4], #2 \n" A("ldrcsh %2, [%4], #2 \n")
T("ldrhcs %2, [%4], #2 \n")
"rsb %0, %6, #256 \n" "rsb %0, %6, #256 \n"
"smlabb %0, %5, %6, %0 \n" "smlabb %0, %5, %6, %0 \n"
T("itttt cs \n")
"rev16cs %2, %2 \n" "rev16cs %2, %2 \n"
"orrcs %1, %1, %2, lsl %3 \n" T("lslcs %2, %2, %3 \n")
T("orrcs %1, %1, %2 \n")
A("orrcs %1, %1, %2, lsl %3 \n")
"subcs %3, %3, #16 \n" "subcs %3, %3, #16 \n"
"lsr %0, %0, #8 \n" "lsr %0, %0, #8 \n"
"lsl %2, %0, #16 \n" "lsl %2, %0, #16 \n"
......
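Thumb-2 data-processing instructions only accept an immediate shift amount in the second operand, so orrcs %1, %1, %2, lsl %3 (shift by a register) has no Thumb encoding; the T() path therefore performs the shift as a separate lslcs before the orrcs. Roughly, with plain registers standing in for the asm operands:

    @ ARM: the second operand may be shifted by a register
    orrcs   r1, r1, r2, lsl r3

    @ Thumb-2: shift-by-register is its own instruction, so predicate both
    itt     cs
    lslcs   r2, r2, r3
    orrcs   r1, r1, r2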
...@@ -25,13 +25,18 @@ ...@@ -25,13 +25,18 @@
lsl \cw, \cw, \t0 lsl \cw, \cw, \t0
lsl \t0, \h, \t0 lsl \t0, \h, \t0
rsb \h, \pr, #256 rsb \h, \pr, #256
it cs
ldrhcs \t1, [\buf], #2 ldrhcs \t1, [\buf], #2
smlabb \h, \t0, \pr, \h smlabb \h, \t0, \pr, \h
T itttt cs
rev16cs \t1, \t1 rev16cs \t1, \t1
orrcs \cw, \cw, \t1, lsl \bs A orrcs \cw, \cw, \t1, lsl \bs
T lslcs \t1, \t1, \bs
T orrcs \cw, \cw, \t1
subcs \bs, \bs, #16 subcs \bs, \bs, #16
lsr \h, \h, #8 lsr \h, \h, #8
cmp \cw, \h, lsl #16 cmp \cw, \h, lsl #16
itt ge
subge \cw, \cw, \h, lsl #16 subge \cw, \cw, \h, lsl #16
subge \h, \t0, \h subge \h, \t0, \h
.endm .endm
...@@ -40,14 +45,20 @@ ...@@ -40,14 +45,20 @@
adds \bs, \bs, \t0 adds \bs, \bs, \t0
lsl \cw, \cw, \t0 lsl \cw, \cw, \t0
lsl \t0, \h, \t0 lsl \t0, \h, \t0
it cs
ldrhcs \t1, [\buf], #2 ldrhcs \t1, [\buf], #2
mov \h, #128 mov \h, #128
it cs
rev16cs \t1, \t1 rev16cs \t1, \t1
add \h, \h, \t0, lsl #7 add \h, \h, \t0, lsl #7
orrcs \cw, \cw, \t1, lsl \bs A orrcs \cw, \cw, \t1, lsl \bs
T ittt cs
T lslcs \t1, \t1, \bs
T orrcs \cw, \cw, \t1
subcs \bs, \bs, #16 subcs \bs, \bs, #16
lsr \h, \h, #8 lsr \h, \h, #8
cmp \cw, \h, lsl #16 cmp \cw, \h, lsl #16
itt ge
subge \cw, \cw, \h, lsl #16 subge \cw, \cw, \h, lsl #16
subge \h, \t0, \h subge \h, \t0, \h
.endm .endm
...@@ -59,6 +70,7 @@ function ff_decode_block_coeffs_armv6, export=1 ...@@ -59,6 +70,7 @@ function ff_decode_block_coeffs_armv6, export=1
cmp r3, #0 cmp r3, #0
ldr r11, [r5] ldr r11, [r5]
ldm r0, {r5-r7} @ high, bits, buf ldm r0, {r5-r7} @ high, bits, buf
it ne
pkhtbne r11, r11, r11, asr #16 pkhtbne r11, r11, r11, asr #16
ldr r8, [r0, #16] @ code_word ldr r8, [r0, #16] @ code_word
0: 0:
...@@ -80,19 +92,26 @@ function ff_decode_block_coeffs_armv6, export=1 ...@@ -80,19 +92,26 @@ function ff_decode_block_coeffs_armv6, export=1
adds r6, r6, r9 adds r6, r6, r9
add r4, r4, #11 add r4, r4, #11
lsl r8, r8, r9 lsl r8, r8, r9
it cs
ldrhcs r10, [r7], #2 ldrhcs r10, [r7], #2
lsl r9, r5, r9 lsl r9, r5, r9
mov r5, #128 mov r5, #128
it cs
rev16cs r10, r10 rev16cs r10, r10
add r5, r5, r9, lsl #7 add r5, r5, r9, lsl #7
orrcs r8, r8, r10, lsl r6 T ittt cs
T lslcs r10, r10, r6
T orrcs r8, r8, r10
A orrcs r8, r8, r10, lsl r6
subcs r6, r6, #16 subcs r6, r6, #16
lsr r5, r5, #8 lsr r5, r5, #8
cmp r8, r5, lsl #16 cmp r8, r5, lsl #16
movrel r10, zigzag_scan-1 movrel r10, zigzag_scan-1
itt ge
subge r8, r8, r5, lsl #16 subge r8, r8, r5, lsl #16
subge r5, r9, r5 subge r5, r9, r5
ldrb r10, [r10, r3] ldrb r10, [r10, r3]
it ge
rsbge r12, r12, #0 rsbge r12, r12, #0
cmp r3, #16 cmp r3, #16
strh r12, [r1, r10] strh r12, [r1, r10]
...@@ -108,6 +127,7 @@ function ff_decode_block_coeffs_armv6, export=1 ...@@ -108,6 +127,7 @@ function ff_decode_block_coeffs_armv6, export=1
ldr r0, [sp] ldr r0, [sp]
ldr r9, [r0, #12] ldr r9, [r0, #12]
cmp r7, r9 cmp r7, r9
it hi
movhi r7, r9 movhi r7, r9
stm r0, {r5-r7} @ high, bits, buf stm r0, {r5-r7} @ high, bits, buf
str r8, [r0, #16] @ code_word str r8, [r0, #16] @ code_word
...@@ -131,11 +151,13 @@ function ff_decode_block_coeffs_armv6, export=1 ...@@ -131,11 +151,13 @@ function ff_decode_block_coeffs_armv6, export=1
mov r12, #2 mov r12, #2
ldrb r0, [r4, #4] ldrb r0, [r4, #4]
rac_get_prob r5, r6, r7, r8, r0, r9, r10 rac_get_prob r5, r6, r7, r8, r0, r9, r10
it ge
addge r12, #1 addge r12, #1
ldrb r9, [lr, r5] ldrb r9, [lr, r5]
blt 4f blt 4f
ldrb r0, [r4, #5] ldrb r0, [r4, #5]
rac_get_prob r5, r6, r7, r8, r0, r9, r10 rac_get_prob r5, r6, r7, r8, r0, r9, r10
it ge
addge r12, #1 addge r12, #1
ldrb r9, [lr, r5] ldrb r9, [lr, r5]
b 4f b 4f
...@@ -153,6 +175,7 @@ function ff_decode_block_coeffs_armv6, export=1 ...@@ -153,6 +175,7 @@ function ff_decode_block_coeffs_armv6, export=1
mov r12, #5 mov r12, #5
mov r0, #159 mov r0, #159
rac_get_prob r5, r6, r7, r8, r0, r9, r10 rac_get_prob r5, r6, r7, r8, r0, r9, r10
it ge
addge r12, r12, #1 addge r12, r12, #1
ldrb r9, [lr, r5] ldrb r9, [lr, r5]
b 4f b 4f
...@@ -160,23 +183,28 @@ function ff_decode_block_coeffs_armv6, export=1 ...@@ -160,23 +183,28 @@ function ff_decode_block_coeffs_armv6, export=1
mov r12, #7 mov r12, #7
mov r0, #165 mov r0, #165
rac_get_prob r5, r6, r7, r8, r0, r9, r10 rac_get_prob r5, r6, r7, r8, r0, r9, r10
it ge
addge r12, r12, #2 addge r12, r12, #2
ldrb r9, [lr, r5] ldrb r9, [lr, r5]
mov r0, #145 mov r0, #145
rac_get_prob r5, r6, r7, r8, r0, r9, r10 rac_get_prob r5, r6, r7, r8, r0, r9, r10
it ge
addge r12, r12, #1 addge r12, r12, #1
ldrb r9, [lr, r5] ldrb r9, [lr, r5]
b 4f b 4f
3: 3:
ldrb r0, [r4, #8] ldrb r0, [r4, #8]
rac_get_prob r5, r6, r7, r8, r0, r9, r10 rac_get_prob r5, r6, r7, r8, r0, r9, r10
it ge
addge r4, r4, #1 addge r4, r4, #1
ldrb r9, [lr, r5] ldrb r9, [lr, r5]
ite ge
movge r12, #2 movge r12, #2
movlt r12, #0 movlt r12, #0
ldrb r0, [r4, #9] ldrb r0, [r4, #9]
rac_get_prob r5, r6, r7, r8, r0, r9, r10 rac_get_prob r5, r6, r7, r8, r0, r9, r10
mov r9, #8 mov r9, #8
it ge
addge r12, r12, #1 addge r12, r12, #1
movrel r4, X(ff_vp8_dct_cat_prob) movrel r4, X(ff_vp8_dct_cat_prob)
lsl r9, r9, r12 lsl r9, r9, r12
...@@ -189,6 +217,7 @@ function ff_decode_block_coeffs_armv6, export=1 ...@@ -189,6 +217,7 @@ function ff_decode_block_coeffs_armv6, export=1
lsl r1, r1, #1 lsl r1, r1, #1
rac_get_prob r5, r6, r7, r8, r0, r9, r10 rac_get_prob r5, r6, r7, r8, r0, r9, r10
ldrb r0, [r4], #1 ldrb r0, [r4], #1
it ge
addge r1, r1, #1 addge r1, r1, #1
cmp r0, #0 cmp r0, #0
bne 1b bne 1b
...@@ -200,6 +229,7 @@ function ff_decode_block_coeffs_armv6, export=1 ...@@ -200,6 +229,7 @@ function ff_decode_block_coeffs_armv6, export=1
add r4, r2, r4 add r4, r2, r4
add r4, r4, #22 add r4, r4, #22
rac_get_128 r5, r6, r7, r8, r9, r10 rac_get_128 r5, r6, r7, r8, r9, r10
it ge
rsbge r12, r12, #0 rsbge r12, r12, #0
smulbb r12, r12, r11 smulbb r12, r12, r11
movrel r9, zigzag_scan-1 movrel r9, zigzag_scan-1
......
...@@ -746,14 +746,14 @@ function ff_put_vp8_pixels4_neon, export=1 ...@@ -746,14 +746,14 @@ function ff_put_vp8_pixels4_neon, export=1
push {r4-r6,lr} push {r4-r6,lr}
1: 1:
subs r12, r12, #4 subs r12, r12, #4
ldr r4, [r2], r3 ldr_post r4, r2, r3
ldr r5, [r2], r3 ldr_post r5, r2, r3
ldr r6, [r2], r3 ldr_post r6, r2, r3
ldr lr, [r2], r3 ldr_post lr, r2, r3
str r4, [r0], r1 str_post r4, r0, r1
str r5, [r0], r1 str_post r5, r0, r1
str r6, [r0], r1 str_post r6, r0, r1
str lr, [r0], r1 str_post lr, r0, r1
bgt 1b bgt 1b
pop {r4-r6,pc} pop {r4-r6,pc}
endfunc endfunc
......
...@@ -36,6 +36,7 @@ static av_always_inline av_const int FASTDIV(int a, int b) ...@@ -36,6 +36,7 @@ static av_always_inline av_const int FASTDIV(int a, int b)
int r; int r;
__asm__ ("cmp %2, #2 \n\t" __asm__ ("cmp %2, #2 \n\t"
"ldr %0, [%3, %2, lsl #2] \n\t" "ldr %0, [%3, %2, lsl #2] \n\t"
"ite le \n\t"
"lsrle %0, %1, #1 \n\t" "lsrle %0, %1, #1 \n\t"
"smmulgt %0, %0, %1 \n\t" "smmulgt %0, %0, %1 \n\t"
: "=&r"(r) : "r"(a), "r"(b), "r"(ff_inverse) : "cc"); : "=&r"(r) : "r"(a), "r"(b), "r"(ff_inverse) : "cc");
...@@ -101,6 +102,7 @@ static av_always_inline av_const int32_t av_clipl_int32_arm(int64_t a) ...@@ -101,6 +102,7 @@ static av_always_inline av_const int32_t av_clipl_int32_arm(int64_t a)
{ {
int x, y; int x, y;
__asm__ ("adds %1, %R2, %Q2, lsr #31 \n\t" __asm__ ("adds %1, %R2, %Q2, lsr #31 \n\t"
"itet ne \n\t"
"mvnne %1, #1<<31 \n\t" "mvnne %1, #1<<31 \n\t"
"moveq %0, %Q2 \n\t" "moveq %0, %Q2 \n\t"
"eorne %0, %1, %R2, asr #31 \n\t" "eorne %0, %1, %R2, asr #31 \n\t"
......
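For the inline asm above, the IT pattern spells out, per following instruction, whether it uses the stated condition ('t') or its inverse ('e'); itet ne therefore matches the mvnne / moveq / eorne sequence it precedes. Schematically, with plain registers in place of the %-operands:

    itet    ne
    mvnne   r1, #1<<31      @ executed when ne
    moveq   r0, r2          @ executed when eq
    eorne   r0, r1, r3      @ executed when ne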