Commit 8986fddc authored by Mans Rullgard

ARM: allow building in Thumb2 mode

Signed-off-by: Mans Rullgard <mans@mansr.com>
parent 9cd7b854
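
In broad terms the port works as follows: configure gains a thumb option (added to CONFIG_LIST below, presumably selected with --enable-thumb) that passes -mthumb instead of -marm, and the assembly is rewritten so it builds in either mode. The central Thumb-2 constraint is that a conditionally executed instruction must be preceded by an IT (If-Then) instruction naming the conditions of up to four following instructions; in ARM mode the condition is encoded in each instruction and a unified-syntax assembler accepts the IT as a no-op. A minimal sketch, with illustrative registers, not part of the commit:

    cmp     r0, #0
    it      eq              @ required in Thumb-2, codeless in ARM mode
    moveq   r1, #0

    cmp     r2, r3
    ite     gt              @ then/else pair: GT, then its inverse LE
    movgt   r0, r2
    movle   r0, r3

This is why the bulk of the diff consists of it/itt/ite/itttt insertions in front of existing conditional instructions.
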
......@@ -967,6 +967,7 @@ CONFIG_LIST="
static
swscale
swscale_alpha
thumb
vaapi
vdpau
version3
......@@ -2607,7 +2608,7 @@ if enabled alpha; then
elif enabled arm; then
check_cflags -marm
enabled thumb && check_cflags -mthumb || check_cflags -marm
nogas=die
if check_cpp_condition stddef.h "defined __ARM_PCS_VFP"; then
......
......@@ -114,12 +114,15 @@ static inline float *VMUL4S(float *dst, const float *v, unsigned idx,
"vmov d1, %2, %3 \n\t"
"lsls %6, %6, #1 \n\t"
"and %0, %5, #1<<31 \n\t"
"it cs \n\t"
"lslcs %5, %5, #1 \n\t"
"lsls %6, %6, #1 \n\t"
"and %1, %5, #1<<31 \n\t"
"it cs \n\t"
"lslcs %5, %5, #1 \n\t"
"lsls %6, %6, #1 \n\t"
"and %2, %5, #1<<31 \n\t"
"it cs \n\t"
"lslcs %5, %5, #1 \n\t"
"vmov d4, %0, %1 \n\t"
"and %3, %5, #1<<31 \n\t"
......
......@@ -27,6 +27,7 @@ function ff_ac3_update_bap_counts_arm, export=1
lsl r3, lr, #1
ldrh r12, [r0, r3]
subs r2, r2, #1
it gt
ldrbgt lr, [r1], #1
add r12, r12, #1
strh r12, [r0, r3]
......
......@@ -42,9 +42,11 @@ function ff_ac3_bit_alloc_calc_bap_armv6, export=1
mov r11, r10
ldrb r10, [r4], #1 @ band_start_tab[band++]
subs r9, r9, r5 @ - floor
it lt
movlt r9, #0
cmp r10, r3 @ - end
and r9, r9, r8 @ & 0x1fe0
ite gt
subgt r8, r3, r11
suble r8, r10, r11
add r9, r9, r5 @ + floor => m
......
......@@ -41,6 +41,7 @@ endfunc
function ff_ac3_exponent_min_neon, export=1
cmp r1, #0
it eq
bxeq lr
push {lr}
mov r12, #256
......
......@@ -24,9 +24,18 @@
# define ELF
#else
# define ELF @
#endif
#if CONFIG_THUMB
# define A @
# define T
#else
# define A
# define T @
#endif
.syntax unified
T .thumb
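
The A and T markers rely on @ starting a comment in GNU ARM assembly: under CONFIG_THUMB, A expands to @ so A-prefixed lines are commented out and only T lines assemble, and vice versa in ARM mode. A hypothetical use, not taken from the diff:

A   ldr     r0, [r1], r2    @ ARM only: post-index by register
T   ldr     r0, [r1]        @ Thumb-2: no register post-index exists,
T   add     r1, r1, r2      @ so load, then advance the pointer
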
.macro require8 val=1
ELF .eabi_attribute 24, \val
......@@ -82,6 +91,90 @@ ELF .size \name, . - \name
#endif
.endm
.macro ldr_pre rt, rn, rm:vararg
A ldr \rt, [\rn, \rm]!
T add \rn, \rn, \rm
T ldr \rt, [\rn]
.endm
.macro ldr_post rt, rn, rm:vararg
A ldr \rt, [\rn], \rm
T ldr \rt, [\rn]
T add \rn, \rn, \rm
.endm
.macro ldrd_reg rt, rt2, rn, rm
A ldrd \rt, \rt2, [\rn, \rm]
T add \rt, \rn, \rm
T ldrd \rt, \rt2, [\rt]
.endm
.macro ldrd_post rt, rt2, rn, rm
A ldrd \rt, \rt2, [\rn], \rm
T ldrd \rt, \rt2, [\rn]
T add \rn, \rn, \rm
.endm
.macro ldrh_pre rt, rn, rm
A ldrh \rt, [\rn, \rm]!
T add \rn, \rn, \rm
T ldrh \rt, [\rn]
.endm
.macro ldrh_dpre rt, rn, rm
A ldrh \rt, [\rn, -\rm]!
T sub \rn, \rn, \rm
T ldrh \rt, [\rn]
.endm
.macro ldrh_post rt, rn, rm
A ldrh \rt, [\rn], \rm
T ldrh \rt, [\rn]
T add \rn, \rn, \rm
.endm
.macro str_post rt, rn, rm:vararg
A str \rt, [\rn], \rm
T str \rt, [\rn]
T add \rn, \rn, \rm
.endm
.macro strb_post rt, rn, rm:vararg
A strb \rt, [\rn], \rm
T strb \rt, [\rn]
T add \rn, \rn, \rm
.endm
.macro strd_post rt, rt2, rn, rm
A strd \rt, \rt2, [\rn], \rm
T strd \rt, \rt2, [\rn]
T add \rn, \rn, \rm
.endm
.macro strh_pre rt, rn, rm
A strh \rt, [\rn, \rm]!
T add \rn, \rn, \rm
T strh \rt, [\rn]
.endm
.macro strh_dpre rt, rn, rm
A strh \rt, [\rn, -\rm]!
T sub \rn, \rn, \rm
T strh \rt, [\rn]
.endm
.macro strh_post rt, rn, rm
A strh \rt, [\rn], \rm
T strh \rt, [\rn]
T add \rn, \rn, \rm
.endm
.macro strh_dpost rt, rn, rm
A strh \rt, [\rn], -\rm
T strh \rt, [\rn]
T sub \rn, \rn, \rm
.endm
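
These macros wrap ARM addressing modes that have no Thumb-2 encoding, chiefly register post-increment and pre-decrement forms and ldrd/strd with a register offset, so call sites stay mode-neutral. Note that the Thumb path of ldrd_reg computes the address into \rt before loading over it, which is safe because the load overwrites \rt anyway. A usage sketch with illustrative registers:

    ldr_post  r4, r1, r2    @ ARM: ldr r4, [r1], r2
                            @ T2:  ldr r4, [r1]; add r1, r1, r2
    ldrd_post r4, r5, r1, r2
    strh_dpre r6, r0, r3    @ ARM: strh r6, [r0, -r3]!

Either expansion leaves the pointer register updated exactly as the ARM form would.
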
#if HAVE_VFP_ARGS
.eabi_attribute 28, 1
# define VFP
......
......@@ -27,6 +27,7 @@ function ff_dca_lfe_fir_neon, export=1
add r5, r2, #256*4-16 @ cf1
sub r1, r1, #12
cmp r3, #32
ite eq
moveq r6, #256/32
movne r6, #256/64
NOVFP vldr s0, [sp, #16] @ scale
......
......@@ -554,10 +554,12 @@ endfunc
and r9, r5, r14
and r10, r6, r14
and r11, r7, r14
it eq
andeq r14, r14, r14, \rnd #1
add r8, r8, r10
add r9, r9, r11
ldr r12, =0xfcfcfcfc >> 2
itt eq
addeq r8, r8, r14
addeq r9, r9, r14
and r4, r12, r4, lsr #2
......@@ -638,8 +640,10 @@ function ff_add_pixels_clamped_arm, export=1
mvn r5, r5
mvn r7, r7
tst r6, #0x100
it ne
movne r6, r5, lsr #24
tst r8, #0x100
it ne
movne r8, r7, lsr #24
mov r9, r6
ldrsh r5, [r0, #4] /* moved from [A] */
......@@ -654,8 +658,10 @@ function ff_add_pixels_clamped_arm, export=1
mvn r5, r5
mvn r7, r7
tst r6, #0x100
it ne
movne r6, r5, lsr #24
tst r8, #0x100
it ne
movne r8, r7, lsr #24
orr r9, r9, r6, lsl #16
ldr r4, [r1, #4] /* moved from [B] */
......@@ -676,8 +682,10 @@ function ff_add_pixels_clamped_arm, export=1
mvn r5, r5
mvn r7, r7
tst r6, #0x100
it ne
movne r6, r5, lsr #24
tst r8, #0x100
it ne
movne r8, r7, lsr #24
mov r9, r6
ldrsh r5, [r0, #12] /* moved from [D] */
......@@ -692,8 +700,10 @@ function ff_add_pixels_clamped_arm, export=1
mvn r5, r5
mvn r7, r7
tst r6, #0x100
it ne
movne r6, r5, lsr #24
tst r8, #0x100
it ne
movne r8, r7, lsr #24
orr r9, r9, r6, lsl #16
add r0, r0, #16 /* moved from [E] */
......
......@@ -47,16 +47,16 @@ function ff_put_pixels16_armv6, export=1
ldr r5, [r1, #4]
ldr r6, [r1, #8]
ldr r7, [r1, #12]
ldr r4, [r1], r2
ldr_post r4, r1, r2
strd r6, r7, [r0, #8]
ldr r9, [r1, #4]
strd r4, r5, [r0], r2
strd_post r4, r5, r0, r2
ldr r10, [r1, #8]
ldr r11, [r1, #12]
ldr r8, [r1], r2
ldr_post r8, r1, r2
strd r10, r11, [r0, #8]
subs r3, r3, #2
strd r8, r9, [r0], r2
strd_post r8, r9, r0, r2
bne 1b
pop {r4-r11}
......@@ -67,12 +67,12 @@ function ff_put_pixels8_armv6, export=1
push {r4-r7}
1:
ldr r5, [r1, #4]
ldr r4, [r1], r2
ldr_post r4, r1, r2
ldr r7, [r1, #4]
strd r4, r5, [r0], r2
ldr r6, [r1], r2
strd_post r4, r5, r0, r2
ldr_post r6, r1, r2
subs r3, r3, #2
strd r6, r7, [r0], r2
strd_post r6, r7, r0, r2
bne 1b
pop {r4-r7}
......@@ -90,7 +90,7 @@ function ff_put_pixels8_x2_armv6, export=1
ldr r5, [r1, #4]
ldr r7, [r1, #5]
lsr r6, r4, #8
ldr r8, [r1, r2]!
ldr_pre r8, r1, r2
orr r6, r6, r5, lsl #24
ldr r9, [r1, #4]
ldr r11, [r1, #5]
......@@ -112,9 +112,9 @@ function ff_put_pixels8_x2_armv6, export=1
uhadd8 r9, r9, r11
and r6, r6, r12
uadd8 r8, r8, r14
strd r4, r5, [r0], r2
strd_post r4, r5, r0, r2
uadd8 r9, r9, r6
strd r8, r9, [r0], r2
strd_post r8, r9, r0, r2
bne 1b
pop {r4-r11, pc}
......@@ -127,7 +127,7 @@ function ff_put_pixels8_y2_armv6, export=1
orr r12, r12, r12, lsl #16
ldr r4, [r1]
ldr r5, [r1, #4]
ldr r6, [r1, r2]!
ldr_pre r6, r1, r2
ldr r7, [r1, #4]
1:
subs r3, r3, #2
......@@ -136,7 +136,7 @@ function ff_put_pixels8_y2_armv6, export=1
uhadd8 r9, r5, r7
eor r11, r5, r7
and r10, r10, r12
ldr r4, [r1, r2]!
ldr_pre r4, r1, r2
uadd8 r8, r8, r10
and r11, r11, r12
uadd8 r9, r9, r11
......@@ -148,11 +148,11 @@ function ff_put_pixels8_y2_armv6, export=1
eor r7, r5, r7
uadd8 r10, r10, r6
and r7, r7, r12
ldr r6, [r1, r2]!
ldr_pre r6, r1, r2
uadd8 r11, r11, r7
strd r8, r9, [r0], r2
strd_post r8, r9, r0, r2
ldr r7, [r1, #4]
strd r10, r11, [r0], r2
strd_post r10, r11, r0, r2
bne 1b
pop {r4-r11}
......@@ -166,7 +166,7 @@ function ff_put_pixels8_x2_no_rnd_armv6, export=1
ldr r4, [r1]
ldr r5, [r1, #4]
ldr r7, [r1, #5]
ldr r8, [r1, r2]!
ldr_pre r8, r1, r2
ldr r9, [r1, #4]
ldr r14, [r1, #5]
add r1, r1, r2
......@@ -191,16 +191,16 @@ function ff_put_pixels8_y2_no_rnd_armv6, export=1
push {r4-r9, lr}
ldr r4, [r1]
ldr r5, [r1, #4]
ldr r6, [r1, r2]!
ldr_pre r6, r1, r2
ldr r7, [r1, #4]
1:
subs r3, r3, #2
uhadd8 r8, r4, r6
ldr r4, [r1, r2]!
ldr_pre r4, r1, r2
uhadd8 r9, r5, r7
ldr r5, [r1, #4]
uhadd8 r12, r4, r6
ldr r6, [r1, r2]!
ldr_pre r6, r1, r2
uhadd8 r14, r5, r7
ldr r7, [r1, #4]
stm r0, {r8,r9}
......@@ -220,44 +220,44 @@ function ff_avg_pixels8_armv6, export=1
orr lr, lr, lr, lsl #16
ldrd r4, r5, [r0]
ldr r10, [r1, #4]
ldr r9, [r1], r2
ldr_post r9, r1, r2
subs r3, r3, #2
1:
pld [r1, r2]
eor r8, r4, r9
uhadd8 r4, r4, r9
eor r12, r5, r10
ldrd r6, r7, [r0, r2]
ldrd_reg r6, r7, r0, r2
uhadd8 r5, r5, r10
and r8, r8, lr
ldr r10, [r1, #4]
and r12, r12, lr
uadd8 r4, r4, r8
ldr r9, [r1], r2
ldr_post r9, r1, r2
eor r8, r6, r9
uadd8 r5, r5, r12
pld [r1, r2, lsl #1]
eor r12, r7, r10
uhadd8 r6, r6, r9
strd r4, r5, [r0], r2
strd_post r4, r5, r0, r2
uhadd8 r7, r7, r10
beq 2f
and r8, r8, lr
ldrd r4, r5, [r0, r2]
ldrd_reg r4, r5, r0, r2
uadd8 r6, r6, r8
ldr r10, [r1, #4]
and r12, r12, lr
subs r3, r3, #2
uadd8 r7, r7, r12
ldr r9, [r1], r2
strd r6, r7, [r0], r2
ldr_post r9, r1, r2
strd_post r6, r7, r0, r2
b 1b
2:
and r8, r8, lr
and r12, r12, lr
uadd8 r6, r6, r8
uadd8 r7, r7, r12
strd r6, r7, [r0], r2
strd_post r6, r7, r0, r2
pop {r4-r10, pc}
endfunc
......@@ -284,7 +284,7 @@ function ff_add_pixels_clamped_armv6, export=1
orr r6, r8, r5, lsl #8
orr r7, r4, lr, lsl #8
subs r3, r3, #1
strd r6, r7, [r1], r2
strd_post r6, r7, r1, r2
bgt 1b
pop {r4-r8,pc}
endfunc
......@@ -294,7 +294,7 @@ function ff_get_pixels_armv6, export=1
push {r4-r8, lr}
mov lr, #8
1:
ldrd r4, r5, [r1], r2
ldrd_post r4, r5, r1, r2
subs lr, lr, #1
uxtb16 r6, r4
uxtb16 r4, r4, ror #8
......@@ -317,8 +317,8 @@ function ff_diff_pixels_armv6, export=1
push {r4-r9, lr}
mov lr, #8
1:
ldrd r4, r5, [r1], r3
ldrd r6, r7, [r2], r3
ldrd_post r4, r5, r1, r3
ldrd_post r6, r7, r2, r3
uxtb16 r8, r4
uxtb16 r4, r4, ror #8
uxtb16 r9, r6
......@@ -492,19 +492,19 @@ function ff_pix_abs8_armv6, export=1
push {r4-r9, lr}
mov r0, #0
mov lr, #0
ldrd r4, r5, [r1], r3
ldrd_post r4, r5, r1, r3
1:
subs r12, r12, #2
ldr r7, [r2, #4]
ldr r6, [r2], r3
ldrd r8, r9, [r1], r3
ldr_post r6, r2, r3
ldrd_post r8, r9, r1, r3
usada8 r0, r4, r6, r0
pld [r2, r3]
usada8 lr, r5, r7, lr
ldr r7, [r2, #4]
ldr r6, [r2], r3
ldr_post r6, r2, r3
beq 2f
ldrd r4, r5, [r1], r3
ldrd_post r4, r5, r1, r3
usada8 r0, r8, r6, r0
pld [r2, r3]
usada8 lr, r9, r7, lr
......@@ -613,7 +613,7 @@ function ff_pix_sum_armv6, export=1
ldr r7, [r0, #12]
usada8 r2, r6, lr, r2
beq 2f
ldr r4, [r0, r1]!
ldr_pre r4, r0, r1
usada8 r3, r7, lr, r3
bgt 1b
2:
......
......@@ -531,6 +531,7 @@ function ff_vorbis_inverse_coupling_neon, export=1
2: vst1.32 {d2-d3}, [r3, :128]!
vst1.32 {d0-d1}, [r12,:128]!
it lt
bxlt lr
3: vld1.32 {d2-d3}, [r1,:128]
......@@ -575,6 +576,7 @@ NOVFP vdup.32 q8, r2
2: vst1.32 {q2},[r0,:128]!
vst1.32 {q3},[r0,:128]!
ands len, len, #15
it eq
bxeq lr
3: vld1.32 {q0},[r1,:128]!
vmul.f32 q0, q0, q8
......@@ -638,6 +640,7 @@ NOVFP ldr r3, [sp]
2: vst1.32 {q8},[r0,:128]!
vst1.32 {q9},[r0,:128]!
ands r3, r3, #7
it eq
popeq {pc}
3: vld1.32 {q0},[r1,:128]!
ldr r12, [r2], #4
......
......@@ -55,18 +55,23 @@ function ff_vector_fmul_vfp, export=1
1:
subs r3, r3, #16
vmul.f32 s12, s4, s12
itttt ge
vldmiage r1!, {s16-s19}
vldmiage r2!, {s24-s27}
vldmiage r1!, {s20-s23}
vldmiage r2!, {s28-s31}
it ge
vmulge.f32 s24, s16, s24
vstmia r0!, {s8-s11}
vstmia r0!, {s12-s15}
it ge
vmulge.f32 s28, s20, s28
itttt gt
vldmiagt r1!, {s0-s3}
vldmiagt r2!, {s8-s11}
vldmiagt r1!, {s4-s7}
vldmiagt r2!, {s12-s15}
ittt ge
vmulge.f32 s8, s0, s8
vstmiage r0!, {s24-s27}
vstmiage r0!, {s28-s31}
......@@ -97,33 +102,49 @@ function ff_vector_fmul_reverse_vfp, export=1
vmul.f32 s11, s0, s11
1:
subs r3, r3, #16
it ge
vldmdbge r2!, {s16-s19}
vmul.f32 s12, s7, s12
it ge
vldmiage r1!, {s24-s27}
vmul.f32 s13, s6, s13
it ge
vldmdbge r2!, {s20-s23}
vmul.f32 s14, s5, s14
it ge
vldmiage r1!, {s28-s31}
vmul.f32 s15, s4, s15
it ge
vmulge.f32 s24, s19, s24
it gt
vldmdbgt r2!, {s0-s3}
it ge
vmulge.f32 s25, s18, s25
vstmia r0!, {s8-s13}
it ge
vmulge.f32 s26, s17, s26
it gt
vldmiagt r1!, {s8-s11}
itt ge
vmulge.f32 s27, s16, s27
vmulge.f32 s28, s23, s28
it gt
vldmdbgt r2!, {s4-s7}
it ge
vmulge.f32 s29, s22, s29
vstmia r0!, {s14-s15}
ittt ge
vmulge.f32 s30, s21, s30
vmulge.f32 s31, s20, s31
vmulge.f32 s8, s3, s8
it gt
vldmiagt r1!, {s12-s15}
itttt ge
vmulge.f32 s9, s2, s9
vmulge.f32 s10, s1, s10
vstmiage r0!, {s24-s27}
vmulge.f32 s11, s0, s11
it ge
vstmiage r0!, {s28-s31}
bgt 1b
......
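
An IT block covers at most four following instructions, and unconditional instructions may not appear inside one. In the VFP loops above, conditional loads come in runs of four (hence itttt), while the conditional multiplies are interleaved with unconditional vstmia stores and therefore each need their own it/itt block. A condensed sketch of the pattern:

    itttt   ge              @ four conditional loads in one block
    vldmiage r1!, {s16-s19}
    vldmiage r2!, {s24-s27}
    vldmiage r1!, {s20-s23}
    vldmiage r2!, {s28-s31}
    it      ge              @ restart after an unconditional store
    vmulge.f32 s24, s16, s24
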
......@@ -71,6 +71,7 @@ endfunc
function ff_float_to_int16_interleave_neon, export=1
cmp r3, #2
itt lt
ldrlt r1, [r1]
blt ff_float_to_int16_neon
bne 4f
......@@ -196,6 +197,7 @@ function ff_float_to_int16_interleave_neon, export=1
vst1.64 {d3}, [r8], ip
vst1.64 {d7}, [r8], ip
subs r3, r3, #4
it eq
popeq {r4-r8,pc}
cmp r3, #4
add r0, r0, #8
......@@ -305,6 +307,7 @@ function ff_float_to_int16_interleave_neon, export=1
vst1.32 {d23[1]}, [r8], ip
8: subs r3, r3, #2
add r0, r0, #4
it eq
popeq {r4-r8,pc}
@ 1 channel
......@@ -354,6 +357,7 @@ function ff_float_to_int16_interleave_neon, export=1
vst1.16 {d2[3]}, [r5,:16], ip
vst1.16 {d3[1]}, [r5,:16], ip
vst1.16 {d3[3]}, [r5,:16], ip
it eq
popeq {r4-r8,pc}
vld1.64 {d0-d1}, [r4,:128]!
vcvt.s32.f32 q0, q0, #16
......
......@@ -46,6 +46,7 @@ function ff_float_to_int16_vfp, export=1
vmov r5, r6, s2, s3
vmov r7, r8, s4, s5
vmov ip, lr, s6, s7
it gt
vldmiagt r1!, {s16-s23}
ssat r4, #16, r4
ssat r3, #16, r3
......@@ -53,10 +54,12 @@ function ff_float_to_int16_vfp, export=1
ssat r5, #16, r5
pkhbt r3, r3, r4, lsl #16
pkhbt r4, r5, r6, lsl #16
itttt gt
vcvtgt.s32.f32 s0, s16
vcvtgt.s32.f32 s1, s17
vcvtgt.s32.f32 s2, s18
vcvtgt.s32.f32 s3, s19
itttt gt
vcvtgt.s32.f32 s4, s20
vcvtgt.s32.f32 s5, s21
vcvtgt.s32.f32 s6, s22
......
......
......@@ -106,10 +106,12 @@ function ff_h264_idct_add16_neon, export=1
blt 2f
ldrsh lr, [r1]
add r0, r0, r4
it ne
movne lr, #0
cmp lr, #0
adrne lr, ff_h264_idct_dc_add_neon
adreq lr, ff_h264_idct_add_neon
ite ne
adrne lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB
adreq lr, ff_h264_idct_add_neon + CONFIG_THUMB
blx lr
2: subs ip, ip, #1
add r1, r1, #32
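
The + CONFIG_THUMB added to the adr operands fixes interworking: blx to a register address switches to ARM state when bit 0 is clear, and adr produces the plain label address without the Thumb bit. Since CONFIG_THUMB is 0 or 1, the addition sets bit 0 exactly in Thumb builds. A sketch with illustrative labels:

    ite     ne
    adrne   lr, thumb_func_a + CONFIG_THUMB  @ bit 0 set keeps blx in Thumb
    adreq   lr, thumb_func_b + CONFIG_THUMB  @ adds 0 in ARM builds
    blx     lr
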
......@@ -132,8 +134,9 @@ function ff_h264_idct_add16intra_neon, export=1
add r0, r0, r4
cmp r8, #0
ldrsh r8, [r1]
adrne lr, ff_h264_idct_add_neon
adreq lr, ff_h264_idct_dc_add_neon
iteet ne
adrne lr, ff_h264_idct_add_neon + CONFIG_THUMB
adreq lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB
cmpeq r8, #0
blxne lr
subs ip, ip, #1
......@@ -159,12 +162,14 @@ function ff_h264_idct_add8_neon, export=1
add r1, r3, r12, lsl #5
cmp r8, #0
ldrsh r8, [r1]
adrne lr, ff_h264_idct_add_neon
adreq lr, ff_h264_idct_dc_add_neon
iteet ne
adrne lr, ff_h264_idct_add_neon + CONFIG_THUMB
adreq lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB
cmpeq r8, #0
blxne lr
add r12, r12, #1
cmp r12, #4
itt eq
moveq r12, #16
moveq r4, r9
cmp r12, #20
......@@ -365,10 +370,12 @@ function ff_h264_idct8_add4_neon, export=1
blt 2f
ldrsh lr, [r1]
add r0, r0, r4
it ne
movne lr, #0
cmp lr, #0
adrne lr, ff_h264_idct8_dc_add_neon
adreq lr, ff_h264_idct8_add_neon
ite ne
adrne lr, ff_h264_idct8_dc_add_neon + CONFIG_THUMB
adreq lr, ff_h264_idct8_add_neon + CONFIG_THUMB
blx lr
2: subs r12, r12, #4
add r1, r1, #128
......
......@@ -64,11 +64,14 @@ static inline av_const int mid_pred(int a, int b, int c)
__asm__ (
"mov %0, %2 \n\t"
"cmp %1, %2 \n\t"
"itt gt \n\t"
"movgt %0, %1 \n\t"
"movgt %1, %2 \n\t"
"cmp %1, %3 \n\t"
"it le \n\t"
"movle %1, %3 \n\t"
"cmp %0, %1 \n\t"
"it gt \n\t"
"movgt %0, %1 \n\t"
: "=&r"(m), "+r"(a)
: "r"(b), "r"(c)
......
......@@ -191,7 +191,9 @@ function ff_mdct_calc_neon, export=1
vadd.f32 d17, d17, d3 @ in2u+in1d -I
1:
vmul.f32 d7, d0, d21 @ I*s
ldr r10, [r3, lr, lsr #1]
A ldr r10, [r3, lr, lsr #1]
T lsr r10, lr, #1
T ldr r10, [r3, r10]
vmul.f32 d6, d1, d20 @ -R*c
ldr r6, [r3, #4]!
vmul.f32 d4, d1, d21 @ -R*s
......
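
Thumb-2 register-offset loads allow only an LSL of 0 to 3 on the index register, so the ARM form ldr r10, [r3, lr, lsr #1] has no direct encoding and the shifted offset must be computed first, as the hunk above does:

A   ldr     r10, [r3, lr, lsr #1]   @ ARM: any barrel-shifted index
T   lsr     r10, lr, #1             @ Thumb-2: shift into a temp,
T   ldr     r10, [r3, r10]          @ then use a plain register offset
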
......@@ -75,7 +75,7 @@ function ff_mpadsp_apply_window_fixed_armv6, export=1
sum8 r8, r9, r1, r0, r10, r11, r12, lr
sum8 r8, r9, r1, r2, r10, r11, r12, lr, rsb, 32
round r10, r8, r9
strh r10, [r3], r4
strh_post r10, r3, r4
mov lr, #15
1:
......@@ -127,10 +127,10 @@ function ff_mpadsp_apply_window_fixed_armv6, export=1
round r10, r8, r9
adds r8, r8, r4
adc r9, r9, r7
strh r10, [r3], r12
strh_post r10, r3, r12
round r11, r8, r9
subs lr, lr, #1
strh r11, [r5], -r12
strh_dpost r11, r5, r12
bgt 1b
sum8 r8, r9, r1, r0, r10, r11, r12, lr, rsb, 33
......
......@@ -38,15 +38,21 @@
.macro dequant_t dst, src, mul, add, tmp
rsbs \tmp, ip, \src, asr #16
it gt
addgt \tmp, \add, #0
it lt
rsblt \tmp, \add, #0
it ne
smlatbne \dst, \src, \mul, \tmp
.endm
.macro dequant_b dst, src, mul, add, tmp
rsbs \tmp, ip, \src, lsl #16
it gt
addgt \tmp, \add, #0
it lt
rsblt \tmp, \add, #0
it ne
smlabbne \dst, \src, \mul, \tmp
.endm
......@@ -80,21 +86,27 @@ function ff_dct_unquantize_h263_armv5te, export=1
strh lr, [r0], #2
subs r3, r3, #8
it gt
ldrdgt r4, [r0, #0] /* load data early to avoid load/use pipeline stall */
bgt 1b
adds r3, r3, #2
it le
pople {r4-r9,pc}
2:
ldrsh r9, [r0, #0]
ldrsh lr, [r0, #2]
mov r8, r2
cmp r9, #0
it lt
rsblt r8, r2, #0
it ne
smlabbne r9, r9, r1, r8
mov r8, r2
cmp lr, #0
it lt
rsblt r8, r2, #0
it ne
smlabbne lr, lr, r1, r8
strh r9, [r0], #2
strh lr, [r0], #2
......
......@@ -57,6 +57,7 @@ function ff_dct_unquantize_h263_neon, export=1
subs r3, r3, #16
vst1.16 {q0}, [r1,:128]!
vst1.16 {q8}, [r1,:128]!
it le
bxle lr
cmp r3, #8
bgt 1b
......@@ -78,6 +79,7 @@ function ff_dct_unquantize_h263_intra_neon, export=1
ldr r6, [r0, #AC_PRED]
add lr, r0, #INTER_SCANTAB_RASTER_END
cmp r6, #0
it ne
movne r12, #63
bne 1f
ldr r12, [r12, r2, lsl #2]
......@@ -86,9 +88,11 @@ function ff_dct_unquantize_h263_intra_neon, export=1
ldrsh r4, [r1]
cmp r5, #0
mov r5, r1
it ne
movne r2, #0
bne 2f
cmp r2, #4
it ge
addge r0, r0, #4
sub r2, r3, #1
ldr r6, [r0, #Y_DC_SCALE]
......
......@@ -137,6 +137,7 @@ function ff_rdft_calc_neon, export=1
vst1.32 {d22}, [r5,:64]
cmp r6, #0
it eq
popeq {r4-r8,pc}
vmul.f32 d22, d22, d18
......
......@@ -121,11 +121,13 @@ __b_evaluation:
ldr r11, [r12, #offW7] @ R11=W7
mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
teq r2, #0 @ if null avoid muls
mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
teq r2, #0 @ if null avoid muls
itttt ne
mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
rsbne r2, r2, #0 @ R2=-ROWr16[3]
mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
it ne
mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
@@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
......@@ -148,19 +150,23 @@ __b_evaluation:
@@ MAC16(b3, -W1, row[7]);
@@ MAC16(b1, -W5, row[7]);
mov r3, r3, asr #16 @ R3=ROWr16[5]
teq r3, #0 @ if null avoid muls
teq r3, #0 @ if null avoid muls
it ne
mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5]=b0
mov r4, r4, asr #16 @ R4=ROWr16[7]
itttt ne
mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5]=b2
mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5]=b3
rsbne r3, r3, #0 @ R3=-ROWr16[5]
mlane r1, r8, r3, r1 @ R1-=W1*ROWr16[5]=b1
@@ R3 is free now
teq r4, #0 @ if null avoid muls
teq r4, #0 @ if null avoid muls
itttt ne
mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7]=b0
mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7]=b2
rsbne r4, r4, #0 @ R4=-ROWr16[7]
mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7]=b3
it ne
mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7]=b1
@@ R4 is free now
__end_b_evaluation:
......@@ -204,16 +210,19 @@ __a_evaluation:
@@ a2 -= W4*row[4]
@@ a3 += W4*row[4]
ldrsh r11, [r14, #8] @ R11=ROWr16[4]
teq r11, #0 @ if null avoid muls
teq r11, #0 @ if null avoid muls
it ne
mulne r11, r9, r11 @ R11=W4*ROWr16[4]
@@ R9 is free now
ldrsh r9, [r14, #12] @ R9=ROWr16[6]
itttt ne
addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0)
subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1)
subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2)
addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3)
@@ W6 alone is no more useful, save W2*ROWr16[6] in it instead
teq r9, #0 @ if null avoid muls
teq r9, #0 @ if null avoid muls
itttt ne
mulne r11, r10, r9 @ R11=W6*ROWr16[6]
addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0)
mulne r10, r8, r9 @ R10=W2*ROWr16[6]
......@@ -222,6 +231,7 @@ __a_evaluation:
@@ a1 -= W2*row[6];
@@ a2 += W2*row[6];
subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3)
itt ne
subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1)
addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2)
......@@ -323,10 +333,12 @@ __b_evaluation2:
ldrsh r2, [r14, #48]
mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
teq r2, #0 @ if 0, then avoid muls
itttt ne
mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
rsbne r2, r2, #0 @ R2=-ROWr16[3]
mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
it ne
mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
@@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
......@@ -342,18 +354,22 @@ __b_evaluation2:
@@ MAC16(b1, -W5, col[7x8]);
ldrsh r3, [r14, #80] @ R3=COLr16[5x8]
teq r3, #0 @ if 0 then avoid muls
itttt ne
mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5x8]=b0
mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5x8]=b2
mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5x8]=b3
rsbne r3, r3, #0 @ R3=-ROWr16[5x8]
ldrsh r4, [r14, #112] @ R4=COLr16[7x8]
it ne
mlane r1, r8, r3, r1 @ R1-=W1*ROWr16[5x8]=b1
@@ R3 is free now
teq r4, #0 @ if 0 then avoid muls
itttt ne
mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7x8]=b0
mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7x8]=b2
rsbne r4, r4, #0 @ R4=-ROWr16[7x8]
mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7x8]=b3
it ne
mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7x8]=b1
@@ R4 is free now
__end_b_evaluation2:
......@@ -390,15 +406,18 @@ __a_evaluation2:
@@ a3 += W4*row[4]
ldrsh r11, [r14, #64] @ R11=ROWr16[4]
teq r11, #0 @ if null avoid muls
itttt ne
mulne r11, r9, r11 @ R11=W4*ROWr16[4]
@@ R9 is free now
addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0)
subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1)
subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2)
ldrsh r9, [r14, #96] @ R9=ROWr16[6]
it ne
addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3)
@@ W6 alone is no more useful, save W2*ROWr16[6] in it instead
teq r9, #0 @ if null avoid muls
itttt ne
mulne r11, r10, r9 @ R11=W6*ROWr16[6]
addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0)
mulne r10, r8, r9 @ R10=W2*ROWr16[6]
......@@ -407,6 +426,7 @@ __a_evaluation2:
@@ a1 -= W2*row[6];
@@ a2 += W2*row[6];
subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3)
itt ne
subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1)
addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2)
__end_a_evaluation2:
......
......@@ -49,6 +49,7 @@ function idct_row_armv5te
ldrd v1, [a1, #8]
ldrd a3, [a1] /* a3 = row[1:0], a4 = row[3:2] */
orrs v1, v1, v2
itt eq
cmpeq v1, a4
cmpeq v1, a3, lsr #16
beq row_dc_only
......@@ -269,6 +270,7 @@ function idct_col_armv5te
ldmfd sp!, {a3, a4}
adds a2, a3, v1
mov a2, a2, lsr #20
it mi
orrmi a2, a2, #0xf000
add ip, a4, v2
mov ip, ip, asr #20
......@@ -276,6 +278,7 @@ function idct_col_armv5te
str a2, [a1]
subs a3, a3, v1
mov a2, a3, lsr #20
it mi
orrmi a2, a2, #0xf000
sub a4, a4, v2
mov a4, a4, asr #20
......@@ -285,6 +288,7 @@ function idct_col_armv5te
subs a2, a3, v3
mov a2, a2, lsr #20
it mi
orrmi a2, a2, #0xf000
sub ip, a4, v4
mov ip, ip, asr #20
......@@ -292,6 +296,7 @@ function idct_col_armv5te
str a2, [a1, #(16*1)]
adds a3, a3, v3
mov a2, a3, lsr #20
it mi
orrmi a2, a2, #0xf000
add a4, a4, v4
mov a4, a4, asr #20
......@@ -301,6 +306,7 @@ function idct_col_armv5te
adds a2, a3, v5
mov a2, a2, lsr #20
it mi
orrmi a2, a2, #0xf000
add ip, a4, v6
mov ip, ip, asr #20
......@@ -308,6 +314,7 @@ function idct_col_armv5te
str a2, [a1, #(16*2)]
subs a3, a3, v5
mov a2, a3, lsr #20
it mi
orrmi a2, a2, #0xf000
sub a4, a4, v6
mov a4, a4, asr #20
......@@ -317,6 +324,7 @@ function idct_col_armv5te
adds a2, a3, v7
mov a2, a2, lsr #20
it mi
orrmi a2, a2, #0xf000
add ip, a4, fp
mov ip, ip, asr #20
......@@ -324,6 +332,7 @@ function idct_col_armv5te
str a2, [a1, #(16*3)]
subs a3, a3, v7
mov a2, a3, lsr #20
it mi
orrmi a2, a2, #0xf000
sub a4, a4, fp
mov a4, a4, asr #20
......@@ -335,15 +344,19 @@ endfunc
.macro clip dst, src:vararg
movs \dst, \src
it mi
movmi \dst, #0
cmp \dst, #255
it gt
movgt \dst, #255
.endm
.macro aclip dst, src:vararg
adds \dst, \src
it mi
movmi \dst, #0
cmp \dst, #255
it gt
movgt \dst, #255
.endm
......@@ -370,35 +383,35 @@ function idct_col_put_armv5te
orr a2, a3, a4, lsl #8
rsb v2, lr, lr, lsl #3
ldmfd sp!, {a3, a4}
strh a2, [v2, v1]!
strh_pre a2, v2, v1
sub a2, a3, v3
clip a2, a2, asr #20
sub ip, a4, v4
clip ip, ip, asr #20
orr a2, a2, ip, lsl #8
strh a2, [v1, lr]!
strh_pre a2, v1, lr
add a3, a3, v3
clip a2, a3, asr #20
add a4, a4, v4
clip a4, a4, asr #20
orr a2, a2, a4, lsl #8
ldmfd sp!, {a3, a4}
strh a2, [v2, -lr]!
strh_dpre a2, v2, lr
add a2, a3, v5
clip a2, a2, asr #20
add ip, a4, v6
clip ip, ip, asr #20
orr a2, a2, ip, lsl #8
strh a2, [v1, lr]!
strh_pre a2, v1, lr
sub a3, a3, v5
clip a2, a3, asr #20
sub a4, a4, v6
clip a4, a4, asr #20
orr a2, a2, a4, lsl #8
ldmfd sp!, {a3, a4}
strh a2, [v2, -lr]!
strh_dpre a2, v2, lr
add a2, a3, v7
clip a2, a2, asr #20
......@@ -411,7 +424,7 @@ function idct_col_put_armv5te
sub a4, a4, fp
clip a4, a4, asr #20
orr a2, a2, a4, lsl #8
strh a2, [v2, -lr]
strh_dpre a2, v2, lr
ldr pc, [sp], #4
endfunc
......@@ -436,7 +449,7 @@ function idct_col_add_armv5te
ldr v1, [sp, #32]
sub a4, a4, v2
rsb v2, v1, v1, lsl #3
ldrh ip, [v2, lr]!
ldrh_pre ip, v2, lr
strh a2, [lr]
and a2, ip, #255
aclip a3, a2, a3, asr #20
......@@ -448,7 +461,7 @@ function idct_col_add_armv5te
strh a2, [v2]
ldmfd sp!, {a3, a4}
ldrh ip, [lr, v1]!
ldrh_pre ip, lr, v1
sub a2, a3, v3
add a3, a3, v3
and v3, ip, #255
......@@ -458,7 +471,7 @@ function idct_col_add_armv5te
aclip v3, v3, ip, lsr #8
orr a2, a2, v3, lsl #8
add a4, a4, v4
ldrh ip, [v2, -v1]!
ldrh_dpre ip, v2, v1
strh a2, [lr]
and a2, ip, #255
aclip a3, a2, a3, asr #20
......@@ -468,7 +481,7 @@ function idct_col_add_armv5te
strh a2, [v2]
ldmfd sp!, {a3, a4}
ldrh ip, [lr, v1]!
ldrh_pre ip, lr, v1
add a2, a3, v5
sub a3, a3, v5
and v3, ip, #255
......@@ -478,7 +491,7 @@ function idct_col_add_armv5te
aclip v3, v3, ip, lsr #8
orr a2, a2, v3, lsl #8
sub a4, a4, v6
ldrh ip, [v2, -v1]!
ldrh_dpre ip, v2, v1
strh a2, [lr]
and a2, ip, #255
aclip a3, a2, a3, asr #20
......@@ -488,7 +501,7 @@ function idct_col_add_armv5te
strh a2, [v2]
ldmfd sp!, {a3, a4}
ldrh ip, [lr, v1]!
ldrh_pre ip, lr, v1
add a2, a3, v7
sub a3, a3, v7
and v3, ip, #255
......@@ -498,7 +511,7 @@ function idct_col_add_armv5te
aclip v3, v3, ip, lsr #8
orr a2, a2, v3, lsl #8
sub a4, a4, fp
ldrh ip, [v2, -v1]!
ldrh_dpre ip, v2, v1
strh a2, [lr]
and a2, ip, #255
aclip a3, a2, a3, asr #20
......
......@@ -200,6 +200,7 @@ function idct_row_armv6
ldr r3, [r0, #8] /* r3 = row[3,1] */
ldr r2, [r0] /* r2 = row[2,0] */
orrs lr, lr, ip
itt eq
cmpeq lr, r3
cmpeq lr, r2, lsr #16
beq 1f
......@@ -282,14 +283,14 @@ function idct_col_put_armv6
pop {r1, r2}
idct_finish_shift_sat COL_SHIFT
strb r4, [r1], r2
strb r5, [r1], r2
strb r6, [r1], r2
strb r7, [r1], r2
strb r11,[r1], r2
strb r10,[r1], r2
strb r9, [r1], r2
strb r8, [r1], r2
strb_post r4, r1, r2
strb_post r5, r1, r2
strb_post r6, r1, r2
strb_post r7, r1, r2
strb_post r11,r1, r2
strb_post r10,r1, r2
strb_post r9, r1, r2
strb_post r8, r1, r2
sub r1, r1, r2, lsl #3
......@@ -318,16 +319,16 @@ function idct_col_add_armv6
add ip, r3, ip, asr #COL_SHIFT
usat ip, #8, ip
add r4, r7, r4, asr #COL_SHIFT
strb ip, [r1], r2
strb_post ip, r1, r2
ldrb ip, [r1, r2]
usat r4, #8, r4
ldrb r11,[r1, r2, lsl #2]
add r5, ip, r5, asr #COL_SHIFT
usat r5, #8, r5
strb r4, [r1], r2
strb_post r4, r1, r2
ldrb r3, [r1, r2]
ldrb ip, [r1, r2, lsl #2]
strb r5, [r1], r2
strb_post r5, r1, r2
ldrb r7, [r1, r2]
ldrb r4, [r1, r2, lsl #2]
add r6, r3, r6, asr #COL_SHIFT
......@@ -340,11 +341,11 @@ function idct_col_add_armv6
usat r8, #8, r8
add lr, r4, lr, asr #COL_SHIFT
usat lr, #8, lr
strb r6, [r1], r2
strb r10,[r1], r2
strb r9, [r1], r2
strb r8, [r1], r2
strb lr, [r1], r2
strb_post r6, r1, r2
strb_post r10,r1, r2
strb_post r9, r1, r2
strb_post r8, r1, r2
strb_post lr, r1, r2
sub r1, r1, r2, lsl #3
......
......@@ -71,7 +71,7 @@ function idct_row4_pld_neon
add r3, r0, r1, lsl #2
pld [r0, r1]
pld [r0, r1, lsl #1]
pld [r3, -r1]
A pld [r3, -r1]
pld [r3]
pld [r3, r1]
add r3, r3, r1, lsl #1
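
The A prefix on the pld above reflects another encoding gap: the Thumb-2 register form of PLD cannot subtract the index (only [Rn, Rm, LSL #0-3] is available). Because PLD is purely a prefetch hint, the line can simply be dropped in Thumb builds rather than rewritten:

A   pld     [r3, -r1]       @ ARM only; PLD is a hint, so skipping it
                            @ only affects timing, never results
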
......@@ -164,6 +164,7 @@ function idct_col4_neon
orrs r4, r4, r5
idct_col4_top
it eq
addeq r2, r2, #16
beq 1f
......@@ -176,6 +177,7 @@ function idct_col4_neon
1: orrs r6, r6, r7
ldrd r4, [r2, #16]
it eq
addeq r2, r2, #16
beq 2f
......@@ -187,6 +189,7 @@ function idct_col4_neon
2: orrs r4, r4, r5
ldrd r4, [r2, #16]
it eq
addeq r2, r2, #16
beq 3f
......@@ -199,6 +202,7 @@ function idct_col4_neon
vadd.i32 q13, q13, q8
3: orrs r4, r4, r5
it eq
addeq r2, r2, #16
beq 4f
......
......@@ -100,9 +100,11 @@ NOVFP vldr s0, [sp, #12*4] @ scale
vst1.32 {q9}, [r2,:128]
subs r1, r1, #1
it eq
popeq {r4-r11,pc}
cmp r4, #0
itt eq
subeq r8, r8, #512*4
subeq r9, r9, #512*4
sub r5, r5, #512*4
......
......@@ -21,6 +21,14 @@
#ifndef AVCODEC_ARM_VP56_ARITH_H
#define AVCODEC_ARM_VP56_ARITH_H
#if CONFIG_THUMB
# define A(x)
# define T(x) x
#else
# define A(x) x
# define T(x)
#endif
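
These are function-like counterparts of the A/T markers in asm.S, usable inside C inline-asm string literals: under CONFIG_THUMB only the T(...) strings survive. They also cover a unified-syntax spelling difference for conditional loads, sketched here:

    ldrcsh  r2, [r4], #2    @ ARM, pre-UAL: condition before width suffix
    ldrhcs  r2, [r4], #2    @ Thumb-2, UAL: width suffix before condition
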
#if HAVE_ARMV6 && HAVE_INLINE_ASM
#define vp56_rac_get_prob vp56_rac_get_prob_armv6
......@@ -32,15 +40,21 @@ static inline int vp56_rac_get_prob_armv6(VP56RangeCoder *c, int pr)
unsigned bit;
__asm__ ("adds %3, %3, %0 \n"
"itt cs \n"
"cmpcs %7, %4 \n"
"ldrcsh %2, [%4], #2 \n"
A("ldrcsh %2, [%4], #2 \n")
T("ldrhcs %2, [%4], #2 \n")
"rsb %0, %6, #256 \n"
"smlabb %0, %5, %6, %0 \n"
T("itttt cs \n")
"rev16cs %2, %2 \n"
"orrcs %1, %1, %2, lsl %3 \n"
T("lslcs %2, %2, %3 \n")
T("orrcs %1, %1, %2 \n")
A("orrcs %1, %1, %2, lsl %3 \n")
"subcs %3, %3, #16 \n"
"lsr %0, %0, #8 \n"
"cmp %1, %0, lsl #16 \n"
"ittte ge \n"
"subge %1, %1, %0, lsl #16 \n"
"subge %0, %5, %0 \n"
"movge %2, #1 \n"
......@@ -64,12 +78,17 @@ static inline int vp56_rac_get_prob_branchy_armv6(VP56RangeCoder *c, int pr)
unsigned tmp;
__asm__ ("adds %3, %3, %0 \n"
"itt cs \n"
"cmpcs %7, %4 \n"
"ldrcsh %2, [%4], #2 \n"
A("ldrcsh %2, [%4], #2 \n")
T("ldrhcs %2, [%4], #2 \n")
"rsb %0, %6, #256 \n"
"smlabb %0, %5, %6, %0 \n"
T("itttt cs \n")
"rev16cs %2, %2 \n"
"orrcs %1, %1, %2, lsl %3 \n"
T("lslcs %2, %2, %3 \n")
T("orrcs %1, %1, %2 \n")
A("orrcs %1, %1, %2, lsl %3 \n")
"subcs %3, %3, #16 \n"
"lsr %0, %0, #8 \n"
"lsl %2, %0, #16 \n"
......
......@@ -25,13 +25,18 @@
lsl \cw, \cw, \t0
lsl \t0, \h, \t0
rsb \h, \pr, #256
it cs
ldrhcs \t1, [\buf], #2
smlabb \h, \t0, \pr, \h
T itttt cs
rev16cs \t1, \t1
orrcs \cw, \cw, \t1, lsl \bs
A orrcs \cw, \cw, \t1, lsl \bs
T lslcs \t1, \t1, \bs
T orrcs \cw, \cw, \t1
subcs \bs, \bs, #16
lsr \h, \h, #8
cmp \cw, \h, lsl #16
itt ge
subge \cw, \cw, \h, lsl #16
subge \h, \t0, \h
.endm
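
Note the mix in this macro: the unprefixed it cs and itt ge are emitted in both modes (harmless no-ops in ARM state), while the IT is T-prefixed where the A and T paths contain different numbers of conditional instructions, so that the block shape matches the Thumb path only:

T   itttt   cs              @ Thumb path: rev16cs, lslcs, orrcs, subcs
    rev16cs \t1, \t1
A   orrcs   \cw, \cw, \t1, lsl \bs
T   lslcs   \t1, \t1, \bs
T   orrcs   \cw, \cw, \t1
    subcs   \bs, \bs, #16
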
......@@ -40,14 +45,20 @@
adds \bs, \bs, \t0
lsl \cw, \cw, \t0
lsl \t0, \h, \t0
it cs
ldrhcs \t1, [\buf], #2
mov \h, #128
it cs
rev16cs \t1, \t1
add \h, \h, \t0, lsl #7
orrcs \cw, \cw, \t1, lsl \bs
A orrcs \cw, \cw, \t1, lsl \bs
T ittt cs
T lslcs \t1, \t1, \bs
T orrcs \cw, \cw, \t1
subcs \bs, \bs, #16
lsr \h, \h, #8
cmp \cw, \h, lsl #16
itt ge
subge \cw, \cw, \h, lsl #16
subge \h, \t0, \h
.endm
......@@ -59,6 +70,7 @@ function ff_decode_block_coeffs_armv6, export=1
cmp r3, #0
ldr r11, [r5]
ldm r0, {r5-r7} @ high, bits, buf
it ne
pkhtbne r11, r11, r11, asr #16
ldr r8, [r0, #16] @ code_word
0:
......@@ -80,19 +92,26 @@ function ff_decode_block_coeffs_armv6, export=1
adds r6, r6, r9
add r4, r4, #11
lsl r8, r8, r9
it cs
ldrhcs r10, [r7], #2
lsl r9, r5, r9
mov r5, #128
it cs
rev16cs r10, r10
add r5, r5, r9, lsl #7
orrcs r8, r8, r10, lsl r6
T ittt cs
T lslcs r10, r10, r6
T orrcs r8, r8, r10
A orrcs r8, r8, r10, lsl r6
subcs r6, r6, #16
lsr r5, r5, #8
cmp r8, r5, lsl #16
movrel r10, zigzag_scan-1
itt ge
subge r8, r8, r5, lsl #16
subge r5, r9, r5
ldrb r10, [r10, r3]
it ge
rsbge r12, r12, #0
cmp r3, #16
strh r12, [r1, r10]
......@@ -108,6 +127,7 @@ function ff_decode_block_coeffs_armv6, export=1
ldr r0, [sp]
ldr r9, [r0, #12]
cmp r7, r9
it hi
movhi r7, r9
stm r0, {r5-r7} @ high, bits, buf
str r8, [r0, #16] @ code_word
......@@ -131,11 +151,13 @@ function ff_decode_block_coeffs_armv6, export=1
mov r12, #2
ldrb r0, [r4, #4]
rac_get_prob r5, r6, r7, r8, r0, r9, r10
it ge
addge r12, #1
ldrb r9, [lr, r5]
blt 4f
ldrb r0, [r4, #5]
rac_get_prob r5, r6, r7, r8, r0, r9, r10
it ge
addge r12, #1
ldrb r9, [lr, r5]
b 4f
......@@ -153,6 +175,7 @@ function ff_decode_block_coeffs_armv6, export=1
mov r12, #5
mov r0, #159
rac_get_prob r5, r6, r7, r8, r0, r9, r10
it ge
addge r12, r12, #1
ldrb r9, [lr, r5]
b 4f
......@@ -160,23 +183,28 @@ function ff_decode_block_coeffs_armv6, export=1
mov r12, #7
mov r0, #165
rac_get_prob r5, r6, r7, r8, r0, r9, r10
it ge
addge r12, r12, #2
ldrb r9, [lr, r5]
mov r0, #145
rac_get_prob r5, r6, r7, r8, r0, r9, r10
it ge
addge r12, r12, #1
ldrb r9, [lr, r5]
b 4f
3:
ldrb r0, [r4, #8]
rac_get_prob r5, r6, r7, r8, r0, r9, r10
it ge
addge r4, r4, #1
ldrb r9, [lr, r5]
ite ge
movge r12, #2
movlt r12, #0
ldrb r0, [r4, #9]
rac_get_prob r5, r6, r7, r8, r0, r9, r10
mov r9, #8
it ge
addge r12, r12, #1
movrel r4, X(ff_vp8_dct_cat_prob)
lsl r9, r9, r12
......@@ -189,6 +217,7 @@ function ff_decode_block_coeffs_armv6, export=1
lsl r1, r1, #1
rac_get_prob r5, r6, r7, r8, r0, r9, r10
ldrb r0, [r4], #1
it ge
addge r1, r1, #1
cmp r0, #0
bne 1b
......@@ -200,6 +229,7 @@ function ff_decode_block_coeffs_armv6, export=1
add r4, r2, r4
add r4, r4, #22
rac_get_128 r5, r6, r7, r8, r9, r10
it ge
rsbge r12, r12, #0
smulbb r12, r12, r11
movrel r9, zigzag_scan-1
......
......@@ -746,14 +746,14 @@ function ff_put_vp8_pixels4_neon, export=1
push {r4-r6,lr}
1:
subs r12, r12, #4
ldr r4, [r2], r3
ldr r5, [r2], r3
ldr r6, [r2], r3
ldr lr, [r2], r3
str r4, [r0], r1
str r5, [r0], r1
str r6, [r0], r1
str lr, [r0], r1
ldr_post r4, r2, r3
ldr_post r5, r2, r3
ldr_post r6, r2, r3
ldr_post lr, r2, r3
str_post r4, r0, r1
str_post r5, r0, r1
str_post r6, r0, r1
str_post lr, r0, r1
bgt 1b
pop {r4-r6,pc}
endfunc
......
......@@ -36,6 +36,7 @@ static av_always_inline av_const int FASTDIV(int a, int b)
int r;
__asm__ ("cmp %2, #2 \n\t"
"ldr %0, [%3, %2, lsl #2] \n\t"
"ite le \n\t"
"lsrle %0, %1, #1 \n\t"
"smmulgt %0, %0, %1 \n\t"
: "=&r"(r) : "r"(a), "r"(b), "r"(ff_inverse) : "cc");
......@@ -101,6 +102,7 @@ static av_always_inline av_const int32_t av_clipl_int32_arm(int64_t a)
{
int x, y;
__asm__ ("adds %1, %R2, %Q2, lsr #31 \n\t"
"itet ne \n\t"
"mvnne %1, #1<<31 \n\t"
"moveq %0, %Q2 \n\t"
"eorne %0, %1, %R2, asr #31 \n\t"
......