Commit e814015d authored by Måns Rullgård's avatar Måns Rullgård

ARM: NEON optimised vorbis_inverse_coupling

12% faster Vorbis decoding on Cortex-A8.

Originally committed as revision 19637 to svn://svn.ffmpeg.org/ffmpeg/trunk
parent 737cbcde
...@@ -161,6 +161,8 @@ void ff_vector_fmul_window_neon(float *dst, const float *src0, ...@@ -161,6 +161,8 @@ void ff_vector_fmul_window_neon(float *dst, const float *src0,
void ff_float_to_int16_neon(int16_t *, const float *, long); void ff_float_to_int16_neon(int16_t *, const float *, long);
void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int); void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
void ff_vorbis_inverse_coupling_neon(float *mag, float *ang, int blocksize);
void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
{ {
c->put_pixels_tab[0][0] = ff_put_pixels16_neon; c->put_pixels_tab[0][0] = ff_put_pixels16_neon;
...@@ -272,4 +274,7 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) ...@@ -272,4 +274,7 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
c->float_to_int16 = ff_float_to_int16_neon; c->float_to_int16 = ff_float_to_int16_neon;
c->float_to_int16_interleave = ff_float_to_int16_interleave_neon; c->float_to_int16_interleave = ff_float_to_int16_interleave_neon;
} }
if (CONFIG_VORBIS_DECODER)
c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_neon;
} }
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/ */
#include "config.h"
#include "asm.S" #include "asm.S"
preserve8 preserve8
...@@ -795,3 +796,66 @@ NOVFP ldr lr, [sp, #16] ...@@ -795,3 +796,66 @@ NOVFP ldr lr, [sp, #16]
vst1.64 {d22,d23},[ip,:128], r5 vst1.64 {d22,d23},[ip,:128], r5
pop {r4,r5,pc} pop {r4,r5,pc}
.endfunc .endfunc
#if CONFIG_VORBIS_DECODER
function ff_vorbis_inverse_coupling_neon, export=1
vmov.i32 q10, #1<<31
subs r2, r2, #4
tst r2, #4
mov r3, r0
mov r12, r1
beq 3f
vld1.32 {d24-d25},[r1,:128]!
vld1.32 {d22-d23},[r0,:128]!
vcle.s32 q8, q12, #0
vand q9, q11, q10
veor q12, q12, q9
vand q2, q12, q8
vbic q3, q12, q8
vadd.f32 q12, q11, q2
vsub.f32 q11, q11, q3
1: vld1.32 {d2-d3}, [r1,:128]!
vld1.32 {d0-d1}, [r0,:128]!
vcle.s32 q8, q1, #0
vand q9, q0, q10
veor q1, q1, q9
vst1.32 {d24-d25},[r3, :128]!
vst1.32 {d22-d23},[r12,:128]!
vand q2, q1, q8
vbic q3, q1, q8
vadd.f32 q1, q0, q2
vsub.f32 q0, q0, q3
subs r2, r2, #8
ble 2f
vld1.32 {d24-d25},[r1,:128]!
vld1.32 {d22-d23},[r0,:128]!
vcle.s32 q8, q12, #0
vand q9, q11, q10
veor q12, q12, q9
vst1.32 {d2-d3}, [r3, :128]!
vst1.32 {d0-d1}, [r12,:128]!
vand q2, q12, q8
vbic q3, q12, q8
vadd.f32 q12, q11, q2
vsub.f32 q11, q11, q3
b 1b
2: vst1.32 {d2-d3}, [r3, :128]!
vst1.32 {d0-d1}, [r12,:128]!
bxlt lr
3: vld1.32 {d2-d3}, [r1,:128]
vld1.32 {d0-d1}, [r0,:128]
vcle.s32 q8, q1, #0
vand q9, q0, q10
veor q1, q1, q9
vand q2, q1, q8
vbic q3, q1, q8
vadd.f32 q1, q0, q2
vsub.f32 q0, q0, q3
vst1.32 {d2-d3}, [r0,:128]!
vst1.32 {d0-d1}, [r1,:128]!
bx lr
.endfunc
#endif
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment