Commit bd53b426 authored by Måns Rullgård

ARM: NEON optimised H.264 weighted prediction

Originally committed as revision 16771 to svn://svn.ffmpeg.org/ffmpeg/trunk
parent 5a29589b
...@@ -92,6 +92,23 @@ void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, ...@@ -92,6 +92,23 @@ void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
int beta, int8_t *tc0); int beta, int8_t *tc0);
/*
 * NEON H.264 weighted-prediction entry points, one per block size; the
 * WxH suffix in each name is the block width and height in pixels.
 * All eight share one signature: a pixel pointer, the stride, log2_den,
 * weight and offset. There is no separate source pointer; the assembly
 * implementation reads rows through the pointer and writes the results
 * back through a copy of the same pointer.
 */
void ff_weight_h264_pixels_16x16_neon(uint8_t *ds, int stride, int log2_den,
int weight, int offset);
void ff_weight_h264_pixels_16x8_neon(uint8_t *ds, int stride, int log2_den,
int weight, int offset);
void ff_weight_h264_pixels_8x16_neon(uint8_t *ds, int stride, int log2_den,
int weight, int offset);
void ff_weight_h264_pixels_8x8_neon(uint8_t *ds, int stride, int log2_den,
int weight, int offset);
void ff_weight_h264_pixels_8x4_neon(uint8_t *ds, int stride, int log2_den,
int weight, int offset);
void ff_weight_h264_pixels_4x8_neon(uint8_t *ds, int stride, int log2_den,
int weight, int offset);
void ff_weight_h264_pixels_4x4_neon(uint8_t *ds, int stride, int log2_den,
int weight, int offset);
void ff_weight_h264_pixels_4x2_neon(uint8_t *ds, int stride, int log2_den,
int weight, int offset);
void ff_biweight_h264_pixels_16x16_neon(uint8_t *dst, uint8_t *src, int stride, void ff_biweight_h264_pixels_16x16_neon(uint8_t *dst, uint8_t *src, int stride,
int log2_den, int weightd, int weights, int log2_den, int weightd, int weights,
int offset); int offset);
...@@ -201,6 +218,15 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx) ...@@ -201,6 +218,15 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon; c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon; c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16x16_neon;
c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_16x8_neon;
c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_8x16_neon;
c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels_8x8_neon;
c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels_8x4_neon;
c->weight_h264_pixels_tab[5] = ff_weight_h264_pixels_4x8_neon;
c->weight_h264_pixels_tab[6] = ff_weight_h264_pixels_4x4_neon;
c->weight_h264_pixels_tab[7] = ff_weight_h264_pixels_4x2_neon;
c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16x16_neon; c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16x16_neon;
c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_16x8_neon; c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_16x8_neon;
c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_8x16_neon; c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_8x16_neon;
......
...@@ -1536,3 +1536,135 @@ function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1 ...@@ -1536,3 +1536,135 @@ function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
biweight_entry 4, 2 biweight_entry 4, 2
biweight_entry 4, 4, b=0 biweight_entry 4, 4, b=0
biweight_func 4 biweight_func 4
@ Weighted prediction
@ weight_16: loop for 16-pixel-wide rows, two rows per pass.
@   mac = widening multiply-accumulate instruction, used in the form
@         mac qAcc, d0, dPixels (expanded with vmlal.u8 or vmlsl.u8).
@ Registers read by this macro:
@   r0 = load pointer, r4 = store pointer, r1 = post-increment applied to
@        both after every row, r3 = multiplier (low byte is used),
@   ip = number of rows, q8 = value each accumulator starts from,
@   q9 = per-lane shift count for vshl.s16.
@ The :128 qualifiers tell the core that every row address is 16-byte aligned.
@ The expansion ends in pop {r4, pc}, matching a push {r4, lr} made by the
@ code that expands it.
.macro weight_16 mac
@ Replicate the multiplier into all eight byte lanes of d0.
vdup.8 d0, r3
@ Start the first-row accumulators (q2 = pixels 0-7, q3 = pixels 8-15) at q8.
vmov q2, q8
vmov q3, q8
@ The flags set by this subs are the ones tested by the bne at the bottom;
@ nothing in between writes the condition flags.
1: subs ip, ip, #2
@ First row: 16 pixels into d20-d21, then accumulate each half.
vld1.8 {d20-d21},[r0,:128], r1
\mac q2, d0, d20
pld [r0]
\mac q3, d0, d21
@ Second row: accumulators q12/q13 are seeded from q8 on every pass.
vmov q12, q8
vld1.8 {d28-d29},[r0,:128], r1
vmov q13, q8
\mac q12, d0, d28
pld [r0]
\mac q13, d0, d29
@ Shift the first row by q9, then narrow with unsigned saturation.
@ d4 and d5 are the two halves of q2, so q2 now holds the 16 output bytes.
vshl.s16 q2, q2, q9
vshl.s16 q3, q3, q9
vqmovun.s16 d4, q2
vqmovun.s16 d5, q3
@ Same for the second row, packed into d24-d25 (= q12).
vshl.s16 q12, q12, q9
vshl.s16 q13, q13, q9
vqmovun.s16 d24, q12
vqmovun.s16 d25, q13
@ Re-seed q3 now; q2 can only be re-seeded after d4-d5 have been stored.
vmov q3, q8
vst1.8 {d4- d5}, [r4,:128], r1
vmov q2, q8
vst1.8 {d24-d25},[r4,:128], r1
@ Loop until the row counter reaches zero.
bne 1b
pop {r4, pc}
.endm
@ weight_8: loop for 8-pixel-wide rows, two rows per pass.
@   mac = widening multiply-accumulate instruction, used in the form
@         mac qAcc, d0, dPixels (expanded with vmlal.u8 or vmlsl.u8).
@ Registers read by this macro:
@   r0 = load pointer, r4 = store pointer, r1 = post-increment applied to
@        both after every row, r3 = multiplier (low byte is used),
@   ip = number of rows, q8 = value each accumulator starts from,
@   q9 = per-lane shift count for vshl.s16.
@ The :64 qualifiers tell the core that every row address is 8-byte aligned.
@ The expansion ends in pop {r4, pc}, matching a push {r4, lr} made by the
@ code that expands it.
.macro weight_8 mac
@ Replicate the multiplier into all eight byte lanes of d0.
vdup.8 d0, r3
@ q1 accumulates the first row of each pair, q10 the second.
vmov q1, q8
vmov q10, q8
@ The flags set by this subs are the ones tested by the bne at the bottom;
@ nothing in between writes the condition flags.
1: subs ip, ip, #2
vld1.8 {d4},[r0,:64], r1
\mac q1, d0, d4
pld [r0]
vld1.8 {d6},[r0,:64], r1
\mac q10, d0, d6
pld [r0]
@ Shift by q9 and narrow with unsigned saturation. d2 is the low half of
@ q1; d4 is free again because the first row has already been accumulated.
vshl.s16 q1, q1, q9
vqmovun.s16 d2, q1
vshl.s16 q10, q10, q9
vqmovun.s16 d4, q10
@ Re-seed the accumulators for the next pass; q1 only after d2 is stored.
vmov q10, q8
vst1.8 {d2},[r4,:64], r1
vmov q1, q8
vst1.8 {d4},[r4,:64], r1
@ Loop until the row counter reaches zero.
bne 1b
pop {r4, pc}
.endm
@ weight_4: loop for 4-pixel-wide rows. Two rows are packed into the two
@ 32-bit lanes of one d register, so a full pass handles four rows.
@   mac = widening multiply-accumulate instruction, used in the form
@         mac qAcc, d0, dPixels (expanded with vmlal.u8 or vmlsl.u8).
@ Registers read by this macro:
@   r0 = load pointer, r4 = store pointer, r1 = post-increment applied to
@        both after every row, r3 = multiplier (low byte is used),
@   ip = number of rows, q8 = value each accumulator starts from,
@   q9 = per-lane shift count for vshl.s16.
@ The :32 qualifiers tell the core that every row address is 4-byte aligned.
@ Both exits end in pop {r4, pc}, matching a push {r4, lr} made by the
@ code that expands it.
.macro weight_4 mac
@ Replicate the multiplier into all eight byte lanes of d0.
vdup.8 d0, r3
@ q1 accumulates rows 0-1 of each group of four, q10 rows 2-3.
vmov q1, q8
vmov q10, q8
@ The flags set by this subs feed both the blt below and the bne at the
@ bottom; nothing in between writes the condition flags.
1: subs ip, ip, #4
@ Rows 0 and 1 go into lanes 0 and 1 of d4.
vld1.32 {d4[0]},[r0,:32], r1
vld1.32 {d4[1]},[r0,:32], r1
\mac q1, d0, d4
pld [r0]
@ Counter went negative: fewer than four rows were left (the 4x2 entry
@ point starts with ip = 2), so finish the two rows already accumulated.
blt 2f
@ Rows 2 and 3 go into lanes 0 and 1 of d6.
vld1.32 {d6[0]},[r0,:32], r1
vld1.32 {d6[1]},[r0,:32], r1
\mac q10, d0, d6
pld [r0]
@ Shift by q9 and narrow with unsigned saturation.
vshl.s16 q1, q1, q9
vqmovun.s16 d2, q1
vshl.s16 q10, q10, q9
vqmovun.s16 d4, q10
@ Re-seed the accumulators for the next pass; q1 only after d2 is stored.
vmov q10, q8
vst1.32 {d2[0]},[r4,:32], r1
vst1.32 {d2[1]},[r4,:32], r1
vmov q1, q8
vst1.32 {d4[0]},[r4,:32], r1
vst1.32 {d4[1]},[r4,:32], r1
@ Loop until the row counter reaches zero.
bne 1b
pop {r4, pc}
@ Two-row tail: shift, narrow and store rows 0 and 1 only.
2: vshl.s16 q1, q1, q9
vqmovun.s16 d2, q1
vst1.32 {d2[0]},[r4,:32], r1
vst1.32 {d2[1]},[r4,:32], r1
pop {r4, pc}
.endm
@ weight_func w: emits weight_h264_pixels_<w>_neon, the body shared by all
@ entry points of width w. It prepares the constants and then expands the
@ weight_<w> loop twice, once per sign of the weight.
@ Registers on entry, as used below:
@   r0 = pixel pointer, r1 = stride (consumed by the weight_<w> loop),
@   r2 = log2_den, r3 = weight, first stack word = offset,
@   ip = row count, loaded by the ff_weight_h264_pixels_<w>x<h>_neon stub.
@ Constants handed to the loop:
@   q8 = (offset << log2_den), plus 1 << (log2_den - 1) when log2_den >= 1,
@        replicated into every 16-bit lane
@   q9 = -log2_den in every 16-bit lane, so vshl.s16 shifts right
@   r4 = copy of r0 used as the store pointer
@ NOTE(review): the loop accumulates in 16-bit lanes and then shifts them as
@ signed values - confirm that q8 plus weight * pixel stays within the signed
@ 16-bit range for every input callers can pass.
@ NOTE(review): function and .endfunc are defined outside this view - they
@ are assumed to open and close a symbol definition.
.macro weight_func w
function weight_h264_pixels_\w\()_neon
push {r4, lr}
@ Two registers were just pushed, so the first stack argument is at sp + 8.
ldr r4, [sp, #8]
vdup.16 q9, r2
mov lr, #1
@ r4 = offset << log2_den
lsl r4, r4, r2
@ r2 = log2_den - 1; the flags from this subs drive the addge below.
subs r2, r2, #1
vneg.s16 q9, q9
@ Add the rounding term 1 << (log2_den - 1) unless log2_den - 1 is negative.
addge r4, r4, lr, lsl r2
@ Test the sign of the weight; the vdup and mov below leave the flags alone.
cmp r3, #0
vdup.16 q8, r4
mov r4, r0
blt 10f
@ Weight >= 0: accumulate with add. This expansion returns by itself.
weight_\w vmlal.u8
@ Weight < 0: negate it so its magnitude can be used as an unsigned byte,
@ and accumulate with subtract instead.
10: rsb r3, r3, #0
weight_\w vmlsl.u8
.endfunc
.endm
@ weight_entry w, h, b=1: emits the exported entry point
@ ff_weight_h264_pixels_<w>x<h>_neon. It loads the row count h into ip and
@ hands over to the shared body weight_h264_pixels_<w>_neon.
@   b = 1 (default): reach the shared body with a branch.
@   b = 0: emit no branch, so execution falls through into whatever is
@          assembled next. Such an entry must be placed directly in front
@          of the matching weight_func expansion.
@ NOTE(review): function and .endfunc are defined outside this view - they
@ are assumed to open and close a symbol definition without emitting any
@ instruction that would break the fall-through.
.macro weight_entry w, h, b=1
function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
mov ip, #\h
.if \b
b weight_h264_pixels_\w\()_neon
.endif
.endfunc
.endm
@ Instantiate the eight exported entry points and the three shared bodies.
@ The order matters: in each width group the last entry is emitted with b=0
@ and therefore falls through into the weight_func body that follows it.

@ 16-pixel-wide blocks: 16x8 branches, 16x16 falls through.
weight_entry 16, 8
weight_entry 16, 16, b=0
weight_func 16
@ 8-pixel-wide blocks: 8x16 and 8x4 branch, 8x8 falls through.
weight_entry 8, 16
weight_entry 8, 4
weight_entry 8, 8, b=0
weight_func 8
@ 4-pixel-wide blocks: 4x8 and 4x2 branch, 4x4 falls through.
weight_entry 4, 8
weight_entry 4, 2
weight_entry 4, 4, b=0
weight_func 4
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment