Commit f5c05b9a authored by Janne Grunau, committed by Mans Rullgard

rv40: NEON optimised chroma MC

Signed-off-by: Mans Rullgard <mans@mansr.com>
parent f054a827
...@@ -68,6 +68,8 @@ NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_init_neon.o \ ...@@ -68,6 +68,8 @@ NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_init_neon.o \
NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_init_neon.o \ NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_init_neon.o \
arm/rv34dsp_neon.o \ arm/rv34dsp_neon.o \
arm/rv40dsp_init_neon.o \
arm/h264cmc_neon.o \
NEON-OBJS-$(CONFIG_VP3_DECODER) += arm/vp3dsp_neon.o NEON-OBJS-$(CONFIG_VP3_DECODER) += arm/vp3dsp_neon.o
......
...@@ -21,8 +21,8 @@ ...@@ -21,8 +21,8 @@
#include "asm.S" #include "asm.S"
/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */ /* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
.macro h264_chroma_mc8 type .macro h264_chroma_mc8 type, codec=h264
function ff_\type\()_h264_chroma_mc8_neon, export=1 function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
push {r4-r7, lr} push {r4-r7, lr}
ldrd r4, [sp, #20] ldrd r4, [sp, #20]
.ifc \type,avg .ifc \type,avg
...@@ -31,6 +31,15 @@ function ff_\type\()_h264_chroma_mc8_neon, export=1 ...@@ -31,6 +31,15 @@ function ff_\type\()_h264_chroma_mc8_neon, export=1
pld [r1] pld [r1]
pld [r1, r2] pld [r1, r2]
.ifc \codec,rv40
movrel r6, rv40bias
lsr r7, r5, #1
add r6, r6, r7, lsl #3
lsr r7, r4, #1
add r6, r6, r7, lsl #1
vld1.16 {d22[],d23[]}, [r6,:16]
.endif
A muls r7, r4, r5 A muls r7, r4, r5
T mul r7, r4, r5 T mul r7, r4, r5
T cmp r7, #0 T cmp r7, #0
...@@ -67,10 +76,17 @@ T cmp r7, #0 ...@@ -67,10 +76,17 @@ T cmp r7, #0
vmlal.u8 q9, d7, d1 vmlal.u8 q9, d7, d1
vmlal.u8 q9, d4, d2 vmlal.u8 q9, d4, d2
vmlal.u8 q9, d5, d3 vmlal.u8 q9, d5, d3
vrshrn.u16 d16, q8, #6
vld1.8 {d6, d7}, [r5], r4 vld1.8 {d6, d7}, [r5], r4
pld [r1] pld [r1]
.ifc \codec,h264
vrshrn.u16 d16, q8, #6
vrshrn.u16 d17, q9, #6 vrshrn.u16 d17, q9, #6
.else
vadd.u16 q8, q8, q11
vadd.u16 q9, q9, q11
vshrn.u16 d16, q8, #6
vshrn.u16 d17, q9, #6
.endif
.ifc \type,avg .ifc \type,avg
vld1.8 {d20}, [lr,:64], r2 vld1.8 {d20}, [lr,:64], r2
vld1.8 {d21}, [lr,:64], r2 vld1.8 {d21}, [lr,:64], r2
...@@ -102,8 +118,15 @@ T cmp r7, #0 ...@@ -102,8 +118,15 @@ T cmp r7, #0
vmull.u8 q9, d6, d0 vmull.u8 q9, d6, d0
vmlal.u8 q9, d4, d1 vmlal.u8 q9, d4, d1
vld1.8 {d6}, [r5], r4 vld1.8 {d6}, [r5], r4
.ifc \codec,h264
vrshrn.u16 d16, q8, #6 vrshrn.u16 d16, q8, #6
vrshrn.u16 d17, q9, #6 vrshrn.u16 d17, q9, #6
.else
vadd.u16 q8, q8, q11
vadd.u16 q9, q9, q11
vshrn.u16 d16, q8, #6
vshrn.u16 d17, q9, #6
.endif
.ifc \type,avg .ifc \type,avg
vld1.8 {d20}, [lr,:64], r2 vld1.8 {d20}, [lr,:64], r2
vld1.8 {d21}, [lr,:64], r2 vld1.8 {d21}, [lr,:64], r2
...@@ -131,8 +154,15 @@ T cmp r7, #0 ...@@ -131,8 +154,15 @@ T cmp r7, #0
vmlal.u8 q9, d7, d1 vmlal.u8 q9, d7, d1
pld [r1] pld [r1]
vext.8 d5, d4, d5, #1 vext.8 d5, d4, d5, #1
.ifc \codec,h264
vrshrn.u16 d16, q8, #6 vrshrn.u16 d16, q8, #6
vrshrn.u16 d17, q9, #6 vrshrn.u16 d17, q9, #6
.else
vadd.u16 q8, q8, q11
vadd.u16 q9, q9, q11
vshrn.u16 d16, q8, #6
vshrn.u16 d17, q9, #6
.endif
.ifc \type,avg .ifc \type,avg
vld1.8 {d20}, [lr,:64], r2 vld1.8 {d20}, [lr,:64], r2
vld1.8 {d21}, [lr,:64], r2 vld1.8 {d21}, [lr,:64], r2
...@@ -149,8 +179,8 @@ endfunc ...@@ -149,8 +179,8 @@ endfunc
.endm .endm
/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */ /* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
.macro h264_chroma_mc4 type .macro h264_chroma_mc4 type, codec=h264
function ff_\type\()_h264_chroma_mc4_neon, export=1 function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
push {r4-r7, lr} push {r4-r7, lr}
ldrd r4, [sp, #20] ldrd r4, [sp, #20]
.ifc \type,avg .ifc \type,avg
...@@ -159,6 +189,15 @@ function ff_\type\()_h264_chroma_mc4_neon, export=1 ...@@ -159,6 +189,15 @@ function ff_\type\()_h264_chroma_mc4_neon, export=1
pld [r1] pld [r1]
pld [r1, r2] pld [r1, r2]
.ifc \codec,rv40
movrel r6, rv40bias
lsr r7, r5, #1
add r6, r6, r7, lsl #3
lsr r7, r4, #1
add r6, r6, r7, lsl #1
vld1.16 {d22[],d23[]}, [r6,:16]
.endif
A muls r7, r4, r5 A muls r7, r4, r5
T mul r7, r4, r5 T mul r7, r4, r5
T cmp r7, #0 T cmp r7, #0
...@@ -199,7 +238,12 @@ T cmp r7, #0 ...@@ -199,7 +238,12 @@ T cmp r7, #0
vld1.8 {d6}, [r5], r4 vld1.8 {d6}, [r5], r4
vadd.i16 d16, d16, d17 vadd.i16 d16, d16, d17
vadd.i16 d17, d18, d19 vadd.i16 d17, d18, d19
.ifc \codec,h264
vrshrn.u16 d16, q8, #6 vrshrn.u16 d16, q8, #6
.else
vadd.u16 q8, q8, q11
vshrn.u16 d16, q8, #6
.endif
subs r3, r3, #2 subs r3, r3, #2
pld [r1] pld [r1]
.ifc \type,avg .ifc \type,avg
...@@ -236,7 +280,12 @@ T cmp r7, #0 ...@@ -236,7 +280,12 @@ T cmp r7, #0
vld1.32 {d4[1]}, [r5], r4 vld1.32 {d4[1]}, [r5], r4
vadd.i16 d16, d16, d17 vadd.i16 d16, d16, d17
vadd.i16 d17, d18, d19 vadd.i16 d17, d18, d19
.ifc \codec,h264
vrshrn.u16 d16, q8, #6 vrshrn.u16 d16, q8, #6
.else
vadd.u16 q8, q8, q11
vshrn.u16 d16, q8, #6
.endif
.ifc \type,avg .ifc \type,avg
vld1.32 {d20[0]}, [lr,:32], r2 vld1.32 {d20[0]}, [lr,:32], r2
vld1.32 {d20[1]}, [lr,:32], r2 vld1.32 {d20[1]}, [lr,:32], r2
...@@ -266,7 +315,12 @@ T cmp r7, #0 ...@@ -266,7 +315,12 @@ T cmp r7, #0
vadd.i16 d16, d16, d17 vadd.i16 d16, d16, d17
vadd.i16 d17, d18, d19 vadd.i16 d17, d18, d19
pld [r1] pld [r1]
.ifc \codec,h264
vrshrn.u16 d16, q8, #6 vrshrn.u16 d16, q8, #6
.else
vadd.u16 q8, q8, q11
vshrn.u16 d16, q8, #6
.endif
.ifc \type,avg .ifc \type,avg
vld1.32 {d20[0]}, [lr,:32], r2 vld1.32 {d20[0]}, [lr,:32], r2
vld1.32 {d20[1]}, [lr,:32], r2 vld1.32 {d20[1]}, [lr,:32], r2
...@@ -352,9 +406,25 @@ function ff_\type\()_h264_chroma_mc2_neon, export=1 ...@@ -352,9 +406,25 @@ function ff_\type\()_h264_chroma_mc2_neon, export=1
endfunc endfunc
.endm .endm
#if CONFIG_H264_DECODER
h264_chroma_mc8 put h264_chroma_mc8 put
h264_chroma_mc8 avg h264_chroma_mc8 avg
h264_chroma_mc4 put h264_chroma_mc4 put
h264_chroma_mc4 avg h264_chroma_mc4 avg
h264_chroma_mc2 put h264_chroma_mc2 put
h264_chroma_mc2 avg h264_chroma_mc2 avg
#endif
#if CONFIG_RV40_DECODER
/*
 * Bias table for the rv40 variants of the chroma MC macros: 4 rows of
 * 4 halfwords. The macros address it as rv40bias[y >> 1][x >> 1]
 * (row stride 8 bytes, 2 bytes per entry), broadcast the selected entry
 * to every 16-bit lane of q11, add q11 to the 16-bit filter sums and then
 * narrow with a truncating shift (vshrn #6). The h264 variants use the
 * rounding narrow (vrshrn #6) instead and never read this table.
 * NOTE(review): x and y are presumably limited to 0..7 so the index stays
 * inside the 4x4 table -- confirm against the callers.
 */
const rv40bias
.short 0, 16, 32, 16
.short 32, 28, 32, 28
.short 0, 32, 16, 32
.short 32, 28, 32, 28
endconst
/*
 * Instantiate the 8- and 4-wide macros with codec=rv40, which emits
 * ff_{put,avg}_rv40_chroma_mc{8,4}_neon. The mc2 macro takes no codec
 * argument, so no rv40 mc2 function is generated.
 */
h264_chroma_mc8 put, rv40
h264_chroma_mc8 avg, rv40
h264_chroma_mc4 put, rv40
h264_chroma_mc4 avg, rv40
#endif
/*
* Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
*
* This file is part of Libav.
*
* Libav is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* Libav is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with Libav; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavcodec/avcodec.h"
#include "libavcodec/rv34dsp.h"
/*
 * RV40 chroma motion compensation routines provided by the NEON assembly
 * (the chroma_mc8/chroma_mc4 macros instantiated with codec=rv40).
 * Argument order follows the assembly's signature comment:
 * (dst, src, stride, h, x, y).
 */
void ff_put_rv40_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride,
                                 int h, int x, int y);
void ff_avg_rv40_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride,
                                 int h, int x, int y);
void ff_put_rv40_chroma_mc4_neon(uint8_t *dst, uint8_t *src, int stride,
                                 int h, int x, int y);
void ff_avg_rv40_chroma_mc4_neon(uint8_t *dst, uint8_t *src, int stride,
                                 int h, int x, int y);

/**
 * Point the chroma MC entries of an RV34 DSP context at the NEON versions.
 *
 * @param c   context whose put/avg chroma function tables are overwritten
 * @param dsp not used by this function
 */
void ff_rv40dsp_init_neon(RV34DSPContext *c, DSPContext *dsp)
{
    /* slot 0 holds the mc8 functions */
    c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_neon;
    c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_neon;

    /* slot 1 holds the mc4 functions */
    c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_neon;
    c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_neon;
}
...@@ -59,5 +59,6 @@ void ff_rv40dsp_init(RV34DSPContext *c, DSPContext* dsp); ...@@ -59,5 +59,6 @@ void ff_rv40dsp_init(RV34DSPContext *c, DSPContext* dsp);
void ff_rv34dsp_init_neon(RV34DSPContext *c, DSPContext *dsp); void ff_rv34dsp_init_neon(RV34DSPContext *c, DSPContext *dsp);
void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp); void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp);
void ff_rv40dsp_init_neon(RV34DSPContext *c, DSPContext *dsp);
#endif /* AVCODEC_RV34DSP_H */ #endif /* AVCODEC_RV34DSP_H */
...@@ -534,4 +534,6 @@ av_cold void ff_rv40dsp_init(RV34DSPContext *c, DSPContext* dsp) { ...@@ -534,4 +534,6 @@ av_cold void ff_rv40dsp_init(RV34DSPContext *c, DSPContext* dsp) {
if (HAVE_MMX) if (HAVE_MMX)
ff_rv40dsp_init_x86(c, dsp); ff_rv40dsp_init_x86(c, dsp);
if (HAVE_NEON)
ff_rv40dsp_init_neon(c, dsp);
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment