Commit 1c67ad9d authored by Yu Xiaolei's avatar Yu Xiaolei Committed by Michael Niedermayer

swscale: NEON optimized unscaled rgba to nv12 conversion

Signed-off-by: 's avatarYu Xiaolei <dreifachstein@gmail.com>
Signed-off-by: 's avatarMichael Niedermayer <michaelni@gmx.at>
parent ffbcb1c6
OBJS += arm/swscale_unscaled.o
NEON-OBJS += arm/rgb2yuv_neon_32.o
NEON-OBJS += arm/rgb2yuv_neon_16.o
/*
* Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "rgb2yuv_neon_common.S"
/* downsampled R16G16B16 x8 */
alias_qw r16x8, q7
alias_qw g16x8, q8
alias_qw b16x8, q9
alias n16x16_l, q11
alias n16x16_h, q12
alias y16x16_l, q13
alias y16x16_h, q14
alias_qw y8x16, q15
.macro init src
vld3.i32 {q13_l, q14_l, q15_l}, [\src]!
vld3.i32 {q13_h[0], q14_h[0], q15_h[0]}, [\src]
vrshrn.i32 CO_R, q13, #7
vrshrn.i32 CO_G, q14, #7
vrshrn.i32 CO_B, q15, #7
vmov.u8 BIAS_Y, #16
vmov.u8 BIAS_U, #128
.endm
.macro compute_y_16x1_step action, s8x16, coeff
vmovl.u8 n16x16_l, \s8x16\()_l
vmovl.u8 n16x16_h, \s8x16\()_h
\action y16x16_l, n16x16_l, \coeff
\action y16x16_h, n16x16_h, \coeff
.endm
.macro compute_y_16x1
compute_y_16x1_step vmul, r8x16, CO_RY
compute_y_16x1_step vmla, g8x16, CO_GY
compute_y_16x1_step vmla, b8x16, CO_BY
vrshrn.i16 y8x16_l, y16x16_l, #8
vrshrn.i16 y8x16_h, y16x16_h, #8
vadd.u8 y8x16, y8x16, BIAS_Y
.endm
alias c16x8, q15
alias_qw c8x8x2, q10
.macro compute_chroma_8x1 c, C
vmul c16x8, r16x8, CO_R\C
vmla c16x8, g16x8, CO_G\C
vmla c16x8, b16x8, CO_B\C
vrshrn.i16 \c\()8x8, c16x8, #8
vadd.u8 \c\()8x8, \c\()8x8, BIAS_\C
.endm
loop_420sp rgbx, nv12, init, kernel_420_16x2, 16
/*
* Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "rgb2yuv_neon_common.S"
/* downsampled R16G16B16 x8 */
alias_qw r16x8, q7
alias_qw g16x8, q8
alias_qw b16x8, q9
alias n16x16_o, q11
alias n16x16_ol, q11_l
alias n16x16_oh, q11_h
alias y32x16_el, q12
alias y32x16_eh, q13
alias y32x16_ol, q14
alias y32x16_oh, q15
alias y16x16_e, q12
alias y16x16_el, q12_l
alias y16x16_eh, q12_h
alias y16x16_o, q13
alias y16x16_ol, q13_l
alias y16x16_oh, q13_h
alias y8x16, y16x16_e
.macro init src
// load s32x3x3, narrow to s16x3x3
vld3.i32 {q13_l, q14_l, q15_l}, [\src]!
vld3.i32 {q13_h[0], q14_h[0], q15_h[0]}, [\src]
vmovn.i32 CO_R, q13
vmovn.i32 CO_G, q14
vmovn.i32 CO_B, q15
vmov.u8 BIAS_Y, #16
vmov.u8 BIAS_U, #128
.endm
.macro compute_y_16x1_step action, s8x16, coeff
vmov.u8 n16x16_o, #0
vtrn.u8 \s8x16, n16x16_o
\action y32x16_el, \s8x16\()_l, \coeff
\action y32x16_eh, \s8x16\()_h, \coeff
\action y32x16_ol, n16x16_ol, \coeff
\action y32x16_oh, n16x16_oh, \coeff
.endm
/*
* in: r8x16, g8x16, b8x16
* out: y8x16
* clobber: q11-q15, r8x16, g8x16, b8x16
*/
.macro compute_y_16x1
compute_y_16x1_step vmull, r8x16, CO_RY
compute_y_16x1_step vmlal, g8x16, CO_GY
compute_y_16x1_step vmlal, b8x16, CO_BY
vrshrn.i32 y16x16_el, y32x16_el, #15
vrshrn.i32 y16x16_eh, y32x16_eh, #15
vrshrn.i32 y16x16_ol, y32x16_ol, #15
vrshrn.i32 y16x16_oh, y32x16_oh, #15
vtrn.8 y16x16_e, y16x16_o
vadd.u8 y8x16, y8x16, BIAS_Y
.endm
alias c32x8_l, q14
alias c32x8_h, q15
alias_qw c16x8, q13
alias_qw c8x8x2, q10
.macro compute_chroma_8x1_step action, s16x8, coeff
\action c32x8_l, \s16x8\()_l, \coeff
\action c32x8_h, \s16x8\()_h, \coeff
.endm
/*
* in: r16x8, g16x8, b16x8
* out: c8x8
* clobber: q14-q15
*/
.macro compute_chroma_8x1 c, C
compute_chroma_8x1_step vmull, r16x8, CO_R\C
compute_chroma_8x1_step vmlal, g16x8, CO_G\C
compute_chroma_8x1_step vmlal, b16x8, CO_B\C
vrshrn.i32 c16x8_l, c32x8_l, #15
vrshrn.i32 c16x8_h, c32x8_h, #15
vmovn.i16 \c\()8x8, c16x8
vadd.u8 \c\()8x8, \c\()8x8, BIAS_\C
.endm
loop_420sp rgbx, nv12, init, kernel_420_16x2, 32
/*
* Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
.macro alias name, tgt, set=1
.if \set != 0
\name .req \tgt
.else
.unreq \name
.endif
.endm
.altmacro
.macro alias_dw_all qw, dw_l, dw_h
alias q\qw\()_l, d\dw_l
alias q\qw\()_h, d\dw_h
.if \qw < 15
alias_dw_all %(\qw + 1), %(\dw_l + 2), %(\dw_h + 2)
.endif
.endm
alias_dw_all 0, 0, 1
.noaltmacro
.macro alias_qw name, qw, set=1
alias \name\(), \qw, \set
alias \name\()_l, \qw\()_l, \set
alias \name\()_h, \qw\()_h, \set
.endm
.macro prologue
push {r4-r12, lr}
vpush {q4-q7}
.endm
.macro epilogue
vpop {q4-q7}
pop {r4-r12, pc}
.endm
.macro load_arg reg, ix
ldr \reg, [sp, #((10 * 4 + 4 * 16) + (\ix - 4) * 4)]
.endm
/* ()_to_()_neon(const uint8_t *src, uint8_t *y, uint8_t *chroma
* int width, int height,
* int y_stride, int c_stride, int src_stride,
* int32_t coeff_table[9]);
*/
.macro alias_loop_420sp set=1
alias src, r0, \set
alias src0, src, \set
alias y, r1, \set
alias y0, y, \set
alias chroma, r2, \set
alias width, r3, \set
alias header, width, \set
alias height, r4, \set
alias y_stride, r5, \set
alias c_stride, r6, \set
alias c_padding, c_stride, \set
alias src_stride, r7, \set
alias y0_end, r8, \set
alias src_padding,r9, \set
alias y_padding, r10, \set
alias src1, r11, \set
alias y1, r12, \set
alias coeff_table,r12, \set
.endm
.macro loop_420sp s_fmt, d_fmt, init, kernel, precision
function \s_fmt\()_to_\d_fmt\()_neon_\precision, export=1
prologue
alias_loop_420sp
load_arg height, 4
load_arg y_stride, 5
load_arg c_stride, 6
load_arg src_stride, 7
load_arg coeff_table, 8
\init coeff_table
sub y_padding, y_stride, width
sub c_padding, c_stride, width
sub src_padding, src_stride, width, LSL #2
add y0_end, y0, width
and header, width, #15
add y1, y0, y_stride
add src1, src0, src_stride
0:
cmp header, #0
beq 1f
\kernel \s_fmt, \d_fmt, src0, src1, y0, y1, chroma, header
1:
\kernel \s_fmt, \d_fmt, src0, src1, y0, y1, chroma
cmp y0, y0_end
blt 1b
2:
add y0, y1, y_padding
add y0_end, y1, y_stride
add chroma, chroma, c_padding
add src0, src1, src_padding
add y1, y0, y_stride
add src1, src0, src_stride
subs height, height, #2
bgt 0b
epilogue
alias_loop_420sp 0
endfunc
.endm
.macro downsample
vpaddl.u8 r16x8, r8x16
vpaddl.u8 g16x8, g8x16
vpaddl.u8 b16x8, b8x16
.endm
/* acculumate and right shift by 2 */
.macro downsample_ars2
vpadal.u8 r16x8, r8x16
vpadal.u8 g16x8, g8x16
vpadal.u8 b16x8, b8x16
vrshr.u16 r16x8, r16x8, #2
vrshr.u16 g16x8, g16x8, #2
vrshr.u16 b16x8, b16x8, #2
.endm
.macro store_y8_16x1 dst, count
.if \count == 0
vstmia \dst!, {y8x16}
.else
vstmia \dst, {y8x16}
add \dst, \dst, \count
.endif
.endm
.macro store_chroma_nv12_8x1 dst, count
.if \count == 0
vst2.i8 {u8x8, v8x8}, [\dst]!
.else
vst2.i8 {u8x8, v8x8}, [\dst], \count
.endif
.endm
.macro store_chroma_nv21_8x1 dst, count
.if \count == 0
vst2.i8 {v8x8, u8x8}, [\dst]!
.else
vst2.i8 {v8x8, u8x8}, [\dst], \count
.endif
.endm
.macro load_8888_16x1 a, b, c, d, src, count
.if \count == 0
vld4.8 {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l}, [\src]!
vld4.8 {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h}, [\src]!
.else
vld4.8 {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l}, [\src]!
vld4.8 {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h}, [\src]
sub \src, \src, #32
add \src, \src, \count, LSL #2
.endif
.endm
.macro load_rgbx_16x1 src, count
load_8888_16x1 r, g, b, x, \src, \count
.endm
.macro load_bgrx_16x1 src, count
load_8888_16x1 b, g, r, x, \src, \count
.endm
.macro alias_src_rgbx set
alias_src_8888 r, g, b, x, \set
.endm
.macro alias_src_bgrx set
alias_src_8888 b, g, r, x, \set
.endm
.macro alias_dst_nv12 set
alias u8x8, c8x8x2_l, \set
alias v8x8, c8x8x2_h, \set
.endm
.macro alias_dst_nv21 set
alias v8x8, c8x8x2_l, \set
alias u8x8, c8x8x2_h, \set
.endm
// common aliases
alias CO_R d0
CO_RY .dn d0.s16[0]
CO_RU .dn d0.s16[1]
CO_RV .dn d0.s16[2]
alias CO_G d1
CO_GY .dn d1.s16[0]
CO_GU .dn d1.s16[1]
CO_GV .dn d1.s16[2]
alias CO_B d2
CO_BY .dn d2.s16[0]
CO_BU .dn d2.s16[1]
CO_BV .dn d2.s16[2]
alias BIAS_U, d3
alias BIAS_V, BIAS_U
alias BIAS_Y, q2
/* q3-q6 R8G8B8X8 x16 */
.macro alias_src_8888 a, b, c, d, set
alias_qw \a\()8x16, q3, \set
alias_qw \b\()8x16, q4, \set
alias_qw \c\()8x16, q5, \set
alias_qw \d\()8x16, q6, \set
.endm
.macro kernel_420_16x2 rgb_fmt, yuv_fmt, rgb0, rgb1, y0, y1, chroma, count=0
alias_src_\rgb_fmt
alias_dst_\yuv_fmt
load_\rgb_fmt\()_16x1 \rgb0, \count
downsample
compute_y_16x1
store_y8_16x1 \y0, \count
load_\rgb_fmt\()_16x1 \rgb1, \count
downsample_ars2
compute_y_16x1
store_y8_16x1 \y1, \count
compute_chroma_8x1 u, U
compute_chroma_8x1 v, V
store_chroma_\yuv_fmt\()_8x1 \chroma, \count
alias_dst_\yuv_fmt 0
alias_src_\rgb_fmt 0
.endm
/*
* Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/arm/cpu.h"
extern void rgbx_to_nv12_neon_32(const uint8_t *src, uint8_t *y, uint8_t *chroma,
int width, int height,
int y_stride, int c_stride, int src_stride,
int32_t coeff_tbl[9]);
extern void rgbx_to_nv12_neon_16(const uint8_t *src, uint8_t *y, uint8_t *chroma,
int width, int height,
int y_stride, int c_stride, int src_stride,
int32_t coeff_tbl[9]);
static int rgbx_to_nv12_neon_32_wrapper(SwsContext *context, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[]) {
rgbx_to_nv12_neon_32(src[0] + srcSliceY * srcStride[0],
dst[0] + srcSliceY * dstStride[0],
dst[1] + (srcSliceY / 2) * dstStride[1],
context->srcW, srcSliceH,
dstStride[0], dstStride[1], srcStride[0],
context->input_rgb2yuv_table);
return 0;
}
static int rgbx_to_nv12_neon_16_wrapper(SwsContext *context, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[]) {
rgbx_to_nv12_neon_16(src[0] + srcSliceY * srcStride[0],
dst[0] + srcSliceY * dstStride[0],
dst[1] + (srcSliceY / 2) * dstStride[1],
context->srcW, srcSliceH,
dstStride[0], dstStride[1], srcStride[0],
context->input_rgb2yuv_table);
return 0;
}
static void get_unscaled_swscale_neon(SwsContext *c) {
int accurate_rnd = c->flags & SWS_ACCURATE_RND;
if (c->srcFormat == AV_PIX_FMT_RGBA
&& c->dstFormat == AV_PIX_FMT_NV12
&& (c->srcW >= 16)) {
c->swscale = accurate_rnd ? rgbx_to_nv12_neon_32_wrapper
: rgbx_to_nv12_neon_16_wrapper;
}
}
void ff_get_unscaled_swscale_arm(SwsContext *c)
{
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags))
get_unscaled_swscale_neon(c);
}
......@@ -835,6 +835,7 @@ extern const AVClass sws_context_class;
void ff_get_unscaled_swscale(SwsContext *c);
void ff_get_unscaled_swscale_bfin(SwsContext *c);
void ff_get_unscaled_swscale_ppc(SwsContext *c);
void ff_get_unscaled_swscale_arm(SwsContext *c);
/**
* Return function pointer to fastest main scaler path function depending
......
......@@ -1384,6 +1384,9 @@ void ff_get_unscaled_swscale(SwsContext *c)
ff_get_unscaled_swscale_bfin(c);
if (ARCH_PPC)
ff_get_unscaled_swscale_ppc(c);
if (ARCH_ARM)
ff_get_unscaled_swscale_arm(c);
}
/* Convert the palette to the same packed 32-bit format as the palette */
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment