Commit a4d4bad7 authored by Martin Storsjö's avatar Martin Storsjö

arm: Add NEON optimizations for 10 and 12 bit vp9 MC

This work is sponsored by, and copyright, Google.

The plain pixel put/copy functions are used from the 8 bit version,
for the double size (e.g. put16 uses ff_vp9_copy32_neon), and a new
copy128 is added.

Compared with the 8 bit version, the filters can no longer use the
trick to accumulate in 16 bit with only saturation at the end, but now
the accumulators need to be 32 bit. This avoids the need to keep track
of which filter index is the largest though, reducing the size of the
executable code for these filters.

For the horizontal filters, we only do 4 or 8 pixels wide in parallel
(while doing two rows at a time), since we don't have enough register
space to filter 16 pixels wide.

For the vertical filters, we still do 4 and 8 pixels in parallel just
as in the 8 bit case, but we need to store the output after every 2
rows instead of after every 4 rows.

Examples of relative speedup compared to the C version, from checkasm:
                               Cortex    A7     A8     A9    A53
vp9_avg4_10bpp_neon:                   2.25   2.44   3.05   2.16
vp9_avg8_10bpp_neon:                   3.66   8.48   3.86   3.50
vp9_avg16_10bpp_neon:                  3.39   8.26   3.37   2.72
vp9_avg32_10bpp_neon:                  4.03  10.20   4.07   3.42
vp9_avg64_10bpp_neon:                  4.15  10.01   4.13   3.70
vp9_avg_8tap_smooth_4h_10bpp_neon:     3.38   6.22   3.41   4.75
vp9_avg_8tap_smooth_4hv_10bpp_neon:    3.89   6.39   4.30   5.32
vp9_avg_8tap_smooth_4v_10bpp_neon:     5.32   9.73   6.34   7.31
vp9_avg_8tap_smooth_8h_10bpp_neon:     4.45   9.40   4.68   6.87
vp9_avg_8tap_smooth_8hv_10bpp_neon:    4.64   8.91   5.44   6.47
vp9_avg_8tap_smooth_8v_10bpp_neon:     6.44  13.42   8.68   8.79
vp9_avg_8tap_smooth_64h_10bpp_neon:    4.66   9.02   4.84   7.71
vp9_avg_8tap_smooth_64hv_10bpp_neon:   4.61   9.14   4.92   7.10
vp9_avg_8tap_smooth_64v_10bpp_neon:    6.90  14.13   9.57  10.41
vp9_put4_10bpp_neon:                   1.33   1.46   2.09   1.33
vp9_put8_10bpp_neon:                   1.57   3.42   1.83   1.84
vp9_put16_10bpp_neon:                  1.55   4.78   2.17   1.89
vp9_put32_10bpp_neon:                  2.06   5.35   2.14   2.30
vp9_put64_10bpp_neon:                  3.00   2.41   1.95   1.66
vp9_put_8tap_smooth_4h_10bpp_neon:     3.19   5.81   3.31   4.63
vp9_put_8tap_smooth_4hv_10bpp_neon:    3.86   6.22   4.32   5.21
vp9_put_8tap_smooth_4v_10bpp_neon:     5.40   9.77   6.08   7.21
vp9_put_8tap_smooth_8h_10bpp_neon:     4.22   8.41   4.46   6.63
vp9_put_8tap_smooth_8hv_10bpp_neon:    4.56   8.51   5.39   6.25
vp9_put_8tap_smooth_8v_10bpp_neon:     6.60  12.43   8.17   8.89
vp9_put_8tap_smooth_64h_10bpp_neon:    4.41   8.59   4.54   7.49
vp9_put_8tap_smooth_64hv_10bpp_neon:   4.43   8.58   5.34   6.63
vp9_put_8tap_smooth_64v_10bpp_neon:    7.26  13.92   9.27  10.92

For the larger 8tap filters, the speedup vs C code is around 4-14x.
Signed-off-by: 's avatarMartin Storsjö <martin@martin.st>
parent cda9a3e8
......@@ -44,7 +44,9 @@ OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_init_arm.o
OBJS-$(CONFIG_RV40_DECODER) += arm/rv40dsp_init_arm.o
OBJS-$(CONFIG_VORBIS_DECODER) += arm/vorbisdsp_init_arm.o
OBJS-$(CONFIG_VP6_DECODER) += arm/vp6dsp_init_arm.o
OBJS-$(CONFIG_VP9_DECODER) += arm/vp9dsp_init_arm.o
OBJS-$(CONFIG_VP9_DECODER) += arm/vp9dsp_init_10bpp_arm.o \
arm/vp9dsp_init_12bpp_arm.o \
arm/vp9dsp_init_arm.o
# ARMv5 optimizations
......@@ -142,4 +144,5 @@ NEON-OBJS-$(CONFIG_VORBIS_DECODER) += arm/vorbisdsp_neon.o
NEON-OBJS-$(CONFIG_VP6_DECODER) += arm/vp6dsp_neon.o
NEON-OBJS-$(CONFIG_VP9_DECODER) += arm/vp9itxfm_neon.o \
arm/vp9lpf_neon.o \
arm/vp9mc_16bpp_neon.o \
arm/vp9mc_neon.o
/*
* Copyright (c) 2017 Google Inc.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_ARM_VP9DSP_INIT_H
#define AVCODEC_ARM_VP9DSP_INIT_H
#include "libavcodec/vp9dsp.h"
void ff_vp9dsp_init_10bpp_arm(VP9DSPContext *dsp);
void ff_vp9dsp_init_12bpp_arm(VP9DSPContext *dsp);
#endif /* AVCODEC_ARM_VP9DSP_INIT_H */
/*
* Copyright (c) 2017 Google Inc.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#define BPP 10
#define INIT_FUNC ff_vp9dsp_init_10bpp_arm
#include "vp9dsp_init_16bpp_arm_template.c"
/*
* Copyright (c) 2017 Google Inc.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#define BPP 12
#define INIT_FUNC ff_vp9dsp_init_12bpp_arm
#include "vp9dsp_init_16bpp_arm_template.c"
/*
* Copyright (c) 2017 Google Inc.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/arm/cpu.h"
#include "vp9dsp_init.h"
#define declare_fpel(type, sz, suffix) \
void ff_vp9_##type##sz##suffix##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
const uint8_t *src, ptrdiff_t src_stride, \
int h, int mx, int my)
#define decl_mc_func(op, filter, dir, sz, bpp) \
void ff_vp9_##op##_##filter##sz##_##dir##_##bpp##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
const uint8_t *src, ptrdiff_t src_stride, \
int h, int mx, int my)
#define define_8tap_2d_fn(op, filter, sz, bpp) \
static void op##_##filter##sz##_hv_##bpp##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
const uint8_t *src, \
ptrdiff_t src_stride, \
int h, int mx, int my) \
{ \
LOCAL_ALIGNED_16(uint8_t, temp, [((1 + (sz < 64)) * sz + 8) * sz * 2]); \
/* We only need h + 7 lines, but the horizontal filter assumes an \
* even number of rows, so filter h + 8 lines here. */ \
ff_vp9_put_##filter##sz##_h_##bpp##_neon(temp, 2 * sz, \
src - 3 * src_stride, src_stride, \
h + 8, mx, 0); \
ff_vp9_##op##_##filter##sz##_v_##bpp##_neon(dst, dst_stride, \
temp + 3 * 2 * sz, 2 * sz, \
h, 0, my); \
}
#define decl_filter_funcs(op, dir, sz, bpp) \
decl_mc_func(op, regular, dir, sz, bpp); \
decl_mc_func(op, sharp, dir, sz, bpp); \
decl_mc_func(op, smooth, dir, sz, bpp)
#define decl_mc_funcs(sz, bpp) \
decl_filter_funcs(put, h, sz, bpp); \
decl_filter_funcs(avg, h, sz, bpp); \
decl_filter_funcs(put, v, sz, bpp); \
decl_filter_funcs(avg, v, sz, bpp); \
decl_filter_funcs(put, hv, sz, bpp); \
decl_filter_funcs(avg, hv, sz, bpp)
declare_fpel(copy, 128, );
declare_fpel(copy, 64, );
declare_fpel(copy, 32, );
declare_fpel(copy, 16, );
declare_fpel(copy, 8, );
declare_fpel(avg, 64, _16);
declare_fpel(avg, 32, _16);
declare_fpel(avg, 16, _16);
declare_fpel(avg, 8, _16);
declare_fpel(avg, 4, _16);
decl_mc_funcs(64, BPP);
decl_mc_funcs(32, BPP);
decl_mc_funcs(16, BPP);
decl_mc_funcs(8, BPP);
decl_mc_funcs(4, BPP);
#define define_8tap_2d_funcs(sz, bpp) \
define_8tap_2d_fn(put, regular, sz, bpp) \
define_8tap_2d_fn(put, sharp, sz, bpp) \
define_8tap_2d_fn(put, smooth, sz, bpp) \
define_8tap_2d_fn(avg, regular, sz, bpp) \
define_8tap_2d_fn(avg, sharp, sz, bpp) \
define_8tap_2d_fn(avg, smooth, sz, bpp)
define_8tap_2d_funcs(64, BPP)
define_8tap_2d_funcs(32, BPP)
define_8tap_2d_funcs(16, BPP)
define_8tap_2d_funcs(8, BPP)
define_8tap_2d_funcs(4, BPP)
static av_cold void vp9dsp_mc_init_arm(VP9DSPContext *dsp)
{
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags)) {
#define init_fpel(idx1, idx2, sz, type, suffix) \
dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = \
dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_vp9_##type##sz##suffix##_neon
#define init_copy_avg(idx, sz1, sz2) \
init_fpel(idx, 0, sz2, copy, ); \
init_fpel(idx, 1, sz1, avg, _16)
#define init_mc_func(idx1, idx2, op, filter, fname, dir, mx, my, sz, pfx, bpp) \
dsp->mc[idx1][filter][idx2][mx][my] = pfx##op##_##fname##sz##_##dir##_##bpp##_neon
#define init_mc_funcs(idx, dir, mx, my, sz, pfx, bpp) \
init_mc_func(idx, 0, put, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx, bpp); \
init_mc_func(idx, 0, put, FILTER_8TAP_SHARP, sharp, dir, mx, my, sz, pfx, bpp); \
init_mc_func(idx, 0, put, FILTER_8TAP_SMOOTH, smooth, dir, mx, my, sz, pfx, bpp); \
init_mc_func(idx, 1, avg, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx, bpp); \
init_mc_func(idx, 1, avg, FILTER_8TAP_SHARP, sharp, dir, mx, my, sz, pfx, bpp); \
init_mc_func(idx, 1, avg, FILTER_8TAP_SMOOTH, smooth, dir, mx, my, sz, pfx, bpp)
#define init_mc_funcs_dirs(idx, sz, bpp) \
init_mc_funcs(idx, h, 1, 0, sz, ff_vp9_, bpp); \
init_mc_funcs(idx, v, 0, 1, sz, ff_vp9_, bpp); \
init_mc_funcs(idx, hv, 1, 1, sz, , bpp)
init_copy_avg(0, 64, 128);
init_copy_avg(1, 32, 64);
init_copy_avg(2, 16, 32);
init_copy_avg(3, 8, 16);
init_copy_avg(4, 4, 8);
init_mc_funcs_dirs(0, 64, BPP);
init_mc_funcs_dirs(1, 32, BPP);
init_mc_funcs_dirs(2, 16, BPP);
init_mc_funcs_dirs(3, 8, BPP);
init_mc_funcs_dirs(4, 4, BPP);
}
}
av_cold void INIT_FUNC(VP9DSPContext *dsp)
{
vp9dsp_mc_init_arm(dsp);
}
......@@ -23,6 +23,7 @@
#include "libavutil/attributes.h"
#include "libavutil/arm/cpu.h"
#include "libavcodec/vp9dsp.h"
#include "vp9dsp_init.h"
#define declare_fpel(type, sz) \
void ff_vp9_##type##sz##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
......@@ -240,7 +241,13 @@ static av_cold void vp9dsp_loopfilter_init_arm(VP9DSPContext *dsp)
av_cold void ff_vp9dsp_init_arm(VP9DSPContext *dsp, int bpp)
{
if (bpp != 8)
if (bpp == 10) {
ff_vp9dsp_init_10bpp_arm(dsp);
return;
} else if (bpp == 12) {
ff_vp9dsp_init_12bpp_arm(dsp);
return;
} else if (bpp != 8)
return;
vp9dsp_mc_init_arm(dsp);
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment