Commit 2ed67eba authored by Martin Storsjö's avatar Martin Storsjö

arm: Add NEON optimizations for 10 and 12 bit vp9 itxfm

This work is sponsored by, and copyright, Google.

This is structured similarly to the 8 bit version. In the 8 bit
version, the coefficients are 16 bits, and intermediates are 32 bits.

Here, the coefficients are 32 bit. For the 4x4 transforms for 10 bit
content, the intermediates also fit in 32 bits, but for all other
transforms (4x4 for 12 bit content, and 8x8 and larger for both 10
and 12 bit) the intermediates are 64 bit.

For the existing 8 bit case, the 8x8 transform fit all coefficients in
registers; for 10/12 bit, when the coefficients are 32 bit, the 8x8
transform also has to be done in slices of 4 pixels (just as 16x16 and
32x32 for 8 bit).

The slice width also shrinks from 4 elements to 2 elements in parallel
for the 16x16 and 32x32 cases.

The 16 bit coefficients from idct_coeffs and similar tables also need
to be lenghtened to 32 bit in order to be used in multiplication with
vectors with 32 bit elements. This leads to the fixed coefficient
vectors needing more space, leading to more cases where they have to
be reloaded within the transform (in iadst16).

This technically would need testing in checkasm for subpartitions
in increments of 2, but that slows down normal checkasm runs
excessively.

Examples of relative speedup compared to the C version, from checkasm:
                                     Cortex    A7     A8     A9    A53
vp9_inv_adst_adst_4x4_sub4_add_10_neon:      4.83  11.36   5.22   6.77
vp9_inv_adst_adst_8x8_sub8_add_10_neon:      4.12   7.60   4.06   4.84
vp9_inv_adst_adst_16x16_sub16_add_10_neon:   3.93   8.16   4.52   5.35
vp9_inv_dct_dct_4x4_sub1_add_10_neon:        1.36   2.57   1.41   1.61
vp9_inv_dct_dct_4x4_sub4_add_10_neon:        4.24   8.66   5.06   5.81
vp9_inv_dct_dct_8x8_sub1_add_10_neon:        2.63   4.18   1.68   2.87
vp9_inv_dct_dct_8x8_sub4_add_10_neon:        4.52   9.47   4.24   5.39
vp9_inv_dct_dct_8x8_sub8_add_10_neon:        3.45   7.34   3.45   4.30
vp9_inv_dct_dct_16x16_sub1_add_10_neon:      3.56   6.21   2.47   4.32
vp9_inv_dct_dct_16x16_sub2_add_10_neon:      5.68  12.73   5.28   7.07
vp9_inv_dct_dct_16x16_sub8_add_10_neon:      4.42   9.28   4.24   5.45
vp9_inv_dct_dct_16x16_sub16_add_10_neon:     3.41   7.29   3.35   4.19
vp9_inv_dct_dct_32x32_sub1_add_10_neon:      4.52   8.35   3.83   6.40
vp9_inv_dct_dct_32x32_sub2_add_10_neon:      5.86  13.19   6.14   7.04
vp9_inv_dct_dct_32x32_sub16_add_10_neon:     4.29   8.11   4.59   5.06
vp9_inv_dct_dct_32x32_sub32_add_10_neon:     3.31   5.70   3.56   3.84
vp9_inv_wht_wht_4x4_sub4_add_10_neon:        1.89   2.80   1.82   1.97

The speedup compared to the C functions is around 1.3 to 7x for the
full transforms, even higher for the smaller subpartitions.
Signed-off-by: 's avatarMartin Storsjö <martin@martin.st>
parent a4d4bad7
......@@ -142,7 +142,8 @@ NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \
arm/rv40dsp_neon.o
NEON-OBJS-$(CONFIG_VORBIS_DECODER) += arm/vorbisdsp_neon.o
NEON-OBJS-$(CONFIG_VP6_DECODER) += arm/vp6dsp_neon.o
NEON-OBJS-$(CONFIG_VP9_DECODER) += arm/vp9itxfm_neon.o \
NEON-OBJS-$(CONFIG_VP9_DECODER) += arm/vp9itxfm_16bpp_neon.o \
arm/vp9itxfm_neon.o \
arm/vp9lpf_neon.o \
arm/vp9mc_16bpp_neon.o \
arm/vp9mc_neon.o
......@@ -141,7 +141,54 @@ static av_cold void vp9dsp_mc_init_arm(VP9DSPContext *dsp)
}
}
#define define_itxfm2(type_a, type_b, sz, bpp) \
void ff_vp9_##type_a##_##type_b##_##sz##x##sz##_add_##bpp##_neon(uint8_t *_dst, \
ptrdiff_t stride, \
int16_t *_block, int eob)
#define define_itxfm(type_a, type_b, sz, bpp) define_itxfm2(type_a, type_b, sz, bpp)
#define define_itxfm_funcs(sz, bpp) \
define_itxfm(idct, idct, sz, bpp); \
define_itxfm(iadst, idct, sz, bpp); \
define_itxfm(idct, iadst, sz, bpp); \
define_itxfm(iadst, iadst, sz, bpp)
define_itxfm_funcs(4, BPP);
define_itxfm_funcs(8, BPP);
define_itxfm_funcs(16, BPP);
define_itxfm(idct, idct, 32, BPP);
define_itxfm(iwht, iwht, 4, BPP);
static av_cold void vp9dsp_itxfm_init_arm(VP9DSPContext *dsp)
{
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags)) {
#define init_itxfm2(tx, sz, bpp) \
dsp->itxfm_add[tx][DCT_DCT] = ff_vp9_idct_idct_##sz##_add_##bpp##_neon; \
dsp->itxfm_add[tx][DCT_ADST] = ff_vp9_iadst_idct_##sz##_add_##bpp##_neon; \
dsp->itxfm_add[tx][ADST_DCT] = ff_vp9_idct_iadst_##sz##_add_##bpp##_neon; \
dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_iadst_iadst_##sz##_add_##bpp##_neon
#define init_itxfm(tx, sz, bpp) init_itxfm2(tx, sz, bpp)
#define init_idct2(tx, nm, bpp) \
dsp->itxfm_add[tx][DCT_DCT] = \
dsp->itxfm_add[tx][ADST_DCT] = \
dsp->itxfm_add[tx][DCT_ADST] = \
dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_##nm##_add_##bpp##_neon
#define init_idct(tx, nm, bpp) init_idct2(tx, nm, bpp)
init_itxfm(TX_4X4, 4x4, BPP);
init_itxfm(TX_8X8, 8x8, BPP);
init_itxfm(TX_16X16, 16x16, BPP);
init_idct(TX_32X32, idct_idct_32x32, BPP);
init_idct(4, iwht_iwht_4x4, BPP);
}
}
av_cold void INIT_FUNC(VP9DSPContext *dsp)
{
vp9dsp_mc_init_arm(dsp);
vp9dsp_itxfm_init_arm(dsp);
}
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment