Commit d3f5b947 authored by Janne Grunau

aarch64: opus NEON iMDCT and FFT

Opus celt decoding 11% faster and the iMDCT over 2.5 times faster on
Apple's A7.
parent 7c5ca546
...@@ -7,6 +7,7 @@ OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_init.o ...@@ -7,6 +7,7 @@ OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_init.o
OBJS-$(CONFIG_NEON_CLOBBER_TEST) += aarch64/neontest.o OBJS-$(CONFIG_NEON_CLOBBER_TEST) += aarch64/neontest.o
OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp_init.o OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp_init.o
OBJS-$(CONFIG_OPUS_DECODER) += aarch64/opus_imdct_init.o
OBJS-$(CONFIG_RV40_DECODER) += aarch64/rv40dsp_init_aarch64.o OBJS-$(CONFIG_RV40_DECODER) += aarch64/rv40dsp_init_aarch64.o
OBJS-$(CONFIG_VC1_DECODER) += aarch64/vc1dsp_init_aarch64.o OBJS-$(CONFIG_VC1_DECODER) += aarch64/vc1dsp_init_aarch64.o
OBJS-$(CONFIG_VORBIS_DECODER) += aarch64/vorbisdsp_init.o OBJS-$(CONFIG_VORBIS_DECODER) += aarch64/vorbisdsp_init.o
...@@ -23,4 +24,5 @@ NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o ...@@ -23,4 +24,5 @@ NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o
NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o
NEON-OBJS-$(CONFIG_MDCT) += aarch64/mdct_neon.o NEON-OBJS-$(CONFIG_MDCT) += aarch64/mdct_neon.o
NEON-OBJS-$(CONFIG_OPUS_DECODER) += aarch64/opus_imdct_neon.o
NEON-OBJS-$(CONFIG_VORBIS_DECODER) += aarch64/vorbisdsp_neon.o NEON-OBJS-$(CONFIG_VORBIS_DECODER) += aarch64/vorbisdsp_neon.o
/*
* This file is part of Libav.
*
* Libav is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* Libav is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with Libav; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_AARCH64_ASM_OFFSETS_H
#define AVCODEC_AARCH64_ASM_OFFSETS_H
/*
 * Byte offsets of CeltIMDCTContext members, expressed as plain constants.
 *
 * Each constant is paired with its struct member by an AV_CHECK_OFFSET()
 * line in opus_imdct_init.c, so the values here have to follow any change
 * to the member order or types of CeltIMDCTContext.
 *
 * The values imply 8-byte pointers: tmp sits at 0x10 (three ints plus
 * padding) and twiddle_exptab 8 bytes after it.
 */
/* exptab[] array of FFTComplex pointers */
#define CELT_EXPTAB 0x20
/* int fft_n, first member */
#define CELT_FFT_N 0x00
/* int len2 */
#define CELT_LEN2 0x04
/* int len4 (= 0x08); written relative to CELT_LEN2 because the two are */
/* adjacent and meant to be fetched together */
#define CELT_LEN4 (CELT_LEN2 + 0x4) // loaded as pair
/* FFTComplex *tmp */
#define CELT_TMP 0x10
/* FFTComplex *twiddle_exptab (= 0x18); written relative to CELT_TMP because */
/* the two pointers are adjacent and meant to be fetched together */
#define CELT_TWIDDLE (CELT_TMP + 0x8) // loaded as pair
#endif /* AVCODEC_AARCH64_ASM_OFFSETS_H */
/*
* This file is part of Libav.
*
* Libav is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* Libav is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with Libav; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stddef.h>
#include "libavutil/cpu.h"
#include "libavutil/aarch64/cpu.h"
#include "libavutil/internal.h"
#include "libavcodec/opus_imdct.h"
#include "asm-offsets.h"
/*
 * Tie every CELT_* constant from asm-offsets.h to the CeltIMDCTContext member
 * it is supposed to describe.
 * NOTE(review): AV_CHECK_OFFSET is defined outside this file; presumably it
 * is a compile-time assertion that offsetof(type, member) equals the given
 * constant -- confirm against libavutil/internal.h.
 */
AV_CHECK_OFFSET(CeltIMDCTContext, exptab, CELT_EXPTAB);
AV_CHECK_OFFSET(CeltIMDCTContext, fft_n, CELT_FFT_N);
AV_CHECK_OFFSET(CeltIMDCTContext, len2, CELT_LEN2);
AV_CHECK_OFFSET(CeltIMDCTContext, len4, CELT_LEN4);
AV_CHECK_OFFSET(CeltIMDCTContext, tmp, CELT_TMP);
AV_CHECK_OFFSET(CeltIMDCTContext, twiddle_exptab, CELT_TWIDDLE);
/*
 * NEON routine with the same signature as CeltIMDCTContext.imdct_half.
 * It is only declared here; the definition lives outside this file
 * (presumably in the opus_imdct_neon assembly object -- confirm).
 */
void ff_celt_imdct_half_neon(CeltIMDCTContext *s, float *dst, const float *src,
ptrdiff_t stride, float scale);
/**
 * Install the AArch64-specific iMDCT routine into a context.
 *
 * Reads the runtime CPU flags and, when they report NEON, points
 * s->imdct_half at ff_celt_imdct_half_neon. If NEON is not reported the
 * context is left exactly as the caller passed it in.
 *
 * @param s context whose imdct_half pointer may be overridden
 */
void ff_celt_imdct_init_aarch64(CeltIMDCTContext *s)
{
    const int flags = av_get_cpu_flags();

    /* Nothing to install without NEON; keep whatever is already set. */
    if (!have_neon(flags))
        return;

    s->imdct_half = ff_celt_imdct_half_neon;
}
This diff is collapsed.
...@@ -92,8 +92,6 @@ typedef struct OpusRangeCoder { ...@@ -92,8 +92,6 @@ typedef struct OpusRangeCoder {
typedef struct SilkContext SilkContext; typedef struct SilkContext SilkContext;
typedef struct CeltIMDCTContext CeltIMDCTContext;
typedef struct CeltContext CeltContext; typedef struct CeltContext CeltContext;
typedef struct OpusPacket { typedef struct OpusPacket {
...@@ -398,22 +396,6 @@ int ff_silk_decode_superframe(SilkContext *s, OpusRangeCoder *rc, ...@@ -398,22 +396,6 @@ int ff_silk_decode_superframe(SilkContext *s, OpusRangeCoder *rc,
enum OpusBandwidth bandwidth, int coded_channels, enum OpusBandwidth bandwidth, int coded_channels,
int duration_ms); int duration_ms);
/**
* Init an iMDCT of the length 2 * 15 * (2^N)
*/
int ff_celt_imdct_init(CeltIMDCTContext **s, int N);
/**
* Free an iMDCT.
*/
void ff_celt_imdct_uninit(CeltIMDCTContext **s);
/**
* Calculate the middle half of the iMDCT
*/
void ff_celt_imdct_half(CeltIMDCTContext *s, float *dst, const float *src,
int src_stride, float scale);
int ff_celt_init(AVCodecContext *avctx, CeltContext **s, int output_channels); int ff_celt_init(AVCodecContext *avctx, CeltContext **s, int output_channels);
void ff_celt_free(CeltContext **s); void ff_celt_free(CeltContext **s);
......
...@@ -29,6 +29,7 @@ ...@@ -29,6 +29,7 @@
#include "libavutil/float_dsp.h" #include "libavutil/float_dsp.h"
#include "opus.h" #include "opus.h"
#include "opus_imdct.h"
enum CeltSpread { enum CeltSpread {
CELT_SPREAD_NONE, CELT_SPREAD_NONE,
...@@ -2095,8 +2096,8 @@ int ff_celt_decode_frame(CeltContext *s, OpusRangeCoder *rc, ...@@ -2095,8 +2096,8 @@ int ff_celt_decode_frame(CeltContext *s, OpusRangeCoder *rc,
for (j = 0; j < s->blocks; j++) { for (j = 0; j < s->blocks; j++) {
float *dst = frame->buf + 1024 + j * s->blocksize; float *dst = frame->buf + 1024 + j * s->blocksize;
ff_celt_imdct_half(imdct, dst + CELT_OVERLAP / 2, s->coeffs[i] + j, imdct->imdct_half(imdct, dst + CELT_OVERLAP / 2, s->coeffs[i] + j,
s->blocks, imdct_scale); s->blocks, imdct_scale);
s->dsp.vector_fmul_window(dst, dst, dst + CELT_OVERLAP / 2, s->dsp.vector_fmul_window(dst, dst, dst + CELT_OVERLAP / 2,
celt_window, CELT_OVERLAP / 2); celt_window, CELT_OVERLAP / 2);
} }
......
...@@ -25,12 +25,19 @@ ...@@ -25,12 +25,19 @@
#include <float.h> #include <float.h>
#include <math.h> #include <math.h>
#include <stddef.h>
#include "config.h"
#include "libavutil/attributes.h" #include "libavutil/attributes.h"
#include "libavutil/common.h" #include "libavutil/common.h"
#include "fft.h" #include "avfft.h"
#include "opus.h" #include "opus.h"
#include "opus_imdct.h"
// minimal iMDCT size to make SIMD opts easier
#define CELT_MIN_IMDCT_SIZE 120
// complex c = a * b // complex c = a * b
#define CMUL3(cre, cim, are, aim, bre, bim) \ #define CMUL3(cre, cim, are, aim, bre, bim) \
...@@ -59,18 +66,6 @@ do { \ ...@@ -59,18 +66,6 @@ do { \
(d).im = -ri + ir; \ (d).im = -ri + ir; \
} while (0) } while (0)
/*
 * State of one CELT iMDCT instance.
 * NOTE(review): the meaning of the individual members is not spelled out in
 * the visible code; the notes below are limited to what the surrounding
 * functions show.
 */
struct CeltIMDCTContext {
/* NOTE(review): FFT size parameter; exact definition not visible -- confirm */
int fft_n;
/* NOTE(review): presumably half the transform length -- confirm */
int len2;
/* celt_imdct_half() derives its len8 from len4 / 2 */
int len4;
/* NOTE(review): presumably scratch storage for the FFT -- confirm */
FFTComplex *tmp;
/* NOTE(review): presumably pre/post-rotation twiddle factors -- confirm */
FFTComplex *twiddle_exptab;
/* exponent tables: fft15() reads exptab[0], fft_calc() reads exptab[N] */
FFTComplex *exptab[6];
};
av_cold void ff_celt_imdct_uninit(CeltIMDCTContext **ps) av_cold void ff_celt_imdct_uninit(CeltIMDCTContext **ps)
{ {
CeltIMDCTContext *s = *ps; CeltIMDCTContext *s = *ps;
...@@ -89,6 +84,9 @@ av_cold void ff_celt_imdct_uninit(CeltIMDCTContext **ps) ...@@ -89,6 +84,9 @@ av_cold void ff_celt_imdct_uninit(CeltIMDCTContext **ps)
av_freep(ps); av_freep(ps);
} }
static void celt_imdct_half(CeltIMDCTContext *s, float *dst, const float *src,
ptrdiff_t stride, float scale);
av_cold int ff_celt_imdct_init(CeltIMDCTContext **ps, int N) av_cold int ff_celt_imdct_init(CeltIMDCTContext **ps, int N)
{ {
CeltIMDCTContext *s; CeltIMDCTContext *s;
...@@ -96,7 +94,7 @@ av_cold int ff_celt_imdct_init(CeltIMDCTContext **ps, int N) ...@@ -96,7 +94,7 @@ av_cold int ff_celt_imdct_init(CeltIMDCTContext **ps, int N)
int len = 2 * len2; int len = 2 * len2;
int i, j; int i, j;
if (len2 > CELT_MAX_FRAME_SIZE) if (len2 > CELT_MAX_FRAME_SIZE || len2 < CELT_MIN_IMDCT_SIZE)
return AVERROR(EINVAL); return AVERROR(EINVAL);
s = av_mallocz(sizeof(*s)); s = av_mallocz(sizeof(*s));
...@@ -136,6 +134,11 @@ av_cold int ff_celt_imdct_init(CeltIMDCTContext **ps, int N) ...@@ -136,6 +134,11 @@ av_cold int ff_celt_imdct_init(CeltIMDCTContext **ps, int N)
for (j = 15; j < 19; j++) for (j = 15; j < 19; j++)
s->exptab[0][j] = s->exptab[0][j - 15]; s->exptab[0][j] = s->exptab[0][j - 15];
s->imdct_half = celt_imdct_half;
if (ARCH_AARCH64)
ff_celt_imdct_init_aarch64(s);
*ps = s; *ps = s;
return 0; return 0;
...@@ -144,7 +147,7 @@ fail: ...@@ -144,7 +147,7 @@ fail:
return AVERROR(ENOMEM); return AVERROR(ENOMEM);
} }
static void fft5(FFTComplex *out, const FFTComplex *in, int stride) static void fft5(FFTComplex *out, const FFTComplex *in, ptrdiff_t stride)
{ {
// [0] = exp(2 * i * pi / 5), [1] = exp(2 * i * pi * 2 / 5) // [0] = exp(2 * i * pi / 5), [1] = exp(2 * i * pi * 2 / 5)
static const FFTComplex fact[] = { { 0.30901699437494745, 0.95105651629515353 }, static const FFTComplex fact[] = { { 0.30901699437494745, 0.95105651629515353 },
...@@ -177,7 +180,7 @@ static void fft5(FFTComplex *out, const FFTComplex *in, int stride) ...@@ -177,7 +180,7 @@ static void fft5(FFTComplex *out, const FFTComplex *in, int stride)
out[4].im = in[0].im + z[0][3].im + z[1][2].im + z[2][1].im + z[3][0].im; out[4].im = in[0].im + z[0][3].im + z[1][2].im + z[2][1].im + z[3][0].im;
} }
static void fft15(CeltIMDCTContext *s, FFTComplex *out, const FFTComplex *in, int stride) static void fft15(CeltIMDCTContext *s, FFTComplex *out, const FFTComplex *in, ptrdiff_t stride)
{ {
const FFTComplex *exptab = s->exptab[0]; const FFTComplex *exptab = s->exptab[0];
FFTComplex tmp[5]; FFTComplex tmp[5];
...@@ -212,7 +215,8 @@ static void fft15(CeltIMDCTContext *s, FFTComplex *out, const FFTComplex *in, in ...@@ -212,7 +215,8 @@ static void fft15(CeltIMDCTContext *s, FFTComplex *out, const FFTComplex *in, in
/* /*
* FFT of the length 15 * (2^N) * FFT of the length 15 * (2^N)
*/ */
static void fft_calc(CeltIMDCTContext *s, FFTComplex *out, const FFTComplex *in, int N, int stride) static void fft_calc(CeltIMDCTContext *s, FFTComplex *out, const FFTComplex *in,
int N, ptrdiff_t stride)
{ {
if (N) { if (N) {
const FFTComplex *exptab = s->exptab[N]; const FFTComplex *exptab = s->exptab[N];
...@@ -237,8 +241,8 @@ static void fft_calc(CeltIMDCTContext *s, FFTComplex *out, const FFTComplex *in, ...@@ -237,8 +241,8 @@ static void fft_calc(CeltIMDCTContext *s, FFTComplex *out, const FFTComplex *in,
fft15(s, out, in, stride); fft15(s, out, in, stride);
} }
void ff_celt_imdct_half(CeltIMDCTContext *s, float *dst, const float *src, static void celt_imdct_half(CeltIMDCTContext *s, float *dst, const float *src,
int stride, float scale) ptrdiff_t stride, float scale)
{ {
FFTComplex *z = (FFTComplex *)dst; FFTComplex *z = (FFTComplex *)dst;
const int len8 = s->len4 / 2; const int len8 = s->len4 / 2;
......
/*
* This file is part of Libav.
*
* Libav is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* Libav is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with Libav; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_OPUS_IMDCT_H
#define AVCODEC_OPUS_IMDCT_H
#include <stddef.h>
#include "avfft.h"
/**
 * State of one CELT iMDCT instance, created by ff_celt_imdct_init().
 *
 * The position of the members up to and including exptab is mirrored by the
 * CELT_* constants in aarch64/asm-offsets.h and checked with
 * AV_CHECK_OFFSET() in aarch64/opus_imdct_init.c. Reordering or retyping
 * those members requires updating the constants as well.
 */
typedef struct CeltIMDCTContext {
/* NOTE(review): FFT size parameter; exact definition not visible -- confirm */
int fft_n;
/* NOTE(review): presumably half the transform length -- confirm */
int len2;
/* celt_imdct_half() derives its len8 from len4 / 2 */
int len4;
/* NOTE(review): presumably scratch storage for the FFT -- confirm */
FFTComplex *tmp;
/* NOTE(review): presumably pre/post-rotation twiddle factors -- confirm */
FFTComplex *twiddle_exptab;
/* exponent tables: fft15() reads exptab[0], fft_calc() reads exptab[N] */
FFTComplex *exptab[6];
/**
 * Calculate the middle half of the iMDCT.
 *
 * ff_celt_imdct_init() sets this to the C implementation and then, on
 * AArch64 builds, lets ff_celt_imdct_init_aarch64() replace it.
 *
 * @param s          this context
 * @param dst        output samples
 * @param src        input coefficients
 * @param src_stride step between consecutive input coefficients
 *                   (the CELT decoder passes its block count)
 * @param scale      scale factor supplied by the caller
 */
void (*imdct_half)(struct CeltIMDCTContext *s, float *dst, const float *src,
ptrdiff_t src_stride, float scale);
} CeltIMDCTContext;
/**
 * Init an iMDCT of the length 2 * 15 * (2^N)
 *
 * @param s receives the newly allocated context on success
 * @param N size exponent of the transform
 * @return 0 on success, AVERROR(EINVAL) if the size derived from N is
 *         outside the supported range, AVERROR(ENOMEM) if an allocation
 *         fails
 */
int ff_celt_imdct_init(CeltIMDCTContext **s, int N);
/**
 * Free an iMDCT.
 *
 * @param s address of the context pointer; the pointer itself is released
 *          with av_freep()
 */
void ff_celt_imdct_uninit(CeltIMDCTContext **s);
/**
 * AArch64 hook called from ff_celt_imdct_init(): replaces s->imdct_half with
 * the NEON implementation when the CPU flags report NEON.
 */
void ff_celt_imdct_init_aarch64(CeltIMDCTContext *s);
#endif /* AVCODEC_OPUS_IMDCT_H */
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment