aarch64: NEON float FFT

Approximately as fast as the ARM NEON version on Apple's A7.

aarch64: NEON float FFT
Approximately as fast as the ARM NEON version on Apple's A7.
650c4300 · Janne Grunau · f9157463 · 650c4300 · 650c4300 · 650c4300
Commit 650c4300 authored Mar 26, 2014 by Janne Grunau
5 changed files
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
+OBJS-$(CONFIG_FFT)                      += aarch64/fft_init_aarch64.o
 OBJS-$(CONFIG_H264CHROMA)               += aarch64/h264chroma_init_aarch64.o
 OBJS-$(CONFIG_H264DSP)                  += aarch64/h264dsp_init_aarch64.o
 OBJS-$(CONFIG_H264QPEL)                 += aarch64/h264qpel_init_aarch64.o
@@ -10,6 +11,7 @@ OBJS-$(CONFIG_VC1_DECODER)              += aarch64/vc1dsp_init_aarch64.o

 ARMV8-OBJS-$(CONFIG_VIDEODSP)           += aarch64/videodsp.o

+NEON-OBJS-$(CONFIG_FFT)                 += aarch64/fft_neon.o
 NEON-OBJS-$(CONFIG_H264CHROMA)          += aarch64/h264cmc_neon.o
 NEON-OBJS-$(CONFIG_H264DSP)             += aarch64/h264dsp_neon.o              \
                                           aarch64/h264idct_neon.o

--- a/libavcodec/aarch64/fft_init_aarch64.c
+++ b/libavcodec/aarch64/fft_init_aarch64.c
+/*
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/fft.h"
+
+void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
+void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
+
+av_cold void ff_fft_init_aarch64(FFTContext *s)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags)) {
+        s->fft_permute  = ff_fft_permute_neon;
+        s->fft_calc     = ff_fft_calc_neon;
+    }
+}
--- a/libavcodec/aarch64/fft_neon.S
+++ b/libavcodec/aarch64/fft_neon.S
+/*
+ * ARM NEON optimised FFT
+ *
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ * Copyright (c) 2009 Naotoshi Nojiri
+ * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
+ *
+ * This algorithm (though not any of the implementation details) is
+ * based on libdjbfft by D. J. Bernstein.
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+#define M_SQRT1_2 0.70710678118654752440
+
+.macro transpose d0, d1, s0, s1
+        trn1            \d0, \s0, \s1
+        trn2            \d1, \s0, \s1
+.endm
+
+
+function fft4_neon
+        ld1             {v0.2s,v1.2s,v2.2s,v3.2s}, [x0]
+
+        fadd            v4.2s,  v0.2s,  v1.2s   // r0+r1,i0+i1
+        fsub            v6.2s,  v0.2s,  v1.2s   // r0-r1,i0-i1
+
+        ext             v16.8b, v2.8b,  v3.8b,  #4
+        ext             v17.8b, v3.8b,  v2.8b,  #4
+
+        fadd            v5.2s,  v2.2s,  v3.2s   // i2+i3,r2+r3
+        fsub            v7.2s,  v16.2s, v17.2s  // r3-r2,i2-i3
+
+        fadd            v0.2s,  v4.2s,  v5.2s
+        fsub            v2.2s,  v4.2s,  v5.2s
+        fadd            v1.2s,  v6.2s,  v7.2s
+        fsub            v3.2s,  v6.2s,  v7.2s
+
+        st1             {v0.2s,v1.2s,v2.2s,v3.2s}, [x0]
+
+        ret
+endfunc
+
+function fft8_neon
+        mov             x1,  x0
+        ld1             {v0.2s, v1.2s, v2.2s, v3.2s},  [x0], #32
+        ld1             {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]
+        ext             v22.8b, v2.8b,  v3.8b,  #4
+        ext             v23.8b, v3.8b,  v2.8b,  #4
+        fadd            v4.2s,  v16.2s, v17.2s           // r4+r5,i4+i5
+        fadd            v5.2s,  v18.2s, v19.2s           // r6+r7,i6+i7
+        fsub            v17.2s, v16.2s, v17.2s           // r4-r5,i4-i5
+        fsub            v19.2s, v18.2s, v19.2s           // r6-r7,i6-i7
+        rev64           v27.2s, v28.2s  // ???
+        fadd            v20.2s, v0.2s,  v1.2s            // r0+r1,i0+i1
+        fadd            v21.2s, v2.2s,  v3.2s            // r2+r3,i2+i3
+        fmul            v26.2s, v17.2s, v28.2s           // -a2r*w,a2i*w
+        ext             v6.8b,  v4.8b,  v5.8b,  #4
+        ext             v7.8b,  v5.8b,  v4.8b,  #4
+        fmul            v27.2s, v19.2s, v27.2s           // a3r*w,-a3i*w
+        fsub            v23.2s, v22.2s, v23.2s           // i2-i3,r3-r2
+        fsub            v22.2s, v0.2s,  v1.2s            // r0-r1,i0-i1
+        fmul            v24.2s, v17.2s, v28.s[1]         // a2r*w,a2i*w
+        fmul            v25.2s, v19.2s, v28.s[1]         // a3r*w,a3i*w
+        fadd            v0.2s,  v20.2s, v21.2s
+        fsub            v2.2s,  v20.2s, v21.2s
+        fadd            v1.2s,  v22.2s, v23.2s
+        rev64           v26.2s, v26.2s
+        rev64           v27.2s, v27.2s
+        fsub            v3.2s,  v22.2s, v23.2s
+        fsub            v6.2s,  v6.2s,  v7.2s
+        fadd            v24.2s, v24.2s, v26.2s  // a2r+a2i,a2i-a2r   t1,t2
+        fadd            v25.2s, v25.2s, v27.2s  // a3r-a3i,a3i+a3r   t5,t6
+        fadd            v7.2s,  v4.2s,  v5.2s
+        fsub            v18.2s, v2.2s,  v6.2s
+        ext             v26.8b, v24.8b, v25.8b, #4
+        ext             v27.8b, v25.8b, v24.8b, #4
+        fadd            v2.2s,  v2.2s,  v6.2s
+        fsub            v16.2s, v0.2s,  v7.2s
+        fadd            v5.2s,  v25.2s, v24.2s
+        fsub            v4.2s,  v26.2s, v27.2s
+        fadd            v0.2s,  v0.2s,  v7.2s
+        fsub            v17.2s, v1.2s,  v5.2s
+        fsub            v19.2s, v3.2s,  v4.2s
+        fadd            v3.2s,  v3.2s,  v4.2s
+        fadd            v1.2s,  v1.2s,  v5.2s
+
+        st1             {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]
+        st1             {v0.2s, v1.2s, v2.2s, v3.2s},  [x1]
+
+        ret
+endfunc
+
+function fft16_neon
+        mov             x1,  x0
+        ld1             {v0.2s, v1.2s, v2.2s, v3.2s},  [x0], #32
+        ld1             {v16.2s,v17.2s,v18.2s,v19.2s}, [x0], #32
+        ext             v22.8b, v2.8b,  v3.8b,  #4
+        ext             v23.8b, v3.8b,  v2.8b,  #4
+        fadd            v4.2s,  v16.2s, v17.2s           // r4+r5,i4+i5
+        fadd            v5.2s,  v18.2s, v19.2s           // r6+r7,i6+i7
+        fsub            v17.2s, v16.2s, v17.2s           // r4-r5,i4-i5
+        fsub            v19.2s, v18.2s, v19.2s           // r6-r7,i6-i7
+        rev64           v27.2s, v28.2s  // ???
+        fadd            v20.2s, v0.2s,  v1.2s            // r0+r1,i0+i1
+        fadd            v21.2s, v2.2s,  v3.2s            // r2+r3,i2+i3
+        fmul            v26.2s, v17.2s, v28.2s           // -a2r*w,a2i*w
+        ext             v6.8b,  v4.8b,  v5.8b,  #4
+        ext             v7.8b,  v5.8b,  v4.8b,  #4
+        fmul            v27.2s, v19.2s, v27.2s           // a3r*w,-a3i*w
+        fsub            v23.2s, v22.2s, v23.2s           // i2-i3,r3-r2
+        fsub            v22.2s, v0.2s,  v1.2s            // r0-r1,i0-i1
+        fmul            v24.2s, v17.2s, v28.s[1]         // a2r*w,a2i*w
+        fmul            v25.2s, v19.2s, v28.s[1]         // a3r*w,a3i*w
+        fadd            v0.2s,  v20.2s, v21.2s
+        fsub            v2.2s,  v20.2s, v21.2s
+        fadd            v1.2s,  v22.2s, v23.2s
+        rev64           v26.2s, v26.2s
+        rev64           v27.2s, v27.2s
+        fsub            v3.2s,  v22.2s, v23.2s
+        fsub            v6.2s,  v6.2s,  v7.2s
+        fadd            v24.2s, v24.2s, v26.2s  // a2r+a2i,a2i-a2r   t1,t2
+        fadd            v25.2s, v25.2s, v27.2s  // a3r-a3i,a3i+a3r   t5,t6
+        fadd            v7.2s,  v4.2s,  v5.2s
+        fsub            v18.2s, v2.2s,  v6.2s
+        ld1             {v20.4s,v21.4s}, [x0], #32
+        ld1             {v22.4s,v23.4s}, [x0], #32
+        ext             v26.8b, v24.8b, v25.8b, #4
+        ext             v27.8b, v25.8b, v24.8b, #4
+        fadd            v2.2s,  v2.2s,  v6.2s
+        fsub            v16.2s, v0.2s,  v7.2s
+        fadd            v5.2s,  v25.2s, v24.2s
+        fsub            v4.2s,  v26.2s, v27.2s
+        transpose       v24.2d, v25.2d, v20.2d, v22.2d
+        transpose       v26.2d, v27.2d, v21.2d, v23.2d
+        fadd            v0.2s,  v0.2s,  v7.2s
+        fsub            v17.2s, v1.2s,  v5.2s
+        fsub            v19.2s, v3.2s,  v4.2s
+        fadd            v3.2s,  v3.2s,  v4.2s
+        fadd            v1.2s,  v1.2s,  v5.2s
+        ext             v20.16b, v21.16b, v21.16b,  #4
+        ext             v21.16b, v23.16b, v23.16b,  #4
+
+        zip1            v0.2d,  v0.2d,  v1.2d   // {z[0],   z[1]}
+        zip1            v1.2d,  v2.2d,  v3.2d   // {z[2],   z[3]}
+        zip1            v2.2d,  v16.2d, v17.2d  // {z[o1],  z[o1+1]}
+        zip1            v3.2d,  v18.2d, v19.2d  // {z[o1+2],z[o1+3]}
+
+        // 2 x fft4
+        transpose       v22.2d, v23.2d, v20.2d, v21.2d
+
+        fadd            v4.4s,  v24.4s, v25.4s
+        fadd            v5.4s,  v26.4s, v27.4s
+        fsub            v6.4s,  v24.4s, v25.4s
+        fsub            v7.4s,  v22.4s, v23.4s
+
+        ld1             {v23.4s},  [x14]
+
+        fadd            v24.4s, v4.4s,  v5.4s   // {z[o2+0],z[o2+1]}
+        fsub            v26.4s, v4.4s,  v5.4s   // {z[o2+2],z[o2+3]}
+        fadd            v25.4s, v6.4s,  v7.4s   // {z[o3+0],z[o3+1]}
+        fsub            v27.4s, v6.4s,  v7.4s   // {z[o3+2],z[o3+3]}
+
+        //fft_pass_neon_16
+        rev64           v7.4s,  v25.4s
+        fmul            v25.4s, v25.4s, v23.s[1]
+        fmul            v7.4s,  v7.4s,  v29.4s
+        fmla            v25.4s, v7.4s,  v23.s[3] // {t1a,t2a,t5a,t6a}
+
+        zip1            v20.4s, v24.4s, v25.4s
+        zip2            v21.4s, v24.4s, v25.4s
+        fneg            v22.4s, v20.4s
+        fadd            v4.4s,  v21.4s, v20.4s
+        fsub            v6.4s,  v20.4s, v21.4s  // just the second half
+        fadd            v5.4s,  v21.4s, v22.4s  // just the first half
+
+        tbl             v4.16b, {v4.16b},        v30.16b // trans4_float
+        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
+
+        fsub            v20.4s, v0.4s,  v4.4s   // {z[o2],z[o2+1]}
+        fadd            v16.4s, v0.4s,  v4.4s   // {z[0], z[1]}
+        fsub            v22.4s, v2.4s,  v5.4s   // {z[o3],z[o3+1]}
+        fadd            v18.4s, v2.4s,  v5.4s   // {z[o1],z[o1+1]}
+
+//second half
+        rev64           v6.4s,  v26.4s
+        fmul            v26.4s, v26.4s, v23.s[2]
+        rev64           v7.4s,  v27.4s
+        fmul            v27.4s, v27.4s, v23.s[3]
+        fmul            v6.4s,  v6.4s,  v29.4s
+        fmul            v7.4s,  v7.4s,  v29.4s
+        fmla            v26.4s, v6.4s,  v23.s[2] // {t1,t2,t5,t6}
+        fmla            v27.4s, v7.4s,  v23.s[1] // {t1a,t2a,t5a,t6a}
+
+        zip1            v24.4s, v26.4s, v27.4s
+        zip2            v25.4s, v26.4s, v27.4s
+        fneg            v26.4s, v24.4s
+        fadd            v4.4s,  v25.4s, v24.4s
+        fsub            v6.4s,  v24.4s, v25.4s  // just the second half
+        fadd            v5.4s,  v25.4s, v26.4s  // just the first half
+
+        tbl             v4.16b, {v4.16b},        v30.16b // trans4_float
+        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
+
+        fadd            v17.4s, v1.4s, v4.4s    // {z[2], z[3]}
+        fsub            v21.4s, v1.4s, v4.4s    // {z[o2+2],z[o2+3]}
+        fadd            v19.4s, v3.4s, v5.4s    // {z[o1+2],z[o1+3]}
+        fsub            v23.4s, v3.4s, v5.4s    // {z[o3+2],z[o3+3]}
+
+        st1             {v16.4s,v17.4s}, [x1], #32
+        st1             {v18.4s,v19.4s}, [x1], #32
+        st1             {v20.4s,v21.4s}, [x1], #32
+        st1             {v22.4s,v23.4s}, [x1], #32
+
+        ret
+endfunc
+
+
+const  trans4_float, align=4
+        .byte    0,  1,  2,  3
+        .byte    8,  9, 10, 11
+        .byte    4,  5,  6,  7
+        .byte   12, 13, 14, 15
+endconst
+
+const  trans8_float, align=4
+        .byte   24, 25, 26, 27
+        .byte    0,  1,  2,  3
+        .byte   28, 29, 30, 31
+        .byte    4,  5,  6,  7
+endconst
+
+function fft_pass_neon
+        sub             x6,  x2,  #1            // n - 1, loop counter
+        lsl             x5,  x2,  #3            // 2 * n * sizeof FFTSample
+        lsl             x1,  x2,  #4            // 2 * n * sizeof FFTComplex
+        add             x5,  x4,  x5            // wim
+        add             x3,  x1,  x2,  lsl #5   // 4 * n * sizeof FFTComplex
+        add             x2,  x0,  x2,  lsl #5   // &z[o2]
+        add             x3,  x0,  x3            // &z[o3]
+        add             x1,  x0,  x1            // &z[o1]
+        ld1             {v20.4s},[x2]           // {z[o2],z[o2+1]}
+        ld1             {v22.4s},[x3]           // {z[o3],z[o3+1]}
+        ld1             {v4.2s},  [x4], #8      // {wre[0],wre[1]}
+        trn2            v25.2d, v20.2d, v22.2d
+        sub             x5,  x5,  #4            // wim--
+        trn1            v24.2d, v20.2d, v22.2d
+        ld1             {v5.s}[0],  [x5], x7    // d5[0] = wim[-1]
+        rev64           v7.4s,  v25.4s
+        fmul            v25.4s, v25.4s, v4.s[1]
+        ld1             {v16.4s}, [x0]          // {z[0],z[1]}
+        fmul            v7.4s,  v7.4s,  v29.4s
+        ld1             {v17.4s}, [x1]          // {z[o1],z[o1+1]}
+        prfm            pldl1keep, [x2, #16]
+        prfm            pldl1keep, [x3, #16]
+        fmla            v25.4s, v7.4s,  v5.s[0] // {t1a,t2a,t5a,t6a}
+        prfm            pldl1keep, [x0, #16]
+        prfm            pldl1keep, [x1, #16]
+
+        zip1            v20.4s, v24.4s, v25.4s
+        zip2            v21.4s, v24.4s, v25.4s
+        fneg            v22.4s, v20.4s
+        fadd            v4.4s,  v21.4s, v20.4s
+        fsub            v6.4s,  v20.4s, v21.4s  // just the second half
+        fadd            v5.4s,  v21.4s, v22.4s  // just the first half
+
+        tbl             v4.16b, {v4.16b},        v30.16b // trans4_float
+        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
+
+        fadd            v20.4s, v16.4s, v4.4s
+        fsub            v22.4s, v16.4s, v4.4s
+        fadd            v21.4s, v17.4s, v5.4s
+        st1             {v20.4s}, [x0], #16     // {z[0], z[1]}
+        fsub            v23.4s, v17.4s, v5.4s
+
+        st1             {v21.4s}, [x1], #16     // {z[o1],z[o1+1]}
+        st1             {v22.4s}, [x2], #16     // {z[o2],z[o2+1]}
+        st1             {v23.4s}, [x3], #16     // {z[o3],z[o3+1]}
+1:
+        ld1             {v20.4s},[x2]    // {z[o2],z[o2+1]}
+        ld1             {v22.4s},[x3]    // {z[o3],z[o3+1]}
+        ld1             {v4.2s}, [x4], #8       // {wre[0],wre[1]}
+        transpose       v26.2d, v27.2d, v20.2d, v22.2d
+        ld1             {v5.2s}, [x5], x7       // {wim[-1],wim[0]}
+        rev64           v6.4s,  v26.4s
+        fmul            v26.4s, v26.4s, v4.s[0]
+        rev64           v7.4s,  v27.4s
+        fmul            v27.4s, v27.4s, v4.s[1]
+        fmul            v6.4s,  v6.4s,  v29.4s
+        fmul            v7.4s,  v7.4s,  v29.4s
+        ld1             {v16.4s},[x0]           // {z[0],z[1]}
+        fmla            v26.4s, v6.4s,  v5.s[1] // {t1,t2,t5,t6}
+        fmla            v27.4s, v7.4s,  v5.s[0] // {t1a,t2a,t5a,t6a}
+        ld1             {v17.4s},[x1]           // {z[o1],z[o1+1]}
+
+        subs            x6,  x6,  #1            // n--
+
+        zip1            v20.4s, v26.4s, v27.4s
+        zip2            v21.4s, v26.4s, v27.4s
+        fneg            v22.4s, v20.4s
+        fadd            v4.4s,  v21.4s, v20.4s
+        fsub            v6.4s,  v20.4s, v21.4s  // just the second half
+        fadd            v5.4s,  v21.4s, v22.4s  // just the first half
+
+        tbl             v4.16b, {v4.16b},        v30.16b // trans4_float
+        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
+
+        fadd            v20.4s, v16.4s, v4.4s
+        fsub            v22.4s, v16.4s, v4.4s
+        fadd            v21.4s, v17.4s, v5.4s
+        st1             {v20.4s}, [x0], #16     // {z[0], z[1]}
+        fsub            v23.4s, v17.4s, v5.4s
+
+        st1             {v21.4s}, [x1], #16     // {z[o1],z[o1+1]}
+        st1             {v22.4s}, [x2], #16     // {z[o2],z[o2+1]}
+        st1             {v23.4s}, [x3], #16     // {z[o3],z[o3+1]}
+        b.ne            1b
+
+        ret
+endfunc
+
+.macro  def_fft n, n2, n4
+function fft\n\()_neon  align=6
+        sub             sp,  sp,  #16
+        stp             x28, x30, [sp]
+        add             x28, x0,  #\n4*2*8
+        bl              fft\n2\()_neon
+        mov             x0,  x28
+        bl              fft\n4\()_neon
+        add             x0,  x28, #\n4*1*8
+        bl              fft\n4\()_neon
+        sub             x0,  x28, #\n4*2*8
+        ldp             x28, x30, [sp], #16
+        movrel          x4,  X(ff_cos_\n)
+        mov             x2,  #\n4/2
+        b               fft_pass_neon
+endfunc
+.endm
+
+        def_fft    32,    16,     8
+        def_fft    64,    32,    16
+        def_fft   128,    64,    32
+        def_fft   256,   128,    64
+        def_fft   512,   256,   128
+        def_fft  1024,   512,   256
+        def_fft  2048,  1024,   512
+        def_fft  4096,  2048,  1024
+        def_fft  8192,  4096,  2048
+        def_fft 16384,  8192,  4096
+        def_fft 32768, 16384,  8192
+        def_fft 65536, 32768, 16384
+
+function ff_fft_calc_neon, export=1
+        prfm            pldl1keep, [x1]
+        movrel          x10, trans4_float
+        ldr             w2,  [x0]
+        movrel          x11, trans8_float
+        sub             w2,  w2,  #2
+        movrel          x3,  fft_tab_neon
+        ld1             {v30.16b}, [x10]
+        mov             x7,  #-8
+        movrel          x12, pmmp
+        ldr             x3,  [x3, x2, lsl #3]
+        movrel          x13, mppm
+        movrel          x14, X(ff_cos_16)
+        ld1             {v31.16b}, [x11]
+        mov             x0,  x1
+        ld1             {v29.4s},  [x12]         // pmmp
+        ld1             {v28.4s},  [x13]
+        br              x3
+endfunc
+
+function ff_fft_permute_neon, export=1
+        mov             x6,  #1
+        ldr             w2,  [x0]       // nbits
+        ldr             x3,  [x0, #16]  // tmp_buf
+        ldr             x0,  [x0, #8]   // revtab
+        lsl             x6,  x6, x2
+        mov             x2,  x6
+1:
+        ld1             {v0.2s,v1.2s}, [x1], #16
+        ldr             w4,  [x0], #4
+        uxth            w5,  w4
+        lsr             w4,  w4,  #16
+        add             x5,  x3,  x5,  lsl #3
+        add             x4,  x3,  x4,  lsl #3
+        st1             {v0.2s}, [x5]
+        st1             {v1.2s}, [x4]
+        subs            x6,  x6, #2
+        b.gt            1b
+
+        sub             x1,  x1,  x2,  lsl #3
+1:
+        ld1             {v0.4s,v1.4s}, [x3], #32
+        st1             {v0.4s,v1.4s}, [x1], #32
+        subs            x2,  x2,  #4
+        b.gt            1b
+
+        ret
+endfunc
+
+const   fft_tab_neon
+        .quad fft4_neon
+        .quad fft8_neon
+        .quad fft16_neon
+        .quad fft32_neon
+        .quad fft64_neon
+        .quad fft128_neon
+        .quad fft256_neon
+        .quad fft512_neon
+        .quad fft1024_neon
+        .quad fft2048_neon
+        .quad fft4096_neon
+        .quad fft8192_neon
+        .quad fft16384_neon
+        .quad fft32768_neon
+        .quad fft65536_neon
+endconst
+
+const   pmmp, align=4
+        .float          +1.0, -1.0, -1.0, +1.0
+endconst
+
+const   mppm, align=4
+        .float          -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
+endconst
--- a/libavcodec/fft.h
+++ b/libavcodec/fft.h
@@ -133,6 +133,7 @@ void ff_init_ff_cos_tabs(int index);
 */
 int ff_fft_init(FFTContext *s, int nbits, int inverse);

+void ff_fft_init_aarch64(FFTContext *s);
 void ff_fft_init_x86(FFTContext *s);
 void ff_fft_init_arm(FFTContext *s);
 void ff_fft_init_ppc(FFTContext *s);

--- a/libavcodec/fft_template.c
+++ b/libavcodec/fft_template.c
@@ -158,6 +158,7 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
 #endif

 #if FFT_FLOAT
+    if (ARCH_AARCH64) ff_fft_init_aarch64(s);
    if (ARCH_ARM)     ff_fft_init_arm(s);
    if (ARCH_PPC)     ff_fft_init_ppc(s);
    if (ARCH_X86)     ff_fft_init_x86(s);