Commit ab123739, authored by Rong Yan, committed by Michael Niedermayer

ppc: Fix the bug of fft for little endian environment on POWER7 and later

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
parent 8e2ade43
@@ -1548,6 +1548,7 @@ ARCH_EXT_LIST_PPC="
     dcbzl
     ldbrx
     ppc4xx
+    vsx
 "
 ARCH_EXT_LIST_X86="
@@ -1933,6 +1934,7 @@ mipsdspr2_deps="mips"
 altivec_deps="ppc"
 ppc4xx_deps="ppc"
+vsx_deps="ppc"
 cpunop_deps="i686"
 x86_64_select="i686"
@@ -3691,6 +3693,7 @@ elif enabled mips; then
 elif enabled ppc; then
     disable ldbrx
+    disable vsx
     case $(tolower $cpu) in
         601|ppc601|powerpc601)
@@ -3718,7 +3721,7 @@ elif enabled ppc; then
         g5|970|ppc970|powerpc970)
             cpuflags="-mcpu=970"
         ;;
-        power[3-7]*)
+        power[3-8]*)
             cpuflags="-mcpu=$cpu"
         ;;
         cell)
@@ -4286,6 +4289,12 @@ unsigned int endian = 'B' << 24 | 'I' << 16 | 'G' << 8 | 'E';
 EOF
 od -t x1 $TMPO | grep -q '42 *49 *47 *45' && enable bigendian

+if [ "$cpu" = "power7" ] || [ "$cpu" = "power8" ] ;then
+    if ! enabled bigendian ;then
+        enable vsx
+    fi
+fi
+
 if enabled asm; then
     enabled arm && nogas=die
@@ -4415,6 +4424,9 @@ EOF
         enabled altivec || warn "Altivec disabled, possibly missing --cpu flag"
     fi
+    if enabled vsx; then
+        check_cflags -mvsx
+    fi
 elif enabled x86; then
     check_builtin rdtsc intrin.h "__rdtsc()"
...@@ -5256,6 +5268,7 @@ fi ...@@ -5256,6 +5268,7 @@ fi
if enabled ppc; then if enabled ppc; then
echo "AltiVec enabled ${altivec-no}" echo "AltiVec enabled ${altivec-no}"
echo "PPC 4xx optimizations ${ppc4xx-no}" echo "PPC 4xx optimizations ${ppc4xx-no}"
echo "PPC VSX optimizations ${vsx-no}"
echo "dcbzl available ${dcbzl-no}" echo "dcbzl available ${dcbzl-no}"
fi fi
echo "debug symbols ${debug-no}" echo "debug symbols ${debug-no}"
......
@@ -26,4 +26,5 @@ ALTIVEC-OBJS-$(CONFIG_DSPUTIL)         += ppc/dsputil_altivec.o \
                                           ppc/int_altivec.o \
 FFT-OBJS-$(HAVE_GNU_AS)                += ppc/fft_altivec_s.o
+FFT-OBJS-$(HAVE_VSX)                   += ppc/fft_vsx.o
 ALTIVEC-OBJS-$(CONFIG_FFT)             += $(FFT-OBJS-yes)
@@ -36,8 +36,12 @@
  * It also assumes all FFTComplex are 8 bytes-aligned pairs of floats.
  */

+#if HAVE_VSX
+#include "fft_vsx.h"
+#else
 void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_interleave_altivec(FFTContext *s, FFTComplex *z);
+#endif

 #if HAVE_GNU_AS && HAVE_ALTIVEC
 static void imdct_half_altivec(FFTContext *s, FFTSample *output, const FFTSample *input)
@@ -94,7 +98,11 @@ static void imdct_half_altivec(FFTContext *s, FFTSample *output, const FFTSample
         k--;
     } while(k >= 0);

+#if HAVE_VSX
+    ff_fft_calc_vsx(s, (FFTComplex*)output);
+#else
     ff_fft_calc_altivec(s, (FFTComplex*)output);
+#endif

     /* post rotation + reordering */
     j = -n32;
@@ -147,7 +155,11 @@ av_cold void ff_fft_init_ppc(FFTContext *s)
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;

+#if HAVE_VSX
+    s->fft_calc = ff_fft_calc_interleave_vsx;
+#else
     s->fft_calc = ff_fft_calc_interleave_altivec;
+#endif
     if (s->mdct_bits >= 5) {
         s->imdct_calc = imdct_calc_altivec;
         s->imdct_half = imdct_half_altivec;
/*
* FFT transform, optimized with VSX built-in functions
* Copyright (c) 2014 Rong Yan
*
* This algorithm (though not any of the implementation details) is
* based on libdjbfft by D. J. Bernstein.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/fft.h"
#include "libavcodec/fft-internal.h"
#include "fft_vsx.h"
#if HAVE_VSX
/* 32-point interleaved FFT: 16-point lower half, two 8-point upper quarters,
 * then a combine pass with the ff_cos_32 twiddle table. */
static void fft32_vsx_interleave(FFTComplex *z)
{
    FFTComplex *upper = z + 16;

    fft16_vsx_interleave(z);
    fft8_vsx_interleave(upper);
    fft8_vsx_interleave(upper + 8);
    pass_vsx_interleave(z, ff_cos_32, 4);
}
/* 64-point interleaved FFT: 32 + 2x16 sub-transforms, then a combine pass. */
static void fft64_vsx_interleave(FFTComplex *z)
{
    FFTComplex *upper = z + 32;

    fft32_vsx_interleave(z);
    fft16_vsx_interleave(upper);
    fft16_vsx_interleave(upper + 16);
    pass_vsx_interleave(z, ff_cos_64, 8);
}
/* 128-point interleaved FFT: 64 + 2x32 sub-transforms, then a combine pass. */
static void fft128_vsx_interleave(FFTComplex *z)
{
    FFTComplex *upper = z + 64;

    fft64_vsx_interleave(z);
    fft32_vsx_interleave(upper);
    fft32_vsx_interleave(upper + 32);
    pass_vsx_interleave(z, ff_cos_128, 16);
}
/* 256-point interleaved FFT: 128 + 2x64 sub-transforms, then a combine pass. */
static void fft256_vsx_interleave(FFTComplex *z)
{
    FFTComplex *upper = z + 128;

    fft128_vsx_interleave(z);
    fft64_vsx_interleave(upper);
    fft64_vsx_interleave(upper + 64);
    pass_vsx_interleave(z, ff_cos_256, 32);
}
/* 512-point interleaved FFT: 256 + 2x128 sub-transforms, then a combine pass. */
static void fft512_vsx_interleave(FFTComplex *z)
{
    FFTComplex *upper = z + 256;

    fft256_vsx_interleave(z);
    fft128_vsx_interleave(upper);
    fft128_vsx_interleave(upper + 128);
    pass_vsx_interleave(z, ff_cos_512, 64);
}
/* 1024-point interleaved FFT: 512 + 2x256 sub-transforms, then a combine pass. */
static void fft1024_vsx_interleave(FFTComplex *z)
{
    FFTComplex *upper = z + 512;

    fft512_vsx_interleave(z);
    fft256_vsx_interleave(upper);
    fft256_vsx_interleave(upper + 256);
    pass_vsx_interleave(z, ff_cos_1024, 128);
}
/* 2048-point interleaved FFT: 1024 + 2x512 sub-transforms, then a combine pass. */
static void fft2048_vsx_interleave(FFTComplex *z)
{
    FFTComplex *upper = z + 1024;

    fft1024_vsx_interleave(z);
    fft512_vsx_interleave(upper);
    fft512_vsx_interleave(upper + 512);
    pass_vsx_interleave(z, ff_cos_2048, 256);
}
/* 4096-point interleaved FFT: 2048 + 2x1024 sub-transforms, then a combine pass. */
static void fft4096_vsx_interleave(FFTComplex *z)
{
    FFTComplex *upper = z + 2048;

    fft2048_vsx_interleave(z);
    fft1024_vsx_interleave(upper);
    fft1024_vsx_interleave(upper + 1024);
    pass_vsx_interleave(z, ff_cos_4096, 512);
}
/* 8192-point interleaved FFT: 4096 + 2x2048 sub-transforms, then a combine pass. */
static void fft8192_vsx_interleave(FFTComplex *z)
{
    FFTComplex *upper = z + 4096;

    fft4096_vsx_interleave(z);
    fft2048_vsx_interleave(upper);
    fft2048_vsx_interleave(upper + 2048);
    pass_vsx_interleave(z, ff_cos_8192, 1024);
}
/* 16384-point interleaved FFT: 8192 + 2x4096 sub-transforms, then a combine pass. */
static void fft16384_vsx_interleave(FFTComplex *z)
{
    FFTComplex *upper = z + 8192;

    fft8192_vsx_interleave(z);
    fft4096_vsx_interleave(upper);
    fft4096_vsx_interleave(upper + 4096);
    pass_vsx_interleave(z, ff_cos_16384, 2048);
}
/* 32768-point interleaved FFT: 16384 + 2x8192 sub-transforms, then a combine pass. */
static void fft32768_vsx_interleave(FFTComplex *z)
{
    FFTComplex *upper = z + 16384;

    fft16384_vsx_interleave(z);
    fft8192_vsx_interleave(upper);
    fft8192_vsx_interleave(upper + 8192);
    pass_vsx_interleave(z, ff_cos_32768, 4096);
}
/* 65536-point interleaved FFT: 32768 + 2x16384 sub-transforms, then a combine pass. */
static void fft65536_vsx_interleave(FFTComplex *z)
{
    FFTComplex *upper = z + 32768;

    fft32768_vsx_interleave(z);
    fft16384_vsx_interleave(upper);
    fft16384_vsx_interleave(upper + 16384);
    pass_vsx_interleave(z, ff_cos_65536, 8192);
}
/* 32-point FFT (non-interleaved): 16 + 2x8 sub-transforms, then a combine pass. */
static void fft32_vsx(FFTComplex *z)
{
    FFTComplex *upper = z + 16;

    fft16_vsx(z);
    fft8_vsx(upper);
    fft8_vsx(upper + 8);
    pass_vsx(z, ff_cos_32, 4);
}
/* 64-point FFT (non-interleaved): 32 + 2x16 sub-transforms, then a combine pass. */
static void fft64_vsx(FFTComplex *z)
{
    FFTComplex *upper = z + 32;

    fft32_vsx(z);
    fft16_vsx(upper);
    fft16_vsx(upper + 16);
    pass_vsx(z, ff_cos_64, 8);
}
/* 128-point FFT (non-interleaved): 64 + 2x32 sub-transforms, then a combine pass. */
static void fft128_vsx(FFTComplex *z)
{
    FFTComplex *upper = z + 64;

    fft64_vsx(z);
    fft32_vsx(upper);
    fft32_vsx(upper + 32);
    pass_vsx(z, ff_cos_128, 16);
}
/* 256-point FFT (non-interleaved): 128 + 2x64 sub-transforms, then a combine pass. */
static void fft256_vsx(FFTComplex *z)
{
    FFTComplex *upper = z + 128;

    fft128_vsx(z);
    fft64_vsx(upper);
    fft64_vsx(upper + 64);
    pass_vsx(z, ff_cos_256, 32);
}
/* 512-point FFT (non-interleaved): 256 + 2x128 sub-transforms, then a combine pass. */
static void fft512_vsx(FFTComplex *z)
{
    FFTComplex *upper = z + 256;

    fft256_vsx(z);
    fft128_vsx(upper);
    fft128_vsx(upper + 128);
    pass_vsx(z, ff_cos_512, 64);
}
/* 1024-point FFT (non-interleaved): 512 + 2x256 sub-transforms, then a combine pass. */
static void fft1024_vsx(FFTComplex *z)
{
    FFTComplex *upper = z + 512;

    fft512_vsx(z);
    fft256_vsx(upper);
    fft256_vsx(upper + 256);
    pass_vsx(z, ff_cos_1024, 128);
}
/* 2048-point FFT (non-interleaved): 1024 + 2x512 sub-transforms, then a combine pass. */
static void fft2048_vsx(FFTComplex *z)
{
    FFTComplex *upper = z + 1024;

    fft1024_vsx(z);
    fft512_vsx(upper);
    fft512_vsx(upper + 512);
    pass_vsx(z, ff_cos_2048, 256);
}
/* 4096-point FFT (non-interleaved): 2048 + 2x1024 sub-transforms, then a combine pass. */
static void fft4096_vsx(FFTComplex *z)
{
    FFTComplex *upper = z + 2048;

    fft2048_vsx(z);
    fft1024_vsx(upper);
    fft1024_vsx(upper + 1024);
    pass_vsx(z, ff_cos_4096, 512);
}
/* 8192-point FFT (non-interleaved): 4096 + 2x2048 sub-transforms, then a combine pass. */
static void fft8192_vsx(FFTComplex *z)
{
    FFTComplex *upper = z + 4096;

    fft4096_vsx(z);
    fft2048_vsx(upper);
    fft2048_vsx(upper + 2048);
    pass_vsx(z, ff_cos_8192, 1024);
}
/* 16384-point FFT (non-interleaved): 8192 + 2x4096 sub-transforms, then a combine pass. */
static void fft16384_vsx(FFTComplex *z)
{
    FFTComplex *upper = z + 8192;

    fft8192_vsx(z);
    fft4096_vsx(upper);
    fft4096_vsx(upper + 4096);
    pass_vsx(z, ff_cos_16384, 2048);
}
/* 32768-point FFT (non-interleaved): 16384 + 2x8192 sub-transforms, then a combine pass. */
static void fft32768_vsx(FFTComplex *z)
{
    FFTComplex *upper = z + 16384;

    fft16384_vsx(z);
    fft8192_vsx(upper);
    fft8192_vsx(upper + 8192);
    pass_vsx(z, ff_cos_32768, 4096);
}
/* 65536-point FFT (non-interleaved): 32768 + 2x16384 sub-transforms, then a combine pass. */
static void fft65536_vsx(FFTComplex *z)
{
    FFTComplex *upper = z + 32768;

    fft32768_vsx(z);
    fft16384_vsx(upper);
    fft16384_vsx(upper + 16384);
    pass_vsx(z, ff_cos_65536, 8192);
}
/* Non-interleaved transforms for sizes 4..65536, indexed by nbits - 2. */
static void (* const fft_dispatch_vsx[])(FFTComplex*) = {
    fft4_vsx,     fft8_vsx,     fft16_vsx,    fft32_vsx,
    fft64_vsx,    fft128_vsx,   fft256_vsx,   fft512_vsx,
    fft1024_vsx,  fft2048_vsx,  fft4096_vsx,  fft8192_vsx,
    fft16384_vsx, fft32768_vsx, fft65536_vsx,
};
/* Interleaved transforms for sizes 4..65536, indexed by nbits - 2. */
static void (* const fft_dispatch_vsx_interleave[])(FFTComplex*) = {
    fft4_vsx_interleave,     fft8_vsx_interleave,
    fft16_vsx_interleave,    fft32_vsx_interleave,
    fft64_vsx_interleave,    fft128_vsx_interleave,
    fft256_vsx_interleave,   fft512_vsx_interleave,
    fft1024_vsx_interleave,  fft2048_vsx_interleave,
    fft4096_vsx_interleave,  fft8192_vsx_interleave,
    fft16384_vsx_interleave, fft32768_vsx_interleave,
    fft65536_vsx_interleave,
};
/* In-place interleaved FFT of z, sized by s->nbits.
 * The dispatch table starts at the 4-point transform, hence the - 2. */
void ff_fft_calc_interleave_vsx(FFTContext *s, FFTComplex *z)
{
    void (* const fft)(FFTComplex *) = fft_dispatch_vsx_interleave[s->nbits - 2];

    fft(z);
}
/* In-place non-interleaved FFT of z, sized by s->nbits.
 * The dispatch table starts at the 4-point transform, hence the - 2. */
void ff_fft_calc_vsx(FFTContext *s, FFTComplex *z)
{
    void (* const fft)(FFTComplex *) = fft_dispatch_vsx[s->nbits - 2];

    fft(z);
}
#endif /* HAVE_VSX */
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.