Commit 047cf46e authored by Michael Niedermayer

Merge commit '82ee14d2'

* commit '82ee14d2':
  ppc: dsputil: comment formatting and wording/grammar improvements

Conflicts:
	libavcodec/ppc/gmc_altivec.c
	libavcodec/ppc/idct_altivec.c
Merged-by: Michael Niedermayer <michaelni@gmx.at>
parents 75cc29a8 82ee14d2
This diff is collapsed.
......@@ -32,24 +32,23 @@
/* ***** WARNING ***** WARNING ***** WARNING ***** */
/*
clear_blocks_dcbz32_ppc will not work properly on PowerPC processors with a
cache line size not equal to 32 bytes.
Fortunately all processor used by Apple up to at least the 7450 (aka second
generation G4) use 32 bytes cache line.
This is due to the use of the 'dcbz' instruction. It simply clear to zero a
single cache line, so you need to know the cache line size to use it !
It's absurd, but it's fast...
update 24/06/2003 : Apple released yesterday the G5, with a PPC970. cache line
size: 128 bytes. Oups.
The semantic of dcbz was changed, it always clear 32 bytes. so the function
below will work, but will be slow. So I fixed check_dcbz_effect to use dcbzl,
which is defined to clear a cache line (as dcbz before). So we still can
distinguish, and use dcbz (32 bytes) or dcbzl (one cache line) as required.
see <http://developer.apple.com/technotes/tn/tn2087.html>
and <http://developer.apple.com/technotes/tn/tn2086.html>
*/
* clear_blocks_dcbz32_ppc will not work properly on PowerPC processors with
* a cache line size not equal to 32 bytes. Fortunately all processors used
* by Apple up to at least the 7450 (AKA second generation G4) use 32-byte
* cache lines. This is due to the use of the 'dcbz' instruction. It simply
* clears a single cache line to zero, so you need to know the cache line
* size to use it! It's absurd, but it's fast...
*
* Update 24/06/2003: Apple released the G5 yesterday, with a PPC970.
* Cache line size: 128 bytes. Oops.
* The semantics of dcbz were changed; it now always clears 32 bytes. So the function
* below will work, but will be slow. So I fixed check_dcbz_effect to use dcbzl,
* which is defined to clear a cache line (as dcbz before). So we can still
* distinguish, and use dcbz (32 bytes) or dcbzl (one cache line) as required.
*
* see <http://developer.apple.com/technotes/tn/tn2087.html>
* and <http://developer.apple.com/technotes/tn/tn2086.html>
*/
static void clear_blocks_dcbz32_ppc(int16_t *blocks)
{
register int misal = ((unsigned long)blocks & 0x00000010);
......@@ -73,17 +72,17 @@ static void clear_blocks_dcbz32_ppc(int16_t *blocks)
}
}
/* same as above, when dcbzl clear a whole 128B cache line
i.e. the PPC970 aka G5 */
/* Same as above, when dcbzl clears a whole 128-byte cache line,
* i.e. on the PPC970 AKA G5. */
#if HAVE_DCBZL
static void clear_blocks_dcbz128_ppc(int16_t *blocks)
{
register int misal = ((unsigned long)blocks & 0x0000007f);
register int i = 0;
if (misal) {
// we could probably also optimize this case,
// but there's not much point as the machines
// aren't available yet (2003-06-26)
/* We could probably also optimize this case,
* but there's not much point as the machines
* aren't available yet (2003-06-26). */
memset(blocks, 0, sizeof(int16_t)*6*64);
}
else
......@@ -99,11 +98,10 @@ static void clear_blocks_dcbz128_ppc(int16_t *blocks)
#endif
#if HAVE_DCBZL
/* check dcbz report how many bytes are set to 0 by dcbz */
/* update 24/06/2003 : replace dcbz by dcbzl to get
the intended effect (Apple "fixed" dcbz)
unfortunately this cannot be used unless the assembler
knows about dcbzl ... */
/* check_dcbzl_effect reports how many bytes are set to 0 by dcbzl. */
/* update 24/06/2003: Replace dcbz by dcbzl to get the intended effect
* (Apple "fixed" dcbz). Unfortunately this cannot be used unless the
* assembler knows about dcbzl ... */
static long check_dcbzl_effect(void)
{
register char *fakedata = av_malloc(1024);
......@@ -120,8 +118,8 @@ static long check_dcbzl_effect(void)
memset(fakedata, 0xFF, 1024);
/* below the constraint "b" seems to mean "Address base register"
in gcc-3.3 / RS/6000 speaks. seems to avoid using r0, so.... */
/* Below, the constraint "b" seems to mean "address base register"
* in gcc-3.3 / RS/6000 speak. It seems to avoid using r0, so.... */
__asm__ volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero));
for (i = 0; i < 1024 ; i ++) {
......@@ -145,7 +143,7 @@ av_cold void ff_dsputil_init_ppc(DSPContext *c, AVCodecContext *avctx)
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
int mm_flags = av_get_cpu_flags();
// Common optimizations whether AltiVec is available or not
// common optimizations whether AltiVec is available or not
if (!high_bit_depth) {
switch (check_dcbzl_effect()) {
case 32:
......
......@@ -259,11 +259,10 @@ void ff_fdct_altivec(int16_t *block)
#undef MERGE_S16
/* }}} */
/* Some of the initial calculations can be done as vector short
* before conversion to vector float. The following code section
* takes advantage of this. */
/* Some of the initial calculations can be done as vector short before
* conversion to vector float. The following code section takes advantage
* of this.
*/
/* fdct rows {{{ */
x0 = ((vector float)vec_add(vs16(b00), vs16(b70)));
x7 = ((vector float)vec_sub(vs16(b00), vs16(b70)));
......
......@@ -27,12 +27,12 @@
#include "libavcodec/fft.h"
/**
* Do a complex FFT with the parameters defined in ff_fft_init(). The
* input data must be permuted before with s->revtab table. No
* 1.0/sqrt(n) normalization is done.
* AltiVec-enabled
* This code assumes that the 'z' pointer is 16 bytes-aligned
* It also assumes all FFTComplex are 8 bytes-aligned pair of float
* Do a complex FFT with the parameters defined in ff_fft_init().
* The input data must be permuted beforehand with the s->revtab table.
* No 1.0 / sqrt(n) normalization is done.
* AltiVec-enabled:
* This code assumes that the 'z' pointer is 16-byte aligned.
* It also assumes all FFTComplex are 8-byte aligned pairs of floats.
*/
void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z);
......
/*
* GMC (Global Motion Compensation)
* AltiVec-enabled
* GMC (Global Motion Compensation), AltiVec-enabled
*
* Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
*
* This file is part of FFmpeg.
......@@ -25,10 +25,8 @@
#include "libavutil/ppc/util_altivec.h"
#include "dsputil_altivec.h"
/*
altivec-enhanced gmc1. ATM this code assume stride is a multiple of 8,
to preserve proper dst alignment.
*/
/* AltiVec-enhanced gmc1. ATM this code assumes stride is a multiple of 8
* to preserve proper dst alignment. */
void ff_gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int stride, int h, int x16, int y16, int rounder)
{
const DECLARE_ALIGNED(16, unsigned short, rounder_a) = rounder;
......@@ -56,18 +54,16 @@ void ff_gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int
rounderV = vec_splat((vec_u16)vec_lde(0, &rounder_a), 0);
// we'll be able to pick-up our 9 char elements
// at src from those 32 bytes
// we load the first batch here, as inside the loop
// we can re-use 'src+stride' from one iteration
// as the 'src' of the next.
/* We'll be able to pick up our 9 char elements at src from those
* 32 bytes. We load the first batch here, as inside the loop we can
* reuse 'src + stride' from one iteration as the 'src' of the next. */
src_0 = vec_ld(0, src);
src_1 = vec_ld(16, src);
srcvA = vec_perm(src_0, src_1, vec_lvsl(0, src));
if (src_really_odd != 0x0000000F) {
// if (src & 0xF) == 0xF, then (src+1) is properly aligned
// on the second vector.
/* If (src & 0xF) == 0xF, then (src + 1) is properly aligned
* on the second vector. */
srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src));
} else {
srcvB = src_1;
......@@ -81,17 +77,16 @@ void ff_gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int
dstv = vec_ld(0, dst);
// we we'll be able to pick-up our 9 char elements
// at src + stride from those 32 bytes
// then reuse the resulting 2 vectors srvcC and srcvD
// as the next srcvA and srcvB
/* We'll be able to pick up our 9 char elements at src + stride from
* those 32 bytes, then reuse the resulting 2 vectors srcvC and srcvD
* as the next srcvA and srcvB. */
src_0 = vec_ld(stride + 0, src);
src_1 = vec_ld(stride + 16, src);
srcvC = vec_perm(src_0, src_1, vec_lvsl(stride + 0, src));
if (src_really_odd != 0x0000000F) {
// if (src & 0xF) == 0xF, then (src+1) is properly aligned
// on the second vector.
/* If (src & 0xF) == 0xF, then (src + 1) is properly aligned
* on the second vector. */
srcvD = vec_perm(src_0, src_1, vec_lvsl(stride + 1, src));
} else {
srcvD = src_1;
......@@ -100,10 +95,9 @@ void ff_gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int
srcvC = vec_mergeh(vczero, srcvC);
srcvD = vec_mergeh(vczero, srcvD);
// OK, now we (finally) do the math :-)
// those four instructions replaces 32 int muls & 32 int adds.
// isn't AltiVec nice ?
/* OK, now we (finally) do the math :-)
* Those four instructions replace 32 int muls & 32 int adds.
* Isn't AltiVec nice? */
tempA = vec_mladd((vector unsigned short)srcvA, Av, rounderV);
tempB = vec_mladd((vector unsigned short)srcvB, Bv, tempA);
tempC = vec_mladd((vector unsigned short)srcvC, Cv, tempB);
......
......@@ -18,24 +18,19 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* NOTE: This code is based on GPL code from the libmpeg2 project. The
/* NOTE: This code is based on GPL code from the libmpeg2 project. The
* author, Michel Lespinasse, has given explicit permission to release
* under LGPL as part of FFmpeg.
*/
/*
*
* FFmpeg integration by Dieter Shirley
*
* This file is a direct copy of the AltiVec IDCT module from the libmpeg2
* project. I've deleted all of the libmpeg2-specific code, renamed the
* functions and reordered the function parameters. The only change to the
* IDCT function itself was to factor out the partial transposition, and to
* perform a full transpose at the end of the function.
*/
* perform a full transpose at the end of the function. */
#include <stdlib.h> /* malloc(), free() */
#include <stdlib.h>
#include <string.h>
#include "config.h"
#if HAVE_ALTIVEC_H
......
......@@ -19,9 +19,9 @@
*/
/**
** @file
** integer misc ops.
**/
* @file
* miscellaneous integer operations
*/
#include "config.h"
#if HAVE_ALTIVEC_H
......@@ -43,8 +43,8 @@ static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
int32_t score[4];
} u;
u.vscore = vec_splat_s32(0);
//
//XXX lazy way, fix it later
// XXX lazy way, fix it later
#define vec_unaligned_load(b) \
vec_perm(vec_ld(0,b),vec_ld(15,b),vec_lvsl(0, b));
......@@ -52,12 +52,12 @@ static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
size16 = size >> 4;
while(size16) {
// score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
//load pix1 and the first batch of pix2
// load pix1 and the first batch of pix2
vpix1 = vec_unaligned_load(pix1);
vpix2 = vec_unaligned_load(pix2);
pix2 += 8;
//unpack
// unpack
vpix1h = vec_unpackh(vpix1);
vdiff = vec_sub(vpix1h, vpix2);
vpix1l = vec_unpackl(vpix1);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment