Commit 82ee14d2 authored Jan 15, 2014 by Diego Biurrun
ppc: dsputil: comment formatting and wording/grammar improvements
parent cce791b1
Showing 7 changed files with 180 additions and 199 deletions
libavcodec/ppc/dsputil_altivec.c  +114 -119
libavcodec/ppc/dsputil_ppc.c      +29 -31
libavcodec/ppc/fdct_altivec.c     +3 -4
libavcodec/ppc/fft_altivec.c      +6 -6
libavcodec/ppc/gmc_altivec.c      +17 -23
libavcodec/ppc/idct_altivec.c     +4 -9
libavcodec/ppc/int_altivec.c      +7 -7
libavcodec/ppc/dsputil_altivec.c (diff collapsed)
libavcodec/ppc/dsputil_ppc.c
...
...
@@ -32,24 +32,23 @@
/* ***** WARNING ***** WARNING ***** WARNING ***** */
-/*
-clear_blocks_dcbz32_ppc will not work properly on PowerPC processors with a
-cache line size not equal to 32 bytes.
-Fortunately all processor used by Apple up to at least the 7450 (aka second
-generation G4) use 32 bytes cache line.
-This is due to the use of the 'dcbz' instruction. It simply clear to zero a
-single cache line, so you need to know the cache line size to use it !
-It's absurd, but it's fast...
-update 24/06/2003 : Apple released yesterday the G5, with a PPC970. cache line
-size: 128 bytes. Oups.
-The semantic of dcbz was changed, it always clear 32 bytes. so the function
-below will work, but will be slow. So I fixed check_dcbz_effect to use dcbzl,
-which is defined to clear a cache line (as dcbz before). So we still can
-distinguish, and use dcbz (32 bytes) or dcbzl (one cache line) as required.
-see <http://developer.apple.com/technotes/tn/tn2087.html>
-and <http://developer.apple.com/technotes/tn/tn2086.html>
-*/
+/*
+ * clear_blocks_dcbz32_ppc will not work properly on PowerPC processors with
+ * a cache line size not equal to 32 bytes. Fortunately all processors used
+ * by Apple up to at least the 7450 (AKA second generation G4) use 32-byte
+ * cache lines. This is due to the use of the 'dcbz' instruction. It simply
+ * clears a single cache line to zero, so you need to know the cache line
+ * size to use it! It's absurd, but it's fast...
+ *
+ * update 24/06/2003: Apple released the G5 yesterday, with a PPC970.
+ * cache line size: 128 bytes. Oups.
+ * The semantics of dcbz was changed, it always clears 32 bytes. So the function
+ * below will work, but will be slow. So I fixed check_dcbz_effect to use dcbzl,
+ * which is defined to clear a cache line (as dcbz before). So we can still
+ * distinguish, and use dcbz (32 bytes) or dcbzl (one cache line) as required.
+ *
+ * see <http://developer.apple.com/technotes/tn/tn2087.html>
+ * and <http://developer.apple.com/technotes/tn/tn2086.html>
+ */
static void clear_blocks_dcbz32_ppc(int16_t *blocks)
{
    register int misal = ((unsigned long) blocks & 0x00000010);
...
...
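The comment above boils down to one idiom: dcbz zeroes the entire cache line containing the given address, so clearing a block needs only one instruction per 32 bytes. Below is a minimal sketch of that idiom, assuming GCC-style PowerPC inline assembly and a true 32-byte line size; the helper name is made up for illustration and this is not the function from this file.

#include <stddef.h>
#include <stdint.h>

static void clear_int16_dcbz32(int16_t *p, size_t bytes)
{
    size_t i;

    /* Assumes p is 32-byte aligned and bytes is a multiple of 32; each
     * dcbz zeroes the whole 32-byte cache line at address p + i. */
    for (i = 0; i < bytes; i += 32)
        __asm__ volatile ("dcbz %0, %1" : : "b" (p), "r" (i) : "memory");
}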
@@ -73,17 +72,17 @@ static void clear_blocks_dcbz32_ppc(int16_t *blocks)
    }
}
-/* same as above, when dcbzl clear a whole 128B cache line
-   i.e. the PPC970 aka G5 */
+/* Same as above, when dcbzl clears a whole 128 bytes cache line
+ * i.e. the PPC970 AKA G5. */
#if HAVE_DCBZL
static void clear_blocks_dcbz128_ppc(int16_t *blocks)
{
    register int misal = ((unsigned long) blocks & 0x0000007f);
    register int i     = 0;

    if (misal) {
-        // we could probably also optimize this case,
-        // but there's not much point as the machines
-        // aren't available yet (2003-06-26)
+        /* We could probably also optimize this case,
+         * but there's not much point as the machines
+         * aren't available yet (2003-06-26). */
        memset(blocks, 0, sizeof(int16_t) * 6 * 64);
    } else
...
...
@@ -99,11 +98,10 @@ static void clear_blocks_dcbz128_ppc(int16_t *blocks)
#endif
#if HAVE_DCBZL
-/* check dcbz report how many bytes are set to 0 by dcbz */
-/* update 24/06/2003 : replace dcbz by dcbzl to get
-   the intended effect (Apple "fixed" dcbz)
-   unfortunately this cannot be used unless the assembler
-   knows about dcbzl ... */
+/* Check dcbz report how many bytes are set to 0 by dcbz. */
+/* update 24/06/2003: Replace dcbz by dcbzl to get the intended effect
+ * (Apple "fixed" dcbz). Unfortunately this cannot be used unless the
+ * assembler knows about dcbzl ... */
static long check_dcbzl_effect(void)
{
    register char *fakedata = av_malloc(1024);
...
...
@@ -120,8 +118,8 @@ static long check_dcbzl_effect(void)
    memset(fakedata, 0xFF, 1024);
-    /* below the constraint "b" seems to mean "Address base register"
-       in gcc-3.3 / RS/6000 speaks. seems to avoid using r0, so.... */
+    /* Below the constraint "b" seems to mean "address base register"
+     * in gcc-3.3 / RS/6000 speaks. Seems to avoid using r0, so.... */
    __asm__ volatile ("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero));

    for (i = 0; i < 1024; i++) {
...
...
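The probe the comments describe is straightforward: fill a buffer with 0xFF, clear one cache line with dcbzl, and count how many bytes became zero; the count is the line size (32 on a G4, 128 on a G5). A rough standalone sketch of the same idea, using plain malloc instead of av_malloc and assuming an assembler that knows dcbzl; it is not this file's exact code.

#include <stdlib.h>
#include <string.h>

static long probe_dcbzl_linesize(void)
{
    char *buf = malloc(1024);
    long  i, count = 0, zero = 0;

    if (!buf)
        return 0;
    memset(buf, 0xFF, 1024);
    /* Zero the cache line containing the middle of the buffer. */
    __asm__ volatile ("dcbzl %0, %1" : : "b" (buf + 512), "r" (zero) : "memory");
    for (i = 0; i < 1024; i++)
        count += (buf[i] == 0);
    free(buf);
    return count;
}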
@@ -144,7 +142,7 @@ av_cold void ff_dsputil_init_ppc(DSPContext *c, AVCodecContext *avctx)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

-    // Common optimizations whether AltiVec is available or not
+    // common optimizations whether AltiVec is available or not
    if (!high_bit_depth) {
        switch (check_dcbzl_effect()) {
        case 32:
...
libavcodec/ppc/fdct_altivec.c
...
...
@@ -259,11 +259,10 @@ void ff_fdct_altivec(int16_t *block)
#undef MERGE_S16
/* }}} */
-/* Some of the initial calculations can be done as vector short
- * before conversion to vector float. The following code section
- * takes advantage of this. */
+/* Some of the initial calculations can be done as vector short before
+ * conversion to vector float. The following code section takes advantage
+ * of this.
+ */
/* fdct rows {{{ */
    x0 = ((vector float) vec_add(vs16(b00), vs16(b70)));
    x7 = ((vector float) vec_sub(vs16(b00), vs16(b70)));
...
...
libavcodec/ppc/fft_altivec.c
...
...
@@ -27,12 +27,12 @@
#include "libavcodec/fft.h"
/**
- * Do a complex FFT with the parameters defined in ff_fft_init(). The
- * input data must be permuted before with s->revtab table. No
- * 1.0/sqrt(n) normalization is done.
- * AltiVec-enabled
- * This code assumes that the 'z' pointer is 16 bytes-aligned
- * It also assumes all FFTComplex are 8 bytes-aligned pair of float
+ * Do a complex FFT with the parameters defined in ff_fft_init().
+ * The input data must be permuted before with s->revtab table.
+ * No 1.0 / sqrt(n) normalization is done.
+ * AltiVec-enabled:
+ * This code assumes that the 'z' pointer is 16 bytes-aligned.
+ * It also assumes all FFTComplex are 8 bytes-aligned pairs of floats.
 */
void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z);
...
...
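Taken together, the documented requirements imply a call sequence roughly like the sketch below: initialize the context with ff_fft_init(), write the input in bit-reversed order through s->revtab, then run the AltiVec transform on 16-byte-aligned data. This is an illustrative guess at typical usage based on the comment and the public fft.h interface, not code from this commit.

#include "libavcodec/fft.h"

/* Hypothetical usage sketch for a forward 2^nbits-point complex FFT.
 * 'out' must point to 16-byte-aligned storage for 1 << nbits elements. */
static int fft_altivec_example(const FFTComplex *in, FFTComplex *out, int nbits)
{
    FFTContext s;
    int i, n = 1 << nbits;

    if (ff_fft_init(&s, nbits, 0) < 0)
        return -1;
    for (i = 0; i < n; i++)
        out[s.revtab[i]] = in[i];   /* pre-permute the input */
    ff_fft_calc_altivec(&s, out);   /* no 1.0 / sqrt(n) scaling applied */
    ff_fft_end(&s);
    return 0;
}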
libavcodec/ppc/gmc_altivec.c
/*
- * GMC (Global Motion Compensation)
- * AltiVec-enabled
+ * GMC (Global Motion Compensation), AltiVec-enabled
 *
 * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of Libav.
...
...
@@ -25,10 +25,8 @@
#include "libavutil/ppc/util_altivec.h"
#include "dsputil_altivec.h"
-/*
-  altivec-enhanced gmc1. ATM this code assume stride is a multiple of 8,
-  to preserve proper dst alignment.
-*/
+/* AltiVec-enhanced gmc1. ATM this code assumes stride is a multiple of 8
+ * to preserve proper dst alignment. */
void ff_gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */,
                     int stride, int h, int x16, int y16, int rounder)
{
    const DECLARE_ALIGNED(16, unsigned short, rounder_a) = rounder;
...
...
@@ -56,18 +54,16 @@ void ff_gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int
    rounderV = vec_splat((vec_u16) vec_lde(0, &rounder_a), 0);
-    // we'll be able to pick-up our 9 char elements
-    // at src from those 32 bytes
-    // we load the first batch here, as inside the loop
-    // we can re-use 'src+stride' from one iteration
-    // as the 'src' of the next.
+    /* we'll be able to pick-up our 9 char elements at src from those
+     * 32 bytes we load the first batch here, as inside the loop we can
+     * reuse 'src + stride' from one iteration as the 'src' of the next. */
    src_0 = vec_ld(0, src);
    src_1 = vec_ld(16, src);
    srcvA = vec_perm(src_0, src_1, vec_lvsl(0, src));

    if (src_really_odd != 0x0000000F) {
-        // if src & 0xF == 0xF, then (src+1) is properly aligned
-        // on the second vector.
+        /* If src & 0xF == 0xF, then (src + 1) is properly aligned
+         * on the second vector. */
        srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src));
    } else {
        srcvB = src_1;
...
...
@@ -81,17 +77,16 @@ void ff_gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int
        dstv = vec_ld(0, dst);
-        // we we'll be able to pick-up our 9 char elements
-        // at src + stride from those 32 bytes
-        // then reuse the resulting 2 vectors srvcC and srcvD
-        // as the next srcvA and srcvB
+        /* We'll be able to pick-up our 9 char elements at src + stride from
+         * those 32 bytes then reuse the resulting 2 vectors srvcC and srcvD
+         * as the next srcvA and srcvB. */
        src_0 = vec_ld(stride + 0, src);
        src_1 = vec_ld(stride + 16, src);
        srcvC = vec_perm(src_0, src_1, vec_lvsl(stride + 0, src));

        if (src_really_odd != 0x0000000F) {
-            // if src & 0xF == 0xF, then (src+1) is properly aligned
-            // on the second vector.
+            /* If src & 0xF == 0xF, then (src + 1) is properly aligned
+             * on the second vector. */
            srcvD = vec_perm(src_0, src_1, vec_lvsl(stride + 1, src));
        } else {
            srcvD = src_1;
...
...
@@ -100,10 +95,9 @@ void ff_gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int
        srcvC = vec_mergeh(vczero, srcvC);
        srcvD = vec_mergeh(vczero, srcvD);
-        // OK, now we (finally) do the math :-)
-        // those four instructions replaces 32 int muls & 32 int adds.
-        // isn't AltiVec nice ?
+        /* OK, now we (finally) do the math :-)
+         * Those four instructions replace 32 int muls & 32 int adds.
+         * Isn't AltiVec nice? */
        tempA = vec_mladd((vector unsigned short) srcvA, Av, rounderV);
        tempB = vec_mladd((vector unsigned short) srcvB, Bv, tempA);
        tempC = vec_mladd((vector unsigned short) srcvC, Cv, tempB);
...
...
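The four vec_mladd calls that comment refers to are the vector form of the usual one-warp-point GMC bilinear interpolation, eight pixels per iteration. As a hedged scalar reference (my reading of what the vector code computes, not code from this commit), the per-pixel math is:

#include <stdint.h>

/* Scalar sketch of the gmc1 kernel: A, B, C, D are the 16x16 bilinear
 * weights built from the fractional offsets x16/y16, and 'rounder'
 * plays the role of rounderV in the AltiVec version. */
static void gmc1_scalar_ref(uint8_t *dst, const uint8_t *src, int stride,
                            int h, int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = x16        * (16 - y16);
    const int C = (16 - x16) * y16;
    const int D = x16        * y16;
    int i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++)
            dst[j] = (A * src[j]          + B * src[j + 1] +
                      C * src[j + stride] + D * src[j + stride + 1] +
                      rounder) >> 8;
        dst += stride;
        src += stride;
    }
}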
libavcodec/ppc/idct_altivec.c
...
...
@@ -18,24 +18,19 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-/*
- * NOTE: This code is based on GPL code from the libmpeg2 project. The
+/* NOTE: This code is based on GPL code from the libmpeg2 project. The
  * author, Michel Lespinasses, has given explicit permission to release
  * under LGPL as part of Libav.
- */
-/*
- *
+ *
  * Libav integration by Dieter Shirley
  *
  * This file is a direct copy of the AltiVec IDCT module from the libmpeg2
  * project. I've deleted all of the libmpeg2-specific code, renamed the
  * functions and reordered the function parameters. The only change to the
  * IDCT function itself was to factor out the partial transposition, and to
- * perform a full transpose at the end of the function.
- */
+ * perform a full transpose at the end of the function. */
-#include <stdlib.h>                             /* malloc(), free() */
+#include <stdlib.h>
#include <string.h>
#include "config.h"
#if HAVE_ALTIVEC_H
...
...
libavcodec/ppc/int_altivec.c
...
...
@@ -19,9 +19,9 @@
*/
/**
- *
- * @file
- *
- * integer misc ops.
- *
- */
+ * @file
+ * miscellaneous integer operations
+ */
#include "config.h"
#if HAVE_ALTIVEC_H
...
...
@@ -43,8 +43,8 @@ static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
        int32_t score[4];
    } u;
    u.vscore = vec_splat_s32(0);
-//XXX lazy way, fix it later
+// XXX lazy way, fix it later
#define vec_unaligned_load(b) \
vec_perm(vec_ld(0,b),vec_ld(15,b),vec_lvsl(0, b));
...
...
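The vec_unaligned_load macro above is the standard AltiVec recipe for a misaligned 16-byte load: two aligned loads that straddle the address plus a vec_perm driven by vec_lvsl. A self-contained restatement of the same trick (the helper name is chosen here for illustration):

#include <altivec.h>

/* vec_ld ignores the low four address bits, so the two loads fetch the
 * two aligned 16-byte blocks covering p .. p + 15; vec_lvsl builds the
 * permute mask that shifts the wanted bytes down to element 0. */
static inline vector signed char load_unaligned_s8(const signed char *p)
{
    vector signed char lo = vec_ld(0, p);
    vector signed char hi = vec_ld(15, p);

    return vec_perm(lo, hi, vec_lvsl(0, p));
}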
@@ -52,12 +52,12 @@ static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
    size16 = size >> 4;
    while (size16) {
// score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
-        //load pix1 and the first batch of pix2
+        // load pix1 and the first batch of pix2
        vpix1 = vec_unaligned_load(pix1);
        vpix2 = vec_unaligned_load(pix2);
        pix2 += 8;
-        //unpack
+        // unpack
        vpix1h = vec_unpackh(vpix1);
        vdiff  = vec_sub(vpix1h, vpix2);
        vpix1l = vec_unpackl(vpix1);
...
...
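The commented-out line "score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);" is the whole algorithm; the vector version merely computes eight differences per step. A scalar reference sketch of what ssd_int8_vs_int16_altivec computes (my reading of the code, not part of this commit):

#include <stdint.h>

/* Sum of squared differences between an int8 block and an int16 block. */
static int ssd_int8_vs_int16_scalar(const int8_t *pix1, const int16_t *pix2,
                                    int size)
{
    int score = 0, i;

    for (i = 0; i < size; i++) {
        const int d = pix1[i] - pix2[i];
        score += d * d;
    }
    return score;
}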