Linshizhi / ffmpeg.wasm-core

Commit 82ee14d2 authored Jan 15, 2014 by Diego Biurrun

ppc: dsputil: comment formatting and wording/grammar improvements

Parent: cce791b1

Showing 7 changed files with 180 additions and 199 deletions
libavcodec/ppc/dsputil_altivec.c    +114  -119
libavcodec/ppc/dsputil_ppc.c         +29   -31
libavcodec/ppc/fdct_altivec.c         +3    -4
libavcodec/ppc/fft_altivec.c          +6    -6
libavcodec/ppc/gmc_altivec.c         +17   -23
libavcodec/ppc/idct_altivec.c         +4    -9
libavcodec/ppc/int_altivec.c          +7    -7

libavcodec/ppc/dsputil_altivec.c

(diff collapsed, not shown)

libavcodec/ppc/dsputil_ppc.c
@@ -32,24 +32,23 @@
 /* ***** WARNING ***** WARNING ***** WARNING ***** */
-/*
-clear_blocks_dcbz32_ppc will not work properly on PowerPC processors with a
-cache line size not equal to 32 bytes.
-Fortunately all processor used by Apple up to at least the 7450 (aka second
-generation G4) use 32 bytes cache line.
-This is due to the use of the 'dcbz' instruction. It simply clear to zero a
-single cache line, so you need to know the cache line size to use it !
-It's absurd, but it's fast...
-
-update 24/06/2003 : Apple released yesterday the G5, with a PPC970. cache line
-size: 128 bytes. Oups.
-The semantic of dcbz was changed, it always clear 32 bytes. so the function
-below will work, but will be slow. So I fixed check_dcbz_effect to use dcbzl,
-which is defined to clear a cache line (as dcbz before). So we still can
-distinguish, and use dcbz (32 bytes) or dcbzl (one cache line) as required.
-
-see <http://developer.apple.com/technotes/tn/tn2087.html>
-and <http://developer.apple.com/technotes/tn/tn2086.html>
-*/
+/*
+ * clear_blocks_dcbz32_ppc will not work properly on PowerPC processors with
+ * a cache line size not equal to 32 bytes. Fortunately all processors used
+ * by Apple up to at least the 7450 (AKA second generation G4) use 32-byte
+ * cache lines. This is due to the use of the 'dcbz' instruction. It simply
+ * clears a single cache line to zero, so you need to know the cache line
+ * size to use it! It's absurd, but it's fast...
+ *
+ * update 24/06/2003: Apple released the G5 yesterday, with a PPC970.
+ * cache line size: 128 bytes. Oups.
+ * The semantics of dcbz was changed, it always clears 32 bytes. So the function
+ * below will work, but will be slow. So I fixed check_dcbz_effect to use dcbzl,
+ * which is defined to clear a cache line (as dcbz before). So we can still
+ * distinguish, and use dcbz (32 bytes) or dcbzl (one cache line) as required.
+ *
+ * see <http://developer.apple.com/technotes/tn/tn2087.html>
+ * and <http://developer.apple.com/technotes/tn/tn2086.html>
+ */
 static void clear_blocks_dcbz32_ppc(int16_t *blocks)
 {
     register int misal = ((unsigned long) blocks & 0x00000010);
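The comment spells out why the cache line size matters: dcbz zeroes exactly one cache line at whatever address you hand it, so a clear-blocks routine built on it has to step through the buffer in cache-line-sized strides. A minimal, PowerPC-only sketch of that idea (not the committed function body, which is mostly outside this hunk; it assumes a 32-byte cache line and a 32-byte-aligned blocks pointer, whereas the real code additionally handles the 16-byte misalignment that the misal check above detects):

    static void clear_blocks_dcbz32_sketch(int16_t *blocks)
    {
        long i;
        /* 6 * 64 int16_t = 768 bytes, a multiple of 32, so plain strides work. */
        for (i = 0; i < (long) (sizeof(int16_t) * 6 * 64); i += 32)
            __asm__ volatile ("dcbz %0, %1" : : "b" (blocks), "r" (i) : "memory");
    }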
@@ -73,17 +72,17 @@ static void clear_blocks_dcbz32_ppc(int16_t *blocks)
     }
 }
-/*
-same as above, when dcbzl clear a whole 128B cache line
-i.e. the PPC970 aka G5
-*/
+/* Same as above, when dcbzl clears a whole 128 bytes cache line
+ * i.e. the PPC970 AKA G5.
+ */
 #if HAVE_DCBZL
 static void clear_blocks_dcbz128_ppc(int16_t *blocks)
 {
     register int misal = ((unsigned long) blocks & 0x0000007f);
     register int i = 0;
     if (misal) {
-        // we could probably also optimize this case,
-        // but there's not much point as the machines
-        // aren't available yet (2003-06-26)
+        /* We could probably also optimize this case,
+         * but there's not much point as the machines
+         * aren't available yet (2003-06-26). */
         memset(blocks, 0, sizeof(int16_t) * 6 * 64);
     }
     else
@@ -99,11 +98,10 @@ static void clear_blocks_dcbz128_ppc(int16_t *blocks)
 #endif
 #if HAVE_DCBZL
-/* check dcbz report how many bytes are set to 0 by dcbz */
-/* update 24/06/2003 : replace dcbz by dcbzl to get
-   the intended effect (Apple "fixed" dcbz)
-   unfortunately this cannot be used unless the assembler
-   knows about dcbzl ... */
+/* Check dcbz report how many bytes are set to 0 by dcbz. */
+/* update 24/06/2003: Replace dcbz by dcbzl to get the intended effect
+ * (Apple "fixed" dcbz). Unfortunately this cannot be used unless the
+ * assembler knows about dcbzl ... */
 static long check_dcbzl_effect(void)
 {
     register char *fakedata = av_malloc(1024);

@@ -120,8 +118,8 @@ static long check_dcbzl_effect(void)
     memset(fakedata, 0xFF, 1024);
-    /* below the constraint "b" seems to mean "Address base register"
-       in gcc-3.3 / RS/6000 speaks. seems to avoid using r0, so.... */
+    /* Below the constraint "b" seems to mean "address base register"
+     * in gcc-3.3 / RS/6000 speaks. Seems to avoid using r0, so.... */
     __asm__ volatile ("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero));
     for (i = 0; i < 1024; i++) {
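check_dcbzl_effect() is only partially visible in these hunks, but the comments describe the whole trick: fill a scratch buffer with 0xFF, issue a single dcbzl on its middle, then count how many bytes came back as zero; the count is the line size that dcbzl actually clears. A self-contained sketch along those lines (a reconstruction under those assumptions, not necessarily the exact committed body; a memory clobber is added here for safety):

    #include <string.h>
    #include "libavutil/mem.h"

    static long check_dcbzl_effect_sketch(void)
    {
        register char *fakedata = av_malloc(1024);
        register char *fakedata_middle;
        register long zero = 0;
        register long i = 0;
        long count = 0;

        if (!fakedata)
            return 0L;

        fakedata_middle = fakedata + 512;

        memset(fakedata, 0xFF, 1024);

        /* One dcbzl on the middle of the buffer; everything it zeroes
         * belongs to exactly one cache line. */
        __asm__ volatile ("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero) : "memory");

        for (i = 0; i < 1024; i++)
            if (fakedata[i] == (char) 0)
                count++;

        av_free(fakedata);

        return count;
    }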
@@ -144,7 +142,7 @@ av_cold void ff_dsputil_init_ppc(DSPContext *c, AVCodecContext *avctx)
 {
     const int high_bit_depth = avctx->bits_per_raw_sample > 8;
-    // common optimizations whether AltiVec is available or not
+    // Common optimizations whether AltiVec is available or not
     if (!high_bit_depth) {
         switch (check_dcbzl_effect()) {
         case 32:
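The hunk is cut off inside the switch, but the warning comment earlier explains what it has to do: pick the clear-blocks implementation that matches whatever line size check_dcbzl_effect() measured. A plausible completion of that dispatch, for illustration only, assuming the DSPContext clear_blocks hook is the field being set (the remaining cases are not part of this diff):

        switch (check_dcbzl_effect()) {
        case 32:
            c->clear_blocks = clear_blocks_dcbz32_ppc;  /* dcbz clears 32 bytes */
            break;
    #if HAVE_DCBZL
        case 128:
            c->clear_blocks = clear_blocks_dcbz128_ppc; /* dcbzl clears a 128-byte line (G5) */
            break;
    #endif
        default:
            break;
        }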

libavcodec/ppc/fdct_altivec.c

@@ -259,11 +259,10 @@ void ff_fdct_altivec(int16_t *block)
 #undef MERGE_S16
 /* }}} */
-/* Some of the initial calculations can be done as vector short before
- * conversion to vector float. The following code section takes advantage
- * of this.
- */
+/* Some of the initial calculations can be done as vector short
+ * before conversion to vector float. The following code section
+ * takes advantage of this. */
 /* fdct rows {{{ */
     x0 = ((vector float) vec_add(vs16(b00), vs16(b70)));
     x7 = ((vector float) vec_sub(vs16(b00), vs16(b70)));
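The reworded comment describes the two lines that follow: the first butterfly stage of the transform is computed on 16-bit integer lanes (vec_add/vec_sub on the raw int16_t data, where vs16() appears to be a reinterpret cast to vector signed short) and only the result is treated as vector float. In scalar terms that first stage of an 8-point DCT pairs the first and last inputs; the vector code does it for eight independent length-8 transforms at once, one per lane, which is why it can stay in integers this long. A hedged scalar illustration of that stage (not code from the file, and assuming b00/b70 hold the first and last of the eight input vectors, as the naming suggests):

    /* First butterfly stage of one length-8 DCT: sum and difference of the
     * end pair. The AltiVec lines above do this add/sub across 8 lanes. */
    static void dct8_first_stage_sketch(const int16_t a[8], int16_t *sum07, int16_t *diff07)
    {
        *sum07  = a[0] + a[7];
        *diff07 = a[0] - a[7];
    }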

libavcodec/ppc/fft_altivec.c

@@ -27,12 +27,12 @@
 #include "libavcodec/fft.h"
 /**
- * Do a complex FFT with the parameters defined in ff_fft_init(). The
- * input data must be permuted before with s->revtab table. No
- * 1.0/sqrt(n) normalization is done.
- * AltiVec-enabled
- * This code assumes that the 'z' pointer is 16 bytes-aligned
- * It also assumes all FFTComplex are 8 bytes-aligned pair of float
+ * Do a complex FFT with the parameters defined in ff_fft_init().
+ * The input data must be permuted before with s->revtab table.
+ * No 1.0 / sqrt(n) normalization is done.
+ * AltiVec-enabled:
+ * This code assumes that the 'z' pointer is 16 bytes-aligned.
+ * It also assumes all FFTComplex are 8 bytes-aligned pairs of floats.
  */
 void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z);
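The rewritten doc comment is a small contract: callers must run the s->revtab input permutation first, must not expect any 1.0 / sqrt(n) scaling, and must hand ff_fft_calc_altivec() a 16-byte-aligned z buffer. A sketch of a conforming caller, assuming the FFTContext interface declared in libavcodec/fft.h of this vintage (ff_fft_init(), the fft_permute/fft_calc function pointers, ff_fft_end()) and that the buffer comes from av_malloc(), which returns suitably aligned memory:

    #include "libavcodec/fft.h"
    #include "libavutil/mem.h"

    /* Hypothetical caller: one forward 256-point complex FFT through the
     * generic dispatch, which selects ff_fft_calc_altivec() when AltiVec
     * is available. */
    static int run_fft256(FFTComplex *z /* 256 entries, av_malloc()'ed */)
    {
        FFTContext s;
        if (ff_fft_init(&s, 8, 0) < 0)   /* nbits = 8 -> n = 256, forward */
            return -1;
        s.fft_permute(&s, z);            /* apply the s->revtab permutation first */
        s.fft_calc(&s, z);               /* result is unnormalized, per the comment */
        ff_fft_end(&s);
        return 0;
    }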

libavcodec/ppc/gmc_altivec.c

 /*
- * GMC (Global Motion Compensation)
- * AltiVec-enabled
+ * GMC (Global Motion Compensation), AltiVec-enabled
  *
  * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
  *
  * This file is part of Libav.

@@ -25,10 +25,8 @@
 #include "libavutil/ppc/util_altivec.h"
 #include "dsputil_altivec.h"
-/*
-  altivec-enhanced gmc1. ATM this code assume stride is a multiple of 8,
-  to preserve proper dst alignment.
-*/
+/* AltiVec-enhanced gmc1. ATM this code assumes stride is a multiple of 8
+ * to preserve proper dst alignment. */
 void ff_gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int stride, int h, int x16, int y16, int rounder)
 {
     const DECLARE_ALIGNED(16, unsigned short, rounder_a) = rounder;
@@ -56,18 +54,16 @@ void ff_gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int
     rounderV = vec_splat((vec_u16) vec_lde(0, &rounder_a), 0);
-    // we'll be able to pick-up our 9 char elements
-    // at src from those 32 bytes
-    // we load the first batch here, as inside the loop
-    // we can re-use 'src+stride' from one iteration
-    // as the 'src' of the next.
+    /* we'll be able to pick-up our 9 char elements at src from those
+     * 32 bytes we load the first batch here, as inside the loop we can
+     * reuse 'src + stride' from one iteration as the 'src' of the next. */
     src_0 = vec_ld(0, src);
     src_1 = vec_ld(16, src);
     srcvA = vec_perm(src_0, src_1, vec_lvsl(0, src));
     if (src_really_odd != 0x0000000F) {
-        // if src & 0xF == 0xF, then (src+1) is properly aligned
-        // on the second vector.
+        /* If src & 0xF == 0xF, then (src + 1) is properly aligned
+         * on the second vector. */
        srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src));
     } else {
         srcvB = src_1;
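The comments in this hunk describe the classic AltiVec idiom for reading from an unaligned address: vec_ld() can only fetch aligned 16-byte chunks, so the code fetches the two chunks straddling the wanted bytes and uses vec_perm() with a vec_lvsl() permute vector to rotate the wanted 16 bytes into place (int_altivec.c below wraps the same idiom in its vec_unaligned_load() macro). A stand-alone sketch of the idiom, outside this function:

    #include <altivec.h>
    #include <stdint.h>

    /* Illustration of the unaligned-load idiom used above: read 16 bytes
     * starting at an arbitrary, possibly unaligned, pointer p. */
    static vector unsigned char load_unaligned_16(const uint8_t *p)
    {
        vector unsigned char lo   = vec_ld(0, p);   /* aligned chunk containing p      */
        vector unsigned char hi   = vec_ld(15, p);  /* next aligned chunk if p is odd  */
        vector unsigned char perm = vec_lvsl(0, p); /* shift amount derived from p & 0xF */
        return vec_perm(lo, hi, perm);
    }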
@@ -81,17 +77,16 @@ void ff_gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int
     dstv = vec_ld(0, dst);
-    // we we'll be able to pick-up our 9 char elements
-    // at src + stride from those 32 bytes
-    // then reuse the resulting 2 vectors srvcC and srcvD
-    // as the next srcvA and srcvB
+    /* We'll be able to pick-up our 9 char elements at src + stride from
+     * those 32 bytes then reuse the resulting 2 vectors srvcC and srcvD
+     * as the next srcvA and srcvB. */
     src_0 = vec_ld(stride + 0, src);
     src_1 = vec_ld(stride + 16, src);
     srcvC = vec_perm(src_0, src_1, vec_lvsl(stride + 0, src));
     if (src_really_odd != 0x0000000F) {
-        // if src & 0xF == 0xF, then (src+1) is properly aligned
-        // on the second vector.
+        /* If src & 0xF == 0xF, then (src + 1) is properly aligned
+         * on the second vector. */
         srcvD = vec_perm(src_0, src_1, vec_lvsl(stride + 1, src));
     } else {
         srcvD = src_1;
@@ -100,10 +95,9 @@ void ff_gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int
     srcvC = vec_mergeh(vczero, srcvC);
     srcvD = vec_mergeh(vczero, srcvD);
-    // OK, now we (finally) do the math :-)
-    // those four instructions replaces 32 int muls & 32 int adds.
-    // isn't AltiVec nice ?
+    /* OK, now we (finally) do the math :-)
+     * Those four instructions replace 32 int muls & 32 int adds.
+     * Isn't AltiVec nice? */
     tempA = vec_mladd((vector unsigned short) srcvA, Av, rounderV);
     tempB = vec_mladd((vector unsigned short) srcvB, Bv, tempA);
     tempC = vec_mladd((vector unsigned short) srcvC, Cv, tempB);
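The comment's arithmetic is easy to check: each vec_mladd() multiply-accumulates eight 16-bit lanes at once, so the chain of four (rounder, then the A, B, C and D terms for eight output pixels) stands in for 4 x 8 = 32 scalar multiplies and 32 scalar adds. For reference, a scalar sketch of the bilinear gmc1 blend these hunks vectorize, assuming the standard MPEG-4 gmc1 weights A = (16 - dx)(16 - dy), B = dx(16 - dy), C = (16 - dx)dy, D = dx dy and the >> 8 rescale (dx and dy here are hypothetical names for the fractional offsets):

    #include <stdint.h>

    /* Scalar counterpart of the vec_mladd chain for one row of 8 pixels:
     * 4 muls and 4 adds per pixel, i.e. 32 of each per row. */
    static void gmc1_scalar_row_sketch(uint8_t *dst, const uint8_t *src,
                                       int stride, int dx, int dy, int rounder)
    {
        const int A = (16 - dx) * (16 - dy);
        const int B = dx * (16 - dy);
        const int C = (16 - dx) * dy;
        const int D = dx * dy;
        int i;

        for (i = 0; i < 8; i++)
            dst[i] = (A * src[i]          + B * src[i + 1] +
                      C * src[i + stride] + D * src[i + stride + 1] +
                      rounder) >> 8;
    }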

libavcodec/ppc/idct_altivec.c

@@ -18,24 +18,19 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
-/*
- * NOTE: This code is based on GPL code from the libmpeg2 project. The
+/* NOTE: This code is based on GPL code from the libmpeg2 project. The
  * author, Michel Lespinasses, has given explicit permission to release
  * under LGPL as part of Libav.
- */
-
-/*
+ *
  * Libav integration by Dieter Shirley
  *
  * This file is a direct copy of the AltiVec IDCT module from the libmpeg2
  * project. I've deleted all of the libmpeg2-specific code, renamed the
  * functions and reordered the function parameters. The only change to the
  * IDCT function itself was to factor out the partial transposition, and to
- * perform a full transpose at the end of the function.
- */
-#include <stdlib.h>                 /* malloc(), free() */
+ * perform a full transpose at the end of the function. */
+#include <stdlib.h>
 #include <string.h>
 #include "config.h"
 #if HAVE_ALTIVEC_H

libavcodec/ppc/int_altivec.c

@@ -19,9 +19,9 @@
  */
 /**
  * @file
- * integer misc ops.
+ * miscellaneous integer operations
  */
 #include "config.h"
 #if HAVE_ALTIVEC_H

@@ -43,8 +43,8 @@ static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
         int32_t score[4];
     } u;
     u.vscore = vec_splat_s32(0);
-    //XXX lazy way, fix it later
+    // XXX lazy way, fix it later
 #define vec_unaligned_load(b) \
     vec_perm(vec_ld(0,b),vec_ld(15,b),vec_lvsl(0, b));
@@ -52,12 +52,12 @@ static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
     size16 = size >> 4;
     while (size16) {
         // score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
-        //load pix1 and the first batch of pix2
+        // load pix1 and the first batch of pix2
         vpix1 = vec_unaligned_load(pix1);
         vpix2 = vec_unaligned_load(pix2);
         pix2 += 8;
-        //unpack
+        // unpack
         vpix1h = vec_unpackh(vpix1);
         vdiff  = vec_sub(vpix1h, vpix2);
         vpix1l = vec_unpackl(vpix1);
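The commented-out line inside the loop already gives the scalar definition of what this routine accumulates: the sum of squared differences between an int8_t block and an int16_t block, which the AltiVec code computes 8 or 16 elements at a time after unpacking. For reference, a plain-C statement of that definition (an illustration of the semantics, not the fallback actually used by the library):

    #include <stdint.h>

    /* Scalar definition of the sum the AltiVec loop above accumulates. */
    static int32_t ssd_int8_vs_int16_sketch(const int8_t *pix1, const int16_t *pix2,
                                            int size)
    {
        int32_t score = 0;
        int i;

        for (i = 0; i < size; i++) {
            int d = pix1[i] - pix2[i];
            score += d * d;
        }
        return score;
    }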