Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
F
ffmpeg.wasm-core
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Linshizhi
ffmpeg.wasm-core
Commits
022184a6
Commit
022184a6
authored
Jan 15, 2014
by
Diego Biurrun
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
ppc: dsputil: more K&R formatting cosmetics
parent
30f3f959
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
437 additions
and
416 deletions
+437
-416
fdct_altivec.c
libavcodec/ppc/fdct_altivec.c
+191
-193
gmc_altivec.c
libavcodec/ppc/gmc_altivec.c
+42
-41
idct_altivec.c
libavcodec/ppc/idct_altivec.c
+154
-141
int_altivec.c
libavcodec/ppc/int_altivec.c
+50
-41
No files found.
libavcodec/ppc/fdct_altivec.c
View file @
022184a6
...
...
@@ -22,39 +22,37 @@
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif
#include "libavutil/common.h"
#include "dsputil_altivec.h"
#define vs16(v) ((vector signed short)(v))
#define vs32(v) ((vector signed int)(v))
#define vu8(v) ((vector unsigned char)(v))
#define vu16(v) ((vector unsigned short)(v))
#define vu32(v) ((vector unsigned int)(v))
#define C1 0.98078525066375732421875000
/* cos(1*PI/16) */
#define C2 0.92387950420379638671875000
/* cos(2*PI/16) */
#define C3 0.83146959543228149414062500
/* cos(3*PI/16) */
#define C4 0.70710676908493041992187500
/* cos(4*PI/16) */
#define C5 0.55557024478912353515625000
/* cos(5*PI/16) */
#define C6 0.38268342614173889160156250
/* cos(6*PI/16) */
#define C7 0.19509032368659973144531250
/* cos(7*PI/16) */
#define vs16(v) ((vector signed short) (v))
#define vs32(v) ((vector signed int) (v))
#define vu8(v) ((vector unsigned char) (v))
#define vu16(v) ((vector unsigned short) (v))
#define vu32(v) ((vector unsigned int) (v))
#define C1 0.98078525066375732421875000
/* cos(1 * PI / 16) */
#define C2 0.92387950420379638671875000
/* cos(2 * PI / 16) */
#define C3 0.83146959543228149414062500
/* cos(3 * PI / 16) */
#define C4 0.70710676908493041992187500
/* cos(4 * PI / 16) */
#define C5 0.55557024478912353515625000
/* cos(5 * PI / 16) */
#define C6 0.38268342614173889160156250
/* cos(6 * PI / 16) */
#define C7 0.19509032368659973144531250
/* cos(7 * PI / 16) */
#define SQRT_2 1.41421353816986083984375000
/* sqrt(2) */
#define W0 -(2 * C2)
#define W1 (2 * C6)
#define W2 (SQRT_2 * C6)
#define W3 (SQRT_2 * C3)
#define W4 (SQRT_2 * (-C1 + C3 + C5 - C7))
#define W5 (SQRT_2 *
(
C1 + C3 - C5 + C7))
#define W6 (SQRT_2 *
(
C1 + C3 + C5 - C7))
#define W7 (SQRT_2 *
(
C1 + C3 - C5 - C7))
#define W8 (SQRT_2 *
(
C7 - C3))
#define W5 (SQRT_2 *
(
C1 + C3 - C5 + C7))
#define W6 (SQRT_2 *
(
C1 + C3 + C5 - C7))
#define W7 (SQRT_2 *
(
C1 + C3 - C5 - C7))
#define W8 (SQRT_2 *
(
C7 - C3))
#define W9 (SQRT_2 * (-C1 - C3))
#define WA (SQRT_2 * (-C3 - C5))
#define WB (SQRT_2 * ( C5 - C3))
#define WB (SQRT_2 * (C5 - C3))
static
vector
float
fdctconsts
[
3
]
=
{
{
W0
,
W1
,
W2
,
W3
},
...
...
@@ -75,8 +73,7 @@ static vector float fdctconsts[3] = {
#define LD_WA vec_splat(cnsts2, 2)
#define LD_WB vec_splat(cnsts2, 3)
#define FDCTROW(b0,b1,b2,b3,b4,b5,b6,b7)
/* {{{ */
\
#define FDCTROW(b0, b1, b2, b3, b4, b5, b6, b7)
/* {{{ */
\
x0 = vec_add(b0, b7);
/* x0 = b0 + b7; */
\
x7 = vec_sub(b0, b7);
/* x7 = b0 - b7; */
\
x1 = vec_add(b1, b6);
/* x1 = b1 + b6; */
\
...
...
@@ -133,7 +130,7 @@ static vector float fdctconsts[3] = {
b1 = vec_add(b1, x3);
/* b1 = b1 + x3; */
\
/* }}} */
#define FDCTCOL(b0,
b1,b2,b3,b4,b5,b6,b7)
/* {{{ */
\
#define FDCTCOL(b0,
b1, b2, b3, b4, b5, b6, b7)
/* {{{ */
\
x0 = vec_add(b0, b7);
/* x0 = b0 + b7; */
\
x7 = vec_sub(b0, b7);
/* x7 = b0 - b7; */
\
x1 = vec_add(b1, b6);
/* x1 = b1 + b6; */
\
...
...
@@ -190,10 +187,7 @@ static vector float fdctconsts[3] = {
b1 = vec_add(b1, x3);
/* b1 += x3; */
\
/* }}} */
/* two dimensional discrete cosine transform */
void
ff_fdct_altivec
(
int16_t
*
block
)
{
vector
signed
short
*
bp
;
...
...
@@ -205,56 +199,57 @@ void ff_fdct_altivec(int16_t *block)
/* setup constants {{{ */
/* mzero = -0.0 */
mzero
=
((
vector
float
)
vec_splat_u32
(
-
1
));
mzero
=
((
vector
float
)
vec_sl
(
vu32
(
mzero
),
vu32
(
mzero
)));
mzero
=
((
vector
float
)
vec_splat_u32
(
-
1
));
mzero
=
((
vector
float
)
vec_sl
(
vu32
(
mzero
),
vu32
(
mzero
)));
cp
=
fdctconsts
;
cnsts0
=
vec_ld
(
0
,
cp
);
cp
++
;
cnsts1
=
vec_ld
(
0
,
cp
);
cp
++
;
cnsts0
=
vec_ld
(
0
,
cp
);
cp
++
;
cnsts1
=
vec_ld
(
0
,
cp
);
cp
++
;
cnsts2
=
vec_ld
(
0
,
cp
);
/* }}} */
/* 8x8 matrix transpose (vector short[8]) {{{ */
#define MERGE_S16(hl,
a,b) vec_merge##
hl(vs16(a), vs16(b))
#define MERGE_S16(hl,
a, b) vec_merge ##
hl(vs16(a), vs16(b))
bp
=
(
vector
signed
short
*
)
block
;
b00
=
((
vector
float
)
vec_ld
(
0
,
bp
));
b40
=
((
vector
float
)
vec_ld
(
16
*
4
,
bp
));
b01
=
((
vector
float
)
MERGE_S16
(
h
,
b00
,
b40
));
b11
=
((
vector
float
)
MERGE_S16
(
l
,
b00
,
b40
));
bp
=
(
vector
signed
short
*
)
block
;
b00
=
((
vector
float
)
vec_ld
(
0
,
bp
));
b40
=
((
vector
float
)
vec_ld
(
16
*
4
,
bp
));
b01
=
((
vector
float
)
MERGE_S16
(
h
,
b00
,
b40
));
b11
=
((
vector
float
)
MERGE_S16
(
l
,
b00
,
b40
));
bp
++
;
b10
=
((
vector
float
)
vec_ld
(
0
,
bp
));
b50
=
((
vector
float
)
vec_ld
(
16
*
4
,
bp
));
b21
=
((
vector
float
)
MERGE_S16
(
h
,
b10
,
b50
));
b31
=
((
vector
float
)
MERGE_S16
(
l
,
b10
,
b50
));
b10
=
((
vector
float
)
vec_ld
(
0
,
bp
));
b50
=
((
vector
float
)
vec_ld
(
16
*
4
,
bp
));
b21
=
((
vector
float
)
MERGE_S16
(
h
,
b10
,
b50
));
b31
=
((
vector
float
)
MERGE_S16
(
l
,
b10
,
b50
));
bp
++
;
b20
=
((
vector
float
)
vec_ld
(
0
,
bp
));
b60
=
((
vector
float
)
vec_ld
(
16
*
4
,
bp
));
b41
=
((
vector
float
)
MERGE_S16
(
h
,
b20
,
b60
));
b51
=
((
vector
float
)
MERGE_S16
(
l
,
b20
,
b60
));
b20
=
((
vector
float
)
vec_ld
(
0
,
bp
));
b60
=
((
vector
float
)
vec_ld
(
16
*
4
,
bp
));
b41
=
((
vector
float
)
MERGE_S16
(
h
,
b20
,
b60
));
b51
=
((
vector
float
)
MERGE_S16
(
l
,
b20
,
b60
));
bp
++
;
b30
=
((
vector
float
)
vec_ld
(
0
,
bp
));
b70
=
((
vector
float
)
vec_ld
(
16
*
4
,
bp
));
b61
=
((
vector
float
)
MERGE_S16
(
h
,
b30
,
b70
));
b71
=
((
vector
float
)
MERGE_S16
(
l
,
b30
,
b70
));
x0
=
((
vector
float
)
MERGE_S16
(
h
,
b01
,
b41
));
x1
=
((
vector
float
)
MERGE_S16
(
l
,
b01
,
b41
));
x2
=
((
vector
float
)
MERGE_S16
(
h
,
b11
,
b51
));
x3
=
((
vector
float
)
MERGE_S16
(
l
,
b11
,
b51
));
x4
=
((
vector
float
)
MERGE_S16
(
h
,
b21
,
b61
));
x5
=
((
vector
float
)
MERGE_S16
(
l
,
b21
,
b61
));
x6
=
((
vector
float
)
MERGE_S16
(
h
,
b31
,
b71
));
x7
=
((
vector
float
)
MERGE_S16
(
l
,
b31
,
b71
));
b00
=
((
vector
float
)
MERGE_S16
(
h
,
x0
,
x4
));
b10
=
((
vector
float
)
MERGE_S16
(
l
,
x0
,
x4
));
b20
=
((
vector
float
)
MERGE_S16
(
h
,
x1
,
x5
));
b30
=
((
vector
float
)
MERGE_S16
(
l
,
x1
,
x5
));
b40
=
((
vector
float
)
MERGE_S16
(
h
,
x2
,
x6
));
b50
=
((
vector
float
)
MERGE_S16
(
l
,
x2
,
x6
));
b60
=
((
vector
float
)
MERGE_S16
(
h
,
x3
,
x7
));
b70
=
((
vector
float
)
MERGE_S16
(
l
,
x3
,
x7
));
b30
=
((
vector
float
)
vec_ld
(
0
,
bp
));
b70
=
((
vector
float
)
vec_ld
(
16
*
4
,
bp
));
b61
=
((
vector
float
)
MERGE_S16
(
h
,
b30
,
b70
));
b71
=
((
vector
float
)
MERGE_S16
(
l
,
b30
,
b70
));
x0
=
((
vector
float
)
MERGE_S16
(
h
,
b01
,
b41
));
x1
=
((
vector
float
)
MERGE_S16
(
l
,
b01
,
b41
));
x2
=
((
vector
float
)
MERGE_S16
(
h
,
b11
,
b51
));
x3
=
((
vector
float
)
MERGE_S16
(
l
,
b11
,
b51
));
x4
=
((
vector
float
)
MERGE_S16
(
h
,
b21
,
b61
));
x5
=
((
vector
float
)
MERGE_S16
(
l
,
b21
,
b61
));
x6
=
((
vector
float
)
MERGE_S16
(
h
,
b31
,
b71
));
x7
=
((
vector
float
)
MERGE_S16
(
l
,
b31
,
b71
));
b00
=
((
vector
float
)
MERGE_S16
(
h
,
x0
,
x4
));
b10
=
((
vector
float
)
MERGE_S16
(
l
,
x0
,
x4
));
b20
=
((
vector
float
)
MERGE_S16
(
h
,
x1
,
x5
));
b30
=
((
vector
float
)
MERGE_S16
(
l
,
x1
,
x5
));
b40
=
((
vector
float
)
MERGE_S16
(
h
,
x2
,
x6
));
b50
=
((
vector
float
)
MERGE_S16
(
l
,
x2
,
x6
));
b60
=
((
vector
float
)
MERGE_S16
(
h
,
x3
,
x7
));
b70
=
((
vector
float
)
MERGE_S16
(
l
,
x3
,
x7
));
#undef MERGE_S16
/* }}} */
...
...
@@ -264,32 +259,32 @@ void ff_fdct_altivec(int16_t *block)
* takes advantage of this. */
/* fdct rows {{{ */
x0
=
((
vector
float
)
vec_add
(
vs16
(
b00
),
vs16
(
b70
)));
x7
=
((
vector
float
)
vec_sub
(
vs16
(
b00
),
vs16
(
b70
)));
x1
=
((
vector
float
)
vec_add
(
vs16
(
b10
),
vs16
(
b60
)));
x6
=
((
vector
float
)
vec_sub
(
vs16
(
b10
),
vs16
(
b60
)));
x2
=
((
vector
float
)
vec_add
(
vs16
(
b20
),
vs16
(
b50
)));
x5
=
((
vector
float
)
vec_sub
(
vs16
(
b20
),
vs16
(
b50
)));
x3
=
((
vector
float
)
vec_add
(
vs16
(
b30
),
vs16
(
b40
)));
x4
=
((
vector
float
)
vec_sub
(
vs16
(
b30
),
vs16
(
b40
)));
x0
=
((
vector
float
)
vec_add
(
vs16
(
b00
),
vs16
(
b70
)));
x7
=
((
vector
float
)
vec_sub
(
vs16
(
b00
),
vs16
(
b70
)));
x1
=
((
vector
float
)
vec_add
(
vs16
(
b10
),
vs16
(
b60
)));
x6
=
((
vector
float
)
vec_sub
(
vs16
(
b10
),
vs16
(
b60
)));
x2
=
((
vector
float
)
vec_add
(
vs16
(
b20
),
vs16
(
b50
)));
x5
=
((
vector
float
)
vec_sub
(
vs16
(
b20
),
vs16
(
b50
)));
x3
=
((
vector
float
)
vec_add
(
vs16
(
b30
),
vs16
(
b40
)));
x4
=
((
vector
float
)
vec_sub
(
vs16
(
b30
),
vs16
(
b40
)));
b70
=
((
vector
float
)
vec_add
(
vs16
(
x0
),
vs16
(
x3
)));
b10
=
((
vector
float
)
vec_add
(
vs16
(
x1
),
vs16
(
x2
)));
b70
=
((
vector
float
)
vec_add
(
vs16
(
x0
),
vs16
(
x3
)));
b10
=
((
vector
float
)
vec_add
(
vs16
(
x1
),
vs16
(
x2
)));
b00
=
((
vector
float
)
vec_add
(
vs16
(
b70
),
vs16
(
b10
)));
b40
=
((
vector
float
)
vec_sub
(
vs16
(
b70
),
vs16
(
b10
)));
b00
=
((
vector
float
)
vec_add
(
vs16
(
b70
),
vs16
(
b10
)));
b40
=
((
vector
float
)
vec_sub
(
vs16
(
b70
),
vs16
(
b10
)));
#define CTF0(n) \
b
##n##1 = ((vector float)vec_unpackl(vs16(b##n##
0))); \
b
##n##0 = ((vector float)vec_unpackh(vs16(b##n##
0))); \
b
##n##1 = vec_ctf(vs32(b##n##1), 0);
\
b
##n##0 = vec_ctf(vs32(b##n##
0), 0);
b
## n ## 1 = ((vector float) vec_unpackl(vs16(b ## n ##
0))); \
b
## n ## 0 = ((vector float) vec_unpackh(vs16(b ## n ##
0))); \
b
## n ## 1 = vec_ctf(vs32(b ## n ## 1), 0);
\
b
## n ## 0 = vec_ctf(vs32(b ## n ##
0), 0);
CTF0
(
0
);
CTF0
(
4
);
b20
=
((
vector
float
)
vec_sub
(
vs16
(
x0
),
vs16
(
x3
)));
b60
=
((
vector
float
)
vec_sub
(
vs16
(
x1
),
vs16
(
x2
)));
b20
=
((
vector
float
)
vec_sub
(
vs16
(
x0
),
vs16
(
x3
)));
b60
=
((
vector
float
)
vec_sub
(
vs16
(
x1
),
vs16
(
x2
)));
CTF0
(
2
);
CTF0
(
6
);
...
...
@@ -309,11 +304,11 @@ void ff_fdct_altivec(int16_t *block)
b60
=
vec_madd
(
cnst
,
b60
,
x0
);
b61
=
vec_madd
(
cnst
,
b61
,
x1
);
#define CTFX(x,
b)
\
b
##0 = ((vector float)
vec_unpackh(vs16(x))); \
b
##1 = ((vector float)
vec_unpackl(vs16(x))); \
b
##0 = vec_ctf(vs32(b##0), 0);
\
b
##1 = vec_ctf(vs32(b##1), 0);
\
#define CTFX(x,
b)
\
b
## 0 = ((vector float)
vec_unpackh(vs16(x))); \
b
## 1 = ((vector float)
vec_unpackl(vs16(x))); \
b
## 0 = vec_ctf(vs32(b ## 0), 0);
\
b
## 1 = vec_ctf(vs32(b ## 1), 0);
\
CTFX
(
x4
,
b7
);
CTFX
(
x5
,
b5
);
...
...
@@ -322,7 +317,6 @@ void ff_fdct_altivec(int16_t *block)
#undef CTFX
x0
=
vec_add
(
b70
,
b10
);
x1
=
vec_add
(
b50
,
b30
);
x2
=
vec_add
(
b70
,
b30
);
...
...
@@ -354,7 +348,6 @@ void ff_fdct_altivec(int16_t *block)
b30
=
vec_add
(
b30
,
x2
);
b10
=
vec_add
(
b10
,
x3
);
x0
=
vec_add
(
b71
,
b11
);
x1
=
vec_add
(
b51
,
b31
);
x2
=
vec_add
(
b71
,
b31
);
...
...
@@ -387,7 +380,6 @@ void ff_fdct_altivec(int16_t *block)
b11
=
vec_add
(
b11
,
x3
);
/* }}} */
/* 8x8 matrix transpose (vector float[8][2]) {{{ */
x0
=
vec_mergel
(
b00
,
b20
);
x1
=
vec_mergeh
(
b00
,
b20
);
...
...
@@ -430,28 +422,34 @@ void ff_fdct_altivec(int16_t *block)
b31
=
vec_mergel
(
x4
,
x6
);
/* }}} */
FDCTCOL
(
b00
,
b10
,
b20
,
b30
,
b40
,
b50
,
b60
,
b70
);
FDCTCOL
(
b01
,
b11
,
b21
,
b31
,
b41
,
b51
,
b61
,
b71
);
/* round, convert back to short {{{ */
#define CTS(n) \
b##n##0 = vec_round(b##n##0); \
b##n##1 = vec_round(b##n##1); \
b##n##0 = ((vector float)vec_cts(b##n##0, 0)); \
b##n##1 = ((vector float)vec_cts(b##n##1, 0)); \
b##n##0 = ((vector float)vec_pack(vs32(b##n##0), vs32(b##n##1))); \
vec_st(vs16(b##n##0), 0, bp);
bp
=
(
vector
signed
short
*
)
block
;
CTS
(
0
);
bp
++
;
CTS
(
1
);
bp
++
;
CTS
(
2
);
bp
++
;
CTS
(
3
);
bp
++
;
CTS
(
4
);
bp
++
;
CTS
(
5
);
bp
++
;
CTS
(
6
);
bp
++
;
b ## n ## 0 = vec_round(b ## n ## 0); \
b ## n ## 1 = vec_round(b ## n ## 1); \
b ## n ## 0 = ((vector float) vec_cts(b ## n ## 0, 0)); \
b ## n ## 1 = ((vector float) vec_cts(b ## n ## 1, 0)); \
b ## n ## 0 = ((vector float) vec_pack(vs32(b ## n ## 0), \
vs32(b ## n ## 1))); \
vec_st(vs16(b ## n ## 0), 0, bp);
bp
=
(
vector
signed
short
*
)
block
;
CTS
(
0
);
bp
++
;
CTS
(
1
);
bp
++
;
CTS
(
2
);
bp
++
;
CTS
(
3
);
bp
++
;
CTS
(
4
);
bp
++
;
CTS
(
5
);
bp
++
;
CTS
(
6
);
bp
++
;
CTS
(
7
);
#undef CTS
...
...
libavcodec/ppc/gmc_altivec.c
View file @
022184a6
...
...
@@ -27,32 +27,36 @@
/* AltiVec-enhanced gmc1. ATM this code assumes stride is a multiple of 8
* to preserve proper dst alignment. */
void
ff_gmc1_altivec
(
uint8_t
*
dst
/* align 8 */
,
uint8_t
*
src
/* align1 */
,
int
stride
,
int
h
,
int
x16
,
int
y16
,
int
rounder
)
void
ff_gmc1_altivec
(
uint8_t
*
dst
/* align 8 */
,
uint8_t
*
src
/* align1 */
,
int
stride
,
int
h
,
int
x16
,
int
y16
,
int
rounder
)
{
const
DECLARE_ALIGNED
(
16
,
unsigned
short
,
rounder_a
)
=
rounder
;
const
DECLARE_ALIGNED
(
16
,
unsigned
short
,
ABCD
)[
8
]
=
{
(
16
-
x16
)
*
(
16
-
y16
),
/* A */
(
x16
)
*
(
16
-
y16
),
/* B */
(
16
-
x16
)
*
(
y16
),
/* C */
(
x16
)
*
(
y16
),
/* D */
const
DECLARE_ALIGNED
(
16
,
unsigned
short
,
ABCD
)[
8
]
=
{
(
16
-
x16
)
*
(
16
-
y16
),
/* A */
(
x16
)
*
(
16
-
y16
),
/* B */
(
16
-
x16
)
*
(
y16
),
/* C */
(
x16
)
*
(
y16
),
/* D */
0
,
0
,
0
,
0
/* padding */
};
register
const
vector
unsigned
char
vczero
=
(
const
vector
unsigned
char
)
vec_splat_u8
(
0
);
register
const
vector
unsigned
short
vcsr8
=
(
const
vector
unsigned
short
)
vec_splat_u16
(
8
);
register
vector
unsigned
char
dstv
,
dstv2
,
src_0
,
src_1
,
srcvA
,
srcvB
,
srcvC
,
srcvD
;
register
vector
unsigned
short
Av
,
Bv
,
Cv
,
Dv
,
rounderV
,
tempA
,
tempB
,
tempC
,
tempD
;
register
const
vector
unsigned
char
vczero
=
(
const
vector
unsigned
char
)
vec_splat_u8
(
0
);
register
const
vector
unsigned
short
vcsr8
=
(
const
vector
unsigned
short
)
vec_splat_u16
(
8
);
register
vector
unsigned
char
dstv
,
dstv2
,
src_0
,
src_1
,
srcvA
,
srcvB
,
srcvC
,
srcvD
;
register
vector
unsigned
short
Av
,
Bv
,
Cv
,
Dv
,
rounderV
,
tempA
,
tempB
,
tempC
,
tempD
;
int
i
;
unsigned
long
dst_odd
=
(
unsigned
long
)
dst
&
0x0000000F
;
unsigned
long
src_really_odd
=
(
unsigned
long
)
src
&
0x0000000F
;
unsigned
long
dst_odd
=
(
unsigned
long
)
dst
&
0x0000000F
;
unsigned
long
src_really_odd
=
(
unsigned
long
)
src
&
0x0000000F
;
tempA
=
vec_ld
(
0
,
(
const
unsigned
short
*
)
ABCD
);
tempA
=
vec_ld
(
0
,
(
const
unsigned
short
*
)
ABCD
);
Av
=
vec_splat
(
tempA
,
0
);
Bv
=
vec_splat
(
tempA
,
1
);
Cv
=
vec_splat
(
tempA
,
2
);
Dv
=
vec_splat
(
tempA
,
3
);
rounderV
=
vec_splat
((
vec_u16
)
vec_lde
(
0
,
&
rounder_a
),
0
);
rounderV
=
vec_splat
((
vec_u16
)
vec_lde
(
0
,
&
rounder_a
),
0
);
/* we'll be able to pick-up our 9 char elements at src from those
* 32 bytes we load the first batch here, as inside the loop we can
...
...
@@ -61,19 +65,18 @@ void ff_gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int
src_1
=
vec_ld
(
16
,
src
);
srcvA
=
vec_perm
(
src_0
,
src_1
,
vec_lvsl
(
0
,
src
));
if
(
src_really_odd
!=
0x0000000F
)
{
if
(
src_really_odd
!=
0x0000000F
)
/* If src & 0xF == 0xF, then (src + 1) is properly aligned
* on the second vector. */
srcvB
=
vec_perm
(
src_0
,
src_1
,
vec_lvsl
(
1
,
src
));
}
else
{
else
srcvB
=
src_1
;
}
srcvA
=
vec_mergeh
(
vczero
,
srcvA
);
srcvB
=
vec_mergeh
(
vczero
,
srcvB
);
for
(
i
=
0
;
i
<
h
;
i
++
)
{
dst_odd
=
(
unsigned
long
)
dst
&
0x0000000F
;
src_really_odd
=
(((
unsigned
long
)
src
)
+
stride
)
&
0x0000000F
;
for
(
i
=
0
;
i
<
h
;
i
++
)
{
dst_odd
=
(
unsigned
long
)
dst
&
0x0000000F
;
src_really_odd
=
(((
unsigned
long
)
src
)
+
stride
)
&
0x0000000F
;
dstv
=
vec_ld
(
0
,
dst
);
...
...
@@ -84,13 +87,12 @@ void ff_gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int
src_1
=
vec_ld
(
stride
+
16
,
src
);
srcvC
=
vec_perm
(
src_0
,
src_1
,
vec_lvsl
(
stride
+
0
,
src
));
if
(
src_really_odd
!=
0x0000000F
)
{
if
(
src_really_odd
!=
0x0000000F
)
/* If src & 0xF == 0xF, then (src + 1) is properly aligned
* on the second vector. */
srcvD
=
vec_perm
(
src_0
,
src_1
,
vec_lvsl
(
stride
+
1
,
src
));
}
else
{
else
srcvD
=
src_1
;
}
srcvC
=
vec_mergeh
(
vczero
,
srcvC
);
srcvD
=
vec_mergeh
(
vczero
,
srcvD
);
...
...
@@ -98,23 +100,22 @@ void ff_gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int
/* OK, now we (finally) do the math :-)
* Those four instructions replace 32 int muls & 32 int adds.
* Isn't AltiVec nice? */
tempA
=
vec_mladd
((
vector
unsigned
short
)
srcvA
,
Av
,
rounderV
);
tempB
=
vec_mladd
((
vector
unsigned
short
)
srcvB
,
Bv
,
tempA
);
tempC
=
vec_mladd
((
vector
unsigned
short
)
srcvC
,
Cv
,
tempB
);
tempD
=
vec_mladd
((
vector
unsigned
short
)
srcvD
,
Dv
,
tempC
);
tempA
=
vec_mladd
((
vector
unsigned
short
)
srcvA
,
Av
,
rounderV
);
tempB
=
vec_mladd
((
vector
unsigned
short
)
srcvB
,
Bv
,
tempA
);
tempC
=
vec_mladd
((
vector
unsigned
short
)
srcvC
,
Cv
,
tempB
);
tempD
=
vec_mladd
((
vector
unsigned
short
)
srcvD
,
Dv
,
tempC
);
srcvA
=
srcvC
;
srcvB
=
srcvD
;
tempD
=
vec_sr
(
tempD
,
vcsr8
);
dstv2
=
vec_pack
(
tempD
,
(
vector
unsigned
short
)
vczero
);
dstv2
=
vec_pack
(
tempD
,
(
vector
unsigned
short
)
vczero
);
if
(
dst_odd
)
{
dstv2
=
vec_perm
(
dstv
,
dstv2
,
vcprm
(
0
,
1
,
s0
,
s1
));
}
else
{
dstv2
=
vec_perm
(
dstv
,
dstv2
,
vcprm
(
s0
,
s1
,
2
,
3
));
}
if
(
dst_odd
)
dstv2
=
vec_perm
(
dstv
,
dstv2
,
vcprm
(
0
,
1
,
s0
,
s1
));
else
dstv2
=
vec_perm
(
dstv
,
dstv2
,
vcprm
(
s0
,
s1
,
2
,
3
));
vec_st
(
dstv2
,
0
,
dst
);
...
...
libavcodec/ppc/idct_altivec.c
View file @
022184a6
...
...
@@ -36,44 +36,44 @@
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif
#include "libavutil/ppc/types_altivec.h"
#include "dsputil_altivec.h"
#define IDCT_HALF \
/* 1st stage */
\
t1 = vec_mradds
(a1, vx7, vx1 );
\
t8 = vec_mradds
(a1, vx1, vec_subs (zero, vx7));
\
t7 = vec_mradds
(a2, vx5, vx3);
\
t3 = vec_mradds
(ma2, vx3, vx5);
\
t1 = vec_mradds
(a1, vx7, vx1);
\
t8 = vec_mradds
(a1, vx1, vec_subs(zero, vx7));
\
t7 = vec_mradds
(a2, vx5, vx3);
\
t3 = vec_mradds
(ma2, vx3, vx5);
\
\
/* 2nd stage */
\
t5 = vec_adds
(vx0, vx4);
\
t0 = vec_subs
(vx0, vx4);
\
t2 = vec_mradds
(a0, vx6, vx2);
\
t4 = vec_mradds
(a0, vx2, vec_subs (zero, vx6));
\
t6 = vec_adds
(t8, t3);
\
t3 = vec_subs
(t8, t3);
\
t8 = vec_subs
(t1, t7);
\
t1 = vec_adds
(t1, t7);
\
t5 = vec_adds
(vx0, vx4);
\
t0 = vec_subs
(vx0, vx4);
\
t2 = vec_mradds
(a0, vx6, vx2);
\
t4 = vec_mradds
(a0, vx2, vec_subs(zero, vx6));
\
t6 = vec_adds
(t8, t3);
\
t3 = vec_subs
(t8, t3);
\
t8 = vec_subs
(t1, t7);
\
t1 = vec_adds
(t1, t7);
\
\
/* 3rd stage */
\
t7 = vec_adds
(t5, t2);
\
t2 = vec_subs
(t5, t2);
\
t5 = vec_adds
(t0, t4);
\
t0 = vec_subs
(t0, t4);
\
t4 = vec_subs
(t8, t3);
\
t3 = vec_adds
(t8, t3);
\
t7 = vec_adds
(t5, t2);
\
t2 = vec_subs
(t5, t2);
\
t5 = vec_adds
(t0, t4);
\
t0 = vec_subs
(t0, t4);
\
t4 = vec_subs
(t8, t3);
\
t3 = vec_adds
(t8, t3);
\
\
/* 4th stage */
\
vy0 = vec_adds (t7, t1); \
vy7 = vec_subs (t7, t1); \
vy1 = vec_mradds (c4, t3, t5); \
vy6 = vec_mradds (mc4, t3, t5); \
vy2 = vec_mradds (c4, t4, t0); \
vy5 = vec_mradds (mc4, t4, t0); \
vy3 = vec_adds (t2, t6); \
vy4 = vec_subs (t2, t6);
vy0 = vec_adds(t7, t1); \
vy7 = vec_subs(t7, t1); \
vy1 = vec_mradds(c4, t3, t5); \
vy6 = vec_mradds(mc4, t3, t5); \
vy2 = vec_mradds(c4, t4, t0); \
vy5 = vec_mradds(mc4, t4, t0); \
vy3 = vec_adds(t2, t6); \
vy4 = vec_subs(t2, t6);
#define IDCT \
vec_s16 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; \
...
...
@@ -82,101 +82,107 @@
vec_s16 t0, t1, t2, t3, t4, t5, t6, t7, t8; \
vec_u16 shift; \
\
c4
= vec_splat (constants[0], 0);
\
a0
= vec_splat (constants[0], 1);
\
a1
= vec_splat (constants[0], 2);
\
a2
= vec_splat (constants[0], 3);
\
mc4
= vec_splat (constants[0], 4);
\
ma2
= vec_splat (constants[0], 5);
\
bias = (vec_s16)
vec_splat ((vec_s32)constants[0], 3);
\
c4
= vec_splat(constants[0], 0);
\
a0
= vec_splat(constants[0], 1);
\
a1
= vec_splat(constants[0], 2);
\
a2
= vec_splat(constants[0], 3);
\
mc4
= vec_splat(constants[0], 4);
\
ma2
= vec_splat(constants[0], 5);
\
bias = (vec_s16)
vec_splat((vec_s32) constants[0], 3);
\
\
zero
= vec_splat_s16 (0);
\
shift = vec_splat_u16
(4);
\
zero
= vec_splat_s16(0);
\
shift = vec_splat_u16
(4);
\
\
vx0 = vec_mradds
(vec_sl (block[0], shift), constants[1], zero);
\
vx1 = vec_mradds
(vec_sl (block[1], shift), constants[2], zero);
\
vx2 = vec_mradds
(vec_sl (block[2], shift), constants[3], zero);
\
vx3 = vec_mradds
(vec_sl (block[3], shift), constants[4], zero);
\
vx4 = vec_mradds
(vec_sl (block[4], shift), constants[1], zero);
\
vx5 = vec_mradds
(vec_sl (block[5], shift), constants[4], zero);
\
vx6 = vec_mradds
(vec_sl (block[6], shift), constants[3], zero);
\
vx7 = vec_mradds
(vec_sl (block[7], shift), constants[2], zero);
\
vx0 = vec_mradds
(vec_sl(block[0], shift), constants[1], zero);
\
vx1 = vec_mradds
(vec_sl(block[1], shift), constants[2], zero);
\
vx2 = vec_mradds
(vec_sl(block[2], shift), constants[3], zero);
\
vx3 = vec_mradds
(vec_sl(block[3], shift), constants[4], zero);
\
vx4 = vec_mradds
(vec_sl(block[4], shift), constants[1], zero);
\
vx5 = vec_mradds
(vec_sl(block[5], shift), constants[4], zero);
\
vx6 = vec_mradds
(vec_sl(block[6], shift), constants[3], zero);
\
vx7 = vec_mradds
(vec_sl(block[7], shift), constants[2], zero);
\
\
IDCT_HALF \
\
vx0 = vec_mergeh
(vy0, vy4);
\
vx1 = vec_mergel
(vy0, vy4);
\
vx2 = vec_mergeh
(vy1, vy5);
\
vx3 = vec_mergel
(vy1, vy5);
\
vx4 = vec_mergeh
(vy2, vy6);
\
vx5 = vec_mergel
(vy2, vy6);
\
vx6 = vec_mergeh
(vy3, vy7);
\
vx7 = vec_mergel
(vy3, vy7);
\
vx0 = vec_mergeh
(vy0, vy4);
\
vx1 = vec_mergel
(vy0, vy4);
\
vx2 = vec_mergeh
(vy1, vy5);
\
vx3 = vec_mergel
(vy1, vy5);
\
vx4 = vec_mergeh
(vy2, vy6);
\
vx5 = vec_mergel
(vy2, vy6);
\
vx6 = vec_mergeh
(vy3, vy7);
\
vx7 = vec_mergel
(vy3, vy7);
\
\
vy0 = vec_mergeh
(vx0, vx4);
\
vy1 = vec_mergel
(vx0, vx4);
\
vy2 = vec_mergeh
(vx1, vx5);
\
vy3 = vec_mergel
(vx1, vx5);
\
vy4 = vec_mergeh
(vx2, vx6);
\
vy5 = vec_mergel
(vx2, vx6);
\
vy6 = vec_mergeh
(vx3, vx7);
\
vy7 = vec_mergel
(vx3, vx7);
\
vy0 = vec_mergeh
(vx0, vx4);
\
vy1 = vec_mergel
(vx0, vx4);
\
vy2 = vec_mergeh
(vx1, vx5);
\
vy3 = vec_mergel
(vx1, vx5);
\
vy4 = vec_mergeh
(vx2, vx6);
\
vy5 = vec_mergel
(vx2, vx6);
\
vy6 = vec_mergeh
(vx3, vx7);
\
vy7 = vec_mergel
(vx3, vx7);
\
\
vx0 = vec_adds
(vec_mergeh (vy0, vy4), bias);
\
vx1 = vec_mergel
(vy0, vy4);
\
vx2 = vec_mergeh
(vy1, vy5);
\
vx3 = vec_mergel
(vy1, vy5);
\
vx4 = vec_mergeh
(vy2, vy6);
\
vx5 = vec_mergel
(vy2, vy6);
\
vx6 = vec_mergeh
(vy3, vy7);
\
vx7 = vec_mergel
(vy3, vy7);
\
vx0 = vec_adds
(vec_mergeh(vy0, vy4), bias);
\
vx1 = vec_mergel
(vy0, vy4);
\
vx2 = vec_mergeh
(vy1, vy5);
\
vx3 = vec_mergel
(vy1, vy5);
\
vx4 = vec_mergeh
(vy2, vy6);
\
vx5 = vec_mergel
(vy2, vy6);
\
vx6 = vec_mergeh
(vy3, vy7);
\
vx7 = vec_mergel
(vy3, vy7);
\
\
IDCT_HALF \
\
shift = vec_splat_u16 (6); \
vx0 = vec_sra (vy0, shift); \
vx1 = vec_sra (vy1, shift); \
vx2 = vec_sra (vy2, shift); \
vx3 = vec_sra (vy3, shift); \
vx4 = vec_sra (vy4, shift); \
vx5 = vec_sra (vy5, shift); \
vx6 = vec_sra (vy6, shift); \
vx7 = vec_sra (vy7, shift);
shift = vec_splat_u16(6); \
vx0 = vec_sra(vy0, shift); \
vx1 = vec_sra(vy1, shift); \
vx2 = vec_sra(vy2, shift); \
vx3 = vec_sra(vy3, shift); \
vx4 = vec_sra(vy4, shift); \
vx5 = vec_sra(vy5, shift); \
vx6 = vec_sra(vy6, shift); \
vx7 = vec_sra(vy7, shift);
static
const
vec_s16
constants
[
5
]
=
{
{
23170
,
13573
,
6518
,
21895
,
-
23170
,
-
21895
,
32
,
31
},
{
16384
,
22725
,
21407
,
19266
,
16384
,
19266
,
21407
,
22725
},
{
22725
,
31521
,
29692
,
26722
,
22725
,
26722
,
29692
,
31521
},
{
21407
,
29692
,
27969
,
25172
,
21407
,
25172
,
27969
,
29692
},
{
19266
,
26722
,
25172
,
22654
,
19266
,
22654
,
25172
,
26722
}
{
23170
,
13573
,
6518
,
21895
,
-
23170
,
-
21895
,
32
,
31
},
{
16384
,
22725
,
21407
,
19266
,
16384
,
19266
,
21407
,
22725
},
{
22725
,
31521
,
29692
,
26722
,
22725
,
26722
,
29692
,
31521
},
{
21407
,
29692
,
27969
,
25172
,
21407
,
25172
,
27969
,
29692
},
{
19266
,
26722
,
25172
,
22654
,
19266
,
22654
,
25172
,
26722
}
};
void
ff_idct_put_altivec
(
uint8_t
*
dest
,
int
stride
,
int16_t
*
blk
)
void
ff_idct_put_altivec
(
uint8_t
*
dest
,
int
stride
,
int16_t
*
blk
)
{
vec_s16
*
block
=
(
vec_s16
*
)
blk
;
vec_s16
*
block
=
(
vec_s16
*
)
blk
;
vec_u8
tmp
;
IDCT
#define COPY(dest,src) \
tmp = vec_packsu (src, src); \
vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest); \
vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest);
COPY
(
dest
,
vx0
)
dest
+=
stride
;
COPY
(
dest
,
vx1
)
dest
+=
stride
;
COPY
(
dest
,
vx2
)
dest
+=
stride
;
COPY
(
dest
,
vx3
)
dest
+=
stride
;
COPY
(
dest
,
vx4
)
dest
+=
stride
;
COPY
(
dest
,
vx5
)
dest
+=
stride
;
COPY
(
dest
,
vx6
)
dest
+=
stride
;
COPY
(
dest
,
vx7
)
#define COPY(dest, src) \
tmp = vec_packsu(src, src); \
vec_ste((vec_u32) tmp, 0, (unsigned int *) dest); \
vec_ste((vec_u32) tmp, 4, (unsigned int *) dest);
COPY
(
dest
,
vx0
)
dest
+=
stride
;
COPY
(
dest
,
vx1
)
dest
+=
stride
;
COPY
(
dest
,
vx2
)
dest
+=
stride
;
COPY
(
dest
,
vx3
)
dest
+=
stride
;
COPY
(
dest
,
vx4
)
dest
+=
stride
;
COPY
(
dest
,
vx5
)
dest
+=
stride
;
COPY
(
dest
,
vx6
)
dest
+=
stride
;
COPY
(
dest
,
vx7
)
}
void
ff_idct_add_altivec
(
uint8_t
*
dest
,
int
stride
,
int16_t
*
blk
)
void
ff_idct_add_altivec
(
uint8_t
*
dest
,
int
stride
,
int16_t
*
blk
)
{
vec_s16
*
block
=
(
vec_s16
*
)
blk
;
vec_s16
*
block
=
(
vec_s16
*
)
blk
;
vec_u8
tmp
;
vec_s16
tmp2
,
tmp3
;
vec_u8
perm0
;
...
...
@@ -185,27 +191,34 @@ void ff_idct_add_altivec(uint8_t* dest, int stride, int16_t *blk)
IDCT
p0
=
vec_lvsl
(
0
,
dest
);
p1
=
vec_lvsl
(
stride
,
dest
);
p
=
vec_splat_u8
(
-
1
);
perm0
=
vec_mergeh
(
p
,
p0
);
perm1
=
vec_mergeh
(
p
,
p1
);
#define ADD(dest,src,perm) \
/* *(uint64_t *)&tmp = *(uint64_t *)dest; */
\
tmp = vec_ld (0, dest); \
tmp2 = (vec_s16)vec_perm (tmp, (vec_u8)zero, perm); \
tmp3 = vec_adds (tmp2, src); \
tmp = vec_packsu (tmp3, tmp3); \
vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest); \
vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest);
ADD
(
dest
,
vx0
,
perm0
)
dest
+=
stride
;
ADD
(
dest
,
vx1
,
perm1
)
dest
+=
stride
;
ADD
(
dest
,
vx2
,
perm0
)
dest
+=
stride
;
ADD
(
dest
,
vx3
,
perm1
)
dest
+=
stride
;
ADD
(
dest
,
vx4
,
perm0
)
dest
+=
stride
;
ADD
(
dest
,
vx5
,
perm1
)
dest
+=
stride
;
ADD
(
dest
,
vx6
,
perm0
)
dest
+=
stride
;
ADD
(
dest
,
vx7
,
perm1
)
p0
=
vec_lvsl
(
0
,
dest
);
p1
=
vec_lvsl
(
stride
,
dest
);
p
=
vec_splat_u8
(
-
1
);
perm0
=
vec_mergeh
(
p
,
p0
);
perm1
=
vec_mergeh
(
p
,
p1
);
#define ADD(dest, src, perm) \
/* *(uint64_t *) &tmp = *(uint64_t *) dest; */
\
tmp = vec_ld(0, dest); \
tmp2 = (vec_s16) vec_perm(tmp, (vec_u8) zero, perm); \
tmp3 = vec_adds(tmp2, src); \
tmp = vec_packsu(tmp3, tmp3); \
vec_ste((vec_u32) tmp, 0, (unsigned int *) dest); \
vec_ste((vec_u32) tmp, 4, (unsigned int *) dest);
ADD
(
dest
,
vx0
,
perm0
)
dest
+=
stride
;
ADD
(
dest
,
vx1
,
perm1
)
dest
+=
stride
;
ADD
(
dest
,
vx2
,
perm0
)
dest
+=
stride
;
ADD
(
dest
,
vx3
,
perm1
)
dest
+=
stride
;
ADD
(
dest
,
vx4
,
perm0
)
dest
+=
stride
;
ADD
(
dest
,
vx5
,
perm1
)
dest
+=
stride
;
ADD
(
dest
,
vx6
,
perm0
)
dest
+=
stride
;
ADD
(
dest
,
vx7
,
perm1
)
}
libavcodec/ppc/int_altivec.c
View file @
022184a6
...
...
@@ -31,15 +31,16 @@
#include "libavutil/attributes.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavcodec/dsputil.h"
#include "dsputil_altivec.h"
static
int
ssd_int8_vs_int16_altivec
(
const
int8_t
*
pix1
,
const
int16_t
*
pix2
,
int
size
)
{
int
size
)
{
int
i
,
size16
;
vector
signed
char
vpix1
;
vector
signed
short
vpix2
,
vdiff
,
vpix1l
,
vpix1h
;
union
{
vector
signed
int
vscore
;
vector
signed
short
vpix2
,
vdiff
,
vpix1l
,
vpix1h
;
union
{
vector
signed
int
vscore
;
int32_t
score
[
4
];
}
u
;
u
.
vscore
=
vec_splat_s32
(
0
);
...
...
@@ -47,11 +48,11 @@ static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
// XXX lazy way, fix it later
#define vec_unaligned_load(b) \
vec_perm(vec_ld(0,
b),vec_ld(15,b),
vec_lvsl(0, b));
vec_perm(vec_ld(0,
b), vec_ld(15, b),
vec_lvsl(0, b));
size16
=
size
>>
4
;
while
(
size16
)
{
// score += (pix1[i]-pix2[i])*(pix1[i]-
pix2[i]);
while
(
size16
)
{
// score += (pix1[i] - pix2[i]) * (pix1[i] -
pix2[i]);
// load pix1 and the first batch of pix2
vpix1
=
vec_unaligned_load
(
pix1
);
...
...
@@ -73,9 +74,9 @@ static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
u
.
vscore
=
vec_sums
(
u
.
vscore
,
vec_splat_s32
(
0
));
size
%=
16
;
for
(
i
=
0
;
i
<
size
;
i
++
)
{
u
.
score
[
3
]
+=
(
pix1
[
i
]
-
pix2
[
i
])
*
(
pix1
[
i
]
-
pix2
[
i
]);
}
for
(
i
=
0
;
i
<
size
;
i
++
)
u
.
score
[
3
]
+=
(
pix1
[
i
]
-
pix2
[
i
])
*
(
pix1
[
i
]
-
pix2
[
i
]);
return
u
.
score
[
3
];
}
...
...
@@ -88,7 +89,7 @@ static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
register
vec_s32
res
=
vec_splat_s32
(
0
),
t
;
int32_t
ires
;
for
(
i
=
0
;
i
<
order
;
i
+=
8
)
{
for
(
i
=
0
;
i
<
order
;
i
+=
8
)
{
vec1
=
vec_unaligned_load
(
v1
);
t
=
vec_msum
(
vec1
,
vec_ld
(
0
,
v2
),
zero_s32v
);
res
=
vec_sums
(
t
,
res
);
...
...
@@ -97,19 +98,24 @@ static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
}
res
=
vec_splat
(
res
,
3
);
vec_ste
(
res
,
0
,
&
ires
);
return
ires
;
}
static
int32_t
scalarproduct_and_madd_int16_altivec
(
int16_t
*
v1
,
const
int16_t
*
v2
,
const
int16_t
*
v3
,
int
order
,
int
mul
)
static
int32_t
scalarproduct_and_madd_int16_altivec
(
int16_t
*
v1
,
const
int16_t
*
v2
,
const
int16_t
*
v3
,
int
order
,
int
mul
)
{
LOAD_ZERO
;
vec_s16
*
pv1
=
(
vec_s16
*
)
v1
;
register
vec_s16
muls
=
{
mul
,
mul
,
mul
,
mul
,
mul
,
mul
,
mul
,
mul
};
vec_s16
*
pv1
=
(
vec_s16
*
)
v1
;
register
vec_s16
muls
=
{
mul
,
mul
,
mul
,
mul
,
mul
,
mul
,
mul
,
mul
};
register
vec_s16
t0
,
t1
,
i0
,
i1
,
i4
;
register
vec_s16
i2
=
vec_ld
(
0
,
v2
),
i3
=
vec_ld
(
0
,
v3
);
register
vec_s32
res
=
zero_s32v
;
register
vec_u8
align
=
vec_lvsl
(
0
,
v2
);
int32_t
ires
;
order
>>=
4
;
do
{
i1
=
vec_ld
(
16
,
v2
);
...
...
@@ -129,15 +135,18 @@ static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1, const int16_t *
pv1
+=
2
;
v2
+=
16
;
v3
+=
16
;
}
while
(
--
order
);
}
while
(
--
order
);
res
=
vec_splat
(
vec_sums
(
res
,
zero_s32v
),
3
);
vec_ste
(
res
,
0
,
&
ires
);
return
ires
;
}
av_cold
void
ff_int_init_altivec
(
DSPContext
*
c
,
AVCodecContext
*
avctx
)
{
c
->
ssd_int8_vs_int16
=
ssd_int8_vs_int16_altivec
;
c
->
scalarproduct_int16
=
scalarproduct_int16_altivec
;
c
->
scalarproduct_and_madd_int16
=
scalarproduct_and_madd_int16_altivec
;
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment