Project: Linshizhi / ffmpeg.wasm-core

Commit 022184a6, authored Jan 15, 2014 by Diego Biurrun
Parent: 30f3f959

    ppc: dsputil: more K&R formatting cosmetics

Showing 4 changed files with 437 additions and 416 deletions:

    libavcodec/ppc/fdct_altivec.c   +191  -193
    libavcodec/ppc/gmc_altivec.c     +42   -41
    libavcodec/ppc/idct_altivec.c   +154  -141
    libavcodec/ppc/int_altivec.c     +50   -41
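The change is purely cosmetic: no logic is touched, only spacing and macro layout. The pattern repeated throughout the hunks below is roughly this (an illustrative before/after distilled from the fdct_altivec.c hunk, not a literal excerpt):

    /* before: no space after casts, compressed macro arguments */
    #define vs16(v) ((vector signed short)(v))
    x0 = ((vector float)vec_add(vs16(b00), vs16(b70)));

    /* after: K&R-style space after casts and between arguments */
    #define vs16(v) ((vector signed short) (v))
    x0 = ((vector float) vec_add(vs16(b00), vs16(b70)));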
libavcodec/ppc/fdct_altivec.c  (+191 -193)
@@ -22,39 +22,37 @@
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif

#include "libavutil/common.h"
#include "dsputil_altivec.h"

#define vs16(v)  ((vector signed short) (v))
#define vs32(v)  ((vector signed int) (v))
#define vu8(v)   ((vector unsigned char) (v))
#define vu16(v)  ((vector unsigned short) (v))
#define vu32(v)  ((vector unsigned int) (v))

#define C1     0.98078525066375732421875000 /* cos(1 * PI / 16) */
#define C2     0.92387950420379638671875000 /* cos(2 * PI / 16) */
#define C3     0.83146959543228149414062500 /* cos(3 * PI / 16) */
#define C4     0.70710676908493041992187500 /* cos(4 * PI / 16) */
#define C5     0.55557024478912353515625000 /* cos(5 * PI / 16) */
#define C6     0.38268342614173889160156250 /* cos(6 * PI / 16) */
#define C7     0.19509032368659973144531250 /* cos(7 * PI / 16) */
#define SQRT_2 1.41421353816986083984375000 /* sqrt(2)          */

#define W0 -(2 * C2)
#define W1  (2 * C6)
#define W2 (SQRT_2 * C6)
#define W3 (SQRT_2 * C3)
#define W4 (SQRT_2 * (-C1 + C3 + C5 - C7))
#define W5 (SQRT_2 * ( C1 + C3 - C5 + C7))
#define W6 (SQRT_2 * ( C1 + C3 + C5 - C7))
#define W7 (SQRT_2 * ( C1 + C3 - C5 - C7))
#define W8 (SQRT_2 * (C7 - C3))
#define W9 (SQRT_2 * (-C1 - C3))
#define WA (SQRT_2 * (-C3 - C5))
#define WB (SQRT_2 * (C5 - C3))

static vector float fdctconsts[3] = {
    { W0, W1, W2, W3 },
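The C1..C7 values above are simply the cosine factors named in their comments, cos(k * PI / 16), and SQRT_2 is sqrt(2) = 2 * C4. A standalone spot check (not part of the tree; assumes a libm that provides M_PI):

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        printf("%.10f\n", cos(4 * M_PI / 16)); /* ~0.7071067812, matches C4     */
        printf("%.10f\n", sqrt(2.0));          /* ~1.4142135624, matches SQRT_2 */
        return 0;
    }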
@@ -75,8 +73,7 @@ static vector float fdctconsts[3] = {
#define LD_WA vec_splat(cnsts2, 2)
#define LD_WB vec_splat(cnsts2, 3)

#define FDCTROW(b0, b1, b2, b3, b4, b5, b6, b7) /* {{{ */ \
    x0 = vec_add(b0, b7);               /* x0 = b0 + b7; */ \
    x7 = vec_sub(b0, b7);               /* x7 = b0 - b7; */ \
    x1 = vec_add(b1, b6);               /* x1 = b1 + b6; */ \
@@ -91,41 +88,41 @@ static vector float fdctconsts[3] = {
    b0 = vec_add(b7, b1);               /* b0 = b7 + b1;      */ \
    b4 = vec_sub(b7, b1);               /* b4 = b7 - b1;      */ \
                                                                  \
    b2 = vec_sub(x0, x3);               /* b2 = x0 - x3;      */ \
    b6 = vec_sub(x1, x2);               /* b6 = x1 - x2;      */ \
    b5 = vec_add(b6, b2);               /* b5 = b6 + b2;      */ \
    cnst = LD_W2;                                                 \
    b5 = vec_madd(cnst, b5, mzero);     /* b5 = b5 * W2;      */ \
    cnst = LD_W1;                                                 \
    b2 = vec_madd(cnst, b2, b5);        /* b2 = b5 + b2 * W1; */ \
    cnst = LD_W0;                                                 \
    b6 = vec_madd(cnst, b6, b5);        /* b6 = b5 + b6 * W0; */ \
                                                                  \
    x0 = vec_add(x4, x7);               /* x0 = x4 + x7;      */ \
    x1 = vec_add(x5, x6);               /* x1 = x5 + x6;      */ \
    x2 = vec_add(x4, x6);               /* x2 = x4 + x6;      */ \
    x3 = vec_add(x5, x7);               /* x3 = x5 + x7;      */ \
    x8 = vec_add(x2, x3);               /* x8 = x2 + x3;      */ \
    cnst = LD_W3;                                                 \
    x8 = vec_madd(cnst, x8, mzero);     /* x8 = x8 * W3;      */ \
                                                                  \
    cnst = LD_W8;                                                 \
    x0 = vec_madd(cnst, x0, mzero);     /* x0 *= W8;          */ \
    cnst = LD_W9;                                                 \
    x1 = vec_madd(cnst, x1, mzero);     /* x1 *= W9;          */ \
    cnst = LD_WA;                                                 \
    x2 = vec_madd(cnst, x2, x8);        /* x2 = x2 * WA + x8; */ \
    cnst = LD_WB;                                                 \
    x3 = vec_madd(cnst, x3, x8);        /* x3 = x3 * WB + x8; */ \
                                                                  \
    cnst = LD_W4;                                                 \
    b7 = vec_madd(cnst, x4, x0);        /* b7 = x4 * W4 + x0; */ \
    cnst = LD_W5;                                                 \
    b5 = vec_madd(cnst, x5, x1);        /* b5 = x5 * W5 + x1; */ \
    cnst = LD_W6;                                                 \
    b3 = vec_madd(cnst, x6, x1);        /* b3 = x6 * W6 + x1; */ \
    cnst = LD_W7;                                                 \
    b1 = vec_madd(cnst, x7, x0);        /* b1 = x7 * W7 + x0; */ \
                                                                  \
    b7 = vec_add(b7, x2);               /* b7 = b7 + x2;      */ \
    b5 = vec_add(b5, x3);               /* b5 = b5 + x3;      */ \
@@ -133,7 +130,7 @@ static vector float fdctconsts[3] = {
    b1 = vec_add(b1, x3);               /* b1 = b1 + x3;      */ \
    /* }}} */

#define FDCTCOL(b0, b1, b2, b3, b4, b5, b6, b7) /* {{{ */ \
    x0 = vec_add(b0, b7);               /* x0 = b0 + b7; */ \
    x7 = vec_sub(b0, b7);               /* x7 = b0 - b7; */ \
    x1 = vec_add(b1, b6);               /* x1 = b1 + b6; */ \
@@ -148,41 +145,41 @@ static vector float fdctconsts[3] = {
    b0 = vec_add(b7, b1);               /* b0 = b7 + b1;      */ \
    b4 = vec_sub(b7, b1);               /* b4 = b7 - b1;      */ \
                                                                  \
    b2 = vec_sub(x0, x3);               /* b2 = x0 - x3;      */ \
    b6 = vec_sub(x1, x2);               /* b6 = x1 - x2;      */ \
    b5 = vec_add(b6, b2);               /* b5 = b6 + b2;      */ \
    cnst = LD_W2;                                                 \
    b5 = vec_madd(cnst, b5, mzero);     /* b5 = b5 * W2;      */ \
    cnst = LD_W1;                                                 \
    b2 = vec_madd(cnst, b2, b5);        /* b2 = b5 + b2 * W1; */ \
    cnst = LD_W0;                                                 \
    b6 = vec_madd(cnst, b6, b5);        /* b6 = b5 + b6 * W0; */ \
                                                                  \
    x0 = vec_add(x4, x7);               /* x0 = x4 + x7;      */ \
    x1 = vec_add(x5, x6);               /* x1 = x5 + x6;      */ \
    x2 = vec_add(x4, x6);               /* x2 = x4 + x6;      */ \
    x3 = vec_add(x5, x7);               /* x3 = x5 + x7;      */ \
    x8 = vec_add(x2, x3);               /* x8 = x2 + x3;      */ \
    cnst = LD_W3;                                                 \
    x8 = vec_madd(cnst, x8, mzero);     /* x8 = x8 * W3;      */ \
                                                                  \
    cnst = LD_W8;                                                 \
    x0 = vec_madd(cnst, x0, mzero);     /* x0 *= W8;          */ \
    cnst = LD_W9;                                                 \
    x1 = vec_madd(cnst, x1, mzero);     /* x1 *= W9;          */ \
    cnst = LD_WA;                                                 \
    x2 = vec_madd(cnst, x2, x8);        /* x2 = x2 * WA + x8; */ \
    cnst = LD_WB;                                                 \
    x3 = vec_madd(cnst, x3, x8);        /* x3 = x3 * WB + x8; */ \
                                                                  \
    cnst = LD_W4;                                                 \
    b7 = vec_madd(cnst, x4, x0);        /* b7 = x4 * W4 + x0; */ \
    cnst = LD_W5;                                                 \
    b5 = vec_madd(cnst, x5, x1);        /* b5 = x5 * W5 + x1; */ \
    cnst = LD_W6;                                                 \
    b3 = vec_madd(cnst, x6, x1);        /* b3 = x6 * W6 + x1; */ \
    cnst = LD_W7;                                                 \
    b1 = vec_madd(cnst, x7, x0);        /* b1 = x7 * W7 + x0; */ \
                                                                  \
    b7 = vec_add(b7, x2);               /* b7 += x2;          */ \
    b5 = vec_add(b5, x3);               /* b5 += x3;          */ \
@@ -190,10 +187,7 @@ static vector float fdctconsts[3] = {
    b1 = vec_add(b1, x3);               /* b1 += x3;          */ \
    /* }}} */

/* two dimensional discrete cosine transform */
void ff_fdct_altivec(int16_t *block)
{
    vector signed short *bp;
@@ -205,56 +199,57 @@ void ff_fdct_altivec(int16_t *block)
    /* setup constants {{{ */
    /* mzero = -0.0 */
    mzero  = ((vector float) vec_splat_u32(-1));
    mzero  = ((vector float) vec_sl(vu32(mzero), vu32(mzero)));
    cp     = fdctconsts;
    cnsts0 = vec_ld(0, cp);
    cp++;
    cnsts1 = vec_ld(0, cp);
    cp++;
    cnsts2 = vec_ld(0, cp);
    /* }}} */

    /* 8x8 matrix transpose (vector short[8]) {{{ */
#define MERGE_S16(hl, a, b) vec_merge ## hl(vs16(a), vs16(b))

    bp  = (vector signed short *) block;
    b00 = ((vector float) vec_ld(0,      bp));
    b40 = ((vector float) vec_ld(16 * 4, bp));
    b01 = ((vector float) MERGE_S16(h, b00, b40));
    b11 = ((vector float) MERGE_S16(l, b00, b40));
    bp++;
    b10 = ((vector float) vec_ld(0,      bp));
    b50 = ((vector float) vec_ld(16 * 4, bp));
    b21 = ((vector float) MERGE_S16(h, b10, b50));
    b31 = ((vector float) MERGE_S16(l, b10, b50));
    bp++;
    b20 = ((vector float) vec_ld(0,      bp));
    b60 = ((vector float) vec_ld(16 * 4, bp));
    b41 = ((vector float) MERGE_S16(h, b20, b60));
    b51 = ((vector float) MERGE_S16(l, b20, b60));
    bp++;
    b30 = ((vector float) vec_ld(0,      bp));
    b70 = ((vector float) vec_ld(16 * 4, bp));
    b61 = ((vector float) MERGE_S16(h, b30, b70));
    b71 = ((vector float) MERGE_S16(l, b30, b70));

    x0 = ((vector float) MERGE_S16(h, b01, b41));
    x1 = ((vector float) MERGE_S16(l, b01, b41));
    x2 = ((vector float) MERGE_S16(h, b11, b51));
    x3 = ((vector float) MERGE_S16(l, b11, b51));
    x4 = ((vector float) MERGE_S16(h, b21, b61));
    x5 = ((vector float) MERGE_S16(l, b21, b61));
    x6 = ((vector float) MERGE_S16(h, b31, b71));
    x7 = ((vector float) MERGE_S16(l, b31, b71));

    b00 = ((vector float) MERGE_S16(h, x0, x4));
    b10 = ((vector float) MERGE_S16(l, x0, x4));
    b20 = ((vector float) MERGE_S16(h, x1, x5));
    b30 = ((vector float) MERGE_S16(l, x1, x5));
    b40 = ((vector float) MERGE_S16(h, x2, x6));
    b50 = ((vector float) MERGE_S16(l, x2, x6));
    b60 = ((vector float) MERGE_S16(h, x3, x7));
    b70 = ((vector float) MERGE_S16(l, x3, x7));

#undef MERGE_S16
    /* }}} */
@@ -264,32 +259,32 @@ void ff_fdct_altivec(int16_t *block)
     * takes advantage of this. */

    /* fdct rows {{{ */
    x0 = ((vector float) vec_add(vs16(b00), vs16(b70)));
    x7 = ((vector float) vec_sub(vs16(b00), vs16(b70)));
    x1 = ((vector float) vec_add(vs16(b10), vs16(b60)));
    x6 = ((vector float) vec_sub(vs16(b10), vs16(b60)));
    x2 = ((vector float) vec_add(vs16(b20), vs16(b50)));
    x5 = ((vector float) vec_sub(vs16(b20), vs16(b50)));
    x3 = ((vector float) vec_add(vs16(b30), vs16(b40)));
    x4 = ((vector float) vec_sub(vs16(b30), vs16(b40)));

    b70 = ((vector float) vec_add(vs16(x0), vs16(x3)));
    b10 = ((vector float) vec_add(vs16(x1), vs16(x2)));
    b00 = ((vector float) vec_add(vs16(b70), vs16(b10)));
    b40 = ((vector float) vec_sub(vs16(b70), vs16(b10)));

#define CTF0(n)                                                     \
    b ## n ## 1 = ((vector float) vec_unpackl(vs16(b ## n ## 0)));  \
    b ## n ## 0 = ((vector float) vec_unpackh(vs16(b ## n ## 0)));  \
    b ## n ## 1 = vec_ctf(vs32(b ## n ## 1), 0);                    \
    b ## n ## 0 = vec_ctf(vs32(b ## n ## 0), 0);

    CTF0(0);
    CTF0(4);

    b20 = ((vector float) vec_sub(vs16(x0), vs16(x3)));
    b60 = ((vector float) vec_sub(vs16(x1), vs16(x2)));

    CTF0(2);
    CTF0(6);
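For readers less used to token pasting: CTF0(0) above unpacks the eight int16 values of row 0 into two int32 vectors and converts them to float. Written out by hand, the expansion is simply:

    b01 = ((vector float) vec_unpackl(vs16(b00)));
    b00 = ((vector float) vec_unpackh(vs16(b00)));
    b01 = vec_ctf(vs32(b01), 0);
    b00 = vec_ctf(vs32(b00), 0);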
@@ -300,20 +295,20 @@ void ff_fdct_altivec(int16_t *block)
    x1 = vec_add(b61, b21);
    cnst = LD_W2;
    x0 = vec_madd(cnst, x0, mzero);
    x1 = vec_madd(cnst, x1, mzero);
    cnst = LD_W1;
    b20 = vec_madd(cnst, b20, x0);
    b21 = vec_madd(cnst, b21, x1);
    cnst = LD_W0;
    b60 = vec_madd(cnst, b60, x0);
    b61 = vec_madd(cnst, b61, x1);

#define CTFX(x, b)                                   \
    b ## 0 = ((vector float) vec_unpackh(vs16(x)));  \
    b ## 1 = ((vector float) vec_unpackl(vs16(x)));  \
    b ## 0 = vec_ctf(vs32(b ## 0), 0);               \
    b ## 1 = vec_ctf(vs32(b ## 1), 0);

    CTFX(x4, b7);
    CTFX(x5, b5);
@@ -322,64 +317,62 @@ void ff_fdct_altivec(int16_t *block)
#undef CTFX

    x0 = vec_add(b70, b10);
    x1 = vec_add(b50, b30);
    x2 = vec_add(b70, b30);
    x3 = vec_add(b50, b10);
    x8 = vec_add(x2, x3);
    cnst = LD_W3;
    x8 = vec_madd(cnst, x8, mzero);

    cnst = LD_W8;
    x0 = vec_madd(cnst, x0, mzero);
    cnst = LD_W9;
    x1 = vec_madd(cnst, x1, mzero);
    cnst = LD_WA;
    x2 = vec_madd(cnst, x2, x8);
    cnst = LD_WB;
    x3 = vec_madd(cnst, x3, x8);

    cnst = LD_W4;
    b70 = vec_madd(cnst, b70, x0);
    cnst = LD_W5;
    b50 = vec_madd(cnst, b50, x1);
    cnst = LD_W6;
    b30 = vec_madd(cnst, b30, x1);
    cnst = LD_W7;
    b10 = vec_madd(cnst, b10, x0);

    b70 = vec_add(b70, x2);
    b50 = vec_add(b50, x3);
    b30 = vec_add(b30, x2);
    b10 = vec_add(b10, x3);

    x0 = vec_add(b71, b11);
    x1 = vec_add(b51, b31);
    x2 = vec_add(b71, b31);
    x3 = vec_add(b51, b11);
    x8 = vec_add(x2, x3);
    cnst = LD_W3;
    x8 = vec_madd(cnst, x8, mzero);

    cnst = LD_W8;
    x0 = vec_madd(cnst, x0, mzero);
    cnst = LD_W9;
    x1 = vec_madd(cnst, x1, mzero);
    cnst = LD_WA;
    x2 = vec_madd(cnst, x2, x8);
    cnst = LD_WB;
    x3 = vec_madd(cnst, x3, x8);

    cnst = LD_W4;
    b71 = vec_madd(cnst, b71, x0);
    cnst = LD_W5;
    b51 = vec_madd(cnst, b51, x1);
    cnst = LD_W6;
    b31 = vec_madd(cnst, b31, x1);
    cnst = LD_W7;
    b11 = vec_madd(cnst, b11, x0);

    b71 = vec_add(b71, x2);
    b51 = vec_add(b51, x3);
@@ -387,7 +380,6 @@ void ff_fdct_altivec(int16_t *block)
    b11 = vec_add(b11, x3);
    /* }}} */

    /* 8x8 matrix transpose (vector float[8][2]) {{{ */
    x0 = vec_mergel(b00, b20);
    x1 = vec_mergeh(b00, b20);
@@ -430,28 +422,34 @@ void ff_fdct_altivec(int16_t *block)
    b31 = vec_mergel(x4, x6);
    /* }}} */

    FDCTCOL(b00, b10, b20, b30, b40, b50, b60, b70);
    FDCTCOL(b01, b11, b21, b31, b41, b51, b61, b71);

    /* round, convert back to short {{{ */
#define CTS(n)                                                    \
    b ## n ## 0 = vec_round(b ## n ## 0);                         \
    b ## n ## 1 = vec_round(b ## n ## 1);                         \
    b ## n ## 0 = ((vector float) vec_cts(b ## n ## 0, 0));       \
    b ## n ## 1 = ((vector float) vec_cts(b ## n ## 1, 0));       \
    b ## n ## 0 = ((vector float) vec_pack(vs32(b ## n ## 0),     \
                                           vs32(b ## n ## 1)));   \
    vec_st(vs16(b ## n ## 0), 0, bp);

    bp = (vector signed short *) block;
    CTS(0);
    bp++;
    CTS(1);
    bp++;
    CTS(2);
    bp++;
    CTS(3);
    bp++;
    CTS(4);
    bp++;
    CTS(5);
    bp++;
    CTS(6);
    bp++;
    CTS(7);
#undef CTS
libavcodec/ppc/gmc_altivec.c  (+42 -41)
@@ -27,32 +27,36 @@
/* AltiVec-enhanced gmc1. ATM this code assumes stride is a multiple of 8
 * to preserve proper dst alignment. */
void ff_gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */,
                     int stride, int h, int x16, int y16, int rounder)
{
    const DECLARE_ALIGNED(16, unsigned short, rounder_a) = rounder;
    const DECLARE_ALIGNED(16, unsigned short, ABCD)[8] = {
        (16 - x16) * (16 - y16), /* A */
        (x16)      * (16 - y16), /* B */
        (16 - x16) * (y16),      /* C */
        (x16)      * (y16),      /* D */
        0, 0, 0, 0               /* padding */
    };
    register const vector unsigned char vczero =
        (const vector unsigned char) vec_splat_u8(0);
    register const vector unsigned short vcsr8 =
        (const vector unsigned short) vec_splat_u16(8);
    register vector unsigned char dstv, dstv2, src_0, src_1,
                                  srcvA, srcvB, srcvC, srcvD;
    register vector unsigned short Av, Bv, Cv, Dv, rounderV,
                                   tempA, tempB, tempC, tempD;
    int i;
    unsigned long dst_odd        = (unsigned long) dst & 0x0000000F;
    unsigned long src_really_odd = (unsigned long) src & 0x0000000F;

    tempA = vec_ld(0, (const unsigned short *) ABCD);
    Av    = vec_splat(tempA, 0);
    Bv    = vec_splat(tempA, 1);
    Cv    = vec_splat(tempA, 2);
    Dv    = vec_splat(tempA, 3);

    rounderV = vec_splat((vec_u16) vec_lde(0, &rounder_a), 0);

    /* we'll be able to pick-up our 9 char elements at src from those
     * 32 bytes we load the first batch here, as inside the loop we can
@@ -61,36 +65,34 @@ void ff_gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int
    src_1 = vec_ld(16, src);
    srcvA = vec_perm(src_0, src_1, vec_lvsl(0, src));

    if (src_really_odd != 0x0000000F)
        /* If src & 0xF == 0xF, then (src + 1) is properly aligned
         * on the second vector. */
        srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src));
    else
        srcvB = src_1;
    srcvA = vec_mergeh(vczero, srcvA);
    srcvB = vec_mergeh(vczero, srcvB);

    for (i = 0; i < h; i++) {
        dst_odd        = (unsigned long) dst & 0x0000000F;
        src_really_odd = (((unsigned long) src) + stride) & 0x0000000F;

        dstv = vec_ld(0, dst);

        /* We'll be able to pick-up our 9 char elements at src + stride from
         * those 32 bytes then reuse the resulting 2 vectors srvcC and srcvD
         * as the next srcvA and srcvB. */
        src_0 = vec_ld(stride +  0, src);
        src_1 = vec_ld(stride + 16, src);
        srcvC = vec_perm(src_0, src_1, vec_lvsl(stride + 0, src));

        if (src_really_odd != 0x0000000F)
            /* If src & 0xF == 0xF, then (src + 1) is properly aligned
             * on the second vector. */
            srcvD = vec_perm(src_0, src_1, vec_lvsl(stride + 1, src));
        else
            srcvD = src_1;

        srcvC = vec_mergeh(vczero, srcvC);
        srcvD = vec_mergeh(vczero, srcvD);
@@ -98,23 +100,22 @@ void ff_gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int
        /* OK, now we (finally) do the math :-)
         * Those four instructions replace 32 int muls & 32 int adds.
         * Isn't AltiVec nice? */
        tempA = vec_mladd((vector unsigned short) srcvA, Av, rounderV);
        tempB = vec_mladd((vector unsigned short) srcvB, Bv, tempA);
        tempC = vec_mladd((vector unsigned short) srcvC, Cv, tempB);
        tempD = vec_mladd((vector unsigned short) srcvD, Dv, tempC);

        srcvA = srcvC;
        srcvB = srcvD;

        tempD = vec_sr(tempD, vcsr8);

        dstv2 = vec_pack(tempD, (vector unsigned short) vczero);

        if (dst_odd)
            dstv2 = vec_perm(dstv, dstv2, vcprm(0, 1, s0, s1));
        else
            dstv2 = vec_perm(dstv, dstv2, vcprm(s0, s1, 2, 3));

        vec_st(dstv2, 0, dst);
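For reference, the four vec_mladd lines above compute the usual gmc1 bilinear blend; since A + B + C + D = 256, the sum is shifted right by 8 after adding the rounder. A scalar sketch of one 8-pixel row (hypothetical helper, illustration only):

    static void gmc1_row_ref(uint8_t *dst, const uint8_t *src, int stride,
                             int A, int B, int C, int D, int rounder)
    {
        for (int i = 0; i < 8; i++)
            dst[i] = (A * src[i]          + B * src[i + 1] +
                      C * src[i + stride] + D * src[i + stride + 1] +
                      rounder) >> 8;
    }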
libavcodec/ppc/idct_altivec.c  (+154 -141)
@@ -36,147 +36,153 @@
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif

#include "libavutil/ppc/types_altivec.h"
#include "dsputil_altivec.h"

#define IDCT_HALF                                      \
    /* 1st stage */                                    \
    t1 = vec_mradds(a1, vx7, vx1);                     \
    t8 = vec_mradds(a1, vx1, vec_subs(zero, vx7));     \
    t7 = vec_mradds(a2, vx5, vx3);                     \
    t3 = vec_mradds(ma2, vx3, vx5);                    \
                                                       \
    /* 2nd stage */                                    \
    t5 = vec_adds(vx0, vx4);                           \
    t0 = vec_subs(vx0, vx4);                           \
    t2 = vec_mradds(a0, vx6, vx2);                     \
    t4 = vec_mradds(a0, vx2, vec_subs(zero, vx6));     \
    t6 = vec_adds(t8, t3);                             \
    t3 = vec_subs(t8, t3);                             \
    t8 = vec_subs(t1, t7);                             \
    t1 = vec_adds(t1, t7);                             \
                                                       \
    /* 3rd stage */                                    \
    t7 = vec_adds(t5, t2);                             \
    t2 = vec_subs(t5, t2);                             \
    t5 = vec_adds(t0, t4);                             \
    t0 = vec_subs(t0, t4);                             \
    t4 = vec_subs(t8, t3);                             \
    t3 = vec_adds(t8, t3);                             \
                                                       \
    /* 4th stage */                                    \
    vy0 = vec_adds(t7, t1);                            \
    vy7 = vec_subs(t7, t1);                            \
    vy1 = vec_mradds(c4, t3, t5);                      \
    vy6 = vec_mradds(mc4, t3, t5);                     \
    vy2 = vec_mradds(c4, t4, t0);                      \
    vy5 = vec_mradds(mc4, t4, t0);                     \
    vy3 = vec_adds(t2, t6);                            \
    vy4 = vec_subs(t2, t6);

#define IDCT                                                            \
    vec_s16 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;                     \
    vec_s16 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;                     \
    vec_s16 a0, a1, a2, ma2, c4, mc4, zero, bias;                       \
    vec_s16 t0, t1, t2, t3, t4, t5, t6, t7, t8;                         \
    vec_u16 shift;                                                      \
                                                                        \
    c4   = vec_splat(constants[0], 0);                                  \
    a0   = vec_splat(constants[0], 1);                                  \
    a1   = vec_splat(constants[0], 2);                                  \
    a2   = vec_splat(constants[0], 3);                                  \
    mc4  = vec_splat(constants[0], 4);                                  \
    ma2  = vec_splat(constants[0], 5);                                  \
    bias = (vec_s16) vec_splat((vec_s32) constants[0], 3);              \
                                                                        \
    zero  = vec_splat_s16(0);                                           \
    shift = vec_splat_u16(4);                                           \
                                                                        \
    vx0 = vec_mradds(vec_sl(block[0], shift), constants[1], zero);      \
    vx1 = vec_mradds(vec_sl(block[1], shift), constants[2], zero);      \
    vx2 = vec_mradds(vec_sl(block[2], shift), constants[3], zero);      \
    vx3 = vec_mradds(vec_sl(block[3], shift), constants[4], zero);      \
    vx4 = vec_mradds(vec_sl(block[4], shift), constants[1], zero);      \
    vx5 = vec_mradds(vec_sl(block[5], shift), constants[4], zero);      \
    vx6 = vec_mradds(vec_sl(block[6], shift), constants[3], zero);      \
    vx7 = vec_mradds(vec_sl(block[7], shift), constants[2], zero);      \
                                                                        \
    IDCT_HALF                                                           \
                                                                        \
    vx0 = vec_mergeh(vy0, vy4);                                         \
    vx1 = vec_mergel(vy0, vy4);                                         \
    vx2 = vec_mergeh(vy1, vy5);                                         \
    vx3 = vec_mergel(vy1, vy5);                                         \
    vx4 = vec_mergeh(vy2, vy6);                                         \
    vx5 = vec_mergel(vy2, vy6);                                         \
    vx6 = vec_mergeh(vy3, vy7);                                         \
    vx7 = vec_mergel(vy3, vy7);                                         \
                                                                        \
    vy0 = vec_mergeh(vx0, vx4);                                         \
    vy1 = vec_mergel(vx0, vx4);                                         \
    vy2 = vec_mergeh(vx1, vx5);                                         \
    vy3 = vec_mergel(vx1, vx5);                                         \
    vy4 = vec_mergeh(vx2, vx6);                                         \
    vy5 = vec_mergel(vx2, vx6);                                         \
    vy6 = vec_mergeh(vx3, vx7);                                         \
    vy7 = vec_mergel(vx3, vx7);                                         \
                                                                        \
    vx0 = vec_adds(vec_mergeh(vy0, vy4), bias);                         \
    vx1 = vec_mergel(vy0, vy4);                                         \
    vx2 = vec_mergeh(vy1, vy5);                                         \
    vx3 = vec_mergel(vy1, vy5);                                         \
    vx4 = vec_mergeh(vy2, vy6);                                         \
    vx5 = vec_mergel(vy2, vy6);                                         \
    vx6 = vec_mergeh(vy3, vy7);                                         \
    vx7 = vec_mergel(vy3, vy7);                                         \
                                                                        \
    IDCT_HALF                                                           \
                                                                        \
    shift = vec_splat_u16(6);                                           \
    vx0 = vec_sra(vy0, shift);                                          \
    vx1 = vec_sra(vy1, shift);                                          \
    vx2 = vec_sra(vy2, shift);                                          \
    vx3 = vec_sra(vy3, shift);                                          \
    vx4 = vec_sra(vy4, shift);                                          \
    vx5 = vec_sra(vy5, shift);                                          \
    vx6 = vec_sra(vy6, shift);                                          \
    vx7 = vec_sra(vy7, shift);

static const vec_s16 constants[5] = {
    { 23170, 13573,  6518, 21895, -23170, -21895,    32,    31 },
    { 16384, 22725, 21407, 19266,  16384,  19266, 21407, 22725 },
    { 22725, 31521, 29692, 26722,  22725,  26722, 29692, 31521 },
    { 21407, 29692, 27969, 25172,  21407,  25172, 27969, 29692 },
    { 19266, 26722, 25172, 22654,  19266,  22654, 25172, 26722 }
};

void ff_idct_put_altivec(uint8_t *dest, int stride, int16_t *blk)
{
    vec_s16 *block = (vec_s16 *) blk;
    vec_u8 tmp;

    IDCT

#define COPY(dest, src)                                   \
    tmp = vec_packsu(src, src);                           \
    vec_ste((vec_u32) tmp, 0, (unsigned int *) dest);     \
    vec_ste((vec_u32) tmp, 4, (unsigned int *) dest);

    COPY(dest, vx0)  dest += stride;
    COPY(dest, vx1)  dest += stride;
    COPY(dest, vx2)  dest += stride;
    COPY(dest, vx3)  dest += stride;
    COPY(dest, vx4)  dest += stride;
    COPY(dest, vx5)  dest += stride;
    COPY(dest, vx6)  dest += stride;
    COPY(dest, vx7)
}

void ff_idct_add_altivec(uint8_t *dest, int stride, int16_t *blk)
{
    vec_s16 *block = (vec_s16 *) blk;
    vec_u8 tmp;
    vec_s16 tmp2, tmp3;
    vec_u8 perm0;
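The COPY macro saturates one row of IDCT output to unsigned 8-bit (vec_packsu) and stores it as two 32-bit elements. A rough scalar equivalent of one row, as a hypothetical illustration:

    static void idct_put_row_ref(uint8_t *dest, const int16_t *row)
    {
        for (int i = 0; i < 8; i++)
            dest[i] = row[i] < 0 ? 0 : (row[i] > 255 ? 255 : row[i]);
    }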
@@ -185,27 +191,34 @@ void ff_idct_add_altivec(uint8_t* dest, int stride, int16_t *blk)
    IDCT

    p0    = vec_lvsl(0, dest);
    p1    = vec_lvsl(stride, dest);
    p     = vec_splat_u8(-1);
    perm0 = vec_mergeh(p, p0);
    perm1 = vec_mergeh(p, p1);

#define ADD(dest, src, perm)                              \
    /* *(uint64_t *) &tmp = *(uint64_t *) dest; */        \
    tmp  = vec_ld(0, dest);                               \
    tmp2 = (vec_s16) vec_perm(tmp, (vec_u8) zero, perm);  \
    tmp3 = vec_adds(tmp2, src);                           \
    tmp  = vec_packsu(tmp3, tmp3);                        \
    vec_ste((vec_u32) tmp, 0, (unsigned int *) dest);     \
    vec_ste((vec_u32) tmp, 4, (unsigned int *) dest);

    ADD(dest, vx0, perm0)  dest += stride;
    ADD(dest, vx1, perm1)  dest += stride;
    ADD(dest, vx2, perm0)  dest += stride;
    ADD(dest, vx3, perm1)  dest += stride;
    ADD(dest, vx4, perm0)  dest += stride;
    ADD(dest, vx5, perm1)  dest += stride;
    ADD(dest, vx6, perm0)  dest += stride;
    ADD(dest, vx7, perm1)
}
libavcodec/ppc/int_altivec.c  (+50 -41)
@@ -31,27 +31,28 @@
#include "libavutil/attributes.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavcodec/dsputil.h"
#include "dsputil_altivec.h"

static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
                                     int size)
{
    int i, size16;
    vector signed char vpix1;
    vector signed short vpix2, vdiff, vpix1l, vpix1h;
    union {
        vector signed int vscore;
        int32_t score[4];
    } u;

    u.vscore = vec_splat_s32(0);

// XXX lazy way, fix it later
#define vec_unaligned_load(b) \
    vec_perm(vec_ld(0, b), vec_ld(15, b), vec_lvsl(0, b));

    size16 = size >> 4;
    while (size16) {
        // score += (pix1[i] - pix2[i]) * (pix1[i] - pix2[i]);
        // load pix1 and the first batch of pix2
        vpix1 = vec_unaligned_load(pix1);
@@ -62,20 +63,20 @@ static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
        vdiff  = vec_sub(vpix1h, vpix2);
        vpix1l = vec_unpackl(vpix1);
        // load another batch from pix2
        vpix2    = vec_unaligned_load(pix2);
        u.vscore = vec_msum(vdiff, vdiff, u.vscore);
        vdiff    = vec_sub(vpix1l, vpix2);
        u.vscore = vec_msum(vdiff, vdiff, u.vscore);

        pix1 += 16;
        pix2 +=  8;
        size16--;
    }
    u.vscore = vec_sums(u.vscore, vec_splat_s32(0));

    size %= 16;
    for (i = 0; i < size; i++)
        u.score[3] += (pix1[i] - pix2[i]) * (pix1[i] - pix2[i]);

    return u.score[3];
}
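The scalar loop this routine vectorizes is the one already hinted at in the inline comment; a reference version (sketch mirroring the plain C fallback, not taken from this commit):

    static int ssd_int8_vs_int16_ref(const int8_t *pix1, const int16_t *pix2,
                                     int size)
    {
        int score = 0;
        for (int i = 0; i < size; i++)
            score += (pix1[i] - pix2[i]) * (pix1[i] - pix2[i]);
        return score;
    }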
@@ -88,56 +89,64 @@ static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
    register vec_s32 res = vec_splat_s32(0), t;
    int32_t ires;

    for (i = 0; i < order; i += 8) {
        vec1 = vec_unaligned_load(v1);
        t    = vec_msum(vec1, vec_ld(0, v2), zero_s32v);
        res  = vec_sums(t, res);
        v1  += 8;
        v2  += 8;
    }
    res = vec_splat(res, 3);
    vec_ste(res, 0, &ires);

    return ires;
}

static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
                                                    const int16_t *v2,
                                                    const int16_t *v3,
                                                    int order, int mul)
{
    LOAD_ZERO;
    vec_s16 *pv1 = (vec_s16 *) v1;
    register vec_s16 muls = { mul, mul, mul, mul, mul, mul, mul, mul };
    register vec_s16 t0, t1, i0, i1, i4;
    register vec_s16 i2 = vec_ld(0, v2), i3 = vec_ld(0, v3);
    register vec_s32 res = zero_s32v;
    register vec_u8 align = vec_lvsl(0, v2);
    int32_t ires;

    order >>= 4;
    do {
        i1     = vec_ld(16, v2);
        t0     = vec_perm(i2, i1, align);
        i2     = vec_ld(32, v2);
        t1     = vec_perm(i1, i2, align);
        i0     = pv1[0];
        i1     = pv1[1];
        res    = vec_msum(t0, i0, res);
        res    = vec_msum(t1, i1, res);
        i4     = vec_ld(16, v3);
        t0     = vec_perm(i3, i4, align);
        i3     = vec_ld(32, v3);
        t1     = vec_perm(i4, i3, align);
        pv1[0] = vec_mladd(t0, muls, i0);
        pv1[1] = vec_mladd(t1, muls, i1);
        pv1   += 2;
        v2    += 16;
        v3    += 16;
    } while (--order);
    res = vec_splat(vec_sums(res, zero_s32v), 3);
    vec_ste(res, 0, &ires);

    return ires;
}

av_cold void ff_int_init_altivec(DSPContext *c, AVCodecContext *avctx)
{
    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_altivec;

    c->scalarproduct_int16          = scalarproduct_int16_altivec;
    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_altivec;
}
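A scalar restatement of what scalarproduct_and_madd_int16 computes may help when reading the permute-heavy loop above: it returns the dot product of v1 and v2 while updating v1 in place with v3 scaled by mul (sketch under the assumption that order is a multiple of 16, as the AltiVec path requires):

    static int32_t scalarproduct_and_madd_int16_ref(int16_t *v1, const int16_t *v2,
                                                    const int16_t *v3,
                                                    int order, int mul)
    {
        int32_t res = 0;
        for (int i = 0; i < order; i++) {
            res   += v1[i] * v2[i];
            v1[i] += v3[i] * mul;
        }
        return res;
    }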