Commit 82ee14d2 authored Jan 15, 2014 by Diego Biurrun

ppc: dsputil: comment formatting and wording/grammar improvements
parent cce791b1

Showing 7 changed files with 180 additions and 199 deletions (+180, -199)
libavcodec/ppc/dsputil_altivec.c   +114  -119
libavcodec/ppc/dsputil_ppc.c        +29   -31
libavcodec/ppc/fdct_altivec.c        +3    -4
libavcodec/ppc/fft_altivec.c         +6    -6
libavcodec/ppc/gmc_altivec.c        +17   -23
libavcodec/ppc/idct_altivec.c        +4    -9
libavcodec/ppc/int_altivec.c         +7    -7
libavcodec/ppc/dsputil_altivec.c (view file @ 82ee14d2)

@@ -47,27 +47,27 @@ static int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size

    sad = (vector unsigned int) vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
         * pix1v: pix1[0] - pix1[15]
         * pix2v: pix2[0] - pix2[15]  pix2iv: pix2[1] - pix2[16] */
        pix1v  = vec_ld(0,  pix1);
        pix2l  = vec_ld(0,  pix2);
        pix2r  = vec_ld(16, pix2);
        pix2v  = vec_perm(pix2l, pix2r, perm1);
        pix2iv = vec_perm(pix2l, pix2r, perm2);

        /* Calculate the average vector. */
        avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector. */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
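For reference, here is a minimal scalar sketch (not part of the commit, names illustrative) of what this routine computes: the sum of absolute differences between a 16-pixel-wide block in pix1 and the rounded horizontal half-pel average of pix2 and pix2 + 1, which is exactly what the vec_avg / max-min-sub sequence above does 16 lanes at a time.

#include <stdint.h>
#include <stdlib.h>   /* abs() */

static int sad16_x2_scalar(const uint8_t *pix1, const uint8_t *pix2,
                           int line_size, int h)
{
    int s = 0;
    for (int i = 0; i < h; i++) {
        for (int j = 0; j < 16; j++) {
            int avg = (pix2[j] + pix2[j + 1] + 1) >> 1;  /* rounded half-pel average */
            s += abs(pix1[j] - avg);
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}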
@@ -91,33 +91,33 @@ static int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size

    sad = (vector unsigned int) vec_splat_u32(0);

    /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one
     * iteration becomes pix2 in the next iteration. We can use this
     * fact to avoid a potentially expensive unaligned read, each
     * time around the loop.
     * Read unaligned pixels into our vectors. The vectors are as follows:
     * pix2v: pix2[0] - pix2[15]
     * Split the pixel vectors into shorts. */
    pix2l = vec_ld(0,  pix2);
    pix2r = vec_ld(15, pix2);
    pix2v = vec_perm(pix2l, pix2r, perm);

    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
         * pix1v: pix1[0] - pix1[15]
         * pix3v: pix3[0] - pix3[15] */
        pix1v = vec_ld(0, pix1);

        pix2l = vec_ld(0,  pix3);
        pix2r = vec_ld(15, pix3);
        pix3v = vec_perm(pix2l, pix2r, perm);

        /* Calculate the average vector. */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector. */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
@@ -126,7 +126,7 @@ static int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size

    }

    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
@@ -157,12 +157,12 @@ static int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_siz

    s = 0;

    /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one
     * iteration becomes pix2 in the next iteration. We can use this
     * fact to avoid a potentially expensive unaligned read, as well
     * as some splitting, and vector addition each time around the loop.
     * Read unaligned pixels into our vectors. The vectors are as follows:
     * pix2v: pix2[0] - pix2[15]  pix2iv: pix2[1] - pix2[16]
     * Split the pixel vectors into shorts. */
    pix2l  = vec_ld(0,  pix2);
    pix2r  = vec_ld(16, pix2);
    pix2v  = vec_perm(pix2l, pix2r, perm1);
@@ -177,8 +177,8 @@ static int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_siz

    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
         * pix1v: pix1[0] - pix1[15]
         * pix3v: pix3[0] - pix3[15]  pix3iv: pix3[1] - pix3[16] */
        pix1v = vec_ld(0, pix1);

        pix2l = vec_ld(0, pix3);
@@ -187,40 +187,40 @@ static int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_siz

        pix3iv = vec_perm(pix2l, pix2r, perm2);

        /* Note that AltiVec does have vec_avg, but this works on vector pairs
         * and rounds up. We could do avg(avg(a, b), avg(c, d)), but the
         * rounding would mean that, for example, avg(3, 0, 0, 1) = 2, when
         * it should be 1. Instead, we have to split the pixel vectors into
         * vectors of shorts and do the averaging by hand. */

        /* Split the pixel vectors into shorts. */
        pix3hv  = (vector unsigned short) vec_mergeh(zero, pix3v);
        pix3lv  = (vector unsigned short) vec_mergel(zero, pix3v);
        pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
        pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);

        /* Do the averaging on them. */
        t3 = vec_add(pix3hv, pix3ihv);
        t4 = vec_add(pix3lv, pix3ilv);

        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);

        /* Pack the shorts back into a result. */
        avgv = vec_pack(avghv, avglv);

        /* Calculate a sum of abs differences vector. */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix3 += line_size;
        /* Transfer the calculated values for pix3 into pix2. */
        t1 = t3;
        t2 = t4;
    }

    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
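A small scalar illustration (not from the file) of the rounding problem the comment in this hunk describes: nesting the round-up two-way average twice can over-count by one, while the widened-to-shorts average computes the exact four-sample result.

#include <stdio.h>

/* vec_avg-style average of two values: rounds up */
static int avg2(int a, int b) { return (a + b + 1) >> 1; }

int main(void)
{
    int nested = avg2(avg2(3, 0), avg2(0, 1));   /* avg(2, 1) = 2 */
    int exact  = (3 + 0 + 0 + 1 + 2) >> 2;       /* = 1, the desired result */
    printf("nested = %d, exact = %d\n", nested, exact);
    return 0;
}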
@@ -242,25 +242,25 @@ static int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, i

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2. */
        vector unsigned char pix2l = vec_ld(0,  pix2);
        vector unsigned char pix2r = vec_ld(15, pix2);
        t1 = vec_ld(0, pix1);
        t2 = vec_perm(pix2l, pix2r, perm);

        /* Calculate a sum of abs differences vector. */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
@@ -283,9 +283,9 @@ static int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, in

    sad = (vector unsigned int) vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2.
         * Since we're reading 16 pixels, and actually only want 8,
         * mask out the last 8 pixels. The 0s don't change the sum. */
        vector unsigned char pix1l = vec_ld(0, pix1);
        vector unsigned char pix1r = vec_ld(7, pix1);
        vector unsigned char pix2l = vec_ld(0, pix2);
@@ -293,19 +293,19 @@ static int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, in

        t1 = vec_and(vec_perm(pix1l, pix1r, perm1), permclear);
        t2 = vec_and(vec_perm(pix2l, pix2r, perm2), permclear);

        /* Calculate a sum of abs differences vector. */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
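The masking mentioned in the comment relies on a constant (permclear) whose definition is not visible in this hunk. A plausible shape for it, shown purely as an assumption for illustration, keeps the first 8 byte lanes and zeroes the rest, so the unwanted half of each 16-byte load contributes nothing to the sum.

#include <altivec.h>

/* Hypothetical mask: 0xFF for the 8 pixels we want, 0x00 for the rest.
 * ANDing a loaded vector with this zeroes the last 8 lanes, so the
 * max/min/sub SAD trick only accumulates the first 8 pixels. */
static const vector unsigned char permclear_example =
    { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 };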
@@ -327,17 +327,17 @@ static int pix_norm1_altivec(uint8_t *pix, int line_size)

    s = 0;

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned pixels. */
        vector unsigned char pixl = vec_ld(0,  pix);
        vector unsigned char pixr = vec_ld(15, pix);
        pixv = vec_perm(pixl, pixr, perm);

        /* Square the values, and add them to our sum. */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }

    /* Sum up the four partial sums, and put the result into s. */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);
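As a point of reference, a scalar sketch of the quantity this function accumulates (the sum of squared pixel values over a 16x16 block); the function name and layout here are illustrative assumptions, not code from the file.

#include <stdint.h>

static int pix_norm1_scalar(const uint8_t *pix, int line_size)
{
    int s = 0;
    for (int i = 0; i < 16; i++) {
        for (int j = 0; j < 16; j++)
            s += pix[j] * pix[j];   /* square each pixel and accumulate */
        pix += line_size;
    }
    return s;
}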
@@ -345,11 +345,8 @@ static int pix_norm1_altivec(uint8_t *pix, int line_size)

    return s;
}

/* Sum of Squared Errors for an 8x8 block, AltiVec-enhanced.
 * It's the sad8_altivec code above w/ squaring added. */
static int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
@@ -365,9 +362,9 @@ static int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, in

    sum = (vector unsigned int) vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2.
         * Since we're reading 16 pixels, and actually only want 8,
         * mask out the last 8 pixels. The 0s don't change the sum. */
        vector unsigned char pix1l = vec_ld(0, pix1);
        vector unsigned char pix1r = vec_ld(7, pix1);
        vector unsigned char pix2l = vec_ld(0, pix2);
@@ -376,21 +373,21 @@ static int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, in

        t2 = vec_and(vec_perm(pix2l, pix2r, perm2), permclear);

        /* Since we want to use unsigned chars, we can take advantage
         * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */

        /* Calculate abs differences vector. */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum. */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s. */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);
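A scalar sketch (illustrative only, names assumed) of the identity used above: with unsigned pixel values, |a - b| can be formed as max(a, b) - min(a, b) without widening or branches on the sign, and squaring that difference gives the same term as (a - b)^2.

#include <stdint.h>

static int sse8_scalar(const uint8_t *pix1, const uint8_t *pix2,
                       int line_size, int h)
{
    int s = 0;
    for (int i = 0; i < h; i++) {
        for (int j = 0; j < 8; j++) {
            /* |a - b| via max/min, as the vector code does lane-wise */
            int hi = pix1[j] > pix2[j] ? pix1[j] : pix2[j];
            int lo = pix1[j] > pix2[j] ? pix2[j] : pix1[j];
            int d  = hi - lo;
            s += d * d;          /* abs(a - b)^2 == (a - b)^2 */
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}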
@@ -398,11 +395,8 @@ static int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, in

    return s;
}

/* Sum of Squared Errors for a 16x16 block, AltiVec-enhanced.
 * It's the sad16_altivec code above w/ squaring added. */
static int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
@@ -416,28 +410,28 @@ static int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, i

    sum = (vector unsigned int) vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2. */
        vector unsigned char pix2l = vec_ld(0,  pix2);
        vector unsigned char pix2r = vec_ld(15, pix2);
        t1 = vec_ld(0, pix1);
        t2 = vec_perm(pix2l, pix2r, perm);

        /* Since we want to use unsigned chars, we can take advantage
         * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */

        /* Calculate abs differences vector. */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum. */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s. */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);
@@ -459,18 +453,18 @@ static int pix_sum_altivec(uint8_t * pix, int line_size)

    sad = (vector unsigned int) vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned 16 pixels into t1. */
        vector unsigned char pixl = vec_ld(0,  pix);
        vector unsigned char pixr = vec_ld(15, pix);
        t1 = vec_perm(pixl, pixr, perm);

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t1, sad);

        pix += line_size;
    }

    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
@@ -487,6 +481,9 @@ static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels, i

    vector signed short shorts;

    for (i = 0; i < 8; i++) {
        /* Read potentially unaligned pixels.
         * We're reading 16 pixels, and actually only want 8,
         * but we simply ignore the extras. */
@@ -494,10 +491,10 @@ static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels, i

        vector unsigned char pixr = vec_ld(7, pixels);
        bytes = vec_perm(pixl, pixr, perm);

        // Convert the bytes into shorts.
        shorts = (vector signed short) vec_mergeh(zero, bytes);

        // Save the data to the block, we assume the block is 16-byte aligned.
        vec_st(shorts, i * 16, (vector signed short *) block);

        pixels += line_size;
@@ -515,60 +512,59 @@ static void diff_pixels_altivec(int16_t *restrict block, const uint8_t *s1,

    vector signed short shorts1, shorts2;

    for (i = 0; i < 4; i++) {
        /* Read potentially unaligned pixels.
         * We're reading 16 pixels, and actually only want 8,
         * but we simply ignore the extras. */
        pixl  = vec_ld(0,  s1);
        pixr  = vec_ld(15, s1);
        bytes = vec_perm(pixl, pixr, perm1);

        // Convert the bytes into shorts.
        shorts1 = (vector signed short) vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels.
        pixl  = vec_ld(0,  s2);
        pixr  = vec_ld(15, s2);
        bytes = vec_perm(pixl, pixr, perm2);

        // Convert the bytes into shorts.
        shorts2 = (vector signed short) vec_mergeh(zero, bytes);

        // Do the subtraction.
        shorts1 = vec_sub(shorts1, shorts2);

        // Save the data to the block, we assume the block is 16-byte aligned.
        vec_st(shorts1, 0, (vector signed short *) block);

        s1    += stride;
        s2    += stride;
        block += 8;

        /* The code below is a copy of the code above...
         * This is a manual unroll. */

        /* Read potentially unaligned pixels.
         * We're reading 16 pixels, and actually only want 8,
         * but we simply ignore the extras. */
        pixl  = vec_ld(0,  s1);
        pixr  = vec_ld(15, s1);
        bytes = vec_perm(pixl, pixr, perm1);

        // Convert the bytes into shorts.
        shorts1 = (vector signed short) vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels.
        pixl  = vec_ld(0,  s2);
        pixr  = vec_ld(15, s2);
        bytes = vec_perm(pixl, pixr, perm2);

        // Convert the bytes into shorts.
        shorts2 = (vector signed short) vec_mergeh(zero, bytes);

        // Do the subtraction.
        shorts1 = vec_sub(shorts1, shorts2);

        // Save the data to the block, we assume the block is 16-byte aligned.
        vec_st(shorts1, 0, (vector signed short *) block);

        s1 += stride;
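For orientation, a plain-C sketch of what this routine produces: an 8x8 block of signed 16-bit differences between two pixel planes, two rows per loop iteration in the vector version above. The function name here is illustrative, not taken from the file.

#include <stdint.h>

static void diff_pixels_scalar(int16_t *block, const uint8_t *s1,
                               const uint8_t *s2, int stride)
{
    for (int i = 0; i < 8; i++) {
        for (int j = 0; j < 8; j++)
            block[j] = s1[j] - s2[j];   /* widen to int16 and subtract */
        s1    += stride;
        s2    += stride;
        block += 8;
    }
}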
@@ -595,14 +591,14 @@ static void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {

    register int i;
    register vector unsigned char vdst, vsrc;

    /* dst and src are 16 bytes-aligned (guaranteed). */
    for (i = 0; (i + 15) < w; i += 16) {
        vdst = vec_ld(i, (unsigned char *) dst);
        vsrc = vec_ld(i, (unsigned char *) src);
        vdst = vec_add(vsrc, vdst);
        vec_st(vdst, i, (unsigned char *) dst);
    }

    /* If w is not a multiple of 16. */
    for (; (i < w); i++) {
        dst[i] = src[i];
    }
@@ -643,8 +639,8 @@ static int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, u

    dst1 = vec_ld(stride * i, dst);                                     \
    dst2 = vec_ld((stride * i) + 15, dst);                              \
    dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst));             \
    /* Promote the unsigned chars to signed shorts. */                  \
    /* We're in the 8x8 function, we only care for the first 8. */      \
    srcV = (vector signed short)vec_mergeh((vector signed char)vzero,   \
                                           (vector signed char)srcO);   \
    dstV = (vector signed short)vec_mergeh((vector signed char)vzero,   \
@@ -713,24 +709,23 @@ static int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, u

}

/*
 * 16x8 works with 16 elements; it allows to avoid replicating loads, and
 * gives the compiler more room for scheduling. It's only used from
 * inside hadamard8_diff16_altivec.
 *
 * Unfortunately, it seems gcc-3.3 is a bit dumb, and the compiled code has
 * a LOT of spill code, it seems gcc (unlike xlc) cannot keep everything in
 * registers by itself. The following code includes hand-made register
 * allocation. It's not clean, but on a 7450 the resulting code is much faster
 * (best case falls from 700+ cycles to 550).
 *
 * xlc doesn't add spill code, but it doesn't know how to schedule for the
 * 7450, and its code isn't much faster than gcc-3.3 on the 7450 (but uses
 * 25% fewer instructions...)
 *
 * On the 970, the hand-made RA is still a win (around 690 vs. around 780),
 * but xlc goes to around 660 on the regular C code...
 */
static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h)
{
    int sum;
    register vector signed short
@@ -805,7 +800,7 @@ static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst,

    dst1 = vec_ld(stride * i, dst);                                     \
    dst2 = vec_ld((stride * i) + 16, dst);                              \
    dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst));             \
    /* Promote the unsigned chars to signed shorts. */                  \
    srcV = (vector signed short)vec_mergeh((vector signed char)vzero,   \
                                           (vector signed char)srcO);   \
    dstV = (vector signed short)vec_mergeh((vector signed char)vzero,   \
libavcodec/ppc/dsputil_ppc.c (view file @ 82ee14d2)

@@ -32,24 +32,23 @@

/* ***** WARNING ***** WARNING ***** WARNING ***** */

/*
 * clear_blocks_dcbz32_ppc will not work properly on PowerPC processors with
 * a cache line size not equal to 32 bytes. Fortunately all processors used
 * by Apple up to at least the 7450 (AKA second generation G4) use 32-byte
 * cache lines. This is due to the use of the 'dcbz' instruction. It simply
 * clears a single cache line to zero, so you need to know the cache line
 * size to use it! It's absurd, but it's fast...
 *
 * update 24/06/2003: Apple released the G5 yesterday, with a PPC970.
 * cache line size: 128 bytes. Oups.
 * The semantics of dcbz was changed, it always clears 32 bytes. So the function
 * below will work, but will be slow. So I fixed check_dcbz_effect to use dcbzl,
 * which is defined to clear a cache line (as dcbz before). So we can still
 * distinguish, and use dcbz (32 bytes) or dcbzl (one cache line) as required.
 *
 * see <http://developer.apple.com/technotes/tn/tn2087.html>
 * and <http://developer.apple.com/technotes/tn/tn2086.html>
 */
static void clear_blocks_dcbz32_ppc(int16_t *blocks)
{
    register int misal = ((unsigned long) blocks & 0x00000010);
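To make the point of that comment concrete, here is a hedged sketch (not from the file) of why the cache line size matters: a dcbz-style clear zeroes exactly one line per instruction, so the step of the loop has to match the hardware line size. The wrapper function and the LINE_SIZE value below are assumptions for illustration only.

#define LINE_SIZE 32   /* assumed: 32-byte cache lines, as on the G4 */

/* Hypothetical wrapper around the dcbz instruction: zeroes the whole
 * cache line containing p. */
static inline void dcbz_line(void *p)
{
    __asm__ volatile ("dcbz 0, %0" :: "r" (p) : "memory");
}

/* Clears n bytes starting at a line-aligned pointer; only correct if
 * LINE_SIZE equals the CPU's real cache line size. */
static void clear_bytes_dcbz(char *p, int n)
{
    for (int i = 0; i < n; i += LINE_SIZE)
        dcbz_line(p + i);
}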
@@ -73,17 +72,17 @@ static void clear_blocks_dcbz32_ppc(int16_t *blocks)

    }
}

/* Same as above, when dcbzl clears a whole 128 bytes cache line
 * i.e. the PPC970 AKA G5. */
#if HAVE_DCBZL
static void clear_blocks_dcbz128_ppc(int16_t *blocks)
{
    register int misal = ((unsigned long) blocks & 0x0000007f);
    register int i = 0;

    if (misal) {
        /* We could probably also optimize this case,
         * but there's not much point as the machines
         * aren't available yet (2003-06-26). */
        memset(blocks, 0, sizeof(int16_t) * 6 * 64);
    }
    else
@@ -99,11 +98,10 @@ static void clear_blocks_dcbz128_ppc(int16_t *blocks)

#endif

#if HAVE_DCBZL
/* Check dcbz report how many bytes are set to 0 by dcbz. */
/* update 24/06/2003: Replace dcbz by dcbzl to get the intended effect
 * (Apple "fixed" dcbz). Unfortunately this cannot be used unless the
 * assembler knows about dcbzl ... */
static long check_dcbzl_effect(void)
{
    register char *fakedata = av_malloc(1024);
@@ -120,8 +118,8 @@ static long check_dcbzl_effect(void)

    memset(fakedata, 0xFF, 1024);

    /* Below the constraint "b" seems to mean "address base register"
     * in gcc-3.3 / RS/6000 speaks. Seems to avoid using r0, so.... */
    __asm__ volatile ("dcbzl %0, %1" :: "b" (fakedata_middle), "r" (zero));

    for (i = 0; i < 1024; i++) {
@@ -144,7 +142,7 @@ av_cold void ff_dsputil_init_ppc(DSPContext *c, AVCodecContext *avctx)

{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

    // common optimizations whether AltiVec is available or not
    if (!high_bit_depth) {
        switch (check_dcbzl_effect()) {
        case 32:
libavcodec/ppc/fdct_altivec.c (view file @ 82ee14d2)

@@ -259,11 +259,10 @@ void ff_fdct_altivec(int16_t *block)

#undef MERGE_S16
    /* }}} */

    /* Some of the initial calculations can be done as vector short
     * before conversion to vector float. The following code section
     * takes advantage of this. */

    /* fdct rows {{{ */
    x0 = ((vector float) vec_add(vs16(b00), vs16(b70)));
    x7 = ((vector float) vec_sub(vs16(b00), vs16(b70)));
libavcodec/ppc/fft_altivec.c (view file @ 82ee14d2)

@@ -27,12 +27,12 @@

#include "libavcodec/fft.h"

/**
 * Do a complex FFT with the parameters defined in ff_fft_init().
 * The input data must be permuted before with s->revtab table.
 * No 1.0 / sqrt(n) normalization is done.
 * AltiVec-enabled:
 * This code assumes that the 'z' pointer is 16 bytes-aligned.
 * It also assumes all FFTComplex are 8 bytes-aligned pairs of floats.
 */
void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z);
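A hedged usage sketch of the contract described in that comment. ff_fft_init(), ff_fft_permute() and ff_fft_end() are the usual libavcodec FFT entry points of this era, but the exact call sequence and buffer size here are assumptions for illustration, not code from the repository.

#include "libavcodec/fft.h"
#include "libavutil/mem.h"

/* 1024-point forward FFT; av_malloc() returns 16-byte-aligned memory,
 * which satisfies the alignment assumption for 'z'. */
static void run_fft_1024(FFTComplex *z /* 16-byte aligned, 1024 entries, already filled */)
{
    FFTContext ctx;
    ff_fft_init(&ctx, 10, 0);       /* nbits = 10, forward transform */
    ff_fft_permute(&ctx, z);        /* apply the s->revtab permutation first */
    ff_fft_calc_altivec(&ctx, z);   /* no 1.0 / sqrt(n) normalization applied */
    ff_fft_end(&ctx);
}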
libavcodec/ppc/gmc_altivec.c (view file @ 82ee14d2)

/*
 * GMC (Global Motion Compensation), AltiVec-enabled
 *
 * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of Libav.
@@ -25,10 +25,8 @@

#include "libavutil/ppc/util_altivec.h"
#include "dsputil_altivec.h"

/* AltiVec-enhanced gmc1. ATM this code assumes stride is a multiple of 8
 * to preserve proper dst alignment. */
void ff_gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int stride, int h, int x16, int y16, int rounder)
{
    const DECLARE_ALIGNED(16, unsigned short, rounder_a) = rounder;
@@ -56,18 +54,16 @@ void ff_gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int

    rounderV = vec_splat((vec_u16) vec_lde(0, &rounder_a), 0);

    /* we'll be able to pick-up our 9 char elements at src from those
     * 32 bytes we load the first batch here, as inside the loop we can
     * reuse 'src + stride' from one iteration as the 'src' of the next. */
    src_0 = vec_ld(0,  src);
    src_1 = vec_ld(16, src);
    srcvA = vec_perm(src_0, src_1, vec_lvsl(0, src));

    if (src_really_odd != 0x0000000F) {
        /* If src & 0xF == 0xF, then (src + 1) is properly aligned
         * on the second vector. */
        srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src));
    } else {
        srcvB = src_1;
@@ -81,17 +77,16 @@ void ff_gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int

        dstv = vec_ld(0, dst);

        /* We'll be able to pick-up our 9 char elements at src + stride from
         * those 32 bytes then reuse the resulting 2 vectors srvcC and srcvD
         * as the next srcvA and srcvB. */
        src_0 = vec_ld(stride +  0, src);
        src_1 = vec_ld(stride + 16, src);
        srcvC = vec_perm(src_0, src_1, vec_lvsl(stride + 0, src));

        if (src_really_odd != 0x0000000F) {
            /* If src & 0xF == 0xF, then (src + 1) is properly aligned
             * on the second vector. */
            srcvD = vec_perm(src_0, src_1, vec_lvsl(stride + 1, src));
        } else {
            srcvD = src_1;
@@ -100,10 +95,9 @@ void ff_gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int

        srcvC = vec_mergeh(vczero, srcvC);
        srcvD = vec_mergeh(vczero, srcvD);

        /* OK, now we (finally) do the math :-)
         * Those four instructions replace 32 int muls & 32 int adds.
         * Isn't AltiVec nice? */
        tempA = vec_mladd((vector unsigned short) srcvA, Av, rounderV);
        tempB = vec_mladd((vector unsigned short) srcvB, Bv, tempA);
        tempC = vec_mladd((vector unsigned short) srcvC, Cv, tempB);
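For context, a scalar sketch of the bilinear weighting those vec_mladd calls implement for one output pixel. The weight formulas follow the usual MPEG-4 gmc1 scheme with 4-bit fractional offsets; the function name and exact rounding parameter are illustrative assumptions, not code from the file.

#include <stdint.h>

static uint8_t gmc1_pixel_scalar(const uint8_t *src, int stride,
                                 int x16, int y16, int rounder)
{
    /* Bilinear weights derived from the 4-bit fractional position. */
    int A = (16 - x16) * (16 - y16);
    int B =       x16  * (16 - y16);
    int C = (16 - x16) *       y16;
    int D =       x16  *       y16;   /* A + B + C + D == 256 */

    return (A * src[0]      + B * src[1] +
            C * src[stride] + D * src[stride + 1] + rounder) >> 8;
}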
libavcodec/ppc/idct_altivec.c (view file @ 82ee14d2)

@@ -18,24 +18,19 @@

 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/* NOTE: This code is based on GPL code from the libmpeg2 project. The
 * author, Michel Lespinasses, has given explicit permission to release
 * under LGPL as part of Libav.
 *
 * Libav integration by Dieter Shirley
 *
 * This file is a direct copy of the AltiVec IDCT module from the libmpeg2
 * project. I've deleted all of the libmpeg2-specific code, renamed the
 * functions and reordered the function parameters. The only change to the
 * IDCT function itself was to factor out the partial transposition, and to
 * perform a full transpose at the end of the function. */

#include <stdlib.h>
#include <string.h>

#include "config.h"

#if HAVE_ALTIVEC_H
libavcodec/ppc/int_altivec.c (view file @ 82ee14d2)

@@ -19,9 +19,9 @@

 */

/**
 * @file
 * miscellaneous integer operations
 */

#include "config.h"

#if HAVE_ALTIVEC_H
@@ -43,8 +43,8 @@ static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,

        int32_t score[4];
    } u;
    u.vscore = vec_splat_s32(0);

// XXX lazy way, fix it later

#define vec_unaligned_load(b) \
    vec_perm(vec_ld(0,b),vec_ld(15,b),vec_lvsl(0, b));
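The macro above is the classic AltiVec unaligned-load idiom. As a self-contained sketch (an illustration, not code from the file), the same thing can be written as a small inline function: two aligned loads that straddle the target address, combined with a permute control vector derived from the address's low bits.

#include <altivec.h>
#include <stdint.h>

static inline vector unsigned char load_unaligned(const uint8_t *p)
{
    vector unsigned char left  = vec_ld(0,  p);    /* aligned load covering p      */
    vector unsigned char right = vec_ld(15, p);    /* aligned load covering p + 15 */
    return vec_perm(left, right, vec_lvsl(0, p));  /* shift the 16 wanted bytes together */
}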
@@ -52,12 +52,12 @@ static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,

    size16 = size >> 4;
    while (size16) {
        // score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
        // load pix1 and the first batch of pix2
        vpix1 = vec_unaligned_load(pix1);
        vpix2 = vec_unaligned_load(pix2);
        pix2 += 8;
        // unpack
        vpix1h = vec_unpackh(vpix1);
        vdiff  = vec_sub(vpix1h, vpix2);
        vpix1l = vec_unpackl(vpix1);