Commit 4c387c70 authored by Mans Rullgard

ppc: dsputil: do unaligned block accesses correctly

To load unaligned vector data in the usual way, explicit vec_ld()
should be used rather than dereferencing a pointer to a vector type.
When the VSX extension is enabled, gcc may compile vector pointer
dereferences using the VSX lxvw4x instruction instead of the lvx
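instruction typically used with Altivec/VMX.  The behaviour of these
instructions with unaligned addresses differs: lvx ignores the low
four bits of the address and loads the enclosing 16-byte-aligned
block, which the vec_lvsl()/vec_perm() realignment idiom relies on,
whereas lxvw4x loads from the exact unaligned address.  It is
therefore important that only lvx is used here.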
Signed-off-by: Mans Rullgard <mans@mansr.com>
parent c75eca9d
...@@ -34,7 +34,9 @@ static int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size ...@@ -34,7 +34,9 @@ static int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size
int i; int i;
int s; int s;
const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
vector unsigned char *tv; vector unsigned char perm1 = vec_lvsl(0, pix2);
vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
vector unsigned char pix2l, pix2r;
vector unsigned char pix1v, pix2v, pix2iv, avgv, t5; vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
vector unsigned int sad; vector unsigned int sad;
vector signed int sumdiffs; vector signed int sumdiffs;
...@@ -45,14 +47,11 @@ static int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size ...@@ -45,14 +47,11 @@ static int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size
/* Read unaligned pixels into our vectors. The vectors are as follows: /* Read unaligned pixels into our vectors. The vectors are as follows:
pix1v: pix1[0]-pix1[15] pix1v: pix1[0]-pix1[15]
pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16] */ pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16] */
tv = (vector unsigned char *) pix1; pix1v = vec_ld( 0, pix1);
pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); pix2l = vec_ld( 0, pix2);
pix2r = vec_ld(16, pix2);
tv = (vector unsigned char *) &pix2[0]; pix2v = vec_perm(pix2l, pix2r, perm1);
pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); pix2iv = vec_perm(pix2l, pix2r, perm2);
tv = (vector unsigned char *) &pix2[1];
pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));
/* Calculate the average vector */ /* Calculate the average vector */
avgv = vec_avg(pix2v, pix2iv); avgv = vec_avg(pix2v, pix2iv);
...@@ -79,7 +78,8 @@ static int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size ...@@ -79,7 +78,8 @@ static int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size
int i; int i;
int s; int s;
const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
vector unsigned char *tv; vector unsigned char perm = vec_lvsl(0, pix2);
vector unsigned char pix2l, pix2r;
vector unsigned char pix1v, pix2v, pix3v, avgv, t5; vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
vector unsigned int sad; vector unsigned int sad;
vector signed int sumdiffs; vector signed int sumdiffs;
...@@ -95,18 +95,19 @@ static int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size ...@@ -95,18 +95,19 @@ static int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size
Read unaligned pixels into our vectors. The vectors are as follows: Read unaligned pixels into our vectors. The vectors are as follows:
pix2v: pix2[0]-pix2[15] pix2v: pix2[0]-pix2[15]
Split the pixel vectors into shorts */ Split the pixel vectors into shorts */
tv = (vector unsigned char *) &pix2[0]; pix2l = vec_ld( 0, pix2);
pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); pix2r = vec_ld(15, pix2);
pix2v = vec_perm(pix2l, pix2r, perm);
for (i = 0; i < h; i++) { for (i = 0; i < h; i++) {
/* Read unaligned pixels into our vectors. The vectors are as follows: /* Read unaligned pixels into our vectors. The vectors are as follows:
pix1v: pix1[0]-pix1[15] pix1v: pix1[0]-pix1[15]
pix3v: pix3[0]-pix3[15] */ pix3v: pix3[0]-pix3[15] */
tv = (vector unsigned char *) pix1; pix1v = vec_ld(0, pix1);
pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
tv = (vector unsigned char *) &pix3[0]; pix2l = vec_ld( 0, pix3);
pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0])); pix2r = vec_ld(15, pix3);
pix3v = vec_perm(pix2l, pix2r, perm);
/* Calculate the average vector */ /* Calculate the average vector */
avgv = vec_avg(pix2v, pix3v); avgv = vec_avg(pix2v, pix3v);
...@@ -137,7 +138,10 @@ static int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_siz ...@@ -137,7 +138,10 @@ static int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_siz
uint8_t *pix3 = pix2 + line_size; uint8_t *pix3 = pix2 + line_size;
const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2); const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2);
vector unsigned char *tv, avgv, t5; vector unsigned char avgv, t5;
vector unsigned char perm1 = vec_lvsl(0, pix2);
vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
vector unsigned char pix2l, pix2r;
vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv; vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv; vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv; vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
...@@ -157,11 +161,10 @@ static int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_siz ...@@ -157,11 +161,10 @@ static int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_siz
Read unaligned pixels into our vectors. The vectors are as follows: Read unaligned pixels into our vectors. The vectors are as follows:
pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16] pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16]
Split the pixel vectors into shorts */ Split the pixel vectors into shorts */
tv = (vector unsigned char *) &pix2[0]; pix2l = vec_ld( 0, pix2);
pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); pix2r = vec_ld(16, pix2);
pix2v = vec_perm(pix2l, pix2r, perm1);
tv = (vector unsigned char *) &pix2[1]; pix2iv = vec_perm(pix2l, pix2r, perm2);
pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));
pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v); pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v);
pix2lv = (vector unsigned short) vec_mergel(zero, pix2v); pix2lv = (vector unsigned short) vec_mergel(zero, pix2v);
...@@ -174,14 +177,12 @@ static int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_siz ...@@ -174,14 +177,12 @@ static int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_siz
/* Read unaligned pixels into our vectors. The vectors are as follows: /* Read unaligned pixels into our vectors. The vectors are as follows:
pix1v: pix1[0]-pix1[15] pix1v: pix1[0]-pix1[15]
pix3v: pix3[0]-pix3[15] pix3iv: pix3[1]-pix3[16] */ pix3v: pix3[0]-pix3[15] pix3iv: pix3[1]-pix3[16] */
tv = (vector unsigned char *) pix1; pix1v = vec_ld(0, pix1);
pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
tv = (vector unsigned char *) &pix3[0];
pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));
tv = (vector unsigned char *) &pix3[1]; pix2l = vec_ld( 0, pix3);
pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1])); pix2r = vec_ld(16, pix3);
pix3v = vec_perm(pix2l, pix2r, perm1);
pix3iv = vec_perm(pix2l, pix2r, perm2);
/* Note that AltiVec does have vec_avg, but this works on vector pairs /* Note that AltiVec does have vec_avg, but this works on vector pairs
and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding
...@@ -230,7 +231,7 @@ static int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, i ...@@ -230,7 +231,7 @@ static int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, i
int i; int i;
int s; int s;
const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
vector unsigned char perm1, perm2, pix1v_low, pix1v_high, pix2v_low, pix2v_high; vector unsigned char perm = vec_lvsl(0, pix2);
vector unsigned char t1, t2, t3,t4, t5; vector unsigned char t1, t2, t3,t4, t5;
vector unsigned int sad; vector unsigned int sad;
vector signed int sumdiffs; vector signed int sumdiffs;
...@@ -240,14 +241,10 @@ static int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, i ...@@ -240,14 +241,10 @@ static int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, i
for (i = 0; i < h; i++) { for (i = 0; i < h; i++) {
/* Read potentially unaligned pixels into t1 and t2 */ /* Read potentially unaligned pixels into t1 and t2 */
perm1 = vec_lvsl(0, pix1); vector unsigned char pix2l = vec_ld( 0, pix2);
pix1v_high = vec_ld( 0, pix1); vector unsigned char pix2r = vec_ld(15, pix2);
pix1v_low = vec_ld(15, pix1); t1 = vec_ld(0, pix1);
perm2 = vec_lvsl(0, pix2); t2 = vec_perm(pix2l, pix2r, perm);
pix2v_high = vec_ld( 0, pix2);
pix2v_low = vec_ld(15, pix2);
t1 = vec_perm(pix1v_high, pix1v_low, perm1);
t2 = vec_perm(pix2v_high, pix2v_low, perm2);
/* Calculate a sum of abs differences vector */ /* Calculate a sum of abs differences vector */
t3 = vec_max(t1, t2); t3 = vec_max(t1, t2);
...@@ -274,25 +271,25 @@ static int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, in ...@@ -274,25 +271,25 @@ static int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, in
int i; int i;
int s; int s;
const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; const vector unsigned char permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0};
vector unsigned char perm1 = vec_lvsl(0, pix1);
vector unsigned char perm2 = vec_lvsl(0, pix2);
vector unsigned char t1, t2, t3,t4, t5; vector unsigned char t1, t2, t3,t4, t5;
vector unsigned int sad; vector unsigned int sad;
vector signed int sumdiffs; vector signed int sumdiffs;
sad = (vector unsigned int)vec_splat_u32(0); sad = (vector unsigned int)vec_splat_u32(0);
permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0};
for (i = 0; i < h; i++) { for (i = 0; i < h; i++) {
/* Read potentially unaligned pixels into t1 and t2 /* Read potentially unaligned pixels into t1 and t2
Since we're reading 16 pixels, and actually only want 8, Since we're reading 16 pixels, and actually only want 8,
mask out the last 8 pixels. The 0s don't change the sum. */ mask out the last 8 pixels. The 0s don't change the sum. */
perm1 = vec_lvsl(0, pix1); vector unsigned char pix1l = vec_ld( 0, pix1);
pix1v = (vector unsigned char *) pix1; vector unsigned char pix1r = vec_ld(15, pix1);
perm2 = vec_lvsl(0, pix2); vector unsigned char pix2l = vec_ld( 0, pix2);
pix2v = (vector unsigned char *) pix2; vector unsigned char pix2r = vec_ld(15, pix2);
t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear); t1 = vec_and(vec_perm(pix1l, pix1r, perm1), permclear);
t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear); t2 = vec_and(vec_perm(pix2l, pix2r, perm2), permclear);
/* Calculate a sum of abs differences vector */ /* Calculate a sum of abs differences vector */
t3 = vec_max(t1, t2); t3 = vec_max(t1, t2);
...@@ -319,7 +316,7 @@ static int pix_norm1_altivec(uint8_t *pix, int line_size) ...@@ -319,7 +316,7 @@ static int pix_norm1_altivec(uint8_t *pix, int line_size)
int i; int i;
int s; int s;
const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
vector unsigned char *tv; vector unsigned char perm = vec_lvsl(0, pix);
vector unsigned char pixv; vector unsigned char pixv;
vector unsigned int sv; vector unsigned int sv;
vector signed int sum; vector signed int sum;
...@@ -329,8 +326,9 @@ static int pix_norm1_altivec(uint8_t *pix, int line_size) ...@@ -329,8 +326,9 @@ static int pix_norm1_altivec(uint8_t *pix, int line_size)
s = 0; s = 0;
for (i = 0; i < 16; i++) { for (i = 0; i < 16; i++) {
/* Read in the potentially unaligned pixels */ /* Read in the potentially unaligned pixels */
tv = (vector unsigned char *) pix; vector unsigned char pixl = vec_ld( 0, pix);
pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix)); vector unsigned char pixr = vec_ld(15, pix);
pixv = vec_perm(pixl, pixr, perm);
/* Square the values, and add them to our sum */ /* Square the values, and add them to our sum */
sv = vec_msum(pixv, pixv, sv); sv = vec_msum(pixv, pixv, sv);
...@@ -355,26 +353,25 @@ static int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, in ...@@ -355,26 +353,25 @@ static int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, in
int i; int i;
int s; int s;
const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; const vector unsigned char permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0};
vector unsigned char perm1 = vec_lvsl(0, pix1);
vector unsigned char perm2 = vec_lvsl(0, pix2);
vector unsigned char t1, t2, t3,t4, t5; vector unsigned char t1, t2, t3,t4, t5;
vector unsigned int sum; vector unsigned int sum;
vector signed int sumsqr; vector signed int sumsqr;
sum = (vector unsigned int)vec_splat_u32(0); sum = (vector unsigned int)vec_splat_u32(0);
permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0};
for (i = 0; i < h; i++) { for (i = 0; i < h; i++) {
/* Read potentially unaligned pixels into t1 and t2 /* Read potentially unaligned pixels into t1 and t2
Since we're reading 16 pixels, and actually only want 8, Since we're reading 16 pixels, and actually only want 8,
mask out the last 8 pixels. The 0s don't change the sum. */ mask out the last 8 pixels. The 0s don't change the sum. */
perm1 = vec_lvsl(0, pix1); vector unsigned char pix1l = vec_ld( 0, pix1);
pix1v = (vector unsigned char *) pix1; vector unsigned char pix1r = vec_ld(15, pix1);
perm2 = vec_lvsl(0, pix2); vector unsigned char pix2l = vec_ld( 0, pix2);
pix2v = (vector unsigned char *) pix2; vector unsigned char pix2r = vec_ld(15, pix2);
t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear); t1 = vec_and(vec_perm(pix1l, pix1r, perm1), permclear);
t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear); t2 = vec_and(vec_perm(pix2l, pix2r, perm2), permclear);
/* Since we want to use unsigned chars, we can take advantage /* Since we want to use unsigned chars, we can take advantage
of the fact that abs(a-b)^2 = (a-b)^2. */ of the fact that abs(a-b)^2 = (a-b)^2. */
...@@ -409,7 +406,7 @@ static int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, i ...@@ -409,7 +406,7 @@ static int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, i
int i; int i;
int s; int s;
const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
vector unsigned char perm1, perm2, *pix1v, *pix2v; vector unsigned char perm = vec_lvsl(0, pix2);
vector unsigned char t1, t2, t3,t4, t5; vector unsigned char t1, t2, t3,t4, t5;
vector unsigned int sum; vector unsigned int sum;
vector signed int sumsqr; vector signed int sumsqr;
...@@ -418,12 +415,10 @@ static int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, i ...@@ -418,12 +415,10 @@ static int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, i
for (i = 0; i < h; i++) { for (i = 0; i < h; i++) {
/* Read potentially unaligned pixels into t1 and t2 */ /* Read potentially unaligned pixels into t1 and t2 */
perm1 = vec_lvsl(0, pix1); vector unsigned char pix2l = vec_ld( 0, pix2);
pix1v = (vector unsigned char *) pix1; vector unsigned char pix2r = vec_ld(15, pix2);
perm2 = vec_lvsl(0, pix2); t1 = vec_ld(0, pix1);
pix2v = (vector unsigned char *) pix2; t2 = vec_perm(pix2l, pix2r, perm);
t1 = vec_perm(pix1v[0], pix1v[1], perm1);
t2 = vec_perm(pix2v[0], pix2v[1], perm2);
/* Since we want to use unsigned chars, we can take advantage /* Since we want to use unsigned chars, we can take advantage
of the fact that abs(a-b)^2 = (a-b)^2. */ of the fact that abs(a-b)^2 = (a-b)^2. */
...@@ -451,7 +446,7 @@ static int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, i ...@@ -451,7 +446,7 @@ static int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, i
static int pix_sum_altivec(uint8_t * pix, int line_size) static int pix_sum_altivec(uint8_t * pix, int line_size)
{ {
const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
vector unsigned char perm, *pixv; vector unsigned char perm = vec_lvsl(0, pix);
vector unsigned char t1; vector unsigned char t1;
vector unsigned int sad; vector unsigned int sad;
vector signed int sumdiffs; vector signed int sumdiffs;
...@@ -463,9 +458,9 @@ static int pix_sum_altivec(uint8_t * pix, int line_size) ...@@ -463,9 +458,9 @@ static int pix_sum_altivec(uint8_t * pix, int line_size)
for (i = 0; i < 16; i++) { for (i = 0; i < 16; i++) {
/* Read the potentially unaligned 16 pixels into t1 */ /* Read the potentially unaligned 16 pixels into t1 */
perm = vec_lvsl(0, pix); vector unsigned char pixl = vec_ld( 0, pix);
pixv = (vector unsigned char *) pix; vector unsigned char pixr = vec_ld(15, pix);
t1 = vec_perm(pixv[0], pixv[1], perm); t1 = vec_perm(pixl, pixr, perm);
/* Add each 4 pixel group together and put 4 results into sad */ /* Add each 4 pixel group together and put 4 results into sad */
sad = vec_sum4s(t1, sad); sad = vec_sum4s(t1, sad);
...@@ -484,7 +479,8 @@ static int pix_sum_altivec(uint8_t * pix, int line_size) ...@@ -484,7 +479,8 @@ static int pix_sum_altivec(uint8_t * pix, int line_size)
static void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size) static void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{ {
int i; int i;
vector unsigned char perm, bytes, *pixv; vector unsigned char perm = vec_lvsl(0, pixels);
vector unsigned char bytes;
const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
vector signed short shorts; vector signed short shorts;
...@@ -492,9 +488,9 @@ static void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, i ...@@ -492,9 +488,9 @@ static void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, i
// Read potentially unaligned pixels. // Read potentially unaligned pixels.
// We're reading 16 pixels, and actually only want 8, // We're reading 16 pixels, and actually only want 8,
// but we simply ignore the extras. // but we simply ignore the extras.
perm = vec_lvsl(0, pixels); vector unsigned char pixl = vec_ld( 0, pixels);
pixv = (vector unsigned char *) pixels; vector unsigned char pixr = vec_ld(15, pixels);
bytes = vec_perm(pixv[0], pixv[1], perm); bytes = vec_perm(pixl, pixr, perm);
// convert the bytes into shorts // convert the bytes into shorts
shorts = (vector signed short)vec_mergeh(zero, bytes); shorts = (vector signed short)vec_mergeh(zero, bytes);
...@@ -510,7 +506,9 @@ static void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1, ...@@ -510,7 +506,9 @@ static void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
const uint8_t *s2, int stride) const uint8_t *s2, int stride)
{ {
int i; int i;
vector unsigned char perm, bytes, *pixv; vector unsigned char perm1 = vec_lvsl(0, s1);
vector unsigned char perm2 = vec_lvsl(0, s2);
vector unsigned char bytes, pixl, pixr;
const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
vector signed short shorts1, shorts2; vector signed short shorts1, shorts2;
...@@ -518,17 +516,17 @@ static void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1, ...@@ -518,17 +516,17 @@ static void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
// Read potentially unaligned pixels // Read potentially unaligned pixels
// We're reading 16 pixels, and actually only want 8, // We're reading 16 pixels, and actually only want 8,
// but we simply ignore the extras. // but we simply ignore the extras.
perm = vec_lvsl(0, s1); pixl = vec_ld( 0, s1);
pixv = (vector unsigned char *) s1; pixr = vec_ld(15, s1);
bytes = vec_perm(pixv[0], pixv[1], perm); bytes = vec_perm(pixl, pixr, perm1);
// convert the bytes into shorts // convert the bytes into shorts
shorts1 = (vector signed short)vec_mergeh(zero, bytes); shorts1 = (vector signed short)vec_mergeh(zero, bytes);
// Do the same for the second block of pixels // Do the same for the second block of pixels
perm = vec_lvsl(0, s2); pixl = vec_ld( 0, s2);
pixv = (vector unsigned char *) s2; pixr = vec_ld(15, s2);
bytes = vec_perm(pixv[0], pixv[1], perm); bytes = vec_perm(pixl, pixr, perm2);
// convert the bytes into shorts // convert the bytes into shorts
shorts2 = (vector signed short)vec_mergeh(zero, bytes); shorts2 = (vector signed short)vec_mergeh(zero, bytes);
...@@ -550,17 +548,17 @@ static void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1, ...@@ -550,17 +548,17 @@ static void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
// Read potentially unaligned pixels // Read potentially unaligned pixels
// We're reading 16 pixels, and actually only want 8, // We're reading 16 pixels, and actually only want 8,
// but we simply ignore the extras. // but we simply ignore the extras.
perm = vec_lvsl(0, s1); pixl = vec_ld( 0, s1);
pixv = (vector unsigned char *) s1; pixr = vec_ld(15, s1);
bytes = vec_perm(pixv[0], pixv[1], perm); bytes = vec_perm(pixl, pixr, perm1);
// convert the bytes into shorts // convert the bytes into shorts
shorts1 = (vector signed short)vec_mergeh(zero, bytes); shorts1 = (vector signed short)vec_mergeh(zero, bytes);
// Do the same for the second block of pixels // Do the same for the second block of pixels
perm = vec_lvsl(0, s2); pixl = vec_ld( 0, s2);
pixv = (vector unsigned char *) s2; pixr = vec_ld(15, s2);
bytes = vec_perm(pixv[0], pixv[1], perm); bytes = vec_perm(pixl, pixr, perm2);
// convert the bytes into shorts // convert the bytes into shorts
shorts2 = (vector signed short)vec_mergeh(zero, bytes); shorts2 = (vector signed short)vec_mergeh(zero, bytes);
......
...@@ -114,31 +114,31 @@ static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1, const int16_t * ...@@ -114,31 +114,31 @@ static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1, const int16_t *
{ {
LOAD_ZERO; LOAD_ZERO;
vec_s16 *pv1 = (vec_s16*)v1; vec_s16 *pv1 = (vec_s16*)v1;
vec_s16 *pv2 = (vec_s16*)v2;
vec_s16 *pv3 = (vec_s16*)v3;
register vec_s16 muls = {mul,mul,mul,mul,mul,mul,mul,mul}; register vec_s16 muls = {mul,mul,mul,mul,mul,mul,mul,mul};
register vec_s16 t0, t1, i0, i1; register vec_s16 t0, t1, i0, i1, i4;
register vec_s16 i2 = pv2[0], i3 = pv3[0]; register vec_s16 i2 = vec_ld(0, v2), i3 = vec_ld(0, v3);
register vec_s32 res = zero_s32v; register vec_s32 res = zero_s32v;
register vec_u8 align = vec_lvsl(0, v2); register vec_u8 align = vec_lvsl(0, v2);
int32_t ires; int32_t ires;
order >>= 4; order >>= 4;
do { do {
t0 = vec_perm(i2, pv2[1], align); i1 = vec_ld(16, v2);
i2 = pv2[2]; t0 = vec_perm(i2, i1, align);
t1 = vec_perm(pv2[1], i2, align); i2 = vec_ld(32, v2);
t1 = vec_perm(i1, i2, align);
i0 = pv1[0]; i0 = pv1[0];
i1 = pv1[1]; i1 = pv1[1];
res = vec_msum(t0, i0, res); res = vec_msum(t0, i0, res);
res = vec_msum(t1, i1, res); res = vec_msum(t1, i1, res);
t0 = vec_perm(i3, pv3[1], align); i4 = vec_ld(16, v3);
i3 = pv3[2]; t0 = vec_perm(i3, i4, align);
t1 = vec_perm(pv3[1], i3, align); i3 = vec_ld(32, v3);
t1 = vec_perm(i4, i3, align);
pv1[0] = vec_mladd(t0, muls, i0); pv1[0] = vec_mladd(t0, muls, i0);
pv1[1] = vec_mladd(t1, muls, i1); pv1[1] = vec_mladd(t1, muls, i1);
pv1 += 2; pv1 += 2;
pv2 += 2; v2 += 8;
pv3 += 2; v3 += 8;
} while(--order); } while(--order);
res = vec_splat(vec_sums(res, zero_s32v), 3); res = vec_splat(vec_sums(res, zero_s32v), 3);
vec_ste(res, 0, &ires); vec_ste(res, 0, &ires);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment