Commit 98fdfa99 authored by Janne Grunau

ppc: reduce overreads when loading 8 pixels in altivec dsp functions

AltiVec can only load naturally aligned vectors. To handle possibly
unaligned data, a second vector is loaded from an offset of the original
location, and the data is recovered through a vector permutation.
Overreads are minimal if the offset for the second load points to the last
element of the data. That offset is 7 when loading eight 8-bit pixels, so
overreads are reduced from 16 bytes to 8 bytes if the pixels are 64-bit aligned.
For unaligned pixels, the overread is reduced from 23 bytes to 15 bytes
in the worst case.
parent 3fbad007
...@@ -285,10 +285,10 @@ static int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, in ...@@ -285,10 +285,10 @@ static int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, in
/* Read potentially unaligned pixels into t1 and t2 /* Read potentially unaligned pixels into t1 and t2
Since we're reading 16 pixels, and actually only want 8, Since we're reading 16 pixels, and actually only want 8,
mask out the last 8 pixels. The 0s don't change the sum. */ mask out the last 8 pixels. The 0s don't change the sum. */
vector unsigned char pix1l = vec_ld( 0, pix1); vector unsigned char pix1l = vec_ld(0, pix1);
vector unsigned char pix1r = vec_ld(15, pix1); vector unsigned char pix1r = vec_ld(7, pix1);
vector unsigned char pix2l = vec_ld( 0, pix2); vector unsigned char pix2l = vec_ld(0, pix2);
vector unsigned char pix2r = vec_ld(15, pix2); vector unsigned char pix2r = vec_ld(7, pix2);
t1 = vec_and(vec_perm(pix1l, pix1r, perm1), permclear); t1 = vec_and(vec_perm(pix1l, pix1r, perm1), permclear);
t2 = vec_and(vec_perm(pix2l, pix2r, perm2), permclear); t2 = vec_and(vec_perm(pix2l, pix2r, perm2), permclear);
...@@ -367,10 +367,10 @@ static int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, in ...@@ -367,10 +367,10 @@ static int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, in
/* Read potentially unaligned pixels into t1 and t2 /* Read potentially unaligned pixels into t1 and t2
Since we're reading 16 pixels, and actually only want 8, Since we're reading 16 pixels, and actually only want 8,
mask out the last 8 pixels. The 0s don't change the sum. */ mask out the last 8 pixels. The 0s don't change the sum. */
vector unsigned char pix1l = vec_ld( 0, pix1); vector unsigned char pix1l = vec_ld(0, pix1);
vector unsigned char pix1r = vec_ld(15, pix1); vector unsigned char pix1r = vec_ld(7, pix1);
vector unsigned char pix2l = vec_ld( 0, pix2); vector unsigned char pix2l = vec_ld(0, pix2);
vector unsigned char pix2r = vec_ld(15, pix2); vector unsigned char pix2r = vec_ld(7, pix2);
t1 = vec_and(vec_perm(pix1l, pix1r, perm1), permclear); t1 = vec_and(vec_perm(pix1l, pix1r, perm1), permclear);
t2 = vec_and(vec_perm(pix2l, pix2r, perm2), permclear); t2 = vec_and(vec_perm(pix2l, pix2r, perm2), permclear);
...@@ -489,8 +489,8 @@ static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels, i ...@@ -489,8 +489,8 @@ static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels, i
// Read potentially unaligned pixels. // Read potentially unaligned pixels.
// We're reading 16 pixels, and actually only want 8, // We're reading 16 pixels, and actually only want 8,
// but we simply ignore the extras. // but we simply ignore the extras.
vector unsigned char pixl = vec_ld( 0, pixels); vector unsigned char pixl = vec_ld(0, pixels);
vector unsigned char pixr = vec_ld(15, pixels); vector unsigned char pixr = vec_ld(7, pixels);
bytes = vec_perm(pixl, pixr, perm); bytes = vec_perm(pixl, pixr, perm);
// convert the bytes into shorts // convert the bytes into shorts
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment