Commit 42809816 authored by Diego Biurrun

cosmetics attack, part III: Remove all tabs and prettyprint/reindent the code.

Originally committed as revision 23175 to svn://svn.mplayerhq.hu/mplayer/trunk/libswscale
parent ce505b85
...@@ -156,7 +156,7 @@ const uint8_t __attribute__((aligned(8))) dither_8x8_220[8][8]={ ...@@ -156,7 +156,7 @@ const uint8_t __attribute__((aligned(8))) dither_8x8_220[8][8]={
#ifdef HAVE_MMX #ifdef HAVE_MMX
/* hope these constant values are cache line aligned */ /* hope these constant values are cache line aligned */
static uint64_t attribute_used __attribute__((aligned(8))) mmx_00ffw = 0x00ff00ff00ff00ffULL; static uint64_t attribute_used __attribute__((aligned(8))) mmx_00ffw = 0x00ff00ff00ff00ffULL;
static uint64_t attribute_used __attribute__((aligned(8))) mmx_redmask = 0xf8f8f8f8f8f8f8f8ULL; static uint64_t attribute_used __attribute__((aligned(8))) mmx_redmask = 0xf8f8f8f8f8f8f8f8ULL;
static uint64_t attribute_used __attribute__((aligned(8))) mmx_grnmask = 0xfcfcfcfcfcfcfcfcULL; static uint64_t attribute_used __attribute__((aligned(8))) mmx_grnmask = 0xfcfcfcfcfcfcfcfcULL;
...@@ -172,12 +172,12 @@ static volatile uint64_t attribute_used __attribute__((aligned(8))) g6Dither; ...@@ -172,12 +172,12 @@ static volatile uint64_t attribute_used __attribute__((aligned(8))) g6Dither;
static volatile uint64_t attribute_used __attribute__((aligned(8))) r5Dither; static volatile uint64_t attribute_used __attribute__((aligned(8))) r5Dither;
static uint64_t __attribute__((aligned(8))) dither4[2]={ static uint64_t __attribute__((aligned(8))) dither4[2]={
0x0103010301030103LL, 0x0103010301030103LL,
0x0200020002000200LL,}; 0x0200020002000200LL,};
static uint64_t __attribute__((aligned(8))) dither8[2]={ static uint64_t __attribute__((aligned(8))) dither8[2]={
0x0602060206020602LL, 0x0602060206020602LL,
0x0004000400040004LL,}; 0x0004000400040004LL,};
#undef HAVE_MMX #undef HAVE_MMX
...@@ -210,404 +210,404 @@ const int32_t Inverse_Table_6_9[8][4] = { ...@@ -210,404 +210,404 @@ const int32_t Inverse_Table_6_9[8][4] = {
{117579, 136230, 16907, 35559} /* SMPTE 240M (1987) */ {117579, 136230, 16907, 35559} /* SMPTE 240M (1987) */
}; };
#define RGB(i) \ #define RGB(i) \
U = pu[i]; \ U = pu[i]; \
V = pv[i]; \ V = pv[i]; \
r = (void *)c->table_rV[V]; \ r = (void *)c->table_rV[V]; \
g = (void *)(c->table_gU[U] + c->table_gV[V]); \ g = (void *)(c->table_gU[U] + c->table_gV[V]); \
b = (void *)c->table_bU[U]; b = (void *)c->table_bU[U];
/*
 * DST1(i) / DST2(i): convert the two luma samples 2*i and 2*i+1 of the first
 * (py_1 -> dst_1) or second (py_2 -> dst_2) row of the current row pair.
 * Each output element is the sum of the r, g and b table entries indexed by
 * the luma value, using the tables last selected by RGB().
 */
#define DST1(i) \ #define DST1(i) \
Y = py_1[2*i]; \ Y = py_1[2*i]; \
dst_1[2*i] = r[Y] + g[Y] + b[Y]; \ dst_1[2*i] = r[Y] + g[Y] + b[Y]; \
Y = py_1[2*i+1]; \ Y = py_1[2*i+1]; \
dst_1[2*i+1] = r[Y] + g[Y] + b[Y]; dst_1[2*i+1] = r[Y] + g[Y] + b[Y];
#define DST2(i) \ #define DST2(i) \
Y = py_2[2*i]; \ Y = py_2[2*i]; \
dst_2[2*i] = r[Y] + g[Y] + b[Y]; \ dst_2[2*i] = r[Y] + g[Y] + b[Y]; \
Y = py_2[2*i+1]; \ Y = py_2[2*i+1]; \
dst_2[2*i+1] = r[Y] + g[Y] + b[Y]; dst_2[2*i+1] = r[Y] + g[Y] + b[Y];
/*
 * 24-bit variants of DST1/DST2: every pixel is written as three consecutive
 * dst elements instead of one summed element, so pixel pair i occupies
 * dst[6*i] .. dst[6*i+5].
 *   DST1RGB / DST2RGB store the components in r, g, b order.
 *   DST1BGR / DST2BGR store the components in b, g, r order.
 * The "1" macros work on row py_1/dst_1, the "2" macros on row py_2/dst_2.
 */
#define DST1RGB(i) \ #define DST1RGB(i) \
Y = py_1[2*i]; \ Y = py_1[2*i]; \
dst_1[6*i] = r[Y]; dst_1[6*i+1] = g[Y]; dst_1[6*i+2] = b[Y]; \ dst_1[6*i] = r[Y]; dst_1[6*i+1] = g[Y]; dst_1[6*i+2] = b[Y]; \
Y = py_1[2*i+1]; \ Y = py_1[2*i+1]; \
dst_1[6*i+3] = r[Y]; dst_1[6*i+4] = g[Y]; dst_1[6*i+5] = b[Y]; dst_1[6*i+3] = r[Y]; dst_1[6*i+4] = g[Y]; dst_1[6*i+5] = b[Y];
#define DST2RGB(i) \ #define DST2RGB(i) \
Y = py_2[2*i]; \ Y = py_2[2*i]; \
dst_2[6*i] = r[Y]; dst_2[6*i+1] = g[Y]; dst_2[6*i+2] = b[Y]; \ dst_2[6*i] = r[Y]; dst_2[6*i+1] = g[Y]; dst_2[6*i+2] = b[Y]; \
Y = py_2[2*i+1]; \ Y = py_2[2*i+1]; \
dst_2[6*i+3] = r[Y]; dst_2[6*i+4] = g[Y]; dst_2[6*i+5] = b[Y]; dst_2[6*i+3] = r[Y]; dst_2[6*i+4] = g[Y]; dst_2[6*i+5] = b[Y];
#define DST1BGR(i) \ #define DST1BGR(i) \
Y = py_1[2*i]; \ Y = py_1[2*i]; \
dst_1[6*i] = b[Y]; dst_1[6*i+1] = g[Y]; dst_1[6*i+2] = r[Y]; \ dst_1[6*i] = b[Y]; dst_1[6*i+1] = g[Y]; dst_1[6*i+2] = r[Y]; \
Y = py_1[2*i+1]; \ Y = py_1[2*i+1]; \
dst_1[6*i+3] = b[Y]; dst_1[6*i+4] = g[Y]; dst_1[6*i+5] = r[Y]; dst_1[6*i+3] = b[Y]; dst_1[6*i+4] = g[Y]; dst_1[6*i+5] = r[Y];
#define DST2BGR(i) \ #define DST2BGR(i) \
Y = py_2[2*i]; \ Y = py_2[2*i]; \
dst_2[6*i] = b[Y]; dst_2[6*i+1] = g[Y]; dst_2[6*i+2] = r[Y]; \ dst_2[6*i] = b[Y]; dst_2[6*i+1] = g[Y]; dst_2[6*i+2] = r[Y]; \
Y = py_2[2*i+1]; \ Y = py_2[2*i+1]; \
dst_2[6*i+3] = b[Y]; dst_2[6*i+4] = g[Y]; dst_2[6*i+5] = r[Y]; dst_2[6*i+3] = b[Y]; dst_2[6*i+4] = g[Y]; dst_2[6*i+5] = r[Y];
/*
 * PROLOG(func_name, dst_type) / EPILOG(dst_delta): the shared head and tail
 * of every C converter in this file. The code written between the two
 * expansions becomes the body of the innermost loop.
 *
 * PROLOG opens `static int func_name(SwsContext *c, src, srcStride,
 * srcSliceY, srcSliceH, dst, dstStride)` and then:
 *   - doubles srcStride[1] and srcStride[2] when c->srcFormat is
 *     PIX_FMT_YUV422P, so that the (y>>1) chroma row index below steps one
 *     chroma row per luma row pair;
 *   - walks the slice two luma rows at a time (y += 2), setting up dst_1 /
 *     dst_2, py_1 / py_2 and the shared chroma pointers pu / pv;
 *   - runs the inner loop c->dstW>>3 times, declaring U, V and Y for the
 *     body.
 *
 * EPILOG advances pu/pv by 4, py_1/py_2 by 8 and dst_1/dst_2 by dst_delta
 * elements of dst_type, closes both loops and returns srcSliceH.
 *
 * NOTE(review): srcStride[] is modified in place; whether callers pass a
 * private copy is not visible here -- confirm.
 * NOTE(review): pixels beyond the last multiple of 8 in dstW are not written,
 * and with an odd srcSliceH the second row (y+1) is still processed -- confirm
 * that callers guarantee suitable dimensions.
 */
#define PROLOG(func_name, dst_type) \ #define PROLOG(func_name, dst_type) \
static int func_name(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, \ static int func_name(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, \
int srcSliceH, uint8_t* dst[], int dstStride[]){\ int srcSliceH, uint8_t* dst[], int dstStride[]){\
int y;\ int y;\
\ \
if(c->srcFormat == PIX_FMT_YUV422P){\ if (c->srcFormat == PIX_FMT_YUV422P){\
srcStride[1] *= 2;\ srcStride[1] *= 2;\
srcStride[2] *= 2;\ srcStride[2] *= 2;\
}\ }\
for(y=0; y<srcSliceH; y+=2){\ for (y=0; y<srcSliceH; y+=2){\
dst_type *dst_1= (dst_type*)(dst[0] + (y+srcSliceY )*dstStride[0]);\ dst_type *dst_1= (dst_type*)(dst[0] + (y+srcSliceY )*dstStride[0]);\
dst_type *dst_2= (dst_type*)(dst[0] + (y+srcSliceY+1)*dstStride[0]);\ dst_type *dst_2= (dst_type*)(dst[0] + (y+srcSliceY+1)*dstStride[0]);\
dst_type attribute_unused *r, *b;\ dst_type attribute_unused *r, *b;\
dst_type *g;\ dst_type *g;\
uint8_t *py_1= src[0] + y*srcStride[0];\ uint8_t *py_1= src[0] + y*srcStride[0];\
uint8_t *py_2= py_1 + srcStride[0];\ uint8_t *py_2= py_1 + srcStride[0];\
uint8_t *pu= src[1] + (y>>1)*srcStride[1];\ uint8_t *pu= src[1] + (y>>1)*srcStride[1];\
uint8_t *pv= src[2] + (y>>1)*srcStride[2];\ uint8_t *pv= src[2] + (y>>1)*srcStride[2];\
unsigned int h_size= c->dstW>>3;\ unsigned int h_size= c->dstW>>3;\
while (h_size--) {\ while (h_size--) {\
int attribute_unused U, V;\ int attribute_unused U, V;\
int Y;\ int Y;\
#define EPILOG(dst_delta)\ #define EPILOG(dst_delta)\
pu += 4;\ pu += 4;\
pv += 4;\ pv += 4;\
py_1 += 8;\ py_1 += 8;\
py_2 += 8;\ py_2 += 8;\
dst_1 += dst_delta;\ dst_1 += dst_delta;\
dst_2 += dst_delta;\ dst_2 += dst_delta;\
}\ }\
}\ }\
return srcSliceH;\ return srcSliceH;\
} }
/*
 * yuv2rgb_c_32: converter with uint32_t destination elements.
 * Each inner-loop pass handles four chroma pairs (RGB(0..3)); every pair
 * yields two pixels on each of the two rows, so 8 pixels per row are written
 * and EPILOG(8) advances the destination by 8 elements.
 * The order of the row-1 and row-2 stores alternates from one chroma pair to
 * the next.
 */
PROLOG(yuv2rgb_c_32, uint32_t) PROLOG(yuv2rgb_c_32, uint32_t)
RGB(0); RGB(0);
DST1(0); DST1(0);
DST2(0); DST2(0);
RGB(1); RGB(1);
DST2(1); DST2(1);
DST1(1); DST1(1);
RGB(2); RGB(2);
DST1(2); DST1(2);
DST2(2); DST2(2);
RGB(3); RGB(3);
DST2(3); DST2(3);
DST1(3); DST1(3);
EPILOG(8) EPILOG(8)
/*
 * yuv2rgb_c_24_rgb: converter with byte destination elements, three bytes per
 * pixel stored in r, g, b order (DST1RGB/DST2RGB).
 * 8 pixels per row and pass occupy 24 bytes, hence EPILOG(24).
 */
PROLOG(yuv2rgb_c_24_rgb, uint8_t) PROLOG(yuv2rgb_c_24_rgb, uint8_t)
RGB(0); RGB(0);
DST1RGB(0); DST1RGB(0);
DST2RGB(0); DST2RGB(0);
RGB(1); RGB(1);
DST2RGB(1); DST2RGB(1);
DST1RGB(1); DST1RGB(1);
RGB(2); RGB(2);
DST1RGB(2); DST1RGB(2);
DST2RGB(2); DST2RGB(2);
RGB(3); RGB(3);
DST2RGB(3); DST2RGB(3);
DST1RGB(3); DST1RGB(3);
EPILOG(24) EPILOG(24)
/*
 * yuv2rgb_c_24_bgr: identical in structure to yuv2rgb_c_24_rgb; the only
 * difference is that the three bytes of each pixel are stored in b, g, r
 * order (DST1BGR/DST2BGR).
 */
PROLOG(yuv2rgb_c_24_bgr, uint8_t) PROLOG(yuv2rgb_c_24_bgr, uint8_t)
RGB(0); RGB(0);
DST1BGR(0); DST1BGR(0);
DST2BGR(0); DST2BGR(0);
RGB(1); RGB(1);
DST2BGR(1); DST2BGR(1);
DST1BGR(1); DST1BGR(1);
RGB(2); RGB(2);
DST1BGR(2); DST1BGR(2);
DST2BGR(2); DST2BGR(2);
RGB(3); RGB(3);
DST2BGR(3); DST2BGR(3);
DST1BGR(3); DST1BGR(3);
EPILOG(24) EPILOG(24)
/*
 * yuv2rgb_c_16: same body as yuv2rgb_c_32; only the element type of
 * r, g, b, dst_1 and dst_2 differs (uint16_t instead of uint32_t).
 */
PROLOG(yuv2rgb_c_16, uint16_t) PROLOG(yuv2rgb_c_16, uint16_t)
RGB(0); RGB(0);
DST1(0); DST1(0);
DST2(0); DST2(0);
RGB(1); RGB(1);
DST2(1); DST2(1);
DST1(1); DST1(1);
RGB(2); RGB(2);
DST1(2); DST1(2);
DST2(2); DST2(2);
RGB(3); RGB(3);
DST2(3); DST2(3);
DST1(3); DST1(3);
EPILOG(8) EPILOG(8)
/*
 * yuv2rgb_c_8: same body as yuv2rgb_c_32; only the element type of
 * r, g, b, dst_1 and dst_2 differs (uint8_t, one byte per pixel).
 */
PROLOG(yuv2rgb_c_8, uint8_t) PROLOG(yuv2rgb_c_8, uint8_t)
RGB(0); RGB(0);
DST1(0); DST1(0);
DST2(0); DST2(0);
RGB(1); RGB(1);
DST2(1); DST2(1);
DST1(1); DST1(1);
RGB(2); RGB(2);
DST1(2); DST1(2);
DST2(2); DST2(2);
RGB(3); RGB(3);
DST2(3); DST2(3);
DST1(3); DST1(3);
EPILOG(8) EPILOG(8)
/*
 * yuv2rgb_c_8_ordered_dither: one byte per pixel like yuv2rgb_c_8, but a
 * position-dependent dither value is added to the luma before each table
 * lookup.
 *   d32 - row (y&7) of dither_8x8_32, applied to the r and g lookups
 *   d64 - row (y&7) of dither_8x8_73, applied to the b lookup
 */
PROLOG(yuv2rgb_c_8_ordered_dither, uint8_t) PROLOG(yuv2rgb_c_8_ordered_dither, uint8_t)
const uint8_t *d32= dither_8x8_32[y&7]; const uint8_t *d32= dither_8x8_32[y&7];
const uint8_t *d64= dither_8x8_73[y&7]; const uint8_t *d64= dither_8x8_73[y&7];
/* First row of the pair: dither entries o and o+1 of the selected table row. */
#define DST1bpp8(i,o) \ #define DST1bpp8(i,o) \
Y = py_1[2*i]; \ Y = py_1[2*i]; \
dst_1[2*i] = r[Y+d32[0+o]] + g[Y+d32[0+o]] + b[Y+d64[0+o]]; \ dst_1[2*i] = r[Y+d32[0+o]] + g[Y+d32[0+o]] + b[Y+d64[0+o]]; \
Y = py_1[2*i+1]; \ Y = py_1[2*i+1]; \
dst_1[2*i+1] = r[Y+d32[1+o]] + g[Y+d32[1+o]] + b[Y+d64[1+o]]; dst_1[2*i+1] = r[Y+d32[1+o]] + g[Y+d32[1+o]] + b[Y+d64[1+o]];
/*
 * Second row of the pair: entries 8+o and 9+o, i.e. 8 bytes further on.
 * NOTE(review): presumably this lands in the next row of an [8][8] table;
 * the declarations of dither_8x8_32 / dither_8x8_73 are not visible here.
 */
#define DST2bpp8(i,o) \ #define DST2bpp8(i,o) \
Y = py_2[2*i]; \ Y = py_2[2*i]; \
dst_2[2*i] = r[Y+d32[8+o]] + g[Y+d32[8+o]] + b[Y+d64[8+o]]; \ dst_2[2*i] = r[Y+d32[8+o]] + g[Y+d32[8+o]] + b[Y+d64[8+o]]; \
Y = py_2[2*i+1]; \ Y = py_2[2*i+1]; \
dst_2[2*i+1] = r[Y+d32[9+o]] + g[Y+d32[9+o]] + b[Y+d64[9+o]]; dst_2[2*i+1] = r[Y+d32[9+o]] + g[Y+d32[9+o]] + b[Y+d64[9+o]];
RGB(0); RGB(0);
DST1bpp8(0,0); DST1bpp8(0,0);
DST2bpp8(0,0); DST2bpp8(0,0);
RGB(1); RGB(1);
DST2bpp8(1,2); DST2bpp8(1,2);
DST1bpp8(1,2); DST1bpp8(1,2);
RGB(2); RGB(2);
DST1bpp8(2,4); DST1bpp8(2,4);
DST2bpp8(2,4); DST2bpp8(2,4);
RGB(3); RGB(3);
DST2bpp8(3,6); DST2bpp8(3,6);
DST1bpp8(3,6); DST1bpp8(3,6);
EPILOG(8) EPILOG(8)
/*
 * yuv2rgb_c_4: packed 4-bit output, two pixels per destination byte.
 * Unlike yuv2rgb_c_32 this does not use DST1/DST2: the first pixel of each
 * pair goes into the low nibble and the second pixel is shifted left by 4
 * and OR-ed in. 8 pixels therefore occupy 4 bytes per row, hence EPILOG(4).
 */
PROLOG(yuv2rgb_c_4, uint8_t) PROLOG(yuv2rgb_c_4, uint8_t)
int acc; int acc;
/* Pack luma samples 2*i and 2*i+1 of the first row into dst_1[i]. */
#define DST1_4(i) \ #define DST1_4(i) \
Y = py_1[2*i]; \ Y = py_1[2*i]; \
acc = r[Y] + g[Y] + b[Y]; \ acc = r[Y] + g[Y] + b[Y]; \
Y = py_1[2*i+1]; \ Y = py_1[2*i+1]; \
acc |= (r[Y] + g[Y] + b[Y])<<4;\ acc |= (r[Y] + g[Y] + b[Y])<<4; \
dst_1[i] = acc; dst_1[i] = acc;
/* Same for the second row, into dst_2[i]. */
#define DST2_4(i) \ #define DST2_4(i) \
Y = py_2[2*i]; \ Y = py_2[2*i]; \
acc = r[Y] + g[Y] + b[Y]; \ acc = r[Y] + g[Y] + b[Y]; \
Y = py_2[2*i+1]; \ Y = py_2[2*i+1]; \
acc |= (r[Y] + g[Y] + b[Y])<<4;\ acc |= (r[Y] + g[Y] + b[Y])<<4; \
dst_2[i] = acc; dst_2[i] = acc;
RGB(0); RGB(0);
DST1_4(0); DST1_4(0);
DST2_4(0); DST2_4(0);
RGB(1); RGB(1);
DST2_4(1); DST2_4(1);
DST1_4(1); DST1_4(1);
RGB(2); RGB(2);
DST1_4(2); DST1_4(2);
DST2_4(2); DST2_4(2);
RGB(3); RGB(3);
DST2_4(3); DST2_4(3);
DST1_4(3); DST1_4(3);
EPILOG(4) EPILOG(4)
/*
 * yuv2rgb_c_4_ordered_dither: packed 4-bit output (two pixels per byte, as in
 * yuv2rgb_c_4) with a dither value added to the luma before each lookup.
 *   d128 - row (y&7) of dither_8x8_220, applied to the r and b lookups
 *   d64  - row (y&7) of dither_8x8_73, applied to the g lookup
 */
PROLOG(yuv2rgb_c_4_ordered_dither, uint8_t) PROLOG(yuv2rgb_c_4_ordered_dither, uint8_t)
const uint8_t *d64= dither_8x8_73[y&7]; const uint8_t *d64= dither_8x8_73[y&7];
const uint8_t *d128=dither_8x8_220[y&7]; const uint8_t *d128=dither_8x8_220[y&7];
int acc; int acc;
/* First row: dither entries o and o+1; first pixel low nibble, second high. */
#define DST1bpp4(i,o) \ #define DST1bpp4(i,o) \
Y = py_1[2*i]; \ Y = py_1[2*i]; \
acc = r[Y+d128[0+o]] + g[Y+d64[0+o]] + b[Y+d128[0+o]]; \ acc = r[Y+d128[0+o]] + g[Y+d64[0+o]] + b[Y+d128[0+o]]; \
Y = py_1[2*i+1]; \ Y = py_1[2*i+1]; \
acc |= (r[Y+d128[1+o]] + g[Y+d64[1+o]] + b[Y+d128[1+o]])<<4;\ acc |= (r[Y+d128[1+o]] + g[Y+d64[1+o]] + b[Y+d128[1+o]])<<4; \
dst_1[i]= acc; dst_1[i]= acc;
/* Second row: dither entries 8+o and 9+o (8 bytes further into the table). */
#define DST2bpp4(i,o) \ #define DST2bpp4(i,o) \
Y = py_2[2*i]; \ Y = py_2[2*i]; \
acc = r[Y+d128[8+o]] + g[Y+d64[8+o]] + b[Y+d128[8+o]]; \ acc = r[Y+d128[8+o]] + g[Y+d64[8+o]] + b[Y+d128[8+o]]; \
Y = py_2[2*i+1]; \ Y = py_2[2*i+1]; \
acc |= (r[Y+d128[9+o]] + g[Y+d64[9+o]] + b[Y+d128[9+o]])<<4;\ acc |= (r[Y+d128[9+o]] + g[Y+d64[9+o]] + b[Y+d128[9+o]])<<4; \
dst_2[i]= acc; dst_2[i]= acc;
RGB(0); RGB(0);
DST1bpp4(0,0); DST1bpp4(0,0);
DST2bpp4(0,0); DST2bpp4(0,0);
RGB(1); RGB(1);
DST2bpp4(1,2); DST2bpp4(1,2);
DST1bpp4(1,2); DST1bpp4(1,2);
RGB(2); RGB(2);
DST1bpp4(2,4); DST1bpp4(2,4);
DST2bpp4(2,4); DST2bpp4(2,4);
RGB(3); RGB(3);
DST2bpp4(3,6); DST2bpp4(3,6);
DST1bpp4(3,6); DST1bpp4(3,6);
EPILOG(4) EPILOG(4)
/*
 * yuv2rgb_c_4b: same body as yuv2rgb_c_32 with uint8_t elements -- one
 * destination byte per pixel (no nibble packing, unlike yuv2rgb_c_4).
 */
PROLOG(yuv2rgb_c_4b, uint8_t) PROLOG(yuv2rgb_c_4b, uint8_t)
RGB(0); RGB(0);
DST1(0); DST1(0);
DST2(0); DST2(0);
RGB(1); RGB(1);
DST2(1); DST2(1);
DST1(1); DST1(1);
RGB(2); RGB(2);
DST1(2); DST1(2);
DST2(2); DST2(2);
RGB(3); RGB(3);
DST2(3); DST2(3);
DST1(3); DST1(3);
EPILOG(8) EPILOG(8)
/*
 * yuv2rgb_c_4b_ordered_dither: one destination byte per pixel like
 * yuv2rgb_c_4b, with a dither value added to the luma before each lookup.
 *   d128 - row (y&7) of dither_8x8_220, applied to the r and b lookups
 *   d64  - row (y&7) of dither_8x8_73, applied to the g lookup
 */
PROLOG(yuv2rgb_c_4b_ordered_dither, uint8_t) PROLOG(yuv2rgb_c_4b_ordered_dither, uint8_t)
const uint8_t *d64= dither_8x8_73[y&7]; const uint8_t *d64= dither_8x8_73[y&7];
const uint8_t *d128=dither_8x8_220[y&7]; const uint8_t *d128=dither_8x8_220[y&7];
/* First row: dither entries o and o+1. */
#define DST1bpp4b(i,o) \ #define DST1bpp4b(i,o) \
Y = py_1[2*i]; \ Y = py_1[2*i]; \
dst_1[2*i] = r[Y+d128[0+o]] + g[Y+d64[0+o]] + b[Y+d128[0+o]]; \ dst_1[2*i] = r[Y+d128[0+o]] + g[Y+d64[0+o]] + b[Y+d128[0+o]]; \
Y = py_1[2*i+1]; \ Y = py_1[2*i+1]; \
dst_1[2*i+1] = r[Y+d128[1+o]] + g[Y+d64[1+o]] + b[Y+d128[1+o]]; dst_1[2*i+1] = r[Y+d128[1+o]] + g[Y+d64[1+o]] + b[Y+d128[1+o]];
/* Second row: dither entries 8+o and 9+o (8 bytes further into the table). */
#define DST2bpp4b(i,o) \ #define DST2bpp4b(i,o) \
Y = py_2[2*i]; \ Y = py_2[2*i]; \
dst_2[2*i] = r[Y+d128[8+o]] + g[Y+d64[8+o]] + b[Y+d128[8+o]]; \ dst_2[2*i] = r[Y+d128[8+o]] + g[Y+d64[8+o]] + b[Y+d128[8+o]]; \
Y = py_2[2*i+1]; \ Y = py_2[2*i+1]; \
dst_2[2*i+1] = r[Y+d128[9+o]] + g[Y+d64[9+o]] + b[Y+d128[9+o]]; dst_2[2*i+1] = r[Y+d128[9+o]] + g[Y+d64[9+o]] + b[Y+d128[9+o]];
RGB(0); RGB(0);
DST1bpp4b(0,0); DST1bpp4b(0,0);
DST2bpp4b(0,0); DST2bpp4b(0,0);
RGB(1); RGB(1);
DST2bpp4b(1,2); DST2bpp4b(1,2);
DST1bpp4b(1,2); DST1bpp4b(1,2);
RGB(2); RGB(2);
DST1bpp4b(2,4); DST1bpp4b(2,4);
DST2bpp4b(2,4); DST2bpp4b(2,4);
RGB(3); RGB(3);
DST2bpp4b(3,6); DST2bpp4b(3,6);
DST1bpp4b(3,6); DST1bpp4b(3,6);
EPILOG(8) EPILOG(8)
/*
 * yuv2rgb_c_1_ordered_dither: 1 bit per pixel output, 8 pixels per byte.
 * Chroma is ignored: RGB() is never expanded, and g is fixed to the table
 * selected for U = V = 128. For every pixel the dithered luma is looked up
 * in g and the result is shifted into an accumulator
 * (out += out + bit, i.e. out = 2*out + bit), so the first pixel ends up in
 * the most significant bit. After 8 pixels the accumulators are stored to
 * dst_1[0] / dst_2[0] and EPILOG(1) advances each row by one byte.
 *
 * NOTE(review): the accumulators are plain `char`, whose signedness is
 * implementation-defined; the value is stored through a uint8_t pointer.
 */
PROLOG(yuv2rgb_c_1_ordered_dither, uint8_t) PROLOG(yuv2rgb_c_1_ordered_dither, uint8_t)
const uint8_t *d128=dither_8x8_220[y&7]; const uint8_t *d128=dither_8x8_220[y&7];
char out_1=0, out_2=0; char out_1=0, out_2=0;
g= c->table_gU[128] + c->table_gV[128]; g= c->table_gU[128] + c->table_gV[128];
/* Shift two first-row pixels into out_1 (dither entries o and o+1). */
#define DST1bpp1(i,o) \ #define DST1bpp1(i,o) \
Y = py_1[2*i]; \ Y = py_1[2*i]; \
out_1+= out_1 + g[Y+d128[0+o]]; \ out_1+= out_1 + g[Y+d128[0+o]]; \
Y = py_1[2*i+1]; \ Y = py_1[2*i+1]; \
out_1+= out_1 + g[Y+d128[1+o]]; out_1+= out_1 + g[Y+d128[1+o]];
/* Shift two second-row pixels into out_2 (dither entries 8+o and 9+o). */
#define DST2bpp1(i,o) \ #define DST2bpp1(i,o) \
Y = py_2[2*i]; \ Y = py_2[2*i]; \
out_2+= out_2 + g[Y+d128[8+o]]; \ out_2+= out_2 + g[Y+d128[8+o]]; \
Y = py_2[2*i+1]; \ Y = py_2[2*i+1]; \
out_2+= out_2 + g[Y+d128[9+o]]; out_2+= out_2 + g[Y+d128[9+o]];
DST1bpp1(0,0); DST1bpp1(0,0);
DST2bpp1(0,0); DST2bpp1(0,0);
DST2bpp1(1,2); DST2bpp1(1,2);
DST1bpp1(1,2); DST1bpp1(1,2);
DST1bpp1(2,4); DST1bpp1(2,4);
DST2bpp1(2,4); DST2bpp1(2,4);
DST2bpp1(3,6); DST2bpp1(3,6);
DST1bpp1(3,6); DST1bpp1(3,6);
dst_1[0]= out_1; dst_1[0]= out_1;
dst_2[0]= out_2; dst_2[0]= out_2;
EPILOG(1) EPILOG(1)
SwsFunc yuv2rgb_get_func_ptr (SwsContext *c) SwsFunc yuv2rgb_get_func_ptr (SwsContext *c)
{ {
#if defined(HAVE_MMX2) || defined(HAVE_MMX) #if defined(HAVE_MMX2) || defined(HAVE_MMX)
if(c->flags & SWS_CPU_CAPS_MMX2){ if (c->flags & SWS_CPU_CAPS_MMX2){
switch(c->dstFormat){ switch(c->dstFormat){
case PIX_FMT_RGB32: return yuv420_rgb32_MMX2; case PIX_FMT_RGB32: return yuv420_rgb32_MMX2;
case PIX_FMT_BGR24: return yuv420_rgb24_MMX2; case PIX_FMT_BGR24: return yuv420_rgb24_MMX2;
case PIX_FMT_BGR565: return yuv420_rgb16_MMX2; case PIX_FMT_BGR565: return yuv420_rgb16_MMX2;
case PIX_FMT_BGR555: return yuv420_rgb15_MMX2; case PIX_FMT_BGR555: return yuv420_rgb15_MMX2;
} }
} }
if(c->flags & SWS_CPU_CAPS_MMX){ if (c->flags & SWS_CPU_CAPS_MMX){
switch(c->dstFormat){ switch(c->dstFormat){
case PIX_FMT_RGB32: return yuv420_rgb32_MMX; case PIX_FMT_RGB32: return yuv420_rgb32_MMX;
case PIX_FMT_BGR24: return yuv420_rgb24_MMX; case PIX_FMT_BGR24: return yuv420_rgb24_MMX;
case PIX_FMT_BGR565: return yuv420_rgb16_MMX; case PIX_FMT_BGR565: return yuv420_rgb16_MMX;
case PIX_FMT_BGR555: return yuv420_rgb15_MMX; case PIX_FMT_BGR555: return yuv420_rgb15_MMX;
} }
} }
#endif #endif
#ifdef HAVE_MLIB #ifdef HAVE_MLIB
{ {
SwsFunc t= yuv2rgb_init_mlib(c); SwsFunc t= yuv2rgb_init_mlib(c);
if(t) return t; if (t) return t;
} }
#endif #endif
#ifdef HAVE_ALTIVEC #ifdef HAVE_ALTIVEC
if (c->flags & SWS_CPU_CAPS_ALTIVEC) if (c->flags & SWS_CPU_CAPS_ALTIVEC)
{ {
SwsFunc t = yuv2rgb_init_altivec(c); SwsFunc t = yuv2rgb_init_altivec(c);
if(t) return t; if (t) return t;
} }
#endif #endif
...@@ -630,7 +630,7 @@ SwsFunc yuv2rgb_get_func_ptr (SwsContext *c) ...@@ -630,7 +630,7 @@ SwsFunc yuv2rgb_get_func_ptr (SwsContext *c)
case PIX_FMT_BGR4_BYTE: return yuv2rgb_c_4b_ordered_dither; case PIX_FMT_BGR4_BYTE: return yuv2rgb_c_4b_ordered_dither;
case PIX_FMT_MONOBLACK: return yuv2rgb_c_1_ordered_dither; case PIX_FMT_MONOBLACK: return yuv2rgb_c_1_ordered_dither;
default: default:
assert(0); assert(0);
} }
return NULL; return NULL;
} }
...@@ -638,9 +638,9 @@ SwsFunc yuv2rgb_get_func_ptr (SwsContext *c) ...@@ -638,9 +638,9 @@ SwsFunc yuv2rgb_get_func_ptr (SwsContext *c)
/**
 * Integer division rounded to the nearest integer.
 *
 * Half of the divisor is added to the magnitude of the dividend before the
 * truncating division, and the sign of the dividend is restored afterwards,
 * so positive and negative dividends round symmetrically.
 *
 * The intermediate sums are computed in long long: in plain int,
 * "dividend + (divisor>>1)" can overflow near INT_MAX and "-dividend"
 * overflows for INT_MIN, both of which are undefined behaviour.
 *
 * @param dividend value to divide
 * @param divisor  value to divide by; must be non-zero. The callers in this
 *                 file pass a positive constant.
 * @return dividend / divisor, rounded to nearest
 */
static int div_round (int dividend, int divisor)
{
    const long long half = divisor >> 1;
    const long long num  = dividend;

    if (num > 0)
        return (int)((num + half) / divisor);

    return (int)-((-num + half) / divisor);
}
int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation) int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation)
...@@ -667,9 +667,9 @@ int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange, ...@@ -667,9 +667,9 @@ int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange,
int64_t oy = 0; int64_t oy = 0;
//printf("%lld %lld %lld %lld %lld\n", cy, crv, cbu, cgu, cgv); //printf("%lld %lld %lld %lld %lld\n", cy, crv, cbu, cgu, cgv);
if(!fullRange){ if (!fullRange){
cy= (cy*255) / 219; cy= (cy*255) / 219;
oy= 16<<16; oy= 16<<16;
}else{ }else{
crv= (crv*224) / 255; crv= (crv*224) / 255;
cbu= (cbu*224) / 255; cbu= (cbu*224) / 255;
...@@ -686,163 +686,163 @@ int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange, ...@@ -686,163 +686,163 @@ int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange,
oy -= 256*brightness; oy -= 256*brightness;
for (i = 0; i < 1024; i++) { for (i = 0; i < 1024; i++) {
int j; int j;
j= (cy*(((i - 384)<<16) - oy) + (1<<31))>>32; j= (cy*(((i - 384)<<16) - oy) + (1<<31))>>32;
j = (j < 0) ? 0 : ((j > 255) ? 255 : j); j = (j < 0) ? 0 : ((j > 255) ? 255 : j);
table_Y[i] = j; table_Y[i] = j;
} }
switch (bpp) { switch (bpp) {
case 32: case 32:
table_start= table_32 = av_malloc ((197 + 2*682 + 256 + 132) * sizeof (uint32_t)); table_start= table_32 = av_malloc ((197 + 2*682 + 256 + 132) * sizeof (uint32_t));
entry_size = sizeof (uint32_t); entry_size = sizeof (uint32_t);
table_r = table_32 + 197; table_r = table_32 + 197;
table_b = table_32 + 197 + 685; table_b = table_32 + 197 + 685;
table_g = table_32 + 197 + 2*682; table_g = table_32 + 197 + 2*682;
for (i = -197; i < 256+197; i++) for (i = -197; i < 256+197; i++)
((uint32_t *)table_r)[i] = table_Y[i+384] << (isRgb ? 16 : 0); ((uint32_t *)table_r)[i] = table_Y[i+384] << (isRgb ? 16 : 0);
for (i = -132; i < 256+132; i++) for (i = -132; i < 256+132; i++)
((uint32_t *)table_g)[i] = table_Y[i+384] << 8; ((uint32_t *)table_g)[i] = table_Y[i+384] << 8;
for (i = -232; i < 256+232; i++) for (i = -232; i < 256+232; i++)
((uint32_t *)table_b)[i] = table_Y[i+384] << (isRgb ? 0 : 16); ((uint32_t *)table_b)[i] = table_Y[i+384] << (isRgb ? 0 : 16);
break; break;
case 24: case 24:
table_start= table_8 = av_malloc ((256 + 2*232) * sizeof (uint8_t)); table_start= table_8 = av_malloc ((256 + 2*232) * sizeof (uint8_t));
entry_size = sizeof (uint8_t); entry_size = sizeof (uint8_t);
table_r = table_g = table_b = table_8 + 232; table_r = table_g = table_b = table_8 + 232;
for (i = -232; i < 256+232; i++) for (i = -232; i < 256+232; i++)
((uint8_t * )table_b)[i] = table_Y[i+384]; ((uint8_t * )table_b)[i] = table_Y[i+384];
break; break;
case 15: case 15:
case 16: case 16:
table_start= table_16 = av_malloc ((197 + 2*682 + 256 + 132) * sizeof (uint16_t)); table_start= table_16 = av_malloc ((197 + 2*682 + 256 + 132) * sizeof (uint16_t));
entry_size = sizeof (uint16_t); entry_size = sizeof (uint16_t);
table_r = table_16 + 197; table_r = table_16 + 197;
table_b = table_16 + 197 + 685; table_b = table_16 + 197 + 685;
table_g = table_16 + 197 + 2*682; table_g = table_16 + 197 + 2*682;
for (i = -197; i < 256+197; i++) { for (i = -197; i < 256+197; i++) {
int j = table_Y[i+384] >> 3; int j = table_Y[i+384] >> 3;
if (isRgb) if (isRgb)
j <<= ((bpp==16) ? 11 : 10); j <<= ((bpp==16) ? 11 : 10);
((uint16_t *)table_r)[i] = j; ((uint16_t *)table_r)[i] = j;
} }
for (i = -132; i < 256+132; i++) { for (i = -132; i < 256+132; i++) {
int j = table_Y[i+384] >> ((bpp==16) ? 2 : 3); int j = table_Y[i+384] >> ((bpp==16) ? 2 : 3);
((uint16_t *)table_g)[i] = j << 5; ((uint16_t *)table_g)[i] = j << 5;
} }
for (i = -232; i < 256+232; i++) { for (i = -232; i < 256+232; i++) {
int j = table_Y[i+384] >> 3; int j = table_Y[i+384] >> 3;
if (!isRgb) if (!isRgb)
j <<= ((bpp==16) ? 11 : 10); j <<= ((bpp==16) ? 11 : 10);
((uint16_t *)table_b)[i] = j; ((uint16_t *)table_b)[i] = j;
} }
break; break;
case 8: case 8:
table_start= table_332 = av_malloc ((197 + 2*682 + 256 + 132) * sizeof (uint8_t)); table_start= table_332 = av_malloc ((197 + 2*682 + 256 + 132) * sizeof (uint8_t));
entry_size = sizeof (uint8_t); entry_size = sizeof (uint8_t);
table_r = table_332 + 197; table_r = table_332 + 197;
table_b = table_332 + 197 + 685; table_b = table_332 + 197 + 685;
table_g = table_332 + 197 + 2*682; table_g = table_332 + 197 + 2*682;
for (i = -197; i < 256+197; i++) { for (i = -197; i < 256+197; i++) {
int j = (table_Y[i+384 - 16] + 18)/36; int j = (table_Y[i+384 - 16] + 18)/36;
if (isRgb) if (isRgb)
j <<= 5; j <<= 5;
((uint8_t *)table_r)[i] = j; ((uint8_t *)table_r)[i] = j;
} }
for (i = -132; i < 256+132; i++) { for (i = -132; i < 256+132; i++) {
int j = (table_Y[i+384 - 16] + 18)/36; int j = (table_Y[i+384 - 16] + 18)/36;
if (!isRgb) if (!isRgb)
j <<= 1; j <<= 1;
((uint8_t *)table_g)[i] = j << 2; ((uint8_t *)table_g)[i] = j << 2;
} }
for (i = -232; i < 256+232; i++) { for (i = -232; i < 256+232; i++) {
int j = (table_Y[i+384 - 37] + 43)/85; int j = (table_Y[i+384 - 37] + 43)/85;
if (!isRgb) if (!isRgb)
j <<= 6; j <<= 6;
((uint8_t *)table_b)[i] = j; ((uint8_t *)table_b)[i] = j;
} }
break; break;
case 4: case 4:
case 4|128: case 4|128:
table_start= table_121 = av_malloc ((197 + 2*682 + 256 + 132) * sizeof (uint8_t)); table_start= table_121 = av_malloc ((197 + 2*682 + 256 + 132) * sizeof (uint8_t));
entry_size = sizeof (uint8_t); entry_size = sizeof (uint8_t);
table_r = table_121 + 197; table_r = table_121 + 197;
table_b = table_121 + 197 + 685; table_b = table_121 + 197 + 685;
table_g = table_121 + 197 + 2*682; table_g = table_121 + 197 + 2*682;
for (i = -197; i < 256+197; i++) { for (i = -197; i < 256+197; i++) {
int j = table_Y[i+384 - 110] >> 7; int j = table_Y[i+384 - 110] >> 7;
if (isRgb) if (isRgb)
j <<= 3; j <<= 3;
((uint8_t *)table_r)[i] = j; ((uint8_t *)table_r)[i] = j;
} }
for (i = -132; i < 256+132; i++) { for (i = -132; i < 256+132; i++) {
int j = (table_Y[i+384 - 37]+ 43)/85; int j = (table_Y[i+384 - 37]+ 43)/85;
((uint8_t *)table_g)[i] = j << 1; ((uint8_t *)table_g)[i] = j << 1;
} }
for (i = -232; i < 256+232; i++) { for (i = -232; i < 256+232; i++) {
int j =table_Y[i+384 - 110] >> 7; int j =table_Y[i+384 - 110] >> 7;
if (!isRgb) if (!isRgb)
j <<= 3; j <<= 3;
((uint8_t *)table_b)[i] = j; ((uint8_t *)table_b)[i] = j;
} }
break; break;
case 1: case 1:
table_start= table_1 = av_malloc (256*2 * sizeof (uint8_t)); table_start= table_1 = av_malloc (256*2 * sizeof (uint8_t));
entry_size = sizeof (uint8_t); entry_size = sizeof (uint8_t);
table_g = table_1; table_g = table_1;
table_r = table_b = NULL; table_r = table_b = NULL;
for (i = 0; i < 256+256; i++) { for (i = 0; i < 256+256; i++) {
int j = table_Y[i + 384 - 110]>>7; int j = table_Y[i + 384 - 110]>>7;
((uint8_t *)table_g)[i] = j; ((uint8_t *)table_g)[i] = j;
} }
break; break;
default: default:
table_start= NULL; table_start= NULL;
av_log(c, AV_LOG_ERROR, "%ibpp not supported by yuv2rgb\n", bpp); av_log(c, AV_LOG_ERROR, "%ibpp not supported by yuv2rgb\n", bpp);
//free mem? //free mem?
return -1; return -1;
} }
for (i = 0; i < 256; i++) { for (i = 0; i < 256; i++) {
c->table_rV[i] = (uint8_t *)table_r + entry_size * div_round (crv * (i-128), 76309); c->table_rV[i] = (uint8_t *)table_r + entry_size * div_round (crv * (i-128), 76309);
c->table_gU[i] = (uint8_t *)table_g + entry_size * div_round (cgu * (i-128), 76309); c->table_gU[i] = (uint8_t *)table_g + entry_size * div_round (cgu * (i-128), 76309);
c->table_gV[i] = entry_size * div_round (cgv * (i-128), 76309); c->table_gV[i] = entry_size * div_round (cgv * (i-128), 76309);
c->table_bU[i] = (uint8_t *)table_b + entry_size * div_round (cbu * (i-128), 76309); c->table_bU[i] = (uint8_t *)table_b + entry_size * div_round (cbu * (i-128), 76309);
} }
av_free(c->yuvTable); av_free(c->yuvTable);
......
...@@ -139,70 +139,70 @@ typedef signed char sbyte; ...@@ -139,70 +139,70 @@ typedef signed char sbyte;
/*
 * Byte-permutation control vectors used by vec_merge3 (passed as the third
 * argument of vec_perm) to interleave three component vectors into packed
 * 24-bit pixels. In each entry, values 0x00-0x0f pick a byte from the first
 * vec_perm operand and 0x10-0x1f pick a byte from the second one.
 */
static static
const vector unsigned char const vector unsigned char
perm_rgb_0 = (const vector unsigned char)AVV(0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05, perm_rgb_0 = (const vector unsigned char)AVV(0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a), 0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a),
perm_rgb_1 = (const vector unsigned char)AVV(0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17, perm_rgb_1 = (const vector unsigned char)AVV(0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f), 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f),
perm_rgb_2 = (const vector unsigned char)AVV(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17, perm_rgb_2 = (const vector unsigned char)AVV(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05), 0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05),
perm_rgb_3 = (const vector unsigned char)AVV(0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a, perm_rgb_3 = (const vector unsigned char)AVV(0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f); 0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f);
/*
 * vec_merge3(x2,x1,x0,y0,y1,y2): interleave the three input vectors
 * x0, x1, x2 into the three output vectors y0, y1, y2.
 * x0 and x1 are first combined with vec_mergeh / vec_mergel; the results are
 * then permuted together with x2 through perm_rgb_0 .. perm_rgb_3.
 * The temporaries take the type of x0 (GNU typeof), and y0..y2 must be
 * assignable lvalues of a compatible vector type.
 */
#define vec_merge3(x2,x1,x0,y0,y1,y2) \ #define vec_merge3(x2,x1,x0,y0,y1,y2) \
do { \ do { \
typeof(x0) o0,o2,o3; \ typeof(x0) o0,o2,o3; \
o0 = vec_mergeh (x0,x1); \ o0 = vec_mergeh (x0,x1); \
y0 = vec_perm (o0, x2, perm_rgb_0);\ y0 = vec_perm (o0, x2, perm_rgb_0); \
o2 = vec_perm (o0, x2, perm_rgb_1);\ o2 = vec_perm (o0, x2, perm_rgb_1); \
o3 = vec_mergel (x0,x1); \ o3 = vec_mergel (x0,x1); \
y1 = vec_perm (o3,o2,perm_rgb_2); \ y1 = vec_perm (o3,o2,perm_rgb_2); \
y2 = vec_perm (o3,o2,perm_rgb_3); \ y2 = vec_perm (o3,o2,perm_rgb_3); \
} while(0) } while(0)
/*
 * vec_mstbgr24(x0,x1,x2,ptr): interleave three component vectors with
 * vec_merge3 (arguments passed in x0,x1,x2 order) and store the three
 * resulting vectors through ptr with vec_st. ptr is post-incremented once
 * per store, so it must be a modifiable pointer and ends up three vectors
 * further on.
 *
 * NOTE(review): the expansion already ends in ';' after `while (0)`, unlike
 * vec_merge3, so the macro cannot be used as the body of an un-braced
 * if/else. Left as is because the call sites are not visible here.
 */
#define vec_mstbgr24(x0,x1,x2,ptr) \ #define vec_mstbgr24(x0,x1,x2,ptr) \
do { \ do { \
typeof(x0) _0,_1,_2; \ typeof(x0) _0,_1,_2; \
vec_merge3 (x0,x1,x2,_0,_1,_2); \ vec_merge3 (x0,x1,x2,_0,_1,_2); \
vec_st (_0, 0, ptr++); \ vec_st (_0, 0, ptr++); \
vec_st (_1, 0, ptr++); \ vec_st (_1, 0, ptr++); \
vec_st (_2, 0, ptr++); \ vec_st (_2, 0, ptr++); \
} while (0); } while (0);
/*
 * vec_mstrgb24(x0,x1,x2,ptr): same as vec_mstbgr24 except that the three
 * component vectors are handed to vec_merge3 in reversed order (x2,x1,x0).
 * Stores three vectors through ptr and post-increments ptr three times.
 *
 * NOTE(review): the expansion ends in ';' after `while (0)`; see the caveat
 * about use inside un-braced if/else. Call sites are not visible here.
 */
#define vec_mstrgb24(x0,x1,x2,ptr) \ #define vec_mstrgb24(x0,x1,x2,ptr) \
do { \ do { \
typeof(x0) _0,_1,_2; \ typeof(x0) _0,_1,_2; \
vec_merge3 (x2,x1,x0,_0,_1,_2); \ vec_merge3 (x2,x1,x0,_0,_1,_2); \
vec_st (_0, 0, ptr++); \ vec_st (_0, 0, ptr++); \
vec_st (_1, 0, ptr++); \ vec_st (_1, 0, ptr++); \
vec_st (_2, 0, ptr++); \ vec_st (_2, 0, ptr++); \
} while (0); } while (0);
/* pack the pixels in rgb0 format /* pack the pixels in rgb0 format
msb R msb R
lsb 0 lsb 0
*/ */
/*
 * vec_mstrgb32(T,x0,x1,x2,x3,ptr): interleave four component vectors of type
 * T into 4-byte pixels and store them.
 * The high halves of x0/x1 and x2/x3 are byte-merged, then merged again as
 * 16-bit units, giving two output vectors stored at byte offsets 0 and 16
 * from ptr; the low halves are handled the same way and stored at offsets 32
 * and 48. Finally ptr is advanced by 4 elements of its own type.
 *
 * NOTE(review): the expansion ends in ';' after `while (0)`, so it cannot be
 * the body of an un-braced if/else. Call sites are not visible here.
 */
#define vec_mstrgb32(T,x0,x1,x2,x3,ptr) \ #define vec_mstrgb32(T,x0,x1,x2,x3,ptr) \
do { \ do { \
T _0,_1,_2,_3; \ T _0,_1,_2,_3; \
_0 = vec_mergeh (x0,x1); \ _0 = vec_mergeh (x0,x1); \
_1 = vec_mergeh (x2,x3); \ _1 = vec_mergeh (x2,x3); \
_2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \ _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
_3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \ _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
vec_st (_2, 0*16, (T *)ptr); \ vec_st (_2, 0*16, (T *)ptr); \
vec_st (_3, 1*16, (T *)ptr); \ vec_st (_3, 1*16, (T *)ptr); \
_0 = vec_mergel (x0,x1); \ _0 = vec_mergel (x0,x1); \
_1 = vec_mergel (x2,x3); \ _1 = vec_mergel (x2,x3); \
_2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \ _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
_3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \ _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
vec_st (_2, 2*16, (T *)ptr); \ vec_st (_2, 2*16, (T *)ptr); \
vec_st (_3, 3*16, (T *)ptr); \ vec_st (_3, 3*16, (T *)ptr); \
ptr += 4; \ ptr += 4; \
} while (0); } while (0);
/* /*
| 1 0 1.4021 | | Y | | 1 0 1.4021 | | Y |
| 1 -0.3441 -0.7142 |x| Cb| | 1 -0.3441 -0.7142 |x| Cb|
| 1 1.7718 0 | | Cr| | 1 1.7718 0 | | Cr|
Y: [-128 127] Y: [-128 127]
...@@ -216,51 +216,51 @@ do { \ ...@@ -216,51 +216,51 @@ do { \
/*
 * vec_unh(x): widen bytes 0..7 of x to eight 16-bit lanes.
 * The permute pairs each source byte with index 0x10, which selects a byte
 * of the all-zero second operand, so every lane is zero byte + source byte;
 * the result is cast to vector signed short.
 */
#define vec_unh(x) \ #define vec_unh(x) \
(vector signed short) \ (vector signed short) \
vec_perm(x,(typeof(x))AVV(0),\ vec_perm(x,(typeof(x))AVV(0),\
(vector unsigned char)AVV(0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\ (vector unsigned char)AVV(0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07)) 0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07))
/* vec_unl(x): same widening as vec_unh, applied to bytes 8..15 of x. */
#define vec_unl(x) \ #define vec_unl(x) \
(vector signed short) \ (vector signed short) \
vec_perm(x,(typeof(x))AVV(0),\ vec_perm(x,(typeof(x))AVV(0),\
(vector unsigned char)AVV(0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\ (vector unsigned char)AVV(0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F)) 0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F))
/*
 * vec_clip_s16(x): clamp every 16-bit lane of x to the range [16, 235]
 * (vec_min against 235, then vec_max against 16).
 */
#define vec_clip_s16(x) \ #define vec_clip_s16(x) \
vec_max (vec_min (x, (vector signed short)AVV(235,235,235,235,235,235,235,235)),\ vec_max (vec_min (x, (vector signed short)AVV(235,235,235,235,235,235,235,235)),\
(vector signed short)AVV(16, 16, 16, 16, 16, 16, 16, 16 )) (vector signed short)AVV( 16, 16, 16, 16, 16, 16, 16, 16))
/*
 * vec_packclp(x,y): pack two vectors of signed 16-bit lanes into one vector
 * of unsigned bytes. Negative lanes are first raised to 0 with vec_max;
 * vec_packs then narrows the lanes to bytes.
 */
#define vec_packclp(x,y) \ #define vec_packclp(x,y) \
(vector unsigned char)vec_packs \ (vector unsigned char)vec_packs \
((vector unsigned short)vec_max (x,(vector signed short) AVV(0)), \ ((vector unsigned short)vec_max (x,(vector signed short) AVV(0)), \
(vector unsigned short)vec_max (y,(vector signed short) AVV(0))) (vector unsigned short)vec_max (y,(vector signed short) AVV(0)))
//#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,a,a,ptr) //#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,a,a,ptr)
/*
 * cvtyuvtoRGB: convert eight Y/U/V samples held as 16-bit lanes to R, G, B.
 *
 * c        - context supplying the coefficient vectors CY, OY, CBU, CRV,
 *            CGU, CGV and the shift count CSHIFT
 * Y, U, V  - input samples (passed by value; modified locally)
 * R, G, B  - outputs, written through the pointers
 *
 * The results are not clipped or packed here.
 */
static inline void cvtyuvtoRGB (SwsContext *c, static inline void cvtyuvtoRGB (SwsContext *c,
vector signed short Y, vector signed short U, vector signed short V, vector signed short Y, vector signed short U, vector signed short V,
vector signed short *R, vector signed short *G, vector signed short *B) vector signed short *R, vector signed short *G, vector signed short *B)
{ {
vector signed short vx,ux,uvx; vector signed short vx,ux,uvx;
/* Scale and offset luma with a single multiply-round-add: Y*CY + OY. */
Y = vec_mradds (Y, c->CY, c->OY); Y = vec_mradds (Y, c->CY, c->OY);
/* Subtract 128 from both chroma inputs before applying the coefficients. */
U = vec_sub (U,(vector signed short) U = vec_sub (U,(vector signed short)
vec_splat((vector signed short)AVV(128),0)); vec_splat((vector signed short)AVV(128),0));
V = vec_sub (V,(vector signed short) V = vec_sub (V,(vector signed short)
vec_splat((vector signed short)AVV(128),0)); vec_splat((vector signed short)AVV(128),0));
// ux = (CBU*(u<<c->CSHIFT)+0x4000)>>15; // ux = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
ux = vec_sl (U, c->CSHIFT); ux = vec_sl (U, c->CSHIFT);
/* B = scaled luma + CBU term of the pre-shifted U. */
*B = vec_mradds (ux, c->CBU, Y); *B = vec_mradds (ux, c->CBU, Y);
// vx = (CRV*(v<<c->CSHIFT)+0x4000)>>15; // vx = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
vx = vec_sl (V, c->CSHIFT); vx = vec_sl (V, c->CSHIFT);
/* R = scaled luma + CRV term of the pre-shifted V. */
*R = vec_mradds (vx, c->CRV, Y); *R = vec_mradds (vx, c->CRV, Y);
// uvx = ((CGU*u) + (CGV*v))>>15; // uvx = ((CGU*u) + (CGV*v))>>15;
/* G accumulates both chroma terms on top of luma; U and V are used unshifted. */
uvx = vec_mradds (U, c->CGU, Y); uvx = vec_mradds (U, c->CGU, Y);
*G = vec_mradds (V, c->CGV, uvx); *G = vec_mradds (V, c->CGV, uvx);
} }
...@@ -271,164 +271,168 @@ static inline void cvtyuvtoRGB (SwsContext *c, ...@@ -271,164 +271,168 @@ static inline void cvtyuvtoRGB (SwsContext *c,
*/ */
/*
  DEFCSP420_CVT(name, out_pixels) defines altivec_<name>(), a converter that
  reads three planes (in[0] luma, in[1] and in[2] chroma at half width and
  half height) and writes packed pixels to oplanes[0] through the supplied
  out_pixels(R,G,B,ptr) macro.

  Each outer iteration handles two luma rows that share one chroma row; each
  inner iteration handles 16 pixels of both rows.  Returns srcSliceH.
*/
#define DEFCSP420_CVT(name,out_pixels) \
static int altivec_##name (SwsContext *c, \
                           unsigned char **in, int *instrides, \
                           int srcSliceY, int srcSliceH, \
                           unsigned char **oplanes, int *outstrides) \
{ \
    int w = c->srcW; \
    int h = srcSliceH; \
    int i,j; \
    int instrides_scl[3]; \
    vector unsigned char y0,y1; \
    \
    vector signed char  u,v; \
    \
    vector signed short Y0,Y1,Y2,Y3; \
    vector signed short U,V; \
    vector signed short vx,ux,uvx; \
    vector signed short vx0,ux0,uvx0; \
    vector signed short vx1,ux1,uvx1; \
    vector signed short R0,G0,B0; \
    vector signed short R1,G1,B1; \
    vector unsigned char R,G,B; \
    \
    vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP; \
    vector unsigned char align_perm; \
    \
    /* local copies of the conversion coefficients */ \
    vector signed short \
        lCY  = c->CY, \
        lOY  = c->OY, \
        lCRV = c->CRV, \
        lCBU = c->CBU, \
        lCGU = c->CGU, \
        lCGV = c->CGV; \
    \
    vector unsigned short lCSHIFT = c->CSHIFT; \
    \
    /* y1i/y2i walk the even/odd luma row of the current row pair */ \
    ubyte *y1i = in[0]; \
    ubyte *y2i = in[0]+instrides[0]; \
    ubyte *ui  = in[1]; \
    ubyte *vi  = in[2]; \
    \
    /* oute/outo are the matching even/odd output rows */ \
    vector unsigned char *oute \
        = (vector unsigned char *) \
            (oplanes[0]+srcSliceY*outstrides[0]); \
    vector unsigned char *outo \
        = (vector unsigned char *) \
            (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]); \
    \
    \
    instrides_scl[0] = instrides[0]*2-w; /* the loop moves y{1,2}i by w */ \
    instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */ \
    instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */ \
    \
    \
    for (i=0;i<h/2;i++) { \
        vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0); \
        vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1); \
        \
        for (j=0;j<w/16;j++) { \
            \
            y1ivP = (vector unsigned char *)y1i; \
            y2ivP = (vector unsigned char *)y2i; \
            uivP  = (vector unsigned char *)ui; \
            vivP  = (vector unsigned char *)vi; \
            \
            /* each load merges two adjacent vectors using the */ \
            /* permute mask derived from the source address    */ \
            align_perm = vec_lvsl (0, y1i); \
            y0 = (vector unsigned char) \
                 vec_perm (y1ivP[0], y1ivP[1], align_perm); \
            \
            align_perm = vec_lvsl (0, y2i); \
            y1 = (vector unsigned char) \
                 vec_perm (y2ivP[0], y2ivP[1], align_perm); \
            \
            align_perm = vec_lvsl (0, ui); \
            u = (vector signed char) \
                vec_perm (uivP[0], uivP[1], align_perm); \
            \
            align_perm = vec_lvsl (0, vi); \
            v = (vector signed char) \
                vec_perm (vivP[0], vivP[1], align_perm); \
            \
            u = (vector signed char) \
                vec_sub (u,(vector signed char) \
                         vec_splat((vector signed char)AVV(128),0)); \
            v = (vector signed char) \
                vec_sub (v,(vector signed char) \
                         vec_splat((vector signed char)AVV(128),0)); \
            \
            /* only the first 8 chroma samples are used per iteration */ \
            U  = vec_unpackh (u); \
            V  = vec_unpackh (v); \
            \
            \
            Y0 = vec_unh (y0); \
            Y1 = vec_unl (y0); \
            Y2 = vec_unh (y1); \
            Y3 = vec_unl (y1); \
            \
            Y0 = vec_mradds (Y0, lCY, lOY); \
            Y1 = vec_mradds (Y1, lCY, lOY); \
            Y2 = vec_mradds (Y2, lCY, lOY); \
            Y3 = vec_mradds (Y3, lCY, lOY); \
            \
            /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */ \
            ux = vec_sl (U, lCSHIFT); \
            ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0)); \
            ux0  = vec_mergeh (ux,ux); \
            ux1  = vec_mergel (ux,ux); \
            \
            /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;  */ \
            vx = vec_sl (V, lCSHIFT); \
            vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0)); \
            vx0  = vec_mergeh (vx,vx); \
            vx1  = vec_mergel (vx,vx); \
            \
            /* uvx = ((CGU*u) + (CGV*v))>>15 */ \
            uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0)); \
            uvx = vec_mradds (V, lCGV, uvx); \
            uvx0 = vec_mergeh (uvx,uvx); \
            uvx1 = vec_mergel (uvx,uvx); \
            \
            /* even row: Y0/Y1 with the shared chroma terms */ \
            R0 = vec_add (Y0,vx0); \
            G0 = vec_add (Y0,uvx0); \
            B0 = vec_add (Y0,ux0); \
            R1 = vec_add (Y1,vx1); \
            G1 = vec_add (Y1,uvx1); \
            B1 = vec_add (Y1,ux1); \
            \
            R  = vec_packclp (R0,R1); \
            G  = vec_packclp (G0,G1); \
            B  = vec_packclp (B0,B1); \
            \
            out_pixels(R,G,B,oute); \
            \
            /* odd row: Y2/Y3 with the same chroma terms */ \
            R0 = vec_add (Y2,vx0); \
            G0 = vec_add (Y2,uvx0); \
            B0 = vec_add (Y2,ux0); \
            R1 = vec_add (Y3,vx1); \
            G1 = vec_add (Y3,uvx1); \
            B1 = vec_add (Y3,ux1); \
            R  = vec_packclp (R0,R1); \
            G  = vec_packclp (G0,G1); \
            B  = vec_packclp (B0,B1); \
            \
            \
            out_pixels(R,G,B,outo); \
            \
            y1i += 16; \
            y2i += 16; \
            ui  += 8; \
            vi  += 8; \
            \
        } \
        \
        outo += (outstrides[0])>>4; \
        oute += (outstrides[0])>>4; \
        \
        ui  += instrides_scl[1]; \
        vi  += instrides_scl[2]; \
        y1i += instrides_scl[0]; \
        y2i += instrides_scl[0]; \
    } \
    return srcSliceH; \
}
...@@ -444,150 +448,150 @@ DEFCSP420_CVT (yuv2_abgr, out_abgr) ...@@ -444,150 +448,150 @@ DEFCSP420_CVT (yuv2_abgr, out_abgr)
DEFCSP420_CVT (yuv2_bgra, out_bgra) DEFCSP420_CVT (yuv2_bgra, out_bgra)
#else #else
/**
 * Hand-expanded planar 4:2:0 converter kept in the #else branch next to the
 * DEFCSP420_CVT instantiations.
 *
 * Two luma rows sharing one chroma row are converted per outer iteration,
 * 16 pixels per inner iteration.  Returns srcSliceH.
 *
 * The second luma row and the per-row-pair luma advance are derived from
 * instrides[0], exactly as DEFCSP420_CVT does, so a luma stride larger than
 * the width is handled.
 *
 * NOTE(review): despite the name, pixels are stored with out_argb.
 * NOTE(review): luma is fetched with vec_ldl without a vec_lvsl/vec_perm
 * fix-up, unlike the chroma loads here and the luma loads in DEFCSP420_CVT;
 * this presumably relies on 16-byte aligned luma rows -- confirm.
 */
static int altivec_yuv2_bgra32 (SwsContext *c,
                                unsigned char **in, int *instrides,
                                int srcSliceY, int srcSliceH,
                                unsigned char **oplanes, int *outstrides)
{
    int w = c->srcW;
    int h = srcSliceH;
    int i,j;
    int instrides_scl[3];
    vector unsigned char y0,y1;

    vector signed char  u,v;

    vector signed short Y0,Y1,Y2,Y3;
    vector signed short U,V;
    vector signed short vx,ux,uvx;
    vector signed short vx0,ux0,uvx0;
    vector signed short vx1,ux1,uvx1;
    vector signed short R0,G0,B0;
    vector signed short R1,G1,B1;
    vector unsigned char R,G,B;

    vector unsigned char *uivP, *vivP;
    vector unsigned char align_perm;

    vector signed short
        lCY  = c->CY,
        lOY  = c->OY,
        lCRV = c->CRV,
        lCBU = c->CBU,
        lCGU = c->CGU,
        lCGV = c->CGV;

    vector unsigned short lCSHIFT = c->CSHIFT;

    ubyte *y1i = in[0];
    ubyte *y2i = in[0]+instrides[0];
    ubyte *ui  = in[1];
    ubyte *vi  = in[2];

    vector unsigned char *oute
        = (vector unsigned char *)
          (oplanes[0]+srcSliceY*outstrides[0]);
    vector unsigned char *outo
        = (vector unsigned char *)
          (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);


    instrides_scl[0] = instrides[0]*2-w; /* the loop moves y{1,2}i by w */
    instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */
    instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */


    for (i=0;i<h/2;i++) {
        vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);
        vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);

        for (j=0;j<w/16;j++) {

            y0 = vec_ldl (0,y1i);
            y1 = vec_ldl (0,y2i);
            uivP = (vector unsigned char *)ui;
            vivP = (vector unsigned char *)vi;

            align_perm = vec_lvsl (0, ui);
            u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);

            align_perm = vec_lvsl (0, vi);
            v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);
            u = (vector signed char)
                vec_sub (u,(vector signed char)
                         vec_splat((vector signed char)AVV(128),0));

            v = (vector signed char)
                vec_sub (v, (vector signed char)
                         vec_splat((vector signed char)AVV(128),0));

            U  = vec_unpackh (u);
            V  = vec_unpackh (v);


            Y0 = vec_unh (y0);
            Y1 = vec_unl (y0);
            Y2 = vec_unh (y1);
            Y3 = vec_unl (y1);

            Y0 = vec_mradds (Y0, lCY, lOY);
            Y1 = vec_mradds (Y1, lCY, lOY);
            Y2 = vec_mradds (Y2, lCY, lOY);
            Y3 = vec_mradds (Y3, lCY, lOY);

            /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */
            ux = vec_sl (U, lCSHIFT);
            ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0));
            ux0  = vec_mergeh (ux,ux);
            ux1  = vec_mergel (ux,ux);

            /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;  */
            vx = vec_sl (V, lCSHIFT);
            vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0));
            vx0  = vec_mergeh (vx,vx);
            vx1  = vec_mergel (vx,vx);
            /* uvx = ((CGU*u) + (CGV*v))>>15 */
            uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0));
            uvx = vec_mradds (V, lCGV, uvx);
            uvx0 = vec_mergeh (uvx,uvx);
            uvx1 = vec_mergel (uvx,uvx);

            /* even row */
            R0 = vec_add (Y0,vx0);
            G0 = vec_add (Y0,uvx0);
            B0 = vec_add (Y0,ux0);
            R1 = vec_add (Y1,vx1);
            G1 = vec_add (Y1,uvx1);
            B1 = vec_add (Y1,ux1);
            R  = vec_packclp (R0,R1);
            G  = vec_packclp (G0,G1);
            B  = vec_packclp (B0,B1);

            out_argb(R,G,B,oute);

            /* odd row */
            R0 = vec_add (Y2,vx0);
            G0 = vec_add (Y2,uvx0);
            B0 = vec_add (Y2,ux0);
            R1 = vec_add (Y3,vx1);
            G1 = vec_add (Y3,uvx1);
            B1 = vec_add (Y3,ux1);
            R  = vec_packclp (R0,R1);
            G  = vec_packclp (G0,G1);
            B  = vec_packclp (B0,B1);

            out_argb(R,G,B,outo);

            y1i += 16;
            y2i += 16;
            ui  += 8;
            vi  += 8;

        }

        outo += (outstrides[0])>>4;
        oute += (outstrides[0])>>4;

        ui  += instrides_scl[1];
        vi  += instrides_scl[2];
        y1i += instrides_scl[0];
        y2i += instrides_scl[0];
    }
    return srcSliceH;
}
#endif #endif
...@@ -603,77 +607,77 @@ DEFCSP420_CVT (yuv2_bgr24, out_bgr24) ...@@ -603,77 +607,77 @@ DEFCSP420_CVT (yuv2_bgr24, out_bgr24)
/*
  vec_perm masks used by altivec_uyvy_rgb32 to split one 16-byte vector of
  packed U Y V Y samples into 16-bit lanes.  In every byte pair the first
  index is 0x10, i.e. byte 0 of the second vec_perm operand, and the second
  index selects a byte of the packed input:
    demux_u: bytes 0,0,4,4,8,8,12,12      (each U used for two pixels)
    demux_v: bytes 2,2,6,6,10,10,14,14    (each V used for two pixels)
    demux_y: bytes 1,3,5,7,9,11,13,15     (one Y per pixel)
*/
static
const vector unsigned char
    demux_u = (const vector unsigned char)AVV(0x10,0x00,0x10,0x00,
                                              0x10,0x04,0x10,0x04,
                                              0x10,0x08,0x10,0x08,
                                              0x10,0x0c,0x10,0x0c),
    demux_v = (const vector unsigned char)AVV(0x10,0x02,0x10,0x02,
                                              0x10,0x06,0x10,0x06,
                                              0x10,0x0A,0x10,0x0A,
                                              0x10,0x0E,0x10,0x0E),
    demux_y = (const vector unsigned char)AVV(0x10,0x01,0x10,0x03,
                                              0x10,0x05,0x10,0x07,
                                              0x10,0x09,0x10,0x0B,
                                              0x10,0x0D,0x10,0x0F);
/* /*
this is so I can play live CCIR raw video this is so I can play live CCIR raw video
*/ */
static int altivec_uyvy_rgb32 (SwsContext *c, static int altivec_uyvy_rgb32 (SwsContext *c,
unsigned char **in, int *instrides, unsigned char **in, int *instrides,
int srcSliceY, int srcSliceH, int srcSliceY, int srcSliceH,
unsigned char **oplanes, int *outstrides) unsigned char **oplanes, int *outstrides)
{ {
int w = c->srcW; int w = c->srcW;
int h = srcSliceH; int h = srcSliceH;
int i,j; int i,j;
vector unsigned char uyvy; vector unsigned char uyvy;
vector signed short Y,U,V; vector signed short Y,U,V;
vector signed short R0,G0,B0,R1,G1,B1; vector signed short R0,G0,B0,R1,G1,B1;
vector unsigned char R,G,B; vector unsigned char R,G,B;
vector unsigned char *out; vector unsigned char *out;
ubyte *img; ubyte *img;
img = in[0]; img = in[0];
out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]); out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
for (i=0;i<h;i++) { for (i=0;i<h;i++) {
for (j=0;j<w/16;j++) { for (j=0;j<w/16;j++) {
uyvy = vec_ld (0, img); uyvy = vec_ld (0, img);
U = (vector signed short) U = (vector signed short)
vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u); vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
V = (vector signed short) V = (vector signed short)
vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v); vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
Y = (vector signed short) Y = (vector signed short)
vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y); vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);
cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0); cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
uyvy = vec_ld (16, img); uyvy = vec_ld (16, img);
U = (vector signed short) U = (vector signed short)
vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u); vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
V = (vector signed short) V = (vector signed short)
vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v); vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
Y = (vector signed short) Y = (vector signed short)
vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y); vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);
cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1); cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
R = vec_packclp (R0,R1); R = vec_packclp (R0,R1);
G = vec_packclp (G0,G1); G = vec_packclp (G0,G1);
B = vec_packclp (B0,B1); B = vec_packclp (B0,B1);
// vec_mstbgr24 (R,G,B, out); // vec_mstbgr24 (R,G,B, out);
out_rgba (R,G,B,out); out_rgba (R,G,B,out);
img += 32; img += 32;
}
} }
} return srcSliceH;
return srcSliceH;
} }
...@@ -686,278 +690,278 @@ static int altivec_uyvy_rgb32 (SwsContext *c, ...@@ -686,278 +690,278 @@ static int altivec_uyvy_rgb32 (SwsContext *c,
*/ */
/**
 * Pick the AltiVec converter for the context's source/destination formats.
 *
 * Returns NULL when c->flags lacks SWS_CPU_CAPS_ALTIVEC, when the source
 * width is not a multiple of 16, when a planar source has an odd height, or
 * when the format pair is not listed below.
 */
SwsFunc yuv2rgb_init_altivec (SwsContext *c)
{
    if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))
        return NULL;

    /*
      Widths that are not a multiple of 16 are rejected.  This does not seem
      to matter much: a bunch of videos with abnormal widths were tried and
      mplayer crashes elsewhere, e.g.
      mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
      fails with an X11 "bad match" error.
    */
    if ((c->srcW & 0xf) != 0)    return NULL;

    switch (c->srcFormat) {
    /* NOTE(review): every format in this group is routed to the same
       three-plane converters -- confirm callers only reach this with
       layouts those converters can read. */
    case PIX_FMT_YUV410P:
    case PIX_FMT_YUV420P:
    /*case IMGFMT_CLPL:        ??? */
    case PIX_FMT_GRAY8:
    case PIX_FMT_NV12:
    case PIX_FMT_NV21:
        /* the converters consume two rows at a time */
        if ((c->srcH & 0x1) != 0)
            return NULL;

        switch(c->dstFormat){
        case PIX_FMT_RGB24:
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
            return altivec_yuv2_rgb24;
        case PIX_FMT_BGR24:
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
            return altivec_yuv2_bgr24;
        case PIX_FMT_ARGB:
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
            return altivec_yuv2_argb;
        case PIX_FMT_ABGR:
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
            return altivec_yuv2_abgr;
        case PIX_FMT_RGBA:
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
            return altivec_yuv2_rgba;
        case PIX_FMT_BGRA:
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
            return altivec_yuv2_bgra;
        default: return NULL;
        }
        break;

    case PIX_FMT_UYVY422:
        switch(c->dstFormat){
        case PIX_FMT_BGR32:
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
            return altivec_uyvy_rgb32;
        default: return NULL;
        }
        break;

    }
    return NULL;
}
/**
 * Round a 16.16 fixed-point value to the nearest integer and saturate it to
 * the signed 16-bit range, returned as its uint16_t bit pattern.
 *
 * Results below -0x7FFF yield 0x8000, results above 0x7FFF yield 0x7FFF.
 * Rounding and clamping are done in 64 bits so that large inputs saturate
 * instead of wrapping when narrowed.
 */
static uint16_t roundToInt16(int64_t f){
    int64_t r;

    /* adding the rounding bias would overflow; the result saturates anyway */
    if (f > INT64_MAX - (1<<15))
        return 0x7FFF;

    r = (f + (1<<15))>>16;
    if (r<-0x7FFF)      return 0x8000;
    else if (r> 0x7FFF) return 0x7FFF;
    else                return (uint16_t)r;
}
/**
 * Fill the AltiVec conversion constants of the context (CSHIFT, CY, OY,
 * CRV, CBU, CGU, CGV) from an inverse colour table and the
 * brightness/contrast/saturation settings.
 *
 * The six scalar coefficients are computed into a 16-byte aligned buffer
 * that aliases a vector, then each one is splatted across a full vector.
 *
 * NOTE(review): contrast and saturation are shifted right by 16 before
 * use, so they look like 16.16 fixed-point values -- confirm with callers.
 */
void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation)
{
    union {
        signed short tmp[8] __attribute__ ((aligned(16)));
        vector signed short vec;
    } buf;

    buf.tmp[0] =  ( (0xffffLL) * contrast>>8 )>>9;                      //cy
    buf.tmp[1] =  -256*brightness;                                      //oy
    buf.tmp[2] =  (inv_table[0]>>3) *(contrast>>16)*(saturation>>16);   //crv
    buf.tmp[3] =  (inv_table[1]>>3) *(contrast>>16)*(saturation>>16);   //cbu
    buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16));  //cgu
    buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16));  //cgv


    c->CSHIFT = (vector unsigned short)vec_splat_u16(2);
    c->CY  = vec_splat ((vector signed short)buf.vec, 0);
    c->OY  = vec_splat ((vector signed short)buf.vec, 1);
    c->CRV = vec_splat ((vector signed short)buf.vec, 2);
    c->CBU = vec_splat ((vector signed short)buf.vec, 3);
    c->CGU = vec_splat ((vector signed short)buf.vec, 4);
    c->CGV = vec_splat ((vector signed short)buf.vec, 5);
#if 0
    {
    int i;
    char *v[6]={"cy","oy","crv","cbu","cgu","cgv"};
    for (i=0; i<6; i++)
        printf("%s %d ", v[i],buf.tmp[i] );
        printf("\n");
    }
#endif
}
void void
altivec_yuv2packedX (SwsContext *c, altivec_yuv2packedX (SwsContext *c,
int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
uint8_t *dest, int dstW, int dstY) uint8_t *dest, int dstW, int dstY)
{ {
int i,j; int i,j;
vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V; vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
vector signed short R0,G0,B0,R1,G1,B1; vector signed short R0,G0,B0,R1,G1,B1;
vector unsigned char R,G,B;
vector unsigned char *out,*nout;
vector signed short RND = vec_splat_s16(1<<3); vector unsigned char R,G,B;
vector unsigned short SCL = vec_splat_u16(4); vector unsigned char *out,*nout;
unsigned long scratch[16] __attribute__ ((aligned (16)));
vector signed short *YCoeffs, *CCoeffs; vector signed short RND = vec_splat_s16(1<<3);
vector unsigned short SCL = vec_splat_u16(4);
unsigned long scratch[16] __attribute__ ((aligned (16)));
YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize; vector signed short *YCoeffs, *CCoeffs;
CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;
out = (vector unsigned char *)dest; YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;
for(i=0; i<dstW; i+=16){ out = (vector unsigned char *)dest;
Y0 = RND;
Y1 = RND;
/* extract 16 coeffs from lumSrc */
for(j=0; j<lumFilterSize; j++) {
X0 = vec_ld (0, &lumSrc[j][i]);
X1 = vec_ld (16, &lumSrc[j][i]);
Y0 = vec_mradds (X0, YCoeffs[j], Y0);
Y1 = vec_mradds (X1, YCoeffs[j], Y1);
}
U = RND;
V = RND;
/* extract 8 coeffs from U,V */
for(j=0; j<chrFilterSize; j++) {
X = vec_ld (0, &chrSrc[j][i/2]);
U = vec_mradds (X, CCoeffs[j], U);
X = vec_ld (0, &chrSrc[j][i/2+2048]);
V = vec_mradds (X, CCoeffs[j], V);
}
/* scale and clip signals */
Y0 = vec_sra (Y0, SCL);
Y1 = vec_sra (Y1, SCL);
U = vec_sra (U, SCL);
V = vec_sra (V, SCL);
Y0 = vec_clip_s16 (Y0);
Y1 = vec_clip_s16 (Y1);
U = vec_clip_s16 (U);
V = vec_clip_s16 (V);
/* now we have for (i=0; i<dstW; i+=16){
Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15 Y0 = RND;
U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7 Y1 = RND;
/* extract 16 coeffs from lumSrc */
Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15 for (j=0; j<lumFilterSize; j++) {
U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7 X0 = vec_ld (0, &lumSrc[j][i]);
V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7 X1 = vec_ld (16, &lumSrc[j][i]);
*/ Y0 = vec_mradds (X0, YCoeffs[j], Y0);
Y1 = vec_mradds (X1, YCoeffs[j], Y1);
}
U0 = vec_mergeh (U,U); U = RND;
V0 = vec_mergeh (V,V); V = RND;
/* extract 8 coeffs from U,V */
U1 = vec_mergel (U,U); for (j=0; j<chrFilterSize; j++) {
V1 = vec_mergel (V,V); X = vec_ld (0, &chrSrc[j][i/2]);
U = vec_mradds (X, CCoeffs[j], U);
cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0); X = vec_ld (0, &chrSrc[j][i/2+2048]);
cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1); V = vec_mradds (X, CCoeffs[j], V);
R = vec_packclp (R0,R1);
G = vec_packclp (G0,G1);
B = vec_packclp (B0,B1);
switch(c->dstFormat) {
case PIX_FMT_ABGR: out_abgr (R,G,B,out); break;
case PIX_FMT_BGRA: out_bgra (R,G,B,out); break;
case PIX_FMT_RGBA: out_rgba (R,G,B,out); break;
case PIX_FMT_ARGB: out_argb (R,G,B,out); break;
case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break;
case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break;
default:
{
/* If this is reached, the caller should have called yuv2packedXinC
instead. */
static int printed_error_message;
if(!printed_error_message) {
av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
sws_format_name(c->dstFormat));
printed_error_message=1;
}
return;
} }
}
}
if (i < dstW) {
i -= 16;
Y0 = RND;
Y1 = RND;
/* extract 16 coeffs from lumSrc */
for(j=0; j<lumFilterSize; j++) {
X0 = vec_ld (0, &lumSrc[j][i]);
X1 = vec_ld (16, &lumSrc[j][i]);
Y0 = vec_mradds (X0, YCoeffs[j], Y0);
Y1 = vec_mradds (X1, YCoeffs[j], Y1);
}
U = RND; /* scale and clip signals */
V = RND; Y0 = vec_sra (Y0, SCL);
/* extract 8 coeffs from U,V */ Y1 = vec_sra (Y1, SCL);
for(j=0; j<chrFilterSize; j++) { U = vec_sra (U, SCL);
X = vec_ld (0, &chrSrc[j][i/2]); V = vec_sra (V, SCL);
U = vec_mradds (X, CCoeffs[j], U);
X = vec_ld (0, &chrSrc[j][i/2+2048]); Y0 = vec_clip_s16 (Y0);
V = vec_mradds (X, CCoeffs[j], V); Y1 = vec_clip_s16 (Y1);
U = vec_clip_s16 (U);
V = vec_clip_s16 (V);
/* now we have
Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
*/
U0 = vec_mergeh (U,U);
V0 = vec_mergeh (V,V);
U1 = vec_mergel (U,U);
V1 = vec_mergel (V,V);
cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
R = vec_packclp (R0,R1);
G = vec_packclp (G0,G1);
B = vec_packclp (B0,B1);
switch(c->dstFormat) {
case PIX_FMT_ABGR: out_abgr (R,G,B,out); break;
case PIX_FMT_BGRA: out_bgra (R,G,B,out); break;
case PIX_FMT_RGBA: out_rgba (R,G,B,out); break;
case PIX_FMT_ARGB: out_argb (R,G,B,out); break;
case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break;
case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break;
default:
{
/* If this is reached, the caller should have called yuv2packedXinC
instead. */
static int printed_error_message;
if (!printed_error_message) {
av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
sws_format_name(c->dstFormat));
printed_error_message=1;
}
return;
}
}
} }
/* scale and clip signals */ if (i < dstW) {
Y0 = vec_sra (Y0, SCL); i -= 16;
Y1 = vec_sra (Y1, SCL);
U = vec_sra (U, SCL); Y0 = RND;
V = vec_sra (V, SCL); Y1 = RND;
/* extract 16 coeffs from lumSrc */
Y0 = vec_clip_s16 (Y0); for (j=0; j<lumFilterSize; j++) {
Y1 = vec_clip_s16 (Y1); X0 = vec_ld (0, &lumSrc[j][i]);
U = vec_clip_s16 (U); X1 = vec_ld (16, &lumSrc[j][i]);
V = vec_clip_s16 (V); Y0 = vec_mradds (X0, YCoeffs[j], Y0);
Y1 = vec_mradds (X1, YCoeffs[j], Y1);
}
/* now we have U = RND;
Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15 V = RND;
U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7 /* extract 8 coeffs from U,V */
for (j=0; j<chrFilterSize; j++) {
X = vec_ld (0, &chrSrc[j][i/2]);
U = vec_mradds (X, CCoeffs[j], U);
X = vec_ld (0, &chrSrc[j][i/2+2048]);
V = vec_mradds (X, CCoeffs[j], V);
}
Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15 /* scale and clip signals */
U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7 Y0 = vec_sra (Y0, SCL);
V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7 Y1 = vec_sra (Y1, SCL);
*/ U = vec_sra (U, SCL);
V = vec_sra (V, SCL);
Y0 = vec_clip_s16 (Y0);
Y1 = vec_clip_s16 (Y1);
U = vec_clip_s16 (U);
V = vec_clip_s16 (V);
/* now we have
Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7
Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
*/
U0 = vec_mergeh (U,U);
V0 = vec_mergeh (V,V);
U1 = vec_mergel (U,U);
V1 = vec_mergel (V,V);
cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
R = vec_packclp (R0,R1);
G = vec_packclp (G0,G1);
B = vec_packclp (B0,B1);
nout = (vector unsigned char *)scratch;
switch(c->dstFormat) {
case PIX_FMT_ABGR: out_abgr (R,G,B,nout); break;
case PIX_FMT_BGRA: out_bgra (R,G,B,nout); break;
case PIX_FMT_RGBA: out_rgba (R,G,B,nout); break;
case PIX_FMT_ARGB: out_argb (R,G,B,nout); break;
case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break;
case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break;
default:
/* Unreachable, I think. */
av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
sws_format_name(c->dstFormat));
return;
}
U0 = vec_mergeh (U,U); memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
V0 = vec_mergeh (V,V);
U1 = vec_mergel (U,U);
V1 = vec_mergel (V,V);
cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
R = vec_packclp (R0,R1);
G = vec_packclp (G0,G1);
B = vec_packclp (B0,B1);
nout = (vector unsigned char *)scratch;
switch(c->dstFormat) {
case PIX_FMT_ABGR: out_abgr (R,G,B,nout); break;
case PIX_FMT_BGRA: out_bgra (R,G,B,nout); break;
case PIX_FMT_RGBA: out_rgba (R,G,B,nout); break;
case PIX_FMT_ARGB: out_argb (R,G,B,nout); break;
case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break;
case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break;
default:
/* Unreachable, I think. */
av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
sws_format_name(c->dstFormat));
return;
} }
memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
}
} }
...@@ -32,55 +32,55 @@ ...@@ -32,55 +32,55 @@
#include "swscale.h" #include "swscale.h"
/**
 * Convert one slice of planar YUV input by handing it to
 * mlib_VideoColorYUV2ARGB420().
 *
 * @param c          scaler context; srcFormat and dstW are read
 * @param src        source planes: src[0] = Y, src[1] and src[2] = chroma
 * @param srcStride  per-plane source strides; entries 1 and 2 are doubled
 *                   IN PLACE when the source format is PIX_FMT_YUV422P
 * @param srcSliceY  row of the destination picture where this slice starts
 * @param srcSliceH  number of rows in this slice
 * @param dst        destination planes; only dst[0] is written
 * @param dstStride  destination strides; only dstStride[0] is used
 * @return srcSliceH
 */
static int mlib_YUV2ARGB420_32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                               int srcSliceH, uint8_t* dst[], int dstStride[])
{
    uint8_t *out;

    /* 4:2:2 input is fed to the 4:2:0 routine with doubled chroma strides
     * (presumably so that every second chroma row is stepped over). */
    if (c->srcFormat == PIX_FMT_YUV422P) {
        srcStride[1] = 2 * srcStride[1];
        srcStride[2] = 2 * srcStride[2];
    }

    /* Only one chroma stride is passed on, so both planes must agree. */
    assert(srcStride[1] == srcStride[2]);

    out = dst[0] + srcSliceY * dstStride[0];
    mlib_VideoColorYUV2ARGB420(out, src[0], src[1], src[2], c->dstW, srcSliceH,
                               dstStride[0], srcStride[0], srcStride[1]);

    return srcSliceH;
}
/**
 * Convert one slice of planar YUV input by handing it to
 * mlib_VideoColorYUV2ABGR420().
 *
 * @param c          scaler context; srcFormat and dstW are read
 * @param src        source planes: src[0] = Y, src[1] and src[2] = chroma
 * @param srcStride  per-plane source strides; entries 1 and 2 are doubled
 *                   IN PLACE when the source format is PIX_FMT_YUV422P
 * @param srcSliceY  row of the destination picture where this slice starts
 * @param srcSliceH  number of rows in this slice
 * @param dst        destination planes; only dst[0] is written
 * @param dstStride  destination strides; only dstStride[0] is used
 * @return srcSliceH
 */
static int mlib_YUV2ABGR420_32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                               int srcSliceH, uint8_t* dst[], int dstStride[])
{
    uint8_t *out;

    /* 4:2:2 input is fed to the 4:2:0 routine with doubled chroma strides
     * (presumably so that every second chroma row is stepped over). */
    if (c->srcFormat == PIX_FMT_YUV422P) {
        srcStride[1] = 2 * srcStride[1];
        srcStride[2] = 2 * srcStride[2];
    }

    /* Only one chroma stride is passed on, so both planes must agree. */
    assert(srcStride[1] == srcStride[2]);

    out = dst[0] + srcSliceY * dstStride[0];
    mlib_VideoColorYUV2ABGR420(out, src[0], src[1], src[2], c->dstW, srcSliceH,
                               dstStride[0], srcStride[0], srcStride[1]);

    return srcSliceH;
}
/**
 * Convert one slice of planar YUV input by handing it to
 * mlib_VideoColorYUV2RGB420().
 *
 * @param c          scaler context; srcFormat and dstW are read
 * @param src        source planes: src[0] = Y, src[1] and src[2] = chroma
 * @param srcStride  per-plane source strides; entries 1 and 2 are doubled
 *                   IN PLACE when the source format is PIX_FMT_YUV422P
 * @param srcSliceY  row of the destination picture where this slice starts
 * @param srcSliceH  number of rows in this slice
 * @param dst        destination planes; only dst[0] is written
 * @param dstStride  destination strides; only dstStride[0] is used
 * @return srcSliceH
 */
static int mlib_YUV2RGB420_24(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                              int srcSliceH, uint8_t* dst[], int dstStride[])
{
    uint8_t *out;

    /* 4:2:2 input is fed to the 4:2:0 routine with doubled chroma strides
     * (presumably so that every second chroma row is stepped over). */
    if (c->srcFormat == PIX_FMT_YUV422P) {
        srcStride[1] = 2 * srcStride[1];
        srcStride[2] = 2 * srcStride[2];
    }

    /* Only one chroma stride is passed on, so both planes must agree. */
    assert(srcStride[1] == srcStride[2]);

    out = dst[0] + srcSliceY * dstStride[0];
    mlib_VideoColorYUV2RGB420(out, src[0], src[1], src[2], c->dstW, srcSliceH,
                              dstStride[0], srcStride[0], srcStride[1]);

    return srcSliceH;
}
/**
 * Pick the mlib-based YUV->RGB converter matching the context's
 * destination pixel format.
 *
 * @param c  scaler context; only dstFormat is read
 * @return the matching converter, or NULL when dstFormat is none of
 *         PIX_FMT_RGB24, PIX_FMT_BGR32, PIX_FMT_RGB32
 */
SwsFunc yuv2rgb_init_mlib(SwsContext *c)
{
    if (c->dstFormat == PIX_FMT_RGB24) {
        return mlib_YUV2RGB420_24;
    }
    /* NOTE(review): BGR32 is paired with the ARGB routine and RGB32 with the
     * ABGR one; presumably this reflects the byte-order naming of the two
     * APIs -- confirm before changing. */
    if (c->dstFormat == PIX_FMT_BGR32) {
        return mlib_YUV2ARGB420_32;
    }
    if (c->dstFormat == PIX_FMT_RGB32) {
        return mlib_YUV2ABGR420_32;
    }
    return NULL;
}
...@@ -47,169 +47,169 @@ ...@@ -47,169 +47,169 @@
#endif #endif
/*
 * YUV2RGB -- inline-asm text (adjacent string literals) that turns 8 luma
 * and 4+4 chroma samples into 8 blue, 8 red and 8 green bytes.
 *
 * Expected on entry:
 *   mm0 = 4 Cb bytes in the low dword (00 00 00 00 u3 u2 u1 u0)
 *   mm1 = 4 Cr bytes in the low dword (00 00 00 00 v3 v2 v1 v0)
 *   mm6 = 8 Y bytes                   (Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0)
 *   mm4 = 0 (it supplies the zero bytes for the byte->word unpack)
 *   %4  = base address used with the *_OFFSET / *_COEFF displacements
 *
 * On exit:
 *   mm0 = B7..B0, mm1 = R7..R0, mm2 = G7..G0 (bytes saturated to 0..255)
 *   mm3, mm4, mm5, mm6 and mm7 are overwritten; mm4 is no longer zero.
 *
 * Even and odd pixels are processed separately: mm0/mm1/mm2 carry
 * blue/red/green for the even pixels, mm3/mm4/mm5 for the odd pixels,
 * mm6 holds even luma and mm7 odd luma.
 */
#define YUV2RGB \
    /* ---- chroma: widen bytes to words ---- */ \
    "punpcklbw %%mm4, %%mm0;" /* Cb -> 00 u3 00 u2 00 u1 00 u0 */ \
    "punpcklbw %%mm4, %%mm1;" /* Cr -> 00 v3 00 v2 00 v1 00 v0 */ \
    \
    "psllw $3, %%mm0;" /* scale Cb up by 8 for extra precision */ \
    "psllw $3, %%mm1;" /* scale Cr up by 8 for extra precision */ \
    \
    "psubsw "U_OFFSET"(%4), %%mm0;" /* Cb -= 128 */ \
    "psubsw "V_OFFSET"(%4), %%mm1;" /* Cr -= 128 */ \
    \
    "movq %%mm0, %%mm2;" /* keep a Cb copy for the green term */ \
    "movq %%mm1, %%mm3;" /* keep a Cr copy for the green term */ \
    \
    "pmulhw "UG_COEFF"(%4), %%mm2;" /* Cb * green coeff */ \
    "pmulhw "VG_COEFF"(%4), %%mm3;" /* Cr * green coeff */ \
    \
    "pmulhw "UB_COEFF"(%4), %%mm0;" /* Cb * blue coeff -> Cblue */ \
    "pmulhw "VR_COEFF"(%4), %%mm1;" /* Cr * red coeff  -> Cred */ \
    \
    "paddsw %%mm3, %%mm2;" /* Cgreen = Cb part + Cr part */ \
    \
    /* ---- luma: split into even and odd pixels ---- */ \
    "movq %%mm6, %%mm7;" /* duplicate the 8 Y bytes */ \
    "pand "MANGLE(mmx_00ffw)", %%mm6;" /* even Y: 00 Y6 00 Y4 00 Y2 00 Y0 */ \
    \
    "psrlw $8, %%mm7;" /* odd Y: 00 Y7 00 Y5 00 Y3 00 Y1 */ \
    \
    "psllw $3, %%mm6;" /* scale even Y up by 8 */ \
    "psllw $3, %%mm7;" /* scale odd Y up by 8 */ \
    \
    "psubw "Y_OFFSET"(%4), %%mm6;" /* even Y -= 16 */ \
    "psubw "Y_OFFSET"(%4), %%mm7;" /* odd Y -= 16 */ \
    \
    "pmulhw "Y_COEFF"(%4), %%mm6;" /* even Y * luma coeff */ \
    "pmulhw "Y_COEFF"(%4), %%mm7;" /* odd Y * luma coeff */ \
    \
    /* ---- add luma to the chroma terms, even and odd separately ---- */ \
    "movq %%mm0, %%mm3;" /* Cblue copy for odd pixels */ \
    "movq %%mm1, %%mm4;" /* Cred copy for odd pixels (mm4 stops being 0) */ \
    "movq %%mm2, %%mm5;" /* Cgreen copy for odd pixels */ \
    \
    "paddsw %%mm6, %%mm0;" /* even blue:  00 B6 00 B4 00 B2 00 B0 */ \
    "paddsw %%mm7, %%mm3;" /* odd blue:   00 B7 00 B5 00 B3 00 B1 */ \
    \
    "paddsw %%mm6, %%mm1;" /* even red:   00 R6 00 R4 00 R2 00 R0 */ \
    "paddsw %%mm7, %%mm4;" /* odd red:    00 R7 00 R5 00 R3 00 R1 */ \
    \
    "paddsw %%mm6, %%mm2;" /* even green: 00 G6 00 G4 00 G2 00 G0 */ \
    "paddsw %%mm7, %%mm5;" /* odd green:  00 G7 00 G5 00 G3 00 G1 */ \
    \
    /* ---- clamp even pixels to 0..255 ---- */ \
    "packuswb %%mm0, %%mm0;" /* B6 B4 B2 B0 B6 B4 B2 B0 */ \
    "packuswb %%mm1, %%mm1;" /* R6 R4 R2 R0 R6 R4 R2 R0 */ \
    "packuswb %%mm2, %%mm2;" /* G6 G4 G2 G0 G6 G4 G2 G0 */ \
    \
    /* ---- clamp odd pixels to 0..255 ---- */ \
    "packuswb %%mm3, %%mm3;" /* B7 B5 B3 B1 B7 B5 B3 B1 */ \
    "packuswb %%mm4, %%mm4;" /* R7 R5 R3 R1 R7 R5 R3 R1 */ \
    "packuswb %%mm5, %%mm5;" /* G7 G5 G3 G1 G7 G5 G3 G1 */ \
    \
    /* ---- interleave even and odd back into pixel order ---- */ \
    "punpcklbw %%mm3, %%mm0;" /* B7 B6 B5 B4 B3 B2 B1 B0 */ \
    "punpcklbw %%mm4, %%mm1;" /* R7 R6 R5 R4 R3 R2 R1 R0 */ \
    "punpcklbw %%mm5, %%mm2;" /* G7 G6 G5 G4 G3 G2 G1 G0 */
static inline int RENAME(yuv420_rgb16)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, static inline int RENAME(yuv420_rgb16)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
int srcSliceH, uint8_t* dst[], int dstStride[]){ int srcSliceH, uint8_t* dst[], int dstStride[]){
int y, h_size; int y, h_size;
if(c->srcFormat == PIX_FMT_YUV422P){ if(c->srcFormat == PIX_FMT_YUV422P){
srcStride[1] *= 2; srcStride[1] *= 2;
srcStride[2] *= 2; srcStride[2] *= 2;
} }
h_size= (c->dstW+7)&~7; h_size= (c->dstW+7)&~7;
if(h_size*2 > FFABS(dstStride[0])) h_size-=8; if(h_size*2 > FFABS(dstStride[0])) h_size-=8;
__asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ ); __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );
//printf("%X %X %X %X %X %X %X %X %X %X\n", (int)&c->redDither, (int)&b5Dither, (int)src[0], (int)src[1], (int)src[2], (int)dst[0], //printf("%X %X %X %X %X %X %X %X %X %X\n", (int)&c->redDither, (int)&b5Dither, (int)src[0], (int)src[1], (int)src[2], (int)dst[0],
//srcStride[0],srcStride[1],srcStride[2],dstStride[0]); //srcStride[0],srcStride[1],srcStride[2],dstStride[0]);
for (y= 0; y<srcSliceH; y++ ) { for (y= 0; y<srcSliceH; y++ ) {
uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0]; uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0];
uint8_t *_py = src[0] + y*srcStride[0]; uint8_t *_py = src[0] + y*srcStride[0];
uint8_t *_pu = src[1] + (y>>1)*srcStride[1]; uint8_t *_pu = src[1] + (y>>1)*srcStride[1];
uint8_t *_pv = src[2] + (y>>1)*srcStride[2]; uint8_t *_pv = src[2] + (y>>1)*srcStride[2];
long index= -h_size/2; long index= -h_size/2;
b5Dither= dither8[y&1]; b5Dither= dither8[y&1];
g6Dither= dither4[y&1]; g6Dither= dither4[y&1];
g5Dither= dither8[y&1]; g5Dither= dither8[y&1];
r5Dither= dither8[(y+1)&1]; r5Dither= dither8[(y+1)&1];
/* this mmx assembly code deals with SINGLE scan line at a time, it convert 8 /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
pixels in each iteration */ pixels in each iteration */
__asm__ __volatile__ ( __asm__ __volatile__ (
/* load data for start of next scan line */ /* load data for start of next scan line */
"movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
"movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
"movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ "movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
// ".balign 16 \n\t" //".balign 16 \n\t"
"1: \n\t" "1: \n\t"
/* no speed diference on my p3@500 with prefetch, /* no speed diference on my p3@500 with prefetch,
* if it is faster for anyone with -benchmark then tell me * if it is faster for anyone with -benchmark then tell me
PREFETCH" 64(%0) \n\t" PREFETCH" 64(%0) \n\t"
PREFETCH" 64(%1) \n\t" PREFETCH" 64(%1) \n\t"
PREFETCH" 64(%2) \n\t" PREFETCH" 64(%2) \n\t"
*/ */
YUV2RGB YUV2RGB
#ifdef DITHER1XBPP #ifdef DITHER1XBPP
"paddusb "MANGLE(b5Dither)", %%mm0;" "paddusb "MANGLE(b5Dither)", %%mm0;"
"paddusb "MANGLE(g6Dither)", %%mm2;" "paddusb "MANGLE(g6Dither)", %%mm2;"
"paddusb "MANGLE(r5Dither)", %%mm1;" "paddusb "MANGLE(r5Dither)", %%mm1;"
#endif #endif
/* mask unneeded bits off */ /* mask unneeded bits off */
"pand "MANGLE(mmx_redmask)", %%mm0;" /* b7b6b5b4 b3_0_0_0 b7b6b5b4 b3_0_0_0 */ "pand "MANGLE(mmx_redmask)", %%mm0;" /* b7b6b5b4 b3_0_0_0 b7b6b5b4 b3_0_0_0 */
"pand "MANGLE(mmx_grnmask)", %%mm2;" /* g7g6g5g4 g3g2_0_0 g7g6g5g4 g3g2_0_0 */ "pand "MANGLE(mmx_grnmask)", %%mm2;" /* g7g6g5g4 g3g2_0_0 g7g6g5g4 g3g2_0_0 */
"pand "MANGLE(mmx_redmask)", %%mm1;" /* r7r6r5r4 r3_0_0_0 r7r6r5r4 r3_0_0_0 */ "pand "MANGLE(mmx_redmask)", %%mm1;" /* r7r6r5r4 r3_0_0_0 r7r6r5r4 r3_0_0_0 */
"psrlw $3,%%mm0;" /* 0_0_0_b7 b6b5b4b3 0_0_0_b7 b6b5b4b3 */ "psrlw $3, %%mm0;" /* 0_0_0_b7 b6b5b4b3 0_0_0_b7 b6b5b4b3 */
"pxor %%mm4, %%mm4;" /* zero mm4 */ "pxor %%mm4, %%mm4;" /* zero mm4 */
"movq %%mm0, %%mm5;" /* Copy B7-B0 */ "movq %%mm0, %%mm5;" /* Copy B7-B0 */
"movq %%mm2, %%mm7;" /* Copy G7-G0 */ "movq %%mm2, %%mm7;" /* Copy G7-G0 */
/* convert rgb24 plane to rgb16 pack for pixel 0-3 */ /* convert rgb24 plane to rgb16 pack for pixel 0-3 */
"punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */ "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */
"punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */ "punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
"psllw $3, %%mm2;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */ "psllw $3, %%mm2;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */
"por %%mm2, %%mm0;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */ "por %%mm2, %%mm0;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */
"movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
MOVNTQ " %%mm0, (%1);" /* store pixel 0-3 */ MOVNTQ " %%mm0, (%1);" /* store pixel 0-3 */
/* convert rgb24 plane to rgb16 pack for pixel 0-3 */ /* convert rgb24 plane to rgb16 pack for pixel 0-3 */
"punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */ "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */
"punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */ "punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
"psllw $3, %%mm7;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */ "psllw $3, %%mm7;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */
"movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
"por %%mm7, %%mm5;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */ "por %%mm7, %%mm5;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */
"movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
MOVNTQ " %%mm5, 8 (%1);" /* store pixel 4-7 */ MOVNTQ " %%mm5, 8 (%1);" /* store pixel 4-7 */
"add $16, %1 \n\t" "add $16, %1 \n\t"
"add $4, %0 \n\t" "add $4, %0 \n\t"
" js 1b \n\t" " js 1b \n\t"
: "+r" (index), "+r" (_image) : "+r" (index), "+r" (_image)
: "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index) : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index)
); );
} }
__asm__ __volatile__ (EMMS); __asm__ __volatile__ (EMMS);
...@@ -218,88 +218,88 @@ YUV2RGB ...@@ -218,88 +218,88 @@ YUV2RGB
} }
static inline int RENAME(yuv420_rgb15)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, static inline int RENAME(yuv420_rgb15)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
int srcSliceH, uint8_t* dst[], int dstStride[]){ int srcSliceH, uint8_t* dst[], int dstStride[]){
int y, h_size; int y, h_size;
if(c->srcFormat == PIX_FMT_YUV422P){ if(c->srcFormat == PIX_FMT_YUV422P){
srcStride[1] *= 2; srcStride[1] *= 2;
srcStride[2] *= 2; srcStride[2] *= 2;
} }
h_size= (c->dstW+7)&~7; h_size= (c->dstW+7)&~7;
if(h_size*2 > FFABS(dstStride[0])) h_size-=8; if(h_size*2 > FFABS(dstStride[0])) h_size-=8;
__asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ ); __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );
//printf("%X %X %X %X %X %X %X %X %X %X\n", (int)&c->redDither, (int)&b5Dither, (int)src[0], (int)src[1], (int)src[2], (int)dst[0], //printf("%X %X %X %X %X %X %X %X %X %X\n", (int)&c->redDither, (int)&b5Dither, (int)src[0], (int)src[1], (int)src[2], (int)dst[0],
//srcStride[0],srcStride[1],srcStride[2],dstStride[0]); //srcStride[0],srcStride[1],srcStride[2],dstStride[0]);
for (y= 0; y<srcSliceH; y++ ) { for (y= 0; y<srcSliceH; y++ ) {
uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0]; uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0];
uint8_t *_py = src[0] + y*srcStride[0]; uint8_t *_py = src[0] + y*srcStride[0];
uint8_t *_pu = src[1] + (y>>1)*srcStride[1]; uint8_t *_pu = src[1] + (y>>1)*srcStride[1];
uint8_t *_pv = src[2] + (y>>1)*srcStride[2]; uint8_t *_pv = src[2] + (y>>1)*srcStride[2];
long index= -h_size/2; long index= -h_size/2;
b5Dither= dither8[y&1]; b5Dither= dither8[y&1];
g6Dither= dither4[y&1]; g6Dither= dither4[y&1];
g5Dither= dither8[y&1]; g5Dither= dither8[y&1];
r5Dither= dither8[(y+1)&1]; r5Dither= dither8[(y+1)&1];
/* this mmx assembly code deals with SINGLE scan line at a time, it convert 8 /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
pixels in each iteration */ pixels in each iteration */
__asm__ __volatile__ ( __asm__ __volatile__ (
/* load data for start of next scan line */ /* load data for start of next scan line */
"movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
"movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
"movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ "movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
// ".balign 16 \n\t" //".balign 16 \n\t"
"1: \n\t" "1: \n\t"
YUV2RGB YUV2RGB
#ifdef DITHER1XBPP #ifdef DITHER1XBPP
"paddusb "MANGLE(b5Dither)", %%mm0 \n\t" "paddusb "MANGLE(b5Dither)", %%mm0 \n\t"
"paddusb "MANGLE(g5Dither)", %%mm2 \n\t" "paddusb "MANGLE(g5Dither)", %%mm2 \n\t"
"paddusb "MANGLE(r5Dither)", %%mm1 \n\t" "paddusb "MANGLE(r5Dither)", %%mm1 \n\t"
#endif #endif
/* mask unneeded bits off */ /* mask unneeded bits off */
"pand "MANGLE(mmx_redmask)", %%mm0;" /* b7b6b5b4 b3_0_0_0 b7b6b5b4 b3_0_0_0 */ "pand "MANGLE(mmx_redmask)", %%mm0;" /* b7b6b5b4 b3_0_0_0 b7b6b5b4 b3_0_0_0 */
"pand "MANGLE(mmx_redmask)", %%mm2;" /* g7g6g5g4 g3_0_0_0 g7g6g5g4 g3_0_0_0 */ "pand "MANGLE(mmx_redmask)", %%mm2;" /* g7g6g5g4 g3_0_0_0 g7g6g5g4 g3_0_0_0 */
"pand "MANGLE(mmx_redmask)", %%mm1;" /* r7r6r5r4 r3_0_0_0 r7r6r5r4 r3_0_0_0 */ "pand "MANGLE(mmx_redmask)", %%mm1;" /* r7r6r5r4 r3_0_0_0 r7r6r5r4 r3_0_0_0 */
"psrlw $3,%%mm0;" /* 0_0_0_b7 b6b5b4b3 0_0_0_b7 b6b5b4b3 */ "psrlw $3, %%mm0;" /* 0_0_0_b7 b6b5b4b3 0_0_0_b7 b6b5b4b3 */
"psrlw $1,%%mm1;" /* 0_r7r6r5 r4r3_0_0 0_r7r6r5 r4r3_0_0 */ "psrlw $1, %%mm1;" /* 0_r7r6r5 r4r3_0_0 0_r7r6r5 r4r3_0_0 */
"pxor %%mm4, %%mm4;" /* zero mm4 */ "pxor %%mm4, %%mm4;" /* zero mm4 */
"movq %%mm0, %%mm5;" /* Copy B7-B0 */ "movq %%mm0, %%mm5;" /* Copy B7-B0 */
"movq %%mm2, %%mm7;" /* Copy G7-G0 */ "movq %%mm2, %%mm7;" /* Copy G7-G0 */
/* convert rgb24 plane to rgb16 pack for pixel 0-3 */ /* convert rgb24 plane to rgb16 pack for pixel 0-3 */
"punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3_0_0_0 */ "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3_0_0_0 */
"punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */ "punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
"psllw $2, %%mm2;" /* 0_0_0_0 0_0_g7g6 g5g4g3_0 0_0_0_0 */ "psllw $2, %%mm2;" /* 0_0_0_0 0_0_g7g6 g5g4g3_0 0_0_0_0 */
"por %%mm2, %%mm0;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */ "por %%mm2, %%mm0;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */
"movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
MOVNTQ " %%mm0, (%1);" /* store pixel 0-3 */ MOVNTQ " %%mm0, (%1);" /* store pixel 0-3 */
/* convert rgb24 plane to rgb16 pack for pixel 0-3 */ /* convert rgb24 plane to rgb16 pack for pixel 0-3 */
"punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 0_g7g6g5 g4g3_0_0 */ "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 0_g7g6g5 g4g3_0_0 */
"punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */ "punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
"psllw $2, %%mm7;" /* 0_0_0_0 0_0_g7g6 g5g4g3_0 0_0_0_0 */ "psllw $2, %%mm7;" /* 0_0_0_0 0_0_g7g6 g5g4g3_0 0_0_0_0 */
"movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
"por %%mm7, %%mm5;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */ "por %%mm7, %%mm5;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */
"movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
MOVNTQ " %%mm5, 8 (%1);" /* store pixel 4-7 */ MOVNTQ " %%mm5, 8 (%1);" /* store pixel 4-7 */
"add $16, %1 \n\t" "add $16, %1 \n\t"
"add $4, %0 \n\t" "add $4, %0 \n\t"
" js 1b \n\t" " js 1b \n\t"
: "+r" (index), "+r" (_image) : "+r" (index), "+r" (_image)
: "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index) : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index)
); );
} }
__asm__ __volatile__ (EMMS); __asm__ __volatile__ (EMMS);
...@@ -307,12 +307,12 @@ YUV2RGB ...@@ -307,12 +307,12 @@ YUV2RGB
} }
static inline int RENAME(yuv420_rgb24)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, static inline int RENAME(yuv420_rgb24)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
int srcSliceH, uint8_t* dst[], int dstStride[]){ int srcSliceH, uint8_t* dst[], int dstStride[]){
int y, h_size; int y, h_size;
if(c->srcFormat == PIX_FMT_YUV422P){ if(c->srcFormat == PIX_FMT_YUV422P){
srcStride[1] *= 2; srcStride[1] *= 2;
srcStride[2] *= 2; srcStride[2] *= 2;
} }
h_size= (c->dstW+7)&~7; h_size= (c->dstW+7)&~7;
...@@ -321,131 +321,131 @@ static inline int RENAME(yuv420_rgb24)(SwsContext *c, uint8_t* src[], int srcStr ...@@ -321,131 +321,131 @@ static inline int RENAME(yuv420_rgb24)(SwsContext *c, uint8_t* src[], int srcStr
__asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ ); __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );
for (y= 0; y<srcSliceH; y++ ) { for (y= 0; y<srcSliceH; y++ ) {
uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0]; uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0];
uint8_t *_py = src[0] + y*srcStride[0]; uint8_t *_py = src[0] + y*srcStride[0];
uint8_t *_pu = src[1] + (y>>1)*srcStride[1]; uint8_t *_pu = src[1] + (y>>1)*srcStride[1];
uint8_t *_pv = src[2] + (y>>1)*srcStride[2]; uint8_t *_pv = src[2] + (y>>1)*srcStride[2];
long index= -h_size/2; long index= -h_size/2;
/* this mmx assembly code deals with SINGLE scan line at a time, it convert 8 /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
pixels in each iteration */ pixels in each iteration */
__asm__ __volatile__ ( __asm__ __volatile__ (
/* load data for start of next scan line */ /* load data for start of next scan line */
"movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
"movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
"movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ "movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
// ".balign 16 \n\t" //".balign 16 \n\t"
"1: \n\t" "1: \n\t"
YUV2RGB YUV2RGB
/* mm0=B, %%mm2=G, %%mm1=R */ /* mm0=B, %%mm2=G, %%mm1=R */
#ifdef HAVE_MMX2 #ifdef HAVE_MMX2
"movq "MANGLE(M24A)", %%mm4 \n\t" "movq "MANGLE(M24A)", %%mm4 \n\t"
"movq "MANGLE(M24C)", %%mm7 \n\t" "movq "MANGLE(M24C)", %%mm7 \n\t"
"pshufw $0x50, %%mm0, %%mm5 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */ "pshufw $0x50, %%mm0, %%mm5 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */
"pshufw $0x50, %%mm2, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */ "pshufw $0x50, %%mm2, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */
"pshufw $0x00, %%mm1, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */ "pshufw $0x00, %%mm1, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */
"pand %%mm4, %%mm5 \n\t" /* B2 B1 B0 */ "pand %%mm4, %%mm5 \n\t" /* B2 B1 B0 */
"pand %%mm4, %%mm3 \n\t" /* G2 G1 G0 */ "pand %%mm4, %%mm3 \n\t" /* G2 G1 G0 */
"pand %%mm7, %%mm6 \n\t" /* R1 R0 */ "pand %%mm7, %%mm6 \n\t" /* R1 R0 */
"psllq $8, %%mm3 \n\t" /* G2 G1 G0 */ "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */
"por %%mm5, %%mm6 \n\t" "por %%mm5, %%mm6 \n\t"
"por %%mm3, %%mm6 \n\t" "por %%mm3, %%mm6 \n\t"
MOVNTQ" %%mm6, (%1) \n\t" MOVNTQ" %%mm6, (%1) \n\t"
"psrlq $8, %%mm2 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */ "psrlq $8, %%mm2 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */
"pshufw $0xA5, %%mm0, %%mm5 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */ "pshufw $0xA5, %%mm0, %%mm5 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */
"pshufw $0x55, %%mm2, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */ "pshufw $0x55, %%mm2, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */
"pshufw $0xA5, %%mm1, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */ "pshufw $0xA5, %%mm1, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */
"pand "MANGLE(M24B)", %%mm5 \n\t" /* B5 B4 B3 */ "pand "MANGLE(M24B)", %%mm5 \n\t" /* B5 B4 B3 */
"pand %%mm7, %%mm3 \n\t" /* G4 G3 */ "pand %%mm7, %%mm3 \n\t" /* G4 G3 */
"pand %%mm4, %%mm6 \n\t" /* R4 R3 R2 */ "pand %%mm4, %%mm6 \n\t" /* R4 R3 R2 */
"por %%mm5, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */ "por %%mm5, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */
"por %%mm3, %%mm6 \n\t" "por %%mm3, %%mm6 \n\t"
MOVNTQ" %%mm6, 8(%1) \n\t" MOVNTQ" %%mm6, 8(%1) \n\t"
"pshufw $0xFF, %%mm0, %%mm5 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */ "pshufw $0xFF, %%mm0, %%mm5 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */
"pshufw $0xFA, %%mm2, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */ "pshufw $0xFA, %%mm2, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */
"pshufw $0xFA, %%mm1, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */ "pshufw $0xFA, %%mm1, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */
"movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
"pand %%mm7, %%mm5 \n\t" /* B7 B6 */ "pand %%mm7, %%mm5 \n\t" /* B7 B6 */
"pand %%mm4, %%mm3 \n\t" /* G7 G6 G5 */ "pand %%mm4, %%mm3 \n\t" /* G7 G6 G5 */
"pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */ "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */
"movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
\ \
"por %%mm5, %%mm3 \n\t" "por %%mm5, %%mm3 \n\t"
"por %%mm3, %%mm6 \n\t" "por %%mm3, %%mm6 \n\t"
MOVNTQ" %%mm6, 16(%1) \n\t" MOVNTQ" %%mm6, 16(%1) \n\t"
"movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
"pxor %%mm4, %%mm4 \n\t" "pxor %%mm4, %%mm4 \n\t"
#else #else
"pxor %%mm4, %%mm4 \n\t" "pxor %%mm4, %%mm4 \n\t"
"movq %%mm0, %%mm5 \n\t" /* B */ "movq %%mm0, %%mm5 \n\t" /* B */
"movq %%mm1, %%mm6 \n\t" /* R */ "movq %%mm1, %%mm6 \n\t" /* R */
"punpcklbw %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */ "punpcklbw %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */
"punpcklbw %%mm4, %%mm1 \n\t" /* 0R0R0R0R 0 */ "punpcklbw %%mm4, %%mm1 \n\t" /* 0R0R0R0R 0 */
"punpckhbw %%mm2, %%mm5 \n\t" /* GBGBGBGB 2 */ "punpckhbw %%mm2, %%mm5 \n\t" /* GBGBGBGB 2 */
"punpckhbw %%mm4, %%mm6 \n\t" /* 0R0R0R0R 2 */ "punpckhbw %%mm4, %%mm6 \n\t" /* 0R0R0R0R 2 */
"movq %%mm0, %%mm7 \n\t" /* GBGBGBGB 0 */ "movq %%mm0, %%mm7 \n\t" /* GBGBGBGB 0 */
"movq %%mm5, %%mm3 \n\t" /* GBGBGBGB 2 */ "movq %%mm5, %%mm3 \n\t" /* GBGBGBGB 2 */
"punpcklwd %%mm1, %%mm7 \n\t" /* 0RGB0RGB 0 */ "punpcklwd %%mm1, %%mm7 \n\t" /* 0RGB0RGB 0 */
"punpckhwd %%mm1, %%mm0 \n\t" /* 0RGB0RGB 1 */ "punpckhwd %%mm1, %%mm0 \n\t" /* 0RGB0RGB 1 */
"punpcklwd %%mm6, %%mm5 \n\t" /* 0RGB0RGB 2 */ "punpcklwd %%mm6, %%mm5 \n\t" /* 0RGB0RGB 2 */
"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */ "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */
"movq %%mm7, %%mm2 \n\t" /* 0RGB0RGB 0 */ "movq %%mm7, %%mm2 \n\t" /* 0RGB0RGB 0 */
"movq %%mm0, %%mm6 \n\t" /* 0RGB0RGB 1 */ "movq %%mm0, %%mm6 \n\t" /* 0RGB0RGB 1 */
"movq %%mm5, %%mm1 \n\t" /* 0RGB0RGB 2 */ "movq %%mm5, %%mm1 \n\t" /* 0RGB0RGB 2 */
"movq %%mm3, %%mm4 \n\t" /* 0RGB0RGB 3 */ "movq %%mm3, %%mm4 \n\t" /* 0RGB0RGB 3 */
"psllq $40, %%mm7 \n\t" /* RGB00000 0 */ "psllq $40, %%mm7 \n\t" /* RGB00000 0 */
"psllq $40, %%mm0 \n\t" /* RGB00000 1 */ "psllq $40, %%mm0 \n\t" /* RGB00000 1 */
"psllq $40, %%mm5 \n\t" /* RGB00000 2 */ "psllq $40, %%mm5 \n\t" /* RGB00000 2 */
"psllq $40, %%mm3 \n\t" /* RGB00000 3 */ "psllq $40, %%mm3 \n\t" /* RGB00000 3 */
"punpckhdq %%mm2, %%mm7 \n\t" /* 0RGBRGB0 0 */ "punpckhdq %%mm2, %%mm7 \n\t" /* 0RGBRGB0 0 */
"punpckhdq %%mm6, %%mm0 \n\t" /* 0RGBRGB0 1 */ "punpckhdq %%mm6, %%mm0 \n\t" /* 0RGBRGB0 1 */
"punpckhdq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */ "punpckhdq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */
"punpckhdq %%mm4, %%mm3 \n\t" /* 0RGBRGB0 3 */ "punpckhdq %%mm4, %%mm3 \n\t" /* 0RGBRGB0 3 */
"psrlq $8, %%mm7 \n\t" /* 00RGBRGB 0 */ "psrlq $8, %%mm7 \n\t" /* 00RGBRGB 0 */
"movq %%mm0, %%mm6 \n\t" /* 0RGBRGB0 1 */ "movq %%mm0, %%mm6 \n\t" /* 0RGBRGB0 1 */
"psllq $40, %%mm0 \n\t" /* GB000000 1 */ "psllq $40, %%mm0 \n\t" /* GB000000 1 */
"por %%mm0, %%mm7 \n\t" /* GBRGBRGB 0 */ "por %%mm0, %%mm7 \n\t" /* GBRGBRGB 0 */
MOVNTQ" %%mm7, (%1) \n\t" MOVNTQ" %%mm7, (%1) \n\t"
"movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
"psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */ "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */
"movq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */ "movq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */
"psllq $24, %%mm5 \n\t" /* BRGB0000 2 */ "psllq $24, %%mm5 \n\t" /* BRGB0000 2 */
"por %%mm5, %%mm6 \n\t" /* BRGBRGBR 1 */ "por %%mm5, %%mm6 \n\t" /* BRGBRGBR 1 */
MOVNTQ" %%mm6, 8(%1) \n\t" MOVNTQ" %%mm6, 8(%1) \n\t"
"movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
"psrlq $40, %%mm1 \n\t" /* 000000RG 2 */ "psrlq $40, %%mm1 \n\t" /* 000000RG 2 */
"psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */ "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */
"por %%mm3, %%mm1 \n\t" /* RGBRGBRG 2 */ "por %%mm3, %%mm1 \n\t" /* RGBRGBRG 2 */
MOVNTQ" %%mm1, 16(%1) \n\t" MOVNTQ" %%mm1, 16(%1) \n\t"
"movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
"pxor %%mm4, %%mm4 \n\t" "pxor %%mm4, %%mm4 \n\t"
#endif #endif
"add $24, %1 \n\t" "add $24, %1 \n\t"
"add $4, %0 \n\t" "add $4, %0 \n\t"
" js 1b \n\t" " js 1b \n\t"
: "+r" (index), "+r" (_image) : "+r" (index), "+r" (_image)
: "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index) : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index)
); );
} }
__asm__ __volatile__ (EMMS); __asm__ __volatile__ (EMMS);
...@@ -453,12 +453,12 @@ YUV2RGB ...@@ -453,12 +453,12 @@ YUV2RGB
} }
static inline int RENAME(yuv420_rgb32)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, static inline int RENAME(yuv420_rgb32)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
int srcSliceH, uint8_t* dst[], int dstStride[]){ int srcSliceH, uint8_t* dst[], int dstStride[]){
int y, h_size; int y, h_size;
if(c->srcFormat == PIX_FMT_YUV422P){ if(c->srcFormat == PIX_FMT_YUV422P){
srcStride[1] *= 2; srcStride[1] *= 2;
srcStride[2] *= 2; srcStride[2] *= 2;
} }
h_size= (c->dstW+7)&~7; h_size= (c->dstW+7)&~7;
...@@ -467,71 +467,71 @@ static inline int RENAME(yuv420_rgb32)(SwsContext *c, uint8_t* src[], int srcStr ...@@ -467,71 +467,71 @@ static inline int RENAME(yuv420_rgb32)(SwsContext *c, uint8_t* src[], int srcStr
__asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ ); __asm__ __volatile__ ("pxor %mm4, %mm4;" /* zero mm4 */ );
for (y= 0; y<srcSliceH; y++ ) { for (y= 0; y<srcSliceH; y++ ) {
uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0]; uint8_t *_image = dst[0] + (y+srcSliceY)*dstStride[0];
uint8_t *_py = src[0] + y*srcStride[0]; uint8_t *_py = src[0] + y*srcStride[0];
uint8_t *_pu = src[1] + (y>>1)*srcStride[1]; uint8_t *_pu = src[1] + (y>>1)*srcStride[1];
uint8_t *_pv = src[2] + (y>>1)*srcStride[2]; uint8_t *_pv = src[2] + (y>>1)*srcStride[2];
long index= -h_size/2; long index= -h_size/2;
/* this mmx assembly code deals with SINGLE scan line at a time, it convert 8 /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
pixels in each iteration */ pixels in each iteration */
__asm__ __volatile__ ( __asm__ __volatile__ (
/* load data for start of next scan line */ /* load data for start of next scan line */
"movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
"movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
"movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ "movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
// ".balign 16 \n\t" //".balign 16 \n\t"
"1: \n\t" "1: \n\t"
YUV2RGB YUV2RGB
/* convert RGB plane to RGB packed format, /* convert RGB plane to RGB packed format,
mm0 -> B, mm1 -> R, mm2 -> G, mm3 -> 0, mm0 -> B, mm1 -> R, mm2 -> G, mm3 -> 0,
mm4 -> GB, mm5 -> AR pixel 4-7, mm4 -> GB, mm5 -> AR pixel 4-7,
mm6 -> GB, mm7 -> AR pixel 0-3 */ mm6 -> GB, mm7 -> AR pixel 0-3 */
"pxor %%mm3, %%mm3;" /* zero mm3 */ "pxor %%mm3, %%mm3;" /* zero mm3 */
"movq %%mm0, %%mm6;" /* B7 B6 B5 B4 B3 B2 B1 B0 */ "movq %%mm0, %%mm6;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
"movq %%mm1, %%mm7;" /* R7 R6 R5 R4 R3 R2 R1 R0 */ "movq %%mm1, %%mm7;" /* R7 R6 R5 R4 R3 R2 R1 R0 */
"movq %%mm0, %%mm4;" /* B7 B6 B5 B4 B3 B2 B1 B0 */ "movq %%mm0, %%mm4;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
"movq %%mm1, %%mm5;" /* R7 R6 R5 R4 R3 R2 R1 R0 */ "movq %%mm1, %%mm5;" /* R7 R6 R5 R4 R3 R2 R1 R0 */
"punpcklbw %%mm2, %%mm6;" /* G3 B3 G2 B2 G1 B1 G0 B0 */ "punpcklbw %%mm2, %%mm6;" /* G3 B3 G2 B2 G1 B1 G0 B0 */
"punpcklbw %%mm3, %%mm7;" /* 00 R3 00 R2 00 R1 00 R0 */ "punpcklbw %%mm3, %%mm7;" /* 00 R3 00 R2 00 R1 00 R0 */
"punpcklwd %%mm7, %%mm6;" /* 00 R1 B1 G1 00 R0 B0 G0 */ "punpcklwd %%mm7, %%mm6;" /* 00 R1 B1 G1 00 R0 B0 G0 */
MOVNTQ " %%mm6, (%1);" /* Store ARGB1 ARGB0 */ MOVNTQ " %%mm6, (%1);" /* Store ARGB1 ARGB0 */
"movq %%mm0, %%mm6;" /* B7 B6 B5 B4 B3 B2 B1 B0 */ "movq %%mm0, %%mm6;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
"punpcklbw %%mm2, %%mm6;" /* G3 B3 G2 B2 G1 B1 G0 B0 */ "punpcklbw %%mm2, %%mm6;" /* G3 B3 G2 B2 G1 B1 G0 B0 */
"punpckhwd %%mm7, %%mm6;" /* 00 R3 G3 B3 00 R2 B3 G2 */ "punpckhwd %%mm7, %%mm6;" /* 00 R3 G3 B3 00 R2 B3 G2 */
MOVNTQ " %%mm6, 8 (%1);" /* Store ARGB3 ARGB2 */ MOVNTQ " %%mm6, 8 (%1);" /* Store ARGB3 ARGB2 */
"punpckhbw %%mm2, %%mm4;" /* G7 B7 G6 B6 G5 B5 G4 B4 */ "punpckhbw %%mm2, %%mm4;" /* G7 B7 G6 B6 G5 B5 G4 B4 */
"punpckhbw %%mm3, %%mm5;" /* 00 R7 00 R6 00 R5 00 R4 */ "punpckhbw %%mm3, %%mm5;" /* 00 R7 00 R6 00 R5 00 R4 */
"punpcklwd %%mm5, %%mm4;" /* 00 R5 B5 G5 00 R4 B4 G4 */ "punpcklwd %%mm5, %%mm4;" /* 00 R5 B5 G5 00 R4 B4 G4 */
MOVNTQ " %%mm4, 16 (%1);" /* Store ARGB5 ARGB4 */ MOVNTQ " %%mm4, 16 (%1);" /* Store ARGB5 ARGB4 */
"movq %%mm0, %%mm4;" /* B7 B6 B5 B4 B3 B2 B1 B0 */ "movq %%mm0, %%mm4;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
"punpckhbw %%mm2, %%mm4;" /* G7 B7 G6 B6 G5 B5 G4 B4 */ "punpckhbw %%mm2, %%mm4;" /* G7 B7 G6 B6 G5 B5 G4 B4 */
"punpckhwd %%mm5, %%mm4;" /* 00 R7 G7 B7 00 R6 B6 G6 */ "punpckhwd %%mm5, %%mm4;" /* 00 R7 G7 B7 00 R6 B6 G6 */
MOVNTQ " %%mm4, 24 (%1);" /* Store ARGB7 ARGB6 */ MOVNTQ " %%mm4, 24 (%1);" /* Store ARGB7 ARGB6 */
"movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
"movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
"pxor %%mm4, %%mm4;" /* zero mm4 */ "pxor %%mm4, %%mm4;" /* zero mm4 */
"movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
"add $32, %1 \n\t" "add $32, %1 \n\t"
"add $4, %0 \n\t" "add $4, %0 \n\t"
" js 1b \n\t" " js 1b \n\t"
: "+r" (index), "+r" (_image) : "+r" (index), "+r" (_image)
: "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index) : "r" (_pu - index), "r" (_pv - index), "r"(&c->redDither), "r" (_py - 2*index)
); );
} }
__asm__ __volatile__ (EMMS); __asm__ __volatile__ (EMMS);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment