Commit b5d08c27 authored by Ronald S. Bultje

swscale: convert rgb/bgr24ToY/UV_mmx functions from inline asm to yasm.

Also implement sse2/ssse3/avx versions.
parent 3b15a6d7
This diff is collapsed.
......@@ -31,10 +31,6 @@ DECLARE_ASM_CONST(8, uint64_t, bF8)= 0xF8F8F8F8F8F8F8F8LL;
/* 0xFC replicated into each of the eight bytes. */
DECLARE_ASM_CONST(8, uint64_t, bFC)= 0xFCFCFCFCFCFCFCFCLL;
/* The 16-bit value 0x0010 replicated into each of the four words. */
DECLARE_ASM_CONST(8, uint64_t, w10)= 0x0010001000100010LL;
/* The 16-bit value 0x0002 replicated into each of the four words. */
DECLARE_ASM_CONST(8, uint64_t, w02)= 0x0002000200020002LL;
/*
 * Byte-lane masks. Each digit in the name stands for one byte of the 64-bit
 * value, most significant byte first: 1 means 0xFF, 0 means 0x00.
 */
DECLARE_ASM_CONST(8, uint64_t, bm00001111)=0x00000000FFFFFFFFLL;
DECLARE_ASM_CONST(8, uint64_t, bm00000111)=0x0000000000FFFFFFLL;
DECLARE_ASM_CONST(8, uint64_t, bm11111000)=0xFFFFFFFFFF000000LL;
DECLARE_ASM_CONST(8, uint64_t, bm01010101)=0x00FF00FF00FF00FFLL;
const DECLARE_ALIGNED(8, uint64_t, ff_dither4)[2] = {
0x0103010301030103LL,
......@@ -68,19 +64,6 @@ DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YOffset) = 0x1010101010101010ULL;
/* 0x80 in every byte. NOTE(review): presumably the 128 chroma bias; confirm at the use sites. */
DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) = 0x8080808080808080ULL;
/* The 16-bit value 1 in each of the four words. */
DECLARE_ALIGNED(8, const uint64_t, ff_w1111) = 0x0001000100010001ULL;
/*
 * Packed 16-bit multipliers for the 24-bit luma conversion. bgr24ToY_mmx
 * loads the "Y1" quadword into mm5 and the "Y2" quadword into mm6 and applies
 * them with pmaddwd to two overlapping 4-byte windows of the source; the
 * zero words cancel the component that the other window already covers.
 * The bgr24 pair is used for PIX_FMT_BGR24, the rgb24 pair otherwise.
 */
DECLARE_ASM_CONST(8, uint64_t, ff_bgr24toY1Coeff) = 0x0C88000040870C88ULL;
DECLARE_ASM_CONST(8, uint64_t, ff_bgr24toY2Coeff) = 0x20DE4087000020DEULL;
DECLARE_ASM_CONST(8, uint64_t, ff_rgb24toY1Coeff) = 0x20DE0000408720DEULL;
DECLARE_ASM_CONST(8, uint64_t, ff_rgb24toY2Coeff) = 0x0C88408700000C88ULL;
/*
 * Added to every 32-bit luma sum before the arithmetic shift right by 15.
 * Each 32-bit half is 0x00084000 == (16 << 15) + (1 << 14).
 */
DECLARE_ASM_CONST(8, uint64_t, ff_bgr24toYOffset) = 0x0008400000084000ULL;
/*
 * Chroma multiplier tables, indexed by (srcFormat == PIX_FMT_RGB24):
 * row 0 is used for BGR24 input, row 1 for RGB24 input.
 * Within a row, quadwords 0 and 1 produce the values stored to dstU and
 * quadwords 2 and 3 produce the values stored to dstV (see bgr24ToUV_mmx).
 */
DECLARE_ASM_CONST(8, uint64_t, ff_bgr24toUV)[2][4] = {
{0x38380000DAC83838ULL, 0xECFFDAC80000ECFFULL, 0xF6E40000D0E3F6E4ULL, 0x3838D0E300003838ULL},
{0xECFF0000DAC8ECFFULL, 0x3838DAC800003838ULL, 0x38380000D0E33838ULL, 0xF6E4D0E30000F6E4ULL},
};
/*
 * Added to every 32-bit chroma sum before the arithmetic shift right by 15.
 * Each 32-bit half is 0x00404000 == (128 << 15) + (1 << 14).
 */
DECLARE_ASM_CONST(8, uint64_t, ff_bgr24toUVOffset)= 0x0040400000404000ULL;
//MMX versions
#if HAVE_MMX
#undef RENAME
......@@ -244,24 +227,29 @@ VSCALE_FUNCS(sse2, sse2);
VSCALE_FUNC(16, sse4);
VSCALE_FUNCS(avx, avx);
/*
 * Prototype generators for the input-conversion routines that are
 * implemented outside this file (declared extern here).
 *
 * fmt is the source-format prefix (e.g. rgb24) and opt is the instruction-set
 * suffix (e.g. sse2); the two are pasted into ff_<fmt>ToY_<opt> and
 * ff_<fmt>ToUV_<opt>.
 */

/* Declares the luma converter ff_<fmt>ToY_<opt>(dst, src, w, unused). */
#define INPUT_Y_FUNC(fmt, opt) \
extern void ff_ ## fmt ## ToY_ ## opt(uint8_t *dst, const uint8_t *src, \
                                      int w, uint32_t *unused)

/* Declares the chroma converter ff_<fmt>ToUV_<opt>(dstU, dstV, src, unused1, w, unused2). */
#define INPUT_UV_FUNC(fmt, opt) \
extern void ff_ ## fmt ## ToUV_ ## opt(uint8_t *dstU, uint8_t *dstV, \
                                       const uint8_t *src, const uint8_t *unused1, \
                                       int w, uint32_t *unused2)

/*
 * Declares both the luma and the chroma converter for one format.
 * The luma prototype comes from INPUT_Y_FUNC only; it is not spelled out a
 * second time here.
 */
#define INPUT_FUNC(fmt, opt) \
    INPUT_Y_FUNC(fmt, opt); \
    INPUT_UV_FUNC(fmt, opt)

/*
 * Declares every converter available for one instruction set.
 * nv12/nv21 only get a chroma converter; the packed formats get both.
 * Every line but the last must end in a continuation backslash, otherwise
 * the remaining formats fall out of the macro.
 */
#define INPUT_FUNCS(opt) \
    INPUT_FUNC(uyvy, opt); \
    INPUT_FUNC(yuyv, opt); \
    INPUT_UV_FUNC(nv12, opt); \
    INPUT_UV_FUNC(nv21, opt); \
    INPUT_FUNC(rgb24, opt); \
    INPUT_FUNC(bgr24, opt)
/*
 * Emit the input-converter prototypes once per instruction-set suffix.
 * The mmx set is only declared when ARCH_X86_32 is non-zero.
 */
#if ARCH_X86_32
INPUT_FUNCS(mmx);
#endif
INPUT_FUNCS(sse2);
INPUT_FUNCS(ssse3);
INPUT_FUNCS(avx);
void ff_sws_init_swScale_mmx(SwsContext *c)
......@@ -311,6 +299,12 @@ switch(c->dstBpc){ \
case 9: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_9_ ## opt2; break; \
default: vscalefn = ff_yuv2plane1_8_ ## opt1; break; \
}
/*
 * Expands to one `case PIX_FMT_<X>:` arm of a switch on the source format.
 *
 * x   - lower-case format prefix pasted into the function names (e.g. rgb24)
 * X   - upper-case suffix of the PIX_FMT_ enumerator (e.g. RGB24)
 * opt - instruction-set suffix of the functions to install (e.g. sse2)
 *
 * The arm always installs ff_<x>ToY_<opt> as c->lumToYV12. It installs
 * ff_<x>ToUV_<opt> as c->chrToYV12 only when c->chrSrcHSubSample is zero,
 * leaving any previously assigned chrToYV12 in place otherwise.
 * The expansion ends in `break` without a semicolon; the caller supplies it.
 */
#define case_rgb(x, X, opt) \
case PIX_FMT_ ## X: \
c->lumToYV12 = ff_ ## x ## ToY_ ## opt; \
if (!c->chrSrcHSubSample) \
c->chrToYV12 = ff_ ## x ## ToUV_ ## opt; \
break
#if ARCH_X86_32
if (cpu_flags & AV_CPU_FLAG_MMX) {
ASSIGN_MMX_SCALE_FUNC(c->hyScale, c->hLumFilterSize, mmx, mmx);
......@@ -337,6 +331,8 @@ switch(c->dstBpc){ \
case PIX_FMT_NV21:
c->chrToYV12 = ff_nv21ToUV_mmx;
break;
case_rgb(rgb24, RGB24, mmx);
case_rgb(bgr24, BGR24, mmx);
default:
break;
}
......@@ -379,11 +375,21 @@ switch(c->dstBpc){ \
case PIX_FMT_NV21:
c->chrToYV12 = ff_nv21ToUV_sse2;
break;
case_rgb(rgb24, RGB24, sse2);
case_rgb(bgr24, BGR24, sse2);
default:
break;
}
}
if (cpu_flags & AV_CPU_FLAG_SSSE3) {
ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, ssse3, ssse3);
ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, ssse3, ssse3);
switch (c->srcFormat) {
case_rgb(rgb24, RGB24, ssse3);
case_rgb(bgr24, BGR24, ssse3);
default:
break;
}
}
if (cpu_flags & AV_CPU_FLAG_SSE4) {
/* Xto15 don't need special sse4 functions */
......@@ -412,6 +418,8 @@ switch(c->dstBpc){ \
case PIX_FMT_NV21:
c->chrToYV12 = ff_nv21ToUV_avx;
break;
case_rgb(rgb24, RGB24, avx);
case_rgb(bgr24, BGR24, avx);
default:
break;
}
......
......@@ -1361,148 +1361,6 @@ static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
}
}
/**
 * Convert packed 24-bit pixels (3 bytes per pixel) to 8-bit luma using MMX.
 *
 * @param dst       output buffer, one byte per pixel
 * @param src       packed 24-bit input
 * @param width     number of pixels to convert
 * @param srcFormat PIX_FMT_BGR24 selects the bgr24 coefficient pair; any
 *                  other value selects the rgb24 pair
 *
 * Each loop iteration consumes 12 source bytes (4 pixels) and stores 4 output
 * bytes. The loop counter starts at -width, advances by 4 and the loop
 * repeats while it is negative, so the body always runs at least once and the
 * last iteration reads/writes past the nominal end when width is not a
 * multiple of 4.
 * NOTE(review): callers presumably provide suitably padded buffers - confirm.
 *
 * NOTE(review): mm5/mm6 are loaded in separate asm statements and none of the
 * statements list the MMX registers or "memory" as clobbers.
 */
static av_always_inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src,
int width, enum PixelFormat srcFormat)
{
/* Load the two coefficient quadwords for the requested channel order. */
if(srcFormat == PIX_FMT_BGR24) {
__asm__ volatile(
"movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
"movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
:
);
} else {
__asm__ volatile(
"movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
"movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
:
);
}
__asm__ volatile(
/* mm4 = offset added to every sum, REG_a = -width (loop index), mm7 = 0 */
"movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
"mov %2, %%"REG_a" \n\t"
"pxor %%mm7, %%mm7 \n\t"
"1: \n\t"
PREFETCH" 64(%0) \n\t"
/* Four overlapping 4-byte loads covering source bytes 0..11 (4 pixels). */
"movd (%0), %%mm0 \n\t"
"movd 2(%0), %%mm1 \n\t"
"movd 6(%0), %%mm2 \n\t"
"movd 8(%0), %%mm3 \n\t"
"add $12, %0 \n\t"
/* Zero-extend the bytes to 16-bit words. */
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
/* Multiply by the coefficients and add adjacent products (32-bit results). */
"pmaddwd %%mm5, %%mm0 \n\t"
"pmaddwd %%mm6, %%mm1 \n\t"
"pmaddwd %%mm5, %%mm2 \n\t"
"pmaddwd %%mm6, %%mm3 \n\t"
/* Combine the two partial sums of each pixel: mm0 = pixels 0,1; mm2 = pixels 2,3. */
"paddd %%mm1, %%mm0 \n\t"
"paddd %%mm3, %%mm2 \n\t"
/* Add the offset, then drop the 15 fractional bits. */
"paddd %%mm4, %%mm0 \n\t"
"paddd %%mm4, %%mm2 \n\t"
"psrad $15, %%mm0 \n\t"
"psrad $15, %%mm2 \n\t"
/* Pack 4 dwords -> 4 words -> 4 unsigned bytes (with saturation) and store them. */
"packssdw %%mm2, %%mm0 \n\t"
"packuswb %%mm0, %%mm0 \n\t"
"movd %%mm0, (%1, %%"REG_a") \n\t"
"add $4, %%"REG_a" \n\t"
/* Loop while the index is still negative. */
" js 1b \n\t"
: "+r" (src)
/* %1 points at the end of dst so that the negative index addresses dst[0..]. */
: "r" (dst+width), "g" ((x86_reg)-width)
: "%"REG_a
);
}
static void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src,
int width, uint32_t *unused)
{
RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
}
static void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src,
int width, uint32_t *unused)
{
RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
}
/**
 * Convert packed 24-bit pixels (3 bytes per pixel) to two 8-bit chroma planes
 * using MMX, producing one U and one V byte per input pixel.
 *
 * @param dstU      first chroma output buffer (written from table quadwords 0/1)
 * @param dstV      second chroma output buffer (written from table quadwords 2/3)
 * @param src       packed 24-bit input
 * @param width     number of pixels to convert
 * @param srcFormat PIX_FMT_RGB24 selects row 1 of ff_bgr24toUV; any other
 *                  value selects row 0
 *
 * Each loop iteration consumes 12 source bytes (4 pixels) and stores 4 bytes
 * to each output. The loop counter starts at -width, advances by 4 and the
 * loop repeats while it is negative, so the body always runs at least once
 * and the last iteration reads/writes past the nominal end when width is not
 * a multiple of 4.
 * NOTE(review): callers presumably provide suitably padded buffers - confirm.
 *
 * NOTE(review): the MMX registers and "memory" are not listed as clobbers.
 */
static av_always_inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV,
const uint8_t *src, int width,
enum PixelFormat srcFormat)
{
__asm__ volatile(
/* mm6 = 4th coefficient quadword (kept in a register), REG_a = -width, mm7 = 0 */
"movq 24(%4), %%mm6 \n\t"
"mov %3, %%"REG_a" \n\t"
"pxor %%mm7, %%mm7 \n\t"
"1: \n\t"
PREFETCH" 64(%0) \n\t"
/* Pixels 0 and 1: two overlapping 4-byte loads, zero-extended to words. */
"movd (%0), %%mm0 \n\t"
"movd 2(%0), %%mm1 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
/* Keep copies so the same input feeds both the dstU and the dstV sums. */
"movq %%mm0, %%mm2 \n\t"
"movq %%mm1, %%mm3 \n\t"
"pmaddwd (%4), %%mm0 \n\t"
"pmaddwd 8(%4), %%mm1 \n\t"
"pmaddwd 16(%4), %%mm2 \n\t"
"pmaddwd %%mm6, %%mm3 \n\t"
/* mm0 = dstU sums for pixels 0,1; mm2 = dstV sums for pixels 0,1 */
"paddd %%mm1, %%mm0 \n\t"
"paddd %%mm3, %%mm2 \n\t"
/* Pixels 2 and 3: same sequence on source bytes 6..11. */
"movd 6(%0), %%mm1 \n\t"
"movd 8(%0), %%mm3 \n\t"
"add $12, %0 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"movq %%mm1, %%mm4 \n\t"
"movq %%mm3, %%mm5 \n\t"
"pmaddwd (%4), %%mm1 \n\t"
"pmaddwd 8(%4), %%mm3 \n\t"
"pmaddwd 16(%4), %%mm4 \n\t"
"pmaddwd %%mm6, %%mm5 \n\t"
/* mm1 = dstU sums for pixels 2,3; mm4 = dstV sums for pixels 2,3 */
"paddd %%mm3, %%mm1 \n\t"
"paddd %%mm5, %%mm4 \n\t"
/* Add the offset to all four accumulators, then drop the 15 fractional bits. */
"movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
"paddd %%mm3, %%mm0 \n\t"
"paddd %%mm3, %%mm2 \n\t"
"paddd %%mm3, %%mm1 \n\t"
"paddd %%mm3, %%mm4 \n\t"
"psrad $15, %%mm0 \n\t"
"psrad $15, %%mm2 \n\t"
"psrad $15, %%mm1 \n\t"
"psrad $15, %%mm4 \n\t"
/* Pack dwords -> words -> unsigned bytes (with saturation); store 4 bytes per plane. */
"packssdw %%mm1, %%mm0 \n\t"
"packssdw %%mm4, %%mm2 \n\t"
"packuswb %%mm0, %%mm0 \n\t"
"packuswb %%mm2, %%mm2 \n\t"
"movd %%mm0, (%1, %%"REG_a") \n\t"
"movd %%mm2, (%2, %%"REG_a") \n\t"
"add $4, %%"REG_a" \n\t"
/* Loop while the index is still negative. */
" js 1b \n\t"
: "+r" (src)
/* %1/%2 point at the end of each plane so the negative index addresses element 0 onward;
   %4 is the coefficient row chosen by the source format. */
: "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "r"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24])
: "%"REG_a
);
}
/**
 * Chroma converter for PIX_FMT_BGR24 input.
 *
 * Forwards to the shared 24-bit MMX chroma routine with the BGR24 format tag.
 *
 * @param dstU   first chroma output buffer
 * @param dstV   second chroma output buffer
 * @param src1   packed 24-bit input; this is the pointer actually read
 * @param src2   must equal src1 (checked with assert)
 * @param width  number of pixels to convert
 * @param unused not referenced by this function
 */
static void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV,
                              const uint8_t *src1, const uint8_t *src2,
                              int width, uint32_t *unused)
{
    /* Validate the precondition before any output is written, matching
     * the order used by rgb24ToUV. */
    assert(src1 == src2);
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
}
static void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV,
const uint8_t *src1, const uint8_t *src2,
int width, uint32_t *unused)
{
assert(src1==src2);
RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
}
#if COMPILE_TEMPLATE_MMX2
static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
int dstWidth, const uint8_t *src,
......@@ -1689,8 +1547,7 @@ static void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2,
static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
{
enum PixelFormat srcFormat = c->srcFormat,
dstFormat = c->dstFormat;
enum PixelFormat dstFormat = c->dstFormat;
if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) &&
dstFormat != PIX_FMT_NV12 && dstFormat != PIX_FMT_NV21) {
......@@ -1762,18 +1619,4 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
}
#endif /* COMPILE_TEMPLATE_MMX2 */
}
if (!c->chrSrcHSubSample) {
switch(srcFormat) {
case PIX_FMT_BGR24 : c->chrToYV12 = RENAME(bgr24ToUV); break;
case PIX_FMT_RGB24 : c->chrToYV12 = RENAME(rgb24ToUV); break;
default: break;
}
}
switch (srcFormat) {
case PIX_FMT_BGR24 : c->lumToYV12 = RENAME(bgr24ToY); break;
case PIX_FMT_RGB24 : c->lumToYV12 = RENAME(rgb24ToY); break;
default: break;
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment