Linshizhi / ffmpeg.wasm-core · Commits

Commit c4356536
authored Nov 06, 2011 by Ronald S. Bultje
parent 2f7f2e4b

swscale: write yuv2plane1 MMX/SSE2/SSE4/AVX functions.

Showing 3 changed files with 170 additions and 112 deletions:
libswscale/x86/scale.asm            +142  -0
libswscale/x86/swscale_mmx.c         +28  -0
libswscale/x86/swscale_template.c     +0  -112
libswscale/x86/scale.asm

@@ -34,6 +34,12 @@ yuv2yuvX_10_start: times 4 dd 0x10000
 yuv2yuvX_9_start:   times 4 dd 0x20000
 yuv2yuvX_10_upper:  times 8 dw 0x3ff
 yuv2yuvX_9_upper:   times 8 dw 0x1ff
+pd_4:               times 4 dd 4
+pd_4min0x40000:     times 4 dd 4 - (0x40000)
+pw_16:              times 8 dw 16
+pw_32:              times 8 dw 32
+pw_512:             times 8 dw 512
+pw_1024:            times 8 dw 1024
 
 SECTION .text
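The pw_* words are the rounding and clipping constants for the 9/10-bit yuv2plane1 variants added below, and the two pd_* constants serve the 16-bit path: with SSE4/AVX the rounded (x + 4) >> 3 value can be packed by packusdw directly, while the MMX/SSE2 fallback has no unsigned dword pack, so it folds a -0x40000 bias into the rounding constant, packs with signed saturation and then adds 0x8000 per word ([minshort], presumably defined earlier in this file) to land back in the unsigned range. A scalar C sketch of that equivalence follows; the helper names are made up and this is an illustration, not code from the commit:

    #include <stdint.h>

    /* SSE4/AVX path: round, shift, pack with unsigned saturation (packusdw). */
    static uint16_t pack_with_packusdw(int32_t x)
    {
        int32_t v = (x + 4) >> 3;
        return v < 0 ? 0 : v > 0xffff ? 0xffff : (uint16_t)v;
    }

    /* MMX/SSE2 path: fold a -0x40000 bias into the rounding constant
     * (pd_4min0x40000) so that after >>3 the value sits 0x8000 lower, pack
     * with signed saturation (packssdw), then add 0x8000 back per word
     * (paddw minshort). Assumes arithmetic right shift, matching psrad. */
    static uint16_t pack_with_packssdw(int32_t x)
    {
        int32_t v = (x + 4 - 0x40000) >> 3;   /* == ((x + 4) >> 3) - 0x8000 */
        int32_t s = v < -32768 ? -32768 : v > 32767 ? 32767 : v;
        return (uint16_t)(s + 0x8000);        /* wrapping add back to unsigned */
    }

Both helpers return the same value for any input, which is why the two code paths in the main loop below are interchangeable.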
@@ -663,3 +669,139 @@ INIT_AVX
 yuv2planeX_fn  avx,  8, 10, 7
 yuv2planeX_fn  avx,  9,  7, 5
 yuv2planeX_fn  avx, 10,  7, 5
+
+; %1=outout-bpc, %2=alignment (u/a)
+%macro yuv2plane1_mainloop 2
+.loop_%2:
+%if %1 == 8
+    paddsw          m0, m2, [r0+r2*2+mmsize*0]
+    paddsw          m1, m3, [r0+r2*2+mmsize*1]
+    psraw           m0, 7
+    psraw           m1, 7
+    packuswb        m0, m1
+    mov%2           [r1+r2], m0
+%elif %1 == 16
+    paddd           m0, m4, [r0+r2*4+mmsize*0]
+    paddd           m1, m4, [r0+r2*4+mmsize*1]
+    paddd           m2, m4, [r0+r2*4+mmsize*2]
+    paddd           m3, m4, [r0+r2*4+mmsize*3]
+    psrad           m0, 3
+    psrad           m1, 3
+    psrad           m2, 3
+    psrad           m3, 3
+%if cpuflag(sse4) ; avx/sse4
+    packusdw        m0, m1
+    packusdw        m2, m3
+%else ; mmx/sse2
+    packssdw        m0, m1
+    packssdw        m2, m3
+    paddw           m0, m5
+    paddw           m2, m5
+%endif ; mmx/sse2/sse4/avx
+    mov%2           [r1+r2*2], m0
+    mov%2           [r1+r2*2+mmsize], m2
+%else
+    paddsw          m0, m2, [r0+r2*2+mmsize*0]
+    paddsw          m1, m2, [r0+r2*2+mmsize*1]
+    psraw           m0, 15 - %1
+    psraw           m1, 15 - %1
+    pmaxsw          m0, m4
+    pmaxsw          m1, m4
+    pminsw          m0, m3
+    pminsw          m1, m3
+    mov%2           [r1+r2*2], m0
+    mov%2           [r1+r2*2+mmsize], m1
+%endif
+    add             r2, mmsize
+    jl .loop_%2
+%endmacro
+
+%macro yuv2plane1_fn 3
+cglobal yuv2plane1_%1, %3, %3, %2
+%if %1 == 8
+    add             r1, r2
+%else ; %1 != 8
+    lea             r1, [r1+r2*2]
+%endif ; %1 == 8
+%if %1 == 16
+    lea             r0, [r0+r2*4]
+%else ; %1 != 16
+    lea             r0, [r0+r2*2]
+%endif ; %1 == 16
+    neg             r2
+
+%if %1 == 8
+    pxor            m4, m4          ; zero
+
+    ; create registers holding dither
+    movq            m3, [r3]        ; dither
+    test            r4d, r4d
+    jz              .no_rot
+%if mmsize == 16
+    punpcklqdq      m3, m3
+%endif ; mmsize == 16
+    PALIGNR_MMX     m3, m3, 3, m2
+.no_rot:
+%if mmsize == 8
+    mova            m2, m3
+    punpckhbw       m3, m4          ; byte->word
+    punpcklbw       m2, m4          ; byte->word
+%else
+    punpcklbw       m3, m4
+    mova            m2, m3
+%endif
+%elif %1 == 9
+    pxor            m4, m4
+    mova            m3, [pw_512]
+    mova            m2, [pw_32]
+%elif %1 == 10
+    pxor            m4, m4
+    mova            m3, [pw_1024]
+    mova            m2, [pw_16]
+%else ; %1 == 16
+%if cpuflag(sse4) ; sse4/avx
+    mova            m4, [pd_4]
+%else ; mmx/sse2
+    mova            m4, [pd_4min0x40000]
+    mova            m5, [minshort]
+%endif ; mmx/sse2/sse4/avx
+%endif ; %1 == ..
+
+    ; actual pixel scaling
+%if mmsize == 8
+    yuv2plane1_mainloop %1, a
+%else ; mmsize == 16
+    test            r1, 15
+    jnz .unaligned
+    yuv2plane1_mainloop %1, a
+    REP_RET
+.unaligned:
+    yuv2plane1_mainloop %1, u
+%endif ; mmsize == 8/16
+    REP_RET
+%endmacro
+
+%ifdef ARCH_X86_32
+INIT_MMX mmx
+yuv2plane1_fn  8, 0, 5
+yuv2plane1_fn 16, 0, 3
+
+INIT_MMX mmx2
+yuv2plane1_fn  9, 0, 3
+yuv2plane1_fn 10, 0, 3
+%endif
+
+INIT_XMM sse2
+yuv2plane1_fn  8, 5, 5
+yuv2plane1_fn  9, 5, 3
+yuv2plane1_fn 10, 5, 3
+yuv2plane1_fn 16, 6, 3
+
+INIT_XMM sse4
+yuv2plane1_fn 16, 5, 3
+
+INIT_XMM avx
+yuv2plane1_fn  8, 5, 5
+yuv2plane1_fn  9, 5, 3
+yuv2plane1_fn 10, 5, 3
+yuv2plane1_fn 16, 5, 3
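In scalar terms, the 8-bit function generated above boils down to the loop sketched below: add the (optionally rotated) 8-entry dither to the intermediate, shift right by 7 and clip to 8 bits, which is what the paddsw / psraw $7 / packuswb main loop does several pixels at a time. The parameter order follows the ff_yuv2plane1_* prototypes declared in swscale_mmx.c below; the function name and body are a hedged sketch, not code from this commit:

    #include <stdint.h>

    /* Rough scalar equivalent of ff_yuv2plane1_8_<opt>. The 9/10-bit variants
     * instead add a fixed rounding word (pw_32 / pw_16), shift by 15 - bpc and
     * clamp with pmaxsw/pminsw; the 16-bit variant rounds and shifts dwords. */
    static void yuv2plane1_8_sketch(const int16_t *src, uint8_t *dest, int dstW,
                                    const uint8_t *dither, int offset)
    {
        for (int i = 0; i < dstW; i++) {
            int val = (src[i] + dither[(i + offset) & 7]) >> 7;
            dest[i] = val < 0 ? 0 : val > 255 ? 255 : (uint8_t)val;
        }
    }

(The assembly approximates the offset by rotating the dither qword three bytes with PALIGNR whenever offset is non-zero, rather than indexing per pixel.)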
libswscale/x86/swscale_mmx.c

@@ -228,6 +228,22 @@ VSCALEX_FUNCS(sse4, sse4);
 VSCALEX_FUNC(16, sse4);
 VSCALEX_FUNCS(avx, avx);
+
+#define VSCALE_FUNC(size, opt) \
+extern void ff_yuv2plane1_ ## size ## _ ## opt(const int16_t *src, uint8_t *dst, int dstW, \
+                                               const uint8_t *dither, int offset)
+#define VSCALE_FUNCS(opt1, opt2) \
+    VSCALE_FUNC(8,  opt1); \
+    VSCALE_FUNC(9,  opt2); \
+    VSCALE_FUNC(10, opt2); \
+    VSCALE_FUNC(16, opt1)
+
+#if ARCH_X86_32
+VSCALE_FUNCS(mmx, mmx2);
+#endif
+VSCALE_FUNCS(sse2, sse2);
+VSCALE_FUNC(16, sse4);
+VSCALE_FUNCS(avx, avx);
 
 void ff_sws_init_swScale_mmx(SwsContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -268,11 +284,19 @@ switch(c->dstBpc){ \
     case 9:  if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2planeX_9_  ## opt2; break; \
     default:                                     vscalefn = ff_yuv2planeX_8_  ## opt1; break; \
     }
+#define ASSIGN_VSCALE_FUNC(vscalefn, opt1, opt2, opt2chk) \
+    switch(c->dstBpc){ \
+    case 16:                          if (!isBE(c->dstFormat)) vscalefn = ff_yuv2plane1_16_ ## opt1; break; \
+    case 10: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_10_ ## opt2; break; \
+    case 9:  if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_9_  ## opt2; break; \
+    default:                                     vscalefn = ff_yuv2plane1_8_  ## opt1; break; \
+    }
 #if ARCH_X86_32
     if (cpu_flags & AV_CPU_FLAG_MMX) {
         ASSIGN_MMX_SCALE_FUNC(c->hyScale, c->hLumFilterSize, mmx, mmx);
         ASSIGN_MMX_SCALE_FUNC(c->hcScale, c->hChrFilterSize, mmx, mmx);
         ASSIGN_VSCALEX_FUNC(c->yuv2planeX, mmx, mmx2, cpu_flags & AV_CPU_FLAG_MMX2,);
+        ASSIGN_VSCALE_FUNC(c->yuv2plane1, mmx, mmx2, cpu_flags & AV_CPU_FLAG_MMX2);
     }
 #endif
 #define ASSIGN_SSE_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \
@@ -287,6 +311,7 @@ switch(c->dstBpc){ \
         ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse2, sse2);
         ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse2, sse2);
         ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse2, sse2, 1,);
+        ASSIGN_VSCALE_FUNC(c->yuv2plane1, sse2, sse2, 1);
     }
     if (cpu_flags & AV_CPU_FLAG_SSSE3) {
         ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, ssse3, ssse3);
@@ -298,10 +323,13 @@ switch(c->dstBpc){ \
         ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse4, ssse3);
         ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse4, sse4, 1,
                             if (!isBE(c->dstFormat)) c->yuv2planeX = ff_yuv2planeX_16_sse4);
+        if (c->dstBpc == 16 && !isBE(c->dstFormat))
+            c->yuv2plane1 = ff_yuv2plane1_16_sse4;
     }
     if (cpu_flags & AV_CPU_FLAG_AVX) {
         ASSIGN_VSCALEX_FUNC(c->yuv2planeX, avx, avx, 1,);
+        ASSIGN_VSCALE_FUNC(c->yuv2plane1, avx, avx, 1);
     }
 #endif
 }
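To make the new dispatch concrete: given the ASSIGN_VSCALE_FUNC definition added above, the call ASSIGN_VSCALE_FUNC(c->yuv2plane1, sse2, sse2, 1) in the SSE2 branch expands to roughly the following switch (preprocessor output reformatted by hand, shown only as an illustration):

    switch (c->dstBpc) {
    case 16: if (!isBE(c->dstFormat))      c->yuv2plane1 = ff_yuv2plane1_16_sse2; break;
    case 10: if (!isBE(c->dstFormat) && 1) c->yuv2plane1 = ff_yuv2plane1_10_sse2; break;
    case 9:  if (!isBE(c->dstFormat) && 1) c->yuv2plane1 = ff_yuv2plane1_9_sse2;  break;
    default:                               c->yuv2plane1 = ff_yuv2plane1_8_sse2;  break;
    }

For big-endian 9/10/16-bit destinations the pointer is left untouched, so whatever was assigned earlier (presumably the C fallback) stays in use.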
libswscale/x86/swscale_template.c

@@ -35,116 +35,6 @@
 #endif
 #define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
 
-#if !COMPILE_TEMPLATE_MMX2
-static av_always_inline void dither_8to16(SwsContext *c, const uint8_t *srcDither, int rot)
-{
-    if (rot) {
-        __asm__ volatile("pxor      %%mm0, %%mm0\n\t"
-                         "movq       (%0), %%mm3\n\t"
-                         "movq      %%mm3, %%mm4\n\t"
-                         "psrlq       $24, %%mm3\n\t"
-                         "psllq       $40, %%mm4\n\t"
-                         "por       %%mm4, %%mm3\n\t"
-                         "movq      %%mm3, %%mm4\n\t"
-                         "punpcklbw %%mm0, %%mm3\n\t"
-                         "punpckhbw %%mm0, %%mm4\n\t"
-                         "psraw        $4, %%mm3\n\t"
-                         "psraw        $4, %%mm4\n\t"
-                         "movq      %%mm3, "DITHER16"+0(%1)\n\t"
-                         "movq      %%mm4, "DITHER16"+8(%1)\n\t"
-                         :: "r" (srcDither), "r" (&c->redDither)
-                         );
-    } else {
-        __asm__ volatile("pxor      %%mm0, %%mm0\n\t"
-                         "movq       (%0), %%mm3\n\t"
-                         "movq      %%mm3, %%mm4\n\t"
-                         "punpcklbw %%mm0, %%mm3\n\t"
-                         "punpckhbw %%mm0, %%mm4\n\t"
-                         "psraw        $4, %%mm3\n\t"
-                         "psraw        $4, %%mm4\n\t"
-                         "movq      %%mm3, "DITHER16"+0(%1)\n\t"
-                         "movq      %%mm4, "DITHER16"+8(%1)\n\t"
-                         :: "r" (srcDither), "r" (&c->redDither)
-                         );
-    }
-}
-#endif
-
-static void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc,
-                             const int16_t *chrUSrc, const int16_t *chrVSrc,
-                             const int16_t *alpSrc,
-                             uint8_t *dst[4], int dstW, int chrDstW)
-{
-    int p = 4;
-    const int16_t *src[4] = { lumSrc + dstW,     chrUSrc + chrDstW,
-                              chrVSrc + chrDstW, alpSrc + dstW };
-    x86_reg counter[4] = { dstW, chrDstW, chrDstW, dstW };
-
-    while (p--) {
-        if (dst[p]) {
-            __asm__ volatile(
-                "mov %2, %%"REG_a"                    \n\t"
-                ".p2align                4            \n\t" /* FIXME Unroll? */
-                "1:                                   \n\t"
-                "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"
-                "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"
-                "psraw                 $7, %%mm0      \n\t"
-                "psraw                 $7, %%mm1      \n\t"
-                "packuswb           %%mm1, %%mm0      \n\t"
-                MOVNTQ(%%mm0, (%1, %%REGa))
-                "add                   $8, %%"REG_a"  \n\t"
-                "jnc                   1b             \n\t"
-                :: "r" (src[p]), "r" (dst[p] + counter[p]), "g" (-counter[p])
-                : "%"REG_a
-            );
-        }
-    }
-}
-
-static void RENAME(yuv2yuv1_ar)(SwsContext *c, const int16_t *lumSrc,
-                                const int16_t *chrUSrc, const int16_t *chrVSrc,
-                                const int16_t *alpSrc,
-                                uint8_t *dst[4], int dstW, int chrDstW)
-{
-    int p = 4;
-    const int16_t *src[4] = { lumSrc + dstW,     chrUSrc + chrDstW,
-                              chrVSrc + chrDstW, alpSrc + dstW };
-    x86_reg counter[4] = { dstW, chrDstW, chrDstW, dstW };
-    const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
-
-    while (p--) {
-        if (dst[p]) {
-            dither_8to16(c, (p == 2 || p == 3) ? chrDither : lumDither, p == 2);
-            __asm__ volatile(
-                "mov %2, %%"REG_a"                    \n\t"
-                "movq    "DITHER16"+0(%3), %%mm6      \n\t"
-                "movq    "DITHER16"+8(%3), %%mm7      \n\t"
-                ".p2align                4            \n\t" /* FIXME Unroll? */
-                "1:                                   \n\t"
-                "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"
-                "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"
-                "paddsw             %%mm6, %%mm0      \n\t"
-                "paddsw             %%mm7, %%mm1      \n\t"
-                "psraw                 $7, %%mm0      \n\t"
-                "psraw                 $7, %%mm1      \n\t"
-                "packuswb           %%mm1, %%mm0      \n\t"
-                MOVNTQ(%%mm0, (%1, %%REGa))
-                "add                   $8, %%"REG_a"  \n\t"
-                "jnc                   1b             \n\t"
-                :: "r" (src[p]), "r" (dst[p] + counter[p]), "g" (-counter[p]),
-                   "r" (&c->redDither)
-                : "%"REG_a
-            );
-        }
-    }
-}
-
 #define YSCALEYUV2PACKEDX_UV \
     __asm__ volatile(\
         "xor %%"REG_a", %%"REG_a" \n\t"\
@@ -1899,7 +1789,6 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
         dstFormat != PIX_FMT_NV12 && dstFormat != PIX_FMT_NV21) {
         if (!(c->flags & SWS_BITEXACT)) {
             if (c->flags & SWS_ACCURATE_RND) {
-                //c->yuv2yuv1 = RENAME(yuv2yuv1_ar );
                 if (!(c->flags & SWS_FULL_CHR_H_INT)) {
                     switch (c->dstFormat) {
                     case PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X_ar); break;

@@ -1911,7 +1800,6 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
                     }
                 }
             } else {
-                //c->yuv2yuv1 = RENAME(yuv2yuv1 );
                 if (!(c->flags & SWS_FULL_CHR_H_INT)) {
                     switch (c->dstFormat) {
                     case PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break;
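The deleted dither_8to16() helper prepared the 16-bit dither words that RENAME(yuv2yuv1_ar) added before its psraw $7 / packuswb store, and RENAME(yuv2yuv1) was the same per-plane shift-and-clip without dithering, i.e. the operation the new ff_yuv2plane1_8_* functions in scale.asm now provide, which is presumably why the template versions and their commented-out assignments in the hunks above go away. As a scalar illustration of what the helper did (made-up function name, assumptions noted in the comments, not code from the tree):

    #include <stdint.h>

    /* Sketch of the removed dither_8to16(): optionally rotate the eight dither
     * bytes by three positions (the psrlq $24 / psllq $40 / por sequence), widen
     * them to words, shift right by 4 and park them in the 16-byte scratch area
     * the asm addressed as DITHER16 (relative to &c->redDither). */
    static void dither_8to16_sketch(const uint8_t *srcDither, int rot,
                                    int16_t dither16[8])
    {
        for (int i = 0; i < 8; i++) {
            uint8_t b = srcDither[rot ? (i + 3) & 7 : i];
            dither16[i] = b >> 4;
        }
    }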