Linshizhi / ffmpeg.wasm-core · Commits
Commit 2170a0e6, authored Jan 09, 2012 by Ronald S. Bultje
swscale: convert yuy2/uyvy/nv12/nv21ToY/UV from inline asm to yasm.
Also implement SSE2/AVX variants.
Parent: 59985574
Showing 4 changed files with 327 additions and 164 deletions:
libswscale/Makefile                 +2    -1
libswscale/x86/input.asm            +242  -0
libswscale/x86/swscale_mmx.c        +83   -0
libswscale/x86/swscale_template.c   +0    -163
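For orientation before the diffs: the routines being converted de-interleave packed pixel layouts into planes. YUYV stores Y0 U0 Y1 V0 for each pair of pixels, UYVY moves luma to the odd bytes, and NV12/NV21 carry a single interleaved UV plane (NV21 with V first). A minimal scalar C sketch of the same semantics, written for this summary rather than taken from the commit:

#include <stdint.h>

/* Reference-only sketch of what the SIMD routines in this commit compute. */
static void yuyv_to_y_ref(uint8_t *dst, const uint8_t *src, int w)
{
    for (int i = 0; i < w; i++)
        dst[i] = src[2 * i];          /* Y is the even byte in YUYV */
}

static void yuyv_to_uv_ref(uint8_t *dstU, uint8_t *dstV,
                           const uint8_t *src, int w)
{
    for (int i = 0; i < w; i++) {     /* w is the chroma width */
        dstU[i] = src[4 * i + 1];     /* layout: Y0 U0 Y1 V0 ... */
        dstV[i] = src[4 * i + 3];
    }
}

static void nv12_to_uv_ref(uint8_t *dstU, uint8_t *dstV,
                           const uint8_t *src, int w)
{
    for (int i = 0; i < w; i++) {     /* NV21 is identical with dstU/dstV swapped */
        dstU[i] = src[2 * i];
        dstV[i] = src[2 * i + 1];
    }
}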
libswscale/Makefile
@@ -17,7 +17,8 @@ OBJS-$(HAVE_MMX) += x86/rgb2rgb.o \
                          x86/swscale_mmx.o      \
                          x86/yuv2rgb_mmx.o
 OBJS-$(HAVE_VIS)      += sparc/yuv2rgb_vis.o
-MMX-OBJS-$(HAVE_YASM) += x86/output.o           \
+MMX-OBJS-$(HAVE_YASM) += x86/input.o            \
+                         x86/output.o           \
                          x86/scale.o
 
 TESTPROGS = colorspace swscale
libswscale/x86/input.asm (new file, mode 100644)
;******************************************************************************
;* x86-optimized input routines; does shuffling of packed
;* YUV formats into individual planes, and converts RGB
;* into YUV planes also.
;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

SECTION .text

;-----------------------------------------------------------------------------
; YUYV/UYVY/NV12/NV21 packed pixel shuffling.
;
; void <fmt>ToY_<opt>(uint8_t *dst, const uint8_t *src, int w);
; and
; void <fmt>toUV_<opt>(uint8_t *dstU, uint8_t *dstV, const uint8_t *src,
;                      const uint8_t *unused, int w);
;-----------------------------------------------------------------------------

; %1 = a (aligned) or u (unaligned)
; %2 = yuyv or uyvy
%macro LOOP_YUYV_TO_Y 2
.loop_%1:
    mov%1          m0, [srcq+wq*2]        ; (byte) { Y0, U0, Y1, V0, ... }
    mov%1          m1, [srcq+wq*2+mmsize] ; (byte) { Y8, U4, Y9, V4, ... }
%ifidn %2, yuyv
    pand           m0, m2                 ; (word) { Y0, Y1, ..., Y7 }
    pand           m1, m2                 ; (word) { Y8, Y9, ..., Y15 }
%else ; uyvy
    psrlw          m0, 8                  ; (word) { Y0, Y1, ..., Y7 }
    psrlw          m1, 8                  ; (word) { Y8, Y9, ..., Y15 }
%endif ; yuyv/uyvy
    packuswb       m0, m1                 ; (byte) { Y0, ..., Y15 }
    mova    [dstq+wq], m0
    add            wq, mmsize
    jl .loop_%1
    REP_RET
%endmacro

; %1 = nr. of XMM registers
; %2 = yuyv or uyvy
; %3 = if specified, it means that unaligned and aligned code in loop
;      will be the same (i.e. YUYV+AVX), and thus we don't need to
;      split the loop in an aligned and unaligned case
%macro YUYV_TO_Y_FN 2-3
cglobal %2ToY, 3, 3, %1, dst, src, w
%ifdef ARCH_X86_64
    movsxd         wq, wd
%endif
    add          dstq, wq
%if mmsize == 16
    test         srcq, 15
%endif
    lea          srcq, [srcq+wq*2]
%ifidn %2, yuyv
    pcmpeqb        m2, m2                 ; (byte) { 0xff } x 16
    psrlw          m2, 8                  ; (word) { 0x00ff } x 8
%endif ; yuyv
%if mmsize == 16
    jnz .loop_u_start
    neg            wq
    LOOP_YUYV_TO_Y  a, %2
.loop_u_start:
    neg            wq
    LOOP_YUYV_TO_Y  u, %2
%else ; mmsize == 8
    neg            wq
    LOOP_YUYV_TO_Y  a, %2
%endif ; mmsize == 8/16
%endmacro

; %1 = a (aligned) or u (unaligned)
; %2 = yuyv or uyvy
%macro LOOP_YUYV_TO_UV 2
.loop_%1:
%ifidn %2, yuyv
    mov%1          m0, [srcq+wq*4]        ; (byte) { Y0, U0, Y1, V0, ... }
    mov%1          m1, [srcq+wq*4+mmsize] ; (byte) { Y8, U4, Y9, V4, ... }
    psrlw          m0, 8                  ; (word) { U0, V0, ..., U3, V3 }
    psrlw          m1, 8                  ; (word) { U4, V4, ..., U7, V7 }
%else ; uyvy
%if cpuflag(avx)
    vpand          m0, m2, [srcq+wq*4]        ; (word) { U0, V0, ..., U3, V3 }
    vpand          m1, m2, [srcq+wq*4+mmsize] ; (word) { U4, V4, ..., U7, V7 }
%else
    mov%1          m0, [srcq+wq*4]        ; (byte) { Y0, U0, Y1, V0, ... }
    mov%1          m1, [srcq+wq*4+mmsize] ; (byte) { Y8, U4, Y9, V4, ... }
    pand           m0, m2                 ; (word) { U0, V0, ..., U3, V3 }
    pand           m1, m2                 ; (word) { U4, V4, ..., U7, V7 }
%endif
%endif ; yuyv/uyvy
    packuswb       m0, m1                 ; (byte) { U0, V0, ..., U7, V7 }
    pand           m1, m0, m2             ; (word) { U0, U1, ..., U7 }
    psrlw          m0, 8                  ; (word) { V0, V1, ..., V7 }
%if mmsize == 16
    packuswb       m1, m0                 ; (byte) { U0, ... U7, V1, ... V7 }
    movh   [dstUq+wq], m1
    movhps [dstVq+wq], m1
%else ; mmsize == 8
    packuswb       m1, m1                 ; (byte) { U0, ... U3 }
    packuswb       m0, m0                 ; (byte) { V0, ... V3 }
    movh   [dstUq+wq], m1
    movh   [dstVq+wq], m0
%endif ; mmsize == 8/16
    add            wq, mmsize / 2
    jl .loop_%1
    REP_RET
%endmacro

; %1 = nr. of XMM registers
; %2 = yuyv or uyvy
; %3 = if specified, it means that unaligned and aligned code in loop
;      will be the same (i.e. UYVY+AVX), and thus we don't need to
;      split the loop in an aligned and unaligned case
%macro YUYV_TO_UV_FN 2-3
cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
%ifdef ARCH_X86_64
    movsxd         wq, r4m
%else ; x86-32
    mov            wq, r4m
%endif
    add         dstUq, wq
    add         dstVq, wq
%if mmsize == 16 && %0 == 2
    test         srcq, 15
%endif
    lea          srcq, [srcq+wq*4]
    pcmpeqb        m2, m2                 ; (byte) { 0xff } x 16
    psrlw          m2, 8                  ; (word) { 0x00ff } x 8
    ; NOTE: if uyvy+avx, u/a are identical
%if mmsize == 16 && %0 == 2
    jnz .loop_u_start
    neg            wq
    LOOP_YUYV_TO_UV a, %2
.loop_u_start:
    neg            wq
    LOOP_YUYV_TO_UV u, %2
%else ; mmsize == 8
    neg            wq
    LOOP_YUYV_TO_UV a, %2
%endif ; mmsize == 8/16
%endmacro

; %1 = a (aligned) or u (unaligned)
; %2 = nv12 or nv21
%macro LOOP_NVXX_TO_UV 2
.loop_%1:
    mov%1          m0, [srcq+wq*2]        ; (byte) { U0, V0, U1, V1, ... }
    mov%1          m1, [srcq+wq*2+mmsize] ; (byte) { U8, V8, U9, V9, ... }
    pand           m2, m0, m4             ; (word) { U0, U1, ..., U7 }
    pand           m3, m1, m4             ; (word) { U8, U9, ..., U15 }
    psrlw          m0, 8                  ; (word) { V0, V1, ..., V7 }
    psrlw          m1, 8                  ; (word) { V8, V9, ..., V15 }
    packuswb       m2, m3                 ; (byte) { U0, ..., U15 }
    packuswb       m0, m1                 ; (byte) { V0, ..., V15 }
%ifidn %2, nv12
    mova   [dstUq+wq], m2
    mova   [dstVq+wq], m0
%else ; nv21
    mova   [dstVq+wq], m2
    mova   [dstUq+wq], m0
%endif ; nv12/21
    add            wq, mmsize
    jl .loop_%1
    REP_RET
%endmacro

; %1 = nr. of XMM registers
; %2 = nv12 or nv21
%macro NVXX_TO_UV_FN 2
cglobal %2ToUV, 3, 4, %1, dstU, dstV, src, w
%ifdef ARCH_X86_64
    movsxd         wq, r4m
%else ; x86-32
    mov            wq, r4m
%endif
    add         dstUq, wq
    add         dstVq, wq
%if mmsize == 16
    test         srcq, 15
%endif
    lea          srcq, [srcq+wq*2]
    pcmpeqb        m4, m4                 ; (byte) { 0xff } x 16
    psrlw          m4, 8                  ; (word) { 0x00ff } x 8
%if mmsize == 16
    jnz .loop_u_start
    neg            wq
    LOOP_NVXX_TO_UV a, %2
.loop_u_start:
    neg            wq
    LOOP_NVXX_TO_UV u, %2
%else ; mmsize == 8
    neg            wq
    LOOP_NVXX_TO_UV a, %2
%endif ; mmsize == 8/16
%endmacro

%ifdef ARCH_X86_32
INIT_MMX mmx
YUYV_TO_Y_FN  0, yuyv
YUYV_TO_Y_FN  0, uyvy
YUYV_TO_UV_FN 0, yuyv
YUYV_TO_UV_FN 0, uyvy
NVXX_TO_UV_FN 0, nv12
NVXX_TO_UV_FN 0, nv21
%endif

INIT_XMM sse2
YUYV_TO_Y_FN  3, yuyv
YUYV_TO_Y_FN  2, uyvy
YUYV_TO_UV_FN 3, yuyv
YUYV_TO_UV_FN 3, uyvy
NVXX_TO_UV_FN 5, nv12
NVXX_TO_UV_FN 5, nv21

INIT_XMM avx
; in theory, we could write a yuy2-to-y using vpand (i.e. AVX), but
; that's not faster in practice
YUYV_TO_UV_FN 3, yuyv
YUYV_TO_UV_FN 3, uyvy, 1
NVXX_TO_UV_FN 5, nv12
NVXX_TO_UV_FN 5, nv21
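All of these functions share one loop idiom: the destination and source pointers are pre-advanced past the end of the row (add dstq, wq / lea srcq, ...), the width is negated, and the index counts up from -w to 0, so the add that advances the counter also sets the flags that jl tests, avoiding a separate compare; test srcq, 15 checks source alignment up front to pick the mova or movu loop body. A hedged C rendering of the idiom, illustrative only (the helper name is made up, and the asm processes mmsize bytes per step rather than one):

#include <stddef.h>
#include <stdint.h>

/* Hypothetical scalar rendering of the asm's negative-index loop. */
static void to_y_loop_idiom(uint8_t *dst, const uint8_t *src, ptrdiff_t w)
{
    dst += w;                   /* add  dstq, wq           */
    src += 2 * w;               /* lea  srcq, [srcq+wq*2]  */
    w = -w;                     /* neg  wq                 */
    do {
        dst[w] = src[2 * w];    /* one element per step here          */
    } while (++w < 0);          /* add wq, mmsize ; jl .loop_a/.loop_u */
}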
libswscale/x86/swscale_mmx.c
@@ -244,6 +244,26 @@ VSCALE_FUNCS(sse2, sse2);
 VSCALE_FUNC(16, sse4);
 VSCALE_FUNCS(avx, avx);
 
+#define INPUT_UV_FUNC(fmt, opt) \
+extern void ff_ ## fmt ## ToUV_ ## opt(uint8_t *dstU, uint8_t *dstV, \
+                                       const uint8_t *src, const uint8_t *unused1, \
+                                       int w, uint32_t *unused2)
+#define INPUT_FUNC(fmt, opt) \
+extern void ff_ ## fmt ## ToY_ ## opt(uint8_t *dst, const uint8_t *src, \
+                                      int w, uint32_t *unused); \
+INPUT_UV_FUNC(fmt, opt)
+#define INPUT_FUNCS(opt) \
+INPUT_FUNC(uyvy, opt); \
+INPUT_FUNC(yuyv, opt); \
+INPUT_UV_FUNC(nv12, opt); \
+INPUT_UV_FUNC(nv21, opt)
+
+#if ARCH_X86_32
+INPUT_FUNCS(mmx);
+#endif
+INPUT_FUNCS(sse2);
+INPUT_FUNCS(avx);
+
 void ff_sws_init_swScale_mmx(SwsContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -296,6 +316,30 @@ switch(c->dstBpc){ \
         ASSIGN_MMX_SCALE_FUNC(c->hyScale, c->hLumFilterSize, mmx, mmx);
         ASSIGN_MMX_SCALE_FUNC(c->hcScale, c->hChrFilterSize, mmx, mmx);
         ASSIGN_VSCALE_FUNC(c->yuv2plane1, mmx, mmx2, cpu_flags & AV_CPU_FLAG_MMX2);
+
+        switch (c->srcFormat) {
+        case PIX_FMT_Y400A:
+            c->lumToYV12 = ff_yuyvToY_mmx;
+            if (c->alpPixBuf)
+                c->alpToYV12 = ff_uyvyToY_mmx;
+            break;
+        case PIX_FMT_YUYV422:
+            c->lumToYV12 = ff_yuyvToY_mmx;
+            c->chrToYV12 = ff_yuyvToUV_mmx;
+            break;
+        case PIX_FMT_UYVY422:
+            c->lumToYV12 = ff_uyvyToY_mmx;
+            c->chrToYV12 = ff_uyvyToUV_mmx;
+            break;
+        case PIX_FMT_NV12:
+            c->chrToYV12 = ff_nv12ToUV_mmx;
+            break;
+        case PIX_FMT_NV21:
+            c->chrToYV12 = ff_nv21ToUV_mmx;
+            break;
+        default:
+            break;
+        }
     }
     if (cpu_flags & AV_CPU_FLAG_MMX2) {
         ASSIGN_VSCALEX_FUNC(c->yuv2planeX, mmx2,);
@@ -314,6 +358,28 @@ switch(c->dstBpc){ \
         ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse2, sse2);
         ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse2,);
         ASSIGN_VSCALE_FUNC(c->yuv2plane1, sse2, sse2, 1);
+
+        switch (c->srcFormat) {
+        case PIX_FMT_Y400A:
+            c->lumToYV12 = ff_yuyvToY_sse2;
+            if (c->alpPixBuf)
+                c->alpToYV12 = ff_uyvyToY_sse2;
+            break;
+        case PIX_FMT_YUYV422:
+            c->lumToYV12 = ff_yuyvToY_sse2;
+            c->chrToYV12 = ff_yuyvToUV_sse2;
+            break;
+        case PIX_FMT_UYVY422:
+            c->lumToYV12 = ff_uyvyToY_sse2;
+            c->chrToYV12 = ff_uyvyToUV_sse2;
+            break;
+        case PIX_FMT_NV12:
+            c->chrToYV12 = ff_nv12ToUV_sse2;
+            break;
+        case PIX_FMT_NV21:
+            c->chrToYV12 = ff_nv21ToUV_sse2;
+            break;
+        }
     }
     if (cpu_flags & AV_CPU_FLAG_SSSE3) {
         ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, ssse3, ssse3);
@@ -332,6 +398,23 @@ switch(c->dstBpc){ \
     if (cpu_flags & AV_CPU_FLAG_AVX) {
         ASSIGN_VSCALEX_FUNC(c->yuv2planeX, avx,);
         ASSIGN_VSCALE_FUNC(c->yuv2plane1, avx, avx, 1);
+
+        switch (c->srcFormat) {
+        case PIX_FMT_YUYV422:
+            c->chrToYV12 = ff_yuyvToUV_avx;
+            break;
+        case PIX_FMT_UYVY422:
+            c->chrToYV12 = ff_uyvyToUV_avx;
+            break;
+        case PIX_FMT_NV12:
+            c->chrToYV12 = ff_nv12ToUV_avx;
+            break;
+        case PIX_FMT_NV21:
+            c->chrToYV12 = ff_nv21ToUV_avx;
+            break;
+        default:
+            break;
+        }
     }
 #endif
 }
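For clarity, INPUT_FUNC(yuyv, sse2) above expands to the following pair of declarations (expansion written out here; line breaks are illustrative):

extern void ff_yuyvToY_sse2(uint8_t *dst, const uint8_t *src,
                            int w, uint32_t *unused);
extern void ff_yuyvToUV_sse2(uint8_t *dstU, uint8_t *dstV,
                             const uint8_t *src, const uint8_t *unused1,
                             int w, uint32_t *unused2);

Since the MMX, SSE2 and AVX blocks each run whenever their CPU flag is set, a later block overwrites the function pointers installed by an earlier one, so the fastest supported variant ends up in c->lumToYV12/c->chrToYV12.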
libswscale/x86/swscale_template.c
@@ -1361,147 +1361,6 @@ static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
     }
 }
 
-#if !COMPILE_TEMPLATE_MMX2
-//FIXME yuy2* can read up to 7 samples too much
-
-static void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src,
-                            int width, uint32_t *unused)
-{
-    __asm__ volatile(
-    "movq "MANGLE(bm01010101)", %%mm2    \n\t"
-    "mov %0, %%"REG_a"                   \n\t"
-    "1:                                  \n\t"
-    "movq (%1, %%"REG_a",2), %%mm0       \n\t"
-    "movq 8(%1, %%"REG_a",2), %%mm1      \n\t"
-    "pand %%mm2, %%mm0                   \n\t"
-    "pand %%mm2, %%mm1                   \n\t"
-    "packuswb %%mm1, %%mm0               \n\t"
-    "movq %%mm0, (%2, %%"REG_a")         \n\t"
-    "add $8, %%"REG_a"                   \n\t"
-    " js 1b                              \n\t"
-    : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
-    : "%"REG_a
-    );
-}
-
-static void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV,
-                             const uint8_t *src1, const uint8_t *src2,
-                             int width, uint32_t *unused)
-{
-    __asm__ volatile(
-    "movq "MANGLE(bm01010101)", %%mm4    \n\t"
-    "mov %0, %%"REG_a"                   \n\t"
-    "1:                                  \n\t"
-    "movq (%1, %%"REG_a",4), %%mm0       \n\t"
-    "movq 8(%1, %%"REG_a",4), %%mm1      \n\t"
-    "psrlw $8, %%mm0                     \n\t"
-    "psrlw $8, %%mm1                     \n\t"
-    "packuswb %%mm1, %%mm0               \n\t"
-    "movq %%mm0, %%mm1                   \n\t"
-    "psrlw $8, %%mm0                     \n\t"
-    "pand %%mm4, %%mm1                   \n\t"
-    "packuswb %%mm0, %%mm0               \n\t"
-    "packuswb %%mm1, %%mm1               \n\t"
-    "movd %%mm0, (%3, %%"REG_a")         \n\t"
-    "movd %%mm1, (%2, %%"REG_a")         \n\t"
-    "add $4, %%"REG_a"                   \n\t"
-    " js 1b                              \n\t"
-    : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
-    : "%"REG_a
-    );
-    assert(src1 == src2);
-}
-
-/* This is almost identical to the previous, end exists only because
- * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
-static void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src,
-                            int width, uint32_t *unused)
-{
-    __asm__ volatile(
-    "mov %0, %%"REG_a"                   \n\t"
-    "1:                                  \n\t"
-    "movq (%1, %%"REG_a",2), %%mm0       \n\t"
-    "movq 8(%1, %%"REG_a",2), %%mm1      \n\t"
-    "psrlw $8, %%mm0                     \n\t"
-    "psrlw $8, %%mm1                     \n\t"
-    "packuswb %%mm1, %%mm0               \n\t"
-    "movq %%mm0, (%2, %%"REG_a")         \n\t"
-    "add $8, %%"REG_a"                   \n\t"
-    " js 1b                              \n\t"
-    : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
-    : "%"REG_a
-    );
-}
-
-static void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV,
-                             const uint8_t *src1, const uint8_t *src2,
-                             int width, uint32_t *unused)
-{
-    __asm__ volatile(
-    "movq "MANGLE(bm01010101)", %%mm4    \n\t"
-    "mov %0, %%"REG_a"                   \n\t"
-    "1:                                  \n\t"
-    "movq (%1, %%"REG_a",4), %%mm0       \n\t"
-    "movq 8(%1, %%"REG_a",4), %%mm1      \n\t"
-    "pand %%mm4, %%mm0                   \n\t"
-    "pand %%mm4, %%mm1                   \n\t"
-    "packuswb %%mm1, %%mm0               \n\t"
-    "movq %%mm0, %%mm1                   \n\t"
-    "psrlw $8, %%mm0                     \n\t"
-    "pand %%mm4, %%mm1                   \n\t"
-    "packuswb %%mm0, %%mm0               \n\t"
-    "packuswb %%mm1, %%mm1               \n\t"
-    "movd %%mm0, (%3, %%"REG_a")         \n\t"
-    "movd %%mm1, (%2, %%"REG_a")         \n\t"
-    "add $4, %%"REG_a"                   \n\t"
-    " js 1b                              \n\t"
-    : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
-    : "%"REG_a
-    );
-    assert(src1 == src2);
-}
-
-static av_always_inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
-                                              const uint8_t *src, int width)
-{
-    __asm__ volatile(
-    "movq "MANGLE(bm01010101)", %%mm4    \n\t"
-    "mov %0, %%"REG_a"                   \n\t"
-    "1:                                  \n\t"
-    "movq (%1, %%"REG_a",2), %%mm0       \n\t"
-    "movq 8(%1, %%"REG_a",2), %%mm1      \n\t"
-    "movq %%mm0, %%mm2                   \n\t"
-    "movq %%mm1, %%mm3                   \n\t"
-    "pand %%mm4, %%mm0                   \n\t"
-    "pand %%mm4, %%mm1                   \n\t"
-    "psrlw $8, %%mm2                     \n\t"
-    "psrlw $8, %%mm3                     \n\t"
-    "packuswb %%mm1, %%mm0               \n\t"
-    "packuswb %%mm3, %%mm2               \n\t"
-    "movq %%mm0, (%2, %%"REG_a")         \n\t"
-    "movq %%mm2, (%3, %%"REG_a")         \n\t"
-    "add $8, %%"REG_a"                   \n\t"
-    " js 1b                              \n\t"
-    : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
-    : "%"REG_a
-    );
-}
-
-static void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
-                             const uint8_t *src1, const uint8_t *src2,
-                             int width, uint32_t *unused)
-{
-    RENAME(nvXXtoUV)(dstU, dstV, src1, width);
-}
-
-static void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
-                             const uint8_t *src1, const uint8_t *src2,
-                             int width, uint32_t *unused)
-{
-    RENAME(nvXXtoUV)(dstV, dstU, src1, width);
-}
-#endif /* !COMPILE_TEMPLATE_MMX2 */
-
 static av_always_inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src,
                                                   int width, enum PixelFormat srcFormat)
 {
@@ -1856,15 +1715,6 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
 #endif /* COMPILE_TEMPLATE_MMX2 */
     }
 
-#if !COMPILE_TEMPLATE_MMX2
-    switch (srcFormat) {
-    case PIX_FMT_YUYV422: c->chrToYV12 = RENAME(yuy2ToUV); break;
-    case PIX_FMT_UYVY422: c->chrToYV12 = RENAME(uyvyToUV); break;
-    case PIX_FMT_NV12: c->chrToYV12 = RENAME(nv12ToUV); break;
-    case PIX_FMT_NV21: c->chrToYV12 = RENAME(nv21ToUV); break;
-    default: break;
-    }
-#endif /* !COMPILE_TEMPLATE_MMX2 */
     if (!c->chrSrcHSubSample) {
         switch (srcFormat) {
         case PIX_FMT_BGR24: c->chrToYV12 = RENAME(bgr24ToUV); break;
@@ -1874,21 +1724,8 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
     }
 
     switch (srcFormat) {
-#if !COMPILE_TEMPLATE_MMX2
-    case PIX_FMT_YUYV422:
-    case PIX_FMT_Y400A: c->lumToYV12 = RENAME(yuy2ToY); break;
-    case PIX_FMT_UYVY422: c->lumToYV12 = RENAME(uyvyToY); break;
-#endif /* !COMPILE_TEMPLATE_MMX2 */
     case PIX_FMT_BGR24: c->lumToYV12 = RENAME(bgr24ToY); break;
     case PIX_FMT_RGB24: c->lumToYV12 = RENAME(rgb24ToY); break;
     default: break;
     }
-#if !COMPILE_TEMPLATE_MMX2
-    if (c->alpPixBuf) {
-        switch (srcFormat) {
-        case PIX_FMT_Y400A: c->alpToYV12 = RENAME(yuy2ToY); break;
-        default: break;
-        }
-    }
-#endif /* !COMPILE_TEMPLATE_MMX2 */
 }