ffmpeg.wasm-core, commit b5d08c27
Authored Jan 27, 2012 by Ronald S. Bultje
swscale: convert rgb/bgr24ToY/UV_mmx functions from inline asm to yasm.
Also implement sse2/ssse3/avx versions.
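Per pixel, these routines compute a Q15 fixed-point dot product against BT.601 coefficients. Below is a minimal scalar sketch of the luma path, built from the coefficients and rounding constant defined in input.asm; the function name and loop structure are illustrative, not part of the diff:

    #include <stdint.h>

    /* Q15 BT.601 luma coefficients, as %define'd in input.asm. */
    #define RY 0x20DE
    #define GY 0x4087
    #define BY 0x0C88

    /* rgb24 stores bytes as R, G, B; the SIMD versions process 4 (mmx) or
     * 8 (sse2/ssse3/avx) pixels per iteration, this models one at a time. */
    static void rgb24ToY_ref(uint8_t *dst, const uint8_t *src, int w)
    {
        for (int i = 0; i < w; i++, src += 3)
            /* 0x84000 = 16.5 << 15: +16 video-range offset plus 0.5 rounding */
            dst[i] = (RY * src[0] + GY * src[1] + BY * src[2] + 0x84000) >> 15;
    }

The bgr24 variants use the same weights with R and B swapped; the ToUV functions follow the same pattern with the U/V coefficient sets and a 128.5 offset.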
parent 3b15a6d7
Showing 3 changed files with 300 additions and 178 deletions:

  libswscale/x86/input.asm           +271  -0
  libswscale/x86/swscale_mmx.c        +28  -20
  libswscale/x86/swscale_template.c    +1  -158
libswscale/x86/input.asm
@@ -26,8 +26,279 @@

SECTION_RODATA

%define RY 0x20DE
%define GY 0x4087
%define BY 0x0C88
%define RU 0xECFF
%define GU 0xDAC8
%define BU 0x3838
%define RV 0x3838
%define GV 0xD0E3
%define BV 0xF6E4

rgb_Yrnd:        times 4 dd 0x84000    ; 16.5 << 15
rgb_UVrnd:       times 4 dd 0x404000   ; 128.5 << 15
bgr_Ycoeff_12x4: times 2 dw BY, GY, 0, BY
bgr_Ycoeff_3x56: times 2 dw RY, 0, GY, RY
rgb_Ycoeff_12x4: times 2 dw RY, GY, 0, RY
rgb_Ycoeff_3x56: times 2 dw BY, 0, GY, BY
bgr_Ucoeff_12x4: times 2 dw BU, GU, 0, BU
bgr_Ucoeff_3x56: times 2 dw RU, 0, GU, RU
rgb_Ucoeff_12x4: times 2 dw RU, GU, 0, RU
rgb_Ucoeff_3x56: times 2 dw BU, 0, GU, BU
bgr_Vcoeff_12x4: times 2 dw BV, GV, 0, BV
bgr_Vcoeff_3x56: times 2 dw RV, 0, GV, RV
rgb_Vcoeff_12x4: times 2 dw RV, GV, 0, RV
rgb_Vcoeff_3x56: times 2 dw BV, 0, GV, BV

shuf_rgb_12x4:   db 0, 0x80, 1, 0x80,  2, 0x80,  3, 0x80, \
                    6, 0x80, 7, 0x80,  8, 0x80,  9, 0x80
shuf_rgb_3x56:   db 2, 0x80, 3, 0x80,  4, 0x80,  5, 0x80, \
                    8, 0x80, 9, 0x80, 10, 0x80, 11, 0x80
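Read as signed 16-bit values, the %define constants above are the BT.601 limited-range matrix in Q15 fixed point; the derivation below is inferred from the values, it is not stated in the patch:

\[
\mathrm{RY} = \mathtt{0x20DE} = 8414 \approx 0.299 \cdot \tfrac{219}{255} \cdot 2^{15},\qquad
\mathrm{GY} = 16519 \approx 0.587 \cdot \tfrac{219}{255} \cdot 2^{15},\qquad
\mathrm{BY} = 3208 \approx 0.114 \cdot \tfrac{219}{255} \cdot 2^{15}
\]
\[
Y = \left(\mathrm{RY}\,R + \mathrm{GY}\,G + \mathrm{BY}\,B + \underbrace{16.5 \cdot 2^{15}}_{\mathtt{rgb\_Yrnd}}\right) \gg 15 \;\in\; [16, 235]
\]

The chroma weights are negative where expected (\(\mathrm{RU} = \mathtt{0xECFF} = -4865 \approx -0.169 \cdot \tfrac{224}{255} \cdot 2^{15}\)), and \(\mathtt{rgb\_UVrnd} = 128.5 \cdot 2^{15}\) centers U and V on 128.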
SECTION .text
;-----------------------------------------------------------------------------
; RGB to Y/UV.
;
; void <fmt>ToY_<opt>(uint8_t *dst, const uint8_t *src, int w);
; and
; void <fmt>ToUV_<opt>(uint8_t *dstU, uint8_t *dstV, const uint8_t *src,
; const uint8_t *unused, int w);
;-----------------------------------------------------------------------------
; %1 = nr. of XMM registers
; %2 = rgb or bgr
%macro RGB24_TO_Y_FN 2-3
cglobal %2 %+ 24ToY, 3, 3, %1, dst, src, w
%if mmsize == 8
    mova           m5, [%2_Ycoeff_12x4]
    mova           m6, [%2_Ycoeff_3x56]
%define coeff1 m5
%define coeff2 m6
%elif ARCH_X86_64
    mova           m8, [%2_Ycoeff_12x4]
    mova           m9, [%2_Ycoeff_3x56]
%define coeff1 m8
%define coeff2 m9
%else ; x86-32 && mmsize == 16
%define coeff1 [%2_Ycoeff_12x4]
%define coeff2 [%2_Ycoeff_3x56]
%endif ; x86-32/64 && mmsize == 8/16
%if (ARCH_X86_64 || mmsize == 8) && %0 == 3
    jmp mangle(program_name %+ _ %+ %3 %+ 24ToY %+ SUFFIX).body
%else ; (ARCH_X86_64 && %0 == 3) || mmsize == 8
.body:
%if cpuflag(ssse3)
    mova           m7, [shuf_rgb_12x4]
%define shuf_rgb1 m7
%if ARCH_X86_64
    mova          m10, [shuf_rgb_3x56]
%define shuf_rgb2 m10
%else ; x86-32
%define shuf_rgb2 [shuf_rgb_3x56]
%endif ; x86-32/64
%endif ; cpuflag(ssse3)
%if ARCH_X86_64
    movsxd         wq, wd
%endif
    add          dstq, wq
    neg            wq
%if notcpuflag(ssse3)
    pxor           m7, m7
%endif ; !cpuflag(ssse3)
    mova           m4, [rgb_Yrnd]
.loop:
%if cpuflag(ssse3)
    movu           m0, [srcq+0]      ; (byte) { Bx, Gx, Rx }[0-3]
    movu           m2, [srcq+12]     ; (byte) { Bx, Gx, Rx }[4-7]
    pshufb         m1, m0, shuf_rgb2 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
    pshufb         m0, shuf_rgb1     ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
    pshufb         m3, m2, shuf_rgb2 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
    pshufb         m2, shuf_rgb1     ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
%else ; !cpuflag(ssse3)
    movd           m0, [srcq+0]      ; (byte) { B0, G0, R0, B1 }
    movd           m1, [srcq+2]      ; (byte) { R0, B1, G1, R1 }
    movd           m2, [srcq+6]      ; (byte) { B2, G2, R2, B3 }
    movd           m3, [srcq+8]      ; (byte) { R2, B3, G3, R3 }
%if mmsize == 16 ; i.e. sse2
    punpckldq      m0, m2            ; (byte) { B0, G0, R0, B1, B2, G2, R2, B3 }
    punpckldq      m1, m3            ; (byte) { R0, B1, G1, R1, R2, B3, G3, R3 }
    movd           m2, [srcq+12]     ; (byte) { B4, G4, R4, B5 }
    movd           m3, [srcq+14]     ; (byte) { R4, B5, G5, R5 }
    movd           m5, [srcq+18]     ; (byte) { B6, G6, R6, B7 }
    movd           m6, [srcq+20]     ; (byte) { R6, B7, G7, R7 }
    punpckldq      m2, m5            ; (byte) { B4, G4, R4, B5, B6, G6, R6, B7 }
    punpckldq      m3, m6            ; (byte) { R4, B5, G5, R5, R6, B7, G7, R7 }
%endif ; mmsize == 16
    punpcklbw      m0, m7            ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
    punpcklbw      m1, m7            ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
    punpcklbw      m2, m7            ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
    punpcklbw      m3, m7            ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
%endif ; cpuflag(ssse3)
    add          srcq, 3 * mmsize / 2
    pmaddwd        m0, coeff1        ; (dword) { B0*BY + G0*GY, B1*BY, B2*BY + G2*GY, B3*BY }
    pmaddwd        m1, coeff2        ; (dword) { R0*RY, G1*GY + R1*RY, R2*RY, G3*GY + R3*RY }
    pmaddwd        m2, coeff1        ; (dword) { B4*BY + G4*GY, B5*BY, B6*BY + G6*GY, B7*BY }
    pmaddwd        m3, coeff2        ; (dword) { R4*RY, G5*GY + R5*RY, R6*RY, G7*GY + R7*RY }
    paddd          m0, m1            ; (dword) { Bx*BY + Gx*GY + Rx*RY }[0-3]
    paddd          m2, m3            ; (dword) { Bx*BY + Gx*GY + Rx*RY }[4-7]
    paddd          m0, m4            ; += rgb_Yrnd, i.e. (dword) { Y[0-3] }
    paddd          m2, m4            ; += rgb_Yrnd, i.e. (dword) { Y[4-7] }
    psrad          m0, 15
    psrad          m2, 15
    packssdw       m0, m2            ; (word) { Y[0-7] }
    packuswb       m0, m0            ; (byte) { Y[0-7] }
    movh    [dstq+wq], m0
    add            wq, mmsize / 2
    jl .loop
    REP_RET
%endif ; (ARCH_X86_64 && %0 == 3) || mmsize == 8
%endmacro
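The _12x4/_3x56 coefficient pairing exists because pmaddwd multiplies adjacent word lanes and sums them in pairs; two shuffled pixel vectors times the two interleaved coefficient vectors produce exactly one Bn*BY + Gn*GY + Rn*RY term per dword lane. A C model of the idea, using the bgr24 byte order and hypothetical helper names (not part of the patch):

    #include <stdint.h>

    /* Models pmaddwd: d[i] = a[2i]*b[2i] + a[2i+1]*b[2i+1]. */
    static void pmaddwd4(int32_t d[4], const int16_t a[8], const int16_t b[8])
    {
        for (int i = 0; i < 4; i++)
            d[i] = a[2 * i] * b[2 * i] + a[2 * i + 1] * b[2 * i + 1];
    }

    /* Y for bgr24 pixels 0-3, given the two unpacked word vectors built by
     * the loop: px_a = { B0,G0,R0,B1, B2,G2,R2,B3 },
     *           px_b = { R0,B1,G1,R1, R2,B3,G3,R3 }. */
    static void y_for_4px(int32_t y[4], const int16_t px_a[8], const int16_t px_b[8])
    {
        static const int16_t c1[8] = /* bgr_Ycoeff_12x4 = { BY, GY, 0, BY } x2 */
            { 0x0C88, 0x4087, 0, 0x0C88, 0x0C88, 0x4087, 0, 0x0C88 };
        static const int16_t c2[8] = /* bgr_Ycoeff_3x56 = { RY, 0, GY, RY } x2 */
            { 0x20DE, 0, 0x4087, 0x20DE, 0x20DE, 0, 0x4087, 0x20DE };
        int32_t t1[4], t2[4];
        pmaddwd4(t1, px_a, c1); /* { B0*BY+G0*GY, B1*BY, B2*BY+G2*GY, B3*BY } */
        pmaddwd4(t2, px_b, c2); /* { R0*RY, G1*GY+R1*RY, R2*RY, G3*GY+R3*RY } */
        for (int i = 0; i < 4; i++)
            y[i] = (t1[i] + t2[i] + 0x84000) >> 15; /* paddd rgb_Yrnd; psrad 15 */
    }

Adding the two partial vectors lines up so every lane holds a complete Y term, e.g. lane 1 gets B1*BY from t1 plus G1*GY + R1*RY from t2.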
; %1 = nr. of XMM registers
; %2 = rgb or bgr
%macro RGB24_TO_UV_FN 2-3
cglobal %2 %+ 24ToUV, 3, 4, %1, dstU, dstV, src, w
%if ARCH_X86_64
    mova           m8, [%2_Ucoeff_12x4]
    mova           m9, [%2_Ucoeff_3x56]
    mova          m10, [%2_Vcoeff_12x4]
    mova          m11, [%2_Vcoeff_3x56]
%define coeffU1 m8
%define coeffU2 m9
%define coeffV1 m10
%define coeffV2 m11
%else ; x86-32
%define coeffU1 [%2_Ucoeff_12x4]
%define coeffU2 [%2_Ucoeff_3x56]
%define coeffV1 [%2_Vcoeff_12x4]
%define coeffV2 [%2_Vcoeff_3x56]
%endif ; x86-32/64
%if ARCH_X86_64 && %0 == 3
    jmp mangle(program_name %+ _ %+ %3 %+ 24ToUV %+ SUFFIX).body
%else ; ARCH_X86_64 && %0 == 3
.body:
%if cpuflag(ssse3)
    mova           m7, [shuf_rgb_12x4]
%define shuf_rgb1 m7
%if ARCH_X86_64
    mova          m12, [shuf_rgb_3x56]
%define shuf_rgb2 m12
%else ; x86-32
%define shuf_rgb2 [shuf_rgb_3x56]
%endif ; x86-32/64
%endif ; cpuflag(ssse3)
%if ARCH_X86_64
    movsxd         wq, dword r4m
%else ; x86-32
    mov            wq, r4m
%endif
    add         dstUq, wq
    add         dstVq, wq
    neg            wq
    mova           m6, [rgb_UVrnd]
%if notcpuflag(ssse3)
    pxor           m7, m7
%endif
.loop:
%if cpuflag(ssse3)
    movu           m0, [srcq+0]      ; (byte) { Bx, Gx, Rx }[0-3]
    movu           m4, [srcq+12]     ; (byte) { Bx, Gx, Rx }[4-7]
    pshufb         m1, m0, shuf_rgb2 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
    pshufb         m0, shuf_rgb1     ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
%else ; !cpuflag(ssse3)
    movd           m0, [srcq+0]      ; (byte) { B0, G0, R0, B1 }
    movd           m1, [srcq+2]      ; (byte) { R0, B1, G1, R1 }
    movd           m4, [srcq+6]      ; (byte) { B2, G2, R2, B3 }
    movd           m5, [srcq+8]      ; (byte) { R2, B3, G3, R3 }
%if mmsize == 16
    punpckldq      m0, m4            ; (byte) { B0, G0, R0, B1, B2, G2, R2, B3 }
    punpckldq      m1, m5            ; (byte) { R0, B1, G1, R1, R2, B3, G3, R3 }
    movd           m4, [srcq+12]     ; (byte) { B4, G4, R4, B5 }
    movd           m5, [srcq+14]     ; (byte) { R4, B5, G5, R5 }
%endif ; mmsize == 16
    punpcklbw      m0, m7            ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
    punpcklbw      m1, m7            ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
%endif ; cpuflag(ssse3)
    pmaddwd        m2, m0, coeffV1   ; (dword) { B0*BV + G0*GV, B1*BV, B2*BV + G2*GV, B3*BV }
    pmaddwd        m3, m1, coeffV2   ; (dword) { R0*RV, G1*GV + R1*RV, R2*RV, G3*GV + R3*RV }
    pmaddwd        m0, coeffU1       ; (dword) { B0*BU + G0*GU, B1*BU, B2*BU + G2*GU, B3*BU }
    pmaddwd        m1, coeffU2       ; (dword) { R0*RU, G1*GU + R1*RU, R2*RU, G3*GU + R3*RU }
    paddd          m0, m1            ; (dword) { Bx*BU + Gx*GU + Rx*RU }[0-3]
    paddd          m2, m3            ; (dword) { Bx*BV + Gx*GV + Rx*RV }[0-3]
%if cpuflag(ssse3)
    pshufb         m5, m4, shuf_rgb2 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
    pshufb         m4, shuf_rgb1     ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
%else ; !cpuflag(ssse3)
%if mmsize == 16
    movd           m1, [srcq+18]     ; (byte) { B6, G6, R6, B7 }
    movd           m3, [srcq+20]     ; (byte) { R6, B7, G7, R7 }
    punpckldq      m4, m1            ; (byte) { B4, G4, R4, B5, B6, G6, R6, B7 }
    punpckldq      m5, m3            ; (byte) { R4, B5, G5, R5, R6, B7, G7, R7 }
%endif ; mmsize == 16 && !cpuflag(ssse3)
    punpcklbw      m4, m7            ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
    punpcklbw      m5, m7            ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
%endif ; cpuflag(ssse3)
    add          srcq, 3 * mmsize / 2
    pmaddwd        m1, m4, coeffU1   ; (dword) { B4*BU + G4*GU, B5*BU, B6*BU + G6*GU, B7*BU }
    pmaddwd        m3, m5, coeffU2   ; (dword) { R4*RU, G5*GU + R5*RU, R6*RU, G7*GU + R7*RU }
    pmaddwd        m4, coeffV1       ; (dword) { B4*BV + G4*GV, B5*BV, B6*BV + G6*GV, B7*BV }
    pmaddwd        m5, coeffV2       ; (dword) { R4*RV, G5*GV + R5*RV, R6*RV, G7*GV + R7*RV }
    paddd          m1, m3            ; (dword) { Bx*BU + Gx*GU + Rx*RU }[4-7]
    paddd          m4, m5            ; (dword) { Bx*BV + Gx*GV + Rx*RV }[4-7]
    paddd          m0, m6            ; += rgb_UVrnd, i.e. (dword) { U[0-3] }
    paddd          m2, m6            ; += rgb_UVrnd, i.e. (dword) { V[0-3] }
    paddd          m1, m6            ; += rgb_UVrnd, i.e. (dword) { U[4-7] }
    paddd          m4, m6            ; += rgb_UVrnd, i.e. (dword) { V[4-7] }
    psrad          m0, 15
    psrad          m2, 15
    psrad          m1, 15
    psrad          m4, 15
    packssdw       m0, m1            ; (word) { U[0-7] }
    packssdw       m2, m4            ; (word) { V[0-7] }
%if mmsize == 8
    packuswb       m0, m0            ; (byte) { U[0-3] }
    packuswb       m2, m2            ; (byte) { V[0-3] }
    movh   [dstUq+wq], m0
    movh   [dstVq+wq], m2
%else ; mmsize == 16
    packuswb       m0, m2            ; (byte) { U[0-7], V[0-7] }
    movh   [dstUq+wq], m0
    movhps [dstVq+wq], m0
%endif ; mmsize == 8/16
    add            wq, mmsize / 2
    jl .loop
    REP_RET
%endif ; ARCH_X86_64 && %0 == 3
%endmacro
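One detail worth noting in the SSE2+ tail above: after the shifts, U[0-7] and V[0-7] are packed into a single register by one packuswb, then split between the two destination planes with movh/movhps. A plain-C equivalent of that store, with illustrative names:

    #include <stdint.h>
    #include <string.h>

    static void store_uv8(uint8_t *dstU, uint8_t *dstV,
                          const int16_t u[8], const int16_t v[8])
    {
        uint8_t packed[16];                  /* one XMM register */
        for (int i = 0; i < 8; i++) {        /* packuswb m0, m2: saturate */
            int s;
            s = u[i]; packed[i]     = s < 0 ? 0 : s > 255 ? 255 : s;
            s = v[i]; packed[i + 8] = s < 0 ? 0 : s > 255 ? 255 : s;
        }
        memcpy(dstU, packed,     8);         /* movh   [dstUq+wq], m0 */
        memcpy(dstV, packed + 8, 8);         /* movhps [dstVq+wq], m0 */
    }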
%if ARCH_X86_32
INIT_MMX mmx
RGB24_TO_Y_FN 0, rgb
RGB24_TO_Y_FN 0, bgr, rgb
RGB24_TO_UV_FN 0, rgb
RGB24_TO_UV_FN 0, bgr, rgb
%endif

INIT_XMM sse2
RGB24_TO_Y_FN 10, rgb
RGB24_TO_Y_FN 10, bgr, rgb
RGB24_TO_UV_FN 12, rgb
RGB24_TO_UV_FN 12, bgr, rgb

INIT_XMM ssse3
RGB24_TO_Y_FN 11, rgb
RGB24_TO_Y_FN 11, bgr, rgb
RGB24_TO_UV_FN 13, rgb
RGB24_TO_UV_FN 13, bgr, rgb

INIT_XMM avx
RGB24_TO_Y_FN 11, rgb
RGB24_TO_Y_FN 11, bgr, rgb
RGB24_TO_UV_FN 13, rgb
RGB24_TO_UV_FN 13, bgr, rgb
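Each INIT_* block stamps out a full set of entry points (ff_rgb24ToY_sse2, ff_bgr24ToUV_avx, and so on). The three-argument invocations are the bgr variants: where the gating %if allows it, they only load their own coefficient set and then jmp into the matching rgb function's .body label, so the loop is emitted once per instruction set. A rough C analogy of that sharing, all names illustrative:

    #include <stdint.h>

    static const int16_t rgb_y_coeff[3] = { 0x20DE, 0x4087, 0x0C88 }; /* RY, GY, BY */
    static const int16_t bgr_y_coeff[3] = { 0x0C88, 0x4087, 0x20DE }; /* BY, GY, RY */

    /* The shared ".body": one loop, parameterized by coefficient order. */
    static void to_y_body(uint8_t *dst, const uint8_t *src, int w, const int16_t *c)
    {
        for (int i = 0; i < w; i++, src += 3)
            dst[i] = (c[0] * src[0] + c[1] * src[1] + c[2] * src[2] + 0x84000) >> 15;
    }

    void rgb24ToY(uint8_t *dst, const uint8_t *src, int w) { to_y_body(dst, src, w, rgb_y_coeff); }
    void bgr24ToY(uint8_t *dst, const uint8_t *src, int w) { to_y_body(dst, src, w, bgr_y_coeff); }

On x86-32 SSE2+ the jump cannot be taken because the coefficients are format-specific memory operands baked into the loop rather than registers, which is what the (ARCH_X86_64 || mmsize == 8) && %0 == 3 condition encodes.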
;-----------------------------------------------------------------------------
; YUYV/UYVY/NV12/NV21 packed pixel shuffling.
;
...
libswscale/x86/swscale_mmx.c
@@ -31,10 +31,6 @@ DECLARE_ASM_CONST(8, uint64_t, bF8)= 0xF8F8F8F8F8F8F8F8LL;
 DECLARE_ASM_CONST(8, uint64_t, bFC)= 0xFCFCFCFCFCFCFCFCLL;
 DECLARE_ASM_CONST(8, uint64_t, w10)= 0x0010001000100010LL;
 DECLARE_ASM_CONST(8, uint64_t, w02)= 0x0002000200020002LL;
-DECLARE_ASM_CONST(8, uint64_t, bm00001111)=0x00000000FFFFFFFFLL;
-DECLARE_ASM_CONST(8, uint64_t, bm00000111)=0x0000000000FFFFFFLL;
-DECLARE_ASM_CONST(8, uint64_t, bm11111000)=0xFFFFFFFFFF000000LL;
-DECLARE_ASM_CONST(8, uint64_t, bm01010101)=0x00FF00FF00FF00FFLL;
 const DECLARE_ALIGNED(8, uint64_t, ff_dither4)[2] = {
     0x0103010301030103LL,
...
@@ -68,19 +64,6 @@ DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YOffset) = 0x1010101010101010ULL;
 DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) = 0x8080808080808080ULL;
 DECLARE_ALIGNED(8, const uint64_t, ff_w1111)        = 0x0001000100010001ULL;
-DECLARE_ASM_CONST(8, uint64_t, ff_bgr24toY1Coeff) = 0x0C88000040870C88ULL;
-DECLARE_ASM_CONST(8, uint64_t, ff_bgr24toY2Coeff) = 0x20DE4087000020DEULL;
-DECLARE_ASM_CONST(8, uint64_t, ff_rgb24toY1Coeff) = 0x20DE0000408720DEULL;
-DECLARE_ASM_CONST(8, uint64_t, ff_rgb24toY2Coeff) = 0x0C88408700000C88ULL;
-DECLARE_ASM_CONST(8, uint64_t, ff_bgr24toYOffset) = 0x0008400000084000ULL;
-DECLARE_ASM_CONST(8, uint64_t, ff_bgr24toUV)[2][4] = {
-    { 0x38380000DAC83838ULL, 0xECFFDAC80000ECFFULL,
-      0xF6E40000D0E3F6E4ULL, 0x3838D0E300003838ULL },
-    { 0xECFF0000DAC8ECFFULL, 0x3838DAC800003838ULL,
-      0x38380000D0E33838ULL, 0xF6E4D0E30000F6E4ULL },
-};
-DECLARE_ASM_CONST(8, uint64_t, ff_bgr24toUVOffset) = 0x0040400000404000ULL;

 //MMX versions
 #if HAVE_MMX
 #undef RENAME
...
@@ -244,24 +227,29 @@ VSCALE_FUNCS(sse2, sse2);
 VSCALE_FUNC(16, sse4);
 VSCALE_FUNCS(avx, avx);

+#define INPUT_Y_FUNC(fmt, opt) \
+    extern void ff_ ## fmt ## ToY_ ## opt(uint8_t *dst, const uint8_t *src, \
+                                          int w, uint32_t *unused)
 #define INPUT_UV_FUNC(fmt, opt) \
     extern void ff_ ## fmt ## ToUV_ ## opt(uint8_t *dstU, uint8_t *dstV, \
                                            const uint8_t *src, const uint8_t *unused1, \
                                            int w, uint32_t *unused2)
 #define INPUT_FUNC(fmt, opt) \
-    extern void ff_ ## fmt ## ToY_ ## opt(uint8_t *dst, const uint8_t *src, \
-                                          int w, uint32_t *unused); \
+    INPUT_Y_FUNC(fmt, opt); \
     INPUT_UV_FUNC(fmt, opt)
 #define INPUT_FUNCS(opt) \
     INPUT_FUNC(uyvy, opt); \
     INPUT_FUNC(yuyv, opt); \
     INPUT_UV_FUNC(nv12, opt); \
-    INPUT_UV_FUNC(nv21, opt)
+    INPUT_UV_FUNC(nv21, opt); \
+    INPUT_FUNC(rgb24, opt); \
+    INPUT_FUNC(bgr24, opt)
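With rgb24 and bgr24 added to INPUT_FUNCS, each INPUT_FUNCS(opt) call below also declares the new yasm entry points. For example, INPUT_FUNC(rgb24, sse2) expands to the two prototypes shown here (expansion written out for illustration):

    extern void ff_rgb24ToY_sse2(uint8_t *dst, const uint8_t *src,
                                 int w, uint32_t *unused);
    extern void ff_rgb24ToUV_sse2(uint8_t *dstU, uint8_t *dstV,
                                  const uint8_t *src, const uint8_t *unused1,
                                  int w, uint32_t *unused2);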
 #if ARCH_X86_32
 INPUT_FUNCS(mmx);
 #endif
 INPUT_FUNCS(sse2);
 INPUT_FUNCS(ssse3);
 INPUT_FUNCS(avx);

 void ff_sws_init_swScale_mmx(SwsContext *c)
...
@@ -311,6 +299,12 @@ switch(c->dstBpc){ \
     case  9: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_9_  ## opt2; break; \
     default:                                     vscalefn = ff_yuv2plane1_8_  ## opt1; break; \
     }

+#define case_rgb(x, X, opt) \
+        case PIX_FMT_ ## X: \
+            c->lumToYV12 = ff_ ## x ## ToY_ ## opt; \
+            if (!c->chrSrcHSubSample) \
+                c->chrToYV12 = ff_ ## x ## ToUV_ ## opt; \
+            break
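The case_rgb macro keeps the per-instruction-set switch statements below compact, and, as in the template code being removed, it only hooks up chroma conversion when the source needs no horizontal chroma subsampling. For illustration, case_rgb(rgb24, RGB24, sse2); expands to:

        case PIX_FMT_RGB24:
            c->lumToYV12 = ff_rgb24ToY_sse2;
            if (!c->chrSrcHSubSample)
                c->chrToYV12 = ff_rgb24ToUV_sse2;
            break;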
 #if ARCH_X86_32
     if (cpu_flags & AV_CPU_FLAG_MMX) {
         ASSIGN_MMX_SCALE_FUNC(c->hyScale, c->hLumFilterSize, mmx, mmx);
...
@@ -337,6 +331,8 @@ switch(c->dstBpc){ \
         case PIX_FMT_NV21:
             c->chrToYV12 = ff_nv21ToUV_mmx;
             break;
+        case_rgb(rgb24, RGB24, mmx);
+        case_rgb(bgr24, BGR24, mmx);
         default:
             break;
         }
...
@@ -379,11 +375,21 @@ switch(c->dstBpc){ \
         case PIX_FMT_NV21:
             c->chrToYV12 = ff_nv21ToUV_sse2;
             break;
+        case_rgb(rgb24, RGB24, sse2);
+        case_rgb(bgr24, BGR24, sse2);
         default:
             break;
         }
     }
+    if (cpu_flags & AV_CPU_FLAG_SSSE3) {
+        ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, ssse3, ssse3);
+        ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, ssse3, ssse3);
+        switch (c->srcFormat) {
+        case_rgb(rgb24, RGB24, ssse3);
+        case_rgb(bgr24, BGR24, ssse3);
+        default:
+            break;
+        }
+    }
     if (cpu_flags & AV_CPU_FLAG_SSE4) {
         /* Xto15 don't need special sse4 functions */
...
@@ -412,6 +418,8 @@ switch(c->dstBpc){ \
         case PIX_FMT_NV21:
             c->chrToYV12 = ff_nv21ToUV_avx;
             break;
+        case_rgb(rgb24, RGB24, avx);
+        case_rgb(bgr24, BGR24, avx);
         default:
             break;
         }
...
libswscale/x86/swscale_template.c
@@ -1361,148 +1361,6 @@ static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
     }
 }

-static av_always_inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src,
-                                                  int width, enum PixelFormat srcFormat)
-{
-    if (srcFormat == PIX_FMT_BGR24) {
-        __asm__ volatile(
-            "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5    \n\t"
-            "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6    \n\t"
-            :
-        );
-    } else {
-        __asm__ volatile(
-            "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5    \n\t"
-            "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6    \n\t"
-            :
-        );
-    }
-
-    __asm__ volatile(
-        "movq "MANGLE(ff_bgr24toYOffset)", %%mm4    \n\t"
-        "mov           %2, %%"REG_a"                \n\t"
-        "pxor       %%mm7, %%mm7                    \n\t"
-        "1:                                         \n\t"
-        PREFETCH"  64(%0)                           \n\t"
-        "movd        (%0), %%mm0                    \n\t"
-        "movd       2(%0), %%mm1                    \n\t"
-        "movd       6(%0), %%mm2                    \n\t"
-        "movd       8(%0), %%mm3                    \n\t"
-        "add          $12, %0                       \n\t"
-        "punpcklbw  %%mm7, %%mm0                    \n\t"
-        "punpcklbw  %%mm7, %%mm1                    \n\t"
-        "punpcklbw  %%mm7, %%mm2                    \n\t"
-        "punpcklbw  %%mm7, %%mm3                    \n\t"
-        "pmaddwd    %%mm5, %%mm0                    \n\t"
-        "pmaddwd    %%mm6, %%mm1                    \n\t"
-        "pmaddwd    %%mm5, %%mm2                    \n\t"
-        "pmaddwd    %%mm6, %%mm3                    \n\t"
-        "paddd      %%mm1, %%mm0                    \n\t"
-        "paddd      %%mm3, %%mm2                    \n\t"
-        "paddd      %%mm4, %%mm0                    \n\t"
-        "paddd      %%mm4, %%mm2                    \n\t"
-        "psrad        $15, %%mm0                    \n\t"
-        "psrad        $15, %%mm2                    \n\t"
-        "packssdw   %%mm2, %%mm0                    \n\t"
-        "packuswb   %%mm0, %%mm0                    \n\t"
-        "movd %%mm0, (%1, %%"REG_a")                \n\t"
-        "add $4, %%"REG_a"                          \n\t"
-        " js 1b                                     \n\t"
-        : "+r" (src)
-        : "r" (dst+width), "g" ((x86_reg)-width)
-        : "%"REG_a
-    );
-}
-
-static void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src,
-                             int width, uint32_t *unused)
-{
-    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
-}
-
-static void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src,
-                             int width, uint32_t *unused)
-{
-    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
-}
-
-static av_always_inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV,
-                                                   const uint8_t *src, int width,
-                                                   enum PixelFormat srcFormat)
-{
-    __asm__ volatile(
-        "movq      24(%4), %%mm6                    \n\t"
-        "mov           %3, %%"REG_a"                \n\t"
-        "pxor       %%mm7, %%mm7                    \n\t"
-        "1:                                         \n\t"
-        PREFETCH"  64(%0)                           \n\t"
-        "movd        (%0), %%mm0                    \n\t"
-        "movd       2(%0), %%mm1                    \n\t"
-        "punpcklbw  %%mm7, %%mm0                    \n\t"
-        "punpcklbw  %%mm7, %%mm1                    \n\t"
-        "movq       %%mm0, %%mm2                    \n\t"
-        "movq       %%mm1, %%mm3                    \n\t"
-        "pmaddwd     (%4), %%mm0                    \n\t"
-        "pmaddwd    8(%4), %%mm1                    \n\t"
-        "pmaddwd   16(%4), %%mm2                    \n\t"
-        "pmaddwd    %%mm6, %%mm3                    \n\t"
-        "paddd      %%mm1, %%mm0                    \n\t"
-        "paddd      %%mm3, %%mm2                    \n\t"
-        "movd       6(%0), %%mm1                    \n\t"
-        "movd       8(%0), %%mm3                    \n\t"
-        "add          $12, %0                       \n\t"
-        "punpcklbw  %%mm7, %%mm1                    \n\t"
-        "punpcklbw  %%mm7, %%mm3                    \n\t"
-        "movq       %%mm1, %%mm4                    \n\t"
-        "movq       %%mm3, %%mm5                    \n\t"
-        "pmaddwd     (%4), %%mm1                    \n\t"
-        "pmaddwd    8(%4), %%mm3                    \n\t"
-        "pmaddwd   16(%4), %%mm4                    \n\t"
-        "pmaddwd    %%mm6, %%mm5                    \n\t"
-        "paddd      %%mm3, %%mm1                    \n\t"
-        "paddd      %%mm5, %%mm4                    \n\t"
-        "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3   \n\t"
-        "paddd      %%mm3, %%mm0                    \n\t"
-        "paddd      %%mm3, %%mm2                    \n\t"
-        "paddd      %%mm3, %%mm1                    \n\t"
-        "paddd      %%mm3, %%mm4                    \n\t"
-        "psrad        $15, %%mm0                    \n\t"
-        "psrad        $15, %%mm2                    \n\t"
-        "psrad        $15, %%mm1                    \n\t"
-        "psrad        $15, %%mm4                    \n\t"
-        "packssdw   %%mm1, %%mm0                    \n\t"
-        "packssdw   %%mm4, %%mm2                    \n\t"
-        "packuswb   %%mm0, %%mm0                    \n\t"
-        "packuswb   %%mm2, %%mm2                    \n\t"
-        "movd %%mm0, (%1, %%"REG_a")                \n\t"
-        "movd %%mm2, (%2, %%"REG_a")                \n\t"
-        "add $4, %%"REG_a"                          \n\t"
-        " js 1b                                     \n\t"
-        : "+r" (src)
-        : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width),
-          "r" (ff_bgr24toUV[srcFormat == PIX_FMT_RGB24])
-        : "%"REG_a
-    );
-}
-
-static void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV,
-                              const uint8_t *src1, const uint8_t *src2,
-                              int width, uint32_t *unused)
-{
-    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
-    assert(src1 == src2);
-}
-
-static void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV,
-                              const uint8_t *src1, const uint8_t *src2,
-                              int width, uint32_t *unused)
-{
-    assert(src1 == src2);
-    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
-}
-
 #if COMPILE_TEMPLATE_MMX2
 static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
                                  int dstWidth, const uint8_t *src,
...
@@ -1689,8 +1547,7 @@ static void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2,
 static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
 {
-    enum PixelFormat srcFormat = c->srcFormat,
-        dstFormat = c->dstFormat;
+    enum PixelFormat dstFormat = c->dstFormat;

     if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) &&
         dstFormat != PIX_FMT_NV12 && dstFormat != PIX_FMT_NV21) {
...
@@ -1762,18 +1619,4 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
     }
 #endif /* COMPILE_TEMPLATE_MMX2 */
     }

-    if (!c->chrSrcHSubSample) {
-        switch (srcFormat) {
-        case PIX_FMT_BGR24: c->chrToYV12 = RENAME(bgr24ToUV); break;
-        case PIX_FMT_RGB24: c->chrToYV12 = RENAME(rgb24ToUV); break;
-        default: break;
-        }
-    }
-
-    switch (srcFormat) {
-    case PIX_FMT_BGR24: c->lumToYV12 = RENAME(bgr24ToY); break;
-    case PIX_FMT_RGB24: c->lumToYV12 = RENAME(rgb24ToY); break;
-    default: break;
-    }
 }