Linshizhi / ffmpeg.wasm-core · Commits

Commit 5ce703a6, authored Apr 06, 2016 by Ronald S. Bultje
vf_colorspace: x86-64 SIMD (SSE2) optimizations.

parent 2e2e08a3
Showing 9 changed files with 1543 additions and 0 deletions
  libavfilter/colorspacedsp.c           +3    -0
  libavfilter/colorspacedsp.h           +3    -0
  libavfilter/x86/Makefile              +2    -0
  libavfilter/x86/colorspacedsp.asm     +1097 -0
  libavfilter/x86/colorspacedsp_init.c  +119  -0
  tests/checkasm/Makefile               +1    -0
  tests/checkasm/checkasm.c             +3    -0
  tests/checkasm/checkasm.h             +1    -0
  tests/checkasm/vf_colorspace.c        +314  -0
libavfilter/colorspacedsp.c

@@ -128,4 +128,7 @@ void ff_colorspacedsp_init(ColorSpaceDSPContext *dsp)
     init_yuv2yuv_fns(2, 12);

     dsp->multiply3x3 = multiply3x3_c;
+
+    if (ARCH_X86)
+        ff_colorspacedsp_x86_init(dsp);
 }
libavfilter/colorspacedsp.h

@@ -48,4 +48,7 @@ typedef struct ColorSpaceDSPContext {
 void ff_colorspacedsp_init(ColorSpaceDSPContext *dsp);

+/* internal */
+void ff_colorspacedsp_x86_init(ColorSpaceDSPContext *dsp);
+
 #endif /* AVFILTER_COLORSPACEDSP_H */
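The two hunks above wire the filter into FFmpeg's usual DSP-dispatch pattern: the generic init installs scalar C defaults, then hands the context to an architecture-specific init that may overwrite individual function pointers. A minimal sketch of that pattern, with hypothetical names (MyDSPContext, process_c, my_dsp_*) used purely for illustration:

#include <stdint.h>

typedef struct MyDSPContext {
    void (*process)(int16_t *buf, int n);
} MyDSPContext;

static void process_c(int16_t *buf, int n) { /* scalar fallback */ }

void my_dsp_x86_init(MyDSPContext *dsp);  /* may install SIMD versions */

void my_dsp_init(MyDSPContext *dsp)
{
    dsp->process = process_c;  /* always-correct default first */
    /* ARCH_X86 is a 0/1 constant (from FFmpeg's config.h in the real tree),
     * so on other architectures the call below is dead code and the compiler
     * eliminates it along with the undefined reference. */
    if (ARCH_X86)
        my_dsp_x86_init(dsp);
}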
libavfilter/x86/Makefile
View file @
5ce703a6
OBJS-$(CONFIG_BLEND_FILTER)
+=
x86/vf_blend_init.o
OBJS-$(CONFIG_BWDIF_FILTER)
+=
x86/vf_bwdif_init.o
OBJS-$(CONFIG_COLORSPACE_FILTER)
+=
x86/colorspacedsp_init.o
OBJS-$(CONFIG_EQ_FILTER)
+=
x86/vf_eq.o
OBJS-$(CONFIG_FSPP_FILTER)
+=
x86/vf_fspp_init.o
OBJS-$(CONFIG_GRADFUN_FILTER)
+=
x86/vf_gradfun_init.o
...
...
@@ -23,6 +24,7 @@ OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif_init.o
YASM-OBJS-$(CONFIG_BLEND_FILTER)
+=
x86/vf_blend.o
YASM-OBJS-$(CONFIG_BWDIF_FILTER)
+=
x86/vf_bwdif.o
YASM-OBJS-$(CONFIG_COLORSPACE_FILTER)
+=
x86/colorspacedsp.o
YASM-OBJS-$(CONFIG_FSPP_FILTER)
+=
x86/vf_fspp.o
YASM-OBJS-$(CONFIG_GRADFUN_FILTER)
+=
x86/vf_gradfun.o
YASM-OBJS-$(CONFIG_HQDN3D_FILTER)
+=
x86/vf_hqdn3d.o
...
...
libavfilter/x86/colorspacedsp.asm (new file, mode 100644)
;*****************************************************************************
;* x86-optimized functions for colorspace filter
;*
;* Copyright (C) 2016 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pw_1:      times 8 dw 1
pw_2:      times 8 dw 2
pw_4:      times 8 dw 4
pw_8:      times 8 dw 8
pw_16:     times 8 dw 16
pw_64:     times 8 dw 64
pw_128:    times 8 dw 128
pw_256:    times 8 dw 256
pw_512:    times 8 dw 512
pw_1023:   times 8 dw 1023
pw_1024:   times 8 dw 1024
pw_2048:   times 8 dw 2048
pw_4095:   times 8 dw 4095
pw_8192:   times 8 dw 8192
pw_16384:  times 8 dw 16384

pd_1:      times 4 dd 1
pd_2:      times 4 dd 2
pd_128:    times 4 dd 128
pd_512:    times 4 dd 512
pd_2048:   times 4 dd 2048
pd_8192:   times 4 dd 8192
pd_32768:  times 4 dd 32768
pd_131072: times 4 dd 131072

SECTION .text
; void ff_yuv2yuv_420p8to8_sse2(uint8_t *yuv_out[3], ptrdiff_t yuv_out_stride[3],
; uint8_t *yuv_in[3], ptrdiff_t yuv_in_stride[3],
; int w, int h, const int16_t yuv2yuv_coeffs[3][3][8],
; const int16_t yuv_offset[2][8])
%if ARCH_X86_64
%macro YUV2YUV_FN 4 ; in_bitdepth, out_bitdepth, log2_chroma_w (horiz), log2_chroma_h (vert)

%assign %%sh (14 + %1 - %2)
%assign %%rnd (1 << (%%sh - 1))
%assign %%uvinoff (128 << (%1 - 8))
%assign %%uvoutoff (128 << (%2 - 8))
%if %3 == 0
%assign %%ss 444
%elif %4 == 0
%assign %%ss 422
%else ; %4 == 1
%assign %%ss 420
%endif ; %3/%4
%if %2 != 8
%assign %%maxval (1 << %2) - 1
%endif ; %2 != 8

%assign %%ypsh %%sh - 1
%if %%ypsh > 14
%assign %%yoffsh %%ypsh - 13
%assign %%ypsh 14
%else
%assign %%yoffsh 1
%endif
%assign %%yprnd (1 << (%%yoffsh - 1))
%assign %%ypmul (1 << %%ypsh)

cglobal yuv2yuv_ %+ %%ss %+ p%1to%2, 8, 14, 16, 0 - (4 * mmsize), \
        yo, yos, yi, yis, w, h, c, yoff, ui, vi, uo, vo
%if %3 == 1
    inc            wd
    sar            wd, 1
%if %4 == 1
    inc            hd
    sar            hd, 1
%endif ; %4 == 1
%endif ; %3 == 1
    mov [rsp+3*mmsize+0], wd
    mov [rsp+3*mmsize+4], hd

    mova          m10, [cq]
    pxor          m11, m11
    mova          m12, [pd_ %+ %%uvoutoff]
    pslld         m12, %%sh
    paddd         m12, [pd_ %+ %%rnd]
    mova          m13, [pw_ %+ %%uvinoff]
    mova          m14, [yoffq+ 0]          ; y_off_in
    mova          m15, [yoffq+16]          ; y_off_out
%if %%yoffsh != 0
    psllw         m15, %%yoffsh
%endif
    paddw         m15, [pw_ %+ %%yprnd]
    punpcklwd     m10, m15
    mova          m15, [pw_ %+ %%ypmul]
    movh           m0, [cq+1*16]           ; cyu
    movh           m1, [cq+2*16]           ; cyv
    movh           m2, [cq+4*16]           ; cuu
    movh           m3, [cq+5*16]           ; cuv
    movh           m4, [cq+7*16]           ; cvu
    movh           m5, [cq+8*16]           ; cvv
    punpcklwd      m0, m1
    punpcklwd      m2, m3
    punpcklwd      m4, m5
    mova [rsp+0*mmsize], m0
    mova [rsp+1*mmsize], m2
    mova [rsp+2*mmsize], m4

    DEFINE_ARGS yo, yos, yi, yis, ui, vi, uo, vo, uis, vis, uos, vos, x, tmp
    mov           uiq, [yiq+gprsize*1]
    mov           viq, [yiq+gprsize*2]
    mov           yiq, [yiq+gprsize*0]
    mov           uoq, [yoq+gprsize*1]
    mov           voq, [yoq+gprsize*2]
    mov           yoq, [yoq+gprsize*0]
    mov          uisq, [yisq+gprsize*1]
    mov          visq, [yisq+gprsize*2]
    mov          yisq, [yisq+gprsize*0]
    mov          uosq, [yosq+gprsize*1]
    mov          vosq, [yosq+gprsize*2]
    mov          yosq, [yosq+gprsize*0]

.loop_v:
    xor            xq, xq

.loop_h:
%if %4 == 1
    lea          tmpq, [yiq+yisq]
%endif ; %4 == 1
%if %1 == 8
    movu           m0, [yiq+xq*(1<<%3)]    ; y00/01
%if %4 == 1
    movu           m2, [tmpq+xq*2]         ; y10/11
%endif ; %4 == 1
%if %3 == 1
    movh           m4, [uiq+xq]            ; u
    movh           m5, [viq+xq]            ; v
%else ; %3 != 1
    movu           m4, [uiq+xq]            ; u
    movu           m5, [viq+xq]            ; v
%endif ; %3 ==/!= 1
    punpckhbw      m1, m0, m11
    punpcklbw      m0, m11
%if %4 == 1
    punpckhbw      m3, m2, m11
    punpcklbw      m2, m11
%endif ; %4 == 1
%if %3 == 0
    punpckhbw      m2, m4, m11
    punpckhbw      m3, m5, m11
%endif ; %3 == 0
    punpcklbw      m4, m11
    punpcklbw      m5, m11
%else ; %1 != 8
    movu           m0, [yiq+xq*(2<<%3)]        ; y00/01
    movu           m1, [yiq+xq*(2<<%3)+mmsize] ; y00/01
%if %4 == 1
    movu           m2, [tmpq+xq*4]             ; y10/11
    movu           m3, [tmpq+xq*4+mmsize]      ; y10/11
%endif ; %4 == 1
    movu           m4, [uiq+xq*2]              ; u
    movu           m5, [viq+xq*2]              ; v
%if %3 == 0
    movu           m2, [uiq+xq*2+mmsize]
    movu           m3, [viq+xq*2+mmsize]
%endif ; %3 == 0
%endif ; %1 ==/!= 8
    psubw          m0, m14
    psubw          m1, m14
%if %4 == 1
    psubw          m2, m14
    psubw          m3, m14
%endif ; %4 == 1
    psubw          m4, m13
    psubw          m5, m13
%if %3 == 0
    psubw          m2, m13
    psubw          m3, m13
%endif ; %3 == 0

    SBUTTERFLY     wd, 4, 5, 6
    pmaddwd        m6, m4, [rsp+1*mmsize]
    pmaddwd        m7, m5, [rsp+1*mmsize]
%if %3 == 0
    SBUTTERFLY     wd, 2, 3, 8
    pmaddwd        m8, m2, [rsp+1*mmsize]
    pmaddwd        m9, m3, [rsp+1*mmsize]
%else ; %3 != 0
    pmaddwd        m8, m4, [rsp+2*mmsize]
    pmaddwd        m9, m5, [rsp+2*mmsize]
%endif
    paddd          m6, m12
    paddd          m7, m12
    paddd          m8, m12
    paddd          m9, m12
    psrad          m6, %%sh
    psrad          m7, %%sh
    psrad          m8, %%sh
    psrad          m9, %%sh
    packssdw       m6, m7
    packssdw       m8, m9
%if %2 == 8
    packuswb       m6, m8
%if %3 == 0
    movu    [uoq+xq], m6
%else ; %3 != 0
    movh    [uoq+xq], m6
    movhps  [voq+xq], m6
%endif ; %3 ==/!= 0
%else ; %2 != 8
    CLIPW          m6, m11, [pw_ %+ %%maxval]
    CLIPW          m8, m11, [pw_ %+ %%maxval]
    movu  [uoq+xq*2], m6
%if %3 == 0
    movu [uoq+xq*2+mmsize], m8
%else ; %3 != 0
    movu  [voq+xq*2], m8
%endif ; %3 ==/!= 0
%endif ; %2 ==/!= 8

%if %3 == 0
    pmaddwd        m6, m4, [rsp+2*mmsize]
    pmaddwd        m7, m5, [rsp+2*mmsize]
    pmaddwd        m8, m2, [rsp+2*mmsize]
    pmaddwd        m9, m3, [rsp+2*mmsize]
    paddd          m6, m12
    paddd          m7, m12
    paddd          m8, m12
    paddd          m9, m12
    psrad          m6, %%sh
    psrad          m7, %%sh
    psrad          m8, %%sh
    psrad          m9, %%sh
    packssdw       m6, m7
    packssdw       m8, m9
%if %2 == 8
    packuswb       m6, m8
    movu    [voq+xq], m6
%else ; %2 != 8
    CLIPW          m6, m11, [pw_ %+ %%maxval]
    CLIPW          m8, m11, [pw_ %+ %%maxval]
    movu  [voq+xq*2], m6
    movu [voq+xq*2+mmsize], m8
%endif ; %2 ==/!= 8
%endif ; %3 == 0

    pmaddwd        m4, [rsp+0*mmsize]
    pmaddwd        m5, [rsp+0*mmsize]      ; uv_val
%if %3 == 0
    pmaddwd        m2, [rsp+0*mmsize]
    pmaddwd        m3, [rsp+0*mmsize]
%endif ; %3 == 0

    ; unpack y pixels with m15 (shifted round + offset), then multiply
    ; by m10, add uv pixels, and we're done!
%if %3 == 1
    punpckhdq      m8, m4, m4
    punpckldq      m4, m4
    punpckhdq      m9, m5, m5
    punpckldq      m5, m5
%else ; %3 != 1
    SWAP            8, 5, 2
    SWAP            3, 9
%endif ; %3 ==/!= 1
%if %4 == 1
    punpckhwd      m6, m2, m15
    punpcklwd      m2, m15
    punpckhwd      m7, m3, m15
    punpcklwd      m3, m15
    pmaddwd        m2, m10
    pmaddwd        m6, m10
    pmaddwd        m3, m10
    pmaddwd        m7, m10
    paddd          m2, m4
    paddd          m6, m8
    paddd          m3, m5
    paddd          m7, m9
    psrad          m2, %%sh
    psrad          m6, %%sh
    psrad          m3, %%sh
    psrad          m7, %%sh
    packssdw       m2, m6
    packssdw       m3, m7
    lea          tmpq, [yoq+yosq]
%if %2 == 8
    packuswb       m2, m3
    movu [tmpq+xq*2], m2
%else ; %2 != 8
    CLIPW          m2, m11, [pw_ %+ %%maxval]
    CLIPW          m3, m11, [pw_ %+ %%maxval]
    movu [tmpq+xq*4], m2
    movu [tmpq+xq*4+mmsize], m3
%endif ; %2 ==/!= 8
%endif ; %4 == 1
    punpckhwd      m6, m0, m15
    punpcklwd      m0, m15
    punpckhwd      m7, m1, m15
    punpcklwd      m1, m15
    pmaddwd        m0, m10
    pmaddwd        m6, m10
    pmaddwd        m1, m10
    pmaddwd        m7, m10
    paddd          m0, m4
    paddd          m6, m8
    paddd          m1, m5
    paddd          m7, m9
    psrad          m0, %%sh
    psrad          m6, %%sh
    psrad          m1, %%sh
    psrad          m7, %%sh
    packssdw       m0, m6
    packssdw       m1, m7
%if %2 == 8
    packuswb       m0, m1
    movu [yoq+xq*(1<<%3)], m0
%else ; %2 != 8
    CLIPW          m0, m11, [pw_ %+ %%maxval]
    CLIPW          m1, m11, [pw_ %+ %%maxval]
    movu [yoq+xq*(2<<%3)], m0
    movu [yoq+xq*(2<<%3)+mmsize], m1
%endif ; %2 ==/!= 8
    add            xq, mmsize >> %3
    cmp            xd, dword [rsp+3*mmsize+0]
    jl .loop_h
%if %4 == 1
    lea           yiq, [yiq+yisq*2]
    lea           yoq, [yoq+yosq*2]
%else ; %4 != 1
    add           yiq, yisq
    add           yoq, yosq
%endif ; %4 ==/!= 1
    add           uiq, uisq
    add           viq, visq
    add           uoq, uosq
    add           voq, vosq
    dec dword [rsp+3*mmsize+4]
    jg .loop_v
    RET
%endmacro
%macro YUV2YUV_FNS 2 ; ss_w, ss_h
YUV2YUV_FN  8,  8, %1, %2
YUV2YUV_FN 10,  8, %1, %2
YUV2YUV_FN 12,  8, %1, %2
YUV2YUV_FN  8, 10, %1, %2
YUV2YUV_FN 10, 10, %1, %2
YUV2YUV_FN 12, 10, %1, %2
YUV2YUV_FN  8, 12, %1, %2
YUV2YUV_FN 10, 12, %1, %2
YUV2YUV_FN 12, 12, %1, %2
%endmacro

INIT_XMM sse2
YUV2YUV_FNS 0, 0
YUV2YUV_FNS 1, 0
YUV2YUV_FNS 1, 1
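; Note: each YUV2YUV_FNS invocation above expands YUV2YUV_FN nine times, so the
; three instantiations generate 27 functions named ff_yuv2yuv_<ss>p<in>to<out>_sse2
; (ss = 444/422/420, in/out depths = 8/10/12), e.g. ff_yuv2yuv_420p10to8_sse2.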
; void ff_yuv2rgb_420p8_sse2(int16_t *rgb[3], ptrdiff_t rgb_stride,
; uint8_t *yuv[3], ptrdiff_t yuv_stride[3],
; int w, int h, const int16_t yuv2rgb_coeffs[3][3][8],
; const int16_t yuv_offset[8])
%macro YUV2RGB_FN 3 ; depth, log2_chroma_w (horiz), log2_chroma_h (vert)

%assign %%sh (%1 - 1)
%assign %%rnd (1 << (%%sh - 1))
%assign %%uvoff (1 << (%1 - 1))
%if %2 == 0
%assign %%ss 444
%elif %3 == 0
%assign %%ss 422
%else ; %3 == 1
%assign %%ss 420
%endif ; %2/%3

cglobal yuv2rgb_ %+ %%ss %+ p%1, 8, 14, 16, 0 - 8 * mmsize, \
        rgb, rgbs, yuv, yuvs, ww, h, c, yoff
%if %2 == 1
    inc           wwd
    sar           wwd, 1
%endif ; %2 == 1
%if %3 == 1
    inc            hd
    sar            hd, 1
%endif ; %3 == 1
    pxor          m11, m11
    mova          m15, [yoffq]             ; yoff
    movh          m14, [cq+  0]            ; cy
    movh          m10, [cq+ 32]            ; crv
    movh          m13, [cq+112]            ; cbu
    movh          m12, [cq+ 64]            ; cgu
    movh           m9, [cq+ 80]            ; cgv

    punpcklwd     m14, [pw_ %+ %%rnd]      ; cy, rnd
    punpcklwd     m13, m11                 ; cbu, 0
    punpcklwd     m11, m10                 ; 0, crv
    punpcklwd     m12, m9                  ; cgu, cgv
    mova [rsp+0*mmsize], m11
    mova [rsp+1*mmsize], m12
    mova [rsp+2*mmsize], m13
    mova [rsp+3*mmsize], m14
    pxor          m14, m14

    DEFINE_ARGS r, rgbs, y, ys, ww, h, g, b, u, v, us, vs, x, tmp
    mov            gq, [rq+1*gprsize]
    mov            bq, [rq+2*gprsize]
    mov            rq, [rq+0*gprsize]
    mov            uq, [yq+1*gprsize]
    mov            vq, [yq+2*gprsize]
    mov            yq, [yq+0*gprsize]
    mov           usq, [ysq+1*gprsize]
    mov           vsq, [ysq+2*gprsize]
    mov           ysq, [ysq+0*gprsize]

.loop_v:
    xor            xq, xq

.loop_h:
%if %3 == 1
    lea          tmpq, [yq+ysq]
%endif ; %3 == 1
%if %1 == 8
    movu           m0, [yq+xq*(1<<%2)]
%if %3 == 1
    movu           m2, [tmpq+xq*2]
%endif ; %3 == 1
%if %2 == 1
    movh           m4, [uq+xq]
    movh           m5, [vq+xq]
%else ; %2 != 1
    movu           m4, [uq+xq]
    movu           m5, [vq+xq]
%endif ; %2 ==/!= 1
    punpckhbw      m1, m0, m14
    punpcklbw      m0, m14
%if %3 == 1
    punpckhbw      m3, m2, m14
    punpcklbw      m2, m14
%endif ; %3 == 1
%if %2 == 0
    punpckhbw      m2, m4, m14
    punpckhbw      m3, m5, m14
%endif ; %2 == 0
    punpcklbw      m4, m14
    punpcklbw      m5, m14
%else ; %1 != 8
    movu           m0, [yq+xq*(2<<%2)]
    movu           m1, [yq+xq*(2<<%2)+mmsize]
%if %3 == 1
    movu           m2, [tmpq+xq*4]
    movu           m3, [tmpq+xq*4+mmsize]
%endif ; %3 == 1
    movu           m4, [uq+xq*2]
    movu           m5, [vq+xq*2]
%if %2 == 0
    movu           m2, [uq+xq*2+mmsize]
    movu           m3, [vq+xq*2+mmsize]
%endif ; %2 == 0
%endif ; %1 ==/!= 8
    psubw          m0, m15
    psubw          m1, m15
%if %3 == 1
    psubw          m2, m15
    psubw          m3, m15
%endif ; %3 == 1
    psubw          m4, [pw_ %+ %%uvoff]
    psubw          m5, [pw_ %+ %%uvoff]
    SBUTTERFLY     wd, 4, 5, 6
%if %2 == 0
    psubw          m2, [pw_ %+ %%uvoff]
    psubw          m3, [pw_ %+ %%uvoff]
    SBUTTERFLY     wd, 2, 3, 6
%endif ; %2 == 0

    ; calculate y+rnd full-resolution [0-3,6-9]
    punpckhwd      m6, m0, [pw_1]          ; y, 1
    punpcklwd      m0, [pw_1]              ; y, 1
    punpckhwd      m7, m1, [pw_1]          ; y, 1
    punpcklwd      m1, [pw_1]              ; y, 1
    pmaddwd        m0, [rsp+3*mmsize]
    pmaddwd        m6, [rsp+3*mmsize]
    pmaddwd        m1, [rsp+3*mmsize]
    pmaddwd        m7, [rsp+3*mmsize]
%if %3 == 1
    punpckhwd      m8, m2, [pw_1]          ; y, 1
    punpcklwd      m2, [pw_1]              ; y, 1
    punpckhwd      m9, m3, [pw_1]          ; y, 1
    punpcklwd      m3, [pw_1]              ; y, 1
    pmaddwd        m2, [rsp+3*mmsize]
    pmaddwd        m8, [rsp+3*mmsize]
    pmaddwd        m3, [rsp+3*mmsize]
    pmaddwd        m9, [rsp+3*mmsize]
    mova [rsp+4*mmsize], m2
    mova [rsp+5*mmsize], m8
    mova [rsp+6*mmsize], m3
    mova [rsp+7*mmsize], m9
%endif ; %3 == 1

    ; calculate r offsets (un-subsampled, then duplicate)
    pmaddwd       m10, m4, [rsp+0*mmsize]
%if %2 == 1
    pmaddwd       m12, m5, [rsp+0*mmsize]
    punpckhdq     m11, m10, m10
    punpckldq     m10, m10
    punpckhdq     m13, m12, m12
    punpckldq     m12, m12
%else ; %2 != 1
    pmaddwd       m11, m5, [rsp+0*mmsize]
    pmaddwd       m12, m2, [rsp+0*mmsize]
    pmaddwd       m13, m3, [rsp+0*mmsize]
%endif ; %2 ==/!= 1
%if %3 == 1
    paddd          m2, m10, [rsp+4*mmsize]
    paddd          m3, m11, [rsp+5*mmsize]
    paddd          m8, m12, [rsp+6*mmsize]
    paddd          m9, m13, [rsp+7*mmsize]
%endif
    paddd         m10, m0
    paddd         m11, m6
    paddd         m12, m1
    paddd         m13, m7
%if %3 == 1
    psrad          m2, %%sh
    psrad          m3, %%sh
    psrad          m8, %%sh
    psrad          m9, %%sh
%endif ; %3 == 1
    psrad         m10, %%sh
    psrad         m11, %%sh
    psrad         m12, %%sh
    psrad         m13, %%sh
%if %3 == 1
    lea          tmpq, [rq+rgbsq*2]
    packssdw       m2, m3
    packssdw       m8, m9
    mova [tmpq+xq*4], m2
    mova [tmpq+xq*4+mmsize], m8
%endif ; %3 == 1
    packssdw      m10, m11
    packssdw      m12, m13
    mova [rq+xq*(2<<%2)], m10
    mova [rq+xq*(2<<%2)+mmsize], m12

    ; calculate g offsets (un-subsampled, then duplicate)
    pmaddwd       m10, m4, [rsp+1*mmsize]
%if %2 == 1
    pmaddwd       m12, m5, [rsp+1*mmsize]
    punpckhdq     m11, m10, m10
    punpckldq     m10, m10
    punpckhdq     m13, m12, m12
    punpckldq     m12, m12
%else ; %2 != 1
    pmaddwd       m11, m5, [rsp+1*mmsize]
    pmaddwd       m12, m2, [rsp+1*mmsize]
    pmaddwd       m13, m3, [rsp+1*mmsize]
%endif ; %2 ==/!= 1
%if %3 == 1
    paddd          m2, m10, [rsp+4*mmsize]
    paddd          m3, m11, [rsp+5*mmsize]
    paddd          m8, m12, [rsp+6*mmsize]
    paddd          m9, m13, [rsp+7*mmsize]
%endif ; %3 == 1
    paddd         m10, m0
    paddd         m11, m6
    paddd         m12, m1
    paddd         m13, m7
%if %3 == 1
    psrad          m2, %%sh
    psrad          m3, %%sh
    psrad          m8, %%sh
    psrad          m9, %%sh
%endif ; %3 == 1
    psrad         m10, %%sh
    psrad         m11, %%sh
    psrad         m12, %%sh
    psrad         m13, %%sh
%if %3 == 1
    lea          tmpq, [gq+rgbsq*2]
    packssdw       m2, m3
    packssdw       m8, m9
    mova [tmpq+xq*4], m2
    mova [tmpq+xq*4+mmsize], m8
%endif ; %3 == 1
    packssdw      m10, m11
    packssdw      m12, m13
    mova [gq+xq*(2<<%2)], m10
    mova [gq+xq*(2<<%2)+mmsize], m12

    ; calculate b offsets (un-subsampled, then duplicate)
    pmaddwd        m4, [rsp+2*mmsize]
    pmaddwd        m5, [rsp+2*mmsize]
%if %2 == 1
    punpckhdq      m2, m4, m4
    punpckldq      m4, m4
    punpckhdq      m3, m5, m5
    punpckldq      m5, m5
%else ; %2 != 1
    pmaddwd        m2, [rsp+2*mmsize]
    pmaddwd        m3, [rsp+2*mmsize]
    SWAP            2, 5
%endif ; %2 ==/!= 1
    paddd          m0, m4
    paddd          m6, m2
    paddd          m1, m5
    paddd          m7, m3
%if %3 == 1
    paddd          m4, [rsp+4*mmsize]
    paddd          m2, [rsp+5*mmsize]
    paddd          m5, [rsp+6*mmsize]
    paddd          m3, [rsp+7*mmsize]
%endif ; %3 == 1
    psrad          m0, %%sh
    psrad          m6, %%sh
    psrad          m1, %%sh
    psrad          m7, %%sh
%if %3 == 1
    psrad          m4, %%sh
    psrad          m2, %%sh
    psrad          m5, %%sh
    psrad          m3, %%sh
%endif ; %3 == 1
    packssdw       m0, m6
    packssdw       m1, m7
    movu [bq+xq*(2<<%2)], m0
    movu [bq+xq*(2<<%2)+mmsize], m1
%if %3 == 1
    lea          tmpq, [bq+rgbsq*2]
    packssdw       m4, m2
    packssdw       m5, m3
    movu [tmpq+xq*4], m4
    movu [tmpq+xq*4+mmsize], m5
%endif ; %3 == 1
    add            xd, mmsize >> %2
    cmp            xd, wwd
    jl .loop_h
    lea            rq, [rq+rgbsq*(2<<%3)]
    lea            gq, [gq+rgbsq*(2<<%3)]
    lea            bq, [bq+rgbsq*(2<<%3)]
%if %3 == 1
    lea            yq, [yq+ysq*2]
%else ; %3 != 0
    add            yq, ysq
%endif ; %3 ==/!= 1
    add            uq, usq
    add            vq, vsq
    dec            hd
    jg .loop_v
    RET
%endmacro
%macro YUV2RGB_FNS 2
YUV2RGB_FN  8, %1, %2
YUV2RGB_FN 10, %1, %2
YUV2RGB_FN 12, %1, %2
%endmacro

INIT_XMM sse2
YUV2RGB_FNS 0, 0
YUV2RGB_FNS 1, 0
YUV2RGB_FNS 1, 1
%macro RGB2YUV_FN 3 ; depth, log2_chroma_w (horiz), log2_chroma_h (vert)

%assign %%sh 29 - %1
%assign %%rnd (1 << (%%sh - 15))
%assign %%uvrnd ((128 << (%1 - 8)) << (%%sh - 14))
%if %1 != 8
%assign %%maxval ((1 << %1) - 1)
%endif ; %1 != 8
%if %2 == 0
%assign %%ss 444
%elif %3 == 0
%assign %%ss 422
%else ; %3 == 1
%assign %%ss 420
%endif ; %2/%3

cglobal rgb2yuv_ %+ %%ss %+ p%1, 8, 14, 16, 0 - 6 * mmsize, \
        yuv, yuvs, rgb, rgbs, ww, h, c, off
%if %2 == 1
    inc           wwd
    sar           wwd, 1
%endif ; %2 == 1
%if %3 == 1
    inc            hd
    sar            hd, 1
%endif ; %3 == 1

    ; prepare coeffs
    movh           m8, [offq]
    movh           m9, [pw_ %+ %%uvrnd]
    psllw          m8, %%sh - 14
    paddw          m9, [pw_ %+ %%rnd]
    paddw          m8, [pw_ %+ %%rnd]
    movh           m0, [cq+  0]
    movh           m1, [cq+ 16]
    movh           m2, [cq+ 32]
    movh           m3, [cq+ 48]
    movh           m4, [cq+ 64]
    movh           m5, [cq+ 80]
    movh           m6, [cq+112]
    movh           m7, [cq+128]
    punpcklwd      m0, m1
    punpcklwd      m2, m8
    punpcklwd      m3, m4
    punpcklwd      m4, m5, m9
    punpcklwd      m5, m6
    punpcklwd      m7, m9
    mova [rsp+0*mmsize], m0                ; cry, cgy
    mova [rsp+1*mmsize], m2                ; cby, off + rnd
    mova [rsp+2*mmsize], m3                ; cru, cgu
    mova [rsp+3*mmsize], m4                ; cburv, uvoff + rnd
    mova [rsp+4*mmsize], m5                ; cburv, cgv
    mova [rsp+5*mmsize], m7                ; cbv, uvoff + rnd

    DEFINE_ARGS y, ys, r, rgbs, ww, h, u, v, us, vs, g, b, tmp, x
    mov            gq, [rq+gprsize*1]
    mov            bq, [rq+gprsize*2]
    mov            rq, [rq+gprsize*0]
    mov            uq, [yq+gprsize*1]
    mov            vq, [yq+gprsize*2]
    mov            yq, [yq+gprsize*0]
    mov           usq, [ysq+gprsize*1]
    mov           vsq, [ysq+gprsize*2]
    mov           ysq, [ysq+gprsize*0]
    pxor          m15, m15

.loop_v:
    xor            xd, xd

.loop_h:
    ; top line y
    mova           m0, [rq+xq*(2<<%2)]
    mova           m3, [rq+xq*(2<<%2)+mmsize]
    mova           m1, [gq+xq*(2<<%2)]
    mova           m4, [gq+xq*(2<<%2)+mmsize]
    mova           m2, [bq+xq*(2<<%2)]
    mova           m5, [bq+xq*(2<<%2)+mmsize]
    punpcklwd      m6, m0, m1
    punpckhwd      m7, m0, m1
    punpcklwd      m8, m3, m4
    punpckhwd      m9, m3, m4
    punpcklwd     m10, m2, [pw_16384]
    punpckhwd     m11, m2, [pw_16384]
    punpcklwd     m12, m5, [pw_16384]
    punpckhwd     m13, m5, [pw_16384]
    pmaddwd        m6, [rsp+0*mmsize]
    pmaddwd        m7, [rsp+0*mmsize]
    pmaddwd        m8, [rsp+0*mmsize]
    pmaddwd        m9, [rsp+0*mmsize]
    pmaddwd       m10, [rsp+1*mmsize]
    pmaddwd       m11, [rsp+1*mmsize]
    pmaddwd       m12, [rsp+1*mmsize]
    pmaddwd       m13, [rsp+1*mmsize]
    paddd          m6, m10
    paddd          m7, m11
    paddd          m8, m12
    paddd          m9, m13
    psrad          m6, %%sh
    psrad          m7, %%sh
    psrad          m8, %%sh
    psrad          m9, %%sh
    packssdw       m6, m7
    packssdw       m8, m9
%if %1 == 8
    packuswb       m6, m8
    movu [yq+xq*(1<<%2)], m6
%else
    CLIPW          m6, m15, [pw_ %+ %%maxval]
    CLIPW          m8, m15, [pw_ %+ %%maxval]
    movu [yq+xq*(2<<%2)], m6
    movu [yq+xq*(2<<%2)+mmsize], m8
%endif
%if %2 == 1
    ; subsampling cached data
    pmaddwd        m0, [pw_1]
    pmaddwd        m1, [pw_1]
    pmaddwd        m2, [pw_1]
    pmaddwd        m3, [pw_1]
    pmaddwd        m4, [pw_1]
    pmaddwd        m5, [pw_1]
%if %3 == 1
    ; bottom line y, r/g portion only
    lea          tmpq, [rgbsq+xq*2]
    mova           m6, [rq+tmpq*2]
    mova           m9, [rq+tmpq*2+mmsize]
    mova           m7, [gq+tmpq*2]
    mova          m10, [gq+tmpq*2+mmsize]
    mova           m8, [bq+tmpq*2]
    mova          m11, [bq+tmpq*2+mmsize]
    punpcklwd     m12, m6, m7
    punpckhwd     m13, m6, m7
    punpcklwd     m14, m9, m10
    punpckhwd     m15, m9, m10

    ; release two more registers
    pmaddwd        m6, [pw_1]
    pmaddwd        m7, [pw_1]
    pmaddwd        m9, [pw_1]
    pmaddwd       m10, [pw_1]
    paddd          m0, m6
    paddd          m3, m9
    paddd          m1, m7
    paddd          m4, m10

    ; bottom line y, b/rnd portion only
    punpcklwd      m6, m8, [pw_16384]
    punpckhwd      m7, m8, [pw_16384]
    punpcklwd      m9, m11, [pw_16384]
    punpckhwd     m10, m11, [pw_16384]
    pmaddwd       m12, [rsp+0*mmsize]
    pmaddwd       m13, [rsp+0*mmsize]
    pmaddwd       m14, [rsp+0*mmsize]
    pmaddwd       m15, [rsp+0*mmsize]
    pmaddwd        m6, [rsp+1*mmsize]
    pmaddwd        m7, [rsp+1*mmsize]
    pmaddwd        m9, [rsp+1*mmsize]
    pmaddwd       m10, [rsp+1*mmsize]
    paddd         m12, m6
    paddd         m13, m7
    paddd         m14, m9
    paddd         m15, m10
    psrad         m12, %%sh
    psrad         m13, %%sh
    psrad         m14, %%sh
    psrad         m15, %%sh
    packssdw      m12, m13
    packssdw      m14, m15
    lea          tmpq, [yq+ysq]
%if %1 == 8
    packuswb      m12, m14
    movu [tmpq+xq*2], m12
%else
    pxor          m15, m15
    CLIPW         m12, m15, [pw_ %+ %%maxval]
    CLIPW         m14, m15, [pw_ %+ %%maxval]
    movu [tmpq+xq*4], m12
    movu [tmpq+xq*4+mmsize], m14
%endif

    ; complete subsampling of r/g/b pixels for u/v
    pmaddwd        m8, [pw_1]
    pmaddwd       m11, [pw_1]
    paddd          m2, m8
    paddd          m5, m11
    paddd          m0, [pd_2]
    paddd          m1, [pd_2]
    paddd          m2, [pd_2]
    paddd          m3, [pd_2]
    paddd          m4, [pd_2]
    paddd          m5, [pd_2]
    psrad          m0, 2
    psrad          m1, 2
    psrad          m2, 2
    psrad          m3, 2
    psrad          m4, 2
    psrad          m5, 2
%else ; %3 != 1
    paddd          m0, [pd_1]
    paddd          m1, [pd_1]
    paddd          m2, [pd_1]
    paddd          m3, [pd_1]
    paddd          m4, [pd_1]
    paddd          m5, [pd_1]
    psrad          m0, 1
    psrad          m1, 1
    psrad          m2, 1
    psrad          m3, 1
    psrad          m4, 1
    psrad          m5, 1
%endif ; %3 ==/!= 1
    packssdw       m0, m3
    packssdw       m1, m4
    packssdw       m2, m5
%endif ; %2 == 1

    ; convert u/v pixels
    SBUTTERFLY     wd, 0, 1, 6
    punpckhwd      m6, m2, [pw_16384]
    punpcklwd      m2, [pw_16384]
    pmaddwd        m7, m0, [rsp+2*mmsize]
    pmaddwd        m8, m1, [rsp+2*mmsize]
    pmaddwd        m9, m2, [rsp+3*mmsize]
    pmaddwd       m10, m6, [rsp+3*mmsize]
    pmaddwd        m0, [rsp+4*mmsize]
    pmaddwd        m1, [rsp+4*mmsize]
    pmaddwd        m2, [rsp+5*mmsize]
    pmaddwd        m6, [rsp+5*mmsize]
    paddd          m7, m9
    paddd          m8, m10
    paddd          m0, m2
    paddd          m1, m6
    psrad          m7, %%sh
    psrad          m8, %%sh
    psrad          m0, %%sh
    psrad          m1, %%sh
    packssdw       m7, m8
    packssdw       m0, m1
%if %2 == 1
%if %1 == 8
    packuswb       m7, m0
    movh    [uq+xq], m7
    movhps  [vq+xq], m7
%else
    CLIPW          m7, m15, [pw_ %+ %%maxval]
    CLIPW          m0, m15, [pw_ %+ %%maxval]
    movu  [uq+xq*2], m7
    movu  [vq+xq*2], m0
%endif
%else ; %2 != 1
    ; second set of u/v pixels
    SBUTTERFLY     wd, 3, 4, 6
    punpckhwd      m6, m5, [pw_16384]
    punpcklwd      m5, [pw_16384]
    pmaddwd        m8, m3, [rsp+2*mmsize]
    pmaddwd        m9, m4, [rsp+2*mmsize]
    pmaddwd       m10, m5, [rsp+3*mmsize]
    pmaddwd       m11, m6, [rsp+3*mmsize]
    pmaddwd        m3, [rsp+4*mmsize]
    pmaddwd        m4, [rsp+4*mmsize]
    pmaddwd        m5, [rsp+5*mmsize]
    pmaddwd        m6, [rsp+5*mmsize]
    paddd          m8, m10
    paddd          m9, m11
    paddd          m3, m5
    paddd          m4, m6
    psrad          m8, %%sh
    psrad          m9, %%sh
    psrad          m3, %%sh
    psrad          m4, %%sh
    packssdw       m8, m9
    packssdw       m3, m4
%if %1 == 8
    packuswb       m7, m8
    packuswb       m0, m3
    movu    [uq+xq], m7
    movu    [vq+xq], m0
%else
    CLIPW          m7, m15, [pw_ %+ %%maxval]
    CLIPW          m0, m15, [pw_ %+ %%maxval]
    CLIPW          m8, m15, [pw_ %+ %%maxval]
    CLIPW          m3, m15, [pw_ %+ %%maxval]
    movu  [uq+xq*2], m7
    movu [uq+xq*2+mmsize], m8
    movu  [vq+xq*2], m0
    movu [vq+xq*2+mmsize], m3
%endif
%endif ; %2 ==/!= 1
    add            xq, mmsize >> %2
    cmp            xd, wwd
    jl .loop_h
%if %3 == 0
    add            yq, ysq
%else ; %3 != 0
    lea            yq, [yq+ysq*2]
%endif ; %3 ==/!= 0
    add            uq, usq
    add            vq, vsq
    lea            rq, [rq+rgbsq*(2<<%3)]
    lea            gq, [gq+rgbsq*(2<<%3)]
    lea            bq, [bq+rgbsq*(2<<%3)]
    dec            hd
    jg .loop_v
    RET
%endmacro
%macro RGB2YUV_FNS 2
RGB2YUV_FN  8, %1, %2
RGB2YUV_FN 10, %1, %2
RGB2YUV_FN 12, %1, %2
%endmacro

INIT_XMM sse2
RGB2YUV_FNS 0, 0
RGB2YUV_FNS 1, 0
RGB2YUV_FNS 1, 1
; void ff_multiply3x3_sse2(int16_t *data[3], ptrdiff_t stride,
; int w, int h, const int16_t coeff[3][3][8])
INIT_XMM sse2
cglobal multiply3x3, 5, 7, 16, data, stride, ww, h, c
    movh           m0, [cq+  0]
    movh           m1, [cq+ 32]
    movh           m2, [cq+ 48]
    movh           m3, [cq+ 80]
    movh           m4, [cq+ 96]
    movh           m5, [cq+128]
    punpcklwd      m0, [cq+ 16]
    punpcklwd      m1, [pw_8192]
    punpcklwd      m2, [cq+ 64]
    punpcklwd      m3, [pw_8192]
    punpcklwd      m4, [cq+112]
    punpcklwd      m5, [pw_8192]

    DEFINE_ARGS data0, stride, ww, h, data1, data2, x
    shl       strideq, 1
    mov        data1q, [data0q+gprsize*1]
    mov        data2q, [data0q+gprsize*2]
    mov        data0q, [data0q+gprsize*0]

.loop_v:
    xor            xd, xd

.loop_h:
    mova           m6, [data0q+xq*2]
    mova           m7, [data1q+xq*2]
    mova           m8, [data2q+xq*2]
    SBUTTERFLY     wd, 6, 7, 9
    punpckhwd      m9, m8, [pw_1]
    punpcklwd      m8, [pw_1]
    pmaddwd       m10, m6, m0
    pmaddwd       m11, m7, m0
    pmaddwd       m12, m8, m1
    pmaddwd       m13, m9, m1
    paddd         m10, m12
    paddd         m11, m13
    psrad         m10, 14
    psrad         m11, 14
    pmaddwd       m12, m6, m2
    pmaddwd       m13, m7, m2
    pmaddwd       m14, m8, m3
    pmaddwd       m15, m9, m3
    paddd         m12, m14
    paddd         m13, m15
    psrad         m12, 14
    psrad         m13, 14
    pmaddwd        m6, m4
    pmaddwd        m7, m4
    pmaddwd        m8, m5
    pmaddwd        m9, m5
    paddd          m6, m8
    paddd          m7, m9
    psrad          m6, 14
    psrad          m7, 14
    packssdw      m10, m11
    packssdw      m12, m13
    packssdw       m6, m7
    mova [data0q+xq*2], m10
    mova [data1q+xq*2], m12
    mova [data2q+xq*2], m6
    add            xd, mmsize / 2
    cmp            xd, wwd
    jl .loop_h
    add        data0q, strideq
    add        data1q, strideq
    add        data2q, strideq
    dec            hd
    jg .loop_v
    RET
%endif
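For readability, here is a hedged scalar model of the per-pixel arithmetic the YUV2YUV_FN macro above vectorizes; the helper name and flattened arguments are illustrative, not part of the commit. It mirrors the macro's fixed-point scheme: Q14 coefficients and a final shift of sh = 14 + in_depth - out_depth with round-to-nearest, followed by clipping to the output depth (done in the asm via CLIPW or the packuswb saturation):

#include <stdint.h>

/* out[i] = clip(((sum_j c[i][j] * (in[j] - inoff[j]) + rnd) >> sh) + outoff[i]) */
static void yuv2yuv_pixel(const int16_t c[3][3],
                          const int inoff[3], const int outoff[3],
                          int in_depth, int out_depth,
                          const int in[3], int out[3])
{
    const int sh     = 14 + in_depth - out_depth;  /* matches %%sh */
    const int rnd    = 1 << (sh - 1);              /* matches %%rnd */
    const int maxval = (1 << out_depth) - 1;       /* matches %%maxval */

    for (int i = 0; i < 3; i++) {
        int64_t sum = rnd;
        for (int j = 0; j < 3; j++)
            sum += (int64_t) c[i][j] * (in[j] - inoff[j]);
        int v = (int) (sum >> sh) + outoff[i];
        out[i] = v < 0 ? 0 : v > maxval ? maxval : v;  /* CLIPW/packuswb */
    }
}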
libavfilter/x86/colorspacedsp_init.c (new file, mode 100644)
/*
* Copyright (c) 2016 Ronald S. Bultje <rsbultje@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/x86/cpu.h"
#include "libavfilter/colorspacedsp.h"
#define decl_yuv2yuv_fn(t) \
void ff_yuv2yuv_##t##_sse2(uint8_t *yuv_out[3], ptrdiff_t yuv_out_stride[3], \
uint8_t *yuv_in[3], ptrdiff_t yuv_in_stride[3], \
int w, int h, const int16_t yuv2yuv_coeffs[3][3][8], \
const int16_t yuv_offset[2][8])
#define decl_yuv2yuv_fns(ss) \
decl_yuv2yuv_fn(ss##p8to8); \
decl_yuv2yuv_fn(ss##p10to8); \
decl_yuv2yuv_fn(ss##p12to8); \
decl_yuv2yuv_fn(ss##p8to10); \
decl_yuv2yuv_fn(ss##p10to10); \
decl_yuv2yuv_fn(ss##p12to10); \
decl_yuv2yuv_fn(ss##p8to12); \
decl_yuv2yuv_fn(ss##p10to12); \
decl_yuv2yuv_fn(ss##p12to12)
decl_yuv2yuv_fns(420);
decl_yuv2yuv_fns(422);
decl_yuv2yuv_fns(444);
#define decl_yuv2rgb_fn(t) \
void ff_yuv2rgb_##t##_sse2(int16_t *rgb_out[3], ptrdiff_t rgb_stride, \
uint8_t *yuv_in[3], ptrdiff_t yuv_stride[3], \
int w, int h, const int16_t coeff[3][3][8], \
const int16_t yuv_offset[8])
#define decl_yuv2rgb_fns(ss) \
decl_yuv2rgb_fn(ss##p8); \
decl_yuv2rgb_fn(ss##p10); \
decl_yuv2rgb_fn(ss##p12)
decl_yuv2rgb_fns(420);
decl_yuv2rgb_fns(422);
decl_yuv2rgb_fns(444);
#define decl_rgb2yuv_fn(t) \
void ff_rgb2yuv_##t##_sse2(uint8_t *yuv_out[3], ptrdiff_t yuv_stride[3], \
int16_t *rgb_in[3], ptrdiff_t rgb_stride, \
int w, int h, const int16_t coeff[3][3][8], \
const int16_t yuv_offset[8])
#define decl_rgb2yuv_fns(ss) \
decl_rgb2yuv_fn(ss##p8); \
decl_rgb2yuv_fn(ss##p10); \
decl_rgb2yuv_fn(ss##p12)
decl_rgb2yuv_fns(420);
decl_rgb2yuv_fns(422);
decl_rgb2yuv_fns(444);

void ff_multiply3x3_sse2(int16_t *data[3], ptrdiff_t stride, int w, int h,
                         const int16_t coeff[3][3][8]);

void ff_colorspacedsp_x86_init(ColorSpaceDSPContext *dsp)
{
    int cpu_flags = av_get_cpu_flags();

    if (ARCH_X86_64 && EXTERNAL_SSE2(cpu_flags)) {
#define assign_yuv2yuv_fns(idx, ss) \
dsp->yuv2yuv[0][0][idx] = ff_yuv2yuv_##ss##p8to8_sse2; \
dsp->yuv2yuv[0][1][idx] = ff_yuv2yuv_##ss##p8to10_sse2; \
dsp->yuv2yuv[0][2][idx] = ff_yuv2yuv_##ss##p8to12_sse2; \
dsp->yuv2yuv[1][0][idx] = ff_yuv2yuv_##ss##p10to8_sse2; \
dsp->yuv2yuv[1][1][idx] = ff_yuv2yuv_##ss##p10to10_sse2; \
dsp->yuv2yuv[1][2][idx] = ff_yuv2yuv_##ss##p10to12_sse2; \
dsp->yuv2yuv[2][0][idx] = ff_yuv2yuv_##ss##p12to8_sse2; \
dsp->yuv2yuv[2][1][idx] = ff_yuv2yuv_##ss##p12to10_sse2; \
dsp->yuv2yuv[2][2][idx] = ff_yuv2yuv_##ss##p12to12_sse2
        assign_yuv2yuv_fns(2, 420);
        assign_yuv2yuv_fns(1, 422);
        assign_yuv2yuv_fns(0, 444);
#define assign_yuv2rgb_fns(idx, ss) \
dsp->yuv2rgb[0][idx] = ff_yuv2rgb_##ss##p8_sse2; \
dsp->yuv2rgb[1][idx] = ff_yuv2rgb_##ss##p10_sse2; \
dsp->yuv2rgb[2][idx] = ff_yuv2rgb_##ss##p12_sse2
        assign_yuv2rgb_fns(2, 420);
        assign_yuv2rgb_fns(1, 422);
        assign_yuv2rgb_fns(0, 444);
#define assign_rgb2yuv_fns(idx, ss) \
dsp->rgb2yuv[0][idx] = ff_rgb2yuv_##ss##p8_sse2; \
dsp->rgb2yuv[1][idx] = ff_rgb2yuv_##ss##p10_sse2; \
dsp->rgb2yuv[2][idx] = ff_rgb2yuv_##ss##p12_sse2
        assign_rgb2yuv_fns(2, 420);
        assign_rgb2yuv_fns(1, 422);
        assign_rgb2yuv_fns(0, 444);

        dsp->multiply3x3 = ff_multiply3x3_sse2;
    }
}
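A hedged usage sketch (not from the commit): how vf_colorspace-style code would pick one of the SSE2 functions from the pointer table filled in above. On the depth axes, index 0/1/2 means 8/10/12 bits; on the last axis, index 0/1/2 means 4:4:4/4:2:2/4:2:0, mirroring the assign_* calls. The wrapper name is illustrative only:

#include "libavfilter/colorspacedsp.h"

static void convert_420_p10_to_p8(uint8_t *dst[3], ptrdiff_t dst_stride[3],
                                  uint8_t *src[3], ptrdiff_t src_stride[3],
                                  int w, int h,
                                  const int16_t coeff[3][3][8],
                                  const int16_t off[2][8])
{
    ColorSpaceDSPContext dsp;

    ff_colorspacedsp_init(&dsp);  /* installs SSE2 versions when available */
    /* [1][0][2] = 10-bit in, 8-bit out, 4:2:0 subsampling */
    dsp.yuv2yuv[1][0][2](dst, dst_stride, src, src_stride, w, h, coeff, off);
}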
tests/checkasm/Makefile

@@ -16,6 +16,7 @@ CHECKASMOBJS-$(CONFIG_AVCODEC) += $(AVCODECOBJS-yes)
 # libavfilter tests
 AVFILTEROBJS-$(CONFIG_BLEND_FILTER)      += vf_blend.o
+AVFILTEROBJS-$(CONFIG_COLORSPACE_FILTER) += vf_colorspace.o

 CHECKASMOBJS-$(CONFIG_AVFILTER) += $(AVFILTEROBJS-yes)
...
tests/checkasm/checkasm.c

@@ -106,6 +106,9 @@ static const struct {
 #if CONFIG_BLEND_FILTER
     { "vf_blend", checkasm_check_blend },
 #endif
+#if CONFIG_COLORSPACE_FILTER
+    { "vf_colorspace", checkasm_check_colorspace },
+#endif
 #endif
     { NULL }
 };
...
tests/checkasm/checkasm.h

@@ -33,6 +33,7 @@
 void checkasm_check_alacdsp(void);
 void checkasm_check_blend(void);
 void checkasm_check_bswapdsp(void);
+void checkasm_check_colorspace(void);
 void checkasm_check_flacdsp(void);
 void checkasm_check_fmtconvert(void);
 void checkasm_check_h264pred(void);
...
tests/checkasm/vf_colorspace.c (new file, mode 100644)
/*
* Copyright (c) 2016 Ronald S. Bultje <rsbultje@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <string.h>
#include "checkasm.h"
#include "libavfilter/colorspacedsp.h"
#include "libavutil/common.h"
#include "libavutil/internal.h"
#include "libavutil/intreadwrite.h"
#define W 64
#define H 64
#define randomize_buffers() \
do { \
unsigned mask = bpp_mask[idepth]; \
int n, m; \
int bpp = 1 + (!!idepth); \
int buf_size = W * H * bpp; \
for (m = 0; m < 3; m++) { \
int ss = m ? ss_w + ss_h : 0; \
int plane_sz = buf_size >> ss; \
for (n = 0; n < plane_sz; n += 4) { \
unsigned r = rnd() & mask; \
AV_WN32A(&src[m][n], r); \
} \
} \
} while (0)
static const char *format_string[] = { "444", "422", "420" };

static unsigned bpp_mask[] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff };
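/* Note (added for clarity): randomize_buffers() above writes 32-bit random
 * words, so bpp_mask keeps each 16-bit sample in range for the plane depth:
 * 8-bit planes use all bits, while 10- and 12-bit planes mask each sample
 * to 0x3ff and 0xfff respectively. */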
static void check_yuv2yuv(void)
{
    declare_func(void, uint8_t *dst[3], ptrdiff_t dst_stride[3],
                 uint8_t *src[3], ptrdiff_t src_stride[3],
                 int w, int h, const int16_t coeff[3][3][8],
                 const int16_t off[2][8]);
    ColorSpaceDSPContext dsp;
    int idepth, odepth, fmt, n;
    LOCAL_ALIGNED_32(uint8_t, src_y, [W * H * 2]);
    LOCAL_ALIGNED_32(uint8_t, src_u, [W * H * 2]);
    LOCAL_ALIGNED_32(uint8_t, src_v, [W * H * 2]);
    uint8_t *src[3] = { src_y, src_u, src_v };
    LOCAL_ALIGNED_32(uint8_t, dst0_y, [W * H * 2]);
    LOCAL_ALIGNED_32(uint8_t, dst0_u, [W * H * 2]);
    LOCAL_ALIGNED_32(uint8_t, dst0_v, [W * H * 2]);
    LOCAL_ALIGNED_32(uint8_t, dst1_y, [W * H * 2]);
    LOCAL_ALIGNED_32(uint8_t, dst1_u, [W * H * 2]);
    LOCAL_ALIGNED_32(uint8_t, dst1_v, [W * H * 2]);
    uint8_t *dst0[3] = { dst0_y, dst0_u, dst0_v },
            *dst1[3] = { dst1_y, dst1_u, dst1_v };
    LOCAL_ALIGNED_32(int16_t, offset_buf, [16]);
    LOCAL_ALIGNED_32(int16_t, coeff_buf, [3 * 3 * 8]);
    int16_t (*offset)[8] = (int16_t (*)[8]) offset_buf;
    int16_t (*coeff)[3][8] = (int16_t (*)[3][8]) coeff_buf;

    ff_colorspacedsp_init(&dsp);
    for (n = 0; n < 8; n++) {
        offset[0][n] = offset[1][n] = 16;

        coeff[0][0][n] = (1 << 14) + (1 << 7) + 1;
        coeff[0][1][n] = (1 << 7) - 1;
        coeff[0][2][n] = -(1 << 8);
        coeff[1][0][n] = coeff[2][0][n] = 0;
        coeff[1][1][n] = (1 << 14) + (1 << 7);
        coeff[1][2][n] = -(1 << 7);
        coeff[2][2][n] = (1 << 14) - (1 << 6);
        coeff[2][1][n] = 1 << 6;
    }
    for (idepth = 0; idepth < 3; idepth++) {
        for (odepth = 0; odepth < 3; odepth++) {
            for (fmt = 0; fmt < 3; fmt++) {
                if (check_func(dsp.yuv2yuv[idepth][odepth][fmt],
                               "ff_colorspacedsp_yuv2yuv_%sp%dto%d",
                               format_string[fmt],
                               idepth * 2 + 8, odepth * 2 + 8)) {
                    int ss_w = !!fmt, ss_h = fmt == 2;
                    int y_src_stride = W << !!idepth, y_dst_stride = W << !!odepth;
                    int uv_src_stride = y_src_stride >> ss_w, uv_dst_stride = y_dst_stride >> ss_w;

                    randomize_buffers();
                    call_ref(dst0, (ptrdiff_t[3]) { y_dst_stride, uv_dst_stride, uv_dst_stride },
                             src, (ptrdiff_t[3]) { y_src_stride, uv_src_stride, uv_src_stride },
                             W, H, coeff, offset);
                    call_new(dst1, (ptrdiff_t[3]) { y_dst_stride, uv_dst_stride, uv_dst_stride },
                             src, (ptrdiff_t[3]) { y_src_stride, uv_src_stride, uv_src_stride },
                             W, H, coeff, offset);
                    if (memcmp(dst0[0], dst1[0], y_dst_stride * H) ||
                        memcmp(dst0[1], dst1[1], uv_dst_stride * H >> ss_h) ||
                        memcmp(dst0[2], dst1[2], uv_dst_stride * H >> ss_h)) {
                        fail();
                    }
                }
            }
        }
    }

    report("yuv2yuv");
}
static void check_yuv2rgb(void)
{
    declare_func(void, int16_t *dst[3], ptrdiff_t dst_stride,
                 uint8_t *src[3], ptrdiff_t src_stride[3],
                 int w, int h, const int16_t coeff[3][3][8],
                 const int16_t off[8]);
    ColorSpaceDSPContext dsp;
    int idepth, fmt, n;
    LOCAL_ALIGNED_32(uint8_t, src_y, [W * H * 2]);
    LOCAL_ALIGNED_32(uint8_t, src_u, [W * H * 2]);
    LOCAL_ALIGNED_32(uint8_t, src_v, [W * H * 2]);
    uint8_t *src[3] = { src_y, src_u, src_v };
    LOCAL_ALIGNED_32(int16_t, dst0_y, [W * H]);
    LOCAL_ALIGNED_32(int16_t, dst0_u, [W * H]);
    LOCAL_ALIGNED_32(int16_t, dst0_v, [W * H]);
    LOCAL_ALIGNED_32(int16_t, dst1_y, [W * H]);
    LOCAL_ALIGNED_32(int16_t, dst1_u, [W * H]);
    LOCAL_ALIGNED_32(int16_t, dst1_v, [W * H]);
    int16_t *dst0[3] = { dst0_y, dst0_u, dst0_v },
            *dst1[3] = { dst1_y, dst1_u, dst1_v };
    LOCAL_ALIGNED_32(int16_t, offset, [8]);
    LOCAL_ALIGNED_32(int16_t, coeff_buf, [3 * 3 * 8]);
    int16_t (*coeff)[3][8] = (int16_t (*)[3][8]) coeff_buf;

    ff_colorspacedsp_init(&dsp);
    for (n = 0; n < 8; n++) {
        offset[n] = 16;

        coeff[0][0][n] = coeff[1][0][n] = coeff[2][0][n] = (1 << 14) | 1;
        coeff[0][1][n] = coeff[2][2][n] = 0;
        coeff[0][2][n] = 1 << 13;
        coeff[1][1][n] = -(1 << 12);
        coeff[1][2][n] = 1 << 12;
        coeff[2][1][n] = 1 << 11;
    }
    for (idepth = 0; idepth < 3; idepth++) {
        for (fmt = 0; fmt < 3; fmt++) {
            if (check_func(dsp.yuv2rgb[idepth][fmt],
                           "ff_colorspacedsp_yuv2rgb_%sp%d",
                           format_string[fmt], idepth * 2 + 8)) {
                int ss_w = !!fmt, ss_h = fmt == 2;
                int y_src_stride = W << !!idepth;
                int uv_src_stride = y_src_stride >> ss_w;

                randomize_buffers();
                call_ref(dst0, W, src,
                         (ptrdiff_t[3]) { y_src_stride, uv_src_stride, uv_src_stride },
                         W, H, coeff, offset);
                call_new(dst1, W, src,
                         (ptrdiff_t[3]) { y_src_stride, uv_src_stride, uv_src_stride },
                         W, H, coeff, offset);
                if (memcmp(dst0[0], dst1[0], W * H * sizeof(int16_t)) ||
                    memcmp(dst0[1], dst1[1], W * H * sizeof(int16_t)) ||
                    memcmp(dst0[2], dst1[2], W * H * sizeof(int16_t))) {
                    fail();
                }
            }
        }
    }

    report("yuv2rgb");
}
#undef randomize_buffers
#define randomize_buffers() \
do { \
int y, x, p; \
for (p = 0; p < 3; p++) { \
for (y = 0; y < H; y++) { \
for (x = 0; x < W; x++) { \
int r = rnd() & 0x7fff; \
r -= (32768 - 28672) >> 1; \
src[p][y * W + x] = r; \
} \
} \
} \
} while (0)
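/* Note (added for clarity): the redefined randomize_buffers() above fills the
 * int16_t RGB planes with 15-bit random values shifted down by
 * (32768 - 28672) / 2 = 2048, so samples span [-2048, 30719], presumably to
 * cover the filter's intermediate range plus symmetric out-of-range
 * excursions that exercise the clipping paths. */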
static void check_rgb2yuv(void)
{
    declare_func(void, uint8_t *dst[3], ptrdiff_t dst_stride[3],
                 int16_t *src[3], ptrdiff_t src_stride,
                 int w, int h, const int16_t coeff[3][3][8],
                 const int16_t off[8]);
    ColorSpaceDSPContext dsp;
    int odepth, fmt, n;
    LOCAL_ALIGNED_32(int16_t, src_y, [W * H * 2]);
    LOCAL_ALIGNED_32(int16_t, src_u, [W * H * 2]);
    LOCAL_ALIGNED_32(int16_t, src_v, [W * H * 2]);
    int16_t *src[3] = { src_y, src_u, src_v };
    LOCAL_ALIGNED_32(uint8_t, dst0_y, [W * H]);
    LOCAL_ALIGNED_32(uint8_t, dst0_u, [W * H]);
    LOCAL_ALIGNED_32(uint8_t, dst0_v, [W * H]);
    LOCAL_ALIGNED_32(uint8_t, dst1_y, [W * H]);
    LOCAL_ALIGNED_32(uint8_t, dst1_u, [W * H]);
    LOCAL_ALIGNED_32(uint8_t, dst1_v, [W * H]);
    uint8_t *dst0[3] = { dst0_y, dst0_u, dst0_v },
            *dst1[3] = { dst1_y, dst1_u, dst1_v };
    LOCAL_ALIGNED_32(int16_t, offset, [8]);
    LOCAL_ALIGNED_32(int16_t, coeff_buf, [3 * 3 * 8]);
    int16_t (*coeff)[3][8] = (int16_t (*)[3][8]) coeff_buf;

    ff_colorspacedsp_init(&dsp);
    for (n = 0; n < 8; n++) {
        offset[n] = 16;

        // these somewhat resemble bt601/smpte170m coefficients
        coeff[0][0][n] = lrint(0.3 * (1 << 14));
        coeff[0][1][n] = lrint(0.6 * (1 << 14));
        coeff[0][2][n] = lrint(0.1 * (1 << 14));
        coeff[1][0][n] = lrint(-0.15 * (1 << 14));
        coeff[1][1][n] = lrint(-0.35 * (1 << 14));
        coeff[1][2][n] = lrint(0.5 * (1 << 14));
        coeff[2][0][n] = lrint(0.5 * (1 << 14));
        coeff[2][1][n] = lrint(-0.42 * (1 << 14));
        coeff[2][2][n] = lrint(-0.08 * (1 << 14));
    }
    for (odepth = 0; odepth < 3; odepth++) {
        for (fmt = 0; fmt < 3; fmt++) {
            if (check_func(dsp.rgb2yuv[odepth][fmt],
                           "ff_colorspacedsp_rgb2yuv_%sp%d",
                           format_string[fmt], odepth * 2 + 8)) {
                int ss_w = !!fmt, ss_h = fmt == 2;
                int y_dst_stride = W << !!odepth;
                int uv_dst_stride = y_dst_stride >> ss_w;

                randomize_buffers();
                call_ref(dst0, (ptrdiff_t[3]) { y_dst_stride, uv_dst_stride, uv_dst_stride },
                         src, W, W, H, coeff, offset);
                call_new(dst1, (ptrdiff_t[3]) { y_dst_stride, uv_dst_stride, uv_dst_stride },
                         src, W, W, H, coeff, offset);
                if (memcmp(dst0[0], dst1[0], H * y_dst_stride) ||
                    memcmp(dst0[1], dst1[1], H * uv_dst_stride >> ss_h) ||
                    memcmp(dst0[2], dst1[2], H * uv_dst_stride >> ss_h)) {
                    fail();
                }
            }
        }
    }

    report("rgb2yuv");
}
static void check_multiply3x3(void)
{
    declare_func(void, int16_t *data[3], ptrdiff_t stride, int w, int h,
                 const int16_t coeff[3][3][8]);
    ColorSpaceDSPContext dsp;
    LOCAL_ALIGNED_32(int16_t, dst0_y, [W * H]);
    LOCAL_ALIGNED_32(int16_t, dst0_u, [W * H]);
    LOCAL_ALIGNED_32(int16_t, dst0_v, [W * H]);
    LOCAL_ALIGNED_32(int16_t, dst1_y, [W * H]);
    LOCAL_ALIGNED_32(int16_t, dst1_u, [W * H]);
    LOCAL_ALIGNED_32(int16_t, dst1_v, [W * H]);
    int16_t *dst0[3] = { dst0_y, dst0_u, dst0_v },
            *dst1[3] = { dst1_y, dst1_u, dst1_v };
    int16_t **src = dst0;
    LOCAL_ALIGNED_32(int16_t, coeff_buf, [3 * 3 * 8]);
    int16_t (*coeff)[3][8] = (int16_t (*)[3][8]) coeff_buf;
    int n;

    ff_colorspacedsp_init(&dsp);
    for (n = 0; n < 8; n++) {
        coeff[0][0][n] = lrint(0.85 * (1 << 14));
        coeff[0][1][n] = lrint(0.10 * (1 << 14));
        coeff[0][2][n] = lrint(0.05 * (1 << 14));
        coeff[1][0][n] = lrint(-0.1 * (1 << 14));
        coeff[1][1][n] = lrint(0.95 * (1 << 14));
        coeff[1][2][n] = lrint(0.15 * (1 << 14));
        coeff[2][0][n] = lrint(-0.2 * (1 << 14));
        coeff[2][1][n] = lrint(0.30 * (1 << 14));
        coeff[2][2][n] = lrint(0.90 * (1 << 14));
    }
    if (check_func(dsp.multiply3x3, "ff_colorspacedsp_multiply3x3")) {
        randomize_buffers();
        memcpy(dst1_y, dst0_y, W * H * sizeof(*dst1_y));
        memcpy(dst1_u, dst0_u, W * H * sizeof(*dst1_u));
        memcpy(dst1_v, dst0_v, W * H * sizeof(*dst1_v));
        call_ref(dst0, W, W, H, coeff);
        call_new(dst1, W, W, H, coeff);
        if (memcmp(dst0[0], dst1[0], H * W * sizeof(*dst0_y)) ||
            memcmp(dst0[1], dst1[1], H * W * sizeof(*dst0_u)) ||
            memcmp(dst0[2], dst1[2], H * W * sizeof(*dst0_v))) {
            fail();
        }
    }

    report("multiply3x3");
}
void checkasm_check_colorspace(void)
{
    check_yuv2yuv();
    check_yuv2rgb();
    check_rgb2yuv();
    check_multiply3x3();
}