Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
F
ffmpeg.wasm-core
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Linshizhi
ffmpeg.wasm-core
Commits
6cacecdc
Commit
6cacecdc
authored
Oct 15, 2011
by
Ronald S. Bultje
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
swscale: make yuv2yuvX_10_sse2/avx 8/9/16-bits aware.
Also implement MMX/MMX2 versions and SSE4 versions.
parent
7fbbf952
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
227 additions
and
255 deletions
+227
-255
scale.asm
libswscale/x86/scale.asm
+200
-39
swscale_mmx.c
libswscale/x86/swscale_mmx.c
+27
-10
swscale_template.c
libswscale/x86/swscale_template.c
+0
-206
No files found.
libswscale/x86/scale.asm
View file @
6cacecdc
...
@@ -29,8 +29,11 @@ max_19bit_int: times 4 dd 0x7ffff
...
@@ -29,8 +29,11 @@ max_19bit_int: times 4 dd 0x7ffff
max_19bit_flt
:
times
4
dd
524287
.
0
max_19bit_flt
:
times
4
dd
524287
.
0
minshort
:
times
8
dw
0x8000
minshort
:
times
8
dw
0x8000
unicoeff
:
times
4
dd
0x20000000
unicoeff
:
times
4
dd
0x20000000
yuv2yuvX_16_start
:
times
4
dd
0x4000
-
0x40000000
yuv2yuvX_10_start
:
times
4
dd
0x10000
yuv2yuvX_10_start
:
times
4
dd
0x10000
yuv2yuvX_9_start
:
times
4
dd
0x20000
yuv2yuvX_10_upper
:
times
8
dw
0x3ff
yuv2yuvX_10_upper
:
times
8
dw
0x3ff
yuv2yuvX_9_upper
:
times
8
dw
0x1ff
SECTION
.
text
SECTION
.
text
...
@@ -447,33 +450,134 @@ SCALE_FUNCS2 sse4, 6, 6, 8
...
@@ -447,33 +450,134 @@ SCALE_FUNCS2 sse4, 6, 6, 8
; of 2. $offset is either 0 or 3. $dither holds 8 values.
; of 2. $offset is either 0 or 3. $dither holds 8 values.
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
%macro
yuv2planeX
10
1
%macro
yuv2planeX
_fn
4
%ifdef
ARCH_X86_32
%ifdef
ARCH_X86_32
%define
cntr_reg
r1
%define
cntr_reg
r1
%define
movsx
mov
%else
%else
%define
cntr_reg
r11
%define
cntr_reg
r11
%define
movsx
movsxd
%endif
%endif
cglobal
yuv2planeX10_
%1
,
7
,
7
cglobal
yuv2planeX_
%2
_
%1
,
%4
,
7
,
%3
%if
%2
==
8
||
%2
==
9
||
%2
==
10
pxor
m6
,
m6
%endif
; %2 == 8/9/10
%if
%2
==
8
%ifdef
ARCH_X86_32
%assign
pad
0x2c
-
(
stack_offset
&
15
)
SUB
rsp
,
pad
%define
m_dith
m7
%else
; x86-64
%define
m_dith
m9
%endif
; x86-32
; create registers holding dither
movq
m_dith
,
[r5]
; dither
test
r6d
,
r6d
jz
.
no_rot
%if
mmsize
==
16
punpcklqdq
m_dith
,
m_dith
%endif
; mmsize == 16
PALIGNR
m_dith
,
m_dith
,
3
,
m0
.
no_rot
:
%if
mmsize
==
16
punpcklbw
m_dith
,
m6
%ifdef
ARCH_X86_64
punpcklwd
m8
,
m_dith
,
m6
pslld
m8
,
12
%else
; x86-32
punpcklwd
m5
,
m_dith
,
m6
pslld
m5
,
12
%endif
; x86-32/64
punpckhwd
m_dith
,
m6
pslld
m_dith
,
12
%ifdef
ARCH_X86_32
mova
[
rsp
+
0
]
,
m5
mova
[
rsp
+
16
]
,
m_dith
%endif
%else
; mmsize == 8
punpcklbw
m5
,
m_dith
,
m6
punpckhbw
m_dith
,
m6
punpcklwd
m4
,
m5
,
m6
punpckhwd
m5
,
m6
punpcklwd
m3
,
m_dith
,
m6
punpckhwd
m_dith
,
m6
pslld
m4
,
12
pslld
m5
,
12
pslld
m3
,
12
pslld
m_dith
,
12
mova
[
rsp
+
0
]
,
m4
mova
[
rsp
+
8
]
,
m5
mova
[
rsp
+
16
]
,
m3
mova
[
rsp
+
24
]
,
m_dith
%endif
; mmsize == 8/16
%endif
; %2 == 8
xor
r5
,
r5
xor
r5
,
r5
.
pixelloop
.
pixelloop
mova
m1
,
[
yuv2yuvX_10_start
]
%assign
%%
i
0
; the rep here is for the 8bit output mmx case, where dither covers
; 8 pixels but we can only handle 2 pixels per register, and thus 4
; pixels per iteration. In order to not have to keep track of where
; we are w.r.t. dithering, we unroll the mmx/8bit loop x2.
%if
%2
==
8
%rep
16
/
mmsize
%endif
; %2 == 8
%if
%2
==
8
%ifdef
ARCH_X86_32
mova
m2
,
[
rsp
+
mmsize
*
(
0
+
%%
i
)
]
mova
m1
,
[
rsp
+
mmsize
*
(
1
+
%%
i
)
]
%else
; x86-64
mova
m2
,
m8
mova
m1
,
m_dith
%endif
; x86-32/64
%else
; %2 == 9/10/16
mova
m1
,
[
yuv2yuvX_
%2
_start
]
mova
m2
,
m1
mova
m2
,
m1
movsxdifnidn
cntr_reg
,
r1d
%endif
; %2 == 8/9/10/16
.
filterloop
movsx
cntr_reg
,
r1m
pxor
m0
,
m0
.
filterloop_
%
+
%%
i
; input pixels
mov
r6
,
[
r2
+
gprsize
*
cntr_reg
-
2
*
gprsize
]
mov
r6
,
[
r2
+
gprsize
*
cntr_reg
-
2
*
gprsize
]
mova
m3
,
[
r6
+
r5
]
%if
%2
==
16
mova
m3
,
[
r6
+
r5
*
4
]
mova
m5
,
[
r6
+
r5
*
4
+
mmsize
]
%else
; %2 == 8/9/10
mova
m3
,
[
r6
+
r5
*
2
]
%endif
; %2 == 8/9/10/16
mov
r6
,
[
r2
+
gprsize
*
cntr_reg
-
gprsize
]
mov
r6
,
[
r2
+
gprsize
*
cntr_reg
-
gprsize
]
mova
m4
,
[
r6
+
r5
]
%if
%2
==
16
mova
m4
,
[
r6
+
r5
*
4
]
mova
m6
,
[
r6
+
r5
*
4
+
mmsize
]
%else
; %2 == 8/9/10
mova
m4
,
[
r6
+
r5
*
2
]
%endif
; %2 == 8/9/10/16
; coefficients
movd
m0
,
[
r0
+
2
*
cntr_reg
-
4
]
; coeff[0], coeff[1]
%if
%2
==
16
pshuflw
m7
,
m0
,
0
; coeff[0]
pshuflw
m0
,
m0
,
0x55
; coeff[1]
pmovsxwd
m7
,
m7
; word -> dword
pmovsxwd
m0
,
m0
; word -> dword
pmulld
m3
,
m7
pmulld
m5
,
m7
pmulld
m4
,
m0
pmulld
m6
,
m0
paddd
m2
,
m3
paddd
m1
,
m5
paddd
m2
,
m4
paddd
m1
,
m6
%else
; %2 == 10/9/8
punpcklwd
m5
,
m3
,
m4
punpcklwd
m5
,
m3
,
m4
punpckhwd
m3
,
m4
punpckhwd
m3
,
m4
movd
m0
,
[
r0
+
2
*
cntr_reg
-
4
]
SPLATD
m0
,
m0
SPLATD
m0
,
m0
pmaddwd
m5
,
m0
pmaddwd
m5
,
m0
...
@@ -481,24 +585,81 @@ cglobal yuv2planeX10_%1, 7, 7
...
@@ -481,24 +585,81 @@ cglobal yuv2planeX10_%1, 7, 7
paddd
m2
,
m5
paddd
m2
,
m5
paddd
m1
,
m3
paddd
m1
,
m3
%endif
; %2 == 8/9/10/16
sub
cntr_reg
,
2
sub
cntr_reg
,
2
jg
.
filterloop
jg
.
filterloop_
%
+
%%
i
psrad
m2
,
17
%if
%2
==
16
psrad
m1
,
17
psrad
m2
,
31
-
%2
psrad
m1
,
31
-
%2
%else
; %2 == 10/9/8
psrad
m2
,
27
-
%2
psrad
m1
,
27
-
%2
%endif
; %2 == 8/9/10/16
%if
%2
==
8
packssdw
m2
,
m1
packuswb
m2
,
m2
movh
[
r3
+
r5
*
1
]
,
m2
%else
; %2 == 9/10/16
%if
%2
==
16
packssdw
m2
,
m1
paddw
m2
,
[minshort]
%else
; %2 == 9/10
%ifidn
%1
,
sse4
packusdw
m2
,
m1
packusdw
m2
,
m1
pminsw
m2
,
[
yuv2yuvX_10_upper
]
%elifidn
%1
,
avx
mova
[
r3
+
r5
]
,
m2
packusdw
m2
,
m1
%else
; mmx2/sse2
add
r5
,
mmsize
packssdw
m2
,
m1
pmaxsw
m2
,
m6
%endif
; mmx2/sse2/sse4/avx
pminsw
m2
,
[
yuv2yuvX_
%2
_upper
]
%endif
; %2 == 9/10/16
mova
[
r3
+
r5
*
2
]
,
m2
%endif
; %2 == 8/9/10/16
add
r5
,
mmsize
/
2
sub
r4d
,
mmsize
/
2
sub
r4d
,
mmsize
/
2
%if
%2
==
8
%assign
%%
i
%%
i
+
2
%endrep
%endif
; %2 == 8
jg
.
pixelloop
jg
.
pixelloop
%if
%2
==
8
%ifdef
ARCH_X86_32
ADD
rsp
,
pad
RET
%else
; x86-64
REP_RET
REP_RET
%endif
; x86-32/64
%else
; %2 == 9/10/16
REP_RET
%endif
; %2 == 8/9/10/16
%endmacro
%endmacro
%define
PALIGNR
PALIGNR_MMX
%ifdef
ARCH_X86_32
INIT_MMX
yuv2planeX_fn
mmx
,
8
,
0
,
7
yuv2planeX_fn
mmx2
,
9
,
0
,
5
yuv2planeX_fn
mmx2
,
10
,
0
,
5
%endif
INIT_XMM
INIT_XMM
yuv2planeX10
sse4
yuv2planeX_fn
sse2
,
8
,
10
,
7
yuv2planeX_fn
sse2
,
9
,
7
,
5
yuv2planeX_fn
sse2
,
10
,
7
,
5
%define
PALIGNR
PALIGNR_SSSE3
yuv2planeX_fn
sse4
,
8
,
10
,
7
yuv2planeX_fn
sse4
,
9
,
7
,
5
yuv2planeX_fn
sse4
,
10
,
7
,
5
yuv2planeX_fn
sse4
,
16
,
8
,
5
INIT_AVX
INIT_AVX
yuv2planeX10
avx
yuv2planeX_fn
avx
,
8
,
10
,
7
yuv2planeX_fn
avx
,
9
,
7
,
5
yuv2planeX_fn
avx
,
10
,
7
,
5
libswscale/x86/swscale_mmx.c
View file @
6cacecdc
...
@@ -211,13 +211,22 @@ SCALE_FUNCS_SSE(sse2);
...
@@ -211,13 +211,22 @@ SCALE_FUNCS_SSE(sse2);
SCALE_FUNCS_SSE
(
ssse3
);
SCALE_FUNCS_SSE
(
ssse3
);
SCALE_FUNCS_SSE
(
sse4
);
SCALE_FUNCS_SSE
(
sse4
);
extern
void
ff_yuv2planeX10_sse4
(
const
int16_t
*
filter
,
int
filterSize
,
#define VSCALEX_FUNC(size, opt) \
const
int16_t
**
src
,
uint8_t
*
dest
,
int
dstW
,
extern void ff_yuv2planeX_ ## size ## _ ## opt(const int16_t *filter, int filterSize, \
const
uint8_t
*
dither
,
int
offset
);
const int16_t **src, uint8_t *dest, int dstW, \
const uint8_t *dither, int offset)
#define VSCALEX_FUNCS(opt1, opt2) \
VSCALEX_FUNC(8, opt1); \
VSCALEX_FUNC(9, opt2); \
VSCALEX_FUNC(10, opt2)
extern
void
ff_yuv2planeX10_avx
(
const
int16_t
*
filter
,
int
filterSize
,
#if ARCH_X86_32
const
int16_t
**
src
,
uint8_t
*
dest
,
int
dstW
,
VSCALEX_FUNCS
(
mmx
,
mmx2
);
const
uint8_t
*
dither
,
int
offset
);
#endif
VSCALEX_FUNCS
(
sse2
,
sse2
);
VSCALEX_FUNCS
(
sse4
,
sse4
);
VSCALEX_FUNC
(
16
,
sse4
);
VSCALEX_FUNCS
(
avx
,
avx
);
void
ff_sws_init_swScale_mmx
(
SwsContext
*
c
)
void
ff_sws_init_swScale_mmx
(
SwsContext
*
c
)
{
{
...
@@ -252,10 +261,18 @@ void ff_sws_init_swScale_mmx(SwsContext *c)
...
@@ -252,10 +261,18 @@ void ff_sws_init_swScale_mmx(SwsContext *c)
case 8: ASSIGN_SCALE_FUNC2(hscalefn, 8, opt1, opt2); break; \
case 8: ASSIGN_SCALE_FUNC2(hscalefn, 8, opt1, opt2); break; \
default: ASSIGN_SCALE_FUNC2(hscalefn, X, opt1, opt2); break; \
default: ASSIGN_SCALE_FUNC2(hscalefn, X, opt1, opt2); break; \
}
}
#define ASSIGN_VSCALEX_FUNC(vscalefn, opt1, opt2, opt2chk, do_16_case) \
switch(c->dstBpc){ \
case 16: do_16_case; break; \
case 10: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2planeX_10_ ## opt2; break; \
case 9: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2planeX_9_ ## opt2; break; \
default: vscalefn = ff_yuv2planeX_8_ ## opt1; break; \
}
#if ARCH_X86_32
#if ARCH_X86_32
if
(
cpu_flags
&
AV_CPU_FLAG_MMX
)
{
if
(
cpu_flags
&
AV_CPU_FLAG_MMX
)
{
ASSIGN_MMX_SCALE_FUNC
(
c
->
hyScale
,
c
->
hLumFilterSize
,
mmx
,
mmx
);
ASSIGN_MMX_SCALE_FUNC
(
c
->
hyScale
,
c
->
hLumFilterSize
,
mmx
,
mmx
);
ASSIGN_MMX_SCALE_FUNC
(
c
->
hcScale
,
c
->
hChrFilterSize
,
mmx
,
mmx
);
ASSIGN_MMX_SCALE_FUNC
(
c
->
hcScale
,
c
->
hChrFilterSize
,
mmx
,
mmx
);
ASSIGN_VSCALEX_FUNC
(
c
->
yuv2planeX
,
mmx
,
mmx2
,
cpu_flags
&
AV_CPU_FLAG_MMX2
,);
}
}
#endif
#endif
#define ASSIGN_SSE_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \
#define ASSIGN_SSE_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \
...
@@ -269,6 +286,7 @@ void ff_sws_init_swScale_mmx(SwsContext *c)
...
@@ -269,6 +286,7 @@ void ff_sws_init_swScale_mmx(SwsContext *c)
if
(
cpu_flags
&
AV_CPU_FLAG_SSE2
)
{
if
(
cpu_flags
&
AV_CPU_FLAG_SSE2
)
{
ASSIGN_SSE_SCALE_FUNC
(
c
->
hyScale
,
c
->
hLumFilterSize
,
sse2
,
sse2
);
ASSIGN_SSE_SCALE_FUNC
(
c
->
hyScale
,
c
->
hLumFilterSize
,
sse2
,
sse2
);
ASSIGN_SSE_SCALE_FUNC
(
c
->
hcScale
,
c
->
hChrFilterSize
,
sse2
,
sse2
);
ASSIGN_SSE_SCALE_FUNC
(
c
->
hcScale
,
c
->
hChrFilterSize
,
sse2
,
sse2
);
ASSIGN_VSCALEX_FUNC
(
c
->
yuv2planeX
,
sse2
,
sse2
,
1
,);
}
}
if
(
cpu_flags
&
AV_CPU_FLAG_SSSE3
)
{
if
(
cpu_flags
&
AV_CPU_FLAG_SSSE3
)
{
ASSIGN_SSE_SCALE_FUNC
(
c
->
hyScale
,
c
->
hLumFilterSize
,
ssse3
,
ssse3
);
ASSIGN_SSE_SCALE_FUNC
(
c
->
hyScale
,
c
->
hLumFilterSize
,
ssse3
,
ssse3
);
...
@@ -278,13 +296,12 @@ void ff_sws_init_swScale_mmx(SwsContext *c)
...
@@ -278,13 +296,12 @@ void ff_sws_init_swScale_mmx(SwsContext *c)
/* Xto15 don't need special sse4 functions */
/* Xto15 don't need special sse4 functions */
ASSIGN_SSE_SCALE_FUNC
(
c
->
hyScale
,
c
->
hLumFilterSize
,
sse4
,
ssse3
);
ASSIGN_SSE_SCALE_FUNC
(
c
->
hyScale
,
c
->
hLumFilterSize
,
sse4
,
ssse3
);
ASSIGN_SSE_SCALE_FUNC
(
c
->
hcScale
,
c
->
hChrFilterSize
,
sse4
,
ssse3
);
ASSIGN_SSE_SCALE_FUNC
(
c
->
hcScale
,
c
->
hChrFilterSize
,
sse4
,
ssse3
);
if
(
c
->
dstBpc
==
10
&&
!
isBE
(
c
->
dstFormat
))
ASSIGN_VSCALEX_FUNC
(
c
->
yuv2planeX
,
sse4
,
sse4
,
1
,
c
->
yuv2planeX
=
ff_yuv2planeX10_sse4
;
if
(
!
isBE
(
c
->
dstFormat
))
c
->
yuv2planeX
=
ff_yuv2planeX_16_sse4
)
;
}
}
if
(
cpu_flags
&
AV_CPU_FLAG_AVX
)
{
if
(
cpu_flags
&
AV_CPU_FLAG_AVX
)
{
if
(
c
->
dstBpc
==
10
&&
!
isBE
(
c
->
dstFormat
))
ASSIGN_VSCALEX_FUNC
(
c
->
yuv2planeX
,
avx
,
avx
,
1
,);
c
->
yuv2planeX
=
ff_yuv2planeX10_avx
;
}
}
#endif
#endif
}
}
libswscale/x86/swscale_template.c
View file @
6cacecdc
...
@@ -35,41 +35,6 @@
...
@@ -35,41 +35,6 @@
#endif
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
#define YSCALEYUV2YV12X(offset, dest, end, pos) \
__asm__ volatile(\
"movq "DITHER16"+0(%0), %%mm3 \n\t"\
"movq "DITHER16"+8(%0), %%mm4 \n\t"\
"lea " offset "(%0), %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
".p2align 4 \n\t"
/* FIXME Unroll? */
\
"1: \n\t"\
"movq 8(%%"REG_d"), %%mm0 \n\t"
/* filterCoeff */
\
"movq (%%"REG_S", %3, 2), %%mm2 \n\t"
/* srcData */
\
"movq 8(%%"REG_S", %3, 2), %%mm5 \n\t"
/* srcData */
\
"add $16, %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"test %%"REG_S", %%"REG_S" \n\t"\
"pmulhw %%mm0, %%mm2 \n\t"\
"pmulhw %%mm0, %%mm5 \n\t"\
"paddw %%mm2, %%mm3 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
" jnz 1b \n\t"\
"psraw $3, %%mm3 \n\t"\
"psraw $3, %%mm4 \n\t"\
"packuswb %%mm4, %%mm3 \n\t"\
MOVNTQ(%%mm3, (%1, %3))\
"add $8, %3 \n\t"\
"cmp %2, %3 \n\t"\
"movq "DITHER16"+0(%0), %%mm3 \n\t"\
"movq "DITHER16"+8(%0), %%mm4 \n\t"\
"lea " offset "(%0), %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"jb 1b \n\t"\
:: "r" (&c->redDither),\
"r" (dest), "g" ((x86_reg)(end)), "r"((x86_reg)(pos))\
: "%"REG_d, "%"REG_S\
);
#if !COMPILE_TEMPLATE_MMX2
#if !COMPILE_TEMPLATE_MMX2
static
av_always_inline
void
static
av_always_inline
void
dither_8to16
(
SwsContext
*
c
,
const
uint8_t
*
srcDither
,
int
rot
)
dither_8to16
(
SwsContext
*
c
,
const
uint8_t
*
srcDither
,
int
rot
)
...
@@ -106,175 +71,6 @@ dither_8to16(SwsContext *c, const uint8_t *srcDither, int rot)
...
@@ -106,175 +71,6 @@ dither_8to16(SwsContext *c, const uint8_t *srcDither, int rot)
}
}
#endif
#endif
static
void
RENAME
(
yuv2yuvX
)(
SwsContext
*
c
,
const
int16_t
*
lumFilter
,
const
int16_t
**
lumSrc
,
int
lumFilterSize
,
const
int16_t
*
chrFilter
,
const
int16_t
**
chrUSrc
,
const
int16_t
**
chrVSrc
,
int
chrFilterSize
,
const
int16_t
**
alpSrc
,
uint8_t
*
dest
[
4
],
int
dstW
,
int
chrDstW
)
{
uint8_t
*
yDest
=
dest
[
0
],
*
uDest
=
dest
[
1
],
*
vDest
=
dest
[
2
],
*
aDest
=
CONFIG_SWSCALE_ALPHA
?
dest
[
3
]
:
NULL
;
const
uint8_t
*
lumDither
=
c
->
lumDither8
,
*
chrDither
=
c
->
chrDither8
;
if
(
uDest
)
{
x86_reg
uv_off
=
c
->
uv_off_byte
>>
1
;
dither_8to16
(
c
,
chrDither
,
0
);
YSCALEYUV2YV12X
(
CHR_MMX_FILTER_OFFSET
,
uDest
,
chrDstW
,
0
)
dither_8to16
(
c
,
chrDither
,
1
);
YSCALEYUV2YV12X
(
CHR_MMX_FILTER_OFFSET
,
vDest
-
uv_off
,
chrDstW
+
uv_off
,
uv_off
)
}
dither_8to16
(
c
,
lumDither
,
0
);
if
(
CONFIG_SWSCALE_ALPHA
&&
aDest
)
{
YSCALEYUV2YV12X
(
ALP_MMX_FILTER_OFFSET
,
aDest
,
dstW
,
0
)
}
YSCALEYUV2YV12X
(
LUM_MMX_FILTER_OFFSET
,
yDest
,
dstW
,
0
)
}
#define YSCALEYUV2YV12X_ACCURATE(offset, dest, end, pos) \
__asm__ volatile(\
"lea " offset "(%0), %%"REG_d" \n\t"\
"movq "DITHER32"+0(%0), %%mm4 \n\t"\
"movq "DITHER32"+8(%0), %%mm5 \n\t"\
"movq "DITHER32"+16(%0), %%mm6 \n\t"\
"movq "DITHER32"+24(%0), %%mm7 \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
".p2align 4 \n\t"\
"1: \n\t"\
"movq (%%"REG_S", %3, 2), %%mm0 \n\t"
/* srcData */
\
"movq 8(%%"REG_S", %3, 2), %%mm2 \n\t"
/* srcData */
\
"mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
"movq (%%"REG_S", %3, 2), %%mm1 \n\t"
/* srcData */
\
"movq %%mm0, %%mm3 \n\t"\
"punpcklwd %%mm1, %%mm0 \n\t"\
"punpckhwd %%mm1, %%mm3 \n\t"\
"movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t"
/* filterCoeff */
\
"pmaddwd %%mm1, %%mm0 \n\t"\
"pmaddwd %%mm1, %%mm3 \n\t"\
"paddd %%mm0, %%mm4 \n\t"\
"paddd %%mm3, %%mm5 \n\t"\
"movq 8(%%"REG_S", %3, 2), %%mm3 \n\t"
/* srcData */
\
"mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
"add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
"test %%"REG_S", %%"REG_S" \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"punpcklwd %%mm3, %%mm2 \n\t"\
"punpckhwd %%mm3, %%mm0 \n\t"\
"pmaddwd %%mm1, %%mm2 \n\t"\
"pmaddwd %%mm1, %%mm0 \n\t"\
"paddd %%mm2, %%mm6 \n\t"\
"paddd %%mm0, %%mm7 \n\t"\
" jnz 1b \n\t"\
"psrad $16, %%mm4 \n\t"\
"psrad $16, %%mm5 \n\t"\
"psrad $16, %%mm6 \n\t"\
"psrad $16, %%mm7 \n\t"\
"movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
"packssdw %%mm5, %%mm4 \n\t"\
"packssdw %%mm7, %%mm6 \n\t"\
"paddw %%mm0, %%mm4 \n\t"\
"paddw %%mm0, %%mm6 \n\t"\
"psraw $3, %%mm4 \n\t"\
"psraw $3, %%mm6 \n\t"\
"packuswb %%mm6, %%mm4 \n\t"\
MOVNTQ(%%mm4, (%1, %3))\
"add $8, %3 \n\t"\
"cmp %2, %3 \n\t"\
"lea " offset "(%0), %%"REG_d" \n\t"\
"movq "DITHER32"+0(%0), %%mm4 \n\t"\
"movq "DITHER32"+8(%0), %%mm5 \n\t"\
"movq "DITHER32"+16(%0), %%mm6 \n\t"\
"movq "DITHER32"+24(%0), %%mm7 \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"jb 1b \n\t"\
:: "r" (&c->redDither),\
"r" (dest), "g" ((x86_reg)(end)), "r"((x86_reg)(pos))\
: "%"REG_a, "%"REG_d, "%"REG_S\
);
#if !COMPILE_TEMPLATE_MMX2
static
av_always_inline
void
dither_8to32
(
SwsContext
*
c
,
const
uint8_t
*
srcDither
,
int
rot
)
{
if
(
rot
)
{
__asm__
volatile
(
"pxor %%mm0, %%mm0
\n\t
"
"movq (%0), %%mm4
\n\t
"
"movq %%mm4, %%mm5
\n\t
"
"psrlq $24, %%mm4
\n\t
"
"psllq $40, %%mm5
\n\t
"
"por %%mm5, %%mm4
\n\t
"
"movq %%mm4, %%mm6
\n\t
"
"punpcklbw %%mm0, %%mm4
\n\t
"
"punpckhbw %%mm0, %%mm6
\n\t
"
"movq %%mm4, %%mm5
\n\t
"
"movq %%mm6, %%mm7
\n\t
"
"punpcklwd %%mm0, %%mm4
\n\t
"
"punpckhwd %%mm0, %%mm5
\n\t
"
"punpcklwd %%mm0, %%mm6
\n\t
"
"punpckhwd %%mm0, %%mm7
\n\t
"
"pslld $12, %%mm4
\n\t
"
"pslld $12, %%mm5
\n\t
"
"pslld $12, %%mm6
\n\t
"
"pslld $12, %%mm7
\n\t
"
"movq %%mm4, "
DITHER32
"+0(%1)
\n\t
"
"movq %%mm5, "
DITHER32
"+8(%1)
\n\t
"
"movq %%mm6, "
DITHER32
"+16(%1)
\n\t
"
"movq %%mm7, "
DITHER32
"+24(%1)
\n\t
"
::
"r"
(
srcDither
),
"r"
(
&
c
->
redDither
)
);
}
else
{
__asm__
volatile
(
"pxor %%mm0, %%mm0
\n\t
"
"movq (%0), %%mm4
\n\t
"
"movq %%mm4, %%mm6
\n\t
"
"punpcklbw %%mm0, %%mm4
\n\t
"
"punpckhbw %%mm0, %%mm6
\n\t
"
"movq %%mm4, %%mm5
\n\t
"
"movq %%mm6, %%mm7
\n\t
"
"punpcklwd %%mm0, %%mm4
\n\t
"
"punpckhwd %%mm0, %%mm5
\n\t
"
"punpcklwd %%mm0, %%mm6
\n\t
"
"punpckhwd %%mm0, %%mm7
\n\t
"
"pslld $12, %%mm4
\n\t
"
"pslld $12, %%mm5
\n\t
"
"pslld $12, %%mm6
\n\t
"
"pslld $12, %%mm7
\n\t
"
"movq %%mm4, "
DITHER32
"+0(%1)
\n\t
"
"movq %%mm5, "
DITHER32
"+8(%1)
\n\t
"
"movq %%mm6, "
DITHER32
"+16(%1)
\n\t
"
"movq %%mm7, "
DITHER32
"+24(%1)
\n\t
"
::
"r"
(
srcDither
),
"r"
(
&
c
->
redDither
)
);
}
}
#endif
static
void
RENAME
(
yuv2yuvX_ar
)(
SwsContext
*
c
,
const
int16_t
*
lumFilter
,
const
int16_t
**
lumSrc
,
int
lumFilterSize
,
const
int16_t
*
chrFilter
,
const
int16_t
**
chrUSrc
,
const
int16_t
**
chrVSrc
,
int
chrFilterSize
,
const
int16_t
**
alpSrc
,
uint8_t
*
dest
[
4
],
int
dstW
,
int
chrDstW
)
{
uint8_t
*
yDest
=
dest
[
0
],
*
uDest
=
dest
[
1
],
*
vDest
=
dest
[
2
],
*
aDest
=
CONFIG_SWSCALE_ALPHA
?
dest
[
3
]
:
NULL
;
const
uint8_t
*
lumDither
=
c
->
lumDither8
,
*
chrDither
=
c
->
chrDither8
;
if
(
uDest
)
{
x86_reg
uv_off
=
c
->
uv_off_byte
>>
1
;
dither_8to32
(
c
,
chrDither
,
0
);
YSCALEYUV2YV12X_ACCURATE
(
CHR_MMX_FILTER_OFFSET
,
uDest
,
chrDstW
,
0
)
dither_8to32
(
c
,
chrDither
,
1
);
YSCALEYUV2YV12X_ACCURATE
(
CHR_MMX_FILTER_OFFSET
,
vDest
-
uv_off
,
chrDstW
+
uv_off
,
uv_off
)
}
dither_8to32
(
c
,
lumDither
,
0
);
if
(
CONFIG_SWSCALE_ALPHA
&&
aDest
)
{
YSCALEYUV2YV12X_ACCURATE
(
ALP_MMX_FILTER_OFFSET
,
aDest
,
dstW
,
0
)
}
YSCALEYUV2YV12X_ACCURATE
(
LUM_MMX_FILTER_OFFSET
,
yDest
,
dstW
,
0
)
}
static
void
RENAME
(
yuv2yuv1
)(
SwsContext
*
c
,
const
int16_t
*
lumSrc
,
static
void
RENAME
(
yuv2yuv1
)(
SwsContext
*
c
,
const
int16_t
*
lumSrc
,
const
int16_t
*
chrUSrc
,
const
int16_t
*
chrVSrc
,
const
int16_t
*
chrUSrc
,
const
int16_t
*
chrVSrc
,
const
int16_t
*
alpSrc
,
const
int16_t
*
alpSrc
,
...
@@ -2104,7 +1900,6 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
...
@@ -2104,7 +1900,6 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
if
(
!
(
c
->
flags
&
SWS_BITEXACT
))
{
if
(
!
(
c
->
flags
&
SWS_BITEXACT
))
{
if
(
c
->
flags
&
SWS_ACCURATE_RND
)
{
if
(
c
->
flags
&
SWS_ACCURATE_RND
)
{
//c->yuv2yuv1 = RENAME(yuv2yuv1_ar );
//c->yuv2yuv1 = RENAME(yuv2yuv1_ar );
//c->yuv2yuvX = RENAME(yuv2yuvX_ar );
if
(
!
(
c
->
flags
&
SWS_FULL_CHR_H_INT
))
{
if
(
!
(
c
->
flags
&
SWS_FULL_CHR_H_INT
))
{
switch
(
c
->
dstFormat
)
{
switch
(
c
->
dstFormat
)
{
case
PIX_FMT_RGB32
:
c
->
yuv2packedX
=
RENAME
(
yuv2rgb32_X_ar
);
break
;
case
PIX_FMT_RGB32
:
c
->
yuv2packedX
=
RENAME
(
yuv2rgb32_X_ar
);
break
;
...
@@ -2117,7 +1912,6 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
...
@@ -2117,7 +1912,6 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
}
}
}
else
{
}
else
{
//c->yuv2yuv1 = RENAME(yuv2yuv1 );
//c->yuv2yuv1 = RENAME(yuv2yuv1 );
//c->yuv2yuvX = RENAME(yuv2yuvX );
if
(
!
(
c
->
flags
&
SWS_FULL_CHR_H_INT
))
{
if
(
!
(
c
->
flags
&
SWS_FULL_CHR_H_INT
))
{
switch
(
c
->
dstFormat
)
{
switch
(
c
->
dstFormat
)
{
case
PIX_FMT_RGB32
:
c
->
yuv2packedX
=
RENAME
(
yuv2rgb32_X
);
break
;
case
PIX_FMT_RGB32
:
c
->
yuv2packedX
=
RENAME
(
yuv2rgb32_X
);
break
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment