Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
F
ffmpeg.wasm-core
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Linshizhi
ffmpeg.wasm-core
Commits
2cba1825
Commit
2cba1825
authored
Jan 15, 2016
by
James Darnley
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
avcodec/v210: add avx2 version of the 10-bit line encoder
Around 25% faster than the ssse3 version.
parent
3836f404
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
31 additions
and
9 deletions
+31
-9
v210enc.c
libavcodec/v210enc.c
+9
-2
constants.c
libavcodec/x86/constants.c
+2
-1
constants.h
libavcodec/x86/constants.h
+1
-1
v210enc.asm
libavcodec/x86/v210enc.asm
+15
-5
v210enc_init.c
libavcodec/x86/v210enc_init.c
+4
-0
No files found.
libavcodec/v210enc.c
View file @
2cba1825
...
@@ -135,13 +135,20 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
...
@@ -135,13 +135,20 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
const
uint16_t
*
v
=
(
const
uint16_t
*
)
pic
->
data
[
2
];
const
uint16_t
*
v
=
(
const
uint16_t
*
)
pic
->
data
[
2
];
for
(
h
=
0
;
h
<
avctx
->
height
;
h
++
)
{
for
(
h
=
0
;
h
<
avctx
->
height
;
h
++
)
{
uint32_t
val
;
uint32_t
val
;
w
=
(
avctx
->
width
/
6
)
*
6
;
w
=
(
avctx
->
width
/
(
6
*
s
->
sample_factor
))
*
6
*
s
->
sample_factor
;
s
->
pack_line_10
(
y
,
u
,
v
,
dst
,
w
);
s
->
pack_line_10
(
y
,
u
,
v
,
dst
,
w
);
y
+=
w
;
y
+=
w
;
u
+=
w
>>
1
;
u
+=
w
>>
1
;
v
+=
w
>>
1
;
v
+=
w
>>
1
;
dst
+=
(
w
/
6
)
*
16
;
dst
+=
(
w
/
(
6
*
s
->
sample_factor
))
*
16
*
s
->
sample_factor
;
for
(;
w
<
avctx
->
width
-
5
;
w
+=
6
)
{
WRITE_PIXELS
(
u
,
y
,
v
);
WRITE_PIXELS
(
y
,
u
,
y
);
WRITE_PIXELS
(
v
,
y
,
u
);
WRITE_PIXELS
(
y
,
v
,
y
);
}
if
(
w
<
avctx
->
width
-
1
)
{
if
(
w
<
avctx
->
width
-
1
)
{
WRITE_PIXELS
(
u
,
y
,
v
);
WRITE_PIXELS
(
u
,
y
,
v
);
...
...
libavcodec/x86/constants.c
View file @
2cba1825
...
@@ -27,7 +27,8 @@ DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x000
...
@@ -27,7 +27,8 @@ DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x000
DECLARE_ALIGNED
(
32
,
const
ymm_reg
,
ff_pw_2
)
=
{
0x0002000200020002ULL
,
0x0002000200020002ULL
,
DECLARE_ALIGNED
(
32
,
const
ymm_reg
,
ff_pw_2
)
=
{
0x0002000200020002ULL
,
0x0002000200020002ULL
,
0x0002000200020002ULL
,
0x0002000200020002ULL
};
0x0002000200020002ULL
,
0x0002000200020002ULL
};
DECLARE_ALIGNED
(
16
,
const
xmm_reg
,
ff_pw_3
)
=
{
0x0003000300030003ULL
,
0x0003000300030003ULL
};
DECLARE_ALIGNED
(
16
,
const
xmm_reg
,
ff_pw_3
)
=
{
0x0003000300030003ULL
,
0x0003000300030003ULL
};
DECLARE_ALIGNED
(
16
,
const
xmm_reg
,
ff_pw_4
)
=
{
0x0004000400040004ULL
,
0x0004000400040004ULL
};
DECLARE_ALIGNED
(
32
,
const
ymm_reg
,
ff_pw_4
)
=
{
0x0004000400040004ULL
,
0x0004000400040004ULL
,
0x0004000400040004ULL
,
0x0004000400040004ULL
};
DECLARE_ALIGNED
(
16
,
const
xmm_reg
,
ff_pw_5
)
=
{
0x0005000500050005ULL
,
0x0005000500050005ULL
};
DECLARE_ALIGNED
(
16
,
const
xmm_reg
,
ff_pw_5
)
=
{
0x0005000500050005ULL
,
0x0005000500050005ULL
};
DECLARE_ALIGNED
(
16
,
const
xmm_reg
,
ff_pw_8
)
=
{
0x0008000800080008ULL
,
0x0008000800080008ULL
};
DECLARE_ALIGNED
(
16
,
const
xmm_reg
,
ff_pw_8
)
=
{
0x0008000800080008ULL
,
0x0008000800080008ULL
};
DECLARE_ALIGNED
(
16
,
const
xmm_reg
,
ff_pw_9
)
=
{
0x0009000900090009ULL
,
0x0009000900090009ULL
};
DECLARE_ALIGNED
(
16
,
const
xmm_reg
,
ff_pw_9
)
=
{
0x0009000900090009ULL
,
0x0009000900090009ULL
};
...
...
libavcodec/x86/constants.h
View file @
2cba1825
...
@@ -28,7 +28,7 @@
...
@@ -28,7 +28,7 @@
extern
const
ymm_reg
ff_pw_1
;
extern
const
ymm_reg
ff_pw_1
;
extern
const
ymm_reg
ff_pw_2
;
extern
const
ymm_reg
ff_pw_2
;
extern
const
xmm_reg
ff_pw_3
;
extern
const
xmm_reg
ff_pw_3
;
extern
const
xmm_reg
ff_pw_4
;
extern
const
ymm_reg
ff_pw_4
;
extern
const
xmm_reg
ff_pw_5
;
extern
const
xmm_reg
ff_pw_5
;
extern
const
xmm_reg
ff_pw_8
;
extern
const
xmm_reg
ff_pw_8
;
extern
const
xmm_reg
ff_pw_9
;
extern
const
xmm_reg
ff_pw_9
;
...
...
libavcodec/x86/v210enc.asm
View file @
2cba1825
...
@@ -51,7 +51,7 @@ SECTION .text
...
@@ -51,7 +51,7 @@ SECTION .text
%macro
v210_planar_pack_10
0
%macro
v210_planar_pack_10
0
; v210_planar_pack_10(const uint16_t *y, const uint16_t *u, const uint16_t *v, uint8_t *dst, ptrdiff_t width)
; v210_planar_pack_10(const uint16_t *y, const uint16_t *u, const uint16_t *v, uint8_t *dst, ptrdiff_t width)
cglobal
v210_planar_pack_10
,
5
,
5
,
4
,
y
,
u
,
v
,
dst
,
width
cglobal
v210_planar_pack_10
,
5
,
5
,
4
+
cpuflag
(
avx2
)
,
y
,
u
,
v
,
dst
,
width
lea
r0
,
[
yq
+
2
*
widthq
]
lea
r0
,
[
yq
+
2
*
widthq
]
add
uq
,
widthq
add
uq
,
widthq
add
vq
,
widthq
add
vq
,
widthq
...
@@ -61,11 +61,19 @@ cglobal v210_planar_pack_10, 5, 5, 4, y, u, v, dst, width
...
@@ -61,11 +61,19 @@ cglobal v210_planar_pack_10, 5, 5, 4, y, u, v, dst, width
mova
m3
,
[
v210_enc_max_10
]
mova
m3
,
[
v210_enc_max_10
]
.
loop
:
.
loop
:
movu
m0
,
[
yq
+
2
*
widthq
]
movu
xm0
,
[
yq
+
2
*
widthq
]
%if
cpuflag
(
avx2
)
vinserti128
m0
,
m0
,
[
yq
+
widthq
*
2
+
12
]
,
1
%endif
CLIPW
m0
,
m2
,
m3
CLIPW
m0
,
m2
,
m3
movq
m1
,
[
uq
+
widthq
]
movq
xm1
,
[
uq
+
widthq
]
movhps
m1
,
[
vq
+
widthq
]
movhps
xm1
,
[
vq
+
widthq
]
%if
cpuflag
(
avx2
)
movq
xm4
,
[
uq
+
widthq
+
6
]
movhps
xm4
,
[
vq
+
widthq
+
6
]
vinserti128
m1
,
m1
,
xm4
,
1
%endif
CLIPW
m1
,
m2
,
m3
CLIPW
m1
,
m2
,
m3
pmullw
m0
,
[
v210_enc_luma_mult_10
]
pmullw
m0
,
[
v210_enc_luma_mult_10
]
...
@@ -79,7 +87,7 @@ cglobal v210_planar_pack_10, 5, 5, 4, y, u, v, dst, width
...
@@ -79,7 +87,7 @@ cglobal v210_planar_pack_10, 5, 5, 4, y, u, v, dst, width
movu
[dstq],
m0
movu
[dstq],
m0
add
dstq
,
mmsize
add
dstq
,
mmsize
add
widthq
,
6
add
widthq
,
(
mmsize
*
3
)
/
8
jl
.
loop
jl
.
loop
RET
RET
...
@@ -87,6 +95,8 @@ cglobal v210_planar_pack_10, 5, 5, 4, y, u, v, dst, width
...
@@ -87,6 +95,8 @@ cglobal v210_planar_pack_10, 5, 5, 4, y, u, v, dst, width
INIT_XMM
ssse3
INIT_XMM
ssse3
v210_planar_pack_10
v210_planar_pack_10
INIT_YMM
avx2
v210_planar_pack_10
%macro
v210_planar_pack_8
0
%macro
v210_planar_pack_8
0
...
...
libavcodec/x86/v210enc_init.c
View file @
2cba1825
...
@@ -29,6 +29,9 @@ void ff_v210_planar_pack_8_avx2(const uint8_t *y, const uint8_t *u,
...
@@ -29,6 +29,9 @@ void ff_v210_planar_pack_8_avx2(const uint8_t *y, const uint8_t *u,
void
ff_v210_planar_pack_10_ssse3
(
const
uint16_t
*
y
,
const
uint16_t
*
u
,
void
ff_v210_planar_pack_10_ssse3
(
const
uint16_t
*
y
,
const
uint16_t
*
u
,
const
uint16_t
*
v
,
uint8_t
*
dst
,
const
uint16_t
*
v
,
uint8_t
*
dst
,
ptrdiff_t
width
);
ptrdiff_t
width
);
void
ff_v210_planar_pack_10_avx2
(
const
uint16_t
*
y
,
const
uint16_t
*
u
,
const
uint16_t
*
v
,
uint8_t
*
dst
,
ptrdiff_t
width
);
av_cold
void
ff_v210enc_init_x86
(
V210EncContext
*
s
)
av_cold
void
ff_v210enc_init_x86
(
V210EncContext
*
s
)
{
{
...
@@ -44,6 +47,7 @@ av_cold void ff_v210enc_init_x86(V210EncContext *s)
...
@@ -44,6 +47,7 @@ av_cold void ff_v210enc_init_x86(V210EncContext *s)
if
(
EXTERNAL_AVX2
(
cpu_flags
))
{
if
(
EXTERNAL_AVX2
(
cpu_flags
))
{
s
->
pack_line_8
=
ff_v210_planar_pack_8_avx2
;
s
->
pack_line_8
=
ff_v210_planar_pack_8_avx2
;
s
->
pack_line_10
=
ff_v210_planar_pack_10_avx2
;
s
->
sample_factor
=
2
;
s
->
sample_factor
=
2
;
}
}
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment