Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
F
ffmpeg.wasm-core
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Linshizhi
ffmpeg.wasm-core
Commits
f896bca0
Commit
f896bca0
authored
Jan 10, 2014
by
Janne Grunau
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
aarch64: h264 (bi)weight NEON optimizations
Ported from ARMv7 NEON.
parent
36e3b1f2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
264 additions
and
0 deletions
+264
-0
h264dsp_init_aarch64.c
libavcodec/aarch64/h264dsp_init_aarch64.c
+25
-0
h264dsp_neon.S
libavcodec/aarch64/h264dsp_neon.S
+239
-0
No files found.
libavcodec/aarch64/h264dsp_init_aarch64.c
View file @
f896bca0
...
...
@@ -34,6 +34,23 @@ void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
/*
 * Prototypes for routines implemented in assembly.  Per the diff hunk header,
 * ff_h264_h_loop_filter_chroma_neon is defined in h264dsp_neon.S; its body is
 * not visible here.
 */
void
ff_h264_h_loop_filter_chroma_neon
(
uint8_t
*
pix
,
int
stride
,
int
alpha
,
int
beta
,
int8_t
*
tc0
);
/*
 * Weighted prediction, one function per block width (16, 8, 4).  They are
 * generated by the weight_func macro in h264dsp_neon.S.  The assembly reads
 * the arguments positionally: x0 = dst, w1 = stride, w2 = height,
 * w3 = log2_den, w4 = weight, w5 = offset.  dst is both read and written.
 */
void
ff_weight_h264_pixels_16_neon
(
uint8_t
*
dst
,
int
stride
,
int
height
,
int
log2_den
,
int
weight
,
int
offset
);
void
ff_weight_h264_pixels_8_neon
(
uint8_t
*
dst
,
int
stride
,
int
height
,
int
log2_den
,
int
weight
,
int
offset
);
void
ff_weight_h264_pixels_4_neon
(
uint8_t
*
dst
,
int
stride
,
int
height
,
int
log2_den
,
int
weight
,
int
offset
);
/*
 * Bidirectional weighted prediction, one function per block width (16, 8, 4).
 * They are generated by the biweight_func macro in h264dsp_neon.S.  The
 * assembly reads the arguments positionally: x0 = dst, x1 = src, w2 = stride,
 * w3 = height, w4 = log2_den, w5 = weightd, w6 = weights, w7 = offset.
 * weightd is applied to the rows read from dst and weights to the rows read
 * from src; the result is written back to dst.
 */
void
ff_biweight_h264_pixels_16_neon
(
uint8_t
*
dst
,
uint8_t
*
src
,
int
stride
,
int
height
,
int
log2_den
,
int
weightd
,
int
weights
,
int
offset
);
void
ff_biweight_h264_pixels_8_neon
(
uint8_t
*
dst
,
uint8_t
*
src
,
int
stride
,
int
height
,
int
log2_den
,
int
weightd
,
int
weights
,
int
offset
);
void
ff_biweight_h264_pixels_4_neon
(
uint8_t
*
dst
,
uint8_t
*
src
,
int
stride
,
int
height
,
int
log2_den
,
int
weightd
,
int
weights
,
int
offset
);
/*
 * IDCT helpers.  Their implementations are outside this view; only the
 * signatures are known from here.
 */
void
ff_h264_idct_add_neon
(
uint8_t
*
dst
,
int16_t
*
block
,
int
stride
);
void
ff_h264_idct_dc_add_neon
(
uint8_t
*
dst
,
int16_t
*
block
,
int
stride
);
void
ff_h264_idct_add16_neon
(
uint8_t
*
dst
,
const
int
*
block_offset
,
...
...
@@ -63,6 +80,14 @@ av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
c
->
h264_v_loop_filter_chroma
=
ff_h264_v_loop_filter_chroma_neon
;
c
->
h264_h_loop_filter_chroma
=
ff_h264_h_loop_filter_chroma_neon
;
c
->
weight_h264_pixels_tab
[
0
]
=
ff_weight_h264_pixels_16_neon
;
c
->
weight_h264_pixels_tab
[
1
]
=
ff_weight_h264_pixels_8_neon
;
c
->
weight_h264_pixels_tab
[
2
]
=
ff_weight_h264_pixels_4_neon
;
c
->
biweight_h264_pixels_tab
[
0
]
=
ff_biweight_h264_pixels_16_neon
;
c
->
biweight_h264_pixels_tab
[
1
]
=
ff_biweight_h264_pixels_8_neon
;
c
->
biweight_h264_pixels_tab
[
2
]
=
ff_biweight_h264_pixels_4_neon
;
c
->
h264_idct_add
=
ff_h264_idct_add_neon
;
c
->
h264_idct_dc_add
=
ff_h264_idct_dc_add_neon
;
c
->
h264_idct_add16
=
ff_h264_idct_add16_neon
;
...
...
libavcodec/aarch64/h264dsp_neon.S
View file @
f896bca0
...
...
@@ -257,3 +257,242 @@ function ff_h264_h_loop_filter_chroma_neon, export=1
ret
endfunc
// biweight_16 macs, macd
//
// Loop body for bidirectional weighting of 16-byte-wide rows.  Emits a loop
// that handles two rows per iteration and then returns from the enclosing
// function with ret.
//
//   macs - widening multiply-accumulate mnemonic (umlal or umlsl) applied to
//          the rows loaded through x1, using the weight bytes in v1 (from w6)
//   macd - same, for the rows loaded through x0, using v0 (from w5)
//
// Register state expected on entry (prepared by biweight_func):
//   x0  = read pointer, first input          x1 = read pointer, second input
//   x2  = stride (already sign-extended)     w3 = number of rows
//   w5, w6 = weights; dup to byte lanes keeps only the low 8 bits of each
//   x7  = write pointer
//   v16.8H = start value for every accumulator
//   v18.8H = per-lane shift count used by sshl
//
// The loop leaves only when w3 hits exactly zero after subtracting 2, so the
// row count has to be a positive multiple of 2.
.macro biweight_16 macs, macd
dup v0.16B, w5
dup v1.16B, w6
// Accumulators for row 0 (low half in v4, high half in v6).
mov v4.16B, v16.16B
mov v6.16B, v16.16B
// Flags set here are consumed by b.ne at the bottom; nothing in between
// writes the condition flags.
1: subs w3, w3, #2
// Row 0: accumulate both inputs on top of the start value.
ld1 {v20.16B}, [x0], x2
\macd v4.8H, v0.8B, v20.8B
\macd\()2 v6.8H, v0.16B, v20.16B
ld1 {v22.16B}, [x1], x2
\macs v4.8H, v1.8B, v22.8B
\macs\()2 v6.8H, v1.16B, v22.16B
// Row 1: accumulators v24/v26 are re-initialised inside the loop.
mov v24.16B, v16.16B
ld1 {v28.16B}, [x0], x2
mov v26.16B, v16.16B
\macd v24.8H, v0.8B, v28.8B
\macd\()2 v26.8H, v0.16B, v28.16B
ld1 {v30.16B}, [x1], x2
\macs v24.8H, v1.8B, v30.8B
\macs\()2 v26.8H, v1.16B, v30.16B
// Shift by the (negative, i.e. right-shifting) counts in v18, then narrow
// with unsigned saturation back to bytes.
sshl v4.8H, v4.8H, v18.8H
sshl v6.8H, v6.8H, v18.8H
sqxtun v4.8B, v4.8H
sqxtun2 v4.16B, v6.8H
sshl v24.8H, v24.8H, v18.8H
sshl v26.8H, v26.8H, v18.8H
sqxtun v24.8B, v24.8H
sqxtun2 v24.16B, v26.8H
// Store both rows and reset the row-0 accumulators for the next iteration.
mov v6.16B, v16.16B
st1 {v4.16B}, [x7], x2
mov v4.16B, v16.16B
st1 {v24.16B}, [x7], x2
b.ne 1b
ret
.endm
// biweight_8 macs, macd
//
// Loop body for bidirectional weighting of 8-byte-wide rows.  Two rows are
// processed per iteration; the macro ends with ret.
//
//   macs - multiply-accumulate mnemonic (umlal or umlsl) for the rows loaded
//          through x1, weighted by v1 (from w6)
//   macd - same, for the rows loaded through x0, weighted by v0 (from w5)
//
// Register state expected on entry (prepared by biweight_func):
//   x0/x1 = read pointers, x2 = stride, w3 = number of rows, x7 = write
//   pointer, v16.8H = accumulator start value, v18.8H = shift count for sshl.
//
// The loop leaves only when w3 hits exactly zero after subtracting 2.
.macro biweight_8 macs, macd
dup v0.8B, w5
dup v1.8B, w6
// v2 accumulates row 0, v20 accumulates row 1.
mov v2.16B, v16.16B
mov v20.16B, v16.16B
// Flags from subs stay live until b.ne at the bottom of the loop.
1: subs w3, w3, #2
ld1 {v4.8B}, [x0], x2
\macd v2.8H, v0.8B, v4.8B
ld1 {v5.8B}, [x1], x2
\macs v2.8H, v1.8B, v5.8B
ld1 {v6.8B}, [x0], x2
\macd v20.8H, v0.8B, v6.8B
ld1 {v7.8B}, [x1], x2
\macs v20.8H, v1.8B, v7.8B
// Shift by the counts in v18 and narrow to bytes with unsigned saturation.
sshl v2.8H, v2.8H, v18.8H
sqxtun v2.8B, v2.8H
sshl v20.8H, v20.8H, v18.8H
sqxtun v4.8B, v20.8H
// Stores are interleaved with resetting the accumulators for the next pass.
mov v20.16B, v16.16B
st1 {v2.8B}, [x7], x2
mov v2.16B, v16.16B
st1 {v4.8B}, [x7], x2
b.ne 1b
ret
.endm
// biweight_4 macs, macd
//
// Loop body for bidirectional weighting of 4-byte-wide rows.  Two rows are
// packed into the two 32-bit lanes of one 64-bit vector, so a full iteration
// covers four rows.  If fewer than four rows remain, the first row pair is
// finished at label 2 and the macro returns.  Every path ends with ret.
//
//   macs - multiply-accumulate mnemonic (umlal or umlsl) for the rows loaded
//          through x1, weighted by v1 (from w6)
//   macd - same, for the rows loaded through x0, weighted by v0 (from w5)
//
// Register state expected on entry (prepared by biweight_func):
//   x0/x1 = read pointers, x2 = stride, w3 = number of rows, x7 = write
//   pointer, v16.8H = accumulator start value, v18.8H = shift count for sshl.
.macro biweight_4 macs, macd
dup v0.8B, w5
dup v1.8B, w6
mov v2.16B, v16.16B
mov v20.16B,v16.16B
// Flags from this subs feed both b.lt (tail check) and b.ne (loop back);
// none of the loads or vector instructions in between change them.
1: subs w3, w3, #4
// Rows 0 and 1 -> lanes 0 and 1 of v4 / v5, accumulated into v2.
ld1 {v4.S}[0], [x0], x2
ld1 {v4.S}[1], [x0], x2
\macd v2.8H, v0.8B, v4.8B
ld1 {v5.S}[0], [x1], x2
ld1 {v5.S}[1], [x1], x2
\macs v2.8H, v1.8B, v5.8B
// Fewer than four rows were left: finish the pair already accumulated.
b.lt 2f
// Rows 2 and 3 -> lanes 0 and 1 of v6 / v7, accumulated into v20.
ld1 {v6.S}[0], [x0], x2
ld1 {v6.S}[1], [x0], x2
\macd v20.8H, v0.8B, v6.8B
ld1 {v7.S}[0], [x1], x2
ld1 {v7.S}[1], [x1], x2
\macs v20.8H, v1.8B, v7.8B
// Shift by the counts in v18 and narrow to bytes with unsigned saturation.
sshl v2.8H, v2.8H, v18.8H
sqxtun v2.8B, v2.8H
sshl v20.8H, v20.8H, v18.8H
sqxtun v4.8B, v20.8H
// Store four rows, resetting the accumulators along the way.
mov v20.16B, v16.16B
st1 {v2.S}[0], [x7], x2
st1 {v2.S}[1], [x7], x2
mov v2.16B, v16.16B
st1 {v4.S}[0], [x7], x2
st1 {v4.S}[1], [x7], x2
b.ne 1b
ret
// Tail: only the first row pair of this iteration is valid.
2: sshl v2.8H, v2.8H, v18.8H
sqxtun v2.8B, v2.8H
st1 {v2.S}[0], [x7], x2
st1 {v2.S}[1], [x7], x2
ret
.endm
// biweight_func w
//
// Defines the exported function ff_biweight_h264_pixels_<w>_neon, where <w>
// is the block width (instantiated below for 16, 8 and 4).  function and
// endfunc are helper macros defined outside this view.
//
// C prototype (h264dsp_init_aarch64.c):
//   void ff_biweight_h264_pixels_<w>_neon(uint8_t *dst, uint8_t *src,
//                                         int stride, int height,
//                                         int log2_den, int weightd,
//                                         int weights, int offset);
// Argument registers: x0 = dst, x1 = src, w2 = stride, w3 = height,
//                     w4 = log2_den, w5 = weightd, w6 = weights, w7 = offset
//
// The prologue computes the values shared by all biweight_<w> bodies:
//   v16.8H = ((offset + 1) | 1) << log2_den   accumulator start value
//   v18.8H = ~log2_den = -(log2_den + 1)      sshl count, i.e. an arithmetic
//                                             right shift by log2_den + 1
//   x7     = dst                              write pointer (x0 is advanced
//                                             as the read pointer)
//
// The multiplies are unsigned, so the sign of each weight is handled by
// dispatching to one of four expansions.  w8 is built as
//   (weightd >> 31) ^ (weights >> 30)         (logical shifts)
// which, for weights whose top two bits are equal, gives
//   0: both weights non-negative              -> label 10
//   1: only weightd negative                  -> label 20
//   2: both negative                          -> label 30
//   3: only weights negative                  -> label 40
// A negative weight is negated and its multiply-accumulate is replaced by
// multiply-subtract (umlsl).  Every biweight_<w> expansion ends in ret, so
// execution never falls through from one label into the next.
.macro biweight_func w
function ff_biweight_h264_pixels_\w\()_neon, export=1
// stride arrives as a 32-bit int; widen it for use as a post-index register.
sxtw x2, w2
lsr w8, w5, #31
add w7, w7, #1
eor w8, w8, w6, lsr #30
orr w7, w7, #1
dup v18.8H, w4
lsl w7, w7, w4
not v18.16B, v18.16B
dup v16.8H, w7
// From here on w7/x7 no longer holds the offset; it becomes the store pointer.
mov x7, x0
// Four-way dispatch on the sign selector in w8.
cbz w8, 10f
subs w8, w8, #1
b.eq 20f
subs w8, w8, #1
b.eq 30f
b 40f
// Macro arguments are (macs, macd): first the op for the x1 rows, then the
// op for the x0 rows.
10: biweight_\w umlal, umlal
20: neg w5, w5
biweight_\w umlal, umlsl
30: neg w5, w5
neg w6, w6
biweight_\w umlsl, umlsl
40: neg w6, w6
biweight_\w umlsl, umlal
endfunc
.endm
// Instantiate the exported functions for block widths 16, 8 and 4.
biweight_func 16
biweight_func 8
biweight_func 4
// weight_16 add
//
// Loop body for single-source weighting of 16-byte-wide rows.  Two rows are
// processed per iteration; the macro ends with ret.
//
//   add - mnemonic used as "op result, v16, product" to combine the offset
//         term with the pixel * weight product.  weight_func passes shadd,
//         shsub, add or sub.
//
// Register state expected on entry (prepared by weight_func):
//   x0 = read pointer, x1 = stride, w2 = number of rows,
//   w4 = weight (dup to byte lanes keeps only the low 8 bits),
//   x5 = write pointer (weight_func sets it to the same address as x0),
//   v16.8H = offset term, v18.8H = shift count for srshl.
//
// The loop leaves only when w2 hits exactly zero after subtracting 2.
.macro weight_16 add
dup v0.16B, w4
// Flags from subs stay live until b.ne at the bottom of the loop.
1: subs w2, w2, #2
// Widening multiplies: row 0 -> v4 (low) / v6 (high), row 1 -> v24 / v26.
ld1 {v20.16B}, [x0], x1
umull v4.8H, v0.8B, v20.8B
umull2 v6.8H, v0.16B, v20.16B
ld1 {v28.16B}, [x0], x1
umull v24.8H, v0.8B, v28.8B
umull2 v26.8H, v0.16B, v28.16B
// Combine with the offset term, apply the rounding shift from v18, then
// narrow to bytes with unsigned saturation.
\add v4.8H, v16.8H, v4.8H
srshl v4.8H, v4.8H, v18.8H
\add v6.8H, v16.8H, v6.8H
srshl v6.8H, v6.8H, v18.8H
sqxtun v4.8B, v4.8H
sqxtun2 v4.16B, v6.8H
\add v24.8H, v16.8H, v24.8H
srshl v24.8H, v24.8H, v18.8H
\add v26.8H, v16.8H, v26.8H
srshl v26.8H, v26.8H, v18.8H
sqxtun v24.8B, v24.8H
sqxtun2 v24.16B, v26.8H
st1 {v4.16B}, [x5], x1
st1 {v24.16B}, [x5], x1
b.ne 1b
ret
.endm
// weight_8 add
//
// Loop body for single-source weighting of 8-byte-wide rows.  Two rows are
// processed per iteration; the macro ends with ret.
//
//   add - mnemonic used as "op result, v16, product" to combine the offset
//         term with the pixel * weight product (shadd, shsub, add or sub).
//
// Register state expected on entry (prepared by weight_func):
//   x0 = read pointer, x1 = stride, w2 = number of rows, w4 = weight,
//   x5 = write pointer, v16.8H = offset term, v18.8H = shift count for srshl.
//
// The loop leaves only when w2 hits exactly zero after subtracting 2.
.macro weight_8 add
dup v0.8B, w4
// Flags from subs stay live until b.ne at the bottom of the loop.
1: subs w2, w2, #2
// Row 0 product in v2, row 1 product in v20.
ld1 {v4.8B}, [x0], x1
umull v2.8H, v0.8B, v4.8B
ld1 {v6.8B}, [x0], x1
umull v20.8H, v0.8B, v6.8B
// Offset, rounding shift, saturating narrow to bytes.
\add v2.8H, v16.8H, v2.8H
srshl v2.8H, v2.8H, v18.8H
sqxtun v2.8B, v2.8H
\add v20.8H, v16.8H, v20.8H
srshl v20.8H, v20.8H, v18.8H
sqxtun v4.8B, v20.8H
st1 {v2.8B}, [x5], x1
st1 {v4.8B}, [x5], x1
b.ne 1b
ret
.endm
// weight_4 add
//
// Loop body for single-source weighting of 4-byte-wide rows.  Two rows are
// packed into the two 32-bit lanes of one 64-bit vector, so a full iteration
// covers four rows.  If fewer than four rows remain, the first row pair is
// finished at label 2 and the macro returns.  Every path ends with ret.
//
//   add - mnemonic used as "op result, v16, product" to combine the offset
//         term with the pixel * weight product (shadd, shsub, add or sub).
//
// Register state expected on entry (prepared by weight_func):
//   x0 = read pointer, x1 = stride, w2 = number of rows, w4 = weight,
//   x5 = write pointer, v16.8H = offset term, v18.8H = shift count for srshl.
.macro weight_4 add
dup v0.8B, w4
// Flags from this subs feed both b.lt (tail check) and b.ne (loop back);
// none of the loads or vector instructions in between change them.
1: subs w2, w2, #4
// Rows 0 and 1 -> lanes 0 and 1 of v4, product in v2.
ld1 {v4.S}[0], [x0], x1
ld1 {v4.S}[1], [x0], x1
umull v2.8H, v0.8B, v4.8B
// Fewer than four rows were left: finish the pair already multiplied.
b.lt 2f
// Rows 2 and 3 -> lanes 0 and 1 of v6, product in v20.
ld1 {v6.S}[0], [x0], x1
ld1 {v6.S}[1], [x0], x1
umull v20.8H, v0.8B, v6.8B
// Offset, rounding shift, saturating narrow to bytes.
\add v2.8H, v16.8H, v2.8H
srshl v2.8H, v2.8H, v18.8H
sqxtun v2.8B, v2.8H
\add v20.8H, v16.8H, v20.8H
srshl v20.8H, v20.8H, v18.8H
sqxtun v4.8B, v20.8H
st1 {v2.S}[0], [x5], x1
st1 {v2.S}[1], [x5], x1
st1 {v4.S}[0], [x5], x1
st1 {v4.S}[1], [x5], x1
b.ne 1b
ret
// Tail: only the first row pair of this iteration is valid.
2: \add v2.8H, v16.8H, v2.8H
srshl v2.8H, v2.8H, v18.8H
sqxtun v2.8B, v2.8H
st1 {v2.S}[0], [x5], x1
st1 {v2.S}[1], [x5], x1
ret
.endm
// weight_func w
//
// Defines the exported function ff_weight_h264_pixels_<w>_neon, where <w> is
// the block width (instantiated below for 16, 8 and 4).  function and endfunc
// are helper macros defined outside this view.
//
// C prototype (h264dsp_init_aarch64.c):
//   void ff_weight_h264_pixels_<w>_neon(uint8_t *dst, int stride, int height,
//                                       int log2_den, int weight, int offset);
// Argument registers: x0 = dst, w1 = stride, w2 = height, w3 = log2_den,
//                     w4 = weight, w5 = offset
//
// Values prepared for the weight_<w> bodies:
//   v16.8H = offset << log2_den
//   x5     = dst (write pointer; x0 is advanced as the read pointer)
//   v18.8H = srshl count, chosen per path:
//     log2_den >  1: 1 - log2_den, paired with the halving ops shadd/shsub.
//                    The halving op supplies one bit of the right shift and
//                    srshl rounds and shifts by the remaining log2_den - 1.
//     log2_den <= 1: -log2_den, paired with plain add/sub.
//
// The multiply is unsigned, so a negative weight is negated and the
// subtracting variant (shsub or sub) is used, giving offset - product.
// Every weight_<w> expansion ends in ret, so execution never falls through
// into the following label.  The label 10 appears twice; each "10f" refers
// to the nearest following definition.
.macro weight_func w
function ff_weight_h264_pixels_\w\()_neon, export=1
// stride arrives as a 32-bit int; widen it for use as a post-index register.
sxtw x1, w1
// These flags are consumed by b.le below; the lsl, mov and dup in between
// do not write the condition flags.
cmp w3, #1
mov w6, #1
lsl w5, w5, w3
dup v16.8H, w5
// From here on w5/x5 no longer holds the offset; it becomes the store pointer.
mov x5, x0
b.le 20f
// log2_den > 1: halving add/sub plus rounding shift by log2_den - 1.
sub w6, w6, w3
dup v18.8H, w6
cmp w4, #0
b.lt 10f
weight_\w shadd
10: neg w4, w4
weight_\w shsub
// log2_den <= 1: plain add/sub plus rounding shift by log2_den.
20: neg w6, w3
dup v18.8H, w6
cmp w4, #0
b.lt 10f
weight_\w add
10: neg w4, w4
weight_\w sub
endfunc
.endm
// Instantiate the exported functions for block widths 16, 8 and 4.
weight_func 16
weight_func 8
weight_func 4
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment