Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
F
ffmpeg.wasm-core
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Linshizhi
ffmpeg.wasm-core
Commits
36e3b1f2
Commit
36e3b1f2
authored
Dec 20, 2013
by
Janne Grunau
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
aarch64: h264 loop filter NEON optimizations
Ported from ARMv7 NEON.
parent
c65d67ef
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
299 additions
and
1 deletion
+299
-1
Makefile
libavcodec/aarch64/Makefile
+2
-1
h264dsp_init_aarch64.c
libavcodec/aarch64/h264dsp_init_aarch64.c
+14
-0
h264dsp_neon.S
libavcodec/aarch64/h264dsp_neon.S
+259
-0
neon.S
libavcodec/aarch64/neon.S
+24
-0
No files found.
libavcodec/aarch64/Makefile
View file @
36e3b1f2
...
...
@@ -6,7 +6,8 @@ OBJS-$(CONFIG_RV40_DECODER) += aarch64/rv40dsp_init_aarch64.o
OBJS-$(CONFIG_VC1_DECODER)
+=
aarch64/vc1dsp_init_aarch64.o
NEON-OBJS-$(CONFIG_H264CHROMA)
+=
aarch64/h264cmc_neon.o
NEON-OBJS-$(CONFIG_H264DSP)
+=
aarch64/h264idct_neon.o
NEON-OBJS-$(CONFIG_H264DSP)
+=
aarch64/h264dsp_neon.o
\
aarch64/h264idct_neon.o
NEON-OBJS-$(CONFIG_H264QPEL)
+=
aarch64/h264qpel_neon.o
\
aarch64/hpeldsp_neon.o
NEON-OBJS-$(CONFIG_HPELDSP)
+=
aarch64/hpeldsp_neon.o
libavcodec/aarch64/h264dsp_init_aarch64.c
View file @
36e3b1f2
...
...
@@ -25,6 +25,15 @@
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/h264dsp.h"
void
ff_h264_v_loop_filter_luma_neon
(
uint8_t
*
pix
,
int
stride
,
int
alpha
,
int
beta
,
int8_t
*
tc0
);
void
ff_h264_h_loop_filter_luma_neon
(
uint8_t
*
pix
,
int
stride
,
int
alpha
,
int
beta
,
int8_t
*
tc0
);
void
ff_h264_v_loop_filter_chroma_neon
(
uint8_t
*
pix
,
int
stride
,
int
alpha
,
int
beta
,
int8_t
*
tc0
);
void
ff_h264_h_loop_filter_chroma_neon
(
uint8_t
*
pix
,
int
stride
,
int
alpha
,
int
beta
,
int8_t
*
tc0
);
void
ff_h264_idct_add_neon
(
uint8_t
*
dst
,
int16_t
*
block
,
int
stride
);
void
ff_h264_idct_dc_add_neon
(
uint8_t
*
dst
,
int16_t
*
block
,
int
stride
);
void
ff_h264_idct_add16_neon
(
uint8_t
*
dst
,
const
int
*
block_offset
,
...
...
@@ -49,6 +58,11 @@ av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
int
cpu_flags
=
av_get_cpu_flags
();
if
(
have_neon
(
cpu_flags
)
&&
bit_depth
==
8
)
{
c
->
h264_v_loop_filter_luma
=
ff_h264_v_loop_filter_luma_neon
;
c
->
h264_h_loop_filter_luma
=
ff_h264_h_loop_filter_luma_neon
;
c
->
h264_v_loop_filter_chroma
=
ff_h264_v_loop_filter_chroma_neon
;
c
->
h264_h_loop_filter_chroma
=
ff_h264_h_loop_filter_chroma_neon
;
c
->
h264_idct_add
=
ff_h264_idct_add_neon
;
c
->
h264_idct_dc_add
=
ff_h264_idct_dc_add_neon
;
c
->
h264_idct_add16
=
ff_h264_idct_add16_neon
;
...
...
libavcodec/aarch64/h264dsp_neon.S
0 → 100644
View file @
36e3b1f2
/*
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
* Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
*
* This file is part of Libav.
*
* Libav is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* Libav is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with Libav; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/aarch64/asm.S"
#include "neon.S"
.macro h264_loop_filter_start
cmp w2, #0
ldr w6, [x4]
ccmp w3, #0, #0, ne
mov v24.S[0], w6
and w6, w6, w6, lsl #16
b.eq 1f
ands w6, w6, w6, lsl #8
b.ge 2f
1:
ret
2:
.endm
.macro h264_loop_filter_luma
dup v22.16B, w2 // alpha
uxtl v24.8H, v24.8B
uabd v21.16B, v16.16B, v0.16B // abs(p0 - q0)
uxtl v24.4S, v24.4H
uabd v28.16B, v18.16B, v16.16B // abs(p1 - p0)
sli v24.8H, v24.8H, #8
uabd v30.16B, v2.16B, v0.16B // abs(q1 - q0)
sli v24.4S, v24.4S, #16
cmhi v21.16B, v22.16B, v21.16B // < alpha
dup v22.16B, w3 // beta
cmlt v23.16B, v24.16B, #0
cmhi v28.16B, v22.16B, v28.16B // < beta
cmhi v30.16B, v22.16B, v30.16B // < beta
bic v21.16B, v21.16B, v23.16B
uabd v17.16B, v20.16B, v16.16B // abs(p2 - p0)
and v21.16B, v21.16B, v28.16B
uabd v19.16B, v4.16B, v0.16B // abs(q2 - q0)
cmhi v17.16B, v22.16B, v17.16B // < beta
and v21.16B, v21.16B, v30.16B
cmhi v19.16B, v22.16B, v19.16B // < beta
and v17.16B, v17.16B, v21.16B
and v19.16B, v19.16B, v21.16B
and v24.16B, v24.16B, v21.16B
urhadd v28.16B, v16.16B, v0.16B
sub v21.16B, v24.16B, v17.16B
uqadd v23.16B, v18.16B, v24.16B
uhadd v20.16B, v20.16B, v28.16B
sub v21.16B, v21.16B, v19.16B
uhadd v28.16B, v4.16B, v28.16B
umin v23.16B, v23.16B, v20.16B
uqsub v22.16B, v18.16B, v24.16B
uqadd v4.16B, v2.16B, v24.16B
umax v23.16B, v23.16B, v22.16B
uqsub v22.16B, v2.16B, v24.16B
umin v28.16B, v4.16B, v28.16B
uxtl v4.8H, v0.8B
umax v28.16B, v28.16B, v22.16B
uxtl2 v20.8H, v0.16B
usubw v4.8H, v4.8H, v16.8B
usubw2 v20.8H, v20.8H, v16.16B
shl v4.8H, v4.8H, #2
shl v20.8H, v20.8H, #2
uaddw v4.8H, v4.8H, v18.8B
uaddw2 v20.8H, v20.8H, v18.16B
usubw v4.8H, v4.8H, v2.8B
usubw2 v20.8H, v20.8H, v2.16B
rshrn v4.8B, v4.8H, #3
rshrn2 v4.16B, v20.8H, #3
bsl v17.16B, v23.16B, v18.16B
bsl v19.16B, v28.16B, v2.16B
neg v23.16B, v21.16B
uxtl v28.8H, v16.8B
smin v4.16B, v4.16B, v21.16B
uxtl2 v21.8H, v16.16B
smax v4.16B, v4.16B, v23.16B
uxtl v22.8H, v0.8B
uxtl2 v24.8H, v0.16B
saddw v28.8H, v28.8H, v4.8B
saddw2 v21.8H, v21.8H, v4.16B
ssubw v22.8H, v22.8H, v4.8B
ssubw2 v24.8H, v24.8H, v4.16B
sqxtun v16.8B, v28.8H
sqxtun2 v16.16B, v21.8H
sqxtun v0.8B, v22.8H
sqxtun2 v0.16B, v24.8H
.endm
function ff_h264_v_loop_filter_luma_neon, export=1
h264_loop_filter_start
sxtw x1, w1
ld1 {v0.16B}, [x0], x1
ld1 {v2.16B}, [x0], x1
ld1 {v4.16B}, [x0], x1
sub x0, x0, x1, lsl #2
sub x0, x0, x1, lsl #1
ld1 {v20.16B}, [x0], x1
ld1 {v18.16B}, [x0], x1
ld1 {v16.16B}, [x0], x1
h264_loop_filter_luma
sub x0, x0, x1, lsl #1
st1 {v17.16B}, [x0], x1
st1 {v16.16B}, [x0], x1
st1 {v0.16B}, [x0], x1
st1 {v19.16B}, [x0]
ret
endfunc
function ff_h264_h_loop_filter_luma_neon, export=1
h264_loop_filter_start
sub x0, x0, #4
ld1 {v6.8B}, [x0], x1
ld1 {v20.8B}, [x0], x1
ld1 {v18.8B}, [x0], x1
ld1 {v16.8B}, [x0], x1
ld1 {v0.8B}, [x0], x1
ld1 {v2.8B}, [x0], x1
ld1 {v4.8B}, [x0], x1
ld1 {v26.8B}, [x0], x1
ld1 {v6.D}[1], [x0], x1
ld1 {v20.D}[1], [x0], x1
ld1 {v18.D}[1], [x0], x1
ld1 {v16.D}[1], [x0], x1
ld1 {v0.D}[1], [x0], x1
ld1 {v2.D}[1], [x0], x1
ld1 {v4.D}[1], [x0], x1
ld1 {v26.D}[1], [x0], x1
transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23
h264_loop_filter_luma
transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27
sub x0, x0, x1, lsl #4
add x0, x0, #2
st1 {v17.S}[0], [x0], x1
st1 {v16.S}[0], [x0], x1
st1 {v0.S}[0], [x0], x1
st1 {v19.S}[0], [x0], x1
st1 {v17.S}[1], [x0], x1
st1 {v16.S}[1], [x0], x1
st1 {v0.S}[1], [x0], x1
st1 {v19.S}[1], [x0], x1
st1 {v17.S}[2], [x0], x1
st1 {v16.S}[2], [x0], x1
st1 {v0.S}[2], [x0], x1
st1 {v19.S}[2], [x0], x1
st1 {v17.S}[3], [x0], x1
st1 {v16.S}[3], [x0], x1
st1 {v0.S}[3], [x0], x1
st1 {v19.S}[3], [x0], x1
ret
endfunc
.macro h264_loop_filter_chroma
dup v22.8B, w2 // alpha
uxtl v24.8H, v24.8B
uabd v26.8B, v16.8B, v0.8B // abs(p0 - q0)
uxtl v4.8H, v0.8B
uabd v28.8B, v18.8B, v16.8B // abs(p1 - p0)
usubw v4.8H, v4.8H, v16.8B
sli v24.8H, v24.8H, #8
shl v4.8H, v4.8H, #2
uabd v30.8B, v2.8B, v0.8B // abs(q1 - q0)
uaddw v4.8H, v4.8H, v18.8B
cmhi v26.8B, v22.8B, v26.8B // < alpha
usubw v4.8H, v4.8H, v2.8B
dup v22.8B, w3 // beta
rshrn v4.8B, v4.8H, #3
cmhi v28.8B, v22.8B, v28.8B // < beta
cmhi v30.8B, v22.8B, v30.8B // < beta
smin v4.8B, v4.8B, v24.8B
neg v25.8B, v24.8B
and v26.8B, v26.8B, v28.8B
smax v4.8B, v4.8B, v25.8B
and v26.8B, v26.8B, v30.8B
uxtl v22.8H, v0.8B
and v4.8B, v4.8B, v26.8B
uxtl v28.8H, v16.8B
saddw v28.8H, v28.8H, v4.8B
ssubw v22.8H, v22.8H, v4.8B
sqxtun v16.8B, v28.8H
sqxtun v0.8B, v22.8H
.endm
function ff_h264_v_loop_filter_chroma_neon, export=1
h264_loop_filter_start
sub x0, x0, x1, lsl #1
ld1 {v18.8B}, [x0], x1
ld1 {v16.8B}, [x0], x1
ld1 {v0.8B}, [x0], x1
ld1 {v2.8B}, [x0]
h264_loop_filter_chroma
sub x0, x0, x1, lsl #1
st1 {v16.8B}, [x0], x1
st1 {v0.8B}, [x0], x1
ret
endfunc
function ff_h264_h_loop_filter_chroma_neon, export=1
h264_loop_filter_start
sub x0, x0, #2
ld1 {v18.S}[0], [x0], x1
ld1 {v16.S}[0], [x0], x1
ld1 {v0.S}[0], [x0], x1
ld1 {v2.S}[0], [x0], x1
ld1 {v18.S}[1], [x0], x1
ld1 {v16.S}[1], [x0], x1
ld1 {v0.S}[1], [x0], x1
ld1 {v2.S}[1], [x0], x1
transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31
h264_loop_filter_chroma
transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31
sub x0, x0, x1, lsl #3
st1 {v18.S}[0], [x0], x1
st1 {v16.S}[0], [x0], x1
st1 {v0.S}[0], [x0], x1
st1 {v2.S}[0], [x0], x1
st1 {v18.S}[1], [x0], x1
st1 {v16.S}[1], [x0], x1
st1 {v0.S}[1], [x0], x1
st1 {v2.S}[1], [x0], x1
ret
endfunc
libavcodec/aarch64/neon.S
View file @
36e3b1f2
...
...
@@ -80,6 +80,30 @@
trn2 \r7\().4S, \t1\().4S, \r7\().4S
.endm
.macro transpose_4x16B r0, r1, r2, r3, t4, t5, t6, t7
trn1 \t4\().16B, \r0\().16B, \r1\().16B
trn2 \t5\().16B, \r0\().16B, \r1\().16B
trn1 \t6\().16B, \r2\().16B, \r3\().16B
trn2 \t7\().16B, \r2\().16B, \r3\().16B
trn1 \r0\().8H, \t4\().8H, \t6\().8H
trn2 \r2\().8H, \t4\().8H, \t6\().8H
trn1 \r1\().8H, \t5\().8H, \t7\().8H
trn2 \r3\().8H, \t5\().8H, \t7\().8H
.endm
.macro transpose_4x8B r0, r1, r2, r3, t4, t5, t6, t7
trn1 \t4\().8B, \r0\().8B, \r1\().8B
trn2 \t5\().8B, \r0\().8B, \r1\().8B
trn1 \t6\().8B, \r2\().8B, \r3\().8B
trn2 \t7\().8B, \r2\().8B, \r3\().8B
trn1 \r0\().4H, \t4\().4H, \t6\().4H
trn2 \r2\().4H, \t4\().4H, \t6\().4H
trn1 \r1\().4H, \t5\().4H, \t7\().4H
trn2 \r3\().4H, \t5\().4H, \t7\().4H
.endm
.macro transpose_4x4H r0, r1, r2, r3, r4, r5, r6, r7
trn1 \r4\().4H, \r0\().4H, \r1\().4H
trn2 \r5\().4H, \r0\().4H, \r1\().4H
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment