Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
F
ffmpeg.wasm-core
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Linshizhi
ffmpeg.wasm-core
Commits
2f99117f
Commit
2f99117f
authored
Nov 22, 2016
by
Martin Storsjö
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
aarch64: vp9itxfm: Don't repeatedly set x9 when nothing overwrites it
Signed-off-by:
Martin Storsjö
<
martin@martin.st
>
parent
2dbe2aa2
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
15 additions
and
11 deletions
+15
-11
vp9itxfm_neon.S
libavcodec/aarch64/vp9itxfm_neon.S
+15
-11
No files found.
libavcodec/aarch64/vp9itxfm_neon.S
View file @
2f99117f
...
@@ -599,9 +599,9 @@ endfunc
...
@@ -599,9 +599,9 @@ endfunc
// x1 = unused
// x1 = unused
// x2 = src
// x2 = src
// x3 = slice offset
// x3 = slice offset
// x9 = input stride
.macro itxfm16_1d_funcs txfm
.macro itxfm16_1d_funcs txfm
function \txfm\()16_1d_8x16_pass1_neon
function \txfm\()16_1d_8x16_pass1_neon
mov x9, #32
movi v2.8h, #0
movi v2.8h, #0
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
load_clear \i, x2, x9
load_clear \i, x2, x9
...
@@ -649,8 +649,8 @@ endfunc
...
@@ -649,8 +649,8 @@ endfunc
// x1 = dst stride
// x1 = dst stride
// x2 = src (temp buffer)
// x2 = src (temp buffer)
// x3 = slice offset
// x3 = slice offset
// x9 = temp buffer stride
function \txfm\()16_1d_8x16_pass2_neon
function \txfm\()16_1d_8x16_pass2_neon
mov x9, #32
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
load \i, x2, x9
load \i, x2, x9
.endr
.endr
...
@@ -747,6 +747,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
...
@@ -747,6 +747,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
.ifc \txfm1,idct
.ifc \txfm1,idct
ld1 {v0.8h,v1.8h}, [x10]
ld1 {v0.8h,v1.8h}, [x10]
.endif
.endif
mov x9, #32
.irp i, 0, 8
.irp i, 0, 8
add x0, sp, #(\i*32)
add x0, sp, #(\i*32)
...
@@ -882,13 +883,12 @@ endfunc
...
@@ -882,13 +883,12 @@ endfunc
// x0 = dst (temp buffer)
// x0 = dst (temp buffer)
// x1 = unused
// x1 = unused
// x2 = src
// x2 = src
// x9 = double input stride
// x10 = idct_coeffs
// x10 = idct_coeffs
// x11 = idct_coeffs + 32
// x11 = idct_coeffs + 32
function idct32_1d_8x32_pass1_neon
function idct32_1d_8x32_pass1_neon
ld1 {v0.8h,v1.8h}, [x10]
ld1 {v0.8h,v1.8h}, [x10]
// Double stride of the input, since we only read every other line
mov x9, #128
movi v4.8h, #0
movi v4.8h, #0
// v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
// v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
...
@@ -987,12 +987,13 @@ endfunc
...
@@ -987,12 +987,13 @@ endfunc
// x0 = dst
// x0 = dst
// x1 = dst stride
// x1 = dst stride
// x2 = src (temp buffer)
// x2 = src (temp buffer)
// x7 = negative double temp buffer stride
// x9 = double temp buffer stride
// x10 = idct_coeffs
// x10 = idct_coeffs
// x11 = idct_coeffs + 32
// x11 = idct_coeffs + 32
function idct32_1d_8x32_pass2_neon
function idct32_1d_8x32_pass2_neon
ld1 {v0.8h,v1.8h}, [x10]
ld1 {v0.8h,v1.8h}, [x10]
mov x9, #128
// v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
// v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
ld1 {v\i\().8h}, [x2], x9
ld1 {v\i\().8h}, [x2], x9
...
@@ -1001,7 +1002,6 @@ function idct32_1d_8x32_pass2_neon
...
@@ -1001,7 +1002,6 @@ function idct32_1d_8x32_pass2_neon
idct16
idct16
mov x9, #128
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
st1 {v\i\().8h}, [x2], x9
st1 {v\i\().8h}, [x2], x9
.endr
.endr
...
@@ -1018,11 +1018,10 @@ function idct32_1d_8x32_pass2_neon
...
@@ -1018,11 +1018,10 @@ function idct32_1d_8x32_pass2_neon
idct32_odd
idct32_odd
mov x9, #128
.macro load_acc_store a, b, c, d, neg=0
.macro load_acc_store a, b, c, d, neg=0
.if \neg == 0
ld1 {v4.8h}, [x2], x9
ld1 {v4.8h}, [x2], x9
ld1 {v5.8h}, [x2], x9
ld1 {v5.8h}, [x2], x9
.if \neg == 0
add v4.8h, v4.8h, v\a\().8h
add v4.8h, v4.8h, v\a\().8h
ld1 {v6.8h}, [x2], x9
ld1 {v6.8h}, [x2], x9
add v5.8h, v5.8h, v\b\().8h
add v5.8h, v5.8h, v\b\().8h
...
@@ -1030,10 +1029,12 @@ function idct32_1d_8x32_pass2_neon
...
@@ -1030,10 +1029,12 @@ function idct32_1d_8x32_pass2_neon
add v6.8h, v6.8h, v\c\().8h
add v6.8h, v6.8h, v\c\().8h
add v7.8h, v7.8h, v\d\().8h
add v7.8h, v7.8h, v\d\().8h
.else
.else
ld1 {v4.8h}, [x2], x7
ld1 {v5.8h}, [x2], x7
sub v4.8h, v4.8h, v\a\().8h
sub v4.8h, v4.8h, v\a\().8h
ld1 {v6.8h}, [x2], x9
ld1 {v6.8h}, [x2], x7
sub v5.8h, v5.8h, v\b\().8h
sub v5.8h, v5.8h, v\b\().8h
ld1 {v7.8h}, [x2], x9
ld1 {v7.8h}, [x2], x7
sub v6.8h, v6.8h, v\c\().8h
sub v6.8h, v6.8h, v\c\().8h
sub v7.8h, v7.8h, v\d\().8h
sub v7.8h, v7.8h, v\d\().8h
.endif
.endif
...
@@ -1064,7 +1065,6 @@ function idct32_1d_8x32_pass2_neon
...
@@ -1064,7 +1065,6 @@ function idct32_1d_8x32_pass2_neon
load_acc_store 23, 22, 21, 20
load_acc_store 23, 22, 21, 20
load_acc_store 19, 18, 17, 16
load_acc_store 19, 18, 17, 16
sub x2, x2, x9
sub x2, x2, x9
neg x9, x9
load_acc_store 16, 17, 18, 19, 1
load_acc_store 16, 17, 18, 19, 1
load_acc_store 20, 21, 22, 23, 1
load_acc_store 20, 21, 22, 23, 1
load_acc_store 24, 25, 26, 27, 1
load_acc_store 24, 25, 26, 27, 1
...
@@ -1093,6 +1093,10 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
...
@@ -1093,6 +1093,10 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
mov x5, x1
mov x5, x1
mov x6, x2
mov x6, x2
// Double stride of the input, since we only read every other line
mov x9, #128
neg x7, x9
.irp i, 0, 8, 16, 24
.irp i, 0, 8, 16, 24
add x0, sp, #(\i*64)
add x0, sp, #(\i*64)
add x2, x6, #(\i*2)
add x2, x6, #(\i*2)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment