Commit c950beb6 authored by Martin Storsjö

aarch64: vp8: Fix assembling with clang

This also partially fixes assembling with MS armasm64 (via
gas-preprocessor).

The movrel macro invocations need to pass the offset via a separate
parameter. Mach-O and COFF relocations don't allow a negative
offset to a symbol, which is handled properly if the offset is passed
via the parameter. If no offset parameter is given, the macro
evaluates to something like "adrp x17, subpel_filters-16+(0)", which
older clang versions also fail to parse (the older clang versions
only support a single offset term, although it can be a parenthesized
expression).
Signed-off-by: Martin Storsjö <martin@martin.st>
(cherry picked from commit 26d7af4c)
parent abc5ac3c
...@@ -31,10 +31,10 @@ function ff_vp8_idct_add_neon, export=1 ...@@ -31,10 +31,10 @@ function ff_vp8_idct_add_neon, export=1
movk w4, #35468/2, lsl 16 movk w4, #35468/2, lsl 16
dup v4.2s, w4 dup v4.2s, w4
smull v26.4s, v1.4h, v4.4h[0] smull v26.4s, v1.4h, v4.h[0]
smull v27.4s, v3.4h, v4.4h[0] smull v27.4s, v3.4h, v4.h[0]
sqdmulh v20.4h, v1.4h, v4.4h[1] sqdmulh v20.4h, v1.4h, v4.h[1]
sqdmulh v23.4h, v3.4h, v4.4h[1] sqdmulh v23.4h, v3.4h, v4.h[1]
sqshrn v21.4h, v26.4s, #16 sqshrn v21.4h, v26.4s, #16
sqshrn v22.4h, v27.4s, #16 sqshrn v22.4h, v27.4s, #16
add v21.4h, v21.4h, v1.4h add v21.4h, v21.4h, v1.4h
...@@ -54,12 +54,12 @@ function ff_vp8_idct_add_neon, export=1 ...@@ -54,12 +54,12 @@ function ff_vp8_idct_add_neon, export=1
transpose_4x4H v0, v1, v2, v3, v24, v5, v6, v7 transpose_4x4H v0, v1, v2, v3, v24, v5, v6, v7
movi v29.8h, #0 movi v29.8h, #0
smull v26.4s, v1.4h, v4.4h[0] smull v26.4s, v1.4h, v4.h[0]
st1 {v29.8h}, [x1], #16 st1 {v29.8h}, [x1], #16
smull v27.4s, v3.4h, v4.4h[0] smull v27.4s, v3.4h, v4.h[0]
st1 {v29.16b}, [x1] st1 {v29.16b}, [x1]
sqdmulh v21.4h, v1.4h, v4.4h[1] sqdmulh v21.4h, v1.4h, v4.h[1]
sqdmulh v23.4h, v3.4h, v4.4h[1] sqdmulh v23.4h, v3.4h, v4.h[1]
sqshrn v20.4h, v26.4s, #16 sqshrn v20.4h, v26.4s, #16
sqshrn v22.4h, v27.4s, #16 sqshrn v22.4h, v27.4s, #16
add v20.4h, v20.4h, v1.4h add v20.4h, v20.4h, v1.4h
...@@ -469,7 +469,7 @@ function ff_vp8_h_loop_filter16\name\()_neon, export=1 ...@@ -469,7 +469,7 @@ function ff_vp8_h_loop_filter16\name\()_neon, export=1
ld1 {v6.d}[1], [x0], x1 ld1 {v6.d}[1], [x0], x1
ld1 {v7.d}[1], [x0], x1 ld1 {v7.d}[1], [x0], x1
transpose_8x16b v0, v1, v2, v3, v4, v5, v6, v7, v30, v31 transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
dup v22.16b, w2 // flim_E dup v22.16b, w2 // flim_E
.if !\simple .if !\simple
...@@ -480,7 +480,7 @@ function ff_vp8_h_loop_filter16\name\()_neon, export=1 ...@@ -480,7 +480,7 @@ function ff_vp8_h_loop_filter16\name\()_neon, export=1
sub x0, x0, x1, lsl #4 // backup 16 rows sub x0, x0, x1, lsl #4 // backup 16 rows
transpose_8x16b v0, v1, v2, v3, v4, v5, v6, v7, v30, v31 transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
// Store pixels: // Store pixels:
st1 {v0.d}[0], [x0], x1 st1 {v0.d}[0], [x0], x1
...@@ -531,7 +531,7 @@ function ff_vp8_h_loop_filter8uv\name\()_neon, export=1 ...@@ -531,7 +531,7 @@ function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
ld1 {v7.d}[0], [x0], x2 ld1 {v7.d}[0], [x0], x2
ld1 {v7.d}[1], [x1], x2 ld1 {v7.d}[1], [x1], x2
transpose_8x16b v0, v1, v2, v3, v4, v5, v6, v7, v30, v31 transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
dup v22.16b, w3 // flim_E dup v22.16b, w3 // flim_E
dup v23.16b, w4 // flim_I dup v23.16b, w4 // flim_I
...@@ -541,7 +541,7 @@ function ff_vp8_h_loop_filter8uv\name\()_neon, export=1 ...@@ -541,7 +541,7 @@ function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
sub x0, x0, x2, lsl #3 // backup u 8 rows sub x0, x0, x2, lsl #3 // backup u 8 rows
sub x1, x1, x2, lsl #3 // backup v 8 rows sub x1, x1, x2, lsl #3 // backup v 8 rows
transpose_8x16b v0, v1, v2, v3, v4, v5, v6, v7, v30, v31 transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
// Store pixels: // Store pixels:
st1 {v0.d}[0], [x0], x2 // load u st1 {v0.d}[0], [x0], x2 // load u
...@@ -613,13 +613,13 @@ endfunc ...@@ -613,13 +613,13 @@ endfunc
uxtl v22.8h, v24.8b uxtl v22.8h, v24.8b
ext v26.8b, \s0\().8b, \s1\().8b, #5 ext v26.8b, \s0\().8b, \s1\().8b, #5
uxtl v25.8h, v25.8b uxtl v25.8h, v25.8b
mul v21.8h, v21.8h, v0.8h[2] mul v21.8h, v21.8h, v0.h[2]
uxtl v26.8h, v26.8b uxtl v26.8h, v26.8b
mul v22.8h, v22.8h, v0.8h[3] mul v22.8h, v22.8h, v0.h[3]
mls v21.8h, v19.8h, v0.8h[1] mls v21.8h, v19.8h, v0.h[1]
mls v22.8h, v25.8h, v0.8h[4] mls v22.8h, v25.8h, v0.h[4]
mla v21.8h, v18.8h, v0.8h[0] mla v21.8h, v18.8h, v0.h[0]
mla v22.8h, v26.8h, v0.8h[5] mla v22.8h, v26.8h, v0.h[5]
sqadd v22.8h, v21.8h, v22.8h sqadd v22.8h, v21.8h, v22.8h
sqrshrun \d\().8b, v22.8h, #7 sqrshrun \d\().8b, v22.8h, #7
.endm .endm
...@@ -640,20 +640,20 @@ endfunc ...@@ -640,20 +640,20 @@ endfunc
uxtl2 v2.8h, v2.16b uxtl2 v2.8h, v2.16b
uxtl v17.8h, v16.8b uxtl v17.8h, v16.8b
uxtl2 v16.8h, v16.16b uxtl2 v16.8h, v16.16b
mul v19.8h, v19.8h, v0.8h[3] mul v19.8h, v19.8h, v0.h[3]
mul v18.8h, v18.8h, v0.8h[2] mul v18.8h, v18.8h, v0.h[2]
mul v3.8h, v3.8h, v0.8h[2] mul v3.8h, v3.8h, v0.h[2]
mul v22.8h, v22.8h, v0.8h[3] mul v22.8h, v22.8h, v0.h[3]
mls v19.8h, v20.8h, v0.8h[4] mls v19.8h, v20.8h, v0.h[4]
uxtl v20.8h, \v0\().8b uxtl v20.8h, \v0\().8b
uxtl2 v1.8h, \v0\().16b uxtl2 v1.8h, \v0\().16b
mls v18.8h, v17.8h, v0.8h[1] mls v18.8h, v17.8h, v0.h[1]
mls v3.8h, v16.8h, v0.8h[1] mls v3.8h, v16.8h, v0.h[1]
mls v22.8h, v23.8h, v0.8h[4] mls v22.8h, v23.8h, v0.h[4]
mla v18.8h, v20.8h, v0.8h[0] mla v18.8h, v20.8h, v0.h[0]
mla v19.8h, v21.8h, v0.8h[5] mla v19.8h, v21.8h, v0.h[5]
mla v3.8h, v1.8h, v0.8h[0] mla v3.8h, v1.8h, v0.h[0]
mla v22.8h, v2.8h, v0.8h[5] mla v22.8h, v2.8h, v0.h[5]
sqadd v19.8h, v18.8h, v19.8h sqadd v19.8h, v18.8h, v19.8h
sqadd v22.8h, v3.8h, v22.8h sqadd v22.8h, v3.8h, v22.8h
sqrshrun \d0\().8b, v19.8h, #7 sqrshrun \d0\().8b, v19.8h, #7
...@@ -667,12 +667,12 @@ endfunc ...@@ -667,12 +667,12 @@ endfunc
uxtl \s4\().8h, \s4\().8b uxtl \s4\().8h, \s4\().8b
uxtl \s0\().8h, \s0\().8b uxtl \s0\().8h, \s0\().8b
uxtl \s5\().8h, \s5\().8b uxtl \s5\().8h, \s5\().8b
mul \s2\().8h, \s2\().8h, v0.8h[2] mul \s2\().8h, \s2\().8h, v0.h[2]
mul \s3\().8h, \s3\().8h, v0.8h[3] mul \s3\().8h, \s3\().8h, v0.h[3]
mls \s2\().8h, \s1\().8h, v0.8h[1] mls \s2\().8h, \s1\().8h, v0.h[1]
mls \s3\().8h, \s4\().8h, v0.8h[4] mls \s3\().8h, \s4\().8h, v0.h[4]
mla \s2\().8h, \s0\().8h, v0.8h[0] mla \s2\().8h, \s0\().8h, v0.h[0]
mla \s3\().8h, \s5\().8h, v0.8h[5] mla \s3\().8h, \s5\().8h, v0.h[5]
sqadd \s3\().8h, \s2\().8h, \s3\().8h sqadd \s3\().8h, \s2\().8h, \s3\().8h
sqrshrun \d0\().8b, \s3\().8h, #7 sqrshrun \d0\().8b, \s3\().8h, #7
.endm .endm
...@@ -685,20 +685,20 @@ endfunc ...@@ -685,20 +685,20 @@ endfunc
uxtl \s4\().8h, \s4\().8b uxtl \s4\().8h, \s4\().8b
uxtl \s2\().8h, \s2\().8b uxtl \s2\().8h, \s2\().8b
uxtl \s5\().8h, \s5\().8b uxtl \s5\().8h, \s5\().8b
mul \s0\().8h, \s0\().8h, v0.8h[0] mul \s0\().8h, \s0\().8h, v0.h[0]
mul v31.8h , \s3\().8h, v0.8h[3] mul v31.8h , \s3\().8h, v0.h[3]
mul \s3\().8h, \s3\().8h, v0.8h[2] mul \s3\().8h, \s3\().8h, v0.h[2]
mul \s6\().8h, \s6\().8h, v0.8h[5] mul \s6\().8h, \s6\().8h, v0.h[5]
mls \s0\().8h, \s1\().8h, v0.8h[1] mls \s0\().8h, \s1\().8h, v0.h[1]
mls v31.8h , \s4\().8h, v0.8h[4] mls v31.8h , \s4\().8h, v0.h[4]
mls \s3\().8h, \s2\().8h, v0.8h[1] mls \s3\().8h, \s2\().8h, v0.h[1]
mls \s6\().8h, \s5\().8h, v0.8h[4] mls \s6\().8h, \s5\().8h, v0.h[4]
mla \s0\().8h, \s2\().8h, v0.8h[2] mla \s0\().8h, \s2\().8h, v0.h[2]
mla v31.8h , \s5\().8h, v0.8h[5] mla v31.8h , \s5\().8h, v0.h[5]
mla \s3\().8h, \s1\().8h, v0.8h[0] mla \s3\().8h, \s1\().8h, v0.h[0]
mla \s6\().8h, \s4\().8h, v0.8h[3] mla \s6\().8h, \s4\().8h, v0.h[3]
sqadd v31.8h , \s0\().8h, v31.8h sqadd v31.8h , \s0\().8h, v31.8h
sqadd \s6\().8h, \s3\().8h, \s6\().8h sqadd \s6\().8h, \s3\().8h, \s6\().8h
sqrshrun \d0\().8b, v31.8h, #7 sqrshrun \d0\().8b, v31.8h, #7
...@@ -713,10 +713,10 @@ endfunc ...@@ -713,10 +713,10 @@ endfunc
ext v25.8b, \v0\().8b, \v1\().8b, #3 ext v25.8b, \v0\().8b, \v1\().8b, #3
uxtl v22.8h, v23.8b uxtl v22.8h, v23.8b
uxtl v25.8h, v25.8b uxtl v25.8h, v25.8b
mul v20.8h, v20.8h, v0.8h[2] mul v20.8h, v20.8h, v0.h[2]
mul v22.8h, v22.8h, v0.8h[3] mul v22.8h, v22.8h, v0.h[3]
mls v20.8h, v19.8h, v0.8h[1] mls v20.8h, v19.8h, v0.h[1]
mls v22.8h, v25.8h, v0.8h[4] mls v22.8h, v25.8h, v0.h[4]
sqadd v22.8h, v20.8h, v22.8h sqadd v22.8h, v20.8h, v22.8h
sqrshrun \d\().8b, v22.8h, #7 sqrshrun \d\().8b, v22.8h, #7
.endm .endm
...@@ -727,14 +727,14 @@ endfunc ...@@ -727,14 +727,14 @@ endfunc
uxtl \s2\().8h, \s2\().8b uxtl \s2\().8h, \s2\().8b
uxtl \s3\().8h, \s3\().8b uxtl \s3\().8h, \s3\().8b
uxtl \s4\().8h, \s4\().8b uxtl \s4\().8h, \s4\().8b
mul v21.8h, \s1\().8h, v0.8h[2] mul v21.8h, \s1\().8h, v0.h[2]
mul v23.8h, \s2\().8h, v0.8h[3] mul v23.8h, \s2\().8h, v0.h[3]
mul \s2\().8h, \s2\().8h, v0.8h[2] mul \s2\().8h, \s2\().8h, v0.h[2]
mul v22.8h, \s3\().8h, v0.8h[3] mul v22.8h, \s3\().8h, v0.h[3]
mls v21.8h, \s0\().8h, v0.8h[1] mls v21.8h, \s0\().8h, v0.h[1]
mls v23.8h, \s3\().8h, v0.8h[4] mls v23.8h, \s3\().8h, v0.h[4]
mls \s2\().8h, \s1\().8h, v0.8h[1] mls \s2\().8h, \s1\().8h, v0.h[1]
mls v22.8h, \s4\().8h, v0.8h[4] mls v22.8h, \s4\().8h, v0.h[4]
sqadd v21.8h, v21.8h, v23.8h sqadd v21.8h, v21.8h, v23.8h
sqadd \s2\().8h, \s2\().8h, v22.8h sqadd \s2\().8h, \s2\().8h, v22.8h
sqrshrun \d0\().8b, v21.8h, #7 sqrshrun \d0\().8b, v21.8h, #7
...@@ -759,7 +759,7 @@ function ff_put_vp8_epel16_v6_neon, export=1 ...@@ -759,7 +759,7 @@ function ff_put_vp8_epel16_v6_neon, export=1
sxtw x4, w4 sxtw x4, w4
sxtw x6, w6 sxtw x6, w6
movrel x17, subpel_filters-16 movrel x17, subpel_filters, -16
add x6, x17, x6, lsl #4 // y add x6, x17, x6, lsl #4 // y
ld1 {v0.8h}, [x6] ld1 {v0.8h}, [x6]
1: 1:
...@@ -788,7 +788,7 @@ function ff_put_vp8_epel16_h6_neon, export=1 ...@@ -788,7 +788,7 @@ function ff_put_vp8_epel16_h6_neon, export=1
sxtw x5, w5 // x sxtw x5, w5 // x
// first pass (horizontal): // first pass (horizontal):
movrel x17, subpel_filters-16 movrel x17, subpel_filters, -16
add x5, x17, x5, lsl #4 // x add x5, x17, x5, lsl #4 // x
ld1 {v0.8h}, [x5] ld1 {v0.8h}, [x5]
1: 1:
...@@ -807,7 +807,7 @@ function ff_put_vp8_epel16_h6v6_neon, export=1 ...@@ -807,7 +807,7 @@ function ff_put_vp8_epel16_h6v6_neon, export=1
sub x2, x2, #2 sub x2, x2, #2
// first pass (horizontal): // first pass (horizontal):
movrel x17, subpel_filters-16 movrel x17, subpel_filters, -16
sxtw x5, w5 // x sxtw x5, w5 // x
add x16, x17, x5, lsl #4 // x add x16, x17, x5, lsl #4 // x
sub sp, sp, #336+16 sub sp, sp, #336+16
...@@ -854,7 +854,7 @@ function ff_put_vp8_epel8_h6v6_neon, export=1 ...@@ -854,7 +854,7 @@ function ff_put_vp8_epel8_h6v6_neon, export=1
sxtw x4, w4 sxtw x4, w4
// first pass (horizontal): // first pass (horizontal):
movrel x17, subpel_filters-16 movrel x17, subpel_filters, -16
sxtw x5, w5 sxtw x5, w5
add x5, x17, x5, lsl #4 // x add x5, x17, x5, lsl #4 // x
sub sp, sp, #168+16 sub sp, sp, #168+16
...@@ -900,7 +900,7 @@ function ff_put_vp8_epel8_h4v6_neon, export=1 ...@@ -900,7 +900,7 @@ function ff_put_vp8_epel8_h4v6_neon, export=1
sxtw x4, w4 sxtw x4, w4
// first pass (horizontal): // first pass (horizontal):
movrel x17, subpel_filters-16 movrel x17, subpel_filters, -16
sxtw x5, w5 sxtw x5, w5
add x5, x17, x5, lsl #4 // x add x5, x17, x5, lsl #4 // x
sub sp, sp, #168+16 sub sp, sp, #168+16
...@@ -947,7 +947,7 @@ function ff_put_vp8_epel8_h4v4_neon, export=1 ...@@ -947,7 +947,7 @@ function ff_put_vp8_epel8_h4v4_neon, export=1
// first pass (horizontal): // first pass (horizontal):
movrel x17, subpel_filters-16 movrel x17, subpel_filters, -16
sxtw x5, w5 sxtw x5, w5
add x5, x17, x5, lsl #4 // x add x5, x17, x5, lsl #4 // x
sub sp, sp, #168+16 sub sp, sp, #168+16
...@@ -992,7 +992,7 @@ function ff_put_vp8_epel8_h6v4_neon, export=1 ...@@ -992,7 +992,7 @@ function ff_put_vp8_epel8_h6v4_neon, export=1
// first pass (horizontal): // first pass (horizontal):
movrel x17, subpel_filters-16 movrel x17, subpel_filters, -16
sxtw x5, w5 sxtw x5, w5
add x5, x17, x5, lsl #4 // x add x5, x17, x5, lsl #4 // x
sub sp, sp, #168+16 sub sp, sp, #168+16
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment