Linshizhi / ffmpeg.wasm-core · Commits

Commit e5786383 authored Oct 12, 2015 by Ronald S. Bultje

    vp9: use registers for constant loading where possible.

parent 408bb855

Showing 1 changed file with 146 additions and 122 deletions:

libavcodec/x86/vp9itxfm_16bpp.asm (+146, -122)
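In broad terms, the change is this: rounding and masking constants such as pd_8192 (the rounding term for the >> 14 shifts) and pd_3fff (the low-bit mask) used to be re-read from memory at every use inside the transform macros. The macros now take them as parameters, so on x86-64, where xmm8-xmm15 are free, callers load each constant once and pass the register down. A minimal sketch of the pattern in the file's NASM/x86inc idiom; it is illustrative only, not a line quoted from the diff below:

    ; before: memory operand fetched on every use
    paddd    m3, [pd_8192]

    ; after: preload once, reuse the register thereafter
    mova     m6, [pd_8192]
    paddd    m3, m6

On x86-32, where no spare xmm registers exist, the PRELOAD macro introduced by this commit instead defines reg_<name> as the original memory operand, so the same call sites assemble on both targets.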
@@ -345,9 +345,9 @@ IADST4_FN iadst, IADST4, iadst, IADST4
 ;
 ; dst1 = src1 * coef1 + src2 * coef2 + rnd >> 14
 ; dst2 = src1 * coef2 - src2 * coef1 + rnd >> 14
-%macro SUMSUB_MUL 6 ; src/dst 1-2, tmp1-2, coef1-2
-    pand               m%3, m%1, [pd_3fff]
-    pand               m%4, m%2, [pd_3fff]
+%macro SUMSUB_MUL 6-8 [pd_8192], [pd_3fff] ; src/dst 1-2, tmp1-2, coef1-2, rnd, mask
+    pand               m%3, m%1, %8
+    pand               m%4, m%2, %8
     psrad              m%1, 14
     psrad              m%2, 14
     packssdw           m%4, m%2
@@ -358,20 +358,20 @@ IADST4_FN iadst, IADST4, iadst, IADST4
     pmaddwd            m%1, m%2, [pw_%6_%5]
     pmaddwd            m%4, [pw_m%5_%6]
     pmaddwd            m%2, [pw_m%5_%6]
-    paddd              m%3, [pd_8192]
-    paddd              m%4, [pd_8192]
+    paddd              m%3, %7
+    paddd              m%4, %7
     psrad              m%3, 14
     psrad              m%4, 14
     paddd              m%1, m%3
     paddd              m%2, m%4
 %endmacro

-%macro IDCT4_12BPP_1D 0-6 0, 1, 2, 3, 4, 5
-    SUMSUB_MUL          %1, %3, %5, %6, 11585, 11585
-    SUMSUB_MUL          %2, %4, %5, %6, 15137,  6270
-    SUMSUB_BA            d, %2, %1, %5
-    SUMSUB_BA            d, %4, %3, %5
-    SWAP                %2, %4, %1
+%macro IDCT4_12BPP_1D 0-8 [pd_8192], [pd_3fff], 0, 1, 2, 3, 4, 5 ; rnd, mask, in/out0-3, tmp0-1
+    SUMSUB_MUL          %3, %5, %7, %8, 11585, 11585, %1, %2
+    SUMSUB_MUL          %4, %6, %7, %8, 15137,  6270, %1, %2
+    SUMSUB_BA            d, %4, %3, %7
+    SUMSUB_BA            d, %6, %5, %7
+    SWAP                %4, %6, %3
 %endmacro

 %macro STORE_4x4 6 ; tmp1-2, reg1-2, min, max
@@ -433,10 +433,12 @@ cglobal vp9_idct_idct_4x4_add_12, 4, 4, 6, dst, stride, block, eob
     mova                m1, [blockq+1*16]
     mova                m2, [blockq+2*16]
     mova                m3, [blockq+3*16]
-    IDCT4_12BPP_1D
+    mova                m6, [pd_8192]
+    mova                m7, [pd_3fff]
+    IDCT4_12BPP_1D      m6, m7
     TRANSPOSE4x4D        0, 1, 2, 3, 4
-    IDCT4_12BPP_1D
+    IDCT4_12BPP_1D      m6, m7
     pxor                m4, m4
     ZERO_BLOCK      blockq, 16, 4, m4
@@ -445,7 +447,8 @@ cglobal vp9_idct_idct_4x4_add_12, 4, 4, 6, dst, stride, block, eob
     DEFINE_ARGS        dst, stride, stride3
     lea           stride3q, [strideq*3]
     mova                m5, [pw_4095]
-    ROUND_AND_STORE_4x4  0, 1, 2, 3, m4, m5, [pd_8], 4
+    mova                m6, [pd_8]
+    ROUND_AND_STORE_4x4  0, 1, 2, 3, m4, m5, m6, 4
     RET

 %macro SCRATCH 3-4
@@ -473,21 +476,32 @@ cglobal vp9_idct_idct_4x4_add_12, 4, 4, 6, dst, stride, block, eob
 %endif
 %endmacro

+%macro PRELOAD 2-3
+%if ARCH_X86_64
+    mova               m%1, [%2]
+%if %0 == 3
+%define reg_%3 m%1
+%endif
+%elif %0 == 3
+%define reg_%3 [%2]
+%endif
+%endmacro
+
 ; out0 =  5283 * in0 + 13377 + in1 + 15212 * in2 +  9929 * in3 + rnd >> 14
 ; out1 =  9929 * in0 + 13377 * in1 -  5283 * in2 - 15282 * in3 + rnd >> 14
 ; out2 = 13377 * in0               - 13377 * in2 + 13377 * in3 + rnd >> 14
 ; out3 = 15212 * in0 - 13377 * in1 +  9929 * in2 -  5283 * in3 + rnd >> 14
-%macro IADST4_12BPP_1D 0
-    pand                m4, m0, [pd_3fff]
-    pand                m5, m1, [pd_3fff]
+%macro IADST4_12BPP_1D 0-2 [pd_8192], [pd_3fff] ; rnd, mask
+    pand                m4, m0, %2
+    pand                m5, m1, %2
     psrad               m0, 14
     psrad               m1, 14
     packssdw            m5, m1
     packssdw            m4, m0
     punpckhwd           m1, m4, m5
     punpcklwd           m4, m5
-    pand                m5, m2, [pd_3fff]
-    pand                m6, m3, [pd_3fff]
+    pand                m5, m2, %2
+    pand                m6, m3, %2
     psrad               m2, 14
     psrad               m3, 14
     packssdw            m6, m3
@@ -501,29 +515,35 @@ cglobal vp9_idct_idct_4x4_add_12, 4, 4, 6, dst, stride, block, eob
     ; m4/5 have the low bits of 0,1,2,3
     ; m0/2/6/7 are free
-    pmaddwd             m7, reg_b, [pw_15212_9929]
-    pmaddwd             m6, m4, [pw_5283_13377]
-    pmaddwd             m2, m3, [pw_15212_9929]
-    pmaddwd             m0, reg_a, [pw_5283_13377]
+    mova                m2, [pw_15212_9929]
+    mova                m0, [pw_5283_13377]
+    pmaddwd             m7, m2, reg_b
+    pmaddwd             m6, m4, m0
+    pmaddwd             m2, m3
+    pmaddwd             m0, reg_a
     paddd               m6, m7
     paddd               m0, m2
-    pmaddwd             m7, reg_b, [pw_m13377_13377]
-    pmaddwd             m2, m4, [pw_13377_0]
-    pmaddwd             m1, m3, [pw_m13377_13377]
-    pmaddwd             m5, reg_a, [pw_13377_0]
+    mova                m1, [pw_m13377_13377]
+    mova                m5, [pw_13377_0]
+    pmaddwd             m7, m1, reg_b
+    pmaddwd             m2, m4, m5
+    pmaddwd             m1, m3
+    pmaddwd             m5, reg_a
     paddd               m2, m7
     paddd               m1, m5
-    paddd               m6, [pd_8192]
-    paddd               m2, [pd_8192]
+    paddd               m6, %1
+    paddd               m2, %1
     psrad               m6, 14
     psrad               m2, 14
     paddd               m0, m6 ; t0
     paddd               m2, m1 ; t2

-    pmaddwd             m1, reg_b, [pw_m5283_m15212]
-    pmaddwd             m6, m4, [pw_9929_13377]
-    pmaddwd             m7, m3, [pw_m5283_m15212]
-    pmaddwd             m5, reg_a, [pw_9929_13377]
+    mova                m7, [pw_m5283_m15212]
+    mova                m5, [pw_9929_13377]
+    pmaddwd             m1, m7, reg_b
+    pmaddwd             m6, m4, m5
+    pmaddwd             m7, m3
+    pmaddwd             m5, reg_a
     paddd               m6, m1
     paddd               m7, m5
     UNSCRATCH            5, 9, rsp+1*mmsize, b
@@ -534,8 +554,8 @@ cglobal vp9_idct_idct_4x4_add_12, 4, 4, 6, dst, stride, block, eob
     pmaddwd             m1, [pw_15212_m13377]
     paddd               m4, m5
     paddd               m3, m1
-    paddd               m6, [pd_8192]
-    paddd               m4, [pd_8192]
+    paddd               m6, %1
+    paddd               m4, %1
     psrad               m6, 14
     psrad               m4, 14
     paddd               m7, m6 ; t1
@@ -545,15 +565,17 @@ cglobal vp9_idct_idct_4x4_add_12, 4, 4, 6, dst, stride, block, eob
 %endmacro

 %macro IADST4_12BPP_FN 4
-cglobal vp9_%1_%3_4x4_add_12, 3, 3, 10, 2 * ARCH_X86_32 * mmsize, dst, stride, block, eob
+cglobal vp9_%1_%3_4x4_add_12, 3, 3, 12, 2 * ARCH_X86_32 * mmsize, dst, stride, block, eob
     mova                m0, [blockq+0*16]
     mova                m1, [blockq+1*16]
     mova                m2, [blockq+2*16]
     mova                m3, [blockq+3*16]
-    %2_12BPP_1D
+    PRELOAD             10, pd_8192, rnd
+    PRELOAD             11, pd_3fff, mask
+    %2_12BPP_1D         reg_rnd, reg_mask
     TRANSPOSE4x4D        0, 1, 2, 3, 4
-    %4_12BPP_1D
+    %4_12BPP_1D         reg_rnd, reg_mask
     pxor                m4, m4
     ZERO_BLOCK      blockq, 16, 4, m4
@@ -562,7 +584,8 @@ cglobal vp9_%1_%3_4x4_add_12, 3, 3, 10, 2 * ARCH_X86_32 * mmsize, dst, stride, b
     DEFINE_ARGS        dst, stride, stride3
     lea           stride3q, [strideq*3]
     mova                m5, [pw_4095]
-    ROUND_AND_STORE_4x4  0, 1, 2, 3, m4, m5, [pd_8], 4
+    mova                m6, [pd_8]
+    ROUND_AND_STORE_4x4  0, 1, 2, 3, m4, m5, m6, 4
     RET
 %endmacro
@@ -573,30 +596,30 @@ IADST4_12BPP_FN iadst, IADST4, iadst, IADST4
 ; the following line has not been executed at the end of this macro:
 ; UNSCRATCH 6, 8, rsp+%3*mmsize
-%macro IDCT8_1D 1-3 2 * mmsize, 17 ; src, src_stride, stack_offset
-    mova                m0, [%1 + 0 * %2]
-    mova                m2, [%1 + 2 * %2]
-    mova                m4, [%1 + 4 * %2]
-    mova                m6, [%1 + 6 * %2]
-    IDCT4_12BPP_1D       0, 2, 4, 6, 1, 3 ; m0/2/4/6 have t0/1/2/3
-    SCRATCH              4, 8, rsp+(%3+0)*mmsize
-    SCRATCH              6, 9, rsp+(%3+1)*mmsize
-    mova                m1, [%1 + 1 * %2]
-    mova                m3, [%1 + 3 * %2]
-    mova                m5, [%1 + 5 * %2]
-    mova                m7, [%1 + 7 * %2]
-    SUMSUB_MUL           1, 7, 4, 6, 16069,  3196 ; m1=t7a, m7=t4a
-    SUMSUB_MUL           5, 3, 4, 6,  9102, 13623 ; m5=t6a, m3=t5a
+%macro IDCT8_1D 1-5 [pd_8192], [pd_3fff], 2 * mmsize, 17 ; src, rnd, mask, src_stride, stack_offset
+    mova                m0, [%1 + 0 * %4]
+    mova                m2, [%1 + 2 * %4]
+    mova                m4, [%1 + 4 * %4]
+    mova                m6, [%1 + 6 * %4]
+    IDCT4_12BPP_1D      %2, %3, 0, 2, 4, 6, 1, 3 ; m0/2/4/6 have t0/1/2/3
+    SCRATCH              4, 8, rsp+(%5+0)*mmsize
+    SCRATCH              6, 9, rsp+(%5+1)*mmsize
+    mova                m1, [%1 + 1 * %4]
+    mova                m3, [%1 + 3 * %4]
+    mova                m5, [%1 + 5 * %4]
+    mova                m7, [%1 + 7 * %4]
+    SUMSUB_MUL           1, 7, 4, 6, 16069,  3196, %2, %3 ; m1=t7a, m7=t4a
+    SUMSUB_MUL           5, 3, 4, 6,  9102, 13623, %2, %3 ; m5=t6a, m3=t5a
     SUMSUB_BA            d, 3, 7, 4 ; m3=t4, m7=t5a
     SUMSUB_BA            d, 5, 1, 4 ; m5=t7, m1=t6a
-    SUMSUB_MUL           1, 7, 4, 6, 11585, 11585 ; m1=t6, m7=t5
+    SUMSUB_MUL           1, 7, 4, 6, 11585, 11585, %2, %3 ; m1=t6, m7=t5
     SUMSUB_BA            d, 5, 0, 4 ; m5=out0, m0=out7
     SUMSUB_BA            d, 1, 2, 4 ; m1=out1, m2=out6
-    UNSCRATCH            4, 8, rsp+(%3+0)*mmsize
-    UNSCRATCH            6, 9, rsp+(%3+1)*mmsize
-    SCRATCH              2, 8, rsp+(%3+0)*mmsize
+    UNSCRATCH            4, 8, rsp+(%5+0)*mmsize
+    UNSCRATCH            6, 9, rsp+(%5+1)*mmsize
+    SCRATCH              2, 8, rsp+(%5+0)*mmsize
     SUMSUB_BA            d, 7, 4, 2 ; m7=out2, m4=out5
     SUMSUB_BA            d, 3, 6, 2 ; m3=out3, m6=out4
     SWAP                 0, 5, 4, 6, 2, 7
 %endmacro
@@ -613,23 +636,12 @@ IADST4_12BPP_FN iadst, IADST4, iadst, IADST4
     mova      [%6+%7*1], m%2
 %endmacro

-%macro PRELOAD 2-3
-%if ARCH_X86_64
-    mova               m%1, [%2]
-%if %0 == 3
-%define reg_%3 m%1
-%endif
-%elif %0 == 3
-%define reg_%3 [%2]
-%endif
-%endmacro
-
 ; FIXME we can use the intermediate storage (rsp[0-15]) on x86-32 for temp
 ; storage also instead of allocating two more stack spaces. This doesn't
 ; matter much but it's something...
 INIT_XMM sse2
-cglobal vp9_idct_idct_8x8_add_10, 4, 6 + ARCH_X86_64, 10, \
-                                  17 * mmsize + 2 * ARCH_X86_32 * mmsize, \
+cglobal vp9_idct_idct_8x8_add_10, 4, 6 + ARCH_X86_64, 14, \
+                                  16 * mmsize + 3 * ARCH_X86_32 * mmsize, \
                                   dst, stride, block, eob
     mova                m0, [pw_1023]
     cmp               eobd, 1
@@ -654,7 +666,7 @@ cglobal vp9_idct_idct_8x8_add_10, 4, 6 + ARCH_X86_64, 10, \
     RET
 .idctfull:
-    mova [rsp+16*mmsize], m0
+    SCRATCH              0, 12, rsp+16*mmsize, max
     DEFINE_ARGS        dst, stride, block, cnt, ptr, skip, dstbak
 %if ARCH_X86_64
     mov            dstbakq, dstq
@@ -669,8 +681,11 @@ cglobal vp9_idct_idct_8x8_add_10, 4, 6 + ARCH_X86_64, 10, \
     mov              skipd, 2
     sub              skipd, cntd
     mov               ptrq, rsp
+    PRELOAD             10, pd_8192, rnd
+    PRELOAD             11, pd_3fff, mask
+    PRELOAD             13, pd_16, srnd
 .loop_1:
-    IDCT8_1D        blockq
+    IDCT8_1D        blockq, reg_rnd, reg_mask
     TRANSPOSE4x4D        0, 1, 2, 3, 6
     mova   [ptrq+0*mmsize], m0
@@ -709,14 +724,15 @@ cglobal vp9_idct_idct_8x8_add_10, 4, 6 + ARCH_X86_64, 10, \
     mov               cntd, 2
     mov               ptrq, rsp
 .loop_2:
-    IDCT8_1D          ptrq
+    IDCT8_1D          ptrq, reg_rnd, reg_mask
     pxor                m6, m6
-    PRELOAD              9, rsp+16*mmsize, max
-    ROUND_AND_STORE_4x4  0, 1, 2, 3, m6, reg_max, [pd_16], 5
+    ROUND_AND_STORE_4x4  0, 1, 2, 3, m6, reg_max, reg_srnd, 5
     lea               dstq, [dstq+strideq*4]
     UNSCRATCH            0, 8, rsp+17*mmsize
-    ROUND_AND_STORE_4x4  4, 5, 0, 7, m6, reg_max, [pd_16], 5
+    UNSCRATCH            1, 12, rsp+16*mmsize, max
+    UNSCRATCH            2, 13, pd_16, srnd
+    ROUND_AND_STORE_4x4  4, 5, 0, 7, m6, m1, m2, 5
     add               ptrq, 16
 %if ARCH_X86_64
     lea               dstq, [dstbakq+8]
@@ -763,8 +779,8 @@ cglobal vp9_idct_idct_8x8_add_10, 4, 6 + ARCH_X86_64, 10, \
 %endmacro

 INIT_XMM sse2
-cglobal vp9_idct_idct_8x8_add_12, 4, 6 + ARCH_X86_64, 10, \
-                                  17 * mmsize + 2 * ARCH_X86_32 * mmsize, \
+cglobal vp9_idct_idct_8x8_add_12, 4, 6 + ARCH_X86_64, 14, \
+                                  16 * mmsize + 3 * ARCH_X86_32 * mmsize, \
                                   dst, stride, block, eob
     mova                m0, [pw_4095]
     cmp               eobd, 1
@@ -791,9 +807,9 @@ cglobal vp9_idct_idct_8x8_add_12, 4, 6 + ARCH_X86_64, 10, \
 ;
 ; dst1[hi]:dst3[lo] = src1 * coef1 + src2 * coef2
 ; dst2[hi]:dst4[lo] = src1 * coef2 - src2 * coef1
-%macro SUMSUB_MUL_D 6 ; src/dst 1-2, dst3-4, coef1-2
-    pand               m%3, m%1, [pd_3fff]
-    pand               m%4, m%2, [pd_3fff]
+%macro SUMSUB_MUL_D 6-7 [pd_3fff] ; src/dst 1-2, dst3-4, coef1-2, mask
+    pand               m%3, m%1, %7
+    pand               m%4, m%2, %7
     psrad              m%1, 14
     psrad              m%2, 14
     packssdw           m%4, m%2
@@ -808,11 +824,11 @@ cglobal vp9_idct_idct_8x8_add_12, 4, 6 + ARCH_X86_64, 10, \
 ; dst1 = src2[hi]:src4[lo] + src1[hi]:src3[lo] + rnd >> 14
 ; dst2 = src2[hi]:src4[lo] - src1[hi]:src3[lo] + rnd >> 14
-%macro SUMSUB_PACK_D 5 ; src/dst 1-2, src3-4, tmp
+%macro SUMSUB_PACK_D 5-6 [pd_8192] ; src/dst 1-2, src3-4, tmp, rnd
     SUMSUB_BA            d, %1, %2, %5
     SUMSUB_BA            d, %3, %4, %5
-    paddd              m%3, [pd_8192]
-    paddd              m%4, [pd_8192]
+    paddd              m%3, %6
+    paddd              m%4, %6
     psrad              m%3, 14
     psrad              m%4, 14
     paddd              m%1, m%3
@@ -830,17 +846,17 @@ cglobal vp9_idct_idct_8x8_add_12, 4, 6 + ARCH_X86_64, 10, \
 ; the following line has not been executed at the end of this macro:
 ; UNSCRATCH 6, 8, rsp+17*mmsize
-%macro IADST8_1D 1 ; src
+%macro IADST8_1D 1-3 [pd_8192], [pd_3fff] ; src, rnd, mask
     mova                m0, [%1+ 0*mmsize]
     mova                m3, [%1+ 6*mmsize]
     mova                m4, [%1+ 8*mmsize]
     mova                m7, [%1+14*mmsize]
-    SUMSUB_MUL_D         7, 0, 1, 2, 16305,  1606 ; m7/1=t0a, m0/2=t1a
-    SUMSUB_MUL_D         3, 4, 5, 6, 10394, 12665 ; m3/5=t4a, m4/6=t5a
+    SUMSUB_MUL_D         7, 0, 1, 2, 16305,  1606, %3 ; m7/1=t0a, m0/2=t1a
+    SUMSUB_MUL_D         3, 4, 5, 6, 10394, 12665, %3 ; m3/5=t4a, m4/6=t5a
     SCRATCH              0, 8, rsp+17*mmsize
-    SUMSUB_PACK_D        3, 7, 5, 1, 0 ; m3=t0, m7=t4
+    SUMSUB_PACK_D        3, 7, 5, 1, 0, %2 ; m3=t0, m7=t4
     UNSCRATCH            0, 8, rsp+17*mmsize
-    SUMSUB_PACK_D        4, 0, 6, 2, 1 ; m4=t1, m0=t5
+    SUMSUB_PACK_D        4, 0, 6, 2, 1, %2 ; m4=t1, m0=t5
     SCRATCH              3, 8, rsp+17*mmsize
     SCRATCH              4, 9, rsp+18*mmsize
@@ -851,26 +867,26 @@ cglobal vp9_idct_idct_8x8_add_12, 4, 6 + ARCH_X86_64, 10, \
     mova                m2, [%1+ 4*mmsize]
     mova                m5, [%1+10*mmsize]
     mova                m6, [%1+12*mmsize]
-    SUMSUB_MUL_D         5, 2, 3, 4, 14449,  7723 ; m5/8=t2a, m2/9=t3a
-    SUMSUB_MUL_D         1, 6, 7, 0,  4756, 15679 ; m1/10=t6a, m6/11=t7a
+    SUMSUB_MUL_D         5, 2, 3, 4, 14449,  7723, %3 ; m5/8=t2a, m2/9=t3a
+    SUMSUB_MUL_D         1, 6, 7, 0,  4756, 15679, %3 ; m1/10=t6a, m6/11=t7a
     SCRATCH              2, 12, rsp+21*mmsize
-    SUMSUB_PACK_D        1, 5, 7, 3, 2 ; m1=t2, m5=t6
+    SUMSUB_PACK_D        1, 5, 7, 3, 2, %2 ; m1=t2, m5=t6
     UNSCRATCH            2, 12, rsp+21*mmsize
-    SUMSUB_PACK_D        6, 2, 0, 4, 3 ; m6=t3, m2=t7
+    SUMSUB_PACK_D        6, 2, 0, 4, 3, %2 ; m6=t3, m2=t7
     UNSCRATCH            7, 10, rsp+19*mmsize
     UNSCRATCH            0, 11, rsp+20*mmsize
     SCRATCH              1, 10, rsp+19*mmsize
     SCRATCH              6, 11, rsp+20*mmsize
-    SUMSUB_MUL_D         7, 0, 3, 4, 15137,  6270 ; m7/8=t4a, m0/9=t5a
-    SUMSUB_MUL_D         2, 5, 1, 6,  6270, 15137 ; m2/10=t7a, m5/11=t6a
+    SUMSUB_MUL_D         7, 0, 3, 4, 15137,  6270, %3 ; m7/8=t4a, m0/9=t5a
+    SUMSUB_MUL_D         2, 5, 1, 6,  6270, 15137, %3 ; m2/10=t7a, m5/11=t6a
     SCRATCH              2, 12, rsp+21*mmsize
-    SUMSUB_PACK_D        5, 7, 6, 3, 2 ; m5=-out1, m7=t6
+    SUMSUB_PACK_D        5, 7, 6, 3, 2, %2 ; m5=-out1, m7=t6
     UNSCRATCH            2, 12, rsp+21*mmsize
     NEGD                m5 ; m5=out1
-    SUMSUB_PACK_D        2, 0, 1, 4, 3 ; m2=out6, m0=t7
-    SUMSUB_MUL           7, 0, 3, 4, 11585, 11585 ; m7=out2, m0=-out5
+    SUMSUB_PACK_D        2, 0, 1, 4, 3, %2 ; m2=out6, m0=t7
+    SUMSUB_MUL           7, 0, 3, 4, 11585, 11585, %2, %3 ; m7=out2, m0=-out5
     NEGD                m0 ; m0=out5
     UNSCRATCH            3, 8, rsp+17*mmsize
@@ -883,7 +899,7 @@ cglobal vp9_idct_idct_8x8_add_12, 4, 6 + ARCH_X86_64, 10, \
     SUMSUB_BA            d, 1, 3, 2 ; m1=out0, m3=t2
     SUMSUB_BA            d, 6, 4, 2 ; m6=-out7, m4=t3
     NEGD                m6 ; m6=out7
-    SUMSUB_MUL           3, 4, 2, 0, 11585, 11585 ; m3=-out3, m4=out4
+    SUMSUB_MUL           3, 4, 2, 0, 11585, 11585, %2, %3 ; m3=-out3, m4=out4
     NEGD                m3 ; m3=out3
     UNSCRATCH            0, 9, rsp+18*mmsize
@@ -899,7 +915,7 @@ cglobal vp9_%1_%3_8x8_add_10, 4, 6 + ARCH_X86_64, 16, \
     mova                m0, [pw_1023]
 .body:
-    mova [rsp+16*mmsize], m0
+    SCRATCH              0, 13, rsp+16*mmsize, max
     DEFINE_ARGS        dst, stride, block, cnt, ptr, skip, dstbak
 %if ARCH_X86_64
     mov            dstbakq, dstq
@@ -914,8 +930,10 @@ cglobal vp9_%1_%3_8x8_add_10, 4, 6 + ARCH_X86_64, 16, \
     mov              skipd, 2
     sub              skipd, cntd
     mov               ptrq, rsp
+    PRELOAD             14, pd_8192, rnd
+    PRELOAD             15, pd_3fff, mask
 .loop_1:
-    %2_1D           blockq
+    %2_1D           blockq, reg_rnd, reg_mask
     TRANSPOSE4x4D        0, 1, 2, 3, 6
     mova   [ptrq+0*mmsize], m0
@@ -954,14 +972,16 @@ cglobal vp9_%1_%3_8x8_add_10, 4, 6 + ARCH_X86_64, 16, \
     mov               cntd, 2
     mov               ptrq, rsp
 .loop_2:
-    %4_1D             ptrq
+    %4_1D             ptrq, reg_rnd, reg_mask
     pxor                m6, m6
-    PRELOAD              9, rsp+16*mmsize, max
-    ROUND_AND_STORE_4x4  0, 1, 2, 3, m6, reg_max, [pd_16], 5
+    PRELOAD              9, pd_16, srnd
+    ROUND_AND_STORE_4x4  0, 1, 2, 3, m6, reg_max, reg_srnd, 5
     lea               dstq, [dstq+strideq*4]
     UNSCRATCH            0, 8, rsp+17*mmsize
-    ROUND_AND_STORE_4x4  4, 5, 0, 7, m6, reg_max, [pd_16], 5
+    UNSCRATCH            1, 13, rsp+16*mmsize, max
+    UNSCRATCH            2, 9, pd_16, srnd
+    ROUND_AND_STORE_4x4  4, 5, 0, 7, m6, m1, m2, 5
     add               ptrq, 16
 %if ARCH_X86_64
     lea               dstq, [dstbakq+8]
@@ -989,7 +1009,7 @@ IADST8_FN iadst, IADST8, idct, IDCT8, col
 IADST8_FN iadst, IADST8, iadst, IADST8, default

 %macro IDCT16_1D 1-4 4 * mmsize, 65, 67 ; src, src_stride, stack_offset, mm32bit_stack_offset
-    IDCT8_1D            %1, %2 * 2, %4 ; m0-3=t0-3a, m4-5/m8|r67/m7=t4-7
+    IDCT8_1D            %1, [pd_8192], [pd_3fff], %2 * 2, %4 ; m0-3=t0-3a, m4-5/m8|r67/m7=t4-7
     ; SCRATCH            6, 8, rsp+(%4+0)*mmsize ; t6
     SCRATCH              0, 15, rsp+(%4+7)*mmsize ; t0a
     SCRATCH              1, 14, rsp+(%4+6)*mmsize ; t1a
@@ -1186,7 +1206,9 @@ cglobal vp9_idct_idct_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
     ROUND_AND_STORE_4x4  0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6
     lea               dstq, [dstq+strideq*4]
     mova                m0, [rsp+65*mmsize]
-    ROUND_AND_STORE_4x4  4, 5, 6, 0, m7, [rsp+64*mmsize], [pd_32], 6
+    mova                m1, [rsp+64*mmsize]
+    mova                m2, [pd_32]
+    ROUND_AND_STORE_4x4  4, 5, 6, 0, m7, m1, m2, 6
 %if ARCH_X86_64
     DEFINE_ARGS     dstbak, stride, block, cnt, ptr, stride3, dst
@@ -1194,10 +1216,10 @@ cglobal vp9_idct_idct_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
     mov               dstq, dstm
 %endif
     UNSCRATCH            0, 8, rsp+67*mmsize
-    UNSCRATCH            1, 9, rsp+68*mmsize
-    UNSCRATCH            2, 10, rsp+69*mmsize
+    UNSCRATCH            4, 9, rsp+68*mmsize
+    UNSCRATCH            5, 10, rsp+69*mmsize
     UNSCRATCH            3, 11, rsp+70*mmsize
-    ROUND_AND_STORE_4x4  0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6
+    ROUND_AND_STORE_4x4  0, 4, 5, 3, m7, m1, m2, 6
 %if ARCH_X86_64
     DEFINE_ARGS        dst, stride, block, cnt, ptr, stride3, dstbak
     lea               dstq, [dstbakq+stride3q*4]
@@ -1208,7 +1230,7 @@ cglobal vp9_idct_idct_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
     UNSCRATCH            5, 13, rsp+72*mmsize
     UNSCRATCH            6, 14, rsp+73*mmsize
     UNSCRATCH            0, 15, rsp+74*mmsize
-    ROUND_AND_STORE_4x4  4, 5, 6, 0, m7, [rsp+64*mmsize], [pd_32], 6
+    ROUND_AND_STORE_4x4  4, 5, 6, 0, m7, m1, m2, 6
     add               ptrq, mmsize
 %if ARCH_X86_64
@@ -1501,7 +1523,9 @@ cglobal vp9_%1_%4_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
     ROUND_AND_STORE_4x4  0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6
     lea               dstq, [dstq+strideq*4]
     mova                m0, [rsp+65*mmsize]
-    ROUND_AND_STORE_4x4  4, 5, 6, 0, m7, [rsp+64*mmsize], [pd_32], 6
+    mova                m1, [rsp+64*mmsize]
+    mova                m2, [pd_32]
+    ROUND_AND_STORE_4x4  4, 5, 6, 0, m7, m1, m2, 6
 %if ARCH_X86_64
     DEFINE_ARGS     dstbak, stride, block, cnt, ptr, stride3, dst
@@ -1509,10 +1533,10 @@ cglobal vp9_%1_%4_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
     mov               dstq, dstm
 %endif
     UNSCRATCH            0, 8, rsp+(%6+0)*mmsize
-    UNSCRATCH            1, 9, rsp+(%6+1)*mmsize
-    UNSCRATCH            2, 10, rsp+(%6+2)*mmsize
+    UNSCRATCH            4, 9, rsp+(%6+1)*mmsize
+    UNSCRATCH            5, 10, rsp+(%6+2)*mmsize
     UNSCRATCH            3, 11, rsp+(%6+3)*mmsize
-    ROUND_AND_STORE_4x4  0, 1, 2, 3, m7, [rsp+64*mmsize], [pd_32], 6
+    ROUND_AND_STORE_4x4  0, 4, 5, 3, m7, m1, m2, 6
 %if ARCH_X86_64
     DEFINE_ARGS        dst, stride, block, cnt, ptr, stride3, dstbak
     lea               dstq, [dstbakq+stride3q*4]
@@ -1523,7 +1547,7 @@ cglobal vp9_%1_%4_16x16_add_10, 4, 6 + ARCH_X86_64, 16, \
     UNSCRATCH            5, 13, rsp+(%6+5)*mmsize
     UNSCRATCH            6, 14, rsp+(%6+6)*mmsize
     UNSCRATCH            0, 15, rsp+(%6+7)*mmsize
-    ROUND_AND_STORE_4x4  4, 5, 6, 0, m7, [rsp+64*mmsize], [pd_32], 6
+    ROUND_AND_STORE_4x4  4, 5, 6, 0, m7, m1, m2, 6
     add               ptrq, mmsize
 %if ARCH_X86_64
...