Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
F
ffmpeg.wasm-core
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Linshizhi
ffmpeg.wasm-core
Commits
21ffc78f
Commit
21ffc78f
authored
Mar 03, 2012
by
Ronald S. Bultje
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
vp8: convert mc x86 assembly to use named arguments.
parent
28170f1a
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
272 additions
and
272 deletions
+272
-272
vp8dsp.asm
libavcodec/x86/vp8dsp.asm
+272
-272
No files found.
libavcodec/x86/vp8dsp.asm
View file @
21ffc78f
...
...
@@ -116,23 +116,25 @@ bilinear_filter_vb_m: times 8 db 7, 1
times
8
db
1
,
7
%ifdef
PIC
%define
fourtap_filter_hw
r11
%define
sixtap_filter_hw
r11
%define
fourtap_filter_hb
r11
%define
sixtap_filter_hb
r11
%define
fourtap_filter_v
r11
%define
sixtap_filter_v
r11
%define
bilinear_filter_vw
r11
%define
bilinear_filter_vb
r11
%define
fourtap_filter_hw
picregq
%define
sixtap_filter_hw
picregq
%define
fourtap_filter_hb
picregq
%define
sixtap_filter_hb
picregq
%define
fourtap_filter_v
picregq
%define
sixtap_filter_v
picregq
%define
bilinear_filter_vw
picregq
%define
bilinear_filter_vb
picregq
%define
npicregs
1
%else
%define
fourtap_filter_hw
fourtap_filter_hw_m
%define
sixtap_filter_hw
sixtap_filter_hw_m
%define
fourtap_filter_hb
fourtap_filter_hb_m
%define
sixtap_filter_hb
sixtap_filter_hb_m
%define
fourtap_filter_v
fourtap_filter_v_m
%define
sixtap_filter_v
sixtap_filter_v_m
%define
fourtap_filter_hw
fourtap_filter_hw_m
%define
sixtap_filter_hw
sixtap_filter_hw_m
%define
fourtap_filter_hb
fourtap_filter_hb_m
%define
sixtap_filter_hb
sixtap_filter_hb_m
%define
fourtap_filter_v
fourtap_filter_v_m
%define
sixtap_filter_v
sixtap_filter_v_m
%define
bilinear_filter_vw
bilinear_filter_vw_m
%define
bilinear_filter_vb
bilinear_filter_vb_m
%define
npicregs
0
%endif
filter_h2_shuf
:
db
0
,
1
,
1
,
2
,
2
,
3
,
3
,
4
,
4
,
5
,
5
,
6
,
6
,
7
,
7
,
8
...
...
@@ -174,25 +176,25 @@ SECTION .text
;-----------------------------------------------------------------------------
%macro
FILTER_SSSE3
1
cglobal
put_vp8_epel
%1
_h6
,
6
,
6
,
8
lea
r5d
,
[
r5
*
3
]
cglobal
put_vp8_epel
%1
_h6
,
6
,
6
+
npicregs
,
8
,
dst
,
dststride
,
src
,
srcstride
,
height
,
mx
,
picreg
lea
mxd
,
[
mxq
*
3
]
mova
m3
,
[
filter_h6_shuf2
]
mova
m4
,
[
filter_h6_shuf3
]
%ifdef
PIC
lea
r11
,
[
sixtap_filter_hb_m
]
lea
picregq
,
[
sixtap_filter_hb_m
]
%endif
mova
m5
,
[
sixtap_filter_hb
+
r5
*
8
-
48
]
; set up 6tap filter in bytes
mova
m6
,
[
sixtap_filter_hb
+
r5
*
8
-
32
]
mova
m7
,
[
sixtap_filter_hb
+
r5
*
8
-
16
]
mova
m5
,
[
sixtap_filter_hb
+
mxq
*
8
-
48
]
; set up 6tap filter in bytes
mova
m6
,
[
sixtap_filter_hb
+
mxq
*
8
-
32
]
mova
m7
,
[
sixtap_filter_hb
+
mxq
*
8
-
16
]
.
nextrow
movu
m0
,
[
r2
-
2
]
movu
m0
,
[
srcq
-
2
]
mova
m1
,
m0
mova
m2
,
m0
%if
mmsize
==
8
; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
; shuffle with a memory operand
punpcklbw
m0
,
[
r2
+
3
]
punpcklbw
m0
,
[
srcq
+
3
]
%else
pshufb
m0
,
[
filter_h6_shuf1
]
%endif
...
...
@@ -206,28 +208,28 @@ cglobal put_vp8_epel%1_h6, 6, 6, 8
paddsw
m0
,
[
pw_64
]
psraw
m0
,
7
packuswb
m0
,
m0
movh
[r0
],
m0
; store
movh
[dstq
],
m0
; store
; go to next line
add
r0
,
r1
add
r2
,
r3
dec
r4
d
; next row
add
dstq
,
dststrideq
add
srcq
,
srcstrideq
dec
height
d
; next row
jg
.
nextrow
REP_RET
cglobal
put_vp8_epel
%1
_h4
,
6
,
6
,
7
shl
r5
d
,
4
cglobal
put_vp8_epel
%1
_h4
,
6
,
6
+
npicregs
,
7
,
dst
,
dststride
,
src
,
srcstride
,
height
,
mx
,
picreg
shl
mx
d
,
4
mova
m2
,
[
pw_64
]
mova
m3
,
[
filter_h2_shuf
]
mova
m4
,
[
filter_h4_shuf
]
%ifdef
PIC
lea
r11
,
[
fourtap_filter_hb_m
]
lea
picregq
,
[
fourtap_filter_hb_m
]
%endif
mova
m5
,
[
fourtap_filter_hb
+
r5
-
16
]
; set up 4tap filter in bytes
mova
m6
,
[
fourtap_filter_hb
+
r5
]
mova
m5
,
[
fourtap_filter_hb
+
mxq
-
16
]
; set up 4tap filter in bytes
mova
m6
,
[
fourtap_filter_hb
+
mxq
]
.
nextrow
movu
m0
,
[
r2
-
1
]
movu
m0
,
[
srcq
-
1
]
mova
m1
,
m0
pshufb
m0
,
m3
pshufb
m1
,
m4
...
...
@@ -237,33 +239,33 @@ cglobal put_vp8_epel%1_h4, 6, 6, 7
paddsw
m0
,
m1
psraw
m0
,
7
packuswb
m0
,
m0
movh
[r0
],
m0
; store
movh
[dstq
],
m0
; store
; go to next line
add
r0
,
r1
add
r2
,
r3
dec
r4
d
; next row
add
dstq
,
dststrideq
add
srcq
,
srcstrideq
dec
height
d
; next row
jg
.
nextrow
REP_RET
cglobal
put_vp8_epel
%1
_v4
,
7
,
7
,
8
shl
r6
d
,
4
cglobal
put_vp8_epel
%1
_v4
,
7
,
7
,
8
,
dst
,
dststride
,
src
,
srcstride
,
height
,
picreg
,
my
shl
my
d
,
4
%ifdef
PIC
lea
r11
,
[
fourtap_filter_hb_m
]
lea
picregq
,
[
fourtap_filter_hb_m
]
%endif
mova
m5
,
[
fourtap_filter_hb
+
r6
-
16
]
mova
m6
,
[
fourtap_filter_hb
+
r6
]
mova
m5
,
[
fourtap_filter_hb
+
myq
-
16
]
mova
m6
,
[
fourtap_filter_hb
+
myq
]
mova
m7
,
[
pw_64
]
; read 3 lines
sub
r2
,
r3
movh
m0
,
[
r2
]
movh
m1
,
[
r2
+
r3
]
movh
m2
,
[
r2
+
2
*
r3
]
add
r2
,
r3
sub
srcq
,
srcstrideq
movh
m0
,
[
srcq
]
movh
m1
,
[
srcq
+
srcstrideq
]
movh
m2
,
[
srcq
+
2
*
srcstrideq
]
add
srcq
,
srcstrideq
.
nextrow
movh
m3
,
[
r2
+
2
*
r3
]
; read new row
movh
m3
,
[
srcq
+
2
*
srcstrideq
]
; read new row
mova
m4
,
m0
mova
m0
,
m1
punpcklbw
m4
,
m1
...
...
@@ -276,44 +278,44 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8
paddsw
m4
,
m7
psraw
m4
,
7
packuswb
m4
,
m4
movh
[r0
],
m4
movh
[dstq
],
m4
; go to next line
add
r0
,
r1
add
r2
,
r3
dec
r4
d
; next row
add
dstq
,
dststrideq
add
srcq
,
srcstrideq
dec
height
d
; next row
jg
.
nextrow
REP_RET
cglobal
put_vp8_epel
%1
_v6
,
7
,
7
,
8
lea
r6d
,
[
r6
*
3
]
cglobal
put_vp8_epel
%1
_v6
,
7
,
7
,
8
,
dst
,
dststride
,
src
,
srcstride
,
height
,
picreg
,
my
lea
myd
,
[
myq
*
3
]
%ifdef
PIC
lea
r11
,
[
sixtap_filter_hb_m
]
lea
picregq
,
[
sixtap_filter_hb_m
]
%endif
lea
r6
,
[
sixtap_filter_hb
+
r6
*
8
]
lea
myq
,
[
sixtap_filter_hb
+
myq
*
8
]
; read 5 lines
sub
r2
,
r3
sub
r2
,
r3
movh
m0
,
[
r2
]
movh
m1
,
[
r2
+
r3
]
movh
m2
,
[
r2
+
r3
*
2
]
lea
r2
,
[
r2
+
r3
*
2
]
add
r2
,
r3
movh
m3
,
[
r2
]
movh
m4
,
[
r2
+
r3
]
sub
srcq
,
srcstrideq
sub
srcq
,
srcstrideq
movh
m0
,
[
srcq
]
movh
m1
,
[
srcq
+
srcstrideq
]
movh
m2
,
[
srcq
+
srcstrideq
*
2
]
lea
srcq
,
[
srcq
+
srcstrideq
*
2
]
add
srcq
,
srcstrideq
movh
m3
,
[
srcq
]
movh
m4
,
[
srcq
+
srcstrideq
]
.
nextrow
movh
m5
,
[
r2
+
2
*
r3
]
; read new row
movh
m5
,
[
srcq
+
2
*
srcstrideq
]
; read new row
mova
m6
,
m0
punpcklbw
m6
,
m5
mova
m0
,
m1
punpcklbw
m1
,
m2
mova
m7
,
m3
punpcklbw
m7
,
m4
pmaddubsw
m6
,
[
r6
-
48
]
pmaddubsw
m1
,
[
r6
-
32
]
pmaddubsw
m7
,
[
r6
-
16
]
pmaddubsw
m6
,
[
myq
-
48
]
pmaddubsw
m1
,
[
myq
-
32
]
pmaddubsw
m7
,
[
myq
-
16
]
paddsw
m6
,
m1
paddsw
m6
,
m7
mova
m1
,
m2
...
...
@@ -323,12 +325,12 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8
mova
m3
,
m4
packuswb
m6
,
m6
mova
m4
,
m5
movh
[r0
],
m6
movh
[dstq
],
m6
; go to next line
add
r0
,
r1
add
r2
,
r3
dec
r4
d
; next row
add
dstq
,
dststrideq
add
srcq
,
srcstrideq
dec
height
d
; next row
jg
.
nextrow
REP_RET
%endmacro
...
...
@@ -340,18 +342,18 @@ FILTER_SSSE3 8
; 4x4 block, H-only 4-tap filter
INIT_MMX
mmx2
cglobal
put_vp8_epel4_h4
,
6
,
6
shl
r5
d
,
4
cglobal
put_vp8_epel4_h4
,
6
,
6
+
npicregs
,
0
,
dst
,
dststride
,
src
,
srcstride
,
height
,
mx
,
picreg
shl
mx
d
,
4
%ifdef
PIC
lea
r11
,
[
fourtap_filter_hw_m
]
lea
picregq
,
[
fourtap_filter_hw_m
]
%endif
movq
mm4
,
[
fourtap_filter_hw
+
r5
-
16
]
; set up 4tap filter in words
movq
mm5
,
[
fourtap_filter_hw
+
r5
]
movq
mm4
,
[
fourtap_filter_hw
+
mxq
-
16
]
; set up 4tap filter in words
movq
mm5
,
[
fourtap_filter_hw
+
mxq
]
movq
mm7
,
[
pw_64
]
pxor
mm6
,
mm6
.
nextrow
movq
mm1
,
[
r2
-
1
]
; (ABCDEFGH) load 8 horizontal pixels
movq
mm1
,
[
srcq
-
1
]
; (ABCDEFGH) load 8 horizontal pixels
; first set of 2 pixels
movq
mm2
,
mm1
; byte ABCD..
...
...
@@ -377,30 +379,30 @@ cglobal put_vp8_epel4_h4, 6, 6
paddsw
mm3
,
mm7
; rounding
psraw
mm3
,
7
packuswb
mm3
,
mm6
; clip and word->bytes
movd
[r0
],
mm3
; store
movd
[dstq
],
mm3
; store
; go to next line
add
r0
,
r1
add
r2
,
r3
dec
r4
d
; next row
add
dstq
,
dststrideq
add
srcq
,
srcstrideq
dec
height
d
; next row
jg
.
nextrow
REP_RET
; 4x4 block, H-only 6-tap filter
INIT_MMX
mmx2
cglobal
put_vp8_epel4_h6
,
6
,
6
lea
r5d
,
[
r5
*
3
]
cglobal
put_vp8_epel4_h6
,
6
,
6
+
npicregs
,
0
,
dst
,
dststride
,
src
,
srcstride
,
height
,
mx
,
picreg
lea
mxd
,
[
mxq
*
3
]
%ifdef
PIC
lea
r11
,
[
sixtap_filter_hw_m
]
lea
picregq
,
[
sixtap_filter_hw_m
]
%endif
movq
mm4
,
[
sixtap_filter_hw
+
r5
*
8
-
48
]
; set up 4tap filter in words
movq
mm5
,
[
sixtap_filter_hw
+
r5
*
8
-
32
]
movq
mm6
,
[
sixtap_filter_hw
+
r5
*
8
-
16
]
movq
mm4
,
[
sixtap_filter_hw
+
mxq
*
8
-
48
]
; set up 4tap filter in words
movq
mm5
,
[
sixtap_filter_hw
+
mxq
*
8
-
32
]
movq
mm6
,
[
sixtap_filter_hw
+
mxq
*
8
-
16
]
movq
mm7
,
[
pw_64
]
pxor
mm3
,
mm3
.
nextrow
movq
mm1
,
[
r2
-
2
]
; (ABCDEFGH) load 8 horizontal pixels
movq
mm1
,
[
srcq
-
2
]
; (ABCDEFGH) load 8 horizontal pixels
; first set of 2 pixels
movq
mm2
,
mm1
; byte ABCD..
...
...
@@ -420,7 +422,7 @@ cglobal put_vp8_epel4_h6, 6, 6
paddd
mm1
,
mm2
; finish 1st 2px
; second set of 2 pixels, use backup of above
movd
mm2
,
[
r2
+
3
]
; byte FGHI (prevent overreads)
movd
mm2
,
[
srcq
+
3
]
; byte FGHI (prevent overreads)
pmaddwd
mm0
,
mm4
; multiply 1st backed up 2px with F0/F1
pmaddwd
mm3
,
mm5
; multiply 2nd backed up 2px with F2/F3
paddd
mm0
,
mm3
; add to 2nd 2px cache
...
...
@@ -435,35 +437,35 @@ cglobal put_vp8_epel4_h6, 6, 6
paddsw
mm1
,
mm7
; rounding
psraw
mm1
,
7
packuswb
mm1
,
mm3
; clip and word->bytes
movd
[r0
],
mm1
; store
movd
[dstq
],
mm1
; store
; go to next line
add
r0
,
r1
add
r2
,
r3
dec
r4
d
; next row
add
dstq
,
dststrideq
add
srcq
,
srcstrideq
dec
height
d
; next row
jg
.
nextrow
REP_RET
INIT_XMM
sse2
cglobal
put_vp8_epel8_h4
,
6
,
6
,
10
shl
r5
d
,
5
cglobal
put_vp8_epel8_h4
,
6
,
6
+
npicregs
,
10
,
dst
,
dststride
,
src
,
srcstride
,
height
,
mx
,
picreg
shl
mx
d
,
5
%ifdef
PIC
lea
r11
,
[
fourtap_filter_v_m
]
lea
picregq
,
[
fourtap_filter_v_m
]
%endif
lea
r5
,
[
fourtap_filter_v
+
r5
-
32
]
lea
mxq
,
[
fourtap_filter_v
+
mxq
-
32
]
pxor
m7
,
m7
mova
m4
,
[
pw_64
]
mova
m5
,
[
r5
+
0
]
mova
m6
,
[
r5
+
16
]
mova
m5
,
[
mxq
+
0
]
mova
m6
,
[
mxq
+
16
]
%ifdef
m8
mova
m8
,
[
r5
+
32
]
mova
m9
,
[
r5
+
48
]
mova
m8
,
[
mxq
+
32
]
mova
m9
,
[
mxq
+
48
]
%endif
.
nextrow
movq
m0
,
[
r2
-
1
]
movq
m1
,
[
r2
-
0
]
movq
m2
,
[
r2
+
1
]
movq
m3
,
[
r2
+
2
]
movq
m0
,
[
srcq
-
1
]
movq
m1
,
[
srcq
-
0
]
movq
m2
,
[
srcq
+
1
]
movq
m3
,
[
srcq
+
2
]
punpcklbw
m0
,
m7
punpcklbw
m1
,
m7
punpcklbw
m2
,
m7
...
...
@@ -474,8 +476,8 @@ cglobal put_vp8_epel8_h4, 6, 6, 10
pmullw
m2
,
m8
pmullw
m3
,
m9
%else
pmullw
m2
,
[
r5
+
32
]
pmullw
m3
,
[
r5
+
48
]
pmullw
m2
,
[
mxq
+
32
]
pmullw
m3
,
[
mxq
+
48
]
%endif
paddsw
m0
,
m1
paddsw
m2
,
m3
...
...
@@ -483,40 +485,40 @@ cglobal put_vp8_epel8_h4, 6, 6, 10
paddsw
m0
,
m4
psraw
m0
,
7
packuswb
m0
,
m7
movh
[r0
],
m0
; store
movh
[dstq
],
m0
; store
; go to next line
add
r0
,
r1
add
r2
,
r3
dec
r4
d
; next row
add
dstq
,
dststrideq
add
srcq
,
srcstrideq
dec
height
d
; next row
jg
.
nextrow
REP_RET
INIT_XMM
sse2
cglobal
put_vp8_epel8_h6
,
6
,
6
,
14
lea
r5d
,
[
r5
*
3
]
shl
r5
d
,
4
cglobal
put_vp8_epel8_h6
,
6
,
6
+
npicregs
,
14
,
dst
,
dststride
,
src
,
srcstride
,
height
,
mx
,
picreg
lea
mxd
,
[
mxq
*
3
]
shl
mx
d
,
4
%ifdef
PIC
lea
r11
,
[
sixtap_filter_v_m
]
lea
picregq
,
[
sixtap_filter_v_m
]
%endif
lea
r5
,
[
sixtap_filter_v
+
r5
-
96
]
lea
mxq
,
[
sixtap_filter_v
+
mxq
-
96
]
pxor
m7
,
m7
mova
m6
,
[
pw_64
]
%ifdef
m8
mova
m8
,
[
r5
+
0
]
mova
m9
,
[
r5
+
16
]
mova
m10
,
[
r5
+
32
]
mova
m11
,
[
r5
+
48
]
mova
m12
,
[
r5
+
64
]
mova
m13
,
[
r5
+
80
]
mova
m8
,
[
mxq
+
0
]
mova
m9
,
[
mxq
+
16
]
mova
m10
,
[
mxq
+
32
]
mova
m11
,
[
mxq
+
48
]
mova
m12
,
[
mxq
+
64
]
mova
m13
,
[
mxq
+
80
]
%endif
.
nextrow
movq
m0
,
[
r2
-
2
]
movq
m1
,
[
r2
-
1
]
movq
m2
,
[
r2
-
0
]
movq
m3
,
[
r2
+
1
]
movq
m4
,
[
r2
+
2
]
movq
m5
,
[
r2
+
3
]
movq
m0
,
[
srcq
-
2
]
movq
m1
,
[
srcq
-
1
]
movq
m2
,
[
srcq
-
0
]
movq
m3
,
[
srcq
+
1
]
movq
m4
,
[
srcq
+
2
]
movq
m5
,
[
srcq
+
3
]
punpcklbw
m0
,
m7
punpcklbw
m1
,
m7
punpcklbw
m2
,
m7
...
...
@@ -531,12 +533,12 @@ cglobal put_vp8_epel8_h6, 6, 6, 14
pmullw
m4
,
m12
pmullw
m5
,
m13
%else
pmullw
m0
,
[
r5
+
0
]
pmullw
m1
,
[
r5
+
16
]
pmullw
m2
,
[
r5
+
32
]
pmullw
m3
,
[
r5
+
48
]
pmullw
m4
,
[
r5
+
64
]
pmullw
m5
,
[
r5
+
80
]
pmullw
m0
,
[
mxq
+
0
]
pmullw
m1
,
[
mxq
+
16
]
pmullw
m2
,
[
mxq
+
32
]
pmullw
m3
,
[
mxq
+
48
]
pmullw
m4
,
[
mxq
+
64
]
pmullw
m5
,
[
mxq
+
80
]
%endif
paddsw
m1
,
m4
paddsw
m0
,
m5
...
...
@@ -546,52 +548,52 @@ cglobal put_vp8_epel8_h6, 6, 6, 14
paddsw
m0
,
m6
psraw
m0
,
7
packuswb
m0
,
m7
movh
[r0
],
m0
; store
movh
[dstq
],
m0
; store
; go to next line
add
r0
,
r1
add
r2
,
r3
dec
r4
d
; next row
add
dstq
,
dststrideq
add
srcq
,
srcstrideq
dec
height
d
; next row
jg
.
nextrow
REP_RET
%macro
FILTER_V
1
; 4x4 block, V-only 4-tap filter
cglobal
put_vp8_epel
%1
_v4
,
7
,
7
,
8
shl
r6
d
,
5
cglobal
put_vp8_epel
%1
_v4
,
7
,
7
,
8
,
dst
,
dststride
,
src
,
srcstride
,
height
,
picreg
,
my
shl
my
d
,
5
%ifdef
PIC
lea
r11
,
[
fourtap_filter_v_m
]
lea
picregq
,
[
fourtap_filter_v_m
]
%endif
lea
r6
,
[
fourtap_filter_v
+
r6
-
32
]
lea
myq
,
[
fourtap_filter_v
+
myq
-
32
]
mova
m6
,
[
pw_64
]
pxor
m7
,
m7
mova
m5
,
[
r6
+
48
]
mova
m5
,
[
myq
+
48
]
; read 3 lines
sub
r2
,
r3
movh
m0
,
[
r2
]
movh
m1
,
[
r2
+
r3
]
movh
m2
,
[
r2
+
2
*
r3
]
add
r2
,
r3
sub
srcq
,
srcstrideq
movh
m0
,
[
srcq
]
movh
m1
,
[
srcq
+
srcstrideq
]
movh
m2
,
[
srcq
+
2
*
srcstrideq
]
add
srcq
,
srcstrideq
punpcklbw
m0
,
m7
punpcklbw
m1
,
m7
punpcklbw
m2
,
m7
.
nextrow
; first calculate negative taps (to prevent losing positive overflows)
movh
m4
,
[
r2
+
2
*
r3
]
; read new row
movh
m4
,
[
srcq
+
2
*
srcstrideq
]
; read new row
punpcklbw
m4
,
m7
mova
m3
,
m4
pmullw
m0
,
[
r6
+
0
]
pmullw
m0
,
[
myq
+
0
]
pmullw
m4
,
m5
paddsw
m4
,
m0
; then calculate positive taps
mova
m0
,
m1
pmullw
m1
,
[
r6
+
16
]
pmullw
m1
,
[
myq
+
16
]
paddsw
m4
,
m1
mova
m1
,
m2
pmullw
m2
,
[
r6
+
32
]
pmullw
m2
,
[
myq
+
32
]
paddsw
m4
,
m2
mova
m2
,
m3
...
...
@@ -599,36 +601,36 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8
paddsw
m4
,
m6
psraw
m4
,
7
packuswb
m4
,
m7
movh
[r0
],
m4
movh
[dstq
],
m4
; go to next line
add
r0
,
r1
add
r2
,
r3
dec
r4
d
; next row
add
dstq
,
dststrideq
add
srcq
,
srcstrideq
dec
height
d
; next row
jg
.
nextrow
REP_RET
; 4x4 block, V-only 6-tap filter
cglobal
put_vp8_epel
%1
_v6
,
7
,
7
,
8
shl
r6
d
,
4
lea
r6
,
[
r6
*
3
]
cglobal
put_vp8_epel
%1
_v6
,
7
,
7
,
8
,
dst
,
dststride
,
src
,
srcstride
,
height
,
picreg
,
my
shl
my
d
,
4
lea
myq
,
[
myq
*
3
]
%ifdef
PIC
lea
r11
,
[
sixtap_filter_v_m
]
lea
picregq
,
[
sixtap_filter_v_m
]
%endif
lea
r6
,
[
sixtap_filter_v
+
r6
-
96
]
lea
myq
,
[
sixtap_filter_v
+
myq
-
96
]
pxor
m7
,
m7
; read 5 lines
sub
r2
,
r3
sub
r2
,
r3
movh
m0
,
[
r2
]
movh
m1
,
[
r2
+
r3
]
movh
m2
,
[
r2
+
r3
*
2
]
lea
r2
,
[
r2
+
r3
*
2
]
add
r2
,
r3
movh
m3
,
[
r2
]
movh
m4
,
[
r2
+
r3
]
sub
srcq
,
srcstrideq
sub
srcq
,
srcstrideq
movh
m0
,
[
srcq
]
movh
m1
,
[
srcq
+
srcstrideq
]
movh
m2
,
[
srcq
+
srcstrideq
*
2
]
lea
srcq
,
[
srcq
+
srcstrideq
*
2
]
add
srcq
,
srcstrideq
movh
m3
,
[
srcq
]
movh
m4
,
[
srcq
+
srcstrideq
]
punpcklbw
m0
,
m7
punpcklbw
m1
,
m7
punpcklbw
m2
,
m7
...
...
@@ -638,38 +640,38 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8
.
nextrow
; first calculate negative taps (to prevent losing positive overflows)
mova
m5
,
m1
pmullw
m5
,
[
r6
+
16
]
pmullw
m5
,
[
myq
+
16
]
mova
m6
,
m4
pmullw
m6
,
[
r6
+
64
]
pmullw
m6
,
[
myq
+
64
]
paddsw
m6
,
m5
; then calculate positive taps
movh
m5
,
[
r2
+
2
*
r3
]
; read new row
movh
m5
,
[
srcq
+
2
*
srcstrideq
]
; read new row
punpcklbw
m5
,
m7
pmullw
m0
,
[
r6
+
0
]
pmullw
m0
,
[
myq
+
0
]
paddsw
m6
,
m0
mova
m0
,
m1
mova
m1
,
m2
pmullw
m2
,
[
r6
+
32
]
pmullw
m2
,
[
myq
+
32
]
paddsw
m6
,
m2
mova
m2
,
m3
pmullw
m3
,
[
r6
+
48
]
pmullw
m3
,
[
myq
+
48
]
paddsw
m6
,
m3
mova
m3
,
m4
mova
m4
,
m5
pmullw
m5
,
[
r6
+
80
]
pmullw
m5
,
[
myq
+
80
]
paddsw
m6
,
m5
; round/clip/store
paddsw
m6
,
[
pw_64
]
psraw
m6
,
7
packuswb
m6
,
m7
movh
[r0
],
m6
movh
[dstq
],
m6
; go to next line
add
r0
,
r1
add
r2
,
r3
dec
r4
d
; next row
add
dstq
,
dststrideq
add
srcq
,
srcstrideq
dec
height
d
; next row
jg
.
nextrow
REP_RET
%endmacro
...
...
@@ -680,20 +682,19 @@ INIT_XMM sse2
FILTER_V
8
%macro
FILTER_BILINEAR
1
cglobal
put_vp8_bilinear
%1
_v
,
7
,
7
,
7
mov
r5d
,
8
*
16
shl
r6d
,
4
sub
r5d
,
r6d
cglobal
put_vp8_bilinear
%1
_v
,
7
,
7
,
7
,
dst
,
dststride
,
src
,
srcstride
,
height
,
picreg
,
my
shl
myd
,
4
%ifdef
PIC
lea
r11
,
[
bilinear_filter_vw_m
]
lea
picregq
,
[
bilinear_filter_vw_m
]
%endif
pxor
m6
,
m6
mova
m4
,
[
bilinear_filter_vw
+
r5
-
16
]
mova
m5
,
[
bilinear_filter_vw
+
r6
-
16
]
mova
m5
,
[
bilinear_filter_vw
+
myq
-
1
*
16
]
neg
myq
mova
m4
,
[
bilinear_filter_vw
+
myq
+
7
*
16
]
.
nextrow
movh
m0
,
[
r2
+
r3
*
0
]
movh
m1
,
[
r2
+
r3
*
1
]
movh
m3
,
[
r2
+
r3
*
2
]
movh
m0
,
[
srcq
+
srcstrideq
*
0
]
movh
m1
,
[
srcq
+
srcstrideq
*
1
]
movh
m3
,
[
srcq
+
srcstrideq
*
2
]
punpcklbw
m0
,
m6
punpcklbw
m1
,
m6
punpcklbw
m3
,
m6
...
...
@@ -711,35 +712,34 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 7
%if
mmsize
==
8
packuswb
m0
,
m0
packuswb
m2
,
m2
movh
[
r0
+
r1
*
0
]
,
m0
movh
[
r0
+
r1
*
1
]
,
m2
movh
[
dstq
+
dststrideq
*
0
]
,
m0
movh
[
dstq
+
dststrideq
*
1
]
,
m2
%else
packuswb
m0
,
m2
movh
[
r0
+
r1
*
0
]
,
m0
movhps
[
r0
+
r1
*
1
]
,
m0
movh
[
dstq
+
dststrideq
*
0
]
,
m0
movhps
[
dstq
+
dststrideq
*
1
]
,
m0
%endif
lea
r0
,
[
r0
+
r1
*
2
]
lea
r2
,
[
r2
+
r3
*
2
]
sub
r4
d
,
2
lea
dstq
,
[
dstq
+
dststrideq
*
2
]
lea
srcq
,
[
srcq
+
srcstrideq
*
2
]
sub
height
d
,
2
jg
.
nextrow
REP_RET
cglobal
put_vp8_bilinear
%1
_h
,
7
,
7
,
7
mov
r6d
,
8
*
16
shl
r5d
,
4
sub
r6d
,
r5d
cglobal
put_vp8_bilinear
%1
_h
,
6
,
6
+
npicregs
,
7
,
dst
,
dststride
,
src
,
srcstride
,
height
,
mx
,
picreg
shl
mxd
,
4
%ifdef
PIC
lea
r11
,
[
bilinear_filter_vw_m
]
lea
picregq
,
[
bilinear_filter_vw_m
]
%endif
pxor
m6
,
m6
mova
m4
,
[
bilinear_filter_vw
+
r6
-
16
]
mova
m5
,
[
bilinear_filter_vw
+
r5
-
16
]
mova
m5
,
[
bilinear_filter_vw
+
mxq
-
1
*
16
]
neg
mxq
mova
m4
,
[
bilinear_filter_vw
+
mxq
+
7
*
16
]
.
nextrow
movh
m0
,
[
r2
+
r3
*
0
+
0
]
movh
m1
,
[
r2
+
r3
*
0
+
1
]
movh
m2
,
[
r2
+
r3
*
1
+
0
]
movh
m3
,
[
r2
+
r3
*
1
+
1
]
movh
m0
,
[
srcq
+
srcstrideq
*
0
+
0
]
movh
m1
,
[
srcq
+
srcstrideq
*
0
+
1
]
movh
m2
,
[
srcq
+
srcstrideq
*
1
+
0
]
movh
m3
,
[
srcq
+
srcstrideq
*
1
+
1
]
punpcklbw
m0
,
m6
punpcklbw
m1
,
m6
punpcklbw
m2
,
m6
...
...
@@ -757,17 +757,17 @@ cglobal put_vp8_bilinear%1_h, 7, 7, 7
%if
mmsize
==
8
packuswb
m0
,
m0
packuswb
m2
,
m2
movh
[
r0
+
r1
*
0
]
,
m0
movh
[
r0
+
r1
*
1
]
,
m2
movh
[
dstq
+
dststrideq
*
0
]
,
m0
movh
[
dstq
+
dststrideq
*
1
]
,
m2
%else
packuswb
m0
,
m2
movh
[
r0
+
r1
*
0
]
,
m0
movhps
[
r0
+
r1
*
1
]
,
m0
movh
[
dstq
+
dststrideq
*
0
]
,
m0
movhps
[
dstq
+
dststrideq
*
1
]
,
m0
%endif
lea
r0
,
[
r0
+
r1
*
2
]
lea
r2
,
[
r2
+
r3
*
2
]
sub
r4
d
,
2
lea
dstq
,
[
dstq
+
dststrideq
*
2
]
lea
srcq
,
[
srcq
+
srcstrideq
*
2
]
sub
height
d
,
2
jg
.
nextrow
REP_RET
%endmacro
...
...
@@ -778,17 +778,17 @@ INIT_XMM sse2
FILTER_BILINEAR
8
%macro
FILTER_BILINEAR_SSSE3
1
cglobal
put_vp8_bilinear
%1
_v
,
7
,
7
,
5
shl
r6
d
,
4
cglobal
put_vp8_bilinear
%1
_v
,
7
,
7
,
5
,
dst
,
dststride
,
src
,
srcstride
,
height
,
picreg
,
my
shl
my
d
,
4
%ifdef
PIC
lea
r11
,
[
bilinear_filter_vb_m
]
lea
picregq
,
[
bilinear_filter_vb_m
]
%endif
pxor
m4
,
m4
mova
m3
,
[
bilinear_filter_vb
+
r6
-
16
]
mova
m3
,
[
bilinear_filter_vb
+
myq
-
16
]
.
nextrow
movh
m0
,
[
r2
+
r3
*
0
]
movh
m1
,
[
r2
+
r3
*
1
]
movh
m2
,
[
r2
+
r3
*
2
]
movh
m0
,
[
srcq
+
srcstrideq
*
0
]
movh
m1
,
[
srcq
+
srcstrideq
*
1
]
movh
m2
,
[
srcq
+
srcstrideq
*
2
]
punpcklbw
m0
,
m1
punpcklbw
m1
,
m2
pmaddubsw
m0
,
m3
...
...
@@ -800,31 +800,31 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 5
%if
mmsize
==
8
packuswb
m0
,
m0
packuswb
m1
,
m1
movh
[
r0
+
r1
*
0
]
,
m0
movh
[
r0
+
r1
*
1
]
,
m1
movh
[
dstq
+
dststrideq
*
0
]
,
m0
movh
[
dstq
+
dststrideq
*
1
]
,
m1
%else
packuswb
m0
,
m1
movh
[
r0
+
r1
*
0
]
,
m0
movhps
[
r0
+
r1
*
1
]
,
m0
movh
[
dstq
+
dststrideq
*
0
]
,
m0
movhps
[
dstq
+
dststrideq
*
1
]
,
m0
%endif
lea
r0
,
[
r0
+
r1
*
2
]
lea
r2
,
[
r2
+
r3
*
2
]
sub
r4
d
,
2
lea
dstq
,
[
dstq
+
dststrideq
*
2
]
lea
srcq
,
[
srcq
+
srcstrideq
*
2
]
sub
height
d
,
2
jg
.
nextrow
REP_RET
cglobal
put_vp8_bilinear
%1
_h
,
7
,
7
,
5
shl
r5
d
,
4
cglobal
put_vp8_bilinear
%1
_h
,
6
,
6
+
npicregs
,
5
,
dst
,
dststride
,
src
,
srcstride
,
height
,
mx
,
picreg
shl
mx
d
,
4
%ifdef
PIC
lea
r11
,
[
bilinear_filter_vb_m
]
lea
picregq
,
[
bilinear_filter_vb_m
]
%endif
pxor
m4
,
m4
mova
m2
,
[
filter_h2_shuf
]
mova
m3
,
[
bilinear_filter_vb
+
r5
-
16
]
mova
m3
,
[
bilinear_filter_vb
+
mxq
-
16
]
.
nextrow
movu
m0
,
[
r2
+
r3
*
0
]
movu
m1
,
[
r2
+
r3
*
1
]
movu
m0
,
[
srcq
+
srcstrideq
*
0
]
movu
m1
,
[
srcq
+
srcstrideq
*
1
]
pshufb
m0
,
m2
pshufb
m1
,
m2
pmaddubsw
m0
,
m3
...
...
@@ -836,17 +836,17 @@ cglobal put_vp8_bilinear%1_h, 7, 7, 5
%if
mmsize
==
8
packuswb
m0
,
m0
packuswb
m1
,
m1
movh
[
r0
+
r1
*
0
]
,
m0
movh
[
r0
+
r1
*
1
]
,
m1
movh
[
dstq
+
dststrideq
*
0
]
,
m0
movh
[
dstq
+
dststrideq
*
1
]
,
m1
%else
packuswb
m0
,
m1
movh
[
r0
+
r1
*
0
]
,
m0
movhps
[
r0
+
r1
*
1
]
,
m0
movh
[
dstq
+
dststrideq
*
0
]
,
m0
movhps
[
dstq
+
dststrideq
*
1
]
,
m0
%endif
lea
r0
,
[
r0
+
r1
*
2
]
lea
r2
,
[
r2
+
r3
*
2
]
sub
r4
d
,
2
lea
dstq
,
[
dstq
+
dststrideq
*
2
]
lea
srcq
,
[
srcq
+
srcstrideq
*
2
]
sub
height
d
,
2
jg
.
nextrow
REP_RET
%endmacro
...
...
@@ -857,47 +857,47 @@ INIT_XMM ssse3
FILTER_BILINEAR_SSSE3
8
INIT_MMX
mmx
cglobal
put_vp8_pixels8
,
5
,
5
cglobal
put_vp8_pixels8
,
5
,
5
,
0
,
dst
,
dststride
,
src
,
srcstride
,
height
.
nextrow
:
movq
mm0
,
[
r2
+
r3
*
0
]
movq
mm1
,
[
r2
+
r3
*
1
]
lea
r2
,
[
r2
+
r3
*
2
]
movq
[
r0
+
r1
*
0
]
,
mm0
movq
[
r0
+
r1
*
1
]
,
mm1
lea
r0
,
[
r0
+
r1
*
2
]
sub
r4
d
,
2
movq
mm0
,
[
srcq
+
srcstrideq
*
0
]
movq
mm1
,
[
srcq
+
srcstrideq
*
1
]
lea
srcq
,
[
srcq
+
srcstrideq
*
2
]
movq
[
dstq
+
dststrideq
*
0
]
,
mm0
movq
[
dstq
+
dststrideq
*
1
]
,
mm1
lea
dstq
,
[
dstq
+
dststrideq
*
2
]
sub
height
d
,
2
jg
.
nextrow
REP_RET
%if
ARCH_X86_32
INIT_MMX
mmx
cglobal
put_vp8_pixels16
,
5
,
5
cglobal
put_vp8_pixels16
,
5
,
5
,
0
,
dst
,
dststride
,
src
,
srcstride
,
height
.
nextrow
:
movq
mm0
,
[
r2
+
r3
*
0
+
0
]
movq
mm1
,
[
r2
+
r3
*
0
+
8
]
movq
mm2
,
[
r2
+
r3
*
1
+
0
]
movq
mm3
,
[
r2
+
r3
*
1
+
8
]
lea
r2
,
[
r2
+
r3
*
2
]
movq
[
r0
+
r1
*
0
+
0
]
,
mm0
movq
[
r0
+
r1
*
0
+
8
]
,
mm1
movq
[
r0
+
r1
*
1
+
0
]
,
mm2
movq
[
r0
+
r1
*
1
+
8
]
,
mm3
lea
r0
,
[
r0
+
r1
*
2
]
sub
r4
d
,
2
movq
mm0
,
[
srcq
+
srcstrideq
*
0
+
0
]
movq
mm1
,
[
srcq
+
srcstrideq
*
0
+
8
]
movq
mm2
,
[
srcq
+
srcstrideq
*
1
+
0
]
movq
mm3
,
[
srcq
+
srcstrideq
*
1
+
8
]
lea
srcq
,
[
srcq
+
srcstrideq
*
2
]
movq
[
dstq
+
dststrideq
*
0
+
0
]
,
mm0
movq
[
dstq
+
dststrideq
*
0
+
8
]
,
mm1
movq
[
dstq
+
dststrideq
*
1
+
0
]
,
mm2
movq
[
dstq
+
dststrideq
*
1
+
8
]
,
mm3
lea
dstq
,
[
dstq
+
dststrideq
*
2
]
sub
height
d
,
2
jg
.
nextrow
REP_RET
%endif
INIT_XMM
sse
cglobal
put_vp8_pixels16
,
5
,
5
,
2
cglobal
put_vp8_pixels16
,
5
,
5
,
2
,
dst
,
dststride
,
src
,
srcstride
,
height
.
nextrow
:
movups
xmm0
,
[
r2
+
r3
*
0
]
movups
xmm1
,
[
r2
+
r3
*
1
]
lea
r2
,
[
r2
+
r3
*
2
]
movaps
[
r0
+
r1
*
0
]
,
xmm0
movaps
[
r0
+
r1
*
1
]
,
xmm1
lea
r0
,
[
r0
+
r1
*
2
]
sub
r4
d
,
2
movups
xmm0
,
[
srcq
+
srcstrideq
*
0
]
movups
xmm1
,
[
srcq
+
srcstrideq
*
1
]
lea
srcq
,
[
srcq
+
srcstrideq
*
2
]
movaps
[
dstq
+
dststrideq
*
0
]
,
xmm0
movaps
[
dstq
+
dststrideq
*
1
]
,
xmm1
lea
dstq
,
[
dstq
+
dststrideq
*
2
]
sub
height
d
,
2
jg
.
nextrow
REP_RET
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment