Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
F
ffmpeg.wasm-core
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Linshizhi
ffmpeg.wasm-core
Commits
406fbd24
Commit
406fbd24
authored
Jul 22, 2011
by
Daniel Kang
Committed by
Ronald S. Bultje
Jul 22, 2011
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
H.264: Add optimizations to predict x86 assembly.
Signed-off-by:
Ronald S. Bultje
<
rsbultje@gmail.com
>
parent
505345ed
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
437 additions
and
714 deletions
+437
-714
h264_intrapred.asm
libavcodec/x86/h264_intrapred.asm
+2
-3
h264_intrapred_10bit.asm
libavcodec/x86/h264_intrapred_10bit.asm
+414
-703
h264_intrapred_init.c
libavcodec/x86/h264_intrapred_init.c
+21
-8
No files found.
libavcodec/x86/h264_intrapred.asm
View file @
406fbd24
...
@@ -2611,12 +2611,11 @@ cglobal pred4x4_down_left_mmxext, 3,3
...
@@ -2611,12 +2611,11 @@ cglobal pred4x4_down_left_mmxext, 3,3
punpckldq
m1
,
[r1]
punpckldq
m1
,
[r1]
movq
m2
,
m1
movq
m2
,
m1
movq
m3
,
m1
movq
m3
,
m1
movq
m4
,
m1
psllq
m1
,
8
psllq
m1
,
8
pxor
m2
,
m1
pxor
m2
,
m1
psrlq
m2
,
8
psrlq
m2
,
8
pxor
m
3
,
m2
pxor
m
2
,
m3
PRED4x4_LOWPASS
m0
,
m1
,
m
3
,
m4
,
m5
PRED4x4_LOWPASS
m0
,
m1
,
m
2
,
m3
,
m4
lea
r1
,
[
r0
+
r2
*
2
]
lea
r1
,
[
r0
+
r2
*
2
]
psrlq
m0
,
8
psrlq
m0
,
8
movd
[
r0
+
r2
*
1
]
,
m0
movd
[
r0
+
r2
*
1
]
,
m0
...
...
libavcodec/x86/h264_intrapred_10bit.asm
View file @
406fbd24
...
@@ -27,8 +27,6 @@
...
@@ -27,8 +27,6 @@
SECTION_RODATA
SECTION_RODATA
SECTION
.
text
cextern
pw_16
cextern
pw_16
cextern
pw_8
cextern
pw_8
cextern
pw_4
cextern
pw_4
...
@@ -42,6 +40,8 @@ pw_512: times 8 dw 512
...
@@ -42,6 +40,8 @@ pw_512: times 8 dw 512
pd_17
:
times
4
dd
17
pd_17
:
times
4
dd
17
pd_16
:
times
4
dd
16
pd_16
:
times
4
dd
16
SECTION
.
text
; dest, left, right, src
; dest, left, right, src
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
%macro
PRED4x4_LOWPASS
4
%macro
PRED4x4_LOWPASS
4
...
@@ -64,13 +64,11 @@ cglobal pred4x4_down_right_10_%1, 3,3
...
@@ -64,13 +64,11 @@ cglobal pred4x4_down_right_10_%1, 3,3
movq
m3
,
[r0]
movq
m3
,
[r0]
punpckhdq
m1
,
m2
punpckhdq
m1
,
m2
PALIGNR
m3
,
m1
,
10
,
m1
PALIGNR
m3
,
m1
,
10
,
m1
mova
m1
,
m3
movhps
m4
,
[
r1
+
r2
*
1
-
8
]
movhps
m4
,
[
r1
+
r2
*
1
-
8
]
PALIGNR
m3
,
m4
,
14
,
m4
PALIGNR
m0
,
m3
,
m4
,
14
,
m4
mova
m2
,
m3
movhps
m4
,
[
r1
+
r2
*
2
-
8
]
movhps
m4
,
[
r1
+
r2
*
2
-
8
]
PALIGNR
m
3
,
m4
,
14
,
m4
PALIGNR
m
2
,
m0
,
m4
,
14
,
m4
PRED4x4_LOWPASS
m0
,
m
3
,
m1
,
m2
PRED4x4_LOWPASS
m0
,
m
2
,
m3
,
m0
movq
[
r1
+
r2
*
2
]
,
m0
movq
[
r1
+
r2
*
2
]
,
m0
psrldq
m0
,
2
psrldq
m0
,
2
movq
[
r1
+
r2
*
1
]
,
m0
movq
[
r1
+
r2
*
1
]
,
m0
...
@@ -104,22 +102,20 @@ cglobal pred4x4_vertical_right_10_%1, 3,3,6
...
@@ -104,22 +102,20 @@ cglobal pred4x4_vertical_right_10_%1, 3,3,6
pavgw
m5
,
m0
pavgw
m5
,
m0
movhps
m1
,
[
r0
+
r2
*
1
-
8
]
movhps
m1
,
[
r0
+
r2
*
1
-
8
]
PALIGNR
m0
,
m1
,
14
,
m1
; ....t3t2t1t0ltl0
PALIGNR
m0
,
m1
,
14
,
m1
; ....t3t2t1t0ltl0
mova
m1
,
m0
movhps
m2
,
[
r0
+
r2
*
2
-
8
]
movhps
m2
,
[
r0
+
r2
*
2
-
8
]
PALIGNR
m0
,
m2
,
14
,
m2
; ..t3t2t1t0ltl0l1
PALIGNR
m1
,
m0
,
m2
,
14
,
m2
; ..t3t2t1t0ltl0l1
mova
m2
,
m0
movhps
m3
,
[
r1
+
r2
*
1
-
8
]
movhps
m3
,
[
r1
+
r2
*
1
-
8
]
PALIGNR
m
0
,
m3
,
14
,
m3
; t3t2t1t0ltl0l1l2
PALIGNR
m
2
,
m1
,
m3
,
14
,
m3
; t3t2t1t0ltl0l1l2
PRED4x4_LOWPASS
m
3
,
m1
,
m0
,
m2
PRED4x4_LOWPASS
m
1
,
m0
,
m2
,
m1
pslldq
m
1
,
m3
,
12
pslldq
m
0
,
m1
,
12
psrldq
m
3
,
4
psrldq
m
1
,
4
movq
[
r0
+
r2
*
1
]
,
m5
movq
[
r0
+
r2
*
1
]
,
m5
movq
[
r0
+
r2
*
2
]
,
m
3
movq
[
r0
+
r2
*
2
]
,
m
1
PALIGNR
m5
,
m
1
,
14
,
m2
PALIGNR
m5
,
m
0
,
14
,
m2
pslldq
m
1
,
2
pslldq
m
0
,
2
movq
[
r1
+
r2
*
1
]
,
m5
movq
[
r1
+
r2
*
1
]
,
m5
PALIGNR
m
3
,
m1
,
14
,
m1
PALIGNR
m
1
,
m0
,
14
,
m0
movq
[
r1
+
r2
*
2
]
,
m
3
movq
[
r1
+
r2
*
2
]
,
m
1
RET
RET
%endmacro
%endmacro
...
@@ -152,9 +148,9 @@ cglobal pred4x4_horizontal_down_10_%1, 3,3
...
@@ -152,9 +148,9 @@ cglobal pred4x4_horizontal_down_10_%1, 3,3
punpckhdq
m1
,
m2
; l0 l1 l2 l3
punpckhdq
m1
,
m2
; l0 l1 l2 l3
punpckhqdq
m1
,
m0
; t2 t1 t0 lt l0 l1 l2 l3
punpckhqdq
m1
,
m0
; t2 t1 t0 lt l0 l1 l2 l3
psrldq
m0
,
m1
,
4
; .. .. t2 t1 t0 lt l0 l1
psrldq
m0
,
m1
,
4
; .. .. t2 t1 t0 lt l0 l1
psrldq
m
2
,
m1
,
2
; .. t2 t1 t0 lt l0 l1 l2
psrldq
m
3
,
m1
,
2
; .. t2 t1 t0 lt l0 l1 l2
pavgw
m5
,
m1
,
m
2
pavgw
m5
,
m1
,
m
3
PRED4x4_LOWPASS
m3
,
m1
,
m0
,
m
2
PRED4x4_LOWPASS
m3
,
m1
,
m0
,
m
3
punpcklwd
m5
,
m3
punpcklwd
m5
,
m3
psrldq
m3
,
8
psrldq
m3
,
8
PALIGNR
m3
,
m5
,
12
,
m4
PALIGNR
m3
,
m5
,
12
,
m4
...
@@ -220,17 +216,15 @@ cglobal pred4x4_dc_10_mmxext, 3,3
...
@@ -220,17 +216,15 @@ cglobal pred4x4_dc_10_mmxext, 3,3
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; void pred4x4_down_left(pixel *src, const pixel *topright, int stride)
; void pred4x4_down_left(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
;TODO: more AVX here
%macro
PRED4x4_DL
1
%macro
PRED4x4_DL
1
cglobal
pred4x4_down_left_10_
%1
,
3
,
3
cglobal
pred4x4_down_left_10_
%1
,
3
,
3
sub
r0
,
r2
sub
r0
,
r2
movq
m1
,
[r0]
movq
m0
,
[r0]
movhps
m1
,
[r1]
movhps
m0
,
[r1]
pslldq
m5
,
m1
,
2
psrldq
m2
,
m0
,
2
pxor
m2
,
m5
,
m1
pslldq
m3
,
m0
,
2
psrldq
m2
,
2
pshufhw
m2
,
m2
,
10100100
b
pxor
m3
,
m1
,
m2
PRED4x4_LOWPASS
m0
,
m3
,
m2
,
m0
PRED4x4_LOWPASS
m0
,
m5
,
m3
,
m1
lea
r1
,
[
r0
+
r2
*
2
]
lea
r1
,
[
r0
+
r2
*
2
]
movhps
[
r1
+
r2
*
2
]
,
m0
movhps
[
r1
+
r2
*
2
]
,
m0
psrldq
m0
,
2
psrldq
m0
,
2
...
@@ -257,10 +251,10 @@ cglobal pred4x4_vertical_left_10_%1, 3,3
...
@@ -257,10 +251,10 @@ cglobal pred4x4_vertical_left_10_%1, 3,3
sub
r0
,
r2
sub
r0
,
r2
movu
m1
,
[r0]
movu
m1
,
[r0]
movhps
m1
,
[r1]
movhps
m1
,
[r1]
psrldq
m
3
,
m1
,
2
psrldq
m
0
,
m1
,
2
psrldq
m2
,
m1
,
4
psrldq
m2
,
m1
,
4
pavgw
m4
,
m
3
,
m1
pavgw
m4
,
m
0
,
m1
PRED4x4_LOWPASS
m0
,
m1
,
m2
,
m
3
PRED4x4_LOWPASS
m0
,
m1
,
m2
,
m
0
lea
r1
,
[
r0
+
r2
*
2
]
lea
r1
,
[
r0
+
r2
*
2
]
movq
[
r0
+
r2
*
1
]
,
m4
movq
[
r0
+
r2
*
1
]
,
m4
movq
[
r0
+
r2
*
2
]
,
m0
movq
[
r0
+
r2
*
2
]
,
m0
...
@@ -298,13 +292,13 @@ cglobal pred4x4_horizontal_up_10_mmxext, 3,3
...
@@ -298,13 +292,13 @@ cglobal pred4x4_horizontal_up_10_mmxext, 3,3
pavgw
m2
,
m0
pavgw
m2
,
m0
pshufw
m5
,
m0
,
11111110
b
pshufw
m5
,
m0
,
11111110
b
PRED4x4_LOWPASS
m
3
,
m0
,
m5
,
m1
PRED4x4_LOWPASS
m
1
,
m0
,
m5
,
m1
movq
m6
,
m2
movq
m6
,
m2
punpcklwd
m6
,
m
3
punpcklwd
m6
,
m
1
movq
[
r0
+
r2
*
1
]
,
m6
movq
[
r0
+
r2
*
1
]
,
m6
psrlq
m2
,
16
psrlq
m2
,
16
psrlq
m
3
,
16
psrlq
m
1
,
16
punpcklwd
m2
,
m
3
punpcklwd
m2
,
m
1
movq
[
r0
+
r2
*
2
]
,
m2
movq
[
r0
+
r2
*
2
]
,
m2
psrlq
m2
,
32
psrlq
m2
,
32
movd
[
r1
+
r2
*
1
]
,
m2
movd
[
r1
+
r2
*
1
]
,
m2
...
@@ -333,7 +327,7 @@ cglobal pred8x8_vertical_10_sse2, 2,2
...
@@ -333,7 +327,7 @@ cglobal pred8x8_vertical_10_sse2, 2,2
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
INIT_XMM
INIT_XMM
cglobal
pred8x8_horizontal_10_sse2
,
2
,
3
cglobal
pred8x8_horizontal_10_sse2
,
2
,
3
mov
r2
,
4
mov
r2d
,
4
.
loop
:
.
loop
:
movq
m0
,
[
r0
+
r1
*
0
-
8
]
movq
m0
,
[
r0
+
r1
*
0
-
8
]
movq
m1
,
[
r0
+
r1
*
1
-
8
]
movq
m1
,
[
r0
+
r1
*
1
-
8
]
...
@@ -344,7 +338,7 @@ cglobal pred8x8_horizontal_10_sse2, 2,3
...
@@ -344,7 +338,7 @@ cglobal pred8x8_horizontal_10_sse2, 2,3
mova
[
r0
+
r1
*
0
]
,
m0
mova
[
r0
+
r1
*
0
]
,
m0
mova
[
r0
+
r1
*
1
]
,
m1
mova
[
r0
+
r1
*
1
]
,
m1
lea
r0
,
[
r0
+
r1
*
2
]
lea
r0
,
[
r0
+
r1
*
2
]
dec
r2
dec
r2
d
jg
.
loop
jg
.
loop
REP_RET
REP_RET
...
@@ -362,53 +356,53 @@ cglobal pred8x8_horizontal_10_sse2, 2,3
...
@@ -362,53 +356,53 @@ cglobal pred8x8_horizontal_10_sse2, 2,3
%endmacro
%endmacro
%macro
PRED8x8_DC
2
%macro
PRED8x8_DC
2
cglobal
pred8x8_dc_10_
%1
,
2
,
4
cglobal
pred8x8_dc_10_
%1
,
2
,
6
%ifdef
ARCH_X86_64
%define
t0
r10
%else
%define
t0
r0m
%endif
sub
r0
,
r1
sub
r0
,
r1
pxor
m4
,
m4
pxor
m4
,
m4
movq
m0
,
[
r0
+
0
]
movq
m0
,
[
r0
+
0
]
movq
m1
,
[
r0
+
8
]
movq
m1
,
[
r0
+
8
]
HADDW
m0
,
m2
%if
mmsize
==
16
mov
t0
,
r0
punpcklwd
m0
,
m1
HADDW
m1
,
m2
movhlps
m1
,
m0
paddw
m0
,
m1
%else
pshufw
m2
,
m0
,
00001110
b
pshufw
m3
,
m1
,
00001110
b
paddw
m0
,
m2
paddw
m1
,
m3
punpcklwd
m0
,
m1
%endif
%2
m2
,
m0
,
00001110
b
paddw
m0
,
m2
lea
r5
,
[
r1
*
3
]
lea
r4
,
[
r0
+
r1
*
4
]
movzx
r2d
,
word
[
r0
+
r1
*
1
-
2
]
movzx
r2d
,
word
[
r0
+
r1
*
1
-
2
]
movzx
r3d
,
word
[
r0
+
r1
*
2
-
2
]
movzx
r3d
,
word
[
r0
+
r1
*
2
-
2
]
lea
r0
,
[
r0
+
r1
*
2
]
add
r2d
,
r3d
add
r2d
,
r3d
movzx
r3d
,
word
[
r0
+
r
1
*
1
-
2
]
movzx
r3d
,
word
[
r0
+
r
5
*
1
-
2
]
add
r2d
,
r3d
add
r2d
,
r3d
movzx
r3d
,
word
[
r
0
+
r1
*
2
-
2
]
movzx
r3d
,
word
[
r
4
-
2
]
add
r2d
,
r3d
add
r2d
,
r3d
lea
r0
,
[
r0
+
r1
*
2
]
movd
m2
,
r2d
; s2
movd
m2
,
r2d
; s2
movzx
r2d
,
word
[
r0
+
r1
*
1
-
2
]
movzx
r2d
,
word
[
r4
+
r1
*
1
-
2
]
movzx
r3d
,
word
[
r0
+
r1
*
2
-
2
]
movzx
r3d
,
word
[
r4
+
r1
*
2
-
2
]
lea
r0
,
[
r0
+
r1
*
2
]
add
r2d
,
r3d
add
r2d
,
r3d
movzx
r3d
,
word
[
r
0
+
r1
*
1
-
2
]
movzx
r3d
,
word
[
r
4
+
r5
*
1
-
2
]
add
r2d
,
r3d
add
r2d
,
r3d
movzx
r3d
,
word
[
r
0
+
r1
*
2
-
2
]
movzx
r3d
,
word
[
r
4
+
r1
*
4
-
2
]
add
r2d
,
r3d
add
r2d
,
r3d
movd
m3
,
r2d
; s3
movd
m3
,
r2d
; s3
punpcklwd
m0
,
m1
mov
r0
,
t0
punpcklwd
m2
,
m3
punpcklwd
m2
,
m3
punpckldq
m0
,
m2
; s0, s1, s2, s3
punpckldq
m0
,
m2
; s0, s1, s2, s3
%2
m3
,
m0
,
11110110
b
; s2, s1, s3, s3
%2
m3
,
m0
,
11110110
b
; s2, s1, s3, s3
lea
r2
,
[
r1
+
r1
*
2
]
%2
m0
,
m0
,
01110100
b
; s0, s1, s3, s1
%2
m0
,
m0
,
01110100
b
; s0, s1, s3, s1
paddw
m0
,
m3
paddw
m0
,
m3
lea
r3
,
[
r0
+
r1
*
4
]
psrlw
m0
,
2
psrlw
m0
,
2
pavgw
m0
,
m4
; s0+s2, s1, s3, s1+s3
pavgw
m0
,
m4
; s0+s2, s1, s3, s1+s3
%if
idn
%1
,
sse2
%if
mmsize
==
16
punpcklwd
m0
,
m0
punpcklwd
m0
,
m0
pshufd
m3
,
m0
,
11111010
b
pshufd
m3
,
m0
,
11111010
b
punpckldq
m0
,
m0
punpckldq
m0
,
m0
...
@@ -421,12 +415,12 @@ cglobal pred8x8_dc_10_%1, 2,4
...
@@ -421,12 +415,12 @@ cglobal pred8x8_dc_10_%1, 2,4
%endif
%endif
MOV8
r0
+
r1
*
1
,
m1
,
m2
MOV8
r0
+
r1
*
1
,
m1
,
m2
MOV8
r0
+
r1
*
2
,
m1
,
m2
MOV8
r0
+
r1
*
2
,
m1
,
m2
MOV8
r0
+
r
2
*
1
,
m1
,
m2
MOV8
r0
+
r
5
*
1
,
m1
,
m2
MOV8
r0
+
r1
*
4
,
m1
,
m2
MOV8
r0
+
r1
*
4
,
m1
,
m2
MOV8
r
3
+
r1
*
1
,
m3
,
m4
MOV8
r
4
+
r1
*
1
,
m3
,
m4
MOV8
r
3
+
r1
*
2
,
m3
,
m4
MOV8
r
4
+
r1
*
2
,
m3
,
m4
MOV8
r
3
+
r2
*
1
,
m3
,
m4
MOV8
r
4
+
r5
*
1
,
m3
,
m4
MOV8
r
3
+
r1
*
4
,
m3
,
m4
MOV8
r
4
+
r1
*
4
,
m3
,
m4
RET
RET
%endmacro
%endmacro
...
@@ -438,39 +432,29 @@ PRED8x8_DC sse2 , pshuflw
...
@@ -438,39 +432,29 @@ PRED8x8_DC sse2 , pshuflw
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; void pred8x8_top_dc(pixel *src, int stride)
; void pred8x8_top_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
%macro
PRED8x8_TOP_DC
2
INIT_XMM
cglobal
pred8x8_top_dc_10_
%1
,
2
,
4
cglobal
pred8x8_top_dc_10_
sse2
,
2
,
4
sub
r0
,
r1
sub
r0
,
r1
movq
m0
,
[
r0
+
0
]
mova
m0
,
[r0]
movq
m1
,
[
r0
+
8
]
pshuflw
m1
,
m0
,
0x4e
HADDW
m0
,
m2
pshufhw
m1
,
m1
,
0x4e
HADDW
m1
,
m3
paddw
m0
,
m1
lea
r2
,
[
r1
+
r1
*
2
]
pshuflw
m1
,
m0
,
0xb1
paddw
m0
,
[
pw_2
]
pshufhw
m1
,
m1
,
0xb1
paddw
m1
,
[
pw_2
]
paddw
m0
,
m1
lea
r2
,
[
r1
*
3
]
lea
r3
,
[
r0
+
r1
*
4
]
lea
r3
,
[
r0
+
r1
*
4
]
paddw
m0
,
[
pw_2
]
psrlw
m0
,
2
psrlw
m0
,
2
psrlw
m1
,
2
mova
[
r0
+
r1
*
1
]
,
m0
%2
m0
,
m0
,
0
mova
[
r0
+
r1
*
2
]
,
m0
%2
m1
,
m1
,
0
mova
[
r0
+
r2
*
1
]
,
m0
%ifidn
%1
,
sse2
mova
[
r0
+
r1
*
4
]
,
m0
punpcklqdq
m0
,
m1
mova
[
r3
+
r1
*
1
]
,
m0
%endif
mova
[
r3
+
r1
*
2
]
,
m0
MOV8
r0
+
r1
*
1
,
m0
,
m1
mova
[
r3
+
r2
*
1
]
,
m0
MOV8
r0
+
r1
*
2
,
m0
,
m1
mova
[
r3
+
r1
*
4
]
,
m0
MOV8
r0
+
r2
*
1
,
m0
,
m1
MOV8
r0
+
r1
*
4
,
m0
,
m1
MOV8
r3
+
r1
*
1
,
m0
,
m1
MOV8
r3
+
r1
*
2
,
m0
,
m1
MOV8
r3
+
r2
*
1
,
m0
,
m1
MOV8
r3
+
r1
*
4
,
m0
,
m1
RET
RET
%endmacro
INIT_MMX
PRED8x8_TOP_DC
mmxext
,
pshufw
INIT_XMM
PRED8x8_TOP_DC
sse2
,
pshuflw
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; void pred8x8_plane(pixel *src, int stride)
; void pred8x8_plane(pixel *src, int stride)
...
@@ -478,7 +462,7 @@ PRED8x8_TOP_DC sse2 , pshuflw
...
@@ -478,7 +462,7 @@ PRED8x8_TOP_DC sse2 , pshuflw
INIT_XMM
INIT_XMM
cglobal
pred8x8_plane_10_sse2
,
2
,
7
,
7
cglobal
pred8x8_plane_10_sse2
,
2
,
7
,
7
sub
r0
,
r1
sub
r0
,
r1
lea
r2
,
[
r1
+
r1
*
2
]
lea
r2
,
[
r1
*
3
]
lea
r3
,
[
r0
+
r1
*
4
]
lea
r3
,
[
r0
+
r1
*
4
]
mova
m2
,
[r0]
mova
m2
,
[r0]
pmaddwd
m2
,
[
pw_m32101234
]
pmaddwd
m2
,
[
pw_m32101234
]
...
@@ -500,7 +484,7 @@ cglobal pred8x8_plane_10_sse2, 2,7,7
...
@@ -500,7 +484,7 @@ cglobal pred8x8_plane_10_sse2, 2,7,7
movzx
r5d
,
word
[
r3
+
r2
*
1
-
2
]
; src[6*stride-1]
movzx
r5d
,
word
[
r3
+
r2
*
1
-
2
]
; src[6*stride-1]
movzx
r6d
,
word
[
r0
+
r1
*
1
-
2
]
; src[0*stride-1]
movzx
r6d
,
word
[
r0
+
r1
*
1
-
2
]
; src[0*stride-1]
sub
r5d
,
r6d
sub
r5d
,
r6d
lea
r5d
,
[
r5
+
r5
*
2
]
lea
r5d
,
[
r5
*
3
]
add
r4d
,
r5d
add
r4d
,
r5d
movzx
r6d
,
word
[
r3
+
r1
*
4
-
2
]
; src[7*stride-1]
movzx
r6d
,
word
[
r3
+
r1
*
4
-
2
]
; src[7*stride-1]
movzx
r5d
,
word
[
r0
+
r1
*
0
-
2
]
; src[ -stride-1]
movzx
r5d
,
word
[
r0
+
r1
*
0
-
2
]
; src[ -stride-1]
...
@@ -540,8 +524,8 @@ cglobal pred8x8_plane_10_sse2, 2,7,7
...
@@ -540,8 +524,8 @@ cglobal pred8x8_plane_10_sse2, 2,7,7
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
%macro
PRED8x8L_128_DC
1
%macro
PRED8x8L_128_DC
1
cglobal
pred8x8l_128_dc_10_
%1
,
4
,
4
cglobal
pred8x8l_128_dc_10_
%1
,
4
,
4
mova
m0
,
[
pw_512
]
mova
m0
,
[
pw_512
]
; (1<<(BIT_DEPTH-1))
lea
r1
,
[
r3
+
r3
*
2
]
lea
r1
,
[
r3
*
3
]
lea
r2
,
[
r0
+
r3
*
4
]
lea
r2
,
[
r0
+
r3
*
4
]
MOV8
r0
+
r3
*
0
,
m0
,
m0
MOV8
r0
+
r3
*
0
,
m0
,
m0
MOV8
r0
+
r3
*
1
,
m0
,
m0
MOV8
r0
+
r3
*
1
,
m0
,
m0
...
@@ -565,37 +549,17 @@ PRED8x8L_128_DC sse2
...
@@ -565,37 +549,17 @@ PRED8x8L_128_DC sse2
%macro
PRED8x8L_TOP_DC
1
%macro
PRED8x8L_TOP_DC
1
cglobal
pred8x8l_top_dc_10_
%1
,
4
,
4
,
6
cglobal
pred8x8l_top_dc_10_
%1
,
4
,
4
,
6
sub
r0
,
r3
sub
r0
,
r3
pxor
m7
,
m7
mova
m0
,
[r0]
mova
m0
,
[
r0
-
16
]
shr
r1d
,
14
mova
m3
,
[r0]
shr
r2d
,
13
mova
m1
,
[
r0
+
16
]
neg
r1
mova
m2
,
m3
pslldq
m1
,
m0
,
2
mova
m4
,
m3
psrldq
m2
,
m0
,
2
PALIGNR
m2
,
m0
,
14
,
m0
pinsrw
m1
,
[
r0
+
r1
]
,
0
PALIGNR
m1
,
m4
,
2
,
m4
pinsrw
m2
,
[
r0
+
r2
+
14
]
,
7
test
r1
,
r1
; top_left
lea
r1
,
[
r3
*
3
]
jz
.
fix_lt_2
test
r2
,
r2
; top_right
jz
.
fix_tr_1
jmp
.
body
.
fix_lt_2
:
mova
m5
,
m3
pxor
m5
,
m2
pslldq
m5
,
14
psrldq
m5
,
14
pxor
m2
,
m5
test
r2
,
r2
; top_right
jnz
.
body
.
fix_tr_1
:
mova
m5
,
m3
pxor
m5
,
m1
psrldq
m5
,
14
pslldq
m5
,
14
pxor
m1
,
m5
.
body
lea
r1
,
[
r3
+
r3
*
2
]
lea
r2
,
[
r0
+
r3
*
4
]
lea
r2
,
[
r0
+
r3
*
4
]
PRED4x4_LOWPASS
m0
,
m2
,
m1
,
m
3
PRED4x4_LOWPASS
m0
,
m2
,
m1
,
m
0
HADDW
m0
,
m1
HADDW
m0
,
m1
paddw
m0
,
[
pw_4
]
paddw
m0
,
[
pw_4
]
psrlw
m0
,
3
psrlw
m0
,
3
...
@@ -612,110 +576,70 @@ cglobal pred8x8l_top_dc_10_%1, 4,4,6
...
@@ -612,110 +576,70 @@ cglobal pred8x8l_top_dc_10_%1, 4,4,6
%endmacro
%endmacro
INIT_XMM
INIT_XMM
%define
PALIGNR
PALIGNR_MMX
PRED8x8L_TOP_DC
sse2
PRED8x8L_TOP_DC
sse2
%define
PALIGNR
PALIGNR_SSSE3
%ifdef
HAVE_AVX
PRED8x8L_TOP_DC
ssse3
INIT_AVX
PRED8x8L_TOP_DC
avx
%endif
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
;void pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride)
;void pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
;TODO: see if scalar is faster
;TODO: see if scalar is faster
%macro
PRED8x8L_DC
1
%macro
PRED8x8L_DC
1
cglobal
pred8x8l_dc_10_
%1
,
4
,
5
,
8
cglobal
pred8x8l_dc_10_
%1
,
4
,
6
,
6
sub
r0
,
r3
sub
r0
,
r3
lea
r4
,
[
r0
+
r3
*
2
]
lea
r4
,
[
r0
+
r3
*
4
]
mova
m0
,
[
r0
+
r3
*
1
-
16
]
lea
r5
,
[
r3
*
3
]
punpckhwd
m0
,
[
r0
+
r3
*
0
-
16
]
mova
m0
,
[
r0
+
r3
*
2
-
16
]
mova
m1
,
[
r4
+
r3
*
1
-
16
]
punpckhwd
m0
,
[
r0
+
r3
*
1
-
16
]
punpckhwd
m1
,
[
r0
+
r3
*
2
-
16
]
mova
m1
,
[
r4
+
r3
*
0
-
16
]
mov
r4
,
r0
punpckhwd
m1
,
[
r0
+
r5
*
1
-
16
]
punpckhdq
m1
,
m0
punpckhdq
m1
,
m0
lea
r0
,
[
r0
+
r3
*
4
]
mova
m2
,
[
r4
+
r3
*
2
-
16
]
mova
m2
,
[
r0
+
r3
*
1
-
16
]
punpckhwd
m2
,
[
r4
+
r3
*
1
-
16
]
punpckhwd
m2
,
[
r0
+
r3
*
0
-
16
]
mova
m3
,
[
r4
+
r3
*
4
-
16
]
lea
r0
,
[
r0
+
r3
*
2
]
punpckhwd
m3
,
[
r4
+
r5
*
1
-
16
]
mova
m3
,
[
r0
+
r3
*
1
-
16
]
punpckhwd
m3
,
[
r0
+
r3
*
0
-
16
]
punpckhdq
m3
,
m2
punpckhdq
m3
,
m2
punpckhqdq
m3
,
m1
punpckhqdq
m3
,
m1
lea
r0
,
[
r0
+
r3
*
2
]
mova
m0
,
[r0]
mova
m0
,
[
r0
+
r3
*
0
-
16
]
shr
r1d
,
14
mova
m1
,
[r4]
shr
r2d
,
13
mov
r0
,
r4
neg
r1
mova
m4
,
m3
pslldq
m1
,
m0
,
2
mova
m2
,
m3
psrldq
m2
,
m0
,
2
PALIGNR
m4
,
m0
,
14
,
m0
pinsrw
m1
,
[
r0
+
r1
]
,
0
PALIGNR
m1
,
m2
,
2
,
m2
pinsrw
m2
,
[
r0
+
r2
+
14
]
,
7
test
r1
,
r1
not
r1
jnz
.
do_left
and
r1
,
r3
.
fix_lt_1
:
pslldq
m4
,
m3
,
2
mova
m5
,
m3
psrldq
m5
,
m3
,
2
pxor
m5
,
m4
pshuflw
m4
,
m4
,
11100101
b
psrldq
m5
,
14
pinsrw
m5
,
[
r0
+
r1
-
2
]
,
7
pslldq
m5
,
12
PRED4x4_LOWPASS
m3
,
m4
,
m5
,
m3
pxor
m1
,
m5
PRED4x4_LOWPASS
m0
,
m2
,
m1
,
m0
jmp
.
do_left
paddw
m0
,
m3
.
fix_lt_2
:
HADDW
m0
,
m1
mova
m5
,
m3
paddw
m0
,
[
pw_8
]
pxor
m5
,
m2
psrlw
m0
,
4
pslldq
m5
,
14
SPLATW
m0
,
m0
psrldq
m5
,
14
mova
[
r0
+
r3
*
1
]
,
m0
pxor
m2
,
m5
mova
[
r0
+
r3
*
2
]
,
m0
test
r2
,
r2
mova
[
r0
+
r5
*
1
]
,
m0
jnz
.
body
mova
[
r0
+
r3
*
4
]
,
m0
.
fix_tr_1
:
mova
[
r4
+
r3
*
1
]
,
m0
mova
m5
,
m3
mova
[
r4
+
r3
*
2
]
,
m0
pxor
m5
,
m1
mova
[
r4
+
r5
*
1
]
,
m0
psrldq
m5
,
14
mova
[
r4
+
r3
*
4
]
,
m0
pslldq
m5
,
14
pxor
m1
,
m5
jmp
.
body
.
do_left
:
mova
m0
,
m4
PRED4x4_LOWPASS
m2
,
m1
,
m4
,
m3
mova
m4
,
m0
mova
m7
,
m2
PRED4x4_LOWPASS
m1
,
m3
,
m0
,
m4
pslldq
m1
,
14
PALIGNR
m7
,
m1
,
14
,
m3
mova
m0
,
[
r0
-
16
]
mova
m3
,
[r0]
mova
m1
,
[
r0
+
16
]
mova
m2
,
m3
mova
m4
,
m3
PALIGNR
m2
,
m0
,
14
,
m0
PALIGNR
m1
,
m4
,
2
,
m4
test
r1
,
r1
jz
.
fix_lt_2
test
r2
,
r2
jz
.
fix_tr_1
.
body
lea
r1
,
[
r3
+
r3
*
2
]
PRED4x4_LOWPASS
m6
,
m2
,
m1
,
m3
HADDW
m7
,
m0
HADDW
m6
,
m0
lea
r2
,
[
r0
+
r3
*
4
]
paddw
m7
,
[
pw_8
]
paddw
m7
,
m6
psrlw
m7
,
4
SPLATW
m7
,
m7
mova
[
r0
+
r3
*
1
]
,
m7
mova
[
r0
+
r3
*
2
]
,
m7
mova
[
r0
+
r1
*
1
]
,
m7
mova
[
r0
+
r3
*
4
]
,
m7
mova
[
r2
+
r3
*
1
]
,
m7
mova
[
r2
+
r3
*
2
]
,
m7
mova
[
r2
+
r1
*
1
]
,
m7
mova
[
r2
+
r3
*
4
]
,
m7
RET
RET
%endmacro
%endmacro
INIT_XMM
INIT_XMM
%define
PALIGNR
PALIGNR_MMX
PRED8x8L_DC
sse2
PRED8x8L_DC
sse2
%define
PALIGNR
PALIGNR_SSSE3
%ifdef
HAVE_AVX
PRED8x8L_DC
ssse3
INIT_AVX
PRED8x8L_DC
avx
%endif
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; void pred8x8l_vertical(pixel *src, int has_topleft, int has_topright, int stride)
; void pred8x8l_vertical(pixel *src, int has_topleft, int has_topright, int stride)
...
@@ -723,36 +647,17 @@ PRED8x8L_DC ssse3
...
@@ -723,36 +647,17 @@ PRED8x8L_DC ssse3
%macro
PRED8x8L_VERTICAL
1
%macro
PRED8x8L_VERTICAL
1
cglobal
pred8x8l_vertical_10_
%1
,
4
,
4
,
6
cglobal
pred8x8l_vertical_10_
%1
,
4
,
4
,
6
sub
r0
,
r3
sub
r0
,
r3
mova
m0
,
[
r0
-
16
]
mova
m0
,
[r0]
mova
m3
,
[r0]
shr
r1d
,
14
mova
m1
,
[
r0
+
16
]
shr
r2d
,
13
mova
m2
,
m3
neg
r1
mova
m4
,
m3
pslldq
m1
,
m0
,
2
PALIGNR
m2
,
m0
,
14
,
m0
psrldq
m2
,
m0
,
2
PALIGNR
m1
,
m4
,
2
,
m4
pinsrw
m1
,
[
r0
+
r1
]
,
0
test
r1
,
r1
; top_left
pinsrw
m2
,
[
r0
+
r2
+
14
]
,
7
jz
.
fix_lt_2
lea
r1
,
[
r3
*
3
]
test
r2
,
r2
; top_right
jz
.
fix_tr_1
jmp
.
body
.
fix_lt_2
:
mova
m5
,
m3
pxor
m5
,
m2
pslldq
m5
,
14
psrldq
m5
,
14
pxor
m2
,
m5
test
r2
,
r2
; top_right
jnz
.
body
.
fix_tr_1
:
mova
m5
,
m3
pxor
m5
,
m1
psrldq
m5
,
14
pslldq
m5
,
14
pxor
m1
,
m5
.
body
lea
r1
,
[
r3
+
r3
*
2
]
lea
r2
,
[
r0
+
r3
*
4
]
lea
r2
,
[
r0
+
r3
*
4
]
PRED4x4_LOWPASS
m0
,
m2
,
m1
,
m
3
PRED4x4_LOWPASS
m0
,
m2
,
m1
,
m
0
mova
[
r0
+
r3
*
1
]
,
m0
mova
[
r0
+
r3
*
1
]
,
m0
mova
[
r0
+
r3
*
2
]
,
m0
mova
[
r0
+
r3
*
2
]
,
m0
mova
[
r0
+
r1
*
1
]
,
m0
mova
[
r0
+
r1
*
1
]
,
m0
...
@@ -765,70 +670,56 @@ cglobal pred8x8l_vertical_10_%1, 4,4,6
...
@@ -765,70 +670,56 @@ cglobal pred8x8l_vertical_10_%1, 4,4,6
%endmacro
%endmacro
INIT_XMM
INIT_XMM
%define
PALIGNR
PALIGNR_MMX
PRED8x8L_VERTICAL
sse2
PRED8x8L_VERTICAL
sse2
%define
PALIGNR
PALIGNR_SSSE3
%ifdef
HAVE_AVX
PRED8x8L_VERTICAL
ssse3
INIT_AVX
PRED8x8L_VERTICAL
avx
%endif
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; void pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, int stride)
; void pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
%macro
PRED8x8L_HORIZONTAL
1
%macro
PRED8x8L_HORIZONTAL
1
cglobal
pred8x8l_horizontal_10_
%1
,
4
,
4
,
8
cglobal
pred8x8l_horizontal_10_
%1
,
4
,
4
,
5
sub
r0
,
r3
mova
m0
,
[
r0
-
16
]
lea
r2
,
[
r0
+
r3
*
2
]
shr
r1d
,
14
mova
m0
,
[
r0
+
r3
*
1
-
16
]
dec
r1
test
r1
,
r1
and
r1
,
r3
lea
r1
,
[
r0
+
r3
]
sub
r1
,
r3
cmovnz
r1
,
r0
punpckhwd
m0
,
[
r0
+
r1
-
16
]
punpckhwd
m0
,
[
r1
+
r3
*
0
-
16
]
mova
m1
,
[
r0
+
r3
*
2
-
16
]
mova
m1
,
[
r2
+
r3
*
1
-
16
]
punpckhwd
m1
,
[
r0
+
r3
*
1
-
16
]
punpckhwd
m1
,
[
r0
+
r3
*
2
-
16
]
lea
r2
,
[
r0
+
r3
*
4
]
mov
r2
,
r0
lea
r1
,
[
r3
*
3
]
punpckhdq
m1
,
m0
punpckhdq
m1
,
m0
lea
r0
,
[
r0
+
r3
*
4
]
mova
m2
,
[
r2
+
r3
*
0
-
16
]
mova
m2
,
[
r0
+
r3
*
1
-
16
]
punpckhwd
m2
,
[
r0
+
r1
-
16
]
punpckhwd
m2
,
[
r0
+
r3
*
0
-
16
]
mova
m3
,
[
r2
+
r3
*
2
-
16
]
lea
r0
,
[
r0
+
r3
*
2
]
punpckhwd
m3
,
[
r2
+
r3
*
1
-
16
]
mova
m3
,
[
r0
+
r3
*
1
-
16
]
punpckhwd
m3
,
[
r0
+
r3
*
0
-
16
]
punpckhdq
m3
,
m2
punpckhdq
m3
,
m2
punpckhqdq
m3
,
m1
punpckhqdq
m3
,
m1
lea
r0
,
[
r0
+
r3
*
2
]
PALIGNR
m4
,
m3
,
[
r2
+
r1
-
16
]
,
14
,
m0
mova
m0
,
[
r0
+
r3
*
0
-
16
]
pslldq
m0
,
m4
,
2
mova
m1
,
[
r1
+
r3
*
0
-
16
]
pshuflw
m0
,
m0
,
11100101
b
mov
r0
,
r2
PRED4x4_LOWPASS
m4
,
m3
,
m0
,
m4
mova
m4
,
m3
punpckhwd
m3
,
m4
,
m4
mova
m2
,
m3
punpcklwd
m4
,
m4
PALIGNR
m4
,
m0
,
14
,
m0
PALIGNR
m1
,
m2
,
2
,
m2
mova
m0
,
m4
PRED4x4_LOWPASS
m2
,
m1
,
m4
,
m3
mova
m4
,
m0
mova
m7
,
m2
PRED4x4_LOWPASS
m1
,
m3
,
m0
,
m4
pslldq
m1
,
14
PALIGNR
m7
,
m1
,
14
,
m3
lea
r1
,
[
r3
+
r3
*
2
]
punpckhwd
m3
,
m7
,
m7
punpcklwd
m7
,
m7
pshufd
m0
,
m3
,
0xff
pshufd
m0
,
m3
,
0xff
pshufd
m1
,
m3
,
0xaa
pshufd
m1
,
m3
,
0xaa
lea
r2
,
[
r0
+
r3
*
4
]
pshufd
m2
,
m3
,
0x55
pshufd
m2
,
m3
,
0x55
pshufd
m3
,
m3
,
0x00
pshufd
m3
,
m3
,
0x00
pshufd
m4
,
m7
,
0xff
mova
[
r0
+
r3
*
0
]
,
m0
pshufd
m5
,
m7
,
0xaa
mova
[
r0
+
r3
*
1
]
,
m1
pshufd
m6
,
m7
,
0x55
mova
[
r0
+
r3
*
2
]
,
m2
pshufd
m7
,
m7
,
0x00
mova
[
r0
+
r1
*
1
]
,
m3
mova
[
r0
+
r3
*
1
]
,
m0
pshufd
m0
,
m4
,
0xff
mova
[
r0
+
r3
*
2
]
,
m1
pshufd
m1
,
m4
,
0xaa
mova
[
r0
+
r1
*
1
]
,
m2
pshufd
m2
,
m4
,
0x55
mova
[
r0
+
r3
*
4
]
,
m3
pshufd
m3
,
m4
,
0x00
mova
[
r2
+
r3
*
1
]
,
m4
mova
[
r2
+
r3
*
0
]
,
m0
mova
[
r2
+
r3
*
2
]
,
m5
mova
[
r2
+
r3
*
1
]
,
m1
mova
[
r2
+
r
1
*
1
]
,
m6
mova
[
r2
+
r
3
*
2
]
,
m2
mova
[
r2
+
r
3
*
4
]
,
m7
mova
[
r2
+
r
1
*
1
]
,
m3
RET
RET
%endmacro
%endmacro
...
@@ -837,116 +728,68 @@ INIT_XMM
...
@@ -837,116 +728,68 @@ INIT_XMM
PRED8x8L_HORIZONTAL
sse2
PRED8x8L_HORIZONTAL
sse2
%define
PALIGNR
PALIGNR_SSSE3
%define
PALIGNR
PALIGNR_SSSE3
PRED8x8L_HORIZONTAL
ssse3
PRED8x8L_HORIZONTAL
ssse3
%ifdef
HAVE_AVX
INIT_AVX
PRED8x8L_HORIZONTAL
avx
%endif
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
;void pred8x8l_down_left(pixel *src, int has_topleft, int has_topright, int stride)
;void pred8x8l_down_left(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
%macro
PRED8x8L_DOWN_LEFT
1
%macro
PRED8x8L_DOWN_LEFT
1
cglobal
pred8x8l_down_left_10_
%1
,
4
,
4
,
8
cglobal
pred8x8l_down_left_10_
%1
,
4
,
4
,
7
sub
r0
,
r3
sub
r0
,
r3
mova
m0
,
[
r0
-
16
]
mova
m3
,
[r0]
mova
m3
,
[r0]
shr
r1d
,
14
neg
r1
shr
r2d
,
13
pslldq
m1
,
m3
,
2
psrldq
m2
,
m3
,
2
pinsrw
m1
,
[
r0
+
r1
]
,
0
pinsrw
m2
,
[
r0
+
r2
+
14
]
,
7
PRED4x4_LOWPASS
m6
,
m2
,
m1
,
m3
jz
.
fix_tr
; flags from shr r2d
mova
m1
,
[
r0
+
16
]
mova
m1
,
[
r0
+
16
]
mova
m2
,
m3
psrldq
m5
,
m1
,
2
mova
m4
,
m3
PALIGNR
m2
,
m1
,
m3
,
14
,
m3
PALIGNR
m2
,
m0
,
14
,
m0
pshufhw
m5
,
m5
,
10100100
b
PALIGNR
m1
,
m4
,
2
,
m4
PRED4x4_LOWPASS
m1
,
m2
,
m5
,
m1
test
r1
,
r1
jz
.
fix_lt_2
test
r2
,
r2
jz
.
fix_tr_1
jmp
.
do_top
.
fix_lt_2
:
mova
m5
,
m3
pxor
m5
,
m2
pslldq
m5
,
14
psrldq
m5
,
14
pxor
m2
,
m5
test
r2
,
r2
jnz
.
do_top
.
fix_tr_1
:
mova
m5
,
m3
pxor
m5
,
m1
psrldq
m5
,
14
pslldq
m5
,
14
pxor
m1
,
m5
jmp
.
do_top
.
fix_tr_2
:
punpckhwd
m3
,
m3
pshufd
m1
,
m3
,
0xFF
jmp
.
do_topright
.
do_top
:
PRED4x4_LOWPASS
m4
,
m2
,
m1
,
m3
mova
m7
,
m4
test
r2
,
r2
jz
.
fix_tr_2
mova
m0
,
[
r0
+
16
]
mova
m5
,
m0
mova
m2
,
m0
mova
m4
,
m0
psrldq
m5
,
14
PALIGNR
m2
,
m3
,
14
,
m3
PALIGNR
m5
,
m4
,
2
,
m4
PRED4x4_LOWPASS
m1
,
m2
,
m5
,
m0
.
do_topright
:
.
do_topright
:
lea
r1
,
[
r3
+
r3
*
2
]
lea
r1
,
[
r3
*
3
]
mova
m6
,
m1
psrldq
m5
,
m1
,
14
psrldq
m1
,
14
mova
m4
,
m1
lea
r2
,
[
r0
+
r3
*
4
]
lea
r2
,
[
r0
+
r3
*
4
]
mova
m2
,
m6
PALIGNR
m2
,
m1
,
m6
,
2
,
m0
PALIGNR
m2
,
m7
,
2
,
m0
PALIGNR
m3
,
m1
,
m6
,
14
,
m0
mova
m3
,
m6
PALIGNR
m5
,
m1
,
2
,
m0
PALIGNR
m3
,
m7
,
14
,
m0
pslldq
m4
,
m6
,
2
PALIGNR
m4
,
m6
,
2
,
m0
PRED4x4_LOWPASS
m6
,
m4
,
m2
,
m6
mova
m5
,
m7
PRED4x4_LOWPASS
m1
,
m3
,
m5
,
m1
mova
m1
,
m7
mova
m7
,
m6
pslldq
m1
,
2
PRED4x4_LOWPASS
m0
,
m1
,
m2
,
m5
PRED4x4_LOWPASS
m1
,
m3
,
m4
,
m7
mova
[
r2
+
r3
*
4
]
,
m1
mova
[
r2
+
r3
*
4
]
,
m1
mova
m2
,
m0
PALIGNR
m1
,
m6
,
14
,
m2
pslldq
m1
,
2
pslldq
m6
,
2
psrldq
m2
,
14
pslldq
m0
,
2
por
m1
,
m2
mova
[
r2
+
r1
*
1
]
,
m1
mova
[
r2
+
r1
*
1
]
,
m1
mova
m2
,
m0
PALIGNR
m1
,
m6
,
14
,
m2
pslldq
m1
,
2
pslldq
m6
,
2
psrldq
m2
,
14
pslldq
m0
,
2
por
m1
,
m2
mova
[
r2
+
r3
*
2
]
,
m1
mova
[
r2
+
r3
*
2
]
,
m1
mova
m2
,
m0
PALIGNR
m1
,
m6
,
14
,
m2
pslldq
m1
,
2
pslldq
m6
,
2
psrldq
m2
,
14
pslldq
m0
,
2
por
m1
,
m2
mova
[
r2
+
r3
*
1
]
,
m1
mova
[
r2
+
r3
*
1
]
,
m1
mova
m2
,
m0
PALIGNR
m1
,
m6
,
14
,
m2
pslldq
m1
,
2
pslldq
m6
,
2
psrldq
m2
,
14
pslldq
m0
,
2
por
m1
,
m2
mova
[
r0
+
r3
*
4
]
,
m1
mova
[
r0
+
r3
*
4
]
,
m1
mova
m2
,
m0
PALIGNR
m1
,
m6
,
14
,
m2
pslldq
m1
,
2
pslldq
m6
,
2
psrldq
m2
,
14
pslldq
m0
,
2
por
m1
,
m2
mova
[
r0
+
r1
*
1
]
,
m1
mova
[
r0
+
r1
*
1
]
,
m1
mova
m2
,
m0
PALIGNR
m1
,
m6
,
14
,
m2
pslldq
m1
,
2
pslldq
m6
,
2
psrldq
m2
,
14
pslldq
m0
,
2
por
m1
,
m2
mova
[
r0
+
r3
*
2
]
,
m1
mova
[
r0
+
r3
*
2
]
,
m1
pslldq
m1
,
2
PALIGNR
m1
,
m6
,
14
,
m6
psrldq
m0
,
14
por
m1
,
m0
mova
[
r0
+
r3
*
1
]
,
m1
mova
[
r0
+
r3
*
1
]
,
m1
RET
RET
.
fix_tr
:
punpckhwd
m3
,
m3
pshufd
m1
,
m3
,
0xFF
jmp
.
do_topright
%endmacro
%endmacro
INIT_XMM
INIT_XMM
...
@@ -954,139 +797,73 @@ INIT_XMM
...
@@ -954,139 +797,73 @@ INIT_XMM
PRED8x8L_DOWN_LEFT
sse2
PRED8x8L_DOWN_LEFT
sse2
%define
PALIGNR
PALIGNR_SSSE3
%define
PALIGNR
PALIGNR_SSSE3
PRED8x8L_DOWN_LEFT
ssse3
PRED8x8L_DOWN_LEFT
ssse3
%ifdef
HAVE_AVX
INIT_AVX
PRED8x8L_DOWN_LEFT
avx
%endif
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
;void pred8x8l_down_right
_mxext
(pixel *src, int has_topleft, int has_topright, int stride)
;void pred8x8l_down_right(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
%macro
PRED8x8L_DOWN_RIGHT
1
%macro
PRED8x8L_DOWN_RIGHT
1
; standard forbids this when has_topleft is false
; no need to check
cglobal
pred8x8l_down_right_10_
%1
,
4
,
5
,
8
cglobal
pred8x8l_down_right_10_
%1
,
4
,
5
,
8
sub
r0
,
r3
sub
r0
,
r3
lea
r4
,
[
r0
+
r3
*
2
]
lea
r4
,
[
r0
+
r3
*
4
]
lea
r1
,
[
r3
*
3
]
mova
m0
,
[
r0
+
r3
*
1
-
16
]
mova
m0
,
[
r0
+
r3
*
1
-
16
]
punpckhwd
m0
,
[
r0
+
r3
*
0
-
16
]
punpckhwd
m0
,
[
r0
+
r3
*
0
-
16
]
mova
m1
,
[
r
4
+
r3
*
1
-
16
]
mova
m1
,
[
r
0
+
r1
*
1
-
16
]
punpckhwd
m1
,
[
r0
+
r3
*
2
-
16
]
punpckhwd
m1
,
[
r0
+
r3
*
2
-
16
]
mov
r4
,
r0
punpckhdq
m1
,
m0
punpckhdq
m1
,
m0
lea
r0
,
[
r0
+
r3
*
4
]
mova
m2
,
[
r4
+
r3
*
1
-
16
]
mova
m2
,
[
r0
+
r3
*
1
-
16
]
punpckhwd
m2
,
[
r4
+
r3
*
0
-
16
]
punpckhwd
m2
,
[
r0
+
r3
*
0
-
16
]
mova
m3
,
[
r4
+
r1
*
1
-
16
]
lea
r0
,
[
r0
+
r3
*
2
]
punpckhwd
m3
,
[
r4
+
r3
*
2
-
16
]
mova
m3
,
[
r0
+
r3
*
1
-
16
]
punpckhwd
m3
,
[
r0
+
r3
*
0
-
16
]
punpckhdq
m3
,
m2
punpckhdq
m3
,
m2
punpckhqdq
m3
,
m1
punpckhqdq
m3
,
m1
lea
r0
,
[
r0
+
r3
*
2
]
mova
m0
,
[
r4
+
r3
*
4
-
16
]
mova
m0
,
[
r0
+
r3
*
0
-
16
]
mova
m1
,
[r0]
mova
m1
,
[r4]
PALIGNR
m4
,
m3
,
m0
,
14
,
m0
mov
r0
,
r4
PALIGNR
m1
,
m3
,
2
,
m2
mova
m4
,
m3
pslldq
m0
,
m4
,
2
mova
m2
,
m3
pshuflw
m0
,
m0
,
11100101
b
PALIGNR
m4
,
m0
,
14
,
m0
PRED4x4_LOWPASS
m6
,
m1
,
m4
,
m3
PALIGNR
m1
,
m2
,
2
,
m2
PRED4x4_LOWPASS
m4
,
m3
,
m0
,
m4
test
r1
,
r1
; top_left
jz
.
fix_lt_1
.
do_left
:
mova
m0
,
m4
PRED4x4_LOWPASS
m2
,
m1
,
m4
,
m3
mova
m4
,
m0
mova
m7
,
m2
mova
m6
,
m2
PRED4x4_LOWPASS
m1
,
m3
,
m0
,
m4
pslldq
m1
,
14
PALIGNR
m7
,
m1
,
14
,
m3
mova
m0
,
[
r0
-
16
]
mova
m3
,
[r0]
mova
m3
,
[r0]
mova
m1
,
[
r0
+
16
]
shr
r2d
,
13
mova
m2
,
m3
pslldq
m1
,
m3
,
2
mova
m4
,
m3
psrldq
m2
,
m3
,
2
PALIGNR
m2
,
m0
,
14
,
m0
pinsrw
m1
,
[
r0
-
2
]
,
0
PALIGNR
m1
,
m4
,
2
,
m4
pinsrw
m2
,
[
r0
+
r2
+
14
]
,
7
test
r1
,
r1
; top_left
PRED4x4_LOWPASS
m3
,
m2
,
m1
,
m3
jz
.
fix_lt_2
PALIGNR
m2
,
m3
,
m6
,
2
,
m0
test
r2
,
r2
; top_right
PALIGNR
m5
,
m3
,
m6
,
14
,
m0
jz
.
fix_tr_1
psrldq
m7
,
m3
,
2
.
do_top
:
PRED4x4_LOWPASS
m6
,
m4
,
m2
,
m6
PRED4x4_LOWPASS
m4
,
m2
,
m1
,
m3
PRED4x4_LOWPASS
m3
,
m5
,
m7
,
m3
mova
m5
,
m4
mova
[
r4
+
r3
*
4
]
,
m6
jmp
.
body
PALIGNR
m3
,
m6
,
14
,
m2
.
fix_lt_1
:
pslldq
m6
,
2
mova
m5
,
m3
mova
[
r0
+
r3
*
1
]
,
m3
pxor
m5
,
m4
PALIGNR
m3
,
m6
,
14
,
m2
psrldq
m5
,
14
pslldq
m6
,
2
pslldq
m5
,
12
mova
[
r0
+
r3
*
2
]
,
m3
pxor
m1
,
m5
PALIGNR
m3
,
m6
,
14
,
m2
jmp
.
do_left
pslldq
m6
,
2
.
fix_lt_2
:
mova
[
r0
+
r1
*
1
]
,
m3
mova
m5
,
m3
PALIGNR
m3
,
m6
,
14
,
m2
pxor
m5
,
m2
pslldq
m6
,
2
pslldq
m5
,
14
mova
[
r0
+
r3
*
4
]
,
m3
psrldq
m5
,
14
PALIGNR
m3
,
m6
,
14
,
m2
pxor
m2
,
m5
pslldq
m6
,
2
test
r2
,
r2
; top_right
mova
[
r4
+
r3
*
1
]
,
m3
jnz
.
do_top
PALIGNR
m3
,
m6
,
14
,
m2
.
fix_tr_1
:
pslldq
m6
,
2
mova
m5
,
m3
mova
[
r4
+
r3
*
2
]
,
m3
pxor
m5
,
m1
PALIGNR
m3
,
m6
,
14
,
m6
psrldq
m5
,
14
mova
[
r4
+
r1
*
1
]
,
m3
pslldq
m5
,
14
pxor
m1
,
m5
jmp
.
do_top
.
body
lea
r1
,
[
r3
+
r3
*
2
]
mova
m1
,
m7
mova
m7
,
m5
mova
m5
,
m6
mova
m2
,
m7
lea
r2
,
[
r0
+
r3
*
4
]
PALIGNR
m2
,
m6
,
2
,
m0
mova
m3
,
m7
PALIGNR
m3
,
m6
,
14
,
m0
mova
m4
,
m7
psrldq
m4
,
2
PRED4x4_LOWPASS
m0
,
m1
,
m2
,
m5
PRED4x4_LOWPASS
m1
,
m3
,
m4
,
m7
mova
[
r2
+
r3
*
4
]
,
m0
mova
m2
,
m1
psrldq
m0
,
2
pslldq
m2
,
14
psrldq
m1
,
2
por
m0
,
m2
mova
[
r2
+
r1
*
1
]
,
m0
mova
m2
,
m1
psrldq
m0
,
2
pslldq
m2
,
14
psrldq
m1
,
2
por
m0
,
m2
mova
[
r2
+
r3
*
2
]
,
m0
mova
m2
,
m1
psrldq
m0
,
2
pslldq
m2
,
14
psrldq
m1
,
2
por
m0
,
m2
mova
[
r2
+
r3
*
1
]
,
m0
mova
m2
,
m1
psrldq
m0
,
2
pslldq
m2
,
14
psrldq
m1
,
2
por
m0
,
m2
mova
[
r0
+
r3
*
4
]
,
m0
mova
m2
,
m1
psrldq
m0
,
2
pslldq
m2
,
14
psrldq
m1
,
2
por
m0
,
m2
mova
[
r0
+
r1
*
1
]
,
m0
mova
m2
,
m1
psrldq
m0
,
2
pslldq
m2
,
14
psrldq
m1
,
2
por
m0
,
m2
mova
[
r0
+
r3
*
2
]
,
m0
psrldq
m0
,
2
pslldq
m1
,
14
por
m0
,
m1
mova
[
r0
+
r3
*
1
]
,
m0
RET
RET
%endmacro
%endmacro
...
@@ -1095,114 +872,69 @@ INIT_XMM
...
@@ -1095,114 +872,69 @@ INIT_XMM
PRED8x8L_DOWN_RIGHT
sse2
PRED8x8L_DOWN_RIGHT
sse2
%define
PALIGNR
PALIGNR_SSSE3
%define
PALIGNR
PALIGNR_SSSE3
PRED8x8L_DOWN_RIGHT
ssse3
PRED8x8L_DOWN_RIGHT
ssse3
%ifdef
HAVE_AVX
INIT_AVX
PRED8x8L_DOWN_RIGHT
avx
%endif
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; void pred8x8l_vertical_right(pixel *src, int has_topleft, int has_topright, int stride)
; void pred8x8l_vertical_right(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
%macro
PRED8x8L_VERTICAL_RIGHT
1
%macro
PRED8x8L_VERTICAL_RIGHT
1
cglobal
pred8x8l_vertical_right_10_
%1
,
4
,
5
,
8
; likewise with 8x8l_down_right
cglobal
pred8x8l_vertical_right_10_
%1
,
4
,
5
,
7
sub
r0
,
r3
sub
r0
,
r3
lea
r4
,
[
r0
+
r3
*
2
]
lea
r4
,
[
r0
+
r3
*
4
]
lea
r1
,
[
r3
*
3
]
mova
m0
,
[
r0
+
r3
*
1
-
16
]
mova
m0
,
[
r0
+
r3
*
1
-
16
]
punpckhwd
m0
,
[
r0
+
r3
*
0
-
16
]
punpckhwd
m0
,
[
r0
+
r3
*
0
-
16
]
mova
m1
,
[
r
4
+
r3
*
1
-
16
]
mova
m1
,
[
r
0
+
r1
*
1
-
16
]
punpckhwd
m1
,
[
r0
+
r3
*
2
-
16
]
punpckhwd
m1
,
[
r0
+
r3
*
2
-
16
]
mov
r4
,
r0
punpckhdq
m1
,
m0
punpckhdq
m1
,
m0
lea
r0
,
[
r0
+
r3
*
4
]
mova
m2
,
[
r4
+
r3
*
1
-
16
]
mova
m2
,
[
r0
+
r3
*
1
-
16
]
punpckhwd
m2
,
[
r4
+
r3
*
0
-
16
]
punpckhwd
m2
,
[
r0
+
r3
*
0
-
16
]
mova
m3
,
[
r4
+
r1
*
1
-
16
]
lea
r0
,
[
r0
+
r3
*
2
]
punpckhwd
m3
,
[
r4
+
r3
*
2
-
16
]
mova
m3
,
[
r0
+
r3
*
1
-
16
]
punpckhwd
m3
,
[
r0
+
r3
*
0
-
16
]
punpckhdq
m3
,
m2
punpckhdq
m3
,
m2
punpckhqdq
m3
,
m1
punpckhqdq
m3
,
m1
lea
r0
,
[
r0
+
r3
*
2
]
mova
m0
,
[
r4
+
r3
*
4
-
16
]
mova
m0
,
[
r0
+
r3
*
0
-
16
]
mova
m1
,
[r0]
mova
m1
,
[r4]
PALIGNR
m4
,
m3
,
m0
,
14
,
m0
mov
r0
,
r4
PALIGNR
m1
,
m3
,
2
,
m2
mova
m4
,
m3
PRED4x4_LOWPASS
m3
,
m1
,
m4
,
m3
mova
m2
,
m3
mova
m2
,
[r0]
PALIGNR
m4
,
m0
,
14
,
m0
shr
r2d
,
13
PALIGNR
m1
,
m2
,
2
,
m2
pslldq
m1
,
m2
,
2
test
r1
,
r1
psrldq
m5
,
m2
,
2
jz
.
fix_lt_1
pinsrw
m1
,
[
r0
-
2
]
,
0
jmp
.
do_left
pinsrw
m5
,
[
r0
+
r2
+
14
]
,
7
.
fix_lt_1
:
PRED4x4_LOWPASS
m2
,
m5
,
m1
,
m2
mova
m5
,
m3
PALIGNR
m6
,
m2
,
m3
,
12
,
m1
pxor
m5
,
m4
PALIGNR
m5
,
m2
,
m3
,
14
,
m0
psrldq
m5
,
14
PRED4x4_LOWPASS
m0
,
m6
,
m2
,
m5
pslldq
m5
,
12
pavgw
m2
,
m5
pxor
m1
,
m5
jmp
.
do_left
.
fix_lt_2
:
mova
m5
,
m3
pxor
m5
,
m2
pslldq
m5
,
14
psrldq
m5
,
14
pxor
m2
,
m5
test
r2
,
r2
jnz
.
do_top
.
fix_tr_1
:
mova
m5
,
m3
pxor
m5
,
m1
psrldq
m5
,
14
pslldq
m5
,
14
pxor
m1
,
m5
jmp
.
do_top
.
do_left
:
mova
m0
,
m4
PRED4x4_LOWPASS
m2
,
m1
,
m4
,
m3
mova
m7
,
m2
mova
m0
,
[
r0
-
16
]
mova
m3
,
[r0]
mova
m1
,
[
r0
+
16
]
mova
m2
,
m3
mova
m4
,
m3
PALIGNR
m2
,
m0
,
14
,
m0
PALIGNR
m1
,
m4
,
2
,
m4
test
r1
,
r1
jz
.
fix_lt_2
test
r2
,
r2
jz
.
fix_tr_1
.
do_top
PRED4x4_LOWPASS
m6
,
m2
,
m1
,
m3
lea
r1
,
[
r3
+
r3
*
2
]
mova
m2
,
m6
mova
m3
,
m6
PALIGNR
m3
,
m7
,
14
,
m0
PALIGNR
m6
,
m7
,
12
,
m1
mova
m4
,
m3
pavgw
m3
,
m2
lea
r2
,
[
r0
+
r3
*
4
]
PRED4x4_LOWPASS
m0
,
m6
,
m2
,
m4
mova
[
r0
+
r3
*
1
]
,
m3
mova
[
r0
+
r3
*
2
]
,
m0
mova
[
r0
+
r3
*
2
]
,
m0
mova
m5
,
m0
mova
[
r0
+
r3
*
1
]
,
m2
mova
m6
,
m3
pslldq
m6
,
m3
,
4
mova
m1
,
m7
pslldq
m1
,
m3
,
2
mova
m2
,
m1
PRED4x4_LOWPASS
m1
,
m3
,
m6
,
m1
pslldq
m2
,
2
PALIGNR
m2
,
m1
,
14
,
m4
mova
m3
,
m1
mova
[
r0
+
r1
*
1
]
,
m2
pslldq
m3
,
4
pslldq
m1
,
2
PRED4x4_LOWPASS
m0
,
m1
,
m3
,
m2
PALIGNR
m0
,
m1
,
14
,
m3
PALIGNR
m6
,
m0
,
14
,
m2
mova
[
r0
+
r3
*
4
]
,
m0
mova
[
r0
+
r1
*
1
]
,
m6
pslldq
m1
,
2
pslldq
m0
,
2
PALIGNR
m2
,
m1
,
14
,
m4
PALIGNR
m5
,
m0
,
14
,
m1
mova
[
r4
+
r3
*
1
]
,
m2
mova
[
r0
+
r3
*
4
]
,
m5
pslldq
m1
,
2
pslldq
m0
,
2
PALIGNR
m0
,
m1
,
14
,
m3
PALIGNR
m6
,
m0
,
14
,
m2
mova
[
r4
+
r3
*
2
]
,
m0
mova
[
r2
+
r3
*
1
]
,
m6
pslldq
m1
,
2
pslldq
m0
,
2
PALIGNR
m2
,
m1
,
14
,
m4
PALIGNR
m5
,
m0
,
14
,
m1
mova
[
r4
+
r1
*
1
]
,
m2
mova
[
r2
+
r3
*
2
]
,
m5
pslldq
m1
,
2
pslldq
m0
,
2
PALIGNR
m0
,
m1
,
14
,
m1
PALIGNR
m6
,
m0
,
14
,
m2
mova
[
r4
+
r3
*
4
]
,
m0
mova
[
r2
+
r1
*
1
]
,
m6
pslldq
m0
,
2
PALIGNR
m5
,
m0
,
14
,
m1
mova
[
r2
+
r3
*
4
]
,
m5
RET
RET
%endmacro
%endmacro
...
@@ -1211,84 +943,60 @@ INIT_XMM
...
@@ -1211,84 +943,60 @@ INIT_XMM
PRED8x8L_VERTICAL_RIGHT
sse2
PRED8x8L_VERTICAL_RIGHT
sse2
%define
PALIGNR
PALIGNR_SSSE3
%define
PALIGNR
PALIGNR_SSSE3
PRED8x8L_VERTICAL_RIGHT
ssse3
PRED8x8L_VERTICAL_RIGHT
ssse3
%ifdef
HAVE_AVX
INIT_AVX
PRED8x8L_VERTICAL_RIGHT
avx
%endif
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; void pred8x8l_horizontal_up(pixel *src, int has_topleft, int has_topright, int stride)
; void pred8x8l_horizontal_up(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
%macro
PRED8x8L_HORIZONTAL_UP
1
%macro
PRED8x8L_HORIZONTAL_UP
1
cglobal
pred8x8l_horizontal_up_10_
%1
,
4
,
4
,
8
cglobal
pred8x8l_horizontal_up_10_
%1
,
4
,
4
,
6
sub
r0
,
r3
lea
r2
,
[
r0
+
r3
*
2
]
mova
m0
,
[
r0
+
r3
*
1
-
16
]
test
r1
,
r1
lea
r1
,
[
r0
+
r3
]
cmovnz
r1
,
r0
punpckhwd
m0
,
[
r1
+
r3
*
0
-
16
]
mova
m1
,
[
r2
+
r3
*
1
-
16
]
punpckhwd
m1
,
[
r0
+
r3
*
2
-
16
]
mov
r2
,
r0
punpckhdq
m1
,
m0
lea
r0
,
[
r0
+
r3
*
4
]
mova
m2
,
[
r0
+
r3
*
1
-
16
]
punpckhwd
m2
,
[
r0
+
r3
*
0
-
16
]
lea
r0
,
[
r0
+
r3
*
2
]
mova
m3
,
[
r0
+
r3
*
1
-
16
]
punpckhwd
m3
,
[
r0
+
r3
*
0
-
16
]
punpckhdq
m3
,
m2
punpckhqdq
m3
,
m1
lea
r0
,
[
r0
+
r3
*
2
]
mova
m0
,
[
r0
+
r3
*
0
-
16
]
mova
m0
,
[
r0
+
r3
*
0
-
16
]
mova
m1
,
[
r1
+
r3
*
0
-
16
]
punpckhwd
m0
,
[
r0
+
r3
*
1
-
16
]
mov
r0
,
r2
shr
r1d
,
14
mova
m4
,
m3
dec
r1
mova
m2
,
m3
and
r1
,
r3
PALIGNR
m4
,
m0
,
14
,
m0
sub
r1
,
r3
PALIGNR
m1
,
m2
,
2
,
m2
mova
m4
,
[
r0
+
r1
*
1
-
16
]
mova
m0
,
m4
lea
r1
,
[
r3
*
3
]
PRED4x4_LOWPASS
m2
,
m1
,
m4
,
m3
mova
m4
,
m0
mova
m7
,
m2
PRED4x4_LOWPASS
m1
,
m3
,
m0
,
m4
pslldq
m1
,
14
PALIGNR
m7
,
m1
,
14
,
m3
lea
r1
,
[
r3
+
r3
*
2
]
pshufd
m0
,
m7
,
00011011
b
; l6 l7 l4 l5 l2 l3 l0 l1
pslldq
m7
,
14
; l7 .. .. .. .. .. .. ..
mova
m2
,
m0
pslld
m0
,
16
psrld
m2
,
16
por
m2
,
m0
; l7 l6 l5 l4 l3 l2 l1 l0
mova
m3
,
m2
mova
m4
,
m2
mova
m5
,
m2
psrldq
m2
,
2
psrldq
m3
,
4
lea
r2
,
[
r0
+
r3
*
4
]
lea
r2
,
[
r0
+
r3
*
4
]
por
m2
,
m7
; l7 l7 l6 l5 l4 l3 l2 l1
mova
m1
,
[
r0
+
r3
*
2
-
16
]
punpckhwd
m7
,
m7
punpckhwd
m1
,
[
r0
+
r1
*
1
-
16
]
por
m3
,
m7
; l7 l7 l7 l6 l5 l4 l3 l2
punpckhdq
m0
,
m1
pavgw
m4
,
m2
mova
m2
,
[
r2
+
r3
*
0
-
16
]
PRED4x4_LOWPASS
m1
,
m3
,
m5
,
m2
punpckhwd
m2
,
[
r2
+
r3
*
1
-
16
]
mova
m5
,
m4
mova
m3
,
[
r2
+
r3
*
2
-
16
]
punpcklwd
m4
,
m1
; p4 p3 p2 p1
punpckhwd
m3
,
[
r2
+
r1
*
1
-
16
]
punpckhwd
m5
,
m1
; p8 p7 p6 p5
punpckhdq
m2
,
m3
mova
m6
,
m5
punpckhqdq
m0
,
m2
mova
m7
,
m5
PALIGNR
m1
,
m0
,
m4
,
14
,
m4
mova
m0
,
m5
psrldq
m2
,
m0
,
2
PALIGNR
m5
,
m4
,
4
,
m1
pshufhw
m2
,
m2
,
10100100
b
pshufd
m1
,
m6
,
11111001
b
PRED4x4_LOWPASS
m0
,
m1
,
m2
,
m0
PALIGNR
m6
,
m4
,
8
,
m2
psrldq
m1
,
m0
,
2
pshufd
m2
,
m7
,
11111110
b
psrldq
m2
,
m0
,
4
PALIGNR
m7
,
m4
,
12
,
m3
pshufhw
m1
,
m1
,
10100100
b
pshufd
m3
,
m0
,
11111111
b
pshufhw
m2
,
m2
,
01010100
b
mova
[
r0
+
r3
*
1
]
,
m4
pavgw
m4
,
m0
,
m1
mova
[
r0
+
r3
*
2
]
,
m5
PRED4x4_LOWPASS
m1
,
m2
,
m0
,
m1
mova
[
r0
+
r1
*
1
]
,
m6
punpckhwd
m5
,
m4
,
m1
mova
[
r0
+
r3
*
4
]
,
m7
punpcklwd
m4
,
m1
mova
[
r2
+
r3
*
0
]
,
m5
mova
[
r0
+
r3
*
0
]
,
m4
pshufd
m0
,
m5
,
11111001
b
pshufd
m1
,
m5
,
11111110
b
pshufd
m2
,
m5
,
11111111
b
mova
[
r2
+
r3
*
1
]
,
m0
mova
[
r2
+
r3
*
1
]
,
m0
mova
[
r2
+
r3
*
2
]
,
m1
mova
[
r2
+
r3
*
2
]
,
m1
mova
[
r2
+
r1
*
1
]
,
m2
mova
[
r2
+
r1
*
1
]
,
m2
mova
[
r2
+
r3
*
4
]
,
m3
PALIGNR
m2
,
m5
,
m4
,
4
,
m0
PALIGNR
m3
,
m5
,
m4
,
8
,
m1
PALIGNR
m5
,
m5
,
m4
,
12
,
m4
mova
[
r0
+
r3
*
1
]
,
m2
mova
[
r0
+
r3
*
2
]
,
m3
mova
[
r0
+
r1
*
1
]
,
m5
RET
RET
%endmacro
%endmacro
...
@@ -1297,7 +1005,10 @@ INIT_XMM
...
@@ -1297,7 +1005,10 @@ INIT_XMM
PRED8x8L_HORIZONTAL_UP
sse2
PRED8x8L_HORIZONTAL_UP
sse2
%define
PALIGNR
PALIGNR_SSSE3
%define
PALIGNR
PALIGNR_SSSE3
PRED8x8L_HORIZONTAL_UP
ssse3
PRED8x8L_HORIZONTAL_UP
ssse3
%ifdef
HAVE_AVX
INIT_AVX
PRED8x8L_HORIZONTAL_UP
avx
%endif
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
...
@@ -1315,7 +1026,7 @@ PRED8x8L_HORIZONTAL_UP ssse3
...
@@ -1315,7 +1026,7 @@ PRED8x8L_HORIZONTAL_UP ssse3
%macro
PRED16x16_VERTICAL
1
%macro
PRED16x16_VERTICAL
1
cglobal
pred16x16_vertical_10_
%1
,
2
,
3
cglobal
pred16x16_vertical_10_
%1
,
2
,
3
sub
r0
,
r1
sub
r0
,
r1
mov
r2
,
8
mov
r2d
,
8
mova
m0
,
[
r0
+
0
]
mova
m0
,
[
r0
+
0
]
mova
m1
,
[
r0
+
mmsize
]
mova
m1
,
[
r0
+
mmsize
]
%if
mmsize
==
8
%if
mmsize
==
8
...
@@ -1326,7 +1037,7 @@ cglobal pred16x16_vertical_10_%1, 2,3
...
@@ -1326,7 +1037,7 @@ cglobal pred16x16_vertical_10_%1, 2,3
MOV16
r0
+
r1
*
1
,
m0
,
m1
,
m2
,
m3
MOV16
r0
+
r1
*
1
,
m0
,
m1
,
m2
,
m3
MOV16
r0
+
r1
*
2
,
m0
,
m1
,
m2
,
m3
MOV16
r0
+
r1
*
2
,
m0
,
m1
,
m2
,
m3
lea
r0
,
[
r0
+
r1
*
2
]
lea
r0
,
[
r0
+
r1
*
2
]
dec
r2
dec
r2
d
jg
.
loop
jg
.
loop
REP_RET
REP_RET
%endmacro
%endmacro
...
@@ -1341,7 +1052,7 @@ PRED16x16_VERTICAL sse2
...
@@ -1341,7 +1052,7 @@ PRED16x16_VERTICAL sse2
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
%macro
PRED16x16_HORIZONTAL
1
%macro
PRED16x16_HORIZONTAL
1
cglobal
pred16x16_horizontal_10_
%1
,
2
,
3
cglobal
pred16x16_horizontal_10_
%1
,
2
,
3
mov
r2
,
8
mov
r2d
,
8
.
vloop
:
.
vloop
:
movd
m0
,
[
r0
+
r1
*
0
-
4
]
movd
m0
,
[
r0
+
r1
*
0
-
4
]
movd
m1
,
[
r0
+
r1
*
1
-
4
]
movd
m1
,
[
r0
+
r1
*
1
-
4
]
...
@@ -1350,7 +1061,7 @@ cglobal pred16x16_horizontal_10_%1, 2,3
...
@@ -1350,7 +1061,7 @@ cglobal pred16x16_horizontal_10_%1, 2,3
MOV16
r0
+
r1
*
0
,
m0
,
m0
,
m0
,
m0
MOV16
r0
+
r1
*
0
,
m0
,
m0
,
m0
,
m0
MOV16
r0
+
r1
*
1
,
m1
,
m1
,
m1
,
m1
MOV16
r0
+
r1
*
1
,
m1
,
m1
,
m1
,
m1
lea
r0
,
[
r0
+
r1
*
2
]
lea
r0
,
[
r0
+
r1
*
2
]
dec
r2
dec
r2
d
jg
.
vloop
jg
.
vloop
REP_RET
REP_RET
%endmacro
%endmacro
...
@@ -1364,8 +1075,8 @@ PRED16x16_HORIZONTAL sse2
...
@@ -1364,8 +1075,8 @@ PRED16x16_HORIZONTAL sse2
; void pred16x16_dc(pixel *src, int stride)
; void pred16x16_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
%macro
PRED16x16_DC
1
%macro
PRED16x16_DC
1
cglobal
pred16x16_dc_10_
%1
,
2
,
7
cglobal
pred16x16_dc_10_
%1
,
2
,
6
mov
r
4
,
r0
mov
r
5
,
r0
sub
r0
,
r1
sub
r0
,
r1
mova
m0
,
[
r0
+
0
]
mova
m0
,
[
r0
+
0
]
paddw
m0
,
[
r0
+
mmsize
]
paddw
m0
,
[
r0
+
mmsize
]
...
@@ -1375,17 +1086,17 @@ cglobal pred16x16_dc_10_%1, 2,7
...
@@ -1375,17 +1086,17 @@ cglobal pred16x16_dc_10_%1, 2,7
%endif
%endif
HADDW
m0
,
m2
HADDW
m0
,
m2
sub
r0
,
2
lea
r0
,
[
r0
+
r1
-
2
]
movzx
r3d
,
word
[
r0
+
r1
*
1
]
movzx
r3d
,
word
[r0]
movzx
r
5d
,
word
[
r0
+
r1
*
2
]
movzx
r
4d
,
word
[
r0
+
r1
]
%rep
7
%rep
7
lea
r0
,
[
r0
+
r1
*
2
]
lea
r0
,
[
r0
+
r1
*
2
]
movzx
r2d
,
word
[
r0
+
r1
*
1
]
movzx
r2d
,
word
[r0]
add
r3d
,
r2d
add
r3d
,
r2d
movzx
r2d
,
word
[
r0
+
r1
*
2
]
movzx
r2d
,
word
[
r0
+
r1
]
add
r
5
d
,
r2d
add
r
4
d
,
r2d
%endrep
%endrep
lea
r3d
,
[
r3
+
r
5
+
16
]
lea
r3d
,
[
r3
+
r
4
+
16
]
movd
m1
,
r3d
movd
m1
,
r3d
paddw
m0
,
m1
paddw
m0
,
m1
...
@@ -1393,9 +1104,9 @@ cglobal pred16x16_dc_10_%1, 2,7
...
@@ -1393,9 +1104,9 @@ cglobal pred16x16_dc_10_%1, 2,7
SPLATW
m0
,
m0
SPLATW
m0
,
m0
mov
r3d
,
8
mov
r3d
,
8
.
loop
:
.
loop
:
MOV16
r
4
+
r1
*
0
,
m0
,
m0
,
m0
,
m0
MOV16
r
5
+
r1
*
0
,
m0
,
m0
,
m0
,
m0
MOV16
r
4
+
r1
*
1
,
m0
,
m0
,
m0
,
m0
MOV16
r
5
+
r1
*
1
,
m0
,
m0
,
m0
,
m0
lea
r
4
,
[
r4
+
r1
*
2
]
lea
r
5
,
[
r5
+
r1
*
2
]
dec
r3d
dec
r3d
jg
.
loop
jg
.
loop
REP_RET
REP_RET
...
@@ -1442,29 +1153,29 @@ PRED16x16_TOP_DC sse2
...
@@ -1442,29 +1153,29 @@ PRED16x16_TOP_DC sse2
; void pred16x16_left_dc(pixel *src, int stride)
; void pred16x16_left_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
%macro
PRED16x16_LEFT_DC
1
%macro
PRED16x16_LEFT_DC
1
cglobal
pred16x16_left_dc_10_
%1
,
2
,
7
cglobal
pred16x16_left_dc_10_
%1
,
2
,
6
mov
r
4
,
r0
mov
r
5
,
r0
sub
r0
,
2
sub
r0
,
2
movzx
r
5d
,
word
[
r0
+
r1
*
0
]
movzx
r
3d
,
word
[r
0]
movzx
r
6d
,
word
[
r0
+
r1
*
1
]
movzx
r
4d
,
word
[
r0
+
r
1
]
%rep
7
%rep
7
lea
r0
,
[
r0
+
r1
*
2
]
lea
r0
,
[
r0
+
r1
*
2
]
movzx
r2d
,
word
[
r0
+
r1
*
0
]
movzx
r2d
,
word
[r0]
movzx
r3d
,
word
[
r0
+
r1
*
1
]
add
r3d
,
r2d
add
r5d
,
r2d
movzx
r2d
,
word
[
r0
+
r1
]
add
r
6d
,
r3
d
add
r
4d
,
r2
d
%endrep
%endrep
lea
r
2d
,
[
r5
+
r6
+
8
]
lea
r
3d
,
[
r3
+
r4
+
8
]
shr
r
2
d
,
4
shr
r
3
d
,
4
movd
m0
,
r
2
d
movd
m0
,
r
3
d
SPLATW
m0
,
m0
SPLATW
m0
,
m0
mov
r3d
,
8
mov
r3d
,
8
.
loop
:
.
loop
:
MOV16
r
4
+
r1
*
0
,
m0
,
m0
,
m0
,
m0
MOV16
r
5
+
r1
*
0
,
m0
,
m0
,
m0
,
m0
MOV16
r
4
+
r1
*
1
,
m0
,
m0
,
m0
,
m0
MOV16
r
5
+
r1
*
1
,
m0
,
m0
,
m0
,
m0
lea
r
4
,
[
r4
+
r1
*
2
]
lea
r
5
,
[
r5
+
r1
*
2
]
dec
r3d
dec
r3d
jg
.
loop
jg
.
loop
REP_RET
REP_RET
...
...
libavcodec/x86/h264_intrapred_init.c
View file @
406fbd24
...
@@ -45,7 +45,6 @@ void ff_pred8x8_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, int stride);
...
@@ -45,7 +45,6 @@ void ff_pred8x8_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, int stride);
PRED8x8
(
dc
,
10
,
mmxext
)
PRED8x8
(
dc
,
10
,
mmxext
)
PRED8x8
(
dc
,
10
,
sse2
)
PRED8x8
(
dc
,
10
,
sse2
)
PRED8x8
(
top_dc
,
10
,
mmxext
)
PRED8x8
(
top_dc
,
10
,
sse2
)
PRED8x8
(
top_dc
,
10
,
sse2
)
PRED8x8
(
plane
,
10
,
sse2
)
PRED8x8
(
plane
,
10
,
sse2
)
PRED8x8
(
vertical
,
10
,
sse2
)
PRED8x8
(
vertical
,
10
,
sse2
)
...
@@ -55,23 +54,28 @@ PRED8x8(horizontal, 10, sse2)
...
@@ -55,23 +54,28 @@ PRED8x8(horizontal, 10, sse2)
void ff_pred8x8l_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, int has_topleft, int has_topright, int stride);
void ff_pred8x8l_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, int has_topleft, int has_topright, int stride);
PRED8x8L
(
dc
,
10
,
sse2
)
PRED8x8L
(
dc
,
10
,
sse2
)
PRED8x8L
(
dc
,
10
,
ssse3
)
PRED8x8L
(
dc
,
10
,
avx
)
PRED8x8L
(
128
_dc
,
10
,
mmxext
)
PRED8x8L
(
128
_dc
,
10
,
mmxext
)
PRED8x8L
(
128
_dc
,
10
,
sse2
)
PRED8x8L
(
128
_dc
,
10
,
sse2
)
PRED8x8L
(
top_dc
,
10
,
sse2
)
PRED8x8L
(
top_dc
,
10
,
sse2
)
PRED8x8L
(
top_dc
,
10
,
ssse3
)
PRED8x8L
(
top_dc
,
10
,
avx
)
PRED8x8L
(
vertical
,
10
,
sse2
)
PRED8x8L
(
vertical
,
10
,
sse2
)
PRED8x8L
(
vertical
,
10
,
ssse3
)
PRED8x8L
(
vertical
,
10
,
avx
)
PRED8x8L
(
horizontal
,
10
,
sse2
)
PRED8x8L
(
horizontal
,
10
,
sse2
)
PRED8x8L
(
horizontal
,
10
,
ssse3
)
PRED8x8L
(
horizontal
,
10
,
ssse3
)
PRED8x8L
(
horizontal
,
10
,
avx
)
PRED8x8L
(
down_left
,
10
,
sse2
)
PRED8x8L
(
down_left
,
10
,
sse2
)
PRED8x8L
(
down_left
,
10
,
ssse3
)
PRED8x8L
(
down_left
,
10
,
ssse3
)
PRED8x8L
(
down_left
,
10
,
avx
)
PRED8x8L
(
down_right
,
10
,
sse2
)
PRED8x8L
(
down_right
,
10
,
sse2
)
PRED8x8L
(
down_right
,
10
,
ssse3
)
PRED8x8L
(
down_right
,
10
,
ssse3
)
PRED8x8L
(
down_right
,
10
,
avx
)
PRED8x8L
(
vertical_right
,
10
,
sse2
)
PRED8x8L
(
vertical_right
,
10
,
sse2
)
PRED8x8L
(
vertical_right
,
10
,
ssse3
)
PRED8x8L
(
vertical_right
,
10
,
ssse3
)
PRED8x8L
(
vertical_right
,
10
,
avx
)
PRED8x8L
(
horizontal_up
,
10
,
sse2
)
PRED8x8L
(
horizontal_up
,
10
,
sse2
)
PRED8x8L
(
horizontal_up
,
10
,
ssse3
)
PRED8x8L
(
horizontal_up
,
10
,
ssse3
)
PRED8x8L
(
horizontal_up
,
10
,
avx
)
#define PRED16x16(TYPE, DEPTH, OPT)\
#define PRED16x16(TYPE, DEPTH, OPT)\
void ff_pred16x16_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, int stride);
void ff_pred16x16_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, int stride);
...
@@ -298,7 +302,6 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth
...
@@ -298,7 +302,6 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth
h
->
pred4x4
[
HOR_UP_PRED
]
=
ff_pred4x4_horizontal_up_10_mmxext
;
h
->
pred4x4
[
HOR_UP_PRED
]
=
ff_pred4x4_horizontal_up_10_mmxext
;
h
->
pred8x8
[
DC_PRED8x8
]
=
ff_pred8x8_dc_10_mmxext
;
h
->
pred8x8
[
DC_PRED8x8
]
=
ff_pred8x8_dc_10_mmxext
;
h
->
pred8x8
[
TOP_DC_PRED8x8
]
=
ff_pred8x8_top_dc_10_mmxext
;
h
->
pred8x8l
[
DC_128_PRED
]
=
ff_pred8x8l_128_dc_10_mmxext
;
h
->
pred8x8l
[
DC_128_PRED
]
=
ff_pred8x8l_128_dc_10_mmxext
;
...
@@ -344,18 +347,28 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth
...
@@ -344,18 +347,28 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth
h
->
pred4x4
[
VERT_RIGHT_PRED
]
=
ff_pred4x4_vertical_right_10_ssse3
;
h
->
pred4x4
[
VERT_RIGHT_PRED
]
=
ff_pred4x4_vertical_right_10_ssse3
;
h
->
pred4x4
[
HOR_DOWN_PRED
]
=
ff_pred4x4_horizontal_down_10_ssse3
;
h
->
pred4x4
[
HOR_DOWN_PRED
]
=
ff_pred4x4_horizontal_down_10_ssse3
;
h
->
pred8x8l
[
VERT_PRED
]
=
ff_pred8x8l_vertical_10_ssse3
;
h
->
pred8x8l
[
HOR_PRED
]
=
ff_pred8x8l_horizontal_10_ssse3
;
h
->
pred8x8l
[
HOR_PRED
]
=
ff_pred8x8l_horizontal_10_ssse3
;
h
->
pred8x8l
[
DC_PRED
]
=
ff_pred8x8l_dc_10_ssse3
;
h
->
pred8x8l
[
TOP_DC_PRED
]
=
ff_pred8x8l_top_dc_10_ssse3
;
h
->
pred8x8l
[
DIAG_DOWN_LEFT_PRED
]
=
ff_pred8x8l_down_left_10_ssse3
;
h
->
pred8x8l
[
DIAG_DOWN_LEFT_PRED
]
=
ff_pred8x8l_down_left_10_ssse3
;
h
->
pred8x8l
[
DIAG_DOWN_RIGHT_PRED
]
=
ff_pred8x8l_down_right_10_ssse3
;
h
->
pred8x8l
[
VERT_RIGHT_PRED
]
=
ff_pred8x8l_vertical_right_10_ssse3
;
h
->
pred8x8l
[
HOR_UP_PRED
]
=
ff_pred8x8l_horizontal_up_10_ssse3
;
}
}
#if HAVE_AVX
#if HAVE_AVX
if
(
mm_flags
&
AV_CPU_FLAG_AVX
)
{
if
(
mm_flags
&
AV_CPU_FLAG_AVX
)
{
h
->
pred4x4
[
DIAG_DOWN_LEFT_PRED
]
=
ff_pred4x4_down_left_10_avx
;
h
->
pred4x4
[
DIAG_DOWN_LEFT_PRED
]
=
ff_pred4x4_down_left_10_avx
;
h
->
pred4x4
[
DIAG_DOWN_RIGHT_PRED
]
=
ff_pred4x4_down_right_10_avx
;
h
->
pred4x4
[
DIAG_DOWN_RIGHT_PRED
]
=
ff_pred4x4_down_right_10_avx
;
h
->
pred4x4
[
VERT_LEFT_PRED
]
=
ff_pred4x4_vertical_left_10_avx
;
h
->
pred4x4
[
VERT_RIGHT_PRED
]
=
ff_pred4x4_vertical_right_10_avx
;
h
->
pred4x4
[
VERT_RIGHT_PRED
]
=
ff_pred4x4_vertical_right_10_avx
;
h
->
pred4x4
[
HOR_DOWN_PRED
]
=
ff_pred4x4_horizontal_down_10_avx
;
h
->
pred4x4
[
HOR_DOWN_PRED
]
=
ff_pred4x4_horizontal_down_10_avx
;
h
->
pred8x8l
[
VERT_PRED
]
=
ff_pred8x8l_vertical_10_avx
;
h
->
pred8x8l
[
HOR_PRED
]
=
ff_pred8x8l_horizontal_10_avx
;
h
->
pred8x8l
[
DC_PRED
]
=
ff_pred8x8l_dc_10_avx
;
h
->
pred8x8l
[
TOP_DC_PRED
]
=
ff_pred8x8l_top_dc_10_avx
;
h
->
pred8x8l
[
DIAG_DOWN_RIGHT_PRED
]
=
ff_pred8x8l_down_right_10_avx
;
h
->
pred8x8l
[
DIAG_DOWN_LEFT_PRED
]
=
ff_pred8x8l_down_left_10_avx
;
h
->
pred8x8l
[
VERT_RIGHT_PRED
]
=
ff_pred8x8l_vertical_right_10_avx
;
h
->
pred8x8l
[
HOR_UP_PRED
]
=
ff_pred8x8l_horizontal_up_10_avx
;
}
}
#endif
/* HAVE_AVX */
#endif
/* HAVE_AVX */
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment