Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
F
ffmpeg.wasm-core
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Linshizhi
ffmpeg.wasm-core
Commits
bdc1e3e3
Commit
bdc1e3e3
authored
Dec 16, 2014
by
Ronald S. Bultje
Committed by
Michael Niedermayer
Dec 19, 2014
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
vp9/x86: intra prediction sse2/32bit support.
Signed-off-by:
Michael Niedermayer
<
michaelni@gmx.at
>
parent
b6e17112
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
805 additions
and
270 deletions
+805
-270
constants.c
libavcodec/x86/constants.c
+1
-1
constants.h
libavcodec/x86/constants.h
+1
-1
vp9dsp_init.c
libavcodec/x86/vp9dsp_init.c
+104
-65
vp9intrapred.asm
libavcodec/x86/vp9intrapred.asm
+699
-203
No files found.
libavcodec/x86/constants.c
View file @
bdc1e3e3
...
...
@@ -40,7 +40,7 @@ DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL;
DECLARE_ALIGNED
(
16
,
const
xmm_reg
,
ff_pw_64
)
=
{
0x0040004000400040ULL
,
0x0040004000400040ULL
};
DECLARE_ALIGNED
(
8
,
const
uint64_t
,
ff_pw_96
)
=
0x0060006000600060ULL
;
DECLARE_ALIGNED
(
8
,
const
uint64_t
,
ff_pw_128
)
=
0x0080008000800080ULL
;
DECLARE_ALIGNED
(
8
,
const
uint64_t
,
ff_pw_255
)
=
0x00ff00ff00ff00ffULL
;
DECLARE_ALIGNED
(
8
,
const
xmm_reg
,
ff_pw_255
)
=
{
0x00ff00ff00ff00ffULL
,
0x00ff00ff00ff00ffULL
}
;
DECLARE_ALIGNED
(
32
,
const
ymm_reg
,
ff_pw_256
)
=
{
0x0100010001000100ULL
,
0x0100010001000100ULL
,
0x0100010001000100ULL
,
0x0100010001000100ULL
};
DECLARE_ALIGNED
(
16
,
const
xmm_reg
,
ff_pw_512
)
=
{
0x0200020002000200ULL
,
0x0200020002000200ULL
};
...
...
libavcodec/x86/constants.h
View file @
bdc1e3e3
...
...
@@ -42,7 +42,7 @@ extern const uint64_t ff_pw_53;
extern
const
xmm_reg
ff_pw_64
;
extern
const
uint64_t
ff_pw_96
;
extern
const
uint64_t
ff_pw_128
;
extern
const
uint64_t
ff_pw_255
;
extern
const
xmm_reg
ff_pw_255
;
extern
const
xmm_reg
ff_pw_512
;
extern
const
xmm_reg
ff_pw_1024
;
extern
const
xmm_reg
ff_pw_2048
;
...
...
libavcodec/x86/vp9dsp_init.c
View file @
bdc1e3e3
...
...
@@ -243,40 +243,58 @@ lpf_funcs(88, 16, avx);
void ff_vp9_ipred_##type##_##size##x##size##_##opt(uint8_t *dst, ptrdiff_t stride, \
const uint8_t *l, const uint8_t *a)
#define ipred_funcs(type, opt) \
ipred_func(4, type, opt); \
ipred_func(8, type, opt); \
ipred_func(16, type, opt); \
ipred_func(32, type, opt)
ipred_func
(
8
,
v
,
mmx
);
ipred_funcs
(
dc
,
ssse3
);
ipred_funcs
(
dc_left
,
ssse3
);
ipred_funcs
(
dc_top
,
ssse3
);
#define ipred_dc_funcs(size, opt) \
ipred_func(size, dc, opt); \
ipred_func(size, dc_left, opt); \
ipred_func(size, dc_top, opt)
#undef ipred_funcs
ipred_dc_funcs
(
4
,
mmxext
);
ipred_dc_funcs
(
8
,
mmxext
);
ipred_func
(
8
,
v
,
mmx
);
ipred_func
(
16
,
v
,
sse2
);
ipred_func
(
32
,
v
,
sse2
);
#define ipred_func_set(size, type, opt1, opt2) \
ipred_func(size, type, opt1); \
ipred_func(size, type, opt2)
#define ipred_funcs(type, opt1, opt2) \
ipred_func(4, type, opt1); \
ipred_func_set(8, type, opt1, opt2); \
ipred_func_set(16, type, opt1, opt2); \
ipred_func_set(32, type, opt1, opt2)
ipred_funcs
(
h
,
ssse3
,
avx
);
ipred_funcs
(
tm
,
ssse3
,
avx
);
ipred_funcs
(
dl
,
ssse3
,
avx
);
ipred_funcs
(
dr
,
ssse3
,
avx
);
ipred_funcs
(
hu
,
ssse3
,
avx
);
ipred_funcs
(
hd
,
ssse3
,
avx
);
ipred_funcs
(
vl
,
ssse3
,
avx
);
ipred_funcs
(
vr
,
ssse3
,
avx
);
#define ipred_dir_tm_funcs(size, opt) \
ipred_func(size, tm, opt); \
ipred_func(size, dl, opt); \
ipred_func(size, dr, opt); \
ipred_func(size, hd, opt); \
ipred_func(size, hu, opt); \
ipred_func(size, vl, opt); \
ipred_func(size, vr, opt)
ipred_dir_tm_funcs
(
4
,
mmxext
);
ipred_func
(
16
,
v
,
sse
);
ipred_func
(
32
,
v
,
sse
);
ipred_dc_funcs
(
16
,
sse2
);
ipred_dc_funcs
(
32
,
sse2
);
#define ipred_dir_tm_h_funcs(size, opt) \
ipred_dir_tm_funcs(size, opt); \
ipred_func(size, h, opt)
ipred_dir_tm_h_funcs
(
8
,
sse2
);
ipred_dir_tm_h_funcs
(
16
,
sse2
);
ipred_dir_tm_h_funcs
(
32
,
sse2
);
ipred_func
(
4
,
h
,
sse2
);
#define ipred_all_funcs(size, opt) \
ipred_dc_funcs(size, opt); \
ipred_dir_tm_h_funcs(size, opt)
// FIXME hd/vl_4x4_ssse3 does not exist
ipred_all_funcs
(
4
,
ssse3
);
ipred_all_funcs
(
8
,
ssse3
);
ipred_all_funcs
(
16
,
ssse3
);
ipred_all_funcs
(
32
,
ssse3
);
ipred_dir_tm_h_funcs
(
8
,
avx
);
ipred_dir_tm_h_funcs
(
16
,
avx
);
ipred_dir_tm_h_funcs
(
32
,
avx
);
ipred_func
(
32
,
v
,
avx
);
ipred_func
(
32
,
dc
,
avx2
);
ipred_func
(
32
,
dc_left
,
avx2
);
...
...
@@ -285,9 +303,14 @@ ipred_func(32, v, avx2);
ipred_func
(
32
,
h
,
avx2
);
ipred_func
(
32
,
tm
,
avx2
);
#undef ipred_funcs
#undef ipred_func_set
ipred_dc_funcs
(
32
,
avx2
);
ipred_func
(
32
,
h
,
avx2
);
ipred_func
(
32
,
tm
,
avx2
);
#undef ipred_func
#undef ipred_dir_tm_h_funcs
#undef ipred_dir_tm_funcs
#undef ipred_dc_funcs
#endif
/* HAVE_YASM */
...
...
@@ -340,23 +363,32 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
} \
} while (0)
#define init_ipred(tx, sz, opt) do { \
dsp->intra_pred[tx][HOR_PRED] = ff_vp9_ipred_h_##sz##x##sz##_##opt; \
dsp->intra_pred[tx][DIAG_DOWN_LEFT_PRED] = ff_vp9_ipred_dl_##sz##x##sz##_##opt; \
dsp->intra_pred[tx][DIAG_DOWN_RIGHT_PRED] = ff_vp9_ipred_dr_##sz##x##sz##_##opt; \
dsp->intra_pred[tx][HOR_DOWN_PRED] = ff_vp9_ipred_hd_##sz##x##sz##_##opt; \
dsp->intra_pred[tx][VERT_LEFT_PRED] = ff_vp9_ipred_vl_##sz##x##sz##_##opt; \
dsp->intra_pred[tx][HOR_UP_PRED] = ff_vp9_ipred_hu_##sz##x##sz##_##opt; \
if (ARCH_X86_64 || tx != TX_32X32) { \
dsp->intra_pred[tx][VERT_RIGHT_PRED] = ff_vp9_ipred_vr_##sz##x##sz##_##opt; \
dsp->intra_pred[tx][TM_VP8_PRED] = ff_vp9_ipred_tm_##sz##x##sz##_##opt; \
} \
#define init_ipred(sz, opt, t, e) \
dsp->intra_pred[TX_##sz##X##sz][e##_PRED] = ff_vp9_ipred_##t##_##sz##x##sz##_##opt
#define ff_vp9_ipred_hd_4x4_ssse3 ff_vp9_ipred_hd_4x4_mmxext
#define ff_vp9_ipred_vl_4x4_ssse3 ff_vp9_ipred_vl_4x4_mmxext
#define init_dir_tm_ipred(sz, opt) do { \
init_ipred(sz, opt, dl, DIAG_DOWN_LEFT); \
init_ipred(sz, opt, dr, DIAG_DOWN_RIGHT); \
init_ipred(sz, opt, hd, HOR_DOWN); \
init_ipred(sz, opt, vl, VERT_LEFT); \
init_ipred(sz, opt, hu, HOR_UP); \
init_ipred(sz, opt, tm, TM_VP8); \
init_ipred(sz, opt, vr, VERT_RIGHT); \
} while (0)
#define init_dir_tm_h_ipred(sz, opt) do { \
init_dir_tm_ipred(sz, opt); \
init_ipred(sz, opt, h, HOR); \
} while (0)
#define init_dc_ipred(sz, opt) do { \
init_ipred(sz, opt, dc, DC); \
init_ipred(sz, opt, dc_left, LEFT_DC); \
init_ipred(sz, opt, dc_top, TOP_DC); \
} while (0)
#define init_dc_ipred(tx, sz, opt) do { \
init_ipred(tx, sz, opt); \
dsp->intra_pred[tx][DC_PRED] = ff_vp9_ipred_dc_##sz##x##sz##_##opt; \
dsp->intra_pred[tx][LEFT_DC_PRED] = ff_vp9_ipred_dc_left_##sz##x##sz##_##opt; \
dsp->intra_pred[tx][TOP_DC_PRED] = ff_vp9_ipred_dc_top_##sz##x##sz##_##opt; \
#define init_all_ipred(sz, opt) do { \
init_dc_ipred(sz, opt); \
init_dir_tm_h_ipred(sz, opt); \
} while (0)
if
(
EXTERNAL_MMX
(
cpu_flags
))
{
...
...
@@ -366,7 +398,7 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
dsp
->
itxfm_add
[
4
/* lossless */
][
ADST_DCT
]
=
dsp
->
itxfm_add
[
4
/* lossless */
][
DCT_ADST
]
=
dsp
->
itxfm_add
[
4
/* lossless */
][
ADST_ADST
]
=
ff_vp9_iwht_iwht_4x4_add_mmx
;
dsp
->
intra_pred
[
TX_8X8
][
VERT_PRED
]
=
ff_vp9_ipred_v_8x8_mmx
;
init_ipred
(
8
,
mmx
,
v
,
VERT
)
;
}
if
(
EXTERNAL_MMXEXT
(
cpu_flags
))
{
...
...
@@ -375,12 +407,17 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
init_fpel
(
4
,
1
,
4
,
avg
,
mmxext
);
init_fpel
(
3
,
1
,
8
,
avg
,
mmxext
);
dsp
->
itxfm_add
[
TX_4X4
][
DCT_DCT
]
=
ff_vp9_idct_idct_4x4_add_mmxext
;
init_dc_ipred
(
4
,
mmxext
);
init_dc_ipred
(
8
,
mmxext
);
init_dir_tm_ipred
(
4
,
mmxext
);
}
if
(
EXTERNAL_SSE
(
cpu_flags
))
{
init_fpel
(
2
,
0
,
16
,
put
,
sse
);
init_fpel
(
1
,
0
,
32
,
put
,
sse
);
init_fpel
(
0
,
0
,
64
,
put
,
sse
);
init_ipred
(
16
,
sse
,
v
,
VERT
);
init_ipred
(
32
,
sse
,
v
,
VERT
);
}
if
(
EXTERNAL_SSE2
(
cpu_flags
))
{
...
...
@@ -405,8 +442,12 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
dsp
->
itxfm_add
[
TX_32X32
][
ADST_DCT
]
=
dsp
->
itxfm_add
[
TX_32X32
][
DCT_ADST
]
=
dsp
->
itxfm_add
[
TX_32X32
][
DCT_DCT
]
=
ff_vp9_idct_idct_32x32_add_sse2
;
dsp
->
intra_pred
[
TX_16X16
][
VERT_PRED
]
=
ff_vp9_ipred_v_16x16_sse2
;
dsp
->
intra_pred
[
TX_32X32
][
VERT_PRED
]
=
ff_vp9_ipred_v_32x32_sse2
;
init_dc_ipred
(
16
,
sse2
);
init_dc_ipred
(
32
,
sse2
);
init_dir_tm_h_ipred
(
8
,
sse2
);
init_dir_tm_h_ipred
(
16
,
sse2
);
init_dir_tm_h_ipred
(
32
,
sse2
);
init_ipred
(
4
,
sse2
,
h
,
HOR
);
}
if
(
EXTERNAL_SSSE3
(
cpu_flags
))
{
...
...
@@ -429,10 +470,10 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
dsp
->
itxfm_add
[
TX_32X32
][
DCT_ADST
]
=
dsp
->
itxfm_add
[
TX_32X32
][
DCT_DCT
]
=
ff_vp9_idct_idct_32x32_add_ssse3
;
init_lpf
(
ssse3
);
init_
dc_ipred
(
TX_4X4
,
4
,
ssse3
);
init_
dc_ipred
(
TX_8X8
,
8
,
ssse3
);
init_
dc_ipred
(
TX_16X16
,
16
,
ssse3
);
init_
dc_ipred
(
TX_32X32
,
32
,
ssse3
);
init_
all_ipred
(
4
,
ssse3
);
init_
all_ipred
(
8
,
ssse3
);
init_
all_ipred
(
16
,
ssse3
);
init_
all_ipred
(
32
,
ssse3
);
}
if
(
EXTERNAL_AVX
(
cpu_flags
))
{
...
...
@@ -451,9 +492,10 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
init_fpel
(
1
,
0
,
32
,
put
,
avx
);
init_fpel
(
0
,
0
,
64
,
put
,
avx
);
init_lpf
(
avx
);
init_ipred
(
TX_8X8
,
8
,
avx
);
init_ipred
(
TX_16X16
,
16
,
avx
);
init_ipred
(
TX_32X32
,
32
,
avx
);
init_dir_tm_h_ipred
(
8
,
avx
);
init_dir_tm_h_ipred
(
16
,
avx
);
init_dir_tm_h_ipred
(
32
,
avx
);
init_ipred
(
32
,
avx
,
v
,
VERT
);
}
if
(
EXTERNAL_AVX2
(
cpu_flags
))
{
...
...
@@ -465,12 +507,9 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
init_subpel3_32_64
(
1
,
avg
,
avx2
);
#endif
}
dsp
->
intra_pred
[
TX_32X32
][
DC_PRED
]
=
ff_vp9_ipred_dc_32x32_avx2
;
dsp
->
intra_pred
[
TX_32X32
][
LEFT_DC_PRED
]
=
ff_vp9_ipred_dc_left_32x32_avx2
;
dsp
->
intra_pred
[
TX_32X32
][
TOP_DC_PRED
]
=
ff_vp9_ipred_dc_top_32x32_avx2
;
dsp
->
intra_pred
[
TX_32X32
][
VERT_PRED
]
=
ff_vp9_ipred_v_32x32_avx2
;
dsp
->
intra_pred
[
TX_32X32
][
HOR_PRED
]
=
ff_vp9_ipred_h_32x32_avx2
;
dsp
->
intra_pred
[
TX_32X32
][
TM_VP8_PRED
]
=
ff_vp9_ipred_tm_32x32_avx2
;
init_dc_ipred
(
32
,
avx2
);
init_ipred
(
32
,
avx2
,
h
,
HOR
);
init_ipred
(
32
,
avx2
,
tm
,
TM_VP8
);
}
#undef init_fpel
...
...
libavcodec/x86/vp9intrapred.asm
View file @
bdc1e3e3
...
...
@@ -66,11 +66,23 @@ pb_02468ACE_13579BDF: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
pb_2
:
times
32
db
2
pb_15
:
times
16
db
15
pb_15x0_1xm1
:
times
15
db
0
db
-
1
pb_0to2_5x3
:
db
0
,
1
,
2
times
5
db
3
pb_6xm1_2x0
:
times
6
db
-
1
times
2
db
0
pb_6x0_2xm1
:
times
6
db
0
times
2
db
-
1
cextern
pb_1
cextern
pb_3
cextern
pw_2
cextern
pw_4
cextern
pw_8
cextern
pw_16
cextern
pw_32
cextern
pw_255
cextern
pw_512
cextern
pw_1024
cextern
pw_2048
...
...
@@ -80,14 +92,21 @@ SECTION .text
; dc_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
INIT_MMX
ssse3
%macro
DC_4to8_FUNCS
0
cglobal
vp9_ipred_dc_4x4
,
4
,
4
,
0
,
dst
,
stride
,
l
,
a
movd
m0
,
[lq]
punpckldq
m0
,
[aq]
pxor
m1
,
m1
psadbw
m0
,
m1
%if
cpuflag
(
ssse3
)
pmulhrsw
m0
,
[
pw_4096
]
pshufb
m0
,
m1
%else
paddw
m0
,
[
pw_4
]
psraw
m0
,
3
punpcklbw
m0
,
m0
pshufw
m0
,
m0
,
q0000
%endif
movd
[
dstq
+
strideq
*
0
]
,
m0
movd
[
dstq
+
strideq
*
1
]
,
m0
lea
dstq
,
[
dstq
+
strideq
*
2
]
...
...
@@ -95,7 +114,6 @@ cglobal vp9_ipred_dc_4x4, 4, 4, 0, dst, stride, l, a
movd
[
dstq
+
strideq
*
1
]
,
m0
RET
INIT_MMX
ssse3
cglobal
vp9_ipred_dc_8x8
,
4
,
4
,
0
,
dst
,
stride
,
l
,
a
movq
m0
,
[lq]
movq
m1
,
[aq]
...
...
@@ -105,8 +123,15 @@ cglobal vp9_ipred_dc_8x8, 4, 4, 0, dst, stride, l, a
psadbw
m0
,
m2
psadbw
m1
,
m2
paddw
m0
,
m1
%if
cpuflag
(
ssse3
)
pmulhrsw
m0
,
[
pw_2048
]
pshufb
m0
,
m2
%else
paddw
m0
,
[
pw_8
]
psraw
m0
,
4
punpcklbw
m0
,
m0
pshufw
m0
,
m0
,
q0000
%endif
movq
[
dstq
+
strideq
*
0
]
,
m0
movq
[
dstq
+
strideq
*
1
]
,
m0
movq
[
dstq
+
strideq
*
2
]
,
m0
...
...
@@ -117,8 +142,14 @@ cglobal vp9_ipred_dc_8x8, 4, 4, 0, dst, stride, l, a
movq
[
dstq
+
strideq
*
2
]
,
m0
movq
[
dstq
+
stride3q
]
,
m0
RET
%endmacro
INIT_XMM
ssse3
INIT_MMX
mmxext
DC_4to8_FUNCS
INIT_MMX
ssse3
DC_4to8_FUNCS
%macro
DC_16to32_FUNCS
0
cglobal
vp9_ipred_dc_16x16
,
4
,
4
,
3
,
dst
,
stride
,
l
,
a
mova
m0
,
[lq]
mova
m1
,
[aq]
...
...
@@ -130,8 +161,16 @@ cglobal vp9_ipred_dc_16x16, 4, 4, 3, dst, stride, l, a
paddw
m0
,
m1
movhlps
m1
,
m0
paddw
m0
,
m1
%if
cpuflag
(
ssse3
)
pmulhrsw
m0
,
[
pw_1024
]
pshufb
m0
,
m2
%else
paddw
m0
,
[
pw_16
]
psraw
m0
,
5
punpcklbw
m0
,
m0
pshuflw
m0
,
m0
,
q0000
punpcklqdq
m0
,
m0
%endif
mov
cntd
,
4
.
loop
:
mova
[
dstq
+
strideq
*
0
]
,
m0
...
...
@@ -143,7 +182,6 @@ cglobal vp9_ipred_dc_16x16, 4, 4, 3, dst, stride, l, a
jg
.
loop
RET
INIT_XMM
ssse3
cglobal
vp9_ipred_dc_32x32
,
4
,
4
,
5
,
dst
,
stride
,
l
,
a
mova
m0
,
[lq]
mova
m1
,
[
lq
+
16
]
...
...
@@ -161,8 +199,16 @@ cglobal vp9_ipred_dc_32x32, 4, 4, 5, dst, stride, l, a
paddw
m0
,
m2
movhlps
m1
,
m0
paddw
m0
,
m1
%if
cpuflag
(
ssse3
)
pmulhrsw
m0
,
[
pw_512
]
pshufb
m0
,
m4
%else
paddw
m0
,
[
pw_32
]
psraw
m0
,
6
punpcklbw
m0
,
m0
pshuflw
m0
,
m0
,
q0000
punpcklqdq
m0
,
m0
%endif
mov
cntd
,
8
.
loop
:
mova
[
dstq
+
strideq
*
0
+
0
]
,
m0
...
...
@@ -177,6 +223,12 @@ cglobal vp9_ipred_dc_32x32, 4, 4, 5, dst, stride, l, a
dec
cntd
jg
.
loop
RET
%endmacro
INIT_XMM
sse2
DC_16to32_FUNCS
INIT_XMM
ssse3
DC_16to32_FUNCS
%if
HAVE_AVX2_EXTERNAL
INIT_YMM
avx2
...
...
@@ -214,14 +266,20 @@ cglobal vp9_ipred_dc_32x32, 4, 4, 3, dst, stride, l, a
; dc_top/left_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
%macro
DC_1D_FUNCS
2
; dir (top or left), arg (a or l)
INIT_MMX
ssse3
%macro
DC_1D_4to8_FUNCS
2
; dir (top or left), arg (a or l)
cglobal
vp9_ipred_dc_
%1
_4x4
,
4
,
4
,
0
,
dst
,
stride
,
l
,
a
movd
m0
,
[
%2
q
]
pxor
m1
,
m1
psadbw
m0
,
m1
%if
cpuflag
(
ssse3
)
pmulhrsw
m0
,
[
pw_8192
]
pshufb
m0
,
m1
%else
paddw
m0
,
[
pw_2
]
psraw
m0
,
2
punpcklbw
m0
,
m0
pshufw
m0
,
m0
,
q0000
%endif
movd
[
dstq
+
strideq
*
0
]
,
m0
movd
[
dstq
+
strideq
*
1
]
,
m0
lea
dstq
,
[
dstq
+
strideq
*
2
]
...
...
@@ -229,15 +287,21 @@ cglobal vp9_ipred_dc_%1_4x4, 4, 4, 0, dst, stride, l, a
movd
[
dstq
+
strideq
*
1
]
,
m0
RET
INIT_MMX
ssse3
cglobal
vp9_ipred_dc_
%1
_8x8
,
4
,
4
,
0
,
dst
,
stride
,
l
,
a
movq
m0
,
[
%2
q
]
DEFINE_ARGS
dst
,
stride
,
stride3
lea
stride3q
,
[
strideq
*
3
]
pxor
m1
,
m1
psadbw
m0
,
m1
%if
cpuflag
(
ssse3
)
pmulhrsw
m0
,
[
pw_4096
]
pshufb
m0
,
m1
%else
paddw
m0
,
[
pw_4
]
psraw
m0
,
3
punpcklbw
m0
,
m0
pshufw
m0
,
m0
,
q0000
%endif
movq
[
dstq
+
strideq
*
0
]
,
m0
movq
[
dstq
+
strideq
*
1
]
,
m0
movq
[
dstq
+
strideq
*
2
]
,
m0
...
...
@@ -248,8 +312,16 @@ cglobal vp9_ipred_dc_%1_8x8, 4, 4, 0, dst, stride, l, a
movq
[
dstq
+
strideq
*
2
]
,
m0
movq
[
dstq
+
stride3q
]
,
m0
RET
%endmacro
INIT_XMM
ssse3
INIT_MMX
mmxext
DC_1D_4to8_FUNCS
top
,
a
DC_1D_4to8_FUNCS
left
,
l
INIT_MMX
ssse3
DC_1D_4to8_FUNCS
top
,
a
DC_1D_4to8_FUNCS
left
,
l
%macro
DC_1D_16to32_FUNCS
2
; dir (top or left), arg (a or l)
cglobal
vp9_ipred_dc_
%1
_16x16
,
4
,
4
,
3
,
dst
,
stride
,
l
,
a
mova
m0
,
[
%2
q
]
DEFINE_ARGS
dst
,
stride
,
stride3
,
cnt
...
...
@@ -258,8 +330,16 @@ cglobal vp9_ipred_dc_%1_16x16, 4, 4, 3, dst, stride, l, a
psadbw
m0
,
m2
movhlps
m1
,
m0
paddw
m0
,
m1
%if
cpuflag
(
ssse3
)
pmulhrsw
m0
,
[
pw_2048
]
pshufb
m0
,
m2
%else
paddw
m0
,
[
pw_8
]
psraw
m0
,
4
punpcklbw
m0
,
m0
pshuflw
m0
,
m0
,
q0000
punpcklqdq
m0
,
m0
%endif
mov
cntd
,
4
.
loop
:
mova
[
dstq
+
strideq
*
0
]
,
m0
...
...
@@ -271,7 +351,6 @@ cglobal vp9_ipred_dc_%1_16x16, 4, 4, 3, dst, stride, l, a
jg
.
loop
RET
INIT_XMM
ssse3
cglobal
vp9_ipred_dc_
%1
_32x32
,
4
,
4
,
3
,
dst
,
stride
,
l
,
a
mova
m0
,
[
%2
q
]
mova
m1
,
[
%2
q
+
16
]
...
...
@@ -283,8 +362,16 @@ cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
paddw
m0
,
m1
movhlps
m1
,
m0
paddw
m0
,
m1
%if
cpuflag
(
ssse3
)
pmulhrsw
m0
,
[
pw_1024
]
pshufb
m0
,
m2
%else
paddw
m0
,
[
pw_16
]
psraw
m0
,
5
punpcklbw
m0
,
m0
pshuflw
m0
,
m0
,
q0000
punpcklqdq
m0
,
m0
%endif
mov
cntd
,
8
.
loop
:
mova
[
dstq
+
strideq
*
0
+
0
]
,
m0
...
...
@@ -299,9 +386,17 @@ cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
dec
cntd
jg
.
loop
RET
%endmacro
INIT_XMM
sse2
DC_1D_16to32_FUNCS
top
,
a
DC_1D_16to32_FUNCS
left
,
l
INIT_XMM
ssse3
DC_1D_16to32_FUNCS
top
,
a
DC_1D_16to32_FUNCS
left
,
l
%macro
DC_1D_AVX2_FUNCS
2
; dir (top or left), arg (a or l)
%if
HAVE_AVX2_EXTERNAL
INIT_YMM
avx2
cglobal
vp9_ipred_dc_
%1
_32x32
,
4
,
4
,
3
,
dst
,
stride
,
l
,
a
mova
m0
,
[
%2
q
]
DEFINE_ARGS
dst
,
stride
,
stride3
,
cnt
...
...
@@ -332,8 +427,9 @@ cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
%endif
%endmacro
DC_1D_FUNCS
top
,
a
DC_1D_FUNCS
left
,
l
INIT_YMM
avx2
DC_1D_AVX2_FUNCS
top
,
a
DC_1D_AVX2_FUNCS
left
,
l
; v
...
...
@@ -353,7 +449,7 @@ cglobal vp9_ipred_v_8x8, 4, 4, 0, dst, stride, l, a
movq
[
dstq
+
stride3q
]
,
m0
RET
INIT_XMM
sse
2
INIT_XMM
sse
cglobal
vp9_ipred_v_16x16
,
4
,
4
,
1
,
dst
,
stride
,
l
,
a
mova
m0
,
[aq]
DEFINE_ARGS
dst
,
stride
,
stride3
,
cnt
...
...
@@ -369,7 +465,7 @@ cglobal vp9_ipred_v_16x16, 4, 4, 1, dst, stride, l, a
jg
.
loop
RET
INIT_XMM
sse
2
INIT_XMM
sse
cglobal
vp9_ipred_v_32x32
,
4
,
4
,
2
,
dst
,
stride
,
l
,
a
mova
m0
,
[aq]
mova
m1
,
[
aq
+
16
]
...
...
@@ -390,8 +486,7 @@ cglobal vp9_ipred_v_32x32, 4, 4, 2, dst, stride, l, a
jg
.
loop
RET
%if
HAVE_AVX2_EXTERNAL
INIT_YMM
avx2
INIT_YMM
avx
cglobal
vp9_ipred_v_32x32
,
4
,
4
,
1
,
dst
,
stride
,
l
,
a
mova
m0
,
[aq]
DEFINE_ARGS
dst
,
stride
,
stride3
,
cnt
...
...
@@ -411,14 +506,20 @@ cglobal vp9_ipred_v_32x32, 4, 4, 1, dst, stride, l, a
dec
cntd
jg
.
loop
RET
%endif
; h
INIT_XMM
ssse3
%macro
H_XMM_FUNCS
2
%if
notcpuflag
(
avx
)
cglobal
vp9_ipred_h_4x4
,
3
,
4
,
1
,
dst
,
stride
,
l
,
stride3
movd
m0
,
[lq]
%if
cpuflag
(
ssse3
)
pshufb
m0
,
[
pb_4x3_4x2_4x1_4x0
]
%else
punpcklbw
m0
,
m0
pshuflw
m0
,
m0
,
q0123
punpcklwd
m0
,
m0
%endif
lea
stride3q
,
[
strideq
*
3
]
movd
[
dstq
+
strideq
*
0
]
,
m0
psrldq
m0
,
4
...
...
@@ -428,18 +529,26 @@ cglobal vp9_ipred_h_4x4, 3, 4, 1, dst, stride, l, stride3
psrldq
m0
,
4
movd
[
dstq
+
stride3q
]
,
m0
RET
%endif
%macro
H_XMM_FUNCS
1
INIT_XMM
%1
cglobal
vp9_ipred_h_8x8
,
3
,
5
,
4
,
dst
,
stride
,
l
,
stride3
,
cnt
cglobal
vp9_ipred_h_8x8
,
3
,
5
,
%1
,
dst
,
stride
,
l
,
stride3
,
cnt
%if
cpuflag
(
ssse3
)
mova
m2
,
[
pb_8x1_8x0
]
mova
m3
,
[
pb_8x3_8x2
]
%endif
lea
stride3q
,
[
strideq
*
3
]
mov
cntq
,
1
.
loop
:
movd
m0
,
[
lq
+
cntq
*
4
]
%if
cpuflag
(
ssse3
)
pshufb
m1
,
m0
,
m3
pshufb
m0
,
m2
%else
punpcklbw
m0
,
m0
punpcklwd
m0
,
m0
pshufd
m1
,
m0
,
q2233
pshufd
m0
,
m0
,
q0011
%endif
movq
[
dstq
+
strideq
*
0
]
,
m1
movhps
[
dstq
+
strideq
*
1
]
,
m1
movq
[
dstq
+
strideq
*
2
]
,
m0
...
...
@@ -449,22 +558,35 @@ cglobal vp9_ipred_h_8x8, 3, 5, 4, dst, stride, l, stride3, cnt
jge
.
loop
RET
INIT_XMM
%1
cglobal
vp9_ipred_h_16x16
,
3
,
5
,
8
,
dst
,
stride
,
l
,
stride3
,
cnt
cglobal
vp9_ipred_h_16x16
,
3
,
5
,
%2
,
dst
,
stride
,
l
,
stride3
,
cnt
%if
cpuflag
(
ssse3
)
mova
m5
,
[
pb_1
]
mova
m6
,
[
pb_2
]
mova
m7
,
[
pb_3
]
pxor
m4
,
m4
%endif
lea
stride3q
,
[
strideq
*
3
]
mov
cntq
,
3
.
loop
:
movd
m3
,
[
lq
+
cntq
*
4
]
%if
cpuflag
(
ssse3
)
pshufb
m0
,
m3
,
m7
pshufb
m1
,
m3
,
m6
%else
punpcklbw
m3
,
m3
punpcklwd
m3
,
m3
pshufd
m0
,
m3
,
q3333
pshufd
m1
,
m3
,
q2222
%endif
mova
[
dstq
+
strideq
*
0
]
,
m0
mova
[
dstq
+
strideq
*
1
]
,
m1
%if
cpuflag
(
ssse3
)
pshufb
m2
,
m3
,
m5
pshufb
m3
,
m4
%else
pshufd
m2
,
m3
,
q1111
pshufd
m3
,
m3
,
q0000
%endif
mova
[
dstq
+
strideq
*
2
]
,
m2
mova
[
dstq
+
stride3q
]
,
m3
lea
dstq
,
[
dstq
+
strideq
*
4
]
...
...
@@ -472,24 +594,37 @@ cglobal vp9_ipred_h_16x16, 3, 5, 8, dst, stride, l, stride3, cnt
jge
.
loop
RET
INIT_XMM
%1
cglobal
vp9_ipred_h_32x32
,
3
,
5
,
8
,
dst
,
stride
,
l
,
stride3
,
cnt
cglobal
vp9_ipred_h_32x32
,
3
,
5
,
%2
,
dst
,
stride
,
l
,
stride3
,
cnt
%if
cpuflag
(
ssse3
)
mova
m5
,
[
pb_1
]
mova
m6
,
[
pb_2
]
mova
m7
,
[
pb_3
]
pxor
m4
,
m4
%endif
lea
stride3q
,
[
strideq
*
3
]
mov
cntq
,
7
.
loop
:
movd
m3
,
[
lq
+
cntq
*
4
]
%if
cpuflag
(
ssse3
)
pshufb
m0
,
m3
,
m7
pshufb
m1
,
m3
,
m6
%else
punpcklbw
m3
,
m3
punpcklwd
m3
,
m3
pshufd
m0
,
m3
,
q3333
pshufd
m1
,
m3
,
q2222
%endif
mova
[
dstq
+
strideq
*
0
+
0
]
,
m0
mova
[
dstq
+
strideq
*
0
+
16
]
,
m0
mova
[
dstq
+
strideq
*
1
+
0
]
,
m1
mova
[
dstq
+
strideq
*
1
+
16
]
,
m1
%if
cpuflag
(
ssse3
)
pshufb
m2
,
m3
,
m5
pshufb
m3
,
m4
%else
pshufd
m2
,
m3
,
q1111
pshufd
m3
,
m3
,
q0000
%endif
mova
[
dstq
+
strideq
*
2
+
0
]
,
m2
mova
[
dstq
+
strideq
*
2
+
16
]
,
m2
mova
[
dstq
+
stride3q
+
0
]
,
m3
...
...
@@ -500,8 +635,12 @@ cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt
RET
%endmacro
H_XMM_FUNCS
ssse3
H_XMM_FUNCS
avx
INIT_XMM
sse2
H_XMM_FUNCS
2
,
4
INIT_XMM
ssse3
H_XMM_FUNCS
4
,
8
INIT_XMM
avx
H_XMM_FUNCS
4
,
8
%if
HAVE_AVX2_EXTERNAL
INIT_YMM
avx2
...
...
@@ -531,83 +670,124 @@ cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt
; tm
INIT_MMX
ssse3
%macro
TM_MMX_FUNCS
0
cglobal
vp9_ipred_tm_4x4
,
4
,
4
,
0
,
dst
,
stride
,
l
,
a
pxor
m1
,
m1
pinsrw
m2
,
[
aq
-
1
]
,
0
movd
m0
,
[aq]
pinsrw
m2
,
[
aq
-
1
]
,
0
punpcklbw
m0
,
m1
DEFINE_ARGS
dst
,
stride
,
l
,
cnt
%if
cpuflag
(
ssse3
)
mova
m3
,
[
pw_m256
]
mova
m
4
,
[
pw_m255
]
mova
m
1
,
[
pw_m255
]
pshufb
m2
,
m3
punpcklbw
m0
,
m1
%else
punpcklbw
m2
,
m1
pshufw
m2
,
m2
,
q0000
%endif
psubw
m0
,
m2
mov
cntq
,
1
.
loop
:
pinsrw
m2
,
[
lq
+
cntq
*
2
]
,
0
pshufb
m1
,
m2
,
m4
%if
cpuflag
(
ssse3
)
pshufb
m4
,
m2
,
m1
pshufb
m2
,
m3
paddw
m1
,
m0
%else
punpcklbw
m2
,
m1
pshufw
m4
,
m2
,
q1111
pshufw
m2
,
m2
,
q0000
%endif
paddw
m4
,
m0
paddw
m2
,
m0
packuswb
m
1
,
m1
packuswb
m
4
,
m4
packuswb
m2
,
m2
movd
[
dstq
+
strideq
*
0
]
,
m
1
movd
[
dstq
+
strideq
*
0
]
,
m
4
movd
[
dstq
+
strideq
*
1
]
,
m2
lea
dstq
,
[
dstq
+
strideq
*
2
]
dec
cntq
jge
.
loop
RET
%endmacro
INIT_MMX
mmxext
TM_MMX_FUNCS
INIT_MMX
ssse3
TM_MMX_FUNCS
%macro
TM_XMM_FUNCS
1
INIT_XMM
%1
%macro
TM_XMM_FUNCS
0
cglobal
vp9_ipred_tm_8x8
,
4
,
4
,
5
,
dst
,
stride
,
l
,
a
pxor
m1
,
m1
pinsrw
m2
,
[
aq
-
1
]
,
0
movh
m0
,
[aq]
pinsrw
m2
,
[
aq
-
1
]
,
0
punpcklbw
m0
,
m1
DEFINE_ARGS
dst
,
stride
,
l
,
cnt
%if
cpuflag
(
ssse3
)
mova
m3
,
[
pw_m256
]
mova
m
4
,
[
pw_m255
]
mova
m
1
,
[
pw_m255
]
pshufb
m2
,
m3
punpcklbw
m0
,
m1
%else
punpcklbw
m2
,
m1
punpcklwd
m2
,
m2
pshufd
m2
,
m2
,
q0000
%endif
psubw
m0
,
m2
mov
cntq
,
3
.
loop
:
pinsrw
m2
,
[
lq
+
cntq
*
2
]
,
0
pshufb
m1
,
m2
,
m4
%if
cpuflag
(
ssse3
)
pshufb
m4
,
m2
,
m1
pshufb
m2
,
m3
paddw
m1
,
m0
%else
punpcklbw
m2
,
m1
punpcklwd
m2
,
m2
pshufd
m4
,
m2
,
q1111
pshufd
m2
,
m2
,
q0000
%endif
paddw
m4
,
m0
paddw
m2
,
m0
packuswb
m
1
,
m2
movh
[
dstq
+
strideq
*
0
]
,
m
1
movhps
[
dstq
+
strideq
*
1
]
,
m
1
packuswb
m
4
,
m2
movh
[
dstq
+
strideq
*
0
]
,
m
4
movhps
[
dstq
+
strideq
*
1
]
,
m
4
lea
dstq
,
[
dstq
+
strideq
*
2
]
dec
cntq
jge
.
loop
RET
INIT_XMM
%1
cglobal
vp9_ipred_tm_16x16
,
4
,
4
,
8
,
dst
,
stride
,
l
,
a
pxor
m3
,
m3
pinsrw
m2
,
[
aq
-
1
]
,
0
mova
m0
,
[aq]
pinsrw
m2
,
[
aq
-
1
]
,
0
punpckhbw
m1
,
m0
,
m3
punpcklbw
m0
,
m3
DEFINE_ARGS
dst
,
stride
,
l
,
cnt
%if
cpuflag
(
ssse3
)
mova
m4
,
[
pw_m256
]
mova
m
5
,
[
pw_m255
]
mova
m
3
,
[
pw_m255
]
pshufb
m2
,
m4
punpckhbw
m1
,
m0
,
m3
punpcklbw
m0
,
m3
%else
punpcklbw
m2
,
m3
punpcklwd
m2
,
m2
pshufd
m2
,
m2
,
q0000
%endif
psubw
m1
,
m2
psubw
m0
,
m2
mov
cntq
,
7
.
loop
:
pinsrw
m7
,
[
lq
+
cntq
*
2
]
,
0
pshufb
m3
,
m7
,
m5
%if
cpuflag
(
ssse3
)
pshufb
m5
,
m7
,
m3
pshufb
m7
,
m4
paddw
m2
,
m3
,
m0
paddw
m3
,
m1
%else
punpcklbw
m7
,
m3
punpcklwd
m7
,
m7
pshufd
m5
,
m7
,
q1111
pshufd
m7
,
m7
,
q0000
%endif
paddw
m2
,
m5
,
m0
paddw
m5
,
m1
paddw
m6
,
m7
,
m0
paddw
m7
,
m1
packuswb
m2
,
m
3
packuswb
m2
,
m
5
packuswb
m6
,
m7
mova
[
dstq
+
strideq
*
0
]
,
m2
mova
[
dstq
+
strideq
*
1
]
,
m6
...
...
@@ -617,16 +797,32 @@ cglobal vp9_ipred_tm_16x16, 4, 4, 8, dst, stride, l, a
RET
%if
ARCH_X86_64
INIT_XMM
%1
cglobal
vp9_ipred_tm_32x32
,
4
,
4
,
14
,
dst
,
stride
,
l
,
a
%define
mem
0
%else
%define
mem
64
%endif
cglobal
vp9_ipred_tm_32x32
,
4
,
4
,
14
,
mem
,
dst
,
stride
,
l
,
a
pxor
m5
,
m5
pinsrw
m4
,
[
aq
-
1
]
,
0
mova
m0
,
[aq]
mova
m2
,
[
aq
+
16
]
DEFINE_ARGS
dst
,
stride
,
l
,
cnt
mova
m8
,
[
pw_m256
]
mova
m9
,
[
pw_m255
]
pshufb
m4
,
m8
%if
cpuflag
(
ssse3
)
%if
ARCH_X86_64
mova
m12
,
[
pw_m256
]
mova
m13
,
[
pw_m255
]
%define
pw_m256_reg
m12
%define
pw_m255_reg
m13
%else
%define
pw_m256_reg
[
pw_m256
]
%define
pw_m255_reg
[
pw_m255
]
%endif
pshufb
m4
,
pw_m256_reg
%else
punpcklbw
m4
,
m5
punpcklwd
m4
,
m4
pshufd
m4
,
m4
,
q0000
%endif
punpckhbw
m1
,
m0
,
m5
punpckhbw
m3
,
m2
,
m5
punpcklbw
m0
,
m5
...
...
@@ -635,36 +831,72 @@ cglobal vp9_ipred_tm_32x32, 4, 4, 14, dst, stride, l, a
psubw
m0
,
m4
psubw
m3
,
m4
psubw
m2
,
m4
%if
ARCH_X86_64
SWAP
0
,
8
SWAP
1
,
9
SWAP
2
,
10
SWAP
3
,
11
%else
mova
[
rsp
+
0
*
16
]
,
m0
mova
[
rsp
+
1
*
16
]
,
m1
mova
[
rsp
+
2
*
16
]
,
m2
mova
[
rsp
+
3
*
16
]
,
m3
%endif
mov
cntq
,
15
.
loop
:
pinsrw
m13
,
[
lq
+
cntq
*
2
]
,
0
pshufb
m7
,
m13
,
m9
pshufb
m13
,
m8
paddw
m4
,
m7
,
m0
paddw
m5
,
m7
,
m1
paddw
m6
,
m7
,
m2
paddw
m7
,
m3
paddw
m10
,
m13
,
m0
paddw
m11
,
m13
,
m1
paddw
m12
,
m13
,
m2
paddw
m13
,
m3
pinsrw
m3
,
[
lq
+
cntq
*
2
]
,
0
%if
cpuflag
(
ssse3
)
pshufb
m7
,
m3
,
pw_m255_reg
pshufb
m3
,
pw_m256_reg
%else
pxor
m7
,
m7
punpcklbw
m3
,
m7
punpcklwd
m3
,
m3
pshufd
m7
,
m3
,
q1111
pshufd
m3
,
m3
,
q0000
%endif
%if
ARCH_X86_64
paddw
m4
,
m7
,
m8
paddw
m5
,
m7
,
m9
paddw
m6
,
m7
,
m10
paddw
m7
,
m11
paddw
m0
,
m3
,
m8
paddw
m1
,
m3
,
m9
paddw
m2
,
m3
,
m10
paddw
m3
,
m11
%else
paddw
m4
,
m7
,
[
rsp
+
0
*
16
]
paddw
m5
,
m7
,
[
rsp
+
1
*
16
]
paddw
m6
,
m7
,
[
rsp
+
2
*
16
]
paddw
m7
,
[
rsp
+
3
*
16
]
paddw
m0
,
m3
,
[
rsp
+
0
*
16
]
paddw
m1
,
m3
,
[
rsp
+
1
*
16
]
paddw
m2
,
m3
,
[
rsp
+
2
*
16
]
paddw
m3
,
[
rsp
+
3
*
16
]
%endif
packuswb
m4
,
m5
packuswb
m6
,
m7
packuswb
m10
,
m1
1
packuswb
m12
,
m1
3
packuswb
m0
,
m
1
packuswb
m2
,
m
3
mova
[
dstq
+
strideq
*
0
+
0
]
,
m4
mova
[
dstq
+
strideq
*
0
+
16
]
,
m6
mova
[
dstq
+
strideq
*
1
+
0
]
,
m
1
0
mova
[
dstq
+
strideq
*
1
+
16
]
,
m
1
2
mova
[
dstq
+
strideq
*
1
+
0
]
,
m0
mova
[
dstq
+
strideq
*
1
+
16
]
,
m2
lea
dstq
,
[
dstq
+
strideq
*
2
]
dec
cntq
jge
.
loop
RET
%endif
%undef
pw_m256_reg
%undef
pw_m255_reg
%undef
mem
%endmacro
TM_XMM_FUNCS
ssse3
TM_XMM_FUNCS
avx
INIT_XMM
sse2
TM_XMM_FUNCS
INIT_XMM
ssse3
TM_XMM_FUNCS
INIT_XMM
avx
TM_XMM_FUNCS
%if
HAVE_AVX2_EXTERNAL
INIT_YMM
avx2
...
...
@@ -711,11 +943,20 @@ cglobal vp9_ipred_tm_32x32, 4, 4, 8, dst, stride, l, a
pavgb
m%1
,
m%2
%endmacro
INIT_MMX
ssse3
%macro
DL_MMX_FUNCS
0
cglobal
vp9_ipred_dl_4x4
,
4
,
4
,
0
,
dst
,
stride
,
l
,
a
movq
m1
,
[aq]
%if
cpuflag
(
ssse3
)
pshufb
m0
,
m1
,
[
pb_0to5_2x7
]
pshufb
m2
,
m1
,
[
pb_2to6_3x7
]
%else
punpckhbw
m3
,
m1
,
m1
; 44556677
pand
m0
,
m1
,
[
pb_6xm1_2x0
]
; 012345__
pand
m3
,
[
pb_6x0_2xm1
]
; ______77
psrlq
m2
,
m1
,
16
; 234567__
por
m0
,
m3
; 01234577
por
m2
,
m3
; 23456777
%endif
psrlq
m1
,
8
LOWPASS
0
,
1
,
2
,
3
...
...
@@ -728,15 +969,29 @@ cglobal vp9_ipred_dl_4x4, 4, 4, 0, dst, stride, l, a
movd
[
dstq
+
strideq
*
0
]
,
m0
movd
[
dstq
+
strideq
*
2
]
,
m1
RET
%endmacro
INIT_MMX
mmxext
DL_MMX_FUNCS
INIT_MMX
ssse3
DL_MMX_FUNCS
%macro
DL_XMM_FUNCS
1
INIT_XMM
%1
%macro
DL_XMM_FUNCS
0
cglobal
vp9_ipred_dl_8x8
,
4
,
4
,
4
,
dst
,
stride
,
stride5
,
a
movq
m0
,
[aq]
lea
stride5q
,
[
strideq
*
5
]
%if
cpuflag
(
ssse3
)
pshufb
m1
,
m0
,
[
pb_1to6_10x7
]
%else
punpcklbw
m1
,
m0
,
m0
; 0011223344556677
punpckhwd
m1
,
m1
; 4x4,4x5,4x6,4x7
%endif
shufps
m0
,
m1
,
q3310
%if
notcpuflag
(
ssse3
)
psrldq
m1
,
m0
,
1
shufps
m1
,
m0
,
q3210
%endif
psrldq
m2
,
m1
,
1
shufps
m0
,
m1
,
q3210
LOWPASS
0
,
1
,
2
,
3
pshufd
m1
,
m0
,
q3321
...
...
@@ -757,46 +1012,72 @@ cglobal vp9_ipred_dl_8x8, 4, 4, 4, dst, stride, stride5, a
movq
[
dstq
+
stride5q
]
,
m1
RET
INIT_XMM
%1
cglobal
vp9_ipred_dl_16x16
,
4
,
4
,
6
,
dst
,
stride
,
l
,
a
mova
m5
,
[
pb_1toE_2xF
]
mova
m0
,
[aq]
%if
cpuflag
(
ssse3
)
mova
m5
,
[
pb_1toE_2xF
]
pshufb
m1
,
m0
,
m5
pshufb
m2
,
m1
,
m5
pshufb
m4
,
m0
,
[
pb_15
]
%else
pand
m5
,
m0
,
[
pb_15x0_1xm1
]
; _______________F
psrldq
m1
,
m0
,
1
; 123456789ABCDEF_
por
m1
,
m5
; 123456789ABCDEFF
psrldq
m2
,
m1
,
1
; 23456789ABCDEFF_
por
m2
,
m5
; 23456789ABCDEFFF
pshufhw
m4
,
m1
,
q3333
; xxxxxxxxFFFFFFFF
%endif
LOWPASS
0
,
1
,
2
,
3
DEFINE_ARGS
dst
,
stride
,
cnt
,
stride9
lea
stride9q
,
[
strideq
*
3
]
lea
stride9q
,
[
strideq
+
strideq
*
8
]
mov
cntd
,
4
lea
stride9q
,
[
stride9q
*
3
]
.
loop
:
movhlps
m4
,
m0
mova
[
dstq
+
strideq
*
0
]
,
m0
%if
cpuflag
(
ssse3
)
pshufb
m0
,
m5
%else
psrldq
m0
,
1
por
m0
,
m5
%endif
mova
[
dstq
+
strideq
*
8
]
,
m4
movhlps
m4
,
m0
mova
[
dstq
+
strideq
*
1
]
,
m0
%if
cpuflag
(
ssse3
)
pshufb
m0
,
m5
%else
psrldq
m0
,
1
por
m0
,
m5
%endif
mova
[
dstq
+
stride9q
]
,
m4
lea
dstq
,
[
dstq
+
strideq
*
2
]
dec
cntd
jg
.
loop
RET
INIT_XMM
%1
cglobal
vp9_ipred_dl_32x32
,
4
,
5
,
8
,
dst
,
stride
,
cnt
,
a
,
dst16
mova
m5
,
[
pb_1toE_2xF
]
mova
m0
,
[aq]
mova
m1
,
[
aq
+
16
]
palignr
m2
,
m1
,
m0
,
1
palignr
m3
,
m1
,
m0
,
2
PALIGNR
m2
,
m1
,
m0
,
1
,
m4
PALIGNR
m3
,
m1
,
m0
,
2
,
m4
LOWPASS
0
,
2
,
3
,
4
%if
cpuflag
(
ssse3
)
mova
m5
,
[
pb_1toE_2xF
]
pshufb
m2
,
m1
,
m5
pshufb
m3
,
m2
,
m5
pshufb
m6
,
m1
,
[
pb_15
]
LOWPASS
1
,
2
,
3
,
4
mova
m7
,
m6
%else
pand
m5
,
m1
,
[
pb_15x0_1xm1
]
; _______________F
psrldq
m2
,
m1
,
1
; 123456789ABCDEF_
por
m2
,
m5
; 123456789ABCDEFF
psrldq
m3
,
m2
,
1
; 23456789ABCDEFF_
por
m3
,
m5
; 23456789ABCDEFFF
pshufhw
m7
,
m2
,
q3333
; xxxxxxxxFFFFFFFF
pshufd
m6
,
m7
,
q3333
%endif
LOWPASS
1
,
2
,
3
,
4
lea
dst16q
,
[
dstq
+
strideq
*
8
]
mov
cntd
,
8
lea
dst16q
,
[
dst16q
+
strideq
*
8
]
...
...
@@ -814,10 +1095,17 @@ cglobal vp9_ipred_dl_32x32, 4, 5, 8, dst, stride, cnt, a, dst16
%if
cpuflag
(
avx
)
vpalignr
m0
,
m1
,
m0
,
1
pshufb
m1
,
m5
%el
se
%el
if
cpuflag
(
ssse3
)
palignr
m2
,
m1
,
m0
,
1
pshufb
m1
,
m5
mova
m0
,
m2
%else
mova
m4
,
m1
psrldq
m0
,
1
pslldq
m4
,
15
psrldq
m1
,
1
por
m0
,
m4
por
m1
,
m5
%endif
add
dstq
,
strideq
add
dst16q
,
strideq
...
...
@@ -826,19 +1114,23 @@ cglobal vp9_ipred_dl_32x32, 4, 5, 8, dst, stride, cnt, a, dst16
RET
%endmacro
DL_XMM_FUNCS
ssse3
DL_XMM_FUNCS
avx
INIT_XMM
sse2
DL_XMM_FUNCS
INIT_XMM
ssse3
DL_XMM_FUNCS
INIT_XMM
avx
DL_XMM_FUNCS
; dr
INIT_MMX
ssse3
%macro
DR_MMX_FUNCS
0
cglobal
vp9_ipred_dr_4x4
,
4
,
4
,
0
,
dst
,
stride
,
l
,
a
movd
m0
,
[lq]
punpckldq
m0
,
[
aq
-
1
]
movd
m1
,
[
aq
+
3
]
DEFINE_ARGS
dst
,
stride
,
stride3
lea
stride3q
,
[
strideq
*
3
]
palignr
m1
,
m0
,
1
PALIGNR
m1
,
m0
,
1
,
m3
psrlq
m2
,
m1
,
8
LOWPASS
0
,
1
,
2
,
3
...
...
@@ -850,9 +1142,14 @@ cglobal vp9_ipred_dr_4x4, 4, 4, 0, dst, stride, l, a
psrlq
m0
,
8
movd
[
dstq
+
strideq
*
0
]
,
m0
RET
%endmacro
INIT_MMX
mmxext
DR_MMX_FUNCS
INIT_MMX
ssse3
DR_MMX_FUNCS
%macro
DR_XMM_FUNCS
1
INIT_XMM
%1
%macro
DR_XMM_FUNCS
0
cglobal
vp9_ipred_dr_8x8
,
4
,
4
,
4
,
dst
,
stride
,
l
,
a
movq
m1
,
[lq]
movhps
m1
,
[
aq
-
1
]
...
...
@@ -860,7 +1157,7 @@ cglobal vp9_ipred_dr_8x8, 4, 4, 4, dst, stride, l, a
DEFINE_ARGS
dst
,
stride
,
stride3
lea
stride3q
,
[
strideq
*
3
]
pslldq
m0
,
m1
,
1
palignr
m2
,
m1
,
1
PALIGNR
m2
,
m1
,
1
,
m3
LOWPASS
0
,
1
,
2
,
3
movhps
[
dstq
+
strideq
*
0
]
,
m0
...
...
@@ -881,7 +1178,6 @@ cglobal vp9_ipred_dr_8x8, 4, 4, 4, dst, stride, l, a
movhps
[
dstq
+
stride3q
]
,
m0
RET
INIT_XMM
%1
cglobal
vp9_ipred_dr_16x16
,
4
,
4
,
6
,
dst
,
stride
,
l
,
a
mova
m1
,
[lq]
movu
m2
,
[
aq
-
1
]
...
...
@@ -890,30 +1186,29 @@ cglobal vp9_ipred_dr_16x16, 4, 4, 6, dst, stride, l, a
lea
stride9q
,
[
strideq
*
3
]
mov
cntd
,
4
lea
stride9q
,
[
stride9q
*
3
]
palignr
m4
,
m2
,
1
palignr
m3
,
m2
,
m1
,
1
5
PALIGNR
m4
,
m2
,
1
,
m5
PALIGNR
m3
,
m2
,
m1
,
15
,
m
5
LOWPASS
3
,
2
,
4
,
5
pslldq
m0
,
m1
,
1
palignr
m2
,
m1
,
1
PALIGNR
m2
,
m1
,
1
,
m4
LOWPASS
0
,
1
,
2
,
4
.
loop
:
mova
[
dstq
+
strideq
*
0
]
,
m3
movhps
[
dstq
+
strideq
*
8
+
0
]
,
m0
movq
[
dstq
+
strideq
*
8
+
8
]
,
m3
palignr
m3
,
m0
,
15
PALIGNR
m3
,
m0
,
15
,
m1
pslldq
m0
,
1
mova
[
dstq
+
strideq
*
1
]
,
m3
movhps
[
dstq
+
stride9q
+
0
]
,
m0
movq
[
dstq
+
stride9q
+
8
]
,
m3
palignr
m3
,
m0
,
15
PALIGNR
m3
,
m0
,
15
,
m1
pslldq
m0
,
1
lea
dstq
,
[
dstq
+
strideq
*
2
]
dec
cntd
jg
.
loop
RET
INIT_XMM
%1
cglobal
vp9_ipred_dr_32x32
,
4
,
4
,
8
,
dst
,
stride
,
l
,
a
mova
m1
,
[lq]
mova
m2
,
[
lq
+
16
]
...
...
@@ -922,16 +1217,16 @@ cglobal vp9_ipred_dr_32x32, 4, 4, 8, dst, stride, l, a
movd
m5
,
[
aq
+
31
]
DEFINE_ARGS
dst
,
stride
,
stride8
,
cnt
lea
stride8q
,
[
strideq
*
8
]
palignr
m5
,
m4
,
1
palignr
m6
,
m4
,
m3
,
15
PALIGNR
m5
,
m4
,
1
,
m7
PALIGNR
m6
,
m4
,
m3
,
15
,
m7
LOWPASS
5
,
4
,
6
,
7
palignr
m4
,
m3
,
1
palignr
m6
,
m3
,
m2
,
15
PALIGNR
m4
,
m3
,
1
,
m7
PALIGNR
m6
,
m3
,
m2
,
15
,
m7
LOWPASS
4
,
3
,
6
,
7
palignr
m3
,
m2
,
1
palignr
m6
,
m2
,
m1
,
15
PALIGNR
m3
,
m2
,
1
,
m7
PALIGNR
m6
,
m2
,
m1
,
15
,
m7
LOWPASS
3
,
2
,
6
,
7
palignr
m2
,
m1
,
1
PALIGNR
m2
,
m1
,
1
,
m6
pslldq
m0
,
m1
,
1
LOWPASS
2
,
1
,
0
,
6
mov
cntd
,
16
...
...
@@ -942,9 +1237,9 @@ cglobal vp9_ipred_dr_32x32, 4, 4, 8, dst, stride, l, a
mova
[
dstq
+
stride8q
*
0
+
16
]
,
m5
mova
[
dstq
+
stride8q
*
2
+
0
]
,
m3
mova
[
dstq
+
stride8q
*
2
+
16
]
,
m4
palignr
m5
,
m4
,
15
palignr
m4
,
m3
,
15
palignr
m3
,
m2
,
15
PALIGNR
m5
,
m4
,
15
,
m6
PALIGNR
m4
,
m3
,
15
,
m6
PALIGNR
m3
,
m2
,
15
,
m6
pslldq
m2
,
1
add
dstq
,
strideq
dec
cntd
...
...
@@ -952,12 +1247,16 @@ cglobal vp9_ipred_dr_32x32, 4, 4, 8, dst, stride, l, a
RET
%endmacro
DR_XMM_FUNCS
ssse3
DR_XMM_FUNCS
avx
INIT_XMM
sse2
DR_XMM_FUNCS
INIT_XMM
ssse3
DR_XMM_FUNCS
INIT_XMM
avx
DR_XMM_FUNCS
; vl
INIT_MMX
ssse3
INIT_MMX
mmxext
cglobal
vp9_ipred_vl_4x4
,
4
,
4
,
0
,
dst
,
stride
,
l
,
a
movq
m0
,
[aq]
psrlq
m1
,
m0
,
8
...
...
@@ -973,11 +1272,16 @@ cglobal vp9_ipred_vl_4x4, 4, 4, 0, dst, stride, l, a
movd
[
dstq
+
strideq
*
1
]
,
m2
RET
%macro
VL_XMM_FUNCS
1
INIT_XMM
%1
%macro
VL_XMM_FUNCS
0
cglobal
vp9_ipred_vl_8x8
,
4
,
4
,
4
,
dst
,
stride
,
l
,
a
movq
m0
,
[aq]
%if
cpuflag
(
ssse3
)
pshufb
m0
,
[
pb_0to6_9x7
]
%else
punpcklbw
m1
,
m0
,
m0
punpckhwd
m1
,
m1
shufps
m0
,
m1
,
q3310
%endif
DEFINE_ARGS
dst
,
stride
,
stride3
lea
stride3q
,
[
strideq
*
3
]
psrldq
m1
,
m0
,
1
...
...
@@ -1002,48 +1306,82 @@ cglobal vp9_ipred_vl_8x8, 4, 4, 4, dst, stride, l, a
movq
[
dstq
+
stride3q
]
,
m2
RET
INIT_XMM
%1
cglobal
vp9_ipred_vl_16x16
,
4
,
4
,
5
,
dst
,
stride
,
l
,
a
mova
m0
,
[aq]
mova
m4
,
[
pb_1toE_2xF
]
DEFINE_ARGS
dst
,
stride
,
stride3
,
cnt
lea
stride3q
,
[
strideq
*
3
]
%if
cpuflag
(
ssse3
)
mova
m4
,
[
pb_1toE_2xF
]
pshufb
m1
,
m0
,
m4
pshufb
m2
,
m1
,
m4
%else
pand
m4
,
m0
,
[
pb_15x0_1xm1
]
; _______________F
psrldq
m1
,
m0
,
1
; 123456789ABCDEF_
por
m1
,
m4
; 123456789ABCDEFF
psrldq
m2
,
m1
,
1
; 23456789ABCDEFF_
por
m2
,
m4
; 23456789ABCDEFFF
%endif
LOWPASS
2
,
1
,
0
,
3
pavgb
m1
,
m0
mov
cntd
,
4
.
loop
:
mova
[
dstq
+
strideq
*
0
]
,
m1
mova
[
dstq
+
strideq
*
1
]
,
m2
%if
cpuflag
(
ssse3
)
pshufb
m1
,
m4
pshufb
m2
,
m4
%else
psrldq
m1
,
1
psrldq
m2
,
1
por
m1
,
m4
por
m2
,
m4
%endif
mova
[
dstq
+
strideq
*
2
]
,
m1
mova
[
dstq
+
stride3q
]
,
m2
%if
cpuflag
(
ssse3
)
pshufb
m1
,
m4
pshufb
m2
,
m4
%else
psrldq
m1
,
1
psrldq
m2
,
1
por
m1
,
m4
por
m2
,
m4
%endif
lea
dstq
,
[
dstq
+
strideq
*
4
]
dec
cntd
jg
.
loop
RET
INIT_XMM
%1
cglobal
vp9_ipred_vl_32x32
,
4
,
4
,
7
,
dst
,
stride
,
l
,
a
mova
m0
,
[aq]
mova
m5
,
[
aq
+
16
]
mova
m4
,
[
pb_1toE_2xF
]
DEFINE_ARGS
dst
,
stride
,
dst16
,
cnt
palignr
m2
,
m5
,
m0
,
1
palignr
m3
,
m5
,
m0
,
2
PALIGNR
m2
,
m5
,
m0
,
1
,
m4
PALIGNR
m3
,
m5
,
m0
,
2
,
m4
lea
dst16q
,
[
dstq
+
strideq
*
8
]
LOWPASS
3
,
2
,
0
,
6
pavgb
m2
,
m0
%if
cpuflag
(
ssse3
)
mova
m4
,
[
pb_1toE_2xF
]
pshufb
m0
,
m5
,
m4
pshufb
m1
,
m0
,
m4
%else
pand
m4
,
m5
,
[
pb_15x0_1xm1
]
; _______________F
psrldq
m0
,
m5
,
1
; 123456789ABCDEF_
por
m0
,
m4
; 123456789ABCDEFF
psrldq
m1
,
m0
,
1
; 23456789ABCDEFF_
por
m1
,
m4
; 23456789ABCDEFFF
%endif
lea
dst16q
,
[
dst16q
+
strideq
*
8
]
LOWPASS
1
,
0
,
5
,
6
pavgb
m0
,
m5
%if
cpuflag
(
ssse3
)
pshufb
m5
,
[
pb_15
]
%else
punpckhbw
m5
,
m4
,
m4
pshufhw
m5
,
m5
,
q3333
punpckhqdq
m5
,
m5
%endif
mov
cntd
,
8
.
loop
:
...
...
@@ -1056,10 +1394,16 @@ cglobal vp9_ipred_vl_32x32, 4, 4, 7, dst, stride, l, a
%if
cpuflag
(
avx
)
palignr
%2
,
%3
,
%2
,
1
pshufb
%3
,
m4
%el
se
%el
if
cpuflag
(
ssse3
)
palignr
m6
,
%3
,
%2
,
1
pshufb
%3
,
m4
mova
%2
,
m6
%else
pslldq
m6
,
%3
,
15
psrldq
%3
,
1
psrldq
%2
,
1
por
%3
,
m4
por
%2
,
m6
%endif
%endmacro
...
...
@@ -1072,12 +1416,16 @@ cglobal vp9_ipred_vl_32x32, 4, 4, 7, dst, stride, l, a
RET
%endmacro
VL_XMM_FUNCS
ssse3
VL_XMM_FUNCS
avx
INIT_XMM
sse2
VL_XMM_FUNCS
INIT_XMM
ssse3
VL_XMM_FUNCS
INIT_XMM
avx
VL_XMM_FUNCS
; vr
INIT_MMX
ssse3
%macro
VR_MMX_FUNCS
0
cglobal
vp9_ipred_vr_4x4
,
4
,
4
,
0
,
dst
,
stride
,
l
,
a
movq
m1
,
[
aq
-
1
]
punpckldq
m2
,
[lq]
...
...
@@ -1085,7 +1433,7 @@ cglobal vp9_ipred_vr_4x4, 4, 4, 0, dst, stride, l, a
DEFINE_ARGS
dst
,
stride
,
stride3
lea
stride3q
,
[
strideq
*
3
]
pavgb
m0
,
m1
palignr
m1
,
m2
,
5
PALIGNR
m1
,
m2
,
5
,
m3
psrlq
m2
,
m1
,
8
psllq
m3
,
m1
,
8
LOWPASS
2
,
1
,
3
,
4
...
...
@@ -1095,6 +1443,7 @@ cglobal vp9_ipred_vr_4x4, 4, 4, 0, dst, stride, l, a
; IABC | m0 contains ABCDxxxx
; JEFG | m2 contains xJIEFGHx
%if
cpuflag
(
ssse3
)
punpckldq
m0
,
m2
pshufb
m2
,
[
pb_13456_3xm1
]
movd
[
dstq
+
strideq
*
0
]
,
m0
...
...
@@ -1103,10 +1452,26 @@ cglobal vp9_ipred_vr_4x4, 4, 4, 0, dst, stride, l, a
psrlq
m2
,
8
movd
[
dstq
+
strideq
*
2
]
,
m0
movd
[
dstq
+
strideq
*
1
]
,
m2
%else
psllq
m1
,
m2
,
40
psrlq
m2
,
24
movd
[
dstq
+
strideq
*
0
]
,
m0
movd
[
dstq
+
strideq
*
1
]
,
m2
PALIGNR
m0
,
m1
,
7
,
m3
psllq
m1
,
8
PALIGNR
m2
,
m1
,
7
,
m3
movd
[
dstq
+
strideq
*
2
]
,
m0
movd
[
dstq
+
stride3q
]
,
m2
%endif
RET
%endmacro
INIT_MMX
mmxext
VR_MMX_FUNCS
INIT_MMX
ssse3
VR_MMX_FUNCS
%macro
VR_XMM_FUNCS
1
INIT_XMM
%1
%macro
VR_XMM_FUNCS
1
; n_xmm_regs for 16x16
cglobal
vp9_ipred_vr_8x8
,
4
,
4
,
5
,
dst
,
stride
,
l
,
a
movu
m1
,
[
aq
-
1
]
movhps
m2
,
[lq]
...
...
@@ -1114,7 +1479,7 @@ cglobal vp9_ipred_vr_8x8, 4, 4, 5, dst, stride, l, a
DEFINE_ARGS
dst
,
stride
,
stride3
lea
stride3q
,
[
strideq
*
3
]
pavgb
m0
,
m1
palignr
m1
,
m2
,
9
PALIGNR
m1
,
m2
,
9
,
m3
pslldq
m2
,
m1
,
1
pslldq
m3
,
m1
,
2
LOWPASS
1
,
2
,
3
,
4
...
...
@@ -1128,83 +1493,118 @@ cglobal vp9_ipred_vr_8x8, 4, 4, 5, dst, stride, l, a
; USQABCDE
; VTRIJKLM
%if
cpuflag
(
ssse3
)
punpcklqdq
m0
,
m1
; ABCDEFGHxxVUTSRQ
%endif
movq
[
dstq
+
strideq
*
0
]
,
m0
pshufb
m0
,
[
pb_6xm1_BDF_0to6
]
; xxxxxxUSQABCDEFG
movhps
[
dstq
+
strideq
*
1
]
,
m1
pshufb
m1
,
[
pb_6xm1_246_8toE
]
; xxxxxxVTRIJKLMNO
%if
cpuflag
(
ssse3
)
pshufb
m0
,
[
pb_6xm1_BDF_0to6
]
; xxxxxxUSQABCDEFG
pshufb
m1
,
[
pb_6xm1_246_8toE
]
; xxxxxxVTRIJKLMNO
%else
psrlw
m2
,
m1
,
8
; x_U_S_Q_xxxxxxxx
pand
m3
,
m1
,
[
pw_255
]
; x_V_T_R_xxxxxxxx
packuswb
m3
,
m2
; xVTRxxxxxUSQxxxx
pslldq
m3
,
4
; xxxxxVTRxxxxxUSQ
PALIGNR
m0
,
m3
,
7
,
m4
; xxxxxxUSQABCDEFG
psrldq
m1
,
8
pslldq
m3
,
8
PALIGNR
m1
,
m3
,
7
,
m4
; xxxxxxVTRIJKLMNO
%endif
movhps
[
dstq
+
strideq
*
2
]
,
m0
pslldq
m0
,
1
movhps
[
dstq
+
stride3q
]
,
m1
lea
dstq
,
[
dstq
+
strideq
*
4
]
pslldq
m0
,
1
pslldq
m1
,
1
movhps
[
dstq
+
strideq
*
0
]
,
m0
pslldq
m0
,
1
movhps
[
dstq
+
strideq
*
1
]
,
m1
pslldq
m0
,
1
pslldq
m1
,
1
movhps
[
dstq
+
strideq
*
2
]
,
m0
movhps
[
dstq
+
stride3q
]
,
m1
RET
INIT_XMM
%1
cglobal
vp9_ipred_vr_16x16
,
4
,
4
,
6
,
dst
,
stride
,
l
,
a
cglobal
vp9_ipred_vr_16x16
,
4
,
4
,
%1
,
dst
,
stride
,
l
,
a
mova
m0
,
[aq]
movu
m1
,
[
aq
-
1
]
mova
m2
,
[lq]
DEFINE_ARGS
dst
,
stride
,
stride3
,
cnt
lea
stride3q
,
[
strideq
*
3
]
palignr
m3
,
m1
,
m2
,
15
PALIGNR
m3
,
m1
,
m2
,
15
,
m6
LOWPASS
3
,
1
,
0
,
4
pavgb
m0
,
m1
palignr
m1
,
m2
,
1
PALIGNR
m1
,
m2
,
1
,
m6
pslldq
m4
,
m2
,
1
LOWPASS
1
,
2
,
4
,
5
%if
cpuflag
(
ssse3
)
pshufb
m1
,
[
pb_02468ACE_13579BDF
]
%else
psrlw
m5
,
m1
,
8
pand
m1
,
[
pw_255
]
packuswb
m1
,
m5
%endif
mov
cntd
,
4
.
loop
:
movlhps
m2
,
m1
mova
[
dstq
+
strideq
*
0
]
,
m0
mova
[
dstq
+
strideq
*
1
]
,
m3
palignr
m4
,
m0
,
m1
,
15
palignr
m5
,
m3
,
m2
,
15
PALIGNR
m4
,
m0
,
m1
,
15
,
m6
PALIGNR
m5
,
m3
,
m2
,
15
,
m6
mova
[
dstq
+
strideq
*
2
]
,
m4
mova
[
dstq
+
stride3q
]
,
m5
lea
dstq
,
[
dstq
+
strideq
*
4
]
palignr
m0
,
m1
,
14
palignr
m3
,
m2
,
14
PALIGNR
m0
,
m1
,
14
,
m6
PALIGNR
m3
,
m2
,
14
,
m6
pslldq
m1
,
2
dec
cntd
jg
.
loop
RET
%if
ARCH_X86_64
INIT_XMM
%1
cglobal
vp9_ipred_vr_32x32
,
4
,
4
,
9
,
dst
,
stride
,
l
,
a
mova
m0
,
[aq]
mova
m2
,
[
aq
+
16
]
movu
m1
,
[
aq
-
1
]
palignr
m3
,
m2
,
m0
,
15
palignr
m4
,
m2
,
m0
,
14
PALIGNR
m3
,
m2
,
m0
,
15
,
m6
PALIGNR
m4
,
m2
,
m0
,
14
,
m6
LOWPASS
4
,
3
,
2
,
5
pavgb
m3
,
m2
mova
m2
,
[
lq
+
16
]
palignr
m5
,
m1
,
m2
,
15
PALIGNR
m5
,
m1
,
m2
,
15
,
m6
LOWPASS
5
,
1
,
0
,
6
pavgb
m0
,
m1
mova
m6
,
[lq]
palignr
m1
,
m2
,
1
palignr
m7
,
m2
,
m6
,
15
LOWPASS
1
,
2
,
7
,
8
palignr
m2
,
m6
,
1
%if
ARCH_X86_64
SWAP
0
,
8
%else
mova
[dstq],
m0
%endif
PALIGNR
m1
,
m2
,
1
,
m0
PALIGNR
m7
,
m2
,
m6
,
15
,
m0
LOWPASS
1
,
2
,
7
,
0
PALIGNR
m2
,
m6
,
1
,
m0
pslldq
m7
,
m6
,
1
LOWPASS
2
,
6
,
7
,
8
LOWPASS
2
,
6
,
7
,
0
%if
cpuflag
(
ssse3
)
pshufb
m1
,
[
pb_02468ACE_13579BDF
]
pshufb
m2
,
[
pb_02468ACE_13579BDF
]
%else
psrlw
m0
,
m1
,
8
psrlw
m6
,
m2
,
8
pand
m1
,
[
pw_255
]
pand
m2
,
[
pw_255
]
packuswb
m1
,
m0
packuswb
m2
,
m6
%endif
DEFINE_ARGS
dst
,
stride
,
dst16
,
cnt
lea
dst16q
,
[
dstq
+
strideq
*
8
]
lea
dst16q
,
[
dst16q
+
strideq
*
8
]
SBUTTERFLY
qdq
,
2
,
1
,
6
%if
ARCH_X86_64
SWAP
0
,
8
%else
mova
m0
,
[dstq]
%endif
mov
cntd
,
8
.
loop
:
...
...
@@ -1216,8 +1616,8 @@ cglobal vp9_ipred_vr_32x32, 4, 4, 9, dst, stride, l, a
movhps
[
dst16q
+
stride%1
]
,
%2
movu
[
dst16q
+
stride%1
+
8
]
,
%3
movq
[
dst16q
+
stride%1
+
24
]
,
%4
palignr
%4
,
%3
,
15
palignr
%3
,
%2
,
15
PALIGNR
%4
,
%3
,
15
,
m6
PALIGNR
%3
,
%2
,
15
,
m6
pslldq
%2
,
1
%endmacro
...
...
@@ -1228,15 +1628,18 @@ cglobal vp9_ipred_vr_32x32, 4, 4, 9, dst, stride, l, a
dec
cntd
jg
.
loop
RET
%endif
%endmacro
VR_XMM_FUNCS
ssse3
VR_XMM_FUNCS
avx
INIT_XMM
sse2
VR_XMM_FUNCS
7
INIT_XMM
ssse3
VR_XMM_FUNCS
6
INIT_XMM
avx
VR_XMM_FUNCS
6
; hd
INIT_MMX
ssse3
INIT_MMX
mmxext
cglobal
vp9_ipred_hd_4x4
,
4
,
4
,
0
,
dst
,
stride
,
l
,
a
movd
m0
,
[lq]
punpckldq
m0
,
[
aq
-
1
]
...
...
@@ -1266,9 +1669,8 @@ cglobal vp9_ipred_hd_4x4, 4, 4, 0, dst, stride, l, a
movd
[
dstq
+
strideq
*
0
]
,
m0
RET
%macro
HD_XMM_FUNCS
1
INIT_XMM
%1
cglobal
vp9_ipred_hd_8x8
,
4
,
4
,
4
,
dst
,
stride
,
l
,
a
%macro
HD_XMM_FUNCS
0
cglobal
vp9_ipred_hd_8x8
,
4
,
4
,
5
,
dst
,
stride
,
l
,
a
movq
m0
,
[lq]
movhps
m0
,
[
aq
-
1
]
DEFINE_ARGS
dst
,
stride
,
stride3
,
dst4
...
...
@@ -1296,18 +1698,17 @@ cglobal vp9_ipred_hd_8x8, 4, 4, 4, dst, stride, l, a
movhps
[
dstq
+
stride3q
]
,
m1
movq
[
dst4q
+
stride3q
]
,
m1
palignr
m3
,
m2
,
m1
,
2
PALIGNR
m3
,
m2
,
m1
,
2
,
m4
movhps
[
dstq
+
strideq
*
2
]
,
m3
movq
[
dst4q
+
strideq
*
2
]
,
m3
palignr
m3
,
m2
,
m1
,
4
PALIGNR
m3
,
m2
,
m1
,
4
,
m
4
movhps
[
dstq
+
strideq
*
1
]
,
m3
movq
[
dst4q
+
strideq
*
1
]
,
m3
palignr
m2
,
m1
,
6
PALIGNR
m2
,
m1
,
6
,
m4
movhps
[
dstq
+
strideq
*
0
]
,
m2
movq
[
dst4q
+
strideq
*
0
]
,
m2
RET
INIT_XMM
%1
cglobal
vp9_ipred_hd_16x16
,
4
,
6
,
7
,
dst
,
stride
,
l
,
a
mova
m0
,
[lq]
movu
m3
,
[
aq
-
1
]
...
...
@@ -1319,8 +1720,8 @@ cglobal vp9_ipred_hd_16x16, 4, 6, 7, dst, stride, l, a
psrldq
m4
,
m3
,
1
psrldq
m5
,
m3
,
2
LOWPASS
5
,
4
,
3
,
6
palignr
m1
,
m3
,
m0
,
1
palignr
m2
,
m3
,
m0
,
2
PALIGNR
m1
,
m3
,
m0
,
1
,
m6
PALIGNR
m2
,
m3
,
m0
,
2
,
m6
LOWPASS
2
,
1
,
0
,
6
pavgb
m1
,
m0
SBUTTERFLY
bw
,
1
,
2
,
6
...
...
@@ -1338,17 +1739,26 @@ cglobal vp9_ipred_hd_16x16, 4, 6, 7, dst, stride, l, a
%if
cpuflag
(
avx
)
palignr
m1
,
m2
,
m1
,
2
palignr
m2
,
m5
,
m2
,
2
%el
se
%el
if
cpuflag
(
ssse3
)
palignr
m3
,
m2
,
m1
,
2
palignr
m0
,
m5
,
m2
,
2
mova
m1
,
m3
mova
m2
,
m0
%else
; slightly modified version of PALIGNR
mova
m6
,
m2
mova
m4
,
m5
pslldq
m6
,
14
pslldq
m4
,
14
psrldq
m1
,
2
psrldq
m2
,
2
por
m1
,
m6
por
m2
,
m4
%endif
psrldq
m5
,
2
jg
.
loop
RET
INIT_XMM
%1
cglobal
vp9_ipred_hd_32x32
,
4
,
6
,
8
,
dst
,
stride
,
l
,
a
mova
m0
,
[lq]
mova
m1
,
[
lq
+
16
]
...
...
@@ -1362,15 +1772,15 @@ cglobal vp9_ipred_hd_32x32, 4, 6, 8, dst, stride, l, a
psrldq
m4
,
m3
,
1
psrldq
m5
,
m3
,
2
LOWPASS
5
,
4
,
3
,
6
palignr
m4
,
m3
,
m2
,
2
palignr
m3
,
m2
,
1
PALIGNR
m4
,
m3
,
m2
,
2
,
m6
PALIGNR
m3
,
m2
,
1
,
m6
LOWPASS
4
,
3
,
2
,
6
palignr
m3
,
m2
,
m1
,
2
palignr
m2
,
m1
,
1
PALIGNR
m3
,
m2
,
m1
,
2
,
m6
PALIGNR
m2
,
m1
,
1
,
m6
LOWPASS
3
,
2
,
1
,
6
pavgb
m2
,
m1
palignr
m6
,
m1
,
m0
,
1
palignr
m1
,
m0
,
2
PALIGNR
m6
,
m1
,
m0
,
1
,
m7
PALIGNR
m1
,
m0
,
2
,
m7
LOWPASS
1
,
6
,
0
,
7
pavgb
m0
,
m6
SBUTTERFLY
bw
,
2
,
3
,
6
...
...
@@ -1394,7 +1804,7 @@ cglobal vp9_ipred_hd_32x32, 4, 6, 8, dst, stride, l, a
palignr
m3
,
m4
,
m3
,
2
palignr
m4
,
m5
,
m4
,
2
psrldq
m5
,
2
%el
se
%el
if
cpuflag
(
ssse3
)
psrldq
m6
,
m5
,
2
palignr
m5
,
m4
,
2
palignr
m4
,
m3
,
2
...
...
@@ -1407,18 +1817,46 @@ cglobal vp9_ipred_hd_32x32, 4, 6, 8, dst, stride, l, a
mova
m3
,
m4
mova
m4
,
m5
mova
m5
,
m6
%else
; sort of a half-integrated version of PALIGNR
pslldq
m7
,
m4
,
14
pslldq
m6
,
m5
,
14
psrldq
m4
,
2
psrldq
m5
,
2
por
m4
,
m6
pslldq
m6
,
m3
,
14
psrldq
m3
,
2
por
m3
,
m7
pslldq
m7
,
m2
,
14
psrldq
m2
,
2
por
m2
,
m6
pslldq
m6
,
m1
,
14
psrldq
m1
,
2
por
m1
,
m7
psrldq
m0
,
2
por
m0
,
m6
%endif
jg
.
loop
RET
%endmacro
HD_XMM_FUNCS
ssse3
HD_XMM_FUNCS
avx
INIT_XMM
sse2
HD_XMM_FUNCS
INIT_XMM
ssse3
HD_XMM_FUNCS
INIT_XMM
avx
HD_XMM_FUNCS
INIT_MMX
ssse3
%macro
HU_MMX_FUNCS
0
cglobal
vp9_ipred_hu_4x4
,
3
,
3
,
0
,
dst
,
stride
,
l
movd
m0
,
[lq]
%if
cpuflag
(
ssse3
)
pshufb
m0
,
[
pb_0to2_5x3
]
%else
punpcklbw
m1
,
m0
,
m0
; 00112233
pshufw
m1
,
m1
,
q3333
; 33333333
punpckldq
m0
,
m1
; 01233333
%endif
psrlq
m1
,
m0
,
8
psrlq
m2
,
m1
,
8
LOWPASS
2
,
1
,
0
,
3
...
...
@@ -1426,7 +1864,7 @@ cglobal vp9_ipred_hu_4x4, 3, 3, 0, dst, stride, l
DEFINE_ARGS
dst
,
stride
,
stride3
lea
stride3q
,
[
strideq
*
3
]
SBUTTERFLY
bw
,
1
,
2
,
0
palignr
m2
,
m1
,
2
PALIGNR
m2
,
m1
,
2
,
m0
movd
[
dstq
+
strideq
*
0
]
,
m1
movd
[
dstq
+
strideq
*
1
]
,
m2
punpckhdq
m1
,
m1
...
...
@@ -1434,12 +1872,23 @@ cglobal vp9_ipred_hu_4x4, 3, 3, 0, dst, stride, l
movd
[
dstq
+
strideq
*
2
]
,
m1
movd
[
dstq
+
stride3q
]
,
m2
RET
%endmacro
%macro
HU_XMM_FUNCS
1
INIT_XMM
%1
INIT_MMX
mmxext
HU_MMX_FUNCS
INIT_MMX
ssse3
HU_MMX_FUNCS
%macro
HU_XMM_FUNCS
1
; n_xmm_regs in hu_32x32
cglobal
vp9_ipred_hu_8x8
,
3
,
4
,
4
,
dst
,
stride
,
l
movq
m0
,
[lq]
%if
cpuflag
(
ssse3
)
pshufb
m0
,
[
pb_0to6_9x7
]
%else
punpcklbw
m1
,
m0
,
m0
; 0011223344556677
punpckhwd
m1
,
m1
; 4444555566667777
shufps
m0
,
m1
,
q3310
; 0123456777777777
%endif
psrldq
m1
,
m0
,
1
psrldq
m2
,
m1
,
1
LOWPASS
2
,
1
,
0
,
3
...
...
@@ -1450,56 +1899,81 @@ cglobal vp9_ipred_hu_8x8, 3, 4, 4, dst, stride, l
SBUTTERFLY
bw
,
1
,
2
,
0
movq
[
dstq
+
strideq
*
0
]
,
m1
movhps
[
dst4q
+
strideq
*
0
]
,
m1
palignr
m0
,
m2
,
m1
,
2
PALIGNR
m0
,
m2
,
m1
,
2
,
m3
movq
[
dstq
+
strideq
*
1
]
,
m0
movhps
[
dst4q
+
strideq
*
1
]
,
m0
palignr
m0
,
m2
,
m1
,
4
PALIGNR
m0
,
m2
,
m1
,
4
,
m3
movq
[
dstq
+
strideq
*
2
]
,
m0
movhps
[
dst4q
+
strideq
*
2
]
,
m0
palignr
m2
,
m1
,
6
PALIGNR
m2
,
m1
,
6
,
m3
movq
[
dstq
+
stride3q
]
,
m2
movhps
[
dst4q
+
stride3q
]
,
m2
RET
INIT_XMM
%1
cglobal
vp9_ipred_hu_16x16
,
3
,
4
,
5
,
dst
,
stride
,
l
mova
m0
,
[lq]
%if
cpuflag
(
ssse3
)
mova
m3
,
[
pb_2toE_3xF
]
pshufb
m1
,
m0
,
[
pb_1toE_2xF
]
pshufb
m2
,
m0
,
m3
%else
pand
m3
,
m0
,
[
pb_15x0_1xm1
]
psrldq
m1
,
m0
,
1
por
m1
,
m3
punpckhbw
m3
,
m3
psrldq
m2
,
m0
,
2
por
m2
,
m3
%endif
LOWPASS
2
,
1
,
0
,
4
pavgb
m1
,
m0
DEFINE_ARGS
dst
,
stride
,
stride9
,
cnt
lea
stride9q
,
[
strideq
*
3
]
lea
stride9q
,
[
strideq
*
8
+
strideq
]
mov
cntd
,
4
lea
stride9q
,
[
stride9q
*
3
]
SBUTTERFLY
bw
,
1
,
2
,
0
.
loop
:
mova
[
dstq
+
strideq
*
0
]
,
m1
mova
[
dstq
+
strideq
*
8
]
,
m2
palignr
m0
,
m2
,
m1
,
2
PALIGNR
m0
,
m2
,
m1
,
2
,
m4
%if
cpuflag
(
ssse3
)
pshufb
m2
,
m3
%else
psrldq
m2
,
2
por
m2
,
m3
%endif
mova
[
dstq
+
strideq
*
1
]
,
m0
mova
[
dstq
+
stride9q
]
,
m2
palignr
m1
,
m2
,
m0
,
2
PALIGNR
m1
,
m2
,
m0
,
2
,
m4
%if
cpuflag
(
ssse3
)
pshufb
m2
,
m3
%else
psrldq
m2
,
2
por
m2
,
m3
%endif
lea
dstq
,
[
dstq
+
strideq
*
2
]
dec
cntd
jg
.
loop
RET
INIT_XMM
%1
cglobal
vp9_ipred_hu_32x32
,
3
,
7
,
7
,
dst
,
stride
,
l
cglobal
vp9_ipred_hu_32x32
,
3
,
7
,
%1
,
dst
,
stride
,
l
mova
m1
,
[lq]
mova
m0
,
[
lq
+
16
]
mova
m4
,
[
pb_2toE_3xF
]
palignr
m2
,
m0
,
m1
,
1
palignr
m3
,
m0
,
m1
,
2
PALIGNR
m2
,
m0
,
m1
,
1
,
m5
PALIGNR
m3
,
m0
,
m1
,
2
,
m5
LOWPASS
3
,
2
,
1
,
5
pavgb
m2
,
m1
pshufb
m1
,
m0
,
m4
%if
cpuflag
(
ssse3
)
mova
m4
,
[
pb_2toE_3xF
]
pshufb
m5
,
m0
,
[
pb_1toE_2xF
]
pshufb
m1
,
m0
,
m4
%else
pand
m4
,
m0
,
[
pb_15x0_1xm1
]
psrldq
m5
,
m0
,
1
por
m5
,
m4
punpckhbw
m4
,
m4
psrldq
m1
,
m0
,
2
por
m1
,
m4
%endif
LOWPASS
1
,
5
,
0
,
6
pavgb
m0
,
m5
DEFINE_ARGS
dst
,
stride
,
cnt
,
stride0
,
dst8
,
dst16
,
dst24
...
...
@@ -1510,7 +1984,12 @@ cglobal vp9_ipred_hu_32x32, 3, 7, 7, dst, stride, l
lea
dst24q
,
[
dst16q
+
strideq
*
8
]
SBUTTERFLY
bw
,
0
,
1
,
5
SBUTTERFLY
bw
,
2
,
3
,
5
%if
cpuflag
(
ssse3
)
pshufb
m6
,
m1
,
[
pb_15
]
%else
pshufhw
m6
,
m4
,
q3333
punpckhqdq
m6
,
m6
%endif
.
loop
:
mova
[
dstq
+
stride0q
+
0
]
,
m2
...
...
@@ -1526,7 +2005,7 @@ cglobal vp9_ipred_hu_32x32, 3, 7, 7, dst, stride, l
palignr
m3
,
m0
,
m3
,
2
palignr
m0
,
m1
,
m0
,
2
pshufb
m1
,
m4
%el
se
%el
if
cpuflag
(
ssse3
)
pshufb
m5
,
m1
,
m4
palignr
m1
,
m0
,
2
palignr
m0
,
m3
,
2
...
...
@@ -1535,6 +2014,19 @@ cglobal vp9_ipred_hu_32x32, 3, 7, 7, dst, stride, l
mova
m3
,
m0
mova
m0
,
m1
mova
m1
,
m5
%else
; half-integrated version of PALIGNR
pslldq
m5
,
m1
,
14
pslldq
m7
,
m0
,
14
psrldq
m1
,
2
psrldq
m0
,
2
por
m1
,
m4
por
m0
,
m5
pslldq
m5
,
m3
,
14
psrldq
m3
,
2
por
m3
,
m7
psrldq
m2
,
2
por
m2
,
m5
%endif
add
stride0q
,
strideq
dec
cntd
...
...
@@ -1542,7 +2034,11 @@ cglobal vp9_ipred_hu_32x32, 3, 7, 7, dst, stride, l
RET
%endmacro
HU_XMM_FUNCS
ssse3
HU_XMM_FUNCS
avx
INIT_XMM
sse2
HU_XMM_FUNCS
8
INIT_XMM
ssse3
HU_XMM_FUNCS
7
INIT_XMM
avx
HU_XMM_FUNCS
7
; FIXME 127, 128, 129 ?
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment