Linshizhi / ffmpeg.wasm-core · Commits

Commit e7078e84, authored Jul 24, 2015 by Anton Khirnov
hevcdsp: add x86 SIMD for MC
Parent: 0cef06df
Showing 8 changed files with 1125 additions and 15 deletions (+1125, −15):
- libavcodec/hevc.c              +3   −3
- libavcodec/hevc.h              +1   −1
- libavcodec/hevcdsp.c           +23  −1
- libavcodec/hevcdsp.h           +4   −1
- libavcodec/hevcdsp_template.c  +4   −4
- libavcodec/x86/Makefile        +2   −1
- libavcodec/x86/hevc_mc.asm     +851 −0 (new file)
- libavcodec/x86/hevcdsp_init.c  +237 −4
libavcodec/hevc.c

```diff
@@ -38,9 +38,9 @@
 #include "golomb.h"
 #include "hevc.h"

-const uint8_t ff_hevc_qpel_extra_before[4] = { 0, 3, 3, 2 };
-const uint8_t ff_hevc_qpel_extra_after[4]  = { 0, 3, 4, 4 };
-const uint8_t ff_hevc_qpel_extra[4]        = { 0, 6, 7, 6 };
+const uint8_t ff_hevc_qpel_extra_before[4] = { 0, 3, 3, 3 };
+const uint8_t ff_hevc_qpel_extra_after[4]  = { 0, 4, 4, 4 };
+const uint8_t ff_hevc_qpel_extra[4]        = { 0, 7, 7, 7 };

 static const uint8_t scan_1x1[1] = { 0 };
```
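The switch to uniform 3/4/7 values reflects the new filter handling: every nonzero luma quarter-pel fraction now reads three extra samples before the block and four after it, as an 8-tap filter centred between the two middle taps does. A minimal sketch, not part of the commit (names are illustrative), of how such tables translate into the padded source fetch:

```c
/* Illustrative only: how the "extra" tables size the source rectangle
 * fetched for motion compensation of a w x h prediction block. */
#include <stdio.h>

static const unsigned char extra_before[4] = { 0, 3, 3, 3 };
static const unsigned char extra_after[4]  = { 0, 4, 4, 4 };

int main(void)
{
    int w = 16, h = 16;   /* prediction block size */
    int mx = 2, my = 1;   /* fractional MV parts, 0..3 quarter-pels */

    int src_w = w + extra_before[mx] + extra_after[mx];
    int src_h = h + extra_before[my] + extra_after[my];

    printf("fetch %dx%d source for a %dx%d block\n", src_w, src_h, w, h);
    return 0;
}
```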
libavcodec/hevc.h

```diff
@@ -740,7 +740,7 @@ typedef struct HEVCPredContext {
 } HEVCPredContext;

 typedef struct HEVCLocalContext {
-    DECLARE_ALIGNED(16, int16_t, mc_buffer[(MAX_PB_SIZE + 7) * MAX_PB_SIZE]);
+    DECLARE_ALIGNED(16, int16_t, mc_buffer[(MAX_PB_SIZE + 24) * MAX_PB_SIZE]);
     uint8_t cabac_state[HEVC_CONTEXTS];
     uint8_t first_qp_group;
```
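The buffer grows because the new two-pass hv functions keep the entire horizontal-pass output (height + 7 rows at an aligned int16_t stride) in mc_buffer before the vertical pass runs. A back-of-the-envelope check, hedged arithmetic assuming MAX_PB_SIZE == 64, not taken from the commit itself:

```c
#include <stdio.h>

#define FFALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
    int w = 64, h = 64;              /* assuming MAX_PB_SIZE == 64 */
    int stride = FFALIGN(w + 7, 8);  /* h-pass row, in int16_t elements */
    int needed = stride * (h + 7);   /* rows buffered for the v pass */
    printf("needed %d, old (64+7)*64 = %d, new (64+24)*64 = %d\n",
           needed, (64 + 7) * 64, (64 + 24) * 64);  /* 5112, 4544, 5632 */
    return 0;
}
```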
libavcodec/hevcdsp.c

```diff
@@ -89,7 +89,7 @@ static const int8_t transform[32][32] = {
       90, -90,  88, -85,  82, -78,  73, -67,  61, -54,  46, -38,  31, -22,  13,  -4 },
 };

-DECLARE_ALIGNED(16, const int8_t,  ff_hevc_epel_filters[7][16]) = {
+DECLARE_ALIGNED(16, const int16_t, ff_hevc_epel_coeffs[7][16]) = {
     { -2, 58, 10, -2, -2, 58, 10, -2, -2, 58, 10, -2, -2, 58, 10, -2 },
     { -4, 54, 16, -2, -4, 54, 16, -2, -4, 54, 16, -2, -4, 54, 16, -2 },
     { -6, 46, 28, -4, -6, 46, 28, -4, -6, 46, 28, -4, -6, 46, 28, -4 },
@@ -99,6 +99,28 @@ DECLARE_ALIGNED(16, const int8_t, ff_hevc_epel_filters[7][16]) = {
     { -2, 10, 58, -2, -2, 10, 58, -2, -2, 10, 58, -2, -2, 10, 58, -2 },
 };

+DECLARE_ALIGNED(16, const int8_t, ff_hevc_epel_coeffs8[7][16]) = {
+    { -2, 58, 10, -2, -2, 58, 10, -2, -2, 58, 10, -2, -2, 58, 10, -2 },
+    { -4, 54, 16, -2, -4, 54, 16, -2, -4, 54, 16, -2, -4, 54, 16, -2 },
+    { -6, 46, 28, -4, -6, 46, 28, -4, -6, 46, 28, -4, -6, 46, 28, -4 },
+    { -4, 36, 36, -4, -4, 36, 36, -4, -4, 36, 36, -4, -4, 36, 36, -4 },
+    { -4, 28, 46, -6, -4, 28, 46, -6, -4, 28, 46, -6, -4, 28, 46, -6 },
+    { -2, 16, 54, -4, -2, 16, 54, -4, -2, 16, 54, -4, -2, 16, 54, -4 },
+    { -2, 10, 58, -2, -2, 10, 58, -2, -2, 10, 58, -2, -2, 10, 58, -2 },
+};
+
+DECLARE_ALIGNED(16, const int16_t, ff_hevc_qpel_coeffs[3][8]) = {
+    { -1, 4, -10, 58, 17,  -5,  1,  0 },
+    { -1, 4, -11, 40, 40, -11,  4, -1 },
+    {  0, 1,  -5, 17, 58, -10,  4, -1 },
+};
+
+DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_coeffs8[3][16]) = {
+    { -1, 4, -10, 58, 17,  -5,  1,  0, -1, 4, -10, 58, 17,  -5,  1,  0 },
+    { -1, 4, -11, 40, 40, -11,  4, -1, -1, 4, -11, 40, 40, -11,  4, -1 },
+    {  0, 1,  -5, 17, 58, -10,  4, -1,  0, 1,  -5, 17, 58, -10,  4, -1 },
+};
+
 #define BIT_DEPTH 8
 #include "hevcdsp_template.c"
 #undef BIT_DEPTH
```
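Each row of the new *_coeffs8 tables holds the same filter repeated across all 16 bytes (four times for the 4-tap epel filters, twice for the 8-tap qpel filters), so a single aligned load feeds pmaddubsw directly: coefficients are consumed in adjacent pairs, each pair multiplied against two neighbouring samples and summed. A scalar model of that pairwise multiply-accumulate (illustrative only, not the actual decoder code):

```c
/* Scalar model of how the SIMD consumes the paired coefficients:
 * pmaddubsw multiplies adjacent (sample, coeff) pairs and adds them,
 * so an 8-tap filter becomes four paired MACs. */
#include <stdint.h>
#include <stdio.h>

static int qpel_8tap(const uint8_t *src, const int8_t *c)
{
    int sum = 0;
    /* four pairs: (c0,c1), (c2,c3), (c4,c5), (c6,c7); the asm feeds
     * each pair to one pmaddubsw and accumulates with paddsw */
    for (int pair = 0; pair < 4; pair++)
        sum += src[2 * pair] * c[2 * pair] + src[2 * pair + 1] * c[2 * pair + 1];
    return sum;
}

int main(void)
{
    const int8_t coeffs[8] = { -1, 4, -11, 40, 40, -11, 4, -1 }; /* half-pel */
    const uint8_t line[8]  = { 10, 10, 10, 10, 10, 10, 10, 10 };
    /* a flat signal comes out scaled by the coefficient sum (64) */
    printf("%d\n", qpel_8tap(line, coeffs)); /* 640 */
    return 0;
}
```

A flat input must come out scaled by the coefficient sum, 64 for all of these filters, which is what the example checks.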
libavcodec/hevcdsp.h

```diff
@@ -118,6 +118,9 @@ void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth);

 void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth);

-extern const int8_t ff_hevc_epel_filters[7][16];
+extern const int16_t ff_hevc_epel_coeffs[7][16];
+extern const int8_t  ff_hevc_epel_coeffs8[7][16];
+extern const int16_t ff_hevc_qpel_coeffs[3][8];
+extern const int8_t  ff_hevc_qpel_coeffs8[3][16];

 #endif /* AVCODEC_HEVCDSP_H */
```
libavcodec/hevcdsp_template.c

```diff
@@ -1018,7 +1018,7 @@ static inline void FUNC(put_hevc_epel_h)(int16_t *dst, ptrdiff_t dststride,
     int x, y;
     pixel *src = (pixel *)_src;
     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-    const int8_t *filter = ff_hevc_epel_filters[mx - 1];
+    const int16_t *filter = ff_hevc_epel_coeffs[mx - 1];
     int8_t filter_0 = filter[0];
     int8_t filter_1 = filter[1];
     int8_t filter_2 = filter[2];
@@ -1040,7 +1040,7 @@ static inline void FUNC(put_hevc_epel_v)(int16_t *dst, ptrdiff_t dststride,
     int x, y;
     pixel *src = (pixel *)_src;
     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-    const int8_t *filter = ff_hevc_epel_filters[my - 1];
+    const int16_t *filter = ff_hevc_epel_coeffs[my - 1];
     int8_t filter_0 = filter[0];
     int8_t filter_1 = filter[1];
     int8_t filter_2 = filter[2];
@@ -1063,8 +1063,8 @@ static inline void FUNC(put_hevc_epel_hv)(int16_t *dst, ptrdiff_t dststride,
     int x, y;
     pixel *src = (pixel *)_src;
     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-    const int8_t *filter_h = ff_hevc_epel_filters[mx - 1];
-    const int8_t *filter_v = ff_hevc_epel_filters[my - 1];
+    const int16_t *filter_h = ff_hevc_epel_coeffs[mx - 1];
+    const int16_t *filter_v = ff_hevc_epel_coeffs[my - 1];
     int8_t filter_0 = filter_h[0];
     int8_t filter_1 = filter_h[1];
     int8_t filter_2 = filter_h[2];
```
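For reference, the scalar template these tables feed computes a plain 4-tap FIR per output sample. A simplified model of what FUNC(put_hevc_epel_h) does for the 8-bit case (illustrative; the real template is generated per bit depth):

```c
#include <stdint.h>
#include <stdio.h>

static void epel_h(int16_t *dst, const uint8_t *src, int width,
                   const int16_t *filter, int shift)
{
    for (int x = 0; x < width; x++) {
        int sum = filter[0] * src[x - 1] + filter[1] * src[x] +
                  filter[2] * src[x + 1] + filter[3] * src[x + 2];
        dst[x] = sum >> shift;  /* shift is BIT_DEPTH - 8, so 0 here */
    }
}

int main(void)
{
    const int16_t filter[4] = { -4, 36, 36, -4 };  /* epel half-pel row */
    uint8_t src[8] = { 50, 50, 50, 50, 50, 50, 50, 50 };
    int16_t dst[4];
    epel_h(dst, src + 2, 4, filter, 0);
    printf("%d\n", dst[0]);   /* 50 * 64 = 3200 */
    return 0;
}
```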
libavcodec/x86/Makefile

```diff
@@ -113,7 +113,8 @@ YASM-OBJS-$(CONFIG_VP8DSP) += x86/vp8dsp.o \
 YASM-OBJS-$(CONFIG_AAC_DECODER)        += x86/sbrdsp.o
 YASM-OBJS-$(CONFIG_APE_DECODER)        += x86/apedsp.o
 YASM-OBJS-$(CONFIG_DCA_DECODER)        += x86/dcadsp.o
-YASM-OBJS-$(CONFIG_HEVC_DECODER)       += x86/hevc_deblock.o
+YASM-OBJS-$(CONFIG_HEVC_DECODER)       += x86/hevc_deblock.o \
+                                          x86/hevc_mc.o
 YASM-OBJS-$(CONFIG_PNG_DECODER)        += x86/pngdsp.o
 YASM-OBJS-$(CONFIG_PRORES_DECODER)     += x86/proresdsp.o
 YASM-OBJS-$(CONFIG_RV40_DECODER)       += x86/rv40dsp.o
```
libavcodec/x86/hevc_mc.asm (new file, mode 100644)

```asm
;*****************************************************************************
;* x86-optimized HEVC MC
;* Copyright 2015 Anton Khirnov
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

SECTION .rodata

pw_1023: times 8 dw 1023

cextern hevc_qpel_coeffs
cextern hevc_qpel_coeffs8
cextern hevc_epel_coeffs
cextern hevc_epel_coeffs8

cextern pw_8
cextern pw_16
cextern pw_32
cextern pw_64

SECTION .text

; %1: width
; %2: bit depth
%macro COMMON_DEFS 2
    %assign blocksize 8
    %assign nb_blocks ((%1 + blocksize - 1) / blocksize)
    %define last_block_truncated (blocksize * nb_blocks > %1)
%if %2 > 8
    %define LOAD_BLOCK     movu
    %define LOAD_HALFBLOCK movq
    %assign pixelsize 2
%else
    %define LOAD_BLOCK     movq
    %define LOAD_HALFBLOCK movd
    %assign pixelsize 1
%endif
    %define STORE_BLOCK     mova
    %define STORE_HALFBLOCK movq
%endmacro

; %1: block index
%macro BLOCK_DEFS 1
%if last_block_truncated && %1 == nb_blocks - 1
    %define block_truncated 1
    %define LOAD  LOAD_HALFBLOCK
    %define STORE STORE_HALFBLOCK
%else
    %define block_truncated 0
    %define LOAD  LOAD_BLOCK
    %define STORE STORE_BLOCK
%endif
%endmacro
```
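COMMON_DEFS and BLOCK_DEFS drive everything that follows: a row is processed as 8-pixel blocks, and when the width is not a multiple of 8 the final block is "truncated" and handled with the half-width load/store pair. An illustrative C model of the decomposition (not part of the commit):

```c
#include <stdio.h>

int main(void)
{
    static const int widths[] = { 4, 8, 12, 16, 24, 32, 48, 64 };
    for (int i = 0; i < 8; i++) {
        int width     = widths[i];
        int blocksize = 8;
        int nb_blocks = (width + blocksize - 1) / blocksize;
        int truncated = blocksize * nb_blocks > width;
        printf("width %2d: %d block(s)%s\n", width, nb_blocks,
               truncated ? ", last one truncated" : "");
    }
    return 0;
}
```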
```asm
; hevc_get_pixels_<w>_<d>(int16_t *dst, ptrdiff_t dststride,
;                         pixel *src, ptrdiff_t srcstride,
;                         int height, int mx, int my, int *mcbuffer)
; %1: block width
; %2: bit depth
; %3: log2 of height unroll
%macro GET_PIXELS 3
cglobal hevc_get_pixels_ %+ %1 %+ _ %+ %2, 5, 5, 2, dst, dststride, src, srcstride, height ; rest of the args unused
    %assign shift 14 - %2
    COMMON_DEFS %1, %2

%if pixelsize == 1
    pxor m0, m0
%endif

    shr heightd, %3
.loop:
%assign i 0
%rep (1 << %3)

%assign j 0
%rep nb_blocks
    BLOCK_DEFS j
    LOAD m1, [srcq + j * pixelsize * blocksize]
%if pixelsize == 1
    punpcklbw m1, m0
%endif
    psllw m1, shift
    STORE [dstq + j * 2 * blocksize], m1
%assign j (j + 1)
%endrep

    add dstq, dststrideq
    add srcq, srcstrideq
%assign i (i + 1)
%endrep

    dec heightd
    jg .loop
    RET
%endmacro

INIT_XMM sse2
GET_PIXELS  4, 8, 1
GET_PIXELS  8, 8, 1
GET_PIXELS 12, 8, 3
GET_PIXELS 16, 8, 2
GET_PIXELS 24, 8, 3
GET_PIXELS 32, 8, 3
GET_PIXELS 48, 8, 3
GET_PIXELS 64, 8, 3

GET_PIXELS  4, 10, 1
GET_PIXELS  8, 10, 1
GET_PIXELS 12, 10, 3
GET_PIXELS 16, 10, 2
GET_PIXELS 24, 10, 3
GET_PIXELS 32, 10, 3
GET_PIXELS 48, 10, 3
GET_PIXELS 64, 10, 3
```
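hevc_get_pixels_* only converts pixels into the signed 14-bit intermediate domain (psllw by 14 − bit depth) shared by all later MC stages; the %3 unroll merely amortizes loop overhead for tall blocks. A scalar equivalent, as a sketch of the 8-bit case:

```c
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

static void get_pixels_8bit(int16_t *dst, ptrdiff_t dststride,
                            const uint8_t *src, ptrdiff_t srcstride,
                            int width, int height)
{
    const int shift = 14 - 8;             /* the psllw amount: 14 - depth */
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++)
            dst[x] = src[x] << shift;
        dst += dststride / sizeof(*dst);  /* strides are in bytes */
        src += srcstride;
    }
}

int main(void)
{
    uint8_t src[4] = { 1, 2, 3, 4 };
    int16_t dst[4];
    get_pixels_8bit(dst, 8, src, 4, 4, 1);
    printf("%d %d\n", dst[0], dst[3]);    /* 64 256 */
    return 0;
}
```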
```asm
; hevc_qpel_h/v_<w>_8(int16_t *dst, ptrdiff_t dststride,
;                     uint8_t *src, ptrdiff_t srcstride,
;                     int height, int mx, int my, int *mcbuffer)
; 8-bit qpel interpolation
; %1: block width
; %2: 0 - horizontal; 1 - vertical
%macro QPEL_8 2
%if %2
    %define postfix    v
    %define mvfrac     myq
    %define coeffsaddr r5q
    %define pixstride  srcstrideq
    %define pixstride3 r5q
    %define src_m3     r6q
%else
    %define postfix    h
    %define mvfrac     mxq
    %define coeffsaddr r6q
    %define pixstride  1
    %define pixstride3 3
    %define src_m3     (srcq - 3)
%endif

    COMMON_DEFS %1, 8

cglobal hevc_qpel_ %+ postfix %+ _ %+ %1 %+ _8, 7, 7, 7, dst, dststride, src, srcstride, height, mx, my
    and    mvfrac, 0x3
    dec    mvfrac
    shl    mvfrac, 4
    lea    coeffsaddr, [hevc_qpel_coeffs8]
    mova   m0, [coeffsaddr + mvfrac]
    SPLATW m1, m0, 1
    SPLATW m2, m0, 2
    SPLATW m3, m0, 3
    SPLATW m0, m0, 0

%if %2
    lea pixstride3, [srcstrideq + 2 * srcstrideq]
    mov src_m3, srcq
    sub src_m3, pixstride3
%endif

.loop:
%assign i 0
%rep nb_blocks
    BLOCK_DEFS i
    LOAD      m4, [src_m3 + i * blocksize]
    LOAD      m5, [src_m3 + i * blocksize + 1 * pixstride]
    punpcklbw m4, m5
    pmaddubsw m4, m0

    LOAD      m5, [src_m3 + i * blocksize + 2 * pixstride]
    LOAD      m6, [srcq   + i * blocksize]
    punpcklbw m5, m6
    pmaddubsw m5, m1
    paddsw    m4, m5

    LOAD      m5, [srcq + i * blocksize + 1 * pixstride]
    LOAD      m6, [srcq + i * blocksize + 2 * pixstride]
    punpcklbw m5, m6
    pmaddubsw m5, m2
    paddsw    m4, m5

    LOAD      m5, [srcq + i * blocksize + pixstride3]
    LOAD      m6, [srcq + i * blocksize + 4 * pixstride]
    punpcklbw m5, m6
    pmaddubsw m5, m3
    paddsw    m4, m5

    STORE [dstq + i * 2 * blocksize], m4
%assign i (i + 1)
%endrep

    add dstq, dststrideq
    add srcq, srcstrideq
%if %2
    add src_m3, srcstrideq
%endif
    dec heightd
    jg .loop
    RET
%endmacro

INIT_XMM ssse3
QPEL_8  4, 0
QPEL_8  8, 0
QPEL_8 12, 0
QPEL_8 16, 0
QPEL_8 24, 0
QPEL_8 32, 0
QPEL_8 48, 0
QPEL_8 64, 0

QPEL_8  4, 1
QPEL_8  8, 1
QPEL_8 12, 1
QPEL_8 16, 1
QPEL_8 24, 1
QPEL_8 32, 1
QPEL_8 48, 1
QPEL_8 64, 1
```
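QPEL_8 derives both orientations from a single body: horizontally adjacent taps are one byte apart, vertically they are one source row apart, so the macro only swaps the tap distance (pixstride) and the "three taps back" base pointer (src_m3). A C restatement of the addressing, with illustrative names:

```c
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

static int filter8(const uint8_t *center, ptrdiff_t pixstride,
                   const int8_t *taps)
{
    const uint8_t *p = center - 3 * pixstride;   /* the src_m3 base */
    int sum = 0;
    for (int t = 0; t < 8; t++)
        sum += taps[t] * p[t * pixstride];
    return sum;
}

int main(void)
{
    static const int8_t taps[8] = { -1, 4, -11, 40, 40, -11, 4, -1 };
    uint8_t row[16];
    for (int i = 0; i < 16; i++) row[i] = 100;

    /* pixstride 1 filters horizontally; passing the row stride instead
     * filters vertically with the exact same code */
    printf("%d\n", filter8(row + 8, 1, taps));   /* 100 * 64 = 6400 */
    return 0;
}
```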
```asm
; 16-bit qpel interpolation
; %1: block width
; %2: shift applied to the result
; %3: 0 - horizontal; 1 - vertical
%macro QPEL_16 3
%if %3
    %define mvfrac     myq
    %define pixstride  srcstrideq
    %define pixstride3 sstride3q
    %define src_m3     srcm3q
%else
    %define mvfrac     mxq
    %define pixstride  2
    %define pixstride3 6
    %define src_m3     (srcq - 6)
%endif

    COMMON_DEFS %1, 16

    and    mvfrac, 0x3
    dec    mvfrac
    shl    mvfrac, 4
    lea    coeffsregq, [hevc_qpel_coeffs]
    mova   m0, [coeffsregq + mvfrac]
    pshufd m1, m0, 0x55
    pshufd m2, m0, 0xaa
    pshufd m3, m0, 0xff
    pshufd m0, m0, 0x00

%if %3
    lea sstride3q, [srcstrideq + 2 * srcstrideq]
    mov srcm3q, srcq
    sub srcm3q, sstride3q
%endif

.loop:
%assign i 0
%rep nb_blocks
    BLOCK_DEFS i
    LOAD m4,  [src_m3 + i * 2 * blocksize]
    LOAD m5,  [src_m3 + i * 2 * blocksize + 1 * pixstride]
    LOAD m6,  [src_m3 + i * 2 * blocksize + 2 * pixstride]
    LOAD m7,  [srcq   + i * 2 * blocksize + 0 * pixstride]
    LOAD m8,  [srcq   + i * 2 * blocksize + 1 * pixstride]
    LOAD m9,  [srcq   + i * 2 * blocksize + 2 * pixstride]
    LOAD m10, [srcq   + i * 2 * blocksize + pixstride3]
    LOAD m11, [srcq   + i * 2 * blocksize + 4 * pixstride]

    punpcklwd m12, m4, m5
    pmaddwd   m12, m0
    punpcklwd m13, m6, m7
    pmaddwd   m13, m1
    paddd     m12, m13
    punpcklwd m13, m8, m9
    pmaddwd   m13, m2
    paddd     m12, m13
    punpcklwd m13, m10, m11
    pmaddwd   m13, m3
    paddd     m12, m13
    psrad     m12, %2

%if block_truncated == 0
    punpckhwd m4, m5
    pmaddwd   m4, m0
    punpckhwd m6, m7
    pmaddwd   m6, m1
    paddd     m4, m6
    punpckhwd m8, m9
    pmaddwd   m8, m2
    paddd     m4, m8
    punpckhwd m10, m11
    pmaddwd   m10, m3
    paddd     m4, m10
    psrad     m4, %2
%endif
    packssdw m12, m4

    STORE [dstq + i * 2 * blocksize], m12
%assign i (i + 1)
%endrep

    add dstq, dststrideq
    add srcq, srcstrideq
%if %3
    add srcm3q, srcstrideq
%endif
    dec heightd
    jg .loop
    RET
%endmacro
```
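The 16-bit path cannot use pmaddubsw (a byte instruction), so QPEL_16 interleaves two rows with punpcklwd/punpckhwd and lets pmaddwd do the paired multiply into 32-bit accumulators, shifting by %2 before packssdw saturates back to 16 bits. A scalar model of one such step (illustrative only):

```c
/* One pmaddwd step: each (row0, row1) word pair is multiplied by the
 * matching coefficient pair and summed into a 32-bit accumulator. */
#include <stdint.h>
#include <stdio.h>

static void pmaddwd_pair(int32_t *acc, const int16_t *row0,
                         const int16_t *row1, int16_t c0, int16_t c1, int n)
{
    for (int i = 0; i < n; i++)
        acc[i] += row0[i] * c0 + row1[i] * c1;
}

int main(void)
{
    int16_t r0[4] = { 100, 100, 100, 100 }, r1[4] = { 200, 200, 200, 200 };
    int32_t acc[4] = { 0 };
    pmaddwd_pair(acc, r0, r1, -11, 40, 4);  /* taps 2 and 3 of a qpel row */
    printf("%d\n", acc[0]);                 /* -1100 + 8000 = 6900 */
    return 0;
}
```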
```asm
%if ARCH_X86_64
%macro QPEL_H_10 1
cglobal hevc_qpel_h_ %+ %1 %+ _10, 7, 9, 14, dst, dststride, src, srcstride, height, mx, my, mcbuffer, coeffsreg
    QPEL_16 %1, 2, 0
%endmacro

INIT_XMM avx
QPEL_H_10  4
QPEL_H_10  8
QPEL_H_10 12
QPEL_H_10 16
QPEL_H_10 24
QPEL_H_10 32
QPEL_H_10 48
QPEL_H_10 64

%macro QPEL_V_10 1
cglobal hevc_qpel_v_ %+ %1 %+ _10, 7, 10, 14, dst, dststride, src, srcstride, height, mx, my, sstride3, srcm3, coeffsreg
    QPEL_16 %1, 2, 1
%endmacro

INIT_XMM avx
QPEL_V_10  4
QPEL_V_10  8
QPEL_V_10 12
QPEL_V_10 16
QPEL_V_10 24
QPEL_V_10 32
QPEL_V_10 48
QPEL_V_10 64

; hevc_qpel_hv_<w>(int16_t *dst, ptrdiff_t dststride,
;                  uint8_t *src, ptrdiff_t srcstride,
;                  int height, int mx, int my, int *mcbuffer)
%macro QPEL_HV 1
cglobal hevc_qpel_hv_ %+ %1, 7, 10, 14, dst, dststride, src, srcstride, height, mx, my, sstride3, srcm3, coeffsreg
    QPEL_16 %1, 6, 1
%endmacro

INIT_XMM avx
QPEL_HV  4
QPEL_HV  8
QPEL_HV 12
QPEL_HV 16
QPEL_HV 24
QPEL_HV 32
QPEL_HV 48
QPEL_HV 64
%endif ; ARCH_X86_64

; hevc_epel_h/v_<w>_8(int16_t *dst, ptrdiff_t dststride,
;                     uint8_t *src, ptrdiff_t srcstride,
;                     int height, int mx, int my, int *mcbuffer)
; 8-bit epel interpolation
; %1: block width
; %2: 0 - horizontal; 1 - vertical
%macro EPEL_8 2
%if %2
    %define postfix    v
    %define mvfrac     myq
    %define coeffsaddr r5q
    %define pixstride  srcstrideq
    %define pixstride3 r5q
%else
    %define postfix    h
    %define mvfrac     mxq
    %define coeffsaddr r6q
    %define pixstride  1
    %define pixstride3 3
%endif

    COMMON_DEFS %1, 8

cglobal hevc_epel_ %+ postfix %+ _ %+ %1 %+ _8, 7, 7, 6, dst, dststride, src, srcstride, height, mx, my
    and    mvfrac, 0x7
    dec    mvfrac
    shl    mvfrac, 4
    lea    coeffsaddr, [hevc_epel_coeffs8]
    movq   m0, [coeffsaddr + mvfrac]
    SPLATW m1, m0, 1
    SPLATW m0, m0, 0

%if %2
    lea pixstride3, [srcstrideq + 2 * srcstrideq]
%endif
    sub srcq, pixstride

.loop:
%assign i 0
%rep nb_blocks
    BLOCK_DEFS i
    LOAD m2, [srcq + i * blocksize + 0 * pixstride]
    LOAD m3, [srcq + i * blocksize + 1 * pixstride]
    LOAD m4, [srcq + i * blocksize + 2 * pixstride]
    LOAD m5, [srcq + i * blocksize + pixstride3]

    punpcklbw m2, m3
    punpcklbw m4, m5
    pmaddubsw m2, m0
    pmaddubsw m4, m1
    paddsw    m2, m4

    STORE [dstq + i * 2 * blocksize], m2
%assign i (i + 1)
%endrep

    add dstq, dststrideq
    add srcq, srcstrideq
    dec heightd
    jg .loop
    RET
%endmacro

INIT_XMM ssse3
EPEL_8  4, 0
EPEL_8  8, 0
EPEL_8 12, 0
EPEL_8 16, 0
EPEL_8 24, 0
EPEL_8 32, 0

EPEL_8  4, 1
EPEL_8  8, 1
EPEL_8 12, 1
EPEL_8 16, 1
EPEL_8 24, 1
EPEL_8 32, 1

%macro EPEL_16 3
%if %3
    %define mvfrac     myq
    %define pixstride  srcstrideq
    %define pixstride3 sstride3q
%else
    %define mvfrac     mxq
    %define pixstride  2
    %define pixstride3 6
%endif

    COMMON_DEFS %1, 16

    and    mvfrac, 0x7
    dec    mvfrac
    shl    mvfrac, 5
    lea    coeffsregq, [hevc_epel_coeffs]
    mova   m0, [coeffsregq + mvfrac]
    pshufd m1, m0, 0x55
    pshufd m0, m0, 0x00

%if %3
    lea sstride3q, [srcstrideq + 2 * srcstrideq]
%endif
    sub srcq, pixstride

.loop:
%assign i 0
%rep nb_blocks
    BLOCK_DEFS i
    LOAD m2, [srcq + i * 2 * blocksize + 0 * pixstride]
    LOAD m3, [srcq + i * 2 * blocksize + 1 * pixstride]
    LOAD m4, [srcq + i * 2 * blocksize + 2 * pixstride]
    LOAD m5, [srcq + i * 2 * blocksize + pixstride3]

    punpcklwd m6, m2, m3
    punpcklwd m7, m4, m5
    pmaddwd   m6, m0
    pmaddwd   m7, m1
    paddd     m6, m7
    psrad     m6, %2

%if block_truncated == 0
    punpckhwd m2, m3
    punpckhwd m4, m5
    pmaddwd   m2, m0
    pmaddwd   m4, m1
    paddd     m2, m4
    psrad     m2, %2
%endif
    packssdw m6, m2

    STORE [dstq + i * 2 * blocksize], m6
%assign i (i + 1)
%endrep

    add dstq, dststrideq
    add srcq, srcstrideq
    dec heightd
    jg .loop
    RET
%endmacro

%if ARCH_X86_64
%macro EPEL_H_10 1
cglobal hevc_epel_h_ %+ %1 %+ _10, 8, 9, 8, dst, dststride, src, srcstride, height, mx, my, sstride3, coeffsreg
    EPEL_16 %1, 2, 0
%endmacro

INIT_XMM avx
EPEL_H_10  4
EPEL_H_10  8
EPEL_H_10 12
EPEL_H_10 16
EPEL_H_10 24
EPEL_H_10 32

%macro EPEL_V_10 1
cglobal hevc_epel_v_ %+ %1 %+ _10, 8, 9, 8, dst, dststride, src, srcstride, height, mx, my, sstride3, coeffsreg
    EPEL_16 %1, 2, 1
%endmacro

INIT_XMM avx
EPEL_V_10  4
EPEL_V_10  8
EPEL_V_10 12
EPEL_V_10 16
EPEL_V_10 24
EPEL_V_10 32

; hevc_epel_hv_<w>_8(int16_t *dst, ptrdiff_t dststride,
;                    int16_t *src, ptrdiff_t srcstride,
;                    int height, int mx, int my, int *mcbuffer)
%macro EPEL_HV 1
cglobal hevc_epel_hv_ %+ %1, 8, 9, 8, dst, dststride, src, srcstride, height, mx, my, sstride3, coeffsreg
    EPEL_16 %1, 6, 1
%endmacro

INIT_XMM avx
EPEL_HV  4
EPEL_HV  8
EPEL_HV 12
EPEL_HV 16
EPEL_HV 24
EPEL_HV 32
%endif ; ARCH_X86_64
```
```asm
; hevc_put_unweighted_pred_<w>_<d>(pixel *dst, ptrdiff_t dststride,
;                                  int16_t *src, ptrdiff_t srcstride,
;                                  int height)
%macro AVG 5
%if %3
%if %4 == 4
    movq   %5, %2
    paddsw %1, %5
%else
    paddsw %1, %2
%endif
%endif
%endmacro

; %1: 0 - one source; 1 - two sources
; %2: width
; %3: bit depth
%macro PUT_PRED 3
%if %1
cglobal hevc_put_unweighted_pred_avg_ %+ %2 %+ _ %+ %3, 6, 6, 4, dst, dststride, src, src2, srcstride, height
%else
cglobal hevc_put_unweighted_pred_ %+ %2 %+ _ %+ %3, 5, 5, 4, dst, dststride, src, srcstride, height
%endif
    %assign shift 14 + %1 - %3
    %assign offset (1 << (shift - 1))
    %define offset_data pw_ %+ offset
    mova m0, [offset_data]

%if %3 > 8
    %define STORE_BLOCK movu
    %define STORE_HALF  movq
    %assign pixel_max ((1 << %3) - 1)
    %define pw_pixel_max pw_ %+ pixel_max
    pxor m1, m1
    mova m2, [pw_pixel_max]
%else
    %define STORE_BLOCK movq
    %define STORE_HALF  movd
%endif

.loop:
%assign i 0
%rep (%2 + 7) / 8
%if (i + 1) * 8 > %2
    %define LOAD  movq
    %define STORE STORE_HALF
%else
    %define LOAD  mova
    %define STORE STORE_BLOCK
%endif
    LOAD   m3, [srcq + 16 * i]
    AVG    m3, [src2q + 16 * i], %1, %2 - i * 8, m4
    paddsw m3, m0
    psraw  m3, shift

%if %3 == 8
    packuswb m3, m3
    STORE [dstq + 8 * i], m3
%else
    CLIPW m3, m1, m2
    STORE [dstq + 16 * i], m3
%endif
%assign i (i + 1)
%endrep

    add dstq, dststrideq
    add srcq, srcstrideq
%if %1
    add src2q, srcstrideq
%endif
    dec heightd
    jg .loop
    RET
%endmacro

INIT_XMM sse2
PUT_PRED 0,  4, 8
PUT_PRED 1,  4, 8
PUT_PRED 0,  8, 8
PUT_PRED 1,  8, 8
PUT_PRED 0, 12, 8
PUT_PRED 1, 12, 8
PUT_PRED 0, 16, 8
PUT_PRED 1, 16, 8
PUT_PRED 0, 24, 8
PUT_PRED 1, 24, 8
PUT_PRED 0, 32, 8
PUT_PRED 1, 32, 8
PUT_PRED 0, 48, 8
PUT_PRED 1, 48, 8
PUT_PRED 0, 64, 8
PUT_PRED 1, 64, 8

PUT_PRED 0,  4, 10
PUT_PRED 1,  4, 10
PUT_PRED 0,  8, 10
PUT_PRED 1,  8, 10
PUT_PRED 0, 12, 10
PUT_PRED 1, 12, 10
PUT_PRED 0, 16, 10
PUT_PRED 1, 16, 10
PUT_PRED 0, 24, 10
PUT_PRED 1, 24, 10
PUT_PRED 0, 32, 10
PUT_PRED 1, 32, 10
PUT_PRED 0, 48, 10
PUT_PRED 1, 48, 10
PUT_PRED 0, 64, 10
PUT_PRED 1, 64, 10
```
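PUT_PRED's output stage is the usual round-and-shift back from the 14-bit domain: shift is 14 + avg − depth, the pw_* constant loaded into m0 is the matching half-ulp rounding offset, and packuswb (8-bit) or CLIPW (10-bit) clamps to the pixel range. A scalar model, per sample and illustrative only:

```c
#include <stdint.h>
#include <stdio.h>

static uint8_t put_pred_8bit(int src, int avg /* 0 or 1 */)
{
    int shift  = 14 + avg - 8;   /* 6 for one source, 7 for two summed */
    int offset = 1 << (shift - 1);
    int v = (src + offset) >> shift;
    if (v < 0)   v = 0;          /* packuswb saturates like this */
    if (v > 255) v = 255;
    return (uint8_t)v;
}

int main(void)
{
    /* a mid-grey 8-bit sample (128) in the 14-bit domain is 128 << 6 */
    printf("%d\n", put_pred_8bit(128 << 6, 0));   /* 128 */
    return 0;
}
```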
```asm
%macro PUT_WEIGHTED_PRED 3
%if %1
cglobal hevc_put_weighted_pred_avg_ %+ %2 %+ _ %+ %3, 11, 11, 8, denom, weight0, weight1, offset0, offset1, dst, dststride, src0, src1, srcstride, height
%else
cglobal hevc_put_weighted_pred_ %+ %2 %+ _ %+ %3, 8, 8, 8, denom, weight0, offset0, dst, dststride, src0, srcstride, height
%endif
    and   denomd, 0xff
    movsx weight0d, weight0w
    movsx offset0d, offset0w
%if %1
    movsx weight1d, weight1w
    movsx offset1d, offset1w
%endif
    add  denomd, 14 + %1 - %3
    movd m0, denomd

%if %3 > 8
    %assign pixel_max ((1 << %3) - 1)
    %define pw_pixel_max pw_ %+ pixel_max
    pxor m4, m4
    mova m5, [pw_pixel_max]

    shl offset0d, %3 - 8
%if %1
    shl offset1d, %3 - 8
%endif
%endif

%if %1
    lea offset0d, [offset0d + offset1d + 1]
%else
    lea offset0d, [2 * offset0d + 1]
%endif
    movd   m1, offset0d
    SPLATD m1
    pslld  m1, m0
    psrad  m1, 1

    movd   m2, weight0d
    SPLATD m2
%if %1
    movd   m3, weight1d
    SPLATD m3
%endif

.loop:
%assign i 0
%rep (%2 + 3) / 4
    pmovsxwd m6, [src0q + 8 * i]
    pmulld   m6, m2
%if %1
    pmovsxwd m7, [src1q + 8 * i]
    pmulld   m7, m3
    paddd    m6, m7
%endif
    paddd    m6, m1
    psrad    m6, m0
    packssdw m6, m6
%if %3 > 8
    CLIPW m6, m4, m5
    movq [dstq + 8 * i], m6
%else
    packuswb m6, m6
    movd [dstq + 4 * i], m6
%endif
%assign i (i + 1)
%endrep

    add dstq, dststrideq
    add src0q, srcstrideq
%if %1
    add src1q, srcstrideq
%endif
    dec heightd
    jg .loop
    RET
%endmacro

%if ARCH_X86_64
INIT_XMM sse4
PUT_WEIGHTED_PRED 0,  4, 8
PUT_WEIGHTED_PRED 1,  4, 8
PUT_WEIGHTED_PRED 0,  8, 8
PUT_WEIGHTED_PRED 1,  8, 8
PUT_WEIGHTED_PRED 0, 12, 8
PUT_WEIGHTED_PRED 1, 12, 8
PUT_WEIGHTED_PRED 0, 16, 8
PUT_WEIGHTED_PRED 1, 16, 8
PUT_WEIGHTED_PRED 0, 24, 8
PUT_WEIGHTED_PRED 1, 24, 8
PUT_WEIGHTED_PRED 0, 32, 8
PUT_WEIGHTED_PRED 1, 32, 8
PUT_WEIGHTED_PRED 0, 48, 8
PUT_WEIGHTED_PRED 1, 48, 8
PUT_WEIGHTED_PRED 0, 64, 8
PUT_WEIGHTED_PRED 1, 64, 8

PUT_WEIGHTED_PRED 0,  4, 10
PUT_WEIGHTED_PRED 1,  4, 10
PUT_WEIGHTED_PRED 0,  8, 10
PUT_WEIGHTED_PRED 1,  8, 10
PUT_WEIGHTED_PRED 0, 12, 10
PUT_WEIGHTED_PRED 1, 12, 10
PUT_WEIGHTED_PRED 0, 16, 10
PUT_WEIGHTED_PRED 1, 16, 10
PUT_WEIGHTED_PRED 0, 24, 10
PUT_WEIGHTED_PRED 1, 24, 10
PUT_WEIGHTED_PRED 0, 32, 10
PUT_WEIGHTED_PRED 1, 32, 10
PUT_WEIGHTED_PRED 0, 48, 10
PUT_WEIGHTED_PRED 1, 48, 10
PUT_WEIGHTED_PRED 0, 64, 10
PUT_WEIGHTED_PRED 1, 64, 10
%endif ; ARCH_X86_64
```
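The weighted variants fold HEVC's explicit weighted prediction into the same structure: log2Wd = denom + 14 − depth is loaded into m0, the combined rounding/offset term ((2·offset + 1) << log2Wd) >> 1 into m1, and each sample is then evaluated as (src·weight + m1) >> log2Wd. A scalar model of the single-source 8-bit case (illustrative only):

```c
#include <stdint.h>
#include <stdio.h>

static uint8_t weighted_pred_8bit(int src, int denom, int weight, int offset)
{
    int log2wd = denom + 14 - 8;
    int round  = ((2 * offset + 1) << log2wd) >> 1;
    int v = (src * weight + round) >> log2wd;
    if (v < 0)   v = 0;
    if (v > 255) v = 255;
    return (uint8_t)v;
}

int main(void)
{
    /* weight 1/1 (denom 0, weight 1, offset 0) must reduce to the plain
     * unweighted rounding: (src + 32) >> 6 */
    printf("%d\n", weighted_pred_8bit(128 << 6, 0, 1, 0));   /* 128 */
    return 0;
}
```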
libavcodec/x86/hevcdsp_init.c

```diff
@@ -45,27 +45,260 @@ LFC_FUNCS(uint8_t, 10)
 LFL_FUNCS(uint8_t, 8)
 LFL_FUNCS(uint8_t, 10)

+#define GET_PIXELS(width, depth, cf) \
+void ff_hevc_get_pixels_ ## width ## _ ## depth ## _ ## cf(int16_t *dst, ptrdiff_t dststride, \
+                                                           uint8_t *src, ptrdiff_t srcstride, \
+                                                           int height, int mx, int my, int16_t *mcbuffer);
+
+GET_PIXELS(4,  8, sse2)
+GET_PIXELS(8,  8, sse2)
+GET_PIXELS(12, 8, sse2)
+GET_PIXELS(16, 8, sse2)
+GET_PIXELS(24, 8, sse2)
+GET_PIXELS(32, 8, sse2)
+GET_PIXELS(48, 8, sse2)
+GET_PIXELS(64, 8, sse2)
+
+GET_PIXELS(4,  10, sse2)
+GET_PIXELS(8,  10, sse2)
+GET_PIXELS(12, 10, sse2)
+GET_PIXELS(16, 10, sse2)
+GET_PIXELS(24, 10, sse2)
+GET_PIXELS(32, 10, sse2)
+GET_PIXELS(48, 10, sse2)
+GET_PIXELS(64, 10, sse2)
+
+/* those are independent of the bit depth, so declared separately */
+#define INTERP_HV_FUNC(width, cf) \
+void ff_hevc_qpel_hv_ ## width ## _ ## cf(int16_t *dst, ptrdiff_t dststride, \
+                                          int16_t *src, ptrdiff_t srcstride, \
+                                          int height, int mx, int my, int16_t *mcbuffer); \
+void ff_hevc_epel_hv_ ## width ## _ ## cf(int16_t *dst, ptrdiff_t dststride, \
+                                          int16_t *src, ptrdiff_t srcstride, \
+                                          int height, int mx, int my, int16_t *mcbuffer);
+
+INTERP_HV_FUNC(4,  avx)
+INTERP_HV_FUNC(8,  avx)
+INTERP_HV_FUNC(12, avx)
+INTERP_HV_FUNC(16, avx)
+INTERP_HV_FUNC(24, avx)
+INTERP_HV_FUNC(32, avx)
+INTERP_HV_FUNC(48, avx)
+INTERP_HV_FUNC(64, avx)
+
+#if ARCH_X86_64
+#define QPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv) \
+static void hevc_qpel_hv_ ## width ## _ ## depth ## _ ## cf_hv(int16_t *dst, ptrdiff_t dststride, \
+                                                               uint8_t *src, ptrdiff_t srcstride, \
+                                                               int height, int mx, int my, int16_t *mcbuffer) \
+{ \
+    const ptrdiff_t stride = FFALIGN(width + 7, 8); \
+    ff_hevc_qpel_h_ ## width ## _ ## depth ## _ ## cf_h(mcbuffer, 2 * stride, src - 3 * srcstride, srcstride, \
+                                                        height + 7, mx, my, mcbuffer); \
+    ff_hevc_qpel_hv_ ## width ## _ ## cf_hv(dst, dststride, mcbuffer + 3 * stride, 2 * stride, \
+                                            height, mx, my, mcbuffer); \
+}
+#else
+#define QPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
+#endif
+
+#define QPEL_FUNCS(width, depth, cf_h, cf_v, cf_hv) \
+void ff_hevc_qpel_h_ ## width ## _ ## depth ## _ ## cf_h(int16_t *dst, ptrdiff_t dststride, \
+                                                         uint8_t *src, ptrdiff_t srcstride, \
+                                                         int height, int mx, int my, int16_t *mcbuffer); \
+void ff_hevc_qpel_v_ ## width ## _ ## depth ## _ ## cf_v(int16_t *dst, ptrdiff_t dststride, \
+                                                         uint8_t *src, ptrdiff_t srcstride, \
+                                                         int height, int mx, int my, int16_t *mcbuffer); \
+QPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
+
+QPEL_FUNCS(4,  8, ssse3, ssse3, avx)
+QPEL_FUNCS(8,  8, ssse3, ssse3, avx)
+QPEL_FUNCS(12, 8, ssse3, ssse3, avx)
+QPEL_FUNCS(16, 8, ssse3, ssse3, avx)
+QPEL_FUNCS(24, 8, ssse3, ssse3, avx)
+QPEL_FUNCS(32, 8, ssse3, ssse3, avx)
+QPEL_FUNCS(48, 8, ssse3, ssse3, avx)
+QPEL_FUNCS(64, 8, ssse3, ssse3, avx)
+
+QPEL_FUNCS(4,  10, avx, avx, avx)
+QPEL_FUNCS(8,  10, avx, avx, avx)
+QPEL_FUNCS(12, 10, avx, avx, avx)
+QPEL_FUNCS(16, 10, avx, avx, avx)
+QPEL_FUNCS(24, 10, avx, avx, avx)
+QPEL_FUNCS(32, 10, avx, avx, avx)
+QPEL_FUNCS(48, 10, avx, avx, avx)
+QPEL_FUNCS(64, 10, avx, avx, avx)
+
+#if ARCH_X86_64
+#define EPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv) \
+static void hevc_epel_hv_ ## width ## _ ## depth ## _ ## cf_hv(int16_t *dst, ptrdiff_t dststride, \
+                                                               uint8_t *src, ptrdiff_t srcstride, \
+                                                               int height, int mx, int my, int16_t *mcbuffer) \
+{ \
+    const ptrdiff_t stride = FFALIGN(width + 3, 8); \
+    ff_hevc_epel_h_ ## width ## _ ## depth ## _ ## cf_h(mcbuffer, 2 * stride, src - srcstride, srcstride, \
+                                                        height + 3, mx, my, mcbuffer); \
+    ff_hevc_epel_hv_ ## width ## _ ## cf_hv(dst, dststride, mcbuffer + stride, 2 * stride, \
+                                            height, mx, my, mcbuffer); \
+}
+#else
+#define EPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
+#endif
+
+#define EPEL_FUNCS(width, depth, cf_h, cf_v, cf_hv) \
+void ff_hevc_epel_h_ ## width ## _ ## depth ## _ ## cf_h(int16_t *dst, ptrdiff_t dststride, \
+                                                         uint8_t *src, ptrdiff_t srcstride, \
+                                                         int height, int mx, int my, int16_t *mcbuffer); \
+void ff_hevc_epel_v_ ## width ## _ ## depth ## _ ## cf_v(int16_t *dst, ptrdiff_t dststride, \
+                                                         uint8_t *src, ptrdiff_t srcstride, \
+                                                         int height, int mx, int my, int16_t *mcbuffer); \
+EPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
+
+EPEL_FUNCS(4,  8, ssse3, ssse3, avx)
+EPEL_FUNCS(8,  8, ssse3, ssse3, avx)
+EPEL_FUNCS(12, 8, ssse3, ssse3, avx)
+EPEL_FUNCS(16, 8, ssse3, ssse3, avx)
+EPEL_FUNCS(24, 8, ssse3, ssse3, avx)
+EPEL_FUNCS(32, 8, ssse3, ssse3, avx)
+
+EPEL_FUNCS(4,  10, avx, avx, avx)
+EPEL_FUNCS(8,  10, avx, avx, avx)
+EPEL_FUNCS(12, 10, avx, avx, avx)
+EPEL_FUNCS(16, 10, avx, avx, avx)
+EPEL_FUNCS(24, 10, avx, avx, avx)
+EPEL_FUNCS(32, 10, avx, avx, avx)
+
+#define PUT_PRED(width, depth, cf_uw, cf_w) \
+void ff_hevc_put_unweighted_pred_ ## width ## _ ## depth ## _ ## cf_uw(uint8_t *dst, ptrdiff_t dststride, \
+                                                                       int16_t *src, ptrdiff_t srcstride, \
+                                                                       int height); \
+void ff_hevc_put_unweighted_pred_avg_ ## width ## _ ## depth ## _ ## cf_uw(uint8_t *dst, ptrdiff_t dststride, \
+                                                                           int16_t *src1, int16_t *src2, \
+                                                                           ptrdiff_t srcstride, int height); \
+void ff_hevc_put_weighted_pred_ ## width ## _ ## depth ## _ ## cf_w(uint8_t denom, int16_t weight, int16_t offset, \
+                                                                    uint8_t *dst, ptrdiff_t dststride, \
+                                                                    int16_t *src, ptrdiff_t srcstride, \
+                                                                    int height); \
+void ff_hevc_put_weighted_pred_avg_ ## width ## _ ## depth ## _ ## cf_w(uint8_t denom, int16_t weight0, int16_t weight1, \
+                                                                        int16_t offset0, int16_t offset1, \
+                                                                        uint8_t *dst, ptrdiff_t dststride, \
+                                                                        int16_t *src0, int16_t *src1, ptrdiff_t srcstride, \
+                                                                        int height);
+
+PUT_PRED(4,  8, sse2, sse4)
+PUT_PRED(8,  8, sse2, sse4)
+PUT_PRED(12, 8, sse2, sse4)
+PUT_PRED(16, 8, sse2, sse4)
+PUT_PRED(24, 8, sse2, sse4)
+PUT_PRED(32, 8, sse2, sse4)
+PUT_PRED(48, 8, sse2, sse4)
+PUT_PRED(64, 8, sse2, sse4)
+
+PUT_PRED(4,  10, sse2, sse4)
+PUT_PRED(8,  10, sse2, sse4)
+PUT_PRED(12, 10, sse2, sse4)
+PUT_PRED(16, 10, sse2, sse4)
+PUT_PRED(24, 10, sse2, sse4)
+PUT_PRED(32, 10, sse2, sse4)
+PUT_PRED(48, 10, sse2, sse4)
+PUT_PRED(64, 10, sse2, sse4)
```
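The *_FUNC_HV wrappers above are the only C-level glue in the new code: they run the horizontal SIMD pass over height + 7 (qpel) or height + 3 (epel) extra rows into mcbuffer, then hand the vertical 16-bit pass a pointer 3 (or 1) buffered rows in. A C restatement with the token pasting stripped away; this is a compile-only sketch, and the simplified pass_fn signature (dropping mx/my/mcbuffer) is illustrative:

```c
#include <stdint.h>
#include <stddef.h>

#define FFALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

typedef void (*pass_fn)(int16_t *dst, ptrdiff_t dststride,
                        const void *src, ptrdiff_t srcstride, int height);

static void qpel_hv(int16_t *dst, ptrdiff_t dststride,
                    const uint8_t *src, ptrdiff_t srcstride,
                    int width, int height, int16_t *mcbuffer,
                    pass_fn h_pass, pass_fn v_pass)
{
    /* stride is in int16_t elements; the byte stride passed on is 2x */
    ptrdiff_t stride = FFALIGN(width + 7, 8);

    /* pass 1: filter 7 extra rows (3 above, 4 below) into the buffer */
    h_pass(mcbuffer, 2 * stride, src - 3 * srcstride, srcstride, height + 7);

    /* pass 2: the vertical filter reads taps [-3, +4] around each output
     * row, so it starts 3 buffered rows in */
    v_pass(dst, dststride, mcbuffer + 3 * stride, 2 * stride, height);
}
```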
```diff
 void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
 {
     int cpu_flags = av_get_cpu_flags();

+#define SET_LUMA_FUNCS(tabname, funcname, depth, cf)      \
+    c->tabname[0] = funcname ## _4_  ## depth ## _ ## cf; \
+    c->tabname[1] = funcname ## _8_  ## depth ## _ ## cf; \
+    c->tabname[2] = funcname ## _12_ ## depth ## _ ## cf; \
+    c->tabname[3] = funcname ## _16_ ## depth ## _ ## cf; \
+    c->tabname[4] = funcname ## _24_ ## depth ## _ ## cf; \
+    c->tabname[5] = funcname ## _32_ ## depth ## _ ## cf; \
+    c->tabname[6] = funcname ## _48_ ## depth ## _ ## cf; \
+    c->tabname[7] = funcname ## _64_ ## depth ## _ ## cf;
+
+#define SET_CHROMA_FUNCS(tabname, funcname, depth, cf)    \
+    c->tabname[1] = funcname ## _4_  ## depth ## _ ## cf; \
+    c->tabname[3] = funcname ## _8_  ## depth ## _ ## cf; \
+    c->tabname[4] = funcname ## _12_ ## depth ## _ ## cf; \
+    c->tabname[5] = funcname ## _16_ ## depth ## _ ## cf; \
+    c->tabname[6] = funcname ## _24_ ## depth ## _ ## cf; \
+    c->tabname[7] = funcname ## _32_ ## depth ## _ ## cf;
+
+#define SET_QPEL_FUNCS(v, h, depth, cf, name) SET_LUMA_FUNCS  (put_hevc_qpel[v][h], name, depth, cf)
+#define SET_EPEL_FUNCS(v, h, depth, cf, name) SET_CHROMA_FUNCS(put_hevc_epel[v][h], name, depth, cf)
+
     if (bit_depth == 8) {
         if (EXTERNAL_SSE2(cpu_flags)) {
             c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
             c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_sse2;
+
+            SET_QPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);
+            SET_EPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);
+
+            SET_LUMA_FUNCS(put_unweighted_pred,              ff_hevc_put_unweighted_pred,     8, sse2);
+            SET_LUMA_FUNCS(put_unweighted_pred_avg,          ff_hevc_put_unweighted_pred_avg, 8, sse2);
+            SET_CHROMA_FUNCS(put_unweighted_pred_chroma,     ff_hevc_put_unweighted_pred,     8, sse2);
+            SET_CHROMA_FUNCS(put_unweighted_pred_avg_chroma, ff_hevc_put_unweighted_pred_avg, 8, sse2);
         }
-        if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
-            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
-            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
+        if (EXTERNAL_SSSE3(cpu_flags)) {
+            SET_QPEL_FUNCS(0, 1, 8, ssse3, ff_hevc_qpel_h);
+            SET_QPEL_FUNCS(1, 0, 8, ssse3, ff_hevc_qpel_v);
+            SET_EPEL_FUNCS(0, 1, 8, ssse3, ff_hevc_epel_h);
+            SET_EPEL_FUNCS(1, 0, 8, ssse3, ff_hevc_epel_v);
         }
     } else if (bit_depth == 10) {
         if (EXTERNAL_SSE2(cpu_flags)) {
             c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2;
             c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_sse2;
+
+            SET_QPEL_FUNCS(0, 0, 10, sse2, ff_hevc_get_pixels);
+            SET_EPEL_FUNCS(0, 0, 10, sse2, ff_hevc_get_pixels);
+
+            SET_LUMA_FUNCS(put_unweighted_pred,              ff_hevc_put_unweighted_pred,     10, sse2);
+            SET_LUMA_FUNCS(put_unweighted_pred_avg,          ff_hevc_put_unweighted_pred_avg, 10, sse2);
+            SET_CHROMA_FUNCS(put_unweighted_pred_chroma,     ff_hevc_put_unweighted_pred,     10, sse2);
+            SET_CHROMA_FUNCS(put_unweighted_pred_avg_chroma, ff_hevc_put_unweighted_pred_avg, 10, sse2);
         }
-        if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
-            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
-            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
-        }
     }
+
+#if ARCH_X86_64
+    if (bit_depth == 8) {
+        if (EXTERNAL_SSSE3(cpu_flags)) {
+            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
+            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
+        }
+        if (EXTERNAL_SSE4(cpu_flags)) {
+            SET_LUMA_FUNCS(weighted_pred,              ff_hevc_put_weighted_pred,     8, sse4);
+            SET_CHROMA_FUNCS(weighted_pred_chroma,     ff_hevc_put_weighted_pred,     8, sse4);
+            SET_LUMA_FUNCS(weighted_pred_avg,          ff_hevc_put_weighted_pred_avg, 8, sse4);
+            SET_CHROMA_FUNCS(weighted_pred_avg_chroma, ff_hevc_put_weighted_pred_avg, 8, sse4);
+        }
+        if (EXTERNAL_AVX(cpu_flags)) {
+            SET_QPEL_FUNCS(1, 1, 8, avx, hevc_qpel_hv);
+            SET_EPEL_FUNCS(1, 1, 8, avx, hevc_epel_hv);
+        }
+    } else if (bit_depth == 10) {
+        if (EXTERNAL_SSSE3(cpu_flags)) {
+            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
+            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
+        }
+        if (EXTERNAL_SSE4(cpu_flags)) {
+            SET_LUMA_FUNCS(weighted_pred,              ff_hevc_put_weighted_pred,     10, sse4);
+            SET_CHROMA_FUNCS(weighted_pred_chroma,     ff_hevc_put_weighted_pred,     10, sse4);
+            SET_LUMA_FUNCS(weighted_pred_avg,          ff_hevc_put_weighted_pred_avg, 10, sse4);
+            SET_CHROMA_FUNCS(weighted_pred_avg_chroma, ff_hevc_put_weighted_pred_avg, 10, sse4);
+        }
+        if (EXTERNAL_AVX(cpu_flags)) {
+            SET_QPEL_FUNCS(0, 1, 10, avx, ff_hevc_qpel_h);
+            SET_QPEL_FUNCS(1, 0, 10, avx, ff_hevc_qpel_v);
+            SET_QPEL_FUNCS(1, 1, 10, avx, hevc_qpel_hv);
+            SET_EPEL_FUNCS(0, 1, 10, avx, ff_hevc_epel_h);
+            SET_EPEL_FUNCS(1, 0, 10, avx, ff_hevc_epel_v);
+            SET_EPEL_FUNCS(1, 1, 10, avx, hevc_epel_hv);
+        }
+    }
+#endif /* ARCH_X86_64 */
 }
```