Commit 042c1159 authored by James Almer

x86/hevcdsp: add ff_hevc_sao_edge_filter_8_{ssse3,avx2}

Original x86 intrinsics code and initial yasm port by Pierre-Edouard Lepere.
Refactoring and optimizations by James Almer.

Benchmarks of BQTerrace_1920x1080_60_qp22.bin with an Intel Core i5-4200U

Width 32
158583 decicycles in sao_edge_filter_8 (run count missing), 0 skips
5205 decicycles in ff_hevc_sao_edge_filter_32_8_ssse3, 32767 runs, 1 skips
2942 decicycles in ff_hevc_sao_edge_filter_32_8_avx2, 32767 runs, 1 skips

Width 64
705639 decicycles in sao_edge_filter_8, 262144 runs, 0 skips
19224 decicycles in ff_hevc_sao_edge_filter_64_8_ssse3, 262111 runs, 33 skips
10433 decicycles in ff_hevc_sao_edge_filter_64_8_avx2, 262115 runs, 29 skips
Signed-off-by: James Almer <jamrial@gmail.com>
parent 1f1c7c8a
...@@ -246,7 +246,7 @@ static void restore_tqb_pixels(HEVCContext *s, ...@@ -246,7 +246,7 @@ static void restore_tqb_pixels(HEVCContext *s,
static void sao_filter_CTB(HEVCContext *s, int x, int y) static void sao_filter_CTB(HEVCContext *s, int x, int y)
{ {
static const uint8_t band_tab[8] = { 0, 1, 2, 2, 3, 3, 4, 4 }; static const uint8_t sao_tab[8] = { 0, 1, 2, 2, 3, 3, 4, 4 };
HEVCLocalContext *lc = s->HEVClc; HEVCLocalContext *lc = s->HEVClc;
int c_idx; int c_idx;
int edges[4]; // 0 left 1 top 2 right 3 bottom int edges[4]; // 0 left 1 top 2 right 3 bottom
...@@ -312,7 +312,7 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) ...@@ -312,7 +312,7 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
int ctb_size_v = (1 << (s->sps->log2_ctb_size)) >> s->sps->vshift[c_idx]; int ctb_size_v = (1 << (s->sps->log2_ctb_size)) >> s->sps->vshift[c_idx];
int width = FFMIN(ctb_size_h, (s->sps->width >> s->sps->hshift[c_idx]) - x0); int width = FFMIN(ctb_size_h, (s->sps->width >> s->sps->hshift[c_idx]) - x0);
int height = FFMIN(ctb_size_v, (s->sps->height >> s->sps->vshift[c_idx]) - y0); int height = FFMIN(ctb_size_v, (s->sps->height >> s->sps->vshift[c_idx]) - y0);
int tab = band_tab[(FFALIGN(width, 8) >> 3) - 1]; int tab = sao_tab[(FFALIGN(width, 8) >> 3) - 1];
uint8_t *src = &s->frame->data[c_idx][y0 * stride_src + (x0 << s->sps->pixel_shift)]; uint8_t *src = &s->frame->data[c_idx][y0 * stride_src + (x0 << s->sps->pixel_shift)];
int stride_dst; int stride_dst;
uint8_t *dst; uint8_t *dst;
...@@ -427,7 +427,7 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) ...@@ -427,7 +427,7 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx, copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx,
x_ctb, y_ctb); x_ctb, y_ctb);
s->hevcdsp.sao_edge_filter(src, dst, stride_src, sao->offset_val[c_idx], s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx],
sao->eo_class[c_idx], width, height); sao->eo_class[c_idx], width, height);
s->hevcdsp.sao_edge_restore[restore](src, dst, s->hevcdsp.sao_edge_restore[restore](src, dst,
stride_src, stride_dst, stride_src, stride_dst,
......
...@@ -217,7 +217,11 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) ...@@ -217,7 +217,11 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
hevcdsp->sao_band_filter[2] = \ hevcdsp->sao_band_filter[2] = \
hevcdsp->sao_band_filter[3] = \ hevcdsp->sao_band_filter[3] = \
hevcdsp->sao_band_filter[4] = FUNC(sao_band_filter_0, depth); \ hevcdsp->sao_band_filter[4] = FUNC(sao_band_filter_0, depth); \
hevcdsp->sao_edge_filter = FUNC(sao_edge_filter, depth); \ hevcdsp->sao_edge_filter[0] = \
hevcdsp->sao_edge_filter[1] = \
hevcdsp->sao_edge_filter[2] = \
hevcdsp->sao_edge_filter[3] = \
hevcdsp->sao_edge_filter[4] = FUNC(sao_edge_filter, depth); \
hevcdsp->sao_edge_restore[0] = FUNC(sao_edge_restore_0, depth); \ hevcdsp->sao_edge_restore[0] = FUNC(sao_edge_restore_0, depth); \
hevcdsp->sao_edge_restore[1] = FUNC(sao_edge_restore_1, depth); \ hevcdsp->sao_edge_restore[1] = FUNC(sao_edge_restore_1, depth); \
\ \
......
...@@ -62,8 +62,8 @@ typedef struct HEVCDSPContext { ...@@ -62,8 +62,8 @@ typedef struct HEVCDSPContext {
int16_t *sao_offset_val, int sao_left_class, int width, int height); int16_t *sao_offset_val, int sao_left_class, int width, int height);
/* implicit stride_src parameter has value of 2 * MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE */ /* implicit stride_src parameter has value of 2 * MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE */
void (*sao_edge_filter)(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, void (*sao_edge_filter[5])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
int sao_eo_class, int width, int height); int16_t *sao_offset_val, int sao_eo_class, int width, int height);
void (*sao_edge_restore[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, void (*sao_edge_restore[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
struct SAOParams *sao, int *borders, int _width, int _height, int c_idx, struct SAOParams *sao, int *borders, int _width, int _height, int c_idx,
......
...@@ -23,14 +23,25 @@ ...@@ -23,14 +23,25 @@
%include "libavutil/x86/x86util.asm" %include "libavutil/x86/x86util.asm"
%if ARCH_X86_64
SECTION_RODATA 32 SECTION_RODATA 32
pw_mask10: times 16 dw 0x03FF pw_mask10: times 16 dw 0x03FF
pw_mask12: times 16 dw 0x0FFF pw_mask12: times 16 dw 0x0FFF
pb_2: times 32 db 2
pb_edge_shuffle: times 2 db 1, 2, 0, 3, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
pb_eo: db -1, 0, 1, 0, 0, -1, 0, 1, -1, -1, 1, 1, 1, -1, -1, 1
cextern pb_1
SECTION_TEXT SECTION_TEXT
%define MAX_PB_SIZE 64
%define PADDING_SIZE 32 ; FF_INPUT_BUFFER_PADDING_SIZE
;******************************************************************************
;SAO Band Filter
;******************************************************************************
%if ARCH_X86_64
%macro HEVC_SAO_BAND_FILTER_INIT 1 %macro HEVC_SAO_BAND_FILTER_INIT 1
and leftq, 31 and leftq, 31
movd xm0, leftd movd xm0, leftd
...@@ -239,3 +250,161 @@ HEVC_SAO_BAND_FILTER_16 12, 48, 1 ...@@ -239,3 +250,161 @@ HEVC_SAO_BAND_FILTER_16 12, 48, 1
HEVC_SAO_BAND_FILTER_16 12, 64, 2 HEVC_SAO_BAND_FILTER_16 12, 64, 2
%endif %endif
%endif %endif
;******************************************************************************
;SAO Edge Filter
;******************************************************************************
; Byte distance between two consecutive rows of the edge filter's source
; buffer. The filter does not take a source stride argument; it advances srcq
; by this constant once per row. Parenthesized so the textual expansion stays
; a single term when used inside a larger expression.
%define EDGE_SRCSTRIDE (2 * MAX_PB_SIZE + PADDING_SIZE)
; HEVC_SAO_EDGE_FILTER_COMPUTE_8 <width>
;
; Applies the SAO edge offset to one vector of 8-bit pixels.
; Inputs:
;   m0  byte lookup table of offsets, indexed by the edge index computed below
;   m1  current pixels
;   m2  first neighbour pixels ("a")
;   m3  second neighbour pixels ("b")
;   m6  expected to hold the byte constant 2 in every lane
;   m7  expected to hold the byte constant 1 in every lane
; Output:
;   m3  filtered pixels, saturated to unsigned bytes
; m2, m4 and m5 are overwritten as scratch.
; %1 is the block width: widths above 8 widen both halves of the vector,
; width 8 only widens the low half.
%macro HEVC_SAO_EDGE_FILTER_COMPUTE_8 1
pminub m4, m1, m2 ; m4 = min(cur, a), unsigned
pminub m5, m1, m3 ; m5 = min(cur, b), unsigned
pcmpeqb m2, m4 ; m2 = -1 where a <= cur, else 0
pcmpeqb m3, m5 ; m3 = -1 where b <= cur, else 0
pcmpeqb m4, m1 ; m4 = -1 where cur <= a, else 0
pcmpeqb m5, m1 ; m5 = -1 where cur <= b, else 0
psubb m4, m2 ; m4 = sign(cur - a): -1, 0 or +1
psubb m5, m3 ; m5 = sign(cur - b): -1, 0 or +1
paddb m4, m6 ; add the bias held in m6
paddb m4, m5 ; m4 = bias + sign(cur - a) + sign(cur - b); 0..4 with a bias of 2
pshufb m2, m0, m4 ; m2 = per-pixel offset byte fetched from the table in m0
%if %1 > 8
punpckhbw m5, m7, m1 ; high half as byte pairs (m7, cur)
punpckhbw m4, m2, m7 ; high half as byte pairs (offset, m7)
punpcklbw m3, m7, m1 ; low half as byte pairs (m7, cur)
punpcklbw m2, m7 ; low half as byte pairs (offset, m7)
pmaddubsw m5, m4 ; words = m7*offset + cur*m7, i.e. cur + offset when m7 is 1
pmaddubsw m3, m2 ; (first operand bytes are unsigned, second operand bytes signed)
packuswb m3, m5 ; narrow both halves back to bytes with unsigned saturation
%else
punpcklbw m3, m7, m1 ; low half as byte pairs (m7, cur)
punpcklbw m2, m7 ; low half as byte pairs (offset, m7)
pmaddubsw m3, m2 ; words = cur + offset when m7 is 1
packuswb m3, m3 ; narrow to bytes with unsigned saturation; result in the low half
%endif
%endmacro
;void ff_hevc_sao_edge_filter_<width>_8_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
; int eo, int width, int height);
;
; HEVC_SAO_EDGE_FILTER_8 <width>, <vector chunks per row>[, <store suffix>]
;   %1  block width in pixels: selects the 8-pixel movq path and the 48-pixel
;       XMM tail, and is forwarded to HEVC_SAO_EDGE_FILTER_COMPUTE_8
;   %2  number of mmsize-byte chunks the %rep loop processes per row
;   %3  suffix pasted onto "mov" for the store to dst (a -> mova, u -> movu);
;       not needed when %2 is 0
;
; Setup (one variant per ABI) derives, from the pb_eo entry for the given eo,
; the byte displacement from the current pixel to each of its two neighbours:
;   a_stride = pb_eo[eo*4+1] * EDGE_SRCSTRIDE + pb_eo[eo*4+0]
;   b_stride = pb_eo[eo*4+3] * EDGE_SRCSTRIDE + pb_eo[eo*4+2]
; The runtime width argument is never read; the width is fixed by %1.
; The source row pitch is the constant EDGE_SRCSTRIDE, not an argument.
%macro HEVC_SAO_EDGE_FILTER_8 2-3
%if WIN64
cglobal hevc_sao_edge_filter_%1_8, 4, 8, 8, dst, src, dststride, offset, a_stride, b_stride, height, tmp
; eo shares the height register here, so height is only loaded once eo is dead.
%define eoq heightq
movsxd eoq, dword r4m
; NOTE(review): pb_eo is used directly as a displacement together with an
; index register here, whereas the other two paths go through lea first -
; confirm this addressing form is acceptable for the Win64 build.
movsx a_strideq, byte [pb_eo+eoq*4+1]
movsx b_strideq, byte [pb_eo+eoq*4+3]
imul a_strideq, EDGE_SRCSTRIDE
imul b_strideq, EDGE_SRCSTRIDE
movsx tmpq, byte [pb_eo+eoq*4]
add a_strideq, tmpq
movsx tmpq, byte [pb_eo+eoq*4+2]
add b_strideq, tmpq
mov heightd, r6m
%elif ARCH_X86_64
cglobal hevc_sao_edge_filter_%1_8, 5, 9, 8, dst, src, dststride, offset, eo, a_stride, b_stride, height, tmp
; The height register temporarily holds the address of pb_eo.
%define tmp2q heightq
movsxd eoq, eod
lea tmp2q, [pb_eo]
movsx a_strideq, byte [tmp2q+eoq*4+1]
movsx b_strideq, byte [tmp2q+eoq*4+3]
imul a_strideq, EDGE_SRCSTRIDE
imul b_strideq, EDGE_SRCSTRIDE
movsx tmpq, byte [tmp2q+eoq*4]
add a_strideq, tmpq
movsx tmpq, byte [tmp2q+eoq*4+2]
add b_strideq, tmpq
mov heightd, r6m
%else ; ARCH_X86_32
cglobal hevc_sao_edge_filter_%1_8, 1, 6, 8, dst, src, dststride, a_stride, b_stride, height
; Only six registers are declared, so several names share one register:
; eo lives in src, the pb_eo address in dststride, and both the scratch value
; and the offset pointer in height. src, dststride and height are therefore
; (re)loaded from their argument slots after the table lookups are done.
%define eoq srcq
%define tmpq heightq
%define tmp2q dststrideq
%define offsetq heightq
mov eoq, r4m
lea tmp2q, [pb_eo]
movsx a_strideq, byte [tmp2q+eoq*4+1]
movsx b_strideq, byte [tmp2q+eoq*4+3]
imul a_strideq, EDGE_SRCSTRIDE
imul b_strideq, EDGE_SRCSTRIDE
movsx tmpq, byte [tmp2q+eoq*4]
add a_strideq, tmpq
movsx tmpq, byte [tmp2q+eoq*4+2]
add b_strideq, tmpq
mov srcq, srcm
mov offsetq, r3m
mov dststrideq, dststridem
%endif ; ARCH
; Load 16 bytes of 16-bit offsets (into both 128-bit halves when mmsize > 16).
%if mmsize > 16
vbroadcasti128 m0, [offsetq]
%else
movu m0, [offsetq]
%endif
mova m1, [pb_edge_shuffle]
packsswb m0, m0 ; narrow the offsets to signed bytes with saturation
mova m7, [pb_1]
; Reorder into the lookup table used by the compute macro: table entries 0..4
; take source entries 1, 2, 0, 3, 4; the -1 selectors produce zero bytes.
pshufb m0, m1
mova m6, [pb_2]
%if ARCH_X86_32
; offsetq aliased heightq above, so height can only be loaded now.
mov heightd, r6m
%endif
align 16
.loop:
; One iteration filters one row of %1 pixels.
%if %1 == 8
movq m1, [srcq]
movq m2, [srcq + a_strideq]
movq m3, [srcq + b_strideq]
HEVC_SAO_EDGE_FILTER_COMPUTE_8 %1
movq [dstq], m3
%endif
%assign i 0
%rep %2
; Current pixels use an aligned load; the neighbour loads are unaligned.
mova m1, [srcq + i]
movu m2, [srcq + a_strideq + i]
movu m3, [srcq + b_strideq + i]
HEVC_SAO_EDGE_FILTER_COMPUTE_8 %1
mov%3 [dstq + i], m3
%assign i i+mmsize
%endrep
%if %1 == 48
; Width 48 finishes each row with one extra XMM-sized chunk at byte offset i,
; then switches back to YMM when building the avx2 variant.
INIT_XMM cpuname
mova m1, [srcq + i]
movu m2, [srcq + a_strideq + i]
movu m3, [srcq + b_strideq + i]
HEVC_SAO_EDGE_FILTER_COMPUTE_8 %1
mova [dstq + i], m3
%if cpuflag(avx2)
INIT_YMM cpuname
%endif
%endif
add dstq, dststrideq
add srcq, EDGE_SRCSTRIDE
dec heightd
jg .loop
RET
%endmacro
; Instantiate the edge filter for each supported width.
; Arguments: width, vector chunks per row, store suffix (a = mova, u = movu).
; SSSE3 variants use 16-byte vectors.
INIT_XMM ssse3
HEVC_SAO_EDGE_FILTER_8 8, 0 ; handled entirely by the movq path, no vector chunks
HEVC_SAO_EDGE_FILTER_8 16, 1, a
HEVC_SAO_EDGE_FILTER_8 32, 2, a
HEVC_SAO_EDGE_FILTER_8 48, 2, a ; two chunks plus the width-48 XMM tail
HEVC_SAO_EDGE_FILTER_8 64, 4, a
%if HAVE_AVX2_EXTERNAL
; AVX2 variants use 32-byte vectors and are only emitted for widths 32, 48, 64.
INIT_YMM avx2
HEVC_SAO_EDGE_FILTER_8 32, 1, a
HEVC_SAO_EDGE_FILTER_8 48, 1, u ; one unaligned-store YMM chunk plus the XMM tail
HEVC_SAO_EDGE_FILTER_8 64, 2, a
%endif
...@@ -500,6 +500,37 @@ SAO_BAND_FILTER_FUNCS(8, avx2); ...@@ -500,6 +500,37 @@ SAO_BAND_FILTER_FUNCS(8, avx2);
SAO_BAND_FILTER_FUNCS(10, avx2); SAO_BAND_FILTER_FUNCS(10, avx2);
SAO_BAND_FILTER_FUNCS(12, avx2); SAO_BAND_FILTER_FUNCS(12, avx2);
/*
 * Point the five sao_band_filter slots of the context `c` at the
 * ff_hevc_sao_band_filter_<width>_<bitd>_<opt> functions; slots 0..4 receive
 * the 8, 16, 32, 48 and 64 wide variants respectively.
 * A variable named `c` must be in scope where the macro is expanded.
 * Wrapped in do { } while (0) so the expansion acts as a single statement
 * and the caller supplies the terminating semicolon.
 */
#define SAO_BAND_INIT(bitd, opt) do { \
c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_##bitd##_##opt; \
c->sao_band_filter[1] = ff_hevc_sao_band_filter_16_##bitd##_##opt; \
c->sao_band_filter[2] = ff_hevc_sao_band_filter_32_##bitd##_##opt; \
c->sao_band_filter[3] = ff_hevc_sao_band_filter_48_##bitd##_##opt; \
c->sao_band_filter[4] = ff_hevc_sao_band_filter_64_##bitd##_##opt; \
} while (0)
/*
 * Declare the prototypes of the five width-specialised 8-bit SAO edge filter
 * entry points (8, 16, 32, 48 and 64 pixels wide) for one <bitd>/<opt> pair.
 * The functions themselves are not defined here; the names match the
 * ff_hevc_sao_edge_filter_<width>_8_<opt> pattern of the assembly versions.
 *
 * The last line of the definition must not end in a backslash: a trailing
 * continuation would splice the following invocation into the macro body.
 * The expansion also ends without a semicolon so that the invocation
 * supplies it and no stray ';' is left at file scope.
 */
#define SAO_EDGE_FILTER_FUNCS(bitd, opt)                                                                                      \
void ff_hevc_sao_edge_filter_8_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,    \
                                              int eo, int width, int height);                                                 \
void ff_hevc_sao_edge_filter_16_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,   \
                                               int eo, int width, int height);                                                \
void ff_hevc_sao_edge_filter_32_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,   \
                                               int eo, int width, int height);                                                \
void ff_hevc_sao_edge_filter_48_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,   \
                                               int eo, int width, int height);                                                \
void ff_hevc_sao_edge_filter_64_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,   \
                                               int eo, int width, int height)

SAO_EDGE_FILTER_FUNCS(8, ssse3);
SAO_EDGE_FILTER_FUNCS(8, avx2);
/*
 * Install the ff_hevc_sao_edge_filter_<width>_<bitd>_<opt> functions into the
 * sao_edge_filter table of the context pointed to by `c`, which must be in
 * scope at the point of expansion. Table slots 0, 1, 2, 3 and 4 receive the
 * 8, 16, 32, 48 and 64 pixel wide variants respectively.
 * Expands to one statement; the caller writes the trailing semicolon.
 */
#define SAO_EDGE_INIT(bitd, opt)                                               \
    do {                                                                       \
        c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_##bitd##_##opt;     \
        c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_##bitd##_##opt;     \
        c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_##bitd##_##opt;     \
        c->sao_edge_filter[1] = ff_hevc_sao_edge_filter_16_##bitd##_##opt;     \
        c->sao_edge_filter[0] = ff_hevc_sao_edge_filter_8_##bitd##_##opt;      \
    } while (0)
#define EPEL_LINKS(pointer, my, mx, fname, bitd, opt ) \ #define EPEL_LINKS(pointer, my, mx, fname, bitd, opt ) \
PEL_LINK(pointer, 1, my , mx , fname##4 , bitd, opt ); \ PEL_LINK(pointer, 1, my , mx , fname##4 , bitd, opt ); \
PEL_LINK(pointer, 2, my , mx , fname##6 , bitd, opt ); \ PEL_LINK(pointer, 2, my , mx , fname##6 , bitd, opt ); \
...@@ -520,14 +551,6 @@ SAO_BAND_FILTER_FUNCS(12, avx2); ...@@ -520,14 +551,6 @@ SAO_BAND_FILTER_FUNCS(12, avx2);
PEL_LINK(pointer, 8, my , mx , fname##48, bitd, opt ); \ PEL_LINK(pointer, 8, my , mx , fname##48, bitd, opt ); \
PEL_LINK(pointer, 9, my , mx , fname##64, bitd, opt ) PEL_LINK(pointer, 9, my , mx , fname##64, bitd, opt )
/*
 * Point the five sao_band_filter slots of the context `c` at the
 * ff_hevc_sao_band_filter_<width>_<bitd>_<opt> functions; slots 0..4 receive
 * the 8, 16, 32, 48 and 64 wide variants respectively.
 * A variable named `c` must be in scope where the macro is expanded.
 * NOTE(review): an identical definition of this macro appears earlier in
 * this listing; this copy looks like the pre-move position in the diff -
 * confirm only one copy remains in the real file.
 */
#define SAO_BAND_INIT(bitd, opt) do { \
c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_##bitd##_##opt; \
c->sao_band_filter[1] = ff_hevc_sao_band_filter_16_##bitd##_##opt; \
c->sao_band_filter[2] = ff_hevc_sao_band_filter_32_##bitd##_##opt; \
c->sao_band_filter[3] = ff_hevc_sao_band_filter_48_##bitd##_##opt; \
c->sao_band_filter[4] = ff_hevc_sao_band_filter_64_##bitd##_##opt; \
} while (0)
void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth) void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
{ {
int cpu_flags = av_get_cpu_flags(); int cpu_flags = av_get_cpu_flags();
...@@ -555,10 +578,13 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth) ...@@ -555,10 +578,13 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
c->transform_add[2] = ff_hevc_transform_add16_8_sse2; c->transform_add[2] = ff_hevc_transform_add16_8_sse2;
c->transform_add[3] = ff_hevc_transform_add32_8_sse2; c->transform_add[3] = ff_hevc_transform_add32_8_sse2;
} }
if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) { if (EXTERNAL_SSSE3(cpu_flags)) {
if(ARCH_X86_64) {
c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3; c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3; c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
} }
SAO_EDGE_INIT(8, ssse3);
}
if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) { if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 8, sse4); EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 8, sse4);
...@@ -590,6 +616,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth) ...@@ -590,6 +616,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
if (ARCH_X86_64) { if (ARCH_X86_64) {
SAO_BAND_INIT(8, avx2); SAO_BAND_INIT(8, avx2);
} }
c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_8_avx2;
c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_8_avx2;
c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_8_avx2;
c->transform_add[3] = ff_hevc_transform_add32_8_avx2; c->transform_add[3] = ff_hevc_transform_add32_8_avx2;
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment