Commit 52f2adc0, authored by Christophe Gisquet, committed by Michael Niedermayer

avcodec/hevc: Update the USE_SAO_SMALL_BUFFER case for the alignment requirements in FFmpeg

Use edge emu buffers
And enable the code unconditionally

Speed difference without USE_SAO_SMALL_BUFFER and with the new code:
Decicycles: 26772->26220 (BO32),  83803->80942 (BO64)
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
parent 00fe7785
...@@ -280,24 +280,6 @@ static int decode_lt_rps(HEVCContext *s, LongTermRPS *rps, GetBitContext *gb) ...@@ -280,24 +280,6 @@ static int decode_lt_rps(HEVCContext *s, LongTermRPS *rps, GetBitContext *gb)
return 0; return 0;
} }
/*
 * Allocate a reference-counted buffer for `frame` that is slightly larger
 * than the coded picture, then shift the plane pointers into the buffer.
 *
 * s:     decoder context; its avctx supplies coded_width/coded_height and
 *        is passed to ff_get_buffer().
 * frame: frame to allocate; its width, height and data[] are modified.
 * sps:   not referenced in this function body.
 *
 * Returns 0 on success, or the negative value returned by ff_get_buffer().
 */
static int get_buffer_sao(HEVCContext *s, AVFrame *frame, const HEVCSPS *sps)
{
int ret, i;
/* Request a larger picture than the coded size: width + 2 rounded up to a
 * multiple of FF_INPUT_BUFFER_PADDING_SIZE, and height + 3. */
frame->width = FFALIGN(s->avctx->coded_width + 2, FF_INPUT_BUFFER_PADDING_SIZE);
frame->height = s->avctx->coded_height + 3;
/* NOTE(review): on failure the enlarged width/height are left in `frame`. */
if ((ret = ff_get_buffer(s->avctx, frame, AV_GET_BUFFER_FLAG_REF)) < 0)
return ret;
/* Advance every allocated plane pointer by one full line plus
 * FF_INPUT_BUFFER_PADDING_SIZE bytes; the loop stops at the first NULL
 * data pointer. */
for (i = 0; frame->data[i]; i++) {
int offset = frame->linesize[i] + FF_INPUT_BUFFER_PADDING_SIZE;
frame->data[i] += offset;
}
/* Report the real coded dimensions again now that the buffer exists. */
frame->width = s->avctx->coded_width;
frame->height = s->avctx->coded_height;
return 0;
}
static int set_sps(HEVCContext *s, const HEVCSPS *sps) static int set_sps(HEVCContext *s, const HEVCSPS *sps)
{ {
#define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL) #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL)
...@@ -353,18 +335,9 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps) ...@@ -353,18 +335,9 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps)
ff_videodsp_init (&s->vdsp, sps->bit_depth); ff_videodsp_init (&s->vdsp, sps->bit_depth);
if (sps->sao_enabled && !s->avctx->hwaccel) { if (sps->sao_enabled && !s->avctx->hwaccel) {
#ifdef USE_SAO_SMALL_BUFFER
{
int ctb_size = 1 << sps->log2_ctb_size;
int c_count = (sps->chroma_format_idc != 0) ? 3 : 1; int c_count = (sps->chroma_format_idc != 0) ? 3 : 1;
int c_idx, i; int c_idx;
for (i = 0; i < s->threads_number ; i++) {
HEVCLocalContext *lc = s->HEVClcList[i];
lc->sao_pixel_buffer =
av_malloc(((ctb_size + 2) * (ctb_size + 2)) <<
sps->pixel_shift);
}
for(c_idx = 0; c_idx < c_count; c_idx++) { for(c_idx = 0; c_idx < c_count; c_idx++) {
int w = sps->width >> sps->hshift[c_idx]; int w = sps->width >> sps->hshift[c_idx];
int h = sps->height >> sps->vshift[c_idx]; int h = sps->height >> sps->vshift[c_idx];
...@@ -376,12 +349,6 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps) ...@@ -376,12 +349,6 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps)
sps->pixel_shift); sps->pixel_shift);
} }
} }
#else
av_frame_unref(s->tmp_frame);
ret = get_buffer_sao(s, s->tmp_frame, sps);
s->sao_frame = s->tmp_frame;
#endif
}
s->sps = sps; s->sps = sps;
s->vps = (HEVCVPS*) s->vps_list[s->sps->vps_id]->data; s->vps = (HEVCVPS*) s->vps_list[s->sps->vps_id]->data;
...@@ -3211,17 +3178,10 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) ...@@ -3211,17 +3178,10 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
av_freep(&s->cabac_state); av_freep(&s->cabac_state);
#ifdef USE_SAO_SMALL_BUFFER
for (i = 0; i < s->threads_number; i++) {
av_freep(&s->HEVClcList[i]->sao_pixel_buffer);
}
for (i = 0; i < 3; i++) { for (i = 0; i < 3; i++) {
av_freep(&s->sao_pixel_buffer_h[i]); av_freep(&s->sao_pixel_buffer_h[i]);
av_freep(&s->sao_pixel_buffer_v[i]); av_freep(&s->sao_pixel_buffer_v[i]);
} }
#else
av_frame_free(&s->tmp_frame);
#endif
av_frame_free(&s->output_frame); av_frame_free(&s->output_frame);
for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
...@@ -3281,12 +3241,6 @@ static av_cold int hevc_init_context(AVCodecContext *avctx) ...@@ -3281,12 +3241,6 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
if (!s->cabac_state) if (!s->cabac_state)
goto fail; goto fail;
#ifndef USE_SAO_SMALL_BUFFER
s->tmp_frame = av_frame_alloc();
if (!s->tmp_frame)
goto fail;
#endif
s->output_frame = av_frame_alloc(); s->output_frame = av_frame_alloc();
if (!s->output_frame) if (!s->output_frame)
goto fail; goto fail;
......
...@@ -36,8 +36,6 @@ ...@@ -36,8 +36,6 @@
#include "thread.h" #include "thread.h"
#include "videodsp.h" #include "videodsp.h"
//#define USE_SAO_SMALL_BUFFER /* reduce the memory used by SAO */
#define MAX_DPB_SIZE 16 // A.4.1 #define MAX_DPB_SIZE 16 // A.4.1
#define MAX_REFS 16 #define MAX_REFS 16
...@@ -747,9 +745,6 @@ typedef struct HEVCNAL { ...@@ -747,9 +745,6 @@ typedef struct HEVCNAL {
} HEVCNAL; } HEVCNAL;
typedef struct HEVCLocalContext { typedef struct HEVCLocalContext {
#ifdef USE_SAO_SMALL_BUFFER
uint8_t *sao_pixel_buffer;
#endif
uint8_t cabac_state[HEVC_CONTEXTS]; uint8_t cabac_state[HEVC_CONTEXTS];
uint8_t stat_coeff[4]; uint8_t stat_coeff[4];
...@@ -774,6 +769,7 @@ typedef struct HEVCLocalContext { ...@@ -774,6 +769,7 @@ typedef struct HEVCLocalContext {
int end_of_tiles_y; int end_of_tiles_y;
/* +7 is for subpixel interpolation, *2 for high bit depths */ /* +7 is for subpixel interpolation, *2 for high bit depths */
DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2]; DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2];
/* The extended size between the new edge emu buffer is abused by SAO */
DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer2)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2]; DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer2)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2];
DECLARE_ALIGNED(16, int16_t, tmp [MAX_PB_SIZE * MAX_PB_SIZE]); DECLARE_ALIGNED(16, int16_t, tmp [MAX_PB_SIZE * MAX_PB_SIZE]);
...@@ -813,13 +809,8 @@ typedef struct HEVCContext { ...@@ -813,13 +809,8 @@ typedef struct HEVCContext {
AVFrame *frame; AVFrame *frame;
AVFrame *output_frame; AVFrame *output_frame;
#ifdef USE_SAO_SMALL_BUFFER
uint8_t *sao_pixel_buffer_h[3]; uint8_t *sao_pixel_buffer_h[3];
uint8_t *sao_pixel_buffer_v[3]; uint8_t *sao_pixel_buffer_v[3];
#else
AVFrame *tmp_frame;
AVFrame *sao_frame;
#endif
const HEVCVPS *vps; const HEVCVPS *vps;
const HEVCSPS *sps; const HEVCSPS *sps;
......
...@@ -161,14 +161,12 @@ int i, j; ...@@ -161,14 +161,12 @@ int i, j;
} }
} }
#if defined(USE_SAO_SMALL_BUFFER)
static void copy_pixel(uint8_t *dst, const uint8_t *src, int pixel_shift) static void copy_pixel(uint8_t *dst, const uint8_t *src, int pixel_shift)
{ {
if (pixel_shift) if (pixel_shift)
*(uint16_t *)dst = *(uint16_t *)src; *(uint16_t *)dst = *(uint16_t *)src;
else else
*dst = *src; *dst = *src;
} }
static void copy_vert(uint8_t *dst, const uint8_t *src, static void copy_vert(uint8_t *dst, const uint8_t *src,
...@@ -210,7 +208,6 @@ static void copy_CTB_to_hv(HEVCContext *s, const uint8_t *src, ...@@ -210,7 +208,6 @@ static void copy_CTB_to_hv(HEVCContext *s, const uint8_t *src,
copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 1) * h + y) << sh), src + ((width - 1) << sh), sh, height, 1 << sh, stride_src); copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 1) * h + y) << sh), src + ((width - 1) << sh), sh, height, 1 << sh, stride_src);
} }
#endif
static void restore_tqb_pixels(HEVCContext *s, static void restore_tqb_pixels(HEVCContext *s,
uint8_t *src1, const uint8_t *dst1, uint8_t *src1, const uint8_t *dst1,
...@@ -317,21 +314,16 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) ...@@ -317,21 +314,16 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
int height = FFMIN(ctb_size_v, (s->sps->height >> s->sps->vshift[c_idx]) - y0); int height = FFMIN(ctb_size_v, (s->sps->height >> s->sps->vshift[c_idx]) - y0);
int tab = band_tab[(FFALIGN(width, 8) >> 3) - 1]; int tab = band_tab[(FFALIGN(width, 8) >> 3) - 1];
uint8_t *src = &s->frame->data[c_idx][y0 * stride_src + (x0 << s->sps->pixel_shift)]; uint8_t *src = &s->frame->data[c_idx][y0 * stride_src + (x0 << s->sps->pixel_shift)];
#if defined(USE_SAO_SMALL_BUFFER) int stride_dst;
int stride_dst = ((1 << (s->sps->log2_ctb_size)) + 2) << s->sps->pixel_shift; uint8_t *dst;
uint8_t *dst = lc->sao_pixel_buffer + (1 * stride_dst) + (1 << s->sps->pixel_shift);
#else
int stride_dst = s->sao_frame->linesize[c_idx];
uint8_t *dst = &s->sao_frame->data[c_idx][y0 * stride_dst + (x0 << s->sps->pixel_shift)];
#endif
switch (sao->type_idx[c_idx]) { switch (sao->type_idx[c_idx]) {
case SAO_BAND: case SAO_BAND:
dst = lc->edge_emu_buffer;
stride_dst = 2*MAX_PB_SIZE;
copy_CTB(dst, src, width << s->sps->pixel_shift, height, stride_dst, stride_src); copy_CTB(dst, src, width << s->sps->pixel_shift, height, stride_dst, stride_src);
#if defined(USE_SAO_SMALL_BUFFER)
copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx, copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx,
x_ctb, y_ctb); x_ctb, y_ctb);
#endif
s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst, s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst,
sao->offset_val[c_idx], sao->band_position[c_idx], sao->offset_val[c_idx], sao->band_position[c_idx],
width, height); width, height);
...@@ -341,7 +333,6 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) ...@@ -341,7 +333,6 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
break; break;
case SAO_EDGE: case SAO_EDGE:
{ {
#if defined(USE_SAO_SMALL_BUFFER)
int w = s->sps->width >> s->sps->hshift[c_idx]; int w = s->sps->width >> s->sps->hshift[c_idx];
int h = s->sps->height >> s->sps->vshift[c_idx]; int h = s->sps->height >> s->sps->vshift[c_idx];
int left_edge = edges[0]; int left_edge = edges[0];
...@@ -351,6 +342,9 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) ...@@ -351,6 +342,9 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
int sh = s->sps->pixel_shift; int sh = s->sps->pixel_shift;
int left_pixels, right_pixels; int left_pixels, right_pixels;
stride_dst = 2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE;
dst = lc->edge_emu_buffer + stride_dst + FF_INPUT_BUFFER_PADDING_SIZE;
if (!top_edge) { if (!top_edge) {
int left = 1 - left_edge; int left = 1 - left_edge;
int right = 1 - right_edge; int right = 1 - right_edge;
...@@ -433,40 +427,6 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) ...@@ -433,40 +427,6 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx, copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx,
x_ctb, y_ctb); x_ctb, y_ctb);
#else
uint8_t left_pixels;
/* get the CTB edge pixels from the SAO pixel buffer */
left_pixels = !edges[0] && (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] != SAO_APPLIED);
if (!edges[1]) {
uint8_t top_left = !edges[0] && (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] != SAO_APPLIED);
uint8_t top_right = !edges[2] && (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] != SAO_APPLIED);
if (CTB(s->sao, x_ctb , y_ctb-1).type_idx[c_idx] == 0)
memcpy( dst - stride_dst - (top_left << s->sps->pixel_shift),
src - stride_src - (top_left << s->sps->pixel_shift),
(top_left + width + top_right) << s->sps->pixel_shift);
else {
if (top_left)
memcpy( dst - stride_dst - (1 << s->sps->pixel_shift),
src - stride_src - (1 << s->sps->pixel_shift),
1 << s->sps->pixel_shift);
if(top_right)
memcpy( dst - stride_dst + (width << s->sps->pixel_shift),
src - stride_src + (width << s->sps->pixel_shift),
1 << s->sps->pixel_shift);
}
}
if (!edges[3]) { // bottom and bottom right
uint8_t bottom_left = !edges[0] && (CTB(s->sao, x_ctb-1, y_ctb+1).type_idx[c_idx] != SAO_APPLIED);
memcpy( dst + height * stride_dst - (bottom_left << s->sps->pixel_shift),
src + height * stride_src - (bottom_left << s->sps->pixel_shift),
(width + 1 + bottom_left) << s->sps->pixel_shift);
}
copy_CTB(dst - (left_pixels << s->sps->pixel_shift),
src - (left_pixels << s->sps->pixel_shift),
(width + 1 + left_pixels) << s->sps->pixel_shift, height, stride_dst, stride_src);
#endif
/* XXX: could handle the restoration here to simplify the
DSP functions */
s->hevcdsp.sao_edge_filter[restore](src, dst, s->hevcdsp.sao_edge_filter[restore](src, dst,
stride_src, stride_dst, stride_src, stride_dst,
sao, sao,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment