Commit bbad0bc5 authored by Jun Zhao

lavfi/lut: Add slice threading support

Tested with the following commands on a 1080p H.264 clip:

a). ffmpeg -i input -vf lutyuv="u=128:v=128" -f null /dev/null
b). ffmpeg -i input -vf lutrgb="g=0:b=0" -f null /dev/null

After enabling slice threading, the fps changed from:

a). 144fps to 258fps (lutyuv)
b). 94fps  to 153fps (lutrgb)

on an Intel(R) Core(TM) i5-8265U CPU @ 1.60GHz.
Reviewed-by: Paul B Mahol <onemda@gmail.com>
Signed-off-by: Jun Zhao <barryjzhao@tencent.com>
parent 360bee8c
......@@ -337,42 +337,43 @@ static int config_props(AVFilterLink *inlink)
return 0;
}
static int filter_frame(AVFilterLink *inlink, AVFrame *in)
{
AVFilterContext *ctx = inlink->dst;
LutContext *s = ctx->priv;
AVFilterLink *outlink = ctx->outputs[0];
struct thread_data {
AVFrame *in;
AVFrame *out;
int i, j, plane, direct = 0;
if (av_frame_is_writable(in)) {
direct = 1;
out = in;
} else {
out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
if (!out) {
av_frame_free(&in);
return AVERROR(ENOMEM);
}
av_frame_copy_props(out, in);
}
int w;
int h;
};
/*
 * Common local declarations for the packed-format slice workers.
 *
 * Expects `ctx`, `arg`, `jobnr` and `nb_jobs` to be in scope (the worker's
 * parameters). Declares:
 *   s           - filter private context, taken from ctx->priv
 *   td          - per-frame thread_data passed in through `arg`
 *   i, j        - loop counters
 *   w, h        - dimensions copied from td
 *   in, out     - source and destination frames copied from td
 *   tab         - s->lut viewed as an array of 256*256-entry uint16_t tables
 *   step        - copied from s->step
 *   slice_start - first row of this job:  h * jobnr / nb_jobs
 *   slice_end   - one past the last row:  h * (jobnr + 1) / nb_jobs
 */
#define LOAD_PACKED_COMMON\
LutContext *s = ctx->priv;\
const struct thread_data *td = arg;\
\
int i, j;\
const int w = td->w;\
const int h = td->h;\
AVFrame *in = td->in;\
AVFrame *out = td->out;\
const uint16_t (*tab)[256*256] = (const uint16_t (*)[256*256])s->lut;\
const int step = s->step;\
\
const int slice_start = (h * jobnr ) / nb_jobs;\
const int slice_end = (h * (jobnr+1)) / nb_jobs;\
/* packed, 16-bit */
static int lut_packed_16bits(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
{
LOAD_PACKED_COMMON
if (s->is_rgb && s->is_16bit && !s->is_planar) {
/* packed, 16-bit */
uint16_t *inrow, *outrow, *inrow0, *outrow0;
const int w = inlink->w;
const int h = in->height;
const uint16_t (*tab)[256*256] = (const uint16_t (*)[256*256])s->lut;
const int in_linesize = in->linesize[0] / 2;
const int out_linesize = out->linesize[0] / 2;
const int step = s->step;
inrow0 = (uint16_t*) in ->data[0];
outrow0 = (uint16_t*) out->data[0];
inrow0 = (uint16_t *)in ->data[0];
outrow0 = (uint16_t *)out->data[0];
for (i = 0; i < h; i ++) {
inrow = inrow0;
outrow = outrow0;
for (i = slice_start; i < slice_end; i++) {
inrow = inrow0 + i * in_linesize;
outrow = outrow0 + i * out_linesize;
for (j = 0; j < w; j++) {
switch (step) {
......@@ -391,25 +392,25 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
outrow += step;
inrow += step;
}
inrow0 += in_linesize;
outrow0 += out_linesize;
}
} else if (s->is_rgb && !s->is_planar) {
/* packed */
return 0;
}
/* packed, 8-bit */
static int lut_packed_8bits(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
{
LOAD_PACKED_COMMON
uint8_t *inrow, *outrow, *inrow0, *outrow0;
const int w = inlink->w;
const int h = in->height;
const uint16_t (*tab)[256*256] = (const uint16_t (*)[256*256])s->lut;
const int in_linesize = in->linesize[0];
const int out_linesize = out->linesize[0];
const int step = s->step;
inrow0 = in ->data[0];
outrow0 = out->data[0];
for (i = 0; i < h; i ++) {
inrow = inrow0;
outrow = outrow0;
for (i = slice_start; i < slice_end; i++) {
inrow = inrow0 + i * in_linesize;
outrow = outrow0 + i * out_linesize;
for (j = 0; j < w; j++) {
switch (step) {
case 4: outrow[3] = tab[3][inrow[3]]; // Fall-through
......@@ -420,26 +421,45 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
outrow += step;
inrow += step;
}
inrow0 += in_linesize;
outrow0 += out_linesize;
}
} else if (s->is_16bit) {
// planar >8 bit depth
return 0;
}
/*
 * Common local declarations for the planar-format slice workers.
 *
 * Expects `ctx` and `arg` to be in scope (the worker's parameters).
 * Declares the private context `s`, the thread_data pointer `td`, the loop
 * counters `i`, `j`, `plane`, and the `in` / `out` frames copied from td.
 * Per-plane values are not set up here; see PLANAR_COMMON.
 */
#define LOAD_PLANAR_COMMON\
LutContext *s = ctx->priv;\
const struct thread_data *td = arg;\
int i, j, plane;\
AVFrame *in = td->in;\
AVFrame *out = td->out;\
/*
 * Per-plane setup for the planar slice workers; intended to be expanded
 * inside the `for (plane = ...)` loop.
 *
 * Expects `plane`, `s`, `td`, `jobnr` and `nb_jobs` to be in scope.
 * Planes 1 and 2 use s->vsub / s->hsub as shift amounts; every other plane
 * uses 0. `h` and `w` are td->h / td->w right-shifted (rounding up) by those
 * amounts, `tab` is the lookup table for the current plane, and
 * [slice_start, slice_end) is this job's row range within the plane.
 *
 * NOTE(review): slice_start is a row index. lut_planar_16bits multiplies it
 * by linesize / 2 and adds the result to data[plane] before casting to
 * uint16_t *, which looks like half the intended offset if data[plane] is a
 * byte pointer - confirm against the AVFrame definition.
 */
#define PLANAR_COMMON\
int vsub = plane == 1 || plane == 2 ? s->vsub : 0;\
int hsub = plane == 1 || plane == 2 ? s->hsub : 0;\
int h = AV_CEIL_RSHIFT(td->h, vsub);\
int w = AV_CEIL_RSHIFT(td->w, hsub);\
const uint16_t *tab = s->lut[plane];\
\
const int slice_start = (h * jobnr ) / nb_jobs;\
const int slice_end = (h * (jobnr+1)) / nb_jobs;\
/* planar >8 bit depth */
static int lut_planar_16bits(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
{
LOAD_PLANAR_COMMON
uint16_t *inrow, *outrow;
for (plane = 0; plane < 4 && in->data[plane] && in->linesize[plane]; plane++) {
int vsub = plane == 1 || plane == 2 ? s->vsub : 0;
int hsub = plane == 1 || plane == 2 ? s->hsub : 0;
int h = AV_CEIL_RSHIFT(inlink->h, vsub);
int w = AV_CEIL_RSHIFT(inlink->w, hsub);
const uint16_t *tab = s->lut[plane];
PLANAR_COMMON
const int in_linesize = in->linesize[plane] / 2;
const int out_linesize = out->linesize[plane] / 2;
inrow = (uint16_t *)in ->data[plane];
outrow = (uint16_t *)out->data[plane];
inrow = (uint16_t *)(in ->data[plane] + slice_start * in_linesize);
outrow = (uint16_t *)(out->data[plane] + slice_start * out_linesize);
for (i = 0; i < h; i++) {
for (i = slice_start; i < slice_end; i++) {
for (j = 0; j < w; j++) {
#if HAVE_BIGENDIAN
outrow[j] = av_bswap16(tab[av_bswap16(inrow[j])]);
......@@ -451,29 +471,93 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
outrow += out_linesize;
}
}
} else {
/* planar 8bit depth */
return 0;
}
/* planar 8bit depth */
static int lut_planar_8bits(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
{
LOAD_PLANAR_COMMON
uint8_t *inrow, *outrow;
for (plane = 0; plane < 4 && in->data[plane] && in->linesize[plane]; plane++) {
int vsub = plane == 1 || plane == 2 ? s->vsub : 0;
int hsub = plane == 1 || plane == 2 ? s->hsub : 0;
int h = AV_CEIL_RSHIFT(inlink->h, vsub);
int w = AV_CEIL_RSHIFT(inlink->w, hsub);
const uint16_t *tab = s->lut[plane];
PLANAR_COMMON
const int in_linesize = in->linesize[plane];
const int out_linesize = out->linesize[plane];
inrow = in ->data[plane];
outrow = out->data[plane];
inrow = in ->data[plane] + slice_start * in_linesize;
outrow = out->data[plane] + slice_start * out_linesize;
for (i = 0; i < h; i++) {
for (i = slice_start; i < slice_end; i++) {
for (j = 0; j < w; j++)
outrow[j] = tab[inrow[j]];
inrow += in_linesize;
outrow += out_linesize;
}
}
return 0;
}
/*
 * Declares a local `struct thread_data td` for the packed code paths.
 * Expects `in`, `out` and `inlink` to be in scope; width comes from
 * inlink->w and height from in->height.
 */
#define PACKED_THREAD_DATA\
struct thread_data td = {\
.in = in,\
.out = out,\
.w = inlink->w,\
.h = in->height,\
};\
/*
 * Declares a local `struct thread_data td` for the planar code paths.
 * Expects `in`, `out` and `inlink` to be in scope; unlike
 * PACKED_THREAD_DATA, both width and height come from the link
 * (inlink->w, inlink->h).
 */
#define PLANAR_THREAD_DATA\
struct thread_data td = {\
.in = in,\
.out = out,\
.w = inlink->w,\
.h = inlink->h,\
};\
static int filter_frame(AVFilterLink *inlink, AVFrame *in)
{
AVFilterContext *ctx = inlink->dst;
LutContext *s = ctx->priv;
AVFilterLink *outlink = ctx->outputs[0];
AVFrame *out;
int direct = 0;
if (av_frame_is_writable(in)) {
direct = 1;
out = in;
} else {
out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
if (!out) {
av_frame_free(&in);
return AVERROR(ENOMEM);
}
av_frame_copy_props(out, in);
}
if (s->is_rgb && s->is_16bit && !s->is_planar) {
/* packed, 16-bit */
PACKED_THREAD_DATA
ctx->internal->execute(ctx, lut_packed_16bits, &td, NULL,
FFMIN(in->height, ff_filter_get_nb_threads(ctx)));
} else if (s->is_rgb && !s->is_planar) {
/* packed 8 bits */
PACKED_THREAD_DATA
ctx->internal->execute(ctx, lut_packed_8bits, &td, NULL,
FFMIN(in->height, ff_filter_get_nb_threads(ctx)));
} else if (s->is_16bit) {
/* planar >8 bit depth */
PLANAR_THREAD_DATA
ctx->internal->execute(ctx, lut_planar_16bits, &td, NULL,
FFMIN(in->height, ff_filter_get_nb_threads(ctx)));
} else {
/* planar 8bit depth */
PLANAR_THREAD_DATA
ctx->internal->execute(ctx, lut_planar_8bits, &td, NULL,
FFMIN(in->height, ff_filter_get_nb_threads(ctx)));
}
if (!direct)
......@@ -508,7 +592,7 @@ static const AVFilterPad outputs[] = {
.query_formats = query_formats, \
.inputs = inputs, \
.outputs = outputs, \
.flags = AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC, \
.flags = AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC | AVFILTER_FLAG_SLICE_THREADS, \
}
#if CONFIG_LUT_FILTER
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment