lavfi/lut: Add slice threading support

Tested with the following commands on a 1080p H.264 clip: a) ffmpeg -i input -vf lutyuv="u=128:v=128" -f null /dev/null b) ffmpeg -i input -vf lutrgb="g=0:b=0" -f null /dev/null After enabling slice threading, the frame rate changed as follows: a) from 144 fps to 258 fps (lutyuv) b) from 94 fps to 153 fps (lutrgb), measured on an Intel(R) Core(TM) i5-8265U CPU @ 1.60GHz. Reviewed-by: Paul B Mahol <onemda@gmail.com> Signed-off-by: Jun Zhao <barryjzhao@tencent.com>

lavfi/lut: Add slice threading support
Tested with the following commands on a 1080p H.264 clip: a) ffmpeg -i input -vf lutyuv="u=128:v=128" -f null /dev/null b) ffmpeg -i input -vf lutrgb="g=0:b=0" -f null /dev/null After enabling slice threading, the frame rate changed as follows: a) from 144 fps to 258 fps (lutyuv) b) from 94 fps to 153 fps (lutrgb), measured on an Intel(R) Core(TM) i5-8265U CPU @ 1.60GHz. Reviewed-by: Paul B Mahol <onemda@gmail.com> Signed-off-by: Jun Zhao <barryjzhao@tencent.com>
bbad0bc5 · Jun Zhao · 360bee8c · bbad0bc5
Commit bbad0bc5 authored May 21, 2019 by Jun Zhao
Show whitespace changes
Inline Side-by-side

Showing with 197 additions and 113 deletions

vf_lut.c libavfilter/vf_lut.c +197 -113

No files found.
--- a/libavfilter/vf_lut.c
+++ b/libavfilter/vf_lut.c
@@ -337,42 +337,43 @@ static int config_props(AVFilterLink *inlink)
    return 0;
 }

-static int filter_frame(AVFilterLink *inlink, AVFrame *in)
-{
-    AVFilterContext *ctx = inlink->dst;
-    LutContext *s = ctx->priv;
-    AVFilterLink *outlink = ctx->outputs[0];
+struct thread_data {
+    AVFrame *in;
    AVFrame *out;
-    int i, j, plane, direct = 0;

-    if (av_frame_is_writable(in)) {
-        direct = 1;
-        out = in;
-    } else {
-        out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
-        if (!out) {
-            av_frame_free(&in);
-            return AVERROR(ENOMEM);
-        }
-        av_frame_copy_props(out, in);
-    }
+    int w;
+    int h;
+};
+
+#define LOAD_PACKED_COMMON\
+    LutContext *s = ctx->priv;\
+    const struct thread_data *td = arg;\
+\
+    int i, j;\
+    const int w = td->w;\
+    const int h = td->h;\
+    AVFrame *in = td->in;\
+    AVFrame *out = td->out;\
+    const uint16_t (*tab)[256*256] = (const uint16_t (*)[256*256])s->lut;\
+    const int step = s->step;\
+\
+    const int slice_start = (h *  jobnr   ) / nb_jobs;\
+    const int slice_end   = (h * (jobnr+1)) / nb_jobs;\
+
+/* packed, 16-bit */
+static int lut_packed_16bits(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
+{
+    LOAD_PACKED_COMMON

-    if (s->is_rgb && s->is_16bit && !s->is_planar) {
-        /* packed, 16-bit */
    uint16_t *inrow, *outrow, *inrow0, *outrow0;
-        const int w = inlink->w;
-        const int h = in->height;
-        const uint16_t (*tab)[256*256] = (const uint16_t (*)[256*256])s->lut;
    const int in_linesize  =  in->linesize[0] / 2;
    const int out_linesize = out->linesize[0] / 2;
-        const int step = s->step;
-
-        inrow0  = (uint16_t*) in ->data[0];
-        outrow0 = (uint16_t*) out->data[0];
+    inrow0  = (uint16_t *)in ->data[0];
+    outrow0 = (uint16_t *)out->data[0];

-        for (i = 0; i < h; i ++) {
-            inrow  = inrow0;
-            outrow = outrow0;
+    for (i = slice_start; i < slice_end; i++) {
+        inrow  = inrow0 + i * in_linesize;
+        outrow = outrow0 + i * out_linesize;
        for (j = 0; j < w; j++) {

            switch (step) {
@@ -391,25 +392,25 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
            outrow += step;
            inrow  += step;
        }
-            inrow0  += in_linesize;
-            outrow0 += out_linesize;
    }
-    } else if (s->is_rgb && !s->is_planar) {
-        /* packed */
+
+    return 0;
+}
+
+/* packed, 8-bit */
+static int lut_packed_8bits(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
+{
+    LOAD_PACKED_COMMON
+
    uint8_t *inrow, *outrow, *inrow0, *outrow0;
-        const int w = inlink->w;
-        const int h = in->height;
-        const uint16_t (*tab)[256*256] = (const uint16_t (*)[256*256])s->lut;
    const int in_linesize  =  in->linesize[0];
    const int out_linesize = out->linesize[0];
-        const int step = s->step;
-
    inrow0  = in ->data[0];
    outrow0 = out->data[0];

-        for (i = 0; i < h; i ++) {
-            inrow  = inrow0;
-            outrow = outrow0;
+    for (i = slice_start; i < slice_end; i++) {
+        inrow  = inrow0 + i * in_linesize;
+        outrow = outrow0 + i * out_linesize;
        for (j = 0; j < w; j++) {
            switch (step) {
            case 4:  outrow[3] = tab[3][inrow[3]]; // Fall-through
@@ -420,26 +421,45 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
            outrow += step;
            inrow  += step;
        }
-            inrow0  += in_linesize;
-            outrow0 += out_linesize;
    }
-    } else if (s->is_16bit) {
-        // planar >8 bit depth
+
+    return 0;
+}
+
+#define LOAD_PLANAR_COMMON\
+    LutContext *s = ctx->priv;\
+    const struct thread_data *td = arg;\
+    int i, j, plane;\
+    AVFrame *in = td->in;\
+    AVFrame *out = td->out;\
+
+#define PLANAR_COMMON\
+        int vsub = plane == 1 || plane == 2 ? s->vsub : 0;\
+        int hsub = plane == 1 || plane == 2 ? s->hsub : 0;\
+        int h = AV_CEIL_RSHIFT(td->h, vsub);\
+        int w = AV_CEIL_RSHIFT(td->w, hsub);\
+        const uint16_t *tab = s->lut[plane];\
+\
+        const int slice_start = (h *  jobnr   ) / nb_jobs;\
+        const int slice_end   = (h * (jobnr+1)) / nb_jobs;\
+
+/* planar >8 bit depth */
+static int lut_planar_16bits(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
+{
+    LOAD_PLANAR_COMMON
+
    uint16_t *inrow, *outrow;

    for (plane = 0; plane < 4 && in->data[plane] && in->linesize[plane]; plane++) {
-            int vsub = plane == 1 || plane == 2 ? s->vsub : 0;
-            int hsub = plane == 1 || plane == 2 ? s->hsub : 0;
-            int h = AV_CEIL_RSHIFT(inlink->h, vsub);
-            int w = AV_CEIL_RSHIFT(inlink->w, hsub);
-            const uint16_t *tab = s->lut[plane];
+        PLANAR_COMMON
+
        const int in_linesize  =  in->linesize[plane] / 2;
        const int out_linesize = out->linesize[plane] / 2;

-            inrow  = (uint16_t *)in ->data[plane];
-            outrow = (uint16_t *)out->data[plane];
+        inrow  = (uint16_t *)(in ->data[plane] + slice_start * in_linesize);
+        outrow = (uint16_t *)(out->data[plane] + slice_start * out_linesize);

-            for (i = 0; i < h; i++) {
+        for (i = slice_start; i < slice_end; i++) {
            for (j = 0; j < w; j++) {
 #if HAVE_BIGENDIAN
                outrow[j] = av_bswap16(tab[av_bswap16(inrow[j])]);
@@ -451,29 +471,93 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
            outrow += out_linesize;
        }
    }
-    } else {
-        /* planar 8bit depth */
+
+    return 0;
+}
+
+/* planar 8bit depth */
+static int lut_planar_8bits(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
+{
+    LOAD_PLANAR_COMMON
+
    uint8_t *inrow, *outrow;

    for (plane = 0; plane < 4 && in->data[plane] && in->linesize[plane]; plane++) {
-            int vsub = plane == 1 || plane == 2 ? s->vsub : 0;
-            int hsub = plane == 1 || plane == 2 ? s->hsub : 0;
-            int h = AV_CEIL_RSHIFT(inlink->h, vsub);
-            int w = AV_CEIL_RSHIFT(inlink->w, hsub);
-            const uint16_t *tab = s->lut[plane];
+        PLANAR_COMMON
+
        const int in_linesize  =  in->linesize[plane];
        const int out_linesize = out->linesize[plane];

-            inrow  = in ->data[plane];
-            outrow = out->data[plane];
+        inrow  = in ->data[plane] + slice_start * in_linesize;
+        outrow = out->data[plane] + slice_start * out_linesize;

-            for (i = 0; i < h; i++) {
+        for (i = slice_start; i < slice_end; i++) {
            for (j = 0; j < w; j++)
                outrow[j] = tab[inrow[j]];
            inrow  += in_linesize;
            outrow += out_linesize;
        }
    }
+
+    return 0;
+}
+
+#define PACKED_THREAD_DATA\
+ struct thread_data td = {\
+            .in  = in,\
+            .out = out,\
+            .w   = inlink->w,\
+            .h   = in->height,\
+        };\
+
+#define PLANAR_THREAD_DATA\
+ struct thread_data td = {\
+            .in  = in,\
+            .out = out,\
+            .w   = inlink->w,\
+            .h   = inlink->h,\
+        };\
+
+static int filter_frame(AVFilterLink *inlink, AVFrame *in)
+{
+    AVFilterContext *ctx = inlink->dst;
+    LutContext *s = ctx->priv;
+    AVFilterLink *outlink = ctx->outputs[0];
+    AVFrame *out;
+    int direct = 0;
+
+    if (av_frame_is_writable(in)) {
+        direct = 1;
+        out = in;
+    } else {
+        out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
+        if (!out) {
+            av_frame_free(&in);
+            return AVERROR(ENOMEM);
+        }
+        av_frame_copy_props(out, in);
+    }
+
+    if (s->is_rgb && s->is_16bit && !s->is_planar) {
+        /* packed, 16-bit */
+        PACKED_THREAD_DATA
+        ctx->internal->execute(ctx, lut_packed_16bits, &td, NULL,
+                               FFMIN(in->height, ff_filter_get_nb_threads(ctx)));
+    } else if (s->is_rgb && !s->is_planar) {
+        /* packed 8 bits */
+        PACKED_THREAD_DATA
+        ctx->internal->execute(ctx, lut_packed_8bits, &td, NULL,
+                               FFMIN(in->height, ff_filter_get_nb_threads(ctx)));
+    } else if (s->is_16bit) {
+        /* planar >8 bit depth */
+        PLANAR_THREAD_DATA
+        ctx->internal->execute(ctx, lut_planar_16bits, &td, NULL,
+                               FFMIN(in->height, ff_filter_get_nb_threads(ctx)));
+    } else {
+        /* planar 8bit depth */
+        PLANAR_THREAD_DATA
+        ctx->internal->execute(ctx, lut_planar_8bits, &td, NULL,
+                               FFMIN(in->height, ff_filter_get_nb_threads(ctx)));
    }

    if (!direct)
@@ -508,7 +592,7 @@ static const AVFilterPad outputs[] = {
        .query_formats = query_formats,                                 \
        .inputs        = inputs,                                        \
        .outputs       = outputs,                                       \
-        .flags         = AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC,        \
+        .flags         = AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC | AVFILTER_FLAG_SLICE_THREADS,        \
    }

 #if CONFIG_LUT_FILTER