Commit 6d7c6358 authored by Paul B Mahol

avfilter/vf_overlay: add x86 SIMD

Specifically for the yuv444, yuv422 and yuv420 formats, when the main stream has no alpha
and the alpha is straight.

Signed-off-by: Paul B Mahol <onemda@gmail.com>
parent a150b2e3
......@@ -39,6 +39,7 @@
#include "drawutils.h"
#include "framesync.h"
#include "video.h"
#include "vf_overlay.h"
typedef struct ThreadData {
AVFrame *dst, *src;
......@@ -59,21 +60,6 @@ static const char *const var_names[] = {
NULL
};
enum var_name {
VAR_MAIN_W, VAR_MW,
VAR_MAIN_H, VAR_MH,
VAR_OVERLAY_W, VAR_OW,
VAR_OVERLAY_H, VAR_OH,
VAR_HSUB,
VAR_VSUB,
VAR_X,
VAR_Y,
VAR_N,
VAR_POS,
VAR_T,
VAR_VARS_NB
};
#define MAIN 0
#define OVERLAY 1
......@@ -92,45 +78,6 @@ enum EvalMode {
EVAL_MODE_NB
};
enum OverlayFormat {
OVERLAY_FORMAT_YUV420,
OVERLAY_FORMAT_YUV422,
OVERLAY_FORMAT_YUV444,
OVERLAY_FORMAT_RGB,
OVERLAY_FORMAT_GBRP,
OVERLAY_FORMAT_AUTO,
OVERLAY_FORMAT_NB
};
typedef struct OverlayContext {
const AVClass *class;
int x, y; ///< position of overlaid picture
uint8_t main_is_packed_rgb;
uint8_t main_rgba_map[4];
uint8_t main_has_alpha;
uint8_t overlay_is_packed_rgb;
uint8_t overlay_rgba_map[4];
uint8_t overlay_has_alpha;
int format; ///< OverlayFormat
int alpha_format;
int eval_mode; ///< EvalMode
FFFrameSync fs;
int main_pix_step[4]; ///< steps per pixel for each plane of the main output
int overlay_pix_step[4]; ///< steps per pixel for each plane of the overlay
int hsub, vsub; ///< chroma subsampling values
const AVPixFmtDescriptor *main_desc; ///< format descriptor for main input
double var_values[VAR_VARS_NB];
char *x_expr, *y_expr;
AVExpr *x_pexpr, *y_pexpr;
int (*blend_slice)(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs);
} OverlayContext;
static av_cold void uninit(AVFilterContext *ctx)
{
OverlayContext *s = ctx->priv;
......@@ -509,6 +456,7 @@ static av_always_inline void blend_plane(AVFilterContext *ctx,
int jobnr,
int nb_jobs)
{
OverlayContext *octx = ctx->priv;
int src_wp = AV_CEIL_RSHIFT(src_w, hsub);
int src_hp = AV_CEIL_RSHIFT(src_h, vsub);
int dst_wp = AV_CEIL_RSHIFT(dst_w, hsub);
......@@ -538,8 +486,18 @@ static av_always_inline void blend_plane(AVFilterContext *ctx,
s = sp + k;
a = ap + (k<<hsub);
da = dap + ((xp+k) << hsub);
kmax = FFMIN(-xp + dst_wp, src_wp);
if (((vsub && j+1 < src_hp) || !vsub) && octx->blend_row[i]) {
int c = octx->blend_row[i](d, da, s, a, kmax - k, src->linesize[3]);
for (kmax = FFMIN(-xp + dst_wp, src_wp); k < kmax; k++) {
s += c;
d += dst_step * c;
da += (1 << hsub) * c;
a += (1 << hsub) * c;
k += c;
}
for (; k < kmax; k++) {
int alpha_v, alpha_h, alpha;
// average alpha for color components, improve quality
......@@ -916,7 +874,7 @@ static int config_input_main(AVFilterLink *inlink)
}
if (!s->alpha_format)
return 0;
goto end;
switch (s->format) {
case OVERLAY_FORMAT_YUV420:
......@@ -960,6 +918,11 @@ static int config_input_main(AVFilterLink *inlink)
}
break;
}
end:
if (ARCH_X86)
ff_overlay_init_x86(s, s->format, s->alpha_format, s->main_has_alpha);
return 0;
}
......
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVFILTER_OVERLAY_H
#define AVFILTER_OVERLAY_H
#include "libavutil/eval.h"
#include "libavutil/pixdesc.h"
#include "framesync.h"
#include "avfilter.h"
/**
 * Slot indices for OverlayContext.var_values[]; VAR_VARS_NB is the number of
 * slots and sizes that array.
 *
 * NOTE(review): the enumerators come in long/short pairs (e.g. VAR_MAIN_W and
 * VAR_MW), presumably two names for the same expression variable - confirm
 * against the var_names[] table in vf_overlay.c, whose order must match.
 */
enum var_name {
    VAR_MAIN_W,
    VAR_MW,
    VAR_MAIN_H,
    VAR_MH,
    VAR_OVERLAY_W,
    VAR_OW,
    VAR_OVERLAY_H,
    VAR_OH,
    VAR_HSUB,
    VAR_VSUB,
    VAR_X,
    VAR_Y,
    VAR_N,
    VAR_POS,
    VAR_T,
    VAR_VARS_NB     /* count of the slots above; keep last */
};
/**
 * Values stored in OverlayContext.format.
 *
 * OVERLAY_FORMAT_NB is the number of formats and must stay last.
 */
enum OverlayFormat {
    OVERLAY_FORMAT_YUV420,
    OVERLAY_FORMAT_YUV422,
    OVERLAY_FORMAT_YUV444,
    OVERLAY_FORMAT_RGB,
    OVERLAY_FORMAT_GBRP,
    OVERLAY_FORMAT_AUTO,
    OVERLAY_FORMAT_NB
};
/**
 * Private context of the overlay filter.
 *
 * Declared in this header so that the architecture-specific init code
 * (ff_overlay_init_x86()) can fill in blend_row[].
 */
typedef struct OverlayContext {
const AVClass *class;
int x, y; ///< position of overlaid picture
uint8_t main_is_packed_rgb;
uint8_t main_rgba_map[4];
uint8_t main_has_alpha;
uint8_t overlay_is_packed_rgb;
uint8_t overlay_rgba_map[4];
uint8_t overlay_has_alpha;
int format; ///< OverlayFormat
int alpha_format; ///< NOTE(review): 0 appears to mean straight alpha (the SIMD path requires 0) - confirm
int eval_mode; ///< EvalMode
FFFrameSync fs;
int main_pix_step[4]; ///< steps per pixel for each plane of the main output
int overlay_pix_step[4]; ///< steps per pixel for each plane of the overlay
int hsub, vsub; ///< chroma subsampling values
const AVPixFmtDescriptor *main_desc; ///< format descriptor for main input
double var_values[VAR_VARS_NB]; ///< indexed by enum var_name
char *x_expr, *y_expr;
AVExpr *x_pexpr, *y_pexpr;
/**
 * Optional accelerated row blenders; a NULL entry means "no fast path".
 * The index is presumably the plane/component number (the x86 init only
 * fills entries 0..2) - confirm against blend_plane().
 *
 * @param d         destination pixels, blended in place
 * @param da        destination alpha pointer as passed by the caller
 * @param s         source (overlay) pixels
 * @param a         source alpha
 * @param w         number of pixels the caller still has to blend in this row
 * @param alinesize the caller passes the linesize of source plane 3 here
 * @return number of pixels actually blended; the caller advances its
 *         pointers by this amount and handles the rest with scalar code
 */
int (*blend_row[4])(uint8_t *d, uint8_t *da, uint8_t *s, uint8_t *a, int w,
ptrdiff_t alinesize);
/* NOTE(review): the jobnr/nb_jobs signature suggests a slice-threading job callback - confirm. */
int (*blend_slice)(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs);
} OverlayContext;
/**
 * Install x86 SIMD row blenders into s->blend_row[] when the running CPU and
 * the given configuration allow it; otherwise s is left untouched.
 *
 * @param s              overlay context whose blend_row[] may be filled in
 * @param format         one of enum OverlayFormat
 * @param alpha_format   alpha format of the filter; only 0 enables SIMD
 * @param main_has_alpha non-zero if the main input has alpha; only 0 enables SIMD
 */
void ff_overlay_init_x86(OverlayContext *s, int format, int alpha_format, int main_has_alpha);
#endif /* AVFILTER_OVERLAY_H */
......@@ -13,6 +13,7 @@ OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_tinterlace_init.o
OBJS-$(CONFIG_LIMITER_FILTER) += x86/vf_limiter_init.o
OBJS-$(CONFIG_MASKEDMERGE_FILTER) += x86/vf_maskedmerge_init.o
OBJS-$(CONFIG_NOISE_FILTER) += x86/vf_noise.o
OBJS-$(CONFIG_OVERLAY_FILTER) += x86/vf_overlay_init.o
OBJS-$(CONFIG_PP7_FILTER) += x86/vf_pp7_init.o
OBJS-$(CONFIG_PSNR_FILTER) += x86/vf_psnr_init.o
OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup_init.o
......@@ -41,6 +42,7 @@ X86ASM-OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet.o
X86ASM-OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_interlace.o
X86ASM-OBJS-$(CONFIG_LIMITER_FILTER) += x86/vf_limiter.o
X86ASM-OBJS-$(CONFIG_MASKEDMERGE_FILTER) += x86/vf_maskedmerge.o
X86ASM-OBJS-$(CONFIG_OVERLAY_FILTER) += x86/vf_overlay.o
X86ASM-OBJS-$(CONFIG_PP7_FILTER) += x86/vf_pp7.o
X86ASM-OBJS-$(CONFIG_PSNR_FILTER) += x86/vf_psnr.o
X86ASM-OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup.o
......
;*****************************************************************************
;* x86-optimized functions for overlay filter
;*
;* Copyright (C) 2018 Paul B Mahol
;* Copyright (C) 2018 Henrik Gramner
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
pb_1: times 16 db 1
pw_128: times 8 dw 128
pw_255: times 8 dw 255
pw_257: times 8 dw 257
SECTION .text
; int ff_overlay_row_44_sse4(uint8_t *d, uint8_t *da, uint8_t *s, uint8_t *a,
;                            int w, ptrdiff_t alinesize)
;
; Blend one row of source bytes s over destination bytes d, in place, with one
; alpha byte per pixel taken from a:
;     d = ((s * a + d * (255 - a) + 128) * 257) >> 16
; Only whole groups of mmsize/2 pixels are processed. The return value (eax)
; is the number of pixels written, so the caller can finish the remainder.
; da and alinesize are not read here (only 5 arguments are loaded; r and x
; are scratch registers).
INIT_XMM sse4
cglobal overlay_row_44, 5, 7, 6, 0, d, da, s, a, w, r, x
; x = current pixel offset, doubles as the return value
xor xq, xq
movsxdifnidn wq, wd
; r = pixels left over after the last whole vector step
mov rq, wq
and rq, mmsize/2 - 1
; fewer pixels than one vector step: return 0 without touching d
cmp wq, mmsize/2
jl .end
; w = offset at which the vector loop stops
sub wq, rq
mova m3, [pw_255]
mova m4, [pw_128]
mova m5, [pw_257]
.loop:
; zero-extend source, alpha and destination bytes to 16-bit words
pmovzxbw m0, [sq+xq]
pmovzxbw m2, [aq+xq]
pmovzxbw m1, [dq+xq]
; m0 = s * a
pmullw m0, m2
; m2 = a ^ 255, which equals 255 - a for byte values
pxor m2, m3
; m1 = d * (255 - a)
pmullw m1, m2
; m0 = s*a + d*(255-a) + 128 (rounding bias)
paddw m0, m4
paddw m0, m1
; high 16 bits of m0 * 257, i.e. an approximate division by 255
pmulhuw m0, m5
; narrow back to bytes and store mmsize/2 blended pixels
packuswb m0, m0
movq [dq+xq], m0
add xq, mmsize/2
cmp xq, wq
jl .loop
.end:
; return the number of pixels processed
mov eax, xd
RET
; int ff_overlay_row_22_sse4(uint8_t *d, uint8_t *da, uint8_t *s, uint8_t *a,
;                            int w, ptrdiff_t alinesize)
;
; Same blend as overlay_row_44, but the alpha row holds two bytes per output
; pixel (read at a + 2*x). The alpha used for pixel k is derived from the pair
; a[2k], a[2k+1] with two rounded averages:
;     alpha = avg(avg(a[2k+1], a[2k]), a[2k])
; Returns (eax) the number of pixels written. da and alinesize are not read.
INIT_XMM sse4
cglobal overlay_row_22, 5, 7, 6, 0, d, da, s, a, w, r, x
; x = current pixel offset, doubles as the return value
xor xq, xq
movsxdifnidn wq, wd
; NOTE(review): one pixel is held back from the vector span, presumably so the
; last pixel of the row is always left to the caller's scalar code - confirm.
sub wq, 1
; r = pixels left over after the last whole vector step
mov rq, wq
and rq, mmsize/2 - 1
; fewer pixels than one vector step: return 0 without touching d
cmp wq, mmsize/2
jl .end
; w = offset at which the vector loop stops
sub wq, rq
mova m3, [pw_255]
mova m4, [pw_128]
mova m5, [pw_257]
.loop:
pmovzxbw m0, [sq+xq]
; each 16-bit word of m1 = two adjacent alpha bytes (low: a[2k], high: a[2k+1])
movu m1, [aq+2*xq]
; m2 = m1 with the low byte of every word cleared, i.e. a[2k+1] << 8
pandn m2, m3, m1
; m1 = a[2k] << 8
psllw m1, 8
; m2 = rounded average of both alpha bytes (still scaled by 256)
pavgw m2, m1
; m2 = rounded average of that value and a[2k]
pavgw m2, m1
; drop the scaling: m2 = final alpha per pixel, 0..255
psrlw m2, 8
pmovzxbw m1, [dq+xq]
; m0 = s * alpha
pmullw m0, m2
; m2 = 255 - alpha
pxor m2, m3
; m1 = d * (255 - alpha)
pmullw m1, m2
; add rounding bias and both products
paddw m0, m4
paddw m0, m1
; high 16 bits of m0 * 257, i.e. an approximate division by 255
pmulhuw m0, m5
; narrow back to bytes and store mmsize/2 blended pixels
packuswb m0, m0
movq [dq+xq], m0
add xq, mmsize/2
cmp xq, wq
jl .loop
.end:
; return the number of pixels processed
mov eax, xd
RET
; int ff_overlay_row_20_sse4(uint8_t *d, uint8_t *da, uint8_t *s, uint8_t *a,
;                            int w, ptrdiff_t alinesize)
;
; Same blend as overlay_row_44, but the alpha for output pixel k is the
; truncated mean of a 2x2 block of alpha bytes:
;     alpha = (a[2k] + a[2k+1] + a[alinesize + 2k] + a[alinesize + 2k+1]) >> 2
; The incoming da argument is ignored: its register is reused as a pointer to
; the second alpha row. Returns (eax) the number of pixels written.
INIT_XMM sse4
cglobal overlay_row_20, 6, 7, 7, 0, d, da, s, a, w, r, x
; da = a + alinesize (r still holds the 6th argument at this point)
mov daq, aq
add daq, rmp
; x = current pixel offset, doubles as the return value
xor xq, xq
movsxdifnidn wq, wd
; NOTE(review): one pixel is held back from the vector span, presumably so the
; last pixel of the row is always left to the caller's scalar code - confirm.
sub wq, 1
; from here on r = pixels left over after the last whole vector step
mov rq, wq
and rq, mmsize/2 - 1
; fewer pixels than one vector step: return 0 without touching d
cmp wq, mmsize/2
jl .end
; w = offset at which the vector loop stops
sub wq, rq
mova m3, [pw_255]
mova m4, [pw_128]
mova m5, [pw_257]
mova m6, [pb_1]
.loop:
pmovzxbw m0, [sq+xq]
; two alpha bytes per output pixel from the current and the next alpha row
movu m2, [aq+2*xq]
movu m1, [daq+2*xq]
; multiply every byte by 1 and add adjacent pairs: horizontal sums as words
pmaddubsw m2, m6
pmaddubsw m1, m6
; add both rows and divide by 4 (truncating): 2x2 alpha mean
paddw m2, m1
psrlw m2, 2
pmovzxbw m1, [dq+xq]
; m0 = s * alpha
pmullw m0, m2
; m2 = 255 - alpha
pxor m2, m3
; m1 = d * (255 - alpha)
pmullw m1, m2
; add rounding bias and both products
paddw m0, m4
paddw m0, m1
; high 16 bits of m0 * 257, i.e. an approximate division by 255
pmulhuw m0, m5
; narrow back to bytes and store mmsize/2 blended pixels
packuswb m0, m0
movq [dq+xq], m0
add xq, mmsize/2
cmp xq, wq
jl .loop
.end:
; return the number of pixels processed
mov eax, xd
RET
/*
* Copyright (c) 2018 Paul B Mahol
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavfilter/vf_overlay.h"
/*
 * Row blenders implemented in x86/vf_overlay.asm. Each one blends s over d in
 * place and returns the number of pixels it processed; the caller is expected
 * to handle the remaining pixels of the row itself.
 */

/* One alpha byte per pixel; da and alinesize are not used by the asm. */
int ff_overlay_row_44_sse4(uint8_t *d, uint8_t *da, uint8_t *s, uint8_t *a,
int w, ptrdiff_t alinesize);
/* Alpha taken from a 2x2 block: two bytes per pixel from row a and from row a + alinesize. */
int ff_overlay_row_20_sse4(uint8_t *d, uint8_t *da, uint8_t *s, uint8_t *a,
int w, ptrdiff_t alinesize);
/* Alpha taken from two horizontally adjacent bytes per pixel. */
int ff_overlay_row_22_sse4(uint8_t *d, uint8_t *da, uint8_t *s, uint8_t *a,
int w, ptrdiff_t alinesize);
/**
 * Select the SSE4 row blenders for the overlay filter.
 *
 * Nothing is installed unless the CPU supports external SSE4 code,
 * alpha_format is 0 and the main input has no alpha. When those hold,
 * blend_row[0] always gets the one-alpha-per-pixel blender and
 * blend_row[1]/[2] get the blender matching the format:
 *   YUV444 / GBRP -> ff_overlay_row_44_sse4
 *   YUV420        -> ff_overlay_row_20_sse4
 *   YUV422        -> ff_overlay_row_22_sse4
 * Any other format leaves s unchanged.
 *
 * @param s              context whose blend_row[0..2] may be set
 * @param format         one of enum OverlayFormat
 * @param alpha_format   must be 0 for the SIMD path
 * @param main_has_alpha must be 0 for the SIMD path
 */
av_cold void ff_overlay_init_x86(OverlayContext *s, int format, int alpha_format, int main_has_alpha)
{
    const int flags = av_get_cpu_flags();
    int (*row12)(uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, ptrdiff_t);

    /* Preconditions shared by every SIMD variant. */
    if (!EXTERNAL_SSE4(flags) || alpha_format != 0 || main_has_alpha != 0)
        return;

    /* Pick the blender for planes 1 and 2 from the format. */
    switch (format) {
    case OVERLAY_FORMAT_YUV444:
    case OVERLAY_FORMAT_GBRP:
        row12 = ff_overlay_row_44_sse4;
        break;
    case OVERLAY_FORMAT_YUV420:
        row12 = ff_overlay_row_20_sse4;
        break;
    case OVERLAY_FORMAT_YUV422:
        row12 = ff_overlay_row_22_sse4;
        break;
    default:
        return;
    }

    s->blend_row[0] = ff_overlay_row_44_sse4;
    s->blend_row[1] = row12;
    s->blend_row[2] = row12;
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment