Commit ae4c9dde authored by Ronald S. Bultje, committed by Michael Niedermayer

vf_psnr: sse2 optimizations for sum-squared-error.

The internal line accumulator for 16bit can overflow, so I changed that
from int to uint64_t in the C code. The matching assembly looks a little
weird but output looks correct.

(avx2 should be trivial to add later.)
Reviewed-by: Paul B Mahol <onemda@gmail.com>
Reviewed-by: James Almer <jamrial@gmail.com>
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
parent fcbea93c
/*
* Copyright (c) 2015 Ronald S. Bultje <rsbultje@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef LIBAVFILTER_PSNR_H
#define LIBAVFILTER_PSNR_H
#include <stddef.h>
#include <stdint.h>
/**
 * Table of per-line kernels used by the PSNR filter.
 *
 * The filter installs a C implementation first and may then let
 * ff_psnr_init_x86() replace it with an optimized one.
 */
typedef struct PSNRDSPContext {
/**
 * Sum of squared differences between one line of buf and one line of ref.
 *
 * @param buf first line, passed as a byte pointer regardless of sample size
 * @param ref second line, same layout as buf
 * @param w   number of samples (not bytes) to compare; the 16-bit C
 *            implementation reinterprets both pointers as uint16_t and
 *            still iterates w elements
 * @return the accumulated sum for this line
 */
uint64_t (*sse_line)(const uint8_t *buf, const uint8_t *ref, int w);
} PSNRDSPContext;
/**
 * Install x86-optimized kernels into dsp when the running CPU supports them.
 *
 * @param dsp context whose sse_line pointer may be overwritten; it is left
 *            untouched when no suitable implementation applies
 * @param bpp bit depth of the samples (the filter passes depth_minus1 + 1)
 */
void ff_psnr_init_x86(PSNRDSPContext *dsp, int bpp);
#endif /* LIBAVFILTER_PSNR_H */
......@@ -33,6 +33,7 @@
#include "drawutils.h"
#include "formats.h"
#include "internal.h"
#include "psnr.h"
#include "video.h"
typedef struct PSNRContext {
......@@ -50,11 +51,7 @@ typedef struct PSNRContext {
int planewidth[4];
int planeheight[4];
double planeweight[4];
void (*compute_mse)(struct PSNRContext *s,
const uint8_t *m[4], const int ml[4],
const uint8_t *r[4], const int rl[4],
int w, int h, double mse[4]);
PSNRDSPContext dsp;
} PSNRContext;
#define OFFSET(x) offsetof(PSNRContext, x)
......@@ -78,55 +75,48 @@ static inline double get_psnr(double mse, uint64_t nb_frames, int max)
return 10.0 * log(pow2(max) / (mse / nb_frames)) / log(10.0);
}
static inline
void compute_images_mse(PSNRContext *s,
const uint8_t *main_data[4], const int main_linesizes[4],
const uint8_t *ref_data[4], const int ref_linesizes[4],
int w, int h, double mse[4])
static uint64_t sse_line_8bit(const uint8_t *main_line, const uint8_t *ref_line, int outw)
{
int i, c, j;
int j;
unsigned m2 = 0;
for (c = 0; c < s->nb_components; c++) {
const int outw = s->planewidth[c];
const int outh = s->planeheight[c];
const uint8_t *main_line = main_data[c];
const uint8_t *ref_line = ref_data[c];
const int ref_linesize = ref_linesizes[c];
const int main_linesize = main_linesizes[c];
uint64_t m = 0;
for (j = 0; j < outw; j++)
m2 += pow2(main_line[j] - ref_line[j]);
for (i = 0; i < outh; i++) {
int m2 = 0;
for (j = 0; j < outw; j++)
m2 += pow2(main_line[j] - ref_line[j]);
m += m2;
ref_line += ref_linesize;
main_line += main_linesize;
}
mse[c] = m / (double)(outw * outh);
}
return m2;
}
/**
 * C reference kernel for samples wider than 8 bits: accumulates
 * pow2(main - ref) over one line.
 *
 * Both lines arrive as byte pointers (to match the PSNRDSPContext.sse_line
 * signature) and are reinterpreted as arrays of uint16_t.
 *
 * @param _main_line line of the main picture
 * @param _ref_line  line of the reference picture
 * @param outw       number of 16-bit samples to compare
 * @return the sum for this line, kept in 64 bits so a single line
 *         cannot overflow the accumulator
 */
static uint64_t sse_line_16bit(const uint8_t *_main_line, const uint8_t *_ref_line, int outw)
{
    const uint16_t *cur = (const uint16_t *) _main_line;
    const uint16_t *ref = (const uint16_t *) _ref_line;
    uint64_t sum = 0;
    int x = 0;

    while (x < outw) {
        sum += pow2(cur[x] - ref[x]);
        x++;
    }

    return sum;
}
static inline
void compute_images_mse_16bit(PSNRContext *s,
void compute_images_mse(PSNRContext *s,
const uint8_t *main_data[4], const int main_linesizes[4],
const uint8_t *ref_data[4], const int ref_linesizes[4],
int w, int h, double mse[4])
{
int i, c, j;
int i, c;
for (c = 0; c < s->nb_components; c++) {
const int outw = s->planewidth[c];
const int outh = s->planeheight[c];
const uint16_t *main_line = (uint16_t *)main_data[c];
const uint16_t *ref_line = (uint16_t *)ref_data[c];
const int ref_linesize = ref_linesizes[c] / 2;
const int main_linesize = main_linesizes[c] / 2;
const uint8_t *main_line = main_data[c];
const uint8_t *ref_line = ref_data[c];
const int ref_linesize = ref_linesizes[c];
const int main_linesize = main_linesizes[c];
uint64_t m = 0;
for (i = 0; i < outh; i++) {
for (j = 0; j < outw; j++)
m += pow2(main_line[j] - ref_line[j]);
m += s->dsp.sse_line(main_line, ref_line, outw);
ref_line += ref_linesize;
main_line += main_linesize;
}
......@@ -155,9 +145,9 @@ static AVFrame *do_psnr(AVFilterContext *ctx, AVFrame *main,
int j, c;
AVDictionary **metadata = avpriv_frame_get_metadatap(main);
s->compute_mse(s, (const uint8_t **)main->data, main->linesize,
(const uint8_t **)ref->data, ref->linesize,
main->width, main->height, comp_mse);
compute_images_mse(s, (const uint8_t **)main->data, main->linesize,
(const uint8_t **)ref->data, ref->linesize,
main->width, main->height, comp_mse);
for (j = 0; j < s->nb_components; j++)
mse += comp_mse[j] * s->planeweight[j];
......@@ -283,7 +273,9 @@ static int config_input_ref(AVFilterLink *inlink)
s->average_max += s->max[j] * s->planeweight[j];
}
s->compute_mse = desc->comp[0].depth_minus1 > 7 ? compute_images_mse_16bit : compute_images_mse;
s->dsp.sse_line = desc->comp[0].depth_minus1 > 7 ? sse_line_16bit : sse_line_8bit;
if (ARCH_X86)
ff_psnr_init_x86(&s->dsp, desc->comp[0].depth_minus1 + 1);
return 0;
}
......
......@@ -6,6 +6,7 @@ OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet_init.o
OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_interlace_init.o
OBJS-$(CONFIG_NOISE_FILTER) += x86/vf_noise.o
OBJS-$(CONFIG_PP7_FILTER) += x86/vf_pp7_init.o
OBJS-$(CONFIG_PSNR_FILTER) += x86/vf_psnr_init.o
OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup_init.o
OBJS-$(CONFIG_SPP_FILTER) += x86/vf_spp.o
OBJS-$(CONFIG_SSIM_FILTER) += x86/vf_ssim_init.o
......@@ -19,6 +20,7 @@ YASM-OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d.o
YASM-OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet.o
YASM-OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_interlace.o
YASM-OBJS-$(CONFIG_PP7_FILTER) += x86/vf_pp7.o
YASM-OBJS-$(CONFIG_PSNR_FILTER) += x86/vf_psnr.o
YASM-OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup.o
YASM-OBJS-$(CONFIG_SSIM_FILTER) += x86/vf_ssim.o
YASM-OBJS-$(CONFIG_TINTERLACE_FILTER) += x86/vf_interlace.o
......
;*****************************************************************************
;* x86-optimized functions for psnr filter
;*
;* Copyright (C) 2015 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION .text
; SSE_LINE_FN bits, size
; Emits the function sse_line_{bits}bit(buf, ref, w), which returns the sum of
; squared differences between w samples of buf and ref as a 64-bit value.
;   first macro argument:  sample width in bits (8 or 16)
;   second macro argument: matching memory operand size keyword (byte or word)
; Layout: a vector loop that consumes mmsize*2 samples per iteration, a
; horizontal reduction into the integer return register(s), then a scalar
; loop for the leftover samples.
%macro SSE_LINE_FN 2 ; 8 or 16, byte or word
INIT_XMM sse2
%if ARCH_X86_32
; x86-32: cglobal is told to auto-load 0 arguments; they are fetched from the
; stack by hand just below.
; NOTE(review): the register name order looks chosen so that res/reshigh land
; on eax/edx, which the tail code writes directly - confirm against the
; register numbering in x86inc.asm.
%if %1 == 8
cglobal sse_line_%1 %+ bit, 0, 6, 8, res, buf, w, px1, px2, ref
%else
cglobal sse_line_%1 %+ bit, 0, 7, 8, res, buf, reshigh, w, px1, px2, ref
%endif
mov bufq, r0mp
mov refq, r1mp
mov wd, r2m
%else
cglobal sse_line_%1 %+ bit, 3, 5, 8, buf, ref, w, px1, px2
%endif
; m6 stays zero (used as the unpack partner), m7 is the running vector sum
pxor m6, m6
pxor m7, m7
; skip the vector loop entirely when fewer than mmsize*2 samples are available
sub wd, mmsize*2
jl .end
.loop:
; unaligned loads of two vectors from each line
movu m0, [bufq+mmsize*0]
movu m1, [bufq+mmsize*1]
movu m2, [refq+mmsize*0]
movu m3, [refq+mmsize*1]
%if %1 == 8
add bufq, mmsize*2
add refq, mmsize*2
; |buf - ref| per byte: saturating subtraction in both directions, then OR
psubusb m4, m0, m2
psubusb m5, m1, m3
psubusb m2, m0
psubusb m3, m1
por m2, m4
por m3, m5
; widen the byte differences to words by interleaving with zero
punpcklbw m0, m2, m6
punpcklbw m1, m3, m6
punpckhbw m2, m6
punpckhbw m3, m6
%else
; word differences for the first two vectors, then two more vectors are
; loaded so that this path also consumes mmsize*2 samples per iteration
psubw m0, m2
psubw m1, m3
movu m2, [bufq+mmsize*2]
movu m3, [bufq+mmsize*3]
movu m4, [refq+mmsize*2]
movu m5, [refq+mmsize*3]
psubw m2, m4
psubw m3, m5
add bufq, mmsize*4
add refq, mmsize*4
%endif
; pmaddwd of a register with itself: square every word and add adjacent
; pairs, giving dword partial sums. pmaddwd multiplies signed words.
pmaddwd m0, m0
pmaddwd m1, m1
pmaddwd m2, m2
pmaddwd m3, m3
paddd m0, m1
paddd m2, m3
%if %1 == 8
; 8-bit: accumulate directly in dword lanes
paddd m7, m0
paddd m7, m2
%else
; 16-bit: fold to one vector of dwords, zero-extend each dword to a qword
; and accumulate in 64-bit lanes.
; NOTE(review): after this paddd each dword lane holds eight squared
; differences; with 15-bit samples (max difference 32767) that could exceed
; 32 bits before the widening - confirm whether any 15-bit input can reach
; this routine.
paddd m0, m2
punpckldq m2, m0, m6
punpckhdq m0, m6
paddq m7, m0
paddq m7, m2
%endif
sub wd, mmsize*2
jge .loop
.end:
; undo the last subtraction: w now holds the number of leftover samples
add wd, mmsize*2
; horizontal reduction of m7 into the integer return register(s)
movhlps m0, m7
%if %1 == 8
; sum the four dword lanes into eax
paddd m7, m0
pshufd m0, m7, 1
paddd m7, m0
movd eax, m7
%else
; sum the two qword lanes; x86-32 returns the 64-bit value in edx:eax
paddq m7, m0
%if ARCH_X86_32
movd eax, m7
psrldq m7, 4
movd edx, m7
%else
movq rax, m7
%endif
%endif
; deal with cases where w % 32 != 0
test wd, wd
jz .end_scalar
.loop_scalar:
; handle the leftover samples one at a time, from index w-1 down to 0
; relative to the (already advanced) buf/ref pointers
movzx px1d, %2 [bufq+wq*(%1/8)-(%1/8)]
movzx px2d, %2 [refq+wq*(%1/8)-(%1/8)]
sub px1d, px2d
imul px1d, px1d
%if %1 == 8
add eax, px1d
%elif ARCH_X86_64
add rax, px1q
%else
; 64-bit accumulate in edx:eax
add eax, px1d
adc edx, 0
%endif
dec wd
jg .loop_scalar
.end_scalar:
; for %1=8, no need to zero edx on x86-32, since edx=wd, which is zero
RET
%endmacro
; Instantiate the SSE2 kernels: one for 8-bit (byte) samples and one for
; 16-bit (word) samples.
INIT_XMM sse2
SSE_LINE_FN 8, byte
SSE_LINE_FN 16, word
/*
* Copyright (c) 2015 Ronald S. Bultje <rsbultje@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/x86/cpu.h"
#include "libavfilter/psnr.h"
/* SSE2 line kernels with the PSNRDSPContext.sse_line signature.
 * NOTE(review): presumably the sse_line_8bit / sse_line_16bit functions
 * emitted by SSE_LINE_FN in the x86 assembly, with the usual ff_ prefix and
 * _sse2 suffix added by cglobal - confirm. */
uint64_t ff_sse_line_8bit_sse2(const uint8_t *buf, const uint8_t *ref, int w);
uint64_t ff_sse_line_16bit_sse2(const uint8_t *buf, const uint8_t *ref, int w);
/**
 * Replace dsp->sse_line with an SSE2 kernel when the CPU supports it.
 *
 * Depths up to 8 bits get the 8-bit kernel and depths of 9..15 bits get the
 * 16-bit kernel. For any other depth, or without SSE2, dsp is left as the
 * caller set it up.
 * NOTE(review): the 16-bit depth is presumably excluded because the SIMD
 * routine squares differences with signed 16-bit multiplies - confirm
 * against the assembly.
 *
 * @param dsp context to update
 * @param bpp bit depth of the samples
 */
void ff_psnr_init_x86(PSNRDSPContext *dsp, int bpp)
{
    const int cpu_flags = av_get_cpu_flags();

    /* Nothing to install without external SSE2 support. */
    if (!EXTERNAL_SSE2(cpu_flags))
        return;

    if (bpp <= 8)
        dsp->sse_line = ff_sse_line_8bit_sse2;
    else if (bpp <= 15)
        dsp->sse_line = ff_sse_line_16bit_sse2;
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment