Commit 869efbf9 authored by Martin Vignali

avfilter/x86/vf_threshold : add threshold16 SIMD (SSE4 and AVX2)

parent 713f9c5b
......@@ -25,16 +25,18 @@
SECTION_RODATA
; Sign-flip masks handed to the THRESHOLD macro as its third argument. The macro
; broadcasts the mask into m4 and XORs it into both compare operands before
; pcmpgtb/pcmpgtw; those compares are signed, so toggling the top bit of every
; element makes them order the values as unsigned.
; pb_128: 16 bytes, each 128 (0x80) -> top bit of every byte.
pb_128: times 16 db 128
; pb_128_0: 8 byte pairs (0, 128) -> 0x8000 in every little-endian 16-bit word.
pb_128_0 : times 8 db 0, 128
SECTION .text
%macro THRESHOLD_8 0
; %1 = bit depth (8 or 16) ; %2 = pcmpgt element suffix (b or w) ; %3 = label of the constant loaded into m4
%macro THRESHOLD 3
%if ARCH_X86_64
cglobal threshold8, 10, 13, 5, in, threshold, min, max, out, ilinesize, tlinesize, flinesize, slinesize, olinesize, w, h, x
cglobal threshold%1, 10, 13, 5, in, threshold, min, max, out, ilinesize, tlinesize, flinesize, slinesize, olinesize, w, h, x
mov wd, dword wm
mov hd, dword hm
%else
cglobal threshold8, 5, 7, 5, in, threshold, min, max, out, w, x
cglobal threshold%1, 5, 7, 5, in, threshold, min, max, out, w, x
mov wd, r10m
%define ilinesizeq r5mp
%define tlinesizeq r6mp
......@@ -43,7 +45,10 @@ cglobal threshold8, 5, 7, 5, in, threshold, min, max, out, w, x
%define olinesizeq r9mp
%define hd r11mp
%endif
VBROADCASTI128 m4, [pb_128]
VBROADCASTI128 m4, [%3]
%if %1 == 16
add wq, wq ; w *= 2 (16 bits instead of 8)
%endif
add inq, wq
add thresholdq, wq
add minq, wq
......@@ -60,7 +65,7 @@ cglobal threshold8, 5, 7, 5, in, threshold, min, max, out, w, x
movu m3, [maxq + xq]
pxor m0, m4
pxor m1, m4
pcmpgtb m0, m1
pcmpgt%2 m0, m1
PBLENDVB m3, m2, m0
movu [outq + xq], m3
add xq, mmsize
......@@ -77,9 +82,11 @@ RET
%endmacro
INIT_XMM sse4
THRESHOLD_8
THRESHOLD 8, b, pb_128
THRESHOLD 16, w, pb_128_0
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
THRESHOLD_8
THRESHOLD 8, b, pb_128
THRESHOLD 16, w, pb_128_0
%endif
......@@ -23,20 +23,19 @@
#include "libavutil/x86/cpu.h"
#include "libavfilter/threshold.h"
void ff_threshold8_sse4(const uint8_t *in, const uint8_t *threshold,
const uint8_t *min, const uint8_t *max,
uint8_t *out,
ptrdiff_t ilinesize, ptrdiff_t tlinesize,
ptrdiff_t flinesize, ptrdiff_t slinesize,
ptrdiff_t olinesize,
int w, int h);
void ff_threshold8_avx2(const uint8_t *in, const uint8_t *threshold,
const uint8_t *min, const uint8_t *max,
uint8_t *out,
ptrdiff_t ilinesize, ptrdiff_t tlinesize,
ptrdiff_t flinesize, ptrdiff_t slinesize,
ptrdiff_t olinesize,
int w, int h);
/*
 * Declare the prototype of ff_threshold<depth>_<opt>.
 *
 * Every generated function returns void and takes four const input pointers
 * (in, threshold, min, max), one output pointer, five ptrdiff_t line sizes
 * (ilinesize, tlinesize, flinesize, slinesize, olinesize) and the width and
 * height as int. The pointers are uint8_t * for every depth, including 16.
 *
 * The macro body already ends in ';', so the invocations below carry none.
 *
 * NOTE(review): these are presumably the symbols emitted by the THRESHOLD
 * macro in the x86 assembly file -- confirm the name mangling there.
 */
#define THRESHOLD_FUNC(depth, opt) \
void ff_threshold##depth##_##opt(const uint8_t *in, const uint8_t *threshold,\
const uint8_t *min, const uint8_t *max, \
uint8_t *out, \
ptrdiff_t ilinesize, ptrdiff_t tlinesize, \
ptrdiff_t flinesize, ptrdiff_t slinesize, \
ptrdiff_t olinesize, \
int w, int h);
/* One declaration per (depth, instruction set) pair selected in ff_threshold_init_x86(). */
THRESHOLD_FUNC(8, sse4)
THRESHOLD_FUNC(8, avx2)
THRESHOLD_FUNC(16, sse4)
THRESHOLD_FUNC(16, avx2)
av_cold void ff_threshold_init_x86(ThresholdContext *s)
{
......@@ -49,5 +48,12 @@ av_cold void ff_threshold_init_x86(ThresholdContext *s)
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
s->threshold = ff_threshold8_avx2;
}
} else if (s->depth == 16) {
if (EXTERNAL_SSE4(cpu_flags)) {
s->threshold = ff_threshold16_sse4;
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
s->threshold = ff_threshold16_avx2;
}
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment