Commit 3a230ce5 authored by Martin Vignali

avfilter/x86/vf_blend : add AVX2 version for each func except divide

and optimize average, grainextract, multiply, screen, grain merge
parent 4d95c6d5
...@@ -2,6 +2,8 @@ ...@@ -2,6 +2,8 @@
;* x86-optimized functions for blend filter ;* x86-optimized functions for blend filter
;* ;*
;* Copyright (C) 2015 Paul B Mahol ;* Copyright (C) 2015 Paul B Mahol
;* Copyright (C) 2018 Henrik Gramner
;* Copyright (C) 2018 Jokyo Images
;* ;*
;* This file is part of FFmpeg. ;* This file is part of FFmpeg.
;* ;*
...@@ -74,39 +76,36 @@ BLEND_INIT %1, 2 ...@@ -74,39 +76,36 @@ BLEND_INIT %1, 2
BLEND_END BLEND_END
%endmacro %endmacro
INIT_XMM sse2 %macro GRAINEXTRACT 0
BLEND_SIMPLE xor, xor BLEND_INIT grainextract, 6
BLEND_SIMPLE or, or pxor m4, m4
BLEND_SIMPLE and, and VBROADCASTI128 m5, [pw_128]
BLEND_SIMPLE addition, addusb
BLEND_SIMPLE subtract, subusb
BLEND_SIMPLE darken, minub
BLEND_SIMPLE lighten, maxub
BLEND_INIT grainextract, 4
pxor m2, m2
mova m3, [pw_128]
.nextrow: .nextrow:
mov xq, widthq mov xq, widthq
.loop: .loop:
movh m0, [topq + xq] movu m1, [topq + xq]
movh m1, [bottomq + xq] movu m3, [bottomq + xq]
punpcklbw m0, m2 punpcklbw m0, m1, m4
punpcklbw m1, m2 punpckhbw m1, m4
paddw m0, m3 punpcklbw m2, m3, m4
psubw m0, m1 punpckhbw m3, m4
packuswb m0, m0
movh [dstq + xq], m0 paddw m0, m5
add xq, mmsize / 2 paddw m1, m5
psubw m0, m2
psubw m1, m3
packuswb m0, m1
mova [dstq + xq], m0
add xq, mmsize
jl .loop jl .loop
BLEND_END BLEND_END
%endmacro
%macro MULTIPLY 3 ; a, b, pw_1 %macro MULTIPLY 3 ; a, b, pw_1
pmullw %1, %2 ; xxxxxxxx a * b pmullw %1, %2 ; xxxxxxxx a * b
paddw %1, %3 paddw %1, %3
mova %2, %1 psrlw %2, %1, 8
psrlw %2, 8
paddw %1, %2 paddw %1, %2
psrlw %1, 8 ; 00xx00xx a * b / 255 psrlw %1, 8 ; 00xx00xx a * b / 255
%endmacro %endmacro
...@@ -118,92 +117,112 @@ BLEND_END ...@@ -118,92 +117,112 @@ BLEND_END
pxor %1, %4 ; 00xx00xx 255 - x / 255 pxor %1, %4 ; 00xx00xx 255 - x / 255
%endmacro %endmacro
BLEND_INIT multiply, 4 %macro BLEND_MULTIPLY 0
pxor m2, m2 BLEND_INIT multiply, 6
mova m3, [pw_1] pxor m4, m4
VBROADCASTI128 m5, [pw_1]
.nextrow: .nextrow:
mov xq, widthq mov xq, widthq
.loop: .loop:
; word movu m1, [topq + xq]
; |--| movu m3, [bottomq + xq]
movh m0, [topq + xq] ; 0000xxxx punpcklbw m0, m1, m4
movh m1, [bottomq + xq] punpckhbw m1, m4
punpcklbw m0, m2 ; 00xx00xx punpcklbw m2, m3, m4
punpcklbw m1, m2 punpckhbw m3, m4
MULTIPLY m0, m1, m3
packuswb m0, m0 ; 0000xxxx MULTIPLY m0, m2, m5
movh [dstq + xq], m0 MULTIPLY m1, m3, m5
add xq, mmsize / 2
packuswb m0, m1
mova [dstq + xq], m0
add xq, mmsize
jl .loop jl .loop
BLEND_END BLEND_END
%endmacro
BLEND_INIT screen, 5 %macro BLEND_SCREEN 0
pxor m2, m2 BLEND_INIT screen, 7
mova m3, [pw_1] pxor m4, m4
mova m4, [pw_255]
VBROADCASTI128 m5, [pw_1]
VBROADCASTI128 m6, [pw_255]
.nextrow: .nextrow:
mov xq, widthq mov xq, widthq
.loop: .loop:
movh m0, [topq + xq] ; 0000xxxx movu m1, [topq + xq]
movh m1, [bottomq + xq] movu m3, [bottomq + xq]
punpcklbw m0, m2 ; 00xx00xx punpcklbw m0, m1, m4
punpcklbw m1, m2 punpckhbw m1, m4
punpcklbw m2, m3, m4
SCREEN m0, m1, m3, m4 punpckhbw m3, m4
packuswb m0, m0 ; 0000xxxx SCREEN m0, m2, m5, m6
movh [dstq + xq], m0 SCREEN m1, m3, m5, m6
add xq, mmsize / 2
packuswb m0, m1
mova [dstq + xq], m0
add xq, mmsize
jl .loop jl .loop
BLEND_END BLEND_END
%endmacro
%macro AVERAGE 0
BLEND_INIT average, 3 BLEND_INIT average, 3
pxor m2, m2 pcmpeqb m2, m2
.nextrow: .nextrow:
mov xq, widthq mov xq, widthq
.loop: .loop:
movh m0, [topq + xq] movu m0, [topq + xq]
movh m1, [bottomq + xq] movu m1, [bottomq + xq]
punpcklbw m0, m2 pxor m0, m2
punpcklbw m1, m2 pxor m1, m2
paddw m0, m1 pavgb m0, m1
psrlw m0, 1 pxor m0, m2
packuswb m0, m0 mova [dstq + xq], m0
movh [dstq + xq], m0 add xq, mmsize
add xq, mmsize / 2
jl .loop jl .loop
BLEND_END BLEND_END
%endmacro
BLEND_INIT grainmerge, 4
pxor m2, m2 %macro GRAINMERGE 0
mova m3, [pw_128] BLEND_INIT grainmerge, 6
pxor m4, m4
VBROADCASTI128 m5, [pw_128]
.nextrow: .nextrow:
mov xq, widthq mov xq, widthq
.loop: .loop:
movh m0, [topq + xq] movu m1, [topq + xq]
movh m1, [bottomq + xq] movu m3, [bottomq + xq]
punpcklbw m0, m2 punpcklbw m0, m1, m4
punpcklbw m1, m2 punpckhbw m1, m4
paddw m0, m1 punpcklbw m2, m3, m4
psubw m0, m3 punpckhbw m3, m4
packuswb m0, m0
movh [dstq + xq], m0 paddw m0, m2
add xq, mmsize / 2 paddw m1, m3
psubw m0, m5
psubw m1, m5
packuswb m0, m1
mova [dstq + xq], m0
add xq, mmsize
jl .loop jl .loop
BLEND_END BLEND_END
%endmacro
%macro HARDMIX 0
BLEND_INIT hardmix, 5 BLEND_INIT hardmix, 5
mova m2, [pb_255] VBROADCASTI128 m2, [pb_255]
mova m3, [pb_128] VBROADCASTI128 m3, [pb_128]
mova m4, [pb_127] VBROADCASTI128 m4, [pb_127]
.nextrow: .nextrow:
mov xq, widthq mov xq, widthq
...@@ -218,7 +237,9 @@ BLEND_INIT hardmix, 5 ...@@ -218,7 +237,9 @@ BLEND_INIT hardmix, 5
add xq, mmsize add xq, mmsize
jl .loop jl .loop
BLEND_END BLEND_END
%endmacro
%macro DIVIDE 0
BLEND_INIT divide, 4 BLEND_INIT divide, 4
pxor m2, m2 pxor m2, m2
mova m3, [ps_255] mova m3, [ps_255]
...@@ -247,9 +268,11 @@ BLEND_INIT divide, 4 ...@@ -247,9 +268,11 @@ BLEND_INIT divide, 4
jl .loop jl .loop
BLEND_END BLEND_END
%endmacro
%macro PHOENIX 0
BLEND_INIT phoenix, 4 BLEND_INIT phoenix, 4
mova m3, [pb_255] VBROADCASTI128 m3, [pb_255]
.nextrow: .nextrow:
mov xq, widthq mov xq, widthq
...@@ -266,6 +289,7 @@ BLEND_INIT phoenix, 4 ...@@ -266,6 +289,7 @@ BLEND_INIT phoenix, 4
add xq, mmsize add xq, mmsize
jl .loop jl .loop
BLEND_END BLEND_END
%endmacro
%macro BLEND_ABS 0 %macro BLEND_ABS 0
BLEND_INIT difference, 5 BLEND_INIT difference, 5
...@@ -291,7 +315,7 @@ BLEND_END ...@@ -291,7 +315,7 @@ BLEND_END
BLEND_INIT extremity, 8 BLEND_INIT extremity, 8
pxor m2, m2 pxor m2, m2
mova m4, [pw_255] VBROADCASTI128 m4, [pw_255]
.nextrow: .nextrow:
mov xq, widthq mov xq, widthq
...@@ -315,7 +339,7 @@ BLEND_END ...@@ -315,7 +339,7 @@ BLEND_END
BLEND_INIT negation, 8 BLEND_INIT negation, 8
pxor m2, m2 pxor m2, m2
mova m4, [pw_255] VBROADCASTI128 m4, [pw_255]
.nextrow: .nextrow:
mov xq, widthq mov xq, widthq
...@@ -341,6 +365,43 @@ BLEND_END ...@@ -341,6 +365,43 @@ BLEND_END
%endmacro %endmacro
INIT_XMM sse2 INIT_XMM sse2
BLEND_SIMPLE xor, xor
BLEND_SIMPLE or, or
BLEND_SIMPLE and, and
BLEND_SIMPLE addition, addusb
BLEND_SIMPLE subtract, subusb
BLEND_SIMPLE darken, minub
BLEND_SIMPLE lighten, maxub
GRAINEXTRACT
BLEND_MULTIPLY
BLEND_SCREEN
AVERAGE
GRAINMERGE
HARDMIX
PHOENIX
DIVIDE
BLEND_ABS BLEND_ABS
INIT_XMM ssse3 INIT_XMM ssse3
BLEND_ABS BLEND_ABS
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
BLEND_SIMPLE xor, xor
BLEND_SIMPLE or, or
BLEND_SIMPLE and, and
BLEND_SIMPLE addition, addusb
BLEND_SIMPLE subtract, subusb
BLEND_SIMPLE darken, minub
BLEND_SIMPLE lighten, maxub
GRAINEXTRACT
BLEND_MULTIPLY
BLEND_SCREEN
AVERAGE
GRAINMERGE
HARDMIX
PHOENIX
BLEND_ABS
%endif
...@@ -31,26 +31,43 @@ void ff_blend_##name##_##opt(const uint8_t *top, ptrdiff_t top_linesize, \ ...@@ -31,26 +31,43 @@ void ff_blend_##name##_##opt(const uint8_t *top, ptrdiff_t top_linesize, \
struct FilterParams *param, double *values, int starty); struct FilterParams *param, double *values, int starty);
BLEND_FUNC(addition, sse2) BLEND_FUNC(addition, sse2)
BLEND_FUNC(addition, avx2)
BLEND_FUNC(grainmerge, sse2) BLEND_FUNC(grainmerge, sse2)
BLEND_FUNC(grainmerge, avx2)
BLEND_FUNC(average, sse2) BLEND_FUNC(average, sse2)
BLEND_FUNC(average, avx2)
BLEND_FUNC(and, sse2) BLEND_FUNC(and, sse2)
BLEND_FUNC(and, avx2)
BLEND_FUNC(darken, sse2) BLEND_FUNC(darken, sse2)
BLEND_FUNC(darken, avx2)
BLEND_FUNC(grainextract, sse2) BLEND_FUNC(grainextract, sse2)
BLEND_FUNC(grainextract, avx2)
BLEND_FUNC(multiply, sse2) BLEND_FUNC(multiply, sse2)
BLEND_FUNC(multiply, avx2)
BLEND_FUNC(screen, sse2) BLEND_FUNC(screen, sse2)
BLEND_FUNC(screen, avx2)
BLEND_FUNC(hardmix, sse2) BLEND_FUNC(hardmix, sse2)
BLEND_FUNC(hardmix, avx2)
BLEND_FUNC(divide, sse2) BLEND_FUNC(divide, sse2)
BLEND_FUNC(lighten, sse2) BLEND_FUNC(lighten, sse2)
BLEND_FUNC(lighten, avx2)
BLEND_FUNC(or, sse2) BLEND_FUNC(or, sse2)
BLEND_FUNC(or, avx2)
BLEND_FUNC(phoenix, sse2) BLEND_FUNC(phoenix, sse2)
BLEND_FUNC(phoenix, avx2)
BLEND_FUNC(subtract, sse2) BLEND_FUNC(subtract, sse2)
BLEND_FUNC(subtract, avx2)
BLEND_FUNC(xor, sse2) BLEND_FUNC(xor, sse2)
BLEND_FUNC(xor, avx2)
BLEND_FUNC(difference, sse2) BLEND_FUNC(difference, sse2)
BLEND_FUNC(difference, ssse3) BLEND_FUNC(difference, ssse3)
BLEND_FUNC(difference, avx2)
BLEND_FUNC(extremity, sse2) BLEND_FUNC(extremity, sse2)
BLEND_FUNC(extremity, ssse3) BLEND_FUNC(extremity, ssse3)
BLEND_FUNC(extremity, avx2)
BLEND_FUNC(negation, sse2) BLEND_FUNC(negation, sse2)
BLEND_FUNC(negation, ssse3) BLEND_FUNC(negation, ssse3)
BLEND_FUNC(negation, avx2)
av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit) av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit)
{ {
...@@ -85,4 +102,26 @@ av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit) ...@@ -85,4 +102,26 @@ av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit)
case BLEND_NEGATION: param->blend = ff_blend_negation_ssse3; break; case BLEND_NEGATION: param->blend = ff_blend_negation_ssse3; break;
} }
} }
if (EXTERNAL_AVX2_FAST(cpu_flags) && param->opacity == 1 && !is_16bit) {
switch (param->mode) {
case BLEND_ADDITION: param->blend = ff_blend_addition_avx2; break;
case BLEND_GRAINMERGE: param->blend = ff_blend_grainmerge_avx2; break;
case BLEND_AND: param->blend = ff_blend_and_avx2; break;
case BLEND_AVERAGE: param->blend = ff_blend_average_avx2; break;
case BLEND_DARKEN: param->blend = ff_blend_darken_avx2; break;
case BLEND_GRAINEXTRACT: param->blend = ff_blend_grainextract_avx2; break;
case BLEND_HARDMIX: param->blend = ff_blend_hardmix_avx2; break;
case BLEND_LIGHTEN: param->blend = ff_blend_lighten_avx2; break;
case BLEND_MULTIPLY: param->blend = ff_blend_multiply_avx2; break;
case BLEND_OR: param->blend = ff_blend_or_avx2; break;
case BLEND_PHOENIX: param->blend = ff_blend_phoenix_avx2; break;
case BLEND_SCREEN: param->blend = ff_blend_screen_avx2; break;
case BLEND_SUBTRACT: param->blend = ff_blend_subtract_avx2; break;
case BLEND_XOR: param->blend = ff_blend_xor_avx2; break;
case BLEND_DIFFERENCE: param->blend = ff_blend_difference_avx2; break;
case BLEND_EXTREMITY: param->blend = ff_blend_extremity_avx2; break;
case BLEND_NEGATION: param->blend = ff_blend_negation_avx2; break;
}
}
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment