Commit 3c99523a authored by Rostislav Pehlivanov

opus_pvq_search: split functions by exactness and only use the exact one if it's faster

This splits the asm function into exact and non-exact versions. The exact
version is as fast or faster on newer CPUs (which EXTERNAL_AVX_FAST describes
well), while the non-exact version is faster than the exact one on older CPUs.

Also fixes yasm compilation, which doesn't accept the !cpuflag(avx) syntax.
Signed-off-by: Rostislav Pehlivanov <atomnuker@gmail.com>
parent 285c015f
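
To make the exact/approximate split concrete, here is a minimal scalar sketch of the projection step the two paths compute differently. This is illustrative code, not FFmpeg API: proj_approx and proj_exact are hypothetical names, but _mm_rsqrt_ss is the intrinsic form of the same low-precision rsqrtss estimate the approximate asm path relies on, while the exact path does a true square root and division.

#include <math.h>
#include <stdio.h>
#include <xmmintrin.h> /* SSE intrinsics: _mm_rsqrt_ss() */

/* Approximate path: p = Sxy * approx(1/sqrt(Syy)).
 * rsqrtss gives only ~12 bits of precision but costs a few cycles
 * on any SSE CPU, which is why it wins on older hardware. */
static float proj_approx(float Sxy, float Syy)
{
    return Sxy * _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(Syy)));
}

/* Exact path: p = Sxy / sqrt(Syy) in full single precision.
 * Cheap on the newer CPUs that EXTERNAL_AVX_FAST selects. */
static float proj_exact(float Sxy, float Syy)
{
    return Sxy / sqrtf(Syy);
}

int main(void)
{
    printf("approx %.9f vs exact %.9f\n",
           proj_approx(1.75f, 3.5f), proj_exact(1.75f, 3.5f));
    return 0;
}
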
diff --git a/libavcodec/x86/opus_dsp_init.c b/libavcodec/x86/opus_dsp_init.c
@@ -24,9 +24,9 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/opus_pvq.h"
 
-extern float ff_pvq_search_sse2(float *X, int *y, int K, int N);
-extern float ff_pvq_search_sse4(float *X, int *y, int K, int N);
-extern float ff_pvq_search_avx (float *X, int *y, int K, int N);
+extern float ff_pvq_search_approx_sse2(float *X, int *y, int K, int N);
+extern float ff_pvq_search_approx_sse4(float *X, int *y, int K, int N);
+extern float ff_pvq_search_exact_avx (float *X, int *y, int K, int N);
 
 av_cold void ff_opus_dsp_init_x86(CeltPVQ *s)
 {
@@ -34,12 +34,12 @@ av_cold void ff_opus_dsp_init_x86(CeltPVQ *s)
 #if CONFIG_OPUS_ENCODER
     if (EXTERNAL_SSE2(cpu_flags))
-        s->pvq_search = ff_pvq_search_sse2;
+        s->pvq_search = ff_pvq_search_approx_sse2;
 
     if (EXTERNAL_SSE4(cpu_flags))
-        s->pvq_search = ff_pvq_search_sse4;
+        s->pvq_search = ff_pvq_search_approx_sse4;
 
-    if (EXTERNAL_AVX(cpu_flags))
-        s->pvq_search = ff_pvq_search_avx;
+    if (EXTERNAL_AVX_FAST(cpu_flags))
+        s->pvq_search = ff_pvq_search_exact_avx;
 #endif
 }
diff --git a/libavcodec/x86/opus_pvq_search.asm b/libavcodec/x86/opus_pvq_search.asm
@@ -82,7 +82,7 @@ SECTION .text
 %endif
 %endmacro
 
-%macro PULSES_SEARCH 1
+%macro PULSES_SEARCH 2 ; %1 - add or sub, %2 - use approximation
 ; m6 Syy_norm
 ; m7 Sxy_norm
     addps    m6, mm_const_float_0_5    ; Syy_norm += 1.0/2
@@ -96,7 +96,7 @@ align 16
     movaps   m4, [tmpY + r4]           ; y[i]
     movaps   m5, [tmpX + r4]           ; X[i]
-%if !cpuflag(avx)  ; for crappy ancient CPUs that have slow packed divs but fast 1/sqrt
+%if %2
     xorps    m0, m0
     cmpps    m0, m0, m5, 4             ; m0 = (X[i] != 0.0)
 %endif
@@ -104,7 +104,7 @@ align 16
     addps    m4, m6                    ; m4 = Syy_new = y[i] + Syy_norm
     addps    m5, m7                    ; m5 = Sxy_new = X[i] + Sxy_norm
-%if !cpuflag(avx)
+%if %2
     andps    m5, m0                    ; if (X[i] == 0) Sxy_new = 0; prevent approximation error from setting pulses in array padding
 %endif
@@ -119,7 +119,7 @@ align 16
     andps    m5, m0                    ; (0 < y) ? m5 : 0
 %endif
 
-%if !cpuflag(avx)
+%if %2
     rsqrtps  m4, m4
     mulps    m5, m4                    ; m5 = p = Sxy_new*approx(1/sqrt(Syy))
 %else
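
The %if %2 guards above exist because the rsqrt estimate can report a marginally positive score for positions where X[i] is exactly zero, including the zeroed padding at the end of the work buffers, which would misplace pulses there. A scalar sketch of that guard, with illustrative names rather than a port of the asm; it mirrors the asm's running-sum convention, where Syy_norm holds Syy/2 so the 2*y[i]+1 update folds into a single add:

#include <math.h>

/* Candidate score for adding one pulse at a position; the X == 0 test
 * mirrors the cmpps/andps masking in the diff above. */
float pulse_score(float X, float y, float Sxy_norm, float Syy_norm,
                  int approx)
{
    float Syy_new = y + Syy_norm;  /* Syy_norm already includes +1/2 */
    float Sxy_new = X + Sxy_norm;
    if (approx && X == 0.0f)  /* only compiled on the approximate path */
        Sxy_new = 0.0f;       /* never place a pulse in zero padding  */
    return Sxy_new / sqrtf(Syy_new);
}
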
@@ -211,8 +211,13 @@ align 16
 ; uint32 K - Number of pulses to have after quantizations.
 ; uint32 N - Number of vector elements. Must be 0 < N < 256
 ;
-%macro PVQ_FAST_SEARCH 0
-cglobal pvq_search, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
+%macro PVQ_FAST_SEARCH 1 ; %1 - use approximation
+%if %1
+cglobal pvq_search_approx, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
+%else
+cglobal pvq_search_exact,  4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
+%endif
 %define tmpX rsp
 %define tmpY outYq
@@ -255,7 +260,7 @@ align 16
     jz   %%zero_input                  ; if (Sx == 0) goto zero_input
 
     cvtsi2ss xm0, dword Kd             ; m0 = K
-%if !cpuflag(avx)
+%if %1
     rcpss    xm1, xm1                  ; m1 = approx(1/Sx)
     mulss    xm0, xm1                  ; m0 = K*(1/Sx)
 %else
@@ -308,7 +313,7 @@ align 16
 align 16  ; K - pulses > 0
 %%add_pulses_loop:
-    PULSES_SEARCH add                  ; m6 Syy_norm ; m7 Sxy_norm
+    PULSES_SEARCH add, %1              ; m6 Syy_norm ; m7 Sxy_norm
 
     sub  Kd, 1
     jnz  %%add_pulses_loop
@@ -320,7 +325,7 @@ align 16 ; K - pulses > 0
 align 16
 %%remove_pulses_loop:
-    PULSES_SEARCH sub                  ; m6 Syy_norm ; m7 Sxy_norm
+    PULSES_SEARCH sub, %1              ; m6 Syy_norm ; m7 Sxy_norm
 
     add  Kd, 1
     jnz  %%remove_pulses_loop
@@ -367,12 +372,15 @@ align 16
     jmp  %%return
 %endmacro
 
+; If 1, use a float op that gives only half precision but executes in around 3 cycles.
+; On Skylake and Ryzen the division is much faster (around 11c/3),
+; which makes the full-precision code only about 2% slower.
+; Opus also uses the rsqrt approximation in its intrinsics code.
 INIT_XMM sse2
-PVQ_FAST_SEARCH
+PVQ_FAST_SEARCH 1
 
 INIT_XMM sse4
-PVQ_FAST_SEARCH
+PVQ_FAST_SEARCH 1
 
 INIT_XMM avx
-PVQ_FAST_SEARCH
+PVQ_FAST_SEARCH 0
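
For orientation, a rough C outline of the search these macros implement, reconstructed from the comments visible in the diff above. It is a sketch under stated assumptions, not FFmpeg's reference implementation: the zero-input fallback and the return value are simplifications, and the real code processes 8 lanes per step with both add and sub correction loops.

#include <math.h>

/* Find y with sum(|y[i]|) == K approximately maximizing
 * (X . y) / sqrt(y . y). Illustrative only. */
float pvq_search_sketch(const float *X, int *y, int K, int N)
{
    float Sx = 0.0f, Sxy = 0.0f, Syy = 0.0f;
    int i, pulses = 0;

    for (i = 0; i < N; i++)
        Sx += fabsf(X[i]);
    if (Sx == 0.0f) {             /* the asm branches to %%zero_input; */
        for (i = 0; i < N; i++)   /* putting all pulses in slot 0 is   */
            y[i] = 0;             /* one simple fallback               */
        y[0] = K;
        return (float)K * K;
    }

    /* Pre-search: y[i] ~ K*|X[i]|/Sx places most pulses in one pass
     * (the rcpss-vs-divss split seen above happens right here). */
    for (i = 0; i < N; i++) {
        y[i]    = (int)(K * fabsf(X[i]) / Sx);
        pulses += y[i];
        Sxy    += fabsf(X[i]) * y[i];
        Syy    += (float)y[i] * y[i];
    }

    /* Correction: add one pulse at a time, greedily maximizing
     * Sxy_new/sqrt(Syy_new). Truncation above can only undershoot;
     * the asm's rounding can overshoot too, hence its matching
     * remove-pulses loop, omitted here. */
    while (pulses < K) {
        int   best   = 0;
        float best_p = -1.0f;
        for (i = 0; i < N; i++) {
            float p = (Sxy + fabsf(X[i])) /
                      sqrtf(Syy + 2.0f * y[i] + 1.0f);
            if (p > best_p) { best_p = p; best = i; }
        }
        Sxy += fabsf(X[best]);
        Syy += 2.0f * y[best] + 1.0f;
        y[best]++;
        pulses++;
    }

    for (i = 0; i < N; i++)       /* restore the signs from X */
        if (X[i] < 0.0f)
            y[i] = -y[i];

    return Syy;                   /* simplified return convention */
}
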