Commit d03da3e2 authored by Hendrik Leppkes

Merge commit '2008f760'

* commit '2008f760':
  dca: remove unused decode_hf function and quant_d tables
Merged-by: Hendrik Leppkes <h.leppkes@gmail.com>
parents af1238f8 2008f760
......@@ -41,12 +41,6 @@ void ff_synth_filter_float_neon(FFTContext *imdct,
float out[32], const float in[32],
float scale);
void ff_decode_hf_neon(float dst[DCA_SUBBANDS][8],
const int32_t vq_num[DCA_SUBBANDS],
const int8_t hf_vq[1024][32], intptr_t vq_offset,
int32_t scale[DCA_SUBBANDS][2],
intptr_t start, intptr_t end);
av_cold void ff_dcadsp_init_aarch64(DCADSPContext *s)
{
int cpu_flags = av_get_cpu_flags();
......@@ -54,7 +48,6 @@ av_cold void ff_dcadsp_init_aarch64(DCADSPContext *s)
if (have_neon(cpu_flags)) {
s->lfe_fir[0] = ff_dca_lfe_fir0_neon;
s->lfe_fir[1] = ff_dca_lfe_fir1_neon;
s->decode_hf = ff_decode_hf_neon;
}
}
......
......@@ -21,66 +21,6 @@
#include "libavutil/aarch64/asm.S"
// ff_decode_hf_neon: NEON high-frequency VQ decode.
// For each subband l in [start, end): read 8 signed bytes at
// hf_vq[vq_num[l]] + vq_offset, convert them to float and multiply by
// scale[l][0] / 16, storing 8 floats to dst[l].
// Register use on entry (prototype argument order):
//   x0 = dst, x1 = vq_num, x2 = hf_vq, x3 = vq_offset,
//   x4 = scale, x5 = start, x6 = end
// Subbands are processed two per iteration at 2:, with a one-subband
// tail at 1:. Returns without storing anything when end <= start
// (vq_num[start] is still loaded before that check).
function ff_decode_hf_neon, export=1
add x2, x2, x3 // fold vq_offset into the vector table base
add x0, x0, x5, lsl #5 // dst    += start * 32 bytes (8 floats per row)
add x1, x1, x5, lsl #2 // vq_num += start * 4 bytes
add x4, x4, x5, lsl #3 // scale  += start * 8 bytes (2 int32 per row)
sub x6, x6, x5 // x6 = number of subbands to process
ldr w7, [x1], #4 // first vector index
add x7, x2, x7, lsl #5 // x7 = address of first vector (32 bytes per row)
subs x6, x6, #1
b.eq 1f // exactly one subband
b.gt 2f // two or more subbands
ret // nothing to do
2:
ldr w8, [x1], #4 // second vector index of this pair
subs x6, x6, #2 // flags from here drive b.lt / b.gt below
add x8, x2, x8, lsl #5
ld1 {v2.4s}, [x4], #16 // two scale rows: lanes 0 and 2 hold scale[l][0], scale[l+1][0]
ld1 {v0.8b}, [x7]
ld1 {v4.8b}, [x8]
sxtl v3.8h, v0.8b // widen int8 -> int16
sxtl v7.8h, v4.8b
scvtf v2.4s, v2.4s, #4 // fixed-point convert with 4 fraction bits: scale / 16
sxtl v0.4s, v3.4h // widen int16 -> int32
sxtl2 v1.4s, v3.8h
sxtl v4.4s, v7.4h
sxtl2 v5.4s, v7.8h
scvtf v0.4s, v0.4s
scvtf v1.4s, v1.4s
scvtf v4.4s, v4.4s
scvtf v5.4s, v5.4s
fmul v0.4s, v0.4s, v2.s[0]
fmul v1.4s, v1.4s, v2.s[0]
fmul v4.4s, v4.4s, v2.s[2]
fmul v5.4s, v5.4s, v2.s[2]
b.lt 10f // this pair was the last one: store and return
ldr w7, [x1], #4 // at least one subband remains: fetch its index
add x7, x2, x7, lsl #5
st1 {v0.4s,v1.4s}, [x0], #32
st1 {v4.4s,v5.4s}, [x0], #32
b.gt 2b // two or more remain; otherwise fall through for the last one
1:
ldr w9, [x4] // scale[l][0] of the final subband
ld1 {v0.8b}, [x7]
scvtf s2, w9, #4 // scale / 16 as float in lane 0 of v2
sxtl v3.8h, v0.8b
sxtl v0.4s, v3.4h
sxtl2 v1.4s, v3.8h
scvtf v0.4s, v0.4s
scvtf v1.4s, v1.4s
fmul v0.4s, v0.4s, v2.s[0]
fmul v1.4s, v1.4s, v2.s[0]
st1 {v0.4s,v1.4s}, [x0]
ret
10:
// Store the final pair computed in the loop above.
st1 {v0.4s,v1.4s}, [x0], #32
st1 {v4.4s,v5.4s}, [x0]
ret
endfunc
function ff_dca_lfe_fir0_neon, export=1
mov x3, #32 // decifactor
sub x1, x1, #7*4
......
......@@ -49,12 +49,6 @@ void ff_synth_filter_float_neon(FFTContext *imdct,
float out[32], const float in[32],
float scale);
void ff_decode_hf_neon(float dst[DCA_SUBBANDS][8],
const int32_t vq_num[DCA_SUBBANDS],
const int8_t hf_vq[1024][32], intptr_t vq_offset,
int32_t scale[DCA_SUBBANDS][2],
intptr_t start, intptr_t end);
av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
{
int cpu_flags = av_get_cpu_flags();
......@@ -67,7 +61,6 @@ av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
if (have_neon(cpu_flags)) {
s->lfe_fir[0] = ff_dca_lfe_fir0_neon;
s->lfe_fir[1] = ff_dca_lfe_fir1_neon;
s->decode_hf = ff_decode_hf_neon;
}
}
......
......@@ -20,35 +20,6 @@
#include "libavutil/arm/asm.S"
@ ff_decode_hf_neon: NEON high-frequency VQ decode, one subband per iteration.
@ For each subband l starting at start: read 8 signed bytes at
@ hf_vq[vq_num[l]] + vq_offset, convert them to float and multiply by
@ scale[l][0] / 16, storing 8 floats to dst[l].
@ Register use on entry (prototype argument order):
@   r0 = dst, r1 = vq_num, r2 = hf_vq, r3 = vq_offset,
@   scale, start and end are read from the stack.
@ NOTE(review): the loop tests after the body and exits only when the
@ counter equals end, so this relies on start < end - confirm callers
@ guarantee that.
function ff_decode_hf_neon, export=1
push {r4-r5,lr}
add r2, r2, r3 @ fold vq_offset into the vector table base
ldr r3, [sp, #12] @ scale (first stack argument, above the 3 pushed registers)
ldrd r4, r5, [sp, #16] @ r4 = start, r5 = end
add r3, r3, r4, lsl #3 @ scale  += start * 8 bytes (2 int32 per row)
add r1, r1, r4, lsl #2 @ vq_num += start * 4 bytes
add r0, r0, r4, lsl #5 @ dst    += start * 32 bytes (8 floats per row)
1: ldr_post lr, r1, #4 @ vector index for this subband
add r4, r4, #1
add lr, r2, lr, lsl #5 @ address of the vector (32 bytes per row)
cmp r4, r5 @ flags are consumed by the bne at the end of the loop
vld1.32 {d7}, [r3]! @ scale[l][0] and scale[l][1]; only lane 0 is used
vld1.8 {d0}, [lr,:64] @ 8 signed bytes, address hinted as 64-bit aligned
vcvt.f32.s32 d7, d7, #4 @ fixed-point convert with 4 fraction bits: scale / 16
vmovl.s8 q1, d0 @ widen int8 -> int16
vmovl.s16 q0, d2 @ widen int16 -> int32
vmovl.s16 q1, d3
vcvt.f32.s32 q0, q0
vcvt.f32.s32 q1, q1
vmul.f32 q0, q0, d7[0]
vmul.f32 q1, q1, d7[0]
vst1.32 {q0-q1}, [r0,:128]! @ store 8 floats, address hinted as 128-bit aligned
bne 1b
pop {r4-r5,pc}
endfunc
function ff_dca_lfe_fir0_neon, export=1
push {r4-r6,lr}
mov r3, #32 @ decifactor
......
......@@ -4189,13 +4189,6 @@ const uint32_t ff_dca_lossy_quant[32] = {
84, 42, 21, 0, 0, 0, 0, 0
};
/* Floating-point table with 32 entries, parallel to the integer table
 * ff_dca_lossy_quant. Entries 27..31 are zero.
 * NOTE(review): the values look like the integer table scaled by 2^-22
 * (e.g. 21 / 4194304 ~= 0.000005) - confirm before relying on that. */
const float ff_dca_lossy_quant_d[32] = {
0, 1.6, 1.0, 0.8, 0.59, 0.50, 0.42, 0.34,
0.19, 0.11, 0.06, 0.035, 0.019, 0.011, 0.0065, 0.0040,
0.0025, 0.0014, 0.0008, 0.00045, 0.00030, 0.00017, 0.00008, 0.00004,
0.00002, 0.00001, 0.000005, 0, 0, 0, 0, 0
};
/* 20bits unsigned fractional binary codes */
const uint32_t ff_dca_lossless_quant[32] = {
0, 4194304, 2097152, 1384120, 1048576, 696254, 524288, 348127,
......
......@@ -35,7 +35,6 @@ extern const uint32_t ff_dca_scale_factor_quant6[64];
extern const uint32_t ff_dca_scale_factor_quant7[128];
extern const uint32_t ff_dca_lossy_quant[32];
extern const float ff_dca_lossy_quant_d[32];
extern const uint32_t ff_dca_lossless_quant[32];
extern const float ff_dca_lossless_quant_d[32];
......
......@@ -992,12 +992,12 @@ static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
s->debug_flag |= 0x01;
}
s->dcadsp.decode_hf_int(subband_samples, s->dca_chan[k].high_freq_vq,
ff_dca_high_freq_vq, subsubframe * SAMPLES_PER_SUBBAND,
s->dca_chan[k].scale_factor,
s->audio_header.vq_start_subband[k],
s->audio_header.subband_activity[k]);
s->dcadsp.decode_hf(subband_samples, s->dca_chan[k].high_freq_vq,
ff_dca_high_freq_vq,
subsubframe * SAMPLES_PER_SUBBAND,
s->dca_chan[k].scale_factor,
s->audio_header.vq_start_subband[k],
s->audio_header.subband_activity[k]);
}
}
......
......@@ -27,29 +27,11 @@
#include "dcadsp.h"
#include "dcamath.h"
static void decode_hf_c(float dst[DCA_SUBBANDS][8],
static void decode_hf_c(int32_t dst[DCA_SUBBANDS][8],
const int32_t vq_num[DCA_SUBBANDS],
const int8_t hf_vq[1024][32], intptr_t vq_offset,
int32_t scale[DCA_SUBBANDS][2],
intptr_t start, intptr_t end)
{
int i, l;
for (l = start; l < end; l++) {
/* 1 vector -> 32 samples but we only need the 8 samples
* for this subsubframe. */
const int8_t *ptr = &hf_vq[vq_num[l]][vq_offset];
float fscale = scale[l][0] * (1 / 16.0);
for (i = 0; i < 8; i++)
dst[l][i] = ptr[i] * fscale;
}
}
static void decode_hf_int_c(int32_t dst[DCA_SUBBANDS][8],
const int32_t vq_num[DCA_SUBBANDS],
const int8_t hf_vq[1024][32], intptr_t vq_offset,
int32_t scale[DCA_SUBBANDS][2],
intptr_t start, intptr_t end)
{
int i, j;
......@@ -141,7 +123,6 @@ av_cold void ff_dcadsp_init(DCADSPContext *s)
s->lfe_fir[1] = dca_lfe_fir1_c;
s->qmf_32_subbands = dca_qmf_32_subbands;
s->decode_hf = decode_hf_c;
s->decode_hf_int = decode_hf_int_c;
s->dequantize = dequantize_c;
if (ARCH_AARCH64)
......
......@@ -32,16 +32,11 @@ typedef struct DCADSPContext {
int *synth_buf_offset, float synth_buf2[32],
const float window[512], float *samples_out,
float raXin[32], float scale);
void (*decode_hf)(float dst[DCA_SUBBANDS][8],
void (*decode_hf)(int32_t dst[DCA_SUBBANDS][8],
const int32_t vq_num[DCA_SUBBANDS],
const int8_t hf_vq[1024][32], intptr_t vq_offset,
int32_t scale[DCA_SUBBANDS][2],
intptr_t start, intptr_t end);
void (*decode_hf_int)(int32_t dst[DCA_SUBBANDS][8],
const int32_t vq_num[DCA_SUBBANDS],
const int8_t hf_vq[1024][32], intptr_t vq_offset,
int32_t scale[DCA_SUBBANDS][2],
intptr_t start, intptr_t end);
void (*dequantize)(int32_t *samples, uint32_t step_size, uint32_t scale);
} DCADSPContext;
......
......@@ -26,92 +26,6 @@ pf_inv16: times 4 dd 0x3D800000 ; 1/16
SECTION .text
; void decode_hf(float dst[DCA_SUBBANDS][8], const int32_t vq_num[DCA_SUBBANDS],
; const int8_t hf_vq[1024][32], intptr_t vq_offset,
; int32_t scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end)
;
; For each subband l from start up to end: read 8 signed bytes at
; hf_vq[vq_num[l]] + vq_offset, convert them to float, multiply by
; scale[l][0] / 16 and store 8 floats to dst[l].
; The macro is instantiated once per instruction set (SSE, SSE2, SSE4);
; the SSE-only build widens the bytes with MMX registers.
; NOTE(review): the loop tests after the body, so one subband is always
; processed even when start >= end - confirm callers never pass that.
%macro DECODE_HF 0
; NOTE(review): presumably x86inc's cglobal with 6 arguments loaded into
; registers; the 7th (end) is accessed through its memory form endm.
cglobal decode_hf, 6,6,5, dst, num, src, offset, scale, start, end
lea srcq, [srcq + offsetq] ; fold vq_offset into the table base
shl startq, 2 ; start becomes a byte index into the int32 arrays
mov offsetd, endm ; offset register is free now: reuse it as scratch
%define DICT offsetq
shl offsetq, 2 ; scale end the same way as start
mov endm, offsetq
.loop:
%if ARCH_X86_64
mov offsetd, [scaleq + 2 * startq] ; scale[l][0] (8 bytes per row)
cvtsi2ss m0, offsetd
%else
cvtsi2ss m0, [scaleq + 2 * startq] ; scale[l][0] (8 bytes per row)
%endif
mov offsetd, [numq + startq] ; vq_num[l]
mulss m0, [pf_inv16] ; scale / 16
shl DICT, 5 ; vector index * 32 bytes per row
shufps m0, m0, 0 ; broadcast the scale to all four lanes
%if cpuflag(sse2)
%if cpuflag(sse4)
pmovsxbd m1, [srcq + DICT + 0] ; sign-extend bytes 0..3 to dwords
pmovsxbd m2, [srcq + DICT + 4] ; sign-extend bytes 4..7 to dwords
%else
; SSE2: replicate every byte into all four bytes of a dword, then an
; arithmetic shift right by 24 leaves the sign-extended value.
movq m1, [srcq + DICT]
punpcklbw m1, m1
mova m2, m1
punpcklwd m1, m1
punpckhwd m2, m2
psrad m1, 24
psrad m2, 24
%endif
cvtdq2ps m1, m1
cvtdq2ps m2, m2
%else
; SSE only: same replicate-and-shift sign extension, done in MMX
; registers two dwords at a time.
movd mm0, [srcq + DICT + 0]
movd mm1, [srcq + DICT + 4]
punpcklbw mm0, mm0
punpcklbw mm1, mm1
movq mm2, mm0
movq mm3, mm1
punpcklwd mm0, mm0
punpcklwd mm1, mm1
punpckhwd mm2, mm2
punpckhwd mm3, mm3
psrad mm0, 24
psrad mm1, 24
psrad mm2, 24
psrad mm3, 24
cvtpi2ps m1, mm0 ; elements 0,1
cvtpi2ps m2, mm1 ; elements 4,5
cvtpi2ps m3, mm2 ; elements 2,3
cvtpi2ps m4, mm3 ; elements 6,7
shufps m0, m0, 0 ; NOTE(review): m0 was already broadcast above; looks redundant
shufps m1, m3, q1010 ; m1 = elements 0..3
shufps m2, m4, q1010 ; m2 = elements 4..7
%endif
mulps m1, m0
mulps m2, m0
mova [dstq + 8 * startq + 0], m1 ; dst[l][0..3] (startq is 4*l, so 32 bytes per row)
mova [dstq + 8 * startq + 16], m2 ; dst[l][4..7]
add startq, 4 ; next subband
cmp startq, endm
jl .loop
.end:
%if notcpuflag(sse2)
emms ; leave MMX state after the SSE-only path
%endif
REP_RET
%endmacro
%if ARCH_X86_32
INIT_XMM sse
DECODE_HF
%endif
INIT_XMM sse2
DECODE_HF
INIT_XMM sse4
DECODE_HF
; %1=v0/v1 %2=in1 %3=in2
%macro FIR_LOOP 2-3
.loop%1:
......
......@@ -23,15 +23,6 @@
#include "libavutil/x86/cpu.h"
#include "libavcodec/dcadsp.h"
void ff_decode_hf_sse(float dst[DCA_SUBBANDS][8], const int vq_num[DCA_SUBBANDS],
const int8_t hf_vq[1024][32], intptr_t vq_offset,
int scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end);
void ff_decode_hf_sse2(float dst[DCA_SUBBANDS][8], const int vq_num[DCA_SUBBANDS],
const int8_t hf_vq[1024][32], intptr_t vq_offset,
int scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end);
void ff_decode_hf_sse4(float dst[DCA_SUBBANDS][8], const int vq_num[DCA_SUBBANDS],
const int8_t hf_vq[1024][32], intptr_t vq_offset,
int scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end);
void ff_dca_lfe_fir0_sse(float *out, const float *in, const float *coefs);
void ff_dca_lfe_fir1_sse(float *out, const float *in, const float *coefs);
void ff_dca_lfe_fir0_fma3(float *out, const float *in, const float *coefs);
......@@ -41,21 +32,10 @@ av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSE(cpu_flags)) {
#if ARCH_X86_32
s->decode_hf = ff_decode_hf_sse;
#endif
s->lfe_fir[0] = ff_dca_lfe_fir0_sse;
s->lfe_fir[1] = ff_dca_lfe_fir1_sse;
}
if (EXTERNAL_SSE2(cpu_flags)) {
s->decode_hf = ff_decode_hf_sse2;
}
if (EXTERNAL_SSE4(cpu_flags)) {
s->decode_hf = ff_decode_hf_sse4;
}
if (EXTERNAL_FMA3(cpu_flags)) {
s->lfe_fir[0] = ff_dca_lfe_fir0_fma3;
}
......
......@@ -75,16 +75,6 @@
} \
} while (0)
/* Fill the decode_hf test inputs for all DCA_SUBBANDS subbands.
 * Expects arrays named vq_num and scale to be in scope at the expansion site.
 *  - vq_num[i]   = rnd() >> 22
 *  - scale[i][0] = rnd() >> 26
 *  - scale[i][1] = INT32_MIN
 * NOTE(review): assuming rnd() yields a 32-bit unsigned value, vq_num is
 * in 0..1023 and scale[i][0] in 0..63 - confirm against rnd().
 * NOTE(review): scale[i][1] presumably acts as a canary, so that an
 * implementation reading the wrong element produces an obviously wrong
 * result - confirm. */
#define randomize_decode_hf() \
do { \
int i; \
for (i = 0; i < DCA_SUBBANDS; i++) { \
vq_num[i] = rnd() >> 22; \
scale[i][0] = rnd() >> 26; \
scale[i][1] = INT32_MIN; \
} \
} while (0)
void checkasm_check_dcadsp(void)
{
DCADSPContext c;
......@@ -98,40 +88,5 @@ void checkasm_check_dcadsp(void)
if (check_func(c.lfe_fir[1], "dca_lfe_fir1"))
check_lfe_fir(64, 1.0e-6f);
if (check_func(c.decode_hf, "dca_decode_hf")) {
LOCAL_ALIGNED_16(float, dst0, [DCA_SUBBANDS], [8]);
LOCAL_ALIGNED_16(float, dst1, [DCA_SUBBANDS], [8]);
LOCAL_ALIGNED_16(int32_t, scale, [DCA_SUBBANDS], [2]);
LOCAL_ALIGNED_16(int32_t, vq_num, [DCA_SUBBANDS]);
intptr_t start, end = 32, offset;
declare_func(void, float[DCA_SUBBANDS][8], const int32_t[DCA_SUBBANDS],
const int8_t[1024][DCA_SUBBANDS], intptr_t, int32_t[DCA_SUBBANDS][2],
intptr_t, intptr_t);
for (start = 0; start < 32; start++) {
for (offset = 0; offset < 32; offset += 8) {
int j;
for (j = 0; j < DCA_SUBBANDS; j++) {
memset(dst0[j], 0, sizeof(*(dst0[j])) * 8);
memset(dst1[j], 0, sizeof(*(dst1[j])) * 8);
}
randomize_decode_hf();
call_ref(dst0, vq_num, ff_dca_high_freq_vq, offset, scale, start, end);
call_new(dst1, vq_num, ff_dca_high_freq_vq, offset, scale, start, end);
for (j = 0; j < 8 * DCA_SUBBANDS; j++) {
if (!float_near_ulp(dst0[j>>3][j&7], dst1[j>>3][j&7], 1)) {
fail();
break;
}
}
bench_new(dst1, vq_num, ff_dca_high_freq_vq, offset, scale, start, end);
}
}
}
report("dcadsp");
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment