Commit c6963a22 authored by Michael Niedermayer

Merge remote-tracking branch 'qatar/master'

* qatar/master:
  proresdsp: port x86 assembly to cpuflags.
  lavr: x86: improve non-SSE4 version of S16_TO_S32_SX macro
  lavfi: better channel layout negotiation
  alac: check for truncated packets
  alac: reverse lpc coeff order, simplify filter
  lavr: add x86-optimized mixing functions
  x86: add support for fmaddps fma4 instruction with abstraction to avx/sse
  tscc2: fix typo in array index
  build: use COMPILE template for HOSTOBJS
  build: do full flag handling for all compiler-type tools
  eval: fix printing of NaN in eval fate test.
  build: Rename aandct component to more descriptive aandcttables
  mpegaudio: bury inline asm under HAVE_INLINE_ASM.
  x86inc: automatically insert vzeroupper for YMM functions.
  rtmp: Check the buffer length of ping packets
  rtmp: Allow having more unknown data at the end of a chunk size packet without failing
  rtmp: Prevent reading outside of an allocated buffer when receiving server bandwidth packets

Conflicts:
	Makefile
	configure
	libavcodec/x86/proresdsp.asm
	libavutil/eval.c
Merged-by: Michael Niedermayer <michaelni@gmx.at>
parents 94c3e11a 5361e10a
@@ -11,7 +11,7 @@ ifndef V
Q = @
ECHO = printf "$(1)\t%s\n" $(2)
BRIEF = CC CXX AS YASM AR LD HOSTCC STRIP CP
SILENT = DEPCC YASMDEP RM RANLIB
SILENT = DEPCC DEPAS DEPHOSTCC YASMDEP RM RANLIB
MSG = $@
M = @$(call ECHO,$(TAG),$@);
$(foreach VAR,$(BRIEF), \
@@ -26,15 +26,16 @@ ALLFFLIBS = avcodec avdevice avfilter avformat avresample avutil postproc swscale
IFLAGS := -I. -I$(SRC_PATH)/
CPPFLAGS := $(IFLAGS) $(CPPFLAGS)
CFLAGS += $(ECFLAGS)
CCFLAGS = $(CFLAGS)
CCFLAGS = $(CPPFLAGS) $(CFLAGS)
ASFLAGS := $(CPPFLAGS) $(ASFLAGS)
CXXFLAGS := $(CFLAGS) $(CXXFLAGS)
YASMFLAGS += $(IFLAGS) -I$(SRC_PATH)/libavutil/x86/ -Pconfig.asm
HOSTCFLAGS += $(IFLAGS)
HOSTCCFLAGS = $(IFLAGS) $(HOSTCFLAGS)
LDFLAGS := $(ALLFFLIBS:%=-Llib%) $(LDFLAGS)
define COMPILE
$($(1)DEP)
$($(1)) $(CPPFLAGS) $($(1)FLAGS) $($(1)_DEPFLAGS) -c $($(1)_O) $<
$(call $(1)DEP,$(1))
$($(1)) $($(1)FLAGS) $($(1)_DEPFLAGS) -c $($(1)_O) $<
endef
COMPILE_C = $(call COMPILE,CC)
@@ -101,7 +102,7 @@ checkheaders: $(filter-out $(SKIPHEADERS:.h=.ho),$(ALLHEADERS:.h=.ho))
alltools: $(TOOLS)
$(HOSTOBJS): %.o: %.c
$(HOSTCC) $(HOSTCFLAGS) -c -o $@ $<
$(call COMPILE,HOSTCC)
$(HOSTPROGS): %$(HOSTEXESUF): %.o
$(HOSTCC) $(HOSTLDFLAGS) -o $@ $< $(HOSTLIBS)
@@ -117,4 +118,4 @@ CLEANSUFFIXES = *.d *.o *~ *.ho *.map *.ver *.gcno *.gcda
DISTCLEANSUFFIXES = *.pc
LIBSUFFIXES = *.a *.lib *.so *.so.* *.dylib *.dll *.def *.dll.a
-include $(wildcard $(OBJS:.o=.d) $(TESTOBJS:.o=.d))
-include $(wildcard $(OBJS:.o=.d) $(HOSTOBJS:.o=.d) $(TESTOBJS:.o=.d))
@@ -28,8 +28,6 @@ doc/%.txt: doc/%.texi
$(Q)$(TEXIDEP)
$(M)makeinfo --force --no-headers -o $@ $< 2>/dev/null
doc/print_options.o: libavformat/options_table.h libavcodec/options_table.h
GENTEXI = format codec
GENTEXI := $(GENTEXI:%=doc/avoptions_%.texi)
@@ -32,7 +32,7 @@ OBJS = allcodecs.o \
utils.o \
# parts needed for many different codecs
OBJS-$(CONFIG_AANDCT) += aandcttab.o
OBJS-$(CONFIG_AANDCTTABLES) += aandcttab.o
OBJS-$(CONFIG_AC3DSP) += ac3dsp.o
OBJS-$(CONFIG_CRYSTALHD) += crystalhd.o
OBJS-$(CONFIG_ENCODERS) += faandct.o jfdctfst.o jfdctint.o
@@ -200,6 +200,7 @@ static void lpc_prediction(int32_t *error_buffer, int32_t *buffer_out,
int lpc_order, int lpc_quant)
{
int i;
int32_t *pred = buffer_out;
/* first sample always copies */
*buffer_out = *error_buffer;
@@ -223,37 +224,35 @@ static void lpc_prediction(int32_t *error_buffer, int32_t *buffer_out,
}
/* read warm-up samples */
for (i = 0; i < lpc_order; i++) {
buffer_out[i + 1] = sign_extend(buffer_out[i] + error_buffer[i + 1],
bps);
}
for (i = 1; i <= lpc_order; i++)
buffer_out[i] = sign_extend(buffer_out[i - 1] + error_buffer[i], bps);
/* NOTE: 4 and 8 are very common cases that could be optimized. */
for (i = lpc_order; i < nb_samples - 1; i++) {
for (; i < nb_samples; i++) {
int j;
int val = 0;
int error_val = error_buffer[i + 1];
int error_val = error_buffer[i];
int error_sign;
int d = buffer_out[i - lpc_order];
int d = *pred++;
/* LPC prediction */
for (j = 0; j < lpc_order; j++)
val += (buffer_out[i - j] - d) * lpc_coefs[j];
val += (pred[j] - d) * lpc_coefs[j];
val = (val + (1 << (lpc_quant - 1))) >> lpc_quant;
val += d + error_val;
buffer_out[i + 1] = sign_extend(val, bps);
buffer_out[i] = sign_extend(val, bps);
/* adapt LPC coefficients */
error_sign = sign_only(error_val);
if (error_sign) {
for (j = lpc_order - 1; j >= 0 && error_val * error_sign > 0; j--) {
for (j = 0; j < lpc_order && error_val * error_sign > 0; j++) {
int sign;
val = d - buffer_out[i - j];
val = d - pred[j];
sign = sign_only(val) * error_sign;
lpc_coefs[j] -= sign;
val *= sign;
error_val -= (val >> lpc_quant) * (lpc_order - j);
error_val -= (val >> lpc_quant) * (j + 1);
}
}
}
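Taken together with the coefficient-order reversal in decode_element below, the predictor now walks its history forward through a single pred pointer instead of indexing backwards from the current sample. A consolidated sketch of the reworked filter (assuming libavutil's sign_extend() and the decoder's sign_only() helper, which returns -1/0/1):

static void lpc_prediction_sketch(int32_t *error_buffer, int32_t *buffer_out,
                                  int nb_samples, int bps, int16_t *lpc_coefs,
                                  int lpc_order, int lpc_quant)
{
    int i, j;
    int32_t *pred = buffer_out;

    buffer_out[0] = error_buffer[0];                   /* first sample copies */

    for (i = 1; i <= lpc_order && i < nb_samples; i++) /* warm-up samples */
        buffer_out[i] = sign_extend(buffer_out[i - 1] + error_buffer[i], bps);

    for (; i < nb_samples; i++) {
        int val = 0, error_val = error_buffer[i], error_sign;
        int d = *pred++;                  /* oldest sample in the window */

        for (j = 0; j < lpc_order; j++)   /* forward-order prediction */
            val += (pred[j] - d) * lpc_coefs[j];
        val = (val + (1 << (lpc_quant - 1))) >> lpc_quant;
        buffer_out[i] = sign_extend(val + d + error_val, bps);

        error_sign = sign_only(error_val); /* sign-based adaptation */
        for (j = 0; j < lpc_order && error_val * error_sign > 0; j++) {
            int sign;
            val  = d - pred[j];
            sign = sign_only(val) * error_sign;
            lpc_coefs[j] -= sign;
            val *= sign;
            error_val -= (val >> lpc_quant) * (j + 1);
        }
    }
}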
@@ -356,7 +355,7 @@ static int decode_element(AVCodecContext *avctx, void *data, int ch_index,
lpc_order[ch] = get_bits(&alac->gb, 5);
/* read the predictor table */
for (i = 0; i < lpc_order[ch]; i++)
for (i = lpc_order[ch] - 1; i >= 0; i--)
lpc_coefs[ch][i] = get_sbits(&alac->gb, 16);
}
@@ -477,16 +476,19 @@ static int alac_decode_frame(AVCodecContext *avctx, void *data,
ALACContext *alac = avctx->priv_data;
enum RawDataBlockType element;
int channels;
int ch, ret;
int ch, ret, got_end;
init_get_bits(&alac->gb, avpkt->data, avpkt->size * 8);
got_end = 0;
alac->nb_samples = 0;
ch = 0;
while (get_bits_left(&alac->gb)) {
while (get_bits_left(&alac->gb) >= 3) {
element = get_bits(&alac->gb, 3);
if (element == TYPE_END)
if (element == TYPE_END) {
got_end = 1;
break;
}
if (element > TYPE_CPE && element != TYPE_LFE) {
av_log(avctx, AV_LOG_ERROR, "syntax element unsupported: %d", element);
return AVERROR_PATCHWELCOME;
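The loop-condition change above matters because get_bits(&alac->gb, 3) with fewer than 3 bits left would read past the packet; requiring a full element code, together with the explicit got_end check below, makes a truncated packet fail cleanly with AVERROR_INVALIDDATA instead of being half-decoded.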
@@ -501,11 +503,15 @@ static int alac_decode_frame(AVCodecContext *avctx, void *data,
ret = decode_element(avctx, data,
alac_channel_layout_offsets[alac->channels - 1][ch],
channels);
if (ret < 0)
if (ret < 0 && get_bits_left(&alac->gb))
return ret;
ch += channels;
}
if (!got_end) {
av_log(avctx, AV_LOG_ERROR, "no end tag found. incomplete packet.\n");
return AVERROR_INVALIDDATA;
}
if (avpkt->size * 8 - get_bits_count(&alac->gb) > 8) {
av_log(avctx, AV_LOG_ERROR, "Error : %d bits left\n",
@@ -298,8 +298,8 @@ static int tscc2_decode_frame(AVCodecContext *avctx, void *data,
if (!size) {
int skip_row = 1, j, off = i * c->mb_width;
for (j = 0; j < c->mb_width; j++) {
if (c->slice_quants[off + i] == 1 ||
c->slice_quants[off + i] == 2) {
if (c->slice_quants[off + j] == 1 ||
c->slice_quants[off + j] == 2) {
skip_row = 0;
break;
}
@@ -1158,12 +1158,7 @@ ALIGN 16
add src1q, 2*mmsize
sub lenq, 2*mmsize
jge .loop
%if mmsize == 32
vzeroupper
RET
%else
REP_RET
%endif
%endmacro
INIT_XMM sse
@@ -1193,12 +1188,7 @@ ALIGN 16
sub lenq, 2*mmsize
jge .loop
%if mmsize == 32
vzeroupper
RET
%else
REP_RET
%endif
%endmacro
INIT_XMM sse
@@ -1243,10 +1233,6 @@ cglobal butterflies_float_interleave, 4,4,3, dst, src0, src1, len
%endif
add lenq, mmsize
jl .loop
%if mmsize == 32
vzeroupper
RET
%endif
.end:
REP_RET
%endmacro
@@ -750,9 +750,6 @@ section .text
; The others pass args in registers and don't spill anything.
cglobal fft_dispatch%2, 2,5,8, z, nbits
FFT_DISPATCH fullsuffix, nbits
%if mmsize == 32
vzeroupper
%endif
RET
%endmacro ; DECL_FFT
@@ -957,9 +954,6 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *i
%1 r0, r1, r6, rtcos, rtsin
%if ARCH_X86_64 == 0
add esp, 12
%endif
%if mmsize == 32
vzeroupper
%endif
RET
%endmacro
@@ -36,6 +36,8 @@ void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];
#if HAVE_INLINE_ASM
#define MACS(rt, ra, rb) rt+=(ra)*(rb)
#define MLSS(rt, ra, rb) rt-=(ra)*(rb)
@@ -178,6 +180,7 @@ static void apply_window_mp3(float *in, float *win, int *unused, float *out,
*out = sum;
}
#endif /* HAVE_INLINE_ASM */
#define DECL_IMDCT_BLOCKS(CPU1, CPU2) \
static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in, \
@@ -241,9 +244,11 @@ void ff_mpadsp_init_mmx(MPADSPContext *s)
}
}
#if HAVE_INLINE_ASM
if (mm_flags & AV_CPU_FLAG_SSE2) {
s->apply_window_float = apply_window_mp3;
}
#endif /* HAVE_INLINE_ASM */
#if HAVE_YASM
if (0) {
#if HAVE_AVX
@@ -83,8 +83,7 @@ section .text align=16
; %1 = row or col (for rounding variable)
; %2 = number of bits to shift at the end
; %3 = optimization
%macro IDCT_1D 3
%macro IDCT_1D 2
; a0 = (W4 * row[0]) + (1 << (15 - 1));
; a1 = a0;
; a2 = a0;
@@ -235,8 +234,8 @@ section .text align=16
; void prores_idct_put_10_<opt>(uint8_t *pixels, int stride,
; DCTELEM *block, const int16_t *qmat);
%macro idct_put_fn 2
cglobal prores_idct_put_10_%1, 4, 4, %2
%macro idct_put_fn 1
cglobal prores_idct_put_10, 4, 4, %1
movsxd r1, r1d
pxor m15, m15 ; zero
@@ -252,7 +251,7 @@ cglobal prores_idct_put_10_%1, 4, 4, %2
pmullw m13,[r3+64]
pmullw m12,[r3+96]
IDCT_1D row, 15, %1
IDCT_1D row, 15
; transpose for second part of IDCT
TRANSPOSE8x8W 8, 0, 1, 2, 4, 11, 9, 10, 3
@@ -267,7 +266,7 @@ cglobal prores_idct_put_10_%1, 4, 4, %2
; for (i = 0; i < 8; i++)
; idctSparseColAdd(dest + i, line_size, block + i);
IDCT_1D col, 18, %1
IDCT_1D col, 18
; clip/store
mova m3, [pw_4]
@@ -302,13 +301,27 @@ cglobal prores_idct_put_10_%1, 4, 4, %2
RET
%endmacro
INIT_XMM
idct_put_fn sse2, 16
INIT_XMM
idct_put_fn sse4, 16
%macro SIGNEXTEND 2-3 ; dstlow, dsthigh, tmp
%if cpuflag(sse4)
movhlps %2, %1
pmovsxwd %1, %1
pmovsxwd %2, %2
%else ; sse2
pxor %3, %3
pcmpgtw %3, %1
mova %2, %1
punpcklwd %1, %3
punpckhwd %2, %3
%endif
%endmacro
INIT_XMM sse2
idct_put_fn 16
INIT_XMM sse4
idct_put_fn 16
%if HAVE_AVX
INIT_AVX
idct_put_fn avx, 16
INIT_XMM avx
idct_put_fn 16
%endif
%endif
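For reference, a scalar C model of the two SIGNEXTEND paths (a sketch, not part of the patch): the SSE4 path widens words directly with pmovsxwd, while the SSE2 fallback builds an all-ones mask for negative words with pcmpgtw and interleaves it above the data:

/* scalar model of the SSE2 fallback; the cast back to int32_t assumes
 * two's complement, as on all targeted platforms */
static inline int32_t sext16_sse2_model(int16_t x)
{
    int32_t mask = (0 > x) ? -1 : 0;                        /* pcmpgtw: 0 > word */
    return (int32_t)(((uint32_t)mask << 16) | (uint16_t)x); /* punpcklwd */
}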
@@ -578,11 +578,44 @@ static void swap_samplerates(AVFilterGraph *graph)
swap_samplerates_on_filter(graph->filters[i]);
}
#define CH_CENTER_PAIR (AV_CH_FRONT_LEFT_OF_CENTER | AV_CH_FRONT_RIGHT_OF_CENTER)
#define CH_FRONT_PAIR (AV_CH_FRONT_LEFT | AV_CH_FRONT_RIGHT)
#define CH_STEREO_PAIR (AV_CH_STEREO_LEFT | AV_CH_STEREO_RIGHT)
#define CH_WIDE_PAIR (AV_CH_WIDE_LEFT | AV_CH_WIDE_RIGHT)
#define CH_SIDE_PAIR (AV_CH_SIDE_LEFT | AV_CH_SIDE_RIGHT)
#define CH_DIRECT_PAIR (AV_CH_SURROUND_DIRECT_LEFT | AV_CH_SURROUND_DIRECT_RIGHT)
#define CH_BACK_PAIR (AV_CH_BACK_LEFT | AV_CH_BACK_RIGHT)
/* allowable substitutions for channel pairs when comparing layouts,
* ordered by priority for both values */
static const uint64_t ch_subst[][2] = {
{ CH_FRONT_PAIR, CH_CENTER_PAIR },
{ CH_FRONT_PAIR, CH_WIDE_PAIR },
{ CH_FRONT_PAIR, AV_CH_FRONT_CENTER },
{ CH_CENTER_PAIR, CH_FRONT_PAIR },
{ CH_CENTER_PAIR, CH_WIDE_PAIR },
{ CH_CENTER_PAIR, AV_CH_FRONT_CENTER },
{ CH_WIDE_PAIR, CH_FRONT_PAIR },
{ CH_WIDE_PAIR, CH_CENTER_PAIR },
{ CH_WIDE_PAIR, AV_CH_FRONT_CENTER },
{ AV_CH_FRONT_CENTER, CH_FRONT_PAIR },
{ AV_CH_FRONT_CENTER, CH_CENTER_PAIR },
{ AV_CH_FRONT_CENTER, CH_WIDE_PAIR },
{ CH_SIDE_PAIR, CH_DIRECT_PAIR },
{ CH_SIDE_PAIR, CH_BACK_PAIR },
{ CH_SIDE_PAIR, AV_CH_BACK_CENTER },
{ CH_BACK_PAIR, CH_DIRECT_PAIR },
{ CH_BACK_PAIR, CH_SIDE_PAIR },
{ CH_BACK_PAIR, AV_CH_BACK_CENTER },
{ AV_CH_BACK_CENTER, CH_BACK_PAIR },
{ AV_CH_BACK_CENTER, CH_DIRECT_PAIR },
{ AV_CH_BACK_CENTER, CH_SIDE_PAIR },
};
static void swap_channel_layouts_on_filter(AVFilterContext *filter)
{
AVFilterLink *link = NULL;
uint64_t chlayout;
int i, j;
int i, j, k;
for (i = 0; i < filter->nb_inputs; i++) {
link = filter->inputs[i];
@@ -594,27 +627,55 @@ static void swap_channel_layouts_on_filter(AVFilterContext *filter)
if (i == filter->nb_inputs)
return;
chlayout = link->out_channel_layouts->channel_layouts[0];
for (i = 0; i < filter->nb_outputs; i++) {
AVFilterLink *outlink = filter->outputs[i];
int best_idx, best_score = INT_MIN;
int best_idx, best_score = INT_MIN, best_count_diff = INT_MAX;
if (outlink->type != AVMEDIA_TYPE_AUDIO ||
outlink->in_channel_layouts->nb_channel_layouts < 2)
continue;
for (j = 0; j < outlink->in_channel_layouts->nb_channel_layouts; j++) {
uint64_t in_chlayout = link->out_channel_layouts->channel_layouts[0];
uint64_t out_chlayout = outlink->in_channel_layouts->channel_layouts[j];
int matched_channels = av_get_channel_layout_nb_channels(chlayout &
out_chlayout);
int extra_channels = av_get_channel_layout_nb_channels(out_chlayout &
(~chlayout));
int score = matched_channels - extra_channels;
int in_channels = av_get_channel_layout_nb_channels(in_chlayout);
int out_channels = av_get_channel_layout_nb_channels(out_chlayout);
int count_diff = out_channels - in_channels;
int matched_channels, extra_channels;
int score = 0;
/* channel substitution */
for (k = 0; k < FF_ARRAY_ELEMS(ch_subst); k++) {
uint64_t cmp0 = ch_subst[k][0];
uint64_t cmp1 = ch_subst[k][1];
if (( in_chlayout & cmp0) && (!(out_chlayout & cmp0)) &&
(out_chlayout & cmp1) && (!( in_chlayout & cmp1))) {
in_chlayout &= ~cmp0;
out_chlayout &= ~cmp1;
/* add score for channel match, minus a deduction for
having to do the substitution */
score += 10 * av_get_channel_layout_nb_channels(cmp1) - 2;
}
}
if (score > best_score) {
/* no penalty for LFE channel mismatch */
if ( (in_chlayout & AV_CH_LOW_FREQUENCY) &&
(out_chlayout & AV_CH_LOW_FREQUENCY))
score += 10;
in_chlayout &= ~AV_CH_LOW_FREQUENCY;
out_chlayout &= ~AV_CH_LOW_FREQUENCY;
matched_channels = av_get_channel_layout_nb_channels(in_chlayout &
out_chlayout);
extra_channels = av_get_channel_layout_nb_channels(out_chlayout &
(~in_chlayout));
score += 10 * matched_channels - 5 * extra_channels;
if (score > best_score ||
(count_diff < best_count_diff && score == best_score)) {
best_score = score;
best_idx = j;
best_count_diff = count_diff;
}
}
FFSWAP(uint64_t, outlink->in_channel_layouts->channel_layouts[0],
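A worked example of the new scoring: matching stereo (FL|FR) against a 5.1 candidate (FL|FR|FC|LFE|BL|BR), no substitution rule fires because the front pair is present on both sides, the LFE is masked out of both layouts at no cost, and the score is 10*2 matched - 5*3 extra = 5; a plain stereo candidate scores 10*2 = 20 and wins, with count_diff breaking ties between equal scores in favour of the smaller channel-count difference.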
@@ -515,6 +515,12 @@ static int gen_pong(URLContext *s, RTMPContext *rt, RTMPPacket *ppkt)
uint8_t *p;
int ret;
if (ppkt->data_size < 6) {
av_log(s, AV_LOG_ERROR, "Too short ping packet (%d)\n",
ppkt->data_size);
return AVERROR_INVALIDDATA;
}
if ((ret = ff_rtmp_packet_create(&pkt, RTMP_NETWORK_CHANNEL, RTMP_PT_PING,
ppkt->timestamp + 1, 6)) < 0)
return ret;
@@ -885,9 +891,9 @@ static int handle_chunk_size(URLContext *s, RTMPPacket *pkt)
RTMPContext *rt = s->priv_data;
int ret;
if (pkt->data_size != 4) {
if (pkt->data_size < 4) {
av_log(s, AV_LOG_ERROR,
"Chunk size change packet is not 4 bytes long (%d)\n",
"Too short chunk size change packet (%d)\n",
pkt->data_size);
return AVERROR_INVALIDDATA;
}
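All three RTMP fixes in this merge share one shape: validate data_size before any fixed-offset AV_RB16/AV_RB32 read. A sketch of the pattern with a hypothetical helper name (not in the patch):

static int require_packet_size(URLContext *s, RTMPPacket *pkt,
                               int need, const char *what)
{
    if (pkt->data_size < need) {
        av_log(s, AV_LOG_ERROR, "Too short %s packet (%d)\n",
               what, pkt->data_size);
        return AVERROR_INVALIDDATA;
    }
    return 0; /* safe to read `need` bytes from pkt->data */
}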
@@ -913,6 +919,12 @@ static int handle_ping(URLContext *s, RTMPPacket *pkt)
RTMPContext *rt = s->priv_data;
int t, ret;
if (pkt->data_size < 2) {
av_log(s, AV_LOG_ERROR, "Too short ping packet (%d)\n",
pkt->data_size);
return AVERROR_INVALIDDATA;
}
t = AV_RB16(pkt->data);
if (t == 6) {
if ((ret = gen_pong(s, rt, pkt)) < 0)
@@ -950,6 +962,13 @@ static int handle_server_bw(URLContext *s, RTMPPacket *pkt)
{
RTMPContext *rt = s->priv_data;
if (pkt->data_size < 4) {
av_log(s, AV_LOG_ERROR,
"Too short server bandwidth report packet (%d)\n",
pkt->data_size);
return AVERROR_INVALIDDATA;
}
rt->server_bw = AV_RB32(pkt->data);
if (rt->server_bw <= 0) {
av_log(s, AV_LOG_ERROR, "Incorrect server bandwidth %d\n",
@@ -246,9 +246,10 @@ static int handle_buffered_output(AVAudioResampleContext *avr,
return 0;
}
int avresample_convert(AVAudioResampleContext *avr, void **output,
int out_plane_size, int out_samples, void **input,
int in_plane_size, int in_samples)
int attribute_align_arg avresample_convert(AVAudioResampleContext *avr,
void **output, int out_plane_size,
int out_samples, void **input,
int in_plane_size, int in_samples)
{
AudioData input_buffer;
AudioData output_buffer;
@@ -145,12 +145,7 @@ cglobal conv_s32_to_flt, 3,3,3, dst, src, len
mova [dstq+lenq+mmsize], m2
add lenq, mmsize*2
jl .loop
%if mmsize == 32
vzeroupper
RET
%else
REP_RET
%endif
%endmacro
INIT_XMM sse2
@@ -218,12 +213,7 @@ cglobal conv_flt_to_s32, 3,3,5, dst, src, len
mova [dstq+lenq+3*mmsize], m3
add lenq, mmsize*4
jl .loop
%if mmsize == 32
vzeroupper
RET
%else
REP_RET
%endif
%endmacro
INIT_XMM sse2
@@ -51,12 +51,7 @@ cglobal mix_2_to_1_fltp_flt, 3,4,6, src, matrix, len, src1
add srcq, mmsize*2
sub lend, mmsize*2/4
jg .loop
%if mmsize == 32
vzeroupper
RET
%else
REP_RET
%endif
%endmacro
INIT_XMM sse
@@ -175,12 +170,7 @@ cglobal mix_1_to_2_fltp_flt, 3,5,4, src0, matrix0, len, src1, matrix1
add src0q, mmsize
sub lend, mmsize/4
jg .loop
%if mmsize == 32
vzeroupper
RET
%else
REP_RET
%endif
%endmacro
INIT_XMM sse
@@ -236,3 +226,296 @@ MIX_1_TO_2_S16P_FLT
INIT_XMM avx
MIX_1_TO_2_S16P_FLT
%endif
;-----------------------------------------------------------------------------
; void ff_mix_3_8_to_1_2_fltp/s16p_flt(float/int16_t **src, float **matrix,
; int len, int out_ch, int in_ch);
;-----------------------------------------------------------------------------
%macro MIX_3_8_TO_1_2_FLT 3 ; %1 = in channels, %2 = out channels, %3 = s16p or fltp
; define some names to make the code clearer
%assign in_channels %1
%assign out_channels %2
%assign stereo out_channels - 1
%ifidn %3, s16p
%assign is_s16 1
%else
%assign is_s16 0
%endif
; determine how many matrix elements must go on the stack vs. mmregs
%assign matrix_elements in_channels * out_channels
%if is_s16
%if stereo
%assign needed_mmregs 7
%else
%assign needed_mmregs 5
%endif
%else
%if stereo
%assign needed_mmregs 4
%else
%assign needed_mmregs 3
%endif
%endif
%assign matrix_elements_mm num_mmregs - needed_mmregs
%if matrix_elements < matrix_elements_mm
%assign matrix_elements_mm matrix_elements
%endif
%if matrix_elements_mm < matrix_elements
%assign matrix_elements_stack matrix_elements - matrix_elements_mm
%else
%assign matrix_elements_stack 0
%endif
cglobal mix_%1_to_%2_%3_flt, 3,in_channels+2,needed_mmregs+matrix_elements_mm, src0, src1, len, src2, src3, src4, src5, src6, src7
; get aligned stack space if needed
%if matrix_elements_stack > 0
%if mmsize == 32
%assign bkpreg %1 + 1
%define bkpq r %+ bkpreg %+ q
mov bkpq, rsp
and rsp, ~(mmsize-1)
sub rsp, matrix_elements_stack * mmsize
%else
%assign pad matrix_elements_stack * mmsize + (mmsize - gprsize) - (stack_offset & (mmsize - gprsize))
SUB rsp, pad
%endif
%endif
; load matrix pointers
%define matrix0q r1q
%define matrix1q r3q
%if stereo
mov matrix1q, [matrix0q+gprsize]
%endif
mov matrix0q, [matrix0q]
; define matrix coeff names
%assign %%i 0
%assign %%j needed_mmregs
%rep in_channels
%if %%i >= matrix_elements_mm
CAT_XDEFINE mx_stack_0_, %%i, 1
CAT_XDEFINE mx_0_, %%i, [rsp+(%%i-matrix_elements_mm)*mmsize]
%else
CAT_XDEFINE mx_stack_0_, %%i, 0
CAT_XDEFINE mx_0_, %%i, m %+ %%j
%assign %%j %%j+1
%endif
%assign %%i %%i+1
%endrep
%if stereo
%assign %%i 0
%rep in_channels
%if in_channels + %%i >= matrix_elements_mm
CAT_XDEFINE mx_stack_1_, %%i, 1
CAT_XDEFINE mx_1_, %%i, [rsp+(in_channels+%%i-matrix_elements_mm)*mmsize]
%else
CAT_XDEFINE mx_stack_1_, %%i, 0
CAT_XDEFINE mx_1_, %%i, m %+ %%j
%assign %%j %%j+1
%endif
%assign %%i %%i+1
%endrep
%endif
; load/splat matrix coeffs
%assign %%i 0
%rep in_channels
%if mx_stack_0_ %+ %%i
VBROADCASTSS m0, [matrix0q+4*%%i]
mova mx_0_ %+ %%i, m0
%else
VBROADCASTSS mx_0_ %+ %%i, [matrix0q+4*%%i]
%endif
%if stereo
%if mx_stack_1_ %+ %%i
VBROADCASTSS m0, [matrix1q+4*%%i]
mova mx_1_ %+ %%i, m0
%else
VBROADCASTSS mx_1_ %+ %%i, [matrix1q+4*%%i]
%endif
%endif
%assign %%i %%i+1
%endrep
; load channel pointers to registers as offsets from the first channel pointer
%if ARCH_X86_64
movsxd lenq, r2d
%endif
shl lenq, 2-is_s16
%assign %%i 1
%rep (in_channels - 1)
%if ARCH_X86_32 && in_channels >= 7 && %%i >= 5
mov src5q, [src0q+%%i*gprsize]
add src5q, lenq
mov src %+ %%i %+ m, src5q
%else
mov src %+ %%i %+ q, [src0q+%%i*gprsize]
add src %+ %%i %+ q, lenq
%endif
%assign %%i %%i+1
%endrep
mov src0q, [src0q]
add src0q, lenq
neg lenq
.loop:
; for x86-32 with 7-8 channels we do not have enough gp registers for all src
; pointers, so we have to load some of them from the stack each time
%define copy_src_from_stack ARCH_X86_32 && in_channels >= 7 && %%i >= 5
%if is_s16
; mix with s16p input
mova m0, [src0q+lenq]
S16_TO_S32_SX 0, 1
cvtdq2ps m0, m0
cvtdq2ps m1, m1
%if stereo
mulps m2, m0, mx_1_0
mulps m3, m1, mx_1_0
%endif
mulps m0, m0, mx_0_0
mulps m1, m1, mx_0_0
%assign %%i 1
%rep (in_channels - 1)
%if copy_src_from_stack
%define src_ptr src5q
%else
%define src_ptr src %+ %%i %+ q
%endif
%if stereo
%if copy_src_from_stack
mov src_ptr, src %+ %%i %+ m
%endif
mova m4, [src_ptr+lenq]
S16_TO_S32_SX 4, 5
cvtdq2ps m4, m4
cvtdq2ps m5, m5
fmaddps m2, m4, mx_1_ %+ %%i, m2, m6
fmaddps m3, m5, mx_1_ %+ %%i, m3, m6
fmaddps m0, m4, mx_0_ %+ %%i, m0, m4
fmaddps m1, m5, mx_0_ %+ %%i, m1, m5
%else
%if copy_src_from_stack
mov src_ptr, src %+ %%i %+ m
%endif
mova m2, [src_ptr+lenq]
S16_TO_S32_SX 2, 3
cvtdq2ps m2, m2
cvtdq2ps m3, m3
fmaddps m0, m2, mx_0_ %+ %%i, m0, m4
fmaddps m1, m3, mx_0_ %+ %%i, m1, m4
%endif
%assign %%i %%i+1
%endrep
%if stereo
cvtps2dq m2, m2
cvtps2dq m3, m3
packssdw m2, m3
mova [src1q+lenq], m2
%endif
cvtps2dq m0, m0
cvtps2dq m1, m1
packssdw m0, m1
mova [src0q+lenq], m0
%else
; mix with fltp input
%if stereo || mx_stack_0_0
mova m0, [src0q+lenq]
%endif
%if stereo
mulps m1, m0, mx_1_0
%endif
%if stereo || mx_stack_0_0
mulps m0, m0, mx_0_0
%else
mulps m0, [src0q+lenq], mx_0_0
%endif
%assign %%i 1
%rep (in_channels - 1)
%if copy_src_from_stack
%define src_ptr src5q
mov src_ptr, src %+ %%i %+ m
%else
%define src_ptr src %+ %%i %+ q
%endif
; avoid extra load for mono if matrix is in a mm register
%if stereo || mx_stack_0_ %+ %%i
mova m2, [src_ptr+lenq]
%endif
%if stereo
fmaddps m1, m2, mx_1_ %+ %%i, m1, m3
%endif
%if stereo || mx_stack_0_ %+ %%i
fmaddps m0, m2, mx_0_ %+ %%i, m0, m2
%else
fmaddps m0, mx_0_ %+ %%i, [src_ptr+lenq], m0, m1
%endif
%assign %%i %%i+1
%endrep
mova [src0q+lenq], m0
%if stereo
mova [src1q+lenq], m1
%endif
%endif
add lenq, mmsize
jl .loop
; restore stack pointer
%if matrix_elements_stack > 0
%if mmsize == 32
mov rsp, bkpq
%else
ADD rsp, pad
%endif
%endif
; zero ymm high halves
%if mmsize == 32
vzeroupper
%endif
RET
%endmacro
%macro MIX_3_8_TO_1_2_FLT_FUNCS 0
%assign %%i 3
%rep 6
INIT_XMM sse
MIX_3_8_TO_1_2_FLT %%i, 1, fltp
MIX_3_8_TO_1_2_FLT %%i, 2, fltp
INIT_XMM sse2
MIX_3_8_TO_1_2_FLT %%i, 1, s16p
MIX_3_8_TO_1_2_FLT %%i, 2, s16p
INIT_XMM sse4
MIX_3_8_TO_1_2_FLT %%i, 1, s16p
MIX_3_8_TO_1_2_FLT %%i, 2, s16p
; do not use ymm AVX or FMA4 in x86-32 for 6 or more channels due to stack alignment issues
%if HAVE_AVX
%if ARCH_X86_64 || %%i < 6
INIT_YMM avx
%else
INIT_XMM avx
%endif
MIX_3_8_TO_1_2_FLT %%i, 1, fltp
MIX_3_8_TO_1_2_FLT %%i, 2, fltp
INIT_XMM avx
MIX_3_8_TO_1_2_FLT %%i, 1, s16p
MIX_3_8_TO_1_2_FLT %%i, 2, s16p
%endif
%if HAVE_FMA4
%if ARCH_X86_64 || %%i < 6
INIT_YMM fma4
%else
INIT_XMM fma4
%endif
MIX_3_8_TO_1_2_FLT %%i, 1, fltp
MIX_3_8_TO_1_2_FLT %%i, 2, fltp
INIT_XMM fma4
MIX_3_8_TO_1_2_FLT %%i, 1, s16p
MIX_3_8_TO_1_2_FLT %%i, 2, s16p
%endif
%assign %%i %%i+1
%endrep
%endmacro
MIX_3_8_TO_1_2_FLT_FUNCS
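For orientation, a scalar C model of what these functions compute (a sketch, not from the patch): each output channel is a matrix-weighted sum of the input channels, written in place over the first one or two source planes, matching the src0q/src1q stores above. The s16p variants additionally convert to float on load and saturate back to int16_t on store (cvtdq2ps, then cvtps2dq + packssdw).

static void mix_fltp_flt_model(float **src, float **matrix,
                               int len, int out_ch, int in_ch)
{
    for (int i = 0; i < len; i++) {
        float sum0 = 0.0f, sum1 = 0.0f;
        for (int c = 0; c < in_ch; c++) {
            sum0 += matrix[0][c] * src[c][i];
            if (out_ch == 2)
                sum1 += matrix[1][c] * src[c][i];
        }
        src[0][i] = sum0;        /* in-place, like the asm */
        if (out_ch == 2)
            src[1][i] = sum1;
    }
}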
@@ -47,6 +47,129 @@ extern void ff_mix_1_to_2_s16p_flt_sse4(int16_t **src, float **matrix, int len,
extern void ff_mix_1_to_2_s16p_flt_avx (int16_t **src, float **matrix, int len,
int out_ch, int in_ch);
#define DEFINE_MIX_3_8_TO_1_2(chan) \
extern void ff_mix_ ## chan ## _to_1_fltp_flt_sse(float **src, \
float **matrix, int len, \
int out_ch, int in_ch); \
extern void ff_mix_ ## chan ## _to_2_fltp_flt_sse(float **src, \
float **matrix, int len, \
int out_ch, int in_ch); \
\
extern void ff_mix_ ## chan ## _to_1_s16p_flt_sse2(int16_t **src, \
float **matrix, int len, \
int out_ch, int in_ch); \
extern void ff_mix_ ## chan ## _to_2_s16p_flt_sse2(int16_t **src, \
float **matrix, int len, \
int out_ch, int in_ch); \
\
extern void ff_mix_ ## chan ## _to_1_s16p_flt_sse4(int16_t **src, \
float **matrix, int len, \
int out_ch, int in_ch); \
extern void ff_mix_ ## chan ## _to_2_s16p_flt_sse4(int16_t **src, \
float **matrix, int len, \
int out_ch, int in_ch); \
\
extern void ff_mix_ ## chan ## _to_1_fltp_flt_avx(float **src, \
float **matrix, int len, \
int out_ch, int in_ch); \
extern void ff_mix_ ## chan ## _to_2_fltp_flt_avx(float **src, \
float **matrix, int len, \
int out_ch, int in_ch); \
\
extern void ff_mix_ ## chan ## _to_1_s16p_flt_avx(int16_t **src, \
float **matrix, int len, \
int out_ch, int in_ch); \
extern void ff_mix_ ## chan ## _to_2_s16p_flt_avx(int16_t **src, \
float **matrix, int len, \
int out_ch, int in_ch); \
\
extern void ff_mix_ ## chan ## _to_1_fltp_flt_fma4(float **src, \
float **matrix, int len, \
int out_ch, int in_ch); \
extern void ff_mix_ ## chan ## _to_2_fltp_flt_fma4(float **src, \
float **matrix, int len, \
int out_ch, int in_ch); \
\
extern void ff_mix_ ## chan ## _to_1_s16p_flt_fma4(int16_t **src, \
float **matrix, int len, \
int out_ch, int in_ch); \
extern void ff_mix_ ## chan ## _to_2_s16p_flt_fma4(int16_t **src, \
float **matrix, int len, \
int out_ch, int in_ch);
DEFINE_MIX_3_8_TO_1_2(3)
DEFINE_MIX_3_8_TO_1_2(4)
DEFINE_MIX_3_8_TO_1_2(5)
DEFINE_MIX_3_8_TO_1_2(6)
DEFINE_MIX_3_8_TO_1_2(7)
DEFINE_MIX_3_8_TO_1_2(8)
#define SET_MIX_3_8_TO_1_2(chan) \
if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE) { \
ff_audio_mix_set_func(am, AV_SAMPLE_FMT_FLTP, AV_MIX_COEFF_TYPE_FLT,\
chan, 1, 16, 4, "SSE", \
ff_mix_ ## chan ## _to_1_fltp_flt_sse); \
ff_audio_mix_set_func(am, AV_SAMPLE_FMT_FLTP, AV_MIX_COEFF_TYPE_FLT,\
chan, 2, 16, 4, "SSE", \
ff_mix_## chan ##_to_2_fltp_flt_sse); \
} \
if (mm_flags & AV_CPU_FLAG_SSE2 && HAVE_SSE) { \
ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,\
chan, 1, 16, 8, "SSE2", \
ff_mix_ ## chan ## _to_1_s16p_flt_sse2); \
ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,\
chan, 2, 16, 8, "SSE2", \
ff_mix_ ## chan ## _to_2_s16p_flt_sse2); \
} \
if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE) { \
ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,\
chan, 1, 16, 8, "SSE4", \
ff_mix_ ## chan ## _to_1_s16p_flt_sse4); \
ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,\
chan, 2, 16, 8, "SSE4", \
ff_mix_ ## chan ## _to_2_s16p_flt_sse4); \
} \
if (mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) { \
int ptr_align = 32; \
int smp_align = 8; \
if (ARCH_X86_32 || chan >= 6) { \
ptr_align = 16; \
smp_align = 4; \
} \
ff_audio_mix_set_func(am, AV_SAMPLE_FMT_FLTP, AV_MIX_COEFF_TYPE_FLT,\
chan, 1, ptr_align, smp_align, "AVX", \
ff_mix_ ## chan ## _to_1_fltp_flt_avx); \
ff_audio_mix_set_func(am, AV_SAMPLE_FMT_FLTP, AV_MIX_COEFF_TYPE_FLT,\
chan, 2, ptr_align, smp_align, "AVX", \
ff_mix_ ## chan ## _to_2_fltp_flt_avx); \
ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,\
chan, 1, 16, 8, "AVX", \
ff_mix_ ## chan ## _to_1_s16p_flt_avx); \
ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,\
chan, 2, 16, 8, "AVX", \
ff_mix_ ## chan ## _to_2_s16p_flt_avx); \
} \
if (mm_flags & AV_CPU_FLAG_FMA4 && HAVE_FMA4) { \
int ptr_align = 32; \
int smp_align = 8; \
if (ARCH_X86_32 || chan >= 6) { \
ptr_align = 16; \
smp_align = 4; \
} \
ff_audio_mix_set_func(am, AV_SAMPLE_FMT_FLTP, AV_MIX_COEFF_TYPE_FLT,\
chan, 1, ptr_align, smp_align, "FMA4", \
ff_mix_ ## chan ## _to_1_fltp_flt_fma4); \
ff_audio_mix_set_func(am, AV_SAMPLE_FMT_FLTP, AV_MIX_COEFF_TYPE_FLT,\
chan, 2, ptr_align, smp_align, "FMA4", \
ff_mix_ ## chan ## _to_2_fltp_flt_fma4); \
ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,\
chan, 1, 16, 8, "FMA4", \
ff_mix_ ## chan ## _to_1_s16p_flt_fma4); \
ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,\
chan, 2, 16, 8, "FMA4", \
ff_mix_ ## chan ## _to_2_s16p_flt_fma4); \
}
av_cold void ff_audio_mix_init_x86(AudioMix *am)
{
#if HAVE_YASM
@@ -80,5 +203,12 @@ av_cold void ff_audio_mix_init_x86(AudioMix *am)
ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,
1, 2, 16, 8, "AVX", ff_mix_1_to_2_s16p_flt_avx);
}
SET_MIX_3_8_TO_1_2(3)
SET_MIX_3_8_TO_1_2(4)
SET_MIX_3_8_TO_1_2(5)
SET_MIX_3_8_TO_1_2(6)
SET_MIX_3_8_TO_1_2(7)
SET_MIX_3_8_TO_1_2(8)
#endif
}
@@ -26,7 +26,8 @@
pmovsxwd m%1, m%1
SWAP %1, %2
%else
punpckhwd m%2, m%1
mova m%2, m%1
punpckhwd m%2, m%2
punpcklwd m%1, m%1
psrad m%2, 16
psrad m%1, 16
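A scalar sketch of the improved non-SSE4 path (assumption: arithmetic right shift on signed values, as on the targeted compilers): each 16-bit sample is duplicated into both halves of a 32-bit lane and shifted right by 16, which discards the duplicate and sign-extends in one step, using only unpacks and shifts:

static inline int32_t s16_to_s32_sx_model(int16_t x)
{
    uint32_t lane = ((uint32_t)(uint16_t)x << 16) | (uint16_t)x; /* punpck*wd with itself */
    return (int32_t)lane >> 16;                                  /* psrad by 16 */
}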
@@ -797,11 +797,10 @@ int main(int argc, char **argv)
av_expr_parse_and_eval(&d, *expr,
const_names, const_values,
NULL, NULL, NULL, NULL, NULL, 0, NULL);
if(isnan(d)){
if (isnan(d))
printf("'%s' -> nan\n\n", *expr);
}else{
else
printf("'%s' -> %f\n\n", *expr, d);
}
}
av_expr_parse_and_eval(&d, "1+(5-2)^(3-1)+1/2+sin(PI)-max(-2.2,-3.1)",
@@ -42,12 +42,7 @@ ALIGN 16
sub lenq, 2*mmsize
jge .loop
%if mmsize == 32
vzeroupper
RET
%else
REP_RET
%endif
%endmacro
INIT_XMM sse
@@ -88,12 +83,7 @@ cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len
mova [dstq+lenq+mmsize], m2
sub lenq, 2*mmsize
jge .loop
%if mmsize == 32
vzeroupper
RET
%else
REP_RET
%endif
%endmacro
INIT_XMM sse
@@ -392,11 +392,14 @@ DECLARE_REG 14, R15, R15D, R15W, R15B, 120
%macro RET 0
WIN64_RESTORE_XMM_INTERNAL rsp
POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
%if mmsize == 32
vzeroupper
%endif
ret
%endmacro
%macro REP_RET 0
%if regs_used > 7 || xmm_regs_used > 6
%if regs_used > 7 || xmm_regs_used > 6 || mmsize == 32
RET
%else
rep ret
@@ -433,11 +436,14 @@ DECLARE_REG 14, R15, R15D, R15W, R15B, 72
%macro RET 0
POP_IF_USED 14, 13, 12, 11, 10, 9
%if mmsize == 32
vzeroupper
%endif
ret
%endmacro
%macro REP_RET 0
%if regs_used > 9
%if regs_used > 9 || mmsize == 32
RET
%else
rep ret
@@ -479,11 +485,14 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
%macro RET 0
POP_IF_USED 6, 5, 4, 3
%if mmsize == 32
vzeroupper
%endif
ret
%endmacro
%macro REP_RET 0
%if regs_used > 3
%if regs_used > 3 || mmsize == 32
RET
%else
rep ret
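These x86inc changes centralize the cleanup: RET now emits vzeroupper for every ymm function, and REP_RET forwards to RET when mmsize == 32, which is why the per-function %if mmsize == 32 blocks are deleted throughout this merge. A minimal intrinsics illustration of the underlying issue (a sketch, not from the patch): returning to SSE code with dirty upper ymm halves triggers an AVX-to-SSE transition penalty on Sandy Bridge era CPUs.

#include <immintrin.h>

void scale_avx(float *dst, const float *src, int n) /* assumes n % 8 == 0 */
{
    const __m256 half = _mm256_set1_ps(0.5f);
    for (int i = 0; i < n; i += 8)
        _mm256_storeu_ps(dst + i, _mm256_mul_ps(_mm256_loadu_ps(src + i), half));
    _mm256_zeroupper(); /* the intrinsics equivalent of what RET now emits */
}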
@@ -1126,16 +1135,22 @@ AVX_INSTR pfmul, 1, 0, 1
%undef j
%macro FMA_INSTR 3
%macro %1 4-7 %1, %2, %3
%if cpuflag(xop)
v%5 %1, %2, %3, %4
%macro %1 5-8 %1, %2, %3
%if cpuflag(xop) || cpuflag(fma4)
v%6 %1, %2, %3, %4
%else
%6 %1, %2, %3
%7 %1, %4
%ifidn %1, %4
%7 %5, %2, %3
%8 %1, %4, %5
%else
%7 %1, %2, %3
%8 %1, %4
%endif
%endif
%endmacro
%endmacro
FMA_INSTR fmaddps, mulps, addps
FMA_INSTR pmacsdd, pmulld, paddd
FMA_INSTR pmacsww, pmullw, paddw
FMA_INSTR pmadcswd, pmaddwd, paddd
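A rough C model of the new five-operand expansion (a sketch; the real macro operates on registers): with XOP or FMA4 a single vfmaddps computes dst = s1*s2 + s3, and the mul+add fallback now routes through the extra temporary operand whenever the destination aliases the addend, so the addend is no longer clobbered before it is added:

static inline void fmaddps_model(float *dst, const float *s1, const float *s2,
                                 const float *s3, float *tmp, int have_fma4)
{
    if (have_fma4) {
        *dst = *s1 * *s2 + *s3;  /* vfmaddps dst, s1, s2, s3 */
    } else if (dst == s3) {
        *tmp = *s1 * *s2;        /* mulps tmp, s1, s2 */
        *dst = *s3 + *tmp;       /* addps dst, s3, tmp */
    } else {
        *dst  = *s1 * *s2;       /* mulps dst, s1, s2 */
        *dst += *s3;             /* addps dst, s3 */
    }
}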
@@ -15,9 +15,6 @@ ffservertest: ffserver$(EXESUF) tests/vsynth1/00.pgm tests/data/asynth1.sw
OBJDIRS += tests/data tests/vsynth1
# Required due to missing automatic dependency tracking for HOSTOBJS.
tests/rotozoom.o tests/videogen.o: tests/utils.c
tests/vsynth1/00.pgm: tests/videogen$(HOSTEXESUF) | tests/vsynth1
$(M)./$< 'tests/vsynth1/'