Commit 59216e05 authored by Claudio Freire, committed by Michael Niedermayer

AAC Encoder: clipping avoidance

Prevent clipping caused by quantization noise from producing
audible artifacts, by detecting near-clipping signals and both
attenuating them a little and encoding escape-encoded bands
(usually the loudest) with rounding towards zero instead of to
nearest, which tends to decrease overall energy and thus clipping.

Currently the FATE tests measure numerical error, so this change makes
tests using asynth (which are near clipping) report higher error,
not lower, because of the window attenuation. Yet they sound better,
not worse (the difference is subtle here; on other samples it is not subtle at all).
Only a psychoacoustically weighted error measure would make for
a representative test, so that will be left for a future patch.
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
parent c8c86b8f
...@@ -50,6 +50,8 @@ ...@@ -50,6 +50,8 @@
#define TNS_MAX_ORDER 20 #define TNS_MAX_ORDER 20
#define MAX_LTP_LONG_SFB 40 #define MAX_LTP_LONG_SFB 40
#define CLIP_AVOIDANCE_FACTOR 0.95f
enum RawDataBlockType { enum RawDataBlockType {
TYPE_SCE, TYPE_SCE,
TYPE_CPE, TYPE_CPE,
...@@ -180,6 +182,8 @@ typedef struct IndividualChannelStream { ...@@ -180,6 +182,8 @@ typedef struct IndividualChannelStream {
int predictor_initialized; int predictor_initialized;
int predictor_reset_group; int predictor_reset_group;
uint8_t prediction_used[41]; uint8_t prediction_used[41];
uint8_t window_clipping[8]; ///< set if a certain window is near clipping
float clip_avoidance_factor; ///< set if any window is near clipping to the necessary attenuation factor to avoid it
} IndividualChannelStream; } IndividualChannelStream;
/** /**
......
This diff is collapsed.
...@@ -472,12 +472,32 @@ static void encode_spectral_coeffs(AACEncContext *s, SingleChannelElement *sce) ...@@ -472,12 +472,32 @@ static void encode_spectral_coeffs(AACEncContext *s, SingleChannelElement *sce)
sce->ics.swb_sizes[i], sce->ics.swb_sizes[i],
sce->sf_idx[w*16 + i], sce->sf_idx[w*16 + i],
sce->band_type[w*16 + i], sce->band_type[w*16 + i],
s->lambda); s->lambda, sce->ics.window_clipping[w]);
start += sce->ics.swb_sizes[i]; start += sce->ics.swb_sizes[i];
} }
} }
} }
/**
 * Attenuate the spectral coefficients of a channel flagged as near-clipping.
 *
 * Every coefficient in the first ics.max_sfb scalefactor bands of each of the
 * ics.num_windows windows is multiplied by ics.clip_avoidance_factor.
 * Nothing is modified unless that factor is below 1.0.
 *
 * @param s   encoder context (not used by this function)
 * @param sce channel element whose coeffs[] are scaled in place
 */
static void avoid_clipping(AACEncContext *s, SingleChannelElement *sce)
{
    const float gain = sce->ics.clip_avoidance_factor;
    int win, band, k;

    /* Negated '<' rather than '>=' so that a NaN factor remains a no-op. */
    if (!(gain < 1.0f))
        return;

    for (win = 0; win < sce->ics.num_windows; win++) {
        /* Windows are laid out in coeffs[] at a stride of 128 entries. */
        float *coef = sce->coeffs + win * 128;

        for (band = 0; band < sce->ics.max_sfb; band++) {
            const int width = sce->ics.swb_sizes[band];

            for (k = 0; k < width; k++)
                coef[k] *= gain;
            /* Step to the first coefficient of the next band. */
            coef += width;
        }
    }
}
/** /**
* Encode one channel of audio data. * Encode one channel of audio data.
*/ */
...@@ -578,6 +598,7 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt, ...@@ -578,6 +598,7 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
for (ch = 0; ch < chans; ch++) { for (ch = 0; ch < chans; ch++) {
IndividualChannelStream *ics = &cpe->ch[ch].ics; IndividualChannelStream *ics = &cpe->ch[ch].ics;
int cur_channel = start_ch + ch; int cur_channel = start_ch + ch;
float clip_avoidance_factor;
overlap = &samples[cur_channel][0]; overlap = &samples[cur_channel][0];
samples2 = overlap + 1024; samples2 = overlap + 1024;
la = samples2 + (448+64); la = samples2 + (448+64);
...@@ -605,14 +626,29 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt, ...@@ -605,14 +626,29 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
ics->num_windows = wi[ch].num_windows; ics->num_windows = wi[ch].num_windows;
ics->swb_sizes = s->psy.bands [ics->num_windows == 8]; ics->swb_sizes = s->psy.bands [ics->num_windows == 8];
ics->num_swb = tag == TYPE_LFE ? ics->num_swb : s->psy.num_bands[ics->num_windows == 8]; ics->num_swb = tag == TYPE_LFE ? ics->num_swb : s->psy.num_bands[ics->num_windows == 8];
clip_avoidance_factor = 0.0f;
for (w = 0; w < ics->num_windows; w++) for (w = 0; w < ics->num_windows; w++)
ics->group_len[w] = wi[ch].grouping[w]; ics->group_len[w] = wi[ch].grouping[w];
for (w = 0; w < ics->num_windows; w++) {
if (wi[ch].clipping[w] > CLIP_AVOIDANCE_FACTOR) {
ics->window_clipping[w] = 1;
clip_avoidance_factor = FFMAX(clip_avoidance_factor, wi[ch].clipping[w]);
} else {
ics->window_clipping[w] = 0;
}
}
if (clip_avoidance_factor > CLIP_AVOIDANCE_FACTOR) {
ics->clip_avoidance_factor = CLIP_AVOIDANCE_FACTOR / clip_avoidance_factor;
} else {
ics->clip_avoidance_factor = 1.0f;
}
apply_window_and_mdct(s, &cpe->ch[ch], overlap); apply_window_and_mdct(s, &cpe->ch[ch], overlap);
if (isnan(cpe->ch->coeffs[0])) { if (isnan(cpe->ch->coeffs[0])) {
av_log(avctx, AV_LOG_ERROR, "Input contains NaN\n"); av_log(avctx, AV_LOG_ERROR, "Input contains NaN\n");
return AVERROR(EINVAL); return AVERROR(EINVAL);
} }
avoid_clipping(s, &cpe->ch[ch]);
} }
start_ch += chans; start_ch += chans;
} }
......
...@@ -54,7 +54,7 @@ typedef struct AACCoefficientsEncoder { ...@@ -54,7 +54,7 @@ typedef struct AACCoefficientsEncoder {
void (*encode_window_bands_info)(struct AACEncContext *s, SingleChannelElement *sce, void (*encode_window_bands_info)(struct AACEncContext *s, SingleChannelElement *sce,
int win, int group_len, const float lambda); int win, int group_len, const float lambda);
void (*quantize_and_encode_band)(struct AACEncContext *s, PutBitContext *pb, const float *in, int size, void (*quantize_and_encode_band)(struct AACEncContext *s, PutBitContext *pb, const float *in, int size,
int scale_idx, int cb, const float lambda); int scale_idx, int cb, const float lambda, int rtz);
void (*set_special_band_scalefactors)(struct AACEncContext *s, SingleChannelElement *sce); void (*set_special_band_scalefactors)(struct AACEncContext *s, SingleChannelElement *sce);
void (*search_for_pns)(struct AACEncContext *s, AVCodecContext *avctx, SingleChannelElement *sce, const float lambda); void (*search_for_pns)(struct AACEncContext *s, AVCodecContext *avctx, SingleChannelElement *sce, const float lambda);
void (*search_for_ms)(struct AACEncContext *s, ChannelElement *cpe, const float lambda); void (*search_for_ms)(struct AACEncContext *s, ChannelElement *cpe, const float lambda);
......
...@@ -837,6 +837,7 @@ static FFPsyWindowInfo psy_lame_window(FFPsyContext *ctx, const float *audio, ...@@ -837,6 +837,7 @@ static FFPsyWindowInfo psy_lame_window(FFPsyContext *ctx, const float *audio,
int grouping = 0; int grouping = 0;
int uselongblock = 1; int uselongblock = 1;
int attacks[AAC_NUM_BLOCKS_SHORT + 1] = { 0 }; int attacks[AAC_NUM_BLOCKS_SHORT + 1] = { 0 };
float clippings[AAC_NUM_BLOCKS_SHORT];
int i; int i;
FFPsyWindowInfo wi = { { 0 } }; FFPsyWindowInfo wi = { { 0 } };
...@@ -926,14 +927,35 @@ static FFPsyWindowInfo psy_lame_window(FFPsyContext *ctx, const float *audio, ...@@ -926,14 +927,35 @@ static FFPsyWindowInfo psy_lame_window(FFPsyContext *ctx, const float *audio,
lame_apply_block_type(pch, &wi, uselongblock); lame_apply_block_type(pch, &wi, uselongblock);
/* Calculate input sample maximums and evaluate clipping risk */
if (audio) {
for (i = 0; i < AAC_NUM_BLOCKS_SHORT; i++) {
const float *wbuf = audio + i * AAC_BLOCK_SIZE_SHORT;
float max = 0;
int j;
for (j = 0; j < AAC_BLOCK_SIZE_SHORT; j++)
max = FFMAX(max, fabsf(wbuf[j]));
clippings[i] = max;
}
} else {
for (i = 0; i < 8; i++)
clippings[i] = 0;
}
wi.window_type[1] = prev_type; wi.window_type[1] = prev_type;
if (wi.window_type[0] != EIGHT_SHORT_SEQUENCE) { if (wi.window_type[0] != EIGHT_SHORT_SEQUENCE) {
float clipping = 0.0f;
wi.num_windows = 1; wi.num_windows = 1;
wi.grouping[0] = 1; wi.grouping[0] = 1;
if (wi.window_type[0] == LONG_START_SEQUENCE) if (wi.window_type[0] == LONG_START_SEQUENCE)
wi.window_shape = 0; wi.window_shape = 0;
else else
wi.window_shape = 1; wi.window_shape = 1;
for (i = 0; i < 8; i++)
clipping = FFMAX(clipping, clippings[i]);
wi.clipping[0] = clipping;
} else { } else {
int lastgrp = 0; int lastgrp = 0;
...@@ -944,6 +966,14 @@ static FFPsyWindowInfo psy_lame_window(FFPsyContext *ctx, const float *audio, ...@@ -944,6 +966,14 @@ static FFPsyWindowInfo psy_lame_window(FFPsyContext *ctx, const float *audio,
lastgrp = i; lastgrp = i;
wi.grouping[lastgrp]++; wi.grouping[lastgrp]++;
} }
for (i = 0; i < 8; i += wi.grouping[i]) {
int w;
float clipping = 0.0f;
for (w = 0; w < wi.grouping[i] && !clipping; w++)
clipping = FFMAX(clipping, clippings[i+w]);
wi.clipping[i] = clipping;
}
} }
/* Determine grouping, based on the location of the first attack, and save for /* Determine grouping, based on the location of the first attack, and save for
......
...@@ -66,6 +66,7 @@ typedef struct FFPsyWindowInfo { ...@@ -66,6 +66,7 @@ typedef struct FFPsyWindowInfo {
int window_shape; ///< window shape (sine/KBD/whatever) int window_shape; ///< window shape (sine/KBD/whatever)
int num_windows; ///< number of windows in a frame int num_windows; ///< number of windows in a frame
int grouping[8]; ///< window grouping (for e.g. AAC) int grouping[8]; ///< window grouping (for e.g. AAC)
float clipping[8]; ///< maximum absolute normalized intensity in the given window for clip avoidance
int *window_sizes; ///< sequence of window sizes inside one frame (for eg. WMA) int *window_sizes; ///< sequence of window sizes inside one frame (for eg. WMA)
} FFPsyWindowInfo; } FFPsyWindowInfo;
......
...@@ -146,7 +146,7 @@ fate-aac-aref-encode: CMD = enc_dec_pcm adts wav s16le $(REF) -strict -2 -c:a aa ...@@ -146,7 +146,7 @@ fate-aac-aref-encode: CMD = enc_dec_pcm adts wav s16le $(REF) -strict -2 -c:a aa
fate-aac-aref-encode: CMP = stddev fate-aac-aref-encode: CMP = stddev
fate-aac-aref-encode: REF = ./tests/data/asynth-44100-2.wav fate-aac-aref-encode: REF = ./tests/data/asynth-44100-2.wav
fate-aac-aref-encode: CMP_SHIFT = -4096 fate-aac-aref-encode: CMP_SHIFT = -4096
fate-aac-aref-encode: CMP_TARGET = 434 fate-aac-aref-encode: CMP_TARGET = 594
fate-aac-aref-encode: SIZE_TOLERANCE = 2464 fate-aac-aref-encode: SIZE_TOLERANCE = 2464
fate-aac-aref-encode: FUZZ = 5 fate-aac-aref-encode: FUZZ = 5
...@@ -155,7 +155,7 @@ fate-aac-ln-encode: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-ref ...@@ -155,7 +155,7 @@ fate-aac-ln-encode: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-ref
fate-aac-ln-encode: CMP = stddev fate-aac-ln-encode: CMP = stddev
fate-aac-ln-encode: REF = $(SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav fate-aac-ln-encode: REF = $(SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav
fate-aac-ln-encode: CMP_SHIFT = -4096 fate-aac-ln-encode: CMP_SHIFT = -4096
fate-aac-ln-encode: CMP_TARGET = 65 fate-aac-ln-encode: CMP_TARGET = 68
fate-aac-ln-encode: SIZE_TOLERANCE = 3560 fate-aac-ln-encode: SIZE_TOLERANCE = 3560
FATE_AAC_LATM += fate-aac-latm_000000001180bc60 FATE_AAC_LATM += fate-aac-latm_000000001180bc60
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment