Commit 01ecb717 authored by Claudio Freire's avatar Claudio Freire

AAC encoder: Extensive improvements

This finalizes merging of the work in the patches in ticket #2686.

Improvements to twoloop and RC logic are extensive.

The non-exhaustive list of twoloop improvments includes:
 - Tweaks to distortion limits on the RD optimization phase of twoloop
 - Deeper search in twoloop
 - PNS information marking to let twoloop decide when to use it
   (turned out having the decision made separately wasn't working)
 - Tonal band detection and priorization
 - Better band energy conservation rules
 - Strict hole avoidance

For rate control:
 - Use psymodel's bit allocation to allow proper use of the bit
   reservoir. Don't work against the bit reservoir by moving lambda
   in the opposite direction when psymodel decides to allocate more/less
   bits to a frame.
 - Retry the encode if the effective rate lies outside a reasonable
   margin of psymodel's allocation or the selected ABR.
 - Log average lambda at the end. Useful info for everyone, but especially
   for tuning of the various encoder constants that relate to lambda
   feedback.

Psy:
 - Do not apply lowpass with a FIR filter, instead just let the coder
   zero bands above the cutoff. The FIR filter induces group delay,
   and while zeroing bands causes ripple, it's lost in the quantization
   noise.
 - Experimental VBR bit allocation code
 - Tweak automatic lowpass filter threshold to maximize audio bandwidth
   at all bitrates while still providing acceptable, stable quality.

I/S:
 - Phase decision fixes. Unrelated to #2686, but the bugs only surfaced
   when the merge was finalized. Measure I/S band energy accounting for
   phase, and prevent I/S and M/S from being applied both.

PNS:
 - Avoid marking short bands with PNS when they're part of a window
   group in which there's a large variation of energy from one window
   to the next. PNS can't preserve those and the effect is extremely
   noticeable.

M/S:
 - Implement BMLD protection similar to the specified in
   ISO-IEC/13818:7-2003, Appendix C Section 6.1. Since M/S decision
   doesn't conform to section 6.1, a different method had to be
   implemented, but should provide equivalent protection.
 - Move the decision logic closer to the method specified in
   ISO-IEC/13818:7-2003, Appendix C Section 6.1. Specifically,
   make sure M/S needs less bits than dual stereo.
 - Don't apply M/S in bands that are using I/S

Now, this of course needed adjustments in the compare targets and
fuzz factors of the AAC encoder's fate tests, but if wondering why
the targets go up (more distortion), consider the previous coder
was using too many bits on LF content (far more than required by
psy), and thus those signals will now be more distorted, not less.

The extra distortion isn't audible though, I carried extensive
ABX testing to make sure.

A very similar patch was also extensively tested by Kamendo2 in
the context of #2686.
parent 624057df
......@@ -18,6 +18,7 @@ version <next>:
- ffplay dynamic volume control
- displace filter
- selectivecolor filter
- extensive native AAC encoder improvements
version 2.8:
......
......@@ -252,6 +252,7 @@ typedef struct SingleChannelElement {
INTFLOAT sf[120]; ///< scalefactors
int sf_idx[128]; ///< scalefactor indices (used by encoder)
uint8_t zeroes[128]; ///< band is not coded (used by encoder)
uint8_t can_pns[128]; ///< band is allowed to PNS (informative)
float is_ener[128]; ///< Intensity stereo pos (used by encoder)
float pns_ener[128]; ///< Noise energy values (used by encoder)
DECLARE_ALIGNED(32, INTFLOAT, pcoeffs)[1024]; ///< coefficients for IMDCT, pristine
......
This diff is collapsed.
......@@ -129,7 +129,7 @@ static void codebook_trellis_rate(AACEncContext *s, SingleChannelElement *sce,
&s->scoefs[start + w*128], size,
sce->sf_idx[win*16+swb],
aac_cb_out_map[cb],
0, INFINITY, NULL, 0);
0, INFINITY, NULL, NULL, 0);
}
cost_stay_here = path[swb][cb].cost + bits;
cost_get_here = minbits + bits + run_bits + 4;
......
This diff is collapsed.
......@@ -258,6 +258,8 @@ static void apply_intensity_stereo(ChannelElement *cpe)
start += ics->swb_sizes[g];
continue;
}
if (cpe->ms_mask[w*16 + g])
p *= -1;
for (i = 0; i < ics->swb_sizes[g]; i++) {
float sum = (cpe->ch[0].coeffs[start+i] + p*cpe->ch[1].coeffs[start+i])*scale;
cpe->ch[0].coeffs[start+i] = sum;
......@@ -279,7 +281,7 @@ static void apply_mid_side_stereo(ChannelElement *cpe)
for (w2 = 0; w2 < ics->group_len[w]; w2++) {
int start = (w+w2) * 128;
for (g = 0; g < ics->num_swb; g++) {
if (!cpe->ms_mask[w*16 + g]) {
if (!cpe->ms_mask[w*16 + g] && !cpe->is_mask[w*16 + g]) {
start += ics->swb_sizes[g];
continue;
}
......@@ -490,6 +492,7 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
ChannelElement *cpe;
SingleChannelElement *sce;
int i, its, ch, w, chans, tag, start_ch, ret, frame_bits;
int target_bits, rate_bits, too_many_bits, too_few_bits;
int ms_mode = 0, is_mode = 0, tns_mode = 0, pred_mode = 0;
int chan_el_counter[4];
FFPsyWindowInfo windows[AAC_MAX_CHANNELS];
......@@ -583,8 +586,6 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
return ret;
frame_bits = its = 0;
do {
int target_bits, too_many_bits, too_few_bits;
init_put_bits(&s->pb, avpkt->data, avpkt->size);
if ((avctx->frame_number & 0xFF)==1 && !(avctx->flags & AV_CODEC_FLAG_BITEXACT))
......@@ -618,12 +619,15 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
s->psy.model->analyze(&s->psy, start_ch, coeffs, wi);
if (s->psy.bitres.alloc > 0) {
/* Lambda unused here on purpose, we need to take psy's unscaled allocation */
target_bits += s->psy.bitres.alloc;
target_bits += s->psy.bitres.alloc
* (s->lambda / (avctx->global_quality ? avctx->global_quality : 120));
s->psy.bitres.alloc /= chans;
}
s->cur_type = tag;
for (ch = 0; ch < chans; ch++) {
s->cur_channel = start_ch + ch;
if (s->options.pns && s->coder->mark_pns)
s->coder->mark_pns(s, avctx, &cpe->ch[ch]);
s->coder->search_for_quantizers(avctx, s, &cpe->ch[ch], s->lambda);
}
if (chans > 1
......@@ -680,8 +684,6 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
s->coder->search_for_ms(s, cpe);
else if (cpe->common_window)
memset(cpe->ms_mask, 1, sizeof(cpe->ms_mask));
for (w = 0; w < 128; w++)
cpe->ms_mask[w] = cpe->is_mask[w] ? 0 : cpe->ms_mask[w];
apply_mid_side_stereo(cpe);
}
adjust_frame_information(cpe, chans);
......@@ -708,23 +710,25 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
}
/* rate control stuff
* target either the nominal bitrate, or what psy's bit reservoir says to target
* whichever is greatest
* allow between the nominal bitrate, and what psy's bit reservoir says to target
* but drift towards the nominal bitrate always
*/
frame_bits = put_bits_count(&s->pb);
target_bits = FFMAX(target_bits, avctx->bit_rate * 1024 / avctx->sample_rate);
target_bits = FFMIN(target_bits, 6144 * s->channels - 3);
rate_bits = avctx->bit_rate * 1024 / avctx->sample_rate;
rate_bits = FFMIN(rate_bits, 6144 * s->channels - 3);
too_many_bits = FFMAX(target_bits, rate_bits);
too_many_bits = FFMIN(too_many_bits, 6144 * s->channels - 3);
too_few_bits = FFMIN(FFMAX(rate_bits - rate_bits/4, target_bits), too_many_bits);
/* When using ABR, be strict (but only for increasing) */
too_many_bits = target_bits + target_bits/2;
too_few_bits = target_bits - target_bits/8;
too_few_bits = too_few_bits - too_few_bits/8;
too_many_bits = too_many_bits + too_many_bits/2;
if ( its == 0 /* for steady-state Q-scale tracking */
|| (its < 5 && (frame_bits < too_few_bits || frame_bits > too_many_bits))
|| frame_bits >= 6144 * s->channels - 3 )
{
float ratio = ((float)target_bits) / frame_bits;
float ratio = ((float)rate_bits) / frame_bits;
if (frame_bits >= too_few_bits && frame_bits <= too_many_bits) {
/*
......@@ -742,7 +746,7 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
s->lambda = FFMIN(s->lambda * ratio, 65536.f);
/* Keep iterating if we must reduce and lambda is in the sky */
if (s->lambda < 300.f || ratio > 0.9f) {
if ((s->lambda < 300.f || ratio > 0.9f) && (s->lambda > 10.f || ratio < 1.1f)) {
break;
} else {
if (is_mode || ms_mode || tns_mode || pred_mode) {
......@@ -764,6 +768,8 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
put_bits(&s->pb, 3, TYPE_END);
flush_put_bits(&s->pb);
avctx->frame_bits = put_bits_count(&s->pb);
s->lambda_sum += s->lambda;
s->lambda_count++;
if (!frame)
s->last_frame++;
......@@ -780,6 +786,8 @@ static av_cold int aac_encode_end(AVCodecContext *avctx)
{
AACEncContext *s = avctx->priv_data;
av_log(avctx, AV_LOG_INFO, "Qavg: %.3f\n", s->lambda_sum / s->lambda_count);
ff_mdct_end(&s->mdct1024);
ff_mdct_end(&s->mdct128);
ff_psy_end(&s->psy);
......
......@@ -66,6 +66,7 @@ typedef struct AACCoefficientsEncoder {
void (*apply_tns_filt)(struct AACEncContext *s, SingleChannelElement *sce);
void (*set_special_band_scalefactors)(struct AACEncContext *s, SingleChannelElement *sce);
void (*search_for_pns)(struct AACEncContext *s, AVCodecContext *avctx, SingleChannelElement *sce);
void (*mark_pns)(struct AACEncContext *s, AVCodecContext *avctx, SingleChannelElement *sce);
void (*search_for_tns)(struct AACEncContext *s, SingleChannelElement *sce);
void (*search_for_ms)(struct AACEncContext *s, ChannelElement *cpe);
void (*search_for_is)(struct AACEncContext *s, AVCodecContext *avctx, ChannelElement *cpe);
......@@ -100,6 +101,8 @@ typedef struct AACEncContext {
int last_frame;
int random_state;
float lambda;
float lambda_sum; ///< sum(lambda), for Qvg reporting
int lambda_count; ///< count(lambda), for Qvg reporting
enum RawDataBlockType cur_type; ///< channel group type cur_channel belongs to
AudioFrameQueue afq;
......
......@@ -45,6 +45,11 @@ struct AACISError ff_aac_is_encoding_err(AACEncContext *s, ChannelElement *cpe,
float dist1 = 0.0f, dist2 = 0.0f;
struct AACISError is_error = {0};
if (ener01 <= 0 || ener0 <= 0) {
is_error.pass = 0;
return is_error;
}
for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
FFPsyBand *band0 = &s->psy.ch[s->cur_channel+0].psy_bands[(w+w2)*16+g];
FFPsyBand *band1 = &s->psy.ch[s->cur_channel+1].psy_bands[(w+w2)*16+g];
......@@ -63,15 +68,15 @@ struct AACISError ff_aac_is_encoding_err(AACEncContext *s, ChannelElement *cpe,
sce0->ics.swb_sizes[g],
sce0->sf_idx[(w+w2)*16+g],
sce0->band_type[(w+w2)*16+g],
s->lambda / band0->threshold, INFINITY, NULL, 0);
s->lambda / band0->threshold, INFINITY, NULL, NULL, 0);
dist1 += quantize_band_cost(s, &R[start + (w+w2)*128], R34,
sce1->ics.swb_sizes[g],
sce1->sf_idx[(w+w2)*16+g],
sce1->band_type[(w+w2)*16+g],
s->lambda / band1->threshold, INFINITY, NULL, 0);
s->lambda / band1->threshold, INFINITY, NULL, NULL, 0);
dist2 += quantize_band_cost(s, IS, I34, sce0->ics.swb_sizes[g],
is_sf_idx, is_band_type,
s->lambda / minthr, INFINITY, NULL, 0);
s->lambda / minthr, INFINITY, NULL, NULL, 0);
for (i = 0; i < sce0->ics.swb_sizes[g]; i++) {
dist_spec_err += (L34[i] - I34[i])*(L34[i] - I34[i]);
dist_spec_err += (R34[i] - I34[i]*e01_34)*(R34[i] - I34[i]*e01_34);
......@@ -85,6 +90,7 @@ struct AACISError ff_aac_is_encoding_err(AACEncContext *s, ChannelElement *cpe,
is_error.error = fabsf(dist1 - dist2);
is_error.dist1 = dist1;
is_error.dist2 = dist2;
is_error.ener01 = ener01;
return is_error;
}
......@@ -105,7 +111,7 @@ void ff_aac_search_for_is(AACEncContext *s, AVCodecContext *avctx, ChannelElemen
if (start*freq_mult > INT_STEREO_LOW_LIMIT*(s->lambda/170.0f) &&
cpe->ch[0].band_type[w*16+g] != NOISE_BT && !cpe->ch[0].zeroes[w*16+g] &&
cpe->ch[1].band_type[w*16+g] != NOISE_BT && !cpe->ch[1].zeroes[w*16+g]) {
float ener0 = 0.0f, ener1 = 0.0f, ener01 = 0.0f;
float ener0 = 0.0f, ener1 = 0.0f, ener01 = 0.0f, ener01p = 0.0f;
struct AACISError ph_err1, ph_err2, *erf;
if (sce0->band_type[w*16+g] == NOISE_BT ||
sce1->band_type[w*16+g] == NOISE_BT) {
......@@ -114,23 +120,25 @@ void ff_aac_search_for_is(AACEncContext *s, AVCodecContext *avctx, ChannelElemen
}
for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
for (i = 0; i < sce0->ics.swb_sizes[g]; i++) {
float coef0 = fabsf(sce0->pcoeffs[start+(w+w2)*128+i]);
float coef1 = fabsf(sce1->pcoeffs[start+(w+w2)*128+i]);
float coef0 = fabsf(sce0->coeffs[start+(w+w2)*128+i]);
float coef1 = fabsf(sce1->coeffs[start+(w+w2)*128+i]);
ener0 += coef0*coef0;
ener1 += coef1*coef1;
ener01 += (coef0 + coef1)*(coef0 + coef1);
ener01p += (coef0 - coef1)*(coef0 - coef1);
}
}
ph_err1 = ff_aac_is_encoding_err(s, cpe, start, w, g,
ener0, ener1, ener01, 0, -1);
ener0, ener1, ener01p, 0, -1);
ph_err2 = ff_aac_is_encoding_err(s, cpe, start, w, g,
ener0, ener1, ener01, 0, +1);
erf = ph_err1.error < ph_err2.error ? &ph_err1 : &ph_err2;
erf = (ph_err1.pass && ph_err1.error < ph_err2.error) ? &ph_err1 : &ph_err2;
if (erf->pass) {
cpe->is_mask[w*16+g] = 1;
cpe->ch[0].is_ener[w*16+g] = sqrt(ener0/ener01);
cpe->ms_mask[w*16+g] = 0;
cpe->ch[0].is_ener[w*16+g] = sqrt(ener0 / erf->ener01);
cpe->ch[1].is_ener[w*16+g] = ener0/ener1;
cpe->ch[1].band_type[w*16+g] = erf->phase ? INTENSITY_BT : INTENSITY_BT2;
cpe->ch[1].band_type[w*16+g] = (erf->phase > 0) ? INTENSITY_BT : INTENSITY_BT2;
count++;
}
}
......
......@@ -39,6 +39,7 @@ struct AACISError {
float error; /* fabs(dist1 - dist2) */
float dist1; /* From original coeffs */
float dist2; /* From IS'd coeffs */
float ener01;
};
struct AACISError ff_aac_is_encoding_err(AACEncContext *s, ChannelElement *cpe,
......
......@@ -271,7 +271,7 @@ void ff_aac_search_for_pred(AACEncContext *s, SingleChannelElement *sce)
abs_pow34_v(O34, &sce->coeffs[start_coef], num_coeffs);
dist1 = quantize_and_encode_band_cost(s, NULL, &sce->coeffs[start_coef], NULL,
O34, num_coeffs, sce->sf_idx[sfb],
cb_n, s->lambda / band->threshold, INFINITY, &cost1, 0);
cb_n, s->lambda / band->threshold, INFINITY, &cost1, NULL, 0);
cost_coeffs += cost1;
/* Encoded coefficients - needed for #bits, band type and quant. error */
......@@ -284,7 +284,7 @@ void ff_aac_search_for_pred(AACEncContext *s, SingleChannelElement *sce)
cb_p = cb_n;
quantize_and_encode_band_cost(s, NULL, SENT, QERR, S34, num_coeffs,
sce->sf_idx[sfb], cb_p, s->lambda / band->threshold, INFINITY,
&cost2, 0);
&cost2, NULL, 0);
/* Reconstructed coefficients - needed for distortion measurements */
for (i = 0; i < num_coeffs; i++)
......@@ -296,7 +296,7 @@ void ff_aac_search_for_pred(AACEncContext *s, SingleChannelElement *sce)
cb_p = cb_n;
dist2 = quantize_and_encode_band_cost(s, NULL, &sce->prcoeffs[start_coef], NULL,
P34, num_coeffs, sce->sf_idx[sfb],
cb_p, s->lambda / band->threshold, INFINITY, NULL, 0);
cb_p, s->lambda / band->threshold, INFINITY, NULL, NULL, 0);
for (i = 0; i < num_coeffs; i++)
dist_spec_err += (O34[i] - P34[i])*(O34[i] - P34[i]);
dist_spec_err *= s->lambda / band->threshold;
......
......@@ -43,7 +43,7 @@ static av_always_inline float quantize_and_encode_band_cost_template(
PutBitContext *pb, const float *in, float *out,
const float *scaled, int size, int scale_idx,
int cb, const float lambda, const float uplim,
int *bits, int BT_ZERO, int BT_UNSIGNED,
int *bits, float *energy, int BT_ZERO, int BT_UNSIGNED,
int BT_PAIR, int BT_ESC, int BT_NOISE, int BT_STEREO,
const float ROUNDING)
{
......@@ -54,6 +54,7 @@ static av_always_inline float quantize_and_encode_band_cost_template(
const float CLIPPED_ESCAPE = 165140.0f*IQ;
int i, j;
float cost = 0;
float qenergy = 0;
const int dim = BT_PAIR ? 2 : 4;
int resbits = 0;
int off;
......@@ -63,6 +64,8 @@ static av_always_inline float quantize_and_encode_band_cost_template(
cost += in[i]*in[i];
if (bits)
*bits = 0;
if (energy)
*energy = qenergy;
if (out) {
for (i = 0; i < size; i += dim)
for (j = 0; j < dim; j++)
......@@ -113,11 +116,13 @@ static av_always_inline float quantize_and_encode_band_cost_template(
out[i+j] = in[i+j] >= 0 ? quantized : -quantized;
if (vec[j] != 0.0f)
curbits++;
qenergy += quantized*quantized;
rd += di*di;
}
} else {
for (j = 0; j < dim; j++) {
quantized = vec[j]*IQ;
qenergy += quantized*quantized;
if (out)
out[i+j] = quantized;
rd += (in[i+j] - quantized)*(in[i+j] - quantized);
......@@ -149,6 +154,8 @@ static av_always_inline float quantize_and_encode_band_cost_template(
if (bits)
*bits = resbits;
if (energy)
*energy = qenergy;
return cost;
}
......@@ -156,7 +163,7 @@ static inline float quantize_and_encode_band_cost_NONE(struct AACEncContext *s,
const float *in, float *quant, const float *scaled,
int size, int scale_idx, int cb,
const float lambda, const float uplim,
int *bits) {
int *bits, float *energy) {
av_assert0(0);
return 0.0f;
}
......@@ -167,10 +174,10 @@ static float quantize_and_encode_band_cost_ ## NAME(
PutBitContext *pb, const float *in, float *quant, \
const float *scaled, int size, int scale_idx, \
int cb, const float lambda, const float uplim, \
int *bits) { \
int *bits, float *energy) { \
return quantize_and_encode_band_cost_template( \
s, pb, in, quant, scaled, size, scale_idx, \
BT_ESC ? ESC_BT : cb, lambda, uplim, bits, \
BT_ESC ? ESC_BT : cb, lambda, uplim, bits, energy, \
BT_ZERO, BT_UNSIGNED, BT_PAIR, BT_ESC, BT_NOISE, BT_STEREO, \
ROUNDING); \
}
......@@ -190,7 +197,7 @@ static float (*const quantize_and_encode_band_cost_arr[])(
PutBitContext *pb, const float *in, float *quant,
const float *scaled, int size, int scale_idx,
int cb, const float lambda, const float uplim,
int *bits) = {
int *bits, float *energy) = {
quantize_and_encode_band_cost_ZERO,
quantize_and_encode_band_cost_SQUAD,
quantize_and_encode_band_cost_SQUAD,
......@@ -214,7 +221,7 @@ static float (*const quantize_and_encode_band_cost_rtz_arr[])(
PutBitContext *pb, const float *in, float *quant,
const float *scaled, int size, int scale_idx,
int cb, const float lambda, const float uplim,
int *bits) = {
int *bits, float *energy) = {
quantize_and_encode_band_cost_ZERO,
quantize_and_encode_band_cost_SQUAD,
quantize_and_encode_band_cost_SQUAD,
......@@ -235,32 +242,32 @@ static float (*const quantize_and_encode_band_cost_rtz_arr[])(
#define quantize_and_encode_band_cost( \
s, pb, in, quant, scaled, size, scale_idx, cb, \
lambda, uplim, bits, rtz) \
lambda, uplim, bits, energy, rtz) \
((rtz) ? quantize_and_encode_band_cost_rtz_arr : quantize_and_encode_band_cost_arr)[cb]( \
s, pb, in, quant, scaled, size, scale_idx, cb, \
lambda, uplim, bits)
lambda, uplim, bits, energy)
static inline float quantize_band_cost(struct AACEncContext *s, const float *in,
const float *scaled, int size, int scale_idx,
int cb, const float lambda, const float uplim,
int *bits, int rtz)
int *bits, float *energy, int rtz)
{
return quantize_and_encode_band_cost(s, NULL, in, NULL, scaled, size, scale_idx,
cb, lambda, uplim, bits, rtz);
cb, lambda, uplim, bits, energy, rtz);
}
static inline int quantize_band_cost_bits(struct AACEncContext *s, const float *in,
const float *scaled, int size, int scale_idx,
int cb, const float lambda, const float uplim,
int *bits, int rtz)
int *bits, float *energy, int rtz)
{
int _bits;
int auxbits;
quantize_and_encode_band_cost(s, NULL, in, NULL, scaled, size, scale_idx,
cb, 0.0f, uplim, &_bits, rtz);
cb, 0.0f, uplim, &auxbits, energy, rtz);
if (bits) {
*bits = _bits;
*bits = auxbits;
}
return _bits;
return auxbits;
}
static inline void quantize_and_encode_band(struct AACEncContext *s, PutBitContext *pb,
......@@ -268,7 +275,7 @@ static inline void quantize_and_encode_band(struct AACEncContext *s, PutBitConte
int cb, const float lambda, int rtz)
{
quantize_and_encode_band_cost(s, pb, in, out, NULL, size, scale_idx, cb, lambda,
INFINITY, NULL, rtz);
INFINITY, NULL, NULL, rtz);
}
#endif /* AVCODEC_AACENC_QUANTIZATION_H */
......@@ -96,6 +96,54 @@ static inline int find_min_book(float maxval, int sf)
return cb;
}
static float find_form_factor(int group_len, int swb_size, float thresh, const float *scaled, float nzslope) {
const float iswb_size = 1.0f / swb_size;
const float iswb_sizem1 = 1.0f / (swb_size - 1);
const float ethresh = thresh;
float form = 0.0f, weight = 0.0f;
int w2, i;
for (w2 = 0; w2 < group_len; w2++) {
float e = 0.0f, e2 = 0.0f, var = 0.0f, maxval = 0.0f;
float nzl = 0;
for (i = 0; i < swb_size; i++) {
float s = fabsf(scaled[w2*128+i]);
maxval = FFMAX(maxval, s);
e += s;
e2 += s *= s;
/* We really don't want a hard non-zero-line count, since
* even below-threshold lines do add up towards band spectral power.
* So, fall steeply towards zero, but smoothly
*/
if (s >= ethresh) {
nzl += 1.0f;
} else {
nzl += powf(s / ethresh, nzslope);
}
}
if (e2 > thresh) {
float frm;
e *= iswb_size;
/** compute variance */
for (i = 0; i < swb_size; i++) {
float d = fabsf(scaled[w2*128+i]) - e;
var += d*d;
}
var = sqrtf(var * iswb_sizem1);
e2 *= iswb_size;
frm = e / FFMIN(e+4*var,maxval);
form += e2 * sqrtf(frm) / FFMAX(0.5f,nzl);
weight += e2;
}
}
if (weight > 0) {
return form / weight;
} else {
return 1.0f;
}
}
/** Return the minimum scalefactor where the quantized coef does not clip. */
static inline uint8_t coef2minsf(float coef)
{
......@@ -125,6 +173,14 @@ static inline int quant_array_idx(const float val, const float *arr, const int n
return index;
}
/**
* approximates exp10f(-3.0f*(0.5f + 0.5f * cosf(FFMIN(b,15.5f) / 15.5f)))
*/
static av_always_inline float bval2bmax(float b)
{
return 0.001f + 0.0035f * (b*b*b) / (15.5f*15.5f*15.5f);
}
/*
* linear congruential pseudorandom number generator, copied from the decoder
*/
......
......@@ -158,6 +158,7 @@ typedef struct AacPsyContext{
} pe;
AacPsyCoeffs psy_coef[2][64];
AacPsyChannel *ch;
float global_quality; ///< normalized global quality taken from avctx
}AacPsyContext;
/**
......@@ -300,7 +301,8 @@ static av_cold int psy_3gpp_init(FFPsyContext *ctx) {
float bark;
int i, j, g, start;
float prev, minscale, minath, minsnr, pe_min;
const int chan_bitrate = ctx->avctx->bit_rate / ctx->avctx->channels;
int chan_bitrate = ctx->avctx->bit_rate / ((ctx->avctx->flags & CODEC_FLAG_QSCALE) ? 2.0f : ctx->avctx->channels);
const int bandwidth = ctx->avctx->cutoff ? ctx->avctx->cutoff : AAC_CUTOFF(ctx->avctx);
const float num_bark = calc_bark((float)bandwidth);
......@@ -308,9 +310,15 @@ static av_cold int psy_3gpp_init(FFPsyContext *ctx) {
if (!ctx->model_priv_data)
return AVERROR(ENOMEM);
pctx = (AacPsyContext*) ctx->model_priv_data;
pctx->global_quality = (ctx->avctx->global_quality ? ctx->avctx->global_quality : 120) * 0.01f;
if (ctx->avctx->flags & CODEC_FLAG_QSCALE) {
/* Use the target average bitrate to compute spread parameters */
chan_bitrate = (int)(chan_bitrate / 120.0 * (ctx->avctx->global_quality ? ctx->avctx->global_quality : 120));
}
pctx->chan_bitrate = chan_bitrate;
pctx->frame_bits = chan_bitrate * AAC_BLOCK_SIZE_LONG / ctx->avctx->sample_rate;
pctx->frame_bits = FFMIN(2560, chan_bitrate * AAC_BLOCK_SIZE_LONG / ctx->avctx->sample_rate);
pctx->pe.min = 8.0f * AAC_BLOCK_SIZE_LONG * bandwidth / (ctx->avctx->sample_rate * 2.0f);
pctx->pe.max = 12.0f * AAC_BLOCK_SIZE_LONG * bandwidth / (ctx->avctx->sample_rate * 2.0f);
ctx->bitres.size = 6144 - pctx->frame_bits;
......@@ -398,7 +406,7 @@ static av_unused FFPsyWindowInfo psy_3gpp_window(FFPsyContext *ctx,
int channel, int prev_type)
{
int i, j;
int br = ctx->avctx->bit_rate / ctx->avctx->channels;
int br = ((AacPsyContext*)ctx->model_priv_data)->chan_bitrate;
int attack_ratio = br <= 16000 ? 18 : 10;
AacPsyContext *pctx = (AacPsyContext*) ctx->model_priv_data;
AacPsyChannel *pch = &pctx->ch[channel];
......@@ -508,7 +516,12 @@ static int calc_bit_demand(AacPsyContext *ctx, float pe, int bits, int size,
ctx->pe.max = FFMAX(pe, ctx->pe.max);
ctx->pe.min = FFMIN(pe, ctx->pe.min);
return FFMIN(ctx->frame_bits * bit_factor, ctx->frame_bits + size - bits);
/* NOTE: allocate a minimum of 1/8th average frame bits, to avoid
* reservoir starvation from producing zero-bit frames
*/
return FFMIN(
ctx->frame_bits * bit_factor,
FFMAX(ctx->frame_bits + size - bits, ctx->frame_bits / 8));
}
static float calc_pe_3gpp(AacPsyBand *band)
......@@ -678,8 +691,26 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx, int channel,
/* 5.6.1.3.2 "Calculation of the desired perceptual entropy" */
ctx->ch[channel].entropy = pe;
if (ctx->avctx->flags & CODEC_FLAG_QSCALE) {
/* (2.5 * 120) achieves almost transparent rate, and we want to give
* ample room downwards, so we make that equivalent to QSCALE=2.4
*/
desired_pe = pe * (ctx->avctx->global_quality ? ctx->avctx->global_quality : 120) / (2 * 2.5f * 120.0f);
desired_bits = FFMIN(2560, PSY_3GPP_PE_TO_BITS(desired_pe));
desired_pe = PSY_3GPP_BITS_TO_PE(desired_bits); // reflect clipping
/* PE slope smoothing */
if (ctx->bitres.bits > 0) {
desired_bits = FFMIN(2560, PSY_3GPP_PE_TO_BITS(desired_pe));
desired_pe = PSY_3GPP_BITS_TO_PE(desired_bits); // reflect clipping
}
pctx->pe.max = FFMAX(pe, pctx->pe.max);
pctx->pe.min = FFMIN(pe, pctx->pe.min);
} else {
desired_bits = calc_bit_demand(pctx, pe, ctx->bitres.bits, ctx->bitres.size, wi->num_windows == 8);
desired_pe = PSY_3GPP_BITS_TO_PE(desired_bits);
/* NOTE: PE correction is kept simple. During initial testing it had very
* little effect on the final bitrate. Probably a good idea to come
* back and do more testing later.
......@@ -687,6 +718,7 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx, int channel,
if (ctx->bitres.bits > 0)
desired_pe *= av_clipf(pctx->pe.previous / PSY_3GPP_BITS_TO_PE(ctx->bitres.bits),
0.85f, 1.15f);
}
pctx->pe.previous = PSY_3GPP_BITS_TO_PE(desired_bits);
ctx->bitres.alloc = desired_bits;
......
......@@ -233,6 +233,11 @@ static inline av_const unsigned int ff_sqrt(unsigned int a)
}
#endif
static inline av_const float ff_sqrf(float a)
{
return a*a;
}
static inline int8_t ff_u8_to_s8(uint8_t a)
{
union {
......
This diff is collapsed.
......@@ -109,25 +109,21 @@ av_cold struct FFPsyPreprocessContext* ff_psy_preprocess_init(AVCodecContext *av
return NULL;
ctx->avctx = avctx;
/* AAC has its own LP method */
if (avctx->codec_id != AV_CODEC_ID_AAC) {
if (avctx->cutoff > 0)
cutoff_coeff = 2.0 * avctx->cutoff / avctx->sample_rate;
if (!cutoff_coeff && avctx->codec_id == AV_CODEC_ID_AAC)
cutoff_coeff = 2.0 * AAC_CUTOFF(avctx) / avctx->sample_rate;
if (cutoff_coeff && cutoff_coeff < 0.98)
ctx->fcoeffs = ff_iir_filter_init_coeffs(avctx, FF_FILTER_TYPE_BUTTERWORTH,
FF_FILTER_MODE_LOWPASS, FILT_ORDER,
cutoff_coeff, 0.0, 0.0);
if (ctx->fcoeffs) {
ctx->fstate = av_mallocz_array(sizeof(ctx->fstate[0]), avctx->channels);
if (!ctx->fstate) {
av_free(ctx);
return NULL;
}
ctx->fstate = av_mallocz(sizeof(ctx->fstate[0]) * avctx->channels);
for (i = 0; i < avctx->channels; i++)
ctx->fstate[i] = ff_iir_filter_init_state(FILT_ORDER);
}
}
ff_iir_filter_init(&ctx->fiir);
......
......@@ -29,7 +29,20 @@
/** maximum number of channels */
#define PSY_MAX_CHANS 20
#define AAC_CUTOFF(s) ((s)->bit_rate ? FFMIN3(4000 + (s)->bit_rate/8, 12000 + (s)->bit_rate/32, (s)->sample_rate / 2) : ((s)->sample_rate / 2))
/* cutoff for VBR is purposedly increased, since LP filtering actually
* hinders VBR performance rather than the opposite
*/
#define AAC_CUTOFF_FROM_BITRATE(bit_rate,channels,sample_rate) (bit_rate ? FFMIN3(FFMIN3( \
FFMAX(bit_rate/channels/5, bit_rate/channels*15/32 - 5500), \
3000 + bit_rate/channels/4, \
12000 + bit_rate/channels/16), \
22000, \
sample_rate / 2): (sample_rate / 2))
#define AAC_CUTOFF(s) ( \
(s->flags & CODEC_FLAG_QSCALE) \
? s->sample_rate / 2 \
: AAC_CUTOFF_FROM_BITRATE(s->bit_rate, s->channels, s->sample_rate) \
)
/**
* single band psychoacoustic information
......
......@@ -146,7 +146,7 @@ fate-aac-aref-encode: CMD = enc_dec_pcm adts wav s16le $(REF) -strict -2 -c:a aa
fate-aac-aref-encode: CMP = stddev
fate-aac-aref-encode: REF = ./tests/data/asynth-44100-2.wav
fate-aac-aref-encode: CMP_SHIFT = -4096
fate-aac-aref-encode: CMP_TARGET = 584
fate-aac-aref-encode: CMP_TARGET = 1127
fate-aac-aref-encode: SIZE_TOLERANCE = 2464
fate-aac-aref-encode: FUZZ = 6
......@@ -155,51 +155,52 @@ fate-aac-ln-encode: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-ref
fate-aac-ln-encode: CMP = stddev
fate-aac-ln-encode: REF = $(SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav
fate-aac-ln-encode: CMP_SHIFT = -4096
fate-aac-ln-encode: CMP_TARGET = 68
fate-aac-ln-encode: CMP_TARGET = 80
fate-aac-ln-encode: SIZE_TOLERANCE = 3560
fate-aac-ln-encode: FUZZ = 30
FATE_AAC_ENCODE += fate-aac-ln-encode-128k
fate-aac-ln-encode-128k: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav -strict -2 -c:a aac -aac_is 0 -aac_pns 0 -b:a 128k
fate-aac-ln-encode-128k: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav -strict -2 -c:a aac -aac_is 0 -aac_pns 0 -b:a 128k -cutoff 22050
fate-aac-ln-encode-128k: CMP = stddev
fate-aac-ln-encode-128k: REF = $(SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav
fate-aac-ln-encode-128k: CMP_SHIFT = -4096
fate-aac-ln-encode-128k: CMP_TARGET = 638
fate-aac-ln-encode-128k: CMP_TARGET = 745
fate-aac-ln-encode-128k: SIZE_TOLERANCE = 3560
fate-aac-ln-encode-128k: FUZZ = 5
FATE_AAC_ENCODE += fate-aac-pns-encode
fate-aac-pns-encode: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav -strict -2 -c:a aac -aac_pns 1 -aac_is 0 -b:a 128k
fate-aac-pns-encode: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav -strict -2 -c:a aac -aac_pns 1 -aac_is 0 -b:a 128k -cutoff 22050
fate-aac-pns-encode: CMP = stddev
fate-aac-pns-encode: REF = $(SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav
fate-aac-pns-encode: CMP_SHIFT = -4096
fate-aac-pns-encode: CMP_TARGET = 623.77
fate-aac-pns-encode: CMP_TARGET = 695
fate-aac-pns-encode: SIZE_TOLERANCE = 3560
fate-aac-pns-encode: FUZZ = 25
FATE_AAC_ENCODE += fate-aac-tns-encode
fate-aac-tns-encode: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav -strict -2 -c:a aac -aac_tns 1 -aac_is 0 -aac_pns 0 -b:a 128k
fate-aac-tns-encode: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav -strict -2 -c:a aac -aac_tns 1 -aac_is 0 -aac_pns 0 -b:a 128k -cutoff 22050
fate-aac-tns-encode: CMP = stddev
fate-aac-tns-encode: REF = $(SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav
fate-aac-tns-encode: CMP_SHIFT = -4096
fate-aac-tns-encode: CMP_TARGET = 644.50
fate-aac-tns-encode: CMP_TARGET = 768
fate-aac-tns-encode: FUZZ = 2.8
fate-aac-tns-encode: SIZE_TOLERANCE = 3560
FATE_AAC_ENCODE += fate-aac-is-encode
fate-aac-is-encode: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav -strict -2 -c:a aac -aac_pns 0 -aac_is 1 -b:a 128k
fate-aac-is-encode: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav -strict -2 -c:a aac -aac_pns 0 -aac_is 1 -b:a 128k -cutoff 22050
fate-aac-is-encode: CMP = stddev
fate-aac-is-encode: REF = $(SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav
fate-aac-is-encode: CMP_SHIFT = -4096
fate-aac-is-encode: CMP_TARGET = 614.04
fate-aac-is-encode: CMP_TARGET = 582
fate-aac-is-encode: SIZE_TOLERANCE = 3560
fate-aac-is-encode: FUZZ = 1
FATE_AAC_ENCODE += fate-aac-pred-encode
fate-aac-pred-encode: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav -strict -2 -profile:a aac_main -c:a aac -aac_is 0 -aac_pns 0 -b:a 128k
fate-aac-pred-encode: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav -strict -2 -profile:a aac_main -c:a aac -aac_is 0 -aac_pns 0 -b:a 128k -cutoff 22050
fate-aac-pred-encode: CMP = stddev
fate-aac-pred-encode: REF = $(SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav
fate-aac-pred-encode: CMP_SHIFT = -4096
fate-aac-pred-encode: CMP_TARGET = 657
fate-aac-pred-encode: CMP_TARGET = 790
fate-aac-pred-encode: FUZZ = 5
fate-aac-pred-encode: SIZE_TOLERANCE = 3560
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment