Skip to content
Snippets Groups Projects
Verified Commit b30f45b9 authored by Jean-Marc Valin's avatar Jean-Marc Valin
Browse files

Fixing (hopefully) bandwidth detection for 24 kHz analysis

The masking term was previously completely broken (even in 1.1). The
bandwidth detection relies mostly on the noise floor and can only use
masking to cut one extra band. The 12-24 kHz energy is now normalized properly
but uses a higher noise floor due to the leakage in the resampler. Bandwidth
detection is still mostly useless at identifying SWB speech (it usually says
FB).
parent 251fc076
No related branches found
No related tags found
No related merge requests found
......@@ -432,6 +432,7 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt
float alpha, alphaE, alphaE2;
float frame_loudness;
float bandwidth_mask;
int is_masked[NB_TBANDS+1];
int bandwidth=0;
float maxE = 0;
float noise_floor;
......@@ -449,7 +450,9 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt
alpha = 1.f/IMIN(10, 1+tonal->count);
alphaE = 1.f/IMIN(25, 1+tonal->count);
alphaE2 = 1.f/IMIN(500, 1+tonal->count);
/* Noise floor related decay for bandwidth detection: -2.2 dB/second */
alphaE2 = 1.f/IMIN(100, 1+tonal->count);
if (tonal->count <= 1) alphaE2 = 1;
if (tonal->Fs == 48000)
{
......@@ -722,6 +725,7 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt
for (b=0;b<NB_TBANDS;b++)
{
float E=0;
float Em;
int band_start, band_end;
/* Keep a margin of 300 Hz for aliasing */
band_start = tbands[b];
......@@ -735,40 +739,47 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt
E = SCALE_ENER(E);
maxE = MAX32(maxE, E);
tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
E = MAX32(E, tonal->meanE[b]);
/* Use a simple follower with 13 dB/Bark slope for spreading function */
bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
Em = MAX32(E, tonal->meanE[b]);
/* Consider the band "active" only if all these conditions are met:
1) less than 10 dB below the simple follower
2) less than 90 dB below the peak band (maximal masking possible considering
1) less than 90 dB below the peak band (maximal masking possible considering
both the ATH and the loudness-dependent slope of the spreading function)
3) above the PCM quantization noise floor
2) above the PCM quantization noise floor
We use b+1 because the first CELT band isn't included in tbands[]
*/
if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-band_start))
if (E*1e9f > maxE && (Em > 3*noise_floor*(band_end-band_start) || E > noise_floor*(band_end-band_start)))
bandwidth = b+1;
/* Check if the band is masked (see below). */
is_masked[b] = E < (tonal->prev_bandwidth >= b+1 ? .01f : .05f)*bandwidth_mask;
/* Use a simple follower with 13 dB/Bark slope for spreading function. */
bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
}
/* Special case for the last two bands, for which we don't have spectrum but only
the energy above 12 kHz. */
the energy above 12 kHz. The difficulty here is that the high-pass we use
leaks some LF energy, so we need to increase the threshold without accidentally cutting
off the band. */
if (tonal->Fs == 48000) {
float ratio;
float E = hp_ener*(1.f/(240*240));
ratio = tonal->prev_bandwidth==20 ? 0.03f : 0.07f;
float noise_ratio;
float Em;
float E = hp_ener*(1.f/(60*60));
noise_ratio = tonal->prev_bandwidth==20 ? 10.f : 30.f;
#ifdef FIXED_POINT
/* silk_resampler_down2_hp() shifted right by an extra 8 bits. */
E *= 256.f*(1.f/Q15ONE)*(1.f/Q15ONE);
#endif
maxE = MAX32(maxE, E);
tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
E = MAX32(E, tonal->meanE[b]);
/* Use a simple follower with 13 dB/Bark slope for spreading function */
bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
if (E>ratio*bandwidth_mask && E*1e9f > maxE && E > noise_floor*160)
bandwidth = 20;
/* This detector is unreliable, so if the bandwidth is close to SWB, assume it's FB. */
if (bandwidth >= 17)
Em = MAX32(E, tonal->meanE[b]);
if (Em > 3*noise_ratio*noise_floor*160 || E > noise_ratio*noise_floor*160)
bandwidth = 20;
/* Check if the band is masked (see below). */
is_masked[b] = E < (tonal->prev_bandwidth == 20 ? .01f : .05f)*bandwidth_mask;
}
/* In some cases, resampling aliasing can create a small amount of energy in the first band
being cut. So if the last band is masked, we don't include it. */
if (bandwidth == 20 && is_masked[NB_TBANDS])
bandwidth-=2;
else if (bandwidth > 0 && bandwidth <= NB_TBANDS && is_masked[bandwidth-1])
bandwidth--;
if (tonal->count<=2)
bandwidth = 20;
frame_loudness = 20*(float)log10(frame_loudness);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment