diff --git a/src/opus_encoder.c b/src/opus_encoder.c index 844b08dd9e49d8103c37bf2cb82a370e443b638b..1ae5598a3b8dbd0175a2c2b37bd03c678b8e21dc 100644 --- a/src/opus_encoder.c +++ b/src/opus_encoder.c @@ -892,34 +892,15 @@ static opus_val32 compute_frame_energy(const opus_val16 *pcm, int frame_size, in #endif /* Decides if DTX should be turned on (=1) or off (=0) */ -static int decide_dtx_mode(float activity_probability, /* probability that current frame contains speech/music */ - int *nb_no_activity_frames, /* number of consecutive frames with no activity */ - opus_val32 peak_signal_energy, /* peak energy of desired signal detected so far */ - const opus_val16 *pcm, /* input pcm signal */ - int frame_size, /* frame size */ - int channels, - int is_silence, /* only digital silence detected in this frame */ - int arch - ) -{ - opus_val32 noise_energy; +static int decide_dtx_mode(opus_int activity, /* indicates if this frame contains speech/music */ + int *nb_no_activity_frames /* number of consecutive frames with no activity */ + ) - if (!is_silence) - { - if (activity_probability < DTX_ACTIVITY_THRESHOLD) /* is noise */ - { - noise_energy = compute_frame_energy(pcm, frame_size, channels, arch); - - /* but is sufficiently quiet */ - is_silence = peak_signal_energy >= (PSEUDO_SNR_THRESHOLD * noise_energy); - } - } - - if (is_silence) +{ + if (!activity) { /* The number of consecutive DTX frames should be within the allowed bounds */ (*nb_no_activity_frames)++; - if (*nb_no_activity_frames > NB_SPEECH_FRAMES_BEFORE_DTX) { if (*nb_no_activity_frames <= (NB_SPEECH_FRAMES_BEFORE_DTX + MAX_CONSECUTIVE_DTX)) @@ -1102,6 +1083,8 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ int analysis_read_subframe_bak=-1; int is_silence = 0; #endif + opus_int activity = VAD_NO_DECISION; + VARDECL(opus_val16, tmp_prefill); ALLOC_STACK; @@ -1169,6 +1152,17 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ if (!is_silence) st->voice_ratio = -1; + if (analysis_info.valid) { + activity = !is_silence && analysis_info.activity_probability >= DTX_ACTIVITY_THRESHOLD; + if (!activity) { + /* Mark as active if this noise frame is sufficiently loud */ + opus_val32 noise_energy = compute_frame_energy(pcm, frame_size, st->channels, st->arch); + activity = st->peak_signal_energy < (PSEUDO_SNR_THRESHOLD * noise_energy); + } + } else { + activity = !is_silence; + } + st->detected_bandwidth = 0; if (analysis_info.valid) { @@ -1668,7 +1662,6 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ if (st->mode != MODE_CELT_ONLY) { opus_int32 total_bitRate, celt_rate; - opus_int activity; #ifdef FIXED_POINT const opus_int16 *pcm_silk; #else @@ -1676,14 +1669,6 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ ALLOC(pcm_silk, st->channels*frame_size, opus_int16); #endif - activity = VAD_NO_DECISION; -#ifndef DISABLE_FLOAT_API - if( analysis_info.valid ) { - /* Inform SILK about the Opus VAD decision */ - activity = ( analysis_info.activity_probability >= DTX_ACTIVITY_THRESHOLD ); - } -#endif - /* Distribute bits between SILK and CELT */ total_bitRate = 8 * bytes_target * frame_rate; if( st->mode == MODE_HYBRID ) { @@ -2144,8 +2129,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ #ifndef DISABLE_FLOAT_API if (st->use_dtx && (analysis_info.valid || is_silence)) { - if (decide_dtx_mode(analysis_info.activity_probability, &st->nb_no_activity_frames, - st->peak_signal_energy, pcm, frame_size, st->channels, is_silence, st->arch)) + if (decide_dtx_mode(activity, &st->nb_no_activity_frames)) { st->rangeFinal = 0; data[0] = gen_toc(st->mode, st->Fs/frame_size, curr_bandwidth, st->stream_channels);