From 42f43db7e470dec8c7a40c86180d6a07241c3577 Mon Sep 17 00:00:00 2001 From: Gustaf Ullberg <gustaf.ullberg@gmail.com> Date: Tue, 10 Apr 2018 13:37:49 +0200 Subject: [PATCH] Silk makes use of Opus VAD Signed-off-by: Jean-Marc Valin <jmvalin@jmvalin.ca> --- silk/API.h | 3 ++- silk/define.h | 5 +++++ silk/enc_API.c | 7 ++++--- silk/fixed/encode_frame_FIX.c | 11 +++++++++-- silk/fixed/main_FIX.h | 3 ++- silk/float/encode_frame_FLP.c | 11 +++++++++-- silk/float/main_FLP.h | 3 ++- src/opus_encoder.c | 13 +++++++++++-- 8 files changed, 44 insertions(+), 12 deletions(-) diff --git a/silk/API.h b/silk/API.h index 0131acbb0..4d90ff9aa 100644 --- a/silk/API.h +++ b/silk/API.h @@ -80,7 +80,8 @@ opus_int silk_Encode( /* O Returns error co opus_int nSamplesIn, /* I Number of samples in input vector */ ec_enc *psRangeEnc, /* I/O Compressor data structure */ opus_int32 *nBytesOut, /* I/O Number of bytes in payload (input: Max bytes) */ - const opus_int prefillFlag /* I Flag to indicate prefilling buffers no coding */ + const opus_int prefillFlag, /* I Flag to indicate prefilling buffers no coding */ + int activity /* I Decision of Opus voice activity detector */ ); /****************************************/ diff --git a/silk/define.h b/silk/define.h index 1286048e7..22fd720b9 100644 --- a/silk/define.h +++ b/silk/define.h @@ -58,6 +58,11 @@ extern "C" #define MAX_CONSECUTIVE_DTX 20 /* eq 400 ms */ #define DTX_ACTIVITY_THRESHOLD 0.1f +/* VAD decision */ +#define VAD_NO_DECISION -1 +#define VAD_NO_ACTIVITY 0 +#define VAD_ACTIVITY 1 + /* Maximum sampling frequency */ #define MAX_FS_KHZ 16 #define MAX_API_FS_KHZ 48 diff --git a/silk/enc_API.c b/silk/enc_API.c index 10adf2c35..7ae31a9e0 100644 --- a/silk/enc_API.c +++ b/silk/enc_API.c @@ -144,7 +144,8 @@ opus_int silk_Encode( /* O Returns error co opus_int nSamplesIn, /* I Number of samples in input vector */ ec_enc *psRangeEnc, /* I/O Compressor data structure */ opus_int32 *nBytesOut, /* I/O Number of bytes in payload (input: Max bytes) */ - const opus_int prefillFlag /* I Flag to indicate prefilling buffers no coding */ + const opus_int prefillFlag, /* I Flag to indicate prefilling buffers no coding */ + opus_int activity /* I Decision of Opus voice activity detector */ ) { opus_int n, i, nBits, flags, tmp_payloadSize_ms = 0, tmp_complexity = 0, ret = 0; @@ -425,7 +426,7 @@ opus_int silk_Encode( /* O Returns error co psEnc->state_Fxx[ 1 ].sCmn.sNSQ.prev_gain_Q16 = 65536; psEnc->state_Fxx[ 1 ].sCmn.first_frame_after_reset = 1; } - silk_encode_do_VAD_Fxx( &psEnc->state_Fxx[ 1 ] ); + silk_encode_do_VAD_Fxx( &psEnc->state_Fxx[ 1 ], activity ); } else { psEnc->state_Fxx[ 1 ].sCmn.VAD_flags[ psEnc->state_Fxx[ 0 ].sCmn.nFramesEncoded ] = 0; } @@ -440,7 +441,7 @@ opus_int silk_Encode( /* O Returns error co silk_memcpy( psEnc->state_Fxx[ 0 ].sCmn.inputBuf, psEnc->sStereo.sMid, 2 * sizeof( opus_int16 ) ); silk_memcpy( psEnc->sStereo.sMid, &psEnc->state_Fxx[ 0 ].sCmn.inputBuf[ psEnc->state_Fxx[ 0 ].sCmn.frame_length ], 2 * sizeof( opus_int16 ) ); } - silk_encode_do_VAD_Fxx( &psEnc->state_Fxx[ 0 ] ); + silk_encode_do_VAD_Fxx( &psEnc->state_Fxx[ 0 ], activity ); /* Encode */ for( n = 0; n < encControl->nChannelsInternal; n++ ) { diff --git a/silk/fixed/encode_frame_FIX.c b/silk/fixed/encode_frame_FIX.c index d49a0fe33..a02bf87db 100644 --- a/silk/fixed/encode_frame_FIX.c +++ b/silk/fixed/encode_frame_FIX.c @@ -43,18 +43,25 @@ static OPUS_INLINE void silk_LBRR_encode_FIX( ); void silk_encode_do_VAD_FIX( - silk_encoder_state_FIX *psEnc /* I/O Pointer to Silk FIX encoder state */ + silk_encoder_state_FIX *psEnc, /* I/O Pointer to Silk FIX encoder state */ + opus_int activity /* I Decision of Opus voice activity detector */ ) { + const opus_int activity_threshold = SILK_FIX_CONST( SPEECH_ACTIVITY_DTX_THRES, 8 ); + /****************************/ /* Voice Activity Detection */ /****************************/ silk_VAD_GetSA_Q8( &psEnc->sCmn, psEnc->sCmn.inputBuf + 1, psEnc->sCmn.arch ); + /* If Opus VAD is inactive and Silk VAD is active: lower Silk VAD to just under the threshold */ + if( activity == VAD_NO_ACTIVITY && psEnc->sCmn.speech_activity_Q8 >= activity_threshold ) { + psEnc->sCmn.speech_activity_Q8 = activity_threshold - 1; + } /**************************************************/ /* Convert speech activity into VAD and DTX flags */ /**************************************************/ - if( psEnc->sCmn.speech_activity_Q8 < SILK_FIX_CONST( SPEECH_ACTIVITY_DTX_THRES, 8 ) ) { + if( psEnc->sCmn.speech_activity_Q8 < activity_threshold ) { psEnc->sCmn.indices.signalType = TYPE_NO_VOICE_ACTIVITY; psEnc->sCmn.noSpeechCounter++; if( psEnc->sCmn.noSpeechCounter <= NB_SPEECH_FRAMES_BEFORE_DTX ) { diff --git a/silk/fixed/main_FIX.h b/silk/fixed/main_FIX.h index 780afa39c..6d2112e51 100644 --- a/silk/fixed/main_FIX.h +++ b/silk/fixed/main_FIX.h @@ -66,7 +66,8 @@ void silk_HP_variable_cutoff( /* Encoder main function */ void silk_encode_do_VAD_FIX( - silk_encoder_state_FIX *psEnc /* I/O Pointer to Silk FIX encoder state */ + silk_encoder_state_FIX *psEnc, /* I/O Pointer to Silk FIX encoder state */ + opus_int activity /* I Decision of Opus voice activity detector */ ); /* Encoder main function */ diff --git a/silk/float/encode_frame_FLP.c b/silk/float/encode_frame_FLP.c index 1d4d8a77e..b029c3f5c 100644 --- a/silk/float/encode_frame_FLP.c +++ b/silk/float/encode_frame_FLP.c @@ -42,18 +42,25 @@ static OPUS_INLINE void silk_LBRR_encode_FLP( ); void silk_encode_do_VAD_FLP( - silk_encoder_state_FLP *psEnc /* I/O Encoder state FLP */ + silk_encoder_state_FLP *psEnc, /* I/O Encoder state FLP */ + opus_int activity /* I Decision of Opus voice activity detector */ ) { + const opus_int activity_threshold = SILK_FIX_CONST( SPEECH_ACTIVITY_DTX_THRES, 8 ); + /****************************/ /* Voice Activity Detection */ /****************************/ silk_VAD_GetSA_Q8( &psEnc->sCmn, psEnc->sCmn.inputBuf + 1, psEnc->sCmn.arch ); + /* If Opus VAD is inactive and Silk VAD is active: lower Silk VAD to just under the threshold */ + if( activity == VAD_NO_ACTIVITY && psEnc->sCmn.speech_activity_Q8 >= activity_threshold ) { + psEnc->sCmn.speech_activity_Q8 = activity_threshold - 1; + } /**************************************************/ /* Convert speech activity into VAD and DTX flags */ /**************************************************/ - if( psEnc->sCmn.speech_activity_Q8 < SILK_FIX_CONST( SPEECH_ACTIVITY_DTX_THRES, 8 ) ) { + if( psEnc->sCmn.speech_activity_Q8 < activity_threshold ) { psEnc->sCmn.indices.signalType = TYPE_NO_VOICE_ACTIVITY; psEnc->sCmn.noSpeechCounter++; if( psEnc->sCmn.noSpeechCounter <= NB_SPEECH_FRAMES_BEFORE_DTX ) { diff --git a/silk/float/main_FLP.h b/silk/float/main_FLP.h index f47fc93be..5dc0ccf4a 100644 --- a/silk/float/main_FLP.h +++ b/silk/float/main_FLP.h @@ -56,7 +56,8 @@ void silk_HP_variable_cutoff( /* Encoder main function */ void silk_encode_do_VAD_FLP( - silk_encoder_state_FLP *psEnc /* I/O Encoder state FLP */ + silk_encoder_state_FLP *psEnc, /* I/O Encoder state FLP */ + opus_int activity /* I Decision of Opus voice activity detector */ ); /* Encoder main function */ diff --git a/src/opus_encoder.c b/src/opus_encoder.c index 35a6eaa74..0be65a3d9 100644 --- a/src/opus_encoder.c +++ b/src/opus_encoder.c @@ -1662,6 +1662,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ if (st->mode != MODE_CELT_ONLY) { opus_int32 total_bitRate, celt_rate; + opus_int activity; #ifdef FIXED_POINT const opus_int16 *pcm_silk; #else @@ -1669,6 +1670,14 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ ALLOC(pcm_silk, st->channels*frame_size, opus_int16); #endif + activity = VAD_NO_DECISION; +#ifndef DISABLE_FLOAT_API + if( analysis_info.valid ) { + /* Inform SILK about the Opus VAD decision */ + activity = ( analysis_info.activity_probability >= DTX_ACTIVITY_THRESHOLD ); + } +#endif + /* Distribute bits between SILK and CELT */ total_bitRate = 8 * bytes_target * frame_rate; if( st->mode == MODE_HYBRID ) { @@ -1816,7 +1825,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ for (i=0;i<st->encoder_buffer*st->channels;i++) pcm_silk[i] = FLOAT2INT16(st->delay_buffer[i]); #endif - silk_Encode( silk_enc, &st->silk_mode, pcm_silk, st->encoder_buffer, NULL, &zero, 1 ); + silk_Encode( silk_enc, &st->silk_mode, pcm_silk, st->encoder_buffer, NULL, &zero, 1, activity ); } #ifdef FIXED_POINT @@ -1825,7 +1834,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ for (i=0;i<frame_size*st->channels;i++) pcm_silk[i] = FLOAT2INT16(pcm_buf[total_buffer*st->channels + i]); #endif - ret = silk_Encode( silk_enc, &st->silk_mode, pcm_silk, frame_size, &enc, &nBytes, 0 ); + ret = silk_Encode( silk_enc, &st->silk_mode, pcm_silk, frame_size, &enc, &nBytes, 0, activity ); if( ret ) { /*fprintf (stderr, "SILK encode error: %d\n", ret);*/ /* Handle error */ -- GitLab