From 51f4a32ec2b62fd7c53e7b901fefd38ff95e7cc2 Mon Sep 17 00:00:00 2001 From: Jean-Marc Valin <jmvalin@jmvalin.ca> Date: Wed, 20 Feb 2013 04:08:04 -0500 Subject: [PATCH] Adds support for delayed decision Variable duration option renamed to OPUS_SET_EXPERT_FRAME_DURATION, with new API. Also moves up the analysis to avoid having to do int->float conversion on large buffers. --- include/opus_defines.h | 17 ++- src/analysis.c | 184 +++++++++++++++++++++++++++---- src/analysis.h | 22 +++- src/opus_demo.c | 33 +++++- src/opus_encoder.c | 195 +++++++++++++++++++++------------ src/opus_multistream_encoder.c | 49 ++++++--- src/opus_private.h | 15 ++- 7 files changed, 394 insertions(+), 121 deletions(-) diff --git a/include/opus_defines.h b/include/opus_defines.h index e9434aab3..203144a77 100644 --- a/include/opus_defines.h +++ b/include/opus_defines.h @@ -149,8 +149,8 @@ extern "C" { #define OPUS_SET_LSB_DEPTH_REQUEST 4036 #define OPUS_GET_LSB_DEPTH_REQUEST 4037 #define OPUS_GET_LAST_PACKET_DURATION_REQUEST 4039 -#define OPUS_SET_EXPERT_VARIABLE_DURATION_REQUEST 4040 -#define OPUS_GET_EXPERT_VARIABLE_DURATION_REQUEST 4041 +#define OPUS_SET_EXPERT_FRAME_DURATION_REQUEST 4040 +#define OPUS_GET_EXPERT_FRAME_DURATION_REQUEST 4041 /* Don't use 4045, it's already taken by OPUS_GET_GAIN_REQUEST */ @@ -186,6 +186,15 @@ extern "C" { #define OPUS_BANDWIDTH_SUPERWIDEBAND 1104 /**<12 kHz bandpass @hideinitializer*/ #define OPUS_BANDWIDTH_FULLBAND 1105 /**<20 kHz bandpass @hideinitializer*/ +#define OPUS_FRAMESIZE_ARG 5000 /**< Select frame size from the argument (default) */ +#define OPUS_FRAMESIZE_2_5_MS 5001 /**< Use 2.5 ms frames */ +#define OPUS_FRAMESIZE_5_MS 5002 /**< Use 5 ms frames */ +#define OPUS_FRAMESIZE_10_MS 5003 /**< Use 10 ms frames */ +#define OPUS_FRAMESIZE_20_MS 5004 /**< Use 20 ms frames */ +#define OPUS_FRAMESIZE_40_MS 5005 /**< Use 40 ms frames */ +#define OPUS_FRAMESIZE_60_MS 5006 /**< Use 60 ms frames */ +#define OPUS_FRAMESIZE_VARIABLE 5010 /**< Optimize the 
frame size dynamically */ + /**@}*/ @@ -541,7 +550,7 @@ extern "C" { * <dt>1</dt><dd>Enable variable duration.</dd> * </dl> * @hideinitializer */ -#define OPUS_SET_EXPERT_VARIABLE_DURATION(x) OPUS_SET_EXPERT_VARIABLE_DURATION_REQUEST, __opus_check_int(x) +#define OPUS_SET_EXPERT_FRAME_DURATION(x) OPUS_SET_EXPERT_FRAME_DURATION_REQUEST, __opus_check_int(x) /** Gets the encoder's configured use of variable duration frames. * @see OPUS_SET_EXPERT_VARIABLE_DURATION * @param[out] x <tt>opus_int32 *</tt>: Returns one of the following values: @@ -550,7 +559,7 @@ extern "C" { * <dt>1</dt><dd>variable duration enabled.</dd> * </dl> * @hideinitializer */ -#define OPUS_GET_EXPERT_VARIABLE_DURATION(x) OPUS_GET_EXPERT_VARIABLE_DURATION_REQUEST, __opus_check_int_ptr(x) +#define OPUS_GET_EXPERT_FRAME_DURATION(x) OPUS_GET_EXPERT_FRAME_DURATION_REQUEST, __opus_check_int_ptr(x) /**@}*/ diff --git a/src/analysis.c b/src/analysis.c index 6b07890af..54005d3a3 100644 --- a/src/analysis.c +++ b/src/analysis.c @@ -139,10 +139,81 @@ static inline float fast_atan2f(float y, float x) { } } -void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEncoder *celt_enc, const opus_val16 *x, int len, int C, int lsb_depth) +void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len) +{ +#if 1 + int pos; + int curr_lookahead; + float psum; + int i; + + pos = tonal->read_pos; + curr_lookahead = tonal->write_pos-tonal->read_pos; + if (curr_lookahead<0) + curr_lookahead += DETECT_SIZE; + + if (len > 480 && pos != tonal->write_pos) + { + pos++; + if (pos==DETECT_SIZE) + pos=0; + } + if (pos == tonal->write_pos) + pos--; + if (pos<0) + pos = DETECT_SIZE-1; + OPUS_COPY(info_out, &tonal->info[pos], 1); + tonal->read_subframe += len/120; + while (tonal->read_subframe>=4) + { + tonal->read_subframe -= 4; + tonal->read_pos++; + } + if (tonal->read_pos>=DETECT_SIZE) + tonal->read_pos-=DETECT_SIZE; + + /* Compensate for the delay in the features themselves. 
+ FIXME: Need a better estimate than the 10 I just made up */ + curr_lookahead = IMAX(curr_lookahead-10, 0); + + psum=0; + for (i=0;i<DETECT_SIZE-curr_lookahead;i++) + psum += tonal->pmusic[i]; + for (;i<DETECT_SIZE;i++) + psum += tonal->pspeech[i]; + /*printf("%f %f\n", psum, info_out->music_prob);*/ + + info_out->music_prob = psum; +#else + /* If data not available, return invalid */ + if (tonal->read_pos==tonal->write_pos) + { + info_out->valid=0; + return; + } + + OPUS_COPY(info_out, &tonal->info[tonal->read_pos], 1); + tonal->read_subframe += len/480; + while (tonal->read_subframe>=4) + { + tonal->read_subframe -= 4; + tonal->read_pos++; + } + if (tonal->read_pos>=DETECT_SIZE) + tonal->read_pos-=DETECT_SIZE; + if (tonal->read_pos == tonal->write_pos) + { + tonal->read_pos = tonal->write_pos-1; + if (tonal->read_pos<0) + tonal->read_pos=DETECT_SIZE-1; + tonal->read_subframe = 3; + } +#endif +} + +void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info_out, const CELTMode *celt_mode, const void *x, int len, int offset, int C, int lsb_depth, downmix_func downmix) { int i, b; - const CELTMode *mode; const kiss_fft_state *kfft; kiss_fft_cpx in[480], out[480]; int N = 480, N2=240; @@ -171,8 +242,7 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEnc float maxE = 0; float noise_floor; int remaining; - - celt_encoder_ctl(celt_enc, CELT_GET_MODE(&mode)); + AnalysisInfo *info; tonal->last_transition++; alpha = 1.f/IMIN(20, 1+tonal->count); @@ -181,23 +251,19 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEnc if (tonal->count<4) tonal->music_prob = .5; - kfft = mode->mdct.kfft[0]; + kfft = celt_mode->mdct.kfft[0]; if (tonal->count==0) tonal->mem_fill = 240; - if (C==1) - { - for (i=0;i<IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill);i++) - tonal->inmem[i+tonal->mem_fill] = x[i]; - } else { - for (i=0;i<IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill);i++) - tonal->inmem[i+tonal->mem_fill] = 
x[2*i]+x[2*i+1]; - } + downmix(x, &tonal->inmem[tonal->mem_fill], IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, C); if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE) { tonal->mem_fill += len; /* Don't have enough to update the analysis */ return; } + info = &tonal->info[tonal->write_pos++]; + if (tonal->write_pos>=DETECT_SIZE) + tonal->write_pos-=DETECT_SIZE; for (i=0;i<N2;i++) { @@ -209,15 +275,7 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEnc } OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240); remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill); - if (C==1) - { - for (i=0;i<remaining;i++) - tonal->inmem[240+i] = x[ANALYSIS_BUF_SIZE-tonal->mem_fill+i]; - } else { - for (i=0;i<remaining;i++) - tonal->inmem[240+i] = x[2*(ANALYSIS_BUF_SIZE-tonal->mem_fill+i)] - + x[2*(ANALYSIS_BUF_SIZE-tonal->mem_fill+i)+1]; - } + downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, C); tonal->mem_fill = 240 + remaining; opus_fft(kfft, in, out); @@ -450,13 +508,49 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEnc tau = .00005f; beta = .1f; max_certainty = .01f+1.f/(20.f+.5f*tonal->last_transition); + max_certainty = 0; p0 = (1-tonal->music_prob)*(1-tau) + tonal->music_prob *tau; p1 = tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau; p0 *= (float)pow(1-frame_prob, beta); p1 *= (float)pow(frame_prob, beta); tonal->music_prob = MAX16(max_certainty, MIN16(1-max_certainty, p1/(p0+p1))); info->music_prob = tonal->music_prob; - /*printf("%f %f\n", frame_prob, info->music_prob);*/ + info->music_prob = frame_prob; + + float psum=1e-20; + float speech0 = (float)pow(1-frame_prob, beta); + float music0 = (float)pow(frame_prob, beta); + if (tonal->count==1) + { + tonal->pspeech[0]=.5; + tonal->pmusic [0]=.5; + } + float s0, m0; + s0 = tonal->pspeech[0] + tonal->pspeech[1]; + m0 = tonal->pmusic [0] + tonal->pmusic [1]; + tonal->pspeech[0] = s0*(1-tau)*speech0; + 
tonal->pmusic [0] = m0*(1-tau)*music0; + for (i=1;i<DETECT_SIZE-1;i++) + { + tonal->pspeech[i] = tonal->pspeech[i+1]*speech0; + tonal->pmusic [i] = tonal->pmusic [i+1]*music0; + } + tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0; + tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0; + + for (i=0;i<DETECT_SIZE;i++) + psum += tonal->pspeech[i] + tonal->pmusic[i]; + psum = 1.f/psum; + for (i=0;i<DETECT_SIZE;i++) + { + tonal->pspeech[i] *= psum; + tonal->pmusic [i] *= psum; + } + psum = tonal->pmusic[0]; + for (i=1;i<DETECT_SIZE;i++) + psum += tonal->pspeech[i]; + + /*printf("%f %f %f\n", frame_prob, info->music_prob, psum);*/ } if (tonal->last_music != (tonal->music_prob>.5f)) tonal->last_transition=0; @@ -484,4 +578,48 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEnc /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/ info->noisiness = frame_noisiness; info->valid = 1; + if (info_out!=NULL) + OPUS_COPY(info_out, info, 1); +} + +int run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *pcm, + const void *analysis_pcm, int frame_size, int variable_duration, int C, opus_int32 Fs, int bitrate_bps, + int delay_compensation, int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info) +{ + int offset; + int pcm_len; + + /* Avoid overflow/wrap-around of the analysis buffer */ + frame_size = IMIN((DETECT_SIZE-5)*Fs/100, frame_size); + + pcm_len = frame_size - analysis->analysis_offset; + offset = 0; + do { + tonality_analysis(analysis, NULL, celt_mode, analysis_pcm, IMIN(480, pcm_len), offset, C, lsb_depth, downmix); + offset += 480; + pcm_len -= 480; + } while (pcm_len>0); + analysis->analysis_offset = frame_size; + + if (variable_duration == OPUS_FRAMESIZE_VARIABLE && frame_size >= Fs/200) + { + int LM = 3; + LM = optimize_framesize(pcm, frame_size, C, Fs, bitrate_bps, + analysis->prev_tonality, analysis->subframe_mem, delay_compensation, downmix); + while ((Fs/400<<LM)>frame_size) + LM--; + 
frame_size = (Fs/400<<LM); + } else { + frame_size = frame_size_select(frame_size, variable_duration, Fs); + } + if (frame_size<0) + return -1; + analysis->analysis_offset -= frame_size; + + /* Only perform analysis up to 20-ms frames. Longer ones will be split if + they're in CELT-only mode. */ + analysis_info->valid = 0; + tonality_get_info(analysis, analysis_info, frame_size); + + return frame_size; } diff --git a/src/analysis.h b/src/analysis.h index 6f3689da8..37a8bf405 100644 --- a/src/analysis.h +++ b/src/analysis.h @@ -28,10 +28,16 @@ #ifndef ANALYSIS_H #define ANALYSIS_H +#include "celt.h" +#include "opus_private.h" + #define NB_FRAMES 8 #define NB_TBANDS 18 #define NB_TOT_BANDS 21 #define ANALYSIS_BUF_SIZE 720 /* 15 ms at 48 kHz */ + +#define DETECT_SIZE 200 + typedef struct { float angle[240]; float d_angle[240]; @@ -55,9 +61,23 @@ typedef struct { int last_transition; int count; int opus_bandwidth; + opus_val32 subframe_mem[3]; + int analysis_offset; + float pspeech[DETECT_SIZE]; + float pmusic[DETECT_SIZE]; + int write_pos; + int read_pos; + int read_subframe; + AnalysisInfo info[DETECT_SIZE]; } TonalityAnalysisState; void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, - CELTEncoder *celt_enc, const opus_val16 *x, int len, int C, int lsb_depth); + const CELTMode *celt_mode, const void *x, int len, int offset, int C, int lsb_depth, downmix_func downmix); + +void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len); + +int run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *pcm, + const void *analysis_pcm, int frame_size, int variable_duration, int C, opus_int32 Fs, int bitrate_bps, + int delay_compensation, int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info); #endif diff --git a/src/opus_demo.c b/src/opus_demo.c index 6538aad6d..a0acb0cd2 100644 --- a/src/opus_demo.c +++ b/src/opus_demo.c @@ -244,7 +244,8 @@ int main(int argc, char *argv[]) int 
mode_switch_time = 48000; int nb_encoded; int remaining=0; - int variable_duration=0; + int variable_duration=OPUS_FRAMESIZE_ARG; + int delayed_decision=0; if (argc < 5 ) { @@ -313,7 +314,7 @@ int main(int argc, char *argv[]) forcechannels = OPUS_AUTO; use_dtx = 0; packet_loss_perc = 0; - max_frame_size = 960*6; + max_frame_size = 2*48000; curr_read=0; while( args < argc - 2 ) { @@ -383,7 +384,11 @@ int main(int argc, char *argv[]) args++; } else if( strcmp( argv[ args ], "-variable-duration" ) == 0 ) { check_encoder_option(decode_only, "-variable-duration"); - variable_duration = 1; + variable_duration = OPUS_FRAMESIZE_VARIABLE; + args++; + } else if( strcmp( argv[ args ], "-delayed-decision" ) == 0 ) { + check_encoder_option(decode_only, "-delayed-decision"); + delayed_decision = 1; args++; } else if( strcmp( argv[ args ], "-dtx") == 0 ) { check_encoder_option(decode_only, "-dtx"); @@ -510,7 +515,7 @@ int main(int argc, char *argv[]) opus_encoder_ctl(enc, OPUS_GET_LOOKAHEAD(&skip)); opus_encoder_ctl(enc, OPUS_SET_LSB_DEPTH(16)); - opus_encoder_ctl(enc, OPUS_SET_EXPERT_VARIABLE_DURATION(variable_duration)); + opus_encoder_ctl(enc, OPUS_SET_EXPERT_FRAME_DURATION(variable_duration)); } if (!encode_only) { @@ -566,6 +571,26 @@ int main(int argc, char *argv[]) if ( use_inbandfec ) { data[1] = (unsigned char*)calloc(max_payload_bytes,sizeof(char)); } + if(delayed_decision) + { + if (variable_duration!=OPUS_FRAMESIZE_VARIABLE) + { + if (frame_size==sampling_rate/400) + variable_duration = OPUS_FRAMESIZE_2_5_MS; + else if (frame_size==sampling_rate/200) + variable_duration = OPUS_FRAMESIZE_5_MS; + else if (frame_size==sampling_rate/100) + variable_duration = OPUS_FRAMESIZE_10_MS; + else if (frame_size==sampling_rate/50) + variable_duration = OPUS_FRAMESIZE_20_MS; + else if (frame_size==sampling_rate/25) + variable_duration = OPUS_FRAMESIZE_40_MS; + else + variable_duration = OPUS_FRAMESIZE_60_MS; + opus_encoder_ctl(enc, OPUS_SET_EXPERT_FRAME_DURATION(variable_duration)); 
+ } + frame_size = 2*48000; + } while (!stop) { if (delayed_celt) diff --git a/src/opus_encoder.c b/src/opus_encoder.c index 19778a401..3cee88b31 100644 --- a/src/opus_encoder.c +++ b/src/opus_encoder.c @@ -89,9 +89,9 @@ struct OpusEncoder { int first; opus_val16 delay_buffer[MAX_ENCODER_BUFFER*2]; #ifndef FIXED_POINT - opus_val32 subframe_mem[3]; TonalityAnalysisState analysis; - int detected_bandwidth; + int detected_bandwidth; + int analysis_offset; #endif opus_uint32 rangeFinal; }; @@ -215,6 +215,7 @@ int opus_encoder_init(OpusEncoder* st, opus_int32 Fs, int channels, int applicat st->voice_ratio = -1; st->encoder_buffer = st->Fs/100; st->lsb_depth = 24; + st->variable_duration = OPUS_FRAMESIZE_ARG; /* Delay compensation of 4 ms (2.5 ms for SILK's extra look-ahead + 1.5 ms for SILK resamplers and stereo prediction) */ @@ -665,28 +666,28 @@ static int transient_viterbi(const float *E, const float *E_1, int N, int frame_ return best_state; } -void downmix_float(const void *_x, float *sub, int subframe, int i, int C) +void downmix_float(const void *_x, float *sub, int subframe, int offset, int C) { const float *x; int c, j; x = (const float *)_x; for (j=0;j<subframe;j++) - sub[j] = x[(subframe*i+j)*C]; + sub[j] = x[(j+offset)*C]; for (c=1;c<C;c++) for (j=0;j<subframe;j++) - sub[j] += x[(subframe*i+j)*C+c]; + sub[j] += x[(j+offset)*C+c]; } -void downmix_int(const void *_x, float *sub, int subframe, int i, int C) +void downmix_int(const void *_x, float *sub, int subframe, int offset, int C) { const opus_int16 *x; int c, j; x = (const opus_int16 *)_x; for (j=0;j<subframe;j++) - sub[j] = x[(subframe*i+j)*C]; + sub[j] = x[(j+offset)*C]; for (c=1;c<C;c++) for (j=0;j<subframe;j++) - sub[j] += x[(subframe*i+j)*C+c]; + sub[j] += x[(j+offset)*C+c]; } int optimize_framesize(const opus_val16 *x, int len, int C, opus_int32 Fs, @@ -732,7 +733,7 @@ int optimize_framesize(const opus_val16 *x, int len, int C, opus_int32 Fs, int j; tmp=EPSILON; - downmix(x, sub, subframe, i, C); + 
downmix(x, sub, subframe, i*subframe, C); if (i==0) memx = sub[0]; for (j=0;j<subframe;j++) @@ -759,10 +760,36 @@ int optimize_framesize(const opus_val16 *x, int len, int C, opus_int32 Fs, } return bestLM; } + #endif +opus_int32 frame_size_select(opus_int32 frame_size, int variable_duration, opus_int32 Fs) +{ + int new_size; + if (frame_size<Fs/400) + return -1; + if (variable_duration == OPUS_FRAMESIZE_ARG) + new_size = frame_size; + else if (variable_duration == OPUS_FRAMESIZE_VARIABLE) + new_size = Fs/50; + else if (variable_duration >= OPUS_FRAMESIZE_2_5_MS && variable_duration <= OPUS_FRAMESIZE_60_MS) + new_size = IMIN(3*Fs/50, (Fs/400)<<(variable_duration-OPUS_FRAMESIZE_2_5_MS)); /* 2.5 ms << k, capped at 60 ms (3*Fs/50); IMAX here forced every fixed size to 60 ms or more */ + else + return -1; + if (new_size>frame_size) + return -1; + if (400*new_size!=Fs && 200*new_size!=Fs && 100*new_size!=Fs && + 50*new_size!=Fs && 25*new_size!=Fs && 50*new_size!=3*Fs) + return -1; + return new_size; +} + opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_size, - unsigned char *data, opus_int32 out_data_bytes, int lsb_depth) + unsigned char *data, opus_int32 out_data_bytes, int lsb_depth +#ifndef FIXED_POINT + , AnalysisInfo *analysis_info +#endif + ) { void *silk_enc; CELTEncoder *celt_enc; @@ -790,11 +817,6 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ opus_val16 HB_gain; opus_int32 max_data_bytes; /* Max number of bytes we're allowed to use */ int total_buffer; - int perform_analysis=0; - int orig_frame_size; -#ifndef FIXED_POINT - AnalysisInfo analysis_info; -#endif VARDECL(opus_val16, tmp_prefill); ALLOC_STACK; @@ -820,38 +842,15 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ lsb_depth = IMIN(lsb_depth, st->lsb_depth); - orig_frame_size = IMIN(frame_size,st->Fs/50); - if (st->variable_duration) - { - int LM = 3; -#ifndef FIXED_POINT - LM = optimize_framesize(pcm, frame_size, st->channels, st->Fs, st->bitrate_bps, - st->analysis.prev_tonality, st->subframe_mem, 
delay_compensation, downmix_float); -#endif - while ((st->Fs/400<<LM)>frame_size) - LM--; - frame_size = (st->Fs/400<<LM); - } + st->voice_ratio = -1; + #ifndef FIXED_POINT - /* Only perform analysis up to 20-ms frames. Longer ones will be split if - they're in CELT-only mode. */ - analysis_info.valid = 0; - perform_analysis = st->silk_mode.complexity >= 7 && st->Fs==48000; - if (!perform_analysis) - { - st->voice_ratio = -1; - st->detected_bandwidth = 0; - } else if (frame_size <= st->Fs/50) + st->detected_bandwidth = 0; + if (analysis_info->valid) { - tonality_analysis(&st->analysis, &analysis_info, celt_enc, pcm, IMIN(480, frame_size), st->channels, lsb_depth); - if (frame_size > st->Fs/100) - tonality_analysis(&st->analysis, &analysis_info, celt_enc, pcm+(st->Fs/100)*st->channels, 480, st->channels, lsb_depth); - if (analysis_info.valid) - { - if (st->signal_type == OPUS_AUTO) - st->voice_ratio = (int)floor(.5+100*(1-analysis_info.music_prob)); - st->detected_bandwidth = analysis_info.opus_bandwidth; - } + if (st->signal_type == OPUS_AUTO) + st->voice_ratio = (int)floor(.5+100*(1-analysis_info->music_prob)); + st->detected_bandwidth = analysis_info->opus_bandwidth; } #endif @@ -1161,7 +1160,11 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ /* When switching from SILK/Hybrid to CELT, only ask for a switch at the last frame */ if (to_celt && i==nb_frames-1) st->user_forced_mode = MODE_CELT_ONLY; - tmp_len = opus_encode_native(st, pcm+i*(st->channels*st->Fs/50), st->Fs/50, tmp_data+i*bytes_per_frame, bytes_per_frame, lsb_depth); + tmp_len = opus_encode_native(st, pcm+i*(st->channels*st->Fs/50), st->Fs/50, tmp_data+i*bytes_per_frame, bytes_per_frame, lsb_depth +#ifndef FIXED_POINT + , analysis_info +#endif + ); if (tmp_len<0) { RESTORE_STACK; @@ -1187,16 +1190,6 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ RESTORE_STACK; return ret; } -#ifndef FIXED_POINT - /* Perform analysis for 40-60 ms 
frames */ - if (perform_analysis && frame_size > st->Fs/50) - { - int nb_analysis = frame_size/(st->Fs/100); - for (i=0;i<nb_analysis;i++) - tonality_analysis(&st->analysis, &analysis_info, celt_enc, pcm+i*(st->Fs/100)*st->channels, 480, st->channels, lsb_depth); - st->voice_ratio = (int)floor(.5+100*(1-analysis_info.music_prob)); - } -#endif curr_bandwidth = st->bandwidth; /* Chooses the appropriate mode for speech @@ -1446,11 +1439,11 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ { opus_int32 bonus=0; #ifndef FIXED_POINT - if (st->variable_duration && orig_frame_size != frame_size) + if (st->variable_duration==OPUS_FRAMESIZE_VARIABLE && frame_size != st->Fs/50) { - bonus = (40*st->stream_channels+40)*(48000/frame_size-48000/orig_frame_size); - if (analysis_info.valid) - bonus = bonus*(1.f+.5*analysis_info.tonality); + bonus = (40*st->stream_channels+40)*(st->Fs/frame_size-50); + if (analysis_info->valid) + bonus = bonus*(1.f+.5*analysis_info->tonality); } #endif celt_encoder_ctl(celt_enc, OPUS_SET_VBR(1)); @@ -1587,8 +1580,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ if (ec_tell(&enc) <= 8*nb_compr_bytes) { #ifndef FIXED_POINT - if (perform_analysis) - celt_encoder_ctl(celt_enc, CELT_SET_ANALYSIS(&analysis_info)); + celt_encoder_ctl(celt_enc, CELT_SET_ANALYSIS(analysis_info)); #endif ret = celt_encode_with_ec(celt_enc, pcm_buf, frame_size, NULL, nb_compr_bytes, &enc); if (ret < 0) @@ -1688,6 +1680,7 @@ opus_int32 opus_encode_float(OpusEncoder *st, const float *pcm, int frame_size, VARDECL(opus_int16, in); ALLOC_STACK; + frame_size = frame_size_select(frame_size, st->variable_duration, st->Fs); if(frame_size<0) { RESTORE_STACK; @@ -1707,6 +1700,12 @@ opus_int32 opus_encode_float(OpusEncoder *st, const float *pcm, int frame_size, opus_int32 opus_encode(OpusEncoder *st, const opus_int16 *pcm, int frame_size, unsigned char *data, opus_int32 out_data_bytes) { + frame_size = 
frame_size_select(frame_size, st->variable_duration, st->Fs); + if(frame_size<0) + { + RESTORE_STACK; + return OPUS_BAD_ARG; + } return opus_encode_native(st, pcm, frame_size, data, out_data_bytes, 16); } @@ -1715,21 +1714,74 @@ opus_int32 opus_encode(OpusEncoder *st, const opus_int16 *pcm, int frame_size, unsigned char *data, opus_int32 max_data_bytes) { int i, ret; + const CELTMode *celt_mode; + int delay_compensation; + int lsb_depth; VARDECL(float, in); + AnalysisInfo analysis_info; ALLOC_STACK; + opus_encoder_ctl(st, CELT_GET_MODE(&celt_mode)); + if (st->application == OPUS_APPLICATION_RESTRICTED_LOWDELAY) + delay_compensation = 0; + else + delay_compensation = st->delay_compensation; + + lsb_depth = IMIN(16, st->lsb_depth); + + analysis_info.valid = 0; + if (st->silk_mode.complexity >= 7 && st->Fs==48000) + { + frame_size = run_analysis(&st->analysis, celt_mode, pcm, pcm+st->channels*st->analysis.analysis_offset, + frame_size, st->variable_duration, st->channels, st->Fs, st->bitrate_bps, delay_compensation, lsb_depth, downmix_int, &analysis_info); + } else { + frame_size = frame_size_select(frame_size, st->variable_duration, st->Fs); + } + if(frame_size<0) + { + RESTORE_STACK; + return OPUS_BAD_ARG; + } + ALLOC(in, frame_size*st->channels, float); for (i=0;i<frame_size*st->channels;i++) in[i] = (1.0f/32768)*pcm[i]; - ret = opus_encode_native(st, in, frame_size, data, max_data_bytes, 16); + ret = opus_encode_native(st, in, frame_size, data, max_data_bytes, 16, &analysis_info); RESTORE_STACK; return ret; } opus_int32 opus_encode_float(OpusEncoder *st, const float *pcm, int frame_size, unsigned char *data, opus_int32 out_data_bytes) { - return opus_encode_native(st, pcm, frame_size, data, out_data_bytes, 24); + const CELTMode *celt_mode; + int delay_compensation; + int lsb_depth; + AnalysisInfo analysis_info; + + opus_encoder_ctl(st, CELT_GET_MODE(&celt_mode)); + if (st->application == OPUS_APPLICATION_RESTRICTED_LOWDELAY) + delay_compensation = 0; + else + 
delay_compensation = st->delay_compensation; + + lsb_depth = IMIN(24, st->lsb_depth); + + analysis_info.valid = 0; + if (st->silk_mode.complexity >= 7 && st->Fs==48000) + { + frame_size = run_analysis(&st->analysis, celt_mode, pcm, pcm+st->channels*st->analysis.analysis_offset, + frame_size, st->variable_duration, st->channels, st->Fs, st->bitrate_bps, delay_compensation, lsb_depth, downmix_float, &analysis_info); + } else { + frame_size = frame_size_select(frame_size, st->variable_duration, st->Fs); + } + if(frame_size<0) + { + RESTORE_STACK; + return OPUS_BAD_ARG; + } + + return opus_encode_native(st, pcm, frame_size, data, out_data_bytes, 24, &analysis_info); } #endif @@ -1998,15 +2050,13 @@ int opus_encoder_ctl(OpusEncoder *st, int request, ...) *value = st->lsb_depth; } break; - case OPUS_SET_EXPERT_VARIABLE_DURATION_REQUEST: + case OPUS_SET_EXPERT_FRAME_DURATION_REQUEST: { opus_int32 value = va_arg(ap, opus_int32); - if (value<0 || value>1) - goto bad_arg; st->variable_duration = value; } break; - case OPUS_GET_EXPERT_VARIABLE_DURATION_REQUEST: + case OPUS_GET_EXPERT_FRAME_DURATION_REQUEST: { opus_int32 *value = va_arg(ap, opus_int32*); *value = st->variable_duration; @@ -2041,6 +2091,15 @@ int opus_encoder_ctl(OpusEncoder *st, int request, ...) 
st->user_forced_mode = value; } break; + + case CELT_GET_MODE_REQUEST: + { + const CELTMode ** value = va_arg(ap, const CELTMode**); + if (value==0) + goto bad_arg; + celt_encoder_ctl(celt_enc, CELT_GET_MODE(value)); + } + break; default: /* fprintf(stderr, "unknown opus_encoder_ctl() request: %d", request);*/ ret = OPUS_UNIMPLEMENTED; diff --git a/src/opus_multistream_encoder.c b/src/opus_multistream_encoder.c index c5fb36691..c62041853 100644 --- a/src/opus_multistream_encoder.c +++ b/src/opus_multistream_encoder.c @@ -36,8 +36,10 @@ #include <stdarg.h> #include "float_cast.h" #include "os_support.h" +#include "analysis.h" struct OpusMSEncoder { + TonalityAnalysisState analysis; ChannelLayout layout; int variable_duration; opus_int32 bitrate_bps; @@ -105,6 +107,7 @@ int opus_multistream_encoder_init( st->layout.nb_coupled_streams = coupled_streams; st->bitrate_bps = OPUS_AUTO; + st->variable_duration = OPUS_FRAMESIZE_ARG; for (i=0;i<st->layout.nb_channels;i++) st->layout.mapping[i] = mapping[i]; if (!validate_layout(&st->layout) || !validate_encoder_layout(&st->layout)) @@ -187,6 +190,7 @@ static int opus_multistream_encode_native int lsb_depth #ifndef FIXED_POINT , downmix_func downmix + , const void *pcm_analysis #endif ) { @@ -202,10 +206,15 @@ static int opus_multistream_encode_native int orig_frame_size; int coded_channels; opus_int32 channel_rate; + opus_int32 complexity; + AnalysisInfo analysis_info; + const CELTMode *celt_mode; ALLOC_STACK; ptr = (char*)st + align(sizeof(OpusMSEncoder)); opus_encoder_ctl((OpusEncoder*)ptr, OPUS_GET_SAMPLE_RATE(&Fs)); + opus_encoder_ctl((OpusEncoder*)ptr, OPUS_GET_COMPLEXITY(&complexity)); + opus_encoder_ctl((OpusEncoder*)ptr, CELT_GET_MODE(&celt_mode)); if (400*frame_size < Fs) { @@ -213,24 +222,24 @@ static int opus_multistream_encode_native return OPUS_BAD_ARG; } orig_frame_size = IMIN(frame_size,Fs/50); - if (st->variable_duration) +#ifndef FIXED_POINT + analysis_info.valid = 0; + if (complexity >= 7 && Fs==48000) { - 
int LM = 3; - int channels; opus_int32 delay_compensation; + int channels; channels = st->layout.nb_streams + st->layout.nb_coupled_streams; opus_encoder_ctl((OpusEncoder*)ptr, OPUS_GET_LOOKAHEAD(&delay_compensation)); delay_compensation -= Fs/400; -#ifndef FIXED_POINT - LM = optimize_framesize(pcm, frame_size, channels, Fs, st->bitrate_bps, - 0.f, st->subframe_mem, delay_compensation, downmix); + + frame_size = run_analysis(&st->analysis, celt_mode, pcm, pcm_analysis, + frame_size, st->variable_duration, channels, Fs, st->bitrate_bps, delay_compensation, lsb_depth, downmix, &analysis_info); + } else #endif - while ((Fs/400<<LM)>frame_size) - LM--; - frame_size = (Fs/400<<LM); + { + frame_size = frame_size_select(frame_size, st->variable_duration, Fs); } - /* Validate frame_size before using it to allocate stack space. This mirrors the checks in opus_encode[_float](). */ if (400*frame_size != Fs && 200*frame_size != Fs && @@ -262,10 +271,10 @@ static int opus_multistream_encode_native channel_rate = st->bitrate_bps/coded_channels; } #ifndef FIXED_POINT - if (st->variable_duration && orig_frame_size != frame_size) + if (st->variable_duration==OPUS_FRAMESIZE_VARIABLE && frame_size != Fs/50) { opus_int32 bonus; - bonus = 60*(48000/frame_size-48000/orig_frame_size); + bonus = 60*(Fs/frame_size-50); channel_rate += bonus; } #endif @@ -313,7 +322,11 @@ static int opus_multistream_encode_native /* Reserve three bytes for the last stream and four for the others */ curr_max -= IMAX(0,4*(st->layout.nb_streams-s-1)-1); curr_max = IMIN(curr_max,MS_FRAME_TMP); - len = opus_encode_native(enc, buf, frame_size, tmp_data, curr_max, lsb_depth); + len = opus_encode_native(enc, buf, frame_size, tmp_data, curr_max, lsb_depth +#ifndef FIXED_POINT + , &analysis_info +#endif + ); if (len<0) { RESTORE_STACK; @@ -412,8 +425,9 @@ int opus_multistream_encode_float opus_int32 max_data_bytes ) { + int channels = st->layout.nb_streams + st->layout.nb_coupled_streams; return 
opus_multistream_encode_native(st, opus_copy_channel_in_float, - pcm, frame_size, data, max_data_bytes, 24, downmix_float); + pcm, frame_size, data, max_data_bytes, 24, downmix_float, pcm+channels*st->analysis.analysis_offset); } int opus_multistream_encode( @@ -424,8 +438,9 @@ int opus_multistream_encode( opus_int32 max_data_bytes ) { + int channels = st->layout.nb_streams + st->layout.nb_coupled_streams; return opus_multistream_encode_native(st, opus_copy_channel_in_short, - pcm, frame_size, data, max_data_bytes, 16, downmix_int); + pcm, frame_size, data, max_data_bytes, 16, downmix_int, pcm+channels*st->analysis.analysis_offset); } #endif @@ -562,7 +577,7 @@ int opus_multistream_encoder_ctl(OpusMSEncoder *st, int request, ...) *value = (OpusEncoder*)ptr; } break; - case OPUS_SET_EXPERT_VARIABLE_DURATION_REQUEST: + case OPUS_SET_EXPERT_FRAME_DURATION_REQUEST: { opus_int32 value = va_arg(ap, opus_int32); if (value<0 || value>1) @@ -570,7 +585,7 @@ int opus_multistream_encoder_ctl(OpusMSEncoder *st, int request, ...) 
st->variable_duration = value; } break; - case OPUS_GET_EXPERT_VARIABLE_DURATION_REQUEST: + case OPUS_GET_EXPERT_FRAME_DURATION_REQUEST: { opus_int32 *value = va_arg(ap, opus_int32*); *value = st->variable_duration; diff --git a/src/opus_private.h b/src/opus_private.h index 33a982e57..1da5748bd 100644 --- a/src/opus_private.h +++ b/src/opus_private.h @@ -31,6 +31,7 @@ #include "arch.h" #include "opus.h" +#include "celt.h" struct OpusRepacketizer { unsigned char toc; @@ -82,17 +83,23 @@ int get_mono_channel(const ChannelLayout *layout, int stream_id, int prev); #define OPUS_SET_FORCE_MODE(x) OPUS_SET_FORCE_MODE_REQUEST, __opus_check_int(x) typedef void (*downmix_func)(const void *, float *, int, int, int); -void downmix_float(const void *_x, float *sub, int subframe, int i, int C); -void downmix_int(const void *_x, float *sub, int subframe, int i, int C); +void downmix_float(const void *_x, float *sub, int subframe, int offset, int C); +void downmix_int(const void *_x, float *sub, int subframe, int offset, int C); int optimize_framesize(const opus_val16 *x, int len, int C, opus_int32 Fs, int bitrate, opus_val16 tonality, opus_val32 *mem, int buffering, - void (*downmix)(const void *, float *, int, int, int)); + downmix_func downmix); int encode_size(int size, unsigned char *data); +opus_int32 frame_size_select(opus_int32 frame_size, int variable_duration, opus_int32 Fs); + opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_size, - unsigned char *data, opus_int32 out_data_bytes, int lsb_depth); + unsigned char *data, opus_int32 out_data_bytes, int lsb_depth +#ifndef FIXED_POINT + , AnalysisInfo *analysis_info +#endif + ); int opus_decode_native(OpusDecoder *st, const unsigned char *data, opus_int32 len, opus_val16 *pcm, int frame_size, int decode_fec, int self_delimited, int *packet_offset); -- GitLab