diff --git a/celt/celt_encoder.c b/celt/celt_encoder.c index 96a9f6fe2bbf7ade038e5015d2ea0ae27e8d6375..d93b15a6ebce7e723f5dcec62af72a5a1656af1a 100644 --- a/celt/celt_encoder.c +++ b/celt/celt_encoder.c @@ -1329,7 +1329,7 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, prefilter_tapset = st->tapset_decision; pf_on = run_prefilter(st, in, prefilter_mem, CC, N, prefilter_tapset, &pitch_index, &gain1, &qg, enabled, nbAvailableBytes); - if ((gain1 > QCONST16(.4f,15) || st->prefilter_gain > QCONST16(.4f,15)) && st->analysis.tonality > .3 + if ((gain1 > QCONST16(.4f,15) || st->prefilter_gain > QCONST16(.4f,15)) && (!st->analysis.valid || st->analysis.tonality > .3) && (pitch_index > 1.26*st->prefilter_period || pitch_index < .79*st->prefilter_period)) pitch_change = 1; if (pf_on==0) @@ -1353,15 +1353,17 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm, isTransient = 0; shortBlocks = 0; + if (st->complexity >= 1) + { + isTransient = transient_analysis(in, N+st->overlap, CC, + &tf_estimate, &tf_chan); + } if (LM>0 && ec_tell(enc)+3<=total_bits) { - if (st->complexity >= 1) - { - isTransient = transient_analysis(in, N+st->overlap, CC, - &tf_estimate, &tf_chan); - if (isTransient) - shortBlocks = M; - } + if (isTransient) + shortBlocks = M; + } else { + isTransient = 0; } ALLOC(freq, CC*N, celt_sig); /**< Interleaved signal MDCTs */ diff --git a/include/opus_defines.h b/include/opus_defines.h index cdde061a567323fd5ebb75a6caf1e279a88f2409..203144a77701d2c6213cd9ea48050e10714dadb5 100644 --- a/include/opus_defines.h +++ b/include/opus_defines.h @@ -148,8 +148,9 @@ extern "C" { #define OPUS_GET_GAIN_REQUEST 4045 /* Should have been 4035 */ #define OPUS_SET_LSB_DEPTH_REQUEST 4036 #define OPUS_GET_LSB_DEPTH_REQUEST 4037 - #define OPUS_GET_LAST_PACKET_DURATION_REQUEST 4039 +#define OPUS_SET_EXPERT_FRAME_DURATION_REQUEST 4040 +#define OPUS_GET_EXPERT_FRAME_DURATION_REQUEST 4041 /* Don't use 4045, it's already 
taken by OPUS_GET_GAIN_REQUEST */ @@ -185,6 +186,15 @@ extern "C" { #define OPUS_BANDWIDTH_SUPERWIDEBAND 1104 /**<12 kHz bandpass @hideinitializer*/ #define OPUS_BANDWIDTH_FULLBAND 1105 /**<20 kHz bandpass @hideinitializer*/ +#define OPUS_FRAMESIZE_ARG 5000 /**< Select frame size from the argument (default) */ +#define OPUS_FRAMESIZE_2_5_MS 5001 /**< Use 2.5 ms frames */ +#define OPUS_FRAMESIZE_5_MS 5002 /**< Use 5 ms frames */ +#define OPUS_FRAMESIZE_10_MS 5003 /**< Use 10 ms frames */ +#define OPUS_FRAMESIZE_20_MS 5004 /**< Use 20 ms frames */ +#define OPUS_FRAMESIZE_40_MS 5005 /**< Use 40 ms frames */ +#define OPUS_FRAMESIZE_60_MS 5006 /**< Use 60 ms frames */ +#define OPUS_FRAMESIZE_VARIABLE 5010 /**< Optimize the frame size dynamically */ + /**@}*/ @@ -525,6 +535,32 @@ extern "C" { * @param[out] x <tt>opus_int32 *</tt>: Number of samples (at current sampling rate). * @hideinitializer */ #define OPUS_GET_LAST_PACKET_DURATION(x) OPUS_GET_LAST_PACKET_DURATION_REQUEST, __opus_check_int_ptr(x) + +/** Configures the encoder's use of variable duration frames. + * When enabled, the encoder is free to use a shorter frame size than the one + * requested in the opus_encode*() call. It is then the user's responsibility + * to verify how much audio was encoded by checking the ToC byte of the encoded + * packet. The part of the audio that was not encoded needs to be resent to the + * encoder for the next call. Do not use this option unless you <b>really</b> + * know what you are doing. + * @see OPUS_GET_EXPERT_FRAME_DURATION + * @param[in] x <tt>opus_int32</tt>: Allowed values: + * <dl> + * <dt>0</dt><dd>Disable variable duration (default).</dd> + * <dt>1</dt><dd>Enable variable duration.</dd> + * </dl> + * @hideinitializer */ +#define OPUS_SET_EXPERT_FRAME_DURATION(x) OPUS_SET_EXPERT_FRAME_DURATION_REQUEST, __opus_check_int(x) +/** Gets the encoder's configured use of variable duration frames. 
+ * @see OPUS_SET_EXPERT_FRAME_DURATION + * @param[out] x <tt>opus_int32 *</tt>: Returns one of the following values: + * <dl> + * <dt>0</dt><dd>variable duration disabled (default).</dd> + * <dt>1</dt><dd>variable duration enabled.</dd> + * </dl> + * @hideinitializer */ +#define OPUS_GET_EXPERT_FRAME_DURATION(x) OPUS_GET_EXPERT_FRAME_DURATION_REQUEST, __opus_check_int_ptr(x) + /**@}*/ /** @defgroup opus_genericctls Generic CTLs diff --git a/src/analysis.c b/src/analysis.c index 22a8fa7913e87fdcfa9dcf926c09a656ee7cbb36..14b2246c908c2cb5457627d151b1fc75b68f9946 100644 --- a/src/analysis.c +++ b/src/analysis.c @@ -139,10 +139,56 @@ static inline float fast_atan2f(float y, float x) { } } -void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEncoder *celt_enc, const opus_val16 *x, int C, int lsb_depth) +void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len) +{ + int pos; + int curr_lookahead; + float psum; + int i; + + pos = tonal->read_pos; + curr_lookahead = tonal->write_pos-tonal->read_pos; + if (curr_lookahead<0) + curr_lookahead += DETECT_SIZE; + + if (len > 480 && pos != tonal->write_pos) + { + pos++; + if (pos==DETECT_SIZE) + pos=0; + } + if (pos == tonal->write_pos) + pos--; + if (pos<0) + pos = DETECT_SIZE-1; + OPUS_COPY(info_out, &tonal->info[pos], 1); + tonal->read_subframe += len/120; + while (tonal->read_subframe>=4) + { + tonal->read_subframe -= 4; + tonal->read_pos++; + } + if (tonal->read_pos>=DETECT_SIZE) + tonal->read_pos-=DETECT_SIZE; + + /* Compensate for the delay in the features themselves. 
+ FIXME: Need a better estimate the 10 I just made up */ + curr_lookahead = IMAX(curr_lookahead-10, 0); + + psum=0; + for (i=0;i<DETECT_SIZE-curr_lookahead;i++) + psum += tonal->pmusic[i]; + for (;i<DETECT_SIZE;i++) + psum += tonal->pspeech[i]; + psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence; + /*printf("%f %f\n", psum, info_out->music_prob);*/ + + info_out->music_prob = psum; +} + +void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info_out, const CELTMode *celt_mode, const void *x, int len, int offset, int C, int lsb_depth, downmix_func downmix) { int i, b; - const CELTMode *mode; const kiss_fft_state *kfft; kiss_fft_cpx in[480], out[480]; int N = 480, N2=240; @@ -163,14 +209,15 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEnc float slope=0; float frame_stationarity; float relativeE; - float frame_prob; + float frame_probs[2]; float alpha, alphaE, alphaE2; float frame_loudness; float bandwidth_mask; int bandwidth=0; float maxE = 0; float noise_floor; - celt_encoder_ctl(celt_enc, CELT_GET_MODE(&mode)); + int remaining; + AnalysisInfo *info; tonal->last_transition++; alpha = 1.f/IMIN(20, 1+tonal->count); @@ -179,27 +226,32 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEnc if (tonal->count<4) tonal->music_prob = .5; - kfft = mode->mdct.kfft[0]; - if (C==1) + kfft = celt_mode->mdct.kfft[0]; + if (tonal->count==0) + tonal->mem_fill = 240; + downmix(x, &tonal->inmem[tonal->mem_fill], IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, C); + if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE) { - for (i=0;i<N2;i++) - { - float w = analysis_window[i]; - in[i].r = MULT16_16(w, x[i]); - in[i].i = MULT16_16(w, x[N-N2+i]); - in[N-i-1].r = MULT16_16(w, x[N-i-1]); - in[N-i-1].i = MULT16_16(w, x[2*N-N2-i-1]); - } - } else { - for (i=0;i<N2;i++) - { - float w = analysis_window[i]; - in[i].r = MULT16_16(w, x[2*i]+x[2*i+1]); - in[i].i = MULT16_16(w, 
x[2*(N-N2+i)]+x[2*(N-N2+i)+1]); - in[N-i-1].r = MULT16_16(w, x[2*(N-i-1)]+x[2*(N-i-1)+1]); - in[N-i-1].i = MULT16_16(w, x[2*(2*N-N2-i-1)]+x[2*(2*N-N2-i-1)+1]); - } + tonal->mem_fill += len; + /* Don't have enough to update the analysis */ + return; } + info = &tonal->info[tonal->write_pos++]; + if (tonal->write_pos>=DETECT_SIZE) + tonal->write_pos-=DETECT_SIZE; + + for (i=0;i<N2;i++) + { + float w = analysis_window[i]; + in[i].r = MULT16_16(w, tonal->inmem[i]); + in[i].i = MULT16_16(w, tonal->inmem[N2+i]); + in[N-i-1].r = MULT16_16(w, tonal->inmem[N-i-1]); + in[N-i-1].i = MULT16_16(w, tonal->inmem[N+N2-i-1]); + } + OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240); + remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill); + downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, C); + tonal->mem_fill = 240 + remaining; opus_fft(kfft, in, out); for (i=1;i<N2;i++) @@ -417,27 +469,91 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEnc features[24] = tonal->lowECount; #ifndef FIXED_POINT - mlp_process(&net, features, &frame_prob); - frame_prob = .5f*(frame_prob+1); + mlp_process(&net, features, frame_probs); + frame_probs[0] = .5f*(frame_probs[0]+1); /* Curve fitting between the MLP probability and the actual probability */ - frame_prob = .01f + 1.21f*frame_prob*frame_prob - .23f*(float)pow(frame_prob, 10); + frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)pow(frame_probs[0], 10); + frame_probs[1] = .5*frame_probs[1]+.5; + frame_probs[0] = frame_probs[1]*frame_probs[0] + (1-frame_probs[1])*.5; - /*printf("%f\n", frame_prob);*/ + /*printf("%f %f ", frame_probs[0], frame_probs[1]);*/ { float tau, beta; float p0, p1; - float max_certainty; /* One transition every 3 minutes */ - tau = .00005f; - beta = .1f; - max_certainty = .01f+1.f/(20.f+.5f*tonal->last_transition); + tau = .00005f*frame_probs[1]; + beta = .05f; + if (1) { + /* Adapt beta based on how "unexpected" the new 
prob is */ + float p, q; + p = MAX16(.05f,MIN16(.95f,frame_probs[0])); + q = MAX16(.05f,MIN16(.95f,tonal->music_prob)); + beta = .01+.05*ABS16(p-q)/(p*(1-q)+q*(1-p)); + } p0 = (1-tonal->music_prob)*(1-tau) + tonal->music_prob *tau; p1 = tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau; - p0 *= (float)pow(1-frame_prob, beta); - p1 *= (float)pow(frame_prob, beta); - tonal->music_prob = MAX16(max_certainty, MIN16(1-max_certainty, p1/(p0+p1))); + p0 *= (float)pow(1-frame_probs[0], beta); + p1 *= (float)pow(frame_probs[0], beta); + tonal->music_prob = p1/(p0+p1); info->music_prob = tonal->music_prob; - /*printf("%f %f\n", frame_prob, info->music_prob);*/ + + float psum=1e-20; + float speech0 = (float)pow(1-frame_probs[0], beta); + float music0 = (float)pow(frame_probs[0], beta); + if (tonal->count==1) + { + tonal->pspeech[0]=.5; + tonal->pmusic [0]=.5; + } + float s0, m0; + s0 = tonal->pspeech[0] + tonal->pspeech[1]; + m0 = tonal->pmusic [0] + tonal->pmusic [1]; + tonal->pspeech[0] = s0*(1-tau)*speech0; + tonal->pmusic [0] = m0*(1-tau)*music0; + for (i=1;i<DETECT_SIZE-1;i++) + { + tonal->pspeech[i] = tonal->pspeech[i+1]*speech0; + tonal->pmusic [i] = tonal->pmusic [i+1]*music0; + } + tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0; + tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0; + + for (i=0;i<DETECT_SIZE;i++) + psum += tonal->pspeech[i] + tonal->pmusic[i]; + psum = 1.f/psum; + for (i=0;i<DETECT_SIZE;i++) + { + tonal->pspeech[i] *= psum; + tonal->pmusic [i] *= psum; + } + psum = tonal->pmusic[0]; + for (i=1;i<DETECT_SIZE;i++) + psum += tonal->pspeech[i]; + + /* Estimate our confidence in the speech/music decisions */ + if (frame_probs[1]>.75) + { + if (tonal->music_prob>.9) + { + float adapt; + adapt = 1.f/(++tonal->music_confidence_count); + tonal->music_confidence_count = IMIN(tonal->music_confidence_count, 500); + tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->music_confidence); + } + if (tonal->music_prob<.1) + { + float adapt; + adapt = 
1.f/(++tonal->speech_confidence_count); + tonal->speech_confidence_count = IMIN(tonal->speech_confidence_count, 500); + tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->speech_confidence); + } + } else { + if (tonal->music_confidence_count==0) + tonal->music_confidence = .9; + if (tonal->speech_confidence_count==0) + tonal->speech_confidence = .1; + } + psum = MAX16(tonal->speech_confidence, MIN16(tonal->music_confidence, psum)); } if (tonal->last_music != (tonal->music_prob>.5f)) tonal->last_transition=0; @@ -465,4 +581,48 @@ void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, CELTEnc /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/ info->noisiness = frame_noisiness; info->valid = 1; + if (info_out!=NULL) + OPUS_COPY(info_out, info, 1); +} + +int run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *pcm, + const void *analysis_pcm, int frame_size, int variable_duration, int C, opus_int32 Fs, int bitrate_bps, + int delay_compensation, int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info) +{ + int offset; + int pcm_len; + + /* Avoid overflow/wrap-around of the analysis buffer */ + frame_size = IMIN((DETECT_SIZE-5)*Fs/100, frame_size); + + pcm_len = frame_size - analysis->analysis_offset; + offset = 0; + do { + tonality_analysis(analysis, NULL, celt_mode, analysis_pcm, IMIN(480, pcm_len), offset, C, lsb_depth, downmix); + offset += 480; + pcm_len -= 480; + } while (pcm_len>0); + analysis->analysis_offset = frame_size; + + if (variable_duration == OPUS_FRAMESIZE_VARIABLE && frame_size >= Fs/200) + { + int LM = 3; + LM = optimize_framesize(pcm, frame_size, C, Fs, bitrate_bps, + analysis->prev_tonality, analysis->subframe_mem, delay_compensation, downmix); + while ((Fs/400<<LM)>frame_size) + LM--; + frame_size = (Fs/400<<LM); + } else { + frame_size = frame_size_select(frame_size, variable_duration, Fs); + } + if (frame_size<0) + return -1; + analysis->analysis_offset -= 
frame_size; + + /* Only perform analysis up to 20-ms frames. Longer ones will be split if + they're in CELT-only mode. */ + analysis_info->valid = 0; + tonality_get_info(analysis, analysis_info, frame_size); + + return frame_size; } diff --git a/src/analysis.h b/src/analysis.h index bf8ad40a72d0e396e07b74356b64317f08245c5f..7b17118cf0be99e4ca33ed448d36a70e390a1102 100644 --- a/src/analysis.h +++ b/src/analysis.h @@ -28,18 +28,27 @@ #ifndef ANALYSIS_H #define ANALYSIS_H +#include "celt.h" +#include "opus_private.h" + #define NB_FRAMES 8 #define NB_TBANDS 18 #define NB_TOT_BANDS 21 +#define ANALYSIS_BUF_SIZE 720 /* 15 ms at 48 kHz */ + +#define DETECT_SIZE 200 typedef struct { float angle[240]; float d_angle[240]; float d2_angle[240]; + float inmem[ANALYSIS_BUF_SIZE]; + int mem_fill; /* number of usable samples in the buffer */ float prev_band_tonality[NB_TBANDS]; float prev_tonality; float E[NB_FRAMES][NB_TBANDS]; - float lowE[NB_TBANDS], highE[NB_TBANDS]; + float lowE[NB_TBANDS]; + float highE[NB_TBANDS]; float meanE[NB_TOT_BANDS]; float mem[32]; float cmean[8]; @@ -52,9 +61,27 @@ typedef struct { int last_transition; int count; int opus_bandwidth; + opus_val32 subframe_mem[3]; + int analysis_offset; + float pspeech[DETECT_SIZE]; + float pmusic[DETECT_SIZE]; + float speech_confidence; + float music_confidence; + int speech_confidence_count; + int music_confidence_count; + int write_pos; + int read_pos; + int read_subframe; + AnalysisInfo info[DETECT_SIZE]; } TonalityAnalysisState; void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info, - CELTEncoder *celt_enc, const opus_val16 *x, int C, int lsb_depth); + const CELTMode *celt_mode, const void *x, int len, int offset, int C, int lsb_depth, downmix_func downmix); + +void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len); + +int run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *pcm, + const void *analysis_pcm, int frame_size, int 
variable_duration, int C, opus_int32 Fs, int bitrate_bps, + int delay_compensation, int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info); #endif diff --git a/src/mlp_data.c b/src/mlp_data.c index 5c13ca408df136c107d06e5773bea7b41e5032d4..9085b85faa3f95754258f850cb25f28c751c5313 100644 --- a/src/mlp_data.c +++ b/src/mlp_data.c @@ -3,74 +3,103 @@ #include "mlp.h" -/* RMS error was 0.179835, seed was 1322103961 */ +/* RMS error was 0.138320, seed was 1361535663 */ -static const float weights[271] = { +static const float weights[422] = { /* hidden layer */ -1.55597f, -0.0739792f, -0.0646761f, -0.099531f, -0.0794943f, -0.0180174f, -0.0391354f, 0.0508224f, -0.0160169f, -0.0773263f, --0.0300002f, -0.0865361f, 0.124477f, -0.28648f, -0.0860702f, --0.518949f, -0.0873341f, -0.235393f, -0.907833f, -0.383573f, -0.535388f, -0.57944f, 0.98116f, 0.8482f, 1.12426f, --3.23721f, -0.647072f, -0.0265139f, 0.0711052f, -0.00125666f, --0.0396181f, -0.44282f, -0.510495f, -0.201865f, 0.0134336f, --0.167205f, -0.155406f, 0.00041678f, -0.00468705f, -0.0233224f, -0.264279f, -0.301375f, 0.00234895f, 0.0144741f, -0.137535f, -0.200323f, 0.0192027f, 3.19818f, 2.03495f, 0.705517f, --4.6025f, -0.11485f, -0.792716f, 0.150714f, 0.10608f, -0.240633f, 0.0690698f, 0.0695297f, 0.124819f, 0.0501433f, -0.0460952f, 0.147639f, 0.10327f, 0.158007f, 0.113714f, -0.0276191f, 0.0680749f, -0.130012f, 0.0796126f, 0.133067f, -0.51495f, 0.747578f, -0.128742f, 5.98112f, -1.16698f, --0.276492f, -1.73549f, -3.90234f, 2.01489f, -0.040118f, --0.113002f, -0.146751f, -0.113569f, 0.0534873f, 0.0989832f, -0.0872875f, 0.049266f, 0.0367557f, -0.00889148f, -0.0648461f, --0.00190352f, 0.0143773f, 0.0259364f, -0.0592133f, -0.0672924f, -0.1399f, -0.0987886f, -0.347402f, 0.101326f, -0.0680876f, -0.469186f, 0.246922f, 10.4017f, 3.44846f, -0.662725f, --0.0328208f, -0.0561274f, -0.0167744f, 0.00044282f, -0.0457645f, --0.0408314f, -0.013113f, -0.0373873f, -0.0474122f, -0.0273745f, --0.0308505f, 0.000582959f, -0.0421135f, 
0.464859f, 0.196842f, -0.320538f, 0.0435528f, -0.200168f, 0.266475f, -0.0853727f, -1.20397f, 0.711542f, -1.04397f, -1.47759f, 1.26768f, -0.446958f, 0.266477f, -0.30802f, 0.28431f, -0.118541f, -0.00836345f, 0.0689026f, -0.0137996f, -0.0395417f, 0.26982f, --0.206255f, 0.16066f, 0.114757f, 0.359587f, -0.106503f, --0.0948534f, 0.175358f, -0.122966f, -0.0056675f, 0.483848f, --0.134916f, -0.427567f, -0.140172f, -1.0866f, -2.73921f, -0.549843f, 0.17685f, 0.0010675f, -0.00137386f, 0.0884424f, --0.0698736f, -0.00174136f, 0.0718775f, -0.0396849f, 0.0448056f, -0.0577853f, -0.0372353f, 0.134599f, 0.0260656f, 0.140322f, -0.22704f, -0.020568f, -0.0142424f, -0.21723f, -0.997704f, --0.884573f, -0.163495f, 2.33617f, 0.224142f, 0.19635f, --0.957387f, 0.144678f, 1.47035f, -0.00700498f, -0.0472309f, --0.0137848f, -0.0189145f, 0.00856479f, 0.0316965f, 0.00613373f, -0.00209807f, 0.00270964f, -0.0490206f, 0.0105712f, -0.0465045f, --0.0381532f, -0.0985268f, -0.108297f, 0.0146409f, -0.0040718f, --0.0698572f, -0.380568f, -0.230479f, 3.98917f, 0.457652f, --1.02355f, -7.4435f, -0.475314f, 1.61743f, 0.0254017f, --0.00791293f, 0.047217f, 0.0220995f, -0.0304311f, 0.0052168f, --0.0404054f, -0.0230293f, 0.00169229f, -0.0138178f, 0.0043137f, --0.0598088f, -0.133601f, 0.0555138f, -0.177358f, -0.159856f, --0.137281f, 0.108051f, -0.305973f, 0.393775f, 0.0747287f, -0.783993f, -0.875086f, 1.06862f, 0.340519f, -0.352681f, --0.0830912f, -0.100017f, 0.0729085f, -0.00829403f, 0.027489f, --0.0779597f, 0.082286f, -0.164181f, -0.41519f, 0.00282335f, --0.29573f, 0.125571f, 0.726935f, 0.392137f, 0.491348f, -0.0723196f, -0.0259758f, -0.0636332f, -0.452384f, -0.000225974f, --2.34001f, 2.45211f, -0.544628f, 5.62944f, -3.44507f, +-0.0941125f, -0.302976f, -0.603555f, -0.19393f, -0.185983f, +-0.601617f, -0.0465317f, -0.114563f, -0.103599f, -0.618938f, +-0.317859f, -0.169949f, -0.0702885f, 0.148065f, 0.409524f, +0.548432f, 0.367649f, -0.494393f, 0.764306f, -1.83957f, +0.170849f, 12.786f, -1.08848f, -1.27284f, 
-16.2606f, +24.1773f, -5.57454f, -0.17276f, -0.163388f, -0.224421f, +-0.0948944f, -0.0728695f, -0.26557f, -0.100283f, -0.0515459f, +-0.146142f, -0.120674f, -0.180655f, 0.12857f, 0.442138f, +-0.493735f, 0.167767f, 0.206699f, -0.197567f, 0.417999f, +1.50364f, -0.773341f, -10.0401f, 0.401872f, 2.97966f, +15.2165f, -1.88905f, -1.19254f, 0.0285397f, -0.00405139f, +0.0707565f, 0.00825699f, -0.0927269f, -0.010393f, -0.00428882f, +-0.00489743f, -0.0709731f, -0.00255992f, 0.0395619f, 0.226424f, +0.0325231f, 0.162175f, -0.100118f, 0.485789f, 0.12697f, +0.285937f, 0.0155637f, 0.10546f, 3.05558f, 1.15059f, +-1.00904f, -1.83088f, 3.31766f, -3.42516f, -0.119135f, +-0.0405654f, 0.00690068f, 0.0179877f, -0.0382487f, 0.00597941f, +-0.0183611f, 0.00190395f, -0.144322f, -0.0435671f, 0.000990594f, +0.221087f, 0.142405f, 0.484066f, 0.404395f, 0.511955f, +-0.237255f, 0.241742f, 0.35045f, -0.699428f, 10.3993f, +2.6507f, -2.43459f, -4.18838f, 1.05928f, 1.71067f, +0.00667811f, -0.0721335f, -0.0397346f, 0.0362704f, -0.11496f, +-0.0235776f, 0.0082161f, -0.0141741f, -0.0329699f, -0.0354253f, +0.00277404f, -0.290654f, -1.14767f, -0.319157f, -0.686544f, +0.36897f, 0.478899f, 0.182579f, -0.411069f, 0.881104f, +-4.60683f, 1.4697f, 0.335845f, -1.81905f, -30.1699f, +5.55225f, 0.0019508f, -0.123576f, -0.0727332f, -0.0641597f, +-0.0534458f, -0.108166f, -0.0937368f, -0.0697883f, -0.0275475f, +-0.192309f, -0.110074f, 0.285375f, -0.405597f, 0.0926724f, +-0.287881f, -0.851193f, -0.099493f, -0.233764f, -1.2852f, +1.13611f, 3.12168f, -0.0699f, -1.86216f, 2.65292f, +-7.31036f, 2.44776f, -0.00111802f, -0.0632786f, -0.0376296f, +-0.149851f, 0.142963f, 0.184368f, 0.123433f, 0.0756158f, +0.117312f, 0.0933395f, 0.0692163f, 0.0842592f, 0.0704683f, +0.0589963f, 0.0942205f, -0.448862f, 0.0262677f, 0.270352f, +-0.262317f, 0.172586f, 2.00227f, -0.159216f, 0.038422f, +10.2073f, 4.15536f, -2.3407f, -0.0550265f, 0.00964792f, +-0.141336f, 0.0274501f, 0.0343921f, -0.0487428f, 0.0950172f, +-0.00775017f, -0.0372492f, 
-0.00548121f, -0.0663695f, 0.0960506f, +-0.200008f, -0.0412827f, 0.58728f, 0.0515787f, 0.337254f, +0.855024f, 0.668371f, -0.114904f, -3.62962f, -0.467477f, +-0.215472f, 2.61537f, 0.406117f, -1.36373f, 0.0425394f, +0.12208f, 0.0934502f, 0.123055f, 0.0340935f, -0.142466f, +0.035037f, -0.0490666f, 0.0733208f, 0.0576672f, 0.123984f, +-0.0517194f, -0.253018f, 0.590565f, 0.145849f, 0.315185f, +0.221534f, -0.149081f, 0.216161f, -0.349575f, 24.5664f, +-0.994196f, 0.614289f, -18.7905f, -2.83277f, -0.716801f, +-0.347201f, 0.479515f, -0.246027f, 0.0758683f, 0.137293f, +-0.17781f, 0.118751f, -0.00108329f, -0.237334f, 0.355732f, +-0.12991f, -0.0547627f, -0.318576f, -0.325524f, 0.180494f, +-0.0625604f, 0.141219f, 0.344064f, 0.37658f, -0.591772f, +5.8427f, -0.38075f, 0.221894f, -1.41934f, -1.87943e+06f, +1.34114f, 0.0283355f, -0.0447856f, -0.0211466f, -0.0256927f, +0.0139618f, 0.0207934f, -0.0107666f, 0.0110969f, 0.0586069f, +-0.0253545f, -0.0328433f, 0.11872f, -0.216943f, 0.145748f, +0.119808f, -0.0915211f, -0.120647f, -0.0787719f, -0.143644f, +-0.595116f, -1.152f, -1.25335f, -1.17092f, 4.34023f, +-975268.f, -1.37033f, -0.0401123f, 0.210602f, -0.136656f, +0.135962f, -0.0523293f, 0.0444604f, 0.0143928f, 0.00412666f, +-0.0193003f, 0.218452f, -0.110204f, -2.02563f, 0.918238f, +-2.45362f, 1.19542f, -0.061362f, -1.92243f, 0.308111f, +0.49764f, 0.912356f, 0.209272f, -2.34525f, 2.19326f, +-6.47121f, 1.69771f, -0.725123f, 0.0118929f, 0.0377944f, +0.0554003f, 0.0226452f, -0.0704421f, -0.0300309f, 0.0122978f, +-0.0041782f, -0.0686612f, 0.0313115f, 0.039111f, 0.364111f, +-0.0945548f, 0.0229876f, -0.17414f, 0.329795f, 0.114714f, +0.30022f, 0.106997f, 0.132355f, 5.79932f, 0.908058f, +-0.905324f, -3.3561f, 0.190647f, 0.184211f, -0.673648f, +0.231807f, -0.0586222f, 0.230752f, -0.438277f, 0.245857f, +-0.17215f, 0.0876383f, -0.720512f, 0.162515f, 0.0170571f, +0.101781f, 0.388477f, 1.32931f, 1.08548f, -0.936301f, +-2.36958f, -6.71988f, -3.44376f, 2.13818f, 14.2318f, +4.91459f, -3.09052f, 
-9.69191f, -0.768234f, 1.79604f, +0.0549653f, 0.163399f, 0.0797025f, 0.0343933f, -0.0555876f, +-0.00505673f, 0.0187258f, 0.0326628f, 0.0231486f, 0.15573f, +0.0476223f, -0.254824f, 1.60155f, -0.801221f, 2.55496f, +0.737629f, -1.36249f, -0.695463f, -2.44301f, -1.73188f, +3.95279f, 1.89068f, 0.486087f, -11.3343f, 3.9416e+06f, /* output layer */ --3.13835f, 0.994751f, 0.444901f, 1.59518f, 1.23665f, -3.37012f, -1.34606f, 1.99131f, 1.33476f, 1.3885f, -1.12559f, }; +-0.381439, 0.12115, -0.906927, 2.93878, 1.6388, +0.882811, 0.874344, 1.21726, -0.874545, 0.321706, +0.785055, 0.946558, -0.575066, -3.46553, 0.884905, +0.0924047, -9.90712, 0.391338, 0.160103, -2.04954, +4.1455, 0.0684029, -0.144761, -0.285282, 0.379244, +-1.1584, -0.0277241, -9.85, -4.82386, 3.71333, +3.87308, 3.52558, }; -static const int topo[3] = {25, 10, 1}; +static const int topo[3] = {25, 15, 2}; const MLP net = { - 3, - topo, - weights + 3, + topo, + weights }; - diff --git a/src/mlp_train.c b/src/mlp_train.c index 5fbbff082f96d631dcfe430dc8bf98755cb8503e..2e9568ba4e15b7174716bc3644899fba56064d62 100644 --- a/src/mlp_train.c +++ b/src/mlp_train.c @@ -106,6 +106,7 @@ MLPTrain * mlp_init(int *topo, int nbLayers, float *inputs, float *outputs, int } #define MAX_NEURONS 100 +#define MAX_OUT 10 double compute_gradient(MLPTrain *net, float *inputs, float *outputs, int nbSamples, double *W0_grad, double *W1_grad, double *error_rate) { @@ -120,7 +121,8 @@ double compute_gradient(MLPTrain *net, float *inputs, float *outputs, int nbSamp double netOut[MAX_NEURONS]; double error[MAX_NEURONS]; - *error_rate = 0; + for (i=0;i<outDim;i++) + error_rate[i] = 0; topo = net->topo; inDim = net->topo[0]; hiddenDim = net->topo[1]; @@ -153,7 +155,7 @@ double compute_gradient(MLPTrain *net, float *inputs, float *outputs, int nbSamp netOut[i] = tansig_approx(sum); error[i] = out[i] - netOut[i]; rms += error[i]*error[i]; - *error_rate += fabs(error[i])>1; + error_rate[i] += fabs(error[i])>1; /*error[i] = 
error[i]/(1+fabs(error[i]));*/ } /* Back-propagate error */ @@ -194,7 +196,7 @@ struct GradientArg { double *W0_grad; double *W1_grad; double rms; - double error_rate; + double error_rate[MAX_OUT]; }; void *gradient_thread_process(void *_arg) @@ -213,7 +215,7 @@ void *gradient_thread_process(void *_arg) sem_wait(&sem_begin[arg->id]); if (arg->done) break; - arg->rms = compute_gradient(arg->net, arg->inputs, arg->outputs, arg->nbSamples, arg->W0_grad, arg->W1_grad, &arg->error_rate); + arg->rms = compute_gradient(arg->net, arg->inputs, arg->outputs, arg->nbSamples, arg->W0_grad, arg->W1_grad, arg->error_rate); sem_post(&sem_end[arg->id]); } fprintf(stderr, "done\n"); @@ -295,7 +297,7 @@ float mlp_train_backprop(MLPTrain *net, float *inputs, float *outputs, int nbSam for (e=0;e<nbEpoch;e++) { double rms=0; - double error_rate = 0; + double error_rate[2] = {0,0}; for (i=0;i<NB_THREADS;i++) { sem_post(&sem_begin[i]); @@ -306,7 +308,8 @@ float mlp_train_backprop(MLPTrain *net, float *inputs, float *outputs, int nbSam { sem_wait(&sem_end[i]); rms += args[i].rms; - error_rate += args[i].error_rate; + error_rate[0] += args[i].error_rate[0]; + error_rate[1] += args[i].error_rate[1]; for (j=0;j<W0_size;j++) W0_grad[j] += args[i].W0_grad[j]; for (j=0;j<W1_size;j++) @@ -315,8 +318,9 @@ float mlp_train_backprop(MLPTrain *net, float *inputs, float *outputs, int nbSam float mean_rate = 0, min_rate = 1e10; rms = (rms/(outDim*nbSamples)); - error_rate = (error_rate/(outDim*nbSamples)); - fprintf (stderr, "%f (%f %f) ", error_rate, rms, best_rms); + error_rate[0] = (error_rate[0]/(nbSamples)); + error_rate[1] = (error_rate[1]/(nbSamples)); + fprintf (stderr, "%f %f (%f %f) ", error_rate[0], error_rate[1], rms, best_rms); if (rms < best_rms) { best_rms = rms; @@ -445,6 +449,7 @@ int main(int argc, char **argv) outputs = malloc(nbOutputs*nbSamples*sizeof(*outputs)); seed = time(NULL); + /*seed = 1361480659;*/ fprintf (stderr, "Seed is %u\n", seed); srand(seed); build_tansig_table(); 
diff --git a/src/opus_demo.c b/src/opus_demo.c index 09b12a333d16560c0676078964b01cd000ba73f5..a0acb0cd22b34b83ad292fdd9c99cfa6b4b23dbc 100644 --- a/src/opus_demo.c +++ b/src/opus_demo.c @@ -53,6 +53,7 @@ void print_usage( char* argv[] ) fprintf(stderr, "-d : only runs the decoder (reads the bit-stream as input)\n" ); fprintf(stderr, "-cbr : enable constant bitrate; default: variable bitrate\n" ); fprintf(stderr, "-cvbr : enable constrained variable bitrate; default: unconstrained\n" ); + fprintf(stderr, "-variable-duration : enable frames of variable duration (experts only); default: disabled\n" ); fprintf(stderr, "-bandwidth <NB|MB|WB|SWB|FB> : audio bandwidth (from narrowband to fullband); default: sampling rate\n" ); fprintf(stderr, "-framesize <2.5|5|10|20|40|60> : frame size in ms; default: 20 \n" ); fprintf(stderr, "-max_payload <bytes> : maximum payload size in bytes, default: 1024\n" ); @@ -221,6 +222,8 @@ int main(int argc, char *argv[]) short *in, *out; int application=OPUS_APPLICATION_AUDIO; double bits=0.0, bits_max=0.0, bits_act=0.0, bits2=0.0, nrg; + double tot_samples=0; + opus_uint64 tot_in, tot_out; int bandwidth=-1; const char *bandwidth_string; int lost = 0, lost_prev = 1; @@ -239,6 +242,10 @@ int main(int argc, char *argv[]) int curr_mode=0; int curr_mode_count=0; int mode_switch_time = 48000; + int nb_encoded; + int remaining=0; + int variable_duration=OPUS_FRAMESIZE_ARG; + int delayed_decision=0; if (argc < 5 ) { @@ -246,6 +253,7 @@ int main(int argc, char *argv[]) return EXIT_FAILURE; } + tot_in=tot_out=0; fprintf(stderr, "%s\n", opus_get_version_string()); args = 1; @@ -306,7 +314,7 @@ int main(int argc, char *argv[]) forcechannels = OPUS_AUTO; use_dtx = 0; packet_loss_perc = 0; - max_frame_size = 960*6; + max_frame_size = 2*48000; curr_read=0; while( args < argc - 2 ) { @@ -374,6 +382,14 @@ int main(int argc, char *argv[]) check_encoder_option(decode_only, "-cvbr"); cvbr = 1; args++; + } else if( strcmp( argv[ args ], "-variable-duration" 
) == 0 ) { + check_encoder_option(decode_only, "-variable-duration"); + variable_duration = OPUS_FRAMESIZE_VARIABLE; + args++; + } else if( strcmp( argv[ args ], "-delayed-decision" ) == 0 ) { + check_encoder_option(decode_only, "-delayed-decision"); + delayed_decision = 1; + args++; } else if( strcmp( argv[ args ], "-dtx") == 0 ) { check_encoder_option(decode_only, "-dtx"); use_dtx = 1; @@ -499,6 +515,7 @@ int main(int argc, char *argv[]) opus_encoder_ctl(enc, OPUS_GET_LOOKAHEAD(&skip)); opus_encoder_ctl(enc, OPUS_SET_LSB_DEPTH(16)); + opus_encoder_ctl(enc, OPUS_SET_EXPERT_FRAME_DURATION(variable_duration)); } if (!encode_only) { @@ -554,6 +571,26 @@ int main(int argc, char *argv[]) if ( use_inbandfec ) { data[1] = (unsigned char*)calloc(max_payload_bytes,sizeof(char)); } + if(delayed_decision) + { + if (variable_duration!=OPUS_FRAMESIZE_VARIABLE) + { + if (frame_size==sampling_rate/400) + variable_duration = OPUS_FRAMESIZE_2_5_MS; + else if (frame_size==sampling_rate/200) + variable_duration = OPUS_FRAMESIZE_5_MS; + else if (frame_size==sampling_rate/100) + variable_duration = OPUS_FRAMESIZE_10_MS; + else if (frame_size==sampling_rate/50) + variable_duration = OPUS_FRAMESIZE_20_MS; + else if (frame_size==sampling_rate/25) + variable_duration = OPUS_FRAMESIZE_40_MS; + else + variable_duration = OPUS_FRAMESIZE_60_MS; + opus_encoder_ctl(enc, OPUS_SET_EXPERT_FRAME_DURATION(variable_duration)); + } + frame_size = 2*48000; + } while (!stop) { if (delayed_celt) @@ -617,22 +654,28 @@ int main(int argc, char *argv[]) opus_encoder_ctl(enc, OPUS_SET_FORCE_CHANNELS(mode_list[curr_mode][3])); frame_size = mode_list[curr_mode][2]; } - err = fread(fbytes, sizeof(short)*channels, frame_size, fin); + err = fread(fbytes, sizeof(short)*channels, frame_size-remaining, fin); curr_read = err; + tot_in += curr_read; for(i=0;i<curr_read*channels;i++) { opus_int32 s; s=fbytes[2*i+1]<<8|fbytes[2*i]; s=((s&0xFFFF)^0x8000)-0x8000; - in[i]=s; + in[i+remaining*channels]=s; } - if (curr_read < 
frame_size) + if (curr_read+remaining < frame_size) { - for (i=curr_read*channels;i<frame_size*channels;i++) + for (i=(curr_read+remaining)*channels;i<frame_size*channels;i++) in[i] = 0; - stop = 1; + if (encode_only || decode_only) + stop = 1; } len[toggle] = opus_encode(enc, in, frame_size, data[toggle], max_payload_bytes); + nb_encoded = opus_packet_get_samples_per_frame(data[toggle], sampling_rate)*opus_packet_get_nb_frames(data[toggle], len[toggle]); + remaining = frame_size-nb_encoded; + for(i=0;i<remaining*channels;i++) + in[i] = in[nb_encoded*channels+i]; if (sweep_bps!=0) { bitrate_bps += sweep_bps; @@ -681,6 +724,7 @@ int main(int argc, char *argv[]) fprintf(stderr, "Error writing.\n"); return EXIT_FAILURE; } + tot_samples += nb_encoded; } else { int output_samples; lost = len[toggle]==0 || (packet_loss_perc>0 && rand()%100 < packet_loss_perc); @@ -703,6 +747,11 @@ int main(int argc, char *argv[]) } if (output_samples>0) { + if (!decode_only && tot_out + output_samples > tot_in) + { + stop=1; + output_samples = tot_in-tot_out; + } if (output_samples>skip) { int i; for(i=0;i<(output_samples-skip)*channels;i++) @@ -716,6 +765,7 @@ int main(int argc, char *argv[]) fprintf(stderr, "Error writing.\n"); return EXIT_FAILURE; } + tot_out += output_samples-skip; } if (output_samples<skip) skip -= output_samples; else skip = 0; @@ -723,6 +773,7 @@ int main(int argc, char *argv[]) fprintf(stderr, "error decoding frame: %s\n", opus_strerror(output_samples)); } + tot_samples += output_samples; } } @@ -767,7 +818,7 @@ int main(int argc, char *argv[]) toggle = (toggle + use_inbandfec) & 1; } fprintf (stderr, "average bitrate: %7.3f kb/s\n", - 1e-3*bits*sampling_rate/(frame_size*(double)count)); + 1e-3*bits*sampling_rate/tot_samples); fprintf (stderr, "maximum bitrate: %7.3f kb/s\n", 1e-3*bits_max*sampling_rate/frame_size); if (!decode_only) diff --git a/src/opus_encoder.c b/src/opus_encoder.c index 
0daeb020463ca843c36ce0c0155e1cadb0a5a9ee..a8074473e05c648564e180aac8a98e89d05782e6 100644 --- a/src/opus_encoder.c +++ b/src/opus_encoder.c @@ -67,6 +67,7 @@ struct OpusEncoder { opus_int32 Fs; int use_vbr; int vbr_constraint; + int variable_duration; opus_int32 bitrate_bps; opus_int32 user_bitrate_bps; int lsb_depth; @@ -89,7 +90,8 @@ struct OpusEncoder { opus_val16 delay_buffer[MAX_ENCODER_BUFFER*2]; #ifndef FIXED_POINT TonalityAnalysisState analysis; - int detected_bandwidth; + int detected_bandwidth; + int analysis_offset; #endif opus_uint32 rangeFinal; }; @@ -213,6 +215,7 @@ int opus_encoder_init(OpusEncoder* st, opus_int32 Fs, int channels, int applicat st->voice_ratio = -1; st->encoder_buffer = st->Fs/100; st->lsb_depth = 24; + st->variable_duration = OPUS_FRAMESIZE_ARG; /* Delay compensation of 4 ms (2.5 ms for SILK's extra look-ahead + 1.5 ms for SILK resamplers and stereo prediction) */ @@ -535,8 +538,258 @@ static opus_int32 user_bitrate_to_bitrate(OpusEncoder *st, int frame_size, int m return st->user_bitrate_bps; } +#ifndef FIXED_POINT +/* Don't use more than 60 ms for the frame size analysis */ +#define MAX_DYNAMIC_FRAMESIZE 24 +/* Estimates how much the bitrate will be boosted based on the sub-frame energy */ +static float transient_boost(const float *E, const float *E_1, int LM, int maxM) +{ + int i; + int M; + float sumE=0, sumE_1=0; + float metric; + + M = IMIN(maxM, (1<<LM)+1); + for (i=0;i<M;i++) + { + sumE += E[i]; + sumE_1 += E_1[i]; + } + metric = sumE*sumE_1/(M*M); + /*if (LM==3) + printf("%f\n", metric);*/ + /*return metric>10 ? 
1 : 0;*/ + /*return MAX16(0,1-exp(-.25*(metric-2.)));*/ + return MIN16(1,sqrt(MAX16(0,.05*(metric-2)))); +} + +/* Viterbi decoding trying to find the best frame size combination using look-ahead + + State numbering: + 0: unused + 1: 2.5 ms + 2: 5 ms (#1) + 3: 5 ms (#2) + 4: 10 ms (#1) + 5: 10 ms (#2) + 6: 10 ms (#3) + 7: 10 ms (#4) + 8: 20 ms (#1) + 9: 20 ms (#2) + 10: 20 ms (#3) + 11: 20 ms (#4) + 12: 20 ms (#5) + 13: 20 ms (#6) + 14: 20 ms (#7) + 15: 20 ms (#8) +*/ +static int transient_viterbi(const float *E, const float *E_1, int N, int frame_cost, int rate) +{ + int i; + float cost[MAX_DYNAMIC_FRAMESIZE][16]; + int states[MAX_DYNAMIC_FRAMESIZE][16]; + float best_cost; + int best_state; + + /* Makes variable framesize less aggressive at lower bitrates, but I can't + find any valid theoretical justification for this (other than it seems + to help) */ + frame_cost *= 720/rate; + for (i=0;i<16;i++) + { + /* Impossible state */ + states[0][i] = -1; + cost[0][i] = 1e10; + } + for (i=0;i<4;i++) + { + cost[0][1<<i] = frame_cost + rate*(1<<i)*transient_boost(E, E_1, i, N+1); + states[0][1<<i] = i; + } + for (i=1;i<N;i++) + { + int j; + + /* Follow continuations */ + for (j=2;j<16;j++) + { + cost[i][j] = cost[i-1][j-1]; + states[i][j] = j-1; + } + + /* New frames */ + for(j=0;j<4;j++) + { + int k; + float min_cost; + float curr_cost; + states[i][1<<j] = 1; + min_cost = cost[i-1][1]; + for(k=1;k<4;k++) + { + float tmp = cost[i-1][(1<<(k+1))-1]; + if (tmp < min_cost) + { + states[i][1<<j] = (1<<(k+1))-1; + min_cost = tmp; + } + } + curr_cost = frame_cost+rate*(1<<j)*transient_boost(E+i, E_1+i, j, N-i+1); + cost[i][1<<j] = min_cost; + /* If part of the frame is outside the analysis window, only count part of the cost */ + if (N-i < (1<<j)) + cost[i][1<<j] += curr_cost*(float)(N-i)/(1<<j); + else + cost[i][1<<j] += curr_cost; + } + } + + best_state=1; + best_cost = cost[N-1][1]; + /* Find best end state (doesn't force a frame to end at N-1) */ + for (i=2;i<16;i++) + { + if
(cost[N-1][i]<best_cost) + { + best_cost = cost[N-1][i]; + best_state = i; + } + } + + /* Follow transitions back */ + for (i=N-1;i>=0;i--) + { + /*printf("%d ", best_state);*/ + best_state = states[i][best_state]; + } + /*printf("%d\n", best_state);*/ + return best_state; +} + +void downmix_float(const void *_x, float *sub, int subframe, int offset, int C) +{ + const float *x; + int c, j; + x = (const float *)_x; + for (j=0;j<subframe;j++) + sub[j] = x[(j+offset)*C]; + for (c=1;c<C;c++) + for (j=0;j<subframe;j++) + sub[j] += x[(j+offset)*C+c]; +} + +void downmix_int(const void *_x, float *sub, int subframe, int offset, int C) +{ + const opus_int16 *x; + int c, j; + x = (const opus_int16 *)_x; + for (j=0;j<subframe;j++) + sub[j] = x[(j+offset)*C]; + for (c=1;c<C;c++) + for (j=0;j<subframe;j++) + sub[j] += x[(j+offset)*C+c]; +} + +int optimize_framesize(const opus_val16 *x, int len, int C, opus_int32 Fs, + int bitrate, opus_val16 tonality, opus_val32 *mem, int buffering, + downmix_func downmix) +{ + int N; + int i; + float e[MAX_DYNAMIC_FRAMESIZE+4]; + float e_1[MAX_DYNAMIC_FRAMESIZE+3]; + float memx; + int bestLM=0; + int subframe; + int pos; + VARDECL(opus_val16, sub); + + subframe = Fs/400; + ALLOC(sub, subframe, opus_val16); + e[0]=mem[0]; + e_1[0]=1./(EPSILON+mem[0]); + if (buffering) + { + /* Consider the CELT delay when not in restricted-lowdelay */ + /* We assume the buffering is between 2.5 and 5 ms */ + int offset = 2*subframe - buffering; + celt_assert(offset>=0 && offset <= subframe); + x += C*offset; + len -= offset; + e[1]=mem[1]; + e_1[1]=1./(EPSILON+mem[1]); + e[2]=mem[2]; + e_1[2]=1./(EPSILON+mem[2]); + pos = 3; + } else { + pos=1; + } + N=IMIN(len/subframe, MAX_DYNAMIC_FRAMESIZE); + memx = x[0]; + for (i=0;i<N;i++) + { + float tmp; + float tmpx; + int j; + tmp=EPSILON; + + downmix(x, sub, subframe, i*subframe, C); + if (i==0) + memx = sub[0]; + for (j=0;j<subframe;j++) + { + tmpx = sub[j]; + tmp += (tmpx-memx)*(tmpx-memx); + memx = tmpx; + } + 
e[i+pos] = tmp; + e_1[i+pos] = 1.f/tmp; + } + /* Hack to get 20 ms working with APPLICATION_AUDIO + The real problem is that the corresponding memory needs to use 1.5 ms + from this frame and 1 ms from the next frame */ + e[i+pos] = e[i+pos-1]; + if (buffering) + N=IMIN(MAX_DYNAMIC_FRAMESIZE, N+2); + bestLM = transient_viterbi(e, e_1, N, (1.f+.5*tonality)*(40*C+40), bitrate/400); + mem[0] = e[1<<bestLM]; + if (buffering) + { + mem[1] = e[(1<<bestLM)+1]; + mem[2] = e[(1<<bestLM)+2]; + } + return bestLM; +} + +#endif + +opus_int32 frame_size_select(opus_int32 frame_size, int variable_duration, opus_int32 Fs) +{ + int new_size; + if (frame_size<Fs/400) + return -1; + if (variable_duration == OPUS_FRAMESIZE_ARG) + new_size = frame_size; + else if (variable_duration == OPUS_FRAMESIZE_VARIABLE) + new_size = Fs/50; + else if (variable_duration >= OPUS_FRAMESIZE_2_5_MS && variable_duration <= OPUS_FRAMESIZE_60_MS) + new_size = IMIN(3*Fs/50, (Fs/400)<<(variable_duration-OPUS_FRAMESIZE_2_5_MS)); + else + return -1; + if (new_size>frame_size) + return -1; + if (400*new_size!=Fs && 200*new_size!=Fs && 100*new_size!=Fs && + 50*new_size!=Fs && 25*new_size!=Fs && 50*new_size!=3*Fs) + return -1; + return new_size; +} + opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_size, - unsigned char *data, opus_int32 out_data_bytes, int lsb_depth) + unsigned char *data, opus_int32 out_data_bytes, int lsb_depth +#ifndef FIXED_POINT + , AnalysisInfo *analysis_info +#endif + ) { void *silk_enc; CELTEncoder *celt_enc; @@ -563,11 +816,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ int curr_bandwidth; opus_val16 HB_gain; opus_int32 max_data_bytes; /* Max number of bytes we're allowed to use */ - int extra_buffer, total_buffer; - int perform_analysis=0; -#ifndef FIXED_POINT - AnalysisInfo analysis_info; -#endif + int total_buffer; VARDECL(opus_val16, tmp_prefill); ALLOC_STACK; @@ -575,36 +824,37 @@ opus_int32 
opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ max_data_bytes = IMIN(1276, out_data_bytes); st->rangeFinal = 0; - if (400*frame_size != st->Fs && 200*frame_size != st->Fs && 100*frame_size != st->Fs && + if ((!st->variable_duration && 400*frame_size != st->Fs && 200*frame_size != st->Fs && 100*frame_size != st->Fs && 50*frame_size != st->Fs && 25*frame_size != st->Fs && 50*frame_size != 3*st->Fs) - { - RESTORE_STACK; - return OPUS_BAD_ARG; - } - if (max_data_bytes<=0) + || (400*frame_size < st->Fs) + || max_data_bytes<=0 + ) { RESTORE_STACK; return OPUS_BAD_ARG; } silk_enc = (char*)st+st->silk_enc_offset; celt_enc = (CELTEncoder*)((char*)st+st->celt_enc_offset); - - lsb_depth = IMIN(lsb_depth, st->lsb_depth); - -#ifndef FIXED_POINT - perform_analysis = st->silk_mode.complexity >= 7 && frame_size >= st->Fs/100 && st->Fs==48000; -#endif if (st->application == OPUS_APPLICATION_RESTRICTED_LOWDELAY) delay_compensation = 0; else delay_compensation = st->delay_compensation; - if (perform_analysis) + + lsb_depth = IMIN(lsb_depth, st->lsb_depth); + + st->voice_ratio = -1; + +#ifndef FIXED_POINT + st->detected_bandwidth = 0; + if (analysis_info->valid) { - total_buffer = IMAX(st->Fs/200, delay_compensation); - } else { - total_buffer = delay_compensation; + if (st->signal_type == OPUS_AUTO) + st->voice_ratio = (int)floor(.5+100*(1-analysis_info->music_prob)); + st->detected_bandwidth = analysis_info->opus_bandwidth; } - extra_buffer = total_buffer-delay_compensation; +#endif + + total_buffer = delay_compensation; st->bitrate_bps = user_bitrate_to_bitrate(st, frame_size, max_data_bytes); frame_rate = st->Fs/frame_size; @@ -916,7 +1166,11 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ /* When switching from SILK/Hybrid to CELT, only ask for a switch at the last frame */ if (to_celt && i==nb_frames-1) st->user_forced_mode = MODE_CELT_ONLY; - tmp_len = opus_encode_native(st, pcm+i*(st->channels*st->Fs/50), st->Fs/50, 
tmp_data+i*bytes_per_frame, bytes_per_frame, lsb_depth); + tmp_len = opus_encode_native(st, pcm+i*(st->channels*st->Fs/50), st->Fs/50, tmp_data+i*bytes_per_frame, bytes_per_frame, lsb_depth +#ifndef FIXED_POINT + , analysis_info +#endif + ); if (tmp_len<0) { RESTORE_STACK; @@ -942,7 +1196,6 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ RESTORE_STACK; return ret; } - curr_bandwidth = st->bandwidth; /* Chooses the appropriate mode for speech @@ -981,22 +1234,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ dc_reject(pcm, 3, &pcm_buf[total_buffer*st->channels], st->hp_mem, frame_size, st->channels, st->Fs); } -#ifndef FIXED_POINT - if (perform_analysis) - { - int nb_analysis_frames; - nb_analysis_frames = frame_size/(st->Fs/100); - for (i=0;i<nb_analysis_frames;i++) - tonality_analysis(&st->analysis, &analysis_info, celt_enc, pcm_buf+i*(st->Fs/100)*st->channels, st->channels, lsb_depth); - if (st->signal_type == OPUS_AUTO) - st->voice_ratio = (int)floor(.5+100*(1-analysis_info.music_prob)); - st->detected_bandwidth = analysis_info.opus_bandwidth; - } else { - analysis_info.valid = 0; - st->voice_ratio = -1; - st->detected_bandwidth = 0; - } -#endif + /* SILK processing */ HB_gain = Q15ONE; @@ -1205,9 +1443,18 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ } else { if (st->use_vbr) { + opus_int32 bonus=0; +#ifndef FIXED_POINT + if (st->variable_duration==OPUS_FRAMESIZE_VARIABLE && frame_size != st->Fs/50) + { + bonus = (40*st->stream_channels+40)*(st->Fs/frame_size-50); + if (analysis_info->valid) + bonus = bonus*(1.f+.5*analysis_info->tonality); + } +#endif celt_encoder_ctl(celt_enc, OPUS_SET_VBR(1)); celt_encoder_ctl(celt_enc, OPUS_SET_VBR_CONSTRAINT(st->vbr_constraint)); - celt_encoder_ctl(celt_enc, OPUS_SET_BITRATE(st->bitrate_bps)); + celt_encoder_ctl(celt_enc, OPUS_SET_BITRATE(st->bitrate_bps+bonus)); nb_compr_bytes = max_data_bytes-1-redundancy_bytes; 
} else { nb_compr_bytes = bytes_target; @@ -1222,7 +1469,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ if (st->mode != MODE_SILK_ONLY && st->mode != st->prev_mode && st->prev_mode > 0) { for (i=0;i<st->channels*st->Fs/400;i++) - tmp_prefill[i] = st->delay_buffer[(extra_buffer+st->encoder_buffer-total_buffer-st->Fs/400)*st->channels + i]; + tmp_prefill[i] = st->delay_buffer[(st->encoder_buffer-total_buffer-st->Fs/400)*st->channels + i]; } for (i=0;i<st->channels*(st->encoder_buffer-(frame_size+total_buffer));i++) @@ -1236,7 +1483,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ const CELTMode *celt_mode; celt_encoder_ctl(celt_enc, CELT_GET_MODE(&celt_mode)); - gain_fade(pcm_buf+extra_buffer*st->channels, pcm_buf+extra_buffer*st->channels, + gain_fade(pcm_buf, pcm_buf, st->prev_HB_gain, HB_gain, celt_mode->overlap, frame_size, st->channels, celt_mode->window, st->Fs); } st->prev_HB_gain = HB_gain; @@ -1258,7 +1505,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ g1 *= (1.f/16384); g2 *= (1.f/16384); #endif - stereo_fade(pcm_buf+extra_buffer*st->channels, pcm_buf+extra_buffer*st->channels, g1, g2, celt_mode->overlap, + stereo_fade(pcm_buf, pcm_buf, g1, g2, celt_mode->overlap, frame_size, st->channels, celt_mode->window, st->Fs); st->hybrid_stereo_width_Q14 = st->silk_mode.stereoWidth_Q14; } @@ -1312,7 +1559,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ int err; celt_encoder_ctl(celt_enc, CELT_SET_START_BAND(0)); celt_encoder_ctl(celt_enc, OPUS_SET_VBR(0)); - err = celt_encode_with_ec(celt_enc, pcm_buf+extra_buffer*st->channels, st->Fs/200, data+nb_compr_bytes, redundancy_bytes, NULL); + err = celt_encode_with_ec(celt_enc, pcm_buf, st->Fs/200, data+nb_compr_bytes, redundancy_bytes, NULL); if (err < 0) { RESTORE_STACK; @@ -1339,10 +1586,9 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int 
frame_ if (ec_tell(&enc) <= 8*nb_compr_bytes) { #ifndef FIXED_POINT - if (perform_analysis) - celt_encoder_ctl(celt_enc, CELT_SET_ANALYSIS(&analysis_info)); + celt_encoder_ctl(celt_enc, CELT_SET_ANALYSIS(analysis_info)); #endif - ret = celt_encode_with_ec(celt_enc, pcm_buf+extra_buffer*st->channels, frame_size, NULL, nb_compr_bytes, &enc); + ret = celt_encode_with_ec(celt_enc, pcm_buf, frame_size, NULL, nb_compr_bytes, &enc); if (ret < 0) { RESTORE_STACK; @@ -1365,9 +1611,9 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ celt_encoder_ctl(celt_enc, CELT_SET_PREDICTION(0)); /* NOTE: We could speed this up slightly (at the expense of code size) by just adding a function that prefills the buffer */ - celt_encode_with_ec(celt_enc, pcm_buf+st->channels*(extra_buffer+frame_size-N2-N4), N4, dummy, 2, NULL); + celt_encode_with_ec(celt_enc, pcm_buf+st->channels*(frame_size-N2-N4), N4, dummy, 2, NULL); - err = celt_encode_with_ec(celt_enc, pcm_buf+st->channels*(extra_buffer+frame_size-N2), N2, data+nb_compr_bytes, redundancy_bytes, NULL); + err = celt_encode_with_ec(celt_enc, pcm_buf+st->channels*(frame_size-N2), N2, data+nb_compr_bytes, redundancy_bytes, NULL); if (err < 0) { RESTORE_STACK; @@ -1440,6 +1686,7 @@ opus_int32 opus_encode_float(OpusEncoder *st, const float *pcm, int frame_size, VARDECL(opus_int16, in); ALLOC_STACK; + frame_size = frame_size_select(frame_size, st->variable_duration, st->Fs); if(frame_size<0) { RESTORE_STACK; @@ -1459,6 +1706,12 @@ opus_int32 opus_encode_float(OpusEncoder *st, const float *pcm, int frame_size, opus_int32 opus_encode(OpusEncoder *st, const opus_int16 *pcm, int frame_size, unsigned char *data, opus_int32 out_data_bytes) { + frame_size = frame_size_select(frame_size, st->variable_duration, st->Fs); + if(frame_size<0) + { + RESTORE_STACK; + return OPUS_BAD_ARG; + } return opus_encode_native(st, pcm, frame_size, data, out_data_bytes, 16); } @@ -1467,21 +1720,74 @@ opus_int32 opus_encode(OpusEncoder 
*st, const opus_int16 *pcm, int frame_size, unsigned char *data, opus_int32 max_data_bytes) { int i, ret; + const CELTMode *celt_mode; + int delay_compensation; + int lsb_depth; VARDECL(float, in); + AnalysisInfo analysis_info; ALLOC_STACK; + opus_encoder_ctl(st, CELT_GET_MODE(&celt_mode)); + if (st->application == OPUS_APPLICATION_RESTRICTED_LOWDELAY) + delay_compensation = 0; + else + delay_compensation = st->delay_compensation; + + lsb_depth = IMIN(16, st->lsb_depth); + + analysis_info.valid = 0; + if (st->silk_mode.complexity >= 7 && st->Fs==48000) + { + frame_size = run_analysis(&st->analysis, celt_mode, pcm, pcm+st->channels*st->analysis.analysis_offset, + frame_size, st->variable_duration, st->channels, st->Fs, st->bitrate_bps, delay_compensation, lsb_depth, downmix_int, &analysis_info); + } else { + frame_size = frame_size_select(frame_size, st->variable_duration, st->Fs); + } + if(frame_size<0) + { + RESTORE_STACK; + return OPUS_BAD_ARG; + } + ALLOC(in, frame_size*st->channels, float); for (i=0;i<frame_size*st->channels;i++) in[i] = (1.0f/32768)*pcm[i]; - ret = opus_encode_native(st, in, frame_size, data, max_data_bytes, 16); + ret = opus_encode_native(st, in, frame_size, data, max_data_bytes, 16, &analysis_info); RESTORE_STACK; return ret; } opus_int32 opus_encode_float(OpusEncoder *st, const float *pcm, int frame_size, unsigned char *data, opus_int32 out_data_bytes) { - return opus_encode_native(st, pcm, frame_size, data, out_data_bytes, 24); + const CELTMode *celt_mode; + int delay_compensation; + int lsb_depth; + AnalysisInfo analysis_info; + + opus_encoder_ctl(st, CELT_GET_MODE(&celt_mode)); + if (st->application == OPUS_APPLICATION_RESTRICTED_LOWDELAY) + delay_compensation = 0; + else + delay_compensation = st->delay_compensation; + + lsb_depth = IMIN(24, st->lsb_depth); + + analysis_info.valid = 0; + if (st->silk_mode.complexity >= 7 && st->Fs==48000) + { + frame_size = run_analysis(&st->analysis, celt_mode, pcm, 
pcm+st->channels*st->analysis.analysis_offset, + frame_size, st->variable_duration, st->channels, st->Fs, st->bitrate_bps, delay_compensation, lsb_depth, downmix_float, &analysis_info); + } else { + frame_size = frame_size_select(frame_size, st->variable_duration, st->Fs); + } + if(frame_size<0) + { + RESTORE_STACK; + return OPUS_BAD_ARG; + } + + return opus_encode_native(st, pcm, frame_size, data, out_data_bytes, 24, &analysis_info); } #endif @@ -1750,6 +2056,18 @@ int opus_encoder_ctl(OpusEncoder *st, int request, ...) *value = st->lsb_depth; } break; + case OPUS_SET_EXPERT_FRAME_DURATION_REQUEST: + { + opus_int32 value = va_arg(ap, opus_int32); + st->variable_duration = value; + } + break; + case OPUS_GET_EXPERT_FRAME_DURATION_REQUEST: + { + opus_int32 *value = va_arg(ap, opus_int32*); + *value = st->variable_duration; + } + break; case OPUS_RESET_STATE: { void *silk_enc; @@ -1779,6 +2097,15 @@ int opus_encoder_ctl(OpusEncoder *st, int request, ...) st->user_forced_mode = value; } break; + + case CELT_GET_MODE_REQUEST: + { + const CELTMode ** value = va_arg(ap, const CELTMode**); + if (value==0) + goto bad_arg; + celt_encoder_ctl(celt_enc, CELT_GET_MODE(value)); + } + break; default: /* fprintf(stderr, "unknown opus_encoder_ctl() request: %d", request);*/ ret = OPUS_UNIMPLEMENTED; diff --git a/src/opus_multistream_encoder.c b/src/opus_multistream_encoder.c index db9fc7850512e171d9a17bbf4ccf2448103d74e0..c62041853d5dd5ce8249c29dccff56e0562d168f 100644 --- a/src/opus_multistream_encoder.c +++ b/src/opus_multistream_encoder.c @@ -36,10 +36,14 @@ #include <stdarg.h> #include "float_cast.h" #include "os_support.h" +#include "analysis.h" struct OpusMSEncoder { + TonalityAnalysisState analysis; ChannelLayout layout; - int bitrate; + int variable_duration; + opus_int32 bitrate_bps; + opus_val32 subframe_mem[3]; /* Encoder states go here */ }; @@ -102,6 +106,8 @@ int opus_multistream_encoder_init( st->layout.nb_streams = streams; st->layout.nb_coupled_streams = 
coupled_streams; + st->bitrate_bps = OPUS_AUTO; + st->variable_duration = OPUS_FRAMESIZE_ARG; for (i=0;i<st->layout.nb_channels;i++) st->layout.mapping[i] = mapping[i]; if (!validate_layout(&st->layout) || !validate_encoder_layout(&st->layout)) @@ -182,6 +188,10 @@ static int opus_multistream_encode_native unsigned char *data, opus_int32 max_data_bytes, int lsb_depth +#ifndef FIXED_POINT + , downmix_func downmix + , const void *pcm_analysis +#endif ) { opus_int32 Fs; @@ -193,10 +203,43 @@ static int opus_multistream_encode_native VARDECL(opus_val16, buf); unsigned char tmp_data[MS_FRAME_TMP]; OpusRepacketizer rp; + int orig_frame_size; + int coded_channels; + opus_int32 channel_rate; + opus_int32 complexity; + AnalysisInfo analysis_info; + const CELTMode *celt_mode; ALLOC_STACK; ptr = (char*)st + align(sizeof(OpusMSEncoder)); opus_encoder_ctl((OpusEncoder*)ptr, OPUS_GET_SAMPLE_RATE(&Fs)); + opus_encoder_ctl((OpusEncoder*)ptr, OPUS_GET_COMPLEXITY(&complexity)); + opus_encoder_ctl((OpusEncoder*)ptr, CELT_GET_MODE(&celt_mode)); + + if (400*frame_size < Fs) + { + RESTORE_STACK; + return OPUS_BAD_ARG; + } + orig_frame_size = IMIN(frame_size,Fs/50); +#ifndef FIXED_POINT + analysis_info.valid = 0; + if (complexity >= 7 && Fs==48000) + { + opus_int32 delay_compensation; + int channels; + + channels = st->layout.nb_streams + st->layout.nb_coupled_streams; + opus_encoder_ctl((OpusEncoder*)ptr, OPUS_GET_LOOKAHEAD(&delay_compensation)); + delay_compensation -= Fs/400; + + frame_size = run_analysis(&st->analysis, celt_mode, pcm, pcm_analysis, + frame_size, st->variable_duration, channels, Fs, st->bitrate_bps, delay_compensation, lsb_depth, downmix, &analysis_info); + } else +#endif + { + frame_size = frame_size_select(frame_size, st->variable_duration, Fs); + } /* Validate frame_size before using it to allocate stack space. This mirrors the checks in opus_encode[_float](). 
*/ if (400*frame_size != Fs && 200*frame_size != Fs && @@ -215,6 +258,39 @@ static int opus_multistream_encode_native RESTORE_STACK; return OPUS_BUFFER_TOO_SMALL; } + + /* Compute bitrate allocation between streams (this could be a lot better) */ + coded_channels = st->layout.nb_streams + st->layout.nb_coupled_streams; + if (st->bitrate_bps==OPUS_AUTO) + { + channel_rate = Fs+60*Fs/orig_frame_size; + } else if (st->bitrate_bps==OPUS_BITRATE_MAX) + { + channel_rate = 300000; + } else { + channel_rate = st->bitrate_bps/coded_channels; + } +#ifndef FIXED_POINT + if (st->variable_duration==OPUS_FRAMESIZE_VARIABLE && frame_size != Fs/50) + { + opus_int32 bonus; + bonus = 60*(Fs/frame_size-50); + channel_rate += bonus; + } +#endif + ptr = (char*)st + align(sizeof(OpusMSEncoder)); + for (s=0;s<st->layout.nb_streams;s++) + { + OpusEncoder *enc; + enc = (OpusEncoder*)ptr; + if (s < st->layout.nb_coupled_streams) + ptr += align(coupled_size); + else + ptr += align(mono_size); + opus_encoder_ctl(enc, OPUS_SET_BITRATE(channel_rate * (s < st->layout.nb_coupled_streams ? 
2 : 1))); + } + + ptr = (char*)st + align(sizeof(OpusMSEncoder)); /* Counting ToC */ tot_size = 0; for (s=0;s<st->layout.nb_streams;s++) @@ -246,7 +322,11 @@ static int opus_multistream_encode_native /* Reserve three bytes for the last stream and four for the others */ curr_max -= IMAX(0,4*(st->layout.nb_streams-s-1)-1); curr_max = IMIN(curr_max,MS_FRAME_TMP); - len = opus_encode_native(enc, buf, frame_size, tmp_data, curr_max, lsb_depth); + len = opus_encode_native(enc, buf, frame_size, tmp_data, curr_max, lsb_depth +#ifndef FIXED_POINT + , &analysis_info +#endif + ); if (len<0) { RESTORE_STACK; @@ -345,8 +425,9 @@ int opus_multistream_encode_float opus_int32 max_data_bytes ) { + int channels = st->layout.nb_streams + st->layout.nb_coupled_streams; return opus_multistream_encode_native(st, opus_copy_channel_in_float, - pcm, frame_size, data, max_data_bytes, 24); + pcm, frame_size, data, max_data_bytes, 24, downmix_float, pcm+channels*st->analysis.analysis_offset); } int opus_multistream_encode( @@ -357,8 +438,9 @@ int opus_multistream_encode( opus_int32 max_data_bytes ) { + int channels = st->layout.nb_streams + st->layout.nb_coupled_streams; return opus_multistream_encode_native(st, opus_copy_channel_in_short, - pcm, frame_size, data, max_data_bytes, 16); + pcm, frame_size, data, max_data_bytes, 16, downmix_int, pcm+channels*st->analysis.analysis_offset); } #endif @@ -378,20 +460,10 @@ int opus_multistream_encoder_ctl(OpusMSEncoder *st, int request, ...) { case OPUS_SET_BITRATE_REQUEST: { - int chan, s; opus_int32 value = va_arg(ap, opus_int32); - chan = st->layout.nb_streams + st->layout.nb_coupled_streams; - value /= chan; - for (s=0;s<st->layout.nb_streams;s++) - { - OpusEncoder *enc; - enc = (OpusEncoder*)ptr; - if (s < st->layout.nb_coupled_streams) - ptr += align(coupled_size); - else - ptr += align(mono_size); - opus_encoder_ctl(enc, request, value * (s < st->layout.nb_coupled_streams ? 
2 : 1)); - } + if (value<0 && value!=OPUS_AUTO && value!=OPUS_BITRATE_MAX) + goto bad_arg; + st->bitrate_bps = value; } break; case OPUS_GET_BITRATE_REQUEST: @@ -504,7 +576,21 @@ int opus_multistream_encoder_ctl(OpusMSEncoder *st, int request, ...) } *value = (OpusEncoder*)ptr; } - break; + break; + case OPUS_SET_EXPERT_FRAME_DURATION_REQUEST: + { + opus_int32 value = va_arg(ap, opus_int32); + if (value<0 || value>1) + goto bad_arg; + st->variable_duration = value; + } + break; + case OPUS_GET_EXPERT_FRAME_DURATION_REQUEST: + { + opus_int32 *value = va_arg(ap, opus_int32*); + *value = st->variable_duration; + } + break; default: ret = OPUS_UNIMPLEMENTED; break; @@ -512,6 +598,9 @@ int opus_multistream_encoder_ctl(OpusMSEncoder *st, int request, ...) va_end(ap); return ret; +bad_arg: + va_end(ap); + return OPUS_BAD_ARG; } void opus_multistream_encoder_destroy(OpusMSEncoder *st) diff --git a/src/opus_private.h b/src/opus_private.h index c9a4ff53cca5c5f6c1897301e8543b95098cadf0..2caac689ec8cea21f8f0b3b49c52c90decbaa9ed 100644 --- a/src/opus_private.h +++ b/src/opus_private.h @@ -31,6 +31,7 @@ #include "arch.h" #include "opus.h" +#include "celt.h" struct OpusRepacketizer { unsigned char toc; @@ -81,11 +82,24 @@ int get_mono_channel(const ChannelLayout *layout, int stream_id, int prev); #define OPUS_SET_FORCE_MODE_REQUEST 11002 #define OPUS_SET_FORCE_MODE(x) OPUS_SET_FORCE_MODE_REQUEST, __opus_check_int(x) +typedef void (*downmix_func)(const void *, float *, int, int, int); +void downmix_float(const void *_x, float *sub, int subframe, int offset, int C); +void downmix_int(const void *_x, float *sub, int subframe, int offset, int C); + +int optimize_framesize(const opus_val16 *x, int len, int C, opus_int32 Fs, + int bitrate, opus_val16 tonality, opus_val32 *mem, int buffering, + downmix_func downmix); int encode_size(int size, unsigned char *data); +opus_int32 frame_size_select(opus_int32 frame_size, int variable_duration, opus_int32 Fs); + opus_int32 
opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_size, - unsigned char *data, opus_int32 out_data_bytes, int lsb_depth); + unsigned char *data, opus_int32 out_data_bytes, int lsb_depth +#ifndef FIXED_POINT + , AnalysisInfo *analysis_info +#endif + ); int opus_decode_native(OpusDecoder *st, const unsigned char *data, opus_int32 len, opus_val16 *pcm, int frame_size, int decode_fec, int self_delimited,