diff --git a/dnn/dump_data.c b/dnn/dump_data.c index f5544db3a79b646da46ce1ee1e09b3e8371fad2a..cc272993b3e24081004514e26880000022b1e4da 100644 --- a/dnn/dump_data.c +++ b/dnn/dump_data.c @@ -138,9 +138,18 @@ int main(int argc, char **argv) { int encode = 0; int decode = 0; int quantize = 0; + int burg = 0; srand(getpid()); st = lpcnet_encoder_create(); argv0=argv[0]; + if (argc == 5 && strcmp(argv[1], "-btrain")==0) { + burg = 1; + training = 1; + } + if (argc == 4 && strcmp(argv[1], "-btest")==0) { + burg = 1; + training = 0; + } if (argc == 5 && strcmp(argv[1], "-train")==0) training = 1; if (argc == 5 && strcmp(argv[1], "-qtrain")==0) { training = 1; @@ -236,7 +245,8 @@ int main(int argc, char **argv) { if (count*FRAME_SIZE_5MS>=10000000 && one_pass_completed) break; if (training && ++gain_change_count > 2821) { float tmp, tmp2; - speech_gain = pow(10., (-20+(rand()%40))/20.); + speech_gain = pow(10., (-30+(rand()%40))/20.); + if (rand()&1) speech_gain = -speech_gain; if (rand()%20==0) speech_gain *= .01; if (rand()%100==0) speech_gain = 0; gain_change_count = 0; @@ -247,13 +257,18 @@ int main(int argc, char **argv) { } biquad(x, mem_hp_x, x, b_hp, a_hp, FRAME_SIZE); biquad(x, mem_resp_x, x, b_sig, a_sig, FRAME_SIZE); - preemphasis(x, &mem_preemph, x, PREEMPHASIS, FRAME_SIZE); for (i=0;i<FRAME_SIZE;i++) { float g; float f = (float)i/FRAME_SIZE; g = f*speech_gain + (1-f)*old_speech_gain; x[i] *= g; } + if (burg) { + float ceps[2*NB_BANDS]; + burg_cepstral_analysis(ceps, x); + fwrite(ceps, sizeof(float), 2*NB_BANDS, ffeat); + } + preemphasis(x, &mem_preemph, x, PREEMPHASIS, FRAME_SIZE); for (i=0;i<FRAME_SIZE;i++) x[i] += rand()/(float)RAND_MAX - .5; /* PCM is delayed by 1/2 frame to make the features centered on the frames. */ for (i=0;i<FRAME_SIZE-TRAINING_OFFSET;i++) pcm[i+TRAINING_OFFSET] = float2short(x[i]); diff --git a/dnn/freq.c b/dnn/freq.c index 0b3b6dabef6eee408f58167f8a01179a12acf8c5..102471934687261fd50c5f1eea6a7ff820fa8601 100644 --- a/dnn/freq.c +++ b/dnn/freq.c @@ -155,7 +155,7 @@ void compute_band_energy(float *bandE, const kiss_fft_cpx *X) { } } -void compute_burg_cepstrum(const short *pcm, float *burg_cepstrum, int len, int order) { +void compute_burg_cepstrum(const float *pcm, float *burg_cepstrum, int len, int order) { int i; float burg_in[FRAME_SIZE]; float burg_lpc[LPC_ORDER]; @@ -190,6 +190,19 @@ void compute_burg_cepstrum(const short *pcm, float *burg_cepstrum, int len, int burg_cepstrum[0] += - 4; } +void burg_cepstral_analysis(float *ceps, const float *x) { + int i; + compute_burg_cepstrum(x, &ceps[0 ], FRAME_SIZE/2, LPC_ORDER); + compute_burg_cepstrum(&x[FRAME_SIZE/2], &ceps[NB_BANDS], FRAME_SIZE/2, LPC_ORDER); + for (i=0;i<NB_BANDS;i++) { + float c0, c1; + c0 = ceps[i]; + c1 = ceps[NB_BANDS+i]; + ceps[i ] = .5*(c0+c1); + ceps[NB_BANDS+i] = (c0-c1); + } +} + void compute_band_corr(float *bandE, const kiss_fft_cpx *X, const kiss_fft_cpx *P) { int i; float sum[NB_BANDS] = {0}; diff --git a/dnn/freq.h b/dnn/freq.h index ca4553fb468798452e07e7db26683d5a4393e378..f64952cc4a65b65472ed302a01fd0eb34581c3bd 100644 --- a/dnn/freq.h +++ b/dnn/freq.h @@ -47,7 +47,8 @@ void compute_band_energy(float *bandE, const kiss_fft_cpx *X); void compute_band_corr(float *bandE, const kiss_fft_cpx *X, const kiss_fft_cpx *P); -void compute_burg_cepstrum(const short *pcm, float *burg_cepstrum, int len, int order); +void compute_burg_cepstrum(const float *pcm, float *burg_cepstrum, int len, int order); +void burg_cepstral_analysis(float *ceps, const float *x); void apply_window(float *x); void dct(float *out, const float *in); diff --git a/dnn/lpcnet_plc.c b/dnn/lpcnet_plc.c index 788801b0bb2765399e6ad9362f3dcda330aaf9dc..0c7da2c701e85a9770c26d3b18efcc927298c13c 100644 --- a/dnn/lpcnet_plc.c +++ b/dnn/lpcnet_plc.c @@ -75,7 +75,9 @@ LPCNET_EXPORT int lpcnet_plc_update(LPCNetPLCState *st, short *pcm) { float x[FRAME_SIZE]; short output[FRAME_SIZE]; #if PLC_DNN_PRED - float plc_features[NB_FEATURES+1]; + float plc_features[2*NB_BANDS+NB_FEATURES+1]; + for (i=0;i<FRAME_SIZE;i++) x[i] = pcm[i]; + burg_cepstral_analysis(plc_features, x); #endif st->enc.pcount = 0; if (st->skip_analysis) { @@ -105,8 +107,8 @@ LPCNET_EXPORT int lpcnet_plc_update(LPCNetPLCState *st, short *pcm) { process_single_frame(&st->enc, NULL); #if PLC_DNN_PRED if (st->skip_analysis <= 1) { - RNN_COPY(plc_features, st->enc.features[0], NB_FEATURES); - plc_features[NB_FEATURES] = 1; + RNN_COPY(&plc_features[2*NB_BANDS], st->enc.features[0], NB_FEATURES); + plc_features[2*NB_BANDS+NB_FEATURES] = 1; compute_plc_pred(&st->plc_net, st->features, plc_features); } #else @@ -142,7 +144,7 @@ LPCNET_EXPORT int lpcnet_plc_conceal(LPCNetPLCState *st, short *pcm) { int i; #endif short output[FRAME_SIZE]; - float zeros[NB_FEATURES+1] = {0}; + float zeros[2*NB_BANDS+NB_FEATURES+1] = {0}; st->enc.pcount = 0; /* If we concealed the previous frame, finish synthesizing the rest of the samples. */ /* FIXME: Copy/predict features. */ diff --git a/dnn/lpcnet_private.h b/dnn/lpcnet_private.h index e1b4c99ad44f515a8d6135540ee06c05d66f2c83..362344260bd8da044e9fcca5ddec6ca8019b7f6d 100644 --- a/dnn/lpcnet_private.h +++ b/dnn/lpcnet_private.h @@ -64,6 +64,7 @@ struct LPCNetEncState{ float features[4][NB_TOTAL_FEATURES]; float sig_mem[LPC_ORDER]; int exc_mem; + float burg_cepstrum[2*NB_BANDS]; }; #define PLC_BUF_SIZE (FEATURES_DELAY*FRAME_SIZE + TRAINING_OFFSET) diff --git a/dnn/training_tf2/lpcnet_plc.py b/dnn/training_tf2/lpcnet_plc.py index 04be0e0acc0a90a4aad8f86476cb643a063cc221..9acea419a90052fef964b7a685324360e051aa12 100644 --- a/dnn/training_tf2/lpcnet_plc.py +++ b/dnn/training_tf2/lpcnet_plc.py @@ -62,8 +62,8 @@ class WeightClip(Constraint): constraint = WeightClip(0.992) -def new_lpcnet_plc_model(rnn_units=256, nb_used_features=20, batch_size=128, training=False, adaptation=False, quantize=False, cond_size=128): - feat = Input(shape=(None, nb_used_features), batch_size=batch_size) +def new_lpcnet_plc_model(rnn_units=256, nb_used_features=20, nb_burg_features=36, batch_size=128, training=False, adaptation=False, quantize=False, cond_size=128): + feat = Input(shape=(None, nb_used_features+nb_burg_features), batch_size=batch_size) lost = Input(shape=(None, 1), batch_size=batch_size) fdense1 = Dense(cond_size, activation='tanh', name='plc_dense1') @@ -96,5 +96,6 @@ def new_lpcnet_plc_model(rnn_units=256, nb_used_features=20, batch_size=128, tra model.rnn_units = rnn_units model.cond_size = cond_size model.nb_used_features = nb_used_features + model.nb_burg_features = nb_burg_features return model diff --git a/dnn/training_tf2/plc_loader.py b/dnn/training_tf2/plc_loader.py index 004fd70d06c117639e70ce8a315be2c50aeaece4..13414fd8cc388f5756d372848d6c82bc383b0ac4 100644 --- a/dnn/training_tf2/plc_loader.py +++ b/dnn/training_tf2/plc_loader.py @@ -29,12 +29,13 @@ import numpy as np from tensorflow.keras.utils import Sequence class PLCLoader(Sequence): - def __init__(self, features, lost, batch_size): + def __init__(self, features, lost, nb_burg_features, batch_size): self.batch_size = batch_size self.nb_batches = features.shape[0]//self.batch_size self.features = features[:self.nb_batches*self.batch_size, :, :] self.lost = lost.astype('float') self.lost = self.lost[:(len(self.lost)//features.shape[1]-1)*features.shape[1]] + self.nb_burg_features = nb_burg_features self.on_epoch_end() def on_epoch_end(self): @@ -51,7 +52,7 @@ class PLCLoader(Sequence): lost = np.reshape(lost, (features.shape[0], features.shape[1], 1)) lost_mask = np.tile(lost, (1,1,features.shape[2])) - out_features = np.concatenate([features, 1.-lost], axis=-1) + out_features = np.concatenate([features[:,:,self.nb_burg_features:], 1.-lost], axis=-1) inputs = [features*lost_mask, lost] outputs = [out_features] return (inputs, outputs) diff --git a/dnn/training_tf2/train_plc.py b/dnn/training_tf2/train_plc.py index 0ccca6e1c36383c373d9779e711f1334c456b0d5..c7a280efe90acf03863f187880e4f6784b14ba1c 100644 --- a/dnn/training_tf2/train_plc.py +++ b/dnn/training_tf2/train_plc.py @@ -140,8 +140,9 @@ with strategy.scope(): lpc_order = 16 feature_file = args.features -nb_features = model.nb_used_features + lpc_order +nb_features = model.nb_used_features + lpc_order + model.nb_burg_features nb_used_features = model.nb_used_features +nb_burg_features = model.nb_burg_features sequence_size = args.seq_length # u for unquantised, load 16 bit PCM samples and convert to mu-law @@ -153,7 +154,7 @@ features = features[:nb_sequences*sequence_size*nb_features] features = np.reshape(features, (nb_sequences, sequence_size, nb_features)) -features = features[:, :, :nb_used_features] +features = features[:, :, :nb_used_features+model.nb_burg_features] lost = np.memmap(args.lost_file, dtype='int8', mode='r') @@ -169,7 +170,7 @@ if quantize or retrain: model.save_weights('{}_{}_initial.h5'.format(args.output, args.gru_size)) -loader = PLCLoader(features, lost, batch_size) +loader = PLCLoader(features, lost, nb_burg_features, batch_size) callbacks = [checkpoint] if args.logdir is not None: