From e62fd5c5c9f581e7c15de9ce62af041ec847030e Mon Sep 17 00:00:00 2001 From: Jean-Marc Valin <jmvalin@amazon.com> Date: Sun, 30 Jul 2023 15:31:59 -0400 Subject: [PATCH] C implementation of FWGAN --- autogen.sh | 2 +- dnn/fwgan.c | 196 ++++++++++++++++++++++++++++++++++++++++++++-- dnn/fwgan.h | 30 ++++++- dnn/nnet.c | 8 +- dnn/nnet.h | 2 + lpcnet_headers.mk | 1 + lpcnet_sources.mk | 1 + 7 files changed, 230 insertions(+), 10 deletions(-) diff --git a/autogen.sh b/autogen.sh index f87f41228..9ebf28f3b 100755 --- a/autogen.sh +++ b/autogen.sh @@ -9,7 +9,7 @@ set -e srcdir=`dirname $0` test -n "$srcdir" && cd "$srcdir" -dnn/download_model.sh ad05730 +dnn/download_model.sh 155367d echo "Updating build configuration files, please wait...." diff --git a/dnn/fwgan.c b/dnn/fwgan.c index bce3670aa..ea4f0e07d 100644 --- a/dnn/fwgan.c +++ b/dnn/fwgan.c @@ -31,21 +31,207 @@ #include "fwgan.h" #include "os_support.h" #include "freq.h" +#include "fwgan_data.h" +#include "lpcnet.h" +#include "pitch.h" +#include "nnet.h" -void fwgan_init(FWGANState *st, const float *pcm) +#define NB_SUBFRAMES 4 +#define SUBFRAME_SIZE 40 +#define FWGAN_FRAME_SIZE (NB_SUBFRAMES*SUBFRAME_SIZE) +#define CONT_PCM_INPUTS 320 +#define MAX_CONT_SIZE CONT_NET_0_OUT_SIZE +#define FWGAN_GAMMA 0.92f +#define FWGAN_DEEMPHASIS 0.85f + +#define FEAT_IN_SIZE (BFCC_WITH_CORR_UPSAMPLER_FC_OUT_SIZE/4 + FWGAN_FRAME_SIZE/2) + +#define FWGAN_FEATURES (NB_FEATURES-1) + +static void pitch_embeddings(float *pembed, double *phase, double w0) /* Fills pembed for one subframe: each of the SUBFRAME_SIZE iterations adds w0 to *phase, then stores sin(*phase) in pembed[i] and cos(*phase) in pembed[SUBFRAME_SIZE+i]. */ { + int i; + /* FIXME: This could be sped up by making phase a unit-norm complex value, rotating it + by exp(-i*w0) each sample, and renormalizing once in a while. 
*/ + for (i=0;i<SUBFRAME_SIZE;i++) { + *phase += w0; + pembed[i] = sin(*phase); + pembed[SUBFRAME_SIZE+i] = cos(*phase); + } +} + +static void run_fwgan_upsampler(FWGANState *st, float *cond, const float *features) /* Runs the bfcc_with_corr_upsampler_fc dense layer (tanh) on features and writes the result to cond; asserts that the layer sizes match FWGAN_FEATURES and BFCC_WITH_CORR_UPSAMPLER_FC_OUT_SIZE. */ { - OPUS_CLEAR(st, 1); + FWGAN *model; + model = &st->model; + celt_assert(FWGAN_FEATURES == model->bfcc_with_corr_upsampler_fc.nb_inputs); + celt_assert(BFCC_WITH_CORR_UPSAMPLER_FC_OUT_SIZE == model->bfcc_with_corr_upsampler_fc.nb_outputs); + compute_generic_dense(&model->bfcc_with_corr_upsampler_fc, cond, features, ACTIVATION_TANH); +} + +void fwgan_cont(FWGANState *st, const float *pcm0, const float *features0) /* Initializes st->rnn_state and st->fwc1_state..fwc7_state from CONT_PCM_INPUTS samples of pcm0. features0 is currently unused (see the FIXME at the end of the function). */ +{ + int i; + float norm2, norm_1; + float cont_inputs[CONT_PCM_INPUTS+1]; /* [0]: log(sqrt(norm2)+1e-7f); [1..CONT_PCM_INPUTS]: pcm0 scaled by norm_1. Reused below as the cont_net_10 output buffer. 
*/ + float tmp1[MAX_CONT_SIZE]; + float tmp2[MAX_CONT_SIZE]; + FWGAN *model; + model = &st->model; + norm2 = celt_inner_prod(pcm0, pcm0, CONT_PCM_INPUTS, st->arch); + norm_1 = 1.f/sqrt(1e-8f + norm2); + for (i=0;i<CONT_PCM_INPUTS;i++) cont_inputs[i+1] = norm_1*pcm0[i]; + cont_inputs[0] = log(sqrt(norm2) + 1e-7f); + + compute_generic_dense(&model->cont_net_0, tmp1, cont_inputs, ACTIVATION_TANH); /* tmp1 and tmp2 alternate as input and output through cont_net_0..cont_net_8 */ + compute_generic_dense(&model->cont_net_2, tmp2, tmp1, ACTIVATION_TANH); + compute_generic_dense(&model->cont_net_4, tmp1, tmp2, ACTIVATION_TANH); + compute_generic_dense(&model->cont_net_6, tmp2, tmp1, ACTIVATION_TANH); + compute_generic_dense(&model->cont_net_8, tmp1, tmp2, ACTIVATION_TANH); + celt_assert(CONT_NET_10_OUT_SIZE == model->cont_net_10.nb_outputs); + compute_generic_dense(&model->cont_net_10, cont_inputs, tmp1, ACTIVATION_TANH); /* cont_inputs now holds the cont_net_10 output, which feeds every *_cont_fc_0 layer below */ + + celt_assert(RNN_GRU_STATE_SIZE == model->rnn_cont_fc_0.nb_outputs); + compute_generic_dense(&model->rnn_cont_fc_0, st->rnn_state, cont_inputs, ACTIVATION_TANH); + + celt_assert(FWC1_STATE_SIZE == model->fwc1_cont_fc_0.nb_outputs); + compute_generic_dense(&model->fwc1_cont_fc_0, st->fwc1_state, cont_inputs, ACTIVATION_TANH); + celt_assert(FWC2_STATE_SIZE == model->fwc2_cont_fc_0.nb_outputs); + compute_generic_dense(&model->fwc2_cont_fc_0, 
st->fwc2_state, cont_inputs, ACTIVATION_TANH); + celt_assert(FWC3_STATE_SIZE == model->fwc3_cont_fc_0.nb_outputs); + compute_generic_dense(&model->fwc3_cont_fc_0, st->fwc3_state, cont_inputs, ACTIVATION_TANH); + celt_assert(FWC4_STATE_SIZE == model->fwc4_cont_fc_0.nb_outputs); + compute_generic_dense(&model->fwc4_cont_fc_0, st->fwc4_state, cont_inputs, ACTIVATION_TANH); + celt_assert(FWC5_STATE_SIZE == model->fwc5_cont_fc_0.nb_outputs); + compute_generic_dense(&model->fwc5_cont_fc_0, st->fwc5_state, cont_inputs, ACTIVATION_TANH); + celt_assert(FWC6_STATE_SIZE == model->fwc6_cont_fc_0.nb_outputs); + compute_generic_dense(&model->fwc6_cont_fc_0, st->fwc6_state, cont_inputs, ACTIVATION_TANH); + celt_assert(FWC7_STATE_SIZE == model->fwc7_cont_fc_0.nb_outputs); + compute_generic_dense(&model->fwc7_cont_fc_0, st->fwc7_state, cont_inputs, ACTIVATION_TANH); + + /* FIXME: Do we need to handle initial features? How? */ +} + +static void apply_gain(float *pcm, float c0, float *last_gain) /* Multiplies one subframe by the gain stored in *last_gain by the previous call, then stores pow(10, 0.5*c0/sqrt(18)) in *last_gain for the next call. */ { + int i; + float gain = pow(10.f, (0.5f*c0/sqrt(18.f))); + for (i=0;i<SUBFRAME_SIZE;i++) pcm[i] *= *last_gain; + *last_gain = gain; +} + +static void fwgan_lpc_syn(float *pcm, float *mem, const float *lpc, float last_lpc[LPC_ORDER]) /* Filters one subframe in place using last_lpc (not lpc): each sample has the sum of mem[j]*last_lpc[j] subtracted and is then pushed into mem[0]. lpc is copied to last_lpc afterwards, for the next call. 
*/ { + int i; + for (i=0;i<SUBFRAME_SIZE;i++) { + int j; + for (j=0;j<LPC_ORDER;j++) pcm[i] -= mem[j]*last_lpc[j]; + OPUS_MOVE(&mem[1], &mem[0], LPC_ORDER-1); + mem[0] = pcm[i]; + } + OPUS_COPY(last_lpc, lpc, LPC_ORDER); +} + +static void fwgan_preemphasis(float *pcm, float *preemph_mem) /* In place over SUBFRAME_SIZE samples: subtracts FWGAN_DEEMPHASIS times the previous unmodified sample; *preemph_mem carries that sample across calls. */ { + int i; + for (i=0;i<SUBFRAME_SIZE;i++) { + float tmp = pcm[i]; + pcm[i] -= FWGAN_DEEMPHASIS * *preemph_mem; + *preemph_mem = tmp; + } } -static void run_fwgan(FWGANState *st, float *pcm, const float *input) +static void fwgan_deemphasis(float *pcm, float *deemph_mem) /* In place over FWGAN_FRAME_SIZE samples: adds FWGAN_DEEMPHASIS times the previous output sample; *deemph_mem carries that sample across calls. */ { + int i; + for (i=0;i<FWGAN_FRAME_SIZE;i++) { + pcm[i] += FWGAN_DEEMPHASIS * *deemph_mem; + *deemph_mem = pcm[i]; + } +} + +static void run_fwgan_subframe(FWGANState *st, float *pcm, const float *cond, double w0) /* Builds feat_in from the pitch embeddings and cond, then runs feat_in_conv1_conv, the GRU and the fwc1..fwc7 conv stages, each followed by a gated activation; the last gated activation writes to pcm. Updates the conv and GRU states and embed_phase in st. */ 
{ + float tmp1[FWC1_FC_0_OUT_SIZE]; + float tmp2[IMAX(RNN_GRU_STATE_SIZE, FWC2_FC_0_OUT_SIZE)]; + float feat_in[FEAT_IN_SIZE]; + float rnn_in[FEAT_IN_CONV1_CONV_OUT_SIZE]; + float pembed[FWGAN_FRAME_SIZE/2]; + FWGAN *model; + model = &st->model; + + pitch_embeddings(pembed, &st->embed_phase, w0); + /* Fill feat_in: pembed goes to the first FWGAN_FRAME_SIZE/2 entries and cond to the BFCC_WITH_CORR_UPSAMPLER_FC_OUT_SIZE/4 entries starting at index BFCC_WITH_CORR_UPSAMPLER_FC_OUT_SIZE/4. NOTE(review): the two regions are adjacent only if BFCC_WITH_CORR_UPSAMPLER_FC_OUT_SIZE/4 equals FWGAN_FRAME_SIZE/2; confirm against fwgan_data.h. */ + OPUS_COPY(&feat_in[BFCC_WITH_CORR_UPSAMPLER_FC_OUT_SIZE/4], &cond[0], BFCC_WITH_CORR_UPSAMPLER_FC_OUT_SIZE/4); + OPUS_COPY(&feat_in[0], &pembed[0], FWGAN_FRAME_SIZE/2); + + compute_generic_conv1d(&model->feat_in_conv1_conv, rnn_in, st->cont_conv1_mem, feat_in, FEAT_IN_CONV1_CONV_IN_SIZE, ACTIVATION_LINEAR); + celt_assert(FEAT_IN_NL1_GATE_OUT_SIZE == model->feat_in_nl1_gate.nb_outputs); + compute_gated_activation(&model->feat_in_nl1_gate, rnn_in, rnn_in, ACTIVATION_TANH); + + + compute_generic_gru(&model->rnn_gru_input, &model->rnn_gru_recurrent, st->rnn_state, rnn_in); + celt_assert(IMAX(RNN_GRU_STATE_SIZE, FWC2_FC_0_OUT_SIZE) >= model->rnn_nl_gate.nb_outputs); + compute_gated_activation(&model->rnn_nl_gate, tmp2, st->rnn_state, ACTIVATION_TANH); + + compute_generic_conv1d(&model->fwc1_fc_0, tmp1, st->fwc1_state, tmp2, RNN_GRU_STATE_SIZE, ACTIVATION_LINEAR); + compute_gated_activation(&model->fwc1_fc_1_gate, tmp1, tmp1, ACTIVATION_TANH); + + compute_generic_conv1d(&model->fwc2_fc_0, tmp2, st->fwc2_state, tmp1, FWC1_FC_0_OUT_SIZE, ACTIVATION_LINEAR); + compute_gated_activation(&model->fwc2_fc_1_gate, tmp2, tmp2, ACTIVATION_TANH); + + compute_generic_conv1d(&model->fwc3_fc_0, tmp1, st->fwc3_state, tmp2, FWC2_FC_0_OUT_SIZE, ACTIVATION_LINEAR); + compute_gated_activation(&model->fwc3_fc_1_gate, tmp1, tmp1, ACTIVATION_TANH); + + 
compute_generic_conv1d(&model->fwc4_fc_0, tmp2, st->fwc4_state, tmp1, FWC3_FC_0_OUT_SIZE, ACTIVATION_LINEAR); + compute_gated_activation(&model->fwc4_fc_1_gate, tmp2, tmp2, ACTIVATION_TANH); + compute_generic_conv1d(&model->fwc5_fc_0, tmp1, st->fwc5_state, tmp2, FWC4_FC_0_OUT_SIZE, 
ACTIVATION_LINEAR); + compute_gated_activation(&model->fwc5_fc_1_gate, tmp1, tmp1, ACTIVATION_TANH); + + compute_generic_conv1d(&model->fwc6_fc_0, tmp2, st->fwc6_state, tmp1, FWC5_FC_0_OUT_SIZE, ACTIVATION_LINEAR); + compute_gated_activation(&model->fwc6_fc_1_gate, tmp2, tmp2, ACTIVATION_TANH); + + compute_generic_conv1d(&model->fwc7_fc_0, tmp1, st->fwc7_state, tmp2, FWC6_FC_0_OUT_SIZE, ACTIVATION_LINEAR); + compute_gated_activation(&model->fwc7_fc_1_gate, pcm, tmp1, ACTIVATION_TANH); +} + + + +void fwgan_init(FWGANState *st) /* Clears *st with OPUS_CLEAR, initializes st->model from fwgan_arrays with init_fwgan and asserts that it returned 0. */ +{ + int ret; + OPUS_CLEAR(st, 1); + ret = init_fwgan(&st->model, fwgan_arrays); + celt_assert(ret == 0); + /* FIXME: perform arch detection. */ } void fwgan_synthesize(FWGANState *st, float *pcm, const float *features) { + int subframe; /* This function synthesizes NB_SUBFRAMES subframes of SUBFRAME_SIZE samples into pcm from one features vector. */ float lpc[LPC_ORDER]; + float cond[BFCC_WITH_CORR_UPSAMPLER_FC_OUT_SIZE]; + double w0; + int period; + float lpc_weight; + float fwgan_features[NB_FEATURES-1]; + int i; + OPUS_COPY(fwgan_features, features, NB_FEATURES-2); + fwgan_features[NB_FEATURES-2] = features[NB_FEATURES-1]+.5; /* fwgan_features is features with entry NB_FEATURES-2 removed and .5 added to the last entry */ + + period = (int)floor(.1 + 50*features[NB_BANDS]+100); + w0 = 2*M_PI/period; /* phase increment per sample, passed on to pitch_embeddings */ lpc_from_cepstrum(lpc, features); - run_fwgan(st, pcm, features); - /* Run LPC filter. 
*/ + lpc_weight = 1.f; /* the loop below scales lpc[i] by FWGAN_GAMMA^(i+1) */ + for (i=0;i<LPC_ORDER;i++) { + lpc_weight *= FWGAN_GAMMA; + lpc[i] *= lpc_weight; + } + run_fwgan_upsampler(st, cond, fwgan_features); + for (subframe=0;subframe<NB_SUBFRAMES;subframe++) { + float *sub_cond; + sub_cond = &cond[subframe*BFCC_WITH_CORR_UPSAMPLER_FC_OUT_SIZE/4]; + run_fwgan_subframe(st, &pcm[subframe*SUBFRAME_SIZE], sub_cond, w0); + apply_gain(&pcm[subframe*SUBFRAME_SIZE], features[0], &st->last_gain); + fwgan_preemphasis(&pcm[subframe*SUBFRAME_SIZE], &st->preemph_mem); + fwgan_lpc_syn(&pcm[subframe*SUBFRAME_SIZE], st->syn_mem, lpc, st->last_lpc); + } + fwgan_deemphasis(pcm, &st->deemph_mem); /* runs once over the whole frame, after all subframes are done */ } diff --git a/dnn/fwgan.h b/dnn/fwgan.h index 84749176c..7da11b8f4 100644 --- a/dnn/fwgan.h +++ b/dnn/fwgan.h @@ -28,15 +28,43 @@ #define FWGAN_H #include "freq.h" +#include "fwgan_data.h" #define FWGAN_CONT_SAMPLES 320 +/* FIXME: Derive those from the model rather than hardcoding. 
*/ +#define FWC1_STATE_SIZE 512 +#define FWC2_STATE_SIZE 512 +#define FWC3_STATE_SIZE 256 +#define FWC4_STATE_SIZE 256 +#define FWC5_STATE_SIZE 128 +#define FWC6_STATE_SIZE 128 +#define FWC7_STATE_SIZE 80 + typedef struct { + FWGAN model; + int arch; /* read by celt_inner_prod in fwgan_cont; fwgan_init only clears it (see the FIXME there) */ + double embed_phase; /* phase accumulator advanced by pitch_embeddings */ + float last_gain; /* gain that apply_gain will use on the next subframe */ + float last_lpc[LPC_ORDER]; /* coefficients that fwgan_lpc_syn will use on the next subframe */ float syn_mem[LPC_ORDER]; + float preemph_mem; + float deemph_mem; + float cont_conv1_mem[FEAT_IN_CONV1_CONV_STATE_SIZE]; + float cont[FEAT_IN_NL1_GATE_OUT_SIZE]; /* NOTE(review): not referenced by the fwgan.c hunk in this patch */ + float rnn_state[RNN_GRU_STATE_SIZE]; + float fwc1_state[FWC1_STATE_SIZE]; + float fwc2_state[FWC2_STATE_SIZE]; + float fwc3_state[FWC3_STATE_SIZE]; + float fwc4_state[FWC4_STATE_SIZE]; + float fwc5_state[FWC5_STATE_SIZE]; + float fwc6_state[FWC6_STATE_SIZE]; + float fwc7_state[FWC7_STATE_SIZE]; } FWGANState; -void fwgan_init(FWGANState *st, const float *pcm); +void fwgan_init(FWGANState *st); +void fwgan_cont(FWGANState *st, const float *pcm0, const float *features0); void fwgan_synthesize(FWGANState *st, float *pcm, const float *features); diff --git a/dnn/nnet.c b/dnn/nnet.c index 
05b0ea909..0bb228feb 100644 --- a/dnn/nnet.c +++ b/dnn/nnet.c @@ -73,6 +73,7 @@ void compute_linear(const LinearLayer *linear, float *out, const float *in) { int i, M, N; const float *bias; + celt_assert(in != out); /* in-place use is rejected */ bias = linear->bias; M = linear->nb_inputs; N = linear->nb_outputs; @@ -146,11 +147,12 @@ void compute_gated_activation(const LinearLayer *layer, float *output, const flo { int i; float act1[MAX_INPUTS]; + float act2[MAX_INPUTS]; /* holds the sigmoid gate, so output is only written in the final loop and may alias input */ celt_assert(layer->nb_inputs == layer->nb_outputs); - compute_linear(layer, output, input); - compute_activation(output, output, layer->nb_outputs, ACTIVATION_SIGMOID); compute_activation(act1, input, layer->nb_outputs, activation); - for (i=0;i<layer->nb_outputs;i++) output[i] *= act1[i]; + compute_linear(layer, act2, input); + compute_activation(act2, act2, layer->nb_outputs, ACTIVATION_SIGMOID); + for (i=0;i<layer->nb_outputs;i++) output[i] = act1[i]*act2[i]; } void compute_activation(float *output, const float *input, int N, int activation) diff --git a/dnn/nnet.h b/dnn/nnet.h index 2916b33f7..2b43308a7 100644 --- a/dnn/nnet.h +++ b/dnn/nnet.h @@ -135,6 +135,7 @@ void compute_linear(const LinearLayer *linear, float *out, const float *in); void compute_generic_dense(const LinearLayer *layer, float *output, const float *input, int activation); void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *recurrent_weights, float *state, const float *in); void compute_generic_conv1d(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int activation); +void compute_gated_activation(const LinearLayer *layer, float *output, const float *input, int activation); void compute_activation(float *output, const float *input, int N, int activation); @@ -163,6 +164,7 @@ extern const WeightArray lpcnet_arrays[]; extern const WeightArray lpcnet_plc_arrays[]; extern const 
WeightArray rdovaeenc_arrays[]; extern const WeightArray rdovaedec_arrays[]; +extern const WeightArray fwgan_arrays[]; 
int linear_init(LinearLayer *layer, const WeightArray *arrays, const char *bias, diff --git a/lpcnet_headers.mk b/lpcnet_headers.mk index 93ca74fbf..fc3fc84c5 100644 --- a/lpcnet_headers.mk +++ b/lpcnet_headers.mk @@ -9,6 +9,7 @@ dnn/burg.h \ dnn/common.h \ dnn/freq.h \ dnn/fwgan.h \ +dnn/fwgan_data.h \ dnn/kiss99.h \ dnn/lpcnet_private.h \ dnn/nnet_data.h \ diff --git a/lpcnet_sources.mk b/lpcnet_sources.mk index 4c6e73f32..61cbb1f1c 100644 --- a/lpcnet_sources.mk +++ b/lpcnet_sources.mk @@ -2,6 +2,7 @@ LPCNET_SOURCES = \ dnn/burg.c \ dnn/freq.c \ dnn/fwgan.c \ +dnn/fwgan_data.c \ dnn/kiss99.c \ dnn/lpcnet.c \ dnn/lpcnet_enc.c \ -- GitLab