From d3321008081396024ca83e9ec5a5af894776c0a0 Mon Sep 17 00:00:00 2001 From: Jean-Marc Valin <jmvalin@amazon.com> Date: Sat, 3 Jul 2021 14:54:08 -0400 Subject: [PATCH] Representing output pdf as binary probability tree Saves on the MDense/softmax computation since we only need to compute 8 values instead of 256. --- dnn/lpcnet.c | 10 ++-- dnn/nnet.c | 95 ++++++++++++++------------------- dnn/nnet.h | 2 + dnn/training_tf2/dump_lpcnet.py | 2 +- dnn/training_tf2/lpcnet.py | 15 ++++-- dnn/vec.h | 6 +-- dnn/vec_avx.h | 36 ++++++++----- dnn/vec_neon.h | 77 +++++++++++++++++++++++--- 8 files changed, 154 insertions(+), 89 deletions(-) diff --git a/dnn/lpcnet.c b/dnn/lpcnet.c index 0e3c6b118..152c56f19 100644 --- a/dnn/lpcnet.c +++ b/dnn/lpcnet.c @@ -77,7 +77,7 @@ void run_frame_network(LPCNetState *lpcnet, float *condition, float *gru_a_condi if (lpcnet->frame_count < 1000) lpcnet->frame_count++; } -void run_sample_network(NNetState *net, float *pdf, const float *condition, const float *gru_a_condition, int last_exc, int last_sig, int pred) +int run_sample_network(NNetState *net, const float *condition, const float *gru_a_condition, int last_exc, int last_sig, int pred) { float gru_a_input[3*GRU_A_STATE_SIZE]; float in_b[GRU_A_STATE_SIZE+FEATURE_DENSE2_OUT_SIZE]; @@ -90,7 +90,7 @@ void run_sample_network(NNetState *net, float *pdf, const float *condition, cons RNN_COPY(in_b, net->gru_a_state, GRU_A_STATE_SIZE); RNN_COPY(&in_b[GRU_A_STATE_SIZE], condition, FEATURE_DENSE2_OUT_SIZE); compute_gru2(&gru_b, net->gru_b_state, in_b); - compute_mdense(&dual_fc, pdf, net->gru_b_state); + return sample_mdense(&dual_fc, net->gru_b_state); } LPCNET_EXPORT int lpcnet_get_size() @@ -124,14 +124,11 @@ LPCNET_EXPORT void lpcnet_synthesize(LPCNetState *lpcnet, const float *features, int i; float condition[FEATURE_DENSE2_OUT_SIZE]; float lpc[LPC_ORDER]; - float pdf[DUAL_FC_OUT_SIZE]; float gru_a_condition[3*GRU_A_STATE_SIZE]; int pitch; - float pitch_gain; /* Matches the Python code -- the 0.1 avoids rounding issues. 
*/ pitch = (int)floor(.1 + 50*features[36]+100); pitch = IMIN(255, IMAX(33, pitch)); - pitch_gain = lpcnet->old_gain[FEATURES_DELAY-1]; memmove(&lpcnet->old_gain[1], &lpcnet->old_gain[0], (FEATURES_DELAY-1)*sizeof(lpcnet->old_gain[0])); lpcnet->old_gain[0] = features[PITCH_GAIN_FEATURE]; run_frame_network(lpcnet, condition, gru_a_condition, features, pitch); @@ -154,8 +151,7 @@ LPCNET_EXPORT void lpcnet_synthesize(LPCNetState *lpcnet, const float *features, for (j=0;j<LPC_ORDER;j++) pred -= lpcnet->last_sig[j]*lpc[j]; last_sig_ulaw = lin2ulaw(lpcnet->last_sig[0]); pred_ulaw = lin2ulaw(pred); - run_sample_network(&lpcnet->nnet, pdf, condition, gru_a_condition, lpcnet->last_exc, last_sig_ulaw, pred_ulaw); - exc = sample_from_pdf(pdf, DUAL_FC_OUT_SIZE, MAX16(0, 1.5f*pitch_gain - .5f), PDF_FLOOR); + exc = run_sample_network(&lpcnet->nnet, condition, gru_a_condition, lpcnet->last_exc, last_sig_ulaw, pred_ulaw); pcm = pred + ulaw2lin(exc); RNN_MOVE(&lpcnet->last_sig[1], &lpcnet->last_sig[0], LPC_ORDER-1); lpcnet->last_sig[0] = pcm; diff --git a/dnn/nnet.c b/dnn/nnet.c index 27e94e695..fd3adcadb 100644 --- a/dnn/nnet.c +++ b/dnn/nnet.c @@ -141,6 +141,46 @@ void compute_mdense(const MDenseLayer *layer, float *output, const float *input) compute_activation(output, output, N, layer->activation); } +int sample_mdense(const MDenseLayer *layer, const float *input) +{ + int b, j, N, M, C, stride; + M = layer->nb_inputs; + N = layer->nb_neurons; + C = layer->nb_channels; + celt_assert(N*C <= MAX_MDENSE_TMP); + stride = M*C; + + celt_assert(N <= DUAL_FC_OUT_SIZE); + int val=0; + + for (b=0;b<8;b++) + { + int bit; + int i; + float sum1, sum2; + + i = (1<<b) | val; + + sum1 = layer->bias[i]; + sum2 = layer->bias[i + N]; + for (j=0;j<M;j++) { + sum1 += layer->input_weights[i*stride + j]*input[j]; + sum2 += layer->input_weights[i*stride + j + M]*input[j]; + } + sum1 = layer->factor[i]*tanh_approx(sum1); + sum2 = layer->factor[N + i]*tanh_approx(sum2); + sum1 += sum2; + //sum1 = 1.f/(1 + exp(-sum1)); + sum1 = sigmoid_approx(sum1); + + bit = .025+.95*((rand()+.5f)/(RAND_MAX+1.f)) < sum1; + val = (val << 1) | bit; + } + return val; + +} + + #if 0 void compute_gru(const GRULayer *gru, float *state, const float *input) { @@ -363,58 +403,3 @@ void accum_embedding(const EmbeddingLayer *layer, float *output, int input) output[i] += layer->embedding_weights[input*layer->dim + i]; } } - -int sample_from_pdf(const float *pdf, int N, float exp_boost, float pdf_floor) -{ - int i; - float sum, norm; - float r; - float tmp[DUAL_FC_OUT_SIZE]; - celt_assert(N <= DUAL_FC_OUT_SIZE); - sum = 0; -#ifdef SOFTMAX_HACK - for (i=0;i<N;i++) - { - tmp[i] = pdf[i] * (1.f+exp_boost); - } - softmax(tmp, tmp, N); - for (i=0;i<N;i++) - { - sum += tmp[i]; - } -#else - /* Decrease the temperature of the sampling. */ - for (i=0;i<N;i++) - { - tmp[i] = pow(pdf[i], 1.f+exp_boost); - sum += tmp[i]; - } -#endif - norm = 1.f/sum; - /* Convert tmp to a CDF while subtracting the floor */ - tmp[0] = MAX16(0, norm*tmp[0] - pdf_floor); - for (i=1;i<N;i++) - { - tmp[i] = tmp[i-1] + MAX16(0, norm*tmp[i] - pdf_floor); - } - /* Do the sampling (from the cdf). */ - r = tmp[N-1] * ((rand()+.5f)/(RAND_MAX+1.f)); -#if 1 /* Bisection search in the CDF (faster than the equivalent linear one below). 
*/ - { - int start=-1; - int end = N-1; - while (end > start+1) { - int mid = (start+end)>>1; - if (r <= tmp[mid]) end = mid; - else start = mid; - } - return end; - } -#else - for (i=0;i<N-1;i++) - { - if (r <= tmp[i]) return i; - } - return N-1; -#endif -} diff --git a/dnn/nnet.h b/dnn/nnet.h index ac0417695..c1b130cb0 100644 --- a/dnn/nnet.h +++ b/dnn/nnet.h @@ -97,6 +97,8 @@ void compute_dense(const DenseLayer *layer, float *output, const float *input); void compute_mdense(const MDenseLayer *layer, float *output, const float *input); +int sample_mdense(const MDenseLayer *layer, const float *input); + void compute_gru(const GRULayer *gru, float *state, const float *input); void compute_gru2(const GRULayer *gru, float *state, const float *input); diff --git a/dnn/training_tf2/dump_lpcnet.py b/dnn/training_tf2/dump_lpcnet.py index 88cc42985..730ecb750 100755 --- a/dnn/training_tf2/dump_lpcnet.py +++ b/dnn/training_tf2/dump_lpcnet.py @@ -183,7 +183,7 @@ def dump_mdense_layer(self, f, hf): name = self.name print("printing layer " + name + " of type " + self.__class__.__name__) weights = self.get_weights() - printVector(f, np.transpose(weights[0], (1, 2, 0)), name + '_weights') + printVector(f, np.transpose(weights[0], (0, 2, 1)), name + '_weights') printVector(f, np.transpose(weights[1], (1, 0)), name + '_bias') printVector(f, np.transpose(weights[2], (1, 0)), name + '_factor') activation = self.activation.__name__.upper() diff --git a/dnn/training_tf2/lpcnet.py b/dnn/training_tf2/lpcnet.py index 6bcc27b16..d4981dd4e 100644 --- a/dnn/training_tf2/lpcnet.py +++ b/dnn/training_tf2/lpcnet.py @@ -44,6 +44,15 @@ pcm_bits = 8 embed_size = 128 pcm_levels = 2**pcm_bits +def interleave(p): + p2=tf.expand_dims(p, 3) + nb_repeats = pcm_levels//(2*p.shape[2]) + p3 = tf.reshape(tf.repeat(tf.concat([1-p2, p2], 3), nb_repeats), (-1, 2400, pcm_levels)) + return p3 + +def tree_to_pdf(p): + return interleave(p[:,:,1:2]) * interleave(p[:,:,2:4]) * interleave(p[:,:,4:8]) * interleave(p[:,:,8:16]) * interleave(p[:,:,16:32]) * interleave(p[:,:,32:64]) * interleave(p[:,:,64:128]) * interleave(p[:,:,128:256]) + def quant_regularizer(x): Q = 128 Q_1 = 1./Q @@ -185,10 +194,10 @@ def new_lpcnet_model(rnn_units1=384, rnn_units2=16, nb_used_features = 38, train kernel_constraint=constraint, kernel_regularizer=quant) rnn_in = Concatenate()([cpcm, rep(cfeat)]) - md = MDense(pcm_levels, activation='softmax', name='dual_fc') + md = MDense(pcm_levels, activation='sigmoid', name='dual_fc') gru_out1, _ = rnn(rnn_in) gru_out2, _ = rnn2(Concatenate()([gru_out1, rep(cfeat)])) - ulaw_prob = md(gru_out2) + ulaw_prob = Lambda(tree_to_pdf)(md(gru_out2)) if adaptation: rnn.trainable=False @@ -207,7 +216,7 @@ def new_lpcnet_model(rnn_units1=384, rnn_units2=16, nb_used_features = 38, train dec_rnn_in = Concatenate()([cpcm, dec_feat]) dec_gru_out1, state1 = rnn(dec_rnn_in, initial_state=dec_state1) dec_gru_out2, state2 = rnn2(Concatenate()([dec_gru_out1, dec_feat]), initial_state=dec_state2) - dec_ulaw_prob = md(dec_gru_out2) + dec_ulaw_prob = Lambda(tree_to_pdf)(md(dec_gru_out2)) decoder = Model([pcm, dec_feat, dec_state1, dec_state2], [dec_ulaw_prob, state1, state2]) return model, encoder, decoder diff --git a/dnn/vec.h b/dnn/vec.h index ef44cba53..f93200723 100644 --- a/dnn/vec.h +++ b/dnn/vec.h @@ -79,7 +79,7 @@ static inline float celt_exp2(float x) } #define celt_exp(x) celt_exp2((x)*1.44269504f) -static inline float tansig_approx(float x) +static inline float tanh_approx(float x) { int i; float y, dy; @@ -100,7 +100,7 @@ static 
inline float tansig_approx(float x) static inline float sigmoid_approx(float x) { - return .5f + .5f*tansig_approx(.5f*x); + return .5f + .5f*tanh_approx(.5f*x); } static inline void softmax(float *y, const float *x, int N) @@ -115,7 +115,7 @@ static inline void vec_tanh(float *y, const float *x, int N) int i; for (i=0;i<N;i++) { - y[i] = tansig_approx(x[i]); + y[i] = tanh_approx(x[i]); } } diff --git a/dnn/vec_avx.h b/dnn/vec_avx.h index e6546918c..df02dca36 100644 --- a/dnn/vec_avx.h +++ b/dnn/vec_avx.h @@ -214,6 +214,26 @@ static inline float celt_exp(float x) return out[0]; } +static inline float tanh_approx(float x) +{ + float out[8]; + __m256 X, Y; + X = _mm256_set1_ps(x); + Y = tanh8_approx(X); + _mm256_storeu_ps(out, Y); + return out[0]; +} + +static inline float sigmoid_approx(float x) +{ + float out[8]; + __m256 X, Y; + X = _mm256_set1_ps(x); + Y = sigmoid8_approx(X); + _mm256_storeu_ps(out, Y); + return out[0]; +} + static inline void softmax(float *y, const float *x, int N) { int i; @@ -241,9 +261,7 @@ static inline void vec_tanh(float *y, const float *x, int N) } for (;i<N;i++) { - float ex2; - ex2 = celt_exp(2*x[i]); - y[i] = (ex2-1)/(ex2+1); + y[i] = tanh_approx(x[i]); } } @@ -259,9 +277,7 @@ static inline void vec_sigmoid(float *y, const float *x, int N) } for (;i<N;i++) { - float ex; - ex = celt_exp(x[i]); - y[i] = (ex)/(ex+1); + y[i] = sigmoid_approx(x[i]); } } #else @@ -277,9 +293,7 @@ static inline void vec_tanh(float *y, const float *x, int N) } for (;i<N;i++) { - float ex2; - ex2 = celt_exp(2*x[i]); - y[i] = (ex2-1)/(ex2+1); + y[i] = tanh_approx(x[i]); } } @@ -295,9 +309,7 @@ static inline void vec_sigmoid(float *y, const float *x, int N) } for (;i<N;i++) { - float ex; - ex = celt_exp(x[i]); - y[i] = (ex)/(ex+1); + y[i] = sigmoid_approx(x[i]); } } diff --git a/dnn/vec_neon.h b/dnn/vec_neon.h index 1209b760d..a964f7516 100644 --- a/dnn/vec_neon.h +++ b/dnn/vec_neon.h @@ -64,6 +64,47 @@ static inline OPUS_INLINE float32x4_t exp4_approx(float32x4_t x) { return Y; } +static inline float32x4_t tanh4_approx(float32x4_t X) +{ + const float32x4_t N0 = vdupq_n_f32(952.52801514f); + const float32x4_t N1 = vdupq_n_f32(96.39235687f); + const float32x4_t N2 = vdupq_n_f32(0.60863042f); + const float32x4_t D0 = vdupq_n_f32(952.72399902f); + const float32x4_t D1 = vdupq_n_f32(413.36801147f); + const float32x4_t D2 = vdupq_n_f32(11.88600922f); + const float32x4_t max_out = vdupq_n_f32(1.f); + const float32x4_t min_out = vdupq_n_f32(-1.f); + float32x4_t X2, num, den; + X2 = vmulq_f32(X, X); + num = vmlaq_f32(N0, X2, vmlaq_f32(N1, N2, X2)); + den = vmlaq_f32(D0, X2, vmlaq_f32(D1, D2, X2)); + num = vmulq_f32(num, X); + den = vrecpeq_f32(den); + num = vmulq_f32(num, den); + return vmaxq_f32(min_out, vminq_f32(max_out, num)); +} + +static inline float32x4_t sigmoid4_approx(float32x4_t X) +{ + const float32x4_t N0 = vdupq_n_f32(238.13200378f); + const float32x4_t N1 = vdupq_n_f32(6.02452230f); + const float32x4_t N2 = vdupq_n_f32(0.00950985f); + const float32x4_t D0 = vdupq_n_f32(952.72399902f); + const float32x4_t D1 = vdupq_n_f32(103.34200287f); + const float32x4_t D2 = vdupq_n_f32(0.74287558f); + const float32x4_t half = vdupq_n_f32(0.5f); + const float32x4_t max_out = vdupq_n_f32(1.f); + const float32x4_t min_out = vdupq_n_f32(0.f); + float32x4_t X2, num, den; + X2 = vmulq_f32(X, X); + num = vmlaq_f32(N0, X2, vmlaq_f32(N1, N2, X2)); + den = vmlaq_f32(D0, X2, vmlaq_f32(D1, D2, X2)); + num = vmulq_f32(num, X); + den = vrecpeq_f32(den); + num = vmlaq_f32(half, num, den); + return 
vmaxq_f32(min_out, vminq_f32(max_out, num)); +} + static inline float celt_exp(float x) { float out[4]; @@ -74,6 +115,26 @@ static inline float celt_exp(float x) return out[0]; } +static inline float tanh_approx(float x) +{ + float out[4]; + float32x4_t X, Y; + X = vdupq_n_f32(x); + Y = tanh4_approx(X); + vst1q_f32(out, Y); + return out[0]; +} + +static inline float sigmoid_approx(float x) +{ + float out[4]; + float32x4_t X, Y; + X = vdupq_n_f32(x); + Y = sigmoid4_approx(X); + vst1q_f32(out, Y); + return out[0]; +} + static inline void softmax(float *y, const float *x, int N) { int i; @@ -93,13 +154,9 @@ static inline void vec_tanh(float *y, const float *x, int N) int i; for (i=0;i<N-3;i+=4) { - const float32x4_t two = vdupq_n_f32(2.f); - const float32x4_t one = vdupq_n_f32(1.f); float32x4_t X, Y; X = vld1q_f32(&x[i]); - X = vmulq_f32(X, two); - Y = exp4_approx(X); - Y = vmulq_f32(vsubq_f32(Y, one), vrecpeq_f32(vaddq_f32(Y, one))); + Y = tanh4_approx(X); vst1q_f32(&y[i], Y); } for (;i<N;i++) @@ -115,11 +172,9 @@ static inline void vec_sigmoid(float *y, const float *x, int N) int i; for (i=0;i<N-3;i+=4) { - const float32x4_t one = vdupq_n_f32(1.f); float32x4_t X, Y; X = vld1q_f32(&x[i]); - Y = exp4_approx(X); - Y = vmulq_f32(Y, vrecpeq_f32(vaddq_f32(Y, one))); + Y = sigmoid4_approx(X); vst1q_f32(&y[i], Y); } for (;i<N;i++) @@ -221,10 +276,16 @@ static inline void sparse_sgemv_accum16(float *out, const float *w, int rows, co #define MAX_INPUTS 2048 #define MAX_OUTPUTS 8192 +#if __ARM_FEATURE_DOTPROD +static inline int32x4_t vdotprod(int32x4_t acc, int8x16_t a, int8x16_t b) { + return vdotq_s32(acc, a, b); +} +#else static inline int32x4_t vdotprod(int32x4_t acc, int8x16_t a, int8x16_t b) { return vpadalq_s16(acc, vpaddq_s16(vmull_s8(vget_low_s8(a), vget_low_s8(b)), vmull_high_s8(a, b))); } +#endif static inline void sgemv_accum8x4(float *_out, const qweight *w, int rows, int cols, int col_stride, const float *_x) { -- GitLab
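
Reviewer note, not part of the patch: the change factors the 256-way output distribution into 8 conditional bit probabilities stored as a binary heap. p[1] is the root (the most significant bit of the u-law value), p[2..3] the next level, and in general p[(1<<b) | prefix] is P(bit=1 | the b bits already decided), which is exactly the index i = (1<<b) | val computed in sample_mdense(). Below is a minimal NumPy sketch (function names are illustrative, not from the patch) showing both directions: expanding the per-bit sigmoids into the full, normalized 256-way pdf as tree_to_pdf() does for training, and sampling by walking the tree so that only 8 of the 256 dual-FC outputs are ever evaluated, as sample_mdense() does at synthesis time.

    import numpy as np

    def tree_to_pdf_np(p, bits=8):
        """Expand heap-indexed bit probabilities into a full 2**bits pdf.
        p has length 2**bits; p[0] is unused and p[node] = P(bit=1 | prefix)."""
        levels = 2**bits
        pdf = np.ones(levels)
        for val in range(levels):
            node = 1
            for b in range(bits - 1, -1, -1):      # MSB first
                bit = (val >> b) & 1
                pdf[val] *= p[node] if bit else 1 - p[node]
                node = (node << 1) | bit           # ends at levels + val
        return pdf

    def sample_tree(p, bits=8):
        """Draw one value by walking the tree: only `bits` probabilities are
        evaluated instead of 2**bits, mirroring sample_mdense()."""
        val = 0
        for b in range(bits):
            node = (1 << b) | val                  # same indexing as the C code
            bit = int(np.random.uniform() < p[node])
            val = (val << 1) | bit
        return val

    p = np.random.uniform(0.05, 0.95, 256)         # fake bit probabilities
    print(tree_to_pdf_np(p).sum())                 # ~1.0: a valid pdf by construction
    print(sample_tree(p))                          # one u-law excitation index

Because the two branches below each node carry probabilities p and 1-p, summing the leaf products telescopes to 1, so the expanded pdf is normalized by construction; that is why the softmax (and the old sample_from_pdf() temperature/CDF machinery) can be dropped. The ".025+.95*" scaling of the uniform draw in sample_mdense() appears to take over the role of the old PDF_FLOOR: a branch whose conditional probability falls below roughly 2.5% is never taken, and one above roughly 97.5% is always taken.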