From bce779886de72547db991c5d4dec1ae1fc1a7a80 Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@jmvalin.ca>
Date: Sat, 26 Dec 2020 02:15:57 -0500
Subject: [PATCH] WIP: signed*unsigned arithmetic

---
 dnn/nnet.c                      |  22 +++---
 dnn/nnet.h                      |   9 +--
 dnn/tansig_table.h              |   5 ++
 dnn/training_tf2/dump_lpcnet.py |   7 +-
 dnn/vec.h                       | 130 +++++++++++++++++++++++++-------
 dnn/vec_avx.h                   |   7 +-
 6 files changed, 133 insertions(+), 47 deletions(-)

diff --git a/dnn/nnet.c b/dnn/nnet.c
index 0d81562a4..26027eed8 100644
--- a/dnn/nnet.c
+++ b/dnn/nnet.c
@@ -39,17 +39,14 @@
 #include "nnet.h"
 #include "nnet_data.h"
 
-#define SOFTMAX_HACK
-
-#ifdef __AVX__
-#include "vec_avx.h"
-#elif __ARM_NEON__
-#include "vec_neon.h"
-#else
+#ifdef NO_OPTIMIZATIONS
 #warning Compiling without any vectorization. This code will be very slow
-#include "vec.h"
 #endif
+
+#define SOFTMAX_HACK
+
+
 
 static OPUS_INLINE float relu(float x)
 {
    return x < 0 ? 0 : x;
@@ -294,14 +291,19 @@ void compute_sparse_gru(const SparseGRULayer *gru, float *state, const float *in
    celt_assert(input != state);
    celt_assert(gru->reset_after);
    RNN_COPY(zrh, input, 3*N);
+#ifdef USE_SU_BIAS
    for (i=0;i<3*N;i++)
-      recur[i] = gru->bias[3*N + i];
+      recur[i] = gru->subias[3*N + i];
+#else
+   for (i=0;i<3*N;i++)
+      recur[i] = gru->bias[3*N + i];
+#endif
    for (k=0;k<3;k++)
    {
       for (i=0;i<N;i++)
          recur[k*N + i] += gru->diag_weights[k*N + i]*state[i];
    }
-   sparse_sgemv_accum8x4(recur, gru->recurrent_weights, 3*N, gru->idx, state);
+   sparse_sgemv_accum8x4(recur, gru->recurrent_weights, 3*N, 3*N, gru->idx, state);
    for (i=0;i<2*N;i++)
       zrh[i] += recur[i];
    compute_activation(zrh, zrh, 2*N, ACTIVATION_SIGMOID);
diff --git a/dnn/nnet.h b/dnn/nnet.h
index a0033de6e..de74be750 100644
--- a/dnn/nnet.h
+++ b/dnn/nnet.h
@@ -28,18 +28,14 @@
 #ifndef _NNET_H_
 #define _NNET_H_
 
+#include "vec.h"
+
 #define ACTIVATION_LINEAR  0
 #define ACTIVATION_SIGMOID 1
 #define ACTIVATION_TANH    2
 #define ACTIVATION_RELU    3
 #define ACTIVATION_SOFTMAX 4
 
-#ifdef DOT_PROD
-typedef signed char qweight;
-#else
-typedef float qweight;
-#endif
-
 typedef struct {
   const float *bias;
   const float *input_weights;
@@ -70,6 +66,7 @@ typedef struct {
 
 typedef struct {
   const float *bias;
+  const float *subias;
   const float *diag_weights;
   const qweight *recurrent_weights;
   const int *idx;
diff --git a/dnn/tansig_table.h b/dnn/tansig_table.h
index c76f844a7..ebec7e3a9 100644
--- a/dnn/tansig_table.h
+++ b/dnn/tansig_table.h
@@ -1,5 +1,8 @@
 /* This file is auto-generated by gen_tables */
 
+#ifndef TANSIG_TABLE_H
+#define TANSIG_TABLE_H
+
 static const float tansig_table[201] = {
  0.000000f, 0.039979f, 0.079830f, 0.119427f, 0.158649f,
  0.197375f, 0.235496f, 0.272905f, 0.309507f, 0.345214f,
@@ -43,3 +46,5 @@ static const float tansig_table[201] = {
  1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
  1.000000f,
 };
+
+#endif /*TANSIG_TABLE_H*/
diff --git a/dnn/training_tf2/dump_lpcnet.py b/dnn/training_tf2/dump_lpcnet.py
index 779824491..100d7a47a 100755
--- a/dnn/training_tf2/dump_lpcnet.py
+++ b/dnn/training_tf2/dump_lpcnet.py
@@ -102,6 +102,9 @@ def dump_sparse_gru(self, f, hf):
     weights = self.get_weights()
     printSparseVector(f, weights[1], name + '_recurrent_weights')
     printVector(f, weights[-1], name + '_bias')
+    subias = weights[-1].copy()
+    subias[1,:] = subias[1,:] - np.sum(np.clip(weights[1], -1, 1),axis=0)
+    printVector(f, subias, name + '_subias')
     if hasattr(self, 'activation'):
         activation = self.activation.__name__.upper()
     else:
@@ -112,8 +115,8 @@ def dump_sparse_gru(self, f, hf):
     reset_after = 1
     neurons = weights[0].shape[1]//3
     max_rnn_neurons = max(max_rnn_neurons, neurons)
-    f.write('const SparseGRULayer {} = {{\n   {}_bias,\n   {}_recurrent_weights_diag,\n   {}_recurrent_weights,\n   {}_recurrent_weights_idx,\n   {}, ACTIVATION_{}, {}\n}};\n\n'
-            .format(name, name, name, name, name, weights[0].shape[1]//3, activation, reset_after))
+    f.write('const SparseGRULayer {} = {{\n   {}_bias,\n   {}_subias,\n   {}_recurrent_weights_diag,\n   {}_recurrent_weights,\n   {}_recurrent_weights_idx,\n   {}, ACTIVATION_{}, {}\n}};\n\n'
+            .format(name, name, name, name, name, name, weights[0].shape[1]//3, activation, reset_after))
     hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
     hf.write('#define {}_STATE_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
     hf.write('extern const SparseGRULayer {};\n\n'.format(name));
diff --git a/dnn/vec.h b/dnn/vec.h
index 7317c1736..646ed324f 100644
--- a/dnn/vec.h
+++ b/dnn/vec.h
@@ -26,11 +26,33 @@
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-#include "nnet.h"
+#ifndef VEC_H
+#define VEC_H
+
+#include "tansig_table.h"
+#include "opus_types.h"
+#include <math.h>
+#include "arch.h"
+
+#ifdef DOT_PROD
+typedef signed char qweight;
+#else
+typedef float qweight;
+#endif
+
+#ifdef __AVX__
+#include "vec_avx.h"
+#elif __ARM_NEON__
+#include "vec_neon.h"
+#else
+
+//#define USE_SU_BIAS
+
+#define NO_OPTIMIZATIONS
 
 /* No AVX2/FMA support */
 #ifndef LPCNET_TEST
-static float celt_exp2(float x)
+static inline float celt_exp2(float x)
 {
    int integer;
    float frac;
@@ -50,7 +72,7 @@ static float celt_exp2(float x)
 }
 #define celt_exp(x) celt_exp2((x)*1.44269504f)
 
-static float tansig_approx(float x)
+static inline float tansig_approx(float x)
 {
    int i;
    float y, dy;
@@ -69,19 +91,19 @@ static float tansig_approx(float x)
    return sign*y;
 }
 
-static OPUS_INLINE float sigmoid_approx(float x)
+static inline float sigmoid_approx(float x)
 {
    return .5f + .5f*tansig_approx(.5f*x);
 }
 
-static void softmax(float *y, const float *x, int N)
+static inline void softmax(float *y, const float *x, int N)
 {
     int i;
     for (i=0;i<N;i++)
         y[i] = celt_exp(x[i]);
 }
 
-static void vec_tanh(float *y, const float *x, int N)
+static inline void vec_tanh(float *y, const float *x, int N)
 {
     int i;
     for (i=0;i<N;i++)
@@ -90,7 +112,7 @@ static void vec_tanh(float *y, const float *x, int N)
     }
 }
 
-static void vec_sigmoid(float *y, const float *x, int N)
+static inline void vec_sigmoid(float *y, const float *x, int N)
 {
     int i;
     for (i=0;i<N;i++)
@@ -99,7 +121,7 @@ static void vec_sigmoid(float *y, const float *x, int N)
     }
 }
 #endif
-static void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
+static inline void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
 {
    int i, j;
    for (i=0;i<rows;i+=16)
@@ -132,7 +154,7 @@ static void sgemv_accum16(float *out, const float *weights, int rows, int cols,
    }
 }
 
-static void sparse_sgemv_accum16(float *out, const float *w, int rows, const int *idx, const float *x)
+static inline void sparse_sgemv_accum16(float *out, const float *w, int rows, const int *idx, const float *x)
 {
    int i, j;
    for (i=0;i<rows;i+=16)
@@ -167,42 +189,90 @@ static void sparse_sgemv_accum16(float *out, const float *w, int rows, const int
 }
 
 #ifdef DOT_PROD
+
+#define MAX_INPUTS (2048)
+
+
+#define SCALE (128.f*127.f)
 #define SCALE_1 (1.f/128.f/127.f)
-static void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, const int *idx, const float *x)
+
+#ifdef USE_SU_BIAS
+static inline void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, int cols, const int *idx, const float *_x)
 {
    int i, j;
+   unsigned x[MAX_INPUTS];
+   for (i=0;i<rows;i++) out[i] *= SCALE;
+   for (i=0;i<cols;i++) x[i] = 127+floor(.5+127*_x[i]);
    for (i=0;i<rows;i+=8)
    {
-      int cols;
-      cols = *idx++;
-      for (j=0;j<cols;j++)
+      int colblocks;
+      colblocks = *idx++;
+      for (j=0;j<colblocks;j++)
+      {
+         int pos;
+         float * restrict y;
+         int xj0, xj1, xj2, xj3;
+         pos = 4 * (*idx++);
+         xj0 = x[pos+0];
+         xj1 = x[pos+1];
+         xj2 = x[pos+2];
+         xj3 = x[pos+3];
+         y = &out[i];
+         y[0] += (w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
+         y[1] += (w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
+         y[2] += (w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
+         y[3] += (w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
+         y[4] += (w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
+         y[5] += (w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
+         y[6] += (w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
+         y[7] += (w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
+         w += 32;
+      }
+   }
+   for (i=0;i<rows;i++) out[i] *= SCALE_1;
+}
+#else /*USE_SU_BIAS*/
+static inline void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, int cols, const int *idx, const float *_x)
+{
+   int i, j;
+   signed x[MAX_INPUTS];
+   for (i=0;i<rows;i++) out[i] *= SCALE;
+   for (i=0;i<cols;i++) x[i] = floor(.5+127*_x[i]);
+   for (i=0;i<rows;i+=8)
+   {
+      int colblocks;
+      colblocks = *idx++;
+      for (j=0;j<colblocks;j++)
       {
         int pos;
         float * restrict y;
         int xj0, xj1, xj2, xj3;
         pos = 4 * (*idx++);
-         xj0 = floor(.5+127*x[pos+0]);
-         xj1 = floor(.5+127*x[pos+1]);
-         xj2 = floor(.5+127*x[pos+2]);
-         xj3 = floor(.5+127*x[pos+3]);
+         xj0 = x[pos+0];
+         xj1 = x[pos+1];
+         xj2 = x[pos+2];
+         xj3 = x[pos+3];
         y = &out[i];
-         y[0] += SCALE_1*(w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
-         y[1] += SCALE_1*(w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
-         y[2] += SCALE_1*(w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
-         y[3] += SCALE_1*(w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
-         y[4] += SCALE_1*(w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
-         y[5] += SCALE_1*(w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
-         y[6] += SCALE_1*(w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
-         y[7] += SCALE_1*(w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
+         y[0] += (w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
+         y[1] += (w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
+         y[2] += (w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
+         y[3] += (w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
+         y[4] += (w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
+         y[5] += (w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
+         y[6] += (w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
+         y[7] += (w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
         w += 32;
       }
    }
+   for (i=0;i<rows;i++) out[i] *= SCALE_1;
 }
+#endif /*USE_SU_BIAS*/
 
-#else
-static void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, const int *idx, const float *x)
+#else /*DOT_PROD*/
+static inline void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, int ignore, const int *idx, const float *x)
 {
    int i, j;
+   (void)ignore;
    for (i=0;i<rows;i+=8)
    {
      int cols;
@@ -257,4 +327,8 @@ static void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, const
       }
    }
 }
-#endif
+#endif /*DOT_PROD*/
+
+
+#endif /*no optimizations*/
+#endif /*VEC_H*/
diff --git a/dnn/vec_avx.h b/dnn/vec_avx.h
index f9ddc22e7..9dda9f7a2 100644
--- a/dnn/vec_avx.h
+++ b/dnn/vec_avx.h
@@ -29,6 +29,9 @@
   AVX2/FMA implementation of vector operations, compile with -mavx2 -mfma
 */
 
+#ifndef VEC_AVX_H
+#define VEC_AVX_H
+
 #include <immintrin.h>
 
 #ifdef __AVX2__
@@ -246,9 +249,10 @@ static void sparse_sgemv_accum8x4(float *out, const qweight *weights, int rows,
 }
 #else
-static void sparse_sgemv_accum8x4(float *out, const qweight *weights, int rows, const int *idx, const float *x)
+static void sparse_sgemv_accum8x4(float *out, const qweight *weights, int rows, int ignore, const int *idx, const float *x)
 {
    int i, j;
+   (void)ignore;
    for (i=0;i<rows;i+=8)
    {
       float * restrict y;
@@ -286,3 +290,4 @@ static void sparse_sgemv_accum8x4(float *out, const qweight *weights, int rows,
 }
 
 #endif
+#endif /*VEC_AVX_H*/
-- 
GitLab
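
Note on why the subias correction in dump_lpcnet.py works: SIMD signed-by-unsigned byte multiplies (for example x86 vpmaddubsw) need the activation operand to be unsigned, so the USE_SU_BIAS path quantizes activations as x_u = 127 + round(127*x) instead of the signed x_s = round(127*x). That shifts every dot product by 127*sum(w_q). Since weights are quantized as w_q = round(128*w) and the accumulator is rescaled by SCALE_1 = 1/(128*127), the shift equals sum(w) in the float domain, which is exactly what the script subtracts from the recurrent bias (clipping w to [-1, 1] to match the saturation of the quantized weights). The following standalone sketch, not part of the patch and using made-up illustrative values, checks this identity numerically; the file name is hypothetical, build with e.g. "gcc check_subias.c -lm":

#include <math.h>
#include <stdio.h>

#define SCALE   (128.f*127.f)
#define SCALE_1 (1.f/128.f/127.f)

int main(void)
{
   /* Made-up weights/inputs in [-1, 1], chosen to be exactly representable
      after quantization so the two paths match exactly. */
   float w[4] = {0.50f, -0.25f, 0.75f, 0.50f};
   float x[4] = {0.30f, -0.80f, 0.10f, 0.90f};
   float bias = 0.125f;
   float subias = bias;
   float acc_s, acc_u;
   int j;

   /* Same correction dump_lpcnet.py applies when writing the subias table. */
   for (j=0;j<4;j++) subias -= w[j];

   acc_s = bias*SCALE;     /* accumulator for the signed-activation path */
   acc_u = subias*SCALE;   /* accumulator for the unsigned-activation path */
   for (j=0;j<4;j++)
   {
      int wq = (int)floor(.5 + 128*w[j]);  /* quantized signed weight */
      int xs = (int)floor(.5 + 127*x[j]);  /* signed activation, as in DOT_PROD */
      int xu = 127 + xs;                   /* unsigned activation, as in USE_SU_BIAS */
      acc_s += wq*xs;
      acc_u += wq*xu;   /* offset by 127*wq per term, absorbed by subias */
   }
   /* Both lines print the same value. */
   printf("signed bias path:     %f\n", acc_s*SCALE_1);
   printf("unsigned subias path: %f\n", acc_u*SCALE_1);
   return 0;
}

For general weights the two paths differ only by the weight-quantization residual (sum(w_q)/128 - sum(w)), which the clip() in dump_lpcnet.py keeps bounded.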