From 11736ca9e3654397bfe472e51f2ec96433764efe Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@jmvalin.ca>
Date: Thu, 24 Dec 2020 03:47:42 -0500
Subject: [PATCH] WIP: 8-bit mul

---
 dnn/nnet.h                      |  8 ++++-
 dnn/training_tf2/dump_lpcnet.py |  8 +++--
 dnn/vec.h                       | 62 ++++++++++-----------------------
 dnn/vec_avx.h                   | 28 ++++++++++++++-
 4 files changed, 58 insertions(+), 48 deletions(-)

diff --git a/dnn/nnet.h b/dnn/nnet.h
index 14b0bced6..a0033de6e 100644
--- a/dnn/nnet.h
+++ b/dnn/nnet.h
@@ -34,6 +34,12 @@
 #define ACTIVATION_RELU 3
 #define ACTIVATION_SOFTMAX 4
 
+#ifdef DOT_PROD
+typedef signed char qweight;
+#else
+typedef float qweight;
+#endif
+
 typedef struct {
   const float *bias;
   const float *input_weights;
@@ -65,7 +71,7 @@ typedef struct {
 typedef struct {
   const float *bias;
   const float *diag_weights;
-  const float *recurrent_weights;
+  const qweight *recurrent_weights;
   const int *idx;
   int nb_neurons;
   int activation;
diff --git a/dnn/training_tf2/dump_lpcnet.py b/dnn/training_tf2/dump_lpcnet.py
index 3442e28d7..779824491 100755
--- a/dnn/training_tf2/dump_lpcnet.py
+++ b/dnn/training_tf2/dump_lpcnet.py
@@ -81,9 +81,10 @@ def printSparseVector(f, A, name):
             W = np.concatenate([W, vblock])
         idx[pos] = nb_nonzero
     f.write('#ifdef DOT_PROD\n')
-    printVector(f, W, name)
+    W = np.minimum(127, np.maximum(-128, np.round(W*128)))
+    printVector(f, W.astype('int'), name, dtype='qweight')
     f.write('#else /*DOT_PROD*/\n')
-    printVector(f, W0, name)
+    printVector(f, W0, name, dtype='nnet_weight')
     f.write('#endif /*DOT_PROD*/\n')
     #idx = np.tile(np.concatenate([np.array([N]), np.arange(N)]), 3*N//16)
     printVector(f, idx, name + '_idx', dtype='int')
@@ -232,7 +233,8 @@
 f = open(cfile, 'w')
 hf = open(hfile, 'w')
 
-f.write('/*This file is automatically generated from a Keras model*/\n\n')
+f.write('/*This file is automatically generated from a Keras model*/\n')
+f.write('/*based on model {}*/\n\n'.format(sys.argv[1]))
 f.write('#ifdef HAVE_CONFIG_H\n#include "config.h"\n#endif\n\n#include "nnet.h"\n#include "{}"\n\n'.format(hfile))
 
 hf.write('/*This file is automatically generated from a Keras model*/\n\n')
diff --git a/dnn/vec.h b/dnn/vec.h
index a393e9568..7317c1736 100644
--- a/dnn/vec.h
+++ b/dnn/vec.h
@@ -25,6 +25,9 @@
    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
+
+#include "nnet.h"
+
 /* No AVX2/FMA support */
 #ifndef LPCNET_TEST
 static float celt_exp2(float x)
@@ -164,8 +167,8 @@ static void sparse_sgemv_accum16(float *out, const float *w, int rows, const int *idx, const float *x)
 }
 
 #ifdef DOT_PROD
-
-static void sparse_sgemv_accum8x4(float *out, const float *w, int rows, const int *idx, const float *x)
+#define SCALE_1 (1.f/128.f/127.f)
+static void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, const int *idx, const float *x)
 {
    int i, j;
    for (i=0;i<rows;i+=8)
@@ -176,55 +179,28 @@ static void sparse_sgemv_accum8x4(float *out, const float *w, int rows, const int *idx, const float *x)
       {
         int pos;
         float * restrict y;
-        float xj0, xj1, xj2, xj3;
+        int xj0, xj1, xj2, xj3;
         pos = 4 * (*idx++);
-        xj0 = x[pos+0];
-        xj1 = x[pos+1];
-        xj2 = x[pos+2];
-        xj3 = x[pos+3];
+        xj0 = floor(.5+127*x[pos+0]);
+        xj1 = floor(.5+127*x[pos+1]);
+        xj2 = floor(.5+127*x[pos+2]);
+        xj3 = floor(.5+127*x[pos+3]);
         y = &out[i];
-        y[0] += w[0]*xj0;
-        y[1] += w[4]*xj0;
-        y[2] += w[8]*xj0;
-        y[3] += w[12]*xj0;
-        y[4] += w[16]*xj0;
-        y[5] += w[20]*xj0;
-        y[6] += w[24]*xj0;
-        y[7] += w[28]*xj0;
-
-        y[0] += w[1]*xj1;
-        y[1] += w[5]*xj1;
-        y[2] += w[9]*xj1;
-        y[3] += w[13]*xj1;
-        y[4] += w[17]*xj1;
-        y[5] += w[21]*xj1;
-        y[6] += w[25]*xj1;
-        y[7] += w[29]*xj1;
-
-        y[0] += w[2]*xj2;
-        y[1] += w[6]*xj2;
-        y[2] += w[10]*xj2;
-        y[3] += w[14]*xj2;
-        y[4] += w[18]*xj2;
-        y[5] += w[22]*xj2;
-        y[6] += w[26]*xj2;
-        y[7] += w[30]*xj2;
-
-        y[0] += w[3]*xj3;
-        y[1] += w[7]*xj3;
-        y[2] += w[11]*xj3;
-        y[3] += w[15]*xj3;
-        y[4] += w[19]*xj3;
-        y[5] += w[23]*xj3;
-        y[6] += w[27]*xj3;
-        y[7] += w[31]*xj3;
+        y[0] += SCALE_1*(w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
+        y[1] += SCALE_1*(w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
+        y[2] += SCALE_1*(w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
+        y[3] += SCALE_1*(w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
+        y[4] += SCALE_1*(w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
+        y[5] += SCALE_1*(w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
+        y[6] += SCALE_1*(w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
+        y[7] += SCALE_1*(w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
         w += 32;
       }
    }
 }
 
 #else
-static void sparse_sgemv_accum8x4(float *out, const float *w, int rows, const int *idx, const float *x)
+static void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, const int *idx, const float *x)
 {
    int i, j;
    for (i=0;i<rows;i+=8)
diff --git a/dnn/vec_avx.h b/dnn/vec_avx.h
index e11fae04e..f9ddc22e7 100644
--- a/dnn/vec_avx.h
+++ b/dnn/vec_avx.h
@@ -219,8 +219,34 @@ static void sparse_sgemv_accum16(float *out, const float *weights, int rows, const int *idx, const float *x)
 }
 
 #ifdef DOT_PROD
+static void sparse_sgemv_accum8x4(float *out, const qweight *weights, int rows, const int *idx, const float *x)
+{
+   int i, j;
+   for (i=0;i<rows;i+=8)
+   {
+      float * restrict y;
+      int cols;
+      __m256 vy0;
+      y = &out[i];
+      vy0 = _mm256_loadu_ps(&y[0]);
+      cols = *idx++;
+      for (j=0;j<cols;j++)
+      {
+         int id;
+         __m256 vxj;
+         __m256 vw;
+         id = *idx++;
+
+         //kernel goes here
+
+         weights += 32;
+      }
+      _mm256_storeu_ps (&y[0], vy0);
+   }
+}
+
 #else
-static void sparse_sgemv_accum8x4(float *out, const float *weights, int rows, const int *idx, const float *x)
+static void sparse_sgemv_accum8x4(float *out, const qweight *weights, int rows, const int *idx, const float *x)
 {
    int i, j;
    for (i=0;i<rows;i+=8)
-- 
GitLab
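
Note on the unfinished AVX2 path: the DOT_PROD version of sparse_sgemv_accum8x4 in dnn/vec_avx.h is still a stub ("//kernel goes here"). The scalar reference in dnn/vec.h defines the intended arithmetic: weights are quantized as round(128*w) clamped to [-128,127], inputs as round(127*x), and each 8x4 block accumulates y[r] += SCALE_1 * (w[4r+0]*xj0 + ... + w[4r+3]*xj3), where SCALE_1 = 1/(128*127) undoes both quantization scales. The sketch below is one hypothetical way the stub could be filled in; it is not the patch author's kernel. The _sketch suffix is ours, and for simplicity it mirrors the scalar code by converting each weight column back to float, so it is correct but not fast. A production kernel would stay in the integer domain, e.g. via _mm256_maddubs_epi16, which multiplies unsigned by signed bytes and would therefore require biasing the quantized inputs into unsigned range.

#include <math.h>
#include <immintrin.h>

#define SCALE_1 (1.f/128.f/127.f)

/* Hypothetical completion of the AVX2 stub; illustrative only.
   qweight is signed char under DOT_PROD, so it is spelled out here
   to keep the sketch self-contained. Requires AVX2+FMA. */
static void sparse_sgemv_accum8x4_sketch(float *out, const signed char *weights,
                                         int rows, const int *idx, const float *x)
{
   int i, j;
   for (i=0;i<rows;i+=8)
   {
      float * restrict y = &out[i];
      __m256 vy0 = _mm256_loadu_ps(&y[0]);
      int cols = *idx++;
      for (j=0;j<cols;j++)
      {
         int id = *idx++;
         /* Quantize the 4 input values exactly as the scalar reference does. */
         float xj0 = (float)floor(.5 + 127*x[4*id+0]);
         float xj1 = (float)floor(.5 + 127*x[4*id+1]);
         float xj2 = (float)floor(.5 + 127*x[4*id+2]);
         float xj3 = (float)floor(.5 + 127*x[4*id+3]);
         /* The block is stored row-major: column c of row r is weights[4*r+c].
            Gather each column into a float vector (simple, not fast). */
         __m256 vw0 = _mm256_setr_ps(weights[0],  weights[4],  weights[8],  weights[12],
                                     weights[16], weights[20], weights[24], weights[28]);
         __m256 vw1 = _mm256_setr_ps(weights[1],  weights[5],  weights[9],  weights[13],
                                     weights[17], weights[21], weights[25], weights[29]);
         __m256 vw2 = _mm256_setr_ps(weights[2],  weights[6],  weights[10], weights[14],
                                     weights[18], weights[22], weights[26], weights[30]);
         __m256 vw3 = _mm256_setr_ps(weights[3],  weights[7],  weights[11], weights[15],
                                     weights[19], weights[23], weights[27], weights[31]);
         /* Fold SCALE_1 into the broadcast inputs, so lane r accumulates
            y[r] += SCALE_1*(w[4r+0]*xj0 + w[4r+1]*xj1 + w[4r+2]*xj2 + w[4r+3]*xj3). */
         vy0 = _mm256_fmadd_ps(vw0, _mm256_set1_ps(SCALE_1*xj0), vy0);
         vy0 = _mm256_fmadd_ps(vw1, _mm256_set1_ps(SCALE_1*xj1), vy0);
         vy0 = _mm256_fmadd_ps(vw2, _mm256_set1_ps(SCALE_1*xj2), vy0);
         vy0 = _mm256_fmadd_ps(vw3, _mm256_set1_ps(SCALE_1*xj3), vy0);
         weights += 32;
      }
      _mm256_storeu_ps(&y[0], vy0);
   }
}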