diff --git a/dnn/nnet.c b/dnn/nnet.c
index 28ebf26ef8b478b1efb1306de7e72abe28dbf542..84472ee24efc9d9c83dd1b39f13c9b88617d366b 100644
--- a/dnn/nnet.c
+++ b/dnn/nnet.c
@@ -282,6 +282,10 @@ static void sparse_gemm_accum16(float *out, const float *weights, int rows, const int *idx, const float *x)
 }
 
 #else
+
+#warning Compiling without any vectorization. This code will be very slow
+#warning Try adding -mavx2 -mfma
+
 static void gemm_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
 {
    int i, j;
@@ -314,6 +318,40 @@ static void gemm_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
       }
    }
 }
+
+static void sparse_gemm_accum16(float *out, const float *w, int rows, const int *idx, const float *x)
+{
+   int i, j;
+   for (i=0;i<rows;i+=16)
+   {
+      int cols;
+      cols = *idx++;
+      for (j=0;j<cols;j++)
+      {
+         float * restrict y;
+         float xj;
+         xj = x[*idx++];
+         y = &out[i];
+         y[0] += w[0]*xj;
+         y[1] += w[1]*xj;
+         y[2] += w[2]*xj;
+         y[3] += w[3]*xj;
+         y[4] += w[4]*xj;
+         y[5] += w[5]*xj;
+         y[6] += w[6]*xj;
+         y[7] += w[7]*xj;
+         y[8] += w[8]*xj;
+         y[9] += w[9]*xj;
+         y[10] += w[10]*xj;
+         y[11] += w[11]*xj;
+         y[12] += w[12]*xj;
+         y[13] += w[13]*xj;
+         y[14] += w[14]*xj;
+         y[15] += w[15]*xj;
+         w += 16;
+      }
+   }
+}
 #endif
 
 static void gemm_accum(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
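
A note on the sparse layout the new fallback consumes (not part of the patch): the loop structure implies that rows are processed in blocks of 16, that idx[] holds, per block, a count of stored columns followed by that many column indices, and that w[] holds 16 consecutive row values per stored column. The standalone sketch below rebuilds that layout for a toy 16x4 matrix and checks the result against a dense reference. The packing code and main() driver are hypothetical, written only to illustrate the assumed format; the kernel is a compact restatement of the patch's fallback (the 16 unrolled adds written as a loop).

/* Hypothetical driver for the sparse format assumed above; compiles on its
 * own with: cc -std=c99 sparse_demo.c -lm */
#include <stdio.h>
#include <math.h>

#define ROWS 16
#define COLS 4

/* Restatement of the patch's scalar fallback, with the 16 unrolled
   accumulations folded into an inner loop. */
static void sparse_gemm_accum16(float *out, const float *w, int rows, const int *idx, const float *x)
{
   int i, j;
   for (i=0;i<rows;i+=16)
   {
      int cols = *idx++;           /* number of stored columns in this block */
      for (j=0;j<cols;j++)
      {
         int k;
         float xj = x[*idx++];     /* input value at the stored column index */
         for (k=0;k<16;k++) out[i+k] += w[k]*xj;
         w += 16;                  /* 16 row values per stored column */
      }
   }
}

int main(void)
{
   /* Dense column-major weights: only columns 1 and 3 are non-zero. */
   float dense[ROWS*COLS] = {0};
   float x[COLS] = {1.0f, 2.0f, 3.0f, 4.0f};
   float out_sparse[ROWS] = {0}, out_dense[ROWS] = {0};
   float w[2*16];                  /* packed non-zero columns */
   int idx[3] = {2, 1, 3};         /* one 16-row block: 2 columns, indices 1 and 3 */
   int i, j;
   for (i=0;i<ROWS;i++) {
      dense[1*ROWS+i] = 0.5f*i;    /* column 1 */
      dense[3*ROWS+i] = 1.5f-i;    /* column 3 */
   }
   /* Pack the two non-zero columns into the sparse weight array. */
   for (i=0;i<16;i++) w[i]    = dense[1*ROWS+i];
   for (i=0;i<16;i++) w[16+i] = dense[3*ROWS+i];
   /* Dense reference accumulation. */
   for (j=0;j<COLS;j++)
      for (i=0;i<ROWS;i++)
         out_dense[i] += dense[j*ROWS+i]*x[j];
   sparse_gemm_accum16(out_sparse, w, ROWS, idx, x);
   for (i=0;i<ROWS;i++)
      if (fabsf(out_sparse[i]-out_dense[i]) > 1e-6f) {
         printf("mismatch at row %d\n", i);
         return 1;
      }
   printf("sparse and dense results match\n");
   return 0;
}

The 16-row block size mirrors the vectorized path above the #else, so the same packed weight and index arrays feed both the SIMD and scalar fallback kernels.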