diff --git a/dnn/nnet.c b/dnn/nnet.c
index 28ebf26ef8b478b1efb1306de7e72abe28dbf542..84472ee24efc9d9c83dd1b39f13c9b88617d366b 100644
--- a/dnn/nnet.c
+++ b/dnn/nnet.c
@@ -282,6 +282,11 @@ static void sparse_gemm_accum16(float *out, const float *weights, int rows, cons
 }
 
 #else
+
+#warning Compiling without any vectorization. This code will be very slow.
+#warning Try adding -mavx2 -mfma to the compiler flags.
+
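+/* Plain C fallback for the dense matrix-vector product, used when no SIMD code path is enabled. */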
 static void gemm_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
 {
    int i, j;
@@ -314,6 +319,43 @@ static void gemm_accum16(float *out, const float *weights, int rows, int cols, i
       }
    }
 }
+
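+/* Scalar fallback for the sparse matrix-vector product. Weights are stored in
+   contiguous blocks of 16 rows; for each block, idx holds the number of
+   non-zero input columns followed by the index of each of those columns. */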
+static void sparse_gemm_accum16(float *out, const float *w, int rows, const int *idx, const float *x)
+{
+   int i, j;
+   for (i=0;i<rows;i+=16)
+   {
+      int cols;
+      cols = *idx++; /* number of non-zero input columns for this block of 16 rows */
+      for (j=0;j<cols;j++)
+      {
+         float * restrict y;
+         float xj;
+         xj = x[*idx++]; /* fetch the input value at the next non-zero column */
+         y = &out[i];
+         y[0] += w[0]*xj;
+         y[1] += w[1]*xj;
+         y[2] += w[2]*xj;
+         y[3] += w[3]*xj;
+         y[4] += w[4]*xj;
+         y[5] += w[5]*xj;
+         y[6] += w[6]*xj;
+         y[7] += w[7]*xj;
+         y[8] += w[8]*xj;
+         y[9] += w[9]*xj;
+         y[10] += w[10]*xj;
+         y[11] += w[11]*xj;
+         y[12] += w[12]*xj;
+         y[13] += w[13]*xj;
+         y[14] += w[14]*xj;
+         y[15] += w[15]*xj;
+         w += 16; /* advance to the weight block for the next non-zero column */
+      }
+   }
+}
 #endif
 
 static void gemm_accum(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)