From c045702e51e04006f81f6f473ab5326eb24aace6 Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@jmvalin.ca>
Date: Thu, 24 Dec 2020 02:46:19 -0500
Subject: [PATCH] Add non-dot-product AVX code

---
 dnn/vec_avx.h | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/dnn/vec_avx.h b/dnn/vec_avx.h
index 80b022909..e11fae04e 100644
--- a/dnn/vec_avx.h
+++ b/dnn/vec_avx.h
@@ -218,3 +218,45 @@ static void sparse_sgemv_accum16(float *out, const float *weights, int rows, con
    }
 }
 
+#ifdef DOT_PROD /* this branch is left empty here: nothing is defined when DOT_PROD is set */
+#else /* !DOT_PROD: float kernel that adds a block-sparse (8 rows x 4 inputs per block) weights-times-x product into out */
+static void sparse_sgemv_accum8x4(float *out, const float *weights, int rows, const int *idx, const float *x)
+{ /* idx is read as a stream: for each 8-row group, a block count followed by that many block ids */
+   int i, j;
+   for (i=0;i<rows;i+=8) /* every pass reads and writes out[i..i+7], even if rows is not a multiple of 8 */
+   {
+      float * restrict y;
+      int cols; /* number of blocks stored for this row group (not a count of individual columns) */
+      __m256 vy0; /* running sums for the 8 outputs of this row group */
+      y = &out[i];
+      vy0 = _mm256_loadu_ps(&y[0]); /* start from the current output values: the result is accumulated, not overwritten */
+      cols = *idx++;
+      for (j=0;j<cols;j++)
+      {
+         int id;
+         __m256 vxj;
+         __m256 vw;
+         id = *idx++; /* block id; the block uses inputs x[4*id] .. x[4*id+3] */
+         vxj = _mm256_broadcast_ss(&x[4*id]); /* first input of the block, copied into all 8 lanes */
+         vw = _mm256_loadu_ps(&weights[0]); /* 8 weights applied to that input, one per output lane */
+         vy0 = _mm256_fmadd_ps(vw, vxj, vy0); /* vy0 = vw*vxj + vy0 (fused multiply-add) */
+
+         vxj = _mm256_broadcast_ss(&x[4*id+1]); /* second input, paired with weights[8..15] */
+         vw = _mm256_loadu_ps(&weights[8]);
+         vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
+
+         vxj = _mm256_broadcast_ss(&x[4*id+2]); /* third input, paired with weights[16..23] */
+         vw = _mm256_loadu_ps(&weights[16]);
+         vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
+
+         vxj = _mm256_broadcast_ss(&x[4*id+3]); /* fourth input, paired with weights[24..31] */
+         vw = _mm256_loadu_ps(&weights[24]);
+         vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
+
+         weights += 32; /* each block consumes 4 inputs x 8 outputs = 32 weights; the pointer keeps advancing across row groups */
+      }
+      _mm256_storeu_ps (&y[0], vy0); /* write the 8 accumulated outputs back to out[i..i+7] */
+   }
+}
+#endif /* DOT_PROD */
+
-- 
GitLab