From 1ada7d4d6f838dc0842fc89159747755c516ce24 Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@amazon.com>
Date: Fri, 3 Nov 2023 02:46:38 -0400
Subject: [PATCH] Vectorizing sgemv for multiples of 4 with SSE

---
 dnn/vec_avx.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/dnn/vec_avx.h b/dnn/vec_avx.h
index 767d7e193..a1d6cad27 100644
--- a/dnn/vec_avx.h
+++ b/dnn/vec_avx.h
@@ -709,6 +709,23 @@ static inline void sgemv(float *out, const float *weights, int rows, int cols, i
      }
      _mm256_storeu_ps (&y[0], vy0);
   }
+  for (;i<rows-3;i+=4) /* 4 rows per pass; i is not reset, so it resumes where the loop above stopped */
+  {
+     float *y;
+     __m128 vy0;
+     y = &out[i]; /* destination: out[i] .. out[i+3] */
+     vy0 = _mm_setzero_ps(); /* four running sums (one per row) start at zero */
+     for (j=0;j<cols;j++) /* accumulate the contribution of every column */
+     {
+        __m128 vxj; /* NOTE(review): _mm_broadcast_ss is an AVX intrinsic; confirm an SSE fallback exists */
+        __m128 vw;
+        vxj = _mm_broadcast_ss(&x[j]); /* x[j] copied into all four lanes */
+
+        vw = _mm_loadu_ps(&weights[j*col_stride + i]); /* unaligned load: column j, rows i..i+3 */
+        vy0 = _mm_fmadd_ps(vw, vxj, vy0); /* vy0 = vw*vxj + vy0 */
+     }
+     _mm_storeu_ps (&y[0], vy0); /* unaligned store of the four results */
+  }
   for (;i<rows;i++)
   {
     out[i] = 0;
-- 
GitLab