From 4710bdf7122d1dbc6f8df41137d5a1d4cddaa603 Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@amazon.com>
Date: Fri, 21 Jul 2023 21:32:28 -0400
Subject: [PATCH] Add SSE2 support

Not so much for old machines, as for getting decent performance
when not setting -march= (SSE2 is part of the amd64 ABI).
---
 dnn/vec.h     |  2 +-
 dnn/vec_avx.h | 39 +++++++++++++++++++++++++++++----------
 2 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/dnn/vec.h b/dnn/vec.h
index 06d157184..f6085cee9 100644
--- a/dnn/vec.h
+++ b/dnn/vec.h
@@ -35,7 +35,7 @@
 #include "arch.h"
 
 
-#if defined(__AVX__) || defined(__SSSE3__)
+#if defined(__AVX__) || defined(__SSE2__)
 #include "vec_avx.h"
 #elif (defined(__ARM_NEON__) || defined(__ARM_NEON)) && !defined(DISABLE_NEON)
 #include "vec_neon.h"
diff --git a/dnn/vec_avx.h b/dnn/vec_avx.h
index e4b8f0430..d20a2620e 100644
--- a/dnn/vec_avx.h
+++ b/dnn/vec_avx.h
@@ -40,18 +40,9 @@
 
 
 /* Use 8-bit dot products unless disabled or if stuck with SSE2. */
-#if (defined(__AVX2__) || defined(__SSSE3__)) && !defined(DISABLE_DOT_PROD)
+#ifndef DISABLE_DOT_PROD /* No ISA test any more: DOT_PROD is on unless explicitly disabled, since the SSE2 branch now emulates the 8-bit dot product. */
 #define DOT_PROD
 #define USE_SU_BIAS
-
-#else
-
-#if defined(_MSC_VER)
-#pragma message ("Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 to get better performance")
-#else
-#warning "Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 using -march= to get better performance"
-#endif
-
 #endif
 
 
@@ -652,6 +643,34 @@ static inline mm256i_emu mm256_dpbusds_epi32(mm256i_emu src, mm256i_emu a, mm256
 #define _mm256_dpbusds_epi32(src, a, b) mm256_dpbusds_epi32(src, a, b)
 
 #elif defined(__SSE2__)
+/* SSE2-only 8-bit dot product: per 32-bit lane, returns src plus the sum of the four products of a's bytes (read as unsigned) and b's bytes (read as signed). */
+static inline __m128i mm_dpbusds_epi32(__m128i src, __m128i a, __m128i b) {
+  __m128i ah, al, bh, bl, tmp;
+  ah = _mm_srli_epi16(a, 8); /* high byte of each 16-bit lane of a; logical shift zero-extends it */
+  bh = _mm_srai_epi16(b, 8); /* high byte of each 16-bit lane of b; arithmetic shift sign-extends it */
+  al = _mm_srli_epi16(_mm_slli_epi16(a, 8), 8); /* low byte of a: the left shift drops the high byte, the logical right shift zero-extends */
+  bl = _mm_srai_epi16(_mm_slli_epi16(b, 8), 8); /* low byte of b: same trick, the arithmetic right shift sign-extends */
+  tmp = _mm_add_epi32(_mm_madd_epi16(ah, bh), _mm_madd_epi16(al, bl)); /* each madd sums two 16-bit products into a 32-bit lane; together they cover all four bytes */
+  return _mm_add_epi32(src, tmp); /* plain 32-bit add: the accumulate wraps, it does not saturate */
+}
+/* 256-bit emulated form: applies mm_dpbusds_epi32 independently to the .hi and .lo halves of the mm256i_emu operands; the macro below exposes it under the intrinsic-style name. */
+static inline mm256i_emu mm256_dpbusds_epi32(mm256i_emu src, mm256i_emu a, mm256i_emu b) {
+  mm256i_emu res;
+  res.hi = mm_dpbusds_epi32(src.hi, a.hi, b.hi);
+  res.lo = mm_dpbusds_epi32(src.lo, a.lo, b.lo);
+  return res;
+}
+#define _mm256_dpbusds_epi32(src, a, b) mm256_dpbusds_epi32(src, a, b)
+/* Build-time notice emitted whenever this `#elif defined(__SSE2__)` branch is compiled: #pragma message under _MSC_VER, #warning otherwise. */
+#if defined(_MSC_VER)
+#pragma message ("Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 to get better performance")
+#else
+#warning "Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 using -march= to get better performance"
+#endif
+
+#else
+
+#error "No optimizations in vec_avx.h. This should never happen. "
 #endif
 
 static inline void sgemv16x1(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
-- 
GitLab