From 4710bdf7122d1dbc6f8df41137d5a1d4cddaa603 Mon Sep 17 00:00:00 2001 From: Jean-Marc Valin <jmvalin@amazon.com> Date: Fri, 21 Jul 2023 21:32:28 -0400 Subject: [PATCH] Add SSE2 support Not so much for old machines, as for getting decent performance when not setting -march= (SSE2 is part of the amd64 ABI). --- dnn/vec.h | 2 +- dnn/vec_avx.h | 39 +++++++++++++++++++++++++++++---------- 2 files changed, 30 insertions(+), 11 deletions(-) diff --git a/dnn/vec.h b/dnn/vec.h index 06d157184..f6085cee9 100644 --- a/dnn/vec.h +++ b/dnn/vec.h @@ -35,7 +35,7 @@ #include "arch.h" -#if defined(__AVX__) || defined(__SSSE3__) +#if defined(__AVX__) || defined(__SSE2__) #include "vec_avx.h" #elif (defined(__ARM_NEON__) || defined(__ARM_NEON)) && !defined(DISABLE_NEON) #include "vec_neon.h" diff --git a/dnn/vec_avx.h b/dnn/vec_avx.h index e4b8f0430..d20a2620e 100644 --- a/dnn/vec_avx.h +++ b/dnn/vec_avx.h @@ -40,18 +40,9 @@ /* Use 8-bit dot products unless disabled or if stuck with SSE2. */ -#if (defined(__AVX2__) || defined(__SSSE3__)) && !defined(DISABLE_DOT_PROD) +#ifndef DISABLE_DOT_PROD #define DOT_PROD #define USE_SU_BIAS - -#else - -#if defined(_MSC_VER) -#pragma message ("Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 to get better performance") -#else -#warning "Only SSE and SSE2 are available. 
On newer machines, enable SSSE3/AVX/AVX2 using -march= to get better performance" -#endif - #endif @@ -652,6 +643,34 @@ static inline mm256i_emu mm256_dpbusds_epi32(mm256i_emu src, mm256i_emu a, mm256 #define _mm256_dpbusds_epi32(src, a, b) mm256_dpbusds_epi32(src, a, b) #elif defined(__SSE2__) + /* SSE2 helper: splits every 16-bit lane of a and b into its high and low byte (a zero-extended by logical shifts, b sign-extended by arithmetic shifts), multiplies matching bytes with _mm_madd_epi16 so each 32-bit lane receives the sum of its four byte products, then adds that to src. The final _mm_add_epi32 wraps on overflow rather than saturating. */ +static inline __m128i mm_dpbusds_epi32(__m128i src, __m128i a, __m128i b) { + __m128i ah, al, bh, bl, tmp; + ah = _mm_srli_epi16(a, 8); + bh = _mm_srai_epi16(b, 8); + al = _mm_srli_epi16(_mm_slli_epi16(a, 8), 8); + bl = _mm_srai_epi16(_mm_slli_epi16(b, 8), 8); + tmp = _mm_add_epi32(_mm_madd_epi16(ah, bh), _mm_madd_epi16(al, bl)); + return _mm_add_epi32(src, tmp); +} + /* Runs mm_dpbusds_epi32 separately on the hi and lo members of the mm256i_emu operands and returns the resulting pair. */ +static inline mm256i_emu mm256_dpbusds_epi32(mm256i_emu src, mm256i_emu a, mm256i_emu b) { + mm256i_emu res; + res.hi = mm_dpbusds_epi32(src.hi, a.hi, b.hi); + res.lo = mm_dpbusds_epi32(src.lo, a.lo, b.lo); + return res; +} +#define _mm256_dpbusds_epi32(src, a, b) mm256_dpbusds_epi32(src, a, b) + /* Compile-time notice emitted whenever this SSE2-only branch is compiled: a pragma message under _MSC_VER, a #warning otherwise. */ +#if defined(_MSC_VER) +#pragma message ("Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 to get better performance") +#else +#warning "Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 using -march= to get better performance" +#endif + +#else + +#error "No optimizations in vec_avx.h. This should never happen. " #endif static inline void sgemv16x1(float *out, const float *weights, int rows, int cols, int col_stride, const float *x) -- GitLab