From c045702e51e04006f81f6f473ab5326eb24aace6 Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@jmvalin.ca>
Date: Thu, 24 Dec 2020 02:46:19 -0500
Subject: [PATCH] Add non-dot-product AVX code

---
 dnn/vec_avx.h | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/dnn/vec_avx.h b/dnn/vec_avx.h
index 80b022909..e11fae04e 100644
--- a/dnn/vec_avx.h
+++ b/dnn/vec_avx.h
@@ -218,3 +218,45 @@ static void sparse_sgemv_accum16(float *out, const float *weights, int rows, con
    }
 }
 
+#ifdef DOT_PROD /* this patch adds nothing for the DOT_PROD build: the branch is empty */
+#else /* !DOT_PROD: 8x4 block-sparse matrix-vector kernel operating on float weights */
+static void sparse_sgemv_accum8x4(float *out, const float *weights, int rows, const int *idx, const float *x) /* accumulates into out[]: out += (block-sparse weights) * x, in blocks of 8 outputs by 4 inputs */
+{ /* idx is consumed as: for each band of 8 outputs, one block count followed by that many block ids; weights holds 32 floats per block, in the same order */
+   int i, j;
+   for (i=0;i<rows;i+=8) /* one band of 8 outputs per iteration; out[i..i+7] is always read and written, even when rows is not a multiple of 8 */
+   {
+      float * restrict y;
+      int cols;
+      __m256 vy0; /* running sums for the 8 outputs of this band */
+      y = &out[i];
+      vy0 = _mm256_loadu_ps(&y[0]); /* start from the existing output values, so the result is accumulated rather than overwritten */
+      cols = *idx++; /* number of blocks stored for this band */
+      for (j=0;j<cols;j++)
+      {
+         int id;
+         __m256 vxj;
+         __m256 vw;
+         id = *idx++; /* block id: this block uses inputs x[4*id] .. x[4*id+3] */
+         vxj = _mm256_broadcast_ss(&x[4*id]); /* replicate the first input of the block across all 8 lanes */
+         vw = _mm256_loadu_ps(&weights[0]); /* the 8 weights paired with that input, one per output of the band */
+         vy0 = _mm256_fmadd_ps(vw, vxj, vy0); /* vy0 = vw*vxj + vy0 */
+         /* second input of the block, x[4*id+1], against weights[8..15] */
+         vxj = _mm256_broadcast_ss(&x[4*id+1]);
+         vw = _mm256_loadu_ps(&weights[8]);
+         vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
+         /* third input of the block, x[4*id+2], against weights[16..23] */
+         vxj = _mm256_broadcast_ss(&x[4*id+2]);
+         vw = _mm256_loadu_ps(&weights[16]);
+         vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
+         /* fourth input of the block, x[4*id+3], against weights[24..31] */
+         vxj = _mm256_broadcast_ss(&x[4*id+3]);
+         vw = _mm256_loadu_ps(&weights[24]);
+         vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
+         /* step past the 4 x 8 = 32 weights consumed by this block */
+         weights += 32;
+      }
+      _mm256_storeu_ps (&y[0], vy0); /* write the 8 accumulated sums back to out[i..i+7] */
+   }
+}
+#endif /* DOT_PROD */
+
-- 
GitLab