Skip to content
Snippets Groups Projects
Commit 6b582edb authored by Jean-Marc Valin's avatar Jean-Marc Valin
Browse files

WIP: remove scalar code from AVX2 code

parent be392e38
No related branches found
No related tags found
No related merge requests found
......@@ -257,27 +257,12 @@ static inline void sparse_sgemv_accum8x4(float *_out, const qweight *w, int rows
__m256i vxj;
__m256i vw;
int pos;
int xj0, xj1, xj2, xj3;
pos = 4 * (*idx++);
vxj = _mm256_set1_epi32(*(int*)&x[pos]);
xj0 = x[pos+0];
xj1 = x[pos+1];
xj2 = x[pos+2];
xj3 = x[pos+3];
vw = _mm256_loadu_si256((const __m256i *)w); //_mm256_lddqu_si256?
tmp = _mm256_maddubs_epi16(vxj, vw); //swap?
tmp = _mm256_madd_epi16(tmp, ones);
vy0 = _mm256_add_epi32(vy0, tmp);
y[0] += (w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
y[1] += (w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
y[2] += (w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
y[3] += (w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
y[4] += (w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
y[5] += (w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
y[6] += (w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
y[7] += (w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
w += 32;
}
_mm256_storeu_si256 ((__m256i *)&y[0], vy0);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment