Skip to content
Snippets Groups Projects
Commit 60d6eab6 authored by Jean-Marc Valin
Browse files

Doing a bit of unrolling to speed things up

parent 3e7ab9ff
No related branches found
No related tags found
No related merge requests found
......@@ -502,7 +502,40 @@ static inline void sparse_sgemv_accum8x4(float *_out, const qweight *w, int rows
colblocks = *idx++;
y = &out[i];
vy0 = _mm256_loadu_si256((const __m256i *)&y[0]);
for (j=0;j<colblocks;j++)
j=0;
#if 1 /* Unrolling by 4 gives some gain, comment out if it does not. */
for (;j<colblocks-3;j+=4)
{
__m256i tmp;
__m256i vxj;
__m256i vw;
vxj = _mm256_set1_epi32(*(int*)&x[*idx++]);
vw = _mm256_loadu_si256((const __m256i *)w); //_mm256_lddqu_si256?
tmp = _mm256_maddubs_epi16(vxj, vw); //swap?
tmp = _mm256_madd_epi16(tmp, ones);
vy0 = _mm256_add_epi32(vy0, tmp);
w += 32;
vxj = _mm256_set1_epi32(*(int*)&x[*idx++]);
vw = _mm256_loadu_si256((const __m256i *)w); //_mm256_lddqu_si256?
tmp = _mm256_maddubs_epi16(vxj, vw); //swap?
tmp = _mm256_madd_epi16(tmp, ones);
vy0 = _mm256_add_epi32(vy0, tmp);
w += 32;
vxj = _mm256_set1_epi32(*(int*)&x[*idx++]);
vw = _mm256_loadu_si256((const __m256i *)w); //_mm256_lddqu_si256?
tmp = _mm256_maddubs_epi16(vxj, vw); //swap?
tmp = _mm256_madd_epi16(tmp, ones);
vy0 = _mm256_add_epi32(vy0, tmp);
w += 32;
vxj = _mm256_set1_epi32(*(int*)&x[*idx++]);
vw = _mm256_loadu_si256((const __m256i *)w); //_mm256_lddqu_si256?
tmp = _mm256_maddubs_epi16(vxj, vw); //swap?
tmp = _mm256_madd_epi16(tmp, ones);
vy0 = _mm256_add_epi32(vy0, tmp);
w += 32;
}
#endif
for (;j<colblocks;j++)
{
__m256i tmp;
__m256i vxj;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment