Skip to content
Snippets Groups Projects
Commit 4e018b22 authored by Jean-Marc Valin's avatar Jean-Marc Valin
Browse files

SSE optimization of remove_doubling()

Should be trivial to adapt for Neon.
parent 39cbc458
No related branches found
No related tags found
No related merge requests found
......@@ -394,6 +394,20 @@ void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTR
RESTORE_STACK;
}
#ifndef OVERRIDE_DUAL_INNER_PROD
/* Generic fallback: accumulates, for i in [0, N), the products x[i]*y1[i]
   and x[i]*y2[i] into a single running sum using MAC16_16 and returns it.
   Both products for a given i are added before moving to i+1.
   The inputs are only read, so they are const-qualified; this keeps the
   prototype in line with platform overrides that define
   OVERRIDE_DUAL_INNER_PROD and take const pointers.
   Returns 0 when N <= 0. */
static opus_val32 dual_inner_prod(const opus_val16 *x, const opus_val16 *y1, const opus_val16 *y2, int N)
{
   int i;
   opus_val32 xy=0;
   for (i=0;i<N;i++)
   {
      xy = MAC16_16(xy, x[i], y1[i]);
      xy = MAC16_16(xy, x[i], y2[i]);
   }
   return xy;
}
#endif
static const int second_check[16] = {0, 0, 3, 2, 3, 2, 5, 2, 3, 2, 3, 2, 5, 2, 3, 2};
opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
int N, int *T0_, int prev_period, opus_val16 prev_gain)
......@@ -470,12 +484,7 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
{
T1b = (2*second_check[k]*T0+k)/(2*k);
}
xy=0;
for (i=0;i<N;i++)
{
xy = MAC16_16(xy, x[i], x[i-T1]);
xy = MAC16_16(xy, x[i], x[i-T1b]);
}
xy = dual_inner_prod(x, &x[-T1], &x[-T1b], N);
yy = yy_lookup[T1] + yy_lookup[T1b];
#ifdef FIXED_POINT
{
......
......@@ -71,4 +71,34 @@ static inline void xcorr_kernel(const opus_val16 *x, const opus_val16 *y, opus_v
_mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2));
}
#define OVERRIDE_DUAL_INNER_PROD
/* SSE version of dual_inner_prod(): returns the sum over i in [0, N) of
   x[i]*y1[i] + x[i]*y2[i].
   Blocks of four samples are processed with unaligned vector loads into two
   independent accumulators (one per y input); any remaining 1-3 samples are
   folded in afterwards with MAC16_16. */
static inline opus_val32 dual_inner_prod(const opus_val16 *x, const opus_val16 *y1, const opus_val16 *y2, int N)
{
   int pos = 0;
   opus_val32 total = 0;
   __m128 acc_y1 = _mm_setzero_ps();
   __m128 acc_y2 = _mm_setzero_ps();

   /* Vector part: four lanes at a time while a full group of four remains. */
   while (pos < N-3)
   {
      __m128 xv = _mm_loadu_ps(x+pos);
      __m128 y1v = _mm_loadu_ps(y1+pos);
      __m128 y2v = _mm_loadu_ps(y2+pos);
      acc_y1 = _mm_add_ps(acc_y1, _mm_mul_ps(xv, y1v));
      acc_y2 = _mm_add_ps(acc_y2, _mm_mul_ps(xv, y2v));
      pos += 4;
   }

   /* Merge the two accumulators lane by lane. */
   acc_y1 = _mm_add_ps(acc_y1, acc_y2);
   /* Horizontal sum: add the upper pair onto the lower pair, then add
      lane 1 onto lane 0 so that lane 0 holds the full total. */
   acc_y1 = _mm_add_ps(acc_y1, _mm_movehl_ps(acc_y1, acc_y1));
   acc_y1 = _mm_add_ss(acc_y1, _mm_shuffle_ps(acc_y1, acc_y1, 0x55));
   _mm_store_ss(&total, acc_y1);

   /* Scalar tail for the samples that did not fill a group of four. */
   while (pos < N)
   {
      total = MAC16_16(total, x[pos], y1[pos]);
      total = MAC16_16(total, x[pos], y2[pos]);
      pos++;
   }
   return total;
}
#endif
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment