Skip to content
Snippets Groups Projects
Commit 57cd849c authored by Jean-Marc Valin's avatar Jean-Marc Valin
Browse files

Defining celt_inner_prod() and using it instead of explicit loops.

Also adds an SSE-optimized celt_inner_prod().
parent ff072009
No related branches found
No related tags found
No related merge requests found
......@@ -576,7 +576,7 @@ static int tf_analysis(const CELTMode *m, int len, int isTransient,
*tf_sum = 0;
for (i=0;i<len;i++)
{
int j, k, N;
int k, N;
int narrow;
opus_val32 L1, best_L1;
int best_level=0;
......@@ -768,10 +768,8 @@ static int alloc_trim_analysis(const CELTMode *m, const celt_norm *X,
/* Compute inter-channel correlation for low frequencies */
for (i=0;i<8;i++)
{
int j;
opus_val32 partial = 0;
for (j=m->eBands[i]<<LM;j<m->eBands[i+1]<<LM;j++)
partial = MAC16_16(partial, X[j], X[N0+j]);
opus_val32 partial;
partial = celt_inner_prod(&X[m->eBands[i]<<LM], &X[N0+(m->eBands[i]<<LM)], (m->eBands[i+1]-m->eBands[i])<<LM);
sum = ADD16(sum, EXTRACT16(SHR32(partial, 18)));
}
sum = MULT16_16_Q15(QCONST16(1.f/8, 15), sum);
......@@ -779,10 +777,8 @@ static int alloc_trim_analysis(const CELTMode *m, const celt_norm *X,
minXC = sum;
for (i=8;i<intensity;i++)
{
int j;
opus_val32 partial = 0;
for (j=m->eBands[i]<<LM;j<m->eBands[i+1]<<LM;j++)
partial = MAC16_16(partial, X[j], X[N0+j]);
opus_val32 partial;
partial = celt_inner_prod(&X[m->eBands[i]<<LM], &X[N0+(m->eBands[i]<<LM)], (m->eBands[i+1]-m->eBands[i])<<LM);
minXC = MIN16(minXC, ABS16(EXTRACT16(SHR32(partial, 18))));
}
minXC = MIN16(QCONST16(1.f, 10), ABS16(minXC));
......
......@@ -252,7 +252,7 @@ void
#endif
celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr, int len, int max_pitch)
{
int i,j;
int i;
/*The EDSP version requires that max_pitch is at least 1, and that _x is
32-bit aligned.
Since it's hard to put asserts in assembly, put them here.*/
......@@ -279,9 +279,8 @@ celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr
/* In case max_pitch isn't a multiple of 4, do non-unrolled version. */
for (;i<max_pitch;i++)
{
opus_val32 sum = 0;
for (j=0;j<len;j++)
sum = MAC16_16(sum, _x[j],_y[i+j]);
opus_val32 sum;
sum = celt_inner_prod(_x, _y+i, len);
xcorr[i] = sum;
#ifdef FIXED_POINT
maxcorr = MAX32(maxcorr, sum);
......@@ -361,12 +360,17 @@ void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTR
#endif
for (i=0;i<max_pitch>>1;i++)
{
opus_val32 sum=0;
opus_val32 sum;
xcorr[i] = 0;
if (abs(i-2*best_pitch[0])>2 && abs(i-2*best_pitch[1])>2)
continue;
#ifdef FIXED_POINT
sum = 0;
for (j=0;j<len>>1;j++)
sum += SHR32(MULT16_16(x_lp[j],y[i+j]), shift);
#else
sum = celt_inner_prod(x_lp, y+i, len>>1);
#endif
xcorr[i] = MAX32(-1, sum);
#ifdef FIXED_POINT
maxcorr = MAX32(maxcorr, sum);
......@@ -513,13 +517,7 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
pg = SHR32(frac_div32(best_xy,best_yy+1),16);
for (k=0;k<3;k++)
{
int T1 = T+k-1;
xy = 0;
for (i=0;i<N;i++)
xy = MAC16_16(xy, x[i], x[i-T1]);
xcorr[k] = xy;
}
xcorr[k] = celt_inner_prod(x, x-(T+k-1), N);
if ((xcorr[2]-xcorr[0]) > MULT16_32_Q15(QCONST16(.7f,15),xcorr[1]-xcorr[0]))
offset = 1;
else if ((xcorr[0]-xcorr[2]) > MULT16_32_Q15(QCONST16(.7f,15),xcorr[1]-xcorr[2]))
......
......@@ -141,6 +141,18 @@ static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y
}
#endif
#ifndef OVERRIDE_CELT_INNER_PROD
static OPUS_INLINE opus_val32 celt_inner_prod(const opus_val16 *x, const opus_val16 *y,
int N)
{
int i;
opus_val32 xy=0;
for (i=0;i<N;i++)
xy = MAC16_16(xy, x[i], y[i]);
return xy;
}
#endif
#ifdef FIXED_POINT
opus_val32
#else
......
......@@ -37,6 +37,7 @@
#include "os_support.h"
#include "bands.h"
#include "rate.h"
#include "pitch.h"
static void exp_rotation1(celt_norm *X, int len, int stride, opus_val16 c, opus_val16 s)
{
......@@ -350,15 +351,11 @@ void renormalise_vector(celt_norm *X, int N, opus_val16 gain)
#ifdef FIXED_POINT
int k;
#endif
opus_val32 E = EPSILON;
opus_val32 E;
opus_val16 g;
opus_val32 t;
celt_norm *xptr = X;
for (i=0;i<N;i++)
{
E = MAC16_16(E, *xptr, *xptr);
xptr++;
}
celt_norm *xptr;
E = EPSILON + celt_inner_prod(X, X, N);
#ifdef FIXED_POINT
k = celt_ilog2(E)>>1;
#endif
......@@ -393,14 +390,8 @@ int stereo_itheta(celt_norm *X, celt_norm *Y, int stereo, int N)
Eside = MAC16_16(Eside, s, s);
}
} else {
for (i=0;i<N;i++)
{
celt_norm m, s;
m = X[i];
s = Y[i];
Emid = MAC16_16(Emid, m, m);
Eside = MAC16_16(Eside, s, s);
}
Emid += celt_inner_prod(X, X, N);
Eside += celt_inner_prod(Y, Y, N);
}
mid = celt_sqrt(Emid);
side = celt_sqrt(Eside);
......
......@@ -101,6 +101,32 @@ static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y
}
}
#define OVERRIDE_CELT_INNER_PROD
static OPUS_INLINE opus_val32 celt_inner_prod(const opus_val16 *x, const opus_val16 *y,
int N)
{
int i;
float xy;
__m128 sum;
sum = _mm_setzero_ps();
/* FIXME: We should probably go 8-way and use 2 sums. */
for (i=0;i<N-3;i+=4)
{
__m128 xi = _mm_loadu_ps(x+i);
__m128 yi = _mm_loadu_ps(y+i);
sum = _mm_add_ps(sum,_mm_mul_ps(xi, yi));
}
/* Horizontal sum */
sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
_mm_store_ss(&xy, sum);
for (;i<N;i++)
{
xy = MAC16_16(xy, x[i], y[i]);
}
return xy;
}
#define OVERRIDE_COMB_FILTER_CONST
static OPUS_INLINE void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
opus_val16 g10, opus_val16 g11, opus_val16 g12)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment