Factor the paired inner product used by remove_doubling() into a helper,
dual_inner_prod(), and provide an SSE implementation of it.

celt/pitch.c:
  * Adds a static scalar dual_inner_prod(x, y1, y2, N). For i in [0, N) it
    accumulates x[i]*y1[i] and then x[i]*y2[i] into one opus_val32 through
    MAC16_16, and returns that accumulator. The definition is compiled only
    when OVERRIDE_DUAL_INNER_PROD is not defined.
  * remove_doubling() now calls dual_inner_prod(x, &x[-T1], &x[-T1b], N)
    in place of the open-coded loop over x[i-T1] and x[i-T1b].

celt/x86/pitch_sse.h:
  * Defines OVERRIDE_DUAL_INNER_PROD and supplies a static inline SSE
    dual_inner_prod(). The main loop consumes four elements per iteration
    into two __m128 accumulators (one per y vector); the accumulators are
    added, reduced horizontally to one float, and a scalar MAC16_16 loop
    handles the elements left over when N is not a multiple of 4.

NOTE(review): the scalar version takes non-const pointers while the SSE
  version takes const pointers; the signatures could be aligned.
NOTE(review): the SSE version reads opus_val16 data with _mm_loadu_ps, so it
  presumably is only meant for builds where opus_val16 is float -- confirm
  this header is never used with FIXED_POINT.
NOTE(review): the SSE version adds the products in a different order than
  the scalar loop, so results may differ in the low-order bits -- confirm
  that is acceptable to callers.

diff --git a/celt/pitch.c b/celt/pitch.c
index f8115c2fe7272cf43e8a198db9a058d0c0e0d458..025f448de411f9dc610a6e6b1eeed900623e7bd3 100644
--- a/celt/pitch.c
+++ b/celt/pitch.c
@@ -394,6 +394,20 @@ void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTR
    RESTORE_STACK;
 }
 
+#ifndef OVERRIDE_DUAL_INNER_PROD
+static opus_val32 dual_inner_prod(opus_val16 *x, opus_val16 *y1, opus_val16 *y2, int N)
+{
+   int i;
+   opus_val32 xy=0;
+   for (i=0;i<N;i++)
+   {
+      xy = MAC16_16(xy, x[i], y1[i]);
+      xy = MAC16_16(xy, x[i], y2[i]);
+   }
+   return xy;
+}
+#endif
+
 static const int second_check[16] = {0, 0, 3, 2, 3, 2, 5, 2, 3, 2, 3, 2, 5, 2, 3, 2};
 opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
       int N, int *T0_, int prev_period, opus_val16 prev_gain)
@@ -470,12 +484,7 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
       {
          T1b = (2*second_check[k]*T0+k)/(2*k);
       }
-      xy=0;
-      for (i=0;i<N;i++)
-      {
-         xy = MAC16_16(xy, x[i], x[i-T1]);
-         xy = MAC16_16(xy, x[i], x[i-T1b]);
-      }
+      xy = dual_inner_prod(x, &x[-T1], &x[-T1b], N);
       yy = yy_lookup[T1] + yy_lookup[T1b];
 #ifdef FIXED_POINT
       {
diff --git a/celt/x86/pitch_sse.h b/celt/x86/pitch_sse.h
index a55a69b73a00084e41fa7fcb1782fca15aee413f..4512665c0b040b25d7935cefc244c1af6218d745 100644
--- a/celt/x86/pitch_sse.h
+++ b/celt/x86/pitch_sse.h
@@ -71,4 +71,34 @@ static inline void xcorr_kernel(const opus_val16 *x, const opus_val16 *y, opus_v
    _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2));
 }
 
+#define OVERRIDE_DUAL_INNER_PROD
+static inline opus_val32 dual_inner_prod(const opus_val16 *x, const opus_val16 *y1, const opus_val16 *y2, int N)
+{
+   int i;
+   __m128 xsum1, xsum2;
+   opus_val32 xy=0;
+   xsum1 = _mm_setzero_ps();
+   xsum2 = _mm_setzero_ps();
+   for (i=0;i<N-3;i+=4)
+   {
+      __m128 xi = _mm_loadu_ps(x+i);
+      __m128 y1i = _mm_loadu_ps(y1+i);
+      __m128 y2i = _mm_loadu_ps(y2+i);
+      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i));
+      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i));
+   }
+   xsum1 = _mm_add_ps(xsum1,xsum2);
+   /* Horizontal sum */
+   xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1));
+   xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55));
+   _mm_store_ss(&xy, xsum1);
+   for (;i<N;i++)
+   {
+      xy = MAC16_16(xy, x[i], y1[i]);
+      xy = MAC16_16(xy, x[i], y2[i]);
+   }
+   return xy;
+}
+
+
 #endif