diff --git a/celt/bands.c b/celt/bands.c index f55a8fe00b60cc34977a6dee726caf0f6da669a6..93bd0bc7220a86ed86f174fe02e5c2698d380df2 100644 --- a/celt/bands.c +++ b/celt/bands.c @@ -41,6 +41,7 @@ #include "mathops.h" #include "rate.h" #include "quant_bands.h" +#include "pitch.h" int hysteresis_decision(opus_val16 val, const opus_val16 *thresholds, const opus_val16 *hysteresis, int N, int prev) { @@ -383,11 +384,7 @@ static void stereo_merge(celt_norm *X, celt_norm *Y, opus_val16 mid, int N) opus_val32 t, lgain, rgain; /* Compute the norm of X+Y and X-Y as |X|^2 + |Y|^2 +/- sum(xy) */ - for (j=0;j<N;j++) - { - xp = MAC16_16(xp, X[j], Y[j]); - side = MAC16_16(side, Y[j], Y[j]); - } + dual_inner_prod(Y, X, Y, N, &xp, &side); /* Compensating for the mid normalization */ xp = MULT16_32_Q15(mid, xp); /* mid and side are in Q15, not Q14 like X and Y */ diff --git a/celt/pitch.c b/celt/pitch.c index ac993602a2ff145bd819fbf80d34d0227f67edca..1b12b8398affb0bd6d4626ed0bb1b0e43b36689c 100644 --- a/celt/pitch.c +++ b/celt/pitch.c @@ -394,20 +394,6 @@ void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTR RESTORE_STACK; } -#ifndef OVERRIDE_DUAL_INNER_PROD -static opus_val32 dual_inner_prod(opus_val16 *x, opus_val16 *y01, opus_val16 *y02, int N) -{ - int i; - opus_val32 xy=0; - for (i=0;i<N;i++) - { - xy = MAC16_16(xy, x[i], y01[i]); - xy = MAC16_16(xy, x[i], y02[i]); - } - return xy; -} -#endif - static const int second_check[16] = {0, 0, 3, 2, 3, 2, 5, 2, 3, 2, 3, 2, 5, 2, 3, 2}; opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod, int N, int *T0_, int prev_period, opus_val16 prev_gain) @@ -415,7 +401,7 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod, int k, i, T, T0; opus_val16 g, g0; opus_val16 pg; - opus_val32 xy,xx,yy; + opus_val32 xy,xx,yy,xy2; opus_val32 xcorr[3]; opus_val32 best_xy, best_yy; int offset; @@ -435,12 +421,7 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod, T = T0 = *T0_; ALLOC(yy_lookup, maxperiod+1, opus_val32); - xy=xx=0; - for (i=0;i<N;i++) - { - xx = MAC16_16(xx, x[i], x[i]); - xy = MAC16_16(xy, x[i], x[i-T0]); - } + dual_inner_prod(x, x, x-T0, N, &xx, &xy); yy_lookup[0] = xx; yy=xx; for (i=1;i<=maxperiod;i++) @@ -484,7 +465,8 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod, { T1b = (2*second_check[k]*T0+k)/(2*k); } - xy = dual_inner_prod(x, &x[-T1], &x[-T1b], N); + dual_inner_prod(x, &x[-T1], &x[-T1b], N, &xy, &xy2); + xy += xy2; yy = yy_lookup[T1] + yy_lookup[T1b]; #ifdef FIXED_POINT { diff --git a/celt/pitch.h b/celt/pitch.h index a2824c34e2b55d5228568749b4bc3fbb40619966..086a46ba11a9a24212a25282c6e7e371064f810e 100644 --- a/celt/pitch.h +++ b/celt/pitch.h @@ -118,6 +118,23 @@ static inline void xcorr_kernel(const opus_val16 * x, const opus_val16 * y, opus } #endif /* OVERRIDE_XCORR_KERNEL */ +#ifndef OVERRIDE_DUAL_INNER_PROD +static inline void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, + int N, opus_val32 *xy1, opus_val32 *xy2) +{ + int i; + opus_val32 xy01=0; + opus_val32 xy02=0; + for (i=0;i<N;i++) + { + xy01 = MAC16_16(xy01, x[i], y01[i]); + xy02 = MAC16_16(xy02, x[i], y02[i]); + } + *xy1 = xy01; + *xy2 = xy02; +} +#endif + #ifdef FIXED_POINT opus_val32 #else diff --git a/celt/x86/pitch_sse.h b/celt/x86/pitch_sse.h index 4ab69342a3355b71c329240549c219c77644a242..3c0fddd0d3c1dfc5e7e0928224b0993a725d6361 100644 --- a/celt/x86/pitch_sse.h +++ b/celt/x86/pitch_sse.h @@ -72,11 +72,11 @@ static inline void xcorr_kernel(const opus_val16 *x, const opus_val16 *y, opus_v } #define OVERRIDE_DUAL_INNER_PROD -static inline opus_val32 dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, int N) +static inline void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, + int N, opus_val32 *xy1, opus_val32 *xy2) { int i; __m128 xsum1, xsum2; - opus_val32 xy=0; xsum1 = _mm_setzero_ps(); xsum2 = _mm_setzero_ps(); for (i=0;i<N-3;i+=4) @@ -87,17 +87,18 @@ static inline opus_val32 dual_inner_prod(const opus_val16 *x, const opus_val16 * xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i)); xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i)); } - xsum1 = _mm_add_ps(xsum1,xsum2); /* Horizontal sum */ xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1)); xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55)); - _mm_store_ss(&xy, xsum1); + _mm_store_ss(xy1, xsum1); + xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2)); + xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55)); + _mm_store_ss(xy2, xsum2); for (;i<N;i++) { - xy = MAC16_16(xy, x[i], y01[i]); - xy = MAC16_16(xy, x[i], y02[i]); + *xy1 = MAC16_16(*xy1, x[i], y01[i]); + *xy2 = MAC16_16(*xy2, x[i], y02[i]); } - return xy; } #define OVERRIDE_COMB_FILTER_CONST