diff --git a/celt/celt.c b/celt/celt.c
index dfa5c68b22a19531629b2c97ed94ed6f1c524783..3e0ce6e6a550768ea603e05b6ac740b029fd55c6 100644
--- a/celt/celt.c
+++ b/celt/celt.c
@@ -85,6 +85,31 @@ int resampling_factor(opus_int32 rate)
    return ret;
 }
 
+#ifndef OVERRIDE_COMB_FILTER_CONST
+static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
+      opus_val16 g10, opus_val16 g11, opus_val16 g12)
+{
+   opus_val32 x0, x1, x2, x3, x4;
+   int i;
+   x4 = x[-T-2];
+   x3 = x[-T-1];
+   x2 = x[-T];
+   x1 = x[-T+1];
+   for (i=0;i<N;i++)
+   {
+      x0=x[i-T+2];
+      y[i] = x[i]
+               + MULT16_32_Q15(g10,x2)
+               + MULT16_32_Q15(g11,ADD32(x1,x3))
+               + MULT16_32_Q15(g12,ADD32(x0,x4));
+      x4=x3;
+      x3=x2;
+      x2=x1;
+      x1=x0;
+   }
+
+}
+#endif
 
 void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
       opus_val16 g0, opus_val16 g1, int tapset0, int tapset1,
@@ -141,19 +166,9 @@ void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
       OPUS_MOVE(y+overlap, x+overlap, N-overlap);
       return;
    }
-   /* OPT: For machines where the movs are costly, unroll by 5 */
-   for (;i<N;i++)
-   {
-      x0=x[i-T1+2];
-      y[i] = x[i]
-               + MULT16_32_Q15(g10,x2)
-               + MULT16_32_Q15(g11,ADD32(x1,x3))
-               + MULT16_32_Q15(g12,ADD32(x0,x4));
-      x4=x3;
-      x3=x2;
-      x2=x1;
-      x1=x0;
-   }
+
+   /* Compute the part with the constant filter. */
+   comb_filter_const(y+i, x+i, T1, N-i, g10, g11, g12);
 }
 
 const signed char tf_select_table[4][8] = {
diff --git a/celt/x86/pitch_sse.h b/celt/x86/pitch_sse.h
index d24139806510fc62d6c3b0d0352a990e9229f545..64c96af2c23d9a8e4ac532a9d8beb0b006b67c4d 100644
--- a/celt/x86/pitch_sse.h
+++ b/celt/x86/pitch_sse.h
@@ -100,5 +100,54 @@ static inline opus_val32 dual_inner_prod(const opus_val16 *x, const opus_val16 *
    return xy;
 }
 
+#define OVERRIDE_COMB_FILTER_CONST
+static inline void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
+      opus_val16 g10, opus_val16 g11, opus_val16 g12)
+{
+   int i;
+   __m128 x0v;
+   __m128 g10v, g11v, g12v;
+   g10v = _mm_load1_ps(&g10);
+   g11v = _mm_load1_ps(&g11);
+   g12v = _mm_load1_ps(&g12);
+   x0v = _mm_loadu_ps(&x[-T-2]);
+   for (i=0;i<N-3;i+=4)
+   {
+      __m128 yi, yi2, x1v, x2v, x3v, x4v;
+      const opus_val32 *xp = &x[i-T-2];
+      yi = _mm_loadu_ps(x+i);
+      x4v = _mm_loadu_ps(xp+4);
+#if 0
+      /* Slower version with all loads */
+      x1v = _mm_loadu_ps(xp+1);
+      x2v = _mm_loadu_ps(xp+2);
+      x3v = _mm_loadu_ps(xp+3);
+#else
+      x2v = _mm_shuffle_ps(x0v, x4v, 0x4e);
+      x1v = _mm_shuffle_ps(x0v, x2v, 0x99);
+      x3v = _mm_shuffle_ps(x2v, x4v, 0x99);
+#endif
+
+      yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v));
+#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */
+      yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)));
+      yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
+#else
+      /* Use partial sums */
+      yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)),
+                       _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
+      yi = _mm_add_ps(yi, yi2);
+#endif
+      x0v=x4v;
+      _mm_storeu_ps(y+i, yi);
+   }
+   for (;i<N;i++)
+   {
+      y[i] = x[i]
+               + MULT16_32_Q15(g10,x[i-T])
+               + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1]))
+               + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2]));
+   }
+}
 
 #endif
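
The detail worth calling out in the SSE path above: each iteration issues only two unaligned loads of the delayed signal region, the fresh x4v plus the x0v carried over from the previous iteration, and the three shifted four-sample windows between them are synthesized with _mm_shuffle_ps. That is why the "#if 0" branch with three extra unaligned loads is marked slower. Below is a minimal standalone sketch, separate from the patch, that checks the two shuffle immediates (0x4e and 0x99) against direct loads; the test array and main() harness are hypothetical, and only the SSE1 intrinsics from <xmmintrin.h> are assumed.

/* Illustrative sketch, not part of the patch: verifies that the two
 * _mm_shuffle_ps immediates rebuild the three overlapping windows that
 * the "#if 0" branch fetches with extra unaligned loads.
 * Build with e.g. gcc -msse shuffle_test.c */
#include <stdio.h>
#include <xmmintrin.h>

int main(void)
{
   float xp[8] = {0, 1, 2, 3, 4, 5, 6, 7};
   float out[3][4];
   __m128 x0v = _mm_loadu_ps(xp);      /* xp[0..3] */
   __m128 x4v = _mm_loadu_ps(xp + 4);  /* xp[4..7] */
   /* 0x4e picks lanes {2,3} of the first operand and {0,1} of the
      second, giving xp[2..5]. */
   __m128 x2v = _mm_shuffle_ps(x0v, x4v, 0x4e);
   /* 0x99 picks lanes {1,2} of each operand: xp[1..4] and xp[3..6]. */
   __m128 x1v = _mm_shuffle_ps(x0v, x2v, 0x99);
   __m128 x3v = _mm_shuffle_ps(x2v, x4v, 0x99);
   _mm_storeu_ps(out[0], x1v);
   _mm_storeu_ps(out[1], x2v);
   _mm_storeu_ps(out[2], x3v);
   printf("x1v = %g %g %g %g  (expect 1 2 3 4)\n",
          out[0][0], out[0][1], out[0][2], out[0][3]);
   printf("x2v = %g %g %g %g  (expect 2 3 4 5)\n",
          out[1][0], out[1][1], out[1][2], out[1][3]);
   printf("x3v = %g %g %g %g  (expect 3 4 5 6)\n",
          out[2][0], out[2][1], out[2][2], out[2][3]);
   return 0;
}

The partial-sums branch makes a similar trade: reassociating the three floating-point additions shortens the dependency chain but can change the final rounding, which is why the patch keeps the "#if 0" toggle for matching the scalar version bit-exactly.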