diff --git a/celt/celt_lpc.c b/celt/celt_lpc.c index 3a6d6a7102656fea91c38e410b1dfff89ca68445..4f858b1853987407773c5a78f64e4943b4bf143e 100644 --- a/celt/celt_lpc.c +++ b/celt/celt_lpc.c @@ -124,49 +124,12 @@ void celt_fir(const opus_val16 *_x, celt_assert((ord&3)==0); for (i=0;i<N-3;i+=4) { - opus_val32 sum1=0; - opus_val32 sum2=0; - opus_val32 sum3=0; - opus_val32 sum4=0; - const opus_val16 *xx = x+i; - const opus_val16 *z = rnum; - opus_val16 y_0, y_1, y_2, y_3; - y_3=0; /* gcc doesn't realize that y_3 can't be used uninitialized */ - y_0=*xx++; - y_1=*xx++; - y_2=*xx++; - for (j=0;j<ord-3;j+=4) - { - opus_val16 tmp; - tmp = *z++; - y_3=*xx++; - sum1 = MAC16_16(sum1,tmp,y_0); - sum2 = MAC16_16(sum2,tmp,y_1); - sum3 = MAC16_16(sum3,tmp,y_2); - sum4 = MAC16_16(sum4,tmp,y_3); - tmp=*z++; - y_0=*xx++; - sum1 = MAC16_16(sum1,tmp,y_1); - sum2 = MAC16_16(sum2,tmp,y_2); - sum3 = MAC16_16(sum3,tmp,y_3); - sum4 = MAC16_16(sum4,tmp,y_0); - tmp=*z++; - y_1=*xx++; - sum1 = MAC16_16(sum1,tmp,y_2); - sum2 = MAC16_16(sum2,tmp,y_3); - sum3 = MAC16_16(sum3,tmp,y_0); - sum4 = MAC16_16(sum4,tmp,y_1); - tmp=*z++; - y_2=*xx++; - sum1 = MAC16_16(sum1,tmp,y_3); - sum2 = MAC16_16(sum2,tmp,y_0); - sum3 = MAC16_16(sum3,tmp,y_1); - sum4 = MAC16_16(sum4,tmp,y_2); - } - _y[i ] = ADD16(_x[i ], ROUND16(sum1, SIG_SHIFT)); - _y[i+1] = ADD16(_x[i+1], ROUND16(sum2, SIG_SHIFT)); - _y[i+2] = ADD16(_x[i+2], ROUND16(sum3, SIG_SHIFT)); - _y[i+3] = ADD16(_x[i+3], ROUND16(sum4, SIG_SHIFT)); + opus_val32 sum[4]={0,0,0,0}; + xcorr_kernel(rnum, x+i, sum, ord); + _y[i ] = ADD16(_x[i ], ROUND16(sum[0], SIG_SHIFT)); + _y[i+1] = ADD16(_x[i+1], ROUND16(sum[1], SIG_SHIFT)); + _y[i+2] = ADD16(_x[i+2], ROUND16(sum[2], SIG_SHIFT)); + _y[i+3] = ADD16(_x[i+3], ROUND16(sum[3], SIG_SHIFT)); } for (;i<N;i++) { @@ -219,64 +182,26 @@ void celt_iir(const opus_val32 *_x, y[i]=0; for (i=0;i<N-3;i+=4) { - opus_val32 sum1=0; - opus_val32 sum2=0; - opus_val32 sum3=0; - opus_val32 sum4=0; - const opus_val16 *yy = y+i; - const 
opus_val16 *z = rden; - opus_val16 y_0, y_1, y_2, y_3; - sum1 = _x[i ]; - sum2 = _x[i+1]; - sum3 = _x[i+2]; - sum4 = _x[i+3]; - y_3=0; /* gcc doesn't realize that y_3 can't be used uninitialized */ - y_0=*yy++; - y_1=*yy++; - y_2=*yy++; - for (j=0;j<ord-3;j+=4) - { - opus_val16 tmp; - tmp = *z++; - y_3=*yy++; - sum1 = MAC16_16(sum1,tmp,y_0); - sum2 = MAC16_16(sum2,tmp,y_1); - sum3 = MAC16_16(sum3,tmp,y_2); - sum4 = MAC16_16(sum4,tmp,y_3); - tmp=*z++; - y_0=*yy++; - sum1 = MAC16_16(sum1,tmp,y_1); - sum2 = MAC16_16(sum2,tmp,y_2); - sum3 = MAC16_16(sum3,tmp,y_3); - sum4 = MAC16_16(sum4,tmp,y_0); - tmp=*z++; - y_1=*yy++; - sum1 = MAC16_16(sum1,tmp,y_2); - sum2 = MAC16_16(sum2,tmp,y_3); - sum3 = MAC16_16(sum3,tmp,y_0); - sum4 = MAC16_16(sum4,tmp,y_1); - tmp=*z++; - y_2=*yy++; - sum1 = MAC16_16(sum1,tmp,y_3); - sum2 = MAC16_16(sum2,tmp,y_0); - sum3 = MAC16_16(sum3,tmp,y_1); - sum4 = MAC16_16(sum4,tmp,y_2); - } - y[i+ord ] = -ROUND16(sum1,SIG_SHIFT); - _y[i ] = sum1; - sum2 = MAC16_16(sum2, y[i+ord ], den[0]); - y[i+ord+1] = -ROUND16(sum2,SIG_SHIFT); - _y[i+1] = sum2; - sum3 = MAC16_16(sum3, y[i+ord+1], den[0]); - sum3 = MAC16_16(sum3, y[i+ord ], den[1]); - y[i+ord+2] = -ROUND16(sum3,SIG_SHIFT); - _y[i+2] = sum3; + /* Unroll by 4 as if it were an FIR filter */ + opus_val32 sum[4]={_x[i],_x[i+1],_x[i+2],_x[i+3]}; + xcorr_kernel(rden, y+i, sum, ord); + + /* Patch up the result to compensate for the fact that this is an IIR */ + y[i+ord ] = -ROUND16(sum[0],SIG_SHIFT); + _y[i ] = sum[0]; + sum[1] = MAC16_16(sum[1], y[i+ord ], den[0]); + y[i+ord+1] = -ROUND16(sum[1],SIG_SHIFT); + _y[i+1] = sum[1]; + sum[2] = MAC16_16(sum[2], y[i+ord+1], den[0]); + sum[2] = MAC16_16(sum[2], y[i+ord ], den[1]); + y[i+ord+2] = -ROUND16(sum[2],SIG_SHIFT); + _y[i+2] = sum[2]; - sum4 = MAC16_16(sum4, y[i+ord+2], den[0]); - sum4 = MAC16_16(sum4, y[i+ord+1], den[1]); - sum4 = MAC16_16(sum4, y[i+ord ], den[2]); - y[i+ord+3] = -ROUND16(sum4,SIG_SHIFT); - _y[i+3] = sum4; + sum[3] = MAC16_16(sum[3], 
y[i+ord+2], den[0]); + sum[3] = MAC16_16(sum[3], y[i+ord+1], den[1]); + sum[3] = MAC16_16(sum[3], y[i+ord ], den[2]); + y[i+ord+3] = -ROUND16(sum[3],SIG_SHIFT); + _y[i+3] = sum[3]; } for (;i<N;i++) { diff --git a/celt/pitch.c b/celt/pitch.c index cffee19cbe7d717b506eeaac175cca09be243399..f8115c2fe7272cf43e8a198db9a058d0c0e0d458 100644 --- a/celt/pitch.c +++ b/celt/pitch.c @@ -258,83 +258,17 @@ pitch_xcorr(opus_val16 *_x, opus_val16 *_y, opus_val32 *xcorr, int len, int max_ #endif for (i=0;i<max_pitch-3;i+=4) { - /* Compute correlation*/ - /*corr[nb_pitch-1-i]=inner_prod(x, _y+i, len);*/ - opus_val32 sum1=0; - opus_val32 sum2=0; - opus_val32 sum3=0; - opus_val32 sum4=0; - const opus_val16 *y = _y+i; - const opus_val16 *x = _x; - opus_val16 y_0, y_1, y_2, y_3; - y_3=0; /* gcc doesn't realize that y_3 can't be used uninitialized */ - y_0=*y++; - y_1=*y++; - y_2=*y++; - for (j=0;j<len-3;j+=4) - { - opus_val16 tmp; - tmp = *x++; - y_3=*y++; - sum1 = MAC16_16(sum1,tmp,y_0); - sum2 = MAC16_16(sum2,tmp,y_1); - sum3 = MAC16_16(sum3,tmp,y_2); - sum4 = MAC16_16(sum4,tmp,y_3); - tmp=*x++; - y_0=*y++; - sum1 = MAC16_16(sum1,tmp,y_1); - sum2 = MAC16_16(sum2,tmp,y_2); - sum3 = MAC16_16(sum3,tmp,y_3); - sum4 = MAC16_16(sum4,tmp,y_0); - tmp=*x++; - y_1=*y++; - sum1 = MAC16_16(sum1,tmp,y_2); - sum2 = MAC16_16(sum2,tmp,y_3); - sum3 = MAC16_16(sum3,tmp,y_0); - sum4 = MAC16_16(sum4,tmp,y_1); - tmp=*x++; - y_2=*y++; - sum1 = MAC16_16(sum1,tmp,y_3); - sum2 = MAC16_16(sum2,tmp,y_0); - sum3 = MAC16_16(sum3,tmp,y_1); - sum4 = MAC16_16(sum4,tmp,y_2); - } - if (j++<len) - { - opus_val16 tmp = *x++; - y_3=*y++; - sum1 = MAC16_16(sum1,tmp,y_0); - sum2 = MAC16_16(sum2,tmp,y_1); - sum3 = MAC16_16(sum3,tmp,y_2); - sum4 = MAC16_16(sum4,tmp,y_3); - } - if (j++<len) - { - opus_val16 tmp=*x++; - y_0=*y++; - sum1 = MAC16_16(sum1,tmp,y_1); - sum2 = MAC16_16(sum2,tmp,y_2); - sum3 = MAC16_16(sum3,tmp,y_3); - sum4 = MAC16_16(sum4,tmp,y_0); - } - if (j<len) - { - opus_val16 tmp=*x++; - y_1=*y++; - sum1 = 
MAC16_16(sum1,tmp,y_2); - sum2 = MAC16_16(sum2,tmp,y_3); - sum3 = MAC16_16(sum3,tmp,y_0); - sum4 = MAC16_16(sum4,tmp,y_1); - } - xcorr[i]=sum1; - xcorr[i+1]=sum2; - xcorr[i+2]=sum3; - xcorr[i+3]=sum4; + opus_val32 sum[4]={0,0,0,0}; + xcorr_kernel(_x, _y+i, sum, len); + xcorr[i]=sum[0]; + xcorr[i+1]=sum[1]; + xcorr[i+2]=sum[2]; + xcorr[i+3]=sum[3]; #ifdef FIXED_POINT - sum1 = MAX32(sum1, sum2); - sum3 = MAX32(sum3, sum4); - sum1 = MAX32(sum1, sum3); - maxcorr = MAX32(maxcorr, sum1); + sum[0] = MAX32(sum[0], sum[1]); + sum[2] = MAX32(sum[2], sum[3]); + sum[0] = MAX32(sum[0], sum[2]); + maxcorr = MAX32(maxcorr, sum[0]); #endif } /* In case max_pitch isn't a multiple of 4, do non-unrolled version. */ diff --git a/celt/pitch.h b/celt/pitch.h index 1d567b0751ab42893220f8f1c09c37a3f6c6e652..efc1175c4791fbf26f72f8d01436fac52f213085 100644 --- a/celt/pitch.h +++ b/celt/pitch.h @@ -45,6 +45,73 @@ void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTR opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod, int N, int *T0, int prev_period, opus_val16 prev_gain); +/* OPT: This is the kernel you really want to optimize. It gets used a lot + by the prefilter and by the PLC. 
*/
+static inline void xcorr_kernel(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len) /* Adds x[j]*y[j+k] into sum[k] for k=0..3 and j=0..len-1 (assuming MAC16_16(c,a,b) evaluates c+a*b; it is defined outside this view -- TODO confirm). */
+{ /* sum[] is only accumulated into, never reset here, so the caller chooses the starting values. */
+   int j; /* Counts the x samples handled so far; the tail tests below keep advancing it. */
+   opus_val16 y_0, y_1, y_2, y_3; /* Window of four consecutive y samples; the roles rotate instead of the values being copied. */
+   y_3=0; /* gcc doesn't realize that y_3 can't be used uninitialized */
+   y_0=*y++; /* Three y samples are preloaded unconditionally; with one more fetched per x sample, y is read len+3 times for len>=0. */
+   y_1=*y++;
+   y_2=*y++;
+   for (j=0;j<len-3;j+=4) /* Unrolled by four: each step reads one x, loads one new y over the oldest slot, and updates all four sums. */
+   {
+      opus_val16 tmp; /* Current x sample, shared by the four accumulators. */
+      tmp = *x++;
+      y_3=*y++; /* Window, oldest to newest: y_0, y_1, y_2, y_3. */
+      sum[0] = MAC16_16(sum[0],tmp,y_0);
+      sum[1] = MAC16_16(sum[1],tmp,y_1);
+      sum[2] = MAC16_16(sum[2],tmp,y_2);
+      sum[3] = MAC16_16(sum[3],tmp,y_3);
+      tmp=*x++;
+      y_0=*y++; /* Window, oldest to newest: y_1, y_2, y_3, y_0. */
+      sum[0] = MAC16_16(sum[0],tmp,y_1);
+      sum[1] = MAC16_16(sum[1],tmp,y_2);
+      sum[2] = MAC16_16(sum[2],tmp,y_3);
+      sum[3] = MAC16_16(sum[3],tmp,y_0);
+      tmp=*x++;
+      y_1=*y++; /* Window, oldest to newest: y_2, y_3, y_0, y_1. */
+      sum[0] = MAC16_16(sum[0],tmp,y_2);
+      sum[1] = MAC16_16(sum[1],tmp,y_3);
+      sum[2] = MAC16_16(sum[2],tmp,y_0);
+      sum[3] = MAC16_16(sum[3],tmp,y_1);
+      tmp=*x++;
+      y_2=*y++; /* Window, oldest to newest: y_3, y_0, y_1, y_2; the next pass overwrites y_3, restoring the starting arrangement. */
+      sum[0] = MAC16_16(sum[0],tmp,y_3);
+      sum[1] = MAC16_16(sum[1],tmp,y_0);
+      sum[2] = MAC16_16(sum[2],tmp,y_1);
+      sum[3] = MAC16_16(sum[3],tmp,y_2);
+   }
+   if (j++<len) /* First of up to three leftover samples when len is not a multiple of 4; j is bumped whether or not the branch runs. */
+   {
+      opus_val16 tmp = *x++;
+      y_3=*y++; /* Same update as the first step of the unrolled loop. */
+      sum[0] = MAC16_16(sum[0],tmp,y_0);
+      sum[1] = MAC16_16(sum[1],tmp,y_1);
+      sum[2] = MAC16_16(sum[2],tmp,y_2);
+      sum[3] = MAC16_16(sum[3],tmp,y_3);
+   }
+   if (j++<len) /* Second leftover sample. */
+   {
+      opus_val16 tmp=*x++;
+      y_0=*y++; /* Same update as the second step of the unrolled loop. */
+      sum[0] = MAC16_16(sum[0],tmp,y_1);
+      sum[1] = MAC16_16(sum[1],tmp,y_2);
+      sum[2] = MAC16_16(sum[2],tmp,y_3);
+      sum[3] = MAC16_16(sum[3],tmp,y_0);
+   }
+   if (j<len) /* Third and last leftover sample; no increment is needed because j is not read again. */
+   {
+      opus_val16 tmp=*x++;
+      y_1=*y++; /* Same update as the third step of the unrolled loop. */
+      sum[0] = MAC16_16(sum[0],tmp,y_2);
+      sum[1] = MAC16_16(sum[1],tmp,y_3);
+      sum[2] = MAC16_16(sum[2],tmp,y_0);
+      sum[3] = MAC16_16(sum[3],tmp,y_1);
+   }
+}
+
 #ifdef FIXED_POINT
 opus_val32
 #else