From ed317c94c310d50e48f55f0504c4d3e06b4f96da Mon Sep 17 00:00:00 2001 From: Jean-Marc Valin <Jean-Marc.Valin@csiro.au> Date: Tue, 15 Apr 2008 17:31:23 +1000 Subject: [PATCH] optimisation: another bunch of simplifications to the "simple case" of the alg_quant() search. --- libcelt/cwrs.c | 1 + libcelt/vq.c | 66 ++++++++++++++++++++++++++++---------------------- 2 files changed, 38 insertions(+), 29 deletions(-) diff --git a/libcelt/cwrs.c b/libcelt/cwrs.c index 9ed3d7336..c8a2ac65e 100644 --- a/libcelt/cwrs.c +++ b/libcelt/cwrs.c @@ -42,6 +42,7 @@ #include "config.h" #endif +#include "os_support.h" #include <stdlib.h> #include <string.h> #include "cwrs.h" diff --git a/libcelt/vq.c b/libcelt/vq.c index b071cdd74..b9a91bab4 100644 --- a/libcelt/vq.c +++ b/libcelt/vq.c @@ -137,61 +137,67 @@ void alg_quant(celt_norm_t *X, celt_mask_t *W, int N, int K, const celt_norm_t * while (pulsesLeft > 0) { int pulsesAtOnce=1; - int sign; - celt_word32_t Rxy, Ryy, Ryp; - celt_word32_t g; - celt_word32_t best_num; - celt_word16_t best_den; int best_id; - + celt_word16_t magnitude; +#ifdef FIXED_POINT + int rshift; +#endif /* Decide on how many pulses to find at once */ pulsesAtOnce = (pulsesLeft*N_1)>>9; /* pulsesLeft/N */ if (pulsesAtOnce<1) pulsesAtOnce = 1; +#ifdef FIXED_POINT + rshift = yshift+1+celt_ilog2(K-pulsesLeft+pulsesAtOnce); +#endif + magnitude = SHL16(pulsesAtOnce, yshift); - /* This should ensure that anything we can process will have a better score */ - best_num = -SHR32(VERY_LARGE32,4); - best_den = 0; best_id = 0; + /* The squared magnitude term gets added anyway, so we might as well + add it outside the loop */ + yy = ADD32(yy, MULT16_16(magnitude,magnitude)); /* Choose between fast and accurate strategy depending on where we are in the search */ if (pulsesLeft>1) { - /* OPT: This loop is very CPU-intensive */ + /* This should ensure that anything we can process will have a better score */ + celt_word32_t best_num = -VERY_LARGE16; + celt_word16_t best_den = 0; j=0; do { - celt_word32_t num; - celt_word16_t den; + celt_word16_t Rxy, Ryy; /* Select sign based on X[j] alone */ - sign = signx[j]; - s = SHL16(sign*pulsesAtOnce, yshift); + s = signx[j]*magnitude; /* Temporary sums of the new pulse(s) */ - Rxy = xy + MULT16_16(s,X[j]); - Ryy = yy + 2*MULT16_16(s,y[j]) + MULT16_16(s,s); + Rxy = SHR32(xy + MULT16_16(s,X[j]),rshift); + /* We're multiplying y[j] by two so we don't have to do it here */ + Ryy = SHR32(yy + MULT16_16(s,y[j]),rshift); - /* Approximate score: we maximise Rxy/sqrt(Ryy) */ - num = MULT16_16(ROUND16(Rxy,14),ABS16(ROUND16(Rxy,14))); - den = ROUND16(Ryy,14); + /* Approximate score: we maximise Rxy/sqrt(Ryy) (we're guaranteed that + Rxy is positive because the sign is pre-computed) */ + Rxy = MULT16_16_Q15(Rxy,Rxy); /* The idea is to check for num/den >= best_num/best_den, but that way we can do it without any division */ - /* OPT: Make sure to use a conditional move here */ - if (MULT16_32_Q15(best_den, num) > MULT16_32_Q15(den, best_num)) + /* OPT: Make sure to use conditional moves here */ + if (MULT16_16(best_den, Rxy) > MULT16_16(Ryy, best_num)) { - best_den = den; - best_num = num; + best_den = Ryy; + best_num = Rxy; best_id = j; } } while (++j<N); /* Promises we loop at least once */ } else { + celt_word32_t g; + celt_word32_t best_num = -VERY_LARGE32; for (j=0;j<N;j++) { + celt_word32_t Rxy, Ryy, Ryp; celt_word32_t num; /* Select sign based on X[j] alone */ - sign = signx[j]; - s = SHL16(sign*pulsesAtOnce, yshift); + s = signx[j]*magnitude; /* Temporary sums of the new pulse(s) */ Rxy = xy + MULT16_16(s,X[j]); - Ryy = yy + 2*MULT16_16(s,y[j]) + MULT16_16(s,s); - Ryp = yp + MULT16_16(s, P[j]); + /* We're multiplying y[j] by two so we don't have to do it here */ + Ryy = yy + MULT16_16(s,y[j]); + Ryp = yp + MULT16_16(s,P[j]); /* Compute the gain such that ||p + g*y|| = 1 */ g = MULT16_32_Q15( @@ -218,11 +224,13 @@ void alg_quant(celt_norm_t *X, celt_mask_t *W, int N, int K, const celt_norm_t * /* Updating the sums of the new pulse(s) */ xy = xy + MULT16_16(s,X[j]); - yy = yy + 2*MULT16_16(s,y[j]) + MULT16_16(s,s); + /* We're multiplying y[j] by two so we don't have to do it here */ + yy = yy + MULT16_16(s,y[j]); yp = yp + MULT16_16(s, P[j]); /* Only now that we've made the final choice, update y/iy */ - y[j] += s; + /* Multiplying y[j] by 2 so we don't have to do it everywhere else */ + y[j] += 2*s; iy[j] += is; pulsesLeft -= pulsesAtOnce; } -- GitLab