From 35a1f88e8162138965ebe11a71ee3ac9d4e02c6d Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <Jean-Marc.Valin@csiro.au>
Date: Wed, 26 Mar 2008 10:34:23 +1100
Subject: [PATCH] optimisation: The "simple" Rxy/sqrt(Ryy) case in alg_quant no
 longer requires a division

---
 libcelt/vq.c | 44 +++++++++++++++++++++++++++++++++-----------
 libcelt/vq.h |  4 +---
 2 files changed, 34 insertions(+), 14 deletions(-)

diff --git a/libcelt/vq.c b/libcelt/vq.c
index 3a6494ef9..cb16c6913 100644
--- a/libcelt/vq.c
+++ b/libcelt/vq.c
@@ -1,4 +1,4 @@
-/* (C) 2007 Jean-Marc Valin, CSIRO
+/* (C) 2007-2008 Jean-Marc Valin, CSIRO
 */
 /*
    Redistribution and use in source and binary forms, with or without
@@ -39,9 +39,8 @@
 #include "arch.h"
 #include "os_support.h"
 
-/** Takes the pitch vector and the decoded residual vector (non-compressed), 
-   applies the compression in the pitch direction, computes the gain that will
-   give ||p+g*y||=1 and mixes the residual with the pitch. */
+/** Takes the pitch vector and the decoded residual vector, computes the gain
+    that will give ||p+g*y||=1 and mixes the residual with the pitch. */
 static void mix_pitch_and_residual(int * restrict iy, celt_norm_t * restrict X, int N, int K, const celt_norm_t * restrict P)
 {
    int i;
@@ -99,7 +98,6 @@ void alg_quant(celt_norm_t *X, celt_mask_t *W, int N, int K, const celt_norm_t *
    VARDECL(celt_norm_t, y);
    VARDECL(int, iy);
    VARDECL(int, signx);
-   VARDECL(celt_word32_t, scores);
    int i, j, is;
    celt_word16_t s;
    int pulsesLeft;
@@ -118,7 +116,6 @@ void alg_quant(celt_norm_t *X, celt_mask_t *W, int N, int K, const celt_norm_t *
    ALLOC(y, N, celt_norm_t);
    ALLOC(iy, N, int);
    ALLOC(signx, N, int);
-   ALLOC(scores, N, celt_word32_t);
 
    for (j=0;j<N;j++)
    {
@@ -150,29 +147,49 @@ void alg_quant(celt_norm_t *X, celt_mask_t *W, int N, int K, const celt_norm_t *
       int sign;
       celt_word32_t Rxy, Ryy, Ryp;
       celt_word32_t g;
+      celt_word32_t best_num;
+      celt_word16_t best_den;
+      int best_id;
       
       /* Decide on how many pulses to find at once */
       pulsesAtOnce = pulsesLeft/N;
       if (pulsesAtOnce<1)
          pulsesAtOnce = 1;
 
+      /* Start from a very low best score so that any candidate we evaluate should beat it */
+      best_num = -SHR32(VERY_LARGE32,4);
+      best_den = 0;
+      best_id = 0;
       /* Choose between fast and accurate strategy depending on where we are in the search */
       if (pulsesLeft>1)
       {
          for (j=0;j<N;j++)
          {
+            celt_word32_t num;
+            celt_word16_t den;
             /* Select sign based on X[j] alone */
             sign = signx[j];
             s = SHL16(sign*pulsesAtOnce, yshift);
             /* Temporary sums of the new pulse(s) */
             Rxy = xy + MULT16_16(s,X[j]);
             Ryy = yy + 2*MULT16_16(s,y[j]) + MULT16_16(s,s);
-            /* This score is approximate, but good enough for the first pulses */
-            scores[j] = MULT32_32_Q31(MULT16_16(ROUND16(Rxy,14),ABS16(ROUND16(Rxy,14))), celt_rcp(SHR32(Ryy,12)));
+            
+            /* Approximate score: we maximise Rxy/sqrt(Ryy) */
+            num = MULT16_16(ROUND16(Rxy,14),ABS16(ROUND16(Rxy,14)));
+            den = ROUND16(Ryy,14);
+            /* We want to test num/den >= best_num/best_den; cross-multiplying
+               the two fractions lets us compare them without any division */
+            if (MULT16_32_Q15(best_den, num) >= MULT16_32_Q15(den, best_num))
+            {
+               best_den = den;
+               best_num = num;
+               best_id = j;
+            }
          }
       } else {
          for (j=0;j<N;j++)
          {
+            celt_word32_t num;
             /* Select sign based on X[j] alone */
             sign = signx[j];
             s = SHL16(sign*pulsesAtOnce, yshift);
@@ -190,12 +207,17 @@ void alg_quant(celt_norm_t *X, celt_mask_t *W, int N, int K, const celt_norm_t *
             /* Knowing that gain, what's the error: (x-g*y)^2 
                (result is negated and we discard x^2 because it's constant) */
             /* score = 2.f*g*Rxy - 1.f*g*g*Ryy*NORM_SCALING_1;*/
-            scores[j] = 2*MULT16_32_Q14(ROUND16(Rxy,14),g)
-                    - MULT16_32_Q14(EXTRACT16(MULT16_32_Q14(ROUND16(Ryy,14),g)),g);
+            num = 2*MULT16_32_Q14(ROUND16(Rxy,14),g)
+                  - MULT16_32_Q14(EXTRACT16(MULT16_32_Q14(ROUND16(Ryy,14),g)),g);
+            if (num >= best_num)
+            {
+               best_num = num;
+               best_id = j;
+            } 
          }
       }
       
-      j = find_max32(scores, N);
+      j = best_id;
       is = signx[j]*pulsesAtOnce;
       s = SHL16(is, yshift);
 
diff --git a/libcelt/vq.h b/libcelt/vq.h
index f5c507df6..8e508863a 100644
--- a/libcelt/vq.h
+++ b/libcelt/vq.h
@@ -1,4 +1,4 @@
-/* (C) 2007 Jean-Marc Valin, CSIRO
+/* (C) 2007-2008 Jean-Marc Valin, CSIRO
 */
 /**
    @file vq.h
@@ -48,7 +48,6 @@
  * @param N Number of samples to encode
  * @param K Number of pulses to use
  * @param p Pitch vector (it is assumed that p+x is a unit vector)
- * @param alpha compression factor to apply in the pitch direction (magic!)
  * @param enc Entropy encoder state
 */
 void alg_quant(celt_norm_t *X, celt_mask_t *W, int N, int K, const celt_norm_t *P, ec_enc *enc);
@@ -58,7 +57,6 @@ void alg_quant(celt_norm_t *X, celt_mask_t *W, int N, int K, const celt_norm_t *
  * @param N Number of samples to decode
  * @param K Number of pulses to use
  * @param p Pitch vector (automatically added to x)
- * @param alpha compression factor in the pitch direction (magic!)
  * @param dec Entropy decoder state
  */
 void alg_unquant(celt_norm_t *X, int N, int K, celt_norm_t *P, ec_dec *dec);
-- 
GitLab