From 783ad76766e1f6b6aaca5d6eb415ac8a8269e1f2 Mon Sep 17 00:00:00 2001
From: Linfeng Zhang <linfengz@google.com>
Date: Thu, 16 Jun 2016 16:21:02 -0700
Subject: [PATCH] Revise celt_fir_c() to not pass in argument "mem"

The "mem" in celt_fir_c() either is contained in the head of input "x"
in reverse order already, or can be easily attached to the head of "x"
before calling the function. Removing argument "mem" can eliminate the
redundant buffer copies inside.
Update celt_fir_sse4_1() accordingly.
---
 celt/celt_decoder.c        | 13 +++++-----
 celt/celt_lpc.c            | 33 +++++++++---------------
 celt/celt_lpc.h            |  5 ++--
 celt/x86/celt_lpc_sse.c    | 51 ++++++++------------------------------
 celt/x86/celt_lpc_sse.h    | 10 +++-----
 celt/x86/x86_celt_map.c    |  1 -
 silk/LPC_analysis_filter.c |  6 +----
 7 files changed, 36 insertions(+), 83 deletions(-)

diff --git a/celt/celt_decoder.c b/celt/celt_decoder.c
index 58e7b75a9..567d74564 100644
--- a/celt/celt_decoder.c
+++ b/celt/celt_decoder.c
@@ -556,10 +556,11 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
    } else {
       /* Pitch-based PLC */
       const opus_val16 *window;
+      opus_val16 *exc;
       opus_val16 fade = Q15ONE;
       int pitch_index;
       VARDECL(opus_val32, etmp);
-      VARDECL(opus_val16, exc);
+      VARDECL(opus_val16, _exc);
 
       if (loss_count == 0)
       {
@@ -570,7 +571,8 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
       }
 
       ALLOC(etmp, overlap, opus_val32);
-      ALLOC(exc, MAX_PERIOD, opus_val16);
+      ALLOC(_exc, MAX_PERIOD+LPC_ORDER, opus_val16);
+      exc = _exc+LPC_ORDER;
       window = mode->window;
       c=0; do {
          opus_val16 decay;
@@ -635,15 +637,14 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
          /* Initialize the LPC history with the samples just before the start
             of the region for which we're computing the excitation. */
          {
-            opus_val16 lpc_mem[LPC_ORDER];
             for (i=0;i<LPC_ORDER;i++)
             {
-               lpc_mem[i] =
-                     ROUND16(buf[DECODE_BUFFER_SIZE-exc_length-1-i], SIG_SHIFT);
+               exc[MAX_PERIOD-exc_length-LPC_ORDER+i] =
+                     ROUND16(buf[DECODE_BUFFER_SIZE-exc_length-LPC_ORDER+i], SIG_SHIFT);
             }
             /* Compute the excitation for exc_length samples before the loss. */
             celt_fir(exc+MAX_PERIOD-exc_length, lpc+c*LPC_ORDER,
-                  exc+MAX_PERIOD-exc_length, exc_length, LPC_ORDER, lpc_mem, st->arch);
+                  exc+MAX_PERIOD-exc_length, exc_length, LPC_ORDER, st->arch);
          }
 
          /* Check if the waveform is decaying, and if so how fast.
diff --git a/celt/celt_lpc.c b/celt/celt_lpc.c
index bc9eb2c82..0aabd1920 100644
--- a/celt/celt_lpc.c
+++ b/celt/celt_lpc.c
@@ -89,56 +89,47 @@ int          p
 
 
 void celt_fir_c(
-         const opus_val16 *_x,
+         const opus_val16 *x,
          const opus_val16 *num,
-         opus_val16 *_y,
+         opus_val16 *y,
          int N,
          int ord,
-         opus_val16 *mem,
          int arch)
 {
    int i,j;
    VARDECL(opus_val16, rnum);
-   VARDECL(opus_val16, x);
    SAVE_STACK;
 
    ALLOC(rnum, ord, opus_val16);
-   ALLOC(x, N+ord, opus_val16);
    for(i=0;i<ord;i++)
       rnum[i] = num[ord-i-1];
-   for(i=0;i<ord;i++)
-      x[i] = mem[ord-i-1];
-   for (i=0;i<N;i++)
-      x[i+ord]=_x[i];
-   for(i=0;i<ord;i++)
-      mem[i] = _x[N-i-1];
 #ifdef SMALL_FOOTPRINT
    (void)arch;
    for (i=0;i<N;i++)
    {
-      opus_val32 sum = SHL32(EXTEND32(_x[i]), SIG_SHIFT);
+      opus_val32 sum = SHL32(EXTEND32(x[i]), SIG_SHIFT);
       for (j=0;j<ord;j++)
       {
-         sum = MAC16_16(sum,rnum[j],x[i+j]);
+         sum = MAC16_16(sum,rnum[j],x[i+j-ord]);
       }
-      _y[i] = SATURATE16(PSHR32(sum, SIG_SHIFT));
+      y[i] = SATURATE16(PSHR32(sum, SIG_SHIFT));
    }
 #else
    for (i=0;i<N-3;i+=4)
    {
       opus_val32 sum[4]={0,0,0,0};
-      xcorr_kernel(rnum, x+i, sum, ord, arch);
-      _y[i  ] = SATURATE16(ADD32(EXTEND32(_x[i  ]), PSHR32(sum[0], SIG_SHIFT)));
-      _y[i+1] = SATURATE16(ADD32(EXTEND32(_x[i+1]), PSHR32(sum[1], SIG_SHIFT)));
-      _y[i+2] = SATURATE16(ADD32(EXTEND32(_x[i+2]), PSHR32(sum[2], SIG_SHIFT)));
-      _y[i+3] = SATURATE16(ADD32(EXTEND32(_x[i+3]), PSHR32(sum[3], SIG_SHIFT)));
+      xcorr_kernel(rnum, x+i-ord, sum, ord, arch);
+      y[i  ] = SATURATE16(ADD32(EXTEND32(x[i  ]), PSHR32(sum[0], SIG_SHIFT)));
+      y[i+1] = SATURATE16(ADD32(EXTEND32(x[i+1]), PSHR32(sum[1], SIG_SHIFT)));
+      y[i+2] = SATURATE16(ADD32(EXTEND32(x[i+2]), PSHR32(sum[2], SIG_SHIFT)));
+      y[i+3] = SATURATE16(ADD32(EXTEND32(x[i+3]), PSHR32(sum[3], SIG_SHIFT)));
    }
    for (;i<N;i++)
    {
       opus_val32 sum = 0;
       for (j=0;j<ord;j++)
-         sum = MAC16_16(sum,rnum[j],x[i+j]);
-      _y[i] = SATURATE16(ADD32(EXTEND32(_x[i]), PSHR32(sum, SIG_SHIFT)));
+         sum = MAC16_16(sum,rnum[j],x[i+j-ord]);
+      y[i] = SATURATE16(ADD32(EXTEND32(x[i]), PSHR32(sum, SIG_SHIFT)));
    }
 #endif
    RESTORE_STACK;
diff --git a/celt/celt_lpc.h b/celt/celt_lpc.h
index 323459eb1..a4c5fd6ea 100644
--- a/celt/celt_lpc.h
+++ b/celt/celt_lpc.h
@@ -45,12 +45,11 @@ void celt_fir_c(
          opus_val16 *y,
          int N,
          int ord,
-         opus_val16 *mem,
          int arch);
 
 #if !defined(OVERRIDE_CELT_FIR)
-#define celt_fir(x, num, y, N, ord, mem, arch) \
-    (celt_fir_c(x, num, y, N, ord, mem, arch))
+#define celt_fir(x, num, y, N, ord, arch) \
+    (celt_fir_c(x, num, y, N, ord, arch))
 #endif
 
 void celt_iir(const opus_val32 *x,
diff --git a/celt/x86/celt_lpc_sse.c b/celt/x86/celt_lpc_sse.c
index 67e5592ac..12a9b0e50 100644
--- a/celt/x86/celt_lpc_sse.c
+++ b/celt/x86/celt_lpc_sse.c
@@ -40,63 +40,32 @@
 
 #if defined(FIXED_POINT)
 
-void celt_fir_sse4_1(const opus_val16 *_x,
+void celt_fir_sse4_1(const opus_val16 *x,
          const opus_val16 *num,
-         opus_val16 *_y,
+         opus_val16 *y,
          int N,
          int ord,
-         opus_val16 *mem,
          int arch)
 {
     int i,j;
     VARDECL(opus_val16, rnum);
-    VARDECL(opus_val16, x);
 
     __m128i vecNoA;
     opus_int32 noA ;
     SAVE_STACK;
 
    ALLOC(rnum, ord, opus_val16);
-   ALLOC(x, N+ord, opus_val16);
    for(i=0;i<ord;i++)
       rnum[i] = num[ord-i-1];
-   for(i=0;i<ord;i++)
-      x[i] = mem[ord-i-1];
-
-   for (i=0;i<N-7;i+=8)
-   {
-       x[i+ord  ]=_x[i  ];
-       x[i+ord+1]=_x[i+1];
-       x[i+ord+2]=_x[i+2];
-       x[i+ord+3]=_x[i+3];
-       x[i+ord+4]=_x[i+4];
-       x[i+ord+5]=_x[i+5];
-       x[i+ord+6]=_x[i+6];
-       x[i+ord+7]=_x[i+7];
-   }
-
-   for (;i<N-3;i+=4)
-   {
-       x[i+ord  ]=_x[i  ];
-       x[i+ord+1]=_x[i+1];
-       x[i+ord+2]=_x[i+2];
-       x[i+ord+3]=_x[i+3];
-   }
-
-   for (;i<N;i++)
-         x[i+ord]=_x[i];
-
-   for(i=0;i<ord;i++)
-      mem[i] = _x[N-i-1];
 #ifdef SMALL_FOOTPRINT
    for (i=0;i<N;i++)
    {
-      opus_val32 sum = SHL32(EXTEND32(_x[i]), SIG_SHIFT);
+      opus_val32 sum = SHL32(EXTEND32(x[i]), SIG_SHIFT);
       for (j=0;j<ord;j++)
       {
-         sum = MAC16_16(sum,rnum[j],x[i+j]);
+         sum = MAC16_16(sum,rnum[j],x[i+j-ord]);
       }
-      _y[i] = SATURATE16(PSHR32(sum, SIG_SHIFT));
+      y[i] = SATURATE16(PSHR32(sum, SIG_SHIFT));
    }
 #else
    noA = EXTEND32(1) << SIG_SHIFT >> 1;
@@ -107,22 +76,22 @@ void celt_fir_sse4_1(const opus_val16 *_x,
       opus_val32 sums[4] = {0};
       __m128i vecSum, vecX;
 
-      xcorr_kernel(rnum, x+i, sums, ord, arch);
+      xcorr_kernel(rnum, x+i-ord, sums, ord, arch);
 
       vecSum = _mm_loadu_si128((__m128i *)sums);
       vecSum = _mm_add_epi32(vecSum, vecNoA);
       vecSum = _mm_srai_epi32(vecSum, SIG_SHIFT);
-      vecX = OP_CVTEPI16_EPI32_M64(_x + i);
+      vecX = OP_CVTEPI16_EPI32_M64(x + i);
       vecSum = _mm_add_epi32(vecSum, vecX);
       vecSum = _mm_packs_epi32(vecSum, vecSum);
-      _mm_storel_epi64((__m128i *)(_y + i), vecSum);
+      _mm_storel_epi64((__m128i *)(y + i), vecSum);
    }
    for (;i<N;i++)
    {
       opus_val32 sum = 0;
       for (j=0;j<ord;j++)
-         sum = MAC16_16(sum, rnum[j], x[i + j]);
-      _y[i] = SATURATE16(ADD32(EXTEND32(_x[i]), PSHR32(sum, SIG_SHIFT)));
+         sum = MAC16_16(sum, rnum[j], x[i+j-ord]);
+      y[i] = SATURATE16(ADD32(EXTEND32(x[i]), PSHR32(sum, SIG_SHIFT)));
    }
 
 #endif
diff --git a/celt/x86/celt_lpc_sse.h b/celt/x86/celt_lpc_sse.h
index c5ec796ed..7d1ecf753 100644
--- a/celt/x86/celt_lpc_sse.h
+++ b/celt/x86/celt_lpc_sse.h
@@ -41,12 +41,11 @@ void celt_fir_sse4_1(
          opus_val16 *y,
          int N,
          int ord,
-         opus_val16 *mem,
          int arch);
 
 #if defined(OPUS_X86_PRESUME_SSE4_1)
-#define celt_fir(x, num, y, N, ord, mem, arch) \
-    ((void)arch, celt_fir_sse4_1(x, num, y, N, ord, mem, arch))
+#define celt_fir(x, num, y, N, ord, arch) \
+    ((void)arch, celt_fir_sse4_1(x, num, y, N, ord, arch))
 
 #else
 
@@ -56,11 +55,10 @@ extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
          opus_val16 *y,
          int N,
          int ord,
-         opus_val16 *mem,
          int arch);
 
-#  define celt_fir(x, num, y, N, ord, mem, arch) \
-    ((*CELT_FIR_IMPL[(arch) & OPUS_ARCHMASK])(x, num, y, N, ord, mem, arch))
+#  define celt_fir(x, num, y, N, ord, arch) \
+    ((*CELT_FIR_IMPL[(arch) & OPUS_ARCHMASK])(x, num, y, N, ord, arch))
 
 #endif
 #endif
diff --git a/celt/x86/x86_celt_map.c b/celt/x86/x86_celt_map.c
index 80559a28e..d39d88ede 100644
--- a/celt/x86/x86_celt_map.c
+++ b/celt/x86/x86_celt_map.c
@@ -47,7 +47,6 @@ void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
          opus_val16       *y,
          int              N,
          int              ord,
-         opus_val16       *mem,
          int              arch
 ) = {
   celt_fir_c,                /* non-sse */
diff --git a/silk/LPC_analysis_filter.c b/silk/LPC_analysis_filter.c
index 3c616d7be..20330d54c 100644
--- a/silk/LPC_analysis_filter.c
+++ b/silk/LPC_analysis_filter.c
@@ -57,7 +57,6 @@ void silk_LPC_analysis_filter(
 {
     opus_int   j;
 #if USE_CELT_FIR
-    opus_int16 mem[SILK_MAX_ORDER_LPC];
     opus_int16 num[SILK_MAX_ORDER_LPC];
 #else
     int ix;
@@ -74,10 +73,7 @@ void silk_LPC_analysis_filter(
     for ( j = 0; j < d; j++ ) {
         num[ j ] = -B[ j ];
     }
-    for (j=0;j<d;j++) {
-        mem[ j ] = in[ d - j - 1 ];
-    }
-    celt_fir( in + d, num, out + d, len - d, d, mem, arch );
+    celt_fir( in + d, num, out + d, len - d, d, arch );
     for ( j = 0; j < d; j++ ) {
         out[ j ] = 0;
     }
-- 
GitLab