From 0d95b3b48cb0f852b22693bcf49ed4f0e604def3 Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin <jmvalin@jmvalin.ca>
Date: Mon, 17 Jun 2013 03:58:16 -0400
Subject: [PATCH] SSE optimization of comb_filter()

The constant-filter loop is factored out into a new comb_filter_const()
function, which should make it easy to adapt to other architectures.
---
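Note: with this change, celt.c only builds the generic comb_filter_const()
when OVERRIDE_COMB_FILTER_CONST is not defined, so another architecture only
needs to define that macro and supply its own version, exactly as
celt/x86/pitch_sse.h does below. As a rough illustration, a minimal,
hypothetical NEON sketch for the float build could look like this (the file
name and the code are illustrative assumptions, not part of this patch):

    /* e.g. a hypothetical celt/arm/pitch_neon.h, float build only */
    #include <arm_neon.h>

    #define OVERRIDE_COMB_FILTER_CONST
    static inline void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
          opus_val16 g10, opus_val16 g11, opus_val16 g12)
    {
       int i;
       float32x4_t g10v = vdupq_n_f32(g10);
       float32x4_t g11v = vdupq_n_f32(g11);
       float32x4_t g12v = vdupq_n_f32(g12);
       /* x0v holds the 4-sample window starting at x[i-T-2], reused across iterations. */
       float32x4_t x0v = vld1q_f32(&x[-T-2]);
       for (i=0;i<N-3;i+=4)
       {
          const float *xp = &x[i-T-2];
          float32x4_t yi  = vld1q_f32(x+i);
          float32x4_t x4v = vld1q_f32(xp+4);
          /* vextq_f32() plays the role of _mm_shuffle_ps(): rebuild the three
             overlapping windows xp+1, xp+2 and xp+3 from the two loads. */
          float32x4_t x1v = vextq_f32(x0v, x4v, 1);
          float32x4_t x2v = vextq_f32(x0v, x4v, 2);
          float32x4_t x3v = vextq_f32(x0v, x4v, 3);
          yi = vaddq_f32(yi, vmulq_f32(g10v, x2v));
          yi = vaddq_f32(yi, vmulq_f32(g11v, vaddq_f32(x3v, x1v)));
          yi = vaddq_f32(yi, vmulq_f32(g12v, vaddq_f32(x4v, x0v)));
          x0v = x4v;
          vst1q_f32(y+i, yi);
       }
       /* Scalar tail for the last N%4 samples. */
       for (;i<N;i++)
       {
          y[i] = x[i]
                   + MULT16_32_Q15(g10,x[i-T])
                   + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1]))
                   + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2]));
       }
    }

The structure would mirror the SSE code below: one new vector load per
iteration, shifted windows built in registers, and a scalar loop for the
leftover samples.
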
 celt/celt.c          | 41 ++++++++++++++++++++++++------------
 celt/x86/pitch_sse.h | 49 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 77 insertions(+), 13 deletions(-)

diff --git a/celt/celt.c b/celt/celt.c
index dfa5c68b2..3e0ce6e6a 100644
--- a/celt/celt.c
+++ b/celt/celt.c
@@ -85,6 +85,31 @@ int resampling_factor(opus_int32 rate)
    return ret;
 }
 
+#ifndef OVERRIDE_COMB_FILTER_CONST
+static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
+      opus_val16 g10, opus_val16 g11, opus_val16 g12)
+{
+   opus_val32 x0, x1, x2, x3, x4;
+   int i;
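+   /* x1..x4 carry the four delayed samples across iterations, so each pass
+      only needs one new load (x0 = x[i-T+2]). */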
+   x4 = x[-T-2];
+   x3 = x[-T-1];
+   x2 = x[-T];
+   x1 = x[-T+1];
+   for (i=0;i<N;i++)
+   {
+      x0=x[i-T+2];
+      y[i] = x[i]
+               + MULT16_32_Q15(g10,x2)
+               + MULT16_32_Q15(g11,ADD32(x1,x3))
+               + MULT16_32_Q15(g12,ADD32(x0,x4));
+      x4=x3;
+      x3=x2;
+      x2=x1;
+      x1=x0;
+   }
+
+}
+#endif
 
 void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
       opus_val16 g0, opus_val16 g1, int tapset0, int tapset1,
@@ -141,19 +166,9 @@ void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
          OPUS_MOVE(y+overlap, x+overlap, N-overlap);
       return;
    }
-   /* OPT: For machines where the movs are costly, unroll by 5 */
-   for (;i<N;i++)
-   {
-      x0=x[i-T1+2];
-      y[i] = x[i]
-               + MULT16_32_Q15(g10,x2)
-               + MULT16_32_Q15(g11,ADD32(x1,x3))
-               + MULT16_32_Q15(g12,ADD32(x0,x4));
-      x4=x3;
-      x3=x2;
-      x2=x1;
-      x1=x0;
-   }
+
+   /* Compute the part with the constant filter. */
+   comb_filter_const(y+i, x+i, T1, N-i, g10, g11, g12);
 }
 
 const signed char tf_select_table[4][8] = {
diff --git a/celt/x86/pitch_sse.h b/celt/x86/pitch_sse.h
index d24139806..64c96af2c 100644
--- a/celt/x86/pitch_sse.h
+++ b/celt/x86/pitch_sse.h
@@ -100,5 +100,54 @@ static inline opus_val32 dual_inner_prod(const opus_val16 *x, const opus_val16 *
    return xy;
 }
 
+#define OVERRIDE_COMB_FILTER_CONST
+static inline void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
+      opus_val16 g10, opus_val16 g11, opus_val16 g12)
+{
+   int i;
+   __m128 x0v;
+   __m128 g10v, g11v, g12v;
+   g10v = _mm_load1_ps(&g10);
+   g11v = _mm_load1_ps(&g11);
+   g12v = _mm_load1_ps(&g12);
+   x0v = _mm_loadu_ps(&x[-T-2]);
+   for (i=0;i<N-3;i+=4)
+   {
+      __m128 yi, yi2, x1v, x2v, x3v, x4v;
+      const opus_val32 *xp = &x[i-T-2];
+      yi = _mm_loadu_ps(x+i);
+      x4v = _mm_loadu_ps(xp+4);
+#if 0
+      /* Slower version with all loads */
+      x1v = _mm_loadu_ps(xp+1);
+      x2v = _mm_loadu_ps(xp+2);
+      x3v = _mm_loadu_ps(xp+3);
+#else
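+      /* x0v holds xp[0..3] (carried over from the previous iteration) and x4v
+         holds xp[4..7], so the three overlapping windows xp+1, xp+2 and xp+3
+         can be rebuilt with shuffles instead of more unaligned loads:
+         0x4e selects [a2,a3,b0,b1] and 0x99 selects [a1,a2,b1,b2]. */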
+      x2v = _mm_shuffle_ps(x0v, x4v, 0x4e);
+      x1v = _mm_shuffle_ps(x0v, x2v, 0x99);
+      x3v = _mm_shuffle_ps(x2v, x4v, 0x99);
+#endif
+
+      yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v));
+#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */
+      yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)));
+      yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
+#else
+      /* Use partial sums */
+      yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)),
+                       _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
+      yi = _mm_add_ps(yi, yi2);
+#endif
+      x0v=x4v;
+      _mm_storeu_ps(y+i, yi);
+   }
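+   /* Handle any remaining samples (N not a multiple of 4) with the scalar filter. */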
+   for (;i<N;i++)
+   {
+      y[i] = x[i]
+               + MULT16_32_Q15(g10,x[i-T])
+               + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1]))
+               + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2]));
+   }
+}
 
 #endif
-- 
GitLab