Compare revisions

1c370855 · 1c370855 · 1c370855 · 1c370855 · 1c370855 · 1c370855
--- a/celt/mdct.c
+++ b/celt/mdct.c
+/* Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2008 Xiph.Org Foundation
+   Written by Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* This is a simple MDCT implementation that uses a N/4 complex FFT
+   to do most of the work. It should be relatively straightforward to
+   plug in pretty much any FFT here.
+
+   This replaces the Vorbis FFT (and uses the exact same API), which
+   was a bit too messy and that was ending up duplicating code
+   (might as well use the same FFT everywhere).
+
+   The algorithm is similar to (and inspired from) Fabrice Bellard's
+   MDCT implementation in FFMPEG, but has differences in signs, ordering
+   and scaling in many places.
+*/
+
+#ifndef SKIP_CONFIG_H
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+#endif
+
+#include "mdct.h"
+#include "kiss_fft.h"
+#include "_kiss_fft_guts.h"
+#include <math.h>
+#include "os_support.h"
+#include "mathops.h"
+#include "stack_alloc.h"
+
+#if defined(MIPSr1_ASM)
+#include "mips/mdct_mipsr1.h"
+#endif
+
+/* Fallback for <math.h> implementations that do not provide M_PI.
+   Full double precision is used because M_PI feeds the Q31 twiddle
+   generation in clt_mdct_init(); a 10-digit value is coarser than one
+   step of that table. */
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+
+#ifdef CUSTOM_MODES
+
+/* Prepare an MDCT lookup for transform size N and for the halved sizes
+   N>>1 ... N>>maxshift.
+
+   One FFT state of size N>>2>>shift is allocated per shift (the states for
+   shift>0 are created from kfft[0] via opus_fft_alloc_twiddles()), followed
+   by a single twiddle table that stores, for each shift in turn, N2 cosine
+   twiddles for the current N and N2=N/2.
+
+   Returns 1 on success and 0 if maxshift does not fit l->kfft[] or an
+   allocation fails. Nothing is released on failure; every pointer that was
+   not allocated is left NULL so no indeterminate pointer is ever exposed. */
+int clt_mdct_init(mdct_lookup *l,int N, int maxshift, int arch)
+{
+   int i;
+   kiss_twiddle_scalar *trig;
+   int shift;
+   int N2=N>>1;
+   const int nb_kfft = (int)(sizeof(l->kfft)/sizeof(l->kfft[0]));
+   l->n = N;
+   /* Start from a fully defined state before anything can fail */
+   l->trig = NULL;
+   for (i=0;i<nb_kfft;i++)
+      l->kfft[i] = NULL;
+   /* kfft[] has a fixed number of slots; refuse shifts that would overrun it */
+   if (maxshift<0 || maxshift>=nb_kfft)
+   {
+      l->maxshift = -1;
+      return 0;
+   }
+   l->maxshift = maxshift;
+   for (i=0;i<=maxshift;i++)
+   {
+      if (i==0)
+         l->kfft[i] = opus_fft_alloc(N>>2>>i, 0, 0, arch);
+      else
+         l->kfft[i] = opus_fft_alloc_twiddles(N>>2>>i, 0, 0, l->kfft[0], arch);
+#ifndef ENABLE_TI_DSPLIB55
+      if (l->kfft[i]==NULL)
+         return 0;
+#endif
+   }
+   /* Table size is N2 + (N2>>1) + ... + (N2>>maxshift) = N - (N2>>maxshift) */
+   l->trig = trig = (kiss_twiddle_scalar*)opus_alloc((N-(N2>>maxshift))*sizeof(kiss_twiddle_scalar));
+   if (l->trig==NULL)
+     return 0;
+   for (shift=0;shift<=maxshift;shift++)
+   {
+      /* We have enough points that sine isn't necessary */
+#if defined(FIXED_POINT)
+#ifndef ENABLE_QEXT
+      for (i=0;i<N2;i++)
+         trig[i] = TRIG_UPSCALE*celt_cos_norm(DIV32(ADD32(SHL32(EXTEND32(i),17),N2+16384),N));
+#else
+      for (i=0;i<N2;i++)
+         trig[i] = (kiss_twiddle_scalar)MAX32(-2147483647,MIN32(2147483647,floor(.5+2147483648*cos(2*M_PI*(i+.125)/N))));
+#endif
+#else
+      for (i=0;i<N2;i++)
+         trig[i] = (kiss_twiddle_scalar)cos(2*PI*(i+.125)/N);
+#endif
+      /* Move on to the sub-table for the next (halved) transform size */
+      trig += N2;
+      N2 >>= 1;
+      N >>= 1;
+   }
+   return 1;
+}
+
+/* Release what clt_mdct_init() attached to the lookup: one FFT state per
+   shift in [0, l->maxshift], then the twiddle table. */
+void clt_mdct_clear(mdct_lookup *l, int arch)
+{
+   int shift = 0;
+   while (shift <= l->maxshift)
+   {
+      opus_fft_free(l->kfft[shift], arch);
+      shift++;
+   }
+   /* l->trig is const-qualified in the struct; cast it back for opus_free() */
+   opus_free((kiss_twiddle_scalar*)l->trig);
+}
+
+#endif /* CUSTOM_MODES */
+
+/* Forward MDCT trashes the input array.
+
+   Windows and folds the input into N/2 reals, pre-rotates them by the
+   twiddles, runs the N/4-point complex FFT selected by `shift`, then
+   post-rotates the result into `out`. N is l->n >> shift.
+
+   l       - lookup holding the per-shift FFT states and the twiddle table
+   in      - time-domain input; this C version only reads it through const
+             pointers, but the API contract allows it to be overwritten
+   out     - receives N/2 coefficients, `stride` elements apart
+   window  - window coefficients; indices 0 .. overlap-1 are used
+   overlap - length of the windowed region
+   shift   - index into l->kfft[] and number of halvings applied to l->n
+   stride  - spacing between consecutive output coefficients
+   arch    - unused here */
+#ifndef OVERRIDE_clt_mdct_forward
+void clt_mdct_forward_c(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
+      const celt_coef *window, int overlap, int shift, int stride, int arch)
+{
+   int i;
+   int N, N2, N4;
+   VARDECL(kiss_fft_scalar, f);
+   VARDECL(kiss_fft_cpx, f2);
+   const kiss_fft_state *st = l->kfft[shift];
+   const kiss_twiddle_scalar *trig;
+   celt_coef scale;
+#ifdef FIXED_POINT
+   /* Allows us to scale with MULT16_32_Q16(), which is faster than
+      MULT16_32_Q15() on ARM. */
+   int scale_shift = st->scale_shift-1;
+   int headroom;
+#endif
+   SAVE_STACK;
+   (void)arch;
+   scale = st->scale;
+
+   N = l->n;
+   trig = l->trig;
+   /* Skip the twiddle sub-tables of the larger transform sizes */
+   for (i=0;i<shift;i++)
+   {
+      N >>= 1;
+      trig += N;
+   }
+   N2 = N>>1;
+   N4 = N>>2;
+   /* f: N2 folded real samples; f2: N4 complex values handed to the FFT */
+   ALLOC(f, N2, kiss_fft_scalar);
+   ALLOC(f2, N4, kiss_fft_cpx);
+
+   /* Consider the input to be composed of four blocks: [a, b, c, d] */
+   /* Window, shuffle, fold */
+   {
+      /* Temp pointers to make it really clear to the compiler what we're doing */
+      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1);
+      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1);
+      kiss_fft_scalar * OPUS_RESTRICT yp = f;
+      const celt_coef * OPUS_RESTRICT wp1 = window+(overlap>>1);
+      const celt_coef * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
+      for(i=0;i<((overlap+3)>>2);i++)
+      {
+         /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
+         *yp++ = S_MUL(xp1[N2], *wp2) + S_MUL(*xp2, *wp1);
+         *yp++ = S_MUL(*xp1, *wp1)    - S_MUL(xp2[-N2], *wp2);
+         xp1+=2;
+         xp2-=2;
+         wp1+=2;
+         wp2-=2;
+      }
+      wp1 = window;
+      wp2 = window+overlap-1;
+      /* Middle section: no window is applied, samples are only shuffled */
+      for(;i<N4-((overlap+3)>>2);i++)
+      {
+         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
+         *yp++ = *xp2;
+         *yp++ = *xp1;
+         xp1+=2;
+         xp2-=2;
+      }
+      for(;i<N4;i++)
+      {
+         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
+         *yp++ =  -S_MUL(xp1[-N2], *wp1) + S_MUL(*xp2, *wp2);
+         *yp++ = S_MUL(*xp1, *wp2)     + S_MUL(xp2[N2], *wp1);
+         xp1+=2;
+         xp2-=2;
+         wp1+=2;
+         wp2-=2;
+      }
+   }
+   /* Pre-rotation */
+   {
+      kiss_fft_scalar * OPUS_RESTRICT yp = f;
+      const kiss_twiddle_scalar *t = &trig[0];
+#ifdef FIXED_POINT
+      opus_val32 maxval=1;
+#endif
+      for(i=0;i<N4;i++)
+      {
+         kiss_fft_cpx yc;
+         kiss_twiddle_scalar t0, t1;
+         kiss_fft_scalar re, im, yr, yi;
+         t0 = t[i];
+         t1 = t[N4+i];
+         re = *yp++;
+         im = *yp++;
+         yr = S_MUL(re,t0)  -  S_MUL(im,t1);
+         yi = S_MUL(im,t0)  +  S_MUL(re,t1);
+         /* For QEXT, it's best to scale before the FFT, but otherwise it's best to scale after.
+            For floating-point it doesn't matter. */
+#ifdef ENABLE_QEXT
+         yc.r = yr;
+         yc.i = yi;
+#else
+         yc.r = S_MUL2(yr, scale);
+         yc.i = S_MUL2(yi, scale);
+#endif
+#ifdef FIXED_POINT
+         maxval = MAX32(maxval, MAX32(ABS32(yc.r), ABS32(yc.i)));
+#endif
+         /* Store directly in the FFT's bit-reversed input order */
+         f2[st->bitrev[i]] = yc;
+      }
+#ifdef FIXED_POINT
+      headroom = IMAX(0, IMIN(scale_shift, 28-celt_ilog2(maxval)));
+#endif
+   }
+
+   /* N/4 complex FFT, does not downscale anymore. In fixed-point builds the
+      FFT receives scale_shift-headroom; the remaining `headroom` bits are
+      removed by the PSHR32() in the post-rotation below. */
+   opus_fft_impl(st, f2 ARG_FIXED(scale_shift-headroom));
+
+   /* Post-rotate */
+   {
+      /* Temp pointers to make it really clear to the compiler what we're doing */
+      const kiss_fft_cpx * OPUS_RESTRICT fp = f2;
+      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
+      kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
+      const kiss_twiddle_scalar *t = &trig[0];
+      /* Outputs are written from both ends of `out` towards the middle,
+         two strides at a time */
+      for(i=0;i<N4;i++)
+      {
+         kiss_fft_scalar yr, yi;
+         kiss_fft_scalar t0, t1;
+#ifdef ENABLE_QEXT
+         t0 = S_MUL2(t[i], scale);
+         t1 = S_MUL2(t[N4+i], scale);
+#else
+         t0 = t[i];
+         t1 = t[N4+i];
+#endif
+         yr = PSHR32(S_MUL(fp->i,t1) - S_MUL(fp->r,t0), headroom);
+         yi = PSHR32(S_MUL(fp->r,t1) + S_MUL(fp->i,t0), headroom);
+         *yp1 = yr;
+         *yp2 = yi;
+         fp++;
+         yp1 += 2*stride;
+         yp2 -= 2*stride;
+      }
+   }
+   RESTORE_STACK;
+}
+#endif /* OVERRIDE_clt_mdct_forward */
+/* Backward MDCT with windowed mirroring for TDAC.
+
+   Pre-rotates the N/2 strided coefficients of `in` straight into
+   out+(overlap>>1) in bit-reversed order, runs the N/4-point complex FFT in
+   place there, post-rotates in place from both ends, and finally mixes the
+   first `overlap` samples of `out` with the window. N is l->n >> shift.
+
+   NOTE(review): out[0 .. overlap/2) is read by the mirroring stage without
+   being written earlier in this function, so it must already hold valid
+   data on entry (presumably the previous frame's tail - confirm with the
+   callers). `arch` is unused here. */
+#ifndef OVERRIDE_clt_mdct_backward
+void clt_mdct_backward_c(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
+      const celt_coef * OPUS_RESTRICT window, int overlap, int shift, int stride, int arch)
+{
+   int i;
+   int N, N2, N4;
+   const kiss_twiddle_scalar *trig;
+   (void) arch;
+
+   N = l->n;
+   trig = l->trig;
+   /* Skip the twiddle sub-tables of the larger transform sizes */
+   for (i=0;i<shift;i++)
+   {
+      N >>= 1;
+      trig += N;
+   }
+   N2 = N>>1;
+   N4 = N>>2;
+
+   /* Pre-rotate */
+   {
+      /* Temp pointers to make it really clear to the compiler what we're doing */
+      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
+      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
+      kiss_fft_scalar * OPUS_RESTRICT yp = out+(overlap>>1);
+      const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0];
+      const opus_int16 * OPUS_RESTRICT bitrev = l->kfft[shift]->bitrev;
+      for(i=0;i<N4;i++)
+      {
+         int rev;
+         kiss_fft_scalar yr, yi;
+         opus_val32 x1, x2;
+         rev = *bitrev++;
+         /* Scale up by IMDCT_HEADROOM bits; undone in the post-rotation */
+         x1 = SHL32(*xp1, IMDCT_HEADROOM);
+         x2 = SHL32(*xp2, IMDCT_HEADROOM);
+         yr = ADD32_ovflw(S_MUL(x2, t[i]), S_MUL(x1, t[N4+i]));
+         yi = SUB32_ovflw(S_MUL(x1, t[i]), S_MUL(x2, t[N4+i]));
+         /* We swap real and imag because we use an FFT instead of an IFFT. */
+         yp[2*rev+1] = yr;
+         yp[2*rev] = yi;
+         /* Storing the pre-rotation directly in the bitrev order. */
+         xp1+=2*stride;
+         xp2-=2*stride;
+      }
+   }
+   /* In-place FFT over the N4 complex values stored at out+(overlap>>1) */
+   opus_fft_impl(l->kfft[shift], (kiss_fft_cpx*)(out+(overlap>>1)) ARG_FIXED(0));
+
+   /* Post-rotate and de-shuffle from both ends of the buffer at once to make
+      it in-place. */
+   {
+      kiss_fft_scalar * yp0 = out+(overlap>>1);
+      kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2;
+      const kiss_twiddle_scalar *t = &trig[0];
+      /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the
+         middle pair will be computed twice. */
+      for(i=0;i<(N4+1)>>1;i++)
+      {
+         kiss_fft_scalar re, im, yr, yi;
+         kiss_twiddle_scalar t0, t1;
+         /* We swap real and imag because we're using an FFT instead of an IFFT. */
+         re = yp0[1];
+         im = yp0[0];
+         t0 = t[i];
+         t1 = t[N4+i];
+         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
+         yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), IMDCT_HEADROOM);
+         yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), IMDCT_HEADROOM);
+         /* We swap real and imag because we're using an FFT instead of an IFFT.
+            The far-end pair is loaded before yp1[1] is overwritten below. */
+         re = yp1[1];
+         im = yp1[0];
+         yp0[0] = yr;
+         yp1[1] = yi;
+
+         t0 = t[(N4-i-1)];
+         t1 = t[(N2-i-1)];
+         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
+         yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), IMDCT_HEADROOM);
+         yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), IMDCT_HEADROOM);
+         yp1[0] = yr;
+         yp0[1] = yi;
+         yp0 += 2;
+         yp1 -= 2;
+      }
+   }
+
+   /* Mirror on both sides for TDAC */
+   {
+      kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1;
+      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
+      const celt_coef * OPUS_RESTRICT wp1 = window;
+      const celt_coef * OPUS_RESTRICT wp2 = window+overlap-1;
+      /* Walk inwards from out[0] and out[overlap-1], updating both in place */
+      for(i = 0; i < overlap/2; i++)
+      {
+         kiss_fft_scalar x1, x2;
+         x1 = *xp1;
+         x2 = *yp1;
+         *yp1++ = SUB32_ovflw(S_MUL(x2, *wp2), S_MUL(x1, *wp1));
+         *xp1-- = ADD32_ovflw(S_MUL(x2, *wp1), S_MUL(x1, *wp2));
+         wp1++;
+         wp2--;
+      }
+   }
+}
+#endif /* OVERRIDE_clt_mdct_backward */
--- a/celt/mdct.h
+++ b/celt/mdct.h
+/* Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2008 Xiph.Org Foundation
+   Written by Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* This is a simple MDCT implementation that uses a N/4 complex FFT
+   to do most of the work. It should be relatively straightforward to
+   plug in pretty much any FFT here.
+
+   This replaces the Vorbis FFT (and uses the exact same API), which
+   was a bit too messy and that was ending up duplicating code
+   (might as well use the same FFT everywhere).
+
+   The algorithm is similar to (and inspired from) Fabrice Bellard's
+   MDCT implementation in FFMPEG, but has differences in signs, ordering
+   and scaling in many places.
+*/
+
+#ifndef MDCT_H
+#define MDCT_H
+
+#include "opus_defines.h"
+#include "kiss_fft.h"
+#include "arch.h"
+
+typedef struct { /* Precomputed state shared by the forward and backward MDCT */
+   int n; /* Full transform size N, as passed to clt_mdct_init() */
+   int maxshift; /* Largest prepared shift; size n>>shift for shift in [0, maxshift] */
+   const kiss_fft_state *kfft[4]; /* kfft[shift]: FFT state of size n>>2>>shift */
+   const kiss_twiddle_scalar * OPUS_RESTRICT trig; /* Twiddle sub-tables, one per shift, stored back to back */
+} mdct_lookup;
+
+#if defined(HAVE_ARM_NE10)
+#include "arm/mdct_arm.h"
+#endif
+
+/* There should be 2 bits of headroom in the IMDCT which we can take
+   advantage of to maximize accuracy. */
+#define IMDCT_HEADROOM 2
+/* clt_mdct_init() fills `l` for transform size N and shifts 0..maxshift and
+   returns 1 on success, 0 on failure; clt_mdct_clear() releases what init
+   allocated. mdct.c only defines both when CUSTOM_MODES is set. */
+int clt_mdct_init(mdct_lookup *l,int N, int maxshift, int arch);
+void clt_mdct_clear(mdct_lookup *l, int arch);
+
+/** Compute a forward MDCT and scale by 4/N, trashes the input array */
+void clt_mdct_forward_c(const mdct_lookup *l, kiss_fft_scalar *in,
+                        kiss_fft_scalar * OPUS_RESTRICT out,
+                        const celt_coef *window, int overlap,
+                        int shift, int stride, int arch);
+
+/** Compute a backward MDCT (no scaling) and perform weighted overlap-add
+    (scales implicitly by 1/2) */
+void clt_mdct_backward_c(const mdct_lookup *l, kiss_fft_scalar *in,
+      kiss_fft_scalar * OPUS_RESTRICT out,
+      const celt_coef * OPUS_RESTRICT window,
+      int overlap, int shift, int stride, int arch);
+
+#if !defined(OVERRIDE_OPUS_MDCT)
+/* Is run-time CPU detection enabled on this platform? */
+#if defined(OPUS_HAVE_RTCD) && defined(HAVE_ARM_NE10)
+
+/* Table of forward-MDCT implementations, indexed by arch&OPUS_ARCHMASK. */
+extern void (*const CLT_MDCT_FORWARD_IMPL[OPUS_ARCHMASK+1])(
+      const mdct_lookup *l, kiss_fft_scalar *in,
+      kiss_fft_scalar * OPUS_RESTRICT out, const celt_coef *window,
+      int overlap, int shift, int stride, int arch);
+
+/* Dispatch on the macro's own _arch argument rather than on whatever
+   variable named `arch` happens to be in scope at the call site. */
+#define clt_mdct_forward(_l, _in, _out, _window, _overlap, _shift, _stride, _arch) \
+   ((*CLT_MDCT_FORWARD_IMPL[(_arch)&OPUS_ARCHMASK])(_l, _in, _out, \
+                                                    _window, _overlap, _shift, \
+                                                    _stride, _arch))
+
+/* Table of backward-MDCT implementations, indexed by arch&OPUS_ARCHMASK. */
+extern void (*const CLT_MDCT_BACKWARD_IMPL[OPUS_ARCHMASK+1])(
+      const mdct_lookup *l, kiss_fft_scalar *in,
+      kiss_fft_scalar * OPUS_RESTRICT out, const celt_coef *window,
+      int overlap, int shift, int stride, int arch);
+
+/* Dispatch on the macro's own _arch argument rather than on whatever
+   variable named `arch` happens to be in scope at the call site. The whole
+   expansion is parenthesized, like clt_mdct_forward(). */
+#define clt_mdct_backward(_l, _in, _out, _window, _overlap, _shift, _stride, _arch) \
+   ((*CLT_MDCT_BACKWARD_IMPL[(_arch)&OPUS_ARCHMASK])(_l, _in, _out, \
+                                                     _window, _overlap, _shift, \
+                                                     _stride, _arch))
+
+#else /* if defined(OPUS_HAVE_RTCD) && defined(HAVE_ARM_NE10) */
+/* No run-time dispatch: both entry points map straight to the C versions. */
+#define clt_mdct_forward(_l, _in, _out, _window, _overlap, _shift, _stride, _arch) \
+   clt_mdct_forward_c(_l, _in, _out, _window, _overlap, _shift, _stride, _arch)
+
+#define clt_mdct_backward(_l, _in, _out, _window, _overlap, _shift, _stride, _arch) \
+   clt_mdct_backward_c(_l, _in, _out, _window, _overlap, _shift, _stride, _arch)
+
+#endif /* end if defined(OPUS_HAVE_RTCD) && defined(HAVE_ARM_NE10) */
+#endif /* end if !defined(OVERRIDE_OPUS_MDCT) */
+
+#endif
--- a/celt/meson.build
+++ b/celt/meson.build
+celt_sources = sources['CELT_SOURCES']
+# Per-instruction-set source lists, looked up by name in the foreach below.
+celt_sse_sources = sources['CELT_SOURCES_SSE']
+
+celt_sse2_sources = sources['CELT_SOURCES_SSE2']
+
+celt_sse4_1_sources = sources['CELT_SOURCES_SSE4_1']
+
+celt_avx2_sources = sources['CELT_SOURCES_AVX2']
+
+celt_neon_intr_sources = sources['CELT_SOURCES_ARM_NEON_INTR']
+# Helper static libraries that get folded into opus-celt at the end.
+celt_static_libs = []
+# x86 run-time CPU detection sources are only needed when RTCD is enabled.
+if host_cpu_family in ['x86', 'x86_64'] and opus_conf.has('OPUS_HAVE_RTCD')
+  celt_sources +=  sources['CELT_SOURCES_X86_RTCD']
+endif
+# One static library per enabled intrinsics set, built with that set's own c_args.
+foreach intr_name : ['sse', 'sse2', 'sse4_1', 'avx2', 'neon_intr']
+  have_intr = get_variable('have_' + intr_name)
+  if not have_intr
+    continue
+  endif
+
+  intr_sources = get_variable('celt_@0@_sources'.format(intr_name))
+  intr_args = get_variable('opus_@0@_args'.format(intr_name), [])
+  celt_static_libs += static_library('celt_' + intr_name, intr_sources,
+      c_args: intr_args,
+      include_directories: opus_includes,
+      install: false)
+endforeach
+# True when NE10 is available or any intrinsics/asm/inline optimization is listed.
+have_arm_intrinsics_or_asm = have_arm_ne10
+if (intrinsics_support.length() + asm_optimization.length() + inline_optimization.length()) > 0
+  have_arm_intrinsics_or_asm = true
+endif
+# ARM extras: RTCD sources, NE10 sources, and the external assembly library.
+if host_cpu_family in ['arm', 'aarch64'] and have_arm_intrinsics_or_asm
+  if opus_conf.has('OPUS_HAVE_RTCD')
+    celt_sources +=  sources['CELT_SOURCES_ARM_RTCD']
+  endif
+  if have_arm_ne10
+    celt_sources += sources['CELT_SOURCES_ARM_NE10']
+  endif
+  if opus_arm_external_asm
+    subdir('arm')
+    celt_static_libs += static_library('celt-armasm',
+      celt_arm_armopts_s, celt_sources_arm_asm,
+      install: false)
+  endif
+endif
+# Windows builds additionally define DLL_EXPORT.
+celt_c_args = []
+if host_system == 'windows'
+  celt_c_args += ['-DDLL_EXPORT']
+endif
+# link_whole folds every object of the helper libraries into opus-celt.
+celt_lib = static_library('opus-celt',
+  celt_sources,
+  c_args: celt_c_args,
+  include_directories: opus_includes,
+  link_whole: celt_static_libs,
+  dependencies: libm,
+  install: false)
--- a/libcelt/mfrngcod.h
+++ b/libcelt/mfrngcod.h
@@ -15,8 +15,8 @@
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
-   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
@@ -40,17 +40,9 @@
 /*Bits to shift by to move a symbol into the high-order position.*/
 # define EC_CODE_SHIFT (EC_CODE_BITS-EC_SYM_BITS-1)
 /*Carry bit of the high-order range symbol.*/
-# define EC_CODE_TOP   (((celt_uint32)1U)<<EC_CODE_BITS-1)
+# define EC_CODE_TOP   (((opus_uint32)1U)<<(EC_CODE_BITS-1))
 /*Low-order bit of the high-order range symbol.*/
 # define EC_CODE_BOT   (EC_CODE_TOP>>EC_SYM_BITS)
-/*Code for which propagating carries are possible.*/
-# define EC_CODE_CARRY (((celt_uint32)EC_SYM_MAX)<<EC_CODE_SHIFT)
 /*The number of bits available for the last, partial symbol in the code field.*/
 # define EC_CODE_EXTRA ((EC_CODE_BITS-2)%EC_SYM_BITS+1)
-/*A mask for the bits available in the coding buffer.
-  This allows different platforms to use a variable with more bits, if it is
-   convenient.
-  We will only use EC_CODE_BITS of it.*/
-# define EC_CODE_MASK  ((((celt_uint32)1U)<<EC_CODE_BITS-1)-1<<1|1)
-
 #endif
--- a/celt/mips/celt_mipsr1.h
+++ b/celt/mips/celt_mipsr1.h
+/* Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2010 Xiph.Org Foundation
+   Copyright (c) 2008 Gregory Maxwell
+   Written by Jean-Marc Valin and Gregory Maxwell */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef CELT_MIPSR1_H__
+#define CELT_MIPSR1_H__
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#define CELT_C
+
+#include "os_support.h"
+#include "mdct.h"
+#include <math.h>
+#include "celt.h"
+#include "pitch.h"
+#include "bands.h"
+#include "modes.h"
+#include "entcode.h"
+#include "quant_bands.h"
+#include "rate.h"
+#include "stack_alloc.h"
+#include "mathops.h"
+#include "float_cast.h"
+#include <stdarg.h>
+#include "celt_lpc.h"
+#include "vq.h"
+/* MIPS comb filter: y[i] = x[i] + a 5-tap symmetric filter of x centred on
+   the lag, accumulated in the DSP accumulator $ac1.
+
+   Over the first `overlap` samples the old filter (T0, g0, tapset0) is
+   cross-faded into the new one (T1, g1, tapset1) with weight
+   f = window[i]^2; the remaining samples up to N use the new filter only.
+   If both gains are zero the input is simply copied to y.
+
+   The two OVERRIDE_ macros below presumably compile out the generic
+   versions - confirm against celt.c. `arch` is unused here.
+
+   NOTE(review): consecutive asm statements pass their running sum through
+   $ac1, which is neither an operand nor a clobber, so the code relies on
+   the compiler keeping each MULT/MADD/EXTR.W group back to back. */
+#define OVERRIDE_COMB_FILTER_CONST
+#define OVERRIDE_comb_filter
+void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
+      opus_val16 g0, opus_val16 g1, int tapset0, int tapset1,
+      const opus_val16 *window, int overlap, int arch)
+{
+   int i;
+   opus_val32 x0, x1, x2, x3, x4;
+
+   (void)arch;
+
+   /* Per-tap gains: g0x for the old filter (g0, tapset0), g1x for the new
+      one (g1, tapset1); index x selects the centre, +/-1 and +/-2 taps */
+   opus_val16 g00, g01, g02, g10, g11, g12;
+   static const opus_val16 gains[3][3] = {
+         {QCONST16(0.3066406250f, 15), QCONST16(0.2170410156f, 15), QCONST16(0.1296386719f, 15)},
+         {QCONST16(0.4638671875f, 15), QCONST16(0.2680664062f, 15), QCONST16(0.f, 15)},
+         {QCONST16(0.7998046875f, 15), QCONST16(0.1000976562f, 15), QCONST16(0.f, 15)}};
+
+   if (g0==0 && g1==0)
+   {
+      /* OPT: Happens to work without the OPUS_MOVE(), but only because the current encoder already copies x to y */
+      if (x!=y)
+         OPUS_MOVE(y, x, N);
+      return;
+   }
+
+   g00 = MULT16_16_P15(g0, gains[tapset0][0]);
+   g01 = MULT16_16_P15(g0, gains[tapset0][1]);
+   g02 = MULT16_16_P15(g0, gains[tapset0][2]);
+   g10 = MULT16_16_P15(g1, gains[tapset1][0]);
+   g11 = MULT16_16_P15(g1, gains[tapset1][1]);
+   g12 = MULT16_16_P15(g1, gains[tapset1][2]);
+   /* Sliding window over x around lag T1; x0 is loaded inside the loops */
+   x1 = x[-T1+1];
+   x2 = x[-T1  ];
+   x3 = x[-T1-1];
+   x4 = x[-T1-2];
+   /* If the filter didn't change, we don't need the overlap */
+   if (g0==g1 && T0==T1 && tapset0==tapset1)
+      overlap=0;
+
+   for (i=0;i<overlap;i++)
+   {
+      opus_val16 f;
+      opus_val32 res;
+      f = MULT16_16_Q15(window[i],window[i]);
+      x0= x[i-T1+2];
+      /* Old filter, weighted by (1-f): MULT starts the sum in $ac1 */
+      asm volatile("MULT $ac1, %0, %1" : : "r" ((int)MULT16_16_Q15((Q15ONE-f),g00)), "r" ((int)x[i-T0]));
+
+      asm volatile("MADD $ac1, %0, %1" : : "r" ((int)MULT16_16_Q15((Q15ONE-f),g01)), "r" ((int)ADD32(x[i-T0-1],x[i-T0+1])));
+      asm volatile("MADD $ac1, %0, %1" : : "r" ((int)MULT16_16_Q15((Q15ONE-f),g02)), "r" ((int)ADD32(x[i-T0-2],x[i-T0+2])));
+      asm volatile("MADD $ac1, %0, %1" : : "r" ((int)MULT16_16_Q15(f,g10)), "r" ((int)x2));
+      asm volatile("MADD $ac1, %0, %1" : : "r" ((int)MULT16_16_Q15(f,g11)), "r" ((int)ADD32(x3,x1)));
+      asm volatile("MADD $ac1, %0, %1" : : "r" ((int)MULT16_16_Q15(f,g12)), "r" ((int)ADD32(x4,x0)));
+      /* Read the sum back, shifted right by 15 */
+      asm volatile("EXTR.W %0,$ac1, %1" : "=r" (res): "i" (15));
+
+      y[i] = x[i] + res;
+
+      x4=x3;
+      x3=x2;
+      x2=x1;
+      x1=x0;
+   }
+   /* i equals overlap here; reload the window for the constant-filter part */
+   x4 = x[i-T1-2];
+   x3 = x[i-T1-1];
+   x2 = x[i-T1];
+   x1 = x[i-T1+1];
+
+   if (g1==0)
+   {
+      /* OPT: Happens to work without the OPUS_MOVE(), but only because the current encoder already copies x to y */
+      if (x!=y)
+         OPUS_MOVE(y+overlap, x+overlap, N-overlap);
+      return;
+   }
+
+   for (i=overlap;i<N;i++)
+   {
+      opus_val32 res;
+      x0=x[i-T1+2];
+      /* New filter only: centre tap, then the +/-1 and +/-2 tap pairs */
+      asm volatile("MULT $ac1, %0, %1" : : "r" ((int)g10), "r" ((int)x2));
+
+      asm volatile("MADD $ac1, %0, %1" : : "r" ((int)g11), "r" ((int)ADD32(x3,x1)));
+      asm volatile("MADD $ac1, %0, %1" : : "r" ((int)g12), "r" ((int)ADD32(x4,x0)));
+      asm volatile("EXTR.W %0,$ac1, %1" : "=r" (res): "i" (15));
+      y[i] = x[i] + res;
+      x4=x3;
+      x3=x2;
+      x2=x1;
+      x1=x0;
+   }
+}
+
+#endif /* CELT_MIPSR1_H__ */
--- a/celt/mips/fixed_generic_mipsr1.h
+++ b/celt/mips/fixed_generic_mipsr1.h
+/* Copyright (C) 2007-2009 Xiph.Org Foundation
+   Copyright (C) 2003-2008 Jean-Marc Valin
+   Copyright (C) 2007-2008 CSIRO */
+/**
+   @file fixed_generic_mipsr1.h
+   @brief MIPS-specific overrides of the generic fixed-point operations
+*/
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef CELT_FIXED_GENERIC_MIPSR1_H
+#define CELT_FIXED_GENERIC_MIPSR1_H
+/* Returns (a*b + c*d) >> 15, summed in the DSP accumulator $ac1.
+   NOTE(review): the three asm statements hand the value over through $ac1,
+   which is neither an operand nor a clobber, so correctness relies on the
+   compiler keeping them back to back - confirm against the GCC asm rules. */
+#undef MULT16_32_Q15_ADD
+static inline int MULT16_32_Q15_ADD(int a, int b, int c, int d) {
+    int m;
+    asm volatile("MULT $ac1, %0, %1" : : "r" ((int)a), "r" ((int)b));
+    asm volatile("madd $ac1, %0, %1" : : "r" ((int)c), "r" ((int)d));
+    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (m): "i" (15));
+    return m;
+}
+/* Returns (a*b - c*d) >> 15: MULT loads a*b into $ac1, msub subtracts c*d,
+   EXTR.W reads the result back shifted right by 15. */
+#undef MULT16_32_Q15_SUB
+static inline int MULT16_32_Q15_SUB(int a, int b, int c, int d) {
+    int m;
+    asm volatile("MULT $ac1, %0, %1" : : "r" ((int)a), "r" ((int)b));
+    asm volatile("msub $ac1, %0, %1" : : "r" ((int)c), "r" ((int)d));
+    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (m): "i" (15));
+    return m;
+}
+/* Returns (a*b + c*d) >> 15; the instruction sequence is the same as in
+   MULT16_32_Q15_ADD. */
+#undef MULT16_16_Q15_ADD
+static inline int MULT16_16_Q15_ADD(int a, int b, int c, int d) {
+    int m;
+    asm volatile("MULT $ac1, %0, %1" : : "r" ((int)a), "r" ((int)b));
+    asm volatile("madd $ac1, %0, %1" : : "r" ((int)c), "r" ((int)d));
+    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (m): "i" (15));
+    return m;
+}
+/* Returns (a*b - c*d) >> 15; the instruction sequence is the same as in
+   MULT16_32_Q15_SUB. */
+#undef MULT16_16_Q15_SUB
+static inline int MULT16_16_Q15_SUB(int a, int b, int c, int d) {
+    int m;
+    asm volatile("MULT $ac1, %0, %1" : : "r" ((int)a), "r" ((int)b));
+    asm volatile("msub $ac1, %0, %1" : : "r" ((int)c), "r" ((int)d));
+    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (m): "i" (15));
+    return m;
+}
+
+/* Returns (a*b) >> 16, with the product formed in $ac1 and read back by
+   EXTR.W (no rounding). */
+#undef MULT16_32_Q16
+static inline int MULT16_32_Q16(int a, int b)
+{
+    int c;
+    asm volatile("MULT $ac1,%0, %1" : : "r" (a), "r" (b));
+    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (c): "i" (16));
+    return c;
+}
+/* Returns a*b shifted right by 16 with rounding: same as MULT16_32_Q16 but
+   read back with EXTR_R.W, the rounding form of the extract. */
+#undef MULT16_32_P16
+static inline int MULT16_32_P16(int a, int b)
+{
+    int c;
+    asm volatile("MULT $ac1, %0, %1" : : "r" (a), "r" (b));
+    asm volatile("EXTR_R.W %0,$ac1, %1" : "=r" (c): "i" (16));
+    return c;
+}
+/* Returns (a*b) >> 15, with the product formed in $ac1 and read back by
+   EXTR.W (no rounding). */
+#undef MULT16_32_Q15
+static inline int MULT16_32_Q15(int a, int b)
+{
+    int c;
+    asm volatile("MULT $ac1, %0, %1" : : "r" (a), "r" (b));
+    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (c): "i" (15));
+    return c;
+}
+/* Returns (a*b) >> 31, with the product formed in $ac1 and read back by
+   EXTR.W (no rounding). */
+#undef MULT32_32_Q31
+static inline int MULT32_32_Q31(int a, int b)
+{
+    int r;
+    asm volatile("MULT $ac1, %0, %1" : : "r" (a), "r" (b));
+    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (r): "i" (31));
+    return r;
+}
+/* Rounding arithmetic right shift of a by `shift` bits, done with a single
+   SHRAV_R.W (the shift amount comes from a register). */
+#undef PSHR32
+static inline int PSHR32(int a, int shift)
+{
+    int r;
+    asm volatile ("SHRAV_R.W %0, %1, %2" :"=r" (r): "r" (a), "r" (shift));
+    return r;
+}
+/* Returns a*b shifted right by 15 with rounding: a plain `mul` followed by
+   SHRA_R.W applied in place to the product.
+   NOTE(review): the second asm marks r as "+r" and also passes it through a
+   matching "0" input - confirm that this constraint pairing is intended. */
+#undef MULT16_16_P15
+static inline int MULT16_16_P15(int a, int b)
+{
+    int r;
+    asm volatile ("mul %0, %1, %2" :"=r" (r): "r" (a), "r" (b));
+    asm volatile ("SHRA_R.W %0, %1, %2" : "+r" (r):  "0" (r), "i"(15));
+    return r;
+}
+
+#endif /* CELT_FIXED_GENERIC_MIPSR1_H */
--- a/celt/mips/kiss_fft_mipsr1.h
+++ b/celt/mips/kiss_fft_mipsr1.h
+/*Copyright (c) 2013, Xiph.Org Foundation and contributors.
+
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+       this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright notice,
+       this list of conditions and the following disclaimer in the
+       documentation and/or other materials provided with the distribution.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  POSSIBILITY OF SUCH DAMAGE.*/
+
+#ifndef KISS_FFT_MIPSR1_H
+#define KISS_FFT_MIPSR1_H
+
+#if !defined(KISS_FFT_GUTS_H)
+#error "This file should only be included from _kiss_fft_guts.h"
+#endif
+
+#ifdef FIXED_POINT
+/* Sum/difference-of-products helpers for the fixed-point FFT code.
+   The two macros give the plain form built on S_MUL. S_MUL_ADD is #undef'd
+   right below (and S_MUL_SUB further down) so that an inline function using
+   DSP accumulator $ac1 can take over the name.
+   NOTE(review): the asm statements pass their intermediate result through
+   $ac1 without declaring it as an operand or clobber -- confirm that the
+   compiler cannot use $ac1 between them. */
+#define S_MUL_ADD(a, b, c, d) (S_MUL(a,b)+S_MUL(c,d))
+#define S_MUL_SUB(a, b, c, d) (S_MUL(a,b)-S_MUL(c,d))
+
+#undef S_MUL_ADD
+static inline int S_MUL_ADD(int a, int b, int c, int d) { /* returns (a*b + c*d) >> 15 */
+    int m;
+    asm volatile("MULT $ac1, %0, %1" : : "r" ((int)a), "r" ((int)b)); /* $ac1  = a*b */
+    asm volatile("madd $ac1, %0, %1" : : "r" ((int)c), "r" ((int)d)); /* $ac1 += c*d */
+    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (m): "i" (15)); /* m = $ac1 >> 15, no rounding */
+    return m;
+}
+/* Inline replacement for the S_MUL_SUB macro: returns (a*b - c*d) >> 15.
+   The 64-bit difference of products is built in DSP accumulator $ac1 and
+   extracted once with EXTR.W (shift by 15, no rounding).
+   NOTE(review): $ac1 is not declared as an operand or clobber of these asm
+   statements -- confirm the compiler leaves it untouched between them. */
+#undef S_MUL_SUB
+static inline int S_MUL_SUB(int a, int b, int c, int d) {
+    int m;
+    asm volatile("MULT $ac1, %0, %1" : : "r" ((int)a), "r" ((int)b)); /* $ac1  = a*b */
+    asm volatile("msub $ac1, %0, %1" : : "r" ((int)c), "r" ((int)d)); /* $ac1 -= c*d */
+    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (m): "i" (15)); /* m = $ac1 >> 15 */
+    return m;
+}
+/* Complex multiply m = a * b for the fixed-point FFT. C_MUL is redefined
+   to assign the result of C_MUL_fun, which computes
+      m.r = (a.r*b.r - a.i*b.i) >> 15
+      m.i = (a.r*b.i + a.i*b.r) >> 15
+   with each sum of products held in DSP accumulator $ac1.
+   NOTE(review): $ac1 is used across separate asm statements without being
+   declared as a clobber -- confirm this is safe with the build flags. */
+#undef C_MUL
+#   define C_MUL(m,a,b) (m=C_MUL_fun(a,b))
+static inline kiss_fft_cpx C_MUL_fun(kiss_fft_cpx a, kiss_twiddle_cpx b) {
+    kiss_fft_cpx m;
+    /* Real part first, then the accumulator is reused for the imaginary part. */
+    asm volatile("MULT $ac1, %0, %1" : : "r" ((int)a.r), "r" ((int)b.r));
+    asm volatile("msub $ac1, %0, %1" : : "r" ((int)a.i), "r" ((int)b.i));
+    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (m.r): "i" (15));
+    asm volatile("MULT $ac1, %0, %1" : : "r" ((int)a.r), "r" ((int)b.i));
+    asm volatile("madd $ac1, %0, %1" : : "r" ((int)a.i), "r" ((int)b.r));
+    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (m.i): "i" (15));
+
+    return m;
+}
+#undef C_MULC /* Complex multiply by the conjugate of b, m = a * conj(b):
+      m.r = (a.r*b.r + a.i*b.i) >> 15
+      m.i = (a.i*b.r - a.r*b.i) >> 15
+   C_MULC is redefined to assign the result of C_MULC_fun; each sum of
+   products is built in DSP accumulator $ac1 and extracted with EXTR.W.
+   NOTE(review): $ac1 is not declared as a clobber -- confirm this is safe. */
+#   define C_MULC(m,a,b) (m=C_MULC_fun(a,b))
+static inline kiss_fft_cpx C_MULC_fun(kiss_fft_cpx a, kiss_twiddle_cpx b) {
+    kiss_fft_cpx m;
+    /* Real part first, then the accumulator is reused for the imaginary part. */
+    asm volatile("MULT $ac1, %0, %1" : : "r" ((int)a.r), "r" ((int)b.r));
+    asm volatile("madd $ac1, %0, %1" : : "r" ((int)a.i), "r" ((int)b.i));
+    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (m.r): "i" (15));
+    asm volatile("MULT $ac1, %0, %1" : : "r" ((int)a.i), "r" ((int)b.r));
+    asm volatile("msub $ac1, %0, %1" : : "r" ((int)a.r), "r" ((int)b.i));
+    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (m.i): "i" (15));
+
+    return m;
+}
+
+#endif /* FIXED_POINT */
+/* MIPS version of the radix-5 FFT butterfly, computed in place on Fout.
+     Fout    - complex buffer that is both read and overwritten
+     fstride - step through st->twiddles between consecutive twiddles
+     st      - FFT state; only its twiddles table is read here
+     m       - number of butterflies per group (the five legs are m apart)
+     N       - number of groups
+     mm      - distance, in complex samples, between the starts of two groups
+   The OVERRIDE_kf_bfly5 define presumably tells the generic FFT code to
+   skip its own kf_bfly5 -- confirm in the including file.
+   NOTE(review): S_MUL_ADD/S_MUL_SUB are defined in this header only under
+   FIXED_POINT but are used here unconditionally -- confirm that float
+   builds get them from somewhere else. */
+#define OVERRIDE_kf_bfly5
+static void kf_bfly5(
+                     kiss_fft_cpx * Fout,
+                     const size_t fstride,
+                     const kiss_fft_state *st,
+                     int m,
+                     int N,
+                     int mm
+                    )
+{
+   kiss_fft_cpx *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
+   int i, u;
+   kiss_fft_cpx scratch[13];
+
+   const kiss_twiddle_cpx *tw;
+   kiss_twiddle_cpx ya,yb;
+   kiss_fft_cpx * Fout_beg = Fout;
+   /* ya/yb: in fixed point these are hard-coded Q15 values, numerically
+      ya ~= (cos, -sin)(2*pi/5) and yb ~= (cos, -sin)(4*pi/5); otherwise
+      they are read from the twiddle table. */
+#ifdef FIXED_POINT
+   ya.r = 10126;
+   ya.i = -31164;
+   yb.r = -26510;
+   yb.i = -19261;
+#else
+   ya = st->twiddles[fstride*m];
+   yb = st->twiddles[fstride*2*m];
+#endif
+
+   tw=st->twiddles;
+
+   for (i=0;i<N;i++)
+   {
+      Fout = Fout_beg + i*mm;
+      Fout0=Fout;
+      Fout1=Fout0+m;
+      Fout2=Fout0+2*m;
+      Fout3=Fout0+3*m;
+      Fout4=Fout0+4*m;
+
+      /* For non-custom modes, m is guaranteed to be a multiple of 4. */
+      for ( u=0; u<m; ++u ) {
+         scratch[0] = *Fout0;
+         /* Twiddle legs 1..4; leg 0 is used as is. */
+
+         C_MUL(scratch[1] ,*Fout1, tw[u*fstride]);
+         C_MUL(scratch[2] ,*Fout2, tw[2*u*fstride]);
+         C_MUL(scratch[3] ,*Fout3, tw[3*u*fstride]);
+         C_MUL(scratch[4] ,*Fout4, tw[4*u*fstride]);
+         /* Sums and differences of the leg pairs (1,4) and (2,3). */
+         C_ADD( scratch[7],scratch[1],scratch[4]);
+         C_SUB( scratch[10],scratch[1],scratch[4]);
+         C_ADD( scratch[8],scratch[2],scratch[3]);
+         C_SUB( scratch[9],scratch[2],scratch[3]);
+         /* Output 0: leg 0 plus the four twiddled legs. */
+         Fout0->r += scratch[7].r + scratch[8].r;
+         Fout0->i += scratch[7].i + scratch[8].i;
+         scratch[5].r = scratch[0].r + S_MUL_ADD(scratch[7].r,ya.r,scratch[8].r,yb.r);
+         scratch[5].i = scratch[0].i + S_MUL_ADD(scratch[7].i,ya.r,scratch[8].i,yb.r);
+
+         scratch[6].r =  S_MUL_ADD(scratch[10].i,ya.i,scratch[9].i,yb.i);
+         scratch[6].i =  -S_MUL_ADD(scratch[10].r,ya.i,scratch[9].r,yb.i);
+         /* Outputs 1 and 4 share scratch[5] and differ by the sign of scratch[6]. */
+         C_SUB(*Fout1,scratch[5],scratch[6]);
+         C_ADD(*Fout4,scratch[5],scratch[6]);
+
+         scratch[11].r = scratch[0].r + S_MUL_ADD(scratch[7].r,yb.r,scratch[8].r,ya.r);
+         scratch[11].i = scratch[0].i + S_MUL_ADD(scratch[7].i,yb.r,scratch[8].i,ya.r);
+
+         scratch[12].r =  S_MUL_SUB(scratch[9].i,ya.i,scratch[10].i,yb.i);
+         scratch[12].i =  S_MUL_SUB(scratch[10].r,yb.i,scratch[9].r,ya.i);
+         /* Outputs 2 and 3 share scratch[11] and differ by the sign of scratch[12]. */
+         C_ADD(*Fout2,scratch[11],scratch[12]);
+         C_SUB(*Fout3,scratch[11],scratch[12]);
+
+         ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
+      }
+   }
+}
+
+
+#endif /* KISS_FFT_MIPSR1_H */
--- a/libcelt/mdct.c
+++ b/libcelt/mdct.c
@@ -5,19 +5,19 @@
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:
-   
+
   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
-   
+
   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.
-   
+
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
-   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
@@ -29,15 +29,17 @@
 /* This is a simple MDCT implementation that uses a N/4 complex FFT
   to do most of the work. It should be relatively straightforward to
   plug in pretty much and FFT here.
-   
-   This replaces the Vorbis FFT (and uses the exact same API), which 
-   was a bit too messy and that was ending up duplicating code 
+
+   This replaces the Vorbis FFT (and uses the exact same API), which
+   was a bit too messy and that was ending up duplicating code
   (might as well use the same FFT everywhere).
-   
+
   The algorithm is similar to (and inspired from) Fabrice Bellard's
   MDCT implementation in FFMPEG, but has differences in signs, ordering
-   and scaling in many places. 
+   and scaling in many places.
 */
+#ifndef MDCT_MIPSR1_H__
+#define MDCT_MIPSR1_H__

 #ifndef SKIP_CONFIG_H
 #ifdef HAVE_CONFIG_H
@@ -53,88 +55,56 @@
 #include "mathops.h"
 #include "stack_alloc.h"

-#ifndef M_PI
-#define M_PI 3.141592653
-#endif
-
-#ifdef CUSTOM_MODES
-
-void clt_mdct_init(mdct_lookup *l,int N, int maxshift)
+/* Forward MDCT trashes the input array */
+#define OVERRIDE_clt_mdct_forward
+void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
+      const opus_val16 *window, int overlap, int shift, int stride, int arch)
 {
   int i;
-   int N4, N2;
-   kiss_twiddle_scalar *trig;
-   l->n = N;
-   N2 = N>>1;
-   N4 = N>>2;
-   l->maxshift = maxshift;
-   for (i=0;i<=maxshift;i++)
-   {
-      if (i==0)
-         l->kfft[i] = kiss_fft_alloc(N>>2>>i, 0, 0);
-      else
-         l->kfft[i] = kiss_fft_alloc_twiddles(N>>2>>i, 0, 0, l->kfft[0]);
-#ifndef ENABLE_TI_DSPLIB55
-      if (l->kfft[i]==NULL)
-         return;
-#endif
-   }
-   l->trig = trig = (kiss_twiddle_scalar*)celt_alloc((N4+1)*sizeof(kiss_twiddle_scalar));
-   if (l->trig==NULL)
-     return;
-   /* We have enough points that sine isn't necessary */
-#if defined(FIXED_POINT)
-   for (i=0;i<=N4;i++)
-      trig[i] = TRIG_UPSCALE*celt_cos_norm(DIV32(ADD32(SHL32(EXTEND32(i),17),N2),N));
-#else
-   for (i=0;i<=N4;i++)
-      trig[i] = (kiss_twiddle_scalar)cos(2*M_PI*i/N);
+   int N, N2, N4;
+   VARDECL(kiss_fft_scalar, f);
+   VARDECL(kiss_fft_cpx, f2);
+   const kiss_fft_state *st = l->kfft[shift];
+   const kiss_twiddle_scalar *trig;
+   opus_val16 scale;
+#ifdef FIXED_POINT
+   /* Allows us to scale with MULT16_32_Q16(), which is faster than
+      MULT16_32_Q15() on ARM. */
+   int scale_shift = st->scale_shift-1;
 #endif
-}
-
-void clt_mdct_clear(mdct_lookup *l)
-{
-   int i;
-   for (i=0;i<=l->maxshift;i++)
-      kiss_fft_free(l->kfft[i]);
-   celt_free((kiss_twiddle_scalar*)l->trig);
-}

-#endif /* CUSTOM_MODES */
+    (void)arch;

-void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * restrict out, const celt_word16 *window, int overlap, int shift)
-{
-   int i;
-   int N, N2, N4;
-   kiss_twiddle_scalar sine;
-   VARDECL(kiss_fft_scalar, f);
   SAVE_STACK;
+   scale = st->scale;
+
   N = l->n;
-   N >>= shift;
+   trig = l->trig;
+   for (i=0;i<shift;i++)
+   {
+      N >>= 1;
+      trig += N;
+   }
   N2 = N>>1;
   N4 = N>>2;
+
   ALLOC(f, N2, kiss_fft_scalar);
-   /* sin(x) ~= x here */
-#ifdef FIXED_POINT
-   sine = TRIG_UPSCALE*(QCONST16(0.7853981f, 15)+N2)/N;
-#else
-   sine = (kiss_twiddle_scalar)2*M_PI*(.125f)/N;
-#endif
+   ALLOC(f2, N4, kiss_fft_cpx);

   /* Consider the input to be composed of four blocks: [a, b, c, d] */
   /* Window, shuffle, fold */
   {
      /* Temp pointers to make it really clear to the compiler what we're doing */
-      const kiss_fft_scalar * restrict xp1 = in+(overlap>>1);
-      const kiss_fft_scalar * restrict xp2 = in+N2-1+(overlap>>1);
-      kiss_fft_scalar * restrict yp = out;
-      const celt_word16 * restrict wp1 = window+(overlap>>1);
-      const celt_word16 * restrict wp2 = window+(overlap>>1)-1;
-      for(i=0;i<(overlap>>2);i++)
+      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1);
+      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1);
+      kiss_fft_scalar * OPUS_RESTRICT yp = f;
+      const opus_val16 * OPUS_RESTRICT wp1 = window+(overlap>>1);
+      const opus_val16 * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
+      for(i=0;i<((overlap+3)>>2);i++)
      {
         /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
-         *yp++ = MULT16_32_Q15(*wp2, xp1[N2]) + MULT16_32_Q15(*wp1,*xp2);
-         *yp++ = MULT16_32_Q15(*wp1, *xp1)    - MULT16_32_Q15(*wp2, xp2[-N2]);
+          *yp++ = S_MUL_ADD(*wp2, xp1[N2],*wp1,*xp2);
+          *yp++ = S_MUL_SUB(*wp1, *xp1,*wp2, xp2[-N2]);
         xp1+=2;
         xp2-=2;
         wp1+=2;
@@ -142,7 +112,7 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar
      }
      wp1 = window;
      wp2 = window+overlap-1;
-      for(;i<N4-(overlap>>2);i++)
+      for(;i<N4-((overlap+3)>>2);i++)
      {
         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
         *yp++ = *xp2;
@@ -153,8 +123,8 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar
      for(;i<N4;i++)
      {
         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
-         *yp++ =  -MULT16_32_Q15(*wp1, xp1[-N2]) + MULT16_32_Q15(*wp2, *xp2);
-         *yp++ = MULT16_32_Q15(*wp2, *xp1)     + MULT16_32_Q15(*wp1, xp2[N2]);
+          *yp++ =  S_MUL_SUB(*wp2, *xp2, *wp1, xp1[-N2]);
+          *yp++ = S_MUL_ADD(*wp2, *xp1, *wp1, xp2[N2]);
         xp1+=2;
         xp2-=2;
         wp1+=2;
@@ -163,171 +133,156 @@ void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar
   }
   /* Pre-rotation */
   {
-      kiss_fft_scalar * restrict yp = out;
-      const kiss_twiddle_scalar *t = &l->trig[0];
+      kiss_fft_scalar * OPUS_RESTRICT yp = f;
+      const kiss_twiddle_scalar *t = &trig[0];
      for(i=0;i<N4;i++)
      {
+         kiss_fft_cpx yc;
+         kiss_twiddle_scalar t0, t1;
         kiss_fft_scalar re, im, yr, yi;
-         re = yp[0];
-         im = yp[1];
-         yr = -S_MUL(re,t[i<<shift])  -  S_MUL(im,t[(N4-i)<<shift]);
-         yi = -S_MUL(im,t[i<<shift])  +  S_MUL(re,t[(N4-i)<<shift]);
-         /* works because the cos is nearly one */
-         *yp++ = yr + S_MUL(yi,sine);
-         *yp++ = yi - S_MUL(yr,sine);
+         t0 = t[i];
+         t1 = t[N4+i];
+         re = *yp++;
+         im = *yp++;
+
+         yr = S_MUL_SUB(re,t0,im,t1);
+         yi = S_MUL_ADD(im,t0,re,t1);
+
+         yc.r = yr;
+         yc.i = yi;
+         yc.r = PSHR32(MULT16_32_Q16(scale, yc.r), scale_shift);
+         yc.i = PSHR32(MULT16_32_Q16(scale, yc.i), scale_shift);
+         f2[st->bitrev[i]] = yc;
      }
   }

-   /* N/4 complex FFT, down-scales by 4/N */
-   kiss_fft(l->kfft[shift], (kiss_fft_cpx *)out, (kiss_fft_cpx *)f);
+   /* N/4 complex FFT, does not downscale anymore */
+   opus_fft_impl(st, f2);

   /* Post-rotate */
   {
      /* Temp pointers to make it really clear to the compiler what we're doing */
-      const kiss_fft_scalar * restrict fp = f;
-      kiss_fft_scalar * restrict yp1 = out;
-      kiss_fft_scalar * restrict yp2 = out+N2-1;
-      const kiss_twiddle_scalar *t = &l->trig[0];
+      const kiss_fft_cpx * OPUS_RESTRICT fp = f2;
+      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
+      kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
+      const kiss_twiddle_scalar *t = &trig[0];
      /* Temp pointers to make it really clear to the compiler what we're doing */
      for(i=0;i<N4;i++)
      {
         kiss_fft_scalar yr, yi;
-         yr = S_MUL(fp[1],t[(N4-i)<<shift]) + S_MUL(fp[0],t[i<<shift]);
-         yi = S_MUL(fp[0],t[(N4-i)<<shift]) - S_MUL(fp[1],t[i<<shift]);
-         /* works because the cos is nearly one */
-         *yp1 = yr - S_MUL(yi,sine);
-         *yp2 = yi + S_MUL(yr,sine);;
-         fp += 2;
-         yp1 += 2;
-         yp2 -= 2;
+         yr = S_MUL_SUB(fp->i,t[N4+i] , fp->r,t[i]);
+         yi = S_MUL_ADD(fp->r,t[N4+i] ,fp->i,t[i]);
+         *yp1 = yr;
+         *yp2 = yi;
+         fp++;
+         yp1 += 2*stride;
+         yp2 -= 2*stride;
      }
   }
   RESTORE_STACK;
 }

-
-void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * restrict out, const celt_word16 * restrict window, int overlap, int shift)
+#define OVERRIDE_clt_mdct_backward
+void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
+      const opus_val16 * OPUS_RESTRICT window, int overlap, int shift, int stride, int arch)
 {
   int i;
   int N, N2, N4;
-   kiss_twiddle_scalar sine;
-   VARDECL(kiss_fft_scalar, f);
-   VARDECL(kiss_fft_scalar, f2);
-   SAVE_STACK;
+   const kiss_twiddle_scalar *trig;
+
+    (void)arch;
+
   N = l->n;
-   N >>= shift;
+   trig = l->trig;
+   for (i=0;i<shift;i++)
+   {
+      N >>= 1;
+      trig += N;
+   }
   N2 = N>>1;
   N4 = N>>2;
-   ALLOC(f, N2, kiss_fft_scalar);
-   ALLOC(f2, N2, kiss_fft_scalar);
-   /* sin(x) ~= x here */
-#ifdef FIXED_POINT
-   sine = TRIG_UPSCALE*(QCONST16(0.7853981f, 15)+N2)/N;
-#else
-   sine = (kiss_twiddle_scalar)2*M_PI*(.125f)/N;
-#endif
-   
+
   /* Pre-rotate */
   {
      /* Temp pointers to make it really clear to the compiler what we're doing */
-      const kiss_fft_scalar * restrict xp1 = in;
-      const kiss_fft_scalar * restrict xp2 = in+N2-1;
-      kiss_fft_scalar * restrict yp = f2;
-      const kiss_twiddle_scalar *t = &l->trig[0];
-      for(i=0;i<N4;i++) 
+      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
+      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
+      kiss_fft_scalar * OPUS_RESTRICT yp = out+(overlap>>1);
+      const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0];
+      const opus_int16 * OPUS_RESTRICT bitrev = l->kfft[shift]->bitrev;
+      for(i=0;i<N4;i++)
      {
+         int rev;
         kiss_fft_scalar yr, yi;
-         yr = -S_MUL(*xp2, t[i<<shift]) + S_MUL(*xp1,t[(N4-i)<<shift]);
-         yi =  -S_MUL(*xp2, t[(N4-i)<<shift]) - S_MUL(*xp1,t[i<<shift]);
-         /* works because the cos is nearly one */
-         *yp++ = yr - S_MUL(yi,sine);
-         *yp++ = yi + S_MUL(yr,sine);
-         xp1+=2;
-         xp2-=2;
+         rev = *bitrev++;
+         yr = S_MUL_ADD(*xp2, t[i] , *xp1, t[N4+i]);
+         yi = S_MUL_SUB(*xp1, t[i] , *xp2, t[N4+i]);
+         /* We swap real and imag because we use an FFT instead of an IFFT. */
+         yp[2*rev+1] = yr;
+         yp[2*rev] = yi;
+         /* Storing the pre-rotation directly in the bitrev order. */
+         xp1+=2*stride;
+         xp2-=2*stride;
      }
   }

-   /* Inverse N/4 complex FFT. This one should *not* downscale even in fixed-point */
-   kiss_ifft(l->kfft[shift], (kiss_fft_cpx *)f2, (kiss_fft_cpx *)f);
-   
-   /* Post-rotate */
-   {
-      kiss_fft_scalar * restrict fp = f;
-      const kiss_twiddle_scalar *t = &l->trig[0];
+   opus_fft_impl(l->kfft[shift], (kiss_fft_cpx*)(out+(overlap>>1)));

-      for(i=0;i<N4;i++)
+   /* Post-rotate and de-shuffle from both ends of the buffer at once to make
+      it in-place. */
+   {
+      kiss_fft_scalar * OPUS_RESTRICT yp0 = out+(overlap>>1);
+      kiss_fft_scalar * OPUS_RESTRICT yp1 = out+(overlap>>1)+N2-2;
+      const kiss_twiddle_scalar *t = &trig[0];
+      /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the
+         middle pair will be computed twice. */
+      for(i=0;i<(N4+1)>>1;i++)
      {
         kiss_fft_scalar re, im, yr, yi;
-         re = fp[0];
-         im = fp[1];
+         kiss_twiddle_scalar t0, t1;
+         /* We swap real and imag because we're using an FFT instead of an IFFT. */
+         re = yp0[1];
+         im = yp0[0];
+         t0 = t[i];
+         t1 = t[N4+i];
         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
-         yr = S_MUL(re,t[i<<shift]) - S_MUL(im,t[(N4-i)<<shift]);
-         yi = S_MUL(im,t[i<<shift]) + S_MUL(re,t[(N4-i)<<shift]);
-         /* works because the cos is nearly one */
-         *fp++ = yr - S_MUL(yi,sine);
-         *fp++ = yi + S_MUL(yr,sine);
-      }
-   }
-   /* De-shuffle the components for the middle of the window only */
-   {
-      const kiss_fft_scalar * restrict fp1 = f;
-      const kiss_fft_scalar * restrict fp2 = f+N2-1;
-      kiss_fft_scalar * restrict yp = f2;
-      for(i = 0; i < N4; i++)
-      {
-         *yp++ =-*fp1;
-         *yp++ = *fp2;
-         fp1 += 2;
-         fp2 -= 2;
+         yr = S_MUL_ADD(re,t0 , im,t1);
+         yi = S_MUL_SUB(re,t1 , im,t0);
+         /* We swap real and imag because we're using an FFT instead of an IFFT. */
+         re = yp1[1];
+         im = yp1[0];
+         yp0[0] = yr;
+         yp1[1] = yi;
+
+         t0 = t[(N4-i-1)];
+         t1 = t[(N2-i-1)];
+         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
+         yr = S_MUL_ADD(re,t0,im,t1);
+         yi = S_MUL_SUB(re,t1,im,t0);
+         yp1[0] = yr;
+         yp0[1] = yi;
+         yp0 += 2;
+         yp1 -= 2;
      }
   }
-   out -= (N2-overlap)>>1;
+
   /* Mirror on both sides for TDAC */
   {
-      kiss_fft_scalar * restrict fp1 = f2+N4-1;
-      kiss_fft_scalar * restrict xp1 = out+N2-1;
-      kiss_fft_scalar * restrict yp1 = out+N4-overlap/2;
-      const celt_word16 * restrict wp1 = window;
-      const celt_word16 * restrict wp2 = window+overlap-1;
-      for(i = 0; i< N4-overlap/2; i++)
-      {
-         *xp1 = *fp1;
-         xp1--;
-         fp1--;
-      }
-      for(; i < N4; i++)
-      {
-         kiss_fft_scalar x1;
-         x1 = *fp1--;
-         *yp1++ +=-MULT16_32_Q15(*wp1, x1);
-         *xp1-- += MULT16_32_Q15(*wp2, x1);
-         wp1++;
-         wp2--;
-      }
-   }
-   {
-      kiss_fft_scalar * restrict fp2 = f2+N4;
-      kiss_fft_scalar * restrict xp2 = out+N2;
-      kiss_fft_scalar * restrict yp2 = out+N-1-(N4-overlap/2);
-      const celt_word16 * restrict wp1 = window;
-      const celt_word16 * restrict wp2 = window+overlap-1;
-      for(i = 0; i< N4-overlap/2; i++)
-      {
-         *xp2 = *fp2;
-         xp2++;
-         fp2++;
-      }
-      for(; i < N4; i++)
+      kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1;
+      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
+      const opus_val16 * OPUS_RESTRICT wp1 = window;
+      const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1;
+
+      for(i = 0; i < overlap/2; i++)
      {
-         kiss_fft_scalar x2;
-         x2 = *fp2++;
-         *yp2--  = MULT16_32_Q15(*wp1, x2);
-         *xp2++  = MULT16_32_Q15(*wp2, x2);
+         kiss_fft_scalar x1, x2;
+         x1 = *xp1;
+         x2 = *yp1;
+         *yp1++ = MULT16_32_Q15(*wp2, x2) - MULT16_32_Q15(*wp1, x1);
+         *xp1-- = MULT16_32_Q15(*wp1, x2) + MULT16_32_Q15(*wp2, x1);
         wp1++;
         wp2--;
      }
   }
-   RESTORE_STACK;
 }
-
-
+#endif /* MDCT_MIPSR1_H__ */
--- a/celt/mips/pitch_mipsr1.h
+++ b/celt/mips/pitch_mipsr1.h
+/* Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2009 Xiph.Org Foundation
+   Written by Jean-Marc Valin */
+/**
+   @file pitch.h
+   @brief Pitch analysis
+ */
+
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef PITCH_MIPSR1_H
+#define PITCH_MIPSR1_H
+
+#define OVERRIDE_DUAL_INNER_PROD
+/* Computes two inner products that share the same left operand in one pass:
+      *xy1 = sum over j<N of x[j]*y01[j]
+      *xy2 = sum over j<N of x[j]*y02[j]
+   The sums are accumulated in DSP accumulators $ac1 and $ac2; only the low
+   32 bits of each (mflo) are stored. `arch` is unused.
+   The main loop is unrolled by two and a guarded tail step handles an odd N,
+   so no element at index N or beyond is ever read. For even N the sequence
+   of multiply-accumulates is the same as a plain two-per-iteration loop. */
+static inline void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+      int N, opus_val32 *xy1, opus_val32 *xy2, int arch)
+{
+   int j;
+   opus_val32 xy01=0;
+   opus_val32 xy02=0;
+
+   (void)arch;
+
+   /* Clear both accumulators (0*0). */
+   asm volatile("MULT $ac1, $0, $0");
+   asm volatile("MULT $ac2, $0, $0");
+   /* Two samples per iteration while a full pair remains. */
+   for (j=0;j+1<N;j+=2)
+   {
+      asm volatile("MADD $ac1, %0, %1" : : "r" ((int)x[j]), "r" ((int)y01[j]));
+      asm volatile("MADD $ac2, %0, %1" : : "r" ((int)x[j]), "r" ((int)y02[j]));
+      asm volatile("MADD $ac1, %0, %1" : : "r" ((int)x[j+1]), "r" ((int)y01[j+1]));
+      asm volatile("MADD $ac2, %0, %1" : : "r" ((int)x[j+1]), "r" ((int)y02[j+1]));
+   }
+   /* Leftover sample when N is odd. */
+   if (j<N)
+   {
+      asm volatile("MADD $ac1, %0, %1" : : "r" ((int)x[j]), "r" ((int)y01[j]));
+      asm volatile("MADD $ac2, %0, %1" : : "r" ((int)x[j]), "r" ((int)y02[j]));
+   }
+   asm volatile ("mflo %0, $ac1": "=r"(xy01));
+   asm volatile ("mflo %0, $ac2": "=r"(xy02));
+   *xy1 = xy01;
+   *xy2 = xy02;
+}
+/* Accumulates four cross-correlations at consecutive lags in one pass:
+      sum[k] += sum over j<len of x[j]*y[j+k],   k = 0..3
+   The running sums are kept in 64-bit variables updated with
+   __builtin_mips_madd and truncated back to opus_val32 on exit.
+   Three elements of y are read up front and one more per x sample, so
+   y[0..len+2] is read in total. */
+static inline void xcorr_kernel_mips(const opus_val16 * x,
+      const opus_val16 * y, opus_val32 sum[4], int len)
+{
+   int j;
+   opus_val16 y_0, y_1, y_2, y_3;
+
+    opus_int64 sum_0, sum_1, sum_2, sum_3;
+    sum_0 =  (opus_int64)sum[0];
+    sum_1 =  (opus_int64)sum[1];
+    sum_2 =  (opus_int64)sum[2];
+    sum_3 =  (opus_int64)sum[3];
+
+    y_3=0; /* gcc doesn't realize that y_3 can't be used uninitialized */
+    y_0=*y++;
+    y_1=*y++;
+    y_2=*y++;
+    for (j=0;j<len-3;j+=4) /* four x samples per iteration */
+    {
+        opus_val16 tmp;
+        tmp = *x++;
+        y_3=*y++;
+        /* y_0..y_3 act as a rotating window: each step loads one new y
+           sample into the slot that has just become the oldest. */
+        sum_0 = __builtin_mips_madd( sum_0, tmp, y_0);
+        sum_1 = __builtin_mips_madd( sum_1, tmp, y_1);
+        sum_2 = __builtin_mips_madd( sum_2, tmp, y_2);
+        sum_3 = __builtin_mips_madd( sum_3, tmp, y_3);
+
+        tmp=*x++;
+        y_0=*y++;
+
+        sum_0 = __builtin_mips_madd( sum_0, tmp, y_1 );
+        sum_1 = __builtin_mips_madd( sum_1, tmp, y_2 );
+        sum_2 = __builtin_mips_madd( sum_2, tmp, y_3);
+        sum_3 = __builtin_mips_madd( sum_3, tmp, y_0);
+
+       tmp=*x++;
+       y_1=*y++;
+
+       sum_0 = __builtin_mips_madd( sum_0, tmp, y_2 );
+       sum_1 = __builtin_mips_madd( sum_1, tmp, y_3 );
+       sum_2 = __builtin_mips_madd( sum_2, tmp, y_0);
+       sum_3 = __builtin_mips_madd( sum_3, tmp, y_1);
+
+
+      tmp=*x++;
+      y_2=*y++;
+
+       sum_0 = __builtin_mips_madd( sum_0, tmp, y_3 );
+       sum_1 = __builtin_mips_madd( sum_1, tmp, y_0 );
+       sum_2 = __builtin_mips_madd( sum_2, tmp, y_1);
+       sum_3 = __builtin_mips_madd( sum_3, tmp, y_2);
+
+   }
+   if (j++<len) /* tail: up to three leftover samples, same rotation as above */
+   {
+      opus_val16 tmp = *x++;
+      y_3=*y++;
+
+       sum_0 = __builtin_mips_madd( sum_0, tmp, y_0 );
+       sum_1 = __builtin_mips_madd( sum_1, tmp, y_1 );
+       sum_2 = __builtin_mips_madd( sum_2, tmp, y_2);
+       sum_3 = __builtin_mips_madd( sum_3, tmp, y_3);
+   }
+
+   if (j++<len)
+   {
+      opus_val16 tmp=*x++;
+      y_0=*y++;
+
+      sum_0 = __builtin_mips_madd( sum_0, tmp, y_1 );
+      sum_1 = __builtin_mips_madd( sum_1, tmp, y_2 );
+      sum_2 = __builtin_mips_madd( sum_2, tmp, y_3);
+      sum_3 = __builtin_mips_madd( sum_3, tmp, y_0);
+   }
+
+   if (j<len)
+   {
+      opus_val16 tmp=*x++;
+      y_1=*y++;
+
+       sum_0 = __builtin_mips_madd( sum_0, tmp, y_2 );
+       sum_1 = __builtin_mips_madd( sum_1, tmp, y_3 );
+       sum_2 = __builtin_mips_madd( sum_2, tmp, y_0);
+       sum_3 = __builtin_mips_madd( sum_3, tmp, y_1);
+
+   }
+   /* Write the four sums back, truncated to opus_val32. */
+   sum[0] = (opus_val32)sum_0;
+   sum[1] = (opus_val32)sum_1;
+   sum[2] = (opus_val32)sum_2;
+   sum[3] = (opus_val32)sum_3;
+}
+/* xcorr_kernel() forwards to the MIPS kernel; `arch` is evaluated and discarded. */
+#define OVERRIDE_XCORR_KERNEL
+#define xcorr_kernel(x, y, sum, len, arch) \
+    ((void)(arch), xcorr_kernel_mips(x, y, sum, len))
+
+#endif /* PITCH_MIPSR1_H */
--- a/celt/mips/vq_mipsr1.h
+++ b/celt/mips/vq_mipsr1.h
+/* Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2009 Xiph.Org Foundation
+   Written by Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef VQ_MIPSR1_H__
+#define VQ_MIPSR1_H__
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "mathops.h"
+#include "arch.h"
+
+#define OVERRIDE_vq_exp_rotation1
+/* Applies an in-place rotation with coefficients (c, s) to every pair of
+   samples of X that lie `stride` apart: first sweeping forward over the
+   pairs, then sweeping backward starting at index len-2*stride-1. Each new
+   value is a rounded 15-bit right shift of the 16x16 multiply-accumulate. */
+static void exp_rotation1(celt_norm *X, int len, int stride, opus_val16 c, opus_val16 s)
+{
+   const opus_val16 neg_s = NEG16(s);
+   int pos;
+
+   /* Forward sweep. */
+   for (pos=0;pos<len-stride;pos++)
+   {
+      const celt_norm near = X[pos];
+      const celt_norm far = X[pos+stride];
+      X[pos+stride] = EXTRACT16(PSHR32(MAC16_16(MULT16_16(c, far),  s, near), 15));
+      X[pos]        = EXTRACT16(PSHR32(MAC16_16(MULT16_16(c, near), neg_s, far), 15));
+   }
+
+   /* Backward sweep. */
+   for (pos=len-2*stride-1;pos>=0;pos--)
+   {
+      const celt_norm near = X[pos];
+      const celt_norm far = X[pos+stride];
+      X[pos+stride] = EXTRACT16(PSHR32(MAC16_16(MULT16_16(c, far),  s, near), 15));
+      X[pos]        = EXTRACT16(PSHR32(MAC16_16(MULT16_16(c, near), neg_s, far), 15));
+   }
+}
+/* Rescales X[0..N-1] in place by a gain derived from its energy.
+   The energy E = EPSILON + sum of X[i]^2 is accumulated in DSP accumulator
+   $ac1 (only the low word is read back); g is then formed from
+   celt_rsqrt_norm() of the shifted energy times `gain`, and every sample
+   is replaced by PSHR32(g*X[i], k+1). `arch` is unused.
+   NOTE(review): `k` is declared and set only under FIXED_POINT but appears
+   in the VSHR32/PSHR32 arguments unconditionally -- presumably those
+   ignore the shift in float builds; confirm. */
+#define OVERRIDE_renormalise_vector
+void renormalise_vector(celt_norm *X, int N, opus_val16 gain, int arch)
+{
+   int i;
+#ifdef FIXED_POINT
+   int k;
+#endif
+   opus_val32 E = EPSILON;
+   opus_val16 g;
+   opus_val32 t;
+   celt_norm *xptr = X;
+   int X0, X1;
+
+   (void)arch;
+   /* Clear $ac1 (0*0), then preload its low word with EPSILON. */
+   asm volatile("mult $ac1, $0, $0");
+   asm volatile("MTLO %0, $ac1" : :"r" (E));
+   /*if(N %4)
+       printf("error");*/
+   for (i=0;i<N-2;i+=2) /* two squares per iteration */
+   {
+      X0 = (int)*xptr++;
+      asm volatile("MADD $ac1, %0, %1" : : "r" (X0), "r" (X0));
+
+      X1 = (int)*xptr++;
+      asm volatile("MADD $ac1, %0, %1" : : "r" (X1), "r" (X1));
+   }
+   /* Remaining samples, one at a time. */
+   for (;i<N;i++)
+   {
+      X0 = (int)*xptr++;
+      asm volatile("MADD $ac1, %0, %1" : : "r" (X0), "r" (X0));
+   }
+
+   asm volatile("MFLO %0, $ac1" : "=r" (E)); /* low 32 bits of the accumulated energy */
+#ifdef FIXED_POINT
+   k = celt_ilog2(E)>>1;
+#endif
+   t = VSHR32(E, 2*(k-7));
+   g = MULT16_16_P15(celt_rsqrt_norm(t),gain);
+
+   xptr = X;
+   for (i=0;i<N;i++)
+   {
+      *xptr = EXTRACT16(PSHR32(MULT16_16(g, *xptr), k+1));
+      xptr++;
+   }
+   /*return celt_sqrt(E);*/
+}
+
+#endif /* VQ_MIPSR1_H__ */
--- a/libcelt/modes.c
+++ b/libcelt/modes.c
 /* Copyright (c) 2007-2008 CSIRO
   Copyright (c) 2007-2009 Xiph.Org Foundation
-   Copyright (c) 2008 Gregory Maxwell 
+   Copyright (c) 2008 Gregory Maxwell
   Written by Jean-Marc Valin and Gregory Maxwell */
 /*
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:
-   
+
   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
-   
+
   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.
-   
+
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
-   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
@@ -37,8 +37,9 @@
 #include "os_support.h"
 #include "stack_alloc.h"
 #include "quant_bands.h"
+#include "cpu_support.h"

-static const celt_int16 eband5ms[] = {
+static const opus_int16 eband5ms[] = {
 /*0  200 400 600 800  1k 1.2 1.4 1.6  2k 2.4 2.8 3.2  4k 4.8 5.6 6.8  8k 9.6 12k 15.6 */
  0,  1,  2,  3,  4,  5,  6,  7,  8, 10, 12, 14, 16, 20, 24, 28, 34, 40, 48, 60, 78, 100
 };
@@ -63,9 +64,9 @@ static const unsigned char band_allocation[] = {

 #ifndef CUSTOM_MODES_ONLY
 #ifdef FIXED_POINT
-  #include "static_modes_fixed.c"
+  #include "static_modes_fixed.h"
 #else
-  #include "static_modes_float.c"
+  #include "static_modes_float.h"
 #endif
 #endif /* CUSTOM_MODES_ONLY */

@@ -73,13 +74,12 @@ static const unsigned char band_allocation[] = {
 #define M_PI 3.141592653
 #endif

-
 #ifdef CUSTOM_MODES

 /* Defining 25 critical bands for the full 0-20 kHz audio bandwidth
   Taken from http://ccrma.stanford.edu/~jos/bbt/Bark_Frequency_Scale.html */
 #define BARK_BANDS 25
-static const celt_int16 bark_freq[BARK_BANDS+1] = {
+static const opus_int16 bark_freq[BARK_BANDS+1] = {
      0,   100,   200,   300,   400,
    510,   630,   770,   920,  1080,
   1270,  1480,  1720,  2000,  2320,
@@ -87,16 +87,16 @@ static const celt_int16 bark_freq[BARK_BANDS+1] = {
   6400,  7700,  9500, 12000, 15500,
  20000};

-static celt_int16 *compute_ebands(celt_int32 Fs, int frame_size, int res, int *nbEBands)
+static opus_int16 *compute_ebands(opus_int32 Fs, int frame_size, int res, int *nbEBands)
 {
-   celt_int16 *eBands;
+   opus_int16 *eBands;
   int i, j, lin, low, high, nBark, offset=0;

   /* All modes that have 2.5 ms short blocks use the same definition */
-   if (Fs == 400*(celt_int32)frame_size)
+   if (Fs == 400*(opus_int32)frame_size)
   {
      *nbEBands = sizeof(eband5ms)/sizeof(eband5ms[0])-1;
-      eBands = celt_alloc(sizeof(celt_int16)*(*nbEBands+1));
+      eBands = opus_alloc(sizeof(opus_int16)*(*nbEBands+1));
      for (i=0;i<*nbEBands+1;i++)
         eBands[i] = eband5ms[i];
      return eBands;
@@ -114,11 +114,11 @@ static celt_int16 *compute_ebands(celt_int32 Fs, int frame_size, int res, int *n
   low = (bark_freq[lin]+res/2)/res;
   high = nBark-lin;
   *nbEBands = low+high;
-   eBands = celt_alloc(sizeof(celt_int16)*(*nbEBands+2));
-   
+   eBands = opus_alloc(sizeof(opus_int16)*(*nbEBands+2));
+
   if (eBands==NULL)
      return NULL;
-   
+
   /* Linear spacing (min_width) */
   for (i=0;i<low;i++)
      eBands[i] = i;
@@ -171,12 +171,15 @@ static void compute_allocation_table(CELTMode *mode)
   int maxBands = sizeof(eband5ms)/sizeof(eband5ms[0])-1;

   mode->nbAllocVectors = BITALLOC_SIZE;
-   allocVectors = celt_alloc(sizeof(unsigned char)*(BITALLOC_SIZE*mode->nbEBands));
+   allocVectors = opus_alloc(sizeof(unsigned char)*(BITALLOC_SIZE*mode->nbEBands));
   if (allocVectors==NULL)
+   {
+      mode->allocVectors = NULL;
      return;
+   }

   /* Check for standard mode */
-   if (mode->Fs == 400*(celt_int32)mode->shortMdctSize)
+   if (mode->Fs == 400*(opus_int32)mode->shortMdctSize)
   {
      for (i=0;i<BITALLOC_SIZE*mode->nbEBands;i++)
         allocVectors[i] = band_allocation[i];
@@ -192,15 +195,15 @@ static void compute_allocation_table(CELTMode *mode)
         int k;
         for (k=0;k<maxBands;k++)
         {
-            if (400*(celt_int32)eband5ms[k] > mode->eBands[j]*(celt_int32)mode->Fs/mode->shortMdctSize)
+            if (400*(opus_int32)eband5ms[k] > mode->eBands[j]*(opus_int32)mode->Fs/mode->shortMdctSize)
               break;
         }
         if (k>maxBands-1)
            allocVectors[i*mode->nbEBands+j] = band_allocation[i*maxBands + maxBands-1];
         else {
-            celt_int32 a0, a1;
-            a1 = mode->eBands[j]*(celt_int32)mode->Fs/mode->shortMdctSize - 400*(celt_int32)eband5ms[k-1];
-            a0 = 400*(celt_int32)eband5ms[k] - mode->eBands[j]*(celt_int32)mode->Fs/mode->shortMdctSize;
+            opus_int32 a0, a1;
+            a1 = mode->eBands[j]*(opus_int32)mode->Fs/mode->shortMdctSize - 400*(opus_int32)eband5ms[k-1];
+            a0 = 400*(opus_int32)eband5ms[k] - mode->eBands[j]*(opus_int32)mode->Fs/mode->shortMdctSize;
            allocVectors[i*mode->nbEBands+j] = (a0*band_allocation[i*maxBands+k-1]
                                             + a1*band_allocation[i*maxBands+k])/(a0+a1);
         }
@@ -221,20 +224,21 @@ static void compute_allocation_table(CELTMode *mode)

 #endif /* CUSTOM_MODES */

-CELTMode *celt_mode_create(celt_int32 Fs, int frame_size, int *error)
+CELTMode *opus_custom_mode_create(opus_int32 Fs, int frame_size, int *error)
 {
   int i;
 #ifdef CUSTOM_MODES
   CELTMode *mode=NULL;
   int res;
-   celt_word16 *window;
-   celt_int16 *logN;
+   celt_coef *window;
+   opus_int16 *logN;
   int LM;
+   int arch = opus_select_arch();
   ALLOC_STACK;
 #if !defined(VAR_ARRAYS) && !defined(USE_ALLOCA)
   if (global_stack==NULL)
      goto failure;
-#endif 
+#endif
 #endif

 #ifndef CUSTOM_MODES_ONLY
@@ -247,7 +251,7 @@ CELTMode *celt_mode_create(celt_int32 Fs, int frame_size, int *error)
               (frame_size<<j) == static_mode_list[i]->shortMdctSize*static_mode_list[i]->nbShortMdcts)
         {
            if (error)
-               *error = CELT_OK;
+               *error = OPUS_OK;
            return (CELTMode*)static_mode_list[i];
         }
      }
@@ -256,39 +260,39 @@ CELTMode *celt_mode_create(celt_int32 Fs, int frame_size, int *error)

 #ifndef CUSTOM_MODES
   if (error)
-      *error = CELT_BAD_ARG;
+      *error = OPUS_BAD_ARG;
   return NULL;
 #else

   /* The good thing here is that permutation of the arguments will automatically be invalid */
-   
+
   if (Fs < 8000 || Fs > 96000)
   {
      if (error)
-         *error = CELT_BAD_ARG;
+         *error = OPUS_BAD_ARG;
      return NULL;
   }
   if (frame_size < 40 || frame_size > 1024 || frame_size%2!=0)
   {
      if (error)
-         *error = CELT_BAD_ARG;
+         *error = OPUS_BAD_ARG;
      return NULL;
   }
   /* Frames of less than 1ms are not supported. */
-   if ((celt_int32)frame_size*1000 < Fs)
+   if ((opus_int32)frame_size*1000 < Fs)
   {
      if (error)
-         *error = CELT_BAD_ARG;
+         *error = OPUS_BAD_ARG;
      return NULL;
   }

-   if ((celt_int32)frame_size*75 >= Fs && (frame_size%16)==0)
+   if ((opus_int32)frame_size*75 >= Fs && (frame_size%16)==0)
   {
     LM = 3;
-   } else if ((celt_int32)frame_size*150 >= Fs && (frame_size%8)==0)
+   } else if ((opus_int32)frame_size*150 >= Fs && (frame_size%8)==0)
   {
     LM = 2;
-   } else if ((celt_int32)frame_size*300 >= Fs && (frame_size%4)==0)
+   } else if ((opus_int32)frame_size*300 >= Fs && (frame_size%4)==0)
   {
     LM = 1;
   } else
@@ -297,14 +301,14 @@ CELTMode *celt_mode_create(celt_int32 Fs, int frame_size, int *error)
   }

   /* Shorts longer than 3.3ms are not supported. */
-   if ((celt_int32)(frame_size>>LM)*300 > Fs)
+   if ((opus_int32)(frame_size>>LM)*300 > Fs)
   {
      if (error)
-         *error = CELT_BAD_ARG;
+         *error = OPUS_BAD_ARG;
      return NULL;
   }

-   mode = celt_alloc(sizeof(CELTMode));
+   mode = opus_alloc(sizeof(CELTMode));
   if (mode==NULL)
      goto failure;
   mode->Fs = Fs;
@@ -346,19 +350,27 @@ CELTMode *celt_mode_create(celt_int32 Fs, int frame_size, int *error)
   mode->eBands = compute_ebands(Fs, mode->shortMdctSize, res, &mode->nbEBands);
   if (mode->eBands==NULL)
      goto failure;
+#if !defined(SMALL_FOOTPRINT)
+   /* Make sure we don't allocate a band larger than our PVQ table.
+      208 should be enough, but let's be paranoid. */
+   if ((mode->eBands[mode->nbEBands] - mode->eBands[mode->nbEBands-1])<<LM >
+    208) {
+       goto failure;
+   }
+#endif

   mode->effEBands = mode->nbEBands;
   while (mode->eBands[mode->effEBands] > mode->shortMdctSize)
      mode->effEBands--;
-   
+
   /* Overlap must be divisible by 4 */
   mode->overlap = ((mode->shortMdctSize>>2)<<2);

   compute_allocation_table(mode);
   if (mode->allocVectors==NULL)
      goto failure;
-   
-   window = (celt_word16*)celt_alloc(mode->overlap*sizeof(celt_word16));
+
+   window = (celt_coef*)opus_alloc(mode->overlap*sizeof(*window));
   if (window==NULL)
      goto failure;

@@ -366,12 +378,17 @@ CELTMode *celt_mode_create(celt_int32 Fs, int frame_size, int *error)
   for (i=0;i<mode->overlap;i++)
      window[i] = Q15ONE*sin(.5*M_PI* sin(.5*M_PI*(i+.5)/mode->overlap) * sin(.5*M_PI*(i+.5)/mode->overlap));
 #else
+# ifdef ENABLE_QEXT
+   for (i=0;i<mode->overlap;i++)
+      window[i] = MIN32(2147483647, 2147483648*sin(.5*M_PI* sin(.5*M_PI*(i+.5)/mode->overlap) * sin(.5*M_PI*(i+.5)/mode->overlap)));
+# else
   for (i=0;i<mode->overlap;i++)
      window[i] = MIN32(32767,floor(.5+32768.*sin(.5*M_PI* sin(.5*M_PI*(i+.5)/mode->overlap) * sin(.5*M_PI*(i+.5)/mode->overlap))));
+# endif
 #endif
   mode->window = window;

-   logN = (celt_int16*)celt_alloc(mode->nbEBands*sizeof(celt_int16));
+   logN = (opus_int16*)opus_alloc(mode->nbEBands*sizeof(opus_int16));
   if (logN==NULL)
      goto failure;

@@ -381,53 +398,53 @@ CELTMode *celt_mode_create(celt_int32 Fs, int frame_size, int *error)

   compute_pulse_cache(mode, mode->maxLM);

-   clt_mdct_init(&mode->mdct, 2*mode->shortMdctSize*mode->nbShortMdcts, mode->maxLM);
-   if ((mode->mdct.trig==NULL)
-#ifndef ENABLE_TI_DSPLIB55
-         || (mode->mdct.kfft==NULL)
-#endif
-   )
+   if (clt_mdct_init(&mode->mdct, 2*mode->shortMdctSize*mode->nbShortMdcts,
+           mode->maxLM, arch) == 0)
      goto failure;

   if (error)
-      *error = CELT_OK;
+      *error = OPUS_OK;

   return mode;
-failure: 
+failure:
   if (error)
-      *error = CELT_ALLOC_FAIL;
+      *error = OPUS_ALLOC_FAIL;
   if (mode!=NULL)
-      celt_mode_destroy(mode);
+      opus_custom_mode_destroy(mode);
   return NULL;
 #endif /* !CUSTOM_MODES */
 }

-void celt_mode_destroy(CELTMode *mode)
-{
 #ifdef CUSTOM_MODES
-   int i;
+void opus_custom_mode_destroy(CELTMode *mode)
+{
+   int arch = opus_select_arch();
+
   if (mode == NULL)
      return;
 #ifndef CUSTOM_MODES_ONLY
-   for (i=0;i<TOTAL_MODES;i++)
   {
-      if (mode == static_mode_list[i])
-      {
-         return;
-      }
+     int i;
+     for (i=0;i<TOTAL_MODES;i++)
+     {
+        if (mode == static_mode_list[i])
+        {
+           return;
+        }
+     }
   }
 #endif /* CUSTOM_MODES_ONLY */
-   celt_free((celt_int16*)mode->eBands);
-   celt_free((celt_int16*)mode->allocVectors);
-   
-   celt_free((celt_word16*)mode->window);
-   celt_free((celt_int16*)mode->logN);
-
-   celt_free((celt_int16*)mode->cache.index);
-   celt_free((unsigned char*)mode->cache.bits);
-   celt_free((unsigned char*)mode->cache.caps);
-   clt_mdct_clear(&mode->mdct);
-
-   celt_free((CELTMode *)mode);
-#endif
+   opus_free((opus_int16*)mode->eBands);
+   opus_free((unsigned char*)mode->allocVectors);
+
+   opus_free((opus_val16*)mode->window);
+   opus_free((opus_int16*)mode->logN);
+
+   opus_free((opus_int16*)mode->cache.index);
+   opus_free((unsigned char*)mode->cache.bits);
+   opus_free((unsigned char*)mode->cache.caps);
+   clt_mdct_clear(&mode->mdct, arch);
+
+   opus_free((CELTMode *)mode);
 }
+#endif
--- a/libcelt/modes.h
+++ b/libcelt/modes.h
 /* Copyright (c) 2007-2008 CSIRO
   Copyright (c) 2007-2009 Xiph.Org Foundation
-   Copyright (c) 2008 Gregory Maxwell 
+   Copyright (c) 2008 Gregory Maxwell
   Written by Jean-Marc Valin and Gregory Maxwell */
 /*
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:
-   
+
   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
-   
+
   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.
-   
+
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
-   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
@@ -30,88 +30,46 @@
 #ifndef MODES_H
 #define MODES_H

-#include "celt_types.h"
+#include "opus_types.h"
 #include "celt.h"
 #include "arch.h"
 #include "mdct.h"
 #include "entenc.h"
 #include "entdec.h"

-#define CELT_BITSTREAM_VERSION 0x80000010
-
 #define MAX_PERIOD 1024

-#ifndef CHANNELS
-# ifdef DISABLE_STEREO
-#  define CHANNELS(_C) (1)
-# else
-#  define CHANNELS(_C) (_C)
-# endif
-#endif
-
-#ifndef OVERLAP
-#define OVERLAP(mode) ((mode)->overlap)
-#endif
-
-#ifndef FRAMESIZE
-#define FRAMESIZE(mode) ((mode)->mdctSize)
-#endif
-
 typedef struct {
   int size;
-   const celt_int16 *index;
+   const opus_int16 *index;
   const unsigned char *bits;
   const unsigned char *caps;
 } PulseCache;

 /** Mode definition (opaque)
- @brief Mode definition 
+ @brief Mode definition
 */
-struct CELTMode {
-   celt_int32 Fs;
+struct OpusCustomMode {
+   opus_int32 Fs;
   int          overlap;

   int          nbEBands;
   int          effEBands;
-   celt_word16    preemph[4];
-   const celt_int16   *eBands;   /**< Definition for each "pseudo-critical band" */
-   
-   int          nbAllocVectors; /**< Number of lines in the matrix below */
-   const unsigned char   *allocVectors;   /**< Number of bits in each band for several rates */
-   
-   /* Stuff that could go in the {en,de}coder, but we save space this way */
-   mdct_lookup mdct;
-
-   const celt_word16 *window;
+   opus_val16    preemph[4];
+   const opus_int16   *eBands;   /**< Definition for each "pseudo-critical band" */

   int         maxLM;
   int         nbShortMdcts;
   int         shortMdctSize;

-   const celt_int16 *logN;
+   int          nbAllocVectors; /**< Number of lines in the matrix below */
+   const unsigned char   *allocVectors;   /**< Number of bits in each band for several rates */
+   const opus_int16 *logN;

+   const celt_coef *window;
+   mdct_lookup mdct;
   PulseCache cache;
 };

-#ifndef OPUS_BUILD
-#define CELT_STATIC static
-#else
-#define CELT_STATIC
-#endif
-
-#ifdef OPUS_BUILD
-#define CELT_SET_SIGNALLING_REQUEST    10003
-#define CELT_SET_SIGNALLING(x) CELT_SET_SIGNALLING_REQUEST, _celt_check_int(x)
-
-#define CELT_GET_MODE_REQUEST    10004
-/** Get the CELTMode used by an encoder or decoder */
-#define CELT_GET_MODE(x) CELT_GET_MODE_REQUEST, _celt_check_mode_ptr_ptr(x)
-
-/* Prototypes for _ec versions of the encoder/decoder calls (not public) */
-int celt_encode_with_ec(CELTEncoder * restrict st, const celt_int16 * pcm, int frame_size, unsigned char *compressed, int nbCompressedBytes, ec_enc *enc);
-int celt_encode_with_ec_float(CELTEncoder * restrict st, const float * pcm, int frame_size, unsigned char *compressed, int nbCompressedBytes, ec_enc *enc);
-int celt_decode_with_ec(CELTDecoder * restrict st, const unsigned char *data, int len, celt_int16 * restrict pcm, int frame_size, ec_dec *dec);
-int celt_decode_with_ec_float(CELTDecoder * restrict st, const unsigned char *data, int len, float * restrict pcm, int frame_size, ec_dec *dec);
-#endif /* OPUS_BUILD */

 #endif
--- a/celt/opus_custom_demo.c
+++ b/celt/opus_custom_demo.c
+/* Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2009 Xiph.Org Foundation
+   Written by Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "opus_custom.h"
+#include "arch.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+
+#define MAX_PACKET 1275
+
+/* Print the command-line synopsis and the list of supported options to
+   stderr. argv[0] is used as the program name in the synopsis line. */
+static void print_usage(char **argv) {
+   fprintf (stderr, "Usage: %s [-e | -d] <rate> <channels> <frame size> "
+                  " [<bytes per packet>] [options] "
+                  "<input> <output>\n", argv[0]);
+   fprintf (stderr, "     -e encode only (default is encode and decode)\n");
+   fprintf (stderr, "     -d decode only (default is encode and decode)\n");
+   fprintf (stderr, "     <bytes per packet>: required only when encoding\n");
+   fprintf (stderr, "options:\n");
+   fprintf (stderr, "     -16                      format is 16-bit little-endian (default)\n");
+   fprintf (stderr, "     -24                      format is 24-bit little-endian\n");
+   fprintf (stderr, "     -f32                     format is 32-bit float little-endian\n");
+   fprintf (stderr, "     -complexity <0-10>       optional only when encoding\n");
+   fprintf (stderr, "     -loss <percentage>       encoding (robustness setting) and decoding (simulating loss)\n");
+}
+
+/* Serialize a 32-bit value into four bytes, most significant byte first. */
+static void int_to_char(opus_uint32 i, unsigned char ch[4])
+{
+    unsigned char *dst = ch;
+    int shift;
+
+    for (shift = 24; shift >= 0; shift -= 8)
+        *dst++ = (unsigned char)((i >> shift) & 0xFF);
+}
+
+/* Assemble four bytes, most significant byte first, into a 32-bit value. */
+static opus_uint32 char_to_int(unsigned char ch[4])
+{
+    opus_uint32 value = 0;
+    int k;
+
+    for (k = 0; k < 4; k++)
+        value = (value << 8) | (opus_uint32)ch[k];
+    return value;
+}
+
+/* Reject an option that does not apply to the selected mode: when the first
+   argument is non-zero, report `opt` on stderr and jump to the `failure`
+   label, which must exist in the calling function. */
+#define check_encoder_option(decode_only, opt) do {if (decode_only) {fprintf(stderr, "option %s is only for encoding\n", opt); goto failure;}} while(0)
+#define check_decoder_option(encode_only, opt) do {if (encode_only) {fprintf(stderr, "option %s is only for decoding\n", opt); goto failure;}} while(0)
+
+/* Sample formats of the PCM files, selected with -16, -24 and -f32. */
+#define FORMAT_S16_LE 0
+#define FORMAT_S24_LE 1
+#define FORMAT_F32_LE 2
+
+/* Bytes per sample for each format, indexed by the FORMAT_* value. */
+static const int format_size[3] = {2, 3, 4};
+
+/* Overlay of a 32-bit integer and a float, used by main() to move raw sample
+   bytes into and out of a float value (assumes float is 32 bits wide). */
+typedef union {
+    opus_int32 i;
+    float f;
+} float_bits;
+
+
+/* Opus Custom demo driver.
+ *
+ * Usage: [-e | -d] <rate> <channels> <frame size> [<bytes per packet>]
+ *        [options] <input> <output>
+ *
+ * By default every PCM frame read from <input> is encoded and immediately
+ * decoded again, and the decoded PCM is written to <output>. With -e only
+ * the encoder runs and each packet is written as a 4-byte big-endian length,
+ * the 4-byte big-endian encoder final range, then the payload. With -d that
+ * same packet stream is read back and decoded.
+ *
+ * Returns EXIT_SUCCESS when the whole input was processed and 1 on any
+ * failure (bad arguments, I/O or allocation errors, codec errors, a range
+ * coder mismatch, or an encoder/decoder mismatch when the RESYNTH check is
+ * compiled in).
+ */
+int main(int argc, char *argv[])
+{
+   int err;
+   /* Process exit status only; it stays non-zero until the run completes. */
+   int ret=1;
+   /* Samples per channel produced by the decoder for the current frame.
+      Kept separate from ret so a failure never returns a sample count. */
+   int samples=0;
+   int args;
+   opus_uint32 enc_final_range=0;
+   opus_uint32 dec_final_range=0;
+   int encode_only=0, decode_only=0;
+   char *inFile, *outFile;
+   FILE *fin=NULL, *fout=NULL;
+   OpusCustomMode *mode=NULL;
+   OpusCustomEncoder *enc=NULL;
+   OpusCustomDecoder *dec=NULL;
+   int len;
+   opus_int32 frame_size, channels, rate;
+   int format=FORMAT_S16_LE;
+   int bytes_per_packet=0;
+   unsigned char data[MAX_PACKET];
+   int complexity=-1;
+   float percent_loss = -1;
+   int i;
+#if !(defined (FIXED_POINT) && !defined(CUSTOM_MODES)) && defined(RESYNTH)
+   double rmsd = 0;
+#endif
+   int count = 0;
+   opus_int32 skip=0;
+   opus_int32 *in=NULL, *out=NULL;
+   unsigned char *fbytes=NULL;
+   args = 1;
+   if (argc < 7)
+   {
+      print_usage(argv);
+      goto failure;
+   }
+   if (strcmp(argv[args], "-e")==0)
+   {
+      encode_only = 1;
+      args++;
+   } else if (strcmp(argv[args], "-d")==0)
+   {
+      decode_only = 1;
+      args++;
+   }
+
+   rate = (opus_int32)atol(argv[args]);
+   args++;
+
+   if (rate != 8000 && rate != 12000
+    && rate != 16000 && rate != 24000
+    && rate != 48000)
+   {
+       fprintf(stderr, "Supported sampling rates are 8000, 12000, "
+               "16000, 24000 and 48000.\n");
+       goto failure;
+   }
+
+   channels = atoi(argv[args]);
+   args++;
+
+   if (channels < 1 || channels > 2)
+   {
+       fprintf(stderr, "Opus_demo supports only 1 or 2 channels.\n");
+       goto failure;
+   }
+
+   frame_size = atoi(argv[args]);
+   args++;
+
+   if (!decode_only)
+   {
+      bytes_per_packet = (opus_int32)atol(argv[args]);
+      args++;
+      if (bytes_per_packet < 0 || bytes_per_packet > MAX_PACKET)
+      {
+         fprintf (stderr, "bytes per packet must be between 0 and %d\n",
+                           MAX_PACKET);
+         goto failure;
+      }
+   }
+
+   /* An unsupported rate/frame_size combination is reported by a NULL mode. */
+   mode = opus_custom_mode_create(rate, frame_size, NULL);
+   if (mode == NULL)
+   {
+      fprintf(stderr, "failed to create a mode\n");
+      goto failure;
+   }
+   /* Everything between the positional arguments and the two trailing file
+      names is treated as an option. */
+   while( args < argc - 2 ) {
+       /* process command line options */
+       if( strcmp( argv[ args ], "-complexity" ) == 0 ) {
+           check_encoder_option(decode_only, "-complexity");
+           args++;
+           complexity=atoi(argv[args]);
+           args++;
+       } else if( strcmp( argv[ args ], "-loss" ) == 0 ) {
+          args++;
+          percent_loss = atof(argv[args]);
+          args++;
+       } else if( strcmp( argv[ args ], "-16" ) == 0 ) {
+          format = FORMAT_S16_LE;
+          args++;
+       } else if( strcmp( argv[ args ], "-24" ) == 0 ) {
+          format = FORMAT_S24_LE;
+          args++;
+       } else if( strcmp( argv[ args ], "-f32" ) == 0 ) {
+          format = FORMAT_F32_LE;
+          args++;
+       } else {
+          printf( "Error: unrecognized setting: %s\n\n", argv[ args ] );
+          print_usage( argv );
+          goto failure;
+      }
+   }
+   if (!decode_only) {
+      enc = opus_custom_encoder_create(mode, channels, &err);
+      if (err != 0)
+      {
+         fprintf(stderr, "Failed to create the encoder: %s\n", opus_strerror(err));
+         goto failure;
+      }
+      if (complexity >= 0)
+      {
+         opus_custom_encoder_ctl(enc,OPUS_SET_COMPLEXITY(complexity));
+      }
+      if (percent_loss >= 0) {
+         opus_custom_encoder_ctl(enc, OPUS_SET_PACKET_LOSS_PERC((int)percent_loss));
+      }
+   }
+   if (!encode_only) {
+      dec = opus_custom_decoder_create(mode, channels, &err);
+      if (err != 0)
+      {
+         fprintf(stderr, "Failed to create the decoder: %s\n", opus_strerror(err));
+         goto failure;
+      }
+      /* The decoder lookahead is dropped from the first decoded frame. */
+      opus_custom_decoder_ctl(dec, OPUS_GET_LOOKAHEAD(&skip));
+   }
+   /* A two-argument option at the end can consume a file name; catch that. */
+   if (argc-args != 2)
+   {
+      print_usage(argv);
+      goto failure;
+   }
+   inFile = argv[argc-2];
+   fin = fopen(inFile, "rb");
+   if (!fin)
+   {
+      fprintf (stderr, "Could not open input file %s\n", argv[argc-2]);
+      goto failure;
+   }
+   outFile = argv[argc-1];
+   fout = fopen(outFile, "wb+");
+   if (!fout)
+   {
+      fprintf (stderr, "Could not open output file %s\n", argv[argc-1]);
+      goto failure;
+   }
+   /* Samples are held as 24-bit values in 32-bit words; fbytes is sized for
+      the widest file format (4 bytes per sample). */
+   in = (opus_int32*)malloc(frame_size*channels*sizeof(opus_int32));
+   out = (opus_int32*)malloc(frame_size*channels*sizeof(opus_int32));
+   fbytes = (unsigned char*)malloc(frame_size*channels*4);
+   if (in==NULL || out==NULL || fbytes==NULL)
+   {
+      fprintf(stderr, "Could not allocate sample buffers\n");
+      goto failure;
+   }
+
+   while (!feof(fin))
+   {
+      int lost = 0;
+      if (decode_only)
+      {
+          unsigned char ch[4];
+          size_t num_read = fread(ch, 1, 4, fin);
+          if (num_read!=4)
+              break;
+          len = char_to_int(ch);
+          if (len>MAX_PACKET || len<0)
+          {
+              fprintf(stderr, "Invalid payload length: %d\n",len);
+              break;
+          }
+          num_read = fread(ch, 1, 4, fin);
+          if (num_read!=4)
+              break;
+          enc_final_range = char_to_int(ch);
+          num_read = fread(data, 1, len, fin);
+          if (num_read!=(size_t)len)
+          {
+              fprintf(stderr, "Ran out of input, "
+                              "expecting %d bytes got %d\n",
+                              len,(int)num_read);
+              break;
+          }
+      } else {
+         size_t num_read = fread(fbytes, format_size[format], frame_size*channels, fin);
+         /* Stop on end of file and also on a short read caused by an I/O
+            error, which would otherwise never set the EOF flag. */
+         if (feof(fin) || num_read != (size_t)(frame_size*channels))
+            break;
+         if (format == FORMAT_S16_LE) {
+            for(i=0;i<frame_size*channels;i++)
+            {
+               opus_int32 s;
+               s=fbytes[2*i+1]<<8|fbytes[2*i];
+               /* Sign-extend from 16 bits, then scale up to 24-bit range. */
+               s=((s&0xFFFF)^0x8000)-0x8000;
+               in[i]=s*256;
+            }
+         } else if (format == FORMAT_S24_LE) {
+            for(i=0;i<frame_size*channels;i++)
+            {
+               opus_int32 s;
+               s=fbytes[3*i+2]<<16|fbytes[3*i+1]<<8|fbytes[3*i];
+               /* Sign-extend from 24 bits. */
+               s=((s&0xFFFFFF)^0x800000)-0x800000;
+               in[i]=s;
+            }
+         } else if (format == FORMAT_F32_LE) {
+            for(i=0;i<frame_size*channels;i++)
+            {
+               float_bits s;
+               /* Assemble the top byte as unsigned: shifting a promoted int
+                  into the sign bit is undefined behaviour. */
+               s.i=(opus_int32)((opus_uint32)fbytes[4*i+3]<<24|fbytes[4*i+2]<<16|fbytes[4*i+1]<<8|fbytes[4*i]);
+               in[i]=(int)floor(.5 + s.f*8388608);
+            }
+         }
+         len = opus_custom_encode24(enc, in, frame_size, data, bytes_per_packet);
+         opus_custom_encoder_ctl(enc, OPUS_GET_FINAL_RANGE(&enc_final_range));
+         if (len <= 0)
+         {
+            fprintf (stderr, "opus_custom_encode() failed: %s\n", opus_strerror(len));
+            /* A negative length is an error code and must never be used as
+               a byte count by fwrite() or the decoder below. */
+            if (len < 0)
+               goto failure;
+         }
+      }
+
+      if (encode_only)
+      {
+          unsigned char int_field[4];
+          int_to_char(len, int_field);
+          if (fwrite(int_field, 1, 4, fout) != 4) {
+             fprintf(stderr, "Error writing.\n");
+             goto failure;
+          }
+          int_to_char(enc_final_range, int_field);
+          if (fwrite(int_field, 1, 4, fout) != 4) {
+             fprintf(stderr, "Error writing.\n");
+             goto failure;
+          }
+          if (fwrite(data, 1, len, fout) != (unsigned)len) {
+             fprintf(stderr, "Error writing.\n");
+             goto failure;
+          }
+      } else {
+         /* Number of interleaved samples written for this frame. */
+         int out_count;
+         /* This is for simulating bit errors */
+#if 0
+         int errors = 0;
+         int eid = 0;
+         /* This simulates random bit error */
+         for (i=0;i<len*8;i++)
+         {
+            if (rand()%atoi(argv[8])==0)
+            {
+               if (i<64)
+               {
+                  errors++;
+                  eid = i;
+               }
+               data[i/8] ^= 1<<(7-(i%8));
+            }
+         }
+         if (errors == 1)
+            data[eid/8] ^= 1<<(7-(eid%8));
+         else if (errors%2 == 1)
+            data[rand()%8] ^= 1<<rand()%8;
+#endif
+
+#if 1 /* Set to zero to use the encoder's output instead */
+         /* This is to simulate packet loss */
+         lost = percent_loss != 0 && (float)rand()/RAND_MAX<.01*percent_loss;
+         if (lost)
+            /*if (errors && (errors%2==0))*/
+            samples = opus_custom_decode24(dec, NULL, len, out, frame_size);
+         else
+            samples = opus_custom_decode24(dec, data, len, out, frame_size);
+         opus_custom_decoder_ctl(dec, OPUS_GET_FINAL_RANGE(&dec_final_range));
+         if (samples < 0)
+         {
+            fprintf(stderr, "opus_custom_decode() failed: %s\n", opus_strerror(samples));
+            /* A negative sample count must not be turned into a write size. */
+            goto failure;
+         }
+#else
+         samples = frame_size;
+         for (i=0;i<samples*channels;i++)
+            out[i] = in[i];
+#endif
+#if !(defined (FIXED_POINT) && !defined(CUSTOM_MODES)) && defined(RESYNTH)
+         if (!encode_only && !decode_only)
+         {
+            for (i=0;i<samples*channels;i++)
+            {
+               rmsd += (in[i]-out[i])*1.0*(in[i]-out[i]);
+               /*out[i] -= in[i];*/
+            }
+         }
+#endif
+         /* skip is non-zero only for the first frame (decoder lookahead). */
+         out_count = (samples-skip)*channels;
+         if (out_count < 0)
+            out_count = 0;
+         if (format == FORMAT_S16_LE) {
+            for(i=0;i<out_count;i++)
+            {
+               opus_int32 s;
+               /* Round from 24-bit to 16-bit range, then clip. */
+               s=(out[i+(skip*channels)]+128)>>8;
+               if (s > 32767) s = 32767;
+               if (s < -32767) s = -32767;
+               fbytes[2*i]=s&0xFF;
+               fbytes[2*i+1]=(s>>8)&0xFF;
+            }
+         } else if (format == FORMAT_S24_LE) {
+            for(i=0;i<out_count;i++)
+            {
+               opus_int32 s;
+               s=out[i+(skip*channels)];
+               if (s > 8388607) s = 8388607;
+               if (s < -8388607) s = -8388607;
+               fbytes[3*i]=s&0xFF;
+               fbytes[3*i+1]=(s>>8)&0xFF;
+               fbytes[3*i+2]=(s>>16)&0xFF;
+            }
+         } else if (format == FORMAT_F32_LE) {
+            for(i=0;i<out_count;i++)
+            {
+               float_bits s;
+               s.f=out[i+(skip*channels)]*(1.f/8388608.f);
+               fbytes[4*i]=s.i&0xFF;
+               fbytes[4*i+1]=(s.i>>8)&0xFF;
+               fbytes[4*i+2]=(s.i>>16)&0xFF;
+               fbytes[4*i+3]=(s.i>>24)&0xFF;
+            }
+         }
+         if (fwrite(fbytes, format_size[format], out_count, fout) != (size_t)out_count) {
+            fprintf(stderr, "Error writing.\n");
+            goto failure;
+         }
+      }
+
+      /* compare final range encoder rng values of encoder and decoder */
+      if( enc_final_range!=0  && !encode_only
+       && !lost
+       && dec_final_range != enc_final_range ) {
+          fprintf (stderr, "Error: Range coder state mismatch "
+                           "between encoder and decoder "
+                           "in frame %ld: 0x%8lx vs 0x%8lx\n",
+                       (long)count,
+                       (unsigned long)enc_final_range,
+                       (unsigned long)dec_final_range);
+          goto failure;
+      }
+
+      count++;
+      skip = 0;
+   }
+   PRINT_MIPS(stderr);
+   ret = EXIT_SUCCESS;
+#if !(defined (FIXED_POINT) && !defined(CUSTOM_MODES)) && defined(RESYNTH)
+   if (!encode_only && !decode_only)
+   {
+      if (rmsd > 0)
+      {
+         rmsd = sqrt(rmsd/(1.0*frame_size*channels*count));
+         fprintf (stderr, "Error: encoder doesn't match decoder\n");
+         fprintf (stderr, "RMS mismatch is %f\n", rmsd);
+         ret = 1;
+      } else {
+         fprintf (stderr, "Encoder matches decoder!!\n");
+      }
+   }
+#endif
+failure:
+   /* Cleanup after ourselves. */
+   if (enc) opus_custom_encoder_destroy(enc);
+   if (dec) opus_custom_decoder_destroy(dec);
+   if (fin) fclose(fin);
+   if (fout) fclose(fout);
+   if (mode) opus_custom_mode_destroy(mode);
+   if (in) free(in);
+   if (out) free(out);
+   if (fbytes) free(fbytes);
+   return ret;
+}
+
--- a/libcelt/os_support.h
+++ b/libcelt/os_support.h
 /* Copyright (C) 2007 Jean-Marc Valin
-      
+
   File: os_support.h
   This is the (tiny) OS abstraction layer. Aside from math.h, this is the
   only place where system headers are allowed.
@@ -35,61 +35,59 @@
 #  include "custom_support.h"
 #endif

+#include "opus_types.h"
+#include "opus_defines.h"
+
 #include <string.h>
-#include <stdio.h>
 #include <stdlib.h>

-/** CELT wrapper for calloc(). To do your own dynamic allocation, all you need to do is replace this function, celt_realloc and celt_free 
-    NOTE: celt_alloc needs to CLEAR THE MEMORY */
-#ifndef OVERRIDE_CELT_ALLOC
-static inline void *celt_alloc (int size)
+/** Opus wrapper for malloc(). To do your own dynamic allocation replace this function, opus_realloc, and opus_free */
+#ifndef OVERRIDE_OPUS_ALLOC
+static OPUS_INLINE void *opus_alloc (size_t size)
 {
-   /* WARNING: this is not equivalent to malloc(). If you want to use malloc() 
-      or your own allocator, YOU NEED TO CLEAR THE MEMORY ALLOCATED. Otherwise
-      you will experience strange bugs */
-   return calloc(size,1);
+   return malloc(size);
 }
 #endif

-/** Same as celt_alloc(), except that the area is only needed inside a CELT call (might cause problem with wideband though) */
-#ifndef OVERRIDE_CELT_ALLOC_SCRATCH
-static inline void *celt_alloc_scratch (int size)
+#ifndef OVERRIDE_OPUS_REALLOC
+static OPUS_INLINE void *opus_realloc (void *ptr, size_t size)
 {
-   /* Scratch space doesn't need to be cleared */
-   return calloc(size,1);
+   return realloc(ptr, size);
 }
 #endif

-/** CELT wrapper for free(). To do your own dynamic allocation, all you need to do is replace this function, celt_realloc and celt_alloc */
-#ifndef OVERRIDE_CELT_FREE
-static inline void celt_free (void *ptr)
+/** Used only for non-threadsafe pseudostack.
+    If desired, this can always return the same area of memory rather than allocating a new one every time. */
+#ifndef OVERRIDE_OPUS_ALLOC_SCRATCH
+static OPUS_INLINE void *opus_alloc_scratch (size_t size)
 {
-   free(ptr);
+   /* Scratch space doesn't need to be cleared */
+   return opus_alloc(size);
 }
 #endif

-/** Same as celt_free(), except that the area is only needed inside a CELT call (might cause problem with wideband though) */
-#ifndef OVERRIDE_CELT_FREE_SCRATCH
-static inline void celt_free_scratch (void *ptr)
+/** Opus wrapper for free(). To do your own dynamic allocation replace this function, opus_realloc, and opus_alloc */
+#ifndef OVERRIDE_OPUS_FREE
+static OPUS_INLINE void opus_free (void *ptr)
 {
   free(ptr);
 }
 #endif

-/** Copy n bytes of memory from src to dst. The 0* term provides compile-time type checking  */
-#ifndef OVERRIDE_CELT_COPY
-#define CELT_COPY(dst, src, n) (memcpy((dst), (src), (n)*sizeof(*(dst)) + 0*((dst)-(src)) ))
+/** Copy n elements from src to dst. The 0* term provides compile-time type checking  */
+#ifndef OVERRIDE_OPUS_COPY
+#define OPUS_COPY(dst, src, n) (memcpy((dst), (src), (n)*sizeof(*(dst)) + 0*((dst)-(src)) ))
 #endif

-/** Copy n bytes of memory from src to dst, allowing overlapping regions. The 0* term 
+/** Copy n elements from src to dst, allowing overlapping regions. The 0* term
    provides compile-time type checking */
-#ifndef OVERRIDE_CELT_MOVE
-#define CELT_MOVE(dst, src, n) (memmove((dst), (src), (n)*sizeof(*(dst)) + 0*((dst)-(src)) ))
+#ifndef OVERRIDE_OPUS_MOVE
+#define OPUS_MOVE(dst, src, n) (memmove((dst), (src), (n)*sizeof(*(dst)) + 0*((dst)-(src)) ))
 #endif

-/** Set n bytes of memory to value of c, starting at address s */
-#ifndef OVERRIDE_CELT_MEMSET
-#define CELT_MEMSET(dst, c, n) (memset((dst), (c), (n)*sizeof(*(dst))))
+/** Set n elements of dst to zero */
+#ifndef OVERRIDE_OPUS_CLEAR
+#define OPUS_CLEAR(dst, n) (memset((dst), 0, (n)*sizeof(*(dst))))
 #endif

 /*#ifdef __GNUC__

--- a/libcelt/pitch.c
+++ b/libcelt/pitch.c
@@ -10,19 +10,19 @@
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:
-   
+
   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
-   
+
   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.
-   
+
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
-   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
@@ -31,29 +31,28 @@
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

-
 #ifdef HAVE_CONFIG_H
 #include "config.h"
 #endif

-/* Always enable postfilter for Opus */
-#if defined(OPUS_BUILD) && !defined(ENABLE_POSTFILTER)
-#define ENABLE_POSTFILTER
-#endif
-
 #include "pitch.h"
 #include "os_support.h"
 #include "modes.h"
 #include "stack_alloc.h"
 #include "mathops.h"
+#include "celt_lpc.h"

-static void find_best_pitch(celt_word32 *xcorr, celt_word32 maxcorr, celt_word16 *y,
-                            int yshift, int len, int max_pitch, int best_pitch[2])
+static void find_best_pitch(opus_val32 *xcorr, opus_val16 *y, int len,
+                            int max_pitch, int *best_pitch
+#ifdef FIXED_POINT
+                            , int yshift, opus_val32 maxcorr
+#endif
+                            )
 {
   int i, j;
-   celt_word32 Syy=1;
-   celt_word16 best_num[2];
-   celt_word32 best_den[2];
+   opus_val32 Syy=1;
+   opus_val16 best_num[2];
+   opus_val32 best_den[2];
 #ifdef FIXED_POINT
   int xshift;

@@ -67,14 +66,19 @@ static void find_best_pitch(celt_word32 *xcorr, celt_word32 maxcorr, celt_word16
   best_pitch[0] = 0;
   best_pitch[1] = 1;
   for (j=0;j<len;j++)
-      Syy = MAC16_16(Syy, y[j],y[j]);
+      Syy = ADD32(Syy, SHR32(MULT16_16(y[j],y[j]), yshift));
   for (i=0;i<max_pitch;i++)
   {
      if (xcorr[i]>0)
      {
-         celt_word16 num;
-         celt_word32 xcorr16;
+         opus_val16 num;
+         opus_val32 xcorr16;
         xcorr16 = EXTRACT16(VSHR32(xcorr[i], xshift));
+#ifndef FIXED_POINT
+         /* Considering the range of xcorr16, this should avoid both underflows
+            and overflows (inf) when squaring xcorr16 */
+         xcorr16 *= 1e-12f;
+#endif
         num = MULT16_16_Q15(xcorr16,xcorr16);
         if (MULT16_32_Q15(num,best_den[1]) > MULT16_32_Q15(best_num[1],Syy))
         {
@@ -98,27 +102,87 @@ static void find_best_pitch(celt_word32 *xcorr, celt_word32 maxcorr, celt_word16
   }
 }

-#include "plc.h"
-void pitch_downsample(celt_sig * restrict x[], celt_word16 * restrict x_lp,
-      int len, int _C)
+static void celt_fir5(opus_val16 *x, /* 5-tap FIR filter applied to x[0..N-1] in place */
+         const opus_val16 *num, /* the five filter taps num[0..4] */
+         int N) /* number of samples in x */
 {
   int i;
-   celt_word32 ac[5];
-   celt_word16 tmp=Q15ONE;
-   celt_word16 lpc[4], mem[4]={0,0,0,0};
-   const int C = CHANNELS(_C);
+   opus_val16 num0, num1, num2, num3, num4;
+   opus_val32 mem0, mem1, mem2, mem3, mem4; /* the five previous input samples, newest in mem0 */
+   num0=num[0];
+   num1=num[1];
+   num2=num[2];
+   num3=num[3];
+   num4=num[4];
+   mem0=0; /* the filter history starts out as all zeros */
+   mem1=0;
+   mem2=0;
+   mem3=0;
+   mem4=0;
+   for (i=0;i<N;i++)
+   {
+      opus_val32 sum = SHL32(EXTEND32(x[i]), SIG_SHIFT); /* start from the current sample, shifted by SIG_SHIFT */
+      sum = MAC16_16(sum,num0,mem0); /* add num[k] times the input from k+1 samples ago, k=0..4 */
+      sum = MAC16_16(sum,num1,mem1);
+      sum = MAC16_16(sum,num2,mem2);
+      sum = MAC16_16(sum,num3,mem3);
+      sum = MAC16_16(sum,num4,mem4);
+      mem4 = mem3; /* age the history by one sample */
+      mem3 = mem2;
+      mem2 = mem1;
+      mem1 = mem0;
+      mem0 = x[i]; /* keep the unfiltered input before x[i] is overwritten */
+      x[i] = ROUND16(sum, SIG_SHIFT); /* store the output in place (presumably rounding away the SIG_SHIFT scaling) */
+   }
+}
+
+
+void pitch_downsample(celt_sig * OPUS_RESTRICT x[], opus_val16 * OPUS_RESTRICT x_lp,
+      int len, int C, int arch)
+{
+   int i;
+   opus_val32 ac[5];
+   opus_val16 tmp=Q15ONE;
+   opus_val16 lpc[4];
+   opus_val16 lpc2[5];
+   opus_val16 c1 = QCONST16(.8f,15);
+#ifdef FIXED_POINT
+   int shift;
+   opus_val32 maxabs = celt_maxabs32(x[0], len);
+   if (C==2)
+   {
+      opus_val32 maxabs_1 = celt_maxabs32(x[1], len);
+      maxabs = MAX32(maxabs, maxabs_1);
+   }
+   if (maxabs<1)
+      maxabs=1;
+   shift = celt_ilog2(maxabs)-10;
+   if (shift<0)
+      shift=0;
+   if (C==2)
+      shift++;
   for (i=1;i<len>>1;i++)
-      x_lp[i] = SHR32(HALF32(HALF32(x[0][(2*i-1)]+x[0][(2*i+1)])+x[0][2*i]), SIG_SHIFT+3);
-   x_lp[0] = SHR32(HALF32(HALF32(x[0][1])+x[0][0]), SIG_SHIFT+3);
+      x_lp[i] = SHR32(x[0][(2*i-1)], shift+2) + SHR32(x[0][(2*i+1)], shift+2) + SHR32(x[0][2*i], shift+1);
+   x_lp[0] = SHR32(x[0][1], shift+2) + SHR32(x[0][0], shift+1);
   if (C==2)
   {
      for (i=1;i<len>>1;i++)
-         x_lp[i] += SHR32(HALF32(HALF32(x[1][(2*i-1)]+x[1][(2*i+1)])+x[1][2*i]), SIG_SHIFT+3);
-      x_lp[0] += SHR32(HALF32(HALF32(x[1][1])+x[1][0]), SIG_SHIFT+3);
+         x_lp[i] += SHR32(x[1][(2*i-1)], shift+2) + SHR32(x[1][(2*i+1)], shift+2) + SHR32(x[1][2*i], shift+1);
+      x_lp[0] += SHR32(x[1][1], shift+2) + SHR32(x[1][0], shift+1);
   }
-
+#else
+   for (i=1;i<len>>1;i++)
+      x_lp[i] = .25f*x[0][(2*i-1)] + .25f*x[0][(2*i+1)] + .5f*x[0][2*i];
+   x_lp[0] = .25f*x[0][1] + .5f*x[0][0];
+   if (C==2)
+   {
+      for (i=1;i<len>>1;i++)
+         x_lp[i] += .25f*x[1][(2*i-1)] + .25f*x[1][(2*i+1)] + .5f*x[1][2*i];
+      x_lp[0] += .25f*x[1][1] + .5f*x[1][0];
+   }
+#endif
   _celt_autocorr(x_lp, ac, NULL, 0,
-                  4, len>>1);
+                  4, len>>1, arch);

   /* Noise floor -40 dB */
 #ifdef FIXED_POINT
@@ -143,34 +207,123 @@ void pitch_downsample(celt_sig * restrict x[], celt_word16 * restrict x_lp,
      tmp = MULT16_16_Q15(QCONST16(.9f,15), tmp);
      lpc[i] = MULT16_16_Q15(lpc[i], tmp);
   }
-   fir(x_lp, lpc, x_lp, len>>1, 4, mem);
+   /* Add a zero */
+   lpc2[0] = lpc[0] + QCONST16(.8f,SIG_SHIFT);
+   lpc2[1] = lpc[1] + MULT16_16_Q15(c1,lpc[0]);
+   lpc2[2] = lpc[2] + MULT16_16_Q15(c1,lpc[1]);
+   lpc2[3] = lpc[3] + MULT16_16_Q15(c1,lpc[2]);
+   lpc2[4] = MULT16_16_Q15(c1,lpc[3]);
+   celt_fir5(x_lp, lpc2, len>>1);
+}

-   mem[0]=0;
-   lpc[0]=QCONST16(.8f,12);
-   fir(x_lp, lpc, x_lp, len>>1, 1, mem);
+/* Pure C implementation: xcorr[i] = sum over j in [0,len) of _x[j]*_y[i+j], for i in [0,max_pitch). */
+#ifdef FIXED_POINT
+opus_val32
+#else
+void
+#endif
+celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y,
+      opus_val32 *xcorr, int len, int max_pitch, int arch) /* fixed-point builds return max(1, largest xcorr[i]) */
+{
+
+#if 0 /* This is a simple version of the pitch correlation that should work
+         well on DSPs like Blackfin and TI C5x/C6x */
+   int i, j;
+#ifdef FIXED_POINT
+   opus_val32 maxcorr=1;
+#endif
+#if !defined(OVERRIDE_PITCH_XCORR)
+   (void)arch;
+#endif
+   for (i=0;i<max_pitch;i++)
+   {
+      opus_val32 sum = 0;
+      for (j=0;j<len;j++)
+         sum = MAC16_16(sum, _x[j], _y[i+j]);
+      xcorr[i] = sum;
+#ifdef FIXED_POINT
+      maxcorr = MAX32(maxcorr, sum);
+#endif
+   }
+#ifdef FIXED_POINT
+   return maxcorr;
+#endif

+#else /* Unrolled version of the pitch correlation -- runs faster on x86 and ARM */
+   int i;
+   /*The EDSP version requires that max_pitch is at least 1, and that _x is
+      32-bit aligned.
+     Since it's hard to put asserts in assembly, put them here.*/
+#ifdef FIXED_POINT
+   opus_val32 maxcorr=1; /* running maximum of the correlations; never drops below 1 */
+#endif
+   celt_assert(max_pitch>0);
+   celt_sig_assert(((size_t)_x&3)==0);
+   for (i=0;i<max_pitch-3;i+=4) /* four lags per pass through the kernel */
+   {
+      opus_val32 sum[4]={0,0,0,0}; /* the kernel accumulates into sum[], so it must start at zero */
+#if defined(OPUS_CHECK_ASM) && defined(FIXED_POINT)
+      {
+         opus_val32 sum_c[4]={0,0,0,0}; /* reference result from the plain C kernel */
+         xcorr_kernel_c(_x, _y+i, sum_c, len);
+#endif
+         xcorr_kernel(_x, _y+i, sum, len, arch);
+#if defined(OPUS_CHECK_ASM) && defined(FIXED_POINT)
+         celt_assert(memcmp(sum, sum_c, sizeof(sum)) == 0); /* the selected kernel must match the C kernel exactly */
+      }
+#endif
+      xcorr[i]=sum[0];
+      xcorr[i+1]=sum[1];
+      xcorr[i+2]=sum[2];
+      xcorr[i+3]=sum[3];
+#ifdef FIXED_POINT
+      sum[0] = MAX32(sum[0], sum[1]); /* sum[] is already stored above, so reuse it to find the largest of the four */
+      sum[2] = MAX32(sum[2], sum[3]);
+      sum[0] = MAX32(sum[0], sum[2]);
+      maxcorr = MAX32(maxcorr, sum[0]);
+#endif
+   }
+   /* In case max_pitch isn't a multiple of 4, do non-unrolled version. */
+   for (;i<max_pitch;i++)
+   {
+      opus_val32 sum;
+      sum = celt_inner_prod(_x, _y+i, len, arch);
+      xcorr[i] = sum;
+#ifdef FIXED_POINT
+      maxcorr = MAX32(maxcorr, sum);
+#endif
+   }
+#ifdef FIXED_POINT
+   return maxcorr;
+#endif
+#endif
 }

-void pitch_search(const celt_word16 * restrict x_lp, celt_word16 * restrict y,
-                  int len, int max_pitch, int *pitch)
+void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTRICT y,
+                  int len, int max_pitch, int *pitch, int arch)
 {
   int i, j;
   int lag;
-   int best_pitch[2]={0};
-   VARDECL(celt_word16, x_lp4);
-   VARDECL(celt_word16, y_lp4);
-   VARDECL(celt_word32, xcorr);
-   celt_word32 maxcorr=1;
-   int offset;
+   int best_pitch[2]={0,0};
+   VARDECL(opus_val16, x_lp4);
+   VARDECL(opus_val16, y_lp4);
+   VARDECL(opus_val32, xcorr);
+#ifdef FIXED_POINT
+   opus_val32 maxcorr;
+   opus_val32 xmax, ymax;
   int shift=0;
+#endif
+   int offset;

   SAVE_STACK;

+   celt_assert(len>0);
+   celt_assert(max_pitch>0);
   lag = len+max_pitch;

-   ALLOC(x_lp4, len>>2, celt_word16);
-   ALLOC(y_lp4, lag>>2, celt_word16);
-   ALLOC(xcorr, max_pitch>>1, celt_word32);
+   ALLOC(x_lp4, len>>2, opus_val16);
+   ALLOC(y_lp4, lag>>2, opus_val16);
+   ALLOC(xcorr, max_pitch>>1, opus_val32);

   /* Downsample by 2 again */
   for (j=0;j<len>>2;j++)
@@ -179,7 +332,9 @@ void pitch_search(const celt_word16 * restrict x_lp, celt_word16 * restrict y,
      y_lp4[j] = y[2*j];

 #ifdef FIXED_POINT
-   shift = celt_ilog2(MAX16(1, MAX16(celt_maxabs16(x_lp4, len>>2), celt_maxabs16(y_lp4, lag>>2))))-11;
+   xmax = celt_maxabs16(x_lp4, len>>2);
+   ymax = celt_maxabs16(y_lp4, lag>>2);
+   shift = celt_ilog2(MAX32(1, MAX32(xmax, ymax)))-11;
   if (shift>0)
   {
      for (j=0;j<len>>2;j++)
@@ -195,35 +350,49 @@ void pitch_search(const celt_word16 * restrict x_lp, celt_word16 * restrict y,

   /* Coarse search with 4x decimation */

-   for (i=0;i<max_pitch>>2;i++)
-   {
-      celt_word32 sum = 0;
-      for (j=0;j<len>>2;j++)
-         sum = MAC16_16(sum, x_lp4[j],y_lp4[i+j]);
-      xcorr[i] = MAX32(-1, sum);
-      maxcorr = MAX32(maxcorr, sum);
-   }
-   find_best_pitch(xcorr, maxcorr, y_lp4, 0, len>>2, max_pitch>>2, best_pitch);
+#ifdef FIXED_POINT
+   maxcorr =
+#endif
+   celt_pitch_xcorr(x_lp4, y_lp4, xcorr, len>>2, max_pitch>>2, arch);
+
+   find_best_pitch(xcorr, y_lp4, len>>2, max_pitch>>2, best_pitch
+#ifdef FIXED_POINT
+                   , 0, maxcorr
+#endif
+                   );

   /* Finer search with 2x decimation */
+#ifdef FIXED_POINT
   maxcorr=1;
+#endif
   for (i=0;i<max_pitch>>1;i++)
   {
-      celt_word32 sum=0;
+      opus_val32 sum;
      xcorr[i] = 0;
      if (abs(i-2*best_pitch[0])>2 && abs(i-2*best_pitch[1])>2)
         continue;
+#ifdef FIXED_POINT
+      sum = 0;
      for (j=0;j<len>>1;j++)
         sum += SHR32(MULT16_16(x_lp[j],y[i+j]), shift);
+#else
+      sum = celt_inner_prod(x_lp, y+i, len>>1, arch);
+#endif
      xcorr[i] = MAX32(-1, sum);
+#ifdef FIXED_POINT
      maxcorr = MAX32(maxcorr, sum);
+#endif
   }
-   find_best_pitch(xcorr, maxcorr, y, shift, len>>1, max_pitch>>1, best_pitch);
+   find_best_pitch(xcorr, y, len>>1, max_pitch>>1, best_pitch
+#ifdef FIXED_POINT
+                   , shift+1, maxcorr
+#endif
+                   );

   /* Refine by pseudo-interpolation */
   if (best_pitch[0]>0 && best_pitch[0]<(max_pitch>>1)-1)
   {
-      celt_word32 a, b, c;
+      opus_val32 a, b, c;
      a = xcorr[best_pitch[0]-1];
      b = xcorr[best_pitch[0]];
      c = xcorr[best_pitch[0]+1];
@@ -231,7 +400,7 @@ void pitch_search(const celt_word16 * restrict x_lp, celt_word16 * restrict y,
         offset = 1;
      else if ((a-c) > MULT16_32_Q15(QCONST16(.7f,15),b-c))
         offset = -1;
-      else 
+      else
         offset = 0;
   } else {
      offset = 0;
@@ -241,59 +410,88 @@ void pitch_search(const celt_word16 * restrict x_lp, celt_word16 * restrict y,
   RESTORE_STACK;
 }

-#ifdef ENABLE_POSTFILTER
+#ifdef FIXED_POINT
+static opus_val16 compute_pitch_gain(opus_val32 xy, opus_val32 xx, opus_val32 yy)
+{ /* Fixed-point pitch gain from the cross term xy and the energies xx and yy, clamped to +/-Q15ONE */
+   opus_val32 x2y2;
+   int sx, sy, shift;
+   opus_val32 g;
+   opus_val16 den;
+   if (xy == 0 || xx == 0 || yy == 0)
+      return 0; /* any zero input gives a zero gain */
+   sx = celt_ilog2(xx)-14; /* shift that normalises xx (assumes celt_ilog2 is an integer log2 -- TODO confirm) */
+   sy = celt_ilog2(yy)-14; /* same for yy */
+   shift = sx + sy; /* total scaling taken out of the product xx*yy */
+   x2y2 = SHR32(MULT16_16(VSHR32(xx, sx), VSHR32(yy, sy)), 14); /* product of the two normalised energies */
+   if (shift & 1) { /* make shift even, because it is halved (shift>>1) after the square root below */
+      if (x2y2 < 32768)
+      {
+         x2y2 <<= 1;
+         shift--;
+      } else {
+         x2y2 >>= 1;
+         shift++;
+      }
+   }
+   den = celt_rsqrt_norm(x2y2); /* presumably a reciprocal square root of the normalised product -- confirm in mathops */
+   g = MULT16_32_Q15(den, xy);
+   g = VSHR32(g, (shift>>1)-1); /* compensate for the normalisation, using half of shift */
+   return EXTRACT16(MAX32(-Q15ONE, MIN32(g, Q15ONE))); /* saturate to [-Q15ONE, Q15ONE] */
+}
+#else
+static opus_val16 compute_pitch_gain(opus_val32 xy, opus_val32 xx, opus_val32 yy)
+{ /* Float pitch gain: xy/sqrt(1+xx*yy); the added 1 avoids a zero denominator when xx*yy is 0 */
+   return xy/celt_sqrt(1+xx*yy);
+}
+#endif
+
 static const int second_check[16] = {0, 0, 3, 2, 3, 2, 5, 2, 3, 2, 3, 2, 5, 2, 3, 2};
-celt_word16 remove_doubling(celt_word16 *x, int maxperiod, int minperiod,
-      int N, int *_T0, int prev_period, celt_word16 prev_gain)
+opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
+      int N, int *T0_, int prev_period, opus_val16 prev_gain, int arch)
 {
   int k, i, T, T0;
-   celt_word16 g, g0;
-   celt_word16 pg;
-   celt_word32 xy,xx,yy;
-   celt_word32 xcorr[3];
-   celt_word32 best_xy, best_yy;
+   opus_val16 g, g0;
+   opus_val16 pg;
+   opus_val32 xy,xx,yy,xy2;
+   opus_val32 xcorr[3];
+   opus_val32 best_xy, best_yy;
   int offset;
   int minperiod0;
+   VARDECL(opus_val32, yy_lookup);
+   SAVE_STACK;

   minperiod0 = minperiod;
   maxperiod /= 2;
   minperiod /= 2;
-   *_T0 /= 2;
+   *T0_ /= 2;
   prev_period /= 2;
   N /= 2;
   x += maxperiod;
-   if (*_T0>=maxperiod)
-      *_T0=maxperiod-1;
-
-   T = T0 = *_T0;
-   xx=xy=yy=0;
-   for (i=0;i<N;i++)
+   if (*T0_>=maxperiod)
+      *T0_=maxperiod-1;
+
+   T = T0 = *T0_;
+   ALLOC(yy_lookup, maxperiod+1, opus_val32);
+   dual_inner_prod(x, x, x-T0, N, &xx, &xy, arch);
+   yy_lookup[0] = xx;
+   yy=xx;
+   for (i=1;i<=maxperiod;i++)
   {
-      xy = MAC16_16(xy, x[i], x[i-T0]);
-      xx = MAC16_16(xx, x[i], x[i]);
-      yy = MAC16_16(yy, x[i-T0],x[i-T0]);
+      yy = yy+MULT16_16(x[-i],x[-i])-MULT16_16(x[N-i],x[N-i]);
+      yy_lookup[i] = MAX32(0, yy);
   }
+   yy = yy_lookup[T0];
   best_xy = xy;
   best_yy = yy;
-#ifdef FIXED_POINT
-      {
-         celt_word32 x2y2;
-         int sh, t;
-         x2y2 = 1+HALF32(MULT32_32_Q31(xx,yy));
-         sh = celt_ilog2(x2y2)>>1;
-         t = VSHR32(x2y2, 2*(sh-7));
-         g = g0 = VSHR32(MULT16_32_Q15(celt_rsqrt_norm(t), xy),sh+1);
-      }
-#else
-      g = g0 = xy/sqrt(1+xx*yy);
-#endif
+   g = g0 = compute_pitch_gain(xy, xx, yy);
   /* Look for any pitch at T/k */
   for (k=2;k<=15;k++)
   {
      int T1, T1b;
-      celt_word16 g1;
-      celt_word16 cont=0;
-      T1 = (2*T0+k)/(2*k);
+      opus_val16 g1;
+      opus_val16 cont=0;
+      opus_val16 thresh;
+      T1 = celt_udiv(2*T0+k, 2*k);
      if (T1 < minperiod)
         break;
      /* Look for another strong correlation at T1b */
@@ -305,36 +503,26 @@ celt_word16 remove_doubling(celt_word16 *x, int maxperiod, int minperiod,
            T1b = T0+T1;
      } else
      {
-         T1b = (2*second_check[k]*T0+k)/(2*k);
-      }
-      xy=yy=0;
-      for (i=0;i<N;i++)
-      {
-         xy = MAC16_16(xy, x[i], x[i-T1]);
-         yy = MAC16_16(yy, x[i-T1], x[i-T1]);
-
-         xy = MAC16_16(xy, x[i], x[i-T1b]);
-         yy = MAC16_16(yy, x[i-T1b], x[i-T1b]);
+         T1b = celt_udiv(2*second_check[k]*T0+k, 2*k);
      }
-#ifdef FIXED_POINT
-      {
-         celt_word32 x2y2;
-         int sh, t;
-         x2y2 = 1+MULT32_32_Q31(xx,yy);
-         sh = celt_ilog2(x2y2)>>1;
-         t = VSHR32(x2y2, 2*(sh-7));
-         g1 = VSHR32(MULT16_32_Q15(celt_rsqrt_norm(t), xy),sh+1);
-      }
-#else
-      g1 = xy/sqrt(1+2.f*xx*1.f*yy);
-#endif
+      dual_inner_prod(x, &x[-T1], &x[-T1b], N, &xy, &xy2, arch);
+      xy = HALF32(xy + xy2);
+      yy = HALF32(yy_lookup[T1] + yy_lookup[T1b]);
+      g1 = compute_pitch_gain(xy, xx, yy);
      if (abs(T1-prev_period)<=1)
         cont = prev_gain;
      else if (abs(T1-prev_period)<=2 && 5*k*k < T0)
-         cont = HALF32(prev_gain);
+         cont = HALF16(prev_gain);
      else
         cont = 0;
-      if (g1 > QCONST16(.3f,15) + MULT16_16_Q15(QCONST16(.4f,15),g0)-cont)
+      thresh = MAX16(QCONST16(.3f,15), MULT16_16_Q15(QCONST16(.7f,15),g0)-cont);
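+      /* Bias against very high pitch (very short period) to avoid false-positives due to short-term correlation.
+         NOTE(review): T1<2*minperiod implies T1<3*minperiod (for minperiod>=0), so the else-if below never runs. */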
+      if (T1<3*minperiod)
+         thresh = MAX16(QCONST16(.4f,15), MULT16_16_Q15(QCONST16(.85f,15),g0)-cont);
+      else if (T1<2*minperiod)
+         thresh = MAX16(QCONST16(.5f,15), MULT16_16_Q15(QCONST16(.9f,15),g0)-cont);
+      if (g1 > thresh)
      {
         best_xy = xy;
         best_yy = yy;
@@ -342,19 +530,14 @@ celt_word16 remove_doubling(celt_word16 *x, int maxperiod, int minperiod,
         g = g1;
      }
   }
+   best_xy = MAX32(0, best_xy);
   if (best_yy <= best_xy)
      pg = Q15ONE;
   else
      pg = SHR32(frac_div32(best_xy,best_yy+1),16);

   for (k=0;k<3;k++)
-   {
-      int T1 = T+k-1;
-      xy = 0;
-      for (i=0;i<N;i++)
-         xy = MAC16_16(xy, x[i], x[i-T1]);
-      xcorr[k] = xy;
-   }
+      xcorr[k] = celt_inner_prod(x, x-(T+k-1), N, arch);
   if ((xcorr[2]-xcorr[0]) > MULT16_32_Q15(QCONST16(.7f,15),xcorr[1]-xcorr[0]))
      offset = 1;
   else if ((xcorr[0]-xcorr[2]) > MULT16_32_Q15(QCONST16(.7f,15),xcorr[1]-xcorr[2]))
@@ -363,11 +546,10 @@ celt_word16 remove_doubling(celt_word16 *x, int maxperiod, int minperiod,
      offset = 0;
   if (pg > g)
      pg = g;
-   *_T0 = 2*T+offset;
+   *T0_ = 2*T+offset;

-   if (*_T0<minperiod0)
-      *_T0=minperiod0;
+   if (*T0_<minperiod0)
+      *T0_=minperiod0;
+   RESTORE_STACK;
   return pg;
 }
-
-#endif /* ENABLE_POSTFILTER */
--- a/celt/pitch.h
+++ b/celt/pitch.h
+/* Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2009 Xiph.Org Foundation
+   Written by Jean-Marc Valin */
+/**
+   @file pitch.h
+   @brief Pitch analysis
+ */
+
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef PITCH_H
+#define PITCH_H
+
+#include "modes.h"
+#include "cpu_support.h"
+
+#if (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)) \
+  || ((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT))
+#include "x86/pitch_sse.h"
+#endif
+
+#if defined(MIPSr1_ASM)
+#include "mips/pitch_mipsr1.h"
+#endif
+
+#if (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
+# include "arm/pitch_arm.h"
+#endif
+
+void pitch_downsample(celt_sig * OPUS_RESTRICT x[], opus_val16 * OPUS_RESTRICT x_lp,
+      int len, int C, int arch);
+
+void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTRICT y,
+                  int len, int max_pitch, int *pitch, int arch);
+
+opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
+      int N, int *T0, int prev_period, opus_val16 prev_gain, int arch);
+
+
+/* OPT: This is the kernel you really want to optimize. It gets used a lot
+   by the prefilter and by the PLC.
+   Accumulates four adjacent cross-correlation lags in a single pass:
+      sum[k] += x[0]*y[k] + x[1]*y[k+1] + ... + x[len-1]*y[len-1+k],  k=0..3
+   The products are added to whatever sum[] already holds, so the caller
+   has to initialise sum[] before the call.
+   x   : len input samples.
+   y   : len+3 input samples are read (y[0] .. y[len+2]).
+   sum : the four accumulators, updated in place.
+   len : number of products per lag; asserted to be at least 3. */
+static OPUS_INLINE void xcorr_kernel_c(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len)
+{
+   int j;
+   opus_val16 y_0, y_1, y_2, y_3;
+   celt_assert(len>=3);
+   y_3=0; /* gcc doesn't realize that y_3 can't be used uninitialized */
+   y_0=*y++; /* preload the first three y samples */
+   y_1=*y++;
+   y_2=*y++;
+   for (j=0;j<len-3;j+=4) /* four x samples per pass; y_0..y_3 rotate so each y sample is loaded once */
+   {
+      opus_val16 tmp;
+      tmp = *x++;
+      y_3=*y++;
+      sum[0] = MAC16_16(sum[0],tmp,y_0);
+      sum[1] = MAC16_16(sum[1],tmp,y_1);
+      sum[2] = MAC16_16(sum[2],tmp,y_2);
+      sum[3] = MAC16_16(sum[3],tmp,y_3);
+      tmp=*x++;
+      y_0=*y++;
+      sum[0] = MAC16_16(sum[0],tmp,y_1);
+      sum[1] = MAC16_16(sum[1],tmp,y_2);
+      sum[2] = MAC16_16(sum[2],tmp,y_3);
+      sum[3] = MAC16_16(sum[3],tmp,y_0);
+      tmp=*x++;
+      y_1=*y++;
+      sum[0] = MAC16_16(sum[0],tmp,y_2);
+      sum[1] = MAC16_16(sum[1],tmp,y_3);
+      sum[2] = MAC16_16(sum[2],tmp,y_0);
+      sum[3] = MAC16_16(sum[3],tmp,y_1);
+      tmp=*x++;
+      y_2=*y++;
+      sum[0] = MAC16_16(sum[0],tmp,y_3);
+      sum[1] = MAC16_16(sum[1],tmp,y_0);
+      sum[2] = MAC16_16(sum[2],tmp,y_1);
+      sum[3] = MAC16_16(sum[3],tmp,y_2);
+   }
+   if (j++<len) /* up to three leftover x samples when len is not a multiple of 4 */
+   {
+      opus_val16 tmp = *x++;
+      y_3=*y++;
+      sum[0] = MAC16_16(sum[0],tmp,y_0);
+      sum[1] = MAC16_16(sum[1],tmp,y_1);
+      sum[2] = MAC16_16(sum[2],tmp,y_2);
+      sum[3] = MAC16_16(sum[3],tmp,y_3);
+   }
+   if (j++<len) /* second leftover sample */
+   {
+      opus_val16 tmp=*x++;
+      y_0=*y++;
+      sum[0] = MAC16_16(sum[0],tmp,y_1);
+      sum[1] = MAC16_16(sum[1],tmp,y_2);
+      sum[2] = MAC16_16(sum[2],tmp,y_3);
+      sum[3] = MAC16_16(sum[3],tmp,y_0);
+   }
+   if (j<len) /* third and last leftover sample */
+   {
+      opus_val16 tmp=*x++;
+      y_1=*y++;
+      sum[0] = MAC16_16(sum[0],tmp,y_2);
+      sum[1] = MAC16_16(sum[1],tmp,y_3);
+      sum[2] = MAC16_16(sum[2],tmp,y_0);
+      sum[3] = MAC16_16(sum[3],tmp,y_1);
+   }
+}
+
+#ifndef OVERRIDE_XCORR_KERNEL
+#define xcorr_kernel(x, y, sum, len, arch) \
+    ((void)(arch),xcorr_kernel_c(x, y, sum, len))
+#endif /* OVERRIDE_XCORR_KERNEL */
+
+/* Computes two inner products that share the same first operand in one pass:
+      *xy1 = x[0]*y01[0] + ... + x[N-1]*y01[N-1]
+      *xy2 = x[0]*y02[0] + ... + x[N-1]*y02[N-1]
+   x, y01, y02 : N input samples each.
+   xy1, xy2    : outputs; both are written even when N<=0 (with 0). */
+static OPUS_INLINE void dual_inner_prod_c(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+      int N, opus_val32 *xy1, opus_val32 *xy2)
+{
+   int i;
+   opus_val32 xy01=0;
+   opus_val32 xy02=0;
+   for (i=0;i<N;i++)
+   {
+      xy01 = MAC16_16(xy01, x[i], y01[i]);
+      xy02 = MAC16_16(xy02, x[i], y02[i]);
+   }
+   *xy1 = xy01; /* results are stored only after the loop, from the local accumulators */
+   *xy2 = xy02;
+}
+
+#ifndef OVERRIDE_DUAL_INNER_PROD
+# define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) \
+    ((void)(arch),dual_inner_prod_c(x, y01, y02, N, xy1, xy2))
+#endif
+
+/*We make sure a C version is always available for cases where the overhead of
+  vectorization and passing around an arch flag isn't worth it.
+  Returns the inner product x[0]*y[0] + ... + x[N-1]*y[N-1], accumulated with
+  MAC16_16 in index order; the result is 0 when N<=0.*/
+static OPUS_INLINE opus_val32 celt_inner_prod_c(const opus_val16 *x,
+      const opus_val16 *y, int N)
+{
+   int i;
+   opus_val32 xy=0;
+   for (i=0;i<N;i++)
+      xy = MAC16_16(xy, x[i], y[i]);
+   return xy;
+}
+
+#if !defined(OVERRIDE_CELT_INNER_PROD)
+# define celt_inner_prod(x, y, N, arch) \
+    ((void)(arch),celt_inner_prod_c(x, y, N))
+#endif
+
+#ifdef NON_STATIC_COMB_FILTER_CONST_C
+void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N,
+     opus_val16 g10, opus_val16 g11, opus_val16 g12);
+#endif
+
+
+#ifdef FIXED_POINT
+opus_val32
+#else
+void
+#endif
+celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y,
+      opus_val32 *xcorr, int len, int max_pitch, int arch);
+
+#ifndef OVERRIDE_PITCH_XCORR
+# define celt_pitch_xcorr celt_pitch_xcorr_c
+#endif
+
+#ifdef NON_STATIC_COMB_FILTER_CONST_C
+void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N,
+                         opus_val16 g10, opus_val16 g11, opus_val16 g12);
+#endif
+
+#ifndef OVERRIDE_COMB_FILTER_CONST
+# define comb_filter_const(y, x, T, N, g10, g11, g12, arch) \
+    ((void)(arch),comb_filter_const_c(y, x, T, N, g10, g11, g12))
+#endif
+
+
+#endif
--- a/libcelt/quant_bands.c
+++ b/libcelt/quant_bands.c
@@ -5,19 +5,19 @@
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:
-   
+
   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
-   
+
   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.
-   
+
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
-   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
@@ -40,8 +40,8 @@
 #include "rate.h"

 #ifdef FIXED_POINT
-/* Mean energy in each band quantized in Q6 */
-static const signed char eMeans[25] = {
+/* Mean energy in each band quantized in Q4 */
+const signed char eMeans[25] = {
      103,100, 92, 85, 81,
       77, 72, 70, 78, 75,
       73, 71, 78, 74, 69,
@@ -49,8 +49,8 @@ static const signed char eMeans[25] = {
       60, 60, 60, 60, 60
 };
 #else
-/* Mean energy in each band quantized in Q6 and converted back to float */
-static const celt_word16 eMeans[25] = {
+/* Mean energy in each band quantized in Q4 and converted back to float */
+const opus_val16 eMeans[25] = {
      6.437500f, 6.250000f, 5.750000f, 5.312500f, 5.062500f,
      4.812500f, 4.500000f, 4.375000f, 4.875000f, 4.687500f,
      4.562500f, 4.437500f, 4.875000f, 4.625000f, 4.312500f,
@@ -60,13 +60,13 @@ static const celt_word16 eMeans[25] = {
 #endif
 /* prediction coefficients: 0.9, 0.8, 0.65, 0.5 */
 #ifdef FIXED_POINT
-static const celt_word16 pred_coef[4] = {29440, 26112, 21248, 16384};
-static const celt_word16 beta_coef[4] = {30147, 22282, 12124, 6554};
-static const celt_word16 beta_intra = 4915;
+static const opus_val16 pred_coef[4] = {29440, 26112, 21248, 16384};
+static const opus_val16 beta_coef[4] = {30147, 22282, 12124, 6554};
+static const opus_val16 beta_intra = 4915;
 #else
-static const celt_word16 pred_coef[4] = {29440/32768., 26112/32768., 21248/32768., 16384/32768.};
-static const celt_word16 beta_coef[4] = {30147/32768., 22282/32768., 12124/32768., 6554/32768.};
-static const celt_word16 beta_intra = 4915/32768.;
+static const opus_val16 pred_coef[4] = {29440/32768., 26112/32768., 21248/32768., 16384/32768.};
+static const opus_val16 beta_coef[4] = {30147/32768., 22282/32768., 12124/32768., 6554/32768.};
+static const opus_val16 beta_intra = 4915/32768.;
 #endif

 /*Parameters of the Laplace-like probability models used for the coarse energy.
@@ -139,32 +139,31 @@ static const unsigned char e_prob_model[4][2][42] = {

 static const unsigned char small_energy_icdf[3]={2,1,0};

-static int intra_decision(const celt_word16 *eBands, celt_word16 *oldEBands, int start, int end, int len, int C)
+static opus_val32 loss_distortion(const celt_glog *eBands, celt_glog *oldEBands, int start, int end, int len, int C)
 {
   int c, i;
-   celt_word32 dist = 0;
+   opus_val32 dist = 0;
   c=0; do {
      for (i=start;i<end;i++)
      {
-         celt_word16 d = SHR16(SUB16(eBands[i+c*len], oldEBands[i+c*len]),2);
+         celt_glog d = PSHR32(SUB32(eBands[i+c*len], oldEBands[i+c*len]), DB_SHIFT-7);
         dist = MAC16_16(dist, d,d);
      }
   } while (++c<C);
-   return SHR32(dist,2*DB_SHIFT-4) > 2*C*(end-start);
+   return MIN32(200,SHR32(dist,14));
 }

 static int quant_coarse_energy_impl(const CELTMode *m, int start, int end,
-      const celt_word16 *eBands, celt_word16 *oldEBands,
-      celt_int32 budget, celt_int32 tell,
-      const unsigned char *prob_model, celt_word16 *error, ec_enc *enc,
-      int _C, int LM, int intra, celt_word16 max_decay)
+      const celt_glog *eBands, celt_glog *oldEBands,
+      opus_int32 budget, opus_int32 tell,
+      const unsigned char *prob_model, celt_glog *error, ec_enc *enc,
+      int C, int LM, int intra, celt_glog max_decay, int lfe)
 {
-   const int C = CHANNELS(_C);
   int i, c;
   int badness = 0;
-   celt_word32 prev[2] = {0,0};
-   celt_word16 coef;
-   celt_word16 beta;
+   opus_val32 prev[2] = {0,0};
+   opus_val16 coef;
+   opus_val16 beta;

   if (tell+3 <= budget)
      ec_enc_bit_logp(enc, intra, 3);
@@ -184,30 +183,29 @@ static int quant_coarse_energy_impl(const CELTMode *m, int start, int end,
      do {
         int bits_left;
         int qi, qi0;
-         celt_word32 q;
-         celt_word16 x;
-         celt_word32 f, tmp;
-         celt_word16 oldE;
-         celt_word16 decay_bound;
+         opus_val32 q;
+         celt_glog x;
+         opus_val32 f, tmp;
+         celt_glog oldE;
+         celt_glog decay_bound;
         x = eBands[i+c*m->nbEBands];
-         oldE = MAX16(-QCONST16(9.f,DB_SHIFT), oldEBands[i+c*m->nbEBands]);
+         oldE = MAXG(-GCONST(9.f), oldEBands[i+c*m->nbEBands]);
 #ifdef FIXED_POINT
-         f = SHL32(EXTEND32(x),7) - PSHR32(MULT16_16(coef,oldE), 8) - prev[c];
+         f = x - MULT16_32_Q15(coef,oldE) - prev[c];
         /* Rounding to nearest integer here is really important! */
-         qi = (f+QCONST32(.5f,DB_SHIFT+7))>>(DB_SHIFT+7);
-         decay_bound = EXTRACT16(MAX32(-QCONST16(28.f,DB_SHIFT),
-               SUB32((celt_word32)oldEBands[i+c*m->nbEBands],max_decay)));
+         qi = (f+QCONST32(.5f,DB_SHIFT))>>DB_SHIFT;
+         decay_bound = MAXG(-GCONST(28.f), SUB32((opus_val32)oldEBands[i+c*m->nbEBands],max_decay));
 #else
         f = x-coef*oldE-prev[c];
         /* Rounding to nearest integer here is really important! */
         qi = (int)floor(.5f+f);
-         decay_bound = MAX16(-QCONST16(28.f,DB_SHIFT), oldEBands[i+c*m->nbEBands]) - max_decay;
+         decay_bound = MAXG(-GCONST(28.f), oldEBands[i+c*m->nbEBands]) - max_decay;
 #endif
         /* Prevent the energy from going down too quickly (e.g. for bands
            that have just one bin) */
         if (qi < 0 && x < decay_bound)
         {
-            qi += (int)SHR16(SUB16(decay_bound,x), DB_SHIFT);
+            qi += (int)SHR32(SUB32(decay_bound,x), DB_SHIFT);
            if (qi > 0)
               qi = 0;
         }
@@ -223,6 +221,8 @@ static int quant_coarse_energy_impl(const CELTMode *m, int start, int end,
            if (bits_left < 16)
               qi = IMAX(-1, qi);
         }
+         if (lfe && i>=2)
+            qi = IMIN(qi, 0);
         if (budget-tell >= 15)
         {
            int pi;
@@ -242,73 +242,76 @@ static int quant_coarse_energy_impl(const CELTMode *m, int start, int end,
         }
         else
            qi = -1;
-         error[i+c*m->nbEBands] = PSHR32(f,7) - SHL16(qi,DB_SHIFT);
+         error[i+c*m->nbEBands] = f - SHL32(qi,DB_SHIFT);
         badness += abs(qi0-qi);
-         q = SHL32(EXTEND32(qi),DB_SHIFT);
-         
-         tmp = PSHR32(MULT16_16(coef,oldE),8) + prev[c] + SHL32(q,7);
+         q = (opus_val32)SHL32(EXTEND32(qi),DB_SHIFT);
+
+         tmp = MULT16_32_Q15(coef,oldE) + prev[c] + q;
 #ifdef FIXED_POINT
-         tmp = MAX32(-QCONST32(28.f, DB_SHIFT+7), tmp);
+         tmp = MAX32(-GCONST(28.f), tmp);
 #endif
-         oldEBands[i+c*m->nbEBands] = PSHR32(tmp, 7);
-         prev[c] = prev[c] + SHL32(q,7) - MULT16_16(beta,PSHR32(q,8));
+         oldEBands[i+c*m->nbEBands] = tmp;
+         prev[c] = prev[c] + q - MULT16_32_Q15(beta,q);
      } while (++c < C);
   }
-   return badness;
+   return lfe ? 0 : badness;
 }

 void quant_coarse_energy(const CELTMode *m, int start, int end, int effEnd,
-      const celt_word16 *eBands, celt_word16 *oldEBands, celt_uint32 budget,
-      celt_word16 *error, ec_enc *enc, int _C, int LM, int nbAvailableBytes,
-      int force_intra, int *delayedIntra, int two_pass)
+      const celt_glog *eBands, celt_glog *oldEBands, opus_uint32 budget,
+      celt_glog *error, ec_enc *enc, int C, int LM, int nbAvailableBytes,
+      int force_intra, opus_val32 *delayedIntra, int two_pass, int loss_rate, int lfe)
 {
-   const int C = CHANNELS(_C);
   int intra;
-   celt_word16 max_decay;
-   VARDECL(celt_word16, oldEBands_intra);
-   VARDECL(celt_word16, error_intra);
+   celt_glog max_decay;
+   VARDECL(celt_glog, oldEBands_intra);
+   VARDECL(celt_glog, error_intra);
   ec_enc enc_start_state;
-   celt_uint32 tell;
+   opus_uint32 tell;
   int badness1=0;
+   opus_int32 intra_bias;
+   opus_val32 new_distortion;
   SAVE_STACK;

-   intra = force_intra || (*delayedIntra && nbAvailableBytes > (end-start)*C);
-   if (/*shortBlocks || */intra_decision(eBands, oldEBands, start, effEnd, m->nbEBands, C))
-      *delayedIntra = 1;
-   else
-      *delayedIntra = 0;
+   intra = force_intra || (!two_pass && *delayedIntra>2*C*(end-start) && nbAvailableBytes > (end-start)*C);
+   intra_bias = (opus_int32)((budget**delayedIntra*loss_rate)/(C*512));
+   new_distortion = loss_distortion(eBands, oldEBands, start, effEnd, m->nbEBands, C);

   tell = ec_tell(enc);
   if (tell+3 > budget)
      two_pass = intra = 0;

-   /* Encode the global flags using a simple probability model
-      (first symbols in the stream) */
-
+   max_decay = GCONST(16.f);
+   if (end-start>10)
+   {
 #ifdef FIXED_POINT
-      max_decay = MIN32(QCONST16(16.f,DB_SHIFT), SHL32(EXTEND32(nbAvailableBytes),DB_SHIFT-3));
+      max_decay = SHL32(MIN32(SHR32(max_decay,DB_SHIFT-3), EXTEND32(nbAvailableBytes)),DB_SHIFT-3);
 #else
-   max_decay = MIN32(16.f, .125f*nbAvailableBytes);
+      max_decay = MIN32(max_decay, .125f*nbAvailableBytes);
 #endif
-
+   }
+   if (lfe)
+      max_decay = GCONST(3.f);
   enc_start_state = *enc;

-   ALLOC(oldEBands_intra, C*m->nbEBands, celt_word16);
-   ALLOC(error_intra, C*m->nbEBands, celt_word16);
-   CELT_COPY(oldEBands_intra, oldEBands, C*m->nbEBands);
+   ALLOC(oldEBands_intra, C*m->nbEBands, celt_glog);
+   ALLOC(error_intra, C*m->nbEBands, celt_glog);
+   OPUS_COPY(oldEBands_intra, oldEBands, C*m->nbEBands);

   if (two_pass || intra)
   {
      badness1 = quant_coarse_energy_impl(m, start, end, eBands, oldEBands_intra, budget,
-            tell, e_prob_model[LM][1], error_intra, enc, C, LM, 1, max_decay);
+            tell, e_prob_model[LM][1], error_intra, enc, C, LM, 1, max_decay, lfe);
   }

   if (!intra)
   {
+      unsigned char *intra_buf;
      ec_enc enc_intra_state;
-      int tell_intra;
-      celt_uint32 nstart_bytes;
-      celt_uint32 nintra_bytes;
+      opus_int32 tell_intra;
+      opus_uint32 nstart_bytes;
+      opus_uint32 nintra_bytes;
+      opus_uint32 save_bytes;
      int badness2;
      VARDECL(unsigned char, intra_bits);

@@ -318,51 +321,59 @@ void quant_coarse_energy(const CELTMode *m, int start, int end, int effEnd,

      nstart_bytes = ec_range_bytes(&enc_start_state);
      nintra_bytes = ec_range_bytes(&enc_intra_state);
-      ALLOC(intra_bits, nintra_bytes-nstart_bytes, unsigned char);
+      intra_buf = ec_get_buffer(&enc_intra_state) + nstart_bytes;
+      save_bytes = nintra_bytes-nstart_bytes;
+      if (save_bytes == 0)
+         save_bytes = ALLOC_NONE;
+      ALLOC(intra_bits, save_bytes, unsigned char);
      /* Copy bits from intra bit-stream */
-      CELT_COPY(intra_bits,
-            ec_get_buffer(&enc_intra_state) + nstart_bytes,
-            nintra_bytes - nstart_bytes);
+      OPUS_COPY(intra_bits, intra_buf, nintra_bytes - nstart_bytes);

      *enc = enc_start_state;

      badness2 = quant_coarse_energy_impl(m, start, end, eBands, oldEBands, budget,
-            tell, e_prob_model[LM][intra], error, enc, C, LM, 0, max_decay);
+            tell, e_prob_model[LM][intra], error, enc, C, LM, 0, max_decay, lfe);

-      if (two_pass && (badness1 < badness2 || (badness1 == badness2 && ec_tell_frac(enc) > tell_intra)))
+      if (two_pass && (badness1 < badness2 || (badness1 == badness2 && ((opus_int32)ec_tell_frac(enc))+intra_bias > tell_intra)))
      {
         *enc = enc_intra_state;
         /* Copy intra bits to bit-stream */
-         CELT_COPY(ec_get_buffer(&enc_intra_state) + nstart_bytes,
-               intra_bits, nintra_bytes - nstart_bytes);
-         CELT_COPY(oldEBands, oldEBands_intra, C*m->nbEBands);
-         CELT_COPY(error, error_intra, C*m->nbEBands);
+         OPUS_COPY(intra_buf, intra_bits, nintra_bytes - nstart_bytes);
+         OPUS_COPY(oldEBands, oldEBands_intra, C*m->nbEBands);
+         OPUS_COPY(error, error_intra, C*m->nbEBands);
+         intra = 1;
      }
   } else {
-      CELT_COPY(oldEBands, oldEBands_intra, C*m->nbEBands);
-      CELT_COPY(error, error_intra, C*m->nbEBands);
+      OPUS_COPY(oldEBands, oldEBands_intra, C*m->nbEBands);
+      OPUS_COPY(error, error_intra, C*m->nbEBands);
   }
+
+   if (intra)
+      *delayedIntra = new_distortion;
+   else
+      *delayedIntra = ADD32(MULT16_32_Q15(MULT16_16_Q15(pred_coef[LM], pred_coef[LM]),*delayedIntra),
+            new_distortion);
+
   RESTORE_STACK;
 }

-void quant_fine_energy(const CELTMode *m, int start, int end, celt_word16 *oldEBands, celt_word16 *error, int *fine_quant, ec_enc *enc, int _C)
+void quant_fine_energy(const CELTMode *m, int start, int end, celt_glog *oldEBands, celt_glog *error, int *fine_quant, ec_enc *enc, int C)
 {
   int i, c;
-   const int C = CHANNELS(_C);

   /* Encode finer resolution */
   for (i=start;i<end;i++)
   {
-      celt_int16 frac = 1<<fine_quant[i];
+      opus_int16 frac = 1<<fine_quant[i];
      if (fine_quant[i] <= 0)
         continue;
      c=0;
      do {
         int q2;
-         celt_word16 offset;
+         celt_glog offset;
 #ifdef FIXED_POINT
         /* Has to be without rounding */
-         q2 = (error[i+c*m->nbEBands]+QCONST16(.5f,DB_SHIFT))>>(DB_SHIFT-fine_quant[i]);
+         q2 = (error[i+c*m->nbEBands]+GCONST(.5f))>>(DB_SHIFT-fine_quant[i]);
 #else
         q2 = (int)floor((error[i+c*m->nbEBands]+.5f)*frac);
 #endif
@@ -372,7 +383,7 @@ void quant_fine_energy(const CELTMode *m, int start, int end, celt_word16 *oldEB
            q2 = 0;
         ec_enc_bits(enc, q2, fine_quant[i]);
 #ifdef FIXED_POINT
-         offset = SUB16(SHR32(SHL32(EXTEND32(q2),DB_SHIFT)+QCONST16(.5f,DB_SHIFT),fine_quant[i]),QCONST16(.5f,DB_SHIFT));
+         offset = SUB32(VSHR32(2*q2+1, fine_quant[i]-DB_SHIFT+1), GCONST(.5f));
 #else
         offset = (q2+.5f)*(1<<(14-fine_quant[i]))*(1.f/16384) - .5f;
 #endif
@@ -383,10 +394,9 @@ void quant_fine_energy(const CELTMode *m, int start, int end, celt_word16 *oldEB
   }
 }

-void quant_energy_finalise(const CELTMode *m, int start, int end, celt_word16 *oldEBands, celt_word16 *error, int *fine_quant, int *fine_priority, int bits_left, ec_enc *enc, int _C)
+void quant_energy_finalise(const CELTMode *m, int start, int end, celt_glog *oldEBands, celt_glog *error, int *fine_quant, int *fine_priority, int bits_left, ec_enc *enc, int C)
 {
   int i, prio, c;
-   const int C = CHANNELS(_C);

   /* Use up the remaining bits */
   for (prio=0;prio<2;prio++)
@@ -398,32 +408,31 @@ void quant_energy_finalise(const CELTMode *m, int start, int end, celt_word16 *o
         c=0;
         do {
            int q2;
-            celt_word16 offset;
+            celt_glog offset;
            q2 = error[i+c*m->nbEBands]<0 ? 0 : 1;
            ec_enc_bits(enc, q2, 1);
 #ifdef FIXED_POINT
-            offset = SHR16(SHL16(q2,DB_SHIFT)-QCONST16(.5f,DB_SHIFT),fine_quant[i]+1);
+            offset = SHR32(SHL32(q2,DB_SHIFT)-GCONST(.5f),fine_quant[i]+1);
 #else
            offset = (q2-.5f)*(1<<(14-fine_quant[i]-1))*(1.f/16384);
 #endif
            oldEBands[i+c*m->nbEBands] += offset;
+            error[i+c*m->nbEBands] -= offset;
            bits_left--;
         } while (++c < C);
      }
   }
 }

-void unquant_coarse_energy(const CELTMode *m, int start, int end, celt_word16 *oldEBands, int intra, ec_dec *dec, int _C, int LM)
+void unquant_coarse_energy(const CELTMode *m, int start, int end, celt_glog *oldEBands, int intra, ec_dec *dec, int C, int LM)
 {
   const unsigned char *prob_model = e_prob_model[LM][intra];
   int i, c;
-   celt_word32 prev[2] = {0, 0};
-   celt_word16 coef;
-   celt_word16 beta;
-   const int C = CHANNELS(_C);
-   celt_int32 budget;
-   celt_int32 tell;
-
+   opus_val64 prev[2] = {0, 0};
+   opus_val16 coef;
+   opus_val16 beta;
+   opus_int32 budget;
+   opus_int32 tell;

   if (intra)
   {
@@ -442,8 +451,12 @@ void unquant_coarse_energy(const CELTMode *m, int start, int end, celt_word16 *o
      c=0;
      do {
         int qi;
-         celt_word32 q;
-         celt_word32 tmp;
+         opus_val32 q;
+         opus_val32 tmp;
+         /* It would be better to express this invariant as a
+            test on C at function entry, but that isn't enough
+            to make the static analyzer happy. */
+         celt_sig_assert(c<2);
         tell = ec_tell(dec);
         if(budget-tell>=15)
         {
@@ -463,35 +476,34 @@ void unquant_coarse_energy(const CELTMode *m, int start, int end, celt_word16 *o
         }
         else
            qi = -1;
-         q = SHL32(EXTEND32(qi),DB_SHIFT);
+         q = (opus_val32)SHL32(EXTEND32(qi),DB_SHIFT);

-         oldEBands[i+c*m->nbEBands] = MAX16(-QCONST16(9.f,DB_SHIFT), oldEBands[i+c*m->nbEBands]);
-         tmp = PSHR32(MULT16_16(coef,oldEBands[i+c*m->nbEBands]),8) + prev[c] + SHL32(q,7);
+         oldEBands[i+c*m->nbEBands] = MAXG(-GCONST(9.f), oldEBands[i+c*m->nbEBands]);
+         tmp = MULT16_32_Q15(coef,oldEBands[i+c*m->nbEBands]) + prev[c] + q;
 #ifdef FIXED_POINT
-         tmp = MAX32(-QCONST32(28.f, DB_SHIFT+7), tmp);
+         tmp = MIN32(GCONST(28.f), MAX32(-GCONST(28.f), tmp));
 #endif
-         oldEBands[i+c*m->nbEBands] = PSHR32(tmp, 7);
-         prev[c] = prev[c] + SHL32(q,7) - MULT16_16(beta,PSHR32(q,8));
+         oldEBands[i+c*m->nbEBands] = tmp;
+         prev[c] = prev[c] + q - MULT16_32_Q15(beta,q);
      } while (++c < C);
   }
 }

-void unquant_fine_energy(const CELTMode *m, int start, int end, celt_word16 *oldEBands, int *fine_quant, ec_dec *dec, int _C)
+void unquant_fine_energy(const CELTMode *m, int start, int end, celt_glog *oldEBands, int *fine_quant, ec_dec *dec, int C)
 {
   int i, c;
-   const int C = CHANNELS(_C);
   /* Decode finer resolution */
   for (i=start;i<end;i++)
   {
      if (fine_quant[i] <= 0)
         continue;
-      c=0; 
+      c=0;
      do {
         int q2;
-         celt_word16 offset;
+         celt_glog offset;
         q2 = ec_dec_bits(dec, fine_quant[i]);
 #ifdef FIXED_POINT
-         offset = SUB16(SHR32(SHL32(EXTEND32(q2),DB_SHIFT)+QCONST16(.5f,DB_SHIFT),fine_quant[i]),QCONST16(.5f,DB_SHIFT));
+         offset = SUB32(VSHR32(2*q2+1, fine_quant[i]-DB_SHIFT+1), GCONST(.5f));
 #else
         offset = (q2+.5f)*(1<<(14-fine_quant[i]))*(1.f/16384) - .5f;
 #endif
@@ -500,10 +512,9 @@ void unquant_fine_energy(const CELTMode *m, int start, int end, celt_word16 *old
   }
 }

-void unquant_energy_finalise(const CELTMode *m, int start, int end, celt_word16 *oldEBands, int *fine_quant,  int *fine_priority, int bits_left, ec_dec *dec, int _C)
+void unquant_energy_finalise(const CELTMode *m, int start, int end, celt_glog *oldEBands, int *fine_quant,  int *fine_priority, int bits_left, ec_dec *dec, int C)
 {
   int i, prio, c;
-   const int C = CHANNELS(_C);

   /* Use up the remaining bits */
   for (prio=0;prio<2;prio++)
@@ -515,10 +526,10 @@ void unquant_energy_finalise(const CELTMode *m, int start, int end, celt_word16
         c=0;
         do {
            int q2;
-            celt_word16 offset;
+            celt_glog offset;
            q2 = ec_dec_bits(dec, 1);
 #ifdef FIXED_POINT
-            offset = SHR16(SHL16(q2,DB_SHIFT)-QCONST16(.5f,DB_SHIFT),fine_quant[i]+1);
+            offset = SHR32(SHL32(q2,DB_SHIFT)-GCONST(.5f),fine_quant[i]+1);
 #else
            offset = (q2-.5f)*(1<<(14-fine_quant[i]-1))*(1.f/16384);
 #endif
@@ -529,38 +540,23 @@ void unquant_energy_finalise(const CELTMode *m, int start, int end, celt_word16
   }
 }

-void log2Amp(const CELTMode *m, int start, int end,
-      celt_ener *eBands, celt_word16 *oldEBands, int _C)
-{
-   int c, i;
-   const int C = CHANNELS(_C);
-   c=0;
-   do {
-      for (i=0;i<start;i++)
-         eBands[i+c*m->nbEBands] = 0;
-      for (;i<end;i++)
-      {
-         celt_word16 lg = ADD16(oldEBands[i+c*m->nbEBands],
-                         SHL16((celt_word16)eMeans[i],6));
-         eBands[i+c*m->nbEBands] = PSHR32(celt_exp2(lg),4);
-      }
-      for (;i<m->nbEBands;i++)
-         eBands[i+c*m->nbEBands] = 0;
-   } while (++c < C);
-}
-
 void amp2Log2(const CELTMode *m, int effEnd, int end,
-      celt_ener *bandE, celt_word16 *bandLogE, int _C)
+      celt_ener *bandE, celt_glog *bandLogE, int C)
 {
   int c, i;
-   const int C = CHANNELS(_C);
   c=0;
   do {
      for (i=0;i<effEnd;i++)
+      {
         bandLogE[i+c*m->nbEBands] =
-               celt_log2(SHL32(bandE[i+c*m->nbEBands],2))
-               - SHL16((celt_word16)eMeans[i],6);
+               celt_log2_db(bandE[i+c*m->nbEBands])
+               - SHL32((celt_glog)eMeans[i],DB_SHIFT-4);
+#ifdef FIXED_POINT
+         /* Compensate for bandE[] being Q12 but celt_log2() taking a Q14 input. */
+         bandLogE[i+c*m->nbEBands] += GCONST(2.f);
+#endif
+      }
      for (i=effEnd;i<end;i++)
-         bandLogE[c*m->nbEBands+i] = -QCONST16(14.f,DB_SHIFT);
+         bandLogE[c*m->nbEBands+i] = -GCONST(14.f);
   } while (++c < C);
 }
--- a/libcelt/quant_bands.h
+++ b/libcelt/quant_bands.h
@@ -5,19 +5,19 @@
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:
-   
+
   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
-   
+
   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.
-   
+
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
-   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
@@ -35,28 +35,32 @@
 #include "entdec.h"
 #include "mathops.h"

+#ifdef FIXED_POINT
+extern const signed char eMeans[25];
+#else
+extern const opus_val16 eMeans[25];
+#endif
+
 void amp2Log2(const CELTMode *m, int effEnd, int end,
-      celt_ener *bandE, celt_word16 *bandLogE, int _C);
+      celt_ener *bandE, celt_glog *bandLogE, int C);

 void log2Amp(const CELTMode *m, int start, int end,
-      celt_ener *eBands, celt_word16 *oldEBands, int _C);
-
-unsigned char *quant_prob_alloc(const CELTMode *m);
-void quant_prob_free(const celt_int16 *freq);
+      celt_ener *eBands, const celt_glog *oldEBands, int C);

 void quant_coarse_energy(const CELTMode *m, int start, int end, int effEnd,
-      const celt_word16 *eBands, celt_word16 *oldEBands, celt_uint32 budget,
-      celt_word16 *error, ec_enc *enc, int _C, int LM,
-      int nbAvailableBytes, int force_intra, int *delayedIntra, int two_pass);
+      const celt_glog *eBands, celt_glog *oldEBands, opus_uint32 budget,
+      celt_glog *error, ec_enc *enc, int C, int LM,
+      int nbAvailableBytes, int force_intra, opus_val32 *delayedIntra,
+      int two_pass, int loss_rate, int lfe);

-void quant_fine_energy(const CELTMode *m, int start, int end, celt_word16 *oldEBands, celt_word16 *error, int *fine_quant, ec_enc *enc, int _C);
+void quant_fine_energy(const CELTMode *m, int start, int end, celt_glog *oldEBands, celt_glog *error, int *fine_quant, ec_enc *enc, int C);

-void quant_energy_finalise(const CELTMode *m, int start, int end, celt_word16 *oldEBands, celt_word16 *error, int *fine_quant, int *fine_priority, int bits_left, ec_enc *enc, int _C);
+void quant_energy_finalise(const CELTMode *m, int start, int end, celt_glog *oldEBands, celt_glog *error, int *fine_quant, int *fine_priority, int bits_left, ec_enc *enc, int C);

-void unquant_coarse_energy(const CELTMode *m, int start, int end, celt_word16 *oldEBands, int intra, ec_dec *dec, int _C, int LM);
+void unquant_coarse_energy(const CELTMode *m, int start, int end, celt_glog *oldEBands, int intra, ec_dec *dec, int C, int LM);

-void unquant_fine_energy(const CELTMode *m, int start, int end, celt_word16 *oldEBands, int *fine_quant, ec_dec *dec, int _C);
+void unquant_fine_energy(const CELTMode *m, int start, int end, celt_glog *oldEBands, int *fine_quant, ec_dec *dec, int C);

-void unquant_energy_finalise(const CELTMode *m, int start, int end, celt_word16 *oldEBands, int *fine_quant, int *fine_priority, int bits_left, ec_dec *dec, int _C);
+void unquant_energy_finalise(const CELTMode *m, int start, int end, celt_glog *oldEBands, int *fine_quant, int *fine_priority, int bits_left, ec_dec *dec, int C);

 #endif /* QUANT_BANDS */
--- a/libcelt/rate.c
+++ b/libcelt/rate.c
@@ -5,19 +5,19 @@
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:
-   
+
   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
-   
+
   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.
-   
+
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
-   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
@@ -39,7 +39,6 @@
 #include "entcode.h"
 #include "rate.h"

-
 static const unsigned char LOG2_FRAC_TABLE[24]={
   0,
   8,13,
@@ -54,10 +53,10 @@ static const unsigned char LOG2_FRAC_TABLE[24]={
  N and K are themselves limited to 15 bits.*/
 static int fits_in32(int _n, int _k)
 {
-   static const celt_int16 maxN[15] = {
+   static const opus_int16 maxN[15] = {
      32767, 32767, 32767, 1476, 283, 109,  60,  40,
       29,  24,  20,  18,  16,  14,  13};
-   static const celt_int16 maxK[15] = {
+   static const opus_int16 maxK[15] = {
      32767, 32767, 32767, 32767, 1172, 238,  95,  53,
       36,  27,  22,  18,  16,  15,  13};
   if (_n>=14)
@@ -79,13 +78,13 @@ void compute_pulse_cache(CELTMode *m, int LM)
   int curr=0;
   int nbEntries=0;
   int entryN[100], entryK[100], entryI[100];
-   const celt_int16 *eBands = m->eBands;
+   const opus_int16 *eBands = m->eBands;
   PulseCache *cache = &m->cache;
-   celt_int16 *cindex;
+   opus_int16 *cindex;
   unsigned char *bits;
   unsigned char *cap;

-   cindex = celt_alloc(sizeof(cache->index[0])*m->nbEBands*(LM+2));
+   cindex = (opus_int16 *)opus_alloc(sizeof(cache->index[0])*m->nbEBands*(LM+2));
   cache->index = cindex;

   /* Scan for all unique band sizes */
@@ -125,14 +124,14 @@ void compute_pulse_cache(CELTMode *m, int LM)
         }
      }
   }
-   bits = celt_alloc(sizeof(unsigned char)*curr);
+   bits = (unsigned char *)opus_alloc(sizeof(unsigned char)*curr);
   cache->bits = bits;
   cache->size = curr;
   /* Compute the cache for all unique sizes */
   for (i=0;i<nbEntries;i++)
   {
      unsigned char *ptr = bits+entryI[i];
-      celt_int16 tmp[MAX_PULSES+1];
+      opus_int16 tmp[CELT_MAX_PULSES+1];
      get_required_bits(tmp, entryN[i], get_pulses(entryK[i]), BITRES);
      for (j=1;j<=entryK[i];j++)
         ptr[j] = tmp[get_pulses(j)]-1;
@@ -141,7 +140,7 @@ void compute_pulse_cache(CELTMode *m, int LM)

   /* Compute the maximum rate for each band at which we'll reliably use as
       many bits as we ask for. */
-   cache->caps = cap = celt_alloc(sizeof(cache->caps[0])*(LM+1)*2*m->nbEBands);
+   cache->caps = cap = (unsigned char *)opus_alloc(sizeof(cache->caps[0])*(LM+1)*2*m->nbEBands);
   for (i=0;i<=LM;i++)
   {
      for (C=1;C<=2;C++)
@@ -157,8 +156,8 @@ void compute_pulse_cache(CELTMode *m, int LM)
            else
            {
               const unsigned char *pcache;
-               celt_int32           num;
-               celt_int32           den;
+               opus_int32           num;
+               opus_int32           den;
               int                  LM0;
               int                  N;
               int                  offset;
@@ -166,9 +165,9 @@ void compute_pulse_cache(CELTMode *m, int LM)
               int                  qb;
               int                  k;
               LM0 = 0;
-               /* Even-sized bands bigger than N=2 can be split one more
-                   time. */
-               if (N0 > 2 && !(N0&1))
+               /* Even-sized bands bigger than N=2 can be split one more time.
+                  As of commit 44203907 all bands >1 are even, including custom modes.*/
+               if (N0 > 2)
               {
                  N0>>=1;
                  LM0--;
@@ -190,13 +189,13 @@ void compute_pulse_cache(CELTMode *m, int LM)
                  /* Offset the number of qtheta bits by log2(N)/2
                      + QTHETA_OFFSET compared to their "fair share" of
                      total/N */
-                  offset = (m->logN[j]+(LM0+k<<BITRES)>>1)-QTHETA_OFFSET;
+                  offset = ((m->logN[j]+(opus_int32)((opus_uint32)(LM0+k)<<BITRES))>>1)-QTHETA_OFFSET;
                  /* The number of qtheta bits we'll allocate if the remainder
                      is to be max_bits.
                     The average measured cost for theta is 0.89701 times qb,
                      approximated here as 459/512. */
-                  num=459*(celt_int32)((2*N-1)*offset+max_bits);
-                  den=((celt_int32)(2*N-1)<<9)-459;
+                  num=459*(opus_int32)((2*N-1)*offset+max_bits);
+                  den=((opus_int32)(2*N-1)<<9)-459;
                  qb = IMIN((num+(den>>1))/den, 57);
                  celt_assert(qb >= 0);
                  max_bits += qb;
@@ -206,12 +205,12 @@ void compute_pulse_cache(CELTMode *m, int LM)
               if (C==2)
               {
                  max_bits <<= 1;
-                  offset = (m->logN[j]+(i<<BITRES)>>1)-(N==2?QTHETA_OFFSET_TWOPHASE:QTHETA_OFFSET);
+                  offset = ((m->logN[j]+(i<<BITRES))>>1)-(N==2?QTHETA_OFFSET_TWOPHASE:QTHETA_OFFSET);
                  ndof = 2*N-1-(N==2);
                  /* The average measured cost for theta with the step PDF is
                      0.95164 times qb, approximated here as 487/512. */
-                  num = (N==2?512:487)*(celt_int32)(max_bits+ndof*offset);
-                  den = ((celt_int32)ndof<<9)-(N==2?512:487);
+                  num = (N==2?512:487)*(opus_int32)(max_bits+ndof*offset);
+                  den = ((opus_int32)ndof<<9)-(N==2?512:487);
                  qb = IMIN((num+(den>>1))/den, (N==2?64:61));
                  celt_assert(qb >= 0);
                  max_bits += qb;
@@ -221,19 +220,19 @@ void compute_pulse_cache(CELTMode *m, int LM)
               ndof = C*N + ((C==2 && N>2) ? 1 : 0);
               /* Offset the number of fine bits by log2(N)/2 + FINE_OFFSET
                   compared to their "fair share" of total/N */
-               offset = (m->logN[j] + (i<<BITRES)>>1)-FINE_OFFSET;
+               offset = ((m->logN[j] + (i<<BITRES))>>1)-FINE_OFFSET;
               /* N=2 is the only point that doesn't match the curve */
               if (N==2)
                  offset += 1<<BITRES>>2;
               /* The number of fine bits we'll allocate if the remainder is
                   to be max_bits. */
               num = max_bits+ndof*offset;
-               den = ndof-1<<BITRES;
+               den = (ndof-1)<<BITRES;
               qb = IMIN((num+(den>>1))/den, MAX_FINE_BITS);
               celt_assert(qb >= 0);
               max_bits += C*qb<<BITRES;
            }
-            max_bits = (4*max_bits/(C*(m->eBands[j+1]-m->eBands[j]<<i)))-64;
+            max_bits = (4*max_bits/(C*((m->eBands[j+1]-m->eBands[j])<<i)))-64;
            celt_assert(max_bits >= 0);
            celt_assert(max_bits < 256);
            *cap++ = (unsigned char)max_bits;
@@ -244,25 +243,23 @@ void compute_pulse_cache(CELTMode *m, int LM)

 #endif /* CUSTOM_MODES */

-
 #define ALLOC_STEPS 6

-static inline int interp_bits2pulses(const CELTMode *m, int start, int end, int skip_start,
-      const int *bits1, const int *bits2, const int *thresh, const int *cap, celt_int32 total, celt_int32 *_balance,
+static OPUS_INLINE int interp_bits2pulses(const CELTMode *m, int start, int end, int skip_start,
+      const int *bits1, const int *bits2, const int *thresh, const int *cap, opus_int32 total, opus_int32 *_balance,
      int skip_rsv, int *intensity, int intensity_rsv, int *dual_stereo, int dual_stereo_rsv, int *bits,
-      int *ebits, int *fine_priority, int _C, int LM, ec_ctx *ec, int encode, int prev)
+      int *ebits, int *fine_priority, int C, int LM, ec_ctx *ec, int encode, int prev, int signalBandwidth)
 {
-   celt_int32 psum;
+   opus_int32 psum;
   int lo, hi;
   int i, j;
   int logM;
-   const int C = CHANNELS(_C);
   int stereo;
   int codedBands=-1;
   int alloc_floor;
-   celt_int32 left, percoeff;
+   opus_int32 left, percoeff;
   int done;
-   int balance;
+   opus_int32 balance;
   SAVE_STACK;

   alloc_floor = C<<BITRES;
@@ -278,7 +275,7 @@ static inline int interp_bits2pulses(const CELTMode *m, int start, int end, int
      done = 0;
      for (j=end;j-->start;)
      {
-         int tmp = bits1[j] + (mid*(celt_int32)bits2[j]>>ALLOC_STEPS);
+         int tmp = bits1[j] + (mid*(opus_int32)bits2[j]>>ALLOC_STEPS);
         if (tmp >= thresh[j] || done)
         {
            done = 1;
@@ -299,7 +296,7 @@ static inline int interp_bits2pulses(const CELTMode *m, int start, int end, int
   done = 0;
   for (j=end;j-->start;)
   {
-      int tmp = bits1[j] + (lo*bits2[j]>>ALLOC_STEPS);
+      int tmp = bits1[j] + ((opus_int32)lo*bits2[j]>>ALLOC_STEPS);
      if (tmp < thresh[j] && !done)
      {
         if (tmp >= alloc_floor)
@@ -336,7 +333,7 @@ static inline int interp_bits2pulses(const CELTMode *m, int start, int end, int
      /*Figure out how many left-over bits we would be adding to this band.
        This can include bits we've stolen back from higher, skipped bands.*/
      left = total-psum;
-      percoeff = left/(m->eBands[codedBands]-m->eBands[start]);
+      percoeff = celt_udiv(left, m->eBands[codedBands]-m->eBands[start]);
      left -= (m->eBands[codedBands]-m->eBands[start])*percoeff;
      rem = IMAX(left-(m->eBands[j]-m->eBands[start]),0);
      band_width = m->eBands[codedBands]-m->eBands[j];
@@ -351,9 +348,20 @@ static inline int interp_bits2pulses(const CELTMode *m, int start, int end, int
            /*This if() block is the only part of the allocation function that
               is not a mandatory part of the bitstream: any bands we choose to
               skip here must be explicitly signaled.*/
-            /*Choose a threshold with some hysteresis to keep bands from
-               fluctuating in and out.*/
-            if (band_bits > ((j<prev?7:9)*band_width<<LM<<BITRES)>>4)
+            int depth_threshold;
+            /*We choose a threshold with some hysteresis to keep bands from
+               fluctuating in and out, but we try not to fold below a certain point. */
+            if (codedBands > 17)
+               depth_threshold = j<prev ? 7 : 9;
+            else
+               depth_threshold = 0;
+#ifdef FUZZING
+            (void)signalBandwidth;
+            (void)depth_threshold;
+            if ((rand()&0x1) == 0)
+#else
+            if (codedBands<=start+2 || (band_bits > (depth_threshold*band_width<<LM<<BITRES)>>4 && j<=signalBandwidth))
+#endif
            {
               ec_enc_bit_logp(ec, 1, 1);
               break;
@@ -413,7 +421,7 @@ static inline int interp_bits2pulses(const CELTMode *m, int start, int end, int

   /* Allocate the remaining bits */
   left = total-psum;
-   percoeff = left/(m->eBands[codedBands]-m->eBands[start]);
+   percoeff = celt_udiv(left, m->eBands[codedBands]-m->eBands[start]);
   left -= (m->eBands[codedBands]-m->eBands[start])*percoeff;
   for (j=start;j<codedBands;j++)
      bits[j] += ((int)percoeff*(m->eBands[j+1]-m->eBands[j]));
@@ -431,17 +439,17 @@ static inline int interp_bits2pulses(const CELTMode *m, int start, int end, int
      int N0, N, den;
      int offset;
      int NClogN;
-      int excess;
+      opus_int32 excess, bit;

      celt_assert(bits[j] >= 0);
      N0 = m->eBands[j+1]-m->eBands[j];
      N=N0<<LM;
-      bits[j] += balance;
+      bit = (opus_int32)bits[j]+balance;

      if (N>1)
      {
-         excess = IMAX(bits[j]-cap[j],0);
-         bits[j] -= excess;
+         excess = MAX32(bit-cap[j],0);
+         bits[j] = bit-excess;

         /* Compensate for the extra DoF in stereo */
         den=(C*N+ ((C==2 && N>2 && !*dual_stereo && j<*intensity) ? 1 : 0));
@@ -464,7 +472,8 @@ static inline int interp_bits2pulses(const CELTMode *m, int start, int end, int
            offset += NClogN>>3;

         /* Divide with rounding */
-         ebits[j] = IMAX(0, (bits[j] + offset + (den<<(BITRES-1))) / (den<<BITRES));
+         ebits[j] = IMAX(0, (bits[j] + offset + (den<<(BITRES-1))));
+         ebits[j] = celt_udiv(ebits[j], den)>>BITRES;

         /* Make sure not to bust */
         if (C*ebits[j] > (bits[j]>>BITRES))
@@ -482,8 +491,8 @@ static inline int interp_bits2pulses(const CELTMode *m, int start, int end, int

      } else {
         /* For N=1, all bits go to fine energy except for a single sign bit */
-         excess = IMAX(0,bits[j]-(C<<BITRES));
-         bits[j] -= excess;
+         excess = MAX32(0,bit-(C<<BITRES));
+         bits[j] = bit-excess;
         ebits[j] = 0;
         fine_priority[j] = 1;
      }
@@ -495,7 +504,7 @@ static inline int interp_bits2pulses(const CELTMode *m, int start, int end, int
      {
         int extra_fine;
         int extra_bits;
-         extra_fine = IMIN(excess >> stereo+BITRES, MAX_FINE_BITS-ebits[j]);
+         extra_fine = IMIN(excess>>(stereo+BITRES),MAX_FINE_BITS-ebits[j]);
         ebits[j] += extra_fine;
         extra_bits = extra_fine*C<<BITRES;
         fine_priority[j] = extra_bits >= excess-balance;
@@ -522,11 +531,10 @@ static inline int interp_bits2pulses(const CELTMode *m, int start, int end, int
   return codedBands;
 }

-int compute_allocation(const CELTMode *m, int start, int end, const int *offsets, const int *cap, int alloc_trim, int *intensity, int *dual_stereo,
-      celt_int32 total, celt_int32 *balance, int *pulses, int *ebits, int *fine_priority, int _C, int LM, ec_ctx *ec, int encode, int prev)
+int clt_compute_allocation(const CELTMode *m, int start, int end, const int *offsets, const int *cap, int alloc_trim, int *intensity, int *dual_stereo,
+      opus_int32 total, opus_int32 *balance, int *pulses, int *ebits, int *fine_priority, int C, int LM, ec_ctx *ec, int encode, int prev, int signalBandwidth)
 {
   int lo, hi, len, j;
-   const int C = CHANNELS(_C);
   int codedBands;
   int skip_start;
   int skip_rsv;
@@ -537,7 +545,7 @@ int compute_allocation(const CELTMode *m, int start, int end, const int *offsets
   VARDECL(int, thresh);
   VARDECL(int, trim_offset);
   SAVE_STACK;
-   
+
   total = IMAX(total, 0);
   len = m->nbEBands;
   skip_start = start;
@@ -569,7 +577,7 @@ int compute_allocation(const CELTMode *m, int start, int end, const int *offsets
      thresh[j] = IMAX((C)<<BITRES, (3*(m->eBands[j+1]-m->eBands[j])<<LM<<BITRES)>>4);
      /* Tilt of the allocation curve */
      trim_offset[j] = C*(m->eBands[j+1]-m->eBands[j])*(alloc_trim-5-LM)*(end-j-1)
-            <<(LM+BITRES)>>6;
+            *(1<<(LM+BITRES))>>6;
      /* Giving less resolution to single-coefficient bands because they get
         more benefit from having one coarse value per coefficient*/
      if ((m->eBands[j+1]-m->eBands[j])<<LM==1)
@@ -631,7 +639,7 @@ int compute_allocation(const CELTMode *m, int start, int end, const int *offsets
   }
   codedBands = interp_bits2pulses(m, start, end, skip_start, bits1, bits2, thresh, cap,
         total, balance, skip_rsv, intensity, intensity_rsv, dual_stereo, dual_stereo_rsv,
-         pulses, ebits, fine_priority, C, LM, ec, encode, prev);
+         pulses, ebits, fine_priority, C, LM, ec, encode, prev, signalBandwidth);
   RESTORE_STACK;
   return codedBands;
 }

--- a/libcelt/rate.h
+++ b/libcelt/rate.h
@@ -5,19 +5,19 @@
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:
-   
+
   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
-   
+
   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.
-   
+
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
-   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
@@ -32,7 +32,7 @@
 #define MAX_PSEUDO 40
 #define LOG_MAX_PSEUDO 6

-#define MAX_PULSES 128
+#define CELT_MAX_PULSES 128

 #define MAX_FINE_BITS 8

@@ -40,19 +40,17 @@
 #define QTHETA_OFFSET 4
 #define QTHETA_OFFSET_TWOPHASE 16

-#define BITOVERFLOW 30000
-
 #include "cwrs.h"
 #include "modes.h"

 void compute_pulse_cache(CELTMode *m, int LM);

-static inline int get_pulses(int i)
+static OPUS_INLINE int get_pulses(int i)
 {
   return i<8 ? i : (8 + (i&7)) << ((i>>3)-1);
 }

-static inline int bits2pulses(const CELTMode *m, int band, int LM, int bits)
+static OPUS_INLINE int bits2pulses(const CELTMode *m, int band, int LM, int bits)
 {
   int i;
   int lo, hi;
@@ -68,18 +66,18 @@ static inline int bits2pulses(const CELTMode *m, int band, int LM, int bits)
   {
      int mid = (lo+hi+1)>>1;
      /* OPT: Make sure this is implemented with a conditional move */
-      if (cache[mid] >= bits)
+      if ((int)cache[mid] >= bits)
         hi = mid;
      else
         lo = mid;
   }
-   if (bits- (lo == 0 ? -1 : cache[lo]) <= cache[hi]-bits)
+   if (bits- (lo == 0 ? -1 : (int)cache[lo]) <= (int)cache[hi]-bits)
      return lo;
   else
      return hi;
 }

-static inline int pulses2bits(const CELTMode *m, int band, int LM, int pulses)
+static OPUS_INLINE int pulses2bits(const CELTMode *m, int band, int LM, int pulses)
 {
   const unsigned char *cache;

@@ -88,11 +86,8 @@ static inline int pulses2bits(const CELTMode *m, int band, int LM, int pulses)
   return pulses == 0 ? 0 : cache[pulses]+1;
 }

-/** Computes a cache of the pulses->bits mapping in each band */
-celt_int16 **compute_alloc_cache(CELTMode *m, int M);
-
 /** Compute the pulse allocation, i.e. how many pulses will go in each
-  * band. 
+  * band.
 @param m mode
 @param offsets Requested increase or decrease in the number of bits for
                each band
@@ -100,8 +95,7 @@ celt_int16 **compute_alloc_cache(CELTMode *m, int M);
 @param pulses Number of pulses per band (returned)
 @return codedBands: the index one past the last band that receives bits (not a bit total)
 */
-int compute_allocation(const CELTMode *m, int start, int end, const int *offsets, const int *cap, int alloc_trim, int *intensity, int *dual_stero,
-      celt_int32 total, celt_int32 *balance, int *pulses, int *ebits, int *fine_priority, int _C, int LM, ec_ctx *ec, int encode, int prev);
-
+int clt_compute_allocation(const CELTMode *m, int start, int end, const int *offsets, const int *cap, int alloc_trim, int *intensity, int *dual_stereo,
+      opus_int32 total, opus_int32 *balance, int *pulses, int *ebits, int *fine_priority, int C, int LM, ec_ctx *ec, int encode, int prev, int signalBandwidth);

 #endif
No results found