Compare revisions

1c370855 · 1c370855 · 1c370855 · 1c370855 · 1c370855 · 1c370855
--- a/celt/arm/pitch_arm.h
+++ b/celt/arm/pitch_arm.h
+/* Copyright (c) 2010 Xiph.Org Foundation
+ * Copyright (c) 2013 Parrot */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if !defined(PITCH_ARM_H)
+# define PITCH_ARM_H
+
+# include "armcpu.h"
+
+# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N);
+void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01,
+        const opus_val16 *y02, int N, opus_val32 *xy1, opus_val32 *xy2);
+
+#  if !defined(OPUS_HAVE_RTCD) && defined(OPUS_ARM_PRESUME_NEON)
+#   define OVERRIDE_CELT_INNER_PROD (1)
+#   define OVERRIDE_DUAL_INNER_PROD (1)
+#   define celt_inner_prod(x, y, N, arch) ((void)(arch), PRESUME_NEON(celt_inner_prod)(x, y, N))
+#   define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) ((void)(arch), PRESUME_NEON(dual_inner_prod)(x, y01, y02, N, xy1, xy2))
+#  endif
+# endif
+
+# if !defined(OVERRIDE_CELT_INNER_PROD)
+#  if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
+extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y, int N);
+#   define OVERRIDE_CELT_INNER_PROD (1)
+#   define celt_inner_prod(x, y, N, arch) ((*CELT_INNER_PROD_IMPL[(arch)&OPUS_ARCHMASK])(x, y, N))
+#  elif defined(OPUS_ARM_PRESUME_NEON_INTR)
+#   define OVERRIDE_CELT_INNER_PROD (1)
+#   define celt_inner_prod(x, y, N, arch) ((void)(arch), celt_inner_prod_neon(x, y, N))
+#  endif
+# endif
+
+# if !defined(OVERRIDE_DUAL_INNER_PROD)
+#  if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
+extern void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x,
+        const opus_val16 *y01, const opus_val16 *y02, int N, opus_val32 *xy1, opus_val32 *xy2);
+#   define OVERRIDE_DUAL_INNER_PROD (1)
+#   define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) ((*DUAL_INNER_PROD_IMPL[(arch)&OPUS_ARCHMASK])(x, y01, y02, N, xy1, xy2))
+#  elif defined(OPUS_ARM_PRESUME_NEON_INTR)
+#   define OVERRIDE_DUAL_INNER_PROD (1)
+#   define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) ((void)(arch), dual_inner_prod_neon(x, y01, y02, N, xy1, xy2))
+#  endif
+# endif
+
+# if defined(FIXED_POINT)
+
+#  if defined(OPUS_ARM_MAY_HAVE_NEON)
+opus_val32 celt_pitch_xcorr_neon(const opus_val16 *_x, const opus_val16 *_y,
+    opus_val32 *xcorr, int len, int max_pitch, int arch);
+#  endif
+
+#  if defined(OPUS_ARM_MAY_HAVE_MEDIA)
+#   define celt_pitch_xcorr_media MAY_HAVE_EDSP(celt_pitch_xcorr)
+#  endif
+
+#  if defined(OPUS_ARM_MAY_HAVE_EDSP)
+opus_val32 celt_pitch_xcorr_edsp(const opus_val16 *_x, const opus_val16 *_y,
+    opus_val32 *xcorr, int len, int max_pitch, int arch);
+#  endif
+
+#  if defined(OPUS_HAVE_RTCD) && \
+    ((defined(OPUS_ARM_MAY_HAVE_NEON) && !defined(OPUS_ARM_PRESUME_NEON)) || \
+     (defined(OPUS_ARM_MAY_HAVE_MEDIA) && !defined(OPUS_ARM_PRESUME_MEDIA)) || \
+     (defined(OPUS_ARM_MAY_HAVE_EDSP) && !defined(OPUS_ARM_PRESUME_EDSP)))
+extern opus_val32
+(*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
+      const opus_val16 *, opus_val32 *, int, int, int);
+#   define OVERRIDE_PITCH_XCORR (1)
+#   define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
+  ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
+        xcorr, len, max_pitch, arch))
+
+#  elif defined(OPUS_ARM_PRESUME_EDSP) || \
+    defined(OPUS_ARM_PRESUME_MEDIA) || \
+    defined(OPUS_ARM_PRESUME_NEON)
+#   define OVERRIDE_PITCH_XCORR (1)
+#   define celt_pitch_xcorr (PRESUME_NEON(celt_pitch_xcorr))
+
+#  endif
+
+#  if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+void xcorr_kernel_neon_fixed(
+                    const opus_val16 *x,
+                    const opus_val16 *y,
+                    opus_val32       sum[4],
+                    int              len);
+#  endif
+
+#  if defined(OPUS_HAVE_RTCD) && \
+    (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
+
+extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
+                    const opus_val16 *x,
+                    const opus_val16 *y,
+                    opus_val32       sum[4],
+                    int              len);
+
+#   define OVERRIDE_XCORR_KERNEL (1)
+#   define xcorr_kernel(x, y, sum, len, arch) \
+     ((*XCORR_KERNEL_IMPL[(arch) & OPUS_ARCHMASK])(x, y, sum, len))
+
+#  elif defined(OPUS_ARM_PRESUME_NEON_INTR)
+#   define OVERRIDE_XCORR_KERNEL (1)
+#   define xcorr_kernel(x, y, sum, len, arch) \
+      ((void)arch, xcorr_kernel_neon_fixed(x, y, sum, len))
+
+#  endif
+
+#else /* Start !FIXED_POINT */
+/* Float case */
+#if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
+                                 opus_val32 *xcorr, int len, int max_pitch, int arch);
+#endif
+
+#  if defined(OPUS_HAVE_RTCD) && \
+    (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
+extern void
+(*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
+      const opus_val16 *, opus_val32 *, int, int, int);
+
+#  define OVERRIDE_PITCH_XCORR (1)
+#  define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
+  ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
+        xcorr, len, max_pitch, arch))
+
+#  elif defined(OPUS_ARM_PRESUME_NEON_INTR)
+
+#   define OVERRIDE_PITCH_XCORR (1)
+#   define celt_pitch_xcorr celt_pitch_xcorr_float_neon
+
+#  endif
+
+#endif /* end !FIXED_POINT */
+
+#endif
--- a/celt/arm/pitch_neon_intr.c
+++ b/celt/arm/pitch_neon_intr.c
+/***********************************************************************
+Copyright (c) 2017 Google Inc.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <arm_neon.h>
+#include "pitch.h"
+
+#ifdef FIXED_POINT
+
+opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
+{
+    int i;
+    opus_val32 xy;
+    int16x8_t x_s16x8, y_s16x8;
+    int32x4_t xy_s32x4 = vdupq_n_s32(0);
+    int64x2_t xy_s64x2;
+    int64x1_t xy_s64x1;
+
+    for (i = 0; i < N - 7; i += 8) {
+        x_s16x8  = vld1q_s16(&x[i]);
+        y_s16x8  = vld1q_s16(&y[i]);
+        xy_s32x4 = vmlal_s16(xy_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y_s16x8));
+        xy_s32x4 = vmlal_s16(xy_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y_s16x8));
+    }
+
+    if (N - i >= 4) {
+        const int16x4_t x_s16x4 = vld1_s16(&x[i]);
+        const int16x4_t y_s16x4 = vld1_s16(&y[i]);
+        xy_s32x4 = vmlal_s16(xy_s32x4, x_s16x4, y_s16x4);
+        i += 4;
+    }
+
+    xy_s64x2 = vpaddlq_s32(xy_s32x4);
+    xy_s64x1 = vadd_s64(vget_low_s64(xy_s64x2), vget_high_s64(xy_s64x2));
+    xy       = vget_lane_s32(vreinterpret_s32_s64(xy_s64x1), 0);
+
+    for (; i < N; i++) {
+        xy = MAC16_16(xy, x[i], y[i]);
+    }
+
+#ifdef OPUS_CHECK_ASM
+    celt_assert(celt_inner_prod_c(x, y, N) == xy);
+#endif
+
+    return xy;
+}
+
+void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+        int N, opus_val32 *xy1, opus_val32 *xy2)
+{
+    int i;
+    opus_val32 xy01, xy02;
+    int16x8_t x_s16x8, y01_s16x8, y02_s16x8;
+    int32x4_t xy01_s32x4 = vdupq_n_s32(0);
+    int32x4_t xy02_s32x4 = vdupq_n_s32(0);
+    int64x2_t xy01_s64x2, xy02_s64x2;
+    int64x1_t xy01_s64x1, xy02_s64x1;
+
+    for (i = 0; i < N - 7; i += 8) {
+        x_s16x8    = vld1q_s16(&x[i]);
+        y01_s16x8  = vld1q_s16(&y01[i]);
+        y02_s16x8  = vld1q_s16(&y02[i]);
+        xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y01_s16x8));
+        xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y02_s16x8));
+        xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y01_s16x8));
+        xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y02_s16x8));
+    }
+
+    if (N - i >= 4) {
+        const int16x4_t x_s16x4   = vld1_s16(&x[i]);
+        const int16x4_t y01_s16x4 = vld1_s16(&y01[i]);
+        const int16x4_t y02_s16x4 = vld1_s16(&y02[i]);
+        xy01_s32x4 = vmlal_s16(xy01_s32x4, x_s16x4, y01_s16x4);
+        xy02_s32x4 = vmlal_s16(xy02_s32x4, x_s16x4, y02_s16x4);
+        i += 4;
+    }
+
+    xy01_s64x2 = vpaddlq_s32(xy01_s32x4);
+    xy02_s64x2 = vpaddlq_s32(xy02_s32x4);
+    xy01_s64x1 = vadd_s64(vget_low_s64(xy01_s64x2), vget_high_s64(xy01_s64x2));
+    xy02_s64x1 = vadd_s64(vget_low_s64(xy02_s64x2), vget_high_s64(xy02_s64x2));
+    xy01       = vget_lane_s32(vreinterpret_s32_s64(xy01_s64x1), 0);
+    xy02       = vget_lane_s32(vreinterpret_s32_s64(xy02_s64x1), 0);
+
+    for (; i < N; i++) {
+        xy01 = MAC16_16(xy01, x[i], y01[i]);
+        xy02 = MAC16_16(xy02, x[i], y02[i]);
+    }
+    *xy1 = xy01;
+    *xy2 = xy02;
+
+#ifdef OPUS_CHECK_ASM
+    {
+        opus_val32 xy1_c, xy2_c;
+        dual_inner_prod_c(x, y01, y02, N, &xy1_c, &xy2_c);
+        celt_assert(xy1_c == *xy1);
+        celt_assert(xy2_c == *xy2);
+    }
+#endif
+}
+
+#else /* !FIXED_POINT */
+
+/* ========================================================================== */
+
+#ifdef __ARM_FEATURE_FMA
+/* If we can, force the compiler to use an FMA instruction rather than break
+   vmlaq_f32() into fmul/fadd. */
+#define vmlaq_f32(a,b,c) vfmaq_f32(a,b,c)
+#endif
+
+
+#ifdef OPUS_CHECK_ASM
+
+/* This part of code simulates floating-point NEON operations. */
+
+/* celt_inner_prod_neon_float_c_simulation() simulates the floating-point   */
+/* operations of celt_inner_prod_neon(), and both functions should have bit */
+/* exact output.                                                            */
+static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y, float *err, int N)
+{
+   int i;
+   *err = 0;
+   opus_val32 xy, xy0 = 0, xy1 = 0, xy2 = 0, xy3 = 0;
+   for (i = 0; i < N - 3; i += 4) {
+      xy0 = MAC16_16(xy0, x[i + 0], y[i + 0]);
+      xy1 = MAC16_16(xy1, x[i + 1], y[i + 1]);
+      xy2 = MAC16_16(xy2, x[i + 2], y[i + 2]);
+      xy3 = MAC16_16(xy3, x[i + 3], y[i + 3]);
+      *err += ABS32(xy0)+ABS32(xy1)+ABS32(xy2)+ABS32(xy3);
+   }
+   xy0 += xy2;
+   xy1 += xy3;
+   xy = xy0 + xy1;
+   *err += ABS32(xy1)+ABS32(xy0)+ABS32(xy);
+   for (; i < N; i++) {
+      xy = MAC16_16(xy, x[i], y[i]);
+      *err += ABS32(xy);
+   }
+   *err = *err*2e-7 + N*1e-37;
+   return xy;
+}
+
+/* dual_inner_prod_neon_float_c_simulation() simulates the floating-point   */
+/* operations of dual_inner_prod_neon(), and both functions should have bit */
+/* exact output.                                                            */
+static void dual_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+      int N, opus_val32 *xy1, opus_val32 *xy2, float *err)
+{
+   *xy1 = celt_inner_prod_neon_float_c_simulation(x, y01, &err[0], N);
+   *xy2 = celt_inner_prod_neon_float_c_simulation(x, y02, &err[1], N);
+}
+
+#endif /* OPUS_CHECK_ASM */
+
+/* ========================================================================== */
+
+opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
+{
+    int i;
+    opus_val32 xy;
+    float32x4_t xy_f32x4 = vdupq_n_f32(0);
+    float32x2_t xy_f32x2;
+
+    for (i = 0; i < N - 7; i += 8) {
+        float32x4_t x_f32x4, y_f32x4;
+        x_f32x4  = vld1q_f32(&x[i]);
+        y_f32x4  = vld1q_f32(&y[i]);
+        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
+        x_f32x4  = vld1q_f32(&x[i + 4]);
+        y_f32x4  = vld1q_f32(&y[i + 4]);
+        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
+    }
+
+    if (N - i >= 4) {
+        const float32x4_t x_f32x4 = vld1q_f32(&x[i]);
+        const float32x4_t y_f32x4 = vld1q_f32(&y[i]);
+        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
+        i += 4;
+    }
+
+    xy_f32x2 = vadd_f32(vget_low_f32(xy_f32x4), vget_high_f32(xy_f32x4));
+    xy_f32x2 = vpadd_f32(xy_f32x2, xy_f32x2);
+    xy       = vget_lane_f32(xy_f32x2, 0);
+
+    for (; i < N; i++) {
+        xy = MAC16_16(xy, x[i], y[i]);
+    }
+
+#ifdef OPUS_CHECK_ASM
+    {
+        float err, res;
+        res = celt_inner_prod_neon_float_c_simulation(x, y, &err, N);
+        /*if (ABS32(res - xy) > err) fprintf(stderr, "%g %g %g\n", res, xy, err);*/
+        celt_assert(ABS32(res - xy) <= err);
+    }
+#endif
+
+    return xy;
+}
+
+void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+        int N, opus_val32 *xy1, opus_val32 *xy2)
+{
+    int i;
+    opus_val32 xy01, xy02;
+    float32x4_t xy01_f32x4 = vdupq_n_f32(0);
+    float32x4_t xy02_f32x4 = vdupq_n_f32(0);
+    float32x2_t xy01_f32x2, xy02_f32x2;
+
+    for (i = 0; i < N - 7; i += 8) {
+        float32x4_t x_f32x4, y01_f32x4, y02_f32x4;
+        x_f32x4    = vld1q_f32(&x[i]);
+        y01_f32x4  = vld1q_f32(&y01[i]);
+        y02_f32x4  = vld1q_f32(&y02[i]);
+        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
+        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
+        x_f32x4    = vld1q_f32(&x[i + 4]);
+        y01_f32x4  = vld1q_f32(&y01[i + 4]);
+        y02_f32x4  = vld1q_f32(&y02[i + 4]);
+        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
+        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
+    }
+
+    if (N - i >= 4) {
+        const float32x4_t x_f32x4   = vld1q_f32(&x[i]);
+        const float32x4_t y01_f32x4 = vld1q_f32(&y01[i]);
+        const float32x4_t y02_f32x4 = vld1q_f32(&y02[i]);
+        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
+        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
+        i += 4;
+    }
+
+    xy01_f32x2 = vadd_f32(vget_low_f32(xy01_f32x4), vget_high_f32(xy01_f32x4));
+    xy02_f32x2 = vadd_f32(vget_low_f32(xy02_f32x4), vget_high_f32(xy02_f32x4));
+    xy01_f32x2 = vpadd_f32(xy01_f32x2, xy01_f32x2);
+    xy02_f32x2 = vpadd_f32(xy02_f32x2, xy02_f32x2);
+    xy01       = vget_lane_f32(xy01_f32x2, 0);
+    xy02       = vget_lane_f32(xy02_f32x2, 0);
+
+    for (; i < N; i++) {
+        xy01 = MAC16_16(xy01, x[i], y01[i]);
+        xy02 = MAC16_16(xy02, x[i], y02[i]);
+    }
+    *xy1 = xy01;
+    *xy2 = xy02;
+
+#ifdef OPUS_CHECK_ASM
+    {
+        opus_val32 xy1_c, xy2_c;
+        float err[2];
+        dual_inner_prod_neon_float_c_simulation(x, y01, y02, N, &xy1_c, &xy2_c, err);
+        /*if (ABS32(xy1_c - *xy1) > err[0]) fprintf(stderr, "dual1 fail: %g %g %g\n", xy1_c, *xy1, err[0]);
+        if (ABS32(xy2_c - *xy2) > err[1]) fprintf(stderr, "dual2 fail: %g %g %g\n", xy2_c, *xy2, err[1]);*/
+        celt_assert(ABS32(xy1_c - *xy1) <= err[0]);
+        celt_assert(ABS32(xy2_c - *xy2) <= err[1]);
+    }
+#endif
+}
+
+#endif /* FIXED_POINT */
--- a/libcelt/bands.c
+++ b/libcelt/bands.c
@@ -17,8 +17,8 @@
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
-   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
@@ -40,29 +40,44 @@
 #include "os_support.h"
 #include "mathops.h"
 #include "rate.h"
+#include "quant_bands.h"
+#include "pitch.h"

-opus_uint32 lcg_rand(opus_uint32 seed)
+int hysteresis_decision(opus_val16 val, const opus_val16 *thresholds, const opus_val16 *hysteresis, int N, int prev)
+{
+   int i;
+   for (i=0;i<N;i++)
+   {
+      if (val < thresholds[i])
+         break;
+   }
+   if (i>prev && val < thresholds[prev]+hysteresis[prev])
+      i=prev;
+   if (i<prev && val > thresholds[prev-1]-hysteresis[prev-1])
+      i=prev;
+   return i;
+}
+
+opus_uint32 celt_lcg_rand(opus_uint32 seed)
 {
   return 1664525 * seed + 1013904223;
 }

 /* This is a cos() approximation designed to be bit-exact on any platform. Bit exactness
   with this approximation is important because it has an impact on the bit allocation */
-static opus_int16 bitexact_cos(opus_int16 x)
+opus_int16 bitexact_cos(opus_int16 x)
 {
   opus_int32 tmp;
   opus_int16 x2;
   tmp = (4096+((opus_int32)(x)*(x)))>>13;
-   if (tmp > 32767)
-      tmp = 32767;
+   celt_sig_assert(tmp<=32767);
   x2 = tmp;
   x2 = (32767-x2) + FRAC_MUL16(x2, (-7651 + FRAC_MUL16(x2, (8277 + FRAC_MUL16(-626, x2)))));
-   if (x2 > 32766)
-      x2 = 32766;
+   celt_sig_assert(x2<=32766);
   return 1+x2;
 }

-static int bitexact_log2tan(int isin,int icos)
+int bitexact_log2tan(int isin,int icos)
 {
   int lc;
   int ls;
@@ -70,19 +85,19 @@ static int bitexact_log2tan(int isin,int icos)
   ls=EC_ILOG(isin);
   icos<<=15-lc;
   isin<<=15-ls;
-   return (ls-lc<<11)
+   return (ls-lc)*(1<<11)
         +FRAC_MUL16(isin, FRAC_MUL16(isin, -2597) + 7932)
         -FRAC_MUL16(icos, FRAC_MUL16(icos, -2597) + 7932);
 }

 #ifdef FIXED_POINT
 /* Compute the amplitude (sqrt energy) in each of the bands */
-void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener *bank, int end, int _C, int M)
+void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener *bandE, int end, int C, int LM, int arch)
 {
   int i, c, N;
   const opus_int16 *eBands = m->eBands;
-   const int C = CHANNELS(_C);
-   N = M*m->shortMdctSize;
+   (void)arch;
+   N = m->shortMdctSize<<LM;
   c=0; do {
      for (i=0;i<end;i++)
      {
@@ -90,86 +105,98 @@ void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener *bank
         opus_val32 maxval=0;
         opus_val32 sum = 0;

-         j=M*eBands[i]; do {
-            maxval = MAX32(maxval, X[j+c*N]);
-            maxval = MAX32(maxval, -X[j+c*N]);
-         } while (++j<M*eBands[i+1]);
-
+         maxval = celt_maxabs32(&X[c*N+(eBands[i]<<LM)], (eBands[i+1]-eBands[i])<<LM);
         if (maxval > 0)
         {
-            int shift = celt_ilog2(maxval)-10;
-            j=M*eBands[i]; do {
-               sum = MAC16_16(sum, EXTRACT16(VSHR32(X[j+c*N],shift)),
-                                   EXTRACT16(VSHR32(X[j+c*N],shift)));
-            } while (++j<M*eBands[i+1]);
-            /* We're adding one here to make damn sure we never end up with a pitch vector that's
-               larger than unity norm */
-            bank[i+c*m->nbEBands] = EPSILON+VSHR32(EXTEND32(celt_sqrt(sum)),-shift);
+            int shift, shift2;
+            shift = celt_ilog2(maxval) - 14;
+            shift2 = (((m->logN[i]>>BITRES)+LM+1)>>1);
+            j=eBands[i]<<LM;
+            if (shift>0)
+            {
+               do {
+                  sum = ADD32(sum, SHR32(MULT16_16(EXTRACT16(SHR32(X[j+c*N],shift)),
+                        EXTRACT16(SHR32(X[j+c*N],shift))), 2*shift2));
+               } while (++j<eBands[i+1]<<LM);
+            } else {
+               do {
+                  sum = ADD32(sum, SHR32(MULT16_16(EXTRACT16(SHL32(X[j+c*N],-shift)),
+                        EXTRACT16(SHL32(X[j+c*N],-shift))), 2*shift2));
+               } while (++j<eBands[i+1]<<LM);
+            }
+            shift+=shift2;
+            while (sum < 1<<28) {
+               sum <<=2;
+               shift -= 1;
+            }
+            /* We're adding one here to ensure the normalized band isn't larger than unity norm */
+            bandE[i+c*m->nbEBands] = EPSILON+VSHR32(celt_sqrt(sum),-shift);
         } else {
-            bank[i+c*m->nbEBands] = EPSILON;
+            bandE[i+c*m->nbEBands] = EPSILON;
         }
-         /*printf ("%f ", bank[i+c*m->nbEBands]);*/
+         /*printf ("%f ", bandE[i+c*m->nbEBands]);*/
      }
   } while (++c<C);
   /*printf ("\n");*/
 }

 /* Normalise each band such that the energy is one. */
-void normalise_bands(const CELTMode *m, const celt_sig * restrict freq, celt_norm * restrict X, const celt_ener *bank, int end, int _C, int M)
+void normalise_bands(const CELTMode *m, const celt_sig * OPUS_RESTRICT freq, celt_norm * OPUS_RESTRICT X, const celt_ener *bandE, int end, int C, int M)
 {
   int i, c, N;
   const opus_int16 *eBands = m->eBands;
-   const int C = CHANNELS(_C);
   N = M*m->shortMdctSize;
   c=0; do {
      i=0; do {
         opus_val16 g;
         int j,shift;
-         opus_val16 E;
-         shift = celt_zlog2(bank[i+c*m->nbEBands])-13;
-         E = VSHR32(bank[i+c*m->nbEBands], shift);
-         g = EXTRACT16(celt_rcp(SHL32(E,3)));
-         j=M*eBands[i]; do {
-            X[j+c*N] = MULT16_16_Q15(VSHR32(freq[j+c*N],shift-1),g);
-         } while (++j<M*eBands[i+1]);
+         opus_val32 E;
+         shift = celt_zlog2(bandE[i+c*m->nbEBands])-14;
+         E = VSHR32(bandE[i+c*m->nbEBands], shift-2);
+         g = EXTRACT16(celt_rcp(E));
+         if (shift > 0) {
+            j=M*eBands[i]; do {
+               X[j+c*N] = PSHR32(MULT16_32_Q15(g, freq[j+c*N]),shift);
+            } while (++j<M*eBands[i+1]);
+         } else {
+            j=M*eBands[i]; do {
+               X[j+c*N] = SHL32(MULT16_32_Q15(g, freq[j+c*N]),-shift);
+            } while (++j<M*eBands[i+1]);
+         }
      } while (++i<end);
   } while (++c<C);
 }

 #else /* FIXED_POINT */
 /* Compute the amplitude (sqrt energy) in each of the bands */
-void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener *bank, int end, int _C, int M)
+void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener *bandE, int end, int C, int LM, int arch)
 {
   int i, c, N;
   const opus_int16 *eBands = m->eBands;
-   const int C = CHANNELS(_C);
-   N = M*m->shortMdctSize;
+   N = m->shortMdctSize<<LM;
   c=0; do {
      for (i=0;i<end;i++)
      {
-         int j;
-         opus_val32 sum = 1e-27f;
-         for (j=M*eBands[i];j<M*eBands[i+1];j++)
-            sum += X[j+c*N]*X[j+c*N];
-         bank[i+c*m->nbEBands] = celt_sqrt(sum);
-         /*printf ("%f ", bank[i+c*m->nbEBands]);*/
+         opus_val32 sum;
+         sum = 1e-27f + celt_inner_prod(&X[c*N+(eBands[i]<<LM)], &X[c*N+(eBands[i]<<LM)], (eBands[i+1]-eBands[i])<<LM, arch);
+         bandE[i+c*m->nbEBands] = celt_sqrt(sum);
+         /*printf ("%f ", bandE[i+c*m->nbEBands]);*/
      }
   } while (++c<C);
   /*printf ("\n");*/
 }

 /* Normalise each band such that the energy is one. */
-void normalise_bands(const CELTMode *m, const celt_sig * restrict freq, celt_norm * restrict X, const celt_ener *bank, int end, int _C, int M)
+void normalise_bands(const CELTMode *m, const celt_sig * OPUS_RESTRICT freq, celt_norm * OPUS_RESTRICT X, const celt_ener *bandE, int end, int C, int M)
 {
   int i, c, N;
   const opus_int16 *eBands = m->eBands;
-   const int C = CHANNELS(_C);
   N = M*m->shortMdctSize;
   c=0; do {
      for (i=0;i<end;i++)
      {
         int j;
-         opus_val16 g = 1.f/(1e-27f+bank[i+c*m->nbEBands]);
+         opus_val16 g = 1.f/(1e-27f+bandE[i+c*m->nbEBands]);
         for (j=M*eBands[i];j<M*eBands[i+1];j++)
            X[j+c*N] = freq[j+c*N]*g;
      }
@@ -179,38 +206,83 @@ void normalise_bands(const CELTMode *m, const celt_sig * restrict freq, celt_nor
 #endif /* FIXED_POINT */

 /* De-normalise the energy to produce the synthesis from the unit-energy bands */
-void denormalise_bands(const CELTMode *m, const celt_norm * restrict X, celt_sig * restrict freq, const celt_ener *bank, int end, int _C, int M)
+void denormalise_bands(const CELTMode *m, const celt_norm * OPUS_RESTRICT X,
+      celt_sig * OPUS_RESTRICT freq, const celt_glog *bandLogE, int start,
+      int end, int M, int downsample, int silence)
 {
-   int i, c, N;
+   int i, N;
+   int bound;
+   celt_sig * OPUS_RESTRICT f;
+   const celt_norm * OPUS_RESTRICT x;
   const opus_int16 *eBands = m->eBands;
-   const int C = CHANNELS(_C);
   N = M*m->shortMdctSize;
-   celt_assert2(C<=2, "denormalise_bands() not implemented for >2 channels");
-   c=0; do {
-      celt_sig * restrict f;
-      const celt_norm * restrict x;
-      f = freq+c*N;
-      x = X+c*N;
-      for (i=0;i<end;i++)
+   bound = M*eBands[end];
+   if (downsample!=1)
+      bound = IMIN(bound, N/downsample);
+   if (silence)
+   {
+      bound = 0;
+      start = end = 0;
+   }
+   f = freq;
+   x = X+M*eBands[start];
+   for (i=0;i<M*eBands[start];i++)
+      *f++ = 0;
+   for (i=start;i<end;i++)
+   {
+      int j, band_end;
+      opus_val32 g;
+      celt_glog lg;
+#ifdef FIXED_POINT
+      int shift;
+#endif
+      j=M*eBands[i];
+      band_end = M*eBands[i+1];
+      lg = ADD32(bandLogE[i], SHL32((opus_val32)eMeans[i],DB_SHIFT-4));
+#ifndef FIXED_POINT
+      g = celt_exp2_db(MIN32(32.f, lg));
+#else
+      /* Handle the integer part of the log energy */
+      shift = 15-(lg>>DB_SHIFT);
+      if (shift>31)
+      {
+         shift=0;
+         g=0;
+      } else {
+         /* Handle the fractional part. */
+         g = celt_exp2_db_frac((lg&((1<<DB_SHIFT)-1)));
+      }
+      /* Handle extreme gains with negative shift. */
+      if (shift<0)
      {
-         int j, band_end;
-         opus_val32 g = SHR32(bank[i+c*m->nbEBands],1);
-         j=M*eBands[i];
-         band_end = M*eBands[i+1];
+         /* For shift <= -2 and g > 16384 we'd be likely to overflow, so we're
+            capping the gain here, which is equivalent to a cap of 18 on lg.
+            This shouldn't trigger unless the bitstream is already corrupted. */
+         if (shift <= -2)
+         {
+            g = 16384*32768;
+            shift = -2;
+         }
         do {
-            *f++ = SHL32(MULT16_32_Q15(*x, g),2);
+            *f++ = SHL32(MULT16_32_Q15(*x, g), -shift);
            x++;
         } while (++j<band_end);
-      }
-      for (i=M*eBands[end];i<N;i++)
-         *f++ = 0;
-   } while (++c<C);
+      } else
+#endif
+         /* Be careful of the fixed-point "else" just above when changing this code */
+         do {
+            *f++ = SHR32(MULT16_32_Q15(*x, g), shift);
+            x++;
+         } while (++j<band_end);
+   }
+   celt_assert(start <= end);
+   OPUS_CLEAR(&freq[bound], N-bound);
 }

 /* This prevents energy collapse for transients with multiple short MDCTs */
-void anti_collapse(const CELTMode *m, celt_norm *_X, unsigned char *collapse_masks, int LM, int C, int CC, int size,
-      int start, int end, opus_val16 *logE, opus_val16 *prev1logE,
-      opus_val16 *prev2logE, int *pulses, opus_uint32 seed)
+void anti_collapse(const CELTMode *m, celt_norm *X_, unsigned char *collapse_masks, int LM, int C, int size,
+      int start, int end, const celt_glog *logE, const celt_glog *prev1logE,
+      const celt_glog *prev2logE, const int *pulses, opus_uint32 seed, int encode, int arch)
 {
   int c, i, j, k;
   for (i=start;i<end;i++)
@@ -220,14 +292,17 @@ void anti_collapse(const CELTMode *m, celt_norm *_X, unsigned char *collapse_mas
      int depth;
 #ifdef FIXED_POINT
      int shift;
+      opus_val32 thresh32;
 #endif

      N0 = m->eBands[i+1]-m->eBands[i];
      /* depth in 1/8 bits */
-      depth = (1+pulses[i])/(m->eBands[i+1]-m->eBands[i]<<LM);
+      celt_sig_assert(pulses[i]>=0);
+      depth = celt_udiv(1+pulses[i], (m->eBands[i+1]-m->eBands[i]))>>LM;

 #ifdef FIXED_POINT
-      thresh = MULT16_32_Q15(QCONST16(0.5f, 15), MIN32(32767,SHR32(celt_exp2(-SHL16(depth, 10-BITRES)),1) ));
+      thresh32 = SHR32(celt_exp2(-SHL16(depth, 10-BITRES)),1);
+      thresh = MULT16_32_Q15(QCONST16(0.5f, 15), MIN32(32767,thresh32));
      {
         opus_val32 t;
         t = N0<<LM;
@@ -243,26 +318,29 @@ void anti_collapse(const CELTMode *m, celt_norm *_X, unsigned char *collapse_mas
      c=0; do
      {
         celt_norm *X;
-         opus_val16 prev1;
-         opus_val16 prev2;
-         opus_val16 Ediff;
+         celt_glog prev1;
+         celt_glog prev2;
+         opus_val32 Ediff;
         opus_val16 r;
         int renormalize=0;
         prev1 = prev1logE[c*m->nbEBands+i];
         prev2 = prev2logE[c*m->nbEBands+i];
-         if (C<CC)
+         if (!encode && C==1)
         {
-            prev1 = MAX16(prev1,prev1logE[m->nbEBands+i]);
-            prev2 = MAX16(prev2,prev2logE[m->nbEBands+i]);
+            prev1 = MAXG(prev1,prev1logE[m->nbEBands+i]);
+            prev2 = MAXG(prev2,prev2logE[m->nbEBands+i]);
         }
-         Ediff = logE[c*m->nbEBands+i]-MIN16(prev1,prev2);
-         Ediff = MAX16(0, Ediff);
+         Ediff = logE[c*m->nbEBands+i]-MING(prev1,prev2);
+         Ediff = MAX32(0, Ediff);

 #ifdef FIXED_POINT
-         if (Ediff < 16384)
-            r = 2*MIN16(16383,SHR32(celt_exp2(-Ediff),1));
-         else
+         if (Ediff < GCONST(16.f))
+         {
+            opus_val32 r32 = SHR32(celt_exp2_db(-Ediff),1);
+            r = 2*MIN16(16383,r32);
+         } else {
            r = 0;
+         }
         if (LM==3)
            r = MULT16_16_Q14(23170, MIN32(23169, r));
         r = SHR16(MIN16(thresh, r),1);
@@ -270,13 +348,13 @@ void anti_collapse(const CELTMode *m, celt_norm *_X, unsigned char *collapse_mas
 #else
         /* r needs to be multiplied by 2 or 2*sqrt(2) depending on LM because
            short blocks don't have the same energy as long */
-         r = 2.f*celt_exp2(-Ediff);
+         r = 2.f*celt_exp2_db(-Ediff);
         if (LM==3)
            r *= 1.41421356f;
         r = MIN16(thresh, r);
         r = r*sqrt_1;
 #endif
-         X = _X+c*size+(m->eBands[i]<<LM);
+         X = X_+c*size+(m->eBands[i]<<LM);
         for (k=0;k<1<<LM;k++)
         {
            /* Detect collapse */
@@ -285,7 +363,7 @@ void anti_collapse(const CELTMode *m, celt_norm *_X, unsigned char *collapse_mas
               /* Fill with noise */
               for (j=0;j<N0;j++)
               {
-                  seed = lcg_rand(seed);
+                  seed = celt_lcg_rand(seed);
                  X[(j<<LM)+k] = (seed&0x8000 ? r : -r);
               }
               renormalize = 1;
@@ -293,12 +371,36 @@ void anti_collapse(const CELTMode *m, celt_norm *_X, unsigned char *collapse_mas
         }
         /* We just added some energy, so we need to renormalise */
         if (renormalize)
-            renormalise_vector(X, N0<<LM, Q15ONE);
+            renormalise_vector(X, N0<<LM, Q31ONE, arch);
      } while (++c<C);
   }
 }

-static void intensity_stereo(const CELTMode *m, celt_norm *X, celt_norm *Y, const celt_ener *bank, int bandID, int N)
+/* Compute the weights to use for optimizing normalized distortion across
+   channels. We use the amplitude to weight square distortion, which means
+   that we use the square root of the value we would have been using if we
+   wanted to minimize the MSE in the non-normalized domain. This roughly
+   corresponds to some quick-and-dirty perceptual experiments I ran to
+   measure inter-aural masking (there doesn't seem to be any published data
+   on the topic). */
+static void compute_channel_weights(celt_ener Ex, celt_ener Ey, opus_val16 w[2])
+{
+   celt_ener minE;
+#ifdef FIXED_POINT
+   int shift;
+#endif
+   minE = MIN32(Ex, Ey);
+   /* Adjustment to make the weights a bit more conservative. */
+   Ex = ADD32(Ex, minE/3);
+   Ey = ADD32(Ey, minE/3);
+#ifdef FIXED_POINT
+   shift = celt_ilog2(EPSILON+MAX32(Ex, Ey))-14;
+#endif
+   w[0] = VSHR32(Ex, shift);
+   w[1] = VSHR32(Ey, shift);
+}
+
+static void intensity_stereo(const CELTMode *m, celt_norm * OPUS_RESTRICT X, const celt_norm * OPUS_RESTRICT Y, const celt_ener *bandE, int bandID, int N)
 {
   int i = bandID;
   int j;
@@ -306,10 +408,10 @@ static void intensity_stereo(const CELTMode *m, celt_norm *X, celt_norm *Y, cons
   opus_val16 left, right;
   opus_val16 norm;
 #ifdef FIXED_POINT
-   int shift = celt_zlog2(MAX32(bank[i], bank[i+m->nbEBands]))-13;
+   int shift = celt_zlog2(MAX32(bandE[i], bandE[i+m->nbEBands]))-13;
 #endif
-   left = VSHR32(bank[i],shift);
-   right = VSHR32(bank[i+m->nbEBands],shift);
+   left = VSHR32(bandE[i],shift);
+   right = VSHR32(bandE[i+m->nbEBands],shift);
   norm = EPSILON + celt_sqrt(EPSILON+MULT16_16(left,left)+MULT16_16(right,right));
   a1 = DIV32_16(SHL32(EXTEND32(left),14),norm);
   a2 = DIV32_16(SHL32(EXTEND32(right),14),norm);
@@ -318,51 +420,44 @@ static void intensity_stereo(const CELTMode *m, celt_norm *X, celt_norm *Y, cons
      celt_norm r, l;
      l = X[j];
      r = Y[j];
-      X[j] = MULT16_16_Q14(a1,l) + MULT16_16_Q14(a2,r);
+      X[j] = EXTRACT16(SHR32(MAC16_16(MULT16_16(a1, l), a2, r), 14));
      /* Side is not encoded, no need to calculate */
   }
 }

-static void stereo_split(celt_norm *X, celt_norm *Y, int N)
+static void stereo_split(celt_norm * OPUS_RESTRICT X, celt_norm * OPUS_RESTRICT Y, int N)
 {
   int j;
   for (j=0;j<N;j++)
   {
-      celt_norm r, l;
-      l = MULT16_16_Q15(QCONST16(.70710678f,15), X[j]);
-      r = MULT16_16_Q15(QCONST16(.70710678f,15), Y[j]);
-      X[j] = l+r;
-      Y[j] = r-l;
+      opus_val32 r, l;
+      l = MULT16_16(QCONST16(.70710678f, 15), X[j]);
+      r = MULT16_16(QCONST16(.70710678f, 15), Y[j]);
+      X[j] = EXTRACT16(SHR32(ADD32(l, r), 15));
+      Y[j] = EXTRACT16(SHR32(SUB32(r, l), 15));
   }
 }

-static void stereo_merge(celt_norm *X, celt_norm *Y, opus_val16 mid, int N)
+static void stereo_merge(celt_norm * OPUS_RESTRICT X, celt_norm * OPUS_RESTRICT Y, opus_val32 mid, int N, int arch)
 {
   int j;
   opus_val32 xp=0, side=0;
   opus_val32 El, Er;
-   opus_val16 mid2;
 #ifdef FIXED_POINT
   int kl, kr;
 #endif
   opus_val32 t, lgain, rgain;

   /* Compute the norm of X+Y and X-Y as |X|^2 + |Y|^2 +/- sum(xy) */
-   for (j=0;j<N;j++)
-   {
-      xp = MAC16_16(xp, X[j], Y[j]);
-      side = MAC16_16(side, Y[j], Y[j]);
-   }
+   dual_inner_prod(Y, X, Y, N, &xp, &side, arch);
   /* Compensating for the mid normalization */
-   xp = MULT16_32_Q15(mid, xp);
+   xp = MULT32_32_Q31(mid, xp);
   /* mid and side are in Q15, not Q14 like X and Y */
-   mid2 = SHR32(mid, 1);
-   El = MULT16_16(mid2, mid2) + side - 2*xp;
-   Er = MULT16_16(mid2, mid2) + side + 2*xp;
+   El = SHR32(MULT32_32_Q31(mid, mid),3) + side - 2*xp;
+   Er = SHR32(MULT32_32_Q31(mid, mid),3) + side + 2*xp;
   if (Er < QCONST32(6e-4f, 28) || El < QCONST32(6e-4f, 28))
   {
-      for (j=0;j<N;j++)
-         Y[j] = X[j];
+      OPUS_COPY(Y, X, N);
      return;
   }

@@ -386,7 +481,7 @@ static void stereo_merge(celt_norm *X, celt_norm *Y, opus_val16 mid, int N)
   {
      celt_norm r, l;
      /* Apply mid scaling (side is already scaled) */
-      l = MULT16_16_Q15(mid, X[j]);
+      l = MULT32_32_Q31(mid, X[j]);
      r = Y[j];
      X[j] = EXTRACT16(PSHR32(MULT16_16(lgain, SUB16(l,r)), kl+1));
      Y[j] = EXTRACT16(PSHR32(MULT16_16(rgain, ADD16(l,r)), kr+1));
@@ -394,17 +489,18 @@ static void stereo_merge(celt_norm *X, celt_norm *Y, opus_val16 mid, int N)
 }

 /* Decide whether we should spread the pulses in the current frame */
-int spreading_decision(const CELTMode *m, celt_norm *X, int *average,
+int spreading_decision(const CELTMode *m, const celt_norm *X, int *average,
      int last_decision, int *hf_average, int *tapset_decision, int update_hf,
-      int end, int _C, int M)
+      int end, int C, int M, const int *spread_weight)
 {
   int i, c, N0;
   int sum = 0, nbBands=0;
-   const int C = CHANNELS(_C);
-   const opus_int16 * restrict eBands = m->eBands;
+   const opus_int16 * OPUS_RESTRICT eBands = m->eBands;
   int decision;
   int hf_sum=0;

+   celt_assert(end>0);
+
   N0 = M*m->shortMdctSize;

   if (M*(eBands[end]-eBands[end-1]) <= 8)
@@ -414,7 +510,7 @@ int spreading_decision(const CELTMode *m, celt_norm *X, int *average,
      {
         int j, N, tmp=0;
         int tcount[3] = {0,0,0};
-         celt_norm * restrict x = X+M*eBands[i]+c*N0;
+         const celt_norm * OPUS_RESTRICT x = X+M*eBands[i]+c*N0;
         N = M*(eBands[i+1]-eBands[i]);
         if (N<=8)
            continue;
@@ -434,17 +530,17 @@ int spreading_decision(const CELTMode *m, celt_norm *X, int *average,

         /* Only include four last bands (8 kHz and up) */
         if (i>m->nbEBands-4)
-            hf_sum += 32*(tcount[1]+tcount[0])/N;
+            hf_sum += celt_udiv(32*(tcount[1]+tcount[0]), N);
         tmp = (2*tcount[2] >= N) + (2*tcount[1] >= N) + (2*tcount[0] >= N);
-         sum += tmp*256;
-         nbBands++;
+         sum += tmp*spread_weight[i];
+         nbBands+=spread_weight[i];
      }
   } while (++c<C);

   if (update_hf)
   {
      if (hf_sum)
-         hf_sum /= C*(4-m->nbEBands+end);
+         hf_sum = celt_udiv(hf_sum, C*(4-m->nbEBands+end));
      *hf_average = (*hf_average+hf_sum)>>1;
      hf_sum = *hf_average;
      if (*tapset_decision==2)
@@ -459,7 +555,9 @@ int spreading_decision(const CELTMode *m, celt_norm *X, int *average,
         *tapset_decision=0;
   }
   /*printf("%d %d %d\n", hf_sum, *hf_average, *tapset_decision);*/
-   sum /= nbBands;
+   celt_assert(nbBands>0); /* end has to be non-zero */
+   celt_assert(sum>=0);
+   sum = celt_udiv((opus_int32)sum<<8, nbBands);
   /* Recursive averaging */
   sum = (sum+*average)>>1;
   *average = sum;
@@ -484,50 +582,6 @@ int spreading_decision(const CELTMode *m, celt_norm *X, int *average,
   return decision;
 }

-#ifdef MEASURE_NORM_MSE
-
-float MSE[30] = {0};
-int nbMSEBands = 0;
-int MSECount[30] = {0};
-
-void dump_norm_mse(void)
-{
-   int i;
-   for (i=0;i<nbMSEBands;i++)
-   {
-      printf ("%g ", MSE[i]/MSECount[i]);
-   }
-   printf ("\n");
-}
-
-void measure_norm_mse(const CELTMode *m, float *X, float *X0, float *bandE, float *bandE0, int M, int N, int C)
-{
-   static int init = 0;
-   int i;
-   if (!init)
-   {
-      atexit(dump_norm_mse);
-      init = 1;
-   }
-   for (i=0;i<m->nbEBands;i++)
-   {
-      int j;
-      int c;
-      float g;
-      if (bandE0[i]<10 || (C==2 && bandE0[i+m->nbEBands]<1))
-         continue;
-      c=0; do {
-         g = bandE[i+c*m->nbEBands]/(1e-15+bandE0[i+c*m->nbEBands]);
-         for (j=M*m->eBands[i];j<M*m->eBands[i+1];j++)
-            MSE[i] += (g*X[j+c*N]-X0[j+c*N])*(g*X[j+c*N]-X0[j+c*N]);
-      } while (++c<C);
-      MSECount[i]+=C;
-   }
-   nbMSEBands = m->nbEBands;
-}
-
-#endif
-
 /* Indexing table for converting from natural Hadamard to ordery Hadamard
   This is essentially a bit-reversed Gray, on top of which we've added
   an inversion of the order because we want the DC at the end rather than
@@ -547,6 +601,7 @@ static void deinterleave_hadamard(celt_norm *X, int N0, int stride, int hadamard
   SAVE_STACK;
   N = N0*stride;
   ALLOC(tmp, N, celt_norm);
+   celt_assert(stride>0);
   if (hadamard)
   {
      const int *ordery = ordery_table+stride-2;
@@ -560,8 +615,7 @@ static void deinterleave_hadamard(celt_norm *X, int N0, int stride, int hadamard
         for (j=0;j<N0;j++)
            tmp[i*N0+j] = X[j*stride+i];
   }
-   for (j=0;j<N;j++)
-      X[j] = tmp[j];
+   OPUS_COPY(X, tmp, N);
   RESTORE_STACK;
 }

@@ -584,8 +638,7 @@ static void interleave_hadamard(celt_norm *X, int N0, int stride, int hadamard)
         for (j=0;j<N0;j++)
            tmp[j*stride+i] = X[i*N0+j];
   }
-   for (j=0;j<N;j++)
-      X[j] = tmp[j];
+   OPUS_COPY(X, tmp, N);
   RESTORE_STACK;
 }

@@ -596,11 +649,11 @@ void haar1(celt_norm *X, int N0, int stride)
   for (i=0;i<stride;i++)
      for (j=0;j<N0;j++)
      {
-         celt_norm tmp1, tmp2;
-         tmp1 = MULT16_16_Q15(QCONST16(.70710678f,15), X[stride*2*j+i]);
-         tmp2 = MULT16_16_Q15(QCONST16(.70710678f,15), X[stride*(2*j+1)+i]);
-         X[stride*2*j+i] = tmp1 + tmp2;
-         X[stride*(2*j+1)+i] = tmp1 - tmp2;
+         opus_val32 tmp1, tmp2;
+         tmp1 = MULT16_16(QCONST16(.70710678f,15), X[stride*2*j+i]);
+         tmp2 = MULT16_16(QCONST16(.70710678f,15), X[stride*(2*j+1)+i]);
+         X[stride*2*j+i] = EXTRACT16(PSHR32(ADD32(tmp1, tmp2), 15));
+         X[stride*(2*j+1)+i] = EXTRACT16(PSHR32(SUB32(tmp1, tmp2), 15));
      }
 }

@@ -615,7 +668,8 @@ static int compute_qn(int N, int b, int offset, int pulse_cap, int stereo)
   /* The upper limit ensures that in a stereo split with itheta==16384, we'll
       always have enough bits left over to code at least one pulse in the
       side; otherwise it would collapse, since it doesn't get folded. */
-   qb = IMIN(b-pulse_cap-(4<<BITRES), (b+N2*offset)/N2);
+   qb = celt_sudiv(b+N2*offset, N2);
+   qb = IMIN(b-pulse_cap-(4<<BITRES), qb);

   qb = IMIN(8<<BITRES, qb);

@@ -629,425 +683,382 @@ static int compute_qn(int N, int b, int offset, int pulse_cap, int stereo)
   return qn;
 }

-/* This function is responsible for encoding and decoding a band for both
-   the mono and stereo case. Even in the mono case, it can split the band
-   in two and transmit the energy difference with the two half-bands. It
-   can be called recursively so bands can end up being split in 8 parts. */
-static unsigned quant_band(int encode, const CELTMode *m, int i, celt_norm *X, celt_norm *Y,
-      int N, int b, int spread, int B, int intensity, int tf_change, celt_norm *lowband, int resynth, ec_ctx *ec,
-      opus_int32 *remaining_bits, int LM, celt_norm *lowband_out, const celt_ener *bandE, int level,
-      opus_uint32 *seed, opus_val16 gain, celt_norm *lowband_scratch, int fill)
-{
-   const unsigned char *cache;
-   int q;
-   int curr_bits;
-   int stereo, split;
-   int imid=0, iside=0;
-   int N0=N;
-   int N_B=N;
-   int N_B0;
-   int B0=B;
-   int time_divide=0;
-   int recombine=0;
-   int inv = 0;
-   opus_val16 mid=0, side=0;
-   int longBlocks;
-   unsigned cm=0;
-
-   longBlocks = B0==1;
-
-   N_B /= B;
-   N_B0 = N_B;
+struct band_ctx {
+   int encode;
+   int resynth;
+   const CELTMode *m;
+   int i;
+   int intensity;
+   int spread;
+   int tf_change;
+   ec_ctx *ec;
+   opus_int32 remaining_bits;
+   const celt_ener *bandE;
+   opus_uint32 seed;
+   int arch;
+   int theta_round;
+   int disable_inv;
+   int avoid_split_noise;
+};

-   split = stereo = Y != NULL;
+struct split_ctx {
+   int inv;
+   int imid;
+   int iside;
+   int delta;
+   int itheta;
+   int qalloc;
+};

-   /* Special case for one sample */
-   if (N==1)
+static void compute_theta(struct band_ctx *ctx, struct split_ctx *sctx,
+      celt_norm *X, celt_norm *Y, int N, int *b, int B, int B0,
+      int LM,
+      int stereo, int *fill)
+{
+   int qn;
+   int itheta=0;
+   int delta;
+   int imid, iside;
+   int qalloc;
+   int pulse_cap;
+   int offset;
+   opus_int32 tell;
+   int inv=0;
+   int encode;
+   const CELTMode *m;
+   int i;
+   int intensity;
+   ec_ctx *ec;
+   const celt_ener *bandE;
+
+   encode = ctx->encode;
+   m = ctx->m;
+   i = ctx->i;
+   intensity = ctx->intensity;
+   ec = ctx->ec;
+   bandE = ctx->bandE;
+
+   /* Decide on the resolution to give to the split parameter theta */
+   pulse_cap = m->logN[i]+LM*(1<<BITRES);
+   offset = (pulse_cap>>1) - (stereo&&N==2 ? QTHETA_OFFSET_TWOPHASE : QTHETA_OFFSET);
+   qn = compute_qn(N, *b, offset, pulse_cap, stereo);
+   if (stereo && i>=intensity)
+      qn = 1;
+   if (encode)
   {
-      int c;
-      celt_norm *x = X;
-      c=0; do {
-         int sign=0;
-         if (*remaining_bits>=1<<BITRES)
+      /* theta is the atan() of the ratio between the (normalized)
+         side and mid. With just that parameter, we can re-scale both
+         mid and side because we know that 1) they have unit norm and
+         2) they are orthogonal. */
+      itheta = stereo_itheta(X, Y, stereo, N, ctx->arch);
+   }
+   tell = ec_tell_frac(ec);
+   if (qn!=1)
+   {
+      if (encode)
+      {
+         if (!stereo || ctx->theta_round == 0)
         {
-            if (encode)
+            itheta = (itheta*(opus_int32)qn+8192)>>14;
+            if (!stereo && ctx->avoid_split_noise && itheta > 0 && itheta < qn)
            {
-               sign = x[0]<0;
-               ec_enc_bits(ec, sign, 1);
-            } else {
-               sign = ec_dec_bits(ec, 1);
+               /* Check if the selected value of theta will cause the bit allocation
+                  to inject noise on one side. If so, make sure the energy of that side
+                  is zero. */
+               int unquantized = celt_udiv((opus_int32)itheta*16384, qn);
+               imid = bitexact_cos((opus_int16)unquantized);
+               iside = bitexact_cos((opus_int16)(16384-unquantized));
+               delta = FRAC_MUL16((N-1)<<7,bitexact_log2tan(iside,imid));
+               if (delta > *b)
+                  itheta = qn;
+               else if (delta < -*b)
+                  itheta = 0;
            }
-            *remaining_bits -= 1<<BITRES;
-            b-=1<<BITRES;
+         } else {
+            int down;
+            /* Bias quantization towards itheta=0 and itheta=16384. */
+            int bias = itheta > 8192 ? 32767/qn : -32767/qn;
+            down = IMIN(qn-1, IMAX(0, (itheta*(opus_int32)qn + bias)>>14));
+            if (ctx->theta_round < 0)
+               itheta = down;
+            else
+               itheta = down+1;
         }
-         if (resynth)
-            x[0] = sign ? -NORM_SCALING : NORM_SCALING;
-         x = Y;
-      } while (++c<1+stereo);
-      if (lowband_out)
-         lowband_out[0] = SHR16(X[0],4);
-      return 1;
-   }
-
-   if (!stereo && level == 0)
-   {
-      int k;
-      if (tf_change>0)
-         recombine = tf_change;
-      /* Band recombining to increase frequency resolution */
-
-      if (lowband && (recombine || ((N_B&1) == 0 && tf_change<0) || B0>1))
-      {
-         int j;
-         for (j=0;j<N;j++)
-            lowband_scratch[j] = lowband[j];
-         lowband = lowband_scratch;
      }
-
-      for (k=0;k<recombine;k++)
+      /* Entropy coding of the angle. We use a uniform pdf for the
+         time split, a step for stereo, and a triangular one for the rest. */
+      if (stereo && N>2)
      {
-         static const unsigned char bit_interleave_table[16]={
-           0,1,1,1,2,3,3,3,2,3,3,3,2,3,3,3
-         };
+         int p0 = 3;
+         int x = itheta;
+         int x0 = qn/2;
+         int ft = p0*(x0+1) + x0;
+         /* Use a probability of p0 up to itheta=8192 and then use 1 after */
         if (encode)
-            haar1(X, N>>k, 1<<k);
-         if (lowband)
-            haar1(lowband, N>>k, 1<<k);
-         fill = bit_interleave_table[fill&0xF]|bit_interleave_table[fill>>4]<<2;
-      }
-      B>>=recombine;
-      N_B<<=recombine;
-
-      /* Increasing the time resolution */
-      while ((N_B&1) == 0 && tf_change<0)
-      {
+         {
+            ec_encode(ec,x<=x0?p0*x:(x-1-x0)+(x0+1)*p0,x<=x0?p0*(x+1):(x-x0)+(x0+1)*p0,ft);
+         } else {
+            int fs;
+            fs=ec_decode(ec,ft);
+            if (fs<(x0+1)*p0)
+               x=fs/p0;
+            else
+               x=x0+1+(fs-(x0+1)*p0);
+            ec_dec_update(ec,x<=x0?p0*x:(x-1-x0)+(x0+1)*p0,x<=x0?p0*(x+1):(x-x0)+(x0+1)*p0,ft);
+            itheta = x;
+         }
+      } else if (B0>1 || stereo) {
+         /* Uniform pdf */
         if (encode)
-            haar1(X, N_B, B);
-         if (lowband)
-            haar1(lowband, N_B, B);
-         fill |= fill<<B;
-         B <<= 1;
-         N_B >>= 1;
-         time_divide++;
-         tf_change++;
-      }
-      B0=B;
-      N_B0 = N_B;
-
-      /* Reorganize the samples in time order instead of frequency order */
-      if (B0>1)
-      {
+            ec_enc_uint(ec, itheta, qn+1);
+         else
+            itheta = ec_dec_uint(ec, qn+1);
+      } else {
+         int fs=1, ft;
+         ft = ((qn>>1)+1)*((qn>>1)+1);
         if (encode)
-            deinterleave_hadamard(X, N_B>>recombine, B0<<recombine, longBlocks);
-         if (lowband)
-            deinterleave_hadamard(lowband, N_B>>recombine, B0<<recombine, longBlocks);
-      }
-   }
+         {
+            int fl;

-   /* If we need 1.5 more bit than we can produce, split the band in two. */
-   cache = m->cache.bits + m->cache.index[(LM+1)*m->nbEBands+i];
-   if (!stereo && LM != -1 && b > cache[cache[0]]+12 && N>2)
-   {
-      if (LM>0 || (N&1)==0)
-      {
-         N >>= 1;
-         Y = X+N;
-         split = 1;
-         LM -= 1;
-         if (B==1)
-            fill = fill&1|fill<<1;
-         B = (B+1)>>1;
-      }
-   }
+            fs = itheta <= (qn>>1) ? itheta + 1 : qn + 1 - itheta;
+            fl = itheta <= (qn>>1) ? itheta*(itheta + 1)>>1 :
+             ft - ((qn + 1 - itheta)*(qn + 2 - itheta)>>1);

-   if (split)
-   {
-      int qn;
-      int itheta=0;
-      int mbits, sbits, delta;
-      int qalloc;
-      int pulse_cap;
-      int offset;
-      int orig_fill;
-      opus_int32 tell;
+            ec_encode(ec, fl, fl+fs, ft);
+         } else {
+            /* Triangular pdf */
+            int fl=0;
+            int fm;
+            fm = ec_decode(ec, ft);
+
+            if (fm < ((qn>>1)*((qn>>1) + 1)>>1))
+            {
+               itheta = (isqrt32(8*(opus_uint32)fm + 1) - 1)>>1;
+               fs = itheta + 1;
+               fl = itheta*(itheta + 1)>>1;
+            }
+            else
+            {
+               itheta = (2*(qn + 1)
+                - isqrt32(8*(opus_uint32)(ft - fm - 1) + 1))>>1;
+               fs = qn + 1 - itheta;
+               fl = ft - ((qn + 1 - itheta)*(qn + 2 - itheta)>>1);
+            }

-      /* Decide on the resolution to give to the split parameter theta */
-      pulse_cap = m->logN[i]+(LM<<BITRES);
-      offset = (pulse_cap>>1) - (stereo&&N==2 ? QTHETA_OFFSET_TWOPHASE : QTHETA_OFFSET);
-      qn = compute_qn(N, b, offset, pulse_cap, stereo);
-      if (stereo && i>=intensity)
-         qn = 1;
+            ec_dec_update(ec, fl, fl+fs, ft);
+         }
+      }
+      celt_assert(itheta>=0);
+      itheta = celt_udiv((opus_int32)itheta*16384, qn);
+      if (encode && stereo)
+      {
+         if (itheta==0)
+            intensity_stereo(m, X, Y, bandE, i, N);
+         else
+            stereo_split(X, Y, N);
+      }
+      /* NOTE: Renormalising X and Y *may* help fixed-point a bit at very high rate.
+               Let's do that at higher complexity */
+   } else if (stereo) {
      if (encode)
      {
-         /* theta is the atan() of the ratio between the (normalized)
-            side and mid. With just that parameter, we can re-scale both
-            mid and side because we know that 1) they have unit norm and
-            2) they are orthogonal. */
-         itheta = stereo_itheta(X, Y, stereo, N);
+         inv = itheta > 8192 && !ctx->disable_inv;
+         if (inv)
+         {
+            int j;
+            for (j=0;j<N;j++)
+               Y[j] = -Y[j];
+         }
+         intensity_stereo(m, X, Y, bandE, i, N);
      }
-      tell = ec_tell_frac(ec);
-      if (qn!=1)
+      if (*b>2<<BITRES && ctx->remaining_bits > 2<<BITRES)
      {
         if (encode)
-            itheta = (itheta*qn+8192)>>14;
-
-         /* Entropy coding of the angle. We use a uniform pdf for the
-            time split, a step for stereo, and a triangular one for the rest. */
-         if (stereo && N>2)
-         {
-            int p0 = 3;
-            int x = itheta;
-            int x0 = qn/2;
-            int ft = p0*(x0+1) + x0;
-            /* Use a probability of p0 up to itheta=8192 and then use 1 after */
-            if (encode)
-            {
-               ec_encode(ec,x<=x0?p0*x:(x-1-x0)+(x0+1)*p0,x<=x0?p0*(x+1):(x-x0)+(x0+1)*p0,ft);
-            } else {
-               int fs;
-               fs=ec_decode(ec,ft);
-               if (fs<(x0+1)*p0)
-                  x=fs/p0;
-               else
-                  x=x0+1+(fs-(x0+1)*p0);
-               ec_dec_update(ec,x<=x0?p0*x:(x-1-x0)+(x0+1)*p0,x<=x0?p0*(x+1):(x-x0)+(x0+1)*p0,ft);
-               itheta = x;
-            }
-         } else if (B0>1 || stereo) {
-            /* Uniform pdf */
-            if (encode)
-               ec_enc_uint(ec, itheta, qn+1);
-            else
-               itheta = ec_dec_uint(ec, qn+1);
-         } else {
-            int fs=1, ft;
-            ft = ((qn>>1)+1)*((qn>>1)+1);
-            if (encode)
-            {
-               int fl;
+            ec_enc_bit_logp(ec, inv, 2);
+         else
+            inv = ec_dec_bit_logp(ec, 2);
+      } else
+         inv = 0;
+      /* inv flag override to avoid problems with downmixing. */
+      if (ctx->disable_inv)
+         inv = 0;
+      itheta = 0;
+   }
+   qalloc = ec_tell_frac(ec) - tell;
+   *b -= qalloc;

-               fs = itheta <= (qn>>1) ? itheta + 1 : qn + 1 - itheta;
-               fl = itheta <= (qn>>1) ? itheta*(itheta + 1)>>1 :
-                ft - ((qn + 1 - itheta)*(qn + 2 - itheta)>>1);
+   if (itheta == 0)
+   {
+      imid = 32767;
+      iside = 0;
+      *fill &= (1<<B)-1;
+      delta = -16384;
+   } else if (itheta == 16384)
+   {
+      imid = 0;
+      iside = 32767;
+      *fill &= ((1<<B)-1)<<B;
+      delta = 16384;
+   } else {
+      imid = bitexact_cos((opus_int16)itheta);
+      iside = bitexact_cos((opus_int16)(16384-itheta));
+      /* This is the mid vs side allocation that minimizes squared error
+         in that band. */
+      delta = FRAC_MUL16((N-1)<<7,bitexact_log2tan(iside,imid));
+   }

-               ec_encode(ec, fl, fl+fs, ft);
-            } else {
-               /* Triangular pdf */
-               int fl=0;
-               int fm;
-               fm = ec_decode(ec, ft);
+   sctx->inv = inv;
+   sctx->imid = imid;
+   sctx->iside = iside;
+   sctx->delta = delta;
+   sctx->itheta = itheta;
+   sctx->qalloc = qalloc;
+}
+static unsigned quant_band_n1(struct band_ctx *ctx, celt_norm *X, celt_norm *Y,
+      celt_norm *lowband_out)
+{
+   int c;
+   int stereo;
+   celt_norm *x = X;
+   int encode;
+   ec_ctx *ec;

-               if (fm < ((qn>>1)*((qn>>1) + 1)>>1))
-               {
-                  itheta = (isqrt32(8*(opus_uint32)fm + 1) - 1)>>1;
-                  fs = itheta + 1;
-                  fl = itheta*(itheta + 1)>>1;
-               }
-               else
-               {
-                  itheta = (2*(qn + 1)
-                   - isqrt32(8*(opus_uint32)(ft - fm - 1) + 1))>>1;
-                  fs = qn + 1 - itheta;
-                  fl = ft - ((qn + 1 - itheta)*(qn + 2 - itheta)>>1);
-               }
+   encode = ctx->encode;
+   ec = ctx->ec;

-               ec_dec_update(ec, fl, fl+fs, ft);
-            }
-         }
-         itheta = (opus_int32)itheta*16384/qn;
-         if (encode && stereo)
-         {
-            if (itheta==0)
-               intensity_stereo(m, X, Y, bandE, i, N);
-            else
-               stereo_split(X, Y, N);
-         }
-         /* TODO: Renormalising X and Y *may* help fixed-point a bit at very high rate.
-                  Let's do that at higher complexity */
-      } else if (stereo) {
+   stereo = Y != NULL;
+   c=0; do {
+      int sign=0;
+      if (ctx->remaining_bits>=1<<BITRES)
+      {
         if (encode)
         {
-            inv = itheta > 8192;
-            if (inv)
-            {
-               int j;
-               for (j=0;j<N;j++)
-                  Y[j] = -Y[j];
-            }
-            intensity_stereo(m, X, Y, bandE, i, N);
+            sign = x[0]<0;
+            ec_enc_bits(ec, sign, 1);
+         } else {
+            sign = ec_dec_bits(ec, 1);
         }
-         if (b>2<<BITRES && *remaining_bits > 2<<BITRES)
-         {
-            if (encode)
-               ec_enc_bit_logp(ec, inv, 2);
-            else
-               inv = ec_dec_bit_logp(ec, 2);
-         } else
-            inv = 0;
-         itheta = 0;
+         ctx->remaining_bits -= 1<<BITRES;
      }
-      qalloc = ec_tell_frac(ec) - tell;
-      b -= qalloc;
+      if (ctx->resynth)
+         x[0] = sign ? -NORM_SCALING : NORM_SCALING;
+      x = Y;
+   } while (++c<1+stereo);
+   if (lowband_out)
+      lowband_out[0] = SHR16(X[0],4);
+   return 1;
+}

-      orig_fill = fill;
-      if (itheta == 0)
-      {
-         imid = 32767;
-         iside = 0;
-         fill &= (1<<B)-1;
-         delta = -16384;
-      } else if (itheta == 16384)
-      {
-         imid = 0;
-         iside = 32767;
-         fill &= (1<<B)-1<<B;
-         delta = 16384;
-      } else {
-         imid = bitexact_cos(itheta);
-         iside = bitexact_cos(16384-itheta);
-         /* This is the mid vs side allocation that minimizes squared error
-            in that band. */
-         delta = FRAC_MUL16(N-1<<7,bitexact_log2tan(iside,imid));
-      }
+/* This function is responsible for encoding and decoding a mono partition.
+   It can split the band in two and transmit the energy difference with
+   the two half-bands. It can be called recursively so bands can end up being
+   split in 8 parts. */
+static unsigned quant_partition(struct band_ctx *ctx, celt_norm *X,
+      int N, int b, int B, celt_norm *lowband,
+      int LM,
+      opus_val32 gain, int fill)
+{
+   const unsigned char *cache;
+   int q;
+   int curr_bits;
+   int imid=0, iside=0;
+   int B0=B;
+   opus_val32 mid=0, side=0;
+   unsigned cm=0;
+   celt_norm *Y=NULL;
+   int encode;
+   const CELTMode *m;
+   int i;
+   int spread;
+   ec_ctx *ec;
+
+   encode = ctx->encode;
+   m = ctx->m;
+   i = ctx->i;
+   spread = ctx->spread;
+   ec = ctx->ec;

+   /* If we need 1.5 more bit than we can produce, split the band in two. */
+   cache = m->cache.bits + m->cache.index[(LM+1)*m->nbEBands+i];
+   if (LM != -1 && b > cache[cache[0]]+12 && N>2)
+   {
+      int mbits, sbits, delta;
+      int itheta;
+      int qalloc;
+      struct split_ctx sctx;
+      celt_norm *next_lowband2=NULL;
+      opus_int32 rebalance;
+
+      N >>= 1;
+      Y = X+N;
+      LM -= 1;
+      if (B==1)
+         fill = (fill&1)|(fill<<1);
+      B = (B+1)>>1;
+
+      compute_theta(ctx, &sctx, X, Y, N, &b, B, B0, LM, 0, &fill);
+      imid = sctx.imid;
+      iside = sctx.iside;
+      delta = sctx.delta;
+      itheta = sctx.itheta;
+      qalloc = sctx.qalloc;
 #ifdef FIXED_POINT
-      mid = imid;
-      side = iside;
+      mid = SHL32(EXTEND32(imid), 16);
+      side = SHL32(EXTEND32(iside), 16);
 #else
      mid = (1.f/32768)*imid;
      side = (1.f/32768)*iside;
 #endif

-      /* This is a special case for N=2 that only works for stereo and takes
-         advantage of the fact that mid and side are orthogonal to encode
-         the side with just one bit. */
-      if (N==2 && stereo)
+      /* Give more bits to low-energy MDCTs than they would otherwise deserve */
+      if (B0>1 && (itheta&0x3fff))
      {
-         int c;
-         int sign=0;
-         celt_norm *x2, *y2;
-         mbits = b;
-         sbits = 0;
-         /* Only need one bit for the side */
-         if (itheta != 0 && itheta != 16384)
-            sbits = 1<<BITRES;
-         mbits -= sbits;
-         c = itheta > 8192;
-         *remaining_bits -= qalloc+sbits;
-
-         x2 = c ? Y : X;
-         y2 = c ? X : Y;
-         if (sbits)
-         {
-            if (encode)
-            {
-               /* Here we only need to encode a sign for the side */
-               sign = x2[0]*y2[1] - x2[1]*y2[0] < 0;
-               ec_enc_bits(ec, sign, 1);
-            } else {
-               sign = ec_dec_bits(ec, 1);
-            }
-         }
-         sign = 1-2*sign;
-         /* We use orig_fill here because we want to fold the side, but if
-             itheta==16384, we'll have cleared the low bits of fill. */
-         cm = quant_band(encode, m, i, x2, NULL, N, mbits, spread, B, intensity, tf_change, lowband, resynth, ec, remaining_bits, LM, lowband_out, NULL, level, seed, gain, lowband_scratch, orig_fill);
-         /* We don't split N=2 bands, so cm is either 1 or 0 (for a fold-collapse),
-             and there's no need to worry about mixing with the other channel. */
-         y2[0] = -sign*x2[1];
-         y2[1] = sign*x2[0];
-         if (resynth)
-         {
-            celt_norm tmp;
-            X[0] = MULT16_16_Q15(mid, X[0]);
-            X[1] = MULT16_16_Q15(mid, X[1]);
-            Y[0] = MULT16_16_Q15(side, Y[0]);
-            Y[1] = MULT16_16_Q15(side, Y[1]);
-            tmp = X[0];
-            X[0] = SUB16(tmp,Y[0]);
-            Y[0] = ADD16(tmp,Y[0]);
-            tmp = X[1];
-            X[1] = SUB16(tmp,Y[1]);
-            Y[1] = ADD16(tmp,Y[1]);
-         }
-      } else {
-         /* "Normal" split code */
-         celt_norm *next_lowband2=NULL;
-         celt_norm *next_lowband_out1=NULL;
-         int next_level=0;
-         opus_int32 rebalance;
-
-         /* Give more bits to low-energy MDCTs than they would otherwise deserve */
-         if (B0>1 && !stereo && (itheta&0x3fff))
-         {
-            if (itheta > 8192)
-               /* Rough approximation for pre-echo masking */
-               delta -= delta>>(4-LM);
-            else
-               /* Corresponds to a forward-masking slope of 1.5 dB per 10 ms */
-               delta = IMIN(0, delta + (N<<BITRES>>(5-LM)));
-         }
-         mbits = IMAX(0, IMIN(b, (b-delta)/2));
-         sbits = b-mbits;
-         *remaining_bits -= qalloc;
-
-         if (lowband && !stereo)
-            next_lowband2 = lowband+N; /* >32-bit split case */
-
-         /* Only stereo needs to pass on lowband_out. Otherwise, it's
-            handled at the end */
-         if (stereo)
-            next_lowband_out1 = lowband_out;
+         if (itheta > 8192)
+            /* Rough approximation for pre-echo masking */
+            delta -= delta>>(4-LM);
         else
-            next_level = level+1;
-
-         rebalance = *remaining_bits;
-         if (mbits >= sbits)
-         {
-            /* In stereo mode, we do not apply a scaling to the mid because we need the normalized
-               mid for folding later */
-            cm = quant_band(encode, m, i, X, NULL, N, mbits, spread, B, intensity, tf_change,
-                  lowband, resynth, ec, remaining_bits, LM, next_lowband_out1,
-                  NULL, next_level, seed, stereo ? Q15ONE : MULT16_16_P15(gain,mid), lowband_scratch, fill);
-            rebalance = mbits - (rebalance-*remaining_bits);
-            if (rebalance > 3<<BITRES && itheta!=0)
-               sbits += rebalance - (3<<BITRES);
-
-            /* For a stereo split, the high bits of fill are always zero, so no
-               folding will be done to the side. */
-            cm |= quant_band(encode, m, i, Y, NULL, N, sbits, spread, B, intensity, tf_change,
-                  next_lowband2, resynth, ec, remaining_bits, LM, NULL,
-                  NULL, next_level, seed, MULT16_16_P15(gain,side), NULL, fill>>B)<<(B0>>1&stereo-1);
-         } else {
-            /* For a stereo split, the high bits of fill are always zero, so no
-               folding will be done to the side. */
-            cm = quant_band(encode, m, i, Y, NULL, N, sbits, spread, B, intensity, tf_change,
-                  next_lowband2, resynth, ec, remaining_bits, LM, NULL,
-                  NULL, next_level, seed, MULT16_16_P15(gain,side), NULL, fill>>B)<<(B0>>1&stereo-1);
-            rebalance = sbits - (rebalance-*remaining_bits);
-            if (rebalance > 3<<BITRES && itheta!=16384)
-               mbits += rebalance - (3<<BITRES);
-            /* In stereo mode, we do not apply a scaling to the mid because we need the normalized
-               mid for folding later */
-            cm |= quant_band(encode, m, i, X, NULL, N, mbits, spread, B, intensity, tf_change,
-                  lowband, resynth, ec, remaining_bits, LM, next_lowband_out1,
-                  NULL, next_level, seed, stereo ? Q15ONE : MULT16_16_P15(gain,mid), lowband_scratch, fill);
-         }
+            /* Corresponds to a forward-masking slope of 1.5 dB per 10 ms */
+            delta = IMIN(0, delta + (N<<BITRES>>(5-LM)));
      }
+      mbits = IMAX(0, IMIN(b, (b-delta)/2));
+      sbits = b-mbits;
+      ctx->remaining_bits -= qalloc;
+
+      if (lowband)
+         next_lowband2 = lowband+N; /* >32-bit split case */

+      rebalance = ctx->remaining_bits;
+      if (mbits >= sbits)
+      {
+         cm = quant_partition(ctx, X, N, mbits, B, lowband, LM,
+               MULT32_32_Q31(gain,mid), fill);
+         rebalance = mbits - (rebalance-ctx->remaining_bits);
+         if (rebalance > 3<<BITRES && itheta!=0)
+            sbits += rebalance - (3<<BITRES);
+         cm |= quant_partition(ctx, Y, N, sbits, B, next_lowband2, LM,
+               MULT32_32_Q31(gain,side), fill>>B)<<(B0>>1);
+      } else {
+         cm = quant_partition(ctx, Y, N, sbits, B, next_lowband2, LM,
+               MULT32_32_Q31(gain,side), fill>>B)<<(B0>>1);
+         rebalance = sbits - (rebalance-ctx->remaining_bits);
+         if (rebalance > 3<<BITRES && itheta!=16384)
+            mbits += rebalance - (3<<BITRES);
+         cm |= quant_partition(ctx, X, N, mbits, B, lowband, LM,
+               MULT32_32_Q31(gain,mid), fill);
+      }
   } else {
      /* This is the basic no-split case */
      q = bits2pulses(m, i, LM, b);
      curr_bits = pulses2bits(m, i, LM, q);
-      *remaining_bits -= curr_bits;
+      ctx->remaining_bits -= curr_bits;

      /* Ensures we can never bust the budget */
-      while (*remaining_bits < 0 && q > 0)
+      while (ctx->remaining_bits < 0 && q > 0)
      {
-         *remaining_bits += curr_bits;
+         ctx->remaining_bits += curr_bits;
         q--;
         curr_bits = pulses2bits(m, i, LM, q);
-         *remaining_bits -= curr_bits;
+         ctx->remaining_bits -= curr_bits;
      }

      if (q!=0)
@@ -1056,31 +1067,32 @@ static unsigned quant_band(int encode, const CELTMode *m, int i, celt_norm *X, c

         /* Finally do the actual quantization */
         if (encode)
-            cm = alg_quant(X, N, K, spread, B, resynth, ec, gain);
-         else
+         {
+            cm = alg_quant(X, N, K, spread, B, ec, gain, ctx->resynth, ctx->arch);
+         } else {
            cm = alg_unquant(X, N, K, spread, B, ec, gain);
+         }
      } else {
         /* If there's no pulse, fill the band anyway */
         int j;
-         if (resynth)
+         if (ctx->resynth)
         {
            unsigned cm_mask;
-            /*B can be as large as 16, so this shift might overflow an int on a
+            /* B can be as large as 16, so this shift might overflow an int on a
               16-bit platform; use a long to get defined behavior.*/
            cm_mask = (unsigned)(1UL<<B)-1;
            fill &= cm_mask;
            if (!fill)
            {
-               for (j=0;j<N;j++)
-                  X[j] = 0;
+               OPUS_CLEAR(X, N);
            } else {
               if (lowband == NULL)
               {
                  /* Noise */
                  for (j=0;j<N;j++)
                  {
-                     *seed = lcg_rand(*seed);
-                     X[j] = (opus_int32)(*seed)>>20;
+                     ctx->seed = celt_lcg_rand(ctx->seed);
+                     X[j] = (celt_norm)((opus_int32)ctx->seed>>20);
                  }
                  cm = cm_mask;
               } else {
@@ -1088,104 +1100,390 @@ static unsigned quant_band(int encode, const CELTMode *m, int i, celt_norm *X, c
                  for (j=0;j<N;j++)
                  {
                     opus_val16 tmp;
-                     *seed = lcg_rand(*seed);
+                     ctx->seed = celt_lcg_rand(ctx->seed);
                     /* About 48 dB below the "normal" folding level */
                     tmp = QCONST16(1.0f/256, 10);
-                     tmp = (*seed)&0x8000 ? tmp : -tmp;
+                     tmp = (ctx->seed)&0x8000 ? tmp : -tmp;
                     X[j] = lowband[j]+tmp;
                  }
                  cm = fill;
               }
-               renormalise_vector(X, N, gain);
+               renormalise_vector(X, N, gain, ctx->arch);
            }
         }
      }
   }

+   return cm;
+}
+
+
+/* This function is responsible for encoding and decoding a band for the mono case. */
+static unsigned quant_band(struct band_ctx *ctx, celt_norm *X,
+      int N, int b, int B, celt_norm *lowband,
+      int LM, celt_norm *lowband_out,
+      opus_val32 gain, celt_norm *lowband_scratch, int fill)
+{
+   int N0=N;
+   int N_B=N;
+   int N_B0;
+   int B0=B;
+   int time_divide=0;
+   int recombine=0;
+   int longBlocks;
+   unsigned cm=0;
+   int k;
+   int encode;
+   int tf_change;
+
+   encode = ctx->encode;
+   tf_change = ctx->tf_change;
+
+   longBlocks = B0==1;
+
+   N_B = celt_udiv(N_B, B);
+
+   /* Special case for one sample */
+   if (N==1)
+   {
+      return quant_band_n1(ctx, X, NULL, lowband_out);
+   }
+
+   if (tf_change>0)
+      recombine = tf_change;
+   /* Band recombining to increase frequency resolution */
+
+   if (lowband_scratch && lowband && (recombine || ((N_B&1) == 0 && tf_change<0) || B0>1))
+   {
+      OPUS_COPY(lowband_scratch, lowband, N);
+      lowband = lowband_scratch;
+   }
+
+   for (k=0;k<recombine;k++)
+   {
+      static const unsigned char bit_interleave_table[16]={
+            0,1,1,1,2,3,3,3,2,3,3,3,2,3,3,3
+      };
+      if (encode)
+         haar1(X, N>>k, 1<<k);
+      if (lowband)
+         haar1(lowband, N>>k, 1<<k);
+      fill = bit_interleave_table[fill&0xF]|bit_interleave_table[fill>>4]<<2;
+   }
+   B>>=recombine;
+   N_B<<=recombine;
+
+   /* Increasing the time resolution */
+   while ((N_B&1) == 0 && tf_change<0)
+   {
+      if (encode)
+         haar1(X, N_B, B);
+      if (lowband)
+         haar1(lowband, N_B, B);
+      fill |= fill<<B;
+      B <<= 1;
+      N_B >>= 1;
+      time_divide++;
+      tf_change++;
+   }
+   B0=B;
+   N_B0 = N_B;
+
+   /* Reorganize the samples in time order instead of frequency order */
+   if (B0>1)
+   {
+      if (encode)
+         deinterleave_hadamard(X, N_B>>recombine, B0<<recombine, longBlocks);
+      if (lowband)
+         deinterleave_hadamard(lowband, N_B>>recombine, B0<<recombine, longBlocks);
+   }
+
+   cm = quant_partition(ctx, X, N, b, B, lowband, LM, gain, fill);
+
   /* This code is used by the decoder and by the resynthesis-enabled encoder */
-   if (resynth)
+   if (ctx->resynth)
   {
-      if (stereo)
+      /* Undo the sample reorganization going from time order to frequency order */
+      if (B0>1)
+         interleave_hadamard(X, N_B>>recombine, B0<<recombine, longBlocks);
+
+      /* Undo time-freq changes that we did earlier */
+      N_B = N_B0;
+      B = B0;
+      for (k=0;k<time_divide;k++)
      {
-         if (N!=2)
-            stereo_merge(X, Y, mid, N);
-         if (inv)
-         {
-            int j;
-            for (j=0;j<N;j++)
-               Y[j] = -Y[j];
-         }
-      } else if (level == 0)
+         B >>= 1;
+         N_B <<= 1;
+         cm |= cm>>B;
+         haar1(X, N_B, B);
+      }
+
+      for (k=0;k<recombine;k++)
+      {
+         static const unsigned char bit_deinterleave_table[16]={
+               0x00,0x03,0x0C,0x0F,0x30,0x33,0x3C,0x3F,
+               0xC0,0xC3,0xCC,0xCF,0xF0,0xF3,0xFC,0xFF
+         };
+         cm = bit_deinterleave_table[cm];
+         haar1(X, N0>>k, 1<<k);
+      }
+      B<<=recombine;
+
+      /* Scale output for later folding */
+      if (lowband_out)
      {
-         int k;
+         int j;
+         opus_val16 n;
+         n = celt_sqrt(SHL32(EXTEND32(N0),22));
+         for (j=0;j<N0;j++)
+            lowband_out[j] = MULT16_16_Q15(n,X[j]);
+      }
+      cm &= (1<<B)-1;
+   }
+   return cm;
+}

-         /* Undo the sample reorganization going from time order to frequency order */
-         if (B0>1)
-            interleave_hadamard(X, N_B>>recombine, B0<<recombine, longBlocks);

-         /* Undo time-freq changes that we did earlier */
-         N_B = N_B0;
-         B = B0;
-         for (k=0;k<time_divide;k++)
-         {
-            B >>= 1;
-            N_B <<= 1;
-            cm |= cm>>B;
-            haar1(X, N_B, B);
-         }
+/* This function is responsible for encoding and decoding a band for the stereo case. */
+static unsigned quant_band_stereo(struct band_ctx *ctx, celt_norm *X, celt_norm *Y,
+      int N, int b, int B, celt_norm *lowband,
+      int LM, celt_norm *lowband_out,
+      celt_norm *lowband_scratch, int fill)
+{
+   int imid=0, iside=0;
+   int inv = 0;
+   opus_val32 mid=0, side=0;
+   unsigned cm=0;
+   int mbits, sbits, delta;
+   int itheta;
+   int qalloc;
+   struct split_ctx sctx;
+   int orig_fill;
+   int encode;
+   ec_ctx *ec;

-         for (k=0;k<recombine;k++)
-         {
-            static const unsigned char bit_deinterleave_table[16]={
-              0x00,0x03,0x0C,0x0F,0x30,0x33,0x3C,0x3F,
-              0xC0,0xC3,0xCC,0xCF,0xF0,0xF3,0xFC,0xFF
-            };
-            cm = bit_deinterleave_table[cm];
-            haar1(X, N0>>k, 1<<k);
-         }
-         B<<=recombine;
+   encode = ctx->encode;
+   ec = ctx->ec;
+
+   /* Special case for one sample */
+   if (N==1)
+   {
+      return quant_band_n1(ctx, X, Y, lowband_out);
+   }

-         /* Scale output for later folding */
-         if (lowband_out)
+   orig_fill = fill;
+
+   compute_theta(ctx, &sctx, X, Y, N, &b, B, B, LM, 1, &fill);
+   inv = sctx.inv;
+   imid = sctx.imid;
+   iside = sctx.iside;
+   delta = sctx.delta;
+   itheta = sctx.itheta;
+   qalloc = sctx.qalloc;
+#ifdef FIXED_POINT
+   mid = SHL32(EXTEND32(imid), 16);
+   side = SHL32(EXTEND32(iside), 16);
+#else
+   mid = (1.f/32768)*imid;
+   side = (1.f/32768)*iside;
+#endif
+
+   /* This is a special case for N=2 that only works for stereo and takes
+      advantage of the fact that mid and side are orthogonal to encode
+      the side with just one bit. */
+   if (N==2)
+   {
+      int c;
+      int sign=0;
+      celt_norm *x2, *y2;
+      mbits = b;
+      sbits = 0;
+      /* Only need one bit for the side. */
+      if (itheta != 0 && itheta != 16384)
+         sbits = 1<<BITRES;
+      mbits -= sbits;
+      c = itheta > 8192;
+      ctx->remaining_bits -= qalloc+sbits;
+
+      x2 = c ? Y : X;
+      y2 = c ? X : Y;
+      if (sbits)
+      {
+         if (encode)
         {
-            int j;
-            opus_val16 n;
-            n = celt_sqrt(SHL32(EXTEND32(N0),22));
-            for (j=0;j<N0;j++)
-               lowband_out[j] = MULT16_16_Q15(n,X[j]);
+            /* Here we only need to encode a sign for the side. */
+            sign = x2[0]*y2[1] - x2[1]*y2[0] < 0;
+            ec_enc_bits(ec, sign, 1);
+         } else {
+            sign = ec_dec_bits(ec, 1);
         }
-         cm &= (1<<B)-1;
+      }
+      sign = 1-2*sign;
+      /* We use orig_fill here because we want to fold the side, but if
+         itheta==16384, we'll have cleared the low bits of fill. */
+      cm = quant_band(ctx, x2, N, mbits, B, lowband, LM, lowband_out, Q31ONE,
+            lowband_scratch, orig_fill);
+      /* We don't split N=2 bands, so cm is either 1 or 0 (for a fold-collapse),
+         and there's no need to worry about mixing with the other channel. */
+      y2[0] = -sign*x2[1];
+      y2[1] = sign*x2[0];
+      if (ctx->resynth)
+      {
+         celt_norm tmp;
+         X[0] = MULT32_32_Q31(mid, X[0]);
+         X[1] = MULT32_32_Q31(mid, X[1]);
+         Y[0] = MULT32_32_Q31(side, Y[0]);
+         Y[1] = MULT32_32_Q31(side, Y[1]);
+         tmp = X[0];
+         X[0] = SUB16(tmp,Y[0]);
+         Y[0] = ADD16(tmp,Y[0]);
+         tmp = X[1];
+         X[1] = SUB16(tmp,Y[1]);
+         Y[1] = ADD16(tmp,Y[1]);
+      }
+   } else {
+      /* "Normal" split code */
+      opus_int32 rebalance;
+
+      mbits = IMAX(0, IMIN(b, (b-delta)/2));
+      sbits = b-mbits;
+      ctx->remaining_bits -= qalloc;
+
+      rebalance = ctx->remaining_bits;
+      if (mbits >= sbits)
+      {
+         /* In stereo mode, we do not apply a scaling to the mid because we need the normalized
+            mid for folding later. */
+         cm = quant_band(ctx, X, N, mbits, B, lowband, LM, lowband_out, Q31ONE,
+               lowband_scratch, fill);
+         rebalance = mbits - (rebalance-ctx->remaining_bits);
+         if (rebalance > 3<<BITRES && itheta!=0)
+            sbits += rebalance - (3<<BITRES);
+
+         /* For a stereo split, the high bits of fill are always zero, so no
+            folding will be done to the side. */
+         cm |= quant_band(ctx, Y, N, sbits, B, NULL, LM, NULL, side, NULL, fill>>B);
+      } else {
+         /* For a stereo split, the high bits of fill are always zero, so no
+            folding will be done to the side. */
+         cm = quant_band(ctx, Y, N, sbits, B, NULL, LM, NULL, side, NULL, fill>>B);
+         rebalance = sbits - (rebalance-ctx->remaining_bits);
+         if (rebalance > 3<<BITRES && itheta!=16384)
+            mbits += rebalance - (3<<BITRES);
+         /* In stereo mode, we do not apply a scaling to the mid because we need the normalized
+            mid for folding later. */
+         cm |= quant_band(ctx, X, N, mbits, B, lowband, LM, lowband_out, Q31ONE,
+               lowband_scratch, fill);
+      }
+   }
+
+
+   /* This code is used by the decoder and by the resynthesis-enabled encoder */
+   if (ctx->resynth)
+   {
+      if (N!=2)
+         stereo_merge(X, Y, mid, N, ctx->arch);
+      if (inv)
+      {
+         int j;
+         for (j=0;j<N;j++)
+            Y[j] = -Y[j];
      }
   }
   return cm;
 }

+#ifndef DISABLE_UPDATE_DRAFT
+static void special_hybrid_folding(const CELTMode *m, celt_norm *norm, celt_norm *norm2, int start, int M, int dual_stereo)
+{
+   int n1, n2;
+   const opus_int16 * OPUS_RESTRICT eBands = m->eBands;
+   n1 = M*(eBands[start+1]-eBands[start]);
+   n2 = M*(eBands[start+2]-eBands[start+1]);
+   /* Duplicate enough of the first band folding data to be able to fold the second band.
+      Copies no data for CELT-only mode. */
+   OPUS_COPY(&norm[n1], &norm[2*n1 - n2], n2-n1);
+   if (dual_stereo)
+      OPUS_COPY(&norm2[n1], &norm2[2*n1 - n2], n2-n1);
+}
+#endif
+
 void quant_all_bands(int encode, const CELTMode *m, int start, int end,
-      celt_norm *_X, celt_norm *_Y, unsigned char *collapse_masks, const celt_ener *bandE, int *pulses,
-      int shortBlocks, int spread, int dual_stereo, int intensity, int *tf_res, int resynth,
-      opus_int32 total_bits, opus_int32 balance, ec_ctx *ec, int LM, int codedBands, opus_uint32 *seed)
+      celt_norm *X_, celt_norm *Y_, unsigned char *collapse_masks,
+      const celt_ener *bandE, int *pulses, int shortBlocks, int spread,
+      int dual_stereo, int intensity, int *tf_res, opus_int32 total_bits,
+      opus_int32 balance, ec_ctx *ec, int LM, int codedBands,
+      opus_uint32 *seed, int complexity, int arch, int disable_inv)
 {
   int i;
   opus_int32 remaining_bits;
-   const opus_int16 * restrict eBands = m->eBands;
-   celt_norm * restrict norm, * restrict norm2;
+   const opus_int16 * OPUS_RESTRICT eBands = m->eBands;
+   celt_norm * OPUS_RESTRICT norm, * OPUS_RESTRICT norm2;
   VARDECL(celt_norm, _norm);
-   VARDECL(celt_norm, lowband_scratch);
+   VARDECL(celt_norm, _lowband_scratch);
+   VARDECL(celt_norm, X_save);
+   VARDECL(celt_norm, Y_save);
+   VARDECL(celt_norm, X_save2);
+   VARDECL(celt_norm, Y_save2);
+   VARDECL(celt_norm, norm_save2);
+   int resynth_alloc;
+   celt_norm *lowband_scratch;
   int B;
   int M;
   int lowband_offset;
   int update_lowband = 1;
-   int C = _Y != NULL ? 2 : 1;
+   int C = Y_ != NULL ? 2 : 1;
+   int norm_offset;
+   int theta_rdo = encode && Y_!=NULL && !dual_stereo && complexity>=8;
+#ifdef RESYNTH
+   int resynth = 1;
+#else
+   int resynth = !encode || theta_rdo;
+#endif
+   struct band_ctx ctx;
   SAVE_STACK;

   M = 1<<LM;
   B = shortBlocks ? M : 1;
-   ALLOC(_norm, C*M*eBands[m->nbEBands], celt_norm);
-   ALLOC(lowband_scratch, M*(eBands[m->nbEBands]-eBands[m->nbEBands-1]), celt_norm);
+   norm_offset = M*eBands[start];
+   /* No need to allocate norm for the last band because we don't need an
+      output in that band. */
+   ALLOC(_norm, C*(M*eBands[m->nbEBands-1]-norm_offset), celt_norm);
   norm = _norm;
-   norm2 = norm + M*eBands[m->nbEBands];
+   norm2 = norm + M*eBands[m->nbEBands-1]-norm_offset;
+
+   /* For decoding, we can use the last band as scratch space because we don't need that
+      scratch space for the last band and we don't care about the data there until we're
+      decoding the last band. */
+   if (encode && resynth)
+      resynth_alloc = M*(eBands[m->nbEBands]-eBands[m->nbEBands-1]);
+   else
+      resynth_alloc = ALLOC_NONE;
+   ALLOC(_lowband_scratch, resynth_alloc, celt_norm);
+   if (encode && resynth)
+      lowband_scratch = _lowband_scratch;
+   else
+      lowband_scratch = X_+M*eBands[m->effEBands-1];
+   ALLOC(X_save, resynth_alloc, celt_norm);
+   ALLOC(Y_save, resynth_alloc, celt_norm);
+   ALLOC(X_save2, resynth_alloc, celt_norm);
+   ALLOC(Y_save2, resynth_alloc, celt_norm);
+   ALLOC(norm_save2, resynth_alloc, celt_norm);

   lowband_offset = 0;
+   ctx.bandE = bandE;
+   ctx.ec = ec;
+   ctx.encode = encode;
+   ctx.intensity = intensity;
+   ctx.m = m;
+   ctx.seed = *seed;
+   ctx.spread = spread;
+   ctx.arch = arch;
+   ctx.disable_inv = disable_inv;
+   ctx.resynth = resynth;
+   ctx.theta_round = 0;
+   /* Avoid injecting noise in the first band on transients. */
+   ctx.avoid_split_noise = B > 1;
   for (i=start;i<end;i++)
   {
      opus_int32 tell;
@@ -1193,55 +1491,76 @@ void quant_all_bands(int encode, const CELTMode *m, int start, int end,
      int N;
      opus_int32 curr_balance;
      int effective_lowband=-1;
-      celt_norm * restrict X, * restrict Y;
+      celt_norm * OPUS_RESTRICT X, * OPUS_RESTRICT Y;
      int tf_change=0;
      unsigned x_cm;
      unsigned y_cm;
+      int last;
+
+      ctx.i = i;
+      last = (i==end-1);

-      X = _X+M*eBands[i];
-      if (_Y!=NULL)
-         Y = _Y+M*eBands[i];
+      X = X_+M*eBands[i];
+      if (Y_!=NULL)
+         Y = Y_+M*eBands[i];
      else
         Y = NULL;
      N = M*eBands[i+1]-M*eBands[i];
+      celt_assert(N > 0);
      tell = ec_tell_frac(ec);

      /* Compute how many bits we want to allocate to this band */
      if (i != start)
         balance -= tell;
      remaining_bits = total_bits-tell-1;
+      ctx.remaining_bits = remaining_bits;
      if (i <= codedBands-1)
      {
-         curr_balance = balance / IMIN(3, codedBands-i);
+         curr_balance = celt_sudiv(balance, IMIN(3, codedBands-i));
         b = IMAX(0, IMIN(16383, IMIN(remaining_bits+1,pulses[i]+curr_balance)));
      } else {
         b = 0;
      }

+#ifndef DISABLE_UPDATE_DRAFT
+      if (resynth && (M*eBands[i]-N >= M*eBands[start] || i==start+1) && (update_lowband || lowband_offset==0))
+            lowband_offset = i;
+      if (i == start+1)
+         special_hybrid_folding(m, norm, norm2, start, M, dual_stereo);
+#else
      if (resynth && M*eBands[i]-N >= M*eBands[start] && (update_lowband || lowband_offset==0))
            lowband_offset = i;
+#endif

      tf_change = tf_res[i];
+      ctx.tf_change = tf_change;
      if (i>=m->effEBands)
      {
         X=norm;
-         if (_Y!=NULL)
+         if (Y_!=NULL)
            Y = norm;
+         lowband_scratch = NULL;
      }
+      if (last && !theta_rdo)
+         lowband_scratch = NULL;

      /* Get a conservative estimate of the collapse_mask's for the bands we're
-          going to be folding from. */
+         going to be folding from. */
      if (lowband_offset != 0 && (spread!=SPREAD_AGGRESSIVE || B>1 || tf_change<0))
      {
         int fold_start;
         int fold_end;
         int fold_i;
         /* This ensures we never repeat spectral content within one band */
-         effective_lowband = IMAX(M*eBands[start], M*eBands[lowband_offset]-N);
+         effective_lowband = IMAX(0, M*eBands[lowband_offset]-norm_offset-N);
         fold_start = lowband_offset;
-         while(M*eBands[--fold_start] > effective_lowband);
+         while(M*eBands[--fold_start] > effective_lowband+norm_offset);
         fold_end = lowband_offset-1;
-         while(M*eBands[++fold_end] < effective_lowband+N);
+#ifndef DISABLE_UPDATE_DRAFT
+         while(++fold_end < i && M*eBands[fold_end] < effective_lowband+norm_offset+N);
+#else
+         while(M*eBands[++fold_end] < effective_lowband+norm_offset+N);
+#endif
         x_cm = y_cm = 0;
         fold_i = fold_start; do {
           x_cm |= collapse_masks[fold_i*C+0];
@@ -1249,7 +1568,7 @@ void quant_all_bands(int encode, const CELTMode *m, int start, int end,
         } while (++fold_i<fold_end);
      }
      /* Otherwise, we'll be using the LCG to fold, so all blocks will (almost
-          always) be non-zero.*/
+         always) be non-zero. */
      else
         x_cm = y_cm = (1<<B)-1;

@@ -1257,32 +1576,111 @@ void quant_all_bands(int encode, const CELTMode *m, int start, int end,
      {
         int j;

-         /* Switch off dual stereo to do intensity */
+         /* Switch off dual stereo to do intensity. */
         dual_stereo = 0;
-         for (j=M*eBands[start];j<M*eBands[i];j++)
-            norm[j] = HALF32(norm[j]+norm2[j]);
+         if (resynth)
+            for (j=0;j<M*eBands[i]-norm_offset;j++)
+               norm[j] = HALF32(norm[j]+norm2[j]);
      }
      if (dual_stereo)
      {
-         x_cm = quant_band(encode, m, i, X, NULL, N, b/2, spread, B, intensity, tf_change,
-               effective_lowband != -1 ? norm+effective_lowband : NULL, resynth, ec, &remaining_bits, LM,
-               norm+M*eBands[i], bandE, 0, seed, Q15ONE, lowband_scratch, x_cm);
-         y_cm = quant_band(encode, m, i, Y, NULL, N, b/2, spread, B, intensity, tf_change,
-               effective_lowband != -1 ? norm2+effective_lowband : NULL, resynth, ec, &remaining_bits, LM,
-               norm2+M*eBands[i], bandE, 0, seed, Q15ONE, lowband_scratch, y_cm);
+         x_cm = quant_band(&ctx, X, N, b/2, B,
+               effective_lowband != -1 ? norm+effective_lowband : NULL, LM,
+               last?NULL:norm+M*eBands[i]-norm_offset, Q31ONE, lowband_scratch, x_cm);
+         y_cm = quant_band(&ctx, Y, N, b/2, B,
+               effective_lowband != -1 ? norm2+effective_lowband : NULL, LM,
+               last?NULL:norm2+M*eBands[i]-norm_offset, Q31ONE, lowband_scratch, y_cm);
      } else {
-         x_cm = quant_band(encode, m, i, X, Y, N, b, spread, B, intensity, tf_change,
-               effective_lowband != -1 ? norm+effective_lowband : NULL, resynth, ec, &remaining_bits, LM,
-               norm+M*eBands[i], bandE, 0, seed, Q15ONE, lowband_scratch, x_cm|y_cm);
+         if (Y!=NULL)
+         {
+            if (theta_rdo && i < intensity)
+            {
+               ec_ctx ec_save, ec_save2;
+               struct band_ctx ctx_save, ctx_save2;
+               opus_val32 dist0, dist1;
+               unsigned cm, cm2;
+               int nstart_bytes, nend_bytes, save_bytes;
+               unsigned char *bytes_buf;
+               unsigned char bytes_save[1275];
+               opus_val16 w[2];
+               compute_channel_weights(bandE[i], bandE[i+m->nbEBands], w);
+               /* Make a copy. */
+               cm = x_cm|y_cm;
+               ec_save = *ec;
+               ctx_save = ctx;
+               OPUS_COPY(X_save, X, N);
+               OPUS_COPY(Y_save, Y, N);
+               /* Encode and round down. */
+               ctx.theta_round = -1;
+               x_cm = quant_band_stereo(&ctx, X, Y, N, b, B,
+                     effective_lowband != -1 ? norm+effective_lowband : NULL, LM,
+                     last?NULL:norm+M*eBands[i]-norm_offset, lowband_scratch, cm);
+               dist0 = MULT16_32_Q15(w[0], celt_inner_prod(X_save, X, N, arch)) + MULT16_32_Q15(w[1], celt_inner_prod(Y_save, Y, N, arch));
+
+               /* Save first result. */
+               cm2 = x_cm;
+               ec_save2 = *ec;
+               ctx_save2 = ctx;
+               OPUS_COPY(X_save2, X, N);
+               OPUS_COPY(Y_save2, Y, N);
+               if (!last)
+                  OPUS_COPY(norm_save2, norm+M*eBands[i]-norm_offset, N);
+               nstart_bytes = ec_save.offs;
+               nend_bytes = ec_save.storage;
+               bytes_buf = ec_save.buf+nstart_bytes;
+               save_bytes = nend_bytes-nstart_bytes;
+               OPUS_COPY(bytes_save, bytes_buf, save_bytes);
+
+               /* Restore */
+               *ec = ec_save;
+               ctx = ctx_save;
+               OPUS_COPY(X, X_save, N);
+               OPUS_COPY(Y, Y_save, N);
+#ifndef DISABLE_UPDATE_DRAFT
+               if (i == start+1)
+                  special_hybrid_folding(m, norm, norm2, start, M, dual_stereo);
+#endif
+               /* Encode and round up. */
+               ctx.theta_round = 1;
+               x_cm = quant_band_stereo(&ctx, X, Y, N, b, B,
+                     effective_lowband != -1 ? norm+effective_lowband : NULL, LM,
+                     last?NULL:norm+M*eBands[i]-norm_offset, lowband_scratch, cm);
+               dist1 = MULT16_32_Q15(w[0], celt_inner_prod(X_save, X, N, arch)) + MULT16_32_Q15(w[1], celt_inner_prod(Y_save, Y, N, arch));
+               if (dist0 >= dist1) {
+                  x_cm = cm2;
+                  *ec = ec_save2;
+                  ctx = ctx_save2;
+                  OPUS_COPY(X, X_save2, N);
+                  OPUS_COPY(Y, Y_save2, N);
+                  if (!last)
+                     OPUS_COPY(norm+M*eBands[i]-norm_offset, norm_save2, N);
+                  OPUS_COPY(bytes_buf, bytes_save, save_bytes);
+               }
+            } else {
+               ctx.theta_round = 0;
+               x_cm = quant_band_stereo(&ctx, X, Y, N, b, B,
+                     effective_lowband != -1 ? norm+effective_lowband : NULL, LM,
+                     last?NULL:norm+M*eBands[i]-norm_offset, lowband_scratch, x_cm|y_cm);
+            }
+         } else {
+            x_cm = quant_band(&ctx, X, N, b, B,
+                  effective_lowband != -1 ? norm+effective_lowband : NULL, LM,
+                  last?NULL:norm+M*eBands[i]-norm_offset, Q31ONE, lowband_scratch, x_cm|y_cm);
+         }
         y_cm = x_cm;
      }
      collapse_masks[i*C+0] = (unsigned char)x_cm;
      collapse_masks[i*C+C-1] = (unsigned char)y_cm;
      balance += pulses[i] + tell;

-      /* Update the folding position only as long as we have 1 bit/sample depth */
+      /* Update the folding position only as long as we have 1 bit/sample depth. */
      update_lowband = b>(N<<BITRES);
+      /* We only need to avoid noise on a split for the first band. After that, we
+         have folding. */
+      ctx.avoid_split_noise = 0;
   }
+   *seed = ctx.seed;
+
   RESTORE_STACK;
 }

--- a/libcelt/bands.h
+++ b/libcelt/bands.h
@@ -17,8 +17,8 @@
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
-   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
@@ -36,38 +36,43 @@
 #include "entdec.h"
 #include "rate.h"

+opus_int16 bitexact_cos(opus_int16 x);
+int bitexact_log2tan(int isin,int icos);
+
 /** Compute the amplitude (sqrt energy) in each of the bands
 * @param m Mode data
 * @param X Spectrum
- * @param bands Square root of the energy for each band (returned)
+ * @param bandE Square root of the energy for each band (returned)
 */
-void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener *bands, int end, int _C, int M);
+void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener *bandE, int end, int C, int LM, int arch);

-/*void compute_noise_energies(const CELTMode *m, const celt_sig *X, const opus_val16 *tonality, celt_ener *bank);*/
+/*void compute_noise_energies(const CELTMode *m, const celt_sig *X, const opus_val16 *tonality, celt_ener *bandE);*/

 /** Normalise each band of X such that the energy in each band is
    equal to 1
 * @param m Mode data
 * @param X Spectrum (returned normalised)
- * @param bands Square root of the energy for each band
+ * @param bandE Square root of the energy for each band
 */
-void normalise_bands(const CELTMode *m, const celt_sig * restrict freq, celt_norm * restrict X, const celt_ener *bands, int end, int _C, int M);
+void normalise_bands(const CELTMode *m, const celt_sig * OPUS_RESTRICT freq, celt_norm * OPUS_RESTRICT X, const celt_ener *bandE, int end, int C, int M);

 /** Denormalise each band of X to restore full amplitude
 * @param m Mode data
 * @param X Spectrum (returned de-normalised)
- * @param bands Square root of the energy for each band
+ * @param bandE Square root of the energy for each band
 */
-void denormalise_bands(const CELTMode *m, const celt_norm * restrict X, celt_sig * restrict freq, const celt_ener *bands, int end, int _C, int M);
+void denormalise_bands(const CELTMode *m, const celt_norm * OPUS_RESTRICT X,
+      celt_sig * OPUS_RESTRICT freq, const celt_glog *bandE, int start,
+      int end, int M, int downsample, int silence);

 #define SPREAD_NONE       (0)
 #define SPREAD_LIGHT      (1)
 #define SPREAD_NORMAL     (2)
 #define SPREAD_AGGRESSIVE (3)

-int spreading_decision(const CELTMode *m, celt_norm *X, int *average,
+int spreading_decision(const CELTMode *m, const celt_norm *X, int *average,
      int last_decision, int *hf_average, int *tapset_decision, int update_hf,
-      int end, int _C, int M);
+      int end, int C, int M, const int *spread_weight);

 #ifdef MEASURE_NORM_MSE
 void measure_norm_mse(const CELTMode *m, float *X, float *X0, float *bandE, float *bandE0, int M, int N, int C);
@@ -76,22 +81,43 @@ void measure_norm_mse(const CELTMode *m, float *X, float *X0, float *bandE, floa
 void haar1(celt_norm *X, int N0, int stride);

 /** Quantisation/encoding of the residual spectrum
+ * @param encode flag that indicates whether we're encoding (1) or decoding (0)
 * @param m Mode data
+ * @param start First band to process
+ * @param end Last band to process + 1
 * @param X Residual (normalised)
+ * @param Y Residual (normalised) for second channel (or NULL for mono)
+ * @param collapse_masks Anti-collapse tracking mask
+ * @param bandE Square root of the energy for each band
+ * @param pulses Bit allocation (per band) for PVQ
+ * @param shortBlocks Zero for long blocks, non-zero for short blocks
+ * @param spread Amount of spreading to use
+ * @param dual_stereo Zero for MS stereo, non-zero for dual stereo
+ * @param intensity First band to use intensity stereo
+ * @param tf_res Time-frequency resolution change
 * @param total_bits Total number of bits that can be used for the frame (including the ones already spent)
- * @param enc Entropy encoder
+ * @param balance Number of unallocated bits
+ * @param en Entropy coder state
+ * @param LM log2() of the number of 2.5 subframes in the frame
+ * @param codedBands Last band to receive bits + 1
+ * @param seed Random generator seed
+ * @param arch Run-time architecture (see opus_select_arch())
 */
 void quant_all_bands(int encode, const CELTMode *m, int start, int end,
-      celt_norm * X, celt_norm * Y, unsigned char *collapse_masks, const celt_ener *bandE, int *pulses,
-      int time_domain, int fold, int dual_stereo, int intensity, int *tf_res, int resynth,
-      opus_int32 total_bits, opus_int32 balance, ec_ctx *ec, int M, int codedBands, opus_uint32 *seed);
+      celt_norm * X, celt_norm * Y, unsigned char *collapse_masks,
+      const celt_ener *bandE, int *pulses, int shortBlocks, int spread,
+      int dual_stereo, int intensity, int *tf_res, opus_int32 total_bits,
+      opus_int32 balance, ec_ctx *ec, int M, int codedBands, opus_uint32 *seed,
+      int complexity, int arch, int disable_inv);

-void stereo_decision(const CELTMode *m, celt_norm * restrict X, int *stereo_mode, int len, int M);
+void anti_collapse(const CELTMode *m, celt_norm *X_,
+      unsigned char *collapse_masks, int LM, int C, int size, int start,
+      int end, const celt_glog *logE, const celt_glog *prev1logE,
+      const celt_glog *prev2logE, const int *pulses, opus_uint32 seed,
+      int encode, int arch);

-void anti_collapse(const CELTMode *m, celt_norm *_X, unsigned char *collapse_masks, int LM, int C, int CC, int size,
-      int start, int end, opus_val16 *logE, opus_val16 *prev1logE,
-      opus_val16 *prev2logE, int *pulses, opus_uint32 seed);
+opus_uint32 celt_lcg_rand(opus_uint32 seed);

-opus_uint32 lcg_rand(opus_uint32 seed);
+int hysteresis_decision(opus_val16 val, const opus_val16 *thresholds, const opus_val16 *hysteresis, int N, int prev);

 #endif /* BANDS_H */
--- a/celt/celt.c
+++ b/celt/celt.c
+/* Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2010 Xiph.Org Foundation
+   Copyright (c) 2008 Gregory Maxwell
+   Written by Jean-Marc Valin and Gregory Maxwell */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#define CELT_C
+
+#include "os_support.h"
+#include "mdct.h"
+#include <math.h>
+#include "celt.h"
+#include "pitch.h"
+#include "bands.h"
+#include "modes.h"
+#include "entcode.h"
+#include "quant_bands.h"
+#include "rate.h"
+#include "stack_alloc.h"
+#include "mathops.h"
+#include "float_cast.h"
+#include <stdarg.h>
+#include "celt_lpc.h"
+#include "vq.h"
+
+#ifndef PACKAGE_VERSION
+#define PACKAGE_VERSION "unknown"
+#endif
+
+#if defined(MIPSr1_ASM)
+#include "mips/celt_mipsr1.h"
+#endif
+
+
+int resampling_factor(opus_int32 rate)
+{
+   int ret;
+   switch (rate)
+   {
+   case 48000:
+      ret = 1;
+      break;
+   case 24000:
+      ret = 2;
+      break;
+   case 16000:
+      ret = 3;
+      break;
+   case 12000:
+      ret = 4;
+      break;
+   case 8000:
+      ret = 6;
+      break;
+   default:
+#ifndef CUSTOM_MODES
+      celt_assert(0);
+#endif
+      ret = 0;
+      break;
+   }
+   return ret;
+}
+
+
+#if !defined(OVERRIDE_COMB_FILTER_CONST) || defined(NON_STATIC_COMB_FILTER_CONST_C)
+/* This version should be faster on ARM */
+#ifdef OPUS_ARM_ASM
+#ifndef NON_STATIC_COMB_FILTER_CONST_C
+static
+#endif
+void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N,
+      celt_coef g10, celt_coef g11, celt_coef g12)
+{
+   opus_val32 x0, x1, x2, x3, x4;
+   int i;
+   x4 = SHL32(x[-T-2], 1);
+   x3 = SHL32(x[-T-1], 1);
+   x2 = SHL32(x[-T], 1);
+   x1 = SHL32(x[-T+1], 1);
+   for (i=0;i<N-4;i+=5)
+   {
+      opus_val32 t;
+      x0=SHL32(x[i-T+2],1);
+      t = MAC_COEF_32_ARM(x[i], g10, x2);
+      t = MAC_COEF_32_ARM(t, g11, ADD32(x1,x3));
+      t = MAC_COEF_32_ARM(t, g12, ADD32(x0,x4));
+      t = SATURATE(t, SIG_SAT);
+      y[i] = t;
+      x4=SHL32(x[i-T+3],1);
+      t = MAC_COEF_32_ARM(x[i+1], g10, x1);
+      t = MAC_COEF_32_ARM(t, g11, ADD32(x0,x2));
+      t = MAC_COEF_32_ARM(t, g12, ADD32(x4,x3));
+      t = SATURATE(t, SIG_SAT);
+      y[i+1] = t;
+      x3=SHL32(x[i-T+4],1);
+      t = MAC_COEF_32_ARM(x[i+2], g10, x0);
+      t = MAC_COEF_32_ARM(t, g11, ADD32(x4,x1));
+      t = MAC_COEF_32_ARM(t, g12, ADD32(x3,x2));
+      t = SATURATE(t, SIG_SAT);
+      y[i+2] = t;
+      x2=SHL32(x[i-T+5],1);
+      t = MAC_COEF_32_ARM(x[i+3], g10, x4);
+      t = MAC_COEF_32_ARM(t, g11, ADD32(x3,x0));
+      t = MAC_COEF_32_ARM(t, g12, ADD32(x2,x1));
+      t = SATURATE(t, SIG_SAT);
+      y[i+3] = t;
+      x1=SHL32(x[i-T+6],1);
+      t = MAC_COEF_32_ARM(x[i+4], g10, x3);
+      t = MAC_COEF_32_ARM(t, g11, ADD32(x2,x4));
+      t = MAC_COEF_32_ARM(t, g12, ADD32(x1,x0));
+      t = SATURATE(t, SIG_SAT);
+      y[i+4] = t;
+   }
+#ifdef CUSTOM_MODES
+   for (;i<N;i++)
+   {
+      opus_val32 t;
+      x0=SHL32(x[i-T+2],1);
+      t = MAC_COEF_32_ARM(x[i], g10, x2);
+      t = MAC_COEF_32_ARM(t, g11, ADD32(x1,x3));
+      t = MAC_COEF_32_ARM(t, g12, ADD32(x0,x4));
+      t = SATURATE(t, SIG_SAT);
+      y[i] = t;
+      x4=x3;
+      x3=x2;
+      x2=x1;
+      x1=x0;
+   }
+#endif
+}
+#else
+#ifndef NON_STATIC_COMB_FILTER_CONST_C
+static
+#endif
+void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N,
+      celt_coef g10, celt_coef g11, celt_coef g12)
+{
+   opus_val32 x0, x1, x2, x3, x4;
+   int i;
+   x4 = x[-T-2];
+   x3 = x[-T-1];
+   x2 = x[-T];
+   x1 = x[-T+1];
+   for (i=0;i<N;i++)
+   {
+      x0=x[i-T+2];
+      y[i] = x[i]
+               + MULT_COEF_32(g10,x2)
+               + MULT_COEF_32(g11,ADD32(x1,x3))
+               + MULT_COEF_32(g12,ADD32(x0,x4));
+      y[i] = SATURATE(y[i], SIG_SAT);
+      x4=x3;
+      x3=x2;
+      x2=x1;
+      x1=x0;
+   }
+
+}
+#endif
+#endif
+
+#ifndef OVERRIDE_comb_filter
+void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
+      opus_val16 g0, opus_val16 g1, int tapset0, int tapset1,
+      const celt_coef *window, int overlap, int arch)
+{
+   int i;
+   /* printf ("%d %d %f %f\n", T0, T1, g0, g1); */
+   celt_coef g00, g01, g02, g10, g11, g12;
+   opus_val32 x0, x1, x2, x3, x4;
+   static const opus_val16 gains[3][3] = {
+         {QCONST16(0.3066406250f, 15), QCONST16(0.2170410156f, 15), QCONST16(0.1296386719f, 15)},
+         {QCONST16(0.4638671875f, 15), QCONST16(0.2680664062f, 15), QCONST16(0.f, 15)},
+         {QCONST16(0.7998046875f, 15), QCONST16(0.1000976562f, 15), QCONST16(0.f, 15)}};
+
+   if (g0==0 && g1==0)
+   {
+      /* OPT: Happens to work without the OPUS_MOVE(), but only because the current encoder already copies x to y */
+      if (x!=y)
+         OPUS_MOVE(y, x, N);
+      return;
+   }
+   /* When the gain is zero, T0 and/or T1 is set to zero. We need
+      to have then be at least 2 to avoid processing garbage data. */
+   T0 = IMAX(T0, COMBFILTER_MINPERIOD);
+   T1 = IMAX(T1, COMBFILTER_MINPERIOD);
+   g00 = MULT_COEF_TAPS(g0, gains[tapset0][0]);
+   g01 = MULT_COEF_TAPS(g0, gains[tapset0][1]);
+   g02 = MULT_COEF_TAPS(g0, gains[tapset0][2]);
+   g10 = MULT_COEF_TAPS(g1, gains[tapset1][0]);
+   g11 = MULT_COEF_TAPS(g1, gains[tapset1][1]);
+   g12 = MULT_COEF_TAPS(g1, gains[tapset1][2]);
+   x1 = x[-T1+1];
+   x2 = x[-T1  ];
+   x3 = x[-T1-1];
+   x4 = x[-T1-2];
+   /* If the filter didn't change, we don't need the overlap */
+   if (g0==g1 && T0==T1 && tapset0==tapset1)
+      overlap=0;
+   for (i=0;i<overlap;i++)
+   {
+      celt_coef f;
+      x0=x[i-T1+2];
+      f = MULT_COEF(window[i],window[i]);
+      y[i] = x[i]
+               + MULT_COEF_32(MULT_COEF((COEF_ONE-f),g00),x[i-T0])
+               + MULT_COEF_32(MULT_COEF((COEF_ONE-f),g01),ADD32(x[i-T0+1],x[i-T0-1]))
+               + MULT_COEF_32(MULT_COEF((COEF_ONE-f),g02),ADD32(x[i-T0+2],x[i-T0-2]))
+               + MULT_COEF_32(MULT_COEF(f,g10),x2)
+               + MULT_COEF_32(MULT_COEF(f,g11),ADD32(x1,x3))
+               + MULT_COEF_32(MULT_COEF(f,g12),ADD32(x0,x4));
+      y[i] = SATURATE(y[i], SIG_SAT);
+      x4=x3;
+      x3=x2;
+      x2=x1;
+      x1=x0;
+
+   }
+   if (g1==0)
+   {
+      /* OPT: Happens to work without the OPUS_MOVE(), but only because the current encoder already copies x to y */
+      if (x!=y)
+         OPUS_MOVE(y+overlap, x+overlap, N-overlap);
+      return;
+   }
+
+   /* Compute the part with the constant filter. */
+   comb_filter_const(y+i, x+i, T1, N-i, g10, g11, g12, arch);
+}
+#endif /* OVERRIDE_comb_filter */
+
+/* TF change table. Positive values mean better frequency resolution (longer
+   effective window), whereas negative values mean better time resolution
+   (shorter effective window). The second index is computed as:
+   4*isTransient + 2*tf_select + per_band_flag */
+const signed char tf_select_table[4][8] = {
+    /*isTransient=0     isTransient=1 */
+      {0, -1, 0, -1,    0,-1, 0,-1}, /* 2.5 ms */
+      {0, -1, 0, -2,    1, 0, 1,-1}, /* 5 ms */
+      {0, -2, 0, -3,    2, 0, 1,-1}, /* 10 ms */
+      {0, -2, 0, -3,    3, 0, 1,-1}, /* 20 ms */
+};
+
+
+void init_caps(const CELTMode *m,int *cap,int LM,int C)
+{
+   int i;
+   for (i=0;i<m->nbEBands;i++)
+   {
+      int N;
+      N=(m->eBands[i+1]-m->eBands[i])<<LM;
+      cap[i] = (m->cache.caps[m->nbEBands*(2*LM+C-1)+i]+64)*C*N>>2;
+   }
+}
+
+
+
+const char *opus_strerror(int error)
+{
+   static const char * const error_strings[8] = {
+      "success",
+      "invalid argument",
+      "buffer too small",
+      "internal error",
+      "corrupted stream",
+      "request not implemented",
+      "invalid state",
+      "memory allocation failed"
+   };
+   if (error > 0 || error < -7)
+      return "unknown error";
+   else
+      return error_strings[-error];
+}
+
+const char *opus_get_version_string(void)
+{
+    return "libopus " PACKAGE_VERSION
+    /* Applications may rely on the presence of this substring in the version
+       string to determine if they have a fixed-point or floating-point build
+       at runtime. */
+#ifdef FIXED_POINT
+          "-fixed"
+#endif
+#ifdef FUZZING
+          "-fuzzing"
+#endif
+          ;
+}
--- a/celt/celt.h
+++ b/celt/celt.h
+/* Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2009 Xiph.Org Foundation
+   Copyright (c) 2008 Gregory Maxwell
+   Written by Jean-Marc Valin and Gregory Maxwell */
+/**
+  @file celt.h
+  @brief Contains all the functions for encoding and decoding audio
+ */
+
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef CELT_H
+#define CELT_H
+
+#include "opus_types.h"
+#include "opus_defines.h"
+#include "opus_custom.h"
+#include "entenc.h"
+#include "entdec.h"
+#include "arch.h"
+#include "kiss_fft.h"
+
+#ifdef ENABLE_DEEP_PLC
+#include "lpcnet.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define CELTEncoder OpusCustomEncoder
+#define CELTDecoder OpusCustomDecoder
+#define CELTMode OpusCustomMode
+
+#define LEAK_BANDS 19
+
+typedef struct {
+   int valid;
+   float tonality;
+   float tonality_slope;
+   float noisiness;
+   float activity;
+   float music_prob;
+   float music_prob_min;
+   float music_prob_max;
+   int   bandwidth;
+   float activity_probability;
+   float max_pitch_ratio;
+   /* Store as Q6 char to save space. */
+   unsigned char leak_boost[LEAK_BANDS];
+} AnalysisInfo;
+
+typedef struct {
+   int signalType;
+   int offset;
+} SILKInfo;
+
+#define __celt_check_mode_ptr_ptr(ptr) ((ptr) + ((ptr) - (const CELTMode**)(ptr)))
+
+#define __celt_check_analysis_ptr(ptr) ((ptr) + ((ptr) - (const AnalysisInfo*)(ptr)))
+
+#define __celt_check_silkinfo_ptr(ptr) ((ptr) + ((ptr) - (const SILKInfo*)(ptr)))
+
+#define __celt_check_glog_ptr(ptr) ((ptr) + ((ptr) - (celt_glog*)(ptr)))
+
+/* Encoder/decoder Requests */
+
+
+#define CELT_SET_PREDICTION_REQUEST    10002
+/** Controls the use of interframe prediction.
+    0=Independent frames
+    1=Short term interframe prediction allowed
+    2=Long term prediction allowed
+ */
+#define CELT_SET_PREDICTION(x) CELT_SET_PREDICTION_REQUEST, __opus_check_int(x)
+
+#define CELT_SET_INPUT_CLIPPING_REQUEST    10004
+#define CELT_SET_INPUT_CLIPPING(x) CELT_SET_INPUT_CLIPPING_REQUEST, __opus_check_int(x)
+
+#define CELT_GET_AND_CLEAR_ERROR_REQUEST   10007
+#define CELT_GET_AND_CLEAR_ERROR(x) CELT_GET_AND_CLEAR_ERROR_REQUEST, __opus_check_int_ptr(x)
+
+#define CELT_SET_CHANNELS_REQUEST    10008
+#define CELT_SET_CHANNELS(x) CELT_SET_CHANNELS_REQUEST, __opus_check_int(x)
+
+
+/* Internal */
+#define CELT_SET_START_BAND_REQUEST    10010
+#define CELT_SET_START_BAND(x) CELT_SET_START_BAND_REQUEST, __opus_check_int(x)
+
+#define CELT_SET_END_BAND_REQUEST    10012
+#define CELT_SET_END_BAND(x) CELT_SET_END_BAND_REQUEST, __opus_check_int(x)
+
+#define CELT_GET_MODE_REQUEST    10015
+/** Get the CELTMode used by an encoder or decoder */
+#define CELT_GET_MODE(x) CELT_GET_MODE_REQUEST, __celt_check_mode_ptr_ptr(x)
+
+#define CELT_SET_SIGNALLING_REQUEST    10016
+#define CELT_SET_SIGNALLING(x) CELT_SET_SIGNALLING_REQUEST, __opus_check_int(x)
+
+#define CELT_SET_TONALITY_REQUEST    10018
+#define CELT_SET_TONALITY(x) CELT_SET_TONALITY_REQUEST, __opus_check_int(x)
+#define CELT_SET_TONALITY_SLOPE_REQUEST    10020
+#define CELT_SET_TONALITY_SLOPE(x) CELT_SET_TONALITY_SLOPE_REQUEST, __opus_check_int(x)
+
+#define CELT_SET_ANALYSIS_REQUEST    10022
+#define CELT_SET_ANALYSIS(x) CELT_SET_ANALYSIS_REQUEST, __celt_check_analysis_ptr(x)
+
+#define OPUS_SET_LFE_REQUEST    10024
+#define OPUS_SET_LFE(x) OPUS_SET_LFE_REQUEST, __opus_check_int(x)
+
+#define OPUS_SET_ENERGY_MASK_REQUEST    10026
+#define OPUS_SET_ENERGY_MASK(x) OPUS_SET_ENERGY_MASK_REQUEST, __celt_check_glog_ptr(x)
+
+#define CELT_SET_SILK_INFO_REQUEST    10028
+#define CELT_SET_SILK_INFO(x) CELT_SET_SILK_INFO_REQUEST, __celt_check_silkinfo_ptr(x)
+
+/* Encoder stuff */
+
+int celt_encoder_get_size(int channels);
+
+int celt_encode_with_ec(OpusCustomEncoder * OPUS_RESTRICT st, const opus_res * pcm, int frame_size, unsigned char *compressed, int nbCompressedBytes, ec_enc *enc);
+
+int celt_encoder_init(CELTEncoder *st, opus_int32 sampling_rate, int channels,
+                      int arch);
+
+
+
+/* Decoder stuff */
+
+int celt_decoder_get_size(int channels);
+
+
+int celt_decoder_init(CELTDecoder *st, opus_int32 sampling_rate, int channels);
+
+int celt_decode_with_ec_dred(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data,
+      int len, opus_res * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec, int accum
+#ifdef ENABLE_DEEP_PLC
+      ,LPCNetPLCState *lpcnet
+#endif
+      );
+
+int celt_decode_with_ec(OpusCustomDecoder * OPUS_RESTRICT st, const unsigned char *data,
+      int len, opus_res * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec, int accum);
+
+#define celt_encoder_ctl opus_custom_encoder_ctl
+#define celt_decoder_ctl opus_custom_decoder_ctl
+
+
+#ifdef CUSTOM_MODES
+#define OPUS_CUSTOM_NOSTATIC
+#else
+#define OPUS_CUSTOM_NOSTATIC static OPUS_INLINE
+#endif
+
+static const unsigned char trim_icdf[11] = {126, 124, 119, 109, 87, 41, 19, 9, 4, 2, 0};
+/* Probs: NONE: 21.875%, LIGHT: 6.25%, NORMAL: 65.625%, AGGRESSIVE: 6.25% */
+static const unsigned char spread_icdf[4] = {25, 23, 2, 0};
+
+static const unsigned char tapset_icdf[3]={2,1,0};
+
+#ifdef CUSTOM_MODES
+static const unsigned char toOpusTable[20] = {
+      0xE0, 0xE8, 0xF0, 0xF8,
+      0xC0, 0xC8, 0xD0, 0xD8,
+      0xA0, 0xA8, 0xB0, 0xB8,
+      0x00, 0x00, 0x00, 0x00,
+      0x80, 0x88, 0x90, 0x98,
+};
+
+static const unsigned char fromOpusTable[16] = {
+      0x80, 0x88, 0x90, 0x98,
+      0x40, 0x48, 0x50, 0x58,
+      0x20, 0x28, 0x30, 0x38,
+      0x00, 0x08, 0x10, 0x18
+};
+
+static OPUS_INLINE int toOpus(unsigned char c)
+{
+   int ret=0;
+   if (c<0xA0)
+      ret = toOpusTable[c>>3];
+   if (ret == 0)
+      return -1;
+   else
+      return ret|(c&0x7);
+}
+
+static OPUS_INLINE int fromOpus(unsigned char c)
+{
+   if (c<0x80)
+      return -1;
+   else
+      return fromOpusTable[(c>>3)-16] | (c&0x7);
+}
+#endif /* CUSTOM_MODES */
+
+#define COMBFILTER_MAXPERIOD 1024
+#define COMBFILTER_MINPERIOD 15
+
+extern const signed char tf_select_table[4][8];
+
+#if defined(ENABLE_HARDENING) || defined(ENABLE_ASSERTIONS)
+void validate_celt_decoder(CELTDecoder *st);
+#define VALIDATE_CELT_DECODER(st) validate_celt_decoder(st)
+#else
+#define VALIDATE_CELT_DECODER(st)
+#endif
+
+int resampling_factor(opus_int32 rate);
+
+void celt_preemphasis(const opus_res * OPUS_RESTRICT pcmp, celt_sig * OPUS_RESTRICT inp,
+                        int N, int CC, int upsample, const opus_val16 *coef, celt_sig *mem, int clip);
+
+void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N,
+      opus_val16 g0, opus_val16 g1, int tapset0, int tapset1,
+      const celt_coef *window, int overlap, int arch);
+
+void init_caps(const CELTMode *m,int *cap,int LM,int C);
+
+#ifdef RESYNTH
+void deemphasis(celt_sig *in[], opus_res *pcm, int N, int C, int downsample, const opus_val16 *coef, celt_sig *mem, int accum);
+void celt_synthesis(const CELTMode *mode, celt_norm *X, celt_sig * out_syn[],
+      celt_glog *oldBandE, int start, int effEnd, int C, int CC, int isTransient,
+      int LM, int downsample, int silence, int arch);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* CELT_H */
--- a/celt/celt_decoder.c
+++ b/celt/celt_decoder.c
+/* Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2010 Xiph.Org Foundation
+   Copyright (c) 2008 Gregory Maxwell
+   Written by Jean-Marc Valin and Gregory Maxwell */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#define CELT_DECODER_C
+
+#include "cpu_support.h"
+#include "os_support.h"
+#include "mdct.h"
+#include <math.h>
+#include "celt.h"
+#include "pitch.h"
+#include "bands.h"
+#include "modes.h"
+#include "entcode.h"
+#include "quant_bands.h"
+#include "rate.h"
+#include "stack_alloc.h"
+#include "mathops.h"
+#include "float_cast.h"
+#include <stdarg.h>
+#include "celt_lpc.h"
+#include "vq.h"
+
+#ifdef ENABLE_DEEP_PLC
+#include "lpcnet.h"
+#include "lpcnet_private.h"
+#endif
+
+/* The maximum pitch lag to allow in the pitch-based PLC. It's possible to save
+   CPU time in the PLC pitch search by making this smaller than MAX_PERIOD. The
+   current value corresponds to a pitch of 66.67 Hz. */
+#define PLC_PITCH_LAG_MAX (720)
+/* The minimum pitch lag to allow in the pitch-based PLC. This corresponds to a
+   pitch of 480 Hz. */
+#define PLC_PITCH_LAG_MIN (100)
+
+/**********************************************************************/
+/*                                                                    */
+/*                             DECODER                                */
+/*                                                                    */
+/**********************************************************************/
+#define DECODE_BUFFER_SIZE 2048
+
+#define PLC_UPDATE_FRAMES 4
+#define PLC_UPDATE_SAMPLES (PLC_UPDATE_FRAMES*FRAME_SIZE)
+
+/** Decoder state
+ @brief Decoder state
+ */
+struct OpusCustomDecoder {
+   const OpusCustomMode *mode;
+   int overlap;
+   int channels;
+   int stream_channels;
+
+   int downsample;
+   int start, end;
+   int signalling;
+   int disable_inv;
+   int complexity;
+   int arch;
+
+   /* Everything beyond this point gets cleared on a reset */
+#define DECODER_RESET_START rng
+
+   opus_uint32 rng;
+   int error;
+   int last_pitch_index;
+   int loss_duration;
+   int skip_plc;
+   int postfilter_period;
+   int postfilter_period_old;
+   opus_val16 postfilter_gain;
+   opus_val16 postfilter_gain_old;
+   int postfilter_tapset;
+   int postfilter_tapset_old;
+   int prefilter_and_fold;
+
+   celt_sig preemph_memD[2];
+
+#ifdef ENABLE_DEEP_PLC
+   opus_int16 plc_pcm[PLC_UPDATE_SAMPLES];
+   int plc_fill;
+   float plc_preemphasis_mem;
+#endif
+
+   celt_sig _decode_mem[1]; /* Size = channels*(DECODE_BUFFER_SIZE+mode->overlap) */
+   /* opus_val16 lpc[],  Size = channels*CELT_LPC_ORDER */
+   /* celt_glog oldEBands[], Size = 2*mode->nbEBands */
+   /* celt_glog oldLogE[], Size = 2*mode->nbEBands */
+   /* celt_glog oldLogE2[], Size = 2*mode->nbEBands */
+   /* celt_glog backgroundLogE[], Size = 2*mode->nbEBands */
+};
+
+#if defined(ENABLE_HARDENING) || defined(ENABLE_ASSERTIONS)
+/* Make basic checks on the CELT state to ensure we don't end
+   up writing all over memory. */
+void validate_celt_decoder(CELTDecoder *st)
+{
+#ifndef CUSTOM_MODES
+   celt_assert(st->mode == opus_custom_mode_create(48000, 960, NULL));
+   celt_assert(st->overlap == 120);
+   celt_assert(st->end <= 21);
+#else
+/* From Section 4.3 in the spec: "The normal CELT layer uses 21 of those bands,
+   though Opus Custom (see Section 6.2) may use a different number of bands"
+
+   Check if it's within the maximum number of Bark frequency bands instead */
+   celt_assert(st->end <= 25);
+#endif
+   celt_assert(st->channels == 1 || st->channels == 2);
+   celt_assert(st->stream_channels == 1 || st->stream_channels == 2);
+   celt_assert(st->downsample > 0);
+   celt_assert(st->start == 0 || st->start == 17);
+   celt_assert(st->start < st->end);
+#ifdef OPUS_ARCHMASK
+   celt_assert(st->arch >= 0);
+   celt_assert(st->arch <= OPUS_ARCHMASK);
+#endif
+   celt_assert(st->last_pitch_index <= PLC_PITCH_LAG_MAX);
+   celt_assert(st->last_pitch_index >= PLC_PITCH_LAG_MIN || st->last_pitch_index == 0);
+   celt_assert(st->postfilter_period < MAX_PERIOD);
+   celt_assert(st->postfilter_period >= COMBFILTER_MINPERIOD || st->postfilter_period == 0);
+   celt_assert(st->postfilter_period_old < MAX_PERIOD);
+   celt_assert(st->postfilter_period_old >= COMBFILTER_MINPERIOD || st->postfilter_period_old == 0);
+   celt_assert(st->postfilter_tapset <= 2);
+   celt_assert(st->postfilter_tapset >= 0);
+   celt_assert(st->postfilter_tapset_old <= 2);
+   celt_assert(st->postfilter_tapset_old >= 0);
+}
+#endif
+
+int celt_decoder_get_size(int channels)
+{
+   const CELTMode *mode = opus_custom_mode_create(48000, 960, NULL);
+   return opus_custom_decoder_get_size(mode, channels);
+}
+
+OPUS_CUSTOM_NOSTATIC int opus_custom_decoder_get_size(const CELTMode *mode, int channels)
+{
+   int size = sizeof(struct CELTDecoder)
+            + (channels*(DECODE_BUFFER_SIZE+mode->overlap)-1)*sizeof(celt_sig)
+            + channels*CELT_LPC_ORDER*sizeof(opus_val16)
+            + 4*2*mode->nbEBands*sizeof(celt_glog);
+   return size;
+}
+
+#ifdef CUSTOM_MODES
+CELTDecoder *opus_custom_decoder_create(const CELTMode *mode, int channels, int *error)
+{
+   int ret;
+   CELTDecoder *st = (CELTDecoder *)opus_alloc(opus_custom_decoder_get_size(mode, channels));
+   ret = opus_custom_decoder_init(st, mode, channels);
+   if (ret != OPUS_OK)
+   {
+      opus_custom_decoder_destroy(st);
+      st = NULL;
+   }
+   if (error)
+      *error = ret;
+   return st;
+}
+#endif /* CUSTOM_MODES */
+
+int celt_decoder_init(CELTDecoder *st, opus_int32 sampling_rate, int channels)
+{
+   int ret;
+   ret = opus_custom_decoder_init(st, opus_custom_mode_create(48000, 960, NULL), channels);
+   if (ret != OPUS_OK)
+      return ret;
+   st->downsample = resampling_factor(sampling_rate);
+   if (st->downsample==0)
+      return OPUS_BAD_ARG;
+   else
+      return OPUS_OK;
+}
+
+OPUS_CUSTOM_NOSTATIC int opus_custom_decoder_init(CELTDecoder *st, const CELTMode *mode, int channels)
+{
+   if (channels < 0 || channels > 2)
+      return OPUS_BAD_ARG;
+
+   if (st==NULL)
+      return OPUS_ALLOC_FAIL;
+
+   OPUS_CLEAR((char*)st, opus_custom_decoder_get_size(mode, channels));
+
+   st->mode = mode;
+   st->overlap = mode->overlap;
+   st->stream_channels = st->channels = channels;
+
+   st->downsample = 1;
+   st->start = 0;
+   st->end = st->mode->effEBands;
+   st->signalling = 1;
+#ifndef DISABLE_UPDATE_DRAFT
+   st->disable_inv = channels == 1;
+#else
+   st->disable_inv = 0;
+#endif
+   st->arch = opus_select_arch();
+
+   opus_custom_decoder_ctl(st, OPUS_RESET_STATE);
+
+   return OPUS_OK;
+}
+
+#ifdef CUSTOM_MODES
+void opus_custom_decoder_destroy(CELTDecoder *st)
+{
+   opus_free(st);
+}
+#endif /* CUSTOM_MODES */
+
+#ifndef CUSTOM_MODES
+/* Special case for stereo with no downsampling and no accumulation. This is
+   quite common and we can make it faster by processing both channels in the
+   same loop, reducing overhead due to the dependency loop in the IIR filter. */
+static void deemphasis_stereo_simple(celt_sig *in[], opus_res *pcm, int N, const opus_val16 coef0,
+      celt_sig *mem)
+{
+   celt_sig * OPUS_RESTRICT x0;
+   celt_sig * OPUS_RESTRICT x1;
+   celt_sig m0, m1;
+   int j;
+   x0=in[0];
+   x1=in[1];
+   m0 = mem[0];
+   m1 = mem[1];
+   for (j=0;j<N;j++)
+   {
+      celt_sig tmp0, tmp1;
+      /* Add VERY_SMALL to x[] first to reduce dependency chain. */
+      tmp0 = SATURATE(x0[j] + VERY_SMALL + m0, SIG_SAT);
+      tmp1 = SATURATE(x1[j] + VERY_SMALL + m1, SIG_SAT);
+      m0 = MULT16_32_Q15(coef0, tmp0);
+      m1 = MULT16_32_Q15(coef0, tmp1);
+      pcm[2*j  ] = SIG2RES(tmp0);
+      pcm[2*j+1] = SIG2RES(tmp1);
+   }
+   mem[0] = m0;
+   mem[1] = m1;
+}
+#endif
+
+#ifndef RESYNTH
+static
+#endif
+void deemphasis(celt_sig *in[], opus_res *pcm, int N, int C, int downsample, const opus_val16 *coef,
+      celt_sig *mem, int accum)
+{
+   int c;
+   int Nd;
+   int apply_downsampling=0;
+   opus_val16 coef0;
+   VARDECL(celt_sig, scratch);
+   SAVE_STACK;
+#ifndef CUSTOM_MODES
+   /* Short version for common case. */
+   if (downsample == 1 && C == 2 && !accum)
+   {
+      deemphasis_stereo_simple(in, pcm, N, coef[0], mem);
+      return;
+   }
+#endif
+   ALLOC(scratch, N, celt_sig);
+   coef0 = coef[0];
+   Nd = N/downsample;
+   c=0; do {
+      int j;
+      celt_sig * OPUS_RESTRICT x;
+      opus_res  * OPUS_RESTRICT y;
+      celt_sig m = mem[c];
+      x =in[c];
+      y = pcm+c;
+#ifdef CUSTOM_MODES
+      if (coef[1] != 0)
+      {
+         opus_val16 coef1 = coef[1];
+         opus_val16 coef3 = coef[3];
+         for (j=0;j<N;j++)
+         {
+            celt_sig tmp = SATURATE(x[j] + m + VERY_SMALL, SIG_SAT);
+            m = MULT16_32_Q15(coef0, tmp)
+                          - MULT16_32_Q15(coef1, x[j]);
+            tmp = SHL32(MULT16_32_Q15(coef3, tmp), 2);
+            scratch[j] = tmp;
+         }
+         apply_downsampling=1;
+      } else
+#endif
+      if (downsample>1)
+      {
+         /* Shortcut for the standard (non-custom modes) case */
+         for (j=0;j<N;j++)
+         {
+            celt_sig tmp = SATURATE(x[j] + VERY_SMALL + m, SIG_SAT);
+            m = MULT16_32_Q15(coef0, tmp);
+            scratch[j] = tmp;
+         }
+         apply_downsampling=1;
+      } else {
+         /* Shortcut for the standard (non-custom modes) case */
+         if (accum)
+         {
+            for (j=0;j<N;j++)
+            {
+               celt_sig tmp = SATURATE(x[j] + m + VERY_SMALL, SIG_SAT);
+               m = MULT16_32_Q15(coef0, tmp);
+               y[j*C] = ADD_RES(y[j*C], SIG2RES(tmp));
+            }
+         } else
+         {
+            for (j=0;j<N;j++)
+            {
+               celt_sig tmp = SATURATE(x[j] + VERY_SMALL + m, SIG_SAT);
+               m = MULT16_32_Q15(coef0, tmp);
+               y[j*C] = SIG2RES(tmp);
+            }
+         }
+      }
+      mem[c] = m;
+
+      if (apply_downsampling)
+      {
+         /* Perform down-sampling */
+         if (accum)
+         {
+            for (j=0;j<Nd;j++)
+               y[j*C] = ADD_RES(y[j*C], SIG2RES(scratch[j*downsample]));
+         } else
+         {
+            for (j=0;j<Nd;j++)
+               y[j*C] = SIG2RES(scratch[j*downsample]);
+         }
+      }
+   } while (++c<C);
+   RESTORE_STACK;
+}
+
+#ifndef RESYNTH
+static
+#endif
+void celt_synthesis(const CELTMode *mode, celt_norm *X, celt_sig * out_syn[],
+                    celt_glog *oldBandE, int start, int effEnd, int C, int CC,
+                    int isTransient, int LM, int downsample,
+                    int silence, int arch)
+{
+   int c, i;
+   int M;
+   int b;
+   int B;
+   int N, NB;
+   int shift;
+   int nbEBands;
+   int overlap;
+   VARDECL(celt_sig, freq);
+   SAVE_STACK;
+
+   overlap = mode->overlap;
+   nbEBands = mode->nbEBands;
+   N = mode->shortMdctSize<<LM;
+   ALLOC(freq, N, celt_sig); /**< Interleaved signal MDCTs */
+   M = 1<<LM;
+
+   if (isTransient)
+   {
+      B = M;
+      NB = mode->shortMdctSize;
+      shift = mode->maxLM;
+   } else {
+      B = 1;
+      NB = mode->shortMdctSize<<LM;
+      shift = mode->maxLM-LM;
+   }
+
+   if (CC==2&&C==1)
+   {
+      /* Copying a mono streams to two channels */
+      celt_sig *freq2;
+      denormalise_bands(mode, X, freq, oldBandE, start, effEnd, M,
+            downsample, silence);
+      /* Store a temporary copy in the output buffer because the IMDCT destroys its input. */
+      freq2 = out_syn[1]+overlap/2;
+      OPUS_COPY(freq2, freq, N);
+      for (b=0;b<B;b++)
+         clt_mdct_backward(&mode->mdct, &freq2[b], out_syn[0]+NB*b, mode->window, overlap, shift, B, arch);
+      for (b=0;b<B;b++)
+         clt_mdct_backward(&mode->mdct, &freq[b], out_syn[1]+NB*b, mode->window, overlap, shift, B, arch);
+   } else if (CC==1&&C==2)
+   {
+      /* Downmixing a stereo stream to mono */
+      celt_sig *freq2;
+      freq2 = out_syn[0]+overlap/2;
+      denormalise_bands(mode, X, freq, oldBandE, start, effEnd, M,
+            downsample, silence);
+      /* Use the output buffer as temp array before downmixing. */
+      denormalise_bands(mode, X+N, freq2, oldBandE+nbEBands, start, effEnd, M,
+            downsample, silence);
+      for (i=0;i<N;i++)
+         freq[i] = ADD32(HALF32(freq[i]), HALF32(freq2[i]));
+      for (b=0;b<B;b++)
+         clt_mdct_backward(&mode->mdct, &freq[b], out_syn[0]+NB*b, mode->window, overlap, shift, B, arch);
+   } else {
+      /* Normal case (mono or stereo) */
+      c=0; do {
+         denormalise_bands(mode, X+c*N, freq, oldBandE+c*nbEBands, start, effEnd, M,
+               downsample, silence);
+         for (b=0;b<B;b++)
+            clt_mdct_backward(&mode->mdct, &freq[b], out_syn[c]+NB*b, mode->window, overlap, shift, B, arch);
+      } while (++c<CC);
+   }
+   /* Saturate IMDCT output so that we can't overflow in the pitch postfilter
+      or in the */
+   c=0; do {
+      for (i=0;i<N;i++)
+         out_syn[c][i] = SATURATE(out_syn[c][i], SIG_SAT);
+   } while (++c<CC);
+   RESTORE_STACK;
+}
+
+static void tf_decode(int start, int end, int isTransient, int *tf_res, int LM, ec_dec *dec)
+{
+   int i, curr, tf_select;
+   int tf_select_rsv;
+   int tf_changed;
+   int logp;
+   opus_uint32 budget;
+   opus_uint32 tell;
+
+   budget = dec->storage*8;
+   tell = ec_tell(dec);
+   logp = isTransient ? 2 : 4;
+   tf_select_rsv = LM>0 && tell+logp+1<=budget;
+   budget -= tf_select_rsv;
+   tf_changed = curr = 0;
+   for (i=start;i<end;i++)
+   {
+      if (tell+logp<=budget)
+      {
+         curr ^= ec_dec_bit_logp(dec, logp);
+         tell = ec_tell(dec);
+         tf_changed |= curr;
+      }
+      tf_res[i] = curr;
+      logp = isTransient ? 4 : 5;
+   }
+   tf_select = 0;
+   if (tf_select_rsv &&
+     tf_select_table[LM][4*isTransient+0+tf_changed] !=
+     tf_select_table[LM][4*isTransient+2+tf_changed])
+   {
+      tf_select = ec_dec_bit_logp(dec, 1);
+   }
+   for (i=start;i<end;i++)
+   {
+      tf_res[i] = tf_select_table[LM][4*isTransient+2*tf_select+tf_res[i]];
+   }
+}
+
+static int celt_plc_pitch_search(celt_sig *decode_mem[2], int C, int arch)
+{
+   int pitch_index;
+   VARDECL( opus_val16, lp_pitch_buf );
+   SAVE_STACK;
+   ALLOC( lp_pitch_buf, DECODE_BUFFER_SIZE>>1, opus_val16 );
+   pitch_downsample(decode_mem, lp_pitch_buf,
+         DECODE_BUFFER_SIZE, C, arch);
+   pitch_search(lp_pitch_buf+(PLC_PITCH_LAG_MAX>>1), lp_pitch_buf,
+         DECODE_BUFFER_SIZE-PLC_PITCH_LAG_MAX,
+         PLC_PITCH_LAG_MAX-PLC_PITCH_LAG_MIN, &pitch_index, arch);
+   pitch_index = PLC_PITCH_LAG_MAX-pitch_index;
+   RESTORE_STACK;
+   return pitch_index;
+}
+
+static void prefilter_and_fold(CELTDecoder * OPUS_RESTRICT st, int N)
+{
+   int c;
+   int CC;
+   int i;
+   int overlap;
+   celt_sig *decode_mem[2];
+   const OpusCustomMode *mode;
+   VARDECL(opus_val32, etmp);
+   mode = st->mode;
+   overlap = st->overlap;
+   CC = st->channels;
+   ALLOC(etmp, overlap, opus_val32);
+   c=0; do {
+      decode_mem[c] = st->_decode_mem + c*(DECODE_BUFFER_SIZE+overlap);
+   } while (++c<CC);
+
+   c=0; do {
+      /* Apply the pre-filter to the MDCT overlap for the next frame because
+         the post-filter will be re-applied in the decoder after the MDCT
+         overlap. */
+      comb_filter(etmp, decode_mem[c]+DECODE_BUFFER_SIZE-N,
+         st->postfilter_period_old, st->postfilter_period, overlap,
+         -st->postfilter_gain_old, -st->postfilter_gain,
+         st->postfilter_tapset_old, st->postfilter_tapset, NULL, 0, st->arch);
+
+      /* Simulate TDAC on the concealed audio so that it blends with the
+         MDCT of the next frame. */
+      for (i=0;i<overlap/2;i++)
+      {
+         decode_mem[c][DECODE_BUFFER_SIZE-N+i] =
+            MULT16_32_Q15(COEF2VAL16(mode->window[i]), etmp[overlap-1-i])
+            + MULT16_32_Q15 (COEF2VAL16(mode->window[overlap-i-1]), etmp[i]);
+      }
+   } while (++c<CC);
+}
+
+#ifdef ENABLE_DEEP_PLC
+
+#define SINC_ORDER 48
+/* h=cos(pi/2*abs(sin([-24:24]/48*pi*23./24)).^2);
+   b=sinc([-24:24]/3*1.02).*h;
+   b=b/sum(b); */
+static const float sinc_filter[SINC_ORDER+1] = {
+    4.2931e-05f, -0.000190293f, -0.000816132f, -0.000637162f, 0.00141662f, 0.00354764f, 0.00184368f, -0.00428274f,
+    -0.00856105f, -0.0034003f, 0.00930201f, 0.0159616f, 0.00489785f, -0.0169649f, -0.0259484f, -0.00596856f,
+    0.0286551f, 0.0405872f, 0.00649994f, -0.0509284f, -0.0716655f, -0.00665212f,  0.134336f,  0.278927f,
+    0.339995f,  0.278927f,  0.134336f, -0.00665212f, -0.0716655f, -0.0509284f, 0.00649994f, 0.0405872f,
+    0.0286551f, -0.00596856f, -0.0259484f, -0.0169649f, 0.00489785f, 0.0159616f, 0.00930201f, -0.0034003f,
+    -0.00856105f, -0.00428274f, 0.00184368f, 0.00354764f, 0.00141662f, -0.000637162f, -0.000816132f, -0.000190293f,
+    4.2931e-05f
+};
+
+void update_plc_state(LPCNetPLCState *lpcnet, celt_sig *decode_mem[2], float *plc_preemphasis_mem, int CC)
+{
+   int i;
+   int tmp_read_post, tmp_fec_skip;
+   int offset;
+   celt_sig buf48k[DECODE_BUFFER_SIZE];
+   opus_int16 buf16k[PLC_UPDATE_SAMPLES];
+   if (CC == 1) OPUS_COPY(buf48k, decode_mem[0], DECODE_BUFFER_SIZE);
+   else {
+      for (i=0;i<DECODE_BUFFER_SIZE;i++) {
+         buf48k[i] = .5*(decode_mem[0][i] + decode_mem[1][i]);
+      }
+   }
+   /* Down-sample the last 40 ms. */
+   for (i=1;i<DECODE_BUFFER_SIZE;i++) buf48k[i] += PREEMPHASIS*buf48k[i-1];
+   *plc_preemphasis_mem = buf48k[DECODE_BUFFER_SIZE-1];
+   offset = DECODE_BUFFER_SIZE-SINC_ORDER-1 - 3*(PLC_UPDATE_SAMPLES-1);
+   celt_assert(3*(PLC_UPDATE_SAMPLES-1) + SINC_ORDER + offset == DECODE_BUFFER_SIZE-1);
+   for (i=0;i<PLC_UPDATE_SAMPLES;i++) {
+      int j;
+      float sum = 0;
+      for (j=0;j<SINC_ORDER+1;j++) {
+         sum += buf48k[3*i + j + offset]*sinc_filter[j];
+      }
+      buf16k[i] = float2int(MIN32(32767.f, MAX32(-32767.f, sum)));
+   }
+   tmp_read_post = lpcnet->fec_read_pos;
+   tmp_fec_skip = lpcnet->fec_skip;
+   for (i=0;i<PLC_UPDATE_FRAMES;i++) {
+      lpcnet_plc_update(lpcnet, &buf16k[FRAME_SIZE*i]);
+   }
+   lpcnet->fec_read_pos = tmp_read_post;
+   lpcnet->fec_skip = tmp_fec_skip;
+}
+#endif
+
+static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM
+#ifdef ENABLE_DEEP_PLC
+      ,LPCNetPLCState *lpcnet
+#endif
+      )
+{
+   int c;
+   int i;
+   const int C = st->channels;
+   celt_sig *decode_mem[2];
+   celt_sig *out_syn[2];
+   opus_val16 *lpc;
+   celt_glog *oldBandE, *oldLogE, *oldLogE2, *backgroundLogE;
+   const OpusCustomMode *mode;
+   int nbEBands;
+   int overlap;
+   int start;
+   int loss_duration;
+   int noise_based;
+   const opus_int16 *eBands;
+   SAVE_STACK;
+
+   mode = st->mode;
+   nbEBands = mode->nbEBands;
+   overlap = mode->overlap;
+   eBands = mode->eBands;
+
+   c=0; do {
+      decode_mem[c] = st->_decode_mem + c*(DECODE_BUFFER_SIZE+overlap);
+      out_syn[c] = decode_mem[c]+DECODE_BUFFER_SIZE-N;
+   } while (++c<C);
+   lpc = (opus_val16*)(st->_decode_mem+(DECODE_BUFFER_SIZE+overlap)*C);
+   oldBandE = (celt_glog*)(lpc+C*CELT_LPC_ORDER);
+   oldLogE = oldBandE + 2*nbEBands;
+   oldLogE2 = oldLogE + 2*nbEBands;
+   backgroundLogE = oldLogE2  + 2*nbEBands;
+
+   loss_duration = st->loss_duration;
+   start = st->start;
+#ifdef ENABLE_DEEP_PLC
+   if (lpcnet != NULL) noise_based = start != 0 || (lpcnet->fec_fill_pos == 0 && (st->skip_plc || loss_duration >= 80));
+   else
+#endif
+   noise_based = loss_duration >= 40 || start != 0 || st->skip_plc;
+   if (noise_based)
+   {
+      /* Noise-based PLC/CNG */
+      VARDECL(celt_norm, X);
+      opus_uint32 seed;
+      int end;
+      int effEnd;
+      celt_glog decay;
+      end = st->end;
+      effEnd = IMAX(start, IMIN(end, mode->effEBands));
+
+      ALLOC(X, C*N, celt_norm);   /**< Interleaved normalised MDCTs */
+      c=0; do {
+         OPUS_MOVE(decode_mem[c], decode_mem[c]+N,
+               DECODE_BUFFER_SIZE-N+overlap);
+      } while (++c<C);
+
+      if (st->prefilter_and_fold) {
+         prefilter_and_fold(st, N);
+      }
+
+      /* Energy decay */
+      decay = loss_duration==0 ? GCONST(1.5f) : GCONST(.5f);
+      c=0; do
+      {
+         for (i=start;i<end;i++)
+            oldBandE[c*nbEBands+i] = MAXG(backgroundLogE[c*nbEBands+i], oldBandE[c*nbEBands+i] - decay);
+      } while (++c<C);
+      seed = st->rng;
+      for (c=0;c<C;c++)
+      {
+         for (i=start;i<effEnd;i++)
+         {
+            int j;
+            int boffs;
+            int blen;
+            boffs = N*c+(eBands[i]<<LM);
+            blen = (eBands[i+1]-eBands[i])<<LM;
+            for (j=0;j<blen;j++)
+            {
+               seed = celt_lcg_rand(seed);
+               X[boffs+j] = (celt_norm)((opus_int32)seed>>20);
+            }
+            renormalise_vector(X+boffs, blen, Q31ONE, st->arch);
+         }
+      }
+      st->rng = seed;
+
+      celt_synthesis(mode, X, out_syn, oldBandE, start, effEnd, C, C, 0, LM, st->downsample, 0, st->arch);
+      st->prefilter_and_fold = 0;
+      /* Skip regular PLC until we get two consecutive packets. */
+      st->skip_plc = 1;
+   } else {
+      int exc_length;
+      /* Pitch-based PLC */
+      const celt_coef *window;
+      opus_val16 *exc;
+      opus_val16 fade = Q15ONE;
+      int pitch_index;
+      VARDECL(opus_val16, _exc);
+      VARDECL(opus_val16, fir_tmp);
+
+      if (loss_duration == 0)
+      {
+#ifdef ENABLE_DEEP_PLC
+        if (lpcnet != NULL && lpcnet->loaded) update_plc_state(lpcnet, decode_mem, &st->plc_preemphasis_mem, C);
+#endif
+         st->last_pitch_index = pitch_index = celt_plc_pitch_search(decode_mem, C, st->arch);
+      } else {
+         pitch_index = st->last_pitch_index;
+         fade = QCONST16(.8f,15);
+      }
+
+      /* We want the excitation for 2 pitch periods in order to look for a
+         decaying signal, but we can't get more than MAX_PERIOD. */
+      exc_length = IMIN(2*pitch_index, MAX_PERIOD);
+
+      ALLOC(_exc, MAX_PERIOD+CELT_LPC_ORDER, opus_val16);
+      ALLOC(fir_tmp, exc_length, opus_val16);
+      exc = _exc+CELT_LPC_ORDER;
+      window = mode->window;
+      c=0; do {
+         opus_val16 decay;
+         opus_val16 attenuation;
+         opus_val32 S1=0;
+         celt_sig *buf;
+         int extrapolation_offset;
+         int extrapolation_len;
+         int j;
+
+         buf = decode_mem[c];
+         for (i=0;i<MAX_PERIOD+CELT_LPC_ORDER;i++)
+            exc[i-CELT_LPC_ORDER] = SROUND16(buf[DECODE_BUFFER_SIZE-MAX_PERIOD-CELT_LPC_ORDER+i], SIG_SHIFT);
+
+         if (loss_duration == 0)
+         {
+            opus_val32 ac[CELT_LPC_ORDER+1];
+            /* Compute LPC coefficients for the last MAX_PERIOD samples before
+               the first loss so we can work in the excitation-filter domain. */
+            _celt_autocorr(exc, ac, window, overlap,
+                   CELT_LPC_ORDER, MAX_PERIOD, st->arch);
+            /* Add a noise floor of -40 dB. */
+#ifdef FIXED_POINT
+            ac[0] += SHR32(ac[0],13);
+#else
+            ac[0] *= 1.0001f;
+#endif
+            /* Use lag windowing to stabilize the Levinson-Durbin recursion. */
+            for (i=1;i<=CELT_LPC_ORDER;i++)
+            {
+               /*ac[i] *= exp(-.5*(2*M_PI*.002*i)*(2*M_PI*.002*i));*/
+#ifdef FIXED_POINT
+               ac[i] -= MULT16_32_Q15(2*i*i, ac[i]);
+#else
+               ac[i] -= ac[i]*(0.008f*0.008f)*i*i;
+#endif
+            }
+            _celt_lpc(lpc+c*CELT_LPC_ORDER, ac, CELT_LPC_ORDER);
+#ifdef FIXED_POINT
+         /* For fixed-point, apply bandwidth expansion until we can guarantee that
+            no overflow can happen in the IIR filter. This means:
+            32768*sum(abs(filter)) < 2^31 */
+         while (1) {
+            opus_val16 tmp=Q15ONE;
+            opus_val32 sum=QCONST16(1., SIG_SHIFT);
+            for (i=0;i<CELT_LPC_ORDER;i++)
+               sum += ABS16(lpc[c*CELT_LPC_ORDER+i]);
+            if (sum < 65535) break;
+            for (i=0;i<CELT_LPC_ORDER;i++)
+            {
+               tmp = MULT16_16_Q15(QCONST16(.99f,15), tmp);
+               lpc[c*CELT_LPC_ORDER+i] = MULT16_16_Q15(lpc[c*CELT_LPC_ORDER+i], tmp);
+            }
+         }
+#endif
+         }
+         /* Initialize the LPC history with the samples just before the start
+            of the region for which we're computing the excitation. */
+         {
+            /* Compute the excitation for exc_length samples before the loss. We need the copy
+               because celt_fir() cannot filter in-place. */
+            celt_fir(exc+MAX_PERIOD-exc_length, lpc+c*CELT_LPC_ORDER,
+                  fir_tmp, exc_length, CELT_LPC_ORDER, st->arch);
+            OPUS_COPY(exc+MAX_PERIOD-exc_length, fir_tmp, exc_length);
+         }
+
+         /* Check if the waveform is decaying, and if so how fast.
+            We do this to avoid adding energy when concealing in a segment
+            with decaying energy. */
+         {
+            opus_val32 E1=1, E2=1;
+            int decay_length;
+#ifdef FIXED_POINT
+            int shift = IMAX(0,2*celt_zlog2(celt_maxabs16(&exc[MAX_PERIOD-exc_length], exc_length))-20);
+#endif
+            decay_length = exc_length>>1;
+            for (i=0;i<decay_length;i++)
+            {
+               opus_val16 e;
+               e = exc[MAX_PERIOD-decay_length+i];
+               E1 += SHR32(MULT16_16(e, e), shift);
+               e = exc[MAX_PERIOD-2*decay_length+i];
+               E2 += SHR32(MULT16_16(e, e), shift);
+            }
+            E1 = MIN32(E1, E2);
+            decay = celt_sqrt(frac_div32(SHR32(E1, 1), E2));
+         }
+
+         /* Move the decoder memory one frame to the left to give us room to
+            add the data for the new frame. We ignore the overlap that extends
+            past the end of the buffer, because we aren't going to use it. */
+         OPUS_MOVE(buf, buf+N, DECODE_BUFFER_SIZE-N);
+
+         /* Extrapolate from the end of the excitation with a period of
+            "pitch_index", scaling down each period by an additional factor of
+            "decay". */
+         extrapolation_offset = MAX_PERIOD-pitch_index;
+         /* We need to extrapolate enough samples to cover a complete MDCT
+            window (including overlap/2 samples on both sides). */
+         extrapolation_len = N+overlap;
+         /* We also apply fading if this is not the first loss. */
+         attenuation = MULT16_16_Q15(fade, decay);
+         for (i=j=0;i<extrapolation_len;i++,j++)
+         {
+            opus_val16 tmp;
+            if (j >= pitch_index) {
+               j -= pitch_index;
+               attenuation = MULT16_16_Q15(attenuation, decay);
+            }
+            buf[DECODE_BUFFER_SIZE-N+i] =
+                  SHL32(EXTEND32(MULT16_16_Q15(attenuation,
+                        exc[extrapolation_offset+j])), SIG_SHIFT);
+            /* Compute the energy of the previously decoded signal whose
+               excitation we're copying. */
+            tmp = SROUND16(
+                  buf[DECODE_BUFFER_SIZE-MAX_PERIOD-N+extrapolation_offset+j],
+                  SIG_SHIFT);
+            S1 += SHR32(MULT16_16(tmp, tmp), 10);
+         }
+         {
+            opus_val16 lpc_mem[CELT_LPC_ORDER];
+            /* Copy the last decoded samples (prior to the overlap region) to
+               synthesis filter memory so we can have a continuous signal. */
+            for (i=0;i<CELT_LPC_ORDER;i++)
+               lpc_mem[i] = SROUND16(buf[DECODE_BUFFER_SIZE-N-1-i], SIG_SHIFT);
+            /* Apply the synthesis filter to convert the excitation back into
+               the signal domain. */
+            celt_iir(buf+DECODE_BUFFER_SIZE-N, lpc+c*CELT_LPC_ORDER,
+                  buf+DECODE_BUFFER_SIZE-N, extrapolation_len, CELT_LPC_ORDER,
+                  lpc_mem, st->arch);
+#ifdef FIXED_POINT
+            for (i=0; i < extrapolation_len; i++)
+               buf[DECODE_BUFFER_SIZE-N+i] = SATURATE(buf[DECODE_BUFFER_SIZE-N+i], SIG_SAT);
+#endif
+         }
+
+         /* Check if the synthesis energy is higher than expected, which can
+            happen with the signal changes during our window. If so,
+            attenuate. */
+         {
+            opus_val32 S2=0;
+            for (i=0;i<extrapolation_len;i++)
+            {
+               opus_val16 tmp = SROUND16(buf[DECODE_BUFFER_SIZE-N+i], SIG_SHIFT);
+               S2 += SHR32(MULT16_16(tmp, tmp), 10);
+            }
+            /* This checks for an "explosion" in the synthesis. */
+#ifdef FIXED_POINT
+            if (!(S1 > SHR32(S2,2)))
+#else
+            /* The float test is written this way to catch NaNs in the output
+               of the IIR filter at the same time. */
+            if (!(S1 > 0.2f*S2))
+#endif
+            {
+               for (i=0;i<extrapolation_len;i++)
+                  buf[DECODE_BUFFER_SIZE-N+i] = 0;
+            } else if (S1 < S2)
+            {
+               opus_val16 ratio = celt_sqrt(frac_div32(SHR32(S1,1)+1,S2+1));
+               for (i=0;i<overlap;i++)
+               {
+                  opus_val16 tmp_g = Q15ONE
+                        - MULT16_16_Q15(COEF2VAL16(window[i]), Q15ONE-ratio);
+                  buf[DECODE_BUFFER_SIZE-N+i] =
+                        MULT16_32_Q15(tmp_g, buf[DECODE_BUFFER_SIZE-N+i]);
+               }
+               for (i=overlap;i<extrapolation_len;i++)
+               {
+                  buf[DECODE_BUFFER_SIZE-N+i] =
+                        MULT16_32_Q15(ratio, buf[DECODE_BUFFER_SIZE-N+i]);
+               }
+            }
+         }
+
+      } while (++c<C);
+
+#ifdef ENABLE_DEEP_PLC
+      if (lpcnet != NULL && lpcnet->loaded && (st->complexity >= 5 || lpcnet->fec_fill_pos > 0)) {
+         float overlap_mem;
+         int samples_needed16k;
+         celt_sig *buf;
+         VARDECL(float, buf_copy);
+         buf = decode_mem[0];
+         ALLOC(buf_copy, C*overlap, float);
+         c=0; do {
+            OPUS_COPY(buf_copy+c*overlap, &decode_mem[c][DECODE_BUFFER_SIZE-N], overlap);
+         } while (++c<C);
+
+         /* Need enough samples from the PLC to cover the frame size, resampling delay,
+            and the overlap at the end. */
+         samples_needed16k = (N+SINC_ORDER+overlap)/3;
+         if (loss_duration == 0) {
+            st->plc_fill = 0;
+         }
+         while (st->plc_fill < samples_needed16k) {
+            lpcnet_plc_conceal(lpcnet, &st->plc_pcm[st->plc_fill]);
+            st->plc_fill += FRAME_SIZE;
+         }
+         /* Resample to 48 kHz. */
+         for (i=0;i<(N+overlap)/3;i++) {
+            int j;
+            float sum;
+            for (sum=0, j=0;j<17;j++) sum += 3*st->plc_pcm[i+j]*sinc_filter[3*j];
+            buf[DECODE_BUFFER_SIZE-N+3*i] = sum;
+            for (sum=0, j=0;j<16;j++) sum += 3*st->plc_pcm[i+j+1]*sinc_filter[3*j+2];
+            buf[DECODE_BUFFER_SIZE-N+3*i+1] = sum;
+            for (sum=0, j=0;j<16;j++) sum += 3*st->plc_pcm[i+j+1]*sinc_filter[3*j+1];
+            buf[DECODE_BUFFER_SIZE-N+3*i+2] = sum;
+         }
+         OPUS_MOVE(st->plc_pcm, &st->plc_pcm[N/3], st->plc_fill-N/3);
+         st->plc_fill -= N/3;
+         for (i=0;i<N;i++) {
+            float tmp = buf[DECODE_BUFFER_SIZE-N+i];
+            buf[DECODE_BUFFER_SIZE-N+i] -= PREEMPHASIS*st->plc_preemphasis_mem;
+            st->plc_preemphasis_mem = tmp;
+         }
+         overlap_mem = st->plc_preemphasis_mem;
+         for (i=0;i<overlap;i++) {
+            float tmp = buf[DECODE_BUFFER_SIZE+i];
+            buf[DECODE_BUFFER_SIZE+i] -= PREEMPHASIS*overlap_mem;
+            overlap_mem = tmp;
+         }
+         /* For now, we just do mono PLC. */
+         if (C==2) OPUS_COPY(decode_mem[1], decode_mem[0], DECODE_BUFFER_SIZE+overlap);
+         c=0; do {
+            /* Cross-fade with 48-kHz non-neural PLC for the first 2.5 ms to avoid a discontinuity. */
+            if (loss_duration == 0) {
+               for (i=0;i<overlap;i++) decode_mem[c][DECODE_BUFFER_SIZE-N+i] = (1-window[i])*buf_copy[c*overlap+i] + (window[i])*decode_mem[c][DECODE_BUFFER_SIZE-N+i];
+            }
+         } while (++c<C);
+      }
+#endif
+      st->prefilter_and_fold = 1;
+   }
+
+   /* Saturate to soemthing large to avoid wrap-around. */
+   st->loss_duration = IMIN(10000, loss_duration+(1<<LM));
+
+   RESTORE_STACK;
+}
+
+int celt_decode_with_ec_dred(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data,
+      int len, opus_res * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec, int accum
+#ifdef ENABLE_DEEP_PLC
+      ,LPCNetPLCState *lpcnet
+#endif
+      )
+{
+   int c, i, N;
+   int spread_decision;
+   opus_int32 bits;
+   ec_dec _dec;
+   VARDECL(celt_norm, X);
+   VARDECL(int, fine_quant);
+   VARDECL(int, pulses);
+   VARDECL(int, cap);
+   VARDECL(int, offsets);
+   VARDECL(int, fine_priority);
+   VARDECL(int, tf_res);
+   VARDECL(unsigned char, collapse_masks);
+   celt_sig *decode_mem[2];
+   celt_sig *out_syn[2];
+   opus_val16 *lpc;
+   celt_glog *oldBandE, *oldLogE, *oldLogE2, *backgroundLogE;
+
+   int shortBlocks;
+   int isTransient;
+   int intra_ener;
+   const int CC = st->channels;
+   int LM, M;
+   int start;
+   int end;
+   int effEnd;
+   int codedBands;
+   int alloc_trim;
+   int postfilter_pitch;
+   opus_val16 postfilter_gain;
+   int intensity=0;
+   int dual_stereo=0;
+   opus_int32 total_bits;
+   opus_int32 balance;
+   opus_int32 tell;
+   int dynalloc_logp;
+   int postfilter_tapset;
+   int anti_collapse_rsv;
+   int anti_collapse_on=0;
+   int silence;
+   int C = st->stream_channels;
+   const OpusCustomMode *mode;
+   int nbEBands;
+   int overlap;
+   const opus_int16 *eBands;
+   celt_glog max_background_increase;
+   ALLOC_STACK;
+
+   VALIDATE_CELT_DECODER(st);
+   mode = st->mode;
+   nbEBands = mode->nbEBands;
+   overlap = mode->overlap;
+   eBands = mode->eBands;
+   start = st->start;
+   end = st->end;
+   frame_size *= st->downsample;
+
+   lpc = (opus_val16*)(st->_decode_mem+(DECODE_BUFFER_SIZE+overlap)*CC);
+   oldBandE = (celt_glog*)(lpc+CC*CELT_LPC_ORDER);
+   oldLogE = oldBandE + 2*nbEBands;
+   oldLogE2 = oldLogE + 2*nbEBands;
+   backgroundLogE = oldLogE2  + 2*nbEBands;
+
+#ifdef CUSTOM_MODES
+   if (st->signalling && data!=NULL)
+   {
+      int data0=data[0];
+      /* Convert "standard mode" to Opus header */
+      if (mode->Fs==48000 && mode->shortMdctSize==120)
+      {
+         data0 = fromOpus(data0);
+         if (data0<0)
+            return OPUS_INVALID_PACKET;
+      }
+      st->end = end = IMAX(1, mode->effEBands-2*(data0>>5));
+      LM = (data0>>3)&0x3;
+      C = 1 + ((data0>>2)&0x1);
+      data++;
+      len--;
+      if (LM>mode->maxLM)
+         return OPUS_INVALID_PACKET;
+      if (frame_size < mode->shortMdctSize<<LM)
+         return OPUS_BUFFER_TOO_SMALL;
+      else
+         frame_size = mode->shortMdctSize<<LM;
+   } else {
+#else
+   {
+#endif
+      for (LM=0;LM<=mode->maxLM;LM++)
+         if (mode->shortMdctSize<<LM==frame_size)
+            break;
+      if (LM>mode->maxLM)
+         return OPUS_BAD_ARG;
+   }
+   M=1<<LM;
+
+   if (len<0 || len>1275 || pcm==NULL)
+      return OPUS_BAD_ARG;
+
+   N = M*mode->shortMdctSize;
+   c=0; do {
+      decode_mem[c] = st->_decode_mem + c*(DECODE_BUFFER_SIZE+overlap);
+      out_syn[c] = decode_mem[c]+DECODE_BUFFER_SIZE-N;
+   } while (++c<CC);
+
+   effEnd = end;
+   if (effEnd > mode->effEBands)
+      effEnd = mode->effEBands;
+
+   if (data == NULL || len<=1)
+   {
+      celt_decode_lost(st, N, LM
+#ifdef ENABLE_DEEP_PLC
+      , lpcnet
+#endif
+                      );
+      deemphasis(out_syn, pcm, N, CC, st->downsample, mode->preemph, st->preemph_memD, accum);
+      RESTORE_STACK;
+      return frame_size/st->downsample;
+   }
+#ifdef ENABLE_DEEP_PLC
+   else {
+      /* FIXME: This is a bit of a hack just to make sure opus_decode_native() knows we're no longer in PLC. */
+      if (lpcnet) lpcnet->blend = 0;
+   }
+#endif
+
+   /* Check if there are at least two packets received consecutively before
+    * turning on the pitch-based PLC */
+   if (st->loss_duration == 0) st->skip_plc = 0;
+
+   if (dec == NULL)
+   {
+      ec_dec_init(&_dec,(unsigned char*)data,len);
+      dec = &_dec;
+   }
+
+   if (C==1)
+   {
+      for (i=0;i<nbEBands;i++)
+         oldBandE[i]=MAXG(oldBandE[i],oldBandE[nbEBands+i]);
+   }
+
+   total_bits = len*8;
+   tell = ec_tell(dec);
+
+   if (tell >= total_bits)
+      silence = 1;
+   else if (tell==1)
+      silence = ec_dec_bit_logp(dec, 15);
+   else
+      silence = 0;
+   if (silence)
+   {
+      /* Pretend we've read all the remaining bits */
+      tell = len*8;
+      dec->nbits_total+=tell-ec_tell(dec);
+   }
+
+   postfilter_gain = 0;
+   postfilter_pitch = 0;
+   postfilter_tapset = 0;
+   if (start==0 && tell+16 <= total_bits)
+   {
+      if(ec_dec_bit_logp(dec, 1))
+      {
+         int qg, octave;
+         octave = ec_dec_uint(dec, 6);
+         postfilter_pitch = (16<<octave)+ec_dec_bits(dec, 4+octave)-1;
+         qg = ec_dec_bits(dec, 3);
+         if (ec_tell(dec)+2<=total_bits)
+            postfilter_tapset = ec_dec_icdf(dec, tapset_icdf, 2);
+         postfilter_gain = QCONST16(.09375f,15)*(qg+1);
+      }
+      tell = ec_tell(dec);
+   }
+
+   if (LM > 0 && tell+3 <= total_bits)
+   {
+      isTransient = ec_dec_bit_logp(dec, 3);
+      tell = ec_tell(dec);
+   }
+   else
+      isTransient = 0;
+
+   if (isTransient)
+      shortBlocks = M;
+   else
+      shortBlocks = 0;
+
+   /* Decode the global flags (first symbols in the stream) */
+   intra_ener = tell+3<=total_bits ? ec_dec_bit_logp(dec, 3) : 0;
+   /* If recovering from packet loss, make sure we make the energy prediction safe to reduce the
+      risk of getting loud artifacts. */
+   if (!intra_ener && st->loss_duration != 0) {
+      c=0; do
+      {
+         celt_glog safety = 0;
+         int missing = IMIN(10, st->loss_duration>>LM);
+         if (LM==0) safety = GCONST(1.5f);
+         else if (LM==1) safety = GCONST(.5f);
+         for (i=start;i<end;i++)
+         {
+            if (oldBandE[c*nbEBands+i] < MAXG(oldLogE[c*nbEBands+i], oldLogE2[c*nbEBands+i])) {
+               /* If energy is going down already, continue the trend. */
+               opus_val32 slope;
+               opus_val32 E0, E1, E2;
+               E0 = oldBandE[c*nbEBands+i];
+               E1 = oldLogE[c*nbEBands+i];
+               E2 = oldLogE2[c*nbEBands+i];
+               slope = MAX32(E1 - E0, HALF32(E2 - E0));
+               slope = MING(slope, GCONST(2.f));
+               E0 -= MAX32(0, (1+missing)*slope);
+               oldBandE[c*nbEBands+i] = MAX32(-GCONST(20.f), E0);
+            } else {
+               /* Otherwise take the min of the last frames. */
+               oldBandE[c*nbEBands+i] = MING(MING(oldBandE[c*nbEBands+i], oldLogE[c*nbEBands+i]), oldLogE2[c*nbEBands+i]);
+            }
+            /* Shorter frames have more natural fluctuations -- play it safe. */
+            oldBandE[c*nbEBands+i] -= safety;
+         }
+      } while (++c<2);
+   }
+   /* Get band energies */
+   unquant_coarse_energy(mode, start, end, oldBandE,
+         intra_ener, dec, C, LM);
+
+   ALLOC(tf_res, nbEBands, int);
+   tf_decode(start, end, isTransient, tf_res, LM, dec);
+
+   tell = ec_tell(dec);
+   spread_decision = SPREAD_NORMAL;
+   if (tell+4 <= total_bits)
+      spread_decision = ec_dec_icdf(dec, spread_icdf, 5);
+
+   ALLOC(cap, nbEBands, int);
+
+   init_caps(mode,cap,LM,C);
+
+   ALLOC(offsets, nbEBands, int);
+
+   dynalloc_logp = 6;
+   total_bits<<=BITRES;
+   tell = ec_tell_frac(dec);
+   for (i=start;i<end;i++)
+   {
+      int width, quanta;
+      int dynalloc_loop_logp;
+      int boost;
+      width = C*(eBands[i+1]-eBands[i])<<LM;
+      /* quanta is 6 bits, but no more than 1 bit/sample
+         and no less than 1/8 bit/sample */
+      quanta = IMIN(width<<BITRES, IMAX(6<<BITRES, width));
+      dynalloc_loop_logp = dynalloc_logp;
+      boost = 0;
+      while (tell+(dynalloc_loop_logp<<BITRES) < total_bits && boost < cap[i])
+      {
+         int flag;
+         flag = ec_dec_bit_logp(dec, dynalloc_loop_logp);
+         tell = ec_tell_frac(dec);
+         if (!flag)
+            break;
+         boost += quanta;
+         total_bits -= quanta;
+         dynalloc_loop_logp = 1;
+      }
+      offsets[i] = boost;
+      /* Making dynalloc more likely */
+      if (boost>0)
+         dynalloc_logp = IMAX(2, dynalloc_logp-1);
+   }
+
+   ALLOC(fine_quant, nbEBands, int);
+   alloc_trim = tell+(6<<BITRES) <= total_bits ?
+         ec_dec_icdf(dec, trim_icdf, 7) : 5;
+
+   bits = (((opus_int32)len*8)<<BITRES) - (opus_int32)ec_tell_frac(dec) - 1;
+   anti_collapse_rsv = isTransient&&LM>=2&&bits>=((LM+2)<<BITRES) ? (1<<BITRES) : 0;
+   bits -= anti_collapse_rsv;
+
+   ALLOC(pulses, nbEBands, int);
+   ALLOC(fine_priority, nbEBands, int);
+
+   codedBands = clt_compute_allocation(mode, start, end, offsets, cap,
+         alloc_trim, &intensity, &dual_stereo, bits, &balance, pulses,
+         fine_quant, fine_priority, C, LM, dec, 0, 0, 0);
+
+   unquant_fine_energy(mode, start, end, oldBandE, fine_quant, dec, C);
+
+   c=0; do {
+      OPUS_MOVE(decode_mem[c], decode_mem[c]+N, DECODE_BUFFER_SIZE-N+overlap);
+   } while (++c<CC);
+
+   /* Decode fixed codebook */
+   ALLOC(collapse_masks, C*nbEBands, unsigned char);
+
+   ALLOC(X, C*N, celt_norm);   /**< Interleaved normalised MDCTs */
+
+   quant_all_bands(0, mode, start, end, X, C==2 ? X+N : NULL, collapse_masks,
+         NULL, pulses, shortBlocks, spread_decision, dual_stereo, intensity, tf_res,
+         len*(8<<BITRES)-anti_collapse_rsv, balance, dec, LM, codedBands, &st->rng, 0,
+         st->arch, st->disable_inv);
+
+   if (anti_collapse_rsv > 0)
+   {
+      anti_collapse_on = ec_dec_bits(dec, 1);
+   }
+
+   unquant_energy_finalise(mode, start, end, oldBandE,
+         fine_quant, fine_priority, len*8-ec_tell(dec), dec, C);
+
+   if (anti_collapse_on)
+      anti_collapse(mode, X, collapse_masks, LM, C, N,
+            start, end, oldBandE, oldLogE, oldLogE2, pulses, st->rng, 0, st->arch);
+
+   if (silence)
+   {
+      for (i=0;i<C*nbEBands;i++)
+         oldBandE[i] = -GCONST(28.f);
+   }
+   if (st->prefilter_and_fold) {
+      prefilter_and_fold(st, N);
+   }
+   celt_synthesis(mode, X, out_syn, oldBandE, start, effEnd,
+                  C, CC, isTransient, LM, st->downsample, silence, st->arch);
+
+   c=0; do {
+      st->postfilter_period=IMAX(st->postfilter_period, COMBFILTER_MINPERIOD);
+      st->postfilter_period_old=IMAX(st->postfilter_period_old, COMBFILTER_MINPERIOD);
+      comb_filter(out_syn[c], out_syn[c], st->postfilter_period_old, st->postfilter_period, mode->shortMdctSize,
+            st->postfilter_gain_old, st->postfilter_gain, st->postfilter_tapset_old, st->postfilter_tapset,
+            mode->window, overlap, st->arch);
+      if (LM!=0)
+         comb_filter(out_syn[c]+mode->shortMdctSize, out_syn[c]+mode->shortMdctSize, st->postfilter_period, postfilter_pitch, N-mode->shortMdctSize,
+               st->postfilter_gain, postfilter_gain, st->postfilter_tapset, postfilter_tapset,
+               mode->window, overlap, st->arch);
+
+   } while (++c<CC);
+   st->postfilter_period_old = st->postfilter_period;
+   st->postfilter_gain_old = st->postfilter_gain;
+   st->postfilter_tapset_old = st->postfilter_tapset;
+   st->postfilter_period = postfilter_pitch;
+   st->postfilter_gain = postfilter_gain;
+   st->postfilter_tapset = postfilter_tapset;
+   if (LM!=0)
+   {
+      st->postfilter_period_old = st->postfilter_period;
+      st->postfilter_gain_old = st->postfilter_gain;
+      st->postfilter_tapset_old = st->postfilter_tapset;
+   }
+
+   if (C==1)
+      OPUS_COPY(&oldBandE[nbEBands], oldBandE, nbEBands);
+
+   if (!isTransient)
+   {
+      OPUS_COPY(oldLogE2, oldLogE, 2*nbEBands);
+      OPUS_COPY(oldLogE, oldBandE, 2*nbEBands);
+   } else {
+      for (i=0;i<2*nbEBands;i++)
+         oldLogE[i] = MING(oldLogE[i], oldBandE[i]);
+   }
+   /* In normal circumstances, we only allow the noise floor to increase by
+      up to 2.4 dB/second, but when we're in DTX we give the weight of
+      all missing packets to the update packet. */
+   max_background_increase = IMIN(160, st->loss_duration+M)*GCONST(0.001f);
+   for (i=0;i<2*nbEBands;i++)
+      backgroundLogE[i] = MING(backgroundLogE[i] + max_background_increase, oldBandE[i]);
+   /* In case start or end were to change */
+   c=0; do
+   {
+      for (i=0;i<start;i++)
+      {
+         oldBandE[c*nbEBands+i]=0;
+         oldLogE[c*nbEBands+i]=oldLogE2[c*nbEBands+i]=-GCONST(28.f);
+      }
+      for (i=end;i<nbEBands;i++)
+      {
+         oldBandE[c*nbEBands+i]=0;
+         oldLogE[c*nbEBands+i]=oldLogE2[c*nbEBands+i]=-GCONST(28.f);
+      }
+   } while (++c<2);
+   st->rng = dec->rng;
+
+   deemphasis(out_syn, pcm, N, CC, st->downsample, mode->preemph, st->preemph_memD, accum);
+   st->loss_duration = 0;
+   st->prefilter_and_fold = 0;
+   RESTORE_STACK;
+   if (ec_tell(dec) > 8*len)
+      return OPUS_INTERNAL_ERROR;
+   if(ec_get_error(dec))
+      st->error = 1;
+   return frame_size/st->downsample;
+}
+
+int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data,
+      int len, opus_res * OPUS_RESTRICT pcm, int frame_size, ec_dec *dec, int accum)
+{
+   return celt_decode_with_ec_dred(st, data, len, pcm, frame_size, dec, accum
+#ifdef ENABLE_DEEP_PLC
+       , NULL
+#endif
+       );
+}
+
+#ifdef CUSTOM_MODES
+
+#if defined(FIXED_POINT) && !defined(ENABLE_RES24)
+int opus_custom_decode(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, int len, opus_int16 * OPUS_RESTRICT pcm, int frame_size)
+{
+   return celt_decode_with_ec(st, data, len, pcm, frame_size, NULL, 0);
+}
+#else
+int opus_custom_decode(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, int len, opus_int16 * OPUS_RESTRICT pcm, int frame_size)
+{
+   int j, ret, C, N;
+   VARDECL(opus_res, out);
+   ALLOC_STACK;
+
+   if (pcm==NULL)
+      return OPUS_BAD_ARG;
+
+   C = st->channels;
+   N = frame_size;
+
+   ALLOC(out, C*N, opus_res);
+   ret = celt_decode_with_ec(st, data, len, out, frame_size, NULL, 0);
+   if (ret>0)
+      for (j=0;j<C*ret;j++)
+         pcm[j]=RES2INT16(out[j]);
+
+   RESTORE_STACK;
+   return ret;
+}
+#endif
+
+#if defined(FIXED_POINT) && defined(ENABLE_RES24)
+int opus_custom_decode24(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, int len, opus_int32 * OPUS_RESTRICT pcm, int frame_size)
+{
+   return celt_decode_with_ec(st, data, len, pcm, frame_size, NULL, 0);
+}
+#else
+int opus_custom_decode24(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, int len, opus_int32 * OPUS_RESTRICT pcm, int frame_size)
+{
+   int j, ret, C, N;
+   VARDECL(opus_res, out);
+   ALLOC_STACK;
+
+   if (pcm==NULL)
+      return OPUS_BAD_ARG;
+
+   C = st->channels;
+   N = frame_size;
+
+   ALLOC(out, C*N, opus_res);
+   ret = celt_decode_with_ec(st, data, len, out, frame_size, NULL, 0);
+   if (ret>0)
+      for (j=0;j<C*ret;j++)
+         pcm[j]=RES2INT24(out[j]);
+
+   RESTORE_STACK;
+   return ret;
+}
+#endif
+
+
+#ifndef DISABLE_FLOAT_API
+
+# if !defined(FIXED_POINT)
+int opus_custom_decode_float(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, int len, float * OPUS_RESTRICT pcm, int frame_size)
+{
+   return celt_decode_with_ec(st, data, len, pcm, frame_size, NULL, 0);
+}
+# else
+int opus_custom_decode_float(CELTDecoder * OPUS_RESTRICT st, const unsigned char *data, int len, float * OPUS_RESTRICT pcm, int frame_size)
+{
+   int j, ret, C, N;
+   VARDECL(opus_res, out);
+   ALLOC_STACK;
+
+   if (pcm==NULL)
+      return OPUS_BAD_ARG;
+
+   C = st->channels;
+   N = frame_size;
+
+   ALLOC(out, C*N, opus_res);
+   ret=celt_decode_with_ec(st, data, len, out, frame_size, NULL, 0);
+   if (ret>0)
+      for (j=0;j<C*ret;j++)
+         pcm[j]=RES2FLOAT(out[j]);
+
+   RESTORE_STACK;
+   return ret;
+}
+# endif
+
+#endif
+
+#endif /* CUSTOM_MODES */
+
+int opus_custom_decoder_ctl(CELTDecoder * OPUS_RESTRICT st, int request, ...)
+{
+   va_list ap;
+
+   va_start(ap, request);
+   switch (request)
+   {
+      case OPUS_SET_COMPLEXITY_REQUEST:
+      {
+          opus_int32 value = va_arg(ap, opus_int32);
+          if(value<0 || value>10)
+          {
+             goto bad_arg;
+          }
+          st->complexity = value;
+      }
+      break;
+      case OPUS_GET_COMPLEXITY_REQUEST:
+      {
+          opus_int32 *value = va_arg(ap, opus_int32*);
+          if (!value)
+          {
+             goto bad_arg;
+          }
+          *value = st->complexity;
+      }
+      break;
+      case CELT_SET_START_BAND_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         if (value<0 || value>=st->mode->nbEBands)
+            goto bad_arg;
+         st->start = value;
+      }
+      break;
+      case CELT_SET_END_BAND_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         if (value<1 || value>st->mode->nbEBands)
+            goto bad_arg;
+         st->end = value;
+      }
+      break;
+      case CELT_SET_CHANNELS_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         if (value<1 || value>2)
+            goto bad_arg;
+         st->stream_channels = value;
+      }
+      break;
+      case CELT_GET_AND_CLEAR_ERROR_REQUEST:
+      {
+         opus_int32 *value = va_arg(ap, opus_int32*);
+         if (value==NULL)
+            goto bad_arg;
+         *value=st->error;
+         st->error = 0;
+      }
+      break;
+      case OPUS_GET_LOOKAHEAD_REQUEST:
+      {
+         opus_int32 *value = va_arg(ap, opus_int32*);
+         if (value==NULL)
+            goto bad_arg;
+         *value = st->overlap/st->downsample;
+      }
+      break;
+      case OPUS_RESET_STATE:
+      {
+         int i;
+         opus_val16 *lpc;
+         celt_glog *oldBandE, *oldLogE, *oldLogE2;
+         lpc = (opus_val16*)(st->_decode_mem+(DECODE_BUFFER_SIZE+st->overlap)*st->channels);
+         oldBandE = (celt_glog*)(lpc+st->channels*CELT_LPC_ORDER);
+         oldLogE = oldBandE + 2*st->mode->nbEBands;
+         oldLogE2 = oldLogE + 2*st->mode->nbEBands;
+         OPUS_CLEAR((char*)&st->DECODER_RESET_START,
+               opus_custom_decoder_get_size(st->mode, st->channels)-
+               ((char*)&st->DECODER_RESET_START - (char*)st));
+         for (i=0;i<2*st->mode->nbEBands;i++)
+            oldLogE[i]=oldLogE2[i]=-GCONST(28.f);
+         st->skip_plc = 1;
+      }
+      break;
+      case OPUS_GET_PITCH_REQUEST:
+      {
+         opus_int32 *value = va_arg(ap, opus_int32*);
+         if (value==NULL)
+            goto bad_arg;
+         *value = st->postfilter_period;
+      }
+      break;
+      case CELT_GET_MODE_REQUEST:
+      {
+         const CELTMode ** value = va_arg(ap, const CELTMode**);
+         if (value==0)
+            goto bad_arg;
+         *value=st->mode;
+      }
+      break;
+      case CELT_SET_SIGNALLING_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         st->signalling = value;
+      }
+      break;
+      case OPUS_GET_FINAL_RANGE_REQUEST:
+      {
+         opus_uint32 * value = va_arg(ap, opus_uint32 *);
+         if (value==0)
+            goto bad_arg;
+         *value=st->rng;
+      }
+      break;
+      case OPUS_SET_PHASE_INVERSION_DISABLED_REQUEST:
+      {
+          opus_int32 value = va_arg(ap, opus_int32);
+          if(value<0 || value>1)
+          {
+             goto bad_arg;
+          }
+          st->disable_inv = value;
+      }
+      break;
+      case OPUS_GET_PHASE_INVERSION_DISABLED_REQUEST:
+      {
+          opus_int32 *value = va_arg(ap, opus_int32*);
+          if (!value)
+          {
+             goto bad_arg;
+          }
+          *value = st->disable_inv;
+      }
+      break;
+      default:
+         goto bad_request;
+   }
+   va_end(ap);
+   return OPUS_OK;
+bad_arg:
+   va_end(ap);
+   return OPUS_BAD_ARG;
+bad_request:
+      va_end(ap);
+  return OPUS_UNIMPLEMENTED;
+}
--- a/celt/celt_encoder.c
+++ b/celt/celt_encoder.c
+/* Copyright (c) 2007-2008 CSIRO
+   Copyright (c) 2007-2010 Xiph.Org Foundation
+   Copyright (c) 2008 Gregory Maxwell
+   Written by Jean-Marc Valin and Gregory Maxwell */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#define CELT_ENCODER_C
+
+#include "cpu_support.h"
+#include "os_support.h"
+#include "mdct.h"
+#include <math.h>
+#include "celt.h"
+#include "pitch.h"
+#include "bands.h"
+#include "modes.h"
+#include "entcode.h"
+#include "quant_bands.h"
+#include "rate.h"
+#include "stack_alloc.h"
+#include "mathops.h"
+#include "float_cast.h"
+#include <stdarg.h>
+#include "celt_lpc.h"
+#include "vq.h"
+
+
+#ifndef M_PI
+#define M_PI 3.141592653
+#endif
+
+
+/** Encoder state
+ @brief Encoder state
+ */
+struct OpusCustomEncoder {
+   const OpusCustomMode *mode;     /**< Mode used by the encoder */
+   int channels;
+   int stream_channels;
+
+   int force_intra;
+   int clip;
+   int disable_pf;
+   int complexity;
+   int upsample;
+   int start, end;
+
+   opus_int32 bitrate;
+   int vbr;
+   int signalling;
+   int constrained_vbr;      /* If zero, VBR can do whatever it likes with the rate */
+   int loss_rate;
+   int lsb_depth;
+   int lfe;
+   int disable_inv;
+   int arch;
+
+   /* Everything beyond this point gets cleared on a reset */
+#define ENCODER_RESET_START rng
+
+   opus_uint32 rng;
+   int spread_decision;
+   opus_val32 delayedIntra;
+   int tonal_average;
+   int lastCodedBands;
+   int hf_average;
+   int tapset_decision;
+
+   int prefilter_period;
+   opus_val16 prefilter_gain;
+   int prefilter_tapset;
+#ifdef RESYNTH
+   int prefilter_period_old;
+   opus_val16 prefilter_gain_old;
+   int prefilter_tapset_old;
+#endif
+   int consec_transient;
+   AnalysisInfo analysis;
+   SILKInfo silk_info;
+
+   opus_val32 preemph_memE[2];
+   opus_val32 preemph_memD[2];
+
+   /* VBR-related parameters */
+   opus_int32 vbr_reservoir;
+   opus_int32 vbr_drift;
+   opus_int32 vbr_offset;
+   opus_int32 vbr_count;
+   opus_val32 overlap_max;
+   opus_val16 stereo_saving;
+   int intensity;
+   celt_glog *energy_mask;
+   celt_glog spec_avg;
+
+#ifdef RESYNTH
+   /* +MAX_PERIOD/2 to make space for overlap */
+   celt_sig syn_mem[2][2*MAX_PERIOD+MAX_PERIOD/2];
+#endif
+
+   celt_sig in_mem[1]; /* Size = channels*mode->overlap */
+   /* celt_sig prefilter_mem[],  Size = channels*COMBFILTER_MAXPERIOD */
+   /* celt_glog oldBandE[],     Size = channels*mode->nbEBands */
+   /* celt_glog oldLogE[],      Size = channels*mode->nbEBands */
+   /* celt_glog oldLogE2[],     Size = channels*mode->nbEBands */
+   /* celt_glog energyError[],  Size = channels*mode->nbEBands */
+};
+
+int celt_encoder_get_size(int channels)
+{
+   CELTMode *mode = opus_custom_mode_create(48000, 960, NULL);
+   return opus_custom_encoder_get_size(mode, channels);
+}
+
+OPUS_CUSTOM_NOSTATIC int opus_custom_encoder_get_size(const CELTMode *mode, int channels)
+{
+   int size = sizeof(struct CELTEncoder)
+         + (channels*mode->overlap-1)*sizeof(celt_sig)    /* celt_sig in_mem[channels*mode->overlap]; */
+         + channels*COMBFILTER_MAXPERIOD*sizeof(celt_sig) /* celt_sig prefilter_mem[channels*COMBFILTER_MAXPERIOD]; */
+         + 4*channels*mode->nbEBands*sizeof(celt_glog);   /* celt_glog oldBandE[channels*mode->nbEBands]; */
+                                                          /* celt_glog oldLogE[channels*mode->nbEBands]; */
+                                                          /* celt_glog oldLogE2[channels*mode->nbEBands]; */
+                                                          /* celt_glog energyError[channels*mode->nbEBands]; */
+   return size;
+}
+
+#ifdef CUSTOM_MODES
+CELTEncoder *opus_custom_encoder_create(const CELTMode *mode, int channels, int *error)
+{
+   int ret;
+   CELTEncoder *st = (CELTEncoder *)opus_alloc(opus_custom_encoder_get_size(mode, channels));
+   /* init will handle the NULL case */
+   ret = opus_custom_encoder_init(st, mode, channels);
+   if (ret != OPUS_OK)
+   {
+      opus_custom_encoder_destroy(st);
+      st = NULL;
+   }
+   if (error)
+      *error = ret;
+   return st;
+}
+#endif /* CUSTOM_MODES */
+
+static int opus_custom_encoder_init_arch(CELTEncoder *st, const CELTMode *mode,
+                                         int channels, int arch)
+{
+   if (channels < 0 || channels > 2)
+      return OPUS_BAD_ARG;
+
+   if (st==NULL || mode==NULL)
+      return OPUS_ALLOC_FAIL;
+
+   OPUS_CLEAR((char*)st, opus_custom_encoder_get_size(mode, channels));
+
+   st->mode = mode;
+   st->stream_channels = st->channels = channels;
+
+   st->upsample = 1;
+   st->start = 0;
+   st->end = st->mode->effEBands;
+   st->signalling = 1;
+   st->arch = arch;
+
+   st->constrained_vbr = 1;
+   st->clip = 1;
+
+   st->bitrate = OPUS_BITRATE_MAX;
+   st->vbr = 0;
+   st->force_intra  = 0;
+   st->complexity = 5;
+   st->lsb_depth=24;
+
+   opus_custom_encoder_ctl(st, OPUS_RESET_STATE);
+
+   return OPUS_OK;
+}
+
+#ifdef CUSTOM_MODES
+int opus_custom_encoder_init(CELTEncoder *st, const CELTMode *mode, int channels)
+{
+   return opus_custom_encoder_init_arch(st, mode, channels, opus_select_arch());
+}
+#endif
+
+int celt_encoder_init(CELTEncoder *st, opus_int32 sampling_rate, int channels,
+                      int arch)
+{
+   int ret;
+   ret = opus_custom_encoder_init_arch(st,
+           opus_custom_mode_create(48000, 960, NULL), channels, arch);
+   if (ret != OPUS_OK)
+      return ret;
+   st->upsample = resampling_factor(sampling_rate);
+   return OPUS_OK;
+}
+
+#ifdef CUSTOM_MODES
+void opus_custom_encoder_destroy(CELTEncoder *st)
+{
+   opus_free(st);
+}
+#endif /* CUSTOM_MODES */
+
+
+static int transient_analysis(const opus_val32 * OPUS_RESTRICT in, int len, int C,
+                              opus_val16 *tf_estimate, int *tf_chan, int allow_weak_transients,
+                              int *weak_transient, opus_val16 tone_freq, opus_val32 toneishness)
+{
+   int i;
+   VARDECL(opus_val16, tmp);
+   opus_val32 mem0,mem1;
+   int is_transient = 0;
+   opus_int32 mask_metric = 0;
+   int c;
+   opus_val16 tf_max;
+   int len2;
+   /* Forward masking: 6.7 dB/ms. */
+#ifdef FIXED_POINT
+   int forward_shift = 4;
+#else
+   opus_val16 forward_decay = QCONST16(.0625f,15);
+#endif
+   /* Table of 6*64/x, trained on real data to minimize the average error */
+   static const unsigned char inv_table[128] = {
+         255,255,156,110, 86, 70, 59, 51, 45, 40, 37, 33, 31, 28, 26, 25,
+          23, 22, 21, 20, 19, 18, 17, 16, 16, 15, 15, 14, 13, 13, 12, 12,
+          12, 12, 11, 11, 11, 10, 10, 10,  9,  9,  9,  9,  9,  9,  8,  8,
+           8,  8,  8,  7,  7,  7,  7,  7,  7,  6,  6,  6,  6,  6,  6,  6,
+           6,  6,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,  5,  5,
+           5,  5,  5,  5,  5,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
+           4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  3,  3,
+           3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  2,
+   };
+   SAVE_STACK;
+   ALLOC(tmp, len, opus_val16);
+
+   *weak_transient = 0;
+   /* For lower bitrates, let's be more conservative and have a forward masking
+      decay of 3.3 dB/ms. This avoids having to code transients at very low
+      bitrate (mostly for hybrid), which can result in unstable energy and/or
+      partial collapse. */
+   if (allow_weak_transients)
+   {
+#ifdef FIXED_POINT
+      forward_shift = 5;
+#else
+      forward_decay = QCONST16(.03125f,15);
+#endif
+   }
+   len2=len/2;
+   for (c=0;c<C;c++)
+   {
+      opus_val32 mean;
+      opus_int32 unmask=0;
+      opus_val32 norm;
+      opus_val16 maxE;
+      mem0=0;
+      mem1=0;
+      /* High-pass filter: (1 - 2*z^-1 + z^-2) / (1 - z^-1 + .5*z^-2) */
+      for (i=0;i<len;i++)
+      {
+#ifndef FIXED_POINT
+         float mem00;
+#endif
+         opus_val32 x,y;
+         x = SHR32(in[i+c*len],SIG_SHIFT);
+         y = ADD32(mem0, x);
+#ifdef FIXED_POINT
+         mem0 = mem1 + y - SHL32(x,1);
+         mem1 = x - SHR32(y,1);
+#else
+         /* Original code:
+         mem0 = mem1 + y - 2*x;
+         mem1 = x - .5f*y;
+         Modified code to shorten dependency chains: */
+         mem00=mem0;
+         mem0 = mem0 - x + .5f*mem1;
+         mem1 =  x - mem00;
+#endif
+         tmp[i] = SROUND16(y, 2);
+         /*printf("%f ", tmp[i]);*/
+      }
+      /*printf("\n");*/
+      /* First few samples are bad because we don't propagate the memory */
+      OPUS_CLEAR(tmp, 12);
+
+#ifdef FIXED_POINT
+      /* Normalize tmp to max range */
+      {
+         int shift=0;
+         shift = 14-celt_ilog2(MAX16(1, celt_maxabs16(tmp, len)));
+         if (shift!=0)
+         {
+            for (i=0;i<len;i++)
+               tmp[i] = SHL16(tmp[i], shift);
+         }
+      }
+#endif
+
+      mean=0;
+      mem0=0;
+      /* Grouping by two to reduce complexity */
+      /* Forward pass to compute the post-echo threshold*/
+      for (i=0;i<len2;i++)
+      {
+         opus_val16 x2 = PSHR32(MULT16_16(tmp[2*i],tmp[2*i]) + MULT16_16(tmp[2*i+1],tmp[2*i+1]),16);
+         mean += x2;
+#ifdef FIXED_POINT
+         /* FIXME: Use PSHR16() instead */
+         tmp[i] = mem0 + PSHR32(x2-mem0,forward_shift);
+         mem0 = tmp[i];
+#else
+         mem0 = x2 + (1.f-forward_decay)*mem0;
+         tmp[i] = forward_decay*mem0;
+#endif
+      }
+
+      mem0=0;
+      maxE=0;
+      /* Backward pass to compute the pre-echo threshold */
+      for (i=len2-1;i>=0;i--)
+      {
+         /* Backward masking: 13.9 dB/ms. */
+#ifdef FIXED_POINT
+         /* FIXME: Use PSHR16() instead */
+         tmp[i] = mem0 + PSHR32(tmp[i]-mem0,3);
+         mem0 = tmp[i];
+         maxE = MAX16(maxE, mem0);
+#else
+         mem0 = tmp[i] + 0.875f*mem0;
+         tmp[i] = 0.125f*mem0;
+         maxE = MAX16(maxE, 0.125f*mem0);
+#endif
+      }
+      /*for (i=0;i<len2;i++)printf("%f ", tmp[i]/mean);printf("\n");*/
+
+      /* Compute the ratio of the "frame energy" over the harmonic mean of the energy.
+         This essentially corresponds to a bitrate-normalized temporal noise-to-mask
+         ratio */
+
+      /* As a compromise with the old transient detector, frame energy is the
+         geometric mean of the energy and half the max */
+#ifdef FIXED_POINT
+      /* Costs two sqrt() to avoid overflows */
+      mean = MULT16_16(celt_sqrt(mean), celt_sqrt(MULT16_16(maxE,len2>>1)));
+#else
+      mean = celt_sqrt(mean * maxE*.5*len2);
+#endif
+      /* Inverse of the mean energy in Q15+6 */
+      norm = SHL32(EXTEND32(len2),6+14)/ADD32(EPSILON,SHR32(mean,1));
+      /* Compute harmonic mean discarding the unreliable boundaries
+         The data is smooth, so we only take 1/4th of the samples */
+      unmask=0;
+      /* We should never see NaNs here. If we find any, then something really bad happened and we better abort
+         before it does any damage later on. If these asserts are disabled (no hardening), then the table
+         lookup a few lines below (id = ...) is likely to crash dur to an out-of-bounds read. DO NOT FIX
+         that crash on NaN since it could result in a worse issue later on. */
+      celt_assert(!celt_isnan(tmp[0]));
+      celt_assert(!celt_isnan(norm));
+      for (i=12;i<len2-5;i+=4)
+      {
+         int id;
+#ifdef FIXED_POINT
+         id = MAX32(0,MIN32(127,MULT16_32_Q15(tmp[i]+EPSILON,norm))); /* Do not round to nearest */
+#else
+         id = (int)MAX32(0,MIN32(127,floor(64*norm*(tmp[i]+EPSILON)))); /* Do not round to nearest */
+#endif
+         unmask += inv_table[id];
+      }
+      /*printf("%d\n", unmask);*/
+      /* Normalize, compensate for the 1/4th of the sample and the factor of 6 in the inverse table */
+      unmask = 64*unmask*4/(6*(len2-17));
+      if (unmask>mask_metric)
+      {
+         *tf_chan = c;
+         mask_metric = unmask;
+      }
+   }
+   is_transient = mask_metric>200;
+   /* Prevent the transient detector from confusing the partial cycle of a
+      very low frequency tone with a transient. */
+   if (toneishness > QCONST32(.98f, 29) && tone_freq < QCONST16(0.026f, 13))
+      is_transient = 0;
+   /* For low bitrates, define "weak transients" that need to be
+      handled differently to avoid partial collapse. */
+   if (allow_weak_transients && is_transient && mask_metric<600) {
+      is_transient = 0;
+      *weak_transient = 1;
+   }
+   /* Arbitrary metric for VBR boost */
+   tf_max = MAX16(0,celt_sqrt(27*mask_metric)-42);
+   /* *tf_estimate = 1 + MIN16(1, sqrt(MAX16(0, tf_max-30))/20); */
+   *tf_estimate = celt_sqrt(MAX32(0, SHL32(MULT16_16(QCONST16(0.0069,14),MIN16(163,tf_max)),14)-QCONST32(0.139,28)));
+   /*printf("%d %f\n", tf_max, mask_metric);*/
+   RESTORE_STACK;
+#ifdef FUZZING
+   is_transient = rand()&0x1;
+#endif
+   /*printf("%d %f %d\n", is_transient, (float)*tf_estimate, tf_max);*/
+   return is_transient;
+}
+
+/* Looks for sudden increases of energy to decide whether we need to patch
+   the transient decision */
+static int patch_transient_decision(celt_glog *newE, celt_glog *oldE, int nbEBands,
+      int start, int end, int C)
+{
+   int i, c;
+   opus_val32 mean_diff=0;
+   celt_glog spread_old[26];
+   /* Apply an aggressive (-6 dB/Bark) spreading function to the old frame to
+      avoid false detection caused by irrelevant bands */
+   if (C==1)
+   {
+      spread_old[start] = oldE[start];
+      for (i=start+1;i<end;i++)
+         spread_old[i] = MAXG(spread_old[i-1]-GCONST(1.0f), oldE[i]);
+   } else {
+      spread_old[start] = MAXG(oldE[start],oldE[start+nbEBands]);
+      for (i=start+1;i<end;i++)
+         spread_old[i] = MAXG(spread_old[i-1]-GCONST(1.0f),
+                               MAXG(oldE[i],oldE[i+nbEBands]));
+   }
+   for (i=end-2;i>=start;i--)
+      spread_old[i] = MAXG(spread_old[i], spread_old[i+1]-GCONST(1.0f));
+   /* Compute mean increase */
+   c=0; do {
+      for (i=IMAX(2,start);i<end-1;i++)
+      {
+         opus_val16 x1, x2;
+         x1 = MAXG(0, newE[i + c*nbEBands]);
+         x2 = MAXG(0, spread_old[i]);
+         mean_diff = ADD32(mean_diff, MAXG(0, SUB32(x1, x2)));
+      }
+   } while (++c<C);
+   mean_diff = DIV32(mean_diff, C*(end-1-IMAX(2,start)));
+   /*printf("%f %f %d\n", mean_diff, max_diff, count);*/
+   return mean_diff > GCONST(1.f);
+}
+
+/** Apply window and compute the MDCT for all sub-frames and
+    all channels in a frame */
+static void compute_mdcts(const CELTMode *mode, int shortBlocks, celt_sig * OPUS_RESTRICT in,
+                          celt_sig * OPUS_RESTRICT out, int C, int CC, int LM, int upsample,
+                          int arch)
+{
+   const int overlap = mode->overlap;
+   int N;
+   int B;
+   int shift;
+   int i, b, c;
+   if (shortBlocks)
+   {
+      B = shortBlocks;
+      N = mode->shortMdctSize;
+      shift = mode->maxLM;
+   } else {
+      B = 1;
+      N = mode->shortMdctSize<<LM;
+      shift = mode->maxLM-LM;
+   }
+   c=0; do {
+      for (b=0;b<B;b++)
+      {
+         /* Interleaving the sub-frames while doing the MDCTs */
+         clt_mdct_forward(&mode->mdct, in+c*(B*N+overlap)+b*N,
+                          &out[b+c*N*B], mode->window, overlap, shift, B,
+                          arch);
+      }
+   } while (++c<CC);
+   if (CC==2&&C==1)
+   {
+      for (i=0;i<B*N;i++)
+         out[i] = ADD32(HALF32(out[i]), HALF32(out[B*N+i]));
+   }
+   if (upsample != 1)
+   {
+      c=0; do
+      {
+         int bound = B*N/upsample;
+         for (i=0;i<bound;i++)
+            out[c*B*N+i] *= upsample;
+         OPUS_CLEAR(&out[c*B*N+bound], B*N-bound);
+      } while (++c<C);
+   }
+}
+
+
+void celt_preemphasis(const opus_res * OPUS_RESTRICT pcmp, celt_sig * OPUS_RESTRICT inp,
+                        int N, int CC, int upsample, const opus_val16 *coef, celt_sig *mem, int clip)
+{
+   int i;
+   opus_val16 coef0;
+   celt_sig m;
+   int Nu;
+
+   coef0 = coef[0];
+   m = *mem;
+
+   /* Fast path for the normal 48kHz case and no clipping */
+   if (coef[1] == 0 && upsample == 1 && !clip)
+   {
+      for (i=0;i<N;i++)
+      {
+         celt_sig x;
+         x = RES2SIG(pcmp[CC*i]);
+         /* Apply pre-emphasis */
+         inp[i] = x - m;
+         m = MULT16_32_Q15(coef0, x);
+      }
+      *mem = m;
+      return;
+   }
+
+   Nu = N/upsample;
+   if (upsample!=1)
+   {
+      OPUS_CLEAR(inp, N);
+   }
+   for (i=0;i<Nu;i++)
+      inp[i*upsample] = RES2SIG(pcmp[CC*i]);
+
+#ifndef FIXED_POINT
+   if (clip)
+   {
+      /* Clip input to avoid encoding non-portable files */
+      for (i=0;i<Nu;i++)
+         inp[i*upsample] = MAX32(-65536.f, MIN32(65536.f,inp[i*upsample]));
+   }
+#else
+   (void)clip; /* Avoids a warning about clip being unused. */
+#endif
+#ifdef CUSTOM_MODES
+   if (coef[1] != 0)
+   {
+      opus_val16 coef1 = coef[1];
+      opus_val16 coef2 = coef[2];
+      for (i=0;i<N;i++)
+      {
+         celt_sig x, tmp;
+         x = inp[i];
+         /* Apply pre-emphasis */
+         tmp = SHL32(MULT16_32_Q15(coef2, x), 15-SIG_SHIFT);
+         inp[i] = tmp + m;
+         m = MULT16_32_Q15(coef1, inp[i]) - MULT16_32_Q15(coef0, tmp);
+      }
+   } else
+#endif
+   {
+      for (i=0;i<N;i++)
+      {
+         celt_sig x;
+         x = inp[i];
+         /* Apply pre-emphasis */
+         inp[i] = x - m;
+         m = MULT16_32_Q15(coef0, x);
+      }
+   }
+   *mem = m;
+}
+
+
+
+static opus_val32 l1_metric(const celt_norm *tmp, int N, int LM, opus_val16 bias)
+{
+   int i;
+   opus_val32 L1;
+   L1 = 0;
+   for (i=0;i<N;i++)
+      L1 += EXTEND32(ABS16(tmp[i]));
+   /* When in doubt, prefer good freq resolution */
+   L1 = MAC16_32_Q15(L1, LM*bias, L1);
+   return L1;
+
+}
+
+static int tf_analysis(const CELTMode *m, int len, int isTransient,
+      int *tf_res, int lambda, celt_norm *X, int N0, int LM,
+      opus_val16 tf_estimate, int tf_chan, int *importance)
+{
+   int i;
+   VARDECL(int, metric);
+   int cost0;
+   int cost1;
+   VARDECL(int, path0);
+   VARDECL(int, path1);
+   VARDECL(celt_norm, tmp);
+   VARDECL(celt_norm, tmp_1);
+   int sel;
+   int selcost[2];
+   int tf_select=0;
+   opus_val16 bias;
+
+   SAVE_STACK;
+   bias = MULT16_16_Q14(QCONST16(.04f,15), MAX16(-QCONST16(.25f,14), QCONST16(.5f,14)-tf_estimate));
+   /*printf("%f ", bias);*/
+
+   ALLOC(metric, len, int);
+   ALLOC(tmp, (m->eBands[len]-m->eBands[len-1])<<LM, celt_norm);
+   ALLOC(tmp_1, (m->eBands[len]-m->eBands[len-1])<<LM, celt_norm);
+   ALLOC(path0, len, int);
+   ALLOC(path1, len, int);
+
+   for (i=0;i<len;i++)
+   {
+      int k, N;
+      int narrow;
+      opus_val32 L1, best_L1;
+      int best_level=0;
+      N = (m->eBands[i+1]-m->eBands[i])<<LM;
+      /* band is too narrow to be split down to LM=-1 */
+      narrow = (m->eBands[i+1]-m->eBands[i])==1;
+      OPUS_COPY(tmp, &X[tf_chan*N0 + (m->eBands[i]<<LM)], N);
+      /* Just add the right channel if we're in stereo */
+      /*if (C==2)
+         for (j=0;j<N;j++)
+            tmp[j] = ADD16(SHR16(tmp[j], 1),SHR16(X[N0+j+(m->eBands[i]<<LM)], 1));*/
+      L1 = l1_metric(tmp, N, isTransient ? LM : 0, bias);
+      best_L1 = L1;
+      /* Check the -1 case for transients */
+      if (isTransient && !narrow)
+      {
+         OPUS_COPY(tmp_1, tmp, N);
+         haar1(tmp_1, N>>LM, 1<<LM);
+         L1 = l1_metric(tmp_1, N, LM+1, bias);
+         if (L1<best_L1)
+         {
+            best_L1 = L1;
+            best_level = -1;
+         }
+      }
+      /*printf ("%f ", L1);*/
+      for (k=0;k<LM+!(isTransient||narrow);k++)
+      {
+         int B;
+
+         if (isTransient)
+            B = (LM-k-1);
+         else
+            B = k+1;
+
+         haar1(tmp, N>>k, 1<<k);
+
+         L1 = l1_metric(tmp, N, B, bias);
+
+         if (L1 < best_L1)
+         {
+            best_L1 = L1;
+            best_level = k+1;
+         }
+      }
+      /*printf ("%d ", isTransient ? LM-best_level : best_level);*/
+      /* metric is in Q1 to be able to select the mid-point (-0.5) for narrower bands */
+      if (isTransient)
+         metric[i] = 2*best_level;
+      else
+         metric[i] = -2*best_level;
+      /* For bands that can't be split to -1, set the metric to the half-way point to avoid
+         biasing the decision */
+      if (narrow && (metric[i]==0 || metric[i]==-2*LM))
+         metric[i]-=1;
+      /*printf("%d ", metric[i]/2 + (!isTransient)*LM);*/
+   }
+   /*printf("\n");*/
+   /* Search for the optimal tf resolution, including tf_select */
+   tf_select = 0;
+   for (sel=0;sel<2;sel++)
+   {
+      cost0 = importance[0]*abs(metric[0]-2*tf_select_table[LM][4*isTransient+2*sel+0]);
+      cost1 = importance[0]*abs(metric[0]-2*tf_select_table[LM][4*isTransient+2*sel+1]) + (isTransient ? 0 : lambda);
+      for (i=1;i<len;i++)
+      {
+         int curr0, curr1;
+         curr0 = IMIN(cost0, cost1 + lambda);
+         curr1 = IMIN(cost0 + lambda, cost1);
+         cost0 = curr0 + importance[i]*abs(metric[i]-2*tf_select_table[LM][4*isTransient+2*sel+0]);
+         cost1 = curr1 + importance[i]*abs(metric[i]-2*tf_select_table[LM][4*isTransient+2*sel+1]);
+      }
+      cost0 = IMIN(cost0, cost1);
+      selcost[sel]=cost0;
+   }
+   /* For now, we're conservative and only allow tf_select=1 for transients.
+    * If tests confirm it's useful for non-transients, we could allow it. */
+   if (selcost[1]<selcost[0] && isTransient)
+      tf_select=1;
+   cost0 = importance[0]*abs(metric[0]-2*tf_select_table[LM][4*isTransient+2*tf_select+0]);
+   cost1 = importance[0]*abs(metric[0]-2*tf_select_table[LM][4*isTransient+2*tf_select+1]) + (isTransient ? 0 : lambda);
+   /* Viterbi forward pass */
+   for (i=1;i<len;i++)
+   {
+      int curr0, curr1;
+      int from0, from1;
+
+      from0 = cost0;
+      from1 = cost1 + lambda;
+      if (from0 < from1)
+      {
+         curr0 = from0;
+         path0[i]= 0;
+      } else {
+         curr0 = from1;
+         path0[i]= 1;
+      }
+
+      from0 = cost0 + lambda;
+      from1 = cost1;
+      if (from0 < from1)
+      {
+         curr1 = from0;
+         path1[i]= 0;
+      } else {
+         curr1 = from1;
+         path1[i]= 1;
+      }
+      cost0 = curr0 + importance[i]*abs(metric[i]-2*tf_select_table[LM][4*isTransient+2*tf_select+0]);
+      cost1 = curr1 + importance[i]*abs(metric[i]-2*tf_select_table[LM][4*isTransient+2*tf_select+1]);
+   }
+   tf_res[len-1] = cost0 < cost1 ? 0 : 1;
+   /* Viterbi backward pass to check the decisions */
+   for (i=len-2;i>=0;i--)
+   {
+      if (tf_res[i+1] == 1)
+         tf_res[i] = path1[i+1];
+      else
+         tf_res[i] = path0[i+1];
+   }
+   /*printf("%d %f\n", *tf_sum, tf_estimate);*/
+   RESTORE_STACK;
+#ifdef FUZZING
+   tf_select = rand()&0x1;
+   tf_res[0] = rand()&0x1;
+   for (i=1;i<len;i++)
+      tf_res[i] = tf_res[i-1] ^ ((rand()&0xF) == 0);
+#endif
+   return tf_select;
+}
+
+static void tf_encode(int start, int end, int isTransient, int *tf_res, int LM, int tf_select, ec_enc *enc)
+{
+   int curr, i;
+   int tf_select_rsv;
+   int tf_changed;
+   int logp;
+   opus_uint32 budget;
+   opus_uint32 tell;
+   budget = enc->storage*8;
+   tell = ec_tell(enc);
+   logp = isTransient ? 2 : 4;
+   /* Reserve space to code the tf_select decision. */
+   tf_select_rsv = LM>0 && tell+logp+1 <= budget;
+   budget -= tf_select_rsv;
+   curr = tf_changed = 0;
+   for (i=start;i<end;i++)
+   {
+      if (tell+logp<=budget)
+      {
+         ec_enc_bit_logp(enc, tf_res[i] ^ curr, logp);
+         tell = ec_tell(enc);
+         curr = tf_res[i];
+         tf_changed |= curr;
+      }
+      else
+         tf_res[i] = curr;
+      logp = isTransient ? 4 : 5;
+   }
+   /* Only code tf_select if it would actually make a difference. */
+   if (tf_select_rsv &&
+         tf_select_table[LM][4*isTransient+0+tf_changed]!=
+         tf_select_table[LM][4*isTransient+2+tf_changed])
+      ec_enc_bit_logp(enc, tf_select, 1);
+   else
+      tf_select = 0;
+   for (i=start;i<end;i++)
+      tf_res[i] = tf_select_table[LM][4*isTransient+2*tf_select+tf_res[i]];
+   /*for(i=0;i<end;i++)printf("%d ", isTransient ? tf_res[i] : LM+tf_res[i]);printf("\n");*/
+}
+
+
+static int alloc_trim_analysis(const CELTMode *m, const celt_norm *X,
+      const celt_glog *bandLogE, int end, int LM, int C, int N0,
+      AnalysisInfo *analysis, opus_val16 *stereo_saving, opus_val16 tf_estimate,
+      int intensity, celt_glog surround_trim, opus_int32 equiv_rate, int arch)
+{
+   int i;
+   opus_val32 diff=0;
+   int c;
+   int trim_index;
+   opus_val16 trim = QCONST16(5.f, 8);
+   opus_val16 logXC, logXC2;
+   /* At low bitrate, reducing the trim seems to help. At higher bitrates, it's less
+      clear what's best, so we're keeping it as it was before, at least for now. */
+   if (equiv_rate < 64000) {
+      trim = QCONST16(4.f, 8);
+   } else if (equiv_rate < 80000) {
+      opus_int32 frac = (equiv_rate-64000) >> 10;
+      trim = QCONST16(4.f, 8) + QCONST16(1.f/16.f, 8)*frac;
+   }
+   if (C==2)
+   {
+      opus_val16 sum = 0; /* Q10 */
+      opus_val16 minXC; /* Q10 */
+      /* Compute inter-channel correlation for low frequencies */
+      for (i=0;i<8;i++)
+      {
+         opus_val32 partial;
+         partial = celt_inner_prod(&X[m->eBands[i]<<LM], &X[N0+(m->eBands[i]<<LM)],
+               (m->eBands[i+1]-m->eBands[i])<<LM, arch);
+         sum = ADD16(sum, EXTRACT16(SHR32(partial, 18)));
+      }
+      sum = MULT16_16_Q15(QCONST16(1.f/8, 15), sum);
+      sum = MIN16(QCONST16(1.f, 10), ABS16(sum));
+      minXC = sum;
+      for (i=8;i<intensity;i++)
+      {
+         opus_val32 partial;
+         partial = celt_inner_prod(&X[m->eBands[i]<<LM], &X[N0+(m->eBands[i]<<LM)],
+               (m->eBands[i+1]-m->eBands[i])<<LM, arch);
+         minXC = MIN16(minXC, ABS16(EXTRACT16(SHR32(partial, 18))));
+      }
+      minXC = MIN16(QCONST16(1.f, 10), ABS16(minXC));
+      /*printf ("%f\n", sum);*/
+      /* mid-side savings estimations based on the LF average*/
+      logXC = celt_log2(QCONST32(1.001f, 20)-MULT16_16(sum, sum));
+      /* mid-side savings estimations based on min correlation */
+      logXC2 = MAX16(HALF16(logXC), celt_log2(QCONST32(1.001f, 20)-MULT16_16(minXC, minXC)));
+#ifdef FIXED_POINT
+      /* Compensate for Q20 vs Q14 input and convert output to Q8 */
+      logXC = PSHR32(logXC-QCONST16(6.f, 10),10-8);
+      logXC2 = PSHR32(logXC2-QCONST16(6.f, 10),10-8);
+#endif
+
+      trim += MAX16(-QCONST16(4.f, 8), MULT16_16_Q15(QCONST16(.75f,15),logXC));
+      *stereo_saving = MIN16(*stereo_saving + QCONST16(0.25f, 8), -HALF16(logXC2));
+   }
+
+   /* Estimate spectral tilt */
+   c=0; do {
+      for (i=0;i<end-1;i++)
+      {
+         diff += SHR32(bandLogE[i+c*m->nbEBands], 5)*(opus_int32)(2+2*i-end);
+      }
+   } while (++c<C);
+   diff /= C*(end-1);
+   /*printf("%f\n", diff);*/
+   trim -= MAX32(-QCONST16(2.f, 8), MIN32(QCONST16(2.f, 8), SHR32(diff+QCONST32(1.f, DB_SHIFT-5),DB_SHIFT-13)/6 ));
+   trim -= SHR16(surround_trim, DB_SHIFT-8);
+   trim -= 2*SHR16(tf_estimate, 14-8);
+#ifndef DISABLE_FLOAT_API
+   if (analysis->valid)
+   {
+      trim -= MAX16(-QCONST16(2.f, 8), MIN16(QCONST16(2.f, 8),
+            (opus_val16)(QCONST16(2.f, 8)*(analysis->tonality_slope+.05f))));
+   }
+#else
+   (void)analysis;
+#endif
+
+#ifdef FIXED_POINT
+   trim_index = PSHR32(trim, 8);
+#else
+   trim_index = (int)floor(.5f+trim);
+#endif
+   trim_index = IMAX(0, IMIN(10, trim_index));
+   /*printf("%d\n", trim_index);*/
+#ifdef FUZZING
+   trim_index = rand()%11;
+#endif
+   return trim_index;
+}
+
+static int stereo_analysis(const CELTMode *m, const celt_norm *X,
+      int LM, int N0)
+{
+   int i;
+   int thetas;
+   opus_val32 sumLR = EPSILON, sumMS = EPSILON;
+
+   /* Use the L1 norm to model the entropy of the L/R signal vs the M/S signal */
+   for (i=0;i<13;i++)
+   {
+      int j;
+      for (j=m->eBands[i]<<LM;j<m->eBands[i+1]<<LM;j++)
+      {
+         opus_val32 L, R, M, S;
+         /* We cast to 32-bit first because of the -32768 case */
+         L = EXTEND32(X[j]);
+         R = EXTEND32(X[N0+j]);
+         M = ADD32(L, R);
+         S = SUB32(L, R);
+         sumLR = ADD32(sumLR, ADD32(ABS32(L), ABS32(R)));
+         sumMS = ADD32(sumMS, ADD32(ABS32(M), ABS32(S)));
+      }
+   }
+   sumMS = MULT16_32_Q15(QCONST16(0.707107f, 15), sumMS);
+   thetas = 13;
+   /* We don't need thetas for lower bands with LM<=1 */
+   if (LM<=1)
+      thetas -= 8;
+   return MULT16_32_Q15((m->eBands[13]<<(LM+1))+thetas, sumMS)
+         > MULT16_32_Q15(m->eBands[13]<<(LM+1), sumLR);
+}
+
+#define MSWAP(a,b) do {celt_glog tmp = a;a=b;b=tmp;} while(0)
+static celt_glog median_of_5(const celt_glog *x)
+{
+   celt_glog t0, t1, t2, t3, t4;
+   t2 = x[2];
+   if (x[0] > x[1])
+   {
+      t0 = x[1];
+      t1 = x[0];
+   } else {
+      t0 = x[0];
+      t1 = x[1];
+   }
+   if (x[3] > x[4])
+   {
+      t3 = x[4];
+      t4 = x[3];
+   } else {
+      t3 = x[3];
+      t4 = x[4];
+   }
+   if (t0 > t3)
+   {
+      MSWAP(t0, t3);
+      MSWAP(t1, t4);
+   }
+   if (t2 > t1)
+   {
+      if (t1 < t3)
+         return MING(t2, t3);
+      else
+         return MING(t4, t1);
+   } else {
+      if (t2 < t3)
+         return MING(t1, t3);
+      else
+         return MING(t2, t4);
+   }
+}
+
+static celt_glog median_of_3(const celt_glog *x)
+{
+   celt_glog t0, t1, t2;
+   if (x[0] > x[1])
+   {
+      t0 = x[1];
+      t1 = x[0];
+   } else {
+      t0 = x[0];
+      t1 = x[1];
+   }
+   t2 = x[2];
+   if (t1 < t2)
+      return t1;
+   else if (t0 < t2)
+      return t2;
+   else
+      return t0;
+}
+
+static celt_glog dynalloc_analysis(const celt_glog *bandLogE, const celt_glog *bandLogE2, const celt_glog *oldBandE,
+      int nbEBands, int start, int end, int C, int *offsets, int lsb_depth, const opus_int16 *logN,
+      int isTransient, int vbr, int constrained_vbr, const opus_int16 *eBands, int LM,
+      int effectiveBytes, opus_int32 *tot_boost_, int lfe, celt_glog *surround_dynalloc,
+      AnalysisInfo *analysis, int *importance, int *spread_weight, opus_val16 tone_freq, opus_val32 toneishness)
+{
+   int i, c;
+   opus_int32 tot_boost=0;
+   celt_glog maxDepth;
+   VARDECL(celt_glog, follower);
+   VARDECL(celt_glog, noise_floor);
+   VARDECL(celt_glog, bandLogE3);
+   SAVE_STACK;
+   ALLOC(follower, C*nbEBands, celt_glog);
+   ALLOC(noise_floor, C*nbEBands, celt_glog);
+   ALLOC(bandLogE3, nbEBands, celt_glog);
+   OPUS_CLEAR(offsets, nbEBands);
+   /* Dynamic allocation code */
+   maxDepth=-GCONST(31.9f);
+   for (i=0;i<end;i++)
+   {
+      /* Noise floor must take into account eMeans, the depth, the width of the bands
+         and the preemphasis filter (approx. square of bark band ID) */
+      noise_floor[i] = GCONST(0.0625f)*logN[i]
+            +GCONST(.5f)+SHL32(9-lsb_depth,DB_SHIFT)-SHL32(eMeans[i],DB_SHIFT-4)
+            +GCONST(.0062f)*(i+5)*(i+5);
+   }
+   c=0;do
+   {
+      for (i=0;i<end;i++)
+         maxDepth = MAXG(maxDepth, bandLogE[c*nbEBands+i]-noise_floor[i]);
+   } while (++c<C);
+   {
+      /* Compute a really simple masking model to avoid taking into account completely masked
+         bands when computing the spreading decision. */
+      VARDECL(celt_glog, mask);
+      VARDECL(celt_glog, sig);
+      ALLOC(mask, nbEBands, celt_glog);
+      ALLOC(sig, nbEBands, celt_glog);
+      for (i=0;i<end;i++)
+         mask[i] = bandLogE[i]-noise_floor[i];
+      if (C==2)
+      {
+         for (i=0;i<end;i++)
+            mask[i] = MAXG(mask[i], bandLogE[nbEBands+i]-noise_floor[i]);
+      }
+      OPUS_COPY(sig, mask, end);
+      for (i=1;i<end;i++)
+         mask[i] = MAXG(mask[i], mask[i-1] - GCONST(2.f));
+      for (i=end-2;i>=0;i--)
+         mask[i] = MAXG(mask[i], mask[i+1] - GCONST(3.f));
+      for (i=0;i<end;i++)
+      {
+         /* Compute SMR: Mask is never more than 72 dB below the peak and never below the noise floor.*/
+         celt_glog smr = sig[i]-MAXG(MAXG(0, maxDepth-GCONST(12.f)), mask[i]);
+         /* Clamp SMR to make sure we're not shifting by something negative or too large. */
+#ifdef FIXED_POINT
+         /* FIXME: Use PSHR16() instead */
+         int shift = -PSHR32(MAXG(-GCONST(5.f), MING(0, smr)), DB_SHIFT);
+#else
+         int shift = IMIN(5, IMAX(0, -(int)floor(.5f + smr)));
+#endif
+         spread_weight[i] = 32 >> shift;
+      }
+      /*for (i=0;i<end;i++)
+         printf("%d ", spread_weight[i]);
+      printf("\n");*/
+   }
+   /* Make sure that dynamic allocation can't make us bust the budget.
+      We enable the feature starting at 24 kb/s for 20-ms frames
+      and 96 kb/s for 2.5 ms frames.  */
+   if (effectiveBytes >= (30 + 5*LM) && !lfe)
+   {
+      int last=0;
+      c=0;do
+      {
+         celt_glog offset;
+         celt_glog tmp;
+         celt_glog *f;
+         OPUS_COPY(bandLogE3, &bandLogE2[c*nbEBands], end);
+         if (LM==0) {
+            /* For 2.5 ms frames, the first 8 bands have just one bin, so the
+               energy is highly unreliable (high variance). For that reason,
+               we take the max with the previous energy so that at least 2 bins
+               are getting used. */
+            for (i=0;i<IMIN(8,end);i++) bandLogE3[i] = MAXG(bandLogE2[c*nbEBands+i], oldBandE[c*nbEBands+i]);
+         }
+         f = &follower[c*nbEBands];
+         f[0] = bandLogE3[0];
+         for (i=1;i<end;i++)
+         {
+            /* The last band to be at least 3 dB higher than the previous one
+               is the last we'll consider. Otherwise, we run into problems on
+               bandlimited signals. */
+            if (bandLogE3[i] > bandLogE3[i-1]+GCONST(.5f))
+               last=i;
+            f[i] = MING(f[i-1]+GCONST(1.5f), bandLogE3[i]);
+         }
+         for (i=last-1;i>=0;i--)
+            f[i] = MING(f[i], MING(f[i+1]+GCONST(2.f), bandLogE3[i]));
+
+         /* Combine with a median filter to avoid dynalloc triggering unnecessarily.
+            The "offset" value controls how conservative we are -- a higher offset
+            reduces the impact of the median filter and makes dynalloc use more bits. */
+         offset = GCONST(1.f);
+         for (i=2;i<end-2;i++)
+            f[i] = MAXG(f[i], median_of_5(&bandLogE3[i-2])-offset);
+         tmp = median_of_3(&bandLogE3[0])-offset;
+         f[0] = MAXG(f[0], tmp);
+         f[1] = MAXG(f[1], tmp);
+         tmp = median_of_3(&bandLogE3[end-3])-offset;
+         f[end-2] = MAXG(f[end-2], tmp);
+         f[end-1] = MAXG(f[end-1], tmp);
+
+         for (i=0;i<end;i++)
+            f[i] = MAXG(f[i], noise_floor[i]);
+      } while (++c<C);
+      if (C==2)
+      {
+         for (i=start;i<end;i++)
+         {
+            /* Consider 24 dB "cross-talk" */
+            follower[nbEBands+i] = MAXG(follower[nbEBands+i], follower[         i]-GCONST(4.f));
+            follower[         i] = MAXG(follower[         i], follower[nbEBands+i]-GCONST(4.f));
+            follower[i] = HALF32(MAXG(0, bandLogE[i]-follower[i]) + MAXG(0, bandLogE[nbEBands+i]-follower[nbEBands+i]));
+         }
+      } else {
+         for (i=start;i<end;i++)
+         {
+            follower[i] = MAXG(0, bandLogE[i]-follower[i]);
+         }
+      }
+      for (i=start;i<end;i++)
+         follower[i] = MAXG(follower[i], surround_dynalloc[i]);
+      for (i=start;i<end;i++)
+      {
+#ifdef FIXED_POINT
+         importance[i] = PSHR32(13*celt_exp2_db(MING(follower[i], GCONST(4.f))), 16);
+#else
+         importance[i] = (int)floor(.5f+13*celt_exp2_db(MING(follower[i], GCONST(4.f))));
+#endif
+      }
+      /* For non-transient CBR/CVBR frames, halve the dynalloc contribution */
+      if ((!vbr || constrained_vbr)&&!isTransient)
+      {
+         for (i=start;i<end;i++)
+            follower[i] = HALF32(follower[i]);
+      }
+      for (i=start;i<end;i++)
+      {
+         if (i<8)
+            follower[i] *= 2;
+         if (i>=12)
+            follower[i] = HALF32(follower[i]);
+      }
+      /* Compensate for Opus' under-allocation on tones. */
+      if (toneishness > QCONST32(.98f, 29)) {
+#ifdef FIXED_POINT
+         int freq_bin = PSHR32(MULT16_16(tone_freq, QCONST16(120/M_PI, 9)), 13+9);
+#else
+         int freq_bin = (int)floor(.5 + tone_freq*120/M_PI);
+#endif
+         for (i=start;i<end;i++) {
+            if (freq_bin >= eBands[i] && freq_bin <= eBands[i+1]) follower[i] += GCONST(2.f);
+            if (freq_bin >= eBands[i]-1 && freq_bin <= eBands[i+1]+1) follower[i] += GCONST(1.f);
+            if (freq_bin >= eBands[i]-2 && freq_bin <= eBands[i+1]+2) follower[i] += GCONST(1.f);
+            if (freq_bin >= eBands[i]-3 && freq_bin <= eBands[i+1]+3) follower[i] += GCONST(.5f);
+         }
+      }
+#ifdef DISABLE_FLOAT_API
+      (void)analysis;
+#else
+      if (analysis->valid)
+      {
+         for (i=start;i<IMIN(LEAK_BANDS, end);i++)
+            follower[i] = follower[i] +  GCONST(1.f/64.f)*analysis->leak_boost[i];
+      }
+#endif
+      for (i=start;i<end;i++)
+      {
+         int width;
+         int boost;
+         int boost_bits;
+
+         follower[i] = MING(follower[i], GCONST(4));
+
+         follower[i] = SHR32(follower[i], 8);
+         width = C*(eBands[i+1]-eBands[i])<<LM;
+         if (width<6)
+         {
+            boost = (int)SHR32(follower[i],DB_SHIFT-8);
+            boost_bits = boost*width<<BITRES;
+         } else if (width > 48) {
+            boost = (int)SHR32(follower[i]*8,DB_SHIFT-8);
+            boost_bits = (boost*width<<BITRES)/8;
+         } else {
+            boost = (int)SHR32(follower[i]*width/6,DB_SHIFT-8);
+            boost_bits = boost*6<<BITRES;
+         }
+         /* For CBR and non-transient CVBR frames, limit dynalloc to 2/3 of the bits */
+         if ((!vbr || (constrained_vbr&&!isTransient))
+               && (tot_boost+boost_bits)>>BITRES>>3 > 2*effectiveBytes/3)
+         {
+            opus_int32 cap = ((2*effectiveBytes/3)<<BITRES<<3);
+            offsets[i] = cap-tot_boost;
+            tot_boost = cap;
+            break;
+         } else {
+            offsets[i] = boost;
+            tot_boost += boost_bits;
+         }
+      }
+   } else {
+      for (i=start;i<end;i++)
+         importance[i] = 13;
+   }
+   *tot_boost_ = tot_boost;
+   RESTORE_STACK;
+   return maxDepth;
+}
+
+#ifdef FIXED_POINT
+void normalize_tone_input(opus_val16 *x, int len) {
+   opus_val32 ac0=len;
+   int i;
+   int shift;
+   for (i=0;i<len;i++) {
+      ac0 = ADD32(ac0, SHR32(MULT16_16(x[i], x[i]), 10));
+   }
+   shift = 5 - (28-celt_ilog2(ac0))/2;
+   if (shift > 0) {
+      for (i=0;i<len;i++) {
+         x[i] = PSHR32(x[i], shift);
+      }
+   }
+}
+int acos_approx(opus_val32 x) {
+   opus_val16 x14;
+   opus_val32 tmp;
+   int flip = x<0;
+   x = abs(x);
+   x14 = x>>15;
+   tmp = (762*x14>>14)-3308;
+   tmp = (tmp*x14>>14)+25726;
+   tmp = tmp*celt_sqrt(IMAX(0, (1<<30) - (x<<1)))>>16;
+   if (flip) tmp = 25736 - tmp;
+   return tmp;
+}
+#endif
+
+/* Compute the LPC coefficients using a least-squares fit for both forward and backward prediction. */
+static int tone_lpc(const opus_val16 *x, int len, int delay, opus_val32 *lpc) {
+   int i;
+   opus_val32 r00=0, r01=0, r11=0, r02=0, r12=0, r22=0;
+   opus_val32 edges;
+   opus_val32 num0, num1, den;
+   celt_assert(len > 2*delay);
+   /* Compute correlations as if using the forward prediction covariance method. */
+   for (i=0;i<len-2*delay;i++) {
+      r00 += MULT16_16(x[i],x[i]);
+      r01 += MULT16_16(x[i],x[i+delay]);
+      r02 += MULT16_16(x[i],x[i+2*delay]);
+   }
+   edges = 0;
+   for (i=0;i<delay;i++) edges += MULT16_16(x[len+i-2*delay],x[len+i-2*delay]) - MULT16_16(x[i],x[i]);
+   r11 = r00+edges;
+   edges = 0;
+   for (i=0;i<delay;i++) edges += MULT16_16(x[len+i-delay],x[len+i-delay]) - MULT16_16(x[i+delay],x[i+delay]);
+   r22 = r11+edges;
+   edges = 0;
+   for (i=0;i<delay;i++) edges += MULT16_16(x[len+i-2*delay],x[len+i-delay]) - MULT16_16(x[i],x[i+delay]);
+   r12 = r01+edges;
+   /* Reverse and sum to get the backward contribution. */
+   {
+      opus_val32 R00, R01, R11, R02, R12, R22;
+      R00 = r00 + r22;
+      R01 = r01 + r12;
+      R11 = 2*r11;
+      R02 = 2*r02;
+      R12 = r12 + r01;
+      R22 = r00 + r22;
+      r00 = R00;
+      r01 = R01;
+      r11 = R11;
+      r02 = R02;
+      r12 = R12;
+      r22 = R22;
+   }
+   /* Solve A*x=b, where A=[r00, r01; r01, r11] and b=[r02; r12]. */
+   den = MULT32_32_Q31(r00,r11) - MULT32_32_Q31(r01,r01);
+#ifdef FIXED_POINT
+   if (den <= SHR32(MULT32_32_Q31(r00,r11), 10)) return 1;
+#else
+   if (den < .001f*MULT32_32_Q31(r00,r11)) return 1;
+#endif
+   num1 = MULT32_32_Q31(r02,r11) - MULT32_32_Q31(r01,r12);
+   if (num1 >= den) lpc[1] = QCONST32(1.f, 29);
+   else if (num1 <= -den) lpc[1] = -QCONST32(1.f, 29);
+   else lpc[1] = frac_div32_q29(num1, den);
+   num0 = MULT32_32_Q31(r00,r12) - MULT32_32_Q31(r02,r01);
+   if (HALF32(num0) >= den) lpc[0] = QCONST32(1.999999f, 29);
+   else if (HALF32(num0) <= -den) lpc[0] = -QCONST32(1.999999f, 29);
+   else lpc[0] = frac_div32_q29(num0, den);
+   /*printf("%f %f\n", lpc[0], lpc[1]);*/
+   return 0;
+}
+
+/* Detects pure of nearly pure tones so we can prevent them from causing problems with the encoder. */
+static opus_val16 tone_detect(const celt_sig *in, const celt_sig *prefilter_mem, int CC, int N, int overlap, opus_val32 *toneishness, opus_int32 Fs) {
+   int i;
+   int delay = 1;
+   int fail;
+   opus_val32 lpc[2];
+   opus_val16 freq;
+   VARDECL(opus_val16, x);
+   ALLOC(x, N+overlap, opus_val16);
+   /* Shift by SIG_SHIFT+1 (+2 for stereo) to account for HF gain of the preemphasis filter. */
+   if (CC==2) {
+      for (i=0;i<N;i++) x[i+overlap] = PSHR32(ADD32(in[i], in[i+N+overlap]), SIG_SHIFT+2);
+      for (i=0;i<overlap;i++) x[i] = PSHR32(ADD32(prefilter_mem[COMBFILTER_MAXPERIOD-overlap+i], prefilter_mem[2*COMBFILTER_MAXPERIOD-overlap+i]), SIG_SHIFT+2);
+   } else {
+      for (i=0;i<N;i++) x[i+overlap] = PSHR32(in[i], SIG_SHIFT+1);
+      for (i=0;i<overlap;i++) x[i] = PSHR32(prefilter_mem[COMBFILTER_MAXPERIOD-overlap+i], SIG_SHIFT+1);
+   }
+#ifdef FIXED_POINT
+   normalize_tone_input(x, N+overlap);
+#endif
+   fail = tone_lpc(x, N+overlap, delay, lpc);
+   /* If our LPC filter resonates too close to DC, retry the analysis with down-sampling. */
+   while (delay <= Fs/3000 && (fail || (lpc[0] > QCONST32(1.f, 29) && lpc[1] < 0))) {
+      delay *= 2;
+      fail = tone_lpc(x, N+overlap, delay, lpc);
+   }
+   /* Check that our filter has complex roots. */
+   if (!fail && MULT32_32_Q31(lpc[0],lpc[0]) + MULT32_32_Q31(QCONST32(3.999999, 29), lpc[1]) < 0) {
+      /* Squared radius of the poles. */
+      *toneishness = -lpc[1];
+#ifdef FIXED_POINT
+      freq = acos_approx(lpc[0]>>1)/delay;
+#else
+      freq = acos(.5f*lpc[0])/delay;
+#endif
+   } else {
+      freq = -1;
+      *toneishness=0;
+   }
+   /*printf("%f %f %f %f\n", freq, lpc[0], lpc[1], *toneishness);*/
+   return freq;
+}
+
+static int run_prefilter(CELTEncoder *st, celt_sig *in, celt_sig *prefilter_mem, int CC, int N,
+      int prefilter_tapset, int *pitch, opus_val16 *gain, int *qgain, int enabled, int nbAvailableBytes, AnalysisInfo *analysis, opus_val16 tone_freq, opus_val32 toneishness)
+{
+   int c;
+   VARDECL(celt_sig, _pre);
+   celt_sig *pre[2];
+   const CELTMode *mode;
+   int pitch_index;
+   opus_val16 gain1;
+   opus_val16 pf_threshold;
+   int pf_on;
+   int qg;
+   int overlap;
+   SAVE_STACK;
+
+   mode = st->mode;
+   overlap = mode->overlap;
+   ALLOC(_pre, CC*(N+COMBFILTER_MAXPERIOD), celt_sig);
+
+   pre[0] = _pre;
+   pre[1] = _pre + (N+COMBFILTER_MAXPERIOD);
+
+
+   c=0; do {
+      OPUS_COPY(pre[c], prefilter_mem+c*COMBFILTER_MAXPERIOD, COMBFILTER_MAXPERIOD);
+      OPUS_COPY(pre[c]+COMBFILTER_MAXPERIOD, in+c*(N+overlap)+overlap, N);
+   } while (++c<CC);
+
+   if (enabled)
+   {
+      VARDECL(opus_val16, pitch_buf);
+      ALLOC(pitch_buf, (COMBFILTER_MAXPERIOD+N)>>1, opus_val16);
+
+      pitch_downsample(pre, pitch_buf, COMBFILTER_MAXPERIOD+N, CC, st->arch);
+      /* Don't search for the fir last 1.5 octave of the range because
+         there's too many false-positives due to short-term correlation */
+      pitch_search(pitch_buf+(COMBFILTER_MAXPERIOD>>1), pitch_buf, N,
+            COMBFILTER_MAXPERIOD-3*COMBFILTER_MINPERIOD, &pitch_index,
+            st->arch);
+      pitch_index = COMBFILTER_MAXPERIOD-pitch_index;
+
+      gain1 = remove_doubling(pitch_buf, COMBFILTER_MAXPERIOD, COMBFILTER_MINPERIOD,
+            N, &pitch_index, st->prefilter_period, st->prefilter_gain, st->arch);
+      if (pitch_index > COMBFILTER_MAXPERIOD-2)
+         pitch_index = COMBFILTER_MAXPERIOD-2;
+      gain1 = MULT16_16_Q15(QCONST16(.7f,15),gain1);
+      /* If we detect that the signal is dominated by a single tone, don't rely on the standard pitch
+         estimator, as it can become unreliable. */
+      if (toneishness > QCONST32(.99f, 29)) {
+         /* If the pitch is too high for our post-filter, apply pitch doubling until
+            we can get something that fits (not ideal, but better than nothing). */
+         while (tone_freq >= QCONST16(0.39f, 13)) tone_freq/=2;
+         if (tone_freq > QCONST16(0.006148f, 13)) {
+#ifdef FIXED_POINT
+            pitch_index = IMIN(51472/tone_freq, COMBFILTER_MAXPERIOD-2);
+#else
+            pitch_index = IMIN((int)floor(.5+2.f*M_PI/tone_freq), COMBFILTER_MAXPERIOD-2);
+#endif
+         } else {
+            /* If the pitch is too low, using a very high pitch will actually give us an improvement
+               due to the DC component of the filter that will be close to our tone. Again, not ideal,
+               but if we only have a single tone, it's better than nothing. */
+            pitch_index = COMBFILTER_MINPERIOD;
+         }
+         gain1 = QCONST16(.75f, 15);
+      }
+      /*printf("%d %d %f %f\n", pitch_change, pitch_index, gain1, st->analysis.tonality);*/
+      if (st->loss_rate>2)
+         gain1 = HALF32(gain1);
+      if (st->loss_rate>4)
+         gain1 = HALF32(gain1);
+      if (st->loss_rate>8)
+         gain1 = 0;
+   } else {
+      gain1 = 0;
+      pitch_index = COMBFILTER_MINPERIOD;
+   }
+#ifndef DISABLE_FLOAT_API
+   if (analysis->valid)
+      gain1 = (opus_val16)(gain1 * analysis->max_pitch_ratio);
+#else
+   (void)analysis;
+#endif
+   /* Gain threshold for enabling the prefilter/postfilter */
+   pf_threshold = QCONST16(.2f,15);
+
+   /* Adjusting the threshold based on rate and continuity */
+   if (abs(pitch_index-st->prefilter_period)*10>pitch_index)
+      pf_threshold += QCONST16(.2f,15);
+   if (nbAvailableBytes<25)
+      pf_threshold += QCONST16(.1f,15);
+   if (nbAvailableBytes<35)
+      pf_threshold += QCONST16(.1f,15);
+   if (st->prefilter_gain > QCONST16(.4f,15))
+      pf_threshold -= QCONST16(.1f,15);
+   if (st->prefilter_gain > QCONST16(.55f,15))
+      pf_threshold -= QCONST16(.1f,15);
+
+   /* Hard threshold at 0.2 */
+   pf_threshold = MAX16(pf_threshold, QCONST16(.2f,15));
+   if (gain1<pf_threshold)
+   {
+      gain1 = 0;
+      pf_on = 0;
+      qg = 0;
+   } else {
+      /*This block is not gated by a total bits check only because
+        of the nbAvailableBytes check above.*/
+      if (ABS16(gain1-st->prefilter_gain)<QCONST16(.1f,15))
+         gain1=st->prefilter_gain;
+
+#ifdef FIXED_POINT
+      qg = ((gain1+1536)>>10)/3-1;
+#else
+      qg = (int)floor(.5f+gain1*32/3)-1;
+#endif
+      qg = IMAX(0, IMIN(7, qg));
+      gain1 = QCONST16(0.09375f,15)*(qg+1);
+      pf_on = 1;
+   }
+   /*printf("%d %f\n", pitch_index, gain1);*/
+
+   c=0; do {
+      int offset = mode->shortMdctSize-overlap;
+      st->prefilter_period=IMAX(st->prefilter_period, COMBFILTER_MINPERIOD);
+      OPUS_COPY(in+c*(N+overlap), st->in_mem+c*(overlap), overlap);
+      if (offset)
+         comb_filter(in+c*(N+overlap)+overlap, pre[c]+COMBFILTER_MAXPERIOD,
+               st->prefilter_period, st->prefilter_period, offset, -st->prefilter_gain, -st->prefilter_gain,
+               st->prefilter_tapset, st->prefilter_tapset, NULL, 0, st->arch);
+
+      comb_filter(in+c*(N+overlap)+overlap+offset, pre[c]+COMBFILTER_MAXPERIOD+offset,
+            st->prefilter_period, pitch_index, N-offset, -st->prefilter_gain, -gain1,
+            st->prefilter_tapset, prefilter_tapset, mode->window, overlap, st->arch);
+      OPUS_COPY(st->in_mem+c*(overlap), in+c*(N+overlap)+N, overlap);
+
+      if (N>COMBFILTER_MAXPERIOD)
+      {
+         OPUS_COPY(prefilter_mem+c*COMBFILTER_MAXPERIOD, pre[c]+N, COMBFILTER_MAXPERIOD);
+      } else {
+         OPUS_MOVE(prefilter_mem+c*COMBFILTER_MAXPERIOD, prefilter_mem+c*COMBFILTER_MAXPERIOD+N, COMBFILTER_MAXPERIOD-N);
+         OPUS_COPY(prefilter_mem+c*COMBFILTER_MAXPERIOD+COMBFILTER_MAXPERIOD-N, pre[c]+COMBFILTER_MAXPERIOD, N);
+      }
+   } while (++c<CC);
+
+   RESTORE_STACK;
+   *gain = gain1;
+   *pitch = pitch_index;
+   *qgain = qg;
+   return pf_on;
+}
+
+static int compute_vbr(const CELTMode *mode, AnalysisInfo *analysis, opus_int32 base_target,
+      int LM, opus_int32 bitrate, int lastCodedBands, int C, int intensity,
+      int constrained_vbr, opus_val16 stereo_saving, int tot_boost,
+      opus_val16 tf_estimate, int pitch_change, celt_glog maxDepth,
+      int lfe, int has_surround_mask, celt_glog surround_masking,
+      celt_glog temporal_vbr)
+{
+   /* The target rate in 8th bits per frame */
+   opus_int32 target;
+   int coded_bins;
+   int coded_bands;
+   opus_val16 tf_calibration;
+   int nbEBands;
+   const opus_int16 *eBands;
+
+   nbEBands = mode->nbEBands;
+   eBands = mode->eBands;
+
+   coded_bands = lastCodedBands ? lastCodedBands : nbEBands;
+   coded_bins = eBands[coded_bands]<<LM;
+   if (C==2)
+      coded_bins += eBands[IMIN(intensity, coded_bands)]<<LM;
+
+   target = base_target;
+
+   /*printf("%f %f %f %f %d %d ", st->analysis.activity, st->analysis.tonality, tf_estimate, st->stereo_saving, tot_boost, coded_bands);*/
+#ifndef DISABLE_FLOAT_API
+   if (analysis->valid && analysis->activity<.4)
+      target -= (opus_int32)((coded_bins<<BITRES)*(.4f-analysis->activity));
+#endif
+   /* Stereo savings */
+   if (C==2)
+   {
+      int coded_stereo_bands;
+      int coded_stereo_dof;
+      opus_val16 max_frac;
+      coded_stereo_bands = IMIN(intensity, coded_bands);
+      coded_stereo_dof = (eBands[coded_stereo_bands]<<LM)-coded_stereo_bands;
+      /* Maximum fraction of the bits we can save if the signal is mono. */
+      max_frac = DIV32_16(MULT16_16(QCONST16(0.8f, 15), coded_stereo_dof), coded_bins);
+      stereo_saving = MIN16(stereo_saving, QCONST16(1.f, 8));
+      /*printf("%d %d %d ", coded_stereo_dof, coded_bins, tot_boost);*/
+      target -= (opus_int32)MIN32(MULT16_32_Q15(max_frac,target),
+                      SHR32(MULT16_16(stereo_saving-QCONST16(0.1f,8),(coded_stereo_dof<<BITRES)),8));
+   }
+   /* Boost the rate according to dynalloc (minus the dynalloc average for calibration). */
+   target += tot_boost-(19<<LM);
+   /* Apply transient boost, compensating for average boost. */
+   tf_calibration = QCONST16(0.044f,14);
+   target += (opus_int32)SHL32(MULT16_32_Q15(tf_estimate-tf_calibration, target),1);
+
+#ifndef DISABLE_FLOAT_API
+   /* Apply tonality boost */
+   if (analysis->valid && !lfe)
+   {
+      opus_int32 tonal_target;
+      float tonal;
+
+      /* Tonality boost (compensating for the average). */
+      tonal = MAX16(0.f,analysis->tonality-.15f)-0.12f;
+      tonal_target = target + (opus_int32)((coded_bins<<BITRES)*1.2f*tonal);
+      if (pitch_change)
+         tonal_target +=  (opus_int32)((coded_bins<<BITRES)*.8f);
+      /*printf("%f %f ", analysis->tonality, tonal);*/
+      target = tonal_target;
+   }
+#else
+   (void)analysis;
+   (void)pitch_change;
+#endif
+
+   if (has_surround_mask&&!lfe)
+   {
+      opus_int32 surround_target = target + (opus_int32)SHR32(MULT16_16(SHR32(surround_masking,DB_SHIFT-10),coded_bins<<BITRES), 10);
+      /*printf("%f %d %d %d %d %d %d ", surround_masking, coded_bins, st->end, st->intensity, surround_target, target, st->bitrate);*/
+      target = IMAX(target/4, surround_target);
+   }
+
+   {
+      opus_int32 floor_depth;
+      int bins;
+      bins = eBands[nbEBands-2]<<LM;
+      /*floor_depth = SHR32(MULT16_16((C*bins<<BITRES),celt_log2(SHL32(MAX16(1,sample_max),13))), DB_SHIFT);*/
+      floor_depth = (opus_int32)SHR32(MULT16_32_Q15((C*bins<<BITRES),maxDepth), DB_SHIFT-15);
+      floor_depth = IMAX(floor_depth, target>>2);
+      target = IMIN(target, floor_depth);
+      /*printf("%f %d\n", maxDepth, floor_depth);*/
+   }
+
+   /* Make VBR less aggressive for constrained VBR because we can't keep a higher bitrate
+      for long. Needs tuning. */
+   if ((!has_surround_mask||lfe) && constrained_vbr)
+   {
+      target = base_target + (opus_int32)MULT16_32_Q15(QCONST16(0.67f, 15), target-base_target);
+   }
+
+   if (!has_surround_mask && tf_estimate < QCONST16(.2f, 14))
+   {
+      opus_val16 amount;
+      opus_val16 tvbr_factor;
+      amount = MULT16_16_Q15(QCONST16(.0000031f, 30), IMAX(0, IMIN(32000, 96000-bitrate)));
+      tvbr_factor = SHR32(MULT16_16(SHR32(temporal_vbr, DB_SHIFT-10), amount), 10);
+      target += (opus_int32)MULT16_32_Q15(tvbr_factor, target);
+   }
+
+   /* Don't allow more than doubling the rate */
+   target = IMIN(2*base_target, target);
+
+   return target;
+}
+
+int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_res * pcm, int frame_size, unsigned char *compressed, int nbCompressedBytes, ec_enc *enc)
+{
+   int i, c, N;
+   opus_int32 bits;
+   ec_enc _enc;
+   VARDECL(celt_sig, in);
+   VARDECL(celt_sig, freq);
+   VARDECL(celt_norm, X);
+   VARDECL(celt_ener, bandE);
+   VARDECL(celt_glog, bandLogE);
+   VARDECL(celt_glog, bandLogE2);
+   VARDECL(int, fine_quant);
+   VARDECL(celt_glog, error);
+   VARDECL(int, pulses);
+   VARDECL(int, cap);
+   VARDECL(int, offsets);
+   VARDECL(int, importance);
+   VARDECL(int, spread_weight);
+   VARDECL(int, fine_priority);
+   VARDECL(int, tf_res);
+   VARDECL(unsigned char, collapse_masks);
+   celt_sig *prefilter_mem;
+   celt_glog *oldBandE, *oldLogE, *oldLogE2, *energyError;
+   int shortBlocks=0;
+   int isTransient=0;
+   const int CC = st->channels;
+   const int C = st->stream_channels;
+   int LM, M;
+   int tf_select;
+   int nbFilledBytes, nbAvailableBytes;
+   int start;
+   int end;
+   int effEnd;
+   int codedBands;
+   int alloc_trim;
+   int pitch_index=COMBFILTER_MINPERIOD;
+   opus_val16 gain1 = 0;
+   int dual_stereo=0;
+   int effectiveBytes;
+   int dynalloc_logp;
+   opus_int32 vbr_rate;
+   opus_int32 total_bits;
+   opus_int32 total_boost;
+   opus_int32 balance;
+   opus_int32 tell;
+   opus_int32 tell0_frac;
+   int prefilter_tapset=0;
+   int pf_on;
+   int anti_collapse_rsv;
+   int anti_collapse_on=0;
+   int silence=0;
+   int tf_chan = 0;
+   opus_val16 tf_estimate;
+   int pitch_change=0;
+   opus_int32 tot_boost;
+   opus_val32 sample_max;
+   celt_glog maxDepth;
+   const OpusCustomMode *mode;
+   int nbEBands;
+   int overlap;
+   const opus_int16 *eBands;
+   int secondMdct;
+   int signalBandwidth;
+   int transient_got_disabled=0;
+   celt_glog surround_masking=0;
+   celt_glog temporal_vbr=0;
+   celt_glog surround_trim = 0;
+   opus_int32 equiv_rate;
+   int hybrid;
+   int weak_transient = 0;
+   int enable_tf_analysis;
+   opus_val16 tone_freq=-1;
+   opus_val32 toneishness=0;
+   VARDECL(celt_glog, surround_dynalloc);
+   ALLOC_STACK;
+
+   mode = st->mode;
+   nbEBands = mode->nbEBands;
+   overlap = mode->overlap;
+   eBands = mode->eBands;
+   start = st->start;
+   end = st->end;
+   hybrid = start != 0;
+   tf_estimate = 0;
+   if (nbCompressedBytes<2 || pcm==NULL)
+   {
+      RESTORE_STACK;
+      return OPUS_BAD_ARG;
+   }
+
+   frame_size *= st->upsample;
+   for (LM=0;LM<=mode->maxLM;LM++)
+      if (mode->shortMdctSize<<LM==frame_size)
+         break;
+   if (LM>mode->maxLM)
+   {
+      RESTORE_STACK;
+      return OPUS_BAD_ARG;
+   }
+   M=1<<LM;
+   N = M*mode->shortMdctSize;
+
+   prefilter_mem = st->in_mem+CC*(overlap);
+   oldBandE = (celt_glog*)(st->in_mem+CC*(overlap+COMBFILTER_MAXPERIOD));
+   oldLogE = oldBandE + CC*nbEBands;
+   oldLogE2 = oldLogE + CC*nbEBands;
+   energyError = oldLogE2 + CC*nbEBands;
+
+   if (enc==NULL)
+   {
+      tell0_frac=tell=1;
+      nbFilledBytes=0;
+   } else {
+      tell0_frac=ec_tell_frac(enc);
+      tell=ec_tell(enc);
+      nbFilledBytes=(tell+4)>>3;
+   }
+
+#ifdef CUSTOM_MODES
+   if (st->signalling && enc==NULL)
+   {
+      int tmp = (mode->effEBands-end)>>1;
+      end = st->end = IMAX(1, mode->effEBands-tmp);
+      compressed[0] = tmp<<5;
+      compressed[0] |= LM<<3;
+      compressed[0] |= (C==2)<<2;
+      /* Convert "standard mode" to Opus header */
+      if (mode->Fs==48000 && mode->shortMdctSize==120)
+      {
+         int c0 = toOpus(compressed[0]);
+         if (c0<0)
+         {
+            RESTORE_STACK;
+            return OPUS_BAD_ARG;
+         }
+         compressed[0] = c0;
+      }
+      compressed++;
+      nbCompressedBytes--;
+   }
+#else
+   celt_assert(st->signalling==0);
+#endif
+
+   /* Can't produce more than 1275 output bytes */
+   nbCompressedBytes = IMIN(nbCompressedBytes,1275);
+
+   if (st->vbr && st->bitrate!=OPUS_BITRATE_MAX)
+   {
+      opus_int32 den=mode->Fs>>BITRES;
+      vbr_rate=(st->bitrate*frame_size+(den>>1))/den;
+#ifdef CUSTOM_MODES
+      if (st->signalling)
+         vbr_rate -= 8<<BITRES;
+#endif
+      effectiveBytes = vbr_rate>>(3+BITRES);
+   } else {
+      opus_int32 tmp;
+      vbr_rate = 0;
+      tmp = st->bitrate*frame_size;
+      if (tell>1)
+         tmp += tell*mode->Fs;
+      if (st->bitrate!=OPUS_BITRATE_MAX)
+      {
+         nbCompressedBytes = IMAX(2, IMIN(nbCompressedBytes,
+               (tmp+4*mode->Fs)/(8*mode->Fs)-!!st->signalling));
+         if (enc != NULL)
+            ec_enc_shrink(enc, nbCompressedBytes);
+      }
+      effectiveBytes = nbCompressedBytes - nbFilledBytes;
+   }
+   nbAvailableBytes = nbCompressedBytes - nbFilledBytes;
+   equiv_rate = ((opus_int32)nbCompressedBytes*8*50 << (3-LM)) - (40*C+20)*((400>>LM) - 50);
+   if (st->bitrate != OPUS_BITRATE_MAX)
+      equiv_rate = IMIN(equiv_rate, st->bitrate - (40*C+20)*((400>>LM) - 50));
+
+   if (enc==NULL)
+   {
+      ec_enc_init(&_enc, compressed, nbCompressedBytes);
+      enc = &_enc;
+   }
+
+   if (vbr_rate>0)
+   {
+      /* Computes the max bit-rate allowed in VBR mode to avoid violating the
+          target rate and buffering.
+         We must do this up front so that bust-prevention logic triggers
+          correctly if we don't have enough bits. */
+      if (st->constrained_vbr)
+      {
+         opus_int32 vbr_bound;
+         opus_int32 max_allowed;
+         /* We could use any multiple of vbr_rate as bound (depending on the
+             delay).
+            This is clamped to ensure we use at least two bytes if the encoder
+             was entirely empty, but to allow 0 in hybrid mode. */
+         vbr_bound = vbr_rate;
+         max_allowed = IMIN(IMAX(tell==1?2:0,
+               (vbr_rate+vbr_bound-st->vbr_reservoir)>>(BITRES+3)),
+               nbAvailableBytes);
+         if(max_allowed < nbAvailableBytes)
+         {
+            nbCompressedBytes = nbFilledBytes+max_allowed;
+            nbAvailableBytes = max_allowed;
+            ec_enc_shrink(enc, nbCompressedBytes);
+         }
+      }
+   }
+   total_bits = nbCompressedBytes*8;
+
+   effEnd = end;
+   if (effEnd > mode->effEBands)
+      effEnd = mode->effEBands;
+
+   ALLOC(in, CC*(N+overlap), celt_sig);
+
+   sample_max=MAX32(st->overlap_max, celt_maxabs_res(pcm, C*(N-overlap)/st->upsample));
+   st->overlap_max=celt_maxabs_res(pcm+C*(N-overlap)/st->upsample, C*overlap/st->upsample);
+   sample_max=MAX32(sample_max, st->overlap_max);
+#ifdef FIXED_POINT
+   silence = (sample_max==0);
+#else
+   silence = (sample_max <= (opus_val16)1/(1<<st->lsb_depth));
+#endif
+#ifdef FUZZING
+   if ((rand()&0x3F)==0)
+      silence = 1;
+#endif
+   if (tell==1)
+      ec_enc_bit_logp(enc, silence, 15);
+   else
+      silence=0;
+   if (silence)
+   {
+      /*In VBR mode there is no need to send more than the minimum. */
+      if (vbr_rate>0)
+      {
+         effectiveBytes=nbCompressedBytes=IMIN(nbCompressedBytes, nbFilledBytes+2);
+         total_bits=nbCompressedBytes*8;
+         nbAvailableBytes=2;
+         ec_enc_shrink(enc, nbCompressedBytes);
+      }
+      /* Pretend we've filled all the remaining bits with zeros
+            (that's what the initialiser did anyway) */
+      tell = nbCompressedBytes*8;
+      enc->nbits_total+=tell-ec_tell(enc);
+   }
+   c=0; do {
+      int need_clip=0;
+#ifndef FIXED_POINT
+      need_clip = st->clip && sample_max>65536.f;
+#endif
+      celt_preemphasis(pcm+c, in+c*(N+overlap)+overlap, N, CC, st->upsample,
+                  mode->preemph, st->preemph_memE+c, need_clip);
+   } while (++c<CC);
+
+
+   tone_freq = tone_detect(in+overlap, prefilter_mem, 1, N, overlap, &toneishness, mode->Fs);
+   /* Find pitch period and gain */
+   {
+      int enabled;
+      int qg;
+      enabled = ((st->lfe&&nbAvailableBytes>3) || nbAvailableBytes>12*C) && !hybrid && !silence && tell+16<=total_bits && !st->disable_pf
+            && st->complexity >= 5;
+
+      prefilter_tapset = st->tapset_decision;
+      pf_on = run_prefilter(st, in, prefilter_mem, CC, N, prefilter_tapset, &pitch_index, &gain1, &qg, enabled, nbAvailableBytes, &st->analysis, tone_freq, toneishness);
+      if ((gain1 > QCONST16(.4f,15) || st->prefilter_gain > QCONST16(.4f,15)) && (!st->analysis.valid || st->analysis.tonality > .3)
+            && (pitch_index > 1.26*st->prefilter_period || pitch_index < .79*st->prefilter_period))
+         pitch_change = 1;
+      if (pf_on==0)
+      {
+         if(!hybrid && tell+16<=total_bits)
+            ec_enc_bit_logp(enc, 0, 1);
+      } else {
+         /*This block is not gated by a total bits check only because
+           of the nbAvailableBytes check above.*/
+         int octave;
+         ec_enc_bit_logp(enc, 1, 1);
+         pitch_index += 1;
+         octave = EC_ILOG(pitch_index)-5;
+         ec_enc_uint(enc, octave, 6);
+         ec_enc_bits(enc, pitch_index-(16<<octave), 4+octave);
+         pitch_index -= 1;
+         ec_enc_bits(enc, qg, 3);
+         ec_enc_icdf(enc, prefilter_tapset, tapset_icdf, 2);
+      }
+   }
+
+   isTransient = 0;
+   shortBlocks = 0;
+   if (st->complexity >= 1 && !st->lfe)
+   {
+      /* Reduces the likelihood of energy instability on fricatives at low bitrate
+         in hybrid mode. It seems like we still want to have real transients on vowels
+         though (small SILK quantization offset value). */
+      int allow_weak_transients = hybrid && effectiveBytes<15 && st->silk_info.signalType != 2;
+      isTransient = transient_analysis(in, N+overlap, CC,
+            &tf_estimate, &tf_chan, allow_weak_transients, &weak_transient, tone_freq, toneishness);
+   }
+   if (LM>0 && ec_tell(enc)+3<=total_bits)
+   {
+      if (isTransient)
+         shortBlocks = M;
+   } else {
+      isTransient = 0;
+      transient_got_disabled=1;
+   }
+
+   ALLOC(freq, CC*N, celt_sig); /**< Interleaved signal MDCTs */
+   ALLOC(bandE,nbEBands*CC, celt_ener);
+   ALLOC(bandLogE,nbEBands*CC, celt_glog);
+
+   secondMdct = shortBlocks && st->complexity>=8;
+   ALLOC(bandLogE2, C*nbEBands, celt_glog);
+   if (secondMdct)
+   {
+      compute_mdcts(mode, 0, in, freq, C, CC, LM, st->upsample, st->arch);
+      compute_band_energies(mode, freq, bandE, effEnd, C, LM, st->arch);
+      amp2Log2(mode, effEnd, end, bandE, bandLogE2, C);
+      for (c=0;c<C;c++)
+      {
+         for (i=0;i<end;i++)
+            bandLogE2[nbEBands*c+i] += HALF32(SHL32(LM, DB_SHIFT));
+      }
+   }
+
+   compute_mdcts(mode, shortBlocks, in, freq, C, CC, LM, st->upsample, st->arch);
+   /* This should catch any NaN in the CELT input. Since we're not supposed to see any (they're filtered
+      at the Opus layer), just abort. */
+   celt_assert(!celt_isnan(freq[0]) && (C==1 || !celt_isnan(freq[N])));
+   if (CC==2&&C==1)
+      tf_chan = 0;
+   compute_band_energies(mode, freq, bandE, effEnd, C, LM, st->arch);
+
+   if (st->lfe)
+   {
+      for (i=2;i<end;i++)
+      {
+         bandE[i] = IMIN(bandE[i], MULT16_32_Q15(QCONST16(1e-4f,15),bandE[0]));
+         bandE[i] = MAX32(bandE[i], EPSILON);
+      }
+   }
+   amp2Log2(mode, effEnd, end, bandE, bandLogE, C);
+
+   ALLOC(surround_dynalloc, C*nbEBands, celt_glog);
+   OPUS_CLEAR(surround_dynalloc, end);
+   /* This computes how much masking takes place between surround channels */
+   if (!hybrid&&st->energy_mask&&!st->lfe)
+   {
+      int mask_end;
+      int midband;
+      int count_dynalloc;
+      opus_val32 mask_avg=0;
+      opus_val32 diff=0;
+      int count=0;
+      mask_end = IMAX(2,st->lastCodedBands);
+      for (c=0;c<C;c++)
+      {
+         for(i=0;i<mask_end;i++)
+         {
+            celt_glog mask;
+            opus_val16 mask16;
+            mask = MAXG(MING(st->energy_mask[nbEBands*c+i],
+                   GCONST(.25f)), -GCONST(2.0f));
+            if (mask > 0)
+               mask = HALF32(mask);
+            mask16 = SHR32(mask, DB_SHIFT-10);
+            mask_avg += MULT16_16(mask16, eBands[i+1]-eBands[i]);
+            count += eBands[i+1]-eBands[i];
+            diff += MULT16_16(mask16, 1+2*i-mask_end);
+         }
+      }
+      celt_assert(count>0);
+      mask_avg = SHL32(DIV32_16(mask_avg,count), DB_SHIFT-10);
+      mask_avg += GCONST(.2f);
+      diff = SHL32(diff*6/(C*(mask_end-1)*(mask_end+1)*mask_end), DB_SHIFT-10);
+      /* Again, being conservative */
+      diff = HALF32(diff);
+      diff = MAX32(MIN32(diff, GCONST(.031f)), -GCONST(.031f));
+      /* Find the band that's in the middle of the coded spectrum */
+      for (midband=0;eBands[midband+1] < eBands[mask_end]/2;midband++);
+      count_dynalloc=0;
+      for(i=0;i<mask_end;i++)
+      {
+         opus_val32 lin;
+         celt_glog unmask;
+         lin = mask_avg + diff*(i-midband);
+         if (C==2)
+            unmask = MAXG(st->energy_mask[i], st->energy_mask[nbEBands+i]);
+         else
+            unmask = st->energy_mask[i];
+         unmask = MING(unmask, GCONST(.0f));
+         unmask -= lin;
+         if (unmask > GCONST(.25f))
+         {
+            surround_dynalloc[i] = unmask - GCONST(.25f);
+            count_dynalloc++;
+         }
+      }
+      if (count_dynalloc>=3)
+      {
+         /* If we need dynalloc in many bands, it's probably because our
+            initial masking rate was too low. */
+         mask_avg += GCONST(.25f);
+         if (mask_avg>0)
+         {
+            /* Something went really wrong in the original calculations,
+               disabling masking. */
+            mask_avg = 0;
+            diff = 0;
+            OPUS_CLEAR(surround_dynalloc, mask_end);
+         } else {
+            for(i=0;i<mask_end;i++)
+               surround_dynalloc[i] = MAXG(0, surround_dynalloc[i]-GCONST(.25f));
+         }
+      }
+      mask_avg += GCONST(.2f);
+      /* Convert to 1/64th units used for the trim */
+      surround_trim = 64*diff;
+      /*printf("%d %d ", mask_avg, surround_trim);*/
+      surround_masking = mask_avg;
+   }
+   /* Temporal VBR (but not for LFE) */
+   if (!st->lfe)
+   {
+      celt_glog follow=-QCONST32(10.0f, DB_SHIFT-5);
+      opus_val32 frame_avg=0;
+      celt_glog offset = shortBlocks?HALF32(SHL32(LM, DB_SHIFT-5)):0;
+      for(i=start;i<end;i++)
+      {
+         follow = MAXG(follow-QCONST32(1.0f, DB_SHIFT-5), SHR32(bandLogE[i],5)-offset);
+         if (C==2)
+            follow = MAXG(follow, SHR32(bandLogE[i+nbEBands],5)-offset);
+         frame_avg += follow;
+      }
+      frame_avg /= (end-start);
+      temporal_vbr = SUB32(SHL32(frame_avg, 5),st->spec_avg);
+      temporal_vbr = MING(GCONST(3.f), MAXG(-GCONST(1.5f), temporal_vbr));
+      st->spec_avg += MULT16_32_Q15(QCONST16(.02f, 15), temporal_vbr);
+   }
+   /*for (i=0;i<21;i++)
+      printf("%f ", bandLogE[i]);
+   printf("\n");*/
+
+   if (!secondMdct)
+   {
+      OPUS_COPY(bandLogE2, bandLogE, C*nbEBands);
+   }
+
+   /* Last chance to catch any transient we might have missed in the
+      time-domain analysis */
+   if (LM>0 && ec_tell(enc)+3<=total_bits && !isTransient && st->complexity>=5 && !st->lfe && !hybrid)
+   {
+      if (patch_transient_decision(bandLogE, oldBandE, nbEBands, start, end, C))
+      {
+         isTransient = 1;
+         shortBlocks = M;
+         compute_mdcts(mode, shortBlocks, in, freq, C, CC, LM, st->upsample, st->arch);
+         compute_band_energies(mode, freq, bandE, effEnd, C, LM, st->arch);
+         amp2Log2(mode, effEnd, end, bandE, bandLogE, C);
+         /* Compensate for the scaling of short vs long mdcts */
+         for (c=0;c<C;c++)
+         {
+            for (i=0;i<end;i++)
+               bandLogE2[nbEBands*c+i] += HALF32(SHL32(LM, DB_SHIFT));
+         }
+         tf_estimate = QCONST16(.2f,14);
+      }
+   }
+
+   if (LM>0 && ec_tell(enc)+3<=total_bits)
+      ec_enc_bit_logp(enc, isTransient, 3);
+
+   ALLOC(X, C*N, celt_norm);         /**< Interleaved normalised MDCTs */
+
+   /* Band normalisation */
+   normalise_bands(mode, freq, X, bandE, effEnd, C, M);
+
+   enable_tf_analysis = effectiveBytes>=15*C && !hybrid && st->complexity>=2 && !st->lfe && toneishness < QCONST32(.98f, 29);
+
+   ALLOC(offsets, nbEBands, int);
+   ALLOC(importance, nbEBands, int);
+   ALLOC(spread_weight, nbEBands, int);
+
+   maxDepth = dynalloc_analysis(bandLogE, bandLogE2, oldBandE, nbEBands, start, end, C, offsets,
+         st->lsb_depth, mode->logN, isTransient, st->vbr, st->constrained_vbr,
+         eBands, LM, effectiveBytes, &tot_boost, st->lfe, surround_dynalloc, &st->analysis, importance, spread_weight, tone_freq, toneishness);
+
+   ALLOC(tf_res, nbEBands, int);
+   /* Disable variable tf resolution for hybrid and at very low bitrate */
+   if (enable_tf_analysis)
+   {
+      int lambda;
+      lambda = IMAX(80, 20480/effectiveBytes + 2);
+      tf_select = tf_analysis(mode, effEnd, isTransient, tf_res, lambda, X, N, LM, tf_estimate, tf_chan, importance);
+      for (i=effEnd;i<end;i++)
+         tf_res[i] = tf_res[effEnd-1];
+   } else if (hybrid && weak_transient)
+   {
+      /* For weak transients, we rely on the fact that improving time resolution using
+         TF on a long window is imperfect and will not result in an energy collapse at
+         low bitrate. */
+      for (i=0;i<end;i++)
+         tf_res[i] = 1;
+      tf_select=0;
+   } else if (hybrid && effectiveBytes<15 && st->silk_info.signalType != 2)
+   {
+      /* For low bitrate hybrid, we force temporal resolution to 5 ms rather than 2.5 ms. */
+      for (i=0;i<end;i++)
+         tf_res[i] = 0;
+      tf_select=isTransient;
+   } else {
+      for (i=0;i<end;i++)
+         tf_res[i] = isTransient;
+      tf_select=0;
+   }
+
+   ALLOC(error, C*nbEBands, celt_glog);
+   c=0;
+   do {
+      for (i=start;i<end;i++)
+      {
+         /* When the energy is stable, slightly bias energy quantization towards
+            the previous error to make the gain more stable (a constant offset is
+            better than fluctuations). */
+         if (ABS32(SUB32(bandLogE[i+c*nbEBands], oldBandE[i+c*nbEBands])) < GCONST(2.f))
+         {
+            bandLogE[i+c*nbEBands] -= MULT16_32_Q15(QCONST16(0.25f, 15), energyError[i+c*nbEBands]);
+         }
+      }
+   } while (++c < C);
+   quant_coarse_energy(mode, start, end, effEnd, bandLogE,
+         oldBandE, total_bits, error, enc,
+         C, LM, nbAvailableBytes, st->force_intra,
+         &st->delayedIntra, st->complexity >= 4, st->loss_rate, st->lfe);
+
+   tf_encode(start, end, isTransient, tf_res, LM, tf_select, enc);
+
+   if (ec_tell(enc)+4<=total_bits)
+   {
+      if (st->lfe)
+      {
+         st->tapset_decision = 0;
+         st->spread_decision = SPREAD_NORMAL;
+      } else if (hybrid)
+      {
+         if (st->complexity == 0)
+            st->spread_decision = SPREAD_NONE;
+         else if (isTransient)
+            st->spread_decision = SPREAD_NORMAL;
+         else
+            st->spread_decision = SPREAD_AGGRESSIVE;
+      } else if (shortBlocks || st->complexity < 3 || nbAvailableBytes < 10*C)
+      {
+         if (st->complexity == 0)
+            st->spread_decision = SPREAD_NONE;
+         else
+            st->spread_decision = SPREAD_NORMAL;
+      } else {
+         /* Disable new spreading+tapset estimator until we can show it works
+            better than the old one. So far it seems like spreading_decision()
+            works best. */
+#if 0
+         if (st->analysis.valid)
+         {
+            static const opus_val16 spread_thresholds[3] = {-QCONST16(.6f, 15), -QCONST16(.2f, 15), -QCONST16(.07f, 15)};
+            static const opus_val16 spread_histeresis[3] = {QCONST16(.15f, 15), QCONST16(.07f, 15), QCONST16(.02f, 15)};
+            static const opus_val16 tapset_thresholds[2] = {QCONST16(.0f, 15), QCONST16(.15f, 15)};
+            static const opus_val16 tapset_histeresis[2] = {QCONST16(.1f, 15), QCONST16(.05f, 15)};
+            st->spread_decision = hysteresis_decision(-st->analysis.tonality, spread_thresholds, spread_histeresis, 3, st->spread_decision);
+            st->tapset_decision = hysteresis_decision(st->analysis.tonality_slope, tapset_thresholds, tapset_histeresis, 2, st->tapset_decision);
+         } else
+#endif
+         {
+            st->spread_decision = spreading_decision(mode, X,
+                  &st->tonal_average, st->spread_decision, &st->hf_average,
+                  &st->tapset_decision, pf_on&&!shortBlocks, effEnd, C, M, spread_weight);
+         }
+         /*printf("%d %d\n", st->tapset_decision, st->spread_decision);*/
+         /*printf("%f %d %f %d\n\n", st->analysis.tonality, st->spread_decision, st->analysis.tonality_slope, st->tapset_decision);*/
+      }
+      ec_enc_icdf(enc, st->spread_decision, spread_icdf, 5);
+   } else {
+      st->spread_decision = SPREAD_NORMAL;
+   }
+
+   /* For LFE, everything interesting is in the first band */
+   if (st->lfe)
+      offsets[0] = IMIN(8, effectiveBytes/3);
+   ALLOC(cap, nbEBands, int);
+   init_caps(mode,cap,LM,C);
+
+   dynalloc_logp = 6;
+   total_bits<<=BITRES;
+   total_boost = 0;
+   tell = ec_tell_frac(enc);
+   for (i=start;i<end;i++)
+   {
+      int width, quanta;
+      int dynalloc_loop_logp;
+      int boost;
+      int j;
+      width = C*(eBands[i+1]-eBands[i])<<LM;
+      /* quanta is 6 bits, but no more than 1 bit/sample
+         and no less than 1/8 bit/sample */
+      quanta = IMIN(width<<BITRES, IMAX(6<<BITRES, width));
+      dynalloc_loop_logp = dynalloc_logp;
+      boost = 0;
+      for (j = 0; tell+(dynalloc_loop_logp<<BITRES) < total_bits-total_boost
+            && boost < cap[i]; j++)
+      {
+         int flag;
+         flag = j<offsets[i];
+         ec_enc_bit_logp(enc, flag, dynalloc_loop_logp);
+         tell = ec_tell_frac(enc);
+         if (!flag)
+            break;
+         boost += quanta;
+         total_boost += quanta;
+         dynalloc_loop_logp = 1;
+      }
+      /* Making dynalloc more likely */
+      if (j)
+         dynalloc_logp = IMAX(2, dynalloc_logp-1);
+      offsets[i] = boost;
+   }
+
+   if (C==2)
+   {
+      static const opus_val16 intensity_thresholds[21]=
+      /* 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19  20  off*/
+        {  1, 2, 3, 4, 5, 6, 7, 8,16,24,36,44,50,56,62,67,72,79,88,106,134};
+      static const opus_val16 intensity_histeresis[21]=
+        {  1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 4, 5, 6,  8, 8};
+
+      /* Always use MS for 2.5 ms frames until we can do a better analysis */
+      if (LM!=0)
+         dual_stereo = stereo_analysis(mode, X, LM, N);
+
+      st->intensity = hysteresis_decision((opus_val16)(equiv_rate/1000),
+            intensity_thresholds, intensity_histeresis, 21, st->intensity);
+      st->intensity = IMIN(end,IMAX(start, st->intensity));
+   }
+
+   alloc_trim = 5;
+   if (tell+(6<<BITRES) <= total_bits - total_boost)
+   {
+      if (start > 0 || st->lfe)
+      {
+         st->stereo_saving = 0;
+         alloc_trim = 5;
+      } else {
+         alloc_trim = alloc_trim_analysis(mode, X, bandLogE,
+            end, LM, C, N, &st->analysis, &st->stereo_saving, tf_estimate,
+            st->intensity, surround_trim, equiv_rate, st->arch);
+      }
+      ec_enc_icdf(enc, alloc_trim, trim_icdf, 7);
+      tell = ec_tell_frac(enc);
+   }
+
+   /* Variable bitrate */
+   if (vbr_rate>0)
+   {
+     opus_val16 alpha;
+     opus_int32 delta;
+     /* The target rate in 8th bits per frame */
+     opus_int32 target, base_target;
+     opus_int32 min_allowed;
+     int lm_diff = mode->maxLM - LM;
+
+     /* Don't attempt to use more than 510 kb/s, even for frames smaller than 20 ms.
+        The CELT allocator will just not be able to use more than that anyway. */
+     nbCompressedBytes = IMIN(nbCompressedBytes,1275>>(3-LM));
+     if (!hybrid)
+     {
+        base_target = vbr_rate - ((40*C+20)<<BITRES);
+     } else {
+        base_target = IMAX(0, vbr_rate - ((9*C+4)<<BITRES));
+     }
+
+     if (st->constrained_vbr)
+        base_target += (st->vbr_offset>>lm_diff);
+
+     if (!hybrid)
+     {
+        target = compute_vbr(mode, &st->analysis, base_target, LM, equiv_rate,
+           st->lastCodedBands, C, st->intensity, st->constrained_vbr,
+           st->stereo_saving, tot_boost, tf_estimate, pitch_change, maxDepth,
+           st->lfe, st->energy_mask!=NULL, surround_masking,
+           temporal_vbr);
+     } else {
+        target = base_target;
+        /* Tonal frames (offset<100) need more bits than noisy (offset>100) ones. */
+        if (st->silk_info.offset < 100) target += 12 << BITRES >> (3-LM);
+        if (st->silk_info.offset > 100) target -= 18 << BITRES >> (3-LM);
+        /* Boosting bitrate on transients and vowels with significant temporal
+           spikes. */
+        target += (opus_int32)MULT16_16_Q14(tf_estimate-QCONST16(.25f,14), (50<<BITRES));
+        /* If we have a strong transient, let's make sure it has enough bits to code
+           the first two bands, so that it can use folding rather than noise. */
+        if (tf_estimate > QCONST16(.7f,14))
+           target = IMAX(target, 50<<BITRES);
+     }
+     /* The current offset is removed from the target and the space used
+        so far is added*/
+     target=target+tell;
+     /* In VBR mode the frame size must not be reduced so much that it would
+         result in the encoder running out of bits.
+        The margin of 2 bytes ensures that none of the bust-prevention logic
+         in the decoder will have triggered so far. */
+     min_allowed = ((tell+total_boost+(1<<(BITRES+3))-1)>>(BITRES+3)) + 2;
+     /* Take into account the 37 bits we need to have left in the packet to
+        signal a redundant frame in hybrid mode. Creating a shorter packet would
+        create an entropy coder desync. */
+     if (hybrid)
+        min_allowed = IMAX(min_allowed, (tell0_frac+(37<<BITRES)+total_boost+(1<<(BITRES+3))-1)>>(BITRES+3));
+
+     nbAvailableBytes = (target+(1<<(BITRES+2)))>>(BITRES+3);
+     nbAvailableBytes = IMAX(min_allowed,nbAvailableBytes);
+     nbAvailableBytes = IMIN(nbCompressedBytes,nbAvailableBytes);
+
+     /* By how much did we "miss" the target on that frame */
+     delta = target - vbr_rate;
+
+     target=nbAvailableBytes<<(BITRES+3);
+
+     /*If the frame is silent we don't adjust our drift, otherwise
+       the encoder will shoot to very high rates after hitting a
+       span of silence, but we do allow the bitres to refill.
+       This means that we'll undershoot our target in CVBR/VBR modes
+       on files with lots of silence. */
+     if(silence)
+     {
+       nbAvailableBytes = 2;
+       target = 2*8<<BITRES;
+       delta = 0;
+     }
+
+     if (st->vbr_count < 970)
+     {
+        st->vbr_count++;
+        alpha = celt_rcp(SHL32(EXTEND32(st->vbr_count+20),16));
+     } else
+        alpha = QCONST16(.001f,15);
+     /* How many bits have we used in excess of what we're allowed */
+     if (st->constrained_vbr)
+        st->vbr_reservoir += target - vbr_rate;
+     /*printf ("%d\n", st->vbr_reservoir);*/
+
+     /* Compute the offset we need to apply in order to reach the target */
+     if (st->constrained_vbr)
+     {
+        st->vbr_drift += (opus_int32)MULT16_32_Q15(alpha,(delta*(1<<lm_diff))-st->vbr_offset-st->vbr_drift);
+        st->vbr_offset = -st->vbr_drift;
+     }
+     /*printf ("%d\n", st->vbr_drift);*/
+
+     if (st->constrained_vbr && st->vbr_reservoir < 0)
+     {
+        /* We're under the min value -- increase rate */
+        int adjust = (-st->vbr_reservoir)/(8<<BITRES);
+        /* Unless we're just coding silence */
+        nbAvailableBytes += silence?0:adjust;
+        st->vbr_reservoir = 0;
+        /*printf ("+%d\n", adjust);*/
+     }
+     nbCompressedBytes = IMIN(nbCompressedBytes,nbAvailableBytes);
+     /*printf("%d\n", nbCompressedBytes*50*8);*/
+     /* This moves the raw bits to take into account the new compressed size */
+     ec_enc_shrink(enc, nbCompressedBytes);
+   }
+
+   /* Bit allocation */
+   ALLOC(fine_quant, nbEBands, int);
+   ALLOC(pulses, nbEBands, int);
+   ALLOC(fine_priority, nbEBands, int);
+
+   /* bits =           packet size                    - where we are - safety*/
+   bits = (((opus_int32)nbCompressedBytes*8)<<BITRES) - (opus_int32)ec_tell_frac(enc) - 1;
+   anti_collapse_rsv = isTransient&&LM>=2&&bits>=((LM+2)<<BITRES) ? (1<<BITRES) : 0;
+   bits -= anti_collapse_rsv;
+   signalBandwidth = end-1;
+#ifndef DISABLE_FLOAT_API
+   if (st->analysis.valid)
+   {
+      int min_bandwidth;
+      if (equiv_rate < (opus_int32)32000*C)
+         min_bandwidth = 13;
+      else if (equiv_rate < (opus_int32)48000*C)
+         min_bandwidth = 16;
+      else if (equiv_rate < (opus_int32)60000*C)
+         min_bandwidth = 18;
+      else  if (equiv_rate < (opus_int32)80000*C)
+         min_bandwidth = 19;
+      else
+         min_bandwidth = 20;
+      signalBandwidth = IMAX(st->analysis.bandwidth, min_bandwidth);
+   }
+#endif
+   if (st->lfe)
+      signalBandwidth = 1;
+   codedBands = clt_compute_allocation(mode, start, end, offsets, cap,
+         alloc_trim, &st->intensity, &dual_stereo, bits, &balance, pulses,
+         fine_quant, fine_priority, C, LM, enc, 1, st->lastCodedBands, signalBandwidth);
+   if (st->lastCodedBands)
+      st->lastCodedBands = IMIN(st->lastCodedBands+1,IMAX(st->lastCodedBands-1,codedBands));
+   else
+      st->lastCodedBands = codedBands;
+
+   quant_fine_energy(mode, start, end, oldBandE, error, fine_quant, enc, C);
+
+   /* Residual quantisation */
+   ALLOC(collapse_masks, C*nbEBands, unsigned char);
+   quant_all_bands(1, mode, start, end, X, C==2 ? X+N : NULL, collapse_masks,
+         bandE, pulses, shortBlocks, st->spread_decision,
+         dual_stereo, st->intensity, tf_res, nbCompressedBytes*(8<<BITRES)-anti_collapse_rsv,
+         balance, enc, LM, codedBands, &st->rng, st->complexity, st->arch, st->disable_inv);
+
+   if (anti_collapse_rsv > 0)
+   {
+      anti_collapse_on = st->consec_transient<2;
+#ifdef FUZZING
+      anti_collapse_on = rand()&0x1;
+#endif
+      ec_enc_bits(enc, anti_collapse_on, 1);
+   }
+   quant_energy_finalise(mode, start, end, oldBandE, error, fine_quant, fine_priority, nbCompressedBytes*8-ec_tell(enc), enc, C);
+   OPUS_CLEAR(energyError, nbEBands*CC);
+   c=0;
+   do {
+      for (i=start;i<end;i++)
+      {
+         energyError[i+c*nbEBands] = MAXG(-GCONST(0.5f), MING(GCONST(0.5f), error[i+c*nbEBands]));
+      }
+   } while (++c < C);
+
+   if (silence)
+   {
+      for (i=0;i<C*nbEBands;i++)
+         oldBandE[i] = -GCONST(28.f);
+   }
+
+#ifdef RESYNTH
+   /* Re-synthesis of the coded audio if required */
+   {
+      celt_sig *out_mem[2];
+
+      if (anti_collapse_on)
+      {
+         anti_collapse(mode, X, collapse_masks, LM, C, N,
+               start, end, oldBandE, oldLogE, oldLogE2, pulses, st->rng, 1, st->arch);
+      }
+
+      c=0; do {
+         OPUS_MOVE(st->syn_mem[c], st->syn_mem[c]+N, 2*MAX_PERIOD-N+overlap/2);
+      } while (++c<CC);
+
+      c=0; do {
+         out_mem[c] = st->syn_mem[c]+2*MAX_PERIOD-N;
+      } while (++c<CC);
+
+      celt_synthesis(mode, X, out_mem, oldBandE, start, effEnd,
+                     C, CC, isTransient, LM, st->upsample, silence, st->arch);
+
+      c=0; do {
+         st->prefilter_period=IMAX(st->prefilter_period, COMBFILTER_MINPERIOD);
+         st->prefilter_period_old=IMAX(st->prefilter_period_old, COMBFILTER_MINPERIOD);
+         comb_filter(out_mem[c], out_mem[c], st->prefilter_period_old, st->prefilter_period, mode->shortMdctSize,
+               st->prefilter_gain_old, st->prefilter_gain, st->prefilter_tapset_old, st->prefilter_tapset,
+               mode->window, overlap, st->arch);
+         if (LM!=0)
+            comb_filter(out_mem[c]+mode->shortMdctSize, out_mem[c]+mode->shortMdctSize, st->prefilter_period, pitch_index, N-mode->shortMdctSize,
+                  st->prefilter_gain, gain1, st->prefilter_tapset, prefilter_tapset,
+                  mode->window, overlap, st->arch);
+      } while (++c<CC);
+
+      /* We reuse freq[] as scratch space for the de-emphasis */
+      deemphasis(out_mem, (opus_res*)pcm, N, CC, st->upsample, mode->preemph, st->preemph_memD, 0);
+      st->prefilter_period_old = st->prefilter_period;
+      st->prefilter_gain_old = st->prefilter_gain;
+      st->prefilter_tapset_old = st->prefilter_tapset;
+   }
+#endif
+
+   st->prefilter_period = pitch_index;
+   st->prefilter_gain = gain1;
+   st->prefilter_tapset = prefilter_tapset;
+#ifdef RESYNTH
+   if (LM!=0)
+   {
+      st->prefilter_period_old = st->prefilter_period;
+      st->prefilter_gain_old = st->prefilter_gain;
+      st->prefilter_tapset_old = st->prefilter_tapset;
+   }
+#endif
+
+   if (CC==2&&C==1) {
+      OPUS_COPY(&oldBandE[nbEBands], oldBandE, nbEBands);
+   }
+
+   if (!isTransient)
+   {
+      OPUS_COPY(oldLogE2, oldLogE, CC*nbEBands);
+      OPUS_COPY(oldLogE, oldBandE, CC*nbEBands);
+   } else {
+      for (i=0;i<CC*nbEBands;i++)
+         oldLogE[i] = MING(oldLogE[i], oldBandE[i]);
+   }
+   /* In case start or end were to change */
+   c=0; do
+   {
+      for (i=0;i<start;i++)
+      {
+         oldBandE[c*nbEBands+i]=0;
+         oldLogE[c*nbEBands+i]=oldLogE2[c*nbEBands+i]=-GCONST(28.f);
+      }
+      for (i=end;i<nbEBands;i++)
+      {
+         oldBandE[c*nbEBands+i]=0;
+         oldLogE[c*nbEBands+i]=oldLogE2[c*nbEBands+i]=-GCONST(28.f);
+      }
+   } while (++c<CC);
+
+   if (isTransient || transient_got_disabled)
+      st->consec_transient++;
+   else
+      st->consec_transient=0;
+   st->rng = enc->rng;
+
+   /* If there's any room left (can only happen for very high rates),
+      it's already filled with zeros */
+   ec_enc_done(enc);
+
+#ifdef CUSTOM_MODES
+   if (st->signalling)
+      nbCompressedBytes++;
+#endif
+
+   RESTORE_STACK;
+   if (ec_get_error(enc))
+      return OPUS_INTERNAL_ERROR;
+   else
+      return nbCompressedBytes;
+}
+
+
+#ifdef CUSTOM_MODES
+
+#if defined(FIXED_POINT) && !defined(ENABLE_RES24)
+int opus_custom_encode(CELTEncoder * OPUS_RESTRICT st, const opus_int16 * pcm, int frame_size, unsigned char *compressed, int nbCompressedBytes)
+{
+   return celt_encode_with_ec(st, pcm, frame_size, compressed, nbCompressedBytes, NULL);
+}
+#else
+int opus_custom_encode(CELTEncoder * OPUS_RESTRICT st, const opus_int16 * pcm, int frame_size, unsigned char *compressed, int nbCompressedBytes)
+{
+   int j, ret, C, N;
+   VARDECL(opus_res, in);
+   ALLOC_STACK;
+
+   if (pcm==NULL)
+      return OPUS_BAD_ARG;
+
+   C = st->channels;
+   N = frame_size;
+   ALLOC(in, C*N, opus_res);
+
+   for (j=0;j<C*N;j++)
+     in[j] = INT16TORES(pcm[j]);
+
+   ret=celt_encode_with_ec(st,in,frame_size,compressed,nbCompressedBytes, NULL);
+#ifdef RESYNTH
+   for (j=0;j<C*N;j++)
+      ((opus_int16*)pcm)[j]=RES2INT16(in[j]);
+#endif
+   RESTORE_STACK;
+   return ret;
+}
+#endif
+
+
+#if defined(FIXED_POINT) && defined(ENABLE_RES24)
+int opus_custom_encode24(CELTEncoder * OPUS_RESTRICT st, const opus_int32 * pcm, int frame_size, unsigned char *compressed, int nbCompressedBytes)
+{
+   return celt_encode_with_ec(st, pcm, frame_size, compressed, nbCompressedBytes, NULL);
+}
+#else
+int opus_custom_encode24(CELTEncoder * OPUS_RESTRICT st, const opus_int32 * pcm, int frame_size, unsigned char *compressed, int nbCompressedBytes)
+{
+   int j, ret, C, N;
+   VARDECL(opus_res, in);
+   ALLOC_STACK;
+
+   if (pcm==NULL)
+      return OPUS_BAD_ARG;
+
+   C = st->channels;
+   N = frame_size;
+   ALLOC(in, C*N, opus_res);
+
+   for (j=0;j<C*N;j++)
+     in[j] = INT24TORES(pcm[j]);
+
+   ret=celt_encode_with_ec(st,in,frame_size,compressed,nbCompressedBytes, NULL);
+#ifdef RESYNTH
+   for (j=0;j<C*N;j++)
+      ((opus_int32*)pcm)[j]=RES2INT24(in[j]);
+#endif
+   RESTORE_STACK;
+   return ret;
+}
+#endif
+
+
+#ifndef DISABLE_FLOAT_API
+
+# if !defined(FIXED_POINT)
+int opus_custom_encode_float(CELTEncoder * OPUS_RESTRICT st, const float * pcm, int frame_size, unsigned char *compressed, int nbCompressedBytes)
+{
+   return celt_encode_with_ec(st, pcm, frame_size, compressed, nbCompressedBytes, NULL);
+}
+# else
+int opus_custom_encode_float(CELTEncoder * OPUS_RESTRICT st, const float * pcm, int frame_size, unsigned char *compressed, int nbCompressedBytes)
+{
+   int j, ret, C, N;
+   VARDECL(opus_res, in);
+   ALLOC_STACK;
+
+   if (pcm==NULL)
+      return OPUS_BAD_ARG;
+
+   C = st->channels;
+   N = frame_size;
+   ALLOC(in, C*N, opus_res);
+
+   for (j=0;j<C*N;j++)
+     in[j] = FLOAT2RES(pcm[j]);
+
+   ret=celt_encode_with_ec(st,in,frame_size,compressed,nbCompressedBytes, NULL);
+#ifdef RESYNTH
+   for (j=0;j<C*N;j++)
+      ((float*)pcm)[j]=RES2FLOAT(in[j]);
+#endif
+   RESTORE_STACK;
+   return ret;
+}
+# endif
+
+#endif
+
+#endif /* CUSTOM_MODES */
+
+int opus_custom_encoder_ctl(CELTEncoder * OPUS_RESTRICT st, int request, ...)
+{
+   va_list ap;
+
+   va_start(ap, request);
+   switch (request)
+   {
+      case OPUS_SET_COMPLEXITY_REQUEST:
+      {
+         int value = va_arg(ap, opus_int32);
+         if (value<0 || value>10)
+            goto bad_arg;
+         st->complexity = value;
+      }
+      break;
+      case CELT_SET_START_BAND_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         if (value<0 || value>=st->mode->nbEBands)
+            goto bad_arg;
+         st->start = value;
+      }
+      break;
+      case CELT_SET_END_BAND_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         if (value<1 || value>st->mode->nbEBands)
+            goto bad_arg;
+         st->end = value;
+      }
+      break;
+      case CELT_SET_PREDICTION_REQUEST:
+      {
+         int value = va_arg(ap, opus_int32);
+         if (value<0 || value>2)
+            goto bad_arg;
+         st->disable_pf = value<=1;
+         st->force_intra = value==0;
+      }
+      break;
+      case OPUS_SET_PACKET_LOSS_PERC_REQUEST:
+      {
+         int value = va_arg(ap, opus_int32);
+         if (value<0 || value>100)
+            goto bad_arg;
+         st->loss_rate = value;
+      }
+      break;
+      case OPUS_SET_VBR_CONSTRAINT_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         st->constrained_vbr = value;
+      }
+      break;
+      case OPUS_SET_VBR_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         st->vbr = value;
+      }
+      break;
+      case OPUS_SET_BITRATE_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         if (value<=500 && value!=OPUS_BITRATE_MAX)
+            goto bad_arg;
+         value = IMIN(value, 260000*st->channels);
+         st->bitrate = value;
+      }
+      break;
+      case CELT_SET_CHANNELS_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         if (value<1 || value>2)
+            goto bad_arg;
+         st->stream_channels = value;
+      }
+      break;
+      case OPUS_SET_LSB_DEPTH_REQUEST:
+      {
+          opus_int32 value = va_arg(ap, opus_int32);
+          if (value<8 || value>24)
+             goto bad_arg;
+          st->lsb_depth=value;
+      }
+      break;
+      case OPUS_GET_LSB_DEPTH_REQUEST:
+      {
+          opus_int32 *value = va_arg(ap, opus_int32*);
+          *value=st->lsb_depth;
+      }
+      break;
+      case OPUS_SET_PHASE_INVERSION_DISABLED_REQUEST:
+      {
+          opus_int32 value = va_arg(ap, opus_int32);
+          if(value<0 || value>1)
+          {
+             goto bad_arg;
+          }
+          st->disable_inv = value;
+      }
+      break;
+      case OPUS_GET_PHASE_INVERSION_DISABLED_REQUEST:
+      {
+          opus_int32 *value = va_arg(ap, opus_int32*);
+          if (!value)
+          {
+             goto bad_arg;
+          }
+          *value = st->disable_inv;
+      }
+      break;
+      case OPUS_RESET_STATE:
+      {
+         int i;
+         celt_glog *oldBandE, *oldLogE, *oldLogE2;
+         oldBandE = (celt_glog*)(st->in_mem+st->channels*(st->mode->overlap+COMBFILTER_MAXPERIOD));
+         oldLogE = oldBandE + st->channels*st->mode->nbEBands;
+         oldLogE2 = oldLogE + st->channels*st->mode->nbEBands;
+         OPUS_CLEAR((char*)&st->ENCODER_RESET_START,
+               opus_custom_encoder_get_size(st->mode, st->channels)-
+               ((char*)&st->ENCODER_RESET_START - (char*)st));
+         for (i=0;i<st->channels*st->mode->nbEBands;i++)
+            oldLogE[i]=oldLogE2[i]=-GCONST(28.f);
+         st->vbr_offset = 0;
+         st->delayedIntra = 1;
+         st->spread_decision = SPREAD_NORMAL;
+         st->tonal_average = 256;
+         st->hf_average = 0;
+         st->tapset_decision = 0;
+      }
+      break;
+#ifdef CUSTOM_MODES
+      case CELT_SET_INPUT_CLIPPING_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         st->clip = value;
+      }
+      break;
+#endif
+      case CELT_SET_SIGNALLING_REQUEST:
+      {
+         opus_int32 value = va_arg(ap, opus_int32);
+         st->signalling = value;
+      }
+      break;
+      case CELT_SET_ANALYSIS_REQUEST:
+      {
+         AnalysisInfo *info = va_arg(ap, AnalysisInfo *);
+         if (info)
+            OPUS_COPY(&st->analysis, info, 1);
+      }
+      break;
+      case CELT_SET_SILK_INFO_REQUEST:
+      {
+         SILKInfo *info = va_arg(ap, SILKInfo *);
+         if (info)
+            OPUS_COPY(&st->silk_info, info, 1);
+      }
+      break;
+      case CELT_GET_MODE_REQUEST:
+      {
+         const CELTMode ** value = va_arg(ap, const CELTMode**);
+         if (value==0)
+            goto bad_arg;
+         *value=st->mode;
+      }
+      break;
+      case OPUS_GET_FINAL_RANGE_REQUEST:
+      {
+         opus_uint32 * value = va_arg(ap, opus_uint32 *);
+         if (value==0)
+            goto bad_arg;
+         *value=st->rng;
+      }
+      break;
+      case OPUS_SET_LFE_REQUEST:
+      {
+          opus_int32 value = va_arg(ap, opus_int32);
+          st->lfe = value;
+      }
+      break;
+      case OPUS_SET_ENERGY_MASK_REQUEST:
+      {
+         celt_glog *value = va_arg(ap, celt_glog*);
+          st->energy_mask = value;
+      }
+      break;
+      default:
+         goto bad_request;
+   }
+   va_end(ap);
+   return OPUS_OK;
+bad_arg:
+   va_end(ap);
+   return OPUS_BAD_ARG;
+bad_request:
+   va_end(ap);
+   return OPUS_UNIMPLEMENTED;
+}
--- a/libcelt/plc.c
+++ b/libcelt/plc.c
@@ -15,8 +15,8 @@
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
-   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
@@ -29,9 +29,10 @@
 #include "config.h"
 #endif

-#include "plc.h"
+#include "celt_lpc.h"
 #include "stack_alloc.h"
 #include "mathops.h"
+#include "pitch.h"

 void _celt_lpc(
      opus_val16       *_lpc, /* out: [0...p-1] LPC coefficients      */
@@ -43,24 +44,27 @@ int          p
   opus_val32 r;
   opus_val32 error = ac[0];
 #ifdef FIXED_POINT
-   opus_val32 lpc[LPC_ORDER];
+   opus_val32 lpc[CELT_LPC_ORDER];
 #else
   float *lpc = _lpc;
 #endif

-   for (i = 0; i < p; i++)
-      lpc[i] = 0;
+   OPUS_CLEAR(lpc, p);
+#ifdef FIXED_POINT
   if (ac[0] != 0)
+#else
+   if (ac[0] > 1e-10f)
+#endif
   {
      for (i = 0; i < p; i++) {
         /* Sum up this iteration's reflection coefficient */
         opus_val32 rr = 0;
         for (j = 0; j < i; j++)
            rr += MULT32_32_Q31(lpc[j],ac[i - j]);
-         rr += SHR32(ac[i + 1],3);
-         r = -frac_div32(SHL32(rr,3), error);
+         rr += SHR32(ac[i + 1],6);
+         r = -frac_div32(SHL32(rr,6), error);
         /*  Update LPC coefficients and total error */
-         lpc[i] = SHR32(r,3);
+         lpc[i] = SHR32(r,6);
         for (j = 0; j < (i+1)>>1; j++)
         {
            opus_val32 tmp1, tmp2;
@@ -73,56 +77,127 @@ int          p
         error = error - MULT32_32_Q31(MULT32_32_Q31(r,r),error);
         /* Bail out once we get 30 dB gain */
 #ifdef FIXED_POINT
-         if (error<SHR32(ac[0],10))
+         if (error<=SHR32(ac[0],10))
            break;
 #else
-         if (error<.001f*ac[0])
+         if (error<=.001f*ac[0])
            break;
 #endif
      }
   }
 #ifdef FIXED_POINT
-   for (i=0;i<p;i++)
-      _lpc[i] = ROUND16(lpc[i],16);
+   {
+      /* Convert the int32 lpcs to int16 and ensure there are no wrap-arounds.
+         This reuses the logic in silk_LPC_fit() and silk_bwexpander_32(). Any bug
+         fixes should also be applied there. */
+      int iter, idx = 0;
+      opus_val32 maxabs, absval, chirp_Q16, chirp_minus_one_Q16;
+
+      for (iter = 0; iter < 10; iter++) {
+         maxabs = 0;
+         for (i = 0; i < p; i++) {
+            absval = ABS32(lpc[i]);
+            if (absval > maxabs) {
+               maxabs = absval;
+               idx = i;
+            }
+         }
+         maxabs = PSHR32(maxabs, 13);  /* Q25->Q12 */
+
+         if (maxabs > 32767) {
+            maxabs = MIN32(maxabs, 163838);
+            chirp_Q16 = QCONST32(0.999, 16) - DIV32(SHL32(maxabs - 32767, 14),
+                                                    SHR32(MULT32_32_32(maxabs, idx + 1), 2));
+            chirp_minus_one_Q16 = chirp_Q16 - 65536;
+
+            /* Apply bandwidth expansion. */
+            for (i = 0; i < p - 1; i++) {
+               lpc[i] = MULT32_32_Q16(chirp_Q16, lpc[i]);
+               chirp_Q16 += PSHR32(MULT32_32_32(chirp_Q16, chirp_minus_one_Q16), 16);
+            }
+            lpc[p - 1] = MULT32_32_Q16(chirp_Q16, lpc[p - 1]);
+         } else {
+            break;
+         }
+      }
+
+      if (iter == 10) {
+         /* If the coeffs still do not fit into the 16 bit range after 10 iterations,
+            fall back to the A(z)=1 filter. */
+         OPUS_CLEAR(lpc, p);
+         _lpc[0] = 4096;  /* Q12 */
+      } else {
+         for (i = 0; i < p; i++) {
+            _lpc[i] = EXTRACT16(PSHR32(lpc[i], 13));  /* Q25->Q12 */
+         }
+      }
+   }
 #endif
 }

-void fir(const opus_val16 *x,
+
+void celt_fir_c(
+         const opus_val16 *x,
         const opus_val16 *num,
         opus_val16 *y,
         int N,
         int ord,
-         opus_val16 *mem)
+         int arch)
 {
   int i,j;
-
-   for (i=0;i<N;i++)
+   VARDECL(opus_val16, rnum);
+   SAVE_STACK;
+   celt_assert(x != y);
+   ALLOC(rnum, ord, opus_val16);
+   for(i=0;i<ord;i++)
+      rnum[i] = num[ord-i-1];
+   for (i=0;i<N-3;i+=4)
   {
-      opus_val32 sum = SHL32(EXTEND32(x[i]), SIG_SHIFT);
-      for (j=0;j<ord;j++)
+      opus_val32 sum[4];
+      sum[0] = SHL32(EXTEND32(x[i  ]), SIG_SHIFT);
+      sum[1] = SHL32(EXTEND32(x[i+1]), SIG_SHIFT);
+      sum[2] = SHL32(EXTEND32(x[i+2]), SIG_SHIFT);
+      sum[3] = SHL32(EXTEND32(x[i+3]), SIG_SHIFT);
+#if defined(OPUS_CHECK_ASM) && defined(FIXED_POINT)
      {
-         sum += MULT16_16(num[j],mem[j]);
-      }
-      for (j=ord-1;j>=1;j--)
-      {
-         mem[j]=mem[j-1];
+         opus_val32 sum_c[4];
+         memcpy(sum_c, sum, sizeof(sum_c));
+         xcorr_kernel_c(rnum, x+i-ord, sum_c, ord);
+#endif
+         xcorr_kernel(rnum, x+i-ord, sum, ord, arch);
+#if defined(OPUS_CHECK_ASM) && defined(FIXED_POINT)
+         celt_assert(memcmp(sum, sum_c, sizeof(sum)) == 0);
      }
-      mem[0] = x[i];
-      y[i] = ROUND16(sum, SIG_SHIFT);
+#endif
+      y[i  ] = SROUND16(sum[0], SIG_SHIFT);
+      y[i+1] = SROUND16(sum[1], SIG_SHIFT);
+      y[i+2] = SROUND16(sum[2], SIG_SHIFT);
+      y[i+3] = SROUND16(sum[3], SIG_SHIFT);
   }
+   for (;i<N;i++)
+   {
+      opus_val32 sum = SHL32(EXTEND32(x[i]), SIG_SHIFT);
+      for (j=0;j<ord;j++)
+         sum = MAC16_16(sum,rnum[j],x[i+j-ord]);
+      y[i] = SROUND16(sum, SIG_SHIFT);
+   }
+   RESTORE_STACK;
 }

-void iir(const opus_val32 *x,
+void celt_iir(const opus_val32 *_x,
         const opus_val16 *den,
-         opus_val32 *y,
+         opus_val32 *_y,
         int N,
         int ord,
-         opus_val16 *mem)
+         opus_val16 *mem,
+         int arch)
 {
+#ifdef SMALL_FOOTPRINT
   int i,j;
+   (void)arch;
   for (i=0;i<N;i++)
   {
-      opus_val32 sum = x[i];
+      opus_val32 sum = _x[i];
      for (j=0;j<ord;j++)
      {
         sum -= MULT16_16(den[j],mem[j]);
@@ -131,56 +206,159 @@ void iir(const opus_val32 *x,
      {
         mem[j]=mem[j-1];
      }
-      mem[0] = ROUND16(sum,SIG_SHIFT);
-      y[i] = sum;
+      mem[0] = SROUND16(sum, SIG_SHIFT);
+      _y[i] = sum;
   }
+#else
+   int i,j;
+   VARDECL(opus_val16, rden);
+   VARDECL(opus_val16, y);
+   SAVE_STACK;
+
+   celt_assert((ord&3)==0);
+   ALLOC(rden, ord, opus_val16);
+   ALLOC(y, N+ord, opus_val16);
+   for(i=0;i<ord;i++)
+      rden[i] = den[ord-i-1];
+   for(i=0;i<ord;i++)
+      y[i] = -mem[ord-i-1];
+   for(;i<N+ord;i++)
+      y[i]=0;
+   for (i=0;i<N-3;i+=4)
+   {
+      /* Unroll by 4 as if it were an FIR filter */
+      opus_val32 sum[4];
+      sum[0]=_x[i];
+      sum[1]=_x[i+1];
+      sum[2]=_x[i+2];
+      sum[3]=_x[i+3];
+#if defined(OPUS_CHECK_ASM) && defined(FIXED_POINT)
+      {
+         opus_val32 sum_c[4];
+         memcpy(sum_c, sum, sizeof(sum_c));
+         xcorr_kernel_c(rden, y+i, sum_c, ord);
+#endif
+         xcorr_kernel(rden, y+i, sum, ord, arch);
+#if defined(OPUS_CHECK_ASM) && defined(FIXED_POINT)
+         celt_assert(memcmp(sum, sum_c, sizeof(sum)) == 0);
+      }
+#endif
+      /* Patch up the result to compensate for the fact that this is an IIR */
+      y[i+ord  ] = -SROUND16(sum[0],SIG_SHIFT);
+      _y[i  ] = sum[0];
+      sum[1] = MAC16_16(sum[1], y[i+ord  ], den[0]);
+      y[i+ord+1] = -SROUND16(sum[1],SIG_SHIFT);
+      _y[i+1] = sum[1];
+      sum[2] = MAC16_16(sum[2], y[i+ord+1], den[0]);
+      sum[2] = MAC16_16(sum[2], y[i+ord  ], den[1]);
+      y[i+ord+2] = -SROUND16(sum[2],SIG_SHIFT);
+      _y[i+2] = sum[2];
+
+      sum[3] = MAC16_16(sum[3], y[i+ord+2], den[0]);
+      sum[3] = MAC16_16(sum[3], y[i+ord+1], den[1]);
+      sum[3] = MAC16_16(sum[3], y[i+ord  ], den[2]);
+      y[i+ord+3] = -SROUND16(sum[3],SIG_SHIFT);
+      _y[i+3] = sum[3];
+   }
+   for (;i<N;i++)
+   {
+      opus_val32 sum = _x[i];
+      for (j=0;j<ord;j++)
+         sum -= MULT16_16(rden[j],y[i+j]);
+      y[i+ord] = SROUND16(sum,SIG_SHIFT);
+      _y[i] = sum;
+   }
+   for(i=0;i<ord;i++)
+      mem[i] = _y[N-i-1];
+   RESTORE_STACK;
+#endif
 }

-void _celt_autocorr(
+int _celt_autocorr(
                   const opus_val16 *x,   /*  in: [0...n-1] samples x   */
                   opus_val32       *ac,  /* out: [0...lag-1] ac values */
-                   const opus_val16       *window,
+                   const celt_coef  *window,
                   int          overlap,
                   int          lag,
-                   int          n
+                   int          n,
+                   int          arch
                  )
 {
   opus_val32 d;
-   int i;
+   int i, k;
+   int fastN=n-lag;
+   int shift;
+   const opus_val16 *xptr;
   VARDECL(opus_val16, xx);
   SAVE_STACK;
   ALLOC(xx, n, opus_val16);
-   for (i=0;i<n;i++)
-      xx[i] = x[i];
-   for (i=0;i<overlap;i++)
+   celt_assert(n>0);
+   celt_assert(overlap>=0);
+   if (overlap == 0)
   {
-      xx[i] = MULT16_16_Q15(x[i],window[i]);
-      xx[n-i-1] = MULT16_16_Q15(x[n-i-1],window[i]);
+      xptr = x;
+   } else {
+      for (i=0;i<n;i++)
+         xx[i] = x[i];
+      for (i=0;i<overlap;i++)
+      {
+         opus_val16 w = COEF2VAL16(window[i]);
+         xx[i] = MULT16_16_Q15(x[i],w);
+         xx[n-i-1] = MULT16_16_Q15(x[n-i-1],w);
+      }
+      xptr = xx;
   }
+   shift=0;
 #ifdef FIXED_POINT
   {
-      opus_val32 ac0=0;
-      int shift;
-      for(i=0;i<n;i++)
-         ac0 += SHR32(MULT16_16(xx[i],xx[i]),8);
-      ac0 += 1+n;
-
-      shift = celt_ilog2(ac0)-30+9;
-      shift = (shift+1)/2;
-      for(i=0;i<n;i++)
-         xx[i] = VSHR32(xx[i], shift);
+      opus_val32 ac0;
+      ac0 = 1+(n<<7);
+      if (n&1) ac0 += SHR32(MULT16_16(xptr[0],xptr[0]),9);
+      for(i=(n&1);i<n;i+=2)
+      {
+         ac0 += SHR32(MULT16_16(xptr[i],xptr[i]),9);
+         ac0 += SHR32(MULT16_16(xptr[i+1],xptr[i+1]),9);
+      }
+
+      shift = celt_ilog2(ac0)-30+10;
+      shift = (shift)/2;
+      if (shift>0)
+      {
+         for(i=0;i<n;i++)
+            xx[i] = PSHR32(xptr[i], shift);
+         xptr = xx;
+      } else
+         shift = 0;
   }
 #endif
-   while (lag>=0)
+   celt_pitch_xcorr(xptr, xptr, ac, fastN, lag+1, arch);
+   for (k=0;k<=lag;k++)
   {
-      for (i = lag, d = 0; i < n; i++)
-         d += xx[i] * xx[i-lag];
-      ac[lag] = d;
-      /*printf ("%f ", ac[lag]);*/
-      lag--;
+      for (i = k+fastN, d = 0; i < n; i++)
+         d = MAC16_16(d, xptr[i], xptr[i-k]);
+      ac[k] += d;
   }
-   /*printf ("\n");*/
-   ac[0] += 10;
+#ifdef FIXED_POINT
+   shift = 2*shift;
+   if (shift<=0)
+      ac[0] += SHL32((opus_int32)1, -shift);
+   if (ac[0] < 268435456)
+   {
+      int shift2 = 29 - EC_ILOG(ac[0]);
+      for (i=0;i<=lag;i++)
+         ac[i] = SHL32(ac[i], shift2);
+      shift -= shift2;
+   } else if (ac[0] >= 536870912)
+   {
+      int shift2=1;
+      if (ac[0] >= 1073741824)
+         shift2++;
+      for (i=0;i<=lag;i++)
+         ac[i] = SHR32(ac[i], shift2);
+      shift += shift2;
+   }
+#endif

   RESTORE_STACK;
+   return shift;
 }
--- a/libcelt/plc.h
+++ b/libcelt/plc.h
@@ -15,8 +15,8 @@
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
-   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
@@ -29,25 +29,38 @@
 #define PLC_H

 #include "arch.h"
+#include "cpu_support.h"

-#define LPC_ORDER 24
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#include "x86/celt_lpc_sse.h"
+#endif
+
+#define CELT_LPC_ORDER 24

 void _celt_lpc(opus_val16 *_lpc, const opus_val32 *ac, int p);

-void fir(const opus_val16 *x,
+void celt_fir_c(
+         const opus_val16 *x,
         const opus_val16 *num,
         opus_val16 *y,
         int N,
         int ord,
-         opus_val16 *mem);
+         int arch);
+
+#if !defined(OVERRIDE_CELT_FIR)
+#define celt_fir(x, num, y, N, ord, arch) \
+    (celt_fir_c(x, num, y, N, ord, arch))
+#endif

-void iir(const opus_val32 *x,
+void celt_iir(const opus_val32 *x,
         const opus_val16 *den,
         opus_val32 *y,
         int N,
         int ord,
-         opus_val16 *mem);
+         opus_val16 *mem,
+         int arch);

-void _celt_autocorr(const opus_val16 *x, opus_val32 *ac, const opus_val16 *window, int overlap, int lag, int n);
+int _celt_autocorr(const opus_val16 *x, opus_val32 *ac,
+         const celt_coef *window, int overlap, int lag, int n, int arch);

 #endif /* PLC_H */
--- a/celt/cpu_support.h
+++ b/celt/cpu_support.h
+/* Copyright (c) 2010 Xiph.Org Foundation
+ * Copyright (c) 2013 Parrot */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef CPU_SUPPORT_H
+#define CPU_SUPPORT_H
+
+#include "opus_types.h"
+#include "opus_defines.h"
+
+#if defined(OPUS_HAVE_RTCD) && \
+  (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
+#include "arm/armcpu.h"
+
+/* We currently support 5 ARM variants:
+ * arch[0] -> ARMv4
+ * arch[1] -> ARMv5E
+ * arch[2] -> ARMv6
+ * arch[3] -> NEON
+ * arch[4] -> NEON+DOTPROD
+ */
+#define OPUS_ARCHMASK 7
+
+#elif defined(OPUS_HAVE_RTCD) && \
+  ((defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
+  (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \
+  (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \
+  (defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_PRESUME_AVX2)))
+
+#include "x86/x86cpu.h"
+/* We currently support 5 x86 variants:
+ * arch[0] -> non-sse
+ * arch[1] -> sse
+ * arch[2] -> sse2
+ * arch[3] -> sse4.1
+ * arch[4] -> avx
+ */
+#define OPUS_ARCHMASK 7
+int opus_select_arch(void);
+
+#else
+#define OPUS_ARCHMASK 0
+
+static OPUS_INLINE int opus_select_arch(void)
+{
+  return 0;
+}
+#endif
+#endif
--- a/libcelt/cwrs.c
+++ b/libcelt/cwrs.c
@@ -17,8 +17,8 @@
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
-   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
@@ -32,8 +32,6 @@
 #endif

 #include "os_support.h"
-#include <stdlib.h>
-#include <string.h>
 #include "cwrs.h"
 #include "mathops.h"
 #include "arch.h"
@@ -48,92 +46,35 @@ int log2_frac(opus_uint32 val, int frac)
 {
  int l;
  l=EC_ILOG(val);
-  if(val&val-1){
+  if(val&(val-1)){
    /*This is (val>>l-16), but guaranteed to round up, even if adding a bias
-       before the shift would cause overflow (e.g., for 0xFFFFxxxx).*/
-    if(l>16)val=(val>>l-16)+((val&(1<<l-16)-1)+(1<<l-16)-1>>l-16);
+       before the shift would cause overflow (e.g., for 0xFFFFxxxx).
+       Doesn't work for val=0, but that case fails the test above.*/
+    if(l>16)val=((val-1)>>(l-16))+1;
    else val<<=16-l;
-    l=l-1<<frac;
+    l=(l-1)<<frac;
    /*Note that we always need one iteration, since the rounding up above means
       that we might need to adjust the integer part of the logarithm.*/
    do{
      int b;
      b=(int)(val>>16);
      l+=b<<frac;
-      val=val+b>>b;
-      val=val*val+0x7FFF>>15;
+      val=(val+b)>>b;
+      val=(val*val+0x7FFF)>>15;
    }
    while(frac-->0);
    /*If val is not exactly 0x8000, then we have to round up the remainder.*/
    return l+(val>0x8000);
  }
  /*Exact powers of two require no rounding.*/
-  else return l-1<<frac;
+  else return (l-1)<<frac;
 }
 #endif

-#ifndef SMALL_FOOTPRINT
-
-#define MASK32 (0xFFFFFFFF)
-
-/*INV_TABLE[i] holds the multiplicative inverse of (2*i+1) mod 2**32.*/
-static const opus_uint32 INV_TABLE[53]={
-  0x00000001,0xAAAAAAAB,0xCCCCCCCD,0xB6DB6DB7,
-  0x38E38E39,0xBA2E8BA3,0xC4EC4EC5,0xEEEEEEEF,
-  0xF0F0F0F1,0x286BCA1B,0x3CF3CF3D,0xE9BD37A7,
-  0xC28F5C29,0x684BDA13,0x4F72C235,0xBDEF7BDF,
-  0x3E0F83E1,0x8AF8AF8B,0x914C1BAD,0x96F96F97,
-  0xC18F9C19,0x2FA0BE83,0xA4FA4FA5,0x677D46CF,
-  0x1A1F58D1,0xFAFAFAFB,0x8C13521D,0x586FB587,
-  0xB823EE09,0xA08AD8F3,0xC10C9715,0xBEFBEFBF,
-  0xC0FC0FC1,0x07A44C6B,0xA33F128D,0xE327A977,
-  0xC7E3F1F9,0x962FC963,0x3F2B3885,0x613716AF,
-  0x781948B1,0x2B2E43DB,0xFCFCFCFD,0x6FD0EB67,
-  0xFA3F47E9,0xD2FD2FD3,0x3F4FD3F5,0xD4E25B9F,
-  0x5F02A3A1,0xBF5A814B,0x7C32B16D,0xD3431B57,
-  0xD8FD8FD9,
-};
-
-/*Computes (_a*_b-_c)/(2*_d+1) when the quotient is known to be exact.
-  _a, _b, _c, and _d may be arbitrary so long as the arbitrary precision result
-   fits in 32 bits, but currently the table for multiplicative inverses is only
-   valid for _d<=52.*/
-static inline opus_uint32 imusdiv32odd(opus_uint32 _a,opus_uint32 _b,
- opus_uint32 _c,int _d){
-  celt_assert(_d<=52);
-  return (_a*_b-_c)*INV_TABLE[_d]&MASK32;
-}
-
-/*Computes (_a*_b-_c)/_d when the quotient is known to be exact.
-  _d does not actually have to be even, but imusdiv32odd will be faster when
-   it's odd, so you should use that instead.
-  _a and _d are assumed to be small (e.g., _a*_d fits in 32 bits; currently the
-   table for multiplicative inverses is only valid for _d<=54).
-  _b and _c may be arbitrary so long as the arbitrary precision reuslt fits in
-   32 bits.*/
-static inline opus_uint32 imusdiv32even(opus_uint32 _a,opus_uint32 _b,
- opus_uint32 _c,int _d){
-  opus_uint32 inv;
-  int           mask;
-  int           shift;
-  int           one;
-  celt_assert(_d>0);
-  celt_assert(_d<=54);
-  shift=EC_ILOG(_d^_d-1);
-  inv=INV_TABLE[_d-1>>shift];
-  shift--;
-  one=1<<shift;
-  mask=one-1;
-  return (_a*(_b>>shift)-(_c>>shift)+
-   (_a*(_b&mask)+one-(_c&mask)>>shift)-1)*inv&MASK32;
-}
-
-#endif /* SMALL_FOOTPRINT */
-
 /*Although derived separately, the pulse vector coding scheme is equivalent to
   a Pyramid Vector Quantizer \cite{Fis86}.
  Some additional notes about an early version appear at
-   http://people.xiph.org/~tterribe/notes/cwrs.html, but the codebook ordering
+   https://people.xiph.org/~tterribe/notes/cwrs.html, but the codebook ordering
   and the definitions of some terms have evolved since that was written.

  The conversion from a pulse vector to an integer index (encoding) and back
@@ -249,60 +190,362 @@ static inline opus_uint32 imusdiv32even(opus_uint32 _a,opus_uint32 _b,
    year=1986
  }*/

-#ifndef SMALL_FOOTPRINT
-/*Compute V(1,_k).*/
-static inline unsigned ncwrs1(int _k){
-  return _k?2:1;
-}
-
-/*Compute U(2,_k).
-  Note that this may be called with _k=32768 (maxK[2]+1).*/
-static inline unsigned ucwrs2(unsigned _k){
-  return _k?_k+(_k-1):0;
-}
-
-/*Compute V(2,_k).*/
-static inline opus_uint32 ncwrs2(int _k){
-  return _k?4*(opus_uint32)_k:1;
-}
+#if !defined(SMALL_FOOTPRINT)
+
+/*U(N,K) = U(K,N) := N>0?K>0?U(N-1,K)+U(N,K-1)+U(N-1,K-1):0:K>0?1:0*/
+# define CELT_PVQ_U(_n,_k) (CELT_PVQ_U_ROW[IMIN(_n,_k)][IMAX(_n,_k)])
+/*V(N,K) := U(N,K)+U(N,K+1) = the number of PVQ codewords for a band of size N
+   with K pulses allocated to it.*/
+# define CELT_PVQ_V(_n,_k) (CELT_PVQ_U(_n,_k)+CELT_PVQ_U(_n,(_k)+1))
+
+/*For each V(N,K) supported, we will access element U(min(N,K+1),max(N,K+1)).
+  Thus, the number of entries in row I is the larger of the maximum number of
+   pulses we will ever allocate for a given N=I (K=128, or however many fit in
+   32 bits, whichever is smaller), plus one, and the maximum N for which
+   K=I-1 pulses fit in 32 bits.
+  The largest band size in an Opus Custom mode is 208.
+  Otherwise, we can limit things to the set of N which can be achieved by
+   splitting a band from a standard Opus mode: 176, 144, 96, 88, 72, 64, 48,
+   44, 36, 32, 24, 22, 18, 16, 8, 4, 2).*/
+#if defined(CUSTOM_MODES)
+static const opus_uint32 CELT_PVQ_U_DATA[1488]={
+#else
+static const opus_uint32 CELT_PVQ_U_DATA[1272]={
+#endif
+  /*N=0, K=0...176:*/
+  1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+#if defined(CUSTOM_MODES)
+  /*...208:*/
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0,
+#endif
+  /*N=1, K=1...176:*/
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+#if defined(CUSTOM_MODES)
+  /*...208:*/
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1,
+#endif
+  /*N=2, K=2...176:*/
+  3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41,
+  43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67, 69, 71, 73, 75, 77, 79,
+  81, 83, 85, 87, 89, 91, 93, 95, 97, 99, 101, 103, 105, 107, 109, 111, 113,
+  115, 117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137, 139, 141, 143,
+  145, 147, 149, 151, 153, 155, 157, 159, 161, 163, 165, 167, 169, 171, 173,
+  175, 177, 179, 181, 183, 185, 187, 189, 191, 193, 195, 197, 199, 201, 203,
+  205, 207, 209, 211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233,
+  235, 237, 239, 241, 243, 245, 247, 249, 251, 253, 255, 257, 259, 261, 263,
+  265, 267, 269, 271, 273, 275, 277, 279, 281, 283, 285, 287, 289, 291, 293,
+  295, 297, 299, 301, 303, 305, 307, 309, 311, 313, 315, 317, 319, 321, 323,
+  325, 327, 329, 331, 333, 335, 337, 339, 341, 343, 345, 347, 349, 351,
+#if defined(CUSTOM_MODES)
+  /*...208:*/
+  353, 355, 357, 359, 361, 363, 365, 367, 369, 371, 373, 375, 377, 379, 381,
+  383, 385, 387, 389, 391, 393, 395, 397, 399, 401, 403, 405, 407, 409, 411,
+  413, 415,
+#endif
+  /*N=3, K=3...176:*/
+  13, 25, 41, 61, 85, 113, 145, 181, 221, 265, 313, 365, 421, 481, 545, 613,
+  685, 761, 841, 925, 1013, 1105, 1201, 1301, 1405, 1513, 1625, 1741, 1861,
+  1985, 2113, 2245, 2381, 2521, 2665, 2813, 2965, 3121, 3281, 3445, 3613, 3785,
+  3961, 4141, 4325, 4513, 4705, 4901, 5101, 5305, 5513, 5725, 5941, 6161, 6385,
+  6613, 6845, 7081, 7321, 7565, 7813, 8065, 8321, 8581, 8845, 9113, 9385, 9661,
+  9941, 10225, 10513, 10805, 11101, 11401, 11705, 12013, 12325, 12641, 12961,
+  13285, 13613, 13945, 14281, 14621, 14965, 15313, 15665, 16021, 16381, 16745,
+  17113, 17485, 17861, 18241, 18625, 19013, 19405, 19801, 20201, 20605, 21013,
+  21425, 21841, 22261, 22685, 23113, 23545, 23981, 24421, 24865, 25313, 25765,
+  26221, 26681, 27145, 27613, 28085, 28561, 29041, 29525, 30013, 30505, 31001,
+  31501, 32005, 32513, 33025, 33541, 34061, 34585, 35113, 35645, 36181, 36721,
+  37265, 37813, 38365, 38921, 39481, 40045, 40613, 41185, 41761, 42341, 42925,
+  43513, 44105, 44701, 45301, 45905, 46513, 47125, 47741, 48361, 48985, 49613,
+  50245, 50881, 51521, 52165, 52813, 53465, 54121, 54781, 55445, 56113, 56785,
+  57461, 58141, 58825, 59513, 60205, 60901, 61601,
+#if defined(CUSTOM_MODES)
+  /*...208:*/
+  62305, 63013, 63725, 64441, 65161, 65885, 66613, 67345, 68081, 68821, 69565,
+  70313, 71065, 71821, 72581, 73345, 74113, 74885, 75661, 76441, 77225, 78013,
+  78805, 79601, 80401, 81205, 82013, 82825, 83641, 84461, 85285, 86113,
+#endif
+  /*N=4, K=4...176:*/
+  63, 129, 231, 377, 575, 833, 1159, 1561, 2047, 2625, 3303, 4089, 4991, 6017,
+  7175, 8473, 9919, 11521, 13287, 15225, 17343, 19649, 22151, 24857, 27775,
+  30913, 34279, 37881, 41727, 45825, 50183, 54809, 59711, 64897, 70375, 76153,
+  82239, 88641, 95367, 102425, 109823, 117569, 125671, 134137, 142975, 152193,
+  161799, 171801, 182207, 193025, 204263, 215929, 228031, 240577, 253575,
+  267033, 280959, 295361, 310247, 325625, 341503, 357889, 374791, 392217,
+  410175, 428673, 447719, 467321, 487487, 508225, 529543, 551449, 573951,
+  597057, 620775, 645113, 670079, 695681, 721927, 748825, 776383, 804609,
+  833511, 863097, 893375, 924353, 956039, 988441, 1021567, 1055425, 1090023,
+  1125369, 1161471, 1198337, 1235975, 1274393, 1313599, 1353601, 1394407,
+  1436025, 1478463, 1521729, 1565831, 1610777, 1656575, 1703233, 1750759,
+  1799161, 1848447, 1898625, 1949703, 2001689, 2054591, 2108417, 2163175,
+  2218873, 2275519, 2333121, 2391687, 2451225, 2511743, 2573249, 2635751,
+  2699257, 2763775, 2829313, 2895879, 2963481, 3032127, 3101825, 3172583,
+  3244409, 3317311, 3391297, 3466375, 3542553, 3619839, 3698241, 3777767,
+  3858425, 3940223, 4023169, 4107271, 4192537, 4278975, 4366593, 4455399,
+  4545401, 4636607, 4729025, 4822663, 4917529, 5013631, 5110977, 5209575,
+  5309433, 5410559, 5512961, 5616647, 5721625, 5827903, 5935489, 6044391,
+  6154617, 6266175, 6379073, 6493319, 6608921, 6725887, 6844225, 6963943,
+  7085049, 7207551,
+#if defined(CUSTOM_MODES)
+  /*...208:*/
+  7331457, 7456775, 7583513, 7711679, 7841281, 7972327, 8104825, 8238783,
+  8374209, 8511111, 8649497, 8789375, 8930753, 9073639, 9218041, 9363967,
+  9511425, 9660423, 9810969, 9963071, 10116737, 10271975, 10428793, 10587199,
+  10747201, 10908807, 11072025, 11236863, 11403329, 11571431, 11741177,
+  11912575,
+#endif
+  /*N=5, K=5...176:*/
+  321, 681, 1289, 2241, 3649, 5641, 8361, 11969, 16641, 22569, 29961, 39041,
+  50049, 63241, 78889, 97281, 118721, 143529, 172041, 204609, 241601, 283401,
+  330409, 383041, 441729, 506921, 579081, 658689, 746241, 842249, 947241,
+  1061761, 1186369, 1321641, 1468169, 1626561, 1797441, 1981449, 2179241,
+  2391489, 2618881, 2862121, 3121929, 3399041, 3694209, 4008201, 4341801,
+  4695809, 5071041, 5468329, 5888521, 6332481, 6801089, 7295241, 7815849,
+  8363841, 8940161, 9545769, 10181641, 10848769, 11548161, 12280841, 13047849,
+  13850241, 14689089, 15565481, 16480521, 17435329, 18431041, 19468809,
+  20549801, 21675201, 22846209, 24064041, 25329929, 26645121, 28010881,
+  29428489, 30899241, 32424449, 34005441, 35643561, 37340169, 39096641,
+  40914369, 42794761, 44739241, 46749249, 48826241, 50971689, 53187081,
+  55473921, 57833729, 60268041, 62778409, 65366401, 68033601, 70781609,
+  73612041, 76526529, 79526721, 82614281, 85790889, 89058241, 92418049,
+  95872041, 99421961, 103069569, 106816641, 110664969, 114616361, 118672641,
+  122835649, 127107241, 131489289, 135983681, 140592321, 145317129, 150160041,
+  155123009, 160208001, 165417001, 170752009, 176215041, 181808129, 187533321,
+  193392681, 199388289, 205522241, 211796649, 218213641, 224775361, 231483969,
+  238341641, 245350569, 252512961, 259831041, 267307049, 274943241, 282741889,
+  290705281, 298835721, 307135529, 315607041, 324252609, 333074601, 342075401,
+  351257409, 360623041, 370174729, 379914921, 389846081, 399970689, 410291241,
+  420810249, 431530241, 442453761, 453583369, 464921641, 476471169, 488234561,
+  500214441, 512413449, 524834241, 537479489, 550351881, 563454121, 576788929,
+  590359041, 604167209, 618216201, 632508801,
+#if defined(CUSTOM_MODES)
+  /*...208:*/
+  647047809, 661836041, 676876329, 692171521, 707724481, 723538089, 739615241,
+  755958849, 772571841, 789457161, 806617769, 824056641, 841776769, 859781161,
+  878072841, 896654849, 915530241, 934702089, 954173481, 973947521, 994027329,
+  1014416041, 1035116809, 1056132801, 1077467201, 1099123209, 1121104041,
+  1143412929, 1166053121, 1189027881, 1212340489, 1235994241,
+#endif
+  /*N=6, K=6...96:*/
+  1683, 3653, 7183, 13073, 22363, 36365, 56695, 85305, 124515, 177045, 246047,
+  335137, 448427, 590557, 766727, 982729, 1244979, 1560549, 1937199, 2383409,
+  2908411, 3522221, 4235671, 5060441, 6009091, 7095093, 8332863, 9737793,
+  11326283, 13115773, 15124775, 17372905, 19880915, 22670725, 25765455,
+  29189457, 32968347, 37129037, 41699767, 46710137, 52191139, 58175189,
+  64696159, 71789409, 79491819, 87841821, 96879431, 106646281, 117185651,
+  128542501, 140763503, 153897073, 167993403, 183104493, 199284183, 216588185,
+  235074115, 254801525, 275831935, 298228865, 322057867, 347386557, 374284647,
+  402823977, 433078547, 465124549, 499040399, 534906769, 572806619, 612825229,
+  655050231, 699571641, 746481891, 795875861, 847850911, 902506913, 959946283,
+  1020274013, 1083597703, 1150027593, 1219676595, 1292660325, 1369097135,
+  1449108145, 1532817275, 1620351277, 1711839767, 1807415257, 1907213187,
+  2011371957, 2120032959,
+#if defined(CUSTOM_MODES)
+  /*...109:*/
+  2233340609U, 2351442379U, 2474488829U, 2602633639U, 2736033641U, 2874848851U,
+  3019242501U, 3169381071U, 3325434321U, 3487575323U, 3655980493U, 3830829623U,
+  4012305913U,
+#endif
+  /*N=7, K=7...54*/
+  8989, 19825, 40081, 75517, 134245, 227305, 369305, 579125, 880685, 1303777,
+  1884961, 2668525, 3707509, 5064793, 6814249, 9041957, 11847485, 15345233,
+  19665841, 24957661, 31388293, 39146185, 48442297, 59511829, 72616013,
+  88043969, 106114625, 127178701, 151620757, 179861305, 212358985, 249612805,
+  292164445, 340600625, 395555537, 457713341, 527810725, 606639529, 695049433,
+  793950709, 904317037, 1027188385, 1163673953, 1314955181, 1482288821,
+  1667010073, 1870535785, 2094367717,
+#if defined(CUSTOM_MODES)
+  /*...60:*/
+  2340095869U, 2609401873U, 2904062449U, 3225952925U, 3577050821U, 3959439497U,
+#endif
+  /*N=8, K=8...37*/
+  48639, 108545, 224143, 433905, 795455, 1392065, 2340495, 3800305, 5984767,
+  9173505, 13726991, 20103025, 28875327, 40754369, 56610575, 77500017,
+  104692735, 139703809, 184327311, 240673265, 311207743, 398796225, 506750351,
+  638878193, 799538175, 993696769, 1226990095, 1505789553, 1837271615,
+  2229491905U,
+#if defined(CUSTOM_MODES)
+  /*...40:*/
+  2691463695U, 3233240945U, 3866006015U,
+#endif
+  /*N=9, K=9...28:*/
+  265729, 598417, 1256465, 2485825, 4673345, 8405905, 14546705, 24331777,
+  39490049, 62390545, 96220561, 145198913, 214828609, 312193553, 446304145,
+  628496897, 872893441, 1196924561, 1621925137, 2173806145U,
+#if defined(CUSTOM_MODES)
+  /*...29:*/
+  2883810113U,
+#endif
+  /*N=10, K=10...24:*/
+  1462563, 3317445, 7059735, 14218905, 27298155, 50250765, 89129247, 152951073,
+  254831667, 413442773, 654862247, 1014889769, 1541911931, 2300409629U,
+  3375210671U,
+  /*N=11, K=11...19:*/
+  8097453, 18474633, 39753273, 81270333, 158819253, 298199265, 540279585,
+  948062325, 1616336765,
+#if defined(CUSTOM_MODES)
+  /*...20:*/
+  2684641785U,
+#endif
+  /*N=12, K=12...18:*/
+  45046719, 103274625, 224298231, 464387817, 921406335, 1759885185,
+  3248227095U,
+  /*N=13, K=13...16:*/
+  251595969, 579168825, 1267854873, 2653649025U,
+  /*N=14, K=14:*/
+  1409933619
+};

-/*Compute U(3,_k).
-  Note that this may be called with _k=32768 (maxK[3]+1).*/
-static inline opus_uint32 ucwrs3(unsigned _k){
-  return _k?(2*(opus_uint32)_k-2)*_k+1:0;
-}
+#if defined(CUSTOM_MODES)
+static const opus_uint32 *const CELT_PVQ_U_ROW[15]={
+  CELT_PVQ_U_DATA+   0,CELT_PVQ_U_DATA+ 208,CELT_PVQ_U_DATA+ 415,
+  CELT_PVQ_U_DATA+ 621,CELT_PVQ_U_DATA+ 826,CELT_PVQ_U_DATA+1030,
+  CELT_PVQ_U_DATA+1233,CELT_PVQ_U_DATA+1336,CELT_PVQ_U_DATA+1389,
+  CELT_PVQ_U_DATA+1421,CELT_PVQ_U_DATA+1441,CELT_PVQ_U_DATA+1455,
+  CELT_PVQ_U_DATA+1464,CELT_PVQ_U_DATA+1470,CELT_PVQ_U_DATA+1473
+};
+#else
+static const opus_uint32 *const CELT_PVQ_U_ROW[15]={
+  CELT_PVQ_U_DATA+   0,CELT_PVQ_U_DATA+ 176,CELT_PVQ_U_DATA+ 351,
+  CELT_PVQ_U_DATA+ 525,CELT_PVQ_U_DATA+ 698,CELT_PVQ_U_DATA+ 870,
+  CELT_PVQ_U_DATA+1041,CELT_PVQ_U_DATA+1131,CELT_PVQ_U_DATA+1178,
+  CELT_PVQ_U_DATA+1207,CELT_PVQ_U_DATA+1226,CELT_PVQ_U_DATA+1240,
+  CELT_PVQ_U_DATA+1248,CELT_PVQ_U_DATA+1254,CELT_PVQ_U_DATA+1257
+};
+#endif

-/*Compute V(3,_k).*/
-static inline opus_uint32 ncwrs3(int _k){
-  return _k?2*(2*(unsigned)_k*(opus_uint32)_k+1):1;
+#if defined(CUSTOM_MODES)
+void get_required_bits(opus_int16 *_bits,int _n,int _maxk,int _frac){
+  int k;
+  /*_maxk==0 => there's nothing to do.*/
+  celt_assert(_maxk>0);
+  _bits[0]=0;
+  for(k=1;k<=_maxk;k++)_bits[k]=log2_frac(CELT_PVQ_V(_n,k),_frac);
 }
+#endif

-/*Compute U(4,_k).*/
-static inline opus_uint32 ucwrs4(int _k){
-  return _k?imusdiv32odd(2*_k,(2*_k-3)*(opus_uint32)_k+4,3,1):0;
+static opus_uint32 icwrs(int _n,const int *_y){
+  opus_uint32 i;
+  int         j;
+  int         k;
+  celt_assert(_n>=2);
+  j=_n-1;
+  i=_y[j]<0;
+  k=abs(_y[j]);
+  do{
+    j--;
+    i+=CELT_PVQ_U(_n-j,k);
+    k+=abs(_y[j]);
+    if(_y[j]<0)i+=CELT_PVQ_U(_n-j,k+1);
+  }
+  while(j>0);
+  return i;
 }

-/*Compute V(4,_k).*/
-static inline opus_uint32 ncwrs4(int _k){
-  return _k?((_k*(opus_uint32)_k+2)*_k)/3<<3:1;
+void encode_pulses(const int *_y,int _n,int _k,ec_enc *_enc){
+  celt_assert(_k>0);
+  ec_enc_uint(_enc,icwrs(_n,_y),CELT_PVQ_V(_n,_k));
 }

-/*Compute U(5,_k).*/
-static inline opus_uint32 ucwrs5(int _k){
-  return _k?(((((_k-2)*(unsigned)_k+5)*(opus_uint32)_k-4)*_k)/3<<1)+1:0;
+static opus_val32 cwrsi(int _n,int _k,opus_uint32 _i,int *_y){
+  opus_uint32 p;
+  int         s;
+  int         k0;
+  opus_int16  val;
+  opus_val32  yy=0;
+  celt_assert(_k>0);
+  celt_assert(_n>1);
+  while(_n>2){
+    opus_uint32 q;
+    /*Lots of pulses case:*/
+    if(_k>=_n){
+      const opus_uint32 *row;
+      row=CELT_PVQ_U_ROW[_n];
+      /*Are the pulses in this dimension negative?*/
+      p=row[_k+1];
+      s=-(_i>=p);
+      _i-=p&s;
+      /*Count how many pulses were placed in this dimension.*/
+      k0=_k;
+      q=row[_n];
+      if(q>_i){
+        celt_sig_assert(p>q);
+        _k=_n;
+        do p=CELT_PVQ_U_ROW[--_k][_n];
+        while(p>_i);
+      }
+      else for(p=row[_k];p>_i;p=row[_k])_k--;
+      _i-=p;
+      val=(k0-_k+s)^s;
+      *_y++=val;
+      yy=MAC16_16(yy,val,val);
+    }
+    /*Lots of dimensions case:*/
+    else{
+      /*Are there any pulses in this dimension at all?*/
+      p=CELT_PVQ_U_ROW[_k][_n];
+      q=CELT_PVQ_U_ROW[_k+1][_n];
+      if(p<=_i&&_i<q){
+        _i-=p;
+        *_y++=0;
+      }
+      else{
+        /*Are the pulses in this dimension negative?*/
+        s=-(_i>=q);
+        _i-=q&s;
+        /*Count how many pulses were placed in this dimension.*/
+        k0=_k;
+        do p=CELT_PVQ_U_ROW[--_k][_n];
+        while(p>_i);
+        _i-=p;
+        val=(k0-_k+s)^s;
+        *_y++=val;
+        yy=MAC16_16(yy,val,val);
+      }
+    }
+    _n--;
+  }
+  /*_n==2*/
+  p=2*_k+1;
+  s=-(_i>=p);
+  _i-=p&s;
+  k0=_k;
+  _k=(_i+1)>>1;
+  if(_k)_i-=2*_k-1;
+  val=(k0-_k+s)^s;
+  *_y++=val;
+  yy=MAC16_16(yy,val,val);
+  /*_n==1*/
+  s=-(int)_i;
+  val=(_k+s)^s;
+  *_y=val;
+  yy=MAC16_16(yy,val,val);
+  return yy;
 }

-/*Compute V(5,_k).*/
-static inline opus_uint32 ncwrs5(int _k){
-  return _k?(((_k*(unsigned)_k+5)*(opus_uint32)_k*_k)/3<<2)+2:1;
+opus_val32 decode_pulses(int *_y,int _n,int _k,ec_dec *_dec){
+  return cwrsi(_n,_k,ec_dec_uint(_dec,CELT_PVQ_V(_n,_k)),_y);
 }

-#endif /* SMALL_FOOTPRINT */
+#else /* SMALL_FOOTPRINT */

 /*Computes the next row/column of any recurrence that obeys the relation
   u[i][j]=u[i-1][j]+u[i][j-1]+u[i-1][j-1].
  _ui0 is the base case for the new row/column.*/
-static inline void unext(opus_uint32 *_ui,unsigned _len,opus_uint32 _ui0){
+static OPUS_INLINE void unext(opus_uint32 *_ui,unsigned _len,opus_uint32 _ui0){
  opus_uint32 ui1;
  unsigned      j;
  /*This do-while will overrun the array if we don't have storage for at least
@@ -318,7 +561,7 @@ static inline void unext(opus_uint32 *_ui,unsigned _len,opus_uint32 _ui0){
 /*Computes the previous row/column of any recurrence that obeys the relation
   u[i-1][j]=u[i][j]-u[i][j-1]-u[i-1][j-1].
  _ui0 is the base case for the new row/column.*/
-static inline void uprev(opus_uint32 *_ui,unsigned _n,opus_uint32 _ui0){
+static OPUS_INLINE void uprev(opus_uint32 *_ui,unsigned _n,opus_uint32 _ui0){
  opus_uint32 ui1;
  unsigned      j;
  /*This do-while will overrun the array if we don't have storage for at least
@@ -342,132 +585,27 @@ static opus_uint32 ncwrs_urow(unsigned _n,unsigned _k,opus_uint32 *_u){
  celt_assert(len>=3);
  _u[0]=0;
  _u[1]=um2=1;
-#ifndef SMALL_FOOTPRINT
-  /*_k>52 doesn't work in the false branch due to the limits of INV_TABLE,
-    but _k isn't tested here because k<=52 for n=7*/
-  if(_n<=6)
-#endif
- {
-    /*If _n==0, _u[0] should be 1 and the rest should be 0.*/
-    /*If _n==1, _u[i] should be 1 for i>1.*/
-    celt_assert(_n>=2);
-    /*If _k==0, the following do-while loop will overflow the buffer.*/
-    celt_assert(_k>0);
-    k=2;
-    do _u[k]=(k<<1)-1;
-    while(++k<len);
-    for(k=2;k<_n;k++)unext(_u+1,_k+1,1);
-  }
-#ifndef SMALL_FOOTPRINT
-  else{
-    opus_uint32 um1;
-    opus_uint32 n2m1;
-    _u[2]=n2m1=um1=(_n<<1)-1;
-    for(k=3;k<len;k++){
-      /*U(N,K) = ((2*N-1)*U(N,K-1)-U(N,K-2))/(K-1) + U(N,K-2)*/
-      _u[k]=um2=imusdiv32even(n2m1,um1,um2,k-1)+um2;
-      if(++k>=len)break;
-      _u[k]=um1=imusdiv32odd(n2m1,um2,um1,k-1>>1)+um1;
-    }
-  }
-#endif /* SMALL_FOOTPRINT */
+  /*If _n==0, _u[0] should be 1 and the rest should be 0.*/
+  /*If _n==1, _u[i] should be 1 for i>1.*/
+  celt_assert(_n>=2);
+  /*If _k==0, the following do-while loop will overflow the buffer.*/
+  celt_assert(_k>0);
+  k=2;
+  do _u[k]=(k<<1)-1;
+  while(++k<len);
+  for(k=2;k<_n;k++)unext(_u+1,_k+1,1);
  return _u[_k]+_u[_k+1];
 }

-#ifndef SMALL_FOOTPRINT
-
-/*Returns the _i'th combination of _k elements (at most 32767) chosen from a
-   set of size 1 with associated sign bits.
-  _y: Returns the vector of pulses.*/
-static inline void cwrsi1(int _k,opus_uint32 _i,int *_y){
-  int s;
-  s=-(int)_i;
-  _y[0]=_k+s^s;
-}
-
-/*Returns the _i'th combination of _k elements (at most 32767) chosen from a
-   set of size 2 with associated sign bits.
-  _y: Returns the vector of pulses.*/
-static inline void cwrsi2(int _k,opus_uint32 _i,int *_y){
-  opus_uint32 p;
-  int           s;
-  int           yj;
-  p=ucwrs2(_k+1U);
-  s=-(_i>=p);
-  _i-=p&s;
-  yj=_k;
-  _k=_i+1>>1;
-  p=ucwrs2(_k);
-  _i-=p;
-  yj-=_k;
-  _y[0]=yj+s^s;
-  cwrsi1(_k,_i,_y+1);
-}
-
-/*Returns the _i'th combination of _k elements (at most 32767) chosen from a
-   set of size 3 with associated sign bits.
-  _y: Returns the vector of pulses.*/
-static void cwrsi3(int _k,opus_uint32 _i,int *_y){
-  opus_uint32 p;
-  int           s;
-  int           yj;
-  p=ucwrs3(_k+1U);
-  s=-(_i>=p);
-  _i-=p&s;
-  yj=_k;
-  /*Finds the maximum _k such that ucwrs3(_k)<=_i (tested for all
-     _i<2147418113=U(3,32768)).*/
-  _k=_i>0?isqrt32(2*_i-1)+1>>1:0;
-  p=ucwrs3(_k);
-  _i-=p;
-  yj-=_k;
-  _y[0]=yj+s^s;
-  cwrsi2(_k,_i,_y+1);
-}
-
-/*Returns the _i'th combination of _k elements (at most 1172) chosen from a set
-   of size 4 with associated sign bits.
-  _y: Returns the vector of pulses.*/
-static void cwrsi4(int _k,opus_uint32 _i,int *_y){
-  opus_uint32 p;
-  int           s;
-  int           yj;
-  int           kl;
-  int           kr;
-  p=ucwrs4(_k+1);
-  s=-(_i>=p);
-  _i-=p&s;
-  yj=_k;
-  /*We could solve a cubic for k here, but the form of the direct solution does
-     not lend itself well to exact integer arithmetic.
-    Instead we do a binary search on U(4,K).*/
-  kl=0;
-  kr=_k;
-  for(;;){
-    _k=kl+kr>>1;
-    p=ucwrs4(_k);
-    if(p<_i){
-      if(_k>=kr)break;
-      kl=_k+1;
-    }
-    else if(p>_i)kr=_k-1;
-    else break;
-  }
-  _i-=p;
-  yj-=_k;
-  _y[0]=yj+s^s;
-  cwrsi3(_k,_i,_y+1);
-}
-
-#endif /* SMALL_FOOTPRINT */
-
 /*Returns the _i'th combination of _k elements chosen from a set of size _n
   with associated sign bits.
  _y: Returns the vector of pulses.
  _u: Must contain entries [0..._k+1] of row _n of U() on input.
      Its contents will be destructively modified.*/
-static void cwrsi(int _n,int _k,opus_uint32 _i,int *_y,opus_uint32 *_u){
+static opus_val32 cwrsi(int _n,int _k,opus_uint32 _i,int *_y,opus_uint32 *_u){
  int j;
+  opus_int16 val;
+  opus_val32 yy=0;
  celt_assert(_n>0);
  j=0;
  do{
@@ -482,93 +620,33 @@ static void cwrsi(int _n,int _k,opus_uint32 _i,int *_y,opus_uint32 *_u){
    while(p>_i)p=_u[--_k];
    _i-=p;
    yj-=_k;
-    _y[j]=yj+s^s;
+    val=(yj+s)^s;
+    _y[j]=val;
+    yy=MAC16_16(yy,val,val);
    uprev(_u,_k+2,0);
  }
  while(++j<_n);
+  return yy;
 }

 /*Returns the index of the given combination of K elements chosen from a set
   of size 1 with associated sign bits.
  _y: The vector of pulses, whose sum of absolute values is K.
  _k: Returns K.*/
-static inline opus_uint32 icwrs1(const int *_y,int *_k){
+static OPUS_INLINE opus_uint32 icwrs1(const int *_y,int *_k){
  *_k=abs(_y[0]);
  return _y[0]<0;
 }

-#ifndef SMALL_FOOTPRINT
-
-/*Returns the index of the given combination of K elements chosen from a set
-   of size 2 with associated sign bits.
-  _y: The vector of pulses, whose sum of absolute values is K.
-  _k: Returns K.*/
-static inline opus_uint32 icwrs2(const int *_y,int *_k){
-  opus_uint32 i;
-  int           k;
-  i=icwrs1(_y+1,&k);
-  i+=ucwrs2(k);
-  k+=abs(_y[0]);
-  if(_y[0]<0)i+=ucwrs2(k+1U);
-  *_k=k;
-  return i;
-}
-
-/*Returns the index of the given combination of K elements chosen from a set
-   of size 3 with associated sign bits.
-  _y: The vector of pulses, whose sum of absolute values is K.
-  _k: Returns K.*/
-static inline opus_uint32 icwrs3(const int *_y,int *_k){
-  opus_uint32 i;
-  int           k;
-  i=icwrs2(_y+1,&k);
-  i+=ucwrs3(k);
-  k+=abs(_y[0]);
-  if(_y[0]<0)i+=ucwrs3(k+1U);
-  *_k=k;
-  return i;
-}
-
-/*Returns the index of the given combination of K elements chosen from a set
-   of size 4 with associated sign bits.
-  _y: The vector of pulses, whose sum of absolute values is K.
-  _k: Returns K.*/
-static inline opus_uint32 icwrs4(const int *_y,int *_k){
-  opus_uint32 i;
-  int           k;
-  i=icwrs3(_y+1,&k);
-  i+=ucwrs4(k);
-  k+=abs(_y[0]);
-  if(_y[0]<0)i+=ucwrs4(k+1);
-  *_k=k;
-  return i;
-}
-
-/*Returns the index of the given combination of K elements chosen from a set
-   of size 5 with associated sign bits.
-  _y: The vector of pulses, whose sum of absolute values is K.
-  _k: Returns K.*/
-static inline opus_uint32 icwrs5(const int *_y,int *_k){
-  opus_uint32 i;
-  int           k;
-  i=icwrs4(_y+1,&k);
-  i+=ucwrs5(k);
-  k+=abs(_y[0]);
-  if(_y[0]<0)i+=ucwrs5(k+1);
-  *_k=k;
-  return i;
-}
-#endif /* SMALL_FOOTPRINT */
-
 /*Returns the index of the given combination of K elements chosen from a set
   of size _n with associated sign bits.
  _y:  The vector of pulses, whose sum of absolute values must be _k.
  _nc: Returns V(_n,_k).*/
-opus_uint32 icwrs(int _n,int _k,opus_uint32 *_nc,const int *_y,
+static OPUS_INLINE opus_uint32 icwrs(int _n,int _k,opus_uint32 *_nc,const int *_y,
 opus_uint32 *_u){
  opus_uint32 i;
-  int           j;
-  int           k;
+  int         j;
+  int         k;
  /*We can't unroll the first two iterations of the loop unless _n>=2.*/
  celt_assert(_n>=2);
  _u[0]=0;
@@ -613,53 +691,25 @@ void get_required_bits(opus_int16 *_bits,int _n,int _maxk,int _frac){

 void encode_pulses(const int *_y,int _n,int _k,ec_enc *_enc){
  opus_uint32 i;
-#ifndef SMALL_FOOTPRINT
-  switch(_n){
-    case 2:{
-      i=icwrs2(_y,&_k);
-      ec_enc_uint(_enc,i,ncwrs2(_k));
-    }break;
-    case 3:{
-      i=icwrs3(_y,&_k);
-      ec_enc_uint(_enc,i,ncwrs3(_k));
-    }break;
-    case 4:{
-      i=icwrs4(_y,&_k);
-      ec_enc_uint(_enc,i,ncwrs4(_k));
-    }break;
-     default:
-    {
-#endif
-      VARDECL(opus_uint32,u);
-      opus_uint32 nc;
-      SAVE_STACK;
-      ALLOC(u,_k+2U,opus_uint32);
-      i=icwrs(_n,_k,&nc,_y,u);
-      ec_enc_uint(_enc,i,nc);
-      RESTORE_STACK;
-#ifndef SMALL_FOOTPRINT
-    };
-  }
-#endif
+  VARDECL(opus_uint32,u);
+  opus_uint32 nc;
+  SAVE_STACK;
+  celt_assert(_k>0);
+  ALLOC(u,_k+2U,opus_uint32);
+  i=icwrs(_n,_k,&nc,_y,u);
+  ec_enc_uint(_enc,i,nc);
+  RESTORE_STACK;
 }

-void decode_pulses(int *_y,int _n,int _k,ec_dec *_dec)
-{
-#ifndef SMALL_FOOTPRINT
-   switch(_n){
-    case 2:cwrsi2(_k,ec_dec_uint(_dec,ncwrs2(_k)),_y);break;
-    case 3:cwrsi3(_k,ec_dec_uint(_dec,ncwrs3(_k)),_y);break;
-    case 4:cwrsi4(_k,ec_dec_uint(_dec,ncwrs4(_k)),_y);break;
-      default:
-    {
-#endif
-      VARDECL(opus_uint32,u);
-      SAVE_STACK;
-      ALLOC(u,_k+2U,opus_uint32);
-      cwrsi(_n,_k,ec_dec_uint(_dec,ncwrs_urow(_n,_k,u)),_y,u);
-      RESTORE_STACK;
-#ifndef SMALL_FOOTPRINT
-    }
-  }
-#endif
+opus_val32 decode_pulses(int *_y,int _n,int _k,ec_dec *_dec){
+  VARDECL(opus_uint32,u);
+  int ret;
+  SAVE_STACK;
+  celt_assert(_k>0);
+  ALLOC(u,_k+2U,opus_uint32);
+  ret = cwrsi(_n,_k,ec_dec_uint(_dec,ncwrs_urow(_n,_k,u)),_y,u);
+  RESTORE_STACK;
+  return ret;
 }
+
+#endif /* SMALL_FOOTPRINT */
--- a/libcelt/cwrs.h
+++ b/libcelt/cwrs.h
@@ -17,8 +17,8 @@
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
-   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
@@ -43,6 +43,6 @@ void get_required_bits(opus_int16 *bits, int N, int K, int frac);

 void encode_pulses(const int *_y, int N, int K, ec_enc *enc);

-void decode_pulses(int *_y, int N, int K, ec_dec *dec);
+opus_val32 decode_pulses(int *_y, int N, int K, ec_dec *dec);

 #endif /* CWRS_H */
--- a/celt/dump_modes/Makefile
+++ b/celt/dump_modes/Makefile
+
+CFLAGS=-O2 -Wall -Wextra -DHAVE_CONFIG_H
+INCLUDES=-I. -I../ -I../.. -I../../include
+
+SOURCES = dump_modes.c \
+          ../modes.c \
+          ../cwrs.c \
+          ../rate.c \
+          ../entcode.c \
+          ../entenc.c \
+          ../entdec.c \
+          ../mathops.c \
+          ../mdct.c \
+          ../celt.c \
+          ../kiss_fft.c
+
+ifdef HAVE_ARM_NE10
+CC = gcc
+CFLAGS += -mfpu=neon
+INCLUDES += -I$(NE10_INCDIR) -DHAVE_ARM_NE10 -DOPUS_ARM_PRESUME_NEON_INTR
+LIBS = -L$(NE10_LIBDIR) -lNE10
+SOURCES += ../arm/celt_ne10_fft.c \
+           dump_modes_arm_ne10.c \
+           ../arm/armcpu.c
+endif
+
+all: dump_modes
+
+dump_modes:
+	$(PREFIX)$(CC) $(CFLAGS) $(INCLUDES) -DCUSTOM_MODES_ONLY -DCUSTOM_MODES $(SOURCES) -o $@ $(LIBS) -lm
+
+clean:
+	rm -f dump_modes
--- a/libcelt/dump_modes.c
+++ b/libcelt/dump_modes.c
@@ -16,8 +16,8 @@
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
-   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
@@ -35,6 +35,7 @@
 #include "modes.h"
 #include "celt.h"
 #include "rate.h"
+#include "dump_modes_arch.h"

 #define INT16 "%d"
 #define INT32 "%d"
@@ -48,13 +49,29 @@
 #define WORD32 FLOAT
 #endif

+#define COEF16(x, a) ((opus_int16)SATURATE(((opus_int64)(x)+(1<<(a)>>1))>>(a), 32767))
+int opus_select_arch(void) {
+   return 0;
+}
+
 void dump_modes(FILE *file, CELTMode **modes, int nb_modes)
 {
   int i, j, k;
-   fprintf(file, "/* The contents of this file is automatically generated and contains static\n");
-   fprintf(file, "   definitions for some pre-defined modes */\n");
+   int mdct_twiddles_size;
+   fprintf(file, "/* The contents of this file was automatically generated by dump_modes.c\n");
+   fprintf(file, "   with arguments:");
+   for (i=0;i<nb_modes;i++)
+   {
+      CELTMode *mode = modes[i];
+      fprintf(file, " %d %d",mode->Fs,mode->shortMdctSize*mode->nbShortMdcts);
+   }
+   fprintf(file, "\n   It contains static definitions for some pre-defined modes. */\n");
   fprintf(file, "#include \"modes.h\"\n");
   fprintf(file, "#include \"rate.h\"\n");
+   fprintf(file, "\n#ifdef HAVE_ARM_NE10\n");
+   fprintf(file, "#define OVERRIDE_FFT 1\n");
+   fprintf(file, "#include \"%s\"\n", ARM_NE10_ARCH_FILE_NAME);
+   fprintf(file, "#endif\n");

   fprintf(file, "\n");

@@ -82,9 +99,19 @@ void dump_modes(FILE *file, CELTMode **modes, int nb_modes)

      fprintf(file, "#ifndef DEF_WINDOW%d\n", mode->overlap);
      fprintf(file, "#define DEF_WINDOW%d\n", mode->overlap);
-      fprintf (file, "static const opus_val16 window%d[%d] = {\n", mode->overlap, mode->overlap);
+      fprintf (file, "static const celt_coef window%d[%d] = {\n", mode->overlap, mode->overlap);
+#if defined(FIXED_POINT) && defined(ENABLE_QEXT)
+      fprintf(file, "#ifdef ENABLE_QEXT\n");
+      for (j=0;j<mode->overlap;j++)
+         fprintf (file, WORD32 ",%c", mode->window[j],(j+6)%5==0?'\n':' ');
+      fprintf(file, "#else\n");
+      for (j=0;j<mode->overlap;j++)
+         fprintf (file, WORD16 ",%c", COEF16(mode->window[j], 16),(j+6)%5==0?'\n':' ');
+      fprintf(file, "#endif\n");
+#else
      for (j=0;j<mode->overlap;j++)
-         fprintf (file, WORD16 ", ", mode->window[j]);
+         fprintf (file, WORD16 ",%c", mode->window[j],(j+6)%5==0?'\n':' ');
+#endif
      fprintf (file, "};\n");
      fprintf(file, "#endif\n");
      fprintf(file, "\n");
@@ -119,15 +146,15 @@ void dump_modes(FILE *file, CELTMode **modes, int nb_modes)
      fprintf(file, "#define DEF_PULSE_CACHE%d\n", mode->Fs/mdctSize);
      fprintf (file, "static const opus_int16 cache_index%d[%d] = {\n", mode->Fs/mdctSize, (mode->maxLM+2)*mode->nbEBands);
      for (j=0;j<mode->nbEBands*(mode->maxLM+2);j++)
-         fprintf (file, "%d, ", mode->cache.index[j]);
+         fprintf (file, "%d,%c", mode->cache.index[j],(j+16)%15==0?'\n':' ');
      fprintf (file, "};\n");
      fprintf (file, "static const unsigned char cache_bits%d[%d] = {\n", mode->Fs/mdctSize, mode->cache.size);
      for (j=0;j<mode->cache.size;j++)
-         fprintf (file, "%d, ", mode->cache.bits[j]);
+         fprintf (file, "%d,%c", mode->cache.bits[j],(j+16)%15==0?'\n':' ');
      fprintf (file, "};\n");
      fprintf (file, "static const unsigned char cache_caps%d[%d] = {\n", mode->Fs/mdctSize, (mode->maxLM+1)*2*mode->nbEBands);
      for (j=0;j<(mode->maxLM+1)*2*mode->nbEBands;j++)
-         fprintf (file, "%d, ", mode->cache.caps[j]);
+         fprintf (file, "%d,%c", mode->cache.caps[j],(j+16)%15==0?'\n':' ');
      fprintf (file, "};\n");

      fprintf(file, "#endif\n");
@@ -136,12 +163,26 @@ void dump_modes(FILE *file, CELTMode **modes, int nb_modes)
      /* FFT twiddles */
      fprintf(file, "#ifndef FFT_TWIDDLES%d_%d\n", mode->Fs, mdctSize);
      fprintf(file, "#define FFT_TWIDDLES%d_%d\n", mode->Fs, mdctSize);
+
      fprintf (file, "static const kiss_twiddle_cpx fft_twiddles%d_%d[%d] = {\n",
            mode->Fs, mdctSize, mode->mdct.kfft[0]->nfft);
+#if defined(FIXED_POINT) && defined(ENABLE_QEXT)
+      fprintf(file, "#ifdef ENABLE_QEXT\n");
+      for (j=0;j<mode->mdct.kfft[0]->nfft;j++)
+         fprintf (file, "{" WORD32 ", " WORD32 "},%c", mode->mdct.kfft[0]->twiddles[j].r, mode->mdct.kfft[0]->twiddles[j].i,(j+3)%2==0?'\n':' ');
+      fprintf(file, "#else\n");
      for (j=0;j<mode->mdct.kfft[0]->nfft;j++)
-         fprintf (file, "{" WORD16 ", " WORD16 "}, ", mode->mdct.kfft[0]->twiddles[j].r, mode->mdct.kfft[0]->twiddles[j].i);
+         fprintf (file, "{" WORD16 ", " WORD16 "},%c", COEF16(mode->mdct.kfft[0]->twiddles[j].r,16), COEF16(mode->mdct.kfft[0]->twiddles[j].i,16),(j+3)%2==0?'\n':' ');
+      fprintf(file, "#endif\n");
+#else
+      for (j=0;j<mode->mdct.kfft[0]->nfft;j++)
+         fprintf (file, "{" WORD16 ", " WORD16 "},%c", mode->mdct.kfft[0]->twiddles[j].r, mode->mdct.kfft[0]->twiddles[j].i,(j+3)%2==0?'\n':' ');
+#endif
      fprintf (file, "};\n");

+#ifdef OVERRIDE_FFT
+      dump_mode_arch(mode);
+#endif
      /* FFT Bitrev tables */
      for (k=0;k<=mode->mdct.maxshift;k++)
      {
@@ -150,7 +191,7 @@ void dump_modes(FILE *file, CELTMode **modes, int nb_modes)
         fprintf (file, "static const opus_int16 fft_bitrev%d[%d] = {\n",
               mode->mdct.kfft[k]->nfft, mode->mdct.kfft[k]->nfft);
         for (j=0;j<mode->mdct.kfft[k]->nfft;j++)
-            fprintf (file, "%d, ", mode->mdct.kfft[k]->bitrev[j]);
+            fprintf (file, "%d,%c", mode->mdct.kfft[k]->bitrev[j],(j+16)%15==0?'\n':' ');
         fprintf (file, "};\n");

         fprintf(file, "#endif\n");
@@ -164,17 +205,34 @@ void dump_modes(FILE *file, CELTMode **modes, int nb_modes)
         fprintf(file, "#define FFT_STATE%d_%d_%d\n", mode->Fs, mdctSize, k);
         fprintf (file, "static const kiss_fft_state fft_state%d_%d_%d = {\n",
               mode->Fs, mdctSize, k);
-         fprintf (file, "%d,\t/* nfft */\n", mode->mdct.kfft[k]->nfft);
-#ifndef FIXED_POINT
-         fprintf (file, "%0.9ff,\t/* scale */\n", mode->mdct.kfft[k]->scale);
+         fprintf (file, "%d,    /* nfft */\n", mode->mdct.kfft[k]->nfft);
+
+#if defined(FIXED_POINT) && defined(ENABLE_QEXT)
+         fprintf(file, "#ifdef ENABLE_QEXT\n");
+         fprintf (file, WORD32 ",    /* scale */\n", mode->mdct.kfft[k]->scale);
+         fprintf(file, "#else\n");
+         fprintf (file, WORD16 ",    /* scale */\n", COEF16(mode->mdct.kfft[k]->scale, 15));
+         fprintf(file, "#endif\n");
+#else
+         fprintf (file, WORD16 ",    /* scale */\n", mode->mdct.kfft[k]->scale);
 #endif
-         fprintf (file, "%d,\t/* shift */\n", mode->mdct.kfft[k]->shift);
+#ifdef FIXED_POINT
+         fprintf (file, "%d,    /* scale_shift */\n", mode->mdct.kfft[k]->scale_shift);
+#endif
+         fprintf (file, "%d,    /* shift */\n", mode->mdct.kfft[k]->shift);
         fprintf (file, "{");
         for (j=0;j<2*MAXFACTORS;j++)
            fprintf (file, "%d, ", mode->mdct.kfft[k]->factors[j]);
-         fprintf (file, "},\t/* factors */\n");
-         fprintf (file, "fft_bitrev%d,\t/* bitrev */\n", mode->mdct.kfft[k]->nfft);
-         fprintf (file, "fft_twiddles%d_%d,\t/* bitrev */\n", mode->Fs, mdctSize);
+         fprintf (file, "},    /* factors */\n");
+         fprintf (file, "fft_bitrev%d,    /* bitrev */\n", mode->mdct.kfft[k]->nfft);
+         fprintf (file, "fft_twiddles%d_%d,    /* bitrev */\n", mode->Fs, mdctSize);
+
+         fprintf (file, "#ifdef OVERRIDE_FFT\n");
+         fprintf (file, "(arch_fft_state *)&cfg_arch_%d,\n", mode->mdct.kfft[k]->nfft);
+         fprintf (file, "#else\n");
+         fprintf (file, "NULL,\n");
+         fprintf(file, "#endif\n");
+
         fprintf (file, "};\n");

         fprintf(file, "#endif\n");
@@ -185,12 +243,25 @@ void dump_modes(FILE *file, CELTMode **modes, int nb_modes)
      fprintf(file, "\n");

      /* MDCT twiddles */
+      mdct_twiddles_size = mode->mdct.n-(mode->mdct.n/2>>mode->mdct.maxshift);
      fprintf(file, "#ifndef MDCT_TWIDDLES%d\n", mdctSize);
      fprintf(file, "#define MDCT_TWIDDLES%d\n", mdctSize);
-      fprintf (file, "static const opus_val16 mdct_twiddles%d[%d] = {\n",
-            mdctSize, mode->mdct.n/4+1);
-      for (j=0;j<=mode->mdct.n/4;j++)
-         fprintf (file, WORD16 ", ", mode->mdct.trig[j]);
+      fprintf (file, "static const celt_coef mdct_twiddles%d[%d] = {\n",
+            mdctSize, mdct_twiddles_size);
+
+#if defined(FIXED_POINT) && defined(ENABLE_QEXT)
+      fprintf(file, "#ifdef ENABLE_QEXT\n");
+      for (j=0;j<mdct_twiddles_size;j++)
+         fprintf (file, WORD32 ",%c", mode->mdct.trig[j],(j+6)%5==0?'\n':' ');
+      fprintf(file, "#else\n");
+      for (j=0;j<mdct_twiddles_size;j++)
+         fprintf (file, WORD16 ",%c", COEF16(mode->mdct.trig[j], 16),(j+6)%5==0?'\n':' ');
+      fprintf(file, "#endif\n");
+#else
+      for (j=0;j<mdct_twiddles_size;j++)
+         fprintf (file, WORD16 ",%c", mode->mdct.trig[j],(j+6)%5==0?'\n':' ');
+#endif
+
      fprintf (file, "};\n");

      fprintf(file, "#endif\n");
@@ -199,35 +270,37 @@ void dump_modes(FILE *file, CELTMode **modes, int nb_modes)

      /* Print the actual mode data */
      fprintf(file, "static const CELTMode mode%d_%d_%d = {\n", mode->Fs, mdctSize, mode->overlap);
-      fprintf(file, INT32 ",\t/* Fs */\n", mode->Fs);
-      fprintf(file, "%d,\t/* overlap */\n", mode->overlap);
-      fprintf(file, "%d,\t/* nbEBands */\n", mode->nbEBands);
-      fprintf(file, "%d,\t/* effEBands */\n", mode->effEBands);
+      fprintf(file, INT32 ",    /* Fs */\n", mode->Fs);
+      fprintf(file, "%d,    /* overlap */\n", mode->overlap);
+      fprintf(file, "%d,    /* nbEBands */\n", mode->nbEBands);
+      fprintf(file, "%d,    /* effEBands */\n", mode->effEBands);
      fprintf(file, "{");
      for (j=0;j<4;j++)
         fprintf(file, WORD16 ", ", mode->preemph[j]);
-      fprintf(file, "},\t/* preemph */\n");
+      fprintf(file, "},    /* preemph */\n");
      if (standard)
-         fprintf(file, "eband5ms,\t/* eBands */\n");
+         fprintf(file, "eband5ms,    /* eBands */\n");
      else
-         fprintf(file, "eBands%d_%d,\t/* eBands */\n", mode->Fs, mdctSize);
-      fprintf(file, "%d,\t/* nbAllocVectors */\n", mode->nbAllocVectors);
+         fprintf(file, "eBands%d_%d,    /* eBands */\n", mode->Fs, mdctSize);
+
+      fprintf(file, "%d,    /* maxLM */\n", mode->maxLM);
+      fprintf(file, "%d,    /* nbShortMdcts */\n", mode->nbShortMdcts);
+      fprintf(file, "%d,    /* shortMdctSize */\n", mode->shortMdctSize);
+
+      fprintf(file, "%d,    /* nbAllocVectors */\n", mode->nbAllocVectors);
      if (standard)
-         fprintf(file, "band_allocation,\t/* allocVectors */\n");
+         fprintf(file, "band_allocation,    /* allocVectors */\n");
      else
-         fprintf(file, "allocVectors%d_%d,\t/* allocVectors */\n", mode->Fs, mdctSize);
+         fprintf(file, "allocVectors%d_%d,    /* allocVectors */\n", mode->Fs, mdctSize);

+      fprintf(file, "logN%d,    /* logN */\n", framerate);
+      fprintf(file, "window%d,    /* window */\n", mode->overlap);
      fprintf(file, "{%d, %d, {", mode->mdct.n, mode->mdct.maxshift);
      for (k=0;k<=mode->mdct.maxshift;k++)
         fprintf(file, "&fft_state%d_%d_%d, ", mode->Fs, mdctSize, k);
-      fprintf (file, "}, mdct_twiddles%d},\t/* mdct */\n", mdctSize);
-
-      fprintf(file, "window%d,\t/* window */\n", mode->overlap);
-      fprintf(file, "%d,\t/* maxLM */\n", mode->maxLM);
-      fprintf(file, "%d,\t/* nbShortMdcts */\n", mode->nbShortMdcts);
-      fprintf(file, "%d,\t/* shortMdctSize */\n", mode->shortMdctSize);
-      fprintf(file, "logN%d,\t/* logN */\n", framerate);
-      fprintf(file, "{%d, cache_index%d, cache_bits%d, cache_caps%d},\t/* cache */\n",
+      fprintf (file, "}, mdct_twiddles%d},    /* mdct */\n", mdctSize);
+
+      fprintf(file, "{%d, cache_index%d, cache_bits%d, cache_caps%d},    /* cache */\n",
            mode->cache.size, mode->Fs/mdctSize, mode->Fs/mdctSize, mode->Fs/mdctSize);
      fprintf(file, "};\n");
   }
@@ -291,9 +364,9 @@ int main(int argc, char **argv)
   int i, nb;
   FILE *file;
   CELTMode **m;
-   if (argc%2 != 1)
+   if (argc%2 != 1 || argc<3)
   {
-      fprintf (stderr, "must have a multiple of 2 arguments\n");
+      fprintf (stderr, "Usage: %s rate frame_size [rate frame_size] [rate frame_size]...\n",argv[0]);
      return 1;
   }
   nb = (argc-1)/2;
@@ -303,7 +376,7 @@ int main(int argc, char **argv)
      int Fs, frame;
      Fs      = atoi(argv[2*i+1]);
      frame   = atoi(argv[2*i+2]);
-      m[i] = celt_mode_create(Fs, frame, NULL);
+      m[i] = opus_custom_mode_create(Fs, frame, NULL);
      if (m[i]==NULL)
      {
         fprintf(stderr,"Error creating mode with Fs=%s, frame_size=%s\n",
@@ -312,10 +385,16 @@ int main(int argc, char **argv)
      }
   }
   file = fopen(BASENAME ".h", "w");
+#ifdef OVERRIDE_FFT
+   dump_modes_arch_init(m, nb);
+#endif
   dump_modes(file, m, nb);
   fclose(file);
+#ifdef OVERRIDE_FFT
+   dump_modes_arch_finalize();
+#endif
   for (i=0;i<nb;i++)
-      celt_mode_destroy(m[i]);
+      opus_custom_mode_destroy(m[i]);
   free(m);
   return 0;
 }
--- a/celt/dump_modes/dump_modes_arch.h
+++ b/celt/dump_modes/dump_modes_arch.h
+/* Copyright (c) 2015 Xiph.Org Foundation
+   Written by Viswanath Puttagunta */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef DUMP_MODE_ARCH_H
+#define DUMP_MODE_ARCH_H
+
+void dump_modes_arch_init();
+void dump_mode_arch(CELTMode *mode);
+void dump_modes_arch_finalize();
+
+#if !defined(FIXED_POINT)
+#define ARM_NE10_ARCH_FILE_NAME "static_modes_float_arm_ne10.h"
+#else
+#define ARM_NE10_ARCH_FILE_NAME "static_modes_fixed_arm_ne10.h"
+#endif
+
+#if defined(HAVE_ARM_NE10)
+#define OVERRIDE_FFT (1)
+#endif
+
+#endif
--- a/celt/dump_modes/dump_modes_arm_ne10.c
+++ b/celt/dump_modes/dump_modes_arm_ne10.c
+/* Copyright (c) 2015 Xiph.Org Foundation
+   Written by Viswanath Puttagunta */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if defined(HAVE_CONFIG_H)
+# include "config.h"
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "modes.h"
+#include "dump_modes_arch.h"
+#include <NE10_dsp.h>
+
+#if !defined(FIXED_POINT)
+# define NE10_FFT_CFG_TYPE_T ne10_fft_cfg_float32_t
+# define NE10_FFT_CPX_TYPE_T_STR "ne10_fft_cpx_float32_t"
+# define NE10_FFT_STATE_TYPE_T_STR "ne10_fft_state_float32_t"
+#else
+# define NE10_FFT_CFG_TYPE_T ne10_fft_cfg_int32_t
+# define NE10_FFT_CPX_TYPE_T_STR "ne10_fft_cpx_int32_t"
+# define NE10_FFT_STATE_TYPE_T_STR "ne10_fft_state_int32_t"
+#endif
+
+static FILE *file;
+
+void dump_modes_arch_init(CELTMode **modes, int nb_modes)
+{
+   int i;
+
+   file = fopen(ARM_NE10_ARCH_FILE_NAME, "w");
+   fprintf(file, "/* The contents of this file was automatically generated by\n");
+   fprintf(file, " * dump_mode_arm_ne10.c with arguments:");
+   for (i=0;i<nb_modes;i++)
+   {
+      CELTMode *mode = modes[i];
+      fprintf(file, " %d %d",mode->Fs,mode->shortMdctSize*mode->nbShortMdcts);
+   }
+   fprintf(file, "\n * It contains static definitions for some pre-defined modes. */\n");
+   fprintf(file, "#include <NE10_types.h>\n\n");
+}
+
+void dump_modes_arch_finalize()
+{
+   fclose(file);
+}
+
+void dump_mode_arch(CELTMode *mode)
+{
+   int k, j;
+   int mdctSize;
+
+   mdctSize = mode->shortMdctSize*mode->nbShortMdcts;
+
+   fprintf(file, "#ifndef NE10_FFT_PARAMS%d_%d\n", mode->Fs, mdctSize);
+   fprintf(file, "#define NE10_FFT_PARAMS%d_%d\n", mode->Fs, mdctSize);
+   /* cfg->factors */
+   for(k=0;k<=mode->mdct.maxshift;k++) {
+      NE10_FFT_CFG_TYPE_T cfg;
+      cfg = (NE10_FFT_CFG_TYPE_T)mode->mdct.kfft[k]->arch_fft->priv;
+      if (!cfg)
+         continue;
+      fprintf(file, "static const ne10_int32_t ne10_factors_%d[%d] = {\n",
+              mode->mdct.kfft[k]->nfft, (NE10_MAXFACTORS * 2));
+      for(j=0;j<(NE10_MAXFACTORS * 2);j++) {
+         fprintf(file, "%d,%c", cfg->factors[j],(j+16)%15==0?'\n':' ');
+      }
+      fprintf (file, "};\n");
+   }
+
+   /* cfg->twiddles */
+   for(k=0;k<=mode->mdct.maxshift;k++) {
+      NE10_FFT_CFG_TYPE_T cfg;
+      cfg = (NE10_FFT_CFG_TYPE_T)mode->mdct.kfft[k]->arch_fft->priv;
+      if (!cfg)
+         continue;
+      fprintf(file, "static const %s ne10_twiddles_%d[%d] = {\n",
+              NE10_FFT_CPX_TYPE_T_STR, mode->mdct.kfft[k]->nfft,
+              mode->mdct.kfft[k]->nfft);
+      for(j=0;j<mode->mdct.kfft[k]->nfft;j++) {
+#if !defined(FIXED_POINT)
+         fprintf(file, "{%#0.8gf,%#0.8gf},%c",
+                 cfg->twiddles[j].r, cfg->twiddles[j].i,(j+4)%3==0?'\n':' ');
+#else
+         fprintf(file, "{%d,%d},%c",
+                 cfg->twiddles[j].r, cfg->twiddles[j].i,(j+4)%3==0?'\n':' ');
+#endif
+      }
+      fprintf (file, "};\n");
+   }
+
+   for(k=0;k<=mode->mdct.maxshift;k++) {
+      NE10_FFT_CFG_TYPE_T cfg;
+      cfg = (NE10_FFT_CFG_TYPE_T)mode->mdct.kfft[k]->arch_fft->priv;
+      if (!cfg) {
+         fprintf(file, "/* Ne10 does not support scaled FFT for length = %d */\n",
+                 mode->mdct.kfft[k]->nfft);
+         fprintf(file, "static const arch_fft_state cfg_arch_%d = {\n", mode->mdct.kfft[k]->nfft);
+         fprintf(file, "0,\n");
+         fprintf(file, "NULL\n");
+         fprintf(file, "};\n");
+         continue;
+      }
+      fprintf(file, "static const %s %s_%d = {\n", NE10_FFT_STATE_TYPE_T_STR,
+              NE10_FFT_STATE_TYPE_T_STR, mode->mdct.kfft[k]->nfft);
+      fprintf(file, "%d,\n", cfg->nfft);
+      fprintf(file, "(ne10_int32_t *)ne10_factors_%d,\n", mode->mdct.kfft[k]->nfft);
+      fprintf(file, "(%s *)ne10_twiddles_%d,\n",
+              NE10_FFT_CPX_TYPE_T_STR, mode->mdct.kfft[k]->nfft);
+      fprintf(file, "NULL,\n");  /* buffer */
+      fprintf(file, "(%s *)&ne10_twiddles_%d[%d],\n",
+              NE10_FFT_CPX_TYPE_T_STR, mode->mdct.kfft[k]->nfft, cfg->nfft);
+#if !defined(FIXED_POINT)
+      fprintf(file, "/* is_forward_scaled = true */\n");
+      fprintf(file, "(ne10_int32_t) 1,\n");
+      fprintf(file, "/* is_backward_scaled = false */\n");
+      fprintf(file, "(ne10_int32_t) 0,\n");
+#endif
+      fprintf(file, "};\n");
+
+      fprintf(file, "static const arch_fft_state cfg_arch_%d = {\n",
+              mode->mdct.kfft[k]->nfft);
+      fprintf(file, "1,\n");
+      fprintf(file, "(void *)&%s_%d,\n",
+              NE10_FFT_STATE_TYPE_T_STR, mode->mdct.kfft[k]->nfft);
+      fprintf(file, "};\n\n");
+   }
+   fprintf(file, "#endif  /* end NE10_FFT_PARAMS%d_%d */\n", mode->Fs, mdctSize);
+}
--- a/libcelt/ecintrin.h
+++ b/libcelt/ecintrin.h
@@ -15,8 +15,8 @@
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
-   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
@@ -29,41 +29,31 @@
 #include "opus_types.h"
 #include <math.h>
 #include <limits.h>
+#include "arch.h"
 #if !defined(_ecintrin_H)
 # define _ecintrin_H (1)

-/*Some specific platforms may have optimized intrinsic or inline assembly
+/*Some specific platforms may have optimized intrinsic or OPUS_INLINE assembly
   versions of these functions which can substantially improve performance.
  We define macros for them to allow easy incorporation of these non-ANSI
   features.*/

-/*Note that we do not provide a macro for abs(), because it is provided as a
-   library function, which we assume is translated into an intrinsic to avoid
-   the function call overhead and then implemented in the smartest way for the
-   target platform.
-  With modern gcc (4.x), this is true: it uses cmov instructions if the
-   architecture supports it and branchless bit-twiddling if it does not (the
-   speed difference between the two approaches is not measurable).
-  Interestingly, the bit-twiddling method was patented in 2000 (US 6,073,150)
-   by Sun Microsystems, despite prior art dating back to at least 1996:
-   http://web.archive.org/web/19961201174141/www.x86.org/ftp/articles/pentopt/PENTOPT.TXT
-  On gcc 3.x, however, our assumption is not true, as abs() is translated to a
-   conditional jump, which is horrible on deeply piplined architectures (e.g.,
-   all consumer architectures for the past decade or more) when the sign cannot
-   be reliably predicted.*/
-
 /*Modern gcc (4.x) can compile the naive versions of min and max with cmov if
   given an appropriate architecture, but the branchless bit-twiddling versions
   are just as fast, and do not require any special target architecture.
  Earlier gcc versions (3.x) compiled both code to the same assembly
   instructions, because of the way they represented ((_b)>(_a)) internally.*/
-# define EC_MINI(_a,_b)      ((_a)+((_b)-(_a)&-((_b)<(_a))))
+# define EC_MINI(_a,_b)      ((_a)+(((_b)-(_a))&-((_b)<(_a))))

 /*Count leading zeros.
  This macro should only be used for implementing ec_ilog(), if it is defined.
  All other code should use EC_ILOG() instead.*/
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) && (_MSC_VER >= 1400)
+#if defined(_MSC_VER) && (_MSC_VER >= 1910)
+# include <intrin0.h> /* Improve compiler throughput. */
+#else
 # include <intrin.h>
+#endif
 /*In _DEBUG mode this is not an intrinsic by default.*/
 # pragma intrinsic(_BitScanReverse)

@@ -78,15 +68,13 @@ static __inline int ec_bsr(unsigned long _x){
 # include "dsplib.h"
 # define EC_CLZ0    (31)
 # define EC_CLZ(_x) (_lnorm(_x))
-#elif defined(__GNUC_PREREQ)
-# if __GNUC_PREREQ(3,4)
-#  if INT_MAX>=2147483647
-#   define EC_CLZ0    ((int)sizeof(unsigned)*CHAR_BIT)
-#   define EC_CLZ(_x) (__builtin_clz(_x))
-#  elif LONG_MAX>=2147483647L
-#   define EC_CLZ0    ((int)sizeof(unsigned long)*CHAR_BIT)
-#   define EC_CLZ(_x) (__builtin_clzl(_x))
-#  endif
+#elif __GNUC_PREREQ(3,4)
+# if INT_MAX>=2147483647
+#  define EC_CLZ0    ((int)sizeof(unsigned)*CHAR_BIT)
+#  define EC_CLZ(_x) (__builtin_clz(_x))
+# elif LONG_MAX>=2147483647L
+#  define EC_CLZ0    ((int)sizeof(unsigned long)*CHAR_BIT)
+#  define EC_CLZ(_x) (__builtin_clzl(_x))
 # endif
 #endif


--- a/libcelt/entcode.c
+++ b/libcelt/entcode.c
@@ -15,8 +15,8 @@
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
-   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
@@ -33,6 +33,11 @@
 #include "arch.h"

 #if !defined(EC_CLZ)
+/*This is a fallback for systems where we don't know how to access
+   a BSR or CLZ instruction (see ecintrin.h).
+  If you are optimizing Opus on a new platform and it has a native CLZ or
+   BZR (e.g. cell, MIPS, x86, etc) then making it available to Opus will be
+   an easy performance win.*/
 int ec_ilog(opus_uint32 _v){
  /*On a Pentium M, this branchless version tested as the fastest on
     1,000,000,000 random 32-bit integers, edging out a similar version with
@@ -57,6 +62,27 @@ int ec_ilog(opus_uint32 _v){
 }
 #endif

+#if 1
+/* This is a faster version of ec_tell_frac() that takes advantage
+   of the low (1/8 bit) resolution to use just a linear function
+   followed by a lookup to determine the exact transition thresholds. */
+opus_uint32 ec_tell_frac(ec_ctx *_this){
+  static const unsigned correction[8] =
+    {35733, 38967, 42495, 46340,
+     50535, 55109, 60097, 65535};
+  opus_uint32 nbits;
+  opus_uint32 r;
+  int         l;
+  unsigned    b;
+  nbits=_this->nbits_total<<BITRES;
+  l=EC_ILOG(_this->rng);
+  r=_this->rng>>(l-16);
+  b = (r>>12)-8;
+  b += r>correction[b];
+  l = (l<<3)+b;
+  return nbits-l;
+}
+#else
 opus_uint32 ec_tell_frac(ec_ctx *_this){
  opus_uint32 nbits;
  opus_uint32 r;
@@ -76,7 +102,7 @@ opus_uint32 ec_tell_frac(ec_ctx *_this){
     encoder or decoder claims to have used 1 bit.*/
  nbits=_this->nbits_total<<BITRES;
  l=EC_ILOG(_this->rng);
-  r=_this->rng>>l-16;
+  r=_this->rng>>(l-16);
  for(i=BITRES;i-->0;){
    int b;
    r=r*r>>15;
@@ -86,3 +112,42 @@ opus_uint32 ec_tell_frac(ec_ctx *_this){
  }
  return nbits-l;
 }
+#endif
+
+#ifdef USE_SMALL_DIV_TABLE
+/* Result of 2^32/(2*i+1), except for i=0. */
+const opus_uint32 SMALL_DIV_TABLE[129] = {
+   0xFFFFFFFF, 0x55555555, 0x33333333, 0x24924924,
+   0x1C71C71C, 0x1745D174, 0x13B13B13, 0x11111111,
+   0x0F0F0F0F, 0x0D79435E, 0x0C30C30C, 0x0B21642C,
+   0x0A3D70A3, 0x097B425E, 0x08D3DCB0, 0x08421084,
+   0x07C1F07C, 0x07507507, 0x06EB3E45, 0x06906906,
+   0x063E7063, 0x05F417D0, 0x05B05B05, 0x0572620A,
+   0x05397829, 0x05050505, 0x04D4873E, 0x04A7904A,
+   0x047DC11F, 0x0456C797, 0x04325C53, 0x04104104,
+   0x03F03F03, 0x03D22635, 0x03B5CC0E, 0x039B0AD1,
+   0x0381C0E0, 0x0369D036, 0x03531DEC, 0x033D91D2,
+   0x0329161F, 0x03159721, 0x03030303, 0x02F14990,
+   0x02E05C0B, 0x02D02D02, 0x02C0B02C, 0x02B1DA46,
+   0x02A3A0FD, 0x0295FAD4, 0x0288DF0C, 0x027C4597,
+   0x02702702, 0x02647C69, 0x02593F69, 0x024E6A17,
+   0x0243F6F0, 0x0239E0D5, 0x02302302, 0x0226B902,
+   0x021D9EAD, 0x0214D021, 0x020C49BA, 0x02040810,
+   0x01FC07F0, 0x01F44659, 0x01ECC07B, 0x01E573AC,
+   0x01DE5D6E, 0x01D77B65, 0x01D0CB58, 0x01CA4B30,
+   0x01C3F8F0, 0x01BDD2B8, 0x01B7D6C3, 0x01B20364,
+   0x01AC5701, 0x01A6D01A, 0x01A16D3F, 0x019C2D14,
+   0x01970E4F, 0x01920FB4, 0x018D3018, 0x01886E5F,
+   0x0183C977, 0x017F405F, 0x017AD220, 0x01767DCE,
+   0x01724287, 0x016E1F76, 0x016A13CD, 0x01661EC6,
+   0x01623FA7, 0x015E75BB, 0x015AC056, 0x01571ED3,
+   0x01539094, 0x01501501, 0x014CAB88, 0x0149539E,
+   0x01460CBC, 0x0142D662, 0x013FB013, 0x013C995A,
+   0x013991C2, 0x013698DF, 0x0133AE45, 0x0130D190,
+   0x012E025C, 0x012B404A, 0x01288B01, 0x0125E227,
+   0x01234567, 0x0120B470, 0x011E2EF3, 0x011BB4A4,
+   0x01194538, 0x0116E068, 0x011485F0, 0x0112358E,
+   0x010FEF01, 0x010DB20A, 0x010B7E6E, 0x010953F3,
+   0x01073260, 0x0105197F, 0x0103091B, 0x01010101
+};
+#endif
--- a/libcelt/entcode.h
+++ b/libcelt/entcode.h
@@ -15,8 +15,8 @@
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
-   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
@@ -26,6 +26,7 @@
 */

 #include "opus_types.h"
+#include "opus_defines.h"

 #if !defined(_entcode_H)
 # define _entcode_H (1)
@@ -33,6 +34,12 @@
 # include <stddef.h>
 # include "ecintrin.h"

+extern const opus_uint32 SMALL_DIV_TABLE[129];
+
+#ifdef OPUS_ARM_ASM
+#define USE_SMALL_DIV_TABLE
+#endif
+
 /*OPT: ec_window must be at least 32 bits, but if you have fast arithmetic on a
   larger type, you can speed up the decoder by using it here.*/
 typedef opus_uint32           ec_window;
@@ -83,15 +90,15 @@ struct ec_ctx{
   int            error;
 };

-static inline opus_uint32 ec_range_bytes(ec_ctx *_this){
+static OPUS_INLINE opus_uint32 ec_range_bytes(ec_ctx *_this){
  return _this->offs;
 }

-static inline unsigned char *ec_get_buffer(ec_ctx *_this){
+static OPUS_INLINE unsigned char *ec_get_buffer(ec_ctx *_this){
  return _this->buf;
 }

-static inline int ec_get_error(ec_ctx *_this){
+static OPUS_INLINE int ec_get_error(ec_ctx *_this){
  return _this->error;
 }

@@ -101,7 +108,7 @@ static inline int ec_get_error(ec_ctx *_this){
  Return: The number of bits.
          This will always be slightly larger than the exact value (e.g., all
           rounding error is in the positive direction).*/
-static inline int ec_tell(ec_ctx *_this){
+static OPUS_INLINE int ec_tell(ec_ctx *_this){
  return _this->nbits_total-EC_ILOG(_this->rng);
 }

@@ -113,4 +120,33 @@ static inline int ec_tell(ec_ctx *_this){
           rounding error is in the positive direction).*/
 opus_uint32 ec_tell_frac(ec_ctx *_this);

+/* Tested exhaustively for all n and for 1<=d<=256 */
+static OPUS_INLINE opus_uint32 celt_udiv(opus_uint32 n, opus_uint32 d) {
+   celt_sig_assert(d>0);
+#ifdef USE_SMALL_DIV_TABLE
+   if (d>256)
+      return n/d;
+   else {
+      opus_uint32 t, q;
+      t = EC_ILOG(d&-d);
+      q = (opus_uint64)SMALL_DIV_TABLE[d>>t]*(n>>(t-1))>>32;
+      return q+(n-q*d >= d);
+   }
+#else
+   return n/d;
+#endif
+}
+
+static OPUS_INLINE opus_int32 celt_sudiv(opus_int32 n, opus_int32 d) {
+   celt_sig_assert(d>0);
+#ifdef USE_SMALL_DIV_TABLE
+   if (n<0)
+      return -(opus_int32)celt_udiv(-n, d);
+   else
+      return celt_udiv(n, d);
+#else
+   return n/d;
+#endif
+}
+
 #endif
No results found