Compare revisions

1c370855 · 1c370855 · 1c370855 · 1c370855 · 1c370855 · 1c370855
--- a/celt/arm/arm_celt_map.c
+++ b/celt/arm/arm_celt_map.c
+/* Copyright (c) 2010 Xiph.Org Foundation
+ * Copyright (c) 2013 Parrot
+ * Copyright (c) 2024 Arm Limited */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "kiss_fft.h"
+#include "mathops.h"
+#include "mdct.h"
+#include "pitch.h"
+
+#if defined(OPUS_HAVE_RTCD)
+
+# if !defined(DISABLE_FLOAT_API)
+#  if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
+void (*const CELT_FLOAT2INT16_IMPL[OPUS_ARCHMASK+1])(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt) = {
+  celt_float2int16_c,   /* ARMv4 */
+  celt_float2int16_c,   /* EDSP */
+  celt_float2int16_c,   /* Media */
+  celt_float2int16_neon,/* NEON */
+  celt_float2int16_neon /* DOTPROD */
+};
+#  endif
+# endif
+
+# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
+opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y, int N) = {
+  celt_inner_prod_c,   /* ARMv4 */
+  celt_inner_prod_c,   /* EDSP */
+  celt_inner_prod_c,   /* Media */
+  celt_inner_prod_neon,/* NEON */
+  celt_inner_prod_neon /* DOTPROD */
+};
+
+void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+      int N, opus_val32 *xy1, opus_val32 *xy2) = {
+  dual_inner_prod_c,   /* ARMv4 */
+  dual_inner_prod_c,   /* EDSP */
+  dual_inner_prod_c,   /* Media */
+  dual_inner_prod_neon,/* NEON */
+  dual_inner_prod_neon /* DOTPROD */
+};
+# endif
+
+# if defined(FIXED_POINT)
+#  if ((defined(OPUS_ARM_MAY_HAVE_NEON) && !defined(OPUS_ARM_PRESUME_NEON)) || \
+    (defined(OPUS_ARM_MAY_HAVE_MEDIA) && !defined(OPUS_ARM_PRESUME_MEDIA)) || \
+    (defined(OPUS_ARM_MAY_HAVE_EDSP) && !defined(OPUS_ARM_PRESUME_EDSP)))
+opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
+    const opus_val16 *, opus_val32 *, int, int, int) = {
+  celt_pitch_xcorr_c,               /* ARMv4 */
+  MAY_HAVE_EDSP(celt_pitch_xcorr),  /* EDSP */
+  MAY_HAVE_MEDIA(celt_pitch_xcorr), /* Media */
+  MAY_HAVE_NEON(celt_pitch_xcorr),  /* NEON */
+  MAY_HAVE_NEON(celt_pitch_xcorr)   /* DOTPROD */
+};
+
+#  endif
+# else /* !FIXED_POINT */
+#  if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
+void (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
+    const opus_val16 *, opus_val32 *, int, int, int) = {
+  celt_pitch_xcorr_c,              /* ARMv4 */
+  celt_pitch_xcorr_c,              /* EDSP */
+  celt_pitch_xcorr_c,              /* Media */
+  celt_pitch_xcorr_float_neon,     /* Neon */
+  celt_pitch_xcorr_float_neon      /* DOTPROD */
+};
+#  endif
+# endif /* FIXED_POINT */
+
+#if defined(FIXED_POINT) && defined(OPUS_HAVE_RTCD) && \
+ defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
+
+void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
+         const opus_val16 *x,
+         const opus_val16 *y,
+         opus_val32       sum[4],
+         int              len
+) = {
+  xcorr_kernel_c,                /* ARMv4 */
+  xcorr_kernel_c,                /* EDSP */
+  xcorr_kernel_c,                /* Media */
+  xcorr_kernel_neon_fixed,       /* Neon */
+  xcorr_kernel_neon_fixed        /* DOTPROD */
+};
+
+#endif
+
+# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+#  if defined(HAVE_ARM_NE10)
+#   if defined(CUSTOM_MODES)
+int (*const OPUS_FFT_ALLOC_ARCH_IMPL[OPUS_ARCHMASK+1])(kiss_fft_state *st) = {
+   opus_fft_alloc_arch_c,        /* ARMv4 */
+   opus_fft_alloc_arch_c,        /* EDSP */
+   opus_fft_alloc_arch_c,        /* Media */
+   opus_fft_alloc_arm_neon,      /* Neon with NE10 library support */
+   opus_fft_alloc_arm_neon       /* DOTPROD with NE10 library support */
+};
+
+void (*const OPUS_FFT_FREE_ARCH_IMPL[OPUS_ARCHMASK+1])(kiss_fft_state *st) = {
+   opus_fft_free_arch_c,         /* ARMv4 */
+   opus_fft_free_arch_c,         /* EDSP */
+   opus_fft_free_arch_c,         /* Media */
+   opus_fft_free_arm_neon,       /* Neon with NE10 */
+   opus_fft_free_arm_neon        /* DOTPROD with NE10 */
+};
+#   endif /* CUSTOM_MODES */
+
+void (*const OPUS_FFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg,
+                                        const kiss_fft_cpx *fin,
+                                        kiss_fft_cpx *fout) = {
+   opus_fft_c,                   /* ARMv4 */
+   opus_fft_c,                   /* EDSP */
+   opus_fft_c,                   /* Media */
+   opus_fft_neon,                /* Neon with NE10 */
+   opus_fft_neon                 /* DOTPROD with NE10 */
+};
+
+void (*const OPUS_IFFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg,
+                                         const kiss_fft_cpx *fin,
+                                         kiss_fft_cpx *fout) = {
+   opus_ifft_c,                   /* ARMv4 */
+   opus_ifft_c,                   /* EDSP */
+   opus_ifft_c,                   /* Media */
+   opus_ifft_neon,                /* Neon with NE10 */
+   opus_ifft_neon                 /* DOTPROD with NE10 */
+};
+
+void (*const CLT_MDCT_FORWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l,
+                                                     kiss_fft_scalar *in,
+                                                     kiss_fft_scalar * OPUS_RESTRICT out,
+                                                     const opus_val16 *window,
+                                                     int overlap, int shift,
+                                                     int stride, int arch) = {
+   clt_mdct_forward_c,           /* ARMv4 */
+   clt_mdct_forward_c,           /* EDSP */
+   clt_mdct_forward_c,           /* Media */
+   clt_mdct_forward_neon,        /* Neon with NE10 */
+   clt_mdct_forward_neon         /* DOTPROD with NE10 */
+};
+
+void (*const CLT_MDCT_BACKWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l,
+                                                      kiss_fft_scalar *in,
+                                                      kiss_fft_scalar * OPUS_RESTRICT out,
+                                                      const opus_val16 *window,
+                                                      int overlap, int shift,
+                                                      int stride, int arch) = {
+   clt_mdct_backward_c,           /* ARMv4 */
+   clt_mdct_backward_c,           /* EDSP */
+   clt_mdct_backward_c,           /* Media */
+   clt_mdct_backward_neon,        /* Neon with NE10 */
+   clt_mdct_backward_neon         /* DOTPROD with NE10 */
+};
+
+#  endif /* HAVE_ARM_NE10 */
+# endif /* OPUS_ARM_MAY_HAVE_NEON_INTR */
+
+#endif /* OPUS_HAVE_RTCD */
--- a/celt/arm/armcpu.c
+++ b/celt/arm/armcpu.c
@@ -37,11 +37,13 @@
 #include "cpu_support.h"
 #include "os_support.h"
 #include "opus_types.h"
+#include "arch.h"

-#define OPUS_CPU_ARM_V4    (1)
-#define OPUS_CPU_ARM_EDSP  (1<<1)
-#define OPUS_CPU_ARM_MEDIA (1<<2)
-#define OPUS_CPU_ARM_NEON  (1<<3)
+#define OPUS_CPU_ARM_V4_FLAG    (1<<OPUS_ARCH_ARM_V4)
+#define OPUS_CPU_ARM_EDSP_FLAG  (1<<OPUS_ARCH_ARM_EDSP)
+#define OPUS_CPU_ARM_MEDIA_FLAG (1<<OPUS_ARCH_ARM_MEDIA)
+#define OPUS_CPU_ARM_NEON_FLAG  (1<<OPUS_ARCH_ARM_NEON)
+#define OPUS_CPU_ARM_DOTPROD_FLAG  (1<<OPUS_ARCH_ARM_DOTPROD)

 #if defined(_MSC_VER)
 /*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/
@@ -55,29 +57,31 @@ static OPUS_INLINE opus_uint32 opus_cpu_capabilities(void){
  /* MSVC has no OPUS_INLINE __asm support for ARM, but it does let you __emit
   * instructions via their assembled hex code.
   * All of these instructions should be essentially nops. */
-# if defined(ARMv5E_ASM)
+# if defined(OPUS_ARM_MAY_HAVE_EDSP) || defined(OPUS_ARM_MAY_HAVE_MEDIA) \
+ || defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
  __try{
    /*PLD [r13]*/
    __emit(0xF5DDF000);
-    flags|=OPUS_CPU_ARM_EDSP;
+    flags|=OPUS_CPU_ARM_EDSP_FLAG;
  }
  __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
    /*Ignore exception.*/
  }
-#  if defined(ARMv6E_ASM)
+#  if defined(OPUS_ARM_MAY_HAVE_MEDIA) \
+ || defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
  __try{
    /*SHADD8 r3,r3,r3*/
    __emit(0xE6333F93);
-    flags|=OPUS_CPU_ARM_MEDIA;
+    flags|=OPUS_CPU_ARM_MEDIA_FLAG;
  }
  __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
    /*Ignore exception.*/
  }
-#   if defined(ARM_HAVE_NEON)
+#   if defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
  __try{
    /*VORR q0,q0,q0*/
    __emit(0xF2200150);
-    flags|=OPUS_CPU_ARM_NEON;
+    flags|=OPUS_CPU_ARM_NEON_FLAG;
  }
  __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
    /*Ignore exception.*/
@@ -90,7 +94,9 @@ static OPUS_INLINE opus_uint32 opus_cpu_capabilities(void){

 #elif defined(__linux__)
 /* Linux based */
-opus_uint32 opus_cpu_capabilities(void)
+#include <stdio.h>
+
+static opus_uint32 opus_cpu_capabilities(void)
 {
  opus_uint32 flags = 0;
  FILE *cpuinfo;
@@ -107,19 +113,34 @@ opus_uint32 opus_cpu_capabilities(void)

    while(fgets(buf, 512, cpuinfo) != NULL)
    {
+# if defined(OPUS_ARM_MAY_HAVE_EDSP) || defined(OPUS_ARM_MAY_HAVE_MEDIA) \
+ || defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
      /* Search for edsp and neon flag */
      if(memcmp(buf, "Features", 8) == 0)
      {
        char *p;
        p = strstr(buf, " edsp");
        if(p != NULL && (p[5] == ' ' || p[5] == '\n'))
-          flags |= OPUS_CPU_ARM_EDSP;
+          flags |= OPUS_CPU_ARM_EDSP_FLAG;

+#  if defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
        p = strstr(buf, " neon");
        if(p != NULL && (p[5] == ' ' || p[5] == '\n'))
-          flags |= OPUS_CPU_ARM_NEON;
+          flags |= OPUS_CPU_ARM_NEON_FLAG;
+        p = strstr(buf, " asimd");
+        if(p != NULL && (p[6] == ' ' || p[6] == '\n'))
+          flags |= OPUS_CPU_ARM_NEON_FLAG | OPUS_CPU_ARM_MEDIA_FLAG | OPUS_CPU_ARM_EDSP_FLAG;
+#  endif
+#  if defined(OPUS_ARM_MAY_HAVE_DOTPROD)
+        p = strstr(buf, " asimddp");
+        if(p != NULL && (p[8] == ' ' || p[8] == '\n'))
+          flags |= OPUS_CPU_ARM_DOTPROD_FLAG;
+#  endif
      }
+# endif

+# if defined(OPUS_ARM_MAY_HAVE_MEDIA) \
+ || defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
      /* Search for media capabilities (>= ARMv6) */
      if(memcmp(buf, "CPU architecture:", 17) == 0)
      {
@@ -127,14 +148,97 @@ opus_uint32 opus_cpu_capabilities(void)
        version = atoi(buf+17);

        if(version >= 6)
-          flags |= OPUS_CPU_ARM_MEDIA;
+          flags |= OPUS_CPU_ARM_MEDIA_FLAG;
      }
+# endif
    }

+#if defined(OPUS_ARM_PRESUME_AARCH64_NEON_INTR)
+    flags |= OPUS_CPU_ARM_EDSP_FLAG | OPUS_CPU_ARM_MEDIA_FLAG | OPUS_CPU_ARM_NEON_FLAG;
+# if defined(OPUS_ARM_PRESUME_DOTPROD)
+    flags |= OPUS_CPU_ARM_DOTPROD_FLAG;
+# endif
+#endif
+
    fclose(cpuinfo);
  }
  return flags;
 }
+
+#elif defined(__APPLE__)
+#include <sys/types.h>
+#include <sys/sysctl.h>
+
+static opus_uint32 opus_cpu_capabilities(void)
+{
+  opus_uint32 flags = 0;
+
+#if defined(OPUS_ARM_MAY_HAVE_DOTPROD)
+  size_t size = sizeof(uint32_t);
+  uint32_t value = 0;
+  if (!sysctlbyname("hw.optional.arm.FEAT_DotProd", &value, &size, NULL, 0) && value)
+  {
+    flags |= OPUS_CPU_ARM_DOTPROD_FLAG;
+  }
+#endif
+
+#if defined(OPUS_ARM_PRESUME_AARCH64_NEON_INTR)
+  flags |= OPUS_CPU_ARM_EDSP_FLAG | OPUS_CPU_ARM_MEDIA_FLAG | OPUS_CPU_ARM_NEON_FLAG;
+# if defined(OPUS_ARM_PRESUME_DOTPROD)
+  flags |= OPUS_CPU_ARM_DOTPROD_FLAG;
+# endif
+#endif
+  return flags;
+}
+
+#elif defined(__FreeBSD__)
+#include <sys/auxv.h>
+
+static opus_uint32 opus_cpu_capabilities(void)
+{
+  long hwcap = 0;
+  opus_uint32 flags = 0;
+
+# if defined(OPUS_ARM_MAY_HAVE_MEDIA) \
+ || defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+  /* FreeBSD requires armv6+, which always supports media instructions */
+  flags |= OPUS_CPU_ARM_MEDIA_FLAG;
+# endif
+
+  elf_aux_info(AT_HWCAP, &hwcap, sizeof hwcap);
+
+# if defined(OPUS_ARM_MAY_HAVE_EDSP) || defined(OPUS_ARM_MAY_HAVE_MEDIA) \
+ || defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+#  ifdef HWCAP_EDSP
+  if (hwcap & HWCAP_EDSP)
+    flags |= OPUS_CPU_ARM_EDSP_FLAG;
+#  endif
+
+#  if defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+#   ifdef HWCAP_NEON
+  if (hwcap & HWCAP_NEON)
+    flags |= OPUS_CPU_ARM_NEON_FLAG;
+#   elif defined(HWCAP_ASIMD)
+  if (hwcap & HWCAP_ASIMD)
+    flags |= OPUS_CPU_ARM_NEON_FLAG | OPUS_CPU_ARM_MEDIA_FLAG | OPUS_CPU_ARM_EDSP_FLAG;
+#   endif
+#  endif
+#  if defined(OPUS_ARM_MAY_HAVE_DOTPROD) && defined(HWCAP_ASIMDDP)
+  if (hwcap & HWCAP_ASIMDDP)
+    flags |= OPUS_CPU_ARM_DOTPROD_FLAG;
+#  endif
+# endif
+
+#if defined(OPUS_ARM_PRESUME_AARCH64_NEON_INTR)
+    flags |= OPUS_CPU_ARM_EDSP_FLAG | OPUS_CPU_ARM_MEDIA_FLAG | OPUS_CPU_ARM_NEON_FLAG;
+# if defined(OPUS_ARM_PRESUME_DOTPROD)
+    flags |= OPUS_CPU_ARM_DOTPROD_FLAG;
+# endif
+#endif
+
+  return (flags);
+}
+
 #else
 /* The feature registers which can tell us what the processor supports are
 * accessible in priveleged modes only, so we can't have a general user-space
@@ -143,24 +247,45 @@ opus_uint32 opus_cpu_capabilities(void)
   "your platform.  Reconfigure with --disable-rtcd (or send patches)."
 #endif

-int opus_select_arch(void)
+static int opus_select_arch_impl(void)
 {
  opus_uint32 flags = opus_cpu_capabilities();
  int arch = 0;

-  if(!(flags & OPUS_CPU_ARM_EDSP))
+  if(!(flags & OPUS_CPU_ARM_EDSP_FLAG)) {
+    /* Asserts ensure arch values are sequential */
+    celt_assert(arch == OPUS_ARCH_ARM_V4);
+    return arch;
+  }
+  arch++;
+
+  if(!(flags & OPUS_CPU_ARM_MEDIA_FLAG)) {
+    celt_assert(arch == OPUS_ARCH_ARM_EDSP);
    return arch;
+  }
  arch++;

-  if(!(flags & OPUS_CPU_ARM_MEDIA))
+  if(!(flags & OPUS_CPU_ARM_NEON_FLAG)) {
+    celt_assert(arch == OPUS_ARCH_ARM_MEDIA);
    return arch;
+  }
  arch++;

-  if(!(flags & OPUS_CPU_ARM_NEON))
+  if(!(flags & OPUS_CPU_ARM_DOTPROD_FLAG)) {
+    celt_assert(arch == OPUS_ARCH_ARM_NEON);
    return arch;
+  }
  arch++;

+  celt_assert(arch == OPUS_ARCH_ARM_DOTPROD);
  return arch;
 }

+int opus_select_arch(void) {
+  int arch = opus_select_arch_impl();
+#ifdef FUZZING
+  arch = rand()%(arch+1);
+#endif
+  return arch;
+}
 #endif
--- a/celt/arm/armcpu.h
+++ b/celt/arm/armcpu.h
@@ -25,11 +25,66 @@
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

-/* Original code from libtheora modified to suit to Opus */
+#if !defined(ARMCPU_H)
+# define ARMCPU_H

-#ifndef ARMCPU_H
-#define ARMCPU_H
+# if defined(OPUS_ARM_MAY_HAVE_EDSP)
+#  define MAY_HAVE_EDSP(name) name ## _edsp
+# else
+#  define MAY_HAVE_EDSP(name) name ## _c
+# endif

+# if defined(OPUS_ARM_MAY_HAVE_MEDIA)
+#  define MAY_HAVE_MEDIA(name) name ## _media
+# else
+#  define MAY_HAVE_MEDIA(name) MAY_HAVE_EDSP(name)
+# endif
+
+# if defined(OPUS_ARM_MAY_HAVE_NEON)
+#  define MAY_HAVE_NEON(name) name ## _neon
+# else
+#  define MAY_HAVE_NEON(name) MAY_HAVE_MEDIA(name)
+# endif
+
+# if defined(OPUS_ARM_MAY_HAVE_DOTPROD)
+#  define MAY_HAVE_DOTPROD(name) name ## _dotprod
+# else
+#  define MAY_HAVE_DOTPROD(name) MAY_HAVE_NEON(name)
+# endif
+
+# if defined(OPUS_ARM_PRESUME_EDSP)
+#  define PRESUME_EDSP(name) name ## _edsp
+# else
+#  define PRESUME_EDSP(name) name ## _c
+# endif
+
+# if defined(OPUS_ARM_PRESUME_MEDIA)
+#  define PRESUME_MEDIA(name) name ## _media
+# else
+#  define PRESUME_MEDIA(name) PRESUME_EDSP(name)
+# endif
+
+# if defined(OPUS_ARM_PRESUME_NEON)
+#  define PRESUME_NEON(name) name ## _neon
+# else
+#  define PRESUME_NEON(name) PRESUME_MEDIA(name)
+# endif
+
+# if defined(OPUS_ARM_PRESUME_DOTPROD)
+#  define PRESUME_DOTPROD(name) name ## _dotprod
+# else
+#  define PRESUME_DOTPROD(name) PRESUME_NEON(name)
+# endif
+
+# if defined(OPUS_HAVE_RTCD)
 int opus_select_arch(void);

+#define OPUS_ARCH_ARM_V4    (0)
+#define OPUS_ARCH_ARM_EDSP  (1)
+#define OPUS_ARCH_ARM_MEDIA (2)
+#define OPUS_ARCH_ARM_NEON  (3)
+#define OPUS_ARCH_ARM_DOTPROD  (4)
+
+# endif
+
 #endif
--- a/celt/arm/armopts.s.in
+++ b/celt/arm/armopts.s.in
+/* Copyright (C) 2013 Mozilla Corporation */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+; Set the following to 1 if we have EDSP instructions
+;  (LDRD/STRD, etc., ARMv5E and later).
+OPUS_ARM_MAY_HAVE_EDSP  * @OPUS_ARM_MAY_HAVE_EDSP@
+
+; Set the following to 1 if we have ARMv6 media instructions.
+OPUS_ARM_MAY_HAVE_MEDIA * @OPUS_ARM_MAY_HAVE_MEDIA@
+
+; Set the following to 1 if we have NEON (some ARMv7)
+OPUS_ARM_MAY_HAVE_NEON  * @OPUS_ARM_MAY_HAVE_NEON@
+
+END
--- a/celt/arm/celt_fft_ne10.c
+++ b/celt/arm/celt_fft_ne10.c
+/* Copyright (c) 2015 Xiph.Org Foundation
+   Written by Viswanath Puttagunta */
+/**
+   @file celt_fft_ne10.c
+   @brief ARM Neon optimizations for fft using NE10 library
+ */
+
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef SKIP_CONFIG_H
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+#endif
+
+#include <NE10_dsp.h>
+#include "os_support.h"
+#include "kiss_fft.h"
+#include "stack_alloc.h"
+
+#if !defined(FIXED_POINT)
+# define NE10_FFT_ALLOC_C2C_TYPE_NEON ne10_fft_alloc_c2c_float32_neon
+# define NE10_FFT_CFG_TYPE_T ne10_fft_cfg_float32_t
+# define NE10_FFT_STATE_TYPE_T ne10_fft_state_float32_t
+# define NE10_FFT_DESTROY_C2C_TYPE ne10_fft_destroy_c2c_float32
+# define NE10_FFT_CPX_TYPE_T ne10_fft_cpx_float32_t
+# define NE10_FFT_C2C_1D_TYPE_NEON ne10_fft_c2c_1d_float32_neon
+#else
+# define NE10_FFT_ALLOC_C2C_TYPE_NEON(nfft) ne10_fft_alloc_c2c_int32_neon(nfft)
+# define NE10_FFT_CFG_TYPE_T ne10_fft_cfg_int32_t
+# define NE10_FFT_STATE_TYPE_T ne10_fft_state_int32_t
+# define NE10_FFT_DESTROY_C2C_TYPE ne10_fft_destroy_c2c_int32
+# define NE10_FFT_DESTROY_C2C_TYPE ne10_fft_destroy_c2c_int32
+# define NE10_FFT_CPX_TYPE_T ne10_fft_cpx_int32_t
+# define NE10_FFT_C2C_1D_TYPE_NEON ne10_fft_c2c_1d_int32_neon
+#endif
+
+#if defined(CUSTOM_MODES)
+
+/* nfft lengths in NE10 that support scaled fft */
+# define NE10_FFTSCALED_SUPPORT_MAX 4
+static const int ne10_fft_scaled_support[NE10_FFTSCALED_SUPPORT_MAX] = {
+   480, 240, 120, 60
+};
+
+int opus_fft_alloc_arm_neon(kiss_fft_state *st)
+{
+   int i;
+   size_t memneeded = sizeof(struct arch_fft_state);
+
+   st->arch_fft = (arch_fft_state *)opus_alloc(memneeded);
+   if (!st->arch_fft)
+      return -1;
+
+   for (i = 0; i < NE10_FFTSCALED_SUPPORT_MAX; i++) {
+      if(st->nfft == ne10_fft_scaled_support[i])
+         break;
+   }
+   if (i == NE10_FFTSCALED_SUPPORT_MAX) {
+      /* This nfft length (scaled fft) is not supported in NE10 */
+      st->arch_fft->is_supported = 0;
+      st->arch_fft->priv = NULL;
+   }
+   else {
+      st->arch_fft->is_supported = 1;
+      st->arch_fft->priv = (void *)NE10_FFT_ALLOC_C2C_TYPE_NEON(st->nfft);
+      if (st->arch_fft->priv == NULL) {
+         return -1;
+      }
+   }
+   return 0;
+}
+
+void opus_fft_free_arm_neon(kiss_fft_state *st)
+{
+   NE10_FFT_CFG_TYPE_T cfg;
+
+   if (!st->arch_fft)
+      return;
+
+   cfg = (NE10_FFT_CFG_TYPE_T)st->arch_fft->priv;
+   if (cfg)
+      NE10_FFT_DESTROY_C2C_TYPE(cfg);
+   opus_free(st->arch_fft);
+}
+#endif
+
+void opus_fft_neon(const kiss_fft_state *st,
+                   const kiss_fft_cpx *fin,
+                   kiss_fft_cpx *fout)
+{
+   NE10_FFT_STATE_TYPE_T state;
+   NE10_FFT_CFG_TYPE_T cfg = &state;
+   VARDECL(NE10_FFT_CPX_TYPE_T, buffer);
+   SAVE_STACK;
+   ALLOC(buffer, st->nfft, NE10_FFT_CPX_TYPE_T);
+
+   if (!st->arch_fft->is_supported) {
+      /* This nfft length (scaled fft) not supported in NE10 */
+      opus_fft_c(st, fin, fout);
+   }
+   else {
+      memcpy((void *)cfg, st->arch_fft->priv, sizeof(NE10_FFT_STATE_TYPE_T));
+      state.buffer = (NE10_FFT_CPX_TYPE_T *)&buffer[0];
+#if !defined(FIXED_POINT)
+      state.is_forward_scaled = 1;
+
+      NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,
+                                (NE10_FFT_CPX_TYPE_T *)fin,
+                                cfg, 0);
+#else
+      NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,
+                                (NE10_FFT_CPX_TYPE_T *)fin,
+                                cfg, 0, 1);
+#endif
+   }
+   RESTORE_STACK;
+}
+
+void opus_ifft_neon(const kiss_fft_state *st,
+                    const kiss_fft_cpx *fin,
+                    kiss_fft_cpx *fout)
+{
+   NE10_FFT_STATE_TYPE_T state;
+   NE10_FFT_CFG_TYPE_T cfg = &state;
+   VARDECL(NE10_FFT_CPX_TYPE_T, buffer);
+   SAVE_STACK;
+   ALLOC(buffer, st->nfft, NE10_FFT_CPX_TYPE_T);
+
+   if (!st->arch_fft->is_supported) {
+      /* This nfft length (scaled fft) not supported in NE10 */
+      opus_ifft_c(st, fin, fout);
+   }
+   else {
+      memcpy((void *)cfg, st->arch_fft->priv, sizeof(NE10_FFT_STATE_TYPE_T));
+      state.buffer = (NE10_FFT_CPX_TYPE_T *)&buffer[0];
+#if !defined(FIXED_POINT)
+      state.is_backward_scaled = 0;
+
+      NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,
+                                (NE10_FFT_CPX_TYPE_T *)fin,
+                                cfg, 1);
+#else
+      NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,
+                                (NE10_FFT_CPX_TYPE_T *)fin,
+                                cfg, 1, 0);
+#endif
+   }
+   RESTORE_STACK;
+}
--- a/celt/arm/celt_mdct_ne10.c
+++ b/celt/arm/celt_mdct_ne10.c
+/* Copyright (c) 2015 Xiph.Org Foundation
+   Written by Viswanath Puttagunta */
+/**
+   @file celt_mdct_ne10.c
+   @brief ARM Neon optimizations for mdct using NE10 library
+ */
+
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef SKIP_CONFIG_H
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+#endif
+
+#include "kiss_fft.h"
+#include "_kiss_fft_guts.h"
+#include "mdct.h"
+#include "stack_alloc.h"
+
+void clt_mdct_forward_neon(const mdct_lookup *l,
+                           kiss_fft_scalar *in,
+                           kiss_fft_scalar * OPUS_RESTRICT out,
+                           const opus_val16 *window,
+                           int overlap, int shift, int stride, int arch)
+{
+   int i;
+   int N, N2, N4;
+   VARDECL(kiss_fft_scalar, f);
+   VARDECL(kiss_fft_cpx, f2);
+   const kiss_fft_state *st = l->kfft[shift];
+   const kiss_twiddle_scalar *trig;
+
+   SAVE_STACK;
+
+   N = l->n;
+   trig = l->trig;
+   for (i=0;i<shift;i++)
+   {
+      N >>= 1;
+      trig += N;
+   }
+   N2 = N>>1;
+   N4 = N>>2;
+
+   ALLOC(f, N2, kiss_fft_scalar);
+   ALLOC(f2, N4, kiss_fft_cpx);
+
+   /* Consider the input to be composed of four blocks: [a, b, c, d] */
+   /* Window, shuffle, fold */
+   {
+      /* Temp pointers to make it really clear to the compiler what we're doing */
+      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1);
+      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1);
+      kiss_fft_scalar * OPUS_RESTRICT yp = f;
+      const opus_val16 * OPUS_RESTRICT wp1 = window+(overlap>>1);
+      const opus_val16 * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
+      for(i=0;i<((overlap+3)>>2);i++)
+      {
+         /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
+         *yp++ = MULT16_32_Q15(*wp2, xp1[N2]) + MULT16_32_Q15(*wp1,*xp2);
+         *yp++ = MULT16_32_Q15(*wp1, *xp1)    - MULT16_32_Q15(*wp2, xp2[-N2]);
+         xp1+=2;
+         xp2-=2;
+         wp1+=2;
+         wp2-=2;
+      }
+      wp1 = window;
+      wp2 = window+overlap-1;
+      for(;i<N4-((overlap+3)>>2);i++)
+      {
+         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
+         *yp++ = *xp2;
+         *yp++ = *xp1;
+         xp1+=2;
+         xp2-=2;
+      }
+      for(;i<N4;i++)
+      {
+         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
+         *yp++ =  -MULT16_32_Q15(*wp1, xp1[-N2]) + MULT16_32_Q15(*wp2, *xp2);
+         *yp++ = MULT16_32_Q15(*wp2, *xp1)     + MULT16_32_Q15(*wp1, xp2[N2]);
+         xp1+=2;
+         xp2-=2;
+         wp1+=2;
+         wp2-=2;
+      }
+   }
+   /* Pre-rotation */
+   {
+      kiss_fft_scalar * OPUS_RESTRICT yp = f;
+      const kiss_twiddle_scalar *t = &trig[0];
+      for(i=0;i<N4;i++)
+      {
+         kiss_fft_cpx yc;
+         kiss_twiddle_scalar t0, t1;
+         kiss_fft_scalar re, im, yr, yi;
+         t0 = t[i];
+         t1 = t[N4+i];
+         re = *yp++;
+         im = *yp++;
+         yr = S_MUL(re,t0)  -  S_MUL(im,t1);
+         yi = S_MUL(im,t0)  +  S_MUL(re,t1);
+         yc.r = yr;
+         yc.i = yi;
+         f2[i] = yc;
+      }
+   }
+
+   opus_fft(st, f2, (kiss_fft_cpx *)f, arch);
+
+   /* Post-rotate */
+   {
+      /* Temp pointers to make it really clear to the compiler what we're doing */
+      const kiss_fft_cpx * OPUS_RESTRICT fp = (kiss_fft_cpx *)f;
+      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
+      kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
+      const kiss_twiddle_scalar *t = &trig[0];
+      /* Temp pointers to make it really clear to the compiler what we're doing */
+      for(i=0;i<N4;i++)
+      {
+         kiss_fft_scalar yr, yi;
+         yr = S_MUL(fp->i,t[N4+i]) - S_MUL(fp->r,t[i]);
+         yi = S_MUL(fp->r,t[N4+i]) + S_MUL(fp->i,t[i]);
+         *yp1 = yr;
+         *yp2 = yi;
+         fp++;
+         yp1 += 2*stride;
+         yp2 -= 2*stride;
+      }
+   }
+   RESTORE_STACK;
+}
+
+void clt_mdct_backward_neon(const mdct_lookup *l,
+                            kiss_fft_scalar *in,
+                            kiss_fft_scalar * OPUS_RESTRICT out,
+                            const opus_val16 * OPUS_RESTRICT window,
+                            int overlap, int shift, int stride, int arch)
+{
+   int i;
+   int N, N2, N4;
+   VARDECL(kiss_fft_scalar, f);
+   const kiss_twiddle_scalar *trig;
+   const kiss_fft_state *st = l->kfft[shift];
+
+   N = l->n;
+   trig = l->trig;
+   for (i=0;i<shift;i++)
+   {
+      N >>= 1;
+      trig += N;
+   }
+   N2 = N>>1;
+   N4 = N>>2;
+
+   ALLOC(f, N2, kiss_fft_scalar);
+
+   /* Pre-rotate */
+   {
+      /* Temp pointers to make it really clear to the compiler what we're doing */
+      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
+      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
+      kiss_fft_scalar * OPUS_RESTRICT yp = f;
+      const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0];
+      for(i=0;i<N4;i++)
+      {
+         kiss_fft_scalar yr, yi;
+         yr = S_MUL(*xp2, t[i]) + S_MUL(*xp1, t[N4+i]);
+         yi = S_MUL(*xp1, t[i]) - S_MUL(*xp2, t[N4+i]);
+         yp[2*i] = yr;
+         yp[2*i+1] = yi;
+         xp1+=2*stride;
+         xp2-=2*stride;
+      }
+   }
+
+   opus_ifft(st, (kiss_fft_cpx *)f, (kiss_fft_cpx*)(out+(overlap>>1)), arch);
+
+   /* Post-rotate and de-shuffle from both ends of the buffer at once to make
+      it in-place. */
+   {
+      kiss_fft_scalar * yp0 = out+(overlap>>1);
+      kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2;
+      const kiss_twiddle_scalar *t = &trig[0];
+      /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the
+         middle pair will be computed twice. */
+      for(i=0;i<(N4+1)>>1;i++)
+      {
+         kiss_fft_scalar re, im, yr, yi;
+         kiss_twiddle_scalar t0, t1;
+         re = yp0[0];
+         im = yp0[1];
+         t0 = t[i];
+         t1 = t[N4+i];
+         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
+         yr = S_MUL(re,t0) + S_MUL(im,t1);
+         yi = S_MUL(re,t1) - S_MUL(im,t0);
+         re = yp1[0];
+         im = yp1[1];
+         yp0[0] = yr;
+         yp1[1] = yi;
+
+         t0 = t[(N4-i-1)];
+         t1 = t[(N2-i-1)];
+         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
+         yr = S_MUL(re,t0) + S_MUL(im,t1);
+         yi = S_MUL(re,t1) - S_MUL(im,t0);
+         yp1[0] = yr;
+         yp0[1] = yi;
+         yp0 += 2;
+         yp1 -= 2;
+      }
+   }
+
+   /* Mirror on both sides for TDAC */
+   {
+      kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1;
+      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
+      const opus_val16 * OPUS_RESTRICT wp1 = window;
+      const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1;
+
+      for(i = 0; i < overlap/2; i++)
+      {
+         kiss_fft_scalar x1, x2;
+         x1 = *xp1;
+         x2 = *yp1;
+         *yp1++ = MULT16_32_Q15(*wp2, x2) - MULT16_32_Q15(*wp1, x1);
+         *xp1-- = MULT16_32_Q15(*wp1, x2) + MULT16_32_Q15(*wp2, x1);
+         wp1++;
+         wp2--;
+      }
+   }
+   RESTORE_STACK;
+}
--- a/celt/arm/celt_neon_intr.c
+++ b/celt/arm/celt_neon_intr.c
+/* Copyright (c) 2014-2015 Xiph.Org Foundation
+   Copyright (c) 2024 Arm Limited
+   Written by Viswanath Puttagunta */
+/**
+   @file celt_neon_intr.c
+   @brief ARM Neon Intrinsic optimizations for celt
+ */
+
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <arm_neon.h>
+#include "../float_cast.h"
+#include "../mathops.h"
+#include "../pitch.h"
+#if defined(OPUS_CHECK_ASM)
+#include <stdlib.h>
+#endif
+
+#if !defined(DISABLE_FLOAT_API) && defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+
+void celt_float2int16_neon(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt)
+{
+   int i = 0;
+
+#if defined(__ARM_NEON)
+   const int BLOCK_SIZE = 16;
+   const int blockedSize = cnt / BLOCK_SIZE * BLOCK_SIZE;
+
+   for (; i < blockedSize; i += BLOCK_SIZE)
+   {
+      float32x4_t orig_a = vld1q_f32(&in[i +  0]);
+      float32x4_t orig_b = vld1q_f32(&in[i +  4]);
+      float32x4_t orig_c = vld1q_f32(&in[i +  8]);
+      float32x4_t orig_d = vld1q_f32(&in[i + 12]);
+
+      int16x4_t asShort_a = vqmovn_s32(vroundf(vmulq_n_f32(orig_a, CELT_SIG_SCALE)));
+      int16x4_t asShort_b = vqmovn_s32(vroundf(vmulq_n_f32(orig_b, CELT_SIG_SCALE)));
+      int16x4_t asShort_c = vqmovn_s32(vroundf(vmulq_n_f32(orig_c, CELT_SIG_SCALE)));
+      int16x4_t asShort_d = vqmovn_s32(vroundf(vmulq_n_f32(orig_d, CELT_SIG_SCALE)));
+
+      vst1_s16(&out[i +  0], asShort_a);
+      vst1_s16(&out[i +  4], asShort_b);
+      vst1_s16(&out[i +  8], asShort_c);
+      vst1_s16(&out[i + 12], asShort_d);
+# if defined(OPUS_CHECK_ASM)
+      short out_c[BLOCK_SIZE];
+      int j;
+      for(j = 0; j < BLOCK_SIZE; j++)
+      {
+         out_c[j] = FLOAT2INT16(in[i + j]);
+         celt_assert(abs((out_c[j] - out[i + j])) <= 1);
+      }
+# endif
+   }
+#endif
+
+   for (; i < cnt; i++)
+   {
+      out[i] = FLOAT2INT16(in[i]);
+   }
+}
+#endif
+
+#if defined(FIXED_POINT)
+#include <string.h>
+
+void xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len)
+{
+   int j;
+   int32x4_t a = vld1q_s32(sum);
+   /* Load y[0...3] */
+   /* This requires len>0 to always be valid (which we assert in the C code). */
+   int16x4_t y0 = vld1_s16(y);
+   y += 4;
+
+   /* This loop loads one y value more than we actually need.
+      Therefore we have to stop as soon as there are 8 or fewer samples left
+       (instead of 7), to avoid reading past the end of the array. */
+   for (j = 0; j + 8 < len; j += 8)
+   {
+      /* Load x[0...7] */
+      int16x8_t xx = vld1q_s16(x);
+      int16x4_t x0 = vget_low_s16(xx);
+      int16x4_t x4 = vget_high_s16(xx);
+      /* Load y[4...11] */
+      int16x8_t yy = vld1q_s16(y);
+      int16x4_t y4 = vget_low_s16(yy);
+      int16x4_t y8 = vget_high_s16(yy);
+      int32x4_t a0 = vmlal_lane_s16(a, y0, x0, 0);
+      int32x4_t a1 = vmlal_lane_s16(a0, y4, x4, 0);
+
+      int16x4_t y1 = vext_s16(y0, y4, 1);
+      int16x4_t y5 = vext_s16(y4, y8, 1);
+      int32x4_t a2 = vmlal_lane_s16(a1, y1, x0, 1);
+      int32x4_t a3 = vmlal_lane_s16(a2, y5, x4, 1);
+
+      int16x4_t y2 = vext_s16(y0, y4, 2);
+      int16x4_t y6 = vext_s16(y4, y8, 2);
+      int32x4_t a4 = vmlal_lane_s16(a3, y2, x0, 2);
+      int32x4_t a5 = vmlal_lane_s16(a4, y6, x4, 2);
+
+      int16x4_t y3 = vext_s16(y0, y4, 3);
+      int16x4_t y7 = vext_s16(y4, y8, 3);
+      int32x4_t a6 = vmlal_lane_s16(a5, y3, x0, 3);
+      int32x4_t a7 = vmlal_lane_s16(a6, y7, x4, 3);
+
+      y0 = y8;
+      a = a7;
+      x += 8;
+      y += 8;
+   }
+   if (j + 4 < len) {
+      /* Load x[0...3] */
+      int16x4_t x0 = vld1_s16(x);
+      /* Load y[4...7] */
+      int16x4_t y4 = vld1_s16(y);
+      int32x4_t a0 = vmlal_lane_s16(a, y0, x0, 0);
+      int16x4_t y1 = vext_s16(y0, y4, 1);
+      int32x4_t a1 = vmlal_lane_s16(a0, y1, x0, 1);
+      int16x4_t y2 = vext_s16(y0, y4, 2);
+      int32x4_t a2 = vmlal_lane_s16(a1, y2, x0, 2);
+      int16x4_t y3 = vext_s16(y0, y4, 3);
+      int32x4_t a3 = vmlal_lane_s16(a2, y3, x0, 3);
+      y0 = y4;
+      a = a3;
+      x += 4;
+      y += 4;
+      j += 4;
+   }
+   if (j + 2 < len) {
+      /* Load x[0...1] */
+      int16x4x2_t xx = vld2_dup_s16(x);
+      int16x4_t x0 = xx.val[0];
+      int16x4_t x1 = xx.val[1];
+      /* Load y[4...5].
+         We would like to use vld1_dup_s32(), but casting the pointer would
+          break strict aliasing rules and potentially have alignment issues.
+         Fortunately the compiler seems capable of translating this memcpy()
+          and vdup_n_s32() into the equivalent vld1_dup_s32().*/
+      int32_t yy;
+      memcpy(&yy, y, sizeof(yy));
+      int16x4_t y4 = vreinterpret_s16_s32(vdup_n_s32(yy));
+      int32x4_t a0 = vmlal_s16(a, y0, x0);
+      int16x4_t y1 = vext_s16(y0, y4, 1);
+      /* Replace bottom copy of {y[5], y[4]} in y4 with {y[3], y[2]} from y0,
+          using VSRI instead of VEXT, since it's a data-processing
+          instruction. */
+      y0 = vreinterpret_s16_s64(vsri_n_s64(vreinterpret_s64_s16(y4),
+       vreinterpret_s64_s16(y0), 32));
+      int32x4_t a1 = vmlal_s16(a0, y1, x1);
+      a = a1;
+      x += 2;
+      y += 2;
+      j += 2;
+   }
+   if (j + 1 < len) {
+      /* Load next x. */
+      int16x4_t x0 = vld1_dup_s16(x);
+      int32x4_t a0 = vmlal_s16(a, y0, x0);
+      /* Load last y. */
+      int16x4_t y4 = vld1_dup_s16(y);
+      y0 = vreinterpret_s16_s64(vsri_n_s64(vreinterpret_s64_s16(y4),
+       vreinterpret_s64_s16(y0), 16));
+      a = a0;
+      x++;
+   }
+   /* Load last x. */
+   int16x4_t x0 = vld1_dup_s16(x);
+   int32x4_t a0 = vmlal_s16(a, y0, x0);
+   vst1q_s32(sum, a0);
+}
+
+#else
+
+#if defined(__ARM_FEATURE_FMA) && defined(__ARM_ARCH_ISA_A64)
+/* If we can, force the compiler to use an FMA instruction rather than break
+ *    vmlaq_f32() into fmul/fadd. */
+#ifdef vmlaq_lane_f32
+#undef vmlaq_lane_f32
+#endif
+#define vmlaq_lane_f32(a,b,c,lane) vfmaq_lane_f32(a,b,c,lane)
+#endif
+
+
+/*
+ * Function: xcorr_kernel_neon_float
+ * ---------------------------------
+ * Computes 4 correlation values and stores them in sum[4]
+ */
+static void xcorr_kernel_neon_float(const float32_t *x, const float32_t *y,
+      float32_t sum[4], int len) {
+   float32x4_t YY[3];
+   float32x4_t YEXT[3];
+   float32x4_t XX[2];
+   float32x2_t XX_2;
+   float32x4_t SUMM;
+   const float32_t *xi = x;
+   const float32_t *yi = y;
+
+   celt_assert(len>0);
+
+   YY[0] = vld1q_f32(yi);
+   SUMM = vdupq_n_f32(0);
+
+   /* Consume 8 elements in x vector and 12 elements in y
+    * vector. However, the 12'th element never really gets
+    * touched in this loop. So, if len == 8, then we only
+    * must access y[0] to y[10]. y[11] must not be accessed
+    * hence make sure len > 8 and not len >= 8
+    */
+   while (len > 8) {
+      yi += 4;
+      YY[1] = vld1q_f32(yi);
+      yi += 4;
+      YY[2] = vld1q_f32(yi);
+
+      XX[0] = vld1q_f32(xi);
+      xi += 4;
+      XX[1] = vld1q_f32(xi);
+      xi += 4;
+
+      SUMM = vmlaq_lane_f32(SUMM, YY[0], vget_low_f32(XX[0]), 0);
+      YEXT[0] = vextq_f32(YY[0], YY[1], 1);
+      SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[0]), 1);
+      YEXT[1] = vextq_f32(YY[0], YY[1], 2);
+      SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[0]), 0);
+      YEXT[2] = vextq_f32(YY[0], YY[1], 3);
+      SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[0]), 1);
+
+      SUMM = vmlaq_lane_f32(SUMM, YY[1], vget_low_f32(XX[1]), 0);
+      YEXT[0] = vextq_f32(YY[1], YY[2], 1);
+      SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[1]), 1);
+      YEXT[1] = vextq_f32(YY[1], YY[2], 2);
+      SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[1]), 0);
+      YEXT[2] = vextq_f32(YY[1], YY[2], 3);
+      SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[1]), 1);
+
+      YY[0] = YY[2];
+      len -= 8;
+   }
+
+   /* Consume 4 elements in x vector and 8 elements in y
+    * vector. However, the 8'th element in y never really gets
+    * touched in this loop. So, if len == 4, then we only
+    * must access y[0] to y[6]. y[7] must not be accessed
+    * hence make sure len>4 and not len>=4
+    */
+   if (len > 4) {
+      yi += 4;
+      YY[1] = vld1q_f32(yi);
+
+      XX[0] = vld1q_f32(xi);
+      xi += 4;
+
+      SUMM = vmlaq_lane_f32(SUMM, YY[0], vget_low_f32(XX[0]), 0);
+      YEXT[0] = vextq_f32(YY[0], YY[1], 1);
+      SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[0]), 1);
+      YEXT[1] = vextq_f32(YY[0], YY[1], 2);
+      SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[0]), 0);
+      YEXT[2] = vextq_f32(YY[0], YY[1], 3);
+      SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[0]), 1);
+
+      YY[0] = YY[1];
+      len -= 4;
+   }
+
+   while (--len > 0) {
+      XX_2 = vld1_dup_f32(xi++);
+      SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0);
+      YY[0]= vld1q_f32(++yi);
+   }
+
+   XX_2 = vld1_dup_f32(xi);
+   SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0);
+
+   vst1q_f32(sum, SUMM);
+}
+
+void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
+                        opus_val32 *xcorr, int len, int max_pitch, int arch) {
+   int i;
+   (void)arch;
+   celt_assert(max_pitch > 0);
+   celt_sig_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);
+
+   for (i = 0; i < (max_pitch-3); i += 4) {
+      xcorr_kernel_neon_float((const float32_t *)_x, (const float32_t *)_y+i,
+            (float32_t *)xcorr+i, len);
+   }
+
+   /* In case max_pitch isn't a multiple of 4, do non-unrolled version. */
+   for (; i < max_pitch; i++) {
+      xcorr[i] = celt_inner_prod_neon(_x, _y+i, len);
+   }
+}
+#endif
--- a/celt/arm/celt_pitch_xcorr_arm.s
+++ b/celt/arm/celt_pitch_xcorr_arm.s
--- a/celt/arm/fft_arm.h
+++ b/celt/arm/fft_arm.h
+/* Copyright (c) 2015 Xiph.Org Foundation
+   Written by Viswanath Puttagunta */
+/**
+   @file fft_arm.h
+   @brief ARM Neon Intrinsic optimizations for fft using NE10 library
+ */
+
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
+#if !defined(FFT_ARM_H)
+#define FFT_ARM_H
+
+#include "kiss_fft.h"
+
+#if defined(HAVE_ARM_NE10)
+
+int opus_fft_alloc_arm_neon(kiss_fft_state *st);
+void opus_fft_free_arm_neon(kiss_fft_state *st);
+
+void opus_fft_neon(const kiss_fft_state *st,
+                   const kiss_fft_cpx *fin,
+                   kiss_fft_cpx *fout);
+
+void opus_ifft_neon(const kiss_fft_state *st,
+                    const kiss_fft_cpx *fin,
+                    kiss_fft_cpx *fout);
+
+#if !defined(OPUS_HAVE_RTCD)
+#define OVERRIDE_OPUS_FFT (1)
+
+#define opus_fft_alloc_arch(_st, arch) \
+   ((void)(arch), opus_fft_alloc_arm_neon(_st))
+
+#define opus_fft_free_arch(_st, arch) \
+   ((void)(arch), opus_fft_free_arm_neon(_st))
+
+#define opus_fft(_st, _fin, _fout, arch) \
+   ((void)(arch), opus_fft_neon(_st, _fin, _fout))
+
+#define opus_ifft(_st, _fin, _fout, arch) \
+   ((void)(arch), opus_ifft_neon(_st, _fin, _fout))
+
+#endif /* OPUS_HAVE_RTCD */
+
+#endif /* HAVE_ARM_NE10 */
+
+#endif
--- a/celt/arm/fixed_arm64.h
+++ b/celt/arm/fixed_arm64.h
+/* Copyright (C) 2015 Vidyo */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef FIXED_ARM64_H
+#define FIXED_ARM64_H
+
+#include <arm_neon.h>
+
+#undef SIG2WORD16
+#define SIG2WORD16(x) (vqmovns_s32(PSHR32((x), SIG_SHIFT)))
+
+#endif
--- a/celt/arm/fixed_armv4.h
+++ b/celt/arm/fixed_armv4.h
--- a/celt/arm/fixed_armv5e.h
+++ b/celt/arm/fixed_armv5e.h
--- a/celt/arm/mathops_arm.h
+++ b/celt/arm/mathops_arm.h
--- a/celt/arm/mdct_arm.h
+++ b/celt/arm/mdct_arm.h
--- a/celt/arm/meson.build
+++ b/celt/arm/meson.build
+arm2gnu = [find_program('arm2gnu.pl')] + arm2gnu_args
+celt_sources_arm_asm = configure_file(input: 'celt_pitch_xcorr_arm.s',
+  output: '@BASENAME@-gnu.S',
+  command: arm2gnu + ['@INPUT@'],
+  capture: true)
+celt_arm_armopts_s_in = configure_file(input: 'armopts.s.in',
+  output: 'armopts.s',
+  configuration: opus_conf)
+celt_arm_armopts_s = configure_file(input: [celt_arm_armopts_s_in],
+  output: '@BASENAME@-gnu.S',
+  command: arm2gnu + ['@INPUT@'],
+  capture: true)
--- a/celt/arm/pitch_arm.h
+++ b/celt/arm/pitch_arm.h
--- a/celt/arm/pitch_neon_intr.c
+++ b/celt/arm/pitch_neon_intr.c
--- a/celt/bands.c
+++ b/celt/bands.c
--- a/celt/bands.h
+++ b/celt/bands.h
--- a/celt/celt.c
+++ b/celt/celt.c
No results found