From d4494e6ed7d564cbee3cfdfe41d0c0989c4345ff Mon Sep 17 00:00:00 2001
From: Sandor Zsombor Vegh <sandorzsombor.vegh@arm.com>
Date: Wed, 11 Sep 2024 14:00:32 +0200
Subject: [PATCH] Arm: Speed up FLOAT2INT16 conversion with Neon

Using Neon for float to int conversion, and introducing platform-
specific function for converting an array of float values to int16.
Also adding appropriate unit test.

Signed-off-by: Jean-Marc Valin <jeanmarcv@google.com>
---
 celt/arm/arm_celt_map.c        |  18 +++++-
 celt/arm/celt_neon_intr.c      |  51 +++++++++++++++++
 celt/arm/mathops_arm.h         |  65 +++++++++++++++++++++
 celt/float_cast.h              |   7 +++
 celt/mathops.c                 |  15 +++++
 celt/mathops.h                 |  16 ++++++
 celt/tests/test_unit_mathops.c | 100 ++++++++++++++++++++++++++++++++-
 celt_headers.mk                |   1 +
 src/opus_decoder.c             |   8 ++-
 9 files changed, 277 insertions(+), 4 deletions(-)
 create mode 100644 celt/arm/mathops_arm.h

diff --git a/celt/arm/arm_celt_map.c b/celt/arm/arm_celt_map.c
index cbaea4957..d9980444e 100644
--- a/celt/arm/arm_celt_map.c
+++ b/celt/arm/arm_celt_map.c
@@ -1,5 +1,6 @@
 /* Copyright (c) 2010 Xiph.Org Foundation
- * Copyright (c) 2013 Parrot */
+ * Copyright (c) 2013 Parrot
+ * Copyright (c) 2024 Arm Limited */
 /*
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
@@ -29,12 +30,25 @@
 #include "config.h"
 #endif
 
-#include "pitch.h"
 #include "kiss_fft.h"
+#include "mathops.h"
 #include "mdct.h"
+#include "pitch.h"
 
 #if defined(OPUS_HAVE_RTCD)
 
+# if !defined(DISABLE_FLOAT_API)
+#  if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
+void (*const CELT_FLOAT2INT16_IMPL[OPUS_ARCHMASK+1])(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt) = {
+  celt_float2int16_c,   /* ARMv4 */
+  celt_float2int16_c,   /* EDSP */
+  celt_float2int16_c,   /* Media */
+  celt_float2int16_neon,/* NEON */
+  celt_float2int16_neon /* DOTPROD */
+};
+#  endif
+# endif
+
 # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
 opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y, int N) = {
   celt_inner_prod_c,   /* ARMv4 */
diff --git a/celt/arm/celt_neon_intr.c b/celt/arm/celt_neon_intr.c
index 250f83621..32b6e5ac0 100644
--- a/celt/arm/celt_neon_intr.c
+++ b/celt/arm/celt_neon_intr.c
@@ -1,4 +1,5 @@
 /* Copyright (c) 2014-2015 Xiph.Org Foundation
+   Copyright (c) 2024 Arm Limited
    Written by Viswanath Puttagunta */
 /**
    @file celt_neon_intr.c
@@ -35,7 +36,57 @@
 #endif
 
 #include <arm_neon.h>
+#include "../float_cast.h"
+#include "../mathops.h"
 #include "../pitch.h"
+#if defined(OPUS_CHECK_ASM)
+#include <stdlib.h>
+#endif
+
+#if !defined(DISABLE_FLOAT_API) && defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+
+void celt_float2int16_neon(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt)
+{
+   int i = 0;
+
+#if defined(__ARM_NEON)
+   const int BLOCK_SIZE = 16;
+   const int blockedSize = cnt / BLOCK_SIZE * BLOCK_SIZE;
+
+   for (; i < blockedSize; i += BLOCK_SIZE)
+   {
+      float32x4_t orig_a = vld1q_f32(&in[i +  0]);
+      float32x4_t orig_b = vld1q_f32(&in[i +  4]);
+      float32x4_t orig_c = vld1q_f32(&in[i +  8]);
+      float32x4_t orig_d = vld1q_f32(&in[i + 12]);
+
+      int16x4_t asShort_a = vqmovn_s32(vroundf(vmulq_n_f32(orig_a, CELT_SIG_SCALE)));
+      int16x4_t asShort_b = vqmovn_s32(vroundf(vmulq_n_f32(orig_b, CELT_SIG_SCALE)));
+      int16x4_t asShort_c = vqmovn_s32(vroundf(vmulq_n_f32(orig_c, CELT_SIG_SCALE)));
+      int16x4_t asShort_d = vqmovn_s32(vroundf(vmulq_n_f32(orig_d, CELT_SIG_SCALE)));
+
+      vst1_s16(&out[i +  0], asShort_a);
+      vst1_s16(&out[i +  4], asShort_b);
+      vst1_s16(&out[i +  8], asShort_c);
+      vst1_s16(&out[i + 12], asShort_d);
+# if defined(OPUS_CHECK_ASM)
+      short out_c[BLOCK_SIZE];
+      int j;
+      for(j = 0; j < BLOCK_SIZE; j++)
+      {
+         out_c[j] = FLOAT2INT16(in[i + j]);
+         celt_assert(abs((out_c[j] - out[i + j])) <= 1);
+      }
+# endif
+   }
+#endif
+
+   for (; i < cnt; i++)
+   {
+      out[i] = FLOAT2INT16(in[i]);
+   }
+}
+#endif
 
 #if defined(FIXED_POINT)
 #include <string.h>
diff --git a/celt/arm/mathops_arm.h b/celt/arm/mathops_arm.h
new file mode 100644
index 000000000..ced719d32
--- /dev/null
+++ b/celt/arm/mathops_arm.h
@@ -0,0 +1,65 @@
+/* Copyright (c) 2024 Arm Limited */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if !defined(MATHOPS_ARM_H)
+# define MATHOPS_ARM_H
+
+#include "armcpu.h"
+#include "cpu_support.h"
+#include "opus_defines.h"
+
+# if !defined(DISABLE_FLOAT_API) && defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+
+#include <arm_neon.h>
+
+static inline int32x4_t vroundf(float32x4_t x)
+{
+#  if defined(__aarch64__) || (defined(__ARM_ARCH) && __ARM_ARCH >= 8)
+    return vcvtaq_s32_f32(x);
+#  else
+    uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), vdupq_n_u32(0x80000000));
+    uint32x4_t bias = vdupq_n_u32(0x3F000000);
+    return vcvtq_s32_f32(vaddq_f32(x, vreinterpretq_f32_u32(vorrq_u32(bias, sign))));
+#  endif
+}
+
+void celt_float2int16_neon(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt);
+#  if defined(OPUS_HAVE_RTCD) && \
+    (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
+extern void
+(*const CELT_FLOAT2INT16_IMPL[OPUS_ARCHMASK+1])(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt);
+
+#   define OVERRIDE_FLOAT2INT16 (1)
+#   define celt_float2int16(in, out, cnt, arch) \
+      ((*CELT_FLOAT2INT16_IMPL[(arch)&OPUS_ARCHMASK])(in, out, cnt))
+
+#  elif defined(OPUS_ARM_PRESUME_NEON_INTR)
+#   define OVERRIDE_FLOAT2INT16 (1)
+#   define celt_float2int16(in, out, cnt, arch) ((void)(arch), celt_float2int16_neon(in, out, cnt))
+#  endif
+# endif
+
+#endif /* MATHOPS_ARM_H */
diff --git a/celt/float_cast.h b/celt/float_cast.h
index 8915a5fd7..0645d54b7 100644
--- a/celt/float_cast.h
+++ b/celt/float_cast.h
@@ -98,6 +98,13 @@ static OPUS_INLINE opus_int32 float2int(float x) {return _mm_cvt_ss2si(_mm_set_s
 
                 return intgr ;
         }
+#elif defined(__aarch64__)
+
+        #include <arm_neon.h>
+        static OPUS_INLINE opus_int32 float2int(float flt)
+        {
+                return vcvtns_s32_f32(flt);
+        }
 
 #elif defined(HAVE_LRINTF) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
 
diff --git a/celt/mathops.c b/celt/mathops.c
index 64c9f457f..0ad57ca71 100644
--- a/celt/mathops.c
+++ b/celt/mathops.c
@@ -1,6 +1,7 @@
 /* Copyright (c) 2002-2008 Jean-Marc Valin
    Copyright (c) 2007-2008 CSIRO
    Copyright (c) 2007-2009 Xiph.Org Foundation
+   Copyright (c) 2024 Arm Limited
    Written by Jean-Marc Valin */
 /**
    @file mathops.h
@@ -35,6 +36,7 @@
 #include "config.h"
 #endif
 
+#include "float_cast.h"
 #include "mathops.h"
 
 /*Compute floor(sqrt(_val)) with exact arithmetic.
@@ -215,3 +217,16 @@ opus_val32 celt_rcp(opus_val32 x)
 }
 
 #endif
+
+#ifndef DISABLE_FLOAT_API
+
+void celt_float2int16_c(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt)
+{
+   int i;
+   for (i = 0; i < cnt; i++)
+   {
+      out[i] = FLOAT2INT16(in[i]);
+   }
+}
+
+#endif /* DISABLE_FLOAT_API */
diff --git a/celt/mathops.h b/celt/mathops.h
index d3cda4c23..24dbfb4cc 100644
--- a/celt/mathops.h
+++ b/celt/mathops.h
@@ -1,6 +1,7 @@
 /* Copyright (c) 2002-2008 Jean-Marc Valin
    Copyright (c) 2007-2008 CSIRO
    Copyright (c) 2007-2009 Xiph.Org Foundation
+   Copyright (c) 2024 Arm Limited
    Written by Jean-Marc Valin, and Yunho Huh */
 /**
    @file mathops.h
@@ -38,6 +39,10 @@
 #include "entcode.h"
 #include "os_support.h"
 
+#if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+#include "arm/mathops_arm.h"
+#endif
+
 #define PI 3.141592653f
 
 /* Multiplies two 16-bit fractional values. Bit-exactness of this macro is important */
@@ -476,4 +481,15 @@ static OPUS_INLINE opus_val16 celt_atan2p(opus_val16 y, opus_val16 x)
 }
 
 #endif /* FIXED_POINT */
+
+#ifndef DISABLE_FLOAT_API
+
+void celt_float2int16_c(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt);
+
+#ifndef OVERRIDE_FLOAT2INT16
+#define celt_float2int16(in, out, cnt, arch) ((void)(arch), celt_float2int16_c(in, out, cnt))
+#endif
+
+#endif /* DISABLE_FLOAT_API */
+
 #endif /* MATHOPS_H */
diff --git a/celt/tests/test_unit_mathops.c b/celt/tests/test_unit_mathops.c
index 50ee431cb..98fcdec4b 100644
--- a/celt/tests/test_unit_mathops.c
+++ b/celt/tests/test_unit_mathops.c
@@ -1,5 +1,6 @@
 /* Copyright (c) 2008-2011 Xiph.Org Foundation, Mozilla Corporation,
                            Gregory Maxwell
+   Copyright (c) 2024 Arm Limited
    Written by Jean-Marc Valin, Gregory Maxwell, Timothy B. Terriberry,
    and Yunho Huh */
 /*
@@ -37,8 +38,10 @@
 
 #include <stdio.h>
 #include <math.h>
-#include "mathops.h"
 #include "bands.h"
+#include "cpu_support.h"
+#include "float_cast.h"
+#include "mathops.h"
 
 #ifdef FIXED_POINT
 #define WORD "%d"
@@ -351,8 +354,94 @@ void testilog2(void)
 }
 #endif
 
+
+#ifndef DISABLE_FLOAT_API
+
+void testcelt_float2int16(int use_ref_impl, int buffer_size)
+{
+
+#define MAX_BUFFER_SIZE 2080
+   int i, cnt;
+   float floatsToConvert[MAX_BUFFER_SIZE];
+   short results[MAX_BUFFER_SIZE] = { 0 };
+   float scaleInt16RangeTo01;
+
+   celt_assert(buffer_size <= MAX_BUFFER_SIZE);
+
+   scaleInt16RangeTo01 = 1.f / 32768.f;
+   cnt = 0;
+
+   while (cnt + 15 < buffer_size && cnt < buffer_size / 2)
+   {
+      floatsToConvert[cnt++] = 77777.0f * scaleInt16RangeTo01;
+      floatsToConvert[cnt++] = 33000.0f * scaleInt16RangeTo01;
+      floatsToConvert[cnt++] = 32768.0f * scaleInt16RangeTo01;
+      floatsToConvert[cnt++] = 32767.4f * scaleInt16RangeTo01;
+      floatsToConvert[cnt++] = 32766.6f * scaleInt16RangeTo01;
+      floatsToConvert[cnt++] = .501 * scaleInt16RangeTo01;
+      floatsToConvert[cnt++] = .499f * scaleInt16RangeTo01;
+      floatsToConvert[cnt++] = .0f;
+      floatsToConvert[cnt++] = -.499f * scaleInt16RangeTo01;
+      floatsToConvert[cnt++] = -.501f * scaleInt16RangeTo01;
+      floatsToConvert[cnt++] = -32767.6f * scaleInt16RangeTo01;
+      floatsToConvert[cnt++] = -32768.4f * scaleInt16RangeTo01;
+      floatsToConvert[cnt++] = -32769.0f * scaleInt16RangeTo01;
+      floatsToConvert[cnt++] = -33000.0f * scaleInt16RangeTo01;
+      floatsToConvert[cnt++] = -77777.0f * scaleInt16RangeTo01;
+
+      celt_assert(cnt < buffer_size);
+   }
+
+   while (cnt < buffer_size)
+   {
+      float inInt16Range = cnt * 7 + .5;
+      inInt16Range += (cnt & 0x01) ? .1 : -.1;
+      inInt16Range *= (cnt & 0x02) ? 1 : -1;
+      floatsToConvert[cnt++] = inInt16Range * scaleInt16RangeTo01;
+   }
+
+   for (i = 0; i < MAX_BUFFER_SIZE; ++i)
+   {
+      results[i] = 42;
+   }
+
+   if (use_ref_impl)
+   {
+      celt_float2int16_c(floatsToConvert, results, cnt);
+   } else {
+      celt_float2int16(floatsToConvert, results, cnt, opus_select_arch());
+   }
+
+   for (i = 0; i < cnt; ++i)
+   {
+      const float expected = FLOAT2INT16(floatsToConvert[i]);
+      if (results[i] != expected)
+      {
+         fprintf (stderr, "testcelt_float2int16 failed: celt_float2int16 converted %f (index: %d) to %d (x*32768=%f, expected: %d, cnt: %d, ref: %d)\n",
+               floatsToConvert[i], i, (int)results[i], floatsToConvert[i] * 32768.0f, (int)expected, buffer_size, use_ref_impl);
+         ret = 1;
+      }
+   }
+
+   for (i = cnt; i < MAX_BUFFER_SIZE; ++i)
+   {
+      if (results[i] != 42)
+      {
+         fprintf (stderr, "testcelt_float2int16 failed: buffer overflow (cnt: %d, ref: %d)\n", buffer_size, use_ref_impl);
+         ret = 1;
+         break;
+      }
+   }
+#undef MAX_BUFFER_SIZE
+}
+
+#endif
+
 int main(void)
 {
+   int i;
+   int use_ref_impl[2] = { 0, 1 };
+
    testbitexactcos();
    testbitexactlog2tan();
    testdiv();
@@ -364,6 +453,15 @@ int main(void)
    testilog2();
    testlog2_db();
    testexp2_db();
+#endif
+#ifndef DISABLE_FLOAT_API
+   for (i = 0; i <= 1; ++i)
+   {
+      testcelt_float2int16(use_ref_impl[i], 1);
+      testcelt_float2int16(use_ref_impl[i], 32);
+      testcelt_float2int16(use_ref_impl[i], 127);
+      testcelt_float2int16(use_ref_impl[i], 1031);
+   }
 #endif
    return ret;
 }
diff --git a/celt_headers.mk b/celt_headers.mk
index 94a655739..267b418bf 100644
--- a/celt_headers.mk
+++ b/celt_headers.mk
@@ -39,6 +39,7 @@ celt/arm/fixed_armv5e.h \
 celt/arm/fixed_arm64.h \
 celt/arm/kiss_fft_armv4.h \
 celt/arm/kiss_fft_armv5e.h \
+celt/arm/mathops_arm.h \
 celt/arm/pitch_arm.h \
 celt/arm/fft_arm.h \
 celt/arm/mdct_arm.h \
diff --git a/src/opus_decoder.c b/src/opus_decoder.c
index 64b8c31ea..190221b7c 100644
--- a/src/opus_decoder.c
+++ b/src/opus_decoder.c
@@ -1,4 +1,5 @@
 /* Copyright (c) 2010 Xiph.Org Foundation, Skype Limited
+   Copyright (c) 2024 Arm Limited
    Written by Jean-Marc Valin and Koen Vos */
 /*
    Redistribution and use in source and binary forms, with or without
@@ -835,7 +836,7 @@ int opus_decode(OpusDecoder *st, const unsigned char *data,
       opus_int32 len, opus_int16 *pcm, int frame_size, int decode_fec)
 {
        VARDECL(opus_res, out);
-       int ret, i;
+       int ret;
        int nb_samples;
        ALLOC_STACK;
 
@@ -858,8 +859,13 @@ int opus_decode(OpusDecoder *st, const unsigned char *data,
        ret = opus_decode_native(st, data, len, out, frame_size, decode_fec, 0, NULL, OPTIONAL_CLIP, NULL, 0);
        if (ret > 0)
        {
+# if defined(FIXED_POINT)
+          int i;
           for (i=0;i<ret*st->channels;i++)
              pcm[i] = RES2INT16(out[i]);
+# else
+          celt_float2int16(out, pcm, ret*st->channels, st->arch);
+# endif
        }
        RESTORE_STACK;
        return ret;
-- 
GitLab