From d4494e6ed7d564cbee3cfdfe41d0c0989c4345ff Mon Sep 17 00:00:00 2001 From: Sandor Zsombor Vegh <sandorzsombor.vegh@arm.com> Date: Wed, 11 Sep 2024 14:00:32 +0200 Subject: [PATCH] Arm: Speed up FLOAT2INT16 conversion with Neon Using Neon for float to int conversion, and introducing platform- specific function for converting an array of float values to int16. Also adding appropriate unit test. Signed-off-by: Jean-Marc Valin <jeanmarcv@google.com> --- celt/arm/arm_celt_map.c | 18 +++++- celt/arm/celt_neon_intr.c | 51 +++++++++++++++++ celt/arm/mathops_arm.h | 65 +++++++++++++++++++++ celt/float_cast.h | 7 +++ celt/mathops.c | 15 +++++ celt/mathops.h | 16 ++++++ celt/tests/test_unit_mathops.c | 100 ++++++++++++++++++++++++++++++++- celt_headers.mk | 1 + src/opus_decoder.c | 8 ++- 9 files changed, 277 insertions(+), 4 deletions(-) create mode 100644 celt/arm/mathops_arm.h diff --git a/celt/arm/arm_celt_map.c b/celt/arm/arm_celt_map.c index cbaea4957..d9980444e 100644 --- a/celt/arm/arm_celt_map.c +++ b/celt/arm/arm_celt_map.c @@ -1,5 +1,6 @@ /* Copyright (c) 2010 Xiph.Org Foundation - * Copyright (c) 2013 Parrot */ + * Copyright (c) 2013 Parrot + * Copyright (c) 2024 Arm Limited */ /* Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -29,12 +30,25 @@ #include "config.h" #endif -#include "pitch.h" #include "kiss_fft.h" +#include "mathops.h" #include "mdct.h" +#include "pitch.h" #if defined(OPUS_HAVE_RTCD) +# if !defined(DISABLE_FLOAT_API) +# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR) +void (*const CELT_FLOAT2INT16_IMPL[OPUS_ARCHMASK+1])(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt) = { + celt_float2int16_c, /* ARMv4 */ + celt_float2int16_c, /* EDSP */ + celt_float2int16_c, /* Media */ + celt_float2int16_neon,/* NEON */ + celt_float2int16_neon /* DOTPROD */ +}; +# endif +# endif + # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR) opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y, int N) = { celt_inner_prod_c, /* ARMv4 */ diff --git a/celt/arm/celt_neon_intr.c b/celt/arm/celt_neon_intr.c index 250f83621..32b6e5ac0 100644 --- a/celt/arm/celt_neon_intr.c +++ b/celt/arm/celt_neon_intr.c @@ -1,4 +1,5 @@ /* Copyright (c) 2014-2015 Xiph.Org Foundation + Copyright (c) 2024 Arm Limited Written by Viswanath Puttagunta */ /** @file celt_neon_intr.c @@ -35,7 +36,57 @@ #endif #include <arm_neon.h> +#include "../float_cast.h" +#include "../mathops.h" #include "../pitch.h" +#if defined(OPUS_CHECK_ASM) +#include <stdlib.h> +#endif + +#if !defined(DISABLE_FLOAT_API) && defined(OPUS_ARM_MAY_HAVE_NEON_INTR) + +void celt_float2int16_neon(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt) +{ + int i = 0; + +#if defined(__ARM_NEON) + const int BLOCK_SIZE = 16; + const int blockedSize = cnt / BLOCK_SIZE * BLOCK_SIZE; + + for (; i < blockedSize; i += BLOCK_SIZE) + { + float32x4_t orig_a = vld1q_f32(&in[i + 0]); + float32x4_t orig_b = vld1q_f32(&in[i + 4]); + float32x4_t orig_c = vld1q_f32(&in[i + 8]); + float32x4_t orig_d = vld1q_f32(&in[i + 12]); + + int16x4_t asShort_a = vqmovn_s32(vroundf(vmulq_n_f32(orig_a, CELT_SIG_SCALE))); + int16x4_t asShort_b = vqmovn_s32(vroundf(vmulq_n_f32(orig_b, CELT_SIG_SCALE))); + int16x4_t asShort_c = vqmovn_s32(vroundf(vmulq_n_f32(orig_c, CELT_SIG_SCALE))); + int16x4_t asShort_d = vqmovn_s32(vroundf(vmulq_n_f32(orig_d, CELT_SIG_SCALE))); + + vst1_s16(&out[i + 0], asShort_a); + vst1_s16(&out[i + 4], asShort_b); + vst1_s16(&out[i + 8], asShort_c); + vst1_s16(&out[i + 12], asShort_d); +# if defined(OPUS_CHECK_ASM) + short out_c[BLOCK_SIZE]; + int j; + for(j = 0; j < BLOCK_SIZE; j++) + { + out_c[j] = FLOAT2INT16(in[i + j]); + celt_assert(abs((out_c[j] - out[i + j])) <= 1); + } +# endif + } +#endif + + for (; i < cnt; i++) + { + out[i] = FLOAT2INT16(in[i]); + } +} +#endif #if defined(FIXED_POINT) #include <string.h> diff --git a/celt/arm/mathops_arm.h b/celt/arm/mathops_arm.h new file mode 100644 index 000000000..ced719d32 --- /dev/null +++ b/celt/arm/mathops_arm.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2024 Arm Limited */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#if !defined(MATHOPS_ARM_H) +# define MATHOPS_ARM_H + +#include "armcpu.h" +#include "cpu_support.h" +#include "opus_defines.h" + +# if !defined(DISABLE_FLOAT_API) && defined(OPUS_ARM_MAY_HAVE_NEON_INTR) + +#include <arm_neon.h> + +static inline int32x4_t vroundf(float32x4_t x) +{ +# if defined(__aarch64__) || (defined(__ARM_ARCH) && __ARM_ARCH >= 8) + return vcvtaq_s32_f32(x); +# else + uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), vdupq_n_u32(0x80000000)); + uint32x4_t bias = vdupq_n_u32(0x3F000000); + return vcvtq_s32_f32(vaddq_f32(x, vreinterpretq_f32_u32(vorrq_u32(bias, sign)))); +# endif +} + +void celt_float2int16_neon(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt); +# if defined(OPUS_HAVE_RTCD) && \ + (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)) +extern void +(*const CELT_FLOAT2INT16_IMPL[OPUS_ARCHMASK+1])(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt); + +# define OVERRIDE_FLOAT2INT16 (1) +# define celt_float2int16(in, out, cnt, arch) \ + ((*CELT_FLOAT2INT16_IMPL[(arch)&OPUS_ARCHMASK])(in, out, cnt)) + +# elif defined(OPUS_ARM_PRESUME_NEON_INTR) +# define OVERRIDE_FLOAT2INT16 (1) +# define celt_float2int16(in, out, cnt, arch) ((void)(arch), celt_float2int16_neon(in, out, cnt)) +# endif +# endif + +#endif /* MATHOPS_ARM_H */ diff --git a/celt/float_cast.h b/celt/float_cast.h index 8915a5fd7..0645d54b7 100644 --- a/celt/float_cast.h +++ b/celt/float_cast.h @@ -98,6 +98,13 @@ static OPUS_INLINE opus_int32 float2int(float x) {return _mm_cvt_ss2si(_mm_set_s return intgr ; } +#elif defined(__aarch64__) + + #include <arm_neon.h> + static OPUS_INLINE opus_int32 float2int(float flt) + { + return vcvtns_s32_f32(flt); + } #elif defined(HAVE_LRINTF) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L diff --git a/celt/mathops.c b/celt/mathops.c index 64c9f457f..0ad57ca71 100644 --- a/celt/mathops.c +++ b/celt/mathops.c @@ -1,6 +1,7 @@ /* Copyright (c) 2002-2008 Jean-Marc Valin Copyright (c) 2007-2008 CSIRO Copyright (c) 2007-2009 Xiph.Org Foundation + Copyright (c) 2024 Arm Limited Written by Jean-Marc Valin */ /** @file mathops.h @@ -35,6 +36,7 @@ #include "config.h" #endif +#include "float_cast.h" #include "mathops.h" /*Compute floor(sqrt(_val)) with exact arithmetic. @@ -215,3 +217,16 @@ opus_val32 celt_rcp(opus_val32 x) } #endif + +#ifndef DISABLE_FLOAT_API + +void celt_float2int16_c(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt) +{ + int i; + for (i = 0; i < cnt; i++) + { + out[i] = FLOAT2INT16(in[i]); + } +} + +#endif /* DISABLE_FLOAT_API */ diff --git a/celt/mathops.h b/celt/mathops.h index d3cda4c23..24dbfb4cc 100644 --- a/celt/mathops.h +++ b/celt/mathops.h @@ -1,6 +1,7 @@ /* Copyright (c) 2002-2008 Jean-Marc Valin Copyright (c) 2007-2008 CSIRO Copyright (c) 2007-2009 Xiph.Org Foundation + Copyright (c) 2024 Arm Limited Written by Jean-Marc Valin, and Yunho Huh */ /** @file mathops.h @@ -38,6 +39,10 @@ #include "entcode.h" #include "os_support.h" +#if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) +#include "arm/mathops_arm.h" +#endif + #define PI 3.141592653f /* Multiplies two 16-bit fractional values. Bit-exactness of this macro is important */ @@ -476,4 +481,15 @@ static OPUS_INLINE opus_val16 celt_atan2p(opus_val16 y, opus_val16 x) } #endif /* FIXED_POINT */ + +#ifndef DISABLE_FLOAT_API + +void celt_float2int16_c(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt); + +#ifndef OVERRIDE_FLOAT2INT16 +#define celt_float2int16(in, out, cnt, arch) ((void)(arch), celt_float2int16_c(in, out, cnt)) +#endif + +#endif /* DISABLE_FLOAT_API */ + #endif /* MATHOPS_H */ diff --git a/celt/tests/test_unit_mathops.c b/celt/tests/test_unit_mathops.c index 50ee431cb..98fcdec4b 100644 --- a/celt/tests/test_unit_mathops.c +++ b/celt/tests/test_unit_mathops.c @@ -1,5 +1,6 @@ /* Copyright (c) 2008-2011 Xiph.Org Foundation, Mozilla Corporation, Gregory Maxwell + Copyright (c) 2024 Arm Limited Written by Jean-Marc Valin, Gregory Maxwell, Timothy B. Terriberry, and Yunho Huh */ /* @@ -37,8 +38,10 @@ #include <stdio.h> #include <math.h> -#include "mathops.h" #include "bands.h" +#include "cpu_support.h" +#include "float_cast.h" +#include "mathops.h" #ifdef FIXED_POINT #define WORD "%d" @@ -351,8 +354,94 @@ void testilog2(void) } #endif + +#ifndef DISABLE_FLOAT_API + +void testcelt_float2int16(int use_ref_impl, int buffer_size) +{ + +#define MAX_BUFFER_SIZE 2080 + int i, cnt; + float floatsToConvert[MAX_BUFFER_SIZE]; + short results[MAX_BUFFER_SIZE] = { 0 }; + float scaleInt16RangeTo01; + + celt_assert(buffer_size <= MAX_BUFFER_SIZE); + + scaleInt16RangeTo01 = 1.f / 32768.f; + cnt = 0; + + while (cnt + 15 < buffer_size && cnt < buffer_size / 2) + { + floatsToConvert[cnt++] = 77777.0f * scaleInt16RangeTo01; + floatsToConvert[cnt++] = 33000.0f * scaleInt16RangeTo01; + floatsToConvert[cnt++] = 32768.0f * scaleInt16RangeTo01; + floatsToConvert[cnt++] = 32767.4f * scaleInt16RangeTo01; + floatsToConvert[cnt++] = 32766.6f * scaleInt16RangeTo01; + floatsToConvert[cnt++] = .501 * scaleInt16RangeTo01; + floatsToConvert[cnt++] = .499f * scaleInt16RangeTo01; + floatsToConvert[cnt++] = .0f; + floatsToConvert[cnt++] = -.499f * scaleInt16RangeTo01; + floatsToConvert[cnt++] = -.501f * scaleInt16RangeTo01; + floatsToConvert[cnt++] = -32767.6f * scaleInt16RangeTo01; + floatsToConvert[cnt++] = -32768.4f * scaleInt16RangeTo01; + floatsToConvert[cnt++] = -32769.0f * scaleInt16RangeTo01; + floatsToConvert[cnt++] = -33000.0f * scaleInt16RangeTo01; + floatsToConvert[cnt++] = -77777.0f * scaleInt16RangeTo01; + + celt_assert(cnt < buffer_size); + } + + while (cnt < buffer_size) + { + float inInt16Range = cnt * 7 + .5; + inInt16Range += (cnt & 0x01) ? .1 : -.1; + inInt16Range *= (cnt & 0x02) ? 1 : -1; + floatsToConvert[cnt++] = inInt16Range * scaleInt16RangeTo01; + } + + for (i = 0; i < MAX_BUFFER_SIZE; ++i) + { + results[i] = 42; + } + + if (use_ref_impl) + { + celt_float2int16_c(floatsToConvert, results, cnt); + } else { + celt_float2int16(floatsToConvert, results, cnt, opus_select_arch()); + } + + for (i = 0; i < cnt; ++i) + { + const float expected = FLOAT2INT16(floatsToConvert[i]); + if (results[i] != expected) + { + fprintf (stderr, "testcelt_float2int16 failed: celt_float2int16 converted %f (index: %d) to %d (x*32768=%f, expected: %d, cnt: %d, ref: %d)\n", + floatsToConvert[i], i, (int)results[i], floatsToConvert[i] * 32768.0f, (int)expected, buffer_size, use_ref_impl); + ret = 1; + } + } + + for (i = cnt; i < MAX_BUFFER_SIZE; ++i) + { + if (results[i] != 42) + { + fprintf (stderr, "testcelt_float2int16 failed: buffer overflow (cnt: %d, ref: %d)\n", buffer_size, use_ref_impl); + ret = 1; + break; + } + } +#undef MAX_BUFFER_SIZE +} + +#endif + int main(void) { + int i; + int use_ref_impl[2] = { 0, 1 }; + testbitexactcos(); testbitexactlog2tan(); testdiv(); @@ -364,6 +453,15 @@ int main(void) testilog2(); testlog2_db(); testexp2_db(); +#endif +#ifndef DISABLE_FLOAT_API + for (i = 0; i <= 1; ++i) + { + testcelt_float2int16(use_ref_impl[i], 1); + testcelt_float2int16(use_ref_impl[i], 32); + testcelt_float2int16(use_ref_impl[i], 127); + testcelt_float2int16(use_ref_impl[i], 1031); + } #endif return ret; } diff --git a/celt_headers.mk b/celt_headers.mk index 94a655739..267b418bf 100644 --- a/celt_headers.mk +++ b/celt_headers.mk @@ -39,6 +39,7 @@ celt/arm/fixed_armv5e.h \ celt/arm/fixed_arm64.h \ celt/arm/kiss_fft_armv4.h \ celt/arm/kiss_fft_armv5e.h \ +celt/arm/mathops_arm.h \ celt/arm/pitch_arm.h \ celt/arm/fft_arm.h \ celt/arm/mdct_arm.h \ diff --git a/src/opus_decoder.c b/src/opus_decoder.c index 64b8c31ea..190221b7c 100644 --- a/src/opus_decoder.c +++ b/src/opus_decoder.c @@ -1,4 +1,5 @@ /* Copyright (c) 2010 Xiph.Org Foundation, Skype Limited + Copyright (c) 2024 Arm Limited Written by Jean-Marc Valin and Koen Vos */ /* Redistribution and use in source and binary forms, with or without @@ -835,7 +836,7 @@ int opus_decode(OpusDecoder *st, const unsigned char *data, opus_int32 len, opus_int16 *pcm, int frame_size, int decode_fec) { VARDECL(opus_res, out); - int ret, i; + int ret; int nb_samples; ALLOC_STACK; @@ -858,8 +859,13 @@ int opus_decode(OpusDecoder *st, const unsigned char *data, ret = opus_decode_native(st, data, len, out, frame_size, decode_fec, 0, NULL, OPTIONAL_CLIP, NULL, 0); if (ret > 0) { +# if defined(FIXED_POINT) + int i; for (i=0;i<ret*st->channels;i++) pcm[i] = RES2INT16(out[i]); +# else + celt_float2int16(out, pcm, ret*st->channels, st->arch); +# endif } RESTORE_STACK; return ret; -- GitLab