celt @ cb73e5ca
Subproject commit cb73e5ca62f93f530e65b244cf52f905a727d04d
/*Copyright (c) 2003-2004, Mark Borgerding
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.*/
#ifndef KISS_FFT_GUTS_H
#define KISS_FFT_GUTS_H
#define MIN(a,b) ((a)<(b) ? (a):(b))
#define MAX(a,b) ((a)>(b) ? (a):(b))
/* kiss_fft.h
defines kiss_fft_scalar as either short or a float type
and defines
typedef struct { kiss_fft_scalar r; kiss_fft_scalar i; }kiss_fft_cpx; */
#include "kiss_fft.h"
/*
Explanation of macros dealing with complex math:
C_MUL(m,a,b) : m = a*b
C_FIXDIV( c , div ) : if a fixed point impl., c /= div. noop otherwise
C_SUB( res, a,b) : res = a - b
C_SUBFROM( res , a) : res -= a
C_ADDTO( res , a) : res += a
* */
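/* Example usage (illustrative, not from the original sources):
     kiss_fft_cpx m, a, tw;
     C_MUL(m, a, tw);       m = a*tw  (complex multiply)
     C_ADDTO(m, a);         m += a
   In fixed-point builds the second operand of C_MUL is the twiddle and must
   be Q15 (or Q31 with ENABLE_QEXT), as produced by KISS_FFT_COS/SIN below. */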
#ifdef FIXED_POINT
#include "arch.h"
#define SAMP_MAX 2147483647
#define TWID_MAX 32767
#define TRIG_UPSCALE 1
#define SAMP_MIN -SAMP_MAX
#ifdef ENABLE_QEXT
# define S_MUL(a,b) MULT32_32_Q31(b, a)
# define S_MUL2(a,b) MULT32_32_Q31(b, a)
#else
# define S_MUL(a,b) MULT16_32_Q15(b, a)
# define S_MUL2(a,b) MULT16_32_Q16(b, a)
#endif
# define C_MUL(m,a,b) \
do{ (m).r = SUB32_ovflw(S_MUL((a).r,(b).r) , S_MUL((a).i,(b).i)); \
(m).i = ADD32_ovflw(S_MUL((a).r,(b).i) , S_MUL((a).i,(b).r)); }while(0)
# define C_MULC(m,a,b) \
do{ (m).r = ADD32_ovflw(S_MUL((a).r,(b).r) , S_MUL((a).i,(b).i)); \
(m).i = SUB32_ovflw(S_MUL((a).i,(b).r) , S_MUL((a).r,(b).i)); }while(0)
# define C_MULBYSCALAR( c, s ) \
do{ (c).r = S_MUL( (c).r , s ) ;\
(c).i = S_MUL( (c).i , s ) ; }while(0)
# define DIVSCALAR(x,k) \
(x) = S_MUL( x, (TWID_MAX-((k)>>1))/(k)+1 )
# define C_FIXDIV(c,div) \
do { DIVSCALAR( (c).r , div); \
DIVSCALAR( (c).i , div); }while (0)
#define C_ADD( res, a,b)\
do {(res).r=ADD32_ovflw((a).r,(b).r); (res).i=ADD32_ovflw((a).i,(b).i); \
}while(0)
#define C_SUB( res, a,b)\
do {(res).r=SUB32_ovflw((a).r,(b).r); (res).i=SUB32_ovflw((a).i,(b).i); \
}while(0)
#define C_ADDTO( res , a)\
do {(res).r = ADD32_ovflw((res).r, (a).r); (res).i = ADD32_ovflw((res).i,(a).i);\
}while(0)
#define C_SUBFROM( res , a)\
do {(res).r = ADD32_ovflw((res).r,(a).r); (res).i = SUB32_ovflw((res).i,(a).i); \
}while(0)
#if defined(OPUS_ARM_INLINE_ASM)
#include "arm/kiss_fft_armv4.h"
#endif
#if defined(OPUS_ARM_INLINE_EDSP)
#include "arm/kiss_fft_armv5e.h"
#endif
#if defined(MIPSr1_ASM)
#include "mips/kiss_fft_mipsr1.h"
#endif
#else /* not FIXED_POINT*/
# define S_MUL(a,b) ( (a)*(b) )
# define S_MUL2(a,b) ( (a)*(b) )
#define C_MUL(m,a,b) \
do{ (m).r = (a).r*(b).r - (a).i*(b).i;\
(m).i = (a).r*(b).i + (a).i*(b).r; }while(0)
#define C_MULC(m,a,b) \
do{ (m).r = (a).r*(b).r + (a).i*(b).i;\
(m).i = (a).i*(b).r - (a).r*(b).i; }while(0)
#define C_MUL4(m,a,b) C_MUL(m,a,b)
# define C_FIXDIV(c,div) /* NOOP */
# define C_MULBYSCALAR( c, s ) \
do{ (c).r *= (s);\
(c).i *= (s); }while(0)
#endif
#ifndef CHECK_OVERFLOW_OP
# define CHECK_OVERFLOW_OP(a,op,b) /* noop */
#endif
#ifndef C_ADD
#define C_ADD( res, a,b)\
do { \
CHECK_OVERFLOW_OP((a).r,+,(b).r)\
CHECK_OVERFLOW_OP((a).i,+,(b).i)\
(res).r=(a).r+(b).r; (res).i=(a).i+(b).i; \
}while(0)
#define C_SUB( res, a,b)\
do { \
CHECK_OVERFLOW_OP((a).r,-,(b).r)\
CHECK_OVERFLOW_OP((a).i,-,(b).i)\
(res).r=(a).r-(b).r; (res).i=(a).i-(b).i; \
}while(0)
#define C_ADDTO( res , a)\
do { \
CHECK_OVERFLOW_OP((res).r,+,(a).r)\
CHECK_OVERFLOW_OP((res).i,+,(a).i)\
(res).r += (a).r; (res).i += (a).i;\
}while(0)
#define C_SUBFROM( res , a)\
do {\
CHECK_OVERFLOW_OP((res).r,-,(a).r)\
CHECK_OVERFLOW_OP((res).i,-,(a).i)\
(res).r -= (a).r; (res).i -= (a).i; \
}while(0)
#endif /* C_ADD defined */
#ifdef FIXED_POINT
/*# define KISS_FFT_COS(phase) TRIG_UPSCALE*floor(MIN(32767,MAX(-32767,.5+32768 * cos (phase))))
# define KISS_FFT_SIN(phase) TRIG_UPSCALE*floor(MIN(32767,MAX(-32767,.5+32768 * sin (phase))))*/
# define KISS_FFT_COS(phase) floor(.5+TWID_MAX*cos (phase))
# define KISS_FFT_SIN(phase) floor(.5+TWID_MAX*sin (phase))
# define HALF_OF(x) ((x)>>1)
#elif defined(USE_SIMD)
# define KISS_FFT_COS(phase) _mm_set1_ps( cos(phase) )
# define KISS_FFT_SIN(phase) _mm_set1_ps( sin(phase) )
# define HALF_OF(x) ((x)*_mm_set1_ps(.5f))
#else
# define KISS_FFT_COS(phase) (kiss_fft_scalar) cos(phase)
# define KISS_FFT_SIN(phase) (kiss_fft_scalar) sin(phase)
# define HALF_OF(x) ((x)*.5f)
#endif
#define kf_cexp(x,phase) \
do{ \
(x)->r = KISS_FFT_COS(phase);\
(x)->i = KISS_FFT_SIN(phase);\
}while(0)
#define kf_cexp2(x,phase) \
do{ \
(x)->r = TRIG_UPSCALE*celt_cos_norm((phase));\
(x)->i = TRIG_UPSCALE*celt_cos_norm((phase)-32768);\
}while(0)
#endif /* KISS_FFT_GUTS_H */
/* Copyright (c) 2003-2008 Jean-Marc Valin
Copyright (c) 2007-2008 CSIRO
Copyright (c) 2007-2009 Xiph.Org Foundation
Written by Jean-Marc Valin */
/**
@file arch.h
@brief Various architecture definitions for CELT
*/
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef ARCH_H
#define ARCH_H
#include "opus_types.h"
#include "opus_defines.h"
# if !defined(__GNUC_PREREQ)
# if defined(__GNUC__)&&defined(__GNUC_MINOR__)
# define __GNUC_PREREQ(_maj,_min) \
((__GNUC__<<16)+__GNUC_MINOR__>=((_maj)<<16)+(_min))
# else
# define __GNUC_PREREQ(_maj,_min) 0
# endif
# endif
#if OPUS_GNUC_PREREQ(3, 0)
#define opus_likely(x) (__builtin_expect(!!(x), 1))
#define opus_unlikely(x) (__builtin_expect(!!(x), 0))
#else
#define opus_likely(x) (!!(x))
#define opus_unlikely(x) (!!(x))
#endif
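/* Example: annotate a rarely-taken error path (sketch):
     if (opus_unlikely(ret != OPUS_OK)) return ret;
   With GCC >= 3.0 this becomes a __builtin_expect() branch hint; on other
   compilers it reduces to a plain (!!(x)) and has no effect. */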
#define CELT_SIG_SCALE 32768.f
#define CELT_FATAL(str) celt_fatal(str, __FILE__, __LINE__)
#if defined(ENABLE_ASSERTIONS) || defined(ENABLE_HARDENING)
#ifdef __GNUC__
__attribute__((noreturn))
#endif
void celt_fatal(const char *str, const char *file, int line);
#if defined(CELT_C) && !defined(OVERRIDE_celt_fatal)
#include <stdio.h>
#include <stdlib.h>
#ifdef __GNUC__
__attribute__((noreturn))
#endif
void celt_fatal(const char *str, const char *file, int line)
{
fprintf (stderr, "Fatal (internal) error in %s, line %d: %s\n", file, line, str);
#if defined(_MSC_VER)
_set_abort_behavior( 0, _WRITE_ABORT_MSG);
#endif
abort();
}
#endif
#define celt_assert(cond) {if (!(cond)) {CELT_FATAL("assertion failed: " #cond);}}
#define celt_assert2(cond, message) {if (!(cond)) {CELT_FATAL("assertion failed: " #cond "\n" message);}}
#define MUST_SUCCEED(call) celt_assert((call) == OPUS_OK)
#else
#define celt_assert(cond)
#define celt_assert2(cond, message)
#define MUST_SUCCEED(call) do {if((call) != OPUS_OK) {RESTORE_STACK; return OPUS_INTERNAL_ERROR;} } while (0)
#endif
#if defined(ENABLE_ASSERTIONS)
#define celt_sig_assert(cond) {if (!(cond)) {CELT_FATAL("signal assertion failed: " #cond);}}
#else
#define celt_sig_assert(cond)
#endif
#define IMUL32(a,b) ((a)*(b))
#define MIN16(a,b) ((a) < (b) ? (a) : (b)) /**< Minimum 16-bit value. */
#define MAX16(a,b) ((a) > (b) ? (a) : (b)) /**< Maximum 16-bit value. */
#define MIN32(a,b) ((a) < (b) ? (a) : (b)) /**< Minimum 32-bit value. */
#define MAX32(a,b) ((a) > (b) ? (a) : (b)) /**< Maximum 32-bit value. */
#define IMIN(a,b) ((a) < (b) ? (a) : (b)) /**< Minimum int value. */
#define IMAX(a,b) ((a) > (b) ? (a) : (b)) /**< Maximum int value. */
#define UADD32(a,b) ((a)+(b))
#define USUB32(a,b) ((a)-(b))
#define MAXG(a,b) MAX32(a, b)
#define MING(a,b) MIN32(a, b)
/* Throughout the code, we use the following scaling for signals:
FLOAT: used for float API, normalized to +/-1.
INT16: used for 16-bit API, normalized to +/- 32768
RES: internal Opus resolution, defined as +/-1. in float builds, or either 16-bit or 24-bit int for fixed-point builds
SIG: internal CELT resolution: defined as +/- 32768. in float builds, or Q27 in fixed-point builds (int16 shifted by 12)
*/
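/* For instance (illustrative), moving one 16-bit PCM sample through the
   internal resolution uses the macros defined below:
     opus_res r    = INT16TORES(s);     to RES (scale or shift up)
     opus_int16 s2 = RES2INT16(r);      back to 16 bits (round/saturate)
   In float builds INT16TORES scales by 1/32768.f and RES2INT16 is
   FLOAT2INT16(). */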
/* Set this if opus_int64 is a native type of the CPU. */
/* Assume that all LP64 architectures have fast 64-bit types; also x86_64
(which can be ILP32 for x32) and Win64 (which is LLP64). */
#if defined(__x86_64__) || defined(__LP64__) || defined(_WIN64)
#define OPUS_FAST_INT64 1
#else
#define OPUS_FAST_INT64 0
#endif
#ifdef FIXED_POINT
#define ARG_FIXED(arg) , arg
#else
#define ARG_FIXED(arg)
#endif
#define PRINT_MIPS(file)
#ifdef FIXED_POINT
typedef opus_int16 opus_val16;
typedef opus_int32 opus_val32;
typedef opus_int64 opus_val64;
typedef opus_val32 celt_sig;
typedef opus_val16 celt_norm;
typedef opus_val32 celt_ener;
typedef opus_val32 celt_glog;
#ifdef ENABLE_RES24
typedef opus_val32 opus_res;
#define RES_SHIFT 8
#define SIG2RES(a) PSHR32(a, SIG_SHIFT-RES_SHIFT)
#define RES2INT16(a) SAT16(PSHR32(a, RES_SHIFT))
#define RES2INT24(a) (a)
#define RES2FLOAT(a) ((1.f/32768.f/256.)*(a))
#define INT16TORES(a) SHL32(EXTEND32(a), RES_SHIFT)
#define INT24TORES(a) (a)
#define ADD_RES(a, b) ADD32(a, b)
#define FLOAT2RES(a) float2int(32768.f*256.f*(a))
#define RES2SIG(a) SHL32((a), SIG_SHIFT-RES_SHIFT)
#define MULT16_RES_Q15(a,b) MULT16_32_Q15(a,b)
#define MAX_ENCODING_DEPTH 24
#else
typedef opus_val16 opus_res;
#define RES_SHIFT 0
#define SIG2RES(a) SIG2WORD16(a)
#define RES2INT16(a) (a)
#define RES2INT24(a) SHL32(EXTEND32(a), 8)
#define RES2FLOAT(a) ((1.f/32768.f)*(a))
#define INT16TORES(a) (a)
#define INT24TORES(a) SAT16(PSHR32(a, 8))
#define ADD_RES(a, b) SAT16(ADD32((a), (b)))
#define FLOAT2RES(a) FLOAT2INT16(a)
#define RES2SIG(a) SHL32(EXTEND32(a), SIG_SHIFT)
#define MULT16_RES_Q15(a,b) MULT16_16_Q15(a,b)
#define MAX_ENCODING_DEPTH 16
#endif
#define RES2VAL16(a) RES2INT16(a)
#define FLOAT2SIG(a) float2int(((opus_int32)32768<<SIG_SHIFT)*(a))
#define INT16TOSIG(a) SHL32(EXTEND32(a), SIG_SHIFT)
#define INT24TOSIG(a) SHL32(a, SIG_SHIFT-8)
#ifdef ENABLE_QEXT
typedef opus_val32 celt_coef;
#define COEF_ONE Q31ONE
#define MULT_COEF_32(a, b) MULT32_32_Q31(a,b)
#define MAC_COEF_32_ARM(c, a, b) ADD32((c), MULT32_32_Q32(a,b))
#define MULT_COEF(a, b) MULT32_32_Q31(a,b)
#define MULT_COEF_TAPS(a, b) SHL32(MULT16_16(a,b), 1)
#define COEF2VAL16(x) EXTRACT16(SHR32(x, 16))
#else
typedef opus_val16 celt_coef;
#define COEF_ONE Q15ONE
#define MULT_COEF_32(a, b) MULT16_32_Q15(a,b)
#define MAC_COEF_32_ARM(a, b, c) MAC16_32_Q16(a,b,c)
#define MULT_COEF(a, b) MULT16_16_Q15(a,b)
#define MULT_COEF_TAPS(a, b) MULT16_16_P15(a,b)
#define COEF2VAL16(x) (x)
#endif
#define celt_isnan(x) 0
#define Q15ONE 32767
#define Q31ONE 2147483647
#define SIG_SHIFT 12
/* Safe saturation value for 32-bit signals. We need to make sure that we can
   add two sig values and that the first stages of the MDCT don't cause an
   overflow. The most constraining case is the ARM_ASM comb filter, where we
   shift left by one and then add two values. Because of that, we use 2^29-1.
   SIG_SAT must also be large enough to fit a full-scale high-freq tone through
   the prefilter and comb filter, meaning 1.85*1.75*2^(15+SIG_SHIFT) = 434529895,
   so the limit should be about 2^31*sqrt(.5). */
#define SIG_SAT (536870911)
#define NORM_SCALING 16384
#define DB_SHIFT 24
#define EPSILON 1
#define VERY_SMALL 0
#define VERY_LARGE16 ((opus_val16)32767)
#define Q15_ONE ((opus_val16)32767)
#define ABS16(x) ((x) < 0 ? (-(x)) : (x))
#define ABS32(x) ((x) < 0 ? (-(x)) : (x))
static OPUS_INLINE opus_int16 SAT16(opus_int32 x) {
return x > 32767 ? 32767 : x < -32768 ? -32768 : (opus_int16)x;
}
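/* e.g. SAT16(40000) == 32767, SAT16(-40000) == -32768, SAT16(123) == 123 */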
#ifdef FIXED_DEBUG
#include "fixed_debug.h"
#else
#include "fixed_generic.h"
#ifdef OPUS_ARM_PRESUME_AARCH64_NEON_INTR
#include "arm/fixed_arm64.h"
#elif defined (OPUS_ARM_INLINE_EDSP)
#include "arm/fixed_armv5e.h"
#elif defined (OPUS_ARM_INLINE_ASM)
#include "arm/fixed_armv4.h"
#elif defined (BFIN_ASM)
#include "fixed_bfin.h"
#elif defined (TI_C5X_ASM)
#include "fixed_c5x.h"
#elif defined (TI_C6X_ASM)
#include "fixed_c6x.h"
#endif
#endif
#else /* FIXED_POINT */
typedef float opus_val16;
typedef float opus_val32;
typedef float opus_val64;
typedef float celt_sig;
typedef float celt_norm;
typedef float celt_ener;
typedef float celt_glog;
typedef float opus_res;
typedef float celt_coef;
#ifdef FLOAT_APPROX
/* This code should reliably detect NaN/inf even when -ffast-math is used.
Assumes IEEE 754 format. */
static OPUS_INLINE int celt_isnan(float x)
{
union {float f; opus_uint32 i;} in;
in.f = x;
return ((in.i>>23)&0xFF)==0xFF && (in.i&0x007FFFFF)!=0;
}
#else
#ifdef __FAST_MATH__
#error Cannot build libopus with -ffast-math unless FLOAT_APPROX is defined. This could result in crashes on extreme (e.g. NaN) input
#endif
#define celt_isnan(x) ((x)!=(x))
#endif
#define Q15ONE 1.0f
#define Q31ONE 1.0f
#define COEF_ONE 1.0f
#define COEF2VAL16(x) (x)
#define NORM_SCALING 1.f
#define EPSILON 1e-15f
#define VERY_SMALL 1e-30f
#define VERY_LARGE16 1e15f
#define Q15_ONE ((opus_val16)1.f)
/* This appears to be the same speed as C99's fabsf() but it's more portable. */
#define ABS16(x) ((float)fabs(x))
#define ABS32(x) ((float)fabs(x))
#define QCONST16(x,bits) (x)
#define QCONST32(x,bits) (x)
#define GCONST(x) (x)
#define NEG16(x) (-(x))
#define NEG32(x) (-(x))
#define NEG32_ovflw(x) (-(x))
#define EXTRACT16(x) (x)
#define EXTEND32(x) (x)
#define SHR16(a,shift) (a)
#define SHL16(a,shift) (a)
#define SHR32(a,shift) (a)
#define SHL32(a,shift) (a)
#define PSHR32(a,shift) (a)
#define VSHR32(a,shift) (a)
#define PSHR(a,shift) (a)
#define SHR(a,shift) (a)
#define SHL(a,shift) (a)
#define SATURATE(x,a) (x)
#define SATURATE16(x) (x)
#define ROUND16(a,shift) (a)
#define SROUND16(a,shift) (a)
#define HALF16(x) (.5f*(x))
#define HALF32(x) (.5f*(x))
#define ADD16(a,b) ((a)+(b))
#define SUB16(a,b) ((a)-(b))
#define ADD32(a,b) ((a)+(b))
#define SUB32(a,b) ((a)-(b))
#define ADD32_ovflw(a,b) ((a)+(b))
#define SUB32_ovflw(a,b) ((a)-(b))
#define PSHR32_ovflw(a,shift) (a)
#define MULT16_16_16(a,b) ((a)*(b))
#define MULT16_16(a,b) ((opus_val32)(a)*(opus_val32)(b))
#define MAC16_16(c,a,b) ((c)+(opus_val32)(a)*(opus_val32)(b))
#define MULT16_32_Q15(a,b) ((a)*(b))
#define MULT16_32_Q16(a,b) ((a)*(b))
#define MULT32_32_Q31(a,b) ((a)*(b))
#define MAC16_32_Q15(c,a,b) ((c)+(a)*(b))
#define MAC16_32_Q16(c,a,b) ((c)+(a)*(b))
#define MAC_COEF_32_ARM(c,a,b) ((c)+(a)*(b))
#define MULT16_16_Q11_32(a,b) ((a)*(b))
#define MULT16_16_Q11(a,b) ((a)*(b))
#define MULT16_16_Q13(a,b) ((a)*(b))
#define MULT16_16_Q14(a,b) ((a)*(b))
#define MULT16_16_Q15(a,b) ((a)*(b))
#define MULT16_16_P15(a,b) ((a)*(b))
#define MULT16_16_P13(a,b) ((a)*(b))
#define MULT16_16_P14(a,b) ((a)*(b))
#define MULT16_32_P16(a,b) ((a)*(b))
#define MULT_COEF_32(a, b) ((a)*(b))
#define MULT_COEF(a, b) ((a)*(b))
#define MULT_COEF_TAPS(a, b) ((a)*(b))
#define DIV32_16(a,b) (((opus_val32)(a))/(opus_val16)(b))
#define DIV32(a,b) (((opus_val32)(a))/(opus_val32)(b))
#define SIG2RES(a) ((1/CELT_SIG_SCALE)*(a))
#define RES2INT16(a) FLOAT2INT16(a)
#define RES2INT24(a) float2int(32768.f*256.f*(a))
#define RES2FLOAT(a) (a)
#define INT16TORES(a) ((a)*(1/CELT_SIG_SCALE))
#define INT24TORES(a) ((1.f/32768.f/256.)*(a))
#define ADD_RES(a, b) ADD32(a, b)
#define FLOAT2RES(a) (a)
#define RES2SIG(a) (CELT_SIG_SCALE*(a))
#define MULT16_RES_Q15(a,b) MULT16_16_Q15(a,b)
#define RES2VAL16(a) (a)
#define FLOAT2SIG(a) ((a)*CELT_SIG_SCALE)
#define INT16TOSIG(a) ((float)(a))
#define INT24TOSIG(a) ((float)(a)*(1.f/256.f))
#define MAX_ENCODING_DEPTH 24
#endif /* !FIXED_POINT */
#ifndef GLOBAL_STACK_SIZE
#ifdef FIXED_POINT
#define GLOBAL_STACK_SIZE 120000
#else
#define GLOBAL_STACK_SIZE 120000
#endif
#endif
#endif /* ARCH_H */
#!/usr/bin/perl
# Copyright (C) 2002-2013 Xiph.org Foundation
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
# OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
my $bigend; # little/big endian
my $nxstack;
my $apple = 0;
my $symprefix = "";
$nxstack = 0;
eval 'exec /usr/local/bin/perl -S $0 ${1+"$@"}'
if $running_under_some_shell;
while ($ARGV[0] =~ /^-/) {
$_ = shift;
last if /^--$/;
if (/^-n$/) {
$nflag++;
next;
}
if (/^--apple$/) {
$apple = 1;
$symprefix = "_";
next;
}
die "I don't recognize this switch: $_\\n";
}
$printit++ unless $nflag;
$\ = "\n"; # automatically add newline on print
$n=0;
$thumb = 0; # ARM mode by default, not Thumb.
@proc_stack = ();
printf (" .syntax unified\n");
LINE:
while (<>) {
# For ADRLs we need to add a new line after the substituted one.
$addPadding = 0;
# First, we do not dare to touch *anything* inside double quotes, do we?
# Second, if you want a dollar character in the string,
# insert two of them -- that's how ARM C and assembler treat strings.
s/^([A-Za-z_]\w*)[ \t]+DCB[ \t]*\"/$1: .ascii \"/ && do { s/\$\$/\$/g; next };
s/\bDCB\b[ \t]*\"/.ascii \"/ && do { s/\$\$/\$/g; next };
s/^(\S+)\s+RN\s+(\S+)/$1 .req r$2/ && do { s/\$\$/\$/g; next };
# If there's nothing on a line but a comment, don't try to apply any further
# substitutions (this is a cheap hack to avoid mucking up the license header)
s/^([ \t]*);/$1@/ && do { s/\$\$/\$/g; next };
# If substituted -- leave immediately !
s/@/,:/;
s/;/@/;
while ( /@.*'/ ) {
s/(@.*)'/$1/g;
}
s/\{FALSE\}/0/g;
s/\{TRUE\}/1/g;
s/\{(\w\w\w\w+)\}/$1/g;
s/\bINCLUDE[ \t]*([^ \t\n]+)/.include \"$1\"/;
s/\bGET[ \t]*([^ \t\n]+)/.include \"${ my $x=$1; $x =~ s|\.s|-gnu.S|; \$x }\"/;
s/\bIMPORT\b/.extern/;
s/\bEXPORT\b\s*/.global $symprefix/;
s/^(\s+)\[/$1IF/;
s/^(\s+)\|/$1ELSE/;
s/^(\s+)\]/$1ENDIF/;
s/IF *:DEF:/ .ifdef/;
s/IF *:LNOT: *:DEF:/ .ifndef/;
s/ELSE/ .else/;
s/ENDIF/ .endif/;
if( /\bIF\b/ ) {
s/\bIF\b/ .if/;
s/=/==/;
}
if ( $n == 2) {
s/\$/\\/g;
}
if ($n == 1) {
s/\$//g;
s/label//g;
$n = 2;
}
if ( /MACRO/ ) {
s/MACRO *\n/.macro/;
$n=1;
}
if ( /\bMEND\b/ ) {
s/\bMEND\b/.endm/;
$n=0;
}
# ".rdata" doesn't work in 'as' version 2.13.2, as it is ".rodata" there.
#
if ( /\bAREA\b/ ) {
my $align;
$align = "2";
if ( /ALIGN=(\d+)/ ) {
$align = $1;
}
if ( /CODE/ ) {
$nxstack = 1;
}
s/^(.+)CODE(.+)READONLY(.*)/ .text/;
s/^(.+)DATA(.+)READONLY(.*)/ .section .rdata/;
s/^(.+)\|\|\.data\|\|(.+)/ .data/;
s/^(.+)\|\|\.bss\|\|(.+)/ .bss/;
s/$/; .p2align $align/;
# Enable NEON instructions but don't produce a binary that requires
# ARMv7. RVCT does not have equivalent directives, so we just do this
# for all CODE areas.
if ( /.text/ ) {
# Separating .arch, .fpu, etc., by semicolons does not work (gas
# thinks the semicolon is part of the arch name, even when there's
# whitespace separating them). Sadly this means our line numbers
# won't match the original source file (we could use the .line
# directive, which is documented to be obsolete, but then gdb will
# show the wrong line in the translated source file).
s/$/; .arch armv7-a\n .fpu neon\n .object_arch armv4t/ unless ($apple);
}
}
s/\|\|\.constdata\$(\d+)\|\|/.L_CONST$1/; # ||.constdata$3||
s/\|\|\.bss\$(\d+)\|\|/.L_BSS$1/; # ||.bss$2||
s/\|\|\.data\$(\d+)\|\|/.L_DATA$1/; # ||.data$2||
s/\|\|([a-zA-Z0-9_]+)\@([a-zA-Z0-9_]+)\|\|/@ $&/;
s/^(\s+)\%(\s)/ .space $1/;
s/\|(.+)\.(\d+)\|/\.$1_$2/; # |L80.123| -> .L80_123
s/\bCODE32\b/.code 32/ && do {$thumb = 0};
s/\bCODE16\b/.code 16/ && do {$thumb = 1};
if (/\bPROC\b/)
{
my $prefix;
my $proc;
/^([A-Za-z_\.]\w+)\b/;
$proc = $1;
$prefix = "";
if ($proc)
{
$prefix = $prefix.sprintf("\t.type\t%s, %%function", $proc) unless ($apple);
# Make sure $prefix isn't empty here (for the $apple case).
# We handle mangling the label here, make sure it doesn't match
# the label handling below (if $prefix would be empty).
$prefix = $prefix."; ";
push(@proc_stack, $proc);
s/^[A-Za-z_\.]\w+/$symprefix$&:/;
}
$prefix = $prefix."\t.thumb_func; " if ($thumb);
s/\bPROC\b/@ $&/;
$_ = $prefix.$_;
}
s/^(\s*)(S|Q|SH|U|UQ|UH)ASX\b/$1$2ADDSUBX/;
s/^(\s*)(S|Q|SH|U|UQ|UH)SAX\b/$1$2SUBADDX/;
if (/\bENDP\b/)
{
my $proc;
s/\bENDP\b/@ $&/;
$proc = pop(@proc_stack);
$_ = "\t.size $proc, .-$proc".$_ if ($proc && !$apple);
}
s/\bSUBT\b/@ $&/;
s/\bDATA\b/@ $&/; # DATA directive is deprecated -- Asm guide, p.7-25
s/\bKEEP\b/@ $&/;
s/\bEXPORTAS\b/@ $&/;
s/\|\|(.)+\bEQU\b/@ $&/;
s/\|\|([\w\$]+)\|\|/$1/;
s/\bENTRY\b/@ $&/;
s/\bASSERT\b/@ $&/;
s/\bGBLL\b/@ $&/;
s/\bGBLA\b/@ $&/;
s/^\W+OPT\b/@ $&/;
s/:OR:/|/g;
s/:SHL:/<</g;
s/:SHR:/>>/g;
s/:AND:/&/g;
s/:LAND:/&&/g;
s/CPSR/cpsr/;
s/SPSR/spsr/;
s/ALIGN$/.balign 4/;
s/ALIGN\s+([0-9x]+)$/.balign $1/;
s/psr_cxsf/psr_all/;
s/LTORG/.ltorg/;
s/^([A-Za-z_]\w*)[ \t]+EQU/ .set $1,/;
s/^([A-Za-z_]\w*)[ \t]+SETL/ .set $1,/;
s/^([A-Za-z_]\w*)[ \t]+SETA/ .set $1,/;
s/^([A-Za-z_]\w*)[ \t]+\*/ .set $1,/;
# {PC} + 0xdeadfeed --> . + 0xdeadfeed
s/\{PC\} \+/ \. +/;
# Single hex constant on the line !
#
# >>> NOTE <<<
# Double-precision floats in gcc are always mixed-endian, which means
# bytes in two words are little-endian, but words are big-endian.
# So, 0x0000deadfeed0000 would be stored as 0x0000dead at low address
# and 0xfeed0000 at high address.
#
s/\bDCFD\b[ \t]+0x([a-fA-F0-9]{8})([a-fA-F0-9]{8})/.long 0x$1, 0x$2/;
# Only decimal constants on the line, no hex !
s/\bDCFD\b[ \t]+([0-9\.\-]+)/.double $1/;
# Single hex constant on the line !
# s/\bDCFS\b[ \t]+0x([a-f0-9]{8})([a-f0-9]{8})/.long 0x$1, 0x$2/;
# Only decimal constants on the line, no hex !
# s/\bDCFS\b[ \t]+([0-9\.\-]+)/.double $1/;
s/\bDCFS[ \t]+0x/.word 0x/;
s/\bDCFS\b/.float/;
s/^([A-Za-z_]\w*)[ \t]+DCD/$1 .word/;
s/\bDCD\b/.word/;
s/^([A-Za-z_]\w*)[ \t]+DCW/$1 .short/;
s/\bDCW\b/.short/;
s/^([A-Za-z_]\w*)[ \t]+DCB/$1 .byte/;
s/\bDCB\b/.byte/;
s/^([A-Za-z_]\w*)[ \t]+\%/.comm $1,/;
s/^[A-Za-z_\.]\w+/$&:/;
s/^(\d+)/$1:/;
s/\%(\d+)/$1b_or_f/;
s/\%[Bb](\d+)/$1b/;
s/\%[Ff](\d+)/$1f/;
s/\%[Ff][Tt](\d+)/$1f/;
s/&([\dA-Fa-f]+)/0x$1/;
if ( /\b2_[01]+\b/ ) {
s/\b2_([01]+)\b/conv$1&&&&/g;
while ( /[01][01][01][01]&&&&/ ) {
s/0000&&&&/&&&&0/g;
s/0001&&&&/&&&&1/g;
s/0010&&&&/&&&&2/g;
s/0011&&&&/&&&&3/g;
s/0100&&&&/&&&&4/g;
s/0101&&&&/&&&&5/g;
s/0110&&&&/&&&&6/g;
s/0111&&&&/&&&&7/g;
s/1000&&&&/&&&&8/g;
s/1001&&&&/&&&&9/g;
s/1010&&&&/&&&&A/g;
s/1011&&&&/&&&&B/g;
s/1100&&&&/&&&&C/g;
s/1101&&&&/&&&&D/g;
s/1110&&&&/&&&&E/g;
s/1111&&&&/&&&&F/g;
}
s/000&&&&/&&&&0/g;
s/001&&&&/&&&&1/g;
s/010&&&&/&&&&2/g;
s/011&&&&/&&&&3/g;
s/100&&&&/&&&&4/g;
s/101&&&&/&&&&5/g;
s/110&&&&/&&&&6/g;
s/111&&&&/&&&&7/g;
s/00&&&&/&&&&0/g;
s/01&&&&/&&&&1/g;
s/10&&&&/&&&&2/g;
s/11&&&&/&&&&3/g;
s/0&&&&/&&&&0/g;
s/1&&&&/&&&&1/g;
s/conv&&&&/0x/g;
}
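# Example: the RVCT binary literal 2_1010 first becomes "conv1010&&&&", the
# loop above folds the nibble to "conv&&&&A", and the final substitution
# yields the GNU constant 0xA. Odd widths use the 3/2/1-bit rules below, so
# 2_101 becomes 0x5.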
if ( /commandline/)
{
if( /-bigend/)
{
$bigend=1;
}
}
if ( /\bDCDU\b/ )
{
my $cmd=$_;
my $value;
my $prefix;
my $w1;
my $w2;
my $w3;
my $w4;
s/\s+DCDU\b/@ $&/;
$cmd =~ /\bDCDU\b\s+0x(\d+)/;
$value = $1;
$value =~ /(\w\w)(\w\w)(\w\w)(\w\w)/;
$w1 = $1;
$w2 = $2;
$w3 = $3;
$w4 = $4;
if( $bigend ne "")
{
# big endian
$prefix = "\t.byte\t0x".$w1.";".
"\t.byte\t0x".$w2.";".
"\t.byte\t0x".$w3.";".
"\t.byte\t0x".$w4."; ";
}
else
{
# little endian
$prefix = "\t.byte\t0x".$w4.";".
"\t.byte\t0x".$w3.";".
"\t.byte\t0x".$w2.";".
"\t.byte\t0x".$w1."; ";
}
$_=$prefix.$_;
}
if ( /\badrl\b/i )
{
s/\badrl\s+(\w+)\s*,\s*(\w+)/ldr $1,=$2/i;
$addPadding = 1;
}
s/\bEND\b/@ END/;
} continue {
printf ("%s", $_) if $printit;
if ($addPadding != 0)
{
printf (" mov r0,r0\n");
$addPadding = 0;
}
}
# If we had a code section, mark that this object doesn't need an executable
# stack.
if ($nxstack && !$apple) {
printf (" .section\t.note.GNU-stack,\"\",\%\%progbits\n");
}
/* Copyright (c) 2010 Xiph.Org Foundation
* Copyright (c) 2013 Parrot
* Copyright (c) 2024 Arm Limited */
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include "kiss_fft.h"
#include "mathops.h"
#include "mdct.h"
#include "pitch.h"
#if defined(OPUS_HAVE_RTCD)
# if !defined(DISABLE_FLOAT_API)
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
void (*const CELT_FLOAT2INT16_IMPL[OPUS_ARCHMASK+1])(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt) = {
celt_float2int16_c, /* ARMv4 */
celt_float2int16_c, /* EDSP */
celt_float2int16_c, /* Media */
celt_float2int16_neon,/* NEON */
celt_float2int16_neon /* DOTPROD */
};
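/* Dispatch sketch (the selecting macro is defined elsewhere in the tree):
     CELT_FLOAT2INT16_IMPL[(arch)&OPUS_ARCHMASK](in, out, cnt);
   i.e. the arch value from opus_select_arch() indexes this table directly. */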
# endif
# endif
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y, int N) = {
celt_inner_prod_c, /* ARMv4 */
celt_inner_prod_c, /* EDSP */
celt_inner_prod_c, /* Media */
celt_inner_prod_neon,/* NEON */
celt_inner_prod_neon /* DOTPROD */
};
void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
int N, opus_val32 *xy1, opus_val32 *xy2) = {
dual_inner_prod_c, /* ARMv4 */
dual_inner_prod_c, /* EDSP */
dual_inner_prod_c, /* Media */
dual_inner_prod_neon,/* NEON */
dual_inner_prod_neon /* DOTPROD */
};
# endif
# if defined(FIXED_POINT)
# if ((defined(OPUS_ARM_MAY_HAVE_NEON) && !defined(OPUS_ARM_PRESUME_NEON)) || \
(defined(OPUS_ARM_MAY_HAVE_MEDIA) && !defined(OPUS_ARM_PRESUME_MEDIA)) || \
(defined(OPUS_ARM_MAY_HAVE_EDSP) && !defined(OPUS_ARM_PRESUME_EDSP)))
opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
const opus_val16 *, opus_val32 *, int, int, int) = {
celt_pitch_xcorr_c, /* ARMv4 */
MAY_HAVE_EDSP(celt_pitch_xcorr), /* EDSP */
MAY_HAVE_MEDIA(celt_pitch_xcorr), /* Media */
MAY_HAVE_NEON(celt_pitch_xcorr), /* NEON */
MAY_HAVE_NEON(celt_pitch_xcorr) /* DOTPROD */
};
# endif
# else /* !FIXED_POINT */
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
void (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
const opus_val16 *, opus_val32 *, int, int, int) = {
celt_pitch_xcorr_c, /* ARMv4 */
celt_pitch_xcorr_c, /* EDSP */
celt_pitch_xcorr_c, /* Media */
celt_pitch_xcorr_float_neon, /* Neon */
celt_pitch_xcorr_float_neon /* DOTPROD */
};
# endif
# endif /* FIXED_POINT */
#if defined(FIXED_POINT) && defined(OPUS_HAVE_RTCD) && \
defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
const opus_val16 *x,
const opus_val16 *y,
opus_val32 sum[4],
int len
) = {
xcorr_kernel_c, /* ARMv4 */
xcorr_kernel_c, /* EDSP */
xcorr_kernel_c, /* Media */
xcorr_kernel_neon_fixed, /* Neon */
xcorr_kernel_neon_fixed /* DOTPROD */
};
#endif
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
# if defined(HAVE_ARM_NE10)
# if defined(CUSTOM_MODES)
int (*const OPUS_FFT_ALLOC_ARCH_IMPL[OPUS_ARCHMASK+1])(kiss_fft_state *st) = {
opus_fft_alloc_arch_c, /* ARMv4 */
opus_fft_alloc_arch_c, /* EDSP */
opus_fft_alloc_arch_c, /* Media */
opus_fft_alloc_arm_neon, /* Neon with NE10 library support */
opus_fft_alloc_arm_neon /* DOTPROD with NE10 library support */
};
void (*const OPUS_FFT_FREE_ARCH_IMPL[OPUS_ARCHMASK+1])(kiss_fft_state *st) = {
opus_fft_free_arch_c, /* ARMv4 */
opus_fft_free_arch_c, /* EDSP */
opus_fft_free_arch_c, /* Media */
opus_fft_free_arm_neon, /* Neon with NE10 */
opus_fft_free_arm_neon /* DOTPROD with NE10 */
};
# endif /* CUSTOM_MODES */
void (*const OPUS_FFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg,
const kiss_fft_cpx *fin,
kiss_fft_cpx *fout) = {
opus_fft_c, /* ARMv4 */
opus_fft_c, /* EDSP */
opus_fft_c, /* Media */
opus_fft_neon, /* Neon with NE10 */
opus_fft_neon /* DOTPROD with NE10 */
};
void (*const OPUS_IFFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg,
const kiss_fft_cpx *fin,
kiss_fft_cpx *fout) = {
opus_ifft_c, /* ARMv4 */
opus_ifft_c, /* EDSP */
opus_ifft_c, /* Media */
opus_ifft_neon, /* Neon with NE10 */
opus_ifft_neon /* DOTPROD with NE10 */
};
void (*const CLT_MDCT_FORWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l,
kiss_fft_scalar *in,
kiss_fft_scalar * OPUS_RESTRICT out,
const opus_val16 *window,
int overlap, int shift,
int stride, int arch) = {
clt_mdct_forward_c, /* ARMv4 */
clt_mdct_forward_c, /* EDSP */
clt_mdct_forward_c, /* Media */
clt_mdct_forward_neon, /* Neon with NE10 */
clt_mdct_forward_neon /* DOTPROD with NE10 */
};
void (*const CLT_MDCT_BACKWARD_IMPL[OPUS_ARCHMASK+1])(const mdct_lookup *l,
kiss_fft_scalar *in,
kiss_fft_scalar * OPUS_RESTRICT out,
const opus_val16 *window,
int overlap, int shift,
int stride, int arch) = {
clt_mdct_backward_c, /* ARMv4 */
clt_mdct_backward_c, /* EDSP */
clt_mdct_backward_c, /* Media */
clt_mdct_backward_neon, /* Neon with NE10 */
clt_mdct_backward_neon /* DOTPROD with NE10 */
};
# endif /* HAVE_ARM_NE10 */
# endif /* OPUS_ARM_MAY_HAVE_NEON_INTR */
#endif /* OPUS_HAVE_RTCD */
/* Copyright (c) 2010 Xiph.Org Foundation
* Copyright (c) 2013 Parrot */
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Original code from libtheora modified to suit to Opus */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#ifdef OPUS_HAVE_RTCD
#include "armcpu.h"
#include "cpu_support.h"
#include "os_support.h"
#include "opus_types.h"
#include "arch.h"
#define OPUS_CPU_ARM_V4_FLAG (1<<OPUS_ARCH_ARM_V4)
#define OPUS_CPU_ARM_EDSP_FLAG (1<<OPUS_ARCH_ARM_EDSP)
#define OPUS_CPU_ARM_MEDIA_FLAG (1<<OPUS_ARCH_ARM_MEDIA)
#define OPUS_CPU_ARM_NEON_FLAG (1<<OPUS_ARCH_ARM_NEON)
#define OPUS_CPU_ARM_DOTPROD_FLAG (1<<OPUS_ARCH_ARM_DOTPROD)
#if defined(_MSC_VER)
/*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/
# define WIN32_LEAN_AND_MEAN
# define WIN32_EXTRA_LEAN
# include <windows.h>
static OPUS_INLINE opus_uint32 opus_cpu_capabilities(void){
opus_uint32 flags;
flags=0;
/* MSVC has no OPUS_INLINE __asm support for ARM, but it does let you __emit
* instructions via their assembled hex code.
* All of these instructions should be essentially nops. */
# if defined(OPUS_ARM_MAY_HAVE_EDSP) || defined(OPUS_ARM_MAY_HAVE_MEDIA) \
|| defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
__try{
/*PLD [r13]*/
__emit(0xF5DDF000);
flags|=OPUS_CPU_ARM_EDSP_FLAG;
}
__except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
/*Ignore exception.*/
}
# if defined(OPUS_ARM_MAY_HAVE_MEDIA) \
|| defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
__try{
/*SHADD8 r3,r3,r3*/
__emit(0xE6333F93);
flags|=OPUS_CPU_ARM_MEDIA_FLAG;
}
__except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
/*Ignore exception.*/
}
# if defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
__try{
/*VORR q0,q0,q0*/
__emit(0xF2200150);
flags|=OPUS_CPU_ARM_NEON_FLAG;
}
__except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
/*Ignore exception.*/
}
# endif
# endif
# endif
return flags;
}
#elif defined(__linux__)
/* Linux based */
#include <stdio.h>
static opus_uint32 opus_cpu_capabilities(void)
{
opus_uint32 flags = 0;
FILE *cpuinfo;
/* Reading /proc/self/auxv would be easier, but that doesn't work reliably on
* Android */
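  /* For reference, a 32-bit ARM "Features" line typically looks like
   *   Features : half thumb fastmult vfp edsp neon vfpv3 tls
   * while AArch64 kernels report "asimd" (and "asimddp" for dot product). */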
cpuinfo = fopen("/proc/cpuinfo", "r");
if(cpuinfo != NULL)
{
/* 512 should be enough for anybody (it's even enough for all the flags that
* x86 has accumulated... so far). */
char buf[512];
while(fgets(buf, 512, cpuinfo) != NULL)
{
# if defined(OPUS_ARM_MAY_HAVE_EDSP) || defined(OPUS_ARM_MAY_HAVE_MEDIA) \
|| defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
/* Search for edsp and neon flag */
if(memcmp(buf, "Features", 8) == 0)
{
char *p;
p = strstr(buf, " edsp");
if(p != NULL && (p[5] == ' ' || p[5] == '\n'))
flags |= OPUS_CPU_ARM_EDSP_FLAG;
# if defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
p = strstr(buf, " neon");
if(p != NULL && (p[5] == ' ' || p[5] == '\n'))
flags |= OPUS_CPU_ARM_NEON_FLAG;
p = strstr(buf, " asimd");
if(p != NULL && (p[6] == ' ' || p[6] == '\n'))
flags |= OPUS_CPU_ARM_NEON_FLAG | OPUS_CPU_ARM_MEDIA_FLAG | OPUS_CPU_ARM_EDSP_FLAG;
# endif
# if defined(OPUS_ARM_MAY_HAVE_DOTPROD)
p = strstr(buf, " asimddp");
if(p != NULL && (p[8] == ' ' || p[8] == '\n'))
flags |= OPUS_CPU_ARM_DOTPROD_FLAG;
# endif
}
# endif
# if defined(OPUS_ARM_MAY_HAVE_MEDIA) \
|| defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
/* Search for media capabilities (>= ARMv6) */
if(memcmp(buf, "CPU architecture:", 17) == 0)
{
int version;
version = atoi(buf+17);
if(version >= 6)
flags |= OPUS_CPU_ARM_MEDIA_FLAG;
}
# endif
}
#if defined(OPUS_ARM_PRESUME_AARCH64_NEON_INTR)
flags |= OPUS_CPU_ARM_EDSP_FLAG | OPUS_CPU_ARM_MEDIA_FLAG | OPUS_CPU_ARM_NEON_FLAG;
# if defined(OPUS_ARM_PRESUME_DOTPROD)
flags |= OPUS_CPU_ARM_DOTPROD_FLAG;
# endif
#endif
fclose(cpuinfo);
}
return flags;
}
#elif defined(__APPLE__)
#include <sys/types.h>
#include <sys/sysctl.h>
static opus_uint32 opus_cpu_capabilities(void)
{
opus_uint32 flags = 0;
#if defined(OPUS_ARM_MAY_HAVE_DOTPROD)
size_t size = sizeof(uint32_t);
uint32_t value = 0;
if (!sysctlbyname("hw.optional.arm.FEAT_DotProd", &value, &size, NULL, 0) && value)
{
flags |= OPUS_CPU_ARM_DOTPROD_FLAG;
}
#endif
#if defined(OPUS_ARM_PRESUME_AARCH64_NEON_INTR)
flags |= OPUS_CPU_ARM_EDSP_FLAG | OPUS_CPU_ARM_MEDIA_FLAG | OPUS_CPU_ARM_NEON_FLAG;
# if defined(OPUS_ARM_PRESUME_DOTPROD)
flags |= OPUS_CPU_ARM_DOTPROD_FLAG;
# endif
#endif
return flags;
}
#elif defined(__FreeBSD__)
#include <sys/auxv.h>
static opus_uint32 opus_cpu_capabilities(void)
{
long hwcap = 0;
opus_uint32 flags = 0;
# if defined(OPUS_ARM_MAY_HAVE_MEDIA) \
|| defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
/* FreeBSD requires armv6+, which always supports media instructions */
flags |= OPUS_CPU_ARM_MEDIA_FLAG;
# endif
elf_aux_info(AT_HWCAP, &hwcap, sizeof hwcap);
# if defined(OPUS_ARM_MAY_HAVE_EDSP) || defined(OPUS_ARM_MAY_HAVE_MEDIA) \
|| defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
# ifdef HWCAP_EDSP
if (hwcap & HWCAP_EDSP)
flags |= OPUS_CPU_ARM_EDSP_FLAG;
# endif
# if defined(OPUS_ARM_MAY_HAVE_NEON) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
# ifdef HWCAP_NEON
if (hwcap & HWCAP_NEON)
flags |= OPUS_CPU_ARM_NEON_FLAG;
# elif defined(HWCAP_ASIMD)
if (hwcap & HWCAP_ASIMD)
flags |= OPUS_CPU_ARM_NEON_FLAG | OPUS_CPU_ARM_MEDIA_FLAG | OPUS_CPU_ARM_EDSP_FLAG;
# endif
# endif
# if defined(OPUS_ARM_MAY_HAVE_DOTPROD) && defined(HWCAP_ASIMDDP)
if (hwcap & HWCAP_ASIMDDP)
flags |= OPUS_CPU_ARM_DOTPROD_FLAG;
# endif
# endif
#if defined(OPUS_ARM_PRESUME_AARCH64_NEON_INTR)
flags |= OPUS_CPU_ARM_EDSP_FLAG | OPUS_CPU_ARM_MEDIA_FLAG | OPUS_CPU_ARM_NEON_FLAG;
# if defined(OPUS_ARM_PRESUME_DOTPROD)
flags |= OPUS_CPU_ARM_DOTPROD_FLAG;
# endif
#endif
return (flags);
}
#else
/* The feature registers which can tell us what the processor supports are
 * accessible in privileged modes only, so we can't have a general user-space
 * detection method like on x86.*/
# error "Configured to use ARM asm but no CPU detection method available for " \
"your platform. Reconfigure with --disable-rtcd (or send patches)."
#endif
static int opus_select_arch_impl(void)
{
opus_uint32 flags = opus_cpu_capabilities();
int arch = 0;
if(!(flags & OPUS_CPU_ARM_EDSP_FLAG)) {
/* Asserts ensure arch values are sequential */
celt_assert(arch == OPUS_ARCH_ARM_V4);
return arch;
}
arch++;
if(!(flags & OPUS_CPU_ARM_MEDIA_FLAG)) {
celt_assert(arch == OPUS_ARCH_ARM_EDSP);
return arch;
}
arch++;
if(!(flags & OPUS_CPU_ARM_NEON_FLAG)) {
celt_assert(arch == OPUS_ARCH_ARM_MEDIA);
return arch;
}
arch++;
if(!(flags & OPUS_CPU_ARM_DOTPROD_FLAG)) {
celt_assert(arch == OPUS_ARCH_ARM_NEON);
return arch;
}
arch++;
celt_assert(arch == OPUS_ARCH_ARM_DOTPROD);
return arch;
}
int opus_select_arch(void) {
int arch = opus_select_arch_impl();
#ifdef FUZZING
arch = rand()%(arch+1);
#endif
return arch;
}
#endif
/* Copyright (c) 2010 Xiph.Org Foundation
* Copyright (c) 2013 Parrot */
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#if !defined(ARMCPU_H)
# define ARMCPU_H
# if defined(OPUS_ARM_MAY_HAVE_EDSP)
# define MAY_HAVE_EDSP(name) name ## _edsp
# else
# define MAY_HAVE_EDSP(name) name ## _c
# endif
# if defined(OPUS_ARM_MAY_HAVE_MEDIA)
# define MAY_HAVE_MEDIA(name) name ## _media
# else
# define MAY_HAVE_MEDIA(name) MAY_HAVE_EDSP(name)
# endif
# if defined(OPUS_ARM_MAY_HAVE_NEON)
# define MAY_HAVE_NEON(name) name ## _neon
# else
# define MAY_HAVE_NEON(name) MAY_HAVE_MEDIA(name)
# endif
# if defined(OPUS_ARM_MAY_HAVE_DOTPROD)
# define MAY_HAVE_DOTPROD(name) name ## _dotprod
# else
# define MAY_HAVE_DOTPROD(name) MAY_HAVE_NEON(name)
# endif
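/* Example expansion: with OPUS_ARM_MAY_HAVE_NEON defined,
   MAY_HAVE_NEON(celt_pitch_xcorr) names celt_pitch_xcorr_neon; without it,
   the macro falls back down the chain and ultimately to celt_pitch_xcorr_c. */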
# if defined(OPUS_ARM_PRESUME_EDSP)
# define PRESUME_EDSP(name) name ## _edsp
# else
# define PRESUME_EDSP(name) name ## _c
# endif
# if defined(OPUS_ARM_PRESUME_MEDIA)
# define PRESUME_MEDIA(name) name ## _media
# else
# define PRESUME_MEDIA(name) PRESUME_EDSP(name)
# endif
# if defined(OPUS_ARM_PRESUME_NEON)
# define PRESUME_NEON(name) name ## _neon
# else
# define PRESUME_NEON(name) PRESUME_MEDIA(name)
# endif
# if defined(OPUS_ARM_PRESUME_DOTPROD)
# define PRESUME_DOTPROD(name) name ## _dotprod
# else
# define PRESUME_DOTPROD(name) PRESUME_NEON(name)
# endif
# if defined(OPUS_HAVE_RTCD)
int opus_select_arch(void);
#define OPUS_ARCH_ARM_V4 (0)
#define OPUS_ARCH_ARM_EDSP (1)
#define OPUS_ARCH_ARM_MEDIA (2)
#define OPUS_ARCH_ARM_NEON (3)
#define OPUS_ARCH_ARM_DOTPROD (4)
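/* These sequential values index the *_IMPL dispatch tables (of size
   OPUS_ARCHMASK+1, e.g. CELT_PITCH_XCORR_IMPL): a call site caches
   arch = opus_select_arch() once and then dispatches as table[arch]
   (sketch of the RTCD pattern, not an exact call site). */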
# endif
#endif
/* Copyright (C) 2013 Mozilla Corporation */
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
; Set the following to 1 if we have EDSP instructions
; (LDRD/STRD, etc., ARMv5E and later).
OPUS_ARM_MAY_HAVE_EDSP * @OPUS_ARM_MAY_HAVE_EDSP@
; Set the following to 1 if we have ARMv6 media instructions.
OPUS_ARM_MAY_HAVE_MEDIA * @OPUS_ARM_MAY_HAVE_MEDIA@
; Set the following to 1 if we have NEON (some ARMv7)
OPUS_ARM_MAY_HAVE_NEON * @OPUS_ARM_MAY_HAVE_NEON@
END
/* Copyright (c) 2015 Xiph.Org Foundation
Written by Viswanath Puttagunta */
/**
@file celt_fft_ne10.c
@brief ARM Neon optimizations for fft using NE10 library
*/
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef SKIP_CONFIG_H
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#endif
#include <NE10_dsp.h>
#include "os_support.h"
#include "kiss_fft.h"
#include "stack_alloc.h"
#if !defined(FIXED_POINT)
# define NE10_FFT_ALLOC_C2C_TYPE_NEON ne10_fft_alloc_c2c_float32_neon
# define NE10_FFT_CFG_TYPE_T ne10_fft_cfg_float32_t
# define NE10_FFT_STATE_TYPE_T ne10_fft_state_float32_t
# define NE10_FFT_DESTROY_C2C_TYPE ne10_fft_destroy_c2c_float32
# define NE10_FFT_CPX_TYPE_T ne10_fft_cpx_float32_t
# define NE10_FFT_C2C_1D_TYPE_NEON ne10_fft_c2c_1d_float32_neon
#else
# define NE10_FFT_ALLOC_C2C_TYPE_NEON(nfft) ne10_fft_alloc_c2c_int32_neon(nfft)
# define NE10_FFT_CFG_TYPE_T ne10_fft_cfg_int32_t
# define NE10_FFT_STATE_TYPE_T ne10_fft_state_int32_t
# define NE10_FFT_DESTROY_C2C_TYPE ne10_fft_destroy_c2c_int32
# define NE10_FFT_CPX_TYPE_T ne10_fft_cpx_int32_t
# define NE10_FFT_C2C_1D_TYPE_NEON ne10_fft_c2c_1d_int32_neon
#endif
#if defined(CUSTOM_MODES)
/* nfft lengths in NE10 that support scaled fft */
# define NE10_FFTSCALED_SUPPORT_MAX 4
static const int ne10_fft_scaled_support[NE10_FFTSCALED_SUPPORT_MAX] = {
480, 240, 120, 60
};
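/* These appear to be the FFT sizes CELT uses at 48 kHz (one per MDCT shift);
   any other nfft takes the plain C fallback path, flagged by
   arch_fft->is_supported == 0 below. */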
int opus_fft_alloc_arm_neon(kiss_fft_state *st)
{
int i;
size_t memneeded = sizeof(struct arch_fft_state);
st->arch_fft = (arch_fft_state *)opus_alloc(memneeded);
if (!st->arch_fft)
return -1;
for (i = 0; i < NE10_FFTSCALED_SUPPORT_MAX; i++) {
if(st->nfft == ne10_fft_scaled_support[i])
break;
}
if (i == NE10_FFTSCALED_SUPPORT_MAX) {
/* This nfft length (scaled fft) is not supported in NE10 */
st->arch_fft->is_supported = 0;
st->arch_fft->priv = NULL;
}
else {
st->arch_fft->is_supported = 1;
st->arch_fft->priv = (void *)NE10_FFT_ALLOC_C2C_TYPE_NEON(st->nfft);
if (st->arch_fft->priv == NULL) {
return -1;
}
}
return 0;
}
void opus_fft_free_arm_neon(kiss_fft_state *st)
{
NE10_FFT_CFG_TYPE_T cfg;
if (!st->arch_fft)
return;
cfg = (NE10_FFT_CFG_TYPE_T)st->arch_fft->priv;
if (cfg)
NE10_FFT_DESTROY_C2C_TYPE(cfg);
opus_free(st->arch_fft);
}
#endif
void opus_fft_neon(const kiss_fft_state *st,
const kiss_fft_cpx *fin,
kiss_fft_cpx *fout)
{
NE10_FFT_STATE_TYPE_T state;
NE10_FFT_CFG_TYPE_T cfg = &state;
VARDECL(NE10_FFT_CPX_TYPE_T, buffer);
SAVE_STACK;
ALLOC(buffer, st->nfft, NE10_FFT_CPX_TYPE_T);
if (!st->arch_fft->is_supported) {
/* This nfft length (scaled fft) is not supported in NE10 */
opus_fft_c(st, fin, fout);
}
else {
memcpy((void *)cfg, st->arch_fft->priv, sizeof(NE10_FFT_STATE_TYPE_T));
state.buffer = (NE10_FFT_CPX_TYPE_T *)&buffer[0];
#if !defined(FIXED_POINT)
state.is_forward_scaled = 1;
NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,
(NE10_FFT_CPX_TYPE_T *)fin,
cfg, 0);
#else
NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,
(NE10_FFT_CPX_TYPE_T *)fin,
cfg, 0, 1);
#endif
}
RESTORE_STACK;
}
void opus_ifft_neon(const kiss_fft_state *st,
const kiss_fft_cpx *fin,
kiss_fft_cpx *fout)
{
NE10_FFT_STATE_TYPE_T state;
NE10_FFT_CFG_TYPE_T cfg = &state;
VARDECL(NE10_FFT_CPX_TYPE_T, buffer);
SAVE_STACK;
ALLOC(buffer, st->nfft, NE10_FFT_CPX_TYPE_T);
if (!st->arch_fft->is_supported) {
/* This nfft length (scaled fft) is not supported in NE10 */
opus_ifft_c(st, fin, fout);
}
else {
memcpy((void *)cfg, st->arch_fft->priv, sizeof(NE10_FFT_STATE_TYPE_T));
state.buffer = (NE10_FFT_CPX_TYPE_T *)&buffer[0];
#if !defined(FIXED_POINT)
state.is_backward_scaled = 0;
NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,
(NE10_FFT_CPX_TYPE_T *)fin,
cfg, 1);
#else
NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,
(NE10_FFT_CPX_TYPE_T *)fin,
cfg, 1, 0);
#endif
}
RESTORE_STACK;
}
/* Copyright (c) 2015 Xiph.Org Foundation
Written by Viswanath Puttagunta */
/**
@file celt_mdct_ne10.c
@brief ARM Neon optimizations for mdct using NE10 library
*/
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef SKIP_CONFIG_H
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#endif
#include "kiss_fft.h"
#include "_kiss_fft_guts.h"
#include "mdct.h"
#include "stack_alloc.h"
void clt_mdct_forward_neon(const mdct_lookup *l,
kiss_fft_scalar *in,
kiss_fft_scalar * OPUS_RESTRICT out,
const opus_val16 *window,
int overlap, int shift, int stride, int arch)
{
int i;
int N, N2, N4;
VARDECL(kiss_fft_scalar, f);
VARDECL(kiss_fft_cpx, f2);
const kiss_fft_state *st = l->kfft[shift];
const kiss_twiddle_scalar *trig;
SAVE_STACK;
N = l->n;
trig = l->trig;
for (i=0;i<shift;i++)
{
N >>= 1;
trig += N;
}
N2 = N>>1;
N4 = N>>2;
ALLOC(f, N2, kiss_fft_scalar);
ALLOC(f2, N4, kiss_fft_cpx);
/* Consider the input to be composed of four blocks: [a, b, c, d] */
/* Window, shuffle, fold */
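/* Standard MDCT folding: the four windowed quarters are combined into N/2
   real values that the pre-rotation below treats as N/4 complex pairs; in
   the per-loop comments, "R" denotes a time-reversed block. */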
{
/* Temp pointers to make it really clear to the compiler what we're doing */
const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1);
const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1);
kiss_fft_scalar * OPUS_RESTRICT yp = f;
const opus_val16 * OPUS_RESTRICT wp1 = window+(overlap>>1);
const opus_val16 * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
for(i=0;i<((overlap+3)>>2);i++)
{
/* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
*yp++ = MULT16_32_Q15(*wp2, xp1[N2]) + MULT16_32_Q15(*wp1,*xp2);
*yp++ = MULT16_32_Q15(*wp1, *xp1) - MULT16_32_Q15(*wp2, xp2[-N2]);
xp1+=2;
xp2-=2;
wp1+=2;
wp2-=2;
}
wp1 = window;
wp2 = window+overlap-1;
for(;i<N4-((overlap+3)>>2);i++)
{
/* Real part arranged as a-bR, Imag part arranged as -c-dR */
*yp++ = *xp2;
*yp++ = *xp1;
xp1+=2;
xp2-=2;
}
for(;i<N4;i++)
{
/* Real part arranged as a-bR, Imag part arranged as -c-dR */
*yp++ = -MULT16_32_Q15(*wp1, xp1[-N2]) + MULT16_32_Q15(*wp2, *xp2);
*yp++ = MULT16_32_Q15(*wp2, *xp1) + MULT16_32_Q15(*wp1, xp2[N2]);
xp1+=2;
xp2-=2;
wp1+=2;
wp2-=2;
}
}
/* Pre-rotation */
{
kiss_fft_scalar * OPUS_RESTRICT yp = f;
const kiss_twiddle_scalar *t = &trig[0];
for(i=0;i<N4;i++)
{
kiss_fft_cpx yc;
kiss_twiddle_scalar t0, t1;
kiss_fft_scalar re, im, yr, yi;
t0 = t[i];
t1 = t[N4+i];
re = *yp++;
im = *yp++;
yr = S_MUL(re,t0) - S_MUL(im,t1);
yi = S_MUL(im,t0) + S_MUL(re,t1);
yc.r = yr;
yc.i = yi;
f2[i] = yc;
}
}
opus_fft(st, f2, (kiss_fft_cpx *)f, arch);
/* Post-rotate */
{
/* Temp pointers to make it really clear to the compiler what we're doing */
const kiss_fft_cpx * OPUS_RESTRICT fp = (kiss_fft_cpx *)f;
kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
const kiss_twiddle_scalar *t = &trig[0];
/* Temp pointers to make it really clear to the compiler what we're doing */
for(i=0;i<N4;i++)
{
kiss_fft_scalar yr, yi;
yr = S_MUL(fp->i,t[N4+i]) - S_MUL(fp->r,t[i]);
yi = S_MUL(fp->r,t[N4+i]) + S_MUL(fp->i,t[i]);
*yp1 = yr;
*yp2 = yi;
fp++;
yp1 += 2*stride;
yp2 -= 2*stride;
}
}
RESTORE_STACK;
}
void clt_mdct_backward_neon(const mdct_lookup *l,
kiss_fft_scalar *in,
kiss_fft_scalar * OPUS_RESTRICT out,
const opus_val16 * OPUS_RESTRICT window,
int overlap, int shift, int stride, int arch)
{
int i;
int N, N2, N4;
VARDECL(kiss_fft_scalar, f);
const kiss_twiddle_scalar *trig;
const kiss_fft_state *st = l->kfft[shift];
N = l->n;
trig = l->trig;
for (i=0;i<shift;i++)
{
N >>= 1;
trig += N;
}
N2 = N>>1;
N4 = N>>2;
ALLOC(f, N2, kiss_fft_scalar);
/* Pre-rotate */
{
/* Temp pointers to make it really clear to the compiler what we're doing */
const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
kiss_fft_scalar * OPUS_RESTRICT yp = f;
const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0];
for(i=0;i<N4;i++)
{
kiss_fft_scalar yr, yi;
yr = S_MUL(*xp2, t[i]) + S_MUL(*xp1, t[N4+i]);
yi = S_MUL(*xp1, t[i]) - S_MUL(*xp2, t[N4+i]);
yp[2*i] = yr;
yp[2*i+1] = yi;
xp1+=2*stride;
xp2-=2*stride;
}
}
opus_ifft(st, (kiss_fft_cpx *)f, (kiss_fft_cpx*)(out+(overlap>>1)), arch);
/* Post-rotate and de-shuffle from both ends of the buffer at once to make
it in-place. */
{
kiss_fft_scalar * yp0 = out+(overlap>>1);
kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2;
const kiss_twiddle_scalar *t = &trig[0];
/* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the
middle pair will be computed twice. */
for(i=0;i<(N4+1)>>1;i++)
{
kiss_fft_scalar re, im, yr, yi;
kiss_twiddle_scalar t0, t1;
re = yp0[0];
im = yp0[1];
t0 = t[i];
t1 = t[N4+i];
/* We'd scale up by 2 here, but instead it's done when mixing the windows */
yr = S_MUL(re,t0) + S_MUL(im,t1);
yi = S_MUL(re,t1) - S_MUL(im,t0);
re = yp1[0];
im = yp1[1];
yp0[0] = yr;
yp1[1] = yi;
t0 = t[(N4-i-1)];
t1 = t[(N2-i-1)];
/* We'd scale up by 2 here, but instead it's done when mixing the windows */
yr = S_MUL(re,t0) + S_MUL(im,t1);
yi = S_MUL(re,t1) - S_MUL(im,t0);
yp1[0] = yr;
yp0[1] = yi;
yp0 += 2;
yp1 -= 2;
}
}
/* Mirror on both sides for TDAC */
{
kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1;
kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
const opus_val16 * OPUS_RESTRICT wp1 = window;
const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1;
for(i = 0; i < overlap/2; i++)
{
kiss_fft_scalar x1, x2;
x1 = *xp1;
x2 = *yp1;
*yp1++ = MULT16_32_Q15(*wp2, x2) - MULT16_32_Q15(*wp1, x1);
*xp1-- = MULT16_32_Q15(*wp1, x2) + MULT16_32_Q15(*wp2, x1);
wp1++;
wp2--;
}
}
RESTORE_STACK;
}
/* Copyright (c) 2014-2015 Xiph.Org Foundation
Copyright (c) 2024 Arm Limited
Written by Viswanath Puttagunta */
/**
@file celt_neon_intr.c
@brief ARM Neon Intrinsic optimizations for celt
*/
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <arm_neon.h>
#include "../float_cast.h"
#include "../mathops.h"
#include "../pitch.h"
#if defined(OPUS_CHECK_ASM)
#include <stdlib.h>
#endif
#if !defined(DISABLE_FLOAT_API) && defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
void celt_float2int16_neon(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt)
{
int i = 0;
#if defined(__ARM_NEON)
const int BLOCK_SIZE = 16;
const int blockedSize = cnt / BLOCK_SIZE * BLOCK_SIZE;
for (; i < blockedSize; i += BLOCK_SIZE)
{
float32x4_t orig_a = vld1q_f32(&in[i + 0]);
float32x4_t orig_b = vld1q_f32(&in[i + 4]);
float32x4_t orig_c = vld1q_f32(&in[i + 8]);
float32x4_t orig_d = vld1q_f32(&in[i + 12]);
int16x4_t asShort_a = vqmovn_s32(vroundf(vmulq_n_f32(orig_a, CELT_SIG_SCALE)));
int16x4_t asShort_b = vqmovn_s32(vroundf(vmulq_n_f32(orig_b, CELT_SIG_SCALE)));
int16x4_t asShort_c = vqmovn_s32(vroundf(vmulq_n_f32(orig_c, CELT_SIG_SCALE)));
int16x4_t asShort_d = vqmovn_s32(vroundf(vmulq_n_f32(orig_d, CELT_SIG_SCALE)));
vst1_s16(&out[i + 0], asShort_a);
vst1_s16(&out[i + 4], asShort_b);
vst1_s16(&out[i + 8], asShort_c);
vst1_s16(&out[i + 12], asShort_d);
# if defined(OPUS_CHECK_ASM)
short out_c[BLOCK_SIZE];
int j;
for(j = 0; j < BLOCK_SIZE; j++)
{
out_c[j] = FLOAT2INT16(in[i + j]);
celt_assert(abs((out_c[j] - out[i + j])) <= 1);
}
# endif
}
#endif
for (; i < cnt; i++)
{
out[i] = FLOAT2INT16(in[i]);
}
}
#endif
#if defined(FIXED_POINT)
#include <string.h>
void xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len)
{
int j;
int32x4_t a = vld1q_s32(sum);
/* Load y[0...3] */
/* This requires len>0 to always be valid (which we assert in the C code). */
int16x4_t y0 = vld1_s16(y);
y += 4;
/* This loop loads one y value more than we actually need.
Therefore we have to stop as soon as there are 8 or fewer samples left
(instead of 7), to avoid reading past the end of the array. */
for (j = 0; j + 8 < len; j += 8)
{
/* Load x[0...7] */
int16x8_t xx = vld1q_s16(x);
int16x4_t x0 = vget_low_s16(xx);
int16x4_t x4 = vget_high_s16(xx);
/* Load y[4...11] */
int16x8_t yy = vld1q_s16(y);
int16x4_t y4 = vget_low_s16(yy);
int16x4_t y8 = vget_high_s16(yy);
int32x4_t a0 = vmlal_lane_s16(a, y0, x0, 0);
int32x4_t a1 = vmlal_lane_s16(a0, y4, x4, 0);
int16x4_t y1 = vext_s16(y0, y4, 1);
int16x4_t y5 = vext_s16(y4, y8, 1);
int32x4_t a2 = vmlal_lane_s16(a1, y1, x0, 1);
int32x4_t a3 = vmlal_lane_s16(a2, y5, x4, 1);
int16x4_t y2 = vext_s16(y0, y4, 2);
int16x4_t y6 = vext_s16(y4, y8, 2);
int32x4_t a4 = vmlal_lane_s16(a3, y2, x0, 2);
int32x4_t a5 = vmlal_lane_s16(a4, y6, x4, 2);
int16x4_t y3 = vext_s16(y0, y4, 3);
int16x4_t y7 = vext_s16(y4, y8, 3);
int32x4_t a6 = vmlal_lane_s16(a5, y3, x0, 3);
int32x4_t a7 = vmlal_lane_s16(a6, y7, x4, 3);
y0 = y8;
a = a7;
x += 8;
y += 8;
}
if (j + 4 < len) {
/* Load x[0...3] */
int16x4_t x0 = vld1_s16(x);
/* Load y[4...7] */
int16x4_t y4 = vld1_s16(y);
int32x4_t a0 = vmlal_lane_s16(a, y0, x0, 0);
int16x4_t y1 = vext_s16(y0, y4, 1);
int32x4_t a1 = vmlal_lane_s16(a0, y1, x0, 1);
int16x4_t y2 = vext_s16(y0, y4, 2);
int32x4_t a2 = vmlal_lane_s16(a1, y2, x0, 2);
int16x4_t y3 = vext_s16(y0, y4, 3);
int32x4_t a3 = vmlal_lane_s16(a2, y3, x0, 3);
y0 = y4;
a = a3;
x += 4;
y += 4;
j += 4;
}
if (j + 2 < len) {
/* Load x[0...1] */
int16x4x2_t xx = vld2_dup_s16(x);
int16x4_t x0 = xx.val[0];
int16x4_t x1 = xx.val[1];
/* Load y[4...5].
We would like to use vld1_dup_s32(), but casting the pointer would
break strict aliasing rules and potentially have alignment issues.
Fortunately the compiler seems capable of translating this memcpy()
and vdup_n_s32() into the equivalent vld1_dup_s32().*/
int32_t yy;
memcpy(&yy, y, sizeof(yy));
int16x4_t y4 = vreinterpret_s16_s32(vdup_n_s32(yy));
int32x4_t a0 = vmlal_s16(a, y0, x0);
int16x4_t y1 = vext_s16(y0, y4, 1);
/* Replace bottom copy of {y[5], y[4]} in y4 with {y[3], y[2]} from y0,
using VSRI instead of VEXT, since it's a data-processing
instruction. */
y0 = vreinterpret_s16_s64(vsri_n_s64(vreinterpret_s64_s16(y4),
vreinterpret_s64_s16(y0), 32));
int32x4_t a1 = vmlal_s16(a0, y1, x1);
a = a1;
x += 2;
y += 2;
j += 2;
}
if (j + 1 < len) {
/* Load next x. */
int16x4_t x0 = vld1_dup_s16(x);
int32x4_t a0 = vmlal_s16(a, y0, x0);
/* Load last y. */
int16x4_t y4 = vld1_dup_s16(y);
y0 = vreinterpret_s16_s64(vsri_n_s64(vreinterpret_s64_s16(y4),
vreinterpret_s64_s16(y0), 16));
a = a0;
x++;
}
/* Load last x. */
int16x4_t x0 = vld1_dup_s16(x);
int32x4_t a0 = vmlal_s16(a, y0, x0);
vst1q_s32(sum, a0);
}
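/* For reference, a minimal scalar sketch of what this kernel computes,
   sum[k] += x[j]*y[j+k] for k = 0..3. The helper name below is hypothetical
   (the real generic fallback lives in the portable CELT code) and the block
   is disabled so it does not participate in the build. */
#if 0
static void xcorr_kernel_scalar_ref(const opus_val16 *x, const opus_val16 *y,
                                    opus_val32 sum[4], int len)
{
   int j, k;
   for (j = 0; j < len; j++)
      for (k = 0; k < 4; k++)
         sum[k] += (opus_val32)x[j]*y[j+k];
}
#endif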
#else
#if defined(__ARM_FEATURE_FMA) && defined(__ARM_ARCH_ISA_A64)
/* If we can, force the compiler to use an FMA instruction rather than break
* vmlaq_f32() into fmul/fadd. */
#ifdef vmlaq_lane_f32
#undef vmlaq_lane_f32
#endif
#define vmlaq_lane_f32(a,b,c,lane) vfmaq_lane_f32(a,b,c,lane)
#endif
/*
 * Function: xcorr_kernel_neon_float
 * ---------------------------------
 * Computes the 4 correlation values sum[k] = sum(x[j]*y[j+k], j=0...len-1)
 * for k=0...3 and stores them in sum[4]
 */
static void xcorr_kernel_neon_float(const float32_t *x, const float32_t *y,
float32_t sum[4], int len) {
float32x4_t YY[3];
float32x4_t YEXT[3];
float32x4_t XX[2];
float32x2_t XX_2;
float32x4_t SUMM;
const float32_t *xi = x;
const float32_t *yi = y;
celt_assert(len>0);
YY[0] = vld1q_f32(yi);
SUMM = vdupq_n_f32(0);
/* Each iteration consumes 8 elements of the x vector and loads 12
 * elements of the y vector, but the 12th y element is never actually
 * used. So when len == 8 we may only access y[0] to y[10]; y[11] must
 * not be read, hence the loop condition is len > 8 and not len >= 8.
 */
while (len > 8) {
yi += 4;
YY[1] = vld1q_f32(yi);
yi += 4;
YY[2] = vld1q_f32(yi);
XX[0] = vld1q_f32(xi);
xi += 4;
XX[1] = vld1q_f32(xi);
xi += 4;
SUMM = vmlaq_lane_f32(SUMM, YY[0], vget_low_f32(XX[0]), 0);
YEXT[0] = vextq_f32(YY[0], YY[1], 1);
SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[0]), 1);
YEXT[1] = vextq_f32(YY[0], YY[1], 2);
SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[0]), 0);
YEXT[2] = vextq_f32(YY[0], YY[1], 3);
SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[0]), 1);
SUMM = vmlaq_lane_f32(SUMM, YY[1], vget_low_f32(XX[1]), 0);
YEXT[0] = vextq_f32(YY[1], YY[2], 1);
SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[1]), 1);
YEXT[1] = vextq_f32(YY[1], YY[2], 2);
SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[1]), 0);
YEXT[2] = vextq_f32(YY[1], YY[2], 3);
SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[1]), 1);
YY[0] = YY[2];
len -= 8;
}
/* This step consumes 4 elements of the x vector and loads 8 elements of
 * the y vector, but the 8th y element is never actually used. So when
 * len == 4 we may only access y[0] to y[6]; y[7] must not be read,
 * hence the condition is len > 4 and not len >= 4.
 */
if (len > 4) {
yi += 4;
YY[1] = vld1q_f32(yi);
XX[0] = vld1q_f32(xi);
xi += 4;
SUMM = vmlaq_lane_f32(SUMM, YY[0], vget_low_f32(XX[0]), 0);
YEXT[0] = vextq_f32(YY[0], YY[1], 1);
SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[0]), 1);
YEXT[1] = vextq_f32(YY[0], YY[1], 2);
SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[0]), 0);
YEXT[2] = vextq_f32(YY[0], YY[1], 3);
SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[0]), 1);
YY[0] = YY[1];
len -= 4;
}
while (--len > 0) {
XX_2 = vld1_dup_f32(xi++);
SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0);
YY[0] = vld1q_f32(++yi);
}
XX_2 = vld1_dup_f32(xi);
SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0);
vst1q_f32(sum, SUMM);
}
void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
opus_val32 *xcorr, int len, int max_pitch, int arch) {
int i;
(void)arch;
celt_assert(max_pitch > 0);
celt_sig_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);
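/* (The pointer difference from NULL is just a way of checking that _x is
   4-byte aligned, i.e. that ((uintptr_t)_x & 3) == 0.) */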
for (i = 0; i < (max_pitch-3); i += 4) {
xcorr_kernel_neon_float((const float32_t *)_x, (const float32_t *)_y+i,
(float32_t *)xcorr+i, len);
}
/* In case max_pitch isn't a multiple of 4, do non-unrolled version. */
for (; i < max_pitch; i++) {
xcorr[i] = celt_inner_prod_neon(_x, _y+i, len);
}
}
#endif
; Copyright (c) 2007-2008 CSIRO
; Copyright (c) 2007-2009 Xiph.Org Foundation
; Copyright (c) 2013 Parrot
; Written by Aurélien Zanelli
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
;
; - Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
;
; - Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in the
; documentation and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
AREA |.text|, CODE, READONLY
GET celt/arm/armopts.s
IF OPUS_ARM_MAY_HAVE_EDSP
EXPORT celt_pitch_xcorr_edsp
ENDIF
IF OPUS_ARM_MAY_HAVE_NEON
EXPORT celt_pitch_xcorr_neon
ENDIF
IF OPUS_ARM_MAY_HAVE_NEON
; Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3
xcorr_kernel_neon PROC
xcorr_kernel_neon_start
; input:
; r3 = int len
; r4 = opus_val16 *x
; r5 = opus_val16 *y
; q0 = opus_val32 sum[4]
; output:
; q0 = opus_val32 sum[4]
; preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15
; internal usage:
; r12 = int j
; d3 = y_3|y_2|y_1|y_0
; q2 = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4
; q3 = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0
; q8 = scratch
;
; Load y[0...3]
; This requires len>0 to always be valid (which we assert in the C code).
VLD1.16 {d5}, [r5]!
SUBS r12, r3, #8
BLE xcorr_kernel_neon_process4
; Process 8 samples at a time.
; This loop loads one y value more than we actually need. Therefore we have to
; stop as soon as there are 8 or fewer samples left (instead of 7), to avoid
; reading past the end of the array.
xcorr_kernel_neon_process8
; This loop has 19 total instructions (10 cycles to issue, minimum), with
; - 2 cycles of ARM instructions,
; - 10 cycles of load/store/byte permute instructions, and
; - 9 cycles of data processing instructions.
; On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the
; latter two categories, meaning the whole loop should run in 10 cycles per
; iteration, barring cache misses.
;
; Load x[0...7]
VLD1.16 {d6, d7}, [r4]!
; Unlike VMOV, VAND is a data processing instruction (and doesn't get
; assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1.
VAND d3, d5, d5
SUBS r12, r12, #8
; Load y[4...11]
VLD1.16 {d4, d5}, [r5]!
VMLAL.S16 q0, d3, d6[0]
VEXT.16 d16, d3, d4, #1
VMLAL.S16 q0, d4, d7[0]
VEXT.16 d17, d4, d5, #1
VMLAL.S16 q0, d16, d6[1]
VEXT.16 d16, d3, d4, #2
VMLAL.S16 q0, d17, d7[1]
VEXT.16 d17, d4, d5, #2
VMLAL.S16 q0, d16, d6[2]
VEXT.16 d16, d3, d4, #3
VMLAL.S16 q0, d17, d7[2]
VEXT.16 d17, d4, d5, #3
VMLAL.S16 q0, d16, d6[3]
VMLAL.S16 q0, d17, d7[3]
BGT xcorr_kernel_neon_process8
; Process 4 samples here if we have > 4 left (still reading one extra y value).
xcorr_kernel_neon_process4
ADDS r12, r12, #4
BLE xcorr_kernel_neon_process2
; Load x[0...3]
VLD1.16 d6, [r4]!
; Use VAND since it's a data processing instruction again.
VAND d4, d5, d5
SUB r12, r12, #4
; Load y[4...7]
VLD1.16 d5, [r5]!
VMLAL.S16 q0, d4, d6[0]
VEXT.16 d16, d4, d5, #1
VMLAL.S16 q0, d16, d6[1]
VEXT.16 d16, d4, d5, #2
VMLAL.S16 q0, d16, d6[2]
VEXT.16 d16, d4, d5, #3
VMLAL.S16 q0, d16, d6[3]
; Process 2 samples here if we have > 2 left (still reading one extra y value).
xcorr_kernel_neon_process2
ADDS r12, r12, #2
BLE xcorr_kernel_neon_process1
; Load x[0...1]
VLD2.16 {d6[],d7[]}, [r4]!
; Use VAND since it's a data processing instruction again.
VAND d4, d5, d5
SUB r12, r12, #2
; Load y[4...5]
VLD1.32 {d5[]}, [r5]!
VMLAL.S16 q0, d4, d6
VEXT.16 d16, d4, d5, #1
; Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI
; instead of VEXT, since it's a data-processing instruction.
VSRI.64 d5, d4, #32
VMLAL.S16 q0, d16, d7
; Process 1 sample using the extra y value we loaded above.
xcorr_kernel_neon_process1
; Load next *x
VLD1.16 {d6[]}, [r4]!
ADDS r12, r12, #1
; y[0...3] are left in d5 from prior iteration(s) (if any)
VMLAL.S16 q0, d5, d6
MOVLE pc, lr
; Now process 1 last sample, not reading ahead.
; Load last *y
VLD1.16 {d4[]}, [r5]!
VSRI.64 d4, d5, #16
; Load last *x
VLD1.16 {d6[]}, [r4]!
VMLAL.S16 q0, d4, d6
MOV pc, lr
ENDP
; opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
; opus_val32 *xcorr, int len, int max_pitch, int arch)
celt_pitch_xcorr_neon PROC
; input:
; r0 = opus_val16 *_x
; r1 = opus_val16 *_y
; r2 = opus_val32 *xcorr
; r3 = int len
; output:
; r0 = int maxcorr
; internal usage:
; r4 = opus_val16 *x (for xcorr_kernel_neon())
; r5 = opus_val16 *y (for xcorr_kernel_neon())
; r6 = int max_pitch
; r12 = int j
; q15 = int maxcorr[4] (q15 is not used by xcorr_kernel_neon())
; ignored:
; int arch
STMFD sp!, {r4-r6, lr}
LDR r6, [sp, #16]
VMOV.S32 q15, #1
; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
SUBS r6, r6, #4
BLT celt_pitch_xcorr_neon_process4_done
celt_pitch_xcorr_neon_process4
; xcorr_kernel_neon parameters:
; r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0}
MOV r4, r0
MOV r5, r1
VEOR q0, q0, q0
; xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3.
; So we don't save/restore any other registers.
BL xcorr_kernel_neon_start
SUBS r6, r6, #4
VST1.32 {q0}, [r2]!
; _y += 4
ADD r1, r1, #8
VMAX.S32 q15, q15, q0
; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
BGE celt_pitch_xcorr_neon_process4
; We have less than 4 sums left to compute.
celt_pitch_xcorr_neon_process4_done
ADDS r6, r6, #4
; Reduce maxcorr to a single value
VMAX.S32 d30, d30, d31
VPMAX.S32 d30, d30, d30
; if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done
BLE celt_pitch_xcorr_neon_done
; Now compute each remaining sum one at a time.
celt_pitch_xcorr_neon_process_remaining
MOV r4, r0
MOV r5, r1
VMOV.I32 q0, #0
SUBS r12, r3, #8
BLT celt_pitch_xcorr_neon_process_remaining4
; Sum terms 8 at a time.
celt_pitch_xcorr_neon_process_remaining_loop8
; Load x[0...7]
VLD1.16 {q1}, [r4]!
; Load y[0...7]
VLD1.16 {q2}, [r5]!
SUBS r12, r12, #8
VMLAL.S16 q0, d4, d2
VMLAL.S16 q0, d5, d3
BGE celt_pitch_xcorr_neon_process_remaining_loop8
; Sum terms 4 at a time.
celt_pitch_xcorr_neon_process_remaining4
ADDS r12, r12, #4
BLT celt_pitch_xcorr_neon_process_remaining4_done
; Load x[0...3]
VLD1.16 {d2}, [r4]!
; Load y[0...3]
VLD1.16 {d3}, [r5]!
SUB r12, r12, #4
VMLAL.S16 q0, d3, d2
celt_pitch_xcorr_neon_process_remaining4_done
; Reduce the sum to a single value.
VADD.S32 d0, d0, d1
VPADDL.S32 d0, d0
ADDS r12, r12, #4
BLE celt_pitch_xcorr_neon_process_remaining_loop_done
; Sum terms 1 at a time.
celt_pitch_xcorr_neon_process_remaining_loop1
VLD1.16 {d2[]}, [r4]!
VLD1.16 {d3[]}, [r5]!
SUBS r12, r12, #1
VMLAL.S16 q0, d2, d3
BGT celt_pitch_xcorr_neon_process_remaining_loop1
celt_pitch_xcorr_neon_process_remaining_loop_done
VST1.32 {d0[0]}, [r2]!
VMAX.S32 d30, d30, d0
SUBS r6, r6, #1
; _y++
ADD r1, r1, #2
; if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining
BGT celt_pitch_xcorr_neon_process_remaining
celt_pitch_xcorr_neon_done
VMOV.32 r0, d30[0]
LDMFD sp!, {r4-r6, pc}
ENDP
ENDIF
IF OPUS_ARM_MAY_HAVE_EDSP
; This will get used on ARMv7 devices without NEON, so it has been optimized
; to take advantage of dual-issuing where possible.
xcorr_kernel_edsp PROC
xcorr_kernel_edsp_start
; input:
; r3 = int len
; r4 = opus_val16 *_x (must be 32-bit aligned)
; r5 = opus_val16 *_y (must be 32-bit aligned)
; r6...r9 = opus_val32 sum[4]
; output:
; r6...r9 = opus_val32 sum[4]
; preserved: r0-r5
; internal usage
; r2 = int j
; r12,r14 = opus_val16 x[4]
; r10,r11 = opus_val16 y[4]
STMFD sp!, {r2,r4,r5,lr}
LDR r10, [r5], #4 ; Load y[0...1]
SUBS r2, r3, #4 ; j = len-4
LDR r11, [r5], #4 ; Load y[2...3]
BLE xcorr_kernel_edsp_process4_done
LDR r12, [r4], #4 ; Load x[0...1]
; Stall
xcorr_kernel_edsp_process4
; The multiplies must issue from pipeline 0, and can't dual-issue with each
; other. Every other instruction here dual-issues with a multiply, and is
; thus "free". There should be no stalls in the body of the loop.
SMLABB r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x_0,y_0)
LDR r14, [r4], #4 ; Load x[2...3]
SMLABT r7, r12, r10, r7 ; sum[1] = MAC16_16(sum[1],x_0,y_1)
SUBS r2, r2, #4 ; j-=4
SMLABB r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x_0,y_2)
SMLABT r9, r12, r11, r9 ; sum[3] = MAC16_16(sum[3],x_0,y_3)
SMLATT r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x_1,y_1)
LDR r10, [r5], #4 ; Load y[4...5]
SMLATB r7, r12, r11, r7 ; sum[1] = MAC16_16(sum[1],x_1,y_2)
SMLATT r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x_1,y_3)
SMLATB r9, r12, r10, r9 ; sum[3] = MAC16_16(sum[3],x_1,y_4)
LDRGT r12, [r4], #4 ; Load x[0...1]
SMLABB r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],x_2,y_2)
SMLABT r7, r14, r11, r7 ; sum[1] = MAC16_16(sum[1],x_2,y_3)
SMLABB r8, r14, r10, r8 ; sum[2] = MAC16_16(sum[2],x_2,y_4)
SMLABT r9, r14, r10, r9 ; sum[3] = MAC16_16(sum[3],x_2,y_5)
SMLATT r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],x_3,y_3)
LDR r11, [r5], #4 ; Load y[6...7]
SMLATB r7, r14, r10, r7 ; sum[1] = MAC16_16(sum[1],x_3,y_4)
SMLATT r8, r14, r10, r8 ; sum[2] = MAC16_16(sum[2],x_3,y_5)
SMLATB r9, r14, r11, r9 ; sum[3] = MAC16_16(sum[3],x_3,y_6)
BGT xcorr_kernel_edsp_process4
xcorr_kernel_edsp_process4_done
ADDS r2, r2, #4
BLE xcorr_kernel_edsp_done
LDRH r12, [r4], #2 ; r12 = *x++
SUBS r2, r2, #1 ; j--
; Stall
SMLABB r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x,y_0)
LDRHGT r14, [r4], #2 ; r14 = *x++
SMLABT r7, r12, r10, r7 ; sum[1] = MAC16_16(sum[1],x,y_1)
SMLABB r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x,y_2)
SMLABT r9, r12, r11, r9 ; sum[3] = MAC16_16(sum[3],x,y_3)
BLE xcorr_kernel_edsp_done
SMLABT r6, r14, r10, r6 ; sum[0] = MAC16_16(sum[0],x,y_1)
SUBS r2, r2, #1 ; j--
SMLABB r7, r14, r11, r7 ; sum[1] = MAC16_16(sum[1],x,y_2)
LDRH r10, [r5], #2 ; r10 = y_4 = *y++
SMLABT r8, r14, r11, r8 ; sum[2] = MAC16_16(sum[2],x,y_3)
LDRHGT r12, [r4], #2 ; r12 = *x++
SMLABB r9, r14, r10, r9 ; sum[3] = MAC16_16(sum[3],x,y_4)
BLE xcorr_kernel_edsp_done
SMLABB r6, r12, r11, r6 ; sum[0] = MAC16_16(sum[0],tmp,y_2)
CMP r2, #1 ; j--
SMLABT r7, r12, r11, r7 ; sum[1] = MAC16_16(sum[1],tmp,y_3)
LDRH r2, [r5], #2 ; r2 = y_5 = *y++
SMLABB r8, r12, r10, r8 ; sum[2] = MAC16_16(sum[2],tmp,y_4)
LDRHGT r14, [r4] ; r14 = *x
SMLABB r9, r12, r2, r9 ; sum[3] = MAC16_16(sum[3],tmp,y_5)
BLE xcorr_kernel_edsp_done
SMLABT r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],tmp,y_3)
LDRH r11, [r5] ; r11 = y_6 = *y
SMLABB r7, r14, r10, r7 ; sum[1] = MAC16_16(sum[1],tmp,y_4)
SMLABB r8, r14, r2, r8 ; sum[2] = MAC16_16(sum[2],tmp,y_5)
SMLABB r9, r14, r11, r9 ; sum[3] = MAC16_16(sum[3],tmp,y_6)
xcorr_kernel_edsp_done
LDMFD sp!, {r2,r4,r5,pc}
ENDP
celt_pitch_xcorr_edsp PROC
; input:
; r0 = opus_val16 *_x (must be 32-bit aligned)
; r1 = opus_val16 *_y (only needs to be 16-bit aligned)
; r2 = opus_val32 *xcorr
; r3 = int len
; output:
; r0 = maxcorr
; internal usage
; r4 = opus_val16 *x
; r5 = opus_val16 *y
; r6 = opus_val32 sum0
; r7 = opus_val32 sum1
; r8 = opus_val32 sum2
; r9 = opus_val32 sum3
; r1 = int max_pitch
; r12 = int j
; ignored:
; int arch
STMFD sp!, {r4-r11, lr}
MOV r5, r1
LDR r1, [sp, #36]
MOV r4, r0
TST r5, #3
; maxcorr = 1
MOV r0, #1
BEQ celt_pitch_xcorr_edsp_process1u_done
; Compute one sum at the start to make y 32-bit aligned.
SUBS r12, r3, #4
; r14 = sum = 0
MOV r14, #0
LDRH r8, [r5], #2
BLE celt_pitch_xcorr_edsp_process1u_loop4_done
LDR r6, [r4], #4
MOV r8, r8, LSL #16
celt_pitch_xcorr_edsp_process1u_loop4
LDR r9, [r5], #4
SMLABT r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0)
LDR r7, [r4], #4
SMLATB r14, r6, r9, r14 ; sum = MAC16_16(sum, x_1, y_1)
LDR r8, [r5], #4
SMLABT r14, r7, r9, r14 ; sum = MAC16_16(sum, x_2, y_2)
SUBS r12, r12, #4 ; j-=4
SMLATB r14, r7, r8, r14 ; sum = MAC16_16(sum, x_3, y_3)
LDRGT r6, [r4], #4
BGT celt_pitch_xcorr_edsp_process1u_loop4
MOV r8, r8, LSR #16
celt_pitch_xcorr_edsp_process1u_loop4_done
ADDS r12, r12, #4
celt_pitch_xcorr_edsp_process1u_loop1
LDRHGE r6, [r4], #2
; Stall
SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, *x, *y)
SUBSGE r12, r12, #1
LDRHGT r8, [r5], #2
BGT celt_pitch_xcorr_edsp_process1u_loop1
; Restore _x
SUB r4, r4, r3, LSL #1
; Restore and advance _y
SUB r5, r5, r3, LSL #1
; maxcorr = max(maxcorr, sum)
CMP r0, r14
ADD r5, r5, #2
MOVLT r0, r14
SUBS r1, r1, #1
; xcorr[i] = sum
STR r14, [r2], #4
BLE celt_pitch_xcorr_edsp_done
celt_pitch_xcorr_edsp_process1u_done
; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2
SUBS r1, r1, #4
BLT celt_pitch_xcorr_edsp_process2
celt_pitch_xcorr_edsp_process4
; xcorr_kernel_edsp parameters:
; r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0}
MOV r6, #0
MOV r7, #0
MOV r8, #0
MOV r9, #0
BL xcorr_kernel_edsp_start ; xcorr_kernel_edsp(_x, _y+i, xcorr+i, len)
; maxcorr = max(maxcorr, sum0, sum1, sum2, sum3)
CMP r0, r6
; _y+=4
ADD r5, r5, #8
MOVLT r0, r6
CMP r0, r7
MOVLT r0, r7
CMP r0, r8
MOVLT r0, r8
CMP r0, r9
MOVLT r0, r9
STMIA r2!, {r6-r9}
SUBS r1, r1, #4
BGE celt_pitch_xcorr_edsp_process4
celt_pitch_xcorr_edsp_process2
ADDS r1, r1, #2
BLT celt_pitch_xcorr_edsp_process1a
SUBS r12, r3, #4
; {r10, r11} = {sum0, sum1} = {0, 0}
MOV r10, #0
MOV r11, #0
LDR r8, [r5], #4
BLE celt_pitch_xcorr_edsp_process2_loop_done
LDR r6, [r4], #4
LDR r9, [r5], #4
celt_pitch_xcorr_edsp_process2_loop4
SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0)
LDR r7, [r4], #4
SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1)
SUBS r12, r12, #4 ; j-=4
SMLATT r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_1, y_1)
LDR r8, [r5], #4
SMLATB r11, r6, r9, r11 ; sum1 = MAC16_16(sum1, x_1, y_2)
LDRGT r6, [r4], #4
SMLABB r10, r7, r9, r10 ; sum0 = MAC16_16(sum0, x_2, y_2)
SMLABT r11, r7, r9, r11 ; sum1 = MAC16_16(sum1, x_2, y_3)
SMLATT r10, r7, r9, r10 ; sum0 = MAC16_16(sum0, x_3, y_3)
LDRGT r9, [r5], #4
SMLATB r11, r7, r8, r11 ; sum1 = MAC16_16(sum1, x_3, y_4)
BGT celt_pitch_xcorr_edsp_process2_loop4
celt_pitch_xcorr_edsp_process2_loop_done
ADDS r12, r12, #2
BLE celt_pitch_xcorr_edsp_process2_1
LDR r6, [r4], #4
; Stall
SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0)
LDR r9, [r5], #4
SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1)
SUB r12, r12, #2
SMLATT r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_1, y_1)
MOV r8, r9
SMLATB r11, r6, r9, r11 ; sum1 = MAC16_16(sum1, x_1, y_2)
celt_pitch_xcorr_edsp_process2_1
LDRH r6, [r4], #2
ADDS r12, r12, #1
; Stall
SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0)
LDRHGT r7, [r4], #2
SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1)
BLE celt_pitch_xcorr_edsp_process2_done
LDRH r9, [r5], #2
SMLABT r10, r7, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_1)
SMLABB r11, r7, r9, r11 ; sum1 = MAC16_16(sum1, x_0, y_2)
celt_pitch_xcorr_edsp_process2_done
; Restore _x
SUB r4, r4, r3, LSL #1
; Restore and advance _y
SUB r5, r5, r3, LSL #1
; maxcorr = max(maxcorr, sum0)
CMP r0, r10
ADD r5, r5, #2
MOVLT r0, r10
SUB r1, r1, #2
; maxcorr = max(maxcorr, sum1)
CMP r0, r11
; xcorr[i] = sum
STR r10, [r2], #4
MOVLT r0, r11
STR r11, [r2], #4
celt_pitch_xcorr_edsp_process1a
ADDS r1, r1, #1
BLT celt_pitch_xcorr_edsp_done
SUBS r12, r3, #4
; r14 = sum = 0
MOV r14, #0
BLT celt_pitch_xcorr_edsp_process1a_loop_done
LDR r6, [r4], #4
LDR r8, [r5], #4
LDR r7, [r4], #4
LDR r9, [r5], #4
celt_pitch_xcorr_edsp_process1a_loop4
SMLABB r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0)
SUBS r12, r12, #4 ; j-=4
SMLATT r14, r6, r8, r14 ; sum = MAC16_16(sum, x_1, y_1)
LDRGE r6, [r4], #4
SMLABB r14, r7, r9, r14 ; sum = MAC16_16(sum, x_2, y_2)
LDRGE r8, [r5], #4
SMLATT r14, r7, r9, r14 ; sum = MAC16_16(sum, x_3, y_3)
LDRGE r7, [r4], #4
LDRGE r9, [r5], #4
BGE celt_pitch_xcorr_edsp_process1a_loop4
celt_pitch_xcorr_edsp_process1a_loop_done
ADDS r12, r12, #2
LDRGE r6, [r4], #4
LDRGE r8, [r5], #4
; Stall
SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0)
SUBGE r12, r12, #2
SMLATTGE r14, r6, r8, r14 ; sum = MAC16_16(sum, x_1, y_1)
ADDS r12, r12, #1
LDRHGE r6, [r4], #2
LDRHGE r8, [r5], #2
; Stall
SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, *x, *y)
; maxcorr = max(maxcorr, sum)
CMP r0, r14
; xcorr[i] = sum
STR r14, [r2], #4
MOVLT r0, r14
celt_pitch_xcorr_edsp_done
LDMFD sp!, {r4-r11, pc}
ENDP
ENDIF
END
/* Copyright (c) 2015 Xiph.Org Foundation
Written by Viswanath Puttagunta */
/**
@file fft_arm.h
@brief ARM Neon Intrinsic optimizations for fft using NE10 library
*/
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#if !defined(FFT_ARM_H)
#define FFT_ARM_H
#include "kiss_fft.h"
#if defined(HAVE_ARM_NE10)
int opus_fft_alloc_arm_neon(kiss_fft_state *st);
void opus_fft_free_arm_neon(kiss_fft_state *st);
void opus_fft_neon(const kiss_fft_state *st,
const kiss_fft_cpx *fin,
kiss_fft_cpx *fout);
void opus_ifft_neon(const kiss_fft_state *st,
const kiss_fft_cpx *fin,
kiss_fft_cpx *fout);
#if !defined(OPUS_HAVE_RTCD)
#define OVERRIDE_OPUS_FFT (1)
#define opus_fft_alloc_arch(_st, arch) \
((void)(arch), opus_fft_alloc_arm_neon(_st))
#define opus_fft_free_arch(_st, arch) \
((void)(arch), opus_fft_free_arm_neon(_st))
#define opus_fft(_st, _fin, _fout, arch) \
((void)(arch), opus_fft_neon(_st, _fin, _fout))
#define opus_ifft(_st, _fin, _fout, arch) \
((void)(arch), opus_ifft_neon(_st, _fin, _fout))
#endif /* OPUS_HAVE_RTCD */
#endif /* HAVE_ARM_NE10 */
#endif
/* Copyright (C) 2015 Vidyo */
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef FIXED_ARM64_H
#define FIXED_ARM64_H
#include <arm_neon.h>
#undef SIG2WORD16
#define SIG2WORD16(x) (vqmovns_s32(PSHR32((x), SIG_SHIFT)))
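/* vqmovns_s32() saturates the shifted 32-bit signal to the int16 range with
   a single scalar instruction, replacing the generic compare-and-clamp
   version of SIG2WORD16. */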
#endif
/* Copyright (C) 2013 Xiph.Org Foundation and contributors */
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef FIXED_ARMv4_H
#define FIXED_ARMv4_H
/** 16x32 multiplication, followed by a 16-bit shift right. Result fits in 32 bits */
#undef MULT16_32_Q16
static OPUS_INLINE opus_val32 MULT16_32_Q16_armv4(opus_val16 a, opus_val32 b)
{
unsigned rd_lo;
int rd_hi;
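/* SMULL forms the full 64-bit product (a<<16)*b; the high word rd_hi is
   therefore (a*b)>>16. */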
__asm__(
"#MULT16_32_Q16\n\t"
"smull %0, %1, %2, %3\n\t"
: "=&r"(rd_lo), "=&r"(rd_hi)
: "%r"(b),"r"(SHL32(a,16))
);
return rd_hi;
}
#define MULT16_32_Q16(a, b) (MULT16_32_Q16_armv4(a, b))
/** 16x32 multiplication, followed by a 15-bit shift right. Result fits in 32 bits */
#undef MULT16_32_Q15
static OPUS_INLINE opus_val32 MULT16_32_Q15_armv4(opus_val16 a, opus_val32 b)
{
unsigned rd_lo;
int rd_hi;
__asm__(
"#MULT16_32_Q15\n\t"
"smull %0, %1, %2, %3\n\t"
: "=&r"(rd_lo), "=&r"(rd_hi)
: "%r"(b), "r"(SHL32(a,16))
);
/*We intentionally don't OR in the high bit of rd_lo for speed.*/
return SHL32(rd_hi,1);
}
#define MULT16_32_Q15(a, b) (MULT16_32_Q15_armv4(a, b))
/** 16x32 multiply, followed by a 15-bit shift right and 32-bit add.
b must fit in 31 bits.
Result fits in 32 bits. */
#undef MAC16_32_Q15
#define MAC16_32_Q15(c, a, b) ADD32(c, MULT16_32_Q15(a, b))
/** 16x32 multiply, followed by a 16-bit shift right and 32-bit add.
Result fits in 32 bits. */
#undef MAC16_32_Q16
#define MAC16_32_Q16(c, a, b) ADD32(c, MULT16_32_Q16(a, b))
/** 32x32 multiplication, followed by a 31-bit shift right. Result fits in 32 bits */
#undef MULT32_32_Q31
#define MULT32_32_Q31(a,b) (opus_val32)((((opus_int64)(a)) * ((opus_int64)(b)))>>31)
#endif
/* Copyright (C) 2007-2009 Xiph.Org Foundation
Copyright (C) 2003-2008 Jean-Marc Valin
Copyright (C) 2007-2008 CSIRO
Copyright (C) 2013 Parrot */
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef FIXED_ARMv5E_H
#define FIXED_ARMv5E_H
#include "fixed_armv4.h"
/** 16x32 multiplication, followed by a 16-bit shift right. Result fits in 32 bits */
#undef MULT16_32_Q16
static OPUS_INLINE opus_val32 MULT16_32_Q16_armv5e(opus_val16 a, opus_val32 b)
{
int res;
__asm__(
"#MULT16_32_Q16\n\t"
"smulwb %0, %1, %2\n\t"
: "=r"(res)
: "r"(b),"r"(a)
);
return res;
}
#define MULT16_32_Q16(a, b) (MULT16_32_Q16_armv5e(a, b))
/** 16x32 multiplication, followed by a 15-bit shift right. Result fits in 32 bits */
#undef MULT16_32_Q15
static OPUS_INLINE opus_val32 MULT16_32_Q15_armv5e(opus_val16 a, opus_val32 b)
{
int res;
__asm__(
"#MULT16_32_Q15\n\t"
"smulwb %0, %1, %2\n\t"
: "=r"(res)
: "r"(b), "r"(a)
);
return SHL32(res,1);
}
#define MULT16_32_Q15(a, b) (MULT16_32_Q15_armv5e(a, b))
/** 16x32 multiply, followed by a 15-bit shift right and 32-bit add.
b must fit in 31 bits.
Result fits in 32 bits. */
#undef MAC16_32_Q15
static OPUS_INLINE opus_val32 MAC16_32_Q15_armv5e(opus_val32 c, opus_val16 a,
opus_val32 b)
{
int res;
__asm__(
"#MAC16_32_Q15\n\t"
"smlawb %0, %1, %2, %3;\n"
: "=r"(res)
: "r"(SHL32(b,1)), "r"(a), "r"(c)
);
return res;
}
#define MAC16_32_Q15(c, a, b) (MAC16_32_Q15_armv5e(c, a, b))
/** 16x32 multiply, followed by a 16-bit shift right and 32-bit add.
Result fits in 32 bits. */
#undef MAC16_32_Q16
static OPUS_INLINE opus_val32 MAC16_32_Q16_armv5e(opus_val32 c, opus_val16 a,
opus_val32 b)
{
int res;
__asm__(
"#MAC16_32_Q16\n\t"
"smlawb %0, %1, %2, %3;\n"
: "=r"(res)
: "r"(b), "r"(a), "r"(c)
);
return res;
}
#define MAC16_32_Q16(c, a, b) (MAC16_32_Q16_armv5e(c, a, b))
/** 16x16 multiply-add where the result fits in 32 bits */
#undef MAC16_16
static OPUS_INLINE opus_val32 MAC16_16_armv5e(opus_val32 c, opus_val16 a,
opus_val16 b)
{
int res;
__asm__(
"#MAC16_16\n\t"
"smlabb %0, %1, %2, %3;\n"
: "=r"(res)
: "r"(a), "r"(b), "r"(c)
);
return res;
}
#define MAC16_16(c, a, b) (MAC16_16_armv5e(c, a, b))
/** 16x16 multiplication where the result fits in 32 bits */
#undef MULT16_16
static OPUS_INLINE opus_val32 MULT16_16_armv5e(opus_val16 a, opus_val16 b)
{
int res;
__asm__(
"#MULT16_16\n\t"
"smulbb %0, %1, %2;\n"
: "=r"(res)
: "r"(a), "r"(b)
);
return res;
}
#define MULT16_16(a, b) (MULT16_16_armv5e(a, b))
#ifdef OPUS_ARM_INLINE_MEDIA
#undef SIG2WORD16
static OPUS_INLINE opus_val16 SIG2WORD16_armv6(opus_val32 x)
{
celt_sig res;
__asm__(
"#SIG2WORD16\n\t"
"ssat %0, #16, %1, ASR #12\n\t"
: "=r"(res)
: "r"(x+2048)
);
return EXTRACT16(res);
}
#define SIG2WORD16(x) (SIG2WORD16_armv6(x))
#endif /* OPUS_ARM_INLINE_MEDIA */
#endif
/*Copyright (c) 2013, Xiph.Org Foundation and contributors.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.*/
#ifndef KISS_FFT_ARMv4_H
#define KISS_FFT_ARMv4_H
#if !defined(KISS_FFT_GUTS_H)
#error "This file should only be included from _kiss_fft_guts.h"
#endif
#ifdef FIXED_POINT
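/* Each macro below computes a complex multiply with SMULL/SMLAL pairs.
   The trailing "mov ..., lsr" / "orr ..., lsl" sequence splices the middle
   bits of the 64-bit accumulator into a single 32-bit register, i.e. it
   performs the >>15 (>>17 for C_MUL4) scaling shift without needing a
   64-bit shift instruction. */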
#undef C_MUL
#define C_MUL(m,a,b) \
do{ \
int br__; \
int bi__; \
int tt__; \
__asm__ __volatile__( \
"#C_MUL\n\t" \
"ldrsh %[br], [%[bp], #0]\n\t" \
"ldm %[ap], {r0,r1}\n\t" \
"ldrsh %[bi], [%[bp], #2]\n\t" \
"smull %[tt], %[mi], r1, %[br]\n\t" \
"smlal %[tt], %[mi], r0, %[bi]\n\t" \
"rsb %[bi], %[bi], #0\n\t" \
"smull %[br], %[mr], r0, %[br]\n\t" \
"mov %[tt], %[tt], lsr #15\n\t" \
"smlal %[br], %[mr], r1, %[bi]\n\t" \
"orr %[mi], %[tt], %[mi], lsl #17\n\t" \
"mov %[br], %[br], lsr #15\n\t" \
"orr %[mr], %[br], %[mr], lsl #17\n\t" \
: [mr]"=r"((m).r), [mi]"=r"((m).i), \
[br]"=&r"(br__), [bi]"=r"(bi__), [tt]"=r"(tt__) \
: [ap]"r"(&(a)), [bp]"r"(&(b)) \
: "r0", "r1" \
); \
} \
while(0)
#undef C_MUL4
#define C_MUL4(m,a,b) \
do{ \
int br__; \
int bi__; \
int tt__; \
__asm__ __volatile__( \
"#C_MUL4\n\t" \
"ldrsh %[br], [%[bp], #0]\n\t" \
"ldm %[ap], {r0,r1}\n\t" \
"ldrsh %[bi], [%[bp], #2]\n\t" \
"smull %[tt], %[mi], r1, %[br]\n\t" \
"smlal %[tt], %[mi], r0, %[bi]\n\t" \
"rsb %[bi], %[bi], #0\n\t" \
"smull %[br], %[mr], r0, %[br]\n\t" \
"mov %[tt], %[tt], lsr #17\n\t" \
"smlal %[br], %[mr], r1, %[bi]\n\t" \
"orr %[mi], %[tt], %[mi], lsl #15\n\t" \
"mov %[br], %[br], lsr #17\n\t" \
"orr %[mr], %[br], %[mr], lsl #15\n\t" \
: [mr]"=r"((m).r), [mi]"=r"((m).i), \
[br]"=&r"(br__), [bi]"=r"(bi__), [tt]"=r"(tt__) \
: [ap]"r"(&(a)), [bp]"r"(&(b)) \
: "r0", "r1" \
); \
} \
while(0)
#undef C_MULC
#define C_MULC(m,a,b) \
do{ \
int br__; \
int bi__; \
int tt__; \
__asm__ __volatile__( \
"#C_MULC\n\t" \
"ldrsh %[br], [%[bp], #0]\n\t" \
"ldm %[ap], {r0,r1}\n\t" \
"ldrsh %[bi], [%[bp], #2]\n\t" \
"smull %[tt], %[mr], r0, %[br]\n\t" \
"smlal %[tt], %[mr], r1, %[bi]\n\t" \
"rsb %[bi], %[bi], #0\n\t" \
"smull %[br], %[mi], r1, %[br]\n\t" \
"mov %[tt], %[tt], lsr #15\n\t" \
"smlal %[br], %[mi], r0, %[bi]\n\t" \
"orr %[mr], %[tt], %[mr], lsl #17\n\t" \
"mov %[br], %[br], lsr #15\n\t" \
"orr %[mi], %[br], %[mi], lsl #17\n\t" \
: [mr]"=r"((m).r), [mi]"=r"((m).i), \
[br]"=&r"(br__), [bi]"=r"(bi__), [tt]"=r"(tt__) \
: [ap]"r"(&(a)), [bp]"r"(&(b)) \
: "r0", "r1" \
); \
} \
while(0)
#endif /* FIXED_POINT */
#endif /* KISS_FFT_ARMv4_H */
/*Copyright (c) 2013, Xiph.Org Foundation and contributors.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.*/
#ifndef KISS_FFT_ARMv5E_H
#define KISS_FFT_ARMv5E_H
#if !defined(KISS_FFT_GUTS_H)
#error "This file should only be included from _kiss_fft_guts.h"
#endif
#ifdef FIXED_POINT
#if defined(__thumb__)||defined(__thumb2__)
#define LDRD_CONS "Q"
#else
#define LDRD_CONS "Uq"
#endif
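/* SMULWB/SMULWT return the top 32 bits of a 32x16-bit product, i.e.
   (a*b)>>16. The final SHL32(..., 1) in C_MUL and C_MULC turns that into
   the >>15 a Q15 multiply needs (losing the product's lowest bit), while
   C_MUL4 uses SHR32(..., 1) for its extra >>2 of scaling. */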
#undef C_MUL
#define C_MUL(m,a,b) \
do{ \
int mr1__; \
int mr2__; \
int mi__; \
long long aval__; \
int bval__; \
__asm__( \
"#C_MUL\n\t" \
"ldrd %[aval], %H[aval], %[ap]\n\t" \
"ldr %[bval], %[bp]\n\t" \
"smulwb %[mi], %H[aval], %[bval]\n\t" \
"smulwb %[mr1], %[aval], %[bval]\n\t" \
"smulwt %[mr2], %H[aval], %[bval]\n\t" \
"smlawt %[mi], %[aval], %[bval], %[mi]\n\t" \
: [mr1]"=r"(mr1__), [mr2]"=r"(mr2__), [mi]"=r"(mi__), \
[aval]"=&r"(aval__), [bval]"=r"(bval__) \
: [ap]LDRD_CONS(a), [bp]"m"(b) \
); \
(m).r = SHL32(SUB32(mr1__, mr2__), 1); \
(m).i = SHL32(mi__, 1); \
} \
while(0)
#undef C_MUL4
#define C_MUL4(m,a,b) \
do{ \
int mr1__; \
int mr2__; \
int mi__; \
long long aval__; \
int bval__; \
__asm__( \
"#C_MUL4\n\t" \
"ldrd %[aval], %H[aval], %[ap]\n\t" \
"ldr %[bval], %[bp]\n\t" \
"smulwb %[mi], %H[aval], %[bval]\n\t" \
"smulwb %[mr1], %[aval], %[bval]\n\t" \
"smulwt %[mr2], %H[aval], %[bval]\n\t" \
"smlawt %[mi], %[aval], %[bval], %[mi]\n\t" \
: [mr1]"=r"(mr1__), [mr2]"=r"(mr2__), [mi]"=r"(mi__), \
[aval]"=&r"(aval__), [bval]"=r"(bval__) \
: [ap]LDRD_CONS(a), [bp]"m"(b) \
); \
(m).r = SHR32(SUB32(mr1__, mr2__), 1); \
(m).i = SHR32(mi__, 1); \
} \
while(0)
#undef C_MULC
#define C_MULC(m,a,b) \
do{ \
int mr__; \
int mi1__; \
int mi2__; \
long long aval__; \
int bval__; \
__asm__( \
"#C_MULC\n\t" \
"ldrd %[aval], %H[aval], %[ap]\n\t" \
"ldr %[bval], %[bp]\n\t" \
"smulwb %[mr], %[aval], %[bval]\n\t" \
"smulwb %[mi1], %H[aval], %[bval]\n\t" \
"smulwt %[mi2], %[aval], %[bval]\n\t" \
"smlawt %[mr], %H[aval], %[bval], %[mr]\n\t" \
: [mr]"=r"(mr__), [mi1]"=r"(mi1__), [mi2]"=r"(mi2__), \
[aval]"=&r"(aval__), [bval]"=r"(bval__) \
: [ap]LDRD_CONS(a), [bp]"m"(b) \
); \
(m).r = SHL32(mr__, 1); \
(m).i = SHL32(SUB32(mi1__, mi2__), 1); \
} \
while(0)
#endif /* FIXED_POINT */
#endif /* KISS_FFT_ARMv5E_H */
/* Copyright (c) 2024 Arm Limited */
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#if !defined(MATHOPS_ARM_H)
# define MATHOPS_ARM_H
#include "armcpu.h"
#include "cpu_support.h"
#include "opus_defines.h"
# if !defined(DISABLE_FLOAT_API) && defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
#include <arm_neon.h>
static inline int32x4_t vroundf(float32x4_t x)
{
# if defined(__aarch64__) || (defined(__ARM_ARCH) && __ARM_ARCH >= 8)
return vcvtaq_s32_f32(x);
# else
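/* vcvtq_s32_f32() truncates toward zero, so round-to-nearest with ties away
   from zero (what vcvtaq_s32_f32() does above) is emulated by first adding
   0.5 with the sign of x: e.g. 1.6 -> 2.1 -> 2 and -1.6 -> -2.1 -> -2. */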
uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), vdupq_n_u32(0x80000000));
uint32x4_t bias = vdupq_n_u32(0x3F000000);
return vcvtq_s32_f32(vaddq_f32(x, vreinterpretq_f32_u32(vorrq_u32(bias, sign))));
# endif
}
void celt_float2int16_neon(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt);
# if defined(OPUS_HAVE_RTCD) && \
(defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
extern void
(*const CELT_FLOAT2INT16_IMPL[OPUS_ARCHMASK+1])(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt);
# define OVERRIDE_FLOAT2INT16 (1)
# define celt_float2int16(in, out, cnt, arch) \
((*CELT_FLOAT2INT16_IMPL[(arch)&OPUS_ARCHMASK])(in, out, cnt))
# elif defined(OPUS_ARM_PRESUME_NEON_INTR)
# define OVERRIDE_FLOAT2INT16 (1)
# define celt_float2int16(in, out, cnt, arch) ((void)(arch), celt_float2int16_neon(in, out, cnt))
# endif
# endif
#endif /* MATHOPS_ARM_H */
/* Copyright (c) 2015 Xiph.Org Foundation
Written by Viswanath Puttagunta */
/**
@file mdct_arm.h
@brief ARM Neon Intrinsic optimizations for mdct using NE10 library
*/
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#if !defined(MDCT_ARM_H)
#define MDCT_ARM_H
#include "mdct.h"
#if defined(HAVE_ARM_NE10)
/** Compute a forward MDCT and scale by 4/N, trashes the input array */
void clt_mdct_forward_neon(const mdct_lookup *l, kiss_fft_scalar *in,
kiss_fft_scalar * OPUS_RESTRICT out,
const opus_val16 *window, int overlap,
int shift, int stride, int arch);
void clt_mdct_backward_neon(const mdct_lookup *l, kiss_fft_scalar *in,
kiss_fft_scalar * OPUS_RESTRICT out,
const opus_val16 *window, int overlap,
int shift, int stride, int arch);
#if !defined(OPUS_HAVE_RTCD)
#define OVERRIDE_OPUS_MDCT (1)
#define clt_mdct_forward(_l, _in, _out, _window, _int, _shift, _stride, _arch) \
clt_mdct_forward_neon(_l, _in, _out, _window, _int, _shift, _stride, _arch)
#define clt_mdct_backward(_l, _in, _out, _window, _int, _shift, _stride, _arch) \
clt_mdct_backward_neon(_l, _in, _out, _window, _int, _shift, _stride, _arch)
#endif /* OPUS_HAVE_RTCD */
#endif /* HAVE_ARM_NE10 */
#endif