From 48923ae9966c4c065c7cea776944155ca0617f34 Mon Sep 17 00:00:00 2001 From: Jean-Marc Valin <jean-marc.valin@octasic.com> Date: Fri, 23 Jul 2010 17:28:50 -0400 Subject: [PATCH] Cleanup, de-inlining some math functions --- libcelt/Makefile.am | 2 +- libcelt/bands.c | 16 ++++ libcelt/mathops.c | 179 ++++++++++++++++++++++++++++++++++++ libcelt/mathops.h | 211 +------------------------------------------ tests/dft-test.c | 1 + tests/mathops-test.c | 21 +---- tests/mdct-test.c | 1 + 7 files changed, 204 insertions(+), 227 deletions(-) create mode 100644 libcelt/mathops.c diff --git a/libcelt/Makefile.am b/libcelt/Makefile.am index 086fe1118..8d166cbae 100644 --- a/libcelt/Makefile.am +++ b/libcelt/Makefile.am @@ -15,7 +15,7 @@ lib_LTLIBRARIES = libcelt@LIBCELT_SUFFIX@.la # Sources for compilation in the library libcelt@LIBCELT_SUFFIX@_la_SOURCES = bands.c celt.c cwrs.c ecintrin.h entcode.c \ - entdec.c entenc.c header.c kiss_fft.c laplace.c mdct.c \ + entdec.c entenc.c header.c kiss_fft.c laplace.c mathops.c mdct.c \ modes.c pitch.c plc.c quant_bands.c rangedec.c rangeenc.c rate.c \ vq.c diff --git a/libcelt/bands.c b/libcelt/bands.c index 5deca8ba4..d154ac85e 100644 --- a/libcelt/bands.c +++ b/libcelt/bands.c @@ -45,6 +45,22 @@ #include "mathops.h" #include "rate.h" +/* This is a cos() approximation designed to be bit-exact on any platform. Bit exactness + with this approximation is important because it has an impact on the bit allocation */ +static celt_int16 bitexact_cos(celt_int16 x) +{ + celt_int32 tmp; + celt_int16 x2; + tmp = (4096+((celt_int32)(x)*(x)))>>13; + if (tmp > 32767) + tmp = 32767; + x2 = tmp; + x2 = (32767-x2) + FRAC_MUL16(x2, (-7651 + FRAC_MUL16(x2, (8277 + FRAC_MUL16(-626, x2))))); + if (x2 > 32766) + x2 = 32766; + return 1+x2; +} + #ifdef FIXED_POINT /* Compute the amplitude (sqrt energy) in each of the bands */ diff --git a/libcelt/mathops.c b/libcelt/mathops.c new file mode 100644 index 000000000..2a9ab5e78 --- /dev/null +++ b/libcelt/mathops.c @@ -0,0 +1,179 @@ +/* Copyright (c) 2002-2008 Jean-Marc Valin + Copyright (c) 2007-2008 CSIRO + Copyright (c) 2007-2009 Xiph.Org Foundation + Written by Jean-Marc Valin */ +/** + @file mathops.h + @brief Various math functions +*/ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + - Neither the name of the Xiph.org Foundation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "mathops.h" + +#ifdef FIXED_POINT + +celt_word32 frac_div32(celt_word32 a, celt_word32 b) +{ + celt_word16 rcp; + celt_word32 result, rem; + int shift = 30-celt_ilog2(b); + a = SHL32(a,shift); + b = SHL32(b,shift); + + /* 16-bit reciprocal */ + rcp = ROUND16(celt_rcp(ROUND16(b,16)),2); + result = SHL32(MULT16_32_Q15(rcp, a),1); + rem = a-MULT32_32_Q31(result, b); + result += SHL32(MULT16_32_Q15(rcp, rem),1); + return result; +} + +/** Reciprocal sqrt approximation in the range [0.25,1) (Q16 in, Q14 out) */ +celt_word16 celt_rsqrt_norm(celt_word32 x) +{ + celt_word16 n; + celt_word16 r; + celt_word16 r2; + celt_word16 y; + /* Range of n is [-16384,32767] ([-0.5,1) in Q15). */ + n = x-32768; + /* Get a rough initial guess for the root. + The optimal minimax quadratic approximation (using relative error) is + r = 1.437799046117536+n*(-0.823394375837328+n*0.4096419668459485). + Coefficients here, and the final result r, are Q14.*/ + r = ADD16(23557, MULT16_16_Q15(n, ADD16(-13490, MULT16_16_Q15(n, 6713)))); + /* We want y = x*r*r-1 in Q15, but x is 32-bit Q16 and r is Q14. + We can compute the result from n and r using Q15 multiplies with some + adjustment, carefully done to avoid overflow. + Range of y is [-1564,1594]. */ + r2 = MULT16_16_Q15(r, r); + y = SHL16(SUB16(ADD16(MULT16_16_Q15(r2, n), r2), 16384), 1); + /* Apply a 2nd-order Householder iteration: r += r*y*(y*0.375-0.5). + This yields the Q14 reciprocal square root of the Q16 x, with a maximum + relative error of 1.04956E-4, a (relative) RMSE of 2.80979E-5, and a + peak absolute error of 2.26591/16384. */ + return ADD16(r, MULT16_16_Q15(r, MULT16_16_Q15(y, + SUB16(MULT16_16_Q15(y, 12288), 16384)))); +} + +/** Sqrt approximation (QX input, QX/2 output) */ +celt_word32 celt_sqrt(celt_word32 x) +{ + int k; + celt_word16 n; + celt_word32 rt; + static const celt_word16 C[5] = {23175, 11561, -3011, 1699, -664}; + if (x==0) + return 0; + k = (celt_ilog2(x)>>1)-7; + x = VSHR32(x, (k<<1)); + n = x-32768; + rt = ADD16(C[0], MULT16_16_Q15(n, ADD16(C[1], MULT16_16_Q15(n, ADD16(C[2], + MULT16_16_Q15(n, ADD16(C[3], MULT16_16_Q15(n, (C[4]))))))))); + rt = VSHR32(rt,7-k); + return rt; +} + +#define L1 32767 +#define L2 -7651 +#define L3 8277 +#define L4 -626 + +static inline celt_word16 _celt_cos_pi_2(celt_word16 x) +{ + celt_word16 x2; + + x2 = MULT16_16_P15(x,x); + return ADD16(1,MIN16(32766,ADD32(SUB16(L1,x2), MULT16_16_P15(x2, ADD32(L2, MULT16_16_P15(x2, ADD32(L3, MULT16_16_P15(L4, x2 + )))))))); +} + +#undef L1 +#undef L2 +#undef L3 +#undef L4 + +celt_word16 celt_cos_norm(celt_word32 x) +{ + x = x&0x0001ffff; + if (x>SHL32(EXTEND32(1), 16)) + x = SUB32(SHL32(EXTEND32(1), 17),x); + if (x&0x00007fff) + { + if (x<SHL32(EXTEND32(1), 15)) + { + return _celt_cos_pi_2(EXTRACT16(x)); + } else { + return NEG32(_celt_cos_pi_2(EXTRACT16(65536-x))); + } + } else { + if (x&0x0000ffff) + return 0; + else if (x&0x0001ffff) + return -32767; + else + return 32767; + } +} + +/** Reciprocal approximation (Q15 input, Q16 output) */ +celt_word32 celt_rcp(celt_word32 x) +{ + int i; + celt_word16 n; + celt_word16 r; + celt_assert2(x>0, "celt_rcp() only defined for positive values"); + i = celt_ilog2(x); + /* n is Q15 with range [0,1). */ + n = VSHR32(x,i-15)-32768; + /* Start with a linear approximation: + r = 1.8823529411764706-0.9411764705882353*n. + The coefficients and the result are Q14 in the range [15420,30840].*/ + r = ADD16(30840, MULT16_16_Q15(-15420, n)); + /* Perform two Newton iterations: + r -= r*((r*n)-1.Q15) + = r*((r*n)+(r-1.Q15)). */ + r = SUB16(r, MULT16_16_Q15(r, + ADD16(MULT16_16_Q15(r, n), ADD16(r, -32768)))); + /* We subtract an extra 1 in the second iteration to avoid overflow; it also + neatly compensates for truncation error in the rest of the process. */ + r = SUB16(r, ADD16(1, MULT16_16_Q15(r, + ADD16(MULT16_16_Q15(r, n), ADD16(r, -32768))))); + /* r is now the Q15 solution to 2/(n+1), with a maximum relative error + of 7.05346E-5, a (relative) RMSE of 2.14418E-5, and a peak absolute + error of 1.24665/32768. */ + return VSHR32(EXTEND32(r),i-16); +} + +#endif diff --git a/libcelt/mathops.h b/libcelt/mathops.h index 31778a13d..224857df8 100644 --- a/libcelt/mathops.h +++ b/libcelt/mathops.h @@ -42,66 +42,13 @@ #include "entcode.h" #include "os_support.h" - - -#ifndef OVERRIDE_FIND_MAX16 -static inline int find_max16(celt_word16 *x, int len) -{ - celt_word16 max_corr=-VERY_LARGE16; - int i, id = 0; - for (i=0;i<len;i++) - { - if (x[i] > max_corr) - { - id = i; - max_corr = x[i]; - } - } - return id; -} -#endif - -#ifndef OVERRIDE_FIND_MAX32 -static inline int find_max32(celt_word32 *x, int len) -{ - celt_word32 max_corr=-VERY_LARGE32; - int i, id = 0; - for (i=0;i<len;i++) - { - if (x[i] > max_corr) - { - id = i; - max_corr = x[i]; - } - } - return id; -} -#endif - /* Multiplies two 16-bit fractional values. Bit-exactness of this macro is important */ #define FRAC_MUL16(a,b) ((16384+((celt_int32)(celt_int16)(a)*(celt_int16)(b)))>>15) -/* This is a cos() approximation designed to be bit-exact on any platform. Bit exactness - with this approximation is important because it has an impact on the bit allocation */ -static inline celt_int16 bitexact_cos(celt_int16 x) -{ - celt_int32 tmp; - celt_int16 x2; - tmp = (4096+((celt_int32)(x)*(x)))>>13; - if (tmp > 32767) - tmp = 32767; - x2 = tmp; - x2 = (32767-x2) + FRAC_MUL16(x2, (-7651 + FRAC_MUL16(x2, (8277 + FRAC_MUL16(-626, x2))))); - if (x2 > 32766) - x2 = 32766; - return 1+x2; -} - #ifndef FIXED_POINT #define celt_sqrt(x) ((float)sqrt(x)) -#define celt_psqrt(x) ((float)sqrt(x)) #define celt_rsqrt(x) (1.f/celt_sqrt(x)) #define celt_rsqrt_norm(x) (celt_rsqrt(x)) #define celt_acos acos @@ -195,119 +142,12 @@ static inline celt_int16 celt_zlog2(celt_word32 x) return x <= 0 ? 0 : celt_ilog2(x); } -/** Reciprocal sqrt approximation in the range [0.25,1) (Q16 in, Q14 out) */ -static inline celt_word16 celt_rsqrt_norm(celt_word32 x) -{ - celt_word16 n; - celt_word16 r; - celt_word16 r2; - celt_word16 y; - /* Range of n is [-16384,32767] ([-0.5,1) in Q15). */ - n = x-32768; - /* Get a rough initial guess for the root. - The optimal minimax quadratic approximation (using relative error) is - r = 1.437799046117536+n*(-0.823394375837328+n*0.4096419668459485). - Coefficients here, and the final result r, are Q14.*/ - r = ADD16(23557, MULT16_16_Q15(n, ADD16(-13490, MULT16_16_Q15(n, 6713)))); - /* We want y = x*r*r-1 in Q15, but x is 32-bit Q16 and r is Q14. - We can compute the result from n and r using Q15 multiplies with some - adjustment, carefully done to avoid overflow. - Range of y is [-1564,1594]. */ - r2 = MULT16_16_Q15(r, r); - y = SHL16(SUB16(ADD16(MULT16_16_Q15(r2, n), r2), 16384), 1); - /* Apply a 2nd-order Householder iteration: r += r*y*(y*0.375-0.5). - This yields the Q14 reciprocal square root of the Q16 x, with a maximum - relative error of 1.04956E-4, a (relative) RMSE of 2.80979E-5, and a - peak absolute error of 2.26591/16384. */ - return ADD16(r, MULT16_16_Q15(r, MULT16_16_Q15(y, - SUB16(MULT16_16_Q15(y, 12288), 16384)))); -} +celt_word16 celt_rsqrt_norm(celt_word32 x); -/** Reciprocal sqrt approximation (Q30 input, Q0 output or equivalent) */ -static inline celt_word32 celt_rsqrt(celt_word32 x) -{ - int k; - k = celt_ilog2(x)>>1; - x = VSHR32(x, (k-7)<<1); - return PSHR32(celt_rsqrt_norm(x), k); -} +celt_word32 celt_sqrt(celt_word32 x); -/** Sqrt approximation (QX input, QX/2 output) */ -static inline celt_word32 celt_sqrt(celt_word32 x) -{ - int k; - celt_word16 n; - celt_word32 rt; - static const celt_word16 C[5] = {23175, 11561, -3011, 1699, -664}; - if (x==0) - return 0; - k = (celt_ilog2(x)>>1)-7; - x = VSHR32(x, (k<<1)); - n = x-32768; - rt = ADD16(C[0], MULT16_16_Q15(n, ADD16(C[1], MULT16_16_Q15(n, ADD16(C[2], - MULT16_16_Q15(n, ADD16(C[3], MULT16_16_Q15(n, (C[4]))))))))); - rt = VSHR32(rt,7-k); - return rt; -} - -/** Sqrt approximation (QX input, QX/2 output) that assumes that the input is - strictly positive */ -static inline celt_word32 celt_psqrt(celt_word32 x) -{ - int k; - celt_word16 n; - celt_word32 rt; - static const celt_word16 C[5] = {23175, 11561, -3011, 1699, -664}; - k = (celt_ilog2(x)>>1)-7; - x = VSHR32(x, (k<<1)); - n = x-32768; - rt = ADD16(C[0], MULT16_16_Q15(n, ADD16(C[1], MULT16_16_Q15(n, ADD16(C[2], - MULT16_16_Q15(n, ADD16(C[3], MULT16_16_Q15(n, (C[4]))))))))); - rt = VSHR32(rt,7-k); - return rt; -} +celt_word16 celt_cos_norm(celt_word32 x); -#define L1 32767 -#define L2 -7651 -#define L3 8277 -#define L4 -626 - -static inline celt_word16 _celt_cos_pi_2(celt_word16 x) -{ - celt_word16 x2; - - x2 = MULT16_16_P15(x,x); - return ADD16(1,MIN16(32766,ADD32(SUB16(L1,x2), MULT16_16_P15(x2, ADD32(L2, MULT16_16_P15(x2, ADD32(L3, MULT16_16_P15(L4, x2 - )))))))); -} - -#undef L1 -#undef L2 -#undef L3 -#undef L4 - -static inline celt_word16 celt_cos_norm(celt_word32 x) -{ - x = x&0x0001ffff; - if (x>SHL32(EXTEND32(1), 16)) - x = SUB32(SHL32(EXTEND32(1), 17),x); - if (x&0x00007fff) - { - if (x<SHL32(EXTEND32(1), 15)) - { - return _celt_cos_pi_2(EXTRACT16(x)); - } else { - return NEG32(_celt_cos_pi_2(EXTRACT16(65536-x))); - } - } else { - if (x&0x0000ffff) - return 0; - else if (x&0x0001ffff) - return -32767; - else - return 32767; - } -} static inline celt_word16 celt_log2(celt_word32 x) { @@ -349,52 +189,11 @@ static inline celt_word32 celt_exp2(celt_word16 x) return VSHR32(EXTEND32(frac), -integer-2); } -/** Reciprocal approximation (Q15 input, Q16 output) */ -static inline celt_word32 celt_rcp(celt_word32 x) -{ - int i; - celt_word16 n; - celt_word16 r; - celt_assert2(x>0, "celt_rcp() only defined for positive values"); - i = celt_ilog2(x); - /* n is Q15 with range [0,1). */ - n = VSHR32(x,i-15)-32768; - /* Start with a linear approximation: - r = 1.8823529411764706-0.9411764705882353*n. - The coefficients and the result are Q14 in the range [15420,30840].*/ - r = ADD16(30840, MULT16_16_Q15(-15420, n)); - /* Perform two Newton iterations: - r -= r*((r*n)-1.Q15) - = r*((r*n)+(r-1.Q15)). */ - r = SUB16(r, MULT16_16_Q15(r, - ADD16(MULT16_16_Q15(r, n), ADD16(r, -32768)))); - /* We subtract an extra 1 in the second iteration to avoid overflow; it also - neatly compensates for truncation error in the rest of the process. */ - r = SUB16(r, ADD16(1, MULT16_16_Q15(r, - ADD16(MULT16_16_Q15(r, n), ADD16(r, -32768))))); - /* r is now the Q15 solution to 2/(n+1), with a maximum relative error - of 7.05346E-5, a (relative) RMSE of 2.14418E-5, and a peak absolute - error of 1.24665/32768. */ - return VSHR32(EXTEND32(r),i-16); -} +celt_word32 celt_rcp(celt_word32 x); #define celt_div(a,b) MULT32_32_Q31((celt_word32)(a),celt_rcp(b)) -static inline celt_word32 frac_div32(celt_word32 a, celt_word32 b) -{ - celt_word16 rcp; - celt_word32 result, rem; - int shift = 30-celt_ilog2(b); - a = SHL32(a,shift); - b = SHL32(b,shift); - - /* 16-bit reciprocal */ - rcp = ROUND16(celt_rcp(ROUND16(b,16)),2); - result = SHL32(MULT16_32_Q15(rcp, a),1); - rem = a-MULT32_32_Q31(result, b); - result += SHL32(MULT16_32_Q15(rcp, rem),1); - return result; -} +celt_word32 frac_div32(celt_word32 a, celt_word32 b); #define M1 32767 #define M2 -21 diff --git a/tests/dft-test.c b/tests/dft-test.c index 8bad32876..8f3f1d7a0 100644 --- a/tests/dft-test.c +++ b/tests/dft-test.c @@ -8,6 +8,7 @@ #define CELT_C #include "../libcelt/stack_alloc.h" #include "../libcelt/kiss_fft.c" +#include "../libcelt/mathops.c" #include "../libcelt/entcode.c" diff --git a/tests/mathops-test.c b/tests/mathops-test.c index ca1ef344b..5c9bba767 100644 --- a/tests/mathops-test.c +++ b/tests/mathops-test.c @@ -2,7 +2,7 @@ #include "config.h" #endif -#include "mathops.h" +#include "mathops.c" #include <stdio.h> #include <math.h> @@ -56,24 +56,6 @@ void testsqrt(void) } } -void testrsqrt(void) -{ - celt_int32 i; - for (i=1;i<=2000000;i++) - { - double ratio; - celt_word16 val; - val = celt_rsqrt(i); - ratio = val*sqrt(i)/Q15ONE; - if (fabs(ratio - 1) > .05) - { - fprintf (stderr, "rsqrt failed: rsqrt(%d)="WORD" (ratio = %f)\n", i, val, ratio); - ret = 1; - } - i+= i>>10; - } -} - #ifndef FIXED_POINT void testlog2(void) { @@ -179,7 +161,6 @@ int main(void) { testdiv(); testsqrt(); - testrsqrt(); testlog2(); testexp2(); testexp2log2(); diff --git a/tests/mdct-test.c b/tests/mdct-test.c index ed8d98b76..63dd4e746 100644 --- a/tests/mdct-test.c +++ b/tests/mdct-test.c @@ -9,6 +9,7 @@ #include "../libcelt/kiss_fft.c" #include "../libcelt/mdct.c" +#include "../libcelt/mathops.c" #ifndef M_PI #define M_PI 3.141592653 -- GitLab