diff --git a/celt/arch.h b/celt/arch.h index 8c79a66e11558a042a5e9e148c723a34301e13b3..c910c807fb777828a69ad4649e80e4fb7575d24b 100644 --- a/celt/arch.h +++ b/celt/arch.h @@ -208,6 +208,7 @@ static OPUS_INLINE int celt_isnan(float x) #define MULT32_32_Q31(a,b) ((a)*(b)) #define MAC16_32_Q15(c,a,b) ((c)+(a)*(b)) +#define MAC16_32_Q16(c,a,b) ((c)+(a)*(b)) #define MULT16_16_Q11_32(a,b) ((a)*(b)) #define MULT16_16_Q11(a,b) ((a)*(b)) diff --git a/celt/arm/fixed_armv4.h b/celt/arm/fixed_armv4.h index b690bc8ceae8e56a54dff2053d4721b6f2aea2d5..efb3b1896a866b1d5ec861fedbb334575a464c4d 100644 --- a/celt/arm/fixed_armv4.h +++ b/celt/arm/fixed_armv4.h @@ -68,6 +68,10 @@ static OPUS_INLINE opus_val32 MULT16_32_Q15_armv4(opus_val16 a, opus_val32 b) #undef MAC16_32_Q15 #define MAC16_32_Q15(c, a, b) ADD32(c, MULT16_32_Q15(a, b)) +/** 16x32 multiply, followed by a 16-bit shift right and 32-bit add. + Result fits in 32 bits. */ +#undef MAC16_32_Q16 +#define MAC16_32_Q16(c, a, b) ADD32(c, MULT16_32_Q16(a, b)) /** 32x32 multiplication, followed by a 31-bit shift right. Results fits in 32 bits */ #undef MULT32_32_Q31 diff --git a/celt/arm/fixed_armv5e.h b/celt/arm/fixed_armv5e.h index 1194a7d3ecb61ae9ac20ec530ec2e1eacc6a519a..36d6bed0e27a703264c964201e983d84ecbd90a6 100644 --- a/celt/arm/fixed_armv5e.h +++ b/celt/arm/fixed_armv5e.h @@ -82,6 +82,23 @@ static OPUS_INLINE opus_val32 MAC16_32_Q15_armv5e(opus_val32 c, opus_val16 a, } #define MAC16_32_Q15(c, a, b) (MAC16_32_Q15_armv5e(c, a, b)) +/** 16x32 multiply, followed by a 16-bit shift right and 32-bit add. + Result fits in 32 bits. */ +#undef MAC16_32_Q16 +static OPUS_INLINE opus_val32 MAC16_32_Q16_armv5e(opus_val32 c, opus_val16 a, + opus_val32 b) +{ + int res; + __asm__( + "#MAC16_32_Q16\n\t" + "smlawb %0, %1, %2, %3;\n" + : "=r"(res) + : "r"(b), "r"(a), "r"(c) + ); + return res; +} +#define MAC16_32_Q16(c, a, b) (MAC16_32_Q16_armv5e(c, a, b)) + /** 16x16 multiply-add where the result fits in 32 bits */ #undef MAC16_16 static OPUS_INLINE opus_val32 MAC16_16_armv5e(opus_val32 c, opus_val16 a, diff --git a/celt/celt.c b/celt/celt.c index 4f9c9e051457e994eb41ba16821f83c8d6d9102e..5741b6b546a9f294dccf446c7c34ed9c460142da 100644 --- a/celt/celt.c +++ b/celt/celt.c @@ -86,6 +86,33 @@ int resampling_factor(opus_int32 rate) } #ifndef OVERRIDE_COMB_FILTER_CONST +/* This version should be faster on ARM */ +#ifdef OPUS_ARM_ASM +static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N, + opus_val16 g10, opus_val16 g11, opus_val16 g12) +{ + opus_val32 x0, x1, x2, x3, x4; + int i; + x4 = SHL32(x[-T-2], 1); + x3 = SHL32(x[-T-1], 1); + x2 = SHL32(x[-T], 1); + x1 = SHL32(x[-T+1], 1); + for (i=0;i<N;i++) + { + opus_val32 t; + x0=SHL32(x[i-T+2],1); + t = MAC16_32_Q16(x[i], g10, x2); + t = MAC16_32_Q16(t, g11, ADD32(x1,x3)); + t = MAC16_32_Q16(t, g12, ADD32(x0,x4)); + y[i] = t; + x4=x3; + x3=x2; + x2=x1; + x1=x0; + } + +} +#else static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N, opus_val16 g10, opus_val16 g11, opus_val16 g12) { @@ -110,6 +137,7 @@ static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N, } #endif +#endif void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N, opus_val16 g0, opus_val16 g1, int tapset0, int tapset1, diff --git a/celt/fixed_debug.h b/celt/fixed_debug.h index 80bc94910fa3e51fe762847734d7bd60ab0cb5af..5d7120d4b1e3a8eebfb82ad2e0283ab4e314f48f 100644 --- a/celt/fixed_debug.h +++ b/celt/fixed_debug.h @@ -496,6 +496,7 @@ static OPUS_INLINE int MULT16_32_PX_(int a, opus_int64 b, int Q, char *file, int #define MULT16_32_Q15(a,b) MULT16_32_QX(a,b,15) #define MAC16_32_Q15(c,a,b) (celt_mips-=2,ADD32((c),MULT16_32_Q15((a),(b)))) +#define MAC16_32_Q16(c,a,b) (celt_mips-=2,ADD32((c),MULT16_32_Q16((a),(b)))) static OPUS_INLINE int SATURATE(int a, int b) { diff --git a/celt/fixed_generic.h b/celt/fixed_generic.h index ecf018a2443a200ecb55afd3445ac499afd75c80..8d13fde7a143f91b7591ec2c1089b0cd2d51baca 100644 --- a/celt/fixed_generic.h +++ b/celt/fixed_generic.h @@ -113,7 +113,11 @@ /** 16x32 multiply, followed by a 15-bit shift right and 32-bit add. b must fit in 31 bits. Result fits in 32 bits. */ -#define MAC16_32_Q15(c,a,b) ADD32(c,ADD32(MULT16_16((a),SHR((b),15)), SHR(MULT16_16((a),((b)&0x00007fff)),15))) +#define MAC16_32_Q15(c,a,b) ADD32((c),ADD32(MULT16_16((a),SHR((b),15)), SHR(MULT16_16((a),((b)&0x00007fff)),15))) + +/** 16x32 multiplication, followed by a 16-bit shift right and 32-bit add. + Results fits in 32 bits */ +#define MAC16_32_Q16(c,a,b) ADD32((c),ADD32(MULT16_16((a),SHR((b),16)), SHR(MULT16_16SU((a),((b)&0x0000ffff)),16))) #define MULT16_16_Q11_32(a,b) (SHR(MULT16_16((a),(b)),11)) #define MULT16_16_Q11(a,b) (SHR(MULT16_16((a),(b)),11))