From 43120f00758219a784f952754f33b9788a8d731b Mon Sep 17 00:00:00 2001 From: Jonathan Lennox <jonathan@vidyo.com> Date: Mon, 3 Aug 2015 17:04:27 -0400 Subject: [PATCH] Reorganize x86 SSE intrinsics code. Enable x86 intrinsics when building in floating-point mode. Support SSE as an arch value. Use RTCD to conditionally enable existing floating-point Celt SSE code. Call functions directly (without RTCD) when their architecture can be presumed. Use SSE4.1 intrinsics optimized code for Silk even in floating-point mode. --- Makefile.am | 3 + celt/bands.c | 6 +- celt/celt.c | 16 ++- celt/celt.h | 12 +- celt/celt_decoder.c | 6 +- celt/celt_encoder.c | 4 +- celt/celt_lpc.h | 2 +- celt/cpu_support.h | 12 +- celt/mips/celt_mipsr1.h | 2 +- celt/pitch.c | 4 +- celt/pitch.h | 17 ++- celt/x86/celt_lpc_sse.c | 4 + celt/x86/celt_lpc_sse.h | 12 +- celt/x86/pitch_sse.c | 148 +++++++++++++++++++++ celt/x86/pitch_sse.h | 261 ++++++++++++++++--------------------- celt/x86/x86_celt_map.c | 76 ++++++++++- celt/x86/x86cpu.c | 16 +++ celt/x86/x86cpu.h | 6 + configure.ac | 8 -- silk/x86/SigProc_FIX_sse.h | 17 +++ silk/x86/main_sse.h | 48 +++++++ silk/x86/x86_silk_map.c | 25 +++- 22 files changed, 503 insertions(+), 202 deletions(-) diff --git a/Makefile.am b/Makefile.am index efff3371..2758f459 100644 --- a/Makefile.am +++ b/Makefile.am @@ -23,6 +23,9 @@ SILK_SOURCES += $(SILK_SOURCES_SSE4_1) $(SILK_SOURCES_FIXED_SSE4_1) endif else SILK_SOURCES += $(SILK_SOURCES_FLOAT) +if HAVE_SSE4_1 +SILK_SOURCES += $(SILK_SOURCES_SSE4_1) +endif endif if DISABLE_FLOAT_API diff --git a/celt/bands.c b/celt/bands.c index c643b093..25f229e2 100644 --- a/celt/bands.c +++ b/celt/bands.c @@ -398,7 +398,7 @@ static void stereo_split(celt_norm * OPUS_RESTRICT X, celt_norm * OPUS_RESTRICT } } -static void stereo_merge(celt_norm * OPUS_RESTRICT X, celt_norm * OPUS_RESTRICT Y, opus_val16 mid, int N) +static void stereo_merge(celt_norm * OPUS_RESTRICT X, celt_norm * OPUS_RESTRICT Y, opus_val16 mid, int N, int arch) { int j; opus_val32 xp=0, side=0; @@ -410,7 +410,7 @@ static void stereo_merge(celt_norm * OPUS_RESTRICT X, celt_norm * OPUS_RESTRICT opus_val32 t, lgain, rgain; /* Compute the norm of X+Y and X-Y as |X|^2 + |Y|^2 +/- sum(xy) */ - dual_inner_prod(Y, X, Y, N, &xp, &side); + dual_inner_prod(Y, X, Y, N, &xp, &side, arch); /* Compensating for the mid normalization */ xp = MULT16_32_Q15(mid, xp); /* mid and side are in Q15, not Q14 like X and Y */ @@ -1348,7 +1348,7 @@ static unsigned quant_band_stereo(struct band_ctx *ctx, celt_norm *X, celt_norm if (resynth) { if (N!=2) - stereo_merge(X, Y, mid, N); + stereo_merge(X, Y, mid, N, ctx->arch); if (inv) { int j; diff --git a/celt/celt.c b/celt/celt.c index a610de4f..40c62ce5 100644 --- a/celt/celt.c +++ b/celt/celt.c @@ -89,10 +89,12 @@ int resampling_factor(opus_int32 rate) return ret; } -#ifndef OVERRIDE_COMB_FILTER_CONST /* This version should be faster on ARM */ #ifdef OPUS_ARM_ASM -static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N, +#ifndef NON_STATIC_COMB_FILTER_CONST_C +static +#endif +void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N, opus_val16 g10, opus_val16 g11, opus_val16 g12) { opus_val32 x0, x1, x2, x3, x4; @@ -147,7 +149,10 @@ static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N, #endif } #else -static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N, +#ifndef NON_STATIC_COMB_FILTER_CONST_C +static +#endif +void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N, opus_val16 g10, opus_val16 g11, opus_val16 g12) { opus_val32 x0, x1, x2, x3, x4; @@ -171,12 +176,11 @@ static void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N, } #endif -#endif #ifndef OVERRIDE_comb_filter void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N, opus_val16 g0, opus_val16 g1, int tapset0, int tapset1, - const opus_val16 *window, int overlap) + const opus_val16 *window, int overlap, int arch) { int i; /* printf ("%d %d %f %f\n", T0, T1, g0, g1); */ @@ -234,7 +238,7 @@ void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N, } /* Compute the part with the constant filter. */ - comb_filter_const(y+i, x+i, T1, N-i, g10, g11, g12); + comb_filter_const(y+i, x+i, T1, N-i, g10, g11, g12, arch); } #endif /* OVERRIDE_comb_filter */ diff --git a/celt/celt.h b/celt/celt.h index b1967516..a423b950 100644 --- a/celt/celt.h +++ b/celt/celt.h @@ -201,7 +201,17 @@ void celt_preemphasis(const opus_val16 * OPUS_RESTRICT pcmp, celt_sig * OPUS_RES void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N, opus_val16 g0, opus_val16 g1, int tapset0, int tapset1, - const opus_val16 *window, int overlap); + const opus_val16 *window, int overlap, int arch); + +#ifdef NON_STATIC_COMB_FILTER_CONST_C +void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N, + opus_val16 g10, opus_val16 g11, opus_val16 g12); +#endif + +#ifndef OVERRIDE_COMB_FILTER_CONST +# define comb_filter_const(y, x, T, N, g10, g11, g12, arch) \ + ((void)(arch),comb_filter_const_c(y, x, T, N, g10, g11, g12)) +#endif void init_caps(const CELTMode *m,int *cap,int LM,int C); diff --git a/celt/celt_decoder.c b/celt/celt_decoder.c index 4304a3e8..3f4be806 100644 --- a/celt/celt_decoder.c +++ b/celt/celt_decoder.c @@ -698,7 +698,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM) comb_filter(etmp, buf+DECODE_BUFFER_SIZE, st->postfilter_period, st->postfilter_period, overlap, -st->postfilter_gain, -st->postfilter_gain, - st->postfilter_tapset, st->postfilter_tapset, NULL, 0); + st->postfilter_tapset, st->postfilter_tapset, NULL, 0, st->arch); /* Simulate TDAC on the concealed audio so that it blends with the MDCT of the next frame. */ @@ -1009,11 +1009,11 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat st->postfilter_period_old=IMAX(st->postfilter_period_old, COMBFILTER_MINPERIOD); comb_filter(out_syn[c], out_syn[c], st->postfilter_period_old, st->postfilter_period, mode->shortMdctSize, st->postfilter_gain_old, st->postfilter_gain, st->postfilter_tapset_old, st->postfilter_tapset, - mode->window, overlap); + mode->window, overlap, st->arch); if (LM!=0) comb_filter(out_syn[c]+mode->shortMdctSize, out_syn[c]+mode->shortMdctSize, st->postfilter_period, postfilter_pitch, N-mode->shortMdctSize, st->postfilter_gain, postfilter_gain, st->postfilter_tapset, postfilter_tapset, - mode->window, overlap); + mode->window, overlap, st->arch); } while (++c<CC); st->postfilter_period_old = st->postfilter_period; diff --git a/celt/celt_encoder.c b/celt/celt_encoder.c index 86a3fbb5..07e70711 100644 --- a/celt/celt_encoder.c +++ b/celt/celt_encoder.c @@ -1163,11 +1163,11 @@ static int run_prefilter(CELTEncoder *st, celt_sig *in, celt_sig *prefilter_mem, if (offset) comb_filter(in+c*(N+overlap)+overlap, pre[c]+COMBFILTER_MAXPERIOD, st->prefilter_period, st->prefilter_period, offset, -st->prefilter_gain, -st->prefilter_gain, - st->prefilter_tapset, st->prefilter_tapset, NULL, 0); + st->prefilter_tapset, st->prefilter_tapset, NULL, 0, st->arch); comb_filter(in+c*(N+overlap)+overlap+offset, pre[c]+COMBFILTER_MAXPERIOD+offset, st->prefilter_period, pitch_index, N-offset, -st->prefilter_gain, -gain1, - st->prefilter_tapset, prefilter_tapset, mode->window, overlap); + st->prefilter_tapset, prefilter_tapset, mode->window, overlap, st->arch); OPUS_COPY(st->in_mem+c*(overlap), in+c*(N+overlap)+N, overlap); if (N>COMBFILTER_MAXPERIOD) diff --git a/celt/celt_lpc.h b/celt/celt_lpc.h index dc8967f0..323459eb 100644 --- a/celt/celt_lpc.h +++ b/celt/celt_lpc.h @@ -48,7 +48,7 @@ void celt_fir_c( opus_val16 *mem, int arch); -#if !defined(OPUS_X86_MAY_HAVE_SSE4_1) +#if !defined(OVERRIDE_CELT_FIR) #define celt_fir(x, num, y, N, ord, mem, arch) \ (celt_fir_c(x, num, y, N, ord, mem, arch)) #endif diff --git a/celt/cpu_support.h b/celt/cpu_support.h index 590cf2e0..db1cb588 100644 --- a/celt/cpu_support.h +++ b/celt/cpu_support.h @@ -43,14 +43,16 @@ */ #define OPUS_ARCHMASK 3 -#elif (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2) || (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1))) +#elif (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \ + (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \ + (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) #include "x86/x86cpu.h" -/* We currently support 3 x86 variants: +/* We currently support 4 x86 variants: * arch[0] -> non-sse - * arch[1] -> sse2 - * arch[2] -> sse4.1 - * arch[3] -> NULL + * arch[1] -> sse + * arch[2] -> sse2 + * arch[3] -> sse4.1 */ #define OPUS_ARCHMASK 3 int opus_select_arch(void); diff --git a/celt/mips/celt_mipsr1.h b/celt/mips/celt_mipsr1.h index 03915d88..7915d596 100644 --- a/celt/mips/celt_mipsr1.h +++ b/celt/mips/celt_mipsr1.h @@ -56,7 +56,7 @@ #define OVERRIDE_comb_filter void comb_filter(opus_val32 *y, opus_val32 *x, int T0, int T1, int N, opus_val16 g0, opus_val16 g1, int tapset0, int tapset1, - const opus_val16 *window, int overlap) + const opus_val16 *window, int overlap, int arch) { int i; opus_val32 x0, x1, x2, x3, x4; diff --git a/celt/pitch.c b/celt/pitch.c index 43647030..1d89cb03 100644 --- a/celt/pitch.c +++ b/celt/pitch.c @@ -439,7 +439,7 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod, T = T0 = *T0_; ALLOC(yy_lookup, maxperiod+1, opus_val32); - dual_inner_prod(x, x, x-T0, N, &xx, &xy); + dual_inner_prod(x, x, x-T0, N, &xx, &xy, arch); yy_lookup[0] = xx; yy=xx; for (i=1;i<=maxperiod;i++) @@ -483,7 +483,7 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod, { T1b = celt_udiv(2*second_check[k]*T0+k, 2*k); } - dual_inner_prod(x, &x[-T1], &x[-T1b], N, &xy, &xy2); + dual_inner_prod(x, &x[-T1], &x[-T1b], N, &xy, &xy2, arch); xy += xy2; yy = yy_lookup[T1] + yy_lookup[T1b]; #ifdef FIXED_POINT diff --git a/celt/pitch.h b/celt/pitch.h index 50b1ea6e..af745eba 100644 --- a/celt/pitch.h +++ b/celt/pitch.h @@ -37,8 +37,8 @@ #include "modes.h" #include "cpu_support.h" -#if defined(__SSE__) && !defined(FIXED_POINT) \ - || defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2) +#if (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)) \ + || ((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT)) #include "x86/pitch_sse.h" #endif @@ -135,8 +135,7 @@ static OPUS_INLINE void xcorr_kernel_c(const opus_val16 * x, const opus_val16 * #endif /* OVERRIDE_XCORR_KERNEL */ -#ifndef OVERRIDE_DUAL_INNER_PROD -static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, +static OPUS_INLINE void dual_inner_prod_c(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, int N, opus_val32 *xy1, opus_val32 *xy2) { int i; @@ -150,6 +149,10 @@ static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y *xy1 = xy01; *xy2 = xy02; } + +#ifndef OVERRIDE_DUAL_INNER_PROD +# define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) \ + ((void)(arch),dual_inner_prod_c(x, y01, y02, N, xy1, xy2)) #endif /*We make sure a C version is always available for cases where the overhead of @@ -169,6 +172,12 @@ static OPUS_INLINE opus_val32 celt_inner_prod_c(const opus_val16 *x, ((void)(arch),celt_inner_prod_c(x, y, N)) #endif +#ifdef NON_STATIC_COMB_FILTER_CONST_C +void comb_filter_const_c(opus_val32 *y, opus_val32 *x, int T, int N, + opus_val16 g10, opus_val16 g11, opus_val16 g12); +#endif + + #ifdef FIXED_POINT opus_val32 #else diff --git a/celt/x86/celt_lpc_sse.c b/celt/x86/celt_lpc_sse.c index 9fb97798..67e5592a 100644 --- a/celt/x86/celt_lpc_sse.c +++ b/celt/x86/celt_lpc_sse.c @@ -38,6 +38,8 @@ #include "pitch.h" #include "x86cpu.h" +#if defined(FIXED_POINT) + void celt_fir_sse4_1(const opus_val16 *_x, const opus_val16 *num, opus_val16 *_y, @@ -126,3 +128,5 @@ void celt_fir_sse4_1(const opus_val16 *_x, #endif RESTORE_STACK; } + +#endif diff --git a/celt/x86/celt_lpc_sse.h b/celt/x86/celt_lpc_sse.h index f1114202..c5ec796e 100644 --- a/celt/x86/celt_lpc_sse.h +++ b/celt/x86/celt_lpc_sse.h @@ -32,7 +32,9 @@ #include "config.h" #endif -#if defined(OPUS_X86_MAY_HAVE_SSE4_1) +#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT) +#define OVERRIDE_CELT_FIR + void celt_fir_sse4_1( const opus_val16 *x, const opus_val16 *num, @@ -42,6 +44,12 @@ void celt_fir_sse4_1( opus_val16 *mem, int arch); +#if defined(OPUS_X86_PRESUME_SSE4_1) +#define celt_fir(x, num, y, N, ord, mem, arch) \ + ((void)arch, celt_fir_sse4_1(x, num, y, N, ord, mem, arch)) + +#else + extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])( const opus_val16 *x, const opus_val16 *num, @@ -56,3 +64,5 @@ extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])( #endif #endif + +#endif diff --git a/celt/x86/pitch_sse.c b/celt/x86/pitch_sse.c index 13d739e2..20e73126 100644 --- a/celt/x86/pitch_sse.c +++ b/celt/x86/pitch_sse.c @@ -35,3 +35,151 @@ #include "mathops.h" #include "pitch.h" +#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT) + +#include <xmmintrin.h> +#include "arch.h" + +void xcorr_kernel_sse(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len) +{ + int j; + __m128 xsum1, xsum2; + xsum1 = _mm_loadu_ps(sum); + xsum2 = _mm_setzero_ps(); + + for (j = 0; j < len-3; j += 4) + { + __m128 x0 = _mm_loadu_ps(x+j); + __m128 yj = _mm_loadu_ps(y+j); + __m128 y3 = _mm_loadu_ps(y+j+3); + + xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj)); + xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55), + _mm_shuffle_ps(yj,y3,0x49))); + xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa), + _mm_shuffle_ps(yj,y3,0x9e))); + xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3)); + } + if (j < len) + { + xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); + if (++j < len) + { + xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); + if (++j < len) + { + xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); + } + } + } + _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2)); +} + + +void dual_inner_prod_sse(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, + int N, opus_val32 *xy1, opus_val32 *xy2) +{ + int i; + __m128 xsum1, xsum2; + xsum1 = _mm_setzero_ps(); + xsum2 = _mm_setzero_ps(); + for (i=0;i<N-3;i+=4) + { + __m128 xi = _mm_loadu_ps(x+i); + __m128 y1i = _mm_loadu_ps(y01+i); + __m128 y2i = _mm_loadu_ps(y02+i); + xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i)); + xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i)); + } + /* Horizontal sum */ + xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1)); + xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55)); + _mm_store_ss(xy1, xsum1); + xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2)); + xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55)); + _mm_store_ss(xy2, xsum2); + for (;i<N;i++) + { + *xy1 = MAC16_16(*xy1, x[i], y01[i]); + *xy2 = MAC16_16(*xy2, x[i], y02[i]); + } +} + +opus_val32 celt_inner_prod_sse(const opus_val16 *x, const opus_val16 *y, + int N) +{ + int i; + float xy; + __m128 sum; + sum = _mm_setzero_ps(); + /* FIXME: We should probably go 8-way and use 2 sums. */ + for (i=0;i<N-3;i+=4) + { + __m128 xi = _mm_loadu_ps(x+i); + __m128 yi = _mm_loadu_ps(y+i); + sum = _mm_add_ps(sum,_mm_mul_ps(xi, yi)); + } + /* Horizontal sum */ + sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum)); + sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55)); + _mm_store_ss(&xy, sum); + for (;i<N;i++) + { + xy = MAC16_16(xy, x[i], y[i]); + } + return xy; +} + +void comb_filter_const_sse(opus_val32 *y, opus_val32 *x, int T, int N, + opus_val16 g10, opus_val16 g11, opus_val16 g12) +{ + int i; + __m128 x0v; + __m128 g10v, g11v, g12v; + g10v = _mm_load1_ps(&g10); + g11v = _mm_load1_ps(&g11); + g12v = _mm_load1_ps(&g12); + x0v = _mm_loadu_ps(&x[-T-2]); + for (i=0;i<N-3;i+=4) + { + __m128 yi, yi2, x1v, x2v, x3v, x4v; + const opus_val32 *xp = &x[i-T-2]; + yi = _mm_loadu_ps(x+i); + x4v = _mm_loadu_ps(xp+4); +#if 0 + /* Slower version with all loads */ + x1v = _mm_loadu_ps(xp+1); + x2v = _mm_loadu_ps(xp+2); + x3v = _mm_loadu_ps(xp+3); +#else + x2v = _mm_shuffle_ps(x0v, x4v, 0x4e); + x1v = _mm_shuffle_ps(x0v, x2v, 0x99); + x3v = _mm_shuffle_ps(x2v, x4v, 0x99); +#endif + + yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v)); +#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */ + yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v))); + yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v))); +#else + /* Use partial sums */ + yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)), + _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v))); + yi = _mm_add_ps(yi, yi2); +#endif + x0v=x4v; + _mm_storeu_ps(y+i, yi); + } +#ifdef CUSTOM_MODES + for (;i<N;i++) + { + y[i] = x[i] + + MULT16_32_Q15(g10,x[i-T]) + + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1])) + + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2])); + } +#endif +} + + +#endif diff --git a/celt/x86/pitch_sse.h b/celt/x86/pitch_sse.h index 99d1919a..d4cbeb8b 100644 --- a/celt/x86/pitch_sse.h +++ b/celt/x86/pitch_sse.h @@ -37,17 +37,37 @@ #include "config.h" #endif -#if defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2) -#if defined(OPUS_X86_MAY_HAVE_SSE4_1) +#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT) void xcorr_kernel_sse4_1( const opus_int16 *x, const opus_int16 *y, opus_val32 sum[4], int len); +#endif + +#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT) +void xcorr_kernel_sse( + const opus_val16 *x, + const opus_val16 *y, + opus_val32 sum[4], + int len); +#endif + +#if defined(OPUS_X86_PRESUME_SSE4_1) && defined(FIXED_POINT) +#define OVERRIDE_XCORR_KERNEL +#define xcorr_kernel(x, y, sum, len, arch) \ + ((void)arch, xcorr_kernel_sse4_1(x, y, sum, len)) + +#elif defined(OPUS_X86_PRESUME_SSE) && !defined(FIXED_POINT) +#define OVERRIDE_XCORR_KERNEL +#define xcorr_kernel(x, y, sum, len, arch) \ + ((void)arch, xcorr_kernel_sse(x, y, sum, len)) + +#elif (defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)) || (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)) extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])( - const opus_int16 *x, - const opus_int16 *y, + const opus_val16 *x, + const opus_val16 *y, opus_val32 sum[4], int len); @@ -55,181 +75,118 @@ extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])( #define xcorr_kernel(x, y, sum, len, arch) \ ((*XCORR_KERNEL_IMPL[(arch) & OPUS_ARCHMASK])(x, y, sum, len)) +#endif + +#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT) opus_val32 celt_inner_prod_sse4_1( const opus_int16 *x, const opus_int16 *y, int N); #endif -#if defined(OPUS_X86_MAY_HAVE_SSE2) +#if defined(OPUS_X86_MAY_HAVE_SSE2) && defined(FIXED_POINT) opus_val32 celt_inner_prod_sse2( const opus_int16 *x, const opus_int16 *y, int N); #endif +#if defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(FIXED_POINT) +opus_val32 celt_inner_prod_sse( + const opus_val16 *x, + const opus_val16 *y, + int N); +#endif + + +#if defined(OPUS_X86_PRESUME_SSE4_1) && defined(FIXED_POINT) +#define OVERRIDE_CELT_INNER_PROD +#define celt_inner_prod(x, y, N, arch) \ + ((void)arch, celt_inner_prod_sse4_1(x, y, N)) + +#elif defined(OPUS_X86_PRESUME_SSE2) && defined(FIXED_POINT) && !defined(OPUS_X86_MAY_HAVE_SSE4_1) +#define OVERRIDE_CELT_INNER_PROD +#define celt_inner_prod(x, y, N, arch) \ + ((void)arch, celt_inner_prod_sse2(x, y, N)) + +#elif defined(OPUS_X86_PRESUME_SSE) && !defined(FIXED_POINT) +#define OVERRIDE_CELT_INNER_PROD +#define celt_inner_prod(x, y, N, arch) \ + ((void)arch, celt_inner_prod_sse(x, y, N)) + + +#elif ((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT)) || \ + (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)) + extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( - const opus_int16 *x, - const opus_int16 *y, + const opus_val16 *x, + const opus_val16 *y, int N); #define OVERRIDE_CELT_INNER_PROD #define celt_inner_prod(x, y, N, arch) \ ((*CELT_INNER_PROD_IMPL[(arch) & OPUS_ARCHMASK])(x, y, N)) -#else -#include <xmmintrin.h> -#include "arch.h" +#endif -#define OVERRIDE_XCORR_KERNEL -static OPUS_INLINE void xcorr_kernel_sse(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len) -{ - int j; - __m128 xsum1, xsum2; - xsum1 = _mm_loadu_ps(sum); - xsum2 = _mm_setzero_ps(); - - for (j = 0; j < len-3; j += 4) - { - __m128 x0 = _mm_loadu_ps(x+j); - __m128 yj = _mm_loadu_ps(y+j); - __m128 y3 = _mm_loadu_ps(y+j+3); - - xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj)); - xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55), - _mm_shuffle_ps(yj,y3,0x49))); - xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa), - _mm_shuffle_ps(yj,y3,0x9e))); - xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3)); - } - if (j < len) - { - xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); - if (++j < len) - { - xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); - if (++j < len) - { - xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); - } - } - } - _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2)); -} - -#define xcorr_kernel(_x, _y, _z, len, arch) \ - ((void)(arch),xcorr_kernel_sse(_x, _y, _z, len)) +#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT) #define OVERRIDE_DUAL_INNER_PROD -static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, - int N, opus_val32 *xy1, opus_val32 *xy2) -{ - int i; - __m128 xsum1, xsum2; - xsum1 = _mm_setzero_ps(); - xsum2 = _mm_setzero_ps(); - for (i=0;i<N-3;i+=4) - { - __m128 xi = _mm_loadu_ps(x+i); - __m128 y1i = _mm_loadu_ps(y01+i); - __m128 y2i = _mm_loadu_ps(y02+i); - xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i)); - xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i)); - } - /* Horizontal sum */ - xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1)); - xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55)); - _mm_store_ss(xy1, xsum1); - xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2)); - xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55)); - _mm_store_ss(xy2, xsum2); - for (;i<N;i++) - { - *xy1 = MAC16_16(*xy1, x[i], y01[i]); - *xy2 = MAC16_16(*xy2, x[i], y02[i]); - } -} +#define OVERRIDE_COMB_FILTER_CONST -#define OVERRIDE_CELT_INNER_PROD -static OPUS_INLINE opus_val32 celt_inner_prod_sse(const opus_val16 *x, const opus_val16 *y, - int N) -{ - int i; - float xy; - __m128 sum; - sum = _mm_setzero_ps(); - /* FIXME: We should probably go 8-way and use 2 sums. */ - for (i=0;i<N-3;i+=4) - { - __m128 xi = _mm_loadu_ps(x+i); - __m128 yi = _mm_loadu_ps(y+i); - sum = _mm_add_ps(sum,_mm_mul_ps(xi, yi)); - } - /* Horizontal sum */ - sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum)); - sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55)); - _mm_store_ss(&xy, sum); - for (;i<N;i++) - { - xy = MAC16_16(xy, x[i], y[i]); - } - return xy; -} - -# define celt_inner_prod(_x, _y, len, arch) \ - ((void)(arch),celt_inner_prod_sse(_x, _y, len)) +#undef dual_inner_prod +#undef comb_filter_const -#define OVERRIDE_COMB_FILTER_CONST -static OPUS_INLINE void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N, - opus_val16 g10, opus_val16 g11, opus_val16 g12) -{ - int i; - __m128 x0v; - __m128 g10v, g11v, g12v; - g10v = _mm_load1_ps(&g10); - g11v = _mm_load1_ps(&g11); - g12v = _mm_load1_ps(&g12); - x0v = _mm_loadu_ps(&x[-T-2]); - for (i=0;i<N-3;i+=4) - { - __m128 yi, yi2, x1v, x2v, x3v, x4v; - const opus_val32 *xp = &x[i-T-2]; - yi = _mm_loadu_ps(x+i); - x4v = _mm_loadu_ps(xp+4); -#if 0 - /* Slower version with all loads */ - x1v = _mm_loadu_ps(xp+1); - x2v = _mm_loadu_ps(xp+2); - x3v = _mm_loadu_ps(xp+3); -#else - x2v = _mm_shuffle_ps(x0v, x4v, 0x4e); - x1v = _mm_shuffle_ps(x0v, x2v, 0x99); - x3v = _mm_shuffle_ps(x2v, x4v, 0x99); -#endif +void dual_inner_prod_sse(const opus_val16 *x, + const opus_val16 *y01, + const opus_val16 *y02, + int N, + opus_val32 *xy1, + opus_val32 *xy2); + +void comb_filter_const_sse(opus_val32 *y, + opus_val32 *x, + int T, + int N, + opus_val16 g10, + opus_val16 g11, + opus_val16 g12); - yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v)); -#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */ - yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v))); - yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v))); + +#if defined(OPUS_X86_PRESUME_SSE) +# define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) \ + ((void)(arch),dual_inner_prod_sse(x, y01, y02, N, xy1, xy2)) + +# define comb_filter_const(y, x, T, N, g10, g11, g12, arch) \ + ((void)(arch),comb_filter_const_sse(y, x, T, N, g10, g11, g12)) #else - /* Use partial sums */ - yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)), - _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v))); - yi = _mm_add_ps(yi, yi2); + +extern void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( + const opus_val16 *x, + const opus_val16 *y01, + const opus_val16 *y02, + int N, + opus_val32 *xy1, + opus_val32 *xy2); + +#define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) \ + ((*DUAL_INNER_PROD_IMPL[(arch) & OPUS_ARCHMASK])(x, y01, y02, N, xy1, xy2)) + +extern void (*const COMB_FILTER_CONST_IMPL[OPUS_ARCHMASK + 1])( + opus_val32 *y, + opus_val32 *x, + int T, + int N, + opus_val16 g10, + opus_val16 g11, + opus_val16 g12); + +#define comb_filter_const(y, x, T, N, g10, g11, g12, arch) \ + ((*COMB_FILTER_CONST_IMPL[(arch) & OPUS_ARCHMASK])(y, x, T, N, g10, g11, g12)) + +#define NON_STATIC_COMB_FILTER_CONST_C + #endif - x0v=x4v; - _mm_storeu_ps(y+i, yi); - } -#ifdef CUSTOM_MODES - for (;i<N;i++) - { - y[i] = x[i] - + MULT16_32_Q15(g10,x[i-T]) - + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1])) - + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2])); - } #endif -} #endif -#endif diff --git a/celt/x86/x86_celt_map.c b/celt/x86/x86_celt_map.c index 83410dbc..1ed2acbc 100644 --- a/celt/x86/x86_celt_map.c +++ b/celt/x86/x86_celt_map.c @@ -38,6 +38,8 @@ # if defined(FIXED_POINT) +#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1) + void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])( const opus_val16 *x, const opus_val16 *num, @@ -49,8 +51,8 @@ void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])( ) = { celt_fir_c, /* non-sse */ celt_fir_c, + celt_fir_c, MAY_HAVE_SSE4_1(celt_fir), /* sse4.1 */ - NULL }; void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])( @@ -61,24 +63,86 @@ void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])( ) = { xcorr_kernel_c, /* non-sse */ xcorr_kernel_c, + xcorr_kernel_c, MAY_HAVE_SSE4_1(xcorr_kernel), /* sse4.1 */ - NULL }; +#endif + +#if (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \ + (!defined(OPUS_X86_MAY_HAVE_SSE_4_1) && defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) + opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( const opus_val16 *x, const opus_val16 *y, int N ) = { celt_inner_prod_c, /* non-sse */ + celt_inner_prod_c, MAY_HAVE_SSE2(celt_inner_prod), MAY_HAVE_SSE4_1(celt_inner_prod), /* sse4.1 */ - NULL }; +#endif + # else -# error "Floating-point implementation is not supported by x86 RTCD yet." \ - "Reconfigure with --disable-rtcd or send patches." -# endif +#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE) + +void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])( + const opus_val16 *x, + const opus_val16 *y, + opus_val32 sum[4], + int len +) = { + xcorr_kernel_c, /* non-sse */ + MAY_HAVE_SSE(xcorr_kernel), + MAY_HAVE_SSE(xcorr_kernel), + MAY_HAVE_SSE(xcorr_kernel), +}; + +opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( + const opus_val16 *x, + const opus_val16 *y, + int N +) = { + celt_inner_prod_c, /* non-sse */ + MAY_HAVE_SSE(celt_inner_prod), + MAY_HAVE_SSE(celt_inner_prod), + MAY_HAVE_SSE(celt_inner_prod), +}; + +void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( + const opus_val16 *x, + const opus_val16 *y01, + const opus_val16 *y02, + int N, + opus_val32 *xy1, + opus_val32 *xy2 +) = { + dual_inner_prod_c, /* non-sse */ + MAY_HAVE_SSE(dual_inner_prod), + MAY_HAVE_SSE(dual_inner_prod), + MAY_HAVE_SSE(dual_inner_prod), +}; + +void (*const COMB_FILTER_CONST_IMPL[OPUS_ARCHMASK + 1])( + opus_val32 *y, + opus_val32 *x, + int T, + int N, + opus_val16 g10, + opus_val16 g11, + opus_val16 g12 +) = { + comb_filter_const_c, /* non-sse */ + MAY_HAVE_SSE(comb_filter_const), + MAY_HAVE_SSE(comb_filter_const), + MAY_HAVE_SSE(comb_filter_const), +}; + + +#endif + +#endif #endif diff --git a/celt/x86/x86cpu.c b/celt/x86/x86cpu.c index b901bd96..76bfd6c7 100644 --- a/celt/x86/x86cpu.c +++ b/celt/x86/x86cpu.c @@ -35,6 +35,11 @@ #include "pitch.h" #include "x86cpu.h" +#if (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \ + (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \ + (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) + + #if defined(_MSC_VER) #include <intrin.h> @@ -79,6 +84,7 @@ static void cpuid(unsigned int CPUInfo[4], unsigned int InfoType) typedef struct CPU_Feature{ /* SIMD: 128-bit */ + int HW_SSE; int HW_SSE2; int HW_SSE41; } CPU_Feature; @@ -93,10 +99,12 @@ static void opus_cpu_feature_check(CPU_Feature *cpu_feature) if (nIds >= 1){ cpuid(info, 1); + cpu_feature->HW_SSE = (info[3] & (1 << 25)) != 0; cpu_feature->HW_SSE2 = (info[3] & (1 << 26)) != 0; cpu_feature->HW_SSE41 = (info[2] & (1 << 19)) != 0; } else { + cpu_feature->HW_SSE = 0; cpu_feature->HW_SSE2 = 0; cpu_feature->HW_SSE41 = 0; } @@ -110,6 +118,12 @@ int opus_select_arch(void) opus_cpu_feature_check(&cpu_feature); arch = 0; + if (!cpu_feature.HW_SSE) + { + return arch; + } + arch++; + if (!cpu_feature.HW_SSE2) { return arch; @@ -124,3 +138,5 @@ int opus_select_arch(void) return arch; } + +#endif diff --git a/celt/x86/x86cpu.h b/celt/x86/x86cpu.h index cdbab9c3..870b15e1 100644 --- a/celt/x86/x86cpu.h +++ b/celt/x86/x86cpu.h @@ -28,6 +28,12 @@ #if !defined(X86CPU_H) # define X86CPU_H +# if defined(OPUS_X86_MAY_HAVE_SSE) +# define MAY_HAVE_SSE(name) name ## _sse +# else +# define MAY_HAVE_SSE(name) name ## _c +# endif + # if defined(OPUS_X86_MAY_HAVE_SSE2) # define MAY_HAVE_SSE2(name) name ## _sse2 # else diff --git a/configure.ac b/configure.ac index ea91ab2e..de75c094 100644 --- a/configure.ac +++ b/configure.ac @@ -491,9 +491,6 @@ AS_IF([test x"$enable_intrinsics" = x"yes"],[ ] ) - #Currently we only have intrinsic optimizations for floating point - AS_IF([test x"$enable_float" = x"no"], - [ AS_IF([test x"$rtcd_support" = x"no"], [rtcd_support=""]) AS_IF([test x"$OPUS_X86_MAY_HAVE_SSE" = x"1"], [ @@ -541,11 +538,6 @@ AS_IF([test x"$enable_intrinsics" = x"yes"],[ [rtcd_support=no], [rtcd_support="x86$rtcd_support"], ) - ], [ - AC_MSG_WARN([Currently only have X86 intrinsics for fixed-point]) - intrinsics_support=no - ] - ) AS_IF([test x"$enable_rtcd" = x"yes" && test x"$rtcd_support" != x""],[ get_cpuid_by_asm="no" diff --git a/silk/x86/SigProc_FIX_sse.h b/silk/x86/SigProc_FIX_sse.h index 9a0e0964..61efa8da 100644 --- a/silk/x86/SigProc_FIX_sse.h +++ b/silk/x86/SigProc_FIX_sse.h @@ -45,6 +45,12 @@ void silk_burg_modified_sse4_1( int arch /* I Run-time architecture */ ); +#if defined(OPUS_X86_PRESUME_SSE4_1) +#define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \ + ((void)(arch), silk_burg_modified_sse4_1(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch)) + +#else + extern void (*const SILK_BURG_MODIFIED_IMPL[OPUS_ARCHMASK + 1])( opus_int32 *res_nrg, /* O Residual energy */ opus_int *res_nrg_Q, /* O Residual energy Q value */ @@ -59,12 +65,22 @@ extern void (*const SILK_BURG_MODIFIED_IMPL[OPUS_ARCHMASK + 1])( # define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \ ((*SILK_BURG_MODIFIED_IMPL[(arch) & OPUS_ARCHMASK])(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch)) +#endif + opus_int64 silk_inner_prod16_aligned_64_sse4_1( const opus_int16 *inVec1, const opus_int16 *inVec2, const opus_int len ); + +#if defined(OPUS_X86_PRESUME_SSE4_1) + +#define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \ + ((void)(arch),silk_inner_prod16_aligned_64_sse4_1(inVec1, inVec2, len)) + +#else + extern opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[OPUS_ARCHMASK + 1])( const opus_int16 *inVec1, const opus_int16 *inVec2, @@ -75,3 +91,4 @@ extern opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[OPUS_ARCHMASK + 1])( #endif #endif +#endif diff --git a/silk/x86/main_sse.h b/silk/x86/main_sse.h index f970632c..afd5ec26 100644 --- a/silk/x86/main_sse.h +++ b/silk/x86/main_sse.h @@ -50,6 +50,15 @@ void silk_VQ_WMat_EC_sse4_1( opus_int L /* I number of vectors in codebook */ ); +#if defined OPUS_X86_PRESUME_SSE4_1 + +#define silk_VQ_WMat_EC(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \ + mu_Q9, max_gain_Q7, L, arch) \ + ((void)(arch),silk_VQ_WMat_EC_sse4_1(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \ + mu_Q9, max_gain_Q7, L)) + +#else + extern void (*const SILK_VQ_WMAT_EC_IMPL[OPUS_ARCHMASK + 1])( opus_int8 *ind, /* O index of best codebook vector */ opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */ @@ -69,6 +78,8 @@ extern void (*const SILK_VQ_WMAT_EC_IMPL[OPUS_ARCHMASK + 1])( ((*SILK_VQ_WMAT_EC_IMPL[(arch) & OPUS_ARCHMASK])(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \ mu_Q9, max_gain_Q7, L)) +#endif + # define OVERRIDE_silk_NSQ void silk_NSQ_sse4_1( @@ -89,6 +100,15 @@ void silk_NSQ_sse4_1( const opus_int LTP_scale_Q14 /* I LTP state scaling */ ); +#if defined OPUS_X86_PRESUME_SSE4_1 + +#define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ + HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \ + ((void)(arch),silk_NSQ_sse4_1(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ + HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14)) + +#else + extern void (*const SILK_NSQ_IMPL[OPUS_ARCHMASK + 1])( const silk_encoder_state *psEncC, /* I/O Encoder State */ silk_nsq_state *NSQ, /* I/O NSQ state */ @@ -112,6 +132,8 @@ extern void (*const SILK_NSQ_IMPL[OPUS_ARCHMASK + 1])( ((*SILK_NSQ_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14)) +#endif + # define OVERRIDE_silk_NSQ_del_dec void silk_NSQ_del_dec_sse4_1( @@ -132,6 +154,15 @@ void silk_NSQ_del_dec_sse4_1( const opus_int LTP_scale_Q14 /* I LTP state scaling */ ); +#if defined OPUS_X86_PRESUME_SSE4_1 + +#define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ + HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \ + ((void)(arch),silk_NSQ_del_dec_sse4_1(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ + HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14)) + +#else + extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])( const silk_encoder_state *psEncC, /* I/O Encoder State */ silk_nsq_state *NSQ, /* I/O NSQ state */ @@ -155,6 +186,8 @@ extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])( ((*SILK_NSQ_DEL_DEC_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \ HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14)) +#endif + void silk_noise_shape_quantizer( silk_nsq_state *NSQ, /* I/O NSQ state */ opus_int signalType, /* I Signal type */ @@ -192,6 +225,11 @@ opus_int silk_VAD_GetSA_Q8_sse4_1( const opus_int16 pIn[] ); +#if defined(OPUS_X86_PRESUME_SSE4_1) +#define silk_VAD_GetSA_Q8(psEnC, pIn, arch) ((void)(arch),silk_VAD_GetSA_Q8_sse4_1(psEnC, pIn)) + +#else + # define silk_VAD_GetSA_Q8(psEnC, pIn, arch) \ ((*SILK_VAD_GETSA_Q8_IMPL[(arch) & OPUS_ARCHMASK])(psEnC, pIn)) @@ -201,6 +239,8 @@ extern opus_int (*const SILK_VAD_GETSA_Q8_IMPL[OPUS_ARCHMASK + 1])( # define OVERRIDE_silk_warped_LPC_analysis_filter_FIX +#endif + void silk_warped_LPC_analysis_filter_FIX_sse4_1( opus_int32 state[], /* I/O State [order + 1] */ opus_int32 res_Q2[], /* O Residual signal [length] */ @@ -211,6 +251,12 @@ void silk_warped_LPC_analysis_filter_FIX_sse4_1( const opus_int order /* I Filter order (even) */ ); +#if defined(OPUS_X86_PRESUME_SSE4_1) +#define silk_warped_LPC_analysis_filter_FIX(state, res_Q2, coef_Q13, input, lambda_Q16, length, order, arch) \ + ((void)(arch),silk_warped_LPC_analysis_filter_FIX_c(state, res_Q2, coef_Q13, input, lambda_Q16, length, order)) + +#else + extern void (*const SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[OPUS_ARCHMASK + 1])( opus_int32 state[], /* I/O State [order + 1] */ opus_int32 res_Q2[], /* O Residual signal [length] */ @@ -224,5 +270,7 @@ extern void (*const SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[OPUS_ARCHMASK + 1]) # define silk_warped_LPC_analysis_filter_FIX(state, res_Q2, coef_Q13, input, lambda_Q16, length, order, arch) \ ((*SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[(arch) & OPUS_ARCHMASK])(state, res_Q2, coef_Q13, input, lambda_Q16, length, order)) +#endif + # endif #endif diff --git a/silk/x86/x86_silk_map.c b/silk/x86/x86_silk_map.c index 6747d101..ad9fef2a 100644 --- a/silk/x86/x86_silk_map.c +++ b/silk/x86/x86_silk_map.c @@ -35,6 +35,10 @@ #include "pitch.h" #include "main.h" +#if !defined(OPUS_X86_PRESUME_SSE4_1) + +#if defined(FIXED_POINT) + opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[ OPUS_ARCHMASK + 1 ] )( const opus_int16 *inVec1, const opus_int16 *inVec2, @@ -42,18 +46,20 @@ opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[ OPUS_ARCHMASK + 1 ] )( ) = { silk_inner_prod16_aligned_64_c, /* non-sse */ silk_inner_prod16_aligned_64_c, + silk_inner_prod16_aligned_64_c, MAY_HAVE_SSE4_1( silk_inner_prod16_aligned_64 ), /* sse4.1 */ - NULL }; +#endif + opus_int (*const SILK_VAD_GETSA_Q8_IMPL[ OPUS_ARCHMASK + 1 ] )( silk_encoder_state *psEncC, const opus_int16 pIn[] ) = { silk_VAD_GetSA_Q8_c, /* non-sse */ silk_VAD_GetSA_Q8_c, + silk_VAD_GetSA_Q8_c, MAY_HAVE_SSE4_1( silk_VAD_GetSA_Q8 ), /* sse4.1 */ - NULL }; void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )( @@ -75,8 +81,8 @@ void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )( ) = { silk_NSQ_c, /* non-sse */ silk_NSQ_c, + silk_NSQ_c, MAY_HAVE_SSE4_1( silk_NSQ ), /* sse4.1 */ - NULL }; void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ] )( @@ -94,8 +100,8 @@ void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ] )( ) = { silk_VQ_WMat_EC_c, /* non-sse */ silk_VQ_WMat_EC_c, + silk_VQ_WMat_EC_c, MAY_HAVE_SSE4_1( silk_VQ_WMat_EC ), /* sse4.1 */ - NULL }; void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )( @@ -117,10 +123,12 @@ void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )( ) = { silk_NSQ_del_dec_c, /* non-sse */ silk_NSQ_del_dec_c, + silk_NSQ_del_dec_c, MAY_HAVE_SSE4_1( silk_NSQ_del_dec ), /* sse4.1 */ - NULL }; +#if defined(FIXED_POINT) + void (*const SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[ OPUS_ARCHMASK + 1 ] )( opus_int32 state[], /* I/O State [order + 1] */ opus_int32 res_Q2[], /* O Residual signal [length] */ @@ -132,8 +140,8 @@ void (*const SILK_WARPED_LPC_ANALYSIS_FILTER_FIX_IMPL[ OPUS_ARCHMASK + 1 ] )( ) = { silk_warped_LPC_analysis_filter_FIX_c, /* non-sse */ silk_warped_LPC_analysis_filter_FIX_c, + silk_warped_LPC_analysis_filter_FIX_c, MAY_HAVE_SSE4_1( silk_warped_LPC_analysis_filter_FIX ), /* sse4.1 */ - NULL }; void (*const SILK_BURG_MODIFIED_IMPL[ OPUS_ARCHMASK + 1 ] )( @@ -149,6 +157,9 @@ void (*const SILK_BURG_MODIFIED_IMPL[ OPUS_ARCHMASK + 1 ] )( ) = { silk_burg_modified_c, /* non-sse */ silk_burg_modified_c, + silk_burg_modified_c, MAY_HAVE_SSE4_1( silk_burg_modified ), /* sse4.1 */ - NULL }; + +#endif +#endif -- GitLab