diff --git a/celt/bands.c b/celt/bands.c index 87791b4a9dfaec097e556a0b0e07238c3333cc6c..868a98dc4993399bb3fa91a7b5cff79eedfe9d65 100644 --- a/celt/bands.c +++ b/celt/bands.c @@ -1036,7 +1036,7 @@ static unsigned quant_partition(struct band_ctx *ctx, celt_norm *X, /* Finally do the actual quantization */ if (encode) { - cm = alg_quant(X, N, K, spread, B, ec, gain, ctx->resynth); + cm = alg_quant(X, N, K, spread, B, ec, gain, ctx->resynth, ctx->arch); } else { cm = alg_unquant(X, N, K, spread, B, ec, gain); } diff --git a/celt/tests/test_unit_mathops.c b/celt/tests/test_unit_mathops.c index fd3319da91ceb3ecfa94c702d1eff3dab738c87a..d49a2bd224465980da24d55e916ea5d008312f28 100644 --- a/celt/tests/test_unit_mathops.c +++ b/celt/tests/test_unit_mathops.c @@ -57,6 +57,7 @@ # endif # if defined(OPUS_X86_MAY_HAVE_SSE2) # include "x86/pitch_sse2.c" +# include "x86/vq_sse2.c" # endif # if defined(OPUS_X86_MAY_HAVE_SSE4_1) # include "x86/pitch_sse4_1.c" diff --git a/celt/tests/test_unit_rotation.c b/celt/tests/test_unit_rotation.c index 1080c2085d02c5c975fd0f5ef9dd832303ad9e45..571ed12d4a39f28c68de96b619c4d68167c1c256 100644 --- a/celt/tests/test_unit_rotation.c +++ b/celt/tests/test_unit_rotation.c @@ -55,6 +55,7 @@ # endif # if defined(OPUS_X86_MAY_HAVE_SSE2) # include "x86/pitch_sse2.c" +# include "x86/vq_sse2.c" # endif # if defined(OPUS_X86_MAY_HAVE_SSE4_1) # include "x86/pitch_sse4_1.c" diff --git a/celt/vq.c b/celt/vq.c index e68959523b338b63e84c7c52c712d3385f16e613..8d49d80491f16633b6631b9f86e366a44e2c178e 100644 --- a/celt/vq.c +++ b/celt/vq.c @@ -158,29 +158,21 @@ static unsigned extract_collapse_mask(int *iy, int N, int B) return collapse_mask; } -unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B, ec_enc *enc, - opus_val16 gain, int resynth) +opus_val16 op_pvq_search_c(celt_norm *X, int *iy, int K, int N, int arch) { VARDECL(celt_norm, y); - VARDECL(int, iy); VARDECL(int, signx); int i, j; int pulsesLeft; opus_val32 sum; opus_val32 xy; opus_val16 yy; - unsigned collapse_mask; SAVE_STACK; - celt_assert2(K>0, "alg_quant() needs at least one pulse"); - celt_assert2(N>1, "alg_quant() needs at least two dimensions"); - + (void)arch; ALLOC(y, N, celt_norm); - ALLOC(iy, N, int); ALLOC(signx, N, int); - exp_rotation(X, N, 1, B, K, spread); - /* Get rid of the sign */ sum = 0; j=0; do { @@ -322,6 +314,28 @@ unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B, ec_enc *enc, but has the same performance otherwise. */ iy[j] = (iy[j]^-signx[j]) + signx[j]; } while (++j<N); + RESTORE_STACK; + return yy; +} + +unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B, ec_enc *enc, + opus_val16 gain, int resynth, int arch) +{ + VARDECL(int, iy); + opus_val16 yy; + unsigned collapse_mask; + SAVE_STACK; + + celt_assert2(K>0, "alg_quant() needs at least one pulse"); + celt_assert2(N>1, "alg_quant() needs at least two dimensions"); + + /* Covers vectorization by up to 4. */ + ALLOC(iy, N+3, int); + + exp_rotation(X, N, 1, B, K, spread); + + yy = op_pvq_search(X, iy, K, N, arch); + encode_pulses(iy, N, K, enc); if (resynth) diff --git a/celt/vq.h b/celt/vq.h index c2fa02455621c0fd4b62d56c5346d669572c3102..5dbf9ce06315b65b5fc598ef6e3f54ed2a8e8b2d 100644 --- a/celt/vq.h +++ b/celt/vq.h @@ -37,10 +37,20 @@ #include "entdec.h" #include "modes.h" +#if (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(FIXED_POINT)) +#include "x86/vq_sse.h" +#endif + #if defined(MIPSr1_ASM) #include "mips/vq_mipsr1.h" #endif +opus_val16 op_pvq_search_c(celt_norm *X, int *iy, int K, int N, int arch); + +#if !defined(OVERRIDE_OP_PVQ_SEARCH) +#define op_pvq_search(x, iy, K, N, arch) \ + (op_pvq_search_c(x, iy, K, N, arch)) +#endif /** Algebraic pulse-vector quantiser. The signal x is replaced by the sum of * the pitch and a combination of pulses such that its norm is still equal @@ -52,7 +62,7 @@ * @ret A mask indicating which blocks in the band received pulses */ unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B, ec_enc *enc, - opus_val16 gain, int resynth); + opus_val16 gain, int resynth, int arch); /** Algebraic pulse decoder * @param X Decoded normalised spectrum (returned) diff --git a/celt/x86/vq_sse.h b/celt/x86/vq_sse.h new file mode 100644 index 0000000000000000000000000000000000000000..b4efe8f249f893ed160040ec825b2a7647e192bb --- /dev/null +++ b/celt/x86/vq_sse.h @@ -0,0 +1,50 @@ +/* Copyright (c) 2016 Jean-Marc Valin */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef VQ_SSE_H +#define VQ_SSE_H + +#if defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(FIXED_POINT) +#define OVERRIDE_OP_PVQ_SEARCH + +opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch); + +#if defined(OPUS_X86_PRESUME_SSE2) +#define op_pvq_search(x, iy, K, N, arch) \ + (op_pvq_search_sse2(x, iy, K, N, arch)) + +#else + +extern opus_val16 (*const OP_PVQ_SEARCH_IMPL[OPUS_ARCHMASK + 1])( + celt_norm *_X, int *iy, int K, int N, int arch); + +# define op_pvq_search(X, iy, K, N, arch) \ + ((*OP_PVQ_SEARCH_IMPL[(arch) & OPUS_ARCHMASK])(X, iy, K, N, arch)) + +#endif +#endif + +#endif diff --git a/celt/x86/vq_sse2.c b/celt/x86/vq_sse2.c new file mode 100644 index 0000000000000000000000000000000000000000..0891a5ba3f289248e670b48941cc442c53c6dc7d --- /dev/null +++ b/celt/x86/vq_sse2.c @@ -0,0 +1,217 @@ +/* Copyright (c) 2007-2008 CSIRO + Copyright (c) 2007-2009 Xiph.Org Foundation + Copyright (c) 2007-2016 Jean-Marc Valin */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <xmmintrin.h> +#include <emmintrin.h> +#include "celt_lpc.h" +#include "stack_alloc.h" +#include "mathops.h" +#include "vq.h" +#include "x86cpu.h" + + +#ifndef FIXED_POINT + +opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch) +{ + int i, j; + int pulsesLeft; + float xy, yy; + VARDECL(celt_norm, y); + VARDECL(celt_norm, X); + VARDECL(float, signy); + __m128 signmask; + __m128 sums; + __m128i fours; + SAVE_STACK; + + (void)arch; + /* All bits set to zero, except for the sign bit. */ + signmask = _mm_set_ps1(-0.f); + fours = _mm_set_epi32(4, 4, 4, 4); + ALLOC(y, N+3, celt_norm); + ALLOC(X, N+3, celt_norm); + ALLOC(signy, N+3, float); + + OPUS_COPY(X, _X, N); + X[N] = X[N+1] = X[N+2] = 0; + sums = _mm_setzero_ps(); + for (j=0;j<N;j+=4) + { + __m128 x4, s4; + x4 = _mm_loadu_ps(&X[j]); + s4 = _mm_cmplt_ps(x4, _mm_setzero_ps()); + /* Get rid of the sign */ + x4 = _mm_andnot_ps(signmask, x4); + sums = _mm_add_ps(sums, x4); + /* Clear y and iy in case we don't do the projection. */ + _mm_storeu_ps(&y[j], _mm_setzero_ps()); + _mm_storeu_si128((__m128i*)&iy[j], _mm_setzero_si128()); + _mm_storeu_ps(&X[j], x4); + _mm_storeu_ps(&signy[j], s4); + } + sums = _mm_add_ps(sums, _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(1, 0, 3, 2))); + sums = _mm_add_ps(sums, _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(2, 3, 0, 1))); + + xy = yy = 0; + + pulsesLeft = K; + + /* Do a pre-search by projecting on the pyramid */ + if (K > (N>>1)) + { + __m128i pulses_sum; + __m128 yy4, xy4; + __m128 rcp4; + opus_val32 sum = _mm_cvtss_f32(sums); + /* If X is too small, just replace it with a pulse at 0 */ + /* Prevents infinities and NaNs from causing too many pulses + to be allocated. 64 is an approximation of infinity here. */ + if (!(sum > EPSILON && sum < 64)) + { + X[0] = QCONST16(1.f,14); + j=1; do + X[j]=0; + while (++j<N); + sums = _mm_set_ps1(1.f); + } + rcp4 = _mm_mul_ps(_mm_set_ps1((float)(K-1)), _mm_rcp_ps(sums)); + xy4 = yy4 = _mm_setzero_ps(); + pulses_sum = _mm_setzero_si128(); + for (j=0;j<N;j+=4) + { + __m128 rx4, x4, y4; + __m128i iy4; + x4 = _mm_loadu_ps(&X[j]); + rx4 = _mm_mul_ps(x4, rcp4); + iy4 = _mm_cvttps_epi32(rx4); + pulses_sum = _mm_add_epi32(pulses_sum, iy4); + _mm_storeu_si128((__m128i*)&iy[j], iy4); + y4 = _mm_cvtepi32_ps(iy4); + xy4 = _mm_add_ps(xy4, _mm_mul_ps(x4, y4)); + yy4 = _mm_add_ps(yy4, _mm_mul_ps(y4, y4)); + /* double the y[] vector so we don't have to do it in the search loop. */ + _mm_storeu_ps(&y[j], _mm_add_ps(y4, y4)); + } + pulses_sum = _mm_add_epi32(pulses_sum, _mm_shuffle_epi32(pulses_sum, _MM_SHUFFLE(1, 0, 3, 2))); + pulses_sum = _mm_add_epi32(pulses_sum, _mm_shuffle_epi32(pulses_sum, _MM_SHUFFLE(2, 3, 0, 1))); + pulsesLeft -= _mm_cvtsi128_si32(pulses_sum); + xy4 = _mm_add_ps(xy4, _mm_shuffle_ps(xy4, xy4, _MM_SHUFFLE(1, 0, 3, 2))); + xy4 = _mm_add_ps(xy4, _mm_shuffle_ps(xy4, xy4, _MM_SHUFFLE(2, 3, 0, 1))); + xy = _mm_cvtss_f32(xy4); + yy4 = _mm_add_ps(yy4, _mm_shuffle_ps(yy4, yy4, _MM_SHUFFLE(1, 0, 3, 2))); + yy4 = _mm_add_ps(yy4, _mm_shuffle_ps(yy4, yy4, _MM_SHUFFLE(2, 3, 0, 1))); + yy = _mm_cvtss_f32(yy4); + } + X[N] = X[N+1] = X[N+2] = -100; + y[N] = y[N+1] = y[N+2] = 100; + celt_assert2(pulsesLeft>=1, "Allocated too many pulses in the quick pass"); + + /* This should never happen, but just in case it does (e.g. on silence) + we fill the first bin with pulses. */ + if (pulsesLeft > N+3) + { + opus_val16 tmp = (opus_val16)pulsesLeft; + yy = MAC16_16(yy, tmp, tmp); + yy = MAC16_16(yy, tmp, y[0]); + iy[0] += pulsesLeft; + pulsesLeft=0; + } + + for (i=0;i<pulsesLeft;i++) + { + int best_id; + __m128 xy4, yy4; + __m128 max, max2; + __m128i count; + __m128i pos; + best_id = 0; + /* The squared magnitude term gets added anyway, so we might as well + add it outside the loop */ + yy = ADD16(yy, 1); + xy4 = _mm_load1_ps(&xy); + yy4 = _mm_load1_ps(&yy); + max = _mm_setzero_ps(); + pos = _mm_setzero_si128(); + count = _mm_set_epi32(3, 2, 1, 0); + for (j=0;j<N;j+=4) + { + __m128 x4, y4, r4; + x4 = _mm_loadu_ps(&X[j]); + y4 = _mm_loadu_ps(&y[j]); + x4 = _mm_add_ps(x4, xy4); + y4 = _mm_add_ps(y4, yy4); + y4 = _mm_rsqrt_ps(y4); + r4 = _mm_mul_ps(x4, y4); + /* Update the index of the max. */ + pos = _mm_max_epi16(pos, _mm_and_si128(count, _mm_castps_si128(_mm_cmpgt_ps(r4, max)))); + /* Update the max. */ + max = _mm_max_ps(max, r4); + /* Update the indices (+4) */ + count = _mm_add_epi32(count, fours); + } + /* Horizontal max */ + max2 = _mm_max_ps(max, _mm_shuffle_ps(max, max, _MM_SHUFFLE(1, 0, 3, 2))); + max2 = _mm_max_ps(max2, _mm_shuffle_ps(max2, max2, _MM_SHUFFLE(2, 3, 0, 1))); + /* Now that max2 contains the max at all positions, look at which value(s) of the + partial max is equal to the global max. */ + pos = _mm_and_si128(pos, _mm_castps_si128(_mm_cmpeq_ps(max, max2))); + pos = _mm_max_epi16(pos, _mm_unpackhi_epi64(pos, pos)); + pos = _mm_max_epi16(pos, _mm_shufflelo_epi16(pos, _MM_SHUFFLE(1, 0, 3, 2))); + best_id = _mm_cvtsi128_si32(pos); + + /* Updating the sums of the new pulse(s) */ + xy = ADD32(xy, EXTEND32(X[best_id])); + /* We're multiplying y[j] by two so we don't have to do it here */ + yy = ADD16(yy, y[best_id]); + + /* Only now that we've made the final choice, update y/iy */ + /* Multiplying y[j] by 2 so we don't have to do it everywhere else */ + y[best_id] += 2; + iy[best_id]++; + } + + /* Put the original sign back */ + for (j=0;j<N;j+=4) + { + __m128i y4; + __m128i s4; + y4 = _mm_loadu_si128((__m128i*)&iy[j]); + s4 = _mm_castps_si128(_mm_loadu_ps(&signy[j])); + y4 = _mm_xor_si128(_mm_add_epi32(y4, s4), s4); + _mm_storeu_si128((__m128i*)&iy[j], y4); + } + RESTORE_STACK; + return yy; +} + +#endif diff --git a/celt/x86/x86_celt_map.c b/celt/x86/x86_celt_map.c index 47ba41b9eefd5e3dcf4e4f8f5908b1cb0c5eae5a..80559a28e85d467a140b1c713e14d2512e28ef89 100644 --- a/celt/x86/x86_celt_map.c +++ b/celt/x86/x86_celt_map.c @@ -33,6 +33,7 @@ #include "celt_lpc.h" #include "pitch.h" #include "pitch_sse.h" +#include "vq.h" #if defined(OPUS_HAVE_RTCD) @@ -151,5 +152,17 @@ void (*const COMB_FILTER_CONST_IMPL[OPUS_ARCHMASK + 1])( #endif +#if defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2) +opus_val16 (*const OP_PVQ_SEARCH_IMPL[OPUS_ARCHMASK + 1])( + celt_norm *_X, int *iy, int K, int N, int arch +) = { + op_pvq_search_c, /* non-sse */ + op_pvq_search_c, + MAY_HAVE_SSE2(op_pvq_search), + MAY_HAVE_SSE2(op_pvq_search), + MAY_HAVE_SSE2(op_pvq_search) +}; +#endif + #endif #endif diff --git a/celt_headers.mk b/celt_headers.mk index c9df94b336336554bf243c808e6b59b2b31b19b4..706185da4ea50d11069087276c958c2a903c6ea4 100644 --- a/celt_headers.mk +++ b/celt_headers.mk @@ -49,4 +49,5 @@ celt/mips/mdct_mipsr1.h \ celt/mips/pitch_mipsr1.h \ celt/mips/vq_mipsr1.h \ celt/x86/pitch_sse.h \ +celt/x86/vq_sse.h \ celt/x86/x86cpu.h diff --git a/celt_sources.mk b/celt_sources.mk index 2ffe99a3a9a68ca89084d207bcf9ba33dab2c9a3..cabc48ffc723553b26b414afb653d30011da001f 100644 --- a/celt_sources.mk +++ b/celt_sources.mk @@ -21,7 +21,7 @@ CELT_SOURCES_SSE = celt/x86/x86cpu.c \ celt/x86/x86_celt_map.c \ celt/x86/pitch_sse.c -CELT_SOURCES_SSE2 = celt/x86/pitch_sse2.c +CELT_SOURCES_SSE2 = celt/x86/pitch_sse2.c celt/x86/vq_sse2.c CELT_SOURCES_SSE4_1 = celt/x86/celt_lpc_sse.c \ celt/x86/pitch_sse4_1.c