Commit c95c9a04 authored by xiangmingzhu's avatar xiangmingzhu Committed by Jean-Marc Valin

Cisco optimization for x86 & fixed point

1. Only for fixed point on x86 platform (32bit and 64bit, uses SIMD
   intrinsics up to SSE4.2)
2. Use "configure --enable-fixed-point --enable-intrinsics" to enable
   optimization, default is disabled.
3. Official test cases are verified and passed.
Signed-off-by: Timothy B. Terriberry's avatarTimothy B. Terriberry <tterribe@xiph.org>
parent 80460334
......@@ -18,6 +18,9 @@ include opus_sources.mk
if FIXED_POINT
SILK_SOURCES += $(SILK_SOURCES_FIXED)
if HAVE_SSE4_1
SILK_SOURCES += $(SILK_SOURCES_SSE4_1) $(SILK_SOURCES_FIXED_SSE4_1)
endif
else
SILK_SOURCES += $(SILK_SOURCES_FLOAT)
endif
......@@ -27,6 +30,14 @@ else
OPUS_SOURCES += $(OPUS_SOURCES_FLOAT)
endif
if HAVE_SSE4_1
CELT_SOURCES += $(CELT_SOURCES_SSE) $(CELT_SOURCES_SSE4_1)
else
if HAVE_SSE2
CELT_SOURCES += $(CELT_SOURCES_SSE)
endif
endif
if CPU_ARM
CELT_SOURCES += $(CELT_SOURCES_ARM)
SILK_SOURCES += $(SILK_SOURCES_ARM)
......@@ -229,3 +240,13 @@ $(CELT_SOURCES_ARM_ASM:%.s=%-gnu.S): $(top_srcdir)/celt/arm/arm2gnu.pl
# For autoconf-modified sources (e.g., armopts.s)
%-gnu.S: %.s
$(top_srcdir)/celt/arm/arm2gnu.pl @ARM2GNU_PARAMS@ < $< > $@
SSE_OBJ = %_sse.o %_sse.lo %test_unit_mathops.o %test_unit_rotation.o
if HAVE_SSE4_1
$(SSE_OBJ): CFLAGS += -msse4.1
else
if HAVE_SSE2
$(SSE_OBJ): CFLAGS += -msse2
endif
endif
......@@ -164,7 +164,7 @@ void compute_band_energies(const CELTMode *m, const celt_sig *X, celt_ener *band
for (i=0;i<end;i++)
{
opus_val32 sum;
sum = 1e-27f + celt_inner_prod(&X[c*N+(eBands[i]<<LM)], &X[c*N+(eBands[i]<<LM)], (eBands[i+1]-eBands[i])<<LM);
sum = 1e-27f + celt_inner_prod_c(&X[c*N+(eBands[i]<<LM)], &X[c*N+(eBands[i]<<LM)], (eBands[i+1]-eBands[i])<<LM);
bandE[i+c*m->nbEBands] = celt_sqrt(sum);
/*printf ("%f ", bandE[i+c*m->nbEBands]);*/
}
......@@ -266,7 +266,7 @@ void denormalise_bands(const CELTMode *m, const celt_norm * OPUS_RESTRICT X,
/* This prevents energy collapse for transients with multiple short MDCTs */
void anti_collapse(const CELTMode *m, celt_norm *X_, unsigned char *collapse_masks, int LM, int C, int size,
int start, int end, const opus_val16 *logE, const opus_val16 *prev1logE,
const opus_val16 *prev2logE, const int *pulses, opus_uint32 seed)
const opus_val16 *prev2logE, const int *pulses, opus_uint32 seed, int arch)
{
int c, i, j, k;
for (i=start;i<end;i++)
......@@ -355,7 +355,7 @@ void anti_collapse(const CELTMode *m, celt_norm *X_, unsigned char *collapse_mas
}
/* We just added some energy, so we need to renormalise */
if (renormalize)
renormalise_vector(X, N0<<LM, Q15ONE);
renormalise_vector(X, N0<<LM, Q15ONE, arch);
} while (++c<C);
}
}
......@@ -656,6 +656,7 @@ struct band_ctx {
opus_int32 remaining_bits;
const celt_ener *bandE;
opus_uint32 seed;
int arch;
};
struct split_ctx {
......@@ -707,7 +708,7 @@ static void compute_theta(struct band_ctx *ctx, struct split_ctx *sctx,
side and mid. With just that parameter, we can re-scale both
mid and side because we know that 1) they have unit norm and
2) they are orthogonal. */
itheta = stereo_itheta(X, Y, stereo, N);
itheta = stereo_itheta(X, Y, stereo, N, ctx->arch);
}
tell = ec_tell_frac(ec);
if (qn!=1)
......@@ -1055,7 +1056,7 @@ static unsigned quant_partition(struct band_ctx *ctx, celt_norm *X,
}
cm = fill;
}
renormalise_vector(X, N, gain);
renormalise_vector(X, N, gain, ctx->arch);
}
}
}
......@@ -1360,9 +1361,11 @@ static unsigned quant_band_stereo(struct band_ctx *ctx, celt_norm *X, celt_norm
void quant_all_bands(int encode, const CELTMode *m, int start, int end,
celt_norm *X_, celt_norm *Y_, unsigned char *collapse_masks, const celt_ener *bandE, int *pulses,
int shortBlocks, int spread, int dual_stereo, int intensity, int *tf_res,
opus_int32 total_bits, opus_int32 balance, ec_ctx *ec, int LM, int codedBands, opus_uint32 *seed)
celt_norm *X_, celt_norm *Y_, unsigned char *collapse_masks,
const celt_ener *bandE, int *pulses, int shortBlocks, int spread,
int dual_stereo, int intensity, int *tf_res, opus_int32 total_bits,
opus_int32 balance, ec_ctx *ec, int LM, int codedBands,
opus_uint32 *seed, int arch)
{
int i;
opus_int32 remaining_bits;
......@@ -1404,6 +1407,7 @@ void quant_all_bands(int encode, const CELTMode *m, int start, int end,
ctx.m = m;
ctx.seed = *seed;
ctx.spread = spread;
ctx.arch = arch;
for (i=start;i<end;i++)
{
opus_int32 tell;
......
......@@ -98,15 +98,20 @@ void haar1(celt_norm *X, int N0, int stride);
* @param LM log2() of the number of 2.5 subframes in the frame
* @param codedBands Last band to receive bits + 1
* @param seed Random generator seed
* @param arch Run-time architecture (see opus_select_arch())
*/
void quant_all_bands(int encode, const CELTMode *m, int start, int end,
celt_norm * X, celt_norm * Y, unsigned char *collapse_masks, const celt_ener *bandE, int *pulses,
int shortBlocks, int spread, int dual_stereo, int intensity, int *tf_res,
opus_int32 total_bits, opus_int32 balance, ec_ctx *ec, int M, int codedBands, opus_uint32 *seed);
void anti_collapse(const CELTMode *m, celt_norm *X_, unsigned char *collapse_masks, int LM, int C, int size,
int start, int end, const opus_val16 *logE, const opus_val16 *prev1logE,
const opus_val16 *prev2logE, const int *pulses, opus_uint32 seed);
celt_norm * X, celt_norm * Y, unsigned char *collapse_masks,
const celt_ener *bandE, int *pulses, int shortBlocks, int spread,
int dual_stereo, int intensity, int *tf_res, opus_int32 total_bits,
opus_int32 balance, ec_ctx *ec, int M, int codedBands, opus_uint32 *seed,
int arch);
void anti_collapse(const CELTMode *m, celt_norm *X_,
unsigned char *collapse_masks, int LM, int C, int size, int start,
int end, const opus_val16 *logE, const opus_val16 *prev1logE,
const opus_val16 *prev2logE, const int *pulses, opus_uint32 seed,
int arch);
opus_uint32 celt_lcg_rand(opus_uint32 seed);
......
......@@ -499,7 +499,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
seed = celt_lcg_rand(seed);
X[boffs+j] = (celt_norm)((opus_int32)seed>>20);
}
renormalise_vector(X+boffs, blen, Q15ONE);
renormalise_vector(X+boffs, blen, Q15ONE, st->arch);
}
}
st->rng = seed;
......@@ -583,7 +583,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
}
/* Compute the excitation for exc_length samples before the loss. */
celt_fir(exc+MAX_PERIOD-exc_length, lpc+c*LPC_ORDER,
exc+MAX_PERIOD-exc_length, exc_length, LPC_ORDER, lpc_mem);
exc+MAX_PERIOD-exc_length, exc_length, LPC_ORDER, lpc_mem, st->arch);
}
/* Check if the waveform is decaying, and if so how fast.
......@@ -650,7 +650,7 @@ static void celt_decode_lost(CELTDecoder * OPUS_RESTRICT st, int N, int LM)
the signal domain. */
celt_iir(buf+DECODE_BUFFER_SIZE-N, lpc+c*LPC_ORDER,
buf+DECODE_BUFFER_SIZE-N, extrapolation_len, LPC_ORDER,
lpc_mem);
lpc_mem, st->arch);
}
/* Check if the synthesis energy is higher than expected, which can
......@@ -982,7 +982,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
quant_all_bands(0, mode, start, end, X, C==2 ? X+N : NULL, collapse_masks,
NULL, pulses, shortBlocks, spread_decision, dual_stereo, intensity, tf_res,
len*(8<<BITRES)-anti_collapse_rsv, balance, dec, LM, codedBands, &st->rng);
len*(8<<BITRES)-anti_collapse_rsv, balance, dec, LM, codedBands, &st->rng, st->arch);
if (anti_collapse_rsv > 0)
{
......@@ -994,7 +994,7 @@ int celt_decode_with_ec(CELTDecoder * OPUS_RESTRICT st, const unsigned char *dat
if (anti_collapse_on)
anti_collapse(mode, X, collapse_masks, LM, C, N,
start, end, oldBandE, oldLogE, oldLogE2, pulses, st->rng);
start, end, oldBandE, oldLogE, oldLogE2, pulses, st->rng, st->arch);
if (silence)
{
......
......@@ -751,7 +751,7 @@ static void tf_encode(int start, int end, int isTransient, int *tf_res, int LM,
static int alloc_trim_analysis(const CELTMode *m, const celt_norm *X,
const opus_val16 *bandLogE, int end, int LM, int C, int N0,
AnalysisInfo *analysis, opus_val16 *stereo_saving, opus_val16 tf_estimate,
int intensity, opus_val16 surround_trim)
int intensity, opus_val16 surround_trim, int arch)
{
int i;
opus_val32 diff=0;
......@@ -767,7 +767,8 @@ static int alloc_trim_analysis(const CELTMode *m, const celt_norm *X,
for (i=0;i<8;i++)
{
opus_val32 partial;
partial = celt_inner_prod(&X[m->eBands[i]<<LM], &X[N0+(m->eBands[i]<<LM)], (m->eBands[i+1]-m->eBands[i])<<LM);
partial = celt_inner_prod(&X[m->eBands[i]<<LM], &X[N0+(m->eBands[i]<<LM)],
(m->eBands[i+1]-m->eBands[i])<<LM, arch);
sum = ADD16(sum, EXTRACT16(SHR32(partial, 18)));
}
sum = MULT16_16_Q15(QCONST16(1.f/8, 15), sum);
......@@ -776,7 +777,8 @@ static int alloc_trim_analysis(const CELTMode *m, const celt_norm *X,
for (i=8;i<intensity;i++)
{
opus_val32 partial;
partial = celt_inner_prod(&X[m->eBands[i]<<LM], &X[N0+(m->eBands[i]<<LM)], (m->eBands[i+1]-m->eBands[i])<<LM);
partial = celt_inner_prod(&X[m->eBands[i]<<LM], &X[N0+(m->eBands[i]<<LM)],
(m->eBands[i+1]-m->eBands[i])<<LM, arch);
minXC = MIN16(minXC, ABS16(EXTRACT16(SHR32(partial, 18))));
}
minXC = MIN16(QCONST16(1.f, 10), ABS16(minXC));
......@@ -1097,7 +1099,7 @@ static int run_prefilter(CELTEncoder *st, celt_sig *in, celt_sig *prefilter_mem,
pitch_index = COMBFILTER_MAXPERIOD-pitch_index;
gain1 = remove_doubling(pitch_buf, COMBFILTER_MAXPERIOD, COMBFILTER_MINPERIOD,
N, &pitch_index, st->prefilter_period, st->prefilter_gain);
N, &pitch_index, st->prefilter_period, st->prefilter_gain, st->arch);
if (pitch_index > COMBFILTER_MAXPERIOD-2)
pitch_index = COMBFILTER_MAXPERIOD-2;
gain1 = MULT16_16_Q15(QCONST16(.7f,15),gain1);
......@@ -1887,7 +1889,8 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
alloc_trim = 5;
else
alloc_trim = alloc_trim_analysis(mode, X, bandLogE,
end, LM, C, N, &st->analysis, &st->stereo_saving, tf_estimate, st->intensity, surround_trim);
end, LM, C, N, &st->analysis, &st->stereo_saving, tf_estimate,
st->intensity, surround_trim, st->arch);
ec_enc_icdf(enc, alloc_trim, trim_icdf, 7);
tell = ec_tell_frac(enc);
}
......@@ -2022,8 +2025,9 @@ int celt_encode_with_ec(CELTEncoder * OPUS_RESTRICT st, const opus_val16 * pcm,
/* Residual quantisation */
ALLOC(collapse_masks, C*nbEBands, unsigned char);
quant_all_bands(1, mode, start, end, X, C==2 ? X+N : NULL, collapse_masks,
bandE, pulses, shortBlocks, st->spread_decision, dual_stereo, st->intensity, tf_res,
nbCompressedBytes*(8<<BITRES)-anti_collapse_rsv, balance, enc, LM, codedBands, &st->rng);
bandE, pulses, shortBlocks, st->spread_decision,
dual_stereo, st->intensity, tf_res, nbCompressedBytes*(8<<BITRES)-anti_collapse_rsv,
balance, enc, LM, codedBands, &st->rng, st->arch);
if (anti_collapse_rsv > 0)
{
......
......@@ -88,12 +88,15 @@ int p
#endif
}
void celt_fir(const opus_val16 *_x,
void celt_fir_c(
const opus_val16 *_x,
const opus_val16 *num,
opus_val16 *_y,
int N,
int ord,
opus_val16 *mem)
opus_val16 *mem,
int arch)
{
int i,j;
VARDECL(opus_val16, rnum);
......@@ -124,7 +127,7 @@ void celt_fir(const opus_val16 *_x,
for (i=0;i<N-3;i+=4)
{
opus_val32 sum[4]={0,0,0,0};
xcorr_kernel(rnum, x+i, sum, ord);
xcorr_kernel(rnum, x+i, sum, ord, arch);
_y[i ] = SATURATE16(ADD32(EXTEND32(_x[i ]), PSHR32(sum[0], SIG_SHIFT)));
_y[i+1] = SATURATE16(ADD32(EXTEND32(_x[i+1]), PSHR32(sum[1], SIG_SHIFT)));
_y[i+2] = SATURATE16(ADD32(EXTEND32(_x[i+2]), PSHR32(sum[2], SIG_SHIFT)));
......@@ -146,7 +149,8 @@ void celt_iir(const opus_val32 *_x,
opus_val32 *_y,
int N,
int ord,
opus_val16 *mem)
opus_val16 *mem,
int arch)
{
#ifdef SMALL_FOOTPRINT
int i,j;
......@@ -187,7 +191,7 @@ void celt_iir(const opus_val32 *_x,
sum[1]=_x[i+1];
sum[2]=_x[i+2];
sum[3]=_x[i+3];
xcorr_kernel(rden, y+i, sum, ord);
xcorr_kernel(rden, y+i, sum, ord, arch);
/* Patch up the result to compensate for the fact that this is an IIR */
y[i+ord ] = -ROUND16(sum[0],SIG_SHIFT);
......
......@@ -29,24 +29,37 @@
#define PLC_H
#include "arch.h"
#include "cpu_support.h"
#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
#include "x86/celt_lpc_sse.h"
#endif
#define LPC_ORDER 24
void _celt_lpc(opus_val16 *_lpc, const opus_val32 *ac, int p);
void celt_fir(const opus_val16 *x,
void celt_fir_c(
const opus_val16 *x,
const opus_val16 *num,
opus_val16 *y,
int N,
int ord,
opus_val16 *mem);
opus_val16 *mem,
int arch);
#if !defined(OPUS_X86_MAY_HAVE_SSE4_1)
#define celt_fir(x, num, y, N, ord, mem, arch) \
(celt_fir_c(x, num, y, N, ord, mem, arch))
#endif
void celt_iir(const opus_val32 *x,
const opus_val16 *den,
opus_val32 *y,
int N,
int ord,
opus_val16 *mem);
opus_val16 *mem,
int arch);
int _celt_autocorr(const opus_val16 *x, opus_val32 *ac,
const opus_val16 *window, int overlap, int lag, int n, int arch);
......
......@@ -42,6 +42,18 @@
*/
#define OPUS_ARCHMASK 3
#elif defined(OPUS_X86_MAY_HAVE_SSE2) || defined(OPUS_X86_MAY_HAVE_SSE4_1)
#include "x86/x86cpu.h"
/* We currently support 3 x86 variants:
* arch[0] -> non-sse
* arch[1] -> sse2
* arch[2] -> sse4.1
* arch[3] -> NULL
*/
#define OPUS_ARCHMASK 3
int opus_select_arch(void);
#else
#define OPUS_ARCHMASK 0
......@@ -50,5 +62,4 @@ static OPUS_INLINE int opus_select_arch(void)
return 0;
}
#endif
#endif
......@@ -98,7 +98,7 @@ static void ec_enc_carry_out(ec_enc *_this,int _c){
else _this->ext++;
}
static void ec_enc_normalize(ec_enc *_this){
static OPUS_INLINE void ec_enc_normalize(ec_enc *_this){
/*If the range is too small, output some bits and rescale it.*/
while(_this->rng<=EC_CODE_BOT){
ec_enc_carry_out(_this,(int)(_this->val>>EC_CODE_SHIFT));
......
......@@ -73,7 +73,11 @@ static void exp_rotation1(celt_norm *X, int len, int stride, opus_val16 c, opus_
}
#define OVERRIDE_renormalise_vector
void renormalise_vector(celt_norm *X, int N, opus_val16 gain)
#define renormalise_vector(X, N, gain, arch) \
((void)(arch), renormalize_vector_mips(x, N, gain))
void renormalise_vector_mips(celt_norm *X, int N, opus_val16 gain)
{
int i;
#ifdef FIXED_POINT
......
......@@ -250,7 +250,8 @@ opus_val32
#else
void
#endif
celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr, int len, int max_pitch)
celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y,
opus_val32 *xcorr, int len, int max_pitch, int arch)
{
int i;
/*The EDSP version requires that max_pitch is at least 1, and that _x is
......@@ -264,7 +265,7 @@ celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr
for (i=0;i<max_pitch-3;i+=4)
{
opus_val32 sum[4]={0,0,0,0};
xcorr_kernel(_x, _y+i, sum, len);
xcorr_kernel(_x, _y+i, sum, len, arch);
xcorr[i]=sum[0];
xcorr[i+1]=sum[1];
xcorr[i+2]=sum[2];
......@@ -280,7 +281,7 @@ celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y, opus_val32 *xcorr
for (;i<max_pitch;i++)
{
opus_val32 sum;
sum = celt_inner_prod(_x, _y+i, len);
sum = celt_inner_prod(_x, _y+i, len, arch);
xcorr[i] = sum;
#ifdef FIXED_POINT
maxcorr = MAX32(maxcorr, sum);
......@@ -369,7 +370,7 @@ void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTR
for (j=0;j<len>>1;j++)
sum += SHR32(MULT16_16(x_lp[j],y[i+j]), shift);
#else
sum = celt_inner_prod(x_lp, y+i, len>>1);
sum = celt_inner_prod_c(x_lp, y+i, len>>1);
#endif
xcorr[i] = MAX32(-1, sum);
#ifdef FIXED_POINT
......@@ -405,7 +406,7 @@ void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTR
static const int second_check[16] = {0, 0, 3, 2, 3, 2, 5, 2, 3, 2, 3, 2, 5, 2, 3, 2};
opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
int N, int *T0_, int prev_period, opus_val16 prev_gain)
int N, int *T0_, int prev_period, opus_val16 prev_gain, int arch)
{
int k, i, T, T0;
opus_val16 g, g0;
......@@ -517,7 +518,7 @@ opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
pg = SHR32(frac_div32(best_xy,best_yy+1),16);
for (k=0;k<3;k++)
xcorr[k] = celt_inner_prod(x, x-(T+k-1), N);
xcorr[k] = celt_inner_prod(x, x-(T+k-1), N, arch);
if ((xcorr[2]-xcorr[0]) > MULT16_32_Q15(QCONST16(.7f,15),xcorr[1]-xcorr[0]))
offset = 1;
else if ((xcorr[0]-xcorr[2]) > MULT16_32_Q15(QCONST16(.7f,15),xcorr[1]-xcorr[2]))
......
......@@ -37,7 +37,8 @@
#include "modes.h"
#include "cpu_support.h"
#if defined(__SSE__) && !defined(FIXED_POINT)
#if defined(__SSE__) && !defined(FIXED_POINT) \
|| defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)
#include "x86/pitch_sse.h"
#endif
......@@ -56,12 +57,13 @@ void pitch_search(const opus_val16 * OPUS_RESTRICT x_lp, opus_val16 * OPUS_RESTR
int len, int max_pitch, int *pitch, int arch);
opus_val16 remove_doubling(opus_val16 *x, int maxperiod, int minperiod,
int N, int *T0, int prev_period, opus_val16 prev_gain);
int N, int *T0, int prev_period, opus_val16 prev_gain, int arch);
/* OPT: This is the kernel you really want to optimize. It gets used a lot
by the prefilter and by the PLC. */
#ifndef OVERRIDE_XCORR_KERNEL
static OPUS_INLINE void xcorr_kernel(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len)
static OPUS_INLINE void xcorr_kernel_c(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len)
{
int j;
opus_val16 y_0, y_1, y_2, y_3;
......@@ -126,8 +128,15 @@ static OPUS_INLINE void xcorr_kernel(const opus_val16 * x, const opus_val16 * y,
sum[3] = MAC16_16(sum[3],tmp,y_1);
}
}
#if !defined(OPUS_X86_MAY_HAVE_SSE4_1)
#define xcorr_kernel(x, y, sum, len, arch) \
((void)(arch),xcorr_kernel_c(x, y, sum, len))
#endif
#endif /* OVERRIDE_XCORR_KERNEL */
#ifndef OVERRIDE_DUAL_INNER_PROD
static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
int N, opus_val32 *xy1, opus_val32 *xy2)
......@@ -145,9 +154,10 @@ static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y
}
#endif
#ifndef OVERRIDE_CELT_INNER_PROD
static OPUS_INLINE opus_val32 celt_inner_prod(const opus_val16 *x, const opus_val16 *y,
int N)
/*We make sure a C version is always available for cases where the overhead of
vectorization and passing around an arch flag aren't worth it.*/
static OPUS_INLINE opus_val32 celt_inner_prod_c(const opus_val16 *x,
const opus_val16 *y, int N)
{
int i;
opus_val32 xy=0;
......@@ -155,6 +165,10 @@ static OPUS_INLINE opus_val32 celt_inner_prod(const opus_val16 *x, const opus_va
xy = MAC16_16(xy, x[i], y[i]);
return xy;
}
#if !defined(OVERRIDE_CELT_INNER_PROD)
# define celt_inner_prod(x, y, N, arch) \
((void)(arch),celt_inner_prod_c(x, y, N))
#endif
#ifdef FIXED_POINT
......@@ -163,11 +177,11 @@ opus_val32
void
#endif
celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y,
opus_val32 *xcorr, int len, int max_pitch);
opus_val32 *xcorr, int len, int max_pitch, int arch);
#if !defined(OVERRIDE_PITCH_XCORR)
/*Is run-time CPU detection enabled on this platform?*/
# if defined(OPUS_HAVE_RTCD)
# if defined(OPUS_HAVE_RTCD) && defined(OPUS_ARM_ASM)
extern
# if defined(FIXED_POINT)
opus_val32
......@@ -179,10 +193,10 @@ void
# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
xcorr, len, max_pitch))
xcorr, len, max_pitch, arch))
# else
# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
((void)(arch),celt_pitch_xcorr_c(_x, _y, xcorr, len, max_pitch))
((void)(arch),celt_pitch_xcorr_c(_x, _y, xcorr, len, max_pitch, arch))
# endif
#endif
......
......@@ -36,6 +36,8 @@
#define CELT_C
#include <stdio.h>
#include <math.h>
#include "mathops.c"
#include "entenc.c"
#include "entdec.c"
......@@ -45,8 +47,16 @@
#include "laplace.c"
#include "vq.c"
#include "cwrs.c"
#include <stdio.h>
#include <math.h>
#include "pitch.c"
#include "celt_lpc.c"
#if defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)
#include "x86/pitch_sse.c"
#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
#include "x86/celt_lpc_sse.c"
#endif
#include "x86/x86_celt_map.c"
#endif
#ifdef FIXED_POINT
#define WORD "%d"
......
......@@ -44,7 +44,18 @@
#include "entdec.c"
#include "mathops.c"
#include "bands.h"
#include "pitch.c"
#include "celt_lpc.c"
#include <math.h>
#if defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)
#include "x86/pitch_sse.c"
#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
#include "x86/celt_lpc_sse.c"
#endif
#include "x86/x86_celt_map.c"
#endif
#define MAX_SIZE 100
int ret=0;
......
......@@ -350,7 +350,7 @@ unsigned alg_unquant(celt_norm *X, int N, int K, int spread, int B,
}
#ifndef OVERRIDE_renormalise_vector
void renormalise_vector(celt_norm *X, int N, opus_val16 gain)
void renormalise_vector(celt_norm *X, int N, opus_val16 gain, int arch)
{
int i;
#ifdef FIXED_POINT
......@@ -360,7 +360,7 @@ void renormalise_vector(celt_norm *X, int N, opus_val16 gain)
opus_val16 g;
opus_val32 t;
celt_norm *xptr;
E = EPSILON + celt_inner_prod(X, X, N);
E = EPSILON + celt_inner_prod(X, X, N, arch);
#ifdef FIXED_POINT
k = celt_ilog2(E)>>1;
#endif
......@@ -377,7 +377,7 @@ void renormalise_vector(celt_norm *X, int N, opus_val16 gain)
}
#endif /* OVERRIDE_renormalise_vector */
int stereo_itheta(const celt_norm *X, const celt_norm *Y, int stereo, int N)
int stereo_itheta(const celt_norm *X, const celt_norm *Y, int stereo, int N, int arch)
{
int i;
int itheta;
......@@ -396,8 +396,8 @@ int stereo_itheta(const celt_norm *X, const celt_norm *Y, int stereo, int N)
Eside = MAC16_16(Eside, s, s);
}
} else {
Emid += celt_inner_prod(X, X, N);
Eside += celt_inner_prod(Y, Y, N);
Emid += celt_inner_prod(X, X, N, arch);
Eside += celt_inner_prod(Y, Y, N, arch);
}
mid = celt_sqrt(Emid);
side = celt_sqrt(Eside);
......
......@@ -63,8 +63,8 @@ unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B,
unsigned alg_unquant(celt_norm *X, int N, int K, int spread, int B,
ec_dec *dec, opus_val16 gain);
void renormalise_vector(celt_norm *X, int N, opus_val16 gain);
void renormalise_vector(celt_norm *X, int N, opus_val16 gain, int arch);
int stereo_itheta(const celt_norm *X, const celt_norm *Y, int stereo, int N);
int stereo_itheta(const celt_norm *X, const celt_norm *Y, int stereo, int N, int arch);