Unverified Commit 4fe9de3b authored by Linfeng Zhang's avatar Linfeng Zhang Committed by Jean-Marc Valin

Optimize floating-point celt_inner_prod() and dual_inner_prod() for ARM NEON

The floating-point optimizations are not bit exact with C functions,
because of the different orders of floating-point operations.
But they are bit exact with the simulation C functions which simulate
the floating operations in the optimizations.

Change-Id: I149fda5b602fd5712b16fc8983df3c6c0c9e76ad
Signed-off-by: Jean-Marc Valin's avatarJean-Marc Valin <jmvalin@jmvalin.ca>
parent 4d4ed208
......@@ -35,8 +35,6 @@
#if defined(OPUS_HAVE_RTCD)
# if defined(FIXED_POINT)
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y, int N) = {
celt_inner_prod_c, /* ARMv4 */
......@@ -54,6 +52,7 @@ void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const o
};
# endif
# if defined(FIXED_POINT)
# if ((defined(OPUS_ARM_MAY_HAVE_NEON) && !defined(OPUS_ARM_PRESUME_NEON)) || \
(defined(OPUS_ARM_MAY_HAVE_MEDIA) && !defined(OPUS_ARM_PRESUME_MEDIA)) || \
(defined(OPUS_ARM_MAY_HAVE_EDSP) && !defined(OPUS_ARM_PRESUME_EDSP)))
......
......@@ -30,8 +30,6 @@
# include "armcpu.h"
# if defined(FIXED_POINT)
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N);
void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01,
......@@ -68,6 +66,8 @@ extern void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x,
# endif
# endif
# if defined(FIXED_POINT)
# if defined(OPUS_ARM_MAY_HAVE_NEON)
opus_val32 celt_pitch_xcorr_neon(const opus_val16 *_x, const opus_val16 *_y,
opus_val32 *xcorr, int len, int max_pitch, int arch);
......
......@@ -126,4 +126,165 @@ void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus
#endif
}
#else /* !FIXED_POINT */
/* ========================================================================== */
#ifdef OPUS_CHECK_ASM
/* This part of code simulates floating-point NEON operations. */
/* celt_inner_prod_neon_float_c_simulation() simulates the floating-point */
/* operations of celt_inner_prod_neon(), and both functions should have bit */
/* exact output. */
static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y, int N)
{
int i;
opus_val32 xy, xy0 = 0, xy1 = 0, xy2 = 0, xy3 = 0;
for (i = 0; i < N - 3; i += 4) {
xy0 = MAC16_16(xy0, x[i + 0], y[i + 0]);
xy1 = MAC16_16(xy1, x[i + 1], y[i + 1]);
xy2 = MAC16_16(xy2, x[i + 2], y[i + 2]);
xy3 = MAC16_16(xy3, x[i + 3], y[i + 3]);
}
xy0 += xy2;
xy1 += xy3;
xy = xy0 + xy1;
for (; i < N; i++) {
xy = MAC16_16(xy, x[i], y[i]);
}
return xy;
}
/* dual_inner_prod_neon_float_c_simulation() simulates the floating-point */
/* operations of dual_inner_prod_neon(), and both functions should have bit */
/* exact output. */
static void dual_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
int N, opus_val32 *xy1, opus_val32 *xy2)
{
int i;
opus_val32 xy01, xy02, xy01_0 = 0, xy01_1 = 0, xy01_2 = 0, xy01_3 = 0, xy02_0 = 0, xy02_1 = 0, xy02_2 = 0, xy02_3 = 0;
for (i = 0; i < N - 3; i += 4) {
xy01_0 = MAC16_16(xy01_0, x[i + 0], y01[i + 0]);
xy01_1 = MAC16_16(xy01_1, x[i + 1], y01[i + 1]);
xy01_2 = MAC16_16(xy01_2, x[i + 2], y01[i + 2]);
xy01_3 = MAC16_16(xy01_3, x[i + 3], y01[i + 3]);
xy02_0 = MAC16_16(xy02_0, x[i + 0], y02[i + 0]);
xy02_1 = MAC16_16(xy02_1, x[i + 1], y02[i + 1]);
xy02_2 = MAC16_16(xy02_2, x[i + 2], y02[i + 2]);
xy02_3 = MAC16_16(xy02_3, x[i + 3], y02[i + 3]);
}
xy01_0 += xy01_2;
xy02_0 += xy02_2;
xy01_1 += xy01_3;
xy02_1 += xy02_3;
xy01 = xy01_0 + xy01_1;
xy02 = xy02_0 + xy02_1;
for (; i < N; i++) {
xy01 = MAC16_16(xy01, x[i], y01[i]);
xy02 = MAC16_16(xy02, x[i], y02[i]);
}
*xy1 = xy01;
*xy2 = xy02;
}
#endif /* OPUS_CHECK_ASM */
/* ========================================================================== */
opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
{
int i;
opus_val32 xy;
float32x4_t xy_f32x4 = vdupq_n_f32(0);
float32x2_t xy_f32x2;
for (i = 0; i < N - 7; i += 8) {
float32x4_t x_f32x4, y_f32x4;
x_f32x4 = vld1q_f32(&x[i]);
y_f32x4 = vld1q_f32(&y[i]);
xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
x_f32x4 = vld1q_f32(&x[i + 4]);
y_f32x4 = vld1q_f32(&y[i + 4]);
xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
}
if (N - i >= 4) {
const float32x4_t x_f32x4 = vld1q_f32(&x[i]);
const float32x4_t y_f32x4 = vld1q_f32(&y[i]);
xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
i += 4;
}
xy_f32x2 = vadd_f32(vget_low_f32(xy_f32x4), vget_high_f32(xy_f32x4));
xy_f32x2 = vpadd_f32(xy_f32x2, xy_f32x2);
xy = vget_lane_f32(xy_f32x2, 0);
for (; i < N; i++) {
xy = MAC16_16(xy, x[i], y[i]);
}
#ifdef OPUS_CHECK_ASM
celt_assert(ABS32(celt_inner_prod_neon_float_c_simulation(x, y, N) - xy) <= VERY_SMALL);
#endif
return xy;
}
void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
int N, opus_val32 *xy1, opus_val32 *xy2)
{
int i;
opus_val32 xy01, xy02;
float32x4_t xy01_f32x4 = vdupq_n_f32(0);
float32x4_t xy02_f32x4 = vdupq_n_f32(0);
float32x2_t xy01_f32x2, xy02_f32x2;
for (i = 0; i < N - 7; i += 8) {
float32x4_t x_f32x4, y01_f32x4, y02_f32x4;
x_f32x4 = vld1q_f32(&x[i]);
y01_f32x4 = vld1q_f32(&y01[i]);
y02_f32x4 = vld1q_f32(&y02[i]);
xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
x_f32x4 = vld1q_f32(&x[i + 4]);
y01_f32x4 = vld1q_f32(&y01[i + 4]);
y02_f32x4 = vld1q_f32(&y02[i + 4]);
xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
}
if (N - i >= 4) {
const float32x4_t x_f32x4 = vld1q_f32(&x[i]);
const float32x4_t y01_f32x4 = vld1q_f32(&y01[i]);
const float32x4_t y02_f32x4 = vld1q_f32(&y02[i]);
xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
i += 4;
}
xy01_f32x2 = vadd_f32(vget_low_f32(xy01_f32x4), vget_high_f32(xy01_f32x4));
xy02_f32x2 = vadd_f32(vget_low_f32(xy02_f32x4), vget_high_f32(xy02_f32x4));
xy01_f32x2 = vpadd_f32(xy01_f32x2, xy01_f32x2);
xy02_f32x2 = vpadd_f32(xy02_f32x2, xy02_f32x2);
xy01 = vget_lane_f32(xy01_f32x2, 0);
xy02 = vget_lane_f32(xy02_f32x2, 0);
for (; i < N; i++) {
xy01 = MAC16_16(xy01, x[i], y01[i]);
xy02 = MAC16_16(xy02, x[i], y02[i]);
}
*xy1 = xy01;
*xy2 = xy02;
#ifdef OPUS_CHECK_ASM
{
opus_val32 xy1_c, xy2_c;
dual_inner_prod_neon_float_c_simulation(x, y01, y02, N, &xy1_c, &xy2_c);
celt_assert(ABS32(xy1_c - *xy1) <= VERY_SMALL);
celt_assert(ABS32(xy2_c - *xy2) <= VERY_SMALL);
}
#endif
}
#endif /* FIXED_POINT */
......@@ -46,8 +46,7 @@
#include "mips/pitch_mipsr1.h"
#endif
#if ((defined(OPUS_ARM_ASM) && defined(FIXED_POINT)) \
|| defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
#if (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
# include "arm/pitch_arm.h"
#endif
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment