Commit 7017f550 authored by Jonathan Lennox's avatar Jonathan Lennox Committed by Timothy B. Terriberry
Browse files

Add Neon fixed-point implementation of xcorr_kernel.



Used for celt_pitch_xcorr on aarch64, and celt_fir and celt_iir on both armv7 and aarch64.
Signed-off-by: Timothy B. Terriberry's avatarTimothy B. Terriberry <tterribe@xiph.org>
parent 7f1c6fa6
......@@ -60,6 +60,23 @@ void (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
# endif
# endif /* FIXED_POINT */
#if defined(FIXED_POINT) && defined(OPUS_HAVE_RTCD) && \
defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
const opus_val16 *x,
const opus_val16 *y,
opus_val32 sum[4],
int len
) = {
xcorr_kernel_c, /* ARMv4 */
xcorr_kernel_c, /* EDSP */
xcorr_kernel_c, /* Media */
xcorr_kernel_neon_fixed, /* Neon */
};
#endif
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
# if defined(HAVE_ARM_NE10)
# if defined(CUSTOM_MODES)
......
......@@ -37,7 +37,66 @@
#include <arm_neon.h>
#include "../pitch.h"
#if !defined(FIXED_POINT)
#if defined(FIXED_POINT)
void xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len)
{
int j;
int32x4_t a = vld1q_s32(sum);
/* Load y[0...3] */
/* This requires len>0 to always be valid (which we assert in the C code). */
int16x4_t y0 = vld1_s16(y);
y += 4;
for (j = 0; j + 8 <= len; j += 8)
{
/* Load x[0...7] */
int16x8_t xx = vld1q_s16(x);
int16x4_t x0 = vget_low_s16(xx);
int16x4_t x4 = vget_high_s16(xx);
/* Load y[4...11] */
int16x8_t yy = vld1q_s16(y);
int16x4_t y4 = vget_low_s16(yy);
int16x4_t y8 = vget_high_s16(yy);
int32x4_t a0 = vmlal_lane_s16(a, y0, x0, 0);
int32x4_t a1 = vmlal_lane_s16(a0, y4, x4, 0);
int16x4_t y1 = vext_s16(y0, y4, 1);
int16x4_t y5 = vext_s16(y4, y8, 1);
int32x4_t a2 = vmlal_lane_s16(a1, y1, x0, 1);
int32x4_t a3 = vmlal_lane_s16(a2, y5, x4, 1);
int16x4_t y2 = vext_s16(y0, y4, 2);
int16x4_t y6 = vext_s16(y4, y8, 2);
int32x4_t a4 = vmlal_lane_s16(a3, y2, x0, 2);
int32x4_t a5 = vmlal_lane_s16(a4, y6, x4, 2);
int16x4_t y3 = vext_s16(y0, y4, 3);
int16x4_t y7 = vext_s16(y4, y8, 3);
int32x4_t a6 = vmlal_lane_s16(a5, y3, x0, 3);
int32x4_t a7 = vmlal_lane_s16(a6, y7, x4, 3);
y0 = y8;
a = a7;
x += 8;
y += 8;
}
for (; j < len; j++)
{
int16x4_t x0 = vld1_dup_s16(x); /* load next x */
int32x4_t a0 = vmlal_s16(a, y0, x0);
int16x4_t y4 = vld1_dup_s16(y); /* load next y */
y0 = vext_s16(y0, y4, 1);
a = a0;
x++;
y++;
}
vst1q_s32(sum, a);
}
#else
/*
* Function: xcorr_kernel_neon_float
* ---------------------------------
......
......@@ -64,6 +64,35 @@ extern opus_val32
# define OVERRIDE_PITCH_XCORR (1)
# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
((void)(arch),PRESUME_NEON(celt_pitch_xcorr)(_x, _y, xcorr, len, max_pitch))
# endif
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
void xcorr_kernel_neon_fixed(
const opus_val16 *x,
const opus_val16 *y,
opus_val32 sum[4],
int len);
# endif
# if defined(OPUS_HAVE_RTCD) && \
(defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
const opus_val16 *x,
const opus_val16 *y,
opus_val32 sum[4],
int len);
# define OVERRIDE_XCORR_KERNEL (1)
# define xcorr_kernel(x, y, sum, len, arch) \
((*XCORR_KERNEL_IMPL[(arch) & OPUS_ARCHMASK])(x, y, sum, len))
# elif defined(OPUS_ARM_PRESUME_NEON_INTR)
# define OVERRIDE_XCORR_KERNEL (1)
# define xcorr_kernel(x, y, sum, len, arch) \
((void)arch, xcorr_kernel_neon_fixed(x, y, sum, len))
# endif
#else /* Start !FIXED_POINT */
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment