From 7017f550931ca47c1aae597623425c5b90a5aeef Mon Sep 17 00:00:00 2001 From: Jonathan Lennox <jonathan@vidyo.com> Date: Tue, 22 Dec 2015 19:21:47 -0500 Subject: [PATCH] Add Neon fixed-point implementation of xcorr_kernel. Used for celt_pitch_xcorr on aarch64, and celt_fir and celt_iir on both armv7 and aarch64. Signed-off-by: Timothy B. Terriberry <tterribe@xiph.org> --- celt/arm/arm_celt_map.c | 17 +++++++++++ celt/arm/celt_neon_intr.c | 61 ++++++++++++++++++++++++++++++++++++++- celt/arm/pitch_arm.h | 29 +++++++++++++++++++ 3 files changed, 106 insertions(+), 1 deletion(-) diff --git a/celt/arm/arm_celt_map.c b/celt/arm/arm_celt_map.c index f692c739a..4d4d069a8 100644 --- a/celt/arm/arm_celt_map.c +++ b/celt/arm/arm_celt_map.c @@ -60,6 +60,23 @@ void (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *, # endif # endif /* FIXED_POINT */ +#if defined(FIXED_POINT) && defined(OPUS_HAVE_RTCD) && \ + defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR) + +void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])( + const opus_val16 *x, + const opus_val16 *y, + opus_val32 sum[4], + int len +) = { + xcorr_kernel_c, /* ARMv4 */ + xcorr_kernel_c, /* EDSP */ + xcorr_kernel_c, /* Media */ + xcorr_kernel_neon_fixed, /* Neon */ +}; + +#endif + # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) # if defined(HAVE_ARM_NE10) # if defined(CUSTOM_MODES) diff --git a/celt/arm/celt_neon_intr.c b/celt/arm/celt_neon_intr.c index 47dce15ba..47bbe3dc2 100644 --- a/celt/arm/celt_neon_intr.c +++ b/celt/arm/celt_neon_intr.c @@ -37,7 +37,66 @@ #include <arm_neon.h> #include "../pitch.h" -#if !defined(FIXED_POINT) +#if defined(FIXED_POINT) +void xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len) +{ + int j; + int32x4_t a = vld1q_s32(sum); + /* Load y[0...3] */ + /* This requires len>0 to always be valid (which we assert in the C code). */ + int16x4_t y0 = vld1_s16(y); + y += 4; + + for (j = 0; j + 8 <= len; j += 8) + { + /* Load x[0...7] */ + int16x8_t xx = vld1q_s16(x); + int16x4_t x0 = vget_low_s16(xx); + int16x4_t x4 = vget_high_s16(xx); + /* Load y[4...11] */ + int16x8_t yy = vld1q_s16(y); + int16x4_t y4 = vget_low_s16(yy); + int16x4_t y8 = vget_high_s16(yy); + int32x4_t a0 = vmlal_lane_s16(a, y0, x0, 0); + int32x4_t a1 = vmlal_lane_s16(a0, y4, x4, 0); + + int16x4_t y1 = vext_s16(y0, y4, 1); + int16x4_t y5 = vext_s16(y4, y8, 1); + int32x4_t a2 = vmlal_lane_s16(a1, y1, x0, 1); + int32x4_t a3 = vmlal_lane_s16(a2, y5, x4, 1); + + int16x4_t y2 = vext_s16(y0, y4, 2); + int16x4_t y6 = vext_s16(y4, y8, 2); + int32x4_t a4 = vmlal_lane_s16(a3, y2, x0, 2); + int32x4_t a5 = vmlal_lane_s16(a4, y6, x4, 2); + + int16x4_t y3 = vext_s16(y0, y4, 3); + int16x4_t y7 = vext_s16(y4, y8, 3); + int32x4_t a6 = vmlal_lane_s16(a5, y3, x0, 3); + int32x4_t a7 = vmlal_lane_s16(a6, y7, x4, 3); + + y0 = y8; + a = a7; + x += 8; + y += 8; + } + + for (; j < len; j++) + { + int16x4_t x0 = vld1_dup_s16(x); /* load next x */ + int32x4_t a0 = vmlal_s16(a, y0, x0); + + int16x4_t y4 = vld1_dup_s16(y); /* load next y */ + y0 = vext_s16(y0, y4, 1); + a = a0; + x++; + y++; + } + + vst1q_s32(sum, a); +} + +#else /* * Function: xcorr_kernel_neon_float * --------------------------------- diff --git a/celt/arm/pitch_arm.h b/celt/arm/pitch_arm.h index d270b988a..14331169e 100644 --- a/celt/arm/pitch_arm.h +++ b/celt/arm/pitch_arm.h @@ -64,6 +64,35 @@ extern opus_val32 # define OVERRIDE_PITCH_XCORR (1) # define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \ ((void)(arch),PRESUME_NEON(celt_pitch_xcorr)(_x, _y, xcorr, len, max_pitch)) + +# endif + +# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) +void xcorr_kernel_neon_fixed( + const opus_val16 *x, + const opus_val16 *y, + opus_val32 sum[4], + int len); +# endif + +# if defined(OPUS_HAVE_RTCD) && \ + (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)) + +extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])( + const opus_val16 *x, + const opus_val16 *y, + opus_val32 sum[4], + int len); + +# define OVERRIDE_XCORR_KERNEL (1) +# define xcorr_kernel(x, y, sum, len, arch) \ + ((*XCORR_KERNEL_IMPL[(arch) & OPUS_ARCHMASK])(x, y, sum, len)) + +# elif defined(OPUS_ARM_PRESUME_NEON_INTR) +# define OVERRIDE_XCORR_KERNEL (1) +# define xcorr_kernel(x, y, sum, len, arch) \ + ((void)arch, xcorr_kernel_neon_fixed(x, y, sum, len)) + # endif #else /* Start !FIXED_POINT */ -- GitLab