Commit 0b0a2b4d authored by Viswanath Puttagunta's avatar Viswanath Puttagunta Committed by Timothy B. Terriberry
Browse files

armv7: celt_pitch_xcorr: Introduce ARM NEON intrinsics



Optimize celt_pitch_xcorr function (for floating point)
using ARM NEON intrinsics for SoCs that have NEON VFP unit.

To enable this optimization, use --enable-intrinsics
configure option.

Compile time and runtime checks are also supported to make sure
this optimization is only enabled when the compiler supports
NEON intrinsics.
Signed-off-by: Timothy B. Terriberry's avatarTimothy B. Terriberry <tterribe@xiph.org>
parent 5b712da9
......@@ -41,6 +41,12 @@ endif
if CPU_ARM
CELT_SOURCES += $(CELT_SOURCES_ARM)
SILK_SOURCES += $(SILK_SOURCES_ARM)
if OPUS_ARM_NEON_INTR
CELT_SOURCES += $(CELT_SOURCES_ARM_NEON_INTR)
OPUS_ARM_NEON_INTR_CPPFLAGS = -mfpu=neon
endif
if OPUS_ARM_EXTERNAL_ASM
nodist_libopus_la_SOURCES = $(CELT_SOURCES_ARM_ASM:.s=-gnu.S)
BUILT_SOURCES = $(CELT_SOURCES_ARM_ASM:.s=-gnu.S) \
......@@ -260,3 +266,9 @@ if HAVE_SSE2
$(SSE_OBJ): CFLAGS += -msse2
endif
endif
if OPUS_ARM_NEON_INTR
CELT_ARM_NEON_INTR_OBJ = $(CELT_SOURCES_ARM_NEON_INTR:.c=.lo) \
%test_unit_rotation.o %test_unit_mathops.o
$(CELT_ARM_NEON_INTR_OBJ): CFLAGS += $(OPUS_ARM_NEON_INTR_CPPFLAGS)
endif
......@@ -41,9 +41,16 @@ opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
MAY_HAVE_MEDIA(celt_pitch_xcorr), /* Media */
MAY_HAVE_NEON(celt_pitch_xcorr) /* NEON */
};
# else
# error "Floating-point implementation is not supported by ARM asm yet." \
"Reconfigure with --disable-rtcd or send patches."
# else /* !FIXED_POINT */
# if defined(OPUS_ARM_NEON_INTR)
void (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
const opus_val16 *, opus_val32 *, int, int) = {
celt_pitch_xcorr_c, /* ARMv4 */
celt_pitch_xcorr_c, /* EDSP */
celt_pitch_xcorr_c, /* Media */
celt_pitch_xcorr_float_neon /* Neon */
};
# endif
# endif
#endif
/* Copyright (c) 2014-2015 Xiph.Org Foundation
Written by Viswanath Puttagunta */
/**
@file celt_neon_intr.c
@brief ARM Neon Intrinsic optimizations for celt
*/
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <arm_neon.h>
#include "../pitch.h"
/*
* Function: xcorr_kernel_neon_float
* ---------------------------------
* Computes 4 correlation values and stores them in sum[4]
*/
static void xcorr_kernel_neon_float(const float32_t *x, const float32_t *y,
float32_t sum[4], int len) {
float32x4_t YY[3];
float32x4_t YEXT[3];
float32x4_t XX[2];
float32x2_t XX_2;
float32x4_t SUMM;
const float32_t *xi = x;
const float32_t *yi = y;
celt_assert(len>0);
YY[0] = vld1q_f32(yi);
SUMM = vdupq_n_f32(0);
/* Consume 8 elements in x vector and 12 elements in y
* vector. However, the 12'th element never really gets
* touched in this loop. So, if len == 8, then we only
* must access y[0] to y[10]. y[11] must not be accessed
* hence make sure len > 8 and not len >= 8
*/
while (len > 8) {
yi += 4;
YY[1] = vld1q_f32(yi);
yi += 4;
YY[2] = vld1q_f32(yi);
XX[0] = vld1q_f32(xi);
xi += 4;
XX[1] = vld1q_f32(xi);
xi += 4;
SUMM = vmlaq_lane_f32(SUMM, YY[0], vget_low_f32(XX[0]), 0);
YEXT[0] = vextq_f32(YY[0], YY[1], 1);
SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[0]), 1);
YEXT[1] = vextq_f32(YY[0], YY[1], 2);
SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[0]), 0);
YEXT[2] = vextq_f32(YY[0], YY[1], 3);
SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[0]), 1);
SUMM = vmlaq_lane_f32(SUMM, YY[1], vget_low_f32(XX[1]), 0);
YEXT[0] = vextq_f32(YY[1], YY[2], 1);
SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[1]), 1);
YEXT[1] = vextq_f32(YY[1], YY[2], 2);
SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[1]), 0);
YEXT[2] = vextq_f32(YY[1], YY[2], 3);
SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[1]), 1);
YY[0] = YY[2];
len -= 8;
}
/* Consume 4 elements in x vector and 8 elements in y
* vector. However, the 8'th element in y never really gets
* touched in this loop. So, if len == 4, then we only
* must access y[0] to y[6]. y[7] must not be accessed
* hence make sure len>4 and not len>=4
*/
if (len > 4) {
yi += 4;
YY[1] = vld1q_f32(yi);
XX[0] = vld1q_f32(xi);
xi += 4;
SUMM = vmlaq_lane_f32(SUMM, YY[0], vget_low_f32(XX[0]), 0);
YEXT[0] = vextq_f32(YY[0], YY[1], 1);
SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[0]), 1);
YEXT[1] = vextq_f32(YY[0], YY[1], 2);
SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[0]), 0);
YEXT[2] = vextq_f32(YY[0], YY[1], 3);
SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[0]), 1);
YY[0] = YY[1];
len -= 4;
}
while (--len > 0) {
XX_2 = vld1_dup_f32(xi++);
SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0);
YY[0]= vld1q_f32(++yi);
}
XX_2 = vld1_dup_f32(xi);
SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0);
vst1q_f32(sum, SUMM);
}
/*
* Function: xcorr_kernel_neon_float_process1
* ---------------------------------
* Computes single correlation values and stores in *sum
*/
static void xcorr_kernel_neon_float_process1(const float32_t *x,
const float32_t *y, float32_t *sum, int len) {
float32x4_t XX[4];
float32x4_t YY[4];
float32x2_t XX_2;
float32x2_t YY_2;
float32x4_t SUMM;
float32x2_t SUMM_2[2];
const float32_t *xi = x;
const float32_t *yi = y;
SUMM = vdupq_n_f32(0);
/* Work on 16 values per iteration */
while (len >= 16) {
XX[0] = vld1q_f32(xi);
xi += 4;
XX[1] = vld1q_f32(xi);
xi += 4;
XX[2] = vld1q_f32(xi);
xi += 4;
XX[3] = vld1q_f32(xi);
xi += 4;
YY[0] = vld1q_f32(yi);
yi += 4;
YY[1] = vld1q_f32(yi);
yi += 4;
YY[2] = vld1q_f32(yi);
yi += 4;
YY[3] = vld1q_f32(yi);
yi += 4;
SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
SUMM = vmlaq_f32(SUMM, YY[1], XX[1]);
SUMM = vmlaq_f32(SUMM, YY[2], XX[2]);
SUMM = vmlaq_f32(SUMM, YY[3], XX[3]);
len -= 16;
}
/* Work on 8 values */
if (len >= 8) {
XX[0] = vld1q_f32(xi);
xi += 4;
XX[1] = vld1q_f32(xi);
xi += 4;
YY[0] = vld1q_f32(yi);
yi += 4;
YY[1] = vld1q_f32(yi);
yi += 4;
SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
SUMM = vmlaq_f32(SUMM, YY[1], XX[1]);
len -= 8;
}
/* Work on 4 values */
if (len >= 4) {
XX[0] = vld1q_f32(xi);
xi += 4;
YY[0] = vld1q_f32(yi);
yi += 4;
SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
len -= 4;
}
/* Start accumulating results */
SUMM_2[0] = vget_low_f32(SUMM);
if (len >= 2) {
/* While at it, consume 2 more values if available */
XX_2 = vld1_f32(xi);
xi += 2;
YY_2 = vld1_f32(yi);
yi += 2;
SUMM_2[0] = vmla_f32(SUMM_2[0], YY_2, XX_2);
len -= 2;
}
SUMM_2[1] = vget_high_f32(SUMM);
SUMM_2[0] = vadd_f32(SUMM_2[0], SUMM_2[1]);
SUMM_2[0] = vpadd_f32(SUMM_2[0], SUMM_2[0]);
/* Ok, now we have result accumulated in SUMM_2[0].0 */
if (len > 0) {
/* Case when you have one value left */
XX_2 = vld1_dup_f32(xi);
YY_2 = vld1_dup_f32(yi);
SUMM_2[0] = vmla_f32(SUMM_2[0], XX_2, YY_2);
}
vst1_lane_f32(sum, SUMM_2[0], 0);
}
void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
opus_val32 *xcorr, int len, int max_pitch) {
int i;
celt_assert(max_pitch > 0);
celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);
for (i = 0; i < (max_pitch-3); i += 4) {
xcorr_kernel_neon_float((const float32_t *)_x, (const float32_t *)_y+i,
(float32_t *)xcorr+i, len);
}
/* In case max_pitch isn't multiple of 4
* compute single correlation value per iteration
*/
for (; i < max_pitch; i++) {
xcorr_kernel_neon_float_process1((const float32_t *)_x,
(const float32_t *)_y+i, (float32_t *)xcorr+i, len);
}
}
......@@ -52,6 +52,17 @@ opus_val32 celt_pitch_xcorr_edsp(const opus_val16 *_x, const opus_val16 *_y,
((void)(arch),PRESUME_NEON(celt_pitch_xcorr)(_x, _y, xcorr, len, max_pitch))
# endif
# endif
#else /* Start !FIXED_POINT */
/* Float case */
#if defined(OPUS_ARM_NEON_INTR)
void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
opus_val32 *xcorr, int len, int max_pitch);
#if !defined(OPUS_HAVE_RTCD)
#define OVERRIDE_PITCH_XCORR (1)
# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
((void)(arch),celt_pitch_xcorr_float_neon(_x, _y, xcorr, len, max_pitch))
#endif
#endif
#endif /* end !FIXED_POINT */
#endif
......@@ -31,7 +31,8 @@
#include "opus_types.h"
#include "opus_defines.h"
#if defined(OPUS_HAVE_RTCD) && defined(OPUS_ARM_ASM)
#if defined(OPUS_HAVE_RTCD) && \
(defined(OPUS_ARM_ASM) || defined(OPUS_ARM_NEON_INTR))
#include "arm/armcpu.h"
/* We currently support 4 ARM variants:
......
......@@ -46,7 +46,8 @@
#include "mips/pitch_mipsr1.h"
#endif
#if defined(OPUS_ARM_ASM) && defined(FIXED_POINT)
#if ((defined(OPUS_ARM_ASM) && defined(FIXED_POINT)) \
|| defined(OPUS_ARM_NEON_INTR))
# include "arm/pitch_arm.h"
#endif
......@@ -178,7 +179,8 @@ celt_pitch_xcorr_c(const opus_val16 *_x, const opus_val16 *_y,
#if !defined(OVERRIDE_PITCH_XCORR)
/*Is run-time CPU detection enabled on this platform?*/
# if defined(OPUS_HAVE_RTCD) && defined(OPUS_ARM_ASM)
# if defined(OPUS_HAVE_RTCD) && \
(defined(OPUS_ARM_ASM) || defined(OPUS_ARM_NEON_INTR))
extern
# if defined(FIXED_POINT)
opus_val32
......
......@@ -56,7 +56,11 @@
#include "x86/celt_lpc_sse.c"
#endif
#include "x86/x86_celt_map.c"
#elif defined(OPUS_ARM_ASM) && defined(FIXED_POINT)
#elif ((defined(OPUS_ARM_ASM) && defined(FIXED_POINT)) \
|| defined(OPUS_ARM_NEON_INTR))
#if defined(OPUS_ARM_NEON_INTR)
#include "arm/celt_neon_intr.c"
#endif
#include "arm/arm_celt_map.c"
#endif
......
......@@ -54,7 +54,11 @@
#include "x86/celt_lpc_sse.c"
#endif
#include "x86/x86_celt_map.c"
#elif defined(OPUS_ARM_ASM) && defined(FIXED_POINT)
#elif ((defined(OPUS_ARM_ASM) && defined(FIXED_POINT)) \
|| defined(OPUS_ARM_NEON_INTR))
#if defined(OPUS_ARM_NEON_INTR)
#include "arm/celt_neon_intr.c"
#endif
#include "arm/arm_celt_map.c"
#endif
......
......@@ -32,3 +32,6 @@ celt/arm/celt_pitch_xcorr_arm.s
CELT_AM_SOURCES_ARM_ASM = \
celt/arm/armopts.s.in
CELT_SOURCES_ARM_NEON_INTR = \
celt/arm/celt_neon_intr.c
......@@ -190,14 +190,14 @@ AC_ARG_ENABLE([rtcd],
[enable_rtcd=yes])
AC_ARG_ENABLE([intrinsics],
[AS_HELP_STRING([--enable-intrinsics], [Enable intrinsics optimizations (only for fixed point x86)])],,
[AS_HELP_STRING([--enable-intrinsics], [Enable intrinsics optimizations for ARM(float) X86(fixed)])],,
[enable_intrinsics=no])
rtcd_support=no
cpu_arm=no
AS_IF([test x"${enable_asm}" = x"yes"],[
inline_optimization="No ASM for your platform, please send patches"
inline_optimization="No inline ASM for your platform, please send patches"
case $host_cpu in
arm*)
dnl Currently we only have asm for fixed-point
......@@ -343,7 +343,6 @@ AS_IF([test x"${enable_asm}" = x"yes"],[
asm_optimization="disabled"
])
AM_CONDITIONAL([CPU_ARM], [test "$cpu_arm" = "yes"])
AM_CONDITIONAL([OPUS_ARM_INLINE_ASM],
[test x"${inline_optimization%% *}" = x"ARM"])
AM_CONDITIONAL([OPUS_ARM_EXTERNAL_ASM],
......@@ -351,9 +350,59 @@ AM_CONDITIONAL([OPUS_ARM_EXTERNAL_ASM],
AM_CONDITIONAL([HAVE_SSE4_1], [false])
AM_CONDITIONAL([HAVE_SSE2], [false])
AS_IF([test x"$enable_intrinsics" = x"yes"],[
AS_IF([test x"$enable_float" = x"no"],
[AS_IF([test x"$host_cpu" = x"i386" -o x"$host_cpu" = x"i686" -o x"$host_cpu" = x"x86_64"],[
case $host_cpu in
arm*)
cpu_arm=yes
AC_MSG_CHECKING(if compiler supports ARM NEON intrinsics)
save_CFLAGS="$CFLAGS"; CFLAGS="-mfpu=neon $CFLAGS"
AC_LINK_IFELSE(
[
AC_LANG_PROGRAM(
[[#include <arm_neon.h>
]],
[[
static float32x4_t A[2], SUMM;
SUMM = vmlaq_f32(SUMM, A[0], A[1]);
]]
)
],[
OPUS_ARM_NEON_INTR=1
AC_MSG_RESULT([yes])
],[
OPUS_ARM_NEON_INTR=0
AC_MSG_RESULT([no])
]
)
CFLAGS="$save_CFLAGS"
#Now we know if compiler supports ARM neon intrinsics or not
#Currently we only have intrinsic optimization for floating point
AS_IF([test x"$enable_float" = x"yes"],
[
AS_IF([test x"$OPUS_ARM_NEON_INTR" = x"1"],
[
AC_DEFINE([OPUS_ARM_NEON_INTR], 1, [Compiler supports ARMv7 Neon Intrinsics])
AS_IF([test x"enable_rtcd" != x""],
[rtcd_support="ARM (ARMv7_Neon_Intrinsics)"],[])
enable_intrinsics="$enable_intrinsics ARMv7_Neon_Intrinsics"
dnl Don't see why defining these is necessary to check features at runtime
AC_DEFINE([OPUS_ARM_MAY_HAVE_EDSP], 1, [Define if compiler support EDSP Instructions])
AC_DEFINE([OPUS_ARM_MAY_HAVE_MEDIA], 1, [Define if compiler support MEDIA Instructions])
AC_DEFINE([OPUS_ARM_MAY_HAVE_NEON], 1, [Define if compiler support NEON instructions])
],
[
AC_MSG_WARN([Compiler does not support ARM intrinsics])
enable_intrinsics=no
])
], [
AC_MSG_WARN([Currently on have ARM intrinsics for float])
enable_intrinsics=no
])
;;
"i386" | "i686" | "x86_64")
AS_IF([test x"$enable_float" = x"no"],[
AS_IF([test x"$enable_rtcd" = x"yes"],[
get_cpuid_by_asm="no"
AC_MSG_CHECKING([Get CPU Info])
......@@ -423,7 +472,7 @@ AS_IF([test x"$enable_float" = x"no"],
AM_CONDITIONAL([HAVE_SSE2], [true])
AS_IF([test x"$get_cpuid_by_asm" = x"yes"],[AC_DEFINE([CPU_INFO_BY_ASM], [1], [Get CPU Info by asm method])],
[AC_DEFINE([CPU_INFO_BY_C], [1], [Get CPU Info by C method])])
],[
],[ ##### Else case for AS_IF([test x"$?" = x"0"])
gcc -Q --help=target | grep "\-msse2 "
AC_MSG_CHECKING([sse2])
AS_IF([test x"$?" = x"0"],[
......@@ -446,13 +495,28 @@ AS_IF([test x"$enable_float" = x"no"],
AM_CONDITIONAL([HAVE_SSE2], [true])
AS_IF([test x"$get_cpuid_by_asm" = x"yes"],[AC_DEFINE([CPU_INFO_BY_ASM], [1], [Get CPU Info by asm method])],
[AC_DEFINE([CPU_INFO_BY_C], [1], [Get CPU Info by c method])])
],[enable_intrinsics="no"])
],[enable_intrinsics="no"]) #End of AS_IF([test x"$?" = x"0"]
])
], [enable_intrinsics="no"])
])
], [enable_intrinsics="no"])
], [
enable_intrinsics="no"
]) ## End of AS_IF([test x"$enable_rtcd" = x"yes"]
],
[ ## Else case for AS_IF([test x"$enable_float" = x"no"]
AC_MSG_WARN([Disabling intrinsics .. x86 intrinsics only avail for fixed point])
enable_intrinsics="no"
]) ## End of AS_IF([test x"$enable_float" = x"no"]
;;
*)
AC_MSG_WARN([No intrinsics support for your architecture])
enable_intrinsics="no"
;;
esac
])
AM_CONDITIONAL([CPU_ARM], [test "$cpu_arm" = "yes"])
AM_CONDITIONAL([OPUS_ARM_NEON_INTR],
[test x"$OPUS_ARM_NEON_INTR" = x"1"])
AS_IF([test x"$enable_rtcd" = x"yes"],[
AS_IF([test x"$rtcd_support" != x"no"],[
AC_DEFINE([OPUS_HAVE_RTCD], [1],
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment