Skip to content
Snippets Groups Projects
Verified Commit 735c4070 authored by Victor Ding's avatar Victor Ding Committed by Jean-Marc Valin
Browse files

Optimize NSQ_del_dec() for AVX2

The optimization is bit-exact with C function.

This optimization speeds up SILK encoder (floating point) as following:

AMD Zen:
Complexity 0-5 :      0%
Complexity 6-7 : 3 -  7%
Complexity 8-10: 8 - 15%

Intel Skylake:
Complexity 0-5 :       0%
Complexity 6-7 : 14 - 18%
Complexity 8-10: 17 - 22%

Adapted by Jean-Marc Valin
parent 452aa952
No related branches found
No related tags found
No related merge requests found
......@@ -70,6 +70,7 @@ LPCNET_SOURCES += $(DNN_SOURCES_SSE4_1)
endif
endif
if HAVE_AVX2
SILK_SOURCES += $(SILK_SOURCES_AVX2)
CELT_SOURCES += $(CELT_SOURCES_AVX2)
if ENABLE_DEEP_PLC
LPCNET_SOURCES += $(DNN_SOURCES_AVX2)
......@@ -425,6 +426,7 @@ endif
if HAVE_AVX2
AVX2_OBJ = $(CELT_SOURCES_AVX2:.c=.lo) \
$(SILK_SOURCES_AVX2:.c=.lo) \
$(DNN_SOURCES_AVX2:.c=.lo)
$(AVX2_OBJ): CFLAGS += $(OPUS_X86_AVX2_CFLAGS)
endif
......
This diff is collapsed.
......@@ -154,7 +154,33 @@ void silk_NSQ_del_dec_sse4_1(
const opus_int LTP_scale_Q14 /* I LTP state scaling */
);
# if defined OPUS_X86_PRESUME_SSE4_1
void silk_NSQ_del_dec_avx2(
const silk_encoder_state *psEncC, /* I Encoder State */
silk_nsq_state *NSQ, /* I/O NSQ state */
SideInfoIndices *psIndices, /* I/O Quantization Indices */
const opus_int16 x16[], /* I Input */
opus_int8 pulses[], /* O Quantized pulse signal */
const opus_int16 PredCoef_Q12[2 * MAX_LPC_ORDER], /* I Short term prediction coefs */
const opus_int16 LTPCoef_Q14[LTP_ORDER * MAX_NB_SUBFR], /* I Long term prediction coefs */
const opus_int16 AR_Q13[MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER], /* I Noise shaping coefs */
const opus_int HarmShapeGain_Q14[MAX_NB_SUBFR], /* I Long term shaping coefs */
const opus_int Tilt_Q14[MAX_NB_SUBFR], /* I Spectral tilt */
const opus_int32 LF_shp_Q14[MAX_NB_SUBFR], /* I Low frequency shaping coefs */
const opus_int32 Gains_Q16[MAX_NB_SUBFR], /* I Quantization step sizes */
const opus_int32 pitchL[MAX_NB_SUBFR], /* I Pitch lags */
const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
const opus_int LTP_scale_Q14 /* I LTP state scaling */
);
# if defined (OPUS_X86_PRESUME_AVX2)
# define OVERRIDE_silk_NSQ_del_dec
# define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
((void)(arch),silk_NSQ_del_dec_avx2(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
# elif defined (OPUS_X86_PRESUME_SSE4_1)
# define OVERRIDE_silk_NSQ_del_dec
# define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
......
......@@ -132,7 +132,7 @@ void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )(
silk_NSQ_del_dec_c,
silk_NSQ_del_dec_c,
MAY_HAVE_SSE4_1( silk_NSQ_del_dec ), /* sse4.1 */
MAY_HAVE_SSE4_1( silk_NSQ_del_dec ) /* avx */
MAY_HAVE_AVX2( silk_NSQ_del_dec ) /* avx */
};
#if defined(FIXED_POINT)
......
......@@ -86,6 +86,9 @@ silk/x86/NSQ_del_dec_sse4_1.c \
silk/x86/VAD_sse4_1.c \
silk/x86/VQ_WMat_EC_sse4_1.c
SILK_SOURCES_AVX2 = \
silk/x86/NSQ_del_dec_avx2.c
SILK_SOURCES_ARM_RTCD = \
silk/arm/arm_silk_map.c
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment