Commit d40e986a authored by Erik de Castro Lopo

Add FLAC__SSE_SUPPORTED and FLAC__SSE2_SUPPORTED flags.

* Allow compiling with GCC without SSE support.
* Allow SSE4.1 intrinsic functions to be enabled.

Patch-from: lvqcl <lvqcl.mail@gmail.com>
parent c2747bec
......@@ -199,12 +199,4 @@ int flac_snprintf(char *str, size_t size, const char *fmt, ...);
};
#endif
/* SSSE3, SSE4 support: MSVS 2008, GCC 4.3 -- currently disabled, Intel Compiler 10.0 */
#if ( defined _MSC_VER && _MSC_VER >= 1500 ) \
|| ( 0 && defined __GNUC__ && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) ) \
|| ( defined __INTEL_COMPILER && __INTEL_COMPILER >= 1000 )
#define FLAC__SSSE3_SUPPORTED 1
#define FLAC__SSE4_SUPPORTED 1
#endif
#endif /* FLAC__SHARE__COMPAT_H */
......@@ -39,6 +39,47 @@
#include <config.h>
#endif
/*
 * Compile-time detection of SSE-family intrinsics support (ICC, MSVC, GCC).
 *
 * Depending on the compiler, this section defines:
 *   FLAC__SSE_TARGET(x)     - placed in front of an intrinsics routine; it
 *                             expands to a per-function target attribute on
 *                             GCC 4.9 and newer, and to nothing elsewhere
 *   FLAC__SSE_SUPPORTED     - SSE    code may be compiled
 *   FLAC__SSE2_SUPPORTED    - SSE2   code may be compiled
 *   FLAC__SSSE3_SUPPORTED   - SSSE3  code may be compiled
 *   FLAC__SSE4_1_SUPPORTED  - SSE4.1 code may be compiled
 *
 * The Intel compiler is tested first, then MSVC, then GCC.
 */
#if defined(__INTEL_COMPILER)
   /* ICC: SSE and SSE2 are enabled unconditionally. */
#  define FLAC__SSE_TARGET(x)
#  define FLAC__SSE_SUPPORTED 1
#  define FLAC__SSE2_SUPPORTED 1
#  if __INTEL_COMPILER >= 1000 /* Intel C++ Compiler 10.0 or newer */
#    define FLAC__SSSE3_SUPPORTED 1
#    define FLAC__SSE4_1_SUPPORTED 1
#  endif
#elif defined(_MSC_VER)
   /* MSVC: SSE and SSE2 are enabled unconditionally. */
#  define FLAC__SSE_TARGET(x)
#  define FLAC__SSE_SUPPORTED 1
#  define FLAC__SSE2_SUPPORTED 1
#  if _MSC_VER >= 1500 /* MS Visual Studio 2008 or newer */
#    define FLAC__SSSE3_SUPPORTED 1
#    define FLAC__SSE4_1_SUPPORTED 1
#  endif
#elif defined(__GNUC__)
#  if (__GNUC__ >= 5) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)
     /* GCC 4.9 and newer: the -msse.. compiler options are not necessary,
      * the instruction set is requested per function instead. */
#    define FLAC__SSE_TARGET(x) __attribute__ ((__target__ (x)))
#    define FLAC__SSE_SUPPORTED 1
#    define FLAC__SSE2_SUPPORTED 1
#    define FLAC__SSSE3_SUPPORTED 1
#    define FLAC__SSE4_1_SUPPORTED 1
#  else
     /* GCC older than 4.9: enable only what the compiler's own predefined
      * __SSE__ / __SSE2__ / __SSSE3__ / __SSE4_1__ macros report. */
#    define FLAC__SSE_TARGET(x)
#    if defined(__SSE__)
#      define FLAC__SSE_SUPPORTED 1
#    endif
#    if defined(__SSE2__)
#      define FLAC__SSE2_SUPPORTED 1
#    endif
#    if defined(__SSSE3__)
#      define FLAC__SSSE3_SUPPORTED 1
#    endif
#    if defined(__SSE4_1__)
#      define FLAC__SSE4_1_SUPPORTED 1
#    endif
#  endif /* GCC version */
#endif /* compiler */
typedef enum {
FLAC__CPUINFO_TYPE_IA32,
FLAC__CPUINFO_TYPE_X86_64,
......
......@@ -37,6 +37,7 @@
#include <config.h>
#endif
#include "private/cpu.h"
#include "private/float.h"
#include "FLAC/format.h"
......@@ -80,10 +81,12 @@ void FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow(const FLAC__real data[], u
# endif
# endif
# if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
# ifdef FLAC__SSE_SUPPORTED
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
# endif
# endif
#endif
......@@ -156,9 +159,11 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32(const FLAC__
# endif
# endif
# if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
# ifdef FLAC__SSE2_SUPPORTED
void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]);
void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]);
# ifdef FLAC__SSE4_SUPPORTED
# endif
# ifdef FLAC__SSE4_1_SUPPORTED
void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]);
# endif
# endif
......@@ -195,9 +200,9 @@ void FLAC__lpc_restore_signal_asm_ppc_altivec_16(const FLAC__int32 residual[], u
void FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
# endif /* FLAC__CPU_IA32 || FLAC__CPU_PPC */
# if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
# ifdef FLAC__SSE4_SUPPORTED
# ifdef FLAC__SSE4_1_SUPPORTED
void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
# endif
# endif
# endif
#endif /* FLAC__NO_ASM */
......
......@@ -38,11 +38,13 @@
#endif
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
#include "share/compat.h"
#include "private/cpu.h"
#include "FLAC/format.h"
#ifdef FLAC__SSE2_SUPPORTED
extern void FLAC__precompute_partition_info_sums_intrin_sse2(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[],
unsigned residual_samples, unsigned predictor_order, unsigned min_partition_order, unsigned max_partition_order, unsigned bps);
#endif
#ifdef FLAC__SSSE3_SUPPORTED
extern void FLAC__precompute_partition_info_sums_intrin_ssse3(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[],
......
......@@ -37,13 +37,15 @@
#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
#include "private/lpc.h"
#ifdef FLAC__SSE_SUPPORTED
#include "FLAC/assert.h"
#include "FLAC/format.h"
#include "private/lpc.h"
#include <xmmintrin.h> /* SSE */
FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
__m128 xmm0, xmm2, xmm5;
......@@ -80,6 +82,7 @@ void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4(const FLAC__real data[],
_mm_storeu_ps(autoc, xmm5);
}
FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
__m128 xmm0, xmm1, xmm2, xmm3, xmm5, xmm6;
......@@ -125,6 +128,7 @@ void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8(const FLAC__real data[],
_mm_storeu_ps(autoc+4, xmm6);
}
FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
......@@ -178,6 +182,7 @@ void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12(const FLAC__real data[]
_mm_storeu_ps(autoc+8, xmm7);
}
FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9;
......@@ -241,6 +246,7 @@ void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16(const FLAC__real data[]
_mm_storeu_ps(autoc+12,xmm9);
}
#endif /* FLAC__SSE_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */
......@@ -37,13 +37,15 @@
#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
#include "private/lpc.h"
#ifdef FLAC__SSE2_SUPPORTED
#include "FLAC/assert.h"
#include "FLAC/format.h"
#include "private/lpc.h"
#include <emmintrin.h> /* SSE2 */
FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
int i;
......@@ -787,6 +789,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
#define RESIDUAL_RESULT(xmmN) residual[i] = data[i] - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
int i;
......@@ -1313,6 +1316,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
}
}
#endif /* FLAC__SSE2_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */
......@@ -34,16 +34,14 @@
# include <config.h>
#endif
#include "share/compat.h"
#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
#ifdef FLAC__SSE4_SUPPORTED
#include "private/lpc.h"
#ifdef FLAC__SSE4_1_SUPPORTED
#include "FLAC/assert.h"
#include "FLAC/format.h"
#include "private/lpc.h"
#include <smmintrin.h> /* SSE4.1 */
......@@ -68,6 +66,7 @@
#define DATA_RESULT(xmmN) data[i] = residual[i] + (FLAC__int32)(_mm_cvtsi128_si64(xmmN) >> lp_quantization);
#endif
FLAC__SSE_TARGET("sse4.1")
void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
int i;
......@@ -594,6 +593,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
}
}
FLAC__SSE_TARGET("sse4.1")
void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
{
int i;
......@@ -1120,7 +1120,7 @@ void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], un
}
}
#endif /* FLAC__SSE4_SUPPORTED */
#endif /* FLAC__SSE4_1_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */
......@@ -417,7 +417,7 @@ static FLAC__StreamDecoderInitStatus init_stream_internal_(
}
#endif
#ifdef FLAC__HAS_X86INTRIN
# if defined FLAC__SSE4_SUPPORTED && 0 /* now we have FLAC__lpc_restore_signal_wide_asm_ia32() which is slightly faster */
# if defined FLAC__SSE4_1_SUPPORTED && 0 /* now we have FLAC__lpc_restore_signal_wide_asm_ia32() which is slightly faster */
if(decoder->private_->cpuinfo.ia32.sse41)
decoder->private_->local_lpc_restore_signal_64bit = FLAC__lpc_restore_signal_wide_intrin_sse41;
# endif
......
......@@ -920,11 +920,13 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
encoder->private_->local_fixed_compute_best_predictor = FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov;
# endif /* FLAC__HAS_NASM */
# ifdef FLAC__HAS_X86INTRIN
# ifdef FLAC__SSE2_SUPPORTED
if(encoder->private_->cpuinfo.ia32.sse2) {
encoder->private_->local_lpc_compute_residual_from_qlp_coefficients = FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2;
encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit = FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2;
}
# ifdef FLAC__SSE4_SUPPORTED
# endif
# ifdef FLAC__SSE4_1_SUPPORTED
if(encoder->private_->cpuinfo.ia32.sse41)
encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_64bit = FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41;
# endif
......@@ -932,6 +934,7 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
# elif defined FLAC__CPU_X86_64
FLAC__ASSERT(encoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_X86_64);
# ifdef FLAC__HAS_X86INTRIN
# ifdef FLAC__SSE_SUPPORTED
if(encoder->protected_->max_lpc_order < 4)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4;
else if(encoder->protected_->max_lpc_order < 8)
......@@ -940,9 +943,11 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12;
else if(encoder->protected_->max_lpc_order < 16)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16;
# endif
# ifdef FLAC__SSE2_SUPPORTED
encoder->private_->local_lpc_compute_residual_from_qlp_coefficients = FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2;
encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit = FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2;
# endif
# endif /* FLAC__HAS_X86INTRIN */
# endif /* FLAC__CPU_... */
}
......@@ -956,15 +961,19 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
encoder->private_->local_precompute_partition_info_sums = FLAC__precompute_partition_info_sums_intrin_ssse3;
else
# endif
# ifdef FLAC__SSE2_SUPPORTED
if(encoder->private_->cpuinfo.ia32.sse2)
encoder->private_->local_precompute_partition_info_sums = FLAC__precompute_partition_info_sums_intrin_sse2;
# endif
# elif defined FLAC__CPU_X86_64
# ifdef FLAC__SSSE3_SUPPORTED
if(encoder->private_->cpuinfo.x86_64.ssse3)
encoder->private_->local_precompute_partition_info_sums = FLAC__precompute_partition_info_sums_intrin_ssse3;
else
# endif
# ifdef FLAC__SSE2_SUPPORTED
encoder->private_->local_precompute_partition_info_sums = FLAC__precompute_partition_info_sums_intrin_sse2;
# endif
# endif /* FLAC__CPU_... */
}
#endif /* !FLAC__NO_ASM && FLAC__HAS_X86INTRIN */
......
......@@ -36,12 +36,14 @@
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
#include "private/stream_encoder.h"
#ifdef FLAC__SSE2_SUPPORTED
#include <stdlib.h> /* for abs() */
#include <emmintrin.h> /* SSE2 */
#include "FLAC/assert.h"
#include "private/stream_encoder.h"
FLAC__SSE_TARGET("sse2")
void FLAC__precompute_partition_info_sums_intrin_sse2(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[],
unsigned residual_samples, unsigned predictor_order, unsigned min_partition_order, unsigned max_partition_order, unsigned bps)
{
......@@ -157,5 +159,6 @@ void FLAC__precompute_partition_info_sums_intrin_sse2(const FLAC__int32 residual
}
}
#endif /* FLAC__SSE2_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
......@@ -34,17 +34,16 @@
# include <config.h>
#endif
#include "share/compat.h"
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
#include "private/stream_encoder.h"
#ifdef FLAC__SSSE3_SUPPORTED
#include <stdlib.h> /* for abs() */
#include <tmmintrin.h> /* SSSE3 */
#include "FLAC/assert.h"
#include "private/stream_encoder.h"
FLAC__SSE_TARGET("ssse3")
void FLAC__precompute_partition_info_sums_intrin_ssse3(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[],
unsigned residual_samples, unsigned predictor_order, unsigned min_partition_order, unsigned max_partition_order, unsigned bps)
{
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment