Commit ac0b4b4c authored by Erik de Castro Lopo's avatar Erik de Castro Lopo
Browse files

Improve encoding speed on older Intel CPUs.

The commit http://git.xiph.org/?p=flac.git;a=commit;h=e9d805dd4374
changed the that calculate autocorrelation. However, the new code
worked slightly (about 4%) slower on Core 2, but with the new
presets the speed decrease can reach ~25%.

This patch enables both old and new functions and chooses between
them at runtime.

Patch-from: lvqcl <lvqcl.mail@gmail.com>
parent d03c9f46
......@@ -164,6 +164,8 @@ void FLAC__cpu_info(FLAC__CPUInfo *info)
/* http://www.sandpile.org/x86/cpuid.htm */
#ifdef FLAC__HAS_X86INTRIN
FLAC__uint32 flags_eax, flags_ebx, flags_ecx, flags_edx;
FLAC__cpu_info_x86(0, &flags_eax, &flags_ebx, &flags_ecx, &flags_edx);
info->ia32.intel = (flags_ebx == 0x756E6547 && flags_edx == 0x49656E69 && flags_ecx == 0x6C65746E)? true : false; /* GenuineIntel */
FLAC__cpu_info_x86(1, &flags_eax, &flags_ebx, &flags_ecx, &flags_edx);
#else
FLAC__uint32 flags_ecx, flags_edx;
......@@ -347,6 +349,8 @@ void FLAC__cpu_info(FLAC__CPUInfo *info)
{
/* http://www.sandpile.org/x86/cpuid.htm */
FLAC__uint32 flags_eax, flags_ebx, flags_ecx, flags_edx;
FLAC__cpu_info_x86(0, &flags_eax, &flags_ebx, &flags_ecx, &flags_edx);
info->x86.intel = (flags_ebx == 0x756E6547 && flags_edx == 0x49656E69 && flags_ecx == 0x6C65746E)? true : false; /* GenuineIntel */
FLAC__cpu_info_x86(1, &flags_eax, &flags_ebx, &flags_ecx, &flags_edx);
info->x86.sse3 = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_SSE3 )? true : false;
info->x86.ssse3 = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_SSSE3)? true : false;
......
......@@ -36,10 +36,10 @@
data_section
cglobal FLAC__lpc_compute_autocorrelation_asm_ia32
cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4
cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8
cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16
cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4_old
cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8_old
cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12_old
cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16_old
cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32
......@@ -443,7 +443,7 @@ cident FLAC__lpc_compute_autocorrelation_asm_ia32
ret
ALIGN 16
cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4
cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4_old
;[esp + 16] == autoc[]
;[esp + 12] == lag
;[esp + 8] == data_len
......@@ -490,7 +490,7 @@ cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4
ret
ALIGN 16
cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8
cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8_old
;[esp + 16] == autoc[]
;[esp + 12] == lag
;[esp + 8] == data_len
......@@ -549,7 +549,7 @@ cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8
ret
ALIGN 16
cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12_old
;[esp + 16] == autoc[]
;[esp + 12] == lag
;[esp + 8] == data_len
......@@ -623,7 +623,7 @@ cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
ret
ALIGN 16
cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16
cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16_old
;[ebp + 20] == autoc[]
;[ebp + 16] == lag
;[ebp + 12] == data_len
......
......@@ -116,6 +116,8 @@ typedef enum {
#if defined FLAC__CPU_IA32
typedef struct {
FLAC__bool intel;
FLAC__bool cmov;
FLAC__bool mmx;
FLAC__bool sse;
......@@ -131,6 +133,8 @@ typedef struct {
} FLAC__CPUInfo_IA32;
#elif defined FLAC__CPU_X86_64
typedef struct {
FLAC__bool intel;
FLAC__bool sse3;
FLAC__bool ssse3;
FLAC__bool sse41;
......
......@@ -73,18 +73,22 @@ void FLAC__lpc_compute_autocorrelation(const FLAC__real data[], unsigned data_le
# ifdef FLAC__CPU_IA32
# ifdef FLAC__HAS_NASM
void FLAC__lpc_compute_autocorrelation_asm_ia32(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
void FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
void FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
void FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
void FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
void FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
void FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
void FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
void FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
# endif
# endif
# if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
# ifdef FLAC__SSE_SUPPORTED
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
# endif
# endif
#endif
......
......@@ -45,11 +45,15 @@
#include <xmmintrin.h> /* SSE */
#if 1
/* Faster on current Intel (starting from Core i aka Nehalem) and all AMD CPUs */
/* new routines: more unaligned loads, less shuffle
* old routines: less unaligned loads, more shuffle
* these *_old routines are equivalent to the ASM routines in ia32/lpc_asm.nasm
*/
/* new routines: faster on current Intel (starting from Core i aka Nehalem) and all AMD CPUs */
FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
int i;
int limit = data_len - 4;
......@@ -85,7 +89,7 @@ void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4(const FLAC__real data[],
}
FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
int i;
int limit = data_len - 8;
......@@ -129,7 +133,7 @@ void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8(const FLAC__real data[],
}
FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
int i;
int limit = data_len - 12;
......@@ -181,7 +185,7 @@ void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12(const FLAC__real data[]
}
FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
int i;
int limit = data_len - 16;
......@@ -240,11 +244,10 @@ void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16(const FLAC__real data[]
_mm_storeu_ps(autoc+12,sum3);
}
#else
/* Faster on older Intel CPUs (up to Core 2) */
/* old routines: faster on older Intel CPUs (up to Core 2) */
FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
__m128 xmm0, xmm2, xmm5;
......@@ -281,7 +284,7 @@ void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4(const FLAC__real data[],
}
FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
__m128 xmm0, xmm1, xmm2, xmm3, xmm5, xmm6;
......@@ -327,7 +330,7 @@ void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8(const FLAC__real data[],
}
FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
......@@ -381,7 +384,7 @@ void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12(const FLAC__real data[]
}
FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9;
......@@ -443,7 +446,6 @@ void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16(const FLAC__real data[]
_mm_storeu_ps(autoc+8, xmm8);
_mm_storeu_ps(autoc+12,xmm9);
}
#endif
#endif /* FLAC__SSE_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
......
......@@ -898,13 +898,13 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
# ifdef FLAC__HAS_NASM
if(encoder->private_->cpuinfo.ia32.sse) {
if(encoder->protected_->max_lpc_order < 4)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4;
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4_old;
else if(encoder->protected_->max_lpc_order < 8)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8;
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8_old;
else if(encoder->protected_->max_lpc_order < 12)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12;
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12_old;
else if(encoder->protected_->max_lpc_order < 16)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16;
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16_old;
else
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32;
}
......@@ -927,16 +927,30 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
# ifdef FLAC__HAS_X86INTRIN
# if defined FLAC__SSE_SUPPORTED
if(encoder->private_->cpuinfo.ia32.sse) {
if(encoder->protected_->max_lpc_order < 4)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4;
else if(encoder->protected_->max_lpc_order < 8)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8;
else if(encoder->protected_->max_lpc_order < 12)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12;
else if(encoder->protected_->max_lpc_order < 16)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16;
else
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation;
if(encoder->private_->cpuinfo.ia32.sse42 || !encoder->private_->cpuinfo.ia32.intel) { /* use new autocorrelation functions */
if(encoder->protected_->max_lpc_order < 4)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_new;
else if(encoder->protected_->max_lpc_order < 8)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_new;
else if(encoder->protected_->max_lpc_order < 12)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_new;
else if(encoder->protected_->max_lpc_order < 16)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_new;
else
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation;
}
else { /* use old autocorrelation functions */
if(encoder->protected_->max_lpc_order < 4)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_old;
else if(encoder->protected_->max_lpc_order < 8)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_old;
else if(encoder->protected_->max_lpc_order < 12)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_old;
else if(encoder->protected_->max_lpc_order < 16)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_old;
else
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation;
}
}
# endif
......@@ -977,14 +991,26 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
FLAC__ASSERT(encoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_X86_64);
# ifdef FLAC__HAS_X86INTRIN
# ifdef FLAC__SSE_SUPPORTED
if(encoder->protected_->max_lpc_order < 4)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4;
else if(encoder->protected_->max_lpc_order < 8)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8;
else if(encoder->protected_->max_lpc_order < 12)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12;
else if(encoder->protected_->max_lpc_order < 16)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16;
if(encoder->private_->cpuinfo.x86.sse42 || !encoder->private_->cpuinfo.x86.intel) { /* use new autocorrelation functions */
if(encoder->protected_->max_lpc_order < 4)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_new;
else if(encoder->protected_->max_lpc_order < 8)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_new;
else if(encoder->protected_->max_lpc_order < 12)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_new;
else if(encoder->protected_->max_lpc_order < 16)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_new;
}
else {
if(encoder->protected_->max_lpc_order < 4)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_old;
else if(encoder->protected_->max_lpc_order < 8)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_old;
else if(encoder->protected_->max_lpc_order < 12)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_old;
else if(encoder->protected_->max_lpc_order < 16)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_old;
}
# endif
# ifdef FLAC__SSE2_SUPPORTED
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment