Commit f5925df5 authored by Josh Coalson
Browse files

add 3DNOW stuff from Miroslav

parent ae4ed272
......@@ -27,6 +27,10 @@ const unsigned FLAC__CPUINFO_IA32_CPUID_FXSR = 0x01000000;
const unsigned FLAC__CPUINFO_IA32_CPUID_SSE = 0x02000000;
const unsigned FLAC__CPUINFO_IA32_CPUID_SSE2 = 0x04000000;
const unsigned FLAC__CPUINFO_IA32_CPUID_EXTENDED_AMD_3DNOW = 0x80000000;
const unsigned FLAC__CPUINFO_IA32_CPUID_EXTENDED_AMD_EXT3DNOW = 0x40000000;
const unsigned FLAC__CPUINFO_IA32_CPUID_EXTENDED_AMD_EXTMMX = 0x00400000;
void FLAC__cpu_info(FLAC__CPUInfo *info)
{
......@@ -41,6 +45,11 @@ void FLAC__cpu_info(FLAC__CPUInfo *info)
info->data.ia32.fxsr = (cpuid & FLAC__CPUINFO_IA32_CPUID_FXSR)? true : false;
info->data.ia32.sse = (cpuid & FLAC__CPUINFO_IA32_CPUID_SSE)? true : false; /* @@@ also need to check for operating system support */
info->data.ia32.sse2 = (cpuid & FLAC__CPUINFO_IA32_CPUID_SSE2)? true : false; /* @@@ also need to check for operating system support */
cpuid = FLAC__cpu_info_extended_amd_asm_ia32();
info->data.ia32._3dnow = (cpuid & FLAC__CPUINFO_IA32_CPUID_EXTENDED_AMD_3DNOW)? true : false;
info->data.ia32.ext3dnow = (cpuid & FLAC__CPUINFO_IA32_CPUID_EXTENDED_AMD_EXT3DNOW)? true : false;
info->data.ia32.extmmx = (cpuid & FLAC__CPUINFO_IA32_CPUID_EXTENDED_AMD_EXTMMX)? true : false;
}
#else
info->use_asm = false;
......
......@@ -21,20 +21,18 @@
data_section
cglobal FLAC__cpu_info_asm_ia32
cglobal FLAC__cpu_info_extended_amd_asm_ia32
code_section
; **********************************************************************
;
ALIGN 16
cident FLAC__cpu_info_asm_ia32
push ebx
have_cpuid:
pushfd
pop eax
mov edx, eax
xor eax, 00200000h
xor eax, 0x00200000
push eax
popfd
pushfd
......@@ -42,12 +40,43 @@ cident FLAC__cpu_info_asm_ia32
cmp eax, edx
jz .no_cpuid
mov eax, 1
jmp .end
.no_cpuid:
xor eax, eax
.end:
ret
cident FLAC__cpu_info_asm_ia32
push ebx
call have_cpuid
test eax, eax
jz .no_cpuid
mov eax, 1
cpuid
mov eax, edx
jmp short .end
jmp .end
.no_cpuid:
xor eax, eax ; return 0
.end:
xor eax, eax
.end
pop ebx
ret
; **********************************************************************
;
; unsigned FLAC__cpu_info_extended_amd_asm_ia32()
;
; Returns in eax the EDX feature word produced by CPUID extended function
; 0x80000001, or 0 when either
;   - have_cpuid reports (eax == 0) that CPUID is unavailable, or
;   - the highest extended function reported by CPUID 0x80000000 is
;     below 0x80000001.
; The C caller masks this value with the
; FLAC__CPUINFO_IA32_CPUID_EXTENDED_AMD_* constants.
;
; ebx is saved and restored around the cpuid instructions; eax is the
; return value.
; NOTE(review): ecx/edx are left modified on return -- presumably fine
; for the cdecl callers of this routine; confirm.
;
cident FLAC__cpu_info_extended_amd_asm_ia32
        push    ebx
        call    have_cpuid              ; eax = 0 when CPUID is not usable
        test    eax, eax
        jz      .no_cpuid
        mov     eax, 0x80000000         ; query highest extended function
        cpuid
        cmp     eax, 0x80000001
        jb      .no_cpuid               ; unsigned: 0x80000001 not supported
        mov     eax, 0x80000001         ; extended feature flags
        cpuid
        mov     eax, edx                ; return the EDX feature word
        jmp     .end
.no_cpuid:
        xor     eax, eax                ; return 0: no extended features
.end:
        pop     ebx
        ret
......
......@@ -24,6 +24,7 @@ cglobal FLAC__lpc_compute_autocorrelation_asm_ia32
cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4
cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8
cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow
cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
cglobal FLAC__lpc_restore_signal_asm_ia32
......@@ -592,6 +593,124 @@ cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
.end:
ret
align 16
; void FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
;
; 3DNow! autocorrelation: accumulates data[i] * data[i + l] for
; l = 0 .. lag-1 into a zeroed scratch array on the stack, then copies the
; lag results to autoc[].
;
; Register roles after the prologue:
;   esi = data, edi = data_len, edx = lag, esp = scratch accumulators
;   ebp = frame base used to reach the arguments and to restore esp
;
; The work is split into two phases:
;   .loop1_i/.loop1_j  four samples per outer step, two lags per inner step
;   .loop2_i/.loop2_j  remaining samples, one sample and one lag at a time
;
; NOTE(review): .loop0/.loop3 decrement before testing and .loop2_i runs its
; body before comparing against the end pointer, so lag == 0 or
; data_len == 0 would misbehave -- presumably callers guarantee both are
; non-zero; confirm.
cident FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow
; Argument offsets are relative to ebp after the four register pushes below.
;[ebp + 32] autoc
;[ebp + 28] lag
;[ebp + 24] data_len
;[ebp + 20] data
push ebp
push ebx
push esi
push edi
mov ebp, esp
mov esi, [ebp + 20]
mov edi, [ebp + 24]
mov edx, [ebp + 28]
; Reserve lag dwords of scratch below an 8-byte rounded-down esp:
; esp = (esp & -8) - 4 * lag
mov eax, edx
neg eax
and esp, byte -8
lea esp, [esp + 4 * eax]
; Zero the scratch accumulators, index lag-1 down to 0.
mov ecx, edx
xor eax, eax
.loop0:
dec ecx
mov [esp + 4 * ecx], eax
jnz short .loop0
; ecx = end bound for the 4-sample phase:
;   data + 4 * (data_len - lag - (lag & 1)) - 12
mov eax, edi
sub eax, edx
mov ebx, edx
and ebx, byte 1
sub eax, ebx
lea ecx, [esi + 4 * eax - 12]
; If data is already above the bound (unsigned), skip the 4-sample phase.
; The mov between cmp and ja does not change the flags.
cmp esi, ecx
mov eax, esi
ja short .loop2_pre
align 16 ;8 nops
.loop1_i:
; eax = pointer to the current group of four samples.
movd mm0, [eax]
movd mm2, [eax + 4]
movd mm4, [eax + 8]
movd mm6, [eax + 12]
mov ebx, edx
; Duplicate each sample into both halves of its register.
punpckldq mm0, mm0
punpckldq mm2, mm2
punpckldq mm4, mm4
punpckldq mm6, mm6
align 16 ;3 nops
.loop1_j:
; ebx = lag index of the pair handled this iteration (steps down by 2).
; The flags set by this sub are the ones tested by the jg at the bottom of
; the loop; only MMX/3DNow! instructions sit in between.
sub ebx, byte 2
movd mm1, [eax + 4 * ebx]
movd mm3, [eax + 4 * ebx + 4]
movd mm5, [eax + 4 * ebx + 8]
movd mm7, [eax + 4 * ebx + 12]
; Build pairs of adjacent samples {s[k], s[k+1]} for k = ebx .. ebx+3
; (relative to eax) and multiply each by the matching broadcast sample,
; so the low lane contributes to lag ebx and the high lane to lag ebx+1.
punpckldq mm1, mm3
punpckldq mm3, mm5
pfmul mm1, mm0
punpckldq mm5, mm7
pfmul mm3, mm2
punpckldq mm7, [eax + 4 * ebx + 16]
pfmul mm5, mm4
pfmul mm7, mm6
; Sum the four products and add them to accumulators ebx and ebx+1.
pfadd mm1, mm3
movq mm3, [esp + 4 * ebx]
pfadd mm5, mm7
pfadd mm1, mm5
pfadd mm3, mm1
movq [esp + 4 * ebx], mm3
; NOTE(review): for odd lag ebx ends at -1, so the last pair touches the
; dword just below esp; that slot is never copied out by .loop3 -- appears
; intentional, confirm.
jg short .loop1_j
add eax, byte 16
cmp eax, ecx
jb short .loop1_i
.loop2_pre:
; Switch to the one-sample-at-a-time phase:
;   esi = current sample pointer, eax = its sample index,
;   ecx = data + 4 * data_len (end pointer)
mov ebx, eax
sub eax, esi
shr eax, 2
lea ecx, [esi + 4 * edi]
mov esi, ebx
.loop2_i:
movd mm0, [esi]
; ebx = min(data_len - index, lag) (unsigned): number of lags that still
; have a partner sample inside the buffer.
mov ebx, edi
sub ebx, eax
cmp ebx, edx
jbe short .loop2_j
mov ebx, edx
.loop2_j:
; accumulator[ebx] += sample * sample[ebx], for ebx counting down to 0.
; jnz tests the flags from dec; the instructions in between are
; MMX/3DNow! only.
dec ebx
movd mm1, [esi + 4 * ebx]
pfmul mm1, mm0
movd mm2, [esp + 4 * ebx]
pfadd mm1, mm2
movd [esp + 4 * ebx], mm1
jnz short .loop2_j
add esi, byte 4
inc eax
cmp esi, ecx
jnz short .loop2_i
; Copy the lag accumulators from the stack scratch area to autoc[].
mov edi, [ebp + 32]
.loop3:
dec edx
mov eax, [esp + 4 * edx]
mov [edi + 4 * edx], eax
jnz short .loop3
; Leave MMX/3DNow! state before returning.
femms
; Drop the scratch area and restore the saved registers.
mov esp, ebp
pop edi
pop esi
pop ebx
pop ebp
ret
;void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 data[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
;
; for(i = 0; i < data_len; i++) {
......
......@@ -33,13 +33,21 @@ typedef struct {
FLAC__bool fxsr;
FLAC__bool sse;
FLAC__bool sse2;
FLAC__bool _3dnow;
FLAC__bool ext3dnow;
FLAC__bool extmmx;
} FLAC__CPUInfo_IA32;
extern const unsigned FLAC__CPUINFO_IA32_CPUID_CMOV;
extern const unsigned FLAC__CPUINFO_IA32_CPUID_MMX;
extern const unsigned FLAC__CPUINFO_IA32_CPUID_FXSR;
extern const unsigned FLAC__CPUINFO_IA32_CPUID_SSE;
extern const unsigned FLAC__CPUINFO_IA32_CPUID_SSE2;
extern const unsigned FLAC__CPUINFO_IA32_CPUID_EXTENDED_AMD_3DNOW;
extern const unsigned FLAC__CPUINFO_IA32_CPUID_EXTENDED_AMD_EXT3DNOW;
extern const unsigned FLAC__CPUINFO_IA32_CPUID_EXTENDED_AMD_EXTMMX;
typedef struct {
FLAC__bool use_asm;
FLAC__CPUInfo_Type type;
......@@ -54,6 +62,7 @@ void FLAC__cpu_info(FLAC__CPUInfo *info);
#ifdef FLAC__CPU_IA32
#ifdef FLAC__HAS_NASM
unsigned FLAC__cpu_info_asm_ia32();
unsigned FLAC__cpu_info_extended_amd_asm_ia32();
#endif
#endif
#endif
......
......@@ -44,6 +44,7 @@ void FLAC__lpc_compute_autocorrelation_asm_ia32(const FLAC__real data[], unsigne
void FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
void FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
void FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
void FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
#endif
#endif
#endif
......
......@@ -375,7 +375,7 @@ FLAC__StreamEncoderState FLAC__stream_encoder_init(FLAC__StreamEncoder *encoder)
#ifdef FLAC__CPU_IA32
FLAC__ASSERT(encoder->private->cpuinfo.type == FLAC__CPUINFO_TYPE_IA32);
#ifdef FLAC__HAS_NASM
if(0 && encoder->private->cpuinfo.data.ia32.sse) { /* SSE version lacks necessary resolution, plus SSE flag doesn't check for OS support */
if(0 && encoder->private->cpuinfo.data.ia32.sse) { /*@@@ SSE version lacks necessary resolution, plus SSE flag doesn't check for OS support */
if(encoder->protected->max_lpc_order < 4)
encoder->private->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4;
else if(encoder->protected->max_lpc_order < 8)
......@@ -385,6 +385,8 @@ FLAC__StreamEncoderState FLAC__stream_encoder_init(FLAC__StreamEncoder *encoder)
else
encoder->private->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32;
}
else if(0 && encoder->private->cpuinfo.data.ia32._3dnow) /*@@@ turn back on in first beta after 1.0 */
encoder->private->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow;
else
encoder->private->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32;
if(encoder->private->cpuinfo.data.ia32.mmx && encoder->private->cpuinfo.data.ia32.cmov)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment