Commit 7e927893 authored by Erik de Castro Lopo
Browse files

libFLAC : Add asm versions for two _wide() functions.

GCC generates slow ia32 code for FLAC__lpc_restore_signal_wide() and
FLAC__lpc_compute_residual_from_qlp_coefficients_wide(), so 24-bit
encoding/decoding is slower when libFLAC is compiled with GCC than when it
is compiled with MSVS or ICC. This patch adds ia32 asm versions of these
functions.

Patch-from: lvqcl <lvqcl.mail@gmail.com>
parent 8e4a45ac
......@@ -43,8 +43,10 @@ cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16
cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow
cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32
cglobal FLAC__lpc_restore_signal_asm_ia32
cglobal FLAC__lpc_restore_signal_asm_ia32_mmx
cglobal FLAC__lpc_restore_signal_wide_asm_ia32
code_section
......@@ -1603,4 +1605,565 @@ cident FLAC__lpc_restore_signal_asm_ia32_mmx
pop ebp
ret
; **********************************************************************
;
;void FLAC__lpc_compute_residual_from_qlp_coefficients_wide(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
; {
; unsigned i, j;
; FLAC__int64 sum;
;
; FLAC__ASSERT(order > 0);
;
; for(i = 0; i < data_len; i++) {
; sum = 0;
; for(j = 0; j < order; j++)
; sum += qlp_coeff[j] * (FLAC__int64)data[i-j-1];
; residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
; }
; }
; ----------------------------------------------------------------------
; ia32 implementation of the C reference routine quoted in the comment
; directly above: for every sample i,
;   residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization)
; where sum is the 64-bit accumulation of qlp_coeff[j] * data[i-j-1]
; for j = 0 .. order-1.
;
; Arguments are read from the stack; the [esp + N] offsets listed below
; are the ones in effect after the four register pushes in the prologue.
;
; Registers: ebp, ebx, esi and edi are pushed on entry and popped at .end.
;            eax, ecx and edx are used as scratch and are not restored.
;
; Two code paths:
;   order <= 1 : tight loop that uses only qlp_coeff[0]   (.i_1_loop_i)
;   order >  1 : a 32-tap unrolled multiply-accumulate chain that is
;                entered part-way through with a computed jump so that
;                only the last `order` taps run per sample (.i_32)
;
; Side effect on the order > 1 path: the stack slots holding residual[]
; ([esp + 40]) and data_len ([esp + 24]) are overwritten in place.
; ----------------------------------------------------------------------
ALIGN 16
cident FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32
;[esp + 40] residual[]
;[esp + 36] lp_quantization
;[esp + 32] order
;[esp + 28] qlp_coeff[]
;[esp + 24] data_len
;[esp + 20] data[]
; The two ASSERT lines below are caller-side preconditions; this routine
; performs no runtime check of order.
;ASSERT(order > 0)
;ASSERT(order <= 32)
push ebp
push ebx
push esi
push edi
mov ebx, [esp + 24] ; ebx = data_len
test ebx, ebx
jz near .end ; do nothing if data_len == 0
.begin:
mov eax, [esp + 32] ; eax = order
cmp eax, 1
jg short .i_32 ; order > 1 (signed compare): use the unrolled chain
; ---- order <= 1: single-coefficient loop ----
mov esi, [esp + 40] ; esi = residual[]
mov edi, [esp + 20] ; edi = data[]
mov ecx, [esp + 28] ; ecx = qlp_coeff[]
mov ebp, [ecx] ; ebp = qlp_coeff[0]
mov eax, [edi - 4] ; eax = data[-1]
mov cl, [esp + 36] ; cl = lp_quantization (ecx no longer needed as the qlp_coeff pointer)
ALIGN 16
; Loop invariant: on entry to each iteration eax = data[i-1],
; ebx = samples remaining, esi -> residual[i], edi -> data[i].
.i_1_loop_i:
imul ebp ; edx:eax = qlp_coeff[0] * (FLAC__int64)data[i-1]
shrd eax, edx, cl ; eax = low 32 bits of (edx:eax >> cl); 0 <= lp_quantization <= 15
neg eax ; eax = -(sum >> lp_quantization)
add eax, [edi] ; eax = data[i] - (sum >> lp_quantization)
mov [esi], eax ; residual[i] = eax
mov eax, [edi] ; eax = data[i], i.e. data[i-1] for the next iteration
add esi, 4
add edi, 4
dec ebx
jnz .i_1_loop_i
jmp .end
; Helper: returns its own return address in eax, which gives the caller
; the run-time address of the instruction following the call.
.mov_eip_to_eax:
mov eax, [esp]
ret
; ---- order > 1: compute the entry point into the unrolled chain ----
.i_32: ; eax = order
neg eax ; eax = -order
add eax, eax ; eax = -2 * order
; eax + eax*4 = -10 * order: the arithmetic steps back 10 bytes per tap
; from .jumper_0 (this relies on every tap block assembling to 10 bytes,
; apart from the final one, which is corrected for by the inc below).
lea ebp, [eax + eax * 4 + .jumper_0 - .get_eip0]
call .mov_eip_to_eax ; eax = run-time address of .get_eip0
.get_eip0:
add ebp, eax ; ebp = absolute address of the first tap to execute
inc ebp ; compensate for the shorter opcode on the last iteration
mov ebx, [esp + 28] ; ebx = qlp_coeff[]
mov edi, [esp + 20] ; edi = data[]
sub [esp + 40], edi ; residual[] -= data[] (stack slot now holds the byte offset residual - data)
xor ecx, ecx ; sum (low dword) = 0
xor esi, esi ; sum (high dword) = 0
jmp ebp ; enter the chain `order` taps before .jumper_0
; Register state on entry to (and on every re-entry into) the chain:
;eax = --
;edx = --
;ecx = 0
;esi = 0
;
;ebx = qlp_coeff[]
;edi = data[]
;ebp = @address
; Each tap below: eax = qlp_coeff[k]; edx:eax = eax * data[i-k-1];
; esi:ecx += edx:eax. The code is only ever reached through `jmp ebp`.
mov eax, [ebx + 124] ; eax = qlp_coeff[31]
imul dword [edi - 128] ; edx:eax = qlp_coeff[31] * data[i-32]
add ecx, eax
adc esi, edx ; sum += qlp_coeff[31] * data[i-32]
mov eax, [ebx + 120] ; eax = qlp_coeff[30]
imul dword [edi - 124] ; edx:eax = qlp_coeff[30] * data[i-31]
add ecx, eax
adc esi, edx ; sum += qlp_coeff[30] * data[i-31]
mov eax, [ebx + 116]
imul dword [edi - 120]
add ecx, eax
adc esi, edx
mov eax, [ebx + 112]
imul dword [edi - 116]
add ecx, eax
adc esi, edx
mov eax, [ebx + 108]
imul dword [edi - 112]
add ecx, eax
adc esi, edx
mov eax, [ebx + 104]
imul dword [edi - 108]
add ecx, eax
adc esi, edx
mov eax, [ebx + 100]
imul dword [edi - 104]
add ecx, eax
adc esi, edx
mov eax, [ebx + 96]
imul dword [edi - 100]
add ecx, eax
adc esi, edx
mov eax, [ebx + 92]
imul dword [edi - 96]
add ecx, eax
adc esi, edx
mov eax, [ebx + 88]
imul dword [edi - 92]
add ecx, eax
adc esi, edx
mov eax, [ebx + 84]
imul dword [edi - 88]
add ecx, eax
adc esi, edx
mov eax, [ebx + 80]
imul dword [edi - 84]
add ecx, eax
adc esi, edx
mov eax, [ebx + 76]
imul dword [edi - 80]
add ecx, eax
adc esi, edx
mov eax, [ebx + 72]
imul dword [edi - 76]
add ecx, eax
adc esi, edx
mov eax, [ebx + 68]
imul dword [edi - 72]
add ecx, eax
adc esi, edx
mov eax, [ebx + 64]
imul dword [edi - 68]
add ecx, eax
adc esi, edx
mov eax, [ebx + 60]
imul dword [edi - 64]
add ecx, eax
adc esi, edx
mov eax, [ebx + 56]
imul dword [edi - 60]
add ecx, eax
adc esi, edx
mov eax, [ebx + 52]
imul dword [edi - 56]
add ecx, eax
adc esi, edx
mov eax, [ebx + 48]
imul dword [edi - 52]
add ecx, eax
adc esi, edx
mov eax, [ebx + 44]
imul dword [edi - 48]
add ecx, eax
adc esi, edx
mov eax, [ebx + 40]
imul dword [edi - 44]
add ecx, eax
adc esi, edx
mov eax, [ebx + 36]
imul dword [edi - 40]
add ecx, eax
adc esi, edx
mov eax, [ebx + 32]
imul dword [edi - 36]
add ecx, eax
adc esi, edx
mov eax, [ebx + 28]
imul dword [edi - 32]
add ecx, eax
adc esi, edx
mov eax, [ebx + 24]
imul dword [edi - 28]
add ecx, eax
adc esi, edx
mov eax, [ebx + 20]
imul dword [edi - 24]
add ecx, eax
adc esi, edx
mov eax, [ebx + 16]
imul dword [edi - 20]
add ecx, eax
adc esi, edx
mov eax, [ebx + 12]
imul dword [edi - 16]
add ecx, eax
adc esi, edx
mov eax, [ebx + 8]
imul dword [edi - 12]
add ecx, eax
adc esi, edx
mov eax, [ebx + 4]
imul dword [edi - 8]
add ecx, eax
adc esi, edx
; The load below has no displacement, so it is one byte shorter than the
; other taps; this is what the `inc ebp` in the setup code accounts for.
mov eax, [ebx] ; eax = qlp_coeff[ 0] (NOTE: one byte missing from instruction)
imul dword [edi - 4] ; edx:eax = qlp_coeff[ 0] * data[i- 1]
add ecx, eax
adc esi, edx ; sum += qlp_coeff[ 0] * data[i- 1]
; ---- end of chain: shift, form the residual, store, advance ----
.jumper_0:
mov edx, ecx ; free ecx so that cl can hold the shift count
;esi:edx = sum
mov ecx, [esp + 36] ; ecx = lp_quantization (shrd uses only cl)
shrd edx, esi, cl ; edx = (sum >> lp_quantization)
;eax = --
;ecx = --
;edx = sum >> lp_q
;esi = --
neg edx ; edx = -(sum >> lp_quantization)
mov eax, [esp + 40] ; residual[] - data[]
add edx, [edi] ; edx = data[i] - (sum >> lp_quantization)
mov [edi + eax], edx ; residual[i] = edx (edi + (residual - data) addresses residual[i])
add edi, 4 ; advance to data[i+1]
dec dword [esp + 24] ; the data_len stack slot doubles as the remaining-sample counter
jz short .end
xor ecx, ecx ; reset the 64-bit accumulator for the next sample
xor esi, esi
jmp ebp ; re-enter the chain at the same tap
.end:
pop edi
pop esi
pop ebx
pop ebp
ret
; **********************************************************************
;
; void FLAC__lpc_restore_signal_wide(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
; {
; unsigned i, j;
; FLAC__int64 sum;
;
; FLAC__ASSERT(order > 0);
;
; for(i = 0; i < data_len; i++) {
; sum = 0;
; for(j = 0; j < order; j++)
; sum += qlp_coeff[j] * (FLAC__int64)data[i-j-1];
; data[i] = residual[i] + (FLAC__int32)(sum >> lp_quantization);
; }
; }
; ----------------------------------------------------------------------
; ia32 implementation of the C reference routine quoted in the comment
; directly above: for every sample i,
;   data[i] = residual[i] + (FLAC__int32)(sum >> lp_quantization)
; where sum is the 64-bit accumulation of qlp_coeff[j] * data[i-j-1]
; for j = 0 .. order-1.
;
; Arguments are read from the stack; the [esp + N] offsets listed below
; are the ones in effect after the four register pushes in the prologue.
;
; Registers: ebp, ebx, esi and edi are pushed on entry and popped at .end.
;            eax, ecx and edx are used as scratch and are not restored.
;
; Two code paths:
;   order <= 1 : tight loop that uses only qlp_coeff[0]   (.x87_1_loop_i)
;   order >  1 : a 32-tap unrolled multiply-accumulate chain that is
;                entered part-way through with a computed jump so that
;                only the last `order` taps run per sample (.x87_32)
; Despite the ".x87_" label prefix, no x87 instructions are used here.
;
; Side effect on the order > 1 path: the stack slots holding residual[]
; ([esp + 20]) and data_len ([esp + 24]) are overwritten in place.
; ----------------------------------------------------------------------
ALIGN 16
cident FLAC__lpc_restore_signal_wide_asm_ia32
;[esp + 40] data[]
;[esp + 36] lp_quantization
;[esp + 32] order
;[esp + 28] qlp_coeff[]
;[esp + 24] data_len
;[esp + 20] residual[]
; The two ASSERT lines below are caller-side preconditions; this routine
; performs no runtime check of order.
;ASSERT(order > 0)
;ASSERT(order <= 32)
push ebp
push ebx
push esi
push edi
mov ebx, [esp + 24] ; ebx = data_len
test ebx, ebx
jz near .end ; do nothing if data_len == 0
.begin:
mov eax, [esp + 32] ; eax = order
cmp eax, 1
jg short .x87_32 ; order > 1 (signed compare): use the unrolled chain
; ---- order <= 1: single-coefficient loop ----
mov esi, [esp + 20] ; esi = residual[]
mov edi, [esp + 40] ; edi = data[]
mov ecx, [esp + 28] ; ecx = qlp_coeff[]
mov ebp, [ecx] ; ebp = qlp_coeff[0]
mov eax, [edi - 4] ; eax = data[-1]
mov cl, [esp + 36] ; cl = lp_quantization (ecx no longer needed as the qlp_coeff pointer)
ALIGN 16
; Loop invariant: on entry to each iteration eax = data[i-1],
; ebx = samples remaining, esi -> residual[i], edi -> data[i].
.x87_1_loop_i:
imul ebp ; edx:eax = qlp_coeff[0] * (FLAC__int64)data[i-1]
shrd eax, edx, cl ; eax = low 32 bits of (edx:eax >> cl); 0 <= lp_quantization <= 15
;
add eax, [esi] ; eax = residual[i] + (sum >> lp_quantization)
mov [edi], eax ; data[i] = eax
; eax still holds data[i], which is the data[i-1] operand of the next
; iteration, so no reload is needed here.
add esi, 4
add edi, 4
dec ebx
jnz .x87_1_loop_i
jmp .end
; Helper: returns its own return address in eax, which gives the caller
; the run-time address of the instruction following the call.
.mov_eip_to_eax:
mov eax, [esp]
ret
; ---- order > 1: compute the entry point into the unrolled chain ----
.x87_32: ; eax = order
neg eax ; eax = -order
add eax, eax ; eax = -2 * order
; eax + eax*4 = -10 * order: the arithmetic steps back 10 bytes per tap
; from .jumper_0 (this relies on every tap block assembling to 10 bytes,
; apart from the final one, which is corrected for by the inc below).
lea ebp, [eax + eax * 4 + .jumper_0 - .get_eip0]
call .mov_eip_to_eax ; eax = run-time address of .get_eip0
.get_eip0:
add ebp, eax ; ebp = absolute address of the first tap to execute
inc ebp ; compensate for the shorter opcode on the last iteration
mov ebx, [esp + 28] ; ebx = qlp_coeff[]
mov edi, [esp + 40] ; edi = data[]
sub [esp + 20], edi ; residual[] -= data[] (stack slot now holds the byte offset residual - data)
xor ecx, ecx ; sum (low dword) = 0
xor esi, esi ; sum (high dword) = 0
jmp ebp ; enter the chain `order` taps before .jumper_0
; Register state on entry to (and on every re-entry into) the chain:
;eax = --
;edx = --
;ecx = 0
;esi = 0
;
;ebx = qlp_coeff[]
;edi = data[]
;ebp = @address
; Each tap below: eax = qlp_coeff[k]; edx:eax = eax * data[i-k-1];
; esi:ecx += edx:eax. The code is only ever reached through `jmp ebp`.
mov eax, [ebx + 124] ; eax = qlp_coeff[31]
imul dword [edi - 128] ; edx:eax = qlp_coeff[31] * data[i-32]
add ecx, eax
adc esi, edx ; sum += qlp_coeff[31] * data[i-32]
mov eax, [ebx + 120] ; eax = qlp_coeff[30]
imul dword [edi - 124] ; edx:eax = qlp_coeff[30] * data[i-31]
add ecx, eax
adc esi, edx ; sum += qlp_coeff[30] * data[i-31]
mov eax, [ebx + 116]
imul dword [edi - 120]
add ecx, eax
adc esi, edx
mov eax, [ebx + 112]
imul dword [edi - 116]
add ecx, eax
adc esi, edx
mov eax, [ebx + 108]
imul dword [edi - 112]
add ecx, eax
adc esi, edx
mov eax, [ebx + 104]
imul dword [edi - 108]
add ecx, eax
adc esi, edx
mov eax, [ebx + 100]
imul dword [edi - 104]
add ecx, eax
adc esi, edx
mov eax, [ebx + 96]
imul dword [edi - 100]
add ecx, eax
adc esi, edx
mov eax, [ebx + 92]
imul dword [edi - 96]
add ecx, eax
adc esi, edx
mov eax, [ebx + 88]
imul dword [edi - 92]
add ecx, eax
adc esi, edx
mov eax, [ebx + 84]
imul dword [edi - 88]
add ecx, eax
adc esi, edx
mov eax, [ebx + 80]
imul dword [edi - 84]
add ecx, eax
adc esi, edx
mov eax, [ebx + 76]
imul dword [edi - 80]
add ecx, eax
adc esi, edx
mov eax, [ebx + 72]
imul dword [edi - 76]
add ecx, eax
adc esi, edx
mov eax, [ebx + 68]
imul dword [edi - 72]
add ecx, eax
adc esi, edx
mov eax, [ebx + 64]
imul dword [edi - 68]
add ecx, eax
adc esi, edx
mov eax, [ebx + 60]
imul dword [edi - 64]
add ecx, eax
adc esi, edx
mov eax, [ebx + 56]
imul dword [edi - 60]
add ecx, eax
adc esi, edx
mov eax, [ebx + 52]
imul dword [edi - 56]
add ecx, eax
adc esi, edx
mov eax, [ebx + 48]
imul dword [edi - 52]
add ecx, eax
adc esi, edx
mov eax, [ebx + 44]
imul dword [edi - 48]
add ecx, eax
adc esi, edx
mov eax, [ebx + 40]
imul dword [edi - 44]
add ecx, eax
adc esi, edx
mov eax, [ebx + 36]
imul dword [edi - 40]
add ecx, eax
adc esi, edx
mov eax, [ebx + 32]
imul dword [edi - 36]
add ecx, eax
adc esi, edx
mov eax, [ebx + 28]
imul dword [edi - 32]
add ecx, eax
adc esi, edx
mov eax, [ebx + 24]
imul dword [edi - 28]
add ecx, eax
adc esi, edx
mov eax, [ebx + 20]
imul dword [edi - 24]
add ecx, eax
adc esi, edx
mov eax, [ebx + 16]
imul dword [edi - 20]
add ecx, eax
adc esi, edx
mov eax, [ebx + 12]
imul dword [edi - 16]
add ecx, eax
adc esi, edx
mov eax, [ebx + 8]
imul dword [edi - 12]
add ecx, eax
adc esi, edx
mov eax, [ebx + 4]
imul dword [edi - 8]
add ecx, eax
adc esi, edx
; The load below has no displacement, so it is one byte shorter than the
; other taps; this is what the `inc ebp` in the setup code accounts for.
mov eax, [ebx] ; eax = qlp_coeff[ 0] (NOTE: one byte missing from instruction)
imul dword [edi - 4] ; edx:eax = qlp_coeff[ 0] * data[i- 1]
add ecx, eax
adc esi, edx ; sum += qlp_coeff[ 0] * data[i- 1]
; ---- end of chain: shift, add the residual, store, advance ----
.jumper_0:
mov edx, ecx ; free ecx so that cl can hold the shift count
;esi:edx = sum
mov ecx, [esp + 36] ; ecx = lp_quantization (shrd uses only cl)
shrd edx, esi, cl ; edx = (sum >> lp_quantization)
;eax = --
;ecx = --
;edx = sum >> lp_q
;esi = --
;
mov eax, [esp + 20] ; residual[] - data[]
add edx, [edi + eax] ; edx = residual[i] + (sum >> lp_quantization)
mov [edi], edx ; data[i] = residual[i] + (sum >> lp_quantization)
add edi, 4 ; advance to data[i+1]
dec dword [esp + 24] ; the data_len stack slot doubles as the remaining-sample counter
jz short .end
xor ecx, ecx ; reset the 64-bit accumulator for the next sample
xor esi, esi
jmp ebp ; re-enter the chain at the same tap
.end:
pop edi
pop esi
pop ebx
pop ebp
ret
; end
......@@ -152,6 +152,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide(const FLAC__int32 *da
# ifdef FLAC__HAS_NASM
void FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]);
void FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]);
void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]);
# endif
# endif
# if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
......@@ -187,6 +188,7 @@ void FLAC__lpc_restore_signal_wide(const FLAC__int32 residual[], unsigned data_l
# ifdef FLAC__HAS_NASM
void FLAC__lpc_restore_signal_asm_ia32(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
void FLAC__lpc_restore_signal_asm_ia32_mmx(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
void FLAC__lpc_restore_signal_wide_asm_ia32(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
# endif /* FLAC__HAS_NASM */
# elif defined FLAC__CPU_PPC
void FLAC__lpc_restore_signal_asm_ppc_altivec_16(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
......
......@@ -404,6 +404,7 @@ static FLAC__StreamDecoderInitStatus init_stream_internal_(
if(decoder->private_->cpuinfo.ia32.bswap)
decoder->private_->local_bitreader_read_rice_signed_block = FLAC__bitreader_read_rice_signed_block_asm_ia32_bswap;
#endif
decoder->private_->local_lpc_restore_signal_64bit = FLAC__lpc_restore_signal_wide_asm_ia32;
if(decoder->private_->cpuinfo.ia32.mmx) {
decoder->private_->local_lpc_restore_signal = FLAC__lpc_restore_signal_asm_ia32;
decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal_asm_ia32_mmx;
......@@ -416,7 +417,7 @@ static FLAC__StreamDecoderInitStatus init_stream_internal_(
}
#endif
#ifdef FLAC__HAS_X86INTRIN
# ifdef FLAC__SSE4_SUPPORTED
# if defined FLAC__SSE4_SUPPORTED && 0 /* now we have FLAC__lpc_restore_signal_wide_asm_ia32() which is slightly faster */
if(decoder->private_->cpuinfo.ia32.sse41)
decoder->private_->local_lpc_restore_signal_64bit = FLAC__lpc_restore_signal_wide_intrin_sse41;