Commit fa85e3ce authored by Erik de Castro Lopo

lpc_intrin* : Remove unused code.

This in turn simplifies the FLAC__lpc_restore_signal_16_intrin_sse2() function.

Patch-from: lvqcl <lvqcl.mail@gmail.com>
parent 71875b0c
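
For context, the SSE2 kernels touched here all vectorize the same plain LPC prediction loop. Below is a minimal scalar sketch of that loop; it is an assumed form that mirrors the RESIDUAL32_RESULT/DATA32_RESULT macros in the second hunk, not FLAC's actual reference implementation. The 16-bit paths in the removed code additionally rely on RESIDUAL16_RESULT/DATA16_RESULT macros defined earlier in the same file (outside this hunk), which also keep the most recent sample in curr for the _mm_insert_epi16 calls.

/* Scalar sketch of the LPC kernels the SSE2 code below vectorizes.
 * Assumed form only -- it mirrors the RESIDUAL/DATA result macros in the hunk,
 * not FLAC's actual reference implementation.
 * data[] is assumed to be preceded by at least `order` warm-up samples
 * (the SSE2 code likewise reads data[-1] .. data[-order]). */
#include <stdint.h>

typedef int32_t FLAC__int32;

static void lpc_compute_residual_sketch(const FLAC__int32 *data, unsigned data_len,
                                        const FLAC__int32 qlp_coeff[], unsigned order,
                                        int lp_quantization, FLAC__int32 residual[])
{
    for(unsigned i = 0; i < data_len; i++) {
        FLAC__int32 sum = 0; /* the "_16" variants assume the accumulator fits in 32 bits */
        for(unsigned j = 0; j < order; j++)
            sum += qlp_coeff[j] * data[(int)i - (int)j - 1];
        residual[i] = data[i] - (sum >> lp_quantization); /* cf. RESIDUAL32_RESULT */
    }
}

static void lpc_restore_signal_sketch(const FLAC__int32 residual[], unsigned data_len,
                                      const FLAC__int32 qlp_coeff[], unsigned order,
                                      int lp_quantization, FLAC__int32 data[])
{
    for(unsigned i = 0; i < data_len; i++) {
        FLAC__int32 sum = 0;
        for(unsigned j = 0; j < order; j++)
            sum += qlp_coeff[j] * data[(int)i - (int)j - 1];
        data[i] = residual[i] + (sum >> lp_quantization); /* cf. DATA32_RESULT */
    }
}
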
@@ -200,7 +200,6 @@ void FLAC__lpc_restore_signal_wide_asm_ia32(const FLAC__int32 residual[], unsign
# if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
# ifdef FLAC__SSE2_SUPPORTED
void FLAC__lpc_restore_signal_16_intrin_sse2(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
void FLAC__lpc_restore_signal_intrin_sse2(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
# endif
# ifdef FLAC__SSE4_1_SUPPORTED
void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
@@ -51,713 +51,6 @@
#define RESIDUAL32_RESULT(xmmN) residual[i] = data[i] - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
#define DATA32_RESULT(xmmN) data[i] = residual[i] + (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
#if 0
FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
int i;
FLAC__int32 sum;
FLAC__ASSERT(order > 0);
FLAC__ASSERT(order <= 32);
FLAC__ASSERT(data_len > 0);
if(order <= 12) {
FLAC__int32 curr;
if(order > 8) { /* order == 9, 10, 11, 12 */
#ifdef FLAC__CPU_IA32 /* 8 XMM registers available */
int r;
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
xmm6 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+8)); /* read 0 to 3 uninitialized coeffs... */
switch(order) /* ...and zero them out */
{
case 9:
xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12); break;
case 10:
xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8); break;
case 11:
xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4); break;
}
xmm2 = _mm_setzero_si128();
xmm0 = _mm_packs_epi32(xmm0, xmm6);
xmm1 = _mm_packs_epi32(xmm1, xmm2);
xmm4 = _mm_loadu_si128((const __m128i*)(data-12));
xmm5 = _mm_loadu_si128((const __m128i*)(data-8));
xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(0,1,2,3));
xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(0,1,2,3));
xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
xmm4 = _mm_packs_epi32(xmm4, xmm2);
xmm3 = _mm_packs_epi32(xmm3, xmm5);
xmm7 = _mm_slli_si128(xmm1, 2);
xmm7 = _mm_or_si128(xmm7, _mm_srli_si128(xmm0, 14));
xmm2 = _mm_slli_si128(xmm0, 2);
/* xmm0, xmm1: qlp_coeff
xmm2, xmm7: qlp_coeff << 16 bit
xmm3, xmm4: data */
xmm6 = xmm4;
xmm6 = _mm_madd_epi16(xmm6, xmm1);
xmm5 = xmm3;
xmm5 = _mm_madd_epi16(xmm5, xmm0);
xmm6 = _mm_add_epi32(xmm6, xmm5);
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
RESIDUAL16_RESULT(xmm6);
data_len--;
r = data_len % 2;
if(r) {
xmm4 = _mm_slli_si128(xmm4, 2);
xmm6 = xmm3;
xmm3 = _mm_slli_si128(xmm3, 2);
xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 14));
xmm3 = _mm_insert_epi16(xmm3, curr, 0);
xmm6 = xmm4;
xmm6 = _mm_madd_epi16(xmm6, xmm1);
xmm5 = xmm3;
xmm5 = _mm_madd_epi16(xmm5, xmm0);
xmm6 = _mm_add_epi32(xmm6, xmm5);
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
RESIDUAL16_RESULT(xmm6);
data_len--;
}
while(data_len) { /* data_len is a multiple of 2 */
/* 1 _mm_slli_si128 per data element less but we need shifted qlp_coeff in xmm2:xmm7 */
xmm4 = _mm_slli_si128(xmm4, 4);
xmm6 = xmm3;
xmm3 = _mm_slli_si128(xmm3, 4);
xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 12));
xmm3 = _mm_insert_epi16(xmm3, curr, 1);
xmm6 = xmm4;
xmm6 = _mm_madd_epi16(xmm6, xmm7);
xmm5 = xmm3;
xmm5 = _mm_madd_epi16(xmm5, xmm2);
xmm6 = _mm_add_epi32(xmm6, xmm5);
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
RESIDUAL16_RESULT(xmm6);
xmm3 = _mm_insert_epi16(xmm3, curr, 0);
xmm6 = xmm4;
xmm6 = _mm_madd_epi16(xmm6, xmm1);
xmm5 = xmm3;
xmm5 = _mm_madd_epi16(xmm5, xmm0);
xmm6 = _mm_add_epi32(xmm6, xmm5);
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
RESIDUAL16_RESULT(xmm6);
data_len-=2;
}
#else /* 16 XMM registers available */
int r;
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmmA, xmmB;
xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
xmm6 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+8)); /* read 0 to 3 uninitialized coeffs... */
switch(order) /* ...and zero them out */
{
case 9:
xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12); break;
case 10:
xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8); break;
case 11:
xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4); break;
}
xmm2 = _mm_setzero_si128();
xmm0 = _mm_packs_epi32(xmm0, xmm6);
xmm1 = _mm_packs_epi32(xmm1, xmm2);
xmm4 = _mm_loadu_si128((const __m128i*)(data-12));
xmm5 = _mm_loadu_si128((const __m128i*)(data-8));
xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(0,1,2,3));
xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(0,1,2,3));
xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
xmm4 = _mm_packs_epi32(xmm4, xmm2);
xmm3 = _mm_packs_epi32(xmm3, xmm5);
xmm7 = _mm_slli_si128(xmm1, 2);
xmm7 = _mm_or_si128(xmm7, _mm_srli_si128(xmm0, 14));
xmm2 = _mm_slli_si128(xmm0, 2);
xmm9 = _mm_slli_si128(xmm1, 4);
xmm9 = _mm_or_si128(xmm9, _mm_srli_si128(xmm0, 12));
xmm8 = _mm_slli_si128(xmm0, 4);
xmmB = _mm_slli_si128(xmm1, 6);
xmmB = _mm_or_si128(xmmB, _mm_srli_si128(xmm0, 10));
xmmA = _mm_slli_si128(xmm0, 6);
/* xmm0, xmm1: qlp_coeff
xmm2, xmm7: qlp_coeff << 16 bit
xmm8, xmm9: qlp_coeff << 2*16 bit
xmmA, xmmB: qlp_coeff << 3*16 bit
xmm3, xmm4: data */
xmm6 = xmm4;
xmm6 = _mm_madd_epi16(xmm6, xmm1);
xmm5 = xmm3;
xmm5 = _mm_madd_epi16(xmm5, xmm0);
xmm6 = _mm_add_epi32(xmm6, xmm5);
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
RESIDUAL16_RESULT(xmm6);
data_len--;
r = data_len % 4;
while(r) {
xmm4 = _mm_slli_si128(xmm4, 2);
xmm6 = xmm3;
xmm3 = _mm_slli_si128(xmm3, 2);
xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 14));
xmm3 = _mm_insert_epi16(xmm3, curr, 0);
xmm6 = xmm4;
xmm6 = _mm_madd_epi16(xmm6, xmm1);
xmm5 = xmm3;
xmm5 = _mm_madd_epi16(xmm5, xmm0);
xmm6 = _mm_add_epi32(xmm6, xmm5);
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
RESIDUAL16_RESULT(xmm6);
data_len--; r--;
}
while(data_len) { /* data_len is a multiple of 4 */
xmm4 = _mm_slli_si128(xmm4, 8);
xmm6 = xmm3;
xmm3 = _mm_slli_si128(xmm3, 8);
xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 8));
xmm3 = _mm_insert_epi16(xmm3, curr, 3);
xmm6 = xmm4;
xmm6 = _mm_madd_epi16(xmm6, xmmB);
xmm5 = xmm3;
xmm5 = _mm_madd_epi16(xmm5, xmmA);
xmm6 = _mm_add_epi32(xmm6, xmm5);
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
RESIDUAL16_RESULT(xmm6);
xmm3 = _mm_insert_epi16(xmm3, curr, 2);
xmm6 = xmm4;
xmm6 = _mm_madd_epi16(xmm6, xmm9);
xmm5 = xmm3;
xmm5 = _mm_madd_epi16(xmm5, xmm8);
xmm6 = _mm_add_epi32(xmm6, xmm5);
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
RESIDUAL16_RESULT(xmm6);
xmm3 = _mm_insert_epi16(xmm3, curr, 1);
xmm6 = xmm4;
xmm6 = _mm_madd_epi16(xmm6, xmm7);
xmm5 = xmm3;
xmm5 = _mm_madd_epi16(xmm5, xmm2);
xmm6 = _mm_add_epi32(xmm6, xmm5);
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
RESIDUAL16_RESULT(xmm6);
xmm3 = _mm_insert_epi16(xmm3, curr, 0);
xmm6 = xmm4;
xmm6 = _mm_madd_epi16(xmm6, xmm1);
xmm5 = xmm3;
xmm5 = _mm_madd_epi16(xmm5, xmm0);
xmm6 = _mm_add_epi32(xmm6, xmm5);
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
RESIDUAL16_RESULT(xmm6);
data_len-=4;
}
#endif
} /* endif(order > 8) */
else if(order > 4) { /* order == 5, 6, 7, 8 */
if(order > 6) { /* order == 7, 8 */
if(order == 8) {
__m128i xmm0, xmm1, xmm3, xmm6;
xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
xmm0 = _mm_packs_epi32(xmm0, xmm1);
xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
xmm3 = _mm_packs_epi32(xmm3, xmm1);
/* xmm0: qlp_coeff
xmm3: data */
xmm6 = xmm3;
xmm6 = _mm_madd_epi16(xmm6, xmm0);
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
RESIDUAL16_RESULT(xmm6);
data_len--;
while(data_len) {
xmm3 = _mm_slli_si128(xmm3, 2);
xmm3 = _mm_insert_epi16(xmm3, curr, 0);
xmm6 = xmm3;
xmm6 = _mm_madd_epi16(xmm6, xmm0);
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
RESIDUAL16_RESULT(xmm6);
data_len--;
}
}
else { /* order == 7 */
int r;
__m128i xmm0, xmm1, xmm2, xmm3, xmm6;
xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4);
xmm0 = _mm_packs_epi32(xmm0, xmm1);
xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
xmm3 = _mm_packs_epi32(xmm3, xmm1);
xmm2 = _mm_slli_si128(xmm0, 2);
/* xmm0: qlp_coeff
xmm2: qlp_coeff << 16 bit
xmm3: data */
xmm6 = xmm3;
xmm6 = _mm_madd_epi16(xmm6, xmm0);
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
RESIDUAL16_RESULT(xmm6);
data_len--;
r = data_len % 2;
if(r) {
xmm3 = _mm_slli_si128(xmm3, 2);
xmm3 = _mm_insert_epi16(xmm3, curr, 0);
xmm6 = xmm3;
xmm6 = _mm_madd_epi16(xmm6, xmm0);
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
RESIDUAL16_RESULT(xmm6);
data_len--;
}
while(data_len) { /* data_len is a multiple of 2 */
xmm3 = _mm_slli_si128(xmm3, 4);
xmm3 = _mm_insert_epi16(xmm3, curr, 1);
xmm6 = xmm3;
xmm6 = _mm_madd_epi16(xmm6, xmm2);
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
RESIDUAL16_RESULT(xmm6);
xmm3 = _mm_insert_epi16(xmm3, curr, 0);
xmm6 = xmm3;
xmm6 = _mm_madd_epi16(xmm6, xmm0);
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
RESIDUAL16_RESULT(xmm6);
data_len-=2;
}
}
}
else { /* order == 5, 6 */
if(order == 6) {
int r;
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6;
xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8);
xmm0 = _mm_packs_epi32(xmm0, xmm1);
xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
xmm3 = _mm_packs_epi32(xmm3, xmm1);
xmm2 = _mm_slli_si128(xmm0, 2);
xmm4 = _mm_slli_si128(xmm0, 4);
/* xmm0: qlp_coeff
xmm2: qlp_coeff << 16 bit
xmm4: qlp_coeff << 2*16 bit
xmm3: data */
xmm6 = xmm3;
xmm6 = _mm_madd_epi16(xmm6, xmm0);
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
RESIDUAL16_RESULT(xmm6);
data_len--;
r = data_len % 3;
while(r) {
xmm3 = _mm_slli_si128(xmm3, 2);
xmm3 = _mm_insert_epi16(xmm3, curr, 0);
xmm6 = xmm3;
xmm6 = _mm_madd_epi16(xmm6, xmm0);
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
RESIDUAL16_RESULT(xmm6);
data_len--; r--;
}
while(data_len) { /* data_len is a multiple of 3 */
xmm3 = _mm_slli_si128(xmm3, 6);
xmm3 = _mm_insert_epi16(xmm3, curr, 2);
xmm6 = xmm3;
xmm6 = _mm_madd_epi16(xmm6, xmm4);
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
RESIDUAL16_RESULT(xmm6);
xmm3 = _mm_insert_epi16(xmm3, curr, 1);
xmm6 = xmm3;
xmm6 = _mm_madd_epi16(xmm6, xmm2);
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
RESIDUAL16_RESULT(xmm6);
xmm3 = _mm_insert_epi16(xmm3, curr, 0);
xmm6 = xmm3;
xmm6 = _mm_madd_epi16(xmm6, xmm0);
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
RESIDUAL16_RESULT(xmm6);
data_len-=3;
}
}
else { /* order == 5 */
int r;
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12);
xmm0 = _mm_packs_epi32(xmm0, xmm1);
xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
xmm3 = _mm_packs_epi32(xmm3, xmm1);
xmm2 = _mm_slli_si128(xmm0, 2);
xmm4 = _mm_slli_si128(xmm0, 4);
xmm5 = _mm_slli_si128(xmm0, 6);
/* xmm0: qlp_coeff
xmm2: qlp_coeff << 16 bit
xmm4: qlp_coeff << 2*16 bit
xmm5: qlp_coeff << 3*16 bit
xmm3: data */
xmm6 = xmm3;
xmm6 = _mm_madd_epi16(xmm6, xmm0);
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
RESIDUAL16_RESULT(xmm6);
data_len--;
r = data_len % 4;
while(r) {
xmm3 = _mm_slli_si128(xmm3, 2);
xmm3 = _mm_insert_epi16(xmm3, curr, 0);
xmm6 = xmm3;
xmm6 = _mm_madd_epi16(xmm6, xmm0);
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
RESIDUAL16_RESULT(xmm6);
data_len--; r--;
}
while(data_len) { /* data_len is a multiple of 4 */
xmm3 = _mm_slli_si128(xmm3, 8);
xmm3 = _mm_insert_epi16(xmm3, curr, 3);
xmm6 = xmm3;
xmm6 = _mm_madd_epi16(xmm6, xmm5);
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
RESIDUAL16_RESULT(xmm6);
xmm3 = _mm_insert_epi16(xmm3, curr, 2);
xmm6 = xmm3;
xmm6 = _mm_madd_epi16(xmm6, xmm4);
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
RESIDUAL16_RESULT(xmm6);
xmm3 = _mm_insert_epi16(xmm3, curr, 1);
xmm6 = xmm3;
xmm6 = _mm_madd_epi16(xmm6, xmm2);
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
RESIDUAL16_RESULT(xmm6);
xmm3 = _mm_insert_epi16(xmm3, curr, 0);
xmm6 = xmm3;
xmm6 = _mm_madd_epi16(xmm6, xmm0);
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
RESIDUAL16_RESULT(xmm6);
data_len-=4;
}
}
}
}
else { /* order == 1, 2, 3, 4 */
if(order > 2) {
if(order == 4) {
__m128i xmm0, xmm3, xmm6;
xmm6 = _mm_setzero_si128();
xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
xmm0 = _mm_packs_epi32(xmm0, xmm6);
xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
xmm3 = _mm_packs_epi32(xmm3, xmm6);
/* xmm0: qlp_coeff
xmm3: data */
xmm6 = xmm3;
xmm6 = _mm_madd_epi16(xmm6, xmm0);
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
RESIDUAL16_RESULT(xmm6);
data_len--;
while(data_len) {
xmm3 = _mm_slli_si128(xmm3, 2);
xmm3 = _mm_insert_epi16(xmm3, curr, 0);
xmm6 = xmm3;
xmm6 = _mm_madd_epi16(xmm6, xmm0);
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
RESIDUAL16_RESULT(xmm6);
data_len--;
}
}
else { /* order == 3 */
int r;
__m128i xmm0, xmm1, xmm3, xmm6;
xmm6 = _mm_setzero_si128();
xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
xmm0 = _mm_slli_si128(xmm0, 4); xmm0 = _mm_srli_si128(xmm0, 4);
xmm0 = _mm_packs_epi32(xmm0, xmm6);
xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
xmm3 = _mm_packs_epi32(xmm3, xmm6);
xmm1 = _mm_slli_si128(xmm0, 2);
/* xmm0: qlp_coeff
xmm1: qlp_coeff << 16 bit
xmm3: data */
xmm6 = xmm3;
xmm6 = _mm_madd_epi16(xmm6, xmm0);
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
RESIDUAL16_RESULT(xmm6);
data_len--;
r = data_len % 2;
if(r) {
xmm3 = _mm_slli_si128(xmm3, 2);
xmm3 = _mm_insert_epi16(xmm3, curr, 0);
xmm6 = xmm3;
xmm6 = _mm_madd_epi16(xmm6, xmm0);
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
RESIDUAL16_RESULT(xmm6);
data_len--;
}
while(data_len) { /* data_len is a multiple of 2 */
xmm3 = _mm_slli_si128(xmm3, 4);
xmm3 = _mm_insert_epi16(xmm3, curr, 1);