Commit 421961f0, authored by lvqcl, committed by Erik de Castro Lopo

Replace hadd with shuffle + add

parent faafa4c8
...@@ -980,8 +980,8 @@ void FLAC__lpc_restore_signal_intrin_sse41(const FLAC__int32 residual[], uint32_ ...@@ -980,8 +980,8 @@ void FLAC__lpc_restore_signal_intrin_sse41(const FLAC__int32 residual[], uint32_
summ = _mm_add_epi32(summ, _mm_mullo_epi32(dat[1], qlp[1])); summ = _mm_add_epi32(summ, _mm_mullo_epi32(dat[1], qlp[1]));
summ = _mm_add_epi32(summ, _mm_mullo_epi32(dat[0], qlp[0])); summ = _mm_add_epi32(summ, _mm_mullo_epi32(dat[0], qlp[0]));
summ = _mm_hadd_epi32(summ, summ); summ = _mm_add_epi32(summ, _mm_shuffle_epi32(summ, _MM_SHUFFLE(1,0,3,2)));
summ = _mm_hadd_epi32(summ, summ); summ = _mm_add_epi32(summ, _mm_shufflelo_epi16(summ, _MM_SHUFFLE(1,0,3,2)));
summ = _mm_sra_epi32(summ, cnt); summ = _mm_sra_epi32(summ, cnt);
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ); temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
...@@ -1009,8 +1009,8 @@ void FLAC__lpc_restore_signal_intrin_sse41(const FLAC__int32 residual[], uint32_ ...@@ -1009,8 +1009,8 @@ void FLAC__lpc_restore_signal_intrin_sse41(const FLAC__int32 residual[], uint32_
for (i = 0;;) { for (i = 0;;) {
summ = _mm_add_epi32(_mm_mullo_epi32(dat[1], qlp[1]), _mm_mullo_epi32(dat[0], qlp[0])); summ = _mm_add_epi32(_mm_mullo_epi32(dat[1], qlp[1]), _mm_mullo_epi32(dat[0], qlp[0]));
summ = _mm_hadd_epi32(summ, summ); summ = _mm_add_epi32(summ, _mm_shuffle_epi32(summ, _MM_SHUFFLE(1,0,3,2)));
summ = _mm_hadd_epi32(summ, summ); summ = _mm_add_epi32(summ, _mm_shufflelo_epi16(summ, _MM_SHUFFLE(1,0,3,2)));
summ = _mm_sra_epi32(summ, cnt); summ = _mm_sra_epi32(summ, cnt);
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ); temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
...@@ -1079,8 +1079,8 @@ void FLAC__lpc_restore_signal_16_intrin_sse41(const FLAC__int32 residual[], uint ...@@ -1079,8 +1079,8 @@ void FLAC__lpc_restore_signal_16_intrin_sse41(const FLAC__int32 residual[], uint
summ = _mm_madd_epi16(dat[1], qlp[1]); summ = _mm_madd_epi16(dat[1], qlp[1]);
summ = _mm_add_epi32(summ, _mm_madd_epi16(dat[0], qlp[0])); summ = _mm_add_epi32(summ, _mm_madd_epi16(dat[0], qlp[0]));
summ = _mm_hadd_epi32(summ, summ); summ = _mm_add_epi32(summ, _mm_shuffle_epi32(summ, _MM_SHUFFLE(1,0,3,2)));
summ = _mm_hadd_epi32(summ, summ); summ = _mm_add_epi32(summ, _mm_shufflelo_epi16(summ, _MM_SHUFFLE(1,0,3,2)));
summ = _mm_sra_epi32(summ, cnt); summ = _mm_sra_epi32(summ, cnt);
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ); temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
...@@ -1109,8 +1109,8 @@ void FLAC__lpc_restore_signal_16_intrin_sse41(const FLAC__int32 residual[], uint ...@@ -1109,8 +1109,8 @@ void FLAC__lpc_restore_signal_16_intrin_sse41(const FLAC__int32 residual[], uint
for(i = 0;;) { for(i = 0;;) {
summ = _mm_madd_epi16(dat0, qlp0); summ = _mm_madd_epi16(dat0, qlp0);
summ = _mm_hadd_epi32(summ, summ); summ = _mm_add_epi32(summ, _mm_shuffle_epi32(summ, _MM_SHUFFLE(1,0,3,2)));
summ = _mm_hadd_epi32(summ, summ); summ = _mm_add_epi32(summ, _mm_shufflelo_epi16(summ, _MM_SHUFFLE(1,0,3,2)));
summ = _mm_sra_epi32(summ, cnt); summ = _mm_sra_epi32(summ, cnt);
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ); temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
......
...@@ -83,8 +83,8 @@ void FLAC__precompute_partition_info_sums_intrin_avx2(const FLAC__int32 residual ...@@ -83,8 +83,8 @@ void FLAC__precompute_partition_info_sums_intrin_avx2(const FLAC__int32 residual
sum128 = _mm_add_epi32(sum128, res128); sum128 = _mm_add_epi32(sum128, res128);
} }
sum128 = _mm_hadd_epi32(sum128, sum128); sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_SHUFFLE(1,0,3,2)));
sum128 = _mm_hadd_epi32(sum128, sum128); sum128 = _mm_add_epi32(sum128, _mm_shufflelo_epi16(sum128, _MM_SHUFFLE(1,0,3,2)));
abs_residual_partition_sums[partition] = (FLAC__uint32)_mm_cvtsi128_si32(sum128); abs_residual_partition_sums[partition] = (FLAC__uint32)_mm_cvtsi128_si32(sum128);
/* workaround for MSVC bugs (at least versions 2015 and 2017 are affected) */ /* workaround for MSVC bugs (at least versions 2015 and 2017 are affected) */
#if (defined _MSC_VER) && (defined FLAC__CPU_X86_64) #if (defined _MSC_VER) && (defined FLAC__CPU_X86_64)
......
...@@ -97,8 +97,8 @@ void FLAC__precompute_partition_info_sums_intrin_sse2(const FLAC__int32 residual ...@@ -97,8 +97,8 @@ void FLAC__precompute_partition_info_sums_intrin_sse2(const FLAC__int32 residual
mm_sum = _mm_add_epi32(mm_sum, mm_res); mm_sum = _mm_add_epi32(mm_sum, mm_res);
} }
mm_sum = _mm_add_epi32(mm_sum, _mm_srli_si128(mm_sum, 8)); mm_sum = _mm_add_epi32(mm_sum, _mm_shuffle_epi32(mm_sum, _MM_SHUFFLE(1,0,3,2)));
mm_sum = _mm_add_epi32(mm_sum, _mm_srli_si128(mm_sum, 4)); mm_sum = _mm_add_epi32(mm_sum, _mm_shufflelo_epi16(mm_sum, _MM_SHUFFLE(1,0,3,2)));
abs_residual_partition_sums[partition] = (FLAC__uint32)_mm_cvtsi128_si32(mm_sum); abs_residual_partition_sums[partition] = (FLAC__uint32)_mm_cvtsi128_si32(mm_sum);
/* workaround for MSVC bugs (at least versions 2015 and 2017 are affected) */ /* workaround for MSVC bugs (at least versions 2015 and 2017 are affected) */
#if (defined _MSC_VER) && (defined FLAC__CPU_X86_64) #if (defined _MSC_VER) && (defined FLAC__CPU_X86_64)
......
...@@ -86,8 +86,8 @@ void FLAC__precompute_partition_info_sums_intrin_ssse3(const FLAC__int32 residua ...@@ -86,8 +86,8 @@ void FLAC__precompute_partition_info_sums_intrin_ssse3(const FLAC__int32 residua
mm_sum = _mm_add_epi32(mm_sum, mm_res); mm_sum = _mm_add_epi32(mm_sum, mm_res);
} }
mm_sum = _mm_hadd_epi32(mm_sum, mm_sum); mm_sum = _mm_add_epi32(mm_sum, _mm_shuffle_epi32(mm_sum, _MM_SHUFFLE(1,0,3,2)));
mm_sum = _mm_hadd_epi32(mm_sum, mm_sum); mm_sum = _mm_add_epi32(mm_sum, _mm_shufflelo_epi16(mm_sum, _MM_SHUFFLE(1,0,3,2)));
abs_residual_partition_sums[partition] = (FLAC__uint32)_mm_cvtsi128_si32(mm_sum); abs_residual_partition_sums[partition] = (FLAC__uint32)_mm_cvtsi128_si32(mm_sum);
/* workaround for MSVC bugs (at least versions 2015 and 2017 are affected) */ /* workaround for MSVC bugs (at least versions 2015 and 2017 are affected) */
#if (defined _MSC_VER) && (defined FLAC__CPU_X86_64) #if (defined _MSC_VER) && (defined FLAC__CPU_X86_64)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment