Commit f0292355 authored by Yaowu Xu
Browse files

Revert "av1_convolve_{x,y}_avx2() -- use 256 bit load/store"

This reverts commit 8a59122d.

BUG=aomedia:1197

Change-Id: If3b3700ea743bc41dd5c1a079634961bc4fa5b62
parent 3ec11d1e
...@@ -462,21 +462,26 @@ void av1_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, ...@@ -462,21 +462,26 @@ void av1_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
const __m256i res_lo_shift = _mm256_sll_epi32(res_lo, left_shift); const __m256i res_lo_shift = _mm256_sll_epi32(res_lo, left_shift);
const __m256i res_hi_shift = _mm256_sll_epi32(res_hi, left_shift); const __m256i res_hi_shift = _mm256_sll_epi32(res_hi, left_shift);
const __m256i res_01_shift =
_mm256_permute2x128_si256(res_lo_shift, res_hi_shift, 0x20);
const __m256i res_23_shift =
_mm256_permute2x128_si256(res_lo_shift, res_hi_shift, 0x31);
// Accumulate values into the destination buffer // Accumulate values into the destination buffer
__m256i *const p = (__m256i *)&dst[i * dst_stride + j]; __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
if (do_average) { if (do_average) {
_mm256_storeu_si256( _mm_storeu_si128(p + 0,
p + 0, _mm256_add_epi32(_mm256_load_si256(p + 0), res_01_shift)); _mm_add_epi32(_mm_loadu_si128(p + 0),
_mm256_storeu_si256( _mm256_castsi256_si128(res_lo_shift)));
p + 1, _mm256_add_epi32(_mm256_load_si256(p + 1), res_23_shift)); _mm_storeu_si128(p + 1,
_mm_add_epi32(_mm_loadu_si128(p + 1),
_mm256_castsi256_si128(res_hi_shift)));
_mm_storeu_si128(
p + 2, _mm_add_epi32(_mm_loadu_si128(p + 2),
_mm256_extractf128_si256(res_lo_shift, 1)));
_mm_storeu_si128(
p + 3, _mm_add_epi32(_mm_loadu_si128(p + 3),
_mm256_extractf128_si256(res_hi_shift, 1)));
} else { } else {
_mm256_storeu_si256(p + 0, res_01_shift); _mm_storeu_si128(p + 0, _mm256_castsi256_si128(res_lo_shift));
_mm256_storeu_si256(p + 1, res_23_shift); _mm_storeu_si128(p + 1, _mm256_castsi256_si128(res_hi_shift));
_mm_storeu_si128(p + 2, _mm256_extractf128_si256(res_lo_shift, 1));
_mm_storeu_si128(p + 3, _mm256_extractf128_si256(res_hi_shift, 1));
} }
} }
} }
...@@ -588,21 +593,26 @@ void av1_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, ...@@ -588,21 +593,26 @@ void av1_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
const __m256i res_lo_shift = _mm256_sll_epi32(res_lo_round, left_shift); const __m256i res_lo_shift = _mm256_sll_epi32(res_lo_round, left_shift);
const __m256i res_hi_shift = _mm256_sll_epi32(res_hi_round, left_shift); const __m256i res_hi_shift = _mm256_sll_epi32(res_hi_round, left_shift);
const __m256i res_01_shift =
_mm256_permute2x128_si256(res_lo_shift, res_hi_shift, 0x20);
const __m256i res_23_shift =
_mm256_permute2x128_si256(res_lo_shift, res_hi_shift, 0x31);
// Accumulate values into the destination buffer // Accumulate values into the destination buffer
__m256i *const p = (__m256i *)&dst[i * dst_stride + j]; __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
if (do_average) { if (do_average) {
_mm256_storeu_si256( _mm_storeu_si128(p + 0,
p + 0, _mm256_add_epi32(_mm256_load_si256(p + 0), res_01_shift)); _mm_add_epi32(_mm_loadu_si128(p + 0),
_mm256_storeu_si256( _mm256_castsi256_si128(res_lo_shift)));
p + 1, _mm256_add_epi32(_mm256_load_si256(p + 1), res_23_shift)); _mm_storeu_si128(p + 1,
_mm_add_epi32(_mm_loadu_si128(p + 1),
_mm256_castsi256_si128(res_hi_shift)));
_mm_storeu_si128(
p + 2, _mm_add_epi32(_mm_loadu_si128(p + 2),
_mm256_extractf128_si256(res_lo_shift, 1)));
_mm_storeu_si128(
p + 3, _mm_add_epi32(_mm_loadu_si128(p + 3),
_mm256_extractf128_si256(res_hi_shift, 1)));
} else { } else {
_mm256_storeu_si256(p + 0, res_01_shift); _mm_storeu_si128(p + 0, _mm256_castsi256_si128(res_lo_shift));
_mm256_storeu_si256(p + 1, res_23_shift); _mm_storeu_si128(p + 1, _mm256_castsi256_si128(res_hi_shift));
_mm_storeu_si128(p + 2, _mm256_extractf128_si256(res_lo_shift, 1));
_mm_storeu_si128(p + 3, _mm256_extractf128_si256(res_hi_shift, 1));
} }
} }
} }
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment