Commit 94dfe5aa authored by Kyle Siefring
Browse files

Use alignr in lbd convolve_2d/convolve_x AVX2

Results measured on Haswell.

For convolve_2d
Width Height Improvement
    4      4        8.3%
    8      8        5.6%
   64     64         13%
    4     16         13%
   32      8          9%

For convolve_x
Width Height Improvement
    4      4          0%
    8      8          0%
   64     64         21%
    4     16          0%
   32      8         29%

Change-Id: Ic13215d61f48400b3e31d5ffea3af11c0b3f0ed7
parent b1674b1f
......@@ -74,17 +74,16 @@ void av1_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
_mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]),
_MM_SHUFFLE(2, 1, 1, 0));
const __m256i src_lo = _mm256_unpacklo_epi8(data, zero);
const __m256i src_hi = _mm256_unpackhi_epi8(data, zero);
// Filter even-index pixels
const __m256i src_0 = _mm256_unpacklo_epi8(data, zero);
const __m256i res_0 = _mm256_madd_epi16(src_0, coeff_01);
const __m256i src_2 =
_mm256_unpacklo_epi8(_mm256_srli_si256(data, 2), zero);
const __m256i res_0 = _mm256_madd_epi16(src_lo, coeff_01);
const __m256i src_2 = _mm256_alignr_epi8(src_hi, src_lo, 4);
const __m256i res_2 = _mm256_madd_epi16(src_2, coeff_23);
const __m256i src_4 =
_mm256_unpacklo_epi8(_mm256_srli_si256(data, 4), zero);
const __m256i src_4 = _mm256_alignr_epi8(src_hi, src_lo, 8);
const __m256i res_4 = _mm256_madd_epi16(src_4, coeff_45);
const __m256i src_6 =
_mm256_unpacklo_epi8(_mm256_srli_si256(data, 6), zero);
const __m256i src_6 = _mm256_alignr_epi8(src_hi, src_lo, 12);
const __m256i res_6 = _mm256_madd_epi16(src_6, coeff_67);
__m256i res_even = _mm256_add_epi32(_mm256_add_epi32(res_0, res_4),
......@@ -93,17 +92,13 @@ void av1_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
round_shift);
// Filter odd-index pixels
const __m256i src_1 =
_mm256_unpacklo_epi8(_mm256_srli_si256(data, 1), zero);
const __m256i src_1 = _mm256_alignr_epi8(src_hi, src_lo, 2);
const __m256i res_1 = _mm256_madd_epi16(src_1, coeff_01);
const __m256i src_3 =
_mm256_unpacklo_epi8(_mm256_srli_si256(data, 3), zero);
const __m256i src_3 = _mm256_alignr_epi8(src_hi, src_lo, 6);
const __m256i res_3 = _mm256_madd_epi16(src_3, coeff_23);
const __m256i src_5 =
_mm256_unpacklo_epi8(_mm256_srli_si256(data, 5), zero);
const __m256i src_5 = _mm256_alignr_epi8(src_hi, src_lo, 10);
const __m256i res_5 = _mm256_madd_epi16(src_5, coeff_45);
const __m256i src_7 =
_mm256_unpacklo_epi8(_mm256_srli_si256(data, 7), zero);
const __m256i src_7 = _mm256_alignr_epi8(src_hi, src_lo, 14);
const __m256i res_7 = _mm256_madd_epi16(src_7, coeff_67);
__m256i res_odd = _mm256_add_epi32(_mm256_add_epi32(res_1, res_5),
......
......@@ -80,14 +80,16 @@ void av1_jnt_convolve_2d_sse4_1(const uint8_t *src, int src_stride,
const __m128i data =
_mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
const __m128i src_lo = _mm_unpacklo_epi8(data, zero);
const __m128i src_hi = _mm_unpackhi_epi8(data, zero);
// Filter even-index pixels
const __m128i src_0 = _mm_unpacklo_epi8(data, zero);
const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero);
const __m128i res_0 = _mm_madd_epi16(src_lo, coeff_01);
const __m128i src_2 = _mm_alignr_epi8(src_hi, src_lo, 4);
const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero);
const __m128i src_4 = _mm_alignr_epi8(src_hi, src_lo, 8);
const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero);
const __m128i src_6 = _mm_alignr_epi8(src_hi, src_lo, 12);
const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
__m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
......@@ -96,13 +98,13 @@ void av1_jnt_convolve_2d_sse4_1(const uint8_t *src, int src_stride,
_mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
// Filter odd-index pixels
const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero);
const __m128i src_1 = _mm_alignr_epi8(src_hi, src_lo, 2);
const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero);
const __m128i src_3 = _mm_alignr_epi8(src_hi, src_lo, 6);
const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero);
const __m128i src_5 = _mm_alignr_epi8(src_hi, src_lo, 10);
const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero);
const __m128i src_7 = _mm_alignr_epi8(src_hi, src_lo, 14);
const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
__m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
......
......@@ -548,34 +548,29 @@ void av1_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
_MM_SHUFFLE(2, 1, 1, 0));
const __m256i zero = _mm256_setzero_si256();
const __m256i src_lo = _mm256_unpacklo_epi8(data, zero);
const __m256i src_hi = _mm256_unpackhi_epi8(data, zero);
// Filter even-index pixels
const __m256i src_0 = _mm256_unpacklo_epi8(data, zero);
const __m256i res_0 = _mm256_madd_epi16(src_0, coeff_01);
const __m256i src_2 =
_mm256_unpacklo_epi8(_mm256_srli_si256(data, 2), zero);
const __m256i res_0 = _mm256_madd_epi16(src_lo, coeff_01);
const __m256i src_2 = _mm256_alignr_epi8(src_hi, src_lo, 4);
const __m256i res_2 = _mm256_madd_epi16(src_2, coeff_23);
const __m256i src_4 =
_mm256_unpacklo_epi8(_mm256_srli_si256(data, 4), zero);
const __m256i src_4 = _mm256_alignr_epi8(src_hi, src_lo, 8);
const __m256i res_4 = _mm256_madd_epi16(src_4, coeff_45);
const __m256i src_6 =
_mm256_unpacklo_epi8(_mm256_srli_si256(data, 6), zero);
const __m256i src_6 = _mm256_alignr_epi8(src_hi, src_lo, 12);
const __m256i res_6 = _mm256_madd_epi16(src_6, coeff_67);
const __m256i res_even = _mm256_add_epi32(
_mm256_add_epi32(res_0, res_4), _mm256_add_epi32(res_2, res_6));
// Filter odd-index pixels
const __m256i src_1 =
_mm256_unpacklo_epi8(_mm256_srli_si256(data, 1), zero);
const __m256i src_1 = _mm256_alignr_epi8(src_hi, src_lo, 2);
const __m256i res_1 = _mm256_madd_epi16(src_1, coeff_01);
const __m256i src_3 =
_mm256_unpacklo_epi8(_mm256_srli_si256(data, 3), zero);
const __m256i src_3 = _mm256_alignr_epi8(src_hi, src_lo, 6);
const __m256i res_3 = _mm256_madd_epi16(src_3, coeff_23);
const __m256i src_5 =
_mm256_unpacklo_epi8(_mm256_srli_si256(data, 5), zero);
const __m256i src_5 = _mm256_alignr_epi8(src_hi, src_lo, 10);
const __m256i res_5 = _mm256_madd_epi16(src_5, coeff_45);
const __m256i src_7 =
_mm256_unpacklo_epi8(_mm256_srli_si256(data, 7), zero);
const __m256i src_7 = _mm256_alignr_epi8(src_hi, src_lo, 14);
const __m256i res_7 = _mm256_madd_epi16(src_7, coeff_67);
const __m256i res_odd = _mm256_add_epi32(
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment