Commit 237cf1b2 authored by Yi Luo

Lowbd TM_PRED intra pred avx2 optimization

For block width >= 16, avx2 can further speed up the
TM_PRED intra prediction.

Function speedup on i7-6700:
Predictor  avx2 v. ssse3
16x8       ~1.6x
16x16      ~1.8x
16x32      ~1.9x
32x16      ~1.9x
32x32      ~1.9x

Change-Id: I62c20bd7628f52251b0c051b99a9b738ee44f7e6
parent 5eb471c0
@@ -142,11 +142,11 @@ specialize qw/aom_paeth_predictor_4x8 ssse3/;
specialize qw/aom_paeth_predictor_8x4 ssse3/;
specialize qw/aom_paeth_predictor_8x8 ssse3/;
specialize qw/aom_paeth_predictor_8x16 ssse3/;
specialize qw/aom_paeth_predictor_16x8 ssse3/;
specialize qw/aom_paeth_predictor_16x16 ssse3/;
specialize qw/aom_paeth_predictor_16x32 ssse3/;
specialize qw/aom_paeth_predictor_32x16 ssse3/;
specialize qw/aom_paeth_predictor_32x32 ssse3/;
specialize qw/aom_paeth_predictor_16x8 ssse3 avx2/;
specialize qw/aom_paeth_predictor_16x16 ssse3 avx2/;
specialize qw/aom_paeth_predictor_16x32 ssse3 avx2/;
specialize qw/aom_paeth_predictor_32x16 ssse3 avx2/;
specialize qw/aom_paeth_predictor_32x32 ssse3 avx2/;
specialize qw/aom_d63e_predictor_4x4 ssse3/;
specialize qw/aom_d135_predictor_4x4 neon/;
......
@@ -217,3 +217,197 @@ void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
  (void)left;
  row_store_32xh(&row, 16, dst, stride);
}

// -----------------------------------------------------------------------------
// TM_PRED
// Return 16 16-bit pixels in one row (__m256i)
static INLINE __m256i paeth_pred(const __m256i *left, const __m256i *top,
                                 const __m256i *topleft) {
  const __m256i base =
      _mm256_sub_epi16(_mm256_add_epi16(*top, *left), *topleft);

  __m256i pl = _mm256_abs_epi16(_mm256_sub_epi16(base, *left));
  __m256i pt = _mm256_abs_epi16(_mm256_sub_epi16(base, *top));
  __m256i ptl = _mm256_abs_epi16(_mm256_sub_epi16(base, *topleft));

  // mask1: left is not the closest candidate to base.
  __m256i mask1 = _mm256_cmpgt_epi16(pl, pt);
  mask1 = _mm256_or_si256(mask1, _mm256_cmpgt_epi16(pl, ptl));
  // mask2: topleft is closer to base than top.
  __m256i mask2 = _mm256_cmpgt_epi16(pt, ptl);

  // Select mask1 ? (mask2 ? topleft : top) : left.
  pl = _mm256_andnot_si256(mask1, *left);
  ptl = _mm256_and_si256(mask2, *topleft);
  pt = _mm256_andnot_si256(mask2, *top);
  pt = _mm256_or_si256(pt, ptl);
  pt = _mm256_and_si256(mask1, pt);

  return _mm256_or_si256(pt, pl);
}

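For readers following the vector code, this is the scalar Paeth rule it implements, written out as a minimal sketch; it is illustrative only and not part of this patch, and the helper name paeth_predictor_single is hypothetical here.

#include <stdlib.h>  // abs()

// Scalar reference for one pixel: predict from whichever of left, top and
// top-left is closest to (left + top - top_left), breaking ties in the order
// left, top, top-left -- exactly the selection the masks above compute.
static inline int paeth_predictor_single(int left, int top, int top_left) {
  const int base = left + top - top_left;
  const int p_left = abs(base - left);          // == |top - top_left|
  const int p_top = abs(base - top);            // == |left - top_left|
  const int p_top_left = abs(base - top_left);  // == |left + top - 2*top_left|
  return (p_left <= p_top && p_left <= p_top_left)
             ? left
             : (p_top <= p_top_left) ? top : top_left;
}
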
// Return 16 8-bit pixels in one row (__m128i)
static INLINE __m128i paeth_16x1_pred(const __m256i *left, const __m256i *top,
                                      const __m256i *topleft) {
  const __m256i p0 = paeth_pred(left, top, topleft);
  // Bring the upper 128-bit lane down, then pack 16-bit lanes to 8 bits.
  const __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
  const __m256i p = _mm256_packus_epi16(p0, p1);
  return _mm256_castsi256_si128(p);
}

static INLINE __m256i get_top_vector(const uint8_t *above) {
  const __m128i x = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t0 = _mm_unpacklo_epi8(x, zero);
  const __m128i t1 = _mm_unpackhi_epi8(x, zero);
  // Zero-extend 16 top pixels to 16-bit lanes: above[0..7] in the low
  // 128-bit lane, above[8..15] in the high lane.
  return _mm256_inserti128_si256(_mm256_castsi128_si256(t0), t1, 1);
}

void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  __m128i x = _mm_loadl_epi64((const __m128i *)left);
  const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
  const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
  // Each 16-bit lane of rep holds 0x8000 + row, i.e. the byte pair
  // (row, 0x80): as a shuffle control this makes _mm256_shuffle_epi8()
  // broadcast left[row], zero-extended, to every 16-bit lane.
  __m256i rep = _mm256_set1_epi16(0x8000);
  const __m256i one = _mm256_set1_epi16(1);
  const __m256i top = get_top_vector(above);

  int i;
  for (i = 0; i < 8; ++i) {
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm256_add_epi16(rep, one);
  }
}

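To make the row loop concrete, here is a scalar sketch of what one 16x8 block computes, using the hypothetical paeth_predictor_single() from the sketch above; it is illustrative only, not part of this patch, and assumes the usual <stdint.h>/<stddef.h> types.

// Scalar view of the 16x8 predictor: every pixel in row i combines the single
// left pixel left[i], the per-column top pixel above[j] and the corner
// above[-1]; the AVX2 version above produces one whole row per iteration.
static void paeth_predictor_16x8_scalar(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  for (int i = 0; i < 8; ++i) {
    for (int j = 0; j < 16; ++j)
      dst[j] = (uint8_t)paeth_predictor_single(left[i], above[j], above[-1]);
    dst += stride;
  }
}
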
static INLINE __m256i get_left_vector(const uint8_t *left) {
  const __m128i x = _mm_load_si128((const __m128i *)left);
  return _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
}

void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m256i l = get_left_vector(left);
  const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
  __m256i rep = _mm256_set1_epi16(0x8000);
  const __m256i one = _mm256_set1_epi16(1);
  const __m256i top = get_top_vector(above);

  int i;
  for (i = 0; i < 16; ++i) {
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm256_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m256i l = get_left_vector(left);
  const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
  __m256i rep = _mm256_set1_epi16(0x8000);
  const __m256i one = _mm256_set1_epi16(1);
  const __m256i top = get_top_vector(above);

  int i;
  for (i = 0; i < 16; ++i) {
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm256_add_epi16(rep, one);
  }

  // The lower 16 rows use the next 16 left-column pixels.
  l = get_left_vector(left + 16);
  rep = _mm256_set1_epi16(0x8000);
  for (i = 0; i < 16; ++i) {
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm256_add_epi16(rep, one);
  }
}

// Return 32 8-bit pixels in one row (__m256i)
static INLINE __m256i paeth_32x1_pred(const __m256i *left, const __m256i *top0,
                                      const __m256i *top1,
                                      const __m256i *topleft) {
  __m256i p0 = paeth_pred(left, top0, topleft);
  __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
  const __m256i x0 = _mm256_packus_epi16(p0, p1);  // pixels 0..15

  p0 = paeth_pred(left, top1, topleft);
  p1 = _mm256_permute4x64_epi64(p0, 0xe);
  const __m256i x1 = _mm256_packus_epi16(p0, p1);  // pixels 16..31

  // Combine the two low 128-bit lanes into one 32-byte row.
  return _mm256_permute2x128_si256(x0, x1, 0x20);
}

void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m256i l = get_left_vector(left);
  const __m256i t0 = get_top_vector(above);
  const __m256i t1 = get_top_vector(above + 16);
  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
  __m256i rep = _mm256_set1_epi16(0x8000);
  const __m256i one = _mm256_set1_epi16(1);

  int i;
  for (i = 0; i < 16; ++i) {
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
    const __m256i r = paeth_32x1_pred(&l16, &t0, &t1, &tl);

    _mm256_storeu_si256((__m256i *)dst, r);
    dst += stride;
    rep = _mm256_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m256i l = get_left_vector(left);
  const __m256i t0 = get_top_vector(above);
  const __m256i t1 = get_top_vector(above + 16);
  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
  __m256i rep = _mm256_set1_epi16(0x8000);
  const __m256i one = _mm256_set1_epi16(1);

  int i;
  for (i = 0; i < 16; ++i) {
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);

    const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
    const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);

    _mm_store_si128((__m128i *)dst, r0);
    _mm_store_si128((__m128i *)(dst + 16), r1);

    dst += stride;
    rep = _mm256_add_epi16(rep, one);
  }

  // The lower 16 rows use the next 16 left-column pixels.
  l = get_left_vector(left + 16);
  rep = _mm256_set1_epi16(0x8000);
  for (i = 0; i < 16; ++i) {
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);

    const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
    const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);

    _mm_store_si128((__m128i *)dst, r0);
    _mm_store_si128((__m128i *)(dst + 16), r1);

    dst += stride;
    rep = _mm256_add_epi16(rep, one);
  }
}
@@ -234,7 +234,9 @@ const IntraPredFunc<IntraPred> LowbdIntraPredTestVectorAvx2[] = {
  lowbd_entry(v, 32, 32, avx2), lowbd_entry(h, 32, 32, avx2),
  lowbd_entry(dc, 32, 16, avx2), lowbd_entry(dc_top, 32, 16, avx2),
  lowbd_entry(dc_left, 32, 16, avx2), lowbd_entry(dc_128, 32, 16, avx2),
  lowbd_entry(v, 32, 16, avx2),
  lowbd_entry(v, 32, 16, avx2), lowbd_entry(paeth, 16, 8, avx2),
  lowbd_entry(paeth, 16, 16, avx2), lowbd_entry(paeth, 16, 32, avx2),
  lowbd_entry(paeth, 32, 16, avx2), lowbd_entry(paeth, 32, 32, avx2),
};

INSTANTIATE_TEST_CASE_P(AVX2, LowbdIntraPredTest,
......
@@ -666,12 +666,24 @@ INTRA_PRED_TEST(SSSE3_1, TestIntraPred16, "intra16x16", NULL, NULL, NULL, NULL,
                NULL, NULL, aom_paeth_predictor_16x16_ssse3, NULL, NULL, NULL)
INTRA_PRED_TEST(SSSE3_2, TestIntraPred16, "intra16x8", NULL, NULL, NULL, NULL,
                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
                aom_paeth_predictor_16x8, NULL, NULL, NULL)
                aom_paeth_predictor_16x8_ssse3, NULL, NULL, NULL)
INTRA_PRED_TEST(SSSE3_3, TestIntraPred16, "intra16x32", NULL, NULL, NULL, NULL,
                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
                aom_paeth_predictor_16x32_ssse3, NULL, NULL, NULL)
#endif  // HAVE_SSSE3

#if HAVE_AVX2
INTRA_PRED_TEST(AVX2_1, TestIntraPred16, "intra16x16", NULL, NULL, NULL, NULL,
                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
                aom_paeth_predictor_16x16_avx2, NULL, NULL, NULL)
INTRA_PRED_TEST(AVX2_2, TestIntraPred16, "intra16x8", NULL, NULL, NULL, NULL,
                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
                aom_paeth_predictor_16x8_avx2, NULL, NULL, NULL)
INTRA_PRED_TEST(AVX2_3, TestIntraPred16, "intra16x32", NULL, NULL, NULL, NULL,
                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
                aom_paeth_predictor_16x32_avx2, NULL, NULL, NULL)
#endif  // HAVE_AVX2

#if HAVE_DSPR2
INTRA_PRED_TEST(DSPR2, TestIntraPred16, "intra16x16",
                aom_dc_predictor_16x16_dspr2, NULL, NULL, NULL, NULL,
@@ -767,13 +779,13 @@ INTRA_PRED_TEST(AVX2_1, TestIntraPred32, "intra32x32",
                aom_dc_top_predictor_32x32_avx2,
                aom_dc_128_predictor_32x32_avx2, aom_v_predictor_32x32_avx2,
                aom_h_predictor_32x32_avx2, NULL, NULL, NULL, NULL, NULL, NULL,
                NULL, NULL, NULL, NULL)
                aom_paeth_predictor_32x32_avx2, NULL, NULL, NULL)
INTRA_PRED_TEST(AVX2_2, TestIntraPred32, "intra32x16",
                aom_dc_predictor_32x16_avx2, aom_dc_left_predictor_32x16_avx2,
                aom_dc_top_predictor_32x16_avx2,
                aom_dc_128_predictor_32x16_avx2, aom_v_predictor_32x16_avx2,
                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
                NULL)
                NULL, NULL, NULL, NULL, NULL, NULL, NULL,
                aom_paeth_predictor_32x16_avx2, NULL, NULL, NULL)
#endif  // HAVE_AVX2

#if HAVE_NEON
......