Commit 71b6e043, authored by Yi Luo

Migrate some vp9 highbd intrapred x86 speedup to av1

Function speedup on i7-6700:
D117   sse2   ssse3
4x4    ~1.8x
8x8           ~3.4x
16x16         ~5.5x
32x32         ~2.9x

D135   sse2   ssse3
4x4    ~1.9x
8x8           ~3.3x
16x16         ~5.3x
32x32         ~3.6x

D153   sse2   ssse3
4x4    ~1.9x
8x8           ~2.8x
16x16         ~5.5x
32x32         ~3.6x

Change-Id: I43ab5fa8dcbcfa51acbde554abf3e5d7d336f391
parent e30159ce
......@@ -239,6 +239,10 @@ if (CONFIG_HIGHBITDEPTH)
"${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.c"
"${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_sse2.c")
set(AOM_DSP_COMMON_INTRIN_SSSE3
${AOM_DSP_COMMON_INTRIN_SSSE3}
"${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_ssse3.c")
set(AOM_DSP_COMMON_INTRIN_AVX2
${AOM_DSP_COMMON_INTRIN_AVX2}
"${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c"
......
......@@ -84,6 +84,7 @@ ifeq ($(CONFIG_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE) += x86/highbd_intrapred_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.c
DSP_SRCS-$(HAVE_SSSE3) += x86/highbd_intrapred_ssse3.c
endif # CONFIG_HIGHBITDEPTH
DSP_SRCS-$(HAVE_NEON_ASM) += arm/intrapred_neon_asm$(ASM)
......
......@@ -242,6 +242,19 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
specialize qw/aom_highbd_dc_left_predictor_32x32 sse2/;
specialize qw/aom_highbd_dc_top_predictor_32x32 sse2/;
specialize qw/aom_highbd_dc_128_predictor_32x32 sse2/;
specialize qw/aom_highbd_d117_predictor_4x4 sse2/;
specialize qw/aom_highbd_d117_predictor_8x8 ssse3/;
specialize qw/aom_highbd_d117_predictor_16x16 ssse3/;
specialize qw/aom_highbd_d117_predictor_32x32 ssse3/;
specialize qw/aom_highbd_d135_predictor_4x4 sse2/;
specialize qw/aom_highbd_d135_predictor_8x8 ssse3/;
specialize qw/aom_highbd_d135_predictor_16x16 ssse3/;
specialize qw/aom_highbd_d135_predictor_32x32 ssse3/;
specialize qw/aom_highbd_d153_predictor_4x4 sse2/;
specialize qw/aom_highbd_d153_predictor_8x8 ssse3/;
specialize qw/aom_highbd_d153_predictor_16x16 ssse3/;
specialize qw/aom_highbd_d153_predictor_32x32 ssse3/;
} # CONFIG_HIGHBITDEPTH
#
......
......@@ -982,3 +982,115 @@ void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
dst += stride;
}
}
// -----------------------------------------------------------------------------
// Rounded 3-tap average on eight unsigned 16-bit lanes:
//
//   result = (x + 2 * y + z + 2) >> 2
//
// computed entirely in 16-bit lanes (trick credited to "pascal" in the
// original comment):
//
//   t      = avg(x, z)          // (x + z + 1) >> 1, i.e. rounded up
//   t     -= (x ^ z) & 1        // undo the round-up -> (x + z) >> 1
//   result = avg(t, y)          // (t + y + 1) >> 1
//
// x and z are the outer taps, y is the centre tap (weight 2).
static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y,
                                 const __m128i *z) {
  // 1 in every lane where x + z is odd, i.e. where avg() rounded up.
  const __m128i xz_parity =
      _mm_and_si128(_mm_xor_si128(*x, *z), _mm_set1_epi16(1));
  const __m128i xz_ceil = _mm_avg_epu16(*x, *z);
  const __m128i xz_floor = _mm_subs_epu16(xz_ceil, xz_parity);
  return _mm_avg_epu16(xz_floor, *y);
}
// High-bitdepth D117 intra predictor, 4x4 block, SSE2.
//
// Notation: X = above[-1], A..D = above[0..3], I,J,K = left[0..2];
// avg2(a,b) = (a + b + 1) >> 1, avg3(a,b,c) = (a + 2b + c + 2) >> 2.
//
// Output written to dst (rows are `stride` elements apart):
//   row0: avg2(X,A)   avg2(A,B)   avg2(B,C)   avg2(C,D)
//   row1: avg3(I,X,A) avg3(X,A,B) avg3(A,B,C) avg3(B,C,D)
//   row2: avg3(J,I,X) avg2(X,A)   avg2(A,B)   avg2(B,C)
//   row3: avg3(K,J,I) avg3(I,X,A) avg3(X,A,B) avg3(A,B,C)
//
// The 128-bit load starts at above - 4, so above[-4..-1] must be readable.
// bd is unused.
void aom_highbd_d117_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  uint16_t *const out1 = dst + stride;
  uint16_t *const out2 = out1 + stride;
  uint16_t *const out3 = out2 + stride;

  // Build the edge vector K J I X A B C D (lane 0 first): lanes 3..7 come
  // from the unaligned load, lanes 0..2 are replaced by the left column.
  __m128i edge = _mm_loadu_si128((const __m128i *)(above - 4));
  edge = _mm_insert_epi16(edge, left[2], 0);
  edge = _mm_insert_epi16(edge, left[1], 1);
  edge = _mm_insert_epi16(edge, left[0], 2);

  const __m128i edge_s1 = _mm_srli_si128(edge, 2);  // J I X A B C D 0
  const __m128i edge_s2 = _mm_srli_si128(edge, 4);  // I X A B C D 0 0
  const __m128i two_tap = _mm_avg_epu16(edge, edge_s1);
  const __m128i three_tap = avg3_epu16(&edge, &edge_s1, &edge_s2);
  (void)bd;

  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(two_tap, 6));
  _mm_storel_epi64((__m128i *)out1, _mm_srli_si128(three_tap, 4));
  _mm_storel_epi64((__m128i *)out2, _mm_srli_si128(two_tap, 4));
  _mm_storel_epi64((__m128i *)out3, _mm_srli_si128(three_tap, 2));

  // The first column of rows 2 and 3 does not follow the shifted pattern;
  // patch it after the vector stores.
  out2[0] = _mm_extract_epi16(three_tap, 1);
  out3[0] = _mm_extract_epi16(three_tap, 0);
}
// High-bitdepth D135 intra predictor, 4x4 block, SSE2.
//
// Notation: X = above[-1], A..D = above[0..3], I..L = left[0..3];
// avg3(a,b,c) = (a + 2b + c + 2) >> 2.
//
// A single 3-tap filtered vector over the edge L K J I X A B C D is
// computed; each output row is that vector shifted by one more lane:
//   row0: avg3(I,X,A) avg3(X,A,B) avg3(A,B,C) avg3(B,C,D)
//   row1: avg3(J,I,X) avg3(I,X,A) avg3(X,A,B) avg3(A,B,C)
//   row2: avg3(K,J,I) avg3(J,I,X) avg3(I,X,A) avg3(X,A,B)
//   row3: avg3(L,K,J) avg3(K,J,I) avg3(J,I,X) avg3(I,X,A)
//
// The 128-bit load starts at above - 4, so above[-4..-1] must be readable.
// bd is unused.
void aom_highbd_d135_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  // Centre taps: K J I X A B C D (lane 0 first).
  __m128i centre = _mm_loadu_si128((const __m128i *)(above - 4));
  centre = _mm_insert_epi16(centre, left[2], 0);
  centre = _mm_insert_epi16(centre, left[1], 1);
  centre = _mm_insert_epi16(centre, left[0], 2);

  // Neighbours on either side of each centre tap.
  const __m128i next = _mm_srli_si128(centre, 2);  // J I X A B C D 0
  const __m128i prev =
      _mm_insert_epi16(_mm_slli_si128(centre, 2), left[3], 0);  // L K J I X A B C
  const __m128i filtered = avg3_epu16(&next, &centre, &prev);
  (void)bd;

  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(filtered, 6));
  _mm_storel_epi64((__m128i *)(dst + stride), _mm_srli_si128(filtered, 4));
  _mm_storel_epi64((__m128i *)(dst + 2 * stride), _mm_srli_si128(filtered, 2));
  _mm_storel_epi64((__m128i *)(dst + 3 * stride), filtered);
}
// High-bitdepth D153 intra predictor, 4x4 block, SSE2.
//
// Notation: X = above[-1], A..C = above[0..2], I..L = left[0..3];
// avg2(a,b) = (a + b + 1) >> 1, avg3(a,b,c) = (a + 2b + c + 2) >> 2.
//
// Output written to dst (rows are `stride` elements apart):
//   row0: avg2(I,X) avg3(I,X,A) avg3(X,A,B) avg3(A,B,C)
//   row1: avg2(J,I) avg3(J,I,X) avg2(I,X)   avg3(I,X,A)
//   row2: avg2(K,J) avg3(K,J,I) avg2(J,I)   avg3(J,I,X)
//   row3: avg2(L,K) avg3(L,K,J) avg2(K,J)   avg3(K,J,I)
//
// The 128-bit load starts at above - 5, so above[-5..-1] must be readable.
// bd is unused.
void aom_highbd_d153_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  // Build the edge vector L K J I X A B C (lane 0 first): lanes 4..7 come
  // from the unaligned load, lanes 0..3 are replaced by the left column.
  __m128i edge = _mm_loadu_si128((const __m128i *)(above - 5));
  edge = _mm_insert_epi16(edge, left[3], 0);
  edge = _mm_insert_epi16(edge, left[2], 1);
  edge = _mm_insert_epi16(edge, left[1], 2);
  edge = _mm_insert_epi16(edge, left[0], 3);

  const __m128i edge_s1 = _mm_srli_si128(edge, 2);  // K J I X A B C 0
  const __m128i edge_s2 = _mm_srli_si128(edge, 4);  // J I X A B C 0 0
  const __m128i three_tap = avg3_epu16(&edge, &edge_s1, &edge_s2);
  const __m128i two_tap = _mm_avg_epu16(edge, edge_s1);

  // Interleave avg2/avg3 pairs along the left edge; rows 1..3 are windows
  // into this vector, each two lanes further along than the row below it.
  const __m128i left_mix = _mm_unpacklo_epi16(two_tap, three_tap);
  (void)bd;

  // Row 0 is pure avg3 except for its first sample, patched after the store.
  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(three_tap, 4));
  dst[0] = _mm_extract_epi16(two_tap, 3);

  _mm_storel_epi64((__m128i *)(dst + stride), _mm_srli_si128(left_mix, 8));
  _mm_storel_epi64((__m128i *)(dst + 2 * stride), _mm_srli_si128(left_mix, 4));
  _mm_storel_epi64((__m128i *)(dst + 3 * stride), left_mix);
}
This diff is collapsed.
......@@ -171,12 +171,14 @@ TEST_P(LowbdIntraPredTest, IntraPredTests) {
highbd_entry(type, 16, 32, opt, bd), \
highbd_entry(type, 32, 16, opt, bd), highbd_entry(type, 32, 32, opt, bd)
#if HAVE_SSE2
#if CONFIG_HIGHBITDEPTH
#if HAVE_SSE2
const IntraPredFunc<HighbdIntraPred> IntraPredTestVector8[] = {
highbd_intrapred(dc, sse2, 8), highbd_intrapred(dc_left, sse2, 8),
highbd_intrapred(dc_top, sse2, 8), highbd_intrapred(dc_128, sse2, 8),
highbd_intrapred(h, sse2, 8), highbd_intrapred(v, sse2, 8),
highbd_entry(d117, 4, 4, sse2, 8), highbd_entry(d135, 4, 4, sse2, 8),
highbd_entry(d153, 4, 4, sse2, 8),
};
INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, HighbdIntraPredTest,
......@@ -186,6 +188,8 @@ const IntraPredFunc<HighbdIntraPred> IntraPredTestVector10[] = {
highbd_intrapred(dc, sse2, 10), highbd_intrapred(dc_left, sse2, 10),
highbd_intrapred(dc_top, sse2, 10), highbd_intrapred(dc_128, sse2, 10),
highbd_intrapred(h, sse2, 10), highbd_intrapred(v, sse2, 10),
highbd_entry(d117, 4, 4, sse2, 10), highbd_entry(d135, 4, 4, sse2, 10),
highbd_entry(d153, 4, 4, sse2, 10),
};
INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, HighbdIntraPredTest,
......@@ -195,14 +199,48 @@ const IntraPredFunc<HighbdIntraPred> IntraPredTestVector12[] = {
highbd_intrapred(dc, sse2, 12), highbd_intrapred(dc_left, sse2, 12),
highbd_intrapred(dc_top, sse2, 12), highbd_intrapred(dc_128, sse2, 12),
highbd_intrapred(h, sse2, 12), highbd_intrapred(v, sse2, 12),
highbd_entry(d117, 4, 4, sse2, 12), highbd_entry(d135, 4, 4, sse2, 12),
highbd_entry(d153, 4, 4, sse2, 12),
};
INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, HighbdIntraPredTest,
::testing::ValuesIn(IntraPredTestVector12));
#endif // CONFIG_HIGHBITDEPTH
#endif // HAVE_SSE2
#if HAVE_SSSE3
// SSSE3 high-bitdepth test vector, last highbd_entry argument (bd) = 8:
// d117, d135 and d153 predictors at 8x8, 16x16 and 32x32.
// NOTE(review): highbd_entry is defined outside this view; judging by the
// suite name it presumably pairs each _ssse3 function with its _c reference
// -- confirm against the macro definition.
const IntraPredFunc<HighbdIntraPred> IntraPredTestVectorSsse3_8[] = {
highbd_entry(d117, 8, 8, ssse3, 8), highbd_entry(d117, 16, 16, ssse3, 8),
highbd_entry(d117, 32, 32, ssse3, 8), highbd_entry(d135, 8, 8, ssse3, 8),
highbd_entry(d135, 16, 16, ssse3, 8), highbd_entry(d135, 32, 32, ssse3, 8),
highbd_entry(d153, 8, 8, ssse3, 8), highbd_entry(d153, 16, 16, ssse3, 8),
highbd_entry(d153, 32, 32, ssse3, 8),
};
// Runs HighbdIntraPredTest once per entry of the table above.
INSTANTIATE_TEST_CASE_P(SSSE3_TO_C_8, HighbdIntraPredTest,
::testing::ValuesIn(IntraPredTestVectorSsse3_8));
// SSSE3 high-bitdepth test vector, last highbd_entry argument (bd) = 10:
// d117, d135 and d153 predictors at 8x8, 16x16 and 32x32.
// NOTE(review): highbd_entry is defined outside this view; judging by the
// suite name it presumably pairs each _ssse3 function with its _c reference
// -- confirm against the macro definition.
const IntraPredFunc<HighbdIntraPred> IntraPredTestVectorSsse3_10[] = {
highbd_entry(d117, 8, 8, ssse3, 10), highbd_entry(d117, 16, 16, ssse3, 10),
highbd_entry(d117, 32, 32, ssse3, 10), highbd_entry(d135, 8, 8, ssse3, 10),
highbd_entry(d135, 16, 16, ssse3, 10), highbd_entry(d135, 32, 32, ssse3, 10),
highbd_entry(d153, 8, 8, ssse3, 10), highbd_entry(d153, 16, 16, ssse3, 10),
highbd_entry(d153, 32, 32, ssse3, 10),
};
// Runs HighbdIntraPredTest once per entry of the table above.
INSTANTIATE_TEST_CASE_P(SSSE3_TO_C_10, HighbdIntraPredTest,
::testing::ValuesIn(IntraPredTestVectorSsse3_10));
// SSSE3 high-bitdepth test vector, last highbd_entry argument (bd) = 12:
// d117, d135 and d153 predictors at 8x8, 16x16 and 32x32.
// NOTE(review): highbd_entry is defined outside this view; judging by the
// suite name it presumably pairs each _ssse3 function with its _c reference
// -- confirm against the macro definition.
const IntraPredFunc<HighbdIntraPred> IntraPredTestVectorSsse3_12[] = {
highbd_entry(d117, 8, 8, ssse3, 12), highbd_entry(d117, 16, 16, ssse3, 12),
highbd_entry(d117, 32, 32, ssse3, 12), highbd_entry(d135, 8, 8, ssse3, 12),
highbd_entry(d135, 16, 16, ssse3, 12), highbd_entry(d135, 32, 32, ssse3, 12),
highbd_entry(d153, 8, 8, ssse3, 12), highbd_entry(d153, 16, 16, ssse3, 12),
highbd_entry(d153, 32, 32, ssse3, 12),
};
// Runs HighbdIntraPredTest once per entry of the table above.
INSTANTIATE_TEST_CASE_P(SSSE3_TO_C_12, HighbdIntraPredTest,
::testing::ValuesIn(IntraPredTestVectorSsse3_12));
#endif // HAVE_SSSE3
#endif // CONFIG_HIGHBITDEPTH
#define lowbd_entry(type, width, height, opt) \
IntraPredFunc<IntraPred>(&aom_##type##_predictor_##width##x##height##_##opt, \
&aom_##type##_predictor_##width##x##height##_c, \
......
......@@ -1141,14 +1141,13 @@ HIGHBD_INTRA_PRED_TEST(
#undef smooth_h_pred_func
#if HAVE_SSE2
HIGHBD_INTRA_PRED_TEST(SSE2_1, TestHighbdIntraPred4, "Hbd Intra4x4",
aom_highbd_dc_predictor_4x4_sse2,
aom_highbd_dc_left_predictor_4x4_sse2,
aom_highbd_dc_top_predictor_4x4_sse2,
aom_highbd_dc_128_predictor_4x4_sse2,
aom_highbd_v_predictor_4x4_sse2,
aom_highbd_h_predictor_4x4_sse2, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL)
HIGHBD_INTRA_PRED_TEST(
SSE2_1, TestHighbdIntraPred4, "Hbd Intra4x4",
aom_highbd_dc_predictor_4x4_sse2, aom_highbd_dc_left_predictor_4x4_sse2,
aom_highbd_dc_top_predictor_4x4_sse2, aom_highbd_dc_128_predictor_4x4_sse2,
aom_highbd_v_predictor_4x4_sse2, aom_highbd_h_predictor_4x4_sse2, NULL,
aom_highbd_d135_predictor_4x4_sse2, aom_highbd_d117_predictor_4x4_sse2,
aom_highbd_d153_predictor_4x4_sse2, NULL, NULL, NULL, NULL, NULL, NULL)
HIGHBD_INTRA_PRED_TEST(SSE2_2, TestHighbdIntraPred4, "Hbd Intra4x8",
aom_highbd_dc_predictor_4x8_sse2,
aom_highbd_dc_left_predictor_4x8_sse2,
......@@ -1229,9 +1228,18 @@ HIGHBD_INTRA_PRED_TEST(SSE2_3, TestHighbdIntraPred8, "Hbd Intra8x16",
NULL, NULL, NULL, NULL, NULL, NULL)
#endif
#if HAVE_SSSE3
HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred8, "Hbd Intra8x8", NULL, NULL,
NULL, NULL, NULL, NULL, NULL,
aom_highbd_d135_predictor_8x8_ssse3,
aom_highbd_d117_predictor_8x8_ssse3,
aom_highbd_d153_predictor_8x8_ssse3, NULL, NULL, NULL,
NULL, NULL, NULL)
#endif
#if CONFIG_SMOOTH_HV
#define smooth_v_pred_func aom_highbd_smooth_v_predictor_8x8_c
#define smooth_h_pred_func aom_highbd_smooth_h_predictor_8x8_c
#define smooth_v_pred_func aom_highbd_smooth_v_predictor_8x4_c
#define smooth_h_pred_func aom_highbd_smooth_h_predictor_8x4_c
#else
#define smooth_v_pred_func NULL
#define smooth_h_pred_func NULL
......@@ -1320,6 +1328,15 @@ HIGHBD_INTRA_PRED_TEST(SSE2_3, TestHighbdIntraPred16, "Hbd Intra16x32",
NULL, NULL, NULL, NULL, NULL, NULL, NULL)
#endif
#if HAVE_SSSE3
HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred16, "Hbd Intra16x16", NULL,
NULL, NULL, NULL, NULL, NULL, NULL,
aom_highbd_d135_predictor_16x16_ssse3,
aom_highbd_d117_predictor_16x16_ssse3,
aom_highbd_d153_predictor_16x16_ssse3, NULL, NULL, NULL,
NULL, NULL, NULL)
#endif
#if CONFIG_SMOOTH_HV
#define smooth_v_pred_func aom_highbd_smooth_v_predictor_16x8_c
#define smooth_h_pred_func aom_highbd_smooth_h_predictor_16x8_c
......@@ -1404,6 +1421,15 @@ HIGHBD_INTRA_PRED_TEST(SSE2_2, TestHighbdIntraPred32, "Hbd Intra32x16",
NULL, NULL, NULL, NULL, NULL, NULL, NULL)
#endif
#if HAVE_SSSE3
HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred32, "Hbd Intra32x32", NULL,
NULL, NULL, NULL, NULL, NULL, NULL,
aom_highbd_d135_predictor_32x32_ssse3,
aom_highbd_d117_predictor_32x32_ssse3,
aom_highbd_d153_predictor_32x32_ssse3, NULL, NULL, NULL,
NULL, NULL, NULL)
#endif
#if CONFIG_SMOOTH_HV
#define smooth_v_pred_func aom_highbd_smooth_v_predictor_32x16_c
#define smooth_h_pred_func aom_highbd_smooth_h_predictor_32x16_c
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment