Commit bbf6186e authored by Yi Luo

Highbd intrapred DC_LEFT/TOP/128 sse2 optimization

Also extend the intra pred speed test to rectangular blocks.
Speedup (i7-6700)
predictor      sse2 v. C
left 4x4       ~5.6x
top  4x4       ~7.2x
128  4x4       ~6.9x
left 4x8       ~7.7x
top  4x8       ~10.1x
128  4x8       ~10.0x

left 8x4       ~8.1x
top  8x4       ~9.1x
128  8x4       ~10.1x
left 8x8       ~10.3x
top  8x8       ~13.6x
128  8x8       ~14.8x
left 8x16      ~12.6x
top  8x16      ~14.0x
128  8x16      ~15.5x

left 16x8      ~6.3x
top  16x8      ~7.0x
128  16x8      ~6.5x
left 16x16     ~6.5x
top  16x16     ~7.1x
128  16x16     ~8.2x
left 16x32     ~5.1x
top  16x32     ~6.4x
128  16x32     ~5.6x

left 32x16     ~4.2x
top  32x16     ~4.3x
128  32x16     ~4.5x
left 32x32     ~3.8x
top  32x32     ~3.7x
128  32x32     ~3.9x

Change-Id: Ie7fcc85b9ded3030ee904623c40e9edeec1695ae
parent 94a504af
......@@ -133,6 +133,36 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
specialize qw/aom_highbd_h_predictor_16x32 sse2/;
specialize qw/aom_highbd_h_predictor_32x16 sse2/;
specialize qw/aom_highbd_h_predictor_32x32 sse2/;
specialize qw/aom_highbd_dc_left_predictor_4x4 sse2/;
specialize qw/aom_highbd_dc_top_predictor_4x4 sse2/;
specialize qw/aom_highbd_dc_128_predictor_4x4 sse2/;
specialize qw/aom_highbd_dc_left_predictor_4x8 sse2/;
specialize qw/aom_highbd_dc_top_predictor_4x8 sse2/;
specialize qw/aom_highbd_dc_128_predictor_4x8 sse2/;
specialize qw/aom_highbd_dc_left_predictor_8x4 sse2/;
specialize qw/aom_highbd_dc_top_predictor_8x4 sse2/;
specialize qw/aom_highbd_dc_128_predictor_8x4 sse2/;
specialize qw/aom_highbd_dc_left_predictor_8x8 sse2/;
specialize qw/aom_highbd_dc_top_predictor_8x8 sse2/;
specialize qw/aom_highbd_dc_128_predictor_8x8 sse2/;
specialize qw/aom_highbd_dc_left_predictor_8x16 sse2/;
specialize qw/aom_highbd_dc_top_predictor_8x16 sse2/;
specialize qw/aom_highbd_dc_128_predictor_8x16 sse2/;
specialize qw/aom_highbd_dc_left_predictor_16x8 sse2/;
specialize qw/aom_highbd_dc_top_predictor_16x8 sse2/;
specialize qw/aom_highbd_dc_128_predictor_16x8 sse2/;
specialize qw/aom_highbd_dc_left_predictor_16x16 sse2/;
specialize qw/aom_highbd_dc_top_predictor_16x16 sse2/;
specialize qw/aom_highbd_dc_128_predictor_16x16 sse2/;
specialize qw/aom_highbd_dc_left_predictor_16x32 sse2/;
specialize qw/aom_highbd_dc_top_predictor_16x32 sse2/;
specialize qw/aom_highbd_dc_128_predictor_16x32 sse2/;
specialize qw/aom_highbd_dc_left_predictor_32x16 sse2/;
specialize qw/aom_highbd_dc_top_predictor_32x16 sse2/;
specialize qw/aom_highbd_dc_128_predictor_32x16 sse2/;
specialize qw/aom_highbd_dc_left_predictor_32x32 sse2/;
specialize qw/aom_highbd_dc_top_predictor_32x32 sse2/;
specialize qw/aom_highbd_dc_128_predictor_32x32 sse2/;
} # CONFIG_HIGHBITDEPTH
#
......
This diff is collapsed.
......@@ -136,45 +136,171 @@ TEST_P(AV1IntraPredTest, IntraPredTests) {
#if HAVE_SSE2
#if CONFIG_HIGHBITDEPTH
// Table of IntraPredFunc entries, each produced by highbd_entry(<predictor>,
// <dim>, <dim>, sse2, 8), that is passed to the SSE2_TO_C_8 instantiation of
// AV1IntraPredTest below.
// The trailing 8 is presumably the bit depth -- confirm against the
// definition of highbd_entry, which is not visible here.
// NOTE(review): the first nine lines pack two entries per line, and every one
// of those entries (dc, v, h) is listed again in the one-entry-per-line part
// that follows. This looks like the pre-change and post-change versions of
// the table shown together; confirm before treating the repeats as intended.
const IntraPredFunc IntraPredTestVector8[] = {
highbd_entry(dc, 4, 4, sse2, 8), highbd_entry(dc, 8, 8, sse2, 8),
highbd_entry(dc, 16, 16, sse2, 8), highbd_entry(dc, 32, 32, sse2, 8),
highbd_entry(v, 4, 4, sse2, 8), highbd_entry(v, 8, 8, sse2, 8),
highbd_entry(v, 16, 16, sse2, 8), highbd_entry(v, 32, 32, sse2, 8),
highbd_entry(h, 4, 4, sse2, 8), highbd_entry(h, 4, 8, sse2, 8),
highbd_entry(h, 8, 4, sse2, 8), highbd_entry(h, 8, 8, sse2, 8),
highbd_entry(h, 8, 16, sse2, 8), highbd_entry(h, 16, 8, sse2, 8),
highbd_entry(h, 16, 16, sse2, 8), highbd_entry(h, 16, 32, sse2, 8),
highbd_entry(h, 32, 16, sse2, 8), highbd_entry(h, 32, 32, sse2, 8),
// dc entries, square sizes only.
highbd_entry(dc, 4, 4, sse2, 8),
highbd_entry(dc, 8, 8, sse2, 8),
highbd_entry(dc, 16, 16, sse2, 8),
highbd_entry(dc, 32, 32, sse2, 8),
// dc_left / dc_top / dc_128 entries, including the rectangular sizes
// 4x8, 8x4, 8x16, 16x8, 16x32 and 32x16.
highbd_entry(dc_left, 4, 4, sse2, 8),
highbd_entry(dc_left, 4, 8, sse2, 8),
highbd_entry(dc_top, 4, 4, sse2, 8),
highbd_entry(dc_top, 4, 8, sse2, 8),
highbd_entry(dc_128, 4, 4, sse2, 8),
highbd_entry(dc_128, 4, 8, sse2, 8),
highbd_entry(dc_left, 8, 4, sse2, 8),
highbd_entry(dc_top, 8, 4, sse2, 8),
highbd_entry(dc_128, 8, 4, sse2, 8),
highbd_entry(dc_left, 8, 8, sse2, 8),
highbd_entry(dc_top, 8, 8, sse2, 8),
highbd_entry(dc_128, 8, 8, sse2, 8),
highbd_entry(dc_left, 8, 16, sse2, 8),
highbd_entry(dc_top, 8, 16, sse2, 8),
highbd_entry(dc_128, 8, 16, sse2, 8),
highbd_entry(dc_left, 16, 8, sse2, 8),
highbd_entry(dc_top, 16, 8, sse2, 8),
highbd_entry(dc_128, 16, 8, sse2, 8),
highbd_entry(dc_left, 16, 16, sse2, 8),
highbd_entry(dc_top, 16, 16, sse2, 8),
highbd_entry(dc_128, 16, 16, sse2, 8),
highbd_entry(dc_left, 16, 32, sse2, 8),
highbd_entry(dc_top, 16, 32, sse2, 8),
highbd_entry(dc_128, 16, 32, sse2, 8),
highbd_entry(dc_left, 32, 16, sse2, 8),
highbd_entry(dc_top, 32, 16, sse2, 8),
highbd_entry(dc_128, 32, 16, sse2, 8),
highbd_entry(dc_left, 32, 32, sse2, 8),
highbd_entry(dc_top, 32, 32, sse2, 8),
highbd_entry(dc_128, 32, 32, sse2, 8),
// v entries, square sizes only.
highbd_entry(v, 4, 4, sse2, 8),
highbd_entry(v, 8, 8, sse2, 8),
highbd_entry(v, 16, 16, sse2, 8),
highbd_entry(v, 32, 32, sse2, 8),
// h entries, square and rectangular sizes.
highbd_entry(h, 4, 4, sse2, 8),
highbd_entry(h, 4, 8, sse2, 8),
highbd_entry(h, 8, 4, sse2, 8),
highbd_entry(h, 8, 8, sse2, 8),
highbd_entry(h, 8, 16, sse2, 8),
highbd_entry(h, 16, 8, sse2, 8),
highbd_entry(h, 16, 16, sse2, 8),
highbd_entry(h, 16, 32, sse2, 8),
highbd_entry(h, 32, 16, sse2, 8),
highbd_entry(h, 32, 32, sse2, 8),
};
// Registers one AV1IntraPredTest parameterization per table element under the
// instantiation name SSE2_TO_C_8.
INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, AV1IntraPredTest,
::testing::ValuesIn(IntraPredTestVector8));
// Table of IntraPredFunc entries, each produced by highbd_entry(<predictor>,
// <dim>, <dim>, sse2, 10), that is passed to the SSE2_TO_C_10 instantiation of
// AV1IntraPredTest below.
// The trailing 10 is presumably the bit depth -- confirm against the
// definition of highbd_entry, which is not visible here.
// NOTE(review): the first nine lines pack two entries per line, and every one
// of those entries (dc, v, h) is listed again in the one-entry-per-line part
// that follows. This looks like the pre-change and post-change versions of
// the table shown together; confirm before treating the repeats as intended.
const IntraPredFunc IntraPredTestVector10[] = {
highbd_entry(dc, 4, 4, sse2, 10), highbd_entry(dc, 8, 8, sse2, 10),
highbd_entry(dc, 16, 16, sse2, 10), highbd_entry(dc, 32, 32, sse2, 10),
highbd_entry(v, 4, 4, sse2, 10), highbd_entry(v, 8, 8, sse2, 10),
highbd_entry(v, 16, 16, sse2, 10), highbd_entry(v, 32, 32, sse2, 10),
highbd_entry(h, 4, 4, sse2, 10), highbd_entry(h, 4, 8, sse2, 10),
highbd_entry(h, 8, 4, sse2, 10), highbd_entry(h, 8, 8, sse2, 10),
highbd_entry(h, 8, 16, sse2, 10), highbd_entry(h, 16, 8, sse2, 10),
highbd_entry(h, 16, 16, sse2, 10), highbd_entry(h, 16, 32, sse2, 10),
highbd_entry(h, 32, 16, sse2, 10), highbd_entry(h, 32, 32, sse2, 10),
// dc entries, square sizes only.
highbd_entry(dc, 4, 4, sse2, 10),
highbd_entry(dc, 8, 8, sse2, 10),
highbd_entry(dc, 16, 16, sse2, 10),
highbd_entry(dc, 32, 32, sse2, 10),
// dc_left / dc_top / dc_128 entries, including the rectangular sizes
// 4x8, 8x4, 8x16, 16x8, 16x32 and 32x16.
highbd_entry(dc_left, 4, 4, sse2, 10),
highbd_entry(dc_left, 4, 8, sse2, 10),
highbd_entry(dc_top, 4, 4, sse2, 10),
highbd_entry(dc_top, 4, 8, sse2, 10),
highbd_entry(dc_128, 4, 4, sse2, 10),
highbd_entry(dc_128, 4, 8, sse2, 10),
highbd_entry(dc_left, 8, 4, sse2, 10),
highbd_entry(dc_top, 8, 4, sse2, 10),
highbd_entry(dc_128, 8, 4, sse2, 10),
highbd_entry(dc_left, 8, 8, sse2, 10),
highbd_entry(dc_top, 8, 8, sse2, 10),
highbd_entry(dc_128, 8, 8, sse2, 10),
highbd_entry(dc_left, 8, 16, sse2, 10),
highbd_entry(dc_top, 8, 16, sse2, 10),
highbd_entry(dc_128, 8, 16, sse2, 10),
highbd_entry(dc_left, 16, 8, sse2, 10),
highbd_entry(dc_top, 16, 8, sse2, 10),
highbd_entry(dc_128, 16, 8, sse2, 10),
highbd_entry(dc_left, 16, 16, sse2, 10),
highbd_entry(dc_top, 16, 16, sse2, 10),
highbd_entry(dc_128, 16, 16, sse2, 10),
highbd_entry(dc_left, 16, 32, sse2, 10),
highbd_entry(dc_top, 16, 32, sse2, 10),
highbd_entry(dc_128, 16, 32, sse2, 10),
highbd_entry(dc_left, 32, 16, sse2, 10),
highbd_entry(dc_top, 32, 16, sse2, 10),
highbd_entry(dc_128, 32, 16, sse2, 10),
highbd_entry(dc_left, 32, 32, sse2, 10),
highbd_entry(dc_top, 32, 32, sse2, 10),
highbd_entry(dc_128, 32, 32, sse2, 10),
// v entries, square sizes only.
highbd_entry(v, 4, 4, sse2, 10),
highbd_entry(v, 8, 8, sse2, 10),
highbd_entry(v, 16, 16, sse2, 10),
highbd_entry(v, 32, 32, sse2, 10),
// h entries, square and rectangular sizes.
highbd_entry(h, 4, 4, sse2, 10),
highbd_entry(h, 4, 8, sse2, 10),
highbd_entry(h, 8, 4, sse2, 10),
highbd_entry(h, 8, 8, sse2, 10),
highbd_entry(h, 8, 16, sse2, 10),
highbd_entry(h, 16, 8, sse2, 10),
highbd_entry(h, 16, 16, sse2, 10),
highbd_entry(h, 16, 32, sse2, 10),
highbd_entry(h, 32, 16, sse2, 10),
highbd_entry(h, 32, 32, sse2, 10),
};
// Registers one AV1IntraPredTest parameterization per table element under the
// instantiation name SSE2_TO_C_10.
INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, AV1IntraPredTest,
::testing::ValuesIn(IntraPredTestVector10));
// Table of IntraPredFunc entries, each produced by highbd_entry(<predictor>,
// <dim>, <dim>, sse2, 12). An INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, ...) call
// begins right after this table, but its remaining arguments are not visible
// here -- presumably it consumes this table; confirm.
// The trailing 12 is presumably the bit depth -- confirm against the
// definition of highbd_entry, which is not visible here.
// NOTE(review): the first nine lines pack two entries per line, and every one
// of those entries (dc, v, h) is listed again in the one-entry-per-line part
// that follows. This looks like the pre-change and post-change versions of
// the table shown together; confirm before treating the repeats as intended.
const IntraPredFunc IntraPredTestVector12[] = {
highbd_entry(dc, 4, 4, sse2, 12), highbd_entry(dc, 8, 8, sse2, 12),
highbd_entry(dc, 16, 16, sse2, 12), highbd_entry(dc, 32, 32, sse2, 12),
highbd_entry(v, 4, 4, sse2, 12), highbd_entry(v, 8, 8, sse2, 12),
highbd_entry(v, 16, 16, sse2, 12), highbd_entry(v, 32, 32, sse2, 12),
highbd_entry(h, 4, 4, sse2, 12), highbd_entry(h, 4, 8, sse2, 12),
highbd_entry(h, 8, 4, sse2, 12), highbd_entry(h, 8, 8, sse2, 12),
highbd_entry(h, 8, 16, sse2, 12), highbd_entry(h, 16, 8, sse2, 12),
highbd_entry(h, 16, 16, sse2, 12), highbd_entry(h, 16, 32, sse2, 12),
highbd_entry(h, 32, 16, sse2, 12), highbd_entry(h, 32, 32, sse2, 12),
// dc entries, square sizes only.
highbd_entry(dc, 4, 4, sse2, 12),
highbd_entry(dc, 8, 8, sse2, 12),
highbd_entry(dc, 16, 16, sse2, 12),
highbd_entry(dc, 32, 32, sse2, 12),
// dc_left / dc_top / dc_128 entries, including the rectangular sizes
// 4x8, 8x4, 8x16, 16x8, 16x32 and 32x16.
highbd_entry(dc_left, 4, 4, sse2, 12),
highbd_entry(dc_left, 4, 8, sse2, 12),
highbd_entry(dc_top, 4, 4, sse2, 12),
highbd_entry(dc_top, 4, 8, sse2, 12),
highbd_entry(dc_128, 4, 4, sse2, 12),
highbd_entry(dc_128, 4, 8, sse2, 12),
highbd_entry(dc_left, 8, 4, sse2, 12),
highbd_entry(dc_top, 8, 4, sse2, 12),
highbd_entry(dc_128, 8, 4, sse2, 12),
highbd_entry(dc_left, 8, 8, sse2, 12),
highbd_entry(dc_top, 8, 8, sse2, 12),
highbd_entry(dc_128, 8, 8, sse2, 12),
highbd_entry(dc_left, 8, 16, sse2, 12),
highbd_entry(dc_top, 8, 16, sse2, 12),
highbd_entry(dc_128, 8, 16, sse2, 12),
highbd_entry(dc_left, 16, 8, sse2, 12),
highbd_entry(dc_top, 16, 8, sse2, 12),
highbd_entry(dc_128, 16, 8, sse2, 12),
highbd_entry(dc_left, 16, 16, sse2, 12),
highbd_entry(dc_top, 16, 16, sse2, 12),
highbd_entry(dc_128, 16, 16, sse2, 12),
highbd_entry(dc_left, 16, 32, sse2, 12),
highbd_entry(dc_top, 16, 32, sse2, 12),
highbd_entry(dc_128, 16, 32, sse2, 12),
highbd_entry(dc_left, 32, 16, sse2, 12),
highbd_entry(dc_top, 32, 16, sse2, 12),
highbd_entry(dc_128, 32, 16, sse2, 12),
highbd_entry(dc_left, 32, 32, sse2, 12),
highbd_entry(dc_top, 32, 32, sse2, 12),
highbd_entry(dc_128, 32, 32, sse2, 12),
// v entries, square sizes only.
highbd_entry(v, 4, 4, sse2, 12),
highbd_entry(v, 8, 8, sse2, 12),
highbd_entry(v, 16, 16, sse2, 12),
highbd_entry(v, 32, 32, sse2, 12),
// h entries, square and rectangular sizes.
highbd_entry(h, 4, 4, sse2, 12),
highbd_entry(h, 4, 8, sse2, 12),
highbd_entry(h, 8, 4, sse2, 12),
highbd_entry(h, 8, 8, sse2, 12),
highbd_entry(h, 8, 16, sse2, 12),
highbd_entry(h, 16, 8, sse2, 12),
highbd_entry(h, 16, 16, sse2, 12),
highbd_entry(h, 16, 32, sse2, 12),
highbd_entry(h, 32, 16, sse2, 12),
highbd_entry(h, 32, 32, sse2, 12),
};
INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, AV1IntraPredTest,
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment