Commit 37ebf187 authored by Urvang Joshi

[Normative] DC_PRED: Avoid div using mult+shift.

DC_PRED requires dividing by 'count' = block width + height.
- For square blocks this is efficient, as count is a power of 2.
- But for rectangular blocks whose sides are in a 1:2 or 1:4 ratio (e.g. 4x8, 16x8), count is not a power of 2, so the division is inefficient.

So, we approximate this division with an integer multiply and a shift.

test_intra_pred_speed test results:

Size    Before (ms) After (ms)
4x8     221         206
8x4     195         189
8x16    121          88
16x8     84          73
16x32    63          60
32x16    68          66

Compression quality is neutral:
- Division vs 32-bit mult + shift is neutral:
https://arewecompressedyet.com/?job=ALL_rectpred_bef%402018-01-10T19%3A02%3A20.745Z&job=ALL_rectpred_aft_mult_shift%402018-01-10T19%3A04%3A21.380Z
- And 32-bit mult + shift vs 8-bit mult + shift is also neutral:
https://arewecompressedyet.com/?job=mult_shift_32bit%402018-01-16T20%3A30%3A21.418Z&job=mult_shift_08bit%402018-01-16T20%3A31%3A25.673Z

BUG=aomedia:1191

Change-Id: I2343a055bc3d85cb7be7652d7a6db1b768361be9
parent 1e959897
......@@ -172,20 +172,24 @@ specialize qw/aom_smooth_predictor_32x32 ssse3/;
specialize qw/aom_d135_predictor_4x4 neon/;
specialize qw/aom_d153_predictor_4x4 ssse3/;
specialize qw/aom_dc_predictor_4x4 dspr2 msa neon sse2/;
specialize qw/aom_dc_predictor_4x8 sse2/;
specialize qw/aom_d153_predictor_8x8 ssse3/;
specialize qw/aom_dc_predictor_8x4 sse2/;
specialize qw/aom_dc_predictor_8x8 dspr2 neon msa sse2/;
specialize qw/aom_dc_predictor_8x16 sse2/;
specialize qw/aom_d153_predictor_16x16 ssse3/;
specialize qw/aom_dc_predictor_16x8 sse2/;
specialize qw/aom_dc_predictor_16x16 dspr2 neon msa sse2/;
specialize qw/aom_dc_predictor_16x32 sse2/;
specialize qw/aom_d153_predictor_32x32 ssse3/;
specialize qw/aom_dc_predictor_32x16 sse2 avx2/;
specialize qw/aom_dc_predictor_4x4 dspr2 msa neon sse2/;
specialize qw/aom_dc_predictor_8x8 dspr2 neon msa sse2/;
specialize qw/aom_dc_predictor_16x16 dspr2 neon msa sse2/;
specialize qw/aom_dc_predictor_32x32 msa neon sse2 avx2/;
# TODO(luoyi): Need to rewrite these.
# specialize qw/aom_dc_predictor_4x8 sse2/;
# specialize qw/aom_dc_predictor_8x4 sse2/;
# specialize qw/aom_dc_predictor_8x16 sse2/;
# specialize qw/aom_dc_predictor_16x8 sse2/;
# specialize qw/aom_dc_predictor_16x32 sse2/;
# specialize qw/aom_dc_predictor_32x16 sse2 avx2/;
specialize qw/aom_d207e_predictor_4x4 sse2/;
specialize qw/aom_d207e_predictor_4x8 sse2/;
specialize qw/aom_d207e_predictor_8x4 sse2/;
......@@ -229,16 +233,20 @@ specialize qw/aom_d45e_predictor_32x32 ssse3/;
specialize qw/aom_highbd_v_predictor_16x32 sse2/;
specialize qw/aom_highbd_v_predictor_32x16 sse2/;
specialize qw/aom_highbd_v_predictor_32x32 sse2/;
specialize qw/aom_highbd_dc_predictor_4x4 sse2/;
specialize qw/aom_highbd_dc_predictor_4x8 sse2/;
specialize qw/aom_highbd_dc_predictor_8x4 sse2/;;
specialize qw/aom_highbd_dc_predictor_8x8 sse2/;;
specialize qw/aom_highbd_dc_predictor_8x16 sse2/;;
specialize qw/aom_highbd_dc_predictor_16x8 sse2/;
specialize qw/aom_highbd_dc_predictor_8x8 sse2/;
specialize qw/aom_highbd_dc_predictor_16x16 sse2/;
specialize qw/aom_highbd_dc_predictor_16x32 sse2/;
specialize qw/aom_highbd_dc_predictor_32x16 sse2/;
specialize qw/aom_highbd_dc_predictor_32x32 sse2/;
# TODO(luoyi): Need to rewrite these
# specialize qw/aom_highbd_dc_predictor_4x8 sse2/;
# specialize qw/aom_highbd_dc_predictor_8x4 sse2/;
# specialize qw/aom_highbd_dc_predictor_8x16 sse2/;
# specialize qw/aom_highbd_dc_predictor_16x8 sse2/;
# specialize qw/aom_highbd_dc_predictor_16x32 sse2/;
# specialize qw/aom_highbd_dc_predictor_32x16 sse2/;
specialize qw/aom_highbd_h_predictor_4x4 sse2/;
specialize qw/aom_highbd_h_predictor_4x8 sse2/;
specialize qw/aom_highbd_h_predictor_8x4 sse2/;
......
This diff is collapsed.
......@@ -335,10 +335,10 @@ INSTANTIATE_TEST_CASE_P(AVX2_TO_C_12, HighbdIntraPredTest,
#if HAVE_SSE2
const IntraPredFunc<IntraPred> LowbdIntraPredTestVector[] = {
lowbd_intrapred(d63e, sse2), lowbd_intrapred(d207e, sse2),
lowbd_intrapred(dc, sse2), lowbd_intrapred(dc_top, sse2),
lowbd_intrapred(dc_left, sse2), lowbd_intrapred(dc_128, sse2),
lowbd_intrapred(v, sse2), lowbd_intrapred(h, sse2),
lowbd_intrapred(d63e, sse2), lowbd_intrapred(d207e, sse2),
lowbd_intrapred(dc_top, sse2), lowbd_intrapred(dc_left, sse2),
lowbd_intrapred(dc_128, sse2), lowbd_intrapred(v, sse2),
lowbd_intrapred(h, sse2),
};
INSTANTIATE_TEST_CASE_P(SSE2, LowbdIntraPredTest,
......@@ -362,11 +362,11 @@ const IntraPredFunc<IntraPred> LowbdIntraPredTestVectorAvx2[] = {
lowbd_entry(dc, 32, 32, avx2), lowbd_entry(dc_top, 32, 32, avx2),
lowbd_entry(dc_left, 32, 32, avx2), lowbd_entry(dc_128, 32, 32, avx2),
lowbd_entry(v, 32, 32, avx2), lowbd_entry(h, 32, 32, avx2),
lowbd_entry(dc, 32, 16, avx2), lowbd_entry(dc_top, 32, 16, avx2),
lowbd_entry(dc_left, 32, 16, avx2), lowbd_entry(dc_128, 32, 16, avx2),
lowbd_entry(v, 32, 16, avx2), lowbd_entry(paeth, 16, 8, avx2),
lowbd_entry(paeth, 16, 16, avx2), lowbd_entry(paeth, 16, 32, avx2),
lowbd_entry(paeth, 32, 16, avx2), lowbd_entry(paeth, 32, 32, avx2),
lowbd_entry(dc_top, 32, 16, avx2), lowbd_entry(dc_left, 32, 16, avx2),
lowbd_entry(dc_128, 32, 16, avx2), lowbd_entry(v, 32, 16, avx2),
lowbd_entry(paeth, 16, 8, avx2), lowbd_entry(paeth, 16, 16, avx2),
lowbd_entry(paeth, 16, 32, avx2), lowbd_entry(paeth, 32, 16, avx2),
lowbd_entry(paeth, 32, 32, avx2),
};
INSTANTIATE_TEST_CASE_P(AVX2, LowbdIntraPredTest,
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment