diff --git a/aom_dsp/intrapred.c b/aom_dsp/intrapred.c index 1b0bca45384da6d363eef9f11ca92c3c7868c688..50b6a081bbc9d798878861458fa32fde1d682116 100644 --- a/aom_dsp/intrapred.c +++ b/aom_dsp/intrapred.c @@ -9,6 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +#include <assert.h> #include <math.h> #include "./aom_config.h" @@ -259,61 +260,75 @@ static INLINE void paeth_predictor(uint8_t *dst, ptrdiff_t stride, int bs, } } -// Weights are quadratic from 'bs' to '1'. -// Scale is same as 'bs'. -// TODO(urvang): Integerize the weights at a suitable precision. +// Weights are quadratic from 'bs' to '1', scaled by 2^12. +// TODO(urvang): All weights can be at the same scale: going from '1' to '1/bs' +// instead (still scaled by 2^12 or more). +// Rationale: Given that max block dimension is 64 (=2^6), and max pixel value +// is below 2^12 (for both normal and highbitdepth), power of (31 - 6 - 12 - 1) +// = 12 is chosen so that all weighted sums in smooth_predictor() remain within +// 2^31 (unsigned integer) range. 
+static const int sm_weight_log2_scale = 12; + #if CONFIG_TX64X64 -static const double sm_weight_arrays[6][64] = { +static const uint32_t sm_weight_arrays[6][64] = { #else -static const double sm_weight_arrays[5][32] = { +static const uint32_t sm_weight_arrays[5][32] = { #endif // CONFIG_TX64X64 // bs = 2 - { 2, 1 }, + { 8192, 4096 }, // bs = 4 - { 4, 2.33333, 1.33333, 1 }, + { 16384, 9557, 5461, 4096 }, // bs = 8 - { 8, 6.14286, 4.57143, 3.28571, 2.28571, 1.57143, 1.14286, 1 }, + { 32768, 25161, 18725, 13458, 9362, 6437, 4681, 4096 }, // bs = 16 - { 16, 14.0667, 12.2667, 10.6, 9.06667, 7.66667, 6.4, 5.26667, 4.26667, 3.4, - 2.66667, 2.06667, 1.6, 1.26667, 1.06667, 1 }, + { 65536, 57617, 50244, 43418, 37137, 31403, 26214, 21572, 17476, 13926, 10923, + 8465, 6554, 5188, 4369, 4096 }, // bs = 32 - { 32, 30.0323, 28.129, 26.2903, 24.5161, 22.8065, 21.1613, 19.5806, - 18.0645, 16.6129, 15.2258, 13.9032, 12.6452, 11.4516, 10.3226, 9.25806, - 8.25806, 7.32258, 6.45161, 5.64516, 4.90323, 4.22581, 3.6129, 3.06452, - 2.58065, 2.16129, 1.80645, 1.51613, 1.29032, 1.12903, 1.03226, 1 }, + { 131072, 123012, 115217, 107685, 100418, 93415, 86677, 80202, + 73992, 68046, 62365, 56948, 51795, 46906, 42281, 37921, + 33825, 29993, 26426, 23123, 20084, 17309, 14798, 12552, + 10570, 8853, 7399, 6210, 5285, 4625, 4228, 4096 }, #if CONFIG_TX64X64 // bs = 64 - { 64, 62.0159, 60.0635, 58.1429, 56.254, 54.3968, 52.5714, 50.7778, - 49.0159, 47.2857, 45.5873, 43.9206, 42.2857, 40.6825, 39.1111, 37.5714, - 36.0635, 34.5873, 33.1429, 31.7302, 30.3492, 29, 27.6825, 26.3968, - 25.1429, 23.9206, 22.7302, 21.5714, 20.4444, 19.3492, 18.2857, 17.254, - 16.254, 15.2857, 14.3492, 13.4444, 12.5714, 11.7302, 10.9206, 10.1429, - 9.39683, 8.68254, 8, 7.34921, 6.73016, 6.14286, 5.5873, 5.06349, - 4.57143, 4.11111, 3.68254, 3.28571, 2.92063, 2.5873, 2.28571, 2.01587, - 1.77778, 1.57143, 1.39683, 1.25397, 1.14286, 1.06349, 1.01587, 1 }, + { 262144, 254017, 246020, 238153, 230416, 222809, 215333, 207986, + 
200769, 193682, 186726, 179899, 173202, 166636, 160199, 153893, + 147716, 141670, 135753, 129967, 124310, 118784, 113388, 108121, + 102985, 97979, 93103, 88357, 83740, 79254, 74898, 70672, + 66576, 62610, 58774, 55068, 51493, 48047, 44731, 41545, + 38489, 35564, 32768, 30102, 27567, 25161, 22886, 20740, + 18725, 16839, 15084, 13458, 11963, 10598, 9362, 8257, + 7282, 6437, 5721, 5136, 4681, 4356, 4161, 4096 }, #endif // CONFIG_TX64X64 }; +#define divide_round(value, bits) (((value) + (1 << ((bits)-1))) >> (bits)) + static INLINE void smooth_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { const uint8_t below_pred = left[bs - 1]; // estimated by bottom-left pixel const uint8_t right_pred = above[bs - 1]; // estimated by top-right pixel - const int arr_index = (int)lround(log2(bs)) - 1; - const double *const sm_weights = sm_weight_arrays[arr_index]; - const double scale = 2.0 * bs; + const int log2_bs = (int)lround(log2(bs)); + const int arr_index = log2_bs - 1; + const uint32_t *const sm_weights = sm_weight_arrays[arr_index]; + // scale = 2 * bs * 2^sm_weight_log2_scale + const int log2_scale = 1 + log2_bs + sm_weight_log2_scale; + assert(log2_scale + 8 < 8 * 31); // sanity check: no overflow. 
+ const uint32_t scaled_bs = sm_weights[0]; + assert((int)scaled_bs == (bs << sm_weight_log2_scale)); int r; for (r = 0; r < bs; ++r) { int c; for (c = 0; c < bs; ++c) { - const int pixels[] = { above[c], below_pred, left[r], right_pred }; - const double weights[] = { sm_weights[r], bs - sm_weights[r], - sm_weights[c], bs - sm_weights[c] }; - double this_pred = 0; + const uint8_t pixels[] = { above[c], below_pred, left[r], right_pred }; + const uint32_t weights[] = { sm_weights[r], scaled_bs - sm_weights[r], + sm_weights[c], scaled_bs - sm_weights[c] }; + uint32_t this_pred = 0; int i; + assert(scaled_bs >= sm_weights[r] && scaled_bs >= sm_weights[c]); for (i = 0; i < 4; ++i) { this_pred += weights[i] * pixels[i]; } - dst[c] = clip_pixel(lround(this_pred / scale)); + dst[c] = clip_pixel(divide_round(this_pred, log2_scale)); } dst += stride; } @@ -1027,22 +1042,28 @@ static INLINE void highbd_smooth_predictor(uint16_t *dst, ptrdiff_t stride, const uint16_t *left, int bd) { const uint16_t below_pred = left[bs - 1]; // estimated by bottom-left pixel const uint16_t right_pred = above[bs - 1]; // estimated by top-right pixel - const int arr_index = (int)lround(log2(bs)) - 1; - const double *const sm_weights = sm_weight_arrays[arr_index]; - const double scale = 2.0 * bs; + const int log2_bs = (int)lround(log2(bs)); + const int arr_index = log2_bs - 1; + const uint32_t *const sm_weights = sm_weight_arrays[arr_index]; + // scale = 2 * bs * 2^sm_weight_log2_scale + const int log2_scale = 1 + log2_bs + sm_weight_log2_scale; + assert(log2_scale + 8 < 8 * 31); // sanity check: no overflow. 
+ const uint32_t scaled_bs = sm_weights[0]; + assert((int)scaled_bs == (bs << sm_weight_log2_scale)); int r; for (r = 0; r < bs; ++r) { int c; for (c = 0; c < bs; ++c) { - const int pixels[] = { above[c], below_pred, left[r], right_pred }; - const double weights[] = { sm_weights[r], bs - sm_weights[r], - sm_weights[c], bs - sm_weights[c] }; - double this_pred = 0; + const uint16_t pixels[] = { above[c], below_pred, left[r], right_pred }; + const uint32_t weights[] = { sm_weights[r], scaled_bs - sm_weights[r], + sm_weights[c], scaled_bs - sm_weights[c] }; + uint32_t this_pred = 0; int i; + assert(scaled_bs >= sm_weights[r] && scaled_bs >= sm_weights[c]); for (i = 0; i < 4; ++i) { this_pred += weights[i] * pixels[i]; } - dst[c] = clip_pixel_highbd(lround(this_pred / scale), bd); + dst[c] = clip_pixel_highbd(divide_round(this_pred, log2_scale), bd); } dst += stride; } diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index ddaeb561376171df296e76ddcaf55c4fb41fd3dd..ce4b3c1d24080eb25c55a45bdfdb649454eadddc 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc @@ -129,7 +129,7 @@ void TestIntraPred8(AvxPredFunc const *pred_funcs) { "95f7bfc262329a5849eda66d8f7c68ce", #if CONFIG_ALT_INTRA "f6ade499c626d38eb70661184b79bc57", - "28a52163fa8bd2216e6af1ce3113af09" + "f9217748b7188479c2990e42d2dc1da1" #else "815b75c8e0d91cc1ae766dc5d3e445a3", #endif // CONFIG_ALT_INTRA @@ -154,7 +154,7 @@ void TestIntraPred16(AvxPredFunc const *pred_funcs) { "a8fe1c70432f09d0c20c67bdb6432c4d", #if CONFIG_ALT_INTRA "7adcaaa3554eb71a81fc48cb9043984b", - "3f83cda25a2c1647e1b48803922c33df" + "de44142b9670ab7c85d4c318c47257e5" #else "b8a41aa968ec108af447af4217cba91b", #endif // CONFIG_ALT_INTRA