### Rect DC_PRED: mult and shifts equivalent to div.

```
(1) We use an initial variable shift, and then
(2) a multiply + 16-bit shift for bit-depth = 8, OR
    a multiply + 17-bit shift for bit-depths = 10 and 12.

All the constants (shifts and multipliers) for different block sizes are
based on "Algorithm 1" in
http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=1467632

Note:
This also reverts test hashes etc to the version before
commit 37ebf187.

BUG=aomedia:1191

Change-Id: I2aaebc3e95958a3f145c0408aa5cbcc85e30c8dc
```
parent e1f0b227
 ... ... @@ -172,20 +172,19 @@ specialize qw/aom_d153_predictor_8x8 ssse3/; specialize qw/aom_d153_predictor_16x16 ssse3/; specialize qw/aom_d153_predictor_32x32 ssse3/; # TODO(yungingwang): optimize rectangular DC_PRED to replace division # by multiply and shift. specialize qw/aom_dc_predictor_4x4 dspr2 msa neon sse2/; specialize qw/aom_dc_predictor_4x8 sse2/; specialize qw/aom_dc_predictor_8x4 sse2/; specialize qw/aom_dc_predictor_8x8 dspr2 neon msa sse2/; specialize qw/aom_dc_predictor_8x16 sse2/; specialize qw/aom_dc_predictor_16x8 sse2/; specialize qw/aom_dc_predictor_16x16 dspr2 neon msa sse2/; specialize qw/aom_dc_predictor_16x32 sse2/; specialize qw/aom_dc_predictor_32x16 sse2 avx2/; specialize qw/aom_dc_predictor_32x32 msa neon sse2 avx2/; # TODO(luoyi): Need to rewrite these. # specialize qw/aom_dc_predictor_4x8 sse2/; # specialize qw/aom_dc_predictor_8x4 sse2/; # specialize qw/aom_dc_predictor_8x16 sse2/; # specialize qw/aom_dc_predictor_16x8 sse2/; # specialize qw/aom_dc_predictor_16x32 sse2/; # specialize qw/aom_dc_predictor_32x16 sse2 avx2/; specialize qw/aom_d207e_predictor_4x4 sse2/; specialize qw/aom_d207e_predictor_4x8 sse2/; specialize qw/aom_d207e_predictor_8x4 sse2/; ... ... @@ -230,18 +229,18 @@ specialize qw/aom_d45e_predictor_32x32 ssse3/; specialize qw/aom_highbd_v_predictor_32x16 sse2/; specialize qw/aom_highbd_v_predictor_32x32 sse2/; # TODO(yungingwang): optimize rectangular DC_PRED to replace division # by multiply and shift. 
specialize qw/aom_highbd_dc_predictor_4x4 sse2/; specialize qw/aom_highbd_dc_predictor_8x8 sse2/; specialize qw/aom_highbd_dc_predictor_4x8 sse2/; specialize qw/aom_highbd_dc_predictor_8x4 sse2/;; specialize qw/aom_highbd_dc_predictor_8x8 sse2/;; specialize qw/aom_highbd_dc_predictor_8x16 sse2/;; specialize qw/aom_highbd_dc_predictor_16x8 sse2/; specialize qw/aom_highbd_dc_predictor_16x16 sse2/; specialize qw/aom_highbd_dc_predictor_16x32 sse2/; specialize qw/aom_highbd_dc_predictor_32x16 sse2/; specialize qw/aom_highbd_dc_predictor_32x32 sse2/; # TODO(luoyi): Need to rewrite these # specialize qw/aom_highbd_dc_predictor_4x8 sse2/; # specialize qw/aom_highbd_dc_predictor_8x4 sse2/; # specialize qw/aom_highbd_dc_predictor_8x16 sse2/; # specialize qw/aom_highbd_dc_predictor_16x8 sse2/; # specialize qw/aom_highbd_dc_predictor_16x32 sse2/; # specialize qw/aom_highbd_dc_predictor_32x16 sse2/; specialize qw/aom_highbd_h_predictor_4x4 sse2/; specialize qw/aom_highbd_h_predictor_4x8 sse2/; ... ...
 ... ... @@ -361,101 +361,133 @@ static INLINE void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, } } static INLINE int divide_using_multiply_shift(int num, int shift1, int multiplier, int shift2) { const int interm = num >> shift1; return interm * multiplier >> shift2; } // The constants (multiplier and shifts) for a given block size are obtained // as follows: // - Let sum_w_h = block width + block height. // - Shift 'sum_w_h' right until we reach an odd number. Let the number of // shifts for that block size be called 'shift1' (see the parameter in // dc_predictor_rect() function), and let the odd number be 'd'. [d has only 2 // possible values: d = 3 for a 1:2 rect block and d = 5 for a 1:4 rect // block]. // - Find multipliers for (i) dividing by 3, and (ii) dividing by 5, // using the "Algorithm 1" in: // http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=1467632 // by ensuring that m + n = 16 (in that algorithm). This ensures that our 2nd // shift will be 16, regardless of the block size. // Note: For low bitdepth, assembly code may be optimized by using smaller // constants for smaller block sizes, where the range of the 'sum' is // restricted to fewer bits. 
#define DC_MULTIPLIER_1X2 0x5556 #define DC_MULTIPLIER_1X4 0x3334 #define DC_SHIFT2 16 static INLINE void dc_predictor_rect(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, uint32_t multiplier, int shift) { int i, r, expected_dc, sum = 0; const uint8_t *left, int shift1, int multiplier) { int sum = 0; for (i = 0; i < bw; i++) { for (int i = 0; i < bw; i++) { sum += above[i]; } for (i = 0; i < bh; i++) { for (int i = 0; i < bh; i++) { sum += left[i]; } expected_dc = (int)(((uint64_t)sum * multiplier) >> shift); expected_dc = clip_pixel(expected_dc); const int expected_dc = divide_using_multiply_shift( sum + ((bw + bh) >> 1), shift1, multiplier, DC_SHIFT2); assert(expected_dc < (1 << 8)); for (r = 0; r < bh; r++) { for (int r = 0; r < bh; r++) { memset(dst, expected_dc, bw); dst += stride; } } #define DC_MULTIPLIER_1X2 0xABU #define DC_MULTIPLIER_1X4 0xCDU #undef DC_SHIFT2 void aom_dc_predictor_4x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { dc_predictor_rect(dst, stride, 4, 8, above, left, DC_MULTIPLIER_1X2, 11); dc_predictor_rect(dst, stride, 4, 8, above, left, 2, DC_MULTIPLIER_1X2); } void aom_dc_predictor_8x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { dc_predictor_rect(dst, stride, 8, 4, above, left, DC_MULTIPLIER_1X2, 11); dc_predictor_rect(dst, stride, 8, 4, above, left, 2, DC_MULTIPLIER_1X2); } void aom_dc_predictor_4x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { dc_predictor_rect(dst, stride, 4, 16, above, left, DC_MULTIPLIER_1X4, 12); dc_predictor_rect(dst, stride, 4, 16, above, left, 2, DC_MULTIPLIER_1X4); } void aom_dc_predictor_16x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { dc_predictor_rect(dst, stride, 16, 4, above, left, DC_MULTIPLIER_1X4, 12); dc_predictor_rect(dst, stride, 16, 4, above, left, 2, DC_MULTIPLIER_1X4); } void aom_dc_predictor_8x16_c(uint8_t *dst, 
ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { dc_predictor_rect(dst, stride, 8, 16, above, left, DC_MULTIPLIER_1X2, 12); dc_predictor_rect(dst, stride, 8, 16, above, left, 3, DC_MULTIPLIER_1X2); } void aom_dc_predictor_16x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { dc_predictor_rect(dst, stride, 16, 8, above, left, DC_MULTIPLIER_1X2, 12); dc_predictor_rect(dst, stride, 16, 8, above, left, 3, DC_MULTIPLIER_1X2); } void aom_dc_predictor_8x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { dc_predictor_rect(dst, stride, 8, 32, above, left, DC_MULTIPLIER_1X4, 13); dc_predictor_rect(dst, stride, 8, 32, above, left, 3, DC_MULTIPLIER_1X4); } void aom_dc_predictor_32x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { dc_predictor_rect(dst, stride, 32, 8, above, left, DC_MULTIPLIER_1X4, 13); dc_predictor_rect(dst, stride, 32, 8, above, left, 3, DC_MULTIPLIER_1X4); } void aom_dc_predictor_16x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { dc_predictor_rect(dst, stride, 16, 32, above, left, DC_MULTIPLIER_1X2, 13); dc_predictor_rect(dst, stride, 16, 32, above, left, 4, DC_MULTIPLIER_1X2); } void aom_dc_predictor_32x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { dc_predictor_rect(dst, stride, 32, 16, above, left, DC_MULTIPLIER_1X2, 13); dc_predictor_rect(dst, stride, 32, 16, above, left, 4, DC_MULTIPLIER_1X2); } void aom_dc_predictor_16x64_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { dc_predictor_rect(dst, stride, 16, 64, above, left, DC_MULTIPLIER_1X4, 14); dc_predictor_rect(dst, stride, 16, 64, above, left, 4, DC_MULTIPLIER_1X4); } void aom_dc_predictor_64x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { dc_predictor_rect(dst, stride, 64, 16, above, left, DC_MULTIPLIER_1X4, 14); dc_predictor_rect(dst, stride, 64, 16, above, left, 
4, DC_MULTIPLIER_1X4); } void aom_dc_predictor_32x64_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { dc_predictor_rect(dst, stride, 32, 64, above, left, DC_MULTIPLIER_1X2, 14); dc_predictor_rect(dst, stride, 32, 64, above, left, 5, DC_MULTIPLIER_1X2); } void aom_dc_predictor_64x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { dc_predictor_rect(dst, stride, 64, 32, above, left, DC_MULTIPLIER_1X2, 14); dc_predictor_rect(dst, stride, 64, 32, above, left, 5, DC_MULTIPLIER_1X2); } #undef DC_MULTIPLIER_1X2 #undef DC_MULTIPLIER_1X4 void aom_d45e_predictor_2x2_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const int A = above; ... ... @@ -1003,127 +1035,148 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw, } } // Obtained similarly as DC_MULTIPLIER_1X2 and DC_MULTIPLIER_1X4 above, but // assume 2nd shift of 17 bits instead of 16. // Note: Strictly speaking, 2nd shift needs to be 17 only when: // - bit depth == 12, and // - bw + bh is divisible by 5 (as opposed to divisible by 3). // All other cases can use half the multipliers with a shift of 16 instead. // This special optimization can be used when writing assembly code. #define HIGHBD_DC_MULTIPLIER_1X2 0xAAAB // Note: This constant is odd, but a smaller even constant (0x199a) with the // appropriate shift should work for neon in 8/10-bit. 
#define HIGHBD_DC_MULTIPLIER_1X4 0x6667 #define HIGHBD_DC_SHIFT2 17 static INLINE void highbd_dc_predictor_rect(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int bd, uint32_t multiplier, int shift) { int i, r, expected_dc, sum = 0; int shift1, uint32_t multiplier) { int sum = 0; (void)bd; for (i = 0; i < bw; i++) { for (int i = 0; i < bw; i++) { sum += above[i]; } for (i = 0; i < bh; i++) { for (int i = 0; i < bh; i++) { sum += left[i]; } expected_dc = (int)(((uint64_t)sum * multiplier) >> shift); expected_dc = clip_pixel_highbd(expected_dc, bd); const int expected_dc = divide_using_multiply_shift( sum + ((bw + bh) >> 1), shift1, multiplier, HIGHBD_DC_SHIFT2); assert(expected_dc < (1 << bd)); for (r = 0; r < bh; r++) { for (int r = 0; r < bh; r++) { aom_memset16(dst, expected_dc, bw); dst += stride; } } #undef HIGHBD_DC_SHIFT2 void aom_highbd_dc_predictor_4x8_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { highbd_dc_predictor_rect(dst, stride, 4, 8, above, left, bd, DC_MULTIPLIER_1X2, 11); highbd_dc_predictor_rect(dst, stride, 4, 8, above, left, bd, 2, HIGHBD_DC_MULTIPLIER_1X2); } void aom_highbd_dc_predictor_8x4_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { highbd_dc_predictor_rect(dst, stride, 8, 4, above, left, bd, DC_MULTIPLIER_1X2, 11); highbd_dc_predictor_rect(dst, stride, 8, 4, above, left, bd, 2, HIGHBD_DC_MULTIPLIER_1X2); } void aom_highbd_dc_predictor_4x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { highbd_dc_predictor_rect(dst, stride, 4, 16, above, left, bd, DC_MULTIPLIER_1X4, 12); highbd_dc_predictor_rect(dst, stride, 4, 16, above, left, bd, 2, HIGHBD_DC_MULTIPLIER_1X4); } void aom_highbd_dc_predictor_16x4_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { highbd_dc_predictor_rect(dst, stride, 16, 4, above, left, bd, DC_MULTIPLIER_1X4, 
12); highbd_dc_predictor_rect(dst, stride, 16, 4, above, left, bd, 2, HIGHBD_DC_MULTIPLIER_1X4); } void aom_highbd_dc_predictor_8x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { highbd_dc_predictor_rect(dst, stride, 8, 16, above, left, bd, DC_MULTIPLIER_1X2, 12); highbd_dc_predictor_rect(dst, stride, 8, 16, above, left, bd, 3, HIGHBD_DC_MULTIPLIER_1X2); } void aom_highbd_dc_predictor_16x8_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { highbd_dc_predictor_rect(dst, stride, 16, 8, above, left, bd, DC_MULTIPLIER_1X2, 12); highbd_dc_predictor_rect(dst, stride, 16, 8, above, left, bd, 3, HIGHBD_DC_MULTIPLIER_1X2); } void aom_highbd_dc_predictor_8x32_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { highbd_dc_predictor_rect(dst, stride, 8, 32, above, left, bd, DC_MULTIPLIER_1X4, 13); highbd_dc_predictor_rect(dst, stride, 8, 32, above, left, bd, 3, HIGHBD_DC_MULTIPLIER_1X4); } void aom_highbd_dc_predictor_32x8_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { highbd_dc_predictor_rect(dst, stride, 32, 8, above, left, bd, DC_MULTIPLIER_1X4, 13); highbd_dc_predictor_rect(dst, stride, 32, 8, above, left, bd, 3, HIGHBD_DC_MULTIPLIER_1X4); } void aom_highbd_dc_predictor_16x32_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { highbd_dc_predictor_rect(dst, stride, 16, 32, above, left, bd, DC_MULTIPLIER_1X2, 13); highbd_dc_predictor_rect(dst, stride, 16, 32, above, left, bd, 4, HIGHBD_DC_MULTIPLIER_1X2); } void aom_highbd_dc_predictor_32x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { highbd_dc_predictor_rect(dst, stride, 32, 16, above, left, bd, DC_MULTIPLIER_1X2, 13); highbd_dc_predictor_rect(dst, stride, 32, 16, above, left, bd, 4, HIGHBD_DC_MULTIPLIER_1X2); } void aom_highbd_dc_predictor_16x64_c(uint16_t *dst, ptrdiff_t stride, 
const uint16_t *above, const uint16_t *left, int bd) { highbd_dc_predictor_rect(dst, stride, 16, 64, above, left, bd, DC_MULTIPLIER_1X4, 14); highbd_dc_predictor_rect(dst, stride, 16, 64, above, left, bd, 4, HIGHBD_DC_MULTIPLIER_1X4); } void aom_highbd_dc_predictor_64x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { highbd_dc_predictor_rect(dst, stride, 64, 16, above, left, bd, DC_MULTIPLIER_1X4, 14); highbd_dc_predictor_rect(dst, stride, 64, 16, above, left, bd, 4, HIGHBD_DC_MULTIPLIER_1X4); } void aom_highbd_dc_predictor_32x64_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { highbd_dc_predictor_rect(dst, stride, 32, 64, above, left, bd, DC_MULTIPLIER_1X2, 14); highbd_dc_predictor_rect(dst, stride, 32, 64, above, left, bd, 5, HIGHBD_DC_MULTIPLIER_1X2); } void aom_highbd_dc_predictor_64x32_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { highbd_dc_predictor_rect(dst, stride, 64, 32, above, left, bd, DC_MULTIPLIER_1X2, 14); highbd_dc_predictor_rect(dst, stride, 64, 32, above, left, bd, 5, HIGHBD_DC_MULTIPLIER_1X2); } #undef HIGHBD_DC_MULTIPLIER_1X2 #undef HIGHBD_DC_MULTIPLIER_1X4 // This serves as a wrapper function, so that all the prediction functions // can be unified and accessed as a pointer array. Note that the boundary // above and left are not necessarily used all the time. ... ...
 ... ... @@ -57,7 +57,7 @@ decode_to_md5() { decode_to_md5_av1() { # expected MD5 sum for the last frame. local expected_md5="085ee3045d9e5e6538853dd762b73512" local expected_md5="fc7565de847d04dc3485b4858c0ed298" local file="\${AV1_IVF_FILE}" # TODO(urvang): Check in the encoded file (like libvpx does) to avoid ... ...
 ... ... @@ -334,10 +334,10 @@ INSTANTIATE_TEST_CASE_P(AVX2_TO_C_12, HighbdIntraPredTest, #if HAVE_SSE2 const IntraPredFunc LowbdIntraPredTestVector[] = { lowbd_intrapred(d63e, sse2), lowbd_intrapred(d207e, sse2), lowbd_intrapred(dc_top, sse2), lowbd_intrapred(dc_left, sse2), lowbd_intrapred(dc_128, sse2), lowbd_intrapred(v, sse2), lowbd_intrapred(h, sse2), lowbd_intrapred(d63e, sse2), lowbd_intrapred(d207e, sse2), lowbd_intrapred(dc, sse2), lowbd_intrapred(dc_top, sse2), lowbd_intrapred(dc_left, sse2), lowbd_intrapred(dc_128, sse2), lowbd_intrapred(v, sse2), lowbd_intrapred(h, sse2), }; INSTANTIATE_TEST_CASE_P(SSE2, LowbdIntraPredTest, ... ... @@ -363,11 +363,11 @@ const IntraPredFunc LowbdIntraPredTestVectorAvx2[] = { lowbd_entry(dc, 32, 32, avx2), lowbd_entry(dc_top, 32, 32, avx2), lowbd_entry(dc_left, 32, 32, avx2), lowbd_entry(dc_128, 32, 32, avx2), lowbd_entry(v, 32, 32, avx2), lowbd_entry(h, 32, 32, avx2), lowbd_entry(dc_top, 32, 16, avx2), lowbd_entry(dc_left, 32, 16, avx2), lowbd_entry(dc_128, 32, 16, avx2), lowbd_entry(v, 32, 16, avx2), lowbd_entry(paeth, 16, 8, avx2), lowbd_entry(paeth, 16, 16, avx2), lowbd_entry(paeth, 16, 32, avx2), lowbd_entry(paeth, 32, 16, avx2), lowbd_entry(paeth, 32, 32, avx2), lowbd_entry(dc, 32, 16, avx2), lowbd_entry(dc_top, 32, 16, avx2), lowbd_entry(dc_left, 32, 16, avx2), lowbd_entry(dc_128, 32, 16, avx2), lowbd_entry(v, 32, 16, avx2), lowbd_entry(paeth, 16, 8, avx2), lowbd_entry(paeth, 16, 16, avx2), lowbd_entry(paeth, 16, 32, avx2), lowbd_entry(paeth, 32, 16, avx2), lowbd_entry(paeth, 32, 32, avx2), }; INSTANTIATE_TEST_CASE_P(AVX2, LowbdIntraPredTest, ... ...