From a77ec1c922d7d7995de9d58389a322d2224ff608 Mon Sep 17 00:00:00 2001
From: Debargha Mukherjee
Date: Wed, 24 May 2017 22:40:33 -0700
Subject: [PATCH] Change warp filter to use one less precision bit

Change-Id: Idc7bb686f5751b0457c9f21daac0fa6f4865fd22
---
 av1/common/warped_motion.c               | 53 ++++++++++++++----------
 av1/common/x86/highbd_warp_plane_ssse3.c | 14 +++++--
 av1/common/x86/warp_plane_sse2.c         | 15 +++++--
 av1/common/x86/warp_plane_ssse3.c        | 23 +++++-----
 4 files changed, 65 insertions(+), 40 deletions(-)

diff --git a/av1/common/warped_motion.c b/av1/common/warped_motion.c
index fb28b0446..be3ad49d9 100644
--- a/av1/common/warped_motion.c
+++ b/av1/common/warped_motion.c
@@ -701,12 +701,8 @@ static const uint16_t div_lut[DIV_LUT_NUM + 1] = {
   8240, 8224, 8208, 8192,
 };
 
-static INLINE int16_t saturate_int16(int32_t v) {
-  if (v > 32767)
-    return 32767;
-  else if (v < -32768)
-    return -32768;
-  return v;
+static INLINE uint16_t saturate_uint(int32_t v, int bits) {
+  return (uint16_t)clamp(v, 0, (1 << bits) - 1);
 }
 
 #if CONFIG_WARPED_MOTION
@@ -1028,14 +1024,18 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
       if (ix4 <= -7) {
         for (l = 0; l < 8; ++l) {
           tmp[(k + 7) * 8 + l] =
-              ref[iy * stride] *
-              (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
+              (1 << (bd + WARPEDPIXEL_FILTER_BITS -
+                     HORSHEAR_REDUCE_PREC_BITS - 1)) +
+              ref[iy * stride] * (1 << (WARPEDPIXEL_FILTER_BITS -
+                                        HORSHEAR_REDUCE_PREC_BITS));
         }
       } else if (ix4 >= width + 6) {
         for (l = 0; l < 8; ++l) {
-          tmp[(k + 7) * 8 + l] =
-              ref[iy * stride + (width - 1)] *
-              (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
+          tmp[(k + 7) * 8 + l] = (1 << (bd + WARPEDPIXEL_FILTER_BITS -
+                                        HORSHEAR_REDUCE_PREC_BITS - 1)) +
+                                 ref[iy * stride + (width - 1)] *
+                                     (1 << (WARPEDPIXEL_FILTER_BITS -
+                                            HORSHEAR_REDUCE_PREC_BITS));
         }
       } else {
         int sx = sx4 + beta * (k + 4);
@@ -1045,14 +1045,16 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
           const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
                            WARPEDPIXEL_PREC_SHIFTS;
           const int16_t *coeffs = warped_filter[offs];
-          int32_t sum = 0;
+          int32_t sum = 1 << (bd + WARPEDPIXEL_FILTER_BITS - 1);
           // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
           for (m = 0; m < 8; ++m) {
             sum += ref[iy * stride + ix + m] * coeffs[m];
           }
           sum = ROUND_POWER_OF_TWO(sum, HORSHEAR_REDUCE_PREC_BITS);
 #if HORSHEAR_REDUCE_PREC_BITS >= 5
-          tmp[(k + 7) * 8 + (l + 4)] = saturate_int16(sum);
+          tmp[(k + 7) * 8 + (l + 4)] =
+              saturate_uint(sum, bd + WARPEDPIXEL_FILTER_BITS -
+                                     HORSHEAR_REDUCE_PREC_BITS + 1);
 #else
           tmp[(k + 7) * 8 + (l + 4)] = sum;
 #endif
@@ -1070,7 +1072,7 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
           const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                            WARPEDPIXEL_PREC_SHIFTS;
           const int16_t *coeffs = warped_filter[offs];
-          int32_t sum = 0;
+          int32_t sum = -(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1));
           // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
           for (m = 0; m < 8; ++m) {
             sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
@@ -1232,6 +1234,7 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
                        int16_t delta) {
   int16_t tmp[15 * 8];
   int i, j, k, l, m;
+  const int bd = 8;
 
   /* Note: For this code to work, the left/right frame borders need to be
      extended by at least 13 pixels each. By the time we get here, other
@@ -1288,8 +1291,10 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
         // (once border extension is taken into account)
         for (l = 0; l < 8; ++l) {
           tmp[(k + 7) * 8 + l] =
-              ref[iy * stride] *
-              (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
+              (1 << (bd + WARPEDPIXEL_FILTER_BITS -
+                     HORSHEAR_REDUCE_PREC_BITS - 1)) +
+              ref[iy * stride] * (1 << (WARPEDPIXEL_FILTER_BITS -
+                                        HORSHEAR_REDUCE_PREC_BITS));
         }
       } else if (ix4 >= width + 6) {
         // In this case, the leftmost pixel sampled is in column
@@ -1297,9 +1302,11 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
         // will sample only from the rightmost column
         // (once border extension is taken into account)
         for (l = 0; l < 8; ++l) {
-          tmp[(k + 7) * 8 + l] =
-              ref[iy * stride + (width - 1)] *
-              (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
+          tmp[(k + 7) * 8 + l] = (1 << (bd + WARPEDPIXEL_FILTER_BITS -
+                                        HORSHEAR_REDUCE_PREC_BITS - 1)) +
+                                 ref[iy * stride + (width - 1)] *
+                                     (1 << (WARPEDPIXEL_FILTER_BITS -
+                                            HORSHEAR_REDUCE_PREC_BITS));
         }
       } else {
         // If we get here, then
@@ -1317,13 +1324,15 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
           const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
                            WARPEDPIXEL_PREC_SHIFTS;
           const int16_t *coeffs = warped_filter[offs];
-          int32_t sum = 0;
+          int32_t sum = 1 << (bd + WARPEDPIXEL_FILTER_BITS - 1);
           // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
           for (m = 0; m < 8; ++m) {
             sum += ref[iy * stride + ix + m] * coeffs[m];
           }
           sum = ROUND_POWER_OF_TWO(sum, HORSHEAR_REDUCE_PREC_BITS);
-          tmp[(k + 7) * 8 + (l + 4)] = saturate_int16(sum);
+          tmp[(k + 7) * 8 + (l + 4)] =
+              saturate_uint(sum, bd + WARPEDPIXEL_FILTER_BITS -
+                                     HORSHEAR_REDUCE_PREC_BITS + 1);
           sx += alpha;
         }
       }
@@ -1339,7 +1348,7 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
           const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                            WARPEDPIXEL_PREC_SHIFTS;
           const int16_t *coeffs = warped_filter[offs];
-          int32_t sum = 0;
+          int32_t sum = -(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1));
           // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
           for (m = 0; m < 8; ++m) {
             sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
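
The C reference change above rests on one identity. The horizontal pass now adds a bias of 1 << (bd + WARPEDPIXEL_FILTER_BITS - 1) to each sum so that the stored intermediate is non-negative (hence saturate_uint() replacing saturate_int16()), and because the vertical filter taps sum to 1 << WARPEDPIXEL_FILTER_BITS, the bias carried into the vertical pass is exactly (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS - 1)) * (1 << WARPEDPIXEL_FILTER_BITS) = 1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1), which the new negative accumulator initializer cancels. A minimal scalar sketch of that round trip, assuming the usual constants (WARPEDPIXEL_FILTER_BITS = 7, HORSHEAR_REDUCE_PREC_BITS = 5, VERSHEAR_REDUCE_PREC_BITS = 2 * 7 - 5 = 9) and made-up taps rather than libaom's filter tables:

#include <assert.h>
#include <stdint.h>

enum {
  FILTER_BITS = 7,   /* WARPEDPIXEL_FILTER_BITS */
  HORSHEAR_BITS = 5, /* HORSHEAR_REDUCE_PREC_BITS */
  VERSHEAR_BITS = 2 * FILTER_BITS - HORSHEAR_BITS /* = 9 */
};

static int32_t round_pow2(int32_t v, int n) {
  return (v + (1 << (n - 1))) >> n;
}

int main(void) {
  const int bd = 8;
  /* Made-up taps; like the real warp filters, each set sums to
     1 << FILTER_BITS. */
  const int16_t hf[8] = { -6, 12, -24, 120, 40, -18, 8, -4 };
  const int16_t vf[8] = { 0, 0, 0, 128, 0, 0, 0, 0 };
  for (int p = 0; p < (1 << bd); ++p) { /* constant source region */
    /* Horizontal pass: the bias keeps the intermediate non-negative. */
    int32_t hsum = 1 << (bd + FILTER_BITS - 1);
    for (int m = 0; m < 8; ++m) hsum += p * hf[m];
    const uint16_t tmp = (uint16_t)round_pow2(hsum, HORSHEAR_BITS);
    /* Vertical pass: the carried bias is
       (1 << (bd + FILTER_BITS - HORSHEAR_BITS - 1)) * (1 << FILTER_BITS)
       = 1 << (bd + VERSHEAR_BITS - 1); starting the accumulator at minus
       that value cancels it exactly. */
    int32_t vsum = -(1 << (bd + VERSHEAR_BITS - 1));
    for (int m = 0; m < 8; ++m) vsum += tmp * vf[m];
    assert(round_pow2(vsum, VERSHEAR_BITS) == p); /* unity gain: in == out */
  }
  return 0;
}
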
diff --git a/av1/common/x86/highbd_warp_plane_ssse3.c b/av1/common/x86/highbd_warp_plane_ssse3.c
index 51f67f731..eac7cafbd 100644
--- a/av1/common/x86/highbd_warp_plane_ssse3.c
+++ b/av1/common/x86/highbd_warp_plane_ssse3.c
@@ -89,8 +89,10 @@ void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref,
         else if (iy > height - 1)
           iy = height - 1;
         tmp[k + 7] = _mm_set1_epi16(
+            (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
+                   1)) +
             ref[iy * stride] *
-            (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+                (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
       }
     } else if (ix4 >= width + 6) {
       for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
@@ -100,8 +102,10 @@ void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref,
         else if (iy > height - 1)
           iy = height - 1;
         tmp[k + 7] = _mm_set1_epi16(
+            (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
+                   1)) +
             ref[iy * stride + (width - 1)] *
-            (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+                (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
       }
     } else {
       for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
@@ -151,7 +155,8 @@ void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref,
       const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
 
       const __m128i round_const =
-          _mm_set1_epi32((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1);
+          _mm_set1_epi32((1 << (bd + WARPEDPIXEL_FILTER_BITS - 1)) +
+                         ((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1));
 
       // Calculate filtered results
       const __m128i res_0 = _mm_madd_epi16(src, coeff_0);
@@ -299,7 +304,8 @@ void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref,
 
       // Round and pack into 8 bits
       const __m128i round_const =
-          _mm_set1_epi32((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1);
+          _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
+                         ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1));
 
       const __m128i res_lo_round = _mm_srai_epi32(
           _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
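
Rather than biasing each madd accumulator, the SIMD paths fold the same terms into the rounding constants: the horizontal round_const gains +(1 << (bd + WARPEDPIXEL_FILTER_BITS - 1)) and the vertical round_const gains the matching negative bias. A throwaway sketch that prints the resulting constants for the supported bit depths (same assumed constants as the sketch above; illustrative only):

#include <stdio.h>

int main(void) {
  /* WARPEDPIXEL_FILTER_BITS = 7, HORSHEAR_REDUCE_PREC_BITS = 5,
     VERSHEAR_REDUCE_PREC_BITS = 9 assumed, as above. */
  for (int bd = 8; bd <= 12; bd += 2) {
    const int hround = (1 << (bd + 7 - 1)) + ((1 << 5) >> 1);
    const int vround = -(1 << (bd + 9 - 1)) + ((1 << 9) >> 1);
    /* The biased horizontal result needs bd + 7 - 5 + 1 = bd + 3 bits,
       so even 12-bit input (15 bits) fits an unsigned 16-bit lane; that
       is the bound saturate_uint() enforces in the C version. */
    printf("bd=%2d  horizontal=%7d  vertical=%9d  tmp bits=%d\n", bd,
           hround, vround, bd + 3);
  }
  return 0;
}
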
diff --git a/av1/common/x86/warp_plane_sse2.c b/av1/common/x86/warp_plane_sse2.c
index 09e72da15..674a77fd8 100644
--- a/av1/common/x86/warp_plane_sse2.c
+++ b/av1/common/x86/warp_plane_sse2.c
@@ -23,6 +23,7 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,
                           int16_t delta) {
   __m128i tmp[15];
   int i, j, k;
+  const int bd = 8;
 
   /* Note: For this code to work, the left/right frame borders need to be
      extended by at least 13 pixels each. By the time we get here, other
@@ -84,8 +85,10 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,
         else if (iy > height - 1)
           iy = height - 1;
         tmp[k + 7] = _mm_set1_epi16(
+            (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
+                   1)) +
             ref[iy * stride] *
-            (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+                (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
       }
     } else if (ix4 >= width + 6) {
       for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
@@ -95,8 +98,10 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,
         else if (iy > height - 1)
           iy = height - 1;
         tmp[k + 7] = _mm_set1_epi16(
+            (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
+                   1)) +
             ref[iy * stride + (width - 1)] *
-            (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+                (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
       }
     } else {
       for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
@@ -145,7 +150,8 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,
       const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
 
       const __m128i round_const =
-          _mm_set1_epi32((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1);
+          _mm_set1_epi32((1 << (bd + WARPEDPIXEL_FILTER_BITS - 1)) +
+                         ((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1));
 
       // Calculate filtered results
       const __m128i src_0 = _mm_unpacklo_epi8(src, zero);
@@ -294,7 +300,8 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,
 
       // Round and pack into 8 bits
      const __m128i round_const =
-          _mm_set1_epi32((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1);
+          _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
+                         ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1));
 
      const __m128i res_lo_round = _mm_srai_epi32(
          _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
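
One detail shared by the C and SIMD versions: the out-of-frame fast paths (ix4 <= -7 and ix4 >= width + 6) store the horizontally filtered value of a constant row in closed form, and the new leading term is just the bias already reduced by HORSHEAR_REDUCE_PREC_BITS. No rounding error hides there, because the bias is a multiple of 1 << HORSHEAR_REDUCE_PREC_BITS. A self-contained check under the same assumed constants (bd = 8, matching the 8-bit paths):

#include <assert.h>
#include <stdint.h>

int main(void) {
  const int bd = 8, FB = 7, HB = 5; /* assumed constants, as above */
  for (int p = 0; p < (1 << bd); ++p) {
    /* Full biased horizontal filter applied to a constant row of p
       (any taps summing to 1 << FB give the same sum). */
    const int32_t sum = (1 << (bd + FB - 1)) + p * (1 << FB);
    const int32_t filtered = (sum + ((1 << HB) >> 1)) >> HB;
    /* Closed form written by the border branches in the patch. */
    const int32_t fast = (1 << (bd + FB - HB - 1)) + p * (1 << (FB - HB));
    assert(filtered == fast); /* 512 + 4 * p when bd == 8 */
  }
  return 0;
}
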
diff --git a/av1/common/x86/warp_plane_ssse3.c b/av1/common/x86/warp_plane_ssse3.c
index 37f7c4c44..39a1f71f5 100644
--- a/av1/common/x86/warp_plane_ssse3.c
+++ b/av1/common/x86/warp_plane_ssse3.c
@@ -210,6 +210,7 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
                            int16_t delta) {
   __m128i tmp[15];
   int i, j, k;
+  const int bd = 8;
 
   /* Note: For this code to work, the left/right frame borders need to be
      extended by at least 13 pixels each. By the time we get here, other
@@ -271,8 +272,10 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
         else if (iy > height - 1)
           iy = height - 1;
         tmp[k + 7] = _mm_set1_epi16(
+            (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
+                   1)) +
             ref[iy * stride] *
-            (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+                (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
       }
     } else if (ix4 >= width + 6) {
       for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
@@ -282,8 +285,10 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
         else if (iy > height - 1)
           iy = height - 1;
         tmp[k + 7] = _mm_set1_epi16(
+            (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
+                   1)) +
             ref[iy * stride + (width - 1)] *
-            (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
+                (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
       }
     } else {
       for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
@@ -365,7 +370,8 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
       const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff_57);
 
       const __m128i round_const =
-          _mm_set1_epi16((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1);
+          _mm_set1_epi16((1 << (bd + WARPEDPIXEL_FILTER_BITS - 1)) +
+                         ((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1));
 
       // Note: res_02 + res_46 and res_13 + res_57 are always in the range
       // [-6120, 32640]. This gives us enough room to add the rounding
@@ -374,12 +380,8 @@
       const __m128i res_a =
           _mm_add_epi16(_mm_add_epi16(res_02, res_46), round_const);
       const __m128i res_b = _mm_add_epi16(res_13, res_57);
-      // Calculate (res_a + res_b) >> 1 while avoiding overflow
-      const __m128i t1 = _mm_and_si128(res_a, res_b);
-      const __m128i t2 = _mm_srai_epi16(_mm_xor_si128(res_a, res_b), 1);
-
-      const __m128i res = _mm_srai_epi16(_mm_add_epi16(t1, t2),
-                                         HORSHEAR_REDUCE_PREC_BITS - 1);
+      const __m128i res = _mm_srli_epi16(_mm_add_epi16(res_a, res_b),
+                                         HORSHEAR_REDUCE_PREC_BITS);
       tmp[k + 7] = res;
     }
   }
@@ -471,7 +473,8 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
 
       // Round and pack into 8 bits
      const __m128i round_const =
-          _mm_set1_epi32((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1);
+          _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
+                         ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1));
 
      const __m128i res_lo_round = _mm_srai_epi32(
          _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
-- 
GitLab
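
The res_a/res_b hunk in warp_plane_ssse3.c above is where the bias pays off in the 8-bit path: the true value of res_a + res_b is now designed to be non-negative and below 2^16, so the old overflow-avoiding average, (a & b) + ((a ^ b) >> 1) followed by an arithmetic shift by HORSHEAR_REDUCE_PREC_BITS - 1, can become a single wrapping 16-bit add plus a logical shift. Even when the addition overflows as a signed value, the low 16 bits still hold the true sum. A scalar model of the per-lane behaviour (a sketch; the uint16_t arithmetic mirrors what _mm_add_epi16 and _mm_srli_epi16 do):

#include <assert.h>
#include <stdint.h>

int main(void) {
  const int HB = 5; /* HORSHEAR_REDUCE_PREC_BITS */
  /* For every mathematically possible sum in [0, 65535], split it into
     two lane values the way res_a/res_b could produce it (res_b stays
     within [-6120, 32640] per the comment in the patch) and check that
     the wrapping add plus logical shift recovers sum >> HB. */
  for (int32_t sum = 0; sum <= 65535; ++sum) {
    const int32_t b = (sum % 38761) - 6120; /* in [-6120, 32640] */
    const int32_t a = sum - b;              /* may not fit in int16_t */
    const uint16_t a_lane = (uint16_t)a;    /* lane contents after wrap */
    const uint16_t b_lane = (uint16_t)b;
    const uint16_t s = (uint16_t)(a_lane + b_lane); /* _mm_add_epi16 */
    assert((s >> HB) == (sum >> HB));               /* _mm_srli_epi16 */
  }
  return 0;
}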