Commit a77ec1c9 authored by Debargha Mukherjee

Change warp filter to use one less precision bit

Change-Id: Idc7bb686f5751b0457c9f21daac0fa6f4865fd22
parent 8feaaac8
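In outline, the change moves the warp filter's intermediate values from a signed to a biased unsigned representation: the horizontal ("HORSHEAR") pass adds an offset of 1 << (bd + WARPEDPIXEL_FILTER_BITS - 1) to each accumulator so the rounded result is non-negative, and the vertical ("VERSHEAR") pass starts its accumulator at -(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) so the carried bias cancels before the final rounding shift. Below is a minimal standalone sketch (not part of the commit) that checks this cancellation, assuming the libaom constants WARPEDPIXEL_FILTER_BITS = 7, HORSHEAR_REDUCE_PREC_BITS = 5, VERSHEAR_REDUCE_PREC_BITS = 9, and warped_filter taps that sum to 1 << WARPEDPIXEL_FILTER_BITS:

#include <assert.h>
#include <stdint.h>

#define WARPEDPIXEL_FILTER_BITS 7
#define HORSHEAR_REDUCE_PREC_BITS 5
#define VERSHEAR_REDUCE_PREC_BITS 9

static void check_offset_cancels(int bd) {
  // Offset added to the horizontal accumulator before rounding.
  const int64_t hor_offset = (int64_t)1 << (bd + WARPEDPIXEL_FILTER_BITS - 1);
  // Bias carried by each tmp[] value after the horizontal rounding shift
  // (exact, since hor_offset is a multiple of 1 << HORSHEAR_REDUCE_PREC_BITS).
  const int64_t tmp_bias = hor_offset >> HORSHEAR_REDUCE_PREC_BITS;
  // The vertical taps sum to 1 << WARPEDPIXEL_FILTER_BITS, so the bias
  // entering the vertical accumulator is tmp_bias scaled by that sum...
  const int64_t ver_bias = tmp_bias << WARPEDPIXEL_FILTER_BITS;
  // ...which must equal the constant the vertical pass now subtracts.
  assert(ver_bias == (int64_t)1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1));
}

int main(void) {
  check_offset_cancels(8);   // av1_warp_affine_c and the SSE2/SSSE3 paths
  check_offset_cancels(10);  // high bit-depth paths
  check_offset_cancels(12);
  return 0;
}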
@@ -701,12 +701,8 @@ static const uint16_t div_lut[DIV_LUT_NUM + 1] = {
   8240, 8224, 8208, 8192,
 };
 
-static INLINE int16_t saturate_int16(int32_t v) {
-  if (v > 32767)
-    return 32767;
-  else if (v < -32768)
-    return -32768;
-  return v;
+static INLINE uint16_t saturate_uint(int32_t v, int bits) {
+  return (uint16_t)clamp(v, 0, (1 << bits) - 1);
 }
 
 #if CONFIG_WARPED_MOTION
@@ -1028,14 +1024,18 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
       if (ix4 <= -7) {
         for (l = 0; l < 8; ++l) {
           tmp[(k + 7) * 8 + l] =
-              ref[iy * stride] *
-              (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
+              (1 << (bd + WARPEDPIXEL_FILTER_BITS -
+                     HORSHEAR_REDUCE_PREC_BITS - 1)) +
+              ref[iy * stride] * (1 << (WARPEDPIXEL_FILTER_BITS -
+                                        HORSHEAR_REDUCE_PREC_BITS));
         }
       } else if (ix4 >= width + 6) {
         for (l = 0; l < 8; ++l) {
-          tmp[(k + 7) * 8 + l] =
-              ref[iy * stride + (width - 1)] *
-              (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
+          tmp[(k + 7) * 8 + l] = (1 << (bd + WARPEDPIXEL_FILTER_BITS -
+                                        HORSHEAR_REDUCE_PREC_BITS - 1)) +
+                                 ref[iy * stride + (width - 1)] *
+                                     (1 << (WARPEDPIXEL_FILTER_BITS -
+                                            HORSHEAR_REDUCE_PREC_BITS));
         }
       } else {
         int sx = sx4 + beta * (k + 4);
@@ -1045,14 +1045,16 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
           const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
                            WARPEDPIXEL_PREC_SHIFTS;
           const int16_t *coeffs = warped_filter[offs];
-          int32_t sum = 0;
+          int32_t sum = 1 << (bd + WARPEDPIXEL_FILTER_BITS - 1);
           // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
           for (m = 0; m < 8; ++m) {
             sum += ref[iy * stride + ix + m] * coeffs[m];
           }
           sum = ROUND_POWER_OF_TWO(sum, HORSHEAR_REDUCE_PREC_BITS);
 #if HORSHEAR_REDUCE_PREC_BITS >= 5
-          tmp[(k + 7) * 8 + (l + 4)] = saturate_int16(sum);
+          tmp[(k + 7) * 8 + (l + 4)] =
+              saturate_uint(sum, bd + WARPEDPIXEL_FILTER_BITS -
+                                     HORSHEAR_REDUCE_PREC_BITS + 1);
 #else
           tmp[(k + 7) * 8 + (l + 4)] = sum;
 #endif
@@ -1070,7 +1072,7 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
           const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                            WARPEDPIXEL_PREC_SHIFTS;
           const int16_t *coeffs = warped_filter[offs];
-          int32_t sum = 0;
+          int32_t sum = -(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1));
           // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
           for (m = 0; m < 8; ++m) {
             sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
@@ -1232,6 +1234,7 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
                        int16_t delta) {
   int16_t tmp[15 * 8];
   int i, j, k, l, m;
+  const int bd = 8;
 
   /* Note: For this code to work, the left/right frame borders need to be
      extended by at least 13 pixels each. By the time we get here, other
@@ -1288,8 +1291,10 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
         // (once border extension is taken into account)
         for (l = 0; l < 8; ++l) {
           tmp[(k + 7) * 8 + l] =
-              ref[iy * stride] *
-              (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
+              (1 << (bd + WARPEDPIXEL_FILTER_BITS -
+                     HORSHEAR_REDUCE_PREC_BITS - 1)) +
+              ref[iy * stride] * (1 << (WARPEDPIXEL_FILTER_BITS -
+                                        HORSHEAR_REDUCE_PREC_BITS));
         }
       } else if (ix4 >= width + 6) {
         // In this case, the leftmost pixel sampled is in column
@@ -1297,9 +1302,11 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
         // will sample only from the rightmost column
         // (once border extension is taken into account)
         for (l = 0; l < 8; ++l) {
-          tmp[(k + 7) * 8 + l] =
-              ref[iy * stride + (width - 1)] *
-              (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
+          tmp[(k + 7) * 8 + l] = (1 << (bd + WARPEDPIXEL_FILTER_BITS -
+                                        HORSHEAR_REDUCE_PREC_BITS - 1)) +
+                                 ref[iy * stride + (width - 1)] *
+                                     (1 << (WARPEDPIXEL_FILTER_BITS -
+                                            HORSHEAR_REDUCE_PREC_BITS));
         }
       } else {
         // If we get here, then
@@ -1317,13 +1324,15 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
           const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
                            WARPEDPIXEL_PREC_SHIFTS;
           const int16_t *coeffs = warped_filter[offs];
-          int32_t sum = 0;
+          int32_t sum = 1 << (bd + WARPEDPIXEL_FILTER_BITS - 1);
           // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
           for (m = 0; m < 8; ++m) {
             sum += ref[iy * stride + ix + m] * coeffs[m];
           }
           sum = ROUND_POWER_OF_TWO(sum, HORSHEAR_REDUCE_PREC_BITS);
-          tmp[(k + 7) * 8 + (l + 4)] = saturate_int16(sum);
+          tmp[(k + 7) * 8 + (l + 4)] =
+              saturate_uint(sum, bd + WARPEDPIXEL_FILTER_BITS -
+                                     HORSHEAR_REDUCE_PREC_BITS + 1);
           sx += alpha;
         }
       }
@@ -1339,7 +1348,7 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
           const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                            WARPEDPIXEL_PREC_SHIFTS;
           const int16_t *coeffs = warped_filter[offs];
-          int32_t sum = 0;
+          int32_t sum = -(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1));
           // assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
           for (m = 0; m < 8; ++m) {
             sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
...
@@ -89,8 +89,10 @@ void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref,
         else if (iy > height - 1)
           iy = height - 1;
         tmp[k + 7] = _mm_set1_epi16(
+            (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
+                   1)) +
             ref[iy * stride] *
                 (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
       }
     } else if (ix4 >= width + 6) {
       for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
@@ -100,8 +102,10 @@ void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref,
         else if (iy > height - 1)
           iy = height - 1;
         tmp[k + 7] = _mm_set1_epi16(
+            (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
+                   1)) +
             ref[iy * stride + (width - 1)] *
                 (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
       }
     } else {
       for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
@@ -151,7 +155,8 @@ void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref,
       const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
 
       const __m128i round_const =
-          _mm_set1_epi32((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1);
+          _mm_set1_epi32((1 << (bd + WARPEDPIXEL_FILTER_BITS - 1)) +
+                         ((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1));
 
       // Calculate filtered results
       const __m128i res_0 = _mm_madd_epi16(src, coeff_0);
@@ -299,7 +304,8 @@ void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref,
       // Round and pack into 8 bits
       const __m128i round_const =
-          _mm_set1_epi32((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1);
+          _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
+                         ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1));
 
       const __m128i res_lo_round = _mm_srai_epi32(
           _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
...
@@ -23,6 +23,7 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,
                           int16_t delta) {
   __m128i tmp[15];
   int i, j, k;
+  const int bd = 8;
 
   /* Note: For this code to work, the left/right frame borders need to be
      extended by at least 13 pixels each. By the time we get here, other
@@ -84,8 +85,10 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,
         else if (iy > height - 1)
           iy = height - 1;
         tmp[k + 7] = _mm_set1_epi16(
+            (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
+                   1)) +
             ref[iy * stride] *
                 (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
       }
     } else if (ix4 >= width + 6) {
       for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
@@ -95,8 +98,10 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,
         else if (iy > height - 1)
           iy = height - 1;
         tmp[k + 7] = _mm_set1_epi16(
+            (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
+                   1)) +
             ref[iy * stride + (width - 1)] *
                 (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
       }
     } else {
       for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
@@ -145,7 +150,8 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,
       const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
 
       const __m128i round_const =
-          _mm_set1_epi32((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1);
+          _mm_set1_epi32((1 << (bd + WARPEDPIXEL_FILTER_BITS - 1)) +
+                         ((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1));
 
       // Calculate filtered results
       const __m128i src_0 = _mm_unpacklo_epi8(src, zero);
@@ -294,7 +300,8 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,
       // Round and pack into 8 bits
       const __m128i round_const =
-          _mm_set1_epi32((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1);
+          _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
+                         ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1));
 
       const __m128i res_lo_round = _mm_srai_epi32(
           _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
...
@@ -210,6 +210,7 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
                            int16_t delta) {
   __m128i tmp[15];
   int i, j, k;
+  const int bd = 8;
 
   /* Note: For this code to work, the left/right frame borders need to be
      extended by at least 13 pixels each. By the time we get here, other
@@ -271,8 +272,10 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
         else if (iy > height - 1)
           iy = height - 1;
         tmp[k + 7] = _mm_set1_epi16(
+            (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
+                   1)) +
             ref[iy * stride] *
                 (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
       }
     } else if (ix4 >= width + 6) {
       for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
@@ -282,8 +285,10 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
         else if (iy > height - 1)
           iy = height - 1;
         tmp[k + 7] = _mm_set1_epi16(
+            (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
+                   1)) +
             ref[iy * stride + (width - 1)] *
                 (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
       }
     } else {
       for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
@@ -365,7 +370,8 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
       const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff_57);
 
       const __m128i round_const =
-          _mm_set1_epi16((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1);
+          _mm_set1_epi16((1 << (bd + WARPEDPIXEL_FILTER_BITS - 1)) +
+                         ((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1));
 
       // Note: res_02 + res_46 and res_13 + res_57 are always in the range
       // [-6120, 32640]. This gives us enough room to add the rounding
@@ -374,12 +380,8 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
           _mm_add_epi16(_mm_add_epi16(res_02, res_46), round_const);
       const __m128i res_b = _mm_add_epi16(res_13, res_57);
 
-      // Calculate (res_a + res_b) >> 1 while avoiding overflow
-      const __m128i t1 = _mm_and_si128(res_a, res_b);
-      const __m128i t2 = _mm_srai_epi16(_mm_xor_si128(res_a, res_b), 1);
-      const __m128i res = _mm_srai_epi16(_mm_add_epi16(t1, t2),
-                                         HORSHEAR_REDUCE_PREC_BITS - 1);
+      const __m128i res = _mm_srli_epi16(_mm_add_epi16(res_a, res_b),
+                                         HORSHEAR_REDUCE_PREC_BITS);
       tmp[k + 7] = res;
     }
   }
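A note on why the and/xor averaging trick could go: with round_const now carrying the 1 << (bd + WARPEDPIXEL_FILTER_BITS - 1) offset, the mathematically exact value of res_a + res_b is non-negative and, for the actual warped_filter taps, fits in 16 unsigned bits, so the wrapping 16-bit add produces the correct residue and a logical (rather than arithmetic) right shift yields the rounded result. A hypothetical scalar model of those two SIMD lines, reusing the constants from the sketch above:

// Hypothetical scalar equivalent (not in the patch). Even if res_a + res_b
// overflows int16_t, the true sum lies in [0, 65535], so the wrapped 16-bit
// sum equals it exactly and a logical shift performs the rounding.
static uint16_t hor_round_scalar(int16_t res_a, int16_t res_b) {
  const uint16_t wrapped = (uint16_t)((uint16_t)res_a + (uint16_t)res_b);
  return (uint16_t)(wrapped >> HORSHEAR_REDUCE_PREC_BITS);
}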
@@ -471,7 +473,8 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
       // Round and pack into 8 bits
       const __m128i round_const =
-          _mm_set1_epi32((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1);
+          _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
+                         ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1));
 
       const __m128i res_lo_round = _mm_srai_epi32(
           _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
...