Commit 1d18460f authored by Debargha Mukherjee

Reduce precision bits between shears

Change-Id: I89e981c9396c7a1ba8051d65036a16692da94d0d
parent 0d749d60
@@ -53,6 +53,11 @@ typedef struct mv32 {
// Precision of filter taps
#define WARPEDPIXEL_FILTER_BITS 7
+ // Precision bits reduction after horizontal shear
+ #define HORSHEAR_REDUCE_PREC_BITS 5
+ #define VERSHEAR_REDUCE_PREC_BITS \
+   (2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)
#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
/* clang-format off */
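Context on the defines above: the combined precision dropped across the two shears is unchanged, since HORSHEAR_REDUCE_PREC_BITS + VERSHEAR_REDUCE_PREC_BITS = 5 + 9 = 2 * WARPEDPIXEL_FILTER_BITS = 14 bits. The change only moves part of the reduction to the horizontal stage, so the intermediate values written to tmp[] after the horizontal shear are 5 bits smaller (and can be held as 16-bit values, cf. saturate_int16 and _mm_set1_epi16 below). A minimal standalone check of that bit budget, using the values from this hunk; the check itself is illustrative, not from the patch:

/* Bit-budget sketch for the defines above. */
#include <assert.h>

#define WARPEDPIXEL_FILTER_BITS 7
#define HORSHEAR_REDUCE_PREC_BITS 5
#define VERSHEAR_REDUCE_PREC_BITS \
  (2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)

int main(void) {
  /* 5 bits are dropped after the horizontal shear, the remaining 9 after
   * the vertical shear, so the overall output scale is the same as before. */
  assert(VERSHEAR_REDUCE_PREC_BITS == 9);
  assert(HORSHEAR_REDUCE_PREC_BITS + VERSHEAR_REDUCE_PREC_BITS ==
         2 * WARPEDPIXEL_FILTER_BITS);
  return 0;
}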
@@ -916,6 +916,7 @@ static void highbd_warp_plane(WarpedMotionParams *wm, uint8_t *ref8, int width,
else
sum += ref[iy * stride + ix + m - 3] * coeffs[m];
}
+ sum = ROUND_POWER_OF_TWO(sum, HORSHEAR_REDUCE_PREC_BITS);
tmp[(k + 7) * 8 + (l + 4)] = sum;
}
}
@@ -933,8 +934,7 @@ static void highbd_warp_plane(WarpedMotionParams *wm, uint8_t *ref8, int width,
sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
}
sum = clip_pixel_highbd(
- ROUND_POWER_OF_TWO_SIGNED(sum, 2 * WARPEDPIXEL_FILTER_BITS),
- bd);
+ ROUND_POWER_OF_TWO_SIGNED(sum, VERSHEAR_REDUCE_PREC_BITS), bd);
if (ref_frm)
*p = ROUND_POWER_OF_TWO_SIGNED(*p + sum, 1);
else
@@ -1116,7 +1116,8 @@ void av1_warp_affine_c(int32_t *mat, uint8_t *ref, int width, int height,
// (once border extension is taken into account)
for (l = 0; l < 8; ++l) {
tmp[(k + 7) * 8 + l] =
- ref[iy * stride] * (1 << WARPEDPIXEL_FILTER_BITS);
+ ref[iy * stride] *
+ (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
}
} else if (ix4 >= width + 6) {
// In this case, the leftmost pixel sampled is in column
@@ -1125,7 +1126,8 @@ void av1_warp_affine_c(int32_t *mat, uint8_t *ref, int width, int height,
// (once border extension is taken into account)
for (l = 0; l < 8; ++l) {
tmp[(k + 7) * 8 + l] =
- ref[iy * stride + (width - 1)] * (1 << WARPEDPIXEL_FILTER_BITS);
+ ref[iy * stride + (width - 1)] *
+ (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
}
} else {
// If we get here, then
@@ -1148,6 +1150,7 @@ void av1_warp_affine_c(int32_t *mat, uint8_t *ref, int width, int height,
for (m = 0; m < 8; ++m) {
sum += ref[iy * stride + ix + m] * coeffs[m];
}
+ sum = ROUND_POWER_OF_TWO(sum, HORSHEAR_REDUCE_PREC_BITS);
tmp[(k + 7) * 8 + (l + 4)] = saturate_int16(sum);
sx += alpha;
}
@@ -1169,8 +1172,7 @@ void av1_warp_affine_c(int32_t *mat, uint8_t *ref, int width, int height,
for (m = 0; m < 8; ++m) {
sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
}
- sum =
- clip_pixel(ROUND_POWER_OF_TWO(sum, 2 * WARPEDPIXEL_FILTER_BITS));
+ sum = clip_pixel(ROUND_POWER_OF_TWO(sum, VERSHEAR_REDUCE_PREC_BITS));
if (ref_frm)
*p = ROUND_POWER_OF_TWO(*p + sum, 1);
else
@@ -1199,7 +1201,6 @@ static void warp_plane(WarpedMotionParams *wm, uint8_t *ref, int width,
const int32_t gamma = wm->gamma;
const int32_t delta = wm->delta;
// printf("%d %d %d %d\n", mat[2], mat[3], mat[4], mat[5]);
av1_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
p_width, p_height, p_stride, subsampling_x, subsampling_y,
ref_frm, alpha, beta, gamma, delta);
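Summary of the scalar changes above, plus a rough sketch: the horizontal 8-tap filter output is now rounded by HORSHEAR_REDUCE_PREC_BITS before being stored in tmp[] (saturated to int16 in the low-bitdepth path), and the vertical 8-tap filter output is rounded by the remaining VERSHEAR_REDUCE_PREC_BITS instead of the full 2 * WARPEDPIXEL_FILTER_BITS. Consistently, the constant-column fallbacks fill tmp[] with ref * (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)), which is what a flat row would produce after the new horizontal rounding, given that the interpolation taps sum to 1 << WARPEDPIXEL_FILTER_BITS. The sketch below is illustrative only: round_power_of_two() stands in for the library's ROUND_POWER_OF_TWO, the coefficient arrays stand in for the real filter tables, the column is assumed contiguous, and clipping to the pixel range is omitted.

#include <stdint.h>

#define WARPEDPIXEL_FILTER_BITS 7
#define HORSHEAR_REDUCE_PREC_BITS 5
#define VERSHEAR_REDUCE_PREC_BITS \
  (2 * WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)

/* Local stand-in for ROUND_POWER_OF_TWO: round to nearest, then shift right. */
static int32_t round_power_of_two(int32_t value, int n) {
  return (value + ((1 << n) >> 1)) >> n;
}

/* Horizontal stage: 8-tap filter on 8-bit pixels, reduced by
 * HORSHEAR_REDUCE_PREC_BITS so the intermediate fits in int16_t. */
static int16_t hor_sample(const uint8_t *row, const int16_t *coeffs) {
  int32_t sum = 0;
  for (int m = 0; m < 8; ++m) sum += row[m] * coeffs[m];
  return (int16_t)round_power_of_two(sum, HORSHEAR_REDUCE_PREC_BITS);
}

/* Vertical stage: 8-tap filter on the 16-bit intermediates, reduced by
 * the remaining VERSHEAR_REDUCE_PREC_BITS bits. */
static int32_t ver_sample(const int16_t *col, const int16_t *coeffs) {
  int32_t sum = 0;
  for (int m = 0; m < 8; ++m) sum += col[m] * coeffs[m];
  return round_power_of_two(sum, VERSHEAR_REDUCE_PREC_BITS);
}

For a flat 8-bit row of value v and taps summing to 128, hor_sample() returns round(v * 128 / 32) = v * 4, which matches the fallback fill value ref * (1 << (7 - 5)) in the hunks above.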
@@ -79,11 +79,13 @@ void av1_warp_affine_sse2(int32_t *mat, uint8_t *ref, int width, int height,
// would be taken from the leftmost/rightmost column, then we can
// skip the expensive horizontal filter.
if (ix4 <= -7) {
- tmp[k + 7] =
- _mm_set1_epi16(ref[iy * stride] * (1 << WARPEDPIXEL_FILTER_BITS));
+ tmp[k + 7] = _mm_set1_epi16(
+ ref[iy * stride] *
+ (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
} else if (ix4 >= width + 6) {
- tmp[k + 7] = _mm_set1_epi16(ref[iy * stride + (width - 1)] *
- (1 << WARPEDPIXEL_FILTER_BITS));
+ tmp[k + 7] = _mm_set1_epi16(
+ ref[iy * stride + (width - 1)] *
+ (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
} else {
int sx = sx4 + alpha * (-4) + beta * k +
// Include rounding and offset here
@@ -119,6 +121,9 @@ void av1_warp_affine_sse2(int32_t *mat, uint8_t *ref, int width, int height,
// coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6
__m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
+ __m128i round_const =
+ _mm_set1_epi32((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1);
// Calculate filtered results
__m128i src_0 = _mm_unpacklo_epi8(src, zero);
__m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
@@ -131,6 +136,8 @@ void av1_warp_affine_sse2(int32_t *mat, uint8_t *ref, int width, int height,
__m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
_mm_add_epi32(res_2, res_6));
+ res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const),
+ HORSHEAR_REDUCE_PREC_BITS);
// Filter odd-index pixels
__m128i tmp_1 = filter[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS];
@@ -159,6 +166,8 @@ void av1_warp_affine_sse2(int32_t *mat, uint8_t *ref, int width, int height,
__m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
_mm_add_epi32(res_3, res_7));
+ res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const),
+ HORSHEAR_REDUCE_PREC_BITS);
// Combine results into one register.
// We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7
@@ -240,12 +249,12 @@ void av1_warp_affine_sse2(int32_t *mat, uint8_t *ref, int width, int height,
// Round and pack into 8 bits
__m128i round_const =
- _mm_set1_epi32((1 << (2 * WARPEDPIXEL_FILTER_BITS)) >> 1);
+ _mm_set1_epi32((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1);
__m128i res_lo_round = _mm_srai_epi32(
- _mm_add_epi32(res_lo, round_const), 2 * WARPEDPIXEL_FILTER_BITS);
+ _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
__m128i res_hi_round = _mm_srai_epi32(
- _mm_add_epi32(res_hi, round_const), 2 * WARPEDPIXEL_FILTER_BITS);
+ _mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS);
__m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
__m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);
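The SSE2 hunks express the same rounding in vector form: add round_const = (1 << bits) >> 1, then arithmetic-shift-right by bits with _mm_srai_epi32, first by HORSHEAR_REDUCE_PREC_BITS on the horizontal sums and then by VERSHEAR_REDUCE_PREC_BITS before packing the final pixels. Because the horizontal result is now 5 bits smaller, it has more headroom in the 16-bit lanes used for the intermediate rows (cf. the _mm_set1_epi16 fallbacks above). A small illustrative snippet of that add-then-shift pattern on four int32 lanes, not taken from the patch:

#include <emmintrin.h> /* SSE2 */
#include <stdint.h>
#include <stdio.h>

#define BITS 5 /* HORSHEAR_REDUCE_PREC_BITS in the patch */

int main(void) {
  __m128i v = _mm_setr_epi32(100, 515, 1023, 4096);
  /* Round to nearest, then shift: same pattern as after the filters above. */
  __m128i round_const = _mm_set1_epi32((1 << BITS) >> 1);
  __m128i r = _mm_srai_epi32(_mm_add_epi32(v, round_const), BITS);
  int32_t out[4];
  _mm_storeu_si128((__m128i *)out, r);
  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); /* 3 16 32 128 */
  return 0;
}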