Commit 726a953c authored by David Barker, committed by Angie Chiang

Unify high-precision convolve filters: convolve-round

* Reduce bit widths of intermediate values where possible
* Change ROUND_POWER_OF_TWO_SIGNED to ROUND_POWER_OF_TWO
  in av1(_highbd)_convolve_2d
* Apply offsetting and bounds checking to match the intended
  hardware implementation (see the rounding/offset sketch below)
* Separate the implementations of av1(_highbd)_convolve_2d
  into compound-round and non-compound-round cases, since the
  two variants now differ in a significant number of places.
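
As a reference for the offsetting scheme, here is a small standalone sketch
(not part of this patch) of the arithmetic used by the non-compound-round
path: the horizontal pass adds 1 << (bd + FILTER_BITS - 1) so the accumulator
stays non-negative even with negative filter taps, the vertical pass adds a
further 1 << offset_bits, and the constant subtracted at the end removes both
offsets (after horizontal rounding and the vertical filter gain, the first
offset contributes roughly 1 << (offset_bits - 1)). It assumes the usual
libaom ROUND_POWER_OF_TWO behaviour (add half, then shift right) and
FILTER_BITS == 7; the helper round_pow2 and the round0/round1/p values are
illustrative, not taken from the patch.

/* Standalone illustration only -- not libaom code. Traces the offset
 * bookkeeping for a flat source patch whose pixels all equal p, filtered by
 * a kernel whose taps sum to 1 << FILTER_BITS. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define FILTER_BITS 7

/* Unsigned rounding right-shift (add half, then shift), the behaviour assumed
 * for ROUND_POWER_OF_TWO. */
static int32_t round_pow2(int32_t v, int n) {
  return (v + ((1 << n) >> 1)) >> n;
}

int main(void) {
  const int bd = 8, round0 = 5, round1 = 0; /* illustrative values only */
  const int p = 200;                        /* flat patch: every pixel == p */

  /* Horizontal pass: the offset keeps the accumulator non-negative, so the
   * unsigned rounding macro is safe. */
  int32_t h_sum = (1 << (bd + FILTER_BITS - 1)) + (p << FILTER_BITS);
  assert(0 <= h_sum && h_sum < (1 << (bd + FILTER_BITS + 1)));
  const int32_t im = round_pow2(h_sum, round0);

  /* Vertical pass: add a second offset, then subtract both offsets at the
   * end, mirroring the non-compound-round av1_convolve_2d in this patch. */
  const int offset_bits = bd + 2 * FILTER_BITS - round0;
  int32_t v_sum = (1 << offset_bits) + (im << FILTER_BITS);
  assert(0 <= v_sum && v_sum < (1 << (offset_bits + 2)));
  const int32_t res =
      round_pow2(v_sum, round1) - ((1 << (offset_bits - round1)) +
                                   (1 << (offset_bits - round1 - 1)));

  /* For a flat patch the offsets cancel and only the filter gain remains. */
  printf("res = %d, expected = %d\n", (int)res,
         p << (2 * FILTER_BITS - round0 - round1));
  return 0;
}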

Overall, this is expected to affect the bitstream and encoder output
when convolve-round alone is enabled, but *not* when compound-round
is enabled.

Change-Id: I8c21e0645fd11f64c59552885f87f4a5dd40ccf7
parent e64d51a9
@@ -203,58 +203,110 @@ void av1_convolve_rounding(const int32_t *src, int src_stride, uint8_t *dst,
   for (r = 0; r < h; ++r) {
     for (c = 0; c < w; ++c) {
       dst[r * dst_stride + c] =
-          clip_pixel(ROUND_POWER_OF_TWO_SIGNED(src[r * src_stride + c], bits));
+          clip_pixel(ROUND_POWER_OF_TWO(src[r * src_stride + c], bits));
     }
   }
 }
 
+#if CONFIG_COMPOUND_ROUND
 void av1_convolve_2d(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst,
                      int dst_stride, int w, int h,
                      InterpFilterParams *filter_params_x,
                      InterpFilterParams *filter_params_y, const int subpel_x_q4,
                      const int subpel_y_q4, ConvolveParams *conv_params) {
   int x, y, k;
-  CONV_BUF_TYPE im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
+  uint8_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
   int im_h = h + filter_params_y->taps - 1;
   int im_stride = w;
   const int fo_vert = filter_params_y->taps / 2 - 1;
   const int fo_horiz = filter_params_x->taps / 2 - 1;
-  (void)conv_params;
   // horizontal filter
   const uint8_t *src_horiz = src - fo_vert * src_stride;
   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
       *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
   for (y = 0; y < im_h; ++y) {
     for (x = 0; x < w; ++x) {
-      CONV_BUF_TYPE sum = 0;
+      int32_t sum = 0;
       for (k = 0; k < filter_params_x->taps; ++k) {
         sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
       }
-#if CONFIG_COMPOUND_ROUND
       im_block[y * im_stride + x] =
-          clip_pixel(ROUND_POWER_OF_TWO_SIGNED(sum, conv_params->round_0));
+          clip_pixel(ROUND_POWER_OF_TWO(sum, conv_params->round_0));
+    }
+  }
+  // vertical filter
+  uint8_t *src_vert = im_block + fo_vert * im_stride;
+  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+  for (y = 0; y < h; ++y) {
+    for (x = 0; x < w; ++x) {
+      CONV_BUF_TYPE sum = 0;
+      for (k = 0; k < filter_params_y->taps; ++k) {
+        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
+      }
+      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
+      dst[y * dst_stride + x] += res;
+    }
+  }
+}
 #else
+/* When convolve-round is enabled and compound-round is disabled, we use a
+   high-precision convolve filter.
+   Note: For notes on hardware implementations, including the required
+   bit widths for various intermediate values, see the comments above
+   av1_warp_affine_c.
+*/
+void av1_convolve_2d(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst,
+                     int dst_stride, int w, int h,
+                     InterpFilterParams *filter_params_x,
+                     InterpFilterParams *filter_params_y, const int subpel_x_q4,
+                     const int subpel_y_q4, ConvolveParams *conv_params) {
+  int x, y, k;
+  int32_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
+  int im_h = h + filter_params_y->taps - 1;
+  int im_stride = w;
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+  const int bd = 8;
+  // horizontal filter
+  const uint8_t *src_horiz = src - fo_vert * src_stride;
+  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+  for (y = 0; y < im_h; ++y) {
+    for (x = 0; x < w; ++x) {
+      int32_t sum = (1 << (bd + FILTER_BITS - 1));
+      for (k = 0; k < filter_params_x->taps; ++k) {
+        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
+      }
+      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
       im_block[y * im_stride + x] =
-          ROUND_POWER_OF_TWO_SIGNED(sum, conv_params->round_0);
-#endif
+          ROUND_POWER_OF_TWO(sum, conv_params->round_0);
     }
   }
   // vertical filter
-  CONV_BUF_TYPE *src_vert = im_block + fo_vert * im_stride;
+  int32_t *src_vert = im_block + fo_vert * im_stride;
   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
       *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
   for (y = 0; y < h; ++y) {
     for (x = 0; x < w; ++x) {
-      CONV_BUF_TYPE sum = 0;
+      CONV_BUF_TYPE sum = 1 << offset_bits;
       for (k = 0; k < filter_params_y->taps; ++k) {
         sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
       }
-      dst[y * dst_stride + x] +=
-          ROUND_POWER_OF_TWO_SIGNED(sum, conv_params->round_1);
+      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
+      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
+                          ((1 << (offset_bits - conv_params->round_1)) +
+                           (1 << (offset_bits - conv_params->round_1 - 1)));
+      dst[y * dst_stride + x] += res;
     }
   }
 }
+#endif
 
 static INLINE void transpose_uint8(uint8_t *dst, int dst_stride,
                                    const uint8_t *src, int src_stride, int w,
@@ -350,11 +402,12 @@ void av1_highbd_convolve_rounding(const int32_t *src, int src_stride,
   for (r = 0; r < h; ++r) {
     for (c = 0; c < w; ++c) {
       dst[r * dst_stride + c] = clip_pixel_highbd(
-          ROUND_POWER_OF_TWO_SIGNED(src[r * src_stride + c], bits), bd);
+          ROUND_POWER_OF_TWO(src[r * src_stride + c], bits), bd);
     }
   }
 }
 
+#if CONFIG_COMPOUND_ROUND
 void av1_highbd_convolve_2d(const uint16_t *src, int src_stride,
                             CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
                             InterpFilterParams *filter_params_x,
@@ -362,48 +415,93 @@ void av1_highbd_convolve_2d(const uint16_t *src, int src_stride,
                             const int subpel_x_q4, const int subpel_y_q4,
                             ConvolveParams *conv_params, int bd) {
   int x, y, k;
-  CONV_BUF_TYPE im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
+  uint16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
   int im_h = h + filter_params_y->taps - 1;
   int im_stride = w;
   const int fo_vert = filter_params_y->taps / 2 - 1;
   const int fo_horiz = filter_params_x->taps / 2 - 1;
-  (void)conv_params;
   // horizontal filter
   const uint16_t *src_horiz = src - fo_vert * src_stride;
   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
       *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
   for (y = 0; y < im_h; ++y) {
     for (x = 0; x < w; ++x) {
-      CONV_BUF_TYPE sum = 0;
+      int32_t sum = 0;
       for (k = 0; k < filter_params_x->taps; ++k) {
         sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
       }
-#if CONFIG_COMPOUND_ROUND
-      im_block[y * im_stride + x] = clip_pixel_highbd(
-          ROUND_POWER_OF_TWO_SIGNED(sum, conv_params->round_0), bd);
+      im_block[y * im_stride + x] =
+          clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, conv_params->round_0), bd);
+    }
+  }
+  // vertical filter
+  uint16_t *src_vert = im_block + fo_vert * im_stride;
+  const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+  for (y = 0; y < h; ++y) {
+    for (x = 0; x < w; ++x) {
+      CONV_BUF_TYPE sum = 0;
+      for (k = 0; k < filter_params_y->taps; ++k) {
+        sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
+      }
+      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
+      dst[y * dst_stride + x] += res;
+    }
+  }
+}
 #else
+void av1_highbd_convolve_2d(const uint16_t *src, int src_stride,
+                            CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
+                            InterpFilterParams *filter_params_x,
+                            InterpFilterParams *filter_params_y,
+                            const int subpel_x_q4, const int subpel_y_q4,
+                            ConvolveParams *conv_params, int bd) {
+  int x, y, k;
+  int32_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
+  int im_h = h + filter_params_y->taps - 1;
+  int im_stride = w;
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+  // horizontal filter
+  const uint16_t *src_horiz = src - fo_vert * src_stride;
+  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+  for (y = 0; y < im_h; ++y) {
+    for (x = 0; x < w; ++x) {
+      int32_t sum = (1 << (bd + FILTER_BITS - 1));
+      for (k = 0; k < filter_params_x->taps; ++k) {
+        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
+      }
+      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
       (void)bd;
       im_block[y * im_stride + x] =
-          ROUND_POWER_OF_TWO_SIGNED(sum, conv_params->round_0);
-#endif
+          ROUND_POWER_OF_TWO(sum, conv_params->round_0);
     }
   }
   // vertical filter
-  CONV_BUF_TYPE *src_vert = im_block + fo_vert * im_stride;
+  int32_t *src_vert = im_block + fo_vert * im_stride;
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
       *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
   for (y = 0; y < h; ++y) {
     for (x = 0; x < w; ++x) {
-      CONV_BUF_TYPE sum = 0;
+      CONV_BUF_TYPE sum = 1 << offset_bits;
       for (k = 0; k < filter_params_y->taps; ++k) {
         sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
       }
-      dst[y * dst_stride + x] +=
-          ROUND_POWER_OF_TWO_SIGNED(sum, conv_params->round_1);
+      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
+      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
+                          ((1 << (offset_bits - conv_params->round_1)) +
+                           (1 << (offset_bits - conv_params->round_1 - 1)));
+      dst[y * dst_stride + x] += res;
     }
   }
 }
+#endif
 
 void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
                                    uint8_t *dst, int dst_stride, int w, int h,
...