Commit 22bbe4cc authored by Debargha Mukherjee's avatar Debargha Mukherjee

Reduce line buffer size for Wiener filter.

This patch forces the vertical filtering for the top and bottom
rows of a processing unit for the Wiener filter to be 5-tap.
The 5 taps are derived from the primary 7-tap filter by forcing
the taps at the end to be zero, and absorbing their weights into
the other taps to maintain normalization.
This will effectively reduce the line buffer size for luma Wiener
filter to 4 (from 6).

Change-Id: I5e21b58369777eabf553a8987387d112f98a5598
parent 1a191125
......@@ -146,6 +146,17 @@ static void loop_copy_tile(uint8_t *data, int tile_idx, int subtile_idx,
h_end - h_start);
}
// Convert a 7-tap Wiener filter into a 5-tap one for the top and bottom
// rows of a processing unit: zero the outermost taps and fold their weight
// into the neighboring taps so the kernel stays normalized.
// NOTE(review): both ends absorb vert[0]; this relies on the Wiener kernel
// being symmetric (vert[WIENER_WIN - 1] == vert[0]) — confirm upstream.
static void stepdown_wiener_kernel(const InterpKernel orig, InterpKernel vert) {
  memcpy(vert, orig, sizeof(InterpKernel));
  const int outer = vert[0];        // weight of the tap being dropped
  const int half = outer / 2;       // portion folded into taps 1 / WIN-2
  const int rest = outer - half;    // remainder folded into taps 2 / WIN-3
  vert[1] += half;
  vert[WIENER_WIN - 2] += half;
  vert[2] += rest;
  vert[WIENER_WIN - 3] += rest;
  vert[0] = 0;
  vert[WIENER_WIN - 1] = 0;
}
static void loop_wiener_filter_tile(uint8_t *data, int tile_idx, int width,
int height, int stride,
RestorationInternal *rst, uint8_t *dst,
......@@ -161,6 +172,9 @@ static void loop_wiener_filter_tile(uint8_t *data, int tile_idx, int width,
dst_stride);
return;
}
InterpKernel vertical_topbot;
stepdown_wiener_kernel(rst->rsi->wiener_info[tile_idx].vfilter,
vertical_topbot);
av1_get_rest_tile_limits(tile_idx, 0, 0, rst->nhtiles, rst->nvtiles,
tile_width, tile_height, width, height, 0, 0,
&h_start, &h_end, &v_start, &v_end);
......@@ -172,15 +186,41 @@ static void loop_wiener_filter_tile(uint8_t *data, int tile_idx, int width,
int h = AOMMIN(procunit_height, (v_end - i + 15) & ~15);
const uint8_t *data_p = data + i * stride + j;
uint8_t *dst_p = dst + i * dst_stride + j;
// Use 5-tap vertical filtering for top and bottom rows in
// processing unit
#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
aom_convolve8_add_src_hip(data_p, stride, dst_p, dst_stride,
rst->rsi->wiener_info[tile_idx].hfilter, 16,
vertical_topbot, 16, w, 1);
#else
aom_convolve8_add_src(data_p, stride, dst_p, dst_stride,
rst->rsi->wiener_info[tile_idx].hfilter, 16,
vertical_topbot, 16, w, 1);
#endif // USE_WIENER_HIGH_INTERMEDIATE_PRECISION
data_p += stride;
dst_p += dst_stride;
// Note h is at least 16
#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
aom_convolve8_add_src_hip(data_p, stride, dst_p, dst_stride,
rst->rsi->wiener_info[tile_idx].hfilter, 16,
rst->rsi->wiener_info[tile_idx].vfilter, 16, w,
h);
h - 2);
#else
aom_convolve8_add_src(data_p, stride, dst_p, dst_stride,
rst->rsi->wiener_info[tile_idx].hfilter, 16,
rst->rsi->wiener_info[tile_idx].vfilter, 16, w,
h - 2);
#endif // USE_WIENER_HIGH_INTERMEDIATE_PRECISION
data_p += stride * (h - 2);
dst_p += dst_stride * (h - 2);
#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
aom_convolve8_add_src_hip(data_p, stride, dst_p, dst_stride,
rst->rsi->wiener_info[tile_idx].hfilter, 16,
vertical_topbot, 16, w, 1);
#else
aom_convolve8_add_src(data_p, stride, dst_p, dst_stride,
rst->rsi->wiener_info[tile_idx].hfilter, 16,
rst->rsi->wiener_info[tile_idx].vfilter, 16, w, h);
vertical_topbot, 16, w, 1);
#endif // USE_WIENER_HIGH_INTERMEDIATE_PRECISION
}
}
......@@ -1011,6 +1051,9 @@ static void loop_wiener_filter_tile_highbd(uint16_t *data, int tile_idx,
av1_get_rest_tile_limits(tile_idx, 0, 0, rst->nhtiles, rst->nvtiles,
tile_width, tile_height, width, height, 0, 0,
&h_start, &h_end, &v_start, &v_end);
InterpKernel vertical_topbot;
stepdown_wiener_kernel(rst->rsi->wiener_info[tile_idx].vfilter,
vertical_topbot);
// Convolve the whole tile (done in blocks here to match the requirements
// of the vectorized convolve functions, but the result is equivalent)
for (i = v_start; i < v_end; i += procunit_height)
......@@ -1019,16 +1062,45 @@ static void loop_wiener_filter_tile_highbd(uint16_t *data, int tile_idx,
int h = AOMMIN(procunit_height, (v_end - i + 15) & ~15);
const uint16_t *data_p = data + i * stride + j;
uint16_t *dst_p = dst + i * dst_stride + j;
// Use 5-tap vertical filtering for the top and bottom rows of the
// processing unit
#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
aom_highbd_convolve8_add_src_hip(
CONVERT_TO_BYTEPTR(data_p), stride, CONVERT_TO_BYTEPTR(dst_p),
dst_stride, rst->rsi->wiener_info[tile_idx].hfilter, 16,
vertical_topbot, 16, w, 1, bit_depth);
#else
aom_highbd_convolve8_add_src(CONVERT_TO_BYTEPTR(data_p), stride,
CONVERT_TO_BYTEPTR(dst_p), dst_stride,
rst->rsi->wiener_info[tile_idx].hfilter, 16,
vertical_topbot, 16, w, 1, bit_depth);
#endif // USE_WIENER_HIGH_INTERMEDIATE_PRECISION
data_p += stride;
dst_p += dst_stride;
// Note h is at least 16
#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
aom_highbd_convolve8_add_src_hip(
CONVERT_TO_BYTEPTR(data_p), stride, CONVERT_TO_BYTEPTR(dst_p),
dst_stride, rst->rsi->wiener_info[tile_idx].hfilter, 16,
rst->rsi->wiener_info[tile_idx].vfilter, 16, w, h, bit_depth);
rst->rsi->wiener_info[tile_idx].vfilter, 16, w, h - 2, bit_depth);
#else
aom_highbd_convolve8_add_src(
CONVERT_TO_BYTEPTR(data_p), stride, CONVERT_TO_BYTEPTR(dst_p),
dst_stride, rst->rsi->wiener_info[tile_idx].hfilter, 16,
rst->rsi->wiener_info[tile_idx].vfilter, 16, w, h, bit_depth);
rst->rsi->wiener_info[tile_idx].vfilter, 16, w, h - 2, bit_depth);
#endif // USE_WIENER_HIGH_INTERMEDIATE_PRECISION
data_p += stride * (h - 2);
dst_p += dst_stride * (h - 2);
#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
aom_highbd_convolve8_add_src_hip(
CONVERT_TO_BYTEPTR(data_p), stride, CONVERT_TO_BYTEPTR(dst_p),
dst_stride, rst->rsi->wiener_info[tile_idx].hfilter, 16,
vertical_topbot, 16, w, 1, bit_depth);
#else
aom_highbd_convolve8_add_src(CONVERT_TO_BYTEPTR(data_p), stride,
CONVERT_TO_BYTEPTR(dst_p), dst_stride,
rst->rsi->wiener_info[tile_idx].hfilter, 16,
vertical_topbot, 16, w, 1, bit_depth);
#endif // USE_WIENER_HIGH_INTERMEDIATE_PRECISION
}
}
......
......@@ -358,8 +358,8 @@ static void search_selfguided_restoration(uint8_t *dat8, int width, int height,
int ep, bestep = 0;
int64_t err, besterr = -1;
int exqd[2], bestxqd[2] = { 0, 0 };
int flt1_stride = width;
int flt2_stride = width;
int flt1_stride = ((width + 7) & ~7) + 8;
int flt2_stride = ((width + 7) & ~7) + 8;
assert(pu_width == (RESTORATION_PROC_UNIT_SIZE >> 1) ||
pu_width == RESTORATION_PROC_UNIT_SIZE);
assert(pu_height == (RESTORATION_PROC_UNIT_SIZE >> 1) ||
......@@ -385,11 +385,11 @@ static void search_selfguided_restoration(uint8_t *dat8, int width, int height,
flt1_stride, sgr_params[ep].corner,
sgr_params[ep].edge);
#else
av1_selfguided_restoration_highbd_c(
av1_selfguided_restoration_highbd(
dat_p, w, h, dat_stride, flt1_p, flt1_stride, bit_depth,
sgr_params[ep].r1, sgr_params[ep].e1, tmpbuf2);
#endif // USE_HIGHPASS_IN_SGRPROJ
av1_selfguided_restoration_highbd_c(
av1_selfguided_restoration_highbd(
dat_p, w, h, dat_stride, flt2_p, flt2_stride, bit_depth,
sgr_params[ep].r2, sgr_params[ep].e2, tmpbuf2);
}
......@@ -406,13 +406,13 @@ static void search_selfguided_restoration(uint8_t *dat8, int width, int height,
av1_highpass_filter(dat_p, w, h, dat_stride, flt1_p, flt1_stride,
sgr_params[ep].corner, sgr_params[ep].edge);
#else
av1_selfguided_restoration_c(dat_p, w, h, dat_stride, flt1_p,
flt1_stride, sgr_params[ep].r1,
sgr_params[ep].e1, tmpbuf2);
av1_selfguided_restoration(dat_p, w, h, dat_stride, flt1_p, flt1_stride,
sgr_params[ep].r1, sgr_params[ep].e1,
tmpbuf2);
#endif // USE_HIGHPASS_IN_SGRPROJ
av1_selfguided_restoration_c(dat_p, w, h, dat_stride, flt2_p,
flt2_stride, sgr_params[ep].r2,
sgr_params[ep].e2, tmpbuf2);
av1_selfguided_restoration(dat_p, w, h, dat_stride, flt2_p,
flt2_stride, sgr_params[ep].r2,
sgr_params[ep].e2, tmpbuf2);
}
#if CONFIG_HIGHBITDEPTH
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment