Commit 7a5587a8 authored by Debargha Mukherjee

Make loop-restoration use 64x64 processing units

Changes loop-restoration to use a processing unit size that is
64x64 for luma; for chroma, the processing unit is coupled to the
64x64 support region for luma.
Thus, for chroma the processing unit size is 32x32 for 4:2:0,
32x64 for 4:2:2, 64x64 for 4:4:4, etc.

While the Wiener filter output should not change with this patch,
the sgr filter output will change, since boundary pixel handling in
sgr is done internally within the filter.

Change-Id: I65a9e2df88927a19445420ce400acb1fcf7afa93
parent 1545bdb3
......@@ -150,6 +150,8 @@ static void loop_wiener_filter_tile(uint8_t *data, int tile_idx, int width,
int height, int stride,
RestorationInternal *rst, uint8_t *dst,
int dst_stride) {
const int procunit_width = rst->rsi->procunit_width;
const int procunit_height = rst->rsi->procunit_height;
const int tile_width = rst->tile_width;
const int tile_height = rst->tile_height;
int i, j;
......@@ -164,10 +166,10 @@ static void loop_wiener_filter_tile(uint8_t *data, int tile_idx, int width,
&h_start, &h_end, &v_start, &v_end);
// Convolve the whole tile (done in blocks here to match the requirements
// of the vectorized convolve functions, but the result is equivalent)
for (i = v_start; i < v_end; i += MAX_SB_SIZE)
for (j = h_start; j < h_end; j += MAX_SB_SIZE) {
int w = AOMMIN(MAX_SB_SIZE, (h_end - j + 15) & ~15);
int h = AOMMIN(MAX_SB_SIZE, (v_end - i + 15) & ~15);
for (i = v_start; i < v_end; i += procunit_height)
for (j = h_start; j < h_end; j += procunit_width) {
int w = AOMMIN(procunit_width, (h_end - j + 15) & ~15);
int h = AOMMIN(procunit_height, (v_end - i + 15) & ~15);
const uint8_t *data_p = data + i * stride + j;
uint8_t *dst_p = dst + i * dst_stride + j;
#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
......@@ -896,10 +898,12 @@ static void loop_sgrproj_filter_tile(uint8_t *data, int tile_idx, int width,
int height, int stride,
RestorationInternal *rst, uint8_t *dst,
int dst_stride) {
const int procunit_width = rst->rsi->procunit_width;
const int procunit_height = rst->rsi->procunit_height;
const int tile_width = rst->tile_width;
const int tile_height = rst->tile_height;
int i, j;
int h_start, h_end, v_start, v_end;
uint8_t *data_p, *dst_p;
if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) {
loop_copy_tile(data, tile_idx, 0, 0, width, height, stride, rst, dst,
......@@ -909,12 +913,16 @@ static void loop_sgrproj_filter_tile(uint8_t *data, int tile_idx, int width,
av1_get_rest_tile_limits(tile_idx, 0, 0, rst->nhtiles, rst->nvtiles,
tile_width, tile_height, width, height, 0, 0,
&h_start, &h_end, &v_start, &v_end);
data_p = data + h_start + v_start * stride;
dst_p = dst + h_start + v_start * dst_stride;
apply_selfguided_restoration(data_p, h_end - h_start, v_end - v_start, stride,
rst->rsi->sgrproj_info[tile_idx].ep,
rst->rsi->sgrproj_info[tile_idx].xqd, dst_p,
dst_stride, rst->tmpbuf);
for (i = v_start; i < v_end; i += procunit_height)
for (j = h_start; j < h_end; j += procunit_width) {
int w = AOMMIN(procunit_width, h_end - j);
int h = AOMMIN(procunit_height, v_end - i);
uint8_t *data_p = data + i * stride + j;
uint8_t *dst_p = dst + i * dst_stride + j;
apply_selfguided_restoration(
data_p, w, h, stride, rst->rsi->sgrproj_info[tile_idx].ep,
rst->rsi->sgrproj_info[tile_idx].xqd, dst_p, dst_stride, rst->tmpbuf);
}
}
static void loop_sgrproj_filter(uint8_t *data, int width, int height,
......@@ -988,6 +996,8 @@ static void loop_wiener_filter_tile_highbd(uint16_t *data, int tile_idx,
RestorationInternal *rst,
int bit_depth, uint16_t *dst,
int dst_stride) {
const int procunit_width = rst->rsi->procunit_width;
const int procunit_height = rst->rsi->procunit_height;
const int tile_width = rst->tile_width;
const int tile_height = rst->tile_height;
int h_start, h_end, v_start, v_end;
......@@ -1003,10 +1013,10 @@ static void loop_wiener_filter_tile_highbd(uint16_t *data, int tile_idx,
&h_start, &h_end, &v_start, &v_end);
// Convolve the whole tile (done in blocks here to match the requirements
// of the vectorized convolve functions, but the result is equivalent)
for (i = v_start; i < v_end; i += MAX_SB_SIZE)
for (j = h_start; j < h_end; j += MAX_SB_SIZE) {
int w = AOMMIN(MAX_SB_SIZE, (h_end - j + 15) & ~15);
int h = AOMMIN(MAX_SB_SIZE, (v_end - i + 15) & ~15);
for (i = v_start; i < v_end; i += procunit_height)
for (j = h_start; j < h_end; j += procunit_width) {
int w = AOMMIN(procunit_width, (h_end - j + 15) & ~15);
int h = AOMMIN(procunit_height, (v_end - i + 15) & ~15);
const uint16_t *data_p = data + i * stride + j;
uint16_t *dst_p = dst + i * dst_stride + j;
#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
......@@ -1185,10 +1195,12 @@ static void loop_sgrproj_filter_tile_highbd(uint16_t *data, int tile_idx,
RestorationInternal *rst,
int bit_depth, uint16_t *dst,
int dst_stride) {
const int procunit_width = rst->rsi->procunit_width;
const int procunit_height = rst->rsi->procunit_height;
const int tile_width = rst->tile_width;
const int tile_height = rst->tile_height;
int i, j;
int h_start, h_end, v_start, v_end;
uint16_t *data_p, *dst_p;
if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) {
loop_copy_tile_highbd(data, tile_idx, 0, 0, width, height, stride, rst, dst,
......@@ -1198,12 +1210,16 @@ static void loop_sgrproj_filter_tile_highbd(uint16_t *data, int tile_idx,
av1_get_rest_tile_limits(tile_idx, 0, 0, rst->nhtiles, rst->nvtiles,
tile_width, tile_height, width, height, 0, 0,
&h_start, &h_end, &v_start, &v_end);
data_p = data + h_start + v_start * stride;
dst_p = dst + h_start + v_start * dst_stride;
apply_selfguided_restoration_highbd(
data_p, h_end - h_start, v_end - v_start, stride, bit_depth,
rst->rsi->sgrproj_info[tile_idx].ep, rst->rsi->sgrproj_info[tile_idx].xqd,
dst_p, dst_stride, rst->tmpbuf);
for (i = v_start; i < v_end; i += procunit_height)
for (j = h_start; j < h_end; j += procunit_width) {
int w = AOMMIN(procunit_width, h_end - j);
int h = AOMMIN(procunit_height, v_end - i);
uint16_t *data_p = data + i * stride + j;
uint16_t *dst_p = dst + i * dst_stride + j;
apply_selfguided_restoration_highbd(
data_p, w, h, stride, bit_depth, rst->rsi->sgrproj_info[tile_idx].ep,
rst->rsi->sgrproj_info[tile_idx].xqd, dst_p, dst_stride, rst->tmpbuf);
}
}
static void loop_sgrproj_filter_highbd(uint8_t *data8, int width, int height,
......
......@@ -24,6 +24,8 @@ extern "C" {
#define CLIP(x, lo, hi) ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x))
#define RINT(x) ((x) < 0 ? (int)((x)-0.5) : (int)((x) + 0.5))
#define RESTORATION_PROC_UNIT_SIZE 64
#define RESTORATION_TILESIZE_MAX 256
#define RESTORATION_TILEPELS_MAX \
(RESTORATION_TILESIZE_MAX * RESTORATION_TILESIZE_MAX * 9 / 4)
......@@ -150,6 +152,7 @@ typedef struct {
typedef struct {
int restoration_tilesize;
int procunit_width, procunit_height;
RestorationType frame_restoration_type;
RestorationType *restoration_type;
// Wiener filter
......
......@@ -2790,6 +2790,13 @@ static void decode_restoration_mode(AV1_COMMON *cm,
cm->rst_info[1].restoration_tilesize = cm->rst_info[0].restoration_tilesize;
}
cm->rst_info[2].restoration_tilesize = cm->rst_info[1].restoration_tilesize;
cm->rst_info[0].procunit_width = cm->rst_info[0].procunit_height =
RESTORATION_PROC_UNIT_SIZE;
cm->rst_info[1].procunit_width = cm->rst_info[2].procunit_width =
RESTORATION_PROC_UNIT_SIZE >> cm->subsampling_x;
cm->rst_info[1].procunit_height = cm->rst_info[2].procunit_height =
RESTORATION_PROC_UNIT_SIZE >> cm->subsampling_y;
}
static void read_wiener_filter(int wiener_win, WienerInfo *wiener_info,
......
......@@ -3874,6 +3874,12 @@ static void set_restoration_tilesize(int width, int height, int sx, int sy,
rst[0].restoration_tilesize = (RESTORATION_TILESIZE_MAX >> 1);
rst[1].restoration_tilesize = rst[0].restoration_tilesize >> s;
rst[2].restoration_tilesize = rst[1].restoration_tilesize;
rst[0].procunit_width = rst[0].procunit_height = RESTORATION_PROC_UNIT_SIZE;
rst[1].procunit_width = rst[2].procunit_width =
RESTORATION_PROC_UNIT_SIZE >> sx;
rst[1].procunit_height = rst[2].procunit_height =
RESTORATION_PROC_UNIT_SIZE >> sy;
}
#endif // CONFIG_LOOP_RESTORATION
......@@ -3921,6 +3927,8 @@ static void set_frame_size(AV1_COMP *cpi, int width, int height) {
for (int i = 0; i < MAX_MB_PLANE; ++i) {
cpi->rst_search[i].restoration_tilesize =
cm->rst_info[i].restoration_tilesize;
cpi->rst_search[i].procunit_width = cm->rst_info[i].procunit_width;
cpi->rst_search[i].procunit_height = cm->rst_info[i].procunit_height;
av1_alloc_restoration_struct(cm, &cpi->rst_search[i],
#if CONFIG_FRAME_SUPERRES
cm->superres_upscaled_width,
......
......@@ -349,7 +349,8 @@ void encode_xq(int *xq, int *xqd) {
static void search_selfguided_restoration(uint8_t *dat8, int width, int height,
int dat_stride, const uint8_t *src8,
int src_stride, int use_highbitdepth,
int bit_depth, int *eps, int *xqd,
int bit_depth, int pu_width,
int pu_height, int *eps, int *xqd,
int32_t *rstbuf) {
int32_t *flt1 = rstbuf;
int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
......@@ -357,45 +358,71 @@ static void search_selfguided_restoration(uint8_t *dat8, int width, int height,
int ep, bestep = 0;
int64_t err, besterr = -1;
int exqd[2], bestxqd[2] = { 0, 0 };
int flt1_stride = width;
int flt2_stride = width;
assert(pu_width == (RESTORATION_PROC_UNIT_SIZE >> 1) ||
pu_width == RESTORATION_PROC_UNIT_SIZE);
assert(pu_height == (RESTORATION_PROC_UNIT_SIZE >> 1) ||
pu_height == RESTORATION_PROC_UNIT_SIZE);
for (ep = 0; ep < SGRPROJ_PARAMS; ep++) {
int exq[2];
#if CONFIG_HIGHBITDEPTH
if (use_highbitdepth) {
uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
for (int i = 0; i < height; i += pu_height)
for (int j = 0; j < width; j += pu_width) {
const int w = AOMMIN(pu_width, width - j);
const int h = AOMMIN(pu_height, height - i);
uint16_t *dat_p = dat + i * dat_stride + j;
int32_t *flt1_p = flt1 + i * flt1_stride + j;
int32_t *flt2_p = flt2 + i * flt2_stride + j;
#if USE_HIGHPASS_IN_SGRPROJ
av1_highpass_filter_highbd(dat, width, height, dat_stride, flt1, width,
sgr_params[ep].corner, sgr_params[ep].edge);
av1_highpass_filter_highbd(dat_p, w, h, dat_stride, flt1_p,
flt1_stride, sgr_params[ep].corner,
sgr_params[ep].edge);
#else
av1_selfguided_restoration_highbd(dat, width, height, dat_stride, flt1,
width, bit_depth, sgr_params[ep].r1,
sgr_params[ep].e1, tmpbuf2);
av1_selfguided_restoration_highbd_c(
dat_p, w, h, dat_stride, flt1_p, flt1_stride, bit_depth,
sgr_params[ep].r1, sgr_params[ep].e1, tmpbuf2);
#endif // USE_HIGHPASS_IN_SGRPROJ
av1_selfguided_restoration_highbd(dat, width, height, dat_stride, flt2,
width, bit_depth, sgr_params[ep].r2,
sgr_params[ep].e2, tmpbuf2);
av1_selfguided_restoration_highbd_c(
dat_p, w, h, dat_stride, flt2_p, flt2_stride, bit_depth,
sgr_params[ep].r2, sgr_params[ep].e2, tmpbuf2);
}
} else {
#endif
for (int i = 0; i < height; i += pu_height)
for (int j = 0; j < width; j += pu_width) {
const int w = AOMMIN(pu_width, width - j);
const int h = AOMMIN(pu_height, height - i);
uint8_t *dat_p = dat8 + i * dat_stride + j;
int32_t *flt1_p = flt1 + i * flt1_stride + j;
int32_t *flt2_p = flt2 + i * flt2_stride + j;
#if USE_HIGHPASS_IN_SGRPROJ
av1_highpass_filter(dat8, width, height, dat_stride, flt1, width,
sgr_params[ep].corner, sgr_params[ep].edge);
av1_highpass_filter(dat_p, w, h, dat_stride, flt1_p, flt1_stride,
sgr_params[ep].corner, sgr_params[ep].edge);
#else
av1_selfguided_restoration(dat8, width, height, dat_stride, flt1, width,
sgr_params[ep].r1, sgr_params[ep].e1, tmpbuf2);
av1_selfguided_restoration_c(dat_p, w, h, dat_stride, flt1_p,
flt1_stride, sgr_params[ep].r1,
sgr_params[ep].e1, tmpbuf2);
#endif // USE_HIGHPASS_IN_SGRPROJ
av1_selfguided_restoration(dat8, width, height, dat_stride, flt2, width,
sgr_params[ep].r2, sgr_params[ep].e2, tmpbuf2);
av1_selfguided_restoration_c(dat_p, w, h, dat_stride, flt2_p,
flt2_stride, sgr_params[ep].r2,
sgr_params[ep].e2, tmpbuf2);
}
#if CONFIG_HIGHBITDEPTH
}
#endif
aom_clear_system_state();
get_proj_subspace(src8, width, height, src_stride, dat8, dat_stride,
use_highbitdepth, flt1, width, flt2, width, exq);
use_highbitdepth, flt1, flt1_stride, flt2, flt2_stride,
exq);
aom_clear_system_state();
encode_xq(exq, exqd);
err = finer_search_pixel_proj_error(src8, width, height, src_stride, dat8,
dat_stride, use_highbitdepth, flt1,
width, flt2, width, 2, exqd);
err = finer_search_pixel_proj_error(
src8, width, height, src_stride, dat8, dat_stride, use_highbitdepth,
flt1, flt1_stride, flt2, flt2_stride, 2, exqd);
if (besterr == -1 || err < besterr) {
bestep = ep;
besterr = err;
......@@ -557,16 +584,17 @@ static void search_sgrproj_for_rtile(const struct rest_search_ctxt *ctxt,
const uint8_t *src_start =
ctxt->src_buffer + v_start * ctxt->src_stride + h_start;
search_selfguided_restoration(dgd_start, h_end - h_start, v_end - v_start,
ctxt->dgd_stride, src_start, ctxt->src_stride,
search_selfguided_restoration(
dgd_start, h_end - h_start, v_end - v_start, ctxt->dgd_stride, src_start,
ctxt->src_stride,
#if CONFIG_HIGHBITDEPTH
cm->use_highbitdepth, cm->bit_depth,
cm->use_highbitdepth, cm->bit_depth,
#else
0, 8,
0, 8,
#endif // CONFIG_HIGHBITDEPTH
&rtile_sgrproj_info->ep,
rtile_sgrproj_info->xqd,
cm->rst_internal.tmpbuf);
rsi[ctxt->plane].procunit_width, rsi[ctxt->plane].procunit_height,
&rtile_sgrproj_info->ep, rtile_sgrproj_info->xqd,
cm->rst_internal.tmpbuf);
plane_rsi->restoration_type[rtile_idx] = RESTORE_SGRPROJ;
err = try_restoration_tile(ctxt->src, ctxt->cpi, rsi, (1 << ctxt->plane),
ctxt->partial_frame, rtile_idx, 0, 0,
......@@ -610,7 +638,6 @@ static double search_sgrproj(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
for (int tile_col = 0; tile_col < cm->tile_cols; ++tile_col) {
SgrprojInfo ref_sgrproj_info;
set_default_sgrproj(&ref_sgrproj_info);
foreach_rtile_in_tile(&ctxt, tile_row, tile_col, search_sgrproj_for_rtile,
&ref_sgrproj_info);
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment