Commit 3a0df186 authored by David Barker, committed by Debargha Mukherjee
Browse files

Simplify buffer management for self-guided restoration filter

* Remove some unused variables
* Reduce need for casts by typing intermediate buffers appropriately
* Avoid copying data which is never modified; use the original data
  instead.
* Reduce number of intermediate buffers required, saving allocations
  of 576KiB in the decoder and ~1MiB in the encoder

No effect on performance

Change-Id: I55243904dd8e818fb6d43fa431903736475d23ff
parent 2cc057cf
......@@ -91,7 +91,7 @@ void av1_free_ref_frame_buffers(BufferPool *pool) {
void av1_alloc_restoration_buffers(AV1_COMMON *cm) {
av1_alloc_restoration_struct(&cm->rst_info, cm->width, cm->height);
cm->rst_internal.tmpbuf =
(uint8_t *)aom_realloc(cm->rst_internal.tmpbuf, RESTORATION_TMPBUF_SIZE);
(int32_t *)aom_realloc(cm->rst_internal.tmpbuf, RESTORATION_TMPBUF_SIZE);
if (cm->rst_internal.tmpbuf == NULL)
aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
"Failed to allocate internal tmpbuf for restoration");
......
......@@ -328,8 +328,9 @@ void decode_xq(int *xqd, int *xq) {
#define APPROXIMATE_SGR 1
void av1_selfguided_restoration(int32_t *dgd, int width, int height, int stride,
int bit_depth, int r, int eps, void *tmpbuf) {
int32_t *A = (int32_t *)tmpbuf;
int bit_depth, int r, int eps,
int32_t *tmpbuf) {
int32_t *A = tmpbuf;
int32_t *B = A + RESTORATION_TILEPELS_MAX;
int32_t *T = B + RESTORATION_TILEPELS_MAX;
int8_t num[RESTORATION_TILEPELS_MAX];
......@@ -498,15 +499,15 @@ void av1_selfguided_restoration(int32_t *dgd, int width, int height, int stride,
static void apply_selfguided_restoration(uint8_t *dat, int width, int height,
int stride, int bit_depth, int eps,
int *xqd, uint8_t *dst, int dst_stride,
void *tmpbuf) {
int32_t *tmpbuf) {
int xq[2];
int32_t *flt1 = (int32_t *)tmpbuf;
int32_t *flt1 = tmpbuf;
int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
uint8_t *tmpbuf2 = (uint8_t *)(flt2 + RESTORATION_TILEPELS_MAX);
int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX;
int i, j;
assert(width * height <= RESTORATION_TILEPELS_MAX);
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
assert(i * width + j < RESTORATION_TILEPELS_MAX);
flt1[i * width + j] = dat[i * stride + j];
flt2[i * width + j] = dat[i * stride + j];
}
......@@ -540,9 +541,6 @@ static void loop_sgrproj_filter_tile(uint8_t *data, int tile_idx, int width,
const int tile_height = rst->tile_height >> rst->subsampling_y;
int h_start, h_end, v_start, v_end;
uint8_t *data_p, *dst_p;
uint8_t *dat = (uint8_t *)rst->tmpbuf;
uint8_t *tmpbuf =
(uint8_t *)rst->tmpbuf + RESTORATION_TILEPELS_MAX * sizeof(*dat);
if (rst->rsi->sgrproj_info[tile_idx].level == 0) {
loop_copy_tile(data, tile_idx, 0, 0, width, height, stride, rst, dst,
......@@ -557,7 +555,7 @@ static void loop_sgrproj_filter_tile(uint8_t *data, int tile_idx, int width,
apply_selfguided_restoration(data_p, h_end - h_start, v_end - v_start, stride,
8, rst->rsi->sgrproj_info[tile_idx].ep,
rst->rsi->sgrproj_info[tile_idx].xqd, dst_p,
dst_stride, tmpbuf);
dst_stride, rst->tmpbuf);
}
static void loop_sgrproj_filter(uint8_t *data, int width, int height,
......@@ -814,15 +812,13 @@ static void loop_wiener_filter_highbd(uint8_t *data8, int width, int height,
}
}
static void apply_selfguided_restoration_highbd(uint16_t *dat, int width,
int height, int stride,
int bit_depth, int eps,
int *xqd, uint16_t *dst,
int dst_stride, void *tmpbuf) {
static void apply_selfguided_restoration_highbd(
uint16_t *dat, int width, int height, int stride, int bit_depth, int eps,
int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf) {
int xq[2];
int32_t *flt1 = (int32_t *)tmpbuf;
int32_t *flt1 = tmpbuf;
int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
uint8_t *tmpbuf2 = (uint8_t *)(flt2 + RESTORATION_TILEPELS_MAX);
int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX;
int i, j;
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
......@@ -861,9 +857,6 @@ static void loop_sgrproj_filter_tile_highbd(uint16_t *data, int tile_idx,
const int tile_height = rst->tile_height >> rst->subsampling_y;
int h_start, h_end, v_start, v_end;
uint16_t *data_p, *dst_p;
uint16_t *dat = (uint16_t *)rst->tmpbuf;
uint8_t *tmpbuf =
(uint8_t *)rst->tmpbuf + RESTORATION_TILEPELS_MAX * sizeof(*dat);
if (rst->rsi->sgrproj_info[tile_idx].level == 0) {
loop_copy_tile_highbd(data, tile_idx, 0, 0, width, height, stride, rst, dst,
......@@ -878,7 +871,7 @@ static void loop_sgrproj_filter_tile_highbd(uint16_t *data, int tile_idx,
apply_selfguided_restoration_highbd(
data_p, h_end - h_start, v_end - v_start, stride, bit_depth,
rst->rsi->sgrproj_info[tile_idx].ep, rst->rsi->sgrproj_info[tile_idx].xqd,
dst_p, dst_stride, tmpbuf);
dst_p, dst_stride, rst->tmpbuf);
}
static void loop_sgrproj_filter_highbd(uint8_t *data8, int width, int height,
......
......@@ -40,12 +40,20 @@ extern "C" {
sqrt(((1 << (DOMAINTXFMRF_ITERS * 2)) - 1) * 2.0 / 3.0)
// A single 32 bit buffer needed for the filter
#define DOMAINTXFMRF_TMPBUF_SIZE (RESTORATION_TILEPELS_MAX * sizeof(int32_t))
// One extra buffer needed in encoder, which is either 8-bit or 16-bit
// depending on the video bit depth.
#if CONFIG_AOM_HIGHBITDEPTH
#define DOMAINTXFMRF_EXTBUF_SIZE (RESTORATION_TILEPELS_MAX * sizeof(uint16_t))
#else
#define DOMAINTXFMRF_EXTBUF_SIZE (RESTORATION_TILEPELS_MAX * sizeof(uint8_t))
#endif
#define DOMAINTXFMRF_BITS (DOMAINTXFMRF_PARAMS_BITS)
// 6 highprecision buffers needed for the filter:
// 1 for the degraded frame, 2 for the restored versions and
// 5 32-bit buffers needed for the filter:
// 2 for the restored versions of the frame and
// 3 for each restoration operation
#define SGRPROJ_TMPBUF_SIZE (RESTORATION_TILEPELS_MAX * 6 * sizeof(int32_t))
#define SGRPROJ_TMPBUF_SIZE (RESTORATION_TILEPELS_MAX * 5 * sizeof(int32_t))
#define SGRPROJ_EXTBUF_SIZE (0)
#define SGRPROJ_PARAMS_BITS 3
#define SGRPROJ_PARAMS (1 << SGRPROJ_PARAMS_BITS)
......@@ -69,6 +77,7 @@ extern "C" {
#define WIENER_WIN (2 * WIENER_HALFWIN + 1)
#define WIENER_WIN2 ((WIENER_WIN) * (WIENER_WIN))
#define WIENER_TMPBUF_SIZE (0)
#define WIENER_EXTBUF_SIZE (0)
#define WIENER_FILT_PREC_BITS 7
#define WIENER_FILT_STEP (1 << WIENER_FILT_PREC_BITS)
......@@ -101,6 +110,8 @@ extern "C" {
// Max of SGRPROJ_TMPBUF_SIZE, DOMAINTXFMRF_TMPBUF_SIZE, WIENER_TMPBUF_SIZE
#define RESTORATION_TMPBUF_SIZE (SGRPROJ_TMPBUF_SIZE)
// Max of SGRPROJ_EXTBUF_SIZE, DOMAINTXFMRF_EXTBUF_SIZE, WIENER_EXTBUF_SIZE
#define RESTORATION_EXTBUF_SIZE (DOMAINTXFMRF_EXTBUF_SIZE)
typedef struct {
int level;
......@@ -144,7 +155,7 @@ typedef struct {
int ntiles;
int tile_width, tile_height;
int nhtiles, nvtiles;
uint8_t *tmpbuf;
int32_t *tmpbuf;
} RestorationInternal;
static INLINE int get_rest_tilesize(int width, int height) {
......@@ -211,7 +222,7 @@ int av1_alloc_restoration_struct(RestorationInfo *rst_info, int width,
void av1_free_restoration_struct(RestorationInfo *rst_info);
void av1_selfguided_restoration(int32_t *dgd, int width, int height, int stride,
int bit_depth, int r, int eps, void *tmpbuf);
int bit_depth, int r, int eps, int32_t *tmpbuf);
void av1_domaintxfmrf_restoration(uint8_t *dgd, int width, int height,
int stride, int param, uint8_t *dst,
int dst_stride, int32_t *tmpbuf);
......
......@@ -742,8 +742,8 @@ static void alloc_util_frame_buffers(AV1_COMP *cpi) {
NULL, NULL))
aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
"Failed to allocate trial restored frame buffer");
cpi->extra_rstbuf = (uint8_t *)aom_realloc(
cpi->extra_rstbuf, RESTORATION_TILEPELS_MAX * sizeof(int32_t));
cpi->extra_rstbuf =
(uint8_t *)aom_realloc(cpi->extra_rstbuf, RESTORATION_EXTBUF_SIZE);
if (!cpi->extra_rstbuf)
aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
"Failed to allocate extra rstbuf for restoration");
......
......@@ -404,7 +404,7 @@ typedef struct AV1_COMP {
#if CONFIG_LOOP_RESTORATION
YV12_BUFFER_CONFIG last_frame_db;
YV12_BUFFER_CONFIG trial_frame_rst;
uint8_t *extra_rstbuf; // Size RESTORATION_TILEPELS_MAX at highest precision
uint8_t *extra_rstbuf; // Extra buffers used in restoration search
RestorationInfo rst_search; // Used for encoder side search
#endif // CONFIG_LOOP_RESTORATION
......
......@@ -121,35 +121,55 @@ static int64_t try_restoration_frame(const YV12_BUFFER_CONFIG *src,
return filt_err;
}
static int64_t get_pixel_proj_error(int32_t *src, int width, int height,
int src_stride, int32_t *dgd,
int dgd_stride, int32_t *flt1,
int flt1_stride, int32_t *flt2,
int flt2_stride, int *xqd) {
static int64_t get_pixel_proj_error(uint8_t *src8, int width, int height,
int src_stride, uint8_t *dat8,
int dat_stride, int bit_depth,
int32_t *flt1, int flt1_stride,
int32_t *flt2, int flt2_stride, int *xqd) {
int i, j;
int64_t err = 0;
int xq[2];
decode_xq(xqd, xq);
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
const int32_t s = (int32_t)src[i * src_stride + j];
const int32_t u = (int32_t)dgd[i * dgd_stride + j];
const int32_t f1 = (int32_t)flt1[i * flt1_stride + j] - u;
const int32_t f2 = (int32_t)flt2[i * flt2_stride + j] - u;
const int64_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS);
const int32_t e =
ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) -
ROUND_POWER_OF_TWO(s, SGRPROJ_RST_BITS);
err += e * e;
if (bit_depth == 8) {
const uint8_t *src = src8;
const uint8_t *dat = dat8;
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
const int32_t u =
(int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
const int32_t f1 = (int32_t)flt1[i * flt1_stride + j] - u;
const int32_t f2 = (int32_t)flt2[i * flt2_stride + j] - u;
const int64_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS);
const int32_t e =
ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) -
src[i * src_stride + j];
err += e * e;
}
}
} else {
const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
const int32_t u =
(int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
const int32_t f1 = (int32_t)flt1[i * flt1_stride + j] - u;
const int32_t f2 = (int32_t)flt2[i * flt2_stride + j] - u;
const int64_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS);
const int32_t e =
ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) -
src[i * src_stride + j];
err += e * e;
}
}
}
return err;
}
static void get_proj_subspace(int32_t *src, int width, int height,
int src_stride, int32_t *dgd, int dgd_stride,
int32_t *flt1, int flt1_stride, int32_t *flt2,
int flt2_stride, int *xq) {
static void get_proj_subspace(uint8_t *src8, int width, int height,
int src_stride, uint8_t *dat8, int dat_stride,
int bit_depth, int32_t *flt1, int flt1_stride,
int32_t *flt2, int flt2_stride, int *xq) {
int i, j;
double H[2][2] = { { 0, 0 }, { 0, 0 } };
double C[2] = { 0, 0 };
......@@ -159,17 +179,39 @@ static void get_proj_subspace(int32_t *src, int width, int height,
xq[0] = -(1 << SGRPROJ_PRJ_BITS) / 4;
xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0];
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
const double u = (double)dgd[i * dgd_stride + j];
const double s = (double)src[i * src_stride + j] - u;
const double f1 = (double)flt1[i * flt1_stride + j] - u;
const double f2 = (double)flt2[i * flt2_stride + j] - u;
H[0][0] += f1 * f1;
H[1][1] += f2 * f2;
H[0][1] += f1 * f2;
C[0] += f1 * s;
C[1] += f2 * s;
if (bit_depth == 8) {
const uint8_t *src = src8;
const uint8_t *dat = dat8;
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
const double u = (double)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
const double s =
(double)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
const double f1 = (double)flt1[i * flt1_stride + j] - u;
const double f2 = (double)flt2[i * flt2_stride + j] - u;
H[0][0] += f1 * f1;
H[1][1] += f2 * f2;
H[0][1] += f1 * f2;
C[0] += f1 * s;
C[1] += f2 * s;
}
}
} else {
const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
const double u = (double)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
const double s =
(double)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
const double f1 = (double)flt1[i * flt1_stride + j] - u;
const double f2 = (double)flt2[i * flt2_stride + j] - u;
H[0][0] += f1 * f1;
H[1][1] += f2 * f2;
H[0][1] += f1 * f2;
C[0] += f1 * s;
C[1] += f2 * s;
}
}
}
H[0][0] /= size;
......@@ -196,33 +238,25 @@ void encode_xq(int *xq, int *xqd) {
static void search_selfguided_restoration(uint8_t *dat8, int width, int height,
int dat_stride, uint8_t *src8,
int src_stride, int bit_depth,
int *eps, int *xqd, void *srcbuf,
void *rstbuf) {
int32_t *srd = (int32_t *)srcbuf;
int32_t *dgd = (int32_t *)rstbuf;
int32_t *flt1 = dgd + RESTORATION_TILEPELS_MAX;
int *eps, int *xqd, int32_t *rstbuf) {
int32_t *flt1 = rstbuf;
int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
uint8_t *tmpbuf2 = (uint8_t *)(flt2 + RESTORATION_TILEPELS_MAX);
int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX;
int i, j, ep, bestep = 0;
int64_t err, besterr = -1;
int exqd[2], bestxqd[2] = { 0, 0 };
for (ep = 0; ep < SGRPROJ_PARAMS; ep++) {
int exq[2];
if (bit_depth > 8) {
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
flt1[i * width + j] = (int32_t)dat[i * dat_stride + j];
flt2[i * width + j] = (int32_t)dat[i * dat_stride + j];
dgd[i * width + j] = (int32_t)dat[i * dat_stride + j]
<< SGRPROJ_RST_BITS;
srd[i * width + j] = (int32_t)src[i * src_stride + j]
<< SGRPROJ_RST_BITS;
}
}
} else {
uint8_t *src = src8;
uint8_t *dat = dat8;
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
......@@ -230,8 +264,6 @@ static void search_selfguided_restoration(uint8_t *dat8, int width, int height,
const int l = i * dat_stride + j;
flt1[k] = (int32_t)dat[l];
flt2[k] = (int32_t)dat[l];
dgd[k] = (int32_t)dat[l] << SGRPROJ_RST_BITS;
srd[k] = (int32_t)src[i * src_stride + j] << SGRPROJ_RST_BITS;
}
}
}
......@@ -239,11 +271,12 @@ static void search_selfguided_restoration(uint8_t *dat8, int width, int height,
sgr_params[ep].r1, sgr_params[ep].e1, tmpbuf2);
av1_selfguided_restoration(flt2, width, height, width, bit_depth,
sgr_params[ep].r2, sgr_params[ep].e2, tmpbuf2);
get_proj_subspace(srd, width, height, width, dgd, width, flt1, width, flt2,
width, exq);
get_proj_subspace(src8, width, height, src_stride, dat8, dat_stride,
bit_depth, flt1, width, flt2, width, exq);
encode_xq(exq, exqd);
err = get_pixel_proj_error(srd, width, height, width, dgd, width, flt1,
width, flt2, width, exqd);
err =
get_pixel_proj_error(src8, width, height, src_stride, dat8, dat_stride,
bit_depth, flt1, width, flt2, width, exqd);
if (besterr == -1 || err < besterr) {
bestep = ep;
besterr = err;
......@@ -303,7 +336,7 @@ static double search_sgrproj(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
8,
#endif // CONFIG_AOM_HIGHBITDEPTH
&rsi->sgrproj_info[tile_idx].ep, rsi->sgrproj_info[tile_idx].xqd,
cpi->extra_rstbuf, cm->rst_internal.tmpbuf);
cm->rst_internal.tmpbuf);
rsi->sgrproj_info[tile_idx].level = 1;
err = try_restoration_tile(src, cpi, rsi, 1, partial_frame, tile_idx, 0, 0,
dst_frame);
......@@ -376,7 +409,7 @@ static void search_domaintxfmrf_restoration(uint8_t *dgd8, int width,
int height, int dgd_stride,
uint8_t *src8, int src_stride,
int bit_depth, int *sigma_r,
void *fltbuf, void *rstbuf) {
uint8_t *fltbuf, int32_t *tmpbuf) {
const int first_p_step = 8;
const int second_p_range = first_p_step >> 1;
const int second_p_step = 2;
......@@ -385,8 +418,7 @@ static void search_domaintxfmrf_restoration(uint8_t *dgd8, int width,
int p, best_p0, best_p = -1;
int64_t best_sse = INT64_MAX, sse;
if (bit_depth == 8) {
uint8_t *flt = (uint8_t *)fltbuf;
int32_t *tmpbuf = (int32_t *)rstbuf;
uint8_t *flt = fltbuf;
uint8_t *dgd = dgd8;
uint8_t *src = src8;
// First phase
......@@ -428,7 +460,6 @@ static void search_domaintxfmrf_restoration(uint8_t *dgd8, int width,
} else {
#if CONFIG_AOM_HIGHBITDEPTH
uint16_t *flt = (uint16_t *)fltbuf;
int32_t *tmpbuf = (int32_t *)rstbuf;
uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8);
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
// First phase
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment