Commit 1330dfd1 authored by Debargha Mukherjee

Refactoring/simplification of buffers used for sgr

Includes miscellaneous cleanups, test fixes, and code reorganization
for loop-restoration components.

Change-Id: I5b2e6419234d945e6f4344b22636119b50df4054
parent e168a783
......@@ -628,7 +628,7 @@ if (aom_config("CONFIG_LOOP_RESTORATION") eq "yes") {
add_proto qw/void apply_selfguided_restoration/, "uint8_t *dat, int width, int height, int stride, int eps, int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf";
specialize qw/apply_selfguided_restoration sse4_1/;
add_proto qw/void av1_selfguided_restoration/, "uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps, int32_t *tmpbuf";
add_proto qw/void av1_selfguided_restoration/, "uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps";
specialize qw/av1_selfguided_restoration sse4_1/;
add_proto qw/void av1_highpass_filter/, "uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps";
......@@ -638,7 +638,7 @@ if (aom_config("CONFIG_LOOP_RESTORATION") eq "yes") {
add_proto qw/void apply_selfguided_restoration_highbd/, "uint16_t *dat, int width, int height, int stride, int bit_depth, int eps, int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf";
specialize qw/apply_selfguided_restoration_highbd sse4_1/;
add_proto qw/void av1_selfguided_restoration_highbd/, "uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps, int32_t *tmpbuf";
add_proto qw/void av1_selfguided_restoration_highbd/, "uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps";
specialize qw/av1_selfguided_restoration_highbd sse4_1/;
add_proto qw/void av1_highpass_filter_highbd/, "uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps";
......
This diff is collapsed.
......@@ -25,10 +25,28 @@ extern "C" {
#define RINT(x) ((x) < 0 ? (int)((x)-0.5) : (int)((x) + 0.5))
#define RESTORATION_PROC_UNIT_SIZE 64
// Determines line buffer requirement for LR. Should be set at the max
// of SGRPROJ_BORDER_VERT and WIENER_BORDER_VERT
#define RESTORATION_BORDER_VERT 0
#define RESTORATION_BORDER_HORZ 3 // Do not change this
#define SGRPROJ_BORDER_VERT 0 // Vertical border used for Sgr
#define SGRPROJ_BORDER_HORZ 2 // Horizontal border used for Sgr
#define WIENER_BORDER_VERT 0 // Vertical border used for Wiener
#define WIENER_HALFWIN 3
#define WIENER_BORDER_HORZ (WIENER_HALFWIN) // Horizontal border for Wiener
// RESTORATION_BORDER_VERT determines the line buffer requirement for LR.
// It should be set to the max of SGRPROJ_BORDER_VERT and WIENER_BORDER_VERT.
// Note that the line buffer needed is twice the value of this macro.
#if SGRPROJ_BORDER_VERT >= WIENER_BORDER_VERT
#define RESTORATION_BORDER_VERT (SGRPROJ_BORDER_VERT)
#else
#define RESTORATION_BORDER_VERT (WIENER_BORDER_VERT)
#endif // SGRPROJ_BORDER_VERT >= WIENER_BORDER_VERT
#if SGRPROJ_BORDER_HORZ >= WIENER_BORDER_HORZ
#define RESTORATION_BORDER_HORZ (SGRPROJ_BORDER_HORZ)
#else
#define RESTORATION_BORDER_HORZ (WIENER_BORDER_HORZ)
#endif // SGRPROJ_BORDER_HORZ >= WIENER_BORDER_HORZ
// Pad up to 20 more (much less may actually be needed)
#define RESTORATION_PADDING 20
......@@ -39,28 +57,20 @@ extern "C" {
RESTORATION_PADDING))
#define RESTORATION_TILESIZE_MAX 256
#define RESTORATION_TILEPELS_MAX \
(RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_HORZ) * \
(RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_VERT)
// 4 32-bit buffers needed for the filter:
// 2 for the restored versions of the frame and
// 2 for each restoration operation
#define SGRPROJ_OUTBUF_SIZE \
((RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_VERT) * \
(RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_HORZ + 16))
#define SGRPROJ_TMPBUF_SIZE \
(RESTORATION_TILEPELS_MAX * 2 * sizeof(int32_t) + \
SGRPROJ_OUTBUF_SIZE * 3 * sizeof(int32_t) + 2 * RESTORATION_PROC_UNIT_PELS)
#define RESTORATION_TILEPELS_MAX \
((RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_HORZ + 16) * \
(RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_VERT))
// Two 32-bit buffers needed for the restored versions from two filters
// TODO(debargha, rupert): Refactor to not need the large tilesize to be stored
// on the decoder side.
#define SGRPROJ_TMPBUF_SIZE (RESTORATION_TILEPELS_MAX * 2 * sizeof(int32_t))
#define SGRPROJ_EXTBUF_SIZE (0)
#define SGRPROJ_PARAMS_BITS 4
#define SGRPROJ_PARAMS (1 << SGRPROJ_PARAMS_BITS)
#define USE_HIGHPASS_IN_SGRPROJ 0
#define SGRPROJ_BORDER_VERT 0 // Vertical border used for sgr
#define SGRPROJ_BORDER_HORZ 2 // Horizontal border used for sgr
// Precision bits for projection
#define SGRPROJ_PRJ_BITS 7
// Restoration precision bits generated higher than source before projection
......@@ -85,15 +95,12 @@ extern "C" {
#define SGRPROJ_BITS (SGRPROJ_PRJ_BITS * 2 + SGRPROJ_PARAMS_BITS)
#define MAX_RADIUS 3 // Only 1, 2, 3 allowed
#define MAX_RADIUS 2 // Only 1, 2, 3 allowed
#define MAX_EPS 80 // Max value of eps
#define MAX_NELEM ((2 * MAX_RADIUS + 1) * (2 * MAX_RADIUS + 1))
#define SGRPROJ_MTABLE_BITS 20
#define SGRPROJ_RECIP_BITS 12
#define WIENER_HALFWIN 3
#define WIENER_BORDER_HORZ (WIENER_HALFWIN)
#define WIENER_BORDER_VERT 0
#define WIENER_HALFWIN1 (WIENER_HALFWIN + 1)
#define WIENER_WIN (2 * WIENER_HALFWIN + 1)
#define WIENER_WIN2 ((WIENER_WIN) * (WIENER_WIN))
......@@ -268,9 +275,11 @@ int av1_alloc_restoration_struct(struct AV1Common *cm,
int height);
void av1_free_restoration_struct(RestorationInfo *rst_info);
void extend_frame(uint8_t *data, int width, int height, int stride);
void extend_frame(uint8_t *data, int width, int height, int stride,
int border_horz, int border_vert);
#if CONFIG_HIGHBITDEPTH
void extend_frame_highbd(uint16_t *data, int width, int height, int stride);
void extend_frame_highbd(uint16_t *data, int width, int height, int stride,
int border_horz, int border_vert);
#endif // CONFIG_HIGHBITDEPTH
void decode_xq(int *xqd, int *xq);
void av1_loop_restoration_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
......
......@@ -664,17 +664,18 @@ static void selfguided_restoration_3_h(int32_t *A, int32_t *B, int width,
void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height,
int dgd_stride, int32_t *dst,
int dst_stride, int r, int eps,
int32_t *tmpbuf) {
int dst_stride, int r, int eps) {
const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
int32_t *A = tmpbuf;
int32_t *B = A + SGRPROJ_OUTBUF_SIZE;
int32_t A_[RESTORATION_PROC_UNIT_PELS];
int32_t B_[RESTORATION_PROC_UNIT_PELS];
int32_t *A = A_;
int32_t *B = B_;
int i, j;
// Adjusting the stride of A and B here appears to avoid bad cache effects,
// leading to a significant speed improvement.
// We also align the stride to a multiple of 16 bytes for efficiency.
int buf_stride = ((width + 3) & ~3) + 16;
int buf_stride = ((width_ext + 3) & ~3) + 16;
// Don't filter tiles with dimensions < 5 on any axis
if ((width < 5) || (height < 5)) return;
......@@ -1059,7 +1060,6 @@ void apply_selfguided_restoration_sse4_1(uint8_t *dat, int width, int height,
int xq[2];
int32_t *flt1 = tmpbuf;
int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX;
int i, j;
assert(width * height <= RESTORATION_TILEPELS_MAX);
#if USE_HIGHPASS_IN_SGRPROJ
......@@ -1067,12 +1067,10 @@ void apply_selfguided_restoration_sse4_1(uint8_t *dat, int width, int height,
sgr_params[eps].corner, sgr_params[eps].edge);
#else
av1_selfguided_restoration_sse4_1(dat, width, height, stride, flt1, width,
sgr_params[eps].r1, sgr_params[eps].e1,
tmpbuf2);
sgr_params[eps].r1, sgr_params[eps].e1);
#endif // USE_HIGHPASS_IN_SGRPROJ
av1_selfguided_restoration_sse4_1(dat, width, height, stride, flt2, width,
sgr_params[eps].r2, sgr_params[eps].e2,
tmpbuf2);
sgr_params[eps].r2, sgr_params[eps].e2);
decode_xq(xqd, xq);
__m128i xq0 = _mm_set1_epi32(xq[0]);
......@@ -1374,17 +1372,18 @@ static void highbd_selfguided_restoration_3_v(uint16_t *src, int width,
void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width,
int height, int dgd_stride,
int32_t *dst, int dst_stride,
int bit_depth, int r, int eps,
int32_t *tmpbuf) {
int bit_depth, int r, int eps) {
const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
int32_t *A = tmpbuf;
int32_t *B = A + SGRPROJ_OUTBUF_SIZE;
int32_t A_[RESTORATION_PROC_UNIT_PELS];
int32_t B_[RESTORATION_PROC_UNIT_PELS];
int32_t *A = A_;
int32_t *B = B_;
int i, j;
// Adjusting the stride of A and B here appears to avoid bad cache effects,
// leading to a significant speed improvement.
// We also align the stride to a multiple of 16 bytes for efficiency.
int buf_stride = ((width + 3) & ~3) + 16;
int buf_stride = ((width_ext + 3) & ~3) + 16;
// Don't filter tiles with dimensions < 5 on any axis
if ((width < 5) || (height < 5)) return;
......@@ -1741,7 +1740,6 @@ void apply_selfguided_restoration_highbd_sse4_1(
int xq[2];
int32_t *flt1 = tmpbuf;
int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX;
int i, j;
assert(width * height <= RESTORATION_TILEPELS_MAX);
#if USE_HIGHPASS_IN_SGRPROJ
......@@ -1751,11 +1749,11 @@ void apply_selfguided_restoration_highbd_sse4_1(
#else
av1_selfguided_restoration_highbd_sse4_1(dat, width, height, stride, flt1,
width, bit_depth, sgr_params[eps].r1,
sgr_params[eps].e1, tmpbuf2);
sgr_params[eps].e1);
#endif // USE_HIGHPASS_IN_SGRPROJ
av1_selfguided_restoration_highbd_sse4_1(dat, width, height, stride, flt2,
width, bit_depth, sgr_params[eps].r2,
sgr_params[eps].e2, tmpbuf2);
sgr_params[eps].e2);
decode_xq(xqd, xq);
__m128i xq0 = _mm_set1_epi32(xq[0]);
......
......@@ -29,9 +29,9 @@
#include "av1/encoder/av1_quantize.h"
#include "av1/encoder/encoder.h"
#include "av1/encoder/mathutils.h"
#include "av1/encoder/picklpf.h"
#include "av1/encoder/pickrst.h"
#include "av1/encoder/mathutils.h"
// When set to RESTORE_WIENER or RESTORE_SGRPROJ only those are allowed.
// When set to RESTORE_TYPES we allow switchable.
......@@ -354,7 +354,6 @@ static void search_selfguided_restoration(uint8_t *dat8, int width, int height,
int32_t *rstbuf) {
int32_t *flt1 = rstbuf;
int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX;
int ep, bestep = 0;
int64_t err, besterr = -1;
int exqd[2], bestxqd[2] = { 0, 0 };
......@@ -387,11 +386,11 @@ static void search_selfguided_restoration(uint8_t *dat8, int width, int height,
#else
av1_selfguided_restoration_highbd(
dat_p, w, h, dat_stride, flt1_p, flt1_stride, bit_depth,
sgr_params[ep].r1, sgr_params[ep].e1, tmpbuf2);
sgr_params[ep].r1, sgr_params[ep].e1);
#endif // USE_HIGHPASS_IN_SGRPROJ
av1_selfguided_restoration_highbd(
dat_p, w, h, dat_stride, flt2_p, flt2_stride, bit_depth,
sgr_params[ep].r2, sgr_params[ep].e2, tmpbuf2);
sgr_params[ep].r2, sgr_params[ep].e2);
}
} else {
#endif
......@@ -407,12 +406,11 @@ static void search_selfguided_restoration(uint8_t *dat8, int width, int height,
sgr_params[ep].corner, sgr_params[ep].edge);
#else
av1_selfguided_restoration(dat_p, w, h, dat_stride, flt1_p, flt1_stride,
sgr_params[ep].r1, sgr_params[ep].e1,
tmpbuf2);
sgr_params[ep].r1, sgr_params[ep].e1);
#endif // USE_HIGHPASS_IN_SGRPROJ
av1_selfguided_restoration(dat_p, w, h, dat_stride, flt2_p,
flt2_stride, sgr_params[ep].r2,
sgr_params[ep].e2, tmpbuf2);
sgr_params[ep].e2);
}
#if CONFIG_HIGHBITDEPTH
}
......@@ -640,11 +638,12 @@ static double search_sgrproj(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
#if CONFIG_HIGHBITDEPTH
if (cm->use_highbitdepth)
extend_frame_highbd(CONVERT_TO_SHORTPTR(ctxt.dgd_buffer), ctxt.plane_width,
ctxt.plane_height, ctxt.dgd_stride);
ctxt.plane_height, ctxt.dgd_stride, SGRPROJ_BORDER_HORZ,
SGRPROJ_BORDER_VERT);
else
#endif
extend_frame(ctxt.dgd_buffer, ctxt.plane_width, ctxt.plane_height,
ctxt.dgd_stride);
ctxt.dgd_stride, SGRPROJ_BORDER_HORZ, SGRPROJ_BORDER_VERT);
for (int tile_row = 0; tile_row < cm->tile_rows; ++tile_row) {
for (int tile_col = 0; tile_col < cm->tile_cols; ++tile_col) {
......@@ -1242,14 +1241,17 @@ static double search_wiener(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
AV1_COMMON *const cm = &cpi->common;
// Construct a (WIENER_HALFWIN)-pixel border around the frame
// Note: this border is used to gather stats even though the actual filter
// may use less border on the top/bottom of a processing unit.
#if CONFIG_HIGHBITDEPTH
if (cm->use_highbitdepth)
extend_frame_highbd(CONVERT_TO_SHORTPTR(ctxt.dgd_buffer), ctxt.plane_width,
ctxt.plane_height, ctxt.dgd_stride);
ctxt.plane_height, ctxt.dgd_stride, WIENER_HALFWIN,
WIENER_HALFWIN);
else
#endif
extend_frame(ctxt.dgd_buffer, ctxt.plane_width, ctxt.plane_height,
ctxt.dgd_stride);
ctxt.dgd_stride, WIENER_HALFWIN, WIENER_HALFWIN);
// Compute best Wiener filters for each rtile, one (encoder/decoder)
// tile at a time.
......
......@@ -40,9 +40,11 @@ class AV1SelfguidedFilterTest
protected:
void RunSpeedTest() {
const int pu_width = RESTORATION_PROC_UNIT_SIZE;
const int pu_height = RESTORATION_PROC_UNIT_SIZE;
const int width = 256, height = 256, stride = 288, out_stride = 288;
const int NUM_ITERS = 2000;
int i, j;
int i, j, k;
uint8_t *input_ =
(uint8_t *)aom_memalign(16, stride * (height + 32) * sizeof(uint8_t));
......@@ -72,8 +74,15 @@ class AV1SelfguidedFilterTest
std::clock_t start = std::clock();
for (i = 0; i < NUM_ITERS; ++i) {
apply_selfguided_restoration(input, width, height, stride, eps, xqd,
output, out_stride, tmpbuf);
for (k = 0; k < height; k += pu_height)
for (j = 0; j < width; j += pu_width) {
int w = AOMMIN(pu_width, width - j);
int h = AOMMIN(pu_height, height - k);
uint8_t *input_p = input + k * stride + j;
uint8_t *output_p = output + k * out_stride + j;
apply_selfguided_restoration(input_p, w, h, stride, eps, xqd,
output_p, out_stride, tmpbuf);
}
}
std::clock_t end = std::clock();
double elapsed = ((end - start) / (double)CLOCKS_PER_SEC);
......@@ -87,6 +96,8 @@ class AV1SelfguidedFilterTest
}
void RunCorrectnessTest() {
const int pu_width = RESTORATION_PROC_UNIT_SIZE;
const int pu_height = RESTORATION_PROC_UNIT_SIZE;
// Set the maximum width/height to test here. We actually test a small
// range of sizes *up to* this size, so that we can check, eg.,
// the behaviour on tiles which are not a multiple of 4 wide.
......@@ -127,10 +138,24 @@ class AV1SelfguidedFilterTest
int test_w = max_w - (i / 9);
int test_h = max_h - (i % 9);
for (k = 0; k < test_h; k += pu_height)
for (j = 0; j < test_w; j += pu_width) {
int w = AOMMIN(pu_width, test_w - j);
int h = AOMMIN(pu_height, test_h - k);
uint8_t *input_p = input + k * stride + j;
uint8_t *output_p = output + k * out_stride + j;
uint8_t *output2_p = output2 + k * out_stride + j;
apply_selfguided_restoration(input_p, w, h, stride, eps, xqd,
output_p, out_stride, tmpbuf);
apply_selfguided_restoration_c(input_p, w, h, stride, eps, xqd,
output2_p, out_stride, tmpbuf);
}
/*
apply_selfguided_restoration(input, test_w, test_h, stride, eps, xqd,
output, out_stride, tmpbuf);
apply_selfguided_restoration_c(input, test_w, test_h, stride, eps, xqd,
output2, out_stride, tmpbuf);
*/
for (j = 0; j < test_h; ++j)
for (k = 0; k < test_w; ++k) {
ASSERT_EQ(output[j * out_stride + k], output2[j * out_stride + k]);
......@@ -167,9 +192,11 @@ class AV1HighbdSelfguidedFilterTest
protected:
void RunSpeedTest() {
const int pu_width = RESTORATION_PROC_UNIT_SIZE;
const int pu_height = RESTORATION_PROC_UNIT_SIZE;
const int width = 256, height = 256, stride = 288, out_stride = 288;
const int NUM_ITERS = 2000;
int i, j;
int i, j, k;
int bit_depth = GET_PARAM(0);
int mask = (1 << bit_depth) - 1;
......@@ -201,9 +228,16 @@ class AV1HighbdSelfguidedFilterTest
std::clock_t start = std::clock();
for (i = 0; i < NUM_ITERS; ++i) {
apply_selfguided_restoration_highbd(input, width, height, stride,
bit_depth, eps, xqd, output,
out_stride, tmpbuf);
for (k = 0; k < height; k += pu_height)
for (j = 0; j < width; j += pu_width) {
int w = AOMMIN(pu_width, width - j);
int h = AOMMIN(pu_height, height - k);
uint16_t *input_p = input + k * stride + j;
uint16_t *output_p = output + k * out_stride + j;
apply_selfguided_restoration_highbd(input_p, w, h, stride, bit_depth,
eps, xqd, output_p, out_stride,
tmpbuf);
}
}
std::clock_t end = std::clock();
double elapsed = ((end - start) / (double)CLOCKS_PER_SEC);
......@@ -217,6 +251,8 @@ class AV1HighbdSelfguidedFilterTest
}
void RunCorrectnessTest() {
const int pu_width = RESTORATION_PROC_UNIT_SIZE;
const int pu_height = RESTORATION_PROC_UNIT_SIZE;
// Set the maximum width/height to test here. We actually test a small
// range of sizes *up to* this size, so that we can check, eg.,
// the behaviour on tiles which are not a multiple of 4 wide.
......@@ -259,12 +295,29 @@ class AV1HighbdSelfguidedFilterTest
int test_w = max_w - (i / 9);
int test_h = max_h - (i % 9);
for (k = 0; k < test_h; k += pu_height)
for (j = 0; j < test_w; j += pu_width) {
int w = AOMMIN(pu_width, test_w - j);
int h = AOMMIN(pu_height, test_h - k);
uint16_t *input_p = input + k * stride + j;
uint16_t *output_p = output + k * out_stride + j;
uint16_t *output2_p = output2 + k * out_stride + j;
apply_selfguided_restoration_highbd(input_p, w, h, stride, bit_depth,
eps, xqd, output_p, out_stride,
tmpbuf);
apply_selfguided_restoration_highbd_c(input_p, w, h, stride,
bit_depth, eps, xqd, output2_p,
out_stride, tmpbuf);
}
/*
apply_selfguided_restoration_highbd(input, test_w, test_h, stride,
bit_depth, eps, xqd, output,
out_stride, tmpbuf);
apply_selfguided_restoration_highbd_c(input, test_w, test_h, stride,
bit_depth, eps, xqd, output2,
out_stride, tmpbuf);
*/
for (j = 0; j < test_h; ++j)
for (k = 0; k < test_w; ++k)
ASSERT_EQ(output[j * out_stride + k], output2[j * out_stride + k]);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment