Commit e168a783 authored by Debargha Mukherjee

Reduce/Eliminate line buffer for loop-restoration.

This patch forces the vertical filtering of the top and bottom
rows of a processing unit in the Wiener filter to use no more border
rows than the number set in the WIENER_BORDER_VERT macro.
This macro is currently set to 0 to eliminate the line buffer completely,
but it could be increased to 1 or 2 to use limited line buffers
if coding efficiency suffers too much with a 0-row line buffer.

Also, for the sgr filter we added the option of using overlapping
windows horizontally and vertically to improve coding efficiency.
The vertical border used is set by the SGRPROJ_BORDER_VERT
macro, while the horizontal border is set by the
SGRPROJ_BORDER_HORZ macro, currently 2, the max needed. We do not
recommend setting SGRPROJ_BORDER_HORZ below 2.

The overall line buffer requirement for LR is twice the max of
WIENER_BORDER_VERT and SGRPROJ_BORDER_VERT.
Currently both are set to 0, eliminating line buffers completely.

Also this patch extends borders consistently before CDEF / LR.

Change-Id: Ie58a98c784a0db547627b9cfcf55f018c30e8e79
parent db4ccd42
This diff is collapsed.
......@@ -25,24 +25,42 @@ extern "C" {
#define RINT(x) ((x) < 0 ? (int)((x)-0.5) : (int)((x) + 0.5))
#define RESTORATION_PROC_UNIT_SIZE 64
// Determines line buffer requirement for LR. Should be set at the max
// of SGRPROJ_BORDER_VERT and WIENER_BORDER_VERT
#define RESTORATION_BORDER_VERT 0
#define RESTORATION_BORDER_HORZ 3 // Do not change this
// Pad up to 20 more (much less may be needed)
#define RESTORATION_PADDING 20
#define RESTORATION_PROC_UNIT_PELS \
((RESTORATION_PROC_UNIT_SIZE + RESTORATION_BORDER_HORZ * 2 + \
RESTORATION_PADDING) * \
(RESTORATION_PROC_UNIT_SIZE + RESTORATION_BORDER_VERT * 2 + \
RESTORATION_PADDING))
#define RESTORATION_TILESIZE_MAX 256
#define RESTORATION_TILEPELS_MAX \
(RESTORATION_TILESIZE_MAX * RESTORATION_TILESIZE_MAX * 9 / 4)
#define RESTORATION_TILEPELS_MAX \
(RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_HORZ) * \
(RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_VERT)
// 4 32-bit buffers needed for the filter:
// 2 for the restored versions of the frame and
// 2 for each restoration operation
#define SGRPROJ_OUTBUF_SIZE \
((RESTORATION_TILESIZE_MAX * 3 / 2) * (RESTORATION_TILESIZE_MAX * 3 / 2 + 16))
#define SGRPROJ_OUTBUF_SIZE \
((RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_VERT) * \
(RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_HORZ + 16))
#define SGRPROJ_TMPBUF_SIZE \
(RESTORATION_TILEPELS_MAX * 2 * sizeof(int32_t) + \
SGRPROJ_OUTBUF_SIZE * 2 * sizeof(int32_t))
SGRPROJ_OUTBUF_SIZE * 3 * sizeof(int32_t) + 2 * RESTORATION_PROC_UNIT_PELS)
#define SGRPROJ_EXTBUF_SIZE (0)
#define SGRPROJ_PARAMS_BITS 4
#define SGRPROJ_PARAMS (1 << SGRPROJ_PARAMS_BITS)
#define USE_HIGHPASS_IN_SGRPROJ 0
#define SGRPROJ_BORDER_VERT 0 // Vertical border used for sgr
#define SGRPROJ_BORDER_HORZ 2 // Horizontal border used for sgr
// Precision bits for projection
#define SGRPROJ_PRJ_BITS 7
// Restoration precision bits generated higher than source before projection
......@@ -74,6 +92,8 @@ extern "C" {
#define SGRPROJ_RECIP_BITS 12
#define WIENER_HALFWIN 3
#define WIENER_BORDER_HORZ (WIENER_HALFWIN)
#define WIENER_BORDER_VERT 0
#define WIENER_HALFWIN1 (WIENER_HALFWIN + 1)
#define WIENER_WIN (2 * WIENER_HALFWIN + 1)
#define WIENER_WIN2 ((WIENER_WIN) * (WIENER_WIN))
......
......@@ -663,8 +663,11 @@ static void selfguided_restoration_3_h(int32_t *A, int32_t *B, int width,
}
void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height,
int stride, int32_t *dst, int dst_stride,
int r, int eps, int32_t *tmpbuf) {
int dgd_stride, int32_t *dst,
int dst_stride, int r, int eps,
int32_t *tmpbuf) {
const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
int32_t *A = tmpbuf;
int32_t *B = A + SGRPROJ_OUTBUF_SIZE;
int i, j;
......@@ -676,25 +679,31 @@ void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height,
// Don't filter tiles with dimensions < 5 on any axis
if ((width < 5) || (height < 5)) return;
uint8_t *dgd0 = dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ;
if (r == 1) {
selfguided_restoration_1_v(dgd, width, height, stride, A, B, buf_stride);
selfguided_restoration_1_h(A, B, width, height, buf_stride, eps, 8);
selfguided_restoration_1_v(dgd0, width_ext, height_ext, dgd_stride, A, B,
buf_stride);
selfguided_restoration_1_h(A, B, width_ext, height_ext, buf_stride, eps, 8);
} else if (r == 2) {
selfguided_restoration_2_v(dgd, width, height, stride, A, B, buf_stride);
selfguided_restoration_2_h(A, B, width, height, buf_stride, eps, 8);
selfguided_restoration_2_v(dgd0, width_ext, height_ext, dgd_stride, A, B,
buf_stride);
selfguided_restoration_2_h(A, B, width_ext, height_ext, buf_stride, eps, 8);
} else if (r == 3) {
selfguided_restoration_3_v(dgd, width, height, stride, A, B, buf_stride);
selfguided_restoration_3_h(A, B, width, height, buf_stride, eps, 8);
selfguided_restoration_3_v(dgd0, width_ext, height_ext, dgd_stride, A, B,
buf_stride);
selfguided_restoration_3_h(A, B, width_ext, height_ext, buf_stride, eps, 8);
} else {
assert(0);
}
A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
{
i = 0;
j = 0;
{
const int k = i * buf_stride + j;
const int l = i * stride + j;
const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k + buf_stride] +
......@@ -706,7 +715,7 @@ void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height,
}
for (j = 1; j < width - 1; ++j) {
const int k = i * buf_stride + j;
const int l = i * stride + j;
const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k + buf_stride] +
......@@ -719,7 +728,7 @@ void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height,
j = width - 1;
{
const int k = i * buf_stride + j;
const int l = i * stride + j;
const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k + buf_stride] +
......@@ -734,7 +743,7 @@ void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height,
j = 0;
{
const int k = i * buf_stride + j;
const int l = i * stride + j;
const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
......@@ -750,7 +759,7 @@ void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height,
// Vectorize the innermost loop
for (j = 1; j < width - 1; j += 4) {
const int k = i * buf_stride + j;
const int l = i * stride + j;
const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 5;
......@@ -803,7 +812,7 @@ void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height,
// (typically have 2 such pixels, but may have anywhere between 0 and 3)
for (; j < width - 1; ++j) {
const int k = i * buf_stride + j;
const int l = i * stride + j;
const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 5;
const int32_t a =
......@@ -825,7 +834,7 @@ void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height,
j = width - 1;
{
const int k = i * buf_stride + j;
const int l = i * stride + j;
const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
......@@ -844,7 +853,7 @@ void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height,
j = 0;
{
const int k = i * buf_stride + j;
const int l = i * stride + j;
const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k - buf_stride] +
......@@ -856,7 +865,7 @@ void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height,
}
for (j = 1; j < width - 1; ++j) {
const int k = i * buf_stride + j;
const int l = i * stride + j;
const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k - buf_stride] +
......@@ -869,7 +878,7 @@ void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height,
j = width - 1;
{
const int k = i * buf_stride + j;
const int l = i * stride + j;
const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k - buf_stride] +
......@@ -1363,10 +1372,12 @@ static void highbd_selfguided_restoration_3_v(uint16_t *src, int width,
}
void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width,
int height, int stride,
int height, int dgd_stride,
int32_t *dst, int dst_stride,
int bit_depth, int r, int eps,
int32_t *tmpbuf) {
const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
int32_t *A = tmpbuf;
int32_t *B = A + SGRPROJ_OUTBUF_SIZE;
int i, j;
......@@ -1378,28 +1389,34 @@ void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width,
// Don't filter tiles with dimensions < 5 on any axis
if ((width < 5) || (height < 5)) return;
uint16_t *dgd0 = dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ;
if (r == 1) {
highbd_selfguided_restoration_1_v(dgd, width, height, stride, A, B,
buf_stride);
selfguided_restoration_1_h(A, B, width, height, buf_stride, eps, bit_depth);
highbd_selfguided_restoration_1_v(dgd0, width_ext, height_ext, dgd_stride,
A, B, buf_stride);
selfguided_restoration_1_h(A, B, width_ext, height_ext, buf_stride, eps,
bit_depth);
} else if (r == 2) {
highbd_selfguided_restoration_2_v(dgd, width, height, stride, A, B,
buf_stride);
selfguided_restoration_2_h(A, B, width, height, buf_stride, eps, bit_depth);
highbd_selfguided_restoration_2_v(dgd0, width_ext, height_ext, dgd_stride,
A, B, buf_stride);
selfguided_restoration_2_h(A, B, width_ext, height_ext, buf_stride, eps,
bit_depth);
} else if (r == 3) {
highbd_selfguided_restoration_3_v(dgd, width, height, stride, A, B,
buf_stride);
selfguided_restoration_3_h(A, B, width, height, buf_stride, eps, bit_depth);
highbd_selfguided_restoration_3_v(dgd0, width_ext, height_ext, dgd_stride,
A, B, buf_stride);
selfguided_restoration_3_h(A, B, width_ext, height_ext, buf_stride, eps,
bit_depth);
} else {
assert(0);
}
A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
{
i = 0;
j = 0;
{
const int k = i * buf_stride + j;
const int l = i * stride + j;
const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k + buf_stride] +
......@@ -1411,7 +1428,7 @@ void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width,
}
for (j = 1; j < width - 1; ++j) {
const int k = i * buf_stride + j;
const int l = i * stride + j;
const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k + buf_stride] +
......@@ -1424,7 +1441,7 @@ void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width,
j = width - 1;
{
const int k = i * buf_stride + j;
const int l = i * stride + j;
const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k + buf_stride] +
......@@ -1439,7 +1456,7 @@ void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width,
j = 0;
{
const int k = i * buf_stride + j;
const int l = i * stride + j;
const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
......@@ -1455,7 +1472,7 @@ void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width,
// Vectorize the innermost loop
for (j = 1; j < width - 1; j += 4) {
const int k = i * buf_stride + j;
const int l = i * stride + j;
const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 5;
......@@ -1508,7 +1525,7 @@ void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width,
// (typically have 2 such pixels, but may have anywhere between 0 and 3)
for (; j < width - 1; ++j) {
const int k = i * buf_stride + j;
const int l = i * stride + j;
const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 5;
const int32_t a =
......@@ -1530,7 +1547,7 @@ void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width,
j = width - 1;
{
const int k = i * buf_stride + j;
const int l = i * stride + j;
const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
......@@ -1549,7 +1566,7 @@ void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width,
j = 0;
{
const int k = i * buf_stride + j;
const int l = i * stride + j;
const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = 3 * A[k] + 2 * A[k + 1] + 2 * A[k - buf_stride] +
......@@ -1561,7 +1578,7 @@ void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width,
}
for (j = 1; j < width - 1; ++j) {
const int k = i * buf_stride + j;
const int l = i * stride + j;
const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k - buf_stride] +
......@@ -1574,7 +1591,7 @@ void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width,
j = width - 1;
{
const int k = i * buf_stride + j;
const int l = i * stride + j;
const int l = i * dgd_stride + j;
const int m = i * dst_stride + j;
const int nb = 3;
const int32_t a = 3 * A[k] + 2 * A[k - 1] + 2 * A[k - buf_stride] +
......
......@@ -5358,6 +5358,7 @@ void av1_decode_frame(AV1Decoder *pbi, const uint8_t *data,
*p_data_end = decode_tiles(pbi, data + first_partition_size, data_end);
}
aom_extend_frame_borders(new_fb);
#if CONFIG_CDEF
if (!cm->skip_loop_filter && !cm->all_lossless) {
av1_cdef_frame(&pbi->cur_buf->buf, cm, &pbi->mb);
......
......@@ -4183,6 +4183,8 @@ static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);
#endif
}
aom_extend_frame_borders(cm->frame_to_show);
#if CONFIG_CDEF
if (is_lossless_requested(&cpi->oxcf)) {
cm->cdef_bits = 0;
......
......@@ -637,6 +637,15 @@ static double search_sgrproj(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
// Compute best Sgrproj filters for each rtile, one (encoder/decoder)
// tile at a time.
const AV1_COMMON *const cm = &cpi->common;
#if CONFIG_HIGHBITDEPTH
if (cm->use_highbitdepth)
extend_frame_highbd(CONVERT_TO_SHORTPTR(ctxt.dgd_buffer), ctxt.plane_width,
ctxt.plane_height, ctxt.dgd_stride);
else
#endif
extend_frame(ctxt.dgd_buffer, ctxt.plane_width, ctxt.plane_height,
ctxt.dgd_stride);
for (int tile_row = 0; tile_row < cm->tile_rows; ++tile_row) {
for (int tile_col = 0; tile_col < cm->tile_cols; ++tile_col) {
SgrprojInfo ref_sgrproj_info;
......
......@@ -40,18 +40,23 @@ class AV1SelfguidedFilterTest
protected:
void RunSpeedTest() {
const int w = 256, h = 256;
const int width = 256, height = 256, stride = 288, out_stride = 288;
const int NUM_ITERS = 2000;
int i, j;
uint8_t *input = (uint8_t *)aom_memalign(16, w * h * sizeof(uint8_t));
uint8_t *output = (uint8_t *)aom_memalign(16, w * h * sizeof(uint8_t));
uint8_t *input_ =
(uint8_t *)aom_memalign(16, stride * (height + 32) * sizeof(uint8_t));
uint8_t *output_ = (uint8_t *)aom_memalign(
16, out_stride * (height + 32) * sizeof(uint8_t));
int32_t *tmpbuf = (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE);
uint8_t *input = input_ + stride * 16 + 16;
uint8_t *output = output_ + out_stride * 16 + 16;
ACMRandom rnd(ACMRandom::DeterministicSeed());
for (i = 0; i < h; ++i)
for (j = 0; j < w; ++j) input[i * w + j] = rnd.Rand16() & 0xFF;
for (i = -16; i < height + 16; ++i)
for (j = -16; j < width + 16; ++j)
input[i * stride + j] = rnd.Rand16() & 0xFF;
int xqd[2] = {
SGRPROJ_PRJ_MIN0 +
......@@ -67,16 +72,17 @@ class AV1SelfguidedFilterTest
std::clock_t start = std::clock();
for (i = 0; i < NUM_ITERS; ++i) {
apply_selfguided_restoration(input, w, h, w, eps, xqd, output, w, tmpbuf);
apply_selfguided_restoration(input, width, height, stride, eps, xqd,
output, out_stride, tmpbuf);
}
std::clock_t end = std::clock();
double elapsed = ((end - start) / (double)CLOCKS_PER_SEC);
printf("%5d %dx%d blocks in %7.3fs = %7.3fus/block\n", NUM_ITERS, w, h,
elapsed, elapsed * 1000000. / NUM_ITERS);
printf("%5d %dx%d blocks in %7.3fs = %7.3fus/block\n", NUM_ITERS, width,
height, elapsed, elapsed * 1000000. / NUM_ITERS);
aom_free(input);
aom_free(output);
aom_free(input_);
aom_free(output_);
aom_free(tmpbuf);
}
......@@ -88,21 +94,26 @@ class AV1SelfguidedFilterTest
const int NUM_ITERS = 81;
int i, j, k;
uint8_t *input =
(uint8_t *)aom_memalign(16, stride * max_h * sizeof(uint8_t));
uint8_t *output =
(uint8_t *)aom_memalign(16, out_stride * max_h * sizeof(uint8_t));
uint8_t *output2 =
(uint8_t *)aom_memalign(16, out_stride * max_h * sizeof(uint8_t));
uint8_t *input_ =
(uint8_t *)aom_memalign(16, stride * (max_h + 32) * sizeof(uint8_t));
uint8_t *output_ = (uint8_t *)aom_memalign(
16, out_stride * (max_h + 32) * sizeof(uint8_t));
uint8_t *output2_ = (uint8_t *)aom_memalign(
16, out_stride * (max_h + 32) * sizeof(uint8_t));
int32_t *tmpbuf = (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE);
uint8_t *input = input_ + stride * 16 + 16;
uint8_t *output = output_ + out_stride * 16 + 16;
uint8_t *output2 = output2_ + out_stride * 16 + 16;
ACMRandom rnd(ACMRandom::DeterministicSeed());
av1_loop_restoration_precal();
for (i = 0; i < NUM_ITERS; ++i) {
for (j = 0; j < max_h; ++j)
for (k = 0; k < max_w; ++k) input[j * stride + k] = rnd.Rand16() & 0xFF;
for (j = -16; j < max_h + 16; ++j)
for (k = -16; k < max_w + 16; ++k)
input[j * stride + k] = rnd.Rand16() & 0xFF;
int xqd[2] = {
SGRPROJ_PRJ_MIN0 +
......@@ -121,13 +132,14 @@ class AV1SelfguidedFilterTest
apply_selfguided_restoration_c(input, test_w, test_h, stride, eps, xqd,
output2, out_stride, tmpbuf);
for (j = 0; j < test_h; ++j)
for (k = 0; k < test_w; ++k)
for (k = 0; k < test_w; ++k) {
ASSERT_EQ(output[j * out_stride + k], output2[j * out_stride + k]);
}
}
aom_free(input);
aom_free(output);
aom_free(output2);
aom_free(input_);
aom_free(output_);
aom_free(output2_);
aom_free(tmpbuf);
}
};
......@@ -155,20 +167,25 @@ class AV1HighbdSelfguidedFilterTest
protected:
void RunSpeedTest() {
const int w = 256, h = 256;
const int width = 256, height = 256, stride = 288, out_stride = 288;
const int NUM_ITERS = 2000;
int i, j;
int bit_depth = GET_PARAM(0);
int mask = (1 << bit_depth) - 1;
uint16_t *input = (uint16_t *)aom_memalign(16, w * h * sizeof(uint16_t));
uint16_t *output = (uint16_t *)aom_memalign(16, w * h * sizeof(uint16_t));
uint16_t *input_ =
(uint16_t *)aom_memalign(16, stride * (height + 32) * sizeof(uint16_t));
uint16_t *output_ = (uint16_t *)aom_memalign(
16, out_stride * (height + 32) * sizeof(uint16_t));
int32_t *tmpbuf = (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE);
uint16_t *input = input_ + stride * 16 + 16;
uint16_t *output = output_ + out_stride * 16 + 16;
ACMRandom rnd(ACMRandom::DeterministicSeed());
for (i = 0; i < h; ++i)
for (j = 0; j < w; ++j) input[i * w + j] = rnd.Rand16() & mask;
for (i = -16; i < height + 16; ++i)
for (j = -16; j < width + 16; ++j)
input[i * stride + j] = rnd.Rand16() & mask;
int xqd[2] = {
SGRPROJ_PRJ_MIN0 +
......@@ -184,17 +201,18 @@ class AV1HighbdSelfguidedFilterTest
std::clock_t start = std::clock();
for (i = 0; i < NUM_ITERS; ++i) {
apply_selfguided_restoration_highbd(input, w, h, w, bit_depth, eps, xqd,
output, w, tmpbuf);
apply_selfguided_restoration_highbd(input, width, height, stride,
bit_depth, eps, xqd, output,
out_stride, tmpbuf);
}
std::clock_t end = std::clock();
double elapsed = ((end - start) / (double)CLOCKS_PER_SEC);
printf("%5d %dx%d blocks in %7.3fs = %7.3fus/block\n", NUM_ITERS, w, h,
elapsed, elapsed * 1000000. / NUM_ITERS);
printf("%5d %dx%d blocks in %7.3fs = %7.3fus/block\n", NUM_ITERS, width,
height, elapsed, elapsed * 1000000. / NUM_ITERS);
aom_free(input);
aom_free(output);
aom_free(input_);
aom_free(output_);
aom_free(tmpbuf);
}
......@@ -208,21 +226,26 @@ class AV1HighbdSelfguidedFilterTest
int bit_depth = GET_PARAM(0);
int mask = (1 << bit_depth) - 1;
uint16_t *input =
(uint16_t *)aom_memalign(16, stride * max_h * sizeof(uint16_t));
uint16_t *output =
(uint16_t *)aom_memalign(16, out_stride * max_h * sizeof(uint16_t));
uint16_t *output2 =
(uint16_t *)aom_memalign(16, out_stride * max_h * sizeof(uint16_t));
uint16_t *input_ =
(uint16_t *)aom_memalign(16, stride * (max_h + 32) * sizeof(uint16_t));
uint16_t *output_ = (uint16_t *)aom_memalign(
16, out_stride * (max_h + 32) * sizeof(uint16_t));
uint16_t *output2_ = (uint16_t *)aom_memalign(
16, out_stride * (max_h + 32) * sizeof(uint16_t));
int32_t *tmpbuf = (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE);
uint16_t *input = input_ + stride * 16 + 16;
uint16_t *output = output_ + out_stride * 16 + 16;
uint16_t *output2 = output2_ + out_stride * 16 + 16;
ACMRandom rnd(ACMRandom::DeterministicSeed());
av1_loop_restoration_precal();
for (i = 0; i < NUM_ITERS; ++i) {
for (j = 0; j < max_h; ++j)
for (k = 0; k < max_w; ++k) input[j * stride + k] = rnd.Rand16() & mask;
for (j = -16; j < max_h + 16; ++j)
for (k = -16; k < max_w + 16; ++k)
input[j * stride + k] = rnd.Rand16() & mask;
int xqd[2] = {
SGRPROJ_PRJ_MIN0 +
......@@ -247,9 +270,9 @@ class AV1HighbdSelfguidedFilterTest
ASSERT_EQ(output[j * out_stride + k], output2[j * out_stride + k]);
}
aom_free(input);
aom_free(output);
aom_free(output2);
aom_free(input_);
aom_free(output_);
aom_free(output2_);
aom_free(tmpbuf);
}
};
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment