Commit 1330dfd1 authored by Debargha Mukherjee
Browse files

Refactoring/simplification of buffers used for sgr

Includes miscellaneous cleanups, test fixes, and code reorganization
for loop-restoration components.

Change-Id: I5b2e6419234d945e6f4344b22636119b50df4054
parent e168a783
......@@ -628,7 +628,7 @@ if (aom_config("CONFIG_LOOP_RESTORATION") eq "yes") {
add_proto qw/void apply_selfguided_restoration/, "uint8_t *dat, int width, int height, int stride, int eps, int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf";
specialize qw/apply_selfguided_restoration sse4_1/;
add_proto qw/void av1_selfguided_restoration/, "uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps, int32_t *tmpbuf";
add_proto qw/void av1_selfguided_restoration/, "uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps";
specialize qw/av1_selfguided_restoration sse4_1/;
add_proto qw/void av1_highpass_filter/, "uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps";
......@@ -638,7 +638,7 @@ if (aom_config("CONFIG_LOOP_RESTORATION") eq "yes") {
add_proto qw/void apply_selfguided_restoration_highbd/, "uint16_t *dat, int width, int height, int stride, int bit_depth, int eps, int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf";
specialize qw/apply_selfguided_restoration_highbd sse4_1/;
add_proto qw/void av1_selfguided_restoration_highbd/, "uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps, int32_t *tmpbuf";
add_proto qw/void av1_selfguided_restoration_highbd/, "uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps";
specialize qw/av1_selfguided_restoration_highbd sse4_1/;
add_proto qw/void av1_highpass_filter_highbd/, "uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps";
......
......@@ -22,14 +22,6 @@
#include "aom_ports/mem.h"
#define USE_SIMPLER_SGR 1
#define MAX_RADIUS 3 // Only 1, 2, 3 allowed
#define MAX_EPS 80 // Max value of eps
#define MAX_NELEM ((2 * MAX_RADIUS + 1) * (2 * MAX_RADIUS + 1))
#define SGRPROJ_MTABLE_BITS 20
#define SGRPROJ_RECIP_BITS 12
const sgr_params_type sgr_params[SGRPROJ_PARAMS] = {
#if USE_HIGHPASS_IN_SGRPROJ
// corner, edge, r2, eps2
......@@ -39,7 +31,7 @@ const sgr_params_type sgr_params[SGRPROJ_PARAMS] = {
{ -3, 4, 1, 5 }, { -3, 4, 1, 6 }, { -3, 4, 1, 7 }, { -3, 4, 1, 8 }
#else
// r1, eps1, r2, eps2
#if USE_SIMPLER_SGR
#if MAX_RADIUS == 2
{ 2, 12, 1, 4 }, { 2, 15, 1, 6 }, { 2, 18, 1, 8 }, { 2, 20, 1, 9 },
{ 2, 22, 1, 10 }, { 2, 25, 1, 11 }, { 2, 35, 1, 12 }, { 2, 45, 1, 13 },
{ 2, 55, 1, 14 }, { 2, 65, 1, 15 }, { 2, 75, 1, 16 }, { 2, 30, 1, 2 },
......@@ -49,7 +41,7 @@ const sgr_params_type sgr_params[SGRPROJ_PARAMS] = {
{ 2, 22, 1, 10 }, { 2, 25, 1, 11 }, { 2, 35, 1, 12 }, { 2, 45, 1, 13 },
{ 2, 55, 1, 14 }, { 2, 65, 1, 15 }, { 2, 75, 1, 16 }, { 3, 30, 1, 10 },
{ 3, 50, 1, 12 }, { 3, 50, 2, 25 }, { 3, 60, 2, 35 }, { 3, 70, 2, 45 },
#endif // USE_SIMPLER_SGR
#endif // MAX_RADIUS == 2
#endif
};
......@@ -112,21 +104,22 @@ static void loop_restoration_init(RestorationInternal *rst, int kf) {
rst->keyframe = kf;
}
void extend_frame(uint8_t *data, int width, int height, int stride) {
void extend_frame(uint8_t *data, int width, int height, int stride,
int border_horz, int border_vert) {
uint8_t *data_p;
int i;
for (i = 0; i < height; ++i) {
data_p = data + i * stride;
memset(data_p - WIENER_HALFWIN, data_p[0], WIENER_HALFWIN);
memset(data_p + width, data_p[width - 1], WIENER_HALFWIN);
memset(data_p - border_horz, data_p[0], border_horz);
memset(data_p + width, data_p[width - 1], border_horz);
}
data_p = data - WIENER_HALFWIN;
for (i = -WIENER_HALFWIN; i < 0; ++i) {
memcpy(data_p + i * stride, data_p, width + 2 * WIENER_HALFWIN);
data_p = data - border_horz;
for (i = -border_vert; i < 0; ++i) {
memcpy(data_p + i * stride, data_p, width + 2 * border_horz);
}
for (i = height; i < height + WIENER_HALFWIN; ++i) {
for (i = height; i < height + border_vert; ++i) {
memcpy(data_p + i * stride, data_p + (height - 1) * stride,
width + 2 * WIENER_HALFWIN);
width + 2 * border_horz);
}
}
......@@ -256,7 +249,8 @@ static void loop_wiener_filter(uint8_t *data, int width, int height, int stride,
RestorationInternal *rst, uint8_t *dst,
int dst_stride) {
int tile_idx;
extend_frame(data, width, height, stride);
extend_frame(data, width, height, stride, WIENER_BORDER_HORZ,
WIENER_BORDER_VERT);
for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {
loop_wiener_filter_tile(data, tile_idx, width, height, stride, rst, dst,
dst_stride);
......@@ -639,16 +633,17 @@ const int32_t x_by_xplus1[256] = {
const int32_t one_by_x[MAX_NELEM] = {
4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315,
293, 273, 256, 241, 228, 216, 205, 195, 186, 178, 171, 164, 158,
152, 146, 141, 137, 132, 128, 124, 120, 117, 114, 111, 108, 105,
102, 100, 98, 95, 93, 91, 89, 87, 85, 84
293, 273, 256, 241, 228, 216, 205, 195, 186, 178, 171, 164,
#if MAX_RADIUS > 2
158, 152, 146, 141, 137, 132, 128, 124, 120, 117, 114, 111, 108,
105, 102, 100, 98, 95, 93, 91, 89, 87, 85, 84
#endif // MAX_RADIUS > 2
};
static void av1_selfguided_restoration_internal(int32_t *dgd, int width,
int height, int dgd_stride,
int32_t *dst, int dst_stride,
int bit_depth, int r, int eps,
int32_t *tmpbuf) {
int bit_depth, int r, int eps) {
const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
const int num_stride = width_ext;
......@@ -657,10 +652,11 @@ static void av1_selfguided_restoration_internal(int32_t *dgd, int width,
// We also align the stride to a multiple of 16 bytes, for consistency
// with the SIMD version of this function.
int buf_stride = ((width_ext + 3) & ~3) + 16;
int32_t *A = tmpbuf;
int32_t *B = tmpbuf + SGRPROJ_OUTBUF_SIZE;
int8_t num_[RESTORATION_TILEPELS_MAX];
int32_t A_[RESTORATION_PROC_UNIT_PELS];
int32_t B_[RESTORATION_PROC_UNIT_PELS];
int32_t *A = A_;
int32_t *B = B_;
int8_t num_[RESTORATION_PROC_UNIT_PELS];
int8_t *num = num_ + SGRPROJ_BORDER_VERT * num_stride + SGRPROJ_BORDER_HORZ;
int i, j;
......@@ -844,10 +840,11 @@ static void av1_selfguided_restoration_internal(int32_t *dgd, int width,
void av1_selfguided_restoration_c(uint8_t *dgd, int width, int height,
int stride, int32_t *dst, int dst_stride,
int r, int eps, int32_t *tmpbuf) {
int r, int eps) {
int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
int32_t *dgd32 =
tmpbuf + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
int i, j;
for (i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
for (j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
......@@ -855,8 +852,7 @@ void av1_selfguided_restoration_c(uint8_t *dgd, int width, int height,
}
}
av1_selfguided_restoration_internal(dgd32, width, height, dgd32_stride, dst,
dst_stride, 8, r, eps,
tmpbuf + RESTORATION_TILEPELS_MAX);
dst_stride, 8, r, eps);
}
void av1_highpass_filter_c(uint8_t *dgd, int width, int height, int stride,
......@@ -955,7 +951,6 @@ void apply_selfguided_restoration_c(uint8_t *dat, int width, int height,
int xq[2];
int32_t *flt1 = tmpbuf;
int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX;
int i, j;
assert(width * height <= RESTORATION_TILEPELS_MAX);
#if USE_HIGHPASS_IN_SGRPROJ
......@@ -963,10 +958,10 @@ void apply_selfguided_restoration_c(uint8_t *dat, int width, int height,
sgr_params[eps].corner, sgr_params[eps].edge);
#else
av1_selfguided_restoration_c(dat, width, height, stride, flt1, width,
sgr_params[eps].r1, sgr_params[eps].e1, tmpbuf2);
sgr_params[eps].r1, sgr_params[eps].e1);
#endif // USE_HIGHPASS_IN_SGRPROJ
av1_selfguided_restoration_c(dat, width, height, stride, flt2, width,
sgr_params[eps].r2, sgr_params[eps].e2, tmpbuf2);
sgr_params[eps].r2, sgr_params[eps].e2);
decode_xq(xqd, xq);
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
......@@ -1009,7 +1004,7 @@ static void loop_sgrproj_filter_tile(uint8_t *data, int tile_idx, int width,
int h = AOMMIN(procunit_height, v_end - i);
uint8_t *data_p = data + i * stride + j;
uint8_t *dst_p = dst + i * dst_stride + j;
apply_selfguided_restoration_c(
apply_selfguided_restoration(
data_p, w, h, stride, rst->rsi->sgrproj_info[tile_idx].ep,
rst->rsi->sgrproj_info[tile_idx].xqd, dst_p, dst_stride, rst->tmpbuf);
}
......@@ -1019,7 +1014,8 @@ static void loop_sgrproj_filter(uint8_t *data, int width, int height,
int stride, RestorationInternal *rst,
uint8_t *dst, int dst_stride) {
int tile_idx;
extend_frame(data, width, height, stride);
extend_frame(data, width, height, stride, SGRPROJ_BORDER_HORZ,
SGRPROJ_BORDER_VERT);
for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {
loop_sgrproj_filter_tile(data, tile_idx, width, height, stride, rst, dst,
dst_stride);
......@@ -1030,7 +1026,8 @@ static void loop_switchable_filter(uint8_t *data, int width, int height,
int stride, RestorationInternal *rst,
uint8_t *dst, int dst_stride) {
int tile_idx;
extend_frame(data, width, height, stride);
extend_frame(data, width, height, stride, RESTORATION_BORDER_HORZ,
RESTORATION_BORDER_VERT);
for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {
if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) {
loop_copy_tile(data, tile_idx, 0, 0, width, height, stride, rst, dst,
......@@ -1046,23 +1043,23 @@ static void loop_switchable_filter(uint8_t *data, int width, int height,
}
#if CONFIG_HIGHBITDEPTH
void extend_frame_highbd(uint16_t *data, int width, int height, int stride) {
void extend_frame_highbd(uint16_t *data, int width, int height, int stride,
int border_horz, int border_vert) {
uint16_t *data_p;
int i, j;
for (i = 0; i < height; ++i) {
data_p = data + i * stride;
for (j = -WIENER_HALFWIN; j < 0; ++j) data_p[j] = data_p[0];
for (j = width; j < width + WIENER_HALFWIN; ++j)
data_p[j] = data_p[width - 1];
for (j = -border_horz; j < 0; ++j) data_p[j] = data_p[0];
for (j = width; j < width + border_horz; ++j) data_p[j] = data_p[width - 1];
}
data_p = data - WIENER_HALFWIN;
for (i = -WIENER_HALFWIN; i < 0; ++i) {
data_p = data - border_horz;
for (i = -border_vert; i < 0; ++i) {
memcpy(data_p + i * stride, data_p,
(width + 2 * WIENER_HALFWIN) * sizeof(uint16_t));
(width + 2 * border_horz) * sizeof(uint16_t));
}
for (i = height; i < height + WIENER_HALFWIN; ++i) {
for (i = height; i < height + border_vert; ++i) {
memcpy(data_p + i * stride, data_p + (height - 1) * stride,
(width + 2 * WIENER_HALFWIN) * sizeof(uint16_t));
(width + 2 * border_horz) * sizeof(uint16_t));
}
}
......@@ -1171,7 +1168,8 @@ static void loop_wiener_filter_highbd(uint8_t *data8, int width, int height,
uint16_t *data = CONVERT_TO_SHORTPTR(data8);
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
int tile_idx;
extend_frame_highbd(data, width, height, stride);
extend_frame_highbd(data, width, height, stride, WIENER_BORDER_HORZ,
WIENER_BORDER_VERT);
for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {
loop_wiener_filter_tile_highbd(data, tile_idx, width, height, stride, rst,
bit_depth, dst, dst_stride);
......@@ -1181,10 +1179,11 @@ static void loop_wiener_filter_highbd(uint8_t *data8, int width, int height,
void av1_selfguided_restoration_highbd_c(uint16_t *dgd, int width, int height,
int stride, int32_t *dst,
int dst_stride, int bit_depth, int r,
int eps, int32_t *tmpbuf) {
int eps) {
int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
int32_t *dgd32 =
tmpbuf + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
int i, j;
for (i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
for (j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
......@@ -1192,8 +1191,7 @@ void av1_selfguided_restoration_highbd_c(uint16_t *dgd, int width, int height,
}
}
av1_selfguided_restoration_internal(dgd32, width, height, dgd32_stride, dst,
dst_stride, bit_depth, r, eps,
tmpbuf + RESTORATION_TILEPELS_MAX);
dst_stride, bit_depth, r, eps);
}
void av1_highpass_filter_highbd_c(uint16_t *dgd, int width, int height,
......@@ -1294,7 +1292,6 @@ void apply_selfguided_restoration_highbd_c(uint16_t *dat, int width, int height,
int xq[2];
int32_t *flt1 = tmpbuf;
int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX;
int i, j;
assert(width * height <= RESTORATION_TILEPELS_MAX);
#if USE_HIGHPASS_IN_SGRPROJ
......@@ -1303,11 +1300,11 @@ void apply_selfguided_restoration_highbd_c(uint16_t *dat, int width, int height,
#else
av1_selfguided_restoration_highbd_c(dat, width, height, stride, flt1, width,
bit_depth, sgr_params[eps].r1,
sgr_params[eps].e1, tmpbuf2);
sgr_params[eps].e1);
#endif // USE_HIGHPASS_IN_SGRPROJ
av1_selfguided_restoration_highbd_c(dat, width, height, stride, flt2, width,
bit_depth, sgr_params[eps].r2,
sgr_params[eps].e2, tmpbuf2);
sgr_params[eps].e2);
decode_xq(xqd, xq);
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
......@@ -1351,7 +1348,7 @@ static void loop_sgrproj_filter_tile_highbd(uint16_t *data, int tile_idx,
int h = AOMMIN(procunit_height, v_end - i);
uint16_t *data_p = data + i * stride + j;
uint16_t *dst_p = dst + i * dst_stride + j;
apply_selfguided_restoration_highbd_c(
apply_selfguided_restoration_highbd(
data_p, w, h, stride, bit_depth, rst->rsi->sgrproj_info[tile_idx].ep,
rst->rsi->sgrproj_info[tile_idx].xqd, dst_p, dst_stride, rst->tmpbuf);
}
......@@ -1364,7 +1361,8 @@ static void loop_sgrproj_filter_highbd(uint8_t *data8, int width, int height,
int tile_idx;
uint16_t *data = CONVERT_TO_SHORTPTR(data8);
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
extend_frame_highbd(data, width, height, stride);
extend_frame_highbd(data, width, height, stride, SGRPROJ_BORDER_HORZ,
SGRPROJ_BORDER_VERT);
for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {
loop_sgrproj_filter_tile_highbd(data, tile_idx, width, height, stride, rst,
bit_depth, dst, dst_stride);
......@@ -1378,7 +1376,8 @@ static void loop_switchable_filter_highbd(uint8_t *data8, int width, int height,
uint16_t *data = CONVERT_TO_SHORTPTR(data8);
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
int tile_idx;
extend_frame_highbd(data, width, height, stride);
extend_frame_highbd(data, width, height, stride, RESTORATION_BORDER_HORZ,
RESTORATION_BORDER_VERT);
for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {
if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) {
loop_copy_tile_highbd(data, tile_idx, 0, 0, width, height, stride, rst,
......
......@@ -25,10 +25,28 @@ extern "C" {
#define RINT(x) ((x) < 0 ? (int)((x)-0.5) : (int)((x) + 0.5))
#define RESTORATION_PROC_UNIT_SIZE 64
// Determines line buffer requirement for LR. Should be set at the max
// of SGRPROJ_BORDER_VERT and WIENER_BORDER_VERT
#define RESTORATION_BORDER_VERT 0
#define RESTORATION_BORDER_HORZ 3 // Do not change this
#define SGRPROJ_BORDER_VERT 0 // Vertical border used for Sgr
#define SGRPROJ_BORDER_HORZ 2 // Horizontal border used for Sgr
#define WIENER_BORDER_VERT 0 // Vertical border used for Wiener
#define WIENER_HALFWIN 3
#define WIENER_BORDER_HORZ (WIENER_HALFWIN) // Horizontal border for Wiener
// RESTORATION_BORDER_VERT determines line buffer requirement for LR.
// Should be set at the max of SGRPROJ_BORDER_VERT and WIENER_BORDER_VERT.
// Note the line buffer needed is twice the value of this macro.
#if SGRPROJ_BORDER_VERT >= WIENER_BORDER_VERT
#define RESTORATION_BORDER_VERT (SGRPROJ_BORDER_VERT)
#else
#define RESTORATION_BORDER_VERT (WIENER_BORDER_VERT)
#endif // SGRPROJ_BORDER_VERT >= WIENER_BORDER_VERT
#if SGRPROJ_BORDER_HORZ >= WIENER_BORDER_HORZ
#define RESTORATION_BORDER_HORZ (SGRPROJ_BORDER_HORZ)
#else
#define RESTORATION_BORDER_HORZ (WIENER_BORDER_HORZ)
#endif // SGRPROJ_BORDER_HORZ >= WIENER_BORDER_HORZ
// Pad up to 20 more (much less may actually be needed)
#define RESTORATION_PADDING 20
......@@ -39,28 +57,20 @@ extern "C" {
RESTORATION_PADDING))
#define RESTORATION_TILESIZE_MAX 256
#define RESTORATION_TILEPELS_MAX \
(RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_HORZ) * \
(RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_VERT)
// 4 32-bit buffers needed for the filter:
// 2 for the restored versions of the frame and
// 2 for each restoration operation
#define SGRPROJ_OUTBUF_SIZE \
((RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_VERT) * \
(RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_HORZ + 16))
#define SGRPROJ_TMPBUF_SIZE \
(RESTORATION_TILEPELS_MAX * 2 * sizeof(int32_t) + \
SGRPROJ_OUTBUF_SIZE * 3 * sizeof(int32_t) + 2 * RESTORATION_PROC_UNIT_PELS)
#define RESTORATION_TILEPELS_MAX \
((RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_HORZ + 16) * \
(RESTORATION_TILESIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_VERT))
// Two 32-bit buffers needed for the restored versions from two filters
// TODO(debargha, rupert): Refactor to not need the large tilesize to be stored
// on the decoder side.
#define SGRPROJ_TMPBUF_SIZE (RESTORATION_TILEPELS_MAX * 2 * sizeof(int32_t))
#define SGRPROJ_EXTBUF_SIZE (0)
#define SGRPROJ_PARAMS_BITS 4
#define SGRPROJ_PARAMS (1 << SGRPROJ_PARAMS_BITS)
#define USE_HIGHPASS_IN_SGRPROJ 0
#define SGRPROJ_BORDER_VERT 0 // Vertical border used for sgr
#define SGRPROJ_BORDER_HORZ 2 // Horizontal border used for sgr
// Precision bits for projection
#define SGRPROJ_PRJ_BITS 7
// Restoration precision bits generated higher than source before projection
......@@ -85,15 +95,12 @@ extern "C" {
#define SGRPROJ_BITS (SGRPROJ_PRJ_BITS * 2 + SGRPROJ_PARAMS_BITS)
#define MAX_RADIUS 3 // Only 1, 2, 3 allowed
#define MAX_RADIUS 2 // Only 1, 2, 3 allowed
#define MAX_EPS 80 // Max value of eps
#define MAX_NELEM ((2 * MAX_RADIUS + 1) * (2 * MAX_RADIUS + 1))
#define SGRPROJ_MTABLE_BITS 20
#define SGRPROJ_RECIP_BITS 12
#define WIENER_HALFWIN 3
#define WIENER_BORDER_HORZ (WIENER_HALFWIN)
#define WIENER_BORDER_VERT 0
#define WIENER_HALFWIN1 (WIENER_HALFWIN + 1)
#define WIENER_WIN (2 * WIENER_HALFWIN + 1)
#define WIENER_WIN2 ((WIENER_WIN) * (WIENER_WIN))
......@@ -268,9 +275,11 @@ int av1_alloc_restoration_struct(struct AV1Common *cm,
int height);
void av1_free_restoration_struct(RestorationInfo *rst_info);
void extend_frame(uint8_t *data, int width, int height, int stride);
void extend_frame(uint8_t *data, int width, int height, int stride,
int border_horz, int border_vert);
#if CONFIG_HIGHBITDEPTH
void extend_frame_highbd(uint16_t *data, int width, int height, int stride);
void extend_frame_highbd(uint16_t *data, int width, int height, int stride,
int border_horz, int border_vert);
#endif // CONFIG_HIGHBITDEPTH
void decode_xq(int *xqd, int *xq);
void av1_loop_restoration_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
......
......@@ -664,17 +664,18 @@ static void selfguided_restoration_3_h(int32_t *A, int32_t *B, int width,
void av1_selfguided_restoration_sse4_1(uint8_t *dgd, int width, int height,
int dgd_stride, int32_t *dst,
int dst_stride, int r, int eps,
int32_t *tmpbuf) {
int dst_stride, int r, int eps) {
const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
int32_t *A = tmpbuf;
int32_t *B = A + SGRPROJ_OUTBUF_SIZE;
int32_t A_[RESTORATION_PROC_UNIT_PELS];
int32_t B_[RESTORATION_PROC_UNIT_PELS];
int32_t *A = A_;
int32_t *B = B_;
int i, j;
// Adjusting the stride of A and B here appears to avoid bad cache effects,
// leading to a significant speed improvement.
// We also align the stride to a multiple of 16 bytes for efficiency.
int buf_stride = ((width + 3) & ~3) + 16;
int buf_stride = ((width_ext + 3) & ~3) + 16;
// Don't filter tiles with dimensions < 5 on any axis
if ((width < 5) || (height < 5)) return;
......@@ -1059,7 +1060,6 @@ void apply_selfguided_restoration_sse4_1(uint8_t *dat, int width, int height,
int xq[2];
int32_t *flt1 = tmpbuf;
int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX;
int i, j;
assert(width * height <= RESTORATION_TILEPELS_MAX);
#if USE_HIGHPASS_IN_SGRPROJ
......@@ -1067,12 +1067,10 @@ void apply_selfguided_restoration_sse4_1(uint8_t *dat, int width, int height,
sgr_params[eps].corner, sgr_params[eps].edge);
#else
av1_selfguided_restoration_sse4_1(dat, width, height, stride, flt1, width,
sgr_params[eps].r1, sgr_params[eps].e1,
tmpbuf2);
sgr_params[eps].r1, sgr_params[eps].e1);
#endif // USE_HIGHPASS_IN_SGRPROJ
av1_selfguided_restoration_sse4_1(dat, width, height, stride, flt2, width,
sgr_params[eps].r2, sgr_params[eps].e2,
tmpbuf2);
sgr_params[eps].r2, sgr_params[eps].e2);
decode_xq(xqd, xq);
__m128i xq0 = _mm_set1_epi32(xq[0]);
......@@ -1374,17 +1372,18 @@ static void highbd_selfguided_restoration_3_v(uint16_t *src, int width,
void av1_selfguided_restoration_highbd_sse4_1(uint16_t *dgd, int width,
int height, int dgd_stride,
int32_t *dst, int dst_stride,
int bit_depth, int r, int eps,
int32_t *tmpbuf) {
int bit_depth, int r, int eps) {
const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
int32_t *A = tmpbuf;
int32_t *B = A + SGRPROJ_OUTBUF_SIZE;
int32_t A_[RESTORATION_PROC_UNIT_PELS];
int32_t B_[RESTORATION_PROC_UNIT_PELS];
int32_t *A = A_;
int32_t *B = B_;
int i, j;
// Adjusting the stride of A and B here appears to avoid bad cache effects,
// leading to a significant speed improvement.
// We also align the stride to a multiple of 16 bytes for efficiency.
int buf_stride = ((width + 3) & ~3) + 16;
int buf_stride = ((width_ext + 3) & ~3) + 16;
// Don't filter tiles with dimensions < 5 on any axis
if ((width < 5) || (height < 5)) return;
......@@ -1741,7 +1740,6 @@ void apply_selfguided_restoration_highbd_sse4_1(
int xq[2];
int32_t *flt1 = tmpbuf;
int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX;
int i, j;
assert(width * height <= RESTORATION_TILEPELS_MAX);
#if USE_HIGHPASS_IN_SGRPROJ
......@@ -1751,11 +1749,11 @@ void apply_selfguided_restoration_highbd_sse4_1(
#else
av1_selfguided_restoration_highbd_sse4_1(dat, width, height, stride, flt1,
width, bit_depth, sgr_params[eps].r1,
sgr_params[eps].e1, tmpbuf2);
sgr_params[eps].e1);
#endif // USE_HIGHPASS_IN_SGRPROJ
av1_selfguided_restoration_highbd_sse4_1(dat, width, height, stride, flt2,
width, bit_depth, sgr_params[eps].r2,
sgr_params[eps].e2, tmpbuf2);
sgr_params[eps].e2);
decode_xq(xqd, xq);
__m128i xq0 = _mm_set1_epi32(xq[0]);
......
......@@ -29,9 +29,9 @@
#include "av1/encoder/av1_quantize.h"
#include "av1/encoder/encoder.h"
#include "av1/encoder/mathutils.h"
#include "av1/encoder/picklpf.h"
#include "av1/encoder/pickrst.h"
#include "av1/encoder/mathutils.h"
// When set to RESTORE_WIENER or RESTORE_SGRPROJ only those are allowed.
// When set to RESTORE_TYPES we allow switchable.
......@@ -354,7 +354,6 @@ static void search_selfguided_restoration(uint8_t *dat8, int width, int height,
int32_t *rstbuf) {
int32_t *flt1 = rstbuf;
int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX;
int ep, bestep = 0;
int64_t err, besterr = -1;
int exqd[2], bestxqd[2] = { 0, 0 };
......@@ -387,11 +386,11 @@ static void search_selfguided_restoration(uint8_t *dat8, int width, int height,
#else
av1_selfguided_restoration_highbd(
dat_p, w, h, dat_stride, flt1_p, flt1_stride, bit_depth,
sgr_params[ep].r1, sgr_params[ep].e1, tmpbuf2);
sgr_params[ep].r1, sgr_params[ep].e1);
#endif // USE_HIGHPASS_IN_SGRPROJ
av1_selfguided_restoration_highbd(
dat_p, w, h, dat_stride, flt2_p, flt2_stride, bit_depth,
sgr_params[ep].r2, sgr_params[ep].e2, tmpbuf2);
sgr_params[ep].r2, sgr_params[ep].e2);
}
} else {
#endif
......@@ -407,12 +406,11 @@ static void search_selfguided_restoration(uint8_t *dat8, int width, int height,
sgr_params[ep].corner, sgr_params[ep].edge);
#else
av1_selfguided_restoration(dat_p, w, h, dat_stride, flt1_p, flt1_stride,
sgr_params[ep].r1, sgr_params[ep].e1,
tmpbuf2);
sgr_params[ep].r1, sgr_params[ep].e1);
#endif // USE_HIGHPASS_IN_SGRPROJ
av1_selfguided_restoration(dat_p, w, h, dat_stride, flt2_p,
flt2_stride, sgr_params[ep].r2,
sgr_params[ep].e2, tmpbuf2);
sgr_params[ep].e2);
}
#if CONFIG_HIGHBITDEPTH
}
......@@ -640,11 +638,12 @@ static double search_sgrproj(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
#if CONFIG_HIGHBITDEPTH
if (cm->use_highbitdepth)
extend_frame_highbd(CONVERT_TO_SHORTPTR(ctxt.dgd_buffer), ctxt.plane_width,
ctxt.plane_height, ctxt.dgd_stride);
ctxt.plane_height, ctxt.dgd_stride, SGRPROJ_BORDER_HORZ,
SGRPROJ_BORDER_VERT);
else
#endif
extend_frame(ctxt.dgd_buffer, ctxt.plane_width, ctxt.plane_height,
ctxt.dgd_stride);
ctxt.dgd_stride, SGRPROJ_BORDER_HORZ, SGRPROJ_BORDER_VERT);
for (int tile_row = 0; tile_row < cm->tile_rows; ++tile_row) {
for (int tile_col = 0; tile_col < cm->tile_cols; ++tile_col) {
......@@ -1242,14 +1241,17 @@ static double search_wiener(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
AV1_COMMON *const cm = &cpi->common;
// Construct a (WIENER_HALFWIN)-pixel border around the frame
// Note: we use this border to gather stats even though the actual filter
// may use a smaller border on the top/bottom of a processing unit.
#if CONFIG_HIGHBITDEPTH
if (cm->use_highbitdepth)
extend_frame_highbd(CONVERT_TO_SHORTPTR(ctxt.dgd_buffer), ctxt.plane_width,
ctxt.plane_height, ctxt.dgd_stride);
ctxt.plane_height, ctxt.dgd_stride, WIENER_HALFWIN,
WIENER_HALFWIN);
else
#endif
extend_frame(ctxt.dgd_buffer, ctxt.plane_width, ctxt.plane_height,
ctxt.dgd_stride);
ctxt.dgd_stride, WIENER_HALFWIN, WIENER_HALFWIN);
// Compute best Wiener filters for each rtile, one (encoder/decoder)
// tile at a time.
......
......@@ -40,9 +40,11 @@ class AV1SelfguidedFilterTest
protected:
void RunSpeedTest() {
const int pu_width = RESTORATION_PROC_UNIT_SIZE;
const int pu_height = RESTORATION_PROC_UNIT_SIZE;
const int width = 256, height = 256, stride = 288, out_stride = 288;
const int NUM_ITERS = 2000;
int i, j;
int i, j, k;
uint8_t *input_ =
(uint8_t *)aom_memalign(16, stride * (height + 32) * sizeof(uint8_t));
......@@ -72,8 +74,15 @@ class AV1SelfguidedFilterTest