Commit ce110cc5 authored by David Barker, committed by Debargha Mukherjee

Vectorize self-guided filter

Add an SSE4.1 lowbd version of the self-guided filter for
loop-restoration, and apply some optimizations to the C
version.

Approximate times per 128x128 / 256x256 tile on the machine
this was developed on:
Previous C:  620us / 2800us
Optimized C: 500us / 2200us ( 24% /  27% faster)
SSE4.1:      147us / 600us  (320% / 370% faster)

Change-Id: I23ff5a5482a191aeb06f9d1f767a9f036bb357fe
parent 4d5bbbd9
......@@ -75,8 +75,11 @@ AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_highbd_convolve_sse4.c
endif
AV1_COMMON_SRCS-yes += common/convolve.c
AV1_COMMON_SRCS-yes += common/convolve.h
AV1_COMMON_SRCS-$(CONFIG_LOOP_RESTORATION) += common/restoration.h
AV1_COMMON_SRCS-$(CONFIG_LOOP_RESTORATION) += common/restoration.c
ifeq ($(CONFIG_LOOP_RESTORATION),yes)
AV1_COMMON_SRCS-yes += common/restoration.h
AV1_COMMON_SRCS-yes += common/restoration.c
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/selfguided_sse4.c
endif
ifeq (yes,$(filter $(CONFIG_GLOBAL_MOTION) $(CONFIG_WARPED_MOTION),yes))
AV1_COMMON_SRCS-yes += common/warped_motion.h
AV1_COMMON_SRCS-yes += common/warped_motion.c
......
......@@ -777,4 +777,11 @@ if ((aom_config("CONFIG_WARPED_MOTION") eq "yes") ||
specialize qw/av1_warp_affine sse2/;
}
# LOOP_RESTORATION functions
if (aom_config("CONFIG_LOOP_RESTORATION") eq "yes") {
add_proto qw/void apply_selfguided_restoration/, "uint8_t *dat, int width, int height, int stride, int bit_depth, int eps, int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf";
specialize qw/apply_selfguided_restoration sse4_1/;
}
1;
......@@ -126,7 +126,7 @@ static void GenDomainTxfmRFVtable() {
// TODO(debargha): This table can be substantially reduced since only a few
// values are actually used.
static int sgrproj_mtable[MAX_EPS][MAX_NELEM];
int sgrproj_mtable[MAX_EPS][MAX_NELEM];
static void GenSgrprojVtable() {
int e, n;
......@@ -581,7 +581,7 @@ void decode_xq(int *xqd, int *xq) {
}
#if APPROXIMATE_SGR
static const uint16_t x_by_xplus1[256] = {
const int32_t x_by_xplus1[256] = {
0, 128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239,
240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247,
248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250,
......@@ -602,7 +602,7 @@ static const uint16_t x_by_xplus1[256] = {
256,
};
static const uint16_t one_by_x[MAX_NELEM] = {
const int32_t one_by_x[MAX_NELEM] = {
4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315,
293, 273, 256, 241, 228, 216, 205, 195, 186, 178, 171, 164, 158,
152, 146, 141, 137, 132, 128, 124, 120, 117, 114, 111, 108, 105,
......@@ -617,18 +617,23 @@ void av1_selfguided_restoration(int32_t *dgd, int width, int height, int stride,
int32_t *B = A + RESTORATION_TILEPELS_MAX;
int8_t num[RESTORATION_TILEPELS_MAX];
int i, j;
// Adjusting the stride of A and B here appears to avoid bad cache effects,
// leading to a significant speed improvement.
// We also align the stride to a multiple of 16 bytes, for consistency
// with the SIMD version of this function.
int buf_stride = ((width + 3) & ~3) + 16;
// Don't filter tiles with dimensions < 5 on any axis
if ((width < 5) || (height < 5)) return;
boxsum(dgd, width, height, stride, r, 0, B, width);
boxsum(dgd, width, height, stride, r, 1, A, width);
boxsum(dgd, width, height, stride, r, 0, B, buf_stride);
boxsum(dgd, width, height, stride, r, 1, A, buf_stride);
boxnum(width, height, r, num, width);
assert(r <= 3);
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
const int k = i * width + j;
const int n = num[k];
const int k = i * buf_stride + j;
const int n = num[i * width + j];
#if APPROXIMATE_SGR
// a < 2^16 * n < 2^22 regardless of bit depth
uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
......@@ -671,117 +676,119 @@ void av1_selfguided_restoration(int32_t *dgd, int width, int height, int stride,
i = 0;
j = 0;
{
const int k = i * width + j;
const int k = i * buf_stride + j;
const int l = i * stride + j;
const int nb = 3;
const int32_t a =
3 * A[k] + 2 * A[k + 1] + 2 * A[k + width] + A[k + width + 1];
3 * A[k] + 2 * A[k + 1] + 2 * A[k + buf_stride] + A[k + buf_stride + 1];
const int32_t b =
3 * B[k] + 2 * B[k + 1] + 2 * B[k + width] + B[k + width + 1];
3 * B[k] + 2 * B[k + 1] + 2 * B[k + buf_stride] + B[k + buf_stride + 1];
const int32_t v = a * dgd[l] + b;
dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
}
i = 0;
j = width - 1;
{
const int k = i * width + j;
const int k = i * buf_stride + j;
const int l = i * stride + j;
const int nb = 3;
const int32_t a =
3 * A[k] + 2 * A[k - 1] + 2 * A[k + width] + A[k + width - 1];
3 * A[k] + 2 * A[k - 1] + 2 * A[k + buf_stride] + A[k + buf_stride - 1];
const int32_t b =
3 * B[k] + 2 * B[k - 1] + 2 * B[k + width] + B[k + width - 1];
3 * B[k] + 2 * B[k - 1] + 2 * B[k + buf_stride] + B[k + buf_stride - 1];
const int32_t v = a * dgd[l] + b;
dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
}
i = height - 1;
j = 0;
{
const int k = i * width + j;
const int k = i * buf_stride + j;
const int l = i * stride + j;
const int nb = 3;
const int32_t a =
3 * A[k] + 2 * A[k + 1] + 2 * A[k - width] + A[k - width + 1];
3 * A[k] + 2 * A[k + 1] + 2 * A[k - buf_stride] + A[k - buf_stride + 1];
const int32_t b =
3 * B[k] + 2 * B[k + 1] + 2 * B[k - width] + B[k - width + 1];
3 * B[k] + 2 * B[k + 1] + 2 * B[k - buf_stride] + B[k - buf_stride + 1];
const int32_t v = a * dgd[l] + b;
dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
}
i = height - 1;
j = width - 1;
{
const int k = i * width + j;
const int k = i * buf_stride + j;
const int l = i * stride + j;
const int nb = 3;
const int32_t a =
3 * A[k] + 2 * A[k - 1] + 2 * A[k - width] + A[k - width - 1];
3 * A[k] + 2 * A[k - 1] + 2 * A[k - buf_stride] + A[k - buf_stride - 1];
const int32_t b =
3 * B[k] + 2 * B[k - 1] + 2 * B[k - width] + B[k - width - 1];
3 * B[k] + 2 * B[k - 1] + 2 * B[k - buf_stride] + B[k - buf_stride - 1];
const int32_t v = a * dgd[l] + b;
dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
}
i = 0;
for (j = 1; j < width - 1; ++j) {
const int k = i * width + j;
const int k = i * buf_stride + j;
const int l = i * stride + j;
const int nb = 3;
const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k + width] +
A[k + width - 1] + A[k + width + 1];
const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k + width] +
B[k + width - 1] + B[k + width + 1];
const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k + buf_stride] +
A[k + buf_stride - 1] + A[k + buf_stride + 1];
const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k + buf_stride] +
B[k + buf_stride - 1] + B[k + buf_stride + 1];
const int32_t v = a * dgd[l] + b;
dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
}
i = height - 1;
for (j = 1; j < width - 1; ++j) {
const int k = i * width + j;
const int k = i * buf_stride + j;
const int l = i * stride + j;
const int nb = 3;
const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k - width] +
A[k - width - 1] + A[k - width + 1];
const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k - width] +
B[k - width - 1] + B[k - width + 1];
const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k - buf_stride] +
A[k - buf_stride - 1] + A[k - buf_stride + 1];
const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k - buf_stride] +
B[k - buf_stride - 1] + B[k - buf_stride + 1];
const int32_t v = a * dgd[l] + b;
dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
}
j = 0;
for (i = 1; i < height - 1; ++i) {
const int k = i * width + j;
const int k = i * buf_stride + j;
const int l = i * stride + j;
const int nb = 3;
const int32_t a = A[k] + 2 * (A[k - width] + A[k + width]) + A[k + 1] +
A[k - width + 1] + A[k + width + 1];
const int32_t b = B[k] + 2 * (B[k - width] + B[k + width]) + B[k + 1] +
B[k - width + 1] + B[k + width + 1];
const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
A[k + 1] + A[k - buf_stride + 1] + A[k + buf_stride + 1];
const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) +
B[k + 1] + B[k - buf_stride + 1] + B[k + buf_stride + 1];
const int32_t v = a * dgd[l] + b;
dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
}
j = width - 1;
for (i = 1; i < height - 1; ++i) {
const int k = i * width + j;
const int k = i * buf_stride + j;
const int l = i * stride + j;
const int nb = 3;
const int32_t a = A[k] + 2 * (A[k - width] + A[k + width]) + A[k - 1] +
A[k - width - 1] + A[k + width - 1];
const int32_t b = B[k] + 2 * (B[k - width] + B[k + width]) + B[k - 1] +
B[k - width - 1] + B[k + width - 1];
const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
A[k - 1] + A[k - buf_stride - 1] + A[k + buf_stride - 1];
const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) +
B[k - 1] + B[k - buf_stride - 1] + B[k + buf_stride - 1];
const int32_t v = a * dgd[l] + b;
dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
}
for (i = 1; i < height - 1; ++i) {
for (j = 1; j < width - 1; ++j) {
const int k = i * width + j;
const int k = i * buf_stride + j;
const int l = i * stride + j;
const int nb = 5;
const int32_t a =
(A[k] + A[k - 1] + A[k + 1] + A[k - width] + A[k + width]) * 4 +
(A[k - 1 - width] + A[k - 1 + width] + A[k + 1 - width] +
A[k + 1 + width]) *
(A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
4 +
(A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
3;
const int32_t b =
(B[k] + B[k - 1] + B[k + 1] + B[k - width] + B[k + width]) * 4 +
(B[k - 1 - width] + B[k - 1 + width] + B[k + 1 - width] +
B[k + 1 + width]) *
(B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) *
4 +
(B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
3;
const int32_t v = a * dgd[l] + b;
dgd[l] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
......@@ -793,7 +800,7 @@ void av1_selfguided_restoration(int32_t *dgd, int width, int height, int stride,
boxsum(B, width, height, width, r, 0, B, width);
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
const int k = i * width + j;
const int k = i * buf_stride + j;
const int l = i * stride + j;
const int n = num[k];
const int32_t v =
......@@ -804,10 +811,10 @@ void av1_selfguided_restoration(int32_t *dgd, int width, int height, int stride,
#endif // APPROXIMATE_SGR
}
static void apply_selfguided_restoration(uint8_t *dat, int width, int height,
int stride, int bit_depth, int eps,
int *xqd, uint8_t *dst, int dst_stride,
int32_t *tmpbuf) {
void apply_selfguided_restoration_c(uint8_t *dat, int width, int height,
int stride, int bit_depth, int eps,
int *xqd, uint8_t *dst, int dst_stride,
int32_t *tmpbuf) {
int xq[2];
int32_t *flt1 = tmpbuf;
int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
......@@ -833,7 +840,7 @@ static void apply_selfguided_restoration(uint8_t *dat, int width, int height,
const int32_t u = ((int32_t)dat[l] << SGRPROJ_RST_BITS);
const int32_t f1 = (int32_t)flt1[k] - u;
const int32_t f2 = (int32_t)flt2[k] - u;
const int64_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS);
const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS);
const int16_t w =
(int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
dst[m] = clip_pixel(w);
......@@ -1171,7 +1178,7 @@ static void apply_selfguided_restoration_highbd(
const int32_t u = ((int32_t)dat[l] << SGRPROJ_RST_BITS);
const int32_t f1 = (int32_t)flt1[k] - u;
const int32_t f2 = (int32_t)flt2[k] - u;
const int64_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS);
const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS);
const int16_t w =
(int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
dst[m] = (uint16_t)clip_pixel_highbd(w, bit_depth);
......
......@@ -55,7 +55,11 @@ extern "C" {
// 4 32-bit buffers needed for the filter:
// 2 for the restored versions of the frame and
// 2 for each restoration operation
#define SGRPROJ_TMPBUF_SIZE (RESTORATION_TILEPELS_MAX * 4 * sizeof(int32_t))
#define SGRPROJ_OUTBUF_SIZE \
((RESTORATION_TILESIZE_BIG * 3 / 2) * (RESTORATION_TILESIZE_BIG * 3 / 2 + 16))
#define SGRPROJ_TMPBUF_SIZE \
(RESTORATION_TILEPELS_MAX * 2 * sizeof(int32_t) + \
SGRPROJ_OUTBUF_SIZE * 2 * sizeof(int32_t))
#define SGRPROJ_EXTBUF_SIZE (0)
#define SGRPROJ_PARAMS_BITS 4
#define SGRPROJ_PARAMS (1 << SGRPROJ_PARAMS_BITS)
......@@ -75,6 +79,12 @@ extern "C" {
#define SGRPROJ_BITS (SGRPROJ_PRJ_BITS * 2 + SGRPROJ_PARAMS_BITS)
#define MAX_RADIUS 3 // Only 1, 2, 3 allowed
#define MAX_EPS 80 // Max value of eps
#define MAX_NELEM ((2 * MAX_RADIUS + 1) * (2 * MAX_RADIUS + 1))
#define SGRPROJ_MTABLE_BITS 20
#define SGRPROJ_RECIP_BITS 12
#define WIENER_HALFWIN 3
#define WIENER_HALFWIN1 (WIENER_HALFWIN + 1)
#define WIENER_WIN (2 * WIENER_HALFWIN + 1)
......@@ -229,6 +239,9 @@ static INLINE void av1_get_rest_tile_limits(
}
extern const sgr_params_type sgr_params[SGRPROJ_PARAMS];
extern int sgrproj_mtable[MAX_EPS][MAX_NELEM];
extern const int32_t x_by_xplus1[256];
extern const int32_t one_by_x[MAX_NELEM];
int av1_alloc_restoration_struct(struct AV1Common *cm,
RestorationInfo *rst_info, int width,
......
This diff is collapsed.
......@@ -183,7 +183,7 @@ static int64_t get_pixel_proj_error(uint8_t *src8, int width, int height,
(int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
const int32_t f1 = (int32_t)flt1[i * flt1_stride + j] - u;
const int32_t f2 = (int32_t)flt2[i * flt2_stride + j] - u;
const int64_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS);
const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS);
const int32_t e =
ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) -
src[i * src_stride + j];
......@@ -199,7 +199,7 @@ static int64_t get_pixel_proj_error(uint8_t *src8, int width, int height,
(int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
const int32_t f1 = (int32_t)flt1[i * flt1_stride + j] - u;
const int32_t f2 = (int32_t)flt2[i * flt2_stride + j] - u;
const int64_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS);
const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS);
const int32_t e =
ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) -
src[i * src_stride + j];
......
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <ctime>
#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
#include "./av1_rtcd.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "test/util.h"
#include "av1/common/mv.h"
#include "av1/common/restoration.h"
namespace {
// Bring the tuple helpers and the random number generator used by the tests
// into the local scope.
using std::tr1::tuple;
using std::tr1::make_tuple;
using libaom_test::ACMRandom;
// The fixture never reads its test parameter, so the parameter type is an
// empty tuple.
typedef tuple<> FilterTestParam;
// Test fixture for the self-guided restoration filter.
//
// RunSpeedTest() times repeated calls to apply_selfguided_restoration() on a
// 256x256 block of random 8-bit pixels and prints the average time per call.
// RunCorrectnessTest() feeds identical random inputs and parameters to
// apply_selfguided_restoration() and apply_selfguided_restoration_c() and
// requires their outputs to match pixel for pixel.
class AV1SelfguidedFilterTest
    : public ::testing::TestWithParam<FilterTestParam> {
 public:
  virtual ~AV1SelfguidedFilterTest() {}
  virtual void SetUp() {}

  virtual void TearDown() { libaom_test::ClearSystemState(); }

 protected:
  // Times NUM_ITERS calls of apply_selfguided_restoration() on one w x h
  // block (stride == w) and prints the total and per-block time.
  void RunSpeedTest() {
    const int w = 256, h = 256;
    const int NUM_ITERS = 2000;
    int i, j;

    uint8_t *input = new uint8_t[w * h];
    uint8_t *output = new uint8_t[w * h];
    int32_t *tmpbuf = (int32_t *)aom_malloc(RESTORATION_TMPBUF_SIZE);
    memset(tmpbuf, 0, RESTORATION_TMPBUF_SIZE);

    // Deterministic seed, so every run filters the same pixel data.
    ACMRandom rnd(ACMRandom::DeterministicSeed());

    for (i = 0; i < h; ++i)
      for (j = 0; j < w; ++j) input[i * w + j] = rnd.Rand16() & 0xFF;

    // Random projection coefficients, each drawn from its
    // [SGRPROJ_PRJ_MINn, SGRPROJ_PRJ_MAXn] range.
    int xqd[2] = {
      SGRPROJ_PRJ_MIN0 +
          rnd.PseudoUniform(SGRPROJ_PRJ_MAX0 + 1 - SGRPROJ_PRJ_MIN0),
      SGRPROJ_PRJ_MIN1 +
          rnd.PseudoUniform(SGRPROJ_PRJ_MAX1 + 1 - SGRPROJ_PRJ_MIN1)
    };
    // Fix a parameter set, since the speed depends slightly on r.
    // Change this to test different combinations of values of r.
    int eps = 4;

    av1_loop_restoration_precal();

    std::clock_t start = std::clock();
    for (i = 0; i < NUM_ITERS; ++i) {
      apply_selfguided_restoration(input, w, h, w, 8, eps, xqd, output, w,
                                   tmpbuf);
    }
    std::clock_t end = std::clock();
    double elapsed = ((end - start) / (double)CLOCKS_PER_SEC);

    printf("%5d %dx%d blocks in %7.3fs = %7.3fus/block\n", NUM_ITERS, w, h,
           elapsed, elapsed * 1000000. / NUM_ITERS);

    aom_free(tmpbuf);
    delete[] input;
    delete[] output;
  }

  // Runs NUM_ITERS rounds. Each round refills the input with random pixels,
  // draws random xqd/eps values, runs both apply_selfguided_restoration()
  // and apply_selfguided_restoration_c(), and asserts that the two outputs
  // are identical over the w x h area.
  void RunCorrectnessTest() {
    // The strides are deliberately larger than the block width.
    const int w = 256, h = 256, stride = 672, out_stride = 672;
    const int NUM_ITERS = 250;
    int i, j, k;

    uint8_t *input = new uint8_t[stride * h];
    uint8_t *output = new uint8_t[out_stride * h];
    uint8_t *output2 = new uint8_t[out_stride * h];
    int32_t *tmpbuf = (int32_t *)aom_malloc(RESTORATION_TMPBUF_SIZE);
    memset(tmpbuf, 0, RESTORATION_TMPBUF_SIZE);

    ACMRandom rnd(ACMRandom::DeterministicSeed());

    av1_loop_restoration_precal();

    for (i = 0; i < NUM_ITERS; ++i) {
      // Refill the whole w x h input area. The row and column loops must use
      // their own indices (j, k): testing or advancing i here would leave
      // most of the input unwritten and end the iteration loop early.
      for (j = 0; j < h; ++j)
        for (k = 0; k < w; ++k) input[j * stride + k] = rnd.Rand16() & 0xFF;

      int xqd[2] = {
        SGRPROJ_PRJ_MIN0 +
            rnd.PseudoUniform(SGRPROJ_PRJ_MAX0 + 1 - SGRPROJ_PRJ_MIN0),
        SGRPROJ_PRJ_MIN1 +
            rnd.PseudoUniform(SGRPROJ_PRJ_MAX1 + 1 - SGRPROJ_PRJ_MIN1)
      };
      // Random value in [0, 1 << SGRPROJ_PARAMS_BITS).
      int eps = rnd.PseudoUniform(1 << SGRPROJ_PARAMS_BITS);

      apply_selfguided_restoration(input, w, h, stride, 8, eps, xqd, output,
                                   out_stride, tmpbuf);
      apply_selfguided_restoration_c(input, w, h, stride, 8, eps, xqd, output2,
                                     out_stride, tmpbuf);
      for (j = 0; j < h; ++j)
        for (k = 0; k < w; ++k)
          ASSERT_EQ(output[j * out_stride + k], output2[j * out_stride + k]);
    }

    aom_free(tmpbuf);
    delete[] input;
    delete[] output;
    delete[] output2;
  }
};
// Expose the two fixture routines as value-parameterized tests.
TEST_P(AV1SelfguidedFilterTest, SpeedTest) { RunSpeedTest(); }
TEST_P(AV1SelfguidedFilterTest, CorrectnessTest) { RunCorrectnessTest(); }
// A single empty tuple, so each test is instantiated once.
const FilterTestParam params[] = { make_tuple() };
// Only instantiate the tests when HAVE_SSE4_1 is non-zero.
#if HAVE_SSE4_1
INSTANTIATE_TEST_CASE_P(SSE4_1, AV1SelfguidedFilterTest,
::testing::ValuesIn(params));
#endif
} // namespace
......@@ -209,6 +209,9 @@ LIBAOM_TEST_SRCS-$(CONFIG_AV1) += av1_convolve_optimz_test.cc
ifneq ($(findstring yes,$(CONFIG_GLOBAL_MOTION) $(CONFIG_WARPED_MOTION)),)
LIBAOM_TEST_SRCS-$(HAVE_SSE2) += warp_filter_test.cc
endif
ifeq ($(CONFIG_LOOP_RESTORATION),yes)
LIBAOM_TEST_SRCS-$(HAVE_SSE4_1) += selfguided_filter_test.cc
endif
TEST_INTRA_PRED_SPEED_SRCS-yes := test_intra_pred_speed.cc
TEST_INTRA_PRED_SPEED_SRCS-yes += ../md5_utils.h ../md5_utils.c
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment