Commit 4d2af5db authored by David Barker, committed by Debargha Mukherjee

Add SSE4.1 highbitdepth self-guided filter

Performance is very similar to the lowbd path (only 4-5% slower)

Change-Id: Ifdb272c3f6c0e6f41e7046cc49497c72b5a796d9
parent 0a4bc8d3
......@@ -780,18 +780,18 @@ if ((aom_config("CONFIG_WARPED_MOTION") eq "yes") ||
# LOOP_RESTORATION functions
if (aom_config("CONFIG_LOOP_RESTORATION") eq "yes") {
add_proto qw/void apply_selfguided_restoration/, "uint8_t *dat, int width, int height, int stride, int bit_depth, int eps, int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf";
add_proto qw/void apply_selfguided_restoration/, "uint8_t *dat, int width, int height, int stride, int eps, int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf";
specialize qw/apply_selfguided_restoration sse4_1/;
add_proto qw/void av1_selfguided_restoration/, "uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps, int32_t *tmpbuf";
add_proto qw/void av1_selfguided_restoration/, "uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps, int32_t *tmpbuf";
specialize qw/av1_selfguided_restoration sse4_1/;
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
add_proto qw/void apply_selfguided_restoration_highbd/, "uint16_t *dat, int width, int height, int stride, int bit_depth, int eps, int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf";
specialize qw/apply_selfguided_restoration_highbd/;
specialize qw/apply_selfguided_restoration_highbd sse4_1/;
add_proto qw/void av1_selfguided_restoration_highbd/, "uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps, int32_t *tmpbuf";
specialize qw/av1_selfguided_restoration_highbd/;
specialize qw/av1_selfguided_restoration_highbd sse4_1/;
}
}
......
......@@ -731,22 +731,20 @@ static void av1_selfguided_restoration_internal(int32_t *dgd, int width,
void av1_selfguided_restoration_c(uint8_t *dgd, int width, int height,
int stride, int32_t *dst, int dst_stride,
int bit_depth, int r, int eps,
int32_t *tmpbuf) {
int r, int eps, int32_t *tmpbuf) {
int i, j;
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
dst[i * dst_stride + j] = dgd[i * stride + j];
}
}
av1_selfguided_restoration_internal(dst, width, height, dst_stride, bit_depth,
r, eps, tmpbuf);
av1_selfguided_restoration_internal(dst, width, height, dst_stride, 8, r, eps,
tmpbuf);
}
void apply_selfguided_restoration_c(uint8_t *dat, int width, int height,
int stride, int bit_depth, int eps,
int *xqd, uint8_t *dst, int dst_stride,
int32_t *tmpbuf) {
int stride, int eps, int *xqd, uint8_t *dst,
int dst_stride, int32_t *tmpbuf) {
int xq[2];
int32_t *flt1 = tmpbuf;
int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
......@@ -754,11 +752,9 @@ void apply_selfguided_restoration_c(uint8_t *dat, int width, int height,
int i, j;
assert(width * height <= RESTORATION_TILEPELS_MAX);
av1_selfguided_restoration_c(dat, width, height, stride, flt1, width,
bit_depth, sgr_params[eps].r1,
sgr_params[eps].e1, tmpbuf2);
sgr_params[eps].r1, sgr_params[eps].e1, tmpbuf2);
av1_selfguided_restoration_c(dat, width, height, stride, flt2, width,
bit_depth, sgr_params[eps].r2,
sgr_params[eps].e2, tmpbuf2);
sgr_params[eps].r2, sgr_params[eps].e2, tmpbuf2);
decode_xq(xqd, xq);
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
......@@ -796,7 +792,7 @@ static void loop_sgrproj_filter_tile(uint8_t *data, int tile_idx, int width,
data_p = data + h_start + v_start * stride;
dst_p = dst + h_start + v_start * dst_stride;
apply_selfguided_restoration(data_p, h_end - h_start, v_end - v_start, stride,
8, rst->rsi->sgrproj_info[tile_idx].ep,
rst->rsi->sgrproj_info[tile_idx].ep,
rst->rsi->sgrproj_info[tile_idx].xqd, dst_p,
dst_stride, rst->tmpbuf);
}
......
This diff is collapsed.
......@@ -301,11 +301,9 @@ static void search_selfguided_restoration(uint8_t *dat8, int width, int height,
} else {
#endif
av1_selfguided_restoration(dat8, width, height, dat_stride, flt1, width,
bit_depth, sgr_params[ep].r1,
sgr_params[ep].e1, tmpbuf2);
sgr_params[ep].r1, sgr_params[ep].e1, tmpbuf2);
av1_selfguided_restoration(dat8, width, height, dat_stride, flt2, width,
bit_depth, sgr_params[ep].r2,
sgr_params[ep].e2, tmpbuf2);
sgr_params[ep].r2, sgr_params[ep].e2, tmpbuf2);
#if CONFIG_AOM_HIGHBITDEPTH
}
#endif
......
......@@ -68,8 +68,7 @@ class AV1SelfguidedFilterTest
std::clock_t start = std::clock();
for (i = 0; i < NUM_ITERS; ++i) {
apply_selfguided_restoration_c(input, w, h, w, 8, eps, xqd, output, w,
tmpbuf);
apply_selfguided_restoration(input, w, h, w, eps, xqd, output, w, tmpbuf);
}
std::clock_t end = std::clock();
double elapsed = ((end - start) / (double)CLOCKS_PER_SEC);
......@@ -113,9 +112,9 @@ class AV1SelfguidedFilterTest
int test_w = w + 4 - (i / 9);
int test_h = h + 4 - (i % 9);
apply_selfguided_restoration(input, test_w, test_h, stride, 8, eps, xqd,
apply_selfguided_restoration(input, test_w, test_h, stride, eps, xqd,
output, out_stride, tmpbuf);
apply_selfguided_restoration_c(input, test_w, test_h, stride, 8, eps, xqd,
apply_selfguided_restoration_c(input, test_w, test_h, stride, eps, xqd,
output2, out_stride, tmpbuf);
for (j = 0; j < test_h; ++j)
for (k = 0; k < test_w; ++k)
......@@ -139,4 +138,125 @@ INSTANTIATE_TEST_CASE_P(SSE4_1, AV1SelfguidedFilterTest,
::testing::ValuesIn(params));
#endif
#if CONFIG_AOM_HIGHBITDEPTH
// Parameter tuple for the high-bitdepth tests: a single int, which the test
// bodies read via GET_PARAM(0) and use as the sample bit depth.
typedef tuple<int> HighbdFilterTestParam;
// Speed and correctness tests for apply_selfguided_restoration_highbd.
// The test parameter (GET_PARAM(0)) is the bit depth of the input samples;
// random input is masked to that many bits.
class AV1HighbdSelfguidedFilterTest
    : public ::testing::TestWithParam<HighbdFilterTestParam> {
 public:
  virtual ~AV1HighbdSelfguidedFilterTest() {}
  virtual void SetUp() {}

  virtual void TearDown() { libaom_test::ClearSystemState(); }

 protected:
  // Times NUM_ITERS calls of the dispatched filter on a fixed 256x256 block
  // and prints the elapsed time per block.
  void RunSpeedTest() {
    const int w = 256, h = 256;
    const int NUM_ITERS = 2000;
    int i, j;
    int bit_depth = GET_PARAM(0);
    int mask = (1 << bit_depth) - 1;

    // Allocate the scratch buffer first so that a failed allocation aborts
    // the test before anything else has been allocated (and before memset
    // would dereference a null pointer).
    int32_t *tmpbuf = (int32_t *)aom_malloc(RESTORATION_TMPBUF_SIZE);
    ASSERT_TRUE(tmpbuf != NULL);
    memset(tmpbuf, 0, RESTORATION_TMPBUF_SIZE);

    uint16_t *input = new uint16_t[w * h];
    uint16_t *output = new uint16_t[w * h];

    ACMRandom rnd(ACMRandom::DeterministicSeed());

    for (i = 0; i < h; ++i)
      for (j = 0; j < w; ++j) input[i * w + j] = rnd.Rand16() & mask;

    int xqd[2] = {
      SGRPROJ_PRJ_MIN0 +
          rnd.PseudoUniform(SGRPROJ_PRJ_MAX0 + 1 - SGRPROJ_PRJ_MIN0),
      SGRPROJ_PRJ_MIN1 +
          rnd.PseudoUniform(SGRPROJ_PRJ_MAX1 + 1 - SGRPROJ_PRJ_MIN1)
    };
    // Fix a parameter set, since the speed depends slightly on r.
    // Change this to test different combinations of values of r.
    int eps = 15;

    av1_loop_restoration_precal();

    std::clock_t start = std::clock();
    for (i = 0; i < NUM_ITERS; ++i) {
      apply_selfguided_restoration_highbd(input, w, h, w, bit_depth, eps, xqd,
                                          output, w, tmpbuf);
    }
    std::clock_t end = std::clock();
    double elapsed = ((end - start) / (double)CLOCKS_PER_SEC);

    printf("%5d %dx%d blocks in %7.3fs = %7.3fus/block\n", NUM_ITERS, w, h,
           elapsed, elapsed * 1000000. / NUM_ITERS);

    aom_free(tmpbuf);
    delete[] input;
    delete[] output;
  }

  // Compares the dispatched filter against the C reference on 81 tile sizes
  // around 256x256, with random input, projection coefficients and eps.
  void RunCorrectnessTest() {
    const int w = 256, h = 256, stride = 672, out_stride = 672;
    // Largest tile exercised below (test_w / test_h reach w + 4 / h + 4).
    const int max_w = w + 4, max_h = h + 4;
    const int NUM_ITERS = 81;
    int i, j, k;
    int bit_depth = GET_PARAM(0);
    int mask = (1 << bit_depth) - 1;

    // Allocate the scratch buffer first so that a failed allocation aborts
    // the test before anything else has been allocated.
    int32_t *tmpbuf = (int32_t *)aom_malloc(RESTORATION_TMPBUF_SIZE);
    ASSERT_TRUE(tmpbuf != NULL);
    memset(tmpbuf, 0, RESTORATION_TMPBUF_SIZE);

    uint16_t *input = new uint16_t[stride * (h + 16)];
    uint16_t *output = new uint16_t[out_stride * (h + 16)];
    uint16_t *output2 = new uint16_t[out_stride * (h + 16)];

    ACMRandom rnd(ACMRandom::DeterministicSeed());

    av1_loop_restoration_precal();

    for (i = 0; i < NUM_ITERS; ++i) {
      // Fill the whole max_w x max_h region, not just w x h: tiles larger
      // than 256 in either dimension would otherwise be filtered from
      // uninitialized samples that may also exceed the bit-depth mask.
      for (j = 0; j < max_h; ++j)
        for (k = 0; k < max_w; ++k)
          input[j * stride + k] = rnd.Rand16() & mask;

      int xqd[2] = {
        SGRPROJ_PRJ_MIN0 +
            rnd.PseudoUniform(SGRPROJ_PRJ_MAX0 + 1 - SGRPROJ_PRJ_MIN0),
        SGRPROJ_PRJ_MIN1 +
            rnd.PseudoUniform(SGRPROJ_PRJ_MAX1 + 1 - SGRPROJ_PRJ_MIN1)
      };
      int eps = rnd.PseudoUniform(1 << SGRPROJ_PARAMS_BITS);

      // Test various tile sizes around 256x256
      int test_w = max_w - (i / 9);
      int test_h = max_h - (i % 9);

      apply_selfguided_restoration_highbd(input, test_w, test_h, stride,
                                          bit_depth, eps, xqd, output,
                                          out_stride, tmpbuf);
      apply_selfguided_restoration_highbd_c(input, test_w, test_h, stride,
                                            bit_depth, eps, xqd, output2,
                                            out_stride, tmpbuf);
      // NOTE: a failing ASSERT_EQ returns from this function immediately, so
      // the buffers are not released on a mismatch.
      for (j = 0; j < test_h; ++j)
        for (k = 0; k < test_w; ++k)
          ASSERT_EQ(output[j * out_stride + k], output2[j * out_stride + k]);
    }

    aom_free(tmpbuf);
    delete[] input;
    delete[] output;
    delete[] output2;
  }
};
// Register the two checks as value-parameterized tests on the fixture.
TEST_P(AV1HighbdSelfguidedFilterTest, SpeedTest) { RunSpeedTest(); }
TEST_P(AV1HighbdSelfguidedFilterTest, CorrectnessTest) { RunCorrectnessTest(); }
// Parameter values handed to the fixture; the test bodies read each one via
// GET_PARAM(0) and treat it as the bit depth (8, 10 and 12).
const HighbdFilterTestParam highbd_params[] = { make_tuple(8), make_tuple(10),
make_tuple(12) };
// The suite is only instantiated when HAVE_SSE4_1 is set.
#if HAVE_SSE4_1
INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdSelfguidedFilterTest,
::testing::ValuesIn(highbd_params));
#endif
#endif
} // namespace
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment