Commit 4d2af5db authored by David Barker, committed by Debargha Mukherjee
Browse files

Add SSE4.1 highbitdepth self-guided filter

Performance is very similar to the lowbd path (only 4-5% slower)

Change-Id: Ifdb272c3f6c0e6f41e7046cc49497c72b5a796d9
parent 0a4bc8d3
...@@ -780,18 +780,18 @@ if ((aom_config("CONFIG_WARPED_MOTION") eq "yes") || ...@@ -780,18 +780,18 @@ if ((aom_config("CONFIG_WARPED_MOTION") eq "yes") ||
# LOOP_RESTORATION functions # LOOP_RESTORATION functions
if (aom_config("CONFIG_LOOP_RESTORATION") eq "yes") { if (aom_config("CONFIG_LOOP_RESTORATION") eq "yes") {
add_proto qw/void apply_selfguided_restoration/, "uint8_t *dat, int width, int height, int stride, int bit_depth, int eps, int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf"; add_proto qw/void apply_selfguided_restoration/, "uint8_t *dat, int width, int height, int stride, int eps, int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf";
specialize qw/apply_selfguided_restoration sse4_1/; specialize qw/apply_selfguided_restoration sse4_1/;
add_proto qw/void av1_selfguided_restoration/, "uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps, int32_t *tmpbuf"; add_proto qw/void av1_selfguided_restoration/, "uint8_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int r, int eps, int32_t *tmpbuf";
specialize qw/av1_selfguided_restoration sse4_1/; specialize qw/av1_selfguided_restoration sse4_1/;
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") { if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
add_proto qw/void apply_selfguided_restoration_highbd/, "uint16_t *dat, int width, int height, int stride, int bit_depth, int eps, int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf"; add_proto qw/void apply_selfguided_restoration_highbd/, "uint16_t *dat, int width, int height, int stride, int bit_depth, int eps, int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf";
specialize qw/apply_selfguided_restoration_highbd/; specialize qw/apply_selfguided_restoration_highbd sse4_1/;
add_proto qw/void av1_selfguided_restoration_highbd/, "uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps, int32_t *tmpbuf"; add_proto qw/void av1_selfguided_restoration_highbd/, "uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps, int32_t *tmpbuf";
specialize qw/av1_selfguided_restoration_highbd/; specialize qw/av1_selfguided_restoration_highbd sse4_1/;
} }
} }
......
...@@ -731,22 +731,20 @@ static void av1_selfguided_restoration_internal(int32_t *dgd, int width, ...@@ -731,22 +731,20 @@ static void av1_selfguided_restoration_internal(int32_t *dgd, int width,
void av1_selfguided_restoration_c(uint8_t *dgd, int width, int height, void av1_selfguided_restoration_c(uint8_t *dgd, int width, int height,
int stride, int32_t *dst, int dst_stride, int stride, int32_t *dst, int dst_stride,
int bit_depth, int r, int eps, int r, int eps, int32_t *tmpbuf) {
int32_t *tmpbuf) {
int i, j; int i, j;
for (i = 0; i < height; ++i) { for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) { for (j = 0; j < width; ++j) {
dst[i * dst_stride + j] = dgd[i * stride + j]; dst[i * dst_stride + j] = dgd[i * stride + j];
} }
} }
av1_selfguided_restoration_internal(dst, width, height, dst_stride, bit_depth, av1_selfguided_restoration_internal(dst, width, height, dst_stride, 8, r, eps,
r, eps, tmpbuf); tmpbuf);
} }
void apply_selfguided_restoration_c(uint8_t *dat, int width, int height, void apply_selfguided_restoration_c(uint8_t *dat, int width, int height,
int stride, int bit_depth, int eps, int stride, int eps, int *xqd, uint8_t *dst,
int *xqd, uint8_t *dst, int dst_stride, int dst_stride, int32_t *tmpbuf) {
int32_t *tmpbuf) {
int xq[2]; int xq[2];
int32_t *flt1 = tmpbuf; int32_t *flt1 = tmpbuf;
int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX; int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
...@@ -754,11 +752,9 @@ void apply_selfguided_restoration_c(uint8_t *dat, int width, int height, ...@@ -754,11 +752,9 @@ void apply_selfguided_restoration_c(uint8_t *dat, int width, int height,
int i, j; int i, j;
assert(width * height <= RESTORATION_TILEPELS_MAX); assert(width * height <= RESTORATION_TILEPELS_MAX);
av1_selfguided_restoration_c(dat, width, height, stride, flt1, width, av1_selfguided_restoration_c(dat, width, height, stride, flt1, width,
bit_depth, sgr_params[eps].r1, sgr_params[eps].r1, sgr_params[eps].e1, tmpbuf2);
sgr_params[eps].e1, tmpbuf2);
av1_selfguided_restoration_c(dat, width, height, stride, flt2, width, av1_selfguided_restoration_c(dat, width, height, stride, flt2, width,
bit_depth, sgr_params[eps].r2, sgr_params[eps].r2, sgr_params[eps].e2, tmpbuf2);
sgr_params[eps].e2, tmpbuf2);
decode_xq(xqd, xq); decode_xq(xqd, xq);
for (i = 0; i < height; ++i) { for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) { for (j = 0; j < width; ++j) {
...@@ -796,7 +792,7 @@ static void loop_sgrproj_filter_tile(uint8_t *data, int tile_idx, int width, ...@@ -796,7 +792,7 @@ static void loop_sgrproj_filter_tile(uint8_t *data, int tile_idx, int width,
data_p = data + h_start + v_start * stride; data_p = data + h_start + v_start * stride;
dst_p = dst + h_start + v_start * dst_stride; dst_p = dst + h_start + v_start * dst_stride;
apply_selfguided_restoration(data_p, h_end - h_start, v_end - v_start, stride, apply_selfguided_restoration(data_p, h_end - h_start, v_end - v_start, stride,
8, rst->rsi->sgrproj_info[tile_idx].ep, rst->rsi->sgrproj_info[tile_idx].ep,
rst->rsi->sgrproj_info[tile_idx].xqd, dst_p, rst->rsi->sgrproj_info[tile_idx].xqd, dst_p,
dst_stride, rst->tmpbuf); dst_stride, rst->tmpbuf);
} }
......
This diff is collapsed.
...@@ -301,11 +301,9 @@ static void search_selfguided_restoration(uint8_t *dat8, int width, int height, ...@@ -301,11 +301,9 @@ static void search_selfguided_restoration(uint8_t *dat8, int width, int height,
} else { } else {
#endif #endif
av1_selfguided_restoration(dat8, width, height, dat_stride, flt1, width, av1_selfguided_restoration(dat8, width, height, dat_stride, flt1, width,
bit_depth, sgr_params[ep].r1, sgr_params[ep].r1, sgr_params[ep].e1, tmpbuf2);
sgr_params[ep].e1, tmpbuf2);
av1_selfguided_restoration(dat8, width, height, dat_stride, flt2, width, av1_selfguided_restoration(dat8, width, height, dat_stride, flt2, width,
bit_depth, sgr_params[ep].r2, sgr_params[ep].r2, sgr_params[ep].e2, tmpbuf2);
sgr_params[ep].e2, tmpbuf2);
#if CONFIG_AOM_HIGHBITDEPTH #if CONFIG_AOM_HIGHBITDEPTH
} }
#endif #endif
......
...@@ -68,8 +68,7 @@ class AV1SelfguidedFilterTest ...@@ -68,8 +68,7 @@ class AV1SelfguidedFilterTest
std::clock_t start = std::clock(); std::clock_t start = std::clock();
for (i = 0; i < NUM_ITERS; ++i) { for (i = 0; i < NUM_ITERS; ++i) {
apply_selfguided_restoration_c(input, w, h, w, 8, eps, xqd, output, w, apply_selfguided_restoration(input, w, h, w, eps, xqd, output, w, tmpbuf);
tmpbuf);
} }
std::clock_t end = std::clock(); std::clock_t end = std::clock();
double elapsed = ((end - start) / (double)CLOCKS_PER_SEC); double elapsed = ((end - start) / (double)CLOCKS_PER_SEC);
...@@ -113,9 +112,9 @@ class AV1SelfguidedFilterTest ...@@ -113,9 +112,9 @@ class AV1SelfguidedFilterTest
int test_w = w + 4 - (i / 9); int test_w = w + 4 - (i / 9);
int test_h = h + 4 - (i % 9); int test_h = h + 4 - (i % 9);
apply_selfguided_restoration(input, test_w, test_h, stride, 8, eps, xqd, apply_selfguided_restoration(input, test_w, test_h, stride, eps, xqd,
output, out_stride, tmpbuf); output, out_stride, tmpbuf);
apply_selfguided_restoration_c(input, test_w, test_h, stride, 8, eps, xqd, apply_selfguided_restoration_c(input, test_w, test_h, stride, eps, xqd,
output2, out_stride, tmpbuf); output2, out_stride, tmpbuf);
for (j = 0; j < test_h; ++j) for (j = 0; j < test_h; ++j)
for (k = 0; k < test_w; ++k) for (k = 0; k < test_w; ++k)
...@@ -139,4 +138,125 @@ INSTANTIATE_TEST_CASE_P(SSE4_1, AV1SelfguidedFilterTest, ...@@ -139,4 +138,125 @@ INSTANTIATE_TEST_CASE_P(SSE4_1, AV1SelfguidedFilterTest,
::testing::ValuesIn(params)); ::testing::ValuesIn(params));
#endif #endif
#if CONFIG_AOM_HIGHBITDEPTH
// Parameter type for the high-bitdepth tests: a single int, which the test
// bodies read via GET_PARAM(0) and use as the bit depth of the samples.
typedef tuple<int> HighbdFilterTestParam;
// Test fixture for the high-bitdepth self-guided restoration filter.
// The single test parameter (GET_PARAM(0)) is the bit depth of the samples.
class AV1HighbdSelfguidedFilterTest
    : public ::testing::TestWithParam<HighbdFilterTestParam> {
 public:
  virtual ~AV1HighbdSelfguidedFilterTest() {}
  virtual void SetUp() {}

  virtual void TearDown() { libaom_test::ClearSystemState(); }

 protected:
  // Calls apply_selfguided_restoration_highbd() NUM_ITERS times on one
  // 256x256 block of random samples and prints the total and per-block time.
  void RunSpeedTest() {
    const int w = 256, h = 256;
    const int NUM_ITERS = 2000;
    int i, j;
    int bit_depth = GET_PARAM(0);
    // Keeps only the low bit_depth bits of each random sample.
    int mask = (1 << bit_depth) - 1;

    // Allocate the scratch buffer first and verify it, so that a failed
    // allocation is reported instead of being passed to memset(), and so
    // that the early return of ASSERT_TRUE does not skip any cleanup.
    int32_t *tmpbuf = (int32_t *)aom_malloc(RESTORATION_TMPBUF_SIZE);
    ASSERT_TRUE(tmpbuf != NULL);
    memset(tmpbuf, 0, RESTORATION_TMPBUF_SIZE);

    uint16_t *input = new uint16_t[w * h];
    uint16_t *output = new uint16_t[w * h];

    ACMRandom rnd(ACMRandom::DeterministicSeed());

    for (i = 0; i < h; ++i)
      for (j = 0; j < w; ++j) input[i * w + j] = rnd.Rand16() & mask;

    int xqd[2] = {
      SGRPROJ_PRJ_MIN0 +
          rnd.PseudoUniform(SGRPROJ_PRJ_MAX0 + 1 - SGRPROJ_PRJ_MIN0),
      SGRPROJ_PRJ_MIN1 +
          rnd.PseudoUniform(SGRPROJ_PRJ_MAX1 + 1 - SGRPROJ_PRJ_MIN1)
    };
    // Fix a parameter set, since the speed depends slightly on r.
    // Change this to test different combinations of values of r.
    int eps = 15;

    av1_loop_restoration_precal();

    std::clock_t start = std::clock();
    for (i = 0; i < NUM_ITERS; ++i) {
      apply_selfguided_restoration_highbd(input, w, h, w, bit_depth, eps, xqd,
                                          output, w, tmpbuf);
    }
    std::clock_t end = std::clock();
    double elapsed = ((end - start) / (double)CLOCKS_PER_SEC);

    printf("%5d %dx%d blocks in %7.3fs = %7.3fus/block\n", NUM_ITERS, w, h,
           elapsed, elapsed * 1000000. / NUM_ITERS);

    aom_free(tmpbuf);
    delete[] input;
    delete[] output;
  }

  // Runs apply_selfguided_restoration_highbd() and its _c counterpart on the
  // same random input for NUM_ITERS tile sizes around 256x256, with random
  // xqd and eps, and asserts that both write identical output samples.
  void RunCorrectnessTest() {
    const int w = 256, h = 256, stride = 672, out_stride = 672;
    const int NUM_ITERS = 81;
    // Number of samples in the input buffer (same size as in the allocation
    // below).
    const int input_size = stride * (h + 16);
    int i, j, k;
    int bit_depth = GET_PARAM(0);
    // Keeps only the low bit_depth bits of each random sample.
    int mask = (1 << bit_depth) - 1;

    // Allocate the scratch buffer first and verify it, so that a failed
    // allocation is reported instead of being passed to memset(), and so
    // that the early return of ASSERT_TRUE does not skip any cleanup.
    int32_t *tmpbuf = (int32_t *)aom_malloc(RESTORATION_TMPBUF_SIZE);
    ASSERT_TRUE(tmpbuf != NULL);
    memset(tmpbuf, 0, RESTORATION_TMPBUF_SIZE);

    uint16_t *input = new uint16_t[input_size];
    uint16_t *output = new uint16_t[out_stride * (h + 16)];
    uint16_t *output2 = new uint16_t[out_stride * (h + 16)];

    ACMRandom rnd(ACMRandom::DeterministicSeed());

    av1_loop_restoration_precal();

    for (i = 0; i < NUM_ITERS; ++i) {
      // The tiles tested below are up to (w + 4) x (h + 4), i.e. larger than
      // w x h. Fill the whole buffer rather than just the top-left w x h
      // samples, so the filters never read indeterminate memory and every
      // sample they see fits in bit_depth bits.
      for (j = 0; j < input_size; ++j) input[j] = rnd.Rand16() & mask;

      int xqd[2] = {
        SGRPROJ_PRJ_MIN0 +
            rnd.PseudoUniform(SGRPROJ_PRJ_MAX0 + 1 - SGRPROJ_PRJ_MIN0),
        SGRPROJ_PRJ_MIN1 +
            rnd.PseudoUniform(SGRPROJ_PRJ_MAX1 + 1 - SGRPROJ_PRJ_MIN1)
      };
      int eps = rnd.PseudoUniform(1 << SGRPROJ_PARAMS_BITS);

      // Test various tile sizes around 256x256
      int test_w = w + 4 - (i / 9);
      int test_h = h + 4 - (i % 9);

      apply_selfguided_restoration_highbd(input, test_w, test_h, stride,
                                          bit_depth, eps, xqd, output,
                                          out_stride, tmpbuf);
      apply_selfguided_restoration_highbd_c(input, test_w, test_h, stride,
                                            bit_depth, eps, xqd, output2,
                                            out_stride, tmpbuf);
      // NOTE: ASSERT_EQ returns from this function on the first mismatch, so
      // the buffers below are not released when the test fails.
      for (j = 0; j < test_h; ++j)
        for (k = 0; k < test_w; ++k)
          ASSERT_EQ(output[j * out_stride + k], output2[j * out_stride + k]);
    }

    aom_free(tmpbuf);
    delete[] input;
    delete[] output;
    delete[] output2;
  }
};
// Each test body simply forwards to the corresponding fixture method.
TEST_P(AV1HighbdSelfguidedFilterTest, SpeedTest) { RunSpeedTest(); }
TEST_P(AV1HighbdSelfguidedFilterTest, CorrectnessTest) { RunCorrectnessTest(); }
// Bit depths the tests are run at: 8, 10 and 12.
const HighbdFilterTestParam highbd_params[] = { make_tuple(8), make_tuple(10),
make_tuple(12) };
// The tests are only instantiated when HAVE_SSE4_1 is set.
#if HAVE_SSE4_1
INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdSelfguidedFilterTest,
::testing::ValuesIn(highbd_params));
#endif
#endif
} // namespace } // namespace
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment