Commit 7e08ac3f authored by David Barker, committed by Debargha Mukherjee

Fix two bugs in highbitdepth self-guided filter

This filter was temporarily removed due to test failures.
This patch reintroduces the filter and fixes two bugs:

* The test cases would occasionally segfault on x86, since
  the highbd filter requires its inputs to be aligned to
  16 bytes. This will always be true when used on real videos,
  so adjust the test cases to match.

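* The function calc_block was incorrect for bit_depth > 8,
  due to passing an incorrect argument to _mm_srl_epi32():
  the shift count is read from the low 64 bits of the count
  operand, so it must be built with _mm_set_epi64x(0, n)
  rather than _mm_set1_epi32(n).
  This was the cause of the original test failures.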

BUG=aomedia:392

Change-Id: Ia06b76c3e6122eebadd0995fb62f32c2fcab8b3e
parent bb9c73b0
......@@ -791,7 +791,7 @@ if (aom_config("CONFIG_LOOP_RESTORATION") eq "yes") {
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
add_proto qw/void apply_selfguided_restoration_highbd/, "uint16_t *dat, int width, int height, int stride, int bit_depth, int eps, int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf";
specialize qw/apply_selfguided_restoration_highbd /;
specialize qw/apply_selfguided_restoration_highbd sse4_1/;
add_proto qw/void av1_selfguided_restoration_highbd/, "uint16_t *dgd, int width, int height, int stride, int32_t *dst, int dst_stride, int bit_depth, int r, int eps, int32_t *tmpbuf";
specialize qw/av1_selfguided_restoration_highbd sse4_1/;
......
......@@ -16,10 +16,10 @@ static void calc_block(__m128i sum, __m128i sum_sq, __m128i n,
if (bit_depth > 8) {
__m128i rounding_a = _mm_set1_epi32((1 << (2 * (bit_depth - 8))) >> 1);
__m128i rounding_b = _mm_set1_epi32((1 << (bit_depth - 8)) >> 1);
a = _mm_srl_epi32(_mm_add_epi32(sum_sq, rounding_a),
_mm_set1_epi32(2 * (bit_depth - 8)));
b = _mm_srl_epi32(_mm_add_epi32(sum, rounding_b),
_mm_set1_epi32(bit_depth - 8));
__m128i shift_a = _mm_set_epi64x(0, 2 * (bit_depth - 8));
__m128i shift_b = _mm_set_epi64x(0, bit_depth - 8);
a = _mm_srl_epi32(_mm_add_epi32(sum_sq, rounding_a), shift_a);
b = _mm_srl_epi32(_mm_add_epi32(sum, rounding_b), shift_b);
a = _mm_mullo_epi32(a, n);
b = _mm_mullo_epi32(b, b);
p = _mm_sub_epi32(_mm_max_epi32(a, b), b);
......@@ -1719,4 +1719,87 @@ void av1_highpass_filter_highbd_sse4_1(uint16_t *dgd, int width, int height,
}
}
/*
 * Self-guided restoration with projection for high-bit-depth tiles
 * (SSE4.1 version).
 *
 * Two filtered versions of the source tile are computed into scratch
 * buffers, then every output pixel is formed as
 *   u + xq[0] * (flt1 - u) + xq[1] * (flt2 - u)
 * in fixed point (u is the source pixel scaled by 2^SGRPROJ_RST_BITS),
 * rounded and clamped to [0, 2^bit_depth).
 *
 * dat, stride      Source pixels and row stride (in uint16_t units).
 * width, height    Tile size; width * height must not exceed
 *                  RESTORATION_TILEPELS_MAX.
 * bit_depth        Pixel bit depth used for the final clamp.
 * eps              Index into sgr_params[] selecting filter parameters.
 * xqd              Projection coefficients, decoded with decode_xq().
 * dst, dst_stride  Destination pixels and row stride.
 * tmpbuf           Scratch memory: the first two regions of
 *                  RESTORATION_TILEPELS_MAX int32_t each hold the filtered
 *                  tiles; the remainder is handed to the filter routines.
 *
 * The 8-pixel vector path uses aligned 128-bit loads from dat and aligned
 * 128-bit stores to dst, so &dat[i * stride + 8 * n] and
 * &dst[i * dst_stride + 8 * n] must be 16-byte aligned for every row i.
 */
void apply_selfguided_restoration_highbd_sse4_1(
    uint16_t *dat, int width, int height, int stride, int bit_depth, int eps,
    int *xqd, uint16_t *dst, int dst_stride, int32_t *tmpbuf) {
  int xq[2];
  int32_t *flt1 = tmpbuf;
  int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
  int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX;
  int i, j;
  assert(width * height <= RESTORATION_TILEPELS_MAX);
#if USE_HIGHPASS_IN_SGRPROJ
  av1_highpass_filter_highbd_sse4_1(dat, width, height, stride, flt1, width,
                                    sgr_params[eps].corner,
                                    sgr_params[eps].edge);
#else
  av1_selfguided_restoration_highbd_sse4_1(dat, width, height, stride, flt1,
                                           width, bit_depth, sgr_params[eps].r1,
                                           sgr_params[eps].e1, tmpbuf2);
#endif  // USE_HIGHPASS_IN_SGRPROJ
  av1_selfguided_restoration_highbd_sse4_1(dat, width, height, stride, flt2,
                                           width, bit_depth, sgr_params[eps].r2,
                                           sgr_params[eps].e2, tmpbuf2);
  decode_xq(xqd, xq);

  // Loop-invariant vector constants.
  const __m128i xq0 = _mm_set1_epi32(xq[0]);
  const __m128i xq1 = _mm_set1_epi32(xq[1]);
  const __m128i rounding =
      _mm_set1_epi32((1 << (SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS)) >> 1);
  const __m128i max = _mm_set1_epi16((1 << bit_depth) - 1);

  for (i = 0; i < height; ++i) {
    // Calculate output in batches of 8 pixels. Only full batches are
    // processed here: the bound must be "j + 8 <= width", otherwise the
    // last batch would read and write past the end of the row and the
    // scalar tail loop below could never run.
    for (j = 0; j + 8 <= width; j += 8) {
      const int k = i * width + j;       // index into flt1 / flt2
      const int l = i * stride + j;      // index into dat
      const int m = i * dst_stride + j;  // index into dst
      // Aligned load of 8 source pixels, scaled to filter precision.
      const __m128i src =
          _mm_slli_epi16(_mm_load_si128((__m128i *)&dat[l]), SGRPROJ_RST_BITS);
      // Widen the low and high four pixels to 32 bits.
      const __m128i u_0 = _mm_cvtepu16_epi32(src);
      const __m128i u_1 = _mm_cvtepu16_epi32(_mm_srli_si128(src, 8));
      const __m128i f1_0 =
          _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt1[k]), u_0);
      const __m128i f2_0 =
          _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt2[k]), u_0);
      const __m128i f1_1 =
          _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt1[k + 4]), u_1);
      const __m128i f2_1 =
          _mm_sub_epi32(_mm_loadu_si128((__m128i *)&flt2[k + 4]), u_1);
      const __m128i v_0 = _mm_add_epi32(
          _mm_add_epi32(_mm_mullo_epi32(xq0, f1_0), _mm_mullo_epi32(xq1, f2_0)),
          _mm_slli_epi32(u_0, SGRPROJ_PRJ_BITS));
      const __m128i v_1 = _mm_add_epi32(
          _mm_add_epi32(_mm_mullo_epi32(xq0, f1_1), _mm_mullo_epi32(xq1, f2_1)),
          _mm_slli_epi32(u_1, SGRPROJ_PRJ_BITS));
      const __m128i w_0 = _mm_srai_epi32(_mm_add_epi32(v_0, rounding),
                                         SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
      const __m128i w_1 = _mm_srai_epi32(_mm_add_epi32(v_1, rounding),
                                         SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
      // Pack into 16 bits and clamp to [0, 2^bit_depth). The pack saturates
      // negatives to 0; the signed 16-bit min then applies the upper bound
      // (correct as long as the packed values stay below 2^15).
      const __m128i tmp = _mm_packus_epi32(w_0, w_1);
      const __m128i res = _mm_min_epi16(tmp, max);
      _mm_store_si128((__m128i *)&dst[m], res);
    }
    // Process leftover pixels (width not a multiple of 8) with scalar code.
    for (; j < width; ++j) {
      const int k = i * width + j;
      const int l = i * stride + j;
      const int m = i * dst_stride + j;
      const int32_t u = ((int32_t)dat[l] << SGRPROJ_RST_BITS);
      const int32_t f1 = (int32_t)flt1[k] - u;
      const int32_t f2 = (int32_t)flt2[k] - u;
      const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS);
      const int16_t w =
          (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
      dst[m] = (uint16_t)clip_pixel_highbd(w, bit_depth);
    }
  }
}
#endif
......@@ -44,8 +44,8 @@ class AV1SelfguidedFilterTest
const int NUM_ITERS = 2000;
int i, j;
uint8_t *input = new uint8_t[w * h];
uint8_t *output = new uint8_t[w * h];
uint8_t *input = (uint8_t *)aom_memalign(16, w * h * sizeof(uint8_t));
uint8_t *output = (uint8_t *)aom_memalign(16, w * h * sizeof(uint8_t));
int32_t *tmpbuf = (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE);
memset(tmpbuf, 0, RESTORATION_TMPBUF_SIZE);
......@@ -76,9 +76,9 @@ class AV1SelfguidedFilterTest
printf("%5d %dx%d blocks in %7.3fs = %7.3fus/block\n", NUM_ITERS, w, h,
elapsed, elapsed * 1000000. / NUM_ITERS);
aom_free(input);
aom_free(output);
aom_free(tmpbuf);
delete[] input;
delete[] output;
}
void RunCorrectnessTest() {
......@@ -89,9 +89,12 @@ class AV1SelfguidedFilterTest
const int NUM_ITERS = 81;
int i, j, k;
uint8_t *input = new uint8_t[stride * max_h];
uint8_t *output = new uint8_t[out_stride * max_h];
uint8_t *output2 = new uint8_t[out_stride * max_h];
uint8_t *input =
(uint8_t *)aom_memalign(16, stride * max_h * sizeof(uint8_t));
uint8_t *output =
(uint8_t *)aom_memalign(16, out_stride * max_h * sizeof(uint8_t));
uint8_t *output2 =
(uint8_t *)aom_memalign(16, out_stride * max_h * sizeof(uint8_t));
int32_t *tmpbuf = (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE);
memset(tmpbuf, 0, RESTORATION_TMPBUF_SIZE);
......@@ -124,10 +127,10 @@ class AV1SelfguidedFilterTest
ASSERT_EQ(output[j * out_stride + k], output2[j * out_stride + k]);
}
aom_free(input);
aom_free(output);
aom_free(output2);
aom_free(tmpbuf);
delete[] input;
delete[] output;
delete[] output2;
}
};
......@@ -161,8 +164,8 @@ class AV1HighbdSelfguidedFilterTest
int bit_depth = GET_PARAM(0);
int mask = (1 << bit_depth) - 1;
uint16_t *input = new uint16_t[w * h];
uint16_t *output = new uint16_t[w * h];
uint16_t *input = (uint16_t *)aom_memalign(16, w * h * sizeof(uint16_t));
uint16_t *output = (uint16_t *)aom_memalign(16, w * h * sizeof(uint16_t));
int32_t *tmpbuf = (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE);
memset(tmpbuf, 0, RESTORATION_TMPBUF_SIZE);
......@@ -194,9 +197,9 @@ class AV1HighbdSelfguidedFilterTest
printf("%5d %dx%d blocks in %7.3fs = %7.3fus/block\n", NUM_ITERS, w, h,
elapsed, elapsed * 1000000. / NUM_ITERS);
aom_free(input);
aom_free(output);
aom_free(tmpbuf);
delete[] input;
delete[] output;
}
void RunCorrectnessTest() {
......@@ -209,9 +212,12 @@ class AV1HighbdSelfguidedFilterTest
int bit_depth = GET_PARAM(0);
int mask = (1 << bit_depth) - 1;
uint16_t *input = new uint16_t[stride * max_h];
uint16_t *output = new uint16_t[out_stride * max_h];
uint16_t *output2 = new uint16_t[out_stride * max_h];
uint16_t *input =
(uint16_t *)aom_memalign(16, stride * max_h * sizeof(uint16_t));
uint16_t *output =
(uint16_t *)aom_memalign(16, out_stride * max_h * sizeof(uint16_t));
uint16_t *output2 =
(uint16_t *)aom_memalign(16, out_stride * max_h * sizeof(uint16_t));
int32_t *tmpbuf = (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE);
memset(tmpbuf, 0, RESTORATION_TMPBUF_SIZE);
......@@ -246,10 +252,10 @@ class AV1HighbdSelfguidedFilterTest
ASSERT_EQ(output[j * out_stride + k], output2[j * out_stride + k]);
}
aom_free(input);
aom_free(output);
aom_free(output2);
aom_free(tmpbuf);
delete[] input;
delete[] output;
delete[] output2;
}
};
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment