Commit 1a709944 authored by Debargha Mukherjee's avatar Debargha Mukherjee

Remove code for CONFIG_FAST_SGR=2 and cleanup

Change-Id: I01cecc829e2d57517427a1de6387e91ba3c64312
parent d051e560
...@@ -770,164 +770,7 @@ const int32_t one_by_x[MAX_NELEM] = { ...@@ -770,164 +770,7 @@ const int32_t one_by_x[MAX_NELEM] = {
293, 273, 256, 241, 228, 216, 205, 195, 186, 178, 171, 164, 293, 273, 256, 241, 228, 216, 205, 195, 186, 178, 171, 164,
}; };
#if CONFIG_FAST_SGR == 2 #if CONFIG_FAST_SGR
// Apply one pass of the "fast2" self-guided restoration filter to a
// processing unit of 32-bit samples.
//
// The guided filter statistics A[] (sum of squares) and B[] (sum) are
// computed with boxsum() over a (2r+1)x(2r+1) window, then converted in
// place into the per-pixel blend factor A[k] and offset B[k]. The "fast2"
// speed-up: the A/B conversion loop only visits every OTHER column
// (j += 2); odd output columns are reconstructed from wider horizontal
// taps in the output stage instead.
//
// dgd        - pointer to the (padded) degraded input samples; the caller
//              guarantees SGRPROJ_BORDER_VERT/HORZ rows/cols of valid
//              border around the width x height unit.
// dst        - output buffer (dst_stride elements per row).
// bit_depth  - source bit depth (8..12); used to renormalize sums so the
//              intermediate bounds below hold regardless of depth.
// r, eps     - self-guided filter radius and noise parameter.
static void av1_selfguided_restoration_fast2_internal(
    int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst,
    int dst_stride, int bit_depth, int r, int eps) {
  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
  // Adjusting the stride of A and B here appears to avoid bad cache effects,
  // leading to a significant speed improvement.
  // We also align the stride to a multiple of 16 bytes, for consistency
  // with the SIMD version of this function.
  int buf_stride = ((width_ext + 3) & ~3) + 16;
  int32_t A_[RESTORATION_PROC_UNIT_PELS];
  int32_t B_[RESTORATION_PROC_UNIT_PELS];
  int32_t *A = A_;
  int32_t *B = B_;
  int i, j;
  assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
  assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
         "Need SGRPROJ_BORDER_* >= r+1");
  // Box sums over the extended (bordered) area: B gets plain sums
  // (sqr = 0), A gets sums of squares (sqr = 1).
  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
         width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);
  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
         width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);
  // Re-point A/B at the (0, 0) pixel of the processing unit so that
  // negative offsets index into the border rows/cols computed above.
  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
  // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie,
  // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[].
  // Note the j += 2 step: only even columns (relative to j = -1) are
  // converted; the output stage below interpolates the skipped columns.
  for (i = -1; i < height + 1; ++i) {
    for (j = -1; j < width + 1; j += 2) {
      const int k = i * buf_stride + j;
      const int n = (2 * r + 1) * (2 * r + 1);
      // a < 2^16 * n < 2^22 regardless of bit depth
      uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
      // b < 2^8 * n < 2^14 regardless of bit depth
      uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);
      // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
      // and p itself satisfies p < 2^14 * n^2 < 2^26.
      // This bound on p is due to:
      // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
      //
      // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
      // This is an artefact of rounding, and can only happen if all pixels
      // are (almost) identical, so in this case we saturate to p=0.
      uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
      // Note: If MAX_RADIUS <= 2, then this 's' is a function only of
      // r and eps. Further, this is the only place we use 'eps', so we could
      // pre-calculate 's' for each parameter set and store that in place of
      // 'eps'.
      uint32_t s = sgrproj_mtable[eps - 1][n - 1];
      // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
      // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
      // (this holds even after accounting for the rounding in s)
      const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
      // Note: We have to be quite careful about the value of A[k].
      // This is used as a blend factor between individual pixel values and the
      // local mean. So it logically has a range of [0, 256], including both
      // endpoints.
      //
      // This is a pain for hardware, as we'd like something which can be stored
      // in exactly 8 bits.
      // Further, in the calculation of B[k] below, if z == 0 and r == 2,
      // then A[k] "should be" 0. But then we can end up setting B[k] to a value
      // slightly above 2^(8 + bit depth), due to rounding in the value of
      // one_by_x[25-1].
      //
      // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
      // This fixes the above issues (256 - A[k] fits in a uint8, and we can't
      // overflow), without significantly affecting the final result: z == 0
      // implies that the image is essentially "flat", so the local mean and
      // individual pixel values are very similar.
      //
      // Note that saturating on the other side, ie. requring A[k] <= 255,
      // would be a bad idea, as that corresponds to the case where the image
      // is very variable, when we want to preserve the local pixel value as
      // much as possible.
      A[k] = x_by_xplus1[AOMMIN(z, 255)];  // in range [1, 256]
      // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
      // one_by_x[n - 1] = round(2^12 / n)
      // => the product here is < 2^(20 + bit_depth) <= 2^32,
      // and B[k] is set to a value < 2^(8 + bit depth)
      // This holds even with the rounding in one_by_x and in the overall
      // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
      B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
                                             (uint32_t)B[k] *
                                             (uint32_t)one_by_x[n - 1],
                                         SGRPROJ_RECIP_BITS);
    }
  }
  // Use the A[] and B[] arrays to calculate the filtered image.
  // Because only every other column of A/B was computed above, even and
  // odd output columns use different tap patterns (and different
  // normalization shifts 'nb') built purely from the computed columns.
  for (i = 0; i < height; ++i) {
    const int width2 = width + (width & 1);
    for (j = 0; j < width2; j += 2) {
      {  // even col
        const int k = i * buf_stride + j;
        const int l = i * dgd_stride + j;
        const int m = i * dst_stride + j;
        const int nb = 5;
        // Cross-shaped kernel over the two neighboring computed columns
        // (j-1, j+1); weights 6 (same row) and 5 (rows above/below).
        const int32_t a = (A[k - 1] + A[k + 1]) * 6 +
                          (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
                           A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
                              5;
        const int32_t b = (B[k - 1] + B[k + 1]) * 6 +
                          (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
                           B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
                              5;
        const int32_t v = a * dgd[l] + b;
        dst[m] =
            ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
      }
      if (j + 1 < width - 1) {  // odd col and not last
        const int k = i * buf_stride + j + 1;
        const int l = i * dgd_stride + j + 1;
        const int m = i * dst_stride + j + 1;
        const int nb = 6;
        // Kernel centered on the computed column j+1, also drawing on the
        // computed columns two to either side (j-1, j+3).
        const int32_t a = A[k] * 16 +
                          (A[k - buf_stride] + A[k + buf_stride]) * 14 +
                          (A[k - 2] + A[k + 2]) * 4 +
                          (A[k - 2 - buf_stride] + A[k - 2 + buf_stride] +
                           A[k + 2 - buf_stride] + A[k + 2 + buf_stride]) *
                              3;
        const int32_t b = B[k] * 16 +
                          (B[k - buf_stride] + B[k + buf_stride]) * 14 +
                          (B[k - 2] + B[k + 2]) * 4 +
                          (B[k - 2 - buf_stride] + B[k - 2 + buf_stride] +
                           B[k + 2 - buf_stride] + B[k + 2 + buf_stride]) *
                              3;
        const int32_t v = a * dgd[l] + b;
        dst[m] =
            ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
      } else if (j + 1 < width) {  // odd col and last
        // No computed column to the right; fold its weight into the
        // center and left columns instead.
        const int k = i * buf_stride + j + 1;
        const int l = i * dgd_stride + j + 1;
        const int m = i * dst_stride + j + 1;
        const int nb = 6;
        const int32_t a =
            A[k] * 18 + (A[k - buf_stride] + A[k + buf_stride]) * 16 +
            A[k - 2] * 6 + (A[k - 2 - buf_stride] + A[k - 2 + buf_stride]) * 4;
        const int32_t b =
            B[k] * 18 + (B[k - buf_stride] + B[k + buf_stride]) * 16 +
            B[k - 2] * 6 + (B[k - 2 - buf_stride] + B[k - 2 + buf_stride]) * 4;
        const int32_t v = a * dgd[l] + b;
        dst[m] =
            ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
      }
    }
  }
}
#elif CONFIG_FAST_SGR == 1
static void av1_selfguided_restoration_fast_internal( static void av1_selfguided_restoration_fast_internal(
int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst, int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst,
int dst_stride, int bit_depth, int r, int eps) { int dst_stride, int bit_depth, int r, int eps) {
...@@ -1267,14 +1110,7 @@ void av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height, ...@@ -1267,14 +1110,7 @@ void av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
} }
} }
#if CONFIG_FAST_SGR == 2 #if CONFIG_FAST_SGR
av1_selfguided_restoration_fast2_internal(dgd32, width, height, dgd32_stride,
flt1, flt_stride, bit_depth,
params->r1, params->e1);
av1_selfguided_restoration_fast2_internal(dgd32, width, height, dgd32_stride,
flt2, flt_stride, bit_depth,
params->r2, params->e2);
#elif CONFIG_FAST_SGR == 1
// r == 2 filter // r == 2 filter
av1_selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride, av1_selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride,
flt1, flt_stride, bit_depth, flt1, flt_stride, bit_depth,
...@@ -1338,15 +1174,9 @@ static void sgrproj_filter_stripe(const RestorationUnitInfo *rui, ...@@ -1338,15 +1174,9 @@ static void sgrproj_filter_stripe(const RestorationUnitInfo *rui,
for (int j = 0; j < stripe_width; j += procunit_width) { for (int j = 0; j < stripe_width; j += procunit_width) {
int w = AOMMIN(procunit_width, stripe_width - j); int w = AOMMIN(procunit_width, stripe_width - j);
#if CONFIG_FAST_SGR == 2
apply_selfguided_restoration_c(src + j, w, stripe_height, src_stride,
rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
dst + j, dst_stride, tmpbuf, bit_depth, 0);
#else
apply_selfguided_restoration(src + j, w, stripe_height, src_stride, apply_selfguided_restoration(src + j, w, stripe_height, src_stride,
rui->sgrproj_info.ep, rui->sgrproj_info.xqd, rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
dst + j, dst_stride, tmpbuf, bit_depth, 0); dst + j, dst_stride, tmpbuf, bit_depth, 0);
#endif // CONFIG_FAST_SGR == 2
} }
} }
...@@ -1382,15 +1212,9 @@ static void sgrproj_filter_stripe_highbd(const RestorationUnitInfo *rui, ...@@ -1382,15 +1212,9 @@ static void sgrproj_filter_stripe_highbd(const RestorationUnitInfo *rui,
int32_t *tmpbuf, int bit_depth) { int32_t *tmpbuf, int bit_depth) {
for (int j = 0; j < stripe_width; j += procunit_width) { for (int j = 0; j < stripe_width; j += procunit_width) {
int w = AOMMIN(procunit_width, stripe_width - j); int w = AOMMIN(procunit_width, stripe_width - j);
#if CONFIG_FAST_SGR == 2
apply_selfguided_restoration_c(src8 + j, w, stripe_height, src_stride,
rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
dst8 + j, dst_stride, tmpbuf, bit_depth, 1);
#else
apply_selfguided_restoration(src8 + j, w, stripe_height, src_stride, apply_selfguided_restoration(src8 + j, w, stripe_height, src_stride,
rui->sgrproj_info.ep, rui->sgrproj_info.xqd, rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
dst8 + j, dst_stride, tmpbuf, bit_depth, 1); dst8 + j, dst_stride, tmpbuf, bit_depth, 1);
#endif // CONFIG_FAST_SGR == 2
} }
} }
......
...@@ -349,19 +349,6 @@ void encode_xq(int *xq, int *xqd) { ...@@ -349,19 +349,6 @@ void encode_xq(int *xq, int *xqd) {
xqd[1] = clamp(xqd[1], SGRPROJ_PRJ_MIN1, SGRPROJ_PRJ_MAX1); xqd[1] = clamp(xqd[1], SGRPROJ_PRJ_MIN1, SGRPROJ_PRJ_MAX1);
} }
// Run self-guided restoration over one block of the stripe, producing the
// two candidate filtered planes flt1/flt2 for the subsequent projection.
// When CONFIG_FAST_SGR == 2 the plain C implementation is called directly
// (bypassing RTCD dispatch); otherwise av1_selfguided_restoration resolves
// to the best available (possibly SIMD) implementation.
static void sgr_filter_block(const sgr_params_type *params, const uint8_t *dat8,
                             int width, int height, int dat_stride,
                             int use_highbd, int bit_depth, int32_t *flt1,
                             int32_t *flt2, int flt_stride) {
#if CONFIG_FAST_SGR == 2
  av1_selfguided_restoration_c(dat8, width, height, dat_stride, flt1, flt2,
                               flt_stride, params, bit_depth, use_highbd);
#else
  av1_selfguided_restoration(dat8, width, height, dat_stride, flt1, flt2,
                             flt_stride, params, bit_depth, use_highbd);
#endif  // CONFIG_FAST_SGR == 2
}
// Apply the self-guided filter across an entire restoration unit. // Apply the self-guided filter across an entire restoration unit.
static void apply_sgr(const sgr_params_type *params, const uint8_t *dat8, static void apply_sgr(const sgr_params_type *params, const uint8_t *dat8,
int width, int height, int dat_stride, int use_highbd, int width, int height, int dat_stride, int use_highbd,
...@@ -376,8 +363,9 @@ static void apply_sgr(const sgr_params_type *params, const uint8_t *dat8, ...@@ -376,8 +363,9 @@ static void apply_sgr(const sgr_params_type *params, const uint8_t *dat8,
// Iterate over the stripe in blocks of width pu_width // Iterate over the stripe in blocks of width pu_width
for (int j = 0; j < width; j += pu_width) { for (int j = 0; j < width; j += pu_width) {
const int w = AOMMIN(pu_width, width - j); const int w = AOMMIN(pu_width, width - j);
sgr_filter_block(params, dat8_row + j, w, h, dat_stride, use_highbd, av1_selfguided_restoration(dat8_row + j, w, h, dat_stride, flt1_row + j,
bit_depth, flt1_row + j, flt2_row + j, flt_stride); flt2_row + j, flt_stride, params, bit_depth,
use_highbd);
} }
} }
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment