Commit 9198d135 authored by David Barker's avatar David Barker Committed by Debargha Mukherjee

Rearrange self-guided filter for vectorization

By rearranging the order of operations, we can ensure that all
intermediate values fit into 32 bits. This will help when we
vectorize the self-guided filter.

Results in the noise range.

Change-Id: Ic0c73613882bd103c4e8e57a0155b3132672ae04
parent c5958db4
......@@ -122,8 +122,8 @@ static void GenDomainTxfmRFVtable() {
#define MAX_RADIUS 3 // Only 1, 2, 3 allowed
#define MAX_EPS 80 // Max value of eps
#define MAX_NELEM ((2 * MAX_RADIUS + 1) * (2 * MAX_RADIUS + 1))
#define SGRPROJ_MTABLE_BITS 30
#define SGRPROJ_RECIP_BITS 16
#define SGRPROJ_MTABLE_BITS 20
#define SGRPROJ_RECIP_BITS 12
// TODO(debargha): This table can be substantially reduced since only a few
// values are actually used.
......@@ -604,11 +604,10 @@ static const uint16_t x_by_xplus1[256] = {
};
static const uint16_t one_by_x[MAX_NELEM] = {
65535, 32768, 21845, 16384, 13107, 10923, 9362, 8192, 7282, 6554,
5958, 5461, 5041, 4681, 4369, 4096, 3855, 3641, 3449, 3277,
3121, 2979, 2849, 2731, 2621, 2521, 2427, 2341, 2260, 2185,
2114, 2048, 1986, 1928, 1872, 1820, 1771, 1725, 1680, 1638,
1598, 1560, 1524, 1489, 1456, 1425, 1394, 1365, 1337,
4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315,
293, 273, 256, 241, 228, 216, 205, 195, 186, 178, 171, 164, 158,
152, 146, 141, 137, 132, 128, 124, 120, 117, 114, 111, 108, 105,
102, 100, 98, 95, 93, 91, 89, 87, 85, 84
};
#endif // APPROXIMATE_SGR
......@@ -619,7 +618,6 @@ void av1_selfguided_restoration(int32_t *dgd, int width, int height, int stride,
int32_t *B = A + RESTORATION_TILEPELS_MAX;
int8_t num[RESTORATION_TILEPELS_MAX];
int i, j;
eps <<= 2 * (bit_depth - 8);
// Don't filter tiles with dimensions < 5 on any axis
if ((width < 5) || (height < 5)) return;
......@@ -632,30 +630,37 @@ void av1_selfguided_restoration(int32_t *dgd, int width, int height, int stride,
for (j = 0; j < width; ++j) {
const int k = i * width + j;
const int n = num[k];
// Assuming that we only allow up to 12-bit depth and r <= 2,
// we calculate p = n^2 * Var(n-pixel block of original image)
// (where n = 2 * r + 1 <= 5).
//
// There is an inequality which gives a bound on the variance:
// https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
// In this case, since each pixel is in the range [0, 2^12),
// the variance is at most 1/4 * (2^12)^2 = 2^22.
// Then p <= 25^2 * 2^22 < 2^32, and also q <= p + 25^2 * 68 < 2^32.
//
// The point of all this is to guarantee that q < 2^32, so that
// platforms with a 64-bit by 32-bit divide unit (eg, x86)
// can do the division by q more efficiently.
const uint32_t p = (uint32_t)((uint64_t)A[k] * n - (uint64_t)B[k] * B[k]);
#if APPROXIMATE_SGR
const int s = sgrproj_mtable[eps - 1][n - 1];
const int z =
((int64_t)s * (int64_t)p + (1 << (SGRPROJ_MTABLE_BITS - 1))) >>
SGRPROJ_MTABLE_BITS;
A[k] = x_by_xplus1[AOMMIN(z, 255)];
B[k] = ((SGRPROJ_SGR - A[k]) * (int64_t)B[k] * (int64_t)one_by_x[n - 1] +
(1 << (SGRPROJ_RECIP_BITS - 1))) >>
SGRPROJ_RECIP_BITS;
// a < 2^16 * n < 2^22 regardless of bit depth
uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
// b < 2^8 * n < 2^14 regardless of bit depth
uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);
// Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
// and p itself satisfies p < 2^14 * n^2 < 2^26.
// Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
// This is an artefact of rounding, and can only happen if all pixels
// are (almost) identical, so in this case we saturate to p=0.
uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
uint32_t s = sgrproj_mtable[eps - 1][n - 1];
// p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
// as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
// (this holds even after accounting for the rounding in s)
const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
A[k] = x_by_xplus1[AOMMIN(z, 255)]; // < 2^8
// SGRPROJ_SGR - A[k] < 2^8, B[k] < 2^(bit_depth) * n,
// one_by_x[n - 1] = round(2^12 / n)
// => the product here is < 2^(20 + bit_depth) <= 2^32,
// and B[k] is set to a value < 2^(8 + bit depth)
B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
(uint32_t)B[k] *
(uint32_t)one_by_x[n - 1],
SGRPROJ_RECIP_BITS);
#else
const uint32_t p = (uint32_t)((uint64_t)A[k] * n - (uint64_t)B[k] * B[k]);
const uint32_t q = (uint32_t)(p + n * n * eps);
assert((uint64_t)A[k] * n - (uint64_t)B[k] * B[k] < (25 * 25U << 22));
A[k] = (int32_t)(((uint64_t)p << SGRPROJ_SGR_BITS) + (q >> 1)) / q;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment