Commit 127b562a authored by Debargha Mukherjee

Adjust last odd row weight in fast_sgr

Change-Id: I2348a7c6a3553bbbb0d061820a7c546a1a0367df
parent 6cd8e177
......@@ -916,12 +916,12 @@ static void av1_selfguided_restoration_fast_internal(
const int m = i * dst_stride + j;
const int nb = 6;
const int buf_stride2 = 2 * buf_stride;
const int32_t a = A[k] * 16 + (A[k - 1] + A[k + 1]) * 14 +
A[k - buf_stride2] * 8 +
(A[k - 1 - buf_stride2] + A[k + 1 - buf_stride2]) * 6;
const int32_t b = B[k] * 16 + (B[k - 1] + B[k + 1]) * 14 +
B[k - buf_stride2] * 8 +
(B[k - 1 - buf_stride2] + B[k + 1 - buf_stride2]) * 6;
const int32_t a = A[k] * 18 + (A[k - 1] + A[k + 1]) * 16 +
A[k - buf_stride2] * 6 +
(A[k - 1 - buf_stride2] + A[k + 1 - buf_stride2]) * 4;
const int32_t b = B[k] * 18 + (B[k - 1] + B[k + 1]) * 16 +
B[k - buf_stride2] * 6 +
(B[k - 1 - buf_stride2] + B[k + 1 - buf_stride2]) * 4;
const int32_t v = a * dgd[l] + b;
dst[m] =
ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
......
......@@ -499,21 +499,21 @@ static __m256i cross_sum_fast_odd_not_last(const int32_t *buf, int stride) {
// xl x xr
//
// Pixels are weighted like this:
// 6 8 6
// 4 6 4
// 0 0 0
// 14 16 14
// 16 18 16
//
// buf points to x
//
// sixes = xtl + xtr
// eights = xt
// fourteens = xl + xr
// sixteens = x
// cross_sum = 6 * sixes + 8 * eights + 14 * fourteens + 16 * sixteens
// = 8 * (sixes + eights) + 16 * (sixteens + fourteens)
// - 2 * (sixes + fourteens)
// = (sixes + eights) << 3 + (sixteens + fourteens) << 4
// - (sixes + fourteens) << 1
// fours = xtl + xtr
// sixes = xt
// sixteens = xl + xr
// eighteens = x
// cross_sum = 4 * fours + 6 * sixes + 16 * sixteens + 18 * eighteens
// = 4 * (fours + sixes) + 16 * (sixteens + eighteens)
// + 2 * (sixes + eighteens)
// = ((fours + sixes) << 2) + ((sixteens + eighteens) << 4)
// + ((sixes + eighteens) << 1)
static __m256i cross_sum_fast_odd_last(const int32_t *buf, int stride) {
const int two_stride = 2 * stride;
const __m256i xtl = yy_loadu_256(buf - 1 - two_stride);
......@@ -523,19 +523,19 @@ static __m256i cross_sum_fast_odd_last(const int32_t *buf, int stride) {
const __m256i x = yy_loadu_256(buf);
const __m256i xr = yy_loadu_256(buf + 1);
const __m256i sixes = _mm256_add_epi32(xtl, xtr);
const __m256i eights = xt;
const __m256i fourteens = _mm256_add_epi32(xl, xr);
const __m256i sixteens = x;
const __m256i fours = _mm256_add_epi32(xtl, xtr);
const __m256i sixes = xt;
const __m256i sixteens = _mm256_add_epi32(xl, xr);
const __m256i eighteens = x;
const __m256i sixes_plus_eights = _mm256_add_epi32(sixes, eights);
const __m256i sixteens_plus_fourteens = _mm256_add_epi32(sixteens, fourteens);
const __m256i sixes_plus_fourteens = _mm256_add_epi32(sixes, fourteens);
const __m256i fours_plus_sixes = _mm256_add_epi32(fours, sixes);
const __m256i sixteens_plus_eighteens = _mm256_add_epi32(sixteens, eighteens);
const __m256i sixes_plus_eighteens = _mm256_add_epi32(sixes, eighteens);
return _mm256_sub_epi32(
_mm256_add_epi32(_mm256_slli_epi32(sixes_plus_eights, 3),
_mm256_slli_epi32(sixteens_plus_fourteens, 4)),
_mm256_slli_epi32(sixes_plus_fourteens, 1));
return _mm256_add_epi32(
_mm256_add_epi32(_mm256_slli_epi32(fours_plus_sixes, 2),
_mm256_slli_epi32(sixteens_plus_eighteens, 4)),
_mm256_slli_epi32(sixes_plus_eighteens, 1));
}
// The final filter for selfguided restoration. Computes a weighted average
......
......@@ -462,21 +462,21 @@ static __m128i cross_sum_fast_odd_not_last(const int32_t *buf, int stride) {
// xl x xr
//
// Pixels are weighted like this:
// 6 8 6
// 4 6 4
// 0 0 0
// 14 16 14
// 16 18 16
//
// buf points to x
//
// sixes = xtl + xtr
// eights = xt
// fourteens = xl + xr
// sixteens = x
// cross_sum = 6 * sixes + 8 * eights + 14 * fourteens + 16 * sixteens
// = 8 * (sixes + eights) + 16 * (sixteens + fourteens)
// - 2 * (sixes + fourteens)
// = (sixes + eights) << 3 + (sixteens + fourteens) << 4
// - (sixes + fourteens) << 1
// fours = xtl + xtr
// sixes = xt
// sixteens = xl + xr
// eighteens = x
// cross_sum = 4 * fours + 6 * sixes + 16 * sixteens + 18 * eighteens
// = 4 * (fours + sixes) + 16 * (sixteens + eighteens)
// + 2 * (sixes + eighteens)
// = ((fours + sixes) << 2) + ((sixteens + eighteens) << 4)
// + ((sixes + eighteens) << 1)
static __m128i cross_sum_fast_odd_last(const int32_t *buf, int stride) {
const int two_stride = 2 * stride;
const __m128i xtl = xx_loadu_128(buf - 1 - two_stride);
......@@ -486,19 +486,19 @@ static __m128i cross_sum_fast_odd_last(const int32_t *buf, int stride) {
const __m128i x = xx_loadu_128(buf);
const __m128i xr = xx_loadu_128(buf + 1);
const __m128i sixes = _mm_add_epi32(xtl, xtr);
const __m128i eights = xt;
const __m128i fourteens = _mm_add_epi32(xl, xr);
const __m128i sixteens = x;
const __m128i fours = _mm_add_epi32(xtl, xtr);
const __m128i sixes = xt;
const __m128i sixteens = _mm_add_epi32(xl, xr);
const __m128i eighteens = x;
const __m128i sixes_plus_eights = _mm_add_epi32(sixes, eights);
const __m128i sixteens_plus_fourteens = _mm_add_epi32(sixteens, fourteens);
const __m128i sixes_plus_fourteens = _mm_add_epi32(sixes, fourteens);
const __m128i fours_plus_sixes = _mm_add_epi32(fours, sixes);
const __m128i sixteens_plus_eighteens = _mm_add_epi32(sixteens, eighteens);
const __m128i sixes_plus_eighteens = _mm_add_epi32(sixes, eighteens);
return _mm_sub_epi32(
_mm_add_epi32(_mm_slli_epi32(sixes_plus_eights, 3),
_mm_slli_epi32(sixteens_plus_fourteens, 4)),
_mm_slli_epi32(sixes_plus_fourteens, 1));
return _mm_add_epi32(
_mm_add_epi32(_mm_slli_epi32(fours_plus_sixes, 2),
_mm_slli_epi32(sixteens_plus_eighteens, 4)),
_mm_slli_epi32(sixes_plus_eighteens, 1));
}
// The final filter for selfguided restoration. Computes a weighted average
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment