Commit 5765fad5 authored by David Barker

Handle non-multiple-of-4 widths in SSE4.1 self-guided filter

Adjust the vectorized filter so that it can handle tile widths
which are not a multiple of 4, so we do not have to fall back
to the C version of the filter.

This has a negligible speed impact for tiles whose width is a
multiple of 4, and greatly improves speed for tiles with
non-multiple-of-4 widths.

Change-Id: Iae9d14f812c52c6f66910d27da1d8e98930df7ba
parent e2e591c7
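
The structural change shared by all three filter radii, stripped of the
intrinsics, is the loop shape sketched below. This is an illustrative scalar
outline only, with assumed names (filter_row_in_groups_of_4 and do_4_cols are
not part of the commit): the column loop runs over the width rounded up to a
multiple of 4, and when the width is not a multiple of 4 the last in-range
group of 4 columns is recomputed at j = width - 4.

// Illustrative sketch, not the committed SIMD code. 'do_4_cols' stands in
// for the vectorized work done on columns j .. j+3 of one row.
static void filter_row_in_groups_of_4(int width,
                                      void (*do_4_cols)(int j, void *ctx),
                                      void *ctx) {
  // Round the width up to a multiple of 4. The buffers are padded to a
  // multiple-of-4 stride, so the extra columns are safe to touch.
  const int width_extend = (width + 3) & ~3;
  int j;
  for (j = 0; j < width_extend; j += 4) do_4_cols(j, ctx);
  // If the width was not a multiple of 4, the last group may have produced
  // throw-away values; redo the final 4 in-range columns at width - 4.
  if (width % 4) do_4_cols(width - 4, ctx);
}
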
@@ -41,7 +41,7 @@ static void calc_block(__m128i sum, __m128i sum_sq, __m128i n,
                                 x_by_xplus1[_mm_extract_epi32(z, 1)],
                                 x_by_xplus1[_mm_extract_epi32(z, 0)]);
-  _mm_store_si128((__m128i *)&A[idx], a_res);
+  _mm_storeu_si128((__m128i *)&A[idx], a_res);
 
   __m128i rounding_res = _mm_set1_epi32((1 << SGRPROJ_RECIP_BITS) >> 1);
   __m128i a_complement = _mm_sub_epi32(_mm_set1_epi32(SGRPROJ_SGR), a_res);
@@ -50,7 +50,7 @@ static void calc_block(__m128i sum, __m128i sum_sq, __m128i n,
   __m128i b_res =
       _mm_srli_epi32(_mm_add_epi32(b_int, rounding_res), SGRPROJ_RECIP_BITS);
-  _mm_store_si128((__m128i *)&B[idx], b_res);
+  _mm_storeu_si128((__m128i *)&B[idx], b_res);
 }
 
 static void selfguided_restoration_1(uint8_t *src, int width, int height,
@@ -59,8 +59,11 @@ static void selfguided_restoration_1(uint8_t *src, int width, int height,
   int i, j;
 
   // Vertical sum
-  assert(!(width & 3));
-  for (j = 0; j < width; j += 4) {
+  // When the width is not a multiple of 4, we know that 'stride' is rounded up
+  // to a multiple of 4. So it is safe for this loop to calculate extra columns
+  // at the right-hand edge of the frame.
+  int width_extend = (width + 3) & ~3;
+  for (j = 0; j < width_extend; j += 4) {
     __m128i a, b, x, y, x2, y2;
     __m128i sum, sum_sq, tmp;
@@ -140,12 +143,20 @@ static void selfguided_restoration_1(uint8_t *src, int width, int height,
     s = _mm_set1_epi32(sgrproj_mtable[eps - 1][3 * h - 1]);
 
     // Re-align a1 and b1 so that they start at index i * buf_stride + 3
-    a1 = _mm_alignr_epi8(a2, a1, 12);
-    b1 = _mm_alignr_epi8(b2, b1, 12);
-    a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + 7]);
-    b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + 7]);
+    a2 = _mm_alignr_epi8(a2, a1, 12);
+    b2 = _mm_alignr_epi8(b2, b1, 12);
 
-    for (j = 4; j < width - 4; j += 4) {
+    // Note: When the width is not a multiple of 4, this loop may end up
+    // writing to the last 4 columns of the frame, potentially with incorrect
+    // values (especially for r=2 and r=3).
+    // This is fine, since we fix up those values in the block after this
+    // loop, and in exchange we never have more than four values to
+    // write / fix up after this loop finishes.
+    for (j = 4; j < width_extend - 4; j += 4) {
+      a1 = a2;
+      b1 = b2;
+      a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 3]);
+      b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 3]);
       /* Loop invariant: At this point,
          a1 = original A[i * buf_stride + j - 1 : i * buf_stride + j + 3]
          a2 = original A[i * buf_stride + j + 3 : i * buf_stride + j + 7]
@@ -157,12 +168,38 @@ static void selfguided_restoration_1(uint8_t *src, int width, int height,
                                        _mm_alignr_epi8(a2, a1, 8)));
       calc_block(sum_, sum_sq_, n, one_over_n, s, bit_depth, i * buf_stride + j,
                  A, B);
 
-      a1 = a2;
-      a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 7]);
-      b1 = b2;
-      b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 7]);
     }
+    __m128i a3 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 3]);
+    __m128i b3 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 3]);
+    j = width - 4;
+    switch (width % 4) {
+      case 0:
+        a1 = a2;
+        b1 = b2;
+        a2 = a3;
+        b2 = b3;
+        break;
+      case 1:
+        a1 = _mm_alignr_epi8(a2, a1, 4);
+        b1 = _mm_alignr_epi8(b2, b1, 4);
+        a2 = _mm_alignr_epi8(a3, a2, 4);
+        b2 = _mm_alignr_epi8(b3, b2, 4);
+        break;
+      case 2:
+        a1 = _mm_alignr_epi8(a2, a1, 8);
+        b1 = _mm_alignr_epi8(b2, b1, 8);
+        a2 = _mm_alignr_epi8(a3, a2, 8);
+        b2 = _mm_alignr_epi8(b3, b2, 8);
+        break;
+      case 3:
+        a1 = _mm_alignr_epi8(a2, a1, 12);
+        b1 = _mm_alignr_epi8(b2, b1, 12);
+        a2 = _mm_alignr_epi8(a3, a2, 12);
+        b2 = _mm_alignr_epi8(b3, b2, 12);
+        break;
+    }
+
     // Zero out the data loaded from "off the edge" of the array
     __m128i zero = _mm_setzero_si128();
     a2 = _mm_blend_epi16(a2, zero, 0xfc);
@@ -189,8 +226,8 @@ static void selfguided_restoration_2(uint8_t *src, int width, int height,
   int i, j;
 
   // Vertical sum
-  assert(!(width & 3));
-  for (j = 0; j < width; j += 4) {
+  int width_extend = (width + 3) & ~3;
+  for (j = 0; j < width_extend; j += 4) {
     __m128i a, b, c, c2, x, y, x2, y2;
     __m128i sum, sum_sq, tmp;
@@ -290,16 +327,18 @@ static void selfguided_restoration_2(uint8_t *src, int width, int height,
                B);
 
     // Re-align a1 and b1 so that they start at index i * buf_stride + 2
-    a1 = _mm_alignr_epi8(a2, a1, 8);
-    b1 = _mm_alignr_epi8(b2, b1, 8);
-    a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + 6]);
-    b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + 6]);
+    a2 = _mm_alignr_epi8(a2, a1, 8);
+    b2 = _mm_alignr_epi8(b2, b1, 8);
 
     n = _mm_set1_epi32(5 * h);
     one_over_n = _mm_set1_epi32(one_by_x[5 * h - 1]);
     s = _mm_set1_epi32(sgrproj_mtable[eps - 1][5 * h - 1]);
 
-    for (j = 4; j < width - 4; j += 4) {
+    for (j = 4; j < width_extend - 4; j += 4) {
+      a1 = a2;
+      a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 2]);
+      b1 = b2;
+      b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 2]);
       /* Loop invariant: At this point,
          a1 = original A[i * buf_stride + j - 2 : i * buf_stride + j + 2]
          a2 = original A[i * buf_stride + j + 2 : i * buf_stride + j + 6]
@@ -316,12 +355,40 @@ static void selfguided_restoration_2(uint8_t *src, int width, int height,
       calc_block(sum_, sum_sq_, n, one_over_n, s, bit_depth, i * buf_stride + j,
                  A, B);
 
-      a1 = a2;
-      a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 6]);
-      b1 = b2;
-      b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 6]);
     }
+    // If the width is not a multiple of 4, we need to reset j to width - 4
+    // and adjust a1, a2, b1, b2 so that the loop invariant above is maintained
+    __m128i a3 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 2]);
+    __m128i b3 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 2]);
+    j = width - 4;
+    switch (width % 4) {
+      case 0:
+        a1 = a2;
+        b1 = b2;
+        a2 = a3;
+        b2 = b3;
+        break;
+      case 1:
+        a1 = _mm_alignr_epi8(a2, a1, 4);
+        b1 = _mm_alignr_epi8(b2, b1, 4);
+        a2 = _mm_alignr_epi8(a3, a2, 4);
+        b2 = _mm_alignr_epi8(b3, b2, 4);
+        break;
+      case 2:
+        a1 = _mm_alignr_epi8(a2, a1, 8);
+        b1 = _mm_alignr_epi8(b2, b1, 8);
+        a2 = _mm_alignr_epi8(a3, a2, 8);
+        b2 = _mm_alignr_epi8(b3, b2, 8);
+        break;
+      case 3:
+        a1 = _mm_alignr_epi8(a2, a1, 12);
+        b1 = _mm_alignr_epi8(b2, b1, 12);
+        a2 = _mm_alignr_epi8(a3, a2, 12);
+        b2 = _mm_alignr_epi8(b3, b2, 12);
+        break;
+    }
+
     // Zero out the data loaded from "off the edge" of the array
     __m128i zero = _mm_setzero_si128();
     a2 = _mm_blend_epi16(a2, zero, 0xf0);
@@ -353,8 +420,8 @@ static void selfguided_restoration_3(uint8_t *src, int width, int height,
   int i, j;
 
   // Vertical sum over 7-pixel regions, 4 columns at a time
-  assert(!(width & 3));
-  for (j = 0; j < width; j += 4) {
+  int width_extend = (width + 3) & ~3;
+  for (j = 0; j < width_extend; j += 4) {
     __m128i a, b, c, d, x, y, x2, y2;
     __m128i sum, sum_sq, tmp, tmp2;
@@ -476,16 +543,18 @@ static void selfguided_restoration_3(uint8_t *src, int width, int height,
                B);
 
     // Re-align a1 and b1 so that they start at index i * buf_stride + 1
-    a1 = _mm_alignr_epi8(a2, a1, 4);
-    b1 = _mm_alignr_epi8(b2, b1, 4);
-    a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + 5]);
-    b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + 5]);
+    a2 = _mm_alignr_epi8(a2, a1, 4);
+    b2 = _mm_alignr_epi8(b2, b1, 4);
 
     n = _mm_set1_epi32(7 * h);
     one_over_n = _mm_set1_epi32(one_by_x[7 * h - 1]);
     s = _mm_set1_epi32(sgrproj_mtable[eps - 1][7 * h - 1]);
 
-    for (j = 4; j < width - 4; j += 4) {
+    for (j = 4; j < width_extend - 4; j += 4) {
+      a1 = a2;
+      a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 1]);
+      b1 = b2;
+      b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 1]);
       __m128i a3 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 5]);
       __m128i b3 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 5]);
       /* Loop invariant: At this point,
@@ -509,12 +578,38 @@ static void selfguided_restoration_3(uint8_t *src, int width, int height,
       calc_block(sum_, sum_sq_, n, one_over_n, s, bit_depth, i * buf_stride + j,
                  A, B);
 
-      a1 = a2;
-      a2 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 5]);
-      b1 = b2;
-      b2 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 5]);
     }
+    __m128i a3 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 1]);
+    __m128i b3 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 1]);
+    j = width - 4;
+    switch (width % 4) {
+      case 0:
+        a1 = a2;
+        b1 = b2;
+        a2 = a3;
+        b2 = b3;
+        break;
+      case 1:
+        a1 = _mm_alignr_epi8(a2, a1, 4);
+        b1 = _mm_alignr_epi8(b2, b1, 4);
+        a2 = _mm_alignr_epi8(a3, a2, 4);
+        b2 = _mm_alignr_epi8(b3, b2, 4);
+        break;
+      case 2:
+        a1 = _mm_alignr_epi8(a2, a1, 8);
+        b1 = _mm_alignr_epi8(b2, b1, 8);
+        a2 = _mm_alignr_epi8(a3, a2, 8);
+        b2 = _mm_alignr_epi8(b3, b2, 8);
+        break;
+      case 3:
+        a1 = _mm_alignr_epi8(a2, a1, 12);
+        b1 = _mm_alignr_epi8(b2, b1, 12);
+        a2 = _mm_alignr_epi8(a3, a2, 12);
+        b2 = _mm_alignr_epi8(b3, b2, 12);
+        break;
+    }
+
     // Zero out the data loaded from "off the edge" of the array
     __m128i zero = _mm_setzero_si128();
     a2 = _mm_blend_epi16(a2, zero, 0xc0);
@@ -775,14 +870,10 @@ void apply_selfguided_restoration_sse4_1(uint8_t *dat, int width, int height,
   int32_t *tmpbuf2 = flt2 + RESTORATION_TILEPELS_MAX;
   int i, j;
   assert(width * height <= RESTORATION_TILEPELS_MAX);
-  // The SSE4.1 code currently only supports tiles which are a multiple of 4
-  // pixels wide (but has no height restriction). If this is not the case,
-  // we fall back to the C version.
-  // Similarly, highbitdepth mode is not fully supported yet, so drop back
-  // to the C code in that case.
-  // TODO(david.barker): Allow non-multiple-of-4 widths and bit_depth > 8
-  // in the SSE4.1 code.
-  if ((width & 3) || bit_depth != 8) {
+  // The SSE4.1 code does not currently support highbitdepth, so drop back
+  // to the C filter in that case.
+  // TODO(david.barker): Allow bit_depth > 8 in the SSE4.1 code.
+  if (bit_depth != 8) {
     apply_selfguided_restoration_c(dat, width, height, stride, bit_depth, eps,
                                    xqd, dst, dst_stride, tmpbuf);
     return;
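
For reference, the switch blocks above all do the same thing: pick a lane
offset k = width % 4 and slide the cached 4-lane windows so that they line up
with the reset position j = width - 4. A scalar model of what
_mm_alignr_epi8(hi, lo, 4 * k) computes on 32-bit lanes is sketched below
(align_lanes is an assumed name, not code from the commit):

#include <stdint.h>

// Scalar model of _mm_alignr_epi8(hi, lo, 4 * k) on 32-bit lanes: treat
// lo (low window) and hi (high window) as one 8-lane buffer and take the
// 4 lanes starting at lane k. With lo = A[base .. base+3] and
// hi = A[base+4 .. base+7], the result is A[base+k .. base+k+3], which is
// how the switch re-centres a1/a2 (and b1/b2) on the new j = width - 4.
static void align_lanes(const int32_t lo[4], const int32_t hi[4], int k,
                        int32_t out[4]) {
  int32_t concat[8];
  for (int i = 0; i < 4; ++i) {
    concat[i] = lo[i];
    concat[i + 4] = hi[i];
  }
  for (int i = 0; i < 4; ++i) out[i] = concat[i + k];
}
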
@@ -62,7 +62,7 @@ class AV1SelfguidedFilterTest
     };
 
    // Fix a parameter set, since the speed depends slightly on r.
    // Change this to test different combinations of values of r.
-    int eps = 4;
+    int eps = 15;
 
     av1_loop_restoration_precal();
@@ -84,7 +84,7 @@ class AV1SelfguidedFilterTest
   void RunCorrectnessTest() {
     const int w = 256, h = 256, stride = 672, out_stride = 672;
-    const int NUM_ITERS = 250;
+    const int NUM_ITERS = 81;
     int i, j, k;
 
     uint8_t *input = new uint8_t[stride * h];
@@ -98,8 +98,8 @@ class AV1SelfguidedFilterTest
     av1_loop_restoration_precal();
 
     for (i = 0; i < NUM_ITERS; ++i) {
-      for (j = 0; i < h; ++i)
-        for (k = 0; j < w; ++j) input[j * stride + k] = rnd.Rand16() & 0xFF;
+      for (j = 0; j < h; ++j)
+        for (k = 0; k < w; ++k) input[j * stride + k] = rnd.Rand16() & 0xFF;
 
       int xqd[2] = {
         SGRPROJ_PRJ_MIN0 +
@@ -109,12 +109,16 @@ class AV1SelfguidedFilterTest
       };
       int eps = rnd.PseudoUniform(1 << SGRPROJ_PARAMS_BITS);
 
-      apply_selfguided_restoration(input, w, h, stride, 8, eps, xqd, output,
-                                   out_stride, tmpbuf);
-      apply_selfguided_restoration_c(input, w, h, stride, 8, eps, xqd, output2,
-                                     out_stride, tmpbuf);
-      for (j = 0; j < h; ++j)
-        for (k = 0; k < w; ++k)
+      // Test various tile sizes around 256x256
+      int test_w = w + 4 - (i / 9);
+      int test_h = h + 4 - (i % 9);
+
+      apply_selfguided_restoration(input, test_w, test_h, stride, 8, eps, xqd,
+                                   output, out_stride, tmpbuf);
+      apply_selfguided_restoration_c(input, test_w, test_h, stride, 8, eps, xqd,
+                                     output2, out_stride, tmpbuf);
+      for (j = 0; j < test_h; ++j)
+        for (k = 0; k < test_w; ++k)
           ASSERT_EQ(output[j * out_stride + k], output2[j * out_stride + k]);
     }
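
The updated correctness test sweeps tile sizes rather than testing 256x256
only. With w = h = 256 and NUM_ITERS = 81, i / 9 and i % 9 each take every
value in 0 .. 8, so test_w and test_h each range over 252 .. 260 and every
width-mod-4 residue is compared against the C implementation. A small
stand-alone check of that enumeration (illustrative only, mirroring the index
arithmetic shown in the diff):

#include <stdio.h>

int main(void) {
  const int w = 256, h = 256;
  // 81 iterations cover a 9x9 grid of tile sizes from 252x252 up to 260x260.
  for (int i = 0; i < 81; ++i) {
    int test_w = w + 4 - (i / 9);
    int test_h = h + 4 - (i % 9);
    printf("%d x %d\n", test_w, test_h);
  }
  return 0;
}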