Commit d051e560 authored by Imdad Sardharwalla, committed by Debargha Mukherjee

SSE4 and AVX2 implementations of updated FAST_SGR

The SSE4.1 and AVX2 implementations of the self-guided filter have been updated
to match the updated FAST_SGR C implementation in restoration.c.

The self-guided filter speed tests have been altered to compare the speeds of
the SIMD and C implementations of the relevant functions.

Speed Tests (code compiled with CLANG)
===========

For LowBD:
- The SSE4.1 implementation is ~220% faster (~69% less time) than the C code
- The AVX2 implementation is ~314% faster (~76% less time) than the C code

For HighBD:
- The SSE4.1 implementation is ~240% faster (~71% less time) than the C code
- The AVX2 implementation is ~343% faster (~77% less time) than the C code
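
(For reference, the two figures on each line are consistent: "N% faster" means
speedup = 1 + N/100, and time saved = 1 - 1/speedup. E.g. for the LowBD
SSE4.1 case: speedup = 1 + 220/100 = 3.2x, so the time ratio is 1/3.2 ≈ 0.31,
i.e. ~69% less time.)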

Change-Id: Ic2734bb89ccd3f66667c68647e5f677a5a496233
parent 1a796617
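
For orientation, an outline (not compilable on its own) of the new FAST_SGR
dispatch in av1_selfguided_restoration_{sse4_1,avx2}, assembled verbatim from
the diff below with everything else elided: the r == 2 pass now uses the
sub-sampled "fast" path and the r == 1 pass uses the full path.

  #if CONFIG_FAST_SGR
    // r == 2 filter: sub-sampled cross sums
    assert(params->r1 == 2);
    calc_ab_fast(A, B, C, D, width, height, buf_stride, params->e1, bit_depth,
                 params->r1);
    final_filter_fast2(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride,
                       width, height, highbd);

    // r == 1 filter: full filter
    assert(params->r2 == 1);
    calc_ab(A, B, C, D, width, height, buf_stride, params->e2, bit_depth,
            params->r2);
    final_filter(flt2, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
                 height, highbd);
  #endif  // CONFIG_FAST_SGR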
@@ -1275,9 +1275,11 @@ void av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
flt2, flt_stride, bit_depth,
params->r2, params->e2);
#elif CONFIG_FAST_SGR == 1
+// r == 2 filter
av1_selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride,
flt1, flt_stride, bit_depth,
params->r1, params->e1);
+// r == 1 filter
av1_selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt2,
flt_stride, bit_depth, params->r2,
params->e2);
@@ -1336,7 +1338,7 @@ static void sgrproj_filter_stripe(const RestorationUnitInfo *rui,
for (int j = 0; j < stripe_width; j += procunit_width) {
int w = AOMMIN(procunit_width, stripe_width - j);
#if CONFIG_FAST_SGR
-#if CONFIG_FAST_SGR
+#if CONFIG_FAST_SGR == 2
rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
dst + j, dst_stride, tmpbuf, bit_depth, 0);
@@ -1344,7 +1346,7 @@ static void sgrproj_filter_stripe(const RestorationUnitInfo *rui,
apply_selfguided_restoration(src + j, w, stripe_height, src_stride,
rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
dst + j, dst_stride, tmpbuf, bit_depth, 0);
-#endif // CONFIG_FAST_SGR
+#endif // CONFIG_FAST_SGR == 2
}
}
@@ -1380,7 +1382,7 @@ static void sgrproj_filter_stripe_highbd(const RestorationUnitInfo *rui,
int32_t *tmpbuf, int bit_depth) {
for (int j = 0; j < stripe_width; j += procunit_width) {
int w = AOMMIN(procunit_width, stripe_width - j);
-#if CONFIG_FAST_SGR
+#if CONFIG_FAST_SGR == 2
apply_selfguided_restoration_c(src8 + j, w, stripe_height, src_stride,
rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
dst8 + j, dst_stride, tmpbuf, bit_depth, 1);
@@ -1388,7 +1390,7 @@ static void sgrproj_filter_stripe_highbd(const RestorationUnitInfo *rui,
apply_selfguided_restoration(src8 + j, w, stripe_height, src_stride,
rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
dst8 + j, dst_stride, tmpbuf, bit_depth, 1);
-#endif // CONFIG_FAST_SGR
+#endif // CONFIG_FAST_SGR == 2
}
}
@@ -311,7 +311,7 @@ static void final_filter(int32_t *dst, int dst_stride, const int32_t *A,
highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
for (int i = 0; i < height; ++i) {
-for (int j = 0; j < width; j += 4) {
+for (int j = 0; j < width; j += 8) {
const __m256i a = cross_sum(A + i * buf_stride + j, buf_stride);
const __m256i b = cross_sum(B + i * buf_stride + j, buf_stride);
@@ -398,7 +398,7 @@ static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C,
}
}
-// Calculate 4 values of the "cross sum" starting at buf.
+// Calculate 8 values of the "cross sum" starting at buf.
//
// Pixels are indexed like this:
// xtl xt xtr
@@ -434,7 +434,38 @@ static __m256i cross_sum_fast_even(const int32_t *buf, int stride) {
sixes);
}
-// Calculate 4 values of the "cross sum" starting at buf.
+// Calculate 8 values of the "cross sum" starting at buf.
//
// Pixels are indexed like this:
// xl x xr
//
// Pixels are weighted like this:
// 5 6 5
//
// buf points to x
//
// fives = xl + xr
// sixes = x
// cross_sum = 5 * fives + 6 * sixes
// = 4 * (fives + sixes) + (fives + sixes) + sixes
// = (fives + sixes) << 2 + (fives + sixes) + sixes
static __m256i cross_sum_fast_odd(const int32_t *buf) {
const __m256i xl = yy_loadu_256(buf - 1);
const __m256i x = yy_loadu_256(buf);
const __m256i xr = yy_loadu_256(buf + 1);
const __m256i fives = _mm256_add_epi32(xl, xr);
const __m256i sixes = x;
const __m256i fives_plus_sixes = _mm256_add_epi32(fives, sixes);
return _mm256_add_epi32(
_mm256_add_epi32(_mm256_slli_epi32(fives_plus_sixes, 2),
fives_plus_sixes),
sixes);
}
// Calculate 8 values of the "cross sum" starting at buf.
//
// Pixels are indexed like this:
// xtl xt xtr
@@ -491,7 +522,7 @@ static __m256i cross_sum_fast_odd_not_last(const int32_t *buf, int stride) {
fourteens);
}
-// Calculate 4 values of the "cross sum" starting at buf.
+// Calculate 8 values of the "cross sum" starting at buf.
//
// Pixels are indexed like this:
// xtl xt xtr
@@ -539,11 +570,12 @@ static __m256i cross_sum_fast_odd_last(const int32_t *buf, int stride) {
}
// The final filter for selfguided restoration. Computes a weighted average
-// across A, B with "cross sums" (see cross_sum_... implementations above)
-static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A,
-const int32_t *B, int buf_stride,
-const void *dgd8, int dgd_stride, int width,
-int height, int highbd) {
+// across A, B with "cross sums" (see cross_sum_... implementations above).
+// Designed for the first vertical sub-sampling version of FAST_SGR.
+static void final_filter_fast1(int32_t *dst, int dst_stride, const int32_t *A,
+const int32_t *B, int buf_stride,
+const void *dgd8, int dgd_stride, int width,
+int height, int highbd) {
const int nb0 = 5;
const int nb1 = 6;
@@ -557,7 +589,7 @@ static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A,
for (int i = 0; i < height; ++i) {
if (!(i & 1)) { // even row
-for (int j = 0; j < width; j += 4) {
+for (int j = 0; j < width; j += 8) {
const __m256i a =
cross_sum_fast_even(A + i * buf_stride + j, buf_stride);
const __m256i b =
@@ -576,7 +608,7 @@ static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A,
yy_storeu_256(dst + i * dst_stride + j, w);
}
} else if (i != height - 1) { // odd row and not last
-for (int j = 0; j < width; j += 4) {
+for (int j = 0; j < width; j += 8) {
const __m256i a =
cross_sum_fast_odd_not_last(A + i * buf_stride + j, buf_stride);
const __m256i b =
@@ -595,7 +627,7 @@ static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A,
yy_storeu_256(dst + i * dst_stride + j, w);
}
} else { // odd row and last
-for (int j = 0; j < width; j += 4) {
+for (int j = 0; j < width; j += 8) {
const __m256i a =
cross_sum_fast_odd_last(A + i * buf_stride + j, buf_stride);
const __m256i b =
@@ -616,6 +648,65 @@ static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A,
}
}
}
// The final filter for selfguided restoration. Computes a weighted average
// across A, B with "cross sums" (see cross_sum_... implementations above).
// Designed for the second vertical sub-sampling version of FAST_SGR.
static void final_filter_fast2(int32_t *dst, int dst_stride, const int32_t *A,
const int32_t *B, int buf_stride,
const void *dgd8, int dgd_stride, int width,
int height, int highbd) {
const int nb0 = 5;
const int nb1 = 4;
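// nb0/nb1 set the rounding shifts applied to even/odd rows respectively.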
const __m256i rounding0 =
round_for_shift(SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
const __m256i rounding1 =
round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
const uint8_t *dgd_real =
highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
for (int i = 0; i < height; ++i) {
if (!(i & 1)) { // even row
for (int j = 0; j < width; j += 8) {
const __m256i a =
cross_sum_fast_even(A + i * buf_stride + j, buf_stride);
const __m256i b =
cross_sum_fast_even(B + i * buf_stride + j, buf_stride);
const __m128i raw =
xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
const __m256i src =
highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);
__m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
__m256i w =
_mm256_srai_epi32(_mm256_add_epi32(v, rounding0),
SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
yy_storeu_256(dst + i * dst_stride + j, w);
}
} else { // odd row
for (int j = 0; j < width; j += 8) {
const __m256i a = cross_sum_fast_odd(A + i * buf_stride + j);
const __m256i b = cross_sum_fast_odd(B + i * buf_stride + j);
const __m128i raw =
xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
const __m256i src =
highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);
__m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
__m256i w =
_mm256_srai_epi32(_mm256_add_epi32(v, rounding1),
SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
yy_storeu_256(dst + i * dst_stride + j, w);
}
}
}
}
#endif
void av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height,
@@ -676,23 +767,36 @@ void av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height,
integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl,
buf_stride);
// Write to flt1 and flt2
#if CONFIG_FAST_SGR
assert(params->r1 < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
// r == 2 filter
assert(params->r1 == 2);
calc_ab_fast(A, B, C, D, width, height, buf_stride, params->e1, bit_depth,
params->r1);
final_filter_fast2(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride,
width, height, highbd);
// r == 1 filter
assert(params->r2 == 1);
calc_ab(A, B, C, D, width, height, buf_stride, params->e2, bit_depth,
params->r2);
final_filter(flt2, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
height, highbd);
#else
for (int i = 0; i < 2; ++i) {
int r = i ? params->r2 : params->r1;
int e = i ? params->e2 : params->e1;
int32_t *flt = i ? flt2 : flt1;
assert(r + 1 <= AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
#if CONFIG_FAST_SGR
calc_ab_fast(A, B, C, D, width, height, buf_stride, e, bit_depth, r);
final_filter_fast(flt, flt_stride, A, B, buf_stride, dgd8, dgd_stride,
width, height, highbd);
#else
calc_ab(A, B, C, D, width, height, buf_stride, e, bit_depth, r);
final_filter(flt, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
height, highbd);
#endif
}
#endif
}
void apply_selfguided_restoration_avx2(const uint8_t *dat8, int width,
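
As a sanity check on the shift-and-add decomposition used by the new
cross_sum_fast_odd above, a minimal scalar sketch (standalone illustration,
not part of the patch):

  #include <assert.h>
  #include <stdint.h>

  // Scalar form of the identity in cross_sum_fast_odd:
  //   5 * fives + 6 * sixes == ((fives + sixes) << 2) + (fives + sixes) + sixes
  static int32_t cross_sum_odd_scalar(int32_t xl, int32_t x, int32_t xr) {
    const int32_t fives = xl + xr;  // outer pixels, weight 5 each
    const int32_t sixes = x;        // centre pixel, weight 6
    const int32_t fs = fives + sixes;
    return (fs << 2) + fs + sixes;  // = 5 * fs + sixes = 5 * fives + 6 * sixes
  }

  int main(void) {
    assert(cross_sum_odd_scalar(1, 2, 3) == 5 * (1 + 3) + 6 * 2);  // == 32
    return 0;
  }

The SIMD versions apply the same arithmetic to 8 int32 lanes per __m256i
(AVX2) or 4 per __m128i (SSE4.1), which is why the AVX2 loops step by
j += 8 where the SSE4.1 loops step by j += 4.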
@@ -398,6 +398,36 @@ static __m128i cross_sum_fast_even(const int32_t *buf, int stride) {
sixes);
}
// Calculate 4 values of the "cross sum" starting at buf.
//
// Pixels are indexed like this:
// xl x xr
//
// Pixels are weighted like this:
// 5 6 5
//
// buf points to x
//
// fives = xl + xr
// sixes = x
// cross_sum = 5 * fives + 6 * sixes
// = 4 * (fives + sixes) + (fives + sixes) + sixes
// = (fives + sixes) << 2 + (fives + sixes) + sixes
static __m128i cross_sum_fast_odd(const int32_t *buf) {
const __m128i xl = xx_loadu_128(buf - 1);
const __m128i x = xx_loadu_128(buf);
const __m128i xr = xx_loadu_128(buf + 1);
const __m128i fives = _mm_add_epi32(xl, xr);
const __m128i sixes = x;
const __m128i fives_plus_sixes = _mm_add_epi32(fives, sixes);
return _mm_add_epi32(
_mm_add_epi32(_mm_slli_epi32(fives_plus_sixes, 2), fives_plus_sixes),
sixes);
}
// Calculate 4 values of the "cross sum" starting at buf.
//
// Pixels are indexed like this:
@@ -502,11 +532,12 @@ static __m128i cross_sum_fast_odd_last(const int32_t *buf, int stride) {
}
// The final filter for selfguided restoration. Computes a weighted average
-// across A, B with "cross sums" (see cross_sum_... implementations above)
-static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A,
-const int32_t *B, int buf_stride,
-const void *dgd8, int dgd_stride, int width,
-int height, int highbd) {
+// across A, B with "cross sums" (see cross_sum_... implementations above).
+// Designed for the first vertical sub-sampling version of FAST_SGR.
+static void final_filter_fast1(int32_t *dst, int dst_stride, const int32_t *A,
+const int32_t *B, int buf_stride,
+const void *dgd8, int dgd_stride, int width,
+int height, int highbd) {
const int nb0 = 5;
const int nb1 = 6;
@@ -573,6 +604,61 @@ static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A,
}
}
}
// The final filter for selfguided restoration. Computes a weighted average
// across A, B with "cross sums" (see cross_sum_... implementations above).
// Designed for the second vertical sub-sampling version of FAST_SGR.
static void final_filter_fast2(int32_t *dst, int dst_stride, const int32_t *A,
const int32_t *B, int buf_stride,
const void *dgd8, int dgd_stride, int width,
int height, int highbd) {
const int nb0 = 5;
const int nb1 = 4;
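// nb0/nb1 set the rounding shifts applied to even/odd rows respectively.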
const __m128i rounding0 =
round_for_shift(SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
const __m128i rounding1 =
round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
const uint8_t *dgd_real =
highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
for (int i = 0; i < height; ++i) {
if (!(i & 1)) { // even row
for (int j = 0; j < width; j += 4) {
const __m128i a =
cross_sum_fast_even(A + i * buf_stride + j, buf_stride);
const __m128i b =
cross_sum_fast_even(B + i * buf_stride + j, buf_stride);
const __m128i raw =
xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd));
const __m128i src =
highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw);
__m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b);
__m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding0),
SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
xx_storeu_128(dst + i * dst_stride + j, w);
}
} else { // odd row
for (int j = 0; j < width; j += 4) {
const __m128i a = cross_sum_fast_odd(A + i * buf_stride + j);
const __m128i b = cross_sum_fast_odd(B + i * buf_stride + j);
const __m128i raw =
xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd));
const __m128i src =
highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw);
__m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b);
__m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding1),
SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
xx_storeu_128(dst + i * dst_stride + j, w);
}
}
}
}
#endif
void av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width,
@@ -629,23 +715,36 @@ void av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width,
integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl,
buf_stride);
// Write to flt1 and flt2
#if CONFIG_FAST_SGR
assert(params->r1 < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
// r == 2 filter
assert(params->r1 == 2);
calc_ab_fast(A, B, C, D, width, height, buf_stride, params->e1, bit_depth,
params->r1);
final_filter_fast2(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride,
width, height, highbd);
// r == 1 filter
assert(params->r2 == 1);
calc_ab(A, B, C, D, width, height, buf_stride, params->e2, bit_depth,
params->r2);
final_filter(flt2, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
height, highbd);
#else
for (int i = 0; i < 2; ++i) {
int r = i ? params->r2 : params->r1;
int e = i ? params->e2 : params->e1;
int32_t *flt = i ? flt2 : flt1;
assert(r + 1 <= AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
#if CONFIG_FAST_SGR
calc_ab_fast(A, B, C, D, width, height, buf_stride, e, bit_depth, r);
final_filter_fast(flt, flt_stride, A, B, buf_stride, dgd8, dgd_stride,
width, height, highbd);
#else
calc_ab(A, B, C, D, width, height, buf_stride, e, bit_depth, r);
final_filter(flt, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
height, highbd);
#endif
}
#endif
}
void apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width,
@@ -353,13 +353,13 @@ static void sgr_filter_block(const sgr_params_type *params, const uint8_t *dat8,
int width, int height, int dat_stride,
int use_highbd, int bit_depth, int32_t *flt1,
int32_t *flt2, int flt_stride) {
-#if CONFIG_FAST_SGR
+#if CONFIG_FAST_SGR == 2
av1_selfguided_restoration_c(dat8, width, height, dat_stride, flt1, flt2,
flt_stride, params, bit_depth, use_highbd);
#else
av1_selfguided_restoration(dat8, width, height, dat_stride, flt1, flt2,
flt_stride, params, bit_depth, use_highbd);
-#endif // CONFIG_FAST_SGR
+#endif // CONFIG_FAST_SGR == 2
}
// Apply the self-guided filter across an entire restoration unit.
@@ -80,7 +80,24 @@ class AV1SelfguidedFilterTest
av1_loop_restoration_precal();
-std::clock_t start = std::clock();
aom_usec_timer ref_timer;
aom_usec_timer_start(&ref_timer);
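// Time NUM_ITERS full passes of the C reference implementation.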
for (i = 0; i < NUM_ITERS; ++i) {
for (k = 0; k < height; k += pu_height)
for (j = 0; j < width; j += pu_width) {
int w = AOMMIN(pu_width, width - j);
int h = AOMMIN(pu_height, height - k);
uint8_t *input_p = input + k * stride + j;
uint8_t *output_p = output + k * out_stride + j;
apply_selfguided_restoration_c(input_p, w, h, stride, eps, xqd,
output_p, out_stride, tmpbuf, 8, 0);
}
}
aom_usec_timer_mark(&ref_timer);
const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);
aom_usec_timer tst_timer;
aom_usec_timer_start(&tst_timer);
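// Time the same workload with the SIMD implementation under test.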
for (i = 0; i < NUM_ITERS; ++i) {
for (k = 0; k < height; k += pu_height)
for (j = 0; j < width; j += pu_width) {
@@ -92,11 +109,16 @@ class AV1SelfguidedFilterTest
tmpbuf, 8, 0);
}
}
-std::clock_t end = std::clock();
-double elapsed = ((end - start) / (double)CLOCKS_PER_SEC);
aom_usec_timer_mark(&tst_timer);
const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);
std::cout << "[ ] C time = " << ref_time / 1000
<< " ms, SIMD time = " << tst_time / 1000 << " ms\n";
printf("%5d %dx%d blocks in %7.3fs = %7.3fus/block\n", NUM_ITERS, width,
height, elapsed, elapsed * 1000000. / NUM_ITERS);
EXPECT_GT(ref_time, tst_time)
<< "Error: AV1SelfguidedFilterTest.SpeedTest, SIMD slower than C.\n"
<< "C time: " << ref_time << " us\n"
<< "SIMD time: " << tst_time << " us\n";
aom_free(input_);
aom_free(output_);
@@ -238,8 +260,25 @@ class AV1HighbdSelfguidedFilterTest
av1_loop_restoration_precal();
-aom_usec_timer timer;
-aom_usec_timer_start(&timer);
aom_usec_timer ref_timer;
aom_usec_timer_start(&ref_timer);
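// Time NUM_ITERS full passes of the highbd C reference implementation.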
for (i = 0; i < NUM_ITERS; ++i) {
for (k = 0; k < height; k += pu_height)
for (j = 0; j < width; j += pu_width) {
int w = AOMMIN(pu_width, width - j);
int h = AOMMIN(pu_height, height - k);
uint16_t *input_p = input + k * stride + j;
uint16_t *output_p = output + k * out_stride + j;
apply_selfguided_restoration_c(
CONVERT_TO_BYTEPTR(input_p), w, h, stride, eps, xqd,
CONVERT_TO_BYTEPTR(output_p), out_stride, tmpbuf, bit_depth, 1);
}
}
aom_usec_timer_mark(&ref_timer);
const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);
aom_usec_timer tst_timer;
aom_usec_timer_start(&tst_timer);
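// Time the same workload with the highbd SIMD implementation under test.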
for (i = 0; i < NUM_ITERS; ++i) {
for (k = 0; k < height; k += pu_height)
for (j = 0; j < width; j += pu_width) {
@@ -252,11 +291,17 @@ class AV1HighbdSelfguidedFilterTest
1);
}
}
-aom_usec_timer_mark(&timer);
-double elapsed = static_cast<double>(aom_usec_timer_elapsed(&timer));
aom_usec_timer_mark(&tst_timer);
const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);
std::cout << "[ ] C time = " << ref_time / 1000
<< " ms, SIMD time = " << tst_time / 1000 << " ms\n";
printf("%5d %dx%d blocks in %7.3fs = %7.3fus/block\n", NUM_ITERS, width,
height, elapsed / 1000000, elapsed / NUM_ITERS);
EXPECT_GT(ref_time, tst_time)
<< "Error: AV1HighbdSelfguidedFilterTest.SpeedTest, SIMD slower than "
"C.\n"
<< "C time: " << ref_time << " us\n"
<< "SIMD time: " << tst_time << " us\n";
aom_free(input_);
aom_free(output_);