Commit aab6aee3 authored by Imdad Sardharwalla's avatar Imdad Sardharwalla Committed by Debargha Mukherjee

AVX2 implementation of the Wiener filter

Added an AVX2 version of the Wiener filter, along with associated tests. Speed
tests have been added for all implementations of the Wiener filter.

Speed Test results
==================

GCC
---

Low bit-depth filter:
- SSE2 vs C: SSE2 takes ~92% less time
- AVX2 vs C: AVX2 takes ~96% less time
- SSE2 vs AVX2: AVX2 takes ~43% less time (~74% faster)

High bit-depth filter:
- SSSE3 vs C: SSSE3 takes ~92% less time
- AVX2  vs C: AVX2  takes ~96% less time
- SSSE3 vs AVX2: AVX2 takes ~46% less time (~84% faster)

CLANG
-----

Low bit-depth filter:
- SSE2 vs C: SSE2 takes ~84% less time
- AVX2 vs C: AVX2 takes ~88% less time
- SSE2 vs AVX2: AVX2 takes ~27% less time (~36% faster)

High bit-depth filter:
- SSSE3 vs C: SSSE3 takes ~85% less time
- AVX2  vs C: AVX2  takes ~89% less time
- SSSE3 vs AVX2: AVX2 takes ~24% less time (~31% faster)

Change-Id: Ide22d7c09c0be61483e9682caf17a39438e4a208
parent f7d1ff49
......@@ -440,9 +440,17 @@ if (CONFIG_LOOP_RESTORATION)
${AOM_DSP_COMMON_INTRIN_SSE2}
"${AOM_ROOT}/aom_dsp/x86/aom_convolve_hip_sse2.c")
set(AOM_DSP_COMMON_INTRIN_SSSE3
${AOM_DSP_COMMON_INTRIN_SSSE3}
"${AOM_ROOT}/aom_dsp/x86/aom_highbd_convolve_hip_ssse3.c")
set(AOM_DSP_COMMON_INTRIN_AVX2
${AOM_DSP_COMMON_INTRIN_AVX2}
"${AOM_ROOT}/aom_dsp/x86/aom_convolve_hip_avx2.c")
set(AOM_DSP_COMMON_INTRIN_SSSE3
${AOM_DSP_COMMON_INTRIN_SSSE3}
"${AOM_ROOT}/aom_dsp/x86/aom_highbd_convolve_hip_ssse3.c")
set(AOM_DSP_COMMON_INTRIN_AVX2
${AOM_DSP_COMMON_INTRIN_AVX2}
"${AOM_ROOT}/aom_dsp/x86/aom_highbd_convolve_hip_avx2.c")
endif ()
set(AOM_DSP_ENCODER_INTRIN_SSE4_1
......
......@@ -381,6 +381,7 @@ if (aom_config("CONFIG_LOOP_RESTORATION") eq "yes") {
specialize qw/aom_convolve8_add_src_horiz ssse3/;
specialize qw/aom_convolve8_add_src_vert ssse3/;
specialize qw/aom_convolve8_add_src_hip sse2/;
specialize qw/aom_convolve8_add_src_hip avx2/;
} # CONFIG_LOOP_RESTORATION
# TODO(any): These need to be extended to up to 128x128 block sizes
......@@ -424,7 +425,7 @@ if (aom_config("CONFIG_LOOP_RESTORATION") eq "yes") {
add_proto qw/void aom_highbd_convolve8_add_src_hip/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
specialize qw/aom_highbd_convolve8_add_src/, "$sse2_x86_64";
specialize qw/aom_highbd_convolve8_add_src_hip ssse3/;
specialize qw/aom_highbd_convolve8_add_src_hip ssse3 avx2/;
} # CONFIG_LOOP_RESTORATION
#
......
This diff is collapsed.
This diff is collapsed.
......@@ -42,4 +42,11 @@ static INLINE void yy_storeu_256(void *const a, const __m256i v) {
_mm256_storeu_si256((__m256i *)a, v);
}
// Portable stand-in for _mm256_set_m128i, which some compilers do not declare
// in immintrin.h. Builds a 256-bit vector from two 128-bit halves:
// ([ hi ], [ lo ]) -> [ hi ][ lo ]
static INLINE __m256i yy_set_m128i(__m128i hi, __m128i lo) {
  // Place `lo` in the low 128 bits of a 256-bit register first...
  const __m256i low_half_only = _mm256_castsi128_si256(lo);
  // ...then insert `hi` into lane 1 (the upper 128 bits).
  return _mm256_insertf128_si256(low_half_only, hi, 1);
}
#endif // AOM_DSP_X86_SYNONYMS_AVX2_H_
......@@ -20,23 +20,40 @@ using std::tr1::tuple;
namespace {
#if HAVE_SSE2
#if HAVE_SSE2 || HAVE_AVX2
// Runs the output check on the convolve implementation supplied as test
// parameter 3.
TEST_P(AV1HiprecConvolveTest, CheckOutput) { RunCheckOutput(GET_PARAM(3)); }
// Times the implementation supplied as test parameter 3 against the C version.
// The DISABLED_ prefix keeps googletest from running it unless disabled tests
// are explicitly requested.
TEST_P(AV1HiprecConvolveTest, DISABLED_SpeedTest) {
RunSpeedTest(GET_PARAM(3));
}
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(SSE2, AV1HiprecConvolveTest,
libaom_test::AV1HiprecConvolve::BuildParams(
aom_convolve8_add_src_hip_sse2));
#endif
#if HAVE_AVX2
INSTANTIATE_TEST_CASE_P(AVX2, AV1HiprecConvolveTest,
libaom_test::AV1HiprecConvolve::BuildParams(
aom_convolve8_add_src_hip_avx2));
#endif
#endif
#if HAVE_SSSE3
#if HAVE_SSSE3 || HAVE_AVX2
// Runs the output check on the high bit-depth convolve implementation supplied
// as test parameter 4.
TEST_P(AV1HighbdHiprecConvolveTest, CheckOutput) {
RunCheckOutput(GET_PARAM(4));
}
// Times the implementation supplied as test parameter 4 against the C version.
// The DISABLED_ prefix keeps googletest from running it unless disabled tests
// are explicitly requested.
TEST_P(AV1HighbdHiprecConvolveTest, DISABLED_SpeedTest) {
RunSpeedTest(GET_PARAM(4));
}
#if HAVE_SSSE3
INSTANTIATE_TEST_CASE_P(SSSE3, AV1HighbdHiprecConvolveTest,
libaom_test::AV1HighbdHiprecConvolve::BuildParams(
aom_highbd_convolve8_add_src_hip_ssse3));
#endif
#if HAVE_AVX2
INSTANTIATE_TEST_CASE_P(AVX2, AV1HighbdHiprecConvolveTest,
libaom_test::AV1HighbdHiprecConvolve::BuildParams(
aom_highbd_convolve8_add_src_hip_avx2));
#endif
#endif
} // namespace
......@@ -74,10 +74,10 @@ void AV1HiprecConvolveTest::RunCheckOutput(hiprec_convolve_func test_impl) {
uint8_t *input_ = new uint8_t[h * w];
uint8_t *input = input_;
// The convolve functions always write rows with widths that are multiples of
// 8.
// So to avoid a buffer overflow, we may need to pad rows to a multiple of 8.
int output_n = ((out_w + 7) & ~7) * out_h;
// The AVX2 convolve functions always write rows with widths that are
// multiples of 16. So to avoid a buffer overflow, we may need to pad
// rows to a multiple of 16.
int output_n = ALIGN_POWER_OF_TWO(out_w, 4) * out_h;
uint8_t *output = new uint8_t[output_n];
uint8_t *output2 = new uint8_t[output_n];
......@@ -108,6 +108,70 @@ void AV1HiprecConvolveTest::RunCheckOutput(hiprec_convolve_func test_impl) {
delete[] output;
delete[] output2;
}
// Benchmarks `test_impl` against aom_convolve8_add_src_hip_c on the same
// random 128x128 source and random kernels, prints both timings, and expects
// the C version to take strictly longer.
//
// Test parameters 0 and 1 give the output width and height; parameter 2,
// divided by 500, gives the number of timing passes.
void AV1HiprecConvolveTest::RunSpeedTest(hiprec_convolve_func test_impl) {
  const int w = 128;
  const int h = 128;
  const int out_w = GET_PARAM(0);
  const int out_h = GET_PARAM(1);
  const int num_iters = GET_PARAM(2) / 500;

  uint8_t *const src = new uint8_t[h * w];

  // The AVX2 convolve functions always write rows whose widths are multiples
  // of 16, so size the destination buffers with the width rounded up to 16 to
  // avoid a buffer overflow.
  const int dst_len = ALIGN_POWER_OF_TWO(out_w, 4) * out_h;
  uint8_t *const ref_dst = new uint8_t[dst_len];
  uint8_t *const tst_dst = new uint8_t[dst_len];

  // Random filter kernels (drawn before the source pixels).
  DECLARE_ALIGNED(16, InterpKernel, hkernel);
  DECLARE_ALIGNED(16, InterpKernel, vkernel);
  generate_kernels(&rnd_, hkernel, vkernel);

  // Random source pixels, filled in row-major order.
  for (int px = 0; px < h * w; ++px) src[px] = rnd_.Rand8();

  // Block origins swept on every pass.
  // NOTE(review): the 3/4 margins presumably keep the filter taps inside the
  // source buffer — confirm against the convolve implementation.
  const int row_end = h - out_h - 4;
  const int col_end = w - out_w - 4;

  // Time the C reference.
  aom_usec_timer ref_timer;
  aom_usec_timer_start(&ref_timer);
  for (int iter = 0; iter < num_iters; ++iter) {
    for (int row = 3; row < row_end; ++row) {
      for (int col = 3; col < col_end; ++col) {
        aom_convolve8_add_src_hip_c(src + row * w + col, w, ref_dst, out_w,
                                    hkernel, 16, vkernel, 16, out_w, out_h);
      }
    }
  }
  aom_usec_timer_mark(&ref_timer);
  const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);

  // Time the implementation under test over the identical sweep.
  aom_usec_timer tst_timer;
  aom_usec_timer_start(&tst_timer);
  for (int iter = 0; iter < num_iters; ++iter) {
    for (int row = 3; row < row_end; ++row) {
      for (int col = 3; col < col_end; ++col) {
        test_impl(src + row * w + col, w, tst_dst, out_w, hkernel, 16, vkernel,
                  16, out_w, out_h);
      }
    }
  }
  aom_usec_timer_mark(&tst_timer);
  const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);

  std::cout << "[ ] C time = " << ref_time / 1000
            << " ms, SIMD time = " << tst_time / 1000 << " ms\n";

  EXPECT_GT(ref_time, tst_time)
      << "Error: AV1HiprecConvolveTest.SpeedTest, SIMD slower than C.\n"
      << "C time: " << ref_time << " us\n"
      << "SIMD time: " << tst_time << " us\n";

  delete[] src;
  delete[] ref_dst;
  delete[] tst_dst;
}
} // namespace AV1HiprecConvolve
namespace AV1HighbdHiprecConvolve {
......@@ -143,10 +207,10 @@ void AV1HighbdHiprecConvolveTest::RunCheckOutput(
uint16_t *input = new uint16_t[h * w];
// The convolve functions always write rows with widths that are multiples of
// 8.
// So to avoid a buffer overflow, we may need to pad rows to a multiple of 8.
int output_n = ((out_w + 7) & ~7) * out_h;
// The AVX2 convolve functions always write rows with widths that are
// multiples of 16. So to avoid a buffer overflow, we may need to pad
// rows to a multiple of 16.
int output_n = ALIGN_POWER_OF_TWO(out_w, 4) * out_h;
uint16_t *output = new uint16_t[output_n];
uint16_t *output2 = new uint16_t[output_n];
......@@ -182,5 +246,75 @@ void AV1HighbdHiprecConvolveTest::RunCheckOutput(
delete[] output;
delete[] output2;
}
// Benchmarks `test_impl` against aom_highbd_convolve8_add_src_hip_c on the
// same random 128x128 source and random kernels, prints both timings, and
// expects the C version to take strictly longer.
//
// Test parameters 0 and 1 give the output width and height; parameter 2,
// divided by 500, gives the number of timing passes; parameter 3 is the bit
// depth.
void AV1HighbdHiprecConvolveTest::RunSpeedTest(
    highbd_hiprec_convolve_func test_impl) {
  const int w = 128;
  const int h = 128;
  const int out_w = GET_PARAM(0);
  const int out_h = GET_PARAM(1);
  const int num_iters = GET_PARAM(2) / 500;
  const int bd = GET_PARAM(3);

  uint16_t *const src = new uint16_t[h * w];

  // The AVX2 convolve functions always write rows whose widths are multiples
  // of 16, so size the destination buffers with the width rounded up to 16 to
  // avoid a buffer overflow.
  const int dst_len = ALIGN_POWER_OF_TWO(out_w, 4) * out_h;
  uint16_t *const ref_dst = new uint16_t[dst_len];
  uint16_t *const tst_dst = new uint16_t[dst_len];

  // Random filter kernels (drawn before the source pixels).
  DECLARE_ALIGNED(16, InterpKernel, hkernel);
  DECLARE_ALIGNED(16, InterpKernel, vkernel);
  generate_kernels(&rnd_, hkernel, vkernel);

  // Random source pixels in row-major order, masked down to `bd` bits.
  const int sample_mask = (1 << bd) - 1;
  for (int px = 0; px < h * w; ++px) src[px] = rnd_.Rand16() & sample_mask;

  // The high bit-depth entry points take their 16-bit buffers as converted
  // byte pointers.
  uint8_t *const src_ptr = CONVERT_TO_BYTEPTR(src);
  uint8_t *const ref_dst_ptr = CONVERT_TO_BYTEPTR(ref_dst);
  uint8_t *const tst_dst_ptr = CONVERT_TO_BYTEPTR(tst_dst);

  // Block origins swept on every pass.
  // NOTE(review): the 3/4 margins presumably keep the filter taps inside the
  // source buffer — confirm against the convolve implementation.
  const int row_end = h - out_h - 4;
  const int col_end = w - out_w - 4;

  // Time the C reference.
  aom_usec_timer ref_timer;
  aom_usec_timer_start(&ref_timer);
  for (int iter = 0; iter < num_iters; ++iter) {
    for (int row = 3; row < row_end; ++row) {
      for (int col = 3; col < col_end; ++col) {
        aom_highbd_convolve8_add_src_hip_c(src_ptr + row * w + col, w,
                                           ref_dst_ptr, out_w, hkernel, 16,
                                           vkernel, 16, out_w, out_h, bd);
      }
    }
  }
  aom_usec_timer_mark(&ref_timer);
  const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);

  // Time the implementation under test over the identical sweep.
  aom_usec_timer tst_timer;
  aom_usec_timer_start(&tst_timer);
  for (int iter = 0; iter < num_iters; ++iter) {
    for (int row = 3; row < row_end; ++row) {
      for (int col = 3; col < col_end; ++col) {
        test_impl(src_ptr + row * w + col, w, tst_dst_ptr, out_w, hkernel, 16,
                  vkernel, 16, out_w, out_h, bd);
      }
    }
  }
  aom_usec_timer_mark(&tst_timer);
  const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);

  std::cout << "[ ] C time = " << ref_time / 1000
            << " ms, SIMD time = " << tst_time / 1000 << " ms\n";

  EXPECT_GT(ref_time, tst_time)
      << "Error: AV1HighbdHiprecConvolveTest.SpeedTest, SIMD slower than C.\n"
      << "C time: " << ref_time << " us\n"
      << "SIMD time: " << tst_time << " us\n";

  delete[] src;
  delete[] ref_dst;
  delete[] tst_dst;
}
} // namespace AV1HighbdHiprecConvolve
} // namespace libaom_test
......@@ -12,14 +12,15 @@
#ifndef TEST_HIPREC_CONVOLVE_TEST_UTIL_H_
#define TEST_HIPREC_CONVOLVE_TEST_UTIL_H_
#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
#include "test/acm_random.h"
#include "test/util.h"
#include "./av1_rtcd.h"
#include "./aom_dsp_rtcd.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
#include "./aom_dsp_rtcd.h"
#include "./av1_rtcd.h"
#include "aom_ports/aom_timer.h"
#include "av1/common/mv.h"
namespace libaom_test {
......@@ -48,6 +49,7 @@ class AV1HiprecConvolveTest
protected:
void RunCheckOutput(hiprec_convolve_func test_impl);
void RunSpeedTest(hiprec_convolve_func test_impl);
libaom_test::ACMRandom rnd_;
};
......@@ -76,6 +78,7 @@ class AV1HighbdHiprecConvolveTest
protected:
void RunCheckOutput(highbd_hiprec_convolve_func test_impl);
void RunSpeedTest(highbd_hiprec_convolve_func test_impl);
libaom_test::ACMRandom rnd_;
};
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment