Commit c6acc531 authored by Imdad Sardharwalla's avatar Imdad Sardharwalla Committed by Debargha Mukherjee

Added AVX2 implementation of self-guided filter

The self-guided filter has now been implemented using
the intrinsics for AVX2. The corresponding speed and
correctness tests have also been added.

Note: All AVX2 functions are in synonyms_avx2.h, as
GCC produces 'ABI change' warnings if they are
included in synonyms.h.

Change-Id: I2a283a4acf8c01ee835d5edc526abc242d87ad9b
parent d3e22456
......@@ -16,6 +16,7 @@ DSP_SRCS-yes += aom_dsp_common.h
DSP_SRCS-$(HAVE_MSA) += mips/macros_msa.h
DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/synonyms.h
DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/synonyms_avx2.h
# bit reader
DSP_SRCS-yes += prob.h
......
/*
* Copyright (c) 2018, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_X86_SYNONYMS_AVX2_H_
#define AOM_DSP_X86_SYNONYMS_AVX2_H_
#include <immintrin.h>
#include "./aom_config.h"
#include "aom/aom_integer.h"
/**
* Various reusable shorthands for x86 SIMD intrinsics.
*
* Intrinsics prefixed with xx_ operate on or return 128bit XMM registers.
* Intrinsics prefixed with yy_ operate on or return 256bit YMM registers.
*/
// Loads and stores to do away with the tedium of casting the address
// to the right type.
static INLINE __m256i yy_load_256(const void *a) {
return _mm256_load_si256((const __m256i *)a);
}
static INLINE __m256i yy_loadu_256(const void *a) {
return _mm256_loadu_si256((const __m256i *)a);
}
static INLINE void yy_store_256(void *const a, const __m256i v) {
_mm256_store_si256((__m256i *)a, v);
}
static INLINE void yy_storeu_256(void *const a, const __m256i v) {
_mm256_storeu_si256((__m256i *)a, v);
}
#endif // AOM_DSP_X86_SYNONYMS_AVX2_H_
......@@ -420,6 +420,10 @@ if (CONFIG_LOOP_RESTORATION)
${AOM_AV1_COMMON_INTRIN_SSE4_1}
"${AOM_ROOT}/av1/common/x86/selfguided_sse4.c")
set(AOM_AV1_COMMON_INTRIN_AVX2
${AOM_AV1_COMMON_INTRIN_AVX2}
"${AOM_ROOT}/av1/common/x86/selfguided_avx2.c")
set(AOM_AV1_ENCODER_SOURCES
${AOM_AV1_ENCODER_SOURCES}
"${AOM_ROOT}/av1/encoder/pickrst.c"
......
......@@ -88,6 +88,7 @@ ifeq ($(CONFIG_LOOP_RESTORATION),yes)
AV1_COMMON_SRCS-yes += common/restoration.h
AV1_COMMON_SRCS-yes += common/restoration.c
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/selfguided_sse4.c
AV1_COMMON_SRCS-$(HAVE_AVX2) += common/x86/selfguided_avx2.c
endif
ifeq ($(CONFIG_INTRA_EDGE),yes)
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/intra_edge_sse4.c
......
......@@ -509,10 +509,10 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
if (aom_config("CONFIG_LOOP_RESTORATION") eq "yes") {
add_proto qw/void apply_selfguided_restoration/, "const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd";
specialize qw/apply_selfguided_restoration sse4_1/;
specialize qw/apply_selfguided_restoration sse4_1 avx2/;
add_proto qw/void av1_selfguided_restoration/, "const uint8_t *dgd, int width, int height, int stride, int32_t *flt1, int32_t *flt2, int flt_stride, const sgr_params_type *params, int bit_depth, int highbd";
specialize qw/av1_selfguided_restoration sse4_1/;
specialize qw/av1_selfguided_restoration sse4_1 avx2/;
}
# CONVOLVE_ROUND/COMPOUND_ROUND functions
......
This diff is collapsed.
......@@ -29,7 +29,13 @@ using std::tr1::tuple;
using std::tr1::make_tuple;
using libaom_test::ACMRandom;
typedef tuple<> FilterTestParam;
typedef void (*SgrFunc)(const uint8_t *dat8, int width, int height, int stride,
int eps, const int *xqd, uint8_t *dst8, int dst_stride,
int32_t *tmpbuf, int bit_depth, int highbd);
// Test parameter list:
// <tst_fun_>
typedef tuple<SgrFunc> FilterTestParam;
class AV1SelfguidedFilterTest
: public ::testing::TestWithParam<FilterTestParam> {
......@@ -41,6 +47,7 @@ class AV1SelfguidedFilterTest
protected:
void RunSpeedTest() {
tst_fun_ = GET_PARAM(0);
const int pu_width = RESTORATION_PROC_UNIT_SIZE;
const int pu_height = RESTORATION_PROC_UNIT_SIZE;
const int width = 256, height = 256, stride = 288, out_stride = 288;
......@@ -48,10 +55,10 @@ class AV1SelfguidedFilterTest
int i, j, k;
uint8_t *input_ =
(uint8_t *)aom_memalign(16, stride * (height + 32) * sizeof(uint8_t));
(uint8_t *)aom_memalign(32, stride * (height + 32) * sizeof(uint8_t));
uint8_t *output_ = (uint8_t *)aom_memalign(
16, out_stride * (height + 32) * sizeof(uint8_t));
int32_t *tmpbuf = (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE);
32, out_stride * (height + 32) * sizeof(uint8_t));
int32_t *tmpbuf = (int32_t *)aom_memalign(32, RESTORATION_TMPBUF_SIZE);
uint8_t *input = input_ + stride * 16 + 16;
uint8_t *output = output_ + out_stride * 16 + 16;
......@@ -81,8 +88,8 @@ class AV1SelfguidedFilterTest
int h = AOMMIN(pu_height, height - k);
uint8_t *input_p = input + k * stride + j;
uint8_t *output_p = output + k * out_stride + j;
apply_selfguided_restoration(input_p, w, h, stride, eps, xqd,
output_p, out_stride, tmpbuf, 8, 0);
tst_fun_(input_p, w, h, stride, eps, xqd, output_p, out_stride,
tmpbuf, 8, 0);
}
}
std::clock_t end = std::clock();
......@@ -97,6 +104,7 @@ class AV1SelfguidedFilterTest
}
void RunCorrectnessTest() {
tst_fun_ = GET_PARAM(0);
const int pu_width = RESTORATION_PROC_UNIT_SIZE;
const int pu_height = RESTORATION_PROC_UNIT_SIZE;
// Set the maximum width/height to test here. We actually test a small
......@@ -107,12 +115,12 @@ class AV1SelfguidedFilterTest
int i, j, k;
uint8_t *input_ =
(uint8_t *)aom_memalign(16, stride * (max_h + 32) * sizeof(uint8_t));
(uint8_t *)aom_memalign(32, stride * (max_h + 32) * sizeof(uint8_t));
uint8_t *output_ = (uint8_t *)aom_memalign(
16, out_stride * (max_h + 32) * sizeof(uint8_t));
32, out_stride * (max_h + 32) * sizeof(uint8_t));
uint8_t *output2_ = (uint8_t *)aom_memalign(
16, out_stride * (max_h + 32) * sizeof(uint8_t));
int32_t *tmpbuf = (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE);
32, out_stride * (max_h + 32) * sizeof(uint8_t));
int32_t *tmpbuf = (int32_t *)aom_memalign(32, RESTORATION_TMPBUF_SIZE);
uint8_t *input = input_ + stride * 16 + 16;
uint8_t *output = output_ + out_stride * 16 + 16;
......@@ -146,17 +154,12 @@ class AV1SelfguidedFilterTest
uint8_t *input_p = input + k * stride + j;
uint8_t *output_p = output + k * out_stride + j;
uint8_t *output2_p = output2 + k * out_stride + j;
apply_selfguided_restoration(input_p, w, h, stride, eps, xqd,
output_p, out_stride, tmpbuf, 8, 0);
tst_fun_(input_p, w, h, stride, eps, xqd, output_p, out_stride,
tmpbuf, 8, 0);
apply_selfguided_restoration_c(input_p, w, h, stride, eps, xqd,
output2_p, out_stride, tmpbuf, 8, 0);
}
/*
apply_selfguided_restoration(input, test_w, test_h, stride, eps, xqd,
output, out_stride, tmpbuf);
apply_selfguided_restoration_c(input, test_w, test_h, stride, eps, xqd,
output2, out_stride, tmpbuf);
*/
for (j = 0; j < test_h; ++j)
for (k = 0; k < test_w; ++k) {
ASSERT_EQ(output[j * out_stride + k], output2[j * out_stride + k]);
......@@ -168,18 +171,27 @@ class AV1SelfguidedFilterTest
aom_free(output2_);
aom_free(tmpbuf);
}
private:
SgrFunc tst_fun_;
};
TEST_P(AV1SelfguidedFilterTest, SpeedTest) { RunSpeedTest(); }
TEST_P(AV1SelfguidedFilterTest, DISABLED_SpeedTest) { RunSpeedTest(); }
TEST_P(AV1SelfguidedFilterTest, CorrectnessTest) { RunCorrectnessTest(); }
#if HAVE_SSE4_1
const FilterTestParam params[] = { make_tuple() };
INSTANTIATE_TEST_CASE_P(SSE4_1, AV1SelfguidedFilterTest,
::testing::ValuesIn(params));
::testing::Values(apply_selfguided_restoration_sse4_1));
#endif
#if HAVE_AVX2
INSTANTIATE_TEST_CASE_P(AVX2, AV1SelfguidedFilterTest,
::testing::Values(apply_selfguided_restoration_avx2));
#endif
typedef tuple<int> HighbdFilterTestParam;
// Test parameter list:
// <tst_fun_, bit_depth>
typedef tuple<SgrFunc, int> HighbdFilterTestParam;
class AV1HighbdSelfguidedFilterTest
: public ::testing::TestWithParam<HighbdFilterTestParam> {
......@@ -191,19 +203,20 @@ class AV1HighbdSelfguidedFilterTest
protected:
void RunSpeedTest() {
tst_fun_ = GET_PARAM(0);
const int pu_width = RESTORATION_PROC_UNIT_SIZE;
const int pu_height = RESTORATION_PROC_UNIT_SIZE;
const int width = 256, height = 256, stride = 288, out_stride = 288;
const int NUM_ITERS = 2000;
int i, j, k;
int bit_depth = GET_PARAM(0);
int bit_depth = GET_PARAM(1);
int mask = (1 << bit_depth) - 1;
uint16_t *input_ =
(uint16_t *)aom_memalign(16, stride * (height + 32) * sizeof(uint16_t));
(uint16_t *)aom_memalign(32, stride * (height + 32) * sizeof(uint16_t));
uint16_t *output_ = (uint16_t *)aom_memalign(
16, out_stride * (height + 32) * sizeof(uint16_t));
int32_t *tmpbuf = (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE);
32, out_stride * (height + 32) * sizeof(uint16_t));
int32_t *tmpbuf = (int32_t *)aom_memalign(32, RESTORATION_TMPBUF_SIZE);
uint16_t *input = input_ + stride * 16 + 16;
uint16_t *output = output_ + out_stride * 16 + 16;
......@@ -234,9 +247,9 @@ class AV1HighbdSelfguidedFilterTest
int h = AOMMIN(pu_height, height - k);
uint16_t *input_p = input + k * stride + j;
uint16_t *output_p = output + k * out_stride + j;
apply_selfguided_restoration(
CONVERT_TO_BYTEPTR(input_p), w, h, stride, eps, xqd,
CONVERT_TO_BYTEPTR(output_p), out_stride, tmpbuf, bit_depth, 1);
tst_fun_(CONVERT_TO_BYTEPTR(input_p), w, h, stride, eps, xqd,
CONVERT_TO_BYTEPTR(output_p), out_stride, tmpbuf, bit_depth,
1);
}
}
aom_usec_timer_mark(&timer);
......@@ -251,6 +264,7 @@ class AV1HighbdSelfguidedFilterTest
}
void RunCorrectnessTest() {
tst_fun_ = GET_PARAM(0);
const int pu_width = RESTORATION_PROC_UNIT_SIZE;
const int pu_height = RESTORATION_PROC_UNIT_SIZE;
// Set the maximum width/height to test here. We actually test a small
......@@ -259,16 +273,16 @@ class AV1HighbdSelfguidedFilterTest
const int max_w = 260, max_h = 260, stride = 672, out_stride = 672;
const int NUM_ITERS = 81;
int i, j, k;
int bit_depth = GET_PARAM(0);
int bit_depth = GET_PARAM(1);
int mask = (1 << bit_depth) - 1;
uint16_t *input_ =
(uint16_t *)aom_memalign(16, stride * (max_h + 32) * sizeof(uint16_t));
(uint16_t *)aom_memalign(32, stride * (max_h + 32) * sizeof(uint16_t));
uint16_t *output_ = (uint16_t *)aom_memalign(
16, out_stride * (max_h + 32) * sizeof(uint16_t));
32, out_stride * (max_h + 32) * sizeof(uint16_t));
uint16_t *output2_ = (uint16_t *)aom_memalign(
16, out_stride * (max_h + 32) * sizeof(uint16_t));
int32_t *tmpbuf = (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE);
32, out_stride * (max_h + 32) * sizeof(uint16_t));
int32_t *tmpbuf = (int32_t *)aom_memalign(32, RESTORATION_TMPBUF_SIZE);
uint16_t *input = input_ + stride * 16 + 16;
uint16_t *output = output_ + out_stride * 16 + 16;
......@@ -302,22 +316,14 @@ class AV1HighbdSelfguidedFilterTest
uint16_t *input_p = input + k * stride + j;
uint16_t *output_p = output + k * out_stride + j;
uint16_t *output2_p = output2 + k * out_stride + j;
apply_selfguided_restoration(
CONVERT_TO_BYTEPTR(input_p), w, h, stride, eps, xqd,
CONVERT_TO_BYTEPTR(output_p), out_stride, tmpbuf, bit_depth, 1);
tst_fun_(CONVERT_TO_BYTEPTR(input_p), w, h, stride, eps, xqd,
CONVERT_TO_BYTEPTR(output_p), out_stride, tmpbuf, bit_depth,
1);
apply_selfguided_restoration_c(
CONVERT_TO_BYTEPTR(input_p), w, h, stride, eps, xqd,
CONVERT_TO_BYTEPTR(output2_p), out_stride, tmpbuf, bit_depth, 1);
}
/*
apply_selfguided_restoration_highbd(input, test_w, test_h, stride,
bit_depth, eps, xqd, output,
out_stride, tmpbuf);
apply_selfguided_restoration_highbd_c(input, test_w, test_h, stride,
bit_depth, eps, xqd, output2,
out_stride, tmpbuf);
*/
for (j = 0; j < test_h; ++j)
for (k = 0; k < test_w; ++k)
ASSERT_EQ(output[j * out_stride + k], output2[j * out_stride + k]);
......@@ -328,16 +334,28 @@ class AV1HighbdSelfguidedFilterTest
aom_free(output2_);
aom_free(tmpbuf);
}
private:
SgrFunc tst_fun_;
};
TEST_P(AV1HighbdSelfguidedFilterTest, SpeedTest) { RunSpeedTest(); }
TEST_P(AV1HighbdSelfguidedFilterTest, DISABLED_SpeedTest) { RunSpeedTest(); }
TEST_P(AV1HighbdSelfguidedFilterTest, CorrectnessTest) { RunCorrectnessTest(); }
#if HAVE_SSE4_1
const HighbdFilterTestParam highbd_params[] = { make_tuple(8), make_tuple(10),
make_tuple(12) };
INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdSelfguidedFilterTest,
::testing::ValuesIn(highbd_params));
const int highbd_params_sse4_1[] = { 8, 10, 12 };
INSTANTIATE_TEST_CASE_P(
SSE4_1, AV1HighbdSelfguidedFilterTest,
::testing::Combine(::testing::Values(apply_selfguided_restoration_sse4_1),
::testing::ValuesIn(highbd_params_sse4_1)));
#endif
#if HAVE_AVX2
const int highbd_params_avx2[] = { 8, 10, 12 };
INSTANTIATE_TEST_CASE_P(
AVX2, AV1HighbdSelfguidedFilterTest,
::testing::Combine(::testing::Values(apply_selfguided_restoration_avx2),
::testing::ValuesIn(highbd_params_avx2)));
#endif
} // namespace
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment