Commit b4faea73 authored by Luc Trudeau's avatar Luc Trudeau
Browse files

[CFL] SSE2/AVX2 versions of subtract_average

Includes unit tests for conformance and speed.

SSE2/CFLAverageSpeedTest:
4x4: C time = 499 us, SIMD time = 156 us (~3.2x)
8x8: C time = 1124 us, SIMD time = 221 us (~5.1x)
16x16: C time = 4228 us, SIMD time = 620 us (~6.8x)
32x32: C time = 8743 us, SIMD time = 2236 us (~3.9x)

AVX2/CFLAverageSpeedTest:
4x4: C time = 482 us, SIMD time = 180 us (~2.7x)
8x8: C time = 1007 us, SIMD time = 227 us (~4.4x)
16x16: C time = 3471 us, SIMD time = 324 us (~11x)
32x32: C time = 8758 us, SIMD time = 1443 us (~6.1x)

Change-Id: Id5ae80142a9764f388c0770ebcff4e46fa3a4dad
parent 0105c604
......@@ -401,6 +401,14 @@ if (CONFIG_CFL)
${AOM_AV1_COMMON_SOURCES}
"${AOM_ROOT}/av1/common/cfl.c"
"${AOM_ROOT}/av1/common/cfl.h")
set(AOM_AV1_COMMON_INTRIN_SSE2
${AOM_AV1_COMMON_INTRIN_SSE2}
"${AOM_ROOT}/av1/common/cfl_sse2.c")
set(AOM_AV1_COMMON_INTRIN_AVX2
${AOM_AV1_COMMON_INTRIN_AVX2}
"${AOM_ROOT}/av1/common/cfl_avx2.c")
endif ()
if (CONFIG_LOOP_RESTORATION)
......
......@@ -110,6 +110,8 @@ AV1_COMMON_SRCS-yes += common/odintrin.h
ifeq ($(CONFIG_CFL),yes)
AV1_COMMON_SRCS-yes += common/cfl.h
AV1_COMMON_SRCS-yes += common/cfl.c
AV1_COMMON_SRCS-$(HAVE_SSE2) += common/cfl_sse2.c
AV1_COMMON_SRCS-$(HAVE_AVX2) += common/cfl_avx2.c
endif
AV1_COMMON_SRCS-yes += common/obmc.h
......
......@@ -612,4 +612,11 @@ if (aom_config("CONFIG_DAALA_TX") eq "yes") {
specialize qw/daala_inv_txfm_add avx2/;
}
# CFL
if (aom_config("CONFIG_CFL") eq "yes") {
add_proto qw/void av1_cfl_subtract/, "int16_t *pred_buf_q3, int width, int height, int16_t avg_q3";
specialize qw/av1_cfl_subtract sse2 avx2/;
}
1;
......@@ -13,6 +13,8 @@
#include "av1/common/common_data.h"
#include "av1/common/onyxc_int.h"
#include "./av1_rtcd.h"
void cfl_init(CFL_CTX *cfl, AV1_COMMON *cm) {
assert(block_size_wide[CFL_MAX_BLOCK_SIZE] == CFL_BUF_LINE);
assert(block_size_high[CFL_MAX_BLOCK_SIZE] == CFL_BUF_LINE);
......@@ -128,6 +130,16 @@ static INLINE void cfl_pad(CFL_CTX *cfl, int width, int height) {
}
}
void av1_cfl_subtract_c(int16_t *pred_buf_q3, int width, int height,
int16_t avg_q3) {
for (int j = 0; j < height; j++) {
for (int i = 0; i < width; i++) {
pred_buf_q3[i] -= avg_q3;
}
pred_buf_q3 += CFL_BUF_LINE;
}
}
static void cfl_subtract_average(CFL_CTX *cfl, TX_SIZE tx_size) {
const int tx_height = tx_size_high[tx_size];
const int tx_width = tx_size_wide[tx_size];
......@@ -150,14 +162,7 @@ static void cfl_subtract_average(CFL_CTX *cfl, TX_SIZE tx_size) {
// Loss is never more than 1/2 (in Q3)
assert(abs((avg_q3 * (1 << num_pel_log2)) - sum_q3) <= 1 << num_pel_log2 >>
1);
pred_buf_q3 = cfl->pred_buf_q3;
for (int j = 0; j < tx_height; j++) {
assert(pred_buf_q3 + tx_width <= cfl->pred_buf_q3 + CFL_BUF_SQUARE);
for (int i = 0; i < tx_width; i++) {
pred_buf_q3[i] -= avg_q3;
}
pred_buf_q3 += CFL_BUF_LINE;
}
av1_cfl_subtract(cfl->pred_buf_q3, tx_width, tx_height, avg_q3);
}
static INLINE int cfl_idx_to_alpha(int alpha_idx, int joint_sign,
......
/*
* Copyright (c) 2017, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <immintrin.h>
#include "./av1_rtcd.h"
#include "av1/common/cfl.h"
/**
* Subtracts avg_q3 from the active part of the CfL prediction buffer.
*
* The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
* active area is specified using width and height.
*
* Note: We don't need to worry about going over the active area, as long as we
* stay inside the CfL prediction buffer.
*/
void av1_cfl_subtract_avx2(int16_t *pred_buf_q3, int width, int height,
int16_t avg_q3) {
const __m256i avg_x16 = _mm256_set1_epi16(avg_q3);
// Sixteen int16 values fit in one __m256i register. If this is enough to do
// the entire row, we move to the next row (stride ==32), otherwise we move to
// the next sixteen values.
// width next
// 4 32
// 8 32
// 16 32
// 32 16
const int stride = CFL_BUF_LINE >> (width == 32);
const int16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
do {
__m256i val_x16 = _mm256_loadu_si256((__m256i *)pred_buf_q3);
_mm256_storeu_si256((__m256i *)pred_buf_q3,
_mm256_sub_epi16(val_x16, avg_x16));
} while ((pred_buf_q3 += stride) < end);
}
/*
* Copyright (c) 2017, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <emmintrin.h>
#include "./av1_rtcd.h"
#include "av1/common/cfl.h"
#define INT16_IN_M128I (8)
#define TWO_BUFFER_LINES (64)
/**
* Subtracts avg_q3 from the active part of the CfL prediction buffer.
*
* The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
* active area is specified using width and height.
*
* Note: We don't need to worry about going over the active area, as long as we
* stay inside the CfL prediction buffer.
*/
void av1_cfl_subtract_sse2(int16_t *pred_buf_q3, int width, int height,
int16_t avg_q3) {
const __m128i avg_x16 = _mm_set1_epi16(avg_q3);
// Eight int16 values fit in one __m128i register. If this is enough to do the
// entire row, the next value is in the next row, otherwise we move to the
// next eight values.
// width next
// 4 32
// 8 32
// 16 8
// 32 8
const int next = CFL_BUF_LINE >> (2 * (width > INT16_IN_M128I));
// If next was in the next row (next == 32), then we need to jump 2 rows
// (stride == 64). Otherwise, if width is 16 we move to the next row, if width
// is 32 we move 16 values.
// width stride
// 4 64
// 8 64
// 16 32
// 32 16
const int stride = TWO_BUFFER_LINES >> (width >> 4);
const int16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
do {
__m128i val_x16 = _mm_loadu_si128((__m128i *)pred_buf_q3);
__m128i next_val_x16 = _mm_loadu_si128((__m128i *)(pred_buf_q3 + next));
_mm_storeu_si128((__m128i *)pred_buf_q3, _mm_sub_epi16(val_x16, avg_x16));
_mm_storeu_si128((__m128i *)(pred_buf_q3 + next),
_mm_sub_epi16(next_val_x16, avg_x16));
} while ((pred_buf_q3 += stride) < end);
}
......@@ -36,6 +36,12 @@ class ACMRandom {
return (value >> 15) & 0xffff;
}
int16_t Rand15Signed(void) {
const uint32_t value =
random_.Generate(testing::internal::Random::kMaxRange);
return (value >> 17) & 0xffff;
}
int16_t Rand9Signed(void) {
// Use 9 bits: values between 255 (0x0FF) and -256 (0x100).
const uint32_t value = random_.Generate(512);
......
/*
* Copyright (c) 2017, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
#include "aom_ports/aom_timer.h"
#include "./av1_rtcd.h"
#include "test/util.h"
#include "test/acm_random.h"
using std::tr1::make_tuple;
using libaom_test::ACMRandom;
#define NUM_ITERATIONS (100)
#define NUM_ITERATIONS_SPEED (INT16_MAX)
#define ALL_SIZES_CFL(function) \
make_tuple(4, 4, &function), make_tuple(8, 4, &function), \
make_tuple(4, 8, &function), make_tuple(8, 8, &function), \
make_tuple(16, 8, &function), make_tuple(8, 16, &function), \
make_tuple(16, 16, &function), make_tuple(32, 16, &function), \
make_tuple(16, 32, &function), make_tuple(32, 32, &function)
namespace {
typedef void (*subtract_fn)(int16_t *pred_buf_q3, int width, int height,
int16_t avg_q3);
typedef std::tr1::tuple<int, int, subtract_fn> subtract_param;
static void assertFaster(int ref_elapsed_time, int elapsed_time) {
EXPECT_GT(ref_elapsed_time, elapsed_time)
<< "Error: CFLSubtractSpeedTest, SIMD slower than C." << std::endl
<< "C time: " << ref_elapsed_time << " us" << std::endl
<< "SIMD time: " << elapsed_time << " us" << std::endl;
}
static void printSpeed(int ref_elapsed_time, int elapsed_time, int width,
int height) {
std::cout.precision(2);
std::cout << "[ ] " << width << "x" << height
<< ": C time = " << ref_elapsed_time
<< " us, SIMD time = " << elapsed_time << " us"
<< " (~" << ref_elapsed_time / (double)elapsed_time << "x) "
<< std::endl;
}
class CFLSubtractTest : public ::testing::TestWithParam<subtract_param> {
public:
virtual ~CFLSubtractTest() {}
virtual void SetUp() { subtract = GET_PARAM(2); }
protected:
int16_t pred_buf_data[CFL_BUF_SQUARE];
int16_t pred_buf_data_ref[CFL_BUF_SQUARE];
subtract_fn subtract;
int Width() const { return GET_PARAM(0); }
int Height() const { return GET_PARAM(1); }
void init(int width, int height) {
int k = 0;
for (int j = 0; j < height; j++) {
for (int i = 0; i < width; i++) {
pred_buf_data[j * CFL_BUF_LINE + i] = k;
pred_buf_data_ref[j * CFL_BUF_LINE + i] = k++;
}
}
}
};
TEST_P(CFLSubtractTest, SubtractTest) {
const int width = Width();
const int height = Height();
ACMRandom rnd(ACMRandom::DeterministicSeed());
for (int it = 0; it < NUM_ITERATIONS; it++) {
init(width, height);
int16_t k = rnd.Rand15Signed();
subtract(pred_buf_data, width, height, k);
av1_cfl_subtract_c(pred_buf_data_ref, width, height, k);
for (int j = 0; j < height; j++) {
for (int i = 0; i < width; i++) {
ASSERT_EQ(pred_buf_data[j * CFL_BUF_LINE + i],
pred_buf_data_ref[j * CFL_BUF_LINE + i]);
ASSERT_EQ(pred_buf_data[j * CFL_BUF_LINE + i], -k);
k--;
}
}
}
}
TEST_P(CFLSubtractTest, DISABLED_SubtractSpeedTest) {
const int width = Width();
const int height = Height();
aom_usec_timer ref_timer;
aom_usec_timer timer;
init(width, height);
aom_usec_timer_start(&ref_timer);
for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) {
av1_cfl_subtract_c(pred_buf_data_ref, width, height, k);
}
aom_usec_timer_mark(&ref_timer);
const int ref_elapsed_time = (int)aom_usec_timer_elapsed(&ref_timer);
aom_usec_timer_start(&timer);
for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) {
subtract(pred_buf_data, width, height, k);
}
aom_usec_timer_mark(&timer);
const int elapsed_time = (int)aom_usec_timer_elapsed(&timer);
#if 1
printSpeed(ref_elapsed_time, elapsed_time, width, height);
#endif
assertFaster(ref_elapsed_time, elapsed_time);
}
#if HAVE_SSE2
const subtract_param subtract_sizes_sse2[] = { ALL_SIZES_CFL(
av1_cfl_subtract_sse2) };
INSTANTIATE_TEST_CASE_P(SSE2, CFLSubtractTest,
::testing::ValuesIn(subtract_sizes_sse2));
#endif
#if HAVE_AVX2
const subtract_param subtract_sizes_avx2[] = { ALL_SIZES_CFL(
av1_cfl_subtract_avx2) };
INSTANTIATE_TEST_CASE_P(AVX2, CFLSubtractTest,
::testing::ValuesIn(subtract_sizes_avx2));
#endif
} // namespace
......@@ -140,6 +140,12 @@ if (NOT BUILD_SHARED_LIBS)
"${AOM_ROOT}/test/intrabc_test.cc")
endif ()
if (CONFIG_CFL)
set(AOM_UNIT_TEST_COMMON_SOURCES
${AOM_UNIT_TEST_COMMON_SOURCES}
"${AOM_ROOT}/test/cfl_test.cc")
endif ()
if (CONFIG_LOOP_RESTORATION)
set(AOM_UNIT_TEST_COMMON_SOURCES
${AOM_UNIT_TEST_COMMON_SOURCES}
......
......@@ -124,6 +124,9 @@ endif
LIBAOM_TEST_SRCS-yes += divu_small_test.cc
#LIBAOM_TEST_SRCS-yes += encoder_parms_get_to_decoder.cc
endif
ifeq ($(CONFIG_CFL),yes)
LIBAOM_TEST_SRCS-yes += cfl_test.cc
endif
LIBAOM_TEST_SRCS-yes += convolve_test.cc
LIBAOM_TEST_SRCS-yes += lpf_test.cc
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment