Commit 365f73bb authored by Luc Trudeau, committed by David Michael Barr

[CFL] SSE2/AVX2 Versions of Sum and Subtract Average

Includes unit tests for conformance and speed.

SSE2/CFLSubAvgTest
4x4: C time = 234 us, SIMD time = 152 us (~1.5x)
8x8: C time = 664 us, SIMD time = 208 us (~3.2x)
16x16: C time = 1687 us, SIMD time = 581 us (~2.9x)
32x32: C time = 6118 us, SIMD time = 2119 us (~2.9x)

AVX2/CFLSubAvgTest
4x4: C time = 250 us, SIMD time = 221 us (~1.1x)
8x8: C time = 683 us, SIMD time = 284 us (~2.4x)
16x16: C time = 1727 us, SIMD time = 1091 us (~1.6x)
32x32: C time = 6092 us, SIMD time = 2107 us (~2.9x)
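
For reference, the kernel being vectorized is the new scalar subtract_average_c in cfl.c: sum the active width x height region of the Q3 prediction buffer, form a rounded average, and subtract it from every sample. A minimal sketch of that reference behaviour (the name subtract_average_sketch is only for illustration; CFL_BUF_LINE is the fixed 32-sample row stride of the buffer):

static void subtract_average_sketch(int16_t *pred_buf_q3, int width,
                                    int height, int round_offset,
                                    int num_pel_log2) {
  // Pass 1: sum the active width x height samples.
  int sum_q3 = 0;
  int16_t *buf = pred_buf_q3;
  for (int j = 0; j < height; j++) {
    for (int i = 0; i < width; i++) sum_q3 += buf[i];
    buf += CFL_BUF_LINE;
  }
  // Rounded average; round_offset is 1 << (num_pel_log2 - 1).
  const int avg_q3 = (sum_q3 + round_offset) >> num_pel_log2;
  // Pass 2: subtract the average from every active sample.
  for (int j = 0; j < height; j++) {
    for (int i = 0; i < width; i++) pred_buf_q3[i] -= avg_q3;
    pred_buf_q3 += CFL_BUF_LINE;
  }
}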

Change-Id: I44ffedc683829d2c16089854ac43d4ddb4415bcd
parent 2e8ae05c
......@@ -384,8 +384,8 @@ if (CONFIG_CFL)
"${AOM_ROOT}/av1/common/cfl.h")
set(AOM_AV1_COMMON_INTRIN_SSE2
${AOM_AV1_COMMON_INTRIN_SSE2}
"${AOM_ROOT}/av1/common/cfl_sse2.c")
${AOM_AV1_COMMON_INTRIN_SSE2}
"${AOM_ROOT}/av1/common/cfl_sse2.c")
set(AOM_AV1_COMMON_INTRIN_SSSE3
${AOM_AV1_COMMON_INTRIN_SSSE3}
......
......@@ -552,8 +552,8 @@ if (aom_config("CONFIG_INTRA_EDGE") eq "yes") {
# CFL
if (aom_config("CONFIG_CFL") eq "yes") {
add_proto qw/void av1_cfl_subtract/, "int16_t *pred_buf_q3, int width, int height, int16_t avg_q3";
specialize qw/av1_cfl_subtract sse2 avx2/;
add_proto qw/cfl_subtract_average_fn get_subtract_average_fn/, "TX_SIZE tx_size";
specialize qw/get_subtract_average_fn sse2 avx2/;
add_proto qw/cfl_subsample_lbd_fn get_subsample_lbd_fn/, "int sub_x, int sub_y";
specialize qw/get_subsample_lbd_fn ssse3 avx2/;
......
......@@ -520,6 +520,8 @@ typedef struct {
#endif // CONFIG_DEBUG
#define CFL_MAX_BLOCK_SIZE (BLOCK_32X32)
#define CFL_BUF_LINE (32)
#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
#define CFL_BUF_SQUARE (CFL_BUF_LINE * CFL_BUF_LINE)
typedef struct cfl_ctx {
// The CfL prediction buffer is used in two steps:
......
......@@ -125,41 +125,31 @@ static INLINE void cfl_pad(CFL_CTX *cfl, int width, int height) {
}
}
void av1_cfl_subtract_c(int16_t *pred_buf_q3, int width, int height,
int16_t avg_q3) {
static void subtract_average_c(int16_t *pred_buf_q3, int width, int height,
int round_offset, int num_pel_log2) {
int sum_q3 = 0;
int16_t *pred_buf = pred_buf_q3;
for (int j = 0; j < height; j++) {
// assert(pred_buf_q3 + tx_width <= cfl->pred_buf_q3 + CFL_BUF_SQUARE);
for (int i = 0; i < width; i++) {
pred_buf_q3[i] -= avg_q3;
sum_q3 += pred_buf[i];
}
pred_buf_q3 += CFL_BUF_LINE;
pred_buf += CFL_BUF_LINE;
}
}
static void cfl_subtract_average(CFL_CTX *cfl, TX_SIZE tx_size) {
const int tx_height = tx_size_high[tx_size];
const int tx_width = tx_size_wide[tx_size];
const int num_pel_log2 =
tx_size_high_log2[tx_size] + tx_size_wide_log2[tx_size];
int16_t *pred_buf_q3 = cfl->pred_buf_q3;
int sum_q3 = 0;
cfl_pad(cfl, tx_width, tx_height);
for (int j = 0; j < tx_height; j++) {
assert(pred_buf_q3 + tx_width <= cfl->pred_buf_q3 + CFL_BUF_SQUARE);
for (int i = 0; i < tx_width; i++) {
sum_q3 += pred_buf_q3[i];
const int avg_q3 = (sum_q3 + round_offset) >> num_pel_log2;
// Loss is never more than 1/2 (in Q3)
// assert(abs((avg_q3 * (1 << num_pel_log2)) - sum_q3) <= 1 << num_pel_log2 >>
// 1);
for (int j = 0; j < height; j++) {
for (int i = 0; i < width; i++) {
pred_buf_q3[i] -= avg_q3;
}
pred_buf_q3 += CFL_BUF_LINE;
}
const int avg_q3 = (sum_q3 + (1 << (num_pel_log2 - 1))) >> num_pel_log2;
// Loss is never more than 1/2 (in Q3)
assert(abs((avg_q3 * (1 << num_pel_log2)) - sum_q3) <= 1 << num_pel_log2 >>
1);
av1_cfl_subtract(cfl->pred_buf_q3, tx_width, tx_height, avg_q3);
}
CFL_SUB_AVG_FN(c)
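// Worked example of the rounding bound above (illustrative numbers only): for
// a 4x4 transform, num_pel_log2 = 4 and round_offset = 1 << (4 - 1) = 8. If
// sum_q3 = 100, then avg_q3 = (100 + 8) >> 4 = 6 and the reconstruction error
// is |6 * 16 - 100| = 4, within the asserted bound (1 << num_pel_log2) >> 1 = 8.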
static INLINE int cfl_idx_to_alpha(int alpha_idx, int joint_sign,
CFL_PRED_TYPE pred_type) {
const int alpha_sign = (pred_type == CFL_PRED_U) ? CFL_SIGN_U(joint_sign)
......@@ -227,7 +217,8 @@ static void cfl_compute_parameters(MACROBLOCKD *const xd, TX_SIZE tx_size) {
}
#endif // CONFIG_DEBUG && !CONFIG_RECT_TX_EXT_INTRA
cfl_subtract_average(cfl, tx_size);
cfl_pad(cfl, tx_size_wide[tx_size], tx_size_high[tx_size]);
get_subtract_average_fn(tx_size)(cfl->pred_buf_q3);
cfl->are_parameters_computed = 1;
}
......
......@@ -20,6 +20,8 @@ typedef void (*cfl_subsample_lbd_fn)(const uint8_t *input, int input_stride,
typedef void (*cfl_subsample_hbd_fn)(const uint16_t *input, int input_stride,
int16_t *output_q3, int width, int height);
typedef void (*cfl_subtract_average_fn)(int16_t *pred_buf_q3);
typedef void (*cfl_predict_lbd_fn)(const int16_t *pred_buf_q3, uint8_t *dst,
int dst_stride, TX_SIZE tx_size,
int alpha_q3);
......@@ -96,4 +98,83 @@ void cfl_luma_subsampling_444_lbd_c(const uint8_t *input, int input_stride,
return cfl_luma_subsampling_444_hbd_c; \
}
// Null function used for invalid tx_sizes
static INLINE void cfl_subtract_average_null(int16_t *pred_buf_q3) {
(void)pred_buf_q3;
assert(0);
}
#define CFL_SUB_AVG_X(arch, width, height, round_offset, num_pel_log2) \
static void subtract_average_##width##x##height##_x(int16_t *pred_buf_q3) { \
subtract_average_##arch(pred_buf_q3, width, height, round_offset, \
num_pel_log2); \
}
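// For instance, CFL_SUB_AVG_X(c, 4, 4, 8, 4) expands (roughly) to the wrapper
// below, binding width = 4, height = 4, round_offset = 8 and num_pel_log2 = 4
// to the generic C kernel:
//
//   static void subtract_average_4x4_x(int16_t *pred_buf_q3) {
//     subtract_average_c(pred_buf_q3, 4, 4, 8, 4);
//   }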
#if CONFIG_TX64X64
#define CFL_SUB_AVG_FN(arch) \
CFL_SUB_AVG_X(arch, 4, 4, 8, 4) \
CFL_SUB_AVG_X(arch, 4, 8, 16, 5) \
CFL_SUB_AVG_X(arch, 4, 16, 32, 6) \
CFL_SUB_AVG_X(arch, 8, 4, 16, 5) \
CFL_SUB_AVG_X(arch, 8, 8, 32, 6) \
CFL_SUB_AVG_X(arch, 8, 16, 64, 7) \
CFL_SUB_AVG_X(arch, 8, 32, 128, 8) \
CFL_SUB_AVG_X(arch, 16, 4, 32, 6) \
CFL_SUB_AVG_X(arch, 16, 8, 64, 7) \
CFL_SUB_AVG_X(arch, 16, 16, 128, 8) \
CFL_SUB_AVG_X(arch, 16, 32, 256, 9) \
CFL_SUB_AVG_X(arch, 32, 8, 128, 8) \
CFL_SUB_AVG_X(arch, 32, 16, 256, 9) \
CFL_SUB_AVG_X(arch, 32, 32, 512, 10) \
cfl_subtract_average_fn get_subtract_average_fn_##arch(TX_SIZE tx_size) { \
static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = { \
subtract_average_4x4_x, /* 4x4 */ \
subtract_average_8x8_x, /* 8x8 */ \
subtract_average_16x16_x, /* 16x16 */ \
subtract_average_32x32_x, /* 32x32 */ \
cfl_subtract_average_null, /* 64x64 (invalid CFL size) */ \
subtract_average_4x8_x, /* 4x8 */ \
subtract_average_8x4_x, /* 8x4 */ \
subtract_average_8x16_x, /* 8x16 */ \
subtract_average_16x8_x, /* 16x8 */ \
subtract_average_16x32_x, /* 16x32 */ \
subtract_average_32x16_x, /* 32x16 */ \
cfl_subtract_average_null, /* 32x64 (invalid CFL size) */ \
cfl_subtract_average_null, /* 64x32 (invalid CFL size) */ \
subtract_average_4x16_x, /* 4x16 (invalid CFL size) */ \
subtract_average_16x4_x, /* 16x4 (invalid CFL size) */ \
subtract_average_8x32_x, /* 8x32 (invalid CFL size) */ \
subtract_average_32x8_x, /* 32x8 (invalid CFL size) */ \
cfl_subtract_average_null, /* 16x64 (invalid CFL size) */ \
cfl_subtract_average_null, /* 64x16 (invalid CFL size) */ \
}; \
/* Modulo TX_SIZES_ALL to ensure that an attacker won't be able to */ \
/* index the function pointer array out of bounds. */ \
return sub_avg[tx_size % TX_SIZES_ALL]; \
}
#else
#define CFL_SUB_AVG_FN(arch) \
cfl_subtract_average_fn get_subtract_average_fn_##arch(TX_SIZE tx_size) { \
static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = { \
subtract_average_4x4_x, /* 4x4 */ \
subtract_average_8x8_x, /* 8x8 */ \
subtract_average_16x16_x, /* 16x16 */ \
subtract_average_32x32_x, /* 32x32 */ \
subtract_average_4x8_x, /* 4x8 */ \
subtract_average_8x4_x, /* 8x4 */ \
subtract_average_8x16_x, /* 8x16 */ \
subtract_average_16x8_x, /* 16x8 */ \
subtract_average_16x32_x, /* 16x32 */ \
subtract_average_32x16_x, /* 32x16 */ \
subtract_average_4x16_x, /* 4x16 (invalid CFL size) */ \
subtract_average_16x4_x, /* 16x4 (invalid CFL size) */ \
subtract_average_8x32_x, /* 8x32 (invalid CFL size) */ \
subtract_average_32x8_x, /* 32x8 (invalid CFL size) */ \
}; \
/* Modulo TX_SIZES_ALL to ensure that an attacker won't be able to */ \
/* index the function pointer array out of bounds. */ \
return sub_avg[tx_size % TX_SIZES_ALL]; \
}
#endif
#endif // AV1_COMMON_CFL_H_
......@@ -14,37 +14,6 @@
#include "av1/common/cfl.h"
/**
* Subtracts avg_q3 from the active part of the CfL prediction buffer.
*
* The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
* active area is specified using width and height.
*
* Note: We don't need to worry about going over the active area, as long as we
* stay inside the CfL prediction buffer.
*/
void av1_cfl_subtract_avx2(int16_t *pred_buf_q3, int width, int height,
int16_t avg_q3) {
const __m256i avg_x16 = _mm256_set1_epi16(avg_q3);
// Sixteen int16 values fit in one __m256i register. If this is enough to do
// the entire row, we move to the next row (stride ==32), otherwise we move to
// the next sixteen values.
// width next
// 4 32
// 8 32
// 16 32
// 32 16
const int stride = CFL_BUF_LINE >> (width == 32);
const int16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
do {
__m256i val_x16 = _mm256_loadu_si256((__m256i *)pred_buf_q3);
_mm256_storeu_si256((__m256i *)pred_buf_q3,
_mm256_sub_epi16(val_x16, avg_x16));
} while ((pred_buf_q3 += stride) < end);
}
/**
* Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more
* precise version of a box filter 4:2:0 pixel subsampling in Q3.
......@@ -216,3 +185,74 @@ cfl_predict_hbd_fn get_predict_hbd_fn_avx2(TX_SIZE tx_size) {
};
return predict_hbd[(tx_size_wide_log2[tx_size] - tx_size_wide_log2[0]) & 3];
}
static INLINE __m256i fill_sum_epi32(__m256i l0) {
l0 = _mm256_add_epi32(l0, _mm256_shuffle_epi32(l0, _MM_SHUFFLE(1, 0, 3, 2)));
return _mm256_add_epi32(l0,
_mm256_shuffle_epi32(l0, _MM_SHUFFLE(2, 3, 0, 1)));
}
static INLINE void subtract_average_avx2(int16_t *pred_buf, int width,
int height, int round_offset,
int num_pel_log2) {
const __m256i zeros = _mm256_setzero_si256();
__m256i *row = (__m256i *)pred_buf;
const __m256i *const end = row + height * CFL_BUF_LINE_I256;
const int step = CFL_BUF_LINE_I256 * (1 + (width == 8) + 3 * (width == 4));
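// With CFL_BUF_LINE_I256 == 2, step advances the pointer by 4 buffer rows per
// iteration when width == 4, 2 rows when width == 8 and 1 row otherwise, since
// one 256-bit load already covers 16 int16 samples.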
union {
__m256i v;
int32_t i32[8];
} sum;
sum.v = zeros;
do {
if (width == 4) {
__m256i l0 = _mm256_loadu_si256(row);
__m256i l1 = _mm256_loadu_si256(row + CFL_BUF_LINE_I256);
__m256i l2 = _mm256_loadu_si256(row + 2 * CFL_BUF_LINE_I256);
__m256i l3 = _mm256_loadu_si256(row + 3 * CFL_BUF_LINE_I256);
__m256i t0 = _mm256_add_epi16(l0, l1);
__m256i t1 = _mm256_add_epi16(l2, l3);
sum.v = _mm256_add_epi32(
sum.v, _mm256_add_epi32(_mm256_unpacklo_epi16(t0, zeros),
_mm256_unpacklo_epi16(t1, zeros)));
} else {
__m256i l0;
if (width == 8) {
l0 = _mm256_add_epi16(_mm256_loadu_si256(row),
_mm256_loadu_si256(row + CFL_BUF_LINE_I256));
} else {
l0 = _mm256_loadu_si256(row);
l0 = _mm256_add_epi16(l0, _mm256_permute2x128_si256(l0, l0, 1));
}
sum.v = _mm256_add_epi32(
sum.v, _mm256_add_epi32(_mm256_unpacklo_epi16(l0, zeros),
_mm256_unpackhi_epi16(l0, zeros)));
if (width == 32) {
l0 = _mm256_loadu_si256(row + 1);
l0 = _mm256_add_epi16(l0, _mm256_permute2x128_si256(l0, l0, 1));
sum.v = _mm256_add_epi32(
sum.v, _mm256_add_epi32(_mm256_unpacklo_epi16(l0, zeros),
_mm256_unpackhi_epi16(l0, zeros)));
}
}
} while ((row += step) < end);
sum.v = fill_sum_epi32(sum.v);
__m256i avg_epi16 =
_mm256_set1_epi16((sum.i32[0] + round_offset) >> num_pel_log2);
row = (__m256i *)pred_buf;
do {
_mm256_storeu_si256(row,
_mm256_sub_epi16(_mm256_loadu_si256(row), avg_epi16));
if (width == 32) {
_mm256_storeu_si256(
row + 1, _mm256_sub_epi16(_mm256_loadu_si256(row + 1), avg_epi16));
}
} while ((row += CFL_BUF_LINE_I256) < end);
}
CFL_SUB_AVG_FN(avx2)
......@@ -13,52 +13,71 @@
#include "./av1_rtcd.h"
#include "av1/common/cfl.h"
#define INT16_IN_M128I (8)
static INLINE __m128i fill_sum_epi32(__m128i l0) {
l0 = _mm_add_epi32(l0, _mm_shuffle_epi32(l0, _MM_SHUFFLE(1, 0, 3, 2)));
return _mm_add_epi32(l0, _mm_shuffle_epi32(l0, _MM_SHUFFLE(2, 3, 0, 1)));
}
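// fill_sum_epi32 is a horizontal reduction: for 32-bit lanes {a, b, c, d},
// the first add yields {a+c, b+d, a+c, b+d} and the second leaves a+b+c+d in
// every lane.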
#define TWO_BUFFER_LINES (64)
static INLINE void subtract_average_sse2(int16_t *pred_buf, int width,
int height, int round_offset,
int num_pel_log2) {
const __m128i zeros = _mm_setzero_si128();
const __m128i round_offset_epi32 = _mm_set1_epi32(round_offset);
__m128i *row = (__m128i *)pred_buf;
const __m128i *const end = row + height * CFL_BUF_LINE_I128;
const int step = CFL_BUF_LINE_I128 * (1 + (width == 8) + 3 * (width == 4));
/**
* Subtracts avg_q3 from the active part of the CfL prediction buffer.
*
* The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
* active area is specified using width and height.
*
* Note: We don't need to worry about going over the active area, as long as we
* stay inside the CfL prediction buffer.
*/
void av1_cfl_subtract_sse2(int16_t *pred_buf_q3, int width, int height,
int16_t avg_q3) {
const __m128i avg_x16 = _mm_set1_epi16(avg_q3);
__m128i sum = zeros;
do {
__m128i l0;
if (width == 4) {
l0 = _mm_add_epi16(_mm_loadl_epi64(row),
_mm_loadl_epi64(row + CFL_BUF_LINE_I128));
__m128i l1 = _mm_add_epi16(_mm_loadl_epi64(row + 2 * CFL_BUF_LINE_I128),
_mm_loadl_epi64(row + 3 * CFL_BUF_LINE_I128));
sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
_mm_unpacklo_epi16(l1, zeros)));
} else {
if (width == 8) {
l0 = _mm_add_epi16(_mm_loadu_si128(row),
_mm_loadu_si128(row + CFL_BUF_LINE_I128));
} else {
l0 = _mm_add_epi16(_mm_loadu_si128(row), _mm_loadu_si128(row + 1));
}
sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
_mm_unpackhi_epi16(l0, zeros)));
if (width == 32) {
l0 = _mm_add_epi16(_mm_loadu_si128(row + 2), _mm_loadu_si128(row + 3));
sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
_mm_unpackhi_epi16(l0, zeros)));
}
}
} while ((row += step) < end);
// Eight int16 values fit in one __m128i register. If this is enough to do the
// entire row, the next value is in the next row, otherwise we move to the
// next eight values.
// width next
// 4 32
// 8 32
// 16 8
// 32 8
const int next = CFL_BUF_LINE >> (2 * (width > INT16_IN_M128I));
sum = fill_sum_epi32(sum);
// If next was in the next row (next == 32), then we need to jump 2 rows
// (stride == 64). Otherwise, if width is 16 we move to the next row, if width
// is 32 we move 16 values.
// width stride
// 4 64
// 8 64
// 16 32
// 32 16
const int stride = TWO_BUFFER_LINES >> (width >> 4);
__m128i avg_epi16 =
_mm_srli_epi32(_mm_add_epi32(sum, round_offset_epi32), num_pel_log2);
avg_epi16 = _mm_packs_epi32(avg_epi16, avg_epi16);
const int16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
row = (__m128i *)pred_buf;
do {
__m128i val_x16 = _mm_loadu_si128((__m128i *)pred_buf_q3);
__m128i next_val_x16 = _mm_loadu_si128((__m128i *)(pred_buf_q3 + next));
_mm_storeu_si128((__m128i *)pred_buf_q3, _mm_sub_epi16(val_x16, avg_x16));
_mm_storeu_si128((__m128i *)(pred_buf_q3 + next),
_mm_sub_epi16(next_val_x16, avg_x16));
} while ((pred_buf_q3 += stride) < end);
if (width == 4) {
_mm_storel_epi64(row, _mm_sub_epi16(_mm_loadl_epi64(row), avg_epi16));
} else {
_mm_storeu_si128(row, _mm_sub_epi16(_mm_loadu_si128(row), avg_epi16));
if (width > 8) {
_mm_storeu_si128(row + 1,
_mm_sub_epi16(_mm_loadu_si128(row + 1), avg_epi16));
if (width == 32) {
_mm_storeu_si128(row + 2,
_mm_sub_epi16(_mm_loadu_si128(row + 2), avg_epi16));
_mm_storeu_si128(row + 3,
_mm_sub_epi16(_mm_loadu_si128(row + 3), avg_epi16));
}
}
}
} while ((row += CFL_BUF_LINE_I128) < end);
}
CFL_SUB_AVG_FN(sse2)
......@@ -14,12 +14,13 @@
#include "./av1_rtcd.h"
#include "test/util.h"
#include "test/acm_random.h"
#include "av1/common/cfl.h"
using std::tr1::make_tuple;
using libaom_test::ACMRandom;
#define NUM_ITERATIONS (10)
#define NUM_ITERATIONS (100)
#define NUM_ITERATIONS_SPEED (INT16_MAX)
#define ALL_CFL_SIZES(function) \
......@@ -35,24 +36,23 @@ using libaom_test::ACMRandom;
make_tuple(16, 8, &function), make_tuple(8, 16, &function), \
make_tuple(16, 16, &function)
#define ALL_CFL_TX_SIZES(function) \
make_tuple(TX_4X4, &function), make_tuple(TX_4X8, &function), \
make_tuple(TX_8X4, &function), make_tuple(TX_8X8, &function), \
make_tuple(TX_8X16, &function), make_tuple(TX_16X8, &function), \
make_tuple(TX_16X16, &function), make_tuple(TX_16X32, &function), \
#define ALL_CFL_TX_SIZES(function) \
make_tuple(TX_4X4, &function), make_tuple(TX_4X8, &function), \
make_tuple(TX_4X16, &function), make_tuple(TX_8X4, &function), \
make_tuple(TX_8X8, &function), make_tuple(TX_8X16, &function), \
make_tuple(TX_8X32, &function), make_tuple(TX_16X4, &function), \
make_tuple(TX_16X8, &function), make_tuple(TX_16X16, &function), \
make_tuple(TX_16X32, &function), make_tuple(TX_32X8, &function), \
make_tuple(TX_32X16, &function), make_tuple(TX_32X32, &function)
namespace {
typedef void (*subtract_fn)(int16_t *pred_buf_q3, int width, int height,
int16_t avg_q3);
typedef cfl_subsample_lbd_fn (*get_subsample_fn)(int width, int height);
typedef cfl_predict_lbd_fn (*get_predict_fn)(TX_SIZE tx_size);
typedef cfl_predict_hbd_fn (*get_predict_fn_hbd)(TX_SIZE tx_size);
typedef std::tr1::tuple<int, int, subtract_fn> subtract_param;
typedef cfl_subtract_average_fn (*sub_avg_fn)(TX_SIZE tx_size);
typedef std::tr1::tuple<int, int, get_subsample_fn> subsample_param;
......@@ -60,6 +60,8 @@ typedef std::tr1::tuple<TX_SIZE, get_predict_fn> predict_param;
typedef std::tr1::tuple<TX_SIZE, get_predict_fn_hbd> predict_param_hbd;
typedef std::tr1::tuple<TX_SIZE, sub_avg_fn> sub_avg_param;
static void assertFaster(int ref_elapsed_time, int elapsed_time) {
EXPECT_GT(ref_elapsed_time, elapsed_time)
<< "Error: CFLSubtractSpeedTest, SIMD slower than C." << std::endl
......@@ -77,23 +79,27 @@ static void printSpeed(int ref_elapsed_time, int elapsed_time, int width,
<< std::endl;
}
class CFLSubtractTest : public ::testing::TestWithParam<subtract_param> {
class CFLSubAvgTest : public ::testing::TestWithParam<sub_avg_param> {
public:
virtual ~CFLSubtractTest() {}
virtual void SetUp() { subtract = GET_PARAM(2); }
virtual ~CFLSubAvgTest() {}
virtual void SetUp() { sub_avg = GET_PARAM(1); }
protected:
int Width() const { return GET_PARAM(0); }
int Height() const { return GET_PARAM(1); }
int16_t pred_buf_data[CFL_BUF_SQUARE];
int16_t pred_buf_data_ref[CFL_BUF_SQUARE];
subtract_fn subtract;
void init(int width, int height) {
int k = 0;
int Width() const { return tx_size_wide[GET_PARAM(0)]; }
int Height() const { return tx_size_high[GET_PARAM(0)]; }
TX_SIZE Tx_size() const { return GET_PARAM(0); }
sub_avg_fn sub_avg;
int16_t data[CFL_BUF_SQUARE];
int16_t data_ref[CFL_BUF_SQUARE];
void init() {
const int width = Width();
const int height = Height();
ACMRandom rnd(ACMRandom::DeterministicSeed());
for (int j = 0; j < height; j++) {
for (int i = 0; i < width; i++) {
pred_buf_data[j * CFL_BUF_LINE + i] = k;
pred_buf_data_ref[j * CFL_BUF_LINE + i] = k++;
const int16_t d = rnd.Rand15Signed();
data[j * CFL_BUF_LINE + i] = d;
data_ref[j * CFL_BUF_LINE + i] = d;
}
}
}
......@@ -223,49 +229,49 @@ class CFLPredictHBDTest : public ::testing::TestWithParam<predict_param_hbd> {
}
};
TEST_P(CFLSubtractTest, SubtractTest) {
const int width = Width();
const int height = Height();
ACMRandom rnd(ACMRandom::DeterministicSeed());
TEST_P(CFLSubAvgTest, SubAvgTest) {
const TX_SIZE tx_size = Tx_size();
const int width = tx_size_wide[tx_size];
const int height = tx_size_high[tx_size];
const cfl_subtract_average_fn ref_sub = get_subtract_average_fn_c(tx_size);
const cfl_subtract_average_fn sub = sub_avg(tx_size);
for (int it = 0; it < NUM_ITERATIONS; it++) {
init(width, height);
int16_t k = rnd.Rand15Signed();
subtract(pred_buf_data, width, height, k);
av1_cfl_subtract_c(pred_buf_data_ref, width, height, k);
init();
sub(data);
ref_sub(data_ref);
for (int j = 0; j < height; j++) {
for (int i = 0; i < width; i++) {
ASSERT_EQ(pred_buf_data[j * CFL_BUF_LINE + i],
pred_buf_data_ref[j * CFL_BUF_LINE + i]);
ASSERT_EQ(pred_buf_data[j * CFL_BUF_LINE + i], -k);
k--;
ASSERT_EQ(data_ref[j * CFL_BUF_LINE + i], data[j * CFL_BUF_LINE + i]);
}
}
}
}
TEST_P(CFLSubtractTest, DISABLED_SubtractSpeedTest) {
TEST_P(CFLSubAvgTest, DISABLED_SubAvgSpeedTest) {
const int width = Width();
const int height = Height();
const TX_SIZE tx_size = Tx_size();
const cfl_subtract_average_fn ref_sub = get_subtract_average_fn_c(tx_size);
const cfl_subtract_average_fn sub = sub_avg(tx_size);
aom_usec_timer ref_timer;
aom_usec_timer timer;
init(width, height);
init();
aom_usec_timer_start(&ref_timer);
for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) {
av1_cfl_subtract_c(pred_buf_data_ref, width, height, k);
ref_sub(data_ref);
}
aom_usec_timer_mark(&ref_timer);
const int ref_elapsed_time = (int)aom_usec_timer_elapsed(&ref_timer);
int ref_elapsed_time = (int)aom_usec_timer_elapsed(&ref_timer);
aom_usec_timer_start(&timer);
for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) {
subtract(pred_buf_data, width, height, k);
sub(data);
}
aom_usec_timer_mark(&timer);
const int elapsed_time = (int)aom_usec_timer_elapsed(&timer);
int elapsed_time = (int)aom_usec_timer_elapsed(&timer);
printSpeed(ref_elapsed_time, elapsed_time, width, height);
assertFaster(ref_elapsed_time, elapsed_time);
......@@ -421,15 +427,16 @@ TEST_P(CFLPredictHBDTest, DISABLED_PredictHBDSpeedTest) {
}
#if HAVE_SSE2
const subtract_param subtract_sizes_sse2[] = { ALL_CFL_SIZES(
av1_cfl_subtract_sse2) };
const sub_avg_param sub_avg_sizes_sse2[] = { ALL_CFL_TX_SIZES(
get_subtract_average_fn_sse2) };
INSTANTIATE_TEST_CASE_P(SSE2, CFLSubtractTest,
::testing::ValuesIn(subtract_sizes_sse2));
INSTANTIATE_TEST_CASE_P(SSE2, CFLSubAvgTest,
::testing::ValuesIn(sub_avg_sizes_sse2));
#endif
#if HAVE_SSSE3
const subsample_param subsample_sizes_ssse3[] = { CHROMA_420_CFL_SIZES(
get_subsample_lbd_fn_ssse3) };
......@@ -450,8 +457,8 @@ INSTANTIATE_TEST_CASE_P(SSSE3, CFLPredictHBDTest,
#endif
#if HAVE_AVX2
const subtract_param subtract_sizes_avx2[] = { ALL_CFL_SIZES(
av1_cfl_subtract_avx2) };
const sub_avg_param sub_avg_sizes_avx2[] = { ALL_CFL_TX_SIZES(
get_subtract_average_fn_avx2) };
const subsample_param subsample_sizes_avx2[] = { CHROMA_420_CFL_SIZES(
get_subsample_lbd_fn_avx2) };
......@@ -462,8 +469,8 @@ const predict_param predict_sizes_avx2[] = { ALL_CFL_TX_SIZES(
const predict_param_hbd predict_sizes_hbd_avx2[] = { ALL_CFL_TX_SIZES(
get_predict_hbd_fn_avx2) };
INSTANTIATE_TEST_CASE_P(AVX2, CFLSubtractTest,
::testing::ValuesIn(subtract_sizes_avx2));
INSTANTIATE_TEST_CASE_P(AVX2, CFLSubAvgTest,
::testing::ValuesIn(sub_avg_sizes_avx2));
INSTANTIATE_TEST_CASE_P(AVX2, CFLSubsampleTest,
::testing::ValuesIn(subsample_sizes_avx2));
......