Commit 16f38c2c authored by David Michael Barr's avatar David Michael Barr Committed by Luc Trudeau

[CFL] SSSE3/AVX2 versions of cfl_build_prediction_lbd

Includes unit tests for conformance and speed.

SSSE3/CFLPredictTest:
4x4: C time = 2063 us, SIMD time = 313 us (~6.6x)
8x8: C time = 6656 us, SIMD time = 493 us (~14x)
16x16: C time = 24970 us, SIMD time = 1327 us (~19x)
32x32: C time = 59020 us, SIMD time = 5178 us (~11x)

AVX2/CFLPredictTest:
4x4: C time = 2052 us, SIMD time = 333 us (~6.2x)
8x8: C time = 6712 us, SIMD time = 513 us (~13x)
16x16: C time = 25292 us, SIMD time = 1023 us (~25x)
32x32: C time = 58994 us, SIMD time = 2828 us (~21x)

Change-Id: I08690a548be981ff10e184de468b9e0e691ee812
parent f340fece
......@@ -587,7 +587,10 @@ if (aom_config("CONFIG_CFL") eq "yes") {
specialize qw/av1_cfl_subtract sse2 avx2/;
add_proto qw/cfl_subsample_lbd_fn get_subsample_lbd_fn/, "int sub_x, int sub_y";
specialize qw/get_subsample_lbd_fn ssse3 avx2/
specialize qw/get_subsample_lbd_fn ssse3 avx2/;
add_proto qw/cfl_predict_lbd_fn get_predict_lbd_fn/, "TX_SIZE tx_size";
specialize qw/get_predict_lbd_fn ssse3 avx2/;
}
1;
......@@ -171,9 +171,10 @@ static INLINE int cfl_idx_to_alpha(int alpha_idx, int joint_sign,
}
static void cfl_build_prediction_lbd(const int16_t *pred_buf_q3, uint8_t *dst,
int dst_stride, int width, int height,
int dst_stride, TX_SIZE tx_size,
int alpha_q3) {
assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE);
const int height = tx_size_high[tx_size];
const int width = tx_size_wide[tx_size];
for (int j = 0; j < height; j++) {
for (int i = 0; i < width; i++) {
dst[i] =
......@@ -187,7 +188,6 @@ static void cfl_build_prediction_lbd(const int16_t *pred_buf_q3, uint8_t *dst,
static void cfl_build_prediction_hbd(const int16_t *pred_buf_q3, uint16_t *dst,
int dst_stride, int width, int height,
int alpha_q3, int bit_depth) {
assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE);
for (int j = 0; j < height; j++) {
for (int i = 0; i < width; i++) {
dst[i] = clip_pixel_highbd(
......@@ -229,6 +229,11 @@ static void cfl_compute_parameters(MACROBLOCKD *const xd, TX_SIZE tx_size) {
cfl->are_parameters_computed = 1;
}
cfl_predict_lbd_fn get_predict_lbd_fn_c(TX_SIZE tx_size) {
(void)tx_size;
return cfl_build_prediction_lbd;
}
void cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
TX_SIZE tx_size, int plane) {
CFL_CTX *const cfl = &xd->cfl;
......@@ -239,16 +244,17 @@ void cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
const int alpha_q3 =
cfl_idx_to_alpha(mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, plane - 1);
const int width = tx_size_wide[tx_size];
const int height = tx_size_high[tx_size];
assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE);
if (get_bitdepth_data_path_index(xd)) {
uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst);
cfl_build_prediction_hbd(cfl->pred_buf_q3, dst_16, dst_stride,
tx_size_wide[tx_size], tx_size_high[tx_size],
alpha_q3, xd->bd);
cfl_build_prediction_hbd(cfl->pred_buf_q3, dst_16, dst_stride, width,
height, alpha_q3, xd->bd);
return;
}
cfl_build_prediction_lbd(cfl->pred_buf_q3, dst, dst_stride,
tx_size_wide[tx_size], tx_size_high[tx_size],
alpha_q3);
get_predict_lbd_fn(tx_size)(cfl->pred_buf_q3, dst, dst_stride, tx_size,
alpha_q3);
}
static void cfl_luma_subsampling_420_lbd(const uint8_t *input, int input_stride,
......
......@@ -17,6 +17,10 @@
typedef void (*cfl_subsample_lbd_fn)(const uint8_t *input, int input_stride,
int16_t *output_q3, int width, int height);
typedef void (*cfl_predict_lbd_fn)(const int16_t *pred_buf_q3, uint8_t *dst,
int dst_stride, TX_SIZE tx_size,
int alpha_q3);
static INLINE int is_cfl_allowed(const MB_MODE_INFO *mbmi) {
const BLOCK_SIZE bsize = mbmi->sb_type;
assert(bsize < BLOCK_SIZES_ALL);
......
......@@ -99,3 +99,77 @@ cfl_subsample_lbd_fn get_subsample_lbd_fn_avx2(int sub_x, int sub_y) {
// index the function pointer array out of bounds.
return subsample_lbd[sub_y & 1][sub_x & 1];
}
static INLINE __m256i predict_lbd_unclipped(const __m256i *input,
__m256i alpha_q12,
__m256i alpha_sign, __m256i dc_q0) {
__m256i ac_q3 = _mm256_loadu_si256(input);
__m256i ac_sign = _mm256_sign_epi16(alpha_sign, ac_q3);
__m256i scaled_luma_q0 =
_mm256_mulhrs_epi16(_mm256_abs_epi16(ac_q3), alpha_q12);
scaled_luma_q0 = _mm256_sign_epi16(scaled_luma_q0, ac_sign);
return _mm256_add_epi16(scaled_luma_q0, dc_q0);
}
static INLINE void cfl_predict_lbd_x(const int16_t *pred_buf_q3, uint8_t *dst,
int dst_stride, TX_SIZE tx_size,
int alpha_q3, int width) {
const int16_t *row_end = pred_buf_q3 + tx_size_high[tx_size] * CFL_BUF_LINE;
const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3);
const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9);
const __m256i dc_q0 = _mm256_set1_epi16(*dst);
do {
__m256i res = predict_lbd_unclipped((__m256i *)pred_buf_q3, alpha_q12,
alpha_sign, dc_q0);
__m256i next = res;
if (width == 32)
next = predict_lbd_unclipped((__m256i *)(pred_buf_q3 + 16), alpha_q12,
alpha_sign, dc_q0);
res = _mm256_packus_epi16(res, next);
if (width == 4) {
*(int32_t *)dst = _mm256_extract_epi32(res, 0);
} else if (width == 8) {
#ifdef __x86_64__
*(int64_t *)dst = _mm256_extract_epi64(res, 0);
#else
_mm_storel_epi64((__m128i *)dst, _mm256_castsi256_si128(res));
#endif
} else {
res = _mm256_permute4x64_epi64(res, _MM_SHUFFLE(3, 1, 2, 0));
if (width == 16)
_mm_store_si128((__m128i *)dst, _mm256_castsi256_si128(res));
else
_mm256_storeu_si256((__m256i *)dst, res);
}
dst += dst_stride;
pred_buf_q3 += CFL_BUF_LINE;
} while (pred_buf_q3 < row_end);
}
static void cfl_predict_lbd_4(const int16_t *pred_buf_q3, uint8_t *dst,
int dst_stride, TX_SIZE tx_size, int alpha_q3) {
cfl_predict_lbd_x(pred_buf_q3, dst, dst_stride, tx_size, alpha_q3, 4);
}
static void cfl_predict_lbd_8(const int16_t *pred_buf_q3, uint8_t *dst,
int dst_stride, TX_SIZE tx_size, int alpha_q3) {
cfl_predict_lbd_x(pred_buf_q3, dst, dst_stride, tx_size, alpha_q3, 8);
}
static void cfl_predict_lbd_16(const int16_t *pred_buf_q3, uint8_t *dst,
int dst_stride, TX_SIZE tx_size, int alpha_q3) {
cfl_predict_lbd_x(pred_buf_q3, dst, dst_stride, tx_size, alpha_q3, 16);
}
static void cfl_predict_lbd_32(const int16_t *pred_buf_q3, uint8_t *dst,
int dst_stride, TX_SIZE tx_size, int alpha_q3) {
cfl_predict_lbd_x(pred_buf_q3, dst, dst_stride, tx_size, alpha_q3, 32);
}
cfl_predict_lbd_fn get_predict_lbd_fn_avx2(TX_SIZE tx_size) {
static const cfl_predict_lbd_fn predict_lbd[4] = {
cfl_predict_lbd_4, cfl_predict_lbd_8, cfl_predict_lbd_16, cfl_predict_lbd_32
};
const int width_log2 = tx_size_wide_log2[tx_size];
return predict_lbd[(width_log2 - 2) & 3];
}
......@@ -89,3 +89,76 @@ cfl_subsample_lbd_fn get_subsample_lbd_fn_ssse3(int sub_x, int sub_y) {
// index the function pointer array out of bounds.
return subsample_lbd[sub_y & 1][sub_x & 1];
}
static INLINE __m128i predict_lbd_unclipped(const __m128i *input,
__m128i alpha_q12,
__m128i alpha_sign, __m128i dc_q0) {
__m128i ac_q3 = _mm_loadu_si128(input);
__m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3);
__m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12);
scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign);
return _mm_add_epi16(scaled_luma_q0, dc_q0);
}
static INLINE void cfl_predict_lbd_x(const int16_t *pred_buf_q3, uint8_t *dst,
int dst_stride, TX_SIZE tx_size,
int alpha_q3, int width) {
uint8_t *row_end = dst + tx_size_high[tx_size] * dst_stride;
const __m128i alpha_sign = _mm_set1_epi16(alpha_q3);
const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);
const __m128i dc_q0 = _mm_set1_epi16(*dst);
do {
__m128i res = predict_lbd_unclipped((__m128i *)(pred_buf_q3), alpha_q12,
alpha_sign, dc_q0);
if (width < 16) {
res = _mm_packus_epi16(res, res);
if (width == 4)
*(uint32_t *)dst = _mm_cvtsi128_si32(res);
else
_mm_storel_epi64((__m128i *)dst, res);
} else {
__m128i next = predict_lbd_unclipped((__m128i *)(pred_buf_q3 + 8),
alpha_q12, alpha_sign, dc_q0);
res = _mm_packus_epi16(res, next);
_mm_storeu_si128((__m128i *)dst, res);
if (width == 32) {
res = predict_lbd_unclipped((__m128i *)(pred_buf_q3 + 16), alpha_q12,
alpha_sign, dc_q0);
next = predict_lbd_unclipped((__m128i *)(pred_buf_q3 + 24), alpha_q12,
alpha_sign, dc_q0);
res = _mm_packus_epi16(res, next);
_mm_storeu_si128((__m128i *)(dst + 16), res);
}
}
dst += dst_stride;
pred_buf_q3 += CFL_BUF_LINE;
} while (dst < row_end);
}
static void cfl_predict_lbd_4(const int16_t *pred_buf_q3, uint8_t *dst,
int dst_stride, TX_SIZE tx_size, int alpha_q3) {
cfl_predict_lbd_x(pred_buf_q3, dst, dst_stride, tx_size, alpha_q3, 4);
}
static void cfl_predict_lbd_8(const int16_t *pred_buf_q3, uint8_t *dst,
int dst_stride, TX_SIZE tx_size, int alpha_q3) {
cfl_predict_lbd_x(pred_buf_q3, dst, dst_stride, tx_size, alpha_q3, 8);
}
static void cfl_predict_lbd_16(const int16_t *pred_buf_q3, uint8_t *dst,
int dst_stride, TX_SIZE tx_size, int alpha_q3) {
cfl_predict_lbd_x(pred_buf_q3, dst, dst_stride, tx_size, alpha_q3, 16);
}
static void cfl_predict_lbd_32(const int16_t *pred_buf_q3, uint8_t *dst,
int dst_stride, TX_SIZE tx_size, int alpha_q3) {
cfl_predict_lbd_x(pred_buf_q3, dst, dst_stride, tx_size, alpha_q3, 32);
}
cfl_predict_lbd_fn get_predict_lbd_fn_ssse3(TX_SIZE tx_size) {
static const cfl_predict_lbd_fn predict_lbd[4] = {
cfl_predict_lbd_4, cfl_predict_lbd_8, cfl_predict_lbd_16, cfl_predict_lbd_32
};
const int width_log2 = tx_size_wide_log2[tx_size];
return predict_lbd[(width_log2 - 2) & 3];
}
......@@ -35,16 +35,27 @@ using libaom_test::ACMRandom;
make_tuple(16, 8, &function), make_tuple(8, 16, &function), \
make_tuple(16, 16, &function)
#define ALL_CFL_TX_SIZES(function) \
make_tuple(TX_4X4, &function), make_tuple(TX_4X8, &function), \
make_tuple(TX_8X4, &function), make_tuple(TX_8X8, &function), \
make_tuple(TX_8X16, &function), make_tuple(TX_16X8, &function), \
make_tuple(TX_16X16, &function), make_tuple(TX_16X32, &function), \
make_tuple(TX_32X16, &function), make_tuple(TX_32X32, &function)
namespace {
typedef void (*subtract_fn)(int16_t *pred_buf_q3, int width, int height,
int16_t avg_q3);
typedef cfl_subsample_lbd_fn (*get_subsample_fn)(int width, int height);
typedef cfl_predict_lbd_fn (*get_predict_fn)(TX_SIZE tx_size);
typedef std::tr1::tuple<int, int, subtract_fn> subtract_param;
typedef std::tr1::tuple<int, int, get_subsample_fn> subsample_param;
typedef std::tr1::tuple<TX_SIZE, get_predict_fn> predict_param;
static void assertFaster(int ref_elapsed_time, int elapsed_time) {
EXPECT_GT(ref_elapsed_time, elapsed_time)
<< "Error: CFLSubtractSpeedTest, SIMD slower than C." << std::endl
......@@ -109,6 +120,37 @@ class CFLSubsampleTest : public ::testing::TestWithParam<subsample_param> {
}
};
class CFLPredictTest : public ::testing::TestWithParam<predict_param> {
public:
virtual ~CFLPredictTest() {}
virtual void SetUp() { predict = GET_PARAM(1); }
protected:
int Width() const { return tx_size_wide[GET_PARAM(0)]; }
int Height() const { return tx_size_high[GET_PARAM(0)]; }
TX_SIZE Tx_size() const { return GET_PARAM(0); }
DECLARE_ALIGNED(32, uint8_t, chroma_pels_ref[CFL_BUF_SQUARE]);
DECLARE_ALIGNED(32, int16_t, sub_luma_pels_ref[CFL_BUF_SQUARE]);
DECLARE_ALIGNED(32, uint8_t, chroma_pels[CFL_BUF_SQUARE]);
DECLARE_ALIGNED(32, int16_t, sub_luma_pels[CFL_BUF_SQUARE]);
get_predict_fn predict;
int alpha_q3;
uint8_t dc;
void init(int width, int height) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
alpha_q3 = rnd(33) - 16;
dc = rnd.Rand8();
for (int j = 0; j < height; j++) {
for (int i = 0; i < width; i++) {
chroma_pels[j * CFL_BUF_LINE + i] = dc;
chroma_pels_ref[j * CFL_BUF_LINE + i] = dc;
sub_luma_pels_ref[j * CFL_BUF_LINE + i] =
sub_luma_pels[j * CFL_BUF_LINE + i] = rnd.Rand8() - 128;
}
}
}
};
TEST_P(CFLSubtractTest, SubtractTest) {
const int width = Width();
const int height = Height();
......@@ -203,6 +245,57 @@ TEST_P(CFLSubsampleTest, DISABLED_SubsampleSpeedTest) {
assertFaster(ref_elapsed_time, elapsed_time);
}
TEST_P(CFLPredictTest, PredictTest) {
const int width = Width();
const int height = Height();
const TX_SIZE tx_size = Tx_size();
for (int it = 0; it < NUM_ITERATIONS; it++) {
init(width, height);
predict(tx_size)(sub_luma_pels, chroma_pels, CFL_BUF_LINE, tx_size,
alpha_q3);
get_predict_lbd_fn_c(tx_size)(sub_luma_pels_ref, chroma_pels_ref,
CFL_BUF_LINE, tx_size, alpha_q3);
for (int j = 0; j < height; j++) {
for (int i = 0; i < width; i++) {
ASSERT_EQ(chroma_pels_ref[j * CFL_BUF_LINE + i],
chroma_pels[j * CFL_BUF_LINE + i]);
}
}
}
}
TEST_P(CFLPredictTest, DISABLED_PredictSpeedTest) {
const int width = Width();
const int height = Height();
const TX_SIZE tx_size = Tx_size();
aom_usec_timer ref_timer;
aom_usec_timer timer;
init(width, height);
cfl_predict_lbd_fn predict_impl = get_predict_lbd_fn_c(tx_size);
aom_usec_timer_start(&ref_timer);
for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) {
predict_impl(sub_luma_pels_ref, chroma_pels_ref, CFL_BUF_LINE, tx_size,
alpha_q3);
}
aom_usec_timer_mark(&ref_timer);
int ref_elapsed_time = (int)aom_usec_timer_elapsed(&ref_timer);
predict_impl = predict(tx_size);
aom_usec_timer_start(&timer);
for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) {
predict_impl(sub_luma_pels, chroma_pels, CFL_BUF_LINE, tx_size, alpha_q3);
}
aom_usec_timer_mark(&timer);
int elapsed_time = (int)aom_usec_timer_elapsed(&timer);
printSpeed(ref_elapsed_time, elapsed_time, width, height);
assertFaster(ref_elapsed_time, elapsed_time);
}
#if HAVE_SSE2
const subtract_param subtract_sizes_sse2[] = { ALL_CFL_SIZES(
av1_cfl_subtract_sse2) };
......@@ -216,9 +309,16 @@ INSTANTIATE_TEST_CASE_P(SSE2, CFLSubtractTest,
const subsample_param subsample_sizes_ssse3[] = { CHROMA_420_CFL_SIZES(
get_subsample_lbd_fn_ssse3) };
const predict_param predict_sizes_ssse3[] = { ALL_CFL_TX_SIZES(
get_predict_lbd_fn_ssse3) };
INSTANTIATE_TEST_CASE_P(SSSE3, CFLSubsampleTest,
::testing::ValuesIn(subsample_sizes_ssse3));
INSTANTIATE_TEST_CASE_P(SSSE3, CFLPredictTest,
::testing::ValuesIn(predict_sizes_ssse3));
#endif
#if HAVE_AVX2
const subtract_param subtract_sizes_avx2[] = { ALL_CFL_SIZES(
av1_cfl_subtract_avx2) };
......@@ -226,10 +326,16 @@ const subtract_param subtract_sizes_avx2[] = { ALL_CFL_SIZES(
const subsample_param subsample_sizes_avx2[] = { CHROMA_420_CFL_SIZES(
get_subsample_lbd_fn_avx2) };
const predict_param predict_sizes_avx2[] = { ALL_CFL_TX_SIZES(
get_predict_lbd_fn_avx2) };
INSTANTIATE_TEST_CASE_P(AVX2, CFLSubtractTest,
::testing::ValuesIn(subtract_sizes_avx2));
INSTANTIATE_TEST_CASE_P(AVX2, CFLSubsampleTest,
::testing::ValuesIn(subsample_sizes_avx2));
INSTANTIATE_TEST_CASE_P(AVX2, CFLPredictTest,
::testing::ValuesIn(predict_sizes_avx2));
#endif
} // namespace
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment