Commit 33ba1fe5, authored by Peng Bin and committed by Bin Peng

Add aom_comp_mask_<upsampled>pred_ssse3

1) For encoder speed, overall ~1% faster with no impact on coding performance.
2) aom_comp_mask_pred_ssse3 is 3.5x - 6x faster than aom_comp_mask_pred_c
3) aom_comp_mask_upsampled_pred_ssse3 is 1.5x - 3x faster than
aom_comp_mask_upsampled_pred_c; for the special case where subpel_x ==
subpel_y == 0, the optimized version achieves a 4x - 7x speedup.

Unit tests for both functions have been added.

Change-Id: Ib498317975e0dbd9cdcf61be327b640dfac9a7e5
parent 1694a4ff
......@@ -1783,7 +1783,9 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/void aom_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
specialize qw/aom_comp_mask_pred ssse3/;
add_proto qw/void aom_comp_mask_upsampled_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
specialize qw/aom_comp_mask_upsampled_pred ssse3/;
add_proto qw/void aom_highbd_comp_mask_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
add_proto qw/void aom_highbd_comp_mask_upsampled_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, int bd";
......
......@@ -1039,3 +1039,74 @@ static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride,
*sum_ = _mm_cvtsi128_si32(sum);
*sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
}
// SSSE3 implementation of aom_comp_mask_pred: blends two 8-bit predictors
// under a per-pixel mask and writes the result to comp_pred.
//
// For every pixel the output is
//   round((m * src0 + (AOM_BLEND_A64_MAX_ALPHA - m) * src1) >>
//         AOM_BLEND_A64_ROUND_BITS)
// where m is the mask value and (src0, src1) is (ref, pred) when invert_mask
// is zero and (pred, ref) otherwise.
//
//   comp_pred    output block, written with a row stride of `width`.
//   pred         first predictor, read with a row stride of `width`.
//   width        block width; asserted to be a multiple of 8.
//   height       block height; asserted to be a multiple of 2.
//   ref          second predictor, read with a row stride of `ref_stride`.
//   mask         blend weights, read with a row stride of `mask_stride`.
//                NOTE(review): presumably every weight is in
//                [0, AOM_BLEND_A64_MAX_ALPHA]; confirm against callers.
//   invert_mask  non-zero swaps the roles of pred and ref.
//
// This function is an RTCD dispatch target with external linkage, so it is
// deliberately a plain (non-INLINE) definition: an `inline` definition only
// provides an external symbol when a non-inline declaration happens to be
// visible in the same translation unit.
void aom_comp_mask_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
                              int width, int height, const uint8_t *ref,
                              int ref_stride, const uint8_t *mask,
                              int mask_stride, int invert_mask) {
  // src0 receives the mask weight, src1 the complementary weight.
  const uint8_t *src0 = invert_mask ? pred : ref;
  const uint8_t *src1 = invert_mask ? ref : pred;
  const int stride0 = invert_mask ? width : ref_stride;
  const int stride1 = invert_mask ? ref_stride : width;
  const __m128i alpha_max = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
  // _mm_mulhrs_epi16(x, 1 << (15 - R)) evaluates (x * 2^(15-R) + 2^14) >> 15,
  // i.e. a rounding right shift of x by R bits.
  const __m128i round_offset =
      _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
  assert(height % 2 == 0);
  assert(width % 8 == 0);
  int i = 0, j = 0;
  // Each outer iteration handles a pair of rows (A and B); each inner
  // iteration handles 8 pixels from both rows in one 128-bit register.
  do {
    // TODO(bingpengsmail@gmail.com): add 16 pixel version
    j = 0;
    do {
      // First row of the pair (A): 8 pixels from each source and 8 weights.
      const __m128i sA0 = _mm_loadl_epi64((const __m128i *)(src0 + j));
      const __m128i sA1 = _mm_loadl_epi64((const __m128i *)(src1 + j));
      const __m128i aA = _mm_loadl_epi64((const __m128i *)(mask + j));
      // Second row of the pair (B).
      const __m128i sB0 =
          _mm_loadl_epi64((const __m128i *)(src0 + j + stride0));
      const __m128i sB1 =
          _mm_loadl_epi64((const __m128i *)(src1 + j + stride1));
      // a = [ row A weights (low 8 bytes) | row B weights (high 8 bytes) ].
      const __m128i a = _mm_castps_si128(_mm_loadh_pi(
          _mm_castsi128_ps(aA), (const __m64 *)(mask + j + mask_stride)));
      // Interleave so each 16-bit lane holds (src0, src1) for one pixel.
      const __m128i ssA = _mm_unpacklo_epi8(sA0, sA1);
      const __m128i ssB = _mm_unpacklo_epi8(sB0, sB1);
      // Complementary weights, interleaved the same way as (m, max - m).
      const __m128i ma = _mm_sub_epi8(alpha_max, a);
      const __m128i aaA = _mm_unpacklo_epi8(a, ma);
      const __m128i aaB = _mm_unpackhi_epi8(a, ma);
      // _mm_maddubs_epi16 reads its first operand as unsigned bytes and its
      // second as signed bytes, so the pixels go first and the weights
      // second; each lane becomes src0 * m + src1 * (max - m).
      const __m128i blendA = _mm_maddubs_epi16(ssA, aaA);
      const __m128i blendB = _mm_maddubs_epi16(ssB, aaB);
      const __m128i roundA = _mm_mulhrs_epi16(blendA, round_offset);
      const __m128i roundB = _mm_mulhrs_epi16(blendB, round_offset);
      // Pack back to bytes: row A in the low half, row B in the high half.
      const __m128i round = _mm_packus_epi16(roundA, roundB);
      _mm_storel_epi64((__m128i *)(comp_pred + j), round);
      _mm_storeh_pi((__m64 *)(comp_pred + j + width), _mm_castsi128_ps(round));
      j += 8;
    } while (j < width);
    // Advance every pointer by two rows.
    comp_pred += (width << 1);
    src0 += (stride0 << 1);
    src1 += (stride1 << 1);
    mask += (mask_stride << 1);
    i += 2;
  } while (i < height);
}
// SSSE3 variant of aom_comp_mask_upsampled_pred.
//
// With both sub-pel offsets equal to zero, `ref` is mask-blended with `pred`
// directly. Otherwise aom_upsampled_pred() first writes its result into
// comp_pred (row stride `width`), and that intermediate is then blended with
// `pred` in place, with comp_pred serving as both source and destination.
void aom_comp_mask_upsampled_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
                                        int width, int height, int subpel_x_q3,
                                        int subpel_y_q3, const uint8_t *ref,
                                        int ref_stride, const uint8_t *mask,
                                        int mask_stride, int invert_mask) {
  const int has_subpel_offset = (subpel_x_q3 != 0) || (subpel_y_q3 != 0);

  if (!has_subpel_offset) {
    // Fast path: no interpolation needed, blend straight from ref.
    aom_comp_mask_pred_ssse3(comp_pred, pred, width, height, ref, ref_stride,
                             mask, mask_stride, invert_mask);
    return;
  }

  aom_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, ref,
                     ref_stride);
  aom_comp_mask_pred_ssse3(comp_pred, pred, width, height, comp_pred, width,
                           mask, mask_stride, invert_mask);
}
......@@ -232,8 +232,10 @@ static INLINE int is_interinter_compound_used(COMPOUND_TYPE type,
static INLINE int is_any_masked_compound_used(BLOCK_SIZE sb_type) {
COMPOUND_TYPE comp_type;
int i;
if (!is_comp_ref_allowed(sb_type)) return 0;
for (comp_type = 0; comp_type < COMPOUND_TYPES; comp_type++) {
for (i = 0; i < COMPOUND_TYPES; i++) {
comp_type = (COMPOUND_TYPE)i;
if (is_masked_compound_type(comp_type) &&
is_interinter_compound_used(comp_type, sb_type))
return 1;
......@@ -286,8 +288,8 @@ static INLINE MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd,
const int spel_right = spel_left - SUBPEL_SHIFTS;
const int spel_top = (AOM_INTERP_EXTEND + bh) << SUBPEL_BITS;
const int spel_bottom = spel_top - SUBPEL_SHIFTS;
MV clamped_mv = { src_mv->row * (1 << (1 - ss_y)),
src_mv->col * (1 << (1 - ss_x)) };
MV clamped_mv = { (int16_t)(src_mv->row * (1 << (1 - ss_y))),
(int16_t)(src_mv->col * (1 << (1 - ss_x))) };
assert(ss_x <= 1);
assert(ss_y <= 1);
......
/*
* Copyright (c) 2018, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <cstdlib>
#include <new>
#include "./aom_config.h"
#include "./aom_dsp_rtcd.h"
#include "aom/aom_codec.h"
#include "aom/aom_integer.h"
#include "aom_mem/aom_mem.h"
#include "aom_ports/aom_timer.h"
#include "aom_ports/mem.h"
#include "av1/common/reconinter.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "test/util.h"
#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
namespace AV1CompMaskVariance {
// Signature of the masked compound prediction functions exercised below
// (aom_comp_mask_pred_c and the optimized implementation under test).
typedef void (*comp_mask_pred_func)(uint8_t *comp_pred, const uint8_t *pred,
int width, int height, const uint8_t *ref,
int ref_stride, const uint8_t *mask,
int mask_stride, int invert_mask);
// Signature of the upsampled variants (aom_comp_mask_upsampled_pred_c and the
// optimized implementation under test); adds the two sub-pel offsets.
typedef void (*comp_mask_up_pred_func)(uint8_t *comp_pred, const uint8_t *pred,
int width, int height, int subpel_x_q3,
int subpel_y_q3, const uint8_t *ref,
int ref_stride, const uint8_t *mask,
int mask_stride, int invert_mask);
// Test parameter: (function under test, block size). The function pointer is
// carried as an intptr_t so that one tuple type can hold either of the two
// function-pointer types above; each test casts it back before use.
typedef std::tr1::tuple<intptr_t, BLOCK_SIZE> CompMaskPredParam;
// Fixture that compares an optimized masked compound prediction function
// against aom_comp_mask_pred_c on randomly filled buffers.
class AV1CompMaskVarianceTest
: public ::testing::TestWithParam<CompMaskPredParam> {
public:
~AV1CompMaskVarianceTest();
// Allocates and randomizes the buffers below.
void SetUp();
// Frees the buffers allocated by SetUp().
void TearDown();
protected:
// Checks test_impl against the C reference for every wedge mask of bsize,
// with and without mask inversion.
void RunCheckOutput(comp_mask_pred_func test_impl, BLOCK_SIZE bsize);
// Times the C reference and test_impl and prints the per-call cost of each.
void RunSpeedTest(comp_mask_pred_func test_impl, BLOCK_SIZE bsize);
libaom_test::ACMRandom rnd_;
uint8_t *comp_pred1_;  // Output of the C reference.
uint8_t *comp_pred2_;  // Output of the implementation under test.
uint8_t *pred_;        // Random predictor, MAX_SB_SQUARE bytes.
uint8_t *ref_buffer_;  // Owning allocation behind ref_.
uint8_t *ref_;         // ref_buffer_ advanced by 8 * MAX_SB_SIZE bytes.
};
// Nothing to release here: the destructor body is intentionally empty.
AV1CompMaskVarianceTest::~AV1CompMaskVarianceTest() {}
void AV1CompMaskVarianceTest::SetUp() {
rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
av1_init_wedge_masks();
comp_pred1_ = (uint8_t *)aom_calloc(MAX_SB_SQUARE, 1);
comp_pred2_ = (uint8_t *)aom_calloc(MAX_SB_SQUARE, 1);
pred_ = (uint8_t *)aom_malloc(MAX_SB_SQUARE);
ref_buffer_ = (uint8_t *)aom_malloc(MAX_SB_SQUARE + (8 * MAX_SB_SIZE));
ref_ = ref_buffer_ + (8 * MAX_SB_SIZE);
for (int i = 0; i < MAX_SB_SQUARE; ++i) {
pred_[i] = rnd_.Rand8();
}
for (int i = 0; i < MAX_SB_SQUARE + (8 * MAX_SB_SIZE); ++i) {
ref_buffer_[i] = rnd_.Rand8();
}
}
void AV1CompMaskVarianceTest::TearDown() {
aom_free(comp_pred1_);
aom_free(comp_pred2_);
aom_free(pred_);
aom_free(ref_buffer_);
libaom_test::ClearSystemState();
}
// Verifies test_impl against aom_comp_mask_pred_c for block size bsize.
//
// Every wedge mask available for bsize is tried with invert_mask set to 0
// and to 1; the two w x h outputs must match pixel for pixel. The first
// mismatch raises a fatal assertion and ends the check.
void AV1CompMaskVarianceTest::RunCheckOutput(comp_mask_pred_func test_impl,
                                             BLOCK_SIZE bsize) {
  const int w = block_size_wide[bsize];
  const int h = block_size_high[bsize];
  const int num_pixels = w * h;
  const int num_wedges = (1 << get_wedge_bits_lookup(bsize));

  for (int wedge_index = 0; wedge_index != num_wedges; ++wedge_index) {
    const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
    for (int inv = 0; inv <= 1; ++inv) {
      // Reference first, then the implementation under test.
      aom_comp_mask_pred_c(comp_pred1_, pred_, w, h, ref_, MAX_SB_SIZE, mask, w,
                           inv);
      test_impl(comp_pred2_, pred_, w, h, ref_, MAX_SB_SIZE, mask, w, inv);

      // Outputs are packed with a stride of w, so walk them linearly and
      // recover (row, col) only for the diagnostic.
      for (int idx = 0; idx < num_pixels; ++idx) {
        const int row = idx / w;
        const int col = idx % w;
        ASSERT_EQ(comp_pred1_[idx], comp_pred2_[idx])
            << w << "x" << h << " Pixel mismatch at index " << idx << " = ("
            << row << ", " << col << "), wedge " << wedge_index << " inv "
            << inv;
      }
    }
  }
}
// Benchmarks aom_comp_mask_pred_c and test_impl on block size bsize.
//
// Both functions are called 1000000000 / (w + h) times with the middle wedge
// mask and invert_mask == 0. The average time per call in nanoseconds is
// printed for each, followed by the ratio reference / test_impl.
void AV1CompMaskVarianceTest::RunSpeedTest(comp_mask_pred_func test_impl,
                                           BLOCK_SIZE bsize) {
  const int w = block_size_wide[bsize];
  const int h = block_size_high[bsize];
  const int num_wedges = (1 << get_wedge_bits_lookup(bsize));
  const int middle_wedge = num_wedges / 2;
  const uint8_t *mask = av1_get_contiguous_soft_mask(middle_wedge, 1, bsize);
  const int num_loops = 1000000000 / (w + h);

  // Index 0 is the C reference, index 1 the implementation under test.
  comp_mask_pred_func candidates[2] = { aom_comp_mask_pred_c, test_impl };
  double ns_per_call[2] = { 0 };

  for (int which = 0; which < 2; ++which) {
    aom_usec_timer timer;
    aom_usec_timer_start(&timer);
    comp_mask_pred_func candidate = candidates[which];
    for (int iter = 0; iter != num_loops; ++iter) {
      candidate(comp_pred1_, pred_, w, h, ref_, MAX_SB_SIZE, mask, w, 0);
    }
    aom_usec_timer_mark(&timer);
    const double elapsed_us =
        static_cast<double>(aom_usec_timer_elapsed(&timer));
    // Microseconds for the whole run -> nanoseconds per call.
    ns_per_call[which] = 1000.0 * elapsed_us / num_loops;
  }

  printf("comp_mask_pred %3dx%-3d: %7.2f/%7.2f ns", w, h, ns_per_call[0],
         ns_per_call[1]);
  printf(" (%3.2f)\n", ns_per_call[0] / ns_per_call[1]);
}
// Correctness test: the parameterized implementation must match the C
// reference for the parameterized block size.
TEST_P(AV1CompMaskVarianceTest, CheckOutput) {
  // The function pointer travels through the parameter tuple as an intptr_t.
  const comp_mask_pred_func impl_under_test = (comp_mask_pred_func)GET_PARAM(0);
  const BLOCK_SIZE block_size = GET_PARAM(1);
  RunCheckOutput(impl_under_test, block_size);
}
// Benchmark, disabled by default through the DISABLED_ name prefix.
TEST_P(AV1CompMaskVarianceTest, DISABLED_Speed) {
  // The function pointer travels through the parameter tuple as an intptr_t.
  const comp_mask_pred_func impl_under_test = (comp_mask_pred_func)GET_PARAM(0);
  const BLOCK_SIZE block_size = GET_PARAM(1);
  RunSpeedTest(impl_under_test, block_size);
}
// Instantiates AV1CompMaskVarianceTest for aom_comp_mask_pred_ssse3 when the
// build provides SSSE3 support.
#if HAVE_SSSE3
// Function under test, stored as an intptr_t to fit CompMaskPredParam.
const intptr_t comp_mask_pred_ssse3_f = (intptr_t)(&aom_comp_mask_pred_ssse3);
// Block sizes covered; every width listed here is 8, 16 or 32.
const CompMaskPredParam kArrayCompMaskPred_ssse3[] = {
testing::make_tuple(comp_mask_pred_ssse3_f, BLOCK_8X8),
testing::make_tuple(comp_mask_pred_ssse3_f, BLOCK_8X16),
testing::make_tuple(comp_mask_pred_ssse3_f, BLOCK_16X8),
testing::make_tuple(comp_mask_pred_ssse3_f, BLOCK_16X16),
testing::make_tuple(comp_mask_pred_ssse3_f, BLOCK_16X32),
testing::make_tuple(comp_mask_pred_ssse3_f, BLOCK_32X16),
testing::make_tuple(comp_mask_pred_ssse3_f, BLOCK_32X32),
testing::make_tuple(comp_mask_pred_ssse3_f, BLOCK_8X32),
testing::make_tuple(comp_mask_pred_ssse3_f, BLOCK_32X8),
};
INSTANTIATE_TEST_CASE_P(SSSE3, AV1CompMaskVarianceTest,
::testing::ValuesIn(kArrayCompMaskPred_ssse3));
#endif
// Fixture for the upsampled variant. It reuses the buffers and setup of
// AV1CompMaskVarianceTest and compares against aom_comp_mask_upsampled_pred_c.
class AV1CompMaskUpVarianceTest : public AV1CompMaskVarianceTest {
public:
~AV1CompMaskUpVarianceTest();
protected:
// Checks test_impl against the C reference for all sub-pel offsets, wedge
// masks and both mask inversions.
void RunCheckOutput(comp_mask_up_pred_func test_impl, BLOCK_SIZE bsize);
// Runs RunSpeedTestSub() once without and once with sub-pel offsets.
void RunSpeedTest(comp_mask_up_pred_func test_impl, BLOCK_SIZE bsize);
// Times the C reference and test_impl; a non-zero havSub selects sub-pel
// offsets (3, 4), zero selects (0, 0).
void RunSpeedTestSub(comp_mask_up_pred_func test_impl, BLOCK_SIZE bsize,
int havSub);
};
// The derived fixture adds no members of its own, so there is nothing to
// clean up here.
AV1CompMaskUpVarianceTest::~AV1CompMaskUpVarianceTest() {}
// Verifies test_impl against aom_comp_mask_upsampled_pred_c for block size
// bsize.
//
// Every (subx, suby) pair in [0, 8) x [0, 8) is combined with every wedge
// mask available for bsize and with invert_mask set to 0 and to 1; the two
// w x h outputs must match pixel for pixel. The first mismatch raises a
// fatal assertion and ends the check.
void AV1CompMaskUpVarianceTest::RunCheckOutput(comp_mask_up_pred_func test_impl,
                                               BLOCK_SIZE bsize) {
  const int w = block_size_wide[bsize];
  const int h = block_size_high[bsize];
  const int wedge_types = (1 << get_wedge_bits_lookup(bsize));
  // suby is the outer loop and subx the inner one, which visits the offsets
  // in the same order as a single counter split into (sub >> 3, sub & 7).
  for (int suby = 0; suby < 8; ++suby) {
    for (int subx = 0; subx < 8; ++subx) {
      for (int wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
        const uint8_t *mask =
            av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
        for (int inv = 0; inv < 2; ++inv) {
          aom_comp_mask_upsampled_pred_c(comp_pred1_, pred_, w, h, subx, suby,
                                         ref_, MAX_SB_SIZE, mask, w, inv);
          test_impl(comp_pred2_, pred_, w, h, subx, suby, ref_, MAX_SB_SIZE,
                    mask, w, inv);
          // check result
          for (int i = 0; i < h; ++i) {
            for (int j = 0; j < w; ++j) {
              const int idx = i * w + j;
              // " sub (" carries a leading space so the message does not run
              // the inv value and the word "sub" together.
              ASSERT_EQ(comp_pred1_[idx], comp_pred2_[idx])
                  << w << "x" << h << " Pixel mismatch at index " << idx
                  << " = (" << i << ", " << j << "), wedge " << wedge_index
                  << " inv " << inv << " sub (" << subx << "," << suby << ")";
            }
          }
        }
      }
    }
  }
}
// Benchmarks aom_comp_mask_upsampled_pred_c and test_impl on block size
// bsize.
//
// A non-zero havSub uses sub-pel offsets (3, 4); zero uses (0, 0). Both
// functions are called 1000000000 / (w + h) times with the middle wedge mask
// and invert_mask == 0. The average time per call in nanoseconds is printed
// for each, followed by the ratio reference / test_impl.
void AV1CompMaskUpVarianceTest::RunSpeedTestSub(
    comp_mask_up_pred_func test_impl, BLOCK_SIZE bsize, int havSub) {
  const int w = block_size_wide[bsize];
  const int h = block_size_high[bsize];

  int subx = 0;
  int suby = 0;
  if (havSub) {
    subx = 3;
    suby = 4;
  }

  const int num_wedges = (1 << get_wedge_bits_lookup(bsize));
  const int middle_wedge = num_wedges / 2;
  const uint8_t *mask = av1_get_contiguous_soft_mask(middle_wedge, 1, bsize);
  const int num_loops = 1000000000 / (w + h);

  // Index 0 is the C reference, index 1 the implementation under test.
  comp_mask_up_pred_func candidates[2] = { &aom_comp_mask_upsampled_pred_c,
                                           test_impl };
  double ns_per_call[2] = { 0 };

  for (int which = 0; which < 2; ++which) {
    aom_usec_timer timer;
    aom_usec_timer_start(&timer);
    comp_mask_up_pred_func candidate = candidates[which];
    for (int iter = 0; iter != num_loops; ++iter) {
      candidate(comp_pred1_, pred_, w, h, subx, suby, ref_, MAX_SB_SIZE, mask,
                w, 0);
    }
    aom_usec_timer_mark(&timer);
    const double elapsed_us =
        static_cast<double>(aom_usec_timer_elapsed(&timer));
    // Microseconds for the whole run -> nanoseconds per call.
    ns_per_call[which] = 1000.0 * elapsed_us / num_loops;
  }

  printf("CompMask[%d] %3dx%-3d:%7.2f/%7.2fns", havSub, w, h, ns_per_call[0],
         ns_per_call[1]);
  printf("(%3.2f)\n", ns_per_call[0] / ns_per_call[1]);
}
// Runs the benchmark twice: first with zero sub-pel offsets (where the
// implementation could skip the upsampling step), then with non-zero ones.
void AV1CompMaskUpVarianceTest::RunSpeedTest(comp_mask_up_pred_func test_impl,
                                             BLOCK_SIZE bsize) {
  for (int have_sub = 0; have_sub <= 1; ++have_sub) {
    RunSpeedTestSub(test_impl, bsize, have_sub);
  }
}
// Correctness test: the parameterized upsampled implementation must match
// the C reference for the parameterized block size.
TEST_P(AV1CompMaskUpVarianceTest, CheckOutput) {
  // The function pointer travels through the parameter tuple as an intptr_t.
  const comp_mask_up_pred_func impl_under_test =
      (comp_mask_up_pred_func)GET_PARAM(0);
  const BLOCK_SIZE block_size = GET_PARAM(1);
  RunCheckOutput(impl_under_test, block_size);
}
// Benchmark, disabled by default through the DISABLED_ name prefix.
TEST_P(AV1CompMaskUpVarianceTest, DISABLED_Speed) {
  // The function pointer travels through the parameter tuple as an intptr_t.
  const comp_mask_up_pred_func impl_under_test =
      (comp_mask_up_pred_func)GET_PARAM(0);
  const BLOCK_SIZE block_size = GET_PARAM(1);
  RunSpeedTest(impl_under_test, block_size);
}
// Instantiates AV1CompMaskUpVarianceTest for
// aom_comp_mask_upsampled_pred_ssse3 when the build provides SSSE3 support.
#if HAVE_SSSE3
// Function under test, stored as an intptr_t to fit CompMaskPredParam.
const intptr_t comp_mask_up_pred_ssse3_f =
(intptr_t)(&aom_comp_mask_upsampled_pred_ssse3);
// Block sizes covered; every width listed here is 8, 16 or 32.
const CompMaskPredParam kArrayCompMaskUpPred_ssse3[] = {
testing::make_tuple(comp_mask_up_pred_ssse3_f, BLOCK_8X8),
testing::make_tuple(comp_mask_up_pred_ssse3_f, BLOCK_8X16),
testing::make_tuple(comp_mask_up_pred_ssse3_f, BLOCK_16X8),
testing::make_tuple(comp_mask_up_pred_ssse3_f, BLOCK_16X16),
testing::make_tuple(comp_mask_up_pred_ssse3_f, BLOCK_16X32),
testing::make_tuple(comp_mask_up_pred_ssse3_f, BLOCK_32X16),
testing::make_tuple(comp_mask_up_pred_ssse3_f, BLOCK_32X32),
testing::make_tuple(comp_mask_up_pred_ssse3_f, BLOCK_8X32),
testing::make_tuple(comp_mask_up_pred_ssse3_f, BLOCK_32X8),
};
INSTANTIATE_TEST_CASE_P(SSSE3, AV1CompMaskUpVarianceTest,
::testing::ValuesIn(kArrayCompMaskUpPred_ssse3));
#endif
} // namespace AV1CompMaskVariance
......@@ -230,8 +230,9 @@ if (CONFIG_AV1_ENCODER)
"${AOM_ROOT}/test/hadamard_test.cc"
"${AOM_ROOT}/test/masked_sad_test.cc"
"${AOM_ROOT}/test/masked_variance_test.cc"
"${AOM_ROOT}/test/comp_mask_variance_test.cc"
"${AOM_ROOT}/test/minmax_test.cc"
"${AOM_ROOT}/test/noise_model_test.cc"
"${AOM_ROOT}/test/noise_model_test.cc"
"${AOM_ROOT}/test/subtract_test.cc"
"${AOM_ROOT}/test/sum_squares_test.cc"
"${AOM_ROOT}/test/variance_test.cc")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment