diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index e1b9d527bb3f75224bcd14b4308b261560d8e0df..fe0b650a4d65c9e7aec4685c45f35e8a26b97b88 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -1783,7 +1783,9 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
 
 
   add_proto qw/void aom_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
+  specialize qw/aom_comp_mask_pred ssse3/;
   add_proto qw/void aom_comp_mask_upsampled_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
+  specialize qw/aom_comp_mask_upsampled_pred ssse3/;
   add_proto qw/void aom_highbd_comp_mask_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
   add_proto qw/void aom_highbd_comp_mask_upsampled_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, int bd";
 
diff --git a/aom_dsp/x86/masked_variance_intrin_ssse3.c b/aom_dsp/x86/masked_variance_intrin_ssse3.c
index d670a4fab5b1ffc11e7202bf9a1d4a2819b827a6..d71f374e695cf9357e3b994fb2a9161c6b216981 100644
--- a/aom_dsp/x86/masked_variance_intrin_ssse3.c
+++ b/aom_dsp/x86/masked_variance_intrin_ssse3.c
@@ -1039,3 +1039,86 @@ static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride,
   *sum_ = _mm_cvtsi128_si32(sum);
   *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
 }
+
+// Blends pred and ref into comp_pred, handling two rows of eight pixels per
+// inner iteration. comp_pred and pred are addressed with a stride of width,
+// ref with ref_stride and mask with mask_stride. With invert_mask == 0 each
+// mask byte m weights ref and (AOM_BLEND_A64_MAX_ALPHA - m) weights pred; a
+// non-zero invert_mask swaps the two sources. height must be even and width
+// a multiple of 8 (both asserted).
+INLINE void aom_comp_mask_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
+                                     int width, int height, const uint8_t *ref,
+                                     int ref_stride, const uint8_t *mask,
+                                     int mask_stride, int invert_mask) {
+  const uint8_t *src0 = invert_mask ? pred : ref;
+  const uint8_t *src1 = invert_mask ? ref : pred;
+  const int stride0 = invert_mask ? width : ref_stride;
+  const int stride1 = invert_mask ? ref_stride : width;
+
+  const __m128i alpha_max = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i round_offset =
+      _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+  assert(height % 2 == 0);
+  assert(width % 8 == 0);
+  int i = 0, j = 0;
+  do {
+    // TODO(bingpengsmail@gmail.com): add 16 pixel version
+    j = 0;
+    do {
+      // odd line A
+      const __m128i sA0 = _mm_loadl_epi64((const __m128i *)(src0 + j));
+      const __m128i sA1 = _mm_loadl_epi64((const __m128i *)(src1 + j));
+      const __m128i aA = _mm_loadl_epi64((const __m128i *)(mask + j));
+
+      // even line B
+      const __m128i sB0 =
+          _mm_loadl_epi64((const __m128i *)(src0 + j + stride0));
+      const __m128i sB1 =
+          _mm_loadl_epi64((const __m128i *)(src1 + j + stride1));
+      // Mask bytes for both rows in one register: row A low, row B high.
+      const __m128i a = _mm_castps_si128(_mm_loadh_pi(
+          _mm_castsi128_ps(aA), (const __m64 *)(mask + j + mask_stride)));
+
+      const __m128i ssA = _mm_unpacklo_epi8(sA0, sA1);
+      const __m128i ssB = _mm_unpacklo_epi8(sB0, sB1);
+
+      const __m128i ma = _mm_sub_epi8(alpha_max, a);
+      const __m128i aaA = _mm_unpacklo_epi8(a, ma);
+      const __m128i aaB = _mm_unpackhi_epi8(a, ma);
+
+      // maddubs: src0 * a + src1 * (alpha_max - a) per pixel, as 16 bits.
+      const __m128i blendA = _mm_maddubs_epi16(ssA, aaA);
+      const __m128i blendB = _mm_maddubs_epi16(ssB, aaB);
+      // mulhrs by 1 << (15 - BITS) is a rounded right shift by BITS.
+      const __m128i roundA = _mm_mulhrs_epi16(blendA, round_offset);
+      const __m128i roundB = _mm_mulhrs_epi16(blendB, round_offset);
+      const __m128i round = _mm_packus_epi16(roundA, roundB);
+      _mm_storel_epi64((__m128i *)(comp_pred + j), round);
+      _mm_storeh_pi((__m64 *)(comp_pred + j + width), _mm_castsi128_ps(round));
+      j += 8;
+    } while (j < width);
+    comp_pred += (width << 1);
+    src0 += (stride0 << 1);
+    src1 += (stride1 << 1);
+    mask += (mask_stride << 1);
+    i += 2;
+  } while (i < height);
+}
+
+// Same blend as aom_comp_mask_pred_ssse3, but when either sub-pixel offset is
+// non-zero, ref is first upsampled into comp_pred via aom_upsampled_pred and
+// the blend then reads that intermediate result (stride width) in place.
+void aom_comp_mask_upsampled_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
+                                        int width, int height, int subpel_x_q3,
+                                        int subpel_y_q3, const uint8_t *ref,
+                                        int ref_stride, const uint8_t *mask,
+                                        int mask_stride, int invert_mask) {
+  if (subpel_x_q3 || subpel_y_q3) {
+    aom_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, ref,
+                       ref_stride);
+    ref = comp_pred;
+    ref_stride = width;
+  }
+  aom_comp_mask_pred_ssse3(comp_pred, pred, width, height, ref, ref_stride,
+                           mask, mask_stride, invert_mask);
+}
diff --git a/av1/common/reconinter.h b/av1/common/reconinter.h
index df92cfe53a43412749cebf92796f6fb6084e387a..db5dfff4de8a0de7783c13e314683e9ae39b8275 100644
--- a/av1/common/reconinter.h
+++ b/av1/common/reconinter.h
@@ -232,8 +232,10 @@ static INLINE int is_interinter_compound_used(COMPOUND_TYPE type,
 
 static INLINE int is_any_masked_compound_used(BLOCK_SIZE sb_type) {
   COMPOUND_TYPE comp_type;
+  int i;
   if (!is_comp_ref_allowed(sb_type)) return 0;
-  for (comp_type = 0; comp_type < COMPOUND_TYPES; comp_type++) {
+  for (i = 0; i < COMPOUND_TYPES; i++) {
+    comp_type = (COMPOUND_TYPE)i;
     if (is_masked_compound_type(comp_type) &&
         is_interinter_compound_used(comp_type, sb_type))
       return 1;
@@ -286,8 +288,8 @@ static INLINE MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd,
   const int spel_right = spel_left - SUBPEL_SHIFTS;
   const int spel_top = (AOM_INTERP_EXTEND + bh) << SUBPEL_BITS;
   const int spel_bottom = spel_top - SUBPEL_SHIFTS;
-  MV clamped_mv = { src_mv->row * (1 << (1 - ss_y)),
-                    src_mv->col * (1 << (1 - ss_x)) };
+  MV clamped_mv = { (int16_t)(src_mv->row * (1 << (1 - ss_y))),
+                    (int16_t)(src_mv->col * (1 << (1 - ss_x))) };
   assert(ss_x <= 1);
   assert(ss_y <= 1);
 
diff --git a/test/comp_mask_variance_test.cc b/test/comp_mask_variance_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e082d577711d1d486c1b64ac49745872f7a915df
--- /dev/null
+++ b/test/comp_mask_variance_test.cc
@@ -0,0 +1,284 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdlib>
+#include <new>
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "av1/common/reconinter.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace AV1CompMaskVariance {
+typedef void (*comp_mask_pred_func)(uint8_t *comp_pred, const uint8_t *pred,
+                                    int width, int height, const uint8_t *ref,
+                                    int ref_stride, const uint8_t *mask,
+                                    int mask_stride, int invert_mask);
+typedef void (*comp_mask_up_pred_func)(uint8_t *comp_pred, const uint8_t *pred,
+                                       int width, int height, int subpel_x_q3,
+                                       int subpel_y_q3, const uint8_t *ref,
+                                       int ref_stride, const uint8_t *mask,
+                                       int mask_stride, int invert_mask);
+
+// (function under test, stored as an intptr_t; block size to exercise)
+typedef std::tr1::tuple<intptr_t, BLOCK_SIZE> CompMaskPredParam;
+
+// Compares an optimized aom_comp_mask_pred implementation against the C
+// reference for every wedge mask of a block size, with and without inversion.
+class AV1CompMaskVarianceTest
+    : public ::testing::TestWithParam<CompMaskPredParam> {
+ public:
+  ~AV1CompMaskVarianceTest();
+  void SetUp();
+
+  void TearDown();
+
+ protected:
+  void RunCheckOutput(comp_mask_pred_func test_impl, BLOCK_SIZE bsize);
+  void RunSpeedTest(comp_mask_pred_func test_impl, BLOCK_SIZE bsize);
+
+  libaom_test::ACMRandom rnd_;
+  uint8_t *comp_pred1_;
+  uint8_t *comp_pred2_;
+  uint8_t *pred_;
+  uint8_t *ref_buffer_;
+  uint8_t *ref_;
+};
+
+AV1CompMaskVarianceTest::~AV1CompMaskVarianceTest() { ; }
+
+void AV1CompMaskVarianceTest::SetUp() {
+  rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
+  av1_init_wedge_masks();
+  comp_pred1_ = (uint8_t *)aom_calloc(MAX_SB_SQUARE, 1);
+  comp_pred2_ = (uint8_t *)aom_calloc(MAX_SB_SQUARE, 1);
+  pred_ = (uint8_t *)aom_malloc(MAX_SB_SQUARE);
+  // ref_ is offset 8 * MAX_SB_SIZE bytes into ref_buffer_ (leading margin).
+  ref_buffer_ = (uint8_t *)aom_malloc(MAX_SB_SQUARE + (8 * MAX_SB_SIZE));
+  ref_ = ref_buffer_ + (8 * MAX_SB_SIZE);
+  for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+    pred_[i] = rnd_.Rand8();
+  }
+  for (int i = 0; i < MAX_SB_SQUARE + (8 * MAX_SB_SIZE); ++i) {
+    ref_buffer_[i] = rnd_.Rand8();
+  }
+}
+
+void AV1CompMaskVarianceTest::TearDown() {
+  aom_free(comp_pred1_);
+  aom_free(comp_pred2_);
+  aom_free(pred_);
+  aom_free(ref_buffer_);
+  libaom_test::ClearSystemState();
+}
+
+void AV1CompMaskVarianceTest::RunCheckOutput(comp_mask_pred_func test_impl,
+                                             BLOCK_SIZE bsize) {
+  const int w = block_size_wide[bsize];
+  const int h = block_size_high[bsize];
+
+  int wedge_types = (1 << get_wedge_bits_lookup(bsize));
+  for (int wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
+    const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
+    for (int inv = 0; inv < 2; ++inv) {
+      aom_comp_mask_pred_c(comp_pred1_, pred_, w, h, ref_, MAX_SB_SIZE, mask, w,
+                           inv);
+      test_impl(comp_pred2_, pred_, w, h, ref_, MAX_SB_SIZE, mask, w, inv);
+      // check result
+      for (int i = 0; i < h; ++i) {
+        for (int j = 0; j < w; ++j) {
+          int idx = i * w + j;
+          ASSERT_EQ(comp_pred1_[idx], comp_pred2_[idx])
+              << w << "x" << h << " Pixel mismatch at index " << idx << " = ("
+              << i << ", " << j << "), wedge " << wedge_index << " inv " << inv;
+        }
+      }
+    }
+  }
+}
+
+void AV1CompMaskVarianceTest::RunSpeedTest(comp_mask_pred_func test_impl,
+                                           BLOCK_SIZE bsize) {
+  const int w = block_size_wide[bsize];
+  const int h = block_size_high[bsize];
+
+  int wedge_types = (1 << get_wedge_bits_lookup(bsize));
+  int wedge_index = wedge_types / 2;
+  const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
+  const int num_loops = 1000000000 / (w + h);
+
+  comp_mask_pred_func funcs[2] = { aom_comp_mask_pred_c, test_impl };
+  double elapsed_time[2] = { 0 };
+  for (int i = 0; i < 2; ++i) {
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    comp_mask_pred_func func = funcs[i];
+    for (int j = 0; j < num_loops; ++j) {
+      func(comp_pred1_, pred_, w, h, ref_, MAX_SB_SIZE, mask, w, 0);
+    }
+    aom_usec_timer_mark(&timer);
+    double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    elapsed_time[i] = 1000.0 * time / num_loops;
+  }
+  printf("comp_mask_pred %3dx%-3d: %7.2f/%7.2f ns", w, h, elapsed_time[0],
+         elapsed_time[1]);
+  printf(" (%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
+}
+
+TEST_P(AV1CompMaskVarianceTest, CheckOutput) {
+  RunCheckOutput((comp_mask_pred_func)GET_PARAM(0), GET_PARAM(1));
+}
+
+TEST_P(AV1CompMaskVarianceTest, DISABLED_Speed) {
+  RunSpeedTest((comp_mask_pred_func)GET_PARAM(0), GET_PARAM(1));
+}
+
+#if HAVE_SSSE3
+const intptr_t comp_mask_pred_ssse3_f = (intptr_t)(&aom_comp_mask_pred_ssse3);
+const CompMaskPredParam kArrayCompMaskPred_ssse3[] = {
+  testing::make_tuple(comp_mask_pred_ssse3_f, BLOCK_8X8),
+  testing::make_tuple(comp_mask_pred_ssse3_f, BLOCK_8X16),
+  testing::make_tuple(comp_mask_pred_ssse3_f, BLOCK_16X8),
+  testing::make_tuple(comp_mask_pred_ssse3_f, BLOCK_16X16),
+  testing::make_tuple(comp_mask_pred_ssse3_f, BLOCK_16X32),
+  testing::make_tuple(comp_mask_pred_ssse3_f, BLOCK_32X16),
+  testing::make_tuple(comp_mask_pred_ssse3_f, BLOCK_32X32),
+  testing::make_tuple(comp_mask_pred_ssse3_f, BLOCK_8X32),
+  testing::make_tuple(comp_mask_pred_ssse3_f, BLOCK_32X8),
+};
+
+INSTANTIATE_TEST_CASE_P(SSSE3, AV1CompMaskVarianceTest,
+                        ::testing::ValuesIn(kArrayCompMaskPred_ssse3));
+#endif
+
+// Same checks for aom_comp_mask_upsampled_pred, additionally sweeping all 8x8
+// sub-pixel offset combinations.
+class AV1CompMaskUpVarianceTest : public AV1CompMaskVarianceTest {
+ public:
+  ~AV1CompMaskUpVarianceTest();
+
+ protected:
+  void RunCheckOutput(comp_mask_up_pred_func test_impl, BLOCK_SIZE bsize);
+  void RunSpeedTest(comp_mask_up_pred_func test_impl, BLOCK_SIZE bsize);
+  void RunSpeedTestSub(comp_mask_up_pred_func test_impl, BLOCK_SIZE bsize,
+                       int havSub);
+};
+
+AV1CompMaskUpVarianceTest::~AV1CompMaskUpVarianceTest() { ; }
+
+void AV1CompMaskUpVarianceTest::RunCheckOutput(comp_mask_up_pred_func test_impl,
+                                               BLOCK_SIZE bsize) {
+  const int w = block_size_wide[bsize];
+  const int h = block_size_high[bsize];
+  int wedge_types = (1 << get_wedge_bits_lookup(bsize));
+
+  // loop through subx and suby
+  for (int sub = 0; sub < 8 * 8; ++sub) {
+    int subx = sub & 0x7;
+    int suby = (sub >> 3);
+    for (int wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
+      const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
+
+      for (int inv = 0; inv < 2; ++inv) {
+        aom_comp_mask_upsampled_pred_c(comp_pred1_, pred_, w, h, subx, suby,
+                                       ref_, MAX_SB_SIZE, mask, w, inv);
+        test_impl(comp_pred2_, pred_, w, h, subx, suby, ref_, MAX_SB_SIZE, mask,
+                  w, inv);
+        // check result
+        for (int i = 0; i < h; ++i) {
+          for (int j = 0; j < w; ++j) {
+            int idx = i * w + j;
+            ASSERT_EQ(comp_pred1_[idx], comp_pred2_[idx])
+                << w << "x" << h << " Pixel mismatch at index " << idx << " = ("
+                << i << ", " << j << "), wedge " << wedge_index << " inv "
+                << inv << "sub (" << subx << "," << suby << ")";
+          }
+        }
+      }
+    }
+  }
+}
+
+void AV1CompMaskUpVarianceTest::RunSpeedTestSub(
+    comp_mask_up_pred_func test_impl, BLOCK_SIZE bsize, int havSub) {
+  const int w = block_size_wide[bsize];
+  const int h = block_size_high[bsize];
+  const int subx = havSub ? 3 : 0;
+  const int suby = havSub ? 4 : 0;
+
+  int wedge_types = (1 << get_wedge_bits_lookup(bsize));
+  int wedge_index = wedge_types / 2;
+  const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
+
+  const int num_loops = 1000000000 / (w + h);
+  comp_mask_up_pred_func funcs[2] = { &aom_comp_mask_upsampled_pred_c,
+                                      test_impl };
+  double elapsed_time[2] = { 0 };
+  for (int i = 0; i < 2; ++i) {
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+
+    comp_mask_up_pred_func func = funcs[i];
+    for (int j = 0; j < num_loops; ++j) {
+      func(comp_pred1_, pred_, w, h, subx, suby, ref_, MAX_SB_SIZE, mask, w, 0);
+    }
+
+    aom_usec_timer_mark(&timer);
+    double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    elapsed_time[i] = 1000.0 * time / num_loops;
+  }
+  printf("CompMask[%d] %3dx%-3d:%7.2f/%7.2fns", havSub, w, h, elapsed_time[0],
+         elapsed_time[1]);
+  printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
+}
+
+void AV1CompMaskUpVarianceTest::RunSpeedTest(comp_mask_up_pred_func test_impl,
+                                             BLOCK_SIZE bsize) {
+  RunSpeedTestSub(test_impl, bsize, 0);  // could skip upsample
+  RunSpeedTestSub(test_impl, bsize, 1);
+}
+
+TEST_P(AV1CompMaskUpVarianceTest, CheckOutput) {
+  RunCheckOutput((comp_mask_up_pred_func)GET_PARAM(0), GET_PARAM(1));
+}
+
+TEST_P(AV1CompMaskUpVarianceTest, DISABLED_Speed) {
+  RunSpeedTest((comp_mask_up_pred_func)GET_PARAM(0), GET_PARAM(1));
+}
+
+#if HAVE_SSSE3
+const intptr_t comp_mask_up_pred_ssse3_f =
+    (intptr_t)(&aom_comp_mask_upsampled_pred_ssse3);
+const CompMaskPredParam kArrayCompMaskUpPred_ssse3[] = {
+  testing::make_tuple(comp_mask_up_pred_ssse3_f, BLOCK_8X8),
+  testing::make_tuple(comp_mask_up_pred_ssse3_f, BLOCK_8X16),
+  testing::make_tuple(comp_mask_up_pred_ssse3_f, BLOCK_16X8),
+  testing::make_tuple(comp_mask_up_pred_ssse3_f, BLOCK_16X16),
+  testing::make_tuple(comp_mask_up_pred_ssse3_f, BLOCK_16X32),
+  testing::make_tuple(comp_mask_up_pred_ssse3_f, BLOCK_32X16),
+  testing::make_tuple(comp_mask_up_pred_ssse3_f, BLOCK_32X32),
+  testing::make_tuple(comp_mask_up_pred_ssse3_f, BLOCK_8X32),
+  testing::make_tuple(comp_mask_up_pred_ssse3_f, BLOCK_32X8),
+};
+
+INSTANTIATE_TEST_CASE_P(SSSE3, AV1CompMaskUpVarianceTest,
+                        ::testing::ValuesIn(kArrayCompMaskUpPred_ssse3));
+#endif
+}  // namespace AV1CompMaskVariance
diff --git a/test/test.cmake b/test/test.cmake
index f67051318e413d94225f703b9b2463a58b05c590..fb4daa8a26721a302031e010f201e04b8c3a38e5 100644
--- a/test/test.cmake
+++ b/test/test.cmake
@@ -230,8 +230,9 @@ if (CONFIG_AV1_ENCODER)
       "${AOM_ROOT}/test/hadamard_test.cc"
       "${AOM_ROOT}/test/masked_sad_test.cc"
       "${AOM_ROOT}/test/masked_variance_test.cc"
+      "${AOM_ROOT}/test/comp_mask_variance_test.cc"
       "${AOM_ROOT}/test/minmax_test.cc"
-      "${AOM_ROOT}/test/noise_model_test.cc"
+      "${AOM_ROOT}/test/noise_model_test.cc"
       "${AOM_ROOT}/test/subtract_test.cc"
       "${AOM_ROOT}/test/sum_squares_test.cc"
       "${AOM_ROOT}/test/variance_test.cc")