Commit 0aa39ff0 authored by David Barker, committed by Debargha Mukherjee

ext-inter: Vectorize new masked SAD/SSE functions

We would expect that these new functions would be slower than
the old masked SAD/SSE functions, as they do additional work
(blending two inputs and comparing to a third, rather than
just comparing two inputs).

This is true for the SAD functions, which are about 50% slower
(depending on block size and bit depth). However, the sub-pixel
SSE functions are comparable in speed to the old code for the
accelerated special cases (xoffset or yoffset = 0 or 4), and are
between 40% and 90% faster for the generic case.

Change-Id: I1a296ed8fc9e3edc313a6add516ff76b17cd3e9f
parent b9f68d27
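For context, the commit message's "blending two inputs and comparing to a third" corresponds roughly to the reference-style sketch below. This is an illustration, not code from this commit: the 6-bit (0..64) blend mask with round-to-nearest, the exact effect of invert_mask, and the packed (stride == width) second_pred layout are assumptions inferred from the prototypes and tests further down.

#include <stdint.h>
#include <stdlib.h>

/* Sketch of the new masked SAD semantics (illustrative, not the commit's
 * code): blend the reference block and the second predictor under a 0..64
 * mask, then take the sum of absolute differences against the source. */
static unsigned int masked_sad_sketch(const uint8_t *src, int src_stride,
                                      const uint8_t *ref, int ref_stride,
                                      const uint8_t *second_pred,
                                      const uint8_t *msk, int msk_stride,
                                      int invert_mask, int w, int h) {
  unsigned int sad = 0;
  for (int i = 0; i < h; i++) {
    for (int j = 0; j < w; j++) {
      /* invert_mask flips which input the mask weight applies to. */
      const int m = invert_mask ? 64 - msk[j] : msk[j];
      /* 64-weight alpha blend with rounding (assumed convention). */
      const int pred = (m * ref[j] + (64 - m) * second_pred[j] + 32) >> 6;
      sad += abs(pred - src[j]);
    }
    src += src_stride;
    ref += ref_stride;
    second_pred += w; /* second_pred assumed packed, stride == width */
    msk += msk_stride;
  }
  return sad;
}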
@@ -343,6 +343,10 @@ DSP_SRCS-$(HAVE_AVX2) += x86/sad_highbd_avx2.c
endif
ifeq ($(CONFIG_AV1_ENCODER),yes)
ifeq ($(CONFIG_EXT_INTER),yes)
DSP_SRCS-$(HAVE_SSSE3) += x86/masked_sad_intrin_ssse3.c
DSP_SRCS-$(HAVE_SSSE3) += x86/masked_variance_intrin_ssse3.c
endif #CONFIG_EXT_INTER
ifeq ($(CONFIG_MOTION_VAR),yes)
DSP_SRCS-$(HAVE_SSE4_1) += x86/obmc_sad_sse4.c
DSP_SRCS-$(HAVE_SSE4_1) += x86/obmc_variance_sse4.c
@@ -741,12 +741,14 @@ if (aom_config("CONFIG_EXT_INTER") eq "yes") {
foreach (@block_sizes) {
($w, $h) = @$_;
add_proto qw/unsigned int/, "aom_masked_sad${w}x${h}", "const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask";
specialize "aom_masked_sad${w}x${h}", qw/ssse3/;
}
if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
foreach (@block_sizes) {
($w, $h) = @$_;
add_proto qw/unsigned int/, "aom_highbd_masked_sad${w}x${h}", "const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask";
specialize "aom_highbd_masked_sad${w}x${h}", qw/ssse3/;
}
}
}
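The add_proto/specialize pairs above register the new signatures with aom's run-time CPU dispatch, so the SSSE3 version is used when the CPU supports it. A simplified sketch of that dispatch for one block size follows; the flag value and setup function are illustrative stand-ins rather than the generated header, and the _c/_ssse3 definitions are assumed to come from the library at link time.

#include <stdint.h>

#define HAS_SSSE3_SKETCH (1 << 4) /* illustrative CPU-flag bit */

/* Prototypes assumed to be provided by libaom. */
unsigned int aom_masked_sad16x16_c(const uint8_t *src, int src_stride,
                                   const uint8_t *ref, int ref_stride,
                                   const uint8_t *second_pred,
                                   const uint8_t *msk, int msk_stride,
                                   int invert_mask);
unsigned int aom_masked_sad16x16_ssse3(const uint8_t *src, int src_stride,
                                       const uint8_t *ref, int ref_stride,
                                       const uint8_t *second_pred,
                                       const uint8_t *msk, int msk_stride,
                                       int invert_mask);

/* The dispatch pointer defaults to the C reference and is upgraded to the
 * SSSE3 version when the CPU reports support. */
static unsigned int (*masked_sad16x16)(const uint8_t *, int, const uint8_t *,
                                       int, const uint8_t *, const uint8_t *,
                                       int, int) = aom_masked_sad16x16_c;

static void setup_dispatch_sketch(int cpu_flags) {
  masked_sad16x16 = aom_masked_sad16x16_c;
  if (cpu_flags & HAS_SSSE3_SKETCH)
    masked_sad16x16 = aom_masked_sad16x16_ssse3;
}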
@@ -1046,6 +1048,7 @@ if (aom_config("CONFIG_EXT_INTER") eq "yes") {
foreach (@block_sizes) {
($w, $h) = @$_;
add_proto qw/unsigned int/, "aom_masked_sub_pixel_variance${w}x${h}", "const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse";
specialize "aom_masked_sub_pixel_variance${w}x${h}", qw/ssse3/;
}
if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
@@ -1053,6 +1056,7 @@ if (aom_config("CONFIG_EXT_INTER") eq "yes") {
foreach (@block_sizes) {
($w, $h) = @$_;
add_proto qw/unsigned int/, "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", "const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse";
specialize "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", qw/ssse3/;
}
}
}
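The masked sub-pixel variance prototypes follow the same pattern, with xoffset/yoffset selecting a sub-pel position; the special cases the commit message mentions (xoffset or yoffset = 0 or 4) are the positions where a 2-tap bilinear filter degenerates to a copy or a simple average. The sketch below illustrates the assumed generic path for an 8-bit block: 2-tap filtering of ref in each direction, a 0..64 masked blend with second_pred, then variance against src. The filter weights, blend rounding, and buffer layout are assumptions, not code from this commit.

#include <stdint.h>

#define BW 16 /* illustrative block size */
#define BH 16

/* 2-tap bilinear filter in one direction; offset is assumed to be in
 * eighths of a pixel (8 sub-pel positions), weights summing to 128. */
static void bilinear_1d(const uint8_t *in, int in_stride, int pixel_step,
                        uint8_t *out, int out_w, int out_h, int offset) {
  const int f1 = offset * 16;
  const int f0 = 128 - f1;
  for (int i = 0; i < out_h; i++) {
    for (int j = 0; j < out_w; j++)
      out[j] = (uint8_t)((f0 * in[j] + f1 * in[j + pixel_step] + 64) >> 7);
    in += in_stride;
    out += out_w;
  }
}

static unsigned int masked_sub_pixel_variance_sketch(
    const uint8_t *src, int src_stride, int xoffset, int yoffset,
    const uint8_t *ref, int ref_stride, const uint8_t *second_pred,
    const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse) {
  uint8_t hbuf[(BH + 1) * BW], vbuf[BH * BW];
  int64_t sum = 0;
  uint64_t sse64 = 0;
  /* Horizontal then vertical filtering of ref; the extra row feeds the
   * vertical pass. For xoffset/yoffset equal to 0 or 4 these passes reduce
   * to a copy or an average, which is why those cases are cheap. */
  bilinear_1d(ref, ref_stride, 1, hbuf, BW, BH + 1, xoffset);
  bilinear_1d(hbuf, BW, BW, vbuf, BW, BH, yoffset);
  for (int i = 0; i < BH; i++) {
    for (int j = 0; j < BW; j++) {
      const int m =
          invert_mask ? 64 - msk[i * msk_stride + j] : msk[i * msk_stride + j];
      const int pred =
          (m * vbuf[i * BW + j] + (64 - m) * second_pred[i * BW + j] + 32) >> 6;
      const int diff = pred - src[i * src_stride + j];
      sum += diff;
      sse64 += (int64_t)diff * diff;
    }
  }
  *sse = (unsigned int)sse64;
  /* variance = SSE - sum^2 / (BW * BH). */
  return (unsigned int)(sse64 - (uint64_t)((sum * sum) / (BW * BH)));
}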
@@ -25,7 +25,7 @@
using libaom_test::ACMRandom;
namespace {
const int number_of_iterations = 500;
const int number_of_iterations = 200;
typedef unsigned int (*MaskedSADFunc)(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
@@ -159,9 +159,7 @@ TEST_P(HighbdMaskedSADTest, OperationCheck) {
using std::tr1::make_tuple;
// TODO(david.barker): Re-enable this once we have vectorized
// versions of the masked_compound_* functions
#if 0 && HAVE_SSSE3
#if HAVE_SSSE3
INSTANTIATE_TEST_CASE_P(
SSSE3_C_COMPARE, MaskedSADTest,
::testing::Values(
@@ -221,5 +219,5 @@ INSTANTIATE_TEST_CASE_P(SSSE3_C_COMPARE, HighbdMaskedSADTest,
make_tuple(&aom_highbd_masked_sad4x4_ssse3,
&aom_highbd_masked_sad4x4_c)));
#endif // CONFIG_HIGHBITDEPTH
#endif // 0 && HAVE_SSSE3
#endif // HAVE_SSSE3
} // namespace
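Re-enabling the SSSE3 instantiations restores the SSSE3-versus-C comparison on random inputs. A plain-C sketch of the kind of check these tests perform follows; the real tests use GoogleTest and the function tuples above, so the buffer sizes and helper name here are illustrative only.

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

typedef unsigned int (*MaskedSADFunc)(const uint8_t *src, int src_stride,
                                      const uint8_t *ref, int ref_stride,
                                      const uint8_t *second_pred,
                                      const uint8_t *msk, int msk_stride,
                                      int invert_mask);

/* Run an optimized masked SAD and its C reference on identical random
 * buffers and require bit-exact results, mirroring SSSE3_C_COMPARE. */
static void compare_masked_sad(MaskedSADFunc opt, MaskedSADFunc ref_fn,
                               int iterations) {
  enum { kStride = 128, kBuf = kStride * kStride }; /* illustrative sizes */
  static uint8_t src[kBuf], ref[kBuf], second_pred[kBuf], msk[kBuf];
  for (int it = 0; it < iterations; ++it) {
    for (int j = 0; j < kBuf; ++j) {
      src[j] = (uint8_t)(rand() & 0xff);
      ref[j] = (uint8_t)(rand() & 0xff);
      second_pred[j] = (uint8_t)(rand() & 0xff);
      msk[j] = (uint8_t)(rand() % 65); /* mask values 0..64, as in the tests */
    }
    for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {
      assert(ref_fn(src, kStride, ref, kStride, second_pred, msk, kStride,
                    invert_mask) ==
             opt(src, kStride, ref, kStride, second_pred, msk, kStride,
                 invert_mask));
    }
  }
}

For one of the pairs registered above, a call such as compare_masked_sad(aom_masked_sad16x16_ssse3, aom_masked_sad16x16_c, 200) would exercise the same agreement check, with 200 matching the iteration count used by the tests after this change.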
@@ -29,7 +29,7 @@
using libaom_test::ACMRandom;
namespace {
const int number_of_iterations = 500;
const int number_of_iterations = 200;
typedef unsigned int (*MaskedSubPixelVarianceFunc)(
const uint8_t *src, int src_stride, int xoffset, int yoffset,
@@ -217,15 +217,14 @@ TEST_P(HighbdMaskedSubPixelVarianceTest, OperationCheck) {
int xoffset, yoffset;
for (int i = 0; i < number_of_iterations; ++i) {
for (int j = 0; j < (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1); j++) {
src_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
ref_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
second_pred_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
msk_ptr[j] = rnd(65);
}
for (xoffset = 0; xoffset < BIL_SUBPEL_SHIFTS; xoffset++) {
for (yoffset = 0; yoffset < BIL_SUBPEL_SHIFTS; yoffset++) {
for (int j = 0; j < (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1); j++) {
src_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
ref_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
second_pred_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
msk_ptr[j] = rnd(65);
}
for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {
ref_ret = ref_func_(src8_ptr, src_stride, xoffset, yoffset, ref8_ptr,
ref_stride, second_pred8_ptr, msk_ptr, msk_stride,
@@ -319,9 +318,7 @@ TEST_P(HighbdMaskedSubPixelVarianceTest, ExtremeValues) {
using std::tr1::make_tuple;
// TODO(david.barker): Re-enable this once we have vectorized
// versions of the masked_compound_* functions
#if 0 && HAVE_SSSE3
#if HAVE_SSSE3
INSTANTIATE_TEST_CASE_P(
SSSE3_C_COMPARE, MaskedSubPixelVarianceTest,
::testing::Values(
@@ -490,5 +487,5 @@ INSTANTIATE_TEST_CASE_P(
AOM_BITS_12)));
#endif // CONFIG_HIGHBITDEPTH
#endif // 0 && HAVE_SSSE3
#endif // HAVE_SSSE3
} // namespace