Commit e9832584 authored by Yi Luo
Browse files

High bit depth motion search SAD optimization on avx2

- For all blocks with width >= 16.
- Add test_count loops so the ShortSrc unit tests repeat with fresh random data many times, making them harder to pass.
- Speed testing on 1080p, 100 frames, 5 Mbps, CPU, i7-6700
  User level time reduction:
   baseline:                  3.68%
   baseline + ext-partition: 36.12%

Change-Id: I78c5d9ca216f0fd91f1a360dca2190b11fd54a08
parent 3709e5d7
......@@ -348,6 +348,10 @@ DSP_SRCS-$(HAVE_SSE4_1) += x86/sad_sse4.asm
DSP_SRCS-$(HAVE_AVX2) += x86/sad4d_avx2.c
DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c
# Add the AVX2 high-bitdepth SAD source to the HAVE_AVX2 source list only when
# CONFIG_AOM_HIGHBITDEPTH is set to yes.
ifeq ($(CONFIG_AOM_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_AVX2) += x86/sad_highbd_avx2.c
endif
ifeq ($(CONFIG_AV1_ENCODER),yes)
ifeq ($(CONFIG_EXT_INTER),yes)
DSP_SRCS-$(HAVE_SSSE3) += x86/masked_sad_intrin_ssse3.c
......
......@@ -1342,6 +1342,29 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
specialize "aom_highbd_sad${w}x${h}_avg", qw/sse2/;
}
}
# Register avx2 specializations for the high-bitdepth SAD functions and their
# _avg variants. Every listed block size has a width of at least 16.
specialize qw/aom_highbd_sad128x128 avx2/;
specialize qw/aom_highbd_sad128x64 avx2/;
specialize qw/aom_highbd_sad64x128 avx2/;
specialize qw/aom_highbd_sad64x64 avx2/;
specialize qw/aom_highbd_sad64x32 avx2/;
specialize qw/aom_highbd_sad32x64 avx2/;
specialize qw/aom_highbd_sad32x32 avx2/;
specialize qw/aom_highbd_sad32x16 avx2/;
specialize qw/aom_highbd_sad16x32 avx2/;
specialize qw/aom_highbd_sad16x16 avx2/;
specialize qw/aom_highbd_sad16x8 avx2/;
specialize qw/aom_highbd_sad128x128_avg avx2/;
specialize qw/aom_highbd_sad128x64_avg avx2/;
specialize qw/aom_highbd_sad64x128_avg avx2/;
specialize qw/aom_highbd_sad64x64_avg avx2/;
specialize qw/aom_highbd_sad64x32_avg avx2/;
specialize qw/aom_highbd_sad32x64_avg avx2/;
specialize qw/aom_highbd_sad32x32_avg avx2/;
specialize qw/aom_highbd_sad32x16_avg avx2/;
specialize qw/aom_highbd_sad16x32_avg avx2/;
specialize qw/aom_highbd_sad16x16_avg avx2/;
specialize qw/aom_highbd_sad16x8_avg avx2/;
}
#
......@@ -1472,6 +1495,17 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
specialize "aom_highbd_sad${w}x${h}x4d", qw/sse2/;
}
}
# Register avx2 specializations for the high-bitdepth x4d SAD functions, for
# the same block sizes (width >= 16) as listed here.
specialize qw/aom_highbd_sad128x128x4d avx2/;
specialize qw/aom_highbd_sad128x64x4d avx2/;
specialize qw/aom_highbd_sad64x128x4d avx2/;
specialize qw/aom_highbd_sad64x64x4d avx2/;
specialize qw/aom_highbd_sad64x32x4d avx2/;
specialize qw/aom_highbd_sad32x64x4d avx2/;
specialize qw/aom_highbd_sad32x32x4d avx2/;
specialize qw/aom_highbd_sad32x16x4d avx2/;
specialize qw/aom_highbd_sad16x32x4d avx2/;
specialize qw/aom_highbd_sad16x16x4d avx2/;
specialize qw/aom_highbd_sad16x8x4d avx2/;
}
#
......
This diff is collapsed.
......@@ -278,6 +278,14 @@ class SADTest : public SADTestBase,
ASSERT_EQ(reference_sad, exp_sad);
}
// Calls SAD(0) 20,000,000 times in a row, ignoring each return value.
// Used by the Speed test (presumably for manual timing runs).
void SpeedSAD() {
  for (int remaining = 20000000; remaining > 0; --remaining) {
    SAD(0);
  }
}
};
class SADavgTest : public SADTestBase,
......@@ -347,13 +355,29 @@ TEST_P(SADTest, UnalignedRef) {
}
// Halves the source stride, then performs 2000 rounds of: refill source and
// reference buffers with random data and verify the SAD result.
TEST_P(SADTest, ShortSrc) {
  const int saved_stride = source_stride_;
  source_stride_ >>= 1;
  for (int round = 0; round < 2000; ++round) {
    FillRandom(source_data_, source_stride_);
    FillRandom(reference_data_, reference_stride_);
    CheckSAD();
  }
  // Restore the stride that was in effect before this test shortened it.
  source_stride_ = saved_stride;
}
// Compile-time switch for the speed test below. With the value 0 the test is
// excluded from the build; change it to a non-zero value to compile it in.
#define SPEED_TEST (0)
#if SPEED_TEST
// Halves the source stride, fills source and reference with random data,
// verifies the SAD once with CheckSAD(), then runs the SpeedSAD() loop on the
// same buffers. The original stride is restored at the end.
TEST_P(SADTest, Speed) {
const int tmp_stride = source_stride_;
source_stride_ >>= 1;
FillRandom(source_data_, source_stride_);
FillRandom(reference_data_, reference_stride_);
CheckSAD();
SpeedSAD();
source_stride_ = tmp_stride;
}
#endif
TEST_P(SADavgTest, MaxRef) {
FillConstant(source_data_, source_stride_, 0);
......@@ -393,10 +417,14 @@ TEST_P(SADavgTest, UnalignedRef) {
// Halves the source stride and checks the averaging SAD against random
// source, reference and second-prediction data; the stride is restored at the
// end.
TEST_P(SADavgTest, ShortSrc) {
const int tmp_stride = source_stride_;
source_stride_ >>= 1;
// NOTE(review): this single fill/check pass duplicates the loop body below;
// it looks like the pre-change version of the test shown alongside the new
// loop by the diff rendering -- confirm against the real file.
FillRandom(source_data_, source_stride_);
FillRandom(reference_data_, reference_stride_);
FillRandom(second_pred_, width_);
CheckSAD();
// Repeat the randomized check 2000 times with fresh data on every pass.
int test_count = 2000;
while (test_count > 0) {
FillRandom(source_data_, source_stride_);
FillRandom(reference_data_, reference_stride_);
FillRandom(second_pred_, width_);
CheckSAD();
test_count -= 1;
}
source_stride_ = tmp_stride;
}
......@@ -447,12 +475,16 @@ TEST_P(SADx4Test, UnalignedRef) {
// Halves the source stride and checks the x4 SAD against random data in the
// source and in each of the four reference buffers; the stride is restored at
// the end.
TEST_P(SADx4Test, ShortSrc) {
int tmp_stride = source_stride_;
source_stride_ >>= 1;
// NOTE(review): this single fill/check pass duplicates the loop body below;
// it looks like the pre-change version of the test shown alongside the new
// loop by the diff rendering -- confirm against the real file.
FillRandom(source_data_, source_stride_);
FillRandom(GetReference(0), reference_stride_);
FillRandom(GetReference(1), reference_stride_);
FillRandom(GetReference(2), reference_stride_);
FillRandom(GetReference(3), reference_stride_);
CheckSADs();
// Repeat the randomized check 1000 times with fresh data on every pass.
int test_count = 1000;
while (test_count > 0) {
FillRandom(source_data_, source_stride_);
FillRandom(GetReference(0), reference_stride_);
FillRandom(GetReference(1), reference_stride_);
FillRandom(GetReference(2), reference_stride_);
FillRandom(GetReference(3), reference_stride_);
CheckSADs();
test_count -= 1;
}
source_stride_ = tmp_stride;
}
......@@ -940,6 +972,43 @@ const SadMxNParam avx2_tests[] = {
make_tuple(32, 64, &aom_sad32x64_avx2, -1),
make_tuple(32, 32, &aom_sad32x32_avx2, -1),
make_tuple(32, 16, &aom_sad32x16_avx2, -1),
#if CONFIG_AOM_HIGHBITDEPTH
#if CONFIG_EXT_PARTITION
make_tuple(128, 128, &aom_highbd_sad128x128_avx2, 8),
make_tuple(128, 128, &aom_highbd_sad128x128_avx2, 10),
make_tuple(128, 128, &aom_highbd_sad128x128_avx2, 12),
make_tuple(128, 64, &aom_highbd_sad128x64_avx2, 8),
make_tuple(128, 64, &aom_highbd_sad128x64_avx2, 10),
make_tuple(128, 64, &aom_highbd_sad128x64_avx2, 12),
make_tuple(64, 128, &aom_highbd_sad64x128_avx2, 8),
make_tuple(64, 128, &aom_highbd_sad64x128_avx2, 10),
make_tuple(64, 128, &aom_highbd_sad64x128_avx2, 12),
#endif
make_tuple(64, 64, &aom_highbd_sad64x64_avx2, 8),
make_tuple(64, 64, &aom_highbd_sad64x64_avx2, 10),
make_tuple(64, 64, &aom_highbd_sad64x64_avx2, 12),
make_tuple(64, 32, &aom_highbd_sad64x32_avx2, 8),
make_tuple(64, 32, &aom_highbd_sad64x32_avx2, 10),
make_tuple(64, 32, &aom_highbd_sad64x32_avx2, 12),
make_tuple(32, 64, &aom_highbd_sad32x64_avx2, 8),
make_tuple(32, 64, &aom_highbd_sad32x64_avx2, 10),
make_tuple(32, 64, &aom_highbd_sad32x64_avx2, 12),
make_tuple(32, 32, &aom_highbd_sad32x32_avx2, 8),
make_tuple(32, 32, &aom_highbd_sad32x32_avx2, 10),
make_tuple(32, 32, &aom_highbd_sad32x32_avx2, 12),
make_tuple(32, 16, &aom_highbd_sad32x16_avx2, 8),
make_tuple(32, 16, &aom_highbd_sad32x16_avx2, 10),
make_tuple(32, 16, &aom_highbd_sad32x16_avx2, 12),
make_tuple(16, 32, &aom_highbd_sad16x32_avx2, 8),
make_tuple(16, 32, &aom_highbd_sad16x32_avx2, 10),
make_tuple(16, 32, &aom_highbd_sad16x32_avx2, 12),
make_tuple(16, 16, &aom_highbd_sad16x16_avx2, 8),
make_tuple(16, 16, &aom_highbd_sad16x16_avx2, 10),
make_tuple(16, 16, &aom_highbd_sad16x16_avx2, 12),
make_tuple(16, 8, &aom_highbd_sad16x8_avx2, 8),
make_tuple(16, 8, &aom_highbd_sad16x8_avx2, 10),
make_tuple(16, 8, &aom_highbd_sad16x8_avx2, 12),
#endif
};
INSTANTIATE_TEST_CASE_P(AVX2, SADTest, ::testing::ValuesIn(avx2_tests));
......@@ -954,6 +1023,43 @@ const SadMxNAvgParam avg_avx2_tests[] = {
make_tuple(32, 64, &aom_sad32x64_avg_avx2, -1),
make_tuple(32, 32, &aom_sad32x32_avg_avx2, -1),
make_tuple(32, 16, &aom_sad32x16_avg_avx2, -1),
#if CONFIG_AOM_HIGHBITDEPTH
#if CONFIG_EXT_PARTITION
make_tuple(128, 128, &aom_highbd_sad128x128_avg_avx2, 8),
make_tuple(128, 128, &aom_highbd_sad128x128_avg_avx2, 10),
make_tuple(128, 128, &aom_highbd_sad128x128_avg_avx2, 12),
make_tuple(128, 64, &aom_highbd_sad128x64_avg_avx2, 8),
make_tuple(128, 64, &aom_highbd_sad128x64_avg_avx2, 10),
make_tuple(128, 64, &aom_highbd_sad128x64_avg_avx2, 12),
make_tuple(64, 128, &aom_highbd_sad64x128_avg_avx2, 8),
make_tuple(64, 128, &aom_highbd_sad64x128_avg_avx2, 10),
make_tuple(64, 128, &aom_highbd_sad64x128_avg_avx2, 12),
#endif
make_tuple(64, 64, &aom_highbd_sad64x64_avg_avx2, 8),
make_tuple(64, 64, &aom_highbd_sad64x64_avg_avx2, 10),
make_tuple(64, 64, &aom_highbd_sad64x64_avg_avx2, 12),
make_tuple(64, 32, &aom_highbd_sad64x32_avg_avx2, 8),
make_tuple(64, 32, &aom_highbd_sad64x32_avg_avx2, 10),
make_tuple(64, 32, &aom_highbd_sad64x32_avg_avx2, 12),
make_tuple(32, 64, &aom_highbd_sad32x64_avg_avx2, 8),
make_tuple(32, 64, &aom_highbd_sad32x64_avg_avx2, 10),
make_tuple(32, 64, &aom_highbd_sad32x64_avg_avx2, 12),
make_tuple(32, 32, &aom_highbd_sad32x32_avg_avx2, 8),
make_tuple(32, 32, &aom_highbd_sad32x32_avg_avx2, 10),
make_tuple(32, 32, &aom_highbd_sad32x32_avg_avx2, 12),
make_tuple(32, 16, &aom_highbd_sad32x16_avg_avx2, 8),
make_tuple(32, 16, &aom_highbd_sad32x16_avg_avx2, 10),
make_tuple(32, 16, &aom_highbd_sad32x16_avg_avx2, 12),
make_tuple(16, 32, &aom_highbd_sad16x32_avg_avx2, 8),
make_tuple(16, 32, &aom_highbd_sad16x32_avg_avx2, 10),
make_tuple(16, 32, &aom_highbd_sad16x32_avg_avx2, 12),
make_tuple(16, 16, &aom_highbd_sad16x16_avg_avx2, 8),
make_tuple(16, 16, &aom_highbd_sad16x16_avg_avx2, 10),
make_tuple(16, 16, &aom_highbd_sad16x16_avg_avx2, 12),
make_tuple(16, 8, &aom_highbd_sad16x8_avg_avx2, 8),
make_tuple(16, 8, &aom_highbd_sad16x8_avg_avx2, 10),
make_tuple(16, 8, &aom_highbd_sad16x8_avg_avx2, 12),
#endif
};
INSTANTIATE_TEST_CASE_P(AVX2, SADavgTest, ::testing::ValuesIn(avg_avx2_tests));
......@@ -967,6 +1073,43 @@ const SadMxNx4Param x4d_avx2_tests[] = {
make_tuple(32, 64, &aom_sad32x64x4d_avx2, -1),
make_tuple(64, 32, &aom_sad64x32x4d_avx2, -1),
make_tuple(32, 32, &aom_sad32x32x4d_avx2, -1),
#if CONFIG_AOM_HIGHBITDEPTH
#if CONFIG_EXT_PARTITION
make_tuple(128, 128, &aom_highbd_sad128x128x4d_avx2, 8),
make_tuple(128, 128, &aom_highbd_sad128x128x4d_avx2, 10),
make_tuple(128, 128, &aom_highbd_sad128x128x4d_avx2, 12),
make_tuple(128, 64, &aom_highbd_sad128x64x4d_avx2, 8),
make_tuple(128, 64, &aom_highbd_sad128x64x4d_avx2, 10),
make_tuple(128, 64, &aom_highbd_sad128x64x4d_avx2, 12),
make_tuple(64, 128, &aom_highbd_sad64x128x4d_avx2, 8),
make_tuple(64, 128, &aom_highbd_sad64x128x4d_avx2, 10),
make_tuple(64, 128, &aom_highbd_sad64x128x4d_avx2, 12),
#endif
make_tuple(64, 64, &aom_highbd_sad64x64x4d_avx2, 8),
make_tuple(64, 64, &aom_highbd_sad64x64x4d_avx2, 10),
make_tuple(64, 64, &aom_highbd_sad64x64x4d_avx2, 12),
make_tuple(64, 32, &aom_highbd_sad64x32x4d_avx2, 8),
make_tuple(64, 32, &aom_highbd_sad64x32x4d_avx2, 10),
make_tuple(64, 32, &aom_highbd_sad64x32x4d_avx2, 12),
make_tuple(32, 64, &aom_highbd_sad32x64x4d_avx2, 8),
make_tuple(32, 64, &aom_highbd_sad32x64x4d_avx2, 10),
make_tuple(32, 64, &aom_highbd_sad32x64x4d_avx2, 12),
make_tuple(32, 32, &aom_highbd_sad32x32x4d_avx2, 8),
make_tuple(32, 32, &aom_highbd_sad32x32x4d_avx2, 10),
make_tuple(32, 32, &aom_highbd_sad32x32x4d_avx2, 12),
make_tuple(32, 16, &aom_highbd_sad32x16x4d_avx2, 8),
make_tuple(32, 16, &aom_highbd_sad32x16x4d_avx2, 10),
make_tuple(32, 16, &aom_highbd_sad32x16x4d_avx2, 12),
make_tuple(16, 32, &aom_highbd_sad16x32x4d_avx2, 8),
make_tuple(16, 32, &aom_highbd_sad16x32x4d_avx2, 10),
make_tuple(16, 32, &aom_highbd_sad16x32x4d_avx2, 12),
make_tuple(16, 16, &aom_highbd_sad16x16x4d_avx2, 8),
make_tuple(16, 16, &aom_highbd_sad16x16x4d_avx2, 10),
make_tuple(16, 16, &aom_highbd_sad16x16x4d_avx2, 12),
make_tuple(16, 8, &aom_highbd_sad16x8x4d_avx2, 8),
make_tuple(16, 8, &aom_highbd_sad16x8x4d_avx2, 10),
make_tuple(16, 8, &aom_highbd_sad16x8x4d_avx2, 12),
#endif
};
INSTANTIATE_TEST_CASE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests));
#endif // HAVE_AVX2
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment