Commit 339ef0ce authored by Deb Mukherjee, committed by Gerrit Code Review

Merge "Adds masked variance and sad functions for wedge" into nextgenv2

parents 94256166 1d69ceee
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <assert.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "test/util.h"
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
using libvpx_test::ACMRandom;
namespace {
const int number_of_iterations = 500;
typedef unsigned int (*MaskedSADFunc)(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
const uint8_t *m, int m_stride);
typedef std::tr1::tuple<MaskedSADFunc, MaskedSADFunc> MaskedSADParam;
class MaskedSADTest : public ::testing::TestWithParam<MaskedSADParam> {
public:
virtual ~MaskedSADTest() {}
virtual void SetUp() {
maskedSAD_op_ = GET_PARAM(0);
ref_maskedSAD_op_ = GET_PARAM(1);
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
MaskedSADFunc maskedSAD_op_;
MaskedSADFunc ref_maskedSAD_op_;
};
TEST_P(MaskedSADTest, OperationCheck) {
unsigned int ref_ret, ret;
ACMRandom rnd(ACMRandom::DeterministicSeed());
DECLARE_ALIGNED(16, uint8_t, src_ptr[4096]);
DECLARE_ALIGNED(16, uint8_t, ref_ptr[4096]);
DECLARE_ALIGNED(16, uint8_t, msk_ptr[4096]);
int err_count = 0;
int first_failure = -1;
int src_stride = 64;
int ref_stride = 64;
int msk_stride = 64;
for (int i = 0; i < number_of_iterations; ++i) {
for (int j = 0; j < 4096; j++) {
src_ptr[j] = rnd.Rand8();
ref_ptr[j] = rnd.Rand8();
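// Mask values are kept in [0, 64]: roughly half the time a random value in
// [0, 63], otherwise exactly 64.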
msk_ptr[j] = ((rnd.Rand8()&0x7f) > 64) ? rnd.Rand8()&0x3f : 64;
assert(msk_ptr[j] <= 64);
}
ref_ret = ref_maskedSAD_op_(src_ptr, src_stride, ref_ptr, ref_stride,
msk_ptr, msk_stride);
ASM_REGISTER_STATE_CHECK(ret = maskedSAD_op_(src_ptr, src_stride,
ref_ptr, ref_stride,
msk_ptr, msk_stride));
if (ret != ref_ret) {
err_count++;
if (first_failure == -1)
first_failure = i;
}
}
EXPECT_EQ(0, err_count)
<< "Error: Masked SAD Test, C output doesn't match SSSE3 output. "
<< "First failed at test case " << first_failure;
}
#if CONFIG_VP9_HIGHBITDEPTH
typedef unsigned int (*HighbdMaskedSADFunc)(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
const uint8_t *m, int m_stride);
typedef std::tr1::tuple<HighbdMaskedSADFunc, HighbdMaskedSADFunc>
HighbdMaskedSADParam;
class HighbdMaskedSADTest : public ::testing::
TestWithParam<HighbdMaskedSADParam> {
public:
virtual ~HighbdMaskedSADTest() {}
virtual void SetUp() {
maskedSAD_op_ = GET_PARAM(0);
ref_maskedSAD_op_ = GET_PARAM(1);
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
HighbdMaskedSADFunc maskedSAD_op_;
HighbdMaskedSADFunc ref_maskedSAD_op_;
};
TEST_P(HighbdMaskedSADTest, OperationCheck) {
unsigned int ref_ret, ret;
ACMRandom rnd(ACMRandom::DeterministicSeed());
DECLARE_ALIGNED(16, uint16_t, src_ptr[4096]);
DECLARE_ALIGNED(16, uint16_t, ref_ptr[4096]);
DECLARE_ALIGNED(16, uint8_t, msk_ptr[4096]);
uint8_t* src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
uint8_t* ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
int err_count = 0;
int first_failure = -1;
int src_stride = 64;
int ref_stride = 64;
int msk_stride = 64;
for (int i = 0; i < number_of_iterations; ++i) {
for (int j = 0; j < 4096; j++) {
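// 12-bit sample values; the mask is generated the same way as in the
// 8-bit-depth test above.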
src_ptr[j] = rnd.Rand16()&0xfff;
ref_ptr[j] = rnd.Rand16()&0xfff;
msk_ptr[j] = ((rnd.Rand8()&0x7f) > 64) ? rnd.Rand8()&0x3f : 64;
}
ref_ret = ref_maskedSAD_op_(src8_ptr, src_stride, ref8_ptr, ref_stride,
msk_ptr, msk_stride);
ASM_REGISTER_STATE_CHECK(ret = maskedSAD_op_(src8_ptr, src_stride,
ref8_ptr, ref_stride,
msk_ptr, msk_stride));
if (ret != ref_ret) {
err_count++;
if (first_failure == -1)
first_failure = i;
}
}
EXPECT_EQ(0, err_count)
<< "Error: High BD Masked SAD Test, C output doesn't match SSSE3 output. "
<< "First failed at test case " << first_failure;
}
#endif // CONFIG_VP9_HIGHBITDEPTH
using std::tr1::make_tuple;
#if HAVE_SSSE3
INSTANTIATE_TEST_CASE_P(
SSSE3_C_COMPARE, MaskedSADTest,
::testing::Values(
make_tuple(&vpx_masked_sad64x64_ssse3,
&vpx_masked_sad64x64_c),
make_tuple(&vpx_masked_sad64x32_ssse3,
&vpx_masked_sad64x32_c),
make_tuple(&vpx_masked_sad32x64_ssse3,
&vpx_masked_sad32x64_c),
make_tuple(&vpx_masked_sad32x32_ssse3,
&vpx_masked_sad32x32_c),
make_tuple(&vpx_masked_sad32x16_ssse3,
&vpx_masked_sad32x16_c),
make_tuple(&vpx_masked_sad16x32_ssse3,
&vpx_masked_sad16x32_c),
make_tuple(&vpx_masked_sad16x16_ssse3,
&vpx_masked_sad16x16_c),
make_tuple(&vpx_masked_sad16x8_ssse3,
&vpx_masked_sad16x8_c),
make_tuple(&vpx_masked_sad8x16_ssse3,
&vpx_masked_sad8x16_c),
make_tuple(&vpx_masked_sad8x8_ssse3,
&vpx_masked_sad8x8_c),
make_tuple(&vpx_masked_sad8x4_ssse3,
&vpx_masked_sad8x4_c),
make_tuple(&vpx_masked_sad4x8_ssse3,
&vpx_masked_sad4x8_c),
make_tuple(&vpx_masked_sad4x4_ssse3,
&vpx_masked_sad4x4_c)));
#if CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
SSSE3_C_COMPARE, HighbdMaskedSADTest,
::testing::Values(
make_tuple(&vpx_highbd_masked_sad64x64_ssse3,
&vpx_highbd_masked_sad64x64_c),
make_tuple(&vpx_highbd_masked_sad64x32_ssse3,
&vpx_highbd_masked_sad64x32_c),
make_tuple(&vpx_highbd_masked_sad32x64_ssse3,
&vpx_highbd_masked_sad32x64_c),
make_tuple(&vpx_highbd_masked_sad32x32_ssse3,
&vpx_highbd_masked_sad32x32_c),
make_tuple(&vpx_highbd_masked_sad32x16_ssse3,
&vpx_highbd_masked_sad32x16_c),
make_tuple(&vpx_highbd_masked_sad16x32_ssse3,
&vpx_highbd_masked_sad16x32_c),
make_tuple(&vpx_highbd_masked_sad16x16_ssse3,
&vpx_highbd_masked_sad16x16_c),
make_tuple(&vpx_highbd_masked_sad16x8_ssse3,
&vpx_highbd_masked_sad16x8_c),
make_tuple(&vpx_highbd_masked_sad8x16_ssse3,
&vpx_highbd_masked_sad8x16_c),
make_tuple(&vpx_highbd_masked_sad8x8_ssse3,
&vpx_highbd_masked_sad8x8_c),
make_tuple(&vpx_highbd_masked_sad8x4_ssse3,
&vpx_highbd_masked_sad8x4_c),
make_tuple(&vpx_highbd_masked_sad4x8_ssse3,
&vpx_highbd_masked_sad4x8_c),
make_tuple(&vpx_highbd_masked_sad4x4_ssse3,
&vpx_highbd_masked_sad4x4_c)));
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // HAVE_SSSE3
} // namespace
@@ -168,6 +168,11 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_dct_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_ANS) += vp10_ans_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += sum_squares_test.cc
ifeq ($(CONFIG_EXT_INTER),yes)
LIBVPX_TEST_SRCS-$(HAVE_SSSE3) += masked_variance_test.cc
LIBVPX_TEST_SRCS-$(HAVE_SSSE3) += masked_sad_test.cc
endif
endif # VP10
## Multi-codec / unconditional whitebox tests.
@@ -316,3 +316,105 @@ highbd_sadMxNxK(4, 4, 8)
highbd_sadMxNx4D(4, 4)
#endif // CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_VP10 && CONFIG_EXT_INTER
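// Reference (C) masked SAD: each absolute difference is weighted by the
// corresponding mask value (assumed to lie in [0, 64]), and the weighted sum
// is then scaled back down by the maximum mask weight.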
static INLINE unsigned int masked_sad(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
const uint8_t *m, int m_stride,
int width, int height) {
int y, x;
unsigned int sad = 0;
for (y = 0; y < height; y++) {
for (x = 0; x < width; x++)
sad += m[x] * abs(a[x] - b[x]);
a += a_stride;
b += b_stride;
m += m_stride;
}
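// Undo the mask weighting: rounded division by the maximum mask value (64).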
sad = (sad + 31) >> 6;
return sad;
}
#define MASKSADMxN(m, n) \
unsigned int vpx_masked_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride, \
const uint8_t *msk, int msk_stride) { \
return masked_sad(src, src_stride, ref, ref_stride, msk, msk_stride, m, n); \
}
#if CONFIG_EXT_PARTITION
MASKSADMxN(128, 128)
MASKSADMxN(128, 64)
MASKSADMxN(64, 128)
#endif // CONFIG_EXT_PARTITION
MASKSADMxN(64, 64)
MASKSADMxN(64, 32)
MASKSADMxN(32, 64)
MASKSADMxN(32, 32)
MASKSADMxN(32, 16)
MASKSADMxN(16, 32)
MASKSADMxN(16, 16)
MASKSADMxN(16, 8)
MASKSADMxN(8, 16)
MASKSADMxN(8, 8)
MASKSADMxN(8, 4)
MASKSADMxN(4, 8)
MASKSADMxN(4, 4)
#if CONFIG_VP9_HIGHBITDEPTH
static INLINE unsigned int highbd_masked_sad(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
const uint8_t *m, int m_stride,
int width, int height) {
int y, x;
unsigned int sad = 0;
const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
for (y = 0; y < height; y++) {
for (x = 0; x < width; x++)
sad += m[x] * abs(a[x] - b[x]);
a += a_stride;
b += b_stride;
m += m_stride;
}
sad = (sad + 31) >> 6;
return sad;
}
#define HIGHBD_MASKSADMXN(m, n) \
unsigned int vpx_highbd_masked_sad##m##x##n##_c(const uint8_t *src, \
int src_stride, \
const uint8_t *ref, \
int ref_stride, \
const uint8_t *msk, \
int msk_stride) { \
return highbd_masked_sad(src, src_stride, ref, ref_stride, \
msk, msk_stride, m, n); \
}
#if CONFIG_EXT_PARTITION
HIGHBD_MASKSADMXN(128, 128)
HIGHBD_MASKSADMXN(128, 64)
HIGHBD_MASKSADMXN(64, 128)
#endif // CONFIG_EXT_PARTITION
HIGHBD_MASKSADMXN(64, 64)
HIGHBD_MASKSADMXN(64, 32)
HIGHBD_MASKSADMXN(32, 64)
HIGHBD_MASKSADMXN(32, 32)
HIGHBD_MASKSADMXN(32, 16)
HIGHBD_MASKSADMXN(16, 32)
HIGHBD_MASKSADMXN(16, 16)
HIGHBD_MASKSADMXN(16, 8)
HIGHBD_MASKSADMXN(8, 16)
HIGHBD_MASKSADMXN(8, 8)
HIGHBD_MASKSADMXN(8, 4)
HIGHBD_MASKSADMXN(4, 8)
HIGHBD_MASKSADMXN(4, 4)
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // CONFIG_VP10 && CONFIG_EXT_INTER
@@ -15,8 +15,9 @@
#include "vpx/vpx_integer.h"
#include "vpx_dsp/variance.h"
#include "vpx_dsp/vpx_filter.h"
-static const uint8_t bilinear_filters[8][2] = {
+const uint8_t vpx_bilinear_filters[BIL_SUBPEL_SHIFTS][2] = {
{ 128, 0 },
{ 112, 16 },
{ 96, 32 },
@@ -175,9 +176,9 @@ uint32_t vpx_sub_pixel_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
uint8_t temp2[H * W]; \
\
var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
-bilinear_filters[xoffset]); \
+vpx_bilinear_filters[xoffset]); \
var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-bilinear_filters[yoffset]); \
+vpx_bilinear_filters[yoffset]); \
\
return vpx_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \
}
@@ -195,9 +196,9 @@ uint32_t vpx_sub_pixel_avg_variance##W##x##H##_c(const uint8_t *a, \
DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
\
var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
-bilinear_filters[xoffset]); \
+vpx_bilinear_filters[xoffset]); \
var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-bilinear_filters[yoffset]); \
+vpx_bilinear_filters[yoffset]); \
\
vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
\
@@ -500,9 +501,9 @@ uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_c( \
uint16_t temp2[H * W]; \
\
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-W, bilinear_filters[xoffset]); \
+W, vpx_bilinear_filters[xoffset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-bilinear_filters[yoffset]); \
+vpx_bilinear_filters[yoffset]); \
\
return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
dst_stride, sse); \
@@ -517,9 +518,9 @@ uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_c( \
uint16_t temp2[H * W]; \
\
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-W, bilinear_filters[xoffset]); \
+W, vpx_bilinear_filters[xoffset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-bilinear_filters[yoffset]); \
+vpx_bilinear_filters[yoffset]); \
\
return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
W, dst, dst_stride, sse); \
@@ -534,9 +535,9 @@ uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_c( \
uint16_t temp2[H * W]; \
\
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-W, bilinear_filters[xoffset]); \
+W, vpx_bilinear_filters[xoffset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-bilinear_filters[yoffset]); \
+vpx_bilinear_filters[yoffset]); \
\
return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
W, dst, dst_stride, sse); \
@@ -554,9 +555,9 @@ uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \
DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-W, bilinear_filters[xoffset]); \
+W, vpx_bilinear_filters[xoffset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-bilinear_filters[yoffset]); \
+vpx_bilinear_filters[yoffset]); \
\
vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
CONVERT_TO_BYTEPTR(temp2), W); \
@@ -576,9 +577,9 @@ uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-W, bilinear_filters[xoffset]); \
+W, vpx_bilinear_filters[xoffset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-bilinear_filters[yoffset]); \
+vpx_bilinear_filters[yoffset]); \
\
vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
CONVERT_TO_BYTEPTR(temp2), W); \
@@ -598,9 +599,9 @@ uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-W, bilinear_filters[xoffset]); \
+W, vpx_bilinear_filters[xoffset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-bilinear_filters[yoffset]); \
+vpx_bilinear_filters[yoffset]); \
\
vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
CONVERT_TO_BYTEPTR(temp2), W); \
@@ -654,3 +655,323 @@ void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
}
}
#endif // CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_VP10 && CONFIG_EXT_INTER
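// Masked variance helpers: each difference is weighted by the per-pixel mask
// value (assumed to lie in [0, 64]) before being accumulated.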
void masked_variance(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
const uint8_t *m, int m_stride,
int w, int h, unsigned int *sse, int *sum) {
int i, j;
int64_t sum64 = 0;
uint64_t sse64 = 0;
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
const int diff = (a[j] - b[j]) * (m[j]);
sum64 += diff;
sse64 += diff * diff;
}
a += a_stride;
b += b_stride;
m += m_stride;
}
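// Undo the mask weighting: the sum is scaled back by 64 (>> 6) and the sum
// of squares by 64 * 64 (>> 12), with rounding.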
*sum = (sum64 >= 0) ? ((sum64 + 31) >> 6) : -((-sum64 + 31) >> 6);
*sse = (sse64 + 2047) >> 12;
}
#define MASK_VAR(W, H) \
unsigned int vpx_masked_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
const uint8_t *b, int b_stride, \
const uint8_t *m, int m_stride, \
unsigned int *sse) { \
int sum; \
masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
}
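// The sub-pixel variants first apply the separable bilinear filter
// (horizontal, then vertical) to the source block and then compute the masked
// variance of the filtered block against dst.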
#define MASK_SUBPIX_VAR(W, H) \
unsigned int vpx_masked_sub_pixel_variance##W##x##H##_c( \
const uint8_t *src, int src_stride, \
int xoffset, int yoffset, \
const uint8_t *dst, int dst_stride, \
const uint8_t *msk, int msk_stride, \
unsigned int *sse) { \
uint16_t fdata3[(H + 1) * W]; \
uint8_t temp2[H * W]; \
\
var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \
vpx_bilinear_filters[xoffset]); \
var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
vpx_bilinear_filters[yoffset]); \
\
return vpx_masked_variance##W##x##H##_c(temp2, W, dst, dst_stride, \
msk, msk_stride, sse); \
}
MASK_VAR(4, 4)
MASK_SUBPIX_VAR(4, 4)
MASK_VAR(4, 8)
MASK_SUBPIX_VAR(4, 8)
MASK_VAR(8, 4)
MASK_SUBPIX_VAR(8, 4)
MASK_VAR(8, 8)
MASK_SUBPIX_VAR(8, 8)
MASK_VAR(8, 16)
MASK_SUBPIX_VAR(8, 16)
MASK_VAR(16, 8)
MASK_SUBPIX_VAR(16, 8)
MASK_VAR(16, 16)
MASK_SUBPIX_VAR(16, 16)
MASK_VAR(16, 32)
MASK_SUBPIX_VAR(16, 32)
MASK_VAR(32, 16)
MASK_SUBPIX_VAR(32, 16)
MASK_VAR(32, 32)
MASK_SUBPIX_VAR(32, 32)
MASK_VAR(32, 64)
MASK_SUBPIX_VAR(32, 64)
MASK_VAR(64, 32)
MASK_SUBPIX_VAR(64, 32)
MASK_VAR(64, 64)
MASK_SUBPIX_VAR(64, 64)
#if CONFIG_EXT_PARTITION
MASK_VAR(64, 128)
MASK_SUBPIX_VAR(64, 128)
MASK_VAR(128, 64)
MASK_SUBPIX_VAR(128, 64)
MASK_VAR(128, 128)
MASK_SUBPIX_VAR(128, 128)
#endif // CONFIG_EXT_PARTITION
#if CONFIG_VP9_HIGHBITDEPTH
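// High-bit-depth variant: the sum of squares is accumulated in 64 bits, since
// a squared mask-weighted difference of 12-bit inputs already overflows
// 32 bits.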
void highbd_masked_variance64(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
const uint8_t *m, int m_stride,
int w, int h,
uint64_t *sse64, int *sum) {
int i, j;
uint16_t *a = CONVERT_TO_SHORTPTR(a8);
uint16_t *b = CONVERT_TO_SHORTPTR(b8);
int64_t sum64 = 0;
*sse64 = 0;
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
const int diff = (a[j] - b[j]) * (m[j]);
sum64 += diff;
*sse64 += (int64_t)diff * diff;
}
a += a_stride;
b += b_stride;
m += m_stride;
}
*sum = (sum64 >= 0) ? ((sum64 + 31) >> 6) : -((-sum64 + 31) >> 6);
*sse64 = (*sse64 + 2047) >> 12;
}
void highbd_masked_variance(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
const uint8_t *m, int m_stride,
int w, int h,
unsigned int *sse, int *sum) {
uint64_t sse64;
highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride,
w, h, &sse64, sum);
*sse = (unsigned int)sse64;
}
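// The 10- and 12-bit variants rescale the sum and SSE back to 8-bit precision
// (sum by 2 / 4 bits, SSE by 4 / 8 bits) before the variance is formed.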
void highbd_10_masked_variance(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
const uint8_t *m, int m_stride,
int w, int h,
unsigned int *sse, int *sum) {
uint64_t sse64;
highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride,
w, h, &sse64, sum);
*sum = ROUND_POWER_OF_TWO(*sum, 2);
*sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
}
void highbd_12_masked_variance(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
const uint8_t *m, int m_stride,
int w, int h,
unsigned int *sse, int *sum) {
uint64_t sse64;
highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride,
w, h, &sse64, sum);
*sum = ROUND_POWER_OF_TWO(*sum, 4);
*sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
}
#define HIGHBD_MASK_VAR(W, H) \
unsigned int vpx_highbd_masked_variance##W##x##H##_c(const uint8_t *a, \
int a_stride, \
const uint8_t *b, \
int b_stride, \
const uint8_t *m, \
int m_stride, \
unsigned int *sse) { \
int sum; \
highbd_masked_variance(a, a_stride, b, b_stride, m, m_stride, \
W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
} \
\
unsigned int vpx_highbd_10_masked_variance##W##x##H##_c(const uint8_t *a, \
int a_stride, \
const uint8_t *b, \
int b_stride, \
const uint8_t *m, \
int m_stride, \
unsigned int *sse) { \
int sum; \
highbd_10_masked_variance(a, a_stride, b, b_stride, m, m_stride, \
W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
} \
\
unsigned int vpx_highbd_12_masked_variance##W##x##H##_c(const uint8_t *a, \
int a_stride, \
const uint8_t *b, \
int b_stride, \
const uint8_t *m, \
int m_stride, \
unsigned int *sse) { \
int sum; \
highbd_12_masked_variance(a, a_stride, b, b_stride, m, m_stride, \
W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
}
#define HIGHBD_MASK_SUBPIX_VAR(W, H) \
unsigned int vpx_highbd_masked_sub_pixel_variance##W##x##H##_c( \
const uint8_t *src, int src_stride, \
int xoffset, int yoffset, \
const uint8_t *dst, int dst_stride, \
const uint8_t *msk, int msk_stride, \
unsigned int *sse) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
\
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, \
H + 1, W, \
vpx_bilinear_filters[xoffset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
vpx_bilinear_filters[yoffset]); \
\
return vpx_highbd_masked_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
W, dst, dst_stride, \
msk, msk_stride, sse); \
} \
\
unsigned int vpx_highbd_10_masked_sub_pixel_variance##W##x##H##_c( \
const uint8_t *src, int src_stride, \
int xoffset, int yoffset, \
const uint8_t *dst, int dst_stride, \
const uint8_t *msk, int msk_stride, \
unsigned int *sse) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
\
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, \
H + 1, W, \
vpx_bilinear_filters[xoffset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
vpx_bilinear_filters[yoffset]); \
\
return vpx_highbd_10_masked_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
W, dst, dst_stride, \
msk, msk_stride, sse); \
} \
\
unsigned int vpx_highbd_12_masked_sub_pixel_variance##W##x##H##_c( \
const uint8_t *src, int src_stride, \
int xoffset, int yoffset, \
const uint8_t *dst, int dst_stride, \
const uint8_t *msk, int msk_stride, \
unsigned int *sse) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
\
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, \
H + 1, W, \
vpx_bilinear_filters[xoffset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
vpx_bilinear_filters[yoffset]); \
\
return vpx_highbd_12_masked_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
W, dst, dst_stride, \
msk, msk_stride, sse); \
}
HIGHBD_MASK_VAR(4, 4)
HIGHBD_MASK_SUBPIX_VAR(4, 4)
HIGHBD_MASK_VAR(4, 8)
HIGHBD_MASK_SUBPIX_VAR(4, 8)
HIGHBD_MASK_VAR(8, 4)
HIGHBD_MASK_SUBPIX_VAR(8, 4)
HIGHBD_MASK_VAR(8, 8)
HIGHBD_MASK_SUBPIX_VAR(8, 8)
HIGHBD_MASK_VAR(8, 16)
HIGHBD_MASK_SUBPIX_VAR(8, 16)
HIGHBD_MASK_VAR(16, 8)
HIGHBD_MASK_SUBPIX_VAR(16, 8)
HIGHBD_MASK_VAR(16, 16)
HIGHBD_MASK_SUBPIX_VAR(16, 16)
HIGHBD_MASK_VAR(16, 32)
HIGHBD_MASK_SUBPIX_VAR(16, 32)
HIGHBD_MASK_VAR(32, 16)
HIGHBD_MASK_SUBPIX_VAR(32, 16)
HIGHBD_MASK_VAR(32, 32)
HIGHBD_MASK_SUBPIX_VAR(32, 32)
HIGHBD_MASK_VAR(32, 64)
HIGHBD_MASK_SUBPIX_VAR(32, 64)
HIGHBD_MASK_VAR(64, 32)
HIGHBD_MASK_SUBPIX_VAR(64, 32)
HIGHBD_MASK_VAR(64, 64)
HIGHBD_MASK_SUBPIX_VAR(64, 64)
#if CONFIG_EXT_PARTITION
HIGHBD_MASK_VAR(64, 128)
HIGHBD_MASK_SUBPIX_VAR(64, 128)
HIGHBD_MASK_VAR(128, 64)
HIGHBD_MASK_SUBPIX_VAR(128, 64)
HIGHBD_MASK_VAR(128, 128)
HIGHBD_MASK_SUBPIX_VAR(128, 128)
#endif // CONFIG_EXT_PARTITION
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // CONFIG_VP10 && CONFIG_EXT_INTER
@@ -293,6 +293,13 @@ DSP_SRCS-$(HAVE_SSE4_1) += x86/sad_sse4.asm
DSP_SRCS-$(HAVE_AVX2) += x86/sad4d_avx2.c
DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c
ifeq ($(CONFIG_VP10_ENCODER),yes)
ifeq ($(CONFIG_EXT_INTER),yes)
DSP_SRCS-$(HAVE_SSSE3) += x86/masked_sad_intrin_ssse3.c
DSP_SRCS-$(HAVE_SSSE3) += x86/masked_variance_intrin_ssse3.c
endif #CONFIG_EXT_INTER
endif #CONFIG_VP10_ENCODER
ifeq ($(CONFIG_USE_X86INC),yes)
DSP_SRCS-$(HAVE_SSE) += x86/sad4d_sse2.asm
DSP_SRCS-$(HAVE_SSE) += x86/sad_sse2.asm
@@ -27,6 +27,10 @@ extern "C" {
typedef int16_t InterpKernel[SUBPEL_TAPS];
#define BIL_SUBPEL_BITS 3
#define BIL_SUBPEL_SHIFTS (1 << BIL_SUBPEL_BITS)
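// Bilinear filter taps used by the sub-pixel variance code; presumably
// exported here so that other variance implementations (e.g. the masked
// variants) can share the same table.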
extern const uint8_t vpx_bilinear_filters[BIL_SUBPEL_SHIFTS][2];
#ifdef __cplusplus
} // extern "C"
#endif
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <stdlib.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include "vpx_ports/mem.h"
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
static INLINE __m128i width8_load_2rows(const uint8_t *ptr, int stride) {
__m128i temp1 = _mm_loadl_epi64((const __m128i*)ptr);
__m128i temp2 = _mm_loadl_epi64((const __m128i*)(ptr + stride));
return _mm_unpacklo_epi64(temp1, temp2);
}
static INLINE __m128i width4_load_4rows(const uint8_t *ptr, int stride) {
__m128i temp1 = _mm_cvtsi32_si128(*(const uint32_t*)ptr);
__m128i temp2 = _mm_cvtsi32_si128(*(const uint32_t*)(ptr + stride));
__m128i temp3 = _mm_unpacklo_epi32(temp1, temp2);
temp1 = _mm_cvtsi32_si128(*(const uint32_t*)(ptr + stride * 2));
temp2 = _mm_cvtsi32_si128(*(const uint32_t*)(ptr + stride * 3));
temp1 = _mm_unpacklo_epi32(temp1, temp2);
return _mm_unpacklo_epi64(temp3, temp1);
}
static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride,
const uint8_t *b_ptr, int b_stride,
const uint8_t *m_ptr, int m_stride,
int width, int height);
static INLINE unsigned int masked_sad8xh_ssse3(const uint8_t *a_ptr,
int a_stride,
const uint8_t *b_ptr,
int b_stride,
const uint8_t *m_ptr,
int m_stride,
int height);
static INLINE unsigned int masked_sad4xh_ssse3(const uint8_t *a_ptr,
int a_stride,
const uint8_t *b_ptr,
int b_stride,
const uint8_t *m_ptr,
int m_stride,
int height);
#define MASKSADMXN_SSSE3(m, n) \
unsigned int vpx_masked_sad##m##x##n##_ssse3(const uint8_t *src, \
int src_stride, \
const uint8_t *ref, \
int ref_stride, \
const uint8_t *msk, \
int msk_stride) { \
return masked_sad_ssse3(src, src_stride, ref, ref_stride, msk, msk_stride, \
m, n); \
}
MASKSADMXN_SSSE3(64, 64)
MASKSADMXN_SSSE3(64, 32)
MASKSADMXN_SSSE3(32, 64)
MASKSADMXN_SSSE3(32, 32)
MASKSADMXN_SSSE3(32, 16)
MASKSADMXN_SSSE3(16, 32)
MASKSADMXN_SSSE3(16, 16)
MASKSADMXN_SSSE3(16, 8)
#define MASKSAD8XN_SSSE3(n) \
unsigned int vpx_masked_sad8x##n##_ssse3(const uint8_t *src, \
int src_stride, \
const uint8_t *ref, \
int ref_stride, \
const uint8_t *msk, \
int msk_stride) { \
return masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, msk, \
msk_stride, n); \
}
MASKSAD8XN_SSSE3(16)
MASKSAD8XN_SSSE3(8)
MASKSAD8XN_SSSE3(4)
#define MASKSAD4XN_SSSE3(n) \
unsigned int vpx_masked_sad4x##n##_ssse3(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride, \
const uint8_t *msk, int msk_stride) { \
return masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk, \
msk_stride, n); \
}
MASKSAD4XN_SSSE3(8)
MASKSAD4XN_SSSE3(4)
// For width a multiple of 16
// Assumes values in m are <=64 and w = 16, 32, or 64
static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride,
const uint8_t *b_ptr, int b_stride,
const uint8_t *m_ptr, int m_stride,
int width, int height) {
int y, x;
__m128i a, b, m, temp1, temp2;
__m128i res = _mm_setzero_si128();
__m128i one = _mm_set1_epi16(1);
// For each row
for (y = 0; y < height; y++) {
// Covering the full width
for (x = 0; x < width; x += 16) {
// Load a, b, m in xmm registers
a = _mm_loadu_si128((const __m128i*)(a_ptr + x));
b = _mm_loadu_si128((const __m128i*)(b_ptr + x));
m = _mm_loadu_si128((const __m128i*)(m_ptr + x));
// Calculate the difference between a & b
temp1 = _mm_subs_epu8(a, b);
temp2 = _mm_subs_epu8(b, a);
temp1 = _mm_or_si128(temp1, temp2);
// Multiply by m and add together
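// maddubs multiplies the unsigned absolute differences by the (signed) mask
// bytes and sums adjacent pairs into 16 bits; since the diffs are <= 255 and
// the mask values <= 64, each pair sum is at most 2 * 255 * 64 = 32640 and
// cannot overflow.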
temp2 = _mm_maddubs_epi16(temp1, m);
// Pad out row result to 32 bit integers & add to running total
res = _mm_add_epi32(res, _mm_madd_epi16(temp2, one));
}
// Move onto the next row
a_ptr += a_stride;
b_ptr += b_stride;
m_ptr += m_stride;
}
res = _mm_hadd_epi32(res, _mm_setzero_si128());
res = _mm_hadd_epi32(res, _mm_setzero_si128());
// sad = (sad + 31) >> 6;
return (_mm_cvtsi128_si32(res) + 31) >> 6;
}
static INLINE unsigned int masked_sad8xh_ssse3(const uint8_t *a_ptr,
int a_stride,
const uint8_t *b_ptr,
int b_stride,
const uint8_t *m_ptr,
int m_stride,
int height) {
int y;
__m128i a, b, m, temp1, temp2, row_res;
__m128i res = _mm_setzero_si128();
__m128i one = _mm_set1_epi16(1);
// Add the masked SAD for 2 rows at a time
for (y = 0; y < height; y += 2) {
// Load a, b, m in xmm registers
a = width8_load_2rows(a_ptr, a_stride);
b = width8_load_2rows(b_ptr, b_stride);
m = width8_load_2rows(m_ptr, m_stride);
// Calculate the difference between a & b
temp1 = _mm_subs_epu8(a, b);
temp2 = _mm_subs_epu8(b, a);
temp1 = _mm_or_si128(temp1, temp2);
// Multiply by m and add together
row_res = _mm_maddubs_epi16(temp1, m);
// Pad out row result to 32 bit integers & add to running total
res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one));
// Move onto the next rows
a_ptr += a_stride * 2;
b_ptr += b_stride * 2;
m_ptr += m_stride * 2;
}
res = _mm_hadd_epi32(res, _mm_setzero_si128());
res = _mm_hadd_epi32(res, _mm_setzero_si128());
// sad = (sad + 31) >> 6;
return (_mm_cvtsi128_si32(res) + 31) >> 6;
}
static INLINE unsigned int masked_sad4xh_ssse3(const uint8_t *a_ptr,
int a_stride,
const uint8_t *b_ptr,
int b_stride,
const uint8_t *m_ptr,
int m_stride,
int height) {
int y;
__m128i a, b, m, temp1, temp2, row_res;
__m128i res = _mm_setzero_si128();
__m128i one = _mm_set1_epi16(1);
// Add the masked SAD for 4 rows at a time
for (y = 0; y < height; y += 4) {
// Load a, b, m in xmm registers
a = width4_load_4rows(a_ptr, a_stride);
b = width4_load_4rows(b_ptr, b_stride);
m = width4_load_4rows(m_ptr, m_stride);
// Calculate the difference between a & b
temp1 = _mm_subs_epu8(a, b);
temp2 = _mm_subs_epu8(b, a);
temp1 = _mm_or_si128(temp1, temp2);
// Multiply by m and add together
row_res = _mm_maddubs_epi16(temp1, m);
// Pad out row result to 32 bit integers & add to running total
res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one));
// Move onto the next rows
a_ptr += a_stride * 4;
b_ptr += b_stride * 4;
m_ptr += m_stride * 4;
}
// Horizontally add the 32-bit partial sums
res = _mm_hadd_epi32(res, _mm_setzero_si128());
res = _mm_hadd_epi32(res, _mm_setzero_si128());
// sad = (sad + 31) >> 6;
return (_mm_cvtsi128_si32(res) + 31) >> 6;
}
#if CONFIG_VP9_HIGHBITDEPTH
static INLINE __m128i highbd_width4_load_2rows(const uint16_t *ptr,
int stride) {
__m128i temp1 = _mm_loadl_epi64((const __m128i*)ptr);
__m128i temp2 = _mm_loadl_epi64((const __m128i*)(ptr + stride));
return _mm_unpacklo_epi64(temp1, temp2);
}
static INLINE unsigned int highbd_masked_sad_ssse3(const uint8_t *a8_ptr,
int a_stride,
const uint8_t *b8_ptr,
int b_stride,
const uint8_t *m_ptr,
int m_stride,
int width, int height);
static INLINE unsigned int highbd_masked_sad4xh_ssse3(const uint8_t *a8_ptr,
int a_stride,
const uint8_t *b8_ptr,
int b_stride,
const uint8_t *m_ptr,
int m_stride,
int height);
#define HIGHBD_MASKSADMXN_SSSE3(m, n) \
unsigned int vpx_highbd_masked_sad##m##x##n##_ssse3(const uint8_t *src, \
int src_stride, \
const uint8_t *ref, \
int ref_stride, \
const uint8_t *msk, \
int msk_stride) { \
return highbd_masked_sad_ssse3(src, src_stride, ref, ref_stride, msk, \
msk_stride, m, n); \
}
HIGHBD_MASKSADMXN_SSSE3(64, 64)
HIGHBD_MASKSADMXN_SSSE3(64, 32)
HIGHBD_MASKSADMXN_SSSE3(32, 64)
HIGHBD_MASKSADMXN_SSSE3(32, 32)
HIGHBD_MASKSADMXN_SSSE3(32, 16)
HIGHBD_MASKSADMXN_SSSE3(16, 32)
HIGHBD_MASKSADMXN_SSSE3(16, 16)
HIGHBD_MASKSADMXN_SSSE3(16, 8)
HIGHBD_MASKSADMXN_SSSE3(8, 16)
HIGHBD_MASKSADMXN_SSSE3(8, 8)
HIGHBD_MASKSADMXN_SSSE3(8, 4)
#define HIGHBD_MASKSAD4XN_SSSE3(n) \
unsigned int vpx_highbd_masked_sad4x##n##_ssse3(const uint8_t *src, \
int src_stride, \
const uint8_t *ref, \
int ref_stride, \
const uint8_t *msk, \
int msk_stride) { \
return highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk, \
msk_stride, n); \
}
HIGHBD_MASKSAD4XN_SSSE3(8)
HIGHBD_MASKSAD4XN_SSSE3(4)
// For width a multiple of 8
// Assumes values in m are <=64
static INLINE unsigned int highbd_masked_sad_ssse3(const uint8_t *a8_ptr,
int a_stride,
const uint8_t *b8_ptr,
int b_stride,
const uint8_t *m_ptr,
int m_stride,
int width, int height) {
int y, x;
__m128i a, b, m, temp1, temp2;
const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr);
const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr);
__m128i res = _mm_setzero_si128();
// For each row
for (y = 0; y < height; y++) {
// Covering the full width
for (x = 0; x < width; x += 8) {
// Load a, b, m in xmm registers
a = _mm_loadu_si128((const __m128i*)(a_ptr + x));
b = _mm_loadu_si128((const __m128i*)(b_ptr + x));
m = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(m_ptr + x)),
_mm_setzero_si128());
// Calculate the difference between a & b
temp1 = _mm_subs_epu16(a, b);
temp2 = _mm_subs_epu16(b, a);
temp1 = _mm_or_si128(temp1, temp2);
// Add result of multiplying by m and add pairs together to running total
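// The differences and mask values are 16-bit lanes here, so madd is used
// instead; each 32-bit lane holds the sum of two products, which stays well
// within range even for 12-bit input.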
res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m));
}
// Move onto the next row
a_ptr += a_stride;
b_ptr += b_stride;
m_ptr += m_stride;
}
res = _mm_hadd_epi32(res, _mm_setzero_si128());
res = _mm_hadd_epi32(res, _mm_setzero_si128());
// sad = (sad + 31) >> 6;
return (_mm_cvtsi128_si32(res) + 31) >> 6;
}
static INLINE unsigned int highbd_masked_sad4xh_ssse3(const uint8_t *a8_ptr,
int a_stride,
const uint8_t *b8_ptr,
int b_stride,
const uint8_t *m_ptr,
int m_stride,
int height) {
int y;
__m128i a, b, m, temp1, temp2;
const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr);
const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr);
__m128i res = _mm_setzero_si128();
// Add the masked SAD for 2 rows at a time
for (y = 0; y < height; y += 2) {
// Load a, b, m in xmm registers
a = highbd_width4_load_2rows(a_ptr, a_stride);
b = highbd_width4_load_2rows(b_ptr, b_stride);
temp1 = _mm_loadl_epi64((const __m128i*)m_ptr);
temp2 = _mm_loadl_epi64((const __m128i*)(m_ptr + m_stride));
m = _mm_unpacklo_epi8(_mm_unpacklo_epi32(temp1, temp2),
_mm_setzero_si128());
// Calculate the difference between a & b
temp1 = _mm_subs_epu16(a, b);
temp2 = _mm_subs_epu16(b, a);
temp1 = _mm_or_si128(temp1, temp2);
// Multiply by m and add together
res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m));
// Move onto the next rows
a_ptr += a_stride * 2;
b_ptr += b_stride * 2;
m_ptr += m_stride * 2;
}
res = _mm_hadd_epi32(res, _mm_setzero_si128());
res = _mm_hadd_epi32(res, _mm_setzero_si128());
// sad = (sad + 31) >> 6;
return (_mm_cvtsi128_si32(res) + 31) >> 6;
}
#endif // CONFIG_VP9_HIGHBITDEPTH