Commit c804e0df authored by Geza Lore

Cleanup obmc_sad function prototypes.

Name 'wsrc', 'mask' and 'pre' explicitly, rather than
using 'b', 'm' and 'a'.

Change-Id: Iaee6d1ac1211b0b05b47cf98b50570089b12d600
parent b8a28fbb
@@ -29,7 +29,7 @@ namespace {
static const int kIterations = 1000;
static const int kMaskMax = 64;
typedef unsigned int (*ObmcSadF)(const uint8_t *ref, int ref_stride,
typedef unsigned int (*ObmcSadF)(const uint8_t *pre, int pre_stride,
const int32_t *wsrc, const int32_t *mask);
////////////////////////////////////////////////////////////////////////////////
@@ -45,42 +45,42 @@ class ObmcSadTest : public FunctionEquivalenceTest<ObmcSadF> {
};
TEST_P(ObmcSadTest, RandomValues) {
DECLARE_ALIGNED(32, uint8_t, ref[MAX_SB_SQUARE]);
DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
for (int iter = 0 ; iter < kIterations && !HasFatalFailure() ; ++iter) {
const int ref_stride = rng_(MAX_SB_SIZE + 1);
const int pre_stride = rng_(MAX_SB_SIZE + 1);
for (int i = 0 ; i < MAX_SB_SQUARE ; ++i) {
ref[i] = rng_.Rand8();
pre[i] = rng_.Rand8();
wsrc[i] = rng_.Rand8() * rng_(kMaskMax * kMaskMax + 1);
mask[i] = rng_(kMaskMax * kMaskMax + 1);
}
const unsigned int ref_res = ref_func_(ref, ref_stride, wsrc, mask);
const unsigned int tst_res = tst_func_(ref, ref_stride, wsrc, mask);
const unsigned int ref_res = ref_func_(pre, pre_stride, wsrc, mask);
const unsigned int tst_res = tst_func_(pre, pre_stride, wsrc, mask);
ASSERT_EQ(ref_res, tst_res);
}
}
TEST_P(ObmcSadTest, ExtremeValues) {
DECLARE_ALIGNED(32, uint8_t, ref[MAX_SB_SQUARE]);
DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
for (int iter = 0 ; iter < MAX_SB_SIZE && !HasFatalFailure() ; ++iter) {
const int ref_stride = iter;
const int pre_stride = iter;
for (int i = 0 ; i < MAX_SB_SQUARE ; ++i) {
ref[i] = UINT8_MAX;
pre[i] = UINT8_MAX;
wsrc[i] = UINT8_MAX * kMaskMax * kMaskMax;
mask[i] = kMaskMax * kMaskMax;
}
const unsigned int ref_res = ref_func_(ref, ref_stride, wsrc, mask);
const unsigned int tst_res = tst_func_(ref, ref_stride, wsrc, mask);
const unsigned int ref_res = ref_func_(pre, pre_stride, wsrc, mask);
const unsigned int tst_res = tst_func_(pre, pre_stride, wsrc, mask);
ASSERT_EQ(ref_res, tst_res);
}
@@ -126,22 +126,22 @@ class ObmcSadHBDTest : public FunctionEquivalenceTest<ObmcSadF> {
};
TEST_P(ObmcSadHBDTest, RandomValues) {
DECLARE_ALIGNED(32, uint16_t, ref[MAX_SB_SQUARE]);
DECLARE_ALIGNED(32, uint16_t, pre[MAX_SB_SQUARE]);
DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
for (int iter = 0 ; iter < kIterations && !HasFatalFailure() ; ++iter) {
const int ref_stride = rng_(MAX_SB_SIZE + 1);
const int pre_stride = rng_(MAX_SB_SIZE + 1);
for (int i = 0 ; i < MAX_SB_SQUARE ; ++i) {
ref[i] = rng_(1<<12);
pre[i] = rng_(1<<12);
wsrc[i] = rng_(1<<12) * rng_(kMaskMax * kMaskMax + 1);
mask[i] = rng_(kMaskMax * kMaskMax + 1);
}
const unsigned int ref_res = ref_func_(CONVERT_TO_BYTEPTR(ref), ref_stride,
const unsigned int ref_res = ref_func_(CONVERT_TO_BYTEPTR(pre), pre_stride,
wsrc, mask);
const unsigned int tst_res = tst_func_(CONVERT_TO_BYTEPTR(ref), ref_stride,
const unsigned int tst_res = tst_func_(CONVERT_TO_BYTEPTR(pre), pre_stride,
wsrc, mask);
ASSERT_EQ(ref_res, tst_res);
@@ -149,22 +149,22 @@ TEST_P(ObmcSadHBDTest, RandomValues) {
}
TEST_P(ObmcSadHBDTest, ExtremeValues) {
DECLARE_ALIGNED(32, uint16_t, ref[MAX_SB_SQUARE]);
DECLARE_ALIGNED(32, uint16_t, pre[MAX_SB_SQUARE]);
DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
for (int iter = 0 ; iter < MAX_SB_SIZE && !HasFatalFailure() ; ++iter) {
const int ref_stride = iter;
const int pre_stride = iter;
for (int i = 0 ; i < MAX_SB_SQUARE ; ++i) {
ref[i] = (1 << 12) - 1;
pre[i] = (1 << 12) - 1;
wsrc[i] = ((1 << 12) - 1) * kMaskMax * kMaskMax;
mask[i] = kMaskMax * kMaskMax;
}
const unsigned int ref_res = ref_func_(CONVERT_TO_BYTEPTR(ref), ref_stride,
const unsigned int ref_res = ref_func_(CONVERT_TO_BYTEPTR(pre), pre_stride,
wsrc, mask);
const unsigned int tst_res = tst_func_(CONVERT_TO_BYTEPTR(ref), ref_stride,
const unsigned int tst_res = tst_func_(CONVERT_TO_BYTEPTR(pre), pre_stride,
wsrc, mask);
ASSERT_EQ(ref_res, tst_res);
@@ -452,23 +452,23 @@ HIGHBD_MASKSADMXN(4, 4)
#endif // CONFIG_VP10 && CONFIG_EXT_INTER
#if CONFIG_VP10 && CONFIG_OBMC
// a: pred
// b: target weighted prediction (has been *4096 to keep precision)
// m: 2d weights (scaled by 4096)
static INLINE unsigned int obmc_sad(const uint8_t *a, int a_stride,
const int32_t *b,
const int32_t *m,
// pre: predictor being evaluated
// wsrc: target weighted prediction (has been *4096 to keep precision)
// mask: 2d weights (scaled by 4096)
static INLINE unsigned int obmc_sad(const uint8_t *pre, int pre_stride,
const int32_t *wsrc,
const int32_t *mask,
int width, int height) {
int y, x;
unsigned int sad = 0;
for (y = 0; y < height; y++) {
for (x = 0; x < width; x++)
sad += ROUND_POWER_OF_TWO(abs(b[x] - a[x] * m[x]), 12);
sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12);
a += a_stride;
b += width;
m += width;
pre += pre_stride;
wsrc += width;
mask += width;
}
return sad;
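
A note on the reference computation above: wsrc holds the weighted target already scaled by 4096 (2^12) and mask holds the per-pixel weights on the same 4096 scale, so each term abs(wsrc[x] - pre[x] * mask[x]) is rounded back down by 12 bits before being accumulated. A minimal standalone sketch of that formula, using hypothetical sample values and a made-up helper name that are not part of this change:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Illustrative re-statement of the reference loop; ROUND_POWER_OF_TWO(v, 12)
   is written out as (v + (1 << 11)) >> 12. */
static unsigned int toy_obmc_sad(const uint8_t *pre, int pre_stride,
                                 const int32_t *wsrc, const int32_t *mask,
                                 int width, int height) {
  unsigned int sad = 0;
  for (int y = 0; y < height; y++) {
    for (int x = 0; x < width; x++)
      sad += (abs(wsrc[x] - pre[x] * mask[x]) + (1 << 11)) >> 12;
    pre += pre_stride;
    wsrc += width;  /* wsrc and mask are dense: width elements per row */
    mask += width;
  }
  return sad;
}

int main(void) {
  /* 2x2 example: wsrc = source * 4096, mask = full weight (1.0 * 4096). */
  const uint8_t pre[4] = { 10, 20, 30, 40 };
  const int32_t wsrc[4] = { 12 * 4096, 18 * 4096, 33 * 4096, 40 * 4096 };
  const int32_t mask[4] = { 4096, 4096, 4096, 4096 };
  printf("sad = %u\n", toy_obmc_sad(pre, 2, wsrc, mask, 2, 2));  /* prints 7 */
  return 0;
}
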
@@ -477,8 +477,8 @@ static INLINE unsigned int obmc_sad(const uint8_t *a, int a_stride,
#define OBMCSADMxN(m, n) \
unsigned int vpx_obmc_sad##m##x##n##_c(const uint8_t *ref, int ref_stride, \
const int32_t *wsrc, \
const int32_t *msk) { \
return obmc_sad(ref, ref_stride, wsrc, msk, m, n); \
const int32_t *mask) { \
return obmc_sad(ref, ref_stride, wsrc, mask, m, n); \
}
#if CONFIG_EXT_PARTITION
@@ -501,21 +501,21 @@ OBMCSADMxN(4, 8)
OBMCSADMxN(4, 4)
#if CONFIG_VP9_HIGHBITDEPTH
static INLINE unsigned int highbd_obmc_sad(const uint8_t *a8, int a_stride,
const int32_t *b,
const int32_t *m,
static INLINE unsigned int highbd_obmc_sad(const uint8_t *pre8, int pre_stride,
const int32_t *wsrc,
const int32_t *mask,
int width, int height) {
int y, x;
unsigned int sad = 0;
const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
for (y = 0; y < height; y++) {
for (x = 0; x < width; x++)
sad += ROUND_POWER_OF_TWO(abs(b[x] - a[x] * m[x]), 12);
sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12);
a += a_stride;
b += width;
m += width;
pre += pre_stride;
wsrc += width;
mask += width;
}
return sad;
@@ -525,8 +525,8 @@ static INLINE unsigned int highbd_obmc_sad(const uint8_t *a8, int a_stride,
unsigned int vpx_highbd_obmc_sad##m##x##n##_c(const uint8_t *ref, \
int ref_stride, \
const int32_t *wsrc, \
const int32_t *msk) { \
return highbd_obmc_sad(ref, ref_stride, wsrc, msk, m, n); \
const int32_t *mask) { \
return highbd_obmc_sad(ref, ref_stride, wsrc, mask, m, n); \
}
#if CONFIG_EXT_PARTITION
@@ -1124,14 +1124,14 @@ if (vpx_config("CONFIG_EXT_INTER") eq "yes") {
if (vpx_config("CONFIG_OBMC") eq "yes") {
foreach (@block_sizes) {
($w, $h) = @$_;
add_proto qw/unsigned int/, "vpx_obmc_sad${w}x${h}", "const uint8_t *ref_ptr, int ref_stride, const int32_t *wsrc_ptr, const int32_t *mask";
add_proto qw/unsigned int/, "vpx_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
specialize "vpx_obmc_sad${w}x${h}", qw/sse4_1/;
}
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
foreach (@block_sizes) {
($w, $h) = @$_;
add_proto qw/unsigned int/, "vpx_highbd_obmc_sad${w}x${h}", "const uint8_t *ref_ptr, int ref_stride, const int32_t *wsrc_ptr, const int32_t *mask";
add_proto qw/unsigned int/, "vpx_highbd_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
specialize "vpx_highbd_obmc_sad${w}x${h}", qw/sse4_1/;
}
}
@@ -21,26 +21,28 @@
// 8 bit
////////////////////////////////////////////////////////////////////////////////
static INLINE unsigned int obmc_sad_w4(const uint8_t *a, const int a_stride,
const int32_t *b, const int32_t *m,
static INLINE unsigned int obmc_sad_w4(const uint8_t *pre,
const int pre_stride,
const int32_t *wsrc,
const int32_t *mask,
const int height) {
const int a_step = a_stride - 4;
const int pre_step = pre_stride - 4;
int n = 0;
__m128i v_sad_d = _mm_setzero_si128();
do {
const __m128i v_a_b = xx_loadl_32(a + n);
const __m128i v_m_d = xx_load_128(m + n);
const __m128i v_b_d = xx_load_128(b + n);
const __m128i v_p_b = xx_loadl_32(pre + n);
const __m128i v_m_d = xx_load_128(mask + n);
const __m128i v_w_d = xx_load_128(wsrc + n);
const __m128i v_a_d = _mm_cvtepu8_epi32(v_a_b);
const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);
// Values in both a and m fit in 15 bits, and are packed at 32 bit
// Values in both pre and mask fit in 15 bits, and are packed at 32 bit
// boundaries. We use pmaddwd, as it has lower latency on Haswell
// than pmulld but produces the same result with these inputs.
const __m128i v_am_d = _mm_madd_epi16(v_a_d, v_m_d);
const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
const __m128i v_diff_d = _mm_sub_epi32(v_b_d, v_am_d);
const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);
// Rounded absolute difference
@@ -51,39 +53,42 @@ static INLINE unsigned int obmc_sad_w4(const uint8_t *a, const int a_stride,
n += 4;
if (n % 4 == 0)
a += a_step;
pre += pre_step;
} while (n < 4 * height);
return xx_hsum_epi32_si32(v_sad_d);
}
static INLINE unsigned int obmc_sad_w8n(const uint8_t *a, const int a_stride,
const int32_t *b, const int32_t *m,
const int width, const int height) {
const int a_step = a_stride - width;
static INLINE unsigned int obmc_sad_w8n(const uint8_t *pre,
const int pre_stride,
const int32_t *wsrc,
const int32_t *mask,
const int width,
const int height) {
const int pre_step = pre_stride - width;
int n = 0;
__m128i v_sad_d = _mm_setzero_si128();
assert(width >= 8 && (width & (width - 1)) == 0);
do {
const __m128i v_a1_b = xx_loadl_32(a + n + 4);
const __m128i v_m1_d = xx_load_128(m + n + 4);
const __m128i v_b1_d = xx_load_128(b + n + 4);
const __m128i v_a0_b = xx_loadl_32(a + n);
const __m128i v_m0_d = xx_load_128(m + n);
const __m128i v_b0_d = xx_load_128(b + n);
const __m128i v_p1_b = xx_loadl_32(pre + n + 4);
const __m128i v_m1_d = xx_load_128(mask + n + 4);
const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
const __m128i v_p0_b = xx_loadl_32(pre + n);
const __m128i v_m0_d = xx_load_128(mask + n);
const __m128i v_w0_d = xx_load_128(wsrc + n);
const __m128i v_a0_d = _mm_cvtepu8_epi32(v_a0_b);
const __m128i v_a1_d = _mm_cvtepu8_epi32(v_a1_b);
const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);
const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);
// Values in both a and m fit in 15 bits, and are packed at 32 bit
// Values in both pre and mask fit in 15 bits, and are packed at 32 bit
// boundaries. We use pmaddwd, as it has lower latency on Haswell
// than pmulld but produces the same result with these inputs.
const __m128i v_am0_d = _mm_madd_epi16(v_a0_d, v_m0_d);
const __m128i v_am1_d = _mm_madd_epi16(v_a1_d, v_m1_d);
const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
const __m128i v_diff0_d = _mm_sub_epi32(v_b0_d, v_am0_d);
const __m128i v_diff1_d = _mm_sub_epi32(v_b1_d, v_am1_d);
const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);
@@ -97,21 +102,21 @@ static INLINE unsigned int obmc_sad_w8n(const uint8_t *a, const int a_stride,
n += 8;
if (n % width == 0)
a += a_step;
pre += pre_step;
} while (n < width * height);
return xx_hsum_epi32_si32(v_sad_d);
}
#define OBMCSADWXH(w, h) \
unsigned int vpx_obmc_sad##w##x##h##_sse4_1(const uint8_t *ref, \
int ref_stride, \
unsigned int vpx_obmc_sad##w##x##h##_sse4_1(const uint8_t *pre, \
int pre_stride, \
const int32_t *wsrc, \
const int32_t *msk) { \
if (w == 4) \
return obmc_sad_w4(ref, ref_stride, wsrc, msk, h); \
return obmc_sad_w4(pre, pre_stride, wsrc, msk, h); \
else \
return obmc_sad_w8n(ref, ref_stride, wsrc, msk, w, h); \
return obmc_sad_w8n(pre, pre_stride, wsrc, msk, w, h); \
}
#if CONFIG_EXT_PARTITION
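
Regarding the pmaddwd comment in the kernels above: every 32-bit lane of both operands holds a non-negative value that fits in 15 bits, so the high 16 bits of each lane are zero and _mm_madd_epi16's per-lane product-and-add (lo*lo + hi*hi) collapses to the plain 32-bit product that _mm_mullo_epi32 would give, only with lower latency on Haswell. A small self-contained check of that equivalence, with made-up lane values that are not part of this change (compile with -msse4.1):

#include <assert.h>
#include <smmintrin.h>  /* SSE4.1: _mm_mullo_epi32 */

int main(void) {
  /* Each lane is non-negative and fits in 15 bits, as in the kernels above. */
  const __m128i v_p_d = _mm_setr_epi32(255, 17, 0, 12345);  /* zero-extended pre */
  const __m128i v_m_d = _mm_setr_epi32(4096, 64, 4095, 1);  /* mask weights */

  /* pmaddwd: per lane, lo16*lo16 + hi16*hi16; the high halves are zero here. */
  const __m128i via_pmaddwd = _mm_madd_epi16(v_p_d, v_m_d);
  /* pmulld: per lane, the low 32 bits of the full product. */
  const __m128i via_pmulld = _mm_mullo_epi32(v_p_d, v_m_d);

  /* All four lanes must match. */
  assert(_mm_movemask_epi8(_mm_cmpeq_epi32(via_pmaddwd, via_pmulld)) == 0xffff);
  return 0;
}
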
@@ -138,28 +143,29 @@ OBMCSADWXH(4, 4)
////////////////////////////////////////////////////////////////////////////////
#if CONFIG_VP9_HIGHBITDEPTH
static INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *a8,
const int a_stride,
const int32_t *b, const int32_t *m,
static INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8,
const int pre_stride,
const int32_t *wsrc,
const int32_t *mask,
const int height) {
const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
const int a_step = a_stride - 4;
const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
const int pre_step = pre_stride - 4;
int n = 0;
__m128i v_sad_d = _mm_setzero_si128();
do {
const __m128i v_a_w = xx_loadl_64(a + n);
const __m128i v_m_d = xx_load_128(m + n);
const __m128i v_b_d = xx_load_128(b + n);
const __m128i v_p_w = xx_loadl_64(pre + n);
const __m128i v_m_d = xx_load_128(mask + n);
const __m128i v_w_d = xx_load_128(wsrc + n);
const __m128i v_a_d = _mm_cvtepu16_epi32(v_a_w);
const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w);
// Values in both a and m fit in 15 bits, and are packed at 32 bit
// Values in both pre and mask fit in 15 bits, and are packed at 32 bit
// boundaries. We use pmaddwd, as it has lower latency on Haswell
// than pmulld but produces the same result with these inputs.
const __m128i v_am_d = _mm_madd_epi16(v_a_d, v_m_d);
const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
const __m128i v_diff_d = _mm_sub_epi32(v_b_d, v_am_d);
const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);
// Rounded absolute difference
@@ -170,41 +176,43 @@ static INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *a8,
n += 4;
if (n % 4 == 0)
a += a_step;
pre += pre_step;
} while (n < 4 * height);
return xx_hsum_epi32_si32(v_sad_d);
}
static INLINE unsigned int hbd_obmc_sad_w8n(const uint8_t *a8,
const int a_stride,
const int32_t *b, const int32_t *m,
const int width, const int height) {
const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
const int a_step = a_stride - width;
static INLINE unsigned int hbd_obmc_sad_w8n(const uint8_t *pre8,
const int pre_stride,
const int32_t *wsrc,
const int32_t *mask,
const int width,
const int height) {
const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
const int pre_step = pre_stride - width;
int n = 0;
__m128i v_sad_d = _mm_setzero_si128();
assert(width >= 8 && (width & (width - 1)) == 0);
do {
const __m128i v_a1_w = xx_loadl_64(a + n + 4);
const __m128i v_m1_d = xx_load_128(m + n + 4);
const __m128i v_b1_d = xx_load_128(b + n + 4);
const __m128i v_a0_w = xx_loadl_64(a + n);
const __m128i v_m0_d = xx_load_128(m + n);
const __m128i v_b0_d = xx_load_128(b + n);
const __m128i v_p1_w = xx_loadl_64(pre + n + 4);
const __m128i v_m1_d = xx_load_128(mask + n + 4);
const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
const __m128i v_p0_w = xx_loadl_64(pre + n);
const __m128i v_m0_d = xx_load_128(mask + n);
const __m128i v_w0_d = xx_load_128(wsrc + n);
const __m128i v_a0_d = _mm_cvtepu16_epi32(v_a0_w);
const __m128i v_a1_d = _mm_cvtepu16_epi32(v_a1_w);
const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);
const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);
// Values in both a and m fit in 15 bits, and are packed at 32 bit
// Values in both pre and mask fit in 15 bits, and are packed at 32 bit
// boundaries. We use pmaddwd, as it has lower latency on Haswell
// than pmulld but produces the same result with these inputs.
const __m128i v_am0_d = _mm_madd_epi16(v_a0_d, v_m0_d);
const __m128i v_am1_d = _mm_madd_epi16(v_a1_d, v_m1_d);
const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
const __m128i v_diff0_d = _mm_sub_epi32(v_b0_d, v_am0_d);
const __m128i v_diff1_d = _mm_sub_epi32(v_b1_d, v_am1_d);
const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);
@@ -218,21 +226,21 @@ static INLINE unsigned int hbd_obmc_sad_w8n(const uint8_t *a8,
n += 8;
if (n % width == 0)
a += a_step;
pre += pre_step;
} while (n < width * height);
return xx_hsum_epi32_si32(v_sad_d);
}
#define HBD_OBMCSADWXH(w, h) \
unsigned int vpx_highbd_obmc_sad##w##x##h##_sse4_1(const uint8_t *ref, \
int ref_stride, \
unsigned int vpx_highbd_obmc_sad##w##x##h##_sse4_1(const uint8_t *pre, \
int pre_stride, \
const int32_t *wsrc, \
const int32_t *msk) { \
const int32_t *mask) { \
if (w == 4) \
return hbd_obmc_sad_w4(ref, ref_stride, wsrc, msk, h); \
return hbd_obmc_sad_w4(pre, pre_stride, wsrc, mask, h); \
else \
return hbd_obmc_sad_w8n(ref, ref_stride, wsrc, msk, w, h); \
return hbd_obmc_sad_w8n(pre, pre_stride, wsrc, mask, w, h); \
}
#if CONFIG_EXT_PARTITION