Commit c804e0df authored by Geza Lore

Cleanup obmc_sad function prototypes.

Name 'wsrc', 'mask' and 'pre' explicitly, rather than
using 'b', 'm' and 'a'.

Change-Id: Iaee6d1ac1211b0b05b47cf98b50570089b12d600
parent b8a28fbb
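In short, the rename changes the reference implementation's signature as follows (a summary distilled from the diff below, not additional code in the change):

/* Before: */
static INLINE unsigned int obmc_sad(const uint8_t *a, int a_stride,
                                    const int32_t *b, const int32_t *m,
                                    int width, int height);
/* After: */
static INLINE unsigned int obmc_sad(const uint8_t *pre, int pre_stride,
                                    const int32_t *wsrc, const int32_t *mask,
                                    int width, int height);

The same renaming is mirrored in the tests, the rtcd prototypes and the SSE4.1 kernels.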
@@ -29,7 +29,7 @@ namespace {
 static const int kIterations = 1000;
 static const int kMaskMax = 64;
-typedef unsigned int (*ObmcSadF)(const uint8_t *ref, int ref_stride,
+typedef unsigned int (*ObmcSadF)(const uint8_t *pre, int pre_stride,
                                  const int32_t *wsrc, const int32_t *mask);
 ////////////////////////////////////////////////////////////////////////////////
@@ -45,42 +45,42 @@ class ObmcSadTest : public FunctionEquivalenceTest<ObmcSadF> {
 };
 TEST_P(ObmcSadTest, RandomValues) {
-  DECLARE_ALIGNED(32, uint8_t, ref[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
   DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
   DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
   for (int iter = 0 ; iter < kIterations && !HasFatalFailure() ; ++iter) {
-    const int ref_stride = rng_(MAX_SB_SIZE + 1);
+    const int pre_stride = rng_(MAX_SB_SIZE + 1);
     for (int i = 0 ; i < MAX_SB_SQUARE ; ++i) {
-      ref[i] = rng_.Rand8();
+      pre[i] = rng_.Rand8();
       wsrc[i] = rng_.Rand8() * rng_(kMaskMax * kMaskMax + 1);
       mask[i] = rng_(kMaskMax * kMaskMax + 1);
     }
-    const unsigned int ref_res = ref_func_(ref, ref_stride, wsrc, mask);
-    const unsigned int tst_res = tst_func_(ref, ref_stride, wsrc, mask);
+    const unsigned int ref_res = ref_func_(pre, pre_stride, wsrc, mask);
+    const unsigned int tst_res = tst_func_(pre, pre_stride, wsrc, mask);
     ASSERT_EQ(ref_res, tst_res);
   }
 }
 TEST_P(ObmcSadTest, ExtremeValues) {
-  DECLARE_ALIGNED(32, uint8_t, ref[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
   DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
   DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
   for (int iter = 0 ; iter < MAX_SB_SIZE && !HasFatalFailure() ; ++iter) {
-    const int ref_stride = iter;
+    const int pre_stride = iter;
     for (int i = 0 ; i < MAX_SB_SQUARE ; ++i) {
-      ref[i] = UINT8_MAX;
+      pre[i] = UINT8_MAX;
       wsrc[i] = UINT8_MAX * kMaskMax * kMaskMax;
       mask[i] = kMaskMax * kMaskMax;
     }
-    const unsigned int ref_res = ref_func_(ref, ref_stride, wsrc, mask);
-    const unsigned int tst_res = tst_func_(ref, ref_stride, wsrc, mask);
+    const unsigned int ref_res = ref_func_(pre, pre_stride, wsrc, mask);
+    const unsigned int tst_res = tst_func_(pre, pre_stride, wsrc, mask);
     ASSERT_EQ(ref_res, tst_res);
   }
@@ -126,22 +126,22 @@ class ObmcSadHBDTest : public FunctionEquivalenceTest<ObmcSadF> {
 };
 TEST_P(ObmcSadHBDTest, RandomValues) {
-  DECLARE_ALIGNED(32, uint16_t, ref[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, uint16_t, pre[MAX_SB_SQUARE]);
   DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
   DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
   for (int iter = 0 ; iter < kIterations && !HasFatalFailure() ; ++iter) {
-    const int ref_stride = rng_(MAX_SB_SIZE + 1);
+    const int pre_stride = rng_(MAX_SB_SIZE + 1);
     for (int i = 0 ; i < MAX_SB_SQUARE ; ++i) {
-      ref[i] = rng_(1<<12);
+      pre[i] = rng_(1<<12);
       wsrc[i] = rng_(1<<12) * rng_(kMaskMax * kMaskMax + 1);
       mask[i] = rng_(kMaskMax * kMaskMax + 1);
     }
-    const unsigned int ref_res = ref_func_(CONVERT_TO_BYTEPTR(ref), ref_stride,
+    const unsigned int ref_res = ref_func_(CONVERT_TO_BYTEPTR(pre), pre_stride,
                                            wsrc, mask);
-    const unsigned int tst_res = tst_func_(CONVERT_TO_BYTEPTR(ref), ref_stride,
+    const unsigned int tst_res = tst_func_(CONVERT_TO_BYTEPTR(pre), pre_stride,
                                            wsrc, mask);
     ASSERT_EQ(ref_res, tst_res);
@@ -149,22 +149,22 @@ TEST_P(ObmcSadHBDTest, RandomValues) {
 }
 TEST_P(ObmcSadHBDTest, ExtremeValues) {
-  DECLARE_ALIGNED(32, uint16_t, ref[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, uint16_t, pre[MAX_SB_SQUARE]);
   DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
   DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
   for (int iter = 0 ; iter < MAX_SB_SIZE && !HasFatalFailure() ; ++iter) {
-    const int ref_stride = iter;
+    const int pre_stride = iter;
     for (int i = 0 ; i < MAX_SB_SQUARE ; ++i) {
-      ref[i] = (1 << 12) - 1;
+      pre[i] = (1 << 12) - 1;
       wsrc[i] = ((1 << 12) - 1) * kMaskMax * kMaskMax;
       mask[i] = kMaskMax * kMaskMax;
     }
-    const unsigned int ref_res = ref_func_(CONVERT_TO_BYTEPTR(ref), ref_stride,
+    const unsigned int ref_res = ref_func_(CONVERT_TO_BYTEPTR(pre), pre_stride,
                                            wsrc, mask);
-    const unsigned int tst_res = tst_func_(CONVERT_TO_BYTEPTR(ref), ref_stride,
+    const unsigned int tst_res = tst_func_(CONVERT_TO_BYTEPTR(pre), pre_stride,
                                            wsrc, mask);
     ASSERT_EQ(ref_res, tst_res);
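An aside on the ExtremeValues cases: they push every input to its maximum magnitude rather than maximizing the SAD itself (with these values wsrc[i] equals pre[i] * mask[i], so every per-pixel difference is zero). What they do stress is the size of the intermediates; a small sketch of the bounds, using the 12-bit high-bit-depth limits above (illustrative only, not part of the change):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* Limits used by the ExtremeValues tests (12-bit high-bit-depth case). */
  const int32_t pre_max = (1 << 12) - 1;       /* 4095                       */
  const int32_t mask_max = 64 * 64;            /* kMaskMax * kMaskMax = 4096 */
  const int32_t wsrc_max = pre_max * mask_max; /* 16773120                   */
  /* Both wsrc and pre * mask stay around 2^24, far below INT32_MAX, so the
   * difference wsrc - pre * mask cannot overflow a signed 32-bit value. */
  printf("max pre * mask = %d, max wsrc = %d\n", pre_max * mask_max, wsrc_max);
  return 0;
}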
...
@@ -452,23 +452,23 @@ HIGHBD_MASKSADMXN(4, 4)
 #endif  // CONFIG_VP10 && CONFIG_EXT_INTER
 #if CONFIG_VP10 && CONFIG_OBMC
-// a: pred
-// b: target weighted prediction (has been *4096 to keep precision)
-// m: 2d weights (scaled by 4096)
-static INLINE unsigned int obmc_sad(const uint8_t *a, int a_stride,
-                                    const int32_t *b,
-                                    const int32_t *m,
+// pre: predictor being evaluated
+// wsrc: target weighted prediction (has been *4096 to keep precision)
+// mask: 2d weights (scaled by 4096)
+static INLINE unsigned int obmc_sad(const uint8_t *pre, int pre_stride,
+                                    const int32_t *wsrc,
+                                    const int32_t *mask,
                                     int width, int height) {
   int y, x;
   unsigned int sad = 0;
   for (y = 0; y < height; y++) {
     for (x = 0; x < width; x++)
-      sad += ROUND_POWER_OF_TWO(abs(b[x] - a[x] * m[x]), 12);
-    a += a_stride;
-    b += width;
-    m += width;
+      sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12);
+    pre += pre_stride;
+    wsrc += width;
+    mask += width;
   }
   return sad;
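Since wsrc and mask are both pre-scaled by 4096 (= 64 * 64 = 2^12), the per-pixel term ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12) computes a weighted absolute difference at 12 extra fractional bits and then rounds it back to pixel scale. A minimal standalone illustration of that step, with made-up sample values (ROUND_POWER_OF_TWO is redefined locally to mirror the library macro; not part of the change):

#include <stdio.h>
#include <stdlib.h>

/* Mirrors the library's ROUND_POWER_OF_TWO: divide by 2^n with rounding. */
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

int main(void) {
  /* Illustrative values only. */
  const int pre = 200;          /* predictor pixel, 0..255                  */
  const int mask = 3000;        /* weight, 0..kMaskMax * kMaskMax = 4096    */
  const int wsrc = 180 * 4096;  /* target sample, already scaled by 4096    */
  /* Per-pixel contribution to the OBMC SAD, as in obmc_sad() above:
   * abs(737280 - 600000) = 137280; (137280 + 2048) >> 12 = 34. */
  const unsigned int term = ROUND_POWER_OF_TWO(abs(wsrc - pre * mask), 12);
  printf("per-pixel SAD term = %u\n", term);
  return 0;
}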
@@ -477,8 +477,8 @@ static INLINE unsigned int obmc_sad(const uint8_t *a, int a_stride,
 #define OBMCSADMxN(m, n) \
 unsigned int vpx_obmc_sad##m##x##n##_c(const uint8_t *ref, int ref_stride, \
                                        const int32_t *wsrc, \
-                                       const int32_t *msk) { \
-  return obmc_sad(ref, ref_stride, wsrc, msk, m, n); \
+                                       const int32_t *mask) { \
+  return obmc_sad(ref, ref_stride, wsrc, mask, m, n); \
 }
 #if CONFIG_EXT_PARTITION
@@ -501,21 +501,21 @@ OBMCSADMxN(4, 8)
 OBMCSADMxN(4, 4)
 #if CONFIG_VP9_HIGHBITDEPTH
-static INLINE unsigned int highbd_obmc_sad(const uint8_t *a8, int a_stride,
-                                           const int32_t *b,
-                                           const int32_t *m,
+static INLINE unsigned int highbd_obmc_sad(const uint8_t *pre8, int pre_stride,
+                                           const int32_t *wsrc,
+                                           const int32_t *mask,
                                            int width, int height) {
   int y, x;
   unsigned int sad = 0;
-  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
   for (y = 0; y < height; y++) {
     for (x = 0; x < width; x++)
-      sad += ROUND_POWER_OF_TWO(abs(b[x] - a[x] * m[x]), 12);
-    a += a_stride;
-    b += width;
-    m += width;
+      sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12);
+    pre += pre_stride;
+    wsrc += width;
+    mask += width;
   }
   return sad;
@@ -525,8 +525,8 @@ static INLINE unsigned int highbd_obmc_sad(const uint8_t *a8, int a_stride,
 unsigned int vpx_highbd_obmc_sad##m##x##n##_c(const uint8_t *ref, \
                                               int ref_stride, \
                                               const int32_t *wsrc, \
-                                              const int32_t *msk) { \
-  return highbd_obmc_sad(ref, ref_stride, wsrc, msk, m, n); \
+                                              const int32_t *mask) { \
+  return highbd_obmc_sad(ref, ref_stride, wsrc, mask, m, n); \
 }
 #if CONFIG_EXT_PARTITION
...
@@ -1124,14 +1124,14 @@ if (vpx_config("CONFIG_EXT_INTER") eq "yes") {
 if (vpx_config("CONFIG_OBMC") eq "yes") {
   foreach (@block_sizes) {
     ($w, $h) = @$_;
-    add_proto qw/unsigned int/, "vpx_obmc_sad${w}x${h}", "const uint8_t *ref_ptr, int ref_stride, const int32_t *wsrc_ptr, const int32_t *mask";
+    add_proto qw/unsigned int/, "vpx_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
     specialize "vpx_obmc_sad${w}x${h}", qw/sse4_1/;
   }
   if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
     foreach (@block_sizes) {
       ($w, $h) = @$_;
-      add_proto qw/unsigned int/, "vpx_highbd_obmc_sad${w}x${h}", "const uint8_t *ref_ptr, int ref_stride, const int32_t *wsrc_ptr, const int32_t *mask";
+      add_proto qw/unsigned int/, "vpx_highbd_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
       specialize "vpx_highbd_obmc_sad${w}x${h}", qw/sse4_1/;
     }
   }
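For context, each add_proto/specialize pair above turns into a pair of C prototypes plus a run-time-dispatched entry point in the generated RTCD header. A sketch for one block size, assuming 32x32 is in @block_sizes (the real header is emitted by the rtcd script at build time):

/* Sketch only: roughly what the generated RTCD header declares for 32x32. */
unsigned int vpx_obmc_sad32x32_c(const uint8_t *pre, int pre_stride,
                                 const int32_t *wsrc, const int32_t *mask);
unsigned int vpx_obmc_sad32x32_sse4_1(const uint8_t *pre, int pre_stride,
                                      const int32_t *wsrc, const int32_t *mask);
/* vpx_obmc_sad32x32 itself resolves to the _c or _sse4_1 version at runtime. */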
...
@@ -21,26 +21,28 @@
 // 8 bit
 ////////////////////////////////////////////////////////////////////////////////
-static INLINE unsigned int obmc_sad_w4(const uint8_t *a, const int a_stride,
-                                       const int32_t *b, const int32_t *m,
+static INLINE unsigned int obmc_sad_w4(const uint8_t *pre,
+                                       const int pre_stride,
+                                       const int32_t *wsrc,
+                                       const int32_t *mask,
                                        const int height) {
-  const int a_step = a_stride - 4;
+  const int pre_step = pre_stride - 4;
   int n = 0;
   __m128i v_sad_d = _mm_setzero_si128();
   do {
-    const __m128i v_a_b = xx_loadl_32(a + n);
-    const __m128i v_m_d = xx_load_128(m + n);
-    const __m128i v_b_d = xx_load_128(b + n);
-    const __m128i v_a_d = _mm_cvtepu8_epi32(v_a_b);
-    // Values in both a and m fit in 15 bits, and are packed at 32 bit
+    const __m128i v_p_b = xx_loadl_32(pre + n);
+    const __m128i v_m_d = xx_load_128(mask + n);
+    const __m128i v_w_d = xx_load_128(wsrc + n);
+    const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);
+    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
     // boundaries. We use pmaddwd, as it has lower latency on Haswell
     // than pmulld but produces the same result with these inputs.
-    const __m128i v_am_d = _mm_madd_epi16(v_a_d, v_m_d);
-    const __m128i v_diff_d = _mm_sub_epi32(v_b_d, v_am_d);
+    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
+    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
     const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);
     // Rounded absolute difference
@@ -51,39 +53,42 @@ static INLINE unsigned int obmc_sad_w4(const uint8_t *a, const int a_stride,
     n += 4;
     if (n % 4 == 0)
-      a += a_step;
+      pre += pre_step;
   } while (n < 4 * height);
   return xx_hsum_epi32_si32(v_sad_d);
 }
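A note on the pmaddwd comment above: each 32-bit lane of the predictor and mask vectors holds a value below 2^15 (predictor samples are 8- or 12-bit, mask weights at most 64 * 64 = 4096), so the upper 16 bits of every lane are zero and _mm_madd_epi16 reduces to lo16 * lo16 + 0 * 0, i.e. the same per-lane product _mm_mullo_epi32 would give. A small standalone check of that equivalence on sample values (illustrative only; build with SSE4.1 enabled, e.g. -msse4.1):

#include <smmintrin.h>  /* SSE4.1: _mm_mullo_epi32 */
#include <stdio.h>

int main(void) {
  /* Sample values in the ranges the obmc_sad kernels use; all below 2^15,
   * so the upper 16 bits of every 32-bit lane are zero. */
  const __m128i v_p_d = _mm_set_epi32(255, 4095, 17, 0);
  const __m128i v_m_d = _mm_set_epi32(4096, 4096, 3000, 1);
  /* pmaddwd: per lane, lo16(p) * lo16(m) + hi16(p) * hi16(m); the hi halves
   * are zero here, so this equals the full 32-bit product. */
  const __m128i v_madd_d = _mm_madd_epi16(v_p_d, v_m_d);
  const __m128i v_mull_d = _mm_mullo_epi32(v_p_d, v_m_d);
  const __m128i v_eq = _mm_cmpeq_epi32(v_madd_d, v_mull_d);
  /* All four lanes compare equal, so the byte mask is 0xFFFF. */
  printf("equal: %s\n", _mm_movemask_epi8(v_eq) == 0xFFFF ? "yes" : "no");
  return 0;
}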
-static INLINE unsigned int obmc_sad_w8n(const uint8_t *a, const int a_stride,
-                                        const int32_t *b, const int32_t *m,
-                                        const int width, const int height) {
-  const int a_step = a_stride - width;
+static INLINE unsigned int obmc_sad_w8n(const uint8_t *pre,
+                                        const int pre_stride,
+                                        const int32_t *wsrc,
+                                        const int32_t *mask,
+                                        const int width,
+                                        const int height) {
+  const int pre_step = pre_stride - width;
   int n = 0;
   __m128i v_sad_d = _mm_setzero_si128();
   assert(width >= 8 && (width & (width - 1)) == 0);
   do {
-    const __m128i v_a1_b = xx_loadl_32(a + n + 4);
-    const __m128i v_m1_d = xx_load_128(m + n + 4);
-    const __m128i v_b1_d = xx_load_128(b + n + 4);
-    const __m128i v_a0_b = xx_loadl_32(a + n);
-    const __m128i v_m0_d = xx_load_128(m + n);
-    const __m128i v_b0_d = xx_load_128(b + n);
-    const __m128i v_a0_d = _mm_cvtepu8_epi32(v_a0_b);
-    const __m128i v_a1_d = _mm_cvtepu8_epi32(v_a1_b);
-    // Values in both a and m fit in 15 bits, and are packed at 32 bit
+    const __m128i v_p1_b = xx_loadl_32(pre + n + 4);
+    const __m128i v_m1_d = xx_load_128(mask + n + 4);
+    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
+    const __m128i v_p0_b = xx_loadl_32(pre + n);
+    const __m128i v_m0_d = xx_load_128(mask + n);
+    const __m128i v_w0_d = xx_load_128(wsrc + n);
+    const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);
+    const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);
+    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
     // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
-    const __m128i v_am0_d = _mm_madd_epi16(v_a0_d, v_m0_d);
-    const __m128i v_am1_d = _mm_madd_epi16(v_a1_d, v_m1_d);
-    const __m128i v_diff0_d = _mm_sub_epi32(v_b0_d, v_am0_d);
-    const __m128i v_diff1_d = _mm_sub_epi32(v_b1_d, v_am1_d);
+    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
+    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
+    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
+    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
     const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
     const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);
@@ -97,21 +102,21 @@ static INLINE unsigned int obmc_sad_w8n(const uint8_t *a, const int a_stride,
     n += 8;
     if (n % width == 0)
-      a += a_step;
+      pre += pre_step;
   } while (n < width * height);
   return xx_hsum_epi32_si32(v_sad_d);
 }
 #define OBMCSADWXH(w, h) \
-unsigned int vpx_obmc_sad##w##x##h##_sse4_1(const uint8_t *ref, \
-                                            int ref_stride, \
+unsigned int vpx_obmc_sad##w##x##h##_sse4_1(const uint8_t *pre, \
+                                            int pre_stride, \
                                             const int32_t *wsrc, \
                                             const int32_t *msk) { \
   if (w == 4) \
-    return obmc_sad_w4(ref, ref_stride, wsrc, msk, h); \
+    return obmc_sad_w4(pre, pre_stride, wsrc, msk, h); \
   else \
-    return obmc_sad_w8n(ref, ref_stride, wsrc, msk, w, h); \
+    return obmc_sad_w8n(pre, pre_stride, wsrc, msk, w, h); \
 }
 #if CONFIG_EXT_PARTITION
@@ -138,28 +143,29 @@ OBMCSADWXH(4, 4)
 ////////////////////////////////////////////////////////////////////////////////
 #if CONFIG_VP9_HIGHBITDEPTH
-static INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *a8,
-                                           const int a_stride,
-                                           const int32_t *b, const int32_t *m,
+static INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8,
+                                           const int pre_stride,
+                                           const int32_t *wsrc,
+                                           const int32_t *mask,
                                            const int height) {
-  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  const int a_step = a_stride - 4;
+  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+  const int pre_step = pre_stride - 4;
   int n = 0;
   __m128i v_sad_d = _mm_setzero_si128();
   do {
-    const __m128i v_a_w = xx_loadl_64(a + n);
-    const __m128i v_m_d = xx_load_128(m + n);
-    const __m128i v_b_d = xx_load_128(b + n);
-    const __m128i v_a_d = _mm_cvtepu16_epi32(v_a_w);
-    // Values in both a and m fit in 15 bits, and are packed at 32 bit
+    const __m128i v_p_w = xx_loadl_64(pre + n);
+    const __m128i v_m_d = xx_load_128(mask + n);
+    const __m128i v_w_d = xx_load_128(wsrc + n);
+    const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w);
+    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
     // boundaries. We use pmaddwd, as it has lower latency on Haswell
     // than pmulld but produces the same result with these inputs.
-    const __m128i v_am_d = _mm_madd_epi16(v_a_d, v_m_d);
-    const __m128i v_diff_d = _mm_sub_epi32(v_b_d, v_am_d);
+    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
+    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
     const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);
     // Rounded absolute difference
@@ -170,41 +176,43 @@ static INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *a8,
     n += 4;
     if (n % 4 == 0)
-      a += a_step;
+      pre += pre_step;
   } while (n < 4 * height);
   return xx_hsum_epi32_si32(v_sad_d);
 }
-static INLINE unsigned int hbd_obmc_sad_w8n(const uint8_t *a8,
-                                            const int a_stride,
-                                            const int32_t *b, const int32_t *m,
-                                            const int width, const int height) {
-  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  const int a_step = a_stride - width;
+static INLINE unsigned int hbd_obmc_sad_w8n(const uint8_t *pre8,
+                                            const int pre_stride,
+                                            const int32_t *wsrc,
+                                            const int32_t *mask,
+                                            const int width,
+                                            const int height) {
+  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+  const int pre_step = pre_stride - width;
   int n = 0;
   __m128i v_sad_d = _mm_setzero_si128();
   assert(width >= 8 && (width & (width - 1)) == 0);
   do {
-    const __m128i v_a1_w = xx_loadl_64(a + n + 4);
-    const __m128i v_m1_d = xx_load_128(m + n + 4);
-    const __m128i v_b1_d = xx_load_128(b + n + 4);
-    const __m128i v_a0_w = xx_loadl_64(a + n);
-    const __m128i v_m0_d = xx_load_128(m + n);
-    const __m128i v_b0_d = xx_load_128(b + n);
-    const __m128i v_a0_d = _mm_cvtepu16_epi32(v_a0_w);
-    const __m128i v_a1_d = _mm_cvtepu16_epi32(v_a1_w);
-    // Values in both a and m fit in 15 bits, and are packed at 32 bit
+    const __m128i v_p1_w = xx_loadl_64(pre + n + 4);
+    const __m128i v_m1_d = xx_load_128(mask + n + 4);
+    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
+    const __m128i v_p0_w = xx_loadl_64(pre + n);
+    const __m128i v_m0_d = xx_load_128(mask + n);
+    const __m128i v_w0_d = xx_load_128(wsrc + n);
+    const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);
+    const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);
+    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
     // boundaries. We use pmaddwd, as it has lower latency on Haswell
     // than pmulld but produces the same result with these inputs.
-    const __m128i v_am0_d = _mm_madd_epi16(v_a0_d, v_m0_d);
-    const __m128i v_am1_d = _mm_madd_epi16(v_a1_d, v_m1_d);
-    const __m128i v_diff0_d = _mm_sub_epi32(v_b0_d, v_am0_d);
-    const __m128i v_diff1_d = _mm_sub_epi32(v_b1_d, v_am1_d);
+    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
+    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
+    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
+    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
     const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);