Commit 0aa39ff0 authored by David Barker, committed by Debargha Mukherjee

ext-inter: Vectorize new masked SAD/SSE functions

We would expect these new functions to be slower than the old
masked SAD/SSE functions, as they do additional work (blending
two inputs and comparing the result to a third, rather than
just comparing two inputs).

This is true for the SAD functions, which are about 50% slower
(depending on block size and bit depth). However, the sub-pixel
SSE functions are comparable in speed to the old code for the
accelerated special cases (xoffset or yoffset = 0 or 4), and are
between 40% and 90% faster for the generic case.

Change-Id: I1a296ed8fc9e3edc313a6add516ff76b17cd3e9f
parent b9f68d27
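For orientation, a scalar sketch of the operation these kernels vectorize
(illustrative only, not the in-tree C reference): each predicted pixel is an
AOM_BLEND_A64-style blend of the two inputs, weighted by a 6-bit mask in
[0, 64], and the SAD is then taken against the source. The final rounding
mirrors the SSSE3 kernels below.

#include <stdint.h>
#include <stdlib.h>

static unsigned int masked_sad_sketch(const uint8_t *src, int src_stride,
                                      const uint8_t *a, int a_stride,
                                      const uint8_t *b, int b_stride,
                                      const uint8_t *m, int m_stride,
                                      int width, int height) {
  unsigned int sad = 0;
  for (int y = 0; y < height; y++) {
    for (int x = 0; x < width; x++) {
      // AOM_BLEND_A64-style blend: (m*a + (64 - m)*b + 32) >> 6
      const int pred = (m[x] * a[x] + (64 - m[x]) * b[x] + 32) >> 6;
      sad += abs(pred - src[x]);
    }
    src += src_stride;
    a += a_stride;
    b += b_stride;
    m += m_stride;
  }
  return (sad + 31) >> 6;  // mirrors the final rounding in the kernels below
}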
@@ -343,6 +343,10 @@ DSP_SRCS-$(HAVE_AVX2) += x86/sad_highbd_avx2.c
endif
ifeq ($(CONFIG_AV1_ENCODER),yes)
ifeq ($(CONFIG_EXT_INTER),yes)
DSP_SRCS-$(HAVE_SSSE3) += x86/masked_sad_intrin_ssse3.c
DSP_SRCS-$(HAVE_SSSE3) += x86/masked_variance_intrin_ssse3.c
endif #CONFIG_EXT_INTER
ifeq ($(CONFIG_MOTION_VAR),yes)
DSP_SRCS-$(HAVE_SSE4_1) += x86/obmc_sad_sse4.c
DSP_SRCS-$(HAVE_SSE4_1) += x86/obmc_variance_sse4.c
@@ -741,12 +741,14 @@ if (aom_config("CONFIG_EXT_INTER") eq "yes") {
foreach (@block_sizes) {
($w, $h) = @$_;
add_proto qw/unsigned int/, "aom_masked_sad${w}x${h}", "const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask";
specialize "aom_masked_sad${w}x${h}", qw/ssse3/;
}
if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
foreach (@block_sizes) {
($w, $h) = @$_;
add_proto qw/unsigned int/, "aom_highbd_masked_sad${w}x${h}", "const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask";
specialize "aom_highbd_masked_sad${w}x${h}", qw/ssse3/;
}
}
}
@@ -1046,6 +1048,7 @@ if (aom_config("CONFIG_EXT_INTER") eq "yes") {
foreach (@block_sizes) {
($w, $h) = @$_;
add_proto qw/unsigned int/, "aom_masked_sub_pixel_variance${w}x${h}", "const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse";
specialize "aom_masked_sub_pixel_variance${w}x${h}", qw/ssse3/;
}
if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
@@ -1053,6 +1056,7 @@ if (aom_config("CONFIG_EXT_INTER") eq "yes") {
foreach (@block_sizes) {
($w, $h) = @$_;
add_proto qw/unsigned int/, "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", "const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse";
specialize "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", qw/ssse3/;
}
}
}
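As an aside, here is a minimal, hypothetical caller of the new 16x16 entry
point declared above (buffer names are illustrative; second_pred is a
contiguous 16x16 block and mask values lie in [0, 64]):

#include "./aom_dsp_rtcd.h"

static unsigned int masked_sad_16x16_example(
    const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
    const uint8_t *second_pred /* contiguous 16x16 */, const uint8_t *mask,
    int mask_stride) {
  // invert_mask == 0: 'ref' is weighted by the mask and 'second_pred' by
  // (64 - mask); invert_mask == 1 swaps the two roles.
  return aom_masked_sad16x16(src, src_stride, ref, ref_stride, second_pred,
                             mask, mask_stride, /*invert_mask=*/0);
}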
/*
* Copyright (c) 2017, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <stdio.h>
#include <tmmintrin.h>
#include "./aom_config.h"
#include "./aom_dsp_rtcd.h"
#include "aom_dsp/blend.h"
#include "aom/aom_integer.h"
#include "aom_dsp/x86/synonyms.h"
// For width a multiple of 16
static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
int src_stride,
const uint8_t *a_ptr, int a_stride,
const uint8_t *b_ptr, int b_stride,
const uint8_t *m_ptr, int m_stride,
int width, int height);
static INLINE unsigned int masked_sad8xh_ssse3(
const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
int height);
static INLINE unsigned int masked_sad4xh_ssse3(
const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
int height);
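// The wrapper macros below handle 'invert_mask' by swapping which of 'ref'
// and 'second_pred' gets weighted by the mask (the other gets 64 - mask).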
#define MASKSADMXN_SSSE3(m, n) \
unsigned int aom_masked_sad##m##x##n##_ssse3( \
const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \
int invert_mask) { \
if (!invert_mask) \
return masked_sad_ssse3(src, src_stride, ref, ref_stride, second_pred, \
m, msk, msk_stride, m, n); \
else \
return masked_sad_ssse3(src, src_stride, second_pred, m, ref, \
ref_stride, msk, msk_stride, m, n); \
}
#define MASKSAD8XN_SSSE3(n) \
unsigned int aom_masked_sad8x##n##_ssse3( \
const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \
int invert_mask) { \
if (!invert_mask) \
return masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, \
second_pred, 8, msk, msk_stride, n); \
else \
return masked_sad8xh_ssse3(src, src_stride, second_pred, 8, ref, \
ref_stride, msk, msk_stride, n); \
}
#define MASKSAD4XN_SSSE3(n) \
unsigned int aom_masked_sad4x##n##_ssse3( \
const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \
int invert_mask) { \
if (!invert_mask) \
return masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, \
second_pred, 4, msk, msk_stride, n); \
else \
return masked_sad4xh_ssse3(src, src_stride, second_pred, 4, ref, \
ref_stride, msk, msk_stride, n); \
}
#if CONFIG_EXT_PARTITION
MASKSADMXN_SSSE3(128, 128)
MASKSADMXN_SSSE3(128, 64)
MASKSADMXN_SSSE3(64, 128)
#endif // CONFIG_EXT_PARTITION
MASKSADMXN_SSSE3(64, 64)
MASKSADMXN_SSSE3(64, 32)
MASKSADMXN_SSSE3(32, 64)
MASKSADMXN_SSSE3(32, 32)
MASKSADMXN_SSSE3(32, 16)
MASKSADMXN_SSSE3(16, 32)
MASKSADMXN_SSSE3(16, 16)
MASKSADMXN_SSSE3(16, 8)
MASKSAD8XN_SSSE3(16)
MASKSAD8XN_SSSE3(8)
MASKSAD8XN_SSSE3(4)
MASKSAD4XN_SSSE3(8)
MASKSAD4XN_SSSE3(4)
static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
int src_stride,
const uint8_t *a_ptr, int a_stride,
const uint8_t *b_ptr, int b_stride,
const uint8_t *m_ptr, int m_stride,
int width, int height) {
int x, y;
__m128i res = _mm_setzero_si128();
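// mask_max is 64 (AOM_BLEND_A64_MAX_ALPHA); mask values lie in [0, 64].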
const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
for (y = 0; y < height; y++) {
for (x = 0; x < width; x += 16) {
const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]);
const __m128i m_inv = _mm_sub_epi8(mask_max, m);
// Calculate 16 predicted pixels.
// Note that the maximum value of any entry of 'pred_l' or 'pred_r'
// is 64 * 255, so we have plenty of space to add rounding constants.
const __m128i data_l = _mm_unpacklo_epi8(a, b);
const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv);
__m128i pred_l = _mm_maddubs_epi16(data_l, mask_l);
pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);
const __m128i data_r = _mm_unpackhi_epi8(a, b);
const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv);
__m128i pred_r = _mm_maddubs_epi16(data_r, mask_r);
pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);
const __m128i pred = _mm_packus_epi16(pred_l, pred_r);
res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));
}
src_ptr += src_stride;
a_ptr += a_stride;
b_ptr += b_stride;
m_ptr += m_stride;
}
// At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'.
int32_t sad =
_mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8));
return (sad + 31) >> 6;
}
static INLINE unsigned int masked_sad8xh_ssse3(
const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
int height) {
int y;
__m128i res = _mm_setzero_si128();
const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
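// Two rows are processed per iteration: the '_l' values come from row 0 and
// the '_r' values from row 1.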
for (y = 0; y < height; y += 2) {
const __m128i src = _mm_unpacklo_epi64(
_mm_loadl_epi64((const __m128i *)src_ptr),
_mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
const __m128i a0 = _mm_loadl_epi64((const __m128i *)a_ptr);
const __m128i a1 = _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]);
const __m128i b0 = _mm_loadl_epi64((const __m128i *)b_ptr);
const __m128i b1 = _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]);
const __m128i m =
_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr),
_mm_loadl_epi64((const __m128i *)&m_ptr[m_stride]));
const __m128i m_inv = _mm_sub_epi8(mask_max, m);
const __m128i data_l = _mm_unpacklo_epi8(a0, b0);
const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv);
__m128i pred_l = _mm_maddubs_epi16(data_l, mask_l);
pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);
const __m128i data_r = _mm_unpacklo_epi8(a1, b1);
const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv);
__m128i pred_r = _mm_maddubs_epi16(data_r, mask_r);
pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);
const __m128i pred = _mm_packus_epi16(pred_l, pred_r);
res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));
src_ptr += src_stride * 2;
a_ptr += a_stride * 2;
b_ptr += b_stride * 2;
m_ptr += m_stride * 2;
}
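// At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'.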
int32_t sad =
_mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8));
return (sad + 31) >> 6;
}
static INLINE unsigned int masked_sad4xh_ssse3(
const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
int height) {
int y;
__m128i res = _mm_setzero_si128();
const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
for (y = 0; y < height; y += 2) {
// Load two rows at a time, this seems to be a bit faster
// than four rows at a time in this case.
const __m128i src = _mm_unpacklo_epi32(
_mm_cvtsi32_si128(*(const uint32_t *)src_ptr),
_mm_cvtsi32_si128(*(const uint32_t *)&src_ptr[src_stride]));
const __m128i a =
_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const uint32_t *)a_ptr),
_mm_cvtsi32_si128(*(const uint32_t *)&a_ptr[a_stride]));
const __m128i b =
_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const uint32_t *)b_ptr),
_mm_cvtsi32_si128(*(const uint32_t *)&b_ptr[b_stride]));
const __m128i m =
_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const uint32_t *)m_ptr),
_mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride]));
const __m128i m_inv = _mm_sub_epi8(mask_max, m);
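// Blend both rows at once: maddubs computes m*a + (64 - m)*b for each of the
// 8 pixels, which is then rounded and packed back to 8 bits.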
const __m128i data = _mm_unpacklo_epi8(a, b);
const __m128i mask = _mm_unpacklo_epi8(m, m_inv);
__m128i pred_16bit = _mm_maddubs_epi16(data, mask);
pred_16bit = xx_roundn_epu16(pred_16bit, AOM_BLEND_A64_ROUND_BITS);
const __m128i pred = _mm_packus_epi16(pred_16bit, _mm_setzero_si128());
res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));
src_ptr += src_stride * 2;
a_ptr += a_stride * 2;
b_ptr += b_stride * 2;
m_ptr += m_stride * 2;
}
// At this point, the SAD is stored in lane 0 of 'res'
int32_t sad = _mm_cvtsi128_si32(res);
return (sad + 31) >> 6;
}
#if CONFIG_HIGHBITDEPTH
// For width a multiple of 8
static INLINE unsigned int highbd_masked_sad_ssse3(
const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
int width, int height);
static INLINE unsigned int highbd_masked_sad4xh_ssse3(
const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
int height);
#define HIGHBD_MASKSADMXN_SSSE3(m, n) \
unsigned int aom_highbd_masked_sad##m##x##n##_ssse3( \
const uint8_t *src8, int src_stride, const uint8_t *ref8, \
int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \
int msk_stride, int invert_mask) { \
if (!invert_mask) \
return highbd_masked_sad_ssse3(src8, src_stride, ref8, ref_stride, \
second_pred8, m, msk, msk_stride, m, n); \
else \
return highbd_masked_sad_ssse3(src8, src_stride, second_pred8, m, ref8, \
ref_stride, msk, msk_stride, m, n); \
}
#define HIGHBD_MASKSAD4XN_SSSE3(n) \
unsigned int aom_highbd_masked_sad4x##n##_ssse3( \
const uint8_t *src8, int src_stride, const uint8_t *ref8, \
int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \
int msk_stride, int invert_mask) { \
if (!invert_mask) \
return highbd_masked_sad4xh_ssse3(src8, src_stride, ref8, ref_stride, \
second_pred8, 4, msk, msk_stride, n); \
else \
return highbd_masked_sad4xh_ssse3(src8, src_stride, second_pred8, 4, \
ref8, ref_stride, msk, msk_stride, n); \
}
#if CONFIG_EXT_PARTITION
HIGHBD_MASKSADMXN_SSSE3(128, 128)
HIGHBD_MASKSADMXN_SSSE3(128, 64)
HIGHBD_MASKSADMXN_SSSE3(64, 128)
#endif // CONFIG_EXT_PARTITION
HIGHBD_MASKSADMXN_SSSE3(64, 64)
HIGHBD_MASKSADMXN_SSSE3(64, 32)
HIGHBD_MASKSADMXN_SSSE3(32, 64)
HIGHBD_MASKSADMXN_SSSE3(32, 32)
HIGHBD_MASKSADMXN_SSSE3(32, 16)
HIGHBD_MASKSADMXN_SSSE3(16, 32)
HIGHBD_MASKSADMXN_SSSE3(16, 16)
HIGHBD_MASKSADMXN_SSSE3(16, 8)
HIGHBD_MASKSADMXN_SSSE3(8, 16)
HIGHBD_MASKSADMXN_SSSE3(8, 8)
HIGHBD_MASKSADMXN_SSSE3(8, 4)
HIGHBD_MASKSAD4XN_SSSE3(8)
HIGHBD_MASKSAD4XN_SSSE3(4)
static INLINE unsigned int highbd_masked_sad_ssse3(
const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
int width, int height) {
const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
int x, y;
__m128i res = _mm_setzero_si128();
const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
const __m128i round_const =
_mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
const __m128i one = _mm_set1_epi16(1);
for (y = 0; y < height; y++) {
for (x = 0; x < width; x += 8) {
const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
// Zero-extend mask to 16 bits
const __m128i m = _mm_unpacklo_epi8(
_mm_loadl_epi64((const __m128i *)&m_ptr[x]), _mm_setzero_si128());
const __m128i m_inv = _mm_sub_epi16(mask_max, m);
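// With 16-bit pixels, _mm_madd_epi16 computes m*a + (64 - m)*b for one pixel
// per 32-bit lane; the result is rounded back down below.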
const __m128i data_l = _mm_unpacklo_epi16(a, b);
const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
__m128i pred_l = _mm_madd_epi16(data_l, mask_l);
pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
AOM_BLEND_A64_ROUND_BITS);
const __m128i data_r = _mm_unpackhi_epi16(a, b);
const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
__m128i pred_r = _mm_madd_epi16(data_r, mask_r);
pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
AOM_BLEND_A64_ROUND_BITS);
// Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
// so it is safe to do signed saturation here.
const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
// There is no 16-bit SAD instruction, so we have to synthesize
// an 8-element SAD. We do this by storing 4 32-bit partial SADs,
// and accumulating them at the end
const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));
}
src_ptr += src_stride;
a_ptr += a_stride;
b_ptr += b_stride;
m_ptr += m_stride;
}
// At this point, we have four 32-bit partial SADs stored in 'res'.
res = _mm_hadd_epi32(res, res);
res = _mm_hadd_epi32(res, res);
int sad = _mm_cvtsi128_si32(res);
return (sad + 31) >> 6;
}
static INLINE unsigned int highbd_masked_sad4xh_ssse3(
const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
int height) {
const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
int y;
__m128i res = _mm_setzero_si128();
const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
const __m128i round_const =
_mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
const __m128i one = _mm_set1_epi16(1);
for (y = 0; y < height; y += 2) {
const __m128i src = _mm_unpacklo_epi64(
_mm_loadl_epi64((const __m128i *)src_ptr),
_mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
const __m128i a =
_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)a_ptr),
_mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]));
const __m128i b =
_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)b_ptr),
_mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]));
// Zero-extend mask to 16 bits
const __m128i m = _mm_unpacklo_epi8(
_mm_unpacklo_epi32(
_mm_cvtsi32_si128(*(const uint32_t *)m_ptr),
_mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])),
_mm_setzero_si128());
const __m128i m_inv = _mm_sub_epi16(mask_max, m);
const __m128i data_l = _mm_unpacklo_epi16(a, b);
const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
__m128i pred_l = _mm_madd_epi16(data_l, mask_l);
pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
AOM_BLEND_A64_ROUND_BITS);
const __m128i data_r = _mm_unpackhi_epi16(a, b);
const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
__m128i pred_r = _mm_madd_epi16(data_r, mask_r);
pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
AOM_BLEND_A64_ROUND_BITS);
const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));
src_ptr += src_stride * 2;
a_ptr += a_stride * 2;
b_ptr += b_stride * 2;
m_ptr += m_stride * 2;
}
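// At this point, we have four 32-bit partial SADs stored in 'res'.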
res = _mm_hadd_epi32(res, res);
res = _mm_hadd_epi32(res, res);
int sad = _mm_cvtsi128_si32(res);
return (sad + 31) >> 6;
}
#endif  // CONFIG_HIGHBITDEPTH
@@ -25,7 +25,7 @@
using libaom_test::ACMRandom;
namespace {
const int number_of_iterations = 500;
const int number_of_iterations = 200;
typedef unsigned int (*MaskedSADFunc)(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
@@ -159,9 +159,7 @@ TEST_P(HighbdMaskedSADTest, OperationCheck) {
using std::tr1::make_tuple;
// TODO(david.barker): Re-enable this once we have vectorized
// versions of the masked_compound_* functions
#if 0 && HAVE_SSSE3
#if HAVE_SSSE3
INSTANTIATE_TEST_CASE_P(
SSSE3_C_COMPARE, MaskedSADTest,
::testing::Values(
@@ -221,5 +219,5 @@ INSTANTIATE_TEST_CASE_P(SSSE3_C_COMPARE, HighbdMaskedSADTest,
make_tuple(&aom_highbd_masked_sad4x4_ssse3,
&aom_highbd_masked_sad4x4_c)));
#endif // CONFIG_HIGHBITDEPTH
#endif // 0 && HAVE_SSSE3
#endif // HAVE_SSSE3
} // namespace
@@ -29,7 +29,7 @@
using libaom_test::ACMRandom;
namespace {
const int number_of_iterations = 500;
const int number_of_iterations = 200;
typedef unsigned int (*MaskedSubPixelVarianceFunc)(
const uint8_t *src, int src_stride, int xoffset, int yoffset,
@@ -217,15 +217,14 @@ TEST_P(HighbdMaskedSubPixelVarianceTest, OperationCheck) {
int xoffset, yoffset;
for (int i = 0; i < number_of_iterations; ++i) {
for (int j = 0; j < (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1); j++) {
src_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
ref_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
second_pred_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
msk_ptr[j] = rnd(65);
}
for (xoffset = 0; xoffset < BIL_SUBPEL_SHIFTS; xoffset++) {
for (yoffset = 0; yoffset < BIL_SUBPEL_SHIFTS; yoffset++) {
for (int j = 0; j < (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1); j++) {
src_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
ref_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
second_pred_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
msk_ptr[j] = rnd(65);
}
for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {
ref_ret = ref_func_(src8_ptr, src_stride, xoffset, yoffset, ref8_ptr,
ref_stride, second_pred8_ptr, msk_ptr, msk_stride,
@@ -319,9 +318,7 @@ TEST_P(HighbdMaskedSubPixelVarianceTest, ExtremeValues) {
using std::tr1::make_tuple;
// TODO(david.barker): Re-enable this once we have vectorized
// versions of the masked_compound_* functions
#if 0 && HAVE_SSSE3
#if HAVE_SSSE3
INSTANTIATE_TEST_CASE_P(
SSSE3_C_COMPARE, MaskedSubPixelVarianceTest,
::testing::Values(
@@ -490,5 +487,5 @@
AOM_BITS_12)));
#endif // CONFIG_HIGHBITDEPTH
#endif // 0 && HAVE_SSSE3
#endif // HAVE_SSSE3
} // namespace