Commit 72d3ba8a authored by Yue Chen

Add weighted motion search for obmc predictor

Also port SIMD optimization of weighted sad/variance functions to
av1.
Coding gain improvement: 0.339/0.413/0.328 (lowres/midres/hdres)
Current coding gain: 2.437/2.428/2.294
Encoding time overhead: 17% (soccer_cif) and 30% (ped_1080p25), up from
12% and 18% without the new motion search

Change-Id: I101d6ce729f769853756edc8ced6f3a2b8d8f824
parent dd8b9140
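For orientation: the new distortion measure is exposed through the osdf/ovf/osvf hooks added to aom_variance_fn_ptr_t below, and the encoder carries the weighted target and weights in the new wsrc_buf/mask_buf fields of MACROBLOCK. A minimal sketch of how a full-pel candidate could be scored with those pieces follows; the helper name is hypothetical, and the actual search entry points are av1_obmc_full_pixel_diamond() and av1_find_best_obmc_sub_pixel_tree_up(), declared in mcomp.h further down.

// Illustrative sketch only, not part of this change: scores one candidate
// predictor block with the new weighted (OBMC) SAD hook.  Assumes the usual
// encoder headers (av1/encoder/block.h, aom_dsp/variance.h) are included.
static unsigned int obmc_candidate_sad(const MACROBLOCK *x,
                                       const aom_variance_fn_ptr_t *fn_ptr,
                                       const uint8_t *pre, int pre_stride) {
  // wsrc_buf: per-pixel weighted prediction target, scaled by 4096.
  // mask_buf: the matching OBMC weights (also scaled by 4096).
  return fn_ptr->osdf(pre, pre_stride, x->wsrc_buf, x->mask_buf);
}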
......@@ -375,6 +375,11 @@ ifeq ($(CONFIG_USE_X86INC),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm
endif # CONFIG_USE_X86INC
endif # CONFIG_AOM_HIGHBITDEPTH
ifeq ($(CONFIG_MOTION_VAR),yes)
DSP_SRCS-$(HAVE_SSE4_1) += x86/obmc_sad_sse4.c
DSP_SRCS-$(HAVE_SSE4_1) += x86/obmc_variance_sse4.c
endif # CONFIG_MOTION_VAR
endif # CONFIG_ENCODERS
DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes)
......
......@@ -20,6 +20,10 @@
extern "C" {
#endif
#ifndef MAX_SB_SIZE
#define MAX_SB_SIZE 64
#endif
#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
......
......@@ -50,6 +50,14 @@ if ($opts{arch} eq "x86_64") {
$avx2_x86_64 = 'avx2';
}
@block_widths = (4, 8, 16, 32, 64);
@block_sizes = ();
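# Keep only the block sizes whose aspect ratio is at most 2:1
# (e.g. 64x32 is kept, 64x16 is not).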
foreach $w (@block_widths) {
foreach $h (@block_widths) {
push @block_sizes, [$w, $h] if ($w <= 2*$h && $h <= 2*$w) ;
}
}
#
# Intra prediction
#
......@@ -1029,6 +1037,50 @@ specialize qw/aom_sad4x8 msa/, "$sse2_x86inc";
add_proto qw/unsigned int aom_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/aom_sad4x4 neon msa/, "$sse2_x86inc";
#
# OBMC SAD
#
if (aom_config("CONFIG_MOTION_VAR") eq "yes") {
foreach (@block_sizes) {
($w, $h) = @$_;
add_proto qw/unsigned int/, "aom_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
specialize "aom_obmc_sad${w}x${h}", qw/sse4_1/;
}
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
foreach (@block_sizes) {
($w, $h) = @$_;
add_proto qw/unsigned int/, "aom_highbd_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1/;
}
}
}
#
# OBMC Variance / OBMC Subpixel Variance
#
if (aom_config("CONFIG_MOTION_VAR") eq "yes") {
foreach (@block_sizes) {
($w, $h) = @$_;
add_proto qw/unsigned int/, "aom_obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
add_proto qw/unsigned int/, "aom_obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
specialize "aom_obmc_variance${w}x${h}", q/sse4_1/;
specialize "aom_obmc_sub_pixel_variance${w}x${h}";
}
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
foreach $bd ("_", "_10_", "_12_") {
foreach (@block_sizes) {
($w, $h) = @$_;
add_proto qw/unsigned int/, "aom_highbd${bd}obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
add_proto qw/unsigned int/, "aom_highbd${bd}obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
specialize "aom_highbd${bd}obmc_variance${w}x${h}", qw/sse4_1/;
specialize "aom_highbd${bd}obmc_sub_pixel_variance${w}x${h}";
}
}
}
}
#
# Avg
#
......
......@@ -318,3 +318,81 @@ highbd_sadMxNx4D(4, 4)
/* clang-format on */
#endif // CONFIG_AOM_HIGHBITDEPTH
#if CONFIG_MOTION_VAR
// pre: predictor being evaluated
// wsrc: target weighted prediction (has been *4096 to keep precision)
// mask: 2d weights (scaled by 4096)
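// Since both wsrc and pre * mask are scaled by 4096 = 2^12, the rounded
// 12-bit shift below brings each absolute difference back to pixel scale.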
static INLINE
unsigned int obmc_sad(const uint8_t *pre, int pre_stride,
const int32_t *wsrc, const int32_t *mask, int width,
int height) {
int y, x;
unsigned int sad = 0;
for (y = 0; y < height; y++) {
for (x = 0; x < width; x++)
sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12);
pre += pre_stride;
wsrc += width;
mask += width;
}
return sad;
}
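// Generates the per-block-size C wrappers, e.g. aom_obmc_sad64x64_c().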
#define OBMC_SADMxN(m, n) \
unsigned int aom_obmc_sad##m##x##n##_c(const uint8_t *ref, int ref_stride, \
const int32_t *wsrc, \
const int32_t *mask) { \
return obmc_sad(ref, ref_stride, wsrc, mask, m, n); \
}
OBMC_SADMxN(64, 64) OBMC_SADMxN(64, 32) OBMC_SADMxN(32, 64) OBMC_SADMxN(32, 32)
OBMC_SADMxN(32, 16) OBMC_SADMxN(16, 32) OBMC_SADMxN(16, 16)
OBMC_SADMxN(16, 8) OBMC_SADMxN(8, 16) OBMC_SADMxN(8, 8)
OBMC_SADMxN(8, 4) OBMC_SADMxN(4, 8) OBMC_SADMxN(4, 4)
#if CONFIG_AOM_HIGHBITDEPTH
static INLINE
unsigned int highbd_obmc_sad(const uint8_t *pre8, int pre_stride,
const int32_t *wsrc, const int32_t *mask,
int width, int height) {
int y, x;
unsigned int sad = 0;
const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
for (y = 0; y < height; y++) {
for (x = 0; x < width; x++)
sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12);
pre += pre_stride;
wsrc += width;
mask += width;
}
return sad;
}
#define HIGHBD_OBMC_SADMXN(m, n) \
unsigned int aom_highbd_obmc_sad##m##x##n##_c( \
const uint8_t *ref, int ref_stride, const int32_t *wsrc, \
const int32_t *mask) { \
return highbd_obmc_sad(ref, ref_stride, wsrc, mask, m, n); \
}
HIGHBD_OBMC_SADMXN(64, 64)
HIGHBD_OBMC_SADMXN(64, 32)
HIGHBD_OBMC_SADMXN(32, 64)
HIGHBD_OBMC_SADMXN(32, 32)
HIGHBD_OBMC_SADMXN(32, 16)
HIGHBD_OBMC_SADMXN(16, 32)
HIGHBD_OBMC_SADMXN(16, 16)
HIGHBD_OBMC_SADMXN(16, 8)
HIGHBD_OBMC_SADMXN(8, 16)
HIGHBD_OBMC_SADMXN(8, 8)
HIGHBD_OBMC_SADMXN(8, 4)
HIGHBD_OBMC_SADMXN(4, 8)
HIGHBD_OBMC_SADMXN(4, 4)
#endif // CONFIG_AOM_HIGHBITDEPTH
#endif // CONFIG_MOTION_VAR
......@@ -50,6 +50,20 @@ typedef unsigned int (*aom_subpixvariance_fn_t)(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse);
#if CONFIG_AV1 && CONFIG_MOTION_VAR
typedef unsigned int (*aom_obmc_sad_fn_t)(const uint8_t *pred, int pred_stride,
const int32_t *wsrc,
const int32_t *msk);
typedef unsigned int (*aom_obmc_variance_fn_t)(const uint8_t *pred,
int pred_stride,
const int32_t *wsrc,
const int32_t *msk,
unsigned int *sse);
typedef unsigned int (*aom_obmc_subpixvariance_fn_t)(
const uint8_t *pred, int pred_stride, int xoffset, int yoffset,
const int32_t *wsrc, const int32_t *msk, unsigned int *sse);
#endif // CONFIG_AV1 && CONFIG_MOTION_VAR
typedef unsigned int (*aom_subp_avg_variance_fn_t)(
const uint8_t *a_ptr, int a_stride, int xoffset, int yoffset,
const uint8_t *b_ptr, int b_stride, unsigned int *sse,
......@@ -64,6 +78,11 @@ typedef struct aom_variance_vtable {
aom_sad_multi_fn_t sdx3f;
aom_sad_multi_fn_t sdx8f;
aom_sad_multi_d_fn_t sdx4df;
#if CONFIG_MOTION_VAR
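// Weighted (OBMC) SAD, variance and sub-pixel variance hooks.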
aom_obmc_sad_fn_t osdf;
aom_obmc_variance_fn_t ovf;
aom_obmc_subpixvariance_fn_t osvf;
#endif // CONFIG_MOTION_VAR
} aom_variance_fn_ptr_t;
#endif // CONFIG_AV1
......
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <assert.h>
#include <immintrin.h>
#include "./aom_config.h"
#include "aom_ports/mem.h"
#include "aom/aom_integer.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/x86/synonyms.h"
////////////////////////////////////////////////////////////////////////////////
// 8 bit
////////////////////////////////////////////////////////////////////////////////
static INLINE unsigned int obmc_sad_w4(const uint8_t *pre, const int pre_stride,
const int32_t *wsrc, const int32_t *mask,
const int height) {
const int pre_step = pre_stride - 4;
int n = 0;
__m128i v_sad_d = _mm_setzero_si128();
do {
const __m128i v_p_b = xx_loadl_32(pre + n);
const __m128i v_m_d = xx_load_128(mask + n);
const __m128i v_w_d = xx_load_128(wsrc + n);
const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);
// Values in both pre and mask fit in 15 bits, and are packed at 32 bit
// boundaries. We use pmaddwd, as it has lower latency on Haswell
// than pmulld but produces the same result with these inputs.
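// (With the upper 16 bits of every lane zero, pmaddwd computes
// lo*lo + 0*0 per lane, which equals the full 32-bit product.)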
const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);
// Rounded absolute difference
const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12);
v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d);
n += 4;
if (n % 4 == 0) pre += pre_step;
} while (n < 4 * height);
return xx_hsum_epi32_si32(v_sad_d);
}
static INLINE unsigned int obmc_sad_w8n(const uint8_t *pre,
const int pre_stride,
const int32_t *wsrc,
const int32_t *mask, const int width,
const int height) {
const int pre_step = pre_stride - width;
int n = 0;
__m128i v_sad_d = _mm_setzero_si128();
assert(width >= 8);
assert(IS_POWER_OF_TWO(width));
do {
const __m128i v_p1_b = xx_loadl_32(pre + n + 4);
const __m128i v_m1_d = xx_load_128(mask + n + 4);
const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
const __m128i v_p0_b = xx_loadl_32(pre + n);
const __m128i v_m0_d = xx_load_128(mask + n);
const __m128i v_w0_d = xx_load_128(wsrc + n);
const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);
const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);
// Values in both pre and mask fit in 15 bits, and are packed at 32 bit
// boundaries. We use pmaddwd, as it has lower latency on Haswell
// than pmulld but produces the same result with these inputs.
const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);
// Rounded absolute difference
const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12);
const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12);
v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d);
v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d);
n += 8;
if (n % width == 0) pre += pre_step;
} while (n < width * height);
return xx_hsum_epi32_si32(v_sad_d);
}
#define OBMCSADWXH(w, h) \
unsigned int aom_obmc_sad##w##x##h##_sse4_1( \
const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
const int32_t *msk) { \
if (w == 4) { \
return obmc_sad_w4(pre, pre_stride, wsrc, msk, h); \
} else { \
return obmc_sad_w8n(pre, pre_stride, wsrc, msk, w, h); \
} \
}
OBMCSADWXH(64, 64)
OBMCSADWXH(64, 32)
OBMCSADWXH(32, 64)
OBMCSADWXH(32, 32)
OBMCSADWXH(32, 16)
OBMCSADWXH(16, 32)
OBMCSADWXH(16, 16)
OBMCSADWXH(16, 8)
OBMCSADWXH(8, 16)
OBMCSADWXH(8, 8)
OBMCSADWXH(8, 4)
OBMCSADWXH(4, 8)
OBMCSADWXH(4, 4)
////////////////////////////////////////////////////////////////////////////////
// High bit-depth
////////////////////////////////////////////////////////////////////////////////
#if CONFIG_AOM_HIGHBITDEPTH
static INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8,
const int pre_stride,
const int32_t *wsrc,
const int32_t *mask,
const int height) {
const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
const int pre_step = pre_stride - 4;
int n = 0;
__m128i v_sad_d = _mm_setzero_si128();
do {
const __m128i v_p_w = xx_loadl_64(pre + n);
const __m128i v_m_d = xx_load_128(mask + n);
const __m128i v_w_d = xx_load_128(wsrc + n);
const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w);
// Values in both pre and mask fit in 15 bits, and are packed at 32 bit
// boundaries. We use pmaddwd, as it has lower latency on Haswell
// than pmulld but produces the same result with these inputs.
const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);
// Rounded absolute difference
const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12);
v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d);
n += 4;
if (n % 4 == 0) pre += pre_step;
} while (n < 4 * height);
return xx_hsum_epi32_si32(v_sad_d);
}
static INLINE unsigned int hbd_obmc_sad_w8n(const uint8_t *pre8,
const int pre_stride,
const int32_t *wsrc,
const int32_t *mask,
const int width, const int height) {
const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
const int pre_step = pre_stride - width;
int n = 0;
__m128i v_sad_d = _mm_setzero_si128();
assert(width >= 8);
assert(IS_POWER_OF_TWO(width));
do {
const __m128i v_p1_w = xx_loadl_64(pre + n + 4);
const __m128i v_m1_d = xx_load_128(mask + n + 4);
const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
const __m128i v_p0_w = xx_loadl_64(pre + n);
const __m128i v_m0_d = xx_load_128(mask + n);
const __m128i v_w0_d = xx_load_128(wsrc + n);
const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);
const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);
// Values in both pre and mask fit in 15 bits, and are packed at 32 bit
// boundaries. We use pmaddwd, as it has lower latency on Haswell
// than pmulld but produces the same result with these inputs.
const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);
// Rounded absolute difference
const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12);
const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12);
v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d);
v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d);
n += 8;
if (n % width == 0) pre += pre_step;
} while (n < width * height);
return xx_hsum_epi32_si32(v_sad_d);
}
#define HBD_OBMCSADWXH(w, h) \
unsigned int aom_highbd_obmc_sad##w##x##h##_sse4_1( \
const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
const int32_t *mask) { \
if (w == 4) { \
return hbd_obmc_sad_w4(pre, pre_stride, wsrc, mask, h); \
} else { \
return hbd_obmc_sad_w8n(pre, pre_stride, wsrc, mask, w, h); \
} \
}
HBD_OBMCSADWXH(64, 64)
HBD_OBMCSADWXH(64, 32)
HBD_OBMCSADWXH(32, 64)
HBD_OBMCSADWXH(32, 32)
HBD_OBMCSADWXH(32, 16)
HBD_OBMCSADWXH(16, 32)
HBD_OBMCSADWXH(16, 16)
HBD_OBMCSADWXH(16, 8)
HBD_OBMCSADWXH(8, 16)
HBD_OBMCSADWXH(8, 8)
HBD_OBMCSADWXH(8, 4)
HBD_OBMCSADWXH(4, 8)
HBD_OBMCSADWXH(4, 4)
#endif // CONFIG_AOM_HIGHBITDEPTH
......@@ -67,4 +67,46 @@ static INLINE __m128i xx_roundn_epu16(__m128i v_val_w, int bits) {
return _mm_avg_epu16(v_s_w, _mm_setzero_si128());
}
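// Unsigned rounding right shift: (v + (1 << (bits - 1))) >> bits.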
static INLINE __m128i xx_roundn_epu32(__m128i v_val_d, int bits) {
const __m128i v_bias_d = _mm_set1_epi32(1 << (bits - 1));
const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
return _mm_srli_epi32(v_tmp_d, bits);
}
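// Signed rounding right shift, matching ROUND_POWER_OF_TWO_SIGNED():
// ties round away from zero.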
static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) {
const __m128i v_bias_d = _mm_set1_epi32(1 << (bits - 1));
const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31);
const __m128i v_tmp_d =
_mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), v_sign_d);
return _mm_srai_epi32(v_tmp_d, bits);
}
#ifdef __SSSE3__
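// Horizontal sum of the four 32-bit lanes, returned as a 32-bit value.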
static INLINE int32_t xx_hsum_epi32_si32(__m128i v_d) {
v_d = _mm_hadd_epi32(v_d, v_d);
v_d = _mm_hadd_epi32(v_d, v_d);
return _mm_cvtsi128_si32(v_d);
}
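// Horizontal sum of the two 64-bit lanes.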
static INLINE int64_t xx_hsum_epi64_si64(__m128i v_q) {
v_q = _mm_add_epi64(v_q, _mm_srli_si128(v_q, 8));
#if ARCH_X86_64
return _mm_cvtsi128_si64(v_q);
#else
{
int64_t tmp;
_mm_storel_epi64((__m128i *)&tmp, v_q);
return tmp;
}
#endif
}
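// Sign-extend the four 32-bit lanes to 64 bits before summing, so the
// result cannot overflow a 32-bit accumulator.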
static INLINE int64_t xx_hsum_epi32_si64(__m128i v_d) {
const __m128i v_sign_d = _mm_cmplt_epi32(v_d, _mm_setzero_si128());
const __m128i v_0_q = _mm_unpacklo_epi32(v_d, v_sign_d);
const __m128i v_1_q = _mm_unpackhi_epi32(v_d, v_sign_d);
return xx_hsum_epi64_si64(_mm_add_epi64(v_0_q, v_1_q));
}
#endif // __SSSE3__
#endif // AOM_DSP_X86_SYNONYMS_H_
......@@ -41,6 +41,11 @@
/* Shift down with rounding */
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))
/* Shift down with rounding for signed integers, for use when n >= 0 */
#define ROUND_POWER_OF_TWO_SIGNED(value, n) \
(((value) < 0) ? -ROUND_POWER_OF_TWO(-(value), (n)) \
: ROUND_POWER_OF_TWO((value), (n)))
#define ALIGN_POWER_OF_TWO(value, n) \
(((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
......
......@@ -18,6 +18,8 @@
#ifdef __cplusplus
extern "C" {
#endif
#undef MAX_SB_SIZE
#define MAX_SB_SIZE_LOG2 6
#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
#define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
......
......@@ -114,6 +114,10 @@ struct macroblock {
int *nmvsadcost[2];
int *nmvsadcost_hp[2];
int **mvsadcost;
#if CONFIG_MOTION_VAR
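// Weighted prediction target (wsrc, scaled by 4096) and OBMC weights (mask)
// for the block currently being searched.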
int32_t *wsrc_buf;
int32_t *mask_buf;
#endif // CONFIG_MOTION_VAR
// These define limits to motion vector components to prevent them
// from extending outside the UMV borders
......
......@@ -133,6 +133,20 @@ int av1_full_pixel_search(const struct AV1_COMP *cpi, MACROBLOCK *x,
int error_per_bit, int *cost_list, const MV *ref_mv,
MV *tmp_mv, int var_max, int rd);
#if CONFIG_MOTION_VAR
int av1_obmc_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x,
MV *mvp_full, int step_param, int sadpb,
int further_steps, int do_refine,
const aom_variance_fn_ptr_t *fn_ptr,
const MV *ref_mv, MV *dst_mv, int is_second);
int av1_find_best_obmc_sub_pixel_tree_up(
const struct AV1_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col,
MV *bestmv, const MV *ref_mv, int allow_hp, int error_per_bit,
const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1,
int is_second, int use_upsampled_ref);
#endif // CONFIG_MOTION_VAR
#ifdef __cplusplus
} // extern "C"
#endif
......
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "test/function_equivalence_test.h"
#include "test/register_state_check.h"
#include "./aom_config.h"
#include "./aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
#define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
using libaom_test::FunctionEquivalenceTest;
namespace {
static const int kIterations = 1000;
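// Maximum OBMC weight; the mask and wsrc test inputs below are scaled by
// up to kMaskMax * kMaskMax (4096).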
static const int kMaskMax = 64;
typedef unsigned int (*ObmcSadF)(const uint8_t *pre, int pre_stride,
const int32_t *wsrc, const int32_t *mask);
typedef libaom_test::FuncParam<ObmcSadF> TestFuncs;
////////////////////////////////////////////////////////////////////////////////
// 8 bit
////////////////////////////////////////////////////////////////////////////////
class ObmcSadTest : public FunctionEquivalenceTest<ObmcSadF> {};
TEST_P(ObmcSadTest, RandomValues) {
DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
const int pre_stride = rng_(MAX_SB_SIZE + 1);
for (int i = 0; i < MAX_SB_SQUARE; ++i) {
pre[i] = rng_.Rand8();
wsrc[i] = rng_.Rand8() * rng_(kMaskMax * kMaskMax + 1);
mask[i] = rng_(kMaskMax * kMaskMax + 1);
}
const unsigned int ref_res = params_.ref_func(pre, pre_stride, wsrc, mask);
unsigned int tst_res;
ASM_REGISTER_STATE_CHECK(tst_res =
params_.tst_func(pre, pre_stride, wsrc, mask));
ASSERT_EQ(ref_res, tst_res);
}
}
TEST_P(ObmcSadTest, ExtremeValues) {
DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
for (int iter = 0; iter < MAX_SB_SIZE && !HasFatalFailure(); ++iter) {
const int pre_stride = iter;
for (int i = 0; i < MAX_SB_SQUARE; ++i) {
pre[i] = UINT8_MAX;
wsrc[i] = UINT8_MAX * kMaskMax * kMaskMax;
mask[i] = kMaskMax * kMaskMax;
}
const unsigned int ref_res = params_.ref_func(pre, pre_stride, wsrc, mask);
unsigned int tst_res;
ASM_REGISTER_STATE_CHECK(tst_res =
params_.tst_func(pre, pre_stride, wsrc, mask));
ASSERT_EQ(ref_res, tst_res);
}
}
#if HAVE_SSE4_1
const ObmcSadTest::ParamType sse4_functions[] = {
TestFuncs(aom_obmc_sad64x64_c, aom_obmc_sad64x64_sse4_1),
TestFuncs(aom_obmc_sad64x32_c, aom_obmc_sad64x32_sse4_1),
TestFuncs(aom_obmc_sad32x64_c, aom_obmc_sad32x64_sse4_1),
TestFuncs(aom_obmc_sad32x32_c, aom_obmc_sad32x32_sse4_1),
TestFuncs(aom_obmc_sad32x16_c, aom_obmc_sad32x16_sse4_1),
TestFuncs(aom_obmc_sad16x32_c, aom_obmc_sad16x32_sse4_1),
TestFuncs(aom_obmc_sad16x16_c, aom_obmc_sad16x16_sse4_1),
TestFuncs(aom_obmc_sad16x8_c, aom_obmc_sad16x8_sse4_1),
TestFuncs(aom_obmc_sad8x16_c, aom_obmc_sad8x16_sse4_1),
TestFuncs(aom_obmc_sad8x8_c, aom_obmc_sad8x8_sse4_1),
TestFuncs(aom_obmc_sad8x4_c, aom_obmc_sad8x4_sse4_1),
TestFuncs(aom_obmc_sad4x8_c, aom_obmc_sad4x8_sse4_1),
TestFuncs(aom_obmc_sad4x4_c, aom_obmc_sad4x4_sse4_1)
};
INSTANTIATE_TEST_CASE_P(SSE4_1_C_COMPARE, ObmcSadTest,
::testing::ValuesIn(sse4_functions));
#endif // HAVE_SSE4_1
////////////////////////////////////////////////////////////////////////////////
// High bit-depth
////////////////////////////////////////////////////////////////////////////////
#if CONFIG_AOM_HIGHBITDEPTH
class ObmcSadHBDTest : public FunctionEquivalenceTest<ObmcSadF> {};
TEST_P(ObmcSadHBDTest, RandomValues) {
DECLARE_ALIGNED(32, uint16_t, pre[MAX_SB_SQUARE]);
DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
const int pre_stride = rng_(MAX_SB_SIZE + 1);
for (int i = 0; i < MAX_SB_SQUARE; ++i) {
pre[i] = rng_(1 << 12);
wsrc[i] = rng_(1 << 12) * rng_(kMaskMax * kMaskMax + 1);
mask[i] = rng_(kMaskMax * kMaskMax + 1);
}
const unsigned int ref_res =
params_.ref_func(CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask);
unsigned int tst_res;
ASM_REGISTER_STATE_CHECK(
tst_res =
params_.tst_func(CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask));
ASSERT_EQ(ref_res, tst_res);
}
}
TEST_P(ObmcSadHBDTest, ExtremeValues) {
DECLARE_ALIGNED(32, uint16_t, pre[MAX_SB_SQUARE]);
DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
for (int iter = 0; iter < MAX_SB_SIZE && !HasFatalFailure(); ++iter) {
const int pre_stride = iter;
for (int i = 0; i < MAX_SB_SQUARE; ++i) {
pre[i] = (1 << 12) - 1;
wsrc[i] = ((1 << 12) - 1) * kMaskMax * kMaskMax;
mask[i] = kMaskMax * kMaskMax;
}