Commit 04cef497 authored by Yi Luo
Browse files

Speed up convolve_round post-rounding by avx2

- Decoder convolve rounding cycle percentage drops from
  2.75% to 0.91% by using avx2 function on i7-6700.

Change-Id: I34ae48f45c0b4073f8962647d2181365ffe3325b
parent e0794b51
...@@ -164,6 +164,7 @@ set(AOM_AV1_COMMON_INTRIN_SSE4_1 ...@@ -164,6 +164,7 @@ set(AOM_AV1_COMMON_INTRIN_SSE4_1
"${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_sse4.c") "${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_sse4.c")
set(AOM_AV1_COMMON_INTRIN_AVX2 set(AOM_AV1_COMMON_INTRIN_AVX2
"${AOM_ROOT}/av1/common/x86/convolve_avx2.c"
"${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_avx2.c" "${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_avx2.c"
"${AOM_ROOT}/av1/common/x86/hybrid_inv_txfm_avx2.c") "${AOM_ROOT}/av1/common/x86/hybrid_inv_txfm_avx2.c")
......
...@@ -73,6 +73,7 @@ AV1_COMMON_SRCS-yes += common/av1_fwd_txfm2d.c ...@@ -73,6 +73,7 @@ AV1_COMMON_SRCS-yes += common/av1_fwd_txfm2d.c
AV1_COMMON_SRCS-yes += common/av1_fwd_txfm1d_cfg.h AV1_COMMON_SRCS-yes += common/av1_fwd_txfm1d_cfg.h
AV1_COMMON_SRCS-yes += common/av1_inv_txfm2d.c AV1_COMMON_SRCS-yes += common/av1_inv_txfm2d.c
AV1_COMMON_SRCS-yes += common/av1_inv_txfm1d_cfg.h AV1_COMMON_SRCS-yes += common/av1_inv_txfm1d_cfg.h
AV1_COMMON_SRCS-$(HAVE_AVX2) += common/x86/convolve_avx2.c
AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/av1_convolve_ssse3.c AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/av1_convolve_ssse3.c
ifeq ($(CONFIG_HIGHBITDEPTH),yes) ifeq ($(CONFIG_HIGHBITDEPTH),yes)
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_highbd_convolve_sse4.c AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_highbd_convolve_sse4.c
......
...@@ -656,10 +656,14 @@ if (aom_config("CONFIG_LOOP_RESTORATION") eq "yes") { ...@@ -656,10 +656,14 @@ if (aom_config("CONFIG_LOOP_RESTORATION") eq "yes") {
if (aom_config("CONFIG_CONVOLVE_ROUND") eq "yes") { if (aom_config("CONFIG_CONVOLVE_ROUND") eq "yes") {
add_proto qw/void av1_convolve_2d/, "const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; add_proto qw/void av1_convolve_2d/, "const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
specialize qw/av1_convolve_2d sse2/; specialize qw/av1_convolve_2d sse2/;
add_proto qw/void av1_convolve_rounding/, "const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits";
specialize qw/av1_convolve_rounding avx2/;
if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
add_proto qw/void av1_highbd_convolve_2d/, "const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; add_proto qw/void av1_highbd_convolve_2d/, "const uint16_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
specialize qw/av1_highbd_convolve_2d ssse3/; specialize qw/av1_highbd_convolve_2d ssse3/;
add_proto qw/void av1_highbd_convolve_rounding/, "const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits, int bd";
specialize qw/av1_highbd_convolve_rounding avx2/;
} }
} }
......
...@@ -309,8 +309,8 @@ void av1_convolve_vert_facade_scale(const uint8_t *src, int src_stride, ...@@ -309,8 +309,8 @@ void av1_convolve_vert_facade_scale(const uint8_t *src, int src_stride,
} }
#if CONFIG_CONVOLVE_ROUND #if CONFIG_CONVOLVE_ROUND
void av1_convolve_rounding(const int32_t *src, int src_stride, uint8_t *dst, void av1_convolve_rounding_c(const int32_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h, int bits) { int dst_stride, int w, int h, int bits) {
int r, c; int r, c;
for (r = 0; r < h; ++r) { for (r = 0; r < h; ++r) {
for (c = 0; c < w; ++c) { for (c = 0; c < w; ++c) {
...@@ -508,9 +508,9 @@ static INLINE void transpose_uint16(uint16_t *dst, int dst_stride, ...@@ -508,9 +508,9 @@ static INLINE void transpose_uint16(uint16_t *dst, int dst_stride,
for (c = 0; c < w; ++c) dst[c * dst_stride + r] = src[r * src_stride + c]; for (c = 0; c < w; ++c) dst[c * dst_stride + r] = src[r * src_stride + c];
} }
void av1_highbd_convolve_rounding(const int32_t *src, int src_stride, void av1_highbd_convolve_rounding_c(const int32_t *src, int src_stride,
uint8_t *dst8, int dst_stride, int w, int h, uint8_t *dst8, int dst_stride, int w, int h,
int bits, int bd) { int bits, int bd) {
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
int r, c; int r, c;
for (r = 0; r < h; ++r) { for (r = 0; r < h; ++r) {
......
...@@ -77,14 +77,7 @@ static INLINE ConvolveParams get_conv_params_no_round(int ref, int do_average, ...@@ -77,14 +77,7 @@ static INLINE ConvolveParams get_conv_params_no_round(int ref, int do_average,
return conv_params; return conv_params;
} }
void av1_convolve_rounding(const int32_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h, int bits);
#if CONFIG_HIGHBITDEPTH #if CONFIG_HIGHBITDEPTH
void av1_highbd_convolve_rounding(const int32_t *src, int src_stride,
uint8_t *dst8, int dst_stride, int w, int h,
int bits, int bd);
void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride, void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
uint8_t *dst, int dst_stride, int w, int h, uint8_t *dst, int dst_stride, int w, int h,
const InterpFilter *interp_filter, const InterpFilter *interp_filter,
......
/*
* Copyright (c) 2017, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <immintrin.h>
#include <string.h>
#include "aom_dsp/aom_dsp_common.h"
#include "./av1_rtcd.h"
#if CONFIG_CONVOLVE_ROUND
// Dword indices used as the control vector of _mm256_permutevar8x32_epi32 by
// the 8-bit rounding helpers in this file: output dword i is taken from input
// dword sindex[i], i.e. the dwords of the low 128-bit lane (0..3) are
// interleaved with those of the high lane (4..7).
static const uint32_t sindex[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
// Clamps the 16 signed 16-bit pixels in *u, in place, to [0, (1 << bd) - 1].
//
// u:  in/out vector of 16 epi16 pixel values.
// bd: bit depth; the upper bound is built in 16-bit lanes, so it is only
//     representable as a positive value for bd <= 15.
static INLINE void pixel_clamp_avx2(__m256i *u, int bd) {
  const __m256i one = _mm256_set1_epi16(1);
  const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one);
  const __m256i zero = _mm256_setzero_si256();
  // Signed min/max give the same result as the compare-and-mask sequence:
  // values above max become max, then values below zero become zero.
  *u = _mm256_max_epi16(_mm256_min_epi16(*u, max), zero);
}
// Clamps the 8 signed 16-bit pixels in *u, in place, to [0, (1 << bd) - 1].
//
// u:  in/out vector of 8 epi16 pixel values.
// bd: bit depth; the upper bound is built in 16-bit lanes, so it is only
//     representable as a positive value for bd <= 15.
static INLINE void pixel_clamp_sse2(__m128i *u, int bd) {
  const __m128i one = _mm_set1_epi16(1);
  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
  const __m128i zero = _mm_setzero_si128();
  // Signed min/max give the same result as the compare-and-mask sequence:
  // values above max become max, then values below zero become zero.
  *u = _mm_max_epi16(_mm_min_epi16(*u, max), zero);
}
// Rounds `num` consecutive groups of 32 int32 values from src into 32 uint8
// pixels each at dst: adds *rnd, arithmetic-shifts right by `shift`, then
// narrows with saturation. At least one group is always processed.
static INLINE void cal_rounding_32xn_avx2(const int32_t *src, uint8_t *dst,
                                          const __m256i *rnd, int shift,
                                          int num) {
  // Permutation that undoes the per-lane interleaving left by the packs.
  const __m256i order = _mm256_loadu_si256((const __m256i *)sindex);
  int groups_left = num;
  do {
    const __m256i *in = (const __m256i *)src;
    __m256i a = _mm256_add_epi32(_mm256_loadu_si256(in + 0), *rnd);
    __m256i b = _mm256_add_epi32(_mm256_loadu_si256(in + 1), *rnd);
    __m256i c = _mm256_add_epi32(_mm256_loadu_si256(in + 2), *rnd);
    __m256i d = _mm256_add_epi32(_mm256_loadu_si256(in + 3), *rnd);

    a = _mm256_srai_epi32(a, shift);
    b = _mm256_srai_epi32(b, shift);
    c = _mm256_srai_epi32(c, shift);
    d = _mm256_srai_epi32(d, shift);

    // 32-bit -> 16-bit with signed saturation, then clamp to 8-bit range.
    __m256i words_ab = _mm256_packs_epi32(a, b);
    __m256i words_cd = _mm256_packs_epi32(c, d);
    pixel_clamp_avx2(&words_ab, 8);
    pixel_clamp_avx2(&words_cd, 8);

    // 16-bit -> 8-bit, then restore pixel order across the two lanes.
    const __m256i bytes = _mm256_packus_epi16(words_ab, words_cd);
    _mm256_storeu_si256((__m256i *)dst,
                        _mm256_permutevar8x32_epi32(bytes, order));

    src += 32;
    dst += 32;
  } while (--groups_left > 0);
}
// Rounds 16 int32 values at src into 16 uint8 pixels at dst: adds *rnd,
// arithmetic-shifts right by `shift`, then narrows with saturation.
static INLINE void cal_rounding_16_avx2(const int32_t *src, uint8_t *dst,
                                        const __m256i *rnd, int shift) {
  const __m256i *in = (const __m256i *)src;
  const __m256i order = _mm256_loadu_si256((const __m256i *)sindex);

  const __m256i lo =
      _mm256_srai_epi32(_mm256_add_epi32(_mm256_loadu_si256(in), *rnd), shift);
  const __m256i hi = _mm256_srai_epi32(
      _mm256_add_epi32(_mm256_loadu_si256(in + 1), *rnd), shift);

  // 32-bit -> 16-bit with signed saturation, then clamp to 8-bit range.
  __m256i words = _mm256_packs_epi32(lo, hi);
  pixel_clamp_avx2(&words, 8);

  // 16-bit -> 8-bit; the permutation gathers the 16 pixels, in order, into
  // the low 128 bits, which is all that gets stored.
  const __m256i bytes = _mm256_packus_epi16(words, words);
  const __m256i sorted = _mm256_permutevar8x32_epi32(bytes, order);
  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(sorted));
}
// Rounds 8 int32 values at src into 8 uint8 pixels at dst: adds *rnd,
// arithmetic-shifts right by `shift`, then narrows with saturation.
static INLINE void cal_rounding_8_avx2(const int32_t *src, uint8_t *dst,
                                       const __m256i *rnd, int shift) {
  const __m256i order = _mm256_loadu_si256((const __m256i *)sindex);
  const __m256i rounded =
      _mm256_add_epi32(_mm256_loadu_si256((const __m256i *)src), *rnd);
  const __m256i shifted = _mm256_srai_epi32(rounded, shift);

  // 32-bit -> 16-bit with signed saturation, then clamp to 8-bit range.
  __m256i words = _mm256_packs_epi32(shifted, shifted);
  pixel_clamp_avx2(&words, 8);

  // 16-bit -> 8-bit; the permutation gathers the 8 pixels, in order, into
  // the low 64 bits, which is all that gets stored.
  const __m256i bytes = _mm256_packus_epi16(words, words);
  const __m256i sorted = _mm256_permutevar8x32_epi32(bytes, order);
  _mm_storel_epi64((__m128i *)dst, _mm256_castsi256_si128(sorted));
}
// Rounds 4 int32 values at src into 4 uint8 pixels at dst: adds *rnd,
// arithmetic-shifts right by `shift`, then narrows with saturation.
//
// src:   4 input values (unaligned load).
// dst:   receives 4 output bytes; no alignment is required.
// rnd:   rounding offset replicated in every 32-bit lane.
// shift: right-shift amount applied after adding the offset.
static INLINE void cal_rounding_4_sse2(const int32_t *src, uint8_t *dst,
                                       const __m128i *rnd, int shift) {
  __m128i x = _mm_loadu_si128((const __m128i *)src);
  x = _mm_add_epi32(x, *rnd);
  x = _mm_srai_epi32(x, shift);
  x = _mm_packs_epi32(x, x);
  pixel_clamp_sse2(&x, 8);
  x = _mm_packus_epi16(x, x);
  // Copy the low 4 bytes with memcpy rather than storing through a casted
  // uint32_t pointer: dst is a byte pointer with no 4-byte alignment
  // guarantee, and the cast would also type-pun the destination.
  const int32_t pixels = _mm_cvtsi128_si32(x);
  memcpy(dst, &pixels, sizeof(pixels));
}
// AVX2 version of av1_convolve_rounding: converts a w x h block of int32
// values to 8-bit pixels by adding a rounding offset, shifting right by
// `bits` and clamping to the 8-bit pixel range.
//
// src, src_stride: input block and its row stride in int32 elements.
// dst, dst_stride: output block and its row stride in bytes.
// w, h:            block size; the width is dispatched on as one of
//                  128, 64, 32, 16, 8, 4 or 2.
// bits:            right-shift amount.
void av1_convolve_rounding_avx2(const int32_t *src, int src_stride,
                                uint8_t *dst, int dst_stride, int w, int h,
                                int bits) {
  // (1 << bits) >> 1 equals 1 << (bits - 1) for bits >= 1 and is 0 for
  // bits == 0, which avoids shifting by a negative count.
  const __m256i rnd_num = _mm256_set1_epi32((int32_t)((1 << bits) >> 1));
  const __m128i rnd_num_sse2 = _mm256_castsi256_si128(rnd_num);
  int r;

  // Every row loop runs exactly h times (zero times for h <= 0), matching the
  // row loop of the C implementation.
  if (w > 16) {
    // width = 128, 64 or 32: 4, 2 or 1 groups of 32 pixels per row.
    const int groups = (w > 64) ? 4 : ((w > 32) ? 2 : 1);
    for (r = 0; r < h; ++r) {
      cal_rounding_32xn_avx2(src, dst, &rnd_num, bits, groups);
      src += src_stride;
      dst += dst_stride;
    }
  } else if (w > 8) {  // width = 16
    for (r = 0; r < h; ++r) {
      cal_rounding_16_avx2(src, dst, &rnd_num, bits);
      src += src_stride;
      dst += dst_stride;
    }
  } else if (w > 4) {  // width = 8
    for (r = 0; r < h; ++r) {
      cal_rounding_8_avx2(src, dst, &rnd_num, bits);
      src += src_stride;
      dst += dst_stride;
    }
  } else if (w > 2) {  // width = 4
    for (r = 0; r < h; ++r) {
      cal_rounding_4_sse2(src, dst, &rnd_num_sse2, bits);
      src += src_stride;
      dst += dst_stride;
    }
  } else {  // width = 2: too narrow for SIMD, use the scalar helpers.
    for (r = 0; r < h; ++r) {
      dst[0] = clip_pixel(ROUND_POWER_OF_TWO(src[0], bits));
      dst[1] = clip_pixel(ROUND_POWER_OF_TWO(src[1], bits));
      src += src_stride;
      dst += dst_stride;
    }
  }
}
#if CONFIG_HIGHBITDEPTH
// Rounds `num` consecutive groups of 32 int32 values from src into 32 uint16
// pixels each at dst: adds *rnd, arithmetic-shifts right by `shift`, narrows
// to 16 bits with saturation and clamps to the `bd`-bit pixel range.
// At least one group is always processed.
static INLINE void cal_highbd_rounding_32xn_avx2(const int32_t *src,
                                                 uint16_t *dst,
                                                 const __m256i *rnd, int shift,
                                                 int num, int bd) {
  int groups_left = num;
  do {
    const __m256i *in = (const __m256i *)src;
    __m256i a = _mm256_add_epi32(_mm256_loadu_si256(in + 0), *rnd);
    __m256i b = _mm256_add_epi32(_mm256_loadu_si256(in + 1), *rnd);
    __m256i c = _mm256_add_epi32(_mm256_loadu_si256(in + 2), *rnd);
    __m256i d = _mm256_add_epi32(_mm256_loadu_si256(in + 3), *rnd);

    a = _mm256_srai_epi32(a, shift);
    b = _mm256_srai_epi32(b, shift);
    c = _mm256_srai_epi32(c, shift);
    d = _mm256_srai_epi32(d, shift);

    __m256i first = _mm256_packs_epi32(a, b);
    __m256i second = _mm256_packs_epi32(c, d);
    pixel_clamp_avx2(&first, bd);
    pixel_clamp_avx2(&second, bd);

    // 0xD8 swaps the two middle 64-bit quarters, undoing the per-lane
    // interleaving left by the pack.
    first = _mm256_permute4x64_epi64(first, 0xD8);
    second = _mm256_permute4x64_epi64(second, 0xD8);

    _mm256_storeu_si256((__m256i *)dst, first);
    _mm256_storeu_si256((__m256i *)(dst + 16), second);

    src += 32;
    dst += 32;
  } while (--groups_left > 0);
}
// Rounds 16 int32 values at src into 16 uint16 pixels at dst: adds *rnd,
// arithmetic-shifts right by `shift`, narrows to 16 bits with saturation and
// clamps to the `bd`-bit pixel range.
static INLINE void cal_highbd_rounding_16_avx2(const int32_t *src,
                                               uint16_t *dst,
                                               const __m256i *rnd, int shift,
                                               int bd) {
  const __m256i *in = (const __m256i *)src;
  const __m256i lo =
      _mm256_srai_epi32(_mm256_add_epi32(_mm256_loadu_si256(in), *rnd), shift);
  const __m256i hi = _mm256_srai_epi32(
      _mm256_add_epi32(_mm256_loadu_si256(in + 1), *rnd), shift);

  __m256i pixels = _mm256_packs_epi32(lo, hi);
  pixel_clamp_avx2(&pixels, bd);

  // 0xD8 swaps the two middle 64-bit quarters, undoing the per-lane
  // interleaving left by the pack.
  pixels = _mm256_permute4x64_epi64(pixels, 0xD8);
  _mm256_storeu_si256((__m256i *)dst, pixels);
}
// Rounds 8 int32 values at src into 8 uint16 pixels at dst: adds *rnd,
// arithmetic-shifts right by `shift`, narrows to 16 bits with saturation and
// clamps to the `bd`-bit pixel range.
static INLINE void cal_highbd_rounding_8_avx2(const int32_t *src, uint16_t *dst,
                                              const __m256i *rnd, int shift,
                                              int bd) {
  const __m256i rounded =
      _mm256_add_epi32(_mm256_loadu_si256((const __m256i *)src), *rnd);
  const __m256i shifted = _mm256_srai_epi32(rounded, shift);

  __m256i pixels = _mm256_packs_epi32(shifted, shifted);
  pixel_clamp_avx2(&pixels, bd);

  // 0xD8 brings the 8 pixels, in order, into the low 128 bits, which is all
  // that gets stored.
  pixels = _mm256_permute4x64_epi64(pixels, 0xD8);
  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(pixels));
}
// Rounds 4 int32 values at src into 4 uint16 pixels at dst: adds *rnd,
// arithmetic-shifts right by `shift`, narrows to 16 bits with saturation and
// clamps to the `bd`-bit pixel range.
static INLINE void cal_highbd_rounding_4_sse2(const int32_t *src, uint16_t *dst,
                                              const __m128i *rnd, int shift,
                                              int bd) {
  const __m128i rounded =
      _mm_add_epi32(_mm_loadu_si128((const __m128i *)src), *rnd);
  const __m128i shifted = _mm_srai_epi32(rounded, shift);

  __m128i pixels = _mm_packs_epi32(shifted, shifted);
  pixel_clamp_sse2(&pixels, bd);

  // Only the low 64 bits (4 pixels) are written.
  _mm_storel_epi64((__m128i *)dst, pixels);
}
// AVX2 version of av1_highbd_convolve_rounding: converts a w x h block of
// int32 values to 16-bit pixels by adding a rounding offset, shifting right
// by `bits` and clamping to the `bd`-bit pixel range.
//
// src, src_stride: input block and its row stride in int32 elements.
// dst8:            output pointer; converted with CONVERT_TO_SHORTPTR to the
//                  uint16_t buffer that is actually written.
// dst_stride:      output row stride in uint16_t elements.
// w, h:            block size; the width is dispatched on as one of
//                  128, 64, 32, 16, 8, 4 or 2.
// bits:            right-shift amount.
// bd:              bit depth of the output pixels.
void av1_highbd_convolve_rounding_avx2(const int32_t *src, int src_stride,
                                       uint8_t *dst8, int dst_stride, int w,
                                       int h, int bits, int bd) {
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  // (1 << bits) >> 1 equals 1 << (bits - 1) for bits >= 1 and is 0 for
  // bits == 0, which avoids shifting by a negative count.
  const __m256i rnd_num = _mm256_set1_epi32((int32_t)((1 << bits) >> 1));
  const __m128i rnd_num_sse2 = _mm256_castsi256_si128(rnd_num);
  int r;

  // Every row loop runs exactly h times (zero times for h <= 0), matching the
  // row loop of the C implementation.
  if (w > 16) {
    // width = 128, 64 or 32: 4, 2 or 1 groups of 32 pixels per row.
    const int groups = (w > 64) ? 4 : ((w > 32) ? 2 : 1);
    for (r = 0; r < h; ++r) {
      cal_highbd_rounding_32xn_avx2(src, dst, &rnd_num, bits, groups, bd);
      src += src_stride;
      dst += dst_stride;
    }
  } else if (w > 8) {  // width = 16
    for (r = 0; r < h; ++r) {
      cal_highbd_rounding_16_avx2(src, dst, &rnd_num, bits, bd);
      src += src_stride;
      dst += dst_stride;
    }
  } else if (w > 4) {  // width = 8
    for (r = 0; r < h; ++r) {
      cal_highbd_rounding_8_avx2(src, dst, &rnd_num, bits, bd);
      src += src_stride;
      dst += dst_stride;
    }
  } else if (w > 2) {  // width = 4
    for (r = 0; r < h; ++r) {
      cal_highbd_rounding_4_sse2(src, dst, &rnd_num_sse2, bits, bd);
      src += src_stride;
      dst += dst_stride;
    }
  } else {  // width = 2: too narrow for SIMD, use the scalar helpers.
    for (r = 0; r < h; ++r) {
      dst[0] = clip_pixel_highbd(ROUND_POWER_OF_TWO(src[0], bits), bd);
      dst[1] = clip_pixel_highbd(ROUND_POWER_OF_TWO(src[1], bits), bd);
      src += src_stride;
      dst += dst_stride;
    }
  }
}
#endif // CONFIG_HIGHBITDEPTH
#endif // CONFIG_CONVOLVE_ROUND
/*
* Copyright (c) 2017, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <assert.h>
#include "./av1_rtcd.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "test/util.h"
#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
#include "aom/aom_integer.h"
#include "aom_ports/aom_timer.h"
using libaom_test::ACMRandom;
namespace {
// Parameter list shared by the rounding function signatures below:
// int32 source block and stride, destination pointer and stride, block width
// and height, and the right-shift amount.
#define CONVOLVE_ROUNDING_PARAM \
const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, \
int h, int bits
// Signature of the rounding functions driven by the test fixture.
typedef void (*ConvolveRoundFunc)(CONVOLVE_ROUNDING_PARAM);
// Same signature with an additional trailing bit-depth argument.
typedef void (*ConvolveRoundFuncHbd)(CONVOLVE_ROUNDING_PARAM, int bd);
// Adapts the high-bitdepth function `fn` to the ConvolveRoundFunc signature
// by fixing the bit depth at 8.
template <ConvolveRoundFuncHbd fn>
void highbd_convolve_rounding_8(CONVOLVE_ROUNDING_PARAM) {
  fn(src, src_stride, dst, dst_stride, w, h, bits, /*bd=*/8);
}
// Adapts the high-bitdepth function `fn` to the ConvolveRoundFunc signature
// by fixing the bit depth at 10.
template <ConvolveRoundFuncHbd fn>
void highbd_convolve_rounding_10(CONVOLVE_ROUNDING_PARAM) {
  fn(src, src_stride, dst, dst_stride, w, h, bits, /*bd=*/10);
}
// Adapts the high-bitdepth function `fn` to the ConvolveRoundFunc signature
// by fixing the bit depth at 12.
template <ConvolveRoundFuncHbd fn>
void highbd_convolve_rounding_12(CONVOLVE_ROUNDING_PARAM) {
  fn(src, src_stride, dst, dst_stride, w, h, bits, /*bd=*/12);
}
// Selects how the fixture interprets the destination buffers: as 8-bit
// pixels (LOWBITDEPTH_TEST) or via CONVERT_TO_BYTEPTR on the 16-bit buffers
// (HIGHBITDEPTH_TEST).
typedef enum { LOWBITDEPTH_TEST, HIGHBITDEPTH_TEST } DataPathType;
using std::tr1::tuple;
// Test parameter: (reference function, function under test, data path).
typedef tuple<ConvolveRoundFunc, ConvolveRoundFunc, DataPathType>
ConvolveRoundParam;
// Number of random blocks checked by each test instance.
const int kTestNum = 5000;
// Parameterized fixture that checks a rounding function against a reference
// implementation for bit-exact output on randomly generated blocks.
// The parameter tuple is (reference function, function under test, data path).
class ConvolveRoundTest : public ::testing::TestWithParam<ConvolveRoundParam> {
 protected:
  ConvolveRoundTest()
      : func_ref_(GET_PARAM(0)), func_(GET_PARAM(1)), data_path_(GET_PARAM(2)) {
  }
  virtual ~ConvolveRoundTest() {}

  // Allocates a single buffer and carves it into the int32 source block
  // followed by the reference and test destination blocks (128 x 128 each).
  virtual void SetUp() {
    const size_t block_size = 128 * 128;
    src_ = reinterpret_cast<int32_t *>(
        aom_memalign(16, 3 * block_size * sizeof(int32_t)));
    ASSERT_TRUE(src_ != NULL);
    dst_ref_ = reinterpret_cast<uint16_t *>(src_ + block_size);
    dst_ = dst_ref_ + block_size;
  }

  // src_ owns the whole allocation; dst_ref_ and dst_ point into it.
  virtual void TearDown() { aom_free(src_); }

  // Runs kTestNum iterations, cycling through all block sizes and alternating
  // `bits` between 12 and 13, and asserts that both functions produce the
  // same w x h output block.
  void ConvolveRoundingRun() {
    int test_num = 0;
    const int src_stride = 128;
    const int dst_stride = 128;
    int bits = 13;
    uint8_t *dst = NULL;
    uint8_t *dst_ref = NULL;

    if (data_path_ == LOWBITDEPTH_TEST) {
      dst = reinterpret_cast<uint8_t *>(dst_);
      dst_ref = reinterpret_cast<uint8_t *>(dst_ref_);
#if CONFIG_HIGHBITDEPTH
    } else if (data_path_ == HIGHBITDEPTH_TEST) {
      dst = CONVERT_TO_BYTEPTR(dst_);
      dst_ref = CONVERT_TO_BYTEPTR(dst_ref_);
#endif
    } else {
      assert(0);
    }

    while (test_num < kTestNum) {
      int block_size = test_num % BLOCK_SIZES_ALL;
      int w = block_size_wide[block_size];
      int h = block_size_high[block_size];

      if (test_num % 2 == 0)
        bits -= 1;
      else
        bits += 1;

      GenerateBufferWithRandom(src_, src_stride, bits, w, h);

      func_ref_(src_, src_stride, dst_ref, dst_stride, w, h, bits);
      func_(src_, src_stride, dst, dst_stride, w, h, bits);

      if (data_path_ == LOWBITDEPTH_TEST) {
        // 8-bit output: rows are dst_stride *bytes* apart, so the comparison
        // must index the byte pointers. Indexing the uint16_t views with the
        // same stride would skip every other row and read past row h.
        for (int r = 0; r < h; ++r) {
          for (int c = 0; c < w; ++c) {
            ASSERT_EQ(dst_ref[r * dst_stride + c], dst[r * dst_stride + c])
                << "Mismatch at r: " << r << " c: " << c << " test: " << test_num;
          }
        }
      } else {
        // 16-bit output: rows are dst_stride uint16_t elements apart.
        for (int r = 0; r < h; ++r) {
          for (int c = 0; c < w; ++c) {
            ASSERT_EQ(dst_ref_[r * dst_stride + c], dst_[r * dst_stride + c])
                << "Mismatch at r: " << r << " c: " << c << " test: " << test_num;
          }
        }
      }

      test_num++;
    }
  }

  // Fills the w x h block at src with random values reduced modulo
  // 1 << (bits + 9).
  void GenerateBufferWithRandom(int32_t *src, int src_stride, int bits, int w,
                                int h) {
    int32_t number;
    for (int r = 0; r < h; ++r) {
      for (int c = 0; c < w; ++c) {
        number = static_cast<int32_t>(rand_.Rand31());
        number %= 1 << (bits + 9);
        src[r * src_stride + c] = number;
      }
    }
  }

  ACMRandom rand_;
  int32_t *src_;       // Source block; also the base of the allocation.
  uint16_t *dst_ref_;  // Output of the reference function.
  uint16_t *dst_;      // Output of the function under test.
  ConvolveRoundFunc func_ref_;
  ConvolveRoundFunc func_;
  DataPathType data_path_;
};
// Runs the bit-exactness comparison once per registered parameter tuple.
TEST_P(ConvolveRoundTest, BitExactCheck) {
  ConvolveRoundingRun();
}
using std::tr1::make_tuple;
#if HAVE_AVX2
const ConvolveRoundParam kConvRndParamArray[] = {
make_tuple(&av1_convolve_rounding_c, &av1_convolve_rounding_avx2,