Commit abd00505 authored by Geza Lore

Add optimized vpx_sum_squares_2d_i16 for vp10.

Using this, we can eliminate a large number of calls to intra
prediction, and it is also faster than most of the variance functions
it replaces. This is an equivalence transform, so coding performance
is unaffected.

Encoder speedup is approx 7% when var_tx, super_tx and ext_tx are all
enabled.

Change-Id: I0d4c83afc4a97a1826f3abd864bd68e41bb504fb
parent d1cad9c3
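
The change exploits a simple identity: for an intra block the encoder
already holds the residual diff = src - pred in the plane's src_diff
buffer, and the SSE that a variance function reports for (src, pred) is
exactly the sum of the squared residuals. A minimal scalar sketch of that
identity (hypothetical helper, not code from this commit):

#include <stdint.h>

// Hypothetical illustration only. Given diff[r][c] = src[r][c] - pred[r][c],
// this returns the same value a variance function's sse output would,
// without the prediction having to be recomputed. It is the quantity
// vpx_sum_squares_2d_i16 evaluates in a single pass.
static uint64_t sse_from_diff(const int16_t *diff, int diff_stride,
                              int size) {
  uint64_t sse = 0;
  int r, c;
  for (r = 0; r < size; r++)
    for (c = 0; c < size; c++)
      sse += (uint64_t)((int32_t)diff[r * diff_stride + c] *
                        diff[r * diff_stride + c]);
  return sse;
}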
/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include <cmath>
#include <cstdlib>
#include <string>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "test/util.h"
using libvpx_test::ACMRandom;
namespace {
const int kNumIterations = 10000;

typedef uint64_t (*SSI16Func)(const int16_t *src, int stride, int size);

typedef std::tr1::tuple<SSI16Func, SSI16Func> SumSquaresParam;

class SumSquaresTest : public ::testing::TestWithParam<SumSquaresParam> {
 public:
  virtual ~SumSquaresTest() {}
  virtual void SetUp() {
    ref_func_ = GET_PARAM(0);
    tst_func_ = GET_PARAM(1);
  }

  virtual void TearDown() { libvpx_test::ClearSystemState(); }

 protected:
  SSI16Func ref_func_;
  SSI16Func tst_func_;
};
TEST_P(SumSquaresTest, OperationCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  DECLARE_ALIGNED(16, int16_t, src[256 * 256]);
  int failed = 0;
  const int msb = 11;  // Up to 12 bit input
  const int limit = 1 << (msb + 1);

  for (int k = 0; k < kNumIterations; k++) {
    int size = 4 << rnd(6);    // Up to 128x128
    int stride = 4 << rnd(7);  // Up to 256 stride
    while (stride < size) {    // Make sure it's valid
      stride = 4 << rnd(7);
    }

    for (int ii = 0; ii < size; ii++) {
      for (int jj = 0; jj < size; jj++) {
        src[ii * stride + jj] = rnd(2) ? rnd(limit) : -rnd(limit);
      }
    }

    uint64_t res_ref = ref_func_(src, stride, size);
    uint64_t res_tst;
    ASM_REGISTER_STATE_CHECK(res_tst = tst_func_(src, stride, size));

    if (!failed) {
      failed = res_ref != res_tst;
      EXPECT_EQ(res_ref, res_tst)
          << "Error: Sum Squares Test"
          << " C output does not match optimized output.";
    }
  }
}
TEST_P(SumSquaresTest, ExtremeValues) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  DECLARE_ALIGNED(16, int16_t, src[256 * 256]);
  int failed = 0;
  const int msb = 11;  // Up to 12 bit input
  const int limit = 1 << (msb + 1);

  for (int k = 0; k < kNumIterations; k++) {
    int size = 4 << rnd(6);    // Up to 128x128
    int stride = 4 << rnd(7);  // Up to 256 stride
    while (stride < size) {    // Make sure it's valid
      stride = 4 << rnd(7);
    }

    int val = rnd(2) ? limit - 1 : -(limit - 1);
    for (int ii = 0; ii < size; ii++) {
      for (int jj = 0; jj < size; jj++) {
        src[ii * stride + jj] = val;
      }
    }

    uint64_t res_ref = ref_func_(src, stride, size);
    uint64_t res_tst;
    ASM_REGISTER_STATE_CHECK(res_tst = tst_func_(src, stride, size));

    if (!failed) {
      failed = res_ref != res_tst;
      EXPECT_EQ(res_ref, res_tst)
          << "Error: Sum Squares Test"
          << " C output does not match optimized output.";
    }
  }
}
using std::tr1::make_tuple;

#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(
    SSE2, SumSquaresTest,
    ::testing::Values(
        make_tuple(&vpx_sum_squares_2d_i16_c, &vpx_sum_squares_2d_i16_sse2)));
#endif  // HAVE_SSE2
} // namespace
@@ -163,11 +163,11 @@ endif # VP9
## VP10
ifeq ($(CONFIG_VP10),yes)
LIBVPX_TEST_SRCS-yes += vp10_inv_txfm_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_dct_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_ANS) += vp10_ans_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += sum_squares_test.cc
endif # VP10
## Multi-codec / unconditional whitebox tests.
@@ -654,27 +654,31 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
  if (!is_inter_block(mbmi)) {
    struct encode_b_args arg = {x, NULL, &mbmi->skip};
#if CONFIG_VAR_TX
    uint8_t *dst, *src;
    int src_stride = x->plane[plane].src.stride;
    int dst_stride = xd->plane[plane].dst.stride;
    unsigned int tmp_sse;
    PREDICTION_MODE mode = (plane == 0) ?
        get_y_mode(xd->mi[0], block) : mbmi->uv_mode;
    src = &x->plane[plane].src.buf[4 * (blk_row * src_stride + blk_col)];
    dst = &xd->plane[plane].dst.buf[4 * (blk_row * dst_stride + blk_col)];
    vp10_predict_intra_block(xd, b_width_log2_lookup[plane_bsize],
                             b_height_log2_lookup[plane_bsize],
                             tx_size, mode, dst, dst_stride,
                             dst, dst_stride, blk_col, blk_row, plane);
    args->cpi->fn_ptr[txsize_to_bsize[tx_size]].vf(src, src_stride,
                                                   dst, dst_stride, &tmp_sse);
    sse = (int64_t)tmp_sse * 16;
    vp10_encode_block_intra(plane, block, blk_row, blk_col,
                            plane_bsize, tx_size, &arg);
    args->cpi->fn_ptr[txsize_to_bsize[tx_size]].vf(src, src_stride,
                                                   dst, dst_stride, &tmp_sse);
    dist = (int64_t)tmp_sse * 16;
    {
      const int bs = 4 << tx_size;
      const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
      const vpx_variance_fn_t variance = args->cpi->fn_ptr[tx_bsize].vf;
      const struct macroblock_plane *const p = &x->plane[plane];
      const struct macroblockd_plane *const pd = &xd->plane[plane];
      const int src_stride = p->src.stride;
      const int dst_stride = pd->dst.stride;
      const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
      const uint8_t *src = &p->src.buf[4 * (blk_row * src_stride + blk_col)];
      const uint8_t *dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)];
      const int16_t *diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
      unsigned int tmp;
      sse = (int64_t)vpx_sum_squares_2d_i16(diff, diff_stride, bs) * 16;
      variance(src, src_stride, dst, dst_stride, &tmp);
      dist = (int64_t)tmp * 16;
    }
#else
    vp10_encode_block_intra(plane, block, blk_row, blk_col,
                            plane_bsize, tx_size, &arg);
@@ -2330,6 +2334,8 @@ void vp10_tx_block_rd_b(const VP10_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
#else
  DECLARE_ALIGNED(16, uint8_t, rec_buffer[32 * 32]);
#endif
  const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
  const int16_t *diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
  int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
  int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
@@ -2360,20 +2366,16 @@ void vp10_tx_block_rd_b(const VP10_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
  if (blk_row + (bh >> 2) > max_blocks_high ||
      blk_col + (bh >> 2) > max_blocks_wide) {
    int idx, idy;
    unsigned int this_sse;
    int blocks_height = VPXMIN(bh >> 2, max_blocks_high - blk_row);
    int blocks_width = VPXMIN(bh >> 2, max_blocks_wide - blk_col);
    for (idy = 0; idy < blocks_height; idy += 2) {
      for (idx = 0; idx < blocks_width; idx += 2) {
        cpi->fn_ptr[BLOCK_8X8].vf(src + 4 * idy * src_stride + 4 * idx,
                                  src_stride,
                                  rec_buffer + 4 * idy * 32 + 4 * idx,
                                  32, &this_sse);
        tmp_sse += this_sse;
        const int16_t *d = diff + 4 * idy * diff_stride + 4 * idx;
        tmp_sse += vpx_sum_squares_2d_i16(d, diff_stride, 8);
      }
    }
  } else {
    cpi->fn_ptr[txm_bsize].vf(src, src_stride, rec_buffer, 32, &tmp_sse);
    tmp_sse = vpx_sum_squares_2d_i16(diff, diff_stride, bh);
  }
  *bsse += (int64_t)tmp_sse * 16;
/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include <assert.h>

#include "./vpx_dsp_rtcd.h"

uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int src_stride,
                                  int size) {
  int r, c;
  uint64_t ss = 0;

  // Plain reference implementation: accumulate the square of every
  // sample in the size x size block into a 64-bit total.
  for (r = 0; r < size; r++) {
    for (c = 0; c < size; c++) {
      const int16_t v = src[c];
      ss += v * v;
    }
    src += src_stride;
  }

  return ss;
}
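
As a quick standalone sanity check of the reference function (hypothetical
snippet, not part of the commit; link it against sum_squares.c): a 4x4
block filled with the constant 3 at stride 4 must give 16 * 3^2 = 144.

#include <assert.h>
#include <stdint.h>

uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int src_stride,
                                  int size);

int main(void) {
  int16_t buf[16];
  int i;
  for (i = 0; i < 16; i++) buf[i] = 3;  // 4x4 block, stride 4
  assert(vpx_sum_squares_2d_i16_c(buf, 4, 4) == 144);
  return 0;
}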
@@ -266,6 +266,12 @@ endif
endif # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
ifeq ($(CONFIG_VP10_ENCODER),yes)
DSP_SRCS-yes += sum_squares.c
DSP_SRCS-$(HAVE_SSE2) += x86/sum_squares_sse2.c
endif # CONFIG_VP10_ENCODER
ifeq ($(CONFIG_ENCODERS),yes)
DSP_SRCS-yes += sad.c
DSP_SRCS-yes += subtract.c
@@ -297,7 +303,6 @@ DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
endif # CONFIG_VP9_HIGHBITDEPTH
endif # CONFIG_USE_X86INC
endif # CONFIG_ENCODERS
ifneq ($(filter yes,$(CONFIG_ENCODERS) $(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),)
@@ -23,6 +23,18 @@ extern "C" {
#define VPXMIN(x, y) (((x) < (y)) ? (x) : (y))
#define VPXMAX(x, y) (((x) > (y)) ? (x) : (y))
// These can be used to give a hint about branch outcomes.
// This can have an effect, even if your target processor has a
// good branch predictor, as these hints can affect basic block
// ordering by the compiler.
#ifdef __GNUC__
# define LIKELY(v) __builtin_expect(v, 1)
# define UNLIKELY(v) __builtin_expect(v, 0)
#else
# define LIKELY(v) (v)
# define UNLIKELY(v) (v)
#endif
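
A minimal usage sketch (hypothetical function; the real consumer of these
macros in this change is vpx_sum_squares_2d_i16_sse2 further down):

static int choose_path(int size) {
  if (LIKELY(size == 4))  // hint: size == 4 dominates this workload
    return 0;             // fast path kept on the fall-through
  return 1;               // generic path
}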
#if CONFIG_VP9_HIGHBITDEPTH
// Note:
// tran_low_t is the datatype used for final transform coefficients.
@@ -954,6 +954,14 @@ if (vpx_config("CONFIG_ENCODERS") eq "yes") {
add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
specialize qw/vpx_subtract_block neon msa/, "$sse2_x86inc";
if (vpx_config("CONFIG_VP10_ENCODER") eq "yes") {
  #
  # Sum of Squares
  #
  add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size";
  specialize qw/vpx_sum_squares_2d_i16 sse2/;
}
#
# Single block SAD
#
/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include <assert.h>
#include <emmintrin.h>
#include <stdio.h>
#include "./vpx_dsp_rtcd.h"
static uint64_t vpx_sum_squares_2d_i16_4x4_sse2(const int16_t *src,
                                                int stride) {
  // Load the four 4-sample rows into the low 64 bits of four registers;
  // the high halves are zeroed and contribute nothing below.
  const __m128i v_val_0_w = _mm_loadl_epi64((const __m128i*)(src+0*stride));
  const __m128i v_val_1_w = _mm_loadl_epi64((const __m128i*)(src+1*stride));
  const __m128i v_val_2_w = _mm_loadl_epi64((const __m128i*)(src+2*stride));
  const __m128i v_val_3_w = _mm_loadl_epi64((const __m128i*)(src+3*stride));

  // madd(x, x) squares each 16-bit sample and sums adjacent pairs,
  // leaving two 32-bit partial sums per row.
  const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
  const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
  const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
  const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);

  // Sum across rows, then fold the two remaining 32-bit lanes together.
  const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
  const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
  const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
  const __m128i v_sum_d = _mm_add_epi32(v_sum_0123_d,
                                        _mm_srli_epi64(v_sum_0123_d, 32));

  return (uint64_t)_mm_cvtsi128_si32(v_sum_d);
}
#ifdef __GNUC__
// This prevents GCC/Clang from inlining this function into
// vpx_sum_squares_2d_i16_sse2, which in turn saves some stack
// maintenance instructions in the common case of 4x4.
__attribute__((noinline))
#endif
static uint64_t vpx_sum_squares_2d_i16_nxn_sse2(const int16_t *src,
                                                int stride,
                                                int size) {
  int r, c;
  // Mask that keeps the even 32-bit lanes, used to zero-extend the
  // 32-bit partial sums into 64-bit lanes.
  const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
  __m128i v_acc_q = _mm_setzero_si128();

  for (r = 0; r < size; r += 8) {
    // 32-bit accumulator for one strip of 8 rows.
    __m128i v_acc_d = _mm_setzero_si128();

    for (c = 0; c < size; c += 8) {
      const int16_t *b = src + c;
      const __m128i v_val_0_w = _mm_load_si128((const __m128i*)(b+0*stride));
      const __m128i v_val_1_w = _mm_load_si128((const __m128i*)(b+1*stride));
      const __m128i v_val_2_w = _mm_load_si128((const __m128i*)(b+2*stride));
      const __m128i v_val_3_w = _mm_load_si128((const __m128i*)(b+3*stride));
      const __m128i v_val_4_w = _mm_load_si128((const __m128i*)(b+4*stride));
      const __m128i v_val_5_w = _mm_load_si128((const __m128i*)(b+5*stride));
      const __m128i v_val_6_w = _mm_load_si128((const __m128i*)(b+6*stride));
      const __m128i v_val_7_w = _mm_load_si128((const __m128i*)(b+7*stride));

      // Square each 16-bit sample and sum adjacent pairs into 32-bit
      // lanes, then reduce the eight rows into the strip accumulator.
      const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
      const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
      const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
      const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
      const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w);
      const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w);
      const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w);
      const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w);

      const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
      const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
      const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d);
      const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d);

      const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
      const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d);

      v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d);
      v_acc_d = _mm_add_epi32(v_acc_d, v_sum_4567_d);
    }

    // Flush the strip: zero-extend the 32-bit lanes (treated as
    // unsigned) into 64-bit lanes and add them to the running total.
    v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q));
    v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32));

    src += 8 * stride;
  }

  // Fold the two 64-bit halves together.
  v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));

#if ARCH_X86_64
  return (uint64_t)_mm_cvtsi128_si64(v_acc_q);
#else
  {
    uint64_t tmp;
    _mm_storel_epi64((__m128i*)&tmp, v_acc_q);
    return tmp;
  }
#endif
}
uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride,
                                     int size) {
  // 4 elements per row only requires half an XMM register, so this
  // must be a special case, but also note that over 75% of all calls
  // are with size == 4, so it is also the common case.
  if (LIKELY(size == 4)) {
    return vpx_sum_squares_2d_i16_4x4_sse2(src, stride);
  } else {
    // Generic case
    return vpx_sum_squares_2d_i16_nxn_sse2(src, stride, size);
  }
}
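
For reference, a worked bound on the accumulator widths above, assuming
the up-to-12-bit inputs the tests exercise (my arithmetic, not from the
commit message): each _mm_madd_epi16 lane holds two squared samples, at
most 2 * 4095^2 < 2^25; summing the eight rows of one 8x8 tile brings a
lane to under 2^28; and at the maximum size of 128 there are 16 tiles per
strip, so a 32-bit lane stays just below 2^32 across one strip of 8 rows.
This is why v_acc_d is flushed into the 64-bit v_acc_q after every strip,
with v_zext_mask_q treating the possibly wrapped lanes as unsigned.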