Commit 57c4711b authored by Yi Luo's avatar Yi Luo
Browse files

Optimize EXT_INTRA's filtered intra predictor (SSE4.1)

- Add unit tests to verify the bit-exact result.
- In speed test, function speed (for each mode/tx_size)
  improves about 23%~35%.
- On E5-2680, park_joy_1080p, 10 frames, --kf-max-dist=1,
  encoding time improves about 1%~2%.

Change-Id: Id89f313d44eea562c02e775a6253dc4df7e046a9
parent 66796083
/*
* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "./vp10_rtcd.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "test/util.h"
#include "vp10/common/enums.h"
namespace {
using std::tr1::tuple;
using libvpx_test::ACMRandom;
// Signature shared by the reference and the optimized predictor functions
// exercised by this test.
typedef void (*Predictor)(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left);
// Test parameter layout:
//   PredFuncMode = (reference predictor, optimized predictor, prediction mode)
//   PredParams   = (PredFuncMode, block size)
//
typedef tuple<Predictor, Predictor, int> PredFuncMode;
typedef tuple<PredFuncMode, int> PredParams;
// Largest block dimension; used to size the edge and prediction buffers.
const int MaxBlkSize = 32;
// The speed tests are disabled by default; set this to a non-zero value to
// compile them and to raise the iteration count below.
#define PREDICTORS_SPEED_TEST (0)
#if PREDICTORS_SPEED_TEST
const int MaxTestNum = 100000;
#else
// Number of loop iterations executed by each test.
const int MaxTestNum = 100;
#endif
// Value-parameterized fixture that checks an optimized filter-intra predictor
// against its reference implementation for one (mode, block size) pair.
class VP10IntraPredOptimzTest : public ::testing::TestWithParam<PredParams> {
 public:
  virtual ~VP10IntraPredOptimzTest() {}

  // Unpacks the test parameters and allocates the edge and output buffers.
  virtual void SetUp() {
    PredFuncMode funcMode = GET_PARAM(0);
    predFuncRef_ = std::tr1::get<0>(funcMode);
    predFunc_ = std::tr1::get<1>(funcMode);
    mode_ = std::tr1::get<2>(funcMode);
    blockSize_ = GET_PARAM(1);

    // new[] pairs with the delete[] in TearDown(); releasing malloc()'d
    // memory with delete[] is undefined behavior.
    alloc_ = new uint8_t[kEdgeBufSize];
    predRef_ = new uint8_t[MaxBlkSize * MaxBlkSize];
    pred_ = new uint8_t[MaxBlkSize * MaxBlkSize];
  }

  // Releases the buffers allocated in SetUp().
  virtual void TearDown() {
    delete[] alloc_;
    delete[] predRef_;
    delete[] pred_;
    libvpx_test::ClearSystemState();
  }

 protected:
  // Runs both predictors MaxTestNum times on freshly randomized edge pixels
  // and expects identical output each time.
  void RunTest() const {
    // Seed once per run so that every iteration sees different (but
    // reproducible) edge data instead of the same buffer MaxTestNum times.
    ACMRandom rnd(ACMRandom::DeterministicSeed());
    const int stride = blockSize_;
    // The left edge lives at the start of alloc_. The above edge starts
    // MaxBlkSize + 1 bytes in, and the predictors are handed &above[1], so
    // the byte just before the pointer they receive is inside the buffer.
    const uint8_t *left = alloc_;
    const uint8_t *above = alloc_ + MaxBlkSize + 1;
    for (int tstIndex = 0; tstIndex < MaxTestNum; ++tstIndex) {
      PrepareBuffer(&rnd);
      predFuncRef_(predRef_, stride, blockSize_, &above[1], left);
      ASM_REGISTER_STATE_CHECK(
          predFunc_(pred_, stride, blockSize_, &above[1], left));
      DiffPred(tstIndex);
    }
  }

  // Calls the reference predictor MaxTestNum times on one fixed input.
  void RunSpeedTestC() const {
    ACMRandom rnd(ACMRandom::DeterministicSeed());
    const int stride = blockSize_;
    const uint8_t *left = alloc_;
    const uint8_t *above = alloc_ + MaxBlkSize + 1;
    PrepareBuffer(&rnd);
    for (int tstIndex = 0; tstIndex < MaxTestNum; ++tstIndex) {
      predFuncRef_(predRef_, stride, blockSize_, &above[1], left);
    }
  }

  // Calls the optimized predictor MaxTestNum times on one fixed input.
  void RunSpeedTestSSE() const {
    ACMRandom rnd(ACMRandom::DeterministicSeed());
    const int stride = blockSize_;
    const uint8_t *left = alloc_;
    const uint8_t *above = alloc_ + MaxBlkSize + 1;
    PrepareBuffer(&rnd);
    for (int tstIndex = 0; tstIndex < MaxTestNum; ++tstIndex) {
      // The optimized predictor writes to its own output buffer.
      predFunc_(pred_, stride, blockSize_, &above[1], left);
    }
  }

 private:
  // Number of bytes in the shared left/above edge buffer.
  static const int kEdgeBufSize = 3 * MaxBlkSize + 2;

  // Fills the whole edge buffer with random bytes drawn from |rnd|.
  void PrepareBuffer(ACMRandom *rnd) const {
    for (int i = 0; i < kEdgeBufSize; ++i) {
      alloc_[i] = rnd->Rand8();
    }
  }

  // Compares the blockSize_ * blockSize_ predicted bytes of both outputs and
  // reports every mismatching position.
  void DiffPred(int testNum) const {
    for (int i = 0; i < blockSize_ * blockSize_; ++i) {
      EXPECT_EQ(predRef_[i], pred_[i])
          << "Error at position: " << i << " "
          << "Block size: " << blockSize_ << " "
          << "Test number: " << testNum;
    }
  }

  Predictor predFunc_;     // optimized predictor under test
  Predictor predFuncRef_;  // reference predictor
  int mode_;               // prediction mode taken from the test parameters
  int blockSize_;          // block dimension passed to the predictors as |bs|
  uint8_t *alloc_;         // edge buffer holding the left and above pixels
  uint8_t *pred_;          // output of the optimized predictor
  uint8_t *predRef_;       // output of the reference predictor
};
// Checks that the optimized predictor output equals the reference output
// byte for byte.
TEST_P(VP10IntraPredOptimzTest, BitExactCheck) {
RunTest();
}
// The speed tests are only compiled when PREDICTORS_SPEED_TEST is non-zero.
#if PREDICTORS_SPEED_TEST
// Repeatedly runs the reference predictor.
TEST_P(VP10IntraPredOptimzTest, SpeedCheckC) {
RunSpeedTestC();
}
// Repeatedly runs the optimized predictor.
TEST_P(VP10IntraPredOptimzTest, SpeedCheckSSE) {
RunSpeedTestSSE();
}
#endif
using std::tr1::make_tuple;
// One entry per prediction mode: (C predictor, SSE4.1 predictor, mode).
const PredFuncMode kPredFuncMdArray[] = {
make_tuple(vp10_dc_filter_predictor_c, vp10_dc_filter_predictor_sse4_1,
DC_PRED),
make_tuple(vp10_v_filter_predictor_c, vp10_v_filter_predictor_sse4_1,
V_PRED),
make_tuple(vp10_h_filter_predictor_c, vp10_h_filter_predictor_sse4_1,
H_PRED),
make_tuple(vp10_d45_filter_predictor_c, vp10_d45_filter_predictor_sse4_1,
D45_PRED),
make_tuple(vp10_d135_filter_predictor_c, vp10_d135_filter_predictor_sse4_1,
D135_PRED),
make_tuple(vp10_d117_filter_predictor_c, vp10_d117_filter_predictor_sse4_1,
D117_PRED),
make_tuple(vp10_d153_filter_predictor_c, vp10_d153_filter_predictor_sse4_1,
D153_PRED),
make_tuple(vp10_d207_filter_predictor_c, vp10_d207_filter_predictor_sse4_1,
D207_PRED),
make_tuple(vp10_d63_filter_predictor_c, vp10_d63_filter_predictor_sse4_1,
D63_PRED),
make_tuple(vp10_tm_filter_predictor_c, vp10_tm_filter_predictor_sse4_1,
TM_PRED),
};
// Block dimensions that every predictor pair is tested with.
const int kBlkSize[] = {4, 8, 16, 32};
// Instantiates the fixture for every (predictor pair, block size) combination.
INSTANTIATE_TEST_CASE_P(
SSE4_1, VP10IntraPredOptimzTest,
::testing::Combine(
::testing::ValuesIn(kPredFuncMdArray),
::testing::ValuesIn(kBlkSize)));
} // namespace
......@@ -149,6 +149,10 @@ LIBVPX_TEST_SRCS-$(HAVE_SSSE3) += masked_sad_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_wedge_utils_test.cc
endif
ifeq ($(CONFIG_EXT_INTRA),yes)
LIBVPX_TEST_SRCS-$(HAVE_SSE4_1) += reconintra_predictors_test.cc
endif
ifeq ($(CONFIG_OBMC),yes)
LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += obmc_sad_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += obmc_variance_test.cc
......
/*
* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP10_COMMON_INTRA_FILTERS_H_
#define VP10_COMMON_INTRA_FILTERS_H_
#define FILTER_INTRA_PREC_BITS (10)
// 4-tap filter coefficients for the filter-intra predictors, indexed as
// [transform size][intra mode][tap].
// NOTE(review): presumably fixed-point values with FILTER_INTRA_PREC_BITS
// fractional bits -- confirm against the predictor implementation.
//
// Declared const: this header is included by more than one source file, so
// each includer gets its own copy; the table is lookup data and must not be
// writable.
static const int filter_intra_taps_4[TX_SIZES][INTRA_MODES][4] = {
  {
    {735, 881, -537, -54},
    {1005, 519, -488, -11},
    {383, 990, -343, -6},
    {442, 805, -542, 319},
    {658, 616, -133, -116},
    {875, 442, -141, -151},
    {386, 741, -23, -80},
    {390, 1027, -446, 51},
    {679, 606, -523, 262},
    {903, 922, -778, -23},
  },
  {
    {648, 803, -444, 16},
    {972, 620, -576, 7},
    {561, 967, -499, -5},
    {585, 762, -468, 144},
    {596, 619, -182, -9},
    {895, 459, -176, -153},
    {557, 722, -126, -129},
    {601, 839, -523, 105},
    {562, 709, -499, 251},
    {803, 872, -695, 43},
  },
  {
    {423, 728, -347, 111},
    {963, 685, -665, 23},
    {281, 1024, -480, 216},
    {640, 596, -437, 78},
    {429, 669, -259, 99},
    {740, 646, -415, 23},
    {568, 771, -346, 40},
    {404, 833, -486, 209},
    {398, 712, -423, 307},
    {939, 935, -887, 17},
  },
  {
    {477, 737, -393, 150},
    {881, 630, -546, 67},
    {506, 984, -443, -20},
    {114, 459, -270, 528},
    {433, 528, 14, 3},
    {837, 470, -301, -30},
    {181, 777, 89, -107},
    {-29, 716, -232, 259},
    {589, 646, -495, 255},
    {740, 884, -728, 77},
  },
};
#endif // VP10_COMMON_INTRA_FILTERS_H_
......@@ -10,6 +10,7 @@
#include <math.h>
#include "./vp10_rtcd.h"
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/system_state.h"
......@@ -20,7 +21,9 @@
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
#include "vpx_ports/vpx_once.h"
#if CONFIG_EXT_INTRA
#include "vp10/common/intra_filters.h"
#endif
#include "vp10/common/reconintra.h"
#include "vp10/common/onyxc_int.h"
......@@ -390,7 +393,6 @@ static void vp10_init_intra_predictors_internal(void) {
}
#if CONFIG_EXT_INTRA
#define FILTER_INTRA_PREC_BITS 10
static const uint8_t ext_intra_extend_modes[FILTER_INTRA_MODES] = {
NEED_LEFT | NEED_ABOVE, // FILTER_DC
......@@ -719,57 +721,6 @@ static void dr_predictor(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size,
}
}
// 4-tap filter coefficients for the filter-intra predictors, indexed as
// [transform size][intra mode][tap].
// NOTE(review): the same values also appear in vp10/common/intra_filters.h;
// this copy looks like the deleted side of the hunk -- confirm before editing.
static int filter_intra_taps_4[TX_SIZES][INTRA_MODES][4] = {
{
{735, 881, -537, -54},
{1005, 519, -488, -11},
{383, 990, -343, -6},
{442, 805, -542, 319},
{658, 616, -133, -116},
{875, 442, -141, -151},
{386, 741, -23, -80},
{390, 1027, -446, 51},
{679, 606, -523, 262},
{903, 922, -778, -23},
},
{
{648, 803, -444, 16},
{972, 620, -576, 7},
{561, 967, -499, -5},
{585, 762, -468, 144},
{596, 619, -182, -9},
{895, 459, -176, -153},
{557, 722, -126, -129},
{601, 839, -523, 105},
{562, 709, -499, 251},
{803, 872, -695, 43},
},
{
{423, 728, -347, 111},
{963, 685, -665, 23},
{281, 1024, -480, 216},
{640, 596, -437, 78},
{429, 669, -259, 99},
{740, 646, -415, 23},
{568, 771, -346, 40},
{404, 833, -486, 209},
{398, 712, -423, 307},
{939, 935, -887, 17},
},
{
{477, 737, -393, 150},
{881, 630, -546, 67},
{506, 984, -443, -20},
{114, 459, -270, 528},
{433, 528, 14, 3},
{837, 470, -301, -30},
{181, 777, 89, -107},
{-29, 716, -232, 259},
{589, 646, -495, 255},
{740, 884, -728, 77},
},
};
static void filter_intra_predictors_4tap(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above,
const uint8_t *left,
......@@ -815,63 +766,94 @@ static void filter_intra_predictors_4tap(uint8_t *dst, ptrdiff_t stride, int bs,
}
}
static void dc_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
// C filter-intra predictor for DC_PRED: forwards all arguments to the shared
// 4-tap routine with the mode fixed to DC_PRED.
void vp10_dc_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
filter_intra_predictors_4tap(dst, stride, bs, above, left, DC_PRED);
}
static void v_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
// C filter-intra predictor for V_PRED: forwards all arguments to the shared
// 4-tap routine with the mode fixed to V_PRED.
void vp10_v_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
filter_intra_predictors_4tap(dst, stride, bs, above, left, V_PRED);
}
static void h_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
// C filter-intra predictor for H_PRED: forwards all arguments to the shared
// 4-tap routine with the mode fixed to H_PRED.
void vp10_h_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
filter_intra_predictors_4tap(dst, stride, bs, above, left, H_PRED);
}
static void d45_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
// C filter-intra predictor for D45_PRED: forwards all arguments to the shared
// 4-tap routine with the mode fixed to D45_PRED.
void vp10_d45_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
filter_intra_predictors_4tap(dst, stride, bs, above, left, D45_PRED);
}
static void d135_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
// C filter-intra predictor for D135_PRED: forwards all arguments to the shared
// 4-tap routine with the mode fixed to D135_PRED.
void vp10_d135_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
filter_intra_predictors_4tap(dst, stride, bs, above, left, D135_PRED);
}
static void d117_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
// C filter-intra predictor for D117_PRED: forwards all arguments to the shared
// 4-tap routine with the mode fixed to D117_PRED.
void vp10_d117_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
filter_intra_predictors_4tap(dst, stride, bs, above, left, D117_PRED);
}
static void d153_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
// C filter-intra predictor for D153_PRED: forwards all arguments to the shared
// 4-tap routine with the mode fixed to D153_PRED.
void vp10_d153_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
filter_intra_predictors_4tap(dst, stride, bs, above, left, D153_PRED);
}
static void d207_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
// C filter-intra predictor for D207_PRED: forwards all arguments to the shared
// 4-tap routine with the mode fixed to D207_PRED.
void vp10_d207_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
filter_intra_predictors_4tap(dst, stride, bs, above, left, D207_PRED);
}
static void d63_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
// C filter-intra predictor for D63_PRED: forwards all arguments to the shared
// 4-tap routine with the mode fixed to D63_PRED.
void vp10_d63_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
filter_intra_predictors_4tap(dst, stride, bs, above, left, D63_PRED);
}
static void tm_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
// C filter-intra predictor for TM_PRED: forwards all arguments to the shared
// 4-tap routine with the mode fixed to TM_PRED.
void vp10_tm_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
filter_intra_predictors_4tap(dst, stride, bs, above, left, TM_PRED);
}
// Table of filter-intra predictor functions that share one signature; the
// call site indexes it with the ext-intra mode.
// NOTE(review): this looks like the deleted side of the hunk, superseded by
// the switch-based filter_intra_predictors() function -- confirm.
static void (*filter_intra_predictors[EXT_INTRA_MODES])(uint8_t *dst,
ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) = {
dc_filter_predictor, v_filter_predictor, h_filter_predictor,
d45_filter_predictor, d135_filter_predictor, d117_filter_predictor,
d153_filter_predictor, d207_filter_predictor, d63_filter_predictor,
tm_filter_predictor,
};
// Dispatches to the filter-intra predictor that matches |mode|, passing
// dst, stride, bs, above and left through unchanged. A mode without a
// matching predictor trips the assert.
static void filter_intra_predictors(int mode, uint8_t *dst,
                                    ptrdiff_t stride, int bs,
                                    const uint8_t *above, const uint8_t *left) {
  switch (mode) {
    case DC_PRED:
      vp10_dc_filter_predictor(dst, stride, bs, above, left);
      return;
    case V_PRED:
      vp10_v_filter_predictor(dst, stride, bs, above, left);
      return;
    case H_PRED:
      vp10_h_filter_predictor(dst, stride, bs, above, left);
      return;
    case D45_PRED:
      vp10_d45_filter_predictor(dst, stride, bs, above, left);
      return;
    case D135_PRED:
      vp10_d135_filter_predictor(dst, stride, bs, above, left);
      return;
    case D117_PRED:
      vp10_d117_filter_predictor(dst, stride, bs, above, left);
      return;
    case D153_PRED:
      vp10_d153_filter_predictor(dst, stride, bs, above, left);
      return;
    case D207_PRED:
      vp10_d207_filter_predictor(dst, stride, bs, above, left);
      return;
    case D63_PRED:
      vp10_d63_filter_predictor(dst, stride, bs, above, left);
      return;
    case TM_PRED:
      vp10_tm_filter_predictor(dst, stride, bs, above, left);
      return;
    default:
      break;
  }
  // Only reached when |mode| matched none of the cases above.
  assert(0);
}
#if CONFIG_VP9_HIGHBITDEPTH
static int highbd_intra_subpel_interp(int base, int shift, const uint16_t *ref,
......@@ -1491,8 +1473,8 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
#if CONFIG_EXT_INTRA
if (ext_intra_mode_info->use_ext_intra_mode[plane != 0]) {
filter_intra_predictors[ext_intra_mode](dst, dst_stride, bs,
const_above_row, left_col);
filter_intra_predictors(ext_intra_mode, dst, dst_stride, bs,
const_above_row, left_col);
return;
}
......
......@@ -298,6 +298,30 @@ if (vpx_config("CONFIG_NEW_QUANT") eq "yes") {
specialize qw/quantize_32x32_fp_nuq/;
}
# EXT_INTRA predictor functions: every mode shares the same prototype and is
# specialized for SSE4.1.
if (vpx_config("CONFIG_EXT_INTRA") eq "yes") {
  my $filter_pred_args = "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left";
  foreach my $mode (qw/dc v h d45 d135 d117 d153 d207 d63 tm/) {
    my $fn = "vp10_${mode}_filter_predictor";
    add_proto("void", $fn, $filter_pred_args);
    specialize($fn, "sse4_1");
  }
}
# High bitdepth functions
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
#
......
/*
* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <smmintrin.h>
#include "./vp10_rtcd.h"
#include "vpx_ports/mem.h"
#include "vp10/common/enums.h"
#include "vp10/common/intra_filters.h"
// Widens the first 8 bytes of |above| and of |left| to 16 bits and stores
// their lane-wise sum in sum[0].
static INLINE void AddPixelsSmall(const uint8_t *above, const uint8_t *left,
                                  __m128i *sum) {
  // Only the low 8 bytes of each edge are consumed by the unpacks below, so
  // load 64 bits rather than 128; this avoids touching 8 bytes per edge that
  // are never used and may lie past the end of the caller's data.
  const __m128i a = _mm_loadl_epi64((const __m128i *)above);
  const __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i zero = _mm_setzero_si128();
  const __m128i a16 = _mm_unpacklo_epi8(a, zero);
  const __m128i l16 = _mm_unpacklo_epi8(l, zero);
  sum[0] = _mm_add_epi16(a16, l16);
}
// Returns the rounded average of the first four |above| and first four |left|
// pixels, and also broadcasts it to every 32-bit lane of |*params|.
static INLINE int GetMeanValue4x4(const uint8_t *above, const uint8_t *left,
                                  __m128i *params) {
  __m128i acc;
  uint16_t mean;

  AddPixelsSmall(above, left, &acc);
  // Pairwise add: lanes 0 and 1 now hold the two halves of the 4-pixel sum.
  acc = _mm_hadd_epi16(acc, _mm_setzero_si128());
  // Fold lane 1 into lane 0.
  acc = _mm_add_epi16(acc, _mm_srli_si128(acc, 2));

  mean = _mm_extract_epi16(acc, 0);
  // Divide the sum of 8 pixels by 8 with rounding.
  mean = (uint16_t)(mean + 4) >> 3;

  *params = _mm_set1_epi32(mean);
  return mean;
}
// Returns the rounded average of the first eight |above| and first eight
// |left| pixels, and also broadcasts it to every 32-bit lane of |*params|.
static INLINE int GetMeanValue8x8(const uint8_t *above, const uint8_t *left,
                                  __m128i *params) {
  __m128i acc;
  uint16_t mean;

  AddPixelsSmall(above, left, &acc);
  // Two pairwise adds reduce the eight lanes to two partial sums
  // (lanes 0 and 1).
  acc = _mm_hadd_epi16(acc, _mm_setzero_si128());
  acc = _mm_hadd_epi16(acc, _mm_setzero_si128());
  // Fold lane 1 into lane 0.
  acc = _mm_add_epi16(acc, _mm_srli_si128(acc, 2));

  mean = _mm_extract_epi16(acc, 0);
  // Divide the sum of 16 pixels by 16 with rounding.
  mean = (uint16_t)(mean + 8) >> 4;

  *params = _mm_set1_epi32(mean);
  return mean;
}
// Widens 16 bytes of |above| and 16 bytes of |left| to 16 bits and stores in
// sum[0] the lane-wise total of the four resulting 8-lane vectors.
static INLINE void AddPixelsLarge(const uint8_t *above, const uint8_t *left,
                                  __m128i *sum) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i above_px = _mm_loadu_si128((const __m128i *)above);
  const __m128i left_px = _mm_loadu_si128((const __m128i *)left);
  // Bytes 0..7 of both edges.
  const __m128i low_half = _mm_add_epi16(_mm_unpacklo_epi8(above_px, zero),
                                         _mm_unpacklo_epi8(left_px, zero));
  // Bytes 8..15 of both edges.
  const __m128i high_half = _mm_add_epi16(_mm_unpackhi_epi8(above_px, zero),
                                          _mm_unpackhi_epi8(left_px, zero));
  sum[0] = _mm_add_epi16(low_half, high_half);
}
static INLINE int GetMeanValue16x16(const uint8_t *above, const uint8_t *left,
__m128i *params) {
const __m128i zero = _mm_setzero_si128();
__m128i sum_vector, u;
uint16_t sum_value;
AddPixelsLarge(above, left, &sum_vector);