Commit f9c01c7b authored by Yi Luo, committed by Gerrit Code Review

Merge "HBD fast path quantization speed improvement" into nextgenv2

parents c03268b4 b2663a8a
......@@ -198,6 +198,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += obmc_variance_test.cc
endif
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
LIBVPX_TEST_SRCS-$(HAVE_SSE4_1) += vp10_quantize_test.cc
LIBVPX_TEST_SRCS-$(HAVE_SSE4_1) += vp10_highbd_iht_test.cc
endif # CONFIG_VP9_HIGHBITDEPTH
endif # VP10
......
/*
* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <stdlib.h>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "./vpx_config.h"
#include "./vp10_rtcd.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "vp10/common/scan.h"
namespace {
// Function-pointer type shared by the quantizer under test and the reference
// it is compared against. Both are invoked with the same argument list:
// input coefficients and their count, the skip flag, the zbin/round/quant/
// quant_shift tables, the qcoeff/dqcoeff output buffers, the dequant table,
// the eob output, the scan/iscan tables and the log_scale value.
typedef void (*QuantizeFpFunc)(const tran_low_t *coeff_ptr, intptr_t count,
int skip_block, const int16_t *zbin_ptr,
const int16_t *round_ptr,
const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan,
const int log_scale);
// One test configuration: the quantizer being verified, the reference it is
// checked against, and how many coefficients each generated block contains.
struct QuantizeFuncParams {
  QuantizeFpFunc qFunc;     // implementation under test
  QuantizeFpFunc qFuncRef;  // reference implementation
  int coeffCount;           // number of coefficients per block

  // All arguments are optional; the defaults describe a 16-coefficient block
  // with no functions attached.
  QuantizeFuncParams(QuantizeFpFunc qF = NULL, QuantizeFpFunc qRefF = NULL,
                     int count = 16)
      : qFunc(qF),
        qFuncRef(qRefF),
        coeffCount(count) {}
};
using libvpx_test::ACMRandom;
// Number of random trials executed by each test body.
const int numTests = 1000;
// Capacity of the coefficient buffers; equals the largest coefficient count
// the fixture is instantiated with (1024).
const int maxSize = 1024;
// Bound handed to the random generator when deriving the rounding offset; the
// drawn value is multiplied by the dequant step and shifted right by 7.
const int roundFactorRange = 127;
// Bound handed to the random generator when drawing the dequant step.
const int dequantRange = 32768;
// Bound handed to the random generator when drawing input coefficients.
const int coeffRange = (1 << 20) - 1;
// Value-parameterized fixture that runs an optimized quantizer
// (params_.qFunc) and a reference quantizer (params_.qFuncRef) on the same
// randomly generated input and expects identical results.
class VP10QuantizeTest : public ::testing::TestWithParam<QuantizeFuncParams> {
 public:
  // Fills every coefficient with a random value, quantizes with both
  // implementations and expects identical qcoeff, dqcoeff and eob outputs.
  void RunQuantizeTest() {
    ACMRandom rnd(ACMRandom::DeterministicSeed());
    DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[maxSize]);
    DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]);
    DECLARE_ALIGNED(16, int16_t, round_ptr[2]);
    DECLARE_ALIGNED(16, int16_t, quant_ptr[2]);
    DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]);
    DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[maxSize]);
    DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[maxSize]);
    DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[maxSize]);
    DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[maxSize]);
    DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]);
    uint16_t eob;
    uint16_t ref_eob;
    int err_count_total = 0;
    int first_failure = -1;
    int skip_block = 0;
    int count = params_.coeffCount;
    const TX_SIZE txSize = getTxSize(count);
    int log_scale = (txSize == TX_32X32);
    QuantizeFpFunc quanFunc = params_.qFunc;
    QuantizeFpFunc quanFuncRef = params_.qFuncRef;
    const scan_order scanOrder = vp10_default_scan_orders[txSize];

    for (int i = 0; i < numTests; i++) {
      int err_count = 0;
      // Start both eob outputs from the same sentinel so a function that
      // fails to write its eob is detected.
      ref_eob = eob = -1;
      for (int j = 0; j < count; j++) {
        coeff_ptr[j] = rnd(coeffRange);
      }
      FillQuantizerTables(&rnd, zbin_ptr, round_ptr, quant_ptr,
                          quant_shift_ptr, dequant_ptr);

      quanFuncRef(coeff_ptr, count, skip_block, zbin_ptr,
                  round_ptr, quant_ptr, quant_shift_ptr,
                  ref_qcoeff_ptr, ref_dqcoeff_ptr, dequant_ptr,
                  &ref_eob, scanOrder.scan, scanOrder.iscan,
                  log_scale);
      ASM_REGISTER_STATE_CHECK(quanFunc(coeff_ptr, count, skip_block, zbin_ptr,
                                        round_ptr, quant_ptr, quant_shift_ptr,
                                        qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
                                        &eob, scanOrder.scan, scanOrder.iscan,
                                        log_scale));

      for (int j = 0; j < count; ++j) {
        err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
                     (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
        EXPECT_EQ(ref_qcoeff_ptr[j], qcoeff_ptr[j])
            << "qcoeff error: i = " << i << " j = " << j << "\n";
        EXPECT_EQ(ref_dqcoeff_ptr[j], dqcoeff_ptr[j])
            << "dqcoeff error: i = " << i << " j = " << j << "\n";
      }
      EXPECT_EQ(ref_eob, eob)
          << "eob error: " << "i = " << i << "\n";
      err_count += (ref_eob != eob);
      // Remember only the first failing trial.
      if (err_count && !err_count_total) {
        first_failure = i;
      }
      err_count_total += err_count;
    }
    // The fixture is parameterized, so the message names the roles of the two
    // functions rather than a specific instruction set.
    EXPECT_EQ(0, err_count_total)
        << "Error: Quantization Test, reference output doesn't match "
        << "optimized output. "
        << "First failed at test case " << first_failure;
  }

  // Builds blocks that are zero except for up to three randomly placed
  // coefficients, quantizes with both implementations and expects the same
  // eob from each.
  void RunEobTest() {
    ACMRandom rnd(ACMRandom::DeterministicSeed());
    DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[maxSize]);
    DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]);
    DECLARE_ALIGNED(16, int16_t, round_ptr[2]);
    DECLARE_ALIGNED(16, int16_t, quant_ptr[2]);
    DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]);
    DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[maxSize]);
    DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[maxSize]);
    DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[maxSize]);
    DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[maxSize]);
    DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]);
    uint16_t eob;
    uint16_t ref_eob;
    int skip_block = 0;
    int count = params_.coeffCount;
    const TX_SIZE txSize = getTxSize(count);
    int log_scale = (txSize == TX_32X32);
    QuantizeFpFunc quanFunc = params_.qFunc;
    QuantizeFpFunc quanFuncRef = params_.qFuncRef;
    const scan_order scanOrder = vp10_default_scan_orders[txSize];

    for (int i = 0; i < numTests; i++) {
      ref_eob = eob = -1;
      for (int j = 0; j < count; j++) {
        coeff_ptr[j] = 0;
      }
      // Draw the position and the value in separate statements: in a single
      // "a[rnd()] = rnd()" expression the order of the two generator calls is
      // not fixed before C++17, which would make the test data depend on the
      // compiler.
      for (int k = 0; k < 3; k++) {
        const int pos = rnd(count);
        coeff_ptr[pos] = rnd(coeffRange);
      }
      FillQuantizerTables(&rnd, zbin_ptr, round_ptr, quant_ptr,
                          quant_shift_ptr, dequant_ptr);

      quanFuncRef(coeff_ptr, count, skip_block, zbin_ptr,
                  round_ptr, quant_ptr, quant_shift_ptr,
                  ref_qcoeff_ptr, ref_dqcoeff_ptr, dequant_ptr,
                  &ref_eob, scanOrder.scan, scanOrder.iscan,
                  log_scale);
      ASM_REGISTER_STATE_CHECK(quanFunc(coeff_ptr, count, skip_block, zbin_ptr,
                                        round_ptr, quant_ptr, quant_shift_ptr,
                                        qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
                                        &eob, scanOrder.scan, scanOrder.iscan,
                                        log_scale));
      EXPECT_EQ(ref_eob, eob)
          << "eob error: " << "i = " << i << "\n";
    }
  }

  virtual void SetUp() {
    params_ = GetParam();
  }

  virtual void TearDown() {
    libvpx_test::ClearSystemState();
  }

  virtual ~VP10QuantizeTest() {}

 private:
  // Fills the two-entry quantizer tables with random values. Generator calls
  // are made in a fixed order per entry: zbin, quant_shift, dequant, round.
  static void FillQuantizerTables(ACMRandom *rnd, int16_t *zbin_ptr,
                                  int16_t *round_ptr, int16_t *quant_ptr,
                                  int16_t *quant_shift_ptr,
                                  int16_t *dequant_ptr) {
    for (int j = 0; j < 2; j++) {
      zbin_ptr[j] = rnd->Rand16();
      quant_shift_ptr[j] = rnd->Rand16();
      // int16_t positive
      dequant_ptr[j] = abs((*rnd)(dequantRange));
      // A zero draw would make the division below undefined; use the
      // smallest valid step instead.
      if (dequant_ptr[j] == 0) {
        dequant_ptr[j] = 1;
      }
      quant_ptr[j] = (1 << 16) / dequant_ptr[j];
      round_ptr[j] =
          (abs((*rnd)(roundFactorRange)) * dequant_ptr[j]) >> 7;
    }
  }

  // Maps a coefficient count to a transform-size index: 16 -> 0, 64 -> 1,
  // 256 -> 2, 1024 -> 3. Any other count maps to 0.
  TX_SIZE getTxSize(int count) {
    TX_SIZE txSize = 0;
    if (16 == count) {
      txSize = 0;
    } else if (64 == count) {
      txSize = 1;
    } else if (256 == count) {
      txSize = 2;
    } else if (1024 == count) {
      txSize = 3;
    }
    return txSize;
  }

  QuantizeFuncParams params_;
};
// For every instantiated parameter set, compares qcoeff, dqcoeff and eob of
// the function under test against the reference on random blocks.
TEST_P(VP10QuantizeTest, BitExactCheck) {
RunQuantizeTest();
}
// For every instantiated parameter set, compares only the eob output on
// blocks that contain at most three non-zero input coefficients.
TEST_P(VP10QuantizeTest, EobVerify) {
RunEobTest();
}
#if HAVE_SSE4_1
// Runs the fixture with the SSE4.1 quantizer checked against the C version
// for block sizes of 16, 64, 256 and 1024 coefficients.
INSTANTIATE_TEST_CASE_P(
SSE4_1, VP10QuantizeTest,
::testing::Values(QuantizeFuncParams(&vp10_highbd_quantize_fp_sse4_1,
&vp10_highbd_quantize_fp_c, 16),
QuantizeFuncParams(&vp10_highbd_quantize_fp_sse4_1,
&vp10_highbd_quantize_fp_c, 64),
QuantizeFuncParams(&vp10_highbd_quantize_fp_sse4_1,
&vp10_highbd_quantize_fp_c, 256),
QuantizeFuncParams(&vp10_highbd_quantize_fp_sse4_1,
&vp10_highbd_quantize_fp_c, 1024)));
#endif // HAVE_SSE4_1
} // namespace
......@@ -690,7 +690,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp10_highbd_block_error sse2/;
add_proto qw/void vp10_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale";
specialize qw/vp10_highbd_quantize_fp/;
specialize qw/vp10_highbd_quantize_fp sse4_1/;
add_proto qw/void vp10_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale";
specialize qw/vp10_highbd_quantize_b/;
......
/*
* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <smmintrin.h>
#include <stdint.h>
#include <string.h>
#include "./vp10_rtcd.h"
#include "vpx_dsp/vpx_dsp_common.h"
// Coefficient quantization, phase 1.
//
// Takes four 32-bit coefficients in *coeff and:
//   - stores a per-lane sign selector in *sign (-1 for negative input lanes,
//     +1 otherwise),
//   - replaces *coeff with the per-lane absolute values,
//   - adds the rounding constants and widens the lanes to 64 bits,
//   - quantizes and dequantizes the two lower lanes into qcoeff[0]/dquan[0].
// The two upper lanes are left rounded but not yet quantized in qcoeff[1]
// for quantize_coeff_phase2() to finish.
//
// param[0] : rounding constants (one per 32-bit lane)
// param[1] : quantizer constants (one per 64-bit lane)
// param[2] : dequantizer constants (one per 64-bit lane)
// shift    : right shift applied after the quantizer multiply
// scale    : right shift applied after the dequantizer multiply
static INLINE void quantize_coeff_phase1(__m128i *coeff, const __m128i *param,
                                         const int shift, const int scale,
                                         __m128i *qcoeff, __m128i *dquan,
                                         __m128i *sign) {
  const __m128i zeros = _mm_setzero_si128();
  const __m128i ones = _mm_set1_epi32(1);
  const __m128i negative = _mm_cmplt_epi32(*coeff, zeros);
  __m128i rounded;
  __m128i low_pair;

  // OR-ing 1 into the all-ones/all-zeros compare result gives -1 or +1.
  *sign = _mm_or_si128(negative, ones);
  *coeff = _mm_abs_epi32(*coeff);

  rounded = _mm_add_epi32(*coeff, param[0]);
  // Interleaving with zero spreads each pair of lanes into 64-bit slots.
  qcoeff[1] = _mm_unpackhi_epi32(rounded, zeros);
  low_pair = _mm_unpacklo_epi32(rounded, zeros);

  qcoeff[0] = _mm_srli_epi64(_mm_mul_epi32(low_pair, param[1]), shift);
  dquan[0] = _mm_srli_epi64(_mm_mul_epi32(qcoeff[0], param[2]), scale);
}
// Coefficient quantization, phase 2.
//
// Finishes the two upper lanes left in qcoeff[1] by phase 1 (quantize with
// param[1] and shift, dequantize with param[2] and scale), merges them with
// the two lower-lane results already in qcoeff[0]/dquan[0], applies the sign
// selector in *sign, and stores four quantized values to qAddr and four
// dequantized values to dqAddr (unaligned stores).
static INLINE void quantize_coeff_phase2(__m128i *qcoeff, __m128i *dquan,
                                         const __m128i *sign,
                                         const __m128i *param, const int shift,
                                         const int scale, tran_low_t *qAddr,
                                         tran_low_t *dqAddr) {
  // Select the upper two / lower two 32-bit lanes respectively.
  const __m128i upper_lanes = _mm_set_epi32(-1, -1, 0, 0);
  const __m128i lower_lanes = _mm_set_epi32(0, 0, -1, -1);
  __m128i q_low, q_high, dq_low, dq_high;

  qcoeff[1] = _mm_srli_epi64(_mm_mul_epi32(qcoeff[1], param[1]), shift);
  dquan[1] = _mm_srli_epi64(_mm_mul_epi32(qcoeff[1], param[2]), scale);

  // combine L&H: each input holds its two results in the even 32-bit lanes.
  // 0xd8 gathers them into lanes 0-1, 0x8d gathers them into lanes 2-3; the
  // masks then drop the remaining lanes.
  q_low = _mm_and_si128(_mm_shuffle_epi32(qcoeff[0], 0xd8), lower_lanes);
  q_high = _mm_and_si128(_mm_shuffle_epi32(qcoeff[1], 0x8d), upper_lanes);
  dq_low = _mm_and_si128(_mm_shuffle_epi32(dquan[0], 0xd8), lower_lanes);
  dq_high = _mm_and_si128(_mm_shuffle_epi32(dquan[1], 0x8d), upper_lanes);

  qcoeff[1] = q_high;
  dquan[1] = dq_high;
  qcoeff[0] = _mm_sign_epi32(_mm_or_si128(q_low, q_high), *sign);
  dquan[0] = _mm_sign_epi32(_mm_or_si128(dq_low, dq_high), *sign);

  _mm_storeu_si128((__m128i *)qAddr, qcoeff[0]);
  _mm_storeu_si128((__m128i *)dqAddr, dquan[0]);
}
// Updates the running end-of-block accumulator for one group of eight
// coefficients.
//
// Loads two vectors of 32-bit quantized values (qcoeff_ptr and
// qcoeff_ptr + 4) and eight 16-bit entries from iscan. For every non-zero
// quantized value the candidate is iscan + 1, otherwise 0; *eob becomes the
// per-lane 16-bit maximum of its previous content and the candidates.
static INLINE void find_eob(tran_low_t *qcoeff_ptr, const int16_t *iscan,
                            __m128i *eob) {
  const __m128i zeros = _mm_setzero_si128();
  const __m128i quant_lo = _mm_loadu_si128((__m128i const *)qcoeff_ptr);
  const __m128i quant_hi = _mm_loadu_si128((__m128i const *)(qcoeff_ptr + 4));
  // Comparing against zero twice inverts the mask: all ones where q != 0.
  const __m128i nonzero_lo =
      _mm_cmpeq_epi32(_mm_cmpeq_epi32(quant_lo, zeros), zeros);
  const __m128i nonzero_hi =
      _mm_cmpeq_epi32(_mm_cmpeq_epi32(quant_hi, zeros), zeros);
  // Narrow to eight 16-bit lanes of 0 or -1.
  const __m128i nonzero = _mm_packs_epi32(nonzero_lo, nonzero_hi);
  const __m128i positions = _mm_loadu_si128((__m128i const *)iscan);
  // Subtracting -1 adds one to the selected positions; the AND clears the
  // lanes whose coefficient quantized to zero.
  const __m128i candidates =
      _mm_and_si128(_mm_sub_epi16(positions, nonzero), nonzero);

  *eob = _mm_max_epi16(*eob, candidates);
}
// Reduces the eight signed 16-bit lanes of *eob to their maximum and returns
// it. *eob is overwritten with the intermediate reduction state; its lane 0
// holds the returned value.
static INLINE uint16_t get_accumulated_eob(__m128i *eob) {
  __m128i acc = *eob;

  // Fold the upper 64 bits onto the lower 64 bits.
  acc = _mm_max_epi16(acc, _mm_shuffle_epi32(acc, 0xe));
  // Fold 16-bit lanes 2-3 onto lanes 0-1.
  acc = _mm_max_epi16(acc, _mm_shufflelo_epi16(acc, 0xe));
  // Fold lane 1 onto lane 0.
  acc = _mm_max_epi16(acc, _mm_shufflelo_epi16(acc, 0x1));

  *eob = acc;
  return (uint16_t)_mm_extract_epi16(acc, 0);
}
// SSE4.1 version of the high-bitdepth "fp" quantizer.
//
// Clears qcoeff_ptr and dqcoeff_ptr for `count` entries. When skip_block is
// non-zero it then sets *eob_ptr to 0 and returns. Otherwise it processes the
// coefficients in groups of eight (two vectors of four), writing quantized
// values to qcoeff_ptr, dequantized values to dqcoeff_ptr and the
// end-of-block position derived from iscan to *eob_ptr.
//
// Entry 0 of round_ptr/quant_ptr/dequant_ptr is applied to the first
// coefficient only; entry 1 is applied to all others. zbin_ptr,
// quant_shift_ptr and scan are not used. The right shift after the quantizer
// multiply is 16 - log_scale and the one after the dequantizer multiply is
// log_scale.
//
// NOTE(review): the first group is processed unconditionally, so count is
// presumably always at least 8 (a multiple of 8) -- confirm against callers.
void vp10_highbd_quantize_fp_sse4_1(const tran_low_t *coeff_ptr,
                                    intptr_t count,
                                    int skip_block,
                                    const int16_t *zbin_ptr,
                                    const int16_t *round_ptr,
                                    const int16_t *quant_ptr,
                                    const int16_t *quant_shift_ptr,
                                    tran_low_t *qcoeff_ptr,
                                    tran_low_t *dqcoeff_ptr,
                                    const int16_t *dequant_ptr,
                                    uint16_t *eob_ptr,
                                    const int16_t *scan,
                                    const int16_t *iscan,
                                    const int log_scale) {
  __m128i coeff[2], qcoeff[2], dequant[2], qparam[3], coeff_sign;
  __m128i eob = _mm_setzero_si128();
  const int shift = 16 - log_scale;
  const int lanes = 4;          // coefficients per 128-bit vector
  const int group = 2 * lanes;  // coefficients handled per iteration
  intptr_t offset;
  (void)zbin_ptr;
  (void)quant_shift_ptr;
  (void)scan;

  memset(qcoeff_ptr, 0, count * sizeof(qcoeff_ptr[0]));
  memset(dqcoeff_ptr, 0, count * sizeof(dqcoeff_ptr[0]));

  if (skip_block) {
    *eob_ptr = 0;
    return;
  }

  // First vector: lane 0 is the DC coefficient and uses entry 0 of each
  // table, the other lanes use entry 1.
  coeff[0] = _mm_loadu_si128((__m128i const *)coeff_ptr);
  qparam[0] = _mm_set_epi32(round_ptr[1], round_ptr[1], round_ptr[1],
                            round_ptr[0]);
  qparam[1] = _mm_set_epi64x(quant_ptr[1], quant_ptr[0]);
  qparam[2] = _mm_set_epi64x(dequant_ptr[1], dequant_ptr[0]);
  quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale,
                        qcoeff, dequant, &coeff_sign);

  // From here on every lane is AC: switch all constants to entry 1.
  qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]);
  qparam[1] = _mm_set_epi64x(quant_ptr[1], quant_ptr[1]);
  qparam[2] = _mm_set_epi64x(dequant_ptr[1], dequant_ptr[1]);
  quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
                        log_scale, qcoeff_ptr, dqcoeff_ptr);

  // Second vector of the first group.
  coeff[1] = _mm_loadu_si128((__m128i const *)(coeff_ptr + lanes));
  quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale,
                        qcoeff, dequant, &coeff_sign);
  quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
                        log_scale, qcoeff_ptr + lanes, dqcoeff_ptr + lanes);
  find_eob(qcoeff_ptr, iscan, &eob);

  // Remaining groups of eight AC coefficients.
  for (offset = group; offset < count; offset += group) {
    const tran_low_t *const src = coeff_ptr + offset;
    tran_low_t *const q_dst = qcoeff_ptr + offset;
    tran_low_t *const dq_dst = dqcoeff_ptr + offset;

    coeff[0] = _mm_loadu_si128((__m128i const *)src);
    coeff[1] = _mm_loadu_si128((__m128i const *)(src + lanes));

    quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff,
                          dequant, &coeff_sign);
    quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
                          log_scale, q_dst, dq_dst);

    quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff,
                          dequant, &coeff_sign);
    quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
                          log_scale, q_dst + lanes, dq_dst + lanes);

    find_eob(q_dst, iscan + offset, &eob);
  }

  *eob_ptr = get_accumulated_eob(&eob);
}
......@@ -117,6 +117,7 @@ VP10_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/dct_ssse3.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP10_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/highbd_fwd_txfm_sse4.c
VP10_CX_SRCS-$(HAVE_SSE4_1) += common/x86/highbd_inv_txfm_sse4.c
VP10_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp10_highbd_quantize_sse4.c
endif
ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment