Commit a7b2d09f authored by Peter de Rivaz's avatar Peter de Rivaz Committed by Deb Mukherjee

Added highbitdepth sse2 acceleration for quantize

Also includes block error.

(This patch is mostly cherry picked from
commit db7192e0b014a331a1dcb102c8a1148e9f0e1081)

Change-Id: Idef18f90b111a0d0c9546543d3347e551908fd78
parent f94c7a8f
......@@ -137,6 +137,8 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += lpf_8_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_avg_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_error_block_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9) += vp9_intrapred_test.cc
ifeq ($(CONFIG_VP9_ENCODER),yes)
......
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <cmath>
#include <cstdlib>
#include <string>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "test/util.h"
#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_entropy.h"
#include "vpx/vpx_integer.h"
using libvpx_test::ACMRandom;
namespace {
#if CONFIG_VP9_HIGHBITDEPTH
const int kNumIterations = 1000;
typedef int64_t (*ErrorBlockFunc)(const tran_low_t *coeff,
const tran_low_t *dqcoeff,
intptr_t block_size,
int64_t *ssz, int bps);
typedef std::tr1::tuple<ErrorBlockFunc, ErrorBlockFunc, vpx_bit_depth_t>
ErrorBlockParam;
class ErrorBlockTest
: public ::testing::TestWithParam<ErrorBlockParam> {
public:
virtual ~ErrorBlockTest() {}
virtual void SetUp() {
error_block_op_ = GET_PARAM(0);
ref_error_block_op_ = GET_PARAM(1);
bit_depth_ = GET_PARAM(2);
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
vpx_bit_depth_t bit_depth_;
ErrorBlockFunc error_block_op_;
ErrorBlockFunc ref_error_block_op_;
};
TEST_P(ErrorBlockTest, OperationCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, 4096);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff, 4096);
int err_count_total = 0;
int first_failure = -1;
intptr_t block_size;
int64_t ssz;
int64_t ret;
int64_t ref_ssz;
int64_t ref_ret;
for (int i = 0; i < kNumIterations; ++i) {
int err_count = 0;
block_size = 16 << (i % 9); // All block sizes from 4x4, 8x4 ..64x64
for (int j = 0; j < block_size; j++) {
coeff[j] = rnd(2 << 20) - (1 << 20);
dqcoeff[j] = rnd(2 << 20) - (1 << 20);
}
ref_ret = ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz,
bit_depth_);
ASM_REGISTER_STATE_CHECK(ret = error_block_op_(coeff, dqcoeff, block_size,
&ssz, bit_depth_));
err_count += (ref_ret != ret) | (ref_ssz != ssz);
if (err_count && !err_count_total) {
first_failure = i;
}
err_count_total += err_count;
}
EXPECT_EQ(0, err_count_total)
<< "Error: Error Block Test, C output doesn't match SSE2 output. "
<< "First failed at test case " << first_failure;
}
TEST_P(ErrorBlockTest, ExtremeValues) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, 4096);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff, 4096);
int err_count_total = 0;
int first_failure = -1;
intptr_t block_size;
int64_t ssz;
int64_t ret;
int64_t ref_ssz;
int64_t ref_ret;
int max_val = ((1 << 20) - 1);
for (int i = 0; i < kNumIterations; ++i) {
int err_count = 0;
int k = (i / 9) % 5;
// Change the maximum coeff value, to test different bit boundaries
if ( k == 4 && (i % 9) == 0 ) {
max_val >>= 1;
}
block_size = 16 << (i % 9); // All block sizes from 4x4, 8x4 ..64x64
for (int j = 0; j < block_size; j++) {
if (k < 4) { // Test at maximum values
coeff[j] = k % 2 ? max_val : -max_val;
dqcoeff[j] = (k >> 1) % 2 ? max_val : -max_val;
} else {
coeff[j] = rnd(2 << 14) - (1 << 14);
dqcoeff[j] = rnd(2 << 14) - (1 << 14);
}
}
ref_ret = ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz,
bit_depth_);
ASM_REGISTER_STATE_CHECK(ret = error_block_op_(coeff, dqcoeff, block_size,
&ssz, bit_depth_));
err_count += (ref_ret != ret) | (ref_ssz != ssz);
if (err_count && !err_count_total) {
first_failure = i;
}
err_count_total += err_count;
}
EXPECT_EQ(0, err_count_total)
<< "Error: Error Block Test, C output doesn't match SSE2 output. "
<< "First failed at test case " << first_failure;
}
using std::tr1::make_tuple;
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(
SSE2, ErrorBlockTest,
::testing::Values(
make_tuple(&vp9_highbd_block_error_sse2,
&vp9_highbd_block_error_c, VPX_BITS_10),
make_tuple(&vp9_highbd_block_error_sse2,
&vp9_highbd_block_error_c, VPX_BITS_12),
make_tuple(&vp9_highbd_block_error_sse2,
&vp9_highbd_block_error_c, VPX_BITS_8)));
#endif // HAVE_SSE2
#endif // CONFIG_VP9_HIGHBITDEPTH
} // namespace
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "test/util.h"
#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_entropy.h"
#include "vpx/vpx_integer.h"
using libvpx_test::ACMRandom;
namespace {
#if CONFIG_VP9_HIGHBITDEPTH
const int number_of_iterations = 100;
typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count,
int skip_block, const int16_t *zbin,
const int16_t *round, const int16_t *quant,
const int16_t *quant_shift,
tran_low_t *qcoeff, tran_low_t *dqcoeff,
const int16_t *dequant, int zbin_oq_value,
uint16_t *eob, const int16_t *scan,
const int16_t *iscan);
typedef std::tr1::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t>
QuantizeParam;
class VP9QuantizeTest : public ::testing::TestWithParam<QuantizeParam> {
public:
virtual ~VP9QuantizeTest() {}
virtual void SetUp() {
quantize_op_ = GET_PARAM(0);
ref_quantize_op_ = GET_PARAM(1);
bit_depth_ = GET_PARAM(2);
mask_ = (1 << bit_depth_) - 1;
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
vpx_bit_depth_t bit_depth_;
int mask_;
QuantizeFunc quantize_op_;
QuantizeFunc ref_quantize_op_;
};
class VP9Quantize32Test : public ::testing::TestWithParam<QuantizeParam> {
public:
virtual ~VP9Quantize32Test() {}
virtual void SetUp() {
quantize_op_ = GET_PARAM(0);
ref_quantize_op_ = GET_PARAM(1);
bit_depth_ = GET_PARAM(2);
mask_ = (1 << bit_depth_) - 1;
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
vpx_bit_depth_t bit_depth_;
int mask_;
QuantizeFunc quantize_op_;
QuantizeFunc ref_quantize_op_;
};
TEST_P(VP9QuantizeTest, OperationCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
int zbin_oq_value = 0;
DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff_ptr, 256);
DECLARE_ALIGNED_ARRAY(16, int16_t, zbin_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, round_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, quant_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, quant_shift_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, qcoeff_ptr, 256);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff_ptr, 256);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_qcoeff_ptr, 256);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_dqcoeff_ptr, 256);
DECLARE_ALIGNED_ARRAY(16, int16_t, dequant_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, uint16_t, eob_ptr, 1);
DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_eob_ptr, 1);
int err_count_total = 0;
int first_failure = -1;
for (int i = 0; i < number_of_iterations; ++i) {
const int skip_block = i == 0;
const TX_SIZE sz = (TX_SIZE)(i % 3); // TX_4X4, TX_8X8 TX_16X16
const TX_TYPE tx_type = (TX_TYPE)((i >> 2) % 3);
const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
const int count = (4 << sz) * (4 << sz); // 16, 64, 256
int err_count = 0;
*eob_ptr = rnd.Rand16();
*ref_eob_ptr = *eob_ptr;
for (int j = 0; j < count; j++) {
coeff_ptr[j] = rnd.Rand16()&mask_;
}
for (int j = 0; j < 2; j++) {
zbin_ptr[j] = rnd.Rand16()&mask_;
round_ptr[j] = rnd.Rand16();
quant_ptr[j] = rnd.Rand16();
quant_shift_ptr[j] = rnd.Rand16();
dequant_ptr[j] = rnd.Rand16();
}
ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
ref_dqcoeff_ptr, dequant_ptr, zbin_oq_value,
ref_eob_ptr, scan_order->scan, scan_order->iscan);
ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_ptr, count, skip_block,
zbin_ptr, round_ptr, quant_ptr,
quant_shift_ptr, qcoeff_ptr,
dqcoeff_ptr, dequant_ptr,
zbin_oq_value, eob_ptr,
scan_order->scan, scan_order->iscan));
for (int j = 0; j < sz; ++j) {
err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
(ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
}
err_count += (*ref_eob_ptr != *eob_ptr);
if (err_count && !err_count_total) {
first_failure = i;
}
err_count_total += err_count;
}
EXPECT_EQ(0, err_count_total)
<< "Error: Quantization Test, C output doesn't match SSE2 output. "
<< "First failed at test case " << first_failure;
}
TEST_P(VP9Quantize32Test, OperationCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
int zbin_oq_value = 0;
DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff_ptr, 1024);
DECLARE_ALIGNED_ARRAY(16, int16_t, zbin_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, round_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, quant_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, quant_shift_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, qcoeff_ptr, 1024);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff_ptr, 1024);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_qcoeff_ptr, 1024);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_dqcoeff_ptr, 1024);
DECLARE_ALIGNED_ARRAY(16, int16_t, dequant_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, uint16_t, eob_ptr, 1);
DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_eob_ptr, 1);
int err_count_total = 0;
int first_failure = -1;
for (int i = 0; i < number_of_iterations; ++i) {
const int skip_block = i == 0;
const TX_SIZE sz = TX_32X32;
const TX_TYPE tx_type = (TX_TYPE)(i % 4);
const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
const int count = (4 << sz) * (4 << sz); // 1024
int err_count = 0;
*eob_ptr = rnd.Rand16();
*ref_eob_ptr = *eob_ptr;
for (int j = 0; j < count; j++) {
coeff_ptr[j] = rnd.Rand16()&mask_;
}
for (int j = 0; j < 2; j++) {
zbin_ptr[j] = rnd.Rand16()&mask_;
round_ptr[j] = rnd.Rand16();
quant_ptr[j] = rnd.Rand16();
quant_shift_ptr[j] = rnd.Rand16();
dequant_ptr[j] = rnd.Rand16();
}
ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
ref_dqcoeff_ptr, dequant_ptr, zbin_oq_value,
ref_eob_ptr, scan_order->scan, scan_order->iscan);
ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_ptr, count, skip_block,
zbin_ptr, round_ptr, quant_ptr,
quant_shift_ptr, qcoeff_ptr,
dqcoeff_ptr, dequant_ptr,
zbin_oq_value, eob_ptr,
scan_order->scan, scan_order->iscan));
for (int j = 0; j < sz; ++j) {
err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
(ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
}
err_count += (*ref_eob_ptr != *eob_ptr);
if (err_count && !err_count_total) {
first_failure = i;
}
err_count_total += err_count;
}
EXPECT_EQ(0, err_count_total)
<< "Error: Quantization Test, C output doesn't match SSE2 output. "
<< "First failed at test case " << first_failure;
}
TEST_P(VP9QuantizeTest, EOBCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
int zbin_oq_value = 0;
DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff_ptr, 256);
DECLARE_ALIGNED_ARRAY(16, int16_t, zbin_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, round_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, quant_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, quant_shift_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, qcoeff_ptr, 256);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff_ptr, 256);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_qcoeff_ptr, 256);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_dqcoeff_ptr, 256);
DECLARE_ALIGNED_ARRAY(16, int16_t, dequant_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, uint16_t, eob_ptr, 1);
DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_eob_ptr, 1);
int err_count_total = 0;
int first_failure = -1;
for (int i = 0; i < number_of_iterations; ++i) {
int skip_block = i == 0;
TX_SIZE sz = (TX_SIZE)(i % 3); // TX_4X4, TX_8X8 TX_16X16
TX_TYPE tx_type = (TX_TYPE)((i >> 2) % 3);
const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
int count = (4 << sz) * (4 << sz); // 16, 64, 256
int err_count = 0;
*eob_ptr = rnd.Rand16();
*ref_eob_ptr = *eob_ptr;
// Two random entries
for (int j = 0; j < count; j++) {
coeff_ptr[j] = 0;
}
coeff_ptr[rnd(count)] = rnd.Rand16()&mask_;
coeff_ptr[rnd(count)] = rnd.Rand16()&mask_;
for (int j = 0; j < 2; j++) {
zbin_ptr[j] = rnd.Rand16()&mask_;
round_ptr[j] = rnd.Rand16();
quant_ptr[j] = rnd.Rand16();
quant_shift_ptr[j] = rnd.Rand16();
dequant_ptr[j] = rnd.Rand16();
}
ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
ref_dqcoeff_ptr, dequant_ptr, zbin_oq_value,
ref_eob_ptr, scan_order->scan, scan_order->iscan);
ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_ptr, count, skip_block,
zbin_ptr, round_ptr, quant_ptr,
quant_shift_ptr, qcoeff_ptr,
dqcoeff_ptr, dequant_ptr,
zbin_oq_value, eob_ptr,
scan_order->scan, scan_order->iscan));
for (int j = 0; j < sz; ++j) {
err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
(ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
}
err_count += (*ref_eob_ptr != *eob_ptr);
if (err_count && !err_count_total) {
first_failure = i;
}
err_count_total += err_count;
}
EXPECT_EQ(0, err_count_total)
<< "Error: Quantization Test, C output doesn't match SSE2 output. "
<< "First failed at test case " << first_failure;
}
TEST_P(VP9Quantize32Test, EOBCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
int zbin_oq_value = 0;
DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff_ptr, 1024);
DECLARE_ALIGNED_ARRAY(16, int16_t, zbin_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, round_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, quant_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, quant_shift_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, qcoeff_ptr, 1024);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff_ptr, 1024);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_qcoeff_ptr, 1024);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_dqcoeff_ptr, 1024);
DECLARE_ALIGNED_ARRAY(16, int16_t, dequant_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, uint16_t, eob_ptr, 1);
DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_eob_ptr, 1);
int err_count_total = 0;
int first_failure = -1;
for (int i = 0; i < number_of_iterations; ++i) {
int skip_block = i == 0;
TX_SIZE sz = TX_32X32;
TX_TYPE tx_type = (TX_TYPE)(i % 4);
const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
int count = (4 << sz) * (4 << sz); // 1024
int err_count = 0;
*eob_ptr = rnd.Rand16();
*ref_eob_ptr = *eob_ptr;
for (int j = 0; j < count; j++) {
coeff_ptr[j] = 0;
}
// Two random entries
coeff_ptr[rnd(count)] = rnd.Rand16()&mask_;
coeff_ptr[rnd(count)] = rnd.Rand16()&mask_;
for (int j = 0; j < 2; j++) {
zbin_ptr[j] = rnd.Rand16()&mask_;
round_ptr[j] = rnd.Rand16();
quant_ptr[j] = rnd.Rand16();
quant_shift_ptr[j] = rnd.Rand16();
dequant_ptr[j] = rnd.Rand16();
}
ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
ref_dqcoeff_ptr, dequant_ptr, zbin_oq_value,
ref_eob_ptr, scan_order->scan, scan_order->iscan);
ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_ptr, count, skip_block,
zbin_ptr, round_ptr, quant_ptr,
quant_shift_ptr, qcoeff_ptr,
dqcoeff_ptr, dequant_ptr,
zbin_oq_value, eob_ptr,
scan_order->scan, scan_order->iscan));
for (int j = 0; j < sz; ++j) {
err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
(ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
}
err_count += (*ref_eob_ptr != *eob_ptr);
if (err_count && !err_count_total) {
first_failure = i;
}
err_count_total += err_count;
}
EXPECT_EQ(0, err_count_total)
<< "Error: Quantization Test, C output doesn't match SSE2 output. "
<< "First failed at test case " << first_failure;
}
using std::tr1::make_tuple;
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(
SSE2, VP9QuantizeTest,
::testing::Values(
make_tuple(&vp9_highbd_quantize_b_sse2,
&vp9_highbd_quantize_b_c, VPX_BITS_8),
make_tuple(&vp9_highbd_quantize_b_sse2,
&vp9_highbd_quantize_b_c, VPX_BITS_10),
make_tuple(&vp9_highbd_quantize_b_sse2,
&vp9_highbd_quantize_b_c, VPX_BITS_12)));
INSTANTIATE_TEST_CASE_P(
SSE2, VP9Quantize32Test,
::testing::Values(
make_tuple(&vp9_highbd_quantize_b_32x32_sse2,
&vp9_highbd_quantize_b_32x32_c, VPX_BITS_8),
make_tuple(&vp9_highbd_quantize_b_32x32_sse2,
&vp9_highbd_quantize_b_32x32_c, VPX_BITS_10),
make_tuple(&vp9_highbd_quantize_b_32x32_sse2,
&vp9_highbd_quantize_b_32x32_c, VPX_BITS_12)));
#endif // HAVE_SSE2
#endif // CONFIG_VP9_HIGHBITDEPTH
} // namespace
......@@ -1855,7 +1855,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# ENCODEMB INVOKE
add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
specialize qw/vp9_highbd_block_error/;
specialize qw/vp9_highbd_block_error sse2/;
add_proto qw/void vp9_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
specialize qw/vp9_highbd_subtract_block/;
......@@ -1867,10 +1867,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_quantize_fp_32x32/;
add_proto qw/void vp9_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_highbd_quantize_b/;
specialize qw/vp9_highbd_quantize_b sse2/;
add_proto qw/void vp9_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_highbd_quantize_b_32x32/;
specialize qw/vp9_highbd_quantize_b_32x32 sse2/;
#
# Structured Similarity (SSIM)
......
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <emmintrin.h>
#include <stdio.h>
#include "vp9/common/vp9_common.h"
int64_t vp9_highbd_block_error_sse2(tran_low_t *coeff, tran_low_t *dqcoeff,
intptr_t block_size, int64_t *ssz,
int bps) {
int i, j, test;
uint32_t temp[4];
__m128i max, min, cmp0, cmp1, cmp2, cmp3;
int64_t error = 0, sqcoeff = 0;
const int shift = 2 * (bps - 8);
const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
for (i = 0; i < block_size; i+=8) {
// Load the data into xmm registers
__m128i mm_coeff = _mm_load_si128((__m128i*) (coeff + i));
__m128i mm_coeff2 = _mm_load_si128((__m128i*) (coeff + i + 4));
__m128i mm_dqcoeff = _mm_load_si128((__m128i*) (dqcoeff + i));
__m128i mm_dqcoeff2 = _mm_load_si128((__m128i*) (dqcoeff + i + 4));
// Check if any values require more than 15 bit
max = _mm_set1_epi32(0x3fff);
min = _mm_set1_epi32(0xffffc000);
cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max),
_mm_cmplt_epi32(mm_coeff, min));
cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max),
_mm_cmplt_epi32(mm_coeff2, min));
cmp2 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff, max),
_mm_cmplt_epi32(mm_dqcoeff, min));
cmp3 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff2, max),
_mm_cmplt_epi32(mm_dqcoeff2, min));
test = _mm_movemask_epi8(_mm_or_si128(_mm_or_si128(cmp0, cmp1),
_mm_or_si128(cmp2, cmp3)));
if (!test) {
__m128i mm_diff, error_sse2, sqcoeff_sse2;;
mm_coeff = _mm_packs_epi32(mm_coeff, mm_coeff2);
mm_dqcoeff = _mm_packs_epi32(mm_dqcoeff, mm_dqcoeff2);
mm_diff = _mm_sub_epi16(mm_coeff, mm_dqcoeff);
error_sse2 = _mm_madd_epi16(mm_diff, mm_diff);
sqcoeff_sse2 = _mm_madd_epi16(mm_coeff, mm_coeff);
_mm_storeu_si128((__m128i*)temp, error_sse2);
error = error + temp[0] + temp[1] + temp[2] + temp[3];